changeset 273:603b46f3062d default tip

maybe ready
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 12 Feb 2025 13:01:05 +0000
parents 5c81ff10a66a
children
files lib/python/cc/lmh/test_cdb.py
diffstat 1 files changed, 21 insertions(+), 18 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/lmh/test_cdb.py	Wed Feb 12 12:59:28 2025 +0000
+++ b/lib/python/cc/lmh/test_cdb.py	Wed Feb 12 13:01:05 2025 +0000
@@ -21,43 +21,46 @@
   m: typing.Match[cython.bytes] | None
 
   D: int = 17
+  d: int = int(100/(D-1))
 
   CC: List[CCdb]
   mv: List[char[::1]]
   ff: List[file]
 
-  bb: List[(int,int)] = list(zip(list(range(0,100,int(100/(D-1)))),
-                                 list(range(5,100,int(100/(D-1))))+[99]))
+  bb: List[(int,int)] = list(zip(list(range(0,100,d)),
+                                 list(range(5,100,d))+[99]))
 
 
   ff = [open(sys.argv[1]%(b,e),'rb') for b,e in bb]
   CC = [CCdb() for i in range(D)]
   mv = [CC[i].init(ff[i].fileno()) for i in range(D)]
 
-  exit(0)
   with igzip.IGzipFile('/beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00100.gz','rb') as cdx_in, igzip.IGzipFile('results/CC-MAIN-2019-35/warc_lmhx/cdxp-00100.gz', 'wb') as cdx_out:
     for l in cdx_in:
       key: cython.bytes
       cdate: cython.bytes
       props: cython.bytes
+      res: int
       key, cdate, props = l.split(b' ',maxsplit=2)
       if (m:=PAT.search(props)):
         seg = int(m[2])
-        if seg == 0:
-          N += 1
-          k = cdate+m[1]
-          try:
-            mv: cython.bytes = d[k]
-            hits += 1
-            cdx_out.write(key)
-            cdx_out.write(b' ')
-            cdx_out.write(cdate)
-            cdx_out.write(b' ')
-            cdx_out.write(memoryview(props)[:-2])
-            cdx_out.write(b', "lastmod": "%b"}\n'%mv)
-            continue
-          except KeyError:
-            pass
+        N += 1
+        int: i = int(seg / d)
+        key = cdate+m[1]
+        CC[i].findstart()
+        if (res := CC[i].find(key)) == 1:
+          hits += 1
+          cdx_out.write(key)
+          cdx_out.write(b' ')
+          cdx_out.write(cdate)
+          cdx_out.write(b' ')
+          cdx_out.write(memoryview(props)[:-2])
+          cdx_out.write(b', "lastmod": "%b"}\n'%CC[i].value())
+          continue
+        elif res == 0:
+          pass
+        else:
+          raise ValueError((key,props,seg,i))
       else:
         raise ValueError(props)
       cdx_out.write(l)