changeset 271:ac367fc4b562

towards a real test of cdb
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 12 Feb 2025 11:29:41 +0000
parents f88abc69f876
children 5c81ff10a66a
files lib/python/cc/lmh/test_lookup3.py
diffstat 1 files changed, 69 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/lmh/test_lookup3.py	Wed Feb 12 11:29:41 2025 +0000
@@ -0,0 +1,69 @@
+#!/usr/bin/python3
+# cython: profile=False
+'''Usage: test_lookup3.py cdbpat, e.g. .../cdb/ks_%d-%d.cdb'''
+
+
+import cython, typing, timeit, re, sys
+from db import CCdb
+
+from isal import igzip
+
+def mainp() -> None:
+  '''Open the D cdb files named by the sys.argv[1] pattern, initialise a
+  CCdb on each one, then exit.
+
+  sys.argv[1] is a %-format pattern with two integer slots (see the
+  module usage string); it is filled in from each (first, last) pair
+  in bb.
+
+  NOTE: work in progress.  The sys.exit(0) after the set-up means the
+  cdx rewriting loop below is never reached.  That loop still looks keys
+  up in a name d that is not defined anywhere in this file, so it would
+  raise NameError if the early exit were simply removed.
+  '''
+  if len(sys.argv) < 2:
+    # Fail with the usage string rather than a bare IndexError
+    sys.exit(__doc__)
+
+  # Raw bytes literal: \{ and \. are regex escapes, not string escapes,
+  #  and a non-raw literal draws a SyntaxWarning from Python 3.12 on
+  PAT: typing.Pattern[cython.bytes] = re.compile(rb'\{"url": "([^"]*)",.*, "filename": ".*/segments/[0-9]*\.([0-9][0-9]?)/')
+
+  N: int = 0
+  hits: int = 0
+
+  cdx_in: typing.BinaryIO
+  cdx_out: typing.BinaryIO
+
+  l: cython.bytes
+  m: typing.Match[cython.bytes] | None
+
+  D: int = 17
+
+  # List, char and file are not imported: annotations in function scope
+  #  are never evaluated by CPython, so these are declarations only
+  CC: List[CCdb]
+  mv: List[char[::1]]
+  ff: List[file]
+
+  # D (first, last) pairs: (0,5), (6,11), ..., (96,99) when D is 17
+  step: int = 100 // (D - 1)
+  bb: List[(int,int)] = list(zip(range(0, 100, step),
+                                 list(range(5, 100, step)) + [99]))
+
+  ff = [open(sys.argv[1]%(b,e),'rb') for b,e in bb]
+  CC = [CCdb() for i in range(D)]
+  # One CCdb.init result per open file, handed over by file descriptor
+  mv = [CC[i].init(ff[i].fileno()) for i in range(D)]
+
+  # Deliberate early stop: only the set-up above is exercised so far
+  sys.exit(0)
+  with igzip.IGzipFile('/beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00100.gz','rb') as cdx_in, igzip.IGzipFile('results/CC-MAIN-2019-35/warc_lmhx/cdxp-00100.gz', 'wb') as cdx_out:
+    for l in cdx_in:
+      key: cython.bytes
+      cdate: cython.bytes
+      props: cython.bytes
+      key, cdate, props = l.split(b' ',maxsplit=2)
+      if (m:=PAT.search(props)):
+        seg = int(m[2])
+        if seg == 0:
+          N += 1
+          k = cdate+m[1]
+          try:
+            # FIXME: d is undefined here; this lookup presumably has to
+            #  go through the CCdb instances in CC -- confirm against db.
+            # Named lastmod so it does not rebind the mv list set up above
+            lastmod: cython.bytes = d[k]
+            hits += 1
+            cdx_out.write(key)
+            cdx_out.write(b' ')
+            cdx_out.write(cdate)
+            cdx_out.write(b' ')
+            # Drop the last two bytes of props before appending lastmod
+            cdx_out.write(memoryview(props)[:-2])
+            cdx_out.write(b', "lastmod": "%b"}\n'%lastmod)
+            continue
+          except KeyError:
+            pass
+      else:
+        raise ValueError(props)
+      # No lastmod written for this line: copy it through unchanged
+      cdx_out.write(l)
+  print('%s entries, %s given lastmod'%(N,hits))
+          
+# Run the test driver only when executed as a script, not when imported
+if __name__ == "__main__":
+    mainp()
+
+