changeset 276:76fb260e893b

try piping instead of python.isal, push value printing into C, other tweaks to try to speed up, no joy
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 19 Feb 2025 17:48:11 +0000
parents 5439c4c7777e
children 018866252464
files lib/python/cc/lmh/test_cdbp.py
diffstat 1 files changed, 74 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/lmh/test_cdbp.py	Wed Feb 19 17:48:11 2025 +0000
@@ -0,0 +1,74 @@
+#!/usr/bin/python3
+# cython: profile=False
+'''Usage: uz .../cdx-....gz | test_cdbp.py cdbpat | igzip -c > cdc-...gz
+cdbpat identifies a set of 17 CDB files, e.g. .../cdb/ks_%d-%d.cdb'''
+
+import cython, typing, timeit, re, sys
+from db import CCdb
+
+def mainp() -> None:
+  '''Copy CDX lines from stdin to stdout, adding "lastmod" to segment-0 hits.
+
+  sys.argv[1] is a %-format pattern (two ints) naming the D CDB files, one
+  per range of segment numbers.  cdate + the "url" value of each line is
+  looked up in the CDB for that line's segment.  Raises ValueError if a
+  line does not match PAT or a segment-0 lookup returns neither 0 nor 1.'''
+  # Raw literal: identical pattern bytes, no invalid-escape warnings
+  PAT: typing.Pattern[cython.bytes] = re.compile(rb'\{"url": "([^"]*)",.*, "filename": ".*/segments/[0-9]*\.([0-9][0-9]?)/')
+  N: int = 0     # input lines processed
+  hits: int = 0  # lines written with a lastmod value
+  cdx_in: typing.BinaryIO = sys.stdin.buffer
+  cdx_out: typing.BinaryIO = sys.stdout.buffer
+  l: cython.bytes
+  m: typing.Match[cython.bytes] | None
+  D: int = 17
+  d: int = 100 // (D - 1)  # width of each file's segment range
+  CC: List[CCdb]
+  ff: List[file]
+  # (first, last) segment per file: (0,5), (6,11), ... (90,95), (96,99)
+  bb: List[(int,int)] = list(zip(range(0,100,d),
+                                 list(range(5,100,d))+[99]))
+  ff = [open(sys.argv[1]%(b,e),'rb') for b,e in bb]
+  CC = [CCdb() for _ in range(D)]
+  for db, f in zip(CC, ff):
+    db.init(f.fileno())
+  for l in cdx_in:
+    key: cython.bytes
+    cdate: cython.bytes
+    props: cython.bytes
+    res: int
+    key, cdate, props = l.split(b' ',maxsplit=2)
+    m = PAT.search(props)
+    if m is None:
+      raise ValueError(props)
+    seg = int(m[2])
+    N += 1
+    i: int = seg // d
+    k: cython.bytes = cdate+m[1]
+    # Hoisted from the old test, which parsed as ((seg == 0) & res) == 1
+    res = CC[i].find(k)
+    if seg == 0 and res == 1:
+      hits += 1
+      cdx_out.write(key)
+      cdx_out.write(b' ')
+      cdx_out.write(cdate)
+      cdx_out.write(b' ')
+      cdx_out.write(memoryview(props)[:-2])
+      cdx_out.write(b', "lastmod": "')
+      # Flush first: CC[i].write(1) presumably writes to fd 1 directly (confirm)
+      cdx_out.flush()
+      CC[i].write(1)
+      cdx_out.write(b'"}\n')
+      continue
+    if seg == 0 and res != 0:
+      raise ValueError((key,props,seg,i))
+    if res == 0:
+      print(k,key,props,seg,i,file=sys.stderr)
+    cdx_out.write(l)
+  # Summary goes to stderr: stdout carries the CDX data stream
+  print('%s entries, %s given lastmod'%(N,hits),file=sys.stderr)
+          
+if __name__ == "__main__":  # run mainp() only when executed as a script
+    mainp()
+
+