Mercurial > hg > cc > cirrus_work
changeset 276:76fb260e893b
try piping instead of python.isal,
push value printing into C,
other tweaks to try to speed up, no joy
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 19 Feb 2025 17:48:11 +0000 |
parents | 5439c4c7777e |
children | 018866252464 |
files | lib/python/cc/lmh/test_cdbp.py |
diffstat | 1 files changed, 74 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/python3
# cython: profile=False
'''Usage: uz .../cdx-....gz | test_cdbp.py cdbpat | igzip -c > cdc-...gz
cdbpat identifies a set of 17 CDB files E.g. .../cdb/ks_%d-%d.cdb

CDX index lines are read from stdin and copied to stdout; lines whose
lookup succeeds get a "lastmod" property appended.  Diagnostics and the
final summary go to stderr so that they do not end up in the piped data.'''

import cython
import re
import sys
import timeit
import typing

from db import CCdb


def mainp() -> None:
    '''Filter CDX lines from stdin to stdout, adding "lastmod" on CDB hits.

    Each input line is split on single spaces into key, capture date and
    a properties blob.  The URL and a one- or two-digit segment number are
    extracted from the properties, and capture date + URL is looked up in
    the CDB object whose range covers that segment.

    sys.argv[1] is a %-format pattern taking two ints (range start, range
    end); it is expanded once per range to name the 17 CDB files.

    Raises:
        ValueError: if the properties do not match the expected pattern;
            if a line has fewer than three space-separated fields (from
            the tuple unpacking); or if a lookup for a non-zero segment
            returns a non-zero result.
    '''
    # Raw bytes literal: byte-for-byte the same pattern as before, but no
    # longer relying on the invalid "\{" and "\." string escapes, which
    # recent Python versions warn about.
    PAT: typing.Pattern[cython.bytes] = re.compile(
        rb'\{"url": "([^"]*)",.*, "filename": ".*/segments/[0-9]*\.([0-9][0-9]?)/')

    N: int = 0      # lines whose properties matched PAT
    hits: int = 0   # lines written with an added "lastmod"

    cdx_in: typing.BinaryIO = sys.stdin.buffer
    cdx_out: typing.BinaryIO = sys.stdout.buffer

    l: cython.bytes
    m: typing.Match[cython.bytes] | None
    key: cython.bytes
    cdate: cython.bytes
    props: cython.bytes
    k: cython.bytes
    res: int

    D: int = 17                 # number of CDB files
    d: int = 100 // (D - 1)     # segments per file (6), last file is short

    # (start, end) segment ranges: (0,5), (6,11), ..., (90,95), (96,99)
    bb: typing.List[typing.Tuple[int, int]] = list(
        zip(range(0, 100, d), list(range(5, 100, d)) + [99]))

    # The file objects stay referenced in ff for the whole run, so the
    # descriptors handed to CCdb.init remain open.
    # NOTE(review): they are never closed explicitly; whether CCdb still
    # needs the descriptor after init is not visible here -- confirm
    # before adding a close.
    ff: typing.List[typing.BinaryIO] = [
        open(sys.argv[1] % (b, e), 'rb') for b, e in bb]
    CC: typing.List[CCdb] = [CCdb() for _ in range(D)]
    for cdb, f in zip(CC, ff):
        cdb.init(f.fileno())

    for l in cdx_in:
        key, cdate, props = l.split(b' ', maxsplit=2)
        m = PAT.search(props)
        if m is None:
            raise ValueError(props)

        seg = int(m[2])
        N += 1
        i: int = seg // d
        k = cdate + m[1]
        # The lookup is done for every matched line, whatever the segment.
        res = CC[i].find(k)

        # NOTE(review): this is exactly how the original
        # `(seg == 0) & (res := ...) == 1` parses, because `&` binds
        # tighter than `==`.  It equals `seg == 0 and res == 1` only if
        # find() returns 0 or 1 -- confirm against db.CCdb.
        if ((seg == 0) & res) == 1:
            hits += 1
            cdx_out.write(key)
            cdx_out.write(b' ')
            cdx_out.write(cdate)
            cdx_out.write(b' ')
            # Drop the last two bytes of props (presumably the closing
            # brace and newline); the terminator is re-added below.
            cdx_out.write(memoryview(props)[:-2])
            cdx_out.write(b', "lastmod": "')
            # Flush Python's buffer before CCdb.write, which presumably
            # emits the found value directly on file descriptor 1 --
            # TODO confirm against db.CCdb.
            cdx_out.flush()
            CC[i].write(1)
            cdx_out.write(b'"}\n')
            continue

        if seg != 0 or res == 0:
            if res != 0:
                # Non-zero result for a segment other than 0.
                raise ValueError((key, props, seg, i))
            # Zero result: report on stderr, then pass the line through.
            print(k, key, props, seg, i, file=sys.stderr)

        cdx_out.write(l)

    # Summary goes to stderr: stdout carries the CDX data being piped on.
    print('%s entries, %s given lastmod' % (N, hits), file=sys.stderr)


if __name__ == "__main__":
    mainp()