Mercurial > hg > cc > cirrus_work
changeset 271:ac367fc4b562
towards a real test of cdb
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 12 Feb 2025 11:29:41 +0000 |
parents | f88abc69f876 |
children | 5c81ff10a66a |
files | lib/python/cc/lmh/test_lookup3.py |
diffstat | 1 files changed, 69 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/python3
# cython: profile=False
'''Usage: test_lookup3.py cdbpat, e.g. .../cdb/ks_%d-%d.cdb'''

import contextlib
import re
import sys
import timeit
import typing

import cython
from isal import igzip

from db import CCdb


def mainp() -> None:
    '''Open the sharded cdb files and (eventually) add lastmod values to a cdx file.

    The single command-line argument is a %-pattern with two integer slots
    (see the module docstring); it is expanded once per (begin, end) shard
    range to name the cdb files to open.  One CCdb object is created per
    shard and initialised from the file descriptor of its file.

    The changeset is work in progress: the function currently exits right
    after the cdb files have been initialised, so the cdx rewriting loop
    below it is not reached yet.

    Raises:
        SystemExit: always -- with the usage message if the argument count
            is wrong, otherwise with status 0 after cdb initialisation.
    '''
    if len(sys.argv) != 2:
        # Exit with the usage line instead of an IndexError traceback.
        sys.exit(__doc__)
    cdb_pat: str = sys.argv[1]

    # Raw bytes literal: '\{' and '\.' are regex escapes, not string escapes.
    # Group 1 is the url, group 2 the one- or two-digit number that follows
    # the '.' in the segment directory name.
    PAT: typing.Pattern[cython.bytes] = re.compile(
        rb'\{"url": "([^"]*)",.*, "filename": ".*/segments/[0-9]*\.([0-9][0-9]?)/')

    N: int = 0
    hits: int = 0

    cdx_in: typing.BinaryIO
    cdx_out: typing.BinaryIO

    l: cython.bytes
    m: typing.Match[cython.bytes] | None

    D: int = 17

    CC: typing.List[CCdb]
    # NOTE(review): element type taken from the original annotation
    # (char[::1]); confirm against what CCdb.init actually returns.
    mv: typing.List[cython.char[::1]]
    ff: typing.List[typing.BinaryIO]

    # (begin, end) pairs used to fill the two %d slots of the file pattern.
    # With D == 17 the step is 6: (0, 5), (6, 11), ..., (90, 95), (96, 99).
    step: int = 100 // (D - 1)
    bb: typing.List[typing.Tuple[int, int]] = list(
        zip(range(0, 100, step), list(range(5, 100, step)) + [99]))

    # The ExitStack closes every cdb file on the way out, including when a
    # later open() fails or when SystemExit propagates from sys.exit().
    with contextlib.ExitStack() as stack:
        ff = [stack.enter_context(open(cdb_pat % (b, e), 'rb')) for b, e in bb]
        CC = [CCdb() for _ in range(D)]
        mv = [cdb.init(f.fileno()) for cdb, f in zip(CC, ff)]

        # NOTE(review): deliberate early exit carried over from the original
        # work-in-progress changeset; everything below is unreachable until
        # it is removed.
        sys.exit(0)

        with igzip.IGzipFile('/beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00100.gz', 'rb') as cdx_in, \
             igzip.IGzipFile('results/CC-MAIN-2019-35/warc_lmhx/cdxp-00100.gz', 'wb') as cdx_out:
            for l in cdx_in:
                key: cython.bytes
                cdate: cython.bytes
                props: cython.bytes
                lastmod: cython.bytes
                # Each line is split into three space-separated fields;
                # the third (props) may itself contain spaces.
                key, cdate, props = l.split(b' ', maxsplit=2)
                m = PAT.search(props)
                if m is None:
                    raise ValueError(props)
                seg = int(m[2])
                if seg == 0:
                    N += 1
                    k = cdate + m[1]
                    try:
                        # NOTE(review): `d` is not defined anywhere in this
                        # script; presumably the lookup is meant to go through
                        # the CCdb objects in CC.  Wire this up before the
                        # early exit above is removed.
                        lastmod = d[k]
                    except KeyError:
                        pass
                    else:
                        hits += 1
                        cdx_out.write(key)
                        cdx_out.write(b' ')
                        cdx_out.write(cdate)
                        cdx_out.write(b' ')
                        # Drop the last two bytes of props so the lastmod
                        # property can be appended before a fresh '}\n'.
                        cdx_out.write(memoryview(props)[:-2])
                        cdx_out.write(b', "lastmod": "%b"}\n' % lastmod)
                        continue
                # No lastmod found (or not segment 0): copy the line unchanged.
                cdx_out.write(l)
    print('%s entries, %s given lastmod' % (N, hits))


if __name__ == "__main__":
    mainp()