Mercurial > hg > cc > cirrus_work
changeset 273:603b46f3062d default tip
maybe ready
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 12 Feb 2025 13:01:05 +0000 |
parents | 5c81ff10a66a |
children | |
files | lib/python/cc/lmh/test_cdb.py |
diffstat | 1 files changed, 21 insertions(+), 18 deletions(-) [+] |
line wrap: on
line diff
--- a/lib/python/cc/lmh/test_cdb.py Wed Feb 12 12:59:28 2025 +0000
+++ b/lib/python/cc/lmh/test_cdb.py Wed Feb 12 13:01:05 2025 +0000
@@ -21,43 +21,46 @@
 m: typing.Match[cython.bytes] | None
 D: int = 17
+d: int = int(100/(D-1))
 CC: List[CCdb]
 mv: List[char[::1]]
 ff: List[file]
-bb: List[(int,int)] = list(zip(list(range(0,100,int(100/(D-1)))),
-                               list(range(5,100,int(100/(D-1))))+[99]))
+bb: List[(int,int)] = list(zip(list(range(0,100,d)),
+                               list(range(5,100,d))+[99]))
 ff = [open(sys.argv[1]%(b,e),'rb') for b,e in bb]
 CC = [CCdb() for i in range(D)]
 mv = [CC[i].init(ff[i].fileno()) for i in range(D)]
-exit(0)
 with igzip.IGzipFile('/beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00100.gz','rb') as cdx_in, igzip.IGzipFile('results/CC-MAIN-2019-35/warc_lmhx/cdxp-00100.gz', 'wb') as cdx_out:
   for l in cdx_in:
     key: cython.bytes
     cdate: cython.bytes
     props: cython.bytes
+    res: int
     key, cdate, props = l.split(b' ',maxsplit=2)
     if (m:=PAT.search(props)):
       seg = int(m[2])
-      if seg == 0:
-        N += 1
-        k = cdate+m[1]
-        try:
-          mv: cython.bytes = d[k]
-          hits += 1
-          cdx_out.write(key)
-          cdx_out.write(b' ')
-          cdx_out.write(cdate)
-          cdx_out.write(b' ')
-          cdx_out.write(memoryview(props)[:-2])
-          cdx_out.write(b', "lastmod": "%b"}\n'%mv)
-          continue
-        except KeyError:
-          pass
+      N += 1
+      int: i = int(seg / d)
+      key = cdate+m[1]
+      CC[i].findstart()
+      if (res := CC[i].find(key)) == 1:
+        hits += 1
+        cdx_out.write(key)
+        cdx_out.write(b' ')
+        cdx_out.write(cdate)
+        cdx_out.write(b' ')
+        cdx_out.write(memoryview(props)[:-2])
+        cdx_out.write(b', "lastmod": "%b"}\n'%CC[i].value())
+        continue
+      elif res == 0:
+        pass
+      else:
+        raise ValueError((key,props,seg,i))
     else:
       raise ValueError(props)
     cdx_out.write(l)