Mercurial > hg > cc > cirrus_work
changeset 272:5c81ff10a66a
renamed
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 12 Feb 2025 12:59:28 +0000 |
parents | ac367fc4b562 |
children | 603b46f3062d |
files | lib/python/cc/lmh/test_cdb.py lib/python/cc/lmh/test_lookup3.py |
diffstat | 2 files changed, 69 insertions(+), 69 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/lmh/test_cdb.py Wed Feb 12 12:59:28 2025 +0000
@@ -0,0 +1,69 @@
+#!/usr/bin/python3
+# cython: profile=False
+'''Usage: test_lookup3.py cdbpat, e.g. .../cdb/ks_%d-%d.cdb'''
+
+
+import cython, typing, timeit, re, sys
+from db import CCdb
+
+from isal import igzip
+
+def mainp() -> None:
+  PAT: typing.Pattern[cython.bytes] = re.compile(b'\{"url": "([^"]*)",.*, "filename": ".*/segments/[0-9]*\.([0-9][0-9]?)/')
+
+  N: int = 0
+  hits: int = 0
+
+  cdx_in: typing.BinaryIO
+  cdx_out: typing.BinaryIO
+
+  l: cython.bytes
+  m: typing.Match[cython.bytes] | None
+
+  D: int = 17
+
+  CC: List[CCdb]
+  mv: List[char[::1]]
+  ff: List[file]
+
+  bb: List[(int,int)] = list(zip(list(range(0,100,int(100/(D-1)))),
+                                 list(range(5,100,int(100/(D-1))))+[99]))
+
+
+  ff = [open(sys.argv[1]%(b,e),'rb') for b,e in bb]
+  CC = [CCdb() for i in range(D)]
+  mv = [CC[i].init(ff[i].fileno()) for i in range(D)]
+
+  exit(0)
+  with igzip.IGzipFile('/beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00100.gz','rb') as cdx_in, igzip.IGzipFile('results/CC-MAIN-2019-35/warc_lmhx/cdxp-00100.gz', 'wb') as cdx_out:
+    for l in cdx_in:
+      key: cython.bytes
+      cdate: cython.bytes
+      props: cython.bytes
+      key, cdate, props = l.split(b' ',maxsplit=2)
+      if (m:=PAT.search(props)):
+        seg = int(m[2])
+        if seg == 0:
+          N += 1
+          k = cdate+m[1]
+          try:
+            mv: cython.bytes = d[k]
+            hits += 1
+            cdx_out.write(key)
+            cdx_out.write(b' ')
+            cdx_out.write(cdate)
+            cdx_out.write(b' ')
+            cdx_out.write(memoryview(props)[:-2])
+            cdx_out.write(b', "lastmod": "%b"}\n'%mv)
+            continue
+          except KeyError:
+            pass
+      else:
+        raise ValueError(props)
+      cdx_out.write(l)
+  print('%s entries, %s given lastmod'%(N,hits))
+
+if __name__ == "__main__":
+  mainp()
+
+
--- a/lib/python/cc/lmh/test_lookup3.py Wed Feb 12 11:29:41 2025 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,69 +0,0 @@
-#!/usr/bin/python3
-# cython: profile=False
-'''Usage: test_lookup3.py cdbpat, e.g. .../cdb/ks_%d-%d.cdb'''
-
-
-import cython, typing, timeit, re, sys
-from db import CCdb
-
-from isal import igzip
-
-def mainp() -> None:
-  PAT: typing.Pattern[cython.bytes] = re.compile(b'\{"url": "([^"]*)",.*, "filename": ".*/segments/[0-9]*\.([0-9][0-9]?)/')
-
-  N: int = 0
-  hits: int = 0
-
-  cdx_in: typing.BinaryIO
-  cdx_out: typing.BinaryIO
-
-  l: cython.bytes
-  m: typing.Match[cython.bytes] | None
-
-  D: int = 17
-
-  CC: List[CCdb]
-  mv: List[char[::1]]
-  ff: List[file]
-
-  bb: List[(int,int)] = list(zip(list(range(0,100,int(100/(D-1)))),
-                                 list(range(5,100,int(100/(D-1))))+[99]))
-
-
-  ff = [open(sys.argv[1]%(b,e),'rb') for b,e in bb]
-  CC = [CCdb() for i in range(D)]
-  mv = [CC[i].init(ff[i].fileno()) for i in range(D)]
-
-  exit(0)
-  with igzip.IGzipFile('/beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00100.gz','rb') as cdx_in, igzip.IGzipFile('results/CC-MAIN-2019-35/warc_lmhx/cdxp-00100.gz', 'wb') as cdx_out:
-    for l in cdx_in:
-      key: cython.bytes
-      cdate: cython.bytes
-      props: cython.bytes
-      key, cdate, props = l.split(b' ',maxsplit=2)
-      if (m:=PAT.search(props)):
-        seg = int(m[2])
-        if seg == 0:
-          N += 1
-          k = cdate+m[1]
-          try:
-            mv: cython.bytes = d[k]
-            hits += 1
-            cdx_out.write(key)
-            cdx_out.write(b' ')
-            cdx_out.write(cdate)
-            cdx_out.write(b' ')
-            cdx_out.write(memoryview(props)[:-2])
-            cdx_out.write(b', "lastmod": "%b"}\n'%mv)
-            continue
-          except KeyError:
-            pass
-      else:
-        raise ValueError(props)
-      cdx_out.write(l)
-  print('%s entries, %s given lastmod'%(N,hits))
-
-if __name__ == "__main__":
-  mainp()
-
-