Mercurial > hg > cc > cirrus_work
changeset 253:79701366f438
add some cython decoration, not much effect
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 17 Jan 2025 20:37:10 +0000 |
parents | 39c3835716f3 |
children | aeb755b72a7d |
files | lib/python/cc/lmh/test_lookup1.py |
diffstat | 1 files changed, 47 insertions(+), 38 deletions(-) [+] |
line wrap: on
line diff
--- a/lib/python/cc/lmh/test_lookup1.py Fri Jan 17 20:35:21 2025 +0000
+++ b/lib/python/cc/lmh/test_lookup1.py Fri Jan 17 20:37:10 2025 +0000
@@ -1,45 +1,54 @@
 #!/usr/bin/python3
+# cython: profile=True
+import cython, typing
+
 from isal import igzip
-import re, pickle, timeit
-global d, handle
+import re, pickle
 
-PAT = re.compile(b'\{"url": "([^"]*)",.*, "filename": ".*/segments/[0-9]*\.([0-9][0-9]?)/')
-
-handle = None
-d = [0]
+def mainp() -> None:
+  PAT: typing.Pattern[cython.bytes] = re.compile(b'\{"url": "([^"]*)",.*, "filename": ".*/segments/[0-9]*\.([0-9][0-9]?)/')
 
-with open('results/CC-MAIN-2019-35/warc_lmhx/ks_0-9.pickle', 'rb') as handle:
-  t = timeit.Timer('d[0] = pickle.load(handle)', globals = globals())
-  t.timeit(number=1)
-  d = d[0]
-  print(len(d))
-
-N = 0
-hits = 0
+  handle: typing.BinaryIO
+  N: int = 0
+  hits: int = 0
+  with open('results/CC-MAIN-2019-35/warc_lmhx/ks_0-9.pickle', 'rb') as handle:
+    d: typing.Dict[cython.bytes,cython.bytes] = pickle.load(handle)
+    print(len(d))
 
-with igzip.IGzipFile('/beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00100.gz','rb') as cdx_in, igzip.IGzipFile('results/CC-MAIN-2019-35/warc_lmhx/cdxp-00100.gz', 'wb') as cdx_out:
-  for l in cdx_in:
-    key, cdate, props = l.split(b' ',maxsplit=2)
-    if (m:=PAT.search(props)):
-      seg = int(m[2])
-      if seg >= 0 and seg < 10:
-        N += 1
-        k = cdate+m[1]
-        try:
-          m = d[k]
-          hits += 1
-          cdx_out.write(key)
-          cdx_out.write(b' ')
-          cdx_out.write(cdate)
-          cdx_out.write(b' ')
-          cdx_out.write(memoryview(props)[:-2])
-          cdx_out.write(b', "lastmod": "%b"}\n'%m)
-          continue
-        except KeyError:
-          pass
-    else:
-      raise ValueError(props)
-    cdx_out.write(l)
-print('%s entries, %s given lastmod'%(N,hits))
+  cdx_in: typing.BinaryIO
+  cdx_out: typing.BinaryIO
+  l: cython.bytes
+  m: typing.Match[cython.bytes] | None
+  with igzip.IGzipFile('/beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00100.gz','rb') as cdx_in, igzip.IGzipFile('results/CC-MAIN-2019-35/warc_lmhx/cdxp-00100.gz', 'wb') as cdx_out:
+    for l in cdx_in:
+      key: cython.bytes
+      cdate: cython.bytes
+      props: cython.bytes
+      key, cdate, props = l.split(b' ',maxsplit=2)
+      if (m:=PAT.search(props)):
+        seg = int(m[2])
+        if seg >= 0 and seg < 10:
+          N += 1
+          k = cdate+m[1]
+          try:
+            mv: cython.bytes = d[k]
+            hits += 1
+            cdx_out.write(key)
+            cdx_out.write(b' ')
+            cdx_out.write(cdate)
+            cdx_out.write(b' ')
+            cdx_out.write(memoryview(props)[:-2])
+            cdx_out.write(b', "lastmod": "%b"}\n'%mv)
+            continue
+          except KeyError:
+            pass
+      else:
+        raise ValueError(props)
+      cdx_out.write(l)
+  print('%s entries, %s given lastmod'%(N,hits))
+if __name__ == "__main__":
+  mainp()
+
+