Mercurial > hg > cc > cirrus_work
changeset 274:5c013fd18ea0
working, but very slowly
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 12 Feb 2025 20:17:39 +0000 |
parents | 603b46f3062d |
children | 5439c4c7777e |
files | lib/python/cc/lmh/test_cdb.py |
diffstat | 1 files changed, 4 insertions(+), 4 deletions(-) [+] |
line wrap: on
line diff
--- a/lib/python/cc/lmh/test_cdb.py Wed Feb 12 13:01:05 2025 +0000
+++ b/lib/python/cc/lmh/test_cdb.py Wed Feb 12 20:17:39 2025 +0000
@@ -35,7 +35,7 @@
 CC = [CCdb() for i in range(D)]
 mv = [CC[i].init(ff[i].fileno()) for i in range(D)]
- with igzip.IGzipFile('/beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00100.gz','rb') as cdx_in, igzip.IGzipFile('results/CC-MAIN-2019-35/warc_lmhx/cdxp-00100.gz', 'wb') as cdx_out:
+ with igzip.IGzipFile('/beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00100.gz','rb') as cdx_in, igzip.IGzipFile('/work/dc007/dc007/hst/results/CC-MAIN-2019-35/warc_lmhx/cdx-00100.gz', 'wb') as cdx_out:
 for l in cdx_in:
 key: cython.bytes
 cdate: cython.bytes
@@ -45,10 +45,10 @@
 if (m:=PAT.search(props)):
 seg = int(m[2])
 N += 1
- int: i = int(seg / d)
- key = cdate+m[1]
+ i: int = int(seg / d)
+ k: cython.bytes = cdate+m[1]
 CC[i].findstart()
- if (res := CC[i].find(key)) == 1:
+ if (res := CC[i].find(k)) == 1:
 hits += 1
 cdx_out.write(key)
 cdx_out.write(b' ')