Mercurial > hg > cc > cirrus_work
changeset 248:650383a798e5
with bloom prefilter
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 02 Jan 2025 18:30:03 +0000 |
parents | 7737da0ccb8c |
children | 87a35540104b |
files | lib/python/cc/lmh/test_lookup2.py |
diffstat | 1 files changed, 51 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/python3
"""Copy one CDX index file, adding a "lastmod" property where one is known.

A Bloom filter of URIs is consulted first as a cheap prefilter; only lines
whose URL passes it are looked up in the (much larger) pickled dictionary,
which is keyed by capture date concatenated with URL.  Lines that get a
lastmod value are rewritten with the extra property; every other line is
copied through unchanged.  A summary of counts is printed at the end.
"""
import pickle
import re
import timeit

from isal import igzip
from pybloomfilter import BloomFilter

# Raw bytes literal so that \{ and \. reach the regex engine without
# triggering Python's invalid-escape-sequence warning.
# Group 1 is the URL; group 2 is the one- or two-digit number that follows
# the '.' in the segment directory name.
PAT = re.compile(
    rb'\{"url": "([^"]*)",.*, "filename": ".*/segments/[0-9]*\.([0-9][0-9]?)/')

# One-slot list so the timeit statement below can store its result via the
# module globals it is given.
d = [0]

uris = BloomFilter.open('results/CC-MAIN-2019-35/warc_lmhx/uris_20.bloom')

# NOTE: pickle.load can execute arbitrary code; only use a trusted pickle.
with open('results/CC-MAIN-2019-35/warc_lmhx/ks_0-9.pickle', 'rb') as handle:
    # The load runs inside timeit (which by default turns off garbage
    # collection while timing); the measured time itself is discarded.
    t = timeit.Timer('d[0] = pickle.load(handle)', globals=globals())
    t.timeit(number=1)
    d = d[0]
    print(len(d))

N = 0     # lines whose segment number is in 0..9
hits = 0  # lines written out with a lastmod property
fp = 0    # Bloom filter said yes but the dictionary had no entry

with igzip.IGzipFile('/beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00100.gz', 'rb') as cdx_in, \
     igzip.IGzipFile('results/CC-MAIN-2019-35/warc_lmhx/cdxp-00100.gz', 'wb') as cdx_out:
    for line in cdx_in:
        key, cdate, props = line.split(b' ', maxsplit=2)
        m = PAT.search(props)
        if m is None:
            # Every line is expected to carry url and filename properties.
            raise ValueError(props)
        seg = int(m[2])
        # Presumably 0..9 matches the segments covered by ks_0-9.pickle
        # -- TODO confirm.
        if 0 <= seg < 10:
            N += 1
            if (u := m[1]) in uris:
                try:
                    lastmod = d[cdate + u]
                except KeyError:
                    fp += 1
                else:
                    hits += 1
                    cdx_out.write(key)
                    cdx_out.write(b' ')
                    cdx_out.write(cdate)
                    cdx_out.write(b' ')
                    # Drop the final two bytes of props (expected to be the
                    # closing '}' and newline) without copying, then re-close
                    # the object after the new property.
                    cdx_out.write(memoryview(props)[:-2])
                    cdx_out.write(b', "lastmod": "%b"}\n' % lastmod)
                    continue
        cdx_out.write(line)
print('%s entries, %s given lastmod, %s false positives' % (N, hits, fp))