#!/usr/bin/python3
"""Copy one CDX index file, adding a "lastmod" property where one is known.

Steps, all executed at module level:

1. Open a Bloom filter of URIs and unpickle a mapping whose keys are the
   bytes ``cdate + url`` and whose values are bytes-like (they are
   interpolated with ``%b``).
2. Read the gzipped CDX input line by line.  Each line is split on its first
   two spaces into ``key``, ``cdate`` and ``props``; ``props`` must match
   ``PAT`` or ``ValueError(props)`` is raised.
3. For lines whose segment number (second group of ``PAT``) is in 0..9:
   if the URL passes the Bloom filter and ``cdate + url`` is in the mapping,
   the line is rewritten with ``, "lastmod": "<value>"`` appended inside the
   properties; a Bloom hit with no mapping entry is counted as a false
   positive.
4. Every other line is copied to the output unchanged.
5. Print the number of segment 0..9 entries, rewrites and false positives.
"""
# Provenance (from the Mercurial changeset this script was exported in):
#   User:    Henry S. Thompson
#   Date:    1735842603 0
#   Node ID: 650383a798e5680ea0270bd1f128797de444ed0c
#   Parent:  7737da0ccb8cc30fe92c5bbd5eb3f4322ec52cc8
#   Summary: with bloom prefilter
#   File:    lib/python/cc/lmh/test_lookup2.py (new file)
import pickle
import re
import timeit

from isal import igzip
from pybloomfilter import BloomFilter

# Group 1: the URL; group 2: the one- or two-digit number following the dot
# in ".../segments/<digits>.<n>/".  Raw bytes literal so that "\{" and "\."
# reach the regex engine without being invalid string escape sequences.
PAT = re.compile(
    rb'\{"url": "([^"]*)",.*, "filename": ".*/segments/[0-9]*\.([0-9][0-9]?)/')

# One-slot holder: the timed statement below stores its result in d[0].
d = [0]

uris = BloomFilter.open('results/CC-MAIN-2019-35/warc_lmhx/uris_20.bloom')

with open('results/CC-MAIN-2019-35/warc_lmhx/ks_0-9.pickle', 'rb') as handle:
    # The statement runs against this module's globals, so it sees `pickle`,
    # `handle` and `d`.  timeit switches the garbage collector off while the
    # statement runs; the elapsed time it returns is not used.
    # NOTE(review): presumably the GC-off load is the point -- confirm.
    t = timeit.Timer('d[0] = pickle.load(handle)', globals=globals())
    t.timeit(number=1)
    d = d[0]
    print(len(d))

N = 0      # lines whose segment number is in 0..9
hits = 0   # lines rewritten with a lastmod value
fp = 0     # Bloom filter said yes but the mapping had no entry

with igzip.IGzipFile('/beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00100.gz', 'rb') as cdx_in, \
        igzip.IGzipFile('results/CC-MAIN-2019-35/warc_lmhx/cdxp-00100.gz', 'wb') as cdx_out:
    for line in cdx_in:
        key, cdate, props = line.split(b' ', maxsplit=2)
        m = PAT.search(props)
        if m is None:
            raise ValueError(props)
        if 0 <= int(m[2]) < 10:
            N += 1
            uri = m[1]
            if uri in uris:
                try:
                    lastmod = d[cdate + uri]
                except KeyError:
                    fp += 1
                else:
                    hits += 1
                    # props[:-2] drops the last two bytes (presumably the
                    # closing "}" and the newline), which are re-added after
                    # the new property.
                    cdx_out.write(b'%b %b %b, "lastmod": "%b"}\n'
                                  % (key, cdate, props[:-2], lastmod))
                    continue
        # No lastmod available (or segment outside 0..9): copy verbatim.
        cdx_out.write(line)
print(f'{N} entries, {hits} given lastmod, {fp} false positives')