#!/usr/bin/python3
# HG changeset patch
# User Henry S. Thompson
# Date 1735829534 0
# Node ID 7737da0ccb8cc30fe92c5bbd5eb3f4322ec52cc8
# Parent 666069efb0c66ba52b00aa955e5fdafadd8c7e91
# try adding lm to existing index from ks_0-9
# diff -r 666069efb0c6 -r 7737da0ccb8c lib/python/cc/lmh/test_lookup1.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/lib/python/cc/lmh/test_lookup1.py Thu Jan 02 14:52:14 2025 +0000
"""Add "lastmod" values from a pickled lookup table to one CDX index file.

Each line of the input CDX file is split on spaces into a key, a capture
date and a properties field.  PAT pulls the URL and the segment number out
of the properties.  For records whose segment number is in 0-9, the capture
date concatenated with the URL is looked up in the pickled dictionary; on a
hit the record is written out with an extra "lastmod" property.  Every
other record is copied to the output unchanged.  At the end the number of
segment 0-9 records and the number of hits are printed.

Raises ValueError (carrying the properties field) for any line whose
properties do not match PAT.
"""
import pickle
import re

from isal import igzip

# Raw bytes literal: the pattern bytes are the same as before, but `\{` and
# `\.` are no longer invalid string escapes (a SyntaxWarning on Python 3.12+).
# Group 1 is the URL, group 2 the one- or two-digit segment number.
PAT = re.compile(rb'\{"url": "([^"]*)",.*, "filename": ".*/segments/[0-9]*\.([0-9][0-9]?)/')

# NOTE: pickle.load can execute arbitrary code while unpickling; only run
# this against a pickle file from a trusted source.
with open('results/CC-MAIN-2019-35/warc_lmhx/ks_0-9.pickle', 'rb') as handle:
    d = pickle.load(handle)  # this takes ~20 seconds
    print(len(d))

N = 0     # records seen whose segment number is in 0-9
hits = 0  # of those, records whose cdate+url key was found in d

with igzip.IGzipFile('/beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00100.gz', 'rb') as cdx_in, \
     igzip.IGzipFile('results/CC-MAIN-2019-35/warc_lmhx/cdxp-00100.gz', 'wb') as cdx_out:
    for line in cdx_in:
        key, cdate, props = line.split(b' ', maxsplit=2)
        m = PAT.search(props)
        if m is None:
            raise ValueError(props)
        seg = int(m[2])
        if 0 <= seg < 10:
            N += 1
            # Keep the try body to the lookup alone so that a KeyError
            # raised anywhere else is not silently swallowed.
            try:
                lastmod = d[cdate + m[1]]
            except KeyError:
                pass
            else:
                hits += 1
                cdx_out.write(key)
                cdx_out.write(b' ')
                cdx_out.write(cdate)
                cdx_out.write(b' ')
                # Drop the final two bytes of the record (assumed to be the
                # closing `}` and the newline) so the new property can be
                # appended before a fresh `}\n`; memoryview avoids a copy.
                cdx_out.write(memoryview(props)[:-2])
                cdx_out.write(b', "lastmod": "%b"}\n' % lastmod)
                continue
        # No lookup hit, or segment outside 0-9: copy the record as is.
        cdx_out.write(line)
print(N, hits)