Mercurial > hg > cc > cirrus_work
changeset 247:7737da0ccb8c
try adding lm to existing index from ks_0-9
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 02 Jan 2025 14:52:14 +0000 |
parents | 666069efb0c6 |
children | 650383a798e5 |
files | lib/python/cc/lmh/test_lookup1.py |
diffstat | 1 files changed, 38 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/python3
"""Copy one CDX index file, adding a "lastmod" property where one is known.

A pickled mapping is loaded from ks_0-9.pickle.  Its keys are the
concatenation of a CDX line's second field and the URL taken from the
line's properties; its values are bytes that get inserted as the value
of a new "lastmod" property.  (Presumably "lm" in the changeset summary
means last-modified -- confirm.)

Each line of cdx-00100.gz is split into three space-separated fields.
The segment number is the one- or two-digit group that follows the dot
in the segment directory name inside the "filename" property.  Lines
whose segment number is in 0..9 and whose key is in the mapping are
rewritten with the extra property; every other line is copied through
unchanged.  A line that does not match PAT aborts the run with
ValueError.

On completion, prints the number of lines in segments 0..9 and the
number of those for which a "lastmod" value was found.
"""
import pickle
import re

from isal import igzip

# Raw bytes literal: the backslashes are regex escapes, not string escapes.
# Group 1 is the URL, group 2 the digits after the dot in the segment name.
PAT = re.compile(
    rb'\{"url": "([^"]*)",.*, "filename": ".*/segments/[0-9]*\.([0-9][0-9]?)/')

# NOTE(review): pickle.load can execute arbitrary code; this assumes the
# pickle is a trusted, locally produced file -- confirm.
with open('results/CC-MAIN-2019-35/warc_lmhx/ks_0-9.pickle', 'rb') as handle:
    d = pickle.load(handle)  # this takes ~20 seconds
    print(len(d))

N = 0      # lines whose segment number is in 0..9
hits = 0   # of those, lines whose key was found in d

with igzip.IGzipFile('/beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00100.gz',
                     'rb') as cdx_in, \
     igzip.IGzipFile('results/CC-MAIN-2019-35/warc_lmhx/cdxp-00100.gz',
                     'wb') as cdx_out:
    for line in cdx_in:
        key, cdate, props = line.split(b' ', maxsplit=2)
        m = PAT.search(props)
        if m is None:
            raise ValueError(props)
        seg = int(m[2])
        if 0 <= seg < 10:
            N += 1
            # Keep the try body to the lookup alone, so that a KeyError
            # raised anywhere else is not mistaken for a missing key.
            try:
                lastmod = d[cdate + m[1]]
            except KeyError:
                pass
            else:
                hits += 1
                cdx_out.write(key)
                cdx_out.write(b' ')
                cdx_out.write(cdate)
                cdx_out.write(b' ')
                # Drop the last two bytes of the line (expected to be the
                # closing brace and newline) without copying props, then
                # re-close the object after the new property.
                cdx_out.write(memoryview(props)[:-2])
                cdx_out.write(b', "lastmod": "%b"}\n' % lastmod)
                continue
        # No lastmod available for this line: copy it through unchanged.
        cdx_out.write(line)
print(N, hits)