changeset 247:7737da0ccb8c

try adding lm to existing index from ks_0-9
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 02 Jan 2025 14:52:14 +0000
parents 666069efb0c6
children 650383a798e5
files lib/python/cc/lmh/test_lookup1.py
diffstat 1 files changed, 38 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/lmh/test_lookup1.py	Thu Jan 02 14:52:14 2025 +0000
@@ -0,0 +1,38 @@
+#!/usr/bin/python3
+from isal import igzip
+import re, pickle
+
+# Copy cdx-00100 to cdxp-00100, adding a "lastmod" property to every line whose
+# capture-date+URL key is found in the ks_0-9 index; all other lines are copied unchanged.
+# PAT captures the URL (group 1) and the 1-2 digits after the '.' in the segment name (group 2).
+# Raw bytes literal, so the regex escapes \{ and \. are not invalid string escapes.
+PAT = re.compile(rb'\{"url": "([^"]*)",.*, "filename": ".*/segments/[0-9]*\.([0-9][0-9]?)/')
+
+# d maps capture-date+URL bytes to the value emitted as "lastmod" (must be bytes-like for %b).
+# NOTE(review): pickle.load can execute arbitrary code; only load locally produced files.
+with open('results/CC-MAIN-2019-35/warc_lmhx/ks_0-9.pickle', 'rb') as handle:
+  d = pickle.load(handle) # this takes ~20 seconds
+  print(len(d))
+
+N = 0     # lines whose segment number is below 10
+hits = 0  # of those, lines found in d and rewritten
+
+with igzip.IGzipFile('/beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00100.gz','rb') as cdx_in, igzip.IGzipFile('results/CC-MAIN-2019-35/warc_lmhx/cdxp-00100.gz', 'wb') as cdx_out:
+  for line in cdx_in:
+    key, cdate, props = line.split(b' ', maxsplit=2)
+    if (m := PAT.search(props)) is None:
+      raise ValueError(props)  # line lacks the expected url/filename properties
+    if int(m[2]) < 10:  # group 2 is all digits, so it can never be negative
+      N += 1
+      try:
+        lastmod = d[cdate + m[1]]
+      except KeyError:
+        pass  # not indexed: fall through and copy the line as-is
+      else:
+        hits += 1
+        # Drop the closing '}\n' (assumed to end every line), then append the new property.
+        cdx_out.write(b'%b %b %b, "lastmod": "%b"}\n' % (key, cdate, memoryview(props)[:-2], lastmod))
+        continue
+    cdx_out.write(line)
+print(N,hits)
+