changeset 248:650383a798e5

with bloom prefilter
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 02 Jan 2025 18:30:03 +0000
parents 7737da0ccb8c
children 87a35540104b
files lib/python/cc/lmh/test_lookup2.py
diffstat 1 files changed, 51 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/lmh/test_lookup2.py	Thu Jan 02 18:30:03 2025 +0000
@@ -0,0 +1,51 @@
+#!/usr/bin/python3
+from isal import igzip
+from pybloomfilter import BloomFilter
+
+import re, pickle, timeit
+# Copy cdx-00100 to cdxp-00100, adding "lastmod" to segment 0-9 lines whose cdate+url is in the pickled dict
+
+# Raw literal: \{ and \. are invalid escapes in a plain bytes literal (SyntaxWarning on 3.12+)
+PAT = re.compile(rb'\{"url": "([^"]*)",.*, "filename": ".*/segments/[0-9]*\.([0-9][0-9]?)/')
+
+handle = None  # rebound by the with-statement below; the Timer statement reads it via globals()
+d = [0]        # one-slot box that the timed statement stores the unpickled object into
+
+uris = BloomFilter.open('results/CC-MAIN-2019-35/warc_lmhx/uris_20.bloom')
+
+with open('results/CC-MAIN-2019-35/warc_lmhx/ks_0-9.pickle', 'rb') as handle:
+  # NOTE(review): the elapsed time returned by timeit() is discarded -- confirm that is intended
+  t = timeit.Timer('d[0] = pickle.load(handle)', globals = globals())
+  t.timeit(number=1)
+  d = d[0]  # unwrap the box: from here on d is the loaded mapping, indexed by cdate+url
+  print(len(d))
+
+N = 0     # lines whose segment number is in 0-9
+hits = 0  # lines written out with an added "lastmod"
+fp = 0    # URIs accepted by the bloom filter whose cdate+url key is missing from d
+
+with igzip.IGzipFile('/beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00100.gz','rb') as cdx_in, igzip.IGzipFile('results/CC-MAIN-2019-35/warc_lmhx/cdxp-00100.gz', 'wb') as cdx_out:
+  for l in cdx_in:
+    key, cdate, props = l.split(b' ',maxsplit=2)
+    if (m:=PAT.search(props)) is None:
+      raise ValueError(props)
+    if 0 <= int(m[2]) < 10:
+      N += 1
+      u = m[1]
+      if u in uris:
+        try:
+          lastmod = d[cdate+u]  # own name, so the regex match m is no longer clobbered
+        except KeyError:
+          fp += 1  # bloom false positive: fall through and copy the line unchanged
+        else:
+          hits += 1
+          cdx_out.write(key)
+          cdx_out.write(b' ')
+          cdx_out.write(cdate)
+          cdx_out.write(b' ')
+          cdx_out.write(memoryview(props)[:-2])  # view, not a copy; assumes props ends with b'}\n' -- TODO confirm
+          cdx_out.write(b', "lastmod": "%b"}\n'%lastmod)
+          continue
+    cdx_out.write(l)
+print('%s entries, %s given lastmod, %s false positives'%(N,hits,fp))
+