changeset 274:5c013fd18ea0

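Working, but very slowly: output now written to /work/dc007/dc007/hst/results/CC-MAIN-2019-35/warc_lmhx/cdx-00100.gz (was results/.../cdxp-00100.gz); reversed annotation `int: i` corrected to `i: int`; lookup value now held in a new typed local `k` instead of reassigning `key`.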
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 12 Feb 2025 20:17:39 +0000
parents 603b46f3062d
children 5439c4c7777e
files lib/python/cc/lmh/test_cdb.py
diffstat 1 files changed, 4 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/lmh/test_cdb.py	Wed Feb 12 13:01:05 2025 +0000
+++ b/lib/python/cc/lmh/test_cdb.py	Wed Feb 12 20:17:39 2025 +0000
@@ -35,7 +35,7 @@
   CC = [CCdb() for i in range(D)]
   mv = [CC[i].init(ff[i].fileno()) for i in range(D)]
 
-  with igzip.IGzipFile('/beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00100.gz','rb') as cdx_in, igzip.IGzipFile('results/CC-MAIN-2019-35/warc_lmhx/cdxp-00100.gz', 'wb') as cdx_out:
+  with igzip.IGzipFile('/beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00100.gz','rb') as cdx_in, igzip.IGzipFile('/work/dc007/dc007/hst/results/CC-MAIN-2019-35/warc_lmhx/cdx-00100.gz', 'wb') as cdx_out:
     for l in cdx_in:
       key: cython.bytes
       cdate: cython.bytes
@@ -45,10 +45,10 @@
       if (m:=PAT.search(props)):
         seg = int(m[2])
         N += 1
-        int: i = int(seg / d)
-        key = cdate+m[1]
+        i: int = int(seg / d)
+        k: cython.bytes = cdate+m[1]
         CC[i].findstart()
-        if (res := CC[i].find(key)) == 1:
+        if (res := CC[i].find(k)) == 1:
           hits += 1
           cdx_out.write(key)
           cdx_out.write(b' ')