changeset 272:5c81ff10a66a

renamed
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 12 Feb 2025 12:59:28 +0000
parents ac367fc4b562
children 603b46f3062d
files lib/python/cc/lmh/test_cdb.py lib/python/cc/lmh/test_lookup3.py
diffstat 2 files changed, 69 insertions(+), 69 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/lmh/test_cdb.py	Wed Feb 12 12:59:28 2025 +0000
@@ -0,0 +1,69 @@
+#!/usr/bin/python3
+# cython: profile=False
+'''Usage: test_cdb.py cdbpat, e.g. .../cdb/ks_%d-%d.cdb'''
+
+
+import cython, typing, timeit, re, sys
+from db import CCdb
+
+from isal import igzip
+
+def mainp() -> None:
+  '''Open each cdb file named by the sys.argv[1] pattern, call CCdb.init on it, then exit.
+
+  sys.argv[1] is a %-format pattern taking two ints (see the module usage
+  string).  The cdx rewriting loop below the early sys.exit(0) is currently
+  unreachable, and still refers to a name d that this function never binds.'''
+  # Raw bytes literal: same pattern bytes, without the invalid \{ and \. escapes.
+  PAT: typing.Pattern[cython.bytes] = re.compile(rb'\{"url": "([^"]*)",.*, "filename": ".*/segments/[0-9]*\.([0-9][0-9]?)/')
+  N: int = 0     # cdx lines whose segment number is 0
+  hits: int = 0  # of those, lines for which d[k] succeeded
+  cdx_in: typing.BinaryIO
+  cdx_out: typing.BinaryIO
+  l: cython.bytes
+  m: typing.Match[cython.bytes] | None
+  D: int = 17  # number of cdb files / CCdb instances
+  CC: List[CCdb]
+  mv: List[char[::1]]
+  ff: List[file]
+  # 17 (b, e) pairs for the filename pattern: (0,5), (6,11), ... (90,95), (96,99).
+  bb: List[(int,int)] = list(zip(list(range(0,100,int(100/(D-1)))),
+                                 list(range(5,100,int(100/(D-1))))+[99]))
+
+  ff = [open(sys.argv[1]%(b,e),'rb') for b,e in bb]
+  CC = [CCdb() for i in range(D)]
+  mv = [CC[i].init(ff[i].fileno()) for i in range(D)]
+  # sys.exit rather than the site-provided exit(): same SystemExit(0), no site dependency.
+  sys.exit(0)
+  with igzip.IGzipFile('/beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00100.gz','rb') as cdx_in, igzip.IGzipFile('results/CC-MAIN-2019-35/warc_lmhx/cdxp-00100.gz', 'wb') as cdx_out:
+    for l in cdx_in:
+      key: cython.bytes
+      cdate: cython.bytes
+      props: cython.bytes
+      key, cdate, props = l.split(b' ',maxsplit=2)
+      if (m:=PAT.search(props)):
+        seg = int(m[2])
+        if seg == 0:
+          N += 1
+          k = cdate+m[1]
+          try:
+            lastmod: cython.bytes = d[k]  # NOTE(review): d is not bound anywhere in this file
+            hits += 1
+            cdx_out.write(key)
+            cdx_out.write(b' ')
+            cdx_out.write(cdate)
+            cdx_out.write(b' ')
+            cdx_out.write(memoryview(props)[:-2])
+            cdx_out.write(b', "lastmod": "%b"}\n'%lastmod)
+            continue
+          except KeyError:
+            pass
+      else:
+        raise ValueError(props)
+      cdx_out.write(l)
+  print('%s entries, %s given lastmod'%(N,hits))
+          
+if __name__ == "__main__":  # run mainp() only when executed as a script, not on import
+    mainp()
+
+
--- a/lib/python/cc/lmh/test_lookup3.py	Wed Feb 12 11:29:41 2025 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,69 +0,0 @@
-#!/usr/bin/python3
-# cython: profile=False
-'''Usage: test_lookup3.py cdbpat, e.g. .../cdb/ks_%d-%d.cdb'''
-
-
-import cython, typing, timeit, re, sys
-from db import CCdb
-
-from isal import igzip
-
-def mainp() -> None:
-  PAT: typing.Pattern[cython.bytes] = re.compile(b'\{"url": "([^"]*)",.*, "filename": ".*/segments/[0-9]*\.([0-9][0-9]?)/')
-
-  N: int = 0
-  hits: int = 0
-
-  cdx_in: typing.BinaryIO
-  cdx_out: typing.BinaryIO
-
-  l: cython.bytes
-  m: typing.Match[cython.bytes] | None
-
-  D: int = 17
-
-  CC: List[CCdb]
-  mv: List[char[::1]]
-  ff: List[file]
-
-  bb: List[(int,int)] = list(zip(list(range(0,100,int(100/(D-1)))),
-                                 list(range(5,100,int(100/(D-1))))+[99]))
-
-
-  ff = [open(sys.argv[1]%(b,e),'rb') for b,e in bb]
-  CC = [CCdb() for i in range(D)]
-  mv = [CC[i].init(ff[i].fileno()) for i in range(D)]
-
-  exit(0)
-  with igzip.IGzipFile('/beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00100.gz','rb') as cdx_in, igzip.IGzipFile('results/CC-MAIN-2019-35/warc_lmhx/cdxp-00100.gz', 'wb') as cdx_out:
-    for l in cdx_in:
-      key: cython.bytes
-      cdate: cython.bytes
-      props: cython.bytes
-      key, cdate, props = l.split(b' ',maxsplit=2)
-      if (m:=PAT.search(props)):
-        seg = int(m[2])
-        if seg == 0:
-          N += 1
-          k = cdate+m[1]
-          try:
-            mv: cython.bytes = d[k]
-            hits += 1
-            cdx_out.write(key)
-            cdx_out.write(b' ')
-            cdx_out.write(cdate)
-            cdx_out.write(b' ')
-            cdx_out.write(memoryview(props)[:-2])
-            cdx_out.write(b', "lastmod": "%b"}\n'%mv)
-            continue
-          except KeyError:
-            pass
-      else:
-        raise ValueError(props)
-      cdx_out.write(l)
-  print('%s entries, %s given lastmod'%(N,hits))
-          
-if __name__ == "__main__":
-    mainp()
-
-