changeset 253:79701366f438

Add some Cython decoration (a compiler directive and type annotations); not much effect
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 17 Jan 2025 20:37:10 +0000
parents 39c3835716f3
children aeb755b72a7d
files lib/python/cc/lmh/test_lookup1.py
diffstat 1 files changed, 47 insertions(+), 38 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/lmh/test_lookup1.py	Fri Jan 17 20:35:21 2025 +0000
+++ b/lib/python/cc/lmh/test_lookup1.py	Fri Jan 17 20:37:10 2025 +0000
@@ -1,45 +1,54 @@
 #!/usr/bin/python3
+# cython: profile=True
+import cython, typing
+
 from isal import igzip
 
-import re, pickle, timeit
-global d, handle
+import re, pickle
 
-PAT = re.compile(b'\{"url": "([^"]*)",.*, "filename": ".*/segments/[0-9]*\.([0-9][0-9]?)/')
-
-handle = None
-d = [0]
+def mainp() -> None:
+  PAT: typing.Pattern[cython.bytes] = re.compile(b'\{"url": "([^"]*)",.*, "filename": ".*/segments/[0-9]*\.([0-9][0-9]?)/')
 
-with open('results/CC-MAIN-2019-35/warc_lmhx/ks_0-9.pickle', 'rb') as handle:
-  t = timeit.Timer('d[0] = pickle.load(handle)', globals = globals())
-  t.timeit(number=1)
-  d = d[0]
-  print(len(d))
-
-N = 0
-hits = 0
+  handle: typing.BinaryIO
+  N: int = 0
+  hits: int = 0
+  with open('results/CC-MAIN-2019-35/warc_lmhx/ks_0-9.pickle', 'rb') as handle:
+    d: typing.Dict[cython.bytes,cython.bytes] = pickle.load(handle)
+    print(len(d))
 
-with igzip.IGzipFile('/beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00100.gz','rb') as cdx_in, igzip.IGzipFile('results/CC-MAIN-2019-35/warc_lmhx/cdxp-00100.gz', 'wb') as cdx_out:
-  for l in cdx_in:
-    key, cdate, props = l.split(b' ',maxsplit=2)
-    if (m:=PAT.search(props)):
-      seg = int(m[2])
-      if seg >= 0 and seg < 10:
-        N += 1
-        k = cdate+m[1]
-        try:
-          m = d[k]
-          hits += 1
-          cdx_out.write(key)
-          cdx_out.write(b' ')
-          cdx_out.write(cdate)
-          cdx_out.write(b' ')
-          cdx_out.write(memoryview(props)[:-2])
-          cdx_out.write(b', "lastmod": "%b"}\n'%m)
-          continue
-        except KeyError:
-          pass
-    else:
-      raise ValueError(props)
-    cdx_out.write(l)
-print('%s entries, %s given lastmod'%(N,hits))
+  cdx_in: typing.BinaryIO
+  cdx_out: typing.BinaryIO
+  l: cython.bytes
+  m: typing.Match[cython.bytes] | None
+  with igzip.IGzipFile('/beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00100.gz','rb') as cdx_in, igzip.IGzipFile('results/CC-MAIN-2019-35/warc_lmhx/cdxp-00100.gz', 'wb') as cdx_out:
+    for l in cdx_in:
+      key: cython.bytes
+      cdate: cython.bytes
+      props: cython.bytes
+      key, cdate, props = l.split(b' ',maxsplit=2)
+      if (m:=PAT.search(props)):
+        seg = int(m[2])
+        if seg >= 0 and seg < 10:
+          N += 1
+          k = cdate+m[1]
+          try:
+            mv: cython.bytes = d[k]
+            hits += 1
+            cdx_out.write(key)
+            cdx_out.write(b' ')
+            cdx_out.write(cdate)
+            cdx_out.write(b' ')
+            cdx_out.write(memoryview(props)[:-2])
+            cdx_out.write(b', "lastmod": "%b"}\n'%mv)
+            continue
+          except KeyError:
+            pass
+      else:
+        raise ValueError(props)
+      cdx_out.write(l)
+  print('%s entries, %s given lastmod'%(N,hits))
           
+if __name__ == "__main__":
+    mainp()
+
+