changeset 249:87a35540104b

time the unpickling
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 02 Jan 2025 18:35:08 +0000
parents 650383a798e5
children 417103100fd0
files lib/python/cc/lmh/test_lookup1.py
diffstat 1 files changed, 10 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/lmh/test_lookup1.py	Thu Jan 02 18:30:03 2025 +0000
+++ b/lib/python/cc/lmh/test_lookup1.py	Thu Jan 02 18:35:08 2025 +0000
@@ -1,11 +1,18 @@
 #!/usr/bin/python3
 from isal import igzip
-import re, pickle
+
+import re, pickle, timeit
+global d, handle
 
 PAT = re.compile(b'\{"url": "([^"]*)",.*, "filename": ".*/segments/[0-9]*\.([0-9][0-9]?)/')
 
+handle = None
+d = [0]
+
 with open('results/CC-MAIN-2019-35/warc_lmhx/ks_0-9.pickle', 'rb') as handle:
-  d = pickle.load(handle) # this takes ~20 seconds
+  t = timeit.Timer('d[0] = pickle.load(handle)', globals = globals())
+  t.timeit(number=1)
+  d = d[0]
   print(len(d))
 
 N = 0
@@ -34,5 +41,5 @@
     else:
       raise ValueError(props)
     cdx_out.write(l)
-print(N,hits)
+print('%s entries, %s given lastmod'%(N,hits))