# HG changeset patch # User Henry S. Thompson # Date 1735830108 0 # Node ID 3be7b53d726edc04794b4a80e6c56aa58986afcb # Parent d9ba3ce783ff0eba18ae2293d6719d7dd0678c60 using python dict test diff -r d9ba3ce783ff -r 3be7b53d726e lurid3/notes.txt --- a/lurid3/notes.txt Wed Jan 01 23:03:07 2025 +0000 +++ b/lurid3/notes.txt Thu Jan 02 15:01:48 2025 +0000 @@ -883,13 +883,39 @@ 52369734 [69.63967163302004, 69.09140252694488, 66.49750975705683] That's tolerable. - >: ~/lib/python/cc/lmh/test.py -r 3 -f /work/dc007/dc007/hst/results/CC-MAIN-2019-35/warc_lmhx/ks_0-9.tsv + >: ~/lib/python/cc/lmh/test_hash.py -r 3 -f /work/dc007/dc007/hst/results/CC-MAIN-2019-35/warc_lmhx/ks_0-9.tsv 52369734 52369734 52369734 [64.51177835091949, 71.6610240675509, 67.74966451153159] [0.0034751780331134796, 0.0034532323479652405, 0.0033454522490501404] Last line is 100000 lookups. + +So, try a test: + >: time ~/lib/python/cc/lmh/test_hash.py -r 1 -p results/CC-MAIN-2019-35/warc_lmhx/ks_0-9.pickle -f /work/dc007/dc007/hst/results/CC-MAIN-2019-35/warc_lmhx/ks_0-9.tsv + 52369734 + [70.98342595621943] + [0.0037928372621536255] + + real 1m51.456s + user 1m32.901s + sys 0m17.937s + >: ls -lh results/CC-MAIN-2019-35/warc_lmhx/ks_0-9.pickle + -rw-r--r-- 1 hst dc007 5.5G Jan 2 12:19 results/CC-MAIN-2019-35/warc_lmhx/ks_0-9.pickle + cdx_out.write(b' ') + cdx_out.write(b' ') + >: time ~/lib/python/cc/lmh/test_lookup1.py + 52369734 + 1076046 130318 + + real 1m52.668s + user 1m40.751s + sys 0m9.610s + +Not bad. 1.5 minutes per file, plus 10 x 20 secs or so for the +unpickles =~ 453 minutes == 8 hours. + +Try pre-filter with the Bloom filter. ================