Mercurial > hg > cc > work
changeset 46:49672e9b4c1c
unpackz.py working
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 01 Oct 2024 16:00:22 +0100 |
parents | 737c61f98cbf |
children | fbdaede4155a |
files | lurid3/notes.txt |
diffstat | 1 files changed, 181 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/lurid3/notes.txt Thu Sep 26 17:47:58 2024 +0100 +++ b/lurid3/notes.txt Tue Oct 01 16:00:22 2024 +0100 @@ -373,3 +373,184 @@ >: echo $((60784640 % 8192)) 0 +Even with buffer 1MB: + 21 + 160245 + CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz + Process fail: Compressed file ended before the end-of-stream marker was reached, input: + length=8415, offset=1059033915, file=/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz + 0 + 160246 + + >: tail -60 /tmp/hst/r3b|head -20 + 1059013061 423 + 1059013484 7218 + 1059020702 425 + 1059021127 424 + 1059021551 11471 + 1059033022 426 + 1059033448 467 + 1059033915 8415 + +Argh. This is at the _same_ point (before 51 fails before EOF). Ah, +maybe that's the point -- this is the last read before EOF, and it's +not a full buffer! + + >: ix.py 467 1059033448 CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz|less + ... + WARC-Target-URI: https://zowiecarrpsychicmedium.com/tag/oracle/ + +Reran with more instrumentation, took at least all day: + + >: n=0 && ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2> /tmp/hst/r3e_err.txt | while read o l; do + echo $((n+=1)); echo $o $l >> /tmp/hst/r3e_val; ix.py $l $o CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz | wc -l; + done > /tmp/hst/r3e_log 2>&1 + >: wc -l /tmp/hst/r3e_err.txt + 160296 /tmp/hst/r3e_err.txt + >: tail -60 /tmp/hst/r3e_err.txt|cat -n | grep -C2 True\ True + 7 b 28738 28738 28312 426 False False + 8 b 28312 28312 27845 467 False False + 9 b 27845 378162 369747 8415 True True < this is the first hit the last + (partial) block + 10 b 369747 369747 369312 435 False True + 11 b 369312 369312 368878 434 False True + + >: tail -55 /tmp/hst/r3e_val | head -3 + 1059033022 426 + 1059033448 467 + 1059033915 8415 + >: dd 
ibs=1 skip=1059033022 count=426 if=/beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz of=/dev/stdout | uz -t + ... + 426 bytes copied, 0.00468243 s, 91.0 kB/s + sing<3411>: dd ibs=1 skip=1059033448 count=467 if=/beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz of=/dev/stdout | uz -t + ... + 467 bytes copied, 0.00382692 s, 122 kB/s + sing<3412>: dd ibs=1 skip=1059033915 count=8415 if=/beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz of=/dev/stdout | uz -t + igzip: Error (null) does not contain a complete gzip file + ... + 8415 bytes (8.4 kB, 8.2 KiB) copied, 0.00968889 s, 869 kB/s + +So, tried one change to use the actually size rather than BUFSIZE at +one point, seems to work now: + + >: time ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2> /tmp/hst/r3f_err.txt | tee /tmp/hst/r3f_val | while read l o; do printf '%s\t%s\t%s\n' $l $o 'CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz'; +done 2>&1 | tee /tmp/hst/r3f_log | ix.py -w | egrep -c '^WARC/1\.0' + 160296 + real 3m48.393s + user 0m47.997s + sys 0m26.641s + + >: tail /tmp/hst/r3f_val +10851 1059370472 +475 1059381323 +444 1059381798 +22437 1059382242 +447 1059404679 +506 1059405126 +15183 1059405632 +471 1059420815 +457 1059421286 +17754 1059421743 + + >: wc -l /tmp/hst/*_val + 171 /tmp/hst/r3d_val + 160297 /tmp/hst/r3e_val + 160296 /tmp/hst/r3f_val + 320764 total + >: uz /tmp/hst/head.warc.gz |egrep -c '^WARC/1\.0.$' + 171 + >: tail -n 3 /tmp/hst/*_val + ==> /tmp/hst/r3d_val <== + 454 1351795 + 414 1352249 + 0 1352663 [so the 171 above is bogus, and we're missing one] + + ==> /tmp/hst/r3e_val <== + 1059393441 457 + 1059393898 17754 + 0 [likewise bogus, so see below] + + ==> 
/tmp/hst/r3f_val <== + 471 1059420815 + 457 1059421286 + 17754 1059421743 [better, but still one missing] + >: uz /tmp/hst/head.warc.gz |egrep '^WARC-Type: ' | tee >(wc -l 1>&2) | tail -4 + WARC-Type: response + WARC-Type: metadata + WARC-Type: request + WARC-Type: response [missing] + 171 + >: ls -lt /tmp/hst/*_val + -rw-r--r-- 1 hst dc007 1977 Sep 29 09:27 /tmp/hst/r3d_val + -rw-r--r-- 1 hst dc007 2319237 Sep 28 14:28 /tmp/hst/r3f_val + -rw-r--r-- 1 hst dc007 2319238 Sep 27 19:41 /tmp/hst/r3e_val + >: ls -l ~/lib/python/unpackz.py + -rwxr-xr-x 1 hst dc007 1821 Sep 28 15:13 .../dc007/hst/lib/python/unpackz.py +So e and f are stale, rerun + >: time ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2>/tmp/hst/r3f_err.txt| tee /tmp/hst/r3f_val|while read l o; do printf '%s\t%s\t%s\n' $l $o 'CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz' ;done |& tee /tmp/hst/r3f_log |ix.py -w |egrep '^WARC-Type: ' | tail -4 & + >: Reading length, offset, filename tab-delimited triples from stdin... + WARC-Type: response + WARC-Type: metadata + WARC-Type: request + WARC-Type: response + + real 3m49.760s + user 0m47.180s + sys 0m32.218s +So missing the final metadata... 
+Back to head.warc.gz, with debug info + + >: n=0 && ~/lib/python/unpackz.py /tmp/hst/head.warc.gz 2>/tmp/hst/ttd.txt|while read l o; do echo $((n+=1)); echo $l $o >> /tmp/hst/r3d_val; dd ibs=1 skip=$o count=$l if=/tmp/hst/head.warc.gz of=/dev/stdout 2>/tmp/hst/r3d_ido| uz -t ; done >/tmp/hst/r3d_log 2>&1 + >: tail -2 /tmp/hst/r3d_log + 171 + igzip: Error invalid gzip header found for file (null) + >: tail -n 3 /tmp/hst/ttd.txt /tmp/hst/r3d_val + ==> /tmp/hst/ttd.txt <== + b 9697 9697 9243 454 False True + b 9243 9243 8829 414 False True + n 8829 + + ==> /tmp/hst/r3d_val <== + 454 1351795 + 414 1352249 + 0 1352663 + + >: cat -n /tmp/hst/r3f_val | head -172 | tail -4 + 169 454 1351795 + 170 414 1352249 + 171 8829 1352663 + 172 446 1361492 + +Fixed, maybe + + >: tail -n 3 /tmp/hst/r3d_log /tmp/hst/r3d_val + ==> /tmp/hst/r3d_log <== + 169 + 170 + 171 + + ==> /tmp/hst/r3d_val <== + 454 1351795 + 414 1352249 + 8829 1352663 + +Yes! + + >: time ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2>/tmp/hst/r3f_err| tee /tmp/hst/r3f_val|while read l o; do printf '%s\t%s\t%s\n' $l $o 'CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz' ;done |& tee /tmp/hst/r3f_log |ix.py -w |egrep '^WARC-Type: ' | tail -4 + Reading length, offset, filename tab-delimited triples from stdin... + WARC-Type: metadata + WARC-Type: request + WARC-Type: response + WARC-Type: metadata + + real 3m26.042s + user 0m44.167s + sys 0m24.716s + >: tail -n 3 /tmp/hst/r3f* + ==> /tmp/hst/r3f_err <== + + ==> /tmp/hst/r3f_val <== + 457 1059421286 + 17754 1059421743 + 425 1059439497 +