changeset 46:49672e9b4c1c

unpackz.py working
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 01 Oct 2024 16:00:22 +0100
parents 737c61f98cbf
children fbdaede4155a
files lurid3/notes.txt
diffstat 1 files changed, 181 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/lurid3/notes.txt	Thu Sep 26 17:47:58 2024 +0100
+++ b/lurid3/notes.txt	Tue Oct 01 16:00:22 2024 +0100
@@ -373,3 +373,184 @@
   >: echo $((60784640 % 8192))
   0
 
+Even with buffer 1MB:
+  21
+  160245
+  CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz
+  Process fail: Compressed file ended before the end-of-stream marker was reached, input:
+   length=8415, offset=1059033915, file=/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz
+  0
+  160246
+
+  >: tail -60 /tmp/hst/r3b|head -20
+  1059013061 423
+  1059013484 7218
+  1059020702 425
+  1059021127 424
+  1059021551 11471
+  1059033022 426
+  1059033448 467
+  1059033915 8415
+
+Argh.  This is at the _same_ point (before 51 fails before EOF).  Ah,
+maybe that's the point -- this is the last read before EOF, and it's
+not a full buffer!
+
+  >: ix.py 467 1059033448   CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz|less
+  ...
+  WARC-Target-URI: https://zowiecarrpsychicmedium.com/tag/oracle/
+
+Reran with more instrumentation, took at least all day:
+
+  >: n=0 && ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2> /tmp/hst/r3e_err.txt | while read o l; do
+      echo $((n+=1)); echo $o $l >> /tmp/hst/r3e_val; ix.py $l $o CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz | wc -l;
+  done > /tmp/hst/r3e_log 2>&1
+  >: wc -l /tmp/hst/r3e_err.txt
+  160296 /tmp/hst/r3e_err.txt
+  >: tail -60 /tmp/hst/r3e_err.txt|cat -n | grep -C2 True\ True
+       7  b 28738 28738 28312 426 False False
+       8  b 28312 28312 27845 467 False False
+       9  b 27845 378162 369747 8415 True True  < this is where we first hit the last
+                                                  (partial) block
+      10  b 369747 369747 369312 435 False True
+      11  b 369312 369312 368878 434 False True
+
+  >: tail -55 /tmp/hst/r3e_val | head -3
+  1059033022 426
+  1059033448 467
+  1059033915 8415
+  >: dd ibs=1  skip=1059033022 count=426 if=/beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz of=/dev/stdout | uz -t
+  ...
+  426 bytes copied, 0.00468243 s, 91.0 kB/s
+  sing<3411>: dd ibs=1  skip=1059033448 count=467 if=/beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz of=/dev/stdout | uz -t
+  ...
+  467 bytes copied, 0.00382692 s, 122 kB/s
+  sing<3412>: dd ibs=1  skip=1059033915 count=8415 if=/beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz of=/dev/stdout | uz -t
+  igzip: Error (null) does not contain a complete gzip file
+  ...
+  8415 bytes (8.4 kB, 8.2 KiB) copied, 0.00968889 s, 869 kB/s
+
+So, tried one change to use the actual size rather than BUFSIZE at
+one point, seems to work now:
+
+  >: time ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2> /tmp/hst/r3f_err.txt | tee /tmp/hst/r3f_val | while read l o; do printf '%s\t%s\t%s\n' $l $o 'CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz';
+done 2>&1 | tee /tmp/hst/r3f_log | ix.py -w | egrep -c '^WARC/1\.0'
+  160296
+  real  3m48.393s
+  user  0m47.997s
+  sys   0m26.641s
+
+  >: tail /tmp/hst/r3f_val
+10851 1059370472
+475 1059381323
+444 1059381798
+22437 1059382242
+447 1059404679
+506 1059405126
+15183 1059405632
+471 1059420815
+457 1059421286
+17754 1059421743
+
+  >: wc -l /tmp/hst/*_val
+    171 /tmp/hst/r3d_val
+  160297 /tmp/hst/r3e_val
+  160296 /tmp/hst/r3f_val
+  320764 total
+  >: uz /tmp/hst/head.warc.gz |egrep -c '^WARC/1\.0.$'
+  171
+  >: tail -n 3 /tmp/hst/*_val
+  ==> /tmp/hst/r3d_val <==
+  454 1351795
+  414 1352249
+  0 1352663 [so the 171 above is bogus, and we're missing one]
+
+  ==> /tmp/hst/r3e_val <==
+  1059393441 457
+  1059393898 17754
+  0 [likewise bogus, so see below]
+
+  ==> /tmp/hst/r3f_val <==
+  471 1059420815
+  457 1059421286
+  17754 1059421743 [better, but still one missing]
+  >: uz /tmp/hst/head.warc.gz |egrep '^WARC-Type: ' | tee >(wc -l 1>&2) | tail -4
+  WARC-Type: response
+  WARC-Type: metadata
+  WARC-Type: request
+  WARC-Type: response [missing]
+  171
+  >: ls -lt /tmp/hst/*_val
+  -rw-r--r-- 1 hst dc007    1977 Sep 29 09:27 /tmp/hst/r3d_val
+  -rw-r--r-- 1 hst dc007 2319237 Sep 28 14:28 /tmp/hst/r3f_val
+  -rw-r--r-- 1 hst dc007 2319238 Sep 27 19:41 /tmp/hst/r3e_val
+  >: ls -l ~/lib/python/unpackz.py
+  -rwxr-xr-x 1 hst dc007 1821 Sep 28 15:13 .../dc007/hst/lib/python/unpackz.py
+So e and f are stale, rerun
+  >: time ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2>/tmp/hst/r3f_err.txt| tee /tmp/hst/r3f_val|while read l o; do printf '%s\t%s\t%s\n' $l $o 'CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz' ;done |& tee /tmp/hst/r3f_log |ix.py -w |egrep '^WARC-Type: ' | tail -4 &
+  >: Reading length, offset, filename tab-delimited triples from stdin...
+  WARC-Type: response
+  WARC-Type: metadata
+  WARC-Type: request
+  WARC-Type: response
+
+  real  3m49.760s
+  user  0m47.180s
+  sys   0m32.218s
+So missing the final metadata...
+Back to head.warc.gz, with debug info
+   
+  >: n=0 && ~/lib/python/unpackz.py /tmp/hst/head.warc.gz 2>/tmp/hst/ttd.txt|while read l o; do echo $((n+=1)); echo $l $o >> /tmp/hst/r3d_val; dd ibs=1 skip=$o count=$l if=/tmp/hst/head.warc.gz of=/dev/stdout 2>/tmp/hst/r3d_ido| uz -t ; done >/tmp/hst/r3d_log 2>&1
+  >: tail -2 /tmp/hst/r3d_log
+  171
+  igzip: Error invalid gzip header found for file (null)
+  >: tail -n 3 /tmp/hst/ttd.txt /tmp/hst/r3d_val
+  ==> /tmp/hst/ttd.txt <==
+  b 9697 9697 9243 454 False True
+  b 9243 9243 8829 414 False True
+  n 8829
+
+  ==> /tmp/hst/r3d_val <==
+  454 1351795
+  414 1352249
+  0 1352663
+
+  >: cat -n /tmp/hst/r3f_val | head -172 | tail -4
+     169  454 1351795
+     170  414 1352249
+     171  8829 1352663
+     172  446 1361492
+
+Fixed, maybe
+
+  >: tail -n 3 /tmp/hst/r3d_log /tmp/hst/r3d_val
+  ==> /tmp/hst/r3d_log <==
+  169
+  170
+  171
+
+  ==> /tmp/hst/r3d_val <==
+  454 1351795
+  414 1352249
+  8829 1352663
+
+Yes!
+
+  >: time ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2>/tmp/hst/r3f_err| tee /tmp/hst/r3f_val|while read l o; do printf '%s\t%s\t%s\n' $l $o 'CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz' ;done |& tee /tmp/hst/r3f_log |ix.py -w |egrep '^WARC-Type: ' | tail -4
+  Reading length, offset, filename tab-delimited triples from stdin...
+  WARC-Type: metadata
+  WARC-Type: request
+  WARC-Type: response
+  WARC-Type: metadata
+
+  real  3m26.042s
+  user  0m44.167s
+  sys   0m24.716s
+  >: tail -n 3 /tmp/hst/r3f*
+  ==> /tmp/hst/r3f_err <==
+
+  ==> /tmp/hst/r3f_val <==
+  457 1059421286
+  17754 1059421743
+  425 1059439497
+