comparison lurid3/notes.txt @ 46:49672e9b4c1c

unpackz.py working
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 01 Oct 2024 16:00:22 +0100
parents 737c61f98cbf
children fbdaede4155a
comparison
equal deleted inserted replaced
45:737c61f98cbf 46:49672e9b4c1c
371 Yes: 371 Yes:
372 372
373 >: echo $((60784640 % 8192)) 373 >: echo $((60784640 % 8192))
374 0 374 0
375 375
376 Even with buffer 1MB:
377 21
378 160245
379 CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz
380 Process fail: Compressed file ended before the end-of-stream marker was reached, input:
381 length=8415, offset=1059033915, file=/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz
382 0
383 160246
384
385 >: tail -60 /tmp/hst/r3b|head -20
386 1059013061 423
387 1059013484 7218
388 1059020702 425
389 1059021127 424
390 1059021551 11471
391 1059033022 426
392 1059033448 467
393 1059033915 8415
394
395 Argh. This is at the _same_ point (before 51 fails before EOF). Ah,
396 maybe that's the point -- this is the last read before EOF, and it's
397 not a full buffer!
398
399 >: ix.py 467 1059033448 CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz|less
400 ...
401 WARC-Target-URI: https://zowiecarrpsychicmedium.com/tag/oracle/
402
403 Reran with more instrumentation, took at least all day:
404
405 >: n=0 && ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2> /tmp/hst/r3e_err.txt | while read o l; do
406 echo $((n+=1)); echo $o $l >> /tmp/hst/r3e_val; ix.py $l $o CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz | wc -l;
407 done > /tmp/hst/r3e_log 2>&1
408 >: wc -l /tmp/hst/r3e_err.txt
409 160296 /tmp/hst/r3e_err.txt
410 >: tail -60 /tmp/hst/r3e_err.txt|cat -n | grep -C2 True\ True
411 7 b 28738 28738 28312 426 False False
412 8 b 28312 28312 27845 467 False False
413 9 b 27845 378162 369747 8415 True True < this is the first hit on the last
414 (partial) block
415 10 b 369747 369747 369312 435 False True
416 11 b 369312 369312 368878 434 False True
417
418 >: tail -55 /tmp/hst/r3e_val | head -3
419 1059033022 426
420 1059033448 467
421 1059033915 8415
422 >: dd ibs=1 skip=1059033022 count=426 if=/beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz of=/dev/stdout | uz -t
423 ...
424 426 bytes copied, 0.00468243 s, 91.0 kB/s
425 sing<3411>: dd ibs=1 skip=1059033448 count=467 if=/beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz of=/dev/stdout | uz -t
426 ...
427 467 bytes copied, 0.00382692 s, 122 kB/s
428 sing<3412>: dd ibs=1 skip=1059033915 count=8415 if=/beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz of=/dev/stdout | uz -t
429 igzip: Error (null) does not contain a complete gzip file
430 ...
431 8415 bytes (8.4 kB, 8.2 KiB) copied, 0.00968889 s, 869 kB/s
432
433 So, tried one change to use the actual size rather than BUFSIZE at
434 one point, seems to work now:
435
436 >: time ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2> /tmp/hst/r3f_err.txt | tee /tmp/hst/r3f_val | while read l o; do printf '%s\t%s\t%s\n' $l $o 'CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz';
437 done 2>&1 | tee /tmp/hst/r3f_log | ix.py -w | egrep -c '^WARC/1\.0'
438 160296
439 real 3m48.393s
440 user 0m47.997s
441 sys 0m26.641s
442
443 >: tail /tmp/hst/r3f_val
444 10851 1059370472
445 475 1059381323
446 444 1059381798
447 22437 1059382242
448 447 1059404679
449 506 1059405126
450 15183 1059405632
451 471 1059420815
452 457 1059421286
453 17754 1059421743
454
455 >: wc -l /tmp/hst/*_val
456 171 /tmp/hst/r3d_val
457 160297 /tmp/hst/r3e_val
458 160296 /tmp/hst/r3f_val
459 320764 total
460 >: uz /tmp/hst/head.warc.gz |egrep -c '^WARC/1\.0.$'
461 171
462 >: tail -n 3 /tmp/hst/*_val
463 ==> /tmp/hst/r3d_val <==
464 454 1351795
465 414 1352249
466 0 1352663 [so the 171 above is bogus, and we're missing one]
467
468 ==> /tmp/hst/r3e_val <==
469 1059393441 457
470 1059393898 17754
471 0 [likewise bogus, so see below]
472
473 ==> /tmp/hst/r3f_val <==
474 471 1059420815
475 457 1059421286
476 17754 1059421743 [better, but still one missing]
477 >: uz /tmp/hst/head.warc.gz |egrep '^WARC-Type: ' | tee >(wc -l 1>&2) | tail -4
478 WARC-Type: response
479 WARC-Type: metadata
480 WARC-Type: request
481 WARC-Type: response [missing]
482 171
483 >: ls -lt /tmp/hst/*_val
484 -rw-r--r-- 1 hst dc007 1977 Sep 29 09:27 /tmp/hst/r3d_val
485 -rw-r--r-- 1 hst dc007 2319237 Sep 28 14:28 /tmp/hst/r3f_val
486 -rw-r--r-- 1 hst dc007 2319238 Sep 27 19:41 /tmp/hst/r3e_val
487 >: ls -l ~/lib/python/unpackz.py
488 -rwxr-xr-x 1 hst dc007 1821 Sep 28 15:13 .../dc007/hst/lib/python/unpackz.py
489 So e and f are stale, rerun
490 >: time ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2>/tmp/hst/r3f_err.txt| tee /tmp/hst/r3f_val|while read l o; do printf '%s\t%s\t%s\n' $l $o 'CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz' ;done |& tee /tmp/hst/r3f_log |ix.py -w |egrep '^WARC-Type: ' | tail -4 &
491 >: Reading length, offset, filename tab-delimited triples from stdin...
492 WARC-Type: response
493 WARC-Type: metadata
494 WARC-Type: request
495 WARC-Type: response
496
497 real 3m49.760s
498 user 0m47.180s
499 sys 0m32.218s
500 So missing the final metadata...
501 Back to head.warc.gz, with debug info
502
503 >: n=0 && ~/lib/python/unpackz.py /tmp/hst/head.warc.gz 2>/tmp/hst/ttd.txt|while read l o; do echo $((n+=1)); echo $l $o >> /tmp/hst/r3d_val; dd ibs=1 skip=$o count=$l if=/tmp/hst/head.warc.gz of=/dev/stdout 2>/tmp/hst/r3d_ido| uz -t ; done >/tmp/hst/r3d_log 2>&1
504 >: tail -2 /tmp/hst/r3d_log
505 171
506 igzip: Error invalid gzip header found for file (null)
507 >: tail -n 3 /tmp/hst/ttd.txt /tmp/hst/r3d_val
508 ==> /tmp/hst/ttd.txt <==
509 b 9697 9697 9243 454 False True
510 b 9243 9243 8829 414 False True
511 n 8829
512
513 ==> /tmp/hst/r3d_val <==
514 454 1351795
515 414 1352249
516 0 1352663
517
518 >: cat -n /tmp/hst/r3f_val | head -172 | tail -4
519 169 454 1351795
520 170 414 1352249
521 171 8829 1352663
522 172 446 1361492
523
524 Fixed, maybe
525
526 >: tail -n 3 /tmp/hst/r3d_log /tmp/hst/r3d_val
527 ==> /tmp/hst/r3d_log <==
528 169
529 170
530 171
531
532 ==> /tmp/hst/r3d_val <==
533 454 1351795
534 414 1352249
535 8829 1352663
536
537 Yes!
538
539 >: time ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2>/tmp/hst/r3f_err| tee /tmp/hst/r3f_val|while read l o; do printf '%s\t%s\t%s\n' $l $o 'CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz' ;done |& tee /tmp/hst/r3f_log |ix.py -w |egrep '^WARC-Type: ' | tail -4
540 Reading length, offset, filename tab-delimited triples from stdin...
541 WARC-Type: metadata
542 WARC-Type: request
543 WARC-Type: response
544 WARC-Type: metadata
545
546 real 3m26.042s
547 user 0m44.167s
548 sys 0m24.716s
549 >: tail -n 3 /tmp/hst/r3f*
550 ==> /tmp/hst/r3f_err <==
551
552 ==> /tmp/hst/r3f_val <==
553 457 1059421286
554 17754 1059421743
555 425 1059439497
556