Mercurial > hg > cc > work
comparison lurid3/notes.txt @ 46:49672e9b4c1c
unpackz.py working
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 01 Oct 2024 16:00:22 +0100 |
parents | 737c61f98cbf |
children | fbdaede4155a |
comparison
equal
deleted
inserted
replaced
45:737c61f98cbf | 46:49672e9b4c1c |
---|---|
371 Yes: | 371 Yes: |
372 | 372 |
373 >: echo $((60784640 % 8192)) | 373 >: echo $((60784640 % 8192)) |
374 0 | 374 0 |
375 | 375 |
376 Even with buffer 1MB: | |
377 21 | |
378 160245 | |
379 CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz | |
380 Process fail: Compressed file ended before the end-of-stream marker was reached, input: | |
381 length=8415, offset=1059033915, file=/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz | |
382 0 | |
383 160246 | |
384 | |
385 >: tail -60 /tmp/hst/r3b|head -20 | |
386 1059013061 423 | |
387 1059013484 7218 | |
388 1059020702 425 | |
389 1059021127 424 | |
390 1059021551 11471 | |
391 1059033022 426 | |
392 1059033448 467 | |
393 1059033915 8415 | |
394 | |
395 Argh. This is at the _same_ point (before 51 fails before EOF). Ah, | |
396 maybe that's the point -- this is the last read before EOF, and it's | |
397 not a full buffer! | |
398 | |
399 >: ix.py 467 1059033448 CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz|less | |
400 ... | |
401 WARC-Target-URI: https://zowiecarrpsychicmedium.com/tag/oracle/ | |
402 | |
403 Reran with more instrumentation, took at least all day: | |
404 | |
405 >: n=0 && ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2> /tmp/hst/r3e_err.txt | while read o l; do | |
406 echo $((n+=1)); echo $o $l >> /tmp/hst/r3e_val; ix.py $l $o CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz | wc -l; | |
407 done > /tmp/hst/r3e_log 2>&1 | |
408 >: wc -l /tmp/hst/r3e_err.txt | |
409 160296 /tmp/hst/r3e_err.txt | |
410 >: tail -60 /tmp/hst/r3e_err.txt|cat -n | grep -C2 True\ True | |
411 7 b 28738 28738 28312 426 False False | |
412 8 b 28312 28312 27845 467 False False | |
413 9 b 27845 378162 369747 8415 True True < this is the first hit, at the last | |
414 (partial) block | |
415 10 b 369747 369747 369312 435 False True | |
416 11 b 369312 369312 368878 434 False True | |
417 | |
418 >: tail -55 /tmp/hst/r3e_val | head -3 | |
419 1059033022 426 | |
420 1059033448 467 | |
421 1059033915 8415 | |
422 >: dd ibs=1 skip=1059033022 count=426 if=/beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz of=/dev/stdout | uz -t | |
423 ... | |
424 426 bytes copied, 0.00468243 s, 91.0 kB/s | |
425 sing<3411>: dd ibs=1 skip=1059033448 count=467 if=/beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz of=/dev/stdout | uz -t | |
426 ... | |
427 467 bytes copied, 0.00382692 s, 122 kB/s | |
428 sing<3412>: dd ibs=1 skip=1059033915 count=8415 if=/beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz of=/dev/stdout | uz -t | |
429 igzip: Error (null) does not contain a complete gzip file | |
430 ... | |
431 8415 bytes (8.4 kB, 8.2 KiB) copied, 0.00968889 s, 869 kB/s | |
432 | |
433 So, tried one change to use the actual size rather than BUFSIZE at | |
434 one point, seems to work now: | |
435 | |
436 >: time ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2> /tmp/hst/r3f_err.txt | tee /tmp/hst/r3f_val | while read l o; do printf '%s\t%s\t%s\n' $l $o 'CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz'; | |
437 done 2>&1 | tee /tmp/hst/r3f_log | ix.py -w | egrep -c '^WARC/1\.0' | |
438 160296 | |
439 real 3m48.393s | |
440 user 0m47.997s | |
441 sys 0m26.641s | |
442 | |
443 >: tail /tmp/hst/r3f_val | |
444 10851 1059370472 | |
445 475 1059381323 | |
446 444 1059381798 | |
447 22437 1059382242 | |
448 447 1059404679 | |
449 506 1059405126 | |
450 15183 1059405632 | |
451 471 1059420815 | |
452 457 1059421286 | |
453 17754 1059421743 | |
454 | |
455 >: wc -l /tmp/hst/*_val | |
456 171 /tmp/hst/r3d_val | |
457 160297 /tmp/hst/r3e_val | |
458 160296 /tmp/hst/r3f_val | |
459 320764 total | |
460 >: uz /tmp/hst/head.warc.gz |egrep -c '^WARC/1\.0.$' | |
461 171 | |
462 >: tail -n 3 /tmp/hst/*_val | |
463 ==> /tmp/hst/r3d_val <== | |
464 454 1351795 | |
465 414 1352249 | |
466 0 1352663 [so the 171 above is bogus, and we're missing one] | |
467 | |
468 ==> /tmp/hst/r3e_val <== | |
469 1059393441 457 | |
470 1059393898 17754 | |
471 0 [likewise bogus, so see below] | |
472 | |
473 ==> /tmp/hst/r3f_val <== | |
474 471 1059420815 | |
475 457 1059421286 | |
476 17754 1059421743 [better, but still one missing] | |
477 >: uz /tmp/hst/head.warc.gz |egrep '^WARC-Type: ' | tee >(wc -l 1>&2) | tail -4 | |
478 WARC-Type: response | |
479 WARC-Type: metadata | |
480 WARC-Type: request | |
481 WARC-Type: response [missing] | |
482 171 | |
483 >: ls -lt /tmp/hst/*_val | |
484 -rw-r--r-- 1 hst dc007 1977 Sep 29 09:27 /tmp/hst/r3d_val | |
485 -rw-r--r-- 1 hst dc007 2319237 Sep 28 14:28 /tmp/hst/r3f_val | |
486 -rw-r--r-- 1 hst dc007 2319238 Sep 27 19:41 /tmp/hst/r3e_val | |
487 >: ls -l ~/lib/python/unpackz.py | |
488 -rwxr-xr-x 1 hst dc007 1821 Sep 28 15:13 .../dc007/hst/lib/python/unpackz.py | |
489 So e and f are stale, rerun | |
490 >: time ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2>/tmp/hst/r3f_err.txt| tee /tmp/hst/r3f_val|while read l o; do printf '%s\t%s\t%s\n' $l $o 'CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz' ;done |& tee /tmp/hst/r3f_log |ix.py -w |egrep '^WARC-Type: ' | tail -4 & | |
491 >: Reading length, offset, filename tab-delimited triples from stdin... | |
492 WARC-Type: response | |
493 WARC-Type: metadata | |
494 WARC-Type: request | |
495 WARC-Type: response | |
496 | |
497 real 3m49.760s | |
498 user 0m47.180s | |
499 sys 0m32.218s | |
500 So missing the final metadata... | |
501 Back to head.warc.gz, with debug info | |
502 | |
503 >: n=0 && ~/lib/python/unpackz.py /tmp/hst/head.warc.gz 2>/tmp/hst/ttd.txt|while read l o; do echo $((n+=1)); echo $l $o >> /tmp/hst/r3d_val; dd ibs=1 skip=$o count=$l if=/tmp/hst/head.warc.gz of=/dev/stdout 2>/tmp/hst/r3d_ido| uz -t ; done >/tmp/hst/r3d_log 2>&1 | |
504 >: tail -2 /tmp/hst/r3d_log | |
505 171 | |
506 igzip: Error invalid gzip header found for file (null) | |
507 >: tail -n 3 /tmp/hst/ttd.txt /tmp/hst/r3d_val | |
508 ==> /tmp/hst/ttd.txt <== | |
509 b 9697 9697 9243 454 False True | |
510 b 9243 9243 8829 414 False True | |
511 n 8829 | |
512 | |
513 ==> /tmp/hst/r3d_val <== | |
514 454 1351795 | |
515 414 1352249 | |
516 0 1352663 | |
517 | |
518 >: cat -n /tmp/hst/r3f_val | head -172 | tail -4 | |
519 169 454 1351795 | |
520 170 414 1352249 | |
521 171 8829 1352663 | |
522 172 446 1361492 | |
523 | |
524 Fixed, maybe | |
525 | |
526 >: tail -n 3 /tmp/hst/r3d_log /tmp/hst/r3d_val | |
527 ==> /tmp/hst/r3d_log <== | |
528 169 | |
529 170 | |
530 171 | |
531 | |
532 ==> /tmp/hst/r3d_val <== | |
533 454 1351795 | |
534 414 1352249 | |
535 8829 1352663 | |
536 | |
537 Yes! | |
538 | |
539 >: time ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2>/tmp/hst/r3f_err| tee /tmp/hst/r3f_val|while read l o; do printf '%s\t%s\t%s\n' $l $o 'CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz' ;done |& tee /tmp/hst/r3f_log |ix.py -w |egrep '^WARC-Type: ' | tail -4 | |
540 Reading length, offset, filename tab-delimited triples from stdin... | |
541 WARC-Type: metadata | |
542 WARC-Type: request | |
543 WARC-Type: response | |
544 WARC-Type: metadata | |
545 | |
546 real 3m26.042s | |
547 user 0m44.167s | |
548 sys 0m24.716s | |
549 >: tail -n 3 /tmp/hst/r3f* | |
550 ==> /tmp/hst/r3f_err <== | |
551 | |
552 ==> /tmp/hst/r3f_val <== | |
553 457 1059421286 | |
554 17754 1059421743 | |
555 425 1059439497 | |
556 |