Mercurial > hg > cc > work
changeset 47:fbdaede4155a
cdx_extras and unpackz.py working
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 03 Oct 2024 18:16:05 +0100 |
parents | 49672e9b4c1c |
children | f688c437180b |
files | lurid3/notes.txt |
diffstat | 1 files changed, 201 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/lurid3/notes.txt Tue Oct 01 16:00:22 2024 +0100 +++ b/lurid3/notes.txt Thu Oct 03 18:16:05 2024 +0100 @@ -554,3 +554,204 @@ 17754 1059421743 425 1059439497 +Doubling buffer size doesn't speed up + >: time ~/lib/python/unpackz.py -b $((2 * 1024 * 1024)) /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2>/tmp/hst/r3g_err| tee /tmp/hst/r3g_val|while read l o; do printf '%s\t%s\t%s\n' $l $o 'CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz' ;done |& tee /tmp/hst/r3g_log |ix.py -w |egrep '^WARC-Type: ' | tail -4 + Reading length, offset, filename tab-delimited triples from stdin... + WARC-Type: metadata + WARC-Type: request + WARC-Type: response + WARC-Type: metadata + + real 3m34.519s + user 0m52.312s + sys 0m24.875s + +Tried using FileIO.readinto([a fixed buffer]), but didn't immediately +work. Abandoned because I still don't understand how zlib.decompress +works at all... + +Time to convert unpackz to a library which takes a callback +alternative to an output file -- Done + +W/o using callback, timing and structure for what we need for +re-indexing task looks encouraging: + >: time ~/lib/python/cc/unpackz.py -o /dev/stdout /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz |egrep -aA20 '^WARC-Type: response' | cut -f 1 -d ' ' | egrep -a '^WARC-' |sus | tee >(wc -l 1>&2) + 52468 WARC-Block-Digest: + 52468 WARC-Concurrent-To: + 52468 WARC-Date: + 52468 WARC-Identified-Payload-Type: + 52468 WARC-IP-Address: + 52468 WARC-Payload-Digest: + 52468 WARC-Record-ID: + 52468 WARC-Target-URI: + 52468 WARC-Type: + 52468 WARC-Warcinfo-ID: + 236 WARC-Truncated: + 11 + + real 0m20.308s + user 0m19.720s + sys 0m4.505s + +Whole thing, with no pre-filtering: + + >: time ~/lib/python/cc/unpackz.py -o /dev/stdout 
/beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz | cut -f 1 -d ' ' | egrep -a '^(WARC-|Content-|Last-Modified)' |sus | tee >(wc -l 1>&2) + 211794 Content-Length: + 211162 Content-Type: + 159323 WARC-Target-URI: + 159311 WARC-Warcinfo-ID: + 159301 WARC-Record-ID: + 159299 WARC-Date: + 159297 WARC-Type: + 105901 WARC-Concurrent-To: + 105896 WARC-IP-Address: + 52484 WARC-Block-Digest: + 52484 WARC-Identified-Payload-Type: + 52482 WARC-Payload-Digest: + 9239 Last-Modified: + 3941 Content-Language: + 2262 Content-Security-Policy: + 642 Content-language: + 326 Content-Security-Policy-Report-Only: + 238 WARC-Truncated: + 114 Content-Disposition: + 352 Content-*: + 1 WARC-Filename: + 42 + +real 0m30.896s +user 0m37.335s +sys 0m7.542s + +First 51 after WARC-Type: response + + >: time ~/lib/python/cc/unpackz.py -o /dev/stdout /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz |egrep -aA50 '^WARC-Type: response' | cut -f 1 -d ' ' | egrep -a '^(WARC-|Content-|Last-Modified)' |sus | tee >(wc -l 1>&2) + 106775 Content-Length: + 106485 Content-Type: + 55215 WARC-Type: + 55123 WARC-Date: + 54988 WARC-Record-ID: + 54551 WARC-Warcinfo-ID: + 54246 WARC-Target-URI: + 54025 WARC-Concurrent-To: + 52806 WARC-IP-Address: + 52468 WARC-Block-Digest: + 52468 WARC-Identified-Payload-Type: + 52468 WARC-Payload-Digest: + 9230 Last-Modified: + 3938 Content-Language: + 2261 Content-Security-Policy: + 639 Content-language: + 324 Content-Security-Policy-Report-Only: + 236 WARC-Truncated: + 114 Content-Disposition: + 342 Content-*: + 41 + + real 0m21.483s + user 0m22.372s + sys 0m5.400s + +So, not worth the risk, let's try python + + >: time ~/lib/python/cc/cdx_extras.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz|wc -l + 9238 + + real 0m25.426s + user 0m23.201s + sys 0m0.711s + +Looks good, but why 
9238 instead of 9239??? + + >: ~/lib/python/cc/unpackz.py -o /dev/stdout /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz | egrep -a '^Last-Modified: ' > /tmp/hst/lmo.tsv + +Argh. Serious bug in unpackz, wasn't handling cross-buffer-boundary +records correctly. Fixed. Redoing the above... + +No pre-filter: + >: uz /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz|egrep -c '^WARC/1\.0.$' + 160297 + + >: time ~/lib/python/cc/unpackz.py -o /dev/stdout /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz | cut -f 1 -d ' ' | egrep -a '^(WARC-|Content-|Last-Modified)' |sus | tee >(wc -l 1>&2) + + 213719 Content-Length: + 213088 Content-Type: + 160297 WARC-Date: + 160297 WARC-Record-ID: + 160297 WARC-Type: + 160296 WARC-Target-URI: + 160296 WARC-Warcinfo-ID: + 106864 WARC-Concurrent-To: + 106864 WARC-IP-Address: + 53432 WARC-Block-Digest: [consistent with 160297 == (3 * 53432) + 1] + 53432 WARC-Identified-Payload-Type: + 53432 WARC-Payload-Digest: + 9430 Last-Modified: + 4006 Content-Language: + 2325 Content-Security-Policy: + 653 Content-language: + 331 Content-Security-Policy-Report-Only: + 298 WARC-Truncated: + 128 Content-Disposition: + 83 Content-Location: + 67 Content-type: + 51 Content-MD5: + 45 Content-Script-Type: + 42 Content-Style-Type: + 31 Content-Transfer-Encoding: + 13 Content-disposition: + 8 Content-Md5: + 5 Content-Description: + 5 Content-script-type: + 5 Content-style-type: + 3 Content-transfer-encoding: + 2 Content-Encoding-handler: + 1 Content-DocumentTitle: + 1 Content-Hash: + 1 Content-ID: + 1 Content-Legth: + 1 Content-length: + 1 Content-Range: + 1 Content-Secure-Policy: + 1 Content-security-policy: + 1 Content-Type-Options: + 1 WARC-Filename: +42 + +real 0m28.876s +user 0m35.703s +sys 0m6.976s + + >: ~/lib/python/cc/unpackz.py -o /dev/stdout 
/beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz | egrep -a '^Last-Modified: ' > /tmp/hst/lmo.tsv + >: wc -l /tmp/hst/lmo.tsv + 9430 /tmp/hst/lmo.tsv + >: time ~/lib/python/cc/cdx_extras.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz > /tmp/hst/lm.tsv + + real 0m17.191s + user 0m15.739s + sys 0m0.594s + >: wc -l /tmp/hst/lm.tsv + 9423 /tmp/hst/lm.tsv + + >: diff <(sed 's/^Last-Modified: //' /tmp/hst/lmo.tsv | tr -d '\r') <(cut -f 3 /tmp/hst/lm.tsv) + 853d852 + < Mon, 19 Aug 2019 01:46:49 GMT + 4058d4056 + < Tue, 03 Nov 2015 21:31:18 GMT<br /> + 4405d4402 + < Mon, 19 Aug 2019 01:54:52 GMT + 5237,5238d5233 + < 3 + < Asia/Amman + 7009d7003 + < Mon, 19 Aug 2019 02:34:20 GMT + 9198d9191 + < Mon, 19 Aug 2019 02:14:49 GMT + +All good. The only implausible case is + < Mon, 19 Aug 2019 01:54:52 GMT +which turns out to be a case of two Last-Modified headers in the same +response record's HTTP headers. RFCs 2616 and 7230 rule it +out but neither specifies a recovery, so first-wins is as good as +anything, and indeed RFC 6797 specifies that. +