Mercurial > hg > cc > work
diff lurid3/notes.txt @ 55:237105932af5
merge
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 15 Oct 2024 16:06:27 +0100 |
parents | d533894173d0 |
children | dd06d7afbfe0 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lurid3/notes.txt Tue Oct 15 16:06:27 2024 +0100 @@ -0,0 +1,987 @@ +See old_notes.txt for all older notes on Common Crawl dataprocessing, +starting from Azure via Turing and then LURID and LURID2. + +Installed /beegfs/common_crawl/CC-MAIN-2024-33/cdx + >: cd results/CC-MAIN-2024-33/cdx/ + >: cut -f 2 counts.tsv | btot + 2,793,986,828 + +State of play wrt data -- see status.xlsx + +[in trying to tabulate the date ranges of the crawls, I found that the +WARC timestamp is sometimes bogus: + + >: fgrep ' 2009' CC-MAIN-2018-34/cdx/cluster.idx + net,tyredeyes)/robots.txt 20090201191318 cdx-00230.gz 160573468 198277 920675 + + >: zgrep '^net,tyredeyes)/robots.txt' CC-MAIN-2018-34/cdx/warc/cdx-00230.gz + net,tyredeyes)/robots.txt 20090201191318 {"url": "http://tyredeyes.net/robots.txt", "mime": "text/html", "mime-detected": "text/html", "status": "301", "digest": "QH732FYSV7UM34JYWVYMB7EZGR2CYM6B", "length": "582", "offset": "1224614", "filename": "crawl-data/CC-MAIN-2018-34/segments/1534221215075.58/robotstxt/CC-MAIN-20180819090604-20180819110604-00558.warc.gz"} + net,tyredeyes)/robots.txt 20090201191319 {"url": "http://www.tyredeyes.net/robots.txt", "mime": "text/plain", "mime-detected": "text/plain", "status": "200", "digest": "PSX5IZU4B4SIXGNDKXCVFH75Q27VHUTJ", "length": "549", "offset": "2069841", "filename": "crawl-data/CC-MAIN-2018-34/segments/1534221215075.58/robotstxt/CC-MAIN-20180819090604-20180819110604-00485.warc.gz"} + +This happens in 2019-35 as well :-( + + >: fgrep ' 20181023' CC-MAIN-2019-35/cdx/cluster.idx + com,gyshbsh)/robots.txt 20181023022000 cdx-00078.gz 356340085 162332 315406 + >: zgrep ' 20181023' CC-MAIN-2019-35/cdx/warc/cdx-00078.gz + com,gyshbsh)/robots.txt 20181023022000 {"url": "http://gyshbsh.com/robots.txt", "mime": "text/html", "mime-detected": "text/html", "status": "301", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "529", "offset": "638892", "filename": 
"crawl-data/CC-MAIN-2019-35/segments/1566027315618.73/robotstxt/CC-MAIN-20190820200701-20190820222701-00120.warc.gz"} + ... + +Tabulate all the date ranges for the WARC files we have + + >: for d in {2017-30,2019-35,2020-34,2021-25,2023-40,2023-50}; do printf "%s\t" $d; (ls CC-MAIN-$d/*.{?,??}/orig/warc | fgrep .gz | cut -f 3,4 -d - | sort -u |tee /dev/fd/3 | head -1 ) 3> >( tail -1 ) | tr '\n' '\t'; echo; done | cut -f 1,2,4 -d - | sed 's/-20/ 20/;s/.$//' | tr ' ' '\t' > dates.tsv + >: for d in {2018-30,2018-34}; do printf "%s\t" $d; (ls CC-MAIN-$d/{*.?,*.??} | fgrep warc.gz | cut -f 3,4 -d - | sort -u |tee /dev/fd/3 | { sleep 10 ; head -1 ; } ) 3> >( tail -1 ) | tr '\n' '\t'; echo; done >> dates.tsv + >: for d in 2019-18; do printf "%s\t" $d; (ls CC-MAIN-$d/{*.?,*.??} | fgrep warc.gz | cut -f 3,4 -d - | sort -u | head -1); done +2019-18 20190418101243-20190418122248 + >: for d in 2019-18; do printf "%s\t" $d; (ls CC-MAIN-$d/{*.?,*.??} | fgrep warc.gz | cut -f 3,4 -d - | sort -u | tail -1); done +2019-18 20190426153423-20190426175423 + >: echo 2019-18 20190418101243-20190418122248 20190426153423-20190426175423 >> dates.tsv + >: pwd + /beegfs/common_crawl/CC-MAIN-2016-30/cdx/warc + >: echo {000..299} | tr ' ' '\n' | parallel -j 10 'uz cdx-00{}.gz | cut -f 2 -d " " | sort -u > /tmp/hst/{}' + >: sort -mu /tmp/hst/??? > /tmp/hst/all + >: wc -l /tmp/hst/all + 679686 /tmp/hst/all + >: head -1 /tmp/hst/all + 20160723090435 + >: tail -1 /tmp/hst/all + 20160731110639 + >: cd ../../.. 
+ >: echo 2016-30 20160723090435 20160731110639 >> dates.tsv +tweaked and sorted in xemacs: + 2016-30 20160723090435 20160731110639 + 2017-30 20170720121902 20170729132938 + 2018-30 20180715183800 20180723184955 + 2018-34 20180814062251 20180822085454 + 2019-18 20190418101243 20190426175423 + 2019-35 20190817102624 20190826111356 + 2020-34 20200803083123 20200815214756 + 2021-25 20210612103920 20210625145905 + 2023-40 20230921073711 20231005042006 + 2023-50 20231128083443 20231212000408 + +Added to status.xlsx in shortened form, with number of days + 8 + 9 + 8 + 8 + 8 + 9 + 12 + 13 + 15 + 15 + +Fill a gap by downloading 2022-33 + + >: for s in 0; do ~/bin/getcc_multi.aws CC-MAIN-2022-33 $s 5; done > /tmp/hst/get_22-33_0.log & + 130 minutes... + >: for s in 1; do ~/bin/getcc_multi.aws CC-MAIN-2022-33 $s 10; done > /tmp/hst/get_22-33_1.log & + 59 minutes + +Another day to get to a quarter? + >: for s in {2..23}; do ~/bin/getcc_multi.aws CC-MAIN-2022-33 $s 10; done > /tmp/hst/get_22-33_2-23.log & + + +And finally 2015-35 +Fetched in just 2 chunks, 0-9 and 10-99, e.g. + >: for s in {10..99}; do ~/bin/getcc_multi.aws CC-MAIN-2015-35 $s 10; done > /tmp/hst/get_15-35_10-99.log & + +Much smaller. 
+Compare 2023-40, with 900 files per segment: + >: lss */orig/warc/*-0023?.* | cut -f 5 -d ' ' | stats + n = 1000 + min = 1.14775e+09 + max = 1.26702e+09 + sum = 1.20192e+12 + mean = 1.20192e+09 + sd = 2.26049e+07 + +with 2015-35, with 353 files per segment + >: lss */orig/warc/*-0023?-* | cut -f 5 -d ' ' | stats + n = 1000 + min = 1.66471e+08 + max = 9.6322e+08 + sum = 9.19222e+11 + mean = 9.19222e+08 + sd = 8.20542e+07 + +The min files all come from segment 1440644060633.7, whose files are +_all_ small: + >: uz *00123-*.gz | wc -l + 12,759,931 +Compare to 1440644060103.8 + >: zcat *00123-*.gz | wc -l + 75,806,738 +Mystery + +Also faster +Compare 2022-33: + >: fgrep -h BST /tmp/hst/get_22-33_{2-23,24-49,50-99}.log | cut -f 1-7 -d ' ' | while read s; do if read e; then echo $((($(date --date="$e" +%s) - $(date --date="$s" +%s)) / 60)); fi; done | stats n min max mean sd + 98 19 256 75.1 25.2 +with 2015-35: + >: fgrep -h BST /tmp/hst/get_15-35_{0-9,10-99}.log | cut -f 1-7 -d ' ' | while read s; do if read e; then echo $((($(date --date="$e" +%s) - $(date --date="$s" +%s)) / 60)); fi; done | stats n min max mean sd + 100 15 40 32.6 2.9 + + >: echo {000..299} | tr ' ' '\n' | parallel -j 10 'uz cdx-00{}.gz | cut -f 2 -d " " | sort -u > /tmp/hst/2015_{}' & + >: sort --parallel=10 -mu /tmp/hst/2015_??? > /tmp/hst/2015_all + >: head -1 /tmp/hst/2015_all + 20150827191534 + >: tail -1 /tmp/hst/2015_all + 20150905180914 + >: wc -l /tmp/hst/2015_all + 698128 /tmp/hst/2015_all + +What about wet files -- do they include text from pdfs? What about +truncated pdfs? + + >: time for s in 0; do ~/bin/getcc_wet_multi.aws CC-MAIN-2019-35 $s 10; done > /tmp/hst/get_wet_19-35_0.log & + real 26m3.049s + user 0m1.225s + sys 0m1.310s + +In the segment 0 cdx file (!) 
we find 3747 probable truncations: + >: zgrep -a '"mime-detected": "application/pdf", ' cdx.gz > /tmp/hst/2019-35_seg0_pdf.idx + >: wc -l /tmp/hst/2019-35_seg0_pdf.idx + 42345 /tmp/hst/2019-35_seg0_pdf.idx + >: egrep -a '"length": "10....."' /tmp/hst/2019-35_seg0_pdf.idx > /tmp/hst/2019-35_seg0_long_pdf.idx & + >: wc -l < /tmp/hst/2019-35_seg0_long_pdf.idx + 3747 +Of which 70 are in file 0: + >: egrep -a '.-00000\.' /tmp/hst/2019-35_seg0_pdf.idx > /tmp/hst/2019-35_seg0_file0_pdf.idx + >: wc -l /tmp/hst/2019-35_seg0_file0_pdf.idx + 70 /tmp/hst/2019-35_seg0_file0_pdf.idx + +In segment 0 file 0 we find 70 application/pdf Content-Type headers: + >: ix.py -h -w -x </tmp/hst/2019-35_seg0_file0_pdf.idx |egrep '^(WARC-Target-URI:|Content-Length:) '|cut -f 2 -d ' ' |tr -d '\r'|while read l1; do read uri; read l2; printf '%s\t%s\t%s\n' $l1 $l2 "$uri"; done > ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv + >: wc -l < ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv + 70 + >: head -3 ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv + + +Of which 14 are truncated: + >: fgrep -c 1048576 ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv + 14 + +E.g. + >: fgrep 1048576 ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv | head -3 + 1049051 1048576 https://en.heks.ch/sites/default/files/documents/2017-09/HEKS_EPER_Mission_Statement_2016_e.pdf + 1049469 1048576 https://bmcmicrobiol.biomedcentral.com/track/pdf/10.1186/s12866-017-0951-4 + 1048824 1048576 https://citydocs.fcgov.com/?action=cao-cases&cmd=convert&docid=3332339 + +Are any of the pdfs in the corresponding wet file? + +Yes, 2: + >: cut -f 3 ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv | fgrep -af - <(uz 1566027313501.0/orig/wet/*-00000.warc.wet.gz) + WARC-Target-URI: http://bdds.deux-sevres.com/recherche/simple/Editeur/2/Belfond/vignette?format=pdf + WARC-Target-URI: https://museum.wrap.gov.tw/GetFile4.ashx?Serial=201609200919D00 + +Is it in fact corresponding? 
+ >: diff -bw <(uz 1566027313501.0/orig/warc/*-00000.warc.gz | egrep -a '^WARC-Target-URI: ' | uniq | head -1000) <(uz 1566027313501.0/orig/wet/*-00000.warc.wet.gz | egrep -a '^WARC-Target-URI: ' | head -1000)|egrep -c '^<' + 19 + +So, yes, mostly. .2% are missing + +Just checking the search: + >: cut -f 3 ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv | fgrep -af - <(uz 1566027313501.0/orig/warc/*-00000.warc.gz) | wc -l + 210 +Correct + +So, what pdfs make it into the WET: + >: cut -f 3 ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv | fgrep -af - <(uz 1566027313501.0/orig/wet/*-00000.warc.wet.gz) > ~/results/CC-MAIN-2019-35/s0_file0_pdf.txt + >: wc -l < ~/results/CC-MAIN-2019-35/s0_file0_pdf.txt + 2 + >: cut -f 2 -d ' ' ~/results/CC-MAIN-2019-35/s0_file0_pdf.txt | tr -d '\r' | fgrep -f - ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv + 11588 10913 http://bdds.deux-sevres.com/recherche/simple/Editeur/2/Belfond/vignette?format=pdf + 1048979 1048576 https://museum.wrap.gov.tw/GetFile4.ashx?Serial=201609200919D005 + +Here's the short one: +WARC/1.0 +WARC-Type: response +WARC-Date: 2019-08-17T22:40:17Z +WARC-Record-ID: <urn:uuid:ea98167b-c42a-4233-b57e-994aa627e38a> +Content-Length: 11588 +Content-Type: application/http; msgtype=response +WARC-Warcinfo-ID: <urn:uuid:f689f8d0-24f3-4824-9a38-4f3fee422a4e> +WARC-Concurrent-To: <urn:uuid:2d51c956-0012-4d78-affc-8f57fe9d2e15> +WARC-IP-Address: 92.175.114.24 +WARC-Target-URI: http://bdds.deux-sevres.com/recherche/simple/Editeur/2/Belfond/vignette?format=pdf +WARC-Payload-Digest: sha1:7VVIUDQ4Q6XKNOAURYU4VTMRSZNPHDQA +WARC-Block-Digest: sha1:OSTWXLV772XNHS22T4UBSCSJAAXM2J6T +WARC-Identified-Payload-Type: application/pdf + +HTTP/1.1 200 OK +Cache-Control: must-revalidate, post-check=0, pre-check=0,no-cache +Pragma: public,no-cache +Content-Type: application/pdf",text/html; charset=utf-8 +X-Crawler-Content-Encoding: gzip +Expires: 0 +Server: +X-Powered-By: +Set-Cookie: 
166d74d734106ba68b20ea303011f622=301619e3fe31ecb98c8473f0ff5f35a2; path=/ +Content-Disposition: attachment; filename="Mdiathque dpartementale des Deux-Svres - Rsultats de la recherche Belfond.pdf" +Content-Transfer-Encoding: binary +P3P: CP="NOI ADM DEV PSAi COM NAV OUR OTRo STP IND DEM" +X-Content-Encoded-By: +X-Powered-By: +Date: Sat, 17 Aug 2019 22:40:16 GMT +X-Crawler-Content-Length: 5448 +Content-Length: 10913 + + %PDF-1.7 +%<E2><E3><CF><D3> +7 0 obj +<< /Type /Page /Parent 1 0 R /LastModified (D:20190818004016+02'00') /Resources 2 + 0 R /MediaBox [0.000000 0.000000 595.276000 841.890000] /CropBox [0.000000 0.000 +000 595.276000 841.890000] /BleedBox [0.000000 0.000000 595.276000 841.890000] /T +rimBox [0.000000 0.000000 595.276000 841.890000] /ArtBox [0.000000 0.000000 595.2 +76000 841.890000] /Contents 8 0 R /Rotate 0 /Group << /Type /Group /S /Transparen +cy /CS /DeviceRGB >> /PZ 1 >> +endobj +8 0 obj + + >: uz 1566027313501.0/orig/warc/*-00000.warc.gz|tail -n +1823434 | tail -n +24 | head -c 20000 > ~/results/CC-MAIN-2019-35/mediatheque.pdf + >: ps2ascii mediatheque.pdf + Médiathèque départementale des Deux-Sèvres - Résultats de la recherche Belfond + + Médiathèque départementale des Deux-Sèvres - Résultats de + la recherche Belfond + A charge de revanche + Titre : + Auteur : Grippando, James (1958-....) + ... + etc., three pages, no errors + + >: uz 1566027313501.0/orig/warc/*-00000.warc.gz|fgrep -an https://museum.wrap.gov.tw/GetFile4.ashx + 38896837:WARC-Target-URI: https://museum.wrap.gov.tw/GetFile4.ashx?Serial=201609200919D005 + 38896858:WARC-Target-URI: https://museum.wrap.gov.tw/GetFile4.ashx?Serial=201609200919D005 + 38904590:WARC-Target-URI: https://museum.wrap.gov.tw/GetFile4.ashx?Serial=201609200919D005 + >: uz 1566027313501.0/orig/warc/*-00000.warc.gz|tail -n +38896858 | egrep -an '^%%EOF' + 27:%%EOF + 1114658:%%EOF + 1313299:%%EOF + +Hunh? 
+ + >: uz 1566027313501.0/orig/warc/*-00000.warc.gz|tail -n +38896858 | egrep -an '^(%%EOF|WARC)' | head -30 + 1:WARC-Target-URI: https://museum.wrap.gov.tw/GetFile4.ashx?Serial=201609200919D005 + 2:WARC-Payload-Digest: sha1:SZ53DQQHENC7DDN7GQ5IS7VMEPAXAMBE + 3:WARC-Block-Digest: sha1:QTKJA6A7445Z7264K2YAFBUUM2OYH2T2 + 4:WARC-Truncated: length + 5:WARC-Identified-Payload-Type: application/pdf + 27:%%EOF + 7725:WARC/1.0 + 7726:WARC-Type: metadata + 7727:WARC-Date: 2019-08-17T22:59:14Z + 7728:WARC-Record-ID: <urn:uuid:77df2747-e567-45d3-8646-3069ae9a9f25> + 7731:WARC-Warcinfo-ID: <urn:uuid:f689f8d0-24f3-4824-9a38-4f3fee422a4e> + 7732:WARC-Concurrent-To: <urn:uuid:eceb4adc-d81e-4497-82fe-eea61ce171f4> + 7733:WARC-Target-URI: https://museum.wrap.gov.tw/GetFile4.ashx?Serial=201609200919D005 + 7739:WARC/1.0 + +OK, so indeed truncated after 7700 lines or so... + >: uz 1566027313501.0/orig/warc/*-00000.warc.gz|tail -n +38896858 | tail -n +21 | head -c 1048576 > ~/results/CC-MAIN-2019-35/museum.pdf + >: ps2ascii ~/results/CC-MAIN-2019-35/museum.pdf + **** Error: An error occurred while reading an XREF table. + **** The file has been damaged. +Look in big_pdf? + +====Modify the original CC indexer to write new indices including lastmod===== +Looks like WarcRecordWriter.write, in +src/nutch-cc/src/java/org/commoncrawl/util/WarcRecordWriter, is what +needs to be edited to include LastModified date + +To rebuild nutch-cc, particularly to recompile jar files after editing +anything + + >: cd $HHOME/src/nutch-cc + >: ant + +Fixed deprecation bug in WarcCdxWriter.java + +Modified src/java/org/commoncrawl/util/WarcCdxWriter.java +to include lastmod + +Can run just one test, which should allow testing this: + + >: ant test-core -Dtestcase='TestWarcRecordWriter' + +Logic is tricky, and there's no easy way in + +Basically, tools/WarcExport.java launches a hadoop job based on a +hadoop-runnable WarcExport instance. 
Hadoop will in due course call +ExportReducer.reduce, which will create an instance of WarcCapture +"for each page capture", and call ExportMapper.context.write with that instance (via +some configuration magic with the hadoop job Context). That in turn +uses (more magic) WarcOutputFormat.getRecordWriter, which +(finally!) calls a previously created WarcRecordWriter +instance.write(the capture). + +So to fake a test case, I need to build + 1) a WarcRecordWriter instance + 2) a WarcCapture instance +and then invoke 1.write(2) + +Got that working, although still can't figure out where in the normal +flow the metadata entry for Response.CONTENT_TYPE gets set. + +Now, add a test that takes a stream of WARC Response extracts and +rewrites their index entries + + >: head -8804 <(uz /beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00150.gz)|tail -10| ix.py -h -w -x > /tmp/hst/headers.txt + >: cp /tmp/hst/headers.txt src/test/org/commoncrawl/util/ + >: shuf /tmp/hst/headers.txt > src/test/org/commoncrawl/util/headers_mixed.txt + +Won't quite work :-( +How do we reconstruct the Warc filename, offset and length from the +original index? + +Well, we can find .warc.gz records! +Thanks to https://stackoverflow.com/a/37042747/2595465 + + >: ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz > /tmp/hst/recs.txt + +Nearly working, got 1/3rd of the way through a single WARC and then failed: + + >: n=0 && ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2>/tmp/hst/tt.txt|while read o l; do echo $((n+=1)); echo $o $l >> /tmp/hst/r3a; ix.py $l $o CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz| wc -l; done + ... 
+ 20 + 10215 + CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz + Process fail: Compressed file ended before the end-of-stream marker was reached, input: + length=10762, offset=60784640, file=/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz + + >: head -10217 /tmp/hst/r3a | tail -4 + 60784173 467 + 60784640 10762 + 60795402 463 + 60795865 460 + >: ix.py 467 60784173 CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz|fgrep Target + WARC-Target-URI: http://drycarerestoration.co/corner-furniture-piece/unique-corner-decoration-pieces-or-corner-furniture-pieces-corner-corner-furniture-piece-corner-furniture-pieces-bedroom/ + + >: zcat /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/cdx/warc/cdx.gz + ... + co,drycarerestoration)/corner-furniture-piece/unique-corner-decoration-pieces-or-corner-furniture-pieces-corner-corner-furniture-piece-corner-furniture-pieces-bedroom 20190819020224 {"url": "http://drycarerestoration.co/corner-furniture-piece/unique-corner-decoration-pieces-or-corner-furniture-pieces-corner-corner-furniture-piece-corner-furniture-pieces-bedroom/", "mime": "text/html", "mime-detected": "text/html", "status": "200", "digest": "DTKGJL45XQDXUS7PTXPYR6POMPLG46RZ", "length": "2570", "offset": "60784640", "filename": "crawl-data/CC-MAIN-2019-35/segments/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz", "charset": "UTF-8", "languages": "eng"} + >: ix.py 2570 60784640 CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz|less + >: echo $((10762 - 2570)) + 8192 + +Ah, the error I was dreading :-( I _think_ this happens when an +individual record ends exactly on a 8K boundary. 
+ +Yes: + + >: echo $((60784640 % 8192)) + 0 + +Even with buffer 1MB: + 21 + 160245 + CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz + Process fail: Compressed file ended before the end-of-stream marker was reached, input: + length=8415, offset=1059033915, file=/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz + 0 + 160246 + + >: tail -60 /tmp/hst/r3b|head -20 + 1059013061 423 + 1059013484 7218 + 1059020702 425 + 1059021127 424 + 1059021551 11471 + 1059033022 426 + 1059033448g 467 + 1059033915 8415 + +Argh. This is at the _same_ point (before 51 fails before EOF). Ah, +maybe that's the point -- this is the last read before EOF, and it's +not a full buffer! + + >: ix.py 467 1059033448 CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz|less + ... + WARC-Target-URI: https://zowiecarrpsychicmedium.com/tag/oracle/ + +Reran with more instrumentation, took at least all day: + + >: n=0 && ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2> /tmp/hst/r3e_err.txt | while read o l; do + echo $((n+=1)); echo $o $l >> /tmp/hst/r3e_val; ix.py $l $o CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz | wc -l; + done > /tmp/hst/r3e_log 2>&1 + >: wc -l /tmp/hst/r3e_err.txt + 160296 /tmp/hst/r3e_err.txt + >: tail -60 /tmp/hst/r3e_err.txt|cat -n | grep -C2 True\ True + 7 b 28738 28738 28312 426 False False + 8 b 28312 28312 27845 467 False False + 9 b 27845 378162 369747 8415 True True < this is the first hit the last + (partial) block + 10 b 369747 369747 369312 435 False True + 11 b 369312 369312 368878 434 False True + + >: tail -55 /tmp/hst/r3e_val | head -3 + 1059033022 426 + 1059033448 467 + 1059033915 8415 + >: dd ibs=1 skip=1059033022 count=426 
if=/beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz of=/dev/stdout | uz -t + ... + 426 bytes copied, 0.00468243 s, 91.0 kB/s + sing<3411>: dd ibs=1 skip=1059033448 count=467 if=/beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz of=/dev/stdout | uz -t + ... + 467 bytes copied, 0.00382692 s, 122 kB/s + sing<3412>: dd ibs=1 skip=1059033915 count=8415 if=/beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz of=/dev/stdout | uz -t + igzip: Error (null) does not contain a complete gzip file + ... + 8415 bytes (8.4 kB, 8.2 KiB) copied, 0.00968889 s, 869 kB/s + +So, tried one change to use the actually size rather than BUFSIZE at +one point, seems to work now: + + >: time ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2> /tmp/hst/r3f_err.txt | tee /tmp/hst/r3f_val | while read l o; do printf '%s\t%s\t%s\n' $l $o 'CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz'; +done 2>&1 | tee /tmp/hst/r3f_log | ix.py -w | egrep -c '^WARC/1\.0' + 160296 + real 3m48.393s + user 0m47.997s + sys 0m26.641s + + >: tail /tmp/hst/r3f_val +10851 1059370472 +475 1059381323 +444 1059381798 +22437 1059382242 +447 1059404679 +506 1059405126 +15183 1059405632 +471 1059420815 +457 1059421286 +17754 1059421743 + + >: wc -l /tmp/hst/*_val + 171 /tmp/hst/r3d_val + 160297 /tmp/hst/r3e_val + 160296 /tmp/hst/r3f_val + 320764 total + >: uz /tmp/hst/head.warc.gz |egrep -c '^WARC/1\.0.$' + 171 + >: tail -n 3 /tmp/hst/*_val + ==> /tmp/hst/r3d_val <== + 454 1351795 + 414 1352249 + 0 1352663 [so the 171 above is bogus, and we're missing one] + + ==> /tmp/hst/r3e_val <== + 1059393441 457 + 1059393898 17754 + 0 [likewise bogus, so see below] + + ==> /tmp/hst/r3f_val <== + 471 1059420815 + 457 
1059421286 + 17754 1059421743 [better, but still one missing] + >: uz /tmp/hst/head.warc.gz |egrep '^WARC-Type: ' | tee >(wc -l 1>&2) | tail -4 + WARC-Type: response + WARC-Type: metadata + WARC-Type: request + WARC-Type: response [missing] + 171 + >: ls -lt /tmp/hst/*_val + -rw-r--r-- 1 hst dc007 1977 Sep 29 09:27 /tmp/hst/r3d_val + -rw-r--r-- 1 hst dc007 2319237 Sep 28 14:28 /tmp/hst/r3f_val + -rw-r--r-- 1 hst dc007 2319238 Sep 27 19:41 /tmp/hst/r3e_val + >: ls -l ~/lib/python/unpackz.py + -rwxr-xr-x 1 hst dc007 1821 Sep 28 15:13 .../dc007/hst/lib/python/unpackz.py +So e and f are stale, rerun + >: time ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2>/tmp/hst/r3f_err.txt| tee /tmp/hst/r3f_val|while read l o; do printf '%s\t%s\t%s\n' $l $o 'CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz' ;done |& tee /tmp/hst/r3f_log |ix.py -w |egrep '^WARC-Type: ' | tail -4 & + >: Reading length, offset, filename tab-delimited triples from stdin... + WARC-Type: response + WARC-Type: metadata + WARC-Type: request + WARC-Type: response + + real 3m49.760s + user 0m47.180s + sys 0m32.218s +So missing the final metadata... 
+Back to head.warc.gz, with debug info + + >: n=0 && ~/lib/python/unpackz.py /tmp/hst/head.warc.gz 2>/tmp/hst/ttd.txt|while read l o; do echo $((n+=1)); echo $l $o >> /tmp/hst/r3d_val; dd ibs=1 skip=$o count=$l if=/tmp/hst/head.warc.gz of=/dev/stdout 2>/tmp/hst/r3d_ido| uz -t ; done >/tmp/hst/r3d_log 2>&1 + >: tail -2 /tmp/hst/r3d_log + 171 + igzip: Error invalid gzip header found for file (null) + >: tail -n 3 /tmp/hst/ttd.txt /tmp/hst/r3d_val + ==> /tmp/hst/ttd.txt <== + b 9697 9697 9243 454 False True + b 9243 9243 8829 414 False True + n 8829 + + ==> /tmp/hst/r3d_val <== + 454 1351795 + 414 1352249 + 0 1352663 + + >: cat -n /tmp/hst/r3f_val | head -172 | tail -4 + 169 454 1351795 + 170 414 1352249 + 171 8829 1352663 + 172 446 1361492 + +Fixed, maybe + + >: tail -n 3 /tmp/hst/r3d_log /tmp/hst/r3d_val + ==> /tmp/hst/r3d_log <== + 169 + 170 + 171 + + ==> /tmp/hst/r3d_val <== + 454 1351795 + 414 1352249 + 8829 1352663 + +Yes! + + >: time ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2>/tmp/hst/r3f_err| tee /tmp/hst/r3f_val|while read l o; do printf '%s\t%s\t%s\n' $l $o 'CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz' ;done |& tee /tmp/hst/r3f_log |ix.py -w |egrep '^WARC-Type: ' | tail -4 + Reading length, offset, filename tab-delimited triples from stdin... 
+ WARC-Type: metadata + WARC-Type: request + WARC-Type: response + WARC-Type: metadata + + real 3m26.042s + user 0m44.167s + sys 0m24.716s + >: tail -n 3 /tmp/hst/r3f* + ==> /tmp/hst/r3f_err <== + + ==> /tmp/hst/r3f_val <== + 457 1059421286 + 17754 1059421743 + 425 1059439497 + +Doubling buffer size doesn't speed up + >: time ~/lib/python/unpackz.py -b $((2 * 1024 * 1024)) /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2>/tmp/hst/r3g_err| tee /tmp/hst/r3g_val|while read l o; do printf '%s\t%s\t%s\n' $l $o 'CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz' ;done |& tee /tmp/hst/r3g_log |ix.py -w |egrep '^WARC-Type: ' | tail -4 + Reading length, offset, filename tab-delimited triples from stdin... + WARC-Type: metadata + WARC-Type: request + WARC-Type: response + WARC-Type: metadata + + real 3m34.519s + user 0m52.312s + sys 0m24.875s + +Tried using FileIO.readinto([a fixed buffer]), but didn't immediately +work. Abandoned because I still don't understand how zlib.decompress +works at all... 
+ +Time to convert unpackz to a library which takes a callback +alternative to an output file -- Done + +W/o using callback, timing and structure for what we need for +re-indexing task looks encouraging: + >: time ~/lib/python/cc/unpackz.py -o /dev/stdout /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz |egrep -aA20 '^WARC-Type: response' | cut -f 1 -d ' ' | egrep -a '^WARC-' |sus | tee >(wc -l 1>&2) + 52468 WARC-Block-Digest: + 52468 WARC-Concurrent-To: + 52468 WARC-Date: + 52468 WARC-Identified-Payload-Type: + 52468 WARC-IP-Address: + 52468 WARC-Payload-Digest: + 52468 WARC-Record-ID: + 52468 WARC-Target-URI: + 52468 WARC-Type: + 52468 WARC-Warcinfo-ID: + 236 WARC-Truncated: + 11 + + real 0m20.308s + user 0m19.720s + sys 0m4.505s + +Whole thing, with no pre-filtering: + + >: time ~/lib/python/cc/unpackz.py -o /dev/stdout /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz | cut -f 1 -d ' ' | egrep -a '^(WARC-|Content-|Last-Modified)' |sus | tee >(wc -l 1>&2) + 211794 Content-Length: + 211162 Content-Type: + 159323 WARC-Target-URI: + 159311 WARC-Warcinfo-ID: + 159301 WARC-Record-ID: + 159299 WARC-Date: + 159297 WARC-Type: + 105901 WARC-Concurrent-To: + 105896 WARC-IP-Address: + 52484 WARC-Block-Digest: + 52484 WARC-Identified-Payload-Type: + 52482 WARC-Payload-Digest: + 9239 Last-Modified: + 3941 Content-Language: + 2262 Content-Security-Policy: + 642 Content-language: + 326 Content-Security-Policy-Report-Only: + 238 WARC-Truncated: + 114 Content-Disposition: + 352 Content-*: + 1 WARC-Filename: + 42 + +real 0m30.896s +user 0m37.335s +sys 0m7.542s + +First 51 after WARC-Type: response + + >: time ~/lib/python/cc/unpackz.py -o /dev/stdout /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz |egrep -aA50 '^WARC-Type: response' | cut -f 1 -d ' ' | egrep -a 
'^(WARC-|Content-|Last-Modified)' |sus | tee >(wc -l 1>&2) + 106775 Content-Length: + 106485 Content-Type: + 55215 WARC-Type: + 55123 WARC-Date: + 54988 WARC-Record-ID: + 54551 WARC-Warcinfo-ID: + 54246 WARC-Target-URI: + 54025 WARC-Concurrent-To: + 52806 WARC-IP-Address: + 52468 WARC-Block-Digest: + 52468 WARC-Identified-Payload-Type: + 52468 WARC-Payload-Digest: + 9230 Last-Modified: + 3938 Content-Language: + 2261 Content-Security-Policy: + 639 Content-language: + 324 Content-Security-Policy-Report-Only: + 236 WARC-Truncated: + 114 Content-Disposition: + 342 Content-*: + 41 + + real 0m21.483s + user 0m22.372s + sys 0m5.400s + +So, not worth the risk, let's try python + + >: time ~/lib/python/cc/cdx_extras.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz|wc -l + 9238 + + real 0m25.426s + user 0m23.201s + sys 0m0.711s + +Looks good, but why 9238 instead of 9239??? + + >: ~/lib/python/cc/unpackz.py -o /dev/stdout /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz | egrep -a '^Last-Modified: ' > /tmp/hst/lmo.tsv + +Argh. Serious bug in unpackz, wasn't handling cross-buffer-boundary +records correctly. Fixed. Redoing the above... 
+ +No pre-filter: + >: uz /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz|egrep -c '^WARC/1\.0.$' + 160297 + + >: time ~/lib/python/cc/unpackz.py -o /dev/stdout /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz | cut -f 1 -d ' ' | egrep -a '^(WARC-|Content-|Last-Modified)' |sus | tee >(wc -l 1>&2) + + 213719 Content-Length: + 213088 Content-Type: + 160297 WARC-Date: + 160297 WARC-Record-ID: + 160297 WARC-Type: + 160296 WARC-Target-URI: + 160296 WARC-Warcinfo-ID: + 106864 WARC-Concurrent-To: + 106864 WARC-IP-Address: + 53432 WARC-Block-Digest: [consistent with 160297 == (3 * 53432) + 1] + 53432 WARC-Identified-Payload-Type: + 53432 WARC-Payload-Digest: + 9430 Last-Modified: + 4006 Content-Language: + 2325 Content-Security-Policy: + 653 Content-language: + 331 Content-Security-Policy-Report-Only: + 298 WARC-Truncated: + 128 Content-Disposition: + 83 Content-Location: + 67 Content-type: + 51 Content-MD5: + 45 Content-Script-Type: + 42 Content-Style-Type: + 31 Content-Transfer-Encoding: + 13 Content-disposition: + 8 Content-Md5: + 5 Content-Description: + 5 Content-script-type: + 5 Content-style-type: + 3 Content-transfer-encoding: + 2 Content-Encoding-handler: + 1 Content-DocumentTitle: + 1 Content-Hash: + 1 Content-ID: + 1 Content-Legth: + 1 Content-length: + 1 Content-Range: + 1 Content-Secure-Policy: + 1 Content-security-policy: + 1 Content-Type-Options: + 1 WARC-Filename: +42 + +real 0m28.876s +user 0m35.703s +sys 0m6.976s + + >: ~/lib/python/cc/unpackz.py -o /dev/stdout /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz | egrep -a '^Last-Modified: ' > /tmp/hst/lmo.tsv + >: wc -l /tmp/hst/lmo.tsv + 9430 /tmp/hst/lmo.tsv + >: time ~/lib/python/cc/cdx_extras.py 
/beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz > /tmp/hst/lm.tsv + + real 0m17.191s + user 0m15.739s + sys 0m0.594s + >: wc -l /tmp/hst/lm.tsv + 9423 /tmp/hst/lm.tsv + + >: diff <(sed 's/^Last-Modified: //' /tmp/hst/lmo.tsv | tr -d '\r') <(cut -f 3 /tmp/hst/lm.tsv) + 853d852 + < Mon, 19 Aug 2019 01:46:49 GMT + 4058d4056 + < Tue, 03 Nov 2015 21:31:18 GMT<br /> + 4405d4402 + < Mon, 19 Aug 2019 01:54:52 GMT + 5237,5238d5233 + < 3 + < Asia/Amman + 7009d7003 + < Mon, 19 Aug 2019 02:34:20 GMT + 9198d9191 + < Mon, 19 Aug 2019 02:14:49 GMT + +All good. The only implausible case is + < Mon, 19 Aug 2019 01:54:52 GMT +which turns out to be a case of two Last-Modified headers in the +same response record's HTTP headers. RFCs 2616 and 7230 rule it +out but neither specifies a recovery, so first-wins is as good as +anything, and indeed 6797 specifies that. + +Start looking at how we do the merge of cdx_extras.py with existing index + +Try it with the existing _per segment_ index we have for 2019-35 + +Assuming we have to key on segment plus offset, as reconstructing the +proper index key is such a pain / buggy / is going to change with the year. + +Stay with segment 49 + + >: uz cdx.gz |wc -l + 29,870,307 + + >: time uz cdx.gz|egrep -ao ' "length": "[0-9]*", "offset": "[0-9]*"' |wc + 29,870,307 119,481,228 1,241,098,122 + = 4 * 29,870,307 + +So no bogons, not _too_ surprising :-) + +Bad news is it's a _big_ file: + + >: ls -lh cdx.gz + -rw-r--r-- 1 hst dc007 2.0G Mar 18 2021 cdx.gz + +So not viable to paste offset as a key and then sort on command line, +or to load it in to python and do the work there... + +Do it per warc file and then merge? 
+ + >: time uz cdx.gz |fgrep -a warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz | sort -n -t\" -k28,28 > /tmp/hst/558.warc.cdx + + real 0m23.494s + user 0m14.541s + sys 0m9.158s + + >: wc -l /tmp/hst/558.warc.cdx + 53432 /tmp/hst/558.warc.cdx + + >: echo $((600 * 53432)) + 32,059,200 + +So, 600 of those, plus approx. same again for extracting, that pbly +_is_ doable in python, not more than 10 hours total, assuming internal +sort and external merge is not too expensive... + +For each segment, suppose we pull out 60 groups of 10 target files + >: time uz cdx.gz |egrep -a warc/CC-MAIN-2019[^-]*-2019[^-]*-0000..warc.gz > /tmp/hst/0000.warc.cdx + + real 0m42.129s + user 0m35.147s + sys 0m9.140s + >: wc -l /tmp/hst/0000.warc.cdx + 533150 + +Key it with offset and sort: + + >: time egrep -ao ' "length": "[0-9]*", "offset": "[0-9]*"' /tmp/hst/0000.warc.cdx | cut -f 5 -d ' ' | tr -d \" > /tmp/hst/0000_offsets + + real 0m5.578s + user 0m5.593s + sys 0m0.265s + + >: time paste /tmp/hst/0000_offsets /tmp/hst/0000.warc.cdx |sort -nk1,1 | cut -f 2 > /tmp/hst/0000_sorted.warc.cdx + + real 0m4.185s + user 0m2.001s + sys 0m1.334s + + >: time seq 0 9 | parallel -j 10 "~/lib/python/cc/cdx_extras.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-*-*-0000'{}'.warc.gz > /tmp/hst/lm_0000'{}'.tsv" + + real 0m24.610s + user 2m54.146s + sys 0m10.226s + + >: head /tmp/hst/lm_00000.tsv + 9398 16432 Mon, 19 Aug 2019 02:44:15 GMT + 20796 26748 Tue, 16 Jul 2019 04:39:09 GMT + 4648 340633 Fri, 07 Dec 2018 09:05:59 GMT + 3465 357109 Sun, 18 Aug 2019 11:48:23 GMT + 7450 914189 Mon, 19 Aug 2019 02:50:08 GMT + ... 
+ sing<3956>: fgrep '"length": "9398", "offset": "16432"' /tmp/hst/0000_sorted.warc.cdx + com,roommeme,0401a)/index.phtml?channel=&op=&p=140&put=show&r2= 20190819024416 {"url": "http://0401a.roommeme.com/index.phtml?PUT=SHOW&R2=&OP=&P=140&CHANNEL=", "mime": "text/html", "mime-detected": "text/html", "status": "200", "digest": "5DNDVX5HQBOOBHISSCOI4UBVMUL63L36", "length": "9398", "offset": "16432", "filename": "crawl-data/CC-MAIN-2019-35/segments/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00000.warc.gz", "charset": "Big5", "languages": "zho"} + +bingo + +So, the python code is pretty straightfoward: open the 10 individual +lm-*.tsv outputs into an array, initialise a 10-elt array with the +first line of each and another with its offset, record the +fileno(s) of the lowest offset, then iterate + + read cdx lines and write unchanged until offset = lowest + merge line from fileno and output + remove fileno from list of matches + read and store a new line for fileno [handle EOF] + if list of matches is empty, redo setting of lowest + +Resort the result by actual key + +Meanwhile, get a whole test set: +sbatch --output=slurm_aug_cdx_49_10-599-out --time=01:00:00 --ntasks=10 -c 36 --exclusive $HOME/bin/runme.sh -m 00 59 $PWD -t 18 -b 'export resdir=CC-MAIN-2019-35/aug_cdx/49 +export DEC=$xarg' "export PYTHONPATH=./lib/python/cc:$PYTHONPATH +seq 0 9 | parallel -j 10 \"~/lib/python/cc/cdx_extras.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-*-*-00\${DEC}'{}'.warc.gz > \$resdir/00\${DEC}'{}'.tsv\"" + +Actually finished 360 in the hour. 
+ +Leaving + +sbatch --output=slurm_aug_cdx_49_360-599-out --time=01:00:00 --ntasks=10 -c 36 --exclusive $HOME/bin/runme.sh -m 36 59 $PWD -t 18 -b 'export resdir=CC-MAIN-2019-35/aug_cdx/49 +export DEC=$xarg' "export PYTHONPATH=./lib/python/cc:$PYTHONPATH +seq 0 9 | parallel -j 10 \"~/lib/python/cc/cdx_extras.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-*-*-00\${DEC}'{}'.warc.gz > \$resdir/00\${DEC}'{}'.tsv\"" + +But something is wrong, the number of jobs is all wrong: + + 5>: fgrep -c parallel slurm_aug_cdx_49_0-359-out + 741 + sing<4046>: ls -lt CC-MAIN-2019-35/aug_cdx/49/|wc -l + 372 + +Every file is being produced twice. + +Took me a while to figure out my own code :-( + + >: sbatch --output=slurm_aug_cdx_49_360-599-out --time=01:00:00 --ntasks=10 -c 36 --exclusive $HOME/bin/runme.sh -m 49 49 $PWD -t 18 -b 'export resdir=CC-MAIN-2019-35/aug_cdx/$xarg + export SEG=$xarg + share_by_task.sh -f "%03g\n" -s 360 599 $n $task > /tmp/hst_$task' -i 'cat /tmp/hst_$task' 'export PYTHONPATH=./lib/python/cc:$PYTHONPATH + ~/lib/python/cc/cdx_extras.py /beegfs/common_crawl/CC-MAIN-2019-35/*.$SEG/orig/warc/CC-MAIN-*-*-00${arg}.warc.gz > $resdir/00${arg}.tsv' + +Oops, only 560, not 600 + +Took 3.5 minutes for 200, so call it 10 for 560, so do 6 more in an +hour: + + >: sbatch --output=slurm_aug_cdx_50-55_out --time=01:00:00 --ntasks=10 -c 36 --exclusive $HOME/bin/runme.sh -m 50 55 $PWD -t 18 -b 'export resdir=CC-MAIN-2019-35/aug_cdx/$xarg +mkdir -p $resdir +> export SEG=$xarg +share_by_task.sh -f "%03g\n" -s 360 599 $n $task > /tmp/hst_$task' -i 'cat /tmp/hst_$task' 'export PYTHONPATH=./lib/python/cc:$PYTHONPATH + ~/lib/python/cc/cdx_extras.py /beegfs/common_crawl/CC-MAIN-2019-35/*.$SEG/orig/warc/CC-MAIN-*-*-00${arg}.warc.gz > $resdir/00${arg}.tsv' + + >: tail slurm_aug_cdx_50-55_out + ... 
+ Wed Oct 9 22:25:47 BST 2024 Finished 55 + >: head -1 slurm_aug_cdx_50-55_out + Wed Oct 9 21:29:43 BST + 56:04 + + >: du -s CC-MAIN-2019-35/aug_cdx + 1,902,916 + +Not bad, so order 20MB for the whole thing + +Next step, compare to my existing cdx with timestamp + +First check looks about right: + + [cd .../warc_lmhx] + >: seq --format='%03g' 0 299 > /tmp/hst/cdx_nums + >: parallel -j 20 -a /tmp/hst/cdx_nums 'uz idx/cdx-00{}.gz | egrep -o "\"filename\": \"crawl-data/CC-MAIN-2019-35/segments/[^.]*[.]50.*\"lastmod\":" | sed "s/^.*-00//;s/^\(...\).*/\1/"| sus > /tmp/hst/checkseg_50_{}' + + [cd .../aug_cdx/50] + >: wc -l 00123.tsv + 9333 + >: egrep -h '123$' /tmp/hst/checkseg_50_??? | acut 1 | btot + 9300 + >: wc -l 00400.tsv + 9477 00400.tsv + >: egrep -h '400$' /tmp/hst/checkseg_50_??? | acut 1 | btot + 9439 + +Difference is presumable the bogus timestamps aren't in the augmented +cdx as shipped. + +Note that the following 'bad' kind of timestamp is fixed before +sort_date.py does its thing: + + ... sort_date.sh <(uz $arg/*00???.warc.gz | '"fgrep $'\t'|sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/')"' >$arg/ks.tsv + + + >: egrep -c '[^ ]GMT$' 50/00123.tsv + 22 + >: egrep -c '[^ ]GMT$' 50/00400.tsv + 14 + + >: PYTHONPATH=~/.local/lib/python3.9/site-packages:$PYTHONPATH sort_date.sh <(uz ../warc_lmhx/50/*00123.warc.gz | fgrep $'\t'|sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/') 2> /tmp/hst/123_errs | wc -l + 9300 + >: fgrep -c Invalid /tmp/hst/123_errs + 33 + >: PYTHONPATH=~/.local/lib/python3.9/site-packages:$PYTHONPATH sort_date.sh <(uz ../warc_lmhx/50/*00400.warc.gz | fgrep $'\t'|sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/') 2> /tmp/hst/400_errs | wc -l + 9439 + >: fgrep -c Invalid /tmp/hst/400_errs + 38 + +All good. 
+ +But + >: seq --format='%03g' 0 559 > /tmp/hst/warc_nums + >: xx () { + r=$(diff -bw + <(echo $(( + $(sort_date.sh <(uz ../warc_lmhx/50/*00$1.warc.gz | + fgrep $'\t'|sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/') 2>/tmp/hst/ec_$1 |wc -l) + + + $(fgrep -c Invalid /tmp/hst/ec_$1)))) + <(wc -l < 50/00$1.tsv)) + if [ "$r" ] + then printf "%s:\n%s\n" $2 "$r" + fi + } + >: parallel -j 20 -a /tmp/hst/warc_nums xx '{}' '$(({#} - 1))' | tee /tmp/hst/aug_bugs + >: fgrep -c 1c1 /tmp/hst/aug_bugs + 77 + sing<4318>: wc -l < /tmp/hst/aug_bugs + 385 + sing<4319>: echo $((77 * 5)) + 385 + +OK, there are a few other error messages from date conversion + >: xx () { r=$(diff -bw <(echo $(($(sort_date.sh <(uz ../warc_lmhx/50/*00$1.warc.gz | fgrep $'\t'|sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/') 2>/tmp/hst/ec_$1 |wc -l) + $(egrep -c 'Invalid|must be in|out of range' /tmp/hst/ec_$1)))) <(wc -l < 50/00$1.tsv)); if [ "$r" ]; then printf "%s:\n%s\n" $2 "$r"; fi; } +sing<4337>: parallel -j 20 -a /tmp/hst/warc_nums xx '{}' '$(({#} - 1))' | tee /tmp/hst/aug_bugs2 + [nothing] + +So, I think we can believe we're OK +But 7 is better than 1: + >: xx () { r=$(diff -bw <(echo $(($(sort_date.sh <(uz ../warc_lmhx/$3/*00$1.warc.gz | fgrep $'\t'|sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/') 2>/tmp/hst/ec_$1 |wc -l) + $(egrep -c 'Invalid|must be in|out of range' /tmp/hst/ec_$1)))) <(wc -l < $3/00$1.tsv)); if [ "$r" ]; then printf "%s:\n%s\n" $2 "$r"; fi; } + >: for s in 49 {51..55}; do parallel -j 20 -a /tmp/hst/warc_nums xx '{}' '$(({#} - 1))' $s | tee /tmp/hst/aug_bugs_$s; done + [nothing] + +Next step: ? + + +