diff lurid3/notes.txt @ 55:237105932af5

merge
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 15 Oct 2024 16:06:27 +0100
parents d533894173d0
children dd06d7afbfe0
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lurid3/notes.txt	Tue Oct 15 16:06:27 2024 +0100
@@ -0,0 +1,987 @@
+See old_notes.txt for all older notes on Common Crawl data processing,
+starting from Azure via Turing and then LURID and LURID2.
+
+Installed /beegfs/common_crawl/CC-MAIN-2024-33/cdx
+  >: cd results/CC-MAIN-2024-33/cdx/
+  >: cut -f 2 counts.tsv | btot
+  2,793,986,828 
+
+State of play wrt data -- see status.xlsx
+
+[in trying to tabulate the date ranges of the crawls, I found that the
+WARC timestamp is sometimes bogus:
+
+  >: fgrep ' 2009' CC-MAIN-2018-34/cdx/cluster.idx
+  net,tyredeyes)/robots.txt 20090201191318	cdx-00230.gz	160573468	198277	920675
+
+  >: zgrep '^net,tyredeyes)/robots.txt' CC-MAIN-2018-34/cdx/warc/cdx-00230.gz
+  net,tyredeyes)/robots.txt 20090201191318 {"url": "http://tyredeyes.net/robots.txt", "mime": "text/html", "mime-detected": "text/html", "status": "301", "digest": "QH732FYSV7UM34JYWVYMB7EZGR2CYM6B", "length": "582", "offset": "1224614", "filename": "crawl-data/CC-MAIN-2018-34/segments/1534221215075.58/robotstxt/CC-MAIN-20180819090604-20180819110604-00558.warc.gz"}
+  net,tyredeyes)/robots.txt 20090201191319 {"url": "http://www.tyredeyes.net/robots.txt", "mime": "text/plain", "mime-detected": "text/plain", "status": "200", "digest": "PSX5IZU4B4SIXGNDKXCVFH75Q27VHUTJ", "length": "549", "offset": "2069841", "filename": "crawl-data/CC-MAIN-2018-34/segments/1534221215075.58/robotstxt/CC-MAIN-20180819090604-20180819110604-00485.warc.gz"}
+
+This happens in 2019-35 as well :-(
+
+  >: fgrep ' 20181023' CC-MAIN-2019-35/cdx/cluster.idx
+  com,gyshbsh)/robots.txt 20181023022000	cdx-00078.gz	356340085	162332	315406
+  >: zgrep ' 20181023' CC-MAIN-2019-35/cdx/warc/cdx-00078.gz
+  com,gyshbsh)/robots.txt 20181023022000 {"url": "http://gyshbsh.com/robots.txt", "mime": "text/html", "mime-detected": "text/html", "status": "301", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "529", "offset": "638892", "filename": "crawl-data/CC-MAIN-2019-35/segments/1566027315618.73/robotstxt/CC-MAIN-20190820200701-20190820222701-00120.warc.gz"}
+  ...
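+
+A quick way to scan a whole cluster.idx for such bogons, as a minimal
+sketch (assuming the layout shown above -- "SURT timestamp" then
+tab-separated fields -- and that genuine timestamps begin with the
+crawl's own year):
+
+  def bogus_timestamps(cluster_idx, year):
+      with open(cluster_idx, errors='replace') as f:
+          for line in f:
+              # key field is "SURT timestamp"; timestamp is 14 digits
+              ts = line.split('\t', 1)[0].rsplit(' ', 1)[1]
+              if not ts.startswith(str(year)):
+                  yield line.rstrip('\n')
+
+  # e.g. list(bogus_timestamps('CC-MAIN-2018-34/cdx/cluster.idx', 2018))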
+
+Tabulate all the date ranges for the WARC files we have
+
+  >: for d in {2017-30,2019-35,2020-34,2021-25,2023-40,2023-50}; do printf "%s\t" $d; (ls CC-MAIN-$d/*.{?,??}/orig/warc | fgrep .gz | cut -f 3,4 -d - | sort -u |tee /dev/fd/3 | head -1 ) 3> >( tail -1 ) | tr '\n' '\t'; echo; done | cut -f 1,2,4 -d -  | sed 's/-20/ 20/;s/.$//' | tr ' ' '\t' > dates.tsv
+  >: for d in {2018-30,2018-34}; do printf "%s\t" $d; (ls CC-MAIN-$d/{*.?,*.??} | fgrep warc.gz | cut -f 3,4 -d - | sort -u |tee /dev/fd/3 | { sleep 10 ; head -1 ; } ) 3> >( tail -1 ) | tr '\n' '\t'; echo; done >> dates.tsv
+  >: for d in 2019-18; do printf "%s\t" $d; (ls CC-MAIN-$d/{*.?,*.??} | fgrep warc.gz | cut -f 3,4 -d - | sort -u | head -1); done
+2019-18	20190418101243-20190418122248
+  >: for d in 2019-18; do printf "%s\t" $d; (ls CC-MAIN-$d/{*.?,*.??} | fgrep warc.gz | cut -f 3,4 -d - | sort -u | tail -1); done
+2019-18	20190426153423-20190426175423
+  >: echo 2019-18       20190418101243-20190418122248   20190426153423-20190426175423 >> dates.tsv 
+  >: pwd
+  /beegfs/common_crawl/CC-MAIN-2016-30/cdx/warc
+  >: echo {000..299} | tr ' ' '\n' | parallel -j 10 'uz cdx-00{}.gz | cut -f 2 -d " " | sort -u > /tmp/hst/{}'
+  >: sort -mu /tmp/hst/??? > /tmp/hst/all
+  >: wc -l /tmp/hst/all
+  679686 /tmp/hst/all
+  >: head -1 /tmp/hst/all
+  20160723090435
+  >: tail -1 /tmp/hst/all
+  20160731110639
+  >: cd ../../..
+  >: echo 2016-30       20160723090435  20160731110639 >> dates.tsv 
+tweaked and sorted in xemacs:
+  2016-30	20160723090435	20160731110639
+  2017-30	20170720121902	20170729132938
+  2018-30	20180715183800	20180723184955
+  2018-34	20180814062251	20180822085454
+  2019-18	20190418101243	20190426175423
+  2019-35	20190817102624	20190826111356
+  2020-34	20200803083123	20200815214756
+  2021-25	20210612103920	20210625145905
+  2023-40	20230921073711	20231005042006
+  2023-50	20231128083443	20231212000408
+
+Added to status.xlsx in shortened form, with number of days
+  8
+  9
+  8
+  8
+  8
+  9
+  12
+  13
+  15
+  15
+
+Fill a gap by downloading 2022-33
+
+  >: for s in 0; do ~/bin/getcc_multi.aws CC-MAIN-2022-33 $s 5; done > /tmp/hst/get_22-33_0.log &
+  130 minutes...
+  >: for s in 1; do ~/bin/getcc_multi.aws CC-MAIN-2022-33 $s 10; done > /tmp/hst/get_22-33_1.log &
+  59 minutes
+
+Another day to get to a quarter?
+  >: for s in {2..23}; do ~/bin/getcc_multi.aws CC-MAIN-2022-33 $s 10; done > /tmp/hst/get_22-33_2-23.log &
+
+
+And finally 2015-35
+Fetched in just 2 chunks, 0-9 and 10-99, e.g.
+  >: for s in {10..99}; do ~/bin/getcc_multi.aws CC-MAIN-2015-35 $s 10; done > /tmp/hst/get_15-35_10-99.log &
+
+Much smaller.
+Compare 2023-40, with 900 files per segment:
+  >: lss */orig/warc/*-0023?.* | cut -f 5 -d ' ' | stats
+  n	=	1000
+  min	=	1.14775e+09
+  max	=	1.26702e+09
+  sum	=	1.20192e+12
+  mean	=	1.20192e+09
+  sd	=	2.26049e+07
+
+and 2015-35, with 353 files per segment:
+  >: lss */orig/warc/*-0023?-* | cut -f 5 -d ' ' | stats
+  n	=	1000
+  min	=	1.66471e+08
+  max	=	9.6322e+08
+  sum	=	9.19222e+11
+  mean	=	9.19222e+08
+  sd	=	8.20542e+07
+
+The min files all come from segment 1440644060633.7, whose files are
+_all_ small:
+  >: uz *00123-*.gz | wc -l
+  12,759,931
+Compare to 1440644060103.8
+  >: zcat *00123-*.gz | wc -l
+  75,806,738
+Mystery
+
+Downloading was also faster.
+Compare 2022-33:
+ >: fgrep -h BST /tmp/hst/get_22-33_{2-23,24-49,50-99}.log |  cut -f 1-7 -d ' ' | while read s; do if read e; then echo $((($(date --date="$e" +%s) - $(date --date="$s" +%s)) / 60)); fi; done   | stats n min max  mean    sd
+                              98 19 256  75.1   25.2
+with 2015-35:
+  >: fgrep -h BST /tmp/hst/get_15-35_{0-9,10-99}.log |  cut -f 1-7 -d ' ' | while read s; do if read e; then echo $((($(date --date="$e" +%s) - $(date --date="$s" +%s)) / 60)); fi; done   | stats n min max mean sd
+		       100 15  40 32.6 2.9
+
+  >: echo {000..299} | tr ' ' '\n' | parallel -j 10 'uz cdx-00{}.gz | cut -f 2 -d " " | sort -u > /tmp/hst/2015_{}' &
+  >: sort --parallel=10 -mu /tmp/hst/2015_??? > /tmp/hst/2015_all
+  >: head -1 /tmp/hst/2015_all
+  20150827191534
+  >: tail -1 /tmp/hst/2015_all
+  20150905180914
+  >: wc -l /tmp/hst/2015_all
+  698128 /tmp/hst/2015_all
+
+What about wet files -- do they include text from pdfs?  What about
+truncated pdfs?
+
+  >: time for s in 0; do ~/bin/getcc_wet_multi.aws CC-MAIN-2019-35 $s 10; done > /tmp/hst/get_wet_19-35_0.log &
+  real    26m3.049s
+  user    0m1.225s
+  sys     0m1.310s
+
+In the segment 0 cdx file (!) we find 3747 probable truncations:
+  >: zgrep -a '"mime-detected": "application/pdf", ' cdx.gz > /tmp/hst/2019-35_seg0_pdf.idx
+  >: wc -l /tmp/hst/2019-35_seg0_pdf.idx
+  42345 /tmp/hst/2019-35_seg0_pdf.idx
+  >: egrep -a '"length": "10....."' /tmp/hst/2019-35_seg0_pdf.idx > /tmp/hst/2019-35_seg0_long_pdf.idx &
+  >: wc -l < /tmp/hst/2019-35_seg0_long_pdf.idx
+  3747
+And 70 of the PDF records are in file 0:
+  >: egrep -a '.-00000\.' /tmp/hst/2019-35_seg0_pdf.idx > /tmp/hst/2019-35_seg0_file0_pdf.idx
+  >: wc -l /tmp/hst/2019-35_seg0_file0_pdf.idx
+  70 /tmp/hst/2019-35_seg0_file0_pdf.idx
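+
+The same filter as a python sketch, assuming the cdx line layout shown
+above (SURT key, timestamp, then a JSON object); the "10....." egrep
+corresponds to compressed lengths of 1,000,000-1,099,999, i.e. just at
+the 1MiB truncation limit:
+
+  import gzip, json
+
+  def probable_truncated_pdfs(cdx_gz):
+      with gzip.open(cdx_gz, 'rt', errors='replace') as f:
+          for line in f:
+              meta = json.loads(line[line.index('{'):])
+              if (meta.get('mime-detected') == 'application/pdf'
+                      and 1_000_000 <= int(meta['length']) < 1_100_000):
+                  yield meta['url'], int(meta['offset'])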
+
+In segment 0 file 0 we find 70 application/pdf Content-Type headers:
+  >: ix.py -h -w  -x </tmp/hst/2019-35_seg0_file0_pdf.idx |egrep '^(WARC-Target-URI:|Content-Length:) '|cut -f 2 -d ' ' |tr -d '\r'|while read l1; do read uri; read l2; printf '%s\t%s\t%s\n' $l1 $l2 "$uri"; done > ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv
+  >: wc -l < ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv
+  70
+  >: head -3 ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv
+
+
+Of which 14 are truncated:
+  >: fgrep -c 1048576 ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv
+  14
+
+E.g.
+  >: fgrep 1048576 ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv | head -3
+  1049051 1048576 https://en.heks.ch/sites/default/files/documents/2017-09/HEKS_EPER_Mission_Statement_2016_e.pdf
+  1049469 1048576 https://bmcmicrobiol.biomedcentral.com/track/pdf/10.1186/s12866-017-0951-4
+  1048824 1048576 https://citydocs.fcgov.com/?action=cao-cases&cmd=convert&docid=3332339
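+
+Note in passing: ix.py, used throughout, fetches a record given
+length, offset and filename.  Since every record is its own gzip
+member, a minimal local-file sketch of the underlying trick (not
+ix.py's actual code) is just:
+
+  import gzip
+
+  def fetch_record(path, offset, length):
+      with open(path, 'rb') as f:
+          f.seek(offset)
+          return gzip.decompress(f.read(length))   # one WARC record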
+
+Are any of the pdfs in the corresponding wet file?
+
+Yes, 2:
+  >: cut -f 3 ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv | fgrep -af - <(uz 1566027313501.0/orig/wet/*-00000.warc.wet.gz)
+  WARC-Target-URI: http://bdds.deux-sevres.com/recherche/simple/Editeur/2/Belfond/vignette?format=pdf
+  WARC-Target-URI: https://museum.wrap.gov.tw/GetFile4.ashx?Serial=201609200919D00
+
+Is it in fact corresponding?
+  >: diff -bw <(uz 1566027313501.0/orig/warc/*-00000.warc.gz | egrep -a '^WARC-Target-URI: ' | uniq | head -1000) <(uz 1566027313501.0/orig/wet/*-00000.warc.wet.gz | egrep -a '^WARC-Target-URI: ' | head -1000)|egrep -c '^<'
+  19
+
+So, yes, mostly.  ~2% (19 of the first 1000) are missing
+
+Just checking the search:
+  >: cut -f 3 ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv | fgrep -af - <(uz 1566027313501.0/orig/warc/*-00000.warc.gz) | wc -l
+  210
+Correct: 3 records (request, response and metadata) for each of the 70 URIs
+
+So, what pdfs make it into the WET:
+  >: cut -f 3 ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv | fgrep -af - <(uz 1566027313501.0/orig/wet/*-00000.warc.wet.gz) > ~/results/CC-MAIN-2019-35/s0_file0_pdf.txt
+  >: wc -l < ~/results/CC-MAIN-2019-35/s0_file0_pdf.txt
+  2
+ >: cut -f 2 -d ' ' ~/results/CC-MAIN-2019-35/s0_file0_pdf.txt | tr -d '\r' | fgrep -f -   ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv
+  11588   10913   http://bdds.deux-sevres.com/recherche/simple/Editeur/2/Belfond/vignette?format=pdf
+  1048979 1048576 https://museum.wrap.gov.tw/GetFile4.ashx?Serial=201609200919D005 
+
+Here's the short one:
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2019-08-17T22:40:17Z
+WARC-Record-ID: <urn:uuid:ea98167b-c42a-4233-b57e-994aa627e38a>
+Content-Length: 11588
+Content-Type: application/http; msgtype=response
+WARC-Warcinfo-ID: <urn:uuid:f689f8d0-24f3-4824-9a38-4f3fee422a4e>
+WARC-Concurrent-To: <urn:uuid:2d51c956-0012-4d78-affc-8f57fe9d2e15>
+WARC-IP-Address: 92.175.114.24
+WARC-Target-URI: http://bdds.deux-sevres.com/recherche/simple/Editeur/2/Belfond/vignette?format=pdf
+WARC-Payload-Digest: sha1:7VVIUDQ4Q6XKNOAURYU4VTMRSZNPHDQA
+WARC-Block-Digest: sha1:OSTWXLV772XNHS22T4UBSCSJAAXM2J6T
+WARC-Identified-Payload-Type: application/pdf
+
+HTTP/1.1 200 OK
+Cache-Control: must-revalidate, post-check=0, pre-check=0,no-cache
+Pragma: public,no-cache
+Content-Type: application/pdf",text/html; charset=utf-8
+X-Crawler-Content-Encoding: gzip
+Expires: 0
+Server:
+X-Powered-By:
+Set-Cookie: 166d74d734106ba68b20ea303011f622=301619e3fe31ecb98c8473f0ff5f35a2; path=/
+Content-Disposition: attachment; filename="Mdiathque dpartementale des Deux-Svres - Rsultats de la recherche Belfond.pdf"
+Content-Transfer-Encoding: binary
+P3P: CP="NOI ADM DEV PSAi COM NAV OUR OTRo STP IND DEM"
+X-Content-Encoded-By:
+X-Powered-By:
+Date: Sat, 17 Aug 2019 22:40:16 GMT
+X-Crawler-Content-Length: 5448
+Content-Length: 10913
+
+        %PDF-1.7
+%<E2><E3><CF><D3>
+7 0 obj
+<< /Type /Page /Parent 1 0 R /LastModified (D:20190818004016+02'00') /Resources 2
+ 0 R /MediaBox [0.000000 0.000000 595.276000 841.890000] /CropBox [0.000000 0.000
+000 595.276000 841.890000] /BleedBox [0.000000 0.000000 595.276000 841.890000] /T
+rimBox [0.000000 0.000000 595.276000 841.890000] /ArtBox [0.000000 0.000000 595.2
+76000 841.890000] /Contents 8 0 R /Rotate 0 /Group << /Type /Group /S /Transparen
+cy /CS /DeviceRGB >> /PZ 1 >>
+endobj
+8 0 obj
+
+  >: uz 1566027313501.0/orig/warc/*-00000.warc.gz|tail -n +1823434 | tail -n +24 | head -c 20000 > ~/results/CC-MAIN-2019-35/mediatheque.pdf
+  >: ps2ascii mediatheque.pdf
+                             Médiathèque départementale des Deux-Sèvres - Résultats de la recherche Belfond
+
+                             Médiathèque départementale des Deux-Sèvres - Résultats de
+                             la recherche Belfond
+                                                               A charge de revanche
+                             Titre :
+                             Auteur : Grippando, James (1958-....)
+  ...
+  etc., three pages, no errors
+
+  >: uz 1566027313501.0/orig/warc/*-00000.warc.gz|fgrep -an  https://museum.wrap.gov.tw/GetFile4.ashx
+  38896837:WARC-Target-URI: https://museum.wrap.gov.tw/GetFile4.ashx?Serial=201609200919D005
+  38896858:WARC-Target-URI: https://museum.wrap.gov.tw/GetFile4.ashx?Serial=201609200919D005
+  38904590:WARC-Target-URI: https://museum.wrap.gov.tw/GetFile4.ashx?Serial=201609200919D005
+    >: uz 1566027313501.0/orig/warc/*-00000.warc.gz|tail -n +38896858 | egrep -an '^%%EOF'
+  27:%%EOF
+  1114658:%%EOF
+  1313299:%%EOF
+
+Hunh?
+
+  >: uz 1566027313501.0/orig/warc/*-00000.warc.gz|tail -n +38896858 | egrep -an '^(%%EOF|WARC)' | head -30
+  1:WARC-Target-URI: https://museum.wrap.gov.tw/GetFile4.ashx?Serial=201609200919D005
+  2:WARC-Payload-Digest: sha1:SZ53DQQHENC7DDN7GQ5IS7VMEPAXAMBE
+  3:WARC-Block-Digest: sha1:QTKJA6A7445Z7264K2YAFBUUM2OYH2T2
+  4:WARC-Truncated: length
+  5:WARC-Identified-Payload-Type: application/pdf
+  27:%%EOF
+  7725:WARC/1.0
+  7726:WARC-Type: metadata
+  7727:WARC-Date: 2019-08-17T22:59:14Z
+  7728:WARC-Record-ID: <urn:uuid:77df2747-e567-45d3-8646-3069ae9a9f25>
+  7731:WARC-Warcinfo-ID: <urn:uuid:f689f8d0-24f3-4824-9a38-4f3fee422a4e>
+  7732:WARC-Concurrent-To: <urn:uuid:eceb4adc-d81e-4497-82fe-eea61ce171f4>
+  7733:WARC-Target-URI: https://museum.wrap.gov.tw/GetFile4.ashx?Serial=201609200919D005
+  7739:WARC/1.0
+
+OK, so indeed truncated after 7700 lines or so...
+  >: uz 1566027313501.0/orig/warc/*-00000.warc.gz|tail -n +38896858 | tail -n +21 | head -c 1048576 > ~/results/CC-MAIN-2019-35/museum.pdf
+  >: ps2ascii ~/results/CC-MAIN-2019-35/museum.pdf
+   **** Error:  An error occurred while reading an XREF table.
+   **** The file has been damaged.
+Look in big_pdf?
+
+====Modify the original CC indexer to write new indices including lastmod====
+Looks like WarcRecordWriter.write, in
+src/nutch-cc/src/java/org/commoncrawl/util/WarcRecordWriter, is what
+needs to be edited to include the Last-Modified date
+
+To rebuild nutch-cc, particularly to recompile jar files after editing
+anything:
+
+  >: cd $HHOME/src/nutch-cc
+  >: ant
+
+Fixed deprecation bug in WarcCdxWriter.java
+
+Modified src/java/org/commoncrawl/util/WarcCdxWriter.java
+to include lastmod
+
+Can run just one test, which should allow testing this:
+
+  >: ant test-core -Dtestcase='TestWarcRecordWriter'
+
+Logic is tricky, and there's no easy way in
+
+Basically, tools/WarcExport.java launches a hadoop job based on a
+hadoop-runnable WarcExport instance.  Hadoop will in due course call
+ExportReducer.reduce, which will create an instance of WarcCapture
+"for each page capture", and call ExportMapper.context.write with that
+instance (via some configuration magic with the hadoop job Context).
+That in turn uses (more magic) WarcOutputFormat.getRecordWriter, which
+(finally!) calls write(the capture) on a previously created
+WarcRecordWriter instance.
+
+So to fake a test case, I need to build
+ 1) a WarcRecordWriter instance
+ 2) a WarcCapture instance
+and then invoke 1.write(2)
+
+Got that working, although still can't figure out where in the normal
+flow the metadata entry for Response.CONTENT_TYPE gets set.
+
+Now, add a test that takes a stream of WARC Response extracts and
+rewrites their index entries
+
+  >: head -8804 <(uz /beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00150.gz)|tail -10|  ix.py -h -w -x  > /tmp/hst/headers.txt
+  >: cp /tmp/hst/headers.txt src/test/org/commoncrawl/util/
+  >: shuf /tmp/hst/headers.txt > src/test/org/commoncrawl/util/headers_mixed.txt
+
+Won't quite work :-(
+How do we reconstruct the Warc filename, offset and length from the
+original index?
+
+Well, we can find the individual .warc.gz records!
+Thanks to https://stackoverflow.com/a/37042747/2595465
+
+  >: ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz > /tmp/hst/recs.txt
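+
+The core of the trick, as a sketch (not unpackz.py itself, whose
+buffer handling gets debugged below): feed compressed chunks to a zlib
+decompressor and watch for end-of-member via eof/unused_data, which
+yields each member's (offset, length) in the concatenated .warc.gz:
+
+  import zlib
+
+  def member_offsets(path, bufsize=1024 * 1024):
+      with open(path, 'rb') as f:
+          offset, consumed, pending = 0, 0, b''
+          d = zlib.decompressobj(wbits=zlib.MAX_WBITS | 16)  # gzip wrapper
+          while True:
+              chunk = pending if pending else f.read(bufsize)
+              pending = b''
+              if not chunk:
+                  break            # EOF (an incomplete member is dropped)
+              d.decompress(chunk)  # payload discarded, only offsets wanted
+              if d.eof:            # member ended inside this chunk
+                  used = len(chunk) - len(d.unused_data)
+                  yield offset, consumed + used
+                  offset += consumed + used
+                  pending = d.unused_data    # may hold further members
+                  d = zlib.decompressobj(wbits=zlib.MAX_WBITS | 16)
+                  consumed = 0
+              else:
+                  consumed += len(chunk)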
+
+Nearly working, got 1/3rd of the way through a single WARC and then failed:
+
+  >: n=0 && ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2>/tmp/hst/tt.txt|while read o l; do echo $((n+=1)); echo $o $l >> /tmp/hst/r3a; ix.py $l $o   CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz| wc -l; done
+  ...
+  20
+  10215
+  CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz
+  Process fail: Compressed file ended before the end-of-stream marker was reached, input:
+   length=10762, offset=60784640, file=/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz
+
+  >: head -10217 /tmp/hst/r3a | tail -4
+  60784173 467
+  60784640 10762
+  60795402 463
+  60795865 460
+  >: ix.py 467 60784173   CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz|fgrep Target
+  WARC-Target-URI: http://drycarerestoration.co/corner-furniture-piece/unique-corner-decoration-pieces-or-corner-furniture-pieces-corner-corner-furniture-piece-corner-furniture-pieces-bedroom/
+
+  >: zcat /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/cdx/warc/cdx.gz 
+  ...
+  co,drycarerestoration)/corner-furniture-piece/unique-corner-decoration-pieces-or-corner-furniture-pieces-corner-corner-furniture-piece-corner-furniture-pieces-bedroom 20190819020224 {"url": "http://drycarerestoration.co/corner-furniture-piece/unique-corner-decoration-pieces-or-corner-furniture-pieces-corner-corner-furniture-piece-corner-furniture-pieces-bedroom/", "mime": "text/html", "mime-detected": "text/html", "status": "200", "digest": "DTKGJL45XQDXUS7PTXPYR6POMPLG46RZ", "length": "2570", "offset": "60784640", "filename": "crawl-data/CC-MAIN-2019-35/segments/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz", "charset": "UTF-8", "languages": "eng"}
+  >: ix.py 2570 60784640   CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz|less
+  >: echo $((10762 - 2570))
+  8192
+
+Ah, the error I was dreading :-(  I _think_ this happens when an
+individual record ends exactly on an 8K buffer boundary: the reported
+length (10762) is exactly 8192 more than the true one (2570).
+
+Yes:
+
+  >: echo $((60784640 % 8192))
+  0
+
+Even with a 1MB buffer:
+  21
+  160245
+  CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz
+  Process fail: Compressed file ended before the end-of-stream marker was reached, input:
+   length=8415, offset=1059033915, file=/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz
+  0
+  160246
+
+  >: tail -60 /tmp/hst/r3b|head -20
+  1059013061 423
+  1059013484 7218
+  1059020702 425
+  1059021127 424
+  1059021551 11471
+  1059033022 426
+  1059033448 467
+  1059033915 8415
+
+Argh.  This is at the _same_ point (as before, the failure comes 51
+records before EOF).  Ah, maybe that's the point -- this is the last
+read before EOF, and it's not a full buffer!
+
+  >: ix.py 467 1059033448   CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz|less
+  ...
+  WARC-Target-URI: https://zowiecarrpsychicmedium.com/tag/oracle/
+
+Reran with more instrumentation, took at least all day:
+
+  >: n=0 && ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2> /tmp/hst/r3e_err.txt | while read o l; do
+      echo $((n+=1)); echo $o $l >> /tmp/hst/r3e_val; ix.py $l $o CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz | wc -l;
+  done > /tmp/hst/r3e_log 2>&1
+  >: wc -l /tmp/hst/r3e_err.txt
+  160296 /tmp/hst/r3e_err.txt
+  >: tail -60 /tmp/hst/r3e_err.txt|cat -n | grep -C2 True\ True
+       7  b 28738 28738 28312 426 False False
+       8  b 28312 28312 27845 467 False False
+       9  b 27845 378162 369747 8415 True True  < this is the first hit in
+                                                  the last (partial) block
+      10  b 369747 369747 369312 435 False True
+      11  b 369312 369312 368878 434 False True
+
+  >: tail -55 /tmp/hst/r3e_val | head -3
+  1059033022 426
+  1059033448 467
+  1059033915 8415
+  >: dd ibs=1  skip=1059033022 count=426 if=/beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz of=/dev/stdout | uz -t
+  ...
+  426 bytes copied, 0.00468243 s, 91.0 kB/s
+  sing<3411>: dd ibs=1  skip=1059033448 count=467 if=/beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz of=/dev/stdout | uz -t
+  ...
+  467 bytes copied, 0.00382692 s, 122 kB/s
+  sing<3412>: dd ibs=1  skip=1059033915 count=8415 if=/beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz of=/dev/stdout | uz -t
+  igzip: Error (null) does not contain a complete gzip file
+  ...
+  8415 bytes (8.4 kB, 8.2 KiB) copied, 0.00968889 s, 869 kB/s
+
+So, tried one change, using the actual size read rather than BUFSIZE at
+one point; seems to work now:
+
+  >: time ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2> /tmp/hst/r3f_err.txt | tee /tmp/hst/r3f_val | while read l o; do printf '%s\t%s\t%s\n' $l $o 'CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz';
+done 2>&1 | tee /tmp/hst/r3f_log | ix.py -w | egrep -c '^WARC/1\.0'
+  160296
+  real  3m48.393s
+  user  0m47.997s
+  sys   0m26.641s
+
+  >: tail /tmp/hst/r3f_val
+10851 1059370472
+475 1059381323
+444 1059381798
+22437 1059382242
+447 1059404679
+506 1059405126
+15183 1059405632
+471 1059420815
+457 1059421286
+17754 1059421743
+
+  >: wc -l /tmp/hst/*_val
+    171 /tmp/hst/r3d_val
+  160297 /tmp/hst/r3e_val
+  160296 /tmp/hst/r3f_val
+  320764 total
+  >: uz /tmp/hst/head.warc.gz |egrep -c '^WARC/1\.0.$'
+  171
+  >: tail -n 3 /tmp/hst/*_val
+  ==> /tmp/hst/r3d_val <==
+  454 1351795
+  414 1352249
+  0 1352663 [so the 171 above is bogus, and we're missing one]
+
+  ==> /tmp/hst/r3e_val <==
+  1059393441 457
+  1059393898 17754
+  0 [likewise bogus, so see below]
+
+  ==> /tmp/hst/r3f_val <==
+  471 1059420815
+  457 1059421286
+  17754 1059421743 [better, but still one missing]
+  >: uz /tmp/hst/head.warc.gz |egrep '^WARC-Type: ' | tee >(wc -l 1>&2) | tail -4
+  WARC-Type: response
+  WARC-Type: metadata
+  WARC-Type: request
+  WARC-Type: response [missing]
+  171
+  >: ls -lt /tmp/hst/*_val
+  -rw-r--r-- 1 hst dc007    1977 Sep 29 09:27 /tmp/hst/r3d_val
+  -rw-r--r-- 1 hst dc007 2319237 Sep 28 14:28 /tmp/hst/r3f_val
+  -rw-r--r-- 1 hst dc007 2319238 Sep 27 19:41 /tmp/hst/r3e_val
+  >: ls -l ~/lib/python/unpackz.py
+  -rwxr-xr-x 1 hst dc007 1821 Sep 28 15:13 .../dc007/hst/lib/python/unpackz.py
+So e and f are stale, rerun
+  >: time ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2>/tmp/hst/r3f_err.txt| tee /tmp/hst/r3f_val|while read l o; do printf '%s\t%s\t%s\n' $l $o 'CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz' ;done |& tee /tmp/hst/r3f_log |ix.py -w |egrep '^WARC-Type: ' | tail -4 &
+  >: Reading length, offset, filename tab-delimited triples from stdin...
+  WARC-Type: response
+  WARC-Type: metadata
+  WARC-Type: request
+  WARC-Type: response
+
+  real  3m49.760s
+  user  0m47.180s
+  sys   0m32.218s
+So missing the final metadata...
+Back to head.warc.gz, with debug info
+   
+  >: n=0 && ~/lib/python/unpackz.py /tmp/hst/head.warc.gz 2>/tmp/hst/ttd.txt|while read l o; do echo $((n+=1)); echo $l $o >> /tmp/hst/r3d_val; dd ibs=1 skip=$o count=$l if=/tmp/hst/head.warc.gz of=/dev/stdout 2>/tmp/hst/r3d_ido| uz -t ; done >/tmp/hst/r3d_log 2>&1
+  >: tail -2 /tmp/hst/r3d_log
+  171
+  igzip: Error invalid gzip header found for file (null)
+  >: tail -n 3 /tmp/hst/ttd.txt /tmp/hst/r3d_val
+  ==> /tmp/hst/ttd.txt <==
+  b 9697 9697 9243 454 False True
+  b 9243 9243 8829 414 False True
+  n 8829
+
+  ==> /tmp/hst/r3d_val <==
+  454 1351795
+  414 1352249
+  0 1352663
+
+  >: cat -n /tmp/hst/r3f_val | head -172 | tail -4
+     169  454 1351795
+     170  414 1352249
+     171  8829 1352663
+     172  446 1361492
+
+Fixed, maybe
+
+  >: tail -n 3 /tmp/hst/r3d_log /tmp/hst/r3d_val
+  ==> /tmp/hst/r3d_log <==
+  169
+  170
+  171
+
+  ==> /tmp/hst/r3d_val <==
+  454 1351795
+  414 1352249
+  8829 1352663
+
+Yes!
+
+  >: time ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2>/tmp/hst/r3f_err| tee /tmp/hst/r3f_val|while read l o; do printf '%s\t%s\t%s\n' $l $o 'CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz' ;done |& tee /tmp/hst/r3f_log |ix.py -w |egrep '^WARC-Type: ' | tail -4
+  Reading length, offset, filename tab-delimited triples from stdin...
+  WARC-Type: metadata
+  WARC-Type: request
+  WARC-Type: response
+  WARC-Type: metadata
+
+  real  3m26.042s
+  user  0m44.167s
+  sys   0m24.716s
+  >: tail -n 3 /tmp/hst/r3f*
+  ==> /tmp/hst/r3f_err <==
+
+  ==> /tmp/hst/r3f_val <==
+  457 1059421286
+  17754 1059421743
+  425 1059439497
+
+Doubling the buffer size doesn't speed things up:
+  >: time ~/lib/python/unpackz.py -b $((2 * 1024 * 1024)) /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2>/tmp/hst/r3g_err| tee /tmp/hst/r3g_val|while read l o; do printf '%s\t%s\t%s\n' $l $o 'CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz' ;done |& tee /tmp/hst/r3g_log |ix.py -w |egrep '^WARC-Type: ' | tail -4
+  Reading length, offset, filename tab-delimited triples from stdin...
+  WARC-Type: metadata
+  WARC-Type: request
+  WARC-Type: response
+  WARC-Type: metadata
+
+  real  3m34.519s
+  user  0m52.312s
+  sys   0m24.875s
+
+Tried using FileIO.readinto([a fixed buffer]), but it didn't
+immediately work.  Abandoned, because I still don't understand how
+zlib.decompress works at all...
+
+Time to convert unpackz to a library which takes a callback as an
+alternative to an output file -- Done
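+
+The shape of that interface, as a hypothetical sketch (the real
+unpackz.py API may differ; member_offsets is the scanner sketched
+earlier):
+
+  import sys
+
+  def unpack(path, scan=member_offsets, callback=None, out=sys.stdout):
+      for offset, length in scan(path):
+          if callback is None:
+              out.write(f'{length}\t{offset}\n')   # the tsv form used below
+          else:
+              callback(offset, length)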
+
+W/o using the callback, timing and structure for what we need for the
+re-indexing task look encouraging:
+  >: time ~/lib/python/cc/unpackz.py -o /dev/stdout /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz |egrep -aA20 '^WARC-Type: response' | cut -f 1 -d ' ' | egrep -a '^WARC-' |sus | tee >(wc -l 1>&2)
+    52468 WARC-Block-Digest:
+    52468 WARC-Concurrent-To:
+    52468 WARC-Date:
+    52468 WARC-Identified-Payload-Type:
+    52468 WARC-IP-Address:
+    52468 WARC-Payload-Digest:
+    52468 WARC-Record-ID:
+    52468 WARC-Target-URI:
+    52468 WARC-Type:
+    52468 WARC-Warcinfo-ID:
+      236 WARC-Truncated:
+  11
+
+  real  0m20.308s
+  user  0m19.720s
+  sys   0m4.505s
+
+Whole thing, with no pre-filtering:
+
+  >: time ~/lib/python/cc/unpackz.py -o /dev/stdout /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz  | cut -f 1 -d ' ' | egrep -a '^(WARC-|Content-|Last-Modified)' |sus | tee >(wc -l 1>&2)
+   211794 Content-Length:
+   211162 Content-Type:
+   159323 WARC-Target-URI:
+   159311 WARC-Warcinfo-ID:
+   159301 WARC-Record-ID:
+   159299 WARC-Date:
+   159297 WARC-Type:
+   105901 WARC-Concurrent-To:
+   105896 WARC-IP-Address:
+    52484 WARC-Block-Digest:
+    52484 WARC-Identified-Payload-Type:
+    52482 WARC-Payload-Digest:
+     9239 Last-Modified:
+     3941 Content-Language:
+     2262 Content-Security-Policy:
+      642 Content-language:
+      326 Content-Security-Policy-Report-Only:
+      238 WARC-Truncated:
+      114 Content-Disposition:
+      352 Content-*:
+	1 WARC-Filename:
+  42
+
+real  0m30.896s
+user  0m37.335s
+sys   0m7.542s
+
+First 51 after WARC-Type: response
+
+  >: time ~/lib/python/cc/unpackz.py -o /dev/stdout /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz |egrep -aA50 '^WARC-Type: response' | cut -f 1 -d ' ' | egrep -a '^(WARC-|Content-|Last-Modified)' |sus | tee >(wc -l 1>&2)
+   106775 Content-Length:
+   106485 Content-Type:
+    55215 WARC-Type:
+    55123 WARC-Date:
+    54988 WARC-Record-ID:
+    54551 WARC-Warcinfo-ID:
+    54246 WARC-Target-URI:
+    54025 WARC-Concurrent-To:
+    52806 WARC-IP-Address:
+    52468 WARC-Block-Digest:
+    52468 WARC-Identified-Payload-Type:
+    52468 WARC-Payload-Digest:
+     9230 Last-Modified:
+     3938 Content-Language:
+     2261 Content-Security-Policy:
+      639 Content-language:
+      324 Content-Security-Policy-Report-Only:
+      236 WARC-Truncated:
+      114 Content-Disposition:
+      342 Content-*:
+  41
+
+  real  0m21.483s
+  user  0m22.372s
+  sys   0m5.400s
+
+So, not worth the risk, let's try python
+
+  >: time ~/lib/python/cc/cdx_extras.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz|wc -l
+  9238
+
+  real  0m25.426s
+  user  0m23.201s
+  sys   0m0.711s
+
+Looks good, but why 9238 instead of 9239???
+
+  >: ~/lib/python/cc/unpackz.py -o /dev/stdout /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz | egrep -a '^Last-Modified: ' > /tmp/hst/lmo.tsv
+
+Argh.  Serious bug in unpackz, wasn't handling cross-buffer-boundary
+records correctly.  Fixed.  Redoing the above...
+
+No pre-filter:
+  >: uz /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz|egrep -c '^WARC/1\.0.$'
+  160297
+
+  >: time ~/lib/python/cc/unpackz.py -o /dev/stdout /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz  | cut -f 1 -d ' ' | egrep -a '^(WARC-|Content-|Last-Modified)' |sus | tee >(wc -l 1>&2)
+
+ 213719 Content-Length:
+ 213088 Content-Type:
+ 160297 WARC-Date:
+ 160297 WARC-Record-ID:
+ 160297 WARC-Type:
+ 160296 WARC-Target-URI:
+ 160296 WARC-Warcinfo-ID:
+ 106864 WARC-Concurrent-To:
+ 106864 WARC-IP-Address:
+  53432 WARC-Block-Digest:  [consistent with 160297 == (3 * 53432) + 1]
+  53432 WARC-Identified-Payload-Type:
+  53432 WARC-Payload-Digest:
+   9430 Last-Modified:
+   4006 Content-Language:
+   2325 Content-Security-Policy:
+    653 Content-language:
+    331 Content-Security-Policy-Report-Only:
+    298 WARC-Truncated:
+    128 Content-Disposition:
+     83 Content-Location:
+     67 Content-type:
+     51 Content-MD5:
+     45 Content-Script-Type:
+     42 Content-Style-Type:
+     31 Content-Transfer-Encoding:
+     13 Content-disposition:
+      8 Content-Md5:
+      5 Content-Description:
+      5 Content-script-type:
+      5 Content-style-type:
+      3 Content-transfer-encoding:
+      2 Content-Encoding-handler:
+      1 Content-DocumentTitle:
+      1 Content-Hash:
+      1 Content-ID:
+      1 Content-Legth:
+      1 Content-length:
+      1 Content-Range:
+      1 Content-Secure-Policy:
+      1 Content-security-policy:
+      1 Content-Type-Options:
+      1 WARC-Filename:
+42
+
+real  0m28.876s
+user  0m35.703s
+sys   0m6.976s
+
+  >: ~/lib/python/cc/unpackz.py -o /dev/stdout /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz | egrep -a '^Last-Modified: ' > /tmp/hst/lmo.tsv
+  >: wc -l /tmp/hst/lmo.tsv
+  9430 /tmp/hst/lmo.tsv
+  >: time ~/lib/python/cc/cdx_extras.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz > /tmp/hst/lm.tsv
+
+  real  0m17.191s
+  user  0m15.739s
+  sys   0m0.594s
+  >: wc -l /tmp/hst/lm.tsv
+  9423 /tmp/hst/lm.tsv
+
+  >: diff <(sed 's/^Last-Modified: //' /tmp/hst/lmo.tsv | tr -d '\r') <(cut -f 3 /tmp/hst/lm.tsv)
+  853d852
+  <       Mon, 19 Aug 2019 01:46:49 GMT
+  4058d4056
+  < Tue, 03 Nov 2015 21:31:18 GMT<br />
+  4405d4402
+  < Mon, 19 Aug 2019 01:54:52 GMT
+  5237,5238d5233
+  < 3
+  < Asia/Amman
+  7009d7003
+  <       Mon, 19 Aug 2019 02:34:20 GMT
+  9198d9191
+  <       Mon, 19 Aug 2019 02:14:49 GMT
+
+All good.  The only implausible case is
+  < Mon, 19 Aug 2019 01:54:52 GMT
+which turns out to be a case of two Last-Modified headers in the same
+response record's HTTP headers.  RFCs 2616 and 7230 rule it out but
+neither specifies a recovery, so first-wins is as good as anything,
+and indeed RFC 6797 specifies that.
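+
+First-wins as a python sketch (an illustration of the rule, not
+cdx_extras.py itself; parsedate_to_datetime also rejects junk like the
+"3" and "Asia/Amman" values above):
+
+  from email.utils import parsedate_to_datetime
+
+  def last_modified(http_headers):
+      for line in http_headers.splitlines():
+          name, _, value = line.partition(':')
+          if name.strip().lower() == 'last-modified':
+              try:
+                  return parsedate_to_datetime(value.strip())
+              except (TypeError, ValueError):
+                  return None   # first one wins, even if unusable
+      return None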
+
+Start looking at how we do the merge of the cdx_extras.py output with
+the existing index
+
+Try it with the existing _per segment_ index we have for 2019-35
+
+Assuming we have to key on segment plus offset, as reconstructing the
+proper index key is such a pain / buggy / is going to change with the year.
+
+Stay with segment 49
+
+  >: uz cdx.gz |wc -l
+ 29,870,307
+
+  >: time uz cdx.gz|egrep -ao ' "length": "[0-9]*", "offset": "[0-9]*"' |wc
+  29,870,307 119,481,228 1,241,098,122
+             = 4 * 29,870,307
+
+So no bogons, not _too_ surprising :-)
+
+Bad news is it's a _big_ file:
+
+  >: ls -lh cdx.gz
+  -rw-r--r-- 1 hst dc007 2.0G Mar 18  2021 cdx.gz
+
+So not viable to paste offset on as a key and then sort on the command
+line, or to load it into python and do the work there...
+
+Do it per warc file and then merge?
+
+  >: time uz cdx.gz |fgrep -a warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz | sort -n -t\" -k28,28 > /tmp/hst/558.warc.cdx
+
+  real  0m23.494s
+  user  0m14.541s
+  sys   0m9.158s
+
+  >: wc -l /tmp/hst/558.warc.cdx
+  53432 /tmp/hst/558.warc.cdx
+
+  >: echo $((600 * 53432))
+  32,059,200
+
+So, 600 of those, plus approx. the same again for extracting; that
+probably _is_ doable in python, not more than 10 hours total, assuming
+internal sort and external merge is not too expensive...
+
+For each segment, suppose we pull out 60 groups of 10 target files
+  >: time uz cdx.gz |egrep -a warc/CC-MAIN-2019[^-]*-2019[^-]*-0000..warc.gz > /tmp/hst/0000.warc.cdx
+
+  real  0m42.129s
+  user  0m35.147s
+  sys   0m9.140s
+  >: wc -l /tmp/hst/0000.warc.cdx
+  533150
+
+Key it with offset and sort:
+
+  >: time egrep -ao ' "length": "[0-9]*", "offset": "[0-9]*"' /tmp/hst/0000.warc.cdx | cut -f 5 -d ' ' | tr -d \"  > /tmp/hst/0000_offsets
+
+  real  0m5.578s
+  user  0m5.593s
+  sys   0m0.265s
+
+  >: time paste /tmp/hst/0000_offsets /tmp/hst/0000.warc.cdx |sort -nk1,1 | cut -f 2 > /tmp/hst/0000_sorted.warc.cdx
+
+  real  0m4.185s
+  user  0m2.001s
+  sys   0m1.334s
+
+  >: time seq 0 9 | parallel -j 10 "~/lib/python/cc/cdx_extras.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-*-*-0000'{}'.warc.gz > /tmp/hst/lm_0000'{}'.tsv"
+
+  real  0m24.610s
+  user  2m54.146s
+  sys   0m10.226s
+
+  >: head /tmp/hst/lm_00000.tsv
+  9398  16432     Mon, 19 Aug 2019 02:44:15 GMT
+  20796 26748     Tue, 16 Jul 2019 04:39:09 GMT
+  4648  340633    Fri, 07 Dec 2018 09:05:59 GMT
+  3465  357109    Sun, 18 Aug 2019 11:48:23 GMT
+  7450  914189    Mon, 19 Aug 2019 02:50:08 GMT
+  ...
+  sing<3956>: fgrep '"length": "9398", "offset": "16432"' /tmp/hst/0000_sorted.warc.cdx
+  com,roommeme,0401a)/index.phtml?channel=&op=&p=140&put=show&r2= 20190819024416 {"url": "http://0401a.roommeme.com/index.phtml?PUT=SHOW&R2=&OP=&P=140&CHANNEL=", "mime": "text/html", "mime-detected": "text/html", "status": "200", "digest": "5DNDVX5HQBOOBHISSCOI4UBVMUL63L36", "length": "9398", "offset": "16432", "filename": "crawl-data/CC-MAIN-2019-35/segments/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00000.warc.gz", "charset": "Big5", "languages": "zho"}
+
+bingo
+
+So, the python code is pretty straightforward: open the 10 individual
+lm-*.tsv outputs into an array, initialise a 10-elt array with the
+first line of each and another with its offset, record the
+fileno(s) of the lowest offset, then iterate (a minimal sketch follows
+the list below):
+
+  read cdx lines and write unchanged until offset = lowest
+  merge line from fileno and output
+  remove fileno from list of matches
+  read and store a new line for fileno [handle EOF]
+  if list of matches is empty, redo setting of lowest
+
+Resort the result by actual key
+
+Meanwhile, get a whole test set:
+sbatch --output=slurm_aug_cdx_49_10-599-out --time=01:00:00 --ntasks=10 -c 36 --exclusive $HOME/bin/runme.sh -m 00 59 $PWD -t 18 -b 'export resdir=CC-MAIN-2019-35/aug_cdx/49
+export DEC=$xarg' "export PYTHONPATH=./lib/python/cc:$PYTHONPATH
+seq 0 9 | parallel -j 10 \"~/lib/python/cc/cdx_extras.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-*-*-00\${DEC}'{}'.warc.gz > \$resdir/00\${DEC}'{}'.tsv\""
+
+Actually finished 360 in the hour.
+
+Leaving
+
+sbatch --output=slurm_aug_cdx_49_360-599-out --time=01:00:00 --ntasks=10 -c 36 --exclusive $HOME/bin/runme.sh -m 36 59 $PWD -t 18 -b 'export resdir=CC-MAIN-2019-35/aug_cdx/49
+export DEC=$xarg' "export PYTHONPATH=./lib/python/cc:$PYTHONPATH
+seq 0 9 | parallel -j 10 \"~/lib/python/cc/cdx_extras.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-*-*-00\${DEC}'{}'.warc.gz > \$resdir/00\${DEC}'{}'.tsv\""
+
+But something is wrong, the number of jobs is all wrong:
+  
+  5>: fgrep -c parallel slurm_aug_cdx_49_0-359-out
+  741
+  sing<4046>: ls -lt CC-MAIN-2019-35/aug_cdx/49/|wc -l
+  372
+
+Every file is being produced twice.
+
+Took me a while to figure out my own code :-(
+
+  >: sbatch --output=slurm_aug_cdx_49_360-599-out --time=01:00:00 --ntasks=10 -c 36 --exclusive $HOME/bin/runme.sh -m 49 49 $PWD -t 18 -b 'export resdir=CC-MAIN-2019-35/aug_cdx/$xarg
+  export SEG=$xarg
+  share_by_task.sh -f "%03g\n" -s 360 599 $n $task > /tmp/hst_$task' -i 'cat /tmp/hst_$task' 'export PYTHONPATH=./lib/python/cc:$PYTHONPATH
+   ~/lib/python/cc/cdx_extras.py /beegfs/common_crawl/CC-MAIN-2019-35/*.$SEG/orig/warc/CC-MAIN-*-*-00${arg}.warc.gz > $resdir/00${arg}.tsv'
+
+Oops, only 560, not 600
+
+Took 3.5 minutes for 200, so call it 10 for 560, so do 6 more in an
+hour:
+
+  >: sbatch --output=slurm_aug_cdx_50-55_out --time=01:00:00 --ntasks=10 -c 36 --exclusive $HOME/bin/runme.sh -m 50 55 $PWD -t 18 -b 'export resdir=CC-MAIN-2019-35/aug_cdx/$xarg
+mkdir -p $resdir
+export SEG=$xarg
+share_by_task.sh -f "%03g\n" -s 360 599 $n $task > /tmp/hst_$task' -i 'cat /tmp/hst_$task' 'export PYTHONPATH=./lib/python/cc:$PYTHONPATH
+ ~/lib/python/cc/cdx_extras.py /beegfs/common_crawl/CC-MAIN-2019-35/*.$SEG/orig/warc/CC-MAIN-*-*-00${arg}.warc.gz > $resdir/00${arg}.tsv'
+  
+  >: tail slurm_aug_cdx_50-55_out
+  ...
+  Wed Oct 9 22:25:47 BST 2024 Finished 55
+  >: head -1 slurm_aug_cdx_50-55_out
+  Wed Oct 9 21:29:43 BST
+               56:04
+
+  >: du -s CC-MAIN-2019-35/aug_cdx
+  1,902,916
+  
+Not bad, so order 20GB for the whole thing
+
+Next step, compare to my existing cdx with timestamp
+
+First check looks about right:
+
+  [cd .../warc_lmhx]
+  >: seq --format='%03g' 0 299 > /tmp/hst/cdx_nums
+  >: parallel -j 20 -a /tmp/hst/cdx_nums 'uz idx/cdx-00{}.gz | egrep -o "\"filename\": \"crawl-data/CC-MAIN-2019-35/segments/[^.]*[.]50.*\"lastmod\":" | sed "s/^.*-00//;s/^\(...\).*/\1/"| sus > /tmp/hst/checkseg_50_{}'
+
+  [cd .../aug_cdx/50]
+  >: wc -l 00123.tsv
+  9333
+  >: egrep -h '123$' /tmp/hst/checkseg_50_??? | acut 1 | btot
+  9300
+  >: wc -l 00400.tsv
+  9477 00400.tsv
+  >: egrep -h '400$' /tmp/hst/checkseg_50_??? | acut 1 | btot
+  9439
+
+Difference is presumably because the bogus timestamps aren't in the
+augmented cdx as shipped.
+
+Note that the following 'bad' kind of timestamp is fixed before
+sort_date.py does its thing:
+
+   ... sort_date.sh <(uz $arg/*00???.warc.gz | '"fgrep $'\t'|sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/')"' >$arg/ks.tsv
+  
+
+  >: egrep -c '[^ ]GMT$' 50/00123.tsv
+  22
+  >: egrep -c '[^ ]GMT$' 50/00400.tsv
+  14
+
+  >: PYTHONPATH=~/.local/lib/python3.9/site-packages:$PYTHONPATH sort_date.sh <(uz ../warc_lmhx/50/*00123.warc.gz | fgrep $'\t'|sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/') 2> /tmp/hst/123_errs | wc -l
+  9300
+  >: fgrep -c Invalid /tmp/hst/123_errs
+  33
+  >: PYTHONPATH=~/.local/lib/python3.9/site-packages:$PYTHONPATH sort_date.sh <(uz ../warc_lmhx/50/*00400.warc.gz | fgrep $'\t'|sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/') 2> /tmp/hst/400_errs | wc -l
+  9439
+  >: fgrep -c Invalid /tmp/hst/400_errs
+  38
+
+All good.
+
+But
+  >: seq --format='%03g' 0 559 > /tmp/hst/warc_nums
+  >: xx () {
+  r=$(diff -bw
+   <(echo $((
+    $(sort_date.sh <(uz ../warc_lmhx/50/*00$1.warc.gz |
+      fgrep $'\t'|sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/') 2>/tmp/hst/ec_$1 |wc -l)
+       +
+    $(fgrep -c Invalid /tmp/hst/ec_$1))))
+  <(wc -l < 50/00$1.tsv))
+ if [ "$r" ]
+ then printf "%s:\n%s\n" $2 "$r"
+ fi
+ }
+  >: parallel -j 20 -a /tmp/hst/warc_nums xx '{}'  '$(({#} - 1))'  | tee /tmp/hst/aug_bugs
+  >: fgrep -c 1c1 /tmp/hst/aug_bugs
+  77
+  sing<4318>: wc -l < /tmp/hst/aug_bugs
+  385
+  sing<4319>: echo $((77 * 5))
+  385
+
+OK, there are a few other error messages from date conversion
+   >: xx () { r=$(diff -bw <(echo $(($(sort_date.sh <(uz ../warc_lmhx/50/*00$1.warc.gz | fgrep $'\t'|sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/') 2>/tmp/hst/ec_$1 |wc -l) + $(egrep -c 'Invalid|must be in|out of range' /tmp/hst/ec_$1)))) <(wc -l < 50/00$1.tsv)); if [ "$r" ]; then  printf "%s:\n%s\n" $2 "$r"; fi; }
+sing<4337>: parallel -j 20 -a /tmp/hst/warc_nums xx '{}'  '$(({#} - 1))'  | tee /tmp/hst/aug_bugs2
+   [nothing]
+
+So, I think we can believe we're OK
+But checking all 7 segments is better than just 1:
+  >: xx () { r=$(diff -bw <(echo $(($(sort_date.sh <(uz ../warc_lmhx/$3/*00$1.warc.gz | fgrep $'\t'|sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/') 2>/tmp/hst/ec_$1 |wc -l) + $(egrep -c 'Invalid|must be in|out of range' /tmp/hst/ec_$1)))) <(wc -l < $3/00$1.tsv)); if [ "$r" ]; then  printf "%s:\n%s\n" $2 "$r"; fi; }
+  >: for s in 49 {51..55}; do parallel -j 20 -a /tmp/hst/warc_nums xx '{}'  '$(({#} - 1))' $s  | tee /tmp/hst/aug_bugs_$s; done
+  [nothing]
+
+Next step: ?
+
+
+