changeset 53:d533894173d0

detailed consistency check with 7 segments from published lmh-augmented cdx
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 11 Oct 2024 16:41:32 +0100
parents 8dffb8aa33da
children 237105932af5
files lurid3/notes.txt
diffstat 1 files changed, 60 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/lurid3/notes.txt	Thu Oct 10 17:44:58 2024 +0100
+++ b/lurid3/notes.txt	Fri Oct 11 16:41:32 2024 +0100
@@ -925,3 +925,63 @@
 
 Difference is presumably the bogus timestamps aren't in the augmented
 cdx as shipped.
+
+Note that the following 'bad' kind of timestamp (no space before
+'GMT') is fixed before sort_date.py does its thing:
+
+   ... sort_date.sh <(uz $arg/*00???.warc.gz | '"fgrep $'\t'|sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/')"' >$arg/ks.tsv
+  
+
+  >: egrep -c '[^ ]GMT$' 50/00123.tsv
+  22
+  >: egrep -c '[^ ]GMT$' 50/00400.tsv
+  14
+
+  >: PYTHONPATH=~/.local/lib/python3.9/site-packages:$PYTHONPATH sort_date.sh <(uz ../warc_lmhx/50/*00123.warc.gz | fgrep $'\t'|sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/') 2> /tmp/hst/123_errs | wc -l
+  9300
+  >: fgrep -c Invalid /tmp/hst/123_errs
+  33
+  >: PYTHONPATH=~/.local/lib/python3.9/site-packages:$PYTHONPATH sort_date.sh <(uz ../warc_lmhx/50/*00400.warc.gz | fgrep $'\t'|sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/') 2> /tmp/hst/400_errs | wc -l
+  9439
+  >: fgrep -c Invalid /tmp/hst/400_errs
+  38
+
+All good.
+
+But
+  >: seq --format='%03g' 0 559 > /tmp/hst/warc_nums
+  >: xx () {
+  r=$(diff -bw
+   <(echo $((
+    $(sort_date.sh <(uz ../warc_lmhx/50/*00$1.warc.gz |
+      fgrep $'\t'|sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/') 2>/tmp/hst/ec_$1 |wc -l)
+       +
+    $(fgrep -c Invalid /tmp/hst/ec_$1))))
+  <(wc -l < 50/00$1.tsv))
+ if [ "$r" ]
+ then printf "%s:\n%s\n" $2 "$r"
+ fi
+ }
+  >: parallel -j 20 -a /tmp/hst/warc_nums xx '{}'  '$(({#} - 1))'  | tee /tmp/hst/aug_bugs
+  >: fgrep -c 1c1 /tmp/hst/aug_bugs
+  77
+  sing<4318>: wc -l < /tmp/hst/aug_bugs
+  385
+  sing<4319>: echo $((77 * 5))
+  385
+
+OK, there are a few other error messages from date conversion, so count those as well:
+   >: xx () { r=$(diff -bw <(echo $(($(sort_date.sh <(uz ../warc_lmhx/50/*00$1.warc.gz | fgrep $'\t'|sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/') 2>/tmp/hst/ec_$1 |wc -l) + $(egrep -c 'Invalid|must be in|out of range' /tmp/hst/ec_$1)))) <(wc -l < 50/00$1.tsv)); if [ "$r" ]; then  printf "%s:\n%s\n" $2 "$r"; fi; }
+sing<4337>: parallel -j 20 -a /tmp/hst/warc_nums xx '{}'  '$(({#} - 1))'  | tee /tmp/hst/aug_bugs2
+   [nothing]
+
+So, I think we can believe we're OK.
+But checking 7 segments is better than checking 1:
+  >: xx () { r=$(diff -bw <(echo $(($(sort_date.sh <(uz ../warc_lmhx/$3/*00$1.warc.gz | fgrep $'\t'|sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/') 2>/tmp/hst/ec_$1 |wc -l) + $(egrep -c 'Invalid|must be in|out of range' /tmp/hst/ec_$1)))) <(wc -l < $3/00$1.tsv)); if [ "$r" ]; then  printf "%s:\n%s\n" $2 "$r"; fi; }
+  >: for s in 49 {51..55}; do parallel -j 20 -a /tmp/hst/warc_nums xx '{}'  '$(({#} - 1))' $s  | tee /tmp/hst/aug_bugs_$s; done
+  [nothing]
+
+Next step: ?
+
+
+