# HG changeset patch # User Henry S. Thompson # Date 1728578698 -3600 # Node ID 8dffb8aa33da8cf375c91bc6676beea8436497ea # Parent dc24bb6e524f8d2d011f4ccf4201629831be062e prelim consistency check with published lmh-augmented cdx diff -r dc24bb6e524f -r 8dffb8aa33da lurid3/notes.txt --- a/lurid3/notes.txt Wed Oct 09 22:55:27 2024 +0100 +++ b/lurid3/notes.txt Thu Oct 10 17:44:58 2024 +0100 @@ -906,3 +906,22 @@ Not bad, so order 20MB for the whole thing Next step, compare to my existing cdx with timestamp + +First check looks about right: + + [cd .../warc_lmhx] + >: seq --format='%03g' 0 299 > /tmp/hst/cdx_nums + >: parallel -j 20 -a /tmp/hst/cdx_nums 'uz idx/cdx-00{}.gz | egrep -o "\"filename\": \"crawl-data/CC-MAIN-2019-35/segments/[^.]*[.]50.*\"lastmod\":" | sed "s/^.*-00//;s/^\(...\).*/\1/"| sus > /tmp/hst/checkseg_50_{}' + + [cd .../aug_cdx/50] + >: wc -l 00123.tsv + 9333 + >: egrep -h '123$' /tmp/hst/checkseg_50_??? | acut 1 | btot + 9300 + >: wc -l 00400.tsv + 9477 00400.tsv + >: egrep -h '400$' /tmp/hst/checkseg_50_??? | acut 1 | btot + 9439 + +Difference is presumably that the bogus timestamps aren't in the augmented +cdx as shipped.