# HG changeset patch # User Henry S. Thompson # Date 1734470728 0 # Node ID 4b5117db4929a7e86c227e301e86a93a9fb372ee # Parent dd06d7afbfe072d460975729f31b73f88cbeff05 minor updates diff -r dd06d7afbfe0 -r 4b5117db4929 lurid3/notes.txt --- a/lurid3/notes.txt Tue Oct 22 14:00:33 2024 +0100 +++ b/lurid3/notes.txt Tue Dec 17 21:25:28 2024 +0000 @@ -26,6 +26,31 @@ com,gyshbsh)/robots.txt 20181023022000 {"url": "http://gyshbsh.com/robots.txt", "mime": "text/html", "mime-detected": "text/html", "status": "301", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "529", "offset": "638892", "filename": "crawl-data/CC-MAIN-2019-35/segments/1566027315618.73/robotstxt/CC-MAIN-20190820200701-20190820222701-00120.warc.gz"} ... +Full search: + >: find CC*/cdx -type f -name cluster.idx > /tmp/hst/clus + >: cat /tmp/hst/clus | while read c; do printf '%s\t%s\n' $c $(cut -f 1 -d ' ' $c | fgrep -vc ${c:8:4}); done + CC-MAIN-2013-20/cdx/cluster.idx 0 + CC-MAIN-2014-35/cdx/cluster.idx 0 + CC-MAIN-2015-35/cdx/cluster.idx 0 + CC-MAIN-2016-30/cdx/cluster.idx 0 + CC-MAIN-2017-30/cdx/cluster.idx 0 + CC-MAIN-2018-30/cdx/warc/cluster.idx 0 + CC-MAIN-2018-34/cdx/cluster.idx 36 + CC-MAIN-2019-18/cdx/warc/cluster.idx 3 + CC-MAIN-2019-35/cdx/cluster.idx 1 + CC-MAIN-2020-34/cdx/cluster.idx 0 + CC-MAIN-2021-25/cdx/cluster.idx 0 + CC-MAIN-2021-31/cdx/cluster.idx 0 + CC-MAIN-2021-49/cdx/cluster.idx 0 + CC-MAIN-2022-21/cdx/warc/cluster.idx 0 + CC-MAIN-2022-33/cdx/warc/cluster.idx 0 + CC-MAIN-2022-40/cdx/warc/cluster.idx 0 + CC-MAIN-2022-49/cdx/warc/cluster.idx 0 + CC-MAIN-2023-40/cdx/warc/cluster.idx 0 + CC-MAIN-2023-50/cdx/warc/cluster.idx 0 + CC-MAIN-2024-33/cdx/warc/cluster.idx 0 +Emailed this info to Sebastian Nagel 2024-12-17 + Tabulate all the date ranges for the WARC files we have >: for d in {2017-30,2019-35,2020-34,2021-25,2023-40,2023-50}; do printf "%s\t" $d; (ls CC-MAIN-$d/*.{?,??}/orig/warc | fgrep .gz | cut -f 3,4 -d - | sort -u |tee /dev/fd/3 | head -1 ) 3> >( tail -1 ) | tr '\n' '\t'; echo; done | cut -f 1,2,4 -d - | sed 's/-20/ 20/;s/.$//' | tr ' ' '\t' > dates.tsv diff -r dd06d7afbfe0 -r 4b5117db4929 lurid3/status.xlsx Binary file lurid3/status.xlsx has changed