comparison lurid3/notes.txt @ 57:4b5117db4929

minor updates
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 17 Dec 2024 21:25:28 +0000
parents dd06d7afbfe0
children 3012ca7fc6b7
comparison
equal deleted inserted replaced
56:dd06d7afbfe0 57:4b5117db4929
23 >: fgrep ' 20181023' CC-MAIN-2019-35/cdx/cluster.idx 23 >: fgrep ' 20181023' CC-MAIN-2019-35/cdx/cluster.idx
24 com,gyshbsh)/robots.txt 20181023022000 cdx-00078.gz 356340085 162332 315406 24 com,gyshbsh)/robots.txt 20181023022000 cdx-00078.gz 356340085 162332 315406
25 >: zgrep ' 20181023' CC-MAIN-2019-35/cdx/warc/cdx-00078.gz 25 >: zgrep ' 20181023' CC-MAIN-2019-35/cdx/warc/cdx-00078.gz
26 com,gyshbsh)/robots.txt 20181023022000 {"url": "http://gyshbsh.com/robots.txt", "mime": "text/html", "mime-detected": "text/html", "status": "301", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "529", "offset": "638892", "filename": "crawl-data/CC-MAIN-2019-35/segments/1566027315618.73/robotstxt/CC-MAIN-20190820200701-20190820222701-00120.warc.gz"} 26 com,gyshbsh)/robots.txt 20181023022000 {"url": "http://gyshbsh.com/robots.txt", "mime": "text/html", "mime-detected": "text/html", "status": "301", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "529", "offset": "638892", "filename": "crawl-data/CC-MAIN-2019-35/segments/1566027315618.73/robotstxt/CC-MAIN-20190820200701-20190820222701-00120.warc.gz"}
27 ... 27 ...
28
29 Full search (per crawl, the number of cluster.idx entries that do not match the crawl's year):
30 >: find CC*/cdx -type f -name cluster.idx > /tmp/hst/clus
31 >: cat /tmp/hst/clus | while read c; do printf '%s\t%s\n' $c $(cut -f 2 -d ' ' $c | fgrep -vc ${c:8:4}); done
32 CC-MAIN-2013-20/cdx/cluster.idx 0
33 CC-MAIN-2014-35/cdx/cluster.idx 0
34 CC-MAIN-2015-35/cdx/cluster.idx 0
35 CC-MAIN-2016-30/cdx/cluster.idx 0
36 CC-MAIN-2017-30/cdx/cluster.idx 0
37 CC-MAIN-2018-30/cdx/warc/cluster.idx 0
38 CC-MAIN-2018-34/cdx/cluster.idx 36
39 CC-MAIN-2019-18/cdx/warc/cluster.idx 3
40 CC-MAIN-2019-35/cdx/cluster.idx 1
41 CC-MAIN-2020-34/cdx/cluster.idx 0
42 CC-MAIN-2021-25/cdx/cluster.idx 0
43 CC-MAIN-2021-31/cdx/cluster.idx 0
44 CC-MAIN-2021-49/cdx/cluster.idx 0
45 CC-MAIN-2022-21/cdx/warc/cluster.idx 0
46 CC-MAIN-2022-33/cdx/warc/cluster.idx 0
47 CC-MAIN-2022-40/cdx/warc/cluster.idx 0
48 CC-MAIN-2022-49/cdx/warc/cluster.idx 0
49 CC-MAIN-2023-40/cdx/warc/cluster.idx 0
50 CC-MAIN-2023-50/cdx/warc/cluster.idx 0
51 CC-MAIN-2024-33/cdx/warc/cluster.idx 0
52 Emailed this info to Sebastian Nagel on 2024-12-17.
28 53
29 Tabulate all the date ranges for the WARC files we have 54 Tabulate all the date ranges for the WARC files we have
30 55
31 >: for d in {2017-30,2019-35,2020-34,2021-25,2023-40,2023-50}; do printf "%s\t" $d; (ls CC-MAIN-$d/*.{?,??}/orig/warc | fgrep .gz | cut -f 3,4 -d - | sort -u |tee /dev/fd/3 | head -1 ) 3> >( tail -1 ) | tr '\n' '\t'; echo; done | cut -f 1,2,4 -d - | sed 's/-20/ 20/;s/.$//' | tr ' ' '\t' > dates.tsv 56 >: for d in {2017-30,2019-35,2020-34,2021-25,2023-40,2023-50}; do printf "%s\t" $d; (ls CC-MAIN-$d/*.{?,??}/orig/warc | fgrep .gz | cut -f 3,4 -d - | sort -u |tee /dev/fd/3 | head -1 ) 3> >( tail -1 ) | tr '\n' '\t'; echo; done | cut -f 1,2,4 -d - | sed 's/-20/ 20/;s/.$//' | tr ' ' '\t' > dates.tsv
32 >: for d in {2018-30,2018-34}; do printf "%s\t" $d; (ls CC-MAIN-$d/{*.?,*.??} | fgrep warc.gz | cut -f 3,4 -d - | sort -u |tee /dev/fd/3 | { sleep 10 ; head -1 ; } ) 3> >( tail -1 ) | tr '\n' '\t'; echo; done >> dates.tsv 57 >: for d in {2018-30,2018-34}; do printf "%s\t" $d; (ls CC-MAIN-$d/{*.?,*.??} | fgrep warc.gz | cut -f 3,4 -d - | sort -u |tee /dev/fd/3 | { sleep 10 ; head -1 ; } ) 3> >( tail -1 ) | tr '\n' '\t'; echo; done >> dates.tsv