Mercurial > hg > cc > work
comparison lurid3/notes.txt @ 57:4b5117db4929
minor updates
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 17 Dec 2024 21:25:28 +0000 |
parents | dd06d7afbfe0 |
children | 3012ca7fc6b7 |
comparison
equal
deleted
inserted
replaced
56:dd06d7afbfe0 | 57:4b5117db4929 |
---|---|
23 >: fgrep ' 20181023' CC-MAIN-2019-35/cdx/cluster.idx | 23 >: fgrep ' 20181023' CC-MAIN-2019-35/cdx/cluster.idx |
24 com,gyshbsh)/robots.txt 20181023022000 cdx-00078.gz 356340085 162332 315406 | 24 com,gyshbsh)/robots.txt 20181023022000 cdx-00078.gz 356340085 162332 315406 |
25 >: zgrep ' 20181023' CC-MAIN-2019-35/cdx/warc/cdx-00078.gz | 25 >: zgrep ' 20181023' CC-MAIN-2019-35/cdx/warc/cdx-00078.gz |
26 com,gyshbsh)/robots.txt 20181023022000 {"url": "http://gyshbsh.com/robots.txt", "mime": "text/html", "mime-detected": "text/html", "status": "301", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "529", "offset": "638892", "filename": "crawl-data/CC-MAIN-2019-35/segments/1566027315618.73/robotstxt/CC-MAIN-20190820200701-20190820222701-00120.warc.gz"} | 26 com,gyshbsh)/robots.txt 20181023022000 {"url": "http://gyshbsh.com/robots.txt", "mime": "text/html", "mime-detected": "text/html", "status": "301", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "529", "offset": "638892", "filename": "crawl-data/CC-MAIN-2019-35/segments/1566027315618.73/robotstxt/CC-MAIN-20190820200701-20190820222701-00120.warc.gz"} |
27 ... | 27 ... |
28 | |
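29 Full search: for every cluster.idx, count the entries whose first space-delimited field does not contain the crawl's year: | |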
30 >: find CC*/cdx -type f -name cluster.idx > /tmp/hst/clus | |
31 >: cat /tmp/hst/clus | while read c; do printf '%s\t%s\n' $c $(cut -f 1 -d ' ' $c | fgrep -vc ${c:8:4}); done | |
32 CC-MAIN-2013-20/cdx/cluster.idx 0 | |
33 CC-MAIN-2014-35/cdx/cluster.idx 0 | |
34 CC-MAIN-2015-35/cdx/cluster.idx 0 | |
35 CC-MAIN-2016-30/cdx/cluster.idx 0 | |
36 CC-MAIN-2017-30/cdx/cluster.idx 0 | |
37 CC-MAIN-2018-30/cdx/warc/cluster.idx 0 | |
38 CC-MAIN-2018-34/cdx/cluster.idx 36 | |
39 CC-MAIN-2019-18/cdx/warc/cluster.idx 3 | |
40 CC-MAIN-2019-35/cdx/cluster.idx 1 | |
41 CC-MAIN-2020-34/cdx/cluster.idx 0 | |
42 CC-MAIN-2021-25/cdx/cluster.idx 0 | |
43 CC-MAIN-2021-31/cdx/cluster.idx 0 | |
44 CC-MAIN-2021-49/cdx/cluster.idx 0 | |
45 CC-MAIN-2022-21/cdx/warc/cluster.idx 0 | |
46 CC-MAIN-2022-33/cdx/warc/cluster.idx 0 | |
47 CC-MAIN-2022-40/cdx/warc/cluster.idx 0 | |
48 CC-MAIN-2022-49/cdx/warc/cluster.idx 0 | |
49 CC-MAIN-2023-40/cdx/warc/cluster.idx 0 | |
50 CC-MAIN-2023-50/cdx/warc/cluster.idx 0 | |
51 CC-MAIN-2024-33/cdx/warc/cluster.idx 0 | |
52 Emailed this info to Sebastian Nagel on 2024-12-17. | |
28 | 53 |
29 Tabulate all the date ranges for the WARC files we have | 54 Tabulate all the date ranges for the WARC files we have |
30 | 55 |
31 >: for d in {2017-30,2019-35,2020-34,2021-25,2023-40,2023-50}; do printf "%s\t" $d; (ls CC-MAIN-$d/*.{?,??}/orig/warc | fgrep .gz | cut -f 3,4 -d - | sort -u |tee /dev/fd/3 | head -1 ) 3> >( tail -1 ) | tr '\n' '\t'; echo; done | cut -f 1,2,4 -d - | sed 's/-20/ 20/;s/.$//' | tr ' ' '\t' > dates.tsv | 56 >: for d in {2017-30,2019-35,2020-34,2021-25,2023-40,2023-50}; do printf "%s\t" $d; (ls CC-MAIN-$d/*.{?,??}/orig/warc | fgrep .gz | cut -f 3,4 -d - | sort -u |tee /dev/fd/3 | head -1 ) 3> >( tail -1 ) | tr '\n' '\t'; echo; done | cut -f 1,2,4 -d - | sed 's/-20/ 20/;s/.$//' | tr ' ' '\t' > dates.tsv |
32 >: for d in {2018-30,2018-34}; do printf "%s\t" $d; (ls CC-MAIN-$d/{*.?,*.??} | fgrep warc.gz | cut -f 3,4 -d - | sort -u |tee /dev/fd/3 | { sleep 10 ; head -1 ; } ) 3> >( tail -1 ) | tr '\n' '\t'; echo; done >> dates.tsv | 57 >: for d in {2018-30,2018-34}; do printf "%s\t" $d; (ls CC-MAIN-$d/{*.?,*.??} | fgrep warc.gz | cut -f 3,4 -d - | sort -u |tee /dev/fd/3 | { sleep 10 ; head -1 ; } ) 3> >( tail -1 ) | tr '\n' '\t'; echo; done >> dates.tsv |