Mercurial > hg > cc > work
annotate lurid3/notes.txt @ 45:737c61f98cbf
foo
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 26 Sep 2024 17:47:58 +0100 |
parents | 7209df5fa5b4 |
children | 49672e9b4c1c |
rev | line source |
---|---|
40 | 1 See old_notes.txt for all older notes on Common Crawl data processing, |
2 starting from Azure via Turing and then LURID and LURID2. | |
3 | |
4 Installed /beegfs/common_crawl/CC-MAIN-2024-33/cdx | |
5 >: cd results/CC-MAIN-2024-33/cdx/ | |
6 >: cut -f 2 counts.tsv | btot | |
7 2,793,986,828 | |
8 | |
9 State of play wrt data -- see status.xlsx | |
10 | |
41
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
11 [in trying to tabulate the date ranges of the crawls, I found that the |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
12 WARC timestamp is sometimes bogus: |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
13 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
14 >: fgrep ' 2009' CC-MAIN-2018-34/cdx/cluster.idx |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
15 net,tyredeyes)/robots.txt 20090201191318 cdx-00230.gz 160573468 198277 920675 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
16 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
17 >: zgrep '^net,tyredeyes)/robots.txt' CC-MAIN-2018-34/cdx/warc/cdx-00230.gz |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
18 net,tyredeyes)/robots.txt 20090201191318 {"url": "http://tyredeyes.net/robots.txt", "mime": "text/html", "mime-detected": "text/html", "status": "301", "digest": "QH732FYSV7UM34JYWVYMB7EZGR2CYM6B", "length": "582", "offset": "1224614", "filename": "crawl-data/CC-MAIN-2018-34/segments/1534221215075.58/robotstxt/CC-MAIN-20180819090604-20180819110604-00558.warc.gz"} |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
19 net,tyredeyes)/robots.txt 20090201191319 {"url": "http://www.tyredeyes.net/robots.txt", "mime": "text/plain", "mime-detected": "text/plain", "status": "200", "digest": "PSX5IZU4B4SIXGNDKXCVFH75Q27VHUTJ", "length": "549", "offset": "2069841", "filename": "crawl-data/CC-MAIN-2018-34/segments/1534221215075.58/robotstxt/CC-MAIN-20180819090604-20180819110604-00485.warc.gz"} |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
20 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
21 This happens in 2019-35 as well :-( |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
22 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
23 >: fgrep ' 20181023' CC-MAIN-2019-35/cdx/cluster.idx |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
24 com,gyshbsh)/robots.txt 20181023022000 cdx-00078.gz 356340085 162332 315406 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
25 >: zgrep ' 20181023' CC-MAIN-2019-35/cdx/warc/cdx-00078.gz |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
26 com,gyshbsh)/robots.txt 20181023022000 {"url": "http://gyshbsh.com/robots.txt", "mime": "text/html", "mime-detected": "text/html", "status": "301", "digest": "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ", "length": "529", "offset": "638892", "filename": "crawl-data/CC-MAIN-2019-35/segments/1566027315618.73/robotstxt/CC-MAIN-20190820200701-20190820222701-00120.warc.gz"} |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
27 ... |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
28 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
29 Tabulate all the date ranges for the WARC files we have |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
30 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
31 >: for d in {2017-30,2019-35,2020-34,2021-25,2023-40,2023-50}; do printf "%s\t" $d; (ls CC-MAIN-$d/*.{?,??}/orig/warc | fgrep .gz | cut -f 3,4 -d - | sort -u |tee /dev/fd/3 | head -1 ) 3> >( tail -1 ) | tr '\n' '\t'; echo; done | cut -f 1,2,4 -d - | sed 's/-20/ 20/;s/.$//' | tr ' ' '\t' > dates.tsv |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
32 >: for d in {2018-30,2018-34}; do printf "%s\t" $d; (ls CC-MAIN-$d/{*.?,*.??} | fgrep warc.gz | cut -f 3,4 -d - | sort -u |tee /dev/fd/3 | { sleep 10 ; head -1 ; } ) 3> >( tail -1 ) | tr '\n' '\t'; echo; done >> dates.tsv |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
33 >: for d in 2019-18; do printf "%s\t" $d; (ls CC-MAIN-$d/{*.?,*.??} | fgrep warc.gz | cut -f 3,4 -d - | sort -u | head -1); done |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
34 2019-18 20190418101243-20190418122248 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
35 >: for d in 2019-18; do printf "%s\t" $d; (ls CC-MAIN-$d/{*.?,*.??} | fgrep warc.gz | cut -f 3,4 -d - | sort -u | tail -1); done |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
36 2019-18 20190426153423-20190426175423 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
37 >: echo 2019-18 20190418101243-20190418122248 20190426153423-20190426175423 >> dates.tsv |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
38 >: pwd |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
39 /beegfs/common_crawl/CC-MAIN-2016-30/cdx/warc |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
40 >: echo {000..299} | tr ' ' '\n' | parallel -j 10 'uz cdx-00{}.gz | cut -f 2 -d " " | sort -u > /tmp/hst/{}' |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
41 >: sort -mu /tmp/hst/??? > /tmp/hst/all |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
42 >: wc -l /tmp/hst/all |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
43 679686 /tmp/hst/all |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
44 >: head -1 /tmp/hst/all |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
45 20160723090435 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
46 >: tail -1 /tmp/hst/all |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
47 20160731110639 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
48 >: cd ../../.. |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
49 >: echo 2016-30 20160723090435 20160731110639 >> dates.tsv |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
50 tweaked and sorted in xemacs: |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
51 2016-30 20160723090435 20160731110639 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
52 2017-30 20170720121902 20170729132938 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
53 2018-30 20180715183800 20180723184955 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
54 2018-34 20180814062251 20180822085454 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
55 2019-18 20190418101243 20190426175423 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
56 2019-35 20190817102624 20190826111356 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
57 2020-34 20200803083123 20200815214756 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
58 2021-25 20210612103920 20210625145905 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
59 2023-40 20230921073711 20231005042006 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
60 2023-50 20231128083443 20231212000408 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
61 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
62 Added to status.xlsx in shortened form, with number of days |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
63 8 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
64 9 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
65 8 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
66 8 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
67 8 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
68 9 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
69 12 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
70 13 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
71 15 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
72 15 |
64b7fb44e8dc
extract actual date info for WARC crawls
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
73 |
42
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
74 Fill a gap by downloading 2022-33 |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
75 |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
76 >: for s in 0; do ~/bin/getcc_multi.aws CC-MAIN-2022-33 $s 5; done > /tmp/hst/get_22-33_0.log & |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
77 130 minutes... |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
78 >: for s in 1; do ~/bin/getcc_multi.aws CC-MAIN-2022-33 $s 10; done > /tmp/hst/get_22-33_1.log & |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
79 59 minutes |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
80 |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
81 Another day to get to a quarter? |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
82 >: for s in {2..23}; do ~/bin/getcc_multi.aws CC-MAIN-2022-33 $s 10; done > /tmp/hst/get_22-33_2-23.log & |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
83 |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
84 |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
85 And finally 2015-35 |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
86 Fetched in just 2 chunks, 0-9 and 10-99, e.g. |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
87 >: for s in {10..99}; do ~/bin/getcc_multi.aws CC-MAIN-2015-35 $s 10; done > /tmp/hst/get_15-35_10-99.log & |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
88 |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
89 Much smaller. |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
90 Compare 2023-40, with 900 files per segment: |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
91 >: lss */orig/warc/*-0023?.* | cut -f 5 -d ' ' | stats |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
92 n = 1000 |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
93 min = 1.14775e+09 |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
94 max = 1.26702e+09 |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
95 sum = 1.20192e+12 |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
96 mean = 1.20192e+09 |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
97 sd = 2.26049e+07 |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
98 |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
99 with 2015-35, with 353 files per segment |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
100 >: lss */orig/warc/*-0023?-* | cut -f 5 -d ' ' | stats |
43 | 101 n = 1000 |
102 min = 1.66471e+08 | |
42
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
103 max = 9.6322e+08 |
43 | 104 sum = 9.19222e+11 |
105 mean = 9.19222e+08 | |
106 sd = 8.20542e+07 | |
42
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
107 |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
108 The min files all come from segment 1440644060633.7, whose files are |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
109 _all_ small: |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
110 >: uz *00123-*.gz | wc -l |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
111 12,759,931 |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
112 Compare to 1440644060103.8 |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
113 >: zcat *00123-*.gz | wc -l |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
114 75,806,738 |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
115 Mystery |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
116 |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
117 Also faster |
43 | 118 Compare 2022-33: |
42
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
119 >: fgrep -h BST /tmp/hst/get_22-33_{2-23,24-49,50-99}.log | cut -f 1-7 -d ' ' | while read s; do if read e; then echo $((($(date --date="$e" +%s) - $(date --date="$s" +%s)) / 60)); fi; done | stats n min max mean sd |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
120 98 19 256 75.1 25.2 |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
121 with 2015-35: |
0c472ae05f71
nearly finished downloading for now
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
41
diff
changeset
|
122 >: fgrep -h BST /tmp/hst/get_15-35_{0-9,10-99}.log | cut -f 1-7 -d ' ' | while read s; do if read e; then echo $((($(date --date="$e" +%s) - $(date --date="$s" +%s)) / 60)); fi; done | stats n min max mean sd |
43 | 123 100 15 40 32.6 2.9 |
124 | |
125 >: echo {000..299} | tr ' ' '\n' | parallel -j 10 'uz cdx-00{}.gz | cut -f 2 -d " " | sort -u > /tmp/hst/2015_{}' & | |
126 >: sort --parallel=10 -mu /tmp/hst/2015_??? > /tmp/hst/2015_all | |
127 >: head -1 /tmp/hst/2015_all | |
128 20150827191534 | |
129 >: tail -1 /tmp/hst/2015_all | |
130 20150905180914 | |
131 >: wc -l /tmp/hst/2015_all | |
132 698128 /tmp/hst/2015_all | |
133 | |
134 What about wet files -- do they include text from pdfs? What about | |
135 truncated pdfs? | |
136 | |
137 >: time for s in 0; do ~/bin/getcc_wet_multi.aws CC-MAIN-2019-35 $s 10; done > /tmp/hst/get_wet_19-35_0.log & | |
138 real 26m3.049s | |
139 user 0m1.225s | |
140 sys 0m1.310s | |
141 | |
142 In the segment 0 cdx file (!) we find 3747 probable truncations: | |
143 >: zgrep -a '"mime-detected": "application/pdf", ' cdx.gz > /tmp/hst/2019-35_seg0_pdf.idx | |
144 >: wc -l /tmp/hst/2019-35_seg0_pdf.idx | |
145 42345 /tmp/hst/2019-35_seg0_pdf.idx | |
146 >: egrep -a '"length": "10....."' /tmp/hst/2019-35_seg0_pdf.idx > /tmp/hst/2019-35_seg0_long_pdf.idx & | |
147 >: wc -l < /tmp/hst/2019-35_seg0_long_pdf.idx | |
148 3747 | |
149 Of the 42345 PDF entries, 70 are in file 0: | |
150 >: egrep -a '.-00000\.' /tmp/hst/2019-35_seg0_pdf.idx > /tmp/hst/2019-35_seg0_file0_pdf.idx | |
151 >: wc -l /tmp/hst/2019-35_seg0_file0_pdf.idx | |
152 70 /tmp/hst/2019-35_seg0_file0_pdf.idx | |
153 | |
154 In segment 0 file 0 we find 70 application/pdf Content-Type headers: | |
155 >: ix.py -h -w -x </tmp/hst/2019-35_seg0_file0_pdf.idx |egrep '^(WARC-Target-URI:|Content-Length:) '|cut -f 2 -d ' ' |tr -d '\r'|while read l1; do read uri; read l2; printf '%s\t%s\t%s\n' $l1 $l2 "$uri"; done > ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv | |
156 >: wc -l < ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv | |
157 70 | |
158 >: head -3 ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv | |
159 | |
160 | |
161 Of which 14 are truncated: | |
162 >: fgrep -c 1048576 ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv | |
163 14 | |
164 | |
165 E.g. | |
166 >: fgrep 1048576 ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv | head -3 | |
167 1049051 1048576 https://en.heks.ch/sites/default/files/documents/2017-09/HEKS_EPER_Mission_Statement_2016_e.pdf | |
168 1049469 1048576 https://bmcmicrobiol.biomedcentral.com/track/pdf/10.1186/s12866-017-0951-4 | |
169 1048824 1048576 https://citydocs.fcgov.com/?action=cao-cases&cmd=convert&docid=3332339 | |
170 | |
171 Are any of the pdfs in the corresponding wet file? | |
172 | |
173 Yes, 2: | |
174 >: cut -f 3 ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv | fgrep -af - <(uz 1566027313501.0/orig/wet/*-00000.warc.wet.gz) | |
175 WARC-Target-URI: http://bdds.deux-sevres.com/recherche/simple/Editeur/2/Belfond/vignette?format=pdf | |
176 WARC-Target-URI: https://museum.wrap.gov.tw/GetFile4.ashx?Serial=201609200919D00 | |
177 | |
178 Is it in fact corresponding? | |
179 >: diff -bw <(uz 1566027313501.0/orig/warc/*-00000.warc.gz | egrep -a '^WARC-Target-URI: ' | uniq | head -1000) <(uz 1566027313501.0/orig/wet/*-00000.warc.wet.gz | egrep -a '^WARC-Target-URI: ' | head -1000)|egrep -c '^<' | |
180 19 | |
181 | |
182 So, yes, mostly. About 2% (19 of the first 1000) are missing | |
183 | |
184 Just checking the search: | |
185 >: cut -f 3 ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv | fgrep -af - <(uz 1566027313501.0/orig/warc/*-00000.warc.gz) | wc -l | |
186 210 | |
187 Correct | |
188 | |
189 So, what pdfs make it into the WET: | |
190 >: cut -f 3 ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv | fgrep -af - <(uz 1566027313501.0/orig/wet/*-00000.warc.wet.gz) > ~/results/CC-MAIN-2019-35/s0_file0_pdf.txt | |
191 >: wc -l < ~/results/CC-MAIN-2019-35/s0_file0_pdf.txt | |
192 2 | |
193 >: cut -f 2 -d ' ' ~/results/CC-MAIN-2019-35/s0_file0_pdf.txt | tr -d '\r' | fgrep -f - ~/results/CC-MAIN-2019-35/seg0_file0_lengths.tsv | |
194 11588 10913 http://bdds.deux-sevres.com/recherche/simple/Editeur/2/Belfond/vignette?format=pdf | |
195 1048979 1048576 https://museum.wrap.gov.tw/GetFile4.ashx?Serial=201609200919D005 | |
196 | |
197 Here's the short one: | |
198 WARC/1.0 | |
199 WARC-Type: response | |
200 WARC-Date: 2019-08-17T22:40:17Z | |
201 WARC-Record-ID: <urn:uuid:ea98167b-c42a-4233-b57e-994aa627e38a> | |
202 Content-Length: 11588 | |
203 Content-Type: application/http; msgtype=response | |
204 WARC-Warcinfo-ID: <urn:uuid:f689f8d0-24f3-4824-9a38-4f3fee422a4e> | |
205 WARC-Concurrent-To: <urn:uuid:2d51c956-0012-4d78-affc-8f57fe9d2e15> | |
206 WARC-IP-Address: 92.175.114.24 | |
207 WARC-Target-URI: http://bdds.deux-sevres.com/recherche/simple/Editeur/2/Belfond/vignette?format=pdf | |
208 WARC-Payload-Digest: sha1:7VVIUDQ4Q6XKNOAURYU4VTMRSZNPHDQA | |
209 WARC-Block-Digest: sha1:OSTWXLV772XNHS22T4UBSCSJAAXM2J6T | |
210 WARC-Identified-Payload-Type: application/pdf | |
211 | |
212 HTTP/1.1 200 OK | |
213 Cache-Control: must-revalidate, post-check=0, pre-check=0,no-cache | |
214 Pragma: public,no-cache | |
215 Content-Type: application/pdf",text/html; charset=utf-8 | |
216 X-Crawler-Content-Encoding: gzip | |
217 Expires: 0 | |
218 Server: | |
219 X-Powered-By: | |
220 Set-Cookie: 166d74d734106ba68b20ea303011f622=301619e3fe31ecb98c8473f0ff5f35a2; path=/ | |
221 Content-Disposition: attachment; filename="Mdiathque dpartementale des Deux-Svres - Rsultats de la recherche Belfond.pdf" | |
222 Content-Transfer-Encoding: binary | |
223 P3P: CP="NOI ADM DEV PSAi COM NAV OUR OTRo STP IND DEM" | |
224 X-Content-Encoded-By: | |
225 X-Powered-By: | |
226 Date: Sat, 17 Aug 2019 22:40:16 GMT | |
227 X-Crawler-Content-Length: 5448 | |
228 Content-Length: 10913 | |
229 | |
230 %PDF-1.7 | |
231 %<E2><E3><CF><D3> | |
232 7 0 obj | |
233 << /Type /Page /Parent 1 0 R /LastModified (D:20190818004016+02'00') /Resources 2 | |
234 0 R /MediaBox [0.000000 0.000000 595.276000 841.890000] /CropBox [0.000000 0.000 | |
235 000 595.276000 841.890000] /BleedBox [0.000000 0.000000 595.276000 841.890000] /T | |
236 rimBox [0.000000 0.000000 595.276000 841.890000] /ArtBox [0.000000 0.000000 595.2 | |
237 76000 841.890000] /Contents 8 0 R /Rotate 0 /Group << /Type /Group /S /Transparen | |
238 cy /CS /DeviceRGB >> /PZ 1 >> | |
239 endobj | |
240 8 0 obj | |
241 | |
242 >: uz 1566027313501.0/orig/warc/*-00000.warc.gz|tail -n +1823434 | tail -n +24 | head -c 20000 > ~/results/CC-MAIN-2019-35/mediatheque.pdf | |
243 >: ps2ascii mediatheque.pdf | |
244 Médiathèque départementale des Deux-Sèvres - Résultats de la recherche Belfond | |
245 | |
246 Médiathèque départementale des Deux-Sèvres - Résultats de | |
247 la recherche Belfond | |
248 A charge de revanche | |
249 Titre : | |
250 Auteur : Grippando, James (1958-....) | |
251 ... | |
252 etc., three pages, no errors | |
253 | |
254 >: uz 1566027313501.0/orig/warc/*-00000.warc.gz|fgrep -an https://museum.wrap.gov.tw/GetFile4.ashx | |
255 38896837:WARC-Target-URI: https://museum.wrap.gov.tw/GetFile4.ashx?Serial=201609200919D005 | |
256 38896858:WARC-Target-URI: https://museum.wrap.gov.tw/GetFile4.ashx?Serial=201609200919D005 | |
257 38904590:WARC-Target-URI: https://museum.wrap.gov.tw/GetFile4.ashx?Serial=201609200919D005 | |
258 >: uz 1566027313501.0/orig/warc/*-00000.warc.gz|tail -n +38896858 | egrep -an '^%%EOF' | |
259 27:%%EOF | |
260 1114658:%%EOF | |
261 1313299:%%EOF | |
262 | |
263 Hunh? | |
264 | |
265 >: uz 1566027313501.0/orig/warc/*-00000.warc.gz|tail -n +38896858 | egrep -an '^(%%EOF|WARC)' | head -30 | |
266 1:WARC-Target-URI: https://museum.wrap.gov.tw/GetFile4.ashx?Serial=201609200919D005 | |
267 2:WARC-Payload-Digest: sha1:SZ53DQQHENC7DDN7GQ5IS7VMEPAXAMBE | |
268 3:WARC-Block-Digest: sha1:QTKJA6A7445Z7264K2YAFBUUM2OYH2T2 | |
269 4:WARC-Truncated: length | |
270 5:WARC-Identified-Payload-Type: application/pdf | |
271 27:%%EOF | |
272 7725:WARC/1.0 | |
273 7726:WARC-Type: metadata | |
274 7727:WARC-Date: 2019-08-17T22:59:14Z | |
275 7728:WARC-Record-ID: <urn:uuid:77df2747-e567-45d3-8646-3069ae9a9f25> | |
276 7731:WARC-Warcinfo-ID: <urn:uuid:f689f8d0-24f3-4824-9a38-4f3fee422a4e> | |
277 7732:WARC-Concurrent-To: <urn:uuid:eceb4adc-d81e-4497-82fe-eea61ce171f4> | |
278 7733:WARC-Target-URI: https://museum.wrap.gov.tw/GetFile4.ashx?Serial=201609200919D005 | |
279 7739:WARC/1.0 | |
280 | |
281 OK, so indeed truncated after 7700 lines or so... | |
282 >: uz 1566027313501.0/orig/warc/*-00000.warc.gz|tail -n +38896858 | tail -n +21 | head -c 1048576 > ~/results/CC-MAIN-2019-35/museum.pdf | |
283 >: ps2ascii ~/results/CC-MAIN-2019-35/museum.pdf | |
284 **** Error: An error occurred while reading an XREF table. | |
285 **** The file has been damaged. | |
286 Look in big_pdf? | |
44
7209df5fa5b4
turn attention to nutch-cc and its Cdx code
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
43
diff
changeset
|
287 |
7209df5fa5b4
turn attention to nutch-cc and its Cdx code
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
43
diff
changeset
|
288 ====Modify the original CC indexer to write new indices including lastmod===== |
7209df5fa5b4
turn attention to nutch-cc and its Cdx code
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
43
diff
changeset
|
289 Looks like WarcRecordWriter.write, in |
7209df5fa5b4
turn attention to nutch-cc and its Cdx code
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
43
diff
changeset
|
290 src/nutch-cc/src/java/org/commoncrawl/util/WarcRecordWriter, is what |
7209df5fa5b4
turn attention to nutch-cc and its Cdx code
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
43
diff
changeset
|
291 needs to be edited to include LastModified date |
7209df5fa5b4
turn attention to nutch-cc and its Cdx code
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
43
diff
changeset
|
292 |
7209df5fa5b4
turn attention to nutch-cc and its Cdx code
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
43
diff
changeset
|
293 To rebuild nutch-cc, particularly to recompile jar files after editing |
7209df5fa5b4
turn attention to nutch-cc and its Cdx code
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
43
diff
changeset
|
294 anything |
7209df5fa5b4
turn attention to nutch-cc and its Cdx code
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
43
diff
changeset
|
295 |
7209df5fa5b4
turn attention to nutch-cc and its Cdx code
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
43
diff
changeset
|
296 >: cd $HHOME/src/nutch-cc |
7209df5fa5b4
turn attention to nutch-cc and its Cdx code
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
43
diff
changeset
|
297 >: ant |
7209df5fa5b4
turn attention to nutch-cc and its Cdx code
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
43
diff
changeset
|
298 |
7209df5fa5b4
turn attention to nutch-cc and its Cdx code
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
43
diff
changeset
|
299 Fixed deprecation bug in WarcCdxWriter.java |
7209df5fa5b4
turn attention to nutch-cc and its Cdx code
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
43
diff
changeset
|
300 |
7209df5fa5b4
turn attention to nutch-cc and its Cdx code
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
43
diff
changeset
|
301 Modified src/java/org/commoncrawl/util/WarcCdxWriter.java |
7209df5fa5b4
turn attention to nutch-cc and its Cdx code
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
43
diff
changeset
|
302 to include lastmod |
7209df5fa5b4
turn attention to nutch-cc and its Cdx code
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
43
diff
changeset
|
303 |
7209df5fa5b4
turn attention to nutch-cc and its Cdx code
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
43
diff
changeset
|
304 Can run just one test, which should allow testing this: |
7209df5fa5b4
turn attention to nutch-cc and its Cdx code
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
43
diff
changeset
|
305 |
7209df5fa5b4
turn attention to nutch-cc and its Cdx code
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
43
diff
changeset
|
306 >: ant test-core -Dtestcase='TestWarcRecordWriter' |
45 | 307 |
308 Logic is tricky, and there's no easy way in | |
309 | |
310 Basically, tools/WarcExport.java launches a hadoop job based on a | |
311 hadoop-runnable WarcExport instance. Hadoop will in due course call | |
312 ExportReducer.reduce, which will create an instance of WarcCapture | |
313 "for each page capture", and call ExportMapper.context.write with that instance (via | |
314 some configuration magic with the hadoop job Context). That in turn | |
315 uses (more magic) WarcOutputFormat.getRecordWriter, which | |
316 (finally!) calls a previously created WarcRecordWriter | |
317 instance.write(the capture). | |
318 | |
319 So to fake a test case, I need to build | |
320 1) a WarcRecordWriter instance | |
321 2) a WarcCapture instance | |
322 and then invoke 1.write(2) | |
323 | |
324 Got that working, although still can't figure out where in the normal | |
325 flow the metadata entry for Response.CONTENT_TYPE gets set. | |
326 | |
327 Now, add a test that takes a stream of WARC Response extracts and | |
328 rewrites their index entries | |
329 | |
330 >: head -8804 <(uz /beegfs/common_crawl/CC-MAIN-2019-35/cdx/warc/cdx-00150.gz)|tail -10| ix.py -h -w -x > /tmp/hst/headers.txt | |
331 >: cp /tmp/hst/headers.txt src/test/org/commoncrawl/util/ | |
332 >: shuf /tmp/hst/headers.txt > src/test/org/commoncrawl/util/headers_mixed.txt | |
333 | |
334 Won't quite work :-( | |
335 How do we reconstruct the Warc filename, offset and length from the | |
336 original index? | |
337 | |
338 Well, we can find the .warc.gz records! | |
339 Thanks to https://stackoverflow.com/a/37042747/2595465 | |
340 | |
341 >: ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz > /tmp/hst/recs.txt | |
342 | |
343 Nearly working, got 1/3rd of the way through a single WARC and then failed: | |
344 | |
345 >: n=0 && ~/lib/python/unpackz.py /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz 2>/tmp/hst/tt.txt|while read o l; do echo $((n+=1)); echo $o $l >> /tmp/hst/r3a; ix.py $l $o CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz| wc -l; done | |
346 ... | |
347 20 | |
348 10215 | |
349 CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz | |
350 Process fail: Compressed file ended before the end-of-stream marker was reached, input: | |
351 length=10762, offset=60784640, file=/CC-MAIN-2019-35/1566027314638.49/orig/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz | |
352 | |
353 >: head -10217 /tmp/hst/r3a | tail -4 | |
354 60784173 467 | |
355 60784640 10762 | |
356 60795402 463 | |
357 60795865 460 | |
358 >: ix.py 467 60784173 CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz|fgrep Target | |
359 WARC-Target-URI: http://drycarerestoration.co/corner-furniture-piece/unique-corner-decoration-pieces-or-corner-furniture-pieces-corner-corner-furniture-piece-corner-furniture-pieces-bedroom/ | |
360 | |
361 >: zcat /beegfs/common_crawl/CC-MAIN-2019-35/1566027314638.49/orig/cdx/warc/cdx.gz | |
362 ... | |
363 co,drycarerestoration)/corner-furniture-piece/unique-corner-decoration-pieces-or-corner-furniture-pieces-corner-corner-furniture-piece-corner-furniture-pieces-bedroom 20190819020224 {"url": "http://drycarerestoration.co/corner-furniture-piece/unique-corner-decoration-pieces-or-corner-furniture-pieces-corner-corner-furniture-piece-corner-furniture-pieces-bedroom/", "mime": "text/html", "mime-detected": "text/html", "status": "200", "digest": "DTKGJL45XQDXUS7PTXPYR6POMPLG46RZ", "length": "2570", "offset": "60784640", "filename": "crawl-data/CC-MAIN-2019-35/segments/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz", "charset": "UTF-8", "languages": "eng"} | |
364 >: ix.py 2570 60784640 CC-MAIN-2019-35/1566027314638.49/warc/CC-MAIN-20190819011034-20190819033034-00558.warc.gz|less | |
365 >: echo $((10762 - 2570)) | |
366 8192 | |
367 | |
368 Ah, the error I was dreading :-( I _think_ this happens when an | |
369 individual record ends exactly on an 8K boundary. | |
370 | |
371 Yes: | |
372 | |
373 >: echo $((60784640 % 8192)) | |
374 0 | |
375 |