Mercurial > hg > cc > cirrus_work
annotate bin/getcc.aws @ 223:36610ddfbc7a
replaced mean_lens by w or wo bogon
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 28 Feb 2024 14:49:45 +0000 |
parents | afd2ece1fb22 |
children |
rev | line source |
---|---|
155
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 # courtesy wwaites |
159
ebff60e85c59
now does one named segment only
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
157
diff
changeset
|
2 # Usage: getcc.aws [archive, e.g. CC-MAIN-2019-35] [segment ID, e.g. 68] |
ebff60e85c59
now does one named segment only
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
157
diff
changeset
|
3 # Single segment, single thread |
155
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 ARCHIVE="${1:?usage: getcc.aws ARCHIVE SEGMENT (e.g. CC-MAIN-2019-35 68)}" # abort early if the archive name is missing |
159
ebff60e85c59
now does one named segment only
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
157
diff
changeset
|
5 SEG="${2:?usage: getcc.aws ARCHIVE SEGMENT (e.g. CC-MAIN-2019-35 68)}" # abort early if the segment ID is missing |
155
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
7 wf=warc.paths |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
8 |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
9 WARCS=https://data.commoncrawl.org/crawl-data/${ARCHIVE}/${wf}.gz |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
10 |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
11 mkdir -p /beegfs/common_crawl/${ARCHIVE} |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
12 cd "/beegfs/common_crawl/${ARCHIVE}" || exit 1 # never download into whatever directory we happened to start in |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
13 |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
14 if [ ! -f $wf ] |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
15 then |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
16 curl --retry 4 -s ${WARCS} | gzip -dc > $wf |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
17 fi |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
18 |
157
463fc7b09119
convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
155
diff
changeset
|
19 #export AWS_ACCESS_KEY_ID="<your-access-key-id>" |
463fc7b09119
convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
155
diff
changeset
|
20 #export AWS_SECRET_ACCESS_KEY="<your-secret-access-key>" |
463fc7b09119
convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
155
diff
changeset
|
21 #export PASSPHRASE="<your-passphrase>" |
463fc7b09119
convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
155
diff
changeset
|
22 #export AWS_RETRY_MODE=adaptive |
463fc7b09119
convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
155
diff
changeset
|
23 #export AWS_MAX_ATTEMPTS=100 |
463fc7b09119
convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
155
diff
changeset
|
24 # Set these in ~/.aws/credentials as follows |
463fc7b09119
convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
155
diff
changeset
|
25 # [hst] |
463fc7b09119
convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
155
diff
changeset
|
26 # aws_access_key_id = <your-access-key-id> |
463fc7b09119
convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
155
diff
changeset
|
27 # aws_secret_access_key = <your-secret-access-key> |
463fc7b09119
convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
155
diff
changeset
|
28 # And these in ~/.aws/config as follows |
463fc7b09119
convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
155
diff
changeset
|
29 # [profile hst] |
463fc7b09119
convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
155
diff
changeset
|
30 # retry_mode = adaptive |
463fc7b09119
convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
155
diff
changeset
|
31 # max_attempts = 100 |
463fc7b09119
convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
155
diff
changeset
|
32 # s3 = |
463fc7b09119
convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
155
diff
changeset
|
33 # multipart_threshold = 4GB |
463fc7b09119
convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
155
diff
changeset
|
34 # max_concurrent_requests = 1 |
463fc7b09119
convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
155
diff
changeset
|
35 # multipart_chunksize = 32MB |
155
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
36 |
159
ebff60e85c59
now does one named segment only
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
157
diff
changeset
|
37 s=$(grep -Eow "[0-9]*\.$SEG" "$wf" | head -1); [ -n "$s" ] || { echo "getcc.aws: no segment matching $SEG in $wf" >&2; exit 1; } # stop if the segment is unknown instead of running with an empty name |
160 | 38 mkdir -p $s/orig/warc |
159
ebff60e85c59
now does one named segment only
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
157
diff
changeset
|
39 grep -Fw -- "$s" "$wf" |\ |
ebff60e85c59
now does one named segment only
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
157
diff
changeset
|
40 while IFS= read -r f |
155
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
41 do |
159
ebff60e85c59
now does one named segment only
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
157
diff
changeset
|
42 g=$s/orig/warc/${f##*/} |
ebff60e85c59
now does one named segment only
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
157
diff
changeset
|
43 if [ ! -f "$g" ] |
ebff60e85c59
now does one named segment only
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
157
diff
changeset
|
44 then |
ebff60e85c59
now does one named segment only
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
157
diff
changeset
|
45 aws s3 cp "s3://commoncrawl/$f" "$g" $(cat debug) --only-show-errors 2> >( { echo "$(date +%D:%T) $f" ; cat ; } >>"errlog_$SEG" ) |
ebff60e85c59
now does one named segment only
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
157
diff
changeset
|
46 fi |
155
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
47 done |