Mercurial > hg > cc > cirrus_work
comparison bin/getcc.aws @ 157:463fc7b09119
convert to single thread,
use aws settings to improve performance when throttling is bad
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 24 Oct 2023 14:34:58 +0100 |
parents | 56825fc8459d |
children | ebff60e85c59 |
comparison
equal
deleted
inserted
replaced
156:adb1e22ad708 | 157:463fc7b09119 |
---|---|
1 # courtesy wwaites | 1 # courtesy wwaites |
2 # Usage: getcc.aws [archive, e.g. CC-MAIN-2019-35] [file listing segment numbers] | 2 # Usage: getcc.aws [archive, e.g. CC-MAIN-2019-35] [file listing segment numbers] |
3 ARCHIVE="$1" | 3 ARCHIVE="$1" |
4 shift | 4 shift |
5 if [ "$1" ] | 5 if [ "$1" = -w ] |
6 then | 6 then |
| | 7 shift |
7 wait="; sleep $1" | 8 wait="; sleep $1" |
8 shift | 9 shift |
9 fi | 10 fi |
10 SEGS="${1-all_segments}" | 11 SEGS="${1-all_segments}" |
11 | 12 |
19 if [ ! -f $wf ] | 20 if [ ! -f $wf ] |
20 then | 21 then |
21 curl --retry 4 -s ${WARCS} | gzip -dc > $wf | 22 curl --retry 4 -s ${WARCS} | gzip -dc > $wf |
22 fi | 23 fi |
23 | 24 |
24 if [ ! -f all_segments ] | 25 if [ ! -f $SEGS ] |
25 then | 26 then |
26 cut -f 4 -d / $wf |uniq > all_segments | 27 cut -f 4 -d / $wf |uniq > all_segments |
27 fi | 28 fi |
28 | 29 |
29 if [ ! -s "$(ls segment_* | head -1)" ] | 30 if [ ! -s "$(ls segment_* | head -1)" ] |
30 then | 31 then |
31 n=$(cat $SEGS | wc -l) | 32 n=$(cat $SEGS | wc -l) |
32 m=$((n / 8)) | 33 m=$((n / 8)) |
33 split -n l/$m $SEGS segment_ | 34 split -n l/$m $SEGS segment_ |
34 fi | 35 fi |
35 | 36 |
36 export AWS_ACCESS_KEY_ID="AKIAIKBLNO2XNVMWM5JA" | 37 #export AWS_ACCESS_KEY_ID="AKIAIKBLNO2XNVMWM5JA" |
37 export AWS_SECRET_ACCESS_KEY="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" | 38 #export AWS_SECRET_ACCESS_KEY="WaH4SAhsrqqJ/GLo/jkw+u9ER1ny05e1W45sSgYu" |
38 export PASSPHRASE="annebooththompson" | 39 #export PASSPHRASE="annebooththompson" |
39 | 40 #export AWS_RETRY_MODE=adaptive |
40 mkdir -p cdx/warc | 41 #export AWS_MAX_ATTEMPTS=100 |
41 cd cdx/warc | 42 # Set these in ~/.aws/credentials as follows |
42 cat ../../$cf|\ | 43 # [hst] |
44 # aws_access_key_id = AKIAIKBLNO2XNVMWM5JA | |
45 # aws_secret_access_key = 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 | |
| | 46 # And these in ~/.aws/config as follows |
| | 47 # [profile hst] |
| | 48 # retry_mode = adaptive |
| | 49 # max_attempts = 100 |
| | 50 # s3 = |
| | 51 # multipart_threshold = 4GB |
| | 52 # max_concurrent_requests = 1 |
| | 53 # multipart_chunksize = 32MB |
43 | 54 |
44 for sf in segment_* | 55 for sf in segment_* |
45 do | 56 do |
46 for s in $(cat $sf) | 57 for s in $(cat $sf) |
47 do | 58 do |
48 mkdir -p $s | 59 fgrep -w $s $wf |\ |
49 fgrep -w $s $wf | while read c; do echo "$s$'\t'${c##*/}$'\t'$c"; done |\ | 60 while read f |
50 parallel --colsep '\t' --will-cite -j 8 \ | 61 do |
51 "curl -sSo '{1}/{2}' aws s3 cp s3://commoncrawl/'{3}' 2> >( { echo \$(date +%D:%T) '{3}' ; cat ; } >>errlog)" | 62 g=$s/orig/warc/${f##*/} |
| | 63 if [ ! -f "$g" ] |
| | 64 then |
| | 65 aws s3 cp s3://commoncrawl/$f $g $(cat debug) --only-show-errors 2> >( { echo $(date +%D:%T) $f ; cat ; } >>errlog ) |
| | 66 fi |
| | 67 done |
52 done | 68 done |
53 done | 69 done |