comparison bin/getcc.aws @ 157:463fc7b09119

convert to single thread; use AWS settings to improve performance when throttling is bad
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 24 Oct 2023 14:34:58 +0100
parents 56825fc8459d
children ebff60e85c59
--- bin/getcc.aws    156:adb1e22ad708
+++ bin/getcc.aws    157:463fc7b09119
@@ -1,11 +1,12 @@
 # courtesy wwaites
 # Usage: getcc.aws [archive, e.g. CC-MAIN-2019-35] [file listing segment numbers]
 ARCHIVE="$1"
 shift
-if [ "$1" ]
+if [ "$1" = -w ]
 then
+shift
 wait="; sleep $1"
 shift
 fi
 SEGS="${1-all_segments}"
 
@@ -19,11 +20,11 @@
 if [ ! -f $wf ]
 then
 curl --retry 4 -s ${WARCS} | gzip -dc > $wf
 fi
 
-if [ ! -f all_segments ]
+if [ ! -f $SEGS ]
 then
 cut -f 4 -d / $wf |uniq > all_segments
 fi
 
 if [ ! -s "$(ls segment_* | head -1)" ]
@@ -31,23 +32,38 @@
 n=$(cat $SEGS | wc -l)
 m=$((n / 8))
 split -n l/$m $SEGS segment_
 fi
 
-export AWS_ACCESS_KEY_ID="AKIAIKBLNO2XNVMWM5JA"
-export AWS_SECRET_ACCESS_KEY="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"
-export PASSPHRASE="annebooththompson"
-
-mkdir -p cdx/warc
-cd cdx/warc
-cat ../../$cf|\
+#export AWS_ACCESS_KEY_ID="AKIAIKBLNO2XNVMWM5JA"
+#export AWS_SECRET_ACCESS_KEY="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"
+#export PASSPHRASE="annebooththompson"
+#export AWS_RETRY_MODE=adaptive
+#export AWS_MAX_ATTEMPTS=100
+# Set these in ~/.aws/credentials as follows
+# [hst]
+# aws_access_key_id = AKIAIKBLNO2XNVMWM5JA
+# aws_secret_access_key = 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
+# And these in ~/.aws/config as follows
+# [profile hst]
+# retry_mode = adaptive
+# max_attempts = 100
+# s3 =
+# multipart_threshold = 4GB
+# max_concurrent_requests = 1
+# multipart_chunksize = 32MB
 
 for sf in segment_*
 do
 for s in $(cat $sf)
 do
-mkdir -p $s
-fgrep -w $s $wf | while read c; do echo "$s$'\t'${c##*/}$'\t'$c"; done |\
-parallel --colsep '\t' --will-cite -j 8 \
-"curl -sSo '{1}/{2}' aws s3 cp s3://commoncrawl/'{3}' 2> >( { echo \$(date +%D:%T) '{3}' ; cat ; } >>errlog)"
+fgrep -w $s $wf |\
+while read f
+do
+g=$s/orig/warc/${f##*/}
+if [ ! -f "$g" ]
+then
+aws s3 cp s3://commoncrawl/$f $g $(cat debug) --only-show-errors 2> >( { echo $(date +%D:%T) $f ; cat ; } >>errlog )
+fi
+done
 done
 done
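
For reference, the revised argument parsing takes the archive name first, then an optional "-w <seconds>" pair, then an optional file listing segment numbers (defaulting to all_segments). An illustrative invocation, where my_segments is a hypothetical segment-list file, not one created by the script:

# fetch CC-MAIN-2019-35, requesting a sleep of 5 seconds via the new -w flag,
# with segment ids read from a hypothetical file my_segments
bin/getcc.aws CC-MAIN-2019-35 -w 5 my_segments

# fetch the same archive with no wait, using the default all_segments list
bin/getcc.aws CC-MAIN-2019-35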