Mercurial > hg > cc > cirrus_work
changeset 157:463fc7b09119
Convert downloading to a single thread;
use AWS CLI settings (adaptive retry mode, higher max attempts, one concurrent request) to improve performance under heavy throttling
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 24 Oct 2023 14:34:58 +0100 |
parents | adb1e22ad708 |
children | 5d1c3359e210 |
files | bin/getcc.aws |
diffstat | 1 files changed, 29 insertions(+), 13 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/getcc.aws Tue Oct 24 14:26:36 2023 +0100 +++ b/bin/getcc.aws Tue Oct 24 14:34:58 2023 +0100 @@ -2,8 +2,9 @@ # Usage: getcc.aws [archive, e.g. CC-MAIN-2019-35] [file listing segment numbers] ARCHIVE="$1" shift -if [ "$1" ] +if [ "$1" = -w ] then + shift wait="; sleep $1" shift fi @@ -21,7 +22,7 @@ curl --retry 4 -s ${WARCS} | gzip -dc > $wf fi -if [ ! -f all_segments ] +if [ ! -f $SEGS ] then cut -f 4 -d / $wf |uniq > all_segments fi @@ -33,21 +34,36 @@ split -n l/$m $SEGS segment_ fi -export AWS_ACCESS_KEY_ID="AKIAIKBLNO2XNVMWM5JA" -export AWS_SECRET_ACCESS_KEY="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" -export PASSPHRASE="annebooththompson" - -mkdir -p cdx/warc -cd cdx/warc -cat ../../$cf|\ +#export AWS_ACCESS_KEY_ID="AKIAIKBLNO2XNVMWM5JA" +#export AWS_SECRET_ACCESS_KEY="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" +#export PASSPHRASE="annebooththompson" +#export AWS_RETRY_MODE=adaptive +#export AWS_MAX_ATTEMPTS=100 +# Set these in ~/.aws/credentials as follows +# [hst] +# aws_access_key_id = AKIAIKBLNO2XNVMWM5JA +# aws_secret_access_key = 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 +# And these in ~/.aws/config as follows +# [profile hst] +# retry_mode = adaptive +# max_attempts = 100 +# s3 = +# multipart_threshold = 4GB +# max_concurrent_requests = 1 +# multipart_chunksize = 32MB for sf in segment_* do for s in $(cat $sf) do - mkdir -p $s - fgrep -w $s $wf | while read c; do echo "$s$'\t'${c##*/}$'\t'$c"; done |\ - parallel --colsep '\t' --will-cite -j 8 \ - "curl -sSo '{1}/{2}' aws s3 cp s3://commoncrawl/'{3}' 2> >( { echo \$(date +%D:%T) '{3}' ; cat ; } >>errlog)" + fgrep -w $s $wf |\ + while read f + do + g=$s/orig/warc/${f##*/} + if [ ! -f "$g" ] + then + aws s3 cp s3://commoncrawl/$f $g $(cat debug) --only-show-errors 2> >( { echo $(date +%D:%T) $f ; cat ; } >>errlog ) + fi + done done done