Mercurial > hg > cc > cirrus_work
view bin/getcc_multi.aws @ 158:5d1c3359e210
resurrect parallel fetch
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 24 Oct 2023 16:58:44 +0100 |
parents | |
children | d0dbfefd6fc0 |
line wrap: on
line source
#!/usr/bin/env bash
# courtesy wwaites, yossi
#
# Fetch the WARC files of one Common Crawl archive from S3, one segment at a
# time, running several "aws s3 cp" downloads in parallel via GNU parallel.
#
# Usage: getcc_multi.aws [archive, e.g. CC-MAIN-2019-35] [-p nthreads] [file listing segment numbers]
#
#   archive    name of the crawl; used both in the download URL and as the
#              working directory name under /beegfs/common_crawl
#   -p N       value handed to "parallel -j" (default 2)
#   file       list of segment ids, one per line, looked up relative to the
#              archive directory (default: all_segments, which is derived
#              from warc.paths when the named file does not exist)
#
# Files created in the archive directory:
#   warc.paths         uncompressed list of WARC paths for the archive
#   all_segments       unique segment ids taken from warc.paths
#   segment_??         the segment list split into chunks
#   <seg>/orig/warc/   the downloaded WARC files for segment <seg>
#   m_errlog           timestamped stderr of every "aws s3 cp" that was run
#
# AWS credentials and retry settings are NOT stored in this script.
# Set the credentials in ~/.aws/credentials as follows
#  [hst]
#  aws_access_key_id = <your access key id>
#  aws_secret_access_key = <your secret access key>
# And these in ~/.aws/config as follows
#  [profile hst]
#  retry_mode = adaptive
#  max_attempts = 100
#  s3 =
#    multipart_threshold = 4GB
#    max_concurrent_requests = 1
#    multipart_chunksize = 32MB
# NOTE(review): a literal key pair and passphrase used to be embedded in the
# comments of this file; treat those values as compromised and rotate them.

# Root directory under which one sub-directory per archive is kept.
readonly CC_ROOT=/beegfs/common_crawl
# Name of the (uncompressed) WARC path listing inside the archive directory.
readonly WARC_PATHS=warc.paths

# Print the usage line on stderr.
usage() {
  printf 'Usage: %s archive [-p nthreads] [file listing segment numbers]\n' \
    "${0##*/}" >&2
}

# Print the number of segment_ chunk files to produce for a list of $1 lines:
# $1 / 8 (integer division), but never less than 1, because "split -n l/0"
# is an error and would leave no segment files at all.
chunk_count() {
  local n=$1
  local m=$(( n / 8 ))
  (( m >= 1 )) || m=1
  printf '%s\n' "$m"
}

# Download the gzipped path listing at URL $1 and store it uncompressed as $2.
# The data goes to a temporary file first so that a failed or empty download
# never leaves a bogus $2 behind. Returns non-zero on failure.
fetch_paths() {
  local url=$1 dest=$2
  local tmp="${dest}.part"
  # Without pipefail the pipeline status is gzip's; -f makes curl emit nothing
  # on an HTTP error, which the -s test below then catches.
  if curl --retry 4 -sf "$url" | gzip -dc > "$tmp" && [ -s "$tmp" ]; then
    mv -- "$tmp" "$dest"
  else
    rm -f -- "$tmp"
    printf 'cannot download %s\n' "$url" >&2
    return 1
  fi
}

# Download every WARC file of segment $1 that is not already present,
# running at most $2 (a "parallel -j" value) transfers at a time.
fetch_segment() {
  local seg=$1 nthreads=$2
  mkdir -p -- "$seg"
  # The command string is expanded twice: $seg by this shell, everything
  # written as \$... by the shell that parallel starts for each input line.
  # {} is the WARC path read from stdin and {#} the job sequence number.
  grep -F -w -- "$seg" "$WARC_PATHS" | tee >( echo "nf $(wc -l)" 1>&2 ) |
    parallel --will-cite -j "$nthreads" \
      "echo '{#}' 1>&2
       f='{}'
       g=$seg/orig/warc/\${f##*/}
       echo \|\$f\|\$g\| 1>&2
       if [ ! -f \"\$g\" ]
       then
         aws s3 cp \"s3://commoncrawl/\$f\" \"\$g\" --only-show-errors 2> >( { echo \$(date +%D:%T) \"\$g\" ; cat ; } >>m_errlog )
       fi
      "
}

# Entry point: parse the command line, prepare the archive directory and
# fetch every segment. Returns 2 on a usage error, 1 on a setup failure.
main() {
  local archive=${1-}
  if [ -z "$archive" ]; then
    usage
    return 2
  fi
  shift

  local nthreads=2
  if [ "${1-}" = -p ]; then
    shift
    # Only emptiness is checked: "parallel -j" also accepts forms like 50%.
    if [ -z "${1-}" ]; then
      usage
      return 2
    fi
    nthreads=$1
    shift
  fi

  local segs=${1-all_segments}
  local url="https://data.commoncrawl.org/crawl-data/${archive}/${WARC_PATHS}.gz"
  local workdir="${CC_ROOT}/${archive}"

  # Everything below uses relative paths, so refuse to go on from the wrong
  # directory.
  if ! mkdir -p -- "$workdir" || ! cd -- "$workdir"; then
    printf 'cannot use directory %s\n' "$workdir" >&2
    return 1
  fi

  # -s rather than -f: an empty listing left by an interrupted run is redone.
  if [ ! -s "$WARC_PATHS" ]; then
    fetch_paths "$url" "$WARC_PATHS" || return 1
  fi

  if [ ! -f "$segs" ]; then
    if [ "$segs" != all_segments ]; then
      printf 'segment list %s not found, using all segments\n' "$segs" >&2
    fi
    # Field 4 of each "/"-separated WARC path is the segment id.
    cut -f 4 -d / "$WARC_PATHS" | uniq > all_segments
    segs=all_segments
  fi

  # Split the segment list only when no usable chunk file exists yet; with no
  # match the glob stays literal and the -s test fails, as intended.
  local existing=(segment_*)
  if [ ! -s "${existing[0]}" ]; then
    local n
    n=$(wc -l < "$segs")
    split -n "l/$(chunk_count "$n")" "$segs" segment_
  fi

  printf 'args %s %s\n' "$archive" "$nthreads" 1>&2

  local sf seg
  for sf in segment_*; do
    [ -e "$sf" ] || continue
    printf 'sf %s\n' "$sf"
    # Read the chunk on fd 3 so that nothing inside the loop body can
    # consume the list through stdin.
    while read -r seg <&3 || [ -n "$seg" ]; do
      [ -n "$seg" ] || continue
      printf 's %s\n' "$seg"
      fetch_segment "$seg" "$nthreads"
    done 3< "$sf"
  done
}

main "$@"