Mercurial > hg > cc > cirrus_work
changeset 158:5d1c3359e210
resurrect parallel fetch
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 24 Oct 2023 16:58:44 +0100 |
parents | 463fc7b09119 |
children | ebff60e85c59 |
files | bin/getcc_multi.aws |
diffstat | 1 files changed, 78 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env bash
# bin/getcc_multi.aws -- fetch the WARC files of one Common Crawl archive
# from S3, several files at a time.  Courtesy wwaites, yossi.
#
# Usage: getcc_multi.aws ARCHIVE [-p nthreads] [segment-list-file]
#   ARCHIVE            crawl name, e.g. CC-MAIN-2019-35
#   -p nthreads        number of simultaneous downloads (default 2)
#   segment-list-file  file listing segment numbers, one per line, looked up
#                      inside the archive directory; if it does not exist,
#                      every segment named in warc.paths is used
#
# Everything is stored under /beegfs/common_crawl/ARCHIVE.  Files that are
# already present are not downloaded again, so an interrupted run can simply
# be restarted.
#
# AWS credentials and retry settings are NOT kept in this script.
# Set the credentials in ~/.aws/credentials as follows
#   [hst]
#   aws_access_key_id = <your access key id>
#   aws_secret_access_key = <your secret access key>
# And the retry/transfer settings in ~/.aws/config as follows
#   [profile hst]
#   retry_mode = adaptive
#   max_attempts = 100
#   s3 =
#     multipart_threshold = 4GB
#     max_concurrent_requests = 1
#     multipart_chunksize = 32MB
# NOTE(review): nothing below selects the "hst" profile; presumably the
# caller exports AWS_PROFILE -- confirm.

# Print the usage message on stderr.  Returns 2 (rather than exiting) so that
# it can be the last command of the script and supply its exit status.
usage() {
  printf 'Usage: %s ARCHIVE [-p nthreads] [segment-list-file]\n' \
    "${0##*/}" >&2
  return 2
}

# Print a message on stderr and abort the script.
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

# Print how many segment_* chunk files a list of $1 segments is split into:
# roughly eight segments per chunk, but never fewer than one chunk, because
# split(1) rejects a chunk count of 0.
chunk_count() {
  local n=$1
  local m=$(( n / 8 ))
  (( m > 0 )) || m=1
  printf '%s\n' "$m"
}

# Succeed if the first segment_* file in the current directory exists and is
# non-empty.  When nothing matches, the glob stays literal and -s fails.
first_chunk_nonempty() {
  local chunks=(segment_*)
  [[ -s "${chunks[0]}" ]]
}

# Download the gzipped path listing at URL $1 and store it uncompressed as $2.
# The data goes to a temporary file first so that a failed or partial
# download never leaves a truncated listing behind.
fetch_paths() {
  local url=$1 out=$2
  local tmp="${out}.part"
  if curl --retry 4 -sf "$url" | gzip -dc > "$tmp" && [[ -s "$tmp" ]]; then
    mv -- "$tmp" "$out"
  else
    rm -f -- "$tmp"
    return 1
  fi
}

# Parse the command line, prepare the archive directory and download every
# WARC file of every requested segment.
main() {
  local archive=$1
  shift
  [[ -n "$archive" ]] || die "ARCHIVE must not be empty"

  local nthreads=2
  if [[ "${1-}" == -p ]]; then
    [[ -n "${2-}" ]] || die "-p requires a thread count"
    nthreads=$2
    shift 2
  fi
  local segs=${1-all_segments}

  local wf=warc.paths
  local warcs="https://data.commoncrawl.org/crawl-data/${archive}/${wf}.gz"
  local dest="/beegfs/common_crawl/${archive}"

  # All relative paths below are relative to the archive directory, so a
  # failed cd must stop the run.
  mkdir -p -- "$dest" || die "cannot create $dest"
  cd -- "$dest" || die "cannot cd to $dest"

  # -s rather than -f: an empty listing left by an earlier failure is
  # fetched again.
  if [[ ! -s "$wf" ]]; then
    fetch_paths "$warcs" "$wf" || die "failed to download $warcs"
  fi

  # Without a usable segment list, derive one from the path listing: the
  # segment number is the 4th /-separated field of each path.
  if [[ ! -f "$segs" ]]; then
    cut -f 4 -d / "$wf" | uniq > all_segments
    segs=all_segments
  fi

  # Split the segment list into segment_* chunk files once.
  if ! first_chunk_nonempty; then
    local n
    n=$(wc -l < "$segs")
    split -n "l/$(chunk_count "$n")" -- "$segs" segment_ \
      || die "cannot split $segs"
  fi

  echo args "$archive" "$nthreads" 1>&2

  local sf s
  local -a segments
  for sf in segment_*; do
    echo sf "$sf"
    mapfile -t segments < "$sf"
    for s in "${segments[@]}"; do
      [[ -n "$s" ]] || continue
      echo s "$s"
      mkdir -p -- "$s"
      # Feed every path of this segment to parallel; tee reports the number
      # of paths ("nf") on stderr.  In the command string, $s is expanded
      # here, while the \$-escaped variables are expanded by the shell that
      # parallel starts for each path ({} is the path, {#} the job number).
      # The date and target name are appended to m_errlog for every copy
      # that is attempted, followed by whatever aws prints on stderr.
      grep -F -w -- "$s" "$wf" | tee >( echo nf $(wc -l) 1>&2 ) |
        parallel --will-cite -j "$nthreads" \
          "echo '{#}' 1>&2
           f='{}'
           g=$s/orig/warc/\${f##*/}
           echo \|\$f\|\$g\| 1>&2
           if [ ! -f \$g ]
           then
             aws s3 cp s3://commoncrawl/\$f \$g --only-show-errors 2> >( { echo \$(date +%D:%T) \$g ; cat ; } >>m_errlog )
           fi
          "
    done
  done
}

# With no arguments, show the usage message and finish with status 2;
# otherwise run the download.
if (( $# == 0 )); then
  usage
else
  main "$@"
fi