# view bin/getcc_multi.aws @ 158:5d1c3359e210
#
# resurrect parallel fetch
# author Henry S. Thompson <ht@inf.ed.ac.uk>
# date Tue, 24 Oct 2023 16:58:44 +0100
# parents
# children d0dbfefd6fc0
# line wrap: on
# line source

# courtesy wwaites, yossi
# Usage: getcc_multi.aws [archive, e.g. CC-MAIN-2019-35] [-p nthreads] [file listing segment numbers]
# Parse the command line: archive [-p nthreads] [segment-list-file].
# Sets ARCHIVE, nthreads (default 2) and SEGS (default all_segments).
ARCHIVE="${1-}"
# Only shift when there is something to shift, so a bare invocation does not
# depend on a silently failing `shift`.
if [ "$#" -gt 0 ]
then
 shift
fi
nthreads=2
if [ "${1-}" = -p ]
then
 # -p must be followed by a value; an empty thread count is never a usable
 # job count for parallel.
 if [ -z "${2-}" ]
 then
  echo "getcc_multi.aws: option -p requires a thread count" 1>&2
  exit 2
 fi
 nthreads=$2
 shift 2
fi
# ${1-...}: the default applies only when no argument is left at all.
SEGS="${1-all_segments}"

# Name of the local listing of WARC paths; it is fetched as ${wf}.gz.
wf=warc.paths

# An empty archive name would turn every path below into the shared parent
# directory and a bogus URL, so refuse to continue without one.
if [ -z "$ARCHIVE" ]
then
 echo "usage: getcc_multi.aws archive [-p nthreads] [file listing segment numbers]" 1>&2
 exit 2
fi

WARCS="https://data.commoncrawl.org/crawl-data/${ARCHIVE}/${wf}.gz"

# Everything that follows works on relative paths, so failing to enter the
# per-archive directory must stop the script rather than scatter files
# into whatever the current directory happens to be.
workdir="/beegfs/common_crawl/${ARCHIVE}"
mkdir -p "$workdir" || exit 1
cd "$workdir" || exit 1

# Fetch the listing unless a non-empty copy is already present.  The download
# goes to a temporary name and is renamed only when it decompressed cleanly
# and is non-empty, so an interrupted or failed fetch is retried next run
# instead of being mistaken for a valid listing.
if [ ! -s "$wf" ]
then
 if curl --retry 4 -s "${WARCS}" | gzip -dc > "$wf.tmp" && [ -s "$wf.tmp" ]
 then
  mv "$wf.tmp" "$wf" || exit 1
 else
  rm -f "$wf.tmp"
  echo "getcc_multi.aws: could not fetch ${WARCS}" 1>&2
  exit 1
 fi
fi

# Fall back to the complete segment list when the requested list file is
# absent: field 4 of each "/"-separated path in $wf is written, with adjacent
# duplicates collapsed, to all_segments.
if [ ! -f "$SEGS" ]
then
 cut -f 4 -d / "$wf" | uniq > all_segments
 SEGS=all_segments
fi

# Pick the first segment_* batch file without parsing `ls`; when nothing
# matches, the pattern stays literal and the -s test below fails.
first_batch=
for first_batch in segment_*
do
 break
done

# (Re)build the batch files only when the first one is missing or empty.
if [ ! -s "$first_batch" ]
then
 # $(( ... )) normalises the count (and yields 0 if wc produced nothing).
 n=$(( $(wc -l < "$SEGS") ))
 if [ "$n" -eq 0 ]
 then
  echo "getcc_multi.aws: no segments listed in $SEGS" 1>&2
  exit 1
 fi
 # Aim for about 8 segments per batch file, but never ask split for zero
 # chunks: lists shorter than 8 lines go into a single batch.
 m=$((n / 8))
 if [ "$m" -lt 1 ]
 then
  m=1
 fi
 # l/$m: split into $m chunks without breaking lines.
 split -n "l/$m" "$SEGS" segment_
fi

# Credentials must never be stored in this file; the values below are
# placeholders only.
#export AWS_ACCESS_KEY_ID="<access key id>"
#export AWS_SECRET_ACCESS_KEY="<secret access key>"
#export PASSPHRASE="<passphrase>"
#export AWS_RETRY_MODE=adaptive
#export AWS_MAX_ATTEMPTS=100
# Set these in ~/.aws/credentials as follows
# [hst]
# aws_access_key_id = <access key id>
# aws_secret_access_key = <secret access key>
# And these in ~/.aws/config as follows
# [profile hst]
# retry_mode = adaptive
# max_attempts = 100
# s3 =
#     multipart_threshold = 4GB
#     max_concurrent_requests = 1
#     multipart_chunksize = 32MB

# Download loop: for every batch file segment_*, and every segment id listed
# in it, fetch that segment's WARC files from s3://commoncrawl into
# <segment>/orig/warc/, running up to $nthreads copies at a time.
echo args $ARCHIVE $nthreads 1>&2
failed=0
for sf in segment_*
do
    # An unmatched glob stays literal; skip anything that is not a real file.
    [ -f "$sf" ] || continue
    echo sf $sf
    # Read one segment id per line (also a last line without a newline).
    while read -r s || [ -n "$s" ]
    do
        [ -n "$s" ] || continue
        echo s $s
        mkdir -p "$s"
        # Select this segment's paths from the listing, report how many there
        # are ("nf N" on stderr), and hand them to parallel one per job.
        # Inside the job: {#} is the job number, {} the path; the target is
        # <segment>/orig/warc/<basename>, and files already present are
        # skipped.  Each aws call's stderr is appended to m_errlog after a
        # timestamp/target line.
        grep -F -w -e "$s" "$wf" | tee >( echo nf $(wc -l) 1>&2 ) |
            parallel --will-cite -j "$nthreads" \
            "echo '{#}' 1>&2
             f='{}'
             g=$s/orig/warc/\${f##*/}
             echo \|\$f\|\$g\| 1>&2
             if [ ! -f \$g ]
             then
              aws s3 cp s3://commoncrawl/\$f \$g --only-show-errors  2> >( { echo \$(date +%D:%T) \$g ; cat ; } >>m_errlog )
             fi
            " || failed=1
    done < "$sf"
done
# parallel exits non-zero when any job failed; surface that instead of
# letting only the last segment decide the outcome.
if [ "$failed" -ne 0 ]
then
    echo "getcc_multi.aws: some downloads failed, see m_errlog" 1>&2
fi
[ "$failed" -eq 0 ]