view bin/getcc_multi.aws @ 161:d0dbfefd6fc0

forget parallel, just do (default 2) parallel single threads
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 25 Oct 2023 23:01:59 +0100
parents 5d1c3359e210
children 348f4a31228f

#!/bin/bash
# courtesy wwaites
# Usage: getcc_multi.aws <archive, e.g. CC-MAIN-2019-35> <segment ID, e.g. 68> [nthreads, default 2]
# Fetches a single Common Crawl segment using nthreads parallel download threads
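# Example invocation (illustrative values; assumes bash, curl and the aws CLI are available):
#   bash getcc_multi.aws CC-MAIN-2019-35 68 4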
ARCHIVE="$1"
SEG=$2
nthreads=${3:-2}

# warc.paths lists the path of every WARC file in this crawl
wf=warc.paths

WARCS=https://data.commoncrawl.org/crawl-data/${ARCHIVE}/${wf}.gz

# All downloads land under the archive directory on BeeGFS
mkdir -p /beegfs/common_crawl/${ARCHIVE}
cd /beegfs/common_crawl/${ARCHIVE} || exit 1

if [ ! -f $wf ]
then
 curl --retry 4 -s ${WARCS} | gzip -dc > $wf
fi

#export AWS_ACCESS_KEY_ID="AKIAIKBLNO2XNVMWM5JA"
#export AWS_SECRET_ACCESS_KEY="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"
#export PASSPHRASE="annebooththompson"
#export AWS_RETRY_MODE=adaptive
#export AWS_MAX_ATTEMPTS=100
# Set these in ~/.aws/credentials as follows
# [hst]
# aws_access_key_id = AKIAIKBLNO2XNVMWM5JA
# aws_secret_access_key = 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
# And these in ~/.aws/config as follows
# [profile hst]
# retry_mode = adaptive
# max_attempts = 100
# s3 =
#     multipart_threshold = 4GB
#     max_concurrent_requests = 1
#     multipart_chunksize = 32MB
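# Note: the aws command below does not pass --profile explicitly; one way to make
# it use the [hst] profile above (an assumption, not something this script sets)
# is to export AWS_PROFILE=hst before running it.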

# Find the full segment name (<crawl timestamp>.$SEG) for the requested segment ID
s=$(grep -Eow "[0-9]*\.$SEG" $wf | head -1)
mkdir -p $s/orig/warc
mkdir -p /tmp/hst

# List the WARC paths in this segment that have not already been downloaded
fgrep -v -f <(cd $s/orig/warc && ls *.warc.gz 2>/dev/null) <(fgrep -w $s warc.paths) > /tmp/hst/$s

split -a 1 --numeric-suffixes=1 -n l/$nthreads /tmp/hst/$s /tmp/hst/${s}_
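# This produces /tmp/hst/${s}_1 .. /tmp/hst/${s}_$nthreads, one chunk per worker,
# without splitting lines (-a 1 gives single-digit suffixes, so nthreads should be at most 9)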

# Launch one background download worker per chunk of the path list
for i in $(seq 1 $nthreads)
do
  cat /tmp/hst/${s}_$i | while read f
  do
    g=$s/orig/warc/${f##*/}
    if [ ! -f "$g" ]
    then
      # Extra aws CLI arguments, if any, are read from a file named 'debug' in the
      # archive directory; stderr is prefixed with a timestamp and the file name,
      # then appended to a per-worker error log
      aws s3 cp s3://commoncrawl/$f $g $(cat debug) --only-show-errors  2> >( { echo $(date +%D:%T) $f ; cat ; } >>errlog_${SEG}_$i )
    fi
  done &
done

# Block until every download worker has finished
wait
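
# A rough way to check progress from another shell (illustrative; substitute the
# archive and segment names used above):
#   ls /beegfs/common_crawl/<ARCHIVE>/<segment>/orig/warc/*.warc.gz | wc -l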