Mercurial > hg > cc > cirrus_work
view bin/getcc_multi.aws @ 247:7737da0ccb8c
try adding lm to existing index from ks_0-9
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 02 Jan 2025 14:52:14 +0000 |
parents | 6faed9e5d9c9 |
children |
line wrap: on
line source
#!/bin/bash
# courtesy wwaites
# Usage: getcc.aws <archive, e.g. CC-MAIN-2019-35> <segment ID, e.g. 68> [nthreads]
# Single segment, multiple threads
#
# Fetches every WARC file of one Common Crawl segment from S3 into
# /beegfs/common_crawl/<archive>/<segment>/orig/warc, using nthreads
# parallel download workers.  Each worker appends to errlog_<SEG>_<nn>.
# Already-downloaded files are skipped, so the script is restartable.

if [ $# -lt 2 ]
then
    echo "Usage: ${0##*/} <archive> <segment-id> [nthreads]" >&2
    exit 2
fi

ARCHIVE="$1"
SEG="$2"
nthreads="${3:-2}"
wf=warc.paths
WARCS="https://data.commoncrawl.org/crawl-data/${ARCHIVE}/${wf}.gz"

mkdir -p "/beegfs/common_crawl/${ARCHIVE}"
# Abort if the work directory is unavailable -- otherwise every relative
# path below (manifest, error logs, downloads) lands somewhere wrong.
cd "/beegfs/common_crawl/${ARCHIVE}" || exit 1

# Fetch the archive's manifest of WARC paths once; reused on reruns.
if [ ! -f "$wf" ]
then
    curl --retry 4 -s "${WARCS}" | gzip -dc > "$wf"
fi

# AWS credentials and transfer tuning are NOT set here.
# Set the credentials in ~/.aws/credentials as follows
# [hst]
# aws_access_key_id = <your-access-key-id>
# aws_secret_access_key = <your-secret-access-key>
# And these in ~/.aws/config as follows
# [profile hst]
# retry_mode = adaptive
# max_attempts = 100
# s3 =
#  multipart_threshold = 4GB
#  max_concurrent_requests = 1
#  multipart_chunksize = 32MB

echo "$(date)" start "$SEG"

# Resolve the short segment number to the full segment ID by matching
# "<digits>.<SEG>" in the manifest (first hit wins).
s=$(grep -Eow "[0-9]*\.$SEG" "$wf" | head -1)
# /tmp/hst holds the per-run work lists; create it or the redirect below fails.
mkdir -p "$s/orig/warc" /tmp/hst

# Work list = all WARC paths for this segment minus those already present
# in orig/warc (ls may match nothing on a fresh start, hence "|| :").
grep -Fv -f <(cd "$s/orig/warc" && ls *.warc.gz || :) \
     <(grep -Fw "$s" "$wf") > "/tmp/hst/$s"

# Split the work list into nthreads equal chunks: /tmp/hst/<s>_01, _02, ...
split -a 2 --numeric-suffixes=1 -n "l/$nthreads" "/tmp/hst/$s" "/tmp/hst/${s}_"

printf "%02d\n" $(seq 1 "$nthreads") | while read i
do
  cat "/tmp/hst/${s}_$i" | {
    printf "thread\t%s\t%s\t%s\n" "$i" $$ "$(ps -o pgid= -p $$)" >> "errlog_${SEG}_$i"
    # The reader loop is backgrounded but inherits the chunk pipe as stdin.
    # Each missing WARC is copied from S3; aws's stderr is timestamped and
    # appended to this worker's error log via process substitution.
    while read f
    do
      g="$s/orig/warc/${f##*/}"
      if [ ! -f "$g" ]
      then
        # $(cat debug) optionally injects extra aws-cli flags from ./debug;
        # deliberately left unquoted so the flags word-split.  Silence cat's
        # complaint when no debug file exists.
        aws s3 cp "s3://commoncrawl/$f" "$g" $(cat debug 2>/dev/null) --only-show-errors \
            2> >( { echo "$(date +%D:%T)" "$f" ; cat ; } >> "errlog_${SEG}_$i" )
      fi
    done &
    # Stagger worker start-up so S3 isn't hit by all threads at once.
    sleep 30
  }
done

# Barrier: the workers run in pipeline subshells, so 'wait' can't see them;
# poll instead until no aws transfer for this segment remains.
while pgrep -f "aws s3.*${s}" > /dev/null; do sleep 60; done
echo "$(date)" end "$SEG"