Mercurial > hg > cc > cirrus_work
diff bin/getcc_multi.aws @ 161:d0dbfefd6fc0
forget parallel, just do (default 2) parallel single threads
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 25 Oct 2023 23:01:59 +0100 |
parents | 5d1c3359e210 |
children | 348f4a31228f |
line wrap: on
line diff
--- a/bin/getcc_multi.aws Wed Oct 25 23:00:45 2023 +0100
+++ b/bin/getcc_multi.aws Wed Oct 25 23:01:59 2023 +0100
@@ -1,16 +1,9 @@
-# courtesy wwaites, yossi
-# Usage: getcc_multi.aws [archive, e.g. CC-MAIN-2019-35] [-p nthreads] [file listing segment numbers]
+# courtesy wwaites
+# Usage: getcc.aws <archive, e.g. CC-MAIN-2019-35> <segment ID, e.g. 68> [nthreads]
+# Single segment, multiple threads
 ARCHIVE="$1"
-shift
-if [ "$1" = -p ]
-then
- shift
- nthreads=$1
- shift
-else
- nthreads=2
-fi
-SEGS="${1-all_segments}"
+SEG=$2
+nthreads=${3:-2}
 
 wf=warc.paths
 
@@ -24,19 +17,6 @@
 curl --retry 4 -s ${WARCS} | gzip -dc > $wf
 fi
 
-if [ ! -f $SEGS ]
-then
- cut -f 4 -d / $wf |uniq > all_segments
- SEGS=all_segments
-fi
-
-if [ ! -s "$(ls segment_* | head -1)" ]
-then
- n=$(cat $SEGS | wc -l)
- m=$((n / 8))
- split -n l/$m $SEGS segment_
-fi
-
 #export AWS_ACCESS_KEY_ID="AKIAIKBLNO2XNVMWM5JA"
 #export AWS_SECRET_ACCESS_KEY="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"
 #export PASSPHRASE="annebooththompson"
@@ -55,24 +35,20 @@
 # max_concurrent_requests = 1
 # multipart_chunksize = 32MB
 
-echo args $ARCHIVE $nthreads 1>&2
-for sf in segment_*
+s=$(grep -Eow "[0-9]*\.$SEG" $wf | head -1)
+mkdir -p $s/orig/warc
+fgrep -v -f <(cd $s/orig/warc && ls *.warc.gz) <(fgrep -w $s warc.paths) > /tmp/hst/$s
+
+split -a 1 --numeric-suffixes=1 -n l/$nthreads /tmp/hst/$s /tmp/hst/${s}_
+
+seq 1 $nthreads | while read i
 do
- echo sf $sf
- for s in $(cat $sf)
- do
- echo s $s
- mkdir -p $s
- fgrep -w $s $wf | tee >( echo nf $(wc -l) 1>&2 ) |\
- parallel --will-cite -j $nthreads \
- "echo '{#}' 1>&2
- f='{}'
- g=$s/orig/warc/\${f##*/}
- echo \|\$f\|\$g\| 1>&2
- if [ ! -f \$g ]
- then
- aws s3 cp s3://commoncrawl/\$f \$g --only-show-errors 2> >( { echo \$(date +%D:%T) \$g ; cat ; } >>m_errlog )
- fi
- "
- done
+ cat /tmp/hst/${s}_$i | while read f
+ do
+ g=$s/orig/warc/${f##*/}
+ if [ ! -f "$g" ]
+ then
+ aws s3 cp s3://commoncrawl/$f $g $(cat debug) --only-show-errors 2> >( { echo $(date +%D:%T) $f ; cat ; } >>errlog_${SEG}_$i )
+ fi
+ done &
 done