Mercurial > hg > cc > cirrus_work
changeset 159:ebff60e85c59
now does one named segment only
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 24 Oct 2023 16:59:23 +0100 |
parents | 5d1c3359e210 |
children | afd2ece1fb22 |
files | bin/getcc.aws |
diffstat | 1 files changed, 11 insertions(+), 34 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/bin/getcc.aws Tue Oct 24 16:58:44 2023 +0100
+++ b/bin/getcc.aws Tue Oct 24 16:59:23 2023 +0100
@@ -1,14 +1,8 @@
 # courtesy wwaites
-# Usage: getcc.aws [archive, e.g. CC-MAIN-2019-35] [file listing segment numbers]
+# Usage: getcc.aws [archive, e.g. CC-MAIN-2019-35] [segment ID, e.g. 68]
+# Single segment, single thread
 ARCHIVE="$1"
-shift
-if [ "$1" = -w ]
-then
- shift
- wait="; sleep $1"
- shift
-fi
-SEGS="${1-all_segments}"
+SEG="$2"

 wf=warc.paths

@@ -22,18 +16,6 @@
  curl --retry 4 -s ${WARCS} | gzip -dc > $wf
 fi

-if [ ! -f $SEGS ]
-then
- cut -f 4 -d / $wf |uniq > all_segments
-fi
-
-if [ ! -s "$(ls segment_* | head -1)" ]
-then
- n=$(cat $SEGS | wc -l)
- m=$((n / 8))
- split -n l/$m $SEGS segment_
-fi
-
 #export AWS_ACCESS_KEY_ID="AKIAIKBLNO2XNVMWM5JA"
 #export AWS_SECRET_ACCESS_KEY="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"
 #export PASSPHRASE="annebooththompson"
@@ -52,18 +34,13 @@
 # max_concurrent_requests = 1
 # multipart_chunksize = 32MB

-for sf in segment_*
+s=$(grep -Eow "[0-9]*\.$SEG" $wf | head -1)
+fgrep -w $s $wf |\
+while read f
 do
- for s in $(cat $sf)
- do
- fgrep -w $s $wf |\
- while read f
- do
- g=$s/orig/warc/${f##*/}
- if [ ! -f "$g" ]
- then
- aws s3 cp s3://commoncrawl/$f $g $(cat debug) --only-show-errors 2> >( { echo $(date +%D:%T) $f ; cat ; } >>errlog )
- fi
- done
- done
+ g=$s/orig/warc/${f##*/}
+ if [ ! -f "$g" ]
+ then
+ aws s3 cp s3://commoncrawl/$f $g $(cat debug) --only-show-errors 2> >( { echo $(date +%D:%T) $f ; cat ; } >>errlog_$SEG )
+ fi
 done
```