Mercurial > hg > cc > cirrus_work
view bin/getidx.aws @ 208:b1190db19d78
sic
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 07 Dec 2023 18:23:11 +0000 |
parents | 56825fc8459d |
children |
line wrap: on
line source
# courtesy wwaites
#
# Fetch the Common Crawl CDX index files for one crawl archive.
#
# Usage: getidx.aws ARCHIVE [PAUSE]
#   ARCHIVE  crawl identifier; used in the data.commoncrawl.org URL and as the
#            directory name under /beegfs/common_crawl
#   PAUSE    optional duration passed to sleep(1) after each download,
#            e.g. 2, 0.5 or 1m
#
# What it does:
#   1. Downloads and decompresses cc-index.paths.gz for ARCHIVE into
#      /beegfs/common_crawl/ARCHIVE/cc-index.paths (skipped if already there).
#   2. Copies every path listed in that file from s3://commoncrawl/ into
#      /beegfs/common_crawl/ARCHIVE/cdx/warc, five downloads at a time.
#   3. Appends a timestamp + path line to cdx/warc/ierrlog for every file,
#      followed by whatever `aws s3 cp` wrote to stderr for that file.
#
# Credentials: none are set here. The aws CLI picks them up from its standard
# sources (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY in the environment, a
# shared credentials file, ...). Earlier revisions hardcoded a key pair and a
# passphrase in this file; those values are in the repository history and
# should be treated as compromised and rotated. If some downstream tool
# needs PASSPHRASE, export it in the calling environment instead.
#
# Requires bash (process substitution is used inside the parallel job).

set -euo pipefail

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s ARCHIVE [PAUSE]\n' "${0##*/}" >&2
  exit 2
fi

ARCHIVE="$1"
shift

# Optional pause after each download. The value is spliced into the command
# string that parallel hands to a shell, so only accept something that looks
# like a sleep(1) duration.
pause_cmd=''
if [[ -n "${1:-}" ]]; then
  [[ "$1" =~ ^[0-9]+(\.[0-9]+)?[smhd]?$ ]] \
    || die "PAUSE must be a sleep duration (e.g. 2, 0.5, 1m), got: $1"
  pause_cmd="; sleep $1"
fi

command -v aws >/dev/null || die "aws CLI not found in PATH"
command -v parallel >/dev/null || die "GNU parallel not found in PATH"

cf=cc-index.paths
CCs="https://data.commoncrawl.org/crawl-data/${ARCHIVE}/cc-index.paths.gz"
archive_dir="/beegfs/common_crawl/${ARCHIVE}"

mkdir -p -- "$archive_dir"
cd -- "$archive_dir" || die "cannot cd to $archive_dir"

# Fetch the list of index paths once. Download to a side file and rename only
# on success, so a failed or interrupted transfer is never mistaken for a
# complete list on the next run (-s also rejects an empty leftover file).
if [[ ! -s "$cf" ]]; then
  part="${cf}.part"
  if ! curl --retry 4 -sf "$CCs" | gzip -dc > "$part"; then
    rm -f -- "$part"
    die "failed to download $CCs"
  fi
  mv -- "$part" "$cf"
fi

# n=$(cat $SEGS | wc -l)
# m=$((n / 8))
# split -n l/$m $SEGS segment_

mkdir -p cdx/warc
cd cdx/warc || die "cannot cd to $archive_dir/cdx/warc"

# One job per line of the paths file. Inside the job, f is the S3 key and
# ${f##*/} its basename, which becomes the local file name. The \$ escapes
# defer expansion to the shell that parallel starts for each job.
# The script's exit status is that of parallel.
parallel --will-cite -j 5 \
  "f='{}' && aws s3 cp s3://commoncrawl/\$f \${f##*/} --only-show-errors 2> >( { echo \$(date +%D:%T) '{}' ; cat ; } >>ierrlog) $pause_cmd" \
  < "../../$cf"