view bin/getidx.aws @ 198:1ebc75d10a00

tweaked formatting
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 04 Dec 2023 10:40:47 +0000
parents 56825fc8459d
children
line wrap: on
line source

# courtesy wwaites
#
# Usage: $0 ARCHIVE [DELAY]
#   ARCHIVE  crawl name; it is substituted into the download URL and the
#            local directory path further down.
#   DELAY    optional argument for sleep(1), run after each copy.

# Print the "; sleep DELAY" suffix that is appended to every parallel job.
# Returns 1 without printing when DELAY contains anything other than digits,
# a dot, or one of the s/m/h/d unit letters: the suffix is pasted into a
# command string that a shell evaluates, so arbitrary text must not get in.
_sleep_suffix() {
 case "$1" in
  ''|*[!0-9.smhd]*)
   return 1
   ;;
 esac
 printf '; sleep %s' "$1"
}

ARCHIVE="${1-}"
# A bare `shift` with no positional parameters is an error (and aborts the
# script in some shells), so only shift when there is something to drop.
if [ "$#" -gt 0 ]
then
 shift
fi
if [ "${1-}" ]
then
 wait=$(_sleep_suffix "$1") || {
  echo "$0: invalid delay '$1' (expected e.g. 2, 0.5 or 1m)" >&2
  exit 2
 }
fi

# Refuse to continue without an archive name: an empty value would create and
# populate /beegfs/common_crawl/ itself and produce a malformed URL.
if [ -z "${ARCHIVE-}" ]
then
 echo "usage: $0 ARCHIVE [DELAY]" >&2
 exit 2
fi

# Local name of the decompressed path list.
cf=cc-index.paths

# Remote location of the gzip-compressed path list for this archive.
CCs="https://data.commoncrawl.org/crawl-data/${ARCHIVE}/cc-index.paths.gz"

# Everything below runs relative to the per-archive directory, so stop if it
# cannot be created or entered rather than writing into the caller's cwd.
mkdir -p "/beegfs/common_crawl/${ARCHIVE}" || exit 1
cd "/beegfs/common_crawl/${ARCHIVE}" || exit 1

# Fetch and decompress the path list once; an existing $cf is reused as is.
if [ ! -f "$cf" ]
then
 # Work under temporary names and rename only after both curl and gzip have
 # succeeded. Writing straight to $cf would leave an empty or truncated file
 # behind on failure, which the -f test above would then accept on every
 # later run.
 # -f makes curl exit non-zero on an HTTP error instead of saving the error
 # body; -S keeps the error message visible despite -s.
 if curl --retry 4 -fsS "${CCs}" -o "$cf.gz.part" &&
    gzip -dc "$cf.gz.part" > "$cf.part"
 then
  mv "$cf.part" "$cf" || exit 1
  rm -f "$cf.gz.part"
 else
  rm -f "$cf.gz.part" "$cf.part"
  echo "$0: failed to fetch ${CCs}" >&2
  exit 1
 fi
fi

# n=$(cat $SEGS | wc -l)
# m=$((n / 8))
# split -n l/$m $SEGS segment_

# Credentials are taken from the caller's environment and are never stored in
# this file. Values already present are passed through to the aws child
# processes untouched.
# NOTE(review): an access key, secret key and passphrase used to be hardcoded
# here and remain in the repository history; treat them as compromised and
# rotate them.
if [ -n "${AWS_ACCESS_KEY_ID-}" ] && [ -n "${AWS_SECRET_ACCESS_KEY-}" ]
then
 export AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY
else
 # Not fatal: the aws CLI may still find credentials through its own
 # configuration. If it cannot, each copy will report that in ierrlog.
 echo "$0: AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY not set in the environment; relying on the aws CLI's own credential lookup" >&2
fi
# Nothing in this script reads PASSPHRASE; it is only forwarded when the
# caller has set it.
if [ -n "${PASSPHRASE-}" ]
then
 export PASSPHRASE
fi

# Copies are written into cdx/warc below the archive directory. Stop if it
# cannot be created or entered, otherwise the files would land elsewhere and
# the relative ../../$cf path below would not resolve.
mkdir -p cdx/warc || exit 1
cd cdx/warc || exit 1
# Run one `aws s3 cp` per line of $cf, five at a time, saving each object
# under its basename (${f##*/}) in the current directory.
# Inside the quoted command, \$ defers expansion to the shell that parallel
# starts for each job, whereas $wait is expanded here, once, so the optional
# "; sleep N" suffix becomes part of every job.
# stderr of each copy is routed through a process substitution that writes a
# "date path" header line and then whatever aws printed, appended to ierrlog;
# the header is written for every item, whether or not aws reported anything.
# NOTE(review): >( ) is not POSIX sh, so the shell parallel uses for jobs
# presumably has to be bash or similar — confirm on the target host.
parallel --will-cite -j 5 \
  "f='{}' && aws s3 cp s3://commoncrawl/\$f \${f##*/}  --only-show-errors 2> >( { echo \$(date +%D:%T) '{}' ; cat ; } >>ierrlog) $wait" \
  < "../../$cf"