Mercurial > hg > cc > cirrus_work
diff bin/getidx.aws @ 155:56825fc8459d
moved from /beegfs/common-crawl to get under .hg
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 11 Oct 2023 12:51:06 +0100 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env bash
#
# getidx.aws -- download every file listed in one Common Crawl archive's
# cc-index.paths manifest from the s3://commoncrawl bucket.
# (courtesy wwaites)
#
# Usage: getidx.aws ARCHIVE [SLEEP]
#   ARCHIVE  name of the crawl; used both as the crawl-data/ component of the
#            manifest URL and as the directory name under /beegfs/common_crawl
#   SLEEP    optional; when given, each download job is followed by
#            "sleep SLEEP"
#
# Results land in /beegfs/common_crawl/ARCHIVE/cdx/warc, one file per manifest
# line, plus a log file named ierrlog in that same directory.
#
# SECURITY: this script no longer carries AWS credentials.  Supply them the
# way the AWS CLI normally finds them (AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY in the environment, a shared credentials file, ...).
# The key pair and passphrase that used to be hard-coded here were committed
# to version control and must be treated as compromised: rotate them.
# NOTE(review): the old script also exported PASSPHRASE; nothing in this
# script reads it, so it was dropped -- export it yourself if a tool you run
# alongside depends on it.

set -u          # an unset variable is a bug, not an empty string
set -o pipefail # a failed curl must fail the curl | gzip pipeline

if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s ARCHIVE [SLEEP]\n' "${0##*/}" >&2
  exit 2
fi

ARCHIVE="$1"
shift

# Optional per-job pause.  The text is appended verbatim to the command that
# parallel runs for each input line, so it executes after every download.
wait=''
if [[ -n "${1:-}" ]]; then
  wait="; sleep $1"
fi

cf=cc-index.paths

CCs="https://data.commoncrawl.org/crawl-data/${ARCHIVE}/cc-index.paths.gz"

dest="/beegfs/common_crawl/${ARCHIVE}"

# Stop if the working directory cannot be entered; otherwise everything
# below would be written into whatever directory the caller happened to be in.
mkdir -p -- "${dest}" || exit 1
cd -- "${dest}" || exit 1

# Fetch the manifest once.  It is written to a side file and renamed only
# after curl and gzip both succeeded, so an interrupted or failed download
# cannot leave behind a truncated manifest that later runs would trust.
# -s (rather than -f) also re-fetches when an earlier run left an empty file.
if [[ ! -s "${cf}" ]]; then
  part="${cf}.part"
  if ! curl --retry 4 -fsS "${CCs}" | gzip -dc > "${part}"; then
    rm -f -- "${part}"
    printf '%s: could not fetch %s\n' "${0##*/}" "${CCs}" >&2
    exit 1
  fi
  mv -- "${part}" "${cf}" || exit 1
fi

# Disabled leftover (SEGS is not defined anywhere in this script):
# n=$(cat $SEGS | wc -l)
# m=$((n / 8))
# split -n l/$m $SEGS segment_

mkdir -p cdx/warc || exit 1
cd cdx/warc || exit 1

# Run up to five downloads at a time, one per manifest line.
#   {}            replaced by parallel with the current manifest line
#   \$f, \${...}  escaped so they are expanded by the job's shell, not here;
#                 ${f##*/} drops the directory part, so each object is saved
#                 under its base name in the current directory
#   2> >( ... )   each job's stderr is appended to ierrlog after a line
#                 holding a date/time stamp and the path; that stamp line is
#                 written for every job, whether or not aws reported anything.
#                 The process substitution needs the shell that parallel uses
#                 for jobs to support it (e.g. bash).
#   $wait         expanded here, adding the optional "; sleep N" to each job
parallel --will-cite -j 5 \
  "f='{}' && aws s3 cp s3://commoncrawl/\$f \${f##*/} --only-show-errors 2> >( { echo \$(date +%D:%T) '{}' ; cat ; } >>ierrlog) $wait" \
  < "../../${cf}"