Mercurial > hg > cc > cirrus_work
diff bin/getcc.aws @ 155:56825fc8459d
moved from /beegfs/common-crawl to get under .hg
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 11 Oct 2023 12:51:06 +0100 |
parents | |
children | 463fc7b09119 |
line wrap: on
line diff
# bin/getcc.aws -- download the WARC files of one Common Crawl archive.
# courtesy wwaites
#
# Usage: getcc.aws ARCHIVE [PAUSE [SEGMENT_FILE]]
#   ARCHIVE       crawl name, e.g. CC-MAIN-2019-35
#   PAUSE         optional; seconds each download job sleeps once it is done
#                 (pass "" to skip the pause when SEGMENT_FILE is needed)
#   SEGMENT_FILE  optional; file listing segment numbers, one per line,
#                 resolved relative to the archive directory
#                 (default: all_segments, i.e. every segment in the archive)
#
# Requires bash, curl, gzip, GNU split, GNU parallel and the aws CLI.
#
# Credentials: nothing secret is stored in this script.  An earlier revision
# exported an AWS key pair and a passphrase in clear text; anything that was
# committed that way must be treated as leaked and rotated.  The aws CLI
# resolves credentials itself (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY in
# the caller's environment, or its own configuration files).  The passphrase
# was not used by any command here; export PASSPHRASE in the calling
# environment if some other tool needs it.

readonly CC_ROOT=/beegfs/common_crawl
readonly CC_DATA_URL=https://data.commoncrawl.org/crawl-data
readonly SEGMENTS_PER_CHUNK=8
readonly DOWNLOAD_JOBS=8

# Print the command-line synopsis on stderr.
usage() {
  printf 'Usage: %s ARCHIVE [PAUSE [SEGMENT_FILE]]\n' "${0##*/}" >&2
}

# Succeed iff $1 is a plain non-negative decimal number.  PAUSE is spliced
# into a shell command line, so nothing else may be let through.
valid_pause() {
  local re='^[0-9]+([.][0-9]+)?$'
  [[ "$1" =~ $re ]]
}

# Print how many segment_* chunk files to create for $1 segments: one chunk
# per SEGMENTS_PER_CHUNK segments, but never fewer than one (split rejects 0).
chunk_count() {
  local chunks=$(( $1 / SEGMENTS_PER_CHUNK ))
  if (( chunks < 1 )); then
    chunks=1
  fi
  printf '%d\n' "$chunks"
}

# Print one "SEGMENT<TAB>FILENAME<TAB>PATH" row for every path in manifest $2
# whose 4th '/'-separated field is exactly segment $1.
warc_rows() {
  # The appended "" forces a string comparison, so that e.g. segment
  # 123.20 is never treated as numerically equal to 123.2.
  awk -F/ -v seg="$1" \
    '($4 "") == (seg "") { printf "%s\t%s\t%s\n", seg, $NF, $0 }' "$2"
}

# Download and unpack the archive's path manifest from URL $1 into file $2,
# unless $2 already exists.  The data goes to a temporary file first, so a
# failed transfer cannot leave a truncated manifest that later runs trust.
fetch_manifest() {
  local url=$1 dest=$2 tmp
  if [[ -f "$dest" ]]; then
    return 0
  fi
  tmp=$(mktemp "${dest}.XXXXXX") || return
  if curl --retry 4 --fail -sS "$url" | gzip -dc > "$tmp" && [[ -s "$tmp" ]]; then
    mv -- "$tmp" "$dest"
  else
    rm -f -- "$tmp"
    printf 'getcc.aws: could not fetch %s\n' "$url" >&2
    return 1
  fi
}

# Split segment list $1 into segment_* chunk files in the current directory,
# unless a non-empty chunk is already there from a previous run.
make_chunks() {
  local segs=$1 n
  local existing=(segment_*)
  if [[ -s "${existing[0]}" ]]; then
    return 0
  fi
  if [[ ! -r "$segs" ]]; then
    printf 'getcc.aws: cannot read segment list %s\n' "$segs" >&2
    return 1
  fi
  n=$(wc -l < "$segs") || return
  split -n "l/$(chunk_count "$n")" -- "$segs" segment_
}

# For every segment named in the chunk files under directory $1, copy its
# WARC files (looked up in manifest $2) from S3 into ./SEGMENT/FILENAME.
# $3, if non-empty, is the number of seconds each job sleeps afterwards.
download_segments() {
  local base=$1 manifest=$2 pause=$3
  local chunk seg job

  # {1} {2} {3} are filled in by parallel from the tab-separated row.
  # Each job's stderr is appended to ./errlog, preceded by a timestamp and
  # the S3 key it was fetching.
  job="aws s3 cp --only-show-errors s3://commoncrawl/'{3}' '{1}/{2}'"
  job+=" 2> >( { echo \$(date +%D:%T) '{3}' ; cat ; } >>errlog)"
  if [[ -n "$pause" ]]; then
    job+=" ; sleep ${pause}"
  fi

  for chunk in "$base"/segment_*; do
    [[ -s "$chunk" ]] || continue
    # Read the chunk on fd 3 so nothing inside the loop can eat its lines.
    while IFS= read -r seg <&3; do
      [[ -n "$seg" ]] || continue
      mkdir -p -- "$seg" || return
      warc_rows "$seg" "$manifest" |
        parallel --colsep '\t' --will-cite -j "$DOWNLOAD_JOBS" "$job"
    done 3< "$chunk"
  done
}

# Entry point; see the usage notes at the top of the file.
# Returns 2 on a usage error, non-zero if a preparation step fails.
main() {
  if [[ -z "${1-}" ]]; then
    usage
    return 2
  fi
  local archive=$1
  shift

  local pause=''
  if (( $# > 0 )); then
    pause=$1
    shift
    if [[ -n "$pause" ]] && ! valid_pause "$pause"; then
      printf 'getcc.aws: PAUSE must be a number of seconds, got: %s\n' \
        "$pause" >&2
      usage
      return 2
    fi
  fi
  local segs=${1:-all_segments}

  local wf=warc.paths
  local base="${CC_ROOT}/${archive}"

  mkdir -p -- "$base" || return
  cd -- "$base" || return

  fetch_manifest "${CC_DATA_URL}/${archive}/${wf}.gz" "$wf" || return

  # Manifest paths look like crawl-data/ARCHIVE/segments/SEGMENT/warc/FILE,
  # so the 4th field is the segment number.
  if [[ ! -f all_segments ]]; then
    cut -f 4 -d / "$wf" | uniq > all_segments
  fi

  make_chunks "$segs" || return

  mkdir -p cdx/warc || return
  cd cdx/warc || return
  # The chunk files and the manifest stay in $base, two levels up from
  # here, hence the absolute paths.
  download_segments "$base" "${base}/${wf}" "$pause"
}

main "$@"