Mercurial > hg > cc > cirrus_work
view bin/getcc.aws @ 155:56825fc8459d
moved from /beegfs/common-crawl to get under .hg
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 11 Oct 2023 12:51:06 +0100 |
parents | |
children | 463fc7b09119 |
line wrap: on
line source
#!/usr/bin/env bash
# courtesy wwaites
#
# getcc.aws -- download the WARC files of one Common Crawl archive, segment by
# segment, from the commoncrawl S3 bucket into
#   /beegfs/common_crawl/ARCHIVE/cdx/warc/SEGMENT/
#
# Usage: getcc.aws ARCHIVE [DELAY] [SEGS]
#   ARCHIVE  archive name, e.g. CC-MAIN-2019-35
#   DELAY    optional number of seconds each download job sleeps after its
#            transfer; pass "" to skip it while still supplying SEGS
#   SEGS     optional file listing segment numbers, one per line, resolved
#            relative to the archive directory (default: all_segments)
# A non-numeric second argument is taken to be SEGS, so the older
# "getcc.aws ARCHIVE SEGS" calling convention keeps working.
#
# Requires: bash, curl, gzip, cut, uniq, GNU split, GNU parallel, AWS CLI.
#
# Credentials: none are stored in this script.  `aws` resolves them through
# its standard chain (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY in the
# environment, ~/.aws/credentials, ...).
# SECURITY NOTE(review): earlier revisions of this file exported an access
# key, a secret key and a passphrase in clear text; those values were
# published with the file and must be treated as compromised and revoked.

set -euo pipefail

readonly PROG=${0##*/}
readonly CC_ROOT=/beegfs/common_crawl
readonly CC_DATA_URL=https://data.commoncrawl.org/crawl-data
readonly CC_BUCKET=s3://commoncrawl
readonly WARC_LIST=warc.paths
readonly SEGMENTS_PER_CHUNK=8   # segment_* work files hold about this many segments
readonly JOBS=8                 # concurrent downloads per segment

die() {
  printf '%s: %s\n' "$PROG" "$*" >&2
  exit 1
}

usage() {
  printf 'Usage: %s ARCHIVE [DELAY] [SEGS]\n' "$PROG" >&2
  exit 2
}

# ---- arguments -------------------------------------------------------------
if (( $# < 1 )) || [[ -z "$1" ]]; then
  usage
fi
archive=$1
shift
[[ "$archive" != */* ]] || die "ARCHIVE must be a bare archive name, got '$archive'"

delay=
number_re='^[0-9]+([.][0-9]+)?$'
if (( $# > 0 )) && { [[ -z "$1" ]] || [[ "$1" =~ $number_re ]]; }; then
  delay=$1
  shift
fi
segs=${1:-all_segments}

for tool in curl gzip cut uniq split parallel aws; do
  command -v "$tool" >/dev/null || die "required tool not found: $tool"
done

# ---- per-archive bookkeeping files ----------------------------------------
archive_dir=${CC_ROOT}/${archive}
mkdir -p -- "$archive_dir"
cd -- "$archive_dir" || die "cannot cd to $archive_dir"

# Fetch the list of WARC paths once.  It is written to a scratch file and
# renamed only on success, so a failed transfer cannot leave a truncated
# list behind that later runs would silently reuse.
if [[ ! -s "$WARC_LIST" ]]; then
  partial=${WARC_LIST}.partial.$$
  trap 'rm -f -- "$partial"' EXIT
  curl --retry 4 -fsS "${CC_DATA_URL}/${archive}/${WARC_LIST}.gz" \
    | gzip -dc > "$partial" \
    || die "could not fetch ${CC_DATA_URL}/${archive}/${WARC_LIST}.gz"
  mv -- "$partial" "$WARC_LIST"
  trap - EXIT
fi

# The segment number is the 4th '/'-separated field of every WARC path.
if [[ ! -s all_segments ]]; then
  cut -f 4 -d / "$WARC_LIST" | uniq > all_segments
fi

# Split the requested segments into segment_* work files unless a previous
# run already did so (existing work files are reused as they are).
shopt -s nullglob
seg_files=(segment_*)
if (( ${#seg_files[@]} == 0 )) || [[ ! -s "${seg_files[0]}" ]]; then
  [[ -s "$segs" ]] || die "segment list '$segs' is missing or empty in $archive_dir"
  n=$(wc -l < "$segs")
  chunks=$(( n / SEGMENTS_PER_CHUNK ))
  # split rejects a chunk count of 0, which short lists would produce
  (( chunks >= 1 )) || chunks=1
  split -n "l/${chunks}" -- "$segs" segment_
  seg_files=(segment_*)
fi

# ---- downloads -------------------------------------------------------------
mkdir -p cdx/warc
cd cdx/warc || die "cannot cd to ${archive_dir}/cdx/warc"

# Command template run by parallel for every input row, whose tab-separated
# columns are {1}=segment, {2}=WARC file name, {3}=full key in the bucket.
# Each job first writes a "timestamp key" line to errlog and then appends
# whatever aws prints on stderr; the job's exit status is that of aws.
job='printf "%s %s\n" "$(date +%D:%T)" {3} >>errlog; '
job+="aws s3 cp --only-show-errors ${CC_BUCKET}/{3} {1}/{2} 2>>errlog"
if [[ -n "$delay" ]]; then
  job+="; rc=\$?; sleep ${delay}; exit \$rc"
fi

for seg_file in "${seg_files[@]}"; do
  # The work files and the WARC list live in the archive directory, not in
  # cdx/warc, hence the absolute paths.  The segment list is read on fd 3 so
  # nothing inside the loop can consume it by accident.
  while IFS= read -r seg <&3 || [[ -n "$seg" ]]; do
    [[ -n "$seg" ]] || continue
    mkdir -p -- "$seg"
    # grep exits 1 when the segment has no WARC files; that is not an error.
    { grep -F -w -- "$seg" "${archive_dir}/${WARC_LIST}" || (( $? == 1 )); } \
      | while IFS= read -r key; do
          printf '%s\t%s\t%s\n' "$seg" "${key##*/}" "$key"
        done \
      | parallel --colsep '\t' --will-cite -j "$JOBS" "$job" \
      || printf '%s: some downloads failed for segment %s; see %s\n' \
           "$PROG" "$seg" "${PWD}/errlog" >&2
  done 3< "${archive_dir}/${seg_file}"
done