view bin/getcc.aws @ 155:56825fc8459d

moved from /beegfs/common-crawl to get under .hg
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 11 Oct 2023 12:51:06 +0100
parents
children 463fc7b09119
line wrap: on
line source

# courtesy wwaites
# Usage: getcc.aws ARCHIVE [DELAY [SEGMENT_FILE]]
#   ARCHIVE       crawl name, e.g. CC-MAIN-2019-35
#   DELAY         optional; when non-empty it is stored in $wait as the
#                 command suffix "; sleep DELAY"
#   SEGMENT_FILE  file listing segment numbers; it is read from the argument
#                 that follows DELAY and defaults to all_segments when that
#                 argument is absent
ARCHIVE="$1"; shift
case "$1" in
 '') ;;
 *)  wait="; sleep $1"
     shift ;;
esac
SEGS="${1-all_segments}"

# Name of the local listing of WARC paths for the crawl (read line by line
# further down, one path per line).
wf=warc.paths

# The listing is published gzip-compressed next to the crawl data.
WARCS=https://data.commoncrawl.org/crawl-data/${ARCHIVE}/${wf}.gz

# All work happens inside the per-crawl directory. Stop if it cannot be
# entered, otherwise every later step would write into the wrong place.
mkdir -p "/beegfs/common_crawl/${ARCHIVE}"
cd "/beegfs/common_crawl/${ARCHIVE}" || exit 1

# Fetch the listing only when there is no usable (non-empty) copy yet.
# The download goes to a temporary name and is moved into place only when it
# decompressed cleanly and is non-empty, so a failed transfer never leaves a
# truncated or empty listing behind that later runs would mistake for valid.
# curl -f makes HTTP errors produce no output instead of an error page.
if [ ! -s "$wf" ]
then
 if curl --retry 4 -sf "${WARCS}" | gzip -dc > "$wf.part" && [ -s "$wf.part" ]
 then
  mv "$wf.part" "$wf"
 else
  rm -f "$wf.part"
  echo "getcc.aws: could not fetch ${WARCS}" >&2
  exit 1
 fi
fi

# Derive the list of segment identifiers once: the 4th '/'-separated field of
# every path in the listing, with adjacent duplicates collapsed.
if [ ! -f all_segments ]
then
 cut -f 4 -d / "$wf" | uniq > all_segments
fi

# chunk_segments LIST_FILE
# Split LIST_FILE into line-aligned chunk files named segment_aa, segment_ab,
# ... in the current directory, unless the first existing segment_* file is
# already non-empty. The chunk count is a eighth of the number of lines,
# but never less than one: `split -n l/0` is rejected by split, which used to
# leave short lists (fewer than 8 lines) without any chunk file at all.
chunk_segments() {
 for f in segment_*
 do
  # Only the first match is inspected; an unmatched glob stays literal and
  # simply fails the -s test.
  if [ -s "$f" ]
  then
   return 0
  fi
  break
 done
 n=$(wc -l < "$1")
 m=$((n / 8))
 if [ "$m" -lt 1 ]
 then
  m=1
 fi
 split -n "l/$m" "$1" segment_
}

chunk_segments "$SEGS"

# Credentials must be supplied by the caller's environment (or by the AWS
# tooling's own configuration); they are never stored in this file.
# SECURITY: this script previously embedded a live access key, secret key and
# passphrase at this point. Values that were ever committed must be treated
# as compromised and rotated.
# `export NAME` without a value only marks the variable for export: a value
# that is already set is passed on to child processes, an unset one stays
# unset.
export AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY
export PASSPHRASE

# Downloads land in cdx/warc/<segment>/<file name>, relative to the directory
# the script is in at this point; errors are appended to cdx/warc/errlog.
mkdir -p cdx/warc
cd cdx/warc || exit 1

# warc_rows SEGMENT PATHS_FILE
# Print one TAB-separated row "SEGMENT<TAB>file name<TAB>full path" for every
# line of PATHS_FILE that contains SEGMENT as a whole word.
# printf is used because $'\t' is not expanded inside double quotes, so the
# former echo emitted the characters $'\t' literally and parallel's --colsep
# never saw three columns.
warc_rows() {
    grep -F -w -- "$1" "$2" | while IFS= read -r c
    do
        printf '%s\t%s\t%s\n' "$1" "${c##*/}" "$c"
    done
}

# The chunk files and the path listing live two levels up, in the directory
# the script was in before the cd above, so they are addressed via ../../ .
for sf in ../../segment_*
do
    # An unmatched glob stays literal; skip it instead of failing on it.
    [ -e "$sf" ] || continue
    while IFS= read -r s
    do
        [ -n "$s" ] || continue
        mkdir -p "$s"
        # Up to 8 transfers run at once. In the job template {1} is the
        # segment, {2} the file name and {3} the full path (the object key).
        # The trailing redirection writes a timestamp plus the key, followed
        # by anything the transfer printed on stderr, to errlog.
        # NOTE(review): the original command line mixed curl options with
        # `aws s3 cp` arguments and could not fetch anything; the AWS CLI
        # form is assumed to be the intended one -- confirm.
        warc_rows "$s" "../../$wf" |
        parallel --colsep '\t' --will-cite -j 8 \
            "aws s3 cp --only-show-errors s3://commoncrawl/'{3}' '{1}/{2}' 2> >( { echo \$(date +%D:%T) '{3}' ; cat ; } >>errlog)"
    done < "$sf"
done