Mercurial > hg > cc > cirrus_work
changeset 155:56825fc8459d
moved from /beegfs/common-crawl to get under .hg
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 11 Oct 2023 12:51:06 +0100 |
parents | 5d30cd8c6254 |
children | adb1e22ad708 |
files | bin/getcc.aws bin/getidx.aws |
diffstat | 2 files changed, 88 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env bash
# bin/getcc.aws -- download the WARC files of one Common Crawl archive from S3.
# courtesy wwaites
#
# Usage: getcc.aws ARCHIVE [SLEEP [SEGMENT_FILE]]
#   ARCHIVE       archive name, e.g. CC-MAIN-2019-35
#   SLEEP         optional argument for sleep(1), run after every download
#   SEGMENT_FILE  file listing segment numbers, one per line, resolved
#                 relative to the archive directory (default: all_segments)
#
# Environment:
#   CC_ROOT  directory that holds one sub-directory per archive
#            (default: /beegfs/common_crawl)
#
# Needs curl, gzip, GNU split (-n l/N), GNU parallel and the aws CLI.
#
# AWS credentials are deliberately NOT stored in this file.  Supply them the
# way the aws CLI expects (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY in the
# caller's environment, or a configured profile).  The key pair that earlier
# revisions hard-coded here has been published in the repository history and
# should be treated as compromised and rotated.

set -u -o pipefail

# Print an error message on stderr and abort.
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

if [ "$#" -lt 1 ] || [ -z "$1" ]; then
  printf 'Usage: %s ARCHIVE [SLEEP [SEGMENT_FILE]]\n' "${0##*/}" >&2
  exit 2
fi
ARCHIVE="$1"
shift

# Optional pause appended to every download command run by parallel.
# The value ends up inside a shell command line, so only accept something
# sleep(1) understands (a number with an optional s/m/h/d suffix).
pause=""
if [ "${1:-}" ]; then
  [[ "$1" =~ ^[0-9]+([.][0-9]+)?[smhd]?$ ]] || die "invalid sleep interval: $1"
  pause="; sleep $1"
  shift
fi
SEGS="${1-all_segments}"

wf=warc.paths
WARCS="https://data.commoncrawl.org/crawl-data/${ARCHIVE}/${wf}.gz"
archive_dir="${CC_ROOT:-/beegfs/common_crawl}/${ARCHIVE}"

mkdir -p -- "$archive_dir" || die "cannot create $archive_dir"
cd -- "$archive_dir" || die "cannot cd to $archive_dir"

# Fetch the list of WARC paths once.  Download to a scratch name first so a
# failed transfer cannot leave behind an empty $wf that would suppress the
# download on every later run.
if [ ! -f "$wf" ]; then
  if curl --fail --retry 4 -s "$WARCS" | gzip -dc > "$wf.part" \
    && [ -s "$wf.part" ]; then
    mv -- "$wf.part" "$wf" || die "cannot create $wf"
  else
    rm -f -- "$wf.part"
    die "could not fetch $WARCS"
  fi
fi

# The segment number is the 4th '/'-separated field of every path.
if [ ! -f all_segments ]; then
  cut -f 4 -d / "$wf" | uniq > all_segments
fi

# Split the segment list into chunk files segment_aa, segment_ab, ... unless
# chunks from an earlier run are already present (those are then reused as-is,
# whatever SEGMENT_FILE says).
shopt -s nullglob
chunks=(segment_*)
if [ "${#chunks[@]}" -eq 0 ] || [ ! -s "${chunks[0]}" ]; then
  [ -r "$SEGS" ] || die "segment list not found: $SEGS"
  n=$(wc -l < "$SEGS")
  m=$((n / 8))
  # Fewer than 8 segments would ask split for 0 chunks, which it rejects.
  if [ "$m" -lt 1 ]; then
    m=1
  fi
  split -n "l/$m" -- "$SEGS" segment_ || die "cannot split $SEGS"
fi

# For every segment, download each of its WARC files into a directory named
# after the segment.  parallel receives three tab-separated columns:
#   {1} segment   {2} file name   {3} full S3 key
# Each job appends a timestamp, the key and any stderr output to errlog.
# The job command uses process substitution, so parallel must run it with bash.
for sf in segment_*; do
  # Read the chunk on fd 3 so nothing inside the loop can consume it via stdin.
  while IFS= read -r s <&3; do
    [ -n "$s" ] || continue
    mkdir -p -- "$s" || die "cannot create $s"
    grep -F -w -- "$s" "$wf" \
      | while IFS= read -r c; do
          printf '%s\t%s\t%s\n' "$s" "${c##*/}" "$c"
        done \
      | parallel --colsep '\t' --will-cite -j 8 \
          "aws s3 cp s3://commoncrawl/'{3}' '{1}/{2}' --only-show-errors 2> >( { echo \$(date +%D:%T) '{3}' ; cat ; } >>errlog)${pause}"
  done 3< "$sf"
done
#!/usr/bin/env bash
# bin/getidx.aws -- download every file listed in a Common Crawl archive's
# cc-index.paths from S3 into <archive>/cdx/warc.
# courtesy wwaites
#
# Usage: getidx.aws ARCHIVE [SLEEP]
#   ARCHIVE  archive name, e.g. CC-MAIN-2019-35
#   SLEEP    optional argument for sleep(1), run after every download
#
# Environment:
#   CC_ROOT  directory that holds one sub-directory per archive
#            (default: /beegfs/common_crawl)
#
# Needs curl, gzip, GNU parallel and the aws CLI.
#
# AWS credentials are deliberately NOT stored in this file.  Supply them the
# way the aws CLI expects (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY in the
# caller's environment, or a configured profile).  The key pair that earlier
# revisions hard-coded here has been published in the repository history and
# should be treated as compromised and rotated.

set -u -o pipefail

# Print an error message on stderr and abort.
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

if [ "$#" -lt 1 ] || [ -z "$1" ]; then
  printf 'Usage: %s ARCHIVE [SLEEP]\n' "${0##*/}" >&2
  exit 2
fi
ARCHIVE="$1"
shift

# Optional pause appended to every download command run by parallel.
# The value ends up inside a shell command line, so only accept something
# sleep(1) understands (a number with an optional s/m/h/d suffix).
pause=""
if [ "${1:-}" ]; then
  [[ "$1" =~ ^[0-9]+([.][0-9]+)?[smhd]?$ ]] || die "invalid sleep interval: $1"
  pause="; sleep $1"
fi

cf=cc-index.paths
CCs="https://data.commoncrawl.org/crawl-data/${ARCHIVE}/cc-index.paths.gz"
archive_dir="${CC_ROOT:-/beegfs/common_crawl}/${ARCHIVE}"

mkdir -p -- "$archive_dir" || die "cannot create $archive_dir"
cd -- "$archive_dir" || die "cannot cd to $archive_dir"

# Fetch the list of index paths once.  Download to a scratch name first so a
# failed transfer cannot leave behind an empty $cf that would suppress the
# download on every later run.
if [ ! -f "$cf" ]; then
  if curl --fail --retry 4 -s "$CCs" | gzip -dc > "$cf.part" \
    && [ -s "$cf.part" ]; then
    mv -- "$cf.part" "$cf" || die "cannot create $cf"
  else
    rm -f -- "$cf.part"
    die "could not fetch $CCs"
  fi
fi

mkdir -p cdx/warc || die "cannot create cdx/warc"
cd cdx/warc || die "cannot cd to cdx/warc"

# One job per line of $cf: copy the S3 object into the current directory under
# its base name.  Each job appends a timestamp, the key and any stderr output
# to ierrlog.  The job command uses process substitution, so parallel must run
# it with bash.
parallel --will-cite -j 5 \
  "f='{}' && aws s3 cp \"s3://commoncrawl/\$f\" \"\${f##*/}\" --only-show-errors 2> >( { echo \$(date +%D:%T) '{}' ; cat ; } >>ierrlog)${pause}" \
  < "../../$cf"