changeset 155:56825fc8459d

moved from /beegfs/common-crawl to get under .hg
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 11 Oct 2023 12:51:06 +0100
parents 5d30cd8c6254
children adb1e22ad708
files bin/getcc.aws bin/getidx.aws
diffstat 2 files changed, 88 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/getcc.aws	Wed Oct 11 12:51:06 2023 +0100
@@ -0,0 +1,53 @@
+# Download the WARC files of one Common Crawl archive from S3, by segment.
+# courtesy wwaites
+# Usage: getcc.aws ARCHIVE [SLEEP [SEGFILE]]   (ARCHIVE e.g. CC-MAIN-2019-35)
+#  SLEEP: seconds to sleep after each download.  SEGFILE: file listing the
+#  segment numbers to fetch, one per line (default: all_segments).
+# AWS credentials: via the aws CLI's own lookup (env, ~/.aws); never hard-code.
+ARCHIVE="${1:?usage: getcc.aws ARCHIVE [SLEEP [SEGFILE]]}"
+shift
+wait=
+if [ "$1" ]
+then
+ wait="; sleep $1"
+ shift
+fi
+SEGS="${1-all_segments}"
+wf=warc.paths
+WARCS="https://data.commoncrawl.org/crawl-data/${ARCHIVE}/${wf}.gz"
+
+mkdir -p "/beegfs/common_crawl/${ARCHIVE}"
+cd "/beegfs/common_crawl/${ARCHIVE}" || exit 1
+
+if [ ! -s "$wf" ]
+then
+ # Fetch to a temporary name so a failed download never leaves a bogus list
+ curl --retry 4 -sf "${WARCS}" | gzip -dc > "$wf.tmp" && mv "$wf.tmp" "$wf" || exit 1
+fi
+
+if [ ! -f all_segments ]
+then
+ cut -f 4 -d / "$wf" | uniq > all_segments
+fi
+
+# Split the segment list into batches of about 8, unless already done
+set -- segment_*
+if [ ! -s "$1" ]
+then
+ n=$(wc -l < "$SEGS")
+ m=$((n / 8))
+ [ "$m" -ge 1 ] || m=1   # fewer than 8 segments: split rejects 0 chunks
+ split -n "l/$m" "$SEGS" segment_
+fi
+# Runs in the archive directory, where segment_* and $wf are; one dir per segment
+for sf in segment_*
+do
+    for s in $(cat "$sf")
+    do
+        mkdir -p "$s"
+        # One tab-separated job per WARC of this segment: segment, file name, S3 key
+        grep -Fw -- "$s" "$wf" | while read -r c; do printf '%s\t%s\t%s\n' "$s" "${c##*/}" "$c"; done |\
+        parallel --colsep '\t' --will-cite -j 8 \
+            "aws s3 cp s3://commoncrawl/'{3}' '{1}/{2}' --only-show-errors 2> >( { echo \$(date +%D:%T) '{3}' ; cat ; } >>errlog) $wait"
+    done
+done
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/getidx.aws	Wed Oct 11 12:51:06 2023 +0100
@@ -0,0 +1,35 @@
+# Download the index files listed in cc-index.paths for one Common Crawl
+# archive from S3 into /beegfs/common_crawl/ARCHIVE/cdx/warc.
+# courtesy wwaites
+# Usage: getidx.aws ARCHIVE [SLEEP]   (ARCHIVE e.g. CC-MAIN-2019-35)
+#  SLEEP: seconds to sleep after each download.
+# AWS credentials: via the aws CLI's own lookup (env, ~/.aws); never hard-code.
+ARCHIVE="${1:?usage: getidx.aws ARCHIVE [SLEEP]}"
+shift
+wait=
+if [ "$1" ]
+then
+ wait="; sleep $1"
+fi
+
+cf=cc-index.paths
+CCs="https://data.commoncrawl.org/crawl-data/${ARCHIVE}/${cf}.gz"
+
+mkdir -p "/beegfs/common_crawl/${ARCHIVE}"
+cd "/beegfs/common_crawl/${ARCHIVE}" || exit 1
+
+# Get the list of index file paths, unless a non-empty copy is already here
+if [ ! -s "$cf" ]
+then
+ # Fetch to a temporary name so a failed download never leaves a bogus list
+ curl --retry 4 -sf "${CCs}" | gzip -dc > "$cf.tmp" && mv "$cf.tmp" "$cf" || exit 1
+fi
+
+mkdir -p cdx/warc
+cd cdx/warc || exit 1
+# 5 downloads at a time; every job appends a timestamp line plus anything aws
+# wrote to stderr to ierrlog.  \$f is expanded by the job's shell, not here.
+parallel --will-cite -j 5 \
+  "f='{}' && aws s3 cp s3://commoncrawl/\$f \${f##*/}  --only-show-errors 2> >( { echo \$(date +%D:%T) '{}' ; cat ; } >>ierrlog) $wait" \
+  < "../../$cf"
+