changeset 158:5d1c3359e210

resurrect parallel fetch
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 24 Oct 2023 16:58:44 +0100
parents 463fc7b09119
children ebff60e85c59
files bin/getcc_multi.aws
diffstat 1 files changed, 78 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/getcc_multi.aws	Tue Oct 24 16:58:44 2023 +0100
@@ -0,0 +1,78 @@
+# courtesy wwaites, yossi
+# Usage: getcc_multi.aws [archive, e.g. CC-MAIN-2019-35] [-p nthreads] [file listing segment numbers]
+ARCHIVE="$1"
+shift
+if [ "$1" = -p ]
+then
+ shift
+ nthreads=$1
+ shift
+else
+ nthreads=2
+fi
+SEGS="${1-all_segments}"
+
+wf=warc.paths
+
+WARCS=https://data.commoncrawl.org/crawl-data/${ARCHIVE}/${wf}.gz
+
+mkdir -p /beegfs/common_crawl/${ARCHIVE}
+cd /beegfs/common_crawl/${ARCHIVE}
+
+# Fetch and unpack the WARC path listing unless a non-empty copy is already
+# present.  The download goes to a .part file that is renamed only when it
+# arrived intact, so an interrupted or failed transfer is never mistaken for
+# a valid listing on a later run.  curl -f turns HTTP errors into a failure
+# instead of feeding an error page to gzip.
+if [ ! -s "$wf" ]
+then
+ if curl --retry 4 -sf "${WARCS}" | gzip -dc > "$wf.part" && [ -s "$wf.part" ]
+ then
+  mv -f "$wf.part" "$wf"
+ else
+  rm -f "$wf.part"
+  echo "getcc_multi.aws: failed to fetch ${WARCS}" 1>&2
+  exit 1
+ fi
+fi
+
+if [ ! -f $SEGS ]
+then
+ cut -f 4 -d / $wf |uniq > all_segments
+ SEGS=all_segments
+fi
+
+if [ ! -s "$(ls segment_* | head -1)" ]
+then
+ n=$(cat $SEGS | wc -l)
+ m=$((n / 8))
+ split -n l/$m $SEGS segment_
+fi
+
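+# NOTE: real credentials must never be committed; the values formerly shown here are redacted and should be rotated.
+#export AWS_ACCESS_KEY_ID="<your-access-key-id>"
+#export AWS_SECRET_ACCESS_KEY="<your-secret-access-key>"
+#export PASSPHRASE="<your-passphrase>"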
+#export AWS_RETRY_MODE=adaptive
+#export AWS_MAX_ATTEMPTS=100
+# Set these in ~/.aws/credentials as follows
+# [hst]
+# aws_access_key_id = <your-access-key-id>
+# aws_secret_access_key = <your-secret-access-key>
+# And these in ~/.aws/config as follows
+# [profile hst]
+# retry_mode = adaptive
+# max_attempts = 100
+# s3 =
+#     multipart_threshold = 4GB
+#     max_concurrent_requests = 1
+#     multipart_chunksize = 32MB
+
+echo args $ARCHIVE $nthreads 1>&2
+for sf in segment_*
+do
+    echo sf $sf
+    for s in $(cat $sf)
+    do
+	  echo s $s
+	  mkdir -p $s
+     	  fgrep -w $s $wf | tee >( echo nf $(wc -l) 1>&2 ) |\
+	    parallel --will-cite -j $nthreads \
+            "echo '{#}' 1>&2
+             f='{}'
+             g=$s/orig/warc/\${f##*/}
+	     echo \|\$f\|\$g\| 1>&2
+ 	     if [ ! -f \$g ]
+             then
+              aws s3 cp s3://commoncrawl/\$f \$g --only-show-errors  2> >( { echo \$(date +%D:%T) \$g ; cat ; } >>m_errlog )
+	     fi
+            "
+    done
+done