changeset 157:463fc7b09119

Convert to single-threaded downloading; use AWS CLI profile settings (adaptive retry mode, one concurrent request) to improve performance under heavy throttling
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 24 Oct 2023 14:34:58 +0100
parents adb1e22ad708
children 5d1c3359e210
files bin/getcc.aws
diffstat 1 files changed, 29 insertions(+), 13 deletions(-) [+]
line wrap: on
line diff
--- a/bin/getcc.aws	Tue Oct 24 14:26:36 2023 +0100
+++ b/bin/getcc.aws	Tue Oct 24 14:34:58 2023 +0100
@@ -2,8 +2,9 @@
 # Usage: getcc.aws [archive, e.g. CC-MAIN-2019-35] [file listing segment numbers]
 ARCHIVE="$1"
 shift
-if [ "$1" ]
+if [ "$1" = -w ]
 then
+ shift
  wait="; sleep $1"
  shift
 fi
@@ -21,7 +22,7 @@
  curl --retry 4 -s ${WARCS} | gzip -dc > $wf
 fi
 
-if [ ! -f all_segments ]
+if [ ! -f $SEGS ]
 then
  cut -f 4 -d / $wf |uniq > all_segments
 fi
@@ -33,21 +34,36 @@
  split -n l/$m $SEGS segment_
 fi
 
-export AWS_ACCESS_KEY_ID="AKIAIKBLNO2XNVMWM5JA"
-export AWS_SECRET_ACCESS_KEY="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"
-export PASSPHRASE="annebooththompson"
-
-mkdir -p cdx/warc
-cd cdx/warc
-cat ../../$cf|\
+#export AWS_ACCESS_KEY_ID="AKIAIKBLNO2XNVMWM5JA"
+#export AWS_SECRET_ACCESS_KEY="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"
+#export PASSPHRASE="annebooththompson"
+#export AWS_RETRY_MODE=adaptive
+#export AWS_MAX_ATTEMPTS=100
+# Set these in ~/.aws/credentials as follows
+# [hst]
+# aws_access_key_id = AKIAIKBLNO2XNVMWM5JA
+# aws_secret_access_key = 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
+# And these in ~/.aws/config as follows
+# [profile hst]
+# retry_mode = adaptive
+# max_attempts = 100
+# s3 =
+#     multipart_threshold = 4GB
+#     max_concurrent_requests = 1
+#     multipart_chunksize = 32MB
 
 for sf in segment_*
 do
     for s in $(cat $sf)
     do
-	mkdir -p $s
-    	fgrep -w $s $wf | while read c; do echo "$s$'\t'${c##*/}$'\t'$c"; done |\
-	parallel --colsep '\t' --will-cite -j 8 \
-	    "curl -sSo '{1}/{2}' aws s3 cp s3://commoncrawl/'{3}'  2> >( { echo \$(date +%D:%T) '{3}' ; cat ; } >>errlog)"
+     	  fgrep -w $s $wf  |\
+	  while read f
+          do
+            g=$s/orig/warc/${f##*/}
+	    if [ ! -f "$g" ]
+            then
+              aws s3 cp s3://commoncrawl/$f $g $(cat debug) --only-show-errors  2> >( { echo $(date +%D:%T) $f ; cat ; } >>errlog )
+	    fi
+          done
     done
 done