changeset 159:ebff60e85c59

now does one named segment only
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 24 Oct 2023 16:59:23 +0100
parents 5d1c3359e210
children afd2ece1fb22
files bin/getcc.aws
diffstat 1 files changed, 11 insertions(+), 34 deletions(-) [+]
line wrap: on
line diff
--- a/bin/getcc.aws	Tue Oct 24 16:58:44 2023 +0100
+++ b/bin/getcc.aws	Tue Oct 24 16:59:23 2023 +0100
@@ -1,14 +1,8 @@
 # courtesy wwaites
-# Usage: getcc.aws [archive, e.g. CC-MAIN-2019-35] [file listing segment numbers]
+# Usage: getcc.aws ARCHIVE SEGMENT_ID   (e.g. getcc.aws CC-MAIN-2019-35 68); both arguments are required
+# Handles a single segment in a single thread
 ARCHIVE="$1"
-shift
-if [ "$1" = -w ]
-then
- shift
- wait="; sleep $1"
- shift
-fi
-SEGS="${1-all_segments}"
+SEG="${2:?usage: getcc.aws ARCHIVE SEGMENT_ID}"  # abort with a usage message instead of running with an empty segment ID
 
 wf=warc.paths
 
@@ -22,18 +16,6 @@
  curl --retry 4 -s ${WARCS} | gzip -dc > $wf
 fi
 
-if [ ! -f $SEGS ]
-then
- cut -f 4 -d / $wf |uniq > all_segments
-fi
-
-if [ ! -s "$(ls segment_* | head -1)" ]
-then
- n=$(cat $SEGS | wc -l)
- m=$((n / 8))
- split -n l/$m $SEGS segment_
-fi
-
 #export AWS_ACCESS_KEY_ID="AKIAIKBLNO2XNVMWM5JA"
 #export AWS_SECRET_ACCESS_KEY="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"
 #export PASSPHRASE="annebooththompson"
@@ -52,18 +34,13 @@
 #     max_concurrent_requests = 1
 #     multipart_chunksize = 32MB
 
-for sf in segment_*
+s=$(grep -Eow "[0-9]*\.$SEG" "$wf" | head -1); : "${s:?no segment ending in .$SEG found in $wf}"  # first matching segment name; abort if none, otherwise grep below would wait on stdin
+grep -Fw -- "$s" "$wf" |\
+while IFS= read -r f
 do
-    for s in $(cat $sf)
-    do
-     	  fgrep -w $s $wf  |\
-	  while read f
-          do
-            g=$s/orig/warc/${f##*/}
-	    if [ ! -f "$g" ]
-            then
-              aws s3 cp s3://commoncrawl/$f $g $(cat debug) --only-show-errors  2> >( { echo $(date +%D:%T) $f ; cat ; } >>errlog )
-	    fi
-          done
-    done
+  g="$s/orig/warc/${f##*/}"  # local target: <segment>/orig/warc/<file name of the listed path>
+  if [ ! -f "$g" ]           # only fetch files that do not already exist locally
+  then  # NOTE(review): $(cat debug) below stays unquoted, presumably so extra aws options word-split -- confirm
+    aws s3 cp "s3://commoncrawl/$f" "$g" $(cat debug) --only-show-errors  2> >( { echo "$(date +%D:%T) $f" ; cat ; } >>"errlog_$SEG" )
+  fi
 done