changeset 161:d0dbfefd6fc0

forget parallel, just do (default 2) parallel single threads
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 25 Oct 2023 23:01:59 +0100
parents afd2ece1fb22
children 72631d4ac30b
files bin/getcc_multi.aws
diffstat 1 files changed, 20 insertions(+), 44 deletions(-) [+]
line wrap: on
line diff
--- a/bin/getcc_multi.aws	Wed Oct 25 23:00:45 2023 +0100
+++ b/bin/getcc_multi.aws	Wed Oct 25 23:01:59 2023 +0100
@@ -1,16 +1,9 @@
-# courtesy wwaites, yossi
-# Usage: getcc_multi.aws [archive, e.g. CC-MAIN-2019-35] [-p nthreads] [file listing segment numbers]
+# courtesy wwaites
+# Usage: getcc_multi.aws <archive, e.g. CC-MAIN-2019-35> <segment ID, e.g. 68> [nthreads, default 2]
+# Single segment, multiple threads
 ARCHIVE="$1"
-shift
-if [ "$1" = -p ]
-then
- shift
- nthreads=$1
- shift
-else
- nthreads=2
-fi
-SEGS="${1-all_segments}"
+SEG=$2
+nthreads=${3:-2}
 
 wf=warc.paths
 
@@ -24,19 +17,6 @@
  curl --retry 4 -s ${WARCS} | gzip -dc > $wf
 fi
 
-if [ ! -f $SEGS ]
-then
- cut -f 4 -d / $wf |uniq > all_segments
- SEGS=all_segments
-fi
-
-if [ ! -s "$(ls segment_* | head -1)" ]
-then
- n=$(cat $SEGS | wc -l)
- m=$((n / 8))
- split -n l/$m $SEGS segment_
-fi
-
 #export AWS_ACCESS_KEY_ID="AKIAIKBLNO2XNVMWM5JA"
 #export AWS_SECRET_ACCESS_KEY="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"
 #export PASSPHRASE="annebooththompson"
@@ -55,24 +35,20 @@
 #     max_concurrent_requests = 1
 #     multipart_chunksize = 32MB
 
-echo args $ARCHIVE $nthreads 1>&2
-for sf in segment_*
+s=$(grep -Eow "[0-9]*\.$SEG" $wf | head -1)
+mkdir -p $s/orig/warc
+fgrep -v -f <(cd $s/orig/warc && ls *.warc.gz) <(fgrep -w $s warc.paths) > /tmp/hst/$s
+
+split -a 1 --numeric-suffixes=1 -n l/$nthreads /tmp/hst/$s /tmp/hst/${s}_
+
+seq 1 $nthreads | while read i
 do
-    echo sf $sf
-    for s in $(cat $sf)
-    do
-	  echo s $s
-	  mkdir -p $s
-     	  fgrep -w $s $wf | tee >( echo nf $(wc -l) 1>&2 ) |\
-	    parallel --will-cite -j $nthreads \
-            "echo '{#}' 1>&2
-             f='{}'
-             g=$s/orig/warc/\${f##*/}
-	     echo \|\$f\|\$g\| 1>&2
- 	     if [ ! -f \$g ]
-             then
-              aws s3 cp s3://commoncrawl/\$f \$g --only-show-errors  2> >( { echo \$(date +%D:%T) \$g ; cat ; } >>m_errlog )
-	     fi
-            "
-    done
+  cat /tmp/hst/${s}_$i | while read f
+  do
+    g=$s/orig/warc/${f##*/}
+    if [ ! -f "$g" ]
+    then
+      aws s3 cp s3://commoncrawl/$f $g $(cat debug) --only-show-errors  2> >( { echo $(date +%D:%T) $f ; cat ; } >>errlog_${SEG}_$i )
+    fi
+  done &
 done