changeset 163:348f4a31228f

Bug fix for the first run (no *.warc.gz files downloaded yet); try to log thread process-group info; stagger aws launches
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 31 Oct 2023 14:01:50 +0000
parents 72631d4ac30b
children 4315a36b1672
files bin/getcc_multi.aws
diffstat 1 files changed, 6 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/bin/getcc_multi.aws	Mon Oct 30 12:19:53 2023 +0000
+++ b/bin/getcc_multi.aws	Tue Oct 31 14:01:50 2023 +0000
@@ -37,13 +37,15 @@
 
 s=$(grep -Eow "[0-9]*\.$SEG" $wf | head -1)
 mkdir -p $s/orig/warc
-fgrep -v -f <(cd $s/orig/warc && ls *.warc.gz) <(fgrep -w $s warc.paths) > /tmp/hst/$s
+fgrep -v -f <(cd $s/orig/warc && ls *.warc.gz || true) <(fgrep -w $s warc.paths) > /tmp/hst/$s  # '|| true': tolerate ls failing when no .warc.gz exist yet (first run)
 
 split -a 1 --numeric-suffixes=1 -n l/$nthreads /tmp/hst/$s /tmp/hst/${s}_
 
 seq 1 $nthreads | while read i
 do
-  cat /tmp/hst/${s}_$i | while read f
+  cat /tmp/hst/${s}_$i | {
+  printf "thread\t%s\t%s\t%s\n" $i $$ $(ps -o pgid= -p "$$") >> errlog_${SEG}_$i
+  while read f
   do
     g=$s/orig/warc/${f##*/}
     if [ ! -f "$g" ]
@@ -51,4 +53,6 @@
       aws s3 cp s3://commoncrawl/$f $g $(cat debug) --only-show-errors  2> >( { echo $(date +%D:%T) $f ; cat ; } >>errlog_${SEG}_$i )
     fi
   done &
+  sleep 30
+  }
 done