# HG changeset patch # User Henry S. Thompson # Date 1698760910 0 # Node ID 348f4a31228f2e956e3a037ba8fbbf1838d83931 # Parent 72631d4ac30bf649e3f5e2743e7d5eb78cdf9e83 bug-fix wrt 1st time, try to log thread group info, stagger aws launches diff -r 72631d4ac30b -r 348f4a31228f bin/getcc_multi.aws --- a/bin/getcc_multi.aws Mon Oct 30 12:19:53 2023 +0000 +++ b/bin/getcc_multi.aws Tue Oct 31 14:01:50 2023 +0000 @@ -37,13 +37,15 @@ s=$(grep -Eow "[0-9]*\.$SEG" $wf | head -1) mkdir -p $s/orig/warc -fgrep -v -f <(cd $s/orig/warc && ls *.warc.gz) <(fgrep -w $s warc.paths) > /tmp/hst/$s +fgrep -v -f <(cd $s/orig/warc && ls *.warc.gz || pass) <(fgrep -w $s warc.paths) > /tmp/hst/$s split -a 1 --numeric-suffixes=1 -n l/$nthreads /tmp/hst/$s /tmp/hst/${s}_ seq 1 $nthreads | while read i do - cat /tmp/hst/${s}_$i | while read f + cat /tmp/hst/${s}_$i | { + printf "thread\t%s\t%s\t%s\n" $i $$ $(ps -o pgid= -p "$$") >> errlog_${SEG}_$i + while read f do g=$s/orig/warc/${f##*/} if [ ! -f "$g" ] @@ -51,4 +53,6 @@ aws s3 cp s3://commoncrawl/$f $g $(cat debug) --only-show-errors 2> >( { echo $(date +%D:%T) $f ; cat ; } >>errlog_${SEG}_$i ) fi done & + sleep 30 + } done
#
# Review notes on the changeset above (comment lines only; the patch text is untouched).
#
# What this is: a Mercurial changeset export for bin/getcc_multi.aws
# (parent 72631d4ac30b -> node 348f4a31228f) containing two hunks,
# "@@ -37,13 +37,15 @@" and "@@ -51,4 +53,6 @@".
#
# NOTE(review): in this copy the whole export sits on one physical line. A
# unified diff needs one line per header / context / '+' / '-' entry, so this
# copy presumably cannot be applied as-is. The first hunk header promises
# 13 old / 15 new lines but only 10 / 12 non-blank entries are visible, i.e.
# blank context lines and indentation have been lost. Recover the changeset
# from the repository rather than re-splitting this text by hand.
#
# NOTE(review): the added "ls *.warc.gz || pass" uses "pass", which is not a
# shell builtin. Unless a command or function of that name exists on the
# target host, the fallback itself fails with "command not found" when ls
# finds no matching files. "|| true" or "|| :" is the usual no-op -- confirm
# the intent (first run, empty warc directory).
#
# NOTE(review): before the change, "done &" backgrounded the whole
# "cat ... | while read f" pipeline. After it, "done &" backgrounds only the
# inner "while read f" loop inside the "{ ... }" group. Bash documents that,
# without job control, a background command with no explicit input
# redirection gets stdin from /dev/null, so the loop may no longer read the
# list piped in by cat. Verify; "done <&0 &" is a possible remedy.
#
# NOTE(review): the printf logs "$$" and the pgid of "$$". Bash documents
# that "$$" in a subshell is the PID of the invoking shell, not the subshell,
# so every errlog_${SEG}_$i probably records the same pid/pgid. "$BASHPID"
# may be what was wanted for per-thread information -- confirm.
#
# The added "sleep 30" runs in the foreground of the "{ ... }" group, so each
# iteration of the outer "while read i" loop is held for 30 seconds; this is
# the "stagger aws launches" part of the commit message.