comparison bin/getcc_multi.aws @ 210:6faed9e5d9c9

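Use 2-digit numeric suffixes for the split per-thread file lists, and in the outer loop wait only for "aws s3" processes whose command line matches the current segment, rather than for any aws s3 process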
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 11 Jan 2024 16:43:16 +0000
parents 10c87f5c704d
children
comparison
equal deleted inserted replaced
209:b6669d78a5d9 210:6faed9e5d9c9
38 echo $(date) start $SEG 38 echo $(date) start $SEG
39 s=$(grep -Eow "[0-9]*\.$SEG" $wf | head -1) 39 s=$(grep -Eow "[0-9]*\.$SEG" $wf | head -1)
40 mkdir -p $s/orig/warc 40 mkdir -p $s/orig/warc
41 fgrep -v -f <(cd $s/orig/warc && ls *.warc.gz || :) <(fgrep -w $s warc.paths) > /tmp/hst/$s 41 fgrep -v -f <(cd $s/orig/warc && ls *.warc.gz || :) <(fgrep -w $s warc.paths) > /tmp/hst/$s
42 42
43 split -a 1 --numeric-suffixes=1 -n l/$nthreads /tmp/hst/$s /tmp/hst/${s}_ 43 split -a 2 --numeric-suffixes=1 -n l/$nthreads /tmp/hst/$s /tmp/hst/${s}_
44 44 printf "%02d\n" $(seq 1 $nthreads) | while read i
45 seq 1 $nthreads | while read i
46 do 45 do
47 cat /tmp/hst/${s}_$i | { 46 cat /tmp/hst/${s}_$i | {
48 printf "thread\t%s\t%s\t%s\n" $i $$ $(ps -o pgid= -p "$$") >> errlog_${SEG}_$i 47 printf "thread\t%s\t%s\t%s\n" $i $$ $(ps -o pgid= -p "$$") >> errlog_${SEG}_$i
49 while read f 48 while read f
50 do 49 do
55 fi 54 fi
56 done & 55 done &
57 sleep 30 56 sleep 30
58 } 57 }
59 done 58 done
60 while pgrep -a aws |grep -c s3; do sleep 60; done 59 while pgrep -a aws |grep -c "aws s3.*${s}"; do sleep 60; done
61 echo $(date) end $SEG 60 echo $(date) end $SEG