comparison bin/getcc_multi.aws @ 163:348f4a31228f

Bug fix for the first run (when no *.warc.gz files exist yet); try to log thread and process-group info; stagger the aws launches
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 31 Oct 2023 14:01:50 +0000
parents d0dbfefd6fc0
children 143d2c6d56da
comparison
equal deleted inserted replaced
162:72631d4ac30b 163:348f4a31228f
35 # max_concurrent_requests = 1 35 # max_concurrent_requests = 1
36 # multipart_chunksize = 32MB 36 # multipart_chunksize = 32MB
37 37
38 s=$(grep -Eow "[0-9]*\.$SEG" $wf | head -1) 38 s=$(grep -Eow "[0-9]*\.$SEG" $wf | head -1)
39 mkdir -p $s/orig/warc 39 mkdir -p $s/orig/warc
40 fgrep -v -f <(cd $s/orig/warc && ls *.warc.gz) <(fgrep -w $s warc.paths) > /tmp/hst/$s 40 fgrep -v -f <(cd $s/orig/warc && ls *.warc.gz || pass) <(fgrep -w $s warc.paths) > /tmp/hst/$s
41 41
42 split -a 1 --numeric-suffixes=1 -n l/$nthreads /tmp/hst/$s /tmp/hst/${s}_ 42 split -a 1 --numeric-suffixes=1 -n l/$nthreads /tmp/hst/$s /tmp/hst/${s}_
43 43
44 seq 1 $nthreads | while read i 44 seq 1 $nthreads | while read i
45 do 45 do
46 cat /tmp/hst/${s}_$i | while read f 46 cat /tmp/hst/${s}_$i | {
47 printf "thread\t%s\t%s\t%s\n" $i $$ $(ps -o pgid= -p "$$") >> errlog_${SEG}_$i
48 while read f
47 do 49 do
48 g=$s/orig/warc/${f##*/} 50 g=$s/orig/warc/${f##*/}
49 if [ ! -f "$g" ] 51 if [ ! -f "$g" ]
50 then 52 then
51 aws s3 cp s3://commoncrawl/$f $g $(cat debug) --only-show-errors 2> >( { echo $(date +%D:%T) $f ; cat ; } >>errlog_${SEG}_$i ) 53 aws s3 cp s3://commoncrawl/$f $g $(cat debug) --only-show-errors 2> >( { echo $(date +%D:%T) $f ; cat ; } >>errlog_${SEG}_$i )
52 fi 54 fi
53 done & 55 done &
56 sleep 30
57 }
54 done 58 done