Mercurial > hg > cc > cirrus_work
comparison bin/getcc_multi.aws @ 163:348f4a31228f
Bug fix for the first run on a segment (when no *.warc.gz files have been downloaded yet),
try to log thread group info,
stagger aws launches
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 31 Oct 2023 14:01:50 +0000 |
parents | d0dbfefd6fc0 |
children | 143d2c6d56da |
comparison
equal
deleted
inserted
replaced
162:72631d4ac30b | 163:348f4a31228f |
---|---|
35 # max_concurrent_requests = 1 | 35 # max_concurrent_requests = 1 |
36 # multipart_chunksize = 32MB | 36 # multipart_chunksize = 32MB |
37 | 37 |
38 s=$(grep -Eow "[0-9]*\.$SEG" $wf | head -1) | 38 s=$(grep -Eow "[0-9]*\.$SEG" $wf | head -1) |
39 mkdir -p $s/orig/warc | 39 mkdir -p $s/orig/warc |
40 fgrep -v -f <(cd $s/orig/warc && ls *.warc.gz) <(fgrep -w $s warc.paths) > /tmp/hst/$s | 40 fgrep -v -f <(cd $s/orig/warc && ls *.warc.gz || pass) <(fgrep -w $s warc.paths) > /tmp/hst/$s |
41 | 41 |
42 split -a 1 --numeric-suffixes=1 -n l/$nthreads /tmp/hst/$s /tmp/hst/${s}_ | 42 split -a 1 --numeric-suffixes=1 -n l/$nthreads /tmp/hst/$s /tmp/hst/${s}_ |
43 | 43 |
44 seq 1 $nthreads | while read i | 44 seq 1 $nthreads | while read i |
45 do | 45 do |
46 cat /tmp/hst/${s}_$i | while read f | 46 cat /tmp/hst/${s}_$i | { |
47 printf "thread\t%s\t%s\t%s\n" $i $$ $(ps -o pgid= -p "$$") >> errlog_${SEG}_$i | |
48 while read f | |
47 do | 49 do |
48 g=$s/orig/warc/${f##*/} | 50 g=$s/orig/warc/${f##*/} |
49 if [ ! -f "$g" ] | 51 if [ ! -f "$g" ] |
50 then | 52 then |
51 aws s3 cp s3://commoncrawl/$f $g $(cat debug) --only-show-errors 2> >( { echo $(date +%D:%T) $f ; cat ; } >>errlog_${SEG}_$i ) | 53 aws s3 cp s3://commoncrawl/$f $g $(cat debug) --only-show-errors 2> >( { echo $(date +%D:%T) $f ; cat ; } >>errlog_${SEG}_$i ) |
52 fi | 54 fi |
53 done & | 55 done & |
56 sleep 30 | |
57 } | |
54 done | 58 done |