Mercurial > hg > cc > cirrus_work
changeset 163:348f4a31228f
bug-fix wrt 1st time,
try to log thread group info,
stagger aws launches
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 31 Oct 2023 14:01:50 +0000 |
parents | 72631d4ac30b |
children | 4315a36b1672 |
files | bin/getcc_multi.aws |
diffstat | 1 files changed, 6 insertions(+), 2 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/getcc_multi.aws Mon Oct 30 12:19:53 2023 +0000
+++ b/bin/getcc_multi.aws Tue Oct 31 14:01:50 2023 +0000
@@ -37,13 +37,15 @@
 s=$(grep -Eow "[0-9]*\.$SEG" $wf | head -1)
 mkdir -p $s/orig/warc
-fgrep -v -f <(cd $s/orig/warc && ls *.warc.gz) <(fgrep -w $s warc.paths) > /tmp/hst/$s
+fgrep -v -f <(cd $s/orig/warc && ls *.warc.gz || pass) <(fgrep -w $s warc.paths) > /tmp/hst/$s
 split -a 1 --numeric-suffixes=1 -n l/$nthreads /tmp/hst/$s /tmp/hst/${s}_
 seq 1 $nthreads | while read i
 do
- cat /tmp/hst/${s}_$i | while read f
+ cat /tmp/hst/${s}_$i | {
+ printf "thread\t%s\t%s\t%s\n" $i $$ $(ps -o pgid= -p "$$") >> errlog_${SEG}_$i
+ while read f
 do
 g=$s/orig/warc/${f##*/}
 if [ ! -f "$g" ]
@@ -51,4 +53,6 @@
 aws s3 cp s3://commoncrawl/$f $g $(cat debug) --only-show-errors 2> >( { echo $(date +%D:%T) $f ; cat ; } >>errlog_${SEG}_$i )
 fi
 done &
+ sleep 30
+ }
 done