Mercurial > hg > cc > azure
changeset 21:a214fd3a8001
use /var/data, don't store but include subproc _inside_ tryread
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Sun, 21 Oct 2018 12:41:10 +0000 |
parents | 0f4a0f4e38d4 |
children | 60d4042dab26 |
files | workers/bin/timedWhich.sh |
diffstat | 1 files changed, 18 insertions(+), 19 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/workers/bin/timedWhich.sh Sat Oct 20 16:13:58 2018 +0000
+++ b/workers/bin/timedWhich.sh Sun Oct 21 12:41:10 2018 +0000
@@ -7,7 +7,7 @@
 #set -e -o pipefail
 echo $$ > test1.pid
 proc=$1
-res=res$proc
+res=/var/data/res$proc
 home=$2
 shift 2
 function lrand {
@@ -28,24 +28,26 @@
 set -o pipefail
 until if [ $((m+=1)) -gt 5 ]; then echo "tried to read $1 5 times w/o success, giving up" 1>&2; return 1; fi && \
 curl -s --insecure -o - "$1"| zcat | fgrep msgtype=response | \
-{ egrep -i '"WARC-Target-URI":"https?:' || : ;} > $2
+{ egrep -i '"WARC-Target-URI":"https?:' || : ;} | doit
 do
 # try to avoid lockstep retries
 echo ${PIPESTATUS[@]} 1>&2
 sleep $(lrand 10)
- echo $(date) $2 retry number $m 1>&2
+ echo $(date) retry number $m 1>&2
 done
 set +o pipefail
 }
 trap "{
- #set -e -o pipefail
- ln -s ../nohup.cc .
- tar -czhf - CC* $res | \
+ set -e -o pipefail
+ cd /var/data
+ tar -czhf - CC* res* | \
 ssh -o StrictHostKeyChecking=no -q $home \"{ cd data
 mkdir -p which
 cd which
 tar -xzf - ; } 2>>errs\"
- rm -rf $res CC* ifile.txt *.pid
+ rm -rf res* CC*
+ cd
+ rm ifile.txt *.pid
 ( sleep 5 ; rm nohup.cc ) &
 }" EXIT
 mkdir -p $res
@@ -54,24 +56,21 @@
 sleep $pause
 echo \# $(date) > $log
 pRes=0
+doit ()
+{
+echo -n "# $(date) $id " >> $log
+tee >(wc -l >> $log) |\
+ parallel --round-robin --pipe --block-size 2M -j $wp "_timedWhich.sh {#} >> $res/$cci.{#} 2>>$res/errs{#}" || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2
+}
 while read s id
 do
 url="https://commoncrawl.s3.amazonaws.com/$s"
- ccm=${s##*/wat/}
- cci=${ccm%%-ip*}
+ cci=$(echo $s | tr '/-' ' ' | awk '{print $3,$4,$8,$13}' |tr ' ' \-)
 export ID=$id
 echo \# $(date) "running |$proc|$home|$pause|$wp|$id|" >> $log
 # Experimental retry loop
- tryRead "$url" crawl$id
- if [ -s crawl$id ]
- then
- echo \# $(date) $id $(wc -l crawl$id) >> $log
- parallel --round-robin --pipe --block-size 2M -j $wp "_timedWhich.sh {#} >> $res/$cci.{#} 2>>$res/errs{#}" < crawl$id || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2
- else
- echo "crawl$id empty" 1>&2
- fi
- rm crawl$id
- cat $res/$cci.* > $cci
+ tryRead "$url"
+ cat $res/$cci.* > /var/data/$cci
 done < ifile.txt 2>> $res/errs || pRes=$?
 echo \# $(date) main loop exit code=$pRes >> $log
 rm .running
```