comparison workers/bin/timedWhich.sh @ 43:c2b72d29a3ee

update to use _timedWhich.py
author Henry S. Thompson <ht@markup.co.uk>
date Fri, 30 Nov 2018 18:37:40 +0000
parents 60d4042dab26
children
comparison
equal deleted inserted replaced
42:1d776e96c16a 43:c2b72d29a3ee
25 touch .running 25 touch .running
26 function tryRead { 26 function tryRead {
27 m=0 27 m=0
28 set -o pipefail 28 set -o pipefail
29 until if [ $((m+=1)) -gt 5 ]; then echo "tried to read $1 5 times w/o success, giving up" 1>&2; return 1; fi && \ 29 until if [ $((m+=1)) -gt 5 ]; then echo "tried to read $1 5 times w/o success, giving up" 1>&2; return 1; fi && \
30 curl -s --insecure -o - "$1"| zcat | fgrep msgtype=response | \ 30 curl -s --insecure -o - "$1"| zcat | doit
31 { egrep -i '"WARC-Target-URI":"https?:' || : ;} | doit
32 do 31 do
33 # try to avoid lockstep retries 32 # try to avoid lockstep retries
34 echo ${PIPESTATUS[@]} 1>&2 33 echo ${PIPESTATUS[@]} 1>&2
35 sleep $(lrand 10) 34 sleep $(lrand 10)
36 echo $(date) retry number $m 1>&2 35 echo $(date) retry number $m 1>&2
58 pRes=0 57 pRes=0
59 doit () 58 doit ()
60 { 59 {
61 echo -n "# $(date) $id " >> $log 60 echo -n "# $(date) $id " >> $log
62 tee >(wc -l >> $log) |\ 61 tee >(wc -l >> $log) |\
63 parallel --round-robin --pipe --block-size 2M -j $wp "_timedWhich.sh {#} > $res/$cci.{#} 2>>$res/errs{#}" || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2 62 parallel --round-robin --pipe --block-size 2M -j $wp "_timedWhich.py {#} > $res/$cci.{#} 2>>$res/errs{#}" || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2
64 } 63 }
65 while read s id 64 while read s id
66 do 65 do
67 url="https://commoncrawl.s3.amazonaws.com/$s" 66 url="https://commoncrawl.s3.amazonaws.com/$s"
68 cci=$(echo $s | tr '/-' ' ' | awk '{print $3,$4,$8,$13}' |tr ' ' \-) 67 cci=$(echo $s | tr '/-' ' ' | awk '{print $3,$4,$8,$13}' |tr ' ' \-)