Mercurial > hg > cc > azure
comparison workers/bin/timedWhich.sh @ 43:c2b72d29a3ee
update to use _timedWhich.py
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Fri, 30 Nov 2018 18:37:40 +0000 |
parents | 60d4042dab26 |
children |
comparison
legend: equal | deleted | inserted | replaced
42:1d776e96c16a | 43:c2b72d29a3ee |
---|---|
25 touch .running | 25 touch .running |
26 function tryRead { | 26 function tryRead { |
27 m=0 | 27 m=0 |
28 set -o pipefail | 28 set -o pipefail |
29 until if [ $((m+=1)) -gt 5 ]; then echo "tried to read $1 5 times w/o success, giving up" 1>&2; return 1; fi && \ | 29 until if [ $((m+=1)) -gt 5 ]; then echo "tried to read $1 5 times w/o success, giving up" 1>&2; return 1; fi && \ |
30 curl -s --insecure -o - "$1"| zcat | fgrep msgtype=response | \ | 30 curl -s --insecure -o - "$1"| zcat | doit |
31 { egrep -i '"WARC-Target-URI":"https?:' || : ;} | doit | |
32 do | 31 do |
33 # try to avoid lockstep retries | 32 # try to avoid lockstep retries |
34 echo ${PIPESTATUS[@]} 1>&2 | 33 echo ${PIPESTATUS[@]} 1>&2 |
35 sleep $(lrand 10) | 34 sleep $(lrand 10) |
36 echo $(date) retry number $m 1>&2 | 35 echo $(date) retry number $m 1>&2 |
58 pRes=0 | 57 pRes=0 |
59 doit () | 58 doit () |
60 { | 59 { |
61 echo -n "# $(date) $id " >> $log | 60 echo -n "# $(date) $id " >> $log |
62 tee >(wc -l >> $log) |\ | 61 tee >(wc -l >> $log) |\ |
63 parallel --round-robin --pipe --block-size 2M -j $wp "_timedWhich.sh {#} > $res/$cci.{#} 2>>$res/errs{#}" || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2 | 62 parallel --round-robin --pipe --block-size 2M -j $wp "_timedWhich.py {#} > $res/$cci.{#} 2>>$res/errs{#}" || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2 |
64 } | 63 } |
65 while read s id | 64 while read s id |
66 do | 65 do |
67 url="https://commoncrawl.s3.amazonaws.com/$s" | 66 url="https://commoncrawl.s3.amazonaws.com/$s" |
68 cci=$(echo $s | tr '/-' ' ' | awk '{print $3,$4,$8,$13}' |tr ' ' \-) | 67 cci=$(echo $s | tr '/-' ' ' | awk '{print $3,$4,$8,$13}' |tr ' ' \-) |