Mercurial > hg > cc > azure
comparison workers/bin/timedWhich.sh @ 21:a214fd3a8001
use /var/data; don't store the download in a file, but pipe it to the subprocess _inside_ tryRead
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Sun, 21 Oct 2018 12:41:10 +0000 |
parents | 9631fca89cc6 |
children | 60d4042dab26 |
comparison
equal
deleted
inserted
replaced
20:0f4a0f4e38d4 | 21:a214fd3a8001 |
---|---|
5 # If -t, no random wait, just id seconds | 5 # If -t, no random wait, just id seconds |
6 # remove >>errs once tested | 6 # remove >>errs once tested |
7 #set -e -o pipefail | 7 #set -e -o pipefail |
8 echo $$ > test1.pid | 8 echo $$ > test1.pid |
9 proc=$1 | 9 proc=$1 |
10 res=res$proc | 10 res=/var/data/res$proc |
11 home=$2 | 11 home=$2 |
12 shift 2 | 12 shift 2 |
13 function lrand { | 13 function lrand { |
14 # cheap bad little random number generator | 14 # cheap bad little random number generator |
15 echo $(( 1 + ($(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' ') % $1))) | 15 echo $(( 1 + ($(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' ') % $1))) |
26 function tryRead { | 26 function tryRead { |
27 m=0 | 27 m=0 |
28 set -o pipefail | 28 set -o pipefail |
29 until if [ $((m+=1)) -gt 5 ]; then echo "tried to read $1 5 times w/o success, giving up" 1>&2; return 1; fi && \ | 29 until if [ $((m+=1)) -gt 5 ]; then echo "tried to read $1 5 times w/o success, giving up" 1>&2; return 1; fi && \ |
30 curl -s --insecure -o - "$1"| zcat | fgrep msgtype=response | \ | 30 curl -s --insecure -o - "$1"| zcat | fgrep msgtype=response | \ |
31 { egrep -i '"WARC-Target-URI":"https?:' || : ;} > $2 | 31 { egrep -i '"WARC-Target-URI":"https?:' || : ;} | doit |
32 do | 32 do |
33 # try to avoid lockstep retries | 33 # try to avoid lockstep retries |
34 echo ${PIPESTATUS[@]} 1>&2 | 34 echo ${PIPESTATUS[@]} 1>&2 |
35 sleep $(lrand 10) | 35 sleep $(lrand 10) |
36 echo $(date) $2 retry number $m 1>&2 | 36 echo $(date) retry number $m 1>&2 |
37 done | 37 done |
38 set +o pipefail | 38 set +o pipefail |
39 } | 39 } |
40 trap "{ | 40 trap "{ |
41 #set -e -o pipefail | 41 set -e -o pipefail |
42 ln -s ../nohup.cc . | 42 cd /var/data |
43 tar -czhf - CC* $res | \ | 43 tar -czhf - CC* res* | \ |
44 ssh -o StrictHostKeyChecking=no -q $home \"{ cd data | 44 ssh -o StrictHostKeyChecking=no -q $home \"{ cd data |
45 mkdir -p which | 45 mkdir -p which |
46 cd which | 46 cd which |
47 tar -xzf - ; } 2>>errs\" | 47 tar -xzf - ; } 2>>errs\" |
48 rm -rf $res CC* ifile.txt *.pid | 48 rm -rf res* CC* |
49 cd | |
50 rm ifile.txt *.pid | |
49 ( sleep 5 ; rm nohup.cc ) & | 51 ( sleep 5 ; rm nohup.cc ) & |
50 }" EXIT | 52 }" EXIT |
51 mkdir -p $res | 53 mkdir -p $res |
52 log=$res/log | 54 log=$res/log |
53 # Don't all start at once | 55 # Don't all start at once |
54 sleep $pause | 56 sleep $pause |
55 echo \# $(date) > $log | 57 echo \# $(date) > $log |
56 pRes=0 | 58 pRes=0 |
59 doit () | |
60 { | |
61 echo -n "# $(date) $id " >> $log | |
62 tee >(wc -l >> $log) |\ | |
63 parallel --round-robin --pipe --block-size 2M -j $wp "_timedWhich.sh {#} >> $res/$cci.{#} 2>>$res/errs{#}" || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2 | |
64 } | |
57 while read s id | 65 while read s id |
58 do | 66 do |
59 url="https://commoncrawl.s3.amazonaws.com/$s" | 67 url="https://commoncrawl.s3.amazonaws.com/$s" |
60 ccm=${s##*/wat/} | 68 cci=$(echo $s | tr '/-' ' ' | awk '{print $3,$4,$8,$13}' |tr ' ' \-) |
61 cci=${ccm%%-ip*} | |
62 export ID=$id | 69 export ID=$id |
63 echo \# $(date) "running |$proc|$home|$pause|$wp|$id|" >> $log | 70 echo \# $(date) "running |$proc|$home|$pause|$wp|$id|" >> $log |
64 # Experimental retry loop | 71 # Experimental retry loop |
65 tryRead "$url" crawl$id | 72 tryRead "$url" |
66 if [ -s crawl$id ] | 73 cat $res/$cci.* > /var/data/$cci |
67 then | |
68 echo \# $(date) $id $(wc -l crawl$id) >> $log | |
69 parallel --round-robin --pipe --block-size 2M -j $wp "_timedWhich.sh {#} >> $res/$cci.{#} 2>>$res/errs{#}" < crawl$id || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2 | |
70 else | |
71 echo "crawl$id empty" 1>&2 | |
72 fi | |
73 rm crawl$id | |
74 cat $res/$cci.* > $cci | |
75 done < ifile.txt 2>> $res/errs || pRes=$? | 74 done < ifile.txt 2>> $res/errs || pRes=$? |
76 echo \# $(date) main loop exit code=$pRes >> $log | 75 echo \# $(date) main loop exit code=$pRes >> $log |
77 rm .running | 76 rm .running |
78 | 77 |
79 | 78 |