comparison workers/bin/timedWhich.sh @ 21:a214fd3a8001

Use /var/data for results; don't store the downloaded crawl data in a file, but run the subprocess _inside_ tryRead
author Henry S. Thompson <ht@markup.co.uk>
date Sun, 21 Oct 2018 12:41:10 +0000
parents 9631fca89cc6
children 60d4042dab26
comparison
equal deleted inserted replaced
20:0f4a0f4e38d4 21:a214fd3a8001
5 # If -t, no random wait, just id seconds 5 # If -t, no random wait, just id seconds
6 # remove >>errs once tested 6 # remove >>errs once tested
7 #set -e -o pipefail 7 #set -e -o pipefail
8 echo $$ > test1.pid 8 echo $$ > test1.pid
9 proc=$1 9 proc=$1
10 res=res$proc 10 res=/var/data/res$proc
11 home=$2 11 home=$2
12 shift 2 12 shift 2
13 function lrand { 13 function lrand {
14 # cheap bad little random number generator 14 # cheap bad little random number generator
15 echo $(( 1 + ($(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' ') % $1))) 15 echo $(( 1 + ($(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' ') % $1)))
26 function tryRead { 26 function tryRead {
27 m=0 27 m=0
28 set -o pipefail 28 set -o pipefail
29 until if [ $((m+=1)) -gt 5 ]; then echo "tried to read $1 5 times w/o success, giving up" 1>&2; return 1; fi && \ 29 until if [ $((m+=1)) -gt 5 ]; then echo "tried to read $1 5 times w/o success, giving up" 1>&2; return 1; fi && \
30 curl -s --insecure -o - "$1"| zcat | fgrep msgtype=response | \ 30 curl -s --insecure -o - "$1"| zcat | fgrep msgtype=response | \
31 { egrep -i '"WARC-Target-URI":"https?:' || : ;} > $2 31 { egrep -i '"WARC-Target-URI":"https?:' || : ;} | doit
32 do 32 do
33 # try to avoid lockstep retries 33 # try to avoid lockstep retries
34 echo ${PIPESTATUS[@]} 1>&2 34 echo ${PIPESTATUS[@]} 1>&2
35 sleep $(lrand 10) 35 sleep $(lrand 10)
36 echo $(date) $2 retry number $m 1>&2 36 echo $(date) retry number $m 1>&2
37 done 37 done
38 set +o pipefail 38 set +o pipefail
39 } 39 }
40 trap "{ 40 trap "{
41 #set -e -o pipefail 41 set -e -o pipefail
42 ln -s ../nohup.cc . 42 cd /var/data
43 tar -czhf - CC* $res | \ 43 tar -czhf - CC* res* | \
44 ssh -o StrictHostKeyChecking=no -q $home \"{ cd data 44 ssh -o StrictHostKeyChecking=no -q $home \"{ cd data
45 mkdir -p which 45 mkdir -p which
46 cd which 46 cd which
47 tar -xzf - ; } 2>>errs\" 47 tar -xzf - ; } 2>>errs\"
48 rm -rf $res CC* ifile.txt *.pid 48 rm -rf res* CC*
49 cd
50 rm ifile.txt *.pid
49 ( sleep 5 ; rm nohup.cc ) & 51 ( sleep 5 ; rm nohup.cc ) &
50 }" EXIT 52 }" EXIT
51 mkdir -p $res 53 mkdir -p $res
52 log=$res/log 54 log=$res/log
53 # Don't all start at once 55 # Don't all start at once
54 sleep $pause 56 sleep $pause
55 echo \# $(date) > $log 57 echo \# $(date) > $log
56 pRes=0 58 pRes=0
59 doit ()
60 {
61 echo -n "# $(date) $id " >> $log
62 tee >(wc -l >> $log) |\
63 parallel --round-robin --pipe --block-size 2M -j $wp "_timedWhich.sh {#} >> $res/$cci.{#} 2>>$res/errs{#}" || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2
64 }
57 while read s id 65 while read s id
58 do 66 do
59 url="https://commoncrawl.s3.amazonaws.com/$s" 67 url="https://commoncrawl.s3.amazonaws.com/$s"
60 ccm=${s##*/wat/} 68 cci=$(echo $s | tr '/-' ' ' | awk '{print $3,$4,$8,$13}' |tr ' ' \-)
61 cci=${ccm%%-ip*}
62 export ID=$id 69 export ID=$id
63 echo \# $(date) "running |$proc|$home|$pause|$wp|$id|" >> $log 70 echo \# $(date) "running |$proc|$home|$pause|$wp|$id|" >> $log
64 # Experimental retry loop 71 # Experimental retry loop
65 tryRead "$url" crawl$id 72 tryRead "$url"
66 if [ -s crawl$id ] 73 cat $res/$cci.* > /var/data/$cci
67 then
68 echo \# $(date) $id $(wc -l crawl$id) >> $log
69 parallel --round-robin --pipe --block-size 2M -j $wp "_timedWhich.sh {#} >> $res/$cci.{#} 2>>$res/errs{#}" < crawl$id || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2
70 else
71 echo "crawl$id empty" 1>&2
72 fi
73 rm crawl$id
74 cat $res/$cci.* > $cci
75 done < ifile.txt 2>> $res/errs || pRes=$? 74 done < ifile.txt 2>> $res/errs || pRes=$?
76 echo \# $(date) main loop exit code=$pRes >> $log 75 echo \# $(date) main loop exit code=$pRes >> $log
77 rm .running 76 rm .running
78 77
79 78