Mercurial > hg > cc > azure
comparison workers/bin/ptimedWhich.sh @ 46:7a4e49689935
finally got logging sorted
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Mon, 03 Dec 2018 21:10:02 +0000 |
parents | |
children | 2a0dab424418 |
comparison
equal
deleted
inserted
replaced
45:21152d241e1a | 46:7a4e49689935 |
---|---|
#!/bin/bash
# Test script to split CC WAT files across threads
# to tabulate http vs. https by last-modified date:
# Usage: ptimedWhich.sh id home [-t] numWorkerProcesses
#   id                  worker number; results go to /var/data/res<id>
#   home                ssh destination used when results are shipped on exit
#   -t                  no random wait, just id seconds
#   numWorkerProcesses  number of parallel download jobs
# The list of files to process is read from ifile.txt in the current directory.
# NOTE(review): the earlier usage line showed the list arriving on stdin
#   ([echo file file_id] | timedWhich.sh ...) -- confirm which is current.
# remove >>errs once tested
#set -e -o pipefail

# Without this check a short argument list makes 'shift 2' fail silently and
# the run carries on with empty proc/home/worker-count values.
if [ $# -lt 3 ]; then
  echo "Usage: $0 id home [-t] numWorkerProcesses" 1>&2
  exit 2
fi
# Record our PID so the run can be found (and the file removed on exit)
echo $$ > test1.pid
proc=$1
res=/var/data/res$proc
home=$2
shift 2
function lrand {
  # cheap bad little random number generator
  # Prints an integer in 1..$1 (inclusive) on stdout.
  # Arguments: $1 - upper bound, a positive decimal integer
  # Returns:   0 on success; 2 (with a message on stderr) if the bound is
  #            missing or not a positive integer
  # The randomness is a single byte (0..255), so bounds above 256 are never
  # fully covered and the distribution is slightly biased.
  local limit=$1 byte
  if ! [[ $limit =~ ^[1-9][0-9]*$ ]]; then
    echo "lrand: upper bound must be a positive integer, got '$limit'" 1>&2
    return 2
  fi
  # One random byte rendered as an unsigned decimal number
  byte=$(openssl rand 1 2>/dev/null | od -An -tu1 | tr -d ' \n') || byte=
  # openssl missing or silent: fall back to bash's own generator rather than
  # letting the arithmetic below fail on an empty operand
  [[ $byte =~ ^[0-9]+$ ]] || byte=$RANDOM
  echo $(( 1 + (byte % limit) ))
}
# Pick the start-up delay: with -t it is deterministic (the worker id, in
# seconds), otherwise a random 1..60 seconds.
case "$1" in
  -t)
    shift
    pause=$proc
    ;;
  *)
    pause=$(lrand 60)
    ;;
esac
# Number of parallel worker processes
wp=$1
# Marker file that exists for as long as the run is in progress
touch .running
function tryread {
  # Read "URL OUTPUT-PATH" pairs from stdin, one pair per line.  For each
  # pair, fetch URL with curl, gunzip the stream and pipe it through
  # _timedWhich.py into OUTPUT-PATH.  A failed fetch/process pipeline is
  # retried after a short random sleep; after 5 failed attempts on one URL
  # the function gives up and stops reading further input.
  # Arguments: none used
  # Outputs:   progress and retry diagnostics on stderr
  # Returns:   0 once stdin is exhausted, 1 after 5 failures on one URL
  # Everything stays inside this one function (apart from lrand) because it
  # is exported with 'export -f' and run from a fresh 'bash -c'.
  local u o m
  while read -r u o
  do
    m=0
    # A curl or zcat failure must fail the whole pipeline, not just its tail
    set -o pipefail
    until if [ $((m+=1)) -gt 5 ]; then
            echo " tried 5 times w/o success, giving up" 1>&2
            # restore the option on this exit path as well
            set +o pipefail
            return 1
          fi &&
          echo -n \# $(date) "reading $u ..." 1>&2 &&
          curl -s -S --max-time 60 --insecure -o - "$u" |
            { echo "done at " $(date) 1>&2 ; zcat ; } |
            _timedWhich.py > "$o"
    do
      # Log the exit status of each pipeline stage (curl, zcat, python)
      echo \# "${PIPESTATUS[@]}" 1>&2
      # try to avoid lockstep retries
      sleep "$(lrand 10)"
      echo \# $(date) retry number $m 1>&2
    done
    set +o pipefail
  done
}
# EXIT handler: ship everything under /var/data matching CC* and res* to the
# 'home' host (unpacked into data/which there), then remove the local copies
# and this run's bookkeeping files.
function cleanup_on_exit {
  # Stop at the first failure so that local results are not deleted when the
  # upload did not go through.
  set -e -o pipefail
  cd /var/data
  tar -czhf - CC* res* |
    ssh -o StrictHostKeyChecking=no -q "$home" \
      '{ cd data; mkdir -p which; cd which; tar -xzf - ; } 2>>errs'
  rm -rf -- res* CC*
  # Back to the home directory for the bookkeeping files
  cd
  rm -- ifile.txt *.pid
  # Remove nohup.cc a little later, in the background
  ( sleep 5 ; rm nohup.cc ) &
}
trap cleanup_on_exit EXIT
# Per-worker result directory and log file
mkdir -p "$res"
log=$res/log
# Don't all start at once
sleep "$pause"
echo \# $(date) > "$log"
pRes=0
N=$(wc -l< ifile.txt)
# Input lines handed to each tryread job.  Clamp to 1: with fewer input
# lines than workers the plain division gives 0, i.e. 'parallel -N0'.
chunk=$(( N / wp ))
if [ "$chunk" -lt 1 ]; then chunk=1; fi
# tryread runs inside 'bash -c' under parallel, so it and its helper must be
# exported
export -f tryread lrand
# Turn each path in ifile.txt into "URL local-output-file" and fan the pairs
# out to the workers; {#} is parallel's job number, giving one errs file
# per job.
while read -r s
do
  url="https://commoncrawl.s3.amazonaws.com/$s"
  # Output name: fields 3, 4, 8 and 13 of the path after splitting on / and -
  cci=$(echo $s | tr '/-' ' ' | awk '{print $3,$4,$8,$13}' |tr ' ' \-)
  echo "$url" "/var/data/$cci"
done < ifile.txt 2>> "$res/errs" | \
  parallel --pipe -N"$chunk" -j "$wp" "bash -c \"tryread 2>>$res/errs{#}\"" 2>>"$res/errs" || pRes=$?
echo \# $(date) main loop exit code=$pRes >> "$log"
rm .running
75 | |
76 |