Mercurial > hg > cc > azure
annotate workers/bin/test1.sh @ 8:5db6015689a2
slightly updated, 4 W only, versions of old test4.sh and subcount.sh
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Mon, 01 Oct 2018 18:29:39 +0000 |
parents | |
children | 2fbefb8d1a9e |
rev | line source |
---|---|
8
5db6015689a2
slightly updated, 4 W only, versions of old test4.sh and subcount.sh
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
1 #!/bin/bash |
5db6015689a2
slightly updated, 4 W only, versions of old test4.sh and subcount.sh
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
2 # Test script to split CC WAT files across threads |
5db6015689a2
slightly updated, 4 W only, versions of old test4.sh and subcount.sh
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
3 # to count http: vs. https: |
5db6015689a2
slightly updated, 4 W only, versions of old test4.sh and subcount.sh
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
4 # Usage: test1.sh id home   (the "file file_id" input pairs, one per line, are read from ifile.txt in the current directory, not from stdin) |
5db6015689a2
slightly updated, 4 W only, versions of old test4.sh and subcount.sh
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
5 # remove >>errs once tested |
5db6015689a2
slightly updated, 4 W only, versions of old test4.sh and subcount.sh
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
6 #set -e -o pipefail |
5db6015689a2
slightly updated, 4 W only, versions of old test4.sh and subcount.sh
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
# Record this shell's PID in test1.pid (presumably so a supervisor can
# find or signal the worker -- confirm against whatever launches it).
printf '%s\n' "$$" > test1.pid
# $1: worker id; names the per-worker results directory res<id>.
proc="$1"
res="res${proc}"
# $2: ssh destination that the EXIT handler ships the results to.
home="$2"
# Marker file that exists while the job runs; it is removed by the last
# statement of the script, so it survives only if the run is cut short.
touch .running
5db6015689a2
slightly updated, 4 W only, versions of old test4.sh and subcount.sh
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
#######################################
# Cheap, low-quality random number generator.
# Arguments: $1 - upper bound, a positive integer
# Outputs:   one integer in 1..$1 on stdout (the primary source is a
#            single byte, so results never exceed 256 on that path)
# Returns:   0 on success, 1 if $1 is not a positive integer
#######################################
function lrand {
  local limit=$1 byte
  if ! [[ $limit =~ ^[1-9][0-9]*$ ]]; then
    echo "lrand: upper bound must be a positive integer, got '$limit'" 1>&2
    return 1
  fi
  # One random byte rendered as an unsigned decimal (0-255). -An drops
  # od's offset column and -tu1 reads exactly one byte, so no column
  # slicing is needed.
  byte=$(openssl rand 1 2>/dev/null | od -An -tu1 | tr -d '[:space:]')
  # If openssl is missing or failed, fall back to bash's $RANDOM rather
  # than letting the arithmetic below become a syntax error and leaving
  # the caller's `sleep` with no operand.
  [[ $byte =~ ^[0-9]+$ ]] || byte=$RANDOM
  echo $((1 + byte % limit))
}
5db6015689a2
slightly updated, 4 W only, versions of old test4.sh and subcount.sh
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
#######################################
# Download a gzipped file and keep only the response records that carry
# an http:/https: WARC-Target-URI, retrying on failure.
# Arguments: $1 - URL to fetch
#            $2 - file the filtered lines are written to
# Outputs:   diagnostics on stderr (per-stage exit codes, retry notices)
# Returns:   0 once an attempt succeeds in every pipeline stage;
#            1 after 5 failed attempts, with $2 left empty
# Calls:     lrand (defined elsewhere in this script) for retry jitter
#######################################
function tryRead {
  local url=$1 out=$2
  local -r max_tries=5
  local attempt code ok
  local -a stages

  for ((attempt = 1; attempt <= max_tries; attempt++)); do
    # A missing URI match in the last stage is tolerated (|| :); a failed
    # download, a gunzip error, or no response records at all is not.
    curl -s --insecure -o - "$url" | zcat | grep -F msgtype=response |
      { grep -E -i '"WARC-Target-URI":"https?:' || :; } > "$out"
    # Inspect every stage explicitly instead of toggling `pipefail`, so
    # no shell option can leak into the caller on an early return.
    stages=("${PIPESTATUS[@]}")
    ok=1
    for code in "${stages[@]}"; do
      ((code == 0)) || ok=0
    done
    if ((ok)); then
      return 0
    fi

    echo "${stages[*]}" 1>&2
    if ((attempt < max_tries)); then
      # try to avoid lockstep retries
      sleep "$(lrand 10)"
    fi
    echo "$(date) $out retry number $attempt" 1>&2
  done

  echo "tried to read $url $max_tries times w/o success, giving up" 1>&2
  # Discard whatever the last failed attempt wrote, so a caller testing
  # the file with `-s` never processes a truncated download.
  : > "$out"
  return 1
}
5db6015689a2
slightly updated, 4 W only, versions of old test4.sh and subcount.sh
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
#######################################
# EXIT handler: ship the results directory to the home host, then
# remove the local copies.
# Globals:   res, home, proc (read); start_dir (read)
# Outputs:   diagnostics on stderr
#######################################
function ship_results_and_cleanup {
  #set -e -o pipefail
  if cd "$res"; then
    # Link the nohup output in; tar -h follows the link, so the file's
    # content travels with the results.
    ln -s ../nohup.cc .
    # The remote stderr redirection is opened before the remote cd, so
    # errs.1 lands in the remote login directory. The && chain keeps tar
    # from unpacking into the wrong place if a remote cd/mkdir fails.
    tar -czhf - * |
      ssh -o StrictHostKeyChecking=no -q "$home" \
        "{ cd data/jobs/ && mkdir -p test1.$proc && cd test1.$proc && tar -xzf - ; } 2>>errs.1" ||
      echo "shipping $res to $home failed" 1>&2
    # Return to where the script started before deleting by relative
    # path. NOTE(review): this was a bare `cd` (i.e. $HOME), equivalent
    # only when the script is launched from $HOME -- confirm.
    if cd "$start_dir"; then
      rm -rf "${res:?}"
    fi
  else
    # Never tar up and ship whatever directory we happen to be in.
    echo "cannot enter $res; nothing shipped" 1>&2
  fi
  # Delayed so that anything still writing to nohup.cc can finish first
  # (presumably the nohup wrapper itself -- confirm).
  ( sleep 5 ; rm nohup.cc ) &
}
start_dir=$PWD
trap ship_results_and_cleanup EXIT
5db6015689a2
slightly updated, 4 W only, versions of old test4.sh and subcount.sh
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
# Main job: for each "file file_id" pair in ifile.txt, download and filter
# the file, fan its lines out to 4 parallel count1.sh workers, then merge
# the per-worker output into $res/tots.
mkdir -p "$res"
log=$res/log
# Don't all start at once
sleep "$(lrand 60)"
echo \# $(date) > "$log"
# Becomes non-zero if any download or parallel run fails, or if the loop
# itself cannot run (e.g. ifile.txt is missing).
pRes=0
while read -r s id; do
  url="https://commoncrawl.s3.amazonaws.com/$s"
  # Exported for child processes (presumably read by count1.sh -- confirm).
  export ID=$id
  echo $(date) "running |$*|$id|" >> "$log"
  # Experimental retry loop
  tryRead "$url" "crawl$id" || pRes=1
  if [ -s "crawl$id" ]; then
    echo \# $id $(wc -l crawl$id) >> "$log"
    # {#} is GNU parallel's job sequence number; since the output files
    # are appended to, all chunks share $res/<number> and $res/errs<number>.
    parallel --round-robin --pipe -j 4 "count1.sh >> $res/{#} 2>>$res/errs{#}" < "crawl$id" ||
      { echo "ppfailed $? ${PIPESTATUS[*]}" 1>&2; pRes=1; }
  else
    echo "crawl$id empty" 1>&2
  fi
  # -f: the download step may not have left a file behind.
  rm -f "crawl$id"
done < ifile.txt 2>> "$res/errs" || pRes=$?
# Merge the per-worker files (named by job number, hence [1-9]*), drop
# comment lines, and count identical lines.
( cd "$res" && grep -F -h -v \# [1-9]* ) | sort | uniq -c | tr -d \" > "$res/tots"
echo \# $(date) main loop exit code=$pRes >> "$log"
rm .running