annotate workers/bin/test1.sh @ 8:5db6015689a2

slightly updated, 4 W only, versions of old test4.sh and subcount.sh
author Henry S. Thompson <ht@markup.co.uk>
date Mon, 01 Oct 2018 18:29:39 +0000
parents
children 2fbefb8d1a9e
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
#!/bin/bash
# Test script to split CC WAT files across threads
# to count http: vs. https:
#
# Usage: test1.sh id home
#   id    worker identifier; results are collected in ./res<id>
#   home  ssh destination that receives the results when the script exits
#
# The "file file_id" work list is read from ./ifile.txt, one pair per line.
# NOTE(review): an earlier usage line showed the list being piped on stdin,
# but the main loop redirects its input from ifile.txt -- confirm which
# one is intended.
#
# remove >>errs once tested
#set -e -o pipefail

# Both arguments end up in paths and in the ssh command line, so refuse to
# start without them instead of running with empty values.
if [[ $# -lt 2 || -z $1 || -z $2 ]]; then
  echo "usage: ${0##*/} id home" >&2
  exit 2
fi

# Record this shell's PID (presumably for an external supervisor -- confirm).
echo $$ > test1.pid
proc=$1
res=res$proc
home=$2
# Marker file; it is removed again by the last statement of the script.
touch .running
#######################################
# cheap bad little random number generator
# Prints a pseudo-random integer in the range 1..$1 on stdout.
# The value is derived from a single random byte supplied by
# `openssl rand`, so at most 256 distinct results exist: bounds above 256
# never reach their upper values, and the modulo is slightly biased
# whenever $1 does not divide 256.
# Arguments: $1 - upper bound, a positive decimal integer
# Outputs:   the number on stdout; a diagnostic on stderr for a bad bound
# Returns:   0 on success, 2 if $1 is missing or not a positive integer
#######################################
lrand() {
  local max=$1
  local byte

  if ! [[ $max =~ ^[1-9][0-9]*$ ]]; then
    echo "lrand: upper bound must be a positive integer, got '$max'" >&2
    return 2
  fi

  # -An drops the offset column and -tu1 prints the byte as an unsigned
  # decimal, so nothing is left to parse once the padding is stripped.
  byte=$(openssl rand 1 2>/dev/null | od -An -tu1 | tr -d ' \n')

  # openssl missing or failing leaves $byte empty; fall back to bash's own
  # generator rather than feeding an empty operand to the arithmetic below.
  if ! [[ $byte =~ ^[0-9]+$ ]]; then
    byte=$RANDOM
  fi

  # 10# forces base 10 so a value can never be misread as octal.
  echo $(( 1 + 10#$byte % max ))
}
#######################################
# Download a gzipped file and keep only the lines of interest, retrying on
# failure.
# The stream is fetched with curl, decompressed with zcat, reduced to lines
# containing "msgtype=response", and then to those matching
# "WARC-Target-URI":"http: or https: (case-insensitive); the result is
# written to the output file.
# pipefail is enabled for the duration of the call so that a failure of
# curl or zcat is noticed; the caller's pipefail setting is restored on
# every exit path. Note that a download containing no "msgtype=response"
# line also counts as a failed attempt, because grep exits non-zero when it
# selects nothing; only the final URI filter is allowed to match nothing.
# Arguments: $1 - URL to fetch
#            $2 - file the filtered lines are written to
# Outputs:   per-stage exit codes and retry notices on stderr
# Returns:   0 on success; 1 after 5 failed attempts, in which case the
#            output file is truncated so that a partial download can never
#            be mistaken for a complete one
# Depends on lrand() for the randomised back-off between attempts.
#######################################
tryRead() {
  local url=$1 out=$2
  local -r max_attempts=5
  local attempt
  local status=1
  local restore_pipefail=0
  local -a stages

  if ! [[ -o pipefail ]]; then
    restore_pipefail=1
    set -o pipefail
  fi

  for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
    stages=()
    # On failure the right-hand side of || still sees the PIPESTATUS of the
    # four-stage pipeline, so the individual exit codes can be reported.
    curl -s --insecure -o - "$url" | zcat | grep -F msgtype=response |
      { grep -E -i '"WARC-Target-URI":"https?:' || : ; } > "$out" ||
      stages=("${PIPESTATUS[@]}")

    if (( ${#stages[@]} == 0 )); then
      status=0
      break
    fi

    echo "${stages[*]}" 1>&2
    # No point in sleeping after the final attempt.
    if (( attempt < max_attempts )); then
      # try to avoid lockstep retries
      sleep "$(lrand 10)"
      echo "$(date) $out retry number $attempt" 1>&2
    fi
  done

  if (( status != 0 )); then
    echo "tried to read $url $max_attempts times w/o success, giving up" 1>&2
    # Discard whatever the last failed attempt managed to write.
    : > "$out"
  fi

  if (( restore_pipefail )); then
    set +o pipefail
  fi
  return "$status"
}
#######################################
# EXIT handler: ship the contents of the results directory to the home
# host, then clean up locally.
# Everything in $res (plus nohup.cc, pulled in through a symlink that tar
# dereferences with -h) is streamed as a gzipped tar over ssh and unpacked
# on $home under data/jobs/test1.$proc. Remote stderr is appended to errs.1
# in the remote login directory.
# Globals:   res, home, proc (read; evaluated when the handler runs)
# Outputs:   a diagnostic on stderr if the transfer fails
# The local results directory and nohup.cc are removed only after ssh
# reports success, so a failed transfer no longer destroys the only copy.
# All paths are relative to the directory the script is running in.
# NOTE(review): StrictHostKeyChecking=no disables host-key verification;
# kept as in the original, but confirm this is acceptable for $home.
#######################################
ship_results() {
  # $proc is expanded locally; the remote shell sees a literal directory
  # name. Each remote step must succeed before the next one runs, so a
  # missing data/jobs/ cannot cause extraction into the wrong directory.
  local remote_cmd="{ cd data/jobs/ && mkdir -p test1.$proc && cd test1.$proc && tar -xzf - ; } 2>>errs.1"

  # The subshell keeps the cd local to the transfer, so the clean-up below
  # runs in the original working directory.
  if ! (
    #set -e -o pipefail
    cd "$res" || exit 1
    if [[ -e ../nohup.cc ]]; then
      ln -sf ../nohup.cc .
    fi
    tar -czhf - * |
      ssh -o StrictHostKeyChecking=no -q "$home" "$remote_cmd"
  ); then
    echo "test1.sh: shipping $res to $home failed; leaving $res in place" >&2
    return
  fi

  rm -rf -- "${res:?}"
  # NOTE(review): the delay presumably lets pending output reach nohup.cc
  # before it is deleted -- confirm.
  ( sleep 5 ; rm nohup.cc ) &
}
trap ship_results EXIT
# Main driver: for every "path id" pair in ifile.txt, download and filter
# the file with tryRead, fan the filtered lines out to four count1.sh
# workers via GNU parallel, and finally merge the per-worker outputs into
# $res/tots. Progress goes to $res/log; stderr of the loop goes to
# $res/errs.
mkdir -p "$res"
log=$res/log
# Don't all start at once
sleep "$(lrand 60)"
echo \# $(date) > "$log"
pRes=0
while read -r s id; do
  url="https://commoncrawl.s3.amazonaws.com/$s"
  # Exported for the child processes (presumably read by count1.sh --
  # confirm).
  export ID=$id
  echo $(date) "running |$*|$id|" >> "$log"
  # Experimental retry loop
  if ! tryRead "$url" "crawl$id"; then
    # A failed download may have left partial data behind; never count it.
    echo "crawl$id download failed, skipping" 1>&2
  elif [ -s "crawl$id" ]; then
    echo \# $id $(wc -l "crawl$id") >> "$log"
    # {#} is parallel's job-slot sequence number, so each of the four
    # workers appends to its own output file and its own error file.
    parallel --round-robin --pipe -j 4 "count1.sh >> $res/{#} 2>>$res/errs{#}" < "crawl$id" || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2
  else
    echo "crawl$id empty" 1>&2
  fi
  rm "crawl$id"
done < ifile.txt 2>> "$res/errs" || pRes=$?
# Merge the numbered per-worker files, skipping lines that contain '#',
# and tally identical lines.
( cd "$res" && grep -F -h -v \# [1-9]* ) | sort | uniq -c | tr -d \" > "$res/tots"
echo \# $(date) main loop exit code=$pRes >> "$log"
rm .running