Mercurial > hg > cc > azure
comparison workers/bin/test1.sh @ 10:2fbefb8d1a9e
wrun.sh: usage catchup
invoke.sh: force terminal allocation on workers
test1.sh: support control of the number of worker processes spawned,
support -t to turn off random delay at startup
count1.sh: actually do the counting in subprocs to avoid disk contention
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Mon, 08 Oct 2018 13:17:23 +0000 |
parents | 5db6015689a2 |
children | 36b5d379909a |
comparison
equal
deleted
inserted
replaced
9:55e953e5c66f | 10:2fbefb8d1a9e |
---|---|
1 #!/bin/bash | 1 #!/bin/bash |
2 # Test script to split CC WAT files across threads | 2 # Test script to split CC WAT files across threads |
3 # to count http: vs. https: | 3 # to count http: vs. https: |
4 # Usage: [echo file file_id] | test1.sh id home | 4 # Usage: [echo file file_id] | test1.sh id home [-t] numWorkerProcesses |
5 # If -t, no random wait, just id seconds | |
5 # remove >>errs once tested | 6 # remove >>errs once tested |
6 #set -e -o pipefail | 7 #set -e -o pipefail |
7 echo $$ > test1.pid | 8 echo $$ > test1.pid |
8 proc=$1 | 9 proc=$1 |
9 res=res$proc | 10 res=res$proc |
10 home=$2 | 11 home=$2 |
12 shift 2 | |
13 if [ "$1" = "-t" ] | |
14 then | |
15 shift | |
16 pause=$proc | |
17 else | |
18 pause=$(lrand 60) | |
19 fi | |
20 wp=$1 | |
11 touch .running | 21 touch .running |
12 function lrand { | 22 function lrand { |
13 # cheap bad little random number generator | 23 # cheap bad little random number generator |
14 echo $(( 1 + ($(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' ') % $1))) | 24 echo $(( 1 + ($(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' ') % $1))) |
15 } | 25 } |
41 ( sleep 5 ; rm nohup.cc ) & | 51 ( sleep 5 ; rm nohup.cc ) & |
42 }" EXIT | 52 }" EXIT |
43 mkdir -p $res | 53 mkdir -p $res |
44 log=$res/log | 54 log=$res/log |
45 # Don't all start at once | 55 # Don't all start at once |
46 sleep $(lrand 60) | 56 sleep $pause |
47 echo \# $(date) > $log | 57 echo \# $(date) > $log |
48 pRes=0 | 58 pRes=0 |
49 while read s id | 59 while read s id |
50 do | 60 do |
51 url="https://commoncrawl.s3.amazonaws.com/$s" | 61 url="https://commoncrawl.s3.amazonaws.com/$s" |
52 export ID=$id | 62 export ID=$id |
53 echo $(date) "running |$@|$id|" >> $log | 63 echo $(date) "running |$proc|$home|$pause|$wp|$id|" >> $log |
54 # Experimental retry loop | 64 # Experimental retry loop |
55 tryRead "$url" crawl$id | 65 tryRead "$url" crawl$id |
56 if [ -s crawl$id ] | 66 if [ -s crawl$id ] |
57 then | 67 then |
58 echo \# $id $(wc -l crawl$id) >> $log | 68 echo \# $id $(wc -l crawl$id) >> $log |
59 parallel --round-robin --pipe -j 4 "count1.sh >> $res/{#} 2>>$res/errs{#}" < crawl$id || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2 | 69 parallel --round-robin --pipe -j $wp "count1.sh >> $res/{#} 2>>$res/errs{#}" < crawl$id || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2 |
60 else | 70 else |
61 echo "crawl$id empty" 1>&2 | 71 echo "crawl$id empty" 1>&2 |
62 fi | 72 fi |
63 rm crawl$id | 73 rm crawl$id |
64 done < ifile.txt 2>> $res/errs || pRes=$? | 74 done < ifile.txt 2>> $res/errs || pRes=$? |
65 ( cd $res && fgrep -h -v \# [1-9]* ) | sort | uniq -c | tr -d \" > $res/tots | 75 ( cd $res && fgrep -h -v \# [1-9]* ) | tr -d \" > $res/tots |
66 echo \# $(date) main loop exit code=$pRes >> $log | 76 echo \# $(date) main loop exit code=$pRes >> $log |
67 rm .running | 77 rm .running |
78 |