comparison workers/bin/test1.sh @ 10:2fbefb8d1a9e

wrun.sh: usage catchup; invoke.sh: force terminal allocation on workers; test1.sh: support control of the number of worker processes spawned, support -t to turn off the random delay at startup; count1.sh: actually do the counting in subprocs to avoid disk contention
author Henry S. Thompson <ht@markup.co.uk>
date Mon, 08 Oct 2018 13:17:23 +0000
parents 5db6015689a2
children 36b5d379909a
comparison
equal deleted inserted replaced
9:55e953e5c66f 10:2fbefb8d1a9e
1 #!/bin/bash 1 #!/bin/bash
2 # Test script to split CC WAT files across threads 2 # Test script to split CC WAT files across threads
3 # to count http: vs. https: 3 # to count http: vs. https:
4 # Usage: [echo file file_id] | test1.sh id home 4 # Usage: [echo file file_id] | test1.sh id home [-t] numWorkerProcesses
5 # If -t, no random wait, just id seconds
5 # remove >>errs once tested 6 # remove >>errs once tested
6 #set -e -o pipefail 7 #set -e -o pipefail
7 echo $$ > test1.pid 8 echo $$ > test1.pid
8 proc=$1 9 proc=$1
9 res=res$proc 10 res=res$proc
10 home=$2 11 home=$2
12 shift 2
13 if [ "$1" = "-t" ]
14 then
15 shift
16 pause=$proc
17 else
18 pause=$(lrand 60)
19 fi
20 wp=$1
11 touch .running 21 touch .running
12 function lrand { 22 function lrand {
13 # cheap bad little random number generator 23 # cheap bad little random number generator
14 echo $(( 1 + ($(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' ') % $1))) 24 echo $(( 1 + ($(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' ') % $1)))
15 } 25 }
41 ( sleep 5 ; rm nohup.cc ) & 51 ( sleep 5 ; rm nohup.cc ) &
42 }" EXIT 52 }" EXIT
43 mkdir -p $res 53 mkdir -p $res
44 log=$res/log 54 log=$res/log
45 # Don't all start at once 55 # Don't all start at once
46 sleep $(lrand 60) 56 sleep $pause
47 echo \# $(date) > $log 57 echo \# $(date) > $log
48 pRes=0 58 pRes=0
49 while read s id 59 while read s id
50 do 60 do
51 url="https://commoncrawl.s3.amazonaws.com/$s" 61 url="https://commoncrawl.s3.amazonaws.com/$s"
52 export ID=$id 62 export ID=$id
53 echo $(date) "running |$@|$id|" >> $log 63 echo $(date) "running |$proc|$home|$pause|$wp|$id|" >> $log
54 # Experimental retry loop 64 # Experimental retry loop
55 tryRead "$url" crawl$id 65 tryRead "$url" crawl$id
56 if [ -s crawl$id ] 66 if [ -s crawl$id ]
57 then 67 then
58 echo \# $id $(wc -l crawl$id) >> $log 68 echo \# $id $(wc -l crawl$id) >> $log
59 parallel --round-robin --pipe -j 4 "count1.sh >> $res/{#} 2>>$res/errs{#}" < crawl$id || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2 69 parallel --round-robin --pipe -j $wp "count1.sh >> $res/{#} 2>>$res/errs{#}" < crawl$id || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2
60 else 70 else
61 echo "crawl$id empty" 1>&2 71 echo "crawl$id empty" 1>&2
62 fi 72 fi
63 rm crawl$id 73 rm crawl$id
64 done < ifile.txt 2>> $res/errs || pRes=$? 74 done < ifile.txt 2>> $res/errs || pRes=$?
65 ( cd $res && fgrep -h -v \# [1-9]* ) | sort | uniq -c | tr -d \" > $res/tots 75 ( cd $res && fgrep -h -v \# [1-9]* ) | tr -d \" > $res/tots
66 echo \# $(date) main loop exit code=$pRes >> $log 76 echo \# $(date) main loop exit code=$pRes >> $log
67 rm .running 77 rm .running
78