# HG changeset patch # User Henry S. Thompson # Date 1539004643 0 # Node ID 2fbefb8d1a9e3e78b3c05ee52413c13f98f1600c # Parent 55e953e5c66f9bd6a1b2f690a3b7e61af97b9946 wrun.sh: usage catchup invoke.sh: force terminal allocation on workers test1.sh: support control of how many worker processes are spawned, support -t to turn off random delay at startup count1.sh: actually do the counting in subprocs to avoid disk contention diff -r 55e953e5c66f -r 2fbefb8d1a9e master/bin/internal/invoke.sh --- a/master/bin/internal/invoke.sh Tue Oct 02 10:52:45 2018 +0000 +++ b/master/bin/internal/invoke.sh Mon Oct 08 13:17:23 2018 +0000 @@ -33,7 +33,7 @@ fi || echo scp failed, status=$? 1>&2 if [ "$wait" ] then - ssh -t -p $port $ip "nohup $cmd $id $me ""$@"" > nohup.cc" + ssh -tt -p $port $ip "nohup $cmd $id $me ""$@"" > nohup.cc" else ssh -p $port $ip "$cmd $id $me ""$@" fi || echo ssh failed, status=$? 1>&2 diff -r 55e953e5c66f -r 2fbefb8d1a9e master/bin/wrun.sh --- a/master/bin/wrun.sh Tue Oct 02 10:52:45 2018 +0000 +++ b/master/bin/wrun.sh Mon Oct 08 13:17:23 2018 +0000 @@ -6,12 +6,13 @@ Where name is the name of a VM scale set. 
Runs cmd on every machine in a scale set, - passing args and, + (only using n machines if -np n is present) + passing args and (as ~/ifile.txt), if -f, lines from file split per worker if -ff, complete file sent to all workers - unless -x, worker id + and, unless -x, worker id by doing as it were - [ echo line(s)-from-file |] ssh machine "$cmd [id] "$args"" + scp machine: <(line(s)-from-file) ifile.txt && ssh machine "$cmd [id] "$args"" if -i, don\'t use nohup on the workers so returns immediately [default is to use nohup unless neither -f or -ff] EOF diff -r 55e953e5c66f -r 2fbefb8d1a9e workers/bin/count1.sh --- a/workers/bin/count1.sh Tue Oct 02 10:52:45 2018 +0000 +++ b/workers/bin/count1.sh Mon Oct 08 13:17:23 2018 +0000 @@ -1,6 +1,6 @@ #!/bin/bash echo "# $ID" -jq '.Envelope|.["WARC-Header-Metadata"]["WARC-Target-URI"]'|cut -f 1 -d ':' +jq '.Envelope|.["WARC-Header-Metadata"]["WARC-Target-URI"]'|cut -f 1 -d ':'|awk '{c[$1]+=1} END {for (k in c) {print k, c[k]}}' diff -r 55e953e5c66f -r 2fbefb8d1a9e workers/bin/test1.sh --- a/workers/bin/test1.sh Tue Oct 02 10:52:45 2018 +0000 +++ b/workers/bin/test1.sh Mon Oct 08 13:17:23 2018 +0000 @@ -1,13 +1,23 @@ #!/bin/bash # Test script to split CC WAT files across threads # to count http: vs. 
https: -# Usage: [echo file file_id] | test1.sh id home +# Usage: [echo file file_id] | test1.sh id home [-t] [numWorkerProcesses] +# If -t, no random wait, just id seconds; numWorkerProcesses defaults to 4 # remove >>errs once tested #set -e -o pipefail echo $$ > test1.pid proc=$1 res=res$proc home=$2 +shift 2 +if [ "$1" = "-t" ] +then + shift + pause=$proc +else + pause= +fi +wp=${1:-4} touch .running function lrand { # cheap bad little random number generator @@ -43,25 +53,28 @@ mkdir -p $res log=$res/log # Don't all start at once -sleep $(lrand 60) +# lrand is defined below the argument parsing, so the random pause is drawn here +[ "$pause" ] || pause=$(lrand 60) +sleep $pause echo \# $(date) > $log pRes=0 while read s id do url="https://commoncrawl.s3.amazonaws.com/$s" export ID=$id - echo $(date) "running |$@|$id|" >> $log + echo $(date) "running |$proc|$home|$pause|$wp|$id|" >> $log # Experimental retry loop tryRead "$url" crawl$id if [ -s crawl$id ] then echo \# $id $(wc -l crawl$id) >> $log - parallel --round-robin --pipe -j 4 "count1.sh >> $res/{#} 2>>$res/errs{#}" < crawl$id || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2 + parallel --round-robin --pipe -j $wp "count1.sh >> $res/{#} 2>>$res/errs{#}" < crawl$id || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2 else echo "crawl$id empty" 1>&2 fi rm crawl$id done < ifile.txt 2>> $res/errs || pRes=$? -( cd $res && fgrep -h -v \# [1-9]* ) | sort | uniq -c | tr -d \" > $res/tots +( cd $res && fgrep -h -v \# [1-9]* ) | tr -d \" > $res/tots echo \# $(date) main loop exit code=$pRes >> $log rm .running +