changeset 10:2fbefb8d1a9e

wrun.sh: usage catch-up; invoke.sh: force terminal allocation on workers; test1.sh: support control of the number of worker processes spawned, support -t to turn off the random delay at startup; count1.sh: actually do the counting in subprocs to avoid disk contention
author Henry S. Thompson <ht@markup.co.uk>
date Mon, 08 Oct 2018 13:17:23 +0000
parents 55e953e5c66f
children 36b5d379909a
files master/bin/internal/invoke.sh master/bin/wrun.sh workers/bin/count1.sh workers/bin/test1.sh
diffstat 4 files changed, 22 insertions(+), 10 deletions(-) [+]
line wrap: on
line diff
--- a/master/bin/internal/invoke.sh	Tue Oct 02 10:52:45 2018 +0000
+++ b/master/bin/internal/invoke.sh	Mon Oct 08 13:17:23 2018 +0000
@@ -33,7 +33,7 @@
 fi || echo scp failed, status=$? 1>&2
 if [ "$wait" ]
 then
-  ssh -t -p $port $ip "nohup $cmd $id $me ""$@"" > nohup.cc"
+  ssh -tt -p $port $ip "nohup $cmd $id $me ""$@"" > nohup.cc"
 else
   ssh -p $port $ip "$cmd $id $me ""$@"
 fi || echo ssh failed, status=$? 1>&2
--- a/master/bin/wrun.sh	Tue Oct 02 10:52:45 2018 +0000
+++ b/master/bin/wrun.sh	Mon Oct 08 13:17:23 2018 +0000
@@ -6,12 +6,13 @@
 Where name is the name of a VM scale set.
 
 Runs cmd on every machine in a scale set,
- passing args and, 
+ (only using n machines if -np n is present)
+ passing args and (as ~/ifile.txt), 
   if -f, lines from file split per worker
   if -ff, complete file sent to all workers
-  unless -x, worker id
+  and, unless -x, worker id
  by doing as it were
-  [ echo line(s)-from-file |] ssh machine "$cmd [id] "$args"" 
+  scp machine: <(line(s)-from-file) ifile.txt && ssh machine "$cmd [id] "$args""
   if -i, don\'t use nohup on the workers so returns immediately
    [default is to use nohup unless neither -f or -ff]
 EOF
--- a/workers/bin/count1.sh	Tue Oct 02 10:52:45 2018 +0000
+++ b/workers/bin/count1.sh	Mon Oct 08 13:17:23 2018 +0000
@@ -1,6 +1,6 @@
 #!/bin/bash
 echo "# $ID"
-jq '.Envelope|.["WARC-Header-Metadata"]["WARC-Target-URI"]'|cut -f 1 -d ':'
+jq '.Envelope|.["WARC-Header-Metadata"]["WARC-Target-URI"]'|cut -f 1 -d ':'|awk '{c[$1]+=1} END {for (k in c) {print k, c[k]}}'
 
 
 
--- a/workers/bin/test1.sh	Tue Oct 02 10:52:45 2018 +0000
+++ b/workers/bin/test1.sh	Mon Oct 08 13:17:23 2018 +0000
@@ -1,13 +1,23 @@
 #!/bin/bash
 # Test script to split CC WAT files across  threads
 #   to count http: vs. https:
-# Usage: [echo file file_id] | test1.sh id home
+# Usage: [echo file file_id] | test1.sh id home [-t] numWorkerProcesses
+#   If -t, no random wait, just id seconds
 # remove >>errs once tested
 #set -e -o pipefail
 echo $$ > test1.pid
 proc=$1
 res=res$proc
 home=$2
+shift 2
+if [ "$1" = "-t" ]
+then
+ shift
+ pause=$proc
+else
+ pause=$(lrand 60)
+fi
+wp=$1
 touch .running
 function lrand {
 # cheap bad little random number generator
@@ -43,25 +53,26 @@
 mkdir -p $res
 log=$res/log
 # Don't all start at once
-sleep $(lrand 60)
+sleep $pause
 echo \# $(date) >  $log
 pRes=0
 while read s id
 do
  url="https://commoncrawl.s3.amazonaws.com/$s"
  export ID=$id
- echo $(date) "running |$@|$id|" >> $log
+ echo $(date) "running |$proc|$home|$pause|$wp|$id|" >> $log
  # Experimental retry loop
  tryRead "$url" crawl$id
  if [ -s crawl$id ]
  then
   echo \# $id $(wc -l crawl$id) >> $log
-  parallel --round-robin --pipe -j 4 "count1.sh >> $res/{#} 2>>$res/errs{#}" < crawl$id || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2
+  parallel --round-robin --pipe -j $wp "count1.sh >> $res/{#} 2>>$res/errs{#}" < crawl$id || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2
  else
   echo "crawl$id empty" 1>&2
  fi
  rm crawl$id
 done < ifile.txt 2>> $res/errs || pRes=$?
-( cd $res && fgrep -h -v \# [1-9]* ) | sort | uniq -c | tr -d \" > $res/tots
+( cd $res && fgrep -h -v \# [1-9]* ) | tr -d \" > $res/tots
 echo \# $(date) main loop exit code=$pRes >> $log
 rm .running
+