diff master/src/wecu/run_sac.sh @ 61:cfaf5223b071

trying to get my own mapper working
author Henry S. Thompson <ht@markup.co.uk>
date Sun, 31 May 2020 12:06:44 +0000
parents 5fdca5baa4e9
children 892e1c0240e1
line wrap: on
line diff
--- a/master/src/wecu/run_sac.sh	Thu May 28 12:55:03 2020 +0000
+++ b/master/src/wecu/run_sac.sh	Sun May 31 12:06:44 2020 +0000
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Usage: run_sac.sh numcores hostsFilename workDir map resType patType patterns
+# Usage: run_sac.sh numcores hostsFilename workDir mapper [-f filter] resType patType patterns
 cores=$1
 hosts=$2
 wd=$3
@@ -8,24 +8,40 @@
 shift
 shift
 shift
+if [ "$1" = "-f" ]
+then
+ shift
+ filter="$1"
+ shift
+else
+ filter=\"\"
+fi
+
 rm -f allout
 
 # Get quoting right...
 worker () {
+  set -e
+  set -o pipefail
   f=$1
   shift
-  mapper=$1
+  j=$1
+  shift
+  mapper="$1"
+  shift
+  filter="$1"
   shift
   shift # we don't need/want the resType either
-  hostname 1>&2
+  echo $(date) $(hostname) start $f >>${j}_log
   export PYTHONIOENCODING=utf-8
-  curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \
-   unpigz -dp 1 -c | tee >(wc -l 1>&2) | ./$mapper "$@" 2>&1
+  { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \
+   unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>>${j}_log
+  echo $(date) $(hostname) finished $f >>${j}_log
 }
 
 export -f worker
 
-parallel -v \
+parallel \
     --sshloginfile $hosts \
     --retries 3 \
     --transferfile $(which $mapper|sed 's/\(^.*\/\)/\1.\//') \
@@ -34,5 +50,5 @@
     --workdir $wd \
     -a input_paths \
     --env worker \
-    worker '{}' "$mapper" "$@" | tee -a allout | grep -v 'Authorized uses only' | \
+    worker '{}' '{#}' "$mapper" "$filter" "$@" 2>errs | grep -v 'Authorized uses only' | tee >(wc -l 1>&2) |\
     sac_reducer.py "$@"