diff master/src/wecu/run_sac.sh @ 63:d46c8b12fc04

support multiple approaches to key combination, use local files to collect results
author Henry S. Thompson <ht@markup.co.uk>
date Wed, 03 Jun 2020 16:40:34 +0000
parents 892e1c0240e1
children b91e44355bbf
line wrap: on
line diff
--- a/master/src/wecu/run_sac.sh	Tue Jun 02 17:35:07 2020 +0000
+++ b/master/src/wecu/run_sac.sh	Wed Jun 03 16:40:34 2020 +0000
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Usage: run_sac.sh numcores hostsFilename workDir mapper (-f filter) (-k numKeys) resType patType patterns
+# Usage: run_sac.sh numcores hostsFilename workDir mapper keyHandler (-f filter) (-k numKeys) resType patType patterns
 echo "$@" 1>cmd
 cores=$1
 hosts=$2
@@ -29,6 +29,7 @@
   set -e
   set -o pipefail
   mkdir -p logs
+  mkdir -p res
   f=$1
   shift
   j=$1
@@ -43,7 +44,7 @@
   echo $(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff >>logs/${j}_log
   export PYTHONIOENCODING=utf-8
   { IFS=$'\n' ; stderr=( $( { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \
-   unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>&1 1>&4 ; ) ) ; unset IFS ; } 4>&1
+   unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>&1 1>res/${j}.tsv ; ) ) ; unset IFS ; }
   { echo $(date +%Y-%m-%d.%H:%M:%S) $me finished $j $ff
     printf '%s\n' "${stderr[@]}" ; } | sed '2,$s/^/ /' >>logs/${j}_log # hack to try to
       # guarantee atomic entry in the log
@@ -60,6 +61,6 @@
     --workdir $wd \
     -a input_paths \
     --env worker \
-    --return 'logs/{#}_log' --cleanup \
-    worker '{}' '{#}' "$mapper" "$filter" "$@" | tee >(wc -l 1>&2) |\
-    sac_reducer.py $1 $numKeys
+    --return 'logs/{#}_log' --return 'res/{#}.tsv' --cleanup \
+    worker '{}' '{#}' "$mapper" "$filter" "$@"
+cat res/*.tsv | sac_reducer.py $1 $numKeys