diff master/src/wecu/run_sac.sh @ 62:892e1c0240e1

Added more robust (I hope) error handling; got the reducer working, with support for choosing dict or TSV output.
author Henry S. Thompson <ht@markup.co.uk>
date Tue, 02 Jun 2020 17:35:07 +0000
parents cfaf5223b071
children d46c8b12fc04
line wrap: on
line diff
--- a/master/src/wecu/run_sac.sh	Sun May 31 12:06:44 2020 +0000
+++ b/master/src/wecu/run_sac.sh	Tue Jun 02 17:35:07 2020 +0000
@@ -1,5 +1,6 @@
 #!/bin/bash
-# Usage: run_sac.sh numcores hostsFilename workDir mapper (-f filter) resType patType patterns
+# Usage: run_sac.sh numcores hostsFilename workDir mapper (-f filter) (-k numKeys) resType patType patterns
+echo "$@" 1>cmd
 cores=$1
 hosts=$2
 wd=$3
@@ -16,13 +17,18 @@
 else
  filter=\"\"
 fi
-
-rm -f allout
+if [ "$1" = "-k" ]
+then
+ shift
+ numKeys="$1"
+ shift
+fi
 
 # Get quoting right...
 worker () {
   set -e
   set -o pipefail
+  mkdir -p logs
   f=$1
   shift
   j=$1
@@ -32,11 +38,15 @@
   filter="$1"
   shift
   shift # we don't need/want the resType either
-  echo $(date) $(hostname) start $f >>${j}_log
+  me=$(hostname | cut -c 15)
+  ff=$(echo $f | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//')
+  echo $(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff >>logs/${j}_log
   export PYTHONIOENCODING=utf-8
-  { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \
-   unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>>${j}_log
-  echo $(date) $(hostname) finished $f >>${j}_log
+  { IFS=$'\n' ; stderr=( $( { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \
+   unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>&1 1>&4 ; ) ) ; unset IFS ; } 4>&1
+  { echo $(date +%Y-%m-%d.%H:%M:%S) $me finished $j $ff
+    printf '%s\n' "${stderr[@]}" ; } | sed '2,$s/^/ /' >>logs/${j}_log # hack to try to
+      # guarantee atomic entry in the log
 }
 
 export -f worker
@@ -50,5 +60,6 @@
     --workdir $wd \
     -a input_paths \
     --env worker \
-    worker '{}' '{#}' "$mapper" "$filter" "$@" 2>errs | grep -v 'Authorized uses only' | tee >(wc -l 1>&2) |\
-    sac_reducer.py "$@"
+    --return 'logs/{#}_log' --cleanup \
+    worker '{}' '{#}' "$mapper" "$filter" "$@" | tee >(wc -l 1>&2) |\
+    sac_reducer.py $1 $numKeys