diff master/src/wecu/run_sac.sh @ 60:5fdca5baa4e9

Refactor a bit; add support for running sac with a bespoke mapper script, passed as a new argument
author Henry S. Thompson <ht@markup.co.uk>
date Thu, 28 May 2020 12:55:03 +0000
parents 8332faef25e1
children cfaf5223b071
line wrap: on
line diff
--- a/master/src/wecu/run_sac.sh	Thu May 28 09:58:38 2020 +0000
+++ b/master/src/wecu/run_sac.sh	Thu May 28 12:55:03 2020 +0000
@@ -1,8 +1,10 @@
 #!/bin/bash
-# Usage: run_sac.sh numcores hostsFilename workDir resType patType patterns
+# Usage: run_sac.sh numcores hostsFilename workDir map resType patType patterns
 cores=$1
 hosts=$2
 wd=$3
+mapper=$4
+shift
 shift
 shift
 shift
@@ -12,11 +14,13 @@
 worker () {
   f=$1
   shift
+  mapper=$1
+  shift
   shift # we don't need/want the resType either
   hostname 1>&2
   export PYTHONIOENCODING=utf-8
   curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \
-   unpigz -dp 1 -c | tee >(wc -l 1>&2) | ./sac_mapper.py "$@" 2>&1
+   unpigz -dp 1 -c | tee >(wc -l 1>&2) | ./$mapper "$@" 2>&1
 }
 
 export -f worker
@@ -24,11 +28,11 @@
 parallel -v \
     --sshloginfile $hosts \
     --retries 3 \
-    --transferfile $(which sac_mapper.py|sed 's/sac_/.\/sac_/') \
+    --transferfile $(which $mapper|sed 's/\(^.*\/\)/\1.\//') \
     --will-cite \
     --jobs $cores \
     --workdir $wd \
     -a input_paths \
     --env worker \
-    worker '{}' "$@" | tee -a allout | grep -v 'Authorized uses only' | \
+    worker '{}' "$mapper" "$@" | tee -a allout | grep -v 'Authorized uses only' | \
     sac_reducer.py "$@"