Mercurial > hg > cc > azure
diff master/src/wecu/run_sac.sh @ 61:cfaf5223b071
trying to get my own mapper working
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Sun, 31 May 2020 12:06:44 +0000 |
parents | 5fdca5baa4e9 |
children | 892e1c0240e1 |
line wrap: on
line diff
--- a/master/src/wecu/run_sac.sh Thu May 28 12:55:03 2020 +0000
+++ b/master/src/wecu/run_sac.sh Sun May 31 12:06:44 2020 +0000
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Usage: run_sac.sh numcores hostsFilename workDir map resType patType patterns
+# Usage: run_sac.sh numcores hostsFilename workDir mapper (-f filter) resType patType patterns
 cores=$1
 hosts=$2
 wd=$3
@@ -8,24 +8,40 @@
 shift
 shift
 shift
+if [ "$1" = "-f" ]
+then
+  shift
+  filter="$1"
+  shift
+else
+  filter=\"\"
+fi
+
 rm -f allout
 
 # Get quoting right...
 worker () {
+  set -e
+  set -o pipefail
   f=$1
   shift
-  mapper=$1
+  j=$1
+  shift
+  mapper="$1"
+  shift
+  filter="$1"
   shift
   shift # we don't need/want the resType either
-  hostname 1>&2
+  echo $(date) $(hostname) start $f >>${j}_log
   export PYTHONIOENCODING=utf-8
-  curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \
-    unpigz -dp 1 -c | tee >(wc -l 1>&2) | ./$mapper "$@" 2>&1
+  { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \
+    unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>>${j}_log
+  echo $(date) $(hostname) finished $f >>${j}_log
 }
 
 export -f worker
 
-parallel -v \
+parallel \
     --sshloginfile $hosts \
     --retries 3 \
     --transferfile $(which $mapper|sed 's/\(^.*\/\)/\1.\//') \
@@ -34,5 +50,5 @@
     --workdir $wd \
     -a input_paths \
     --env worker \
-    worker '{}' "$mapper" "$@" | tee -a allout | grep -v 'Authorized uses only' | \
+    worker '{}' '{#}' "$mapper" "$filter" "$@" 2>errs | grep -v 'Authorized uses only' | tee >(wc -l 1>&2) |\
     sac_reducer.py "$@"