Mercurial > hg > cc > azure
diff master/src/wecu/run_sac.sh @ 60:5fdca5baa4e9
refactor a bit, add support for sac with bespoke mapper
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Thu, 28 May 2020 12:55:03 +0000 |
parents | 8332faef25e1 |
children | cfaf5223b071 |
line wrap: on
line diff
--- a/master/src/wecu/run_sac.sh Thu May 28 09:58:38 2020 +0000
+++ b/master/src/wecu/run_sac.sh Thu May 28 12:55:03 2020 +0000
@@ -1,8 +1,10 @@
 #!/bin/bash
-# Usage: run_sac.sh numcores hostsFilename workDir resType patType patterns
+# Usage: run_sac.sh numcores hostsFilename workDir map resType patType patterns
 cores=$1
 hosts=$2
 wd=$3
+mapper=$4
+shift
 shift
 shift
 shift
@@ -12,11 +14,13 @@
 worker () {
 f=$1
 shift
+mapper=$1
+shift
 shift # we don't need/want the resType either
 hostname 1>&2
 export PYTHONIOENCODING=utf-8
 curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \
-unpigz -dp 1 -c | tee >(wc -l 1>&2) | ./sac_mapper.py "$@" 2>&1
+unpigz -dp 1 -c | tee >(wc -l 1>&2) | ./$mapper "$@" 2>&1
 }
 
 export -f worker
@@ -24,11 +28,11 @@
 parallel -v \
 --sshloginfile $hosts \
 --retries 3 \
---transferfile $(which sac_mapper.py|sed 's/sac_/.\/sac_/') \
+--transferfile $(which $mapper|sed 's/\(^.*\/\)/\1.\//') \
 --will-cite \
 --jobs $cores \
 --workdir $wd \
 -a input_paths \
 --env worker \
-worker '{}' "$@" | tee -a allout | grep -v 'Authorized uses only' | \
+worker '{}' "$mapper" "$@" | tee -a allout | grep -v 'Authorized uses only' | \
 sac_reducer.py "$@"