comparison master/src/wecu/run_sac.sh @ 60:5fdca5baa4e9

refactor a bit, add support for sac with bespoke mapper
author Henry S. Thompson <ht@markup.co.uk>
date Thu, 28 May 2020 12:55:03 +0000
parents 8332faef25e1
children cfaf5223b071
comparison
equal deleted inserted replaced
59:8332faef25e1 60:5fdca5baa4e9
1 #!/bin/bash 1 #!/bin/bash
2 # Usage: run_sac.sh numcores hostsFilename workDir resType patType patterns 2 # Usage: run_sac.sh numcores hostsFilename workDir map resType patType patterns
3 cores=$1 3 cores=$1
4 hosts=$2 4 hosts=$2
5 wd=$3 5 wd=$3
6 mapper=$4
7 shift
6 shift 8 shift
7 shift 9 shift
8 shift 10 shift
9 rm -f allout 11 rm -f allout
10 12
11 # Get quoting right... 13 # Get quoting right...
12 worker () { 14 worker () {
13 f=$1 15 f=$1
14 shift 16 shift
17 mapper=$1
18 shift
15 shift # we don't need/want the resType either 19 shift # we don't need/want the resType either
16 hostname 1>&2 20 hostname 1>&2
17 export PYTHONIOENCODING=utf-8 21 export PYTHONIOENCODING=utf-8
18 curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \ 22 curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \
19 unpigz -dp 1 -c | tee >(wc -l 1>&2) | ./sac_mapper.py "$@" 2>&1 23 unpigz -dp 1 -c | tee >(wc -l 1>&2) | ./$mapper "$@" 2>&1
20 } 24 }
21 25
22 export -f worker 26 export -f worker
23 27
24 parallel -v \ 28 parallel -v \
25 --sshloginfile $hosts \ 29 --sshloginfile $hosts \
26 --retries 3 \ 30 --retries 3 \
27 --transferfile $(which sac_mapper.py|sed 's/sac_/.\/sac_/') \ 31 --transferfile $(which $mapper|sed 's/\(^.*\/\)/\1.\//') \
28 --will-cite \ 32 --will-cite \
29 --jobs $cores \ 33 --jobs $cores \
30 --workdir $wd \ 34 --workdir $wd \
31 -a input_paths \ 35 -a input_paths \
32 --env worker \ 36 --env worker \
33 worker '{}' "$@" | tee -a allout | grep -v 'Authorized uses only' | \ 37 worker '{}' "$mapper" "$@" | tee -a allout | grep -v 'Authorized uses only' | \
34 sac_reducer.py "$@" 38 sac_reducer.py "$@"