Mercurial > hg > cc > azure
comparison master/src/wecu/run_sac.sh @ 60:5fdca5baa4e9
refactor a bit, add support for sac with bespoke mapper
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Thu, 28 May 2020 12:55:03 +0000 |
parents | 8332faef25e1 |
children | cfaf5223b071 |
comparison
equal
deleted
inserted
replaced
59:8332faef25e1 | 60:5fdca5baa4e9 |
---|---|
1 #!/bin/bash | 1 #!/bin/bash |
2 # Usage: run_sac.sh numcores hostsFilename workDir resType patType patterns | 2 # Usage: run_sac.sh numcores hostsFilename workDir map resType patType patterns |
3 cores=$1 | 3 cores=$1 |
4 hosts=$2 | 4 hosts=$2 |
5 wd=$3 | 5 wd=$3 |
| | 6 mapper=$4 |
| | 7 shift |
6 shift | 8 shift |
7 shift | 9 shift |
8 shift | 10 shift |
9 rm -f allout | 11 rm -f allout |
10 | 12 |
11 # Get quoting right... | 13 # Get quoting right... |
12 worker () { | 14 worker () { |
13 f=$1 | 15 f=$1 |
14 shift | 16 shift |
| | 17 mapper=$1 |
| | 18 shift |
15 shift # we don't need/want the resType either | 19 shift # we don't need/want the resType either |
16 hostname 1>&2 | 20 hostname 1>&2 |
17 export PYTHONIOENCODING=utf-8 | 21 export PYTHONIOENCODING=utf-8 |
18 curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \ | 22 curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \ |
19 unpigz -dp 1 -c | tee >(wc -l 1>&2) | ./sac_mapper.py "$@" 2>&1 | 23 unpigz -dp 1 -c | tee >(wc -l 1>&2) | ./$mapper "$@" 2>&1 |
20 } | 24 } |
21 | 25 |
22 export -f worker | 26 export -f worker |
23 | 27 |
24 parallel -v \ | 28 parallel -v \ |
25 --sshloginfile $hosts \ | 29 --sshloginfile $hosts \ |
26 --retries 3 \ | 30 --retries 3 \ |
27 --transferfile $(which sac_mapper.py|sed 's/sac_/.\/sac_/') \ | 31 --transferfile $(which $mapper|sed 's/\(^.*\/\)/\1.\//') \ |
28 --will-cite \ | 32 --will-cite \ |
29 --jobs $cores \ | 33 --jobs $cores \ |
30 --workdir $wd \ | 34 --workdir $wd \ |
31 -a input_paths \ | 35 -a input_paths \ |
32 --env worker \ | 36 --env worker \ |
33 worker '{}' "$@" | tee -a allout | grep -v 'Authorized uses only' | \ | 37 worker '{}' "$mapper" "$@" | tee -a allout | grep -v 'Authorized uses only' | \ |
34 sac_reducer.py "$@" | 38 sac_reducer.py "$@" |