Mercurial > hg > cc > azure
comparison master/src/wecu/run_sac.sh @ 61:cfaf5223b071
trying to get my own mapper working
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Sun, 31 May 2020 12:06:44 +0000 |
parents | 5fdca5baa4e9 |
children | 892e1c0240e1 |
comparison
equal
deleted
inserted
replaced
60:5fdca5baa4e9 | 61:cfaf5223b071 |
---|---|
1 #!/bin/bash | 1 #!/bin/bash |
2 # Usage: run_sac.sh numcores hostsFilename workDir map resType patType patterns | 2 # Usage: run_sac.sh numcores hostsFilename workDir mapper (-f filter) resType patType patterns |
3 cores=$1 | 3 cores=$1 |
4 hosts=$2 | 4 hosts=$2 |
5 wd=$3 | 5 wd=$3 |
6 mapper=$4 | 6 mapper=$4 |
7 shift | 7 shift |
8 shift | 8 shift |
9 shift | 9 shift |
10 shift | 10 shift |
11 if [ "$1" = "-f" ] | |
12 then | |
13 shift | |
14 filter="$1" | |
15 shift | |
16 else | |
17 filter=\"\" | |
18 fi | |
19 | |
11 rm -f allout | 20 rm -f allout |
12 | 21 |
13 # Get quoting right... | 22 # Get quoting right... |
14 worker () { | 23 worker () { |
24 set -e | |
25 set -o pipefail | |
15 f=$1 | 26 f=$1 |
16 shift | 27 shift |
17 mapper=$1 | 28 j=$1 |
29 shift | |
30 mapper="$1" | |
31 shift | |
32 filter="$1" | |
18 shift | 33 shift |
19 shift # we don't need/want the resType either | 34 shift # we don't need/want the resType either |
20 hostname 1>&2 | 35 echo $(date) $(hostname) start $f >>${j}_log |
21 export PYTHONIOENCODING=utf-8 | 36 export PYTHONIOENCODING=utf-8 |
22 curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \ | 37 { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \ |
23 unpigz -dp 1 -c | tee >(wc -l 1>&2) | ./$mapper "$@" 2>&1 | 38 unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>>${j}_log |
39 echo $(date) $(hostname) finished $f >>${j}_log | |
24 } | 40 } |
25 | 41 |
26 export -f worker | 42 export -f worker |
27 | 43 |
28 parallel -v \ | 44 parallel \ |
29 --sshloginfile $hosts \ | 45 --sshloginfile $hosts \ |
30 --retries 3 \ | 46 --retries 3 \ |
31 --transferfile $(which $mapper|sed 's/\(^.*\/\)/\1.\//') \ | 47 --transferfile $(which $mapper|sed 's/\(^.*\/\)/\1.\//') \ |
32 --will-cite \ | 48 --will-cite \ |
33 --jobs $cores \ | 49 --jobs $cores \ |
34 --workdir $wd \ | 50 --workdir $wd \ |
35 -a input_paths \ | 51 -a input_paths \ |
36 --env worker \ | 52 --env worker \ |
37 worker '{}' "$mapper" "$@" | tee -a allout | grep -v 'Authorized uses only' | \ | 53 worker '{}' '{#}' "$mapper" "$filter" "$@" 2>errs | grep -v 'Authorized uses only' | tee >(wc -l 1>&2) |\ |
38 sac_reducer.py "$@" | 54 sac_reducer.py "$@" |