comparison master/src/wecu/run_sac.sh @ 61:cfaf5223b071

trying to get my own mapper working
author Henry S. Thompson <ht@markup.co.uk>
date Sun, 31 May 2020 12:06:44 +0000
parents 5fdca5baa4e9
children 892e1c0240e1
comparison
legend: equal | deleted | inserted | replaced
left column: 60:5fdca5baa4e9 | right column: 61:cfaf5223b071
1 #!/bin/bash 1 #!/bin/bash
2 # Usage: run_sac.sh numcores hostsFilename workDir map resType patType patterns 2 # Usage: run_sac.sh numcores hostsFilename workDir mapper (-f filter) resType patType patterns
3 cores=$1 3 cores=$1
4 hosts=$2 4 hosts=$2
5 wd=$3 5 wd=$3
6 mapper=$4 6 mapper=$4
7 shift 7 shift
8 shift 8 shift
9 shift 9 shift
10 shift 10 shift
11 if [ "$1" = "-f" ]
12 then
13 shift
14 filter="$1"
15 shift
16 else
17 filter=\"\"
18 fi
19
11 rm -f allout 20 rm -f allout
12 21
13 # Get quoting right... 22 # Get quoting right...
14 worker () { 23 worker () {
24 set -e
25 set -o pipefail
15 f=$1 26 f=$1
16 shift 27 shift
17 mapper=$1 28 j=$1
29 shift
30 mapper="$1"
31 shift
32 filter="$1"
18 shift 33 shift
19 shift # we don't need/want the resType either 34 shift # we don't need/want the resType either
20 hostname 1>&2 35 echo $(date) $(hostname) start $f >>${j}_log
21 export PYTHONIOENCODING=utf-8 36 export PYTHONIOENCODING=utf-8
22 curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \ 37 { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \
23 unpigz -dp 1 -c | tee >(wc -l 1>&2) | ./$mapper "$@" 2>&1 38 unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>>${j}_log
39 echo $(date) $(hostname) finished $f >>${j}_log
24 } 40 }
25 41
26 export -f worker 42 export -f worker
27 43
28 parallel -v \ 44 parallel \
29 --sshloginfile $hosts \ 45 --sshloginfile $hosts \
30 --retries 3 \ 46 --retries 3 \
31 --transferfile $(which $mapper|sed 's/\(^.*\/\)/\1.\//') \ 47 --transferfile $(which $mapper|sed 's/\(^.*\/\)/\1.\//') \
32 --will-cite \ 48 --will-cite \
33 --jobs $cores \ 49 --jobs $cores \
34 --workdir $wd \ 50 --workdir $wd \
35 -a input_paths \ 51 -a input_paths \
36 --env worker \ 52 --env worker \
37 worker '{}' "$mapper" "$@" | tee -a allout | grep -v 'Authorized uses only' | \ 53 worker '{}' '{#}' "$mapper" "$filter" "$@" 2>errs | grep -v 'Authorized uses only' | tee >(wc -l 1>&2) |\
38 sac_reducer.py "$@" 54 sac_reducer.py "$@"