annotate master/src/wecu/run_sac.sh @ 61:cfaf5223b071

trying to get my own mapper working
author Henry S. Thompson <ht@markup.co.uk>
date Sun, 31 May 2020 12:06:44 +0000
parents 5fdca5baa4e9
children 892e1c0240e1
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
57
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
1 #!/bin/bash
61
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
2 # Usage: run_sac.sh numcores hostsFilename workDir mapper (-f filter) resType patType patterns
57
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
3 cores=$1
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
4 hosts=$2
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
5 wd=$3
60
5fdca5baa4e9 refactor a bit, add support for sac with bespoke mapper
Henry S. Thompson <ht@markup.co.uk>
parents: 59
diff changeset
6 mapper=$4
5fdca5baa4e9 refactor a bit, add support for sac with bespoke mapper
Henry S. Thompson <ht@markup.co.uk>
parents: 59
diff changeset
7 shift
57
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
8 shift
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
9 shift
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
10 shift
61
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
11 if [ "$1" = "-f" ]
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
12 then
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
13 shift
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
14 filter="$1"
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
15 shift
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
16 else
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
17 filter=\"\"
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
18 fi
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
19
57
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
20 rm -f allout
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
21
59
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
22 # Get quoting right...
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
23 worker () {
61
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
24 set -e
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
25 set -o pipefail
59
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
26 f=$1
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
27 shift
61
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
28 j=$1
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
29 shift
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
30 mapper="$1"
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
31 shift
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
32 filter="$1"
60
5fdca5baa4e9 refactor a bit, add support for sac with bespoke mapper
Henry S. Thompson <ht@markup.co.uk>
parents: 59
diff changeset
33 shift
59
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
34 shift # we don't need/want the resType either
61
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
35 echo $(date) $(hostname) start $f >>${j}_log
59
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
36 export PYTHONIOENCODING=utf-8
61
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
37 { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
38 unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>>${j}_log
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
39 echo $(date) $(hostname) finished $f >>${j}_log
59
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
40 }
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
41
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
42 export -f worker
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
43
61
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
44 parallel \
57
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
45 --sshloginfile $hosts \
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
46 --retries 3 \
60
5fdca5baa4e9 refactor a bit, add support for sac with bespoke mapper
Henry S. Thompson <ht@markup.co.uk>
parents: 59
diff changeset
47 --transferfile $(which $mapper|sed 's/\(^.*\/\)/\1.\//') \
57
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
48 --will-cite \
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
49 --jobs $cores \
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
50 --workdir $wd \
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
51 -a input_paths \
59
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
52 --env worker \
61
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
53 worker '{}' '{#}' "$mapper" "$filter" "$@" 2>errs | grep -v 'Authorized uses only' | tee >(wc -l 1>&2) |\
59
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
54 sac_reducer.py "$@"