Mercurial > hg > cc > azure
annotate master/src/wecu/run_sac.sh @ 61:cfaf5223b071
trying to get my own mapper working
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Sun, 31 May 2020 12:06:44 +0000 |
parents | 5fdca5baa4e9 |
children | 892e1c0240e1 |
rev | line source |
---|---|
57
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
1 #!/bin/bash |
61
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
2 # Usage: run_sac.sh numcores hostsFilename workDir mapper [-f filter] resType patType patterns |
57
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
3 cores=$1 |
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
4 hosts=$2 |
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
5 wd=$3 |
60
5fdca5baa4e9
refactor a bit, add support for sac with bespoke mapper
Henry S. Thompson <ht@markup.co.uk>
parents:
59
diff
changeset
|
6 mapper=$4 |
5fdca5baa4e9
refactor a bit, add support for sac with bespoke mapper
Henry S. Thompson <ht@markup.co.uk>
parents:
59
diff
changeset
|
7 shift |
57
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
8 shift |
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
9 shift |
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
10 shift |
61
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
11 if [ "$1" = "-f" ] |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
12 then |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
13 shift |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
14 filter="$1" |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
15 shift |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
16 else |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
17 filter=\"\" |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
18 fi |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
19 |
57
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
20 rm -f allout |
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
21 |
59
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
22 # Get quoting right... |
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
23 worker () { |
61
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
24 set -e |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
25 set -o pipefail |
59
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
26 f=$1 |
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
27 shift |
61
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
28 j=$1 |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
29 shift |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
30 mapper="$1" |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
31 shift |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
32 filter="$1" |
60
5fdca5baa4e9
refactor a bit, add support for sac with bespoke mapper
Henry S. Thompson <ht@markup.co.uk>
parents:
59
diff
changeset
|
33 shift |
59
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
34 shift # we don't need/want the resType either |
61
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
35 echo $(date) $(hostname) start $f >>${j}_log |
59
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
36 export PYTHONIOENCODING=utf-8 |
61
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
37 { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \ |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
38 unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>>${j}_log |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
39 echo $(date) $(hostname) finished $f >>${j}_log |
59
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
40 } |
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
41 |
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
42 export -f worker |
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
43 |
61
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
44 parallel \ |
57
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
45 --sshloginfile $hosts \ |
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
46 --retries 3 \ |
60
5fdca5baa4e9
refactor a bit, add support for sac with bespoke mapper
Henry S. Thompson <ht@markup.co.uk>
parents:
59
diff
changeset
|
47 --transferfile $(which $mapper|sed 's/\(^.*\/\)/\1.\//') \ |
57
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
48 --will-cite \ |
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
49 --jobs $cores \ |
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
50 --workdir $wd \ |
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
51 -a input_paths \ |
59
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
52 --env worker \ |
61
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
53 worker '{}' '{#}' "$mapper" "$filter" "$@" 2>errs | grep -v 'Authorized uses only' | tee >(wc -l 1>&2) |\ |
59
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
54 sac_reducer.py "$@" |