Mercurial > hg > cc > azure
annotate master/src/wecu/run_sac.sh @ 60:5fdca5baa4e9
refactor a bit, add support for sac with bespoke mapper
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Thu, 28 May 2020 12:55:03 +0000 |
parents | 8332faef25e1 |
children | cfaf5223b071 |
rev | line source |
---|---|
57
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
1 #!/bin/bash |
60
5fdca5baa4e9
refactor a bit, add support for sac with bespoke mapper
Henry S. Thompson <ht@markup.co.uk>
parents:
59
diff
changeset
|
# Usage: run_sac.sh numcores hostsFilename workDir map resType patType patterns
#
#   numcores       value handed to `parallel --jobs`
#   hostsFilename  file handed to `parallel --sshloginfile`
#   workDir        directory handed to `parallel --workdir`
#   map            name of the mapper executable each worker runs
#   resType ...    everything after the first four arguments stays in "$@" and
#                  is forwarded to the workers and to sac_reducer.py
#
# Refuse to start without the four leading arguments: with any of them missing
# the parallel invocation further down would be built from empty values.
if (( $# < 4 )); then
  printf 'Usage: %s numcores hostsFilename workDir map resType patType patterns\n' \
    "${0##*/}" >&2
  exit 2
fi

cores=$1
hosts=$2
wd=$3
mapper=$4
# Drop the four arguments consumed above; "$@" now starts at resType.
shift 4

# Remove output left over from a previous run; the job output is appended to
# this file (tee -a) further down.
rm -f -- allout
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
12 |
59
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
13 # Get quoting right... |
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
# Fetch one Common Crawl file, decompress it and stream it through the mapper.
# Arguments:
#   $1     path of the file below https://commoncrawl.s3.amazonaws.com/
#   $2     name of the mapper executable; run as ./<name> from the current dir
#   $3     resType -- discarded, the mapper does not need/want it
#   $4...  passed verbatim to the mapper
# Outputs:
#   stdout: the mapper's stdout and stderr combined (2>&1)
#   stderr: the host name and the line count of the decompressed stream
# Returns:
#   exit status of the last pipeline stage (the mapper)
worker () {
  # local: keep these out of the caller's scope (the script has its own $mapper)
  local f=$1
  local mapper=$2
  # Drop path, mapper name and resType so that "$@" is exactly the mapper's
  # argument list. Clamp the count: `shift 3` would shift nothing at all when
  # fewer than three arguments were supplied.
  shift $(( $# < 3 ? $# : 3 ))
  hostname 1>&2
  export PYTHONIOENCODING=utf-8
  # Quote the URL and the mapper path so names with whitespace or glob
  # characters stay single words.
  curl -s -N "https://commoncrawl.s3.amazonaws.com/$f" | \
   unpigz -dp 1 -c | tee >(wc -l 1>&2) | "./$mapper" "$@" 2>&1
}
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
25 |
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
# Export the function definition so that `--env worker` below can hand it to
# the shells GNU parallel starts for each job.
export -f worker

# Resolve the mapper on PATH once, up front. If it cannot be found, stop here:
# an empty value would otherwise make --transferfile consume the next option.
mapper_path=$(type -P "$mapper")
if [[ -z "$mapper_path" ]]; then
  printf '%s: cannot find mapper "%s"\n' "${0##*/}" "$mapper" >&2
  exit 1
fi
# Insert "./" after the last slash (/a/b/m -> /a/b/./m). GNU parallel treats
# the part after "/./" as relative to the work dir, which is where worker
# expects to find the mapper (it runs ./<mapper>).
transfer_path=$(printf '%s\n' "$mapper_path" | sed 's/\(^.*\/\)/\1.\//')

# Run worker once per line of input_paths across the hosts in $hosts, keep a
# raw copy of everything in allout, filter out lines containing
# 'Authorized uses only' (presumably a login banner -- confirm), and feed the
# rest to the reducer together with the remaining command-line arguments.
parallel -v \
    --sshloginfile "$hosts" \
    --retries 3 \
    --transferfile "$transfer_path" \
    --will-cite \
    --jobs "$cores" \
    --workdir "$wd" \
    -a input_paths \
    --env worker \
    worker '{}' "$mapper" "$@" \
  | tee -a allout \
  | grep -v 'Authorized uses only' \
  | sac_reducer.py "$@"