annotate master/src/wecu/run_sac.sh @ 60:5fdca5baa4e9

refactor a bit, add support for sac with bespoke mapper
author Henry S. Thompson <ht@markup.co.uk>
date Thu, 28 May 2020 12:55:03 +0000
parents 8332faef25e1
children cfaf5223b071
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
57
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
#!/bin/bash
# Usage: run_sac.sh numcores hostsFilename workDir map resType patType patterns
#
#   numcores       value given to `parallel --jobs`
#   hostsFilename  value given to `parallel --sshloginfile`
#   workDir        value given to `parallel --workdir`
#   map            mapper executable; located on PATH and shipped to the
#                  remote hosts with `parallel --transferfile`
#   resType ...    everything from the fifth argument on stays in "$@" and is
#                  passed unchanged to the worker and to sac_reducer.py

# The four leading arguments are consumed positionally below; refuse to run
# without them instead of continuing with empty settings.
if (( $# < 4 )); then
  printf 'Usage: %s numcores hostsFilename workDir map resType patType patterns\n' \
    "${0##*/}" >&2
  exit 2
fi

cores=$1
hosts=$2
wd=$3
mapper=$4
# Leave only resType/patType/patterns in "$@".
shift 4

# allout is appended to (tee -a) by the main pipeline, so start from scratch.
rm -f allout
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
12
59
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
13 # Get quoting right...
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
#######################################
# Process one input file: download it, decompress it and feed it to the
# mapper.  Exported so that GNU parallel can run it (see `--env worker`).
# Arguments:
#   $1    path of the file, appended to https://commoncrawl.s3.amazonaws.com/
#   $2    mapper executable, run as ./$2 from the current directory
#   $3    resType; dropped, the mapper does not get it
#   $4..  passed to the mapper unchanged
# Outputs:
#   stdout: the mapper's stdout and stderr, merged
#   stderr: the host name, then the line count of the decompressed input
# Returns:
#   Exit status of the mapper (last pipeline stage; pipefail is not set
#   here, so a failed download or decompression is not reflected).
#######################################
worker () {
  local f=$1; shift
  local mapper=$1; shift
  shift # we don't need/want the resType either
  hostname >&2
  export PYTHONIOENCODING=utf-8
  # tee duplicates the decompressed stream into `wc -l` so that the line
  # count is reported on stderr without disturbing the mapper's input.
  curl -s -N "https://commoncrawl.s3.amazonaws.com/$f" |
    unpigz -dp 1 -c |
    tee >(wc -l 1>&2) |
    "./$mapper" "$@" 2>&1
}

export -f worker
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
27
57
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
# Locate the mapper on PATH.  Fail loudly if it is missing: an empty value
# here would make --transferfile consume the next option as its file name.
mapper_path=$(type -P "$mapper") || {
  printf '%s: mapper not found on PATH: %s\n' "${0##*/}" "$mapper" >&2
  exit 1
}

# Insert "/./" in front of the file name (/a/b/m -> /a/b/./m).  GNU parallel
# treats the part after "/./" as relative to the remote work dir, which is
# where worker expects to find the mapper (it runs ./$mapper).
case $mapper_path in
  */*) transfer=${mapper_path%/*}/./${mapper_path##*/} ;;
  *)   transfer=$mapper_path ;;
esac

# Run worker once per line of input_paths across the hosts in $hosts, keep a
# raw copy of everything in allout, drop the login banner lines and hand the
# rest to the reducer together with the resType/patType/patterns arguments.
parallel -v \
  --sshloginfile "$hosts" \
  --retries 3 \
  --transferfile "$transfer" \
  --will-cite \
  --jobs "$cores" \
  --workdir "$wd" \
  -a input_paths \
  --env worker \
  worker '{}' "$mapper" "$@" |
  tee -a allout |
  grep -v 'Authorized uses only' |
  sac_reducer.py "$@"