Mercurial > hg > cc > azure
annotate master/src/wecu/run_sac.sh @ 63:d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Wed, 03 Jun 2020 16:40:34 +0000 |
parents | 892e1c0240e1 |
children | b91e44355bbf |
rev | line source |
---|---|
57
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
1 #!/bin/bash |
63
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
2 # Usage: run_sac.sh numcores hostsFilename workDir mapper keyHandler (-f filter) (-k numKeys) resType patType patterns |
62
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
3 echo "$@" 1>cmd |
57
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
4 cores=$1 |
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
5 hosts=$2 |
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
6 wd=$3 |
60
5fdca5baa4e9
refactor a bit, add support for sac with bespoke mapper
Henry S. Thompson <ht@markup.co.uk>
parents:
59
diff
changeset
|
7 mapper=$4 |
5fdca5baa4e9
refactor a bit, add support for sac with bespoke mapper
Henry S. Thompson <ht@markup.co.uk>
parents:
59
diff
changeset
|
8 shift |
57
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
9 shift |
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
10 shift |
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
11 shift |
61
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
12 if [ "$1" = "-f" ] |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
13 then |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
14 shift |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
15 filter="$1" |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
16 shift |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
17 else |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
18 filter=\"\" |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
19 fi |
62
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
20 if [ "$1" = "-k" ] |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
21 then |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
22 shift |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
23 numKeys="$1" |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
24 shift |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
25 fi |
57
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
26 |
59
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
27 # Get quoting right... |
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
28 worker () { |
61
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
29 set -e |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
30 set -o pipefail |
62
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
31 mkdir -p logs |
63
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
32 mkdir -p res |
59
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
33 f=$1 |
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
34 shift |
61
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
35 j=$1 |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
36 shift |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
37 mapper="$1" |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
38 shift |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
39 filter="$1" |
60
5fdca5baa4e9
refactor a bit, add support for sac with bespoke mapper
Henry S. Thompson <ht@markup.co.uk>
parents:
59
diff
changeset
|
40 shift |
59
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
41 shift # we don't need/want the resType either |
62
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
42 me=$(hostname | cut -c 15) |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
43 ff=$(echo $f | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//') |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
44 echo $(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff >>logs/${j}_log |
59
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
45 export PYTHONIOENCODING=utf-8 |
62
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
46 { IFS=$'\n' ; stderr=( $( { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \ |
63
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
47 unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>&1 1>res/${j}.tsv ; ) ) ; unset IFS ; } |
62
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
48 { echo $(date +%Y-%m-%d.%H:%M:%S) $me finished $j $ff |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
49 printf '%s\n' "${stderr[@]}" ; } | sed '2,$s/^/ /' >>logs/${j}_log # hack to try to |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
50 # guarantee atomic entry in the log |
59
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
51 } |
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
52 |
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
53 export -f worker |
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
54 |
61
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
55 parallel \ |
57
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
56 --sshloginfile $hosts \ |
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
57 --retries 3 \ |
60
5fdca5baa4e9
refactor a bit, add support for sac with bespoke mapper
Henry S. Thompson <ht@markup.co.uk>
parents:
59
diff
changeset
|
58 --transferfile $(which $mapper|sed 's/\(^.*\/\)/\1.\//') \ |
57
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
59 --will-cite \ |
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
60 --jobs $cores \ |
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
61 --workdir $wd \ |
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
62 -a input_paths \ |
59
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
63 --env worker \ |
63
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
64 --return 'logs/{#}_log' --return 'res/{#}.tsv' --cleanup \ |
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
65 worker '{}' '{#}' "$mapper" "$filter" "$@" |
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
66 cat res/*.tsv | sac_reducer.py $1 $numKeys |