annotate master/src/wecu/run_sac.sh @ 64:b91e44355bbf

fix minor argument passing snafus
author Henry S. Thompson <ht@markup.co.uk>
date Wed, 03 Jun 2020 22:08:01 +0000
parents d46c8b12fc04
children e1f61f94b196
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
57
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
#!/bin/bash
# run_sac.sh -- command-line handling
#
# Usage: run_sac.sh numcores hostsFilename workDir mapper [-h keyHandler] [-f filter] [-k numKeys] resType patType patterns
#
# The three optional flags are recognised only in the order -h, -f, -k and
# only directly after the four leading positional arguments.
#
# After this section has run, the globals cores, hosts, wd, mapper,
# keyHandler, filter and numKeys are set and "$@" holds whatever followed the
# options (resType patType patterns..., going by the usage line).

# Keep a record of the invocation in ./cmd.  printf is used instead of echo
# so that an argument such as -n or -e is recorded rather than interpreted.
printf '%s\n' "$*" 1>cmd

#######################################
# Split the script's argument vector into named settings.
# Globals:   cores, hosts, wd, mapper, keyHandler, filter, numKeys,
#            rest_args (all written)
# Arguments: the complete argument list of run_sac.sh
# Returns:   0
#######################################
parse_args () {
  cores=$1
  hosts=$2
  wd=$3
  mapper=$4
  # Drop the four leading positionals; never ask shift for more than exists,
  # because 'shift 4' on a shorter list would remove nothing at all.
  shift "$(( $# < 4 ? $# : 4 ))"

  # Give the optional settings a defined value up front so that nothing
  # inherited from the caller's environment (or an earlier call) survives
  # when the corresponding flag is absent.
  keyHandler=
  numKeys=
  # Default filter is the two-character string "" (literal double quotes).
  # NOTE(review): presumably this keeps the filter slot alive as an empty word
  # once GNU parallel hands the joined command line to a shell -- confirm.
  # NOTE(review): keyHandler has no such placeholder, so an absent -h may
  # shift the worker's positional arguments by one -- verify against callers.
  filter='""'

  if [[ "$1" == "-h" ]]; then
    shift
    keyHandler="$1"
    shift
  fi
  if [[ "$1" == "-f" ]]; then
    shift
    filter="$1"
    shift
  fi
  if [[ "$1" == "-k" ]]; then
    shift
    numKeys="$1"
    shift
  fi

  # Whatever is left belongs to the worker / reducer.
  rest_args=("$@")
  return 0
}

parse_args "$@"
# Re-install the unconsumed arguments as the script's own "$@", which is what
# the rest of the script reads.
set -- "${rest_args[@]}"
57
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
32
59
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
33 # Get quoting right...
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
#######################################
# Process one input file: stream it from the Common Crawl S3 bucket,
# decompress it, run it through the mapper and record the outcome.
# Exported and run once per input line by GNU parallel.
#
# Arguments:
#   $1 - path of the input file, appended to https://commoncrawl.s3.amazonaws.com/
#   $2 - job number; used to name logs/<n>_log and res/<n>.tsv
#   $3 - mapper, executed as ./<mapper> in the current directory
#   $4 - filter: optional command prefix placed in front of the mapper
#        (may be empty)
#   $5 - keyHandler, handed to the mapper as its first argument
#   $6 - resType; discarded, the mapper does not get it
#   $7... - passed through to the mapper unchanged
# Outputs:
#   res/<n>.tsv   - the mapper's stdout
#   logs/<n>_log  - a "start" line, then a "finished"/"failed" line followed
#                   by the pipeline's stderr, indented by one space
# Returns:
#   0 on success, otherwise the exit status of the download/mapper pipeline.
#######################################
worker () {
  set -e
  set -o pipefail
  mkdir -p logs res

  local f=$1 j=$2 mapper=$3 filter=$4 keyHandler=$5
  shift 6   # $6 is the resType: we don't need/want it here

  local me ff
  # 15th character of the host name.
  # NOTE(review): presumably a node identifier on the target cluster -- confirm.
  me=$(hostname | cut -c 15)
  # Short label for the log: path components 4 and 6, minus the CC-MAIN-
  # prefix and everything from .warc onwards.
  ff=$(printf '%s\n' "$f" | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//')
  printf '%s\n' "$(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff" >>"logs/${j}_log"

  export PYTHONIOENCODING=utf-8

  # Inside the substitution stdout goes to the result file and stderr is what
  # gets captured.  The status is collected explicitly with '|| rc=$?' so
  # that 'set -e' cannot abort the function before the diagnostics reach the
  # log.  A scalar (not an unquoted array) keeps the text free of glob
  # expansion and preserves blank lines.
  local captured rc=0
  # $filter is deliberately unquoted: empty means "no prefix", and a filter
  # with arguments must split into words.
  # shellcheck disable=SC2086
  captured=$( { curl -s -N "https://commoncrawl.s3.amazonaws.com/$f" |
                  unpigz -dp 1 -c |
                  $filter "./$mapper" "$keyHandler" "$@" ; } 2>&1 1>"res/${j}.tsv" ) || rc=$?

  local outcome=finished
  if (( rc != 0 )); then
    outcome="failed($rc)"
  fi

  # Header line plus indented stderr go through one sed process and a single
  # append: a hack to try to guarantee an atomic entry in the log.
  { printf '%s\n' "$(date +%Y-%m-%d.%H:%M:%S) $me $outcome $j $ff"
    printf '%s\n' "$captured" ; } | sed '2,$s/^/ /' >>"logs/${j}_log"

  return "$rc"
}
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
60
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
# Make the worker function available to the bash instances that GNU parallel
# starts for the individual jobs (see --env worker below).
export -f worker

# Show the per-job command template and the start time on stderr.
echo worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@" 1>&2

date 1>&2

# Locate the mapper executable.  Without this check an unknown mapper made
# the substitution expand to nothing, and --transferfile then swallowed the
# following option (--will-cite) as its file name.
mapper_path=$(type -P "$mapper") || mapper_path=
if [[ -z "$mapper_path" ]]; then
  printf 'run_sac.sh: mapper "%s" not found on PATH\n' "$mapper" >&2
  exit 1
fi
# Insert /./ in front of the file name (/a/b/m -> /a/b/./m); a path without
# any slash is passed on unchanged.  In a GNU parallel transfer path, the
# part after /./ is taken relative to the work directory on the remote side.
if [[ "$mapper_path" == */* ]]; then
  transfer_spec="${mapper_path%/*}/./${mapper_path##*/}"
else
  transfer_spec=$mapper_path
fi

# One worker invocation per line of ./input_paths, spread over the hosts in
# $hosts with $cores jobs per host; each job's log and result file are copied
# back here and removed remotely afterwards.
parallel \
  --sshloginfile "$hosts" \
  --retries 3 \
  --transferfile "$transfer_spec" \
  --will-cite \
  --jobs "$cores" \
  --workdir "$wd" \
  -a input_paths \
  --env worker \
  --return 'logs/{#}_log' --return 'res/{#}.tsv' --cleanup \
  worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@"
res=$?
# Left unquoted on purpose: keeps the output format of the original line
# (runs of blanks in date's output collapse to single spaces).
# shellcheck disable=SC2046,SC2086
echo $(date) $res

# Combine all per-job results.  $1 is the first argument left after option
# parsing (resType per the usage line); numKeys is only passed when it was
# given with -k.  The reducer runs regardless of parallel's exit status.
cat res/*.tsv | sac_reducer.py "$1" ${numKeys:+"$numKeys"}