Mercurial > hg > cc > azure
annotate master/src/wecu/run_sac.sh @ 68:1f04bce6ead7 default tip
use basefile instead of transferfile, and remove cleanup: belt and braces wrt lossage of sac_schemes.py in 15% of 1000_k3,
this as used in a_2
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Thu, 04 Jun 2020 20:44:44 +0000 |
parents | e1f61f94b196 |
children |
rev | line source |
---|---|
57
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
#!/bin/bash
# Usage: run_sac.sh numcores hostsFilename workDir mapper (-h keyHandler) (-f filter) (-k numKeys) resType patType patterns
#
# Preamble: record the command line and pick apart the arguments.
#   cores, hosts, wd, mapper   the four leading positional arguments
#   keyHandler, filter         values of -h / -f; default to a literal ""
#   numKeys                    value of -k; empty when not given
# After this section "$@" holds what follows the options
# (resType patType patterns per the usage line above).

# Keep a copy of the full command line in the file "cmd".
echo "$@" 1>cmd

cores=$1
hosts=$2
wd=$3
mapper=$4
# Drop the four leading arguments (or however many were actually given).
shift "$(( $# < 4 ? $# : 4 ))"

# Both defaults are a *literal* pair of double quotes, not an empty string:
# the worker command line is re-parsed by a shell on its way through
# parallel, where an empty word would disappear and shift every later
# positional argument by one, whereas "" survives as an empty argument.
keyHandler=\"\"
filter=\"\"
numKeys=

# Options may appear in any order; the first non-option word ends parsing.
while [ "$#" -gt 0 ]; do
  case "$1" in
    -h)
      shift
      keyHandler=${1-}
      ;;
    -f)
      shift
      filter=${1-}
      ;;
    -k)
      shift
      numKeys=${1-}
      ;;
    *)
      break
      ;;
  esac
  # Consume the option's value, if one was supplied.
  if [ "$#" -gt 0 ]; then
    shift
  fi
done
57
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
32 |
65
e1f61f94b196
switch to curl->file, enable retries
Henry S. Thompson <ht@markup.co.uk>
parents:
64
diff
changeset
|
# lrand N
# Cheap, low-quality random number generator: prints an integer in 1..N
# on stdout.  The randomness comes from a single byte, so results never
# exceed 256 when openssl is the source.
#   $1 - upper bound N; values below 1 (or unusable values) are treated as 1
lrand () {
  local limit=${1:-1}
  local byte

  # One random byte rendered as an unsigned decimal; -An drops od's
  # address column so nothing but the number is left after tr.
  byte=$(openssl rand 1 2>/dev/null | od -An -tu1 | tr -d ' \n') || byte=''

  # openssl missing or failing leaves nothing numeric: fall back to the
  # shell's own generator rather than feeding garbage to the arithmetic.
  case $byte in
    '' | *[!0-9]*) byte=$RANDOM ;;
  esac

  # Guard the modulus against zero, negative or malformed bounds.
  (( limit >= 1 )) 2>/dev/null || limit=1

  echo $(( 1 + byte % limit ))
}
e1f61f94b196
switch to curl->file, enable retries
Henry S. Thompson <ht@markup.co.uk>
parents:
64
diff
changeset
|
37 |
e1f61f94b196
switch to curl->file, enable retries
Henry S. Thompson <ht@markup.co.uk>
parents:
64
diff
changeset
|
# tryread URL FILE
# Download URL into FILE with curl, making up to 5 attempts.
#   $1 - URL to fetch
#   $2 - path of the file to write
# Progress and errors are reported on stderr.
# Returns 0 as soon as one attempt succeeds, 1 after 5 failed attempts.
# Shell options of the caller (pipefail in particular) are left untouched.
tryread () {
  local url=$1 dest=$2
  local attempt rc

  for (( attempt = 1; attempt <= 5; attempt++ )); do
    echo "$(date)" "Reading $url ..." 1>&2
    rc=0
    # --fail makes HTTP error responses (4xx/5xx) count as failures, so
    # they are retried instead of being saved to FILE as if they were data.
    # NOTE(review): --insecure turns off TLS certificate verification.
    curl -s -S --fail --max-time 60 --insecure -o "$dest" "$url" || rc=$?
    if (( rc == 0 )); then
      echo " done at " "$(date)" 1>&2
      return 0
    fi
    echo "# $rc" 1>&2
    if (( attempt < 5 )); then
      # try to avoid lockstep retries
      sleep "$(lrand 10)"
      echo "# $(date) retry number $attempt" 1>&2
    fi
  done

  echo " tried 5 times w/o success, giving up" 1>&2
  return 1
}
e1f61f94b196
switch to curl->file, enable retries
Henry S. Thompson <ht@markup.co.uk>
parents:
64
diff
changeset
|
54 |
59
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
55 # Get quoting right... |
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
# worker PATH JOBNUM MAPPER FILTER KEYHANDLER RESTYPE [MAPPER-ARGS...]
# Process one input file: download it, decompress it, and pipe it through
# the (optional) filter and the mapper.
#   $1 - path of the input, appended to https://commoncrawl.s3.amazonaws.com/
#   $2 - job number; names the download, result and log files
#   $3 - mapper, run as ./MAPPER from the current directory
#   $4 - filter command placed in front of the mapper; may be empty
#   $5 - key handler, passed to the mapper as its first argument
#   $6 - resType, ignored here
#   remaining arguments are handed to the mapper after the key handler
# Writes the mapper's stdout to res/JOBNUM.tsv and appends a start entry
# and a finished entry (with exit status and captured stderr) to
# logs/JOBNUM_log.
# NOTE(review): the function's own return status is that of the final
# logging pipeline, not of the job; the job's status is only recorded in
# the log.  Confirm that this is what the caller wants.
worker () {
  set -o pipefail
  mkdir -p logs
  mkdir -p res

  local f=$1 j=$2
  local mapper=$3 filter=$4 keyHandler=$5
  # Drop the five arguments above plus resType, which is not needed here.
  shift "$(( $# < 6 ? $# : 6 ))"

  local me ff err subres
  me=$(hostname | cut -c 15)
  ff=$(printf '%s\n' "$f" | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//')
  echo "$(date +%Y-%m-%d.%H:%M:%S)" ${me:+"$me"} start "$j" "$ff" >>"logs/${j}_log"

  export PYTHONIOENCODING=utf-8

  # Run the job in a subshell with stdout going to the result file and
  # stderr captured verbatim into $err.  Holding it as a single string
  # avoids word splitting and glob expansion of the error text.
  err=$( { set -e
      tryread "https://commoncrawl.s3.amazonaws.com/$f" "${j}.gz"
      # Re-assert pipefail: the download helper may change shell options,
      # and the pipeline below must fail when decompression fails.
      set -o pipefail
      # $filter is deliberately unquoted: empty means "no filter", and a
      # filter with arguments is split into words (default IFS).
      unpigz -dp 1 -c "${j}.gz" |
        $filter "./$mapper" "$keyHandler" "$@"
    } 2>&1 1>"res/${j}.tsv" )
  subres=$?

  rm -f -- "${j}.gz"

  # Emit the finished line and the indented stderr through one pipeline,
  # a hack to try to guarantee an atomic entry in the log.
  { echo "$(date +%Y-%m-%d.%H:%M:%S)" ${me:+"$me"} finished "$j" "$ff" "$subres"
    printf '%s\n' "$err" ; } | sed '2,$s/^/ /' >>"logs/${j}_log"
}
8332faef25e1
get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents:
58
diff
changeset
|
87 |
65
e1f61f94b196
switch to curl->file, enable retries
Henry S. Thompson <ht@markup.co.uk>
parents:
64
diff
changeset
|
# Make the helper functions available to the shells that parallel starts.
export -f worker tryread lrand

# Show the worker command line that is about to be used.
echo worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@" 1>&2

# Locate the mapper and build the --basefile option for it.  The path is
# rewritten from DIR/NAME to DIR/./NAME before being handed to parallel.
# If the mapper cannot be found the option is left out altogether, so it
# cannot swallow the option that follows it.
basefile_opt=()
mapper_path=$(command -v "$mapper")
if [ -n "$mapper_path" ]; then
  case $mapper_path in
    */*) basefile_opt=(--basefile "${mapper_path%/*}/./${mapper_path##*/}") ;;
    *) basefile_opt=(--basefile "$mapper_path") ;;
  esac
else
  echo "warning: mapper '$mapper' not found, not using --basefile" 1>&2
fi

echo starting... "$(date)" 1>&2
parallel \
  --joblog parlog.txt \
  --sshloginfile "$hosts" \
  --retries 3 \
  "${basefile_opt[@]}" \
  --will-cite \
  --jobs "$cores" \
  --workdir "$wd" \
  -a input_paths \
  --env worker --env tryread --env lrand \
  --return 'logs/{#}_log' --return 'res/{#}.tsv' \
  worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@"
res=$?

# The status from parallel is only reported, not acted upon.
echo reducing... "$(date)" pres=$res 1>&2
# $1 is the first argument left after option parsing (resType in the
# usage line); it and numKeys are passed only when non-empty.
cat res/*.tsv | sac_reducer.py ${1:+"$1"} ${numKeys:+"$numKeys"}
echo done "$(date)" 1>&2