annotate master/src/wecu/run_sac.sh @ 68:1f04bce6ead7 default tip

use basefile instead of transferfile, and remove cleanup: belt and braces wrt lossage of sac_schemes.py in 15% of 1000_k3, this as used in a_2
author Henry S. Thompson <ht@markup.co.uk>
date Thu, 04 Jun 2020 20:44:44 +0000
parents e1f61f94b196
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
#!/bin/bash
# Usage: run_sac.sh numcores hostsFilename workDir mapper (-h keyHandler) (-f filter) (-k numKeys) resType patType patterns
#
# The optional flags are recognised only in the order shown above.  Whatever
# is left in "$@" once parsing is finished (resType patType patterns) is
# consumed further down the script.

# Keep a record of how we were invoked in ./cmd
echo "$@" 1>cmd

cores=$1
hosts=$2
wd=$3
mapper=$4
# Drop the four leading positionals one at a time, so that a short argument
# list is consumed as far as it goes.
shift
shift
shift
shift

# Start from known values: without this, omitting -h or -k would let a value
# of the same name inherited from the calling environment slip through.
keyHandler=
numKeys=

if [ "$1" = "-h" ]; then
  shift
  keyHandler="$1"
  shift
fi

if [ "$1" = "-f" ]; then
  shift
  filter="$1"
  shift
else
  # The default is a literal pair of double-quote characters, not an empty
  # string.
  # NOTE(review): presumably this keeps an "empty" filter occupying its own
  # argument slot once GNU parallel re-parses the command line -- confirm.
  # NOTE(review): keyHandler has no such placeholder; check whether the
  # positional arguments still line up when -h is omitted.
  filter=\"\"
fi

if [ "$1" = "-k" ]; then
  shift
  numKeys="$1"
  shift
fi
# lrand N
#
# Cheap little random number generator: print an integer between 1 and N
# (inclusive) on stdout.
#
# Arguments: $1 - upper bound N, a positive decimal integer
# Returns:   0 on success; 1 (with a message on stderr and nothing on
#            stdout) when N is missing, not a number, or less than 1
#
# Uses Bash's built-in RANDOM (0..32767), so no external programs are
# needed; bounds above 32768 are accepted but values never exceed 32768.
lrand () {
  local bound=$1
  case $bound in
    '' | *[!0-9]*)
      echo "lrand: expected a positive integer, got '$bound'" 1>&2
      return 1
      ;;
  esac
  # Force base 10 so that a leading zero is not read as octal.
  bound=$(( 10#$bound ))
  if (( bound < 1 )); then
    echo "lrand: bound must be at least 1" 1>&2
    return 1
  fi
  echo $(( 1 + RANDOM % bound ))
}
# tryread URL FILE
#
# Download URL into FILE with curl, making up to 5 attempts.
#
# Arguments: $1 - URL to fetch
#            $2 - path of the file to write
# Outputs:   progress and retry messages on stderr
# Returns:   0 as soon as one attempt succeeds, 1 after 5 failed attempts
#
# Differences from the previous version:
#   * shell options are left alone -- the old trailing 'set +o pipefail'
#     switched pipefail off for the caller as well;
#   * all variables are local, so the caller's $f is no longer overwritten;
#   * curl is given --fail, so an HTTP error response counts as a failed
#     attempt (and is retried) instead of being saved as the payload.
tryread () {
  local url=$1 dest=$2
  local attempt=0 status
  while :; do
    if (( ++attempt > 5 )); then
      echo " tried 5 times w/o success, giving up" 1>&2
      return 1
    fi
    # $(date) is deliberately unquoted in the echo calls below so the log
    # text is exactly what it used to be (runs of spaces collapse).
    echo $(date) "Reading $url ..." 1>&2
    # NOTE(review): --insecure turns off TLS certificate verification; kept
    # from the original -- confirm it is still required.
    if curl -s -S --fail --max-time 60 --insecure -o "$dest" "$url"; then
      echo " done at " $(date) 1>&2
      return 0
    else
      status=$?
    fi
    echo \# "$status" 1>&2
    # try to avoid lockstep retries
    sleep "$(lrand 10)"
    echo \# $(date) retry number "$attempt" 1>&2
  done
}
# Get quoting right...
#
# worker PATH JOB MAPPER FILTER KEYHANDLER RESTYPE [MAPPER-ARGS...]
#
# Process one input file: download it, decompress it, run it through the
# mapper, and record what happened.
#
# Arguments: $1 - path appended to https://commoncrawl.s3.amazonaws.com/
#            $2 - job number; names ${j}.gz, res/${j}.tsv and logs/${j}_log
#            $3 - mapper, run as ./MAPPER from the current directory
#            $4 - optional command prefix placed in front of the mapper
#                 (may be empty)
#            $5 - key handler, passed to the mapper as its first argument
#            $6 - resType, discarded here
#            remaining arguments are passed to the mapper unchanged
# Outputs:   mapper stdout goes to res/${j}.tsv; a start line, a finish line
#            (ending in the processing exit status) and any stderr output
#            are appended to logs/${j}_log
# Returns:   the status of the final logging pipeline -- NOT the processing
#            status, which is only written to the log
worker () {
  set -o pipefail
  mkdir -p logs
  mkdir -p res
  local f=$1 j=$2 mapper=$3 filter=$4 keyHandler=$5
  # Drop the five named arguments plus resType, or as many as were given.
  shift $(( $# < 6 ? $# : 6 ))
  local me ff captured subres line
  local -a stderr=()

  me=$(hostname | cut -c 15)
  ff=$(printf '%s\n' "$f" | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//')
  echo "$(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff" >>"logs/${j}_log"
  export PYTHONIOENCODING=utf-8

  # Run the download and the decompress|map pipeline in a subshell, with
  # stdout going to the result file and stderr captured for the log.
  captured=$( { set -e
      #curl -s -N -o ${j}.gz https://commoncrawl.s3.amazonaws.com/$f
      tryread "https://commoncrawl.s3.amazonaws.com/$f" "${j}.gz"
      # Re-assert pipefail in case tryread changed it, so that a
      # decompression failure is not hidden by a successful mapper.
      set -o pipefail
      # $filter is deliberately unquoted: it may be empty (and must then
      # vanish) or consist of several words.
      # shellcheck disable=SC2086
      unpigz -dp 1 -c "${j}.gz" |
        $filter "./$mapper" "$keyHandler" "$@"
    } 2>&1 1>"res/${j}.tsv" )
  subres=$?

  # Split the captured stderr into its non-empty lines without exposing
  # them to pathname expansion.
  while IFS= read -r line; do
    if [[ -n $line ]]; then
      stderr+=("$line")
    fi
  done <<<"$captured"

  # -f: the file may not exist if the download never got going.
  rm -f -- "${j}.gz"

  # Hack to try to guarantee an atomic entry in the log: the finish line
  # and the (indented) stderr lines are written by a single sed process.
  # Pbly not necessary with current sub-structure...
  { echo "$(date +%Y-%m-%d.%H:%M:%S) $me finished $j $ff $subres"
    printf '%s\n' "${stderr[@]}" ; } | sed '2,$s/^/ /' >>"logs/${j}_log"
}
# Export the helper functions; they are also named in the --env options
# passed to parallel below.
export -f worker tryread lrand

# Show the command template that is about to be handed to parallel.
echo worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@" 1>&2

# Locate the mapper on PATH up front.  If the lookup produced nothing,
# --basefile would silently swallow the option that follows it.
mapper_path=$(type -P "$mapper") || {
  echo "run_sac.sh: cannot find mapper '$mapper' on PATH" 1>&2
  exit 1
}
# Insert "./" in front of the file name: /some/dir/m -> /some/dir/./m
basefile=$(printf '%s\n' "$mapper_path" | sed 's/\(^.*\/\)/\1.\//')

# $(date) is deliberately unquoted in the progress messages so their text
# is exactly what it used to be (runs of spaces collapse).
echo starting... $(date) 1>&2
parallel \
  --joblog parlog.txt \
  --sshloginfile "$hosts" \
  --retries 3 \
  --basefile "$basefile" \
  --will-cite \
  --jobs "$cores" \
  --workdir "$wd" \
  -a input_paths \
  --env worker --env tryread --env lrand \
  --return 'logs/{#}_log' --return 'res/{#}.tsv' \
  worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@"
res=$?
echo reducing... $(date) pres=$res 1>&2
# resType and numKeys are passed only when non-empty, as before.
cat res/*.tsv | sac_reducer.py ${1:+"$1"} ${numKeys:+"$numKeys"}
echo done $(date) 1>&2