annotate master/src/wecu/run_sac.sh @ 63:d46c8b12fc04

support multiple approaches to key combination, use local files to collect results
author Henry S. Thompson <ht@markup.co.uk>
date Wed, 03 Jun 2020 16:40:34 +0000
parents 892e1c0240e1
children b91e44355bbf
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
57
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
#!/bin/bash
# Usage: run_sac.sh numcores hostsFilename workDir mapper keyHandler (-f filter) (-k numKeys) resType patType patterns
#
# NOTE(review): the usage line lists a keyHandler argument after mapper, but
# the parsing below consumes only four leading positionals (numcores,
# hostsFilename, workDir, mapper) before looking for -f / -k.  Confirm which
# of the two is authoritative.

# Record the invocation in ./cmd.  printf is used rather than echo so that an
# argument that happens to look like an echo option is written out verbatim.
printf '%s\n' "$*" >cmd

cores=$1   # passed to parallel --jobs
hosts=$2   # passed to parallel --sshloginfile
wd=$3      # passed to parallel --workdir
mapper=$4  # executable shipped to, and run by, every worker

# Drop the (up to) four leading positionals that were just captured.
shift "$(( $# < 4 ? $# : 4 ))"

# Optional "-f filter".  The default is the two literal characters "" --
# presumably so that the argument survives as an empty word once GNU parallel
# re-evaluates the composed command line through a shell (verify against the
# parallel invocation further down).
filter=\"\"
if [ "$1" = "-f" ]; then
  shift
  filter="$1"
  shift
fi

# Optional "-k numKeys".  Start from an empty value so a numKeys variable
# inherited from the caller's environment can never reach the reducer.
numKeys=
if [ "$1" = "-k" ]; then
  shift
  numKeys="$1"
  shift
fi
57
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
26
59
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
# Get quoting right...
#######################################
# Fetch one file from the Common Crawl S3 bucket, decompress it and stream it
# through the mapper, collecting the mapper's stdout in res/<job>.tsv and a
# start/finish record (plus anything written to stderr) in logs/<job>_log.
# Both paths are relative to the current directory.
# Arguments:
#   $1  path appended to https://commoncrawl.s3.amazonaws.com/
#   $2  job number; names res/$2.tsv and logs/$2_log
#   $3  mapper, run as ./$3
#   $4  filter: optional command word placed in front of the mapper
#       (may be empty)
#   $5  resType; discarded, the mapper does not get it
#   $6… passed to the mapper unchanged
# Returns:
#   0 on success, otherwise the exit status of the failed pipeline.
# Side effects:
#   Turns on errexit and pipefail in the calling shell; exports
#   PYTHONIOENCODING.
#######################################
worker () {
  set -e
  set -o pipefail
  mkdir -p logs res

  local f=$1 j=$2 mapper=$3 filter=$4
  shift 5 # f, j, mapper, filter -- and we don't need/want the resType either

  local me ff
  # 15th character of the host name -- presumably enough to tell the cluster
  # nodes apart; TODO confirm against the host naming scheme.
  me=$(hostname | cut -c 15)
  # Short label for the log: path fields 4 and 6, minus the first "CC-MAIN-"
  # and everything from ".warc" on.
  ff=$(printf '%s\n' "$f" | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//')

  # Empty fields are left out entirely so the record stays single-spaced.
  local who="${me:+$me }" what="$j${ff:+ $ff}"
  printf '%s %sstart %s\n' "$(date +%Y-%m-%d.%H:%M:%S)" "$who" "$what" \
    >>"logs/${j}_log"

  export PYTHONIOENCODING=utf-8

  # Run the pipeline with stdout going to the result file and stderr captured
  # as one string.  Keeping it a string (not an unquoted array assignment)
  # means diagnostic lines are never glob-expanded and blank lines survive.
  local captured status=0
  captured=$(
    # $filter is deliberately unquoted so that an empty filter disappears;
    # it is split on newlines only.
    IFS=$'\n'
    {
      curl -s -N "https://commoncrawl.s3.amazonaws.com/$f" |
        unpigz -dp 1 -c |
        $filter "./$mapper" "$@"
    } 2>&1 1>"res/${j}.tsv"
  ) || status=$?

  local outcome=finished
  (( status == 0 )) || outcome="failed($status)"

  # Hack to try to guarantee an atomic entry in the log: the header and the
  # indented diagnostics go through a single sed process before the append.
  {
    printf '%s %s%s %s\n' "$(date +%Y-%m-%d.%H:%M:%S)" "$who" "$outcome" "$what"
    printf '%s\n' "$captured"
  } | sed '2,$s/^/ /' >>"logs/${j}_log"

  # Hand the pipeline status back so the caller still sees the failure.
  return "$status"
}
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
52
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
# Make the worker function visible to the shells GNU parallel starts.
export -f worker

# Find the mapper executable on PATH.  Fail early when it is missing: an
# empty, unquoted result would otherwise make --transferfile swallow the
# next option as its argument.
mapper_path=$(type -P "$mapper") || mapper_path=
if [ -z "$mapper_path" ]; then
  printf 'run_sac.sh: cannot find mapper on PATH: %s\n' "$mapper" >&2
  exit 1
fi

# Insert "./" in front of the file name (/a/b/m -> /a/b/./m).  GNU parallel
# documents "/./" in a transferred path as making the remainder relative to
# the work dir, which is where worker runs it from as ./$mapper.
case $mapper_path in
  */*) transfer="${mapper_path%/*}/./${mapper_path##*/}" ;;
  *)   transfer=$mapper_path ;;
esac

# Run one worker per line of ./input_paths across the hosts in $hosts,
# bringing each job's log and result file back to this machine.  "$@" here is
# whatever remains after option parsing (resType, then the mapper arguments).
parallel \
  --sshloginfile "$hosts" \
  --retries 3 \
  --transferfile "$transfer" \
  --will-cite \
  --jobs "$cores" \
  --workdir "$wd" \
  -a input_paths \
  --env worker \
  --return 'logs/{#}_log' --return 'res/{#}.tsv' --cleanup \
  worker '{}' '{#}' "$mapper" "$filter" "$@"

# Combine the per-job results.  The reducer gets resType and, only when -k
# was given, the key count; an empty value is omitted rather than passed as
# an empty argument.
cat res/*.tsv | sac_reducer.py ${1:+"$1"} ${numKeys:+"$numKeys"}