annotate master/src/wecu/run_sac.sh @ 62:892e1c0240e1

added more robust (I hope) error handling, got reducer working with support for choosing dict or tsv output
author Henry S. Thompson <ht@markup.co.uk>
date Tue, 02 Jun 2020 17:35:07 +0000
parents cfaf5223b071
children d46c8b12fc04
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
57
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
1 #!/bin/bash
62
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
2 # Usage: run_sac.sh numcores hostsFilename workDir mapper (-f filter) (-k numKeys) resType patType patterns -- the parenthesised -f and -k options may be omitted; when both are given, -f must come first
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
3 echo "$@" 1>cmd # record the full argument list of this invocation in a file named cmd in the current directory
57
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
4 cores=$1 # numcores: handed to parallel as --jobs
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
5 hosts=$2 # hostsFilename: handed to parallel as --sshloginfile
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
6 wd=$3 # workDir: handed to parallel as --workdir
60
5fdca5baa4e9 refactor a bit, add support for sac with bespoke mapper
Henry S. Thompson <ht@markup.co.uk>
parents: 59
diff changeset
7 mapper=$4 # mapper: looked up with which for --transferfile and run as ./$mapper by worker
5fdca5baa4e9 refactor a bit, add support for sac with bespoke mapper
Henry S. Thompson <ht@markup.co.uk>
parents: 59
diff changeset
8 shift # this and the next three shifts drop the four positional arguments captured above
57
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
9 shift
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
10 shift
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
11 shift
61
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
12 if [ "$1" = "-f" ] # optional filter: -f followed by the filter command
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
13 then
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
14 shift # drop the -f flag itself
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
15 filter="$1" # worker places this, unquoted, in front of ./$mapper
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
16 shift
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
17 else
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
18 filter=\"\" # a literal pair of double-quote characters, not an empty string -- NOTE(review): presumably so the empty filter still occupies its argument slot once parallel rebuilds the worker command line; confirm before changing
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
19 fi
62
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
20 if [ "$1" = "-k" ] # optional key count: -k followed by numKeys; numKeys is not assigned here when -k is absent
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
21 then
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
22 shift # drop the -k flag itself
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
23 numKeys="$1" # forwarded as the second argument of sac_reducer.py
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
24 shift # what remains in the argument list is resType patType patterns, per the Usage line
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
25 fi
57
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
26
59
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
27 # Get quoting right...
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
28 worker () { # worker FILE JOBNUM MAPPER FILTER RESTYPE ARGS...: fetch FILE from the commoncrawl S3 URL, decompress it, pipe it through FILTER ./MAPPER ARGS... to stdout, and append start/finish lines plus the captured stderr to logs/JOBNUM_log
61
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
29 set -e
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
30 set -o pipefail # a non-zero exit from any stage makes the whole pipeline non-zero
62
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
31 mkdir -p logs # log directory, relative to the directory the job runs in
59
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
32 f=$1 # path appended to the commoncrawl S3 URL below; the caller passes the {} replacement string of parallel here
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
33 shift
61
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
34 j=$1 # job tag naming the log file of this job; the caller passes the {#} replacement string of parallel here
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
35 shift
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
36 mapper="$1" # executed as ./$mapper, i.e. looked up in the current directory
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
37 shift
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
38 filter="$1" # expanded unquoted in front of ./$mapper, so an empty value simply disappears
60
5fdca5baa4e9 refactor a bit, add support for sac with bespoke mapper
Henry S. Thompson <ht@markup.co.uk>
parents: 59
diff changeset
39 shift
59
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
40 shift # we don't need/want the resType either; the remaining arguments are handed to the mapper unchanged
62
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
41 me=$(hostname | cut -c 15) # keeps only the 15th character of the hostname -- NOTE(review): presumably a per-node identifier under a site-specific host naming scheme; confirm
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
42 ff=$(echo $f | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//') # short label for the log: slash-separated fields 4 and 6 of $f, minus the first CC-MAIN- and everything from .warc onwards
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
43 echo $(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff >>logs/${j}_log # timestamped start entry
59
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
44 export PYTHONIOENCODING=utf-8
62
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
45 { IFS=$'\n' ; stderr=( $( { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
46 unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>&1 1>&4 ; ) ) ; unset IFS ; } 4>&1 # fd 4 is a copy of the real stdout, so mapper output bypasses the command substitution; only the stderr of the pipeline is captured, split on newlines into the stderr array
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
47 { echo $(date +%Y-%m-%d.%H:%M:%S) $me finished $j $ff
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
48 printf '%s\n' "${stderr[@]}" ; } | sed '2,$s/^/ /' >>logs/${j}_log # hack to try to
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
49 # guarantee atomic entry in the log: the finished line and the captured stderr lines (indented by one space from line 2 on) reach the log through a single sed
59
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
50 }
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
51
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
52 export -f worker # export the function definition; the parallel call below names it with --env worker and runs it as the job command
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
53
61
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents: 60
diff changeset
54 parallel \
57
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
55 --sshloginfile $hosts \
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
56 --retries 3 \
60
5fdca5baa4e9 refactor a bit, add support for sac with bespoke mapper
Henry S. Thompson <ht@markup.co.uk>
parents: 59
diff changeset
57 --transferfile $(which $mapper|sed 's/\(^.*\/\)/\1.\//') \
57
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
58 --will-cite \
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
59 --jobs $cores \
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
60 --workdir $wd \
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
61 -a input_paths \
59
8332faef25e1 get quoting and arg positions right
Henry S. Thompson <ht@markup.co.uk>
parents: 58
diff changeset
62 --env worker \
62
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
63 --return 'logs/{#}_log' --cleanup \
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
64 worker '{}' '{#}' "$mapper" "$filter" "$@" | tee >(wc -l 1>&2) |\
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
65 sac_reducer.py $1 $numKeys # end of the pipeline: combined worker stdout is line-counted to stderr by tee/wc and fed to the reducer; $1 is resType per the Usage line, and the unquoted $numKeys contributes no argument at all when -k was not given