comparison master/src/wecu/run_sac.sh @ 62:892e1c0240e1

added more robust (I hope) error handling, got reducer working with support for choosing dict or tsv output
author Henry S. Thompson <ht@markup.co.uk>
date Tue, 02 Jun 2020 17:35:07 +0000
parents cfaf5223b071
children d46c8b12fc04
comparison
equal deleted inserted replaced
61:cfaf5223b071 62:892e1c0240e1
1 #!/bin/bash 1 #!/bin/bash
2 # Usage: run_sac.sh numcores hostsFilename workDir mapper (-f filter) resType patType patterns 2 # Usage: run_sac.sh numcores hostsFilename workDir mapper (-f filter) (-k numKeys) resType patType patterns
3 echo "$@" 1>cmd
3 cores=$1 4 cores=$1
4 hosts=$2 5 hosts=$2
5 wd=$3 6 wd=$3
6 mapper=$4 7 mapper=$4
7 shift 8 shift
14 filter="$1" 15 filter="$1"
15 shift 16 shift
16 else 17 else
17 filter=\"\" 18 filter=\"\"
18 fi 19 fi
19 20 if [ "$1" = "-k" ]
20 rm -f allout 21 then
22 shift
23 numKeys="$1"
24 shift
25 fi
21 26
22 # Get quoting right... 27 # Get quoting right...
23 worker () { 28 worker () {
24 set -e 29 set -e
25 set -o pipefail 30 set -o pipefail
31 mkdir -p logs
26 f=$1 32 f=$1
27 shift 33 shift
28 j=$1 34 j=$1
29 shift 35 shift
30 mapper="$1" 36 mapper="$1"
31 shift 37 shift
32 filter="$1" 38 filter="$1"
33 shift 39 shift
34 shift # we don't need/want the resType either 40 shift # we don't need/want the resType either
35 echo $(date) $(hostname) start $f >>${j}_log 41 me=$(hostname | cut -c 15)
42 ff=$(echo $f | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//')
43 echo $(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff >>logs/${j}_log
36 export PYTHONIOENCODING=utf-8 44 export PYTHONIOENCODING=utf-8
37 { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \ 45 { IFS=$'\n' ; stderr=( $( { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \
38 unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>>${j}_log 46 unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>&1 1>&4 ; ) ) ; unset IFS ; } 4>&1
39 echo $(date) $(hostname) finished $f >>${j}_log 47 { echo $(date +%Y-%m-%d.%H:%M:%S) $me finished $j $ff
48 printf '%s\n' "${stderr[@]}" ; } | sed '2,$s/^/ /' >>logs/${j}_log # hack to try to
49 # guarantee atomic entry in the log
40 } 50 }
41 51
42 export -f worker 52 export -f worker
43 53
44 parallel \ 54 parallel \
48 --will-cite \ 58 --will-cite \
49 --jobs $cores \ 59 --jobs $cores \
50 --workdir $wd \ 60 --workdir $wd \
51 -a input_paths \ 61 -a input_paths \
52 --env worker \ 62 --env worker \
53 worker '{}' '{#}' "$mapper" "$filter" "$@" 2>errs | grep -v 'Authorized uses only' | tee >(wc -l 1>&2) |\ 63 --return 'logs/{#}_log' --cleanup \
54 sac_reducer.py "$@" 64 worker '{}' '{#}' "$mapper" "$filter" "$@" | tee >(wc -l 1>&2) |\
65 sac_reducer.py $1 $numKeys