comparison master/src/wecu/run_sac.sh @ 62:892e1c0240e1
added more robust (I hope) error handling,
got reducer working with support for choosing dict or tsv output
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Tue, 02 Jun 2020 17:35:07 +0000 |
parents | cfaf5223b071 |
children | d46c8b12fc04 |
61:cfaf5223b071 | 62:892e1c0240e1 |
---|---|
1 #!/bin/bash | 1 #!/bin/bash |
2 # Usage: run_sac.sh numcores hostsFilename workDir mapper (-f filter) resType patType patterns | 2 # Usage: run_sac.sh numcores hostsFilename workDir mapper (-f filter) (-k numKeys) resType patType patterns |
| 3 echo "$@" 1>cmd |
3 cores=$1 | 4 cores=$1 |
4 hosts=$2 | 5 hosts=$2 |
5 wd=$3 | 6 wd=$3 |
6 mapper=$4 | 7 mapper=$4 |
7 shift | 8 shift |
14 filter="$1" | 15 filter="$1" |
15 shift | 16 shift |
16 else | 17 else |
17 filter=\"\" | 18 filter=\"\" |
18 fi | 19 fi |
19 | 20 if [ "$1" = "-k" ] |
20 rm -f allout | 21 then |
| 22 shift |
| 23 numKeys="$1" |
| 24 shift |
| 25 fi |
21 | 26 |
22 # Get quoting right... | 27 # Get quoting right... |
23 worker () { | 28 worker () { |
24 set -e | 29 set -e |
25 set -o pipefail | 30 set -o pipefail |
| 31 mkdir -p logs |
26 f=$1 | 32 f=$1 |
27 shift | 33 shift |
28 j=$1 | 34 j=$1 |
29 shift | 35 shift |
30 mapper="$1" | 36 mapper="$1" |
31 shift | 37 shift |
32 filter="$1" | 38 filter="$1" |
33 shift | 39 shift |
34 shift # we don't need/want the resType either | 40 shift # we don't need/want the resType either |
35 echo $(date) $(hostname) start $f >>${j}_log | 41 me=$(hostname | cut -c 15) |
| 42 ff=$(echo $f | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//') |
| 43 echo $(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff >>logs/${j}_log |
36 export PYTHONIOENCODING=utf-8 | 44 export PYTHONIOENCODING=utf-8 |
37 { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \ | 45 { IFS=$'\n' ; stderr=( $( { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \ |
38 unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>>${j}_log | 46 unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>&1 1>&4 ; ) ) ; unset IFS ; } 4>&1 |
39 echo $(date) $(hostname) finished $f >>${j}_log | 47 { echo $(date +%Y-%m-%d.%H:%M:%S) $me finished $j $ff |
| 48 printf '%s\n' "${stderr[@]}" ; } | sed '2,$s/^/ /' >>logs/${j}_log # hack to try to |
| 49 # guarantee atomic entry in the log |
40 } | 50 } |
41 | 51 |
42 export -f worker | 52 export -f worker |
43 | 53 |
44 parallel \ | 54 parallel \ |
48 --will-cite \ | 58 --will-cite \ |
49 --jobs $cores \ | 59 --jobs $cores \ |
50 --workdir $wd \ | 60 --workdir $wd \ |
51 -a input_paths \ | 61 -a input_paths \ |
52 --env worker \ | 62 --env worker \ |
53 worker '{}' '{#}' "$mapper" "$filter" "$@" 2>errs | grep -v 'Authorized uses only' | tee >(wc -l 1>&2) |\ | 63 --return 'logs/{#}_log' --cleanup \ |
54 sac_reducer.py "$@" | 64 worker '{}' '{#}' "$mapper" "$filter" "$@" | tee >(wc -l 1>&2) |\ |
| 65 sac_reducer.py $1 $numKeys |
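
The core of the new error handling is the redirection on new lines 45-46, which captures the curl/unpigz/mapper pipeline's stderr into a bash array while letting its stdout continue down the pipe to the reducer. Below is a minimal, self-contained sketch of that pattern; the commands and messages are illustrative only, not part of run_sac.sh.

```bash
#!/bin/bash
# Sketch of the stderr-capture pattern used in the new worker body.
# fd 4 is tied to the surrounding stdout, so inside the command substitution
# (whose stdout is what gets captured) the inner block can send its real
# output to fd 4 and its diagnostics to fd 1.
demo () {
    { IFS=$'\n'                                    # split captured text on newlines only
      stderr=( $( { echo "payload line"            # fd 1 -> fd 4 -> caller's stdout, not captured
                    echo "oops: a warning" 1>&2    # fd 2 -> fd 1 of the substitution, captured
                  } 2>&1 1>&4 ) )
      unset IFS
    } 4>&1
    # "payload line" has already gone to stdout; the diagnostics are now in ${stderr[@]}
    printf 'captured: %s\n' "${stderr[@]}" >&2
}
demo
```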
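The comment on new lines 48-49 ("hack to try to guarantee atomic entry in the log") refers to assembling the whole multi-line entry, the "finished" line plus any captured stderr, in one pipeline and appending it with a single write, indenting continuation lines so entries from concurrent jobs stay visually grouped. A reduced sketch, with a hypothetical log file name:

```bash
#!/bin/bash
# Sketch of the single-append, indented-continuation log entry from worker.
# sed indents every line after the first, so a reader can tell where one
# job's entry ends and the next begins even when jobs interleave.
log_entry () {              # $1 = headline; remaining args = extra lines (e.g. captured stderr)
    local head=$1; shift
    { echo "$head"
      printf '%s\n' "$@"
    } | sed '2,$s/^/  /' >>logs/demo_log    # hypothetical log file
}

mkdir -p logs
log_entry "$(date +%Y-%m-%d.%H:%M:%S) job 7 finished" \
          "curl: (56) connection reset" "retrying"
```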