Mercurial > hg > cc > azure
comparison master/src/wecu/run_sac.sh @ 64:b91e44355bbf
fix minor argument passing snafus
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Wed, 03 Jun 2020 22:08:01 +0000 |
parents | d46c8b12fc04 |
children | e1f61f94b196 |
comparison
equal
deleted
inserted
replaced
63:d46c8b12fc04 | 64:b91e44355bbf |
---|---|
1 #!/bin/bash | 1 #!/bin/bash |
2 # Usage: run_sac.sh numcores hostsFilename workDir mapper keyHandler (-f filter) (-k numKeys) resType patType patterns | 2 # Usage: run_sac.sh numcores hostsFilename workDir mapper (-h keyHandler) (-f filter) (-k numKeys) resType patType patterns |
3 echo "$@" 1>cmd | 3 echo "$@" 1>cmd |
4 cores=$1 | 4 cores=$1 |
5 hosts=$2 | 5 hosts=$2 |
6 wd=$3 | 6 wd=$3 |
7 mapper=$4 | 7 mapper=$4 |
8 shift | 8 shift |
9 shift | 9 shift |
10 shift | 10 shift |
11 shift | 11 shift |
| | 12 if [ "$1" = "-h" ] |
| | 13 then |
| | 14 shift |
| | 15 keyHandler="$1" |
| | 16 shift |
| | 17 fi |
12 if [ "$1" = "-f" ] | 18 if [ "$1" = "-f" ] |
13 then | 19 then |
14 shift | 20 shift |
15 filter="$1" | 21 filter="$1" |
16 shift | 22 shift |
36 shift | 42 shift |
37 mapper="$1" | 43 mapper="$1" |
38 shift | 44 shift |
39 filter="$1" | 45 filter="$1" |
40 shift | 46 shift |
| | 47 keyHandler="$1" |
| | 48 shift |
41 shift # we don't need/want the resType either | 49 shift # we don't need/want the resType either |
42 me=$(hostname | cut -c 15) | 50 me=$(hostname | cut -c 15) |
43 ff=$(echo $f | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//') | 51 ff=$(echo $f | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//') |
44 echo $(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff >>logs/${j}_log | 52 echo $(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff >>logs/${j}_log |
45 export PYTHONIOENCODING=utf-8 | 53 export PYTHONIOENCODING=utf-8 |
46 { IFS=$'\n' ; stderr=( $( { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \ | 54 { IFS=$'\n' ; stderr=( $( { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \ |
47 unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>&1 1>res/${j}.tsv ; ) ) ; unset IFS ; } | 55 unpigz -dp 1 -c | $filter ./$mapper "$keyHandler" "$@" ; } 2>&1 1>res/${j}.tsv ; ) ) ; unset IFS ; } |
48 { echo $(date +%Y-%m-%d.%H:%M:%S) $me finished $j $ff | 56 { echo $(date +%Y-%m-%d.%H:%M:%S) $me finished $j $ff |
49 printf '%s\n' "${stderr[@]}" ; } | sed '2,$s/^/ /' >>logs/${j}_log # hack to try to | 57 printf '%s\n' "${stderr[@]}" ; } | sed '2,$s/^/ /' >>logs/${j}_log # hack to try to |
50 # guarantee atomic entry in the log | 58 # guarantee atomic entry in the log |
51 } | 59 } |
52 | 60 |
53 export -f worker | 61 export -f worker |
54 | 62 |
| | 63 echo worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@" 1>&2 |
| | 64 |
| | 65 date 1>&2 |
55 parallel \ | 66 parallel \ |
56 --sshloginfile $hosts \ | 67 --sshloginfile $hosts \ |
57 --retries 3 \ | 68 --retries 3 \ |
58 --transferfile $(which $mapper|sed 's/\(^.*\/\)/\1.\//') \ | 69 --transferfile $(which $mapper|sed 's/\(^.*\/\)/\1.\//') \ |
59 --will-cite \ | 70 --will-cite \ |
60 --jobs $cores \ | 71 --jobs $cores \ |
61 --workdir $wd \ | 72 --workdir $wd \ |
62 -a input_paths \ | 73 -a input_paths \ |
63 --env worker \ | 74 --env worker \ |
64 --return 'logs/{#}_log' --return 'res/{#}.tsv' --cleanup \ | 75 --return 'logs/{#}_log' --return 'res/{#}.tsv' --cleanup \ |
65 worker '{}' '{#}' "$mapper" "$filter" "$@" | 76 worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@" |
| | 77 res=$? |
| | 78 echo $(date) $res |
66 cat res/*.tsv | sac_reducer.py $1 $numKeys | 79 cat res/*.tsv | sac_reducer.py $1 $numKeys |