comparison master/src/wecu/run_sac.sh @ 64:b91e44355bbf

fix minor argument-passing snafus
author Henry S. Thompson <ht@markup.co.uk>
date Wed, 03 Jun 2020 22:08:01 +0000
parents d46c8b12fc04
children e1f61f94b196
comparison
equal deleted inserted replaced
63:d46c8b12fc04 64:b91e44355bbf
1 #!/bin/bash 1 #!/bin/bash
2 # Usage: run_sac.sh numcores hostsFilename workDir mapper keyHandler (-f filter) (-k numKeys) resType patType patterns 2 # Usage: run_sac.sh numcores hostsFilename workDir mapper (-h keyHandler) (-f filter) (-k numKeys) resType patType patterns
3 echo "$@" 1>cmd 3 echo "$@" 1>cmd
4 cores=$1 4 cores=$1
5 hosts=$2 5 hosts=$2
6 wd=$3 6 wd=$3
7 mapper=$4 7 mapper=$4
8 shift 8 shift
9 shift 9 shift
10 shift 10 shift
11 shift 11 shift
12 if [ "$1" = "-h" ]
13 then
14 shift
15 keyHandler="$1"
16 shift
17 fi
12 if [ "$1" = "-f" ] 18 if [ "$1" = "-f" ]
13 then 19 then
14 shift 20 shift
15 filter="$1" 21 filter="$1"
16 shift 22 shift
36 shift 42 shift
37 mapper="$1" 43 mapper="$1"
38 shift 44 shift
39 filter="$1" 45 filter="$1"
40 shift 46 shift
47 keyHandler="$1"
48 shift
41 shift # we don't need/want the resType either 49 shift # we don't need/want the resType either
42 me=$(hostname | cut -c 15) 50 me=$(hostname | cut -c 15)
43 ff=$(echo $f | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//') 51 ff=$(echo $f | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//')
44 echo $(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff >>logs/${j}_log 52 echo $(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff >>logs/${j}_log
45 export PYTHONIOENCODING=utf-8 53 export PYTHONIOENCODING=utf-8
46 { IFS=$'\n' ; stderr=( $( { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \ 54 { IFS=$'\n' ; stderr=( $( { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \
47 unpigz -dp 1 -c | $filter ./$mapper "$@" ; } 2>&1 1>res/${j}.tsv ; ) ) ; unset IFS ; } 55 unpigz -dp 1 -c | $filter ./$mapper "$keyHandler" "$@" ; } 2>&1 1>res/${j}.tsv ; ) ) ; unset IFS ; }
48 { echo $(date +%Y-%m-%d.%H:%M:%S) $me finished $j $ff 56 { echo $(date +%Y-%m-%d.%H:%M:%S) $me finished $j $ff
49 printf '%s\n' "${stderr[@]}" ; } | sed '2,$s/^/ /' >>logs/${j}_log # hack to try to 57 printf '%s\n' "${stderr[@]}" ; } | sed '2,$s/^/ /' >>logs/${j}_log # hack to try to
50 # guarantee atomic entry in the log 58 # guarantee atomic entry in the log
51 } 59 }
52 60
53 export -f worker 61 export -f worker
54 62
63 echo worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@" 1>&2
64
65 date 1>&2
55 parallel \ 66 parallel \
56 --sshloginfile $hosts \ 67 --sshloginfile $hosts \
57 --retries 3 \ 68 --retries 3 \
58 --transferfile $(which $mapper|sed 's/\(^.*\/\)/\1.\//') \ 69 --transferfile $(which $mapper|sed 's/\(^.*\/\)/\1.\//') \
59 --will-cite \ 70 --will-cite \
60 --jobs $cores \ 71 --jobs $cores \
61 --workdir $wd \ 72 --workdir $wd \
62 -a input_paths \ 73 -a input_paths \
63 --env worker \ 74 --env worker \
64 --return 'logs/{#}_log' --return 'res/{#}.tsv' --cleanup \ 75 --return 'logs/{#}_log' --return 'res/{#}.tsv' --cleanup \
65 worker '{}' '{#}' "$mapper" "$filter" "$@" 76 worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@"
77 res=$?
78 echo $(date) $res
66 cat res/*.tsv | sac_reducer.py $1 $numKeys 79 cat res/*.tsv | sac_reducer.py $1 $numKeys