Mercurial > hg > cc > azure
comparison master/src/wecu/run_sac.sh @ 65:e1f61f94b196
switch to curl->file, enable retries
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Thu, 04 Jun 2020 12:08:29 +0000 |
parents | b91e44355bbf |
children | 1f04bce6ead7 |
comparison
equal
deleted
inserted
replaced
64:b91e44355bbf | 65:e1f61f94b196 |
---|---|
28 shift | 28 shift |
29 numKeys="$1" | 29 numKeys="$1" |
30 shift | 30 shift |
31 fi | 31 fi |
32 | 32 |
33 lrand () { | |
34 # cheap bad little random number generator | |
35 echo $(( 1 + ($(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' ') % $1))) | |
36 } | |
37 | |
38 tryread () { | |
39 m=0 | |
40 u=$1 | |
41 f=$2 | |
42 set -o pipefail | |
43 until if [ $((m+=1)) -gt 5 ]; then echo " tried 5 times w/o success, giving up" 1>&2; return 1; fi && echo $(date) "Reading $u ..." 1>&2 && \ | |
44 curl -s -S --max-time 60 --insecure -o "$f" "$u" && | |
45 echo " done at " $(date) 1>&2 | |
46 do | |
47 # try to avoid lockstep retries | |
48 echo \# ${PIPESTATUS[@]} 1>&2 | |
49 sleep $(lrand 10) | |
50 echo \# $(date) retry number $m 1>&2 | |
51 done | |
52 set +o pipefail | |
53 } | |
54 | |
33 # Get quoting right... | 55 # Get quoting right... |
34 worker () { | 56 worker () { |
35 set -e | |
36 set -o pipefail | 57 set -o pipefail |
37 mkdir -p logs | 58 mkdir -p logs |
38 mkdir -p res | 59 mkdir -p res |
39 f=$1 | 60 f=$1 |
40 shift | 61 shift |
49 shift # we don't need/want the resType either | 70 shift # we don't need/want the resType either |
50 me=$(hostname | cut -c 15) | 71 me=$(hostname | cut -c 15) |
51 ff=$(echo $f | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//') | 72 ff=$(echo $f | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//') |
52 echo $(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff >>logs/${j}_log | 73 echo $(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff >>logs/${j}_log |
53 export PYTHONIOENCODING=utf-8 | 74 export PYTHONIOENCODING=utf-8 |
54 { IFS=$'\n' ; stderr=( $( { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \ | 75 { IFS=$'\n' ; stderr=( $( { set -e |
55 unpigz -dp 1 -c | $filter ./$mapper "$keyHandler" "$@" ; } 2>&1 1>res/${j}.tsv ; ) ) ; unset IFS ; } | 76 #curl -s -N -o ${j}.gz https://commoncrawl.s3.amazonaws.com/$f |
56 { echo $(date +%Y-%m-%d.%H:%M:%S) $me finished $j $ff | 77 tryread "https://commoncrawl.s3.amazonaws.com/$f" "${j}.gz" |
57 printf '%s\n' "${stderr[@]}" ; } | sed '2,$s/^/ /' >>logs/${j}_log # hack to try to | 78 unpigz -dp 1 -c ${j}.gz |\ |
58 # guarantee atomic entry in the log | 79 $filter ./$mapper "$keyHandler" "$@" |
80 } 2>&1 1>res/${j}.tsv ) ) ; subres="$?" ; unset IFS ; } | |
81 rm "${j}.gz" | |
82 { echo $(date +%Y-%m-%d.%H:%M:%S) $me finished $j $ff $subres | |
83 printf '%s\n' "${stderr[@]}" ; } | sed '2,$s/^/ /' >>logs/${j}_log # hack | |
84 # to try to guarantee atomic entry in the log | |
85 # Pbly not necessary with current sub-structure... | |
59 } | 86 } |
60 | 87 |
61 export -f worker | 88 export -f worker tryread lrand |
62 | 89 |
63 echo worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@" 1>&2 | 90 echo worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@" 1>&2 |
64 | 91 |
65 date 1>&2 | 92 echo starting... $(date) 1>&2 |
66 parallel \ | 93 parallel \ |
94 --joblog parlog.txt \ | |
67 --sshloginfile $hosts \ | 95 --sshloginfile $hosts \ |
68 --retries 3 \ | 96 --retries 3 \ |
69 --transferfile $(which $mapper|sed 's/\(^.*\/\)/\1.\//') \ | 97 --transferfile $(which $mapper|sed 's/\(^.*\/\)/\1.\//') \ |
70 --will-cite \ | 98 --will-cite \ |
71 --jobs $cores \ | 99 --jobs $cores \ |
72 --workdir $wd \ | 100 --workdir $wd \ |
73 -a input_paths \ | 101 -a input_paths \ |
74 --env worker \ | 102 --env worker --env tryread --env lrand \ |
75 --return 'logs/{#}_log' --return 'res/{#}.tsv' --cleanup \ | 103 --return 'logs/{#}_log' --return 'res/{#}.tsv' --cleanup \ |
76 worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@" | 104 worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@" |
77 res=$? | 105 res=$? |
78 echo $(date) $res | 106 echo reducing... $(date) pres=$res 1>&2 |
79 cat res/*.tsv | sac_reducer.py $1 $numKeys | 107 cat res/*.tsv | sac_reducer.py $1 $numKeys |
108 echo done $(date) 1>&2 |