comparison master/src/wecu/run_sac.sh @ 65:e1f61f94b196

switch to curl->file, enable retries
author Henry S. Thompson <ht@markup.co.uk>
date Thu, 04 Jun 2020 12:08:29 +0000
parents b91e44355bbf
children 1f04bce6ead7
comparison
equal deleted inserted replaced
64:b91e44355bbf 65:e1f61f94b196
28 shift 28 shift
29 numKeys="$1" 29 numKeys="$1"
30 shift 30 shift
31 fi 31 fi
32 32
33 lrand () {
34 # cheap bad little random number generator
35 echo $(( 1 + ($(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' ') % $1)))
36 }
37
38 tryread () {
39 m=0
40 u=$1
41 f=$2
42 set -o pipefail
43 until if [ $((m+=1)) -gt 5 ]; then echo " tried 5 times w/o success, giving up" 1>&2; return 1; fi && echo $(date) "Reading $u ..." 1>&2 && \
44 curl -s -S --max-time 60 --insecure -o "$f" "$u" &&
45 echo " done at " $(date) 1>&2
46 do
47 # try to avoid lockstep retries
48 echo \# ${PIPESTATUS[@]} 1>&2
49 sleep $(lrand 10)
50 echo \# $(date) retry number $m 1>&2
51 done
52 set +o pipefail
53 }
54
33 # Get quoting right... 55 # Get quoting right...
34 worker () { 56 worker () {
35 set -e
36 set -o pipefail 57 set -o pipefail
37 mkdir -p logs 58 mkdir -p logs
38 mkdir -p res 59 mkdir -p res
39 f=$1 60 f=$1
40 shift 61 shift
49 shift # we don't need/want the resType either 70 shift # we don't need/want the resType either
50 me=$(hostname | cut -c 15) 71 me=$(hostname | cut -c 15)
51 ff=$(echo $f | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//') 72 ff=$(echo $f | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//')
52 echo $(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff >>logs/${j}_log 73 echo $(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff >>logs/${j}_log
53 export PYTHONIOENCODING=utf-8 74 export PYTHONIOENCODING=utf-8
54 { IFS=$'\n' ; stderr=( $( { curl -s -N https://commoncrawl.s3.amazonaws.com/$f | \ 75 { IFS=$'\n' ; stderr=( $( { set -e
55 unpigz -dp 1 -c | $filter ./$mapper "$keyHandler" "$@" ; } 2>&1 1>res/${j}.tsv ; ) ) ; unset IFS ; } 76 #curl -s -N -o ${j}.gz https://commoncrawl.s3.amazonaws.com/$f
56 { echo $(date +%Y-%m-%d.%H:%M:%S) $me finished $j $ff 77 tryread "https://commoncrawl.s3.amazonaws.com/$f" "${j}.gz"
57 printf '%s\n' "${stderr[@]}" ; } | sed '2,$s/^/ /' >>logs/${j}_log # hack to try to 78 unpigz -dp 1 -c ${j}.gz |\
58 # guarantee atomic entry in the log 79 $filter ./$mapper "$keyHandler" "$@"
80 } 2>&1 1>res/${j}.tsv ) ) ; subres="$?" ; unset IFS ; }
81 rm "${j}.gz"
82 { echo $(date +%Y-%m-%d.%H:%M:%S) $me finished $j $ff $subres
83 printf '%s\n' "${stderr[@]}" ; } | sed '2,$s/^/ /' >>logs/${j}_log # hack
84 # to try to guarantee atomic entry in the log
85 # Pbly not necessary with current sub-structure...
59 } 86 }
60 87
61 export -f worker 88 export -f worker tryread lrand
62 89
63 echo worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@" 1>&2 90 echo worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@" 1>&2
64 91
65 date 1>&2 92 echo starting... $(date) 1>&2
66 parallel \ 93 parallel \
94 --joblog parlog.txt \
67 --sshloginfile $hosts \ 95 --sshloginfile $hosts \
68 --retries 3 \ 96 --retries 3 \
69 --transferfile $(which $mapper|sed 's/\(^.*\/\)/\1.\//') \ 97 --transferfile $(which $mapper|sed 's/\(^.*\/\)/\1.\//') \
70 --will-cite \ 98 --will-cite \
71 --jobs $cores \ 99 --jobs $cores \
72 --workdir $wd \ 100 --workdir $wd \
73 -a input_paths \ 101 -a input_paths \
74 --env worker \ 102 --env worker --env tryread --env lrand \
75 --return 'logs/{#}_log' --return 'res/{#}.tsv' --cleanup \ 103 --return 'logs/{#}_log' --return 'res/{#}.tsv' --cleanup \
76 worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@" 104 worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@"
77 res=$? 105 res=$?
78 echo $(date) $res 106 echo reducing... $(date) pres=$res 1>&2
79 cat res/*.tsv | sac_reducer.py $1 $numKeys 107 cat res/*.tsv | sac_reducer.py $1 $numKeys
108 echo done $(date) 1>&2