Mercurial > hg > cc > azure
view master/src/wecu/run_sac.sh @ 65:e1f61f94b196
switch to curl->file, enable retries
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Thu, 04 Jun 2020 12:08:29 +0000 |
parents | b91e44355bbf |
children | 1f04bce6ead7 |
line wrap: on
line source
#!/bin/bash
# Usage: run_sac.sh numcores hostsFilename workDir mapper (-h keyHandler) (-f filter) (-k numKeys) resType patType patterns
#
# Runs a mapper over every path listed in ./input_paths, one GNU parallel job
# per path, on the hosts named in hostsFilename, then feeds the concatenated
# per-job results to sac_reducer.py.
#
# Arguments:
#   numcores       passed to parallel --jobs
#   hostsFilename  passed to parallel --sshloginfile
#   workDir        passed to parallel --workdir
#   mapper         located with `which`, transferred to each host, run as ./mapper
#   -h keyHandler  optional; handed to the mapper as its first argument
#   -f filter      optional; command prefix placed in front of the mapper
#   -k numKeys     optional; handed to sac_reducer.py after resType
#   resType        first argument of sac_reducer.py (not passed to the mapper)
#   patType patterns...  passed through to the mapper
#
# Files written in the current directory: cmd (the command line), parlog.txt
# (parallel's job log), logs/N_log and res/N.tsv (returned from job N).

echo "$@" 1>cmd

cores=$1
hosts=$2
wd=$3
mapper=$4
shift; shift; shift; shift

if [ "$1" = "-h" ]; then
  shift
  keyHandler="$1"
  shift
fi

if [ "$1" = "-f" ]; then
  shift
  filter="$1"
  shift
else
  # A literal pair of double-quote characters, not an empty string.
  # NOTE(review): presumably so that an "empty" filter survives the shell
  # re-parsing of the command line that parallel performs -- confirm.
  filter=\"\"
fi

if [ "$1" = "-k" ]; then
  shift
  numKeys="$1"
  shift
fi

# lrand N: print a number in the range 1..N.
lrand () {
  # cheap bad little random number generator
  echo $(( 1 + ($(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' ') % $1)))
}

# tryread URL FILE: download URL into FILE with curl, making up to five
# attempts with a random 1-10 second pause after each failure.
# Progress and failure messages go to stderr.
# Returns 0 once a download succeeds, 1 after five failures.
#
# The caller's shell options and variables are left alone: worker relies on
# `pipefail` still being on for the pipeline it runs right after this call.
tryread () {
  local m=0 u=$1 f=$2
  until if [ $((m+=1)) -gt 5 ]; then
          echo " tried 5 times w/o success, giving up" 1>&2
          return 1
        fi &&
        echo $(date) "Reading $u ..." 1>&2 &&
        curl -s -S --max-time 60 --insecure -o "$f" "$u" &&
        echo " done at " $(date) 1>&2
  do
    # try to avoid lockstep retries
    echo \# ${PIPESTATUS[@]} 1>&2
    sleep $(lrand 10)
    echo \# $(date) retry number $m 1>&2
  done
}

# Get quoting right...
# worker PATH JOBNUM MAPPER FILTER KEYHANDLER RESTYPE MAPPER-ARGS...
# Run (via parallel) once per input path: downloads
# https://commoncrawl.s3.amazonaws.com/PATH to JOBNUM.gz, decompresses it into
#   FILTER ./MAPPER KEYHANDLER MAPPER-ARGS...
# with stdout going to res/JOBNUM.tsv, and appends a start line and a
# finish line (with exit status and captured stderr) to logs/JOBNUM_log.
# NOTE(review): the function's own return status is that of the final logging
# pipeline, not $subres, so a failed download/mapper is only visible in the
# log -- confirm that this is what --retries is meant to see.
worker () {
  set -o pipefail
  mkdir -p logs
  mkdir -p res
  local f j mapper filter keyHandler me ff subres
  local -a stderr
  f=$1
  shift
  j=$1
  shift
  mapper="$1"
  shift
  filter="$1"
  shift
  keyHandler="$1"
  shift
  shift # we don't need/want the resType either
  # Only the 15th character of the host name is kept.
  # NOTE(review): presumably a node number under the cluster's naming scheme.
  me=$(hostname | cut -c 15)
  # Short job label: 4th and 6th /-separated fields of the path, with the
  # first "CC-MAIN-" and everything from ".warc" onwards removed.
  ff=$(echo "$f" | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//')
  echo $(date +%Y-%m-%d.%H:%M:%S) $me start $j $ff >>"logs/${j}_log"
  export PYTHONIOENCODING=utf-8
  # Inside the substitution stderr is captured (split into lines via IFS)
  # while stdout goes to the result file; `set -e` abandons the job as soon
  # as the download gives up.  $filter is deliberately unquoted so that it
  # can expand to nothing or to several words.
  { IFS=$'\n' ; stderr=( $( { set -e
                              tryread "https://commoncrawl.s3.amazonaws.com/$f" "${j}.gz"
                              unpigz -dp 1 -c "${j}.gz" |\
                                $filter "./$mapper" "$keyHandler" "$@"
                            } 2>&1 1>"res/${j}.tsv" ) ) ; subres="$?" ; unset IFS ; }
  # -f: the file does not exist if every download attempt failed early
  rm -f -- "${j}.gz"
  # hack to try to guarantee atomic entry in the log: the finish line and the
  # (indented) captured stderr are written with a single append.
  # Pbly not necessary with current sub-structure...
  { echo $(date +%Y-%m-%d.%H:%M:%S) $me finished $j $ff $subres
    printf '%s\n' "${stderr[@]}" ; } | sed '2,$s/^/ /' >>"logs/${j}_log"
}

export -f worker tryread lrand

echo worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@" 1>&2
echo starting... $(date) 1>&2
# The sed inserts "./" after the last "/" of the mapper's full path before it
# is handed to --transferfile.
# NOTE(review): without -h, keyHandler is empty here; confirm the empty
# argument reaches worker intact through parallel.
parallel \
  --joblog parlog.txt \
  --sshloginfile "$hosts" \
  --retries 3 \
  --transferfile "$(which "$mapper" | sed 's/\(^.*\/\)/\1.\//')" \
  --will-cite \
  --jobs "$cores" \
  --workdir "$wd" \
  -a input_paths \
  --env worker --env tryread --env lrand \
  --return 'logs/{#}_log' --return 'res/{#}.tsv' --cleanup \
  worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@"
res=$?
echo reducing... $(date) pres=$res 1>&2
# $1 is resType at this point; $numKeys is left unquoted so that it simply
# disappears when -k was not given.
cat res/*.tsv | sac_reducer.py $1 $numKeys
echo done $(date) 1>&2