view master/src/wecu/run_sac.sh @ 65:e1f61f94b196

switch to curl->file, enable retries
author Henry S. Thompson <ht@markup.co.uk>
date Thu, 04 Jun 2020 12:08:29 +0000
parents b91e44355bbf
children 1f04bce6ead7
line wrap: on
line source

#!/bin/bash
# Usage: run_sac.sh numcores hostsFilename workDir mapper (-h keyHandler) (-f filter) (-k numKeys) resType patType patterns
# Record the full invocation in ./cmd (overwritten on every run).
echo "$@" >cmd

# The four mandatory leading arguments, then drop them one at a time
# (single shifts, so a short argument list is still consumed as far as it goes).
cores=$1 hosts=$2 wd=$3 mapper=$4
shift; shift; shift; shift

# Optional flags. Each one is only looked for at the current front of the
# argument list, so they are recognised only in the order -h, -f, -k.

# -h keyHandler
# NOTE(review): keyHandler is left unset when -h is absent, unlike filter
# below which gets an explicit placeholder - confirm callers always pass -h.
case $1 in
  -h) keyHandler=$2; shift; shift ;;
esac

# -f filter; the default is the two-character string "" (a literal pair of
# double quotes), presumably so it still counts as an (empty) argument after
# the command line is re-parsed on the worker side - TODO confirm.
case $1 in
  -f) filter=$2; shift; shift ;;
  *)  filter='""' ;;
esac

# -k numKeys
case $1 in
  -k) numKeys=$2; shift; shift ;;
esac

lrand () {
  # Cheap little random number generator: print an integer in 1..$1 on stdout.
  #   $1 - upper bound (positive integer); 0 or a missing value makes the
  #        arithmetic below fail, as it always did.
  # One random byte (0..255) from openssl is reduced modulo $1, so the result
  # is slightly biased and bounds above 256 are never fully covered when the
  # openssl path is taken.
  local limit=$1 byte
  # `|| byte=''` stops a failed pipeline (callers enable pipefail) from
  # aborting a caller that runs under `set -e`.
  byte=$(openssl rand 1 2>/dev/null | od -An -tu1 | tr -d ' \n') || byte=''
  # If openssl is missing or produced nothing usable, fall back to bash's
  # $RANDOM instead of handing an empty operand to the arithmetic expansion.
  [[ $byte =~ ^[0-9]+$ ]] || byte=$RANDOM
  # 10# forces decimal so a value with leading zeros is never read as octal.
  echo $(( 1 + 10#$byte % limit ))
}

tryread () {
  # Download a URL to a file with curl, making up to 5 attempts.
  #   $1 - URL to fetch
  #   $2 - path of the output file
  # Progress and diagnostics go to stderr.
  # Returns 0 as soon as one attempt succeeds, 1 after 5 failed attempts.
  #
  # The caller's shell options are left alone: this function used to finish
  # with `set +o pipefail`, which silently disabled pipefail for whatever the
  # caller ran next. Nothing in here is a pipeline, so no toggling is needed.
  local url=$1 dest=$2
  local -r max_tries=5
  local attempt rc
  for (( attempt = 1; attempt <= max_tries; attempt++ )); do
    echo $(date) "Reading $url ..." 1>&2
    # --fail: make HTTP errors (status >= 400) a non-zero exit, so they are
    # retried instead of the error document being saved as the payload.
    # NOTE(review): --insecure turns off TLS certificate verification; kept
    # as-is, confirm it is still required.
    if curl -s -S --fail --max-time 60 --insecure -o "$dest" "$url"; then
      echo " done at " $(date) 1>&2
      return 0
    else
      rc=$?
    fi
    # Log curl's exit status for the failed attempt.
    echo \# $rc 1>&2
    # No point in sleeping after the final attempt.
    (( attempt < max_tries )) || break
    # try to avoid lockstep retries
    sleep "$(lrand 10)"
    echo \# $(date) retry number $attempt 1>&2
  done
  echo " tried $max_tries times w/o success, giving up" 1>&2
  return 1
}

# Process one input file: download it, decompress it, run it through the
# (optional) filter and the mapper, and log the outcome.
# Arguments:
#   $1  path of the input file below https://commoncrawl.s3.amazonaws.com/
#   $2  job number; names the temporary ${2}.gz, res/${2}.tsv and logs/${2}_log
#   $3  mapper, run as ./<mapper> from the current directory
#   $4  filter command word(s) placed in front of the mapper; may be empty
#   $5  keyHandler, passed to the mapper as its first argument
#   $6  resType - ignored here
#   $7… passed through to the mapper unchanged
# Output: mapper stdout goes to res/<job>.tsv; everything written to stderr by
#   the download/decompress/map step is appended, indented, to logs/<job>_log
#   after a "finished" line that carries that step's exit status.
# NOTE(review): the function's own exit status is that of the final logging
#   pipeline, not of the processing step, so a failed job still looks
#   successful to whatever invoked worker - confirm this is intended.
worker () {
  set -o pipefail
  mkdir -p logs
  mkdir -p res
  local f=$1 j=$2 mapper=$3 filter=$4 keyHandler=$5
  # Drop the five values above plus resType ($6), but never more than we have
  # (a plain `shift 6` would shift nothing at all on a short argument list).
  shift $(( $# < 6 ? $# : 6 ))
  local me ff captured subres line
  local -a stderr=()
  # Only the 15th character of the host name is used as the node tag.
  me=$(hostname | cut -c 15)
  # Short label: path components 4 and 6, minus "CC-MAIN-" and ".warc…".
  ff=$(printf '%s\n' "$f" | cut -f 4,6 -d / | sed 's/CC-MAIN-//;s/\.warc.*$//')
  # $me / $ff stay unquoted so an empty field leaves no stray blank in the log.
  echo "$(date +%Y-%m-%d.%H:%M:%S)" $me start "$j" $ff >>"logs/${j}_log"
  export PYTHONIOENCODING=utf-8
  # Run the real work in a subshell with stdout -> result file and stderr ->
  # captured. Plain assignment (no `local`, no `||`) so that $? is the
  # subshell's status and `set -e` inside it stays effective.
  captured=$( {
      # The subshell has always run with IFS=newline; kept so that the
      # unquoted $filter below is split exactly as before (on newlines only).
      IFS=$'\n'
      set -e
      tryread "https://commoncrawl.s3.amazonaws.com/$f" "${j}.gz"
      # $filter is deliberately unquoted: when empty it must vanish entirely.
      unpigz -dp 1 -c "${j}.gz" |
        $filter "./$mapper" "$keyHandler" "$@"
    } 2>&1 1>"res/${j}.tsv" )
  subres=$?
  # Split the captured text into its non-empty lines without word-splitting
  # or pathname expansion (a line such as "***" must stay literal).
  while IFS= read -r line; do
    [[ -n $line ]] && stderr+=("$line")
  done <<<"$captured"
  # -f: the download may have given up before the file ever existed.
  rm -f -- "${j}.gz"
  # One pipeline, one append: hack to try to guarantee an atomic entry in the
  # log; probably not necessary with the current one-log-per-job structure.
  { echo "$(date +%Y-%m-%d.%H:%M:%S)" $me finished "$j" $ff "$subres"
    printf '%s\n' "${stderr[@]}" ; } | sed '2,$s/^/ /' >>"logs/${j}_log"
}

# Make the helper functions available to the shells GNU parallel starts
# (they are also named explicitly via --env below).
export -f worker tryread lrand

# Show the command template that parallel will run for each input line.
echo worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@" 1>&2

echo starting... $(date) 1>&2

# Build the option list as an array so every value stays a single word.
parallel_opts=(
  --joblog parlog.txt
  --sshloginfile "$hosts"
  --retries 3
)

# Locate the mapper and insert "/./" in front of its basename
# (e.g. /usr/bin/m.py -> /usr/bin/./m.py) before handing it to --transferfile.
mapper_src=$(which "$mapper" | sed 's/\(^.*\/\)/\1.\//')
if [[ -n $mapper_src ]]; then
  parallel_opts+=( --transferfile "$mapper_src" )
else
  # Previously an empty expansion left --transferfile without its argument.
  echo "warning: cannot locate mapper '$mapper'; not transferring it" 1>&2
fi

parallel_opts+=(
  --will-cite
  --jobs "$cores"
  --workdir "$wd"
  -a input_paths
  --env worker --env tryread --env lrand
  --return 'logs/{#}_log' --return 'res/{#}.tsv' --cleanup
)

parallel "${parallel_opts[@]}" \
    worker '{}' '{#}' "$mapper" "$filter" "$keyHandler" "$@"
res=$?
# parallel's status is only reported here; the reduce step runs regardless and
# the script's own exit status is that of the final echo.
echo reducing... $(date) pres=$res 1>&2
# $1 is resType at this point (all earlier arguments were shifted away);
# numKeys is passed only when -k was given.
cat res/*.tsv | sac_reducer.py "$1" ${numKeys:+"$numKeys"}
echo done $(date) 1>&2