view workers/bin/ptimedWhich.sh @ 47:2a0dab424418

cci path hack changed for 2018.04
author Henry S. Thompson <ht@markup.co.uk>
date Mon, 10 Dec 2018 14:43:18 +0000
parents 7a4e49689935
children b8a88cad75d5
line wrap: on
line source

#!/bin/bash
# Test driver: fans a list of CC WAT files out over several worker
#   processes in order to tabulate http vs. https by last-modified date.
# Usage: [echo file file_id] | timedWhich.sh id home [-t] numWorkerProcesses
#   id    - suffix of this run's result directory (/var/data/res<id>);
#           with -t it is also the start-up delay in seconds
#   home  - ssh destination the results are shipped to by the EXIT trap
#   -t    - no random wait, pause for exactly id seconds
# TODO: remove the >>errs redirections once tested
# Strict mode is currently disabled:
#set -e -o pipefail

# Positional arguments: id and home are consumed here, the rest stays in "$@"
proc=$1
home=$2
shift 2
res=/var/data/res$proc

# Leave this shell's PID behind in test1.pid
echo $$ > test1.pid
function lrand {
# Cheap little random number generator, good enough for start-up and
# retry jitter (not for anything security related).
# Arguments: $1 - upper bound, a positive decimal integer
# Outputs:   one integer in the range 1..$1 (inclusive) on stdout
# Returns:   0 on success; 1 (and nothing on stdout) if $1 is not a
#            positive integer
#
# Uses bash's builtin RANDOM instead of an openssl|od|head|tr|cut
# pipeline: no external tools are needed and nothing is forked per call.
  if ! [[ $1 =~ ^[1-9][0-9]*$ ]]
  then
    echo "lrand: upper bound must be a positive integer, got '$1'" 1>&2
    return 1
  fi
  # RANDOM yields 15 bits; two draws are combined into a 30-bit value so
  # that bounds above 32768 still cover their whole range.
  echo $(( 1 + (((RANDOM << 15) | RANDOM) % $1) ))
}
# Choose the start-up delay: with -t it is exactly $proc seconds,
# otherwise a random 1..60 seconds obtained from lrand.
case "$1" in
 -t)
  shift
  pause=$proc
  ;;
 *)
  pause=$(lrand 60)
  ;;
esac
# What is left in $1 now is the number of worker processes
wp=$1
# Marker file for the duration of the run (removed again at the very end)
touch .running
function tryread {
# Read "URL OUTFILE" pairs from stdin, one pair per line, and for each
# pair stream URL through  curl | zcat | _timedWhich.py  into OUTFILE.
# A failed download/processing attempt is retried after a random 1..10
# second pause (via lrand), up to 5 attempts per URL.
# Inputs:   stdin - lines of the form "URL OUTFILE"
# Outputs:  progress and diagnostics on stderr; result data in OUTFILE
# Returns:  0 when every line was processed; 1 as soon as one URL has
#           failed 5 times -- the remaining input lines are NOT processed
  local u o m
  # -r: take backslashes in URLs/paths literally
  while read -r u o
  do
    m=0
    # With pipefail a failure of curl or zcat fails the whole pipeline,
    # not just a failure of the last stage
    set -o pipefail
    until
      if [ $((m+=1)) -gt 5 ]
      then
        echo " tried 5 times w/o success, giving up" 1>&2
        # don't leave pipefail switched on behind us
        set +o pipefail
        return 1
      fi &&
      echo -n \# $(date) "reading $u ..." 1>&2 &&
      curl -s -S --max-time 60 --insecure -o - "$u" |
        # NB: "done at" is printed when this stage starts, before zcat
        # has consumed the download
        { echo "done at " $(date) 1>&2 ; zcat ; } |
        _timedWhich.py > "$o"
    do
      # log the exit status of each pipeline stage
      echo \# ${PIPESTATUS[@]} 1>&2
      # try to avoid lockstep retries
      sleep $(lrand 10)
      echo \# $(date) retry number $m 1>&2
    done
    set +o pipefail
  done
}
# EXIT handler: runs whenever this script exits.
#  1. In /var/data, pack the CC* and res* entries into a gzipped tar
#     stream (-h: follow symlinks) and unpack that stream in data/which
#     on $home over ssh; stderr of the remote commands is appended to
#     errs on the remote side.
#  2. Remove the local res* and CC* data.
#  3. Back in the home directory, remove ifile.txt and the *.pid files,
#     and remove nohup.cc from a background subshell five seconds later.
# The handler text is double-quoted, so $home is expanded right here,
# when the trap is installed; the backslash-escaped inner quotes make
# the remote { ... } group a single argument to ssh.
# The set -e -o pipefail at the top of the handler is presumably there so
# that a failed cd or a failed tar|ssh transfer stops the handler before
# anything is deleted -- TODO confirm.
trap "{ 
  set -e -o pipefail
  cd /var/data
  tar -czhf - CC* res* | \
   ssh -o StrictHostKeyChecking=no -q $home \"{ cd data
                    mkdir -p which
                    cd which
                    tar -xzf - ; } 2>>errs\"
  rm -rf res* CC*
  cd
  rm ifile.txt *.pid
  ( sleep 5 ; rm nohup.cc ) &
  }" EXIT
# Main section: turn every path listed in ifile.txt into a
# "URL OUTFILE" line and let GNU parallel spread those lines over $wp
# concurrent tryread workers.  Progress goes to $res/log, stderr of the
# generator loop and of parallel to $res/errs, stderr of each worker to
# $res/errs<job number>.
mkdir -p "$res"
log=$res/log
# Don't all start at once
sleep $pause
echo \# $(date) >  "$log"
pRes=0
N=$(wc -l< ifile.txt)
# Input lines per worker, rounded UP and never less than 1.  Plain N / wp
# truncates: it gives 0 when there are fewer lines than workers, and
# otherwise leaves a small extra chunk that has to wait for a free slot.
chunk=1
if [[ $wp =~ ^[1-9][0-9]*$ ]] && (( N > wp ))
then
 chunk=$(( (N + wp - 1) / wp ))
fi
# The workers are started through bash -c, so the functions they use
# have to be exported
export -f tryread lrand
# -r: take backslashes in the listed paths literally
while read -r s
do
 url="https://commoncrawl.s3.amazonaws.com/$s"
 # below for 2018-04, for 2017-04 needs $13 instead of $14
 cci=$(printf '%s\n' "$s" | tr '/-' ' ' | awk '{print $3,$4,$8,$14}' |tr ' ' \-)
 printf '%s %s\n' "$url" "/var/data/$cci"
done < ifile.txt 2>> "$res/errs" | \
 parallel --pipe -N$chunk -j "$wp" "bash -c \"tryread 2>>$res/errs{#}\"" 2>>"$res/errs" || pRes=$?
# pipefail is not enabled here, so pRes is the exit status of parallel
echo \# $(date) main loop exit code=$pRes >> "$log"
rm .running