view workers/bin/test1.sh @ 10:2fbefb8d1a9e

wrun.sh: usage catchup; invoke.sh: force terminal allocation on workers; test1.sh: support control of the number of worker processes spawned, support -t to turn off random delay at startup; count1.sh: actually do the counting in subprocs to avoid disk contention
author Henry S. Thompson <ht@markup.co.uk>
date Mon, 08 Oct 2018 13:17:23 +0000
parents 5db6015689a2
children 36b5d379909a
line wrap: on
line source

#!/bin/bash
# Test script to split CC WAT files across threads
#   to count http: vs. https:
# Usage: [echo file file_id] | test1.sh id home [-t] numWorkerProcesses
#   id    worker number: names the local results directory (res<id>) and,
#         with -t, is also the startup delay in seconds
#   home  ssh destination the results are shipped to when the script exits
#   -t    no random wait, just id seconds
#   numWorkerProcesses  value handed to parallel -j
# NOTE(review): the usage line shows the work list arriving on stdin, but the
#   main loop reads it from ifile.txt -- confirm which is intended.
# remove >>errs once tested
#set -e -o pipefail

function lrand {
  # cheap bad little random number generator: prints an integer in 1..$1
  # Must be defined before the argument handling below calls it: bash only
  # knows a function once its definition has been executed.
  local byte
  # One random byte (0-255) as unsigned decimal.  Stays empty when openssl is
  # missing or fails (hence 2>/dev/null), in which case $RANDOM is used.
  byte=$(openssl rand 1 2>/dev/null | od -An -tu1 | tr -d ' \n')
  echo $(( 1 + ${byte:-$RANDOM} % $1 ))
}

# Record our pid in the current directory
echo $$ > test1.pid
proc=$1
res=res$proc
home=$2
shift 2
if [[ "$1" == "-t" ]]; then
  # Deterministic startup delay: id seconds
  shift
  pause=$proc
else
  # Random startup delay of 1..60 seconds
  pause=$(lrand 60)
fi
wp=$1
# Marker file, removed again on the last line of the script
touch .running
function tryRead {
  # Fetch the gzip-compressed file at URL $1 and write to file $2 those of its
  # msgtype=response lines that carry an http: or https: "WARC-Target-URI"
  # (matched case-insensitively).
  # Makes at most 5 attempts, sleeping $(lrand 10) seconds after each failure.
  # Returns 0 as soon as an attempt succeeds, 1 after giving up.
  # pipefail is switched off again on both exits.
  # NOTE(review): --insecure disables TLS certificate verification.
  local url=$1 out=$2
  local attempt=0
  # With pipefail a failure in curl, zcat or the first grep (which also fails
  # when no line contains msgtype=response) fails the whole attempt.
  set -o pipefail
  while :; do
    if (( (attempt += 1) > 5 )); then
      echo "tried to read $url 5 times w/o success, giving up" 1>&2
      set +o pipefail
      return 1
    fi
    # "|| :" so that finding no matching URIs does not count as a failure
    curl -s --insecure -o - "$url" | zcat | grep -F msgtype=response | \
      { grep -E -i '"WARC-Target-URI":"https?:' || : ; } > "$out" && break
    # Exit codes of the pipeline stages of the failed attempt
    echo "${PIPESTATUS[*]}" 1>&2
    # try to avoid lockstep retries
    sleep "$(lrand 10)"
    # $(date) is deliberately unquoted: keeps the single-space log format
    echo $(date) "$out" retry number $attempt 1>&2
  done
  set +o pipefail
}
function shipResults {
  # EXIT handler: pack the contents of the results directory $res, unpack them
  # on $home under data/jobs/test1.$proc, then remove the local copy.
  # res, home and proc are read when the handler runs; they are assigned once
  # during startup, before the trap is installed.
  #set -e -o pipefail
  if cd "$res"; then
    # Link nohup.cc from the parent directory so that tar -h (follow
    # symlinks) packs its contents alongside the results
    ln -s ../nohup.cc .
    # The remote stderr is appended to errs.1 on the remote side
    tar -czhf - * | \
      ssh -o StrictHostKeyChecking=no -q "$home" "{ cd data/jobs/
                    mkdir -p test1.$proc
                    cd test1.$proc
                    tar -xzf - ; } 2>>errs.1"
  else
    # Without this guard tar would pack whatever the current directory holds
    echo "cannot cd to $res, results not shipped" 1>&2
  fi
  # NOTE(review): the bare cd goes to $HOME, so the relative $res and nohup.cc
  # below presumably assume the script was started from $HOME -- confirm.
  cd
  rm -rf "$res"
  # Delayed removal of nohup.cc, done in the background
  ( sleep 5 ; rm nohup.cc ) &
}
trap shipResults EXIT
# Main section: after the startup delay, process every "<path> <id>" line of
# ifile.txt -- download and filter the file with tryRead, then pipe the kept
# lines through $wp parallel count1.sh jobs -- and finally merge the per-job
# outputs into $res/tots.
# Command substitutions passed to echo are deliberately left unquoted so that
# the log lines keep their single-space format.
mkdir -p "$res"
log=$res/log
# Don't all start at once
sleep "$pause"
echo \# $(date) >  "$log"
pRes=0
# -r: take backslashes in the input literally
while read -r s id
do
 url="https://commoncrawl.s3.amazonaws.com/$s"
 # Exported into the environment of the child processes started below
 export ID=$id
 echo $(date) "running |$proc|$home|$pause|$wp|$id|" >> "$log"
 # Experimental retry loop
 tryRead "$url" "crawl$id"
 if [ -s "crawl$id" ]
 then
  echo \# $id $(wc -l "crawl$id") >> "$log"
  # {#} is parallel's job-number placeholder: one output and one errs file per job
  parallel --round-robin --pipe -j "$wp" "count1.sh >> $res/{#} 2>>$res/errs{#}" < "crawl$id" || echo "ppfailed $? ${PIPESTATUS[*]}" 1>&2
 else
  echo "crawl$id empty" 1>&2
 fi
 rm "crawl$id"
# stderr of everything inside the loop is appended to $res/errs;
# pRes records a non-zero exit status of the loop as a whole
done < ifile.txt 2>> "$res/errs" || pRes=$?
# Concatenate the files whose names start with a digit 1-9, dropping lines that
# contain '#' and deleting double quotes.  The glob is intentionally unquoted.
( cd "$res" && grep -F -h -v \# [1-9]* ) | tr -d \" > "$res/tots"
echo \# $(date) main loop exit code=$pRes >> "$log"
rm .running