Mercurial > hg > cc > azure
view workers/bin/findPDFs.sh @ 19:d4f186655bcc
lots of tweaking, reached the 80/20 point
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Sat, 20 Oct 2018 16:11:29 +0000 |
parents | bba589cab837 |
children |
line wrap: on
line source
#!/bin/bash
# Test script to split Common Crawl (CC) WAT files across worker processes
# and find application/pdf responses.
#
# Usage: findPDFs id home [-t] numWorkerProcesses
#   id                  worker id; names the results directory res<id> and,
#                       with -t, is also the start-up delay in seconds
#   home                ssh destination that receives the results on exit
#   -t                  test mode: no random wait, just sleep <id> seconds
#   numWorkerProcesses  job count handed to parallel (-j)
#
# Input:  ./ifile.txt, one "<path> <file_id>" pair per line; <path> is
#         appended to https://commoncrawl.s3.amazonaws.com/ to form the URL.
#         NOTE(review): the original usage note showed the pairs being piped
#         in on stdin, but the loop below reads ifile.txt -- confirm intent.
# Output: res<id>/ (log, per-job results and errs files) plus one merged
#         file per input, all shipped to <home>:data/pdf/wat by the EXIT
#         handler and then deleted locally.
#
# TODO: remove >>errs once tested
#
# The log-line echo commands below deliberately leave $(date) and friends
# unquoted so the log format (runs of blanks collapsed) stays as it was.
#set -e -o pipefail

# Print the synopsis and abort; called before any files are created.
function usage {
  echo "Usage: ${0##*/} id home [-t] numWorkerProcesses" 1>&2
  exit 2
}

# Validate up front: a missing numWorkerProcesses would otherwise make
# parallel take the command string as the argument of -j.
[ $# -ge 3 ] || usage
if [ "$3" = "-t" ] && [ $# -lt 4 ]; then
  usage
fi

echo $$ > test1.pid
proc=$1
res=res$proc
home=$2
shift 2

# lrand MAX
# Cheap bad little random number generator: prints 1 + (random byte mod MAX).
function lrand {
  local byte
  # -An drops the offset column, -tu1 prints the byte as an unsigned decimal
  byte=$(openssl rand 1 | od -An -tu1)
  # fall back to 0 if no byte could be read, rather than a syntax error
  echo $(( 1 + (${byte:-0} % $1) ))
}

if [ "$1" = "-t" ]; then
  shift
  pause=$proc
else
  pause=$(lrand 60)
fi
wp=$1
touch .running

# tryRead URL OUTFILE
# Download URL, gunzip it, keep the lines containing msgtype=response whose
# WARC-Target-URI is http(s), and write them to OUTFILE.  Any failing stage
# (pipefail) triggers a retry after a random 1-10s sleep; gives up and
# returns 1 once five attempts have failed.
# NOTE(review): OUTFILE may hold partial output from the last failed attempt.
function tryRead {
  local m=0
  set -o pipefail
  until
    if [ $((m+=1)) -gt 5 ]; then
      echo "tried to read $1 5 times w/o success, giving up" 1>&2
      # restore the option on this exit path too, so it does not leak
      set +o pipefail
      return 1
    fi &&
      curl -s --insecure -o - "$1" | zcat | grep -F msgtype=response |
        { grep -E -i '"WARC-Target-URI":"https?:' || : ; } > "$2"
  do
    # PIPESTATUS must be read first, before any other command resets it
    echo ${PIPESTATUS[@]} 1>&2
    # try to avoid lockstep retries
    sleep "$(lrand 10)"
    echo $(date) $2 retry number $m 1>&2
  done
  set +o pipefail
}

# EXIT handler: stream the merged CC* files and the results directory to
# $home (unpacked under data/pdf/wat there), then remove local artefacts.
function cleanup {
  #set -e -o pipefail
  ln -s ../nohup.cc .
  tar -czhf - CC* "$res" |
    ssh -o StrictHostKeyChecking=no -q "$home" '{ cd data
      mkdir -p pdf/wat
      cd pdf/wat
      tar -xzf - ; } 2>>errs'
  rm -rf "$res" CC* ifile.txt *.pid
  ( sleep 5 ; rm nohup.cc ) &
}
trap cleanup EXIT

mkdir -p "$res"
log=$res/log
# Don't all start at once
sleep "$pause"
echo \# $(date) > "$log"
pRes=0
while read -r s id; do
  url="https://commoncrawl.s3.amazonaws.com/$s"
  # strip everything up to and including the last "/wat/" ...
  ccm=${s##*/wat/}
  # ... and everything from the first "-ip" on
  cci=${ccm%%-ip*}
  export ID=$id
  echo \# $(date) "running |$proc|$home|$pause|$wp|$id|" >> "$log"
  # Experimental retry loop
  tryRead "$url" "crawl$id"
  if [ -s "crawl$id" ]; then
    echo \# $(date) $id $(wc -l crawl$id) >> "$log"
    # Deal the filtered lines out in 2M blocks to $wp _findPDFs.sh jobs;
    # each job appends to its own result and errs file, keyed by {#}.
    parallel --round-robin --pipe --block-size 2M -j "$wp" \
      "_findPDFs.sh {#} >> $res/$cci.{#} 2>>$res/errs{#}" < "crawl$id" ||
      echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2
  else
    echo "crawl$id empty" 1>&2
  fi
  rm "crawl$id"
  # merge the per-job outputs into a single file for this input
  cat "$res/$cci".* > "$cci"
done < ifile.txt 2>> "$res/errs" || pRes=$?
echo \# $(date) main loop exit code=$pRes >> "$log"
rm .running