Mercurial > hg > cc > azure
view workers/bin/ptimedWhich.sh @ 47:2a0dab424418
cci path hack changed for 2018.04
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Mon, 10 Dec 2018 14:43:18 +0000 |
parents | 7a4e49689935 |
children | b8a88cad75d5 |
line wrap: on
line source
#!/bin/bash
# Test script to split CC WAT files across threads
#  to tabulate http vs. https by last-modified date.
#
# Usage: [echo file file_id] | timedWhich.sh id home [-t] numWorkerProcesses
#   id                  worker id; results go to /var/data/res<id>
#   home                ssh destination that receives the results on exit
#   -t                  no random wait, just sleep <id> seconds
#   numWorkerProcesses  number of parallel jobs (positive integer)
#
# The S3 keys to process are read from ifile.txt in the current directory.
# _timedWhich.py is invoked by bare name, so it must be on PATH.
#
# Log lines deliberately leave $(date) unquoted so that their existing
# single-space formatting is preserved.
#
# remove >>errs once tested
#set -e -o pipefail

# Print a one-line usage reminder on stderr.
usage() {
  echo "Usage: ${0##*/} id home [-t] numWorkerProcesses" 1>&2
}

# cheap bad little random number generator
# Prints an integer in 1..$1 derived from a single random byte (0-255).
lrand() {
  local byte
  byte=$(openssl rand 1 | od -An -tu1 | tr -d ' \n')
  # Without this an empty result would be an arithmetic syntax error below.
  [[ -n "$byte" ]] || byte=$RANDOM
  echo $(( 1 + (byte % $1) ))
}

# Worker body; run by parallel via "bash -c", hence the export -f further down.
# Reads "url output-path" pairs from stdin. Each URL is fetched with curl,
# decompressed with zcat and piped through _timedWhich.py into output-path.
# A URL is attempted at most 5 times, with a random 1-10 second pause between
# attempts.
# Returns 1 as soon as one URL has failed 5 times; pairs still unread on
# stdin at that point are left unprocessed.
tryread() {
  local u o m
  while read -r u o; do
    m=0
    # The until condition must fail if any stage of the download pipeline fails
    set -o pipefail
    until
      if (( (m += 1) > 5 )); then
        echo " tried 5 times w/o success, giving up" 1>&2
        return 1
      fi &&
        echo -n \# $(date) "reading $u ..." 1>&2 &&
        curl -s -S --max-time 60 --insecure -o - "$u" |
        { echo "done at " $(date) 1>&2 ; zcat ; } |
        _timedWhich.py > "$o"
    do
      # Report the per-stage exit codes first: any other command resets PIPESTATUS
      echo \# "${PIPESTATUS[*]}" 1>&2
      # try to avoid lockstep retries
      sleep "$(lrand 10)"
      echo \# $(date) retry number $m 1>&2
    done
    set +o pipefail
  done
}

# EXIT handler: ship everything matching CC* and res* under /var/data to
# data/which on $home, then remove the local copies and bookkeeping files.
# set -e with pipefail makes any failure (cd, tar, ssh) end the handler
# before the rm -rf, so results are only deleted after a successful transfer.
cleanup() {
  set -e -o pipefail
  cd /var/data
  # The remote steps are chained with && so that nothing is unpacked in the
  # wrong directory if "cd data" fails on the receiving side.
  tar -czhf - CC* res* |
    ssh -o StrictHostKeyChecking=no -q "$home" \
      "{ cd data && mkdir -p which && cd which && tar -xzf - ; } 2>>errs"
  rm -rf res* CC*
  cd
  rm ifile.txt ./*.pid
  ( sleep 5 ; rm nohup.cc ) &
}

# Validate the command line before creating any files or arming the
# destructive EXIT handler.
if (( $# < 3 )); then
  usage
  exit 2
fi

proc=$1
res=/var/data/res$proc
home=$2
shift 2

if [[ "$1" == "-t" ]]; then
  shift
  pause=$proc
else
  pause=$(lrand 60)
fi

wp=$1
if ! [[ "$wp" =~ ^[1-9][0-9]*$ ]]; then
  usage
  exit 2
fi

echo $$ > test1.pid
touch .running

trap cleanup EXIT

mkdir -p "$res"
log=$res/log
# Don't all start at once
sleep "$pause"
echo \# $(date) > "$log"
pRes=0
N=$(wc -l < ifile.txt)
# Integer division gives 0 when there are fewer input lines than workers;
# clamp so that parallel is always given a positive record count.
chunk=$(( N / wp ))
(( chunk > 0 )) || chunk=1
export -f tryread lrand
while read -r s; do
  url="https://commoncrawl.s3.amazonaws.com/$s"
  # below for 2018-04, for 2017-04 needs $13 instead of $14
  cci=$(printf '%s\n' "$s" | tr '/-' ' ' | awk '{print $3,$4,$8,$14}' | tr ' ' \-)
  printf '%s %s\n' "$url" "/var/data/$cci"
done < ifile.txt 2>> "$res/errs" |
  parallel --pipe -N"$chunk" -j "$wp" "bash -c \"tryread 2>>$res/errs{#}\"" 2>>"$res/errs" || pRes=$?
echo \# $(date) main loop exit code=$pRes >> "$log"
rm .running