Mercurial > hg > cc > azure
changeset 17:2a2c1fb03c54
first cut at http/https real trial, with month and year last-modified info too
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Fri, 19 Oct 2018 11:36:31 +0000 |
parents | c3e9ad8a42cf |
children | 9631fca89cc6 |
files | workers/bin/_timedWhich.sh workers/bin/timedWhich.sh |
diffstat | 2 files changed, 86 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/bin/bash
# workers/bin/_timedWhich.sh
#
# Filter that tabulates http vs. https target URIs by the month and year of
# their Last-Modified header.
#
# stdin:  WAT record text (presumably one JSON record per line, as fed by
#         timedWhich.sh via parallel --pipe -- confirm against the caller).
# stdout: one line per distinct key, "<key> <count>", in unspecified order.
#         A key is "http" or "https" followed by whatever remains of the
#         Last-Modified value once weekday, time, zone and day-of-month have
#         been stripped (e.g. "https Oct 2018"), or a bare "http:"/"https:"
#         when no Last-Modified value follows the target URI.
# Command-line arguments are ignored.

tabulate_scheme_by_date() {
  # Stage 1: keep only the URI scheme and the Last-Modified field.
  #   (The pattern must be a separate word from -o: written as -o'(...)'
  #    grep treats "(" as an option letter and fails.)
  # Stage 2: reduce those to "http:"/"https:" and ':"<date>"'.
  # Stage 3: join each scheme line with the date line that follows it, by
  #   turning newlines into '#', rewriting ':#:' to a space, and turning '#'
  #   back into newlines; then drop the double quotes.
  #   NOTE(review): a literal '#' inside a Last-Modified value would be
  #   turned into a line break by this trick.
  # Stage 4 (sed), in order:
  #   - drop the " hh:mm:ss " time
  #   - drop the weekday prefix ("Fri, ", "Friday, ", "Fri. " ...)
  #   - drop a trailing zone ("+0000" or 2-3 upper-case letters)
  #   - drop the day of month
  #   - turn a trailing "/dd/yyyy" into " yyyy"
  # Stage 5: count identical lines.
  grep -E -o '("WARC-Target-URI":"https?:|"Last-Modified":"[^"]*")' |
    grep -E -o '(https?:|:".*"$)' |
    tr '\012' \# | sed 's/:#:/ /g' | tr \# '\012' | tr -d \" |
    sed -e 's/ [[:digit:]][[:digit:]]\?:[[:digit:]][[:digit:]]:[[:digit:]][[:digit:]] / /' \
        -e 's/\(https\? \)\(: \)\?[MTWFSa-z]..\.\?, \?/\1/' \
        -e 's/ \([-+][[:digit:]]\{4\}\|[[:upper:]]\{2,3\}\)$//' \
        -e 's/ [[:digit:]]\{1,2\} / /' \
        -e 's/\/[[:digit:]]\{1,2\}\/\([[:digit:]]\{4\}\)$/ \1/' |
    awk '{c[$0]+=1} END {for (k in c) {print k, c[k]}}'
}

tabulate_scheme_by_date
#!/bin/bash
# workers/bin/timedWhich.sh
#
# Test script to split CC WAT files across threads
# to tabulate http vs. https by last-modified date.
#
# Usage: timedWhich.sh id home [-t] numWorkerProcesses
#   id                  worker id; results are collected in ./res<id>
#   home                ssh destination that receives the results on exit
#   -t                  no random wait, just sleep <id> seconds before starting
#   numWorkerProcesses  job count handed to parallel (-j)
#
# The work list is read from ./ifile.txt, one "<path> <file_id>" pair per
# line; <path> is appended to https://commoncrawl.s3.amazonaws.com/.
# remove >>errs once tested
#set -e -o pipefail

usage() {
  echo "Usage: ${0##*/} id home [-t] numWorkerProcesses" 1>&2
  exit 2
}

# Date stamp with runs of blanks squeezed, so log lines keep the
# single-space layout regardless of how date pads the day of month.
stamp() {
  date | tr -s ' '
}

# lrand N: print a number in 1..N.
lrand() {
  # cheap bad little random number generator
  local byte
  byte=$(openssl rand 1 2>/dev/null | od -An -tu1 | tr -d ' \n')
  # Fall back to the shell's generator if openssl gave us nothing usable.
  [[ $byte =~ ^[0-9]+$ ]] || byte=$RANDOM
  echo $(( 1 + byte % $1 ))
}

# tryRead URL OUTFILE
# Download URL, gunzip it, keep the msgtype=response lines that carry an
# http/https WARC-Target-URI, and write them to OUTFILE.  Up to 5 attempts.
# Returns 0 on the first attempt whose every pipeline stage succeeded,
# 1 after 5 failures.
tryRead() {
  local url=$1 out=$2
  local -i attempt stage ok
  local -a stages
  for (( attempt = 1; attempt <= 5; attempt++ )); do
    curl -s --insecure -o - "$url" | zcat | grep -F msgtype=response |
      { grep -E -i '"WARC-Target-URI":"https?:' || : ; } > "$out"
    # Must be captured immediately: any later command overwrites PIPESTATUS.
    stages=("${PIPESTATUS[@]}")
    # Same success rule as pipefail, without changing the shell option
    # (so nothing is left switched on when we give up).
    ok=1
    for stage in "${stages[@]}"; do
      (( stage == 0 )) || ok=0
    done
    if (( ok )); then
      return 0
    fi
    echo "${stages[*]}" 1>&2
    if (( attempt < 5 )); then
      # try to avoid lockstep retries
      sleep "$(lrand 10)"
    fi
    echo "$(stamp) $out retry number $attempt" 1>&2
  done
  echo "tried to read $url 5 times w/o success, giving up" 1>&2
  return 1
}

# EXIT handler: ship the per-crawl totals (CC*) and the result directory to
# $home:data/pdf/wat, then clean up the working directory.
# NOTE(review): local results are removed even if the tar|ssh transfer
# fails -- confirm that is intended.
ship_results() {
  #set -e -o pipefail
  ln -s ../nohup.cc .
  tar -czhf - CC* "$res" |
    ssh -o StrictHostKeyChecking=no -q "$home" '{ cd data
         mkdir -p pdf/wat
         cd pdf/wat
         tar -xzf - ; } 2>>errs'
  rm -rf -- "$res" CC* ifile.txt ./*.pid
  ( sleep 5 ; rm nohup.cc ) &
}

# ---- argument handling -------------------------------------------------
if (( $# < 3 )); then
  usage
fi
proc=$1
res=res$proc
home=$2
shift 2
if [ "$1" = "-t" ]; then
  shift
  pause=$proc
else
  pause=$(lrand 60)
fi
wp=$1
if [[ -z "$wp" ]]; then
  usage
fi

echo $$ > test1.pid
touch .running
trap ship_results EXIT

mkdir -p "$res"
log=$res/log
# Don't all start at once
sleep "$pause"
echo "# $(stamp)" > "$log"
pRes=0
# "|| [[ -n $s ]]" keeps a final line that lacks a trailing newline.
while read -r s id || [[ -n "$s" ]]; do
  url="https://commoncrawl.s3.amazonaws.com/$s"
  ccm=${s##*/wat/}
  cci=${ccm%%-ip*}
  export ID=$id
  echo "# $(stamp) running |$proc|$home|$pause|$wp|$id|" >> "$log"
  # Experimental retry loop
  tryRead "$url" "crawl$id"
  if [ -s "crawl$id" ]; then
    echo "# $(stamp) $id $(wc -l "crawl$id")" >> "$log"
    # Each parallel job appends its counts to $res/<crawl>.<job number>.
    parallel --round-robin --pipe --block-size 2M -j "$wp" \
      "_timedWhich.sh {#} >> $res/$cci.{#} 2>>$res/errs{#}" < "crawl$id" ||
      printf 'ppfailed %s \n%s\n' "$?" "${PIPESTATUS[*]}" 1>&2
  else
    echo "crawl$id empty" 1>&2
  fi
  rm -f -- "crawl$id"
  # Gather the per-job outputs for this crawl into one file named after it.
  cat "$res/$cci".* > "$cci"
done < ifile.txt 2>> "$res/errs" || pRes=$?
echo "# $(stamp) main loop exit code=$pRes" >> "$log"
rm .running