Mercurial > hg > cc > azure
changeset 46:7a4e49689935
finally got logging sorted
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Mon, 03 Dec 2018 21:10:02 +0000 |
parents | 21152d241e1a |
children | 2a0dab424418 |
files | workers/bin/_timedWhich.py workers/bin/ptimedWhich.sh |
diffstat | 2 files changed, 106 insertions(+), 6 deletions(-) [+] |
line wrap: on
line diff
--- a/workers/bin/_timedWhich.py Sat Dec 01 16:25:04 2018 +0000 +++ b/workers/bin/_timedWhich.py Mon Dec 03 21:10:02 2018 +0000 @@ -2,7 +2,7 @@ import re,sys,io uin=io.TextIOWrapper(sys.stdin.buffer,encoding='latin1') -p1=re.compile('"WARC-Target-URI":"(https?):.*msgtype=response') +p1=re.compile('"WARC-Target-URI":"(\w*):.*msgtype=response') p2=re.compile('"Last-Modified":"([^"]*)"') sep=re.compile('\.?[, \t]+') losers=re.compile('(mon|fri|sun)(day)?|tue(sday)?|wed(nesday)?|thu(rsday)?|sat(urday)?|gmt([+-][\d:]+)?|[ap]m|\d\d?:\d\d:(\d\d(\.\d*)?\w*|rd)|\{ts|[-+]\d\d\d\d|\d\d?|:',re.I) @@ -11,11 +11,27 @@ HTTPS=1 tab=[{},{}] nd=[0,0] # no date -sn=['http','https'] +sn={'http':HTTP,'https':HTTPS} +i=j=0 for l in uin: + i+=1 m=p1.search(l) if m: - k=HTTP if m.group(1)=='http' else HTTPS + j+=1 + scheme=m.group(1) + if scheme=='http': + k=HTTP + elif scheme=='https': + k=HTTPS + else: + scheme=scheme.lower() + try: + k=sn[scheme] + except KeyError: + k=len(sn)+1 + sn[scheme]=k + tab.append(dict()) + nd.append(0) m=p2.search(l,m.end()) if m is None: nd[k]+=1 @@ -33,9 +49,17 @@ lmc.pop() r=' '.join(c for c in lmc if not losers.fullmatch(c)) t[r]=t.get(r,0)+1 -for h in (HTTP,HTTPS): - print("%s\t\t%s"%(sn[h],nd[h])) +for l,h in sn.items(): + if nd[h]>0: + print("%s\t\t%s"%(l,nd[h])) for (k,v) in tab[h].items(): - print("%s\t%s\t%s"%(sn[h],k,v)) + print("%s\t%s\t%s"%(l,k,v)) +print("# %s lines, %s responses"%(i,j),file=sys.stderr) + + + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/workers/bin/ptimedWhich.sh Mon Dec 03 21:10:02 2018 +0000 @@ -0,0 +1,76 @@ +#!/bin/bash +# Test script to split CC WAT files across threads +# to tabulate http vs. https by last-modified date: +# Usage: [echo file file_id] | timedWhich.sh id home [-t] numWorkerProcesses +# If -t, no random wait, just id seconds +# remove >>errs once tested +#set -e -o pipefail +echo $$ > test1.pid +proc=$1 +res=/var/data/res$proc +home=$2 +shift 2 +function lrand { +# cheap bad little random number generator +echo $(( 1 + ($(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' ') % $1))) +} +if [ "$1" = "-t" ] +then + shift + pause=$proc +else + pause=$(lrand 60) +fi +wp=$1 +touch .running +function tryread { +n=$1 +while read u o +do + m=0 + set -o pipefail + until if [ $((m+=1)) -gt 5 ]; then echo " tried 5 times w/o success, giving up" 1>&2; return 1; fi && echo -n \# $(date) "reading $u ..." 1>&2 && \ + curl -s -S --max-time 60 --insecure -o - "$u" | \ + { echo "done at " $(date) 1>&2 ; zcat ; } |\ + _timedWhich.py > "$o" + do + # try to avoid lockstep retries + echo \# ${PIPESTATUS[@]} 1>&2 + sleep $(lrand 10) + echo \# $(date) retry number $m 1>&2 + done + set +o pipefail +done +} +trap "{ + set -e -o pipefail + cd /var/data + tar -czhf - CC* res* | \ + ssh -o StrictHostKeyChecking=no -q $home \"{ cd data + mkdir -p which + cd which + tar -xzf - ; } 2>>errs\" + rm -rf res* CC* + cd + rm ifile.txt *.pid + ( sleep 5 ; rm nohup.cc ) & + }" EXIT +mkdir -p $res +log=$res/log +# Don't all start at once +sleep $pause +echo \# $(date) > $log +pRes=0 +N=$(wc -l< ifile.txt) +export -f tryread lrand +while read s +do + url="https://commoncrawl.s3.amazonaws.com/$s" + cci=$(echo $s | tr '/-' ' ' | awk '{print $3,$4,$8,$13}' |tr ' ' \-) + echo $url /var/data/$cci +done < ifile.txt 2>> $res/errs | \ + parallel --pipe -N$((N / wp)) -j $wp "bash -c \"tryread 2>>$res/errs{#}\"" 2>>$res/errs || pRes=$? +echo \# $(date) main loop exit code=$pRes >> $log +rm .running + +