# HG changeset patch # User Henry S. Thompson # Date 1543592462 0 # Node ID 1d776e96c16ac06eaf5758f1033c0018b15f5ceb # Parent 3313edbab3b02a358be3967d8dcfe5e6b818bb79 works on one file diff -r 3313edbab3b0 -r 1d776e96c16a workers/bin/_timedWhich.py --- a/workers/bin/_timedWhich.py Fri Nov 30 13:44:50 2018 +0000 +++ b/workers/bin/_timedWhich.py Fri Nov 30 15:41:02 2018 +0000 @@ -4,15 +4,35 @@ uin=io.TextIOWrapper(sys.stdin.buffer,encoding='latin1') p1=re.compile('"WARC-Target-URI":"(https?):.*msgtype=response') p2=re.compile('"Last-Modified":"([^"]*)"') -w={} -wo={} +sep=re.compile('\.?[, \t]+') +losers=re.compile('(mon|tue|wed|thu|fri|sat|sun)(day)?|gmt|[ap]m|\d\d?:\d\d:\d\d(\.\d*)?\w*|[-+]\d\d\d\d|\d\d?|:',re.I) +oddz=re.compile('[A-Z]\w+/[A-Z]\w+') +HTTP=0 +HTTPS=1 +tab=[{},{}] +nd=[0,0] # no date +sn=['http','https'] for l in uin: m=p1.search(l) if m: - k=m.group(1) + k=HTTP if m.group(1)=='http' else HTTPS m=p2.search(l,m.end()) if m is None: - wo[k]=wo.get(k,0)+1 + nd[k]+=1 else: - w[k]=w.get(k,0)+1 -print("with %s\nw/o %s"%(w,wo)) + t=tab[k] + lm=m.group(1) + lmc=sep.split(lm) + if len(lmc)==1 and lmc[0].startswith('serve-proxy-cache:'): + r='serve-proxy-cache:' + else: + if oddz.fullmatch(lmc[-1]): + lmc.pop() + r=' '.join(c for c in lmc if not losers.fullmatch(c)) + t[r]=t.get(r,0)+1 +for h in (HTTP,HTTPS): + print("%s\t\t%s"%(sn[h],nd[h])) + for (k,v) in tab[h].items(): + print("%s\t%s\t%s"%(sn[h],k,v)) + +