comparison workers/bin/_timedWhich.py @ 42:1d776e96c16a

works on one file
author Henry S. Thompson <ht@markup.co.uk>
date Fri, 30 Nov 2018 15:41:02 +0000
parents 4cf6bc21f683
children 1342f6669352
comparison
equal deleted inserted replaced
41:3313edbab3b0 42:1d776e96c16a
2 import re,sys,io 2 import re,sys,io
3 3
4 uin=io.TextIOWrapper(sys.stdin.buffer,encoding='latin1') 4 uin=io.TextIOWrapper(sys.stdin.buffer,encoding='latin1')
5 p1=re.compile('"WARC-Target-URI":"(https?):.*msgtype=response') 5 p1=re.compile('"WARC-Target-URI":"(https?):.*msgtype=response')
6 p2=re.compile('"Last-Modified":"([^"]*)"') 6 p2=re.compile('"Last-Modified":"([^"]*)"')
7 w={} 7 sep=re.compile('\.?[, \t]+')
8 wo={} 8 losers=re.compile('(mon|tue|wed|thu|fri|sat|sun)(day)?|gmt|[ap]m|\d\d?:\d\d:\d\d(\.\d*)?\w*|[-+]\d\d\d\d|\d\d?|:',re.I)
9 oddz=re.compile('[A-Z]\w+/[A-Z]\w+')
10 HTTP=0
11 HTTPS=1
12 tab=[{},{}]
13 nd=[0,0] # no date
14 sn=['http','https']
9 for l in uin: 15 for l in uin:
10 m=p1.search(l) 16 m=p1.search(l)
11 if m: 17 if m:
12 k=m.group(1) 18 k=HTTP if m.group(1)=='http' else HTTPS
13 m=p2.search(l,m.end()) 19 m=p2.search(l,m.end())
14 if m is None: 20 if m is None:
15 wo[k]=wo.get(k,0)+1 21 nd[k]+=1
16 else: 22 else:
17 w[k]=w.get(k,0)+1 23 t=tab[k]
18 print("with %s\nw/o %s"%(w,wo)) 24 lm=m.group(1)
25 lmc=sep.split(lm)
26 if len(lmc)==1 and lmc[0].startswith('serve-proxy-cache:'):
27 r='serve-proxy-cache:'
28 else:
29 if oddz.fullmatch(lmc[-1]):
30 lmc.pop()
31 r=' '.join(c for c in lmc if not losers.fullmatch(c))
32 t[r]=t.get(r,0)+1
33 for h in (HTTP,HTTPS):
34 print("%s\t\t%s"%(sn[h],nd[h]))
35 for (k,v) in tab[h].items():
36 print("%s\t%s\t%s"%(sn[h],k,v))
37
38