annotate workers/bin/_timedWhich.py @ 42:1d776e96c16a

works on one file
author Henry S. Thompson <ht@markup.co.uk>
date Fri, 30 Nov 2018 15:41:02 +0000
parents 4cf6bc21f683
children 1342f6669352
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
#!/usr/bin/env python3
"""Tally what is left of Last-Modified values, per URI scheme, for stdin.

A line is considered only if it matches ``p1``: a ``"WARC-Target-URI"`` whose
value starts with ``http:`` or ``https:``, with ``msgtype=response`` later on
the same line.  If a ``"Last-Modified":"..."`` field occurs after that match,
its value is reduced by ``residue`` and counted under the line's scheme;
otherwise the line is counted as having no date.

Output is tab-separated.  For each scheme (http first, then https) there is
one ``scheme<TAB><TAB>no-date-count`` line, then one
``scheme<TAB>residue<TAB>count`` line per distinct residue.

NOTE(review): this text was recovered from an annotate listing that had
dropped leading whitespace, so the nesting below is reconstructed from data
flow -- confirm against the repository copy.
"""
import io
import re
import sys
from collections import Counter

# A response record: captures the URI scheme and ends at 'msgtype=response'.
p1 = re.compile(r'"WARC-Target-URI":"(https?):.*msgtype=response')
# The Last-Modified value, captured without its surrounding quotes.
p2 = re.compile(r'"Last-Modified":"([^"]*)"')
# Component separator: an optional '.' followed by commas, spaces or tabs.
sep = re.compile(r'\.?[, \t]+')
# Components discarded from a value: weekday names, 'gmt', am/pm, h:m:s times
# (with optional fraction and trailing word characters), +/-NNNN offsets,
# one- or two-digit numbers, and a lone ':'.
losers = re.compile(
    r'(mon|tue|wed|thu|fri|sat|sun)(day)?|gmt|[ap]m'
    r'|\d\d?:\d\d:\d\d(\.\d*)?\w*|[-+]\d\d\d\d|\d\d?|:',
    re.I)
# A trailing 'Word/Word' component (e.g. 'America/New_York'), also discarded.
oddz = re.compile(r'[A-Z]\w+/[A-Z]\w+')

HTTP = 0
HTTPS = 1
sn = ['http', 'https']  # scheme names, indexed by HTTP / HTTPS


def residue(lm):
    """Return the tally key for one Last-Modified value.

    The value is split on ``sep``.  A value with no separator that starts
    with 'serve-proxy-cache:' collapses to exactly that prefix.  Otherwise a
    trailing ``oddz`` component is dropped, every component fully matching
    ``losers`` is dropped, and the survivors are joined with single spaces.
    For example 'Fri, 30 Nov 2018 15:41:02 GMT' becomes 'Nov 2018'.
    """
    parts = sep.split(lm)  # always has at least one element
    if len(parts) == 1 and parts[0].startswith('serve-proxy-cache:'):
        return 'serve-proxy-cache:'
    if oddz.fullmatch(parts[-1]):
        parts.pop()
    return ' '.join(c for c in parts if not losers.fullmatch(c))


def tally(lines):
    """Classify every line and return ``(tab, nd)``.

    ``tab`` is a two-element list of Counters (indexed by HTTP / HTTPS)
    mapping residue -> number of lines; ``nd`` is a two-element list holding
    the number of matching lines with no Last-Modified field after the
    ``p1`` match.  Lines that do not match ``p1`` are ignored.
    """
    tab = [Counter(), Counter()]
    nd = [0, 0]  # no date
    for line in lines:
        m = p1.search(line)
        if not m:
            continue
        k = HTTP if m.group(1) == 'http' else HTTPS
        # Only look for Last-Modified after the end of the p1 match.
        m = p2.search(line, m.end())
        if m is None:
            nd[k] += 1
        else:
            tab[k][residue(m.group(1))] += 1
    return tab, nd


def format_report(tab, nd):
    """Return the report lines (without newlines) for the given tallies."""
    out = []
    for h in (HTTP, HTTPS):
        out.append("%s\t\t%s" % (sn[h], nd[h]))
        for (k, v) in tab[h].items():
            out.append("%s\t%s\t%s" % (sn[h], k, v))
    return out


def main():
    """Read records from stdin and print the report to stdout."""
    # latin1 maps every byte to a code point, so decoding cannot fail.
    uin = io.TextIOWrapper(sys.stdin.buffer, encoding='latin1')
    tab, nd = tally(uin)
    for row in format_report(tab, nd):
        print(row)


if __name__ == '__main__':
    main()