Mercurial > hg > cc > azure
annotate workers/bin/_timedWhich.py @ 44:1342f6669352
knock off a few more relatively common cases
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Sat, 01 Dec 2018 12:13:34 +0000 |
parents | 1d776e96c16a |
children | 21152d241e1a |
rev | line source |
---|---|
40
4cf6bc21f683
start work on python version of tW.sh
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
#!/usr/bin/env python3
"""Tally reduced Last-Modified values per URI scheme.

Reads text lines from standard input (decoded as latin1).  A line is
considered only if it contains a ``"WARC-Target-URI":"http...`` or
``"https...`` field followed, later on the same line, by
``msgtype=response``.  For each such line the first ``"Last-Modified"``
value found *after* that point is reduced to whatever is left once the
ordinary date/time components have been stripped, and occurrences of each
residue are counted separately for http and https.  Lines with no
Last-Modified value are counted as "no date".

Output is tab-separated: ``scheme<TAB><TAB>no-date-count`` followed by one
``scheme<TAB>residue<TAB>count`` line per distinct residue, http first.
"""
import io
import re
import sys

# All patterns are raw strings: the regex escapes (\d, \w, \., \{) are not
# valid Python string escapes and trigger warnings in non-raw literals.

# Captures the scheme; the greedy .* means the match ends at the *last*
# msgtype=response on the line.
p1 = re.compile(r'"WARC-Target-URI":"(https?):.*msgtype=response')
# Captures the raw Last-Modified value (everything up to the closing quote).
p2 = re.compile(r'"Last-Modified":"([^"]*)"')
# Component separator: runs of comma/space/tab, optionally preceded by a dot.
sep = re.compile(r'\.?[, \t]+')
# Components that are dropped when they match in full (case-insensitive):
# weekday names, "gmt" with optional offset, am/pm, h:mm:ss-style times,
# "{ts", signed four-digit offsets, one- or two-digit numbers, a lone colon.
losers = re.compile(
    r'(mon|fri|sun)(day)?|tue(sday)?|wed(nesday)?|thu(rsday)?|sat(urday)?'
    r'|gmt([+-][\d:]+)?|[ap]m|\d\d?:\d\d:(\d\d(\.\d*)?\w*|rd)|\{ts'
    r'|[-+]\d\d\d\d|\d\d?|:',
    re.I)
# Trailing components that are discarded outright: digit + word chars + 3-4
# capitals, or a Capitalised/Capitalised pair (e.g. "America/New_York").
oddlast = re.compile(r'\d\w+[A-Z]{3,4}|[A-Z]\w+/[A-Z]\w+')

# Indexes into the per-scheme tables, and the matching scheme names.
HTTP = 0
HTTPS = 1
sn = ['http', 'https']


def reduce_last_modified(lm):
    """Return the residue of Last-Modified value *lm* used as a tally key.

    A value that is a single component starting with ``serve-proxy-cache:``
    collapses to exactly that prefix.  Otherwise the value is split on
    ``sep``, a final component matching ``oddlast`` is removed, every
    component fully matching ``losers`` is dropped, and the survivors are
    joined with single spaces.
    """
    parts = sep.split(lm)
    if len(parts) == 1 and parts[0].startswith('serve-proxy-cache:'):
        return 'serve-proxy-cache:'
    if oddlast.fullmatch(parts[-1]):
        parts.pop()
    return ' '.join(c for c in parts if not losers.fullmatch(c))


def tally(lines):
    """Count Last-Modified residues in *lines*, per scheme.

    Returns ``(tab, nd)``: ``tab[HTTP]`` / ``tab[HTTPS]`` map each residue
    to its number of occurrences (in first-seen order), and ``nd[HTTP]`` /
    ``nd[HTTPS]`` count matching lines that had no Last-Modified value.
    Lines that do not match ``p1`` are ignored.
    """
    tab = [{}, {}]
    nd = [0, 0]  # no date
    for line in lines:
        head = p1.search(line)
        if not head:
            continue
        k = HTTP if head.group(1) == 'http' else HTTPS
        # Only look for Last-Modified after the msgtype=response marker.
        found = p2.search(line, head.end())
        if found is None:
            nd[k] += 1
            continue
        r = reduce_last_modified(found.group(1))
        tab[k][r] = tab[k].get(r, 0) + 1
    return tab, nd


def report(tab, nd, out=None):
    """Print the tallies as tab-separated lines to *out* (default stdout)."""
    if out is None:
        out = sys.stdout
    for h in (HTTP, HTTPS):
        print("%s\t\t%s" % (sn[h], nd[h]), file=out)
        for (k, v) in tab[h].items():
            print("%s\t%s\t%s" % (sn[h], k, v), file=out)


def main():
    """Tally standard input and print the report."""
    # latin1 maps every byte to a character, so decoding cannot fail on
    # arbitrary header bytes.
    uin = io.TextIOWrapper(sys.stdin.buffer, encoding='latin1')
    tab, nd = tally(uin)
    report(tab, nd)


if __name__ == '__main__':
    main()