annotate workers/bin/_timedWhich.py @ 68:1f04bce6ead7 default tip

use basefile instead of transferfile, and remove cleanup: belt and braces wrt lossage of sac_schemes.py in 15% of 1000_k3, this as used in a_2
author Henry S. Thompson <ht@markup.co.uk>
date Thu, 04 Jun 2020 20:44:44 +0000
parents 7a4e49689935
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
40
4cf6bc21f683 start work on python version of tW.sh
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
#!/usr/bin/env python3
"""Tally Last-Modified values per URI scheme from lines read on stdin.

A line is counted as a response when it contains
``"WARC-Target-URI":"<scheme>:`` followed later by ``msgtype=response``.
For each such line the first ``"Last-Modified":"..."`` value after that
match is reduced to a short key (see ``date_key``) and counted under the
line's scheme; responses with no such value are counted separately.

Output (stdout), one tab-separated row per entry:
    <scheme> TAB TAB <count>      responses without Last-Modified (if any)
    <scheme> TAB <key> TAB <count>
A summary of lines read and responses seen is written to stderr.
"""
import io
import re
import sys

# Raw strings throughout: the patterns contain \w, \d and \. which are
# not valid escapes in ordinary string literals.
p1 = re.compile(r'"WARC-Target-URI":"(\w*):.*msgtype=response')
p2 = re.compile(r'"Last-Modified":"([^"]*)"')
# Separator between components of a Last-Modified value.
sep = re.compile(r'\.?[, \t]+')
# Components dropped from the key: day names, gmt/offsets, am/pm,
# h:m:s times, 1-2 digit numbers, and a few observed oddities.
losers = re.compile(
    r'(mon|fri|sun)(day)?|tue(sday)?|wed(nesday)?|thu(rsday)?|sat(urday)?'
    r'|gmt([+-][\d:]+)?|[ap]m|\d\d?:\d\d:(\d\d(\.\d*)?\w*|rd)|\{ts'
    r'|[-+]\d\d\d\d|\d\d?|:',
    re.I)
# Trailing component that is discarded outright when it matches in full.
oddlast = re.compile(r'\d\w+[A-Z]{3,4}|[A-Z]\w+/[A-Z]\w+')

HTTP = 0
HTTPS = 1


def date_key(lm):
    """Reduce a raw Last-Modified value *lm* to the key it is counted under.

    The value is split on ``sep``; components that fully match ``losers``
    are dropped and the rest are re-joined with single spaces.
    """
    lmc = sep.split(lm)
    if len(lmc) == 1 and lmc[0].startswith('serve-proxy-cache:'):
        return 'serve-proxy-cache:'
    # endswith() rather than [-1]: split() can yield empty components.
    if len(lmc) > 14 and lmc[-2].endswith(')'):
        # e.g. Sun, 23 Apr 2017 11:10(02017Sun, 23 Apr 2017 11:10:29 +0300Sun, 23 Apr 2017 11:10:29 +030017) GMT
        # Keep only the leading components; they still go through the
        # filter below so that this case yields a key of its own.
        lmc = lmc[:-12]
    elif oddlast.fullmatch(lmc[-1]):
        lmc.pop()
    return ' '.join(c for c in lmc if not losers.fullmatch(c))


def tally(lines):
    """Count Last-Modified keys per scheme over an iterable of text lines.

    Returns ``(sn, nd, tab, i, j)`` where
      sn  maps lower-cased scheme name -> index (http=0, https=1, others
          in order of first appearance),
      nd  is a list of per-index counts of responses with no Last-Modified,
      tab is a list of per-index dicts mapping key -> count,
      i   is the number of lines read and j the number of responses.
    """
    sn = {'http': HTTP, 'https': HTTPS}
    tab = [{}, {}]
    nd = [0, 0]  # no date
    i = j = 0
    for l in lines:
        i += 1
        m = p1.search(l)
        if not m:
            continue
        j += 1
        scheme = m.group(1).lower()
        k = sn.get(scheme)
        if k is None:
            # Next free slot: must equal the index the appends below create.
            k = len(sn)
            sn[scheme] = k
            tab.append({})
            nd.append(0)
        # Only look for Last-Modified after the response match.
        m = p2.search(l, m.end())
        if m is None:
            nd[k] += 1
        else:
            t = tab[k]
            r = date_key(m.group(1))
            t[r] = t.get(r, 0) + 1
    return sn, nd, tab, i, j


def report(sn, nd, tab, out=None):
    """Print the per-scheme tallies to *out* (default: current sys.stdout)."""
    if out is None:
        out = sys.stdout
    for name, h in sn.items():
        if nd[h] > 0:
            print("%s\t\t%s" % (name, nd[h]), file=out)
        for (k, v) in tab[h].items():
            print("%s\t%s\t%s" % (name, k, v), file=out)


def main():
    """Read stdin as latin1, print the tallies, and log totals to stderr."""
    uin = io.TextIOWrapper(sys.stdin.buffer, encoding='latin1')
    sn, nd, tab, i, j = tally(uin)
    report(sn, nd, tab)
    print("# %s lines, %s responses" % (i, j), file=sys.stderr)


if __name__ == '__main__':
    main()
42
1d776e96c16a works on one file
Henry S. Thompson <ht@markup.co.uk>
parents: 40
diff changeset
58
1d776e96c16a works on one file
Henry S. Thompson <ht@markup.co.uk>
parents: 40
diff changeset
59
46
7a4e49689935 finally got logging sorted
Henry S. Thompson <ht@markup.co.uk>
parents: 45
diff changeset
60
7a4e49689935 finally got logging sorted
Henry S. Thompson <ht@markup.co.uk>
parents: 45
diff changeset
61
7a4e49689935 finally got logging sorted
Henry S. Thompson <ht@markup.co.uk>
parents: 45
diff changeset
62
7a4e49689935 finally got logging sorted
Henry S. Thompson <ht@markup.co.uk>
parents: 45
diff changeset
63
7a4e49689935 finally got logging sorted
Henry S. Thompson <ht@markup.co.uk>
parents: 45
diff changeset
64
7a4e49689935 finally got logging sorted
Henry S. Thompson <ht@markup.co.uk>
parents: 45
diff changeset
65