annotate master/bin/fixDates.py @ 32:9342f6269edf

rewritten to be faster, maybe, and avoid earlier bug
author Henry S. Thompson <ht@markup.co.uk>
date Tue, 20 Nov 2018 10:31:05 +0000
parents 580cc12c9712
children 4c117ee8ed75
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
24
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
#!/usr/bin/env python3
"""Aggregate per-scheme, per-month counts from dated lines on stdin.

Each input line is expected to look like ``<scheme> [<date fields>] <count>``
where <scheme> is ``http`` or ``https`` (optionally followed by a colon).

Output (stdout, tab-separated), for http and then https:
  * ``<scheme> 0 0 <count>``           total for lines with no date fields
  * ``<scheme> <Mon> <year> <count>``  one line per non-zero month, by year

Rejected lines ("bogons") are reported on stderr, prefixed by a numeric
diagnostic code (1-8) identifying the check that failed.  The last line on
stderr is ``<lines read> <bogons>``.  Exit status is 1 if stdin was empty.
"""
import re
import sys
from array import array

# Fast path: "<scheme> <month> <year 19xx|20xx> <count>" with nothing else.
ok = re.compile('(https?) ([^ ]*) ((?:19|20)..) ([0-9][0-9]*)$')

HTTP = 0
HTTPS = 1
sn = ['http', 'https']          # scheme index -> scheme name
FIRST_YEAR = 1900
LAST_YEAR = 2100                # tables cover 1900--2100 inclusive
N_YEARS = LAST_YEAR - FIRST_YEAR + 1
mn = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
      'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
months = dict(zip(mn[1:], range(1, 13)))


def _in_year_range(year):
    """Return True if *year* has a slot in the per-year tables."""
    return FIRST_YEAR <= year <= LAST_YEAR


def tally(lines, parse, err=None):
    """Accumulate counts from *lines*.

    lines -- iterable of input lines (trailing newlines are fine)
    parse -- callable taking a date string and returning an object with
             ``year`` and ``month`` attributes, or None if it cannot parse
             (``dateparser.parse`` when run as a script)
    err   -- file object for diagnostics; defaults to sys.stderr

    Returns ``(n, bogons, nd, tab)``:
      n      -- number of lines read
      bogons -- number of lines rejected
      nd     -- [http, https] totals for lines that carried no date
      tab    -- [http, https] lists indexed by ``year - 1900``; each entry is
                None or an array of 13 counters indexed by month (slot 0
                is unused)
    """
    if err is None:
        err = sys.stderr
    n = 0
    bogons = 0
    nd = [0, 0]
    tab = [[None] * N_YEARS, [None] * N_YEARS]
    for l in lines:
        n += 1
        ff = ok.match(l)
        if ff is not None:
            scheme = HTTP if ff.group(1) == 'http' else HTTPS
            # Group 4 is digits only.  Convert it *before* looking at the
            # date fields so the count is available to the fallback below.
            count = int(ff.group(4))
            try:
                # More alphas than numerics...
                try:
                    month = months[ff.group(2)]
                except KeyError:
                    month = int(ff.group(2))
                    if not 1 <= month <= 12:
                        raise ValueError('month out of range')
                year = int(ff.group(3))
                if not _in_year_range(year):
                    raise ValueError('year out of range')
            except ValueError:
                # Unusual month or year field: let the date parser try
                try:
                    d = parse("%s %s" % (ff.group(2), ff.group(3)))
                except Exception as e:
                    # The parser is third-party; treat any failure as a
                    # bad line rather than aborting the whole run.
                    print(6, e, l, file=err)
                    bogons += 1
                    continue
                if d is None:
                    print(5, ff.group(1), ff.group(2), ff.group(3),
                          ff.group(4), file=err)
                    bogons += 1
                    continue
                if not _in_year_range(d.year):
                    # Shouldn't happen
                    print(7, ff.group(1), ff.group(2), ff.group(3),
                          ff.group(4), file=err)
                    bogons += 1
                    continue
                month = d.month
                year = d.year
        else:
            cols = l.split()
            # Original author's note: the colon handling is needed because
            # we get both "http:" with nothing else, when there was no
            # last-mod header, and "http" by itself, when the last-mod
            # consisted entirely of TZ info, which then gets deleted.
            scheme = cols[0] if cols else ''    # blank line -> code 1
            if scheme.endswith(':'):
                scheme = scheme[:-1]
            if scheme not in ('http', 'https'):
                print(1, scheme, l, file=err)
                bogons += 1
                continue
            scheme = HTTP if scheme == 'http' else HTTPS
            fields = cols[1:]
            raw = fields.pop() if fields else None
            try:
                count = int(raw)
                if count < 0:
                    # counters are unsigned; a negative count is garbage
                    raise ValueError('negative count')
            except (TypeError, ValueError):
                # Missing, non-numeric or negative count; show the token
                print(2, sn[scheme], fields, raw, file=err)
                bogons += 1
                continue
            if not fields:
                # No date at all: tally separately
                nd[scheme] += count
                continue
            text = ' '.join(fields)
            try:
                d = parse(text)
            except Exception as e:
                print(4, e, text, file=err)
                bogons += 1
                continue
            if d is None:
                print(3, d, text, count, file=err)
                bogons += 1
                continue
            if not _in_year_range(d.year):
                # Shouldn't happen.  There is no regex match on this path,
                # so report the fields we actually have.
                print(8, sn[scheme], text, count, file=err)
                bogons += 1
                continue
            year = d.year
            month = d.month
        # log it
        yy = tab[scheme]
        y = year - FIRST_YEAR
        mm = yy[y]
        if mm is None:
            # allocate a year's counters only when that year is first seen
            mm = yy[y] = array('L', 13 * [0])
        mm[month] += count
    return n, bogons, nd, tab


def report(nd, tab, out=None):
    """Print the totals gathered by tally() to *out* (default sys.stdout)."""
    for s in (HTTP, HTTPS):
        if nd[s] != 0:
            print(sn[s], 0, 0, nd[s], sep='\t', file=out)
        for y, mm in enumerate(tab[s]):
            if mm is None:
                continue
            for m in range(1, 13):
                if mm[m] != 0:
                    print(sn[s], mn[m], y + FIRST_YEAR, mm[m],
                          sep='\t', file=out)


def main():
    """Script entry point: stdin -> stdout, diagnostics on stderr."""
    # Imported here rather than at module level so tally()/report() can be
    # imported without dateparser installed; a missing package still fails
    # before any input is read.
    from dateparser import parse
    n, bogons, nd, tab = tally(sys.stdin, parse)
    if n == 0:
        # ssh screwed up
        sys.exit(1)
    report(nd, tab)
    print(n, bogons, file=sys.stderr)


if __name__ == '__main__':
    main()
9342f6269edf rewritten to be faster, maybe, and avoid earlier bug
Henry S. Thompson <ht@markup.co.uk>
parents: 31
diff changeset
122
9342f6269edf rewritten to be faster, maybe, and avoid earlier bug
Henry S. Thompson <ht@markup.co.uk>
parents: 31
diff changeset
123
9342f6269edf rewritten to be faster, maybe, and avoid earlier bug
Henry S. Thompson <ht@markup.co.uk>
parents: 31
diff changeset
124