Mercurial > hg > cc > azure
annotate master/bin/fixDates.py @ 32:9342f6269edf
rewritten to be faster, maybe, and avoid earlier bug
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Tue, 20 Nov 2018 10:31:05 +0000 |
parents | 580cc12c9712 |
children | 4c117ee8ed75 |
rev | line source |
---|---|
24
b4e3beb2227e
improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
20
diff
changeset
|
#!/usr/bin/env python3
"""Tally per-scheme counts by year and month from lines read on stdin.

Each input line is one of:

    <scheme> <month> <year> <count>       month is 'Jan'..'Dec' or a number
    <scheme>[:] <free-form date> <count>  date is handed to dateparser
    <scheme>[:] <count>                   no date at all

where <scheme> is 'http' or 'https'.

Output on stdout is tab-separated `scheme month year count`, one line per
non-zero (scheme, year, month) cell; undated counts are written as
`scheme 0 0 count`.  Rejected lines ("bogons") are reported on stderr with
a leading numeric code (1-8), and the final stderr line is
`<lines read> <bogons>`.
"""
import re
import sys
from array import array

ok = re.compile('(https?) ([^ ]*) ((?:19|20)..) ([0-9][0-9]*)$')

HTTP = 0
HTTPS = 1
sn = ['http', 'https']
mn = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
      'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
months = dict(zip(mn[1:], range(1, 13)))

# The per-scheme year tables cover 1900--2100 inclusive (201 slots).
FIRST_YEAR = 1900
LAST_YEAR = 2100
N_YEARS = LAST_YEAR - FIRST_YEAR + 1


def _get_parser():
    """Return dateparser's `parse`, importing it on first use.

    The import is deferred so that input which never needs the fallback
    does not require dateparser to be loaded.  Callers invoke this outside
    their try blocks so that a missing module surfaces as ImportError
    instead of being logged as a bogon.
    """
    from dateparser import parse
    return parse


def _classify_tabulated(ff, line, err, parse_date):
    """Handle a line that matched `ok`: scheme, month, year, count."""
    scheme_s, month_s, year_s, count_s = ff.groups()
    scheme = HTTP if scheme_s == 'http' else HTTPS
    # The regex guarantees ASCII digits, so this cannot fail.  It is done
    # first so the count is available to the fallback path below.
    count = int(count_s)
    try:
        # Month names first, then numerics...
        month = months[month_s] if month_s in months else int(month_s)
        year = int(year_s)
    except ValueError:
        # Unusual month or year field: let dateparser have a go.
        parser = parse_date or _get_parser()
        try:
            d = parser("%s %s" % (month_s, year_s))
        except Exception as e:  # best effort: parser errors are logged
            print(6, e, line, file=err)
            return None
        if d is None:
            print(5, scheme_s, month_s, year_s, count_s, file=err)
            return None
        year, month = d.year, d.month
    if not 1 <= month <= 12:
        # A numeric month such as 0, 13 or -1 would otherwise be tallied
        # into the wrong slot or index past the end of the month array.
        print(5, scheme_s, month_s, year_s, count_s, file=err)
        return None
    if not FIRST_YEAR <= year <= LAST_YEAR:
        # Shouldn't happen, but int() accepts e.g. '201 ' and '19_5',
        # both of which the regex lets through.
        print(7, scheme_s, month_s, year_s, count_s, file=err)
        return None
    return scheme, year, month, count


def _classify_freeform(line, err, parse_date):
    """Handle a line that did not match `ok`: scheme, [date words], count."""
    cols = line.split()
    if not cols:
        # Blank line: no scheme at all.
        print(1, '', line, file=err)
        return None
    scheme_s = cols[0]
    if scheme_s[-1] == ':':
        scheme_s = scheme_s[:-1]
    if scheme_s not in sn:
        print(1, scheme_s, line, file=err)
        return None
    scheme = sn.index(scheme_s)
    # Per the original author's note, upstream can produce `http:` with
    # nothing else (no last-mod header) or `http` by itself (the last-mod
    # was only TZ info, which was then deleted).
    rest = cols[1:]
    try:
        count = int(rest.pop())
        if count < 0:
            # The tallies are unsigned; a negative count is malformed.
            raise ValueError(count)
    except (IndexError, ValueError):
        print(2, scheme_s, rest, None, file=err)
        return None
    if not rest:
        # No date columns: counted as "no date" for this scheme.
        return scheme, None, None, count
    text = ' '.join(rest)
    parser = parse_date or _get_parser()
    try:
        d = parser(text)
    except Exception as e:  # best effort: parser errors are logged
        print(4, e, text, file=err)
        return None
    if d is None:
        print(3, d, text, count, file=err)
        return None
    if not FIRST_YEAR <= d.year <= LAST_YEAR:
        # Shouldn't happen
        print(8, scheme_s, text, count, file=err)
        return None
    return scheme, d.year, d.month, count


def classify(line, err=None, parse_date=None):
    """Classify one input line.

    Returns `(scheme, year, month, count)` with `scheme` being HTTP or
    HTTPS; `year` and `month` are both None when the line carries no date.
    Returns None for a bogon, after writing a numbered diagnostic to `err`
    (default: sys.stderr).  `parse_date` overrides the free-form date
    parser; by default dateparser's `parse` is used.
    """
    if err is None:
        err = sys.stderr
    ff = ok.match(line)
    if ff is not None:
        return _classify_tabulated(ff, line, err, parse_date)
    return _classify_freeform(line, err, parse_date)


def tally(lines, err=None, parse_date=None):
    """Accumulate counts from an iterable of lines.

    Returns `(n, bogons, nd, tab)`:
      n       number of lines read
      bogons  number of lines rejected
      nd      [http, https] totals for lines with no date
      tab     [http, https] lists of N_YEARS slots, each None or an
              array of 13 counters indexed by month (slot 0 unused)
    """
    if err is None:
        err = sys.stderr
    tab = [N_YEARS * [None], N_YEARS * [None]]
    nd = [0, 0]
    n = 0
    bogons = 0
    for line in lines:
        n += 1
        rec = classify(line, err, parse_date)
        if rec is None:
            bogons += 1
            continue
        scheme, year, month, count = rec
        if year is None:
            nd[scheme] += count
            continue
        yy = tab[scheme]
        y = year - FIRST_YEAR
        if yy[y] is None:
            # Month arrays are allocated lazily, only for years seen.
            yy[y] = array('L', 13 * [0])
        yy[y][month] += count
    return n, bogons, nd, tab


def report(nd, tab, out=None):
    """Write the tab-separated tallies to `out` (default: sys.stdout)."""
    if out is None:
        out = sys.stdout
    for s in (HTTP, HTTPS):
        if nd[s] != 0:
            print(sn[s], 0, 0, nd[s], sep='\t', file=out)
        for y, mm in enumerate(tab[s]):
            if mm is None:
                continue
            for m in range(1, 13):
                if mm[m] != 0:
                    print(sn[s], mn[m], y + FIRST_YEAR, mm[m],
                          sep='\t', file=out)


def main():
    """Read stdin, print the tallies, and report line/bogon totals."""
    n, bogons, nd, tab = tally(sys.stdin)
    if n == 0:
        # No input at all (original note: "ssh screwed up").
        sys.exit(1)
    report(nd, tab)
    print(n, bogons, file=sys.stderr)


if __name__ == '__main__':
    main()
9342f6269edf
rewritten to be faster, maybe, and avoid earlier bug
Henry S. Thompson <ht@markup.co.uk>
parents:
31
diff
changeset
|
122 |
9342f6269edf
rewritten to be faster, maybe, and avoid earlier bug
Henry S. Thompson <ht@markup.co.uk>
parents:
31
diff
changeset
|
123 |
9342f6269edf
rewritten to be faster, maybe, and avoid earlier bug
Henry S. Thompson <ht@markup.co.uk>
parents:
31
diff
changeset
|
124 |