annotate master/bin/fixDates.py @ 33:4c117ee8ed75

fixDates, _fixAndMerge, _doFetch towards rework of date fixup share.sh, old_invoke.sh recover the old approach to sharing, which works
author Henry S. Thompson <ht@markup.co.uk>
date Tue, 20 Nov 2018 14:49:07 +0000
parents 9342f6269edf
children ad6eff2bc6f9
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
24
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
#!/usr/bin/env python3
"""Tally per-scheme, per-month counts from date summary lines on stdin.

Each input line is expected to look like one of

    <scheme> <month> <year> <count>        month is Jan..Dec or a number
    <scheme>[:] <free-form date> <count>   date text is handed to dateparser
    <scheme>[:] <count>                    no date at all

where <scheme> is ``http`` or ``https``.  A line starting with ``#`` is not
counted; it only causes a timestamp to be written to stderr.

Output (stdout, tab separated): for each scheme an optional
``scheme 0 0 count`` line for the undated total, then one
``scheme Mon year count`` line per non-zero month.  The last line written
to stderr is ``<lines read> <lines rejected>``.  If no lines were read at
all the script exits with status 1 and writes nothing.

Rejected lines are reported on stderr with a leading numeric code:

    1  unknown scheme (or blank line)
    2  trailing count missing, not an integer, or negative
    3  free-form date not understood by the date parser
    4  date parser raised on a free-form date
    5  month/year fields not understood by the date parser
    6  date parser raised on the month/year fields
    7  month or year out of range (month/year form)
    8  year out of range (free-form date)
"""
import re
import sys
from array import array
from time import strftime

ok = re.compile('(https?) ([^ ]*) ((?:19|20)..) ([0-9][0-9]*)$')

HTTP = 0
HTTPS = 1
sn = ['http', 'https']
FIRST_YEAR = 1900
LAST_YEAR = 2100            # year table covers 1900--2100 inclusive
mn = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
      'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
months = dict(zip(mn[1:], range(1, 13)))


def _resolve_matched(ff, l, parse, err):
    """Handle a line that matched ``ok`` (scheme, month, year, count)."""
    scheme = HTTP if ff.group(1) == 'http' else HTTPS
    fields = (ff.group(1), ff.group(2), ff.group(3), ff.group(4))
    # Convert the count first: it must be available to the fallback below,
    # otherwise the fallback could never accept a line.
    try:
        count = int(ff.group(4))
    except ValueError:
        print(5, *fields, file=err)
        return None
    try:
        # More alphas than numerics...
        try:
            month = months[ff.group(2)]
        except KeyError:
            month = int(ff.group(2))
        year = int(ff.group(3))
    except ValueError:
        pass        # Unusual month or year field: try the date parser
    else:
        if 1 <= month <= 12 and FIRST_YEAR <= year <= LAST_YEAR:
            return scheme, year, month, count
        # Numeric but impossible (e.g. month 0, 13 or -1): reject rather
        # than index outside the tables or guess.
        print(7, *fields, file=err)
        return None
    try:
        d = parse("%s %s" % (ff.group(2), ff.group(3)))
        when = None if d is None else (d.year, d.month)
    except Exception as e:
        print(6, e, l, file=err)
        return None
    if when is None:
        print(5, *fields, file=err)
        return None
    if not FIRST_YEAR <= when[0] <= LAST_YEAR:
        # Shouldn't happen
        print(7, *fields, file=err)
        return None
    return scheme, when[0], when[1], count


def _resolve_unmatched(l, parse, err):
    """Handle a line that did not match ``ok``: scheme, date text, count."""
    cols = l.split()
    if not cols:
        # Whitespace-only line: nothing to look at.
        print(1, '', l, file=err)
        return None
    scheme = cols[0]
    if scheme[-1] == ':':
        scheme = scheme[0:-1]
    if scheme not in ('http', 'https'):
        print(1, scheme, l, file=err)
        return None
    scheme = HTTP if scheme == 'http' else HTTPS
    rest = cols[1:]
    # We get both "http:" with nothing else and "http" by itself; in
    # either case there is no count to take.
    raw = rest[-1] if rest else None
    try:
        count = int(rest.pop())
    except (IndexError, ValueError):
        print(2, raw, l, file=err)
        return None
    if count < 0:
        # The tallies are unsigned; a negative count cannot be stored.
        print(2, raw, l, file=err)
        return None
    if not rest:
        # Scheme and count only: an undated entry.
        return scheme, None, None, count
    text = ' '.join(rest)
    try:
        d = parse(text)
        when = None if d is None else (d.year, d.month)
    except Exception as e:
        print(4, e, text, count, file=err)
        return None
    if when is None:
        print(3, None, text, count, file=err)
        return None
    if not FIRST_YEAR <= when[0] <= LAST_YEAR:
        # Shouldn't happen
        print(8, sn[scheme], when[1], when[0], count, file=err)
        return None
    return scheme, when[0], when[1], count


def resolve_line(l, parse, err=None):
    """Interpret one input line.

    l     -- the raw line, including any trailing newline
    parse -- callable taking date text and returning an object with
             ``year`` and ``month`` attributes, or None if it cannot
             understand the text
    err   -- stream for diagnostics (default: sys.stderr)

    Returns ``(scheme, year, month, count)`` for a dated line,
    ``(scheme, None, None, count)`` for an undated one, or None if the
    line was rejected (a coded diagnostic has then been written to err).
    scheme is HTTP or HTTPS.
    """
    if err is None:
        err = sys.stderr
    ff = ok.match(l)
    if ff is not None:
        return _resolve_matched(ff, l, parse, err)
    return _resolve_unmatched(l, parse, err)


def report(tab, nd, out):
    """Write the undated totals and the non-zero monthly tallies to out."""
    for s in (HTTP, HTTPS):
        if nd[s] != 0:
            print(sn[s], 0, 0, nd[s], sep='\t', file=out)
        for y, mm in enumerate(tab[s]):
            if mm is None:
                continue
            for m in range(1, 13):
                if mm[m] != 0:
                    print(sn[s], mn[m], y + FIRST_YEAR, mm[m],
                          sep='\t', file=out)


def main(stdin=None, stdout=None, stderr=None, parse=None):
    """Run the tally; returns the process exit status.

    The stream arguments default to the corresponding sys streams and
    parse defaults to ``dateparser.parse``.  dateparser is imported here,
    before any input is read, so that a missing installation still fails
    at startup while the module itself can be imported without it.
    """
    if parse is None:
        from dateparser import parse
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    stderr = sys.stderr if stderr is None else stderr

    n_years = LAST_YEAR - FIRST_YEAR + 1
    tab = [n_years * [None], n_years * [None]]   # per scheme, per year
    nd = [0, 0]                                  # undated totals per scheme
    n = 0
    bogons = 0
    for l in stdin:
        if l.startswith('#'):
            print('# %s' % strftime('%Y-%m-%d %H:%M:%S'), file=stderr)
            continue
        n += 1
        entry = resolve_line(l, parse, stderr)
        if entry is None:
            bogons += 1
            continue
        scheme, year, month, count = entry
        if year is None:
            nd[scheme] += count
            continue
        # log it
        yy = tab[scheme]
        y = year - FIRST_YEAR
        if yy[y] is None:
            # Slot 0 is unused so that months index directly as 1..12.
            yy[y] = array('L', 13 * [0])
        yy[y][month] += count
    if n == 0:
        # ssh screwed up
        return 1
    report(tab, nd, stdout)
    print(n, bogons, file=stderr)
    return 0


if __name__ == '__main__':
    sys.exit(main())
9342f6269edf rewritten to be faster, maybe, and avoid earlier bug
Henry S. Thompson <ht@markup.co.uk>
parents: 31
diff changeset
126
9342f6269edf rewritten to be faster, maybe, and avoid earlier bug
Henry S. Thompson <ht@markup.co.uk>
parents: 31
diff changeset
127
9342f6269edf rewritten to be faster, maybe, and avoid earlier bug
Henry S. Thompson <ht@markup.co.uk>
parents: 31
diff changeset
128