annotate master/bin/fixDates.py @ 31:580cc12c9712

partway to rework after failure of mergedWhich.x64700
author Henry S. Thompson <ht@markup.co.uk>
date Mon, 19 Nov 2018 18:33:17 +0000
parents dd19cf97b6dd
children 9342f6269edf
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
24
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
1 #!/usr/bin/env python3
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
2 import sys,re
31
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
3 from array import array
24
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
4 ok=re.compile('(https?) ([^ ]*) ((?:19|20)..) ([0-9][0-9]*)$')
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
5 #parseable=re.compile('.*[-:/]\w+[-:/]|\w+\s\d{4}|\d{10}')
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
6 from dateparser import parse
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
7
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
8 bogons=0
31
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
9 http_ytab=list(201*[None]) # 1900--2100
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
10 https_ytab=list(201*[None])
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
11 http_yzero=list(13*[0])
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
12 https_yzero=list(13*[0])
24
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
13 months={'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
14 'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12}
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
15 for l in sys.stdin:
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
16 ff=ok.match(l)
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
17 if ff is not None:
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
18 #print(l,end='')
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
19 scheme=ff.group(1)
31
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
20 count=None
24
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
21 try:
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
22 # More alphas than numerics, so try the month-name lookup first...
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
23 try:
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
24 month=months[ff.group(2)]
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
25 except KeyError:
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
26 month=int(ff.group(2))
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
27 year=int(ff.group(3))
31
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
28 count=int(ff.group(4))
24
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
29 except:
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
30 # Unusual month or year field
27
dd19cf97b6dd attempt to fix robustness pblms
Henry S. Thompson <ht@markup.co.uk>
parents: 24
diff changeset
31 try:
dd19cf97b6dd attempt to fix robustness pblms
Henry S. Thompson <ht@markup.co.uk>
parents: 24
diff changeset
32 d=parse("%s %s"%(ff.group(2),ff.group(3)))
31
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
33 if d is None or count is None:
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
34 print(5,ff.group(1),ff.group(2),ff.group(3),ff.group(4),
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
35 file=sys.stderr)
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
36 bogons+=1
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
37 continue
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
38 elif d.year<1900 or d.year>2100:
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
39 # Shouldn't happen
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
40 print(7,ff.group(1),ff.group(2),ff.group(3),ff.group(4),
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
41 file=sys.stderr)
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
42 bogons+=1
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
43 continue
27
dd19cf97b6dd attempt to fix robustness pblms
Henry S. Thompson <ht@markup.co.uk>
parents: 24
diff changeset
44 else:
dd19cf97b6dd attempt to fix robustness pblms
Henry S. Thompson <ht@markup.co.uk>
parents: 24
diff changeset
45 month=d.month
dd19cf97b6dd attempt to fix robustness pblms
Henry S. Thompson <ht@markup.co.uk>
parents: 24
diff changeset
46 year=d.year
dd19cf97b6dd attempt to fix robustness pblms
Henry S. Thompson <ht@markup.co.uk>
parents: 24
diff changeset
47 except Exception as e:
dd19cf97b6dd attempt to fix robustness pblms
Henry S. Thompson <ht@markup.co.uk>
parents: 24
diff changeset
48 print(6,e,l,file=sys.stderr)
dd19cf97b6dd attempt to fix robustness pblms
Henry S. Thompson <ht@markup.co.uk>
parents: 24
diff changeset
49 bogons+=1
31
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
50 continue
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
51 else:
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
52 cols=l.split()
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
53 scheme=cols[0]
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
54 if scheme[-1]==':':
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
55 scheme=scheme[0:-1]
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
56 if scheme not in ('http','https'):
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
57 # The last 3 lines are needed because we get either http: with nothing else, when there was no last-mod
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
58 # header, or http by itself, when the last-mod consisted entirely of TZ info, which then gets deleted
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
59 print(1,scheme,l,file=sys.stderr)
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
60 bogons+=1
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
61 continue
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
62 try:
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
63 cols=cols[1:]
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
64 count=int(cols.pop())
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
65 except:
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
66 print(2,cols,file=sys.stderr)
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
67 bogons+=1
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
68 continue
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
69 if cols==[]:
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
70 year=month=0
24
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
71 else:
31
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
72 l=' '.join(cols)
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
73 try:
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
74 d=parse(l)
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
75 if d is None:
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
76 print(3,d,l,count,file=sys.stderr)
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
77 year=0
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
78 month=2
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
79 elif d.year<1900:
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
80 year=0
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
81 month=3
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
82 elif d.year>3499:
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
83 year=3499
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
84 month=2
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
85 else:
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
86 year=d.year
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
87 month=d.month
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
88 except Exception as e:
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
89 print(4,e,l,file=sys.stderr)
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
90 bogons+=1
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
91 continue
580cc12c9712 partway to rework after failure of mergedWhich.x64700
Henry S. Thompson <ht@markup.co.uk>
parents: 27
diff changeset
92 # file it
27
dd19cf97b6dd attempt to fix robustness pblms
Henry S. Thompson <ht@markup.co.uk>
parents: 24
diff changeset
93 if tab=={}:
dd19cf97b6dd attempt to fix robustness pblms
Henry S. Thompson <ht@markup.co.uk>
parents: 24
diff changeset
94 # ssh screwed up
dd19cf97b6dd attempt to fix robustness pblms
Henry S. Thompson <ht@markup.co.uk>
parents: 24
diff changeset
95 exit(1)
24
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
96 for ((s,m,y),c) in tab.items():
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
97 print(s,m,y,c,sep='\t')
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
98 print(bogons,file=sys.stderr)