Mercurial > hg > cc > azure
annotate master/bin/fixDates.py @ 24:b4e3beb2227e
improved error handling,
does totalling now too
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Wed, 07 Nov 2018 14:15:56 +0000 |
parents | 0f4a0f4e38d4 |
children | dd19cf97b6dd |
rev | line source |
---|---|
24
b4e3beb2227e
improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
20
diff
changeset
|
#!/usr/bin/env python3
"""Total per-scheme counts into (scheme, year, month) buckets.

Reads whitespace-separated lines from stdin.  Each usable line starts with
a scheme (``http`` or ``https``, optionally followed by ``:``), ends with
an integer count, and may carry date text in between (per the original
comments this is the text of a last-mod header -- confirm against the
producer of the input).

Output:
  stdout  one ``scheme<TAB>year<TAB>month<TAB>total`` line per bucket
  stderr  numbered diagnostics for problem lines, then the bogon count

Sentinel buckets (year 0 means "no usable date"):
  (0, 0)  no date text at all, or a month/year pair that would not parse
  (0, 1)  date parsed, but its year is before MIN_YEAR
  (0, 2)  free-form date text that dateparser could not parse
Parsed dates after MAX_YEAR are folded into (MAX_YEAR, 1).

Diagnostic codes written to stderr:
  1  blank line or scheme other than http/https   (counted as bogon)
  2  missing or non-integer count                  (counted as bogon)
  3  free-form date text could not be parsed       (tallied under (0, 2))
  4  date parser raised an exception               (counted as bogon)
  5  month/year pair could not be parsed           (tallied under (0, 0))
"""
import re
import sys

# Years outside this range in a *parsed* date are treated as implausible.
MIN_YEAR = 1970
MAX_YEAR = 2019

# Fast path: "<scheme> <month> <4-char year starting 19/20> <count>".
ok = re.compile(r'(https?) ([^ ]*) ((?:19|20)..) ([0-9][0-9]*)$')

months = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
          'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}


def _dateparser_parse():
    """Return ``dateparser.parse``, importing the package on first use.

    Only lines the fast path cannot handle need the third-party parser,
    so the import is deferred until such a line is actually seen.
    """
    from dateparser import parse
    return parse


def _clamped_key(scheme, d):
    """Return the bucket for parsed datetime *d*, clamping implausible years."""
    if d.year < MIN_YEAR:
        return (scheme, 0, 1)
    if d.year > MAX_YEAR:
        return (scheme, MAX_YEAR, 1)
    return (scheme, d.year, d.month)


def _fast_path_key(ff, err):
    """Return the bucket for a line matched by ``ok`` (match object *ff*)."""
    scheme, month_text, year_text = ff.group(1), ff.group(2), ff.group(3)
    try:
        # More alphas than numerics, so try the name table first...
        month = months.get(month_text)
        if month is None:
            month = int(month_text)
        # NOTE(review): month and year taken directly from the text are not
        # range-checked or clamped here, unlike parsed dates -- confirm
        # that is intended.
        return (scheme, int(year_text), month)
    except ValueError:
        pass
    # Unusual month or year field: let dateparser have a go.
    d = _dateparser_parse()("%s %s" % (month_text, year_text))
    if d is None:
        print(5, scheme, month_text, year_text, file=err)
        return (scheme, 0, 0)
    return _clamped_key(scheme, d)


def tally(lines, err=None):
    """Accumulate counts from *lines* into buckets.

    lines -- iterable of input lines (trailing newlines allowed)
    err   -- file-like object for diagnostics; defaults to ``sys.stderr``

    Returns ``(tab, bogons)``: *tab* maps ``(scheme, year, month)`` to the
    summed count, *bogons* is the number of lines that were rejected.
    """
    if err is None:
        err = sys.stderr
    tab = {}
    bogons = 0
    for l in lines:
        ff = ok.match(l)
        if ff is not None:
            key = _fast_path_key(ff, err)
            tab[key] = tab.get(key, 0) + int(ff.group(4))
            continue

        cols = l.split()
        if not cols:
            # Blank line: nothing to index, so reject it rather than crash.
            print(1, '', l, file=err)
            bogons += 1
            continue
        # The scheme shows up both as "http:" with nothing else (no last-mod
        # header) and as bare "http" (last-mod was entirely TZ info, which
        # gets deleted upstream), so strip an optional trailing colon.
        scheme = cols[0]
        if scheme.endswith(':'):
            scheme = scheme[:-1]
        if scheme not in ('http', 'https'):
            print(1, scheme, l, file=err)
            bogons += 1
            continue

        cols = cols[1:]
        try:
            count = int(cols.pop())
        except (IndexError, ValueError):
            # No columns left after the scheme, or the last one is not a number.
            print(2, cols, file=err)
            bogons += 1
            continue

        if not cols:
            # Scheme and count only: no date information at all.
            key = (scheme, 0, 0)
            tab[key] = tab.get(key, 0) + count
            continue

        text = ' '.join(cols)
        # Fetch the parser outside the try: a missing dependency must stay
        # fatal instead of being counted as one bogon per line.
        parse = _dateparser_parse()
        try:
            d = parse(text)
        except Exception as e:
            # Best effort: whatever the third-party parser throws on odd
            # input, report it and move on to the next line.
            print(4, e, text, file=err)
            bogons += 1
            continue
        if d is None:
            print(3, d, text, count, file=err)
            key = (scheme, 0, 2)
        else:
            key = _clamped_key(scheme, d)
        tab[key] = tab.get(key, 0) + count
    return tab, bogons


def main():
    """Tally stdin, print the table to stdout and the bogon count to stderr."""
    tab, bogons = tally(sys.stdin)
    for (scheme, year, month), count in tab.items():
        print(scheme, year, month, count, sep='\t')
    print(bogons, file=sys.stderr)


if __name__ == '__main__':
    main()