annotate master/bin/fixDates.py @ 24:b4e3beb2227e

improved error handling, does totalling now too
author Henry S. Thompson <ht@markup.co.uk>
date Wed, 07 Nov 2018 14:15:56 +0000
parents 0f4a0f4e38d4
children dd19cf97b6dd
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
24
b4e3beb2227e improved error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 20
diff changeset
#!/usr/bin/env python3
"""Normalise date fields on scheme/date/count lines and total the counts.

Reads lines from stdin.  Each usable line starts with a scheme (``http`` or
``https``, optionally followed by ``:``) and ends with an integer count, with
an optional date in between.  Counts are summed per ``(scheme, year, month)``
and written to stdout as tab-separated ``scheme year month count`` rows.
Numbered diagnostics for problem lines, and finally the number of rejected
("bogon") lines, are written to stderr.

Special buckets used for lines without a usable date:
    (0, 0)     no date field, or an unparseable "<month> <year>" pair
    (0, 1)     date earlier than MIN_YEAR
    (0, 2)     free-form date that the date parser could not interpret
    (2019, 1)  date later than MAX_YEAR
"""
import re
import sys

# Fast path: "<scheme> <month> <year> <count>" where the year looks like
# 19xx or 20xx.  Anything else goes through the general date parser.
ok = re.compile('(https?) ([^ ]*) ((?:19|20)..) ([0-9][0-9]*)$')

months = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
          'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}

# Parsed dates outside this range are folded into the special buckets above.
MIN_YEAR = 1970
MAX_YEAR = 2019


def _bucket(d, unparsed_month):
    """Map a parsed date (or None) to a ``(year, month)`` bucket.

    ``d`` is any object with ``year`` and ``month`` attributes, or None when
    the parser gave up; ``unparsed_month`` is the month slot used in year 0
    to record that case.
    """
    if d is None:
        return (0, unparsed_month)
    if d.year < MIN_YEAR:
        return (0, 1)
    if d.year > MAX_YEAR:
        return (MAX_YEAR, 1)
    return (d.year, d.month)


def _classify(line, parse):
    """Return ``((scheme, year, month), count)`` for *line*, or None for a bogon.

    ``parse`` is called with a date string and must return an object with
    ``year``/``month`` attributes, or None if it cannot interpret the string.
    Diagnostics are printed to stderr with the same numeric codes (1-5) the
    script has always used.
    """
    ff = ok.match(line)
    if ff is not None:
        scheme, month_field, year_field, count_field = ff.groups()
        try:
            # More alphas than numerics, so try the month-name table first.
            try:
                month = months[month_field]
            except KeyError:
                month = int(month_field)
            year = int(year_field)
        except ValueError:
            # Unusual month or year field: let the date parser have a go.
            d = parse("%s %s" % (month_field, year_field))
            if d is None:
                print(5, scheme, month_field, year_field, file=sys.stderr)
            year, month = _bucket(d, 0)
        return (scheme, year, month), int(count_field)

    cols = line.split()
    if not cols:
        # Blank line: previously an IndexError on cols[0]; now just a bogon.
        print(1, '', line, file=sys.stderr)
        return None

    # Per the original author's note: the scheme arrives as "http:" when
    # there was no last-mod header, and as bare "http" when the last-mod
    # consisted entirely of TZ info that was deleted upstream.  Accept both.
    scheme = cols[0]
    if scheme.endswith(':'):
        scheme = scheme[:-1]
    if scheme not in ('http', 'https'):
        print(1, scheme, line, file=sys.stderr)
        return None

    cols = cols[1:]
    try:
        count = int(cols.pop())
    except (IndexError, ValueError):
        # Nothing after the scheme, or the last field is not an integer.
        print(2, cols, file=sys.stderr)
        return None

    if not cols:
        # Scheme and count only: no date information at all.
        return (scheme, 0, 0), count

    datestr = ' '.join(cols)
    try:
        d = parse(datestr)
    except Exception as e:
        # Deliberately broad: any failure inside the date parser should cost
        # us one line, not the whole run.
        print(4, e, datestr, file=sys.stderr)
        return None
    if d is None:
        print(3, d, datestr, count, file=sys.stderr)
    # Unparseable dates get their own (0, 2) bucket instead of silently
    # re-using whatever key the previous line produced.
    year, month = _bucket(d, 2)
    return (scheme, year, month), count


def tally(lines, parse):
    """Total the counts in *lines* per ``(scheme, year, month)``.

    Args:
        lines: iterable of input lines (trailing newlines are fine).
        parse: date-parsing callable, see ``_classify``.

    Returns:
        ``(tab, bogons)`` where ``tab`` maps ``(scheme, year, month)`` to the
        summed count and ``bogons`` is the number of rejected lines.
    """
    tab = {}
    bogons = 0
    for line in lines:
        result = _classify(line, parse)
        if result is None:
            bogons += 1
            continue
        key, count = result
        tab[key] = tab.get(key, 0) + count
    return tab, bogons


def main():
    """Tally stdin using dateparser and print the table and bogon count."""
    # Imported here so the tallying logic above can be imported and exercised
    # without the third-party dependency installed.
    from dateparser import parse

    tab, bogons = tally(sys.stdin, parse)
    for (s, y, m), c in tab.items():
        print(s, y, m, c, sep='\t')
    print(bogons, file=sys.stderr)


if __name__ == '__main__':
    main()