comparison master/bin/fixDates.py @ 24:b4e3beb2227e

improved error handling, does totalling now too
author Henry S. Thompson <ht@markup.co.uk>
date Wed, 07 Nov 2018 14:15:56 +0000
parents 0f4a0f4e38d4
children dd19cf97b6dd
comparison
equal deleted inserted replaced
23:cc065b2a2543 24:b4e3beb2227e
#!/usr/bin/env python3
"""Total per-scheme counts by (year, month) from lines read on stdin.

Each input line is expected to start with a scheme (``http`` or ``https``,
optionally followed by ``:``) and end with an integer count, with date text
in between.  Lines of the exact shape ``<scheme> <month> <year> <count>``
are handled directly; anything else is handed to ``dateparser.parse``.

Output (stdout): one tab-separated ``scheme year month count`` row per
distinct key.  Special buckets used for dates that cannot be placed:

* ``(0, 0)`` -- no date text at all (or unparseable on the fast path)
* ``(0, 1)`` -- parsed year earlier than 1970
* ``(0, 2)`` -- free-form date text that dateparser could not parse
* ``(2019, 1)`` -- parsed year later than 2019

Diagnostics (stderr): rejected or suspicious lines are echoed with a leading
numeric code (1-5) identifying the check that fired; the final line is the
number of lines that were dropped entirely.
"""
import re
import sys

from dateparser import parse

# Fast path: "<scheme> <month> <year> <count>" where year looks like 19xx/20xx
ok = re.compile('(https?) ([^ ]*) ((?:19|20)..) ([0-9][0-9]*)$')

# Years outside this range are clamped into the special buckets above
MIN_YEAR = 1970
MAX_YEAR = 2019

bogons = 0  # number of input lines dropped without being tallied
tab = {}    # (scheme, year, month) -> summed count
months = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
          'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}

for line in sys.stdin:
    ff = ok.match(line)
    if ff is not None:
        scheme = ff.group(1)
        try:
            # More alphas than numerics, so try the month-name table first
            try:
                month = months[ff.group(2)]
            except KeyError:
                month = int(ff.group(2))
            year = int(ff.group(3))
        except ValueError:
            # Unusual month or year field: let dateparser have a go
            d = parse("%s %s" % (ff.group(2), ff.group(3)))
            if d is None:
                print(5, ff.group(1), ff.group(2), ff.group(3),
                      file=sys.stderr)
                year = 0
                month = 0
            elif d.year < MIN_YEAR:
                year = 0
                month = 1
            elif d.year > MAX_YEAR:
                month = 1
                year = MAX_YEAR
            else:
                month = d.month
                year = d.year
        count = int(ff.group(4))
        key = (scheme, year, month)
        tab[key] = tab.get(key, 0) + count
        continue

    # Slow path: free-form "<scheme>[:] <date text...> <count>"
    cols = line.split()
    # A blank line has no columns at all; treat it as an unknown scheme so it
    # is reported and counted as a bogon instead of raising IndexError.
    scheme = cols[0] if cols else ''
    # We get both "http:" with nothing else, when there was no last-mod
    # header, and "http" by itself, when the last-mod consisted entirely of
    # TZ info, which then gets deleted -- so normalise away a trailing colon.
    if scheme.endswith(':'):
        scheme = scheme[0:-1]
    if scheme not in ('http', 'https'):
        print(1, scheme, line, file=sys.stderr)
        bogons += 1
        continue
    try:
        cols = cols[1:]
        # IndexError: nothing after the scheme; ValueError: last field is
        # not an integer count
        count = int(cols.pop())
    except (IndexError, ValueError):
        print(2, cols, file=sys.stderr)
        bogons += 1
        continue
    if cols == []:
        # Scheme and count only: no date information at all
        key = (scheme, 0, 0)
        tab[key] = tab.get(key, 0) + count
        continue

    datestr = ' '.join(cols)
    try:
        d = parse(datestr)
    except Exception as e:
        # dateparser blew up on this input: report it and drop the line
        print(4, e, datestr, file=sys.stderr)
        bogons += 1
        continue
    if d is None:
        # Unparseable date text: still tally the count, in its own bucket
        print(3, d, datestr, count, file=sys.stderr)
        key = (scheme, 0, 2)
    elif d.year < MIN_YEAR:
        key = (scheme, 0, 1)
    elif d.year > MAX_YEAR:
        key = (scheme, MAX_YEAR, 1)
    else:
        key = (scheme, d.year, d.month)
    tab[key] = tab.get(key, 0) + count

for ((s, y, m), c) in tab.items():
    print(s, y, m, c, sep='\t')
print(bogons, file=sys.stderr)