comparison master/bin/fixDates.py @ 31:580cc12c9712

partway to rework after failure of mergedWhich.x64700
author Henry S. Thompson <ht@markup.co.uk>
date Mon, 19 Nov 2018 18:33:17 +0000
parents dd19cf97b6dd
children 9342f6269edf
comparison
equal deleted inserted replaced
30:9275e2a8b5e2 31:580cc12c9712
1 #!/usr/bin/env python3 1 #!/usr/bin/env python3
2 import sys,re 2 import sys,re
3 from array import array
3 ok=re.compile('(https?) ([^ ]*) ((?:19|20)..) ([0-9][0-9]*)$') 4 ok=re.compile('(https?) ([^ ]*) ((?:19|20)..) ([0-9][0-9]*)$')
4 #parseable=re.compile('.*[-:/]\w+[-:/]|\w+\s\d{4}|\d{10}') 5 #parseable=re.compile('.*[-:/]\w+[-:/]|\w+\s\d{4}|\d{10}')
5 from dateparser import parse 6 from dateparser import parse
6 7
7 bogons=0 8 bogons=0
8 tab={} 9 http_ytab=list(201*[None]) # 1900--2100
10 https_ytab=list(201*[None])
11 http_yzero=list(13*[0])
12 https_yzero=list(13*[0])
9 months={'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6, 13 months={'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,
10 'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12} 14 'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12}
11 for l in sys.stdin: 15 for l in sys.stdin:
12 ff=ok.match(l) 16 ff=ok.match(l)
13 if ff is not None: 17 if ff is not None:
14 #print(l,end='') 18 #print(l,end='')
15 scheme=ff.group(1) 19 scheme=ff.group(1)
20 count=None
16 try: 21 try:
17 # More alphas than numerics... 22 # More alphas than numerics...
18 try: 23 try:
19 month=months[ff.group(2)] 24 month=months[ff.group(2)]
20 except KeyError: 25 except KeyError:
21 month=int(ff.group(2)) 26 month=int(ff.group(2))
22 year=int(ff.group(3)) 27 year=int(ff.group(3))
28 count=int(ff.group(4))
23 except: 29 except:
24 # Unusual month or year field 30 # Unusual month or year field
25 try: 31 try:
26 d=parse("%s %s"%(ff.group(2),ff.group(3))) 32 d=parse("%s %s"%(ff.group(2),ff.group(3)))
27 if d is None: 33 if d is None or count is None:
28 print(5,ff.group(1),ff.group(2),ff.group(3),file=sys.stderr) 34 print(5,ff.group(1),ff.group(2),ff.group(3),ff.group(4),
29 year=0 35 file=sys.stderr)
30 month=0 36 bogons+=1
31 elif d.year<1970: 37 continue
32 year=0 38 elif d.year<1900 or d.year>2100:
33 month=1 39 # Shouldn't happen
34 elif d.year>2019: 40 print(7,ff.group(1),ff.group(2),ff.group(3),ff.group(4),
35 month=1 41 file=sys.stderr)
36 year=2019 42 bogons+=1
43 continue
37 else: 44 else:
38 month=d.month 45 month=d.month
39 year=d.year 46 year=d.year
40 except Exception as e: 47 except Exception as e:
41 print(6,e,l,file=sys.stderr) 48 print(6,e,l,file=sys.stderr)
42 bogons+=1 49 bogons+=1
43 count=int(ff.group(4)) 50 continue
44 key=(scheme,year,month) 51 else:
45 tab[key]=tab.get(key,0)+count 52 cols=l.split()
46 continue 53 scheme=cols[0]
47 cols=l.split() 54 if scheme[-1]==':':
48 scheme=cols[0] 55 scheme=scheme[0:-1]
49 if scheme[-1]==':': 56 if scheme not in ('http','https'):
50 scheme=scheme[0:-1] 57 # The last 3 lines are needed because we get either http: with nothing else, when there was no last-mod
51 if scheme not in ('http','https'): 58 # header, or http by itself, when the last-mod consisted entirely of TZ info, which then gets deleted
52 # The last 3 lines are needed because we get either http: with nothing else, when there was no last-mod 59 print(1,scheme,l,file=sys.stderr)
53 # header, or http by itself, when the last-mod consisted entirely of TZ info, which then gets deleted 60 bogons+=1
54 print(1,scheme,l,file=sys.stderr) 61 continue
55 bogons+=1 62 try:
56 continue 63 cols=cols[1:]
57 try: 64 count=int(cols.pop())
58 cols=cols[1:] 65 except:
59 count=int(cols.pop()) 66 print(2,cols,file=sys.stderr)
60 except: 67 bogons+=1
61 print(2,cols,file=sys.stderr) 68 continue
62 bogons+=1 69 if cols==[]:
63 continue 70 year=month=0
64 if cols==[]:
65 key=(scheme,0,0)
66 tab[key]=tab.get(key,0)+count
67 continue
68 l=' '.join(cols)
69 try:
70 d=parse(l)
71 if d is None:
72 print(3,d,l,count,file=sys.stderr)
73 year=0
74 month=2
75 elif d.year<1970:
76 key=(scheme,0,1)
77 elif d.year>2019:
78 key=(scheme,2019,1)
79 else: 71 else:
80 key=(scheme,d.year,d.month) 72 l=' '.join(cols)
81 tab[key]=tab.get(key,0)+count 73 try:
82 except Exception as e: 74 d=parse(l)
83 print(4,e,l,file=sys.stderr) 75 if d is None:
84 bogons+=1 76 print(3,d,l,count,file=sys.stderr)
77 year=0
78 month=2
79 elif d.year<1900:
80 year=0
81 month=3
82 elif d.year>3499:
83 year=3499
84 month=2
85 else:
86 year=d.year
87 month=d.month
88 except Exception as e:
89 print(4,e,l,file=sys.stderr)
90 bogons+=1
91 continue
92 # file it
85 if tab=={}: 93 if tab=={}:
86 # ssh screwed up 94 # ssh screwed up
87 exit(1) 95 exit(1)
88 for ((s,m,y),c) in tab.items(): 96 for ((s,m,y),c) in tab.items():
89 print(s,m,y,c,sep='\t') 97 print(s,m,y,c,sep='\t')