Mercurial > hg > cc > azure
comparison master/bin/fixDates.py @ 31:580cc12c9712
partway to rework after failure of mergedWhich.x64700
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Mon, 19 Nov 2018 18:33:17 +0000 |
parents | dd19cf97b6dd |
children | 9342f6269edf |
comparison
equal
deleted
inserted
replaced
30:9275e2a8b5e2 | 31:580cc12c9712 |
---|---|
1 #!/usr/bin/env python3 | 1 #!/usr/bin/env python3 |
2 import sys,re | 2 import sys,re |
3 from array import array | |
3 ok=re.compile('(https?) ([^ ]*) ((?:19|20)..) ([0-9][0-9]*)$') | 4 ok=re.compile('(https?) ([^ ]*) ((?:19|20)..) ([0-9][0-9]*)$') |
4 #parseable=re.compile('.*[-:/]\w+[-:/]|\w+\s\d{4}|\d{10}') | 5 #parseable=re.compile('.*[-:/]\w+[-:/]|\w+\s\d{4}|\d{10}') |
5 from dateparser import parse | 6 from dateparser import parse |
6 | 7 |
7 bogons=0 | 8 bogons=0 |
8 tab={} | 9 http_ytab=list(201*[None]) # 1900--2100 |
10 https_ytab=list(201*[None]) | |
11 http_yzero=list(13*[0]) | |
12 https_yzero=list(13*[0]) | |
9 months={'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6, | 13 months={'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6, |
10 'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12} | 14 'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12} |
11 for l in sys.stdin: | 15 for l in sys.stdin: |
12 ff=ok.match(l) | 16 ff=ok.match(l) |
13 if ff is not None: | 17 if ff is not None: |
14 #print(l,end='') | 18 #print(l,end='') |
15 scheme=ff.group(1) | 19 scheme=ff.group(1) |
20 count=None | |
16 try: | 21 try: |
17 # More alphas than numerics... | 22 # More alphas than numerics... |
18 try: | 23 try: |
19 month=months[ff.group(2)] | 24 month=months[ff.group(2)] |
20 except KeyError: | 25 except KeyError: |
21 month=int(ff.group(2)) | 26 month=int(ff.group(2)) |
22 year=int(ff.group(3)) | 27 year=int(ff.group(3)) |
28 count=int(ff.group(4)) | |
23 except: | 29 except: |
24 # Unusual month or year field | 30 # Unusual month or year field |
25 try: | 31 try: |
26 d=parse("%s %s"%(ff.group(2),ff.group(3))) | 32 d=parse("%s %s"%(ff.group(2),ff.group(3))) |
27 if d is None: | 33 if d is None or count is None: |
28 print(5,ff.group(1),ff.group(2),ff.group(3),file=sys.stderr) | 34 print(5,ff.group(1),ff.group(2),ff.group(3),ff.group(4), |
29 year=0 | 35 file=sys.stderr) |
30 month=0 | 36 bogons+=1 |
31 elif d.year<1970: | 37 continue |
32 year=0 | 38 elif d.year<1900 or d.year>2100: |
33 month=1 | 39 # Shouldn't happen |
34 elif d.year>2019: | 40 print(7,ff.group(1),ff.group(2),ff.group(3),ff.group(4), |
35 month=1 | 41 file=sys.stderr) |
36 year=2019 | 42 bogons+=1 |
43 continue | |
37 else: | 44 else: |
38 month=d.month | 45 month=d.month |
39 year=d.year | 46 year=d.year |
40 except Exception as e: | 47 except Exception as e: |
41 print(6,e,l,file=sys.stderr) | 48 print(6,e,l,file=sys.stderr) |
42 bogons+=1 | 49 bogons+=1 |
43 count=int(ff.group(4)) | 50 continue |
44 key=(scheme,year,month) | 51 else: |
45 tab[key]=tab.get(key,0)+count | 52 cols=l.split() |
46 continue | 53 scheme=cols[0] |
47 cols=l.split() | 54 if scheme[-1]==':': |
48 scheme=cols[0] | 55 scheme=scheme[0:-1] |
49 if scheme[-1]==':': | 56 if scheme not in ('http','https'): |
50 scheme=scheme[0:-1] | 57 # The last 3 lines are needed because we get both http: with nothing else, when there was no last-mod |
51 if scheme not in ('http','https'): | 58 # header, or http by itself, when the last-mod consisted entirely of TZ info, which then gets deleted |
52 # The last 3 lines are needed because we get both http: with nothing else, when there was no last-mod | 59 print(1,scheme,l,file=sys.stderr) |
53 # header, or http by itself, when the last-mod consisted entirely of TZ info, which then gets deleted | 60 bogons+=1 |
54 print(1,scheme,l,file=sys.stderr) | 61 continue |
55 bogons+=1 | 62 try: |
56 continue | 63 cols=cols[1:] |
57 try: | 64 count=int(cols.pop()) |
58 cols=cols[1:] | 65 except: |
59 count=int(cols.pop()) | 66 print(2,cols,file=sys.stderr) |
60 except: | 67 bogons+=1 |
61 print(2,cols,file=sys.stderr) | 68 continue |
62 bogons+=1 | 69 if cols==[]: |
63 continue | 70 year=month=0 |
64 if cols==[]: | |
65 key=(scheme,0,0) | |
66 tab[key]=tab.get(key,0)+count | |
67 continue | |
68 l=' '.join(cols) | |
69 try: | |
70 d=parse(l) | |
71 if d is None: | |
72 print(3,d,l,count,file=sys.stderr) | |
73 year=0 | |
74 month=2 | |
75 elif d.year<1970: | |
76 key=(scheme,0,1) | |
77 elif d.year>2019: | |
78 key=(scheme,2019,1) | |
79 else: | 71 else: |
80 key=(scheme,d.year,d.month) | 72 l=' '.join(cols) |
81 tab[key]=tab.get(key,0)+count | 73 try: |
82 except Exception as e: | 74 d=parse(l) |
83 print(4,e,l,file=sys.stderr) | 75 if d is None: |
84 bogons+=1 | 76 print(3,d,l,count,file=sys.stderr) |
77 year=0 | |
78 month=2 | |
79 elif d.year<1900: | |
80 year=0 | |
81 month=3 | |
82 elif d.year>3499: | |
83 year=3499 | |
84 month=2 | |
85 else: | |
86 year=d.year | |
87 month=d.month | |
88 except Exception as e: | |
89 print(4,e,l,file=sys.stderr) | |
90 bogons+=1 | |
91 continue | |
92 # file it | |
85 if tab=={}: | 93 if tab=={}: |
86 # ssh screwed up | 94 # ssh screwed up |
87 exit(1) | 95 exit(1) |
88 for ((s,m,y),c) in tab.items(): | 96 for ((s,m,y),c) in tab.items(): |
89 print(s,m,y,c,sep='\t') | 97 print(s,m,y,c,sep='\t') |