Mercurial > hg > cc > azure
comparison master/bin/fixDates.py @ 32:9342f6269edf
rewritten to be faster, maybe, and avoid earlier bug
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Tue, 20 Nov 2018 10:31:05 +0000 |
parents | 580cc12c9712 |
children | 4c117ee8ed75 |
comparison
equal
deleted
inserted
replaced
31:580cc12c9712 | 32:9342f6269edf |
---|---|
3 from array import array | 3 from array import array |
4 ok=re.compile('(https?) ([^ ]*) ((?:19|20)..) ([0-9][0-9]*)$') | 4 ok=re.compile('(https?) ([^ ]*) ((?:19|20)..) ([0-9][0-9]*)$') |
5 #parseable=re.compile('.*[-:/]\w+[-:/]|\w+\s\d{4}|\d{10}') | 5 #parseable=re.compile('.*[-:/]\w+[-:/]|\w+\s\d{4}|\d{10}') |
6 from dateparser import parse | 6 from dateparser import parse |
7 | 7 |
8 n=0 | |
8 bogons=0 | 9 bogons=0 |
10 HTTP=0 | |
11 HTTPS=1 | |
12 sn=['http','https'] | |
9 http_ytab=list(201*[None]) # 1900--2100 | 13 http_ytab=list(201*[None]) # 1900--2100 |
10 https_ytab=list(201*[None]) | 14 https_ytab=list(201*[None]) |
11 http_yzero=list(13*[0]) | 15 tab=[http_ytab,https_ytab] |
12 https_yzero=list(13*[0]) | 16 nd=[0,0] |
13 months={'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6, | 17 mn=[None,'Jan','Feb','Mar','Apr','May','Jun', |
14 'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12} | 18 'Jul','Aug','Sep','Oct','Nov','Dec'] |
19 months=dict(zip(mn[1:],range(1,13))) | |
15 for l in sys.stdin: | 20 for l in sys.stdin: |
21 n+=1 | |
16 ff=ok.match(l) | 22 ff=ok.match(l) |
17 if ff is not None: | 23 if ff is not None: |
18 #print(l,end='') | 24 #print(l,end='') |
19 scheme=ff.group(1) | 25 scheme=HTTP if ff.group(1)=='http' else HTTPS |
20 count=None | 26 count=None |
21 try: | 27 try: |
22 # More alphas than numerics... | 28 # More alphas than numerics... |
23 try: | 29 try: |
24 month=months[ff.group(2)] | 30 month=months[ff.group(2)] |
57 # The last 3 lines are needed because we get either http: with nothing else, when there was no last-mod | 63 # The last 3 lines are needed because we get either http: with nothing else, when there was no last-mod |
58 # header, or http by itself, when the last-mod consisted entirely of TZ info, which then gets deleted | 64 # header, or http by itself, when the last-mod consisted entirely of TZ info, which then gets deleted |
59 print(1,scheme,l,file=sys.stderr) | 65 print(1,scheme,l,file=sys.stderr) |
60 bogons+=1 | 66 bogons+=1 |
61 continue | 67 continue |
68 scheme=HTTP if scheme=='http' else HTTPS | |
62 try: | 69 try: |
63 cols=cols[1:] | 70 cols=cols[1:] |
64 count=int(cols.pop()) | 71 count=int(cols.pop()) |
65 except: | 72 except: |
66 print(2,cols,file=sys.stderr) | 73 print(2,sn[scheme],cols,count,file=sys.stderr) |
67 bogons+=1 | 74 bogons+=1 |
68 continue | 75 continue |
69 if cols==[]: | 76 if cols==[]: |
70 year=month=0 | 77 nd[scheme]+=count |
78 continue | |
71 else: | 79 else: |
72 l=' '.join(cols) | 80 l=' '.join(cols) |
73 try: | 81 try: |
74 d=parse(l) | 82 d=parse(l) |
75 if d is None: | 83 if d is None: |
76 print(3,d,l,count,file=sys.stderr) | 84 print(3,d,l,count,file=sys.stderr) |
77 year=0 | 85 bogons+=1 |
78 month=2 | 86 continue |
79 elif d.year<1900: | 87 elif d.year<1900 or d.year>2100: |
80 year=0 | 88 # Shouldn't happen |
81 month=3 | 89 print(8,ff.group(1),ff.group(2),ff.group(3),ff.group(4), |
82 elif d.year>3499: | 90 file=sys.stderr) |
83 year=3499 | 91 bogons+=1 |
84 month=2 | 92 continue |
85 else: | 93 else: |
86 year=d.year | 94 year=d.year |
87 month=d.month | 95 month=d.month |
88 except Exception as e: | 96 except Exception as e: |
89 print(4,e,l,file=sys.stderr) | 97 print(4,e,l,file=sys.stderr) |
90 bogons+=1 | 98 bogons+=1 |
91 continue | 99 continue |
92 # file it | 100 # log it |
93 if tab=={}: | 101 yy=tab[scheme] |
102 y=year-1900 | |
103 if yy[y] is None: | |
104 yy[y]=mm=array('L',13*[0]) | |
105 else: | |
106 mm=yy[y] | |
107 mm[month]+=count | |
108 if n==0: | |
94 # ssh screwed up | 109 # ssh screwed up |
95 exit(1) | 110 exit(1) |
96 for ((s,m,y),c) in tab.items(): | 111 for s in (HTTP,HTTPS): |
97 print(s,m,y,c,sep='\t') | 112 if nd[s]!=0: |
98 print(bogons,file=sys.stderr) | 113 print(sn[s],0,0,nd[s],sep='\t') |
114 yy=tab[s] | |
115 for y in range(201): | |
116 mm=yy[y] | |
117 if mm is not None: | |
118 for m in range(1,13): | |
119 if mm[m]!=0: | |
120 print(sn[s],mn[m],y+1900,mm[m],sep='\t') | |
121 print(n,bogons,file=sys.stderr) | |
122 | |
123 | |
124 |