Mercurial > hg > cc > azure
comparison master/bin/fixDates.py @ 34:ad6eff2bc6f9
fixes to logging and efficiency, see also notes.txt wrt patches to dateparser
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Wed, 21 Nov 2018 18:42:56 +0000 |
parents | 4c117ee8ed75 |
children | bb09db2afe6b |
comparison legend: equal | deleted | inserted | replaced
33:4c117ee8ed75 | 34:ad6eff2bc6f9 |
---|---|
12 HTTPS=1 | 12 HTTPS=1 |
13 sn=['http','https'] | 13 sn=['http','https'] |
14 http_ytab=list(201*[None]) # 1900--2100 | 14 http_ytab=list(201*[None]) # 1900--2100 |
15 https_ytab=list(201*[None]) | 15 https_ytab=list(201*[None]) |
16 tab=[http_ytab,https_ytab] | 16 tab=[http_ytab,https_ytab] |
17 nd=[0,0] | 17 nd=[0,0] # no date |
18 ed=[0,0] # date < 1900 | |
19 ld=[0,0] # date > 2100 | |
18 mn=[None,'Jan','Feb','Mar','Apr','May','Jun', | 20 mn=[None,'Jan','Feb','Mar','Apr','May','Jun', |
19 'Jul','Aug','Sep','Oct','Nov','Dec'] | 21 'Jul','Aug','Sep','Oct','Nov','Dec'] |
20 months=dict(zip(mn[1:],range(1,13))) | 22 months=dict(zip(mn[1:],range(1,13))) |
21 for l in sys.stdin: | 23 for l in sys.stdin: |
22 if l[0]=='#': | 24 if l[0]=='#': |
23 print('# %s'%strftime('%Y-%m-%d %H:%M:%S'),file=sys.stderr) | 25 print('#1 %s'%strftime('%Y-%m-%d %H:%M:%S'),file=sys.stderr) |
24 continue | 26 continue |
25 n+=1 | 27 n+=1 |
26 ff=ok.match(l) | 28 ff=ok.match(l) |
27 if ff is not None: | 29 if ff is not None: |
28 #print(l,end='') | 30 #print(l,end='') |
37 year=int(ff.group(3)) | 39 year=int(ff.group(3)) |
38 count=int(ff.group(4)) | 40 count=int(ff.group(4)) |
39 except: | 41 except: |
40 # Unusual month or year field | 42 # Unusual month or year field |
41 try: | 43 try: |
42 d=parse("%s %s"%(ff.group(2),ff.group(3))) | 44 d=parse("%s %s"%(ff.group(2),ff.group(3)))#,languages=['en']) |
43 if d is None or count is None: | 45 if d is None or count is None: |
44 print(5,ff.group(1),ff.group(2),ff.group(3),ff.group(4), | 46 print(5,ff.group(1),ff.group(2),ff.group(3),ff.group(4), |
45 file=sys.stderr) | 47 file=sys.stderr) |
46 bogons+=1 | 48 bogons+=1 |
47 continue | 49 continue |
81 nd[scheme]+=count | 83 nd[scheme]+=count |
82 continue | 84 continue |
83 else: | 85 else: |
84 l=' '.join(cols) | 86 l=' '.join(cols) |
85 try: | 87 try: |
86 d=parse(l) | 88 d=parse(l)#,languages=['en'])) |
87 if d is None: | 89 if d is None: |
88 print(3,d,l,count,file=sys.stderr) | 90 print(3,d,l,count,file=sys.stderr) |
89 bogons+=1 | 91 bogons+=1 |
90 continue | 92 continue |
91 elif d.year<1900 or d.year>2100: | 93 elif d.year<1900 or d.year>2100: |
92 # Shouldn't happen | 94 # Jan 0001 does show up, so log these as early / late |
93 print(8,sn[scheme],d.month,d.year,count, | 95 (ed if d.year<1900 else ld)[scheme]+=count |
94 file=sys.stderr) | |
95 bogons+=1 | |
96 continue | 96 continue |
97 else: | 97 else: |
98 year=d.year | 98 year=d.year |
99 month=d.month | 99 month=d.month |
100 except Exception as e: | 100 except Exception as e: |
110 mm=yy[y] | 110 mm=yy[y] |
111 mm[month]+=count | 111 mm[month]+=count |
112 if n==0: | 112 if n==0: |
113 # ssh screwed up | 113 # ssh screwed up |
114 exit(1) | 114 exit(1) |
115 print('#2 %s'%strftime('%Y-%m-%d %H:%M:%S'),file=sys.stderr) | |
115 for s in (HTTP,HTTPS): | 116 for s in (HTTP,HTTPS): |
116 if nd[s]!=0: | 117 if nd[s]!=0: |
117 print(sn[s],0,0,nd[s],sep='\t') | 118 print(sn[s],0,0,nd[s],sep='\t') |
119 if ed[s]!=0: | |
120 print(sn[s],0,1,ed[s],sep='\t') | |
121 if ld[s]!=0: | |
122 print(sn[s],0,2,ld[s],sep='\t') | |
118 yy=tab[s] | 123 yy=tab[s] |
119 for y in range(201): | 124 for y in range(201): |
120 mm=yy[y] | 125 mm=yy[y] |
121 if mm is not None: | 126 if mm is not None: |
122 for m in range(1,13): | 127 for m in range(1,13): |
123 if mm[m]!=0: | 128 if mm[m]!=0: |
124 print(sn[s],mn[m],y+1900,mm[m],sep='\t') | 129 print(sn[s],mn[m],y+1900,mm[m],sep='\t') |
125 print(n,bogons,file=sys.stderr) | 130 print(n,bogons,file=sys.stderr) |
131 print('#3 %s'%strftime('%Y-%m-%d %H:%M:%S'),file=sys.stderr) | |
126 | 132 |
127 | 133 |
128 | 134 |