comparison master/bin/fixDates.py @ 34:ad6eff2bc6f9

Fixes to logging and efficiency; see also notes.txt regarding patches to dateparser
author Henry S. Thompson <ht@markup.co.uk>
date Wed, 21 Nov 2018 18:42:56 +0000
parents 4c117ee8ed75
children bb09db2afe6b
comparison
equal deleted inserted replaced
33:4c117ee8ed75 34:ad6eff2bc6f9
12 HTTPS=1 12 HTTPS=1
13 sn=['http','https'] 13 sn=['http','https']
14 http_ytab=list(201*[None]) # 1900--2100 14 http_ytab=list(201*[None]) # 1900--2100
15 https_ytab=list(201*[None]) 15 https_ytab=list(201*[None])
16 tab=[http_ytab,https_ytab] 16 tab=[http_ytab,https_ytab]
17 nd=[0,0] 17 nd=[0,0] # no date
18 ed=[0,0] # date < 1900
19 ld=[0,0] # date > 2100
18 mn=[None,'Jan','Feb','Mar','Apr','May','Jun', 20 mn=[None,'Jan','Feb','Mar','Apr','May','Jun',
19 'Jul','Aug','Sep','Oct','Nov','Dec'] 21 'Jul','Aug','Sep','Oct','Nov','Dec']
20 months=dict(zip(mn[1:],range(1,13))) 22 months=dict(zip(mn[1:],range(1,13)))
21 for l in sys.stdin: 23 for l in sys.stdin:
22 if l[0]=='#': 24 if l[0]=='#':
23 print('# %s'%strftime('%Y-%m-%d %H:%M:%S'),file=sys.stderr) 25 print('#1 %s'%strftime('%Y-%m-%d %H:%M:%S'),file=sys.stderr)
24 continue 26 continue
25 n+=1 27 n+=1
26 ff=ok.match(l) 28 ff=ok.match(l)
27 if ff is not None: 29 if ff is not None:
28 #print(l,end='') 30 #print(l,end='')
37 year=int(ff.group(3)) 39 year=int(ff.group(3))
38 count=int(ff.group(4)) 40 count=int(ff.group(4))
39 except: 41 except:
40 # Unusual month or year field 42 # Unusual month or year field
41 try: 43 try:
42 d=parse("%s %s"%(ff.group(2),ff.group(3))) 44 d=parse("%s %s"%(ff.group(2),ff.group(3)))#,languages=['en'])
43 if d is None or count is None: 45 if d is None or count is None:
44 print(5,ff.group(1),ff.group(2),ff.group(3),ff.group(4), 46 print(5,ff.group(1),ff.group(2),ff.group(3),ff.group(4),
45 file=sys.stderr) 47 file=sys.stderr)
46 bogons+=1 48 bogons+=1
47 continue 49 continue
81 nd[scheme]+=count 83 nd[scheme]+=count
82 continue 84 continue
83 else: 85 else:
84 l=' '.join(cols) 86 l=' '.join(cols)
85 try: 87 try:
86 d=parse(l) 88 d=parse(l)#,languages=['en']))
87 if d is None: 89 if d is None:
88 print(3,d,l,count,file=sys.stderr) 90 print(3,d,l,count,file=sys.stderr)
89 bogons+=1 91 bogons+=1
90 continue 92 continue
91 elif d.year<1900 or d.year>2100: 93 elif d.year<1900 or d.year>2100:
92 # Shouldn't happen 94 # Jan 0001 does show up, so log these as early / late
93 print(8,sn[scheme],d.month,d.year,count, 95 (ed if d.year<1900 else ld)[scheme]+=count
94 file=sys.stderr)
95 bogons+=1
96 continue 96 continue
97 else: 97 else:
98 year=d.year 98 year=d.year
99 month=d.month 99 month=d.month
100 except Exception as e: 100 except Exception as e:
110 mm=yy[y] 110 mm=yy[y]
111 mm[month]+=count 111 mm[month]+=count
112 if n==0: 112 if n==0:
113 # ssh screwed up 113 # ssh screwed up
114 exit(1) 114 exit(1)
115 print('#2 %s'%strftime('%Y-%m-%d %H:%M:%S'),file=sys.stderr)
115 for s in (HTTP,HTTPS): 116 for s in (HTTP,HTTPS):
116 if nd[s]!=0: 117 if nd[s]!=0:
117 print(sn[s],0,0,nd[s],sep='\t') 118 print(sn[s],0,0,nd[s],sep='\t')
119 if ed[s]!=0:
120 print(sn[s],0,1,ed[s],sep='\t')
121 if ld[s]!=0:
122 print(sn[s],0,2,ld[s],sep='\t')
118 yy=tab[s] 123 yy=tab[s]
119 for y in range(201): 124 for y in range(201):
120 mm=yy[y] 125 mm=yy[y]
121 if mm is not None: 126 if mm is not None:
122 for m in range(1,13): 127 for m in range(1,13):
123 if mm[m]!=0: 128 if mm[m]!=0:
124 print(sn[s],mn[m],y+1900,mm[m],sep='\t') 129 print(sn[s],mn[m],y+1900,mm[m],sep='\t')
125 print(n,bogons,file=sys.stderr) 130 print(n,bogons,file=sys.stderr)
131 print('#3 %s'%strftime('%Y-%m-%d %H:%M:%S'),file=sys.stderr)
126 132
127 133
128 134