comparison master/bin/fixDates.py @ 32:9342f6269edf

rewritten to be faster, maybe, and avoid earlier bug
author Henry S. Thompson <ht@markup.co.uk>
date Tue, 20 Nov 2018 10:31:05 +0000
parents 580cc12c9712
children 4c117ee8ed75
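comparison (side by side: old revision on the left, new revision on the right)
legend: equal | deleted | inserted | replaced
31:580cc12c9712 (old, parent) 32:9342f6269edf (new)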
3 from array import array 3 from array import array
4 ok=re.compile('(https?) ([^ ]*) ((?:19|20)..) ([0-9][0-9]*)$') 4 ok=re.compile('(https?) ([^ ]*) ((?:19|20)..) ([0-9][0-9]*)$')
5 #parseable=re.compile('.*[-:/]\w+[-:/]|\w+\s\d{4}|\d{10}') 5 #parseable=re.compile('.*[-:/]\w+[-:/]|\w+\s\d{4}|\d{10}')
6 from dateparser import parse 6 from dateparser import parse
7 7
8 n=0
8 bogons=0 9 bogons=0
10 HTTP=0
11 HTTPS=1
12 sn=['http','https']
9 http_ytab=list(201*[None]) # 1900--2100 13 http_ytab=list(201*[None]) # 1900--2100
10 https_ytab=list(201*[None]) 14 https_ytab=list(201*[None])
11 http_yzero=list(13*[0]) 15 tab=[http_ytab,https_ytab]
12 https_yzero=list(13*[0]) 16 nd=[0,0]
13 months={'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6, 17 mn=[None,'Jan','Feb','Mar','Apr','May','Jun',
14 'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12} 18 'Jul','Aug','Sep','Oct','Nov','Dec']
19 months=dict(zip(mn[1:],range(1,13)))
15 for l in sys.stdin: 20 for l in sys.stdin:
21 n+=1
16 ff=ok.match(l) 22 ff=ok.match(l)
17 if ff is not None: 23 if ff is not None:
18 #print(l,end='') 24 #print(l,end='')
19 scheme=ff.group(1) 25 scheme=HTTP if ff.group(1)=='http' else HTTPS
20 count=None 26 count=None
21 try: 27 try:
22 # More alphas then numerics... 28 # More alphas then numerics...
23 try: 29 try:
24 month=months[ff.group(2)] 30 month=months[ff.group(2)]
57 # The last 3 lines are needed because we get both http: with nothing else, when there was no last-mod 63 # The last 3 lines are needed because we get both http: with nothing else, when there was no last-mod
58 # header, or http by itself, when the last-mod consisted entirely of TZ info, which then gets deleted 64 # header, or http by itself, when the last-mod consisted entirely of TZ info, which then gets deleted
59 print(1,scheme,l,file=sys.stderr) 65 print(1,scheme,l,file=sys.stderr)
60 bogons+=1 66 bogons+=1
61 continue 67 continue
68 scheme=HTTP if scheme=='http' else HTTPS
62 try: 69 try:
63 cols=cols[1:] 70 cols=cols[1:]
64 count=int(cols.pop()) 71 count=int(cols.pop())
65 except: 72 except:
66 print(2,cols,file=sys.stderr) 73 print(2,sn[scheme],cols,count,file=sys.stderr)
67 bogons+=1 74 bogons+=1
68 continue 75 continue
69 if cols==[]: 76 if cols==[]:
70 year=month=0 77 nd[scheme]+=count
78 continue
71 else: 79 else:
72 l=' '.join(cols) 80 l=' '.join(cols)
73 try: 81 try:
74 d=parse(l) 82 d=parse(l)
75 if d is None: 83 if d is None:
76 print(3,d,l,count,file=sys.stderr) 84 print(3,d,l,count,file=sys.stderr)
77 year=0 85 bogons+=1
78 month=2 86 continue
79 elif d.year<1900: 87 elif d.year<1900 or d.year>2100:
80 year=0 88 # Shouldn't happen
81 month=3 89 print(8,ff.group(1),ff.group(2),ff.group(3),ff.group(4),
82 elif d.year>3499: 90 file=sys.stderr)
83 year=3499 91 bogons+=1
84 month=2 92 continue
85 else: 93 else:
86 year=d.year 94 year=d.year
87 month=d.month 95 month=d.month
88 except Exception as e: 96 except Exception as e:
89 print(4,e,l,file=sys.stderr) 97 print(4,e,l,file=sys.stderr)
90 bogons+=1 98 bogons+=1
91 continue 99 continue
92 # file it 100 # log it
93 if tab=={}: 101 yy=tab[scheme]
102 y=year-1900
103 if yy[y] is None:
104 yy[y]=mm=array('L',13*[0])
105 else:
106 mm=yy[y]
107 mm[month]+=count
108 if n==0:
94 # ssh screwed up 109 # ssh screwed up
95 exit(1) 110 exit(1)
96 for ((s,m,y),c) in tab.items(): 111 for s in (HTTP,HTTPS):
97 print(s,m,y,c,sep='\t') 112 if nd[s]!=0:
98 print(bogons,file=sys.stderr) 113 print(sn[s],0,0,nd[s],sep='\t')
114 yy=tab[s]
115 for y in range(201):
116 mm=yy[y]
117 if mm is not None:
118 for m in range(1,13):
119 if mm[m]!=0:
120 print(sn[s],mn[m],y+1900,mm[m],sep='\t')
121 print(n,bogons,file=sys.stderr)
122
123
124