changeset 32:9342f6269edf

rewritten to be faster, maybe, and avoid earlier bug
author Henry S. Thompson <ht@markup.co.uk>
date Tue, 20 Nov 2018 10:31:05 +0000
parents 580cc12c9712
children 4c117ee8ed75
files master/bin/fixDates.py
diffstat 1 files changed, 46 insertions(+), 20 deletions(-) [+]
line wrap: on
line diff
--- a/master/bin/fixDates.py	Mon Nov 19 18:33:17 2018 +0000
+++ b/master/bin/fixDates.py	Tue Nov 20 10:31:05 2018 +0000
@@ -5,18 +5,24 @@
 #parseable=re.compile('.*[-:/]\w+[-:/]|\w+\s\d{4}|\d{10}')
 from dateparser import parse
 
+n=0
 bogons=0
+HTTP=0
+HTTPS=1
+sn=['http','https']
 http_ytab=list(201*[None]) # 1900--2100
 https_ytab=list(201*[None])
-http_yzero=list(13*[0])
-https_yzero=list(13*[0])
-months={'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,
-        'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12}
+tab=[http_ytab,https_ytab]
+nd=[0,0]
+mn=[None,'Jan','Feb','Mar','Apr','May','Jun',
+        'Jul','Aug','Sep','Oct','Nov','Dec']
+months=dict(zip(mn[1:],range(1,13)))
 for l in sys.stdin:
+  n+=1
   ff=ok.match(l)
   if ff is not None:
     #print(l,end='')
-    scheme=ff.group(1)
+    scheme=HTTP if ff.group(1)=='http' else HTTPS
     count=None
     try:
       # More alphas then numerics...
@@ -59,29 +65,31 @@
       print(1,scheme,l,file=sys.stderr)
       bogons+=1
       continue
+    scheme=HTTP if scheme=='http' else HTTPS
     try:
       cols=cols[1:]
       count=int(cols.pop())
     except:
-      print(2,cols,file=sys.stderr)
+      print(2,sn[scheme],cols,count,file=sys.stderr)
       bogons+=1
       continue
     if cols==[]:
-      year=month=0
+      nd[scheme]+=count
+      continue
     else:
       l=' '.join(cols)
       try:
         d=parse(l)
         if d is None:
           print(3,d,l,count,file=sys.stderr)
-          year=0
-          month=2
-        elif d.year<1900:
-          year=0
-          month=3
-        elif d.year>3499:
-          year=3499
-          month=2
+          bogons+=1
+          continue
+        elif d.year<1900 or d.year>2100:
+          # Shouldn't happen: the year tables only cover 1900--2100
+          print(8,ff.group(1),ff.group(2),ff.group(3),ff.group(4),
+                file=sys.stderr)
+          bogons+=1
+          continue
         else:
           year=d.year
           month=d.month
@@ -89,10 +97,28 @@
         print(4,e,l,file=sys.stderr)
         bogons+=1
         continue
-  # file it
-if tab=={}:
+  # log it: add count to this scheme's per-year, per-month table
+  yy=tab[scheme]
+  y=year-1900
+  if yy[y] is None:
+    yy[y]=mm=array('L',13*[0])
+  else:
+    mm=yy[y]
+  mm[month]+=count
+if n==0:
   # ssh screwed up
   exit(1)
-for ((s,m,y),c) in tab.items():
-  print(s,m,y,c,sep='\t')
-print(bogons,file=sys.stderr)
+for s in (HTTP,HTTPS):
+  if nd[s]!=0:
+    print(sn[s],0,0,nd[s],sep='\t')
+  yy=tab[s]
+  for y in range(201):
+    mm=yy[y]
+    if mm is not None:
+      for m in range(1,13):
+        if mm[m]!=0:
+          print(sn[s],mn[m],y+1900,mm[m],sep='\t')
+print(n,bogons,file=sys.stderr)
+
+
+