changeset 31:580cc12c9712

partway to rework after failure of mergedWhich.x64700
author Henry S. Thompson <ht@markup.co.uk>
date Mon, 19 Nov 2018 18:33:17 +0000
parents 9275e2a8b5e2
children 9342f6269edf
files master/bin/fixDates.py
diffstat 1 files changed, 60 insertions(+), 52 deletions(-) [+]
line wrap: on
line diff
--- a/master/bin/fixDates.py	Mon Nov 19 18:32:30 2018 +0000
+++ b/master/bin/fixDates.py	Mon Nov 19 18:33:17 2018 +0000
@@ -1,11 +1,15 @@
 #!/usr/bin/env python3
 import sys,re
+from array import array
 ok=re.compile('(https?) ([^ ]*) ((?:19|20)..) ([0-9][0-9]*)$')
 #parseable=re.compile('.*[-:/]\w+[-:/]|\w+\s\d{4}|\d{10}')
 from dateparser import parse
 
 bogons=0
-tab={}
+http_ytab=list(201*[None]) # 1900--2100
+https_ytab=list(201*[None])
+http_yzero=list(13*[0])
+https_yzero=list(13*[0])
 months={'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,
         'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12}
 for l in sys.stdin:
@@ -13,6 +17,7 @@
   if ff is not None:
     #print(l,end='')
     scheme=ff.group(1)
+    count=None
     try:
       # More alphas than numerics...
       try:
@@ -20,68 +25,71 @@
       except KeyError:
         month=int(ff.group(2))
       year=int(ff.group(3))
+      count=int(ff.group(4))
     except:
       # Unusual month or year field
       try:
         d=parse("%s %s"%(ff.group(2),ff.group(3)))
-        if d is None:
-          print(5,ff.group(1),ff.group(2),ff.group(3),file=sys.stderr)
-          year=0
-          month=0
-        elif d.year<1970:
-          year=0
-          month=1
-        elif d.year>2019:
-          month=1
-          year=2019
+        if d is None or count is None:
+          print(5,ff.group(1),ff.group(2),ff.group(3),ff.group(4),
+                file=sys.stderr)
+          bogons+=1
+          continue
+        elif d.year<1900 or d.year>2100:
+          # Shouldn't happen 
+          print(7,ff.group(1),ff.group(2),ff.group(3),ff.group(4),
+                file=sys.stderr)
+          bogons+=1
+          continue
         else:
           month=d.month
           year=d.year
       except Exception as e:
         print(6,e,l,file=sys.stderr)
         bogons+=1
-    count=int(ff.group(4))
-    key=(scheme,year,month)
-    tab[key]=tab.get(key,0)+count
-    continue
-  cols=l.split()
-  scheme=cols[0]
-  if scheme[-1]==':':
-    scheme=scheme[0:-1]
-  if scheme not in ('http','https'):
-    # The last 3 lines are needed because we get both http: with nothing else, when there was no last-mod
-    #  header, or http by itself, when the last-mod consisted entirely of TZ info, which then gets deleted
-    print(1,scheme,l,file=sys.stderr)
-    bogons+=1
-    continue
-  try:
-    cols=cols[1:]
-    count=int(cols.pop())
-  except:
-    print(2,cols,file=sys.stderr)
-    bogons+=1
-    continue
-  if cols==[]:
-    key=(scheme,0,0)
-    tab[key]=tab.get(key,0)+count
-    continue
-  l=' '.join(cols)
-  try:
-    d=parse(l)
-    if d is None:
-      print(3,d,l,count,file=sys.stderr)
-      year=0
-      month=2
-    elif d.year<1970:
-      key=(scheme,0,1)
-    elif d.year>2019:
-      key=(scheme,2019,1)
+        continue
+  else:
+    cols=l.split()
+    scheme=cols[0]
+    if scheme[-1]==':':
+      scheme=scheme[0:-1]
+    if scheme not in ('http','https'):
+      # The last 3 lines are needed because we get both http: with nothing else, when there was no last-mod
+      #  header, or http by itself, when the last-mod consisted entirely of TZ info, which then gets deleted
+      print(1,scheme,l,file=sys.stderr)
+      bogons+=1
+      continue
+    try:
+      cols=cols[1:]
+      count=int(cols.pop())
+    except:
+      print(2,cols,file=sys.stderr)
+      bogons+=1
+      continue
+    if cols==[]:
+      year=month=0
     else:
-      key=(scheme,d.year,d.month)
-    tab[key]=tab.get(key,0)+count
-  except Exception as e:
-    print(4,e,l,file=sys.stderr)
-    bogons+=1
+      l=' '.join(cols)
+      try:
+        d=parse(l)
+        if d is None:
+          print(3,d,l,count,file=sys.stderr)
+          year=0
+          month=2
+        elif d.year<1900:
+          year=0
+          month=3
+        elif d.year>3499:
+          year=3499
+          month=2
+        else:
+          year=d.year
+          month=d.month
+      except Exception as e:
+        print(4,e,l,file=sys.stderr)
+        bogons+=1
+        continue
+  # file it
 if tab=={}:
   # ssh screwed up
   exit(1)