changeset 24:b4e3beb2227e

improved error handling, does totalling now too
author Henry S. Thompson <ht@markup.co.uk>
date Wed, 07 Nov 2018 14:15:56 +0000
parents cc065b2a2543
children 1b9329f6b5e1
files master/bin/fixDates.py
diffstat 1 files changed, 83 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/master/bin/fixDates.py	Wed Oct 31 21:42:34 2018 +0000
+++ b/master/bin/fixDates.py	Wed Nov 07 14:15:56 2018 +0000
@@ -1,1 +1,83 @@
-/home/cc/lib/python/fixDates.py
\ No newline at end of file
+#!/usr/bin/env python3
+# Read 'scheme date... count' lines from stdin and total the counts per
+#  (scheme,year,month), writing tab-separated totals to stdout.
+# Lines that cannot be used are reported on stderr with a numeric tag (1-5);
+#  the number of rejected lines ("bogons") is written to stderr at the end.
+# Year 0 marks an unusable date (month 0: none/bad, 1: pre-1970, 2: unparseable)
+import sys,re
+# Fast path: scheme, month (name or number), 4-character year, count
+ok=re.compile('(https?) ([^ ]*) ((?:19|20)..) ([0-9][0-9]*)$')
+from dateparser import parse
+
+bogons=0
+tab={}
+months={'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,
+        'Jul':7,'Aug':8,'Sep':9,'Oct':10,'Nov':11,'Dec':12}
+for l in sys.stdin:
+  ff=ok.match(l)
+  if ff is not None:
+    scheme=ff.group(1)
+    try:
+      # More alphas than numerics...
+      try:
+        month=months[ff.group(2)]
+      except KeyError:
+        month=int(ff.group(2))
+      year=int(ff.group(3))
+    except ValueError:
+      # Unusual month or year field
+      d=parse("%s %s"%(ff.group(2),ff.group(3)))
+      if d is None:
+        print(5,ff.group(1),ff.group(2),ff.group(3),file=sys.stderr)
+        year,month=0,0
+      elif d.year<1970:
+        year,month=0,1
+      elif d.year>2019:
+        year,month=2019,1
+      else:
+        year,month=d.year,d.month
+    count=int(ff.group(4))
+    key=(scheme,year,month)
+    tab[key]=tab.get(key,0)+count
+    continue
+  cols=l.split()
+  scheme=cols[0] if cols else ''  # blank line: rejected below, not an IndexError
+  if scheme.endswith(':'):
+    scheme=scheme[:-1]
+  if scheme not in ('http','https'):
+    # The last 3 lines are needed because we get both http: with nothing else, when there was no last-mod
+    #  header, or http by itself, when the last-mod consisted entirely of TZ info, which then gets deleted
+    print(1,scheme,l,file=sys.stderr)
+    bogons+=1
+    continue
+  try:
+    cols=cols[1:]
+    count=int(cols.pop())
+  except (IndexError,ValueError):
+    print(2,cols,file=sys.stderr)
+    bogons+=1
+    continue
+  if cols==[]:
+    key=(scheme,0,0)
+    tab[key]=tab.get(key,0)+count
+    continue
+  l=' '.join(cols)
+  try:
+    d=parse(l)
+    if d is None:
+      print(3,d,l,count,file=sys.stderr)
+      # Build the key here, otherwise the previous line's key gets this count
+      key=(scheme,0,2)
+    elif d.year<1970:
+      key=(scheme,0,1)
+    elif d.year>2019:
+      key=(scheme,2019,1)
+    else:
+      key=(scheme,d.year,d.month)
+    tab[key]=tab.get(key,0)+count
+  except Exception as e:
+    print(4,e,l,file=sys.stderr)
+    bogons+=1
+for ((s,y,m),c) in tab.items():
+  print(s,y,m,c,sep='\t')
+print(bogons,file=sys.stderr)