view master/bin/fixDates.py @ 24:b4e3beb2227e

improved error handling, does totalling now too
author Henry S. Thompson <ht@markup.co.uk>
date Wed, 07 Nov 2018 14:15:56 +0000
parents 0f4a0f4e38d4
children dd19cf97b6dd
line wrap: on
line source

#!/usr/bin/env python3
import sys,re
# Matches an already-tidy input line of four single-space-separated fields:
#   group 1: scheme, 'http' or 'https'
#   group 2: month field, any run of non-space characters (may be empty)
#   group 3: year field, '19' or '20' followed by any two characters --
#            these need not be digits, so int() on this group can still fail
#   group 4: count, one or more decimal digits, which must end the line
ok=re.compile('(https?) ([^ ]*) ((?:19|20)..) ([0-9][0-9]*)$')
#parseable=re.compile('.*[-:/]\w+[-:/]|\w+\s\d{4}|\d{10}')
from dateparser import parse

# Number of input lines rejected as unusable
bogons = 0
# Accumulated counts, keyed by (scheme, year, month)
tab = dict()
# English three-letter month abbreviation -> month number, 1 to 12
months = {abbr: number
          for number, abbr in enumerate(
              'Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec'.split(),
              start=1)}
def _bucket(d):
  """Return the (year, month) bucket for a successfully parsed date.

  d -- an object with integer ``year`` and ``month`` attributes, as
       returned by ``parse`` when it succeeds (never None).

  Years before 1970 are folded into (0, 1) and years after 2019 into
  (2019, 1); any other date is reported as (d.year, d.month).
  """
  if d.year<1970:
    return (0,1)
  if d.year>2019:
    return (2019,1)
  return (d.year,d.month)

# Tally every line of stdin into tab under a (scheme, year, month) key.
# Lines that cannot be used are reported on stderr with a leading numeric
# code (1-5) identifying the failing step; all but code 3 and code 5 lines
# are also counted in bogons (those two are still tallied, under year 0).
for line in sys.stdin:
  ff=ok.match(line)
  if ff is not None:
    # Already-tidy line: scheme, month, year and count as four fields
    scheme=ff.group(1)
    try:
      # More alphas than numerics, so try the month-name table first
      try:
        month=months[ff.group(2)]
      except KeyError:
        month=int(ff.group(2))
      year=int(ff.group(3))
      # NOTE(review): values taken on this path are used as-is; unlike the
      #  parsed paths they are not folded into the 1970-2019 range, and a
      #  numeric month is not checked to be 1-12 -- confirm that is intended
    except ValueError:
      # Unusual month or year field: hand both to the date parser
      text="%s %s"%(ff.group(2),ff.group(3))
      try:
        d=parse(text)
      except Exception as e:
        # Same treatment as a parser failure on a free-form line below
        print(4,e,text,file=sys.stderr)
        bogons+=1
        continue
      if d is None:
        print(5,ff.group(1),ff.group(2),ff.group(3),file=sys.stderr)
        year=0
        month=0
      else:
        (year,month)=_bucket(d)
    count=int(ff.group(4))
    key=(scheme,year,month)
    tab[key]=tab.get(key,0)+count
    continue
  # Free-form line: scheme, then any number of date words, then the count
  cols=line.split()
  # A blank line has no columns at all; give it an empty scheme so that it
  #  is rejected just below instead of raising IndexError
  scheme=cols[0] if cols else ''
  # The scheme arrives both as 'http:' (with nothing else, when there was no
  #  last-mod header) and as bare 'http' (when the last-mod consisted
  #  entirely of TZ info, which then gets deleted), so drop a trailing colon
  if scheme.endswith(':'):
    scheme=scheme[:-1]
  if scheme not in ('http','https'):
    print(1,scheme,line,file=sys.stderr)
    bogons+=1
    continue
  try:
    cols=cols[1:]
    count=int(cols.pop())
  except (IndexError,ValueError):
    # Nothing after the scheme, or a final field that is not a number
    print(2,cols,file=sys.stderr)
    bogons+=1
    continue
  if not cols:
    # Scheme and count only, no date text: tally under year 0, month 0
    key=(scheme,0,0)
    tab[key]=tab.get(key,0)+count
    continue
  text=' '.join(cols)
  try:
    d=parse(text)
  except Exception as e:
    print(4,e,text,file=sys.stderr)
    bogons+=1
    continue
  if d is None:
    # Unparseable date text: still tallied, under year 0, month 2
    print(3,d,text,count,file=sys.stderr)
    key=(scheme,0,2)
  else:
    key=(scheme,)+_bucket(d)
  tab[key]=tab.get(key,0)+count
# Write one tab-separated row per bucket to stdout; the key fields are
# printed in the order they are stored in the key, followed by the total
for key,total in tab.items():
  print(*key,total,sep='\t')
# Finish with the number of rejected lines, on stderr
print(bogons,file=sys.stderr)