view master/bin/fixDates.py @ 31:580cc12c9712

partway to rework after failure of mergedWhich.x64700
author Henry S. Thompson <ht@markup.co.uk>
date Mon, 19 Nov 2018 18:33:17 +0000
parents dd19cf97b6dd
children 9342f6269edf
line wrap: on
line source

#!/usr/bin/env python3
import sys,re
from array import array
# Fast-path line format: "<scheme> <token> <year> <count>", separated by
# single spaces.
#   group 1: 'http' or 'https'
#   group 2: any run of non-space characters (possibly empty); the main loop
#            tries it first as a month abbreviation, then as an integer
#   group 3: four characters starting with 19 or 20 -- the last two are
#            matched by '.', so they are NOT guaranteed to be digits
#   group 4: one or more decimal digits (the count)
# The pattern is applied with .match(), so it is anchored at the start of the
# line; '$' also matches just before the trailing newline.
ok=re.compile('(https?) ([^ ]*) ((?:19|20)..) ([0-9][0-9]*)$')
# Disabled pattern, left commented out:
#parseable=re.compile('.*[-:/]\w+[-:/]|\w+\s\d{4}|\d{10}')
from dateparser import parse

# Number of input lines rejected so far; incremented by the main loop and
# reported on stderr at the end of the run.
bogons = 0

# Per-scheme tables with 201 slots each (1900--2100).
# NOTE(review): these four tables are not referenced anywhere else in the
# visible script -- presumably scaffolding for a rework in progress; confirm.
http_ytab = [None] * 201
https_ytab = [None] * 201
# 13 zero-initialised slots per scheme -- presumably one per month code 0..12
# for entries filed under year 0; TODO confirm.
http_yzero = [0] * 13
https_yzero = [0] * 13

# English three-letter month abbreviation -> month number (1..12).
months = {
  abbrev: number
  for number, abbrev in enumerate(
    ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'),
    start=1)
}
# Main pass: read "<scheme> <date-ish fields...> <count>" lines from stdin,
# reduce each to a (scheme, month, year) key and sum the counts per key.
#
# Output:
#   stdout -- one tab-separated row "scheme month year count" per key
#   stderr -- one numbered diagnostic per problem line (codes 1..7 below),
#             followed by the total number of rejected lines (bogons)
# Exit status 1 if nothing at all was tallied.
#
# Special keys produced by the slow path:
#   year 0,    month 0 -- no date fields at all
#   year 0,    month 2 -- dateparser could not parse the date (diagnostic 3)
#   year 0,    month 3 -- parsed year earlier than 1900
#   year 3499, month 2 -- parsed year later than 3499

# Totals keyed by (scheme, month, year).  The final loop below iterates
# tab.items() in exactly that shape, so the table must be built here.
tab={}
for l in sys.stdin:
  ff=ok.match(l)
  if ff is not None:
    #print(l,end='')
    scheme=ff.group(1)
    # The regex guarantees group 4 is all digits, so this cannot fail.  It is
    # done before the month/year conversion so that the count is still
    # available when we have to fall back to dateparser below.
    count=int(ff.group(4))
    try:
      # More alphas than numerics...
      try:
        month=months[ff.group(2)]
      except KeyError:
        month=int(ff.group(2))
      year=int(ff.group(3))
    except ValueError:
      # Unusual month or year field: let dateparser have a go
      try:
        d=parse("%s %s"%(ff.group(2),ff.group(3)))
      except Exception as e:
        print(6,e,l,file=sys.stderr)
        bogons+=1
        continue
      if d is None:
        print(5,ff.group(1),ff.group(2),ff.group(3),ff.group(4),
              file=sys.stderr)
        bogons+=1
        continue
      if d.year<1900 or d.year>2100:
        # Shouldn't happen
        print(7,ff.group(1),ff.group(2),ff.group(3),ff.group(4),
              file=sys.stderr)
        bogons+=1
        continue
      month=d.month
      year=d.year
  else:
    cols=l.split()
    # A blank line yields no columns at all; give it an empty scheme so that
    # it is rejected below instead of raising IndexError.
    scheme=cols[0] if cols else ''
    # We get both 'http:' with nothing else, when there was no last-mod
    #  header, and 'http' by itself, when the last-mod consisted entirely of
    #  TZ info, which then gets deleted -- so strip a trailing colon
    if scheme.endswith(':'):
      scheme=scheme[0:-1]
    if scheme not in ('http','https'):
      print(1,scheme,l,file=sys.stderr)
      bogons+=1
      continue
    cols=cols[1:]
    try:
      # Last column is the count; whatever is left in cols is the date
      count=int(cols.pop())
    except (IndexError,ValueError):
      # Nothing after the scheme, or a non-numeric final column
      print(2,cols,file=sys.stderr)
      bogons+=1
      continue
    if not cols:
      year=month=0
    else:
      datestr=' '.join(cols)
      try:
        d=parse(datestr)
      except Exception as e:
        print(4,e,datestr,file=sys.stderr)
        bogons+=1
        continue
      if d is None:
        # Reported, but still tallied (under year 0, month 2) rather than
        # counted as a bogon
        print(3,d,datestr,count,file=sys.stderr)
        year=0
        month=2
      elif d.year<1900:
        year=0
        month=3
      elif d.year>3499:
        year=3499
        month=2
      else:
        year=d.year
        month=d.month
  # file it
  key=(scheme,month,year)
  tab[key]=tab.get(key,0)+count
if not tab:
  # Nothing usable was read at all (original note: ssh screwed up)
  sys.exit(1)
for ((s,m,y),c) in tab.items():
  print(s,m,y,c,sep='\t')
print(bogons,file=sys.stderr)