view master/bin/fixDates.py @ 40:4cf6bc21f683

start work on python version of tW.sh
author Henry S. Thompson <ht@markup.co.uk>
date Fri, 30 Nov 2018 13:43:36 +0000
parents bb09db2afe6b
children 3313edbab3b0
line wrap: on
line source

#!/usr/bin/env python3
import sys,re
from array import array
from time import strftime
# Pattern for the common "<scheme> <month> <year> <count>" line shape, matched
#  from the start of the line:
#   group 1: http or https
#   group 2: the month field, any (possibly empty) run of non-space characters
#   group 3: the year field, four characters beginning 19 or 20
#   group 4: the count, one or more ASCII digits running to the end of the line
ok=re.compile('(https?) ([^ ]*) ((?:19|20)..) ([0-9][0-9]*)$')
#parseable=re.compile('.*[-:/]\w+[-:/]|\w+\s\d{4}|\d{10}')
from dateparser import parse

# Counters: lines processed so far, and lines that had to be rejected
n=bogons=0
# Scheme indices, usable as subscripts into sn, tab, nd, ed and ld
HTTP,HTTPS=0,1
sn=['http','https']
# One slot per year 1900--2100; a slot stays None until that year is first seen
http_ytab=[None]*201
https_ytab=[None]*201
tab=[http_ytab,https_ytab]
# Per-scheme totals for entries that cannot go in the year tables
nd,ed,ld=[0,0],[0,0],[0,0] # no date, date < 1900, date > 2100
# Month number -> abbreviation; slot 0 is a placeholder so that January is 1
mn=[None,
    'Jan','Feb','Mar','Apr','May','Jun',
    'Jul','Aug','Sep','Oct','Nov','Dec']
# Abbreviation -> month number, the inverse of mn
months={name:num for num,name in enumerate(mn[1:],1)}
# Main pass over stdin.  Each data line is "<scheme> <date words...> <count>";
#  its count is added to tab[scheme][year-1900][month], or to nd/ed/ld when
#  there is no date or the year falls outside 1900--2100.  Lines starting
#  with '#' only produce a timestamp on stderr.  Every rejected line bumps
#  bogons and is reported on stderr with a leading code saying which check
#  rejected it:
#   1 scheme is not http/https (this includes blank lines)
#   2 missing, non-numeric or negative count
#   3 dateparser could not make sense of the date
#   4 dateparser raised an exception on the date
#   5 month/year line with an unusable month or year
#   6 dateparser raised an exception on a month/year line
#   7 month/year line whose year is outside 1900--2100
for l in sys.stdin:
  if l.startswith('#'):
    print('#1 %s'%strftime('%Y-%m-%d %H:%M:%S'),file=sys.stderr)
    continue
  n+=1
  ff=ok.match(l)
  if ff is not None:
    # Common case: exactly "<scheme> <month> <year> <count>"
    fields=ff.groups()
    scheme=HTTP if fields[0]=='http' else HTTPS
    count=None
    try:
      # More alphas than numerics...
      count=int(fields[3])
      try:
        month=months[fields[1]]
      except KeyError:
        month=int(fields[1])
      year=int(fields[2])
    except ValueError:
      # Unusual month or year field
      try:
        # day 1 is because w/o it today's day-of-month is used as the default,
        #  which may fail if it's e.g. 31 March today and the string is
        #  "April 2017"
        d=parse("1 %s %s"%(fields[1],fields[2]))#,languages=['en'])
      except Exception as e:
        print(6,fields[0],e,l,file=sys.stderr)
        bogons+=1
        continue
      if d is None or count is None:
        print(5,*fields,file=sys.stderr)
        bogons+=1
        continue
      if d.year<1900 or d.year>2100:
        # Shouldn't happen
        print(7,*fields,file=sys.stderr)
        bogons+=1
        continue
      month=d.month
      year=d.year
    else:
      # int() is happy with month fields such as '0', '13' or '-1' and with
      #  year fields such as '19_9' (== 199), none of which can be used to
      #  index the tables, so reject them here
      if not 1<=month<=12:
        print(5,*fields,file=sys.stderr)
        bogons+=1
        continue
      if not 1900<=year<=2100:
        print(7,*fields,file=sys.stderr)
        bogons+=1
        continue
  else:
    cols=l.split()
    # A blank line has no columns at all: give it an empty scheme so that it
    #  is rejected just below instead of raising IndexError
    scheme=cols[0] if cols else ''
    # We get both http: with nothing else, when there was no last-mod
    #  header, or http by itself, when the last-mod consisted entirely of
    #  TZ info, which then gets deleted
    if scheme.endswith(':'):
      scheme=scheme[:-1]
    if scheme not in ('http','https'):
      print(1,scheme,l,file=sys.stderr)
      bogons+=1
      continue
    scheme=HTTP if scheme=='http' else HTTPS
    # Reset so that the diagnostic below never shows a previous line's count
    count=None
    try:
      cols=cols[1:]
      count=int(cols.pop())
      if count<0:
        # The month arrays are unsigned, so a negative count cannot be stored
        raise ValueError('negative count')
    except (IndexError,ValueError):
      # IndexError: nothing after the scheme; ValueError: unusable count
      print(2,sn[scheme],count,l,file=sys.stderr)
      bogons+=1
      continue
    if not cols:
      # Scheme and count only, i.e. no date
      nd[scheme]+=count
      continue
    datestr=' '.join(cols)
    try:
      d=parse(datestr)#,languages=['en']))
    except Exception as e:
      print(4,sn[scheme],e,datestr,count,file=sys.stderr)
      bogons+=1
      continue
    if d is None:
      print(3,sn[scheme],datestr,count,file=sys.stderr)
      bogons+=1
      continue
    if d.year<1900 or d.year>2100:
      # Jan 0001 does show up, so log these as early / late
      (ed if d.year<1900 else ld)[scheme]+=count
      continue
    year=d.year
    month=d.month
  # log it
  yy=tab[scheme]
  y=year-1900
  if yy[y] is None:
    # First entry for this year: 13 slots so that months index from 1
    yy[y]=mm=array('L',13*[0])
  else:
    mm=yy[y]
  mm[month]+=count
# End of input.  Give up with status 1 if not a single data line was read;
#  otherwise write the totals to stdout as tab-separated
#  "scheme month year count" rows, http first, then https.
if n==0:
  # ssh screwed up
  # sys.exit rather than the bare exit(), which is a convenience added by the
  #  site module for interactive use and is not always available
  sys.exit(1)
print('#2 %s'%strftime('%Y-%m-%d %H:%M:%S'),file=sys.stderr)
for s in (HTTP,HTTPS):
  # The no-date, too-early and too-late totals are written with 0 in the
  #  month column and 0, 1 or 2 respectively in the year column
  for marker,total in enumerate((nd[s],ed[s],ld[s])):
    if total!=0:
      print(sn[s],0,marker,total,sep='\t')
  for y,mm in enumerate(tab[s]):
    if mm is None:
      # Nothing was logged for this year
      continue
    for m in range(1,13):
      if mm[m]!=0:
        print(sn[s],mn[m],y+1900,mm[m],sep='\t')
# Closing marker: timestamp, data lines read, lines rejected
print('#3 %s %s %s'%(strftime('%Y-%m-%d %H:%M:%S'),
                     n,bogons),
                     file=sys.stderr)