Mercurial > hg > cc > azure
changeset 24:b4e3beb2227e
improved error handling,
does totalling now too
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Wed, 07 Nov 2018 14:15:56 +0000 |
parents | cc065b2a2543 |
children | 1b9329f6b5e1 |
files | master/bin/fixDates.py |
diffstat | 1 files changed, 83 insertions(+), 1 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python3
"""Total counts per (scheme, year, month) from tally lines on stdin.

Each input line starts with a scheme (``http`` or ``https``, optionally
followed by ``:``), then an optional date, then an integer count.  Counts
are summed per ``(scheme, year, month)`` and written to stdout as
tab-separated ``scheme year month count`` rows.  Diagnostics, each prefixed
with a small numeric code, and the final number of rejected lines go to
stderr.

Sentinel buckets used for dates that cannot be placed normally:

* ``(0, 0)``    -- no date at all, or an unparseable "month year" pair
* ``(0, 1)``    -- date earlier than 1970
* ``(0, 2)``    -- free-form date text the parser could not understand
* ``(2019, 1)`` -- date later than 2019
"""
import re
import sys

# Fast path: "<scheme> <month> <4-char year starting 19/20> <count>".
OK = re.compile('(https?) ([^ ]*) ((?:19|20)..) ([0-9][0-9]*)$')

MONTHS = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
          'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}

MIN_YEAR = 1970
MAX_YEAR = 2019


def _bucket(d):
    """Return the (year, month) bucket for parsed date *d*, clamping outliers."""
    if d.year < MIN_YEAR:
        return 0, 1
    if d.year > MAX_YEAR:
        return MAX_YEAR, 1
    return d.year, d.month


def tally(lines, parse_date):
    """Sum the counts in *lines* per (scheme, year, month).

    Args:
        lines: iterable of input lines (trailing newlines are fine).
        parse_date: callable taking a date string and returning an object
            with ``year`` and ``month`` attributes, or ``None`` when the
            string cannot be parsed (``dateparser.parse`` in production).

    Returns:
        ``(tab, bogons)`` where *tab* maps ``(scheme, year, month)`` to the
        summed count and *bogons* is the number of rejected lines.
    """
    tab = {}
    bogons = 0
    for line in lines:
        fields = OK.match(line)
        if fields is not None:
            scheme, month_text, year_text, count_text = fields.groups()
            try:
                # More alphas than numerics, so try the month name first.
                try:
                    month = MONTHS[month_text]
                except KeyError:
                    month = int(month_text)
                year = int(year_text)
            except ValueError:
                # Unusual month or year field: fall back to the date parser.
                d = parse_date("%s %s" % (month_text, year_text))
                if d is None:
                    print(5, scheme, month_text, year_text, file=sys.stderr)
                    year, month = 0, 0
                else:
                    year, month = _bucket(d)
            key = (scheme, year, month)
            tab[key] = tab.get(key, 0) + int(count_text)
            continue

        cols = line.split()
        # A blank line has no columns at all; report it like any other
        # line with an unusable scheme instead of crashing on cols[0].
        scheme = cols[0] if cols else ''
        if scheme.endswith(':'):
            scheme = scheme[:-1]
        if scheme not in ('http', 'https'):
            # The colon stripping above is needed because we get both
            # "http:" with nothing else, when there was no last-mod header,
            # and "http" by itself, when the last-mod consisted entirely of
            # TZ info, which then gets deleted.
            print(1, scheme, line, file=sys.stderr)
            bogons += 1
            continue

        cols = cols[1:]
        try:
            # pop() raises IndexError when only the scheme was present;
            # int() raises ValueError when the last column is not a count.
            count = int(cols.pop())
        except (IndexError, ValueError):
            print(2, cols, file=sys.stderr)
            bogons += 1
            continue

        if not cols:
            # Scheme and count only: no date information.
            key = (scheme, 0, 0)
            tab[key] = tab.get(key, 0) + count
            continue

        text = ' '.join(cols)
        try:
            d = parse_date(text)
        except Exception as e:
            # Best-effort boundary: whatever the parser throws, log the
            # line, count it as rejected and carry on with the next one.
            print(4, e, text, file=sys.stderr)
            bogons += 1
            continue
        if d is None:
            print(3, d, text, count, file=sys.stderr)
            year, month = 0, 2
        else:
            year, month = _bucket(d)
        key = (scheme, year, month)
        tab[key] = tab.get(key, 0) + count
    return tab, bogons


def main():
    """Tally stdin and print the table to stdout, bogon count to stderr."""
    # Imported here so the tallying logic above can be imported (e.g. for
    # testing) on a machine without dateparser installed.
    from dateparser import parse

    tab, bogons = tally(sys.stdin, parse)
    for (scheme, year, month), count in tab.items():
        print(scheme, year, month, count, sep='\t')
    print(bogons, file=sys.stderr)


if __name__ == '__main__':
    main()