Mercurial > hg > cc > azure
view master/bin/fixDates.py @ 68:1f04bce6ead7 default tip
use basefile instead of transferfile, and remove cleanup: belt and braces wrt lossage of sac_schemes.py in 15% of 1000_k3,
this as used in a_2
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Thu, 04 Jun 2020 20:44:44 +0000 |
parents | 3313edbab3b0 |
children |
line wrap: on
line source
#!/usr/bin/env python3
"""Sum per-scheme date counts from stdin into year/month totals.

Input lines (one record each) come in two shapes:

* ``<scheme> <month> <year> <count>`` separated by single spaces, where
  ``<month>`` is an English three-letter abbreviation or a number.  These
  are handled directly, falling back to ``dateparser`` for odd fields.
* ``<scheme>[:] [free-form date ...] <count>``.  The date words, if any, are
  handed to ``dateparser``.  With no date words the count is added to the
  "no date" total.  (Per the original comments these arise from missing or
  timezone-only last-mod headers.)

Lines starting with ``#`` are not data; each one produces a ``#1``
timestamp on stderr.

Output on stdout is tab-separated ``scheme, month-name, year, count``, one
row per non-zero month, preceded per scheme by the special rows
``scheme 0 0 <no date>``, ``scheme 0 1 <before 1900>`` and
``scheme 0 2 <after 2100>`` when those totals are non-zero.

Rejected lines ("bogons") are reported on stderr with a leading code:
1 bad/missing scheme, 2 bad/missing count, 3 date not parseable,
4 date parser raised, 5 month/year fields not usable, 6 parser raised on
month/year fields, 7 year outside 1900-2100 on a month/year record.

Exit status is 1 when no data line was read, otherwise 0.
"""
import re
import sys
from array import array
from time import strftime

# Fast path: "<scheme> <month> <year> <count>" with single spaces.
ok = re.compile(r'(https?) ([^ ]*) ((?:19|20)..) ([0-9][0-9]*)$')

HTTP = 0
HTTPS = 1
sn = ['http', 'https']

mn = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
      'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
months = dict(zip(mn[1:], range(1, 13)))

FIRST_YEAR = 1900
LAST_YEAR = 2100
STAMP = '%Y-%m-%d %H:%M:%S'


class Tally:
    """Running totals for both schemes, indexed by HTTP / HTTPS."""

    def __init__(self):
        self.n = 0        # data lines seen (everything except '#' lines)
        self.bogons = 0   # data lines rejected
        # One slot per year FIRST_YEAR..LAST_YEAR; a slot stays None until
        # the first count for that year arrives.
        self.tab = [[None] * (LAST_YEAR - FIRST_YEAR + 1) for _ in sn]
        self.nd = [0, 0]  # no date
        self.ed = [0, 0]  # date < 1900
        self.ld = [0, 0]  # date > 2100

    def add(self, scheme, year, month, count):
        """Add count to the (scheme, year, month) cell.

        The caller guarantees FIRST_YEAR <= year <= LAST_YEAR,
        1 <= month <= 12 and count >= 0.
        """
        years = self.tab[scheme]
        slot = year - FIRST_YEAR
        if years[slot] is None:
            # Index 0 is unused so that months index naturally as 1..12.
            years[slot] = array('L', 13 * [0])
        years[slot][month] += count

    def rows(self):
        """Yield the output rows as tuples, in output order."""
        for s in (HTTP, HTTPS):
            if self.nd[s] != 0:
                yield (sn[s], 0, 0, self.nd[s])
            if self.ed[s] != 0:
                yield (sn[s], 0, 1, self.ed[s])
            if self.ld[s] != 0:
                yield (sn[s], 0, 2, self.ld[s])
            for offset, per_month in enumerate(self.tab[s]):
                if per_month is None:
                    continue
                for m in range(1, 13):
                    if per_month[m] != 0:
                        yield (sn[s], mn[m], offset + FIRST_YEAR,
                               per_month[m])


def direct_month_year(month_text, year_text):
    """Return (month, year) as ints, or None if either field is not numeric.

    month_text may be a three-letter abbreviation from ``mn`` or a number.
    No range checking is done here.
    """
    month = months.get(month_text)
    try:
        if month is None:
            month = int(month_text)
        year = int(year_text)
    except ValueError:
        return None
    return month, year


def record(tally, line, date_parser, err):
    """Interpret one data line and update tally.

    date_parser is called with a date string and must return an object with
    ``year`` and ``month`` attributes, or None when it cannot parse.
    Rejections are printed to err and counted in ``tally.bogons``.
    """
    def bogon(*fields):
        print(*fields, file=err)
        tally.bogons += 1

    found = ok.match(line)
    if found is not None:
        scheme_text, month_text, year_text, count_text = found.groups()
        scheme = HTTP if scheme_text == 'http' else HTTPS
        try:
            count = int(count_text)
        except ValueError:
            count = None
        pair = None
        if count is not None:
            pair = direct_month_year(month_text, year_text)
        if pair is not None:
            month, year = pair
            # Numeric fields must be validated before they are used as
            # indexes: month 0 would vanish silently, a negative month
            # would wrap, and anything else would raise IndexError.
            if not 1 <= month <= 12:
                bogon(5, scheme_text, month_text, year_text, count_text)
                return
            if not FIRST_YEAR <= year <= LAST_YEAR:
                bogon(7, scheme_text, month_text, year_text, count_text)
                return
        else:
            # Unusual month or year field
            try:
                d = date_parser("%s %s" % (month_text, year_text))
            except Exception as e:
                bogon(6, scheme_text, e, line)
                return
            if d is None or count is None:
                bogon(5, scheme_text, month_text, year_text, count_text)
                return
            if d.year < FIRST_YEAR or d.year > LAST_YEAR:
                # Shouldn't happen
                bogon(7, scheme_text, month_text, year_text, count_text)
                return
            month = d.month
            year = d.year
    else:
        cols = line.split()
        # A blank line has no columns at all; treat it as a bad scheme.
        scheme_text = cols[0] if cols else ''
        # We get both "http:" and bare "http" (see module docstring).
        if scheme_text.endswith(':'):
            scheme_text = scheme_text[:-1]
        if scheme_text not in ('http', 'https'):
            bogon(1, scheme_text, line)
            return
        scheme = HTTP if scheme_text == 'http' else HTTPS
        rest = cols[1:]
        count_text = rest.pop() if rest else None
        try:
            count = int(count_text)
        except (TypeError, ValueError):
            count = None
        # Negative counts are rejected too: the per-month cells are
        # unsigned and would overflow.
        if count is None or count < 0:
            # Report the offending token (None when it is missing).
            bogon(2, sn[scheme], count_text, line)
            return
        if not rest:
            tally.nd[scheme] += count
            return
        text = ' '.join(rest)
        try:
            d = date_parser(text)
        except Exception as e:
            bogon(4, sn[scheme], e, text, count)
            return
        if d is None:
            bogon(3, sn[scheme], text, count)
            return
        # Jan 0001 does show up, so log these as early / late
        if d.year < FIRST_YEAR:
            tally.ed[scheme] += count
            return
        if d.year > LAST_YEAR:
            tally.ld[scheme] += count
            return
        year = d.year
        month = d.month
    # log it
    tally.add(scheme, year, month, count)


def process(lines, date_parser, err=None):
    """Consume lines and return the resulting Tally.

    lines is any iterable of strings.  date_parser is as for ``record``.
    err is the diagnostics stream and defaults to ``sys.stderr``.
    """
    if err is None:
        err = sys.stderr
    tally = Tally()
    for line in lines:
        if line.startswith('#'):
            print('#1 %s' % strftime(STAMP), file=err)
            continue
        tally.n += 1
        record(tally, line, date_parser, err)
    return tally


def main():
    """Run as a filter from stdin to stdout; return the exit status."""
    # Imported here, before any input is read, so that the functions above
    # stay importable without dateparser while a missing dependency still
    # fails immediately when the script is run.
    from dateparser import parse

    def parse_date(text):
        # Without this setting a missing day defaults to today's
        # day-of-month, which may fail if it's e.g. 31 March today and the
        # string is "April 2017".
        return parse(text, settings={'PREFER_DAY_OF_MONTH': 'first'})

    tally = process(sys.stdin, parse_date)
    if tally.n == 0:
        # ssh screwed up
        return 1
    print('#2 %s' % strftime(STAMP), file=sys.stderr)
    for row in tally.rows():
        print(*row, sep='\t')
    print('#3 %s %s %s' % (strftime(STAMP), tally.n, tally.bogons),
          file=sys.stderr)
    return 0


if __name__ == '__main__':
    sys.exit(main())