Mercurial > hg > cc > azure
changeset 34:ad6eff2bc6f9
Fixes to logging and efficiency; see also notes.txt regarding patches to dateparser.
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Wed, 21 Nov 2018 18:42:56 +0000 |
parents | 4c117ee8ed75 |
children | 1b6bcc54268d |
files | master/bin/fixDates.py workers/bin/_fixAndMerge.sh |
diffstat | 2 files changed, 15 insertions(+), 9 deletions(-) [+] |
line wrap: on
line diff
--- a/master/bin/fixDates.py Tue Nov 20 14:49:07 2018 +0000 +++ b/master/bin/fixDates.py Wed Nov 21 18:42:56 2018 +0000 @@ -14,13 +14,15 @@ http_ytab=list(201*[None]) # 1900--2100 https_ytab=list(201*[None]) tab=[http_ytab,https_ytab] -nd=[0,0] +nd=[0,0] # no date +ed=[0,0] # date < 1900 +ld=[0,0] # date > 2100 mn=[None,'Jan','Feb','Mar','Apr','May','Jun', 'Jul','Aug','Sep','Oct','Nov','Dec'] months=dict(zip(mn[1:],range(1,13))) for l in sys.stdin: if l[0]=='#': - print('# %s'%strftime('%Y-%m-%d %H:%M:%S'),file=sys.stderr) + print('#1 %s'%strftime('%Y-%m-%d %H:%M:%S'),file=sys.stderr) continue n+=1 ff=ok.match(l) @@ -39,7 +41,7 @@ except: # Unusual month or year field try: - d=parse("%s %s"%(ff.group(2),ff.group(3))) + d=parse("%s %s"%(ff.group(2),ff.group(3)))#,languages=['en']) if d is None or count is None: print(5,ff.group(1),ff.group(2),ff.group(3),ff.group(4), file=sys.stderr) @@ -83,16 +85,14 @@ else: l=' '.join(cols) try: - d=parse(l) + d=parse(l)#,languages=['en'])) if d is None: print(3,d,l,count,file=sys.stderr) bogons+=1 continue elif d.year<1900 or d.year>2100: - # Shouldn't happen - print(8,sn[scheme],d.month,d.year,count, - file=sys.stderr) - bogons+=1 + # Jan 0001 does show up, so log these as early / late + (ed if d.year<1900 else ld)[scheme]+=count continue else: year=d.year @@ -112,9 +112,14 @@ if n==0: # ssh screwed up exit(1) +print('#2 %s'%strftime('%Y-%m-%d %H:%M:%S'),file=sys.stderr) for s in (HTTP,HTTPS): if nd[s]!=0: print(sn[s],0,0,nd[s],sep='\t') + if ed[s]!=0: + print(sn[s],0,1,ed[s],sep='\t') + if ld[s]!=0: + print(sn[s],0,2,ld[s],sep='\t') yy=tab[s] for y in range(201): mm=yy[y] @@ -123,6 +128,7 @@ if mm[m]!=0: print(sn[s],mn[m],y+1900,mm[m],sep='\t') print(n,bogons,file=sys.stderr) +print('#3 %s'%strftime('%Y-%m-%d %H:%M:%S'),file=sys.stderr)
--- a/workers/bin/_fixAndMerge.sh Tue Nov 20 14:49:07 2018 +0000
+++ b/workers/bin/_fixAndMerge.sh Wed Nov 21 18:42:56 2018 +0000
@@ -10,7 +10,7 @@
 cat > /var/data/in$id
 echo \#.$id $(date) got list $(wc -l /var/data/in$id) >> $log
 rm -f /var/data/d$id
-xargs -n 16 _doFetch.sh "$@" < /var/data/in$id >/var/data/d$id
+xargs -n 64 _doFetch.sh "$@" < /var/data/in$id >/var/data/d$id
 echo \#.$id $(date) got data $(wc -l /var/data/d$id) >> $log
 fixDates.py < /var/data/d$id
 echo \#.$id $(date) done >> $log