changeset 34:ad6eff2bc6f9

Fixes to logging and efficiency; see also notes.txt regarding the patches to dateparser.
author Henry S. Thompson <ht@markup.co.uk>
date Wed, 21 Nov 2018 18:42:56 +0000
parents 4c117ee8ed75
children 1b6bcc54268d
files master/bin/fixDates.py workers/bin/_fixAndMerge.sh
diffstat 2 files changed, 15 insertions(+), 9 deletions(-) [+]
line wrap: on
line diff
--- a/master/bin/fixDates.py	Tue Nov 20 14:49:07 2018 +0000
+++ b/master/bin/fixDates.py	Wed Nov 21 18:42:56 2018 +0000
@@ -14,13 +14,15 @@
 http_ytab=list(201*[None]) # 1900--2100
 https_ytab=list(201*[None])
 tab=[http_ytab,https_ytab]
-nd=[0,0]
+nd=[0,0] # no date
+ed=[0,0] # date < 1900
+ld=[0,0] # date > 2100
 mn=[None,'Jan','Feb','Mar','Apr','May','Jun',
         'Jul','Aug','Sep','Oct','Nov','Dec']
 months=dict(zip(mn[1:],range(1,13)))
 for l in sys.stdin:
   if l[0]=='#':
-    print('# %s'%strftime('%Y-%m-%d %H:%M:%S'),file=sys.stderr)
+    print('#1 %s'%strftime('%Y-%m-%d %H:%M:%S'),file=sys.stderr)
     continue
   n+=1
   ff=ok.match(l)
@@ -39,7 +41,7 @@
     except:
       # Unusual month or year field
       try:
-        d=parse("%s %s"%(ff.group(2),ff.group(3)))
+        d=parse("%s %s"%(ff.group(2),ff.group(3)))#,languages=['en'])
         if d is None or count is None:
           print(5,ff.group(1),ff.group(2),ff.group(3),ff.group(4),
                 file=sys.stderr)
@@ -83,16 +85,14 @@
     else:
       l=' '.join(cols)
       try:
-        d=parse(l)
+        d=parse(l)#,languages=['en']))
         if d is None:
           print(3,d,l,count,file=sys.stderr)
           bogons+=1
           continue
         elif d.year<1900 or d.year>2100:
-          # Shouldn't happen 
-          print(8,sn[scheme],d.month,d.year,count,
-                file=sys.stderr)
-          bogons+=1
+          # Jan 0001 does show up, so log these as early / late
+          (ed if d.year<1900 else ld)[scheme]+=count
           continue
         else:
           year=d.year
@@ -112,9 +112,14 @@
 if n==0:
   # ssh screwed up
   exit(1)
+print('#2 %s'%strftime('%Y-%m-%d %H:%M:%S'),file=sys.stderr)
 for s in (HTTP,HTTPS):
   if nd[s]!=0:
     print(sn[s],0,0,nd[s],sep='\t')
+  if ed[s]!=0:    
+    print(sn[s],0,1,ed[s],sep='\t')
+  if ld[s]!=0:    
+    print(sn[s],0,2,ld[s],sep='\t')
   yy=tab[s]
   for y in range(201):
     mm=yy[y]
@@ -123,6 +128,7 @@
         if mm[m]!=0:
           print(sn[s],mn[m],y+1900,mm[m],sep='\t')
 print(n,bogons,file=sys.stderr)
+print('#3 %s'%strftime('%Y-%m-%d %H:%M:%S'),file=sys.stderr)
 
 
 
--- a/workers/bin/_fixAndMerge.sh	Tue Nov 20 14:49:07 2018 +0000
+++ b/workers/bin/_fixAndMerge.sh	Wed Nov 21 18:42:56 2018 +0000
@@ -10,7 +10,7 @@
 cat > /var/data/in$id
 echo \#.$id $(date) got list $(wc -l /var/data/in$id) >> $log
 rm -f /var/data/d$id
-xargs -n 16 _doFetch.sh "$@" < /var/data/in$id >/var/data/d$id
+xargs -n 64 _doFetch.sh "$@" < /var/data/in$id >/var/data/d$id
 echo \#.$id $(date) got data  $(wc -l /var/data/d$id) >> $log
 fixDates.py < /var/data/d$id
 echo \#.$id $(date) done >> $log