diff master/src/wecu/sac_reducer.py @ 62:892e1c0240e1

added more robust (I hope) error handling, got reducer working with support for choosing dict or tsv output
author Henry S. Thompson <ht@markup.co.uk>
date Tue, 02 Jun 2020 17:35:07 +0000
parents cfaf5223b071
children d46c8b12fc04
line wrap: on
line diff
--- a/master/src/wecu/sac_reducer.py	Sun May 31 12:06:44 2020 +0000
+++ b/master/src/wecu/sac_reducer.py	Tue Jun 02 17:35:07 2020 +0000
@@ -1,29 +1,49 @@
 #!/usr/bin/python3
+'''merge results from multiple mappers
+
+# Usage: sac_reducer by-file|dict|aggregate [numKeys]  (aggregated output is TSV, or a pprinted dict for 'dict')
+
+Input lines: tab-separated, numKeys keys (default 1) followed by count'''
 
 import sys
-from collections import defaultdict
+from pprint import pprint
 
 print('reducing',sys.argv,file=sys.stderr)
+sys.stderr.flush()
 
-if sys.argv[1] == 'by-file':
+rtype=sys.argv[1]
+numKeys=int(sys.argv[2]) if len(sys.argv)==3 else 1
+numDicts=numKeys-1
+
+if rtype == 'by-file':
     # Show results by file
     for line in sys.stdin:
-        print(line.strip())
+        sys.stdout.write(line)  # only `sys` is imported; a bare `stdout` is a NameError
 else:
     # Aggregate results
-    counters = defaultdict(int)
+    res={}
 
     for line in sys.stdin:
+        d=res
         try:
-            line = line.strip().split('\t')
-            k = line[0] 
-            v = line[1]
-        except:
-            print('bogus',line,file=sys.stderr)
+            ll = line.split('\t',numKeys+1)  # split one past the count so extra columns can't pollute it
+            for i in range(numDicts):
+                d=d.setdefault(ll[i],dict())
+            k=ll[numDicts].rstrip()
+            d[k]=d.get(k,0)+int(ll[numKeys])
+        except Exception:
+            print('bogus',line,ll,file=sys.stderr)
             continue
 
-        counters[k] += int(v)
-
-    print('nc',len(counters),file=sys.stderr)
-    for k,v in counters.items():
-        print("{}\t{}".format(k, v))
+    # stderr summary: top-level key count, the keys, and (3+ keys) third-level entry totals per key
+    sizes=[sum(len(v2) for v2 in v1.values()) for v1 in res.values()] if numKeys>2 else ''
+    print('nc',len(res),list(res.keys()),sizes,file=sys.stderr)
+    if rtype=='dict':
+        print('res=',end='')
+        pprint(res)
+    else:
+        def rows(d,pre=()):  # flatten nested dicts of any depth into (key1,...,keyN,count) tuples
+            for k,v in d.items():
+                yield from rows(v,pre+(k,)) if isinstance(v,dict) else [pre+(k,v)]
+        for row in rows(res):
+            print(*row,sep='\t')