Mercurial > hg > cc > azure
annotate master/src/wecu/sac_reducer.py @ 62:892e1c0240e1
added more robust (I hope) error handling,
got reducer working with support for choosing dict or tsv output
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Tue, 02 Jun 2020 17:35:07 +0000 |
parents | cfaf5223b071 |
children | d46c8b12fc04 |
rev | line source |
---|---|
61
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
#!/usr/bin/python3
'''merge results from multiple mappers

Usage: sac_reducer by-file|dict|aggregate (numKeys)

  by-file      copy the mapper output through unchanged
  dict         aggregate, then pretty-print the nested dict as "res=..."
  (any other)  aggregate, then print one tab-separated line per key tuple

Input lines: tab-separated, numKeys keys (default 1) followed by count'''

import sys
from pprint import pprint


def _aggregate(lines, num_keys, err):
    '''Sum the counts found in lines into a dict nested num_keys levels deep.

    Each line is split on tabs; the first num_keys fields are the keys and
    the field after them is an integer count.  Trailing whitespace is
    stripped from the last key only.  Lines with too few fields or a
    non-integer count are reported on err (prefixed with "bogus") and
    skipped.  Returns the nested dict; its leaves are the summed counts.'''
    res = {}
    for line in lines:
        fields = line.split('\t')
        # Parse the count before touching res, so that a rejected line
        # cannot leave empty intermediate dicts behind.
        try:
            count = int(fields[num_keys])
        except (IndexError, ValueError):
            print('bogus', line, fields, file=err)
            continue
        node = res
        for key in fields[:num_keys - 1]:
            node = node.setdefault(key, {})
        last = fields[num_keys - 1].rstrip()
        node[last] = node.get(last, 0) + count
    return res


def _count_leaves(tree):
    '''Return the number of non-dict values reachable from tree.'''
    if not isinstance(tree, dict):
        return 1
    return sum(_count_leaves(sub) for sub in tree.values())


def _rows(tree, prefix=()):
    '''Yield one (key1, ..., keyN, count) tuple per leaf, in insertion order.'''
    for key, value in tree.items():
        if isinstance(value, dict):
            yield from _rows(value, prefix + (key,))
        else:
            yield prefix + (key, value)


def main(argv=None, stdin=None, stdout=None, stderr=None):
    '''Run the reducer and return a process exit status.

    argv, stdin, stdout and stderr default to the corresponding sys
    attributes, looked up at call time.  Returns 0 on success and 2 when
    the command line is unusable (a message is written to stderr).'''
    argv = sys.argv if argv is None else argv
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    stderr = sys.stderr if stderr is None else stderr

    print('reducing', argv, file=stderr)
    stderr.flush()

    if len(argv) < 2:
        print('Usage: sac_reducer by-file|dict|aggregate (numKeys)',
              file=stderr)
        return 2
    rtype = argv[1]
    # As before, numKeys is only honoured when it is the sole extra argument.
    try:
        num_keys = int(argv[2]) if len(argv) == 3 else 1
    except ValueError:
        print('numKeys must be an integer, got', repr(argv[2]), file=stderr)
        return 2
    if num_keys < 1:
        print('numKeys must be at least 1, got', num_keys, file=stderr)
        return 2

    if rtype == 'by-file':
        # Show results by file
        for line in stdin:
            stdout.write(line)
        return 0

    # Aggregate results
    res = _aggregate(stdin, num_keys, stderr)

    # Diagnostic: number of top-level keys, the keys themselves and, when
    # the keys are nested, how many leaf entries sit under each of them.
    print('nc', len(res),
          list(res),
          [_count_leaves(sub) for sub in res.values()] if num_keys > 1 else '',
          file=stderr)

    if rtype == 'dict':
        print('res=', end='', file=stdout)
        pprint(res, stream=stdout)
    else:
        for row in _rows(res):
            print(*row, sep='\t', file=stdout)
    return 0


if __name__ == '__main__':
    sys.exit(main())