Mercurial > hg > cc > azure
annotate master/src/wecu/sac_reducer.py @ 68:1f04bce6ead7 default tip
use basefile instead of transferfile, and remove cleanup: belt and braces wrt lossage of sac_schemes.py in 15% of 1000_k3,
this as used in a_2
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Thu, 04 Jun 2020 20:44:44 +0000 |
parents | b91e44355bbf |
children |
rev | line source |
---|---|
61
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
1 #!/usr/bin/python3 |
62
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
2 '''merge results from multiple mappers |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
3 |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
4 # Usage: sac_reducer by-file|aggregate (numKeys) |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
5 |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
6 Input lines: tab-separated, numKeys keys (default 1) followed by count''' |
57
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
7 |
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
8 import sys |
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
9 |
64
b91e44355bbf
fix minor argument passing snafus
Henry S. Thompson <ht@markup.co.uk>
parents:
63
diff
changeset
|
10 print('reducing',sys.argv,file=sys.stderr) |
b91e44355bbf
fix minor argument passing snafus
Henry S. Thompson <ht@markup.co.uk>
parents:
63
diff
changeset
|
11 sys.stderr.flush() |
61
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
60
diff
changeset
|
12 |
62
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
13 rtype=sys.argv[1] |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
14 numKeys=int(sys.argv[2]) if len(sys.argv)==3 else 1 |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
15 numDicts=numKeys-1 |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
16 |
63
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
17 def rec_print(d,buf,pos=0): |
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
18 if pos!=0: |
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
19 pos+=buf.write(b'\t') |
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
20 for k,v in d.items(): |
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
21 npos=pos+buf.write(k.encode()) |
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
22 #print(pos,buf.tell(),npos,file=sys.stderr) |
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
23 if isinstance(v,dict): |
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
24 rec_print(v,buf,npos) |
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
25 else: |
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
26 buf.write(b'\t') |
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
27 buf.write(b'%d'%v) |
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
28 buf.write(b'\n') |
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
29 buf.truncate() |
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
30 buf.seek(0) |
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
31 sys.stdout.buffer.write(buf.read(-1)) |
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
32 buf.seek(pos) |
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
33 |
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
34 |
62
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
35 if rtype == 'by-file': |
57
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
36 # Show results by file |
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
37 for line in sys.stdin: |
62
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
38 stdout.write(line) |
57
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
39 else: |
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
40 # Aggregate results |
62
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
41 res={} |
57
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
42 |
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
43 for line in sys.stdin: |
62
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
44 d=res |
57
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
45 try: |
63
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
46 ll = line.split('\t',numKeys+1) |
62
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
47 for i in range(numDicts): |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
48 d=d.setdefault(ll[i],dict()) |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
49 k=ll[numDicts].rstrip() |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
50 d[k]=d.get(k,0)+int(ll[numKeys]) |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
51 except Exception: |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
52 print('bogus',line,ll,file=sys.stderr) |
57
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
53 continue |
ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
54 |
64
b91e44355bbf
fix minor argument passing snafus
Henry S. Thompson <ht@markup.co.uk>
parents:
63
diff
changeset
|
55 print('nc',len(res),file=sys.stderr) |
b91e44355bbf
fix minor argument passing snafus
Henry S. Thompson <ht@markup.co.uk>
parents:
63
diff
changeset
|
56 if numKeys>1: |
b91e44355bbf
fix minor argument passing snafus
Henry S. Thompson <ht@markup.co.uk>
parents:
63
diff
changeset
|
57 print(' ',list(res.keys()),"\n ", |
b91e44355bbf
fix minor argument passing snafus
Henry S. Thompson <ht@markup.co.uk>
parents:
63
diff
changeset
|
58 list(sum(len(res[i][j]) for j in res[i].keys()) for i in res.keys()), file=sys.stderr) |
62
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
59 if rtype=='dict': |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
60 print('res=',end='') |
63
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
61 from pprint import pprint |
62
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
62 pprint(res) |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
63 else: |
63
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
64 from io import BufferedRandom, BytesIO |
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
65 rec_print(res,BufferedRandom(BytesIO(),10000)) |
d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
Henry S. Thompson <ht@markup.co.uk>
parents:
62
diff
changeset
|
66 |