#!/usr/bin/python3
'''Refactor a per-cdx count table to be per-segment.

Input is read from STDIN as tab-separated rows.

Usage: per_segment segment-column

Assumes column 0 is empty and the count is in column 1.
The segment column is 0-origin and holds an integer in the range 0..99.

For every segment number s one file 's<s>.tsv' is written to the current
directory (all 100 files are always created, possibly empty).  Each file
holds the rows for that segment with the empty column, the count column
and the segment column removed, counts for identical rows summed, and the
summed count put in front, sorted by descending count.
'''

import sys

# Number of per-segment output tables/files.
N_SEGMENTS = 100


class BadRow(ValueError):
    '''An input row whose segment or count column could not be used.

    Attributes:
        line: the raw input line, exactly as read.
        fields: the split fields as they stood when parsing failed
            (columns already consumed have been removed).
    '''

    def __init__(self, line, fields):
        super().__init__('cannot parse row: %r' % (line,))
        self.line = line
        self.fields = fields


def tally_segments(lines, seg_col, n_segments=N_SEGMENTS):
    '''Sum the counts of identical rows, separately for each segment.

    Args:
        lines: iterable of tab-separated text rows, with or without a
            trailing newline.
        seg_col: 0-origin index of the column holding the segment number.
        n_segments: number of segments; valid segment numbers are
            0..n_segments-1.

    Returns:
        A list of n_segments dicts.  Entry s maps the remaining columns of
        a row (tab-joined, without a newline) to the summed count for
        segment s.  Dict order is first-seen order.

    Raises:
        BadRow: if a row has too few columns, a non-integer segment or
            count, or a segment number outside 0..n_segments-1.
    '''
    tables = [dict() for _ in range(n_segments)]
    for line in lines:
        # Strip the newline before splitting so that it never ends up
        # glued to whichever column happens to be last -- this is what
        # makes a last-column segment work and lets an unterminated final
        # line share a key with its terminated twins.
        fields = line.rstrip('\n').split('\t')
        try:
            # Order matters: the segment column is removed first, so the
            # count is still at index 1 (segment column must be > 1).
            segment = int(fields.pop(seg_col))
            count = int(fields.pop(1))
            if not 0 <= segment < n_segments:
                # A negative value would otherwise silently index from
                # the end of the table list.
                raise IndexError('segment %s out of range' % segment)
        except (ValueError, IndexError) as err:
            raise BadRow(line, fields) from err
        key = '\t'.join(fields[1:])  # note we ditch the initial empty column
        table = tables[segment]
        table[key] = table.get(key, 0) + count
    return tables


def render_table(table):
    '''Return the output lines for one segment table.

    Each line is '<count>\\t<key>\\n'; lines are ordered by descending
    count, ties keeping the order in which the keys were first seen.
    '''
    ranked = sorted(table.items(), key=lambda pair: pair[1], reverse=True)
    return ['%s\t%s\n' % (count, key) for (key, count) in ranked]


def write_segment_tables(tables):
    '''Write table number s to 's<s>.tsv' in the current directory.'''
    for seg, table in enumerate(tables):
        with open('s%s.tsv' % seg, 'w') as out:
            out.writelines(render_table(table))


def main(argv=None):
    '''Command-line entry point; see the module docstring for usage.'''
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit(__doc__)
    seg_col = int(argv[1])
    try:
        tables = tally_segments(sys.stdin, seg_col)
    except BadRow as bad:
        # Same diagnostics as before: echo the offending line and what is
        # left of its fields on stdout, then give up with status 1.
        sys.stdout.write(bad.line)
        print(bad.fields)
        sys.exit(1)
    write_segment_tables(tables)


if __name__ == '__main__':
    main()