Mercurial > hg > cc > cirrus_work
diff bin/per_segment.py @ 23:e82a82ea3704
sic
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 21 Oct 2022 18:09:53 +0100 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/python3
'''Refactor a per-cdx count table to be per-segment.

Input on STDIN, one tab-separated row per line.
Usage: per_segment segment-column
Assumes column 0 is empty, count is in column 1
Segment column is 0-origin

Output: one file per segment, s0.tsv ... s99.tsv, written to the current
directory.  Each output row is the summed count, a tab, and the remaining
columns of the input row (i.e. without the empty column, the count and
the segment), most frequent first.
'''

import sys

# Number of segments, and therefore of output files (s0.tsv .. s99.tsv).
N_SEGMENTS = 100


def add_row(tables, line, seg_col):
    '''Add the count from one input row to the table of its segment.

    tables  -- list of dicts, one per segment, mapping the tab-joined
               remaining columns of a row to their running total
    line    -- one raw input row, with or without its trailing newline
    seg_col -- 0-origin index of the segment column

    Raises ValueError if the segment or count is not an integer, and
    IndexError if a column is missing or the segment number is not a
    valid index into tables.
    '''
    # Drop the line terminator first: otherwise it stays glued to the last
    # column, so the key only ends in a newline when the segment column is
    # not the last one, and output rows would run together.
    fields = line.rstrip('\n').split('\t')
    # Pop the segment before the count, as the usage contract assumes.
    segment = int(fields.pop(seg_col))
    count = int(fields.pop(1))
    # A negative segment would silently wrap round to the end of the list.
    if not 0 <= segment < len(tables):
        raise IndexError('segment %s out of range' % segment)
    key = '\t'.join(fields[1:])  # note we ditch the initial empty column
    table = tables[segment]
    table[key] = table.get(key, 0) + count


def write_segment_tables(tables):
    '''Write tables[i] to s<i>.tsv, one row per key, highest count first.

    A file is written for every segment, even one with no rows.
    '''
    for segment, table in enumerate(tables):
        rows = sorted(table.items(), key=lambda kv: kv[1], reverse=True)
        with open('s%s.tsv' % segment, 'w') as f:
            # The newline is written explicitly because keys no longer
            # carry one.
            f.writelines('%s\t%s\n' % (count, key) for key, count in rows)


def main(argv=None):
    '''Read the count table from STDIN and write the per-segment files.

    argv defaults to sys.argv; argv[1] is the segment column.  On the
    first malformed row the row and its split fields are echoed to
    STDOUT and the process exits with status 1, without writing any
    output file.
    '''
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit('Usage: per_segment segment-column')
    seg_col = int(argv[1])

    tables = [dict() for _ in range(N_SEGMENTS)]
    for line in sys.stdin:
        try:
            add_row(tables, line, seg_col)
        except (ValueError, IndexError):
            # Show the offending row and how it was split, then give up.
            sys.stdout.write(line)
            print(line.split('\t'))
            sys.exit(1)

    write_segment_tables(tables)


if __name__ == '__main__':
    main()