view bin/per_segment.py @ 64:b14187ccfb46

revert to just showing first LM
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 19 Jul 2023 13:19:42 +0100
parents e82a82ea3704
children
line wrap: on
line source

#!/usr/bin/python3
'''Refactor a per-cdx count table to be per-segment.

Input on STDIN: tab-separated lines.
Usage: per_segment segment-column
Assumes column 0 is empty, count is in column 1.
Segment column is 0-origin; it must not be column 0 or 1, and it may be
the last column.

Output: one file per segment, s0.tsv ... s99.tsv, in the current
directory.  Each row is the summed count followed by the remaining
columns (everything except the empty column, the count and the
segment), rows ordered by decreasing count.

On the first line that cannot be parsed, the line and its fields are
printed to STDOUT and the script exits with status 1 without writing
any output file.
'''

import sys

# Number of segments, and hence of output files.
N_SEGMENTS = 100


class BadLine(ValueError):
  '''An input line that could not be parsed.

  line is the raw input line; fields is what was left of its
  tab-separated fields at the point where parsing gave up.
  '''

  def __init__(self, line, fields):
    super().__init__('bad input line: %r' % line)
    self.line = line
    self.fields = fields


def aggregate(lines, seg_col, n_segments=N_SEGMENTS):
  '''Sum the counts in lines per segment and per key.

  lines is an iterable of tab-separated strings, seg_col the 0-origin
  index of the segment column.  Returns a list of n_segments dicts,
  one per segment, mapping the tab-joined remaining columns to the
  total count.  Raises BadLine for a line with too few columns, a
  non-integer count or segment, or a segment outside 0..n_segments-1.
  '''
  tables = [{} for _ in range(n_segments)]
  for line in lines:
    # Strip the line terminator here so that the key is the same
    # whether or not the segment column is last, and whether or not
    # the final input line ends with a newline.
    fields = line.rstrip('\n').split('\t')
    try:
      segment = int(fields.pop(seg_col))
      # The segment column has already been removed, so the count is
      # still at index 1 provided seg_col is greater than 1.
      count = int(fields.pop(1))
    except (ValueError, IndexError) as err:
      raise BadLine(line, fields) from err
    # A negative segment would otherwise index from the end of tables
    # and be silently added to the wrong segment.
    if not 0 <= segment < n_segments:
      raise BadLine(line, fields)
    key = '\t'.join(fields[1:])  # note we ditch the initial empty column
    table = tables[segment]
    table[key] = table.get(key, 0) + count
  return tables


def format_rows(table):
  '''Return the output rows for one segment, largest count first.

  Each row is 'count<TAB>key<NEWLINE>'.  Keys with equal counts keep
  the order in which they were first seen.
  '''
  ranked = sorted(table.items(), key=lambda p: p[1], reverse=True)
  return ['%d\t%s\n' % (count, key) for (key, count) in ranked]


def write_tables(tables):
  '''Write tables[i] to si.tsv in the current directory, for every i.

  A file is written for every segment, including empty ones.
  '''
  for segment, table in enumerate(tables):
    with open('s%s.tsv' % segment, 'w') as f:
      f.writelines(format_rows(table))


def main():
  '''Command-line entry point: read STDIN, write the per-segment files.'''
  try:
    seg_col = int(sys.argv[1])
  except (IndexError, ValueError):
    sys.exit('Usage: per_segment segment-column')
  try:
    tables = aggregate(sys.stdin, seg_col)
  except BadLine as bad:
    # Diagnostics stay on STDOUT, as before: the offending line, then
    # its fields.
    print(bad.line.rstrip('\n'))
    print(bad.fields)
    sys.exit(1)
  write_tables(tables)


if __name__ == '__main__':
  main()