view bin/per_segment.py @ 93:25bd398a8035

improve reordering, still failing on cdx-00004
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 06 Sep 2023 18:51:21 +0100
parents e82a82ea3704
children
line wrap: on
line source

#!/usr/bin/python3
'''refactor a per-cdx count table to be per-segment
input on STDIN
Usage: per_segment segment-column
Assumes column 0 is empty, count is in column 1
Segment column is 0-origin
'''

import sys

N_SEGMENTS=100 # segment numbers must lie in range(N_SEGMENTS)

def _parse_line(line,seg_col,n_segments=N_SEGMENTS):
  '''Split one tab-separated input line into (segment, count, key).

  key is the line with column 0, the count column (1) and the segment
  column removed, re-joined with tabs and without its line terminator.
  Raises ValueError or IndexError if a column is missing, is not an
  integer, or the segment number is outside range(n_segments).
  '''
  # Drop the terminator here so that the key is the same whether or not
  #  the segment column is the last one on the line
  fields=line.rstrip('\n').split('\t')
  # Order matters: removing the segment column first leaves the count at
  #  index 1 as long as the segment column is to the right of it
  seg=int(fields.pop(seg_col))
  count=int(fields.pop(1))
  if not 0<=seg<n_segments:
    # Without this check a negative segment number would silently be
    #  counted from the end of the list of tables
    raise IndexError('segment %s out of range'%seg)
  return seg,count,'\t'.join(fields[1:]) # ditch the initial empty column

def _write_tables(tables):
  '''Write tables[i] to s<i>.tsv in the current directory.

  Each file gets one count<TAB>key line per entry, largest count first;
  entries with equal counts keep the order in which they were first seen.
  '''
  for seg,table in enumerate(tables):
    with open('s%s.tsv'%seg,'w') as f:
      for key,count in sorted(table.items(),key=lambda p:p[1],reverse=True):
        f.write('%s\t%s\n'%(count,key))

def main(argv=None,stdin=None):
  '''Re-tabulate the counts read from stdin into one file per segment.

  argv defaults to sys.argv and stdin to sys.stdin; argv[1] is the
  0-origin index of the segment column.
  Returns 0 on success, 1 if an input line could not be parsed (the line
  and its fields are echoed to stdout and no file is written), 2 if the
  segment-column argument is missing.
  '''
  if argv is None:
    argv=sys.argv
  if stdin is None:
    stdin=sys.stdin
  if len(argv)<2:
    sys.stderr.write('Usage: per_segment segment-column\n')
    return 2
  seg_col=int(argv[1])
  tables=[dict() for _ in range(N_SEGMENTS)]
  for line in stdin:
    try:
      seg,count,key=_parse_line(line,seg_col)
    except (ValueError,IndexError):
      # Echo the offending line and its fields, then stop before any
      #  table is written
      sys.stdout.write(line)
      print(line.rstrip('\n').split('\t'))
      return 1
    table=tables[seg]
    table[key]=table.get(key,0)+count
  _write_tables(tables)
  return 0

if __name__=='__main__':
  sys.exit(main())