view lib/python/cc/per_segment.py @ 264:7886d7de5eed

use cdb library directly, sequestration of cdb handle complete and working, nndb counts two loops now, one with and one without counting successes
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 31 Jan 2025 13:31:02 +0000
parents d88c8d40259a
children
line wrap: on
line source

#!/usr/bin/python3
'''refactor a per-cdx count table to be per-segment
input on STDIN
Usage: per_segment segment-column
Assumes column 0 is empty, count is in column 1
Segment column is 0-origin
'''

import sys,os

N_SEGMENTS=100 # segments are numbered 0..N_SEGMENTS-1, one output directory each

def tabulate(lines,seg_col,n_segments=N_SEGMENTS):
  '''Sum the counts of tab-separated input lines per (segment, key).

  lines      -- iterable of input lines, with or without trailing newline
  seg_col    -- 0-origin index of the column holding the segment number
  n_segments -- number of segments; valid segment numbers are 0..n_segments-1

  Each line is expected to have an empty column 0 and the count in column 1.
  The key of a line is what remains after removing the segment column,
  the count column and the initial empty column, re-joined with tabs.

  Returns a list of n_segments dicts mapping key -> summed count.
  Raises ValueError, naming the offending line, if a line is too short,
  the segment or count is not an integer, or the segment is out of range.
  '''
  tables=[dict() for _ in range(n_segments)]
  for line in lines:
    # Strip the newline up front so the key never carries it: this is what
    # makes a segment column in last position work, and stops a final
    # unterminated line from producing a distinct key.
    fields=line.rstrip('\n').split('\t')
    try:
      # Order matters: the segment column is removed first, so the count
      # is still at index 1 provided seg_col > 1.
      seg=int(fields.pop(seg_col))
      count=int(fields.pop(1))
      if not 0<=seg<n_segments:
        # A negative number would otherwise silently index from the end
        raise ValueError('segment %d not in 0..%d'%(seg,n_segments-1))
    except (ValueError,IndexError) as e:
      raise ValueError('bad input line %r: %s'%(line,e)) from e
    key='\t'.join(fields[1:]) # note we ditch the initial empty column
    table=tables[seg]
    table[key]=table.get(key,0)+count
  return tables

def write_tables(tables,root='.'):
  '''Write one <root>/<segment>/ks.tsv file per entry of tables.

  tables -- list of dicts (key -> count), indexed by segment number
  root   -- directory under which the segment directories are created

  Every segment gets a directory and a file, even if its table is empty.
  Rows are "count<TAB>key", sorted by decreasing count; rows with equal
  counts keep the order in which their keys were first seen.
  '''
  for seg,table in enumerate(tables):
    seg_dir=os.path.join(root,str(seg))
    os.makedirs(seg_dir,exist_ok=True)
    with open(os.path.join(seg_dir,'ks.tsv'),'w') as f:
      for key,count in sorted(table.items(),key=lambda p:p[1],reverse=True):
        f.write('%d\t%s\n'%(count,key))

def main(argv=None):
  '''Command-line entry point: per_segment segment-column, input on STDIN.

  Returns the process exit status: 0 on success, 1 on a bad input line
  (nothing is written in that case), 2 on a usage error.
  '''
  argv=sys.argv if argv is None else argv
  try:
    seg_col=int(argv[1])
  except (IndexError,ValueError):
    sys.stderr.write('Usage: per_segment segment-column\n')
    return 2
  try:
    tables=tabulate(sys.stdin,seg_col)
  except ValueError as e:
    # Diagnostics belong on stderr
    sys.stderr.write('%s\n'%e)
    return 1
  write_tables(tables)
  return 0

if __name__=='__main__':
  sys.exit(main())