changeset 23:e82a82ea3704

sic
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 21 Oct 2022 18:09:53 +0100
parents 38bab758e469
children e6adf484ebb4
files bin/per_segment.py
diffstat 1 files changed, 36 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/per_segment.py	Fri Oct 21 18:09:53 2022 +0100
@@ -0,0 +1,36 @@
+#!/usr/bin/python3
+'''Refactor a per-cdx count table to be per-segment.
+Input on STDIN; writes s0.tsv .. s99.tsv (one per segment) in the current dir.
+Usage: per_segment segment-column
+Assumes column 0 is empty, count is in column 1
+Segment column is 0-origin and must be greater than 1
+'''
+
+import sys
+
+c=int(sys.argv[1])
+
+ss=[dict() for i in range(100)]
+
+for l in sys.stdin:
+  # drop the line end first, so keys stay newline-free even if c is the last column
+  cc=l.rstrip('\n').split('\t')
+  try:
+    s=int(cc.pop(c))   # segment number; remove it first so the count is still at 1
+    n=int(cc.pop(1))
+    ll='\t'.join(cc[1:]) # note we ditch the initial empty column
+    ss[s][ll]=ss[s].get(ll,0)+n
+  except (ValueError,IndexError):
+    # bad number, too few columns or segment > 99: echo the row and what is left of it
+    sys.stdout.write(l)
+    print(cc)
+    sys.exit(1)
+
+# one file per segment, rows in descending order of summed count
+for s in range(100):
+  with open('s%s.tsv'%s,'w') as f:
+    for (k,v) in sorted(ss[s].items(),key=lambda p:p[1],reverse=True):
+      f.write(str(v))
+      f.write('\t')
+      f.write(k)
+      f.write('\n')