annotate bin/build_idx.py @ 119:1d12b51c4d59

minor bug wrt EOF of final cdx input file
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 27 Sep 2023 17:29:51 +0100
parents 6104acc1345b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
106
6104acc1345b first try
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
#!/usr/bin/python3
'''Turn a merge_nnn.log file into a cluster.idx file

   We cheat and use the old cluster.idx to save having to read
   all the cdx-....gz files.

   Reads ./cluster.idx and writes ./new.idx.  Every line of the old
   index is split on whitespace into six fields (surt, datestamp, file,
   offset, length, cnt).  Whenever the file field differs from the cdx
   file name currently expected, the counter i is advanced and
   merge_<i+1>.log is opened; its first line must hold two fields, the
   first being the integer i+1.  Each later line of that log is read as
   an integer length for the corresponding index line.  The line
   written to new.idx keeps surt, datestamp, file and cnt, and replaces
   the offset and length with a running offset (restarted at 0 for each
   log) and the length taken from the log.

   Exits with status 1 if a log runs out of lines before the old index
   does; raises ValueError if a log header carries the wrong number.'''
import sys

with open('cluster.idx','r') as oidx, open('new.idx','w') as nidx:
    i=-1
    curpos=0
    # Formats as "cdx-00-01.gz" for i=-1, so the first index line is
    #  expected to differ from it and trigger opening merge_1.log
    target="cdx-00%03d.gz"%i
    log=None # no log open yet
    try:
        for ol in oidx:
            # The old offset and length are discarded: they are what
            #  this script recomputes
            (surt, datestamp, file, _offset, _length, cnt) = ol.split()
            if file!=target:
                # Move on to the next cdx file and its merge log
                i+=1
                target="cdx-00%03d.gz"%i
                if log is not None:
                    log.close()
                curpos=0
                log=open('merge_%d.log'%(i+1),'r')
                hdr=log.readline()
                (j,_f) = hdr.split()
                sys.stderr.write(hdr)
                if int(j)!=i+1:
                    raise ValueError("wrong file: i=%s, j=%s"%(i,j))
            # One log line per index line: the new length of this entry
            nl=log.readline()
            if not nl:
                # Log ended before the old index did
                sys.stderr.write('quiting early: %s\n'%i)
                sys.exit(1)
            nlen=int(nl)
            nidx.write("%s %s\t%s\t%s\t%s\t%s\n"%(surt, datestamp, file, curpos, nlen, cnt))
            curpos+=nlen
    finally:
        # Close the last log on every exit path (normal end of input,
        #  ValueError, or the early sys.exit above)
        if log is not None:
            log.close()