changeset 106:6104acc1345b

first try
author Henry Thompson <ht@markup.co.uk>
date Thu, 14 Sep 2023 19:27:23 +0100
parents 9403c02d5034
children 40c460fed99f
files bin/build_idx.py
diffstat 1 files changed, 31 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/build_idx.py	Thu Sep 14 19:27:23 2023 +0100
@@ -0,0 +1,31 @@
+#!/usr/bin/python3
+'''Turn a merge_nnn.log file into a cluster.idx file
+   We cheat and use the old cluster.idx to save having to read
+   all the cdx-....gz files''' # reads merge_<n>.log in turn, writes new.idx
+import sys
+
+with open('cluster.idx','r') as oidx, open('new.idx','w') as nidx: # old index in, rebuilt index out
+  i=-1 # number of the current cdx file; starts at -1 so the first line forces a switch
+  curpos=0 # running offset, written as the 4th output field
+  target="cdx-00%03d.gz"%i # sentinel name; presumably matches no real file -- TODO confirm
+  log=open("/dev/null",'r') # embarrassing hack: dummy handle so the first log.close() below works
+  for ol in oidx: # one old index line per iteration
+    (surt, datestamp, file, offset, length, cnt) = ol.split() # old offset and length are discarded
+    if file!=target: # file name changed: advance to the next cdx file and its log
+      i+=1 # NOTE(review): assumes files appear in numeric order; file is not re-checked against the new target
+      target="cdx-00%03d.gz"%i
+      log.close() # done with the previous log (or the /dev/null dummy)
+      curpos=0 # offsets restart from 0 for each cdx file
+      log=open('merge_%d.log'%(i+1),'r') # logs are numbered from 1, cdx files from 0
+      hdr=log.readline() # first line of each log is a header
+      (j,f) = hdr.split() # header must have exactly two fields; only j is used
+      sys.stderr.write(hdr) # echo the header to stderr
+      if int(j)!=i+1: # header number must agree with the log file's number
+        raise ValueError("wrong file: i=%s, j=%s"%(i,j))
+    nl=log.readline() # one log line is consumed per index line
+    if not nl: # log hit EOF while index lines for this file remain
+      sys.stderr.write('quiting early: %s\n'%i)
+      exit(1)
+    nlen=int(nl) # each log line is a single integer, used as the length field
+    nidx.write("%s %s\t%s\t%s\t%s\t%s\n"%(surt, datestamp, file, curpos, nlen, cnt)) # old fields kept; offset/length replaced by curpos/nlen
+    curpos+=nlen # next entry starts where this one ends