changeset 153:58df6981269e

build cluster.idx
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 06 Oct 2023 15:06:53 +0100
parents 85343fe48f69
children 5d30cd8c6254
files lib/python/cc/lmh/idx.py
diffstat 1 files changed, 47 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/lmh/idx.py	Fri Oct 06 15:06:53 2023 +0100
@@ -0,0 +1,47 @@
+#!/usr/bin/python3
+'''Write a cdx index file (cluster.idx, placed in cdxdir) based on a directory
+of cdx files and a directory of corresponding logs from merge_date.py
+
+Usage: idx.py cdxdir logdir
+'''
+
+import glob, sys, re, io
+from isal import igzip
+from os.path import basename, join
+
+
+cdx_dir = sys.argv[1]  # directory holding the cdx-*.gz files
+log_dir = sys.argv[2]  # directory holding the merge_*.log files
+# captures the number N in a cdx-N.gz or merge_N.log name (raw string: \. is for re)
+IPAT = re.compile(r'(?:cdx-|merge_)([0-9]*)\.(?:gz|log)$')
+
+def fx(x):
+  found = IPAT.search(x)  # digits just before the .gz/.log suffix of a cdx or log name
+  return int(found.group(1))
+
+# both lists are ordered by the number in the file name (fx), not alphabetically
+cdx_files = sorted(glob.iglob(join(cdx_dir, "cdx-*.gz")), key=fx)
+log_files = sorted(glob.iglob(join(log_dir, "merge_[0-9]*.log")), key=fx)
+
+# note that cdx-file names are 0-origin, log-file names are 1-origin
+# cluster.idx gets one line per block: key date \t cdx name \t offset \t length \t block no.
+k = 0  # running 1-origin number of the current block, counted over all cdx files
+with open(join(cdx_dir,"cluster.idx"),'wb') as idx:
+  for i, cdx_path in enumerate(cdx_files):
+    with open(cdx_path,'rb') as cdx_gz, open(log_files[i],'r') as log:
+      j, fn = log.readline().split()  # log header: 1-origin file number, cdx name
+      assert int(j) == i+1
+      assert basename(fn)+'.gz' == (cfn:=basename(cdx_path))
+      print(fn,file=sys.stderr)
+      cpos = 0  # byte offset of the current block within this cdx file
+      while ll:=log.readline():  # each further log line is one block's byte length
+        clen=int(ll)
+        k += 1
+        gz = cdx_gz.read(clen)  # exactly one compressed block, whatever its size
+        if len(gz) != clen:  # cdx file ended early: fail rather than mis-index
+          raise EOFError('%s: short read of %d-byte block at %d'%(cfn, clen, cpos))
+        with igzip.IGzipFile(fileobj=io.BytesIO(gz)) as cdx:
+          key, cdate, _ = cdx.readline().split(b' ',maxsplit=2)  # block's first record
+          idx.write(b'%s %s\t%s\t%d\t%d\t%d\n'%(key, cdate, bytes(cfn,'ascii'),
+                                                cpos, clen, k))
+        cpos += clen