#!/usr/bin/python3
# Provenance: recovered from a whitespace-collapsed Mercurial changeset patch
#   HG changeset patch
#   User Henry S. Thompson
#   Date 1696601213 -3600
#   Node ID 58df6981269e8702636da2553c44c2695f3d9d77
#   Parent 85343fe48f696af01a317d3812aa0d829cfdeb39
#   build cluster.idx
#   diff -r 85343fe48f69 -r 58df6981269e lib/python/cc/lmh/idx.py (new file)
'''Write a cdx index file based on a directory of cdx files and a directory
of corresponding logs from merge_date.py

Usage: idx.py cdxdir logdir

Each log file starts with a header line "<1-origin index> <cdx path without
.gz>", followed by one line per compressed block giving that block's length
in bytes.  For every block one line is written to <cdxdir>/cluster.idx:

    <key> <date>\\t<cdx file name>\\t<offset>\\t<length>\\t<block number>

where <key> and <date> come from the first record of the block and
<block number> counts blocks across all cdx files, starting at 1.
'''

import glob
import io
import re
import sys
from os.path import basename, join

try:
    # python-isal provides a drop-in replacement for gzip.GzipFile
    from isal import igzip
    GzipReader = igzip.IGzipFile
except ImportError:
    # Fall back to the standard library so the script still runs without isal
    from gzip import GzipFile as GzipReader


# Raw string: '\.' in a plain string literal is an invalid escape sequence.
IPAT = re.compile(r'(?:cdx-|merge_)([0-9]*)\.(?:gz|log)$')


def fx(x: str) -> int:
    '''Extract the file index number from a cdx or log file name.

    Raises ValueError if the name does not end in cdx-<digits>.gz or
    merge_<digits>.log.
    '''
    m = IPAT.search(x)
    if m is None:
        raise ValueError(f'no file index found in file name {x!r}')
    return int(m.group(1))


def _index_file(idx, i: int, cdx_path: str, log_path: str, k: int) -> int:
    '''Append index lines for every block of one cdx file to *idx*.

    idx      -- binary file object open for writing (the cluster index)
    i        -- 0-origin position of this cdx file in the sorted list
    cdx_path -- path of the concatenated-gzip cdx file
    log_path -- path of the matching merge log
    k        -- number of blocks indexed so far (across all files)

    Returns the updated block count.  Raises ValueError if the log header
    does not match the cdx file or a block cannot be read in full.
    '''
    cfn = basename(cdx_path)
    with open(cdx_path, 'rb') as cdx_gz, open(log_path, 'r') as log:
        j, fn = log.readline().split()
        # note that cdx-file names are 0-origin, log-file names are 1-origin
        # Explicit checks rather than assert, which is stripped under -O.
        if int(j) != i + 1:
            raise ValueError(
                f'{log_path}: header index {j} does not match expected {i + 1}')
        if basename(fn) + '.gz' != cfn:
            raise ValueError(
                f'{log_path}: header names {fn!r}, which does not match {cfn!r}')
        print(fn, file=sys.stderr)
        cfn_bytes = bytes(cfn, 'ascii')
        cpos = 0
        for ll in log:
            clen = int(ll)
            if clen <= 0:
                raise ValueError(f'{log_path}: invalid block length {clen}')
            k += 1
            # read() returns fewer than clen bytes only at end of file, so a
            # short result means the cdx file and its log disagree.
            block = cdx_gz.read(clen)
            if len(block) != clen:
                raise ValueError(
                    f'{cdx_path}: expected a {clen}-byte block at offset '
                    f'{cpos}, got only {len(block)} bytes')
            with GzipReader(fileobj=io.BytesIO(block)) as cdx:
                # Only the first record of the block is needed; unpacking
                # into three names also checks that it has three fields.
                key, cdate, _props = cdx.readline().split(b' ', maxsplit=2)
            idx.write(b'%s %s\t%s\t%d\t%d\t%d\n' % (key, cdate, cfn_bytes,
                                                     cpos, clen, k))
            cpos += clen
    return k


def build_index(cdx_dir: str, log_dir: str) -> int:
    '''Write <cdx_dir>/cluster.idx from the cdx files and their merge logs.

    cdx files (cdx-*.gz in cdx_dir) and logs (merge_[0-9]*.log in log_dir)
    are paired in order of their file index numbers.  Returns the total
    number of blocks indexed.  Raises ValueError if there are fewer logs
    than cdx files, or if a log and its cdx file are inconsistent.
    '''
    cdx_files = sorted(glob.glob(join(cdx_dir, "cdx-*.gz")), key=fx)
    log_files = sorted(glob.glob(join(log_dir, "merge_[0-9]*.log")), key=fx)
    # Check before opening the output so a bad run leaves no partial index.
    if len(log_files) < len(cdx_files):
        raise ValueError(
            f'found {len(cdx_files)} cdx files in {cdx_dir!r} but only '
            f'{len(log_files)} logs in {log_dir!r}')
    k = 0
    with open(join(cdx_dir, "cluster.idx"), 'wb') as idx:
        # zip stops at the shorter list, so any surplus logs are ignored.
        for i, (cdx_path, log_path) in enumerate(zip(cdx_files, log_files)):
            k = _index_file(idx, i, cdx_path, log_path, k)
    return k


def main(argv=None) -> int:
    '''Command-line entry point; returns the process exit status.'''
    args = sys.argv[1:] if argv is None else argv
    if len(args) < 2:
        print(__doc__, file=sys.stderr)
        return 2
    build_index(args[0], args[1])
    return 0


if __name__ == '__main__':
    sys.exit(main())