Mercurial > hg > cc > cirrus_work
changeset 153:58df6981269e
build cluster.idx
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 06 Oct 2023 15:06:53 +0100 |
parents | 85343fe48f69 |
children | 5d30cd8c6254 |
files | lib/python/cc/lmh/idx.py |
diffstat | 1 files changed, 47 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/python3
'''Write a cdx index file based on a directory of cdx files and a directory
of corresponding logs from merge_date.py

Usage: idx.py cdxdir logdir

The index is written to cdxdir/cluster.idx, one line per compressed block:

    key date<TAB>cdx-file-name<TAB>offset<TAB>length<TAB>cluster-number

where key and date are the first two space-separated fields of the first
line of the block, offset and length locate the compressed block within
its cdx file, and cluster-number counts blocks from 1 across all files.

Each log file starts with a header line "N name" (N is the 1-origin file
number, name is the cdx file name without its .gz suffix), followed by one
line per compressed block giving that block's length in bytes.
'''

import glob
import gzip
import io
import re
import sys
from os.path import basename, join

try:
    # python-isal's igzip mirrors the stdlib gzip API but decompresses
    # faster; fall back to gzip so the script still runs without it.
    from isal import igzip
except ImportError:
    igzip = None

# Raw string: '\.' in a plain literal is an invalid escape sequence.
IPAT = re.compile(r'(?:cdx-|merge_)([0-9]+)\.(?:gz|log)$')

# Initial size of the reusable read buffer; it is grown on demand.
BUF_SIZE = 32*1024*1024


def fx(x):
    '''Extract the file index number from a cdx or log file name.

    x is a path ending in cdx-<digits>.gz or merge_<digits>.log.
    Returns the digits as an int.
    Raises ValueError if the name does not have that form.
    '''
    m = IPAT.search(x)
    if m is None:
        raise ValueError('no file index number in %r' % (x,))
    return int(m.group(1))


def _open_gzip(data):
    '''Return a readable gzip stream over the compressed bytes-like data.'''
    opener = gzip.GzipFile if igzip is None else igzip.IGzipFile
    return opener(fileobj=io.BytesIO(data))


def build_index(cdx_dir, log_dir):
    '''Write cdx_dir/cluster.idx from the cdx files and their merge logs.

    cdx_dir holds the cdx-*.gz files (and receives cluster.idx);
    log_dir holds the matching merge_*.log files.
    Returns the number of index lines (compressed blocks) written.
    Raises ValueError if a log is missing, does not correspond to its
    cdx file, or describes a block that extends past the end of the file.
    '''
    cdx_files = sorted(glob.glob(join(cdx_dir, "cdx-*.gz")), key=fx)
    log_files = sorted(glob.glob(join(log_dir, "merge_[0-9]*.log")), key=fx)
    if len(log_files) < len(cdx_files):
        raise ValueError('%d cdx files but only %d log files'
                         % (len(cdx_files), len(log_files)))

    k = 0  # running cluster number, across all cdx files
    buf = bytearray(BUF_SIZE)
    with open(join(cdx_dir, "cluster.idx"), 'wb') as idx:
        # note that cdx-file names are 0-origin, log-file names are 1-origin
        for i, (cdx_path, log_path) in enumerate(zip(cdx_files, log_files), 1):
            cfn = basename(cdx_path)
            cfn_b = cfn.encode('ascii')
            with open(cdx_path, 'rb') as cdx_gz, open(log_path, 'r') as log:
                j, fn = log.readline().split()
                # Explicit checks rather than assert: these must survive
                # python -O, and cfn must be bound regardless.
                if int(j) != i:
                    raise ValueError('%s: header says file %s, expected %d'
                                     % (log_path, j, i))
                if basename(fn)+'.gz' != cfn:
                    raise ValueError('%s: header names %s, expected %s'
                                     % (log_path, fn, cfn))
                print(fn, file=sys.stderr)
                cpos = 0  # offset of the current block within the cdx file
                for ll in log:
                    clen = int(ll)
                    k += 1
                    if clen > len(buf):
                        # Never truncate a block: that would desynchronise
                        # the file position from cpos.
                        buf = bytearray(clen)
                    # Slice the view, not the bytearray, to avoid a copy.
                    gzv = memoryview(buf)[:clen]
                    got = cdx_gz.readinto(gzv)
                    if got != clen:
                        raise ValueError(
                            '%s: block %d at offset %d: wanted %d bytes, got %d'
                            % (cdx_path, k, cpos, clen, got))
                    with _open_gzip(gzv) as cdx:
                        # Only the first line of each block is indexed.
                        key, cdate, _ = cdx.readline().split(b' ', maxsplit=2)
                    idx.write(b'%s %s\t%s\t%d\t%d\t%d\n'
                              % (key, cdate, cfn_b, cpos, clen, k))
                    cpos += clen
    return k


def main(argv):
    '''Command-line entry point; argv is [program, cdxdir, logdir].'''
    if len(argv) < 3:
        print('Usage: idx.py cdxdir logdir', file=sys.stderr)
        return 2
    build_index(argv[1], argv[2])
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))