Mercurial > hg > cc > cirrus_work
view bin/merge_date.py @ 89:a62580816f1c
merge a stream of ks files with a set of cdx files
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 30 Aug 2023 21:49:43 +0100 |
parents | |
children | c1a70532444c |
line wrap: on
line source
#!/usr/bin/python3
'''Add timestamps from Last-Modified-dated (ks.tsv) files into
that year's index

Usage: merge_date.py ksvstream cdx-dir outdir

ksvstream consists of tab-separated key, CC date and Unix timestamp
''' # '

import sys, io, os

try:
    from isal import igzip
    _GzipReader = igzip.IGzipFile
except ImportError:
    # isal is only an accelerator; the standard-library reader accepts the
    # same filename= argument and offers the same readline()/iteration API.
    import gzip
    _GzipReader = gzip.GzipFile

# Both the input and the output directory use the same numbered file names.
_NAME_PATTERN = "%s/cdx-00%%0.3d.gz"


def _merged_line(xkey, xdate, xprops, dtime):
    '''Return the index line rebuilt with a "lastmod" property appended.

    xprops[:-2] drops the last two bytes of the properties field so the new
    property can be spliced in before a fresh closing brace and newline
    (assumes the field ends with "}" + newline -- TODO confirm for the last
    line of a file).
    dtime[:-3] drops the last three bytes of the timestamp field before
    integer conversion (presumably a ".0" fraction plus the newline --
    TODO confirm against the ks.tsv producer).
    '''
    return b'%s %s %s, "lastmod": "%d"}\n' % (
        xkey, xdate, xprops[:-2], int(dtime[:-3]))


def merge_dates(ks_path, cdx_dir, out_dir):
    '''Copy every index file from cdx_dir to out_dir, adding "lastmod".

    ks_path -- file of tab-separated (key, date, timestamp) lines
    cdx_dir -- directory holding gzipped cdx-00NNN.gz index files whose
               lines are space-separated (key, date, properties)
    out_dir -- directory for the output files; created if missing

    Index lines whose key and date equal those of the current ks line get a
    "lastmod" property; all other index lines are copied unchanged.  Both
    inputs are read strictly forwards, so matching assumes they are sorted
    in the same order (not checked here).

    NOTE(review): output files keep the .gz name but are written
    uncompressed, exactly as the original script did -- confirm this is what
    downstream consumers expect.

    Exits with status 1 (message on stderr) if a ks line is still unmatched
    when the index files run out.
    '''
    xpath = _NAME_PATTERN % cdx_dir
    npath = _NAME_PATTERN % out_dir
    os.makedirs(out_dir, exist_ok=True)

    fn = -1              # number of the index file currently open
    xf = nf = None       # current index input / output
    xl = b''             # index line read but not yet written; b'' = need a file
    xkey = xdate = xprops = None
    try:
        with open(ks_path, 'rb') as df:
            for dl in df:
                (dkey, ddate, dtime) = dl.split(b'\t')
                while True:
                    if xl == b'':
                        # Current index file is exhausted (or none is open
                        # yet): move to the next one.
                        if nf is not None:
                            nf.close()
                            xf.close()
                        fn += 1
                        try:
                            xf = _GzipReader(filename=xpath % fn)
                        except OSError as e:
                            print("No more index input for %s: %s\nUnmatched: |%s|%s|\n"
                                  "Last index line: |%s|%s|"
                                  % (fn, e, dkey, ddate, xkey, xdate),
                                  file=sys.stderr)
                            sys.exit(1)
                        nf = open(npath % fn, 'wb')
                        xl = xf.readline()
                        continue
                    (xkey, xdate, xprops) = xl.split(b' ', maxsplit=2)
                    if xkey == dkey and xdate == ddate:
                        break
                    # Not the line we want: pass it through untouched.  The
                    # match test above runs first, so a matching line is
                    # never also emitted in its unmodified form.
                    nf.write(xl)
                    xl = xf.readline()
                nf.write(_merged_line(xkey, xdate, xprops, dtime))
                xl = xf.readline()

        # The ks stream is finished: pass the remainder of the index through
        # so the output is a complete copy rather than a truncated one.
        if nf is not None:
            nf.write(xl)
            nf.writelines(xf)
        while True:
            try:
                next_xf = _GzipReader(filename=xpath % (fn + 1))
            except FileNotFoundError:
                break    # ran past the last numbered index file
            fn += 1
            if nf is not None:
                nf.close()
                xf.close()
            xf = next_xf
            nf = open(npath % fn, 'wb')
            nf.writelines(xf)
    finally:
        # close() is safe to repeat on files that were already closed above.
        if xf is not None:
            xf.close()
        if nf is not None:
            nf.close()


if __name__ == '__main__':
    if len(sys.argv) < 4:
        print("Usage: merge_date.py ksvstream cdx-dir outdir", file=sys.stderr)
        sys.exit(2)
    merge_dates(sys.argv[1], sys.argv[2], sys.argv[3])