#!/usr/bin/python3
# Origin: Mercurial changeset that added this script as bin/merge_date.py
#   User:    Henry S. Thompson
#   Date:    1693428583 -3600 (Wed Aug 30 21:49:43 2023 +0100)
#   Node ID: a62580816f1c3d7d38f916b2abba064d7a802e48
#   Parent:  49faf679d7dfa92347fb1d856c8d52cb0b96e60b
#   Summary: merge a stream of ks files with a set of cdx files
"""Add timestamps from Last-Modified-dated (ks.tsv) files into
   that year's index

Usage: merge_date.py ksvstream cdx-dir outdir

ksvstream consists of tab-separated key, CC date and Unix timestamp
"""

import io
import os
import sys

try:
    from isal import igzip
    _open_index = igzip.IGzipFile
except ImportError:
    # python-isal is only an accelerator; the stdlib reader has the same
    # constructor and file interface.
    import gzip
    _open_index = gzip.GzipFile

_USAGE = "Usage: merge_date.py ksvstream cdx-dir outdir"


def _index_path(directory, number):
    """Return the path of index file *number* (cdx-00NNN.gz) in *directory*."""
    return "%s/cdx-00%0.3d.gz" % (directory, number)


def _close(*files):
    """Close every file in *files* that is not None."""
    for f in files:
        if f is not None:
            f.close()


def merge_dates(ks_lines, cdx_dir, out_dir):
    """Copy the index files in *cdx_dir* to *out_dir*, adding "lastmod".

    ks_lines -- iterable of bytes lines, each `key<TAB>date<TAB>timestamp`,
                in the same order as the matching lines occur in the index.
    cdx_dir  -- directory holding gzipped index files cdx-00000.gz, ...
    out_dir  -- existing directory that receives one output file per
                index file, under the same name.

    Index lines are `key date {json...}`.  A line whose key and date equal
    those of the current ks line is rewritten with `, "lastmod": "<ts>"`
    appended inside its JSON object; every other line is copied verbatim.

    Exits with status 1 if the index runs out before a ks line is matched.
    Raises ValueError on an index line without three space-separated fields.

    NOTE(review): output is written uncompressed although the file names
    end in .gz -- kept as in the original; confirm this is intended.
    """
    file_no = -1
    index_file = out_file = None
    line = b''               # next index line not yet written; b'' = need a new file
    xkey = xdate = None      # key/date of the most recently parsed index line
    try:
        for ks_line in ks_lines:
            (dkey, ddate, dtime) = ks_line.split(b'\t')
            while True:
                if line == b'':
                    # Current index file exhausted (or none open yet):
                    # move on to the next one.
                    _close(index_file, out_file)
                    index_file = out_file = None
                    file_no += 1
                    try:
                        index_file = _open_index(
                            filename=_index_path(cdx_dir, file_no))
                    except OSError as e:
                        print("No more index input for %s: %s\nUnmatched: |%s|%s|\n"
                              "Last index line: |%s|%s|" % (file_no, e, dkey, ddate,
                                                            xkey, xdate),
                              file=sys.stderr)
                        sys.exit(1)
                    out_file = open(_index_path(out_dir, file_no), 'wb')
                    line = index_file.readline()
                    continue
                try:
                    (xkey, xdate, xprops) = line.split(b' ', maxsplit=2)
                except ValueError:
                    raise ValueError("Malformed index line in %s: %r"
                                     % (_index_path(cdx_dir, file_no), line)) from None
                if xkey == dkey and xdate == ddate:
                    break
                # Compare first, copy second: only non-matching lines are
                # passed through unchanged.
                out_file.write(line)
                line = index_file.readline()
            # xprops[:-2] drops the closing '}' and newline so the new member
            # can be appended.  dtime[:-3] drops the last three bytes of the
            # timestamp field -- presumably a '.0' suffix plus newline;
            # TODO confirm against the ks.tsv producer.
            out_file.write(b'%s %s %s, "lastmod": "%d"}\n'
                           % (xkey, xdate, xprops[:-2], int(dtime[:-3])))
            line = index_file.readline()

        # The ks stream is finished: pass the rest of the index through so
        # the output is a complete copy rather than stopping mid-file.
        if out_file is not None:
            out_file.write(line)
            out_file.writelines(index_file)
        while True:
            _close(index_file, out_file)
            index_file = out_file = None
            file_no += 1
            path = _index_path(cdx_dir, file_no)
            if not os.path.exists(path):
                break
            index_file = _open_index(filename=path)
            out_file = open(_index_path(out_dir, file_no), 'wb')
            out_file.writelines(index_file)
    finally:
        _close(index_file, out_file)


def main(argv=None):
    """Command-line entry point; returns the process exit status."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 4:
        print(_USAGE, file=sys.stderr)
        return 2
    ks_path, cdx_dir, out_dir = argv[1:4]
    os.makedirs(out_dir, exist_ok=True)
    with open(ks_path, 'rb') as ks_lines:
        merge_dates(ks_lines, cdx_dir, out_dir)
    return 0


if __name__ == '__main__':
    sys.exit(main())