# HG changeset patch # User Henry S. Thompson # Date 1693487661 -3600 # Node ID c1a70532444c3f8f91f99feea0db2e33a58fade9 # Parent a62580816f1c3d7d38f916b2abba064d7a802e48 flip loops diff -r a62580816f1c -r c1a70532444c bin/merge_date.py --- a/bin/merge_date.py Wed Aug 30 21:49:43 2023 +0100 +++ b/bin/merge_date.py Thu Aug 31 14:14:21 2023 +0100 @@ -7,55 +7,58 @@ ksvstream consists of tab-separated key, CC date and Unix timestamp ''' # ' -import sys, io, os +import sys, io, os, os.path from isal import igzip -xpath = "%s/cdx-00%%0.3d.gz"%sys.argv[2] -npath = "%s/cdx-00%%0.3d.gz"%sys.argv[3] +XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2] +NPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[3] -#print(sys.argv[3],npath,file=sys.stderr) +#print(sys.argv[3],NPATH,file=sys.stderr) os.makedirs(sys.argv[3], exist_ok=True) -fn = -1 -xf = igzip.IGzipFile(filename=xpath%0) -nf = open(npath%0, 'wb') +FN = 0 + +XF = igzip.IGzipFile(filename=XPATH%0) +NF = open(NPATH%0,'wb') + +XL = b'' -df = open(sys.argv[1], 'rb') +def nextLine(): + global FN, NF, NPATH, XF, XPATH + xl=XF.readline() + if xl == b'': + # need to move to next index file + if NF is None: + FN = 0 + else: + FN += 1 + xn=XPATH%FN + if not os.path.exists(xn): + return + XF = igzip.IGzipFile(filename=xn) + NF = open(NPATH%FN, 'wb') + xl = XF.readline() + return xl -xl = b'' -xkey = xdate = None - -for dl in df: +with open(sys.argv[1], 'rb') as df: + dl = df.readline() (dkey, ddate, dtime) = dl.split(b'\t') - while dkey != xkey or ddate != xdate: - try: - if xl == b'': - # need to move to next index file - nf.close() - fn += 1 - try: - xf = igzip.IGzipFile(filename=xpath%fn) - except Exception as e: - print("No more index input for %s: %s\nUnmatched: |%s|%s|\n" - "Last index line: |%s|%s|"%(fn,e,dkey,ddate,xkey,xdate), - sys.stderr) - exit(1) - xl = xf.readline() - nf = open(npath%fn, 'wb') - #print('xl',xl,file=sys.stderr) - (xkey, xdate, xprops) = xl.split(b' ', maxsplit=2) - continue - else: - (xkey, xdate, xprops) = xl.split(b' ', maxsplit=2) - except: - breakpoint() - nf.write(xl) - xl = xf.readline() - nf.write(xkey) - nf.write(b' ') - nf.write(xdate) - nf.write(b' ') - nf.write(xprops[:-2]) - nf.write(b', "lastmod": "%d"}\n'%int(dtime[:-3])) - xl=xf.readline() + + while (xl:=nextLine()) is not None: + (xkey, xdate, xprops) = xl.split(b' ', maxsplit=2) + if dkey != xkey or ddate != xdate: + NF.write(xl) + continue + NF.write(xkey) + NF.write(b' ') + NF.write(xdate) + NF.write(b' ') + NF.write(xprops[:-2]) + NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3])) + dl = df.readline() + if dl == '': + # write out the last of the last index file, if any + dkey = ddate = None + else: + (dkey, ddate, dtime) = dl.split(b'\t')