Mercurial > hg > cc > cirrus_work
view bin/merge_date.py @ 90:c1a70532444c
flip loops
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 31 Aug 2023 14:14:21 +0100 |
parents | a62580816f1c |
children | 460f0599e8cd |
line wrap: on
line source
#!/usr/bin/python3
"""Add timestamps from Last-Modified-dated (ks.tsv) files into
   that year's index

Usage: merge_date.py ksvstream cdx-dir outdir

ksvstream consists of tab-separated key, CC date and Unix timestamp
"""

import io
import os
import os.path
import sys

# Name pattern of one index file: (directory, file number).  The same
# pattern is used for reading from cdx-dir and for writing to outdir.
INDEX_NAME = "%s/cdx-00%0.3d.gz"


def _index_lines(cdx_dir, out_dir):
    """Yield (line, output file) for each line of each index file in turn.

    Index files are read in numerical order starting at 0.  For every
    input file an output file with the same name is created in out_dir,
    and each yielded line is paired with the output file it belongs to.
    Both files are closed as soon as the input file is exhausted.

    Raises whatever opening the files raises (e.g. FileNotFoundError if
    the first index file is missing).
    """
    # Imported here rather than at module level so that merge_dates() stays
    # importable on machines without the third-party isal package.
    from isal import igzip

    fn = 0
    # File 0 is opened unconditionally, so a missing first index file is an
    # error; later files are processed for as long as the next one exists.
    while fn == 0 or os.path.exists(INDEX_NAME % (cdx_dir, fn)):
        with igzip.IGzipFile(filename=INDEX_NAME % (cdx_dir, fn)) as xf:
            # NOTE(review): the output is written uncompressed although its
            # name ends in .gz -- kept as in the original; confirm that a
            # later step compresses it.
            with open(INDEX_NAME % (out_dir, fn), 'wb') as nf:
                # An empty index file simply yields nothing and we move on.
                while (xl := xf.readline()) != b'':
                    yield xl, nf
        fn += 1


def _next_date(df):
    """Read the next ksvstream entry from df.

    Returns a (key, date, time) tuple of bytes, or (None, None, None) once
    df is exhausted, so that no further index line can match.

    Raises ValueError if a line does not have exactly three tab-separated
    fields.
    """
    dl = df.readline()
    if dl == b'':
        return None, None, None
    dkey, ddate, dtime = dl.split(b'\t')
    return dkey, ddate, dtime


def merge_dates(df, indexed):
    """Copy index lines to their output files, adding "lastmod" on matches.

    df      -- binary file-like object holding the ksvstream
    indexed -- iterable of (index line, writable binary file) pairs

    An index line whose key and date equal those of the current ksvstream
    entry is rewritten with a "lastmod" property appended, and the next
    ksvstream entry is read.  Every other line is copied unchanged.  Once
    the ksvstream is exhausted the remaining index lines are all copied
    unchanged.

    Raises ValueError if an index line has fewer than three space-separated
    fields or a ksvstream line is malformed.
    """
    dkey, ddate, dtime = _next_date(df)
    for xl, nf in indexed:
        xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
        if dkey != xkey or ddate != xdate:
            nf.write(xl)
            continue
        # xprops[:-2] drops the trailing '}' and newline so the new property
        # can be inserted before they are written back.
        # dtime[:-3] drops the last three bytes -- presumably a '.0'
        # fraction plus the newline; TODO confirm against the ks.tsv format.
        nf.write(b'%s %s %s, "lastmod": "%d"}\n'
                 % (xkey, xdate, xprops[:-2], int(dtime[:-3])))
        dkey, ddate, dtime = _next_date(df)


def main(argv=None):
    """Run the merge for the given command line (default: sys.argv).

    Returns the process exit status: 0 on success, 2 on a usage error.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 4:
        print(__doc__, file=sys.stderr)
        return 2
    ksv, cdx_dir, out_dir = args[1:4]
    os.makedirs(out_dir, exist_ok=True)
    with open(ksv, 'rb') as df:
        merge_dates(df, _index_lines(cdx_dir, out_dir))
    return 0


if __name__ == '__main__':
    sys.exit(main())