view bin/merge_date.py @ 89:a62580816f1c

merge a stream of ks files with a set of cdx files
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 30 Aug 2023 21:49:43 +0100
parents
children c1a70532444c
line wrap: on
line source

#!/usr/bin/python3
'''Add timestamps from Last-Modified-dated (ks.tsv) files into
   that year's index

Usage: merge_date.py ksvstream cdx-dir outdir

ksvstream consists of tab-separated key, CC date and Unix timestamp
''' # '

import sys, io, os
from isal import igzip

# File name templates for the input (cdx-dir) and output (outdir) index
# files; the remaining %0.3d slot is filled in with the index file number.
xpath = f"{sys.argv[2]}/cdx-00%0.3d.gz"
npath = f"{sys.argv[3]}/cdx-00%0.3d.gz"

os.makedirs(sys.argv[3], exist_ok=True)

# fn is the number of the index file currently being merged.  It starts
# at -1 because the merge loop increments it before opening a file, which
# happens on its first pass since xl starts out empty.
fn = -1
xf = igzip.IGzipFile(filename=xpath % 0)
nf = open(npath % 0, 'wb')

# Stream of tab-separated ks lines, read as bytes
df = open(sys.argv[1], 'rb')

# No index line has been read yet
xkey = xdate = None
xl = b''

# Merge loop: walk the ks stream and the index files in step.  Index lines
# that do not match the current ks entry (same key and same date) are
# copied to the output unchanged; a matching index line is rewritten with
# a "lastmod" property appended.
# NOTE(review): this only works if the ks stream and the index files are
# in the same order -- presumably both are sorted by key; confirm.
#
# Invariant: whenever xl is non-empty it is an index line that has been
# read but not yet written, and xkey/xdate/xprops are its parsed fields.
# When xl is empty (at the start, or at the end of an index file) xkey and
# xdate still hold the last line parsed, for use in the error message.
for dl in df:
  (dkey, ddate, dtime) = dl.split(b'\t')
  while xl == b'' or dkey != xkey or ddate != xdate:
    if xl == b'':
      # Nothing pending: need to move to next index file
      nf.close()
      xf.close()
      fn += 1
      try:
        xf = igzip.IGzipFile(filename=xpath%fn)
      except OSError as e:
        print("No more index input for %s: %s\nUnmatched:      |%s|%s|\n"
              "Last index line: |%s|%s|"%(fn,e,dkey,ddate,xkey,xdate),
              file=sys.stderr)
        sys.exit(1)
      nf = open(npath%fn, 'wb')
    else:
      # The pending line was compared and did not match: copy it through
      nf.write(xl)
    xl = xf.readline()
    if xl != b'':
      # Parse before the next comparison, so a line is never written
      # before it has been checked against the ks entry
      (xkey, xdate, xprops) = xl.split(b' ', maxsplit=2)
  # Matched.  xprops[:-2] drops the last two bytes of the index line
  # (presumably the closing '}' and newline of its JSON properties) so the
  # new property can be appended; dtime[:-3] drops the last three bytes of
  # the timestamp field (presumably '.0' plus newline -- TODO confirm).
  nf.write(b'%s %s %s, "lastmod": "%d"}\n'%(xkey, xdate, xprops[:-2],
                                             int(dtime[:-3])))
  # The matched line has been consumed: fetch and parse its successor
  xl = xf.readline()
  if xl != b'':
    (xkey, xdate, xprops) = xl.split(b' ', maxsplit=2)

# The ks stream is exhausted: copy the rest of the current index file so
# that its output is not cut short after the last match.
# NOTE(review): index files after the current one are not copied to the
# output directory; confirm whether that is wanted.
while xl != b'':
  nf.write(xl)
  xl = xf.readline()
nf.close()
xf.close()
df.close()