view bin/merge_date.py @ 90:c1a70532444c

flip loops
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 31 Aug 2023 14:14:21 +0100
parents a62580816f1c
children 460f0599e8cd
line wrap: on
line source

#!/usr/bin/python3
'''Add timestamps from Last-Modified-dated (ks.tsv) files into
   that year's index

Usage: merge_date.py ksvstream cdx-dir outdir

ksvstream consists of tab-separated key, CC date and Unix timestamp
''' # '

import sys, io, os, os.path
from isal import igzip

# Shard path templates.  The doubled %% survives the directory substitution
# as a %0.3d slot, which is filled in later with the shard number.
XPATH = "%s/cdx-00%%0.3d.gz" % (sys.argv[2],)  # existing index shards (cdx-dir)
NPATH = "%s/cdx-00%%0.3d.gz" % (sys.argv[3],)  # shards to be written (outdir)

os.makedirs(sys.argv[3], exist_ok=True)

# Number of the shard currently being read and written
FN = 0

# Current input shard, read through igzip, and its output counterpart.
# Note the output is written with plain open(), i.e. not compressed here,
# even though its name ends in .gz
XF = igzip.IGzipFile(filename=XPATH % FN)
NF = open(NPATH % FN, mode='wb')

XL = b''

def nextLine():
  '''Return the next line (bytes) of the index being merged, or None.

  Lines come from the current input shard XF.  When it is exhausted FN is
  advanced and, if an input shard with that number exists, the finished
  input/output pair is closed, the next pair is opened (rebinding the
  globals XF and NF) and reading carries on from there.  Shards that
  turn out to be empty are skipped.

  Returns None, leaving XF and NF as they were, once there is no input
  shard with the next number.
  '''
  global FN, NF, XF
  while True:
    xl = XF.readline()
    if xl != b'':
      return xl
    # need to move to next index file
    if NF is None:
      # Not reachable from the code in this file, which always has NF open;
      # retained so an externally reset NF still restarts at shard 0
      FN = 0
    else:
      FN += 1
    xn = XPATH%FN
    if not os.path.exists(xn):
      return None
    # Close the finished pair explicitly rather than relying on garbage
    # collection, so the completed output shard is flushed to disk and
    # file descriptors do not pile up over a long run
    XF.close()
    if NF is not None:
      NF.close()
    XF = igzip.IGzipFile(filename=xn)
    NF = open(NPATH%FN, 'wb')

# Merge pass: walk the timestamp stream (argv[1]) and the index shards in
# step.  Index lines whose key and date equal the current timestamp entry
# get a "lastmod" property appended; all other lines are copied unchanged.
# This only ever advances, so both inputs must list matching entries in
# the same order.
with open(sys.argv[1], 'rb') as df:
  dl = df.readline()
  if dl == b'':
    # Empty timestamp stream: nothing to merge, the index is just copied
    dkey = ddate = dtime = None
  else:
    (dkey, ddate, dtime) = dl.split(b'\t')

  try:
    while (xl:=nextLine()) is not None:
      (xkey, xdate, xprops) = xl.split(b' ', maxsplit=2)
      if dkey != xkey or ddate != xdate:
        NF.write(xl)
        continue
      # xprops[:-2] drops the last two bytes of the line (presumably the
      # closing brace and newline) so the new property can be added before
      # a fresh closing brace.
      # dtime[:-3] drops the last three bytes of the timestamp field
      # (presumably a one-digit fraction such as ".0" plus the newline)
      # -- TODO confirm against the ks.tsv format
      NF.write(b'%s %s %s, "lastmod": "%d"}\n' % (
        xkey, xdate, xprops[:-2], int(dtime[:-3])))
      dl = df.readline()
      # readline() on a binary file returns b'' at EOF; comparing with the
      # str '' is never true and led to a failed unpack of the empty line
      if dl == b'':
        # write out the last of the last index file, if any
        # (None never equals a bytes key, so the rest is copied as is)
        dkey = ddate = None
      else:
        (dkey, ddate, dtime) = dl.split(b'\t')
  finally:
    # Make sure the final output shard is flushed and both files released;
    # close() is harmless on a file that is already closed
    NF.close()
    XF.close()