changeset 89:a62580816f1c

merge a stream of ks files with a set of cdx files
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 30 Aug 2023 21:49:43 +0100
parents 49faf679d7df
children c1a70532444c
files bin/merge_date.py
diffstat 1 files changed, 61 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/merge_date.py	Wed Aug 30 21:49:43 2023 +0100
@@ -0,0 +1,61 @@
+#!/usr/bin/python3
+'''Add timestamps from Last-Modified-dated (ks.tsv) files into
+   that year's index
+
+Usage: merge_date.py ksvstream cdx-dir outdir
+
+ksvstream consists of tab-separated key, CC date and Unix timestamp, in
+index order.  Index lines are copied from cdx-dir to outdir, with a
+"lastmod" property added to those matching a ksvstream key and date.
+''' # '
+
+import sys, io, os
+from isal import igzip
+
+xpath = "%s/cdx-00%%0.3d.gz"%sys.argv[2]
+npath = "%s/cdx-00%%0.3d.gz"%sys.argv[3]
+
+os.makedirs(sys.argv[3], exist_ok=True)
+
+fn = -1          # number of the index file in progress, -1 before the first
+xf = nf = None   # index input (gzip) and its output (written uncompressed)
+xl = b''         # pending index line, b'' when the next file is needed
+xkey = xdate = xprops = None
+
+with open(sys.argv[1], 'rb') as df:
+  for dl in df:
+    (dkey, ddate, dtime) = dl.split(b'\t')
+    # Copy index lines through unchanged until the one matching dl
+    while True:
+      if xl == b'':
+        # need to move to next index file
+        if nf is not None:
+          nf.close()
+          xf.close()
+        fn += 1
+        try:
+          xf = igzip.IGzipFile(filename=xpath%fn)
+        except Exception as e:
+          print("No more index input for %s: %s\nUnmatched:      |%s|%s|\n"
+                "Last index line: |%s|%s|"%(fn,e,dkey,ddate,xkey,xdate),
+                file=sys.stderr)
+          sys.exit(1)
+        nf = open(npath%fn, 'wb')
+        xl = xf.readline()
+        continue
+      (xkey, xdate, xprops) = xl.split(b' ', maxsplit=2)
+      if dkey == xkey and ddate == xdate:
+        break
+      nf.write(xl)
+      xl = xf.readline()
+    # Drop xprops' final '}\n' and dtime's last 3 bytes (presumably '.0\n')
+    nf.write(b'%s %s %s, "lastmod": "%d"}\n'%(xkey, xdate, xprops[:-2],
+                                              int(dtime[:-3])))
+    xl = xf.readline()
+# Finish the current index file so that its output is not truncated
+while xl != b'':
+  nf.write(xl)
+  xl = xf.readline()
+if nf is not None:
+  nf.close()
+  xf.close()