annotate bin/merge_date.py @ 89:a62580816f1c

merge a stream of ks files with a set of cdx files
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 30 Aug 2023 21:49:43 +0100
parents
children c1a70532444c
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
89
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/usr/bin/python3
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 '''Add timestamps from Last-Modified-dated (ks.tsv) files into
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 that year's index
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5 Usage: merge_date.py ksvstream cdx-dir outdir
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
7 ksvstream consists of tab-separated key, CC date and Unix timestamp
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
8 ''' # '
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
9
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
10 import sys, io, os
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
11 from isal import igzip
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
12
a62580816f1c merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def add_lastmod(xkey, xdate, xprops, dtime):
    '''Return one index line rebuilt with a "lastmod" property appended.

    xkey, xdate and xprops are the three space-separated fields of an
    index line (bytes); xprops must still carry its closing brace and
    newline, which are the two bytes removed before the new property is
    appended.  dtime is the raw third field of a ks line; its last three
    bytes are dropped before integer conversion (presumably a trailing
    ".0" plus the newline -- TODO confirm against the ks.tsv producer).
    '''
    return b'%s %s %s, "lastmod": "%d"}\n' % (
        xkey, xdate, xprops[:-2], int(dtime[:-3]))


def merge_dates(df, open_index, open_output):
    '''Copy index files to output, adding "lastmod" to lines named in df.

    df           iterable of bytes lines: key, date, timestamp (tab-separated)
    open_index   callable(n) -> binary file with readline()/close() for
                 index file number n; must raise OSError when there is none
    open_output  callable(n) -> binary file with write()/close() for
                 output file number n

    Each ks line is looked for by scanning forward through the index, so
    both inputs are assumed to list (key, date) pairs in the same order.
    Index lines passed over are copied unchanged; the matching line is
    written once, with the timestamp appended.  When the ks stream ends,
    the rest of the index file currently open is copied so that the last
    output file is not truncated; later index files are not touched.

    Exits with status 1 if the index runs out before a ks line is matched,
    or if an index line does not have three space-separated fields.
    '''
    fn = -1             # number of the index file currently open
    xf = nf = None      # current index input / output files
    xl = b''            # current index line, not yet written (b'' = need a new file)
    xkey = xdate = None
    try:
        for dl in df:
            (dkey, ddate, dtime) = dl.split(b'\t')
            while True:
                if xl == b'':
                    # Current index file is exhausted (or none is open yet):
                    # move on to the next one.
                    if nf is not None:
                        nf.close()
                        nf = None
                    if xf is not None:
                        xf.close()
                        xf = None
                    fn += 1
                    try:
                        xf = open_index(fn)
                    except OSError as e:
                        print("No more index input for %s: %s\nUnmatched: |%s|%s|\n"
                              "Last index line: |%s|%s|" % (fn, e, dkey, ddate,
                                                            xkey, xdate),
                              file=sys.stderr)
                        sys.exit(1)
                    nf = open_output(fn)
                    xl = xf.readline()
                    # Re-test for emptiness: the new file may itself be empty.
                    continue
                try:
                    (xkey, xdate, xprops) = xl.split(b' ', maxsplit=2)
                except ValueError:
                    sys.exit("Malformed index line in file %d: %r" % (fn, xl))
                if dkey == xkey and ddate == xdate:
                    break
                # Not the line we are looking for: pass it through unchanged.
                nf.write(xl)
                xl = xf.readline()
            nf.write(add_lastmod(xkey, xdate, xprops, dtime))
            xl = xf.readline()
        # ks stream finished: flush what is left of the current index file.
        while xl != b'':
            nf.write(xl)
            xl = xf.readline()
    finally:
        if nf is not None:
            nf.close()
        if xf is not None:
            xf.close()


def main():
    '''Command-line entry point: merge_date.py ksvstream cdx-dir outdir'''
    xpath = "%s/cdx-00%%0.3d.gz" % sys.argv[2]
    npath = "%s/cdx-00%%0.3d.gz" % sys.argv[3]

    os.makedirs(sys.argv[3], exist_ok=True)

    # NOTE(review): output files are named *.gz but are written with plain
    # open(), i.e. uncompressed -- confirm this is what downstream expects.
    with open(sys.argv[1], 'rb') as df:
        merge_dates(df,
                    lambda n: igzip.IGzipFile(filename=xpath % n),
                    lambda n: open(npath % n, 'wb'))


if __name__ == '__main__':
    main()