Mercurial > hg > cc > cirrus_work
annotate bin/merge_date.py @ 89:a62580816f1c
merge a stream of ks files with a set of cdx files
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 30 Aug 2023 21:49:43 +0100 |
parents | |
children | c1a70532444c |
rev | line source |
---|---|
89
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
#!/usr/bin/python3
"""Add timestamps from Last-Modified-dated (ks.tsv) files into
that year's index

Usage: merge_date.py ksvstream cdx-dir outdir

ksvstream consists of tab-separated key, CC date and Unix timestamp

The index is read from cdx-dir/cdx-00NNN.gz (NNN = 000, 001, ...) and a
copy is written under the same file name in outdir.  Index lines are
copied through unchanged until one is found whose first two
space-separated fields equal the key and date of the current ksvstream
line; that line is written with a "lastmod" property appended.  Both
inputs are only ever read forwards, so a ksvstream line that is not
found at or after the current index position ends the run with exit
status 1 once the index files are exhausted.
"""

import sys, io, os
from isal import igzip

# Input and output file name templates; the remaining %0.3d is filled in
# with the index file number.
xpath = "%s/cdx-00%%0.3d.gz"%sys.argv[2]
npath = "%s/cdx-00%%0.3d.gz"%sys.argv[3]

os.makedirs(sys.argv[3], exist_ok=True)

# NOTE(review): the output is written with plain open(), i.e. it is NOT
# compressed although its name ends in .gz -- confirm this is intended.
fn = 0
xf = igzip.IGzipFile(filename=xpath % fn)
nf = open(npath % fn, 'wb')

# xl is always the next index line that has not yet been written out;
# b'' means the current index file is exhausted.
xl = xf.readline()
xkey = xdate = None

with open(sys.argv[1], 'rb') as df:
    for dl in df:
        (dkey, ddate, dtime) = dl.split(b'\t')
        while True:
            while xl == b'':
                # Need to move to the next index file (a loop, so that an
                # empty index file is simply skipped over).
                xf.close()
                nf.close()
                fn += 1
                try:
                    xf = igzip.IGzipFile(filename=xpath % fn)
                except OSError as e:
                    print("No more index input for %s: %s\nUnmatched: |%s|%s|\n"
                          "Last index line: |%s|%s|"
                          % (fn, e, dkey, ddate, xkey, xdate),
                          file=sys.stderr)
                    sys.exit(1)
                nf = open(npath % fn, 'wb')
                xl = xf.readline()
            try:
                (xkey, xdate, xprops) = xl.split(b' ', maxsplit=2)
            except ValueError:
                print("Malformed index line in %s: %r" % (xpath % fn, xl),
                      file=sys.stderr)
                sys.exit(1)
            if dkey == xkey and ddate == xdate:
                break
            # Not the line we are looking for: copy it through unchanged.
            # This must happen only after the comparison above, otherwise
            # a matching line would be written twice.
            nf.write(xl)
            xl = xf.readline()
        # xprops[:-2] drops the last two bytes of the index line --
        # presumably the closing '}' and the newline -- so the new property
        # can go in before a fresh '}'.  dtime[:-3] drops the last three
        # bytes of the timestamp field before conversion -- presumably a
        # trailing '.0' plus newline; TODO confirm against the ks.tsv format.
        nf.write(b'%s %s %s, "lastmod": "%d"}\n'
                 % (xkey, xdate, xprops[:-2], int(dtime[:-3])))
        # The matched line has been replaced by the one just written.
        xl = xf.readline()

# NOTE(review): index lines after the last matched ksvstream line, and any
# later index files, are not copied to outdir (unchanged from the previous
# behaviour) -- confirm whether the tail should be flushed here.
nf.close()
xf.close()