Mercurial > hg > cc > cirrus_work
comparison bin/merge_date.py @ 90:c1a70532444c
flip loops
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 31 Aug 2023 14:14:21 +0100 |
parents | a62580816f1c |
children | 460f0599e8cd |
comparison
Legend: equal, deleted, inserted, replaced
89:a62580816f1c | 90:c1a70532444c |
---|---|
5 Usage: merge_date.py ksvstream cdx-dir outdir | 5 Usage: merge_date.py ksvstream cdx-dir outdir |
6 | 6 |
7 ksvstream consists of tab-separated key, CC date and Unix timestamp | 7 ksvstream consists of tab-separated key, CC date and Unix timestamp |
8 ''' # ' | 8 ''' # ' |
9 | 9 |
10 import sys, io, os | 10 import sys, io, os, os.path |
11 from isal import igzip | 11 from isal import igzip |
12 | 12 |
13 xpath = "%s/cdx-00%%0.3d.gz"%sys.argv[2] | 13 XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2] |
14 npath = "%s/cdx-00%%0.3d.gz"%sys.argv[3] | 14 NPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[3] |
15 | 15 |
16 #print(sys.argv[3],npath,file=sys.stderr) | 16 #print(sys.argv[3],NPATH,file=sys.stderr) |
17 | 17 |
18 os.makedirs(sys.argv[3], exist_ok=True) | 18 os.makedirs(sys.argv[3], exist_ok=True) |
19 | 19 |
20 fn = -1 | 20 FN = 0 |
21 xf = igzip.IGzipFile(filename=xpath%0) | |
22 nf = open(npath%0, 'wb') | |
23 | 21 |
24 df = open(sys.argv[1], 'rb') | 22 XF = igzip.IGzipFile(filename=XPATH%0) |
23 NF = open(NPATH%0,'wb') | |
25 | 24 |
26 xl = b'' | 25 XL = b'' |
27 xkey = xdate = None | |
28 | 26 |
29 for dl in df: | 27 def nextLine(): |
28 global FN, NF, NPATH, XF, XPATH | |
29 xl=XF.readline() | |
30 if xl == b'': | |
31 # need to move to next index file | |
32 if NF is None: | |
33 FN = 0 | |
34 else: | |
35 FN += 1 | |
36 xn=XPATH%FN | |
37 if not os.path.exists(xn): | |
38 return | |
39 XF = igzip.IGzipFile(filename=xn) | |
40 NF = open(NPATH%FN, 'wb') | |
41 xl = XF.readline() | |
42 return xl | |
43 | |
44 with open(sys.argv[1], 'rb') as df: | |
45 dl = df.readline() | |
30 (dkey, ddate, dtime) = dl.split(b'\t') | 46 (dkey, ddate, dtime) = dl.split(b'\t') |
31 while dkey != xkey or ddate != xdate: | 47 |
32 try: | 48 while (xl:=nextLine()) is not None: |
33 if xl == b'': | 49 (xkey, xdate, xprops) = xl.split(b' ', maxsplit=2) |
34 # need to move to next index file | 50 if dkey != xkey or ddate != xdate: |
35 nf.close() | 51 NF.write(xl) |
36 fn += 1 | 52 continue |
37 try: | 53 NF.write(xkey) |
38 xf = igzip.IGzipFile(filename=xpath%fn) | 54 NF.write(b' ') |
39 except Exception as e: | 55 NF.write(xdate) |
40 print("No more index input for %s: %s\nUnmatched: |%s|%s|\n" | 56 NF.write(b' ') |
41 "Last index line: |%s|%s|"%(fn,e,dkey,ddate,xkey,xdate), | 57 NF.write(xprops[:-2]) |
42 sys.stderr) | 58 NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3])) |
43 exit(1) | 59 dl = df.readline() |
44 xl = xf.readline() | 60 if dl == '': |
45 nf = open(npath%fn, 'wb') | 61 # write out the last of the last index file, if any |
46 #print('xl',xl,file=sys.stderr) | 62 dkey = ddate = None |
47 (xkey, xdate, xprops) = xl.split(b' ', maxsplit=2) | 63 else: |
48 continue | 64 (dkey, ddate, dtime) = dl.split(b'\t') |
49 else: | |
50 (xkey, xdate, xprops) = xl.split(b' ', maxsplit=2) | |
51 except: | |
52 breakpoint() | |
53 nf.write(xl) | |
54 xl = xf.readline() | |
55 nf.write(xkey) | |
56 nf.write(b' ') | |
57 nf.write(xdate) | |
58 nf.write(b' ') | |
59 nf.write(xprops[:-2]) | |
60 nf.write(b', "lastmod": "%d"}\n'%int(dtime[:-3])) | |
61 xl=xf.readline() |