comparison bin/merge_date.py @ 90:c1a70532444c

flip loops
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 31 Aug 2023 14:14:21 +0100
parents a62580816f1c
children 460f0599e8cd
comparison
equal deleted inserted replaced
89:a62580816f1c 90:c1a70532444c
5 Usage: merge_date.py ksvstream cdx-dir outdir 5 Usage: merge_date.py ksvstream cdx-dir outdir
6 6
7 ksvstream consists of tab-separated key, CC date and Unix timestamp 7 ksvstream consists of tab-separated key, CC date and Unix timestamp
8 ''' # ' 8 ''' # '
9 9
10 import sys, io, os 10 import sys, io, os, os.path
11 from isal import igzip 11 from isal import igzip
12 12
13 xpath = "%s/cdx-00%%0.3d.gz"%sys.argv[2] 13 XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2]
14 npath = "%s/cdx-00%%0.3d.gz"%sys.argv[3] 14 NPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[3]
15 15
16 #print(sys.argv[3],npath,file=sys.stderr) 16 #print(sys.argv[3],NPATH,file=sys.stderr)
17 17
18 os.makedirs(sys.argv[3], exist_ok=True) 18 os.makedirs(sys.argv[3], exist_ok=True)
19 19
20 fn = -1 20 FN = 0
21 xf = igzip.IGzipFile(filename=xpath%0)
22 nf = open(npath%0, 'wb')
23 21
24 df = open(sys.argv[1], 'rb') 22 XF = igzip.IGzipFile(filename=XPATH%0)
23 NF = open(NPATH%0,'wb')
25 24
26 xl = b'' 25 XL = b''
27 xkey = xdate = None
28 26
29 for dl in df: 27 def nextLine():
28 global FN, NF, NPATH, XF, XPATH
29 xl=XF.readline()
30 if xl == b'':
31 # need to move to next index file
32 if NF is None:
33 FN = 0
34 else:
35 FN += 1
36 xn=XPATH%FN
37 if not os.path.exists(xn):
38 return
39 XF = igzip.IGzipFile(filename=xn)
40 NF = open(NPATH%FN, 'wb')
41 xl = XF.readline()
42 return xl
43
44 with open(sys.argv[1], 'rb') as df:
45 dl = df.readline()
30 (dkey, ddate, dtime) = dl.split(b'\t') 46 (dkey, ddate, dtime) = dl.split(b'\t')
31 while dkey != xkey or ddate != xdate: 47
32 try: 48 while (xl:=nextLine()) is not None:
33 if xl == b'': 49 (xkey, xdate, xprops) = xl.split(b' ', maxsplit=2)
34 # need to move to next index file 50 if dkey != xkey or ddate != xdate:
35 nf.close() 51 NF.write(xl)
36 fn += 1 52 continue
37 try: 53 NF.write(xkey)
38 xf = igzip.IGzipFile(filename=xpath%fn) 54 NF.write(b' ')
39 except Exception as e: 55 NF.write(xdate)
40 print("No more index input for %s: %s\nUnmatched: |%s|%s|\n" 56 NF.write(b' ')
41 "Last index line: |%s|%s|"%(fn,e,dkey,ddate,xkey,xdate), 57 NF.write(xprops[:-2])
42 sys.stderr) 58 NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
43 exit(1) 59 dl = df.readline()
44 xl = xf.readline() 60 if dl == '':
45 nf = open(npath%fn, 'wb') 61 # write out the last of the last index file, if any
46 #print('xl',xl,file=sys.stderr) 62 dkey = ddate = None
47 (xkey, xdate, xprops) = xl.split(b' ', maxsplit=2) 63 else:
48 continue 64 (dkey, ddate, dtime) = dl.split(b'\t')
49 else:
50 (xkey, xdate, xprops) = xl.split(b' ', maxsplit=2)
51 except:
52 breakpoint()
53 nf.write(xl)
54 xl = xf.readline()
55 nf.write(xkey)
56 nf.write(b' ')
57 nf.write(xdate)
58 nf.write(b' ')
59 nf.write(xprops[:-2])
60 nf.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
61 xl=xf.readline()