comparison bin/merge_date.py @ 91:460f0599e8cd

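Mostly working, but still need to reorder in the case of cfid and friends (the session-id query parameters matched by SESSION: cfid/cftoken, sid, jsessionid, aspsessionid*).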
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 05 Sep 2023 17:32:46 +0100
parents c1a70532444c
children e56a7aad9ce9
comparison
equal deleted inserted replaced
90:c1a70532444c 91:460f0599e8cd
5 Usage: merge_date.py ksvstream cdx-dir outdir 5 Usage: merge_date.py ksvstream cdx-dir outdir
6 6
7 ksvstream consists of tab-separated key, CC date and Unix timestamp 7 ksvstream consists of tab-separated key, CC date and Unix timestamp
8 ''' # ' 8 ''' # '
9 9
10 import sys, io, os, os.path 10 import sys, io, os, os.path, time, re
11 from isal import igzip 11 from isal import igzip
12 12
13 if sys.argv[1] == '-d':
14 sys.argv.pop(1)
15 DEBUG = True
16 else:
17 DEBUG = False
18
13 XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2] 19 XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2]
14 NPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[3] 20 NPATH = "%s/cdx-00%%0.3d"%sys.argv[3]
21
22 RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
23 b'(crawldiagnostics|robotstxt)/')
24 SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|'
25 b'sid|jsessionid|aspsessionid[a-z]*)'
26 b'=[^&]*)')
27
28 # Above based on this from fixed Java code:
29 #(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
30 #(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
31 #(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
32 #(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
33 #(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
15 34
16 #print(sys.argv[3],NPATH,file=sys.stderr) 35 #print(sys.argv[3],NPATH,file=sys.stderr)
17 36
18 os.makedirs(sys.argv[3], exist_ok=True) 37 os.makedirs(sys.argv[3], exist_ok=True)
19 38
20 FN = 0 39 FN = 0
21 40
41 XCNT = 0
42 dcnt = 0
43
22 XF = igzip.IGzipFile(filename=XPATH%0) 44 XF = igzip.IGzipFile(filename=XPATH%0)
23 NF = open(NPATH%0,'wb') 45 NF = open(NN:=(NPATH%0),'wb')
24
25 XL = b''
26 46
27 def nextLine(): 47 def nextLine():
28 global FN, NF, NPATH, XF, XPATH 48 '''Move on to next index file if current has run out'''
29 xl=XF.readline() 49 global FN, NF, NPATH, NN, XF, XPATH, XCNT
30 if xl == b'': 50 while True:
31 # need to move to next index file 51 xl=XF.readline()
32 if NF is None: 52 XCNT += 1
33 FN = 0 53 if xl == b'':
54 # need to move to next index file
55 FN += 1
56 XF.close()
57 NF.close()
58 print(NN, flush=True) # so we can compress it
59 time.sleep(0.1) # so it flushes?
60 XN=XPATH%FN
61 if not os.path.exists(XN):
62 return
63 XF = igzip.IGzipFile(filename=XN)
64 NF = open((NN:=NPATH%FN), 'wb')
65 xl = XF.readline()
66 XCNT = 1
67 if RorDPAT.search(xl):
68 #print(xl,file=sys.stderr)
69 continue
70 return xl
71
72 def keys(key):
73 '''Deal with failure of 2019-35-vintage Java fixup to detect
74 parameter-part-initial session ids'''
75 if m:=SESSION.match(key):
76 prefix=m[1]
77 e, b = m.span(2)
78 fixed=key[:e]+key[b:]
79 if fixed==m[1]:
80 return prefix[:-1], None
34 else: 81 else:
35 FN += 1 82 return prefix, fixed
36 xn=XPATH%FN 83 else:
37 if not os.path.exists(xn): 84 return key, None
38 return
39 XF = igzip.IGzipFile(filename=xn)
40 NF = open(NPATH%FN, 'wb')
41 xl = XF.readline()
42 return xl
43 85
44 with open(sys.argv[1], 'rb') as df: 86 with open(sys.argv[1], 'rb') as df:
45 dl = df.readline() 87 dl = df.readline()
46 (dkey, ddate, dtime) = dl.split(b'\t') 88 dcnt += 1
89 dkey, ddate, dtime = dl.split(b'\t')
47 90
48 while (xl:=nextLine()) is not None: 91 while (xl:=nextLine()) is not None:
49 (xkey, xdate, xprops) = xl.split(b' ', maxsplit=2) 92 xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
50 if dkey != xkey or ddate != xdate: 93 xkey1, xkey2 = keys(xkey)
94 if (ddate != xdate or
95 not dkey.startswith(xkey1) or
96 (xkey2 is not None and dkey!=xkey2)):
97 if DEBUG and xkey.decode('ascii')>(dkey.decode('ascii')):
98 print("Fail: xkey: %s\n"
99 " dkey: %s\n"
100 " xdate: %s\n"
101 " ddate: %s\n"
102 "k1, k2: |%s|%s|\n"
103 "FN: %s\n"
104 "xl: %s"%(xkey, dkey, xdate, ddate, xkey1, xkey2, FN, xl),
105 file=sys.stderr)
106 raise ValueError()
51 NF.write(xl) 107 NF.write(xl)
52 continue 108 continue
53 NF.write(xkey) 109 NF.write(xkey)
54 NF.write(b' ') 110 NF.write(b' ')
55 NF.write(xdate) 111 NF.write(xdate)
59 dl = df.readline() 115 dl = df.readline()
60 if dl == '': 116 if dl == '':
61 # write out the last of the last index file, if any 117 # write out the last of the last index file, if any
62 dkey = ddate = None 118 dkey = ddate = None
63 else: 119 else:
64 (dkey, ddate, dtime) = dl.split(b'\t') 120 dcnt += 1
121 dkey, ddate, dtime = dl.split(b'\t')