comparison bin/merge_date.py @ 100:18446a7eeb9e

rework handling of session key problem
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 11 Sep 2023 12:56:47 +0100
parents 009e633eb804
children e2e64c3d763e
comparison
equal deleted inserted replaced
99:4c65ae2a4bc3 100:18446a7eeb9e
8 ''' # ' 8 ''' # '
9 9
10 import sys, io, os, os.path, time, re 10 import sys, io, os, os.path, time, re
11 from isal import igzip 11 from isal import igzip
12 12
13 if sys.argv[1] == '-d': 13
14 DEBUG = 0
15 while sys.argv[1] == '-d':
14 sys.argv.pop(1) 16 sys.argv.pop(1)
15 DEBUG = True 17 DEBUG += 1
16 else:
17 DEBUG = False
18 18
19 XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2] 19 XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2]
20 NPATH = "%s/cdx-00%%0.3d"%sys.argv[3] 20 NPATH = "%s/cdx-00%%0.3d"%sys.argv[3]
21 21
22 RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/' 22 RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
23 b'(crawldiagnostics|robotstxt)/') 23 b'(crawldiagnostics|robotstxt)/')
24 SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|' 24 SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|'
25 b'sid|jsessionid|phpsessid|aspsessionid[a-z]*)' 25 b'sid|jsessionid|phpsessid|aspsessionid[a-z]*)'
26 b'=[^&]*)') 26 b'=[^&]*)')
27 ISESSION = re.compile(SESSION.pattern,flags=re.I)
28 URL=re.compile(b'\{"url": "([^"]*)"')
27 29
28 # Above based on this from fixed Java code: 30 # Above based on this from fixed Java code:
29 #(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), 31 #(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
30 #(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), 32 #(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
31 #(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), 33 #(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
42 DCNT = 0 44 DCNT = 0
43 45
44 XF = igzip.IGzipFile(filename=XPATH%0) 46 XF = igzip.IGzipFile(filename=XPATH%0)
45 NF = open(NN:=(NPATH%0),'wb') 47 NF = open(NN:=(NPATH%0),'wb')
46 48
47 def nextLine(): 49 def nextLine(xq, messyD):
48 '''Move on to next index file if current has run out''' 50 '''Move on to next index file if current has run out'''
49 global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT 51 global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT
52 if xq and not messyD:
53 return xq.pop(0), xq
50 while True: 54 while True:
51 xl=XF.readline() 55 xl=XF.readline()
52 XCNT += 1 56 XCNT += 1
53 if xl == b'': 57 if xl == b'':
54 # need to move to next index file 58 # need to move to next index file
55 FN += 1 59 FN += 1
56 DCNT=0 # this is relative to FN
57 XF.close() 60 XF.close()
58 NF.close() 61 NF.close()
59 print(NN, flush=True) # so we can compress it 62 print(NN, flush=True) # so we can compress it
60 time.sleep(0.1) # so it flushes? 63 time.sleep(0.1) # so it flushes?
61 XN=XPATH%FN 64 XN=XPATH%FN
63 return 66 return
64 XF = igzip.IGzipFile(filename=XN) 67 XF = igzip.IGzipFile(filename=XN)
65 NF = open((NN:=NPATH%FN), 'wb') 68 NF = open((NN:=NPATH%FN), 'wb')
66 xl = XF.readline() 69 xl = XF.readline()
67 XCNT = 1 70 XCNT = 1
68 return xl 71 return xl, xq
69 72
70 def keys(key): 73 def keys(key):
71 '''Deal with failure of 2019-35-vintage Java fixup to detect 74 '''Deal with failure of 2019-35-vintage Java fixup to detect
72 parameter-part-initial session ids''' 75 parameter-part-initial session ids'''
73 if m:=SESSION.match(key): 76 if m:=SESSION.match(key):
80 return True, prefix, fixed 83 return True, prefix, fixed
81 else: 84 else:
82 return False, key, None 85 return False, key, None
83 86
84 dfq = [] # for reordering if needed 87 dfq = [] # for reordering if needed
88 messyD = False
85 89
86 with open(sys.argv[1], 'rb') as df: 90 with open(sys.argv[1], 'rb') as df:
87 dl = df.readline() 91 dl = df.readline()
88 DCNT = 1 92 DCNT = 1
89 dkey, ddate, dtime = dl.split(b'\t') 93 if DEBUG>1:
94 sys.stderr.write("dl1: %s"%dl.decode('ascii'))
95 dkey, ddate, durl, dtime = dl.split(b'\t')
96 messyD = ISESSION.search(durl)
90 97
91 while (xl:=nextLine()) is not None: 98 xq = []
99
100 while (nlRes := nextLine(xq, messyD))[0] is not None:
101 (xl, xq) = nlRes
92 xkey, xdate, xprops = xl.split(b' ', maxsplit=2) 102 xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
93 messy, xkey1, xkey2 = keys(xkey) 103 m = URL.match(xprops)
94 if messy: 104 if m:
95 stale=dfq 105 xurl = m[1]
96 dfq=[] 106 else:
97 while (dkey.startswith(xkey1) and 107 raise ValueError("No url in %s"%xprops)
98 (ddate!=xdate or (xkey2 is not None and dkey!=xkey2))): 108 if DEBUG>1:
99 dfq.append(dl) 109 sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii')
100 if stale: 110 for xp in (xkey, xdate, xurl))))
101 dl = stale.pop(0) 111 messyU, xkey1, xkey2 = keys(xkey)
102 else: 112 if messyD:
103 dl = df.readline() 113 if messyU:
104 DCNT += 1 114 # better match
105 dkey, ddate, dtime = dl.split(b'\t') 115 if (ddate != xdate or
116 not dkey.startswith(xkey1) or
117 dkey!=xkey1 or
118 durl!=xurl):
119 raise ValueError("Fail: xkey: %s\n"
120 " dkey: %s\n"
121 " xdate: %s\n"
122 " ddate: %s\n"
123 " xurl: %s\n"
124 " durl: %s\n"
125 "dfq: %s\n"
126 "k1, k2: |%s|%s|\n"
127 "FN: %s XCNT: %s DCNT: %s\n"
128 "xl: %s"%(xkey, dkey, xdate, ddate, xurl, durl,
129 (b'\n '.join(dfq)).decode('ascii'),
130 xkey1, xkey2, FN, XCNT, DCNT, xl))
131 messyD = False
132 # fall through to the ordinary (non-messy) match case
133 else:
134 # still looking, save this one
135 if DEBUG:
136 print("Diso: xkey: %s\n"
137 " dkey: %s\n"
138 " xdate: %s\n"
139 " ddate: %s\n"
140 " xurl: %s\n"
141 " durl: %s\n"
142 "xl: %s"%(xkey, dkey, xdate, ddate, xurl, durl, xl),
143 file=sys.stderr)
144 xq.append(xl)
145 if DEBUG>1:
146 sys.stderr.write('xpush\n')
147 continue
148 else:
149 # Not messyD
150 if messyU:
151 raise ValueError("messyD w/o messyU")
106 if (ddate != xdate or 152 if (ddate != xdate or
107 not dkey.startswith(xkey1) or 153 not dkey.startswith(xkey1) or
108 (xkey2 is not None and dkey!=xkey2)): 154 (xkey2 is not None and dkey!=xkey2) or
109 if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')): 155 durl!=xurl):
110 print("Fail: xkey: %s\n"
111 " dkey: %s\n"
112 " xdate: %s\n"
113 " ddate: %s\n"
114 "dfq: %s\n"
115 "k1, k2: |%s|%s|\n"
116 "FN: %s XCNT: %s DCNT: %s\n"
117 "xl: %s"%(xkey, dkey, xdate, ddate,
118 (b'\n '.join(dfq)).decode('ascii'),
119 xkey1, xkey2, FN, XCNT, DCNT, xl),
120 file=sys.stderr)
121 raise ValueError
122 NF.write(xl) 156 NF.write(xl)
123 continue 157 continue
158 # Got it
124 NF.write(xkey) 159 NF.write(xkey)
125 NF.write(b' ') 160 NF.write(b' ')
126 NF.write(xdate) 161 NF.write(xdate)
127 NF.write(b' ') 162 NF.write(b' ')
128 NF.write(xprops[:-2]) 163 NF.write(xprops[:-2])
129 NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3])) 164 NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
165 if DEBUG>1:
166 sys.stderr.write("out: %s"%(' '.join(xp.decode('ascii')
167 for xp in (xkey, xdate, xurl))))
168 sys.stderr.write(" %d\n"%int(dtime[:-3]))
130 dl = df.readline() 169 dl = df.readline()
131 if dl == '': 170 if dl == '':
132 if dfq: 171 if dfq:
133 if DEBUG: 172 if DEBUG:
134 breakpoint() 173 raise ValueError
135 # write out the last of the last index file, if any 174 # write out the last of the last index file, if any
136 dkey = ddate = "" 175 dkey = ddate = durl = ""
137 else: 176 else:
177 if DEBUG>1:
178 sys.stderr.write("dl3: %s"%dl.decode('ascii'))
138 DCNT += 1 179 DCNT += 1
139 dkey, ddate, dtime = dl.split(b'\t') 180 dkey, ddate, durl, dtime = dl.split(b'\t')
181 messyD = ISESSION.search(durl)