Mercurial > hg > cc > cirrus_work
comparison bin/merge_date.py @ 100:18446a7eeb9e
rework handling of session key problem
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 11 Sep 2023 12:56:47 +0100 |
parents | 009e633eb804 |
children | e2e64c3d763e |
comparison
equal
deleted
inserted
replaced
99:4c65ae2a4bc3 | 100:18446a7eeb9e |
---|---|
8 ''' # ' | 8 ''' # ' |
9 | 9 |
10 import sys, io, os, os.path, time, re | 10 import sys, io, os, os.path, time, re |
11 from isal import igzip | 11 from isal import igzip |
12 | 12 |
13 if sys.argv[1] == '-d': | 13 |
14 DEBUG = 0 | |
15 while sys.argv[1] == '-d': | |
14 sys.argv.pop(1) | 16 sys.argv.pop(1) |
15 DEBUG = True | 17 DEBUG += 1 |
16 else: | |
17 DEBUG = False | |
18 | 18 |
19 XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2] | 19 XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2] |
20 NPATH = "%s/cdx-00%%0.3d"%sys.argv[3] | 20 NPATH = "%s/cdx-00%%0.3d"%sys.argv[3] |
21 | 21 |
22 RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/' | 22 RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/' |
23 b'(crawldiagnostics|robotstxt)/') | 23 b'(crawldiagnostics|robotstxt)/') |
24 SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|' | 24 SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|' |
25 b'sid|jsessionid|phpsessid|aspsessionid[a-z]*)' | 25 b'sid|jsessionid|phpsessid|aspsessionid[a-z]*)' |
26 b'=[^&]*)') | 26 b'=[^&]*)') |
27 ISESSION = re.compile(SESSION.pattern,flags=re.I) | |
28 URL=re.compile(b'\{"url": "([^"]*)"') | |
27 | 29 |
28 # Above based on this from fixed Java code: | 30 # Above based on this from fixed Java code: |
29 #(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), | 31 #(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), |
30 #(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), | 32 #(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), |
31 #(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), | 33 #(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), |
42 DCNT = 0 | 44 DCNT = 0 |
43 | 45 |
44 XF = igzip.IGzipFile(filename=XPATH%0) | 46 XF = igzip.IGzipFile(filename=XPATH%0) |
45 NF = open(NN:=(NPATH%0),'wb') | 47 NF = open(NN:=(NPATH%0),'wb') |
46 | 48 |
47 def nextLine(): | 49 def nextLine(xq, messyD): |
48 '''Move on to next index file if current has run out''' | 50 '''Move on to next index file if current has run out''' |
49 global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT | 51 global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT |
52 if xq and not messyD: | |
53 return xq.pop(0), xq | |
50 while True: | 54 while True: |
51 xl=XF.readline() | 55 xl=XF.readline() |
52 XCNT += 1 | 56 XCNT += 1 |
53 if xl == b'': | 57 if xl == b'': |
54 # need to move to next index file | 58 # need to move to next index file |
55 FN += 1 | 59 FN += 1 |
56 DCNT=0 # this is relative to FN | |
57 XF.close() | 60 XF.close() |
58 NF.close() | 61 NF.close() |
59 print(NN, flush=True) # so we can compress it | 62 print(NN, flush=True) # so we can compress it |
60 time.sleep(0.1) # so it flushes? | 63 time.sleep(0.1) # so it flushes? |
61 XN=XPATH%FN | 64 XN=XPATH%FN |
63 return | 66 return |
64 XF = igzip.IGzipFile(filename=XN) | 67 XF = igzip.IGzipFile(filename=XN) |
65 NF = open((NN:=NPATH%FN), 'wb') | 68 NF = open((NN:=NPATH%FN), 'wb') |
66 xl = XF.readline() | 69 xl = XF.readline() |
67 XCNT = 1 | 70 XCNT = 1 |
68 return xl | 71 return xl, xq |
69 | 72 |
70 def keys(key): | 73 def keys(key): |
71 '''Deal with failure of 2019-35-vintage Java fixup to detect | 74 '''Deal with failure of 2019-35-vintage Java fixup to detect |
72 parameter-part-initial session ids''' | 75 parameter-part-initial session ids''' |
73 if m:=SESSION.match(key): | 76 if m:=SESSION.match(key): |
80 return True, prefix, fixed | 83 return True, prefix, fixed |
81 else: | 84 else: |
82 return False, key, None | 85 return False, key, None |
83 | 86 |
84 dfq = [] # for reordering if needed | 87 dfq = [] # for reordering if needed |
88 messyD = False | |
85 | 89 |
86 with open(sys.argv[1], 'rb') as df: | 90 with open(sys.argv[1], 'rb') as df: |
87 dl = df.readline() | 91 dl = df.readline() |
88 DCNT = 1 | 92 DCNT = 1 |
89 dkey, ddate, dtime = dl.split(b'\t') | 93 if DEBUG>1: |
94 sys.stderr.write("dl1: %s"%dl.decode('ascii')) | |
95 dkey, ddate, durl, dtime = dl.split(b'\t') | |
96 messyD = ISESSION.search(durl) | |
90 | 97 |
91 while (xl:=nextLine()) is not None: | 98 xq = [] |
99 | |
100 while (nlRes := nextLine(xq, messyD))[0] is not None: | |
101 (xl, xq) = nlRes | |
92 xkey, xdate, xprops = xl.split(b' ', maxsplit=2) | 102 xkey, xdate, xprops = xl.split(b' ', maxsplit=2) |
93 messy, xkey1, xkey2 = keys(xkey) | 103 m = URL.match(xprops) |
94 if messy: | 104 if m: |
95 stale=dfq | 105 xurl = m[1] |
96 dfq=[] | 106 else: |
97 while (dkey.startswith(xkey1) and | 107 raise ValueError("No url in %s"%xprops) |
98 (ddate!=xdate or (xkey2 is not None and dkey!=xkey2))): | 108 if DEBUG>1: |
99 dfq.append(dl) | 109 sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii') |
100 if stale: | 110 for xp in (xkey, xdate, xurl)))) |
101 dl = stale.pop(0) | 111 messyU, xkey1, xkey2 = keys(xkey) |
102 else: | 112 if messyD: |
103 dl = df.readline() | 113 if messyU: |
104 DCNT += 1 | 114 # better match |
105 dkey, ddate, dtime = dl.split(b'\t') | 115 if (ddate != xdate or |
116 not dkey.startswith(xkey1) or | |
117 dkey!=xkey1 or | |
118 durl!=xurl): | |
119 raise ValueError("Fail: xkey: %s\n" | |
120 " dkey: %s\n" | |
121 " xdate: %s\n" | |
122 " ddate: %s\n" | |
123 " xurl: %s\n" | |
124 " durl: %s\n" | |
125 "dfq: %s\n" | |
126 "k1, k2: |%s|%s|\n" | |
127 "FN: %s XCNT: %s DCNT: %s\n" | |
128 "xl: %s"%(xkey, dkey, xdate, ddate, xurl, durl, | |
129 (b'\n '.join(dfq)).decode('ascii'), | |
130 xkey1, xkey2, FN, XCNT, DCNT, xl)) | |
131 messyD = False | |
132 # fall through to the ordinary (non-messy) match case | |
133 else: | |
134 # still looking, save this one | |
135 if DEBUG: | |
136 print("Diso: xkey: %s\n" | |
137 " dkey: %s\n" | |
138 " xdate: %s\n" | |
139 " ddate: %s\n" | |
140 " xurl: %s\n" | |
141 " durl: %s\n" | |
142 "xl: %s"%(xkey, dkey, xdate, ddate, xurl, durl, xl), | |
143 file=sys.stderr) | |
144 xq.append(xl) | |
145 if DEBUG>1: | |
146 sys.stderr.write('xpush\n') | |
147 continue | |
148 else: | |
149 # Not messyD | |
150 if messyU: | |
151 raise ValueError("messyD w/o messyU") | |
106 if (ddate != xdate or | 152 if (ddate != xdate or |
107 not dkey.startswith(xkey1) or | 153 not dkey.startswith(xkey1) or |
108 (xkey2 is not None and dkey!=xkey2)): | 154 (xkey2 is not None and dkey!=xkey2) or |
109 if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')): | 155 durl!=xurl): |
110 print("Fail: xkey: %s\n" | |
111 " dkey: %s\n" | |
112 " xdate: %s\n" | |
113 " ddate: %s\n" | |
114 "dfq: %s\n" | |
115 "k1, k2: |%s|%s|\n" | |
116 "FN: %s XCNT: %s DCNT: %s\n" | |
117 "xl: %s"%(xkey, dkey, xdate, ddate, | |
118 (b'\n '.join(dfq)).decode('ascii'), | |
119 xkey1, xkey2, FN, XCNT, DCNT, xl), | |
120 file=sys.stderr) | |
121 raise ValueError | |
122 NF.write(xl) | 156 NF.write(xl) |
123 continue | 157 continue |
158 # Got it | |
124 NF.write(xkey) | 159 NF.write(xkey) |
125 NF.write(b' ') | 160 NF.write(b' ') |
126 NF.write(xdate) | 161 NF.write(xdate) |
127 NF.write(b' ') | 162 NF.write(b' ') |
128 NF.write(xprops[:-2]) | 163 NF.write(xprops[:-2]) |
129 NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3])) | 164 NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3])) |
165 if DEBUG>1: | |
166 sys.stderr.write("out: %s"%(' '.join(xp.decode('ascii') | |
167 for xp in (xkey, xdate, xurl)))) | |
168 sys.stderr.write(" %d\n"%int(dtime[:-3])) | |
130 dl = df.readline() | 169 dl = df.readline() |
131 if dl == '': | 170 if dl == '': |
132 if dfq: | 171 if dfq: |
133 if DEBUG: | 172 if DEBUG: |
134 breakpoint() | 173 raise ValueError |
135 # write out the last of the last index file, if any | 174 # write out the last of the last index file, if any |
136 dkey = ddate = "" | 175 dkey = ddate = durl = "" |
137 else: | 176 else: |
177 if DEBUG>1: | |
178 sys.stderr.write("dl3: %s"%dl.decode('ascii')) | |
138 DCNT += 1 | 179 DCNT += 1 |
139 dkey, ddate, dtime = dl.split(b'\t') | 180 dkey, ddate, durl, dtime = dl.split(b'\t') |
181 messyD = ISESSION.search(durl) | |