comparison bin/merge_date.py @ 107:40c460fed99f

Still working on sessionID problems
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Sun, 17 Sep 2023 15:18:11 +0100
parents e606c609f813
children 52c6a9b0fc8c
comparison
Legend: equal | deleted | inserted | replaced
106:6104acc1345b 107:40c460fed99f
25 b'sid|jsessionid|phpsessid|aspsessionid[a-z]*)' 25 b'sid|jsessionid|phpsessid|aspsessionid[a-z]*)'
26 b'=[^&]*)') 26 b'=[^&]*)')
27 ISESSION = re.compile(SESSION.pattern,flags=re.I) 27 ISESSION = re.compile(SESSION.pattern,flags=re.I)
28 URL=re.compile(b'\{"url": "([^"]*)"') 28 URL=re.compile(b'\{"url": "([^"]*)"')
29 29
30 # Above based on this from fixed Java code: 30 # Above based on this from broken Java code:
31 #(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), 31 # https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
32 #(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), 32 #(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
33 #(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), 33 #(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
34 #(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), 34 #(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
35 #(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", 35 #(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
36 #(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
36 37
37 #print(sys.argv[3],NPATH,file=sys.stderr) 38 #print(sys.argv[3],NPATH,file=sys.stderr)
38 39
39 os.makedirs(sys.argv[3], exist_ok=True) 40 os.makedirs(sys.argv[3], exist_ok=True)
40 41
110 for xp in (xkey, xdate, xurl)))) 111 for xp in (xkey, xdate, xurl))))
111 messyU, xkey1, xkey2 = keys(xkey) 112 messyU, xkey1, xkey2 = keys(xkey)
112 if messyD: 113 if messyD:
113 noMatch = (ddate != xdate or 114 noMatch = (ddate != xdate or
114 not dkey.startswith(xkey1) or 115 not dkey.startswith(xkey1) or
115 dkey!=xkey1 or 116 (xkey2 is not None and dkey!=xkey2) or
116 durl!=xurl) 117 durl!=xurl)
117 if messyU: 118 if messyU:
118 # better match 119 # better match
119 if noMatch: 120 if noMatch:
120 raise ValueError("Fail: xkey: %s\n" 121 raise ValueError("Fail1: md: %s mu: %s\n"
122 " xkey: %s\n"
121 " dkey: %s\n" 123 " dkey: %s\n"
122 " xdate: %s\n" 124 " xdate: %s\n"
123 " ddate: %s\n" 125 " ddate: %s\n"
124 " xurl: %s\n" 126 " xurl: %s\n"
125 " durl: %s\n" 127 " durl: %s\n"
126 "dfq: %s\n" 128 "dfq: %s\n"
127 "k1, k2: |%s|%s|\n" 129 "k1, k2: |%s|%s|\n"
128 "FN: %s XCNT: %s DCNT: %s\n" 130 "FN: %s XCNT: %s DCNT: %s\n"
129 "xl: %s"%(xkey, dkey, xdate, ddate, xurl, durl, 131 "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, xurl, durl,
130 (b'\n '.join(dfq)).decode('ascii'), 132 (b'\n '.join(dfq)).decode('ascii'),
131 xkey1, xkey2, FN, XCNT, DCNT, xl)) 133 xkey1, xkey2, FN, XCNT, DCNT, xl))
132 # fall through to the ordinary (non-messy) match case 134 # fall through to the ordinary (non-messy) match case
133 else: 135 else:
134 # still looking, save if >= date else fall through to write 136 # still looking, save if >= date else fall through to write
154 not dkey.startswith(xkey1) or 156 not dkey.startswith(xkey1) or
155 (xkey2 is not None and dkey!=xkey2) or 157 (xkey2 is not None and dkey!=xkey2) or
156 durl!=xurl): 158 durl!=xurl):
157 if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')): 159 if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')):
158 160
159 raise ValueError("Fail: xkey: %s\n" 161 raise ValueError("Fail2: md: %s mu: %s\n"
162 " xkey: %s\n"
160 " dkey: %s\n" 163 " dkey: %s\n"
161 " xdate: %s\n" 164 " xdate: %s\n"
162 " ddate: %s\n" 165 " ddate: %s\n"
163 "dfq: %s\n" 166 "dfq: %s\n"
164 "k1, k2: |%s|%s|\n" 167 "k1, k2: |%s|%s|\n"
165 "FN: %s XCNT: %s DCNT: %s\n" 168 "FN: %s XCNT: %s DCNT: %s\n"
166 "xl: %s"%(xkey, dkey, xdate, ddate, 169 "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate,
167 (b'\n '.join(dfq)).decode('ascii'), 170 (b'\n '.join(dfq)).decode('ascii'),
168 xkey1, xkey2, FN, XCNT, DCNT, xl)) 171 xkey1, xkey2, FN, XCNT, DCNT, xl))
169 NF.write(xl) 172 NF.write(xl)
170 if DEBUG>1: 173 if DEBUG>1:
171 sys.stderr.write("out_nl\n") 174 sys.stderr.write("out_nl\n")