Mercurial > hg > cc > cirrus_work
comparison bin/merge_date.py @ 107:40c460fed99f
working on sessionID pblms, still
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Sun, 17 Sep 2023 15:18:11 +0100 |
parents | e606c609f813 |
children | 52c6a9b0fc8c |
comparison
equal
deleted
inserted
replaced
106:6104acc1345b | 107:40c460fed99f |
---|---|
25 b'sid|jsessionid|phpsessid|aspsessionid[a-z]*)' | 25 b'sid|jsessionid|phpsessid|aspsessionid[a-z]*)' |
26 b'=[^&]*)') | 26 b'=[^&]*)') |
27 ISESSION = re.compile(SESSION.pattern,flags=re.I) | 27 ISESSION = re.compile(SESSION.pattern,flags=re.I) |
28 URL=re.compile(b'\{"url": "([^"]*)"') | 28 URL=re.compile(b'\{"url": "([^"]*)"') |
29 | 29 |
30 # Above based on this from fixed Java code: | 30 # Above based on this from broken Java code: |
31 #(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), | 31 # https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7 |
32 #(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), | 32 #(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), |
33 #(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), | 33 #(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), |
34 #(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), | 34 #(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), |
35 #(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", | 35 #(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), |
 | 36 #(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", |
36 | 37 |
37 #print(sys.argv[3],NPATH,file=sys.stderr) | 38 #print(sys.argv[3],NPATH,file=sys.stderr) |
38 | 39 |
39 os.makedirs(sys.argv[3], exist_ok=True) | 40 os.makedirs(sys.argv[3], exist_ok=True) |
40 | 41 |
110 for xp in (xkey, xdate, xurl)))) | 111 for xp in (xkey, xdate, xurl)))) |
111 messyU, xkey1, xkey2 = keys(xkey) | 112 messyU, xkey1, xkey2 = keys(xkey) |
112 if messyD: | 113 if messyD: |
113 noMatch = (ddate != xdate or | 114 noMatch = (ddate != xdate or |
114 not dkey.startswith(xkey1) or | 115 not dkey.startswith(xkey1) or |
115 dkey!=xkey1 or | 116 (xkey2 is not None and dkey!=xkey2) or |
116 durl!=xurl) | 117 durl!=xurl) |
117 if messyU: | 118 if messyU: |
118 # better match | 119 # better match |
119 if noMatch: | 120 if noMatch: |
120 raise ValueError("Fail: xkey: %s\n" | 121 raise ValueError("Fail1: md: %s mu: %s\n" |
 | 122 " xkey: %s\n" |
121 " dkey: %s\n" | 123 " dkey: %s\n" |
122 " xdate: %s\n" | 124 " xdate: %s\n" |
123 " ddate: %s\n" | 125 " ddate: %s\n" |
124 " xurl: %s\n" | 126 " xurl: %s\n" |
125 " durl: %s\n" | 127 " durl: %s\n" |
126 "dfq: %s\n" | 128 "dfq: %s\n" |
127 "k1, k2: |%s|%s|\n" | 129 "k1, k2: |%s|%s|\n" |
128 "FN: %s XCNT: %s DCNT: %s\n" | 130 "FN: %s XCNT: %s DCNT: %s\n" |
129 "xl: %s"%(xkey, dkey, xdate, ddate, xurl, durl, | 131 "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, xurl, durl, |
130 (b'\n '.join(dfq)).decode('ascii'), | 132 (b'\n '.join(dfq)).decode('ascii'), |
131 xkey1, xkey2, FN, XCNT, DCNT, xl)) | 133 xkey1, xkey2, FN, XCNT, DCNT, xl)) |
132 # fall through to the ordinary (non-messy) match case | 134 # fall through to the ordinary (non-messy) match case |
133 else: | 135 else: |
134 # still looking, save if >= date else fall through to write | 136 # still looking, save if >= date else fall through to write |
154 not dkey.startswith(xkey1) or | 156 not dkey.startswith(xkey1) or |
155 (xkey2 is not None and dkey!=xkey2) or | 157 (xkey2 is not None and dkey!=xkey2) or |
156 durl!=xurl): | 158 durl!=xurl): |
157 if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')): | 159 if DEBUG and dkey and xkey.decode('ascii')>(dkey.decode('ascii')): |
158 | 160 |
159 raise ValueError("Fail: xkey: %s\n" | 161 raise ValueError("Fail2: md: %s mu: %s\n" |
 | 162 " xkey: %s\n" |
160 " dkey: %s\n" | 163 " dkey: %s\n" |
161 " xdate: %s\n" | 164 " xdate: %s\n" |
162 " ddate: %s\n" | 165 " ddate: %s\n" |
163 "dfq: %s\n" | 166 "dfq: %s\n" |
164 "k1, k2: |%s|%s|\n" | 167 "k1, k2: |%s|%s|\n" |
165 "FN: %s XCNT: %s DCNT: %s\n" | 168 "FN: %s XCNT: %s DCNT: %s\n" |
166 "xl: %s"%(xkey, dkey, xdate, ddate, | 169 "xl: %s"%(messyD, messyU, xkey, dkey, xdate, ddate, |
167 (b'\n '.join(dfq)).decode('ascii'), | 170 (b'\n '.join(dfq)).decode('ascii'), |
168 xkey1, xkey2, FN, XCNT, DCNT, xl)) | 171 xkey1, xkey2, FN, XCNT, DCNT, xl)) |
169 NF.write(xl) | 172 NF.write(xl) |
170 if DEBUG>1: | 173 if DEBUG>1: |
171 sys.stderr.write("out_nl\n") | 174 sys.stderr.write("out_nl\n") |