Mercurial > hg > cc > cirrus_work
changeset 108:9f7a35bf07f9
one more sid fix,
match java unicode escaping in path
author | Henry Thompson <ht@markup.co.uk> |
---|---|
date | Tue, 19 Sep 2023 19:28:34 +0100 |
parents | 40c460fed99f |
children | 52c6a9b0fc8c |
files | bin/sort_date.py |
diffstat | 1 files changed, 16 insertions(+), 2 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/bin/sort_date.py Sun Sep 17 15:18:11 2023 +0100
+++ b/bin/sort_date.py Tue Sep 19 19:28:34 2023 +0100
@@ -27,6 +27,19 @@
 
 codecs.register_error('percent',percent_encode)
 
+def _u_esc(c):
+  if c<65536:
+    return '\\u%04X'%c
+  else:
+    return '\\U%08X'%c
+
+def java_unicode_encode(ude):
+  '''like backslashreplace but use uppercase and \ u00NN instead of \ xnn'''
+  return (''.join(_u_esc(ord(c)) for c in ude.object[ude.start:ude.end]),
+          ude.end)
+
+codecs.register_error('java_unicode',java_unicode_encode)
+
 # From RFC-3986:
 # gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
 # sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
@@ -64,7 +77,7 @@
 surt.URLRegexTransformer._RES_QUERY_SESSIONID = [
   re.compile(b"(.*&)(?:jsessionid=[0-9a-z]{32})(?:&(.*))?$", re.I),
   re.compile(b"(.*&)(?:phpsessid=[0-9a-z]{32})(?:&(.*))?$", re.I),
-  re.compile(b"(.*&[a-z]*)(?:sid=[0-9a-z]{32})(?:&(.*))?$", re.I),
+  re.compile(b"(.*[a-z]*)(?:sid=[0-9a-z]{32})(?:&(.*))?$", re.I),
   re.compile(b"(.*&)(?:aspsessionid[a-z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I),
   re.compile(b"(.*&)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I),
 ]
@@ -107,5 +120,6 @@
 ctk=cmp_to_key(locale.strcoll)
 for key, ts in sorted((kk for l in ff if (kk:=keyed(l)) is not None),
                       key=lambda x:x[0]):
-  print(key[0],key[1],key[2],ts,sep='\t')
+  print(key[0],key[1],key[2].encode('ascii',errors=java_unicode)
+        ,ts,sep='\t')
 
```