changeset 108:9f7a35bf07f9

one more sid fix, match java unicode escaping in path
author Henry Thompson <ht@markup.co.uk>
date Tue, 19 Sep 2023 19:28:34 +0100
parents 40c460fed99f
children 52c6a9b0fc8c
files bin/sort_date.py
diffstat 1 files changed, 16 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/bin/sort_date.py	Sun Sep 17 15:18:11 2023 +0100
+++ b/bin/sort_date.py	Tue Sep 19 19:28:34 2023 +0100
@@ -27,6 +27,19 @@
 
 codecs.register_error('percent',percent_encode)
 
+def _u_esc(c): # c: integer code point (callers pass ord(ch)); returns an ASCII escape string
+  if c<65536:
+    return '\\u%04X'%c # below U+10000: backslash-u plus 4 uppercase hex digits
+  else:
+    return '\\U%08X'%c # NOTE(review): Java has no \U escape (it writes a \uXXXX surrogate pair) -- confirm this is the form wanted
+
+def java_unicode_encode(ude): # ude: the encode error object passed to a codecs error handler
+  r'''Encode error handler: like backslashreplace, but emit uppercase \uNNNN (\UNNNNNNNN from U+10000 up) instead of \xnn'''
+  return (''.join(_u_esc(ord(c)) for c in ude.object[ude.start:ude.end]),
+          ude.end) # (replacement text, index at which encoding resumes)
+
+codecs.register_error('java_unicode',java_unicode_encode) # select it by name: errors='java_unicode' (a string)
+
 # From RFC-3986:
 # gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
 # sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
@@ -64,7 +77,7 @@
 surt.URLRegexTransformer._RES_QUERY_SESSIONID = [
     re.compile(b"(.*&)(?:jsessionid=[0-9a-z]{32})(?:&(.*))?$", re.I),
     re.compile(b"(.*&)(?:phpsessid=[0-9a-z]{32})(?:&(.*))?$", re.I),
-    re.compile(b"(.*&[a-z]*)(?:sid=[0-9a-z]{32})(?:&(.*))?$", re.I),
+    re.compile(b"(.*[a-z]*)(?:sid=[0-9a-z]{32})(?:&(.*))?$", re.I), # no '&' required before ...sid=, so it also matches at the very start
     re.compile(b"(.*&)(?:aspsessionid[a-z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I),
     re.compile(b"(.*&)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I),
     ]
@@ -107,5 +120,6 @@
   ctk=cmp_to_key(locale.strcoll)
   for key, ts in sorted((kk for l in ff if (kk:=keyed(l)) is not None),
                    key=lambda x:x[0]):
-    print(key[0],key[1],key[2],ts,sep='\t')
+    print(key[0],key[1],key[2].encode('ascii',errors='java_unicode').decode('ascii') # handler name must be a string; decode so print emits text, not b'...'
+          ,ts,sep='\t')