changeset 133:3682ef4d2169

get 7f (two cases) and %25 working
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 28 Sep 2023 18:31:23 +0100
parents 417d2986c99c
children adabcffc7d68
files lib/python/cc/lmh/sort_date.py
diffstat 1 files changed, 25 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/lmh/sort_date.py	Thu Sep 28 18:30:48 2023 +0100
+++ b/lib/python/cc/lmh/sort_date.py	Thu Sep 28 18:31:23 2023 +0100
@@ -72,6 +72,13 @@
                         [127] # DEL
                         )).encode('latin-1')
 
+def percent_encode(ude):
+  # Codec error handler: replace each offending byte with '%' + uppercase hex
+  return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]),  # NOTE(review): %X is unpadded, a byte < 0x10 would give one digit
+          ude.end)  # second element: position at which the codec resumes
+
+codecs.register_error('percent',percent_encode)  # makes errors='percent' available to decode()
+
 # Override some key parts of GoogleURLCanonicalizer
 
 def escapeOnce(input, safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~'''): # '
@@ -89,14 +96,23 @@
     url.authUser = OGU.OGU.minimalEscape(url.authUser)
   if url.authPass:
     url.authPass = OGU.minimalEscape(url.authPass)
-  # <change>
   if url.query:
-    url.query = escapeOnce(OGU.unescapeRepeatedly(url.query),
-                           safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' #')
+    # <change>
+    query = escapeOnce(OGU.unescapeRepeatedly(url.query),
+                       safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' #')
                 )
     # Wrt \x7f (DEL), see discussion in notes wrt cn,futnsz) case
-    url.query = url.query.translate(IDMAP,delete=NONPRINT)
-  # </change>
+    query = query.translate(IDMAP,delete=NONPRINT)
+    # </change>
+    # <change>
+    # Double-escape non-unicode %-encodings
+    # Surely this could be simpler!
+    url.query = quote(OGU.unquote_to_bytes(query).decode('utf-8',
+                                                         errors='percent'),
+                      safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f'''  #'
+                        ).encode('ascii')
+    # </change>
+  
   if url.host:
     host = OGU.unescapeRepeatedly(url.host)
     try:
@@ -136,12 +152,14 @@
     # else path is free-form sort of thing, not /directory/thing
     # <change>
     url.path = escapeOnce(OGU.unescapeRepeatedly(path),
-                          safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f'''  #')
-                          ).replace(b'\x7f',b'\\x7f')
+                      safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f'''  #')
+                      ).replace(b'\x7f',b'\\x7f')
     # Wrt \x7f (DEL), see "biz,televida)" case                        )
     # It remains to be seen whether other non-printing bytes
     #  will need to be handled, which would require a regexp
     # </change>
+    # <change>
+    # </change>
   return surt.IAURLCanonicalizer.canonicalize(url, **options)
 
 # Hack this to reproduce the Java bug