changeset 129:83a574b570a6

move most of the hacking into fixGoogleCanon, which copies most but changes some of surt.GoogleURLCanonicalizer
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 28 Sep 2023 16:34:49 +0100
parents 8dd29564cfb2
children 31abd509e365
files lib/python/cc/lmh/sort_date.py
diffstat 1 files changed, 73 insertions(+), 24 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/lmh/sort_date.py	Thu Sep 28 16:10:05 2023 +0100
+++ b/lib/python/cc/lmh/sort_date.py	Thu Sep 28 16:34:49 2023 +0100
@@ -56,7 +56,8 @@
 
 # And Java strips so-called option session-ids, but python doesn't
 
-import surt.DefaultIAURLCanonicalizer, surt.GoogleURLCanonicalizer
+import surt.DefaultIAURLCanonicalizer
+import surt.GoogleURLCanonicalizer as OGU
 import surt.URLRegexTransformer
 
 ident = ''.join(chr(i) for i in range(256)).encode('latin-1')
@@ -71,21 +72,77 @@
                         [127] # DEL
                         )).encode('latin-1')
 
-def notDefaultCanon(hu,**options):
-  if surt.GoogleURLCanonicalizer.DECIMAL_IP.match(hu.host):
-    # Try to fix the incompatibility between Java and 
-    #  Python surt handling of 'octal' numbers in numeric IPv4 addresses
-    #  and it should!  See "After this line:
-    # 
-    # 15,225,107,143)" in .../azure/notes.txt
+# Override some key parts of GoogleURLCanonicalizer
+
+def escapeOnce(input, safe=b'''!"$&'()*+,-./:;<=>?@[\\]^_`{|}~'''): # '
+    """Percent-escape input via OGU.quote_from_bytes, leaving the bytes in safe alone; falsy input is returned as is"""
+    if input:
+        return OGU.quote_from_bytes(input, safe).encode('ascii')
+    else:
+        return input
+
+def fixGoogleCanon(url,**options):
+  '''Canonicalise url in place, then hand it to surt.IAURLCanonicalizer.canonicalize.
+  Copied from surt.GoogleURLCanonicalizer (Sept. 2023); local changes are marked <change>...</change>'''
+  url.hash = None
+  if url.authUser:
+    url.authUser = OGU.minimalEscape(url.authUser)
+  if url.authPass:
+    url.authPass = OGU.minimalEscape(url.authPass)
+  # <change>
+  if url.query:
+    url.query = escapeOnce(OGU.unescapeRepeatedly(url.query),
+                           safe=b'''!"$&'()*+,-./:;<=>?@[\\]^_`{|}~\x7f''' #')
+                )
+    # Wrt \x7f (DEL), see discussion in notes wrt cn,futnsz) case
+    url.query = url.query.translate(IDMAP,delete=NONPRINT)
+  # </change>
+  if url.host:
+    host = OGU.unescapeRepeatedly(url.host)
     try:
-      bytestrs = hu.host.split(b'.')
-      hu.host = b'.'.join(b'%d'%int(bs) for bs in bytestrs)
-    except ValueError:
-      pass
-  if hu.query:
-    hu.query = hu.query.translate(IDMAP,delete=NONPRINT)
-  return surt.DefaultIAURLCanonicalizer.canonicalize(hu, **options)
+      host.decode('ascii')
+    except UnicodeDecodeError:
+      try:
+        host = host.decode('utf-8', 'ignore').encode('idna')
+      except ValueError:
+        pass
+
+    host = host.replace(b'..', b'.').strip(b'.')
+
+    # <change>
+    if OGU.DECIMAL_IP.match(host):
+      # Work around an incompatibility between the Java and Python surt
+      #  handling of 'octal'-looking numbers in numeric IPv4 addresses:
+      #  int() strips leading zeros from each component.  See discussion
+      #  wrt 15,225,107,143) in .../azure/notes.txt
+      try:
+        bytestrs = host.split(b'.')
+        host = b'.'.join(b'%d'%int(bs) for bs in bytestrs)
+      except ValueError:
+        pass
+    # </change>
+    ip = OGU.attemptIPFormats(host)
+    if ip:
+      host = ip
+    else:
+      host = escapeOnce(host.lower())
+
+    url.host = host  # stored for every host, not only numeric ones
+
+  path = OGU.unescapeRepeatedly(url.path)  # done even when there is no host
+
+  if url.host:
+    path = OGU.normalizePath(path)
+  # else path is free-form sort of thing, not /directory/thing
+  # <change>
+  url.path = escapeOnce(OGU.unescapeRepeatedly(path),
+                        safe=b'''!"$&'()*+,-./:;<=>?@[\\]^_`{|}~\x7f'''  #')
+                        ).replace(b'\x7f',b'\\x7f')
+  # Wrt \x7f (DEL), see "biz,televida)" case: DEL is kept as the literal text \x7f
+  # It remains to be seen whether other non-printing bytes
+  #  will need to be handled, which would require a regexp
+  # </change>
+  return surt.IAURLCanonicalizer.canonicalize(url, **options)
 
 # Hack this to reproduce the Java bug
 surt.URLRegexTransformer._RES_QUERY_SESSIONID = [
@@ -105,15 +162,7 @@
 #(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
 
 def cdx_key(uristring):
-  _surt = quote(unquote(surt.surt(unquote(uristring),
-                                  canonicalizer=notDefaultCanon),
-                        errors='percent').replace('\x7f','\\x7f'),
-                safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~''' # '
-                  ).lower()
-                # Wrt \x7f (DEL), see discussion in notes wrt
-                #   "biz,televida)" case
-                # It remains to be seen whether other non-printing bytes
-                #  will need to be handled, which would require a regexp
+  _surt = surt.surt(uristring, canonicalizer=fixGoogleCanon)  # quoting/DEL fix-ups are delegated to fixGoogleCanon
   return WPAT.sub(')',_surt)
 
 def keyed(l):