changeset 118:9d14e7c32737

replicate two extreme corner cases of the way Java produces surts for URIs containing escaped DEL chars
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 27 Sep 2023 17:29:09 +0100
parents f52783faf3ee
children 1d12b51c4d59
files bin/sort_date.py
diffstat 1 files changed, 21 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/bin/sort_date.py	Tue Sep 26 18:55:43 2023 +0100
+++ b/bin/sort_date.py	Wed Sep 27 17:29:09 2023 +0100
@@ -13,6 +13,7 @@
 import surt
 
 import re, codecs
+from itertools import chain
 
 WPAT = re.compile('(,www\\d*)+\\)')
 
@@ -58,6 +59,17 @@
 import surt.DefaultIAURLCanonicalizer, surt.GoogleURLCanonicalizer
 import surt.URLRegexTransformer
 
+ident = ''.join(chr(i) for i in range(256)).encode('latin-1')
+
+IDMAP=bytes.maketrans(ident,ident)
+
+# For removal of non-printing characters:
+#  Note, this is only a guess; the only example so far is DEL
+NONPRINT= ''.join(chr(i) for i in chain(range(9),
+                                      range(14,32),
+                                      [127] # DEL
+                                      )).encode('latin-1')
+
 def notDefaultCanon(hu,**options):
   if surt.GoogleURLCanonicalizer.DECIMAL_IP.match(hu.host):
     # Try to fix the incompatibility between Java and 
@@ -70,7 +82,8 @@
       hu.host = b'.'.join(b'%d'%int(bs) for bs in bytestrs)
     except ValueError:
       pass
-
+  if hu.query:
+    hu.query = hu.query.translate(IDMAP,delete=NONPRINT)
   return surt.DefaultIAURLCanonicalizer.canonicalize(hu, **options)
 
 # Hack this to reproduce the Java bug
@@ -91,11 +104,15 @@
 #(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
 
 def cdx_key(uristring):
-  _surt = quote(unquote(surt.surt(uristring,
+  _surt = quote(unquote(surt.surt(unquote(uristring),
                                   canonicalizer=notDefaultCanon),
                         errors='percent'),
-                safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~''' # '
-                ).lower()
+                safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' # '
+                  ).lower()
+                # Wrt \x7f (DEL), see discussion in notes wrt
+                #   "biz,televida)" case
+                # It remains to be seen whether other non-printing bytes
+                #  will need to be treated as 'safe'
   return WPAT.sub(')',_surt)
 
 def keyed(l):