# HG changeset patch # User Henry S. Thompson # Date 1695832149 -3600 # Node ID 9d14e7c32737f544bfbaf67e7229138a56f5cd15 # Parent f52783faf3ee96afbacb995259ea0bea5e40a7bd replicate two extremely-corner cases of the way Java produces surts for URIs containing escaped DEL chars diff -r f52783faf3ee -r 9d14e7c32737 bin/sort_date.py --- a/bin/sort_date.py Tue Sep 26 18:55:43 2023 +0100 +++ b/bin/sort_date.py Wed Sep 27 17:29:09 2023 +0100 @@ -13,6 +13,7 @@ import surt import re, codecs +from itertools import chain WPAT = re.compile('(,www\\d*)+\\)') @@ -58,6 +59,17 @@ import surt.DefaultIAURLCanonicalizer, surt.GoogleURLCanonicalizer import surt.URLRegexTransformer +ident = ''.join(chr(i) for i in range(256)).encode('latin-1') + +IDMAP=bytes.maketrans(ident,ident) + +# For removal of non-printing characters: +# Note, this is only a guess, only example so far is DEL +NONPRINT= ''.join(chr(i) for i in chain(range(9), + range(14,32), + [127] # DEL + )).encode('latin-1') + def notDefaultCanon(hu,**options): if surt.GoogleURLCanonicalizer.DECIMAL_IP.match(hu.host): # Try to fix the incompatibility between Java and @@ -70,7 +82,8 @@ hu.host = b'.'.join(b'%d'%int(bs) for bs in bytestrs) except ValueError: pass - + if hu.query: + hu.query = hu.query.translate(IDMAP,delete=NONPRINT) return surt.DefaultIAURLCanonicalizer.canonicalize(hu, **options) # Hack this to reproduce the Java bug @@ -91,11 +104,15 @@ #(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", def cdx_key(uristring): - _surt = quote(unquote(surt.surt(uristring, + _surt = quote(unquote(surt.surt(unquote(uristring), canonicalizer=notDefaultCanon), errors='percent'), - safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~''' # ' - ).lower() + safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' # ' + ).lower() + # Wrt \x7f (DEL), see discussion in notes wrt + # "biz,televida)" case + # It remains to be seen whether other non-printing bytes + # will need to be treated as 'safe' return WPAT.sub(')',_surt) def keyed(l):