Mercurial > hg > cc > cirrus_work
changeset 118:9d14e7c32737
replicate two extreme corner cases of the way
Java produces surts for URIs containing escaped DEL chars
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 27 Sep 2023 17:29:09 +0100 |
parents | f52783faf3ee |
children | 1d12b51c4d59 |
files | bin/sort_date.py |
diffstat | 1 files changed, 21 insertions(+), 4 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/sort_date.py Tue Sep 26 18:55:43 2023 +0100 +++ b/bin/sort_date.py Wed Sep 27 17:29:09 2023 +0100 @@ -13,6 +13,7 @@ import surt import re, codecs +from itertools import chain WPAT = re.compile('(,www\\d*)+\\)') @@ -58,6 +59,17 @@ import surt.DefaultIAURLCanonicalizer, surt.GoogleURLCanonicalizer import surt.URLRegexTransformer +ident = ''.join(chr(i) for i in range(256)).encode('latin-1') + +IDMAP=bytes.maketrans(ident,ident) + +# For removal of non-printing characters: +# Note, this is only a guess, only example so are is DEL +NONPRINT= ''.join(chr(i) for i in chain(range(9), + range(14,32), + [127] # DEL + )).encode('latin-1') + def notDefaultCanon(hu,**options): if surt.GoogleURLCanonicalizer.DECIMAL_IP.match(hu.host): # Try to fix the incompatibility between Java and @@ -70,7 +82,8 @@ hu.host = b'.'.join(b'%d'%int(bs) for bs in bytestrs) except ValueError: pass - + if hu.query: + hu.query = hu.query.translate(IDMAP,delete=NONPRINT) return surt.DefaultIAURLCanonicalizer.canonicalize(hu, **options) # Hack this to reproduce the Java bug @@ -91,11 +104,15 @@ #(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", def cdx_key(uristring): - _surt = quote(unquote(surt.surt(uristring, + _surt = quote(unquote(surt.surt(unquote(uristring), canonicalizer=notDefaultCanon), errors='percent'), - safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~''' # ' - ).lower() + safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' # ' + ).lower() + # Wrt \x7f (DEL), see discussion in notes wrt + # "biz,televida)" case + # It remains to be seen whether other non-printing bytes + # will need to be treated as 'safe' return WPAT.sub(')',_surt) def keyed(l):