Mercurial > hg > cc > cirrus_work
changeset 129:83a574b570a6
move most of the hacking into fixGoogleCanon,
which copies most but changes some of surt.GoogleURLCanonicalizer
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 28 Sep 2023 16:34:49 +0100 |
parents | 8dd29564cfb2 |
children | 31abd509e365 |
files | lib/python/cc/lmh/sort_date.py |
diffstat | 1 files changed, 73 insertions(+), 24 deletions(-) [+] |
line wrap: on
line diff
--- a/lib/python/cc/lmh/sort_date.py Thu Sep 28 16:10:05 2023 +0100 +++ b/lib/python/cc/lmh/sort_date.py Thu Sep 28 16:34:49 2023 +0100 @@ -56,7 +56,8 @@ # And Java strips so-called option session-ids, but python doesn't -import surt.DefaultIAURLCanonicalizer, surt.GoogleURLCanonicalizer +import surt.DefaultIAURLCanonicalizer +import surt.GoogleURLCanonicalizer as OGU import surt.URLRegexTransformer ident = ''.join(chr(i) for i in range(256)).encode('latin-1') @@ -71,21 +72,77 @@ [127] # DEL )).encode('latin-1') -def notDefaultCanon(hu,**options): - if surt.GoogleURLCanonicalizer.DECIMAL_IP.match(hu.host): - # Try to fix the incompatibility between Java and - # Python surt handling of 'octal' numbers in numeric IPv4 addresses - # and it should! See "After this line: - # - # 15,225,107,143)" in .../azure/notes.txt +# Override some key parts of GoogleURLCanonicalizer + +def escapeOnce(input, safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~'''): # ' + """escape everything outside of 32-128, except #""" + if input: + return OGU.quote_from_bytes(input, safe).encode('ascii') + else: + return input + +def fixGoogleCanon(url,**options): + '''Copied from surt.GoogleURLCanonicalizer as retrieved in September 2023 + Changes marked with <change>...</change> comments''' + url.hash = None + if url.authUser: + url.authUser = OGU.OGU.minimalEscape(url.authUser) + if url.authPass: + url.authPass = OGU.minimalEscape(url.authPass) + # <change> + if url.query: + url.query = escapeOnce(OGU.unescapeRepeatedly(url.query), + safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' #') + ) + # Wrt \x7f (DEL), see discussion in notes wrt cn,futnsz) case + url.query = url.query.translate(IDMAP,delete=NONPRINT) + # </change> + if url.host: + host = OGU.unescapeRepeatedly(url.host) try: - bytestrs = hu.host.split(b'.') - hu.host = b'.'.join(b'%d'%int(bs) for bs in bytestrs) - except ValueError: - pass - if hu.query: - hu.query = hu.query.translate(IDMAP,delete=NONPRINT) - return surt.DefaultIAURLCanonicalizer.canonicalize(hu, **options) + host.decode('ascii') + except UnicodeDecodeError: + try: + host = host.decode('utf-8', 'ignore').encode('idna') + except ValueError: + pass + + host = host.replace(b'..', b'.').strip(b'.') + + # <change> + if OGU.DECIMAL_IP.match(host): + # Try to fix the incompatibility between Java and + # Python surt handling of 'octal' numbers in numeric IPv4 addresses + # and it should! See discussion wrt 15,225,107,143) + # in .../azure/notes.txt + try: + bytestrs = host.split(b'.') + host = b'.'.join(b'%d'%int(bs) for bs in bytestrs) + except ValueError: + pass + # </change> + ip = OGU.attemptIPFormats(host) + if ip: + host = ip; + else: + host = escapeOnce(host.lower()) + + url.host = host + + path = OGU.unescapeRepeatedly(url.path) + + if url.host: + path = OGU.normalizePath(path) + # else path is free-form sort of thing, not /directory/thing + # <change> + url.path = escapeOnce(OGU.unescapeRepeatedly(path), + safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' #') + ).replace(b'\x7f',b'\\x7f') + # Wrt \x7f (DEL), see "biz,televida)" case ) + # It remains to be seen whether other non-printing bytes + # will need to be handled, which would require a regexp + # </change> + return surt.IAURLCanonicalizer.canonicalize(url, **options) # Hack this to reproduce the Java bug surt.URLRegexTransformer._RES_QUERY_SESSIONID = [ @@ -105,15 +162,7 @@ #(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", def cdx_key(uristring): - _surt = quote(unquote(surt.surt(unquote(uristring), - canonicalizer=notDefaultCanon), - errors='percent').replace('\x7f','\\x7f'), - safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~''' # ' - ).lower() - # Wrt \x7f (DEL), see discussion in notes wrt - # "biz,televida)" case - # It remains to be seen whether other non-printing bytes - # will need to be handled, which would require a regexp + _surt = surt.surt(uristring, canonicalizer=fixGoogleCanon) return WPAT.sub(')',_surt) def keyed(l):