Mercurial > hg > cc > cirrus_work
changeset 133:3682ef4d2169
get 7f (two cases) and %25 working
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 28 Sep 2023 18:31:23 +0100 |
parents | 417d2986c99c |
children | adabcffc7d68 |
files | lib/python/cc/lmh/sort_date.py |
diffstat | 1 files changed, 25 insertions(+), 7 deletions(-) [+] |
line wrap: on
line diff
--- a/lib/python/cc/lmh/sort_date.py Thu Sep 28 18:30:48 2023 +0100 +++ b/lib/python/cc/lmh/sort_date.py Thu Sep 28 18:31:23 2023 +0100 @@ -72,6 +72,13 @@ [127] # DEL )).encode('latin-1') +def percent_encode(ude): + #print(ude.object,ude.object[ude.start:ude.end]) + return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]), + ude.end) + +codecs.register_error('percent',percent_encode) + # Override some key parts of GoogleURLCanonicalizer def escapeOnce(input, safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~'''): # ' @@ -89,14 +96,23 @@ url.authUser = OGU.OGU.minimalEscape(url.authUser) if url.authPass: url.authPass = OGU.minimalEscape(url.authPass) - # <change> if url.query: - url.query = escapeOnce(OGU.unescapeRepeatedly(url.query), - safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' #') + # <change> + query = escapeOnce(OGU.unescapeRepeatedly(url.query), + safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' #') ) # Wrt \x7f (DEL), see discussion in notes wrt cn,futnsz) case - url.query = url.query.translate(IDMAP,delete=NONPRINT) - # </change> + query = query.translate(IDMAP,delete=NONPRINT) + # </change> + # <change> + # Double-escape non-unicode %-encodings + # Surely this could be simpler! + url.query = quote(OGU.unquote_to_bytes(query).decode('utf-8', + errors='percent'), + safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' #' + ).encode('ascii') + # </change> + if url.host: host = OGU.unescapeRepeatedly(url.host) try: @@ -136,12 +152,14 @@ # else path is free-form sort of thing, not /directory/thing # <change> url.path = escapeOnce(OGU.unescapeRepeatedly(path), - safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' #') - ).replace(b'\x7f',b'\\x7f') + safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' #') + ).replace(b'\x7f',b'\\x7f') # Wrt \x7f (DEL), see "biz,televida)" case ) # It remains to be seen whether other non-printing bytes # will need to be handled, which would require a regexp # </change> + # <change> + # </change> return surt.IAURLCanonicalizer.canonicalize(url, **options) # Hack this to reproduce the Java bug