# HG changeset patch # User Henry S. Thompson # Date 1695922283 -3600 # Node ID 3682ef4d21690bd00adeb897d374c6ebcd242051 # Parent 417d2986c99c73cc4ce81cf6c148dbb61c747d48 get 7f (two cases) and %25 working diff -r 417d2986c99c -r 3682ef4d2169 lib/python/cc/lmh/sort_date.py --- a/lib/python/cc/lmh/sort_date.py Thu Sep 28 18:30:48 2023 +0100 +++ b/lib/python/cc/lmh/sort_date.py Thu Sep 28 18:31:23 2023 +0100 @@ -72,6 +72,13 @@ [127] # DEL )).encode('latin-1') +def percent_encode(ude): + #print(ude.object,ude.object[ude.start:ude.end]) + return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]), + ude.end) + +codecs.register_error('percent',percent_encode) + # Override some key parts of GoogleURLCanonicalizer def escapeOnce(input, safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~'''): # ' @@ -89,14 +96,23 @@ url.authUser = OGU.OGU.minimalEscape(url.authUser) if url.authPass: url.authPass = OGU.minimalEscape(url.authPass) - # if url.query: - url.query = escapeOnce(OGU.unescapeRepeatedly(url.query), - safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' #') + # + query = escapeOnce(OGU.unescapeRepeatedly(url.query), + safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' #') ) # Wrt \x7f (DEL), see discussion in notes wrt cn,futnsz) case - url.query = url.query.translate(IDMAP,delete=NONPRINT) - # + query = query.translate(IDMAP,delete=NONPRINT) + # + # + # Double-escape non-unicode %-encodings + # Surely this could be simpler! + url.query = quote(OGU.unquote_to_bytes(query).decode('utf-8', + errors='percent'), + safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' #' + ).encode('ascii') + # + if url.host: host = OGU.unescapeRepeatedly(url.host) try: @@ -136,12 +152,14 @@ # else path is free-form sort of thing, not /directory/thing # url.path = escapeOnce(OGU.unescapeRepeatedly(path), - safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' #') - ).replace(b'\x7f',b'\\x7f') + safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' #') + ).replace(b'\x7f',b'\\x7f') # Wrt \x7f (DEL), see "biz,televida)" case ) # It remains to be seen whether other non-printing bytes # will need to be handled, which would require a regexp # + # + # return surt.IAURLCanonicalizer.canonicalize(url, **options) # Hack this to reproduce the Java bug