# HG changeset patch # User Henry S. Thompson # Date 1695906516 -3600 # Node ID 1eb6cd49452d67441c1d9bfa449c76992dbb0cf3 # Parent d864700913b850897309acb2f91c19f1df90e0db found right place for \x7f hack, maybe diff -r d864700913b8 -r 1eb6cd49452d lib/python/cc/lmh/sort_date.py --- a/lib/python/cc/lmh/sort_date.py Thu Sep 28 14:06:11 2023 +0100 +++ b/lib/python/cc/lmh/sort_date.py Thu Sep 28 14:08:36 2023 +0100 @@ -23,7 +23,7 @@ def percent_encode(ude): #print(ude.object,ude.object[ude.start:ude.end]) - return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]), + return (''.join('%%%x'%c for c in ude.object[ude.start:ude.end]), ude.end) codecs.register_error('percent',percent_encode) @@ -65,10 +65,11 @@ # For removal of non-printing characters: # Note, this is only a guess, only example so are is DEL -NONPRINT= ''.join(chr(i) for i in chain(range(9), - range(14,32), - [127] # DEL - )).encode('latin-1') +NONPRINT= ''.join(chr(i) for i in + chain(range(9), + range(14,32), + [127] # DEL + )).encode('latin-1') def notDefaultCanon(hu,**options): if surt.GoogleURLCanonicalizer.DECIMAL_IP.match(hu.host): @@ -106,13 +107,13 @@ def cdx_key(uristring): _surt = quote(unquote(surt.surt(unquote(uristring), canonicalizer=notDefaultCanon), - errors='percent'), - safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' # ' + errors='percent').replace('\x7f','\\x7f'), + safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~''' # ' ).lower() # Wrt \x7f (DEL), see discussion in notes wrt # "biz,televida)" case # It remains to be seen whether other non-printing bytes - # will need to be treated as 'safe' + # will need to be handled, which would require a regexp return WPAT.sub(')',_surt) def keyed(l): @@ -138,7 +139,7 @@ for key, ts in sorted((kk for l in ff if (kk:=keyed(l)) is not None), key=lambda x:x[0]): print(key[0],key[1], - key[2].encode('ascii',errors='java_unicode').decode('ascii'), + key[2].encode('ascii', errors='java_unicode').decode('ascii'), ts,sep='\t') if __name__ == '__main__':