changeset 126:1eb6cd49452d

found right place for \x7f hack, maybe
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 28 Sep 2023 14:08:36 +0100
parents d864700913b8
children 9c63ff510cc9
files lib/python/cc/lmh/sort_date.py
diffstat 1 files changed, 10 insertions(+), 9 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/lmh/sort_date.py	Thu Sep 28 14:06:11 2023 +0100
+++ b/lib/python/cc/lmh/sort_date.py	Thu Sep 28 14:08:36 2023 +0100
@@ -23,7 +23,7 @@
 
 def percent_encode(ude):
   #print(ude.object,ude.object[ude.start:ude.end])
-  return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]),
+  return (''.join('%%%x'%c for c in ude.object[ude.start:ude.end]),
           ude.end)
 
 codecs.register_error('percent',percent_encode)
@@ -65,10 +65,11 @@
 
 # For removal of non-printing characters:
 #  Note, this is only a guess; the only example so far is DEL
-NONPRINT= ''.join(chr(i) for i in chain(range(9),
-                                      range(14,32),
-                                      [127] # DEL
-                                      )).encode('latin-1')
+NONPRINT= ''.join(chr(i) for i in
+                  chain(range(9),
+                        range(14,32),
+                        [127] # DEL
+                        )).encode('latin-1')
 
 def notDefaultCanon(hu,**options):
   if surt.GoogleURLCanonicalizer.DECIMAL_IP.match(hu.host):
@@ -106,13 +107,13 @@
 def cdx_key(uristring):
   _surt = quote(unquote(surt.surt(unquote(uristring),
                                   canonicalizer=notDefaultCanon),
-                        errors='percent'),
-                safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' # '
+                        errors='percent').replace('\x7f','\\x7f'),
+                safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~''' # '
                   ).lower()
                 # Wrt \x7f (DEL), see discussion in notes wrt
                 #   "biz,televida)" case
                 # It remains to be seen whether other non-printing bytes
-                #  will need to be treated as 'safe'
+                #  will need to be handled, which would require a regexp
   return WPAT.sub(')',_surt)
 
 def keyed(l):
@@ -138,7 +139,7 @@
     for key, ts in sorted((kk for l in ff if (kk:=keyed(l)) is not None),
                      key=lambda x:x[0]):
       print(key[0],key[1],
-            key[2].encode('ascii',errors='java_unicode').decode('ascii'),
+            key[2].encode('ascii', errors='java_unicode').decode('ascii'),
             ts,sep='\t')
 
 if __name__ == '__main__':