changeset 88:49faf679d7df

final keystroke fixes, recurse and decimal www stripping
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 30 Aug 2023 11:11:31 +0100
parents df231c95e4aa
children a62580816f1c
files bin/sort_date.py
diffstat 1 files changed, 9 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- a/bin/sort_date.py	Wed Aug 30 11:10:54 2023 +0100
+++ b/bin/sort_date.py	Wed Aug 30 11:11:31 2023 +0100
@@ -9,6 +9,8 @@
 
 import re, codecs
 
+WPAT = re.compile('(,www\\d*)+\\)')
+
 # Thanks to https://stackoverflow.com/a/8776871
 import locale
 from functools import cmp_to_key
@@ -27,17 +29,18 @@
 # But # _is_ escaped in Java surt results
 #  and additionally " \ : < = > ? \ ^  _ ` { | } are not
 
-# Also, Java surt strips _all_ leading 'www.',
+# Note also that although quote already does _not_ quote - . / _ ~
+#  they are included below as that's what we find in surt.surt 0.
+
+# Also, Java surt strips _all_ leading 'www\d*.',
 #  where python3 surt only strips the first one.
 
 def cdx_key(uristring):
   _surt = quote(unquote(surt(uristring),
                         errors='percent'),
-                safe='!"$&\'()*+,/:;<=>?@[\\]^_`{|}').lower()
-  while _surt.endswith(",www"):
-    _surt = _surt[:-4]
-
-  return _surt
+                safe='''!"$&'()*+,-./:;<=>?@[\\]^_`{|}~''' # '
+                ).lower()
+  return WPAT.sub(')',_surt) # drop every run of ,www<digits> labels directly before a ')'
 
 def keyed(l):
   uri, cc_stamp, dateTime = l.split('\t',2)