Mercurial > hg > cc > cirrus_work
changeset 88:49faf679d7df
final keystroke fixes, recurse and decimal www stripping
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 30 Aug 2023 11:11:31 +0100 |
parents | df231c95e4aa |
children | a62580816f1c |
files | bin/sort_date.py |
diffstat | 1 files changed, 9 insertions(+), 6 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/sort_date.py Wed Aug 30 11:10:54 2023 +0100
+++ b/bin/sort_date.py Wed Aug 30 11:11:31 2023 +0100
@@ -9,6 +9,8 @@
 
 import re, codecs
 
+WPAT = re.compile('(,www\\d*)+\\)')
+
 # Thanks to https://stackoverflow.com/a/8776871
 import locale
 from functools import cmp_to_key
@@ -27,17 +29,18 @@
 # But # _is_ escaped in Java surt results
 # and additionally " \ : < = > ? \ ^ _ ` { | } are not
 
-# Also, Java surt strips _all_ leading 'www.',
+# Note also that although quote already does _not_ quote - . / _ ~
+# they are included below as that's what we find in surt.surt 0.
+
+# Also, Java surt strips _all_ leading 'www\d*.',
 # where python3 surt only strips the first one.
 
 
 def cdx_key(uristring):
   _surt = quote(unquote(surt(uristring), errors='percent'),
-                safe='!"$&\'()*+,/:;<=>?@[\\]^_`{|}').lower()
-  while _surt.endswith(",www"):
-    _surt = _surt[:-4]
-
-  return _surt
+                safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~''' # '
+                ).lower()
+  return WPAT.sub(')',_surt)
 
 def keyed(l):
   uri, cc_stamp, dateTime = l.split('\t',2)