changeset 146:83fbd652a014

iterate WPAT fix with improved pattern
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 03 Oct 2023 17:44:59 +0100
parents 170844e51987
children 1c2f477fb064
files lib/python/cc/lmh/sort_date.py
diffstat 1 files changed, 4 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/lmh/sort_date.py	Tue Oct 03 17:43:52 2023 +0100
+++ b/lib/python/cc/lmh/sort_date.py	Tue Oct 03 17:44:59 2023 +0100
@@ -16,7 +16,7 @@
 import re, codecs
 from itertools import chain
 
-WPAT = re.compile('(,www\\d*)+\\)')
+WPAT = re.compile(r'(,www\d*)(:\d+)?\)')
 
 # Thanks to https://stackoverflow.com/a/8776871
 import locale
@@ -188,7 +188,9 @@
   if '\\u' in uristring:
     uristring=bytes(uristring,'utf-8').decode('unicode_escape')
   _surt = surt.surt(uristring, canonicalizer=fixGoogleCanon)
-  return WPAT.sub(')',_surt)
+  while WPAT.search(_surt):
+    _surt = WPAT.sub(r'\2)',_surt)
+  return _surt
 
 def keyed(l):
   uri, cc_stamp, dateTime = l.split('\t',2)