changeset 76:eeef811f734d

handle corner cases with final . and initial www..+
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 10 Aug 2023 22:14:49 +0100
parents 177f7df2bf46
children bf09a1d80d7b
files bin/sort_date.py
diffstat 1 files changed, 13 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/bin/sort_date.py	Wed Aug 09 02:01:32 2023 +0100
+++ b/bin/sort_date.py	Thu Aug 10 22:14:49 2023 +0100
@@ -5,6 +5,9 @@
 import email.utils
 import sys
 from urllib.parse import urlsplit, unquote
+import re
+
+WWW=re.compile("www[0-9]*$")
 
 def auth(s):
   if '%' in s:
@@ -12,12 +15,21 @@
   else:
     kk=s.split('.')
   kk.reverse()
+  if kk[0] == '':
+    # final full stop is pruned by CC
+    kk.pop(0)
+  while WWW.match(kk[-1]):
+    # any leading label of the form www or www<digits> is pruned (repeatedly)
+    kk.pop()
   return ','.join(kk)
 
 def keyed(l):
   uri, dateTime = l.split('\t',1)
   try:
-    epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
+    try:
+      epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
+    except OverflowError:
+      epoch = 32535215999.0
     parts = urlsplit(uri)
     nl = parts.netloc
     if ':' in nl: