Mercurial > hg > cc > cirrus_work
changeset 76:eeef811f734d
handle corner cases with final . and initial www..+
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 10 Aug 2023 22:14:49 +0100 |
parents | 177f7df2bf46 |
children | bf09a1d80d7b |
files | bin/sort_date.py |
diffstat | 1 files changed, 13 insertions(+), 1 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/sort_date.py Wed Aug 09 02:01:32 2023 +0100 +++ b/bin/sort_date.py Thu Aug 10 22:14:49 2023 +0100 @@ -5,6 +5,9 @@ import email.utils import sys from urllib.parse import urlsplit, unquote +import re + +WWW=re.compile("www[0-9]*$") def auth(s): if '%' in s: @@ -12,12 +15,21 @@ else: kk=s.split('.') kk.reverse() + if kk[0] == '': + # final full stop is pruned by CC + kk.pop(0) + while WWW.match(kk[-1]): + # any www... prefix is pruned + kk.pop() return ','.join(kk) def keyed(l): uri, dateTime = l.split('\t',1) try: - epoch = email.utils.parsedate_to_datetime(dateTime).timestamp() + try: + epoch = email.utils.parsedate_to_datetime(dateTime).timestamp() + except OverflowError: + epoch = 32535215999.0 parts = urlsplit(uri) nl = parts.netloc if ':' in nl: