Mercurial > hg > cc > cirrus_work
changeset 75:177f7df2bf46
handle %-encoded utf-8 as idna
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 09 Aug 2023 02:01:32 +0100 |
parents | 432915a28952 |
children | eeef811f734d |
files | bin/sort_date.py |
diffstat | 1 files changed, 12 insertions(+), 6 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/bin/sort_date.py Tue Aug 08 17:48:29 2023 +0100
+++ b/bin/sort_date.py Wed Aug 09 02:01:32 2023 +0100
@@ -4,7 +4,15 @@
 # to fix a common 'bad' timestamp (~ .2% of inputs)
 import email.utils
 import sys
-from urllib.parse import urlsplit
+from urllib.parse import urlsplit, unquote
+
+def auth(s):
+  if '%' in s:
+    kk=[(unquote(k).encode('idna')).decode('ascii') for k in s.split('.')]
+  else:
+    kk=s.split('.')
+  kk.reverse()
+  return ','.join(kk)
 
 def keyed(l):
   uri, dateTime = l.split('\t',1)
@@ -14,12 +22,10 @@
     nl = parts.netloc
     if ':' in nl:
       pa,pp=nl.split(':')
-      (kk:=pa.split('.')).reverse()
-      return ('%s:%s)%s'%(','.join(kk), pp, parts.path),epoch)
+      return ('%s:%s)%s'%(auth(pa), pp, parts.path),epoch)
     else:
-      (kk:=nl.split('.')).reverse()
-      return ('%s)%s'%(','.join(kk), parts.path),epoch)
-  except Exception as e:
+      return ('%s)%s'%(auth(nl), parts.path),epoch)
+  except (TypeError,IndexError,ValueError) as e:
     print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
     return
 
```