changeset 75:177f7df2bf46

handle %-encoded utf-8 as idna
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 09 Aug 2023 02:01:32 +0100
parents 432915a28952
children eeef811f734d
files bin/sort_date.py
diffstat 1 files changed, 12 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- a/bin/sort_date.py	Tue Aug 08 17:48:29 2023 +0100
+++ b/bin/sort_date.py	Wed Aug 09 02:01:32 2023 +0100
@@ -4,7 +4,15 @@
 #  to fix a common 'bad' timestamp (~ .2% of inputs)
 import email.utils
 import sys
-from urllib.parse import urlsplit
+from urllib.parse import urlsplit, unquote
+
+def auth(s):
+  if '%' in s:
+    kk=[(unquote(k).encode('idna')).decode('ascii') for k in s.split('.')]
+  else:
+    kk=s.split('.')
+  kk.reverse()
+  return ','.join(kk)
 
 def keyed(l):
   uri, dateTime = l.split('\t',1)
@@ -14,12 +22,10 @@
     nl = parts.netloc
     if ':' in nl:
       pa,pp=nl.split(':')
-      (kk:=pa.split('.')).reverse()
-      return ('%s:%s)%s'%(','.join(kk), pp, parts.path),epoch)
+      return ('%s:%s)%s'%(auth(pa), pp, parts.path),epoch)
     else:
-      (kk:=nl.split('.')).reverse()
-      return ('%s)%s'%(','.join(kk), parts.path),epoch)
-  except Exception as e:
+      return ('%s)%s'%(auth(nl), parts.path),epoch)
+  except (TypeError,IndexError,ValueError) as e:
     print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
     return