diff bin/sort_date.py @ 77:bf09a1d80d7b

make CC's own sorting explicit
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 18 Aug 2023 18:25:54 +0100
parents eeef811f734d
children fef49258d738 7bbb14f6e394
line wrap: on
line diff
--- a/bin/sort_date.py	Thu Aug 10 22:14:49 2023 +0100
+++ b/bin/sort_date.py	Fri Aug 18 18:25:54 2023 +0100
@@ -6,10 +6,14 @@
 import sys
 from urllib.parse import urlsplit, unquote
 import re
+# Collation-based sort key via cmp_to_key(locale.strcoll); thanks to https://stackoverflow.com/a/8776871
+import locale
+from functools import cmp_to_key
 
 WWW=re.compile("www[0-9]*$")
 
 def auth(s):
+  #print('auth',s,file=sys.stderr)
   if '%' in s:
     kk=[(unquote(k).encode('idna')).decode('ascii') for k in s.split('.')]
   else:
@@ -24,7 +28,10 @@
   return ','.join(kk)
 
 def keyed(l):
-  uri, dateTime = l.split('\t',1)
+  uri, dateTime = l.split(b'\t',1)
+  uri=uri.decode('ascii')
+  dateTime=dateTime.decode('utf8') # occasional weird ones
+  #print('ul',uri,file=sys.stderr)
   try:
     try:
       epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
@@ -32,6 +39,7 @@
       epoch = 32535215999.0
     parts = urlsplit(uri)
     nl = parts.netloc
+    #print('nl',nl,file=sys.stderr)
     if ':' in nl:
       pa,pp=nl.split(':')
       return ('%s:%s)%s'%(auth(pa), pp, parts.path),epoch)
@@ -41,8 +49,11 @@
     print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
     return
 
-with open(sys.argv[1],"r") as ff:
+locale.setlocale(locale.LC_ALL, "C")
+ctk=cmp_to_key(locale.strcoll)
+
+with open(sys.argv[1],"rb") as ff:
   for tl in sorted((kk for l in ff if (kk:=keyed(l)) is not None),
-                   key=lambda x:x[0]):
+                   key=lambda x:ctk(x[0])):
     print(tl[0],tl[1],sep='\t')