Mercurial > hg > cc > cirrus_work
changeset 77:bf09a1d80d7b
make CC's own sorting explicit
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 18 Aug 2023 18:25:54 +0100 |
parents | eeef811f734d |
children | fef49258d738 7bbb14f6e394 |
files | bin/sort_date.py |
diffstat | 1 files changed, 14 insertions(+), 3 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/bin/sort_date.py Thu Aug 10 22:14:49 2023 +0100
+++ b/bin/sort_date.py Fri Aug 18 18:25:54 2023 +0100
@@ -6,10 +6,14 @@
 import sys
 from urllib.parse import urlsplit, unquote
 import re
+# Thanks to https://stackoverflow.com/a/8776871
+import locale
+from functools import cmp_to_key
 
 WWW=re.compile("www[0-9]*$")
 
 def auth(s):
+  #print('auth',s,file=sys.stderr)
   if '%' in s:
     kk=[(unquote(k).encode('idna')).decode('ascii') for k in s.split('.')]
   else:
@@ -24,7 +28,10 @@
   return ','.join(kk)
 
 def keyed(l):
-  uri, dateTime = l.split('\t',1)
+  uri, dateTime = l.split(b'\t',1)
+  uri=uri.decode('ascii')
+  dateTime=dateTime.decode('utf8') # occasional weird ones
+  #print('ul',uri,file=sys.stderr)
   try:
     try:
       epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
@@ -32,6 +39,7 @@
       epoch = 32535215999.0
     parts = urlsplit(uri)
     nl = parts.netloc
+    #print('nl',nl,file=sys.stderr)
     if ':' in nl:
       pa,pp=nl.split(':')
       return ('%s:%s)%s'%(auth(pa), pp, parts.path),epoch)
@@ -41,8 +49,11 @@
     print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
     return
 
-with open(sys.argv[1],"r") as ff:
+locale.setlocale(locale.LC_ALL, "C")
+ctk=cmp_to_key(locale.strcoll)
+
+with open(sys.argv[1],"rb") as ff:
   for tl in sorted((kk for l in ff if (kk:=keyed(l)) is not None),
-                  key=lambda x:x[0]):
+                  key=lambda x:ctk(x[0])):
     print(tl[0],tl[1],sep='\t')
 
```