Mercurial > hg > cc > cirrus_work
changeset 80:db3c689175fe
catching up by hand with markup version,
adding query string
author | Henry Thompson <ht@markup.co.uk> |
---|---|
date | Sat, 19 Aug 2023 15:53:59 -0400 |
parents | e8f89aaa07c1 |
children | e115f2e89af6 |
files | bin/sort_date.py |
diffstat | 1 files changed, 60 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/python3
"""Sort <URI, timestamp> lines by a host-reversed URI key.

Usage: sort_date.py FILE

Each input line is ``URI<TAB>date-time``.  For every line that can be
parsed, a key of the form ``tld,domain[,sub...][:port])path[?query]`` and
the date converted to seconds since the epoch are computed; the results
are written to stdout as ``key<TAB>epoch``, sorted by key using C-locale
collation.  Lines that cannot be parsed are reported on stderr and skipped.
"""
# Assumes you have used grep -v $'\t' on input for speed
# Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/'
#  to fix a common 'bad' timestamp (~ .2% of inputs)
import email.utils
import sys
from urllib.parse import urlsplit, unquote
import re
# Thanks to https://stackoverflow.com/a/8776871
import locale
from functools import cmp_to_key

# A host label that is exactly "www" optionally followed by digits
WWW = re.compile(r"www[0-9]*$")


def auth(s):
    """Return the host name *s* as reversed, comma-separated labels.

    ``www.example.com`` becomes ``com,example``.  If *s* contains a ``%``,
    every label is percent-decoded and then IDNA-encoded first.

    Raises IndexError when no label is left (empty host, or a host made up
    only of ``www...`` labels); ``keyed`` treats that as an unusable line.
    """
    if '%' in s:
        kk = [(unquote(k).encode('idna')).decode('ascii') for k in s.split('.')]
    else:
        kk = s.split('.')
    kk.reverse()
    if kk[0] == '':
        # final full stop is pruned by CC
        kk.pop(0)
    while WWW.match(kk[-1]):
        # any www... prefix is pruned
        kk.pop()
    return ','.join(kk)


def keyed(l):
    """Turn one raw input line into a ``(key, epoch)`` pair.

    *l* is a bytes line of the form ``URI<TAB>date-time``.  The URI must be
    ASCII; the date-time is decoded as UTF-8 and parsed as an RFC 2822 style
    date.

    Returns ``(key, epoch)`` where *key* is
    ``auth(host)[:port])path[?query]`` and *epoch* is a float timestamp, or
    None (after writing a diagnostic to stderr) if the line is unusable.
    """
    try:
        uri, dateTime = l.split(b'\t', 1)
        uri = uri.decode('ascii')
        dateTime = dateTime.decode('utf8')  # occasional weird ones
    except ValueError as e:
        # No tab separator, or undecodable bytes (UnicodeDecodeError is a
        # ValueError): skip the line like any other bad input rather than
        # aborting the whole sort.
        print(l.rstrip().decode('utf8', 'replace'), e, sep='\t', file=sys.stderr)
        return None
    try:
        try:
            epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
        except OverflowError:
            # Date out of range: substitute a fixed far-future timestamp
            # (appears to be the last second of year 3000 UTC -- confirm)
            epoch = 32535215999.0
        parts = urlsplit(uri)
        nl = parts.netloc
        pq = '?%s' % parts.query if parts.query else ''
        if ':' in nl:
            # More than one ':' (e.g. userinfo or IPv6 literal) raises
            # ValueError here and the line is skipped below.
            pa, pp = nl.split(':')
            return ('%s:%s)%s%s' % (auth(pa), pp, parts.path, pq), epoch)
        else:
            return ('%s)%s%s' % (auth(nl), parts.path, pq), epoch)
    except (TypeError, IndexError, ValueError) as e:
        print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
        return None


# Compare keys bytewise (C locale) via strcoll
locale.setlocale(locale.LC_ALL, "C")
ctk = cmp_to_key(locale.strcoll)


def main():
    """Read the file named by ``sys.argv[1]`` and print its sorted keyed lines."""
    with open(sys.argv[1], "rb") as ff:
        for tl in sorted((kk for l in ff if (kk := keyed(l)) is not None),
                         key=lambda x: ctk(x[0])):
            print(tl[0], tl[1], sep='\t')


if __name__ == "__main__":
    main()