Mercurial > hg > cc > cirrus_work
changeset 73:e8c667bf8965
compute timestamps, key and sort lmh lines
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 08 Aug 2023 17:47:27 +0100 |
parents | fd9bcd759606 |
children | 432915a28952 |
files | bin/sort_date.py |
diffstat | 1 files changed, 30 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/python3
# Origin: bin/sort_date.py, added (from /dev/null) in changeset 73:e8c667bf8965
# Assumes you have used grep -v $'\t' on input for speed
# Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/'
#  to fix a common 'bad' timestamp (~ .2% of inputs)
"""Compute timestamps, key and sort lines of the form URI<TAB>date.

Each input line is turned into a (key, epoch) pair, where the key is the
URI's host with its dot-separated labels reversed and comma-joined,
followed by an optional ':port', a ')' and the URI path.  The pairs are
sorted by key and written to stdout as key<TAB>epoch.

Usage: sort_date.py INPUT_FILE
"""
import email.utils
import sys
from datetime import timezone
from operator import itemgetter
from urllib.parse import urlsplit


def keyed(l):
    """Return (key, epoch) for one 'URI<TAB>date' line, or None on failure.

    l -- one input line: a URI, a tab, then an RFC 5322 / HTTP style date
         (a trailing newline is tolerated).

    The key is 'tld,domain,...[:port])path'; epoch is the date as a POSIX
    timestamp (float).  Lines that cannot be processed are reported on
    stderr and yield None, so one bad line never aborts a whole run.
    """
    uri, tab, dateTime = l.partition('\t')
    if not tab:
        # Without this guard the unpacking of a tab-less line raised an
        # uncaught ValueError and killed the entire sort.
        print(l.rstrip(), 'no tab separator', sep='\t', file=sys.stderr)
        return None
    try:
        when = email.utils.parsedate_to_datetime(dateTime)
        if when.tzinfo is None:
            # A '-0000' or missing zone yields a naive datetime, which
            # .timestamp() would interpret in the *local* zone of whatever
            # machine runs this.  RFC-conforming dates of that form are UTC.
            when = when.replace(tzinfo=timezone.utc)
        epoch = when.timestamp()
        parts = urlsplit(uri)
        nl = parts.netloc
        if ':' in nl:
            # Exactly one ':' is expected (host:port); anything else
            # (userinfo, IPv6 literal) raises here and is reported below.
            pa, pp = nl.split(':')
            host = ','.join(reversed(pa.split('.')))
            return (f'{host}:{pp}){parts.path}', epoch)
        host = ','.join(reversed(nl.split('.')))
        return (f'{host}){parts.path}', epoch)
    except Exception as e:
        # Deliberately broad: this is a best-effort filter over noisy crawl
        # data, and the failure modes (bad date, bad URI) vary by Python
        # version.  Report the offending date and carry on.
        print(dateTime.rstrip(), e, sep='\t', file=sys.stderr)
        return None


def main(argv):
    """Sort the file named by argv[1] by key and print key<TAB>epoch lines."""
    if len(argv) < 2:
        print('usage: %s INPUT_FILE' % argv[0], file=sys.stderr)
        return 2
    with open(argv[1], "r") as ff:
        rows = sorted((kk for l in ff if (kk := keyed(l)) is not None),
                      key=itemgetter(0))
    for key, epoch in rows:
        print(key, epoch, sep='\t')
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))