Mercurial > hg > cc > cirrus_work
view bin/sort_date.py @ 78:fef49258d738
include query
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Sun, 20 Aug 2023 00:28:43 +0100 |
parents | bf09a1d80d7b |
children | c18c307cc325 |
line wrap: on
line source
#!/usr/bin/python3
"""Sort (URI, timestamp) lines by a reversed-authority key.

Usage: sort_date.py INPUT

Each line of INPUT is ``<uri>TAB<RFC 2822-style date>``.  For every line
that can be parsed, one line ``<key>TAB<epoch seconds>`` is written to
stdout, sorted by key under the "C" locale.  Lines whose date or URI
cannot be processed are reported on stderr and omitted from the output.
"""
# Assumes you have used grep -v $'\t' on input for speed
# Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/'
#  to fix a common 'bad' timestamp (~ .2% of inputs)
import email.utils
import sys
from urllib.parse import urlsplit, unquote
import re

# Thanks to https://stackoverflow.com/a/8776871
import locale
from functools import cmp_to_key

# Matches a host label of the form www, www1, www23, ... (anchored at both
# ends because it is applied with .match() and ends in '$').
WWW = re.compile("www[0-9]*$")

# Fallback epoch used when the timestamp conversion overflows:
# 3000-12-31T23:59:59Z.
_OVERFLOW_EPOCH = 32535215999.0


def auth(s):
    """Return the host name *s* as comma-joined labels in reversed order.

    ``www.example.com`` becomes ``com,example``.  If *s* contains a ``%``,
    every label is percent-decoded and then IDNA-encoded first.  An empty
    leading element produced by a trailing full stop is dropped, as is any
    run of ``www``/``wwwN`` labels at the (original) front of the name.

    Raises IndexError if no labels remain (e.g. for ``''`` or ``'www'``);
    keyed() relies on that to reject such hosts.
    """
    #print('auth',s,file=sys.stderr)
    if '%' in s:
        kk = [(unquote(k).encode('idna')).decode('ascii')
              for k in s.split('.')]
    else:
        kk = s.split('.')
    kk.reverse()
    if kk[0] == '':
        # final full stop is pruned by CC
        kk.pop(0)
    while WWW.match(kk[-1]):
        # any www... prefix is pruned
        kk.pop()
    return ','.join(kk)


def keyed(l):
    """Turn one input line into a ``(sort_key, epoch)`` pair.

    *l* is a bytes line of the form ``uri TAB date``.  The sort key is
    ``<auth(host)>[:port])<path>[?query]`` and the epoch is the date as a
    float POSIX timestamp.

    Returns None (after printing the date and the error to stderr) when the
    date or the URI cannot be processed.
    """
    uri, dateTime = l.split(b'\t', 1)
    uri = uri.decode('ascii')
    dateTime = dateTime.decode('utf8')  # occasional weird ones
    #print('ul',uri,file=sys.stderr)
    try:
        try:
            epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
        except OverflowError:
            epoch = _OVERFLOW_EPOCH
        parts = urlsplit(uri)
        nl = parts.netloc
        pq = '?%s' % parts.query if parts.query else ''
        #print('nl',nl,file=sys.stderr)
        if ':' in nl:
            # More than one ':' makes this unpacking raise ValueError,
            # which is handled below like any other bad line.
            pa, pp = nl.split(':')
            return ('%s:%s)%s%s' % (auth(pa), pp, parts.path, pq), epoch)
        # Three placeholders for three values: with only two, this branch
        # raised TypeError on every call and all port-less URIs were dropped.
        return ('%s)%s%s' % (auth(nl), parts.path, pq), epoch)
    except (TypeError, IndexError, ValueError) as e:
        print(dateTime.rstrip(), e, sep='\t', file=sys.stderr)
        return None


def main():
    """Read the file named by sys.argv[1] and print its sorted key/epoch lines."""
    # Use the "C" locale so the ordering does not depend on the user's
    # locale settings.
    locale.setlocale(locale.LC_ALL, "C")
    ctk = cmp_to_key(locale.strcoll)
    with open(sys.argv[1], "rb") as ff:
        rows = sorted((kk for l in ff if (kk := keyed(l)) is not None),
                      key=lambda x: ctk(x[0]))
    for tl in rows:
        print(tl[0], tl[1], sep='\t')


if __name__ == "__main__":
    main()