Mercurial > hg > cc > cirrus_work
view bin/sort_date.py @ 87:df231c95e4aa
final keystroke fixes,
note _lacks_ multi-www fix, for which see sort_date.py
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 30 Aug 2023 11:10:54 +0100 |
parents | 3a2ae6057242 |
children | 49faf679d7df |
line wrap: on
line source
#!/usr/bin/python3
"""Sort tab-separated (URI, stamp, date) records by a CDX-style key.

The input file is named by the first command-line argument.  Each line is
expected to hold three tab-separated fields: a URI, a stamp (called
``cc_stamp`` in the code; presumably a crawl timestamp -- TODO confirm) and
an RFC 2822-style date string.  For every line that can be processed, one
line ``key<TAB>stamp<TAB>epoch`` is written to stdout, ordered by
(key, stamp).  Lines that cannot be processed are reported on stderr and
skipped.
"""
# Assumes you have used grep -v $'\t' on input for speed
# Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/'
#  to fix a common 'bad' timestamp (~ .2% of inputs)
import codecs
import email.utils
import locale  # Thanks to https://stackoverflow.com/a/8776871
import re
import sys
from functools import cmp_to_key
from urllib.parse import urlsplit, quote, unquote

from surt import surt

# Epoch used when a parsed date is too far out of range to be converted
# (.timestamp() raised OverflowError).  Equals 3000-12-31T23:59:59Z.
_OVERFLOW_EPOCH = 32535215999.0

# From RFC-3986:
# gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
# sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
#            / "*" / "+" / "," / ";" / "="
# But # _is_ escaped in Java surt results
#  and additionally " \ : < = > ? \ ^ _ ` { | } are not
_SAFE_CHARS = '!"$&\'()*+,/:;<=>?@[\\]^_`{|}'


def percent_encode(ude):
    """Codec error handler: turn undecodable bytes into %XX escapes.

    ``ude`` is the UnicodeDecodeError raised by the decoder.  Returns the
    (replacement text, position at which to resume decoding) pair that the
    codecs error-handler protocol requires.
    """
    bad_bytes = ude.object[ude.start:ude.end]
    # Always two hex digits, as percent-encoding requires.
    return (''.join('%%%02X' % b for b in bad_bytes), ude.end)


codecs.register_error('percent', percent_encode)


def cdx_key(uristring):
    """Return the lower-cased, percent-normalised SURT key for ``uristring``.

    The SURT form is unquoted (undecodable bytes are kept as %XX via the
    'percent' error handler), re-quoted leaving ``_SAFE_CHARS`` alone, and
    lower-cased.

    Also, Java surt strips _all_ leading 'www.', where python3 surt only
    strips the first one.  In SURT form the host labels are reversed and
    the host is terminated by ')', so any remaining 'www' labels appear as
    ',www' immediately before that ')'.  They are removed from the host
    part only; the path/query after ')' is never touched.
    """
    key = quote(unquote(surt(uristring), errors='percent'),
                safe=_SAFE_CHARS).lower()
    host, paren, rest = key.partition(')')
    if not paren:
        # No host terminator: not a host-based key, leave it unchanged.
        return key
    while host.endswith(',www'):
        host = host[:-4]
    return host + paren + rest


def keyed(l):
    """Parse one input line into a sortable record.

    ``l`` is a line of the form ``uri<TAB>cc_stamp<TAB>date``.  Returns
    ``((cdx_key(uri), cc_stamp), epoch)`` where ``epoch`` is the date as a
    float POSIX timestamp, or ``None`` (after writing a diagnostic to
    stderr) when the line is malformed or its date cannot be parsed.
    """
    try:
        uri, cc_stamp, dateTime = l.split('\t', 2)
    except ValueError as e:
        # Fewer than three fields: report and skip instead of aborting
        # the whole run.
        print(l.rstrip(), e, sep='\t', file=sys.stderr)
        return None
    try:
        try:
            epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
        except OverflowError:
            epoch = _OVERFLOW_EPOCH
        return ((cdx_key(uri), cc_stamp), epoch)
    except (TypeError, IndexError, ValueError) as e:
        print(dateTime.rstrip(), e, sep='\t', file=sys.stderr)
        return None


def main():
    """Read the file named in sys.argv[1], sort its records, print them.

    Returns the process exit status: 0 on success, 2 on a usage error.
    """
    if len(sys.argv) < 2:
        print('usage: %s INPUT' % sys.argv[0], file=sys.stderr)
        return 2
    # The encoding is given explicitly so reading the input does not depend
    # on the ambient locale being UTF-8.
    with open(sys.argv[1], 'r', encoding='utf-8') as ff:
        # Original note: crucial that the following is done _after_ the
        #  file is opened with the default (utf-8) locale!
        locale.setlocale(locale.LC_ALL, "C")
        # Plain tuple/str comparison is used for ordering, i.e. by code
        # point on (key, cc_stamp).
        records = sorted((kk for l in ff if (kk := keyed(l)) is not None),
                         key=lambda rec: rec[0])
    for (key, cc_stamp), epoch in records:
        print(key, cc_stamp, epoch, sep='\t')
    return 0


if __name__ == '__main__':
    sys.exit(main())