# HG changeset patch # User Henry Thompson # Date 1692637580 14400 # Node ID c18c307cc3254a63dd2c791c4ff193f6c0ab3fe9 # Parent d92bd852771811b759efdfa5d658ab5dc254c962# Parent 120d90b47d74a37f7a5aadbe0a622758b62d6652 merge, including pointless fix wrt pq diff -r 120d90b47d74 -r c18c307cc325 all_warc_lmh_out.tar.gz Binary file all_warc_lmh_out.tar.gz has changed diff -r 120d90b47d74 -r c18c307cc325 bin/sort_date.py --- a/bin/sort_date.py Mon Aug 21 13:37:07 2023 +0100 +++ b/bin/sort_date.py Mon Aug 21 13:06:20 2023 -0400 @@ -5,6 +5,7 @@ import email.utils import sys from urllib.parse import urlsplit, unquote +from surt import surt import re # Thanks to https://stackoverflow.com/a/8776871 import locale @@ -37,6 +38,7 @@ epoch = email.utils.parsedate_to_datetime(dateTime).timestamp() except OverflowError: epoch = 32535215999.0 + return (surt(uri),epoch) parts = urlsplit(uri) nl = parts.netloc pq = '?%s'%parts.query if parts.query else ''; @@ -45,7 +47,7 @@ pa,pp=nl.split(':') return ('%s:%s)%s%s'%(auth(pa), pp, parts.path, pq),epoch) else: - return ('%s)%s'%(auth(nl), parts.path, pq),epoch) + return ('%s)%s%s'%(auth(nl), parts.path, pq),epoch) except (TypeError,IndexError,ValueError) as e: print(dateTime.rstrip(),e,sep='\t',file=sys.stderr) return