changeset 84:c18c307cc325

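Merge, including a pointless fix with respect to pq (the corrected '%s)%s%s' format string sits after the new early `return (surt(uri),epoch)`, so it is no longer reached)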
author Henry Thompson <ht@markup.co.uk>
date Mon, 21 Aug 2023 13:06:20 -0400
parents d92bd8527718 (diff) 120d90b47d74 (current diff)
children 1daa8e444cfe
files bin/sort_date.py
diffstat 2 files changed, 3 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
Binary file all_warc_lmh_out.tar.gz has changed
--- a/bin/sort_date.py	Mon Aug 21 13:37:07 2023 +0100
+++ b/bin/sort_date.py	Mon Aug 21 13:06:20 2023 -0400
@@ -5,6 +5,7 @@
 import email.utils
 import sys
 from urllib.parse import urlsplit, unquote
+from surt import surt
 import re
 # Thanks to https://stackoverflow.com/a/8776871
 import locale
@@ -37,6 +38,7 @@
       epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
     except OverflowError:
       epoch = 32535215999.0
+    return (surt(uri),epoch)
     parts = urlsplit(uri)
     nl = parts.netloc
     pq = '?%s'%parts.query if parts.query else '';
@@ -45,7 +47,7 @@
       pa,pp=nl.split(':')
       return ('%s:%s)%s%s'%(auth(pa), pp, parts.path, pq),epoch)
     else:
-      return ('%s)%s'%(auth(nl), parts.path, pq),epoch)
+      return ('%s)%s%s'%(auth(nl), parts.path, pq),epoch)
   except (TypeError,IndexError,ValueError) as e:
     print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
     return