changeset 80:db3c689175fe

catching up by hand with markup version, adding query string
author Henry Thompson <ht@markup.co.uk>
date Sat, 19 Aug 2023 15:53:59 -0400
parents e8f89aaa07c1
children e115f2e89af6
files bin/sort_date.py
diffstat 1 files changed, 60 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/sort_date.py	Sat Aug 19 15:53:59 2023 -0400
@@ -0,0 +1,60 @@
+#!/usr/bin/python3
+# Assumes you have used grep -v $'\t' on input for speed
+# Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/'
+#  to fix a common 'bad' timestamp (~ .2% of inputs)
+import email.utils
+import sys
+from urllib.parse import urlsplit, unquote
+import re
+# Thanks to https://stackoverflow.com/a/8776871
+import locale
+from functools import cmp_to_key
+
+WWW=re.compile("www[0-9]*$")
+
+def auth(s):
+  """Return host s as reversed, comma-joined labels (www.a.b -> b,a).
+  Percent-escaped labels are unquoted and IDNA-encoded; a trailing empty
+  label and leading www/wwwN labels are dropped.  Raises IndexError when
+  no label is left; keyed() catches that and rejects the line."""
+  labels = s.split('.')
+  if '%' in s:
+    labels = [unquote(lb).encode('idna').decode('ascii') for lb in labels]
+  labels = labels[::-1]
+  if labels[0] == '':
+    del labels[0]   # final full stop is pruned by CC
+  while WWW.match(labels[-1]):
+    del labels[-1]  # any www... prefix is pruned
+  return ','.join(labels)
+
+def keyed(l):
+  """Turn one b'URI<TAB>date' line into (sort key, epoch), or None if bad.
+  Key is auth(host)[:port] + ')' + path [+ '?' + query].  A TypeError,
+  IndexError or ValueError during date/URL handling is reported on stderr."""
+  rawURI, rawDate = l.split(b'\t',1)
+  uri = rawURI.decode('ascii')
+  dateTime = rawDate.decode('utf8') # occasional weird ones
+  try:
+    try:
+      epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
+    except OverflowError:
+      epoch = 32535215999.0 # stand-in used when the date overflows
+    parts = urlsplit(uri)
+    query = parts.query
+    if query:
+      query = '?%s'%query
+    if ':' not in parts.netloc:
+      return ('%s)%s%s'%(auth(parts.netloc), parts.path, query),epoch)
+    host, port = parts.netloc.split(':') # ValueError if more than one ':'
+    return ('%s:%s)%s%s'%(auth(host), port, parts.path, query),epoch)
+  except (TypeError,IndexError,ValueError) as e:
+    print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
+
+locale.setlocale(locale.LC_ALL, "C")
+ctk=cmp_to_key(locale.strcoll)
+# Key each line of the file named by argv[1]; keyed() gives None for rejects
+with open(sys.argv[1],"rb") as ff:
+  rows = sorted(filter(None, map(keyed, ff)), key=lambda row:ctk(row[0]))
+  for key, epoch in rows:
+    print(key,epoch,sep='\t')
+