changeset 73:e8c667bf8965

compute timestamps, key and sort lmh lines
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 08 Aug 2023 17:47:27 +0100
parents fd9bcd759606
children 432915a28952
files bin/sort_date.py
diffstat 1 files changed, 30 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/sort_date.py	Tue Aug 08 17:47:27 2023 +0100
@@ -0,0 +1,30 @@
+#!/usr/bin/python3
+# Assumes you have used grep -v $'\t' on input for speed
+# Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/'
+#  to fix a common 'bad' timestamp (~ .2% of inputs)
+import email.utils
+import sys
+from urllib.parse import urlsplit
+
+def keyed(l):
+  '''Key a "URI<TAB>date" line as ('tld,..,host[:port])path', epoch seconds);
+  lines with no tab, a bad date or >1 ':' in netloc go to stderr -> None.'''
+  uri, tab, dateTime = l.partition('\t')
+  try:
+    if not tab:  # report and skip, rather than abort the whole run
+      raise ValueError('no tab-separated timestamp')
+    epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
+    parts = urlsplit(uri)
+    nl = parts.netloc
+    pa, pp = nl.split(':') if ':' in nl else (nl, None)  # 3+ pieces raise
+    kk = ','.join(reversed(pa.split('.')))
+    return ((kk if pp is None else '%s:%s'%(kk, pp)) + ')' + parts.path, epoch)
+  except Exception as e:  # deliberate best-effort: any failure drops the line
+    print((dateTime if tab else uri).rstrip(),e,sep='\t',file=sys.stderr)
+    return None
+
+with open(sys.argv[1],"r") as ff:  # argv[1]: the file of lines to key
+  # keyed() yields None for lines it rejects; drop those, order by key only
+  for key, epoch in sorted(filter(None, map(keyed, ff)), key=lambda kv:kv[0]):
+    print(key,epoch,sep='\t')
+