view bin/sort_date.py @ 87:df231c95e4aa

final keystroke fixes, note _lacks_ multi-www fix, for which see sort_date.py
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 30 Aug 2023 11:10:54 +0100
parents 3a2ae6057242
children 49faf679d7df
line wrap: on
line source

#!/usr/bin/python3
# Assumes you have used grep -v $'\t' on input for speed
# Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/'
#  to fix a common 'bad' timestamp (~ .2% of inputs)
import email.utils
import sys
from urllib.parse import urlsplit, quote, unquote
from surt import surt

import re, codecs

# Thanks to https://stackoverflow.com/a/8776871
import locale
from functools import cmp_to_key

def percent_encode(ude):
  """Codec error handler: swap the offending input for percent escapes.

  ude is the error object handed over by the codec machinery.  Every item
  of ude.object[ude.start:ude.end] is rendered as '%' followed by its
  upper-case hex value (no zero padding).

  Returns the pair (replacement text, position to resume at), the
  position being ude.end.
  """
  offending = ude.object[ude.start:ude.end]
  escapes = ['%%%X' % octet for octet in offending]
  return (''.join(escapes), ude.end)

# Publish the handler so it can be requested with errors='percent'
codecs.register_error('percent',percent_encode)

# From RFC-3986:
# gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
# sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
#                / "*" / "+" / "," / ";" / "="
# But # _is_ escaped in Java surt results,
#  and additionally " : < = > ? \ ^ _ ` { | } are not

# Also, Java surt strips _all_ leading 'www.',
#  where python3 surt only strips the first one.

def cdx_key(uristring):
  """Return the lower-cased, re-escaped SURT key for uristring.

  The URI is passed through surt(), fully unquoted (bytes that are not
  valid UTF-8 are kept as percent escapes via the 'percent' error
  handler), re-quoted leaving the characters in the safe set alone,
  and lower-cased.

  Java surt strips _all_ leading 'www.' labels from the host whereas
  python3 surt strips only the first, so any remaining ones are removed
  here.  In a SURT key the reversed host comes first and is closed by
  ')' (e.g. com,example,www)/path), so left-over labels appear as ',www'
  at the end of the host part, _not_ at the end of the whole key.
  Stripping from the end of the whole key, as was done previously, never
  touched such hosts and could truncate a path that ended in ',www'.
  """
  _surt = quote(unquote(surt(uristring),
                        errors='percent'),
                safe='!"$&\'()*+,/:;<=>?@[\\]^_`{|}').lower()
  host, paren, rest = _surt.partition(')')
  if not paren:
    # No host terminator at all (not a normal SURT key):
    #  keep the historical whole-string behaviour
    while _surt.endswith(",www"):
      _surt = _surt[:-4]
    return _surt

  # A non-default port, if any, follows the host as ':digits' --
  #  set it aside so that the ',www' test sees the host labels
  name, colon, port = host.rpartition(':')
  if not (colon and port.isdigit()):
    name, colon, port = host, '', ''
  while name.endswith(",www"):
    name = name[:-4]

  return name + colon + port + paren + rest

def keyed(l):
  """Convert one tab-separated input line into a sortable record.

  l must hold at least three tab-separated fields: a URI, a stamp that
  is passed through untouched, and a date string (everything after the
  second tab).  The date is parsed with
  email.utils.parsedate_to_datetime and converted to a POSIX timestamp.

  Returns ((cdx_key(uri), cc_stamp), epoch).  If the date or the URI
  provokes a TypeError, IndexError or ValueError, the date string and
  the error are written tab-separated to stderr and None is returned.
  """
  uri, cc_stamp, dateTime = l.split('\t',2)
  try:
    try:
      parsed = email.utils.parsedate_to_datetime(dateTime)
      epoch = parsed.timestamp()
    except OverflowError:
      # Out-of-range date: pin it to 3000-12-31T23:59:59Z
      epoch = 32535215999.0
    sort_key = (cdx_key(uri), cc_stamp)
  except (TypeError,IndexError,ValueError) as problem:
    print(dateTime.rstrip(),problem,sep='\t',file=sys.stderr)
    return None
  return (sort_key, epoch)

with open(sys.argv[1],"r") as ff:
  # The input file has to be opened _before_ the locale is switched,
  #  so that it is read with the default (utf-8) locale
  locale.setlocale(locale.LC_ALL, "C")
  # NOTE(review): ctk is not referenced by the sort below
  ctk=cmp_to_key(locale.strcoll)
  # Collect the record for every line that keyed() could handle
  entries = []
  for line in ff:
    entry = keyed(line)
    if entry is not None:
      entries.append(entry)
  # Order on the (cdx key, stamp) pair alone; the epoch takes no part
  entries.sort(key=lambda rec:rec[0])
  for (key, stamp), epoch in entries:
    print(key,stamp,epoch,sep='\t')