view bin/sort_date.py @ 76:eeef811f734d

handle corner cases with final . and initial www..+
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 10 Aug 2023 22:14:49 +0100
parents 177f7df2bf46
children bf09a1d80d7b
line wrap: on
line source

#!/usr/bin/python3
# Assumes you have used grep -v $'\t' on input for speed
# Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/'
#  to fix a common 'bad' timestamp (~ .2% of inputs)
import email.utils
import sys
from urllib.parse import urlsplit, unquote
import re

# A host label consisting solely of "www" optionally followed by digits
WWW = re.compile("www[0-9]*$")

def auth(s):
  """Return the host name s as comma-separated labels, most significant first.

  If s contains a '%', every label is percent-decoded and IDNA-encoded
  to ASCII first.  One empty final label (from a trailing full stop) is
  dropped, as are all leading labels matching WWW.

  Raises IndexError if no label is left to examine (e.g. s is '' or
  consists only of www labels); the IDNA step may raise UnicodeError.
  """
  labels = s.split('.')
  if '%' in s:
    labels = [unquote(label).encode('idna').decode('ascii')
              for label in labels]
  if labels[-1] == '':
    # final full stop is pruned by CC
    del labels[-1]
  while WWW.match(labels[0]):
    # any www... prefix is pruned
    del labels[0]
  return ','.join(reversed(labels))

def keyed(l):
  """Turn one 'URI<TAB>date' input line into a (sort key, epoch) pair.

  The key is the host rewritten by auth(), then ':port' if the netloc
  contains a colon, then ')' and the URI path.  Only the path is used:
  query and fragment do not appear in the key.  The epoch is the date
  parsed as an RFC 2822-style timestamp, in seconds as a float.

  Returns None, after writing a tab-separated diagnostic to stderr, for
  any line that cannot be processed: no tab on the line, an unparseable
  date, a URI urlsplit rejects, a netloc with more than one ':' (the
  two-way unpack fails), or a host for which auth() raises.
  """
  try:
    uri, dateTime = l.split('\t',1)
  except ValueError as e:
    # No tab at all (e.g. a blank line): report the line itself and skip
    #  it, rather than letting one bad line abort the whole run
    print(l.rstrip(),e,sep='\t',file=sys.stderr)
    return None
  try:
    try:
      epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
    except OverflowError:
      # Fixed far-future stand-in when the conversion overflows
      epoch = 32535215999.0
    parts = urlsplit(uri)
    nl = parts.netloc
    if ':' in nl:
      # Exactly one ':' is expected here; more raises ValueError below
      pa,pp=nl.split(':')
      return ('%s:%s)%s'%(auth(pa), pp, parts.path),epoch)
    else:
      return ('%s)%s'%(auth(nl), parts.path),epoch)
  except (TypeError,IndexError,ValueError) as e:
    # Bad date, bad URI or unusable host: log the date and the reason
    print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
    return None

# Read the file named by the first command-line argument, keep the lines
#  keyed() accepts, order them by key and write 'key<TAB>epoch' lines
with open(sys.argv[1],"r") as ff:
  pairs = [kk for l in ff if (kk:=keyed(l)) is not None]
  pairs.sort(key=lambda pair: pair[0])
  for sort_key, stamp in pairs:
    print(sort_key,stamp,sep='\t')