view bin/sort_date.py @ 83:d92bd8527718

use surt instead of trying to create index term by hand
author Henry Thompson <ht@markup.co.uk>
date Sat, 19 Aug 2023 16:33:23 -0400
parents 7bbb14f6e394
children c18c307cc325
line wrap: on
line source

#!/usr/bin/python3
# Assumes you have used grep -v $'\t' on input for speed
# Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/'
#  to fix a common 'bad' timestamp (~ .2% of inputs)
import email.utils
import sys
from urllib.parse import urlsplit, unquote
from surt import surt
import re
# Thanks to https://stackoverflow.com/a/8776871
import locale
from functools import cmp_to_key

# Matches a host label that is exactly "www" optionally followed by digits
WWW=re.compile("www[0-9]*$")

def auth(s):
  """Return host name s as a comma-joined list of its labels in reverse order.

  s is a dotted host name (str), e.g. 'www.example.com' -> 'com,example'.
  If s contains '%', each label is percent-decoded and converted to its
  IDNA (ASCII) form first; that conversion may raise UnicodeError (a
  ValueError) for labels that cannot be encoded.

  A single trailing full stop is dropped, as are leading www / wwwNNN
  labels -- but the last remaining label is always kept, so a bare host
  such as 'www' yields 'www' instead of raising IndexError.
  """
  #print('auth',s,file=sys.stderr)
  if '%' in s:
    kk=[(unquote(k).encode('idna')).decode('ascii') for k in s.split('.')]
  else:
    kk=s.split('.')
  kk.reverse()
  if kk[0] == '':
    # final full stop is pruned by CC
    kk.pop(0)
  # len(kk) > 1 guard: never prune the only label left, and never index
  #  into an empty list
  while len(kk) > 1 and WWW.match(kk[-1]):
    # any www... prefix is pruned
    kk.pop()
  return ','.join(kk)

def keyed(l):
  """Turn one input line into a (sort key, timestamp) pair.

  l is a bytes line of the form  URI <tab> date-time.  The URI must be
  ASCII; the date-time is decoded as UTF-8 and parsed with
  email.utils.parsedate_to_datetime.

  Returns (surt(uri), epoch) where epoch is a float POSIX timestamp, or
  None if a TypeError, IndexError or ValueError is raised while parsing
  the date or building the key; in that case the offending date-time and
  the error are written, tab-separated, to stderr.

  Note that a line without a tab, or a non-ASCII URI, raises from the
  first two statements, which are outside the try.
  """
  uri, dateTime = l.split(b'\t',1)
  uri=uri.decode('ascii')
  dateTime=dateTime.decode('utf8') # occasional weird ones
  #print('ul',uri,file=sys.stderr)
  try:
    try:
      epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
    except OverflowError:
      # date too large to convert: substitute a fixed far-future timestamp
      epoch = 32535215999.0
    return (surt(uri),epoch)
  except (TypeError,IndexError,ValueError) as e:
    print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
    return None

# Compare keys with strcoll under the "C" locale
locale.setlocale(locale.LC_ALL, "C")
ctk=cmp_to_key(locale.strcoll)

# Read every line of the file named by the first command-line argument,
#  keep the ones keyed() accepts, sort them on the key (first element)
#  and print key <tab> timestamp
with open(sys.argv[1],"rb") as ff:
  pairs=[]
  for l in ff:
    kk=keyed(l)
    if kk is None:
      # keyed() has already reported the problem on stderr
      continue
    pairs.append(kk)
  pairs.sort(key=lambda p:ctk(p[0]))
  for tl in pairs:
    print(tl[0],tl[1],sep='\t')