changeset 85:1daa8e444cfe

work-around for weird handling of %-encoding in Java impl. of SURT
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 24 Aug 2023 18:21:41 +0100
parents c18c307cc325
children 3a2ae6057242
files bin/percent_encode.py bin/sort_date.py
diffstat 2 files changed, 33 insertions(+), 37 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/percent_encode.py	Thu Aug 24 18:21:41 2023 +0100
@@ -0,0 +1,9 @@
+'''Handle unquoting of non-UTF-8 bytes by %-encoding them'''
+import codecs
+
+def percent_encode(ude):  # codecs error-handler callback; assumes ude.object is bytes (a decoding error) -- TODO confirm
+  # Replacement text is '%' + upper-case hex ('%X', not zero-padded) for each byte in ude.object[ude.start:ude.end]
+  return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]),
+          ude.end)  # second item: position at which the codec resumes
+
+codecs.register_error('percent',percent_encode)  # registers the handler under the name 'percent' (i.e. errors='percent')
--- a/bin/sort_date.py	Mon Aug 21 13:06:20 2023 -0400
+++ b/bin/sort_date.py	Thu Aug 24 18:21:41 2023 +0100
@@ -4,59 +4,46 @@
 #  to fix a common 'bad' timestamp (~ .2% of inputs)
 import email.utils
 import sys
-from urllib.parse import urlsplit, unquote
+from urllib.parse import urlsplit, quote, unquote
 from surt import surt
-import re
+
+import re, codecs
+
 # Thanks to https://stackoverflow.com/a/8776871
 import locale
 from functools import cmp_to_key
 
-WWW=re.compile("www[0-9]*$")
+def percent_encode(ude):  # codecs error-handler callback; same definition as the one this changeset adds in bin/percent_encode.py
+  # Replacement text is '%' + upper-case hex ('%X', not zero-padded) for each byte in ude.object[ude.start:ude.end]
+  return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]),
+          ude.end)  # second item: position at which the codec resumes
 
-def auth(s):
-  #print('auth',s,file=sys.stderr)
-  if '%' in s:
-    kk=[(unquote(k).encode('idna')).decode('ascii') for k in s.split('.')]
-  else:
-    kk=s.split('.')
-  kk.reverse()
-  if kk[0] == '':
-    # final full stop is pruned by CC
-    kk.pop(0)
-  while WWW.match(kk[-1]):
-    # any www... prefix is pruned
-    kk.pop()
-  return ','.join(kk)
+codecs.register_error('percent',percent_encode)  # makes errors='percent' (used by cdx_key's unquote call) resolve to percent_encode
+
+def cdx_key(uristring):  # build the sort key for a URI: surt() form, %-escapes normalised, lower-cased
+  return quote(unquote(surt(uristring),  # undo %-escapes in the SURT string ...
+                       errors='percent'),  # ... undecodable bytes go through the 'percent' error handler
+               safe='/,:)?=').lower()  # re-quote leaving '/,:)?=' literal; lower() applies to the whole result, hex digits included
 
 def keyed(l):
-  uri, dateTime = l.split(b'\t',1)
-  uri=uri.decode('ascii')
-  dateTime=dateTime.decode('utf8') # occasional weird ones
+  uri, cc_stamp, dateTime = l.split('\t',2)  # 3 tab-separated fields; NOTE(review): outside the try, so a line with fewer fields raises ValueError uncaught
   #print('ul',uri,file=sys.stderr)
   try:
     try:
       epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
     except OverflowError:
       epoch = 32535215999.0
-    return (surt(uri),epoch)
-    parts = urlsplit(uri)
-    nl = parts.netloc
-    pq = '?%s'%parts.query if parts.query else '';
-    #print('nl',nl,file=sys.stderr)
-    if ':' in nl:
-      pa,pp=nl.split(':')
-      return ('%s:%s)%s%s'%(auth(pa), pp, parts.path, pq),epoch)
-    else:
-      return ('%s)%s%s'%(auth(nl), parts.path, pq),epoch)
+    return ((cdx_key(uri), cc_stamp), epoch)  # ((normalised URI key, cc_stamp), timestamp of the parsed date)
   except (TypeError,IndexError,ValueError) as e:
     print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
     return
 
-locale.setlocale(locale.LC_ALL, "C")
-ctk=cmp_to_key(locale.strcoll)
+with open(sys.argv[1],"r") as ff:  # input path is the first command-line argument; opened in text mode
+  # crucial that the following is done _after_ the file is opened
+  #  with the default (utf-8) locale!
+  locale.setlocale(locale.LC_ALL, "C")
+  ctk=cmp_to_key(locale.strcoll)  # NOTE(review): not referenced by the sort below in this hunk -- confirm it is still needed
+  for tl in sorted((kk for l in ff if (kk:=keyed(l)) is not None),  # lines for which keyed() returned None are dropped
+                   key=lambda x:x[0]):  # order by the (cdx_key, cc_stamp) tuple only; epoch is not part of the key
+    print(tl[0][0],tl[0][1],tl[1],sep='\t')  # emit key, cc_stamp, epoch as tab-separated columns
 
-with open(sys.argv[1],"rb") as ff:
-  for tl in sorted((kk for l in ff if (kk:=keyed(l)) is not None),
-                   key=lambda x:ctk(x[0])):
-    print(tl[0],tl[1],sep='\t')
-