Mercurial > hg > cc > cirrus_work
changeset 85:1daa8e444cfe
Work around the unusual handling of %-encoding in the Java implementation of SURT
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 24 Aug 2023 18:21:41 +0100 |
parents | c18c307cc325 |
children | 3a2ae6057242 |
files | bin/percent_encode.py bin/sort_date.py |
diffstat | 2 files changed, 33 insertions(+), 37 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/percent_encode.py Thu Aug 24 18:21:41 2023 +0100 @@ -0,0 +1,9 @@ +'''Handle unquoting of non-UTF-8 bytes by %-encoding them''' +import codecs + +def percent_encode(ude): + #print(ude.object,ude.object[ude.start:ude.end]) + return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]), + ude.end) + +codecs.register_error('percent',percent_encode)
--- a/bin/sort_date.py Mon Aug 21 13:06:20 2023 -0400 +++ b/bin/sort_date.py Thu Aug 24 18:21:41 2023 +0100 @@ -4,59 +4,46 @@ # to fix a common 'bad' timestamp (~ .2% of inputs) import email.utils import sys -from urllib.parse import urlsplit, unquote +from urllib.parse import urlsplit, quote, unquote from surt import surt -import re + +import re, codecs + # Thanks to https://stackoverflow.com/a/8776871 import locale from functools import cmp_to_key -WWW=re.compile("www[0-9]*$") +def percent_encode(ude): + #print(ude.object,ude.object[ude.start:ude.end]) + return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]), + ude.end) -def auth(s): - #print('auth',s,file=sys.stderr) - if '%' in s: - kk=[(unquote(k).encode('idna')).decode('ascii') for k in s.split('.')] - else: - kk=s.split('.') - kk.reverse() - if kk[0] == '': - # final full stop is pruned by CC - kk.pop(0) - while WWW.match(kk[-1]): - # any www... prefix is pruned - kk.pop() - return ','.join(kk) +codecs.register_error('percent',percent_encode) + +def cdx_key(uristring): + return quote(unquote(surt(uristring), + errors='percent'), + safe='/,:)?=').lower() def keyed(l): - uri, dateTime = l.split(b'\t',1) - uri=uri.decode('ascii') - dateTime=dateTime.decode('utf8') # occasional weird ones + uri, cc_stamp, dateTime = l.split('\t',2) #print('ul',uri,file=sys.stderr) try: try: epoch = email.utils.parsedate_to_datetime(dateTime).timestamp() except OverflowError: epoch = 32535215999.0 - return (surt(uri),epoch) - parts = urlsplit(uri) - nl = parts.netloc - pq = '?%s'%parts.query if parts.query else ''; - #print('nl',nl,file=sys.stderr) - if ':' in nl: - pa,pp=nl.split(':') - return ('%s:%s)%s%s'%(auth(pa), pp, parts.path, pq),epoch) - else: - return ('%s)%s%s'%(auth(nl), parts.path, pq),epoch) + return ((cdx_key(uri), cc_stamp), epoch) except (TypeError,IndexError,ValueError) as e: print(dateTime.rstrip(),e,sep='\t',file=sys.stderr) return -locale.setlocale(locale.LC_ALL, "C") 
-ctk=cmp_to_key(locale.strcoll) +with open(sys.argv[1],"r") as ff: + # crucial that the following is done _after_ the file is opened + # with the default (utf-8) locale! + locale.setlocale(locale.LC_ALL, "C") + ctk=cmp_to_key(locale.strcoll) + for tl in sorted((kk for l in ff if (kk:=keyed(l)) is not None), + key=lambda x:x[0]): + print(tl[0][0],tl[0][1],tl[1],sep='\t') -with open(sys.argv[1],"rb") as ff: - for tl in sorted((kk for l in ff if (kk:=keyed(l)) is not None), - key=lambda x:ctk(x[0])): - print(tl[0],tl[1],sep='\t') -