Mercurial > hg > cc > cirrus_work
changeset 86:3a2ae6057242
handle double .www, more keep-me chars
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 28 Aug 2023 21:07:43 +0100 |
parents | 1daa8e444cfe |
children | df231c95e4aa |
files | bin/fix_key.py bin/sort_date.py |
diffstat | 2 files changed, 47 insertions(+), 3 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/python3
"""bin/fix_key.py -- rewrite the keys of a tab-separated file, line by line.

Usage: fix_key.py FILE

Every line of FILE is written to standard output after two adjustments:

* any run of ',www' labels sitting immediately before the first ')' on
  the line is removed (Java surt strips _all_ leading 'www.', where
  python3 surt only strips the first one);
* if the line contains a '%', it is split into three tab-separated
  fields and the first field (the key) is percent-decoded, re-encoded
  with the safe set below, and lower-cased.
"""
import codecs
import sys
from urllib.parse import quote, unquote

# From RFC-3986:
# gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
# sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
#              / "*" / "+" / "," / ";" / "="
# But # _is_ escaped in Java surt results
#  and additionally " \ : < = > ? \ ^ _ ` { | } are not
_SAFE = '!"$&\'()*+,/:;<=>?@[\\]^_`{|}'


def fix_line(l):
    """Return the line *l* with its key normalised as described above.

    Lines with no ')' skip the ',www' stripping instead of raising, and
    lines with no '%' are otherwise returned unchanged.

    Raises ValueError if a line containing '%' does not consist of
    exactly three tab-separated fields.
    """
    ploc = l.find(')')
    if ploc >= 0:
        # Only the text before the first ')' is examined for ',www'.
        host = l[:ploc]
        while host.endswith(',www'):
            host = host[:-4]
        l = host + l[ploc:]
    if '%' not in l:
        return l
    (key, wt, ts) = l.split('\t')
    fixed = quote(unquote(key, errors='percent'), safe=_SAFE).lower()
    return '\t'.join((fixed, wt, ts))


def main(argv=None):
    """Filter the file named by argv[1] (default: sys.argv) to stdout."""
    argv = sys.argv if argv is None else argv
    # unquote(..., errors='percent') needs the 'percent' handler to exist
    # as soon as a key holds an undecodable escape.  sort_date.py registers
    # it explicitly; do the same here.  NOTE(review): harmless if the
    # percent_encode module already registers itself on import -- confirm.
    from percent_encode import percent_encode
    codecs.register_error('percent', percent_encode)
    with open(argv[1], "r") as f:
        sys.stdout.writelines(fix_line(l) for l in f)


if __name__ == "__main__":
    main()
# bin/sort_date.py -- the region touched by this changeset
# (before it, cdx_key used safe='/,:)?=' and did no 'www' stripping).

codecs.register_error('percent',percent_encode)

# From RFC-3986:
# gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
# sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
#              / "*" / "+" / "," / ";" / "="
# But # _is_ escaped in Java surt results
#  and additionally " \ : < = > ? \ ^ _ ` { | } are not

# Also, Java surt strips _all_ leading 'www.',
#  where python3 surt only strips the first one.

def cdx_key(uristring):
    """Return the lower-cased, re-quoted SURT key for *uristring*.

    The result of surt() is percent-decoded (undecodable bytes go to the
    'percent' error handler registered above), re-encoded leaving the
    characters in the safe set untouched, and lower-cased.  Any ',www'
    labels left at the end of the host part are then removed.
    """
    _surt = quote(unquote(surt(uristring),
                          errors='percent'),
                  safe='!"$&\'()*+,/:;<=>?@[\\]^_`{|}').lower()
    # The ',www' labels sit just before the ')' that closes the host part,
    # not at the end of the whole key, so test the text before the first
    # ')'.  With no ')' at all, fall back to the end of the string.
    ploc = _surt.find(')')
    if ploc < 0:
        ploc = len(_surt)
    host, rest = _surt[:ploc], _surt[ploc:]
    while host.endswith(",www"):
        host = host[:-4]

    return host + rest

def keyed(l):
    uri, cc_stamp, dateTime = l.split('\t',2)
    # (the rest of keyed lies outside the lines shown in this hunk)