changeset 86:3a2ae6057242

handle double .www, more keep-me chars
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 28 Aug 2023 21:07:43 +0100
parents 1daa8e444cfe
children df231c95e4aa
files bin/fix_key.py bin/sort_date.py
diffstat 2 files changed, 47 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/fix_key.py	Mon Aug 28 21:07:43 2023 +0100
@@ -0,0 +1,30 @@
+#!/usr/bin/python3
+from percent_encode import percent_encode
+from urllib.parse import quote, unquote
+import sys
+
+# From RFC-3986:
+# gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+# sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
+#                / "*" / "+" / "," / ";" / "="
+# But # _is_ escaped in Java surt results
+#  and additionally " < > \ ^ _ ` { | } are not escaped
+
+# Also, Java surt strips _all_ leading 'www.',
+#  where python3 surt only strips the first one.
+
+with open(sys.argv[1],"r") as f:
+  for l in f:
+    while l.endswith(',www',0,ploc:=l.index(')')):
+      l=l[:ploc-4]+l[ploc:]
+    if '%' in l:
+      (key,wt,ts)=l.split('\t')
+      sys.stdout.write(quote(unquote(key,errors='percent'),
+                             safe='!"$&\'()*+,/:;<=>?@[\\]^_`{|}').lower())
+      sys.stdout.write('\t')
+      sys.stdout.write(wt)
+      sys.stdout.write('\t')
+      sys.stdout.write(ts)
+    else:
+      sys.stdout.write(l)
+
--- a/bin/sort_date.py	Thu Aug 24 18:21:41 2023 +0100
+++ b/bin/sort_date.py	Mon Aug 28 21:07:43 2023 +0100
@@ -20,10 +20,24 @@
 
 codecs.register_error('percent',percent_encode)
 
+# From RFC-3986:
+# gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+# sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
+#                / "*" / "+" / "," / ";" / "="
+# But # _is_ escaped in Java surt results
+#  and additionally " < > \ ^ _ ` { | } are not escaped
+
+# Also, Java surt strips _all_ leading 'www.',
+#  where python3 surt only strips the first one.
+
 def cdx_key(uristring):
-  return quote(unquote(surt(uristring),
-                       errors='percent'),
-               safe='/,:)?=').lower()
+  _surt = quote(unquote(surt(uristring),
+                        errors='percent'),
+                safe='!"$&\'()*+,/:;<=>?@[\\]^_`{|}').lower()
+  while _surt.endswith(",www"):
+    _surt = _surt[:-4]
+
+  return _surt
 
 def keyed(l):
   uri, cc_stamp, dateTime = l.split('\t',2)