changeset 104:fc9a045c872b

use my own Canonicalizer to fix more obscure incompatibilities between Java and Python surts
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 13 Sep 2023 12:41:55 +0100
parents 7d58dc01f329
children 9403c02d5034
files bin/sort_date.py
diffstat 1 files changed, 25 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/bin/sort_date.py	Wed Sep 13 12:40:39 2023 +0100
+++ b/bin/sort_date.py	Wed Sep 13 12:41:55 2023 +0100
@@ -1,5 +1,5 @@
 #!/usr/bin/python3
-'''Process output of lmh_warc [original 2-column version]
+'''Process output of lmh_warc [new 3-column version]
    Usage: <(uz ....warc.gz | fgrep $'\t'|sed "/GMT$/s/\([^ ]\)GMT$/\1 GMT/")
 '''
 
@@ -10,7 +10,7 @@
 import email.utils
 import sys
 from urllib.parse import urlsplit, quote, unquote
-from surt import surt
+import surt
 
 import re, codecs
 
@@ -40,8 +40,30 @@
 # Also, Java surt strips _all_ leading 'www\d*.',
 #  where python3 surt only strips the first one.
 
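+# And Java strips so-called optional session-ids, but python doesn't (NOTE(review): yet the code below turns query_strip_session_id _off_ -- check which way round this is)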
+
+import surt.DefaultIAURLCanonicalizer, surt.GoogleURLCanonicalizer
+
+def notDefaultCanon(hu,**options):
+  '''Canonicalize hu (presumably a surt handyurl) via DefaultIAURLCanonicalizer,
+  after re-rendering each component of a numeric IPv4 host with int(), which
+  drops leading zeros (hu.host is updated in place), and with
+  query_strip_session_id defaulting to False; returns the canonicalizer's result.
+  See "After this line: / 15,225,107,143)" in .../azure/notes.txt'''
+  # A missing host (None) cannot be regex-matched, so skip the IP fix-up then
+  if hu.host and surt.GoogleURLCanonicalizer.DECIMAL_IP.match(hu.host):
+    try:
+      hu.host = b'.'.join(b'%d'%int(bs) for bs in hu.host.split(b'.'))
+    except ValueError:
+      pass # leave the host untouched if a component is not a plain integer
+  # Either we don't hit any, or Java doesn't do this stripping (NOTE(review): the
+  #  original comment said path_strip_session_id, the code sets query_...: confirm)
+  options.setdefault('query_strip_session_id',False)
+  return surt.DefaultIAURLCanonicalizer.canonicalize(hu,**options)
+
 def cdx_key(uristring):
-  _surt = quote(unquote(surt(uristring),
+  _surt = quote(unquote(surt.surt(uristring,
+                                  canonicalizer=notDefaultCanon),
                         errors='percent'),
                 safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~''' # '
                 ).lower()