# HG changeset patch # User Henry S. Thompson # Date 1694605315 -3600 # Node ID fc9a045c872bcade73b0a7182a0bbe29854f9934 # Parent 7d58dc01f3296773c0b7cdcd20a62209bb8bc2e3 use my own Canonicalizer to fix more obscure incompatibilities between Java and Python surts diff -r 7d58dc01f329 -r fc9a045c872b bin/sort_date.py --- a/bin/sort_date.py Wed Sep 13 12:40:39 2023 +0100 +++ b/bin/sort_date.py Wed Sep 13 12:41:55 2023 +0100 @@ -1,5 +1,5 @@ #!/usr/bin/python3 -'''Process output of lmh_warc [original 2-column version] +'''Process output of lmh_warc [new 3-column version] Usage: <(uz ....warc.gz | fgrep $'\t'|sed "/GMT$/s/\([^ ]\)GMT$/\1 GMT/") ''' @@ -10,7 +10,7 @@ import email.utils import sys from urllib.parse import urlsplit, quote, unquote -from surt import surt +import surt import re, codecs @@ -40,8 +40,30 @@ # Also, Java surt strips _all_ leading 'www\d*.', # where python3 surt only strips the first one. +# And Java strips so-called option session-ids, but python doesn't + +import surt.DefaultIAURLCanonicalizer, surt.GoogleURLCanonicalizer + +def notDefaultCanon(hu,**options): + if surt.GoogleURLCanonicalizer.DECIMAL_IP.match(hu.host): + # Try to fix the incompatibility between Java and + # Python surt handling of 'octal' numbers in numeric IPv4 addresses + # and it should! See "After this line: + # + # 15,225,107,143)" in .../azure/notes.txt + try: + bytestrs = hu.host.split(b'.') + hu.host = b'.'.join(b'%d'%int(bs) for bs in bytestrs) + except ValueError: + pass + # Either we don't hit any, or Java doesn't do path_strip_session_id + options.setdefault('query_strip_session_id',False) + return surt.DefaultIAURLCanonicalizer.canonicalize(hu, + **options) + def cdx_key(uristring): - _surt = quote(unquote(surt(uristring), + _surt = quote(unquote(surt.surt(uristring, + canonicalizer=notDefaultCanon), errors='percent'), safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~''' # ' ).lower()