Mercurial > hg > cc > cirrus_work
changeset 104:fc9a045c872b
use my own Canonicalizer to fix more obscure
incompatibilities between Java and Python surts
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 13 Sep 2023 12:41:55 +0100 |
parents | 7d58dc01f329 |
children | 9403c02d5034 |
files | bin/sort_date.py |
diffstat | 1 files changed, 25 insertions(+), 3 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/sort_date.py Wed Sep 13 12:40:39 2023 +0100
+++ b/bin/sort_date.py Wed Sep 13 12:41:55 2023 +0100
@@ -1,5 +1,5 @@
 #!/usr/bin/python3
-'''Process output of lmh_warc [original 2-column version]
+'''Process output of lmh_warc [new 3-column version]
 Usage: <(uz ....warc.gz | fgrep $'\t'|sed "/GMT$/s/\([^ ]\)GMT$/\1 GMT/")
 '''
 
@@ -10,7 +10,7 @@
 import email.utils
 import sys
 from urllib.parse import urlsplit, quote, unquote
-from surt import surt
+import surt
 
 import re, codecs
 
@@ -40,8 +40,30 @@
 # Also, Java surt strips _all_ leading 'www\d*.',
 # where python3 surt only strips the first one.
 
+# And Java strips so-called option session-ids, but python doesn't
+
+import surt.DefaultIAURLCanonicalizer, surt.GoogleURLCanonicalizer
+
+def notDefaultCanon(hu,**options):
+  if surt.GoogleURLCanonicalizer.DECIMAL_IP.match(hu.host):
+    # Try to fix the incompatibility between Java and
+    # Python surt handling of 'octal' numbers in numeric IPv4 addresses
+    # and it should! See "After this line:
+    #
+    # 15,225,107,143)" in .../azure/notes.txt
+    try:
+      bytestrs = hu.host.split(b'.')
+      hu.host = b'.'.join(b'%d'%int(bs) for bs in bytestrs)
+    except ValueError:
+      pass
+  # Either we don't hit any, or Java doesn't do path_strip_session_id
+  options.setdefault('query_strip_session_id',False)
+  return surt.DefaultIAURLCanonicalizer.canonicalize(hu,
+                                                     **options)
+
 def cdx_key(uristring):
-  _surt = quote(unquote(surt(uristring),
+  _surt = quote(unquote(surt.surt(uristring,
+                                  canonicalizer=notDefaultCanon),
                         errors='percent'),
                 safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~''' # '
                 ).lower()