view bin/fix_key.py @ 110:a0ea1e4a714d

pass in debug flag(s) to merge_date.py
author Henry Thompson <ht@markup.co.uk>
date Tue, 19 Sep 2023 19:40:58 +0100
parents df231c95e4aa
children
line wrap: on
line source

#!/usr/bin/python3
from percent_encode import percent_encode
from urllib.parse import quote, unquote
import sys

# From RFC-3986:
# gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
# sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
#                / "*" / "+" / "," / ";" / "="
# But # _is_ escaped in Java surt results
#  and additionally " \ : < = > ? \ ^  _ ` { | } are not
# Note also that quote never quotes - . _ ~, but it leaves / unquoted
#  only when / is in safe (which is just the default, safe='/')

# Also, Java surt strips _all_ leading 'www.',
#  where python3 surt only strips the first one.

# Characters left unescaped when a key is re-quoted.  '/' has to be listed
#  explicitly: quote() only treats it as safe via its _default_ safe='/',
#  which is replaced as soon as a safe argument is passed.
SAFE_CHARS = '!"$&\'()*+,/:;<=>?@[\\]^`{|}'


def fix_line(l):
  """Return one index line with its key normalised.

  Every ',www' component immediately preceding the first ')' is removed.
  If the line then contains a '%', it is split into exactly three
  tab-separated fields; the first (the key) is percent-decoded,
  re-encoded leaving SAFE_CHARS alone, and lowercased.  The other two
  fields are passed through untouched (the last keeps its newline).
  Lines without a '%' are returned as they stand after the ',www' removal.

  Raises ValueError if the line contains no ')', or contains a '%' but
  does not have exactly three tab-separated fields.
  """
  ploc = l.index(')')
  # Strip _all_ ',www' components before the ')', not just one
  while l.endswith(',www', 0, ploc):
    ploc -= 4
    l = l[:ploc] + l[ploc + 4:]
  if '%' not in l:
    return l
  (key, wt, ts) = l.split('\t')
  # NOTE(review): the 'percent' error handler is not a built-in codec
  #  error handler; presumably the percent_encode import registers it
  #  -- confirm.
  decoded = unquote(key, errors='percent')
  return '\t'.join((quote(decoded, safe=SAFE_CHARS).lower(), wt, ts))


def main(path):
  """Write the fixed form of every line of the file at path to stdout."""
  with open(path, "r") as f:
    sys.stdout.writelines(fix_line(l) for l in f)


if __name__ == "__main__":
  main(sys.argv[1])