comparison bin/fix_key.py @ 86:3a2ae6057242

handle double .www, more keep-me chars
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 28 Aug 2023 21:07:43 +0100
parents
children df231c95e4aa
comparison
equal deleted inserted replaced
85:1daa8e444cfe 86:3a2ae6057242
1 #!/usr/bin/python3
2 from percent_encode import percent_encode
3 from urllib.parse import quote, unquote
4 import sys
5
6 # From RFC-3986:
7 # gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
8 # sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
9 # / "*" / "+" / "," / ";" / "="
10 # But '#' _is_ escaped in Java surt results,
11 # and additionally " : < = > ? \ ^ _ ` { | } are _not_ escaped
12
13 # Also, Java surt strips _all_ leading 'www.',
14 # where python3 surt only strips the first one.
15
# Characters that are left unescaped when a key is re-quoted: the RFC-3986
# delimiters minus '#', plus the extra characters Java surt does not escape.
SAFE_CHARS = '!"$&\'()*+,/:;<=>?@[\\]^_`{|}'


def strip_www(line):
    """Drop every ',www' component that sits directly before the first ')'.

    Java surt strips all leading 'www.' labels from a host, so a key whose
    host part still ends in one or more ',www' components is trimmed until
    none remain.  A line with no ')' is returned unchanged instead of
    raising ValueError.
    """
    ploc = line.find(')')
    # find() yields -1 when there is no ')'; the explicit check keeps that
    # value from being misread as "everything but the last character".
    while ploc != -1 and line.endswith(',www', 0, ploc):
        line = line[:ploc - 4] + line[ploc:]
        # The removed text lay wholly before the ')', so it moved left by 4.
        ploc -= 4
    return line


def fix_line(line):
    """Return one input line with its key normalised.

    The line first has trailing ',www' host components removed.  If it then
    contains no '%' it is returned as is.  Otherwise it must consist of
    exactly three tab-separated fields (ValueError if not); the first field
    is percent-decoded, re-encoded with SAFE_CHARS left literal, and
    lower-cased, while the other two fields are passed through untouched.
    """
    line = strip_www(line)
    if '%' not in line:
        return line
    key, wt, ts = line.split('\t')
    # NOTE(review): the 'percent' error handler is not defined in this block;
    # presumably the percent_encode import registers it -- confirm.
    fixed = quote(unquote(key, errors='percent'), safe=SAFE_CHARS).lower()
    return '\t'.join((fixed, wt, ts))


def main():
    """Rewrite each line of the file named by sys.argv[1] to stdout."""
    with open(sys.argv[1], "r") as f:
        for line in f:
            sys.stdout.write(fix_line(line))


if __name__ == "__main__":
    main()
30