Mercurial > hg > cc > cirrus_work
comparison bin/fix_key.py @ 86:3a2ae6057242
handle double .www, more keep-me chars
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 28 Aug 2023 21:07:43 +0100 |
parents | |
children | df231c95e4aa |
comparison
equal
deleted
inserted
replaced
85:1daa8e444cfe | 86:3a2ae6057242 |
---|---|
1 #!/usr/bin/python3 | |
2 from percent_encode import percent_encode | |
3 from urllib.parse import quote, unquote | |
4 import sys | |
5 | |
6 # From RFC-3986: | |
7 # gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" | |
8 # sub-delims = "!" / "$" / "&" / "'" / "(" / ")" | |
9 # / "*" / "+" / "," / ";" / "=" | |
10 # But # _is_ escaped in Java surt results | |
11 # and additionally " \ : < = > ? \ ^ _ ` { | } are not | |
12 | |
13 # Also, Java surt strips _all_ leading 'www.', | |
14 # where python3 surt only strips the first one. | |
15 | |
# Copy the file named by the first command-line argument to stdout,
# adjusting each line's key towards the Java surt form described in the
# notes above (extra ',www' removed, percent-escapes normalised).
with open(sys.argv[1], "r") as f:
    for line in f:
        # Remove every ',www' that sits immediately before the first ')'.
        # str.find is used instead of str.index so that a line containing
        # no ')' at all (e.g. a blank line) is passed through untouched
        # rather than aborting the whole run with ValueError.
        while ((ploc := line.find(')')) >= 0
               and line.endswith(',www', 0, ploc)):
            line = line[:ploc - 4] + line[ploc:]
        if '%' in line:
            # Exactly three tab-separated fields are expected; only the
            # first (the key) is rewritten, the other two are copied as-is.
            # The third field keeps the line's trailing newline.
            (key, wt, ts) = line.split('\t')
            # Decode the key's percent-escapes, re-encode leaving the
            # characters in `safe` unescaped, then lowercase the result.
            # NOTE(review): 'percent' is not a built-in codec error
            # handler; presumably it is registered by the percent_encode
            # import at the top of the file -- confirm.
            fixed_key = quote(unquote(key, errors='percent'),
                              safe='!"$&\'()*+,/:;<=>?@[\\]^_`{|}').lower()
            sys.stdout.write(fixed_key + '\t' + wt + '\t' + ts)
        else:
            sys.stdout.write(line)
30 |