Mercurial > hg > cc > cirrus_work
comparison bin/sort_date.py @ 86:3a2ae6057242
handle double .www, more keep-me chars
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 28 Aug 2023 21:07:43 +0100 |
parents | 1daa8e444cfe |
children | 49faf679d7df |
comparison
equal
deleted
inserted
replaced
85:1daa8e444cfe | 86:3a2ae6057242 |
---|---|
18 return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]), | 18 return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]), |
19 ude.end) | 19 ude.end) |
20 | 20 |
21 codecs.register_error('percent',percent_encode) | 21 codecs.register_error('percent',percent_encode) |
22 | 22 |
| | 23 # From RFC-3986: |
| | 24 # gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" |
| | 25 # sub-delims = "!" / "$" / "&" / "'" / "(" / ")" |
| | 26 # / "*" / "+" / "," / ";" / "=" |
| | 27 # But # _is_ escaped in Java surt results |
| | 28 # and additionally " \ : < = > ? \ ^ _ ` { | } are not |
| | 29 |
| | 30 # Also, Java surt strips _all_ leading 'www.', |
| | 31 # where python3 surt only strips the first one. |
| | 32 |
23 def cdx_key(uristring): | 33 def cdx_key(uristring): |
24 return quote(unquote(surt(uristring), | 34 _surt = quote(unquote(surt(uristring), |
25 errors='percent'), | 35 errors='percent'), |
26 safe='/,:)?=').lower() | 36 safe='!"$&\'()*+,/:;<=>?@[\\]^_`{|}').lower() |
| | 37 while _surt.endswith(",www"): |
| | 38 _surt = _surt[:-4] |
| | 39 |
| | 40 return _surt |
27 | 41 |
28 def keyed(l): | 42 def keyed(l): |
29 uri, cc_stamp, dateTime = l.split('\t',2) | 43 uri, cc_stamp, dateTime = l.split('\t',2) |
30 #print('ul',uri,file=sys.stderr) | 44 #print('ul',uri,file=sys.stderr) |
31 try: | 45 try: |