comparison bin/sort_date.py @ 86:3a2ae6057242

handle double .www, more keep-me chars
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 28 Aug 2023 21:07:43 +0100
parents 1daa8e444cfe
children 49faf679d7df
comparison
equal deleted inserted replaced
85:1daa8e444cfe 86:3a2ae6057242
18 return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]), 18 return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]),
19 ude.end) 19 ude.end)
20 20
21 codecs.register_error('percent',percent_encode) 21 codecs.register_error('percent',percent_encode)
22 22
23 # From RFC-3986:
24 # gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
25 # sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
26 # / "*" / "+" / "," / ";" / "="
27 # But # _is_ escaped in Java surt results
28 # and additionally " : < = > ? \ ^ _ ` { | } are not escaped
29
30 # Also, Java surt strips _all_ leading 'www.',
31 # where python3 surt only strips the first one.
32
23 def cdx_key(uristring): 33 def cdx_key(uristring):
24 return quote(unquote(surt(uristring), 34 _surt = quote(unquote(surt(uristring),
25 errors='percent'), 35 errors='percent'),
26 safe='/,:)?=').lower() 36 safe='!"$&\'()*+,/:;<=>?@[\\]^_`{|}').lower()
37 while _surt.endswith(",www"):
38 _surt = _surt[:-4]
39
40 return _surt
27 41
28 def keyed(l): 42 def keyed(l):
29 uri, cc_stamp, dateTime = l.split('\t',2) 43 uri, cc_stamp, dateTime = l.split('\t',2)
30 #print('ul',uri,file=sys.stderr) 44 #print('ul',uri,file=sys.stderr)
31 try: 45 try: