annotate bin/sort_date.py @ 118:9d14e7c32737

replicate two extremely-corner cases of the way Java produces surts for URIs containing escaped DEL chars
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 27 Sep 2023 17:29:09 +0100
parents 827eadc72122
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
73
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/usr/bin/python3
104
fc9a045c872b use my own Canonicalizer to fix more obscure
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 97
diff changeset
2 '''Process output of lmh_warc [new 3-column version]
97
135a8c56dcc2 include full URI in output
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
3 Usage: <(uz ....warc.gz | fgrep $'\t'|sed "/GMT$/s/\([^ ]\)GMT$/\1 GMT/")
135a8c56dcc2 include full URI in output
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
4 '''
135a8c56dcc2 include full URI in output
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
5
73
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6 # Assumes you have used fgrep $'\t' on input (see Usage above) for speed
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
7 # Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/'
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
8 # to fix a common 'bad' timestamp (~ .2% of inputs)
97
135a8c56dcc2 include full URI in output
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
9
73
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
10 import email.utils
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
11 import sys
85
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
12 from urllib.parse import urlsplit, quote, unquote
104
fc9a045c872b use my own Canonicalizer to fix more obscure
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 97
diff changeset
13 import surt
85
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
14
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
15 import re, codecs
118
9d14e7c32737 replicate two extremely-corner cases of the way
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 112
diff changeset
16 from itertools import chain
85
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
17
88
49faf679d7df final keystroke fixes, recurse and decimal www stripping
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 86
diff changeset
18 WPAT = re.compile('(,www\\d*)+\\)')
49faf679d7df final keystroke fixes, recurse and decimal www stripping
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 86
diff changeset
19
77
bf09a1d80d7b make CC's own sorting explicit
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 76
diff changeset
20 # Thanks to https://stackoverflow.com/a/8776871
bf09a1d80d7b make CC's own sorting explicit
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 76
diff changeset
21 import locale
bf09a1d80d7b make CC's own sorting explicit
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 76
diff changeset
22 from functools import cmp_to_key
76
eeef811f734d handle corner cases with final . and initial www..+
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 75
diff changeset
23
85
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
def percent_encode(ude):
    '''Codec error handler: replace undecodable bytes by %XX escapes.

    ude is the UnicodeDecodeError passed in by the codec machinery; the
    offending bytes are ude.object[ude.start:ude.end].

    Returns the (replacement string, position to resume decoding at)
    pair that codecs.register_error requires of a handler.
    '''
    bad_bytes = ude.object[ude.start:ude.end]
    # Always two uppercase hex digits, so that a byte below 0x10 still
    #  yields a well-formed escape (%0A, not %A)
    return (''.join('%%%02X' % b for b in bad_bytes),
            ude.end)

# Make the handler available as errors='percent'
codecs.register_error('percent', percent_encode)
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
30
108
9f7a35bf07f9 one more sid fix,
Henry Thompson <ht@markup.co.uk>
parents: 107
diff changeset
def _u_esc(c):
    r'''Return the escape for code point c (an int): \uXXXX when c is
    below 65536, otherwise \UXXXXXXXX, with uppercase hex digits.'''
    if c < 65536:
        return '\\u%04X' % c
    return '\\U%08X' % c

def java_unicode_encode(ude):
    r'''Like backslashreplace, but use uppercase and \u00NN instead of \xnn.

    ude is the UnicodeEncodeError passed in by the codec machinery; the
    unencodable characters are ude.object[ude.start:ude.end].

    Returns the (replacement string, position to resume encoding at)
    pair that codecs.register_error requires of a handler.
    '''
    unencodable = ude.object[ude.start:ude.end]
    return (''.join(_u_esc(ord(ch)) for ch in unencodable),
            ude.end)

# Make the handler available as errors='java_unicode'
codecs.register_error('java_unicode', java_unicode_encode)
9f7a35bf07f9 one more sid fix,
Henry Thompson <ht@markup.co.uk>
parents: 107
diff changeset
43
86
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
44 # From RFC-3986:
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
45 # gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
46 # sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
47 # / "*" / "+" / "," / ";" / "="
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
48 # But # _is_ escaped in Java surt results
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
49 # and additionally " \ : < = > ? \ ^ _ ` { | } are not
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
50
88
49faf679d7df final keystroke fixes, recurse and decimal www stripping
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 86
diff changeset
51 # Note also that although quote already does _not_ quote - . / _ ~
97
135a8c56dcc2 include full URI in output
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
52 # they are included below as that's what we find in surt.surt 0.3.1
88
49faf679d7df final keystroke fixes, recurse and decimal www stripping
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 86
diff changeset
53
49faf679d7df final keystroke fixes, recurse and decimal www stripping
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 86
diff changeset
54 # Also, Java surt strips _all_ leading 'www\d*.',
86
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
55 # where python3 surt only strips the first one.
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
56
104
fc9a045c872b use my own Canonicalizer to fix more obscure
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 97
diff changeset
57 # And Java strips so-called option session-ids, but python doesn't
fc9a045c872b use my own Canonicalizer to fix more obscure
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 97
diff changeset
58
fc9a045c872b use my own Canonicalizer to fix more obscure
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 97
diff changeset
59 import surt.DefaultIAURLCanonicalizer, surt.GoogleURLCanonicalizer
107
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 104
diff changeset
60 import surt.URLRegexTransformer
104
fc9a045c872b use my own Canonicalizer to fix more obscure
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 97
diff changeset
61
118
9d14e7c32737 replicate two extremely-corner cases of the way
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 112
diff changeset
# All 256 byte values in order; mapping it onto itself gives a
#  translation table that changes nothing
ident = bytes(range(256))

# Identity table for bytes.translate, for when only its delete=
#  argument is wanted
IDMAP = bytes.maketrans(ident, ident)

# For removal of non-printing characters:
#  Note, this is only a guess, the only example so far is DEL
NONPRINT = bytes(chain(range(9),       # 0x00-0x08
                       range(14, 32),  # 0x0E-0x1F
                       [127]           # DEL
                       ))
9d14e7c32737 replicate two extremely-corner cases of the way
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 112
diff changeset
72
104
fc9a045c872b use my own Canonicalizer to fix more obscure
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 97
diff changeset
def notDefaultCanon(hu,**options):
    '''Canonicalizer to hand to surt.surt in place of its default one.

    Adjusts hu in place and then defers to
    surt.DefaultIAURLCanonicalizer.canonicalize, passing options through
    and returning its result.

    hu is presumably a surt handyurl whose host and query are bytes --
    TODO confirm; only .host and .query are touched here.
    '''
    looks_decimal = surt.GoogleURLCanonicalizer.DECIMAL_IP.match(hu.host)
    if looks_decimal:
        # Try to fix the incompatibility between Java and Python surt
        #  handling of 'octal' numbers in numeric IPv4 addresses
        #  (and it should!  See "After this line:
        #
        #    15,225,107,143)" in .../azure/notes.txt)
        # Each dotted component is re-rendered via int(), which drops
        #  any leading zeros
        try:
            parts = [b'%d' % int(part) for part in hu.host.split(b'.')]
            hu.host = b'.'.join(parts)
        except ValueError:
            # Not all-numeric after all, leave the host untouched
            pass
    if hu.query:
        # Identity mapping, so the only effect is to delete the
        #  NONPRINT bytes from the query
        hu.query = hu.query.translate(IDMAP, delete=NONPRINT)
    return surt.DefaultIAURLCanonicalizer.canonicalize(hu, **options)
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 104
diff changeset
88
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 104
diff changeset
89 # Hack this to reproduce the Java bug
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 104
diff changeset
# Replace surt's list of query session-id patterns with ones that
#  reproduce the Java bug.  Every pattern has the same shape: at least
#  one character, then the session-id parameter, then optionally
#  '&' and the rest of the query; matching ignores case.
surt.URLRegexTransformer._RES_QUERY_SESSIONID = [
    re.compile(b"(.+)(?:" + session_param + b")(?:&(.*))?$", re.I)
    for session_param in (
        b"jsessionid=[0-9a-z]{32}",
        b"phpsessid=[0-9a-z]{32}",
        b"sid=[0-9a-z]{32}",
        b"aspsessionid[a-z]{8}=[a-zA-Z]{24}",
        b"cfid=[^&]+&cftoken=[^&]+",
    )
]
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 104
diff changeset
97
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 104
diff changeset
98 # Above based on this from broken Java code:
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 104
diff changeset
99 # https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 104
diff changeset
100 #(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 104
diff changeset
101 #(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 104
diff changeset
102 #(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 104
diff changeset
103 #(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
40c460fed99f working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 104
diff changeset
104 #(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
104
fc9a045c872b use my own Canonicalizer to fix more obscure
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 97
diff changeset
105
85
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
def cdx_key(uristring):
    '''Return the lower-cased SURT sort key for uristring.

    The URI is unquoted, converted by surt.surt using notDefaultCanon as
    canonicalizer, unquoted again (undecodable bytes come back as %XX
    via the 'percent' error handler) and re-quoted with a wide set of
    characters left unescaped.  Finally any run of ',www<digits>' host
    components immediately before the closing ')' is removed.
    '''
    canonical = surt.surt(unquote(uristring),
                          canonicalizer=notDefaultCanon)
    # The backslash in safe is written doubled: a lone backslash before
    #  ']' is an invalid escape sequence, which recent Pythons warn
    #  about.  The characters passed to quote are unchanged.
    # Wrt \x7f (DEL), see discussion in notes wrt
    #  "biz,televida)" case
    # It remains to be seen whether other non-printing bytes
    #  will need to be treated as 'safe'
    _surt = quote(unquote(canonical, errors='percent'),
                  safe='''!"$&'()*+,-./:;<=>?@[\\]^_`{|}~\x7f''' # '
                  ).lower()
    return WPAT.sub(')', _surt)
73
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
117
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def keyed(l):
    '''Turn one input line into a sortable record.

    l is a line of three tab-separated fields: URI, CC timestamp and a
    date in the format email.utils.parsedate_to_datetime accepts.

    Returns ((cdx_key(uri), cc_stamp, uri), epoch), where epoch is the
    date as a POSIX timestamp, or None if the line cannot be processed,
    in which case the problem is reported on stderr.
    '''
    try:
        uri, cc_stamp, dateTime = l.split('\t', 2)
    except ValueError as e:
        # Fewer than three fields: report and skip the line, as for a
        #  bad date, rather than aborting the whole run
        print(l.rstrip(), e, sep='\t', file=sys.stderr)
        return None
    #print('ul',uri,file=sys.stderr)
    try:
        try:
            epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
        except OverflowError:
            # Date cannot be converted to a timestamp, use a fixed
            #  stand-in value instead
            epoch = 32535215999.0
        return ((cdx_key(uri), cc_stamp, uri), epoch)
    except (TypeError, IndexError, ValueError) as e:
        print(dateTime.rstrip(), e, sep='\t', file=sys.stderr)
        return None
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
130
97
135a8c56dcc2 include full URI in output
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
# Name of the input file is the first command-line argument
fstr = sys.argv[1]

with open(fstr,"r") as ff:
    # crucial that the following is done _after_ the file is opened
    #  with the default (utf-8) locale!
    locale.setlocale(locale.LC_ALL, "C")
    # NOTE(review): ctk is built here but the sort below does not use
    #  it -- confirm whether it was meant to
    ctk=cmp_to_key(locale.strcoll)
    # Lines for which keyed returns None are dropped
    records = [rec for rec in map(keyed, ff) if rec is not None]
    # Order by the (surt key, CC timestamp, URI) triple only
    records.sort(key=lambda rec: rec[0])
    for (surt_key, cc_stamp, uri), ts in records:
        # Non-ASCII characters in the URI are written as escapes by
        #  the 'java_unicode' error handler
        print(surt_key, cc_stamp,
              uri.encode('ascii',errors='java_unicode').decode('ascii'),
              ts,sep='\t')