Mercurial > hg > cc > cirrus_work
annotate bin/sort_date.py @ 118:9d14e7c32737
replicate two extremely-corner cases of the way
Java produces surts for URIs containing escaped DEL chars
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 27 Sep 2023 17:29:09 +0100 |
parents | 827eadc72122 |
children |
rev | line source |
---|---|
73
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/usr/bin/python3 |
104
fc9a045c872b
use my own Canonicalizer to fix more obscure
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
97
diff
changeset
|
2 '''Process output of lmh_warc [new 3-column version] |
97
135a8c56dcc2
include full URI in output
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
88
diff
changeset
|
3 Usage: <(uz ....warc.gz | fgrep $'\t'|sed "/GMT$/s/\([^ ]\)GMT$/\1 GMT/") |
135a8c56dcc2
include full URI in output
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
88
diff
changeset
|
4 ''' |
135a8c56dcc2
include full URI in output
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
88
diff
changeset
|
5 |
73
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 # Assumes you have used grep -v $'\t' on input for speed |
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
7 # Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/' |
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
8 # to fix a common 'bad' timestamp (~ .2% of inputs) |
97
135a8c56dcc2
include full URI in output
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
88
diff
changeset
|
9 |
73
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
10 import email.utils |
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
11 import sys |
85
1daa8e444cfe
work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
84
diff
changeset
|
12 from urllib.parse import urlsplit, quote, unquote |
104
fc9a045c872b
use my own Canonicalizer to fix more obscure
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
97
diff
changeset
|
13 import surt |
85
1daa8e444cfe
work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
84
diff
changeset
|
14 |
1daa8e444cfe
work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
84
diff
changeset
|
15 import re, codecs |
118
9d14e7c32737
replicate two extremely-corner cases of the way
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
112
diff
changeset
|
16 from itertools import chain |
85
1daa8e444cfe
work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
84
diff
changeset
|
17 |
88
49faf679d7df
final keystroke fixes, recurse and decimal www stripping
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
86
diff
changeset
|
# Matches one or more trailing ',www<digits>' host labels immediately
# before the ')' that closes the host part of a SURT key.
WPAT = re.compile(r'(,www\d*)+\)')
49faf679d7df
final keystroke fixes, recurse and decimal www stripping
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
86
diff
changeset
|
19 |
77
bf09a1d80d7b
make CC's own sorting explicit
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
76
diff
changeset
|
20 # Thanks to https://stackoverflow.com/a/8776871 |
bf09a1d80d7b
make CC's own sorting explicit
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
76
diff
changeset
|
21 import locale |
bf09a1d80d7b
make CC's own sorting explicit
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
76
diff
changeset
|
22 from functools import cmp_to_key |
76
eeef811f734d
handle corner cases with final . and initial www..+
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
75
diff
changeset
|
23 |
85
1daa8e444cfe
work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
84
diff
changeset
|
def percent_encode(ude):
    '''Codec error handler: render the offending bytes as %XX escapes.

    ude is the UnicodeDecodeError being handled; the bytes in
    ude.object[ude.start:ude.end] are each written as '%' followed by
    two upper-case hex digits.

    Returns the (replacement string, position to resume at) pair that
    the codecs error-handler protocol expects.
    '''
    # %02X rather than %X, so that a byte below 0x10 would still give a
    #  well-formed two-digit escape
    return (''.join('%%%02X' % b for b in ude.object[ude.start:ude.end]),
            ude.end)

codecs.register_error('percent',percent_encode)
1daa8e444cfe
work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
84
diff
changeset
|
30 |
108 | 31 def _u_esc(c): |
32 if c<65536: | |
33 return '\\u%04X'%c | |
34 else: | |
35 return '\\U%08X'%c | |
36 | |
37 def java_unicode_encode(ude): | |
38 '''like backslashreplace but use uppercase and \ u00NN instead of \ xnn''' | |
39 return (''.join(_u_esc(ord(c)) for c in ude.object[ude.start:ude.end]), | |
40 ude.end) | |
41 | |
42 codecs.register_error('java_unicode',java_unicode_encode) | |
43 | |
86
3a2ae6057242
handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
85
diff
changeset
|
44 # From RFC-3986: |
3a2ae6057242
handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
85
diff
changeset
|
45 # gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" |
3a2ae6057242
handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
85
diff
changeset
|
46 # sub-delims = "!" / "$" / "&" / "'" / "(" / ")" |
3a2ae6057242
handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
85
diff
changeset
|
47 # / "*" / "+" / "," / ";" / "=" |
3a2ae6057242
handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
85
diff
changeset
|
48 # But # _is_ escaped in Java surt results |
3a2ae6057242
handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
85
diff
changeset
|
49 # and additionally " \ : < = > ? \ ^ _ ` { | } are not |
3a2ae6057242
handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
85
diff
changeset
|
50 |
88
49faf679d7df
final keystroke fixes, recurse and decimal www stripping
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
86
diff
changeset
|
51 # Note also that although quote already does _not_ quote - . / _ ~ |
97
135a8c56dcc2
include full URI in output
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
88
diff
changeset
|
52 # they are included below as that's what we find in surt.surt 0.3.1 |
88
49faf679d7df
final keystroke fixes, recurse and decimal www stripping
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
86
diff
changeset
|
53 |
49faf679d7df
final keystroke fixes, recurse and decimal www stripping
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
86
diff
changeset
|
54 # Also, Java surt strips _all_ leading 'www\d*.', |
86
3a2ae6057242
handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
85
diff
changeset
|
55 # where python3 surt only strips the first one. |
3a2ae6057242
handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
85
diff
changeset
|
56 |
104
fc9a045c872b
use my own Canonicalizer to fix more obscure
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
97
diff
changeset
|
57 # And Java strips so-called option session-ids, but python doesn't |
fc9a045c872b
use my own Canonicalizer to fix more obscure
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
97
diff
changeset
|
58 |
fc9a045c872b
use my own Canonicalizer to fix more obscure
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
97
diff
changeset
|
59 import surt.DefaultIAURLCanonicalizer, surt.GoogleURLCanonicalizer |
107
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
104
diff
changeset
|
60 import surt.URLRegexTransformer |
104
fc9a045c872b
use my own Canonicalizer to fix more obscure
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
97
diff
changeset
|
61 |
118
9d14e7c32737
replicate two extremely-corner cases of the way
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
112
diff
changeset
|
# All 256 byte values in order; translating through IDMAP therefore
#  leaves every byte as it is, so that bytes.translate can be used
#  purely for its delete= argument.
ident = bytes(range(256))

IDMAP = bytes.maketrans(ident, ident)

# For removal of non-printing characters:
# Note, this is only a guess, the only example so far is DEL
NONPRINT = bytes(chain(range(9),
                       range(14, 32),
                       [127]  # DEL
                       ))
9d14e7c32737
replicate two extremely-corner cases of the way
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
112
diff
changeset
|
72 |
104
fc9a045c872b
use my own Canonicalizer to fix more obscure
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
97
diff
changeset
|
def notDefaultCanon(hu,**options):
    '''Canonicalizer to hand to surt.surt in place of the default one.

    Adjusts the host and query of hu in place, then returns whatever
    surt.DefaultIAURLCanonicalizer.canonicalize(hu, **options) returns.
    hu.host and hu.query are handled as bytes here.
    '''
    if surt.GoogleURLCanonicalizer.DECIMAL_IP.match(hu.host):
        # Try to fix the incompatibility between Java and
        # Python surt handling of 'octal' numbers in numeric IPv4 addresses
        # and it should!  See "After this line:
        #
        # 15,225,107,143)" in .../azure/notes.txt
        # Each dot-separated part is re-rendered via int(), which drops
        #  any leading zeros
        try:
            octets = [int(part) for part in hu.host.split(b'.')]
        except ValueError:
            # some part is not a plain integer: leave the host alone
            pass
        else:
            hu.host = b'.'.join(b'%d' % octet for octet in octets)
    query = hu.query
    if query:
        # Drop the bytes listed in NONPRINT, keep all others unchanged
        hu.query = query.translate(IDMAP, delete=NONPRINT)
    return surt.DefaultIAURLCanonicalizer.canonicalize(hu, **options)
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
104
diff
changeset
|
88 |
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
104
diff
changeset
|
89 # Hack this to reproduce the Java bug |
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
104
diff
changeset
|
# Replace surt's session-id query patterns with case-insensitive
#  byte regexps modelled on the Java ones
surt.URLRegexTransformer._RES_QUERY_SESSIONID = [
    re.compile(_sessionid_pattern, re.I)
    for _sessionid_pattern in (
        b"(.+)(?:jsessionid=[0-9a-z]{32})(?:&(.*))?$",
        b"(.+)(?:phpsessid=[0-9a-z]{32})(?:&(.*))?$",
        b"(.+)(?:sid=[0-9a-z]{32})(?:&(.*))?$",
        b"(.+)(?:aspsessionid[a-z]{8}=[a-zA-Z]{24})(?:&(.*))?$",
        b"(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
    )
]
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
104
diff
changeset
|
97 |
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
104
diff
changeset
|
98 # Above based on this from broken Java code: |
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
104
diff
changeset
|
99 # https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7 |
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
104
diff
changeset
|
100 #(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), |
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
104
diff
changeset
|
101 #(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), |
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
104
diff
changeset
|
102 #(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), |
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
104
diff
changeset
|
103 #(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), |
40c460fed99f
working on sessionID pblms, still
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
104
diff
changeset
|
104 #(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", |
104
fc9a045c872b
use my own Canonicalizer to fix more obscure
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
97
diff
changeset
|
105 |
85
1daa8e444cfe
work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
84
diff
changeset
|
def cdx_key(uristring):
    '''Return the sort key (a SURT-style string) for uristring.

    The URI is unquoted and passed to surt.surt with notDefaultCanon
    as canonicalizer.  The result is unquoted again, with undecodable
    bytes turned back into %XX escapes by the 'percent' error handler,
    re-quoted leaving the characters in the safe set alone, and
    lower-cased.  Finally any run of ',www<digits>' labels just before
    the closing ')' of the host part is removed.
    '''
    _surt = quote(unquote(surt.surt(unquote(uristring),
                                    canonicalizer=notDefaultCanon),
                          errors='percent'),
                  # The backslash is written as \\ here: a bare \] is an
                  #  invalid escape sequence, the string value is the same
                  safe='''!"$&'()*+,-./:;<=>?@[\\]^_`{|}~\x7f''' # '
                  ).lower()
    # Wrt \x7f (DEL), see discussion in notes wrt
    #   "biz,televida)" case
    # It remains to be seen whether other non-printing bytes
    #  will need to be treated as 'safe'
    return WPAT.sub(')',_surt)
73
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
117 |
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def keyed(l):
    '''Turn one tab-separated input line into a sortable record.

    l is expected to hold three fields: URI, a stamp that is passed
    through unchanged, and an RFC-2822 style date string.

    Returns ((cdx_key(uri), stamp, uri), epoch) where epoch is the
    date as a POSIX timestamp (float), or None if the line cannot be
    processed, in which case a message is written to stderr.
    '''
    try:
        uri, cc_stamp, dateTime = l.split('\t',2)
    except ValueError as e:
        # Fewer than three fields: report and skip the line rather than
        #  letting the exception abort the whole run
        print(l.rstrip(),e,sep='\t',file=sys.stderr)
        return None
    #print('ul',uri,file=sys.stderr)
    try:
        try:
            epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
        except OverflowError:
            # Fixed stand-in value for dates whose timestamp overflows
            #  (NOTE(review): looks like the last second of the
            #   year 3000, UTC -- confirm)
            epoch = 32535215999.0
        return ((cdx_key(uri), cc_stamp, uri), epoch)
    except (TypeError,IndexError,ValueError) as e:
        # Unparseable date, or a failure while building the key
        print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
        return None
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
130 |
97
135a8c56dcc2
include full URI in output
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
88
diff
changeset
|
# Name of the input file: lines of URI, stamp and date, tab-separated
fstr = sys.argv[1]

with open(fstr,"r") as ff:
    # crucial that the following is done _after_ the file is opened
    #  with the default (utf-8) locale!
    locale.setlocale(locale.LC_ALL, "C")
    # Lines that keyed() rejects (None) are dropped; the rest are
    #  ordered by their (key, stamp, uri) tuple using Python's own
    #  string comparison
    records = sorted((kk for l in ff if (kk:=keyed(l)) is not None),
                     key=lambda rec:rec[0])
    for (surt_key, cc_stamp, uri), ts in records:
        # Non-ASCII characters in the URI are written as \uXXXX or
        #  \UXXXXXXXX escapes
        print(surt_key,cc_stamp,
              uri.encode('ascii',errors='java_unicode').decode('ascii'),
              ts,sep='\t')