annotate bin/sort_date.py @ 86:3a2ae6057242

handle double .www, more keep-me chars
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 28 Aug 2023 21:07:43 +0100
parents 1daa8e444cfe
children 49faf679d7df
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
73
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/usr/bin/python3
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 # Assumes you have used grep -v $'\t' on input for speed
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 # Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/'
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4 # to fix a common 'bad' timestamp (~ .2% of inputs)
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5 import email.utils
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6 import sys
85
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
7 from urllib.parse import urlsplit, quote, unquote
83
d92bd8527718 use surt instead of trying to create index term by hand
Henry Thompson <ht@markup.co.uk>
parents: 82
diff changeset
8 from surt import surt
85
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
9
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
10 import re, codecs
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
11
77
bf09a1d80d7b make CC's own sorting explicit
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 76
diff changeset
12 # Thanks to https://stackoverflow.com/a/8776871
bf09a1d80d7b make CC's own sorting explicit
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 76
diff changeset
13 import locale
bf09a1d80d7b make CC's own sorting explicit
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 76
diff changeset
14 from functools import cmp_to_key
76
eeef811f734d handle corner cases with final . and initial www..+
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 75
diff changeset
15
85
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
16 def percent_encode(ude):
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
17 #print(ude.object,ude.object[ude.start:ude.end])
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
18 return (''.join('%%%X'%c for c in ude.object[ude.start:ude.end]),
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
19 ude.end)
75
177f7df2bf46 handle %-encoded utf-8 as idna
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 73
diff changeset
20
85
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
21 codecs.register_error('percent',percent_encode)
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
22
86
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
23 # From RFC-3986:
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
24 # gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
25 # sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
26 # / "*" / "+" / "," / ";" / "="
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
27 # But # _is_ escaped in Java surt results
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
28 # and additionally " \ : < = > ? \ ^ _ ` { | } are not
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
29
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
30 # Also, Java surt strips _all_ leading 'www.',
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
31 # where python3 surt only strips the first one.
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
32
85
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
33 def cdx_key(uristring):
86
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
34 _surt = quote(unquote(surt(uristring),
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
35 errors='percent'),
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
36 safe='!"$&\'()*+,/:;<=>?@[\\]^_`{|}').lower()
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
37 while _surt.endswith(",www"):
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
38 _surt = _surt[:-4]
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
39
3a2ae6057242 handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 85
diff changeset
40 return _surt
73
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
41
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
42 def keyed(l):
85
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
43 uri, cc_stamp, dateTime = l.split('\t',2)
77
bf09a1d80d7b make CC's own sorting explicit
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 76
diff changeset
44 #print('ul',uri,file=sys.stderr)
73
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
45 try:
76
eeef811f734d handle corner cases with final . and initial www..+
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 75
diff changeset
46 try:
eeef811f734d handle corner cases with final . and initial www..+
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 75
diff changeset
47 epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
eeef811f734d handle corner cases with final . and initial www..+
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 75
diff changeset
48 except OverflowError:
eeef811f734d handle corner cases with final . and initial www..+
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 75
diff changeset
49 epoch = 32535215999.0
85
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
50 return ((cdx_key(uri), cc_stamp), epoch)
75
177f7df2bf46 handle %-encoded utf-8 as idna
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 73
diff changeset
51 except (TypeError,IndexError,ValueError) as e:
73
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
52 print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
53 return
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
54
85
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
55 with open(sys.argv[1],"r") as ff:
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
56 # crucial that the following is done _after_ the file is opened
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
57 # with the default (utf-8) locale!
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
58 locale.setlocale(locale.LC_ALL, "C")
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
59 ctk=cmp_to_key(locale.strcoll)
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
60 for tl in sorted((kk for l in ff if (kk:=keyed(l)) is not None),
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
61 key=lambda x:x[0]):
1daa8e444cfe work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 84
diff changeset
62 print(tl[0][0],tl[0][1],tl[1],sep='\t')
77
bf09a1d80d7b make CC's own sorting explicit
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 76
diff changeset
63