Mercurial > hg > cc > cirrus_work
annotate bin/sort_date.py @ 86:3a2ae6057242
handle double .www, more keep-me chars
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 28 Aug 2023 21:07:43 +0100 |
parents | 1daa8e444cfe |
children | 49faf679d7df |
rev | line source |
---|---|
73
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/usr/bin/python3 |
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 # Assumes you have used grep -v $'\t' on input for speed |
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 # Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/' |
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 # to fix a common 'bad' timestamp (~ .2% of inputs) |
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 import email.utils |
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 import sys |
85
1daa8e444cfe
work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
84
diff
changeset
|
7 from urllib.parse import urlsplit, quote, unquote |
83
d92bd8527718
use surt instead of trying to create index term by hand
Henry Thompson <ht@markup.co.uk>
parents:
82
diff
changeset
|
8 from surt import surt |
85
1daa8e444cfe
work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
84
diff
changeset
|
9 |
1daa8e444cfe
work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
84
diff
changeset
|
10 import re, codecs |
1daa8e444cfe
work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
84
diff
changeset
|
11 |
77
bf09a1d80d7b
make CC's own sorting explicit
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
76
diff
changeset
|
12 # Thanks to https://stackoverflow.com/a/8776871 |
bf09a1d80d7b
make CC's own sorting explicit
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
76
diff
changeset
|
13 import locale |
bf09a1d80d7b
make CC's own sorting explicit
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
76
diff
changeset
|
14 from functools import cmp_to_key |
76
eeef811f734d
handle corner cases with final . and initial www..+
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
75
diff
changeset
|
15 |
85
1daa8e444cfe
work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
84
diff
changeset
|
def percent_encode(ude):
    """Codec error handler: replace undecodable bytes with %XX escapes.

    ude is the UnicodeDecodeError passed in by the codec machinery; the
    offending bytes are ude.object[ude.start:ude.end].

    Returns the (replacement, resume_position) pair that the codecs
    error-handler protocol expects: every offending byte is rendered as
    '%' plus two upper-case hex digits, and decoding resumes at ude.end.
    """
    # %02X (not %X) so that a byte below 0x10 still yields a well-formed
    # two-digit escape such as %0A rather than %A.
    return (''.join('%%%02X' % c for c in ude.object[ude.start:ude.end]),
            ude.end)


# Make the handler available as errors='percent' to decoders.
codecs.register_error('percent', percent_encode)
1daa8e444cfe
work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
84
diff
changeset
|
22 |
86
3a2ae6057242
handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
85
diff
changeset
|
23 # From RFC-3986: |
3a2ae6057242
handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
85
diff
changeset
|
24 # gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" |
3a2ae6057242
handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
85
diff
changeset
|
25 # sub-delims = "!" / "$" / "&" / "'" / "(" / ")" |
3a2ae6057242
handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
85
diff
changeset
|
26 # / "*" / "+" / "," / ";" / "=" |
3a2ae6057242
handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
85
diff
changeset
|
27 # But # _is_ escaped in Java surt results |
3a2ae6057242
handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
85
diff
changeset
|
28 # and additionally " < > \ ^ _ ` { | } are not escaped either |
3a2ae6057242
handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
85
diff
changeset
|
29 |
3a2ae6057242
handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
85
diff
changeset
|
30 # Also, Java surt strips _all_ leading 'www.', |
3a2ae6057242
handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
85
diff
changeset
|
31 # where python3 surt only strips the first one. |
3a2ae6057242
handle double .www, more keep-me chars
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
85
diff
changeset
|
32 |
85
1daa8e444cfe
work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
84
diff
changeset
|
def cdx_key(uristring):
    """Return a lower-cased SURT index key for uristring.

    The key produced by the python surt library is post-processed to
    imitate what the comments in this file describe for the Java SURT
    implementation:

    * %-escapes are decoded and the result is re-quoted with a wider set
      of 'safe' characters, so only characters outside that set stay
      %-encoded.  Bytes that are not valid UTF-8 are restored as %XX by
      the 'percent' codec error handler registered in this file.
    * every remaining 'www' label is removed from the end of the reversed
      host, since python surt only strips the first leading 'www.'.
    """
    decoded = unquote(surt(uristring), errors='percent')
    _surt = quote(decoded,
                  safe='!"$&\'()*+,/:;<=>?@[\\]^_`{|}').lower()
    # The reversed host is taken to be everything before the first ')'
    # (keys presumably look like 'com,example)/path' -- confirm against
    # the surt package).  Stripping there, rather than at the end of the
    # whole key, catches hosts followed by a path and leaves a path or
    # query that merely ends in ',www' alone.  If there is no ')', the
    # whole key is treated as the host, as the previous code did.
    host, paren, rest = _surt.partition(')')
    while host.endswith(",www"):
        host = host[:-4]

    return host + paren + rest
73
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
41 |
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def keyed(l):
    """Turn one input line into a sortable ((key, cc_stamp), epoch) entry.

    l is a line of three tab-separated fields: a URI, a timestamp field
    that is passed through unchanged, and a date string that is parsed
    with email.utils.parsedate_to_datetime.

    Returns ((cdx_key(uri), cc_stamp), epoch) where epoch is the parsed
    date as a POSIX timestamp (float), or None if the line cannot be
    processed; in that case the offending text and the error are written
    tab-separated to stderr.
    """
    try:
        uri, cc_stamp, dateTime = l.split('\t', 2)
    except ValueError as e:
        # Fewer than three fields: report and skip the line, like any
        # other bad input, instead of aborting the whole run.
        print(l.rstrip(), e, sep='\t', file=sys.stderr)
        return None
    #print('ul',uri,file=sys.stderr)
    try:
        try:
            epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
        except OverflowError:
            # Date too large to convert: clamp to 3000-12-31T23:59:59Z.
            epoch = 32535215999.0
        return ((cdx_key(uri), cc_stamp), epoch)
    except (TypeError, IndexError, ValueError) as e:
        # Unparseable date (or a failure while building the key).
        print(dateTime.rstrip(), e, sep='\t', file=sys.stderr)
        return None
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
54 |
85
1daa8e444cfe
work-around for weird handling of %-encoding in Java impl. of SURT
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
84
diff
changeset
|
# Read the file named by the first command-line argument, key every line
# with keyed(), and print the usable entries to stdout, tab-separated and
# sorted by their (key, cc_stamp) pair.
#
# The input encoding is given explicitly: without it open() picks the
# encoding from the current locale, so decoding only worked because the
# file was opened before the switch to the "C" locale below.
with open(sys.argv[1], "r", encoding="utf-8") as ff:
    locale.setlocale(locale.LC_ALL, "C")
    # NOTE(review): ctk is built but not passed to sorted() below, which
    # orders by plain tuple comparison -- confirm whether it is still needed.
    ctk = cmp_to_key(locale.strcoll)
    # keyed() returns None for lines it could not process; drop those.
    entries = (kk for l in ff if (kk := keyed(l)) is not None)
    for (key, cc_stamp), epoch in sorted(entries, key=lambda x: x[0]):
        print(key, cc_stamp, epoch, sep='\t')
77
bf09a1d80d7b
make CC's own sorting explicit
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
76
diff
changeset
|
63 |