Mercurial > hg > cc > cirrus_work
annotate bin/sort_date.py @ 82:7bbb14f6e394
merge
author | Henry Thompson <ht@markup.co.uk> |
---|---|
date | Sat, 19 Aug 2023 16:02:29 -0400 |
parents | db3c689175fe bf09a1d80d7b |
children | d92bd8527718 |
rev | line source |
---|---|
73
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/usr/bin/python3 |
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 # Assumes you have used grep -v $'\t' on input for speed |
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 # Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/' |
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 # to fix a common 'bad' timestamp (~ .2% of inputs) |
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 import email.utils |
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 import sys |
75
177f7df2bf46
handle %-encoded utf-8 as idna
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
73
diff
changeset
|
7 from urllib.parse import urlsplit, unquote |
76
eeef811f734d
handle corner cases with final . and initial www..+
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
75
diff
changeset
|
8 import re |
77
bf09a1d80d7b
make CC's own sorting explicit
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
76
diff
changeset
|
9 # Thanks to https://stackoverflow.com/a/8776871 |
bf09a1d80d7b
make CC's own sorting explicit
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
76
diff
changeset
|
10 import locale |
bf09a1d80d7b
make CC's own sorting explicit
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
76
diff
changeset
|
11 from functools import cmp_to_key |
76
eeef811f734d
handle corner cases with final . and initial www..+
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
75
diff
changeset
|
12 |
eeef811f734d
handle corner cases with final . and initial www..+
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
75
diff
changeset
|
# Matches a host label that is exactly "www" optionally followed by digits
# (used with .match, so it is anchored at the start as well).
WWW=re.compile("www[0-9]*$")

def auth(s):
    """Return the host name *s* as a reversed, comma-separated key.

    The host is split on '.', and the labels are emitted from last to
    first, joined with ','  (e.g. 'www.example.com' -> 'com,example').

    * If *s* contains a '%', every label is %-unquoted and passed through
      the 'idna' codec, so %-encoded UTF-8 labels come out as punycode.
    * One empty final label (i.e. a trailing full stop) is dropped.
    * Every leading label matching ``www[0-9]*`` is dropped.

    Raises:
        IndexError: nothing is left after pruning (e.g. '', 'www', 'www.').
            The original code raised the same exception type implicitly;
            here it carries a message naming the offending host.
        UnicodeError: may propagate from the 'idna' codec for labels it
            rejects (it is a ValueError subclass).
    """
    #print('auth',s,file=sys.stderr)
    if '%' in s:
        labels=[unquote(k).encode('idna').decode('ascii') for k in s.split('.')]
    else:
        labels=s.split('.')
    if labels[-1] == '':
        # final full stop is pruned by CC
        labels.pop()
    # any www... prefix is pruned: find the first label that is not www/wwwN
    first=0
    while first < len(labels) and WWW.match(labels[first]):
        first+=1
    if first == len(labels):
        raise IndexError('no host labels left after pruning: %r'%s)
    return ','.join(reversed(labels[first:]))
73
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
29 |
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def keyed(l):
    """Convert one raw input line into a ``(key, epoch)`` pair.

    *l* is a bytes line of the form ``<uri> TAB <date-time>``.  The URI is
    decoded as ASCII and the date-time as UTF-8.  The date-time is parsed
    with email.utils.parsedate_to_datetime and converted to a POSIX
    timestamp.  The key is built from the URI as
    ``<auth(host)>[:<port>])<path>[?<query>]``.

    Returns:
        ``(key, epoch)`` on success, or None if the line cannot be
        processed.  In the latter case a diagnostic is written to stderr.

    Lines are skipped (None returned) when:
      * there is no tab, or the bytes cannot be decoded;
      * the date cannot be parsed (TypeError or ValueError, depending on
        the Python version);
      * the netloc holds more than one ':' (the two-way unpack fails);
      * auth() raises IndexError/ValueError for the host.
    """
    try:
        # Kept inside a try so that one malformed line is reported and
        # skipped instead of aborting the whole run.
        # UnicodeDecodeError is a subclass of ValueError.
        uri, dateTime = l.split(b'\t',1)
        uri=uri.decode('ascii')
        dateTime=dateTime.decode('utf8') # occasional weird ones
    except ValueError as e:
        print(l.decode('utf8','replace').rstrip(),e,sep='\t',file=sys.stderr)
        return None
    #print('ul',uri,file=sys.stderr)
    try:
        try:
            epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
        except OverflowError:
            # Out-of-range date: substitute a fixed far-future timestamp
            # (23:59:59 on 31 Dec 3000, UTC).
            epoch = 32535215999.0
        parts = urlsplit(uri)
        nl = parts.netloc
        pq = '?%s'%parts.query if parts.query else ''
        #print('nl',nl,file=sys.stderr)
        if ':' in nl:
            # host:port -- more than one ':' raises ValueError here
            pa,pp=nl.split(':')
            return ('%s:%s)%s%s'%(auth(pa), pp, parts.path, pq),epoch)
        return ('%s)%s%s'%(auth(nl), parts.path, pq),epoch)
    except (TypeError,IndexError,ValueError) as e:
        print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
        return None
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
52 |
77
bf09a1d80d7b
make CC's own sorting explicit
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
76
diff
changeset
|
# Sort with locale.strcoll under the "C" locale, making the collation
# used for the keys explicit rather than inherited from the environment.
locale.setlocale(locale.LC_ALL, "C")
ctk=cmp_to_key(locale.strcoll)

# Read the file named by the first command-line argument as bytes, key
# every line, drop the ones keyed() rejected, then print the survivors
# ordered by key only (the sort is stable, so lines with equal keys keep
# their input order).
with open(sys.argv[1],"rb") as ff:
    pairs=[]
    for line in ff:
        kk=keyed(line)
        if kk is not None:
            pairs.append(kk)
    pairs.sort(key=lambda pair:ctk(pair[0]))
    for tl in pairs:
        print(tl[0],tl[1],sep='\t')
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
60 |