annotate bin/sort_date.py @ 82:7bbb14f6e394

merge
author Henry Thompson <ht@markup.co.uk>
date Sat, 19 Aug 2023 16:02:29 -0400
parents db3c689175fe bf09a1d80d7b
children d92bd8527718
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
73
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/usr/bin/python3
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 # Assumes you have used grep -v $'\t' on input for speed
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 # Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/'
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4 # to fix a common 'bad' timestamp (~ .2% of inputs)
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5 import email.utils
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6 import sys
75
177f7df2bf46 handle %-encoded utf-8 as idna
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 73
diff changeset
7 from urllib.parse import urlsplit, unquote
76
eeef811f734d handle corner cases with final . and initial www..+
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 75
diff changeset
8 import re
77
bf09a1d80d7b make CC's own sorting explicit
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 76
diff changeset
9 # Thanks to https://stackoverflow.com/a/8776871
bf09a1d80d7b make CC's own sorting explicit
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 76
diff changeset
10 import locale
bf09a1d80d7b make CC's own sorting explicit
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 76
diff changeset
11 from functools import cmp_to_key
76
eeef811f734d handle corner cases with final . and initial www..+
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 75
diff changeset
12
eeef811f734d handle corner cases with final . and initial www..+
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 75
diff changeset
# Matches a host label that is exactly "www" optionally followed by digits.
WWW = re.compile("www[0-9]*$")


def auth(s):
    """Return the host name *s* as reversed, comma-joined labels.

    The host is split on '.', the labels are put in reverse order
    (top-level label first) and joined with ','.  If *s* contains a '%',
    every label is first %-unquoted and converted to its ASCII form with
    the 'idna' codec.

    An empty label produced by a final '.' is removed, as are any
    www / wwwNNN labels at the left-hand end of the original host name.

    Raises IndexError if no label is left to inspect (for example when
    *s* is empty or consists only of a www label); the idna codec may
    raise UnicodeError for labels it cannot encode.
    """
    #print('auth',s,file=sys.stderr)
    labels = s.split('.')
    if '%' in s:
        labels = [unquote(label).encode('idna').decode('ascii')
                  for label in labels]
    labels = labels[::-1]
    if labels[0] == '':
        # final full stop is pruned by CC
        del labels[0]
    while WWW.match(labels[-1]):
        # any www... prefix is pruned
        del labels[-1]
    return ','.join(labels)
73
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
29
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def keyed(l):
    """Turn one input line into a ``(key, epoch)`` pair, or None.

    *l* is a bytes line of the form ``<uri> TAB <date-time>``.  The URI
    must be ASCII; the date-time is decoded as UTF-8 and parsed as an
    email/HTTP-style date.

    The key is built from the URI's network location, path and query
    only (scheme and fragment are not used):
    ``auth(host)[:port] + ')' + path + ['?' + query]``.
    The epoch is the POSIX timestamp (a float) of the parsed date.

    Any line that cannot be processed -- no tab, undecodable bytes, an
    unparsable date, or a host that auth() or the port split rejects --
    is reported on stderr and None is returned, so a single bad line
    never aborts the whole run.
    """
    try:
        uri, dateTime = l.split(b'\t', 1)
        uri = uri.decode('ascii')
        dateTime = dateTime.decode('utf8')  # occasional weird ones
    except ValueError as e:
        # ValueError covers both a missing tab (failed unpacking) and
        # UnicodeDecodeError, which is a ValueError subclass.  There is
        # no usable dateTime yet, so report the raw line instead.
        print(l.decode('utf8', 'replace').rstrip(), e,
              sep='\t', file=sys.stderr)
        return None
    #print('ul',uri,file=sys.stderr)
    try:
        try:
            epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
        except OverflowError:
            # Date too large to convert: substitute a fixed far-future
            # value.  NOTE(review): looks like 3000-12-31T23:59:59Z -- confirm.
            epoch = 32535215999.0
        parts = urlsplit(uri)
        nl = parts.netloc
        pq = '?%s' % parts.query if parts.query else ''
        #print('nl',nl,file=sys.stderr)
        if ':' in nl:
            # NOTE(review): a netloc with more than one ':' (userinfo,
            # IPv6 literal) fails this unpacking with ValueError and the
            # line is reported and dropped by the handler below.
            pa, pp = nl.split(':')
            authority = '%s:%s' % (auth(pa), pp)
        else:
            authority = auth(nl)
        return ('%s)%s%s' % (authority, parts.path, pq), epoch)
    except (TypeError, IndexError, ValueError) as e:
        print(dateTime.rstrip(), e, sep='\t', file=sys.stderr)
        return None
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
52
77
bf09a1d80d7b make CC's own sorting explicit
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 76
diff changeset
# Use the "C" locale and compare keys with locale.strcoll, so that the
# collation applied to the keys is stated explicitly (CC's own sorting).
locale.setlocale(locale.LC_ALL, "C")
ctk = cmp_to_key(locale.strcoll)

# Read the file named by the first command-line argument as bytes, keep
# every line that keyed() could process, sort by key, and write
# "key TAB epoch" lines to stdout.
with open(sys.argv[1], "rb") as ff:
    entries = []
    for line in ff:
        entry = keyed(line)
        if entry is not None:
            entries.append(entry)
    entries.sort(key=lambda pair: ctk(pair[0]))
    for key, epoch in entries:
        print(key, epoch, sep='\t')
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
60