annotate bin/sort_date.py @ 75:177f7df2bf46

handle %-encoded utf-8 as idna
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 09 Aug 2023 02:01:32 +0100
parents e8c667bf8965
children eeef811f734d
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
73
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/usr/bin/python3
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 # Assumes you have used grep -v $'\t' on input for speed
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 # Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/'
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4 # to fix a common 'bad' timestamp (~ .2% of inputs)
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5 import email.utils
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6 import sys
75
177f7df2bf46 handle %-encoded utf-8 as idna
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 73
diff changeset
7 from urllib.parse import urlsplit, unquote
177f7df2bf46 handle %-encoded utf-8 as idna
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 73
diff changeset
8
177f7df2bf46 handle %-encoded utf-8 as idna
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 73
diff changeset
def auth(s):
    """Return the host name *s* as a comma-joined, reversed label list.

    The host is split on '.', the labels are put in reverse order
    (top-level domain first) and joined with ','.  When *s* contains a
    '%', every label is first percent-decoded and converted with the
    'idna' codec, so %-encoded UTF-8 host names come out in their ASCII
    (punycode) form.

    The 'idna' codec raises UnicodeError (a ValueError subclass) for
    labels it cannot convert; that is left to the caller to handle.
    """
    labels = s.split('.')
    if '%' in s:
        # Only pay for unquote + IDNA when there is something to decode.
        labels = [unquote(label).encode('idna').decode('ascii')
                  for label in labels]
    return ','.join(reversed(labels))
73
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
16
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def keyed(l):
    """Turn one 'URI<TAB>date' input line into a (sort key, epoch) pair.

    The key is the reversed, comma-joined host name (see auth), an
    optional ':port', a ')' separator and then the URI path.  The epoch
    is the POSIX timestamp of the date as parsed by
    email.utils.parsedate_to_datetime.

    Returns None, after writing a tab-separated diagnostic to stderr,
    when the line cannot be processed: no TAB separator, an unparseable
    date, a netloc with more than one ':' (e.g. userinfo with a password
    or an IPv6 literal), or a host label the 'idna' codec rejects.
    """
    try:
        uri, dateTime = l.split('\t', 1)
    except ValueError as e:
        # No TAB on this line: report and skip it like any other bad
        # record instead of letting the exception abort the whole run.
        print(l.rstrip(), e, sep='\t', file=sys.stderr)
        return None
    try:
        # parsedate_to_datetime signals failure with TypeError on older
        # Pythons and ValueError on newer ones; both are caught below.
        epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
        parts = urlsplit(uri)
        nl = parts.netloc
        if ':' in nl:
            # Exactly one ':' is expected (host:port); anything else
            # raises ValueError here and the line is reported below.
            pa, pp = nl.split(':')
            return ('%s:%s)%s' % (auth(pa), pp, parts.path), epoch)
        return ('%s)%s' % (auth(nl), parts.path), epoch)
    except (TypeError, IndexError, ValueError) as e:
        print(dateTime.rstrip(), e, sep='\t', file=sys.stderr)
        return None
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
31
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Key every line of the input file named on the command line, drop the
# lines keyed() rejected (it returns None for those), order the rest by
# key and write them out as key<TAB>epoch.
with open(sys.argv[1], "r") as infile:
    entries = [pair for pair in map(keyed, infile) if pair is not None]
    # Sort on the key alone so entries with equal keys keep input order.
    entries.sort(key=lambda pair: pair[0])
    for sort_key, epoch in entries:
        print(sort_key, epoch, sep='\t')
e8c667bf8965 compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
36