Mercurial > hg > cc > cirrus_work
annotate bin/sort_date.py @ 75:177f7df2bf46
handle %-encoded utf-8 as idna
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 09 Aug 2023 02:01:32 +0100 |
parents | e8c667bf8965 |
children | eeef811f734d |
rev | line source |
---|---|
73
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/usr/bin/python3 |
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 # Assumes you have used grep -v $'\t' on input for speed |
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 # Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/' |
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 # to fix a common 'bad' timestamp (~ .2% of inputs) |
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 import email.utils |
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 import sys |
75
177f7df2bf46
handle %-encoded utf-8 as idna
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
73
diff
changeset
|
7 from urllib.parse import urlsplit, unquote |
177f7df2bf46
handle %-encoded utf-8 as idna
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
73
diff
changeset
|
8 |
177f7df2bf46
handle %-encoded utf-8 as idna
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
73
diff
changeset
|
def auth(s):
    """Return the host name *s* as a reversed, comma-joined label string.

    The name is split on '.', the label order is reversed and the labels
    are joined with ',' (so 'www.example.com' becomes 'com,example,www').

    If *s* contains a '%' anywhere, every label is first %-decoded and
    then passed through the 'idna' codec, so %-encoded UTF-8 labels come
    out in their ASCII ('xn--') form.  Names without '%' are used as-is.

    Any error raised by the 'idna' codec propagates to the caller.
    """
    labels = s.split('.')
    if '%' in s:
        # %-encoded UTF-8: decode, then convert each label to its IDNA form
        labels = [unquote(label).encode('idna').decode('ascii')
                  for label in labels]
    return ','.join(reversed(labels))
73
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
16 |
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def keyed(l):
    """Turn one 'URI<TAB>date' input line into a (key, epoch) pair.

    The key is built from the URI: the authority rewritten by auth()
    (labels reversed and comma-joined), followed by ':port' when the
    netloc contains a ':', then ')' and the URI path.  The epoch is the
    date, parsed by email.utils.parsedate_to_datetime, expressed as
    seconds since the Unix epoch (a float).

    Returns None, after writing a tab-separated diagnostic to stderr,
    when the line has no tab, the date cannot be parsed, or the URI
    cannot be split or converted.
    """
    uri, tab, dateTime = l.partition('\t')
    if not tab:
        # A line without a separator used to abort the whole run with an
        # uncaught ValueError; report it and skip it like other bad lines.
        print(l.rstrip(),'no tab-separated date',sep='\t',file=sys.stderr)
        return
    try:
        when = email.utils.parsedate_to_datetime(dateTime)
        if when.tzinfo is None:
            # A missing or '-0000' zone yields a naive datetime, which
            # .timestamp() would interpret in the machine's local zone.
            # Pin it to UTC so the result does not depend on where the
            # script runs.
            from datetime import timezone
            when = when.replace(tzinfo=timezone.utc)
        epoch = when.timestamp()
        parts = urlsplit(uri)
        nl = parts.netloc
        if ':' in nl:
            # A netloc with more than one ':' (userinfo, IPv6 literal)
            # fails to unpack here and is rejected by the handler below.
            pa,pp=nl.split(':')
            return ('%s:%s)%s'%(auth(pa), pp, parts.path),epoch)
        else:
            return ('%s)%s'%(auth(nl), parts.path),epoch)
    except (TypeError,IndexError,ValueError) as e:
        # Only the date field is echoed, even when the URI was at fault.
        print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
        return
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
31 |
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# Read the input file named on the command line, keep the (key, epoch)
# pairs for every line keyed() accepts, and print them ordered by key as
# tab-separated 'key<TAB>epoch' lines.
with open(sys.argv[1],"r") as ff:
    rows = []
    for line in ff:
        pair = keyed(line)
        if pair is not None:
            rows.append(pair)
    # Order by key only; the sort is stable, so equal keys keep input order
    rows.sort(key=lambda pair: pair[0])
    for key, epoch in rows:
        print(key, epoch, sep='\t')
e8c667bf8965
compute timestamps, key and sort lmh lines
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
36 |