Mercurial > hg > cc > cirrus_work
diff bin/sort_date.py @ 97:135a8c56dcc2
include full URI in output
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 08 Sep 2023 18:06:54 +0100 |
parents | 49faf679d7df |
children | fc9a045c872b |
line wrap: on
line diff
--- a/bin/sort_date.py Fri Sep 08 18:05:57 2023 +0100 +++ b/bin/sort_date.py Fri Sep 08 18:06:54 2023 +0100 @@ -1,7 +1,12 @@ #!/usr/bin/python3 +'''Process output of lmh_warc [original 2-column version] + Usage: <(uz ....warc.gz | fgrep $'\t'|sed "/GMT$/s/\([^ ]\)GMT$/\1 GMT/") +''' + # Assumes you have used grep -v $'\t' on input for speed # Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/' # to fix a common 'bad' timestamp (~ .2% of inputs) + import email.utils import sys from urllib.parse import urlsplit, quote, unquote @@ -30,7 +35,7 @@ # and additionally " \ : < = > ? \ ^ _ ` { | } are not # Note also that although quote already does _not_ quote - . / _ ~ -# they are included below as that's what we find in surt.surt 0. +# they are included below as that's what we find in surt.surt 0.3.1 # Also, Java surt strips _all_ leading 'www\d*.', # where python3 surt only strips the first one. @@ -50,17 +55,19 @@ epoch = email.utils.parsedate_to_datetime(dateTime).timestamp() except OverflowError: epoch = 32535215999.0 - return ((cdx_key(uri), cc_stamp), epoch) + return ((cdx_key(uri), cc_stamp, uri), epoch) except (TypeError,IndexError,ValueError) as e: print(dateTime.rstrip(),e,sep='\t',file=sys.stderr) return -with open(sys.argv[1],"r") as ff: +fstr = sys.argv[1] + +with open(fstr,"r") as ff: # crucial that the following is done _after_ the file is opened # with the default (utf-8) locale! locale.setlocale(locale.LC_ALL, "C") ctk=cmp_to_key(locale.strcoll) - for tl in sorted((kk for l in ff if (kk:=keyed(l)) is not None), + for key, ts in sorted((kk for l in ff if (kk:=keyed(l)) is not None), key=lambda x:x[0]): - print(tl[0][0],tl[0][1],tl[1],sep='\t') + print(key[0],key[1],key[2],ts,sep='\t')