diff bin/sort_date.py @ 97:135a8c56dcc2

include full URI in output
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 08 Sep 2023 18:06:54 +0100
parents 49faf679d7df
children fc9a045c872b
line wrap: on
line diff
--- a/bin/sort_date.py	Fri Sep 08 18:05:57 2023 +0100
+++ b/bin/sort_date.py	Fri Sep 08 18:06:54 2023 +0100
@@ -1,7 +1,12 @@
 #!/usr/bin/python3
+r'''Process output of lmh_warc [original 2-column version]
+   Usage: sort_date.py <(uz ....warc.gz | fgrep $'\t'|sed "/GMT$/s/\([^ ]\)GMT$/\1 GMT/")
+'''
+
 # Assumes you have used fgrep $'\t' on input (keeps only lines containing a tab) for speed
 # Recommended to also sed '/GMT$/s/\([^ ]\)GMT$/\1 GMT/'
 #  to fix a common 'bad' timestamp (~ .2% of inputs)
+
 import email.utils
 import sys
 from urllib.parse import urlsplit, quote, unquote
@@ -30,7 +35,7 @@
 #  and additionally " \ : < = > ? \ ^  _ ` { | } are not
 
 # Note also that although quote already does _not_ quote - . / _ ~
-#  they are included below as that's what we find in surt.surt 0.
+#  they are included below as that's what we find in surt.surt 0.3.1
 
 # Also, Java surt strips _all_ leading 'www\d*.',
 #  where python3 surt only strips the first one.
@@ -50,17 +55,19 @@
       epoch = email.utils.parsedate_to_datetime(dateTime).timestamp()
     except OverflowError:
       epoch = 32535215999.0
-    return ((cdx_key(uri), cc_stamp), epoch)
+    return ((cdx_key(uri), cc_stamp, uri), epoch)
   except (TypeError,IndexError,ValueError) as e:
     print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
     return
 
-with open(sys.argv[1],"r") as ff:
+fstr = sys.argv[1]
+
+with open(fstr,"r") as ff:
   # crucial that the following is done _after_ the file is opened
   #  with the default (utf-8) locale!
   locale.setlocale(locale.LC_ALL, "C")
   ctk=cmp_to_key(locale.strcoll)
-  for tl in sorted((kk for l in ff if (kk:=keyed(l)) is not None),
+  for key, ts in sorted((kk for l in ff if (kk:=keyed(l)) is not None),
                    key=lambda x:x[0]):
-    print(tl[0][0],tl[0][1],tl[1],sep='\t')
+    print(key[0],key[1],key[2],ts,sep='\t')