changeset 141:14d3802112b2

Remove the mistaken deletion of NONPRINT bytes from the query; add a -r option for rebuilding the output of a previous mistaken run
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 02 Oct 2023 18:52:43 +0100
parents d8b134f6ab03
children 0326805aa6df
files lib/python/cc/lmh/sort_date.py
diffstat 1 files changed, 14 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/lmh/sort_date.py	Sat Sep 30 18:04:15 2023 +0100
+++ b/lib/python/cc/lmh/sort_date.py	Mon Oct 02 18:52:43 2023 +0100
@@ -1,6 +1,7 @@
 #!/usr/bin/python3
 '''Process output of lmh_warc [new 3-column version]
    Usage: <(uz ....warc.gz | fgrep $'\t'|sed "/GMT$/s/\([^ ]\)GMT$/\1 GMT/")
+        or -r [previous output]
 '''
 
 # Assumes you have used grep -v $'\t' on input for speed
@@ -102,7 +103,6 @@
                        safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' #')
                 )
     # Wrt \x7f (DEL), see discussion in notes wrt cn,futnsz) case
-    query = query.translate(IDMAP,delete=NONPRINT)
     # </change>
     # <change>
     # Double-escape non-unicode %-encodings
@@ -203,9 +203,15 @@
     print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
     return
 
-def main(fstr):
+def main(fstr,rebuild=False):
 
   with open(fstr,"r") as ff:
+    if rebuild:
+      for l in ff:
+        ok, ocd, ou, ot = l.split()
+        # Note, not sorted, so can compare with input
+        print(cdx_key(ou),ocd,ou,ot,sep='\t')
+      return
     # crucial that the following is done _after_ the file is opened
     #  with the default (utf-8) locale!
     locale.setlocale(locale.LC_ALL, "C")
@@ -217,4 +223,9 @@
             ts,sep='\t')
 
 if __name__ == '__main__':
-  sys.exit(main(fstr = sys.argv[1]))
+  if sys.argv[1]=='-r':
+    rebuild=True
+    sys.argv.pop(1)
+  else:
+    rebuild=False
+  sys.exit(main(sys.argv[1],rebuild))