Mercurial > hg > cc > cirrus_work
changeset 141:14d3802112b2
remove the mistaken deletion of NONPRINT bytes from the query (drop the query.translate(IDMAP,delete=NONPRINT) call),
add support (-r option) for rebuilding the output of a previous mistaken run
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 02 Oct 2023 18:52:43 +0100 |
parents | d8b134f6ab03 |
children | 0326805aa6df |
files | lib/python/cc/lmh/sort_date.py |
diffstat | 1 files changed, 14 insertions(+), 3 deletions(-) [+] |
line wrap: on
line diff
--- a/lib/python/cc/lmh/sort_date.py Sat Sep 30 18:04:15 2023 +0100 +++ b/lib/python/cc/lmh/sort_date.py Mon Oct 02 18:52:43 2023 +0100 @@ -1,6 +1,7 @@ #!/usr/bin/python3 '''Process output of lmh_warc [new 3-column version] Usage: <(uz ....warc.gz | fgrep $'\t'|sed "/GMT$/s/\([^ ]\)GMT$/\1 GMT/") + or -r [previous output] ''' # Assumes you have used grep -v $'\t' on input for speed @@ -102,7 +103,6 @@ safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f''' #') ) # Wrt \x7f (DEL), see discussion in notes wrt cn,futnsz) case - query = query.translate(IDMAP,delete=NONPRINT) # </change> # <change> # Double-escape non-unicode %-encodings @@ -203,9 +203,15 @@ print(dateTime.rstrip(),e,sep='\t',file=sys.stderr) return -def main(fstr): +def main(fstr,rebuild=False): with open(fstr,"r") as ff: + if rebuild: + for l in ff: + ok, ocd, ou, ot = l.split() + # Note, not sorted, so can compare with input + print(cdx_key(ou),ocd,ou,ot,sep='\t') + return # crucial that the following is done _after_ the file is opened # with the default (utf-8) locale! locale.setlocale(locale.LC_ALL, "C") @@ -217,4 +223,9 @@ ts,sep='\t') if __name__ == '__main__': - sys.exit(main(fstr = sys.argv[1])) + if sys.argv[1]=='-r': + rebuild=True + sys.argv.pop(1) + else: + rebuild=False + sys.exit(main(sys.argv[1],rebuild))