# HG changeset patch
# User Henry S. Thompson
# Date 1696446274 -3600
# Node ID 3ba401110c229eb1fecbf0e01430776cd3f925bb
# Parent  4c499fc47ea70a8706957cd7c531673cae6fa71c
handle -m case, support src from cmdline

diff -r 4c499fc47ea7 -r 3ba401110c22 bin/do_idx.sh
--- a/bin/do_idx.sh	Thu Oct 05 10:42:15 2023 +0100
+++ b/bin/do_idx.sh	Wed Oct 04 20:04:34 2023 +0100
@@ -1,5 +1,8 @@
 #!/bin/bash
-# Usage: do_idx.sh [-d...] [-m previously-merged-log] result-dir cdx-source-dir
+# Usage:
+#  do_idx.sh [-d...] [-m previously-merged-log] result-dir cdx-source-dir [datestream]
+# datestream, if given, is a command line (expanded unquoted, so word-split)
+#  whose output replaces the default sorted merge of result-dir/ks_[0-9]*.tsv
 while [ "$1" = "-d" ]
 do
   shift
@@ -13,15 +16,35 @@
 fi
 
 export res="$1" orig="$2"
+
+if [ "$3" ]
+then
+  src="$3"
+else
+  # $src is expanded unquoted below, where a leading LC_ALL=C would be looked
+  # up as a command name, not treated as an assignment: so go via env.
+  src="env LC_ALL=C sort -m -k1,3 -s $res/ks_[0-9]*.tsv"
+fi
+
 # igzip was faster, but produced bigger files, so went to gzip one step
 # smaller than default (-6), which produces slightly _smaller_ blocks.
-~/lib/python/cc/lmh/merge_date.py $debug $merged <(LC_ALL=C sort -m -k1,3 -s $res/ks_[0-9]*.tsv) \
+~/lib/python/cc/lmh/merge_date.py $debug $merged <($src) \
         $orig $res/idx 2>$res/merge.log | \
-  parallel -j 10 'echo {#} {} >$res/merge_{#}.log
-   echo $(date) {#} {}
-   export res
-   split -l 3000 --filter="gzip -c -7 --keep | \
-     tee >(wc -c >> \
-       $res/merge_{#}.log)" \
-     {} > {}.gz && \
-   rm {}'
+  # Note that there will be no file {} if we're rebuilding,
+  # as the gzipped file was left untouched.
+  parallel -j 10 'echo -n $(date) {#} {}
+   if [ -f {} ]
+   then
+    echo -n {#} {} >$res/merge_{#}.log
+    echo packing... | tee -a $res/merge_{#}.log
+    export res
+    split -l 3000 --filter="gzip -c -7 --keep | \
+      tee >(wc -c >> \
+        $res/merge_{#}.log)" \
+      {} > {}.gz && \
+    rm {}
+    echo $(date) packed
+   else
+    # do not touch the old log file if not rebuilt
+    echo skipped
+   fi'