changeset 151:3ba401110c22 mergefix

handle -m case, support src from cmdline
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 04 Oct 2023 20:04:34 +0100
parents 4c499fc47ea7
children
files bin/do_idx.sh
diffstat 1 files changed, 29 insertions(+), 10 deletions(-) [+]
line wrap: on
line diff
--- a/bin/do_idx.sh	Thu Oct 05 10:42:15 2023 +0100
+++ b/bin/do_idx.sh	Wed Oct 04 20:04:34 2023 +0100
@@ -1,5 +1,6 @@
 #!/bin/bash
-# Usage: do_idx.sh [-d...] [-m previously-merged-log] result-dir cdx-source-dir
+# Usage:
+#  do_idx.sh [-d...] [-m previously-merged-log] result-dir cdx-source-dir [datestream]
 while [ "$1" = "-d" ]
 do
  shift
@@ -13,15 +14,33 @@
 fi
 export res="$1"
 orig="$2"
+
+if [ "$3" ]   # $3, if given, is a command line whose output is the stream to merge
+then
+ src="$3"
+else          # env is needed: via unquoted $src, LC_ALL=C is a command word, not an assignment
+ src="env LC_ALL=C sort -m -k1,3 -s $res/ks_[0-9]*.tsv"
+fi
+
 # igzip was faster, but produced bigger files, so went to gzip one step
 # smaller than default (-6), which produces slightly _smaller_ blocks.
-~/lib/python/cc/lmh/merge_date.py $debug $merged <(LC_ALL=C sort -m -k1,3 -s $res/ks_[0-9]*.tsv) \
+~/lib/python/cc/lmh/merge_date.py $debug $merged  <($src) \
               $orig $res/idx 2>$res/merge.log | \
-          parallel -j 10 'echo {#} {} >$res/merge_{#}.log
-                          echo $(date) {#} {}
-                          export res
-                          split -l 3000 --filter="gzip -c -7 --keep | \
-                                                  tee >(wc -c >> \
-                                                        $res/merge_{#}.log)" \
-                                    {} > {}.gz && \
-                                    rm {}'
+          # Note that when rebuilding, a file {} named by merge_date.py may
+          #  not exist, because its gzipped version was left untouched.
+          parallel -j 10 'echo -n $(date) {#} {}
+			  if [ -f {} ]
+                          then
+                            echo -n {#} {} >$res/merge_{#}.log
+                            echo packing... | tee -a $res/merge_{#}.log
+			    export res
+			    split -l 3000 --filter="gzip -c -7 --keep | \
+						    tee >(wc -c >> \
+							  $res/merge_{#}.log)" \
+				      {} > {}.gz && \
+				      rm {}
+                            echo $(date) packed
+                          else
+			    # do not touch the old log file if not rebuilt
+                            echo skipped
+                          fi'