view bin/do_idx.sh @ 109:52c6a9b0fc8c

loosen must-match criterion in the both-messy case
author Henry Thompson <ht@markup.co.uk>
date Tue, 19 Sep 2023 19:29:41 +0100
parents 9403c02d5034
children a0ea1e4a714d
line wrap: on
line source

#!/bin/bash
# Usage: do_idx.sh RES [ORIG]
#
# Merges the (already sorted) RES/ks_[0-9]*.tsv files with `sort -m` and
# hands the merged stream, ORIG (if given) and RES/idx to merge_date.py.
# Every line merge_date.py prints is taken as the name of a file, and up to
# 10 of those files are processed at a time by GNU parallel:
#   * RES/merge_N.log (N = parallel job number) is started with "N FILE";
#   * FILE is cut into 3000-line chunks, each chunk is gzipped on its own,
#     and the compressed chunks are concatenated into FILE.gz;
#   * the compressed size in bytes of each chunk is appended to
#     RES/merge_N.log;
#   * FILE is removed only if split succeeded.
#
# RES is exported because the shell that split starts for --filter expands
# $res itself (so the path survives spaces and other special characters).

# Report failure if merge_date.py fails, not only if parallel does.
set -o pipefail

res="${1:?usage: ${0##*/} RES [ORIG]}"
export res
orig="${2-}"

# igzip was faster, but produced bigger files, so went to gzip -7, one
# compression level beyond the default (-6), which produces slightly
# _smaller_ blocks.
# NOTE(review): wc runs inside a process substitution, which the filter
# shell presumably does not wait for -- confirm that the per-chunk sizes
# cannot reach the log out of order.
# ${orig:+...} keeps the old behaviour of passing no argument at all when
# ORIG is omitted.
merge_date.py <(LC_ALL=C sort -m -k1,2 -s "$res"/ks_[0-9]*.tsv) \
              ${orig:+"$orig"} "$res/idx" |\
          parallel -j 10 'echo {#} {} >"$res/merge_{#}.log"
                          echo $(date) {#} {}
                          split -l 3000 --filter="gzip -c -7 | \
                                                  tee >(wc -c >> \
                                                        \"\$res/merge_{#}.log\")" \
                                    {} > {}.gz && \
                                    rm {}'