comparison bin/do_idx.sh @ 105:9403c02d5034

switch to gzip -7 to get comparable compressed cdx block size
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 13 Sep 2023 16:48:43 +0100
parents 7d58dc01f329
children a0ea1e4a714d
comparison
equal deleted inserted replaced
104:fc9a045c872b 105:9403c02d5034
1 #!/bin/bash 1 #!/bin/bash
2 export res="$1" 2 export res="$1"
3 orig="$2" 3 orig="$2"
4 # igzip was faster, but produced bigger files, so went to gzip -7, one step
5 # above the default level (-6), which produces slightly _smaller_ blocks.
4 merge_date.py <(LC_ALL=C sort -m -k1,2 -s $res/ks_[0-9]*.tsv) $orig $res/idx |\ 6 merge_date.py <(LC_ALL=C sort -m -k1,2 -s $res/ks_[0-9]*.tsv) $orig $res/idx |\
5 parallel -j 10 'echo {#} {} >$res/merge_{#}.log 7 parallel -j 10 'echo {#} {} >$res/merge_{#}.log
6 echo $(date) {#} {} 8 echo $(date) {#} {}
7 export res 9 export res
8 split -l 3000 --filter="igzip -c | \ 10 split -l 3000 --filter="gzip -c -7 --keep | \
9 tee >(wc -c >> \ 11 tee >(wc -c >> \
10 $res/merge_{#}.log)" \ 12 $res/merge_{#}.log)" \
11 {} > {}.gz && \ 13 {} > {}.gz && \
12 rm {}' 14 rm {}'