# HG changeset patch # User Henry S. Thompson # Date 1694620123 -3600 # Node ID 9403c02d5034c707648fa569acf9b3fcd6a3fcfa # Parent fc9a045c872bcade73b0a7182a0bbe29854f9934 switch to gzip -7 to get comparable compressed cdx block size diff -r fc9a045c872b -r 9403c02d5034 bin/do_idx.sh --- a/bin/do_idx.sh Wed Sep 13 12:41:55 2023 +0100 +++ b/bin/do_idx.sh Wed Sep 13 16:48:43 2023 +0100 @@ -1,11 +1,13 @@ #!/bin/bash export res="$1" orig="$2" +# igzip was faster, but produced bigger files, so went to gzip -7, one +# compression level above the default (-6), which produces slightly _smaller_ blocks. merge_date.py <(LC_ALL=C sort -m -k1,2 -s $res/ks_[0-9]*.tsv) $orig $res/idx |\ parallel -j 10 'echo {#} {} >$res/merge_{#}.log echo $(date) {#} {} export res - split -l 3000 --filter="igzip -c | \ + split -l 3000 --filter="gzip -c -7 --keep | \ tee >(wc -c >> \ $res/merge_{#}.log)" \ {} > {}.gz && \