annotate bin/do_idx.sh @ 247:7737da0ccb8c

try adding lm to existing index from ks_0-9
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 02 Jan 2025 14:52:14 +0000
parents 0326805aa6df
children 3ba401110c22
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
#!/bin/bash
# Usage: do_idx.sh [-d...] [-m previously-merged-log] result-dir cdx-source-dir
#
# Runs merge_date.py over the merge-sorted result-dir/ks_[0-9]*.tsv files,
# passing it cdx-source-dir and result-dir/idx, with its stderr captured in
# result-dir/merge.log.  Every line merge_date.py writes to stdout is handed
# to GNU parallel as a file name: that file is re-compressed in 3000-line
# blocks into <name>.gz (one gzip member per block), the compressed size of
# each block is appended to result-dir/merge_<job#>.log, and the uncompressed
# file is removed once compression has succeeded.
#
# Options:
#   -d   may be repeated; each occurrence is forwarded to merge_date.py
#   -m   log of a previous merge, forwarded to merge_date.py as "-m <log>"
#
# Exit status: 2 on a usage error, 1 if result-dir is not a directory,
# otherwise the status of the merge | parallel pipeline (with pipefail).

# Without pipefail a failing merge_date.py would be masked by parallel's status.
set -o pipefail

usage() {
  printf 'Usage: %s [-d...] [-m previously-merged-log] result-dir cdx-source-dir\n' \
    "${0##*/}" >&2
  exit 2
}

# Arrays, not strings: keeps every flag/argument a separate word without
# relying on unquoted expansion, and ignores any $debug in the environment.
debug=()
merged=()
while getopts ':dm:' opt; do
  case "$opt" in
    d) debug+=(-d) ;;
    m) merged=(-m "$OPTARG") ;;
    *) usage ;;  # unknown option, or -m without its argument
  esac
done
shift $((OPTIND - 1))
(( $# >= 2 )) || usage

# res is exported because the parallel jobs, and the shell that split
# spawns for --filter, both expand $res themselves.
export res="$1"
orig="$2"

if [[ ! -d "$res" ]]; then
  printf '%s: result-dir "%s" is not a directory\n' "${0##*/}" "$res" >&2
  exit 1
fi

# igzip was faster, but produced bigger files, so went to gzip one step
# smaller than default (-6), which produces slightly _smaller_ blocks.
#
# The parallel command below is single-quoted on purpose: {} and {#} are
# substituted by parallel, and $res is expanded later, by the job shell
# (first redirect) and by the --filter shell (the escaped \"\$res\").
~/lib/python/cc/lmh/merge_date.py "${debug[@]}" "${merged[@]}" \
  <(LC_ALL=C sort -m -k1,3 -s "$res"/ks_[0-9]*.tsv) \
  "$orig" "$res/idx" 2>"$res/merge.log" |
  parallel -j 10 'echo {#} {} >"$res"/merge_{#}.log
    echo $(date) {#} {}
    export res
    split -l 3000 --filter="gzip -c -7 --keep | \
      tee >(wc -c >> \"\$res\"/merge_{#}.log)" \
      {} > {}.gz && \
      rm {}'