Mercurial > hg > cc > cirrus_work
annotate bin/do_idx.sh @ 247:7737da0ccb8c
try adding lm to existing index from ks_0-9
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 02 Jan 2025 14:52:14 +0000 |
parents | 0326805aa6df |
children | 3ba401110c22 |
rev | line source |
---|---|
101
e2e64c3d763e
bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/bin/bash |
142
0326805aa6df
change path to merge_date.py
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
116
diff
changeset
|
2 # Usage: do_idx.sh [-d...] [-m previously-merged-log] result-dir cdx-source-dir |
110
a0ea1e4a714d
pass in debug flag(s) to merge_date.py
Henry Thompson <ht@markup.co.uk>
parents:
105
diff
changeset
|
3 while [ "$1" = "-d" ] |
a0ea1e4a714d
pass in debug flag(s) to merge_date.py
Henry Thompson <ht@markup.co.uk>
parents:
105
diff
changeset
|
4 do |
a0ea1e4a714d
pass in debug flag(s) to merge_date.py
Henry Thompson <ht@markup.co.uk>
parents:
105
diff
changeset
|
5 shift |
a0ea1e4a714d
pass in debug flag(s) to merge_date.py
Henry Thompson <ht@markup.co.uk>
parents:
105
diff
changeset
|
6 debug="$debug -d" |
a0ea1e4a714d
pass in debug flag(s) to merge_date.py
Henry Thompson <ht@markup.co.uk>
parents:
105
diff
changeset
|
7 done |
142
0326805aa6df
change path to merge_date.py
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
116
diff
changeset
|
8 if [ "$1" = "-m" ] |
0326805aa6df
change path to merge_date.py
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
116
diff
changeset
|
9 then |
0326805aa6df
change path to merge_date.py
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
116
diff
changeset
|
10 shift |
0326805aa6df
change path to merge_date.py
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
116
diff
changeset
|
11 merged="-m $1" |
0326805aa6df
change path to merge_date.py
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
116
diff
changeset
|
12 shift |
0326805aa6df
change path to merge_date.py
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
116
diff
changeset
|
13 fi |
101
e2e64c3d763e
bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
14 export res="$1" |
e2e64c3d763e
bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
15 orig="$2" |
105
9403c02d5034
switch to gzip -7 to get comparable compressed cdx block size
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
103
diff
changeset
|
16 # igzip was faster, but produced bigger files, so went to gzip -7, one step |
9403c02d5034
switch to gzip -7 to get comparable compressed cdx block size
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
103
diff
changeset
|
17 # above the default (-6), which produces slightly _smaller_ blocks. |
142
0326805aa6df
change path to merge_date.py
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
116
diff
changeset
|
18 ~/lib/python/cc/lmh/merge_date.py $debug $merged <(LC_ALL=C sort -m -k1,3 -s $res/ks_[0-9]*.tsv) \ |
116 | 19 $orig $res/idx 2>$res/merge.log | \ |
101
e2e64c3d763e
bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
20 parallel -j 10 'echo {#} {} >$res/merge_{#}.log |
e2e64c3d763e
bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
21 echo $(date) {#} {} |
e2e64c3d763e
bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
22 export res |
105
9403c02d5034
switch to gzip -7 to get comparable compressed cdx block size
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
103
diff
changeset
|
23 split -l 3000 --filter="gzip -c -7 --keep | \ |
101
e2e64c3d763e
bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
24 tee >(wc -c >> \ |
e2e64c3d763e
bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
25 $res/merge_{#}.log)" \ |
e2e64c3d763e
bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
26 {} > {}.gz && \ |
e2e64c3d763e
bug4 fixed, but that created a new, earlier bug
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
27 rm {}' |