annotate bin/doC2T.sh @ 195:5f3c36e4fd6d default tip

add target test-core which (dangerously) avoids (we hope pointless) recompilation of all the plugins
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 26 Sep 2024 17:55:56 +0100
parents ef961d91eea5
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
163
ef961d91eea5 previous approach to lang/field extraction
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/usr/bin/bash
ef961d91eea5 previous approach to lang/field extraction
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# First positional argument: name of the directory under data/ to read from.
c="${1}"
ef961d91eea5 previous approach to lang/field extraction
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Second positional argument: numeric index of the cdx-NNNNN.gz file to process.
i="${2}"
ef961d91eea5 previous approach to lang/field extraction
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Build the input file name: cdx-<i>.gz, with the index zero-padded to five
# digits. "$i" is quoted so it always reaches printf as exactly one argument
# and is never word-split or glob-expanded.
# NOTE(review): %.0f rather than %d is presumably deliberate, so that an
# already zero-padded index such as 00008 is not parsed as octal -- confirm.
f=$(printf 'cdx-%05.0f.gz' "$i")
ef961d91eea5 previous approach to lang/field extraction
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Decompress data/$c/cdx/warc/$f to stdout with a single unpigz thread (-p 1)
# and feed it to cdx2tsv.py. The two quoted expressions and the bare field
# names are passed through verbatim; their meaning is defined by cdx2tsv.py.
# stdout of cdx2tsv.py is written to /dev/shm/tbl_<i>.raw.tsv and its stderr
# to /dev/shm/tbl_<i>.errs. Every expansion is quoted so that a value
# containing whitespace or glob characters cannot be split or expanded.
unpigz -dp 1 -c "data/$c/cdx/warc/$f" \
  | cdx2tsv.py '(filename,f.split("/",maxsplit=5)[4][0])' \
      '(url,f.split(":",maxsplit=1)[0])' \
      mime mime-detected languages \
      > "/dev/shm/tbl_${i}.raw.tsv" 2> "/dev/shm/tbl_${i}.errs"
ef961d91eea5 previous approach to lang/field extraction
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6 #| tee /dev/shm/data_${i}
ef961d91eea5 previous approach to lang/field extraction
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
7 #wc -l /dev/shm/data_${i} /dev/shm/tbl_${i}.raw.tsv
ef961d91eea5 previous approach to lang/field extraction
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Move both staged outputs (.errs and .raw.tsv) from /dev/shm into the
# results directory (relative to the current working directory).
# The variable part is quoted; the brace list is left outside the quotes so
# that it still expands to the two file names.
mv "/dev/shm/tbl_${i}".{errs,raw.tsv} results
ef961d91eea5 previous approach to lang/field extraction
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
9 #rm /dev/shm/data_${i}
ef961d91eea5 previous approach to lang/field extraction
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
10