Mercurial > hg > cc > cirrus_home
annotate bin/doC2T.sh @ 163:ef961d91eea5
previous approach to lang/field extraction
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 18 Jul 2022 18:16:27 +0100 |
parents | |
children |
rev | line source |
---|---|
163
ef961d91eea5
previous approach to lang/field extraction
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
#!/usr/bin/bash
# Convert one gzipped cdx-NNNNN.gz file into a TSV table via cdx2tsv.py.
#
# Usage: doC2T.sh C I
#   C  directory name under data/; input is read from data/C/cdx/warc/
#   I  number of the input file; it is zero-padded to five digits to form
#      the file name cdx-NNNNN.gz
#
# The input is decompressed with unpigz and piped into cdx2tsv.py (looked up
# on PATH).  cdx2tsv.py's stdout is written to /dev/shm/tbl_I.raw.tsv and its
# stderr to /dev/shm/tbl_I.errs; both files are then moved into ./results.
#
# Exit status:
#   2   missing/empty argument, or I is not a number
#   >0  the unpigz | cdx2tsv.py pipeline or the final mv failed
#   0   otherwise

if [[ -z "${1-}" || -z "${2-}" ]]; then
  printf 'Usage: %s C I\n' "${0##*/}" >&2
  exit 2
fi

# Make a failure of unpigz (not just of cdx2tsv.py) visible in $?.
set -o pipefail

c=$1
i=$2

# %.0f (rather than %d) parses a value such as 008 as decimal instead of
# rejecting it as invalid octal.  printf still prints cdx-00000.gz when I is
# not numeric, so its status must be checked to avoid reading the wrong file.
f=$(printf 'cdx-%05.0f.gz' "$i") || exit 2

raw_tsv=/dev/shm/tbl_${i}.raw.tsv
err_log=/dev/shm/tbl_${i}.errs

status=0

# unpigz: -d decompress, -p 1 single thread, -c write to stdout.
# The arguments to cdx2tsv.py are passed through unchanged; their meaning is
# defined by that tool.  Only cdx2tsv.py's stderr is captured in the .errs file.
unpigz -dp 1 -c "data/$c/cdx/warc/$f" \
  | cdx2tsv.py \
      '(filename,f.split("/",maxsplit=5)[4][0])' \
      '(url,f.split(":",maxsplit=1)[0])' \
      mime mime-detected languages \
      > "$raw_tsv" 2> "$err_log" \
  || status=$?

# Disabled debugging aid (presumably: tee the decompressed data to a file and
# compare line counts against the table):
#| tee /dev/shm/data_${i}
#wc -l /dev/shm/data_${i} /dev/shm/tbl_${i}.raw.tsv

# Move the outputs even when the pipeline failed, so the .errs file ends up
# next to the (possibly partial) table in results.
mv "$err_log" "$raw_tsv" results || status=$?

#rm /dev/shm/data_${i}

exit "$status"