Mercurial > hg > cc > cirrus_home
diff bin/doCLM.sh @ 163:ef961d91eea5
previous approach to lang/field extraction
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 18 Jul 2022 18:16:27 +0100 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/bash
# bin/doCLM.sh -- run clm.sh (via ix.py) over one CDX index file and bundle
# the per-chunk results into a single tarball.
#
# Usage: doCLM.sh CRAWL INDEX
#   CRAWL  directory name under data/ (relative to the current working
#          directory); the input is read from data/CRAWL/cdx/warc/.
#   INDEX  number of the index file; it is zero-padded to five digits to
#          form the input name cdx-NNNNN.gz, and used as given in the
#          names of the output files.
#
# The input is decompressed with unpigz and split by GNU parallel (--pipe)
# into chunks of 50000 records, with at most 10 jobs running at once.
# Each chunk is piped to
#   ix.py -x -h -c '/lustre/home/dc007/hst/bin/clm.sh <txt-file>'
# where <txt-file> is /dev/shm/hst/INDEX.lmh_<job#>.txt, and the job's
# stderr goes to /dev/shm/hst/INDEX.lmh_<job#>.errs.
# NOTE(review): what ix.py and clm.sh do with these arguments is not
# visible from this script -- confirm against their sources.
#
# Finally all INDEX.lmh_*.txt and INDEX.lmh_*.errs files are archived into
# $HOME/results/INDEX.lmh.tar.gz.
#
# Exit status: 0 on success; 2 on bad usage; 1 if any step failed.  When
# only the unpigz/parallel pipeline failed, the archive is still written
# (so the .errs files are preserved) before exiting with status 1.

readonly scratch=/dev/shm/hst

# Print a message to stderr, prefixed with the script name, and exit 1.
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

if (( $# < 2 )); then
  printf 'Usage: %s CRAWL INDEX\n' "${0##*/}" >&2
  exit 2
fi

c=$1
i=$2

[[ -n "$c" ]] || die "CRAWL must not be empty"
# INDEX is interpolated into a command line that parallel passes to a
# shell, so restrict it to digits.
[[ "$i" =~ ^[0-9]+$ ]] || die "INDEX must be a non-negative integer, got '$i'"

mkdir -p "$scratch" || die "cannot create $scratch"

# %05.0f (not %05d) so that an already zero-padded value such as 08 is
# read as decimal rather than rejected as an invalid octal number.
f=$(printf 'cdx-%05.0f.gz' "$i") || die "cannot format index '$i'"
src="data/$c/cdx/warc/$f"
[[ -r "$src" ]] || die "cannot read $src"

# {#} is parallel's job sequence number: one .txt/.errs pair per chunk.
unpigz -dp 1 -c "$src" \
  | parallel --willcite --pipe -N 50000 -j10 \
      "ix.py -x -h -c '/lustre/home/dc007/hst/bin/clm.sh $scratch/$i.lmh_{#}.txt' 2>$scratch/$i.lmh_{#}.errs"
pipe_status=("${PIPESTATUS[@]}")   # must be captured immediately

status=0
if (( pipe_status[0] != 0 || pipe_status[1] != 0 )); then
  printf '%s: warning: pipeline failed (unpigz=%d, parallel=%d); archiving what was produced\n' \
    "${0##*/}" "${pipe_status[0]}" "${pipe_status[1]}" >&2
  status=1
fi

cd "$scratch" || die "cannot cd to $scratch"

# With nullglob an unmatched pattern vanishes instead of being handed to
# tar as a literal file name.
shopt -s nullglob
outputs=("$i".lmh_*.txt "$i".lmh_*.errs)
(( ${#outputs[@]} > 0 )) || die "no $i.lmh_* output files found in $scratch"

tar -czf "$HOME/results/$i.lmh.tar.gz" -- "${outputs[@]}" \
  || die "cannot write $HOME/results/$i.lmh.tar.gz"

exit "$status"