view bin/doC2T.sh @ 165:e7fcae59c735

symlink to dir doesn't work
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 18 Jul 2022 18:40:12 +0100
parents ef961d91eea5
children
line wrap: on
line source

#!/usr/bin/bash
# Convert one compressed CDX index file into a TSV table via cdx2tsv.py.
#
# Usage: doC2T.sh C I
#   C  directory name under data/ that holds cdx/warc/cdx-NNNNN.gz
#   I  number of the index file; formatted as five zero-padded digits
#
# All paths are relative to the current directory.  Output is first written
# under /dev/shm and then moved into the results directory:
#   results/tbl_I.raw.tsv  standard output of cdx2tsv.py
#   results/tbl_I.errs     standard error of cdx2tsv.py
#
# Exit status: 2 on bad usage; otherwise non-zero if the pipeline or the
# final mv failed, 0 on success.

# Let the pipeline status reflect a failing unpigz, not only the last stage.
set -o pipefail

if [[ -z ${1-} || -z ${2-} ]]; then
  printf 'Usage: %s C I\n' "${0##*/}" >&2
  exit 2
fi
c=$1
i=$2

# %.0f accepts zero-padded input such as 008, which %d would reject as an
# invalid octal number (presumably why a float format is used here).
if ! f=$(printf 'cdx-%05.0f.gz' "$i"); then
  printf '%s: invalid index number: %s\n' "${0##*/}" "$i" >&2
  exit 2
fi

raw=/dev/shm/tbl_${i}.raw.tsv
errs=/dev/shm/tbl_${i}.errs

# Decompress to stdout with a single pigz thread and feed cdx2tsv.py.  The
# quoted arguments are passed verbatim to cdx2tsv.py; their meaning is
# defined by that tool.  Only cdx2tsv.py's stderr is captured in $errs.
status=0
unpigz -dp 1 -c "data/$c/cdx/warc/$f" \
  | cdx2tsv.py '(filename,f.split("/",maxsplit=5)[4][0])' '(url,f.split(":",maxsplit=1)[0])' mime mime-detected languages \
  > "$raw" 2> "$errs" || status=$?

# Disabled debugging aid: keep a copy of the decompressed input and compare
# line counts of input and output.
#| tee /dev/shm/data_${i}
#wc -l /dev/shm/data_${i} /dev/shm/tbl_${i}.raw.tsv

# Move the outputs even after a failure so the .errs file stays available
# for inspection in results.
mv "$errs" "$raw" results || status=$?
#rm /dev/shm/data_${i}

exit "$status"