Mercurial > hg > cc > cirrus_home
changeset 72:f1bf3effa893
log truncations
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 28 Apr 2020 19:02:34 +0100 |
parents | 17eb428525cb |
children | 0780445a0840 |
files | bin/bigpdf.sh |
diffstat | 1 files changed, 11 insertions(+), 2 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/bigpdf.sh Tue Apr 28 19:02:14 2020 +0100
+++ b/bin/bigpdf.sh Tue Apr 28 19:02:34 2020 +0100
@@ -34,13 +34,22 @@
  fi
  export NUTCH_HEAPSIZE=20000
  export NUTCH_LOG_DIR=/dev/shm/pcrawl/$s
- ${HOME}/src/nutch-cc/runtime/local/bin/nutch freegen -Dmapreduce.job.maps=1 bu.txt segments
+ if [ ! -f segments/2020*/crawl_generate/part-r-00000 ]
+ then
+ ${HOME}/src/nutch-cc/runtime/local/bin/nutch freegen -Dmapreduce.job.maps=1 bu.txt segments
+ fi
  ${HOME}/src/nutch-cc/runtime/local/bin/nutch fetch\
  -Dmapreduce.job.reduces=34 \
  segments/* -threads 144 >/dev/null 2>&1 && \
  echo $(date) $(hostname) finished $s && \
  cp -a * /beegfs/common_crawl/CC-MAIN-$cc/$s && \
- rm -rf cdx segments warc
+ { for f in warc/warc/*.gz
+ do
+ unpigz -dp 1 -c $f | egrep -aiB15 '^WARC-Truncated: '
+ done > truncated.txt
+ cp -a truncated.txt /beegfs/common_crawl/CC-MAIN-$cc/$s/warc/warc
+ rm -rf cdx segments warc
+ }
  done
  }
  echo $(date) $(hostname) $?