comparison bin/bigpdf.sh @ 72:f1bf3effa893

log truncations
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 28 Apr 2020 19:02:34 +0100
parents e71aeb3355ff
children
comparison
equal deleted inserted replaced
71:17eb428525cb 72:f1bf3effa893
32 $(tot < nb.txt) $(wc -l < bu.txt) 1>&2 32 $(tot < nb.txt) $(wc -l < bu.txt) 1>&2
33 exit 1 33 exit 1
34 fi 34 fi
35 export NUTCH_HEAPSIZE=20000 35 export NUTCH_HEAPSIZE=20000
36 export NUTCH_LOG_DIR=/dev/shm/pcrawl/$s 36 export NUTCH_LOG_DIR=/dev/shm/pcrawl/$s
37 ${HOME}/src/nutch-cc/runtime/local/bin/nutch freegen -Dmapreduce.job.maps=1 bu.txt segments 37 if [ ! -f segments/2020*/crawl_generate/part-r-00000 ]
38 then
39 ${HOME}/src/nutch-cc/runtime/local/bin/nutch freegen -Dmapreduce.job.maps=1 bu.txt segments
40 fi
38 ${HOME}/src/nutch-cc/runtime/local/bin/nutch fetch\ 41 ${HOME}/src/nutch-cc/runtime/local/bin/nutch fetch\
39 -Dmapreduce.job.reduces=34 \ 42 -Dmapreduce.job.reduces=34 \
40 segments/* -threads 144 >/dev/null 2>&1 && \ 43 segments/* -threads 144 >/dev/null 2>&1 && \
41 echo $(date) $(hostname) finished $s && \ 44 echo $(date) $(hostname) finished $s && \
42 cp -a * /beegfs/common_crawl/CC-MAIN-$cc/$s && \ 45 cp -a * /beegfs/common_crawl/CC-MAIN-$cc/$s && \
43 rm -rf cdx segments warc 46 { for f in warc/warc/*.gz
47 do
48 unpigz -dp 1 -c $f | egrep -aiB15 '^WARC-Truncated: '
49 done > truncated.txt
50 cp -a truncated.txt /beegfs/common_crawl/CC-MAIN-$cc/$s/warc/warc
51 rm -rf cdx segments warc
52 }
44 done 53 done
45 } 54 }
46 echo $(date) $(hostname) $? 55 echo $(date) $(hostname) $?