Mercurial > hg > cc > cirrus_home
comparison bin/bigpdf.sh @ 72:f1bf3effa893
log truncations
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 28 Apr 2020 19:02:34 +0100 |
parents | e71aeb3355ff |
children |
comparison
equal
deleted
inserted
replaced
71:17eb428525cb | 72:f1bf3effa893 |
---|---|
32 $(tot < nb.txt) $(wc -l < bu.txt) 1>&2 | 32 $(tot < nb.txt) $(wc -l < bu.txt) 1>&2 |
33 exit 1 | 33 exit 1 |
34 fi | 34 fi |
35 export NUTCH_HEAPSIZE=20000 | 35 export NUTCH_HEAPSIZE=20000 |
36 export NUTCH_LOG_DIR=/dev/shm/pcrawl/$s | 36 export NUTCH_LOG_DIR=/dev/shm/pcrawl/$s |
37 ${HOME}/src/nutch-cc/runtime/local/bin/nutch freegen -Dmapreduce.job.maps=1 bu.txt segments | 37 if [ ! -f segments/2020*/crawl_generate/part-r-00000 ] |
38 then | |
39 ${HOME}/src/nutch-cc/runtime/local/bin/nutch freegen -Dmapreduce.job.maps=1 bu.txt segments | |
40 fi | |
38 ${HOME}/src/nutch-cc/runtime/local/bin/nutch fetch\ | 41 ${HOME}/src/nutch-cc/runtime/local/bin/nutch fetch\ |
39 -Dmapreduce.job.reduces=34 \ | 42 -Dmapreduce.job.reduces=34 \ |
40 segments/* -threads 144 >/dev/null 2>&1 && \ | 43 segments/* -threads 144 >/dev/null 2>&1 && \ |
41 echo $(date) $(hostname) finished $s && \ | 44 echo $(date) $(hostname) finished $s && \ |
42 cp -a * /beegfs/common_crawl/CC-MAIN-$cc/$s && \ | 45 cp -a * /beegfs/common_crawl/CC-MAIN-$cc/$s && \ |
43 rm -rf cdx segments warc | 46 { for f in warc/warc/*.gz |
47 do | |
48 unpigz -dp 1 -c $f | egrep -aiB15 '^WARC-Truncated: ' | |
49 done > truncated.txt | |
50 cp -a truncated.txt /beegfs/common_crawl/CC-MAIN-$cc/$s/warc/warc | |
51 rm -rf cdx segments warc | |
52 } | |
44 done | 53 done |
45 } | 54 } |
46 echo $(date) $(hostname) $? | 55 echo $(date) $(hostname) $? |