Mercurial > hg > cc > cirrus_home
changeset 62:346298ac3ab9
several efficiency (hoffentlich) tweaks
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 24 Apr 2020 15:20:33 +0100 |
parents | b24755311af8 |
children | d39fd9c7f1be |
files | bin/bigpdf.sh |
diffstat | 1 files changed, 5 insertions(+), 4 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/bigpdf.sh Thu Apr 23 17:26:55 2020 +0100
+++ b/bin/bigpdf.sh Fri Apr 24 15:20:33 2020 +0100
@@ -1,6 +1,7 @@
 #!/usr/bin/bash
 # Fetch big pdfs per segment from bigpdf_?.txt
 # First line thereof gives CC identifier
+cd /dev/shm
 hn=$1
 echo $(date) $hn
 tot ()
@@ -33,13 +34,13 @@
 fi
 export NUTCH_HEAPSIZE=20000
 export NUTCH_LOG_DIR=/dev/shm/pcrawl/$s
- ${HOME}/src/nutch-cc/runtime/local/bin/nutch freegen bu.txt segments
+ ${HOME}/src/nutch-cc/runtime/local/bin/nutch freegen -Dmapreduce.job.reduces=1 bu.txt segments
 ${HOME}/src/nutch-cc/runtime/local/bin/nutch fetch\
 -Dmapreduce.job.reduces=34 \
- segments/* -threads 72 >/dev/null 2>&1 && \
+ segments/* -threads 144 >/dev/null 2>&1 && \
 echo $(date) $(hostname) finished $s && \
- cp -a * /beegfs/common_crawl/CC-MAIN-$cc/$s #&& \
- #rm -rf cdx segments warc
+ cp -a * /beegfs/common_crawl/CC-MAIN-$cc/$s && \
+ rm -rf cdx segments warc
 done
 }
 echo $(date) $(hostname) $?