changeset 62:346298ac3ab9

several efficiency (hoffentlich) tweaks
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 24 Apr 2020 15:20:33 +0100
parents b24755311af8
children d39fd9c7f1be
files bin/bigpdf.sh
diffstat 1 files changed, 5 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/bin/bigpdf.sh	Thu Apr 23 17:26:55 2020 +0100
+++ b/bin/bigpdf.sh	Fri Apr 24 15:20:33 2020 +0100
@@ -1,6 +1,7 @@
 #!/usr/bin/bash
 # Fetch big pdfs per segment from bigpdf_?.txt
 #  First line thereof gives CC identifier
+cd /dev/shm
 hn=$1
 echo $(date) $hn
 tot () 
@@ -33,13 +34,13 @@
         fi
 	export NUTCH_HEAPSIZE=20000
 	export NUTCH_LOG_DIR=/dev/shm/pcrawl/$s
-	${HOME}/src/nutch-cc/runtime/local/bin/nutch freegen bu.txt segments
+	${HOME}/src/nutch-cc/runtime/local/bin/nutch freegen -Dmapreduce.job.reduces=1 bu.txt segments
 	${HOME}/src/nutch-cc/runtime/local/bin/nutch fetch\
           -Dmapreduce.job.reduces=34 \
-          segments/* -threads 72 >/dev/null 2>&1 && \
+          segments/* -threads 144 >/dev/null 2>&1 && \
 	  echo $(date) $(hostname) finished $s && \
-	  cp -a * /beegfs/common_crawl/CC-MAIN-$cc/$s #&& \
-	  #rm -rf cdx segments warc
+	  cp -a * /beegfs/common_crawl/CC-MAIN-$cc/$s && \
+	  rm -rf cdx segments warc
     done
  }
 echo $(date) $(hostname) $?