comparison bin/bigpdf.sh @ 62:346298ac3ab9

several efficiency (hoffentlich) tweaks
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 24 Apr 2020 15:20:33 +0100
parents ff4e85d8ec31
children e71aeb3355ff
comparison
equal deleted inserted replaced
61:b24755311af8 62:346298ac3ab9
1 #!/usr/bin/bash 1 #!/usr/bin/bash
2 # Fetch big pdfs per segment from bigpdf_?.txt 2 # Fetch big pdfs per segment from bigpdf_?.txt
3 # First line thereof gives CC identifier 3 # First line thereof gives CC identifier
4 cd /dev/shm
4 hn=$1 5 hn=$1
5 echo $(date) $hn 6 echo $(date) $hn
6 tot () 7 tot ()
7 { 8 {
8 awk '{sum+=$1} END {printf "%u\n",sum}' 9 awk '{sum+=$1} END {printf "%u\n",sum}'
31 $(tot < nb.txt) $(wc -l < bu.txt) 1>&2 32 $(tot < nb.txt) $(wc -l < bu.txt) 1>&2
32 exit 1 33 exit 1
33 fi 34 fi
34 export NUTCH_HEAPSIZE=20000 35 export NUTCH_HEAPSIZE=20000
35 export NUTCH_LOG_DIR=/dev/shm/pcrawl/$s 36 export NUTCH_LOG_DIR=/dev/shm/pcrawl/$s
36 ${HOME}/src/nutch-cc/runtime/local/bin/nutch freegen bu.txt segments 37 ${HOME}/src/nutch-cc/runtime/local/bin/nutch freegen -Dmapreduce.job.reduces=1 bu.txt segments
37 ${HOME}/src/nutch-cc/runtime/local/bin/nutch fetch\ 38 ${HOME}/src/nutch-cc/runtime/local/bin/nutch fetch\
38 -Dmapreduce.job.reduces=34 \ 39 -Dmapreduce.job.reduces=34 \
39 segments/* -threads 72 >/dev/null 2>&1 && \ 40 segments/* -threads 144 >/dev/null 2>&1 && \
40 echo $(date) $(hostname) finished $s && \ 41 echo $(date) $(hostname) finished $s && \
41 cp -a * /beegfs/common_crawl/CC-MAIN-$cc/$s #&& \ 42 cp -a * /beegfs/common_crawl/CC-MAIN-$cc/$s && \
42 #rm -rf cdx segments warc 43 rm -rf cdx segments warc
43 done 44 done
44 } 45 }
45 echo $(date) $(hostname) $? 46 echo $(date) $(hostname) $?