Mercurial > hg > cc > cirrus_home
comparison bin/bigpdf.sh @ 62:346298ac3ab9
several efficiency (hoffentlich) tweaks
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 24 Apr 2020 15:20:33 +0100 |
parents | ff4e85d8ec31 |
children | e71aeb3355ff |
comparison
equal
deleted
inserted
replaced
61:b24755311af8 | 62:346298ac3ab9 |
---|---|
1 #!/usr/bin/bash | 1 #!/usr/bin/bash |
2 # Fetch big pdfs per segment from bigpdf_?.txt | 2 # Fetch big pdfs per segment from bigpdf_?.txt |
3 # First line thereof gives CC identifier | 3 # First line thereof gives CC identifier |
4 cd /dev/shm | |
4 hn=$1 | 5 hn=$1 |
5 echo $(date) $hn | 6 echo $(date) $hn |
6 tot () | 7 tot () |
7 { | 8 { |
8 awk '{sum+=$1} END {printf "%u\n",sum}' | 9 awk '{sum+=$1} END {printf "%u\n",sum}' |
31 $(tot < nb.txt) $(wc -l < bu.txt) 1>&2 | 32 $(tot < nb.txt) $(wc -l < bu.txt) 1>&2 |
32 exit 1 | 33 exit 1 |
33 fi | 34 fi |
34 export NUTCH_HEAPSIZE=20000 | 35 export NUTCH_HEAPSIZE=20000 |
35 export NUTCH_LOG_DIR=/dev/shm/pcrawl/$s | 36 export NUTCH_LOG_DIR=/dev/shm/pcrawl/$s |
36 ${HOME}/src/nutch-cc/runtime/local/bin/nutch freegen bu.txt segments | 37 ${HOME}/src/nutch-cc/runtime/local/bin/nutch freegen -Dmapreduce.job.reduces=1 bu.txt segments |
37 ${HOME}/src/nutch-cc/runtime/local/bin/nutch fetch\ | 38 ${HOME}/src/nutch-cc/runtime/local/bin/nutch fetch\ |
38 -Dmapreduce.job.reduces=34 \ | 39 -Dmapreduce.job.reduces=34 \ |
39 segments/* -threads 72 >/dev/null 2>&1 && \ | 40 segments/* -threads 144 >/dev/null 2>&1 && \ |
40 echo $(date) $(hostname) finished $s && \ | 41 echo $(date) $(hostname) finished $s && \ |
41 cp -a * /beegfs/common_crawl/CC-MAIN-$cc/$s #&& \ | 42 cp -a * /beegfs/common_crawl/CC-MAIN-$cc/$s && \ |
42 #rm -rf cdx segments warc | 43 rm -rf cdx segments warc |
43 done | 44 done |
44 } | 45 } |
45 echo $(date) $(hostname) $? | 46 echo $(date) $(hostname) $? |