Mercurial > hg > cc > cirrus_home
changeset 60:ff4e85d8ec31
Switch for use on the login server: invoke by hand with 0 or 1 as the only command-line argument
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 23 Apr 2020 17:25:25 +0100 |
parents | 701fe81ada29 |
children | b24755311af8 |
files | bin/bigpdf.sh |
diffstat | 1 files changed, 26 insertions(+), 17 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/bin/bigpdf.sh Wed Apr 22 18:42:40 2020 +0100
+++ b/bin/bigpdf.sh Thu Apr 23 17:25:25 2020 +0100
@@ -1,36 +1,45 @@
 #!/usr/bin/bash
 # Fetch big pdfs per segment from bigpdf_?.txt
 # First line thereof gives CC identifier
-echo $(date) $(hostname)
-h=$(hostname)
-hn=${h##*n}
+hn=$1
+echo $(date) $hn
+tot ()
+{
+ awk '{sum+=$1} END {printf "%u\n",sum}'
+}
 head -1 bigpdf_${hn}.txt |\
- { read cc ; \
+ { read cc
 tail -n +2 bigpdf_${hn}.txt |\
 while read s
 do
 echo $(date) $(hostname) starting $s
 mkdir -p /dev/shm/pcrawl/$s/segments
 cd /dev/shm/pcrawl/$s
- for f in /beegfs/common_crawl/CC-MAIN-$cc/$s/extract_*.tar
- do
- tar -xOf $f '*.hdr' |\
- fgrep -aA1 --no-group-separator 'X-HST-Truncated: length' |\
- tee >(wc -l >>nb.txt) | fgrep X-HST-Target-URI | cut -f 2 -d ' '
- done > bu.txt
- if [ "$(tot < $b/nb.txt)" -ne\
- $((2 * $(wc -l < $b/bu.txt))) ]
+ if [ ! -f bu.txt ]
+ then
+ for f in /beegfs/common_crawl/CC-MAIN-$cc/$s/extract_*.tar
+ do
+ tar -xOf $f '*.hdr' |\
+ fgrep -aA1 --no-group-separator 'X-HST-Truncated: length' |\
+ tee >(wc -l >>nb.txt) | fgrep X-HST-Target-URI | cut -f 2 -d ' '
+ done > bu.txt
+ fi
+ if [ "$(tot < nb.txt)" -ne\
+ $((2 * $(wc -l < bu.txt))) ]
 then
 printf "length mismatch: tcount: %d != ucount: %d\n" \
- $(tot < $b/nb.txt) $(wc -l < $b/bu.txt) 1>&2
+ $(tot < nb.txt) $(wc -l < bu.txt) 1>&2
 exit 1
 fi
 export NUTCH_HEAPSIZE=20000
+ export NUTCH_LOG_DIR=/dev/shm/pcrawl/$s
 ${HOME}/src/nutch-cc/runtime/local/bin/nutch freegen bu.txt segments
 ${HOME}/src/nutch-cc/runtime/local/bin/nutch fetch\
- -Dmapreduce.job.reduces=34 -Dhadoop.log.dir=/dev/shm/pcrawl/$s\
- segments/* -threads 36 >log 2>&1
- echo $(date) $(hostname) finished $s
- cp -a * /beegfs/common_crawl/CC-MAIN-$cc/$s
+ -Dmapreduce.job.reduces=34 \
+ segments/* -threads 72 >/dev/null 2>&1 && \
+ echo $(date) $(hostname) finished $s && \
+ cp -a * /beegfs/common_crawl/CC-MAIN-$cc/$s #&& \
+ #rm -rf cdx segments warc
 done
+ }
 echo $(date) $(hostname) $?
```