# HG changeset patch # User Henry S. Thompson <ht@inf.ed.ac.uk> # Date 1587659125 -3600 # Node ID ff4e85d8ec3183259c7b945502010ff112d0dc25 # Parent 701fe81ada29b05ceed624b22ee4604f6ab46542 switch for use on login server, invoke by hand with 0/1 as only cmd line arg diff -r 701fe81ada29 -r ff4e85d8ec31 bin/bigpdf.sh --- a/bin/bigpdf.sh Wed Apr 22 18:42:40 2020 +0100 +++ b/bin/bigpdf.sh Thu Apr 23 17:25:25 2020 +0100 @@ -1,36 +1,45 @@ #!/usr/bin/bash # Fetch big pdfs per segment from bigpdf_?.txt # First line thereof gives CC identifier -echo $(date) $(hostname) -h=$(hostname) -hn=${h##*n} +hn=$1 +echo $(date) $hn +tot () +{ + awk '{sum+=$1} END {printf "%u\n",sum}' +} head -1 bigpdf_${hn}.txt |\ - { read cc ; \ + { read cc tail -n +2 bigpdf_${hn}.txt |\ while read s do echo $(date) $(hostname) starting $s mkdir -p /dev/shm/pcrawl/$s/segments cd /dev/shm/pcrawl/$s - for f in /beegfs/common_crawl/CC-MAIN-$cc/$s/extract_*.tar - do - tar -xOf $f '*.hdr' |\ - fgrep -aA1 --no-group-separator 'X-HST-Truncated: length' |\ - tee >(wc -l >>nb.txt) | fgrep X-HST-Target-URI | cut -f 2 -d ' ' - done > bu.txt - if [ "$(tot < $b/nb.txt)" -ne\ - $((2 * $(wc -l < $b/bu.txt))) ] + if [ ! -f bu.txt ] + then + for f in /beegfs/common_crawl/CC-MAIN-$cc/$s/extract_*.tar + do + tar -xOf $f '*.hdr' |\ + fgrep -aA1 --no-group-separator 'X-HST-Truncated: length' |\ + tee >(wc -l >>nb.txt) | fgrep X-HST-Target-URI | cut -f 2 -d ' ' + done > bu.txt + fi + if [ "$(tot < nb.txt)" -ne\ + $((2 * $(wc -l < bu.txt))) ] then printf "length mismatch: tcount: %d != ucount: %d\n" \ - $(tot < $b/nb.txt) $(wc -l < $b/bu.txt) 1>&2 + $(tot < nb.txt) $(wc -l < bu.txt) 1>&2 exit 1 fi export NUTCH_HEAPSIZE=20000 + export NUTCH_LOG_DIR=/dev/shm/pcrawl/$s ${HOME}/src/nutch-cc/runtime/local/bin/nutch freegen bu.txt segments ${HOME}/src/nutch-cc/runtime/local/bin/nutch fetch\ - -Dmapreduce.job.reduces=34 -Dhadoop.log.dir=/dev/shm/pcrawl/$s\ - segments/* -threads 36 >log 2>&1 - echo $(date) $(hostname) finished $s - cp -a * /beegfs/common_crawl/CC-MAIN-$cc/$s + -Dmapreduce.job.reduces=34 \ + segments/* -threads 72 >/dev/null 2>&1 && \ + echo $(date) $(hostname) finished $s && \ + cp -a * /beegfs/common_crawl/CC-MAIN-$cc/$s #&& \ + #rm -rf cdx segments warc done + } echo $(date) $(hostname) $?