view bin/bigpdf.sh @ 62:346298ac3ab9

several efficiency (hoffentlich) tweaks
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 24 Apr 2020 15:20:33 +0100
parents ff4e85d8ec31
children e71aeb3355ff
line wrap: on
line source

#!/usr/bin/bash
# Fetch big pdfs per segment from bigpdf_?.txt
#  First line thereof gives CC identifier
# All relative paths below (bigpdf_<hn>.txt) are resolved against /dev/shm,
# so refuse to continue if we cannot get there.
cd /dev/shm || { printf 'cannot cd to /dev/shm\n' >&2; exit 1; }
# $1 selects which bigpdf_<hn>.txt list to process; without it the script
# would look for bigpdf_.txt and silently do nothing.
if [[ -z ${1-} ]]; then
  printf 'usage: %s <hn>\n' "${0##*/}" >&2
  exit 2
fi
hn=$1
# $(date) is deliberately left unquoted so the log line keeps its original
# single-space-separated form.
echo $(date) "$hn"
# tot: add up the first field of every line read from stdin and print the
# total as an unsigned integer followed by a newline ("0" for empty input).
tot() {
  awk '
    { acc += $1 }
    END { printf "%u\n", acc }
  '
}
# Main driver.
#   The first line of bigpdf_${hn}.txt is read into cc and used to build the
#   /beegfs/common_crawl/CC-MAIN-$cc paths; every following line is a segment
#   name.  For each segment:
#     1. build bu.txt (one X-HST-Target-URI value per line) from the *.hdr
#        members of the segment's extract_*.tar files, recording in nb.txt
#        how many header lines were selected from each tar file;
#     2. check that the total in nb.txt is twice the number of lines in bu.txt;
#     3. run nutch freegen and nutch fetch, then copy the working directory
#        contents back to the segment's /beegfs directory and remove the
#        local cdx, segments and warc entries.
#   The brace group and the while loop are pipeline stages, so they run in
#   subshells: "exit 1" below stops segment processing but still lets the
#   final status line run.
head -1 "bigpdf_${hn}.txt" |
  {
    read -r cc
    # An empty identifier would make every /beegfs path below wrong.
    if [[ -z $cc ]]; then
      printf 'no crawl identifier on first line of bigpdf_%s.txt\n' "$hn" >&2
      exit 1
    fi
    nutch=${HOME}/src/nutch-cc/runtime/local/bin/nutch
    tail -n +2 "bigpdf_${hn}.txt" |
      while read -r s || [[ -n $s ]]; do
        # A blank line would turn the per-segment paths into their parents.
        [[ -n $s ]] || continue
        # Command substitutions are left unquoted to keep the log format.
        echo $(date) $(hostname) starting $s
        work=/dev/shm/pcrawl/$s
        remote=/beegfs/common_crawl/CC-MAIN-$cc/$s
        mkdir -p "$work/segments"
        # Everything below (including cp -a * and rm -rf) is relative to
        # $work, so never continue from some other directory.
        cd "$work" || { printf 'cannot cd to %s\n' "$work" >&2; exit 1; }
        if [[ ! -f bu.txt ]]; then
          # Counts are appended once per tar file; start from an empty file so
          # leftovers from an earlier run cannot cause a false mismatch.
          : >nb.txt
          for f in "$remote"/extract_*.tar; do
            # The awk stage passes lines through unchanged and appends the
            # line count to nb.txt before it exits.  Unlike an asynchronous
            # >(wc -l >>nb.txt), the count is guaranteed to be on disk when
            # the pipeline returns.
            tar -xOf "$f" '*.hdr' |
              grep -F -aA1 --no-group-separator 'X-HST-Truncated: length' |
              LC_ALL=C awk -v counts=nb.txt '{ print } END { print NR >> counts }' |
              grep -F X-HST-Target-URI |
              cut -f 2 -d ' '
          done >bu.txt
        fi
        if [[ -f nb.txt ]]; then
          tcount=$(tot <nb.txt)
          ucount=$(wc -l <bu.txt)
          if ((tcount != 2 * ucount)); then
            printf "length mismatch: tcount: %d != ucount: %d\n" \
              "$tcount" "$ucount" 1>&2
            exit 1
          fi
        else
          # bu.txt was already present but has no counts file next to it:
          # nothing to compare against, so carry on as before.
          printf 'warning: no nb.txt in %s, skipping length check\n' "$work" >&2
        fi
        export NUTCH_HEAPSIZE=20000
        export NUTCH_LOG_DIR=$work
        "$nutch" freegen -Dmapreduce.job.reduces=1 bu.txt segments
        # Copy back and clean up only if the fetch itself succeeded.
        "$nutch" fetch \
          -Dmapreduce.job.reduces=34 \
          segments/* -threads 144 >/dev/null 2>&1 &&
          echo $(date) $(hostname) finished $s &&
          cp -a -- * "$remote" &&
          rm -rf cdx segments warc
      done
  }
# Capture the pipeline status first: $(date) and $(hostname) each overwrite
# $? before it would be expanded on the echo line.
rc=$?
echo $(date) $(hostname) $rc