view bin/bigpdf.sh @ 143:ddff993994be

too clever by half, keys won't work in parallel for e.g. media types
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 20 Oct 2021 15:47:55 +0000
parents f1bf3effa893
children
line wrap: on
line source

#!/usr/bin/bash
# Fetch big pdfs per segment, as listed in /dev/shm/bigpdf_<hn>.txt
#   Usage: bigpdf.sh <hn>
#   The first line of the list file gives the CC identifier (the part that
#   follows "CC-MAIN-" in the crawl directory name); every later line names
#   one segment to process.

# The list file is opened relative to /dev/shm, so stop here if we cannot
# get there rather than carry on from an arbitrary directory.
cd /dev/shm || exit 1

# <hn> selects which list file to read; without it there is nothing to do.
hn=${1:?usage: $0 HN  (selects /dev/shm/bigpdf_HN.txt)}

# $(date) is deliberately left unquoted: word splitting collapses date's
# padding so the log line keeps its single-space-separated format.
# shellcheck disable=SC2046
echo $(date) "$hn"
# tot: read lines on stdin, add up the first whitespace-separated field of
# each one, and print the total as an unsigned integer followed by a newline.
# Empty input yields 0.
tot() {
  awk '
    { total += $1 }
    END { printf "%u\n", total }
  '
}
# Emit one log line: timestamp, host name, then the given words.
# $(date) is deliberately left unquoted: word splitting collapses date's
# padding so log lines keep their single-space-separated format.
# shellcheck disable=SC2046
log () { echo $(date) "$(hostname)" "$@"; }

nutch=${HOME}/src/nutch-cc/runtime/local/bin/nutch

# Main driver.  bigpdf_${hn}.txt holds the CC identifier on its first line
# and one segment name on each following line.  For every segment:
#   1. collect the target URIs of truncated responses from the extract_*.tar
#      header files into bu.txt (line counts per tar go to nb.txt),
#   2. sanity-check that every truncation line came with exactly one URI line,
#   3. generate a Nutch segment from bu.txt (unless one already exists),
#   4. fetch, copy the results to /beegfs, record still-truncated WARC
#      records in truncated.txt, and clean up.
#
# The work runs in a subshell so that 'exit' and 'cd' inside it do not
# prevent the final status line from being written.  The list is read on
# fd 3 so that nothing run inside the loop can swallow the remaining
# segment names from stdin.
(
  status=0

  read -r cc <&3
  if [ -z "$cc" ]; then
    echo "bigpdf_${hn}.txt: no CC identifier on first line" >&2
    exit 1
  fi

  # '|| [ -n "$s" ]' keeps a final line that lacks a trailing newline.
  while read -r s <&3 || [ -n "$s" ]; do
    [ -n "$s" ] || continue          # ignore blank lines

    log starting "$s"
    work=/dev/shm/pcrawl/$s
    dest=/beegfs/common_crawl/CC-MAIN-$cc/$s

    # Everything below (including 'cp -a *' and 'rm -rf') is relative to
    # $work, so do not continue if we cannot create or enter it.
    mkdir -p "$work/segments" || exit 1
    cd "$work" || exit 1

    if [ ! -f bu.txt ]; then
      # Start the per-tar counts afresh and build the URI list under a
      # temporary name: an interrupted run must not leave behind a partial
      # bu.txt (or stale nb.txt entries) that a later run would trust.
      : > nb.txt
      for f in "$dest"/extract_*.tar; do
        # Each truncation header line plus the line that follows it.
        # Written to a file rather than through 'tee >(wc ...)': a process
        # substitution runs asynchronously, so nb.txt could otherwise be
        # read before the last count had been appended.
        tar -xOf "$f" '*.hdr' |
          grep -F -aA1 --no-group-separator 'X-HST-Truncated: length' \
            > pairs.tmp
        wc -l < pairs.tmp >> nb.txt
        grep -F X-HST-Target-URI pairs.tmp | cut -f 2 -d ' '
      done > bu.txt.tmp
      rm -f pairs.tmp
      mv bu.txt.tmp bu.txt || exit 1
    fi

    # Every matched truncation line should have contributed exactly one
    # URI line, i.e. total lines seen == 2 * number of URIs.
    tcount=$(tot < nb.txt)
    tcount=${tcount:-0}
    ucount=$(wc -l < bu.txt)
    if [ "$tcount" -ne $((2 * ucount)) ]; then
      printf 'length mismatch: tcount: %d != ucount: %d\n' \
        "$tcount" "$ucount" 1>&2
      exit 1
    fi

    export NUTCH_HEAPSIZE=20000
    export NUTCH_LOG_DIR=$work

    # Only generate a segment if none has been generated yet.  The original
    # test matched 'segments/2020*' only and broke when the glob matched
    # more than one directory; presumably segment names are timestamps, so
    # match any name here -- TODO confirm against the Nutch version in use.
    if ! compgen -G 'segments/*/crawl_generate/part-r-00000' > /dev/null; then
      "$nutch" freegen -Dmapreduce.job.maps=1 bu.txt segments
    fi

    # Fetch output is discarded, so make a failure visible in our own log
    # and remember it for the final status; then move on to the next segment.
    if ! "$nutch" fetch \
           -Dmapreduce.job.reduces=34 \
           segments/* -threads 144 > /dev/null 2>&1; then
      log fetch failed "$s" >&2
      status=1
      continue
    fi
    log finished "$s"

    if ! cp -a -- * "$dest"; then
      log copy failed "$s" >&2
      status=1
      continue
    fi

    # Record (with 15 lines of leading context) every WARC record that is
    # still marked as truncated after the re-fetch.
    for f in warc/warc/*.gz; do
      unpigz -dp 1 -c "$f" | grep -E -aiB15 '^WARC-Truncated: '
    done > truncated.txt
    cp -a truncated.txt "$dest/warc/warc" || status=1
    rm -rf cdx segments warc
  done

  exit "$status"
) 3< "bigpdf_${hn}.txt"
# Capture the status immediately: command substitutions on the echo line
# would otherwise overwrite $? before it is expanded.
rc=$?
log "$rc"