view bin/bigpdf.sh @ 58:4f31d3234620

try nutch fetch for big pdfs
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 22 Apr 2020 18:42:23 +0100
parents
children ff4e85d8ec31
line wrap: on
line source

#!/usr/bin/bash
# Fetch big pdfs per segment from bigpdf_?.txt
#  First line thereof gives CC identifier, every later line names one segment.
#  The ? is this host's name with everything up to its last 'n' stripped.
#
# For each segment:
#   1. scan the segment's extract_*.tar files for header entries flagged
#      'X-HST-Truncated: length' and collect their X-HST-Target-URI values
#      into bu.txt (per-tar line counts are recorded in nb.txt),
#   2. check that the number of header lines kept is twice the number of URIs,
#   3. run nutch freegen and nutch fetch on bu.txt in a scratch directory
#      under /dev/shm/pcrawl (fetch output goes to the file 'log'),
#   4. copy the scratch directory's contents next to the segment's tar files.
#
# Exit status: 1 if the segment list is unreadable or has no CC identifier,
#  if a scratch directory cannot be set up, or if the counts in step 2
#  disagree; otherwise the status of the last command run for the last
#  segment (0 when there are no segments).

h=$(hostname)
hn=${h##*n}
nutch=${HOME}/src/nutch-cc/runtime/local/bin/nutch

# Scratch file for one tar's truncated-header lines; removed on any exit.
hdr_tmp=$(mktemp) || exit 1
trap 'rm -f -- "$hdr_tmp"' EXIT

# Print a progress line: date, host name, then any arguments.
log() {
  # $(date) and $h are deliberately unquoted so runs of blanks in the date
  # collapse to single spaces.
  # shellcheck disable=SC2046,SC2086
  echo $(date) $h "$@"
}

# Process every segment named in bigpdf_${hn}.txt (see file header).
main() {
  local list=bigpdf_${hn}.txt
  local cc s f n tcount ucount work src

  if [[ ! -r $list ]]; then
    printf '%s: cannot read %s\n' "${0##*/}" "$list" >&2
    return 1
  fi

  # The list is read on fd 3 so the commands in the loop body can never
  # consume it through their standard input, and so the loop runs in this
  # shell: 'return' below really stops all processing.
  {
    read -r cc <&3
    if [[ -z $cc ]]; then
      printf '%s: no CC identifier on first line of %s\n' \
        "${0##*/}" "$list" >&2
      return 1
    fi

    # '|| [[ -n $s ]]' keeps a final line that lacks a trailing newline.
    while read -r s <&3 || [[ -n $s ]]; do
      [[ -n $s ]] || continue # ignore blank lines
      work=/dev/shm/pcrawl/$s
      src=/beegfs/common_crawl/CC-MAIN-$cc/$s

      log starting "$s"
      if ! mkdir -p "$work/segments"; then
        printf '%s: cannot create %s\n' "${0##*/}" "$work/segments" >&2
        return 1
      fi
      if ! cd "$work"; then
        printf '%s: cannot cd to %s\n' "${0##*/}" "$work" >&2
        return 1
      fi

      # bu.txt is rewritten below, so nb.txt must start empty too or a
      # rerun on the same segment would leave stale counts in it.
      : > nb.txt
      tcount=0
      for f in "$src"/extract_*.tar; do
        # -A1 keeps each flagged line plus the line after it, so a
        # well-formed tar is expected to give two lines per URI.
        tar -xOf "$f" '*.hdr' |
          grep -F -aA1 --no-group-separator 'X-HST-Truncated: length' \
            > "$hdr_tmp"
        n=$(wc -l < "$hdr_tmp")
        printf '%s\n' "$n" >> nb.txt
        tcount=$((tcount + n))
        grep -F X-HST-Target-URI < "$hdr_tmp" | cut -f 2 -d ' '
      done > bu.txt

      ucount=$(wc -l < bu.txt)
      if ((tcount != 2 * ucount)); then
        printf "length mismatch: tcount: %d != ucount: %d\n" \
          "$tcount" "$ucount" 1>&2
        return 1
      fi

      "$nutch" freegen bu.txt segments
      "$nutch" fetch \
        -Dmapreduce.job.reduces=34 "-Dhadoop.log.dir=$work" \
        segments/* -threads 36 > log 2>&1
      log finished "$s"
      cp -a -- * "$src"
    done
  } 3< "$list"
}

# Exported so the nutch processes started in main inherit it.
export NUTCH_HEAPSIZE=20000

log
main "$@"
rc=$?
# Status is captured before logging: the command substitutions inside log
# would otherwise overwrite $?.
log "$rc"
exit "$rc"