Mercurial > hg > cc > cirrus_home
changeset 58:4f31d3234620
try nutch fetch for big pdfs
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 22 Apr 2020 18:42:23 +0100 |
parents | 849ccd30258d |
children | 701fe81ada29 |
files | bin/bigpdf.sh |
diffstat | 1 files changed, 36 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/bash
# Fetch big pdfs per segment from bigpdf_?.txt
#
# Usage: bigpdf.sh        (no arguments)
#
# Input:   bigpdf_<N>.txt in the current directory, where <N> is whatever
#          follows the last 'n' in this host's name.  The first line gives
#          the CC identifier (used as CC-MAIN-<id>); every further line names
#          one segment.
# Per segment, working in /dev/shm/pcrawl/<segment>:
#   nb.txt    one line count per extract_*.tar: the number of header lines
#             selected by 'X-HST-Truncated: length' plus the line after each
#   bu.txt    the X-HST-Target-URI values found among those lines
#   segments/ filled by 'nutch freegen' and 'nutch fetch'
#   log       stdout+stderr of 'nutch fetch'
#   Everything is finally copied back next to the source tars.
# Exit status: the status that is also logged on the last output line.

set -u

readonly CRAWL_ROOT=/beegfs/common_crawl
readonly WORK_ROOT=/dev/shm/pcrawl
readonly NUTCH="${HOME}/src/nutch-cc/runtime/local/bin/nutch"

export NUTCH_HEAPSIZE=20000

#######################################
# Print a log line prefixed with the current date and the host name.
# Arguments: optional message words
# Outputs:   one line on stdout
#######################################
log() {
  echo "$(date) $(hostname)${*:+ $*}"
}

#######################################
# Scan every extract_*.tar of one segment for length-truncated records.
# Writes the per-tar line counts to nb.txt (current directory), prints the
# target URIs on stdout and stores the summed line count in the global
# variable named below.
# Globals:   TRUNC_LINE_TOTAL (written)
# Arguments: $1 - directory holding the extract_*.tar files
# Outputs:   one URI per line on stdout
# Returns:   0 on success, 1 if no scratch file could be created
#######################################
collect_truncated_uris() {
  local src_dir=$1
  local tar_file scratch n

  TRUNC_LINE_TOTAL=0
  scratch=$(mktemp) || return 1
  # Start from an empty count file: it is appended to below, and a stale copy
  # from an earlier run on the same segment would break the consistency check.
  : > nb.txt

  for tar_file in "${src_dir}"/extract_*.tar; do
    if [[ ! -e "${tar_file}" ]]; then
      printf 'no extract tar matching %s\n' "${tar_file}" >&2
      continue
    fi
    # The selected lines go through a scratch file rather than
    # 'tee >(wc -l >>nb.txt)': bash does not wait for a process substitution,
    # so the count could still be missing when it is read afterwards.
    # --wildcards is required for GNU tar to treat '*.hdr' as a pattern when
    # extracting.
    tar -xOf "${tar_file}" --wildcards '*.hdr' \
      | grep -F -aA1 --no-group-separator 'X-HST-Truncated: length' \
        > "${scratch}"
    n=$(wc -l < "${scratch}")
    echo "${n}" >> nb.txt
    TRUNC_LINE_TOTAL=$((TRUNC_LINE_TOTAL + n))
    grep -F X-HST-Target-URI "${scratch}" | cut -f 2 -d ' '
  done

  rm -f -- "${scratch}"
}

#######################################
# Process every segment listed for this host.
# Globals:   CRAWL_ROOT, WORK_ROOT, NUTCH (read); TRUNC_LINE_TOTAL (written)
# Arguments: none
# Returns:   1 on a missing/empty list, an unusable work directory or a count
#            mismatch (remaining segments are then skipped); otherwise the
#            status of the last command run for the last segment.
#######################################
main() {
  local host node list cc seg src work ucount

  host=$(hostname)
  node=${host##*n}
  list="bigpdf_${node}.txt"
  if [[ ! -r "${list}" ]]; then
    printf 'cannot read %s\n' "${list}" >&2
    return 1
  fi

  # The list is read on fd 3 so that nothing run inside the loop can consume
  # it through stdin; it is opened before any cd, so the relative name is fine.
  {
    read -r cc <&3
    if [[ -z "${cc}" ]]; then
      printf 'no CC identifier on first line of %s\n' "${list}" >&2
      return 1
    fi

    while read -r seg <&3 || [[ -n "${seg}" ]]; do
      [[ -n "${seg}" ]] || continue
      src="${CRAWL_ROOT}/CC-MAIN-${cc}/${seg}"
      work="${WORK_ROOT}/${seg}"

      log starting "${seg}"
      mkdir -p "${work}/segments" || return 1
      cd "${work}" || return 1

      collect_truncated_uris "${src}" > bu.txt || return 1

      # Each selected header line is expected to be followed by exactly one
      # URI line, so the line total must be twice the URI count.
      # NOTE(review): the original obtained the total by piping nb.txt through
      # an external 'tot' command that is not defined in this file (presumably
      # it summed its input - confirm); the sum is now accumulated directly.
      ucount=$(wc -l < bu.txt)
      if [ "${TRUNC_LINE_TOTAL}" -ne $((2 * ucount)) ]; then
        printf "length mismatch: tcount: %d != ucount: %d\n" \
          "${TRUNC_LINE_TOTAL}" "${ucount}" 1>&2
        return 1
      fi

      "${NUTCH}" freegen bu.txt segments
      "${NUTCH}" fetch \
        -Dmapreduce.job.reduces=34 -Dhadoop.log.dir="${work}" \
        segments/* -threads 36 >log 2>&1
      log finished "${seg}"
      cp -a -- ./* "${src}"
    done
  } 3< "${list}"
}

log
main
rc=$?
log "${rc}"
exit "${rc}"