changeset 58:4f31d3234620

try nutch fetch for big pdfs
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 22 Apr 2020 18:42:23 +0100
parents 849ccd30258d
children 701fe81ada29
files bin/bigpdf.sh
diffstat 1 files changed, 36 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/bigpdf.sh	Wed Apr 22 18:42:23 2020 +0100
@@ -0,0 +1,36 @@
+#!/usr/bin/bash
+# Fetch big pdfs per segment from bigpdf_?.txt
+#  First line thereof gives CC identifier
+echo $(date) $(hostname)
+h=$(hostname)
+hn=${h##*n}
+head -1 bigpdf_${hn}.txt |\
+ { read cc ; \
+ tail -n +2 bigpdf_${hn}.txt |\
+    while read s
+    do
+	echo $(date) $(hostname) starting $s
+	mkdir -p /dev/shm/pcrawl/$s/segments
+	cd /dev/shm/pcrawl/$s
+	for f in /beegfs/common_crawl/CC-MAIN-$cc/$s/extract_*.tar
+	do
+	    tar -xOf $f '*.hdr' |\
+             fgrep -aA1 --no-group-separator 'X-HST-Truncated: length' |\
+             tee >(wc -l >>nb.txt) | fgrep X-HST-Target-URI | cut -f 2 -d ' '
+	done > bu.txt
+	if [ "$(tot < $b/nb.txt)" -ne\
+             $((2 * $(wc -l < $b/bu.txt))) ]
+	then
+	    printf "length mismatch: tcount: %d != ucount: %d\n" \
+             $(tot < $b/nb.txt) $(wc -l < $b/bu.txt) 1>&2
+	    exit 1
+        fi
+	export NUTCH_HEAPSIZE=20000
+	${HOME}/src/nutch-cc/runtime/local/bin/nutch freegen bu.txt segments
+	${HOME}/src/nutch-cc/runtime/local/bin/nutch fetch\
+          -Dmapreduce.job.reduces=34 -Dhadoop.log.dir=/dev/shm/pcrawl/$s\
+          segments/* -threads 36 >log 2>&1
+	echo $(date) $(hostname) finished $s
+	cp -a * /beegfs/common_crawl/CC-MAIN-$cc/$s
+    done
+echo $(date) $(hostname) $?