changeset 60:ff4e85d8ec31

Switch for use on the login server: invoke by hand with 0/1 as the only command-line argument
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 23 Apr 2020 17:25:25 +0100
parents 701fe81ada29
children b24755311af8
files bin/bigpdf.sh
diffstat 1 files changed, 26 insertions(+), 17 deletions(-) [+]
line wrap: on
line diff
--- a/bin/bigpdf.sh	Wed Apr 22 18:42:40 2020 +0100
+++ b/bin/bigpdf.sh	Thu Apr 23 17:25:25 2020 +0100
@@ -1,36 +1,45 @@
 #!/usr/bin/bash
 # Fetch big pdfs per segment from bigpdf_?.txt
 #  First line thereof gives CC identifier
-echo $(date) $(hostname)
-h=$(hostname)
-hn=${h##*n}
+hn=$1
+echo $(date) $hn
+tot () 
+{ 
+    awk '{sum+=$1} END {printf "%u\n",sum}'
+}
 head -1 bigpdf_${hn}.txt |\
- { read cc ; \
+ { read cc
  tail -n +2 bigpdf_${hn}.txt |\
     while read s
     do
 	echo $(date) $(hostname) starting $s
 	mkdir -p /dev/shm/pcrawl/$s/segments
 	cd /dev/shm/pcrawl/$s
-	for f in /beegfs/common_crawl/CC-MAIN-$cc/$s/extract_*.tar
-	do
-	    tar -xOf $f '*.hdr' |\
-             fgrep -aA1 --no-group-separator 'X-HST-Truncated: length' |\
-             tee >(wc -l >>nb.txt) | fgrep X-HST-Target-URI | cut -f 2 -d ' '
-	done > bu.txt
-	if [ "$(tot < $b/nb.txt)" -ne\
-             $((2 * $(wc -l < $b/bu.txt))) ]
+	if [ ! -f bu.txt ]
+	then
+            for f in /beegfs/common_crawl/CC-MAIN-$cc/$s/extract_*.tar
+	    do
+		tar -xOf $f '*.hdr' |\
+		 fgrep -aA1 --no-group-separator 'X-HST-Truncated: length' |\
+		 tee >(wc -l >>nb.txt) | fgrep X-HST-Target-URI | cut -f 2 -d ' '
+	    done > bu.txt
+        fi
+	if [ "$(tot < nb.txt)" -ne\
+             $((2 * $(wc -l < bu.txt))) ]
 	then
 	    printf "length mismatch: tcount: %d != ucount: %d\n" \
-             $(tot < $b/nb.txt) $(wc -l < $b/bu.txt) 1>&2
+             $(tot < nb.txt) $(wc -l < bu.txt) 1>&2
 	    exit 1
         fi
 	export NUTCH_HEAPSIZE=20000
+	export NUTCH_LOG_DIR=/dev/shm/pcrawl/$s
 	${HOME}/src/nutch-cc/runtime/local/bin/nutch freegen bu.txt segments
 	${HOME}/src/nutch-cc/runtime/local/bin/nutch fetch\
-          -Dmapreduce.job.reduces=34 -Dhadoop.log.dir=/dev/shm/pcrawl/$s\
-          segments/* -threads 36 >log 2>&1
-	echo $(date) $(hostname) finished $s
-	cp -a * /beegfs/common_crawl/CC-MAIN-$cc/$s
+          -Dmapreduce.job.reduces=34 \
+          segments/* -threads 72 >/dev/null 2>&1 && \
+	  echo $(date) $(hostname) finished $s && \
+	  cp -a * /beegfs/common_crawl/CC-MAIN-$cc/$s #&& \
+	  #rm -rf cdx segments warc
     done
+ }
 echo $(date) $(hostname) $?