changeset 72:f1bf3effa893

log truncations
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 28 Apr 2020 19:02:34 +0100
parents 17eb428525cb
children 0780445a0840
files bin/bigpdf.sh
diffstat 1 files changed, 11 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/bin/bigpdf.sh	Tue Apr 28 19:02:14 2020 +0100
+++ b/bin/bigpdf.sh	Tue Apr 28 19:02:34 2020 +0100
@@ -34,13 +34,22 @@
         fi
 	export NUTCH_HEAPSIZE=20000
 	export NUTCH_LOG_DIR=/dev/shm/pcrawl/$s
-	${HOME}/src/nutch-cc/runtime/local/bin/nutch freegen -Dmapreduce.job.maps=1 bu.txt segments
+	if [ ! -f segments/2020*/crawl_generate/part-r-00000 ]
+        then
+	    ${HOME}/src/nutch-cc/runtime/local/bin/nutch freegen -Dmapreduce.job.maps=1 bu.txt segments
+        fi
 	${HOME}/src/nutch-cc/runtime/local/bin/nutch fetch\
           -Dmapreduce.job.reduces=34 \
           segments/* -threads 144 >/dev/null 2>&1 && \
 	  echo $(date) $(hostname) finished $s && \
 	  cp -a * /beegfs/common_crawl/CC-MAIN-$cc/$s && \
-	  rm -rf cdx segments warc
+	  { for f in warc/warc/*.gz
+            do
+             unpigz -dp 1 -c $f | egrep -aiB15 '^WARC-Truncated: '
+            done > truncated.txt
+	    cp -a truncated.txt /beegfs/common_crawl/CC-MAIN-$cc/$s/warc/warc
+	    rm -rf cdx segments warc
+          }
     done
  }
 echo $(date) $(hostname) $?