changeset 74:b7daa4f8767c

works for big files with Hadoop 3.4.0
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 06 May 2020 14:23:33 +0100
parents 0780445a0840
children 1c5dab2e1cb3
files src/nutch-cc/conf/nutch-site.xml
diffstat 1 files changed, 14 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/src/nutch-cc/conf/nutch-site.xml	Wed May 06 14:22:48 2020 +0100
+++ b/src/nutch-cc/conf/nutch-site.xml	Wed May 06 14:23:33 2020 +0100
@@ -5,14 +5,14 @@
 
 <configuration>
   <!-- from email from Sebastian 2020-04-16, ccrawl:57[01] -->
-  <property><name>http.content.limit</name><value>500000000</value></property>
+  <property><name>http.content.limit</name><value>-1</value></property>
   <property><name>http.store.responsetime</name><value>true</value></property>
   <property><name>store.ip.address</name><value>true</value></property>
   <property><name>store.http.request</name><value>true</value></property>
   <property><name>store.http.headers</name><value>true</value></property>
   <property><name>http.accept.language</name><value>en-US,en;q=0.5</value></property>
   <property><name>http.accept.charset</name><value> </value></property>
-  <property><name>http.time.limit</name><value>600</value></property>
+  <property><name>http.time.limit</name><value>1200</value></property>
   <property><name>http.timeout</name><value>45000</value></property>
   <property><name>http.redirect.max</name><value>3</value></property>
   <property><name>http.redirect.max.skip</name><value>false</value></property>
@@ -32,6 +32,18 @@
   <property><name>fetcher.signature</name><value>true</value></property>
   <property><name>fetcher.redirect.dedupcache.seconds</name><value>5400</value></property>
   <property><name>fetcher.redirect.dedupcache.size</name><value>6000</value></property>
+  <property><name>fetcher.threads.timeout.divisor</name><value>1</value></property>
+  <property><name>mapreduce.fileoutputcommitter.marksuccessfuljobs</name> <value>true</value></property>
+  <property><name>mapreduce.task.timeout</name><value>1200000</value>
+    <!-- Upstream default entry, quoted from hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/target/classes/mapred-default.xml:
+    <value>600000</value>
+    <description>The number of milliseconds before a task will be
+    terminated if it neither reads an input, writes an output, nor
+    updates its status string.  A value of 0 disables the timeout.
+    [HST: when tried, 0 seemed to cause immediate thread termination,
+    so an explicit non-zero timeout is set above.]
+    </description> -->
+  </property>
 
   <!-- we use okhttp -->
   <property><name>plugin.includes</name><value>protocol-okhttp</value></property>