Mercurial > hg > cc > cirrus_home
changeset 74:b7daa4f8767c
works for big files with Hadoop 3.4.0
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 06 May 2020 14:23:33 +0100 |
parents | 0780445a0840 |
children | 1c5dab2e1cb3 |
files | src/nutch-cc/conf/nutch-site.xml |
diffstat | 1 files changed, 14 insertions(+), 2 deletions(-) [+] |
line wrap: on
line diff
--- a/src/nutch-cc/conf/nutch-site.xml Wed May 06 14:22:48 2020 +0100
+++ b/src/nutch-cc/conf/nutch-site.xml Wed May 06 14:23:33 2020 +0100
@@ -5,14 +5,14 @@
 <configuration>
  <!-- from email from Sebastian 2020-04-16, ccrawl:57[01] -->
- <property><name>http.content.limit</name><value>500000000</value></property>
+ <property><name>http.content.limit</name><value>-1</value></property>
  <property><name>http.store.responsetime</name><value>true</value></property>
  <property><name>store.ip.address</name><value>true</value></property>
  <property><name>store.http.request</name><value>true</value></property>
  <property><name>store.http.headers</name><value>true</value></property>
  <property><name>http.accept.language</name><value>en-US,en;q=0.5</value></property>
  <property><name>http.accept.charset</name><value> </value></property>
- <property><name>http.time.limit</name><value>600</value></property>
+ <property><name>http.time.limit</name><value>1200</value></property>
  <property><name>http.timeout</name><value>45000</value></property>
  <property><name>http.redirect.max</name><value>3</value></property>
  <property><name>http.redirect.max.skip</name><value>false</value></property>
@@ -32,6 +32,18 @@
  <property><name>fetcher.signature</name><value>true</value></property>
  <property><name>fetcher.redirect.dedupcache.seconds</name><value>5400</value></property>
  <property><name>fetcher.redirect.dedupcache.size</name><value>6000</value></property>
+ <property><name>fetcher.threads.timeout.divisor</name><value>1</value></property>
+ <property><name>mapreduce.fileoutputcommitter.marksuccessfuljobs</name> <value>true</value></property>
+ <property><name>mapreduce.task.timeout</name><value>1200000</value>
+ <!-- from hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/target/classes/mapred-default.xml
+ <value>600000</value>
+ <description>The number of milliseconds before a task will be
+ terminated if it neither reads an input, writes an output, nor
+ updates its status string. A value of 0 disables the timeout.
+ [HST thinks 0 caused an immediate thread termination when he tried
+ it!]
+ </description> -->
+ </property>
  <!-- we use okhttp -->
  <property><name>plugin.includes</name><value>protocol-okhttp</value></property>