annotate src/nutch-cc/conf/nutch-site.xml @ 74:b7daa4f8767c

works for big files with Hadoop 3.4.0
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 06 May 2020 14:23:33 +0100
parents 17eb428525cb
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
68
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 <?xml version="1.0"?>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4 <!-- Put site-specific property overrides in this file. -->
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6 <configuration>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
7 <!-- from email from Sebastian 2020-04-16, ccrawl:57[01] -->
74
b7daa4f8767c works for big files with Hadoop 3.4.0
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 71
diff changeset
8 <property><name>http.content.limit</name><value>-1</value></property>
68
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
9 <property><name>http.store.responsetime</name><value>true</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
10 <property><name>store.ip.address</name><value>true</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
11 <property><name>store.http.request</name><value>true</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
12 <property><name>store.http.headers</name><value>true</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
13 <property><name>http.accept.language</name><value>en-US,en;q=0.5</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
14 <property><name>http.accept.charset</name><value> </value></property>
74
b7daa4f8767c works for big files with Hadoop 3.4.0
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 71
diff changeset
15 <property><name>http.time.limit</name><value>1200</value></property>
68
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
16 <property><name>http.timeout</name><value>45000</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
17 <property><name>http.redirect.max</name><value>3</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
18 <property><name>http.redirect.max.skip</name><value>false</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
19 <property><name>http.partial.truncated</name><value>true</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
20
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
21 <!-- need to fill in your crawler contact information -->
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
22 <property><name>http.agent.name</name><value>htInEdin</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
23 <!--<property><name>http.robots.agents</name><value>XXX</value></property>-->
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
24 <property><name>http.agent.description</name><value>Experimental PDF crawler</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
25 <property><name>http.agent.url</name><value>http://www.ltg.ed.ac.uk/~ht/pdfCrawl.html</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
26 <property><name>http.agent.version</name><value>0.1</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
27
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
28 <property><name>fetcher.store.content</name><value>false</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
29 <property><name>fetcher.store.robotstxt</name><value>true</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
30 <property><name>fetcher.store.404s</name><value>true</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
31 <property><name>fetcher.store.warc</name><value>true</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
32 <property><name>fetcher.signature</name><value>true</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
33 <property><name>fetcher.redirect.dedupcache.seconds</name><value>5400</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
34 <property><name>fetcher.redirect.dedupcache.size</name><value>6000</value></property>
74
b7daa4f8767c works for big files with Hadoop 3.4.0
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 71
diff changeset
35 <property><name>fetcher.threads.timeout.divisor</name><value>1</value></property>
b7daa4f8767c works for big files with Hadoop 3.4.0
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 71
diff changeset
36 <property><name>mapreduce.fileoutputcommitter.marksuccessfuljobs</name> <value>true</value></property>
b7daa4f8767c works for big files with Hadoop 3.4.0
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 71
diff changeset
37 <property><name>mapreduce.task.timeout</name><value>1200000</value>
b7daa4f8767c works for big files with Hadoop 3.4.0
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 71
diff changeset
38 <!-- default, quoted from hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/target/classes/mapred-default.xml:
b7daa4f8767c works for big files with Hadoop 3.4.0
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 71
diff changeset
39 <value>600000</value>
b7daa4f8767c works for big files with Hadoop 3.4.0
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 71
diff changeset
40 <description>The number of milliseconds before a task will be
b7daa4f8767c works for big files with Hadoop 3.4.0
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 71
diff changeset
41 terminated if it neither reads an input, writes an output, nor
b7daa4f8767c works for big files with Hadoop 3.4.0
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 71
diff changeset
42 updates its status string. A value of 0 disables the timeout.
b7daa4f8767c works for big files with Hadoop 3.4.0
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 71
diff changeset
43 [HST thinks 0 caused an immediate thread termination when he tried
b7daa4f8767c works for big files with Hadoop 3.4.0
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 71
diff changeset
44 it!]
b7daa4f8767c works for big files with Hadoop 3.4.0
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 71
diff changeset
45 </description> -->
b7daa4f8767c works for big files with Hadoop 3.4.0
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 71
diff changeset
46 </property>
68
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
47
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
48 <!-- we use okhttp -->
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
49 <property><name>plugin.includes</name><value>protocol-okhttp</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
50
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
51 <!-- WARC file names and info header - these need to be adapted; for example, this is the info
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
52 of the current March/April crawl:
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
53 PREFIX=CC-MAIN
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
54 ISPARTOF=CC-MAIN-2020-16
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
55 PUBLISHER='Common Crawl'
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
56 OPERATOR='Common Crawl Admin (info@commoncrawl.org)'
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
57 SOFTWARE='Apache Nutch 1.16 (modified, https://github.com/commoncrawl/nutch/)'
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
58 DESCRIPTION='Wide crawl of the web for March/April 2020'
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
59 -->
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
60 <property><name>warc.export.prefix</name><value>CC-HST-PDF</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
61 <property><name>warc.export.operator</name><value>ht@inf.ed.ac.uk</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
62 <property><name>warc.export.publisher</name><value>School of Informatics, University of Edinburgh</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
63 <property><name>warc.export.software</name><value>Apache Nutch 1.16 (modified, https://github.com/commoncrawl/nutch/)</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
64 <property><name>warc.export.description</name><value>Augment CC of August 2020 with PDFs larger than 1MB</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
65 <!--property><name>warc.export.isPartOf</name><value>$ISPARTOF</value></property-->
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
66
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
67 <!-- further WARC writer config -->
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
68 <!-- trying relative paths??? -->
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
69 <property><name>warc.export.path</name><value>warc</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
70 <property><name>warc.export.cdx.path</name><value>cdx</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
71 <property><name>warc.deduplicate</name><value>true</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
72 <property><name>warc.export.text</name><value>false</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
73 <property><name>warc.export.crawldiagnostics</name><value>true</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
74 <property><name>warc.export.robotstxt</name><value>true</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
75 <property><name>warc.export.cdx</name><value>true</value></property>
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
76
d5f59c1fdc10 mostly from Sebastian
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
77 </configuration>