Mercurial > hg > cc > cirrus_home
changeset 68:d5f59c1fdc10
mostly from Sebastian
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 24 Apr 2020 20:12:29 +0100 |
parents | ecdbaa11b88e |
children | 1e3cfb84867c |
files | src/nutch-cc/conf/nutch-site.xml |
diffstat | 1 files changed, 65 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/nutch-cc/conf/nutch-site.xml Fri Apr 24 20:12:29 2020 +0100
@@ -0,0 +1,65 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<!-- Site-specific property overrides go in this file. -->
+
+<configuration>
+  <!-- Settings taken from Sebastian's email of 2020-04-16, ccrawl:57[01] -->
+  <property><name>http.content.limit</name><value>-1</value></property>
+  <property><name>http.store.responsetime</name><value>true</value></property>
+  <property><name>store.ip.address</name><value>true</value></property>
+  <property><name>store.http.request</name><value>true</value></property>
+  <property><name>store.http.headers</name><value>true</value></property>
+  <property><name>http.accept.language</name><value>en-US,en;q=0.5</value></property>
+  <property><name>http.accept.charset</name><value> </value></property>
+  <property><name>http.time.limit</name><value>300</value></property>
+  <property><name>http.timeout</name><value>45000</value></property>
+  <property><name>http.redirect.max</name><value>3</value></property>
+  <property><name>http.redirect.max.skip</name><value>false</value></property>
+  <property><name>http.partial.truncated</name><value>true</value></property>
+
+  <!-- Crawler contact information: must be filled in for your own crawler -->
+  <property><name>http.agent.name</name><value>htInEdin</value></property>
+  <!--<property><name>http.robots.agents</name><value>XXX</value></property>-->
+  <property><name>http.agent.description</name><value>Experimental PDF crawler</value></property>
+  <property><name>http.agent.url</name><value>http://www.ltg.ed.ac.uk/~ht/pdfCrawl.html</value></property>
+  <property><name>http.agent.version</name><value>0.1</value></property>
+
+  <property><name>fetcher.store.content</name><value>false</value></property>
+  <property><name>fetcher.store.robotstxt</name><value>true</value></property>
+  <property><name>fetcher.store.404s</name><value>true</value></property>
+  <property><name>fetcher.store.warc</name><value>true</value></property>
+  <property><name>fetcher.signature</name><value>true</value></property>
+  <property><name>fetcher.redirect.dedupcache.seconds</name><value>5400</value></property>
+  <property><name>fetcher.redirect.dedupcache.size</name><value>6000</value></property>
+
+  <!-- The okhttp protocol plugin is the one we use -->
+  <property><name>plugin.includes</name><value>protocol-okhttp</value></property>
+
+  <!-- WARC file names and info header. These need to be adapted; for example,
+       the current March/April crawl uses the following values:
+         PREFIX=CC-MAIN
+         ISPARTOF=CC-MAIN-2020-16
+         PUBLISHER='Common Crawl'
+         OPERATOR='Common Crawl Admin (info@commoncrawl.org)'
+         SOFTWARE='Apache Nutch 1.16 (modified, https://github.com/commoncrawl/nutch/)'
+         DESCRIPTION='Wide crawl of the web for March/April 2020'
+  -->
+  <property><name>warc.export.prefix</name><value>CC-HST-PDF</value></property>
+  <property><name>warc.export.operator</name><value>ht@inf.ed.ac.uk</value></property>
+  <property><name>warc.export.publisher</name><value>School of Informatics, University of Edinburgh</value></property>
+  <property><name>warc.export.software</name><value>Apache Nutch 1.16 (modified, https://github.com/commoncrawl/nutch/)</value></property>
+  <property><name>warc.export.description</name><value>Augment CC of August 2020 with PDFs larger than 1MB</value></property>
+  <!--property><name>warc.export.isPartOf</name><value>$ISPARTOF</value></property-->
+
+  <!-- Further WARC writer configuration -->
+  <!-- Experimental: trying relative paths for the export directories (unverified) -->
+  <property><name>warc.export.path</name><value>warc</value></property>
+  <property><name>warc.export.cdx.path</name><value>cdx</value></property>
+  <property><name>warc.deduplicate</name><value>true</value></property>
+  <property><name>warc.export.text</name><value>false</value></property>
+  <property><name>warc.export.crawldiagnostics</name><value>true</value></property>
+  <property><name>warc.export.robotstxt</name><value>true</value></property>
+  <property><name>warc.export.cdx</name><value>true</value></property>
+
+</configuration>