<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>
  <!-- from email from Sebastian 2020-04-16, ccrawl:57[01] -->
  <!-- HTTP protocol settings: download limits, which request/response details
       are stored, the Accept-* header values, and redirect handling.
       NOTE(review): the limits are plain numbers; http.content.limit is
       presumably bytes, http.time.limit presumably seconds and http.timeout
       presumably milliseconds. Confirm the units against nutch-default.xml. -->
  <property><name>http.content.limit</name><value>500000000</value></property>
  <property><name>http.store.responsetime</name><value>true</value></property>
  <property><name>store.ip.address</name><value>true</value></property>
  <property><name>store.http.request</name><value>true</value></property>
  <property><name>store.http.headers</name><value>true</value></property>
  <property><name>http.accept.language</name><value>en-US,en;q=0.5</value></property>
  <!-- The value below is a single space, not an empty string. Presumably this
       is deliberate (to blank out the default charset list); confirm before
       trimming it. -->
  <property><name>http.accept.charset</name><value> </value></property>
  <property><name>http.time.limit</name><value>600</value></property>
  <property><name>http.timeout</name><value>45000</value></property>
  <property><name>http.redirect.max</name><value>3</value></property>
  <!-- NOTE(review): stock Nutch appears to call this property
       http.redirect.max.exceeded.skip. Verify that the name used here is
       actually read by the Nutch build in use; otherwise it is ignored. -->
  <property><name>http.redirect.max.skip</name><value>false</value></property>
  <property><name>http.partial.truncated</name><value>true</value></property>

  <!-- Crawler contact information: fill in the details of your own crawler. -->
  <property><name>http.agent.name</name><value>htInEdin</value></property>
  <!--<property><name>http.robots.agents</name><value>XXX</value></property>-->
  <property><name>http.agent.description</name><value>Experimental PDF crawler</value></property>
  <property><name>http.agent.url</name><value>http://www.ltg.ed.ac.uk/~ht/pdfCrawl.html</value></property>
  <property><name>http.agent.version</name><value>0.1</value></property>

  <!-- Fetcher settings: page content is not stored (fetcher.store.content is
       false) while WARC output, robots.txt and 404 storage are switched on.
       NOTE(review): several of these names presumably come from the modified
       Nutch named in warc.export.software below; verify them there. -->
  <property><name>fetcher.store.content</name><value>false</value></property>
  <property><name>fetcher.store.robotstxt</name><value>true</value></property>
  <property><name>fetcher.store.404s</name><value>true</value></property>
  <property><name>fetcher.store.warc</name><value>true</value></property>
  <property><name>fetcher.signature</name><value>true</value></property>
  <property><name>fetcher.redirect.dedupcache.seconds</name><value>5400</value></property>
  <property><name>fetcher.redirect.dedupcache.size</name><value>6000</value></property>

  <!-- we use okhttp -->
  <property><name>plugin.includes</name><value>protocol-okhttp</value></property>

  <!-- WARC file names and info header - needs to be adapted, e.g. that's the info
       of the current March/April crawl:
         PREFIX=CC-MAIN
         ISPARTOF=CC-MAIN-2020-16
         PUBLISHER='Common Crawl'
         OPERATOR='Common Crawl Admin (info@commoncrawl.org)'
         SOFTWARE='Apache Nutch 1.16 (modified, https://github.com/commoncrawl/nutch/)'
         DESCRIPTION='Wide crawl of the web for March/April 2020'
  -->
  <!-- Values adapted for this crawl; isPartOf is left commented out. -->
  <property><name>warc.export.prefix</name><value>CC-HST-PDF</value></property>
  <property><name>warc.export.operator</name><value>ht@inf.ed.ac.uk</value></property>
  <property><name>warc.export.publisher</name><value>School of Informatics, University of Edinburgh</value></property>
  <property><name>warc.export.software</name><value>Apache Nutch 1.16 (modified, https://github.com/commoncrawl/nutch/)</value></property>
  <property><name>warc.export.description</name><value>Augment CC of August 2020 with PDFs larger than 1MB</value></property>
  <!--property><name>warc.export.isPartOf</name><value>$ISPARTOF</value></property-->

  <!-- further WARC writer config -->
  <!-- trying relative paths??? -->
  <!-- The two path values below ("warc" and "cdx") are relative; what they
       resolve against is not shown in this file. -->
  <property><name>warc.export.path</name><value>warc</value></property>
  <property><name>warc.export.cdx.path</name><value>cdx</value></property>
  <property><name>warc.deduplicate</name><value>true</value></property>
  <property><name>warc.export.text</name><value>false</value></property>
  <property><name>warc.export.crawldiagnostics</name><value>true</value></property>
  <property><name>warc.export.robotstxt</name><value>true</value></property>
  <property><name>warc.export.cdx</name><value>true</value></property>

</configuration>