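# HG changeset patch
# User Henry S. Thompson
# Date 1587755549 -3600
# Node ID d5f59c1fdc10f88027a465746e7236ae7ecd1ff9
# Parent  ecdbaa11b88e474c90df2e1db28ab8524017ed99
mostly from Sebastian

diff -r ecdbaa11b88e -r d5f59c1fdc10 src/nutch-cc/conf/nutch-site.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/nutch-cc/conf/nutch-site.xml	Fri Apr 24 20:12:29 2020 +0100
@@ -0,0 +1,65 @@
+<?xml version="1.0"?>
+
+
+
+
+<configuration>
+
+  <property><name>http.content.limit</name><value>-1</value></property>
+  <property><name>http.store.responsetime</name><value>true</value></property>
+  <property><name>store.ip.address</name><value>true</value></property>
+  <property><name>store.http.request</name><value>true</value></property>
+  <property><name>store.http.headers</name><value>true</value></property>
+  <property><name>http.accept.language</name><value>en-US,en;q=0.5</value></property>
+  <property><name>http.accept.charset</name><value></value></property>
+  <property><name>http.time.limit</name><value>300</value></property>
+  <property><name>http.timeout</name><value>45000</value></property>
+  <property><name>http.redirect.max</name><value>3</value></property>
+  <property><name>http.redirect.max.skip</name><value>false</value></property>
+  <property><name>http.partial.truncated</name><value>true</value></property>
+
+
+  <property><name>http.agent.name</name><value>htInEdin</value></property>
+
+  <property><name>http.agent.description</name><value>Experimental PDF crawler</value></property>
+  <property><name>http.agent.url</name><value>http://www.ltg.ed.ac.uk/~ht/pdfCrawl.html</value></property>
+  <property><name>http.agent.version</name><value>0.1</value></property>
+
+  <property><name>fetcher.store.content</name><value>false</value></property>
+  <property><name>fetcher.store.robotstxt</name><value>true</value></property>
+  <property><name>fetcher.store.404s</name><value>true</value></property>
+  <property><name>fetcher.store.warc</name><value>true</value></property>
+  <property><name>fetcher.signature</name><value>true</value></property>
+  <property><name>fetcher.redirect.dedupcache.seconds</name><value>5400</value></property>
+  <property><name>fetcher.redirect.dedupcache.size</name><value>6000</value></property>
+
+
+  <property><name>plugin.includes</name><value>protocol-okhttp</value></property>
+
+
+  <property><name>warc.export.prefix</name><value>CC-HST-PDF</value></property>
+  <property><name>warc.export.operator</name><value>ht@inf.ed.ac.uk</value></property>
+  <property><name>warc.export.publisher</name><value>School of Informatics, University of Edinburgh</value></property>
+  <property><name>warc.export.software</name><value>Apache Nutch 1.16 (modified, https://github.com/commoncrawl/nutch/)</value></property>
+  <property><name>warc.export.description</name><value>Augment CC of August 2020 with PDFs larger than 1MB</value></property>
+
+
+
+
+  <property><name>warc.export.path</name><value>warc</value></property>
+  <property><name>warc.export.cdx.path</name><value>cdx</value></property>
+  <property><name>warc.deduplicate</name><value>true</value></property>
+  <property><name>warc.export.text</name><value>false</value></property>
+  <property><name>warc.export.crawldiagnostics</name><value>true</value></property>
+  <property><name>warc.export.robotstxt</name><value>true</value></property>
+  <property><name>warc.export.cdx</name><value>true</value></property>
+
+</configuration>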