Mercurial > hg > cc > cirrus_home
changeset 190:f2bf736c2d40
Working, with known issues:
1) warc.export.cdx.path is not set
2) mime property isn't getting through
3) lastMod isn't showing up
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 24 Sep 2024 17:08:05 +0100 |
parents | 1cc12a5a070b |
children | b5904d0bdfd4 |
files | src/nutch-cc/src/test/org/commoncrawl/util/TestWarcCdxWriter.java |
diffstat | 1 files changed, 24 insertions(+), 10 deletions(-) [+] |
line wrap: on
line diff
--- a/src/nutch-cc/src/test/org/commoncrawl/util/TestWarcCdxWriter.java Tue Sep 24 12:34:51 2024 +0100
+++ b/src/nutch-cc/src/test/org/commoncrawl/util/TestWarcCdxWriter.java Tue Sep 24 17:08:05 2024 +0100
@@ -23,6 +23,7 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configuration.IntegerRanges;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapWritable;
 import org.apache.hadoop.io.RawComparator;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.Counters;
@@ -44,7 +45,10 @@
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolStatus;
 
 import java.io.ByteArrayOutputStream;
 import java.io.ByteArrayInputStream;
@@ -96,6 +100,7 @@
     }
 
     private Counters dummyCounters = new Counters();
+    private String status;
 
     public void progress() {
     }
@@ -109,7 +114,8 @@
     }
 
     public void setStatus(String arg0) throws UnsupportedOperationException {
-      throw new UnsupportedOperationException("Dummy context with no status");
+      //throw new UnsupportedOperationException("Dummy context with no status");
+      status = arg0;
     }
 
     @Override
@@ -334,10 +340,10 @@
       "Content-Type", "text/html", //
       "Accept-Ranges", "bytes", //
       "Content-Encoding", "gzip", //
-      "Vary", "Accept-Encoding", "Server",
-      "Apache/2.0.63 (Unix) PHP/4.4.7 mod_ssl/2.0.63 OpenSSL/0.9.7e mod_fastcgi/2.4.2 DAV/2 SVN/1.4.2",
-      "Last-Modified", "Thu, 15 Jan 2009 00:02:29 GMT", "ETag",
-      "\"1262d9e-3ffa-2c19af40\"", //
+      "Vary", "Accept-Encoding",
+      "Server", "Apache/2.0.63 (Unix) PHP/4.4.7 mod_ssl/2.0.63 OpenSSL/0.9.7e mod_fastcgi/2.4.2 DAV/2 SVN/1.4.2",
+      "Last-Modified", "Thu, 15 Jan 2009 00:02:29 GMT",
+      "ETag", "\"1262d9e-3ffa-2c19af40\"", //
       "Date", "Mon, 26 Jan 2009 10:00:40 GMT", //
       "Content-Length", "16378", //
       "Connection", "close" };
@@ -360,24 +366,32 @@
     TaskAttemptContext context = new DummyContext();
     config = job.getConfiguration();
     config.setBoolean("warc.export.cdx", true);
-    WarcRecordWriter wrw = new WarcRecordWriter(config, new Path("/tmp"),
+    WarcRecordWriter wrw = new WarcRecordWriter(config, new Path("/tmp/hst_test"),
         123, context);
     System.err.print("testing...");
     //DataOutputStream devnull = new DataOutputStream(OutputStream.nullOutputStream());
     //ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_FETCH_SUCCESS,
+        100);
+    MapWritable mm = new MapWritable();
+    mm.put(Nutch.WRITABLE_PROTO_STATUS_KEY,
+        new ProtocolStatus(ProtocolStatus.SUCCESS,"OK"));
+    mm.put(Nutch.PROTOCOL_STATUS_CODE_KEY,
+        new Text("200"));
+    datum.setMetaData(mm);
     Metadata meta = new Metadata();
-    meta.readFields(new DataInputStream(
-        new ByteArrayInputStream(testHeaderString1.getBytes())));
+
+    meta.add(Response.RESPONSE_HEADERS, testHeaderString1);
     wrw.write(new Text("someKey"),
         new WarcCapture(new Text("https://www.w3.org/1999/xhtml"),
-            new CrawlDatum(CrawlDatum.STATUS_FETCH_SUCCESS,
-                100),
+            datum,
             new Content("https://www.w3.org/1999/xhtml",
                 "https://www.w3.org/1999/xhtml",
                 new byte[0],
                 "text/xml",
                 meta,
                 config)));
+    wrw.close(context);
     System.err.println("done");
   }
 }