Mercurial > hg > cc > cirrus_home
changeset 193:fff248a65e39
works, although output not checked
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 25 Sep 2024 13:52:42 +0100 |
parents | 4275eb6484da |
children | 1845222b3d73 |
files | src/nutch-cc/src/test/org/commoncrawl/util/TestWarcCdxWriter.java |
diffstat | 1 files changed, 28 insertions(+), 13 deletions(-) [+] |
line wrap: on
line diff
--- a/src/nutch-cc/src/test/org/commoncrawl/util/TestWarcCdxWriter.java Wed Sep 25 13:51:15 2024 +0100 +++ b/src/nutch-cc/src/test/org/commoncrawl/util/TestWarcCdxWriter.java Wed Sep 25 13:52:42 2024 +0100 @@ -56,6 +56,7 @@ import java.io.OutputStream; import java.io.DataOutputStream; import java.io.DataInputStream; +import java.io.UnsupportedEncodingException; import java.net.URI; import java.util.ArrayList; import java.util.List; @@ -348,29 +349,42 @@ "Content-Length", "16378", // "Connection", "close" }; public final static String testHeaderString1; + public final static DataOutputStream headerData; + public final static ByteArrayOutputStream baos; static { StringBuilder headers = new StringBuilder(); + baos = new ByteArrayOutputStream(); + headerData = new DataOutputStream(baos); headers.append(statusLine1).append(WarcRecordWriter.CRLF); - for (int i = 0; i < testHeaders1.length; i += 2) { - headers.append(testHeaders1[i]).append(WarcRecordWriter.COLONSP); - headers.append(testHeaders1[i+1]).append(WarcRecordWriter.CRLF); + try { + headerData.writeInt(testHeaders1.length/2); + for (int i = 0; i < testHeaders1.length; i += 2) { + headers.append(testHeaders1[i]).append(WarcRecordWriter.COLONSP); + Text.writeString(headerData,testHeaders1[i]); + headers.append(testHeaders1[i+1]).append(WarcRecordWriter.CRLF); + headerData.writeInt(1); + Text.writeString(headerData,testHeaders1[i+1]); + } + headerData.close(); + } catch (final IOException ex) { + throw new RuntimeException("Failed to convert tHS1 in static block.", ex); } headers.append(WarcRecordWriter.CRLF); testHeaderString1 = headers.toString(); } @Test - public void testLastMod() throws IOException { + public void testLastMod() throws IOException { Configuration config = NutchConfiguration.create(); Job job = NutchJob.getInstance(config); TaskAttemptContext context = new DummyContext(); config = job.getConfiguration(); config.setBoolean("warc.export.cdx", true); + config.set("warc.export.cdx.path","/tmp/hst_test/cdx"); WarcRecordWriter wrw = new WarcRecordWriter(config, new Path("/tmp/hst_test"), 123, context); System.err.print("testing..."); //DataOutputStream devnull = new DataOutputStream(OutputStream.nullOutputStream()); - //ByteArrayOutputStream baos = new ByteArrayOutputStream(); CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_FETCH_SUCCESS, 100); MapWritable mm = new MapWritable(); @@ -380,17 +394,18 @@ new Text("200")); datum.setMetaData(mm); Metadata meta = new Metadata(); - meta.add(Response.RESPONSE_HEADERS, testHeaderString1); + meta.readFields(new DataInputStream(new ByteArrayInputStream(baos.toByteArray()))); // sledgehammer... + Content content = new Content("https://www.w3.org/1999/xhtml", + "https://www.w3.org/1999/xhtml", + "<html/>".getBytes("UTF8"), + "text/xml", + meta, + config); + content.setContentType("text/xml"); // Still not getting through to cdx wrw.write(new Text("someKey"), new WarcCapture(new Text("https://www.w3.org/1999/xhtml"), - datum, - new Content("https://www.w3.org/1999/xhtml", - "https://www.w3.org/1999/xhtml", - new byte[0], - "text/xml", - meta, - config))); + datum, content)); wrw.close(context); System.err.println("done"); }