changeset 193:fff248a65e39

works, although output not checked
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 25 Sep 2024 13:52:42 +0100
parents 4275eb6484da
children 1845222b3d73
files src/nutch-cc/src/test/org/commoncrawl/util/TestWarcCdxWriter.java
diffstat 1 files changed, 28 insertions(+), 13 deletions(-) [+]
line wrap: on
line diff
--- a/src/nutch-cc/src/test/org/commoncrawl/util/TestWarcCdxWriter.java	Wed Sep 25 13:51:15 2024 +0100
+++ b/src/nutch-cc/src/test/org/commoncrawl/util/TestWarcCdxWriter.java	Wed Sep 25 13:52:42 2024 +0100
@@ -56,6 +56,7 @@
 import java.io.OutputStream;
 import java.io.DataOutputStream;
 import java.io.DataInputStream;
+import java.io.UnsupportedEncodingException;
 import java.net.URI;
 import java.util.ArrayList;
 import java.util.List;
@@ -348,29 +349,42 @@
       "Content-Length", "16378", //
       "Connection", "close" };
   public final static String testHeaderString1;
+  public final static DataOutputStream headerData;
+  public final static ByteArrayOutputStream baos;
   static {
     StringBuilder headers = new StringBuilder();
+    baos = new ByteArrayOutputStream(); // in-memory sink: holds a binary copy of the same headers that go into testHeaderString1
+    headerData = new DataOutputStream(baos); // everything written to headerData below ends up in baos
     headers.append(statusLine1).append(WarcRecordWriter.CRLF);
-    for (int i = 0; i < testHeaders1.length; i += 2) {
-      headers.append(testHeaders1[i]).append(WarcRecordWriter.COLONSP);
-      headers.append(testHeaders1[i+1]).append(WarcRecordWriter.CRLF);
+    try { // the DataOutputStream / Text.writeString calls declare IOException, hence the wrapper
+      headerData.writeInt(testHeaders1.length/2); // pair count: testHeaders1 alternates name, value
+      for (int i = 0; i < testHeaders1.length; i += 2) { // one iteration per name/value pair; both outputs are built in step
+	headers.append(testHeaders1[i]).append(WarcRecordWriter.COLONSP);
+	Text.writeString(headerData,testHeaders1[i]); // binary form: header name
+	headers.append(testHeaders1[i+1]).append(WarcRecordWriter.CRLF);
+	headerData.writeInt(1); // exactly one value string follows for this name
+	Text.writeString(headerData,testHeaders1[i+1]); // binary form: header value
+      }
+      headerData.close(); // NOTE(review): baos is fed to meta.readFields() in testLastMod; presumably this layout (count, then name / value-count / value) is what readFields expects -- confirm against Metadata.write
+    } catch (final IOException ex) { // a static initializer cannot throw a checked exception
+      throw new RuntimeException("Failed to convert tHS1 in static block.", ex);
     }
     headers.append(WarcRecordWriter.CRLF);
     testHeaderString1 = headers.toString();
   }
 
   @Test
-  public void testLastMod() throws IOException {
+    public void testLastMod() throws IOException {
     Configuration config = NutchConfiguration.create();
     Job job = NutchJob.getInstance(config);
     TaskAttemptContext context = new DummyContext();
     config = job.getConfiguration();
     config.setBoolean("warc.export.cdx", true);
+    config.set("warc.export.cdx.path","/tmp/hst_test/cdx");
     WarcRecordWriter wrw = new WarcRecordWriter(config, new Path("/tmp/hst_test"),
 						123, context);
     System.err.print("testing...");
     //DataOutputStream devnull = new DataOutputStream(OutputStream.nullOutputStream());
-    //ByteArrayOutputStream baos = new ByteArrayOutputStream();
     CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_FETCH_SUCCESS,
 				      100);
     MapWritable mm = new MapWritable();
@@ -380,17 +394,18 @@
 	   new Text("200"));
     datum.setMetaData(mm);
     Metadata meta = new Metadata();
-
     meta.add(Response.RESPONSE_HEADERS, testHeaderString1);
+    meta.readFields(new DataInputStream(new ByteArrayInputStream(baos.toByteArray()))); // sledgehammer...
+    Content content = new Content("https://www.w3.org/1999/xhtml",
+				  "https://www.w3.org/1999/xhtml",
+				  "<html/>".getBytes("UTF8"),
+				  "text/xml",
+				  meta,
+				  config);
+    content.setContentType("text/xml"); // Still not getting through to cdx
     wrw.write(new Text("someKey"),
 	      new WarcCapture(new Text("https://www.w3.org/1999/xhtml"),
-			      datum,
-			      new Content("https://www.w3.org/1999/xhtml",
-					  "https://www.w3.org/1999/xhtml",
-					  new byte[0],
-					  "text/xml",
-					  meta,
-					  config)));
+			      datum, content));
     wrw.close(context);
     System.err.println("done");
   }