changeset 190:f2bf736c2d40

Working, with known issues: 1) warc.export.cdx.path is not set; 2) the mime property isn't getting through; 3) lastMod isn't showing up.
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 24 Sep 2024 17:08:05 +0100
parents 1cc12a5a070b
children b5904d0bdfd4
files src/nutch-cc/src/test/org/commoncrawl/util/TestWarcCdxWriter.java
diffstat 1 files changed, 24 insertions(+), 10 deletions(-) [+]
line wrap: on
line diff
--- a/src/nutch-cc/src/test/org/commoncrawl/util/TestWarcCdxWriter.java	Tue Sep 24 12:34:51 2024 +0100
+++ b/src/nutch-cc/src/test/org/commoncrawl/util/TestWarcCdxWriter.java	Tue Sep 24 17:08:05 2024 +0100
@@ -23,6 +23,7 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configuration.IntegerRanges;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapWritable;
 import org.apache.hadoop.io.RawComparator;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.Counters;
@@ -44,7 +45,10 @@
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolStatus;
 
 import java.io.ByteArrayOutputStream;
 import java.io.ByteArrayInputStream;
@@ -96,6 +100,7 @@
     }
 
     private Counters dummyCounters = new Counters();
+    private String status;
 
     public void progress() {
     }
@@ -109,7 +114,8 @@
     }
 
     public void setStatus(String arg0) throws UnsupportedOperationException {
-      throw new UnsupportedOperationException("Dummy context with no status");
+      //throw new UnsupportedOperationException("Dummy context with no status");
+      status = arg0;
     }
 
     @Override
@@ -334,10 +340,10 @@
       "Content-Type", "text/html", //
       "Accept-Ranges", "bytes", //
       "Content-Encoding", "gzip", //
-      "Vary", "Accept-Encoding", "Server",
-      "Apache/2.0.63 (Unix) PHP/4.4.7 mod_ssl/2.0.63 OpenSSL/0.9.7e mod_fastcgi/2.4.2 DAV/2 SVN/1.4.2",
-      "Last-Modified", "Thu, 15 Jan 2009 00:02:29 GMT", "ETag",
-      "\"1262d9e-3ffa-2c19af40\"", //
+      "Vary", "Accept-Encoding",
+      "Server", "Apache/2.0.63 (Unix) PHP/4.4.7 mod_ssl/2.0.63 OpenSSL/0.9.7e mod_fastcgi/2.4.2 DAV/2 SVN/1.4.2",
+      "Last-Modified", "Thu, 15 Jan 2009 00:02:29 GMT",
+      "ETag", "\"1262d9e-3ffa-2c19af40\"", //
       "Date", "Mon, 26 Jan 2009 10:00:40 GMT", //
       "Content-Length", "16378", //
       "Connection", "close" };
@@ -360,24 +366,32 @@
     TaskAttemptContext context = new DummyContext();
     config = job.getConfiguration();
     config.setBoolean("warc.export.cdx", true);
-    WarcRecordWriter wrw = new WarcRecordWriter(config, new Path("/tmp"),
+    WarcRecordWriter wrw = new WarcRecordWriter(config, new Path("/tmp/hst_test"),
 						123, context);
     System.err.print("testing...");
     //DataOutputStream devnull = new DataOutputStream(OutputStream.nullOutputStream());
     //ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_FETCH_SUCCESS,
+				      100);
+    MapWritable mm = new MapWritable();
+    mm.put(Nutch.WRITABLE_PROTO_STATUS_KEY,
+	   new ProtocolStatus(ProtocolStatus.SUCCESS,"OK"));
+    mm.put(Nutch.PROTOCOL_STATUS_CODE_KEY,
+	   new Text("200"));
+    datum.setMetaData(mm);
     Metadata meta = new Metadata();
-    meta.readFields(new DataInputStream(
-		    new ByteArrayInputStream(testHeaderString1.getBytes())));
+
+    meta.add(Response.RESPONSE_HEADERS, testHeaderString1);
     wrw.write(new Text("someKey"),
 	      new WarcCapture(new Text("https://www.w3.org/1999/xhtml"),
-			      new CrawlDatum(CrawlDatum.STATUS_FETCH_SUCCESS,
-					     100),
+			      datum,
 			      new Content("https://www.w3.org/1999/xhtml",
 					  "https://www.w3.org/1999/xhtml",
 					  new byte[0],
 					  "text/xml",
 					  meta,
 					  config)));
+    wrw.close(context);
     System.err.println("done");
   }
 }