changeset 93:4d870a7ec871

support a command to receive each result, remove use of X-Crawler-Content-Length
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 15 Apr 2021 10:59:25 +0000
parents d56465d5c51f
children d60073ec798a
files bin/ix.sh
diffstat 1 files changed, 11 insertions(+), 12 deletions(-) [+]
line wrap: on
line diff
--- a/bin/ix.sh	Wed Apr 14 20:15:32 2021 +0000
+++ b/bin/ix.sh	Thu Apr 15 10:59:25 2021 +0000
@@ -1,13 +1,14 @@
 #!/usr/bin/bash
 # Extract records from warc files given length, offset and file triples
 #  from stdin or as command line args
-# Usage [-d] [-w] [-h] [-b] [-x] [length offset path]
+# Usage [-d] [-w] [-h] [-b] [-e cmd] [ -x | length offset path ]
 #  -d Debug output
 #  -w WARC headers
 #  -h HTTP headers
 #  -b HTTP body
 #   No switch defaults to whole record
 #  -x take lines of from a cdx index file as input, extract triples
+#  -e cmd  pipe each result through cmd (defaults to cat)
 if [ "$1" = "-d" ]
 then
  d=1
@@ -31,6 +32,13 @@
  p=1
  b=1
 fi
+e="cat"
+if [ "$1" = "-e" ]
+then
+ shift
+ e="$1"
+ shift
+fi
 if [ "$1" = "-x" ]
 then
     # get triples from index lines
@@ -99,9 +107,6 @@
 	     Content-Length:\ *) bl=${L##*: }
 		                 bl=${bl%%*([
[:space:]])}
 				 ;;
-	     X-Crawler-Content-Length:\ *) xl=${L##*: } # introduced btw 2015&2018???
-			      xl=${xl%%*([
[:space:]])}
-			      ;;
 	     
) s="b" ; n=0
 		 if [ -z "$b" ]
 		 then
@@ -111,15 +116,9 @@
 		 then
 		     echo
 		 fi	 
-		 if [ "$xl" ]; then
-		     bl=$xl
-		     xx=x
-		 else
-		     unset xx
-		 fi
 		 if [ "$bl" ]; then
 		     if [ $bl -ne $wl ]; then
-			 echo length mismatch$xx: $f $o $l here: $wl given: $bl trunc: $tr 1>&2
+			 echo length mismatch: $l $o $f here: $wl given: $bl trunc: $tr 1>&2
 		     fi
 		 fi
                  continue ;;
@@ -145,5 +144,5 @@
  # No flags,the whole thing
  cat
 fi
-}
+} | $e
 done