changeset 92:d56465d5c51f

accepts index lines, less line-at-a-time
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 14 Apr 2021 20:15:32 +0000
parents 82c94684f799
children 4d870a7ec871
files bin/ix.sh
diffstat 1 files changed, 79 insertions(+), 26 deletions(-) [+]
line wrap: on
line diff
--- a/bin/ix.sh	Wed Apr 14 10:08:41 2021 +0000
+++ b/bin/ix.sh	Wed Apr 14 20:15:32 2021 +0000
@@ -1,11 +1,18 @@
 #!/usr/bin/bash
-# Extract records from warc files given filename, length and offset triples
+# Extract records from warc files given length, offset and file triples
 #  from stdin or as command line args
-# Usage [-w] [-h] [-b] [path length offset]
+# Usage [-d] [-w] [-h] [-b] [-x] [length offset path]
+#  -d Debug output
 #  -w WARC headers
 #  -h HTTP headers
 #  -b HTTP body
-# No switch defaults to whole record
+#   No switch defaults to whole record
+#  -x take lines from a cdx index file as input, extract triples
+if [ "$1" = "-d" ]
+then
+ d=1
+ shift
+fi
 if [ "$1" = "-w" ]
 then
  shift
@@ -24,19 +31,33 @@
  p=1
  b=1
 fi
-if [ -n "$1" ]
+if [ "$1" = "-x" ]
+then
+    # get triples from index lines
+    egrep -ao 'length": "[0-9]*", "offset": "[0-9]*".*\.gz'| \
+      sed 's/[a-z]*": "//g;s/", "/\t/g;s/\(crawl-data\|segments\|warc\)\///g' |\
+ if [ "$d" ] ; then tee /tmp/ix_triples.tsv ; else cat ; fi
+elif [ "$1" ]
 then
     printf "%s\t%s\t%s\n" "$1" "$2" "$3"
 else
     cat
 fi | \
-while { IFS=$'\t' read f l o; }
+while { IFS=$'\t' read l o f; }
 do
-  dd if="$f" of=/dev/stdout skip=$o count=$l \
-      iflag=skip_bytes,count_bytes 2>/dev/null
-done | unpigz -dp 1 -c | \
+  if [ -z "$d" ]
+  then
+      dd if="$f" of=/dev/stdout skip=$o count=$l \
+	  iflag=skip_bytes,count_bytes 2>/dev/null
+  else
+      echo dd if="$f" of=/dev/stdout skip=$o count=$l \
+	  iflag=skip_bytes,count_bytes > /tmp/ix_dd_log.txt
+      dd if="$f" of=/dev/stdout skip=$o count=$l \
+	  iflag=skip_bytes,count_bytes 2>> /tmp/ix_dd_log.txt
+  fi | \
+unpigz -dp 1 -c | tee /tmp/data | \
 { s="w"
-if [ -n "$p" ]
+if [ "$p" ]
 then
    shopt -qs extglob # for %%*(...)
    while read -r L
@@ -44,16 +65,27 @@
      if [ "$s" = "w" ]
      then
 	 # WARC header
-	 if [ "$L" = "
" ]
-	 then
-	     s="h"
-	     if [[ -n "$w" && ( -n "$h" || -n "$b" ) ]]
-	     then
-		 echo
-	     fi	 
-	     continue
-	 fi
-	 if [ -n "$w" ]
+	 case "$L" in
+	   Content-Length:\ *) wl=${L##*: }
+			       wl=${wl%%*([
[:space:]])}
+			       ;;
+	    WARC-Truncated:\ *) # echo $n $L
+		              tr=${L##*: }
+		              tr=${tr%%*([
[:space:]])}
+			      tr=${tr:-EMPTY}
+		                    ;;
+	   
) s="h"
+	       if [ -z "$h$b" ]
+	       then
+		   exit 0
+	       fi
+	       if [ "$w" ]
+	       then
+		   echo
+	       fi	 
+	       continue;;
+	 esac
+	 if [ "$w" ]
 	 then
 	     printf "%s\n" "${L%%
}"
 	 fi
@@ -62,28 +94,48 @@
      if [ "$s" = "h" ]
      then
 	 # HTTP header
+	 wl=$((wl - ( ${#L} + 1 )))
 	 case "$L" in
+	     Content-Length:\ *) bl=${L##*: }
+		                 bl=${bl%%*([
[:space:]])}
+				 ;;
+	     X-Crawler-Content-Length:\ *) xl=${L##*: } # introduced btw 2015&2018???
+			      xl=${xl%%*([
[:space:]])}
+			      ;;
 	     
) s="b" ; n=0
-		 if [ -n "$h" -a -n "$b" ]
+		 if [ -z "$b" ]
+		 then
+		     exit 0
+		 fi
+		 if [ "$h" ]
 		 then
 		     echo
 		 fi	 
+		 if [ "$xl" ]; then
+		     bl=$xl
+		     xx=x
+		 else
+		     unset xx
+		 fi
+		 if [ "$bl" ]; then
+		     if [ $bl -ne $wl ]; then
+			 echo length mismatch$xx: $f $o $l here: $wl given: $bl trunc: $tr 1>&2
+		     fi
+		 fi
                  continue ;;
-	     Content-Length:\ *) bl=${L##*: }
-		                 bl=${bl%%*([
[:space:]])}
-				 ;;
 	 esac
-	 if [ -n "$h" ]
+	 if [ "$h" ]
 	 then
 	     printf "%s\n" "${L%%
}"
 	 fi
 	 continue
      else
 	 # HTTP body
-	 if [ -n "$b" ]
+	 if [ "$b" ]
 	 then
 	     #printf "%s\n" "$bl" 1>&2
-	     head -c "$bl"
+	     head -c "${bl-$wl}"
+	     exit 0
 	 else
 	     break
 	 fi
@@ -94,3 +146,4 @@
  cat
 fi
 }
+done