changeset 89:90f8f28b2e51

working on flags
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 13 Apr 2021 17:52:31 +0000
parents 464d2dfb99c9
children 5384208a0834
files bin/ix.sh
diffstat 1 files changed, 73 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/bin/ix.sh	Tue Apr 13 17:02:09 2021 +0000
+++ b/bin/ix.sh	Tue Apr 13 17:52:31 2021 +0000
@@ -1,6 +1,29 @@
 #!/usr/bin/bash
 # Extract records from warc files given filename, length and offset triples
 #  from stdin or as command line args
+# Usage: ix.sh [-w] [-h] [-b] [path length offset]
+#  -w WARC headers
+#  -h HTTP headers
+#  -b HTTP body
+# No switch defaults to whole record
+if [ "$1" = "-w" ]
+then
+ shift
+ p=1
+ w=1
+fi
+if [ "$1" = "-h" ]
+then
+ shift
+ p=1
+ h=1
+fi
+if [ "$1" = "-b" ]
+then
+ shift
+ p=1
+ b=1
+fi
 if [ -n "$1" ]
 then
     printf "%s\t%s\t%s\n" "$1" "$2" "$3"
@@ -9,5 +32,53 @@
 fi | \
 while { IFS=$'\t' read f l o; }
 do
-  dd if="$f" of=/dev/stdout skip=$o count=$l iflag=skip_bytes,count_bytes
-done | unpigz -dp 1 -c
+  dd if="$f" of=/dev/stdout skip=$o count=$l \
+      iflag=skip_bytes,count_bytes 2>/dev/null
+done | unpigz -dp 1 -c | \
+s="w"
+if [ -n "$p" ]
+then
+   while read -r L
+   do
+     if [ "$s" = "w" ]
+     then
+	 # WARC header
+	 if [ "$L" = $'\r' ]
+	 then
+	     s="h"
+	     continue
+	 fi
+	 if [ -n "$w" ]
+	 then
+	     printf "%s\n" "${L%%$'\r'}"
+	 fi
+	 continue
+     fi
+     if [ "$s" = "b" ]
+     then
+	 # HTTP header
+	 case "$L" in
+	     $'\r') s="b" ; n=0 ; continue ;;
+	     Content-Length:\ *) bl=${L##*: }
+		                 bl=${bl%%*([$'\r'[:space:]])}
+				 ;;
+	 esac
+	 if [ -n "$w" ]
+	 then
+	     printf "%s\n" "${L%%$'\r'}"
+	 fi
+	 continue
+     else
+	 # HTTP body
+	 if [ -n "$b" ]
+	 then
+	     printf "%s\n" "$bl" 1>&2
+	     head -c "$bl"
+	 else
+	     break
+	 fi
+     fi
+   done
+else
+    cat
+fi # No flags: pass the whole record through