view bin/ix.sh @ 91:82c94684f799

working with one input
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 14 Apr 2021 10:08:41 +0000
parents 5384208a0834
children d56465d5c51f
line wrap: on
line source

#!/usr/bin/bash
# Extract records from WARC files given (filename, length, offset) triples,
#  read one per line, tab-separated, from stdin, or given as command line args
# Usage: ix.sh [-w] [-h] [-b] [path length offset]
#  -w WARC headers
#  -h HTTP headers
#  -b HTTP body
# No switch defaults to the whole record
# Collect the leading part-selection switches (-w, -h, -b), in any order.
# Afterwards:
#  w, h, b  are non-empty iff the corresponding switch was given
#  p        is non-empty iff at least one switch was given
#  $1...    are whatever followed the switches (path length offset, or nothing)
# The variables are reset first so that same-named variables inherited from
#  the environment cannot be mistaken for switches.
p=
w=
h=
b=
while :
do
  case "$1" in
    -w) w=1 ;;
    -h) h=1 ;;
    -b) b=1 ;;
    *) break ;; # first non-switch (or no argument at all) ends the scan
  esac
  p=1
  shift
done
# The rest of the script is one three-stage pipeline:
#  1. produce tab-separated (path, length, offset) triples, one per line
#  2. copy the bytes each triple designates and decompress them
#  3. print the requested part(s) of the first record, or, if no switch
#     was given, everything
# Reads: p, w, h, b (set by the switch parsing above) and $1 $2 $3
if [ -n "$1" ]
then
    # Triple given on the command line
    printf "%s\t%s\t%s\n" "$1" "$2" "$3"
else
    # Triples come from stdin
    cat
fi |
while IFS=$'\t' read -r f l o
do
  # skip= and count= are byte counts because of the iflag settings.
  # status=none silences dd's transfer statistics while still letting
  #  real errors (e.g. a missing file) reach stderr.
  dd if="$f" skip="$o" count="$l" \
      iflag=skip_bytes,count_bytes status=none
done |
unpigz -dp 1 -c |
{
if [ -z "$p" ]
then
  # No flags, the whole thing
  cat
else
  cr=$'\r'  # header lines are CRLF-terminated; read leaves the CR on
  re='^content-length:[[:space:]]*([0-9]+)[[:space:]]*$'
  s="w"     # w: in the WARC header, h: in the HTTP header
  bl=""     # HTTP Content-Length, empty until that header has been seen
  shopt -qs nocasematch # HTTP header names are case-insensitive
  while IFS= read -r L
  do
    L=${L%"$cr"}
    if [ "$s" = "w" ]
    then
      # WARC header
      if [ -z "$L" ]
      then
        # Blank line: end of WARC header
        if [ -z "$h$b" ]
        then
          # Only -w was asked for, nothing further to print
          break
        fi
        s="h"
        if [ -n "$w" ]
        then
          # Separate the WARC header from what follows
          echo
        fi
      elif [ -n "$w" ]
      then
        printf "%s\n" "$L"
      fi
      continue
    fi
    # HTTP header
    if [ -z "$L" ]
    then
      # Blank line: end of HTTP header, the body starts with the very next
      #  byte, so it has to be copied here, before read consumes any of it
      if [ -n "$b" ]
      then
        if [ -n "$h" ]
        then
          # Separate the HTTP header from the body
          echo
        fi
        if [ -n "$bl" ]
        then
          head -c "$bl"
        else
          # No usable Content-Length: pass on everything that is left
          cat
        fi
      fi
      # Only the first record is split into parts
      break
    fi
    if [[ "$L" =~ $re ]]
    then
      bl=${BASH_REMATCH[1]}
    fi
    if [ -n "$h" ]
    then
      printf "%s\n" "$L"
    fi
  done
fi
}