view bin/ix.sh @ 89:90f8f28b2e51

working on flags
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 13 Apr 2021 17:52:31 +0000
parents 464d2dfb99c9
children 5384208a0834
line wrap: on
line source

#!/usr/bin/bash
# Extract records from WARC files given (filename, length, offset) triples,
#  read as tab-separated lines from stdin or given as command line args
# Usage: ix.sh [-w] [-h] [-b] [path length offset]
#  -w WARC headers
#  -h HTTP headers
#  -b HTTP body
# No switch defaults to whole record
# parse_flags: recognise the leading -w / -h / -b switches, in any order
#  and any combination, stopping at the first argument that is not one
#  of them (note that -h here selects HTTP headers, it is not "help")
# Arguments: the script's command line
# Sets:      w, h, b  to 1 if the corresponding switch was given, else empty
#            p        to 1 if any switch was given, else empty
#            nflags   to the number of leading arguments consumed
# All five are reset first, so values inherited from the environment
#  cannot turn filtering on by accident
parse_flags() {
  p="" w="" h="" b="" nflags=0
  while [ $# -gt 0 ]
  do
    case "$1" in
      -w) w=1 ;;
      -h) h=1 ;;
      -b) b=1 ;;
      *) break ;;
    esac
    p=1
    nflags=$((nflags + 1))
    shift
  done
}
parse_flags "$@"
# A shift inside the function only affects its own arguments, so drop the
#  switches from the script's positional parameters here
shift "$nflags"
# extract_parts: read a stream of uncompressed WARC records on stdin and
#  copy to stdout only the parts selected by the switches
# Globals (read): w  non-empty => print WARC header lines
#                 h  non-empty => print HTTP header lines
#                 b  non-empty => print the HTTP body
# Header lines are printed with their trailing carriage return removed and
#  without the blank line that ends each header; the body is copied as is
# Records are framed using the Content-Length field of the WARC header,
#  which gives the size in bytes of the block that follows it (here the HTTP
#  header plus body): the body is whatever is left of the block once the
#  HTTP header has been read.  This works for every record in the stream
#  and does not depend on the HTTP header having a Content-Length of its own
# Returns: 0 at end of input, 1 if a record has no usable Content-Length
extract_parts() {
  local LC_ALL=C   # so that ${#line} counts bytes, not characters
  local cr=$'\r' state="w" line len="" started=""
  while IFS= read -r line
  do
    if [ "$state" = "w" ]
    then
      # WARC header
      if [ "$line" = "$cr" ] || [ -z "$line" ]
      then
        # A blank line before any header line is part of the separator
        #  left over from the previous record
        if [ -z "$started" ]
        then
          continue
        fi
        # Otherwise it ends the WARC header
        case "$len" in
          ''|*[!0-9]*)
            printf '%s\n' "${0##*/}: WARC record without a valid Content-Length" >&2
            return 1
            ;;
        esac
        len=$((10#$len))   # force base 10 even if there are leading zeros
        state="h"
      else
        started=1
        case "$line" in
          [Cc]ontent-[Ll]ength:*)
            len=${line#*:}
            len=${len//[[:space:]]/}   # drops the blank and the trailing CR
            ;;
        esac
        if [ -n "$w" ]
        then
          printf '%s\n' "${line%"$cr"}"
        fi
        continue
      fi
    else
      # HTTP header: this line and its newline have been taken from the block
      len=$((len - ${#line} - 1))
      if [ "$line" = "$cr" ] || [ -z "$line" ]
      then
        # End of the HTTP header, the rest of the block is the body.
        # read takes nothing beyond the end of the line, so head starts
        #  at the first byte of the body
        if [ "$len" -gt 0 ]
        then
          if [ -n "$b" ]
          then
            head -c "$len"
          else
            head -c "$len" >/dev/null   # skip it to reach the next record
          fi
        fi
        len=0
      elif [ -n "$h" ]
      then
        printf '%s\n' "${line%"$cr"}"
      fi
    fi
    if [ "$len" -le 0 ]
    then
      # Block used up (or empty): go back to looking for a WARC header
      state="w"
      started=""
      len=""
    fi
  done
}

# One tab-separated triple per line, from the command line or from stdin
if [ -n "$1" ]
then
  printf "%s\t%s\t%s\n" "$1" "$2" "$3"
else
  cat
fi |
  while IFS=$'\t' read -r f l o
  do
    # Copy the $l bytes at byte offset $o of $f, i.e. one compressed record;
    #  status=none silences the transfer statistics but not real errors
    dd if="$f" skip="$o" count="$l" iflag=skip_bytes,count_bytes status=none
  done |
  unpigz -dp 1 -c |
  if [ -n "$p" ]
  then
    extract_parts
  else
    cat # No flags, the whole thing
  fi