Mercurial > hg > cc > cirrus_home
changeset 89:90f8f28b2e51
working on flags
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 13 Apr 2021 17:52:31 +0000 |
parents | 464d2dfb99c9 |
children | 5384208a0834 |
files | bin/ix.sh |
diffstat | 1 files changed, 73 insertions(+), 2 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/bin/ix.sh Tue Apr 13 17:02:09 2021 +0000
+++ b/bin/ix.sh Tue Apr 13 17:52:31 2021 +0000
@@ -1,6 +1,29 @@
 #!/usr/bin/bash
 # Extract records from warc files given filename, length and offset triples
 # from stdin or as command line args
+# Usage [-w] [-h] [-b] [path length offset]
+# -w WARC headers
+# -h HTTP headers
+# -b HTTP body
+# No switch defaults to whole record
+if [ "$1" = "-w" ]
+then
+  shift
+  p=1
+  w=1
+fi
+if [ "$1" = "-h" ]
+then
+  shift
+  p=1
+  h=1
+fi
+if [ "$1" = "-b" ]
+then
+  shift
+  p=1
+  b=1
+fi
 if [ -n "$1" ]
 then
   printf "%s\t%s\t%s\n" "$1" "$2" "$3"
@@ -9,5 +32,53 @@
 fi | \
 while { IFS=$'\t' read f l o; }
 do
-  dd if="$f" of=/dev/stdout skip=$o count=$l iflag=skip_bytes,count_bytes
-done | unpigz -dp 1 -c
+  dd if="$f" of=/dev/stdout skip=$o count=$l \
+    iflag=skip_bytes,count_bytes 2>/dev/null
+done | unpigz -dp 1 -c | \
+s="w"
+if [ -n "$p" ]
+then
+  while read -r L
+  do
+    if [ "$s" = "w" ]
+    then
+      # WARC header
+      if [ "$L" = " " ]
+      then
+        s="h"
+        continue
+      fi
+      if [ -n "$w" ]
+      then
+        printf "%s\n" "${L%% }"
+      fi
+      continue
+    fi
+    if [ "$s" = "b" ]
+    then
+      # HTTP header
+      case "$L" in
+        ) s="b" ; n=0 ; continue ;;
+        Content-Length:\ *) bl=${L##*: }
+          bl=${bl%%*([ [:space:]])}
+          ;;
+      esac
+      if [ -n "$w" ]
+      then
+        printf "%s\n" "${L%% }"
+      fi
+      continue
+    else
+      # HTTP body
+      if [ -n "$b" ]
+      then
+        printf "%s\n" "$bl" 1>&2
+        head -c "$bl"
+      else
+        break
+      fi
+    fi
+  done
+else
+  cat
+fi # No flags,the whole thing
```