Mercurial > hg > cc > cirrus_home
view bin/ix.sh @ 91:82c94684f799
working with one input
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 14 Apr 2021 10:08:41 +0000 |
parents | 5384208a0834 |
children | d56465d5c51f |
line wrap: on
line source
#!/usr/bin/bash
# Extract records from warc files given filename, length and offset triples
#  from stdin or as command line args
# Usage [-w] [-h] [-b] [path length offset]
#  -w WARC headers
#  -h HTTP headers
#  -b HTTP body
# No switch defaults to whole record
#
# Input triples are tab-separated: path, length in bytes, offset in bytes.
# Each triple selects one gzip member, which is cut out with dd and
# decompressed with unpigz.
#
# Header lines are expected to end in CR LF; the CR is stripped from every
# header line that is printed.  When more than one part is requested, a blank
# line is printed between consecutive parts.
#
# NOTE(review): when any switch is given, only the first record of the
# decompressed stream is split into parts; the filter stops after it.
# NOTE(review): the body length is taken from the HTTP Content-Length header
# (matched case-sensitively) -- confirm that this is reliable for the data.

# Start from a clean slate so exported variables of the same name cannot
# switch parts on by accident.
p=''
w=''
h=''
b=''

# Accept the switches in any order; p records that at least one was given.
while [[ "$1" == -[whb] ]]; do
  case "$1" in
    -w) w=1 ;;
    -h) h=1 ;;
    -b) b=1 ;;
  esac
  p=1
  shift
done

# Stage 1: produce the triples, from the command line if given, else stdin.
if [[ -n "$1" ]]; then
  printf '%s\t%s\t%s\n' "$1" "$2" "$3"
else
  cat
fi |
  # Stage 2: copy the selected byte range of each file to stdout.
  while IFS=$'\t' read -r f l o; do
    # skip/count are in bytes because of the iflag settings.  stderr is
    # discarded to hide dd's transfer statistics (this hides errors too).
    dd if="$f" skip="$o" count="$l" \
      iflag=skip_bytes,count_bytes 2>/dev/null
  done |
  # Stage 3: decompress.
  unpigz -dp 1 -c |
  # Stage 4: select the requested parts of the record.
  {
    if [[ -z "$p" ]]; then
      # No flags, the whole thing
      cat
    else
      cr=$'\r' # header lines end in CR LF; read strips only the LF
      s="w"    # parser state: w = WARC headers, h = HTTP headers
      bl=''    # HTTP body length in bytes, from Content-Length
      # IFS= keeps leading whitespace of header lines intact.
      while IFS= read -r L; do
        if [[ "$s" == "w" ]]; then
          # WARC header
          if [[ "$L" == "$cr" ]]; then
            # Blank line: end of the WARC headers.
            s="h"
            if [[ -n "$w" && (-n "$h" || -n "$b") ]]; then
              echo
            fi
          elif [[ -n "$w" ]]; then
            printf '%s\n' "${L%"$cr"}"
          fi
          continue
        fi

        # HTTP header
        case "$L" in
          "$cr")
            # Blank line: end of the HTTP headers.  The body starts at the
            # very next byte, so it must be copied here: another read would
            # swallow the first line of the body.
            if [[ -n "$b" ]]; then
              if [[ -n "$h" ]]; then
                echo
              fi
              if [[ -n "$bl" ]]; then
                head -c "$bl"
              else
                # No usable Content-Length: emit the rest of the stream
                # rather than calling head with an empty count.
                cat
              fi
            fi
            break
            ;;
          Content-Length:\ *)
            bl=${L#*:}
            bl=${bl//[[:space:]]/} # drops the leading blank and trailing CR
            # Ignore anything that is not a plain byte count.
            [[ "$bl" =~ ^[0-9]+$ ]] || bl=''
            ;;
        esac
        if [[ -n "$h" ]]; then
          printf '%s\n' "${L%"$cr"}"
        fi
      done
    fi
  }