view bin/warc.sh @ 0:fdd3f8a16fd4 default tip

shared scripts on valhalla cluster
author Henry Thompson <ht@markup.co.uk>
date Sat, 14 Mar 2020 11:00:58 +0000
parents
children
line wrap: on
line source

#!/bin/bash
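# Try to fillet warc payloads with just a shell script.
# WARC data is read from stdin; each response record is written to
# <outfilePrefix>_<N>.hdr (HTTP headers) and <outfilePrefix>_<N>[.pdf|.html] (body).
# Usage: warc.sh outfilePrefix [-n startnum] [contentType]
#   -n startnum   responses are numbered from startnum+1 (default: from 1)
#   contentType   if given, only responses with this Content-Type are saved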

# Byte-oriented locale, so that string lengths are counted in bytes.
LANG=C LC_ALL=C
# Split on newline only.
IFS=$'\n'
# nocasematch: `case` patterns compare case-insensitively.
# extglob: enables the *(...) patterns used to trim trailing whitespace.
shopt -qs nocasematch extglob

# Copy the next $1 bytes of stdin to stdout.
# Arguments:
#   $1 - number of bytes to copy
# Outputs:
#   the bytes on stdout; a diagnostic on stderr if head exits non-zero
handle_body () {
    ## read -r -N $l L doesn't work for binary bodies that contain a \000 because of Bash 'feature'
    local nbytes="$1" rc
    head -c "$nbytes"
    rc=$?
    # NOTE(review): head normally exits 0 even when fewer than $1 bytes are
    # available, so this probably reports only outright head failures -- confirm.
    if [ "$rc" -ne 0 ]; then
        echo "truncated \$? = $rc" 1>&2
    fi
}

# Split one HTTP response (the payload of a WARC response record), read from
# stdin, into a header file and a body file.
#
# Header lines are consumed up to the CR-only line that ends them; the
# remaining bytes of the payload are then saved, or discarded if filtered out.
#
# Globals:
#   pprefix (read)     prefix for output file names
#   tot     (read)     used only to report offsets in diagnostics
#   cec     (written)  associative array: Content-Encoding value -> count
# Arguments:
#   $1 - record number, becomes part of the output file names
#   $2 - payload length in bytes (HTTP headers + body)
#   $3 - if non-empty, only payloads whose Content-Type equals this are saved
#   $4 - WARC-Truncated value ('' if none)
#   $5 - WARC-Target-URI value
# Outputs:
#   ${pprefix}_$1.hdr          header lines with CRs removed, followed by
#                              X-HST-Truncated (if $4) and X-HST-Target-URI
#   ${pprefix}_$1[.pdf|.html]  the body; suffix chosen from Content-Type
#   progress and warnings on stderr
# Relies on `shopt -s extglob nocasematch` being in effect when called.
handle_payload () {
    local n="$1" l="$2" ol="$2" f="$3" tr="$4" tu="$5"
    local t=' Unknown' z='' bl='' xl='' hdr='' xx='' s='' key='' L
    # Spelled as an escape: a raw CR byte in the source is invisible and is
    # easily lost or turned into a line break by editors and transfers.
    local cr=$'\r'

    while read -r L; do
        # Bytes of payload still unread (+1 for the LF that read swallowed).
        (( l -= ${#L} + 1 ))

        if [[ $L == "$cr" ]]; then
            # End of headers: whatever remains ($l bytes) is the body.
            if (( l <= 0 )); then
                echo "empty body, skipping" 1>&2
                return
            fi
            if [[ "$f" && "$f" != "$t" ]]; then
                echo "$t != $f, skipping starting at $((tot + (ol - l)))" 1>&2
                head -c "$l" >/dev/null
                return
            fi
            # Prefer the crawler-recorded length when there is one.
            if [ "$xl" ]; then
                bl=$xl
                xx=x
            fi
            case "$t" in
                application/pdf) s=.pdf ;;
                text/html) s=.html ;;
                *) s='' ;;
            esac
            # Lengths are only comparable when no content encoding was seen.
            if [ "$bl" ] && [ -z "$z" ] && [ "$bl" -ne "$l" ]; then
                echo "length mismatch$xx: $n here: $l given: $bl trunc:${tr:+ $tr}" 1>&2
            fi
            echo "reading $l bytes into ${pprefix}_$n$s as $t starting at $((tot + (ol - l)))" 1>&2
            {
                # hdr holds every header line, newline-terminated; the
                # terminating blank line was never added to it.
                printf '%s' "$hdr"
                if [ "$tr" ]; then
                    echo "X-HST-Truncated: $tr"
                fi
                echo "X-HST-Target-URI: $tu"
            } > "${pprefix}_$n.hdr"
            handle_body "$l" > "${pprefix}_$n$s"
            return
        fi

        hdr+="${L%"$cr"}"$'\n'

        # [[:space:]] includes CR, so the trims below also remove the line's CR.
        case "$L" in
            Content-Type:\ *)
                t=${L##*: }
                t=${t%%;*}
                t=${t%%*([[:space:]])}
                ;;
            Content-Length:\ *)
                bl=${L##*: }
                bl=${bl%%*([[:space:]])}
                ;;
            X-Crawler-Content-Length:\ *) # introduced btw 2015&2018???
                xl=${L##*: }
                xl=${xl%%*([[:space:]])}
                ;;
            X-Crawler-Content-Encoding:\ *|Content-Encoding:\ *) # one or the other, change btw 2015&2018???
                z=${L##*: }
                key=${z%%*([[:space:]])}
                # An empty string is not a valid associative-array subscript.
                if [ "$key" ]; then
                    cec[$key]=$(( ${cec[$key]:-0} + 1 ))
                fi
                ;;
        esac
    done
}

# Read the remaining WARC header lines of a response record from stdin and
# hand the record's payload to handle_payload.
#
# Globals:
#   tot (read/written)  running count of input bytes consumed; advanced by
#                       each header line read here and by the payload length
# Arguments:
#   $1 - record number, passed through to handle_payload
#   $2 - optional Content-Type filter, passed through to handle_payload
# Outputs:
#   a warning on stderr if the record has no usable Content-Length
# Relies on `shopt -s extglob nocasematch` being in effect when called.
handle_resp () {
    local n="$1" f="$2"
    # Reset for every record so that a missing header cannot inherit the
    # previous record's value.
    local l='' ll='' tr='' tu='' L
    # Spelled as an escape: a raw CR byte in the source is invisible and is
    # easily lost or turned into a line break by editors and transfers.
    local cr=$'\r'

    while read -r L; do
        tot=$((tot + ${#L} + 1))
        # [[:space:]] includes CR, so the trims below also remove the line's CR.
        case "$L" in
            Content-Length:\ *)
                l=${L##*: }
                ;;
            WARC-Truncated:\ *)
                tr=${L##*: }
                tr=${tr%%*([[:space:]])}
                tr=${tr:-EMPTY}
                ;;
            WARC-Target-URI:\ *)
                tu=${L##*: }
                tu=${tu%%*([[:space:]])}
                ;;
            "$cr")
                # Blank line: end of the WARC header, payload follows.
                ll=${l%%*([[:space:]])}
                ll=${ll##*([[:space:]])}
                # The value is used in arithmetic, so insist on plain digits
                # rather than letting arbitrary text be evaluated.
                if [[ ! $ll =~ ^[0-9]+$ ]]; then
                    echo "record $n: missing or malformed Content-Length, skipping" 1>&2
                    return
                fi
                ll=$((10#$ll)) # force decimal: leading zeros would mean octal
                handle_payload "$n" "$ll" "$f" "${tr# }" "${tu# }"
                tot=$((tot + ll))
                return
                ;;
        esac
    done
}

# outer loop
# Scans stdin line by line for WARC record headers; every response record is
# handed to handle_resp, which consumes the rest of that record.
# Positional arguments: outfilePrefix [-n startnum] [contentType]
pprefix="$1"
shift
if [ "$1" = "-n" ]; then
    n=$2
    shift; shift
else
    n=0
fi
tot=0   # input bytes consumed so far
c=0     # non-blank, unrecognised lines seen since the last response
f=$1    # optional Content-Type filter
wc=0    # WARC/1.0 lines seen since the last response
declare -A cec   # Content-Encoding value -> number of occurrences
# Spelled as an escape: a raw CR byte in the source is invisible and is
# easily lost or turned into a line break by editors and transfers.
cr=$'\r'
while read -r L; do
    tot=$((tot + ${#L} + 1))
    case "${L%"$cr"}" in
        WARC/1.0)
            if [ "$wc" -eq 0 ] && [ "$c" -gt 0 ]; then
                echo "WARC/1.0 after $c non-blank lines record $n char $tot" 1>&2
            fi
            ((wc++))
            ;;
        "")
            :
            ;;
        WARC-Type:\ response)
            echo "tot at resp prop: $tot" 1>&2
            handle_resp "$((n = n + 1))" "$f"
            c=0
            wc=0
            ;;
        *)
            c=$((c + 1))
            ;;
    esac
done
echo "Last response #: $n" 1>&2
echo "Compression stats:" 1>&2
for i in "${!cec[@]}"; do
    # Quoted: keys come from response headers and must not be glob-expanded.
    printf " %10s: %s\n" "$i" "${cec[$i]}" 1>&2
done