Mercurial > hg > cc > valhalla
view bin/warc.sh @ 0:fdd3f8a16fd4 default tip
shared scripts on valhalla cluster
author | Henry Thompson <ht@markup.co.uk> |
---|---|
date | Sat, 14 Mar 2020 11:00:58 +0000 |
parents | |
children |
line wrap: on
line source
#!/bin/bash
# Try to fillet warc payloads with just a shell script
#
# Usage: warc.sh outfilePrefix [-n startnum] [contentType]
#
# Reads a plain-text WARC stream on stdin.  For every record whose WARC-Type
# is "response", the HTTP header block of the payload is written to
# <outfilePrefix>_<N>.hdr, followed by an X-HST-Truncated line (only when the
# record carried a WARC-Truncated header) and an X-HST-Target-URI line.
#   -n startnum   numbering of responses continues at startnum+1 (default 0)
#   contentType   if given, payloads with a different Content-Type are skipped
# Progress and diagnostics go to stderr, ending with a tally of the
# (X-Crawler-)Content-Encoding values seen.
LANG=C   # count bytes
LC_ALL=C # count bytes
IFS=$'\n'
shopt -qs nocasematch # header names (and the contentType filter) match case-insensitively
shopt -qs extglob     # needed for the *([[:space:]]) trailing-whitespace patterns

# WARC and HTTP header lines end in CRLF; 'read' strips only the LF.
CR=$'\r'

# Tally of content-encoding values, keyed by encoding name.  Declared at top
# level so that it is global (declare inside a function would make it local).
declare -A cec

# Copy the next $1 bytes of stdin to stdout.
# read -r -N $l L doesn't work for binary bodies that contain a \000 because
# of Bash 'feature', hence head -c.
handle_body () {
  local nbytes=$1 rc
  head -c "$nbytes"
  rc=$?
  if [ "$rc" -ne 0 ]; then
    echo "truncated \$? = $rc" 1>&2
  fi
}

# Process one HTTP payload (headers + body) from stdin.
# Arguments:
#   $1 response number, $2 payload length in bytes (headers + body),
#   $3 Content-Type filter ('' for none), $4 WARC-Truncated value ('' if none),
#   $5 target URI
# Globals read: pprefix (output prefix), tot (stream offset, messages only)
# Globals written: cec
handle_payload () {
  local n=$1 l=$2 ol=$2 f=$3 tr=$4 tu=$5
  local t=' Unknown' z='' bl='' xl='' xx='' s='' enc='' line L
  local -a hdr_lines=()

  while read -r L; do
    # l counts down to the number of body bytes left (+1 for the LF read ate)
    l=$((l - ${#L} - 1))

    if [[ "$L" == "$CR" || -z "$L" ]]; then
      # Blank line: end of the HTTP headers
      if [ "$l" -gt 0 ]; then
        if [[ "$f" && ( "$f" != "$t" ) ]]; then
          printf '%s\n' "$t != $f, skipping starting at $((tot + (ol - l)))" 1>&2
          head -c "$l" >/dev/null
          return
        fi
        if [ "$xl" ]; then
          # The crawler-recorded length takes precedence over Content-Length
          bl=$xl
          xx=x
        fi
        case "$t" in
          application/pdf) s=.pdf ;;
          text/html) s=.html ;;
          *) s='' ;;
        esac
        # Lengths are only compared when no content-encoding header was seen.
        # '[' (not '[[') so a non-numeric header value is rejected rather
        # than evaluated as an arithmetic expression.
        if [ "$bl" ] && [ -z "$z" ] && [ "$bl" -ne "$l" ]; then
          echo "length mismatch$xx: $n here: $l given: $bl trunc:${tr:+ $tr}" 1>&2
        fi
        echo "reading $l bytes into ${pprefix}_$n$s as $t starting at $((tot + (ol - l)))" 1>&2
        {
          # All header lines, CR-stripped, without the terminating blank line
          # (the old head/tail arithmetic also dropped the last real header).
          if ((${#hdr_lines[@]} > 0)); then
            printf '%s\n' "${hdr_lines[@]}"
          fi
          if [ "$tr" ]; then echo "X-HST-Truncated: $tr"; fi
          echo "X-HST-Target-URI: $tu"
        } > "${pprefix}_$n.hdr"
        # NOTE(review): writing the payload itself was commented out in the
        # original:
        #   handle_body $l > ${pprefix}_$n$s
        # It stays disabled, but the body must still be consumed here:
        # handle_resp adds the whole record length to tot, and otherwise the
        # outer loop would parse body bytes as WARC header lines.
        handle_body "$l" >/dev/null
      else
        echo "empty body, skipping" 1>&2
      fi
      return
    fi

    line=${L%"$CR"}
    hdr_lines+=("$line")

    case "$L" in
      Content-Type:\ *)
        t=${L##*: }
        t=${t%%;*}              # drop parameters such as "; charset=..."
        t=${t%%*([[:space:]])}
        ;;
      Content-Length:\ *)
        bl=${L##*: }
        bl=${bl%%*([[:space:]])}
        ;;
      X-Crawler-Content-Length:\ *) # introduced btw 2015&2018???
        xl=${L##*: }
        xl=${xl%%*([[:space:]])}
        ;;
      X-Crawler-Content-Encoding:\ *|Content-Encoding:\ *)
        # one or the other, change btw 2015&2018???
        z=${L##*: }
        enc=${z%%*([[:space:]])}
        if [[ -n "$enc" ]]; then # an empty key is not a valid subscript
          cec[$enc]=$(( ${cec[$enc]:-0} + 1 ))
        fi
        ;;
    esac
  done
}

# Read the remaining WARC header lines of a response record from stdin, then
# hand the payload to handle_payload.
# Arguments: $1 response number, $2 Content-Type filter ('' for none)
# Globals written: tot (advanced past the WARC headers and the payload)
# Returns: 1 if the record has no usable Content-Length
handle_resp () {
  local n=$1 f=$2
  local l='' ll='' tr='' tu='' L

  while read -r L; do
    tot=$((tot + ${#L} + 1))
    case "$L" in
      Content-Length:\ *)
        l=${L##*: }
        ;;
      WARC-Truncated:\ *)
        tr=${L##*: }
        tr=${tr%%*([[:space:]])}
        tr=${tr:-EMPTY}
        ;;
      WARC-Target-URI:\ *)
        tu=${L##*: }
        tu=${tu%%*([[:space:]])}
        ;;
      "$CR"|'')
        # Blank line: end of the WARC headers, the payload follows
        ll=${l%%*([[:space:]])} # the \r has to go
        if [[ ! "$ll" =~ ^[0-9]+$ ]]; then
          echo "bad or missing WARC Content-Length for record $n: '$ll'" 1>&2
          return 1
        fi
        handle_payload "$n" "$ll" "$f" "${tr# }" "${tu# }"
        tot=$((tot + ll))
        return
        ;;
    esac
  done
}

# Outer loop: scan stdin for response records and dispatch them.
# Arguments: outfilePrefix [-n startnum] [contentType]
# Returns: 2 on missing arguments, 0 otherwise
main () {
  if (($# == 0)); then
    echo "Usage: warc.sh outfilePrefix [-n startnum] [contentType]" 1>&2
    return 2
  fi

  local pprefix=$1 n=0 tot=0 c=0 wc=0 f line L i
  shift
  if [ "$1" = "-n" ]; then
    n=${2:-0}
    shift; shift
  fi
  f=$1
  cec=()

  while read -r L; do
    tot=$((tot + ${#L} + 1))
    line=${L%"$CR"}
    case "$line" in
      WARC/1.0)
        # c counts non-blank lines seen since the last response record
        if [ "$wc" -eq 0 ] && [ "$c" -gt 0 ]; then
          echo "WARC/1.0 after $c non-blank lines record $n char $tot" 1>&2
        fi
        wc=$((wc + 1))
        ;;
      "")
        :
        ;;
      WARC-Type:\ response)
        echo "tot at resp prop: $tot" 1>&2
        n=$((n + 1))
        handle_resp "$n" "$f"
        c=0
        wc=0
        ;;
      *)
        c=$((c + 1))
        ;;
    esac
  done

  echo "Last response #: $n" 1>&2
  echo "Compression stats:" 1>&2
  for i in "${!cec[@]}"; do
    printf " %10s: %s\n" "$i" "${cec[$i]}" 1>&2
  done
  return 0
}

main "$@"