comparison bin/warc.sh @ 0:fdd3f8a16fd4 default tip

shared scripts on valhalla cluster
author Henry Thompson <ht@markup.co.uk>
date Sat, 14 Mar 2020 11:00:58 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:fdd3f8a16fd4
#!/bin/bash
# Try to fillet warc payloads with just a shell script
#
# Usage: warc.sh outfilePrefix [-n startnum] [contentType] < warcfile
#   outfilePrefix  prefix of every output file
#   -n startnum    number of the last response already done; the first
#                  response seen is numbered startnum+1 (default 0)
#   contentType    if given, only payloads whose Content-Type equals it
#                  (compared case-insensitively) are written out
#
# The WARC stream is read from standard input.  For response number N the
# HTTP headers go to <outfilePrefix>_N.hdr and the body to <outfilePrefix>_N,
# with .pdf or .html appended for application/pdf and text/html payloads.
# Progress and diagnostics go to stderr.

LANG=C   # count bytes
LC_ALL=C # count bytes
# Split on newline only, so 'read' keeps leading/trailing blanks of each line
# and ${#L}+1 is the exact number of bytes consumed.
IFS=$'\n'
shopt -qs nocasematch # header names are matched regardless of case
shopt -qs extglob     # needed for the *([[:space:]]) trimming patterns

# Copy a payload body from stdin to stdout.
# Arguments:
#   $1 - number of bytes to copy
# Outputs:
#   the copied bytes on stdout; a diagnostic on stderr if head reports failure
handle_body () {
  ## read -r -N $l L doesn't work for binary bodies that contain a \000 because of Bash 'feature'
  local nbytes=$1 status
  head -c "$nbytes"
  status=$?
  # NOTE(review): head normally exits 0 even when fewer than $1 bytes were
  # available, so this probably only reports I/O errors -- confirm.
  if [ "$status" -ne 0 ]; then
    echo "truncated \$? = $status" >&2
  fi
}

# Split one HTTP response (header lines, blank line, body) off stdin.
# Arguments:
#   $1 - response number, used in the output file names
#   $2 - number of bytes left in the record (HTTP headers + body)
#   $3 - Content-Type filter; empty means accept every type
#   $4 - WARC-Truncated value, empty if there was none
#   $5 - WARC-Target-URI value
# Globals:
#   pprefix (read)  output file name prefix
#   tot     (read)  stream offset, only used in the progress messages
#   cec     (write) associative array counting Content-Encoding values
# Outputs:
#   ${pprefix}_$1.hdr        HTTP header lines (CR removed) plus X-HST-* lines
#   ${pprefix}_$1[.pdf|.html] the body, copied by handle_body
#   progress and diagnostics on stderr
handle_payload () {
  local n=$1 l=$2 ol=$2 f=$3 tr=$4 tu=$5
  local t=' Unknown' z='' bl='' xl='' enc s xx L
  local cr=$'\r'
  local -a hdr_lines=()
  while read -r L; do
    # every line read (plus its newline) comes off the record's byte budget
    l=$((l - ${#L} - 1))
    case "$L" in
      Content-Type:\ *)
        t=${L#*: }
        t=${t%%;*} # drop parameters such as "; charset=..."
        t=${t%%*([[:space:]])}
        ;;
      Content-Length:\ *)
        bl=${L#*: }
        bl=${bl%%*([[:space:]])}
        ;;
      X-Crawler-Content-Length:\ *) # introduced btw 2015&2018???
        xl=${L#*: }
        xl=${xl%%*([[:space:]])}
        ;;
      X-Crawler-Content-Encoding:\ * | Content-Encoding:\ *) # one or the other, change btw 2015&2018???
        z=${L#*: }
        enc=${z%%*([[:space:]])}
        # Plain assignment rather than ((cec[...]+=1)): the key comes from
        # crawled data and must not be re-evaluated as an arithmetic subscript.
        if [ -n "$enc" ]; then
          cec[$enc]=$((${cec[$enc]:-0} + 1))
        fi
        ;;
      "$cr")
        # A line holding only CR ends the HTTP headers; $l bytes of body remain.
        if [ "$l" -gt 0 ]; then
          if [[ -n $f && $f != "$t" ]]; then
            echo "$t != $f, skipping starting at $((tot + (ol - l)))" >&2
            head -c "$l" >/dev/null
            return
          fi
          # the crawler's own length, when present, overrides Content-Length
          if [ -n "$xl" ]; then
            bl=$xl
            xx=x
          else
            xx=''
          fi
          case "$t" in
            application/pdf) s=.pdf ;;
            text/html) s=.html ;;
            *) s='' ;;
          esac
          # [ ] (not [[ ]] or (( ))) so a non-numeric header value is only
          # reported as an error instead of being evaluated.
          if [ -n "$bl" ] && [ "$bl" -ne "$l" ] && [ -z "$z" ]; then
            echo "length mismatch$xx: $n here: $l given: $bl trunc:${tr:+ $tr}" >&2
          fi
          echo "reading $l bytes into ${pprefix}_$n$s as $t starting at $((tot + (ol - l)))" >&2
          {
            # all header lines seen before the terminating blank line
            if [ "${#hdr_lines[@]}" -gt 0 ]; then
              printf '%s\n' "${hdr_lines[@]}"
            fi
            if [ -n "$tr" ]; then
              echo "X-HST-Truncated: $tr"
            fi
            echo "X-HST-Target-URI: $tu"
          } > "${pprefix}_$n.hdr"
          handle_body "$l" > "${pprefix}_$n$s"
        else
          echo "empty body, skipping" >&2
        fi
        return
        ;;
    esac
    # only reached for real header lines: the terminator branch returns
    hdr_lines+=("${L%"$cr"}")
  done
}

# Read the remaining WARC header lines of a response record from stdin, then
# hand the record's content block to handle_payload.
# Arguments:
#   $1 - response number
#   $2 - Content-Type filter, passed through to handle_payload (may be empty)
# Globals:
#   tot (write) advanced by every header byte read and by the block length
# Returns:
#   1 if the record has no usable Content-Length (nothing is extracted)
handle_resp () {
  local n=$1 f=$2
  # Start every record from a clean slate so values cannot leak over from
  # the previous record when a header is absent.
  local l='' ll tr='' tu='' L
  local cr=$'\r'
  while read -r L; do
    tot=$((tot + ${#L} + 1))
    case "$L" in
      Content-Length:\ *)
        l=${L#*: }
        ;;
      WARC-Truncated:\ *)
        tr=${L#*: }
        tr=${tr%%*([[:space:]])}
        tr=${tr:-EMPTY} # header present but without a value
        ;;
      WARC-Target-URI:\ *)
        tu=${L#*: }
        tu=${tu%%*([[:space:]])}
        ;;
      "$cr")
        # A line holding only CR ends the WARC headers.
        ll=${l%%*([[:space:]])} # the \r has to go
        ll=${ll##*([[:space:]])}
        case "$ll" in
          '' | *[!0-9]*)
            echo "record $n: missing or malformed Content-Length '$ll', not extracting" >&2
            return 1
            ;;
        esac
        ll=$((10#$ll)) # leading zeros must not be read as octal
        handle_payload "$n" "$ll" "$f" "${tr# }" "${tu# }"
        tot=$((tot + ll))
        return
        ;;
    esac
  done
}

# outer loop
# Scan stdin line by line; whenever a "WARC-Type: response" header shows up,
# let handle_resp consume the rest of that record.  All other lines are only
# counted, so a WARC/1.0 marker turning up after unexpected non-blank lines
# can be reported.
warc_usage () {
  echo "Usage: ${0##*/} outfilePrefix [-n startnum] [contentType] < warcfile" >&2
  exit 2
}

if [ $# -lt 1 ]; then
  warc_usage
fi
pprefix=$1
shift
if [ "$1" = "-n" ]; then
  case "$2" in
    '' | *[!0-9]*) warc_usage ;;
  esac
  n=$((10#$2)) # leading zeros must not be read as octal
  shift
  shift
else
  n=0
fi
tot=0 # bytes consumed so far
c=0   # non-blank lines seen since the last response record
f=$1  # optional Content-Type filter
wc=0  # WARC/1.0 markers seen since the last response record
cr=$'\r'
declare -A cec # Content-Encoding value -> number of occurrences
while read -r L; do
  tot=$((tot + ${#L} + 1))
  case "${L%"$cr"}" in
    WARC/1.0)
      if ((wc == 0 && c > 0)); then
        echo "WARC/1.0 after $c non-blank lines record $n char $tot" >&2
      fi
      ((wc++))
      ;;
    "")
      :
      ;;
    WARC-Type:\ response)
      echo "tot at resp prop: $tot" >&2
      handle_resp $((n = n + 1)) "$f"
      c=0
      wc=0
      ;;
    *)
      c=$((c + 1))
      ;;
  esac
done
echo "Last response #: $n" >&2
echo "Compression stats:" >&2
for i in "${!cec[@]}"; do
  printf " %10s: %s\n" "$i" "${cec[$i]}" >&2
done