Mercurial > hg > cc > valhalla
annotate bin/warc.sh @ 0:fdd3f8a16fd4 default tip
shared scripts on valhalla cluster
author | Henry Thompson <ht@markup.co.uk> |
---|---|
date | Sat, 14 Mar 2020 11:00:58 +0000 |
parents | |
children |
rev | line source |
---|---|
0
fdd3f8a16fd4
shared scripts on valhalla cluster
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
#!/bin/bash
# Try to fillet warc payloads with just a shell script
#
# Usage: warc.sh outfilePrefix [-n startnum] [contentType] < uncompressed.warc
#
#   outfilePrefix  prefix for the files written per response record:
#                  <prefix>_<N>.hdr (HTTP headers plus X-HST-* lines) and
#                  <prefix>_<N>[.html|.pdf] (the body)
#   -n startnum    number of the last record already done; the first
#                  response seen here is numbered startnum+1 (default 0)
#   contentType    if given, only responses whose Content-Type equals it
#                  are written; the others are skipped
#
# The WARC data is read from standard input; progress and statistics go
# to standard error.

# Byte (not character) semantics: ${#var} is used to keep track of the
# offset in the input.  Exported so that child utilities agree.
export LANG=C   # count bytes
export LC_ALL=C # count bytes
# Only newline splits words, so unquoted expansions keep embedded blanks.
IFS=$'\n'
shopt -qs nocasematch # header names are matched case-insensitively
shopt -qs extglob
fdd3f8a16fd4
shared scripts on valhalla cluster
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
10 |
fdd3f8a16fd4
shared scripts on valhalla cluster
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
# handle_body NBYTES
#
# Copy NBYTES bytes from stdin to stdout and complain on stderr if fewer
# bytes were available or if the copy failed.
#
# Arguments: $1 - number of bytes to copy (non-negative decimal integer)
# Outputs:   the bytes on stdout; diagnostics on stderr
# Returns:   always 0 (problems are only reported)
handle_body () {
  ## read -r -N $l L doesn't work for binary bodies that contain a \000 because of Bash 'feature'
  local want=$1 got status

  case $want in
    ''|*[!0-9]*)
      echo "handle_body: bad byte count '$want'" 1>&2
      return 0
      ;;
  esac

  # head exits 0 even when the input ends early, so its status alone
  # cannot reveal a truncated body.  The data therefore goes through tee:
  # one copy is written straight to our stdout (saved as fd 3), the other
  # is counted by wc, whose output (via fd 4) is the only thing the
  # command substitution captures.
  {
    got=$(
      set -o pipefail
      { head -c "$want" | tee >(wc -c 1>&4) 1>&3; } 4>&1
    )
    status=$?
  } 3>&1

  if [ "$status" -ne 0 ]; then
    echo "truncated \$? = $status" 1>&2
  elif [ "$((got))" -ne "$want" ]; then
    echo "truncated: wanted $want bytes, got $((got))" 1>&2
  fi
  return 0
}
fdd3f8a16fd4
shared scripts on valhalla cluster
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
20 |
fdd3f8a16fd4
shared scripts on valhalla cluster
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
# handle_payload N LEN FILTER TRUNC URI
#
# Process the payload of one response record, read from stdin: the HTTP
# status line and headers up to the first blank line, then the body.
#
# Arguments:
#   $1 - record number, used in the output file names
#   $2 - payload length in bytes (HTTP headers + blank line + body)
#   $3 - content-type filter; if non-empty, a payload whose Content-Type
#        differs (compared case-insensitively) is read and discarded
#   $4 - truncation note; if non-empty it is recorded as X-HST-Truncated
#   $5 - target URI, recorded as X-HST-Target-URI
# Globals:
#   pprefix (read) - output file name prefix
#   tot (read)     - input offset, used in progress messages only
#   cec (written)  - associative array counting Content-Encoding values;
#                    the caller must have declared it with `declare -A`
# Outputs:
#   ${pprefix}_N.hdr           the HTTP header lines plus the X-HST-* lines
#   ${pprefix}_N[.html|.pdf]   the body, written by handle_body
#   progress and diagnostics on stderr
handle_payload () {
  local n=$1 l=$2 ol=$2 f=$3 tr=$4 tu=$5
  local t=' Unknown' bl='' xl='' xx='' s='' encoded='' mismatch=''
  local raw line name value key count
  local -a hdr=()

  case $l in
    ''|*[!0-9]*)
      echo "handle_payload: bad payload length '$l' for record $n" 1>&2
      return 1
      ;;
  esac
  l=$((10#$l)) # force decimal: a leading 0 would otherwise mean octal
  ol=$l

  while IFS= read -r raw; do
    # l counts down to the size of the body: every header line costs its
    # own length (including any \r) plus the \n that read removed.
    (( l -= ${#raw} + 1 ))
    line=${raw%$'\r'}

    if [[ -n $line ]]; then
      hdr+=("$line")
      [[ $line == *:* ]] || continue # status line etc.
      name=${line%%:*}
      value=${line#*:}
      value=${value#"${value%%[![:space:]]*}"} # trim leading whitespace
      value=${value%"${value##*[![:space:]]}"} # trim trailing whitespace
      case ${name,,} in
        content-type)
          t=${value%%;*} # drop parameters such as charset
          t=${t%"${t##*[![:space:]]}"}
          ;;
        content-length)
          bl=$value
          ;;
        x-crawler-content-length) # introduced btw 2015&2018???
          xl=$value
          ;;
        x-crawler-content-encoding|content-encoding) # one or the other, change btw 2015&2018???
          encoded=1
          # The value comes from crawled data: it is only ever used as a
          # literal key and never evaluated inside (( )).
          key=${value:-EMPTY}
          count=${cec[$key]:-0}
          cec[$key]=$((count + 1))
          ;;
      esac
      continue
    fi

    # Blank line: end of the HTTP headers, l bytes of body follow.
    if (( l <= 0 )); then
      echo "empty body, skipping" 1>&2
      return
    fi

    if [[ -n $f && "${f,,}" != "${t,,}" ]]; then
      echo "$t != $f, skipping starting at $((tot + (ol - l)))" 1>&2
      head -c "$l" >/dev/null
      return
    fi

    # The crawler's own length header, when present, replaces Content-Length
    # in the check below; the x marks that in the message.
    if [[ -n $xl ]]; then
      bl=$xl
      xx=x
    fi

    case ${t,,} in
      application/pdf) s=.pdf ;;
      text/html) s=.html ;;
      *) s='' ;;
    esac

    # The declared length is only compared when no content encoding was
    # announced.
    if [[ -n $bl && -z $encoded ]]; then
      case $bl in
        *[!0-9]*) mismatch=1 ;;
        *) (( 10#$bl != l )) && mismatch=1 ;;
      esac
      if [[ -n $mismatch ]]; then
        echo "length mismatch$xx: $n here: $l given: $bl trunc: $tr" 1>&2
      fi
    fi

    echo "reading $l bytes into ${pprefix}_$n$s as $t starting at $((tot + (ol - l)))" 1>&2
    {
      if (( ${#hdr[@]} > 0 )); then
        printf '%s\n' "${hdr[@]}"
      fi
      if [[ -n $tr ]]; then
        echo "X-HST-Truncated: $tr"
      fi
      echo "X-HST-Target-URI: $tu"
    } > "${pprefix}_$n.hdr"
    handle_body "$l" > "${pprefix}_$n$s"
    return
  done
}
fdd3f8a16fd4
shared scripts on valhalla cluster
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
91 |
fdd3f8a16fd4
shared scripts on valhalla cluster
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
# handle_resp N FILTER
#
# Read the remaining WARC header lines of a response record from stdin
# (the caller has already consumed the "WARC-Type: response" line) and,
# at the blank line that ends them, pass the payload to handle_payload.
#
# Arguments:
#   $1 - record number
#   $2 - content-type filter, passed through to handle_payload
# Globals:
#   tot (written) - advanced by every byte consumed: the header lines read
#                   here plus the declared Content-Length of the payload
# Outputs:   a diagnostic on stderr when Content-Length is missing or is
#            not a decimal number; handle_payload is not called then
handle_resp () {
  local n=$1 f=$2
  # Start every record from scratch so that a header missing from this
  # record cannot pick up the value seen in an earlier one.
  local l='' ll='' tr='' tu='' raw line

  while IFS= read -r raw; do
    tot=$((tot + ${#raw} + 1)) # +1 for the \n removed by read
    line=${raw%$'\r'}          # the \r has to go
    case ${line,,} in
      '')
        ll=${l#"${l%%[![:space:]]*}"}
        ll=${ll%"${ll##*[![:space:]]}"}
        case $ll in
          ''|*[!0-9]*)
            echo "record $n: missing or bad Content-Length '$ll', skipping" 1>&2
            return
            ;;
        esac
        ll=$((10#$ll)) # force decimal: a leading 0 would mean octal
        #echo "h_p at $tot" 1>&2
        handle_payload "$n" "$ll" "$f" "${tr# }" "${tu# }"
        tot=$((tot + ll))
        #echo "h_p done: $tot" 1>&2
        return
        ;;
      content-length:\ *)
        l=${line#*: }
        ;;
      warc-truncated:\ *)
        tr=${line#*: }
        tr=${tr%"${tr##*[![:space:]]}"}
        tr=${tr:-EMPTY} # header present but without a value
        ;;
      warc-target-uri:\ *)
        # Shortest-prefix removal: a ": " inside the value is kept.
        tu=${line#*: }
        tu=${tu%"${tu##*[![:space:]]}"}
        ;;
    esac
  done
}
fdd3f8a16fd4
shared scripts on valhalla cluster
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
122 |
fdd3f8a16fd4
shared scripts on valhalla cluster
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
# outer loop
#
# Scan stdin line by line.  Every "WARC-Type: response" header hands the
# rest of that record to handle_resp; all other lines are only counted.
# The header patterns below match case-insensitively because nocasematch
# is switched on at the top of the script.
pprefix="$1"
shift
if [ "$1" = "-n" ]; then
  n=$2
  shift; shift
else
  n=0
fi
tot=0  # bytes consumed so far
c=0    # non-blank lines seen since the last response that were neither
       # a WARC/1.0 line nor a response type header
f=$1   # optional content-type filter
wc=0   # WARC/1.0 lines seen since the last response
declare -A cec # Content-Encoding value -> count, filled by handle_payload
while IFS= read -r L; do
  tot=$((tot + ${#L} + 1)) # +1 for the \n removed by read
  case ${L%$'\r'} in
    WARC/1.0)
      if [[ $wc -eq 0 && $c -gt 0 ]]; then
        echo "WARC/1.0 after $c non-blank lines record $n char $tot" 1>&2
      fi
      wc=$((wc + 1))
      ;;
    "")
      :
      ;;
    WARC-Type:\ response)
      echo "tot at resp prop: $tot" 1>&2
      handle_resp "$((n = n + 1))" "$f"
      c=0
      wc=0
      ;;
    *)
      c=$((c + 1))
      ;;
  esac
done
echo "Last response #: $n" 1>&2
echo "Compression stats:" 1>&2
for i in "${!cec[@]}"; do
  # Quoted: the keys come from crawled headers and may contain blanks or
  # glob characters.
  printf " %10s: %s\n" "$i" "${cec[$i]}" 1>&2
done