Mercurial > hg > cc > valhalla
comparison bin/warc.sh @ 0:fdd3f8a16fd4 default tip
shared scripts on valhalla cluster
author | Henry Thompson <ht@markup.co.uk> |
---|---|
date | Sat, 14 Mar 2020 11:00:58 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:fdd3f8a16fd4 |
---|---|
#!/bin/bash
# Try to fillet warc payloads with just a shell script
# Usage: warc.sh outfilePrefix [-n startnum] [contentType] < file.warc
#   outfilePrefix  prefix for the per-response output files
#                  (<prefix>_<N>.hdr and <prefix>_<N>[.pdf|.html])
#   -n startnum    number of the last response already seen; the first
#                  response handled is numbered startnum+1 (default 0)
#   contentType    if given, only payloads whose HTTP Content-Type equals
#                  this value are written; all others are skipped
# The WARC stream is read from stdin and matched as plain text lines.

LANG=C # count bytes
LC_ALL=C # count bytes
# Newline-only IFS: `read` keeps leading/trailing blanks of each line, and
# unquoted expansions are split on newlines only.
IFS=$'\n'
# Header names in `case` patterns are matched case-insensitively.
shopt -qs nocasematch
# Enables the *(...) extended patterns used to trim header values.
shopt -qs extglob

#######################################
# Copy the next N bytes of stdin to stdout (the body of one response).
# Arguments:
#   $1 - number of bytes to copy
# Outputs:
#   the copied bytes on stdout; a diagnostic on stderr if head fails
#######################################
handle_body () {
  ## read -r -N $l L doesn't work for binary bodies that contain a \000 because of Bash 'feature'
  # Locals, so the caller's own length/status variables are left alone.
  local want=$1 status
  head -c "$want"
  status=$?
  # NOTE(review): GNU head exits 0 when the input is shorter than requested,
  # so this presumably only reports I/O or usage errors, not short bodies.
  if (( status != 0 )); then
    echo "truncated \$? = $status" 1>&2
  fi
}

#######################################
# Handle the payload of one WARC response record: read the HTTP header
# lines from stdin, then either save the body or skip over it.
# Globals:
#   pprefix (read)     prefix of the output file names
#   tot     (read)     running byte offset, used in diagnostics only
#   cec     (written)  associative array counting Content-Encoding values;
#                      must already be declared with `declare -A`
# Arguments:
#   $1 - record number, used in the output file names
#   $2 - payload length in bytes (HTTP headers plus body)
#   $3 - content type to keep; empty means keep everything
#   $4 - WARC-Truncated value (may be empty)
#   $5 - WARC-Target-URI value
# Outputs:
#   ${pprefix}_$1.hdr           HTTP headers plus X-HST-* lines
#   ${pprefix}_$1[.pdf|.html]   the body
#   progress and warnings on stderr
#######################################
handle_payload () {
  local n=$1 l=$2 ol=$2 f=$3 tr=$4 tu=$5
  local t=' Unknown' z='' bl='' xl='' hdr='' xx='' s=''
  local cr=$'\r' nl=$'\n' L val count

  while IFS= read -r L; do
    # Bytes left in the payload: the line plus the newline read swallowed.
    (( l -= ${#L} + 1 ))
    # Collect the header lines without their CR, one per line.
    hdr+=${L%"$cr"}$nl
    # Value after the FIRST ": ", with trailing whitespace (incl. CR) removed.
    val=${L#*: }
    val=${val%"${val##*[![:space:]]}"}
    case "$L" in
      Content-Type:\ *)
        # Keep the media type only: drop parameters, then re-trim.
        t=${val%%;*}
        t=${t%"${t##*[![:space:]]}"}
        ;;
      Content-Length:\ *)
        bl=$val
        ;;
      X-Crawler-Content-Length:\ *) # introduced btw 2015&2018???
        xl=$val
        ;;
      X-Crawler-Content-Encoding:\ *|Content-Encoding:\ *) # one or the other, change btw 2015&2018???
        z=${L#*: }
        # The value comes from the crawled server. Do not put it inside
        # (( )), where an associative subscript is expanded a second time.
        if [[ -n $val ]]; then
          count=${cec[$val]:-0}
          cec[$val]=$(( count + 1 ))
        fi
        ;;
      "$cr"|'')
        # Blank line: end of the HTTP headers (a bare LF is accepted too).
        if (( l <= 0 )); then
          echo "empty body, skipping" 1>&2
          return
        fi
        if [[ -n $f && $f != "$t" ]]; then
          echo "$t != $f, skipping starting at $((tot + (ol - l)))" 1>&2
          head -c "$l" >/dev/null
          return
        fi
        # The crawler's own length header, when present, wins.
        if [[ -n $xl ]]; then
          bl=$xl
          xx=x
        fi
        case "$t" in
          application/pdf) s=.pdf ;;
          text/html) s=.html ;;
          *) s='' ;;
        esac
        # Plain [ ] on purpose: it compares integers without evaluating the
        # untrusted header value as an arithmetic expression.
        if [[ -n $bl ]] && [ "$bl" -ne "$l" ] && [[ -z $z ]]; then
          echo "length mismatch$xx: $n here: $l given: $bl trunc:${tr:+ $tr}" 1>&2
        fi
        echo "reading $l bytes into ${pprefix}_$n$s as $t starting at $((tot + (ol - l)))" 1>&2
        {
          # Everything collected except the terminating blank line.
          printf '%s' "${hdr%"$nl"}"
          if [[ -n $tr ]]; then echo "X-HST-Truncated: $tr"; fi
          echo "X-HST-Target-URI: $tu"
        } > "${pprefix}_$n.hdr"
        handle_body "$l" > "${pprefix}_$n$s"
        return
        ;;
    esac
  done
}

#######################################
# Read the remaining WARC header lines of a response record from stdin,
# then hand the record's payload to handle_payload.
# Globals:
#   tot (read/written)  running byte offset into the WARC stream; advanced
#                       by every header line read and by the payload length
# Arguments:
#   $1 - record number
#   $2 - content type to keep (passed through to handle_payload)
#######################################
handle_resp () {
  local n=$1 f=$2
  # Start every record clean so nothing leaks in from the previous one.
  local l='' ll='' tr='' tu=''
  local cr=$'\r' L val

  while IFS= read -r L; do
    tot=$((tot + ${#L} + 1))
    # Value after the FIRST ": ", with trailing whitespace (incl. CR) removed.
    val=${L#*: }
    val=${val%"${val##*[![:space:]]}"}
    case "$L" in
      Content-Length:\ *)
        l=$val
        ;;
      WARC-Truncated:\ *)
        tr=${val:-EMPTY}
        ;;
      WARC-Target-URI:\ *)
        tu=$val
        ;;
      "$cr")
        # Blank line: end of the WARC headers, the payload follows.
        ll=$l
        # Quoted so that a missing Content-Length cannot shift the
        # remaining arguments one place to the left.
        handle_payload "$n" "$ll" "$f" "${tr# }" "${tu# }"
        tot=$((tot + ll))
        return
        ;;
    esac
  done
}

# outer loop
# Scan the WARC stream on stdin line by line. Each "WARC-Type: response"
# line hands the rest of that record to handle_resp; everything else is
# only counted, for the sanity warning below.
pprefix="$1"
shift
if [ "$1" = "-n" ]; then
  n=$2
  shift; shift
else
  n=0
fi
tot=0   # bytes consumed so far
c=0     # other non-blank lines seen since the last response record
f=$1    # optional content-type filter
wc=0    # WARC/1.0 lines seen since the last response record
declare -A cec
cr=$'\r'
while IFS= read -r L; do
  tot=$((tot + ${#L} + 1))
  case "${L%"$cr"}" in
    WARC/1.0)
      # Presumably a sign that the byte accounting went out of step:
      # unexpected lines turned up before the first version line.
      if (( wc == 0 && c > 0 )); then
        echo "WARC/1.0 after $c non-blank lines record $n char $tot" 1>&2
      fi
      ((wc++))
      ;;
    "")
      :
      ;;
    WARC-Type:\ response)
      echo "tot at resp prop: $tot" 1>&2
      handle_resp "$((n = n + 1))" "$f"
      c=0
      wc=0
      ;;
    *)
      c=$((c + 1))
      ;;
  esac
done
echo "Last response #: $n" 1>&2
echo "Compression stats:" 1>&2
for i in "${!cec[@]}"; do
  printf " %10s: %s\n" "$i" "${cec[$i]}" 1>&2
done