comparison bin/ix.sh @ 93:4d870a7ec871

support a command to receive each result, remove use of X-Crawler-Content-Length
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 15 Apr 2021 10:59:25 +0000
parents d56465d5c51f
children 0332076afc37
comparison
equal deleted inserted replaced
92:d56465d5c51f 93:4d870a7ec871
1 #!/usr/bin/bash 1 #!/usr/bin/bash
2 # Extract records from warc files given length, offset and file triples 2 # Extract records from warc files given length, offset and file triples
3 # from stdin or as command line args 3 # from stdin or as command line args
4 # Usage [-d] [-w] [-h] [-b] [-x] [length offset path] 4 # Usage [-d] [-w] [-h] [-b] [-e cmd] [ -x | length offset path ]
5 # -d Debug output 5 # -d Debug output
6 # -w WARC headers 6 # -w WARC headers
7 # -h HTTP headers 7 # -h HTTP headers
8 # -b HTTP body 8 # -b HTTP body
9 # No switch defaults to whole record 9 # No switch defaults to whole record
10 # -x take lines from a cdx index file as input, extract triples 10 # -x take lines from a cdx index file as input, extract triples
11 # -e pipes each result thru cmd
11 if [ "$1" = "-d" ] 12 if [ "$1" = "-d" ]
12 then 13 then
13 d=1 14 d=1
14 shift 15 shift
15 fi 16 fi
28 if [ "$1" = "-b" ] 29 if [ "$1" = "-b" ]
29 then 30 then
30 shift 31 shift
31 p=1 32 p=1
32 b=1 33 b=1
34 fi
35 e="cat"
36 if [ "$1" = "-e" ]
37 then
38 shift
39 e="$1"
40 shift
33 fi 41 fi
34 if [ "$1" = "-x" ] 42 if [ "$1" = "-x" ]
35 then 43 then
36 # get triples from index lines 44 # get triples from index lines
37 egrep -ao 'length": "[0-9]*", "offset": "[0-9]*".*\.gz'| \ 45 egrep -ao 'length": "[0-9]*", "offset": "[0-9]*".*\.gz'| \
102 case "$L" in 110 case "$L" in
103 Content-Length:\ *) bl=${L##*: } 111 Content-Length:\ *) bl=${L##*: }
104 bl=${bl%%*([ 112 bl=${bl%%*([
105 [:space:]])} 113 [:space:]])}
106 ;; 114 ;;
107 X-Crawler-Content-Length:\ *) xl=${L##*: } # introduced btw 2015&2018???
108 xl=${xl%%*([
109 [:space:]])}
110 ;;
111 115
112 ) s="b" ; n=0 116 ) s="b" ; n=0
113 if [ -z "$b" ] 117 if [ -z "$b" ]
114 then 118 then
115 exit 0 119 exit 0
116 fi 120 fi
117 if [ "$h" ] 121 if [ "$h" ]
118 then 122 then
119 echo 123 echo
120 fi 124 fi
121 if [ "$xl" ]; then
122 bl=$xl
123 xx=x
124 else
125 unset xx
126 fi
127 if [ "$bl" ]; then 125 if [ "$bl" ]; then
128 if [ $bl -ne $wl ]; then 126 if [ $bl -ne $wl ]; then
129 echo length mismatch$xx: $f $o $l here: $wl given: $bl trunc: $tr 1>&2 127 echo length mismatch: $l $o $f here: $wl given: $bl trunc: $tr 1>&2
130 fi 128 fi
131 fi 129 fi
132 continue ;; 130 continue ;;
133 esac 131 esac
134 if [ "$h" ] 132 if [ "$h" ]
151 done 149 done
152 else 150 else
153 # No flags, the whole thing 151 # No flags, the whole thing
154 cat 152 cat
155 fi 153 fi
156 } 154 } | $e
157 done 155 done