Mercurial > hg > cc > cirrus_home
comparison bin/ix.sh @ 93:4d870a7ec871
support a command to receive each result,
remove use of X-Crawler-Content-Length
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 15 Apr 2021 10:59:25 +0000 |
parents | d56465d5c51f |
children | 0332076afc37 |
comparison
Legend: equal, deleted, inserted, replaced
92:d56465d5c51f | 93:4d870a7ec871 |
---|---|
1 #!/usr/bin/bash | 1 #!/usr/bin/bash |
2 # Extract records from warc files given length, offset and file triples | 2 # Extract records from warc files given length, offset and file triples |
3 # from stdin or as command line args | 3 # from stdin or as command line args |
4 # Usage [-d] [-w] [-h] [-b] [-x] [length offset path] | 4 # Usage [-d] [-w] [-h] [-b] [-e cmd] [ -x \| length offset path ] |
5 # -d Debug output | 5 # -d Debug output |
6 # -w WARC headers | 6 # -w WARC headers |
7 # -h HTTP headers | 7 # -h HTTP headers |
8 # -b HTTP body | 8 # -b HTTP body |
9 # No switch defaults to whole record | 9 # No switch defaults to whole record |
10 # -x take lines from a cdx index file as input, extract triples | 10 # -x take lines from a cdx index file as input, extract triples |
| | 11 # -e pipes each result thru cmd |
11 if [ "$1" = "-d" ] | 12 if [ "$1" = "-d" ] |
12 then | 13 then |
13 d=1 | 14 d=1 |
14 shift | 15 shift |
15 fi | 16 fi |
28 if [ "$1" = "-b" ] | 29 if [ "$1" = "-b" ] |
29 then | 30 then |
30 shift | 31 shift |
31 p=1 | 32 p=1 |
32 b=1 | 33 b=1 |
| | 34 fi |
| | 35 e="cat" |
| | 36 if [ "$1" = "-e" ] |
| | 37 then |
| | 38 shift |
| | 39 e="$1" |
| | 40 shift |
33 fi | 41 fi |
34 if [ "$1" = "-x" ] | 42 if [ "$1" = "-x" ] |
35 then | 43 then |
36 # get triples from index lines | 44 # get triples from index lines |
37 egrep -ao 'length": "[0-9]*", "offset": "[0-9]*".*\.gz'\| \ | 45 egrep -ao 'length": "[0-9]*", "offset": "[0-9]*".*\.gz'\| \ |
102 case "$L" in | 110 case "$L" in |
103 Content-Length:\ *) bl=${L##*: } | 111 Content-Length:\ *) bl=${L##*: } |
104 bl=${bl%%*([ | 112 bl=${bl%%*([ |
105 [:space:]])} | 113 [:space:]])} |
106 ;; | 114 ;; |
107 X-Crawler-Content-Length:\ *) xl=${L##*: } # introduced btw 2015&2018??? | |
108 xl=${xl%%*([ | |
109 [:space:]])} | |
110 ;; | |
111 | 115 |
112 ) s="b" ; n=0 | 116 ) s="b" ; n=0 |
113 if [ -z "$b" ] | 117 if [ -z "$b" ] |
114 then | 118 then |
115 exit 0 | 119 exit 0 |
116 fi | 120 fi |
117 if [ "$h" ] | 121 if [ "$h" ] |
118 then | 122 then |
119 echo | 123 echo |
120 fi | 124 fi |
121 if [ "$xl" ]; then | |
122 bl=$xl | |
123 xx=x | |
124 else | |
125 unset xx | |
126 fi | |
127 if [ "$bl" ]; then | 125 if [ "$bl" ]; then |
128 if [ $bl -ne $wl ]; then | 126 if [ $bl -ne $wl ]; then |
129 echo length mismatch$xx: $f $o $l here: $wl given: $bl trunc: $tr 1>&2 | 127 echo length mismatch: $l $o $f here: $wl given: $bl trunc: $tr 1>&2 |
130 fi | 128 fi |
131 fi | 129 fi |
132 continue ;; | 130 continue ;; |
133 esac | 131 esac |
134 if [ "$h" ] | 132 if [ "$h" ] |
151 done | 149 done |
152 else | 150 else |
153 # No flags,the whole thing | 151 # No flags,the whole thing |
154 cat | 152 cat |
155 fi | 153 fi |
156 } | 154 } \| $e |
157 done | 155 done |