Mercurial > hg > cc > cirrus_home
changeset 93:4d870a7ec871
support a command to receive each result,
remove use of X-Crawler-Content-Length
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 15 Apr 2021 10:59:25 +0000 |
parents | d56465d5c51f |
children | d60073ec798a |
files | bin/ix.sh |
diffstat | 1 files changed, 11 insertions(+), 12 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/bin/ix.sh Wed Apr 14 20:15:32 2021 +0000
+++ b/bin/ix.sh Thu Apr 15 10:59:25 2021 +0000
@@ -1,13 +1,14 @@
 #!/usr/bin/bash
 # Extract records from warc files given length, offset and file triples
 # from stdin or as command line args
-# Usage [-d] [-w] [-h] [-b] [-x] [length offset path]
+# Usage [-d] [-w] [-h] [-b] [-e cmd] [ -x | length offset path ]
 # -d Debug output
 # -w WARC headers
 # -h HTTP headers
 # -b HTTP body
 # No switch defaults to whole record
 # -x take lines of from a cdx index file as input, extract triples
+# -e pipes each result thru cmd
 if [ "$1" = "-d" ]
 then
 d=1
@@ -31,6 +32,13 @@
 p=1
 b=1
 fi
+e="cat"
+if [ "$1" = "-e" ]
+then
+ shift
+ e="$1"
+ shift
+fi
 if [ "$1" = "-x" ]
 then
 # get triples from index lines
@@ -99,9 +107,6 @@
 Content-Length:\ *) bl=${L##*: }
 bl=${bl%%*([ [:space:]])}
 ;;
- X-Crawler-Content-Length:\ *) xl=${L##*: } # introduced btw 2015&2018???
- xl=${xl%%*([ [:space:]])}
- ;;
 ) s="b" ; n=0
 if [ -z "$b" ]
 then
@@ -111,15 +116,9 @@
 then
 echo
 fi
- if [ "$xl" ]; then
- bl=$xl
- xx=x
- else
- unset xx
- fi
 if [ "$bl" ]; then
 if [ $bl -ne $wl ]; then
- echo length mismatch$xx: $f $o $l here: $wl given: $bl trunc: $tr 1>&2
+ echo length mismatch: $l $o $f here: $wl given: $bl trunc: $tr 1>&2
 fi
 fi
 continue ;;
@@ -145,5 +144,5 @@
 # No flags,the whole thing
 cat
 fi
-}
+} | $e
 done
```