# HG changeset patch # User Henry S. Thompson # Date 1618431332 0 # Node ID d56465d5c51fbd816cfefe91be90c8a63ca43834 # Parent 82c94684f799e64cd371434497e3b9277c898c7b accepts index lines, less line-at-a-time diff -r 82c94684f799 -r d56465d5c51f bin/ix.sh --- a/bin/ix.sh Wed Apr 14 10:08:41 2021 +0000 +++ b/bin/ix.sh Wed Apr 14 20:15:32 2021 +0000 @@ -1,11 +1,18 @@ #!/usr/bin/bash -# Extract records from warc files given filename, length and offset triples +# Extract records from warc files given length, offset and file triples # from stdin or as command line args -# Usage [-w] [-h] [-b] [path length offset] +# Usage [-d] [-w] [-h] [-b] [-x] [length offset path] +# -d Debug output # -w WARC headers # -h HTTP headers # -b HTTP body -# No switch defaults to whole record +# No switch defaults to whole record +# -x take lines of from a cdx index file as input, extract triples +if [ "$1" = "-d" ] +then + d=1 + shift +fi if [ "$1" = "-w" ] then shift @@ -24,19 +31,33 @@ p=1 b=1 fi -if [ -n "$1" ] +if [ "$1" = "-x" ] +then + # get triples from index lines + egrep -ao 'length": "[0-9]*", "offset": "[0-9]*".*\.gz'| \ + sed 's/[a-z]*": "//g;s/", "/\t/g;s/\(crawl-data\|segments\|warc\)\///g' |\ + if [ "$d" ] ; then tee /tmp/ix_triples.tsv ; else cat ; fi +elif [ "$1" ] then printf "%s\t%s\t%s\n" "$1" "$2" "$3" else cat fi | \ -while { IFS=$'\t' read f l o; } +while { IFS=$'\t' read l o f; } do - dd if="$f" of=/dev/stdout skip=$o count=$l \ - iflag=skip_bytes,count_bytes 2>/dev/null -done | unpigz -dp 1 -c | \ + if [ -z "$d" ] + then + dd if="$f" of=/dev/stdout skip=$o count=$l \ + iflag=skip_bytes,count_bytes 2>/dev/null + else + echo dd if="$f" of=/dev/stdout skip=$o count=$l \ + iflag=skip_bytes,count_bytes > /tmp/ix_dd_log.txt + dd if="$f" of=/dev/stdout skip=$o count=$l \ + iflag=skip_bytes,count_bytes 2>> /tmp/ix_dd_log.txt + fi | \ +unpigz -dp 1 -c | tee /tmp/data | \ { s="w" -if [ -n "$p" ] +if [ "$p" ] then shopt -qs extglob # for %%*(...) while read -r L @@ -44,16 +65,27 @@ if [ "$s" = "w" ] then # WARC header - if [ "$L" = " " ] - then - s="h" - if [[ -n "$w" && ( -n "$h" || -n "$b" ) ]] - then - echo - fi - continue - fi - if [ -n "$w" ] + case "$L" in + Content-Length:\ *) wl=${L##*: } + wl=${wl%%*([ [:space:]])} + ;; + WARC-Truncated:\ *) # echo $n $L + tr=${L##*: } + tr=${tr%%*([ [:space:]])} + tr=${tr:-EMPTY} + ;; + ) s="h" + if [ -z "$h$b" ] + then + exit 0 + fi + if [ "$w" ] + then + echo + fi + continue;; + esac + if [ "$w" ] then printf "%s\n" "${L%% }" fi @@ -62,28 +94,48 @@ if [ "$s" = "h" ] then # HTTP header + wl=$((wl - ( ${#L} + 1 ))) case "$L" in + Content-Length:\ *) bl=${L##*: } + bl=${bl%%*([ [:space:]])} + ;; + X-Crawler-Content-Length:\ *) xl=${L##*: } # introduced btw 2015&2018??? + xl=${xl%%*([ [:space:]])} + ;; ) s="b" ; n=0 - if [ -n "$h" -a -n "$b" ] + if [ -z "$b" ] + then + exit 0 + fi + if [ "$h" ] then echo fi + if [ "$xl" ]; then + bl=$xl + xx=x + else + unset xx + fi + if [ "$bl" ]; then + if [ $bl -ne $wl ]; then + echo length mismatch$xx: $f $o $l here: $wl given: $bl trunc: $tr 1>&2 + fi + fi continue ;; - Content-Length:\ *) bl=${L##*: } - bl=${bl%%*([ [:space:]])} - ;; esac - if [ -n "$h" ] + if [ "$h" ] then printf "%s\n" "${L%% }" fi continue else # HTTP body - if [ -n "$b" ] + if [ "$b" ] then #printf "%s\n" "$bl" 1>&2 - head -c "$bl" + head -c "${bl-$wl}" + exit 0 else break fi @@ -94,3 +146,4 @@ cat fi } +done