Mercurial > hg > cc > cirrus_home
view bin/ix.sh @ 159:c3c3dd60b8a8
demo of slurm usage using cdx2tsv.py
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 06 Jul 2022 18:07:34 +0100 |
parents | 1d6fde73789d |
children |
line wrap: on
line source
#!/usr/bin/bash
# Extract records from warc files given length, offset and file triples
# from stdin or as command line args
#
# Usage: [-d] [-w] [-h] [-b] [-e cmd] [ -x | length offset path ]
#   -d      Debug output: dd command lines and dd's stderr are appended to
#           /tmp/ix_dd_log.txt, triples extracted by -x to /tmp/ix_triples.tsv
#           (both files are removed first)
#   -w      WARC headers
#   -h      HTTP headers
#   -b      HTTP body
#           No -w/-h/-b switch defaults to the whole record
#   -x      take lines from a cdx index file as input, extract triples
#   -e cmd  pipes each result thru cmd (cmd is split on whitespace)
#
# Without -x and without a triple on the command line, tab-separated
# "length offset path" lines are read from stdin.
#
# Environment (optional overrides of the built-in locations):
#   IX_WARC_ROOT  directory holding <crawl>/<segment>/orig/warc/<file>
#   IX_GUNZIP     decompressor, invoked as "$IX_GUNZIP -dc"

readonly WARC_ROOT="${IX_WARC_ROOT:-/beegfs/common_crawl}"
readonly GUNZIP="${IX_GUNZIP:-$HOME/gentoo/usr/bin/igzip}"
readonly DD_LOG=/tmp/ix_dd_log.txt
readonly TRIPLES_LOG=/tmp/ix_triples.tsv

# Option state; an empty string means "off".
debug=''
want_parts='' # set as soon as any of -w / -h / -b is given
want_warc=''
want_http=''
want_body=''
from_cdx=''
post_cmd='cat'

usage() {
  printf 'Usage: %s [-d] [-w] [-h] [-b] [-e cmd] [ -x | length offset path ]\n' \
    "${0##*/}" >&2
  exit 2
}

# Write tab-separated "length offset path" triples to stdout, taken from
# cdx index lines on stdin (-x), from the three arguments, or from stdin as is.
emit_triples() {
  if [[ -n $from_cdx ]]; then
    # get triples from index lines
    grep -Eao 'length": "[0-9]*", "offset": "[0-9]*".*\.gz' |
      sed 's/[a-z]*": "//g;s/", "/\t/g;s/\(crawl-data\|segments\|warc\)\///g' |
      if [[ -n $debug ]]; then tee -a "$TRIPLES_LOG"; else cat; fi
  elif [[ -n ${1-} ]]; then
    printf '%s\t%s\t%s\n' "$1" "${2-}" "${3-}"
  else
    cat
  fi
}

# Copy $1 bytes starting at byte offset $2 of file $3 to stdout.
fetch_record() {
  local len=$1 off=$2 file=$3
  local -a dd_cmd=(dd "if=$file" of=/dev/stdout "skip=$off" "count=$len"
    iflag=skip_bytes,count_bytes)
  if [[ -z $debug ]]; then
    "${dd_cmd[@]}" status=none
  else
    printf '%s\n' "${dd_cmd[*]}" >>"$DD_LOG"
    "${dd_cmd[@]}" 2>>"$DD_LOG"
  fi
}

# Decompress stdin to stdout; $1 $2 $3 (length offset file) are only used
# for the diagnostic.
gunzip_record() {
  local len=$1 off=$2 file=$3 rc
  "$GUNZIP" -dc
  rc=$?
  # 141 = 128 + SIGPIPE: the downstream stage stopped reading on purpose
  # (e.g. only headers were requested), which is not a failure.
  if ((rc != 0 && rc != 141)); then
    printf 'dd failure?: %s %s %s\n' "$file" "$off" "$len" >&2
  fi
}

# Set REPLY to the text after the last ": " of header line $1, with trailing
# whitespace (including the CR of the CRLF line ending) removed.
header_value() {
  REPLY=${1##*: }
  REPLY=${REPLY%"${REPLY##*[![:space:]]}"}
}

# Read one uncompressed WARC record on stdin and write the requested parts
# (-w / -h / -b) to stdout, or the whole record if none was requested.
# $1 $2 $3 (length offset file) are only used for the diagnostic.
filter_record() {
  local len=$1 off=$2 file=$3

  if [[ -z $want_parts ]]; then
    # No flags, the whole thing
    cat
    return
  fi

  # Byte-wise string lengths, so the Content-Length arithmetic below is exact.
  local LC_ALL=C
  local state=warc line msg
  local warc_left='' # WARC Content-Length, then bytes left after each HTTP header line
  local body_len=''  # HTTP Content-Length, if the response declares one
  local truncated='' # value of WARC-Truncated, if present

  # IFS= keeps leading/trailing blanks so that ${#line} is the true line length.
  while IFS= read -r line; do
    case $state in
      warc)
        # WARC header
        case $line in
          'Content-Length: '*)
            header_value "$line"
            warc_left=$REPLY
            ;;
          'WARC-Truncated: '*)
            header_value "$line"
            truncated=${REPLY:-EMPTY}
            ;;
          $'\r' | '')
            # Blank line: end of the WARC header block.
            state=http
            if [[ -z $want_http$want_body ]]; then
              return 0
            fi
            if [[ -n $want_warc ]]; then
              echo
            fi
            continue
            ;;
        esac
        if [[ -n $want_warc ]]; then
          printf '%s\n' "${line%$'\r'}"
        fi
        ;;
      http)
        # HTTP header; +1 accounts for the newline consumed by read.
        warc_left=$((warc_left - (${#line} + 1)))
        case $line in
          'Content-Length: '*)
            header_value "$line"
            body_len=$REPLY
            ;;
          $'\r' | '')
            # Blank line: end of the HTTP header block, body follows.
            if [[ -z $want_body ]]; then
              return 0
            fi
            if [[ -n $want_http ]]; then
              echo
            fi
            if [[ -n $body_len ]] && [ "$body_len" -ne "$warc_left" ]; then
              msg="length mismatch: $len $off $file here: $warc_left given: $body_len trunc: $truncated"
              printf '%s\n' "${msg% }" >&2
            fi
            # Emit the body straight away: going round the loop again would
            # let read swallow (and lose) the first line of the body.
            head -c "${body_len:-$warc_left}"
            return 0
            ;;
        esac
        if [[ -n $want_http ]]; then
          printf '%s\n' "${line%$'\r'}"
        fi
        ;;
    esac
  done
}

main() {
  local opt
  while getopts ':dwhbe:x' opt; do
    case $opt in
      d)
        debug=1
        rm -f -- "$DD_LOG" "$TRIPLES_LOG"
        ;;
      w)
        want_parts=1
        want_warc=1
        ;;
      h)
        want_parts=1
        want_http=1
        ;;
      b)
        want_parts=1
        want_body=1
        ;;
      e) post_cmd=$OPTARG ;;
      x) from_cdx=1 ;;
      *) usage ;;
    esac
  done
  shift $((OPTIND - 1))

  local len off wf file
  local -a parts
  emit_triples "$@" |
    while IFS=$'\t' read -r len off wf; do
      # path is <crawl>/<segment>/<file>; split on '/' without globbing
      read -r -a parts <<<"${wf//\// }"
      file="$WARC_ROOT/${parts[0]}/${parts[1]}/orig/warc/${parts[2]}"
      # shellcheck disable=SC2086 # post_cmd is deliberately word-split
      fetch_record "$len" "$off" "$file" |
        gunzip_record "$len" "$off" "$file" |
        filter_record "$len" "$off" "$file" |
        $post_cmd
    done
}

main "$@"