view bin/ix.sh @ 105:baf56ff538f8

convert to rich directory structure per 2019-35
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 20 Apr 2021 11:12:35 +0000
parents 0332076afc37
children 1d6fde73789d
line wrap: on
line source

#!/usr/bin/bash
# Extract records from warc files given length, offset and file triples
#  from stdin or as command line args
# Usage [-d] [-w] [-h] [-b] [-e cmd] [ -x | length offset path ]
#  -d Debug output
#  -w WARC headers
#  -h HTTP headers
#  -b HTTP body
#   No switch defaults to whole record
#  -x take lines from a cdx index file on stdin as input, extract triples
#  -e pipes each result thru cmd
# Option parsing.  Sets:
#  d        non-empty for debug output (-d)
#  p        non-empty if any of -w, -h, -b was given, i.e. only parts of
#           each record are wanted rather than the whole record
#  w, h, b  non-empty to select WARC headers, HTTP headers, HTTP body
#  e        command each result is piped through (-e cmd), default "cat"
# The switches are accepted in any order.  Parsing stops at the first
# argument that is not one of them, so -x or a length/offset/path triple
# is left in the positional parameters for the code that follows.
# The flags are reset first so that same-named variables inherited from
# the environment cannot switch options on.
d=
p=
w=
h=
b=
e="cat"
while [ "$#" -gt 0 ]
do
  case "$1" in
    -d) d=1
        # -f: the debug logs need not exist yet, so don't complain
        rm -f /tmp/ix_dd_log.txt /tmp/ix_triples.tsv
        ;;
    -w) p=1
        w=1
        ;;
    -h) p=1
        h=1
        ;;
    -b) p=1
        b=1
        ;;
    -e) if [ "$#" -lt 2 ]
        then
          printf '%s: -e requires a command\n' "${0##*/}" 1>&2
          exit 2
        fi
        e="$2"
        shift
        ;;
    *)  break
        ;;
  esac
  shift
done
# The trailing-whitespace stripping in _ix_select uses the extended
# glob pattern *(...), so make sure extglob is on.
shopt -qs extglob

# Write $3 bytes starting at byte offset $2 of file $1 to stdout.
# In debug mode ($d non-empty) the dd command line and dd's stderr are
# also appended to /tmp/ix_dd_log.txt.
_ix_fetch() {
  local src=$1 off=$2 len=$3
  if [ -z "$d" ]
  then
    dd if="$src" of=/dev/stdout skip="$off" count="$len" \
       iflag=skip_bytes,count_bytes status=none
  else
    echo dd if="$src" of=/dev/stdout skip="$off" count="$len" \
         iflag=skip_bytes,count_bytes >> /tmp/ix_dd_log.txt
    dd if="$src" of=/dev/stdout skip="$off" count="$len" \
       iflag=skip_bytes,count_bytes 2>> /tmp/ix_dd_log.txt
  fi
}

# Copy the selected parts of one decompressed record from stdin to stdout.
#  $1 $2 $3  length, offset and file of the record, used only in the
#            "length mismatch" diagnostic
# Reads the globals p, w, h, b.  With $p empty the whole record is copied.
# Otherwise the record is read line by line: first the WARC header up to
# a line consisting of just a carriage return, then the HTTP header up to
# the next such line, then the body.  Header lines are printed with their
# trailing carriage return removed.
_ix_select() {
  local len=$1 off=$2 src=$3
  local cr=$'\r'   # lines end in CR LF; read strips only the LF
  local state=w line wl bl trunc
  if [ -z "$p" ]
  then
    # No flags, the whole thing
    cat
    return
  fi
  # NOTE(review): read without IFS= drops leading blanks, and ${#line}
  # counts characters in the current locale, so the length bookkeeping
  # below is approximate for such lines -- confirm whether that matters.
  while read -r line
  do
    if [ "$state" = "w" ]
    then
      # WARC header
      case "$line" in
        Content-Length:\ *)
          wl=${line##*: }
          wl=${wl%%*([[:space:]])}
          ;;
        WARC-Truncated:\ *)
          trunc=${line##*: }
          trunc=${trunc%%*([[:space:]])}
          trunc=${trunc:-EMPTY}
          ;;
        "$cr")
          # End of the WARC header
          state=h
          if [ -z "$h$b" ]
          then
            return 0
          fi
          if [ "$w" ]
          then
            echo
          fi
          continue
          ;;
      esac
      if [ "$w" ]
      then
        printf "%s\n" "${line%"$cr"}"
      fi
      continue
    fi
    # HTTP header.  Every line read here, including the terminating
    # blank one, is subtracted (plus 1 for the LF) from the WARC
    # Content-Length, leaving the length of what follows the header.
    wl=$((wl - ( ${#line} + 1 )))
    case "$line" in
      Content-Length:\ *)
        bl=${line##*: }
        bl=${bl%%*([[:space:]])}
        ;;
      "$cr")
        # End of the HTTP header
        if [ -z "$b" ]
        then
          return 0
        fi
        if [ "$h" ]
        then
          echo
        fi
        if [ "$bl" ] && [ "$bl" -ne "$wl" ]
        then
          echo "length mismatch: $len $off $src here: $wl given: $bl trunc:${trunc:+ $trunc}" 1>&2
        fi
        # HTTP body.  Emitted right here: going round the read loop
        # once more would consume and lose the first line of the body.
        head -c "${bl:-$wl}"
        return 0
        ;;
    esac
    if [ "$h" ]
    then
      printf "%s\n" "${line%"$cr"}"
    fi
  done
}

# Produce one "length<TAB>offset<TAB>path" triple per line, then fetch,
# decompress, filter and post-process the record each triple names.
if [ "$1" = "-x" ]
then
  # get triples from index lines
  grep -E -ao 'length": "[0-9]*", "offset": "[0-9]*".*\.gz' |
    sed 's/[a-z]*": "//g;s/", "/\t/g;s/\(crawl-data\|segments\|warc\)\///g' |
    if [ "$d" ] ; then tee -a /tmp/ix_triples.tsv ; else cat ; fi
elif [ "$1" ]
then
  printf "%s\t%s\t%s\n" "$1" "$2" "$3"
else
  cat
fi |
  while IFS=$'\t' read -r l o wf
  do
    # Split the path on / (and whitespace) without spawning a pipeline
    # and without exposing it to pathname expansion.
    read -r -a ff <<<"${wf//\// }"
    f="/beegfs/common_crawl/${ff[0]}/${ff[1]}/orig/warc/${ff[2]}"
    # Each stage runs in its own subshell, so a failure can only be
    # reported from there; a "continue" could not reach this loop.
    # $e is deliberately unquoted so that a command with arguments works.
    # shellcheck disable=SC2086
    _ix_fetch "$f" "$o" "$l" |
      { unpigz -dp 1 -c ||
          printf "dd failure?: %s %s %s\n" "$f" "$o" "$l" 1>&2 ; } |
      _ix_select "$l" "$o" "$f" |
      $e
  done