1 #!/usr/bin/bash
2 # Extract records from warc files given filename, length and offset triples
3 # from stdin or as command line args
4 # Usage [-w] [-h] [-b] [path length offset]
5 # -w WARC headers
6 # -h HTTP headers
7 # -b HTTP body
8 # No switch defaults to whole record
# parse_switches ARG...
# Consume the leading output-selection switches from ARG... and record them
# in globals read by the rest of the script:
#   w=1  print WARC headers   (-w)
#   h=1  print HTTP headers   (-h)
#   b=1  print HTTP body      (-b)
#   p=1  at least one switch was given, i.e. the record must be parsed
# Switches may come in any order and may be repeated.  Scanning stops at the
# first argument that is not a switch.  nswitches is set to the number of
# arguments consumed, so the caller can shift them away (a function cannot
# shift the script's own positional parameters).
parse_switches() {
  # Start from a clean slate so values inherited from the environment
  # cannot switch a mode on by accident.
  p='' w='' h='' b=''
  nswitches=0
  while [ "$#" -gt 0 ]; do
    case "$1" in
      -w) w=1 ;;
      -h) h=1 ;;
      -b) b=1 ;;
      *) break ;;
    esac
    p=1
    nswitches=$((nswitches + 1))
    shift
  done
}

parse_switches "$@"
shift "$nswitches"
# emit_selected_parts
# Read a stream of decompressed WARC records on stdin and copy to stdout only
# the parts selected by the globals set during switch parsing:
#   w  non-empty: WARC header lines
#   h  non-empty: HTTP header lines
#   b  non-empty: HTTP body
# Header lines are written with the trailing CR removed; the blank separator
# lines are not written.  The body is copied byte-for-byte.
#
# Records are framed with the Content-Length field of the WARC header (the
# size of the record block, i.e. HTTP headers + blank line + body), so the
# body size is known even when the HTTP response has no Content-Length of its
# own, and any number of concatenated records can be processed.
#
# Relies on bash's read consuming a pipe one byte at a time and on head -c
# reading no more than it is asked for, so the two can share stdin.
emit_selected_parts() {
  local LC_ALL=C # make ${#line} count bytes rather than characters
  local cr=$'\r'
  local state=w   # w: in WARC header, h: in HTTP header
  local in_header='' # non-empty once a WARC header line of this record was seen
  local left=''   # bytes of the record block not yet consumed ('' = unknown)
  local line text

  # IFS= and -r keep leading whitespace and backslashes, which both matter
  # for the byte accounting below.
  while IFS= read -r line; do
    text=${line%"$cr"}

    if [ "$state" = w ]; then
      if [ -z "$text" ]; then
        # Blank lines ahead of the first header line are the CRLF CRLF
        # trailer of the previous record.
        [ -n "$in_header" ] || continue
        in_header=''
        if [ "$left" = 0 ]; then
          left='' # empty block: the next record follows directly
        else
          state=h
        fi
        continue
      fi
      in_header=1
      case "$text" in
        [Cc]ontent-[Ll]ength:*)
          left=${text#*:}
          left=${left//[!0-9]/}
          # Force base 10 so a leading zero is not taken as octal.
          if [ -n "$left" ]; then left=$((10#$left)); fi
          ;;
      esac
      if [ -n "$w" ]; then printf '%s\n' "$text"; fi
      continue
    fi

    # HTTP header.  Account for the line just read (+1 for the LF that read
    # strips) so that what remains of the block is the body.
    if [ -n "$left" ]; then
      left=$((left - ${#line} - 1))
    fi
    if [ -n "$text" ]; then
      if [ -n "$h" ]; then printf '%s\n' "$text"; fi
      if [ -n "$left" ] && [ "$left" -le 0 ]; then
        # Block exhausted without a blank line: no body in this record.
        state=w
        left=''
      fi
      continue
    fi

    # Blank line: the HTTP body starts here.
    if [ -z "$left" ]; then
      # No usable WARC Content-Length, so the end of the body cannot be
      # located; treat the rest of the stream as the body and stop.
      if [ -n "$b" ]; then cat; fi
      break
    fi
    if [ "$left" -gt 0 ]; then
      if [ -n "$b" ]; then
        printf '%s\n' "$left" >&2 # body length, reported on stderr
        head -c "$left"
      else
        head -c "$left" >/dev/null # skip the body to reach the next record
      fi
    fi
    state=w
    left=''
  done
}

# Triples come from the command line (one record) or, failing that, from
# stdin as tab-separated "path length offset" lines.
if [ -n "$1" ]; then
  printf '%s\t%s\t%s\n' "$1" "$2" "$3"
else
  cat
fi |
  while IFS=$'\t' read -r f l o; do
    # Cut the gzip member holding the record out of the WARC file.
    # status=none silences the transfer statistics but keeps real errors.
    dd if="$f" skip="$o" count="$l" iflag=skip_bytes,count_bytes status=none
  done |
  unpigz -dp 1 -c |
  if [ -n "$p" ]; then
    emit_selected_parts
  else
    cat # No flags, the whole thing
  fi
84 fi # No flags,the whole thing