#!/usr/bin/bash
# Extract records from WARC files, given (filename, length, offset) triples
#  read from stdin (one tab-separated triple per line) or given as
#  command line args.
# Usage: [-w] [-h] [-b] [path length offset]
#  -w WARC headers
#  -h HTTP headers
#  -b HTTP body
#  With no switch, the whole record is output.
#######################################
# Parse the leading output-selection switches (-w, -h, -b).
#
# Switches may appear in any order and may be repeated; parsing stops at
# the first word that is not one of them, so the historical order
# "-w -h -b" keeps working and a path is never mistaken for a switch.
#
# Globals written:
#   w, h, b    - "1" when the corresponding switch was given, else empty
#   p          - "1" when at least one switch was given (partial output)
#   nswitches  - number of leading words consumed; the caller shifts by it
# Arguments:
#   the script's command line
#######################################
parse_switches() {
  # Reset explicitly so values inherited from the environment cannot
  # silently change the output mode.
  p='' w='' h='' b=''
  nswitches=0
  while (( $# > 0 )); do
    case "$1" in
      -w) w=1 ;;
      -h) h=1 ;;
      -b) b=1 ;;
      *) break ;;
    esac
    p=1
    nswitches=$((nswitches + 1))
    shift
  done
}

parse_switches "$@"
# A function cannot shift its caller's arguments, so drop the switches here.
shift "$nswitches"
# Print a diagnostic on stderr, prefixed with the script name.
warn() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
}

#######################################
# Write the (path, length, offset) triples to process, one per line,
# tab-separated: the command line triple if a path was given, otherwise
# whatever arrives on stdin.
# Arguments:
#   [path length offset]
#######################################
emit_triples() {
  if [[ -n $1 ]]; then
    printf '%s\t%s\t%s\n' "$1" "$2" "$3"
  else
    cat
  fi
}

#######################################
# Read tab-separated triples on stdin and write the raw (still compressed)
# bytes of each addressed record to stdout.
#######################################
extract_members() {
  local path len off
  # -r keeps backslashes in paths; the || clause handles a final line
  # that has no trailing newline.
  while IFS=$'\t' read -r path len off || [[ -n $path ]]; do
    if [[ -z $path && -z $len && -z $off ]]; then
      continue # blank line
    fi
    if [[ ! $len =~ ^[0-9]+$ || ! $off =~ ^[0-9]+$ ]]; then
      warn "skipping malformed triple: $path $len $off"
      continue
    fi
    # status=none silences only the transfer statistics, so genuine
    # errors (e.g. a missing file) still reach stderr.
    dd if="$path" skip="$off" count="$len" \
      iflag=skip_bytes,count_bytes status=none
  done
}

#######################################
# Filter one record block (HTTP headers, blank line, payload) on stdin.
# Always reads stdin to the end so the caller's byte accounting stays exact.
# Globals read:
#   h - non-empty: print the HTTP header lines (CR stripped)
#   b - non-empty: print the payload unchanged
#######################################
filter_block() {
  local cr=$'\r' line
  while IFS= read -r line || [[ -n $line ]]; do
    line=${line%"$cr"}
    if [[ -z $line ]]; then
      # End of the HTTP headers; separate them from the body if both
      # are being printed.
      if [[ -n $h && -n $b ]]; then
        printf '\n'
      fi
      break
    fi
    if [[ -n $h ]]; then
      printf '%s\n' "$line"
    fi
  done
  if [[ -n $b ]]; then
    # Everything left in the block is the body, including its first line.
    cat
  else
    # Drain the rest: if this reader quit early, the upstream head would
    # stop short and leave block bytes in the record stream.
    cat >/dev/null
  fi
}

#######################################
# Filter a stream of uncompressed WARC records on stdin down to the
# requested parts.
#
# Each record is handled as: header lines, a blank line, a block whose size
# is the WARC Content-Length, then blank lines before the next record.
# Framing by the WARC length means a missing or inaccurate HTTP
# Content-Length cannot truncate the body or run past the end of the record,
# and several records in a row are handled.
# Globals read:
#   p - empty: copy stdin through unchanged (whole records)
#   w - non-empty: print the WARC header lines (CR stripped)
#   h, b - see filter_block
# Returns:
#   1 if a record has no usable WARC Content-Length, else 0
#######################################
filter_records() {
  if [[ -z $p ]]; then
    # No switches: the whole thing.
    cat
    return
  fi

  local cr=$'\r' line lower
  local block_len='' state=gap
  local len_re='^content-length:[[:space:]]*([0-9]+)[[:space:]]*$'
  # bash reads a pipe one byte at a time, so `read` never consumes input
  # beyond the line it returns and `head -c` below starts exactly at the
  # first byte of the block.
  while IFS= read -r line || [[ -n $line ]]; do
    line=${line%"$cr"}
    if [[ -n $line ]]; then
      # WARC header line.
      state=header
      lower=${line,,}
      if [[ $lower =~ $len_re ]]; then
        block_len=${BASH_REMATCH[1]}
      fi
      if [[ -n $w ]]; then
        printf '%s\n' "$line"
      fi
      continue
    fi

    # Blank line: either the trailer of the previous record (skip it) or
    # the end of this record's WARC headers.
    if [[ $state == gap ]]; then
      continue
    fi
    state=gap
    if [[ -z $block_len ]]; then
      warn "record has no valid WARC Content-Length; giving up"
      return 1
    fi
    if [[ -n $w && ( -n $h || -n $b ) ]]; then
      printf '\n'
    fi
    if [[ -n $h || -n $b ]]; then
      head -c "$block_len" | filter_block
    else
      # Only WARC headers wanted: skip the block wholesale.
      head -c "$block_len" >/dev/null
    fi
    block_len=''
  done
}

emit_triples "$@" \
  | extract_members \
  | unpigz -dp 1 -c \
  | filter_records