#!/usr/bin/bash
# Extract records from WARC files given filename, length and offset triples
# from stdin or as command line args.
#
# Usage: [-w] [-h] [-b] [path length offset]
#   -w  WARC headers
#   -h  HTTP headers
#   -b  HTTP body
# No switch defaults to the whole record.
#
# Without a path argument, tab-separated "path<TAB>length<TAB>offset" lines
# are read from stdin, one record per line.
#
# NOTE(review): when a switch is given, each record's block is parsed as an
# HTTP message and its body length is taken from the HTTP Content-Length
# header -- confirm this holds for the WARC files this is used on.

p=  # non-empty when at least one part-selecting switch was given
w=  # non-empty: print WARC headers
h=  # non-empty: print HTTP headers
b=  # non-empty: print HTTP body

# Switches may appear in any order; the first non-switch word ends parsing.
while :
do
  case "$1" in
    -w) w=1 ;;
    -h) h=1 ;;
    -b) b=1 ;;
    *) break ;;
  esac
  p=1
  shift
done

cr=$'\r'  # header lines are CRLF-terminated; read leaves the CR in place

# Stage 1: one "path<TAB>length<TAB>offset" line per requested record.
if [ -n "$1" ]
then
  printf '%s\t%s\t%s\n' "$1" "$2" "$3"
else
  cat
fi |
# Stage 2: copy the compressed bytes of each record.
while IFS=$'\t' read -r f l o
do
  # stderr is discarded to silence dd's transfer statistics.
  dd if="$f" of=/dev/stdout skip="$o" count="$l" \
    iflag=skip_bytes,count_bytes 2>/dev/null
done |
# Stage 3: decompress the concatenated gzip members.
unpigz -dp 1 -c |
# Stage 4: select the requested parts. This must be a single compound
# command so that it is the reader of the decompressed stream.
{
  if [ -n "$p" ]
  then
    s=w    # parser state: w = in WARC headers, h = in HTTP headers
    seen=  # non-empty once a WARC header line of this record was read
    bl=    # HTTP Content-Length of the current record
    while IFS= read -r L
    do
      if [ "$s" = "w" ]
      then
        # WARC header
        if [ "$L" = "$cr" ] || [ -z "$L" ]
        then
          # Blank lines before the first header line are the padding that
          # follows the previous record's body; skip them.
          [ -n "$seen" ] || continue
          s=h
          seen=
          bl=
          continue
        fi
        seen=1
        if [ -n "$w" ]
        then
          printf '%s\n' "${L%"$cr"}"
        fi
        continue
      fi

      # HTTP header
      case "$L" in
        "$cr" | "")
          # End of the HTTP headers: the body starts at the very next byte,
          # so it has to be consumed here, before read is called again.
          case "$bl" in
            '' | *[!0-9]*)
              printf '%s\n' "missing or invalid Content-Length, stopping" >&2
              break
              ;;
          esac
          if [ -n "$b" ]
          then
            head -c "$bl"
          else
            # Skip the body so the next record can be parsed.
            head -c "$bl" >/dev/null
          fi
          s=w
          continue
          ;;
        [Cc]ontent-[Ll]ength:*)
          bl=${L#*:}
          bl=${bl//[[:space:]]/}  # drops the leading blank and trailing CR
          ;;
      esac
      if [ -n "$h" ]
      then
        printf '%s\n' "${L%"$cr"}"
      fi
    done
  else
    cat  # No flags, the whole thing
  fi
}