annotate bin/ix.sh @ 89:90f8f28b2e51

working on flags
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 13 Apr 2021 17:52:31 +0000
parents 464d2dfb99c9
children 5384208a0834
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
88
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
#!/usr/bin/bash
# Extract records from warc files given filename, length and offset triples
#  from stdin or as command line args
# Usage [-w] [-h] [-b] [path length offset]
#  -w WARC headers
#  -h HTTP headers
#  -b HTTP body
#  No switch defaults to whole record
#
# Triples read from stdin are tab-separated, one per line.  Switches may be
#  given in any order and combined (e.g. -w -b); the selected parts of each
#  record are written to stdout in record order.  Header lines are written
#  with their trailing CR removed.  Each triple is expected to address
#  exactly one gzipped record.

# Start from a known state so that same-named variables inherited from the
#  environment cannot switch parts on.
w= h= b= p=
while [ $# -gt 0 ]
do
  case "$1" in
    -w) w=1 ;;
    -h) h=1 ;;
    -b) b=1 ;;
    *) break ;;
  esac
  p=1 # at least one part was requested, so records have to be parsed
  shift
done

# Write the triples to process, one tab-separated triple per line:
#  the (remaining) command line arguments if any, otherwise stdin.
triples() {
  if [ -n "$1" ]
  then
    printf "%s\t%s\t%s\n" "$1" "$2" "$3"
  else
    cat
  fi
}

# Copy $2 bytes starting at byte offset $3 of file $1 to stdout.
#  dd's statistics and error messages are discarded.
slice() {
  dd if="$1" skip="$3" count="$2" \
     iflag=skip_bytes,count_bytes 2>/dev/null
}

# Read one uncompressed WARC record from stdin and write the parts selected
#  by the globals $w (WARC headers), $h (HTTP headers) and $b (HTTP body)
#  to stdout.
# The body is taken to be the number of bytes announced by the HTTP
#  Content-Length header; if that header is missing or not a number, the
#  rest of the input (which includes the record's trailing blank lines)
#  is copied instead.
filter_record() {
  local state=w line text len=
  while IFS= read -r line
  do
    # Header lines end in CRLF; read has only removed the LF
    text=${line%$'\r'}
    case "$state" in
      w) # WARC header
        if [ -z "$text" ]
        then
          state=h
        elif [ -n "$w" ]
        then
          printf "%s\n" "$text"
        fi
        ;;
      h) # HTTP header
        if [ -z "$text" ]
        then
          # HTTP body: starts right here, so it must be copied before
          #  another read consumes its first line.  bash reads a pipe one
          #  byte at a time, so head/cat pick up exactly where read stopped.
          if [ -n "$b" ]
          then
            if [ -n "$len" ]
            then
              head -c "$len"
            else
              cat
            fi
          fi
          return
        fi
        case "$text" in
          [Cc]ontent-[Ll]ength:*)
            len=${text#*:}
            len=${len//[[:space:]]/}
            case "$len" in
              ''|*[!0-9]*) len= ;; # unusable value: treat as missing
            esac
            ;;
        esac
        if [ -n "$h" ]
        then
          printf "%s\n" "$text"
        fi
        ;;
    esac
  done
}

if [ -n "$p" ]
then
  # Decompress and filter each record separately, so the filter always
  #  starts at the beginning of a record and may stop reading early
  triples "$@" |
    while IFS=$'\t' read -r f l o
    do
      slice "$f" "$l" "$o" | unpigz -dp 1 -c | filter_record
    done
else
  # No flags, the whole thing
  triples "$@" |
    while IFS=$'\t' read -r f l o
    do
      slice "$f" "$l" "$o"
    done | unpigz -dp 1 -c
fi