annotate bin/ix.sh @ 91:82c94684f799

working with one input
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 14 Apr 2021 10:08:41 +0000
parents 5384208a0834
children d56465d5c51f
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
88
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/usr/bin/bash
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 # Extract records from WARC files given filename, length and offset triples
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 # from stdin (tab-separated, one triple per line) or as command line args
89
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
4 # Usage: ix.sh [-w] [-h] [-b] [path length offset]
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
5 # -w WARC headers
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
6 # -h HTTP headers
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
7 # -b HTTP body
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
8 # No switch defaults to whole record; switches are only recognised in the order -w -h -b
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
9 if [ "$1" = "-w" ]
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
10 then
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
11 shift
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
12 p=1
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
13 w=1
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
14 fi
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
15 if [ "$1" = "-h" ]
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
16 then
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
17 shift
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
18 p=1
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
19 h=1
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
20 fi
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
21 if [ "$1" = "-b" ]
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
22 then
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
23 shift
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
24 p=1
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
25 b=1
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
26 fi
88
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
27 if [ -n "$1" ]
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
28 then
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
29 printf "%s\t%s\t%s\n" "$1" "$2" "$3"
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
30 else
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
31 cat
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
32 fi | \
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
33 while { IFS=$'\t' read f l o; }
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
34 do
89
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
35 dd if="$f" of=/dev/stdout skip=$o count=$l \
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
36 iflag=skip_bytes,count_bytes 2>/dev/null
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
37 done | unpigz -dp 1 -c | \
90
5384208a0834 -w and -h working
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 89
diff changeset
38 { s="w"
89
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
39 if [ -n "$p" ]
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
40 then
91
82c94684f799 working with one input
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
41 shopt -qs extglob # for %%*(...)
89
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
42 while read -r L
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
43 do
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
44 if [ "$s" = "w" ]
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
45 then
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
46 # WARC header
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
47 if [ "$L" = " " ]
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
48 then
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
49 s="h"
91
82c94684f799 working with one input
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
50 if [[ -n "$w" && ( -n "$h" || -n "$b" ) ]]
82c94684f799 working with one input
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
51 then
82c94684f799 working with one input
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
52 echo
82c94684f799 working with one input
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
53 fi
89
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
54 continue
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
55 fi
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
56 if [ -n "$w" ]
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
57 then
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
58 printf "%s\n" "${L%% }"
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
59 fi
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
60 continue
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
61 fi
90
5384208a0834 -w and -h working
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 89
diff changeset
62 if [ "$s" = "h" ]
89
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
63 then
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
64 # HTTP header
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
65 case "$L" in
91
82c94684f799 working with one input
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
66 ) s="b" ; n=0
82c94684f799 working with one input
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
67 if [ -n "$h" -a -n "$b" ]
82c94684f799 working with one input
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
68 then
82c94684f799 working with one input
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
69 echo
82c94684f799 working with one input
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
70 fi
82c94684f799 working with one input
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
71 continue ;;
89
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
72 Content-Length:\ *) bl=${L##*: }
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
73 bl=${bl%%*([ [:space:]])}
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
74 ;;
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
75 esac
90
5384208a0834 -w and -h working
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 89
diff changeset
76 if [ -n "$h" ]
89
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
77 then
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
78 printf "%s\n" "${L%% }"
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
79 fi
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
80 continue
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
81 else
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
82 # HTTP body
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
83 if [ -n "$b" ]
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
84 then
91
82c94684f799 working with one input
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 90
diff changeset
85 #printf "%s\n" "$bl" 1>&2
89
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
86 head -c "$bl"
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
87 else
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
88 break
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
89 fi
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
90 fi
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
91 done
90f8f28b2e51 working on flags
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 88
diff changeset
92 else
90
5384208a0834 -w and -h working
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 89
diff changeset
93 # No flags, the whole thing
5384208a0834 -w and -h working
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 89
diff changeset
94 cat
5384208a0834 -w and -h working
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 89
diff changeset
95 fi
5384208a0834 -w and -h working
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 89
diff changeset
96 }