Mercurial > hg > cc > cirrus_home
annotate bin/ix.sh @ 93:4d870a7ec871
support a command to receive each result,
remove use of X-Crawler-Content-Length
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 15 Apr 2021 10:59:25 +0000 |
parents | d56465d5c51f |
children | 0332076afc37 |
rev | line source |
---|---|
88 | 1 #!/usr/bin/bash |
92
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
2 # Extract records from warc files given length, offset and file triples |
88 | 3 # from stdin or as command line args |
93
4d870a7ec871
support a command to receive each result,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
92
diff
changeset
|
4 # Usage: ix.sh [-d] [-w] [-h] [-b] [-e cmd] [ -x | length offset path ] |
92
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
5 # -d Debug output |
89 | 6 # -w WARC headers |
7 # -h HTTP headers | |
8 # -b HTTP body | |
92
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
9 # No switch defaults to whole record |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
10 # -x take lines from a cdx index file as input, extract triples |
93
4d870a7ec871
support a command to receive each result,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
92
diff
changeset
|
11 # -e pipes each result through cmd |
92
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
12 if [ "$1" = "-d" ] |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
13 then |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
14 d=1 |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
15 shift |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
16 fi |
89 | 17 if [ "$1" = "-w" ] |
18 then | |
19 shift | |
20 p=1 | |
21 w=1 | |
22 fi | |
23 if [ "$1" = "-h" ] | |
24 then | |
25 shift | |
26 p=1 | |
27 h=1 | |
28 fi | |
29 if [ "$1" = "-b" ] | |
30 then | |
31 shift | |
32 p=1 | |
33 b=1 | |
34 fi | |
93
4d870a7ec871
support a command to receive each result,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
92
diff
changeset
|
35 e="cat" |
4d870a7ec871
support a command to receive each result,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
92
diff
changeset
|
36 if [ "$1" = "-e" ] |
4d870a7ec871
support a command to receive each result,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
92
diff
changeset
|
37 then |
4d870a7ec871
support a command to receive each result,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
92
diff
changeset
|
38 shift |
4d870a7ec871
support a command to receive each result,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
92
diff
changeset
|
39 e="$1" |
4d870a7ec871
support a command to receive each result,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
92
diff
changeset
|
40 shift |
4d870a7ec871
support a command to receive each result,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
92
diff
changeset
|
41 fi |
92
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
42 if [ "$1" = "-x" ] |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
43 then |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
44 # get triples from index lines |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
45 egrep -ao 'length": "[0-9]*", "offset": "[0-9]*".*\.gz'| \ |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
46 sed 's/[a-z]*": "//g;s/", "/\t/g;s/\(crawl-data\|segments\|warc\)\///g' |\ |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
47 if [ "$d" ] ; then tee /tmp/ix_triples.tsv ; else cat ; fi |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
48 elif [ "$1" ] |
88 | 49 then |
50 printf "%s\t%s\t%s\n" "$1" "$2" "$3" | |
51 else | |
52 cat | |
53 fi | \ | |
92
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
54 while { IFS=$'\t' read l o f; } |
88 | 55 do |
92
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
56 if [ -z "$d" ] |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
57 then |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
58 dd if="$f" of=/dev/stdout skip=$o count=$l \ |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
59 iflag=skip_bytes,count_bytes 2>/dev/null |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
60 else |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
61 echo dd if="$f" of=/dev/stdout skip=$o count=$l \ |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
62 iflag=skip_bytes,count_bytes > /tmp/ix_dd_log.txt |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
63 dd if="$f" of=/dev/stdout skip=$o count=$l \ |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
64 iflag=skip_bytes,count_bytes 2>> /tmp/ix_dd_log.txt |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
65 fi | \ |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
66 unpigz -dp 1 -c | tee /tmp/data | \ |
90 | 67 { s="w" |
92
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
68 if [ "$p" ] |
89 | 69 then |
91 | 70 shopt -qs extglob # for %%*(...) |
89 | 71 while read -r L |
72 do | |
73 if [ "$s" = "w" ] | |
74 then | |
75 # WARC header | |
92
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
76 case "$L" in |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
77 Content-Length:\ *) wl=${L##*: } |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
78 wl=${wl%%*([ [:space:]])} |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
79 ;; |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
80 WARC-Truncated:\ *) # echo $n $L |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
81 tr=${L##*: } |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
82 tr=${tr%%*([ [:space:]])} |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
83 tr=${tr:-EMPTY} |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
84 ;; |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
85 ) s="h" |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
86 if [ -z "$h$b" ] |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
87 then |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
88 exit 0 |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
89 fi |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
90 if [ "$w" ] |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
91 then |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
92 echo |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
93 fi |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
94 continue;; |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
95 esac |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
96 if [ "$w" ] |
89 | 97 then |
98 printf "%s\n" "${L%% }" | |
99 fi | |
100 continue | |
101 fi | |
90 | 102 if [ "$s" = "h" ] |
89 | 103 then |
104 # HTTP header | |
92
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
105 wl=$((wl - ( ${#L} + 1 ))) |
89 | 106 case "$L" in |
92
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
107 Content-Length:\ *) bl=${L##*: } |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
108 bl=${bl%%*([ [:space:]])} |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
109 ;; |
91 | 110 ) s="b" ; n=0 |
92
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
111 if [ -z "$b" ] |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
112 then |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
113 exit 0 |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
114 fi |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
115 if [ "$h" ] |
91 | 116 then |
117 echo | |
118 fi | |
92
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
119 if [ "$bl" ]; then |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
120 if [ $bl -ne $wl ]; then |
93
4d870a7ec871
support a command to receive each result,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
92
diff
changeset
|
121 echo length mismatch: $l $o $f here: $wl given: $bl trunc: $tr 1>&2 |
92
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
122 fi |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
123 fi |
91 | 124 continue ;; |
89 | 125 esac |
92
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
126 if [ "$h" ] |
89 | 127 then |
128 printf "%s\n" "${L%% }" | |
129 fi | |
130 continue | |
131 else | |
132 # HTTP body | |
92
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
133 if [ "$b" ] |
89 | 134 then |
91 | 135 #printf "%s\n" "$bl" 1>&2 |
92
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
136 head -c "${bl-$wl}" |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
137 exit 0 |
89 | 138 else |
139 break | |
140 fi | |
141 fi | |
142 done | |
143 else | |
90 | 144 # No flags, the whole thing |
145 cat | |
146 fi | |
93
4d870a7ec871
support a command to receive each result,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
92
diff
changeset
|
147 } | $e |
92
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
148 done |