#!/usr/bin/bash
# ix.sh -- extract records from gzipped WARC files.
#
# Provenance: bin/ix.sh @ 92:d56465d5c51f (Mercurial > hg > cc > cirrus_home)
#   author: Henry S. Thompson <ht@inf.ed.ac.uk>
#   date:   Wed, 14 Apr 2021 20:15:32 +0000
#   parents 82c94684f799, children 4d870a7ec871
#   "accepts index lines, less line-at-a-time"
#
# Extract records from warc files given length, offset and file triples
# from stdin or as command line args
# Usage [-d] [-w] [-h] [-b] [-x] [length offset path]
#   -d Debug output
#   -w WARC headers
#   -h HTTP headers
#   -b HTTP body
#    No switch defaults to whole record
#   -x take lines from a cdx index file as input, extract triples
#
# Triples on stdin are tab-separated: length<TAB>offset<TAB>path.
#
# Debug artefacts (fixed names, kept so existing workflows still find them):
#   /tmp/ix_triples.tsv  triples extracted from index lines (-d with -x)
#   /tmp/ix_dd_log.txt   dd command lines and dd's stderr (-d)
#   /tmp/data            uncompressed copy of the most recent record (always)
#
# Strict mode (set -euo pipefail) is deliberately NOT enabled: the header
# filter stops reading early by design, which makes upstream pipeline stages
# die of SIGPIPE, and several variables are intentionally tested while unset.

# Needed for the *(...) patterns used when trimming header values.
shopt -qs extglob

usage() {
  printf 'Usage: %s [-d] [-w] [-h] [-b] [-x] [length offset path]\n' \
    "${0##*/}" >&2
  exit 2
}

# d: debug; w/h/b: which parts to print; p: "some part was selected";
# x: input is cdx index lines rather than triples.
d='' w='' h='' b='' x='' p=''
while getopts ':dwhbx' opt; do
  case "$opt" in
    d) d=1 ;;
    w) w=1 p=1 ;;
    h) h=1 p=1 ;;
    b) b=1 p=1 ;;
    x) x=1 ;;
    *) usage ;;
  esac
done
shift $((OPTIND - 1))

#######################################
# Write length/offset/path triples, one per line, tab-separated.
# Globals:   x, d (read)
# Arguments: optionally: length offset path
# Inputs:    stdin (index lines with -x, ready-made triples with no args)
# Outputs:   triples on stdout; with -x and -d also /tmp/ix_triples.tsv
#######################################
emit_triples() {
  if [ "$x" ]; then
    # get triples from index lines: keep the length/offset/filename tail of
    # each line, drop the JSON keys, turn the separators into tabs and remove
    # the crawl-data/, segments/ and warc/ path components.
    grep -Eao 'length": "[0-9]*", "offset": "[0-9]*".*\.gz' \
      | sed 's/[a-z]*": "//g;s/", "/\t/g;s/\(crawl-data\|segments\|warc\)\///g' \
      | if [ "$d" ]; then tee /tmp/ix_triples.tsv; else cat; fi
  elif [ "$1" ]; then
    if [ "$#" -lt 3 ]; then
      usage
    fi
    printf '%s\t%s\t%s\n' "$1" "$2" "$3"
  else
    cat
  fi
}

#######################################
# Copy one compressed record (a byte range of a file) to stdout.
# Globals:   d (read)
# Arguments: $1 length in bytes, $2 offset in bytes, $3 path
# Outputs:   raw bytes on stdout; with -d the dd command line and dd's
#            stderr are appended to /tmp/ix_dd_log.txt
#######################################
fetch_record() {
  local len=$1 off=$2 path=$3
  if [ -z "$d" ]; then
    # dd's transfer statistics are noise here, hence 2>/dev/null.
    dd if="$path" of=/dev/stdout skip="$off" count="$len" \
      iflag=skip_bytes,count_bytes 2>/dev/null
  else
    printf '%s\n' \
      "dd if=$path of=/dev/stdout skip=$off count=$len iflag=skip_bytes,count_bytes" \
      >>/tmp/ix_dd_log.txt
    dd if="$path" of=/dev/stdout skip="$off" count="$len" \
      iflag=skip_bytes,count_bytes 2>>/tmp/ix_dd_log.txt
  fi
}

#######################################
# Print the selected parts of one uncompressed record read from stdin.
# Globals:   p, w, h, b (read)
# Arguments: $1 length, $2 offset, $3 path -- used only in diagnostics
# Outputs:   selected parts on stdout; a "length mismatch" line on stderr
#            when the HTTP (or X-Crawler-) Content-Length disagrees with what
#            is left of the WARC Content-Length after the HTTP headers
#
# NOTE(review): the annotate listing this was recovered from shows an empty
# case pattern for the end-of-headers line and "${L%% }" when printing header
# lines. Both are reconstructed here as a carriage return, on the assumption
# that the original contained literal CR bytes (header lines end in CRLF, and
# the length arithmetic below counts one extra byte per line for the LF).
# Confirm against the repository copy.
#######################################
filter_record() {
  local rec_len=$1 rec_off=$2 rec_path=$3
  local state=w line wl bl xl xx trunc
  # Byte semantics: ${#line} must count bytes for the length bookkeeping.
  local LC_ALL=C

  if [ -z "$p" ]; then
    # No flags, the whole thing
    cat
    return 0
  fi

  # IFS= keeps leading blanks so ${#line} matches the bytes actually read.
  while IFS= read -r line; do
    case "$state" in
      w)
        # WARC header
        case "$line" in
          Content-Length:\ *)
            wl=${line##*: }
            wl=${wl%%*([[:space:]])}
            ;;
          WARC-Truncated:\ *)
            trunc=${line##*: }
            trunc=${trunc%%*([[:space:]])}
            trunc=${trunc:-EMPTY}
            ;;
          $'\r')
            # Blank line: end of the WARC headers.
            state=h
            if [ -z "$h$b" ]; then
              return 0
            fi
            if [ "$w" ]; then
              echo
            fi
            continue
            ;;
        esac
        if [ "$w" ]; then
          printf '%s\n' "${line%$'\r'}"
        fi
        ;;
      h)
        # HTTP header: account for this line (plus its LF) so that wl ends
        # up as the number of bytes left for the body.
        wl=$((wl - (${#line} + 1)))
        case "$line" in
          Content-Length:\ *)
            bl=${line##*: }
            bl=${bl%%*([[:space:]])}
            ;;
          X-Crawler-Content-Length:\ *) # introduced btw 2015&2018???
            xl=${line##*: }
            xl=${xl%%*([[:space:]])}
            ;;
          $'\r')
            # Blank line: end of the HTTP headers, the body follows.
            if [ -z "$b" ]; then
              return 0
            fi
            if [ "$h" ]; then
              echo
            fi
            if [ "$xl" ]; then
              # The crawler-recorded length takes precedence; the 'x'
              # suffix in the diagnostic shows which header was used.
              bl=$xl
              xx=x
            fi
            if [ "$bl" ] && [ "$bl" -ne "$wl" ]; then
              printf '%s\n' \
                "length mismatch$xx: $rec_path $rec_off $rec_len here: $wl given: $bl trunc:${trunc:+ $trunc}" \
                >&2
            fi
            # Emit the body straight away. Going round the read loop once
            # more would swallow the first body line (or, for a body with
            # no newline, all of it).
            head -c "${bl:-$wl}"
            return 0
            ;;
        esac
        if [ "$h" ]; then
          printf '%s\n' "${line%$'\r'}"
        fi
        ;;
    esac
  done
}

if [ "$d" ]; then
  # Start one fresh dd log per run; records append to it.
  : >/tmp/ix_dd_log.txt
fi

emit_triples "$@" \
  | while IFS=$'\t' read -r l o f; do
    # NOTE(review): /tmp/data is written for every record even without -d,
    # as in the recovered listing; it looks like a debugging aid.
    fetch_record "$l" "$o" "$f" \
      | unpigz -dp 1 -c \
      | tee /tmp/data \
      | filter_record "$l" "$o" "$f"
  done