Mercurial > hg > cc > cirrus_home
annotate bin/ix.sh @ 177:354dae8aeb80
moved to work tree
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 05 Jul 2023 14:52:00 +0100 |
parents | 1d6fde73789d |
children |
rev | line source |
---|---|
88 | 1 #!/usr/bin/bash |
92
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
2 # Extract records from warc files given length, offset and file triples |
88 | 3 # from stdin or as command line args |
93
4d870a7ec871
support a command to receive each result,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
92
diff
changeset
|
4 # Usage [-d] [-w] [-h] [-b] [-e cmd] [ -x | length offset path ] |
92
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
5 # -d Debug output |
89 | 6 # -w WARC headers |
7 # -h HTTP headers | |
8 # -b HTTP body | |
92
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
9 # No switch defaults to whole record |
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
10 # -x take lines from a cdx index file as input, extract triples |
93
4d870a7ec871
support a command to receive each result,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
92
diff
changeset
|
11 # -e pipes each result thru cmd |
92
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
#######################################
# Parse the leading option flags of the command line.
#
# Flags may be given in any order and are consumed until the first word
# that is not one of -d -w -h -b -e (for example -x or a length).
#
# Globals written:
#   d  non-empty => debug mode (-d); stale debug logs in /tmp are removed
#   w  non-empty => output WARC headers (-w)
#   h  non-empty => output HTTP headers (-h)
#   b  non-empty => output HTTP body (-b)
#   p  non-empty => at least one of -w/-h/-b was given
#   e  command each result is piped through (-e cmd); defaults to "cat"
#   ix_nshift  number of words consumed, for the caller to shift away
# Arguments: the script's positional parameters
# Exits:     2 if -e is not followed by a non-empty command
#######################################
ix_parse_flags() {
  # Start from a known state so same-named environment variables cannot
  # switch modes on by accident.
  d='' p='' w='' h='' b=''
  e="cat"
  ix_nshift=0
  while [ $# -gt 0 ]; do
    case "$1" in
      -d)
        d=1
        # -f: the logs do not exist on the first debug run
        rm -f /tmp/ix_dd_log.txt /tmp/ix_triples.tsv
        ;;
      -w) p=1 w=1 ;;
      -h) p=1 h=1 ;;
      -b) p=1 b=1 ;;
      -e)
        # An empty command would silently swallow every result.
        if [ -z "${2:-}" ]; then
          printf '%s\n' "-e requires a command argument" 1>&2
          exit 2
        fi
        e="$2"
        shift
        ix_nshift=$((ix_nshift + 1))
        ;;
      *) break ;;
    esac
    shift
    ix_nshift=$((ix_nshift + 1))
  done
}

ix_parse_flags "$@"
# Drop the consumed flags; what is left is -x, a triple, or nothing.
shift "$ix_nshift"
92
d56465d5c51f
accepts index lines, less line-at-a-time
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
#######################################
# Filter one decompressed WARC record arriving on stdin.
#
# With no part selected ($p empty) the record is copied through unchanged.
# Otherwise the record is read line by line through its WARC header and
# then its HTTP header, printing the parts selected by $w and $h with the
# trailing carriage return removed; if $b is set the HTTP body is copied
# verbatim once the HTTP header ends.
#
# Globals read: p w h b (part selection); l o f (only for the
#               length-mismatch diagnostic)
# Outputs:      selected parts on stdout; "length mismatch" on stderr when
#               the HTTP Content-Length differs from what is left of the
#               WARC Content-Length after the HTTP header
# NOTE(review): the end of each header block is taken to be a line holding
# only a carriage return, written $'\r' here -- confirm against the
# repository copy of the script.
#######################################
ix_filter_record() {
  if [ -z "$p" ]; then
    # No flags, the whole thing
    cat
    return 0
  fi
  # Byte semantics, so ${#line} measures bytes when doing length accounting
  local LC_ALL=C
  local cr=$'\r'
  local state="w" # w: in WARC header, h: in HTTP header
  local line wl='' bl='' tr=''
  # IFS= keeps leading whitespace, so line lengths are exact
  while IFS= read -r line; do
    if [ "$state" = "w" ]; then
      # WARC header
      case "$line" in
        "Content-Length: "*)
          wl=${line##*: }
          wl=${wl%"${wl##*[![:space:]]}"} # strip trailing whitespace / CR
          ;;
        "WARC-Truncated: "*)
          tr=${line##*: }
          tr=${tr%"${tr##*[![:space:]]}"}
          tr=${tr:-EMPTY}
          ;;
        "$cr")
          state="h"
          if [ -z "$h$b" ]; then
            # Only the WARC header was wanted
            return 0
          fi
          if [ "$w" ]; then
            echo
          fi
          continue
          ;;
      esac
      if [ "$w" ]; then
        printf "%s\n" "${line%"$cr"}"
      fi
      continue
    fi
    # HTTP header: subtract this line (plus its newline) from the WARC
    # payload length, leaving the expected body length in wl.
    wl=$((wl - (${#line} + 1)))
    case "$line" in
      "Content-Length: "*)
        bl=${line##*: }
        bl=${bl%"${bl##*[![:space:]]}"}
        ;;
      "$cr")
        if [ -z "$b" ]; then
          return 0
        fi
        if [ "$h" ]; then
          echo
        fi
        if [ "$bl" ] && [ "$bl" -ne "$wl" ]; then
          printf 'length mismatch: %s %s %s here: %s given: %s trunc:%s\n' \
            "$l" "$o" "$f" "$wl" "$bl" "${tr:+ $tr}" 1>&2
        fi
        # HTTP body: hand the rest of the stream to head right away.
        # Going round the loop again would let `read` consume, and so
        # drop, the first line of the body.
        head -c "${bl:-$wl}"
        return 0
        ;;
    esac
    if [ "$h" ]; then
      printf "%s\n" "${line%"$cr"}"
    fi
  done
}

# Stage 1: produce length<TAB>offset<TAB>path triples, one per line
if [ "$1" = "-x" ]; then
  # get triples from index lines (in debug mode also logged to /tmp)
  grep -Eao 'length": "[0-9]*", "offset": "[0-9]*".*\.gz' |
    sed 's/[a-z]*": "//g;s/", "/\t/g;s/\(crawl-data\|segments\|warc\)\///g' |
    if [ "$d" ]; then tee -a /tmp/ix_triples.tsv; else cat; fi
elif [ "$1" ]; then
  # a single triple given as command line args
  printf "%s\t%s\t%s\n" "$1" "$2" "$3"
else
  # triples supplied on stdin
  cat
fi |
  # Stage 2: for each triple, cut the record out of its warc file,
  # decompress it, filter it and pass the result to the -e command.
  while IFS=$'\t' read -r l o wf; do
    # Split the path on "/" without word-splitting or globbing it
    IFS=/ read -r -a ff <<<"$wf"
    f="/beegfs/common_crawl/${ff[0]}/${ff[1]}/orig/warc/${ff[2]}"
    if [ -z "$d" ]; then
      dd if="$f" of=/dev/stdout skip="$o" count="$l" \
        iflag=skip_bytes,count_bytes status=none
    else
      # Debug: log the command line and dd's own stderr
      echo dd if="$f" of=/dev/stdout skip="$o" count="$l" \
        iflag=skip_bytes,count_bytes >>/tmp/ix_dd_log.txt
      dd if="$f" of=/dev/stdout skip="$o" count="$l" \
        iflag=skip_bytes,count_bytes 2>>/tmp/ix_dd_log.txt
    fi |
      # presumably gzip-style -dc: decompress stdin to stdout
      { ~/gentoo/usr/bin/igzip -dc ||
        printf "dd failure?: %s %s %s\n" "$f" "$o" "$l" 1>&2; } |
      ix_filter_record |
      # $e is deliberately unquoted so "-e 'cmd arg'" splits into words
      # shellcheck disable=SC2086
      $e
  done