Mercurial > hg > cc > cirrus_home
comparison bin/ix.sh @ 92:d56465d5c51f
accepts index lines, less line-at-a-time
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 14 Apr 2021 20:15:32 +0000 |
parents | 82c94684f799 |
children | 4d870a7ec871 |
comparison
equal
deleted
inserted
replaced
91:82c94684f799 | 92:d56465d5c51f |
---|---|
1 #!/usr/bin/bash | 1 #!/usr/bin/bash |
2 # Extract records from warc files given filename, length and offset triples | 2 # Extract records from warc files given length, offset and file triples |
3 # from stdin or as command line args | 3 # from stdin or as command line args |
4 # Usage [-w] [-h] [-b] [path length offset] | 4 # Usage [-d] [-w] [-h] [-b] [-x] [length offset path] |
5 # -d Debug output | |
5 # -w WARC headers | 6 # -w WARC headers |
6 # -h HTTP headers | 7 # -h HTTP headers |
7 # -b HTTP body | 8 # -b HTTP body |
8 # No switch defaults to whole record | 9 # No switch defaults to whole record |
10 # -x take lines from a cdx index file as input, extract triples | |
11 if [ "$1" = "-d" ] | |
12 then | |
13 d=1 | |
14 shift | |
15 fi | |
9 if [ "$1" = "-w" ] | 16 if [ "$1" = "-w" ] |
10 then | 17 then |
11 shift | 18 shift |
12 p=1 | 19 p=1 |
13 w=1 | 20 w=1 |
22 then | 29 then |
23 shift | 30 shift |
24 p=1 | 31 p=1 |
25 b=1 | 32 b=1 |
26 fi | 33 fi |
27 if [ -n "$1" ] | 34 if [ "$1" = "-x" ] |
35 then | |
36 # get triples from index lines | |
37 egrep -ao 'length": "[0-9]*", "offset": "[0-9]*".*\.gz'| \ | |
38 sed 's/[a-z]*": "//g;s/", "/\t/g;s/\(crawl-data\|segments\|warc\)\///g' |\ | |
39 if [ "$d" ] ; then tee /tmp/ix_triples.tsv ; else cat ; fi | |
40 elif [ "$1" ] | |
28 then | 41 then |
29 printf "%s\t%s\t%s\n" "$1" "$2" "$3" | 42 printf "%s\t%s\t%s\n" "$1" "$2" "$3" |
30 else | 43 else |
31 cat | 44 cat |
32 fi | \ | 45 fi | \ |
33 while { IFS=$'\t' read f l o; } | 46 while { IFS=$'\t' read l o f; } |
34 do | 47 do |
35 dd if="$f" of=/dev/stdout skip=$o count=$l \ | 48 if [ -z "$d" ] |
36 iflag=skip_bytes,count_bytes 2>/dev/null | 49 then |
37 done | unpigz -dp 1 -c | \ | 50 dd if="$f" of=/dev/stdout skip=$o count=$l \ |
51 iflag=skip_bytes,count_bytes 2>/dev/null | |
52 else | |
53 echo dd if="$f" of=/dev/stdout skip=$o count=$l \ | |
54 iflag=skip_bytes,count_bytes > /tmp/ix_dd_log.txt | |
55 dd if="$f" of=/dev/stdout skip=$o count=$l \ | |
56 iflag=skip_bytes,count_bytes 2>> /tmp/ix_dd_log.txt | |
57 fi | \ | |
58 unpigz -dp 1 -c | tee /tmp/data | \ | |
38 { s="w" | 59 { s="w" |
39 if [ -n "$p" ] | 60 if [ "$p" ] |
40 then | 61 then |
41 shopt -qs extglob # for %%*(...) | 62 shopt -qs extglob # for %%*(...) |
42 while read -r L | 63 while read -r L |
43 do | 64 do |
44 if [ "$s" = "w" ] | 65 if [ "$s" = "w" ] |
45 then | 66 then |
46 # WARC header | 67 # WARC header |
47 if [ "$L" = " | 68 case "$L" in |
48 " ] | 69 Content-Length:\ *) wl=${L##*: } |
49 then | 70 wl=${wl%%*([ |
50 s="h" | 71 [:space:]])} |
51 if [[ -n "$w" && ( -n "$h" || -n "$b" ) ]] | 72 ;; |
52 then | 73 WARC-Truncated:\ *) # echo $n $L |
53 echo | 74 tr=${L##*: } |
54 fi | 75 tr=${tr%%*([ |
55 continue | 76 [:space:]])} |
56 fi | 77 tr=${tr:-EMPTY} |
57 if [ -n "$w" ] | 78 ;; |
79 | |
80 ) s="h" | |
81 if [ -z "$h$b" ] | |
82 then | |
83 exit 0 | |
84 fi | |
85 if [ "$w" ] | |
86 then | |
87 echo | |
88 fi | |
89 continue;; | |
90 esac | |
91 if [ "$w" ] | |
58 then | 92 then |
59 printf "%s\n" "${L%% | 93 printf "%s\n" "${L%% |
60 }" | 94 }" |
61 fi | 95 fi |
62 continue | 96 continue |
63 fi | 97 fi |
64 if [ "$s" = "h" ] | 98 if [ "$s" = "h" ] |
65 then | 99 then |
66 # HTTP header | 100 # HTTP header |
101 wl=$((wl - ( ${#L} + 1 ))) | |
67 case "$L" in | 102 case "$L" in |
68 | |
69 ) s="b" ; n=0 | |
70 if [ -n "$h" -a -n "$b" ] | |
71 then | |
72 echo | |
73 fi | |
74 continue ;; | |
75 Content-Length:\ *) bl=${L##*: } | 103 Content-Length:\ *) bl=${L##*: } |
76 bl=${bl%%*([ | 104 bl=${bl%%*([ |
77 [:space:]])} | 105 [:space:]])} |
78 ;; | 106 ;; |
107 X-Crawler-Content-Length:\ *) xl=${L##*: } # introduced between 2015 and 2018 (unconfirmed) | |
108 xl=${xl%%*([ | |
109 [:space:]])} | |
110 ;; | |
111 | |
112 ) s="b" ; n=0 | |
113 if [ -z "$b" ] | |
114 then | |
115 exit 0 | |
116 fi | |
117 if [ "$h" ] | |
118 then | |
119 echo | |
120 fi | |
121 if [ "$xl" ]; then | |
122 bl=$xl | |
123 xx=x | |
124 else | |
125 unset xx | |
126 fi | |
127 if [ "$bl" ]; then | |
128 if [ $bl -ne $wl ]; then | |
129 echo length mismatch$xx: $f $o $l here: $wl given: $bl trunc: $tr 1>&2 | |
130 fi | |
131 fi | |
132 continue ;; | |
79 esac | 133 esac |
80 if [ -n "$h" ] | 134 if [ "$h" ] |
81 then | 135 then |
82 printf "%s\n" "${L%% | 136 printf "%s\n" "${L%% |
83 }" | 137 }" |
84 fi | 138 fi |
85 continue | 139 continue |
86 else | 140 else |
87 # HTTP body | 141 # HTTP body |
88 if [ -n "$b" ] | 142 if [ "$b" ] |
89 then | 143 then |
90 #printf "%s\n" "$bl" 1>&2 | 144 #printf "%s\n" "$bl" 1>&2 |
91 head -c "$bl" | 145 head -c "${bl-$wl}" |
146 exit 0 | |
92 else | 147 else |
93 break | 148 break |
94 fi | 149 fi |
95 fi | 150 fi |
96 done | 151 done |
97 else | 152 else |
98 # No flags, the whole thing | 153 # No flags, the whole thing |
99 cat | 154 cat |
100 fi | 155 fi |
101 } | 156 } |
157 done |