comparison bin/ix.sh @ 92:d56465d5c51f

accepts index lines, less line-at-a-time
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 14 Apr 2021 20:15:32 +0000
parents 82c94684f799
children 4d870a7ec871
comparison
equal deleted inserted replaced
91:82c94684f799 92:d56465d5c51f
1 #!/usr/bin/bash 1 #!/usr/bin/bash
2 # Extract records from warc files given filename, length and offset triples 2 # Extract records from warc files given length, offset and file triples
3 # from stdin or as command line args 3 # from stdin or as command line args
4 # Usage [-w] [-h] [-b] [path length offset] 4 # Usage [-d] [-w] [-h] [-b] [-x] [length offset path]
5 # -d Debug output
5 # -w WARC headers 6 # -w WARC headers
6 # -h HTTP headers 7 # -h HTTP headers
7 # -b HTTP body 8 # -b HTTP body
8 # No switch defaults to whole record 9 # No switch defaults to whole record
10 # -x take lines from a cdx index file as input, extract triples
11 if [ "$1" = "-d" ]
12 then
13 d=1
14 shift
15 fi
9 if [ "$1" = "-w" ] 16 if [ "$1" = "-w" ]
10 then 17 then
11 shift 18 shift
12 p=1 19 p=1
13 w=1 20 w=1
22 then 29 then
23 shift 30 shift
24 p=1 31 p=1
25 b=1 32 b=1
26 fi 33 fi
27 if [ -n "$1" ] 34 if [ "$1" = "-x" ]
35 then
36 # get triples from index lines
37 egrep -ao 'length": "[0-9]*", "offset": "[0-9]*".*\.gz'| \
38 sed 's/[a-z]*": "//g;s/", "/\t/g;s/\(crawl-data\|segments\|warc\)\///g' |\
39 if [ "$d" ] ; then tee /tmp/ix_triples.tsv ; else cat ; fi
40 elif [ "$1" ]
28 then 41 then
29 printf "%s\t%s\t%s\n" "$1" "$2" "$3" 42 printf "%s\t%s\t%s\n" "$1" "$2" "$3"
30 else 43 else
31 cat 44 cat
32 fi | \ 45 fi | \
33 while { IFS=$'\t' read f l o; } 46 while { IFS=$'\t' read l o f; }
34 do 47 do
35 dd if="$f" of=/dev/stdout skip=$o count=$l \ 48 if [ -z "$d" ]
36 iflag=skip_bytes,count_bytes 2>/dev/null 49 then
37 done | unpigz -dp 1 -c | \ 50 dd if="$f" of=/dev/stdout skip=$o count=$l \
51 iflag=skip_bytes,count_bytes 2>/dev/null
52 else
53 echo dd if="$f" of=/dev/stdout skip=$o count=$l \
54 iflag=skip_bytes,count_bytes > /tmp/ix_dd_log.txt
55 dd if="$f" of=/dev/stdout skip=$o count=$l \
56 iflag=skip_bytes,count_bytes 2>> /tmp/ix_dd_log.txt
57 fi | \
58 unpigz -dp 1 -c | tee /tmp/data | \
38 { s="w" 59 { s="w"
39 if [ -n "$p" ] 60 if [ "$p" ]
40 then 61 then
41 shopt -qs extglob # for %%*(...) 62 shopt -qs extglob # for %%*(...)
42 while read -r L 63 while read -r L
43 do 64 do
44 if [ "$s" = "w" ] 65 if [ "$s" = "w" ]
45 then 66 then
46 # WARC header 67 # WARC header
47 if [ "$L" = " 68 case "$L" in
48 " ] 69 Content-Length:\ *) wl=${L##*: }
49 then 70 wl=${wl%%*([
50 s="h" 71 [:space:]])}
51 if [[ -n "$w" && ( -n "$h" || -n "$b" ) ]] 72 ;;
52 then 73 WARC-Truncated:\ *) # echo $n $L
53 echo 74 tr=${L##*: }
54 fi 75 tr=${tr%%*([
55 continue 76 [:space:]])}
56 fi 77 tr=${tr:-EMPTY}
57 if [ -n "$w" ] 78 ;;
79
80 ) s="h"
81 if [ -z "$h$b" ]
82 then
83 exit 0
84 fi
85 if [ "$w" ]
86 then
87 echo
88 fi
89 continue;;
90 esac
91 if [ "$w" ]
58 then 92 then
59 printf "%s\n" "${L%% 93 printf "%s\n" "${L%%
60 }" 94 }"
61 fi 95 fi
62 continue 96 continue
63 fi 97 fi
64 if [ "$s" = "h" ] 98 if [ "$s" = "h" ]
65 then 99 then
66 # HTTP header 100 # HTTP header
101 wl=$((wl - ( ${#L} + 1 )))
67 case "$L" in 102 case "$L" in
68
69 ) s="b" ; n=0
70 if [ -n "$h" -a -n "$b" ]
71 then
72 echo
73 fi
74 continue ;;
75 Content-Length:\ *) bl=${L##*: } 103 Content-Length:\ *) bl=${L##*: }
76 bl=${bl%%*([ 104 bl=${bl%%*([
77 [:space:]])} 105 [:space:]])}
78 ;; 106 ;;
107 X-Crawler-Content-Length:\ *) xl=${L##*: } # introduced btw 2015&2018???
108 xl=${xl%%*([
109 [:space:]])}
110 ;;
111
112 ) s="b" ; n=0
113 if [ -z "$b" ]
114 then
115 exit 0
116 fi
117 if [ "$h" ]
118 then
119 echo
120 fi
121 if [ "$xl" ]; then
122 bl=$xl
123 xx=x
124 else
125 unset xx
126 fi
127 if [ "$bl" ]; then
128 if [ $bl -ne $wl ]; then
129 echo length mismatch$xx: $f $o $l here: $wl given: $bl trunc: $tr 1>&2
130 fi
131 fi
132 continue ;;
79 esac 133 esac
80 if [ -n "$h" ] 134 if [ "$h" ]
81 then 135 then
82 printf "%s\n" "${L%% 136 printf "%s\n" "${L%%
83 }" 137 }"
84 fi 138 fi
85 continue 139 continue
86 else 140 else
87 # HTTP body 141 # HTTP body
88 if [ -n "$b" ] 142 if [ "$b" ]
89 then 143 then
90 #printf "%s\n" "$bl" 1>&2 144 #printf "%s\n" "$bl" 1>&2
91 head -c "$bl" 145 head -c "${bl-$wl}"
146 exit 0
92 else 147 else
93 break 148 break
94 fi 149 fi
95 fi 150 fi
96 done 151 done
97 else 152 else
98 # No flags, the whole thing 153 # No flags, the whole thing
99 cat 154 cat
100 fi 155 fi
101 } 156 }
157 done