comparison src/regex.h @ 185:3d6bfa290dbd r20-3b19

Import from CVS: tag r20-3b19
author cvs
date Mon, 13 Aug 2007 09:55:28 +0200
parents cf808b4c4290
children 41ff10fd062f
comparison
equal deleted inserted replaced
184:bcd2674570bf 185:3d6bfa290dbd
32 #include <stddef.h> 32 #include <stddef.h>
33 #endif 33 #endif
34 34
35 35
36 /* The following bits are used to determine the regexp syntax we 36 /* The following bits are used to determine the regexp syntax we
37 recognize. The set/not-set meanings are chosen so that Emacs syntax 37 recognize. The not-set meaning typically corresponds to the syntax
38 remains the value 0. The bits are given in alphabetical order, and 38 used by Emacs (the exception is RE_INTERVALS, made for historical
39 the definitions shifted by one from the previous bit; thus, when we 39 reasons). The bits are given in alphabetical order, and the
40 add or remove a bit, only one other definition need change. */ 40 definitions shifted by one from the previous bit; thus, when we add or
41 remove a bit, only one other definition need change. */
41 typedef unsigned reg_syntax_t; 42 typedef unsigned reg_syntax_t;
42 43
43 /* If this bit is not set, then \ inside a bracket expression is literal. 44 /* If this bit is not set, then \ inside a bracket expression is literal.
44 If set, then such a \ quotes the following character. */ 45 If set, then such a \ quotes the following character. */
45 #define RE_BACKSLASH_ESCAPE_IN_LISTS (1) 46 #define RE_BACKSLASH_ESCAPE_IN_LISTS (1)
46 47
47 /* If this bit is not set, then + and ? are operators, and \+ and \? are 48 /* If this bit is not set, then + and ? are operators, and \+ and \? are
48 literals. 49 literals.
49 If set, then \+ and \? are operators and + and ? are literals. */ 50 If set, then \+ and \? are operators and + and ? are literals. */
50 #define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1) 51 #define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1)
51 52
52 /* If this bit is set, then character classes are supported. They are: 53 /* If this bit is set, then character classes are supported. They are:
53 [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:], 54 [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:],
59 expressions, of course). 60 expressions, of course).
60 If this bit is not set, then it depends: 61 If this bit is not set, then it depends:
61 ^ is an anchor if it is at the beginning of a regular 62 ^ is an anchor if it is at the beginning of a regular
62 expression or after an open-group or an alternation operator; 63 expression or after an open-group or an alternation operator;
63 $ is an anchor if it is at the end of a regular expression, or 64 $ is an anchor if it is at the end of a regular expression, or
64 before a close-group or an alternation operator. 65 before a close-group or an alternation operator.
65 66
66 This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because 67 This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because
67 POSIX draft 11.2 says that * etc. in leading positions is undefined. 68 POSIX draft 11.2 says that * etc. in leading positions is undefined.
68 We already implemented a previous draft which made those constructs 69 We already implemented a previous draft which made those constructs
69 invalid, though, so we haven't changed the code back. */ 70 invalid, though, so we haven't changed the code back. */
70 #define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1) 71 #define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1)
71 72
72 /* If this bit is set, then special characters are always special 73 /* If this bit is set, then special characters are always special
73 regardless of where they are in the pattern. 74 regardless of where they are in the pattern.
74 If this bit is not set, then special characters are special only in 75 If this bit is not set, then special characters are special only in
75 some contexts; otherwise they are ordinary. Specifically, 76 some contexts; otherwise they are ordinary. Specifically,
76 * + ? and intervals are only special when not after the beginning, 77 * + ? and intervals are only special when not after the beginning,
77 open-group, or alternation operator. */ 78 open-group, or alternation operator. */
78 #define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1) 79 #define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1)
79 80
80 /* If this bit is set, then *, +, ?, and { cannot be first in an re or 81 /* If this bit is set, then *, +, ?, and { cannot be first in an re or
92 /* If this bit is set, nonmatching lists [^...] do not match newline. 93 /* If this bit is set, nonmatching lists [^...] do not match newline.
93 If not set, they do. */ 94 If not set, they do. */
94 #define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1) 95 #define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1)
95 96
96 /* If this bit is set, either \{...\} or {...} defines an 97 /* If this bit is set, either \{...\} or {...} defines an
97 interval, depending on RE_NO_BK_BRACES. 98 interval, depending on RE_NO_BK_BRACES.
98 If not set, \{, \}, {, and } are literals. */ 99 If not set, \{, \}, {, and } are literals. */
99 #define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1) 100 #define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1)
100 101
101 /* If this bit is set, +, ? and | aren't recognized as operators. 102 /* If this bit is set, +, ? and | aren't recognized as operators.
102 If not set, they are. */ 103 If not set, they are. */
117 118
118 /* If this bit is set, then \<digit> matches <digit>. 119 /* If this bit is set, then \<digit> matches <digit>.
119 If not set, then \<digit> is a back-reference. */ 120 If not set, then \<digit> is a back-reference. */
120 #define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1) 121 #define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1)
121 122
122 /* If this bit is set, then | is an alternation operator, and \| is literal. 123 /* If this bit is set, then | is an alternation operator, and \| is literal.
123 If not set, then \| is an alternation operator, and | is literal. */ 124 If not set, then \| is an alternation operator, and | is literal. */
124 #define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1) 125 #define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1)
125 126
126 /* If this bit is set, then an ending range point collating lower 127 /* If this bit is set, then an ending range point collating lower
127 than the starting range point, as in [z-a], is invalid. 128 than the starting range point, as in [z-a], is invalid.
128 If not set, then when the ending range point collates lower than the 129 If not set, then when the ending range point collates lower than the
129 starting range point, the range is ignored. */ 130 starting range point, the range is ignored. */
130 #define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1) 131 #define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1)
131 132
133 /* If this bit is not set, allow minimal matching:
134 - a*? and a+? and a?? perform shortest-possible matching (compare with a*
135 and a+ and a?, respectively, which perform longest-possible matching)
136 - other juxtaposing of * + and ? is rejected.
137 If this bit is set, consecutive * + and ?'s are collapsed in a logical
138 manner:
139 - a*? and a+? are the same as a*
140 - a?? is the same as a?
141 */
142 #define RE_NO_MINIMAL_MATCHING (RE_NO_EMPTY_RANGES << 1)
143
144 /* If this bit is set, succeed as soon as we match the whole pattern,
145 without further backtracking. */
146 #define RE_NO_POSIX_BACKTRACKING (RE_NO_MINIMAL_MATCHING << 1)
147
148 /* If this bit is not set, (?:re) behaves like (re) (or \(?:re\) behaves like
149 \(re\)) except that the matched string is not registered. */
150 #define RE_NO_SHY_GROUPS (RE_NO_POSIX_BACKTRACKING << 1)
151
132 /* If this bit is set, then an unmatched ) is ordinary. 152 /* If this bit is set, then an unmatched ) is ordinary.
133 If not set, then an unmatched ) is invalid. */ 153 If not set, then an unmatched ) is invalid. */
134 #define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1) 154 #define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_SHY_GROUPS << 1)
135
136 /* If this bit is set, succeed as soon as we match the whole pattern,
137 without further backtracking. */
138 #define RE_NO_POSIX_BACKTRACKING (RE_UNMATCHED_RIGHT_PAREN_ORD << 1)
139 155
140 /* This global variable defines the particular regexp syntax to use (for 156 /* This global variable defines the particular regexp syntax to use (for
141 some interfaces). When a regexp is compiled, the syntax used is 157 some interfaces). When a regexp is compiled, the syntax used is
142 stored in the pattern buffer, so changing this does not affect 158 stored in the pattern buffer, so changing this does not affect
143 already-compiled regexps. */ 159 already-compiled regexps. */
144 extern reg_syntax_t re_syntax_options; 160 extern reg_syntax_t re_syntax_options;
145 161
146 /* Define combinations of the above bits for the standard possibilities. 162 /* Define combinations of the above bits for the standard possibilities.
147 (The [[[ comments delimit what gets put into the Texinfo file, so 163 (The [[[ comments delimit what gets put into the Texinfo file, so
148 don't delete them!) */ 164 don't delete them!) */
149 /* [[[begin syntaxes]]] */ 165 /* [[[begin syntaxes]]] */
150 #define RE_SYNTAX_EMACS 0 166 #define RE_SYNTAX_EMACS RE_INTERVALS
151 167
152 #define RE_SYNTAX_AWK \ 168 #define RE_SYNTAX_AWK \
153 (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \ 169 (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \
154 | RE_NO_BK_PARENS | RE_NO_BK_REFS \ 170 | RE_NO_BK_PARENS | RE_NO_BK_REFS \
155 | RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \ 171 | RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \
156 | RE_UNMATCHED_RIGHT_PAREN_ORD) 172 | RE_UNMATCHED_RIGHT_PAREN_ORD | RE_NO_SHY_GROUPS \
173 | RE_NO_MINIMAL_MATCHING)
157 174
158 #define RE_SYNTAX_POSIX_AWK \ 175 #define RE_SYNTAX_POSIX_AWK \
159 (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS) 176 (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS)
160 177
161 #define RE_SYNTAX_GREP \ 178 #define RE_SYNTAX_GREP \
162 (RE_BK_PLUS_QM | RE_CHAR_CLASSES \ 179 (RE_BK_PLUS_QM | RE_CHAR_CLASSES \
163 | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS \ 180 | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS \
164 | RE_NEWLINE_ALT) 181 | RE_NEWLINE_ALT | RE_NO_SHY_GROUPS \
182 | RE_NO_MINIMAL_MATCHING)
165 183
166 #define RE_SYNTAX_EGREP \ 184 #define RE_SYNTAX_EGREP \
167 (RE_CHAR_CLASSES | RE_CONTEXT_INDEP_ANCHORS \ 185 (RE_CHAR_CLASSES | RE_CONTEXT_INDEP_ANCHORS \
168 | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE \ 186 | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE \
169 | RE_NEWLINE_ALT | RE_NO_BK_PARENS \ 187 | RE_NEWLINE_ALT | RE_NO_BK_PARENS \
170 | RE_NO_BK_VBAR) 188 | RE_NO_BK_VBAR | RE_NO_SHY_GROUPS \
189 | RE_NO_MINIMAL_MATCHING)
171 190
172 #define RE_SYNTAX_POSIX_EGREP \ 191 #define RE_SYNTAX_POSIX_EGREP \
173 (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES) 192 (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES)
174 193
175 /* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */ 194 /* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */
178 #define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC 197 #define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC
179 198
180 /* Syntax bits common to both basic and extended POSIX regex syntax. */ 199 /* Syntax bits common to both basic and extended POSIX regex syntax. */
181 #define _RE_SYNTAX_POSIX_COMMON \ 200 #define _RE_SYNTAX_POSIX_COMMON \
182 (RE_CHAR_CLASSES | RE_DOT_NEWLINE | RE_DOT_NOT_NULL \ 201 (RE_CHAR_CLASSES | RE_DOT_NEWLINE | RE_DOT_NOT_NULL \
183 | RE_INTERVALS | RE_NO_EMPTY_RANGES) 202 | RE_INTERVALS | RE_NO_EMPTY_RANGES | RE_NO_SHY_GROUPS \
203 | RE_NO_MINIMAL_MATCHING)
184 204
185 #define RE_SYNTAX_POSIX_BASIC \ 205 #define RE_SYNTAX_POSIX_BASIC \
186 (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM) 206 (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM)
187 207
188 /* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes 208 /* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes
210 (erroneously) define this in other header files, but we want our 230 (erroneously) define this in other header files, but we want our
211 value, so remove any previous define. */ 231 value, so remove any previous define. */
212 #ifdef RE_DUP_MAX 232 #ifdef RE_DUP_MAX
213 #undef RE_DUP_MAX 233 #undef RE_DUP_MAX
214 #endif 234 #endif
215 #define RE_DUP_MAX ((1 << 15) - 1) 235 #define RE_DUP_MAX ((1 << 15) - 1)
216 236
217 237
218 /* POSIX `cflags' bits (i.e., information for `regcomp'). */ 238 /* POSIX `cflags' bits (i.e., information for `regcomp'). */
219 239
220 /* If this bit is set, then use extended regular expression syntax. 240 /* If this bit is set, then use extended regular expression syntax.
222 #define REG_EXTENDED 1 242 #define REG_EXTENDED 1
223 243
224 /* If this bit is set, then ignore case when matching. 244 /* If this bit is set, then ignore case when matching.
225 If not set, then case is significant. */ 245 If not set, then case is significant. */
226 #define REG_ICASE (REG_EXTENDED << 1) 246 #define REG_ICASE (REG_EXTENDED << 1)
227 247
228 /* If this bit is set, then anchors do not match at newline 248 /* If this bit is set, then anchors do not match at newline
229 characters in the string. 249 characters in the string.
230 If not set, then anchors do match at newlines. */ 250 If not set, then anchors do match at newlines. */
231 #define REG_NEWLINE (REG_ICASE << 1) 251 #define REG_NEWLINE (REG_ICASE << 1)
232 252
261 REG_ECOLLATE, /* Not implemented. */ 281 REG_ECOLLATE, /* Not implemented. */
262 REG_ECTYPE, /* Invalid character class name. */ 282 REG_ECTYPE, /* Invalid character class name. */
263 REG_EESCAPE, /* Trailing backslash. */ 283 REG_EESCAPE, /* Trailing backslash. */
264 REG_ESUBREG, /* Invalid back reference. */ 284 REG_ESUBREG, /* Invalid back reference. */
265 REG_EBRACK, /* Unmatched left bracket. */ 285 REG_EBRACK, /* Unmatched left bracket. */
266 REG_EPAREN, /* Parenthesis imbalance. */ 286 REG_EPAREN, /* Parenthesis imbalance. */
267 REG_EBRACE, /* Unmatched \{. */ 287 REG_EBRACE, /* Unmatched \{. */
268 REG_BADBR, /* Invalid contents of \{\}. */ 288 REG_BADBR, /* Invalid contents of \{\}. */
269 REG_ERANGE, /* Invalid range end. */ 289 REG_ERANGE, /* Invalid range end. */
270 REG_ESPACE, /* Ran out of memory. */ 290 REG_ESPACE, /* Ran out of memory. */
271 REG_BADRPT, /* No preceding re for repetition op. */ 291 REG_BADRPT, /* No preceding re for repetition op. */
299 319
300 /* Number of bytes to which `buffer' points. */ 320 /* Number of bytes to which `buffer' points. */
301 unsigned long allocated; 321 unsigned long allocated;
302 322
303 /* Number of bytes actually used in `buffer'. */ 323 /* Number of bytes actually used in `buffer'. */
304 unsigned long used; 324 unsigned long used;
305 325
306 /* Syntax setting with which the pattern was compiled. */ 326 /* Syntax setting with which the pattern was compiled. */
307 reg_syntax_t syntax; 327 reg_syntax_t syntax;
308 328
309 /* Pointer to a fastmap, if any, otherwise zero. re_search uses 329 /* Pointer to a fastmap, if any, otherwise zero. re_search uses
343 /* If set, `re_match_2' does not return information about 363 /* If set, `re_match_2' does not return information about
344 subexpressions. */ 364 subexpressions. */
345 unsigned no_sub : 1; 365 unsigned no_sub : 1;
346 366
347 /* If set, a beginning-of-line anchor doesn't match at the 367 /* If set, a beginning-of-line anchor doesn't match at the
348 beginning of the string. */ 368 beginning of the string. */
349 unsigned not_bol : 1; 369 unsigned not_bol : 1;
350 370
351 /* Similarly for an end-of-line anchor. */ 371 /* Similarly for an end-of-line anchor. */
352 unsigned not_eol : 1; 372 unsigned not_eol : 1;
353 373
450 _RE_ARGS ((struct re_pattern_buffer *buffer, CONST char *string, 470 _RE_ARGS ((struct re_pattern_buffer *buffer, CONST char *string,
451 int length, int start, struct re_registers *regs)); 471 int length, int start, struct re_registers *regs));
452 472
453 473
454 /* Relates to `re_match' as `re_search_2' relates to `re_search'. */ 474 /* Relates to `re_match' as `re_search_2' relates to `re_search'. */
455 extern int re_match_2 475 extern int re_match_2
456 _RE_ARGS ((struct re_pattern_buffer *buffer, CONST char *string1, 476 _RE_ARGS ((struct re_pattern_buffer *buffer, CONST char *string1,
457 int length1, CONST char *string2, int length2, 477 int length1, CONST char *string2, int length2,
458 int start, struct re_registers *regs, int stop)); 478 int start, struct re_registers *regs, int stop));
459 479
460 480