Mercurial > hg > xemacs-beta
comparison src/regex.h @ 185:3d6bfa290dbd r20-3b19
Import from CVS: tag r20-3b19
author | cvs |
---|---|
date | Mon, 13 Aug 2007 09:55:28 +0200 |
parents | cf808b4c4290 |
children | 41ff10fd062f |
comparison
equal
deleted
inserted
replaced
184:bcd2674570bf | 185:3d6bfa290dbd |
---|---|
32 #include <stddef.h> | 32 #include <stddef.h> |
33 #endif | 33 #endif |
34 | 34 |
35 | 35 |
36 /* The following bits are used to determine the regexp syntax we | 36 /* The following bits are used to determine the regexp syntax we |
37 recognize. The set/not-set meanings are chosen so that Emacs syntax | 37 recognize. The not-set meaning typically corresponds to the syntax |
38 remains the value 0. The bits are given in alphabetical order, and | 38 used by Emacs (the exception is RE_INTERVALS, made for historical |
39 the definitions shifted by one from the previous bit; thus, when we | 39 reasons). The bits are given in alphabetical order, and the |
40 add or remove a bit, only one other definition need change. */ | 40 definitions shifted by one from the previous bit; thus, when we add or |
41 remove a bit, only one other definition need change. */ | |
41 typedef unsigned reg_syntax_t; | 42 typedef unsigned reg_syntax_t; |
42 | 43 |
43 /* If this bit is not set, then \ inside a bracket expression is literal. | 44 /* If this bit is not set, then \ inside a bracket expression is literal. |
44 If set, then such a \ quotes the following character. */ | 45 If set, then such a \ quotes the following character. */ |
45 #define RE_BACKSLASH_ESCAPE_IN_LISTS (1) | 46 #define RE_BACKSLASH_ESCAPE_IN_LISTS (1) |
46 | 47 |
47 /* If this bit is not set, then + and ? are operators, and \+ and \? are | 48 /* If this bit is not set, then + and ? are operators, and \+ and \? are |
48 literals. | 49 literals. |
49 If set, then \+ and \? are operators and + and ? are literals. */ | 50 If set, then \+ and \? are operators and + and ? are literals. */ |
50 #define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1) | 51 #define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1) |
51 | 52 |
52 /* If this bit is set, then character classes are supported. They are: | 53 /* If this bit is set, then character classes are supported. They are: |
53 [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:], | 54 [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:], |
59 expressions, of course). | 60 expressions, of course). |
60 If this bit is not set, then it depends: | 61 If this bit is not set, then it depends: |
61 ^ is an anchor if it is at the beginning of a regular | 62 ^ is an anchor if it is at the beginning of a regular |
62 expression or after an open-group or an alternation operator; | 63 expression or after an open-group or an alternation operator; |
63 $ is an anchor if it is at the end of a regular expression, or | 64 $ is an anchor if it is at the end of a regular expression, or |
64 before a close-group or an alternation operator. | 65 before a close-group or an alternation operator. |
65 | 66 |
66 This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because | 67 This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because |
67 POSIX draft 11.2 says that * etc. in leading positions is undefined. | 68 POSIX draft 11.2 says that * etc. in leading positions is undefined. |
68 We already implemented a previous draft which made those constructs | 69 We already implemented a previous draft which made those constructs |
69 invalid, though, so we haven't changed the code back. */ | 70 invalid, though, so we haven't changed the code back. */ |
70 #define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1) | 71 #define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1) |
71 | 72 |
72 /* If this bit is set, then special characters are always special | 73 /* If this bit is set, then special characters are always special |
73 regardless of where they are in the pattern. | 74 regardless of where they are in the pattern. |
74 If this bit is not set, then special characters are special only in | 75 If this bit is not set, then special characters are special only in |
75 some contexts; otherwise they are ordinary. Specifically, | 76 some contexts; otherwise they are ordinary. Specifically, |
76 * + ? and intervals are only special when not after the beginning, | 77 * + ? and intervals are only special when not after the beginning, |
77 open-group, or alternation operator. */ | 78 open-group, or alternation operator. */ |
78 #define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1) | 79 #define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1) |
79 | 80 |
80 /* If this bit is set, then *, +, ?, and { cannot be first in an re or | 81 /* If this bit is set, then *, +, ?, and { cannot be first in an re or |
92 /* If this bit is set, nonmatching lists [^...] do not match newline. | 93 /* If this bit is set, nonmatching lists [^...] do not match newline. |
93 If not set, they do. */ | 94 If not set, they do. */ |
94 #define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1) | 95 #define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1) |
95 | 96 |
96 /* If this bit is set, either \{...\} or {...} defines an | 97 /* If this bit is set, either \{...\} or {...} defines an |
97 interval, depending on RE_NO_BK_BRACES. | 98 interval, depending on RE_NO_BK_BRACES. |
98 If not set, \{, \}, {, and } are literals. */ | 99 If not set, \{, \}, {, and } are literals. */ |
99 #define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1) | 100 #define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1) |
100 | 101 |
101 /* If this bit is set, +, ? and | aren't recognized as operators. | 102 /* If this bit is set, +, ? and | aren't recognized as operators. |
102 If not set, they are. */ | 103 If not set, they are. */ |
117 | 118 |
118 /* If this bit is set, then \<digit> matches <digit>. | 119 /* If this bit is set, then \<digit> matches <digit>. |
119 If not set, then \<digit> is a back-reference. */ | 120 If not set, then \<digit> is a back-reference. */ |
120 #define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1) | 121 #define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1) |
121 | 122 |
122 /* If this bit is set, then | is an alternation operator, and \| is literal. | 123 /* If this bit is set, then | is an alternation operator, and \| is literal. |
123 If not set, then \| is an alternation operator, and | is literal. */ | 124 If not set, then \| is an alternation operator, and | is literal. */ |
124 #define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1) | 125 #define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1) |
125 | 126 |
126 /* If this bit is set, then an ending range point collating lower | 127 /* If this bit is set, then an ending range point collating lower |
127 than the starting range point, as in [z-a], is invalid. | 128 than the starting range point, as in [z-a], is invalid. |
128 If not set, then when the ending range point collates lower than the | 129 If not set, then when the ending range point collates lower than the |
129 starting range point, the range is ignored. */ | 130 starting range point, the range is ignored. */ |
130 #define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1) | 131 #define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1) |
131 | 132 |
133 /* If this bit is not set, allow minimal matching: | |
134 - a*? and a+? and a?? perform shortest-possible matching (compare with a* | |
135 and a+ and a?, respectively, which perform longest-possible matching) | |
136 - other juxtaposing of * + and ? is rejected. | |
137 If this bit is set, consecutive * + and ?'s are collapsed in a logical | |
138 manner: | |
139 - a*? and a+? are the same as a* | |
140 - a?? is the same as a? | |
141 */ | |
142 #define RE_NO_MINIMAL_MATCHING (RE_NO_EMPTY_RANGES << 1) | |
143 | |
144 /* If this bit is set, succeed as soon as we match the whole pattern, | |
145 without further backtracking. */ | |
146 #define RE_NO_POSIX_BACKTRACKING (RE_NO_MINIMAL_MATCHING << 1) | |
147 | |
148 /* If this bit is not set, (?:re) behaves like (re) (or \(?:re\) behaves like | |
149 \(re\)) except that the matched string is not registered. */ | |
150 #define RE_NO_SHY_GROUPS (RE_NO_POSIX_BACKTRACKING << 1) | |
151 | |
132 /* If this bit is set, then an unmatched ) is ordinary. | 152 /* If this bit is set, then an unmatched ) is ordinary. |
133 If not set, then an unmatched ) is invalid. */ | 153 If not set, then an unmatched ) is invalid. */ |
134 #define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1) | 154 #define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_SHY_GROUPS << 1) |
135 | |
136 /* If this bit is set, succeed as soon as we match the whole pattern, | |
137 without further backtracking. */ | |
138 #define RE_NO_POSIX_BACKTRACKING (RE_UNMATCHED_RIGHT_PAREN_ORD << 1) | |
139 | 155 |
140 /* This global variable defines the particular regexp syntax to use (for | 156 /* This global variable defines the particular regexp syntax to use (for |
141 some interfaces). When a regexp is compiled, the syntax used is | 157 some interfaces). When a regexp is compiled, the syntax used is |
142 stored in the pattern buffer, so changing this does not affect | 158 stored in the pattern buffer, so changing this does not affect |
143 already-compiled regexps. */ | 159 already-compiled regexps. */ |
144 extern reg_syntax_t re_syntax_options; | 160 extern reg_syntax_t re_syntax_options; |
145 | 161 |
146 /* Define combinations of the above bits for the standard possibilities. | 162 /* Define combinations of the above bits for the standard possibilities. |
147 (The [[[ comments delimit what gets put into the Texinfo file, so | 163 (The [[[ comments delimit what gets put into the Texinfo file, so |
148 don't delete them!) */ | 164 don't delete them!) */ |
149 /* [[[begin syntaxes]]] */ | 165 /* [[[begin syntaxes]]] */ |
150 #define RE_SYNTAX_EMACS 0 | 166 #define RE_SYNTAX_EMACS RE_INTERVALS |
151 | 167 |
152 #define RE_SYNTAX_AWK \ | 168 #define RE_SYNTAX_AWK \ |
153 (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \ | 169 (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \ |
154 | RE_NO_BK_PARENS | RE_NO_BK_REFS \ | 170 | RE_NO_BK_PARENS | RE_NO_BK_REFS \ |
155 | RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \ | 171 | RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \ |
156 | RE_UNMATCHED_RIGHT_PAREN_ORD) | 172 | RE_UNMATCHED_RIGHT_PAREN_ORD | RE_NO_SHY_GROUPS \ |
173 | RE_NO_MINIMAL_MATCHING) | |
157 | 174 |
158 #define RE_SYNTAX_POSIX_AWK \ | 175 #define RE_SYNTAX_POSIX_AWK \ |
159 (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS) | 176 (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS) |
160 | 177 |
161 #define RE_SYNTAX_GREP \ | 178 #define RE_SYNTAX_GREP \ |
162 (RE_BK_PLUS_QM | RE_CHAR_CLASSES \ | 179 (RE_BK_PLUS_QM | RE_CHAR_CLASSES \ |
163 | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS \ | 180 | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS \ |
164 | RE_NEWLINE_ALT) | 181 | RE_NEWLINE_ALT | RE_NO_SHY_GROUPS \ |
182 | RE_NO_MINIMAL_MATCHING) | |
165 | 183 |
166 #define RE_SYNTAX_EGREP \ | 184 #define RE_SYNTAX_EGREP \ |
167 (RE_CHAR_CLASSES | RE_CONTEXT_INDEP_ANCHORS \ | 185 (RE_CHAR_CLASSES | RE_CONTEXT_INDEP_ANCHORS \ |
168 | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE \ | 186 | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE \ |
169 | RE_NEWLINE_ALT | RE_NO_BK_PARENS \ | 187 | RE_NEWLINE_ALT | RE_NO_BK_PARENS \ |
170 | RE_NO_BK_VBAR) | 188 | RE_NO_BK_VBAR | RE_NO_SHY_GROUPS \ |
189 | RE_NO_MINIMAL_MATCHING) | |
171 | 190 |
172 #define RE_SYNTAX_POSIX_EGREP \ | 191 #define RE_SYNTAX_POSIX_EGREP \ |
173 (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES) | 192 (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES) |
174 | 193 |
175 /* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */ | 194 /* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */ |
178 #define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC | 197 #define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC |
179 | 198 |
180 /* Syntax bits common to both basic and extended POSIX regex syntax. */ | 199 /* Syntax bits common to both basic and extended POSIX regex syntax. */ |
181 #define _RE_SYNTAX_POSIX_COMMON \ | 200 #define _RE_SYNTAX_POSIX_COMMON \ |
182 (RE_CHAR_CLASSES | RE_DOT_NEWLINE | RE_DOT_NOT_NULL \ | 201 (RE_CHAR_CLASSES | RE_DOT_NEWLINE | RE_DOT_NOT_NULL \ |
183 | RE_INTERVALS | RE_NO_EMPTY_RANGES) | 202 | RE_INTERVALS | RE_NO_EMPTY_RANGES | RE_NO_SHY_GROUPS \ |
203 | RE_NO_MINIMAL_MATCHING) | |
184 | 204 |
185 #define RE_SYNTAX_POSIX_BASIC \ | 205 #define RE_SYNTAX_POSIX_BASIC \ |
186 (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM) | 206 (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM) |
187 | 207 |
188 /* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes | 208 /* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes |
210 (erroneously) define this in other header files, but we want our | 230 (erroneously) define this in other header files, but we want our |
211 value, so remove any previous define. */ | 231 value, so remove any previous define. */ |
212 #ifdef RE_DUP_MAX | 232 #ifdef RE_DUP_MAX |
213 #undef RE_DUP_MAX | 233 #undef RE_DUP_MAX |
214 #endif | 234 #endif |
215 #define RE_DUP_MAX ((1 << 15) - 1) | 235 #define RE_DUP_MAX ((1 << 15) - 1) |
216 | 236 |
217 | 237 |
218 /* POSIX `cflags' bits (i.e., information for `regcomp'). */ | 238 /* POSIX `cflags' bits (i.e., information for `regcomp'). */ |
219 | 239 |
220 /* If this bit is set, then use extended regular expression syntax. | 240 /* If this bit is set, then use extended regular expression syntax. |
222 #define REG_EXTENDED 1 | 242 #define REG_EXTENDED 1 |
223 | 243 |
224 /* If this bit is set, then ignore case when matching. | 244 /* If this bit is set, then ignore case when matching. |
225 If not set, then case is significant. */ | 245 If not set, then case is significant. */ |
226 #define REG_ICASE (REG_EXTENDED << 1) | 246 #define REG_ICASE (REG_EXTENDED << 1) |
227 | 247 |
228 /* If this bit is set, then anchors do not match at newline | 248 /* If this bit is set, then anchors do not match at newline |
229 characters in the string. | 249 characters in the string. |
230 If not set, then anchors do match at newlines. */ | 250 If not set, then anchors do match at newlines. */ |
231 #define REG_NEWLINE (REG_ICASE << 1) | 251 #define REG_NEWLINE (REG_ICASE << 1) |
232 | 252 |
261 REG_ECOLLATE, /* Not implemented. */ | 281 REG_ECOLLATE, /* Not implemented. */ |
262 REG_ECTYPE, /* Invalid character class name. */ | 282 REG_ECTYPE, /* Invalid character class name. */ |
263 REG_EESCAPE, /* Trailing backslash. */ | 283 REG_EESCAPE, /* Trailing backslash. */ |
264 REG_ESUBREG, /* Invalid back reference. */ | 284 REG_ESUBREG, /* Invalid back reference. */ |
265 REG_EBRACK, /* Unmatched left bracket. */ | 285 REG_EBRACK, /* Unmatched left bracket. */ |
266 REG_EPAREN, /* Parenthesis imbalance. */ | 286 REG_EPAREN, /* Parenthesis imbalance. */ |
267 REG_EBRACE, /* Unmatched \{. */ | 287 REG_EBRACE, /* Unmatched \{. */ |
268 REG_BADBR, /* Invalid contents of \{\}. */ | 288 REG_BADBR, /* Invalid contents of \{\}. */ |
269 REG_ERANGE, /* Invalid range end. */ | 289 REG_ERANGE, /* Invalid range end. */ |
270 REG_ESPACE, /* Ran out of memory. */ | 290 REG_ESPACE, /* Ran out of memory. */ |
271 REG_BADRPT, /* No preceding re for repetition op. */ | 291 REG_BADRPT, /* No preceding re for repetition op. */ |
299 | 319 |
300 /* Number of bytes to which `buffer' points. */ | 320 /* Number of bytes to which `buffer' points. */ |
301 unsigned long allocated; | 321 unsigned long allocated; |
302 | 322 |
303 /* Number of bytes actually used in `buffer'. */ | 323 /* Number of bytes actually used in `buffer'. */ |
304 unsigned long used; | 324 unsigned long used; |
305 | 325 |
306 /* Syntax setting with which the pattern was compiled. */ | 326 /* Syntax setting with which the pattern was compiled. */ |
307 reg_syntax_t syntax; | 327 reg_syntax_t syntax; |
308 | 328 |
309 /* Pointer to a fastmap, if any, otherwise zero. re_search uses | 329 /* Pointer to a fastmap, if any, otherwise zero. re_search uses |
343 /* If set, `re_match_2' does not return information about | 363 /* If set, `re_match_2' does not return information about |
344 subexpressions. */ | 364 subexpressions. */ |
345 unsigned no_sub : 1; | 365 unsigned no_sub : 1; |
346 | 366 |
347 /* If set, a beginning-of-line anchor doesn't match at the | 367 /* If set, a beginning-of-line anchor doesn't match at the |
348 beginning of the string. */ | 368 beginning of the string. */ |
349 unsigned not_bol : 1; | 369 unsigned not_bol : 1; |
350 | 370 |
351 /* Similarly for an end-of-line anchor. */ | 371 /* Similarly for an end-of-line anchor. */ |
352 unsigned not_eol : 1; | 372 unsigned not_eol : 1; |
353 | 373 |
450 _RE_ARGS ((struct re_pattern_buffer *buffer, CONST char *string, | 470 _RE_ARGS ((struct re_pattern_buffer *buffer, CONST char *string, |
451 int length, int start, struct re_registers *regs)); | 471 int length, int start, struct re_registers *regs)); |
452 | 472 |
453 | 473 |
454 /* Relates to `re_match' as `re_search_2' relates to `re_search'. */ | 474 /* Relates to `re_match' as `re_search_2' relates to `re_search'. */ |
455 extern int re_match_2 | 475 extern int re_match_2 |
456 _RE_ARGS ((struct re_pattern_buffer *buffer, CONST char *string1, | 476 _RE_ARGS ((struct re_pattern_buffer *buffer, CONST char *string1, |
457 int length1, CONST char *string2, int length2, | 477 int length1, CONST char *string2, int length2, |
458 int start, struct re_registers *regs, int stop)); | 478 int start, struct re_registers *regs, int stop)); |
459 | 479 |
460 | 480 |