Mercurial > hg > xemacs-beta
annotate src/regex.c @ 5041:efaa6cd845e5
add regexp-debugging
-------------------- ChangeLog entries follow: --------------------
src/ChangeLog addition:
2010-02-15 Ben Wing <ben@xemacs.org>
* regex.c:
* regex.c (DEBUG_FAIL_PRINT1):
* regex.c (PUSH_FAILURE_POINT):
* regex.c (POP_FAILURE_POINT):
* regex.c (regex_compile):
* regex.c (re_match_2_internal):
* regex.h:
* search.c:
* search.c (search_buffer):
* search.c (debug_regexps_changed):
* search.c (vars_of_search):
Add an internal variable debug_regexps and a corresponding Lisp
variable `debug-regexps' that takes a list of areas in which to
display debugging info about regex compilation and matching
(currently three areas exist). Use existing debugging code
already in regex.c and modify it so that it recognizes the
debug_regexps variable and the flags in it.
Rename variable `debug-xemacs-searches' to just `debug-searches',
consistent with other debug vars.
tests/ChangeLog addition:
2010-02-15 Ben Wing <ben@xemacs.org>
* automated/search-tests.el (let):
* automated/search-tests.el (boundp):
debug-xemacs-searches renamed to debug-searches.
author | Ben Wing <ben@xemacs.org> |
---|---|
date | Mon, 15 Feb 2010 21:51:22 -0600 |
parents | 16112448d484 |
children | 308d34e9f07d |
rev | line source |
---|---|
428 | 1 /* Extended regular expression matching and search library, |
2 version 0.12, extended for XEmacs. | |
3 (Implements POSIX draft P10003.2/D11.2, except for | |
4 internationalization features.) | |
5 | |
6 Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc. | |
7 Copyright (C) 1995 Sun Microsystems, Inc. | |
5041 | 8 Copyright (C) 1995, 2001, 2002, 2003, 2010 Ben Wing. |
428 | 9 |
10 This program is free software; you can redistribute it and/or modify | |
11 it under the terms of the GNU General Public License as published by | |
12 the Free Software Foundation; either version 2, or (at your option) | |
13 any later version. | |
14 | |
15 This program is distributed in the hope that it will be useful, | |
16 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
18 GNU General Public License for more details. | |
19 | |
20 You should have received a copy of the GNU General Public License | |
21 along with this program; see the file COPYING. If not, write to | |
22 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
23 Boston, MA 02111-1307, USA. */ | |
24 | |
25 /* Synched up with: FSF 19.29. */ | |
26 | |
27 #ifdef HAVE_CONFIG_H | |
28 #include <config.h> | |
29 #endif | |
30 | |
31 #ifndef _GNU_SOURCE | |
32 #define _GNU_SOURCE 1 | |
33 #endif | |
34 | |
35 /* We assume non-Mule if emacs isn't defined. */ | |
36 #ifndef emacs | |
37 #undef MULE | |
38 #endif | |
39 | |
771 | 40 /* XEmacs addition */ |
41 #ifdef REL_ALLOC | |
42 #define REGEX_REL_ALLOC /* may be undefined below */ | |
43 #endif | |
44 | |
428 | 45 /* XEmacs: define this to add in a speedup for patterns anchored at |
46 the beginning of a line. Keep the ifdefs so that it's easier to | |
47 tell where/why this code has diverged from v19. */ | |
48 #define REGEX_BEGLINE_CHECK | |
49 | |
50 /* XEmacs: the current mmap-based ralloc handles small blocks very | |
51 poorly, so we disable it here. */ | |
52 | |
771 | 53 #if defined (HAVE_MMAP) || defined (DOUG_LEA_MALLOC) |
54 # undef REGEX_REL_ALLOC | |
428 | 55 #endif |
56 | |
57 /* The `emacs' switch turns on certain matching commands | |
58 that make sense only in Emacs. */ | |
59 #ifdef emacs | |
60 | |
61 #include "lisp.h" | |
62 #include "buffer.h" | |
63 #include "syntax.h" | |
64 | |
65 #if (defined (DEBUG_XEMACS) && !defined (DEBUG)) | |
66 #define DEBUG | |
67 #endif | |
68 | |
867 | 69 #define RE_TRANSLATE_1(ch) TRT_TABLE_OF (translate, (Ichar) ch) |
446 | 70 #define TRANSLATE_P(tr) (!NILP (tr)) |
428 | 71 |
826 | 72 /* Converts the pointer to the char to BEG-based offset from the start. */ |
73 #define PTR_TO_OFFSET(d) (MATCHING_IN_FIRST_STRING \ | |
74 ? (d) - string1 : (d) - (string2 - size1)) | |
75 | |
428 | 76 #else /* not emacs */ |
77 | |
2367 | 78 #include <stdlib.h> |
79 #include <sys/types.h> | |
80 #include <stddef.h> /* needed for ptrdiff_t under Solaris */ | |
81 #include <string.h> | |
82 | |
2286 | 83 #include "compiler.h" /* Get compiler-specific definitions like UNUSED */ |
84 | |
2500 | 85 #define ABORT abort |
86 | |
428 | 87 /* If we are not linking with Emacs proper, |
88 we can't use the relocating allocator | |
89 even if config.h says that we can. */ | |
771 | 90 #undef REGEX_REL_ALLOC |
428 | 91 |
544 | 92 /* defined in lisp.h */ |
93 #ifdef REGEX_MALLOC | |
94 #ifndef DECLARE_NOTHING | |
95 #define DECLARE_NOTHING struct nosuchstruct | |
96 #endif | |
97 #endif | |
98 | |
867 | 99 #define itext_ichar(str) ((Ichar) (str)[0]) |
100 #define itext_ichar_fmt(str, fmt, object) ((Ichar) (str)[0]) | |
101 #define itext_ichar_ascii_fmt(str, fmt, object) ((Ichar) (str)[0]) | |
428 | 102 |
103 #if (LONGBITS > INTBITS) | |
104 # define EMACS_INT long | |
105 #else | |
106 # define EMACS_INT int | |
107 #endif | |
108 | |
867 | 109 typedef int Ichar; |
110 | |
111 #define INC_IBYTEPTR(p) ((p)++) | |
112 #define INC_IBYTEPTR_FMT(p, fmt) ((p)++) | |
113 #define DEC_IBYTEPTR(p) ((p)--) | |
114 #define DEC_IBYTEPTR_FMT(p, fmt) ((p)--) | |
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
115 #define MAX_ICHAR_LEN 1 |
867 | 116 #define itext_ichar_len(ptr) 1 |
117 #define itext_ichar_len_fmt(ptr, fmt) 1 | |
428 | 118 |
119 /* Define the syntax stuff for \<, \>, etc. */ | |
120 | |
121 /* This must be nonzero for the wordchar and notwordchar pattern | |
122 commands in re_match_2. */ | |
123 #ifndef Sword | |
124 #define Sword 1 | |
125 #endif | |
126 | |
127 #ifdef SYNTAX_TABLE | |
128 | |
129 extern char *re_syntax_table; | |
130 | |
131 #else /* not SYNTAX_TABLE */ | |
132 | |
133 /* How many characters in the character set. */ | |
134 #define CHAR_SET_SIZE 256 | |
135 | |
136 static char re_syntax_table[CHAR_SET_SIZE]; | |
137 | |
138 static void | |
139 init_syntax_once (void) | |
140 { | |
141 static int done = 0; | |
142 | |
143 if (!done) | |
144 { | |
442 | 145 const char *word_syntax_chars = |
428 | 146 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_"; |
147 | |
148 memset (re_syntax_table, 0, sizeof (re_syntax_table)); | |
149 | |
150 while (*word_syntax_chars) | |
647 | 151 re_syntax_table[(unsigned int) (*word_syntax_chars++)] = Sword; |
428 | 152 |
153 done = 1; | |
154 } | |
155 } | |
156 | |
446 | 157 #endif /* SYNTAX_TABLE */ |
428 | 158 |
826 | 159 #define SYNTAX(ignored, c) re_syntax_table[c] |
460 | 160 #undef SYNTAX_FROM_CACHE |
826 | 161 #define SYNTAX_FROM_CACHE SYNTAX |
162 | |
163 #define RE_TRANSLATE_1(c) translate[(unsigned char) (c)] | |
446 | 164 #define TRANSLATE_P(tr) tr |
165 | |
166 #endif /* emacs */ | |
428 | 167 |
2201 | 168 /* This is for other GNU distributions with internationalized messages. */ |
169 #if defined (I18N3) && (defined (HAVE_LIBINTL_H) || defined (_LIBC)) | |
170 # include <libintl.h> | |
171 #else | |
172 # define gettext(msgid) (msgid) | |
173 #endif | |
174 | |
428 | 175 |
176 /* Get the interface, including the syntax bits. */ | |
177 #include "regex.h" | |
178 | |
179 /* isalpha etc. are used for the character classes. */ | |
180 #include <ctype.h> | |
181 | |
182 /* Jim Meyering writes: | |
183 | |
184 "... Some ctype macros are valid only for character codes that | |
185 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when | |
186 using /bin/cc or gcc but without giving an ansi option). So, all | |
187 ctype uses should be through macros like ISPRINT... If | |
188 STDC_HEADERS is defined, then autoconf has verified that the ctype | |
189 macros don't need to be guarded with references to isascii. ... | |
190 Defining isascii to 1 should let any compiler worth its salt | |
191 eliminate the && through constant folding." */ | |
192 | |
193 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) | |
194 #define ISASCII_1(c) 1 | |
195 #else | |
196 #define ISASCII_1(c) isascii(c) | |
197 #endif | |
198 | |
199 #ifdef MULE | |
200 /* The IS*() macros can be passed any character, including an extended | |
201 one. We need to make sure there are no crashes, which would occur | |
202 otherwise due to out-of-bounds array references. */ | |
203 #define ISASCII(c) (((EMACS_UINT) (c)) < 0x100 && ISASCII_1 (c)) | |
204 #else | |
205 #define ISASCII(c) ISASCII_1 (c) | |
206 #endif /* MULE */ | |
207 | |
208 #ifdef isblank | |
209 #define ISBLANK(c) (ISASCII (c) && isblank (c)) | |
210 #else | |
211 #define ISBLANK(c) ((c) == ' ' || (c) == '\t') | |
212 #endif | |
213 #ifdef isgraph | |
214 #define ISGRAPH(c) (ISASCII (c) && isgraph (c)) | |
215 #else | |
216 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c)) | |
217 #endif | |
218 | |
219 #define ISPRINT(c) (ISASCII (c) && isprint (c)) | |
220 #define ISDIGIT(c) (ISASCII (c) && isdigit (c)) | |
221 #define ISALNUM(c) (ISASCII (c) && isalnum (c)) | |
222 #define ISALPHA(c) (ISASCII (c) && isalpha (c)) | |
223 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c)) | |
224 #define ISLOWER(c) (ISASCII (c) && islower (c)) | |
225 #define ISPUNCT(c) (ISASCII (c) && ispunct (c)) | |
226 #define ISSPACE(c) (ISASCII (c) && isspace (c)) | |
227 #define ISUPPER(c) (ISASCII (c) && isupper (c)) | |
228 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c)) | |
229 | |
230 #ifndef NULL | |
231 #define NULL (void *)0 | |
232 #endif | |
233 | |
234 /* We remove any previous definition of `SIGN_EXTEND_CHAR', | |
235 since ours (we hope) works properly with all combinations of | |
236 machines, compilers, `char' and `unsigned char' argument types. | |
237 (Per Bothner suggested the basic approach.) */ | |
238 #undef SIGN_EXTEND_CHAR | |
239 #if __STDC__ | |
240 #define SIGN_EXTEND_CHAR(c) ((signed char) (c)) | |
241 #else /* not __STDC__ */ | |
242 /* As in Harbison and Steele. */ | |
243 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) | |
244 #endif | |
245 | |
246 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we | |
247 use `alloca' instead of `malloc'. This is because using malloc in | |
248 re_search* or re_match* could cause memory leaks when C-g is used in | |
249 Emacs; also, malloc is slower and causes storage fragmentation. On | |
250 the other hand, malloc is more portable, and easier to debug. | |
251 | |
252 Because we sometimes use alloca, some routines have to be macros, | |
253 not functions -- `alloca'-allocated space disappears at the end of the | |
254 function it is called in. */ | |
255 | |
1333 | 256 #ifndef emacs |
257 #define ALLOCA alloca | |
258 #define xmalloc malloc | |
259 #define xrealloc realloc | |
4976
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4832
diff
changeset
|
260 #define xfree free |
1333 | 261 #endif |
262 | |
263 #ifdef emacs | |
264 #define ALLOCA_GARBAGE_COLLECT() \ | |
265 do \ | |
266 { \ | |
267 if (need_to_check_c_alloca) \ | |
268 xemacs_c_alloca (0); \ | |
269 } while (0) | |
270 #elif defined (C_ALLOCA) | |
271 #define ALLOCA_GARBAGE_COLLECT() alloca (0) | |
272 #else | |
273 #define ALLOCA_GARBAGE_COLLECT() | |
274 #endif | |
275 | |
276 #ifndef emacs | |
277 /* So we can use just it to conditionalize on */ | |
278 #undef ERROR_CHECK_MALLOC | |
279 #endif | |
280 | |
281 #ifdef ERROR_CHECK_MALLOC | |
282 /* When REL_ALLOC, malloc() is problematic because it could potentially | |
283 cause all rel-alloc()ed data -- including buffer text -- to be relocated. | |
284 We deal with this by checking for such relocation whenever we have | |
285 executed a statement that may call malloc() -- or alloca(), which may | |
286 end up calling malloc() in some circumstances -- and recomputing all | |
287 of our string pointers in re_match_2_internal() and re_search_2(). | |
288 However, if malloc() or alloca() happens and we don't know about it, | |
289 we could still be screwed. So we set up a system where we indicate all | |
290 places where we are prepared for malloc() or alloca(), and in any | |
291 other circumstances, calls to those functions (from anywhere inside of | |
2500 | 292 XEmacs!) will ABORT(). We do this even when REL_ALLOC is not defined |
1333 | 293 so that we catch these problems sooner, since many developers and beta |
294 testers will not be running with REL_ALLOC. */ | |
295 int regex_malloc_disallowed; | |
296 #define BEGIN_REGEX_MALLOC_OK() regex_malloc_disallowed = 0 | |
297 #define END_REGEX_MALLOC_OK() regex_malloc_disallowed = 1 | |
298 #define UNBIND_REGEX_MALLOC_CHECK() unbind_to (depth) | |
299 #else | |
300 #define BEGIN_REGEX_MALLOC_OK() | |
301 #define END_REGEX_MALLOC_OK() | |
302 #define UNBIND_REGEX_MALLOC_CHECK() | |
303 #endif | |
304 | |
305 | |
428 | 306 #ifdef REGEX_MALLOC |
307 | |
1333 | 308 #define REGEX_ALLOCATE xmalloc |
309 #define REGEX_REALLOCATE(source, osize, nsize) xrealloc (source, nsize) | |
310 #define REGEX_FREE xfree | |
428 | 311 |
312 #else /* not REGEX_MALLOC */ | |
313 | |
314 /* Emacs already defines alloca, sometimes. */ | |
315 #ifndef alloca | |
316 | |
317 /* Make alloca work the best possible way. */ | |
318 #ifdef __GNUC__ | |
319 #define alloca __builtin_alloca | |
771 | 320 #elif defined (__DECC) /* XEmacs: added next 3 lines, similar to config.h.in */ |
321 #include <alloca.h> | |
322 #pragma intrinsic(alloca) | |
428 | 323 #else /* not __GNUC__ */ |
324 #if HAVE_ALLOCA_H | |
325 #include <alloca.h> | |
326 #else /* not __GNUC__ or HAVE_ALLOCA_H */ | |
327 #ifndef _AIX /* Already did AIX, up at the top. */ | |
444 | 328 void *alloca (); |
428 | 329 #endif /* not _AIX */ |
446 | 330 #endif /* HAVE_ALLOCA_H */ |
331 #endif /* __GNUC__ */ | |
428 | 332 |
333 #endif /* not alloca */ | |
334 | |
1333 | 335 #define REGEX_ALLOCATE ALLOCA |
428 | 336 |
2367 | 337 /* !!#### Needs review */ |
428 | 338 /* Assumes a `char *destination' variable. */ |
339 #define REGEX_REALLOCATE(source, osize, nsize) \ | |
1333 | 340 (destination = (char *) ALLOCA (nsize), \ |
428 | 341 memmove (destination, source, osize), \ |
342 destination) | |
343 | |
1726 | 344 /* No need to do anything to free, after alloca. |
345 Do nothing! But inhibit gcc warning. */ | |
346 #define REGEX_FREE(arg,type) ((void)0) | |
428 | 347 |
446 | 348 #endif /* REGEX_MALLOC */ |
428 | 349 |
350 /* Define how to allocate the failure stack. */ | |
351 | |
771 | 352 #ifdef REGEX_REL_ALLOC |
428 | 353 #define REGEX_ALLOCATE_STACK(size) \ |
1346 | 354 r_alloc ((unsigned char **) &failure_stack_ptr, (size)) |
428 | 355 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \ |
1346 | 356 r_re_alloc ((unsigned char **) &failure_stack_ptr, (nsize)) |
428 | 357 #define REGEX_FREE_STACK(ptr) \ |
1346 | 358 r_alloc_free ((unsigned char **) &failure_stack_ptr) |
428 | 359 |
771 | 360 #else /* not REGEX_REL_ALLOC */ |
428 | 361 |
362 #ifdef REGEX_MALLOC | |
363 | |
1333 | 364 #define REGEX_ALLOCATE_STACK xmalloc |
365 #define REGEX_REALLOCATE_STACK(source, osize, nsize) xrealloc (source, nsize) | |
4976
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4832
diff
changeset
|
366 #define REGEX_FREE_STACK(arg) xfree (arg) |
428 | 367 |
368 #else /* not REGEX_MALLOC */ | |
369 | |
1333 | 370 #define REGEX_ALLOCATE_STACK ALLOCA |
428 | 371 |
372 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \ | |
373 REGEX_REALLOCATE (source, osize, nsize) | |
374 /* No need to explicitly free anything. */ | |
375 #define REGEX_FREE_STACK(arg) | |
376 | |
446 | 377 #endif /* REGEX_MALLOC */ |
771 | 378 #endif /* REGEX_REL_ALLOC */ |
428 | 379 |
380 | |
/* True if `size1' is nonzero and PTR is pointing anywhere inside
   `string1' or just past its end.  This works if PTR is NULL, which is
   a good thing. */
384 #define FIRST_STRING_P(ptr) \ | |
385 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) | |
386 | |
387 /* (Re)Allocate N items of type T using malloc, or fail. */ | |
1333 | 388 #define TALLOC(n, t) ((t *) xmalloc ((n) * sizeof (t))) |
389 #define RETALLOC(addr, n, t) ((addr) = (t *) xrealloc (addr, (n) * sizeof (t))) | |
428 | 390 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) |
391 | |
392 #define BYTEWIDTH 8 /* In bits. */ | |
393 | |
434 | 394 #define STREQ(s1, s2) (strcmp (s1, s2) == 0) |
428 | 395 |
396 #undef MAX | |
397 #undef MIN | |
398 #define MAX(a, b) ((a) > (b) ? (a) : (b)) | |
399 #define MIN(a, b) ((a) < (b) ? (a) : (b)) | |
400 | |
446 | 401 /* Type of source-pattern and string chars. */ |
402 typedef const unsigned char re_char; | |
403 | |
460 | 404 typedef char re_bool; |
428 | 405 #define false 0 |
406 #define true 1 | |
407 | |
408 | |
1346 | 409 #ifdef emacs |
410 | |
411 #ifdef MULE | |
412 | |
413 Lisp_Object Vthe_lisp_rangetab; | |
414 | |
415 void | |
416 vars_of_regex (void) | |
417 { | |
2421 | 418 Vthe_lisp_rangetab = Fmake_range_table (Qstart_closed_end_closed); |
1346 | 419 staticpro (&Vthe_lisp_rangetab); |
420 } | |
421 | |
422 #else /* not MULE */ | |
423 | |
/* Non-MULE build: there is nothing to initialize, but the function is
   still defined so the same entry point exists in both configurations. */
void
vars_of_regex (void)
{
}
428 | |
429 #endif /* MULE */ | |
430 | |
431 /* Convert an offset from the start of the logical text string formed by | |
432 concatenating the two strings together into a character position in the | |
433 Lisp buffer or string that the text represents. Knows that | |
434 when handling buffer text, the "string" we're passed in is always | |
435 BEGV - ZV. */ | |
436 | |
437 static Charxpos | |
438 offset_to_charxpos (Lisp_Object lispobj, int off) | |
439 { | |
440 if (STRINGP (lispobj)) | |
441 return string_index_byte_to_char (lispobj, off); | |
442 else if (BUFFERP (lispobj)) | |
443 return bytebpos_to_charbpos (XBUFFER (lispobj), | |
444 off + BYTE_BUF_BEGV (XBUFFER (lispobj))); | |
445 else | |
446 return 0; | |
447 } | |
448 | |
449 #ifdef REL_ALLOC | |
450 | |
451 /* STRING1 is the value of STRING1 given to re_match_2(). LISPOBJ is | |
452 the Lisp object (if any) from which the string is taken. If LISPOBJ | |
453 is a buffer, return a relocation offset to be added to all pointers to | |
454 string data so that they will be accurate again, after an allocation or | |
455 reallocation that potentially relocated the buffer data. | |
456 */ | |
457 static Bytecount | |
458 offset_post_relocation (Lisp_Object lispobj, Ibyte *orig_buftext) | |
459 { | |
460 if (!BUFFERP (lispobj)) | |
461 return 0; | |
462 return (BYTE_BUF_BYTE_ADDRESS (XBUFFER (lispobj), | |
463 BYTE_BUF_BEGV (XBUFFER (lispobj))) - | |
464 orig_buftext); | |
465 } | |
466 | |
467 #endif /* REL_ALLOC */ | |
468 | |
469 #ifdef ERROR_CHECK_MALLOC | |
470 | |
471 /* NOTE that this can run malloc() so you need to adjust afterwards. */ | |
472 | |
473 static int | |
474 bind_regex_malloc_disallowed (int value) | |
475 { | |
476 /* Tricky, because the act of binding can run malloc(). */ | |
477 int old_regex_malloc_disallowed = regex_malloc_disallowed; | |
478 int depth; | |
479 regex_malloc_disallowed = 0; | |
480 depth = record_unwind_protect_restoring_int (®ex_malloc_disallowed, | |
481 old_regex_malloc_disallowed); | |
482 regex_malloc_disallowed = value; | |
483 return depth; | |
484 } | |
485 | |
486 #endif /* ERROR_CHECK_MALLOC */ | |
487 | |
488 #endif /* emacs */ | |
489 | |
490 | |
428 | 491 /* These are the command codes that appear in compiled regular |
492 expressions. Some opcodes are followed by argument bytes. A | |
493 command code can specify any interpretation whatsoever for its | |
494 arguments. Zero bytes may appear in the compiled regular expression. */ | |
495 | |
typedef enum
{
  no_op = 0,

  /* Report success immediately; no more backtracking. */
  succeed,

  /* One byte giving n follows, and after it n literal bytes. */
  exactn,

  /* Matches (more or less) any character. */
  anychar,

  /* Matches any single character that belongs to the given set.  The
     first following byte is the number of bitmap bytes; the bitmap
     itself comes next and says which characters are members.  Within
     each byte the bits are ordered low-bit-first, and a character is a
     member when its bit is 1.  A character too large to have a bit in
     the map is automatically not a member. */
  charset,

  /* Takes the same parameters as charset, but matches any character
     that is not one of those specified. */
  charset_not,

  /* Begin remembering the matched text so that it can be stored in a
     register.  One byte with the register number follows, in the range
     1 to the pattern buffer's re_ngroups field, and then one byte with
     the number of groups inner to this one.  (That last byte has to be
     part of start_memory only because it is needed in the
     on_failure_jump of re_match_2.) */
  start_memory,

  /* Finish remembering the matched text and store it in a memory
     register.  One byte with the register number follows, in the range
     1 to `re_ngroups' in the pattern buffer, and one byte with the
     number of inner groups, exactly as for `start_memory'.  (The inner
     group count is needed here because there is no easy way of finding
     the corresponding start_memory from a stop_memory.) */
  stop_memory,

  /* Match a duplicate of something already remembered.  One byte
     holding the register number follows. */
  duplicate,

  /* Fail unless at the beginning of a line. */
  begline,

  /* Fail unless at the end of a line. */
  endline,

  /* Succeeds at the beginning of the buffer (if emacs) or at the
     beginning of the string to be matched (if not). */
  begbuf,

  /* The same, for the end of the buffer/string. */
  endbuf,

  /* A two-byte relative address to jump to follows. */
  jump,

  /* Like jump, but marks the end of an alternative. */
  jump_past_alt,

  /* A two-byte relative address follows, giving the place to resume at
     in case of failure. */
  on_failure_jump,

  /* Like on_failure_jump, except that when executed it pushes a
     placeholder instead of the current string position. */
  on_failure_keep_string_jump,

  /* Discard the latest failure point, then jump to the two-byte
     relative address that follows. */
  pop_failure_jump,

  /* Changed to pop_failure_jump when it is known that no backtracking
     will be needed to match, and to jump otherwise.  It is used to jump
     back to the beginning of a repeat.  If what follows this jump
     clearly won't match what the repeat does, so that there is surely
     no use backtracking out of repetitions already matched, it becomes
     a pop_failure_jump.  A two-byte address follows. */
  maybe_pop_jump,

  /* Jump to the two-byte address that follows, and push a dummy failure
     point.  That failure point is thrown away if an attempt is made to
     use it for a failure.  A `+' construct makes this before the first
     repeat.  Also used as an intermediary kind of jump when compiling
     an alternative. */
  dummy_failure_jump,

  /* Push a dummy failure point and carry on.  Used at the end of
     alternatives. */
  push_dummy_failure,

  /* A two-byte relative address and a two-byte number n follow.  After
     matching N times, jump to the address upon failure. */
  succeed_n,

  /* A two-byte relative address and a two-byte number n follow.  Jump
     to the address N times, then fail. */
  jump_n,

  /* Set the location at the following two-byte relative address to the
     two-byte number that comes after it.  The address *includes* the
     two bytes of number. */
  set_number_at,

  /* Matches any word-constituent character. */
  wordchar,
  /* Matches any character that is not a word-constituent. */
  notwordchar,

  /* Succeeds if at word beginning. */
  wordbeg,
  /* Succeeds if at word end. */
  wordend,

  /* Succeeds if at a word boundary. */
  wordbound,
  /* Succeeds if not at a word boundary. */
  notwordbound

#ifdef emacs
  ,
  /* Succeeds if before point. */
  before_dot,
  /* Succeeds if at point. */
  at_dot,
  /* Succeeds if after point. */
  after_dot,

  /* Matches any character whose syntax is the one specified.  A byte
     containing a syntax code, e.g., Sword, follows. */
  syntaxspec,

  /* Matches any character whose syntax is not the one specified. */
  notsyntaxspec

#endif /* emacs */

#ifdef MULE
  /* Extra opcodes needed to work properly with XEmacs/Mule characters
     (which may take up more than one byte). */
  ,
  /* Matches any character belonging to the specified set.  The set is
     stored in "unified range-table format"; see rangetab.c.  Unlike the
     `charset' opcode, this can handle arbitrary characters. */
  charset_mule,

  /* Takes the same parameters as charset_mule, but matches any
     character that is not one of those specified. */
  charset_mule_not,

  /* 97/2/17 jhod: The following two were merged back in from the Mule
     2.3 code to enable some language specific processing. */

  /* Matches entries in the character category tables. */
  categoryspec,
  /* The opposite of categoryspec. */
  notcategoryspec
#endif /* MULE */

} re_opcode_t;
649 | |
650 /* Common operations on the compiled pattern. */ | |
651 | |
652 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */ | |
653 | |
654 #define STORE_NUMBER(destination, number) \ | |
655 do { \ | |
656 (destination)[0] = (number) & 0377; \ | |
657 (destination)[1] = (number) >> 8; \ | |
658 } while (0) | |
659 | |
660 /* Same as STORE_NUMBER, except increment DESTINATION to | |
661 the byte after where the number is stored. Therefore, DESTINATION | |
662 must be an lvalue. */ | |
663 | |
664 #define STORE_NUMBER_AND_INCR(destination, number) \ | |
665 do { \ | |
666 STORE_NUMBER (destination, number); \ | |
667 (destination) += 2; \ | |
668 } while (0) | |
669 | |
/* Put into DESTINATION a number stored in two contiguous bytes starting
   at SOURCE.  The first byte supplies the low-order eight bits; the
   second byte is sign-extended and supplies the high-order part.  The
   high part is scaled with `* 256' rather than `<< 8' because
   left-shifting a negative value is undefined behavior in C. */

#define EXTRACT_NUMBER(destination, source)				\
  do {									\
    (destination) = *(source) & 0377;					\
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * 256;		\
  } while (0)
678 | |
679 #ifdef DEBUG | |
680 static void | |
446 | 681 extract_number (int *dest, re_char *source) |
428 | 682 { |
683 int temp = SIGN_EXTEND_CHAR (*(source + 1)); | |
684 *dest = *source & 0377; | |
685 *dest += temp << 8; | |
686 } | |
687 | |
688 #ifndef EXTRACT_MACROS /* To debug the macros. */ | |
689 #undef EXTRACT_NUMBER | |
690 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src) | |
691 #endif /* not EXTRACT_MACROS */ | |
692 | |
693 #endif /* DEBUG */ | |
694 | |
695 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number. | |
696 SOURCE must be an lvalue. */ | |
697 | |
698 #define EXTRACT_NUMBER_AND_INCR(destination, source) \ | |
699 do { \ | |
700 EXTRACT_NUMBER (destination, source); \ | |
701 (source) += 2; \ | |
702 } while (0) | |
703 | |
704 #ifdef DEBUG | |
/* Function form of EXTRACT_NUMBER_AND_INCR, for debugging: read the
   two-byte number at *SOURCE into *DESTINATION with extract_number(),
   then advance *SOURCE past those two bytes. */
static void
extract_number_and_incr (int *destination, unsigned char **source)
{
  extract_number (destination, *source);
  *source = *source + 2;
}
711 | |
712 #ifndef EXTRACT_MACROS | |
713 #undef EXTRACT_NUMBER_AND_INCR | |
714 #define EXTRACT_NUMBER_AND_INCR(dest, src) \ | |
715 extract_number_and_incr (&dest, &src) | |
716 #endif /* not EXTRACT_MACROS */ | |
717 | |
718 #endif /* DEBUG */ | |
719 | |
/* If DEBUG is defined, Regex prints many voluminous messages about what
   it is doing (if the variable `debug_regexps' is nonzero).  If linked
   with the main program in `iregex.c', you can enter patterns and
   strings interactively.  And if linked with the main program in
   `main.c' and the other test files, you can run the already-written
   tests. */
725 | |
726 #if defined (DEBUG) | |
727 | |
728 /* We use standard I/O for debugging. */ | |
729 #include <stdio.h> | |
730 | |
731 #ifndef emacs | |
732 /* XEmacs provides its own version of assert() */ | |
733 /* It is useful to test things that ``must'' be true when debugging. */ | |
734 #include <assert.h> | |
735 #endif | |
736 | |
5041 | 737 extern int debug_regexps; |
428 | 738 |
739 #define DEBUG_STATEMENT(e) e | |
5041 | 740 |
/* Debugging output macros.  The DEBUG_PRINT* family prints whenever
   debug_regexps is nonzero; the DEBUG_FAIL_* and DEBUG_MATCH_* families
   print only when the RE_DEBUG_FAILURE_POINT or RE_DEBUG_MATCHING flag,
   respectively, is set in debug_regexps.

   Every macro expands to a single `do { ... } while (0)' statement, so
   that it can be followed by a semicolon and used as the body of an
   `if' without a following `else' binding to the macro's own `if'. */

#define DEBUG_PRINT1(x) \
  do { if (debug_regexps) printf (x); } while (0)
#define DEBUG_PRINT2(x1, x2) \
  do { if (debug_regexps) printf (x1, x2); } while (0)
#define DEBUG_PRINT3(x1, x2, x3) \
  do { if (debug_regexps) printf (x1, x2, x3); } while (0)
#define DEBUG_PRINT4(x1, x2, x3, x4) \
  do { if (debug_regexps) printf (x1, x2, x3, x4); } while (0)
#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
  do { if (debug_regexps) print_partial_compiled_pattern (s, e); } while (0)
#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
  do { if (debug_regexps) print_double_string (w, s1, sz1, s2, sz2); } while (0)

#define DEBUG_FAIL_PRINT1(x) \
  do { if (debug_regexps & RE_DEBUG_FAILURE_POINT) printf (x); } while (0)
#define DEBUG_FAIL_PRINT2(x1, x2) \
  do { if (debug_regexps & RE_DEBUG_FAILURE_POINT) printf (x1, x2); } while (0)
#define DEBUG_FAIL_PRINT3(x1, x2, x3) \
  do { if (debug_regexps & RE_DEBUG_FAILURE_POINT) printf (x1, x2, x3); } while (0)
#define DEBUG_FAIL_PRINT4(x1, x2, x3, x4) \
  do { if (debug_regexps & RE_DEBUG_FAILURE_POINT) printf (x1, x2, x3, x4); } while (0)
#define DEBUG_FAIL_PRINT_COMPILED_PATTERN(p, s, e) \
  do { \
    if (debug_regexps & RE_DEBUG_FAILURE_POINT) \
      print_partial_compiled_pattern (s, e); \
  } while (0)
#define DEBUG_FAIL_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
  do { \
    if (debug_regexps & RE_DEBUG_FAILURE_POINT) \
      print_double_string (w, s1, sz1, s2, sz2); \
  } while (0)

#define DEBUG_MATCH_PRINT1(x) \
  do { if (debug_regexps & RE_DEBUG_MATCHING) printf (x); } while (0)
#define DEBUG_MATCH_PRINT2(x1, x2) \
  do { if (debug_regexps & RE_DEBUG_MATCHING) printf (x1, x2); } while (0)
#define DEBUG_MATCH_PRINT3(x1, x2, x3) \
  do { if (debug_regexps & RE_DEBUG_MATCHING) printf (x1, x2, x3); } while (0)
#define DEBUG_MATCH_PRINT4(x1, x2, x3, x4) \
  do { if (debug_regexps & RE_DEBUG_MATCHING) printf (x1, x2, x3, x4); } while (0)
#define DEBUG_MATCH_PRINT_COMPILED_PATTERN(p, s, e) \
  do { \
    if (debug_regexps & RE_DEBUG_MATCHING) \
      print_partial_compiled_pattern (s, e); \
  } while (0)
#define DEBUG_MATCH_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
  do { \
    if (debug_regexps & RE_DEBUG_MATCHING) \
      print_double_string (w, s1, sz1, s2, sz2); \
  } while (0)
428 | 779 |
780 | |
781 /* Print the fastmap in human-readable form. */ | |
782 | |
783 static void | |
784 print_fastmap (char *fastmap) | |
785 { | |
647 | 786 int was_a_range = 0; |
787 int i = 0; | |
428 | 788 |
789 while (i < (1 << BYTEWIDTH)) | |
790 { | |
791 if (fastmap[i++]) | |
792 { | |
793 was_a_range = 0; | |
794 putchar (i - 1); | |
795 while (i < (1 << BYTEWIDTH) && fastmap[i]) | |
796 { | |
797 was_a_range = 1; | |
798 i++; | |
799 } | |
800 if (was_a_range) | |
801 { | |
802 putchar ('-'); | |
803 putchar (i - 1); | |
804 } | |
805 } | |
806 } | |
807 putchar ('\n'); | |
808 } | |
809 | |
810 | |
811 /* Print a compiled pattern string in human-readable form, starting at | |
812 the START pointer into it and ending just before the pointer END. */ | |
813 | |
814 static void | |
446 | 815 print_partial_compiled_pattern (re_char *start, re_char *end) |
428 | 816 { |
817 int mcnt, mcnt2; | |
446 | 818 unsigned char *p = (unsigned char *) start; |
819 re_char *pend = end; | |
428 | 820 |
821 if (start == NULL) | |
822 { | |
823 puts ("(null)"); | |
824 return; | |
825 } | |
826 | |
827 /* Loop over pattern commands. */ | |
828 while (p < pend) | |
829 { | |
830 printf ("%ld:\t", (long)(p - start)); | |
831 | |
832 switch ((re_opcode_t) *p++) | |
833 { | |
834 case no_op: | |
835 printf ("/no_op"); | |
836 break; | |
837 | |
838 case exactn: | |
839 mcnt = *p++; | |
840 printf ("/exactn/%d", mcnt); | |
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
841 while (mcnt--) |
428 | 842 { |
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
843 putchar ('/'); |
428 | 844 putchar (*p++); |
845 } | |
846 break; | |
847 | |
848 case start_memory: | |
849 mcnt = *p++; | |
850 printf ("/start_memory/%d/%d", mcnt, *p++); | |
851 break; | |
852 | |
853 case stop_memory: | |
854 mcnt = *p++; | |
855 printf ("/stop_memory/%d/%d", mcnt, *p++); | |
856 break; | |
857 | |
858 case duplicate: | |
859 printf ("/duplicate/%d", *p++); | |
860 break; | |
861 | |
862 case anychar: | |
863 printf ("/anychar"); | |
864 break; | |
865 | |
866 case charset: | |
867 case charset_not: | |
868 { | |
869 REGISTER int c, last = -100; | |
870 REGISTER int in_range = 0; | |
871 | |
872 printf ("/charset [%s", | |
873 (re_opcode_t) *(p - 1) == charset_not ? "^" : ""); | |
874 | |
875 assert (p + *p < pend); | |
876 | |
877 for (c = 0; c < 256; c++) | |
878 if (((unsigned char) (c / 8) < *p) | |
879 && (p[1 + (c/8)] & (1 << (c % 8)))) | |
880 { | |
881 /* Are we starting a range? */ | |
882 if (last + 1 == c && ! in_range) | |
883 { | |
884 putchar ('-'); | |
885 in_range = 1; | |
886 } | |
887 /* Have we broken a range? */ | |
888 else if (last + 1 != c && in_range) | |
889 { | |
890 putchar (last); | |
891 in_range = 0; | |
892 } | |
893 | |
894 if (! in_range) | |
895 putchar (c); | |
896 | |
897 last = c; | |
898 } | |
899 | |
900 if (in_range) | |
901 putchar (last); | |
902 | |
903 putchar (']'); | |
904 | |
905 p += 1 + *p; | |
906 } | |
907 break; | |
908 | |
909 #ifdef MULE | |
910 case charset_mule: | |
911 case charset_mule_not: | |
912 { | |
913 int nentries, i; | |
914 | |
915 printf ("/charset_mule [%s", | |
916 (re_opcode_t) *(p - 1) == charset_mule_not ? "^" : ""); | |
917 nentries = unified_range_table_nentries (p); | |
918 for (i = 0; i < nentries; i++) | |
919 { | |
920 EMACS_INT first, last; | |
921 Lisp_Object dummy_val; | |
922 | |
923 unified_range_table_get_range (p, i, &first, &last, | |
924 &dummy_val); | |
925 if (first < 0x100) | |
926 putchar (first); | |
927 else | |
928 printf ("(0x%lx)", (long)first); | |
929 if (first != last) | |
930 { | |
931 putchar ('-'); | |
932 if (last < 0x100) | |
933 putchar (last); | |
934 else | |
935 printf ("(0x%lx)", (long)last); | |
936 } | |
937 } | |
938 putchar (']'); | |
939 p += unified_range_table_bytes_used (p); | |
940 } | |
941 break; | |
942 #endif | |
943 | |
944 case begline: | |
945 printf ("/begline"); | |
946 break; | |
947 | |
948 case endline: | |
949 printf ("/endline"); | |
950 break; | |
951 | |
952 case on_failure_jump: | |
953 extract_number_and_incr (&mcnt, &p); | |
954 printf ("/on_failure_jump to %ld", (long)(p + mcnt - start)); | |
955 break; | |
956 | |
957 case on_failure_keep_string_jump: | |
958 extract_number_and_incr (&mcnt, &p); | |
959 printf ("/on_failure_keep_string_jump to %ld", (long)(p + mcnt - start)); | |
960 break; | |
961 | |
962 case dummy_failure_jump: | |
963 extract_number_and_incr (&mcnt, &p); | |
964 printf ("/dummy_failure_jump to %ld", (long)(p + mcnt - start)); | |
965 break; | |
966 | |
967 case push_dummy_failure: | |
968 printf ("/push_dummy_failure"); | |
969 break; | |
970 | |
971 case maybe_pop_jump: | |
972 extract_number_and_incr (&mcnt, &p); | |
973 printf ("/maybe_pop_jump to %ld", (long)(p + mcnt - start)); | |
974 break; | |
975 | |
976 case pop_failure_jump: | |
977 extract_number_and_incr (&mcnt, &p); | |
978 printf ("/pop_failure_jump to %ld", (long)(p + mcnt - start)); | |
979 break; | |
980 | |
981 case jump_past_alt: | |
982 extract_number_and_incr (&mcnt, &p); | |
983 printf ("/jump_past_alt to %ld", (long)(p + mcnt - start)); | |
984 break; | |
985 | |
986 case jump: | |
987 extract_number_and_incr (&mcnt, &p); | |
988 printf ("/jump to %ld", (long)(p + mcnt - start)); | |
989 break; | |
990 | |
991 case succeed_n: | |
992 extract_number_and_incr (&mcnt, &p); | |
993 extract_number_and_incr (&mcnt2, &p); | |
994 printf ("/succeed_n to %ld, %d times", (long)(p + mcnt - start), mcnt2); | |
995 break; | |
996 | |
997 case jump_n: | |
998 extract_number_and_incr (&mcnt, &p); | |
999 extract_number_and_incr (&mcnt2, &p); | |
1000 printf ("/jump_n to %ld, %d times", (long)(p + mcnt - start), mcnt2); | |
1001 break; | |
1002 | |
1003 case set_number_at: | |
1004 extract_number_and_incr (&mcnt, &p); | |
1005 extract_number_and_incr (&mcnt2, &p); | |
1006 printf ("/set_number_at location %ld to %d", (long)(p + mcnt - start), mcnt2); | |
1007 break; | |
1008 | |
1009 case wordbound: | |
1010 printf ("/wordbound"); | |
1011 break; | |
1012 | |
1013 case notwordbound: | |
1014 printf ("/notwordbound"); | |
1015 break; | |
1016 | |
1017 case wordbeg: | |
1018 printf ("/wordbeg"); | |
1019 break; | |
1020 | |
1021 case wordend: | |
1022 printf ("/wordend"); | |
1023 | |
1024 #ifdef emacs | |
1025 case before_dot: | |
1026 printf ("/before_dot"); | |
1027 break; | |
1028 | |
1029 case at_dot: | |
1030 printf ("/at_dot"); | |
1031 break; | |
1032 | |
1033 case after_dot: | |
1034 printf ("/after_dot"); | |
1035 break; | |
1036 | |
1037 case syntaxspec: | |
1038 printf ("/syntaxspec"); | |
1039 mcnt = *p++; | |
1040 printf ("/%d", mcnt); | |
1041 break; | |
1042 | |
1043 case notsyntaxspec: | |
1044 printf ("/notsyntaxspec"); | |
1045 mcnt = *p++; | |
1046 printf ("/%d", mcnt); | |
1047 break; | |
1048 | |
1049 #ifdef MULE | |
1050 /* 97/2/17 jhod Mule category patch */ | |
1051 case categoryspec: | |
1052 printf ("/categoryspec"); | |
1053 mcnt = *p++; | |
1054 printf ("/%d", mcnt); | |
1055 break; | |
1056 | |
1057 case notcategoryspec: | |
1058 printf ("/notcategoryspec"); | |
1059 mcnt = *p++; | |
1060 printf ("/%d", mcnt); | |
1061 break; | |
1062 /* end of category patch */ | |
1063 #endif /* MULE */ | |
1064 #endif /* emacs */ | |
1065 | |
1066 case wordchar: | |
1067 printf ("/wordchar"); | |
1068 break; | |
1069 | |
1070 case notwordchar: | |
1071 printf ("/notwordchar"); | |
1072 break; | |
1073 | |
1074 case begbuf: | |
1075 printf ("/begbuf"); | |
1076 break; | |
1077 | |
1078 case endbuf: | |
1079 printf ("/endbuf"); | |
1080 break; | |
1081 | |
1082 default: | |
1083 printf ("?%d", *(p-1)); | |
1084 } | |
1085 | |
1086 putchar ('\n'); | |
1087 } | |
1088 | |
1089 printf ("%ld:\tend of pattern.\n", (long)(p - start)); | |
1090 } | |
1091 | |
1092 | |
1093 static void | |
1094 print_compiled_pattern (struct re_pattern_buffer *bufp) | |
1095 { | |
446 | 1096 re_char *buffer = bufp->buffer; |
428 | 1097 |
1098 print_partial_compiled_pattern (buffer, buffer + bufp->used); | |
1099 printf ("%ld bytes used/%ld bytes allocated.\n", bufp->used, | |
1100 bufp->allocated); | |
1101 | |
1102 if (bufp->fastmap_accurate && bufp->fastmap) | |
1103 { | |
1104 printf ("fastmap: "); | |
1105 print_fastmap (bufp->fastmap); | |
1106 } | |
1107 | |
1108 printf ("re_nsub: %ld\t", (long)bufp->re_nsub); | |
502 | 1109 printf ("re_ngroups: %ld\t", (long)bufp->re_ngroups); |
428 | 1110 printf ("regs_alloc: %d\t", bufp->regs_allocated); |
1111 printf ("can_be_null: %d\t", bufp->can_be_null); | |
1112 printf ("newline_anchor: %d\n", bufp->newline_anchor); | |
1113 printf ("no_sub: %d\t", bufp->no_sub); | |
1114 printf ("not_bol: %d\t", bufp->not_bol); | |
1115 printf ("not_eol: %d\t", bufp->not_eol); | |
1116 printf ("syntax: %d\n", bufp->syntax); | |
1117 /* Perhaps we should print the translate table? */ | |
1118 /* and maybe the category table? */ | |
502 | 1119 |
1120 if (bufp->external_to_internal_register) | |
1121 { | |
1122 int i; | |
1123 | |
1124 printf ("external_to_internal_register:\n"); | |
1125 for (i = 0; i <= bufp->re_nsub; i++) | |
1126 { | |
1127 if (i > 0) | |
1128 printf (", "); | |
1129 printf ("%d -> %d", i, bufp->external_to_internal_register[i]); | |
1130 } | |
1131 printf ("\n"); | |
1132 } | |
428 | 1133 } |
1134 | |
1135 | |
1136 static void | |
446 | 1137 print_double_string (re_char *where, re_char *string1, int size1, |
1138 re_char *string2, int size2) | |
428 | 1139 { |
1140 if (where == NULL) | |
1141 printf ("(null)"); | |
1142 else | |
1143 { | |
647 | 1144 int this_char; |
428 | 1145 |
1146 if (FIRST_STRING_P (where)) | |
1147 { | |
1148 for (this_char = where - string1; this_char < size1; this_char++) | |
1149 putchar (string1[this_char]); | |
1150 | |
1151 where = string2; | |
1152 } | |
1153 | |
1154 for (this_char = where - string2; this_char < size2; this_char++) | |
1155 putchar (string2[this_char]); | |
1156 } | |
1157 } | |
1158 | |
1159 #else /* not DEBUG */ | |
1160 | |
771 | 1161 #ifndef emacs |
428 | 1162 #undef assert |
771 | 1163 #define assert(e) ((void) (1)) |
1164 #endif | |
428 | 1165 |
1166 #define DEBUG_STATEMENT(e) | |
5041 | 1167 |
428 | 1168 #define DEBUG_PRINT1(x) |
1169 #define DEBUG_PRINT2(x1, x2) | |
1170 #define DEBUG_PRINT3(x1, x2, x3) | |
1171 #define DEBUG_PRINT4(x1, x2, x3, x4) | |
1172 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) | |
1173 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) | |
1174 | |
5041 | 1175 #define DEBUG_FAIL_PRINT1(x) |
1176 #define DEBUG_FAIL_PRINT2(x1, x2) | |
1177 #define DEBUG_FAIL_PRINT3(x1, x2, x3) | |
1178 #define DEBUG_FAIL_PRINT4(x1, x2, x3, x4) | |
1179 #define DEBUG_FAIL_PRINT_COMPILED_PATTERN(p, s, e) | |
1180 #define DEBUG_FAIL_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) | |
1181 | |
1182 #define DEBUG_MATCH_PRINT1(x) | |
1183 #define DEBUG_MATCH_PRINT2(x1, x2) | |
1184 #define DEBUG_MATCH_PRINT3(x1, x2, x3) | |
1185 #define DEBUG_MATCH_PRINT4(x1, x2, x3, x4) | |
1186 #define DEBUG_MATCH_PRINT_COMPILED_PATTERN(p, s, e) | |
1187 #define DEBUG_MATCH_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) | |
1188 | |
446 | 1189 #endif /* DEBUG */ |
428 | 1190 |
1191 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can | |
1192 also be assigned to arbitrarily: each pattern buffer stores its own | |
1193 syntax, so it can be changed between regex compilations. */ | |
1194 /* This has no initializer because initialized variables in Emacs | |
1195 become read-only after dumping. */ | |
1196 reg_syntax_t re_syntax_options; | |
1197 | |
1198 | |
1199 /* Specify the precise syntax of regexps for compilation. This provides | |
1200 for compatibility for various utilities which historically have | |
1201 different, incompatible syntaxes. | |
1202 | |
1203 The argument SYNTAX is a bit mask comprised of the various bits | |
1204 defined in regex.h. We return the old syntax. */ | |
1205 | |
1206 reg_syntax_t | |
1207 re_set_syntax (reg_syntax_t syntax) | |
1208 { | |
1209 reg_syntax_t ret = re_syntax_options; | |
1210 | |
1211 re_syntax_options = syntax; | |
1212 return ret; | |
1213 } | |
1214 | |
1215 /* This table gives an error message for each of the error codes listed | |
1216 in regex.h. Obviously the order here has to be same as there. | |
1217 POSIX doesn't require that we do anything for REG_NOERROR, | |
1218 but why not be nice? */ | |
1219 | |
442 | 1220 static const char *re_error_msgid[] = |
428 | 1221 { |
1222 "Success", /* REG_NOERROR */ | |
1223 "No match", /* REG_NOMATCH */ | |
1224 "Invalid regular expression", /* REG_BADPAT */ | |
1225 "Invalid collation character", /* REG_ECOLLATE */ | |
1226 "Invalid character class name", /* REG_ECTYPE */ | |
1227 "Trailing backslash", /* REG_EESCAPE */ | |
1228 "Invalid back reference", /* REG_ESUBREG */ | |
1229 "Unmatched [ or [^", /* REG_EBRACK */ | |
1230 "Unmatched ( or \\(", /* REG_EPAREN */ | |
1231 "Unmatched \\{", /* REG_EBRACE */ | |
1232 "Invalid content of \\{\\}", /* REG_BADBR */ | |
1233 "Invalid range end", /* REG_ERANGE */ | |
1234 "Memory exhausted", /* REG_ESPACE */ | |
1235 "Invalid preceding regular expression", /* REG_BADRPT */ | |
1236 "Premature end of regular expression", /* REG_EEND */ | |
1237 "Regular expression too big", /* REG_ESIZE */ | |
1238 "Unmatched ) or \\)", /* REG_ERPAREN */ | |
1239 #ifdef emacs | |
1240 "Invalid syntax designator", /* REG_ESYNTAX */ | |
1241 #endif | |
1242 #ifdef MULE | |
1243 "Ranges may not span charsets", /* REG_ERANGESPAN */ | |
1244 "Invalid category designator", /* REG_ECATEGORY */ | |
1245 #endif | |
1246 }; | |
1247 | |
1248 /* Avoiding alloca during matching, to placate r_alloc. */ | |
1249 | |
1333 | 1250 /* About these various flags: |
1251 | |
1252 MATCH_MAY_ALLOCATE indicates that it's OK to do allocation in the | |
1253 searching and matching functions. In this case, we use local variables | |
1254 to hold the values allocated. If not, we use *global* variables, which | |
1255 are pre-allocated. NOTE: XEmacs ***MUST*** run with MATCH_MAY_ALLOCATE, | |
1256 because the regexp routines may get called reentrantly as a result of | |
1257 QUIT processing (e.g. under Windows: re_match -> QUIT -> quit_p -> drain | |
1258 events -> process WM_INITMENU -> call filter -> re_match; see stack | |
1259 trace in signal.c), so we cannot have any global variables (unless we do | |
1260 lots of trickiness including some unwind-protects, which isn't worth it | |
1261 at this point). | |
1262 | |
1263 REL_ALLOC means that the relocating allocator is in use, for buffers | |
1264 and such. REGEX_REL_ALLOC means that we use rel-alloc to manage the | |
1265 fail stack, which may grow quite large. REGEX_MALLOC means we use | |
1266 malloc() in place of alloca() to allocate the fail stack -- only | |
1267 applicable if REGEX_REL_ALLOC is not defined. | |
1268 */ | |
1269 | |
428 | 1270 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the |
1271 searching and matching functions should not call alloca. On some | |
1272 systems, alloca is implemented in terms of malloc, and if we're | |
1273 using the relocating allocator routines, then malloc could cause a | |
1274 relocation, which might (if the strings being searched are in the | |
1275 ralloc heap) shift the data out from underneath the regexp | |
771 | 1276 routines. [To clarify: The purpose of rel-alloc is to allow data to |
1277 be moved in memory from one place to another so that all data | |
1278 blocks can be consolidated together and excess memory released back | |
1279 to the operating system. This requires that all the blocks that | |
1280 are managed by rel-alloc go at the very end of the program's heap, | |
1281 after all regularly malloc()ed data. malloc(), however, is used to | |
1282 owning the end of the heap, so that when more memory is needed, it | |
1283 just expands the heap using sbrk(). This is reconciled by using a | |
1284 malloc() (such as malloc.c, gmalloc.c, or recent versions of | |
1285 malloc() in libc) where the sbrk() call can be replaced with a | |
1286 user-specified call -- in this case, to rel-alloc's r_alloc_sbrk() | |
1287 routine. This routine calls the real sbrk(), but then shifts all | |
1288 the rel-alloc-managed blocks forward to the end of the heap again, | |
1289 so that malloc() gets the memory it needs in the location it needs | |
1290 it at. The regex routines may well have pointers to buffer data as | |
1291 their arguments, and buffers are managed by rel-alloc if rel-alloc | |
1292 has been enabled, so calling malloc() may potentially screw things | |
1293 up badly if it runs out of space and asks for more from the OS.] | |
1294 | |
1295 [[Here's another reason to avoid allocation: Emacs processes input | |
1296 from X in a signal handler; processing X input may call malloc; if | |
1297 input arrives while a matching routine is calling malloc, then | |
1298 we're scrod. But Emacs can't just block input while calling | |
1299 matching routines; then we don't notice interrupts when they come | |
1300 in. So, Emacs blocks input around all regexp calls except the | |
1301 matching calls, which it leaves unprotected, in the faith that they | |
1333 | 1302 will not malloc.]] This previous paragraph is irrelevant under XEmacs, |
1303 as we *do not* do anything so stupid as process input from within a | |
1304 signal handler. | |
1305 | |
1306 However, the regexp routines may get called reentrantly as a result of | |
1307 QUIT processing (e.g. under Windows: re_match -> QUIT -> quit_p -> drain | |
1308 events -> process WM_INITMENU -> call filter -> re_match; see stack | |
1309 trace in signal.c), so we cannot have any global variables (unless we do | |
1310 lots of trickiness including some unwind-protects, which isn't worth it | |
1311 at this point). Hence we MUST have MATCH_MAY_ALLOCATE defined. | |
1312 | |
1313 Also, the first paragraph does not make complete sense to me -- what | |
1314 about the use of rel-alloc to handle the fail stacks? Shouldn't these | |
1315 reallocations potentially cause buffer data to be relocated as well? I | |
826 | 1316 must be missing something, though -- perhaps the writer above is |
1317 assuming that the failure stack(s) will always be allocated after the | |
1318 buffer data, and thus reallocating them with rel-alloc won't move buffer | |
1333 | 1319 data. (In fact, a cursory glance at the code in ralloc.c seems to |
1320 confirm this.) --ben */ | |
428 | 1321 |
1322 /* Normally, this is fine. */ | |
1323 #define MATCH_MAY_ALLOCATE | |
1324 | |
1325 /* When using GNU C, we are not REALLY using the C alloca, no matter | |
1326 what config.h may say. So don't take precautions for it. */ | |
1327 #ifdef __GNUC__ | |
1328 #undef C_ALLOCA | |
1329 #endif | |
1330 | |
1331 /* The match routines may not allocate if (1) they would do it with malloc | |
1332 and (2) it's not safe for them to use malloc. | |
1333 Note that if REL_ALLOC is defined, matching would not use malloc for the | |
1334 failure stack, but we would still use it for the register vectors; | |
1335 so REL_ALLOC should not affect this. */ | |
771 | 1336 |
/* XEmacs can handle REL_ALLOC and malloc() OK, so only a non-emacs build
   that would allocate via malloc (C_ALLOCA or REGEX_MALLOC) while
   REL_ALLOC is in use gives up MATCH_MAY_ALLOCATE.  */
#if !defined (emacs) && (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (REL_ALLOC)
#undef MATCH_MAY_ALLOCATE
#endif

/* Refuse to build under emacs without MATCH_MAY_ALLOCATE.  */
#if !defined (MATCH_MAY_ALLOCATE) && defined (emacs)
#error regex must handle reentrancy; MATCH_MAY_ALLOCATE must be defined
#endif
1345 | |
428 | 1346 |
1347 /* Failure stack declarations and macros; both re_compile_fastmap and | |
1348 re_match_2 use a failure stack. These have to be macros because of | |
1349 REGEX_ALLOCATE_STACK. */ | |
1350 | |
1351 | |
1352 /* Number of failure points for which to initially allocate space | |
1353 when matching. If this number is exceeded, we allocate more | |
1354 space, so it is not a hard limit. */ | |
1355 #ifndef INIT_FAILURE_ALLOC | |
3300 | 1356 #define INIT_FAILURE_ALLOC 20 |
428 | 1357 #endif |
1358 | |
1359 /* Roughly the maximum number of failure points on the stack. Would be | |
1360 exactly that if we always used MAX_FAILURE_ITEMS each time we failed. | |
1361 This is a variable only so users of regex can assign to it; we never | |
1362 change it ourselves. */ | |
1363 #if defined (MATCH_MAY_ALLOCATE) | |
1364 /* 4400 was enough to cause a crash on Alpha OSF/1, | |
1365 whose default stack limit is 2mb. */ | |
3300 | 1366 int re_max_failures = 40000; |
428 | 1367 #else |
3300 | 1368 int re_max_failures = 4000; |
428 | 1369 #endif |
1370 | |
1371 union fail_stack_elt | |
1372 { | |
446 | 1373 re_char *pointer; |
428 | 1374 int integer; |
1375 }; | |
1376 | |
1377 typedef union fail_stack_elt fail_stack_elt_t; | |
1378 | |
1379 typedef struct | |
1380 { | |
1381 fail_stack_elt_t *stack; | |
665 | 1382 Elemcount size; |
1383 Elemcount avail; /* Offset of next open position. */ | |
428 | 1384 } fail_stack_type; |
1385 | |
1386 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0) | |
1387 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0) | |
1388 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size) | |
1389 | |
1390 | |
1391 /* Define macros to initialize and free the failure stack. | |
1392 Do `return -2' if the alloc fails. */ | |
1393 | |
1394 #ifdef MATCH_MAY_ALLOCATE | |
1333 | 1395 #define INIT_FAIL_STACK() \ |
1396 do { \ | |
1397 fail_stack.stack = (fail_stack_elt_t *) \ | |
1398 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * \ | |
1399 sizeof (fail_stack_elt_t)); \ | |
1400 \ | |
1401 if (fail_stack.stack == NULL) \ | |
1402 { \ | |
1403 UNBIND_REGEX_MALLOC_CHECK (); \ | |
1404 return -2; \ | |
1405 } \ | |
1406 \ | |
1407 fail_stack.size = INIT_FAILURE_ALLOC; \ | |
1408 fail_stack.avail = 0; \ | |
428 | 1409 } while (0) |
1410 | |
1411 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack) | |
1412 #else | |
1413 #define INIT_FAIL_STACK() \ | |
1414 do { \ | |
1415 fail_stack.avail = 0; \ | |
1416 } while (0) | |
1417 | |
1418 #define RESET_FAIL_STACK() | |
1419 #endif | |
1420 | |
1421 | |
1422 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items. | |
1423 | |
1424 Return 1 if succeeds, and 0 if either ran out of memory | |
1425 allocating space for it or it was already too large. | |
1426 | |
1427 REGEX_REALLOCATE_STACK requires `destination' be declared. */ | |
1428 | |
1429 #define DOUBLE_FAIL_STACK(fail_stack) \ | |
1430 ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \ | |
1431 ? 0 \ | |
1432 : ((fail_stack).stack = (fail_stack_elt_t *) \ | |
1433 REGEX_REALLOCATE_STACK ((fail_stack).stack, \ | |
1434 (fail_stack).size * sizeof (fail_stack_elt_t), \ | |
1435 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \ | |
1436 \ | |
1437 (fail_stack).stack == NULL \ | |
1438 ? 0 \ | |
1439 : ((fail_stack).size <<= 1, \ | |
1440 1))) | |
1441 | |
1333 | 1442 #if !defined (emacs) || !defined (REL_ALLOC) |
1443 #define RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS() | |
1444 #else | |
1445 /* Don't change NULL pointers */ | |
1446 #define ADD_IF_NZ(val) if (val) val += rmdp_offset | |
1346 | 1447 #define RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS() \ |
1448 do \ | |
1449 { \ | |
1450 Bytecount rmdp_offset = offset_post_relocation (lispobj, orig_buftext); \ | |
1451 \ | |
1452 if (rmdp_offset) \ | |
1453 { \ | |
1454 int i; \ | |
1455 \ | |
1456 ADD_IF_NZ (string1); \ | |
1457 ADD_IF_NZ (string2); \ | |
1458 ADD_IF_NZ (d); \ | |
1459 ADD_IF_NZ (dend); \ | |
1460 ADD_IF_NZ (end1); \ | |
1461 ADD_IF_NZ (end2); \ | |
1462 ADD_IF_NZ (end_match_1); \ | |
1463 ADD_IF_NZ (end_match_2); \ | |
1464 \ | |
1465 if (bufp->re_ngroups) \ | |
1466 { \ | |
1467 for (i = 0; i < num_regs; i++) \ | |
1468 { \ | |
1469 ADD_IF_NZ (regstart[i]); \ | |
1470 ADD_IF_NZ (regend[i]); \ | |
1471 ADD_IF_NZ (old_regstart[i]); \ | |
1472 ADD_IF_NZ (old_regend[i]); \ | |
1473 ADD_IF_NZ (best_regstart[i]); \ | |
1474 ADD_IF_NZ (best_regend[i]); \ | |
1475 ADD_IF_NZ (reg_dummy[i]); \ | |
1476 } \ | |
1477 } \ | |
1478 \ | |
1479 ADD_IF_NZ (match_end); \ | |
1480 } \ | |
1333 | 1481 } while (0) |
1482 #endif /* !defined (emacs) || !defined (REL_ALLOC) */ | |
1483 | |
1484 #if !defined (emacs) || !defined (REL_ALLOC) | |
1485 #define RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS() | |
1486 #else | |
1346 | 1487 #define RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS() \ |
1488 do \ | |
1489 { \ | |
1490 Bytecount rmdp_offset = offset_post_relocation (lispobj, orig_buftext); \ | |
1491 \ | |
1492 if (rmdp_offset) \ | |
1493 { \ | |
1494 ADD_IF_NZ (str1); \ | |
1495 ADD_IF_NZ (str2); \ | |
1496 ADD_IF_NZ (string1); \ | |
1497 ADD_IF_NZ (string2); \ | |
1498 ADD_IF_NZ (d); \ | |
1499 } \ | |
1333 | 1500 } while (0) |
1501 | |
1502 #endif /* !defined (emacs) || !defined (REL_ALLOC) */ | |
428 | 1503 |
1504 /* Push pointer POINTER on FAIL_STACK. | |
1505 Return 1 if was able to do so and 0 if ran out of memory allocating | |
1506 space to do so. */ | |
1507 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \ | |
1508 ((FAIL_STACK_FULL () \ | |
1509 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \ | |
1510 ? 0 \ | |
1511 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \ | |
1512 1)) | |
1513 | |
1514 /* Push a pointer value onto the failure stack. | |
1515 Assumes the variable `fail_stack'. Probably should only | |
1516 be called from within `PUSH_FAILURE_POINT'. */ | |
1517 #define PUSH_FAILURE_POINTER(item) \ | |
1518 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item) | |
1519 | |
1520 /* This pushes an integer-valued item onto the failure stack. | |
1521 Assumes the variable `fail_stack'. Probably should only | |
1522 be called from within `PUSH_FAILURE_POINT'. */ | |
1523 #define PUSH_FAILURE_INT(item) \ | |
1524 fail_stack.stack[fail_stack.avail++].integer = (item) | |
1525 | |
1526 /* Push a fail_stack_elt_t value onto the failure stack. | |
1527 Assumes the variable `fail_stack'. Probably should only | |
1528 be called from within `PUSH_FAILURE_POINT'. */ | |
1529 #define PUSH_FAILURE_ELT(item) \ | |
1530 fail_stack.stack[fail_stack.avail++] = (item) | |
1531 | |
1532 /* These three POP... operations complement the three PUSH... operations. | |
1533 All assume that `fail_stack' is nonempty. */ | |
1534 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer | |
1535 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer | |
1536 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail] | |
1537 | |
1538 /* Used to omit pushing failure point id's when we're not debugging. */ | |
1539 #ifdef DEBUG | |
1540 #define DEBUG_PUSH PUSH_FAILURE_INT | |
1541 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT () | |
1542 #else | |
1543 #define DEBUG_PUSH(item) | |
1544 #define DEBUG_POP(item_addr) | |
1545 #endif | |
1546 | |
1547 | |
1548 /* Push the information about the state we will need | |
1549 if we ever fail back to it. | |
1550 | |
1551 Requires variables fail_stack, regstart, regend, reg_info, and | |
1552 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be | |
1553 declared. | |
1554 | |
1555 Does `return FAILURE_CODE' if runs out of memory. */ | |
1556 | |
771 | 1557 #if !defined (REGEX_MALLOC) && !defined (REGEX_REL_ALLOC) |
456 | 1558 #define DECLARE_DESTINATION char *destination |
428 | 1559 #else |
456 | 1560 #define DECLARE_DESTINATION DECLARE_NOTHING |
428 | 1561 #endif |
1562 | |
/* Push order, which the matching pop must undo in reverse: for each
   register from lowest_active_reg to highest_active_reg its start
   pointer, end pointer and info word; then lowest_active_reg,
   highest_active_reg, PATTERN_PLACE, STRING_PLACE; and finally, via
   DEBUG_PUSH, the failure id (a no-op unless DEBUG is defined).

   The stack is doubled as often as needed before anything is pushed;
   if doubling fails the enclosing function returns FAILURE_CODE.  */
#define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
do { \
  DECLARE_DESTINATION; \
  /* Must be int, so when we don't save any registers, the arithmetic \
     of 0 + -1 isn't done as unsigned. */ \
  int this_reg; \
  \
  DEBUG_STATEMENT (failure_id++); \
  DEBUG_STATEMENT (nfailure_points_pushed++); \
  DEBUG_FAIL_PRINT2 ("\nPUSH_FAILURE_POINT #%d:\n", failure_id); \
  DEBUG_FAIL_PRINT2 (" Before push, next avail: %ld\n", \
                     (long) (fail_stack).avail); \
  DEBUG_FAIL_PRINT2 (" size: %ld\n", \
                     (long) (fail_stack).size); \
  \
  DEBUG_FAIL_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
  DEBUG_FAIL_PRINT2 (" available: %ld\n", \
                     (long) REMAINING_AVAIL_SLOTS); \
  \
  /* Ensure we have enough space allocated for what we will push. */ \
  while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
    { \
      BEGIN_REGEX_MALLOC_OK (); \
      if (!DOUBLE_FAIL_STACK (fail_stack)) \
        { \
          END_REGEX_MALLOC_OK (); \
          UNBIND_REGEX_MALLOC_CHECK (); \
          return failure_code; \
        } \
      END_REGEX_MALLOC_OK (); \
      DEBUG_FAIL_PRINT2 ("\n Doubled stack; size now: %ld\n", \
                         (long) (fail_stack).size); \
      DEBUG_FAIL_PRINT2 (" slots available: %ld\n", \
                         (long) REMAINING_AVAIL_SLOTS); \
      \
      RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); \
    } \
  \
  /* Push the info, starting with the registers. */ \
  DEBUG_FAIL_PRINT1 ("\n"); \
  \
  for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
       this_reg++) \
    { \
      DEBUG_FAIL_PRINT2 (" Pushing reg: %d\n", this_reg); \
      DEBUG_STATEMENT (num_regs_pushed++); \
      \
      DEBUG_FAIL_PRINT2 (" start: 0x%lx\n", (long) regstart[this_reg]); \
      PUSH_FAILURE_POINTER (regstart[this_reg]); \
      \
      DEBUG_FAIL_PRINT2 (" end: 0x%lx\n", (long) regend[this_reg]); \
      PUSH_FAILURE_POINTER (regend[this_reg]); \
      \
      DEBUG_FAIL_PRINT2 (" info: 0x%lx\n ", \
                         * (long *) (&reg_info[this_reg])); \
      DEBUG_FAIL_PRINT2 (" match_null=%d", \
                         REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
      DEBUG_FAIL_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
      DEBUG_FAIL_PRINT2 (" matched_something=%d", \
                         MATCHED_SOMETHING (reg_info[this_reg])); \
      DEBUG_FAIL_PRINT2 (" ever_matched_something=%d", \
                         EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
      DEBUG_FAIL_PRINT1 ("\n"); \
      PUSH_FAILURE_ELT (reg_info[this_reg].word); \
    } \
  \
  DEBUG_FAIL_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg); \
  PUSH_FAILURE_INT (lowest_active_reg); \
  \
  DEBUG_FAIL_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg); \
  PUSH_FAILURE_INT (highest_active_reg); \
  \
  DEBUG_FAIL_PRINT2 (" Pushing pattern 0x%lx: \n", (long) pattern_place); \
  DEBUG_FAIL_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
  PUSH_FAILURE_POINTER (pattern_place); \
  \
  DEBUG_FAIL_PRINT2 (" Pushing string 0x%lx: `", (long) string_place); \
  DEBUG_FAIL_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
                                  size2); \
  DEBUG_FAIL_PRINT1 ("'\n"); \
  PUSH_FAILURE_POINTER (string_place); \
  \
  DEBUG_FAIL_PRINT2 (" Pushing failure id: %u\n", failure_id); \
  DEBUG_PUSH (failure_id); \
} while (0)
428 | 1648 |
1649 /* This is the number of items that are pushed and popped on the stack | |
1650 for each register. */ | |
1651 #define NUM_REG_ITEMS 3 | |
1652 | |
1653 /* Individual items aside from the registers. */ | |
1654 #ifdef DEBUG | |
1655 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */ | |
1656 #else | |
1657 #define NUM_NONREG_ITEMS 4 | |
1658 #endif | |
1659 | |
1660 /* We push at most this many items on the stack. */ | |
1661 /* We used to use (num_regs - 1), which is the number of registers | |
1662 this regexp will save; but that was changed to 5 | |
1663 to avoid stack overflow for a regexp with lots of parens. */ | |
1664 #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS) | |
1665 | |
1666 /* We actually push this many items. */ | |
1667 #define NUM_FAILURE_ITEMS \ | |
1668 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \ | |
1669 + NUM_NONREG_ITEMS) | |
1670 | |
1671 /* How many items can still be added to the stack without overflowing it. */ | |
1672 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail) | |
1673 | |
1674 | |
1675 /* Pops what PUSH_FAILURE_POINT pushes. | |
1676 | |
1677 We restore into the parameters, all of which should be lvalues: | |
1678 STR -- the saved data position. | |
1679 PAT -- the saved pattern position. | |
1680 LOW_REG, HIGH_REG -- the highest and lowest active registers. | |
1681 REGSTART, REGEND -- arrays of string positions. | |
1682 REG_INFO -- array of information about each subexpression. | |
1683 | |
1684 Also assumes the variables `fail_stack' and (if debugging), `bufp', | |
1685 `pend', `string1', `size1', `string2', and `size2'. */ | |
1686 | |
456 | 1687 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, \ |
1688 regstart, regend, reg_info) \ | |
1689 do { \ | |
428 | 1690 DEBUG_STATEMENT (fail_stack_elt_t ffailure_id;) \ |
1691 int this_reg; \ | |
442 | 1692 const unsigned char *string_temp; \ |
428 | 1693 \ |
1694 assert (!FAIL_STACK_EMPTY ()); \ | |
1695 \ | |
1696 /* Remove failure points and point to how many regs pushed. */ \ | |
5041 | 1697 DEBUG_FAIL_PRINT1 ("POP_FAILURE_POINT:\n"); \ |
1698 DEBUG_FAIL_PRINT2 (" Before pop, next avail: %ld\n", \ | |
647 | 1699 (long) fail_stack.avail); \ |
5041 | 1700 DEBUG_FAIL_PRINT2 (" size: %ld\n", \ |
647 | 1701 (long) fail_stack.size); \ |
428 | 1702 \ |
1703 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \ | |
1704 \ | |
1705 DEBUG_POP (&ffailure_id.integer); \ | |
5041 | 1706 DEBUG_FAIL_PRINT2 (" Popping failure id: %d\n", \ |
647 | 1707 * (int *) &ffailure_id); \ |
428 | 1708 \ |
1709 /* If the saved string location is NULL, it came from an \ | |
1710 on_failure_keep_string_jump opcode, and we want to throw away the \ | |
1711 saved NULL, thus retaining our current position in the string. */ \ | |
1712 string_temp = POP_FAILURE_POINTER (); \ | |
1713 if (string_temp != NULL) \ | |
446 | 1714 str = string_temp; \ |
428 | 1715 \ |
5041 | 1716 DEBUG_FAIL_PRINT2 (" Popping string 0x%lx: `", (long) str); \ |
1717 DEBUG_FAIL_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \ | |
1718 DEBUG_FAIL_PRINT1 ("'\n"); \ | |
428 | 1719 \ |
1720 pat = (unsigned char *) POP_FAILURE_POINTER (); \ | |
5041 | 1721 DEBUG_FAIL_PRINT2 (" Popping pattern 0x%lx: ", (long) pat); \ |
1722 DEBUG_FAIL_PRINT_COMPILED_PATTERN (bufp, pat, pend); \ | |
428 | 1723 \ |
1724 /* Restore register info. */ \ | |
647 | 1725 high_reg = POP_FAILURE_INT (); \ |
5041 | 1726 DEBUG_FAIL_PRINT2 (" Popping high active reg: %d\n", high_reg); \ |
428 | 1727 \ |
647 | 1728 low_reg = POP_FAILURE_INT (); \ |
5041 | 1729 DEBUG_FAIL_PRINT2 (" Popping low active reg: %d\n", low_reg); \ |
428 | 1730 \ |
1731 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \ | |
1732 { \ | |
5041 | 1733 DEBUG_FAIL_PRINT2 (" Popping reg: %d\n", this_reg); \ |
428 | 1734 \ |
1735 reg_info[this_reg].word = POP_FAILURE_ELT (); \ | |
5041 | 1736 DEBUG_FAIL_PRINT2 (" info: 0x%lx\n", \ |
428 | 1737 * (long *) ®_info[this_reg]); \ |
1738 \ | |
446 | 1739 regend[this_reg] = POP_FAILURE_POINTER (); \ |
5041 | 1740 DEBUG_FAIL_PRINT2 (" end: 0x%lx\n", (long) regend[this_reg]); \ |
428 | 1741 \ |
446 | 1742 regstart[this_reg] = POP_FAILURE_POINTER (); \ |
5041 | 1743 DEBUG_FAIL_PRINT2 (" start: 0x%lx\n", (long) regstart[this_reg]); \ |
428 | 1744 } \ |
1745 \ | |
1746 set_regs_matched_done = 0; \ | |
1747 DEBUG_STATEMENT (nfailure_points_popped++); \ | |
456 | 1748 } while (0) /* POP_FAILURE_POINT */ |
428 | 1749 |
1750 | |
1751 | |
1752 /* Structure for per-register (a.k.a. per-group) information. | |
1753 Other register information, such as the | |
1754 starting and ending positions (which are addresses), and the list of | |
1755 inner groups (which is a bits list) are maintained in separate | |
1756 variables. | |
1757 | |
1758 We are making a (strictly speaking) nonportable assumption here: that | |
1759 the compiler will pack our bit fields into something that fits into | |
1760 the type of `word', i.e., is something that fits into one item on the | |
1761 failure stack. */ | |
1762 | |
1763 typedef union | |
1764 { | |
1765 fail_stack_elt_t word; | |
1766 struct | |
1767 { | |
1768 /* This field is one if this group can match the empty string, | |
1769 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */ | |
1770 #define MATCH_NULL_UNSET_VALUE 3 | |
647 | 1771 unsigned int match_null_string_p : 2; |
1772 unsigned int is_active : 1; | |
1773 unsigned int matched_something : 1; | |
1774 unsigned int ever_matched_something : 1; | |
428 | 1775 } bits; |
1776 } register_info_type; | |
1777 | |
1778 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p) | |
1779 #define IS_ACTIVE(R) ((R).bits.is_active) | |
1780 #define MATCHED_SOMETHING(R) ((R).bits.matched_something) | |
1781 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something) | |
1782 | |
1783 | |
/* Invoke after a real character has been matched.  Every register in
   the active range [lowest_active_reg, highest_active_reg] gets both
   its `matched_something' and `ever_matched_something' flags set.
   The work is skipped while `set_regs_matched_done' is still nonzero
   from an earlier call.  */
#define SET_REGS_MATCHED() \
  do \
    { \
      if (!set_regs_matched_done) \
        { \
          int reg_idx = lowest_active_reg; \
          set_regs_matched_done = 1; \
          while (reg_idx <= highest_active_reg) \
            { \
              EVER_MATCHED_SOMETHING (reg_info[reg_idx]) = 1; \
              MATCHED_SOMETHING (reg_info[reg_idx]) = 1; \
              reg_idx++; \
            } \
        } \
    } \
  while (0)
1803 | |
/* Registers are set to a sentinel when they haven't yet matched.  The
   sentinel is the address of a private dummy byte, so it can never
   compare equal to a position inside a real string.  */
static unsigned char reg_unset_dummy;
#define REG_UNSET_VALUE (&reg_unset_dummy)

/* Nonzero if the register position E still holds the sentinel.  */
#define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1808 | |
1809 /* Subroutine declarations and macros for regex_compile. */ | |
1810 | |
/* Read the next character of the uncompiled pattern into C and pass
   it through RE_TRANSLATE.  Like PATFETCH_RAW, this returns REG_EEND
   from the enclosing function when the pattern is exhausted.  */
#define PATFETCH(c) \
  do \
    { \
      PATFETCH_RAW (c); \
      c = RE_TRANSLATE (c); \
    } \
  while (0)

/* Read the next character of the uncompiled pattern into C without
   translating it, advancing `p' past it.  If `p' has already reached
   `pend', return REG_EEND from the enclosing function instead.  */
#define PATFETCH_RAW(c) \
  do \
    { \
      if (p == pend) \
        return REG_EEND; \
      assert (p < pend); \
      c = itext_ichar (p); \
      INC_IBYTEPTR (p); \
    } \
  while (0)

/* Step `p' back by one character in the pattern.  */
#define PATUNFETCH DEC_IBYTEPTR (p)
428 | 1830 |
/* Yield D unchanged when no translation is in effect for `translate'
   (as decided by TRANSLATE_P); otherwise yield RE_TRANSLATE_1 (D).
   D appears in both arms, so avoid arguments with side effects.  */
#define RE_TRANSLATE(d) \
  (!TRANSLATE_P (translate) ? (d) : RE_TRANSLATE_1 (d))
428 | 1837 |
/* Macros that emit the compiled pattern into `bufp->buffer'.  They
   all append at `buf_end'.  */

/* Size used when the caller passes in an unallocated buffer.  */
#define INIT_BUF_SIZE 32

/* Grow the buffer (repeatedly, if need be) until at least N more
   bytes fit after `buf_end'.  EXTEND_BUFFER may return from the
   enclosing function on failure.  */
#define GET_BUFFER_SPACE(n) \
  while (buf_end - bufp->buffer + (n) > (ptrdiff_t) bufp->allocated) \
    EXTEND_BUFFER ()

/* Append the single byte C, making room for it first.  */
#define BUF_PUSH(c) \
  do \
    { \
      GET_BUFFER_SPACE (1); \
      *buf_end++ = (unsigned char) (c); \
    } \
  while (0)


/* Append the two bytes C1 and C2, making room for both first.  */
#define BUF_PUSH_2(c1, c2) \
  do \
    { \
      GET_BUFFER_SPACE (2); \
      *buf_end++ = (unsigned char) (c1); \
      *buf_end++ = (unsigned char) (c2); \
    } \
  while (0)


/* Append the three bytes C1, C2 and C3, making room for all first.  */
#define BUF_PUSH_3(c1, c2, c3) \
  do \
    { \
      GET_BUFFER_SPACE (3); \
      *buf_end++ = (unsigned char) (c1); \
      *buf_end++ = (unsigned char) (c2); \
      *buf_end++ = (unsigned char) (c3); \
    } \
  while (0)
1873 | |
1874 | |
/* Write a jump with opcode OP at LOC whose target is TO.  The stored
   address is relative and is reduced by 3, the number of bytes the
   jump instruction itself occupies.  */
#define STORE_JUMP(op, loc, to) \
  store_op1 (op, loc, (to) - (loc) - 3)

/* Same, for a jump carrying the extra argument ARG.  */
#define STORE_JUMP2(op, loc, to, arg) \
  store_op2 (op, loc, (to) - (loc) - 3, arg)

/* Insertion counterpart of STORE_JUMP.  `buf_end' is passed along as
   the current end of the buffer.  */
#define INSERT_JUMP(op, loc, to) \
  insert_op1 (op, loc, (to) - (loc) - 3, buf_end)

/* Insertion counterpart of STORE_JUMP2.  `buf_end' is passed along as
   the current end of the buffer.  */
#define INSERT_JUMP2(op, loc, to, arg) \
  insert_op2 (op, loc, (to) - (loc) - 3, arg, buf_end)
428 | 1893 |
1894 | |
/* This limit is not arbitrary: arguments that encode offsets into the
   pattern are two bytes long.  If 2^16 bytes ever proves too small,
   many things would have to change.  */
#define MAX_BUF_SIZE (1L << 16)


/* Double the pattern buffer (clamped to MAX_BUF_SIZE) with xrealloc,
   then rebase every pointer that referred to the old block:
   `buf_end', `begalt', and, when non-null, `fixup_alt_jump',
   `laststart' and `pending_exact'.

   Returns from the enclosing function with REG_ESIZE if the buffer is
   already MAX_BUF_SIZE bytes, or with REG_ESPACE if the reallocation
   yields NULL.  */
#define EXTEND_BUFFER() \
  do \
    { \
      re_char *prev_base = bufp->buffer; \
      if (bufp->allocated == MAX_BUF_SIZE) \
        return REG_ESIZE; \
      bufp->allocated <<= 1; \
      if (bufp->allocated > MAX_BUF_SIZE) \
        bufp->allocated = MAX_BUF_SIZE; \
      bufp->buffer = \
        (unsigned char *) xrealloc (bufp->buffer, bufp->allocated); \
      if (bufp->buffer == NULL) \
        return REG_ESPACE; \
      /* Only rebase when the block actually moved.  */ \
      if (prev_base != bufp->buffer) \
        { \
          buf_end = (buf_end - prev_base) + bufp->buffer; \
          begalt = (begalt - prev_base) + bufp->buffer; \
          if (fixup_alt_jump) \
            fixup_alt_jump = (fixup_alt_jump - prev_base) + bufp->buffer; \
          if (laststart) \
            laststart = (laststart - prev_base) + bufp->buffer; \
          if (pending_exact) \
            pending_exact = (pending_exact - prev_base) + bufp->buffer; \
        } \
    } \
  while (0)
1930 | |
1931 | |
/* The register-number argument of {start,stop}_memory is a single
   byte, so this is the largest group number we can report on.  */
#define MAX_REGNUM 255

/* Patterns may still contain more than `MAX_REGNUM' registers; the
   original claim was that the excess is simply ignored.
   #### not true! groups past this will fail in lots of ways, if we
   ever have to backtrack.  */
typedef int regnum_t;

/* Initial element count of `external_to_internal_register'.  */
#define INIT_REG_TRANSLATE_SIZE 5
428 | 1945 |
1946 /* Macros for the compile stack. */ | |
1947 | |
1948 /* Since offsets can go either forwards or backwards, this type needs to | |
1949 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */ | |
1950 typedef int pattern_offset_t; | |
1951 | |
1952 typedef struct | |
1953 { | |
1954 pattern_offset_t begalt_offset; | |
1955 pattern_offset_t fixup_alt_jump; | |
1956 pattern_offset_t inner_group_offset; | |
1957 pattern_offset_t laststart_offset; | |
1958 regnum_t regnum; | |
1959 } compile_stack_elt_t; | |
1960 | |
1961 | |
1962 typedef struct | |
1963 { | |
1964 compile_stack_elt_t *stack; | |
647 | 1965 int size; |
1966 int avail; /* Offset of next open position. */ | |
428 | 1967 } compile_stack_type; |
1968 | |
1969 | |
1970 #define INIT_COMPILE_STACK_SIZE 32 | |
1971 | |
1972 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0) | |
1973 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) | |
1974 | |
1975 /* The next available element. */ | |
1976 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) | |
1977 | |
1978 | |
/* Set the bit for character C in the bit vector that starts at
   `buf_end'.  C is reduced to an unsigned char before it selects the
   byte (C / BYTEWIDTH) and the bit within it (C % BYTEWIDTH).

   Both uses of C are parenthesized so the cast applies to the whole
   argument even when it is a compound expression.  C is evaluated
   twice, so it must not have side effects.  */
#define SET_LIST_BIT(c) \
  (buf_end[((unsigned char) (c)) / BYTEWIDTH] \
   |= 1 << (((unsigned char) (c)) % BYTEWIDTH))
1983 | |
#ifdef MULE

/* Record character C in the range table `rtab' by mapping the
   one-character range [C, C] to Qt.  */
#define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt)

/* Record character C in whichever representation is in use: the
   plain bit vector until `has_extended_chars' is set, the range table
   afterwards.  */
#define SET_EITHER_BIT(c) \
  do \
    { \
      if (!has_extended_chars) \
        SET_LIST_BIT (c); \
      else \
        SET_RANGETAB_BIT (c); \
    } \
  while (0)

#else /* not MULE */

/* Without MULE only the bit vector exists.  */
#define SET_EITHER_BIT(c) SET_LIST_BIT (c)

#endif
2003 | |
2004 | |
/* Parse the next unsigned decimal number in the uncompiled pattern
   into NUM, using the enclosing function's `p', `pend' and `c'.

   NUM is left untouched when no digit is found.  A negative NUM is
   reset to 0 before the first digit is accumulated.  On exit `c'
   holds the last character fetched: the first non-digit, or the final
   digit if the pattern ended.

   This expands to a bare brace block, not a do/while (0) statement.  */
#define GET_UNSIGNED_NUMBER(num) \
  { \
    if (p != pend) \
      { \
        PATFETCH (c); \
        while (ISDIGIT (c)) \
          { \
            if (num < 0) \
              num = 0; \
            num = num * 10 + c - '0'; \
            if (p == pend) \
              break; \
            PATFETCH (c); \
          } \
      } \
  }
2021 | |
/* Length of the longest class name below, namely `xdigit'.  */
#define CHAR_CLASS_MAX_LENGTH 6

/* Nonzero if STRING names one of the twelve recognized character
   classes.  STRING is expanded once per comparison.  */
#define IS_CHAR_CLASS(string) \
  (STREQ (string, "alpha") \
   || STREQ (string, "upper") \
   || STREQ (string, "lower") \
   || STREQ (string, "digit") \
   || STREQ (string, "alnum") \
   || STREQ (string, "xdigit") \
   || STREQ (string, "space") \
   || STREQ (string, "print") \
   || STREQ (string, "punct") \
   || STREQ (string, "graph") \
   || STREQ (string, "cntrl") \
   || STREQ (string, "blank"))
2031 | |
2032 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg); | |
2033 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2); | |
2034 static void insert_op1 (re_opcode_t op, unsigned char *loc, int arg, | |
2035 unsigned char *end); | |
2036 static void insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, | |
2037 unsigned char *end); | |
460 | 2038 static re_bool at_begline_loc_p (re_char *pattern, re_char *p, |
428 | 2039 reg_syntax_t syntax); |
460 | 2040 static re_bool at_endline_loc_p (re_char *p, re_char *pend, int syntax); |
2041 static re_bool group_in_compile_stack (compile_stack_type compile_stack, | |
428 | 2042 regnum_t regnum); |
446 | 2043 static reg_errcode_t compile_range (re_char **p_ptr, re_char *pend, |
2044 RE_TRANSLATE_TYPE translate, | |
2045 reg_syntax_t syntax, | |
428 | 2046 unsigned char *b); |
2047 #ifdef MULE | |
446 | 2048 static reg_errcode_t compile_extended_range (re_char **p_ptr, |
2049 re_char *pend, | |
2050 RE_TRANSLATE_TYPE translate, | |
428 | 2051 reg_syntax_t syntax, |
2052 Lisp_Object rtab); | |
2053 #endif /* MULE */ | |
460 | 2054 static re_bool group_match_null_string_p (unsigned char **p, |
428 | 2055 unsigned char *end, |
2056 register_info_type *reg_info); | |
460 | 2057 static re_bool alt_match_null_string_p (unsigned char *p, unsigned char *end, |
428 | 2058 register_info_type *reg_info); |
460 | 2059 static re_bool common_op_match_null_string_p (unsigned char **p, |
428 | 2060 unsigned char *end, |
2061 register_info_type *reg_info); | |
826 | 2062 static int bcmp_translate (re_char *s1, re_char *s2, |
2063 REGISTER int len, RE_TRANSLATE_TYPE translate | |
2064 #ifdef emacs | |
2065 , Internal_Format fmt, Lisp_Object lispobj | |
2066 #endif | |
2067 ); | |
428 | 2068 static int re_match_2_internal (struct re_pattern_buffer *bufp, |
446 | 2069 re_char *string1, int size1, |
2070 re_char *string2, int size2, int pos, | |
826 | 2071 struct re_registers *regs, int stop |
2072 RE_LISP_CONTEXT_ARGS_DECL); | |
428 | 2073 |
2074 #ifndef MATCH_MAY_ALLOCATE | |
2075 | |
2076 /* If we cannot allocate large objects within re_match_2_internal, | |
2077 we make the fail stack and register vectors global. | |
2078 The fail stack, we grow to the maximum size when a regexp | |
2079 is compiled. | |
2080 The register vectors, we adjust in size each time we | |
2081 compile a regexp, according to the number of registers it needs. */ | |
2082 | |
2083 static fail_stack_type fail_stack; | |
2084 | |
2085 /* Size with which the following vectors are currently allocated. | |
2086 That is so we can make them bigger as needed, | |
2087 but never make them smaller. */ | |
2088 static int regs_allocated_size; | |
2089 | |
446 | 2090 static re_char ** regstart, ** regend; |
2091 static re_char ** old_regstart, ** old_regend; | |
2092 static re_char **best_regstart, **best_regend; | |
428 | 2093 static register_info_type *reg_info; |
446 | 2094 static re_char **reg_dummy; |
428 | 2095 static register_info_type *reg_info_dummy; |
2096 | |
2097 /* Make the register vectors big enough for NUM_REGS registers, | |
2098 but don't make them smaller. */ | |
2099 | |
2100 static | |
2101 regex_grow_registers (int num_regs) | |
2102 { | |
2103 if (num_regs > regs_allocated_size) | |
2104 { | |
551 | 2105 RETALLOC (regstart, num_regs, re_char *); |
2106 RETALLOC (regend, num_regs, re_char *); | |
2107 RETALLOC (old_regstart, num_regs, re_char *); | |
2108 RETALLOC (old_regend, num_regs, re_char *); | |
2109 RETALLOC (best_regstart, num_regs, re_char *); | |
2110 RETALLOC (best_regend, num_regs, re_char *); | |
2111 RETALLOC (reg_info, num_regs, register_info_type); | |
2112 RETALLOC (reg_dummy, num_regs, re_char *); | |
2113 RETALLOC (reg_info_dummy, num_regs, register_info_type); | |
428 | 2114 |
2115 regs_allocated_size = num_regs; | |
2116 } | |
2117 } | |
2118 | |
2119 #endif /* not MATCH_MAY_ALLOCATE */ | |
2120 | |
2121 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. | |
2122 Returns one of error codes defined in `regex.h', or zero for success. | |
2123 | |
2124 Assumes the `allocated' (and perhaps `buffer') and `translate' | |
2125 fields are set in BUFP on entry. | |
2126 | |
2127 If it succeeds, results are put in BUFP (if it returns an error, the | |
2128 contents of BUFP are undefined): | |
2129 `buffer' is the compiled pattern; | |
2130 `syntax' is set to SYNTAX; | |
2131 `used' is set to the length of the compiled pattern; | |
2132 `fastmap_accurate' is zero; | |
502 | 2133 `re_ngroups' is the number of groups/subexpressions (including shy |
2134 groups) in PATTERN; | |
2135 `re_nsub' is the number of non-shy groups in PATTERN; | |
428 | 2136 `not_bol' and `not_eol' are zero; |
2137 | |
2138 The `fastmap' and `newline_anchor' fields are neither | |
2139 examined nor set. */ | |
2140 | |
/* Free the compile stack that regex_compile allocated, then return
   VALUE from the enclosing function.  */
#define FREE_STACK_RETURN(value) \
  do \
    { \
      xfree (compile_stack.stack); \
      return value; \
    } \
  while (0)
428 | 2148 |
2149 static reg_errcode_t | |
446 | 2150 regex_compile (re_char *pattern, int size, reg_syntax_t syntax, |
428 | 2151 struct re_pattern_buffer *bufp) |
2152 { | |
2153 /* We fetch characters from PATTERN here. We declare these as int | |
2154 (or possibly long) so that chars above 127 can be used as | |
2155 array indices. The macros that fetch a character from the pattern | |
2156 make sure to coerce to unsigned char before assigning, so we won't | |
2157 get bitten by negative numbers here. */ | |
2158 /* XEmacs change: used to be unsigned char. */ | |
2159 REGISTER EMACS_INT c, c1; | |
2160 | |
2161 /* A random temporary spot in PATTERN. */ | |
446 | 2162 re_char *p1; |
428 | 2163 |
2164 /* Points to the end of the buffer, where we should append. */ | |
446 | 2165 REGISTER unsigned char *buf_end; |
428 | 2166 |
2167 /* Keeps track of unclosed groups. */ | |
2168 compile_stack_type compile_stack; | |
2169 | |
2170 /* Points to the current (ending) position in the pattern. */ | |
446 | 2171 re_char *p = pattern; |
2172 re_char *pend = pattern + size; | |
428 | 2173 |
2174 /* How to translate the characters in the pattern. */ | |
446 | 2175 RE_TRANSLATE_TYPE translate = bufp->translate; |
428 | 2176 |
2177 /* Address of the count-byte of the most recently inserted `exactn' | |
2178 command. This makes it possible to tell if a new exact-match | |
2179 character can be added to that command or if the character requires | |
2180 a new `exactn' command. */ | |
2181 unsigned char *pending_exact = 0; | |
2182 | |
2183 /* Address of start of the most recently finished expression. | |
2184 This tells, e.g., postfix * where to find the start of its | |
2185 operand. Reset at the beginning of groups and alternatives. */ | |
2186 unsigned char *laststart = 0; | |
2187 | |
2188 /* Address of beginning of regexp, or inside of last group. */ | |
2189 unsigned char *begalt; | |
2190 | |
2191 /* Place in the uncompiled pattern (i.e., the {) to | |
2192 which to go back if the interval is invalid. */ | |
446 | 2193 re_char *beg_interval; |
428 | 2194 |
2195 /* Address of the place where a forward jump should go to the end of | |
2196 the containing expression. Each alternative of an `or' -- except the | |
2197 last -- ends with a forward jump of this sort. */ | |
2198 unsigned char *fixup_alt_jump = 0; | |
2199 | |
2200 /* Counts open-groups as they are encountered. Remembered for the | |
2201 matching close-group on the compile stack, so the same register | |
2202 number is put in the stop_memory as the start_memory. */ | |
2203 regnum_t regnum = 0; | |
2204 | |
2205 #ifdef DEBUG | |
5041 | 2206 if (debug_regexps & RE_DEBUG_COMPILATION) |
428 | 2207 { |
647 | 2208 int debug_count; |
428 | 2209 |
5041 | 2210 DEBUG_PRINT1 ("\nCompiling pattern: "); |
428 | 2211 for (debug_count = 0; debug_count < size; debug_count++) |
2212 putchar (pattern[debug_count]); | |
2213 putchar ('\n'); | |
2214 } | |
2215 #endif /* DEBUG */ | |
2216 | |
2217 /* Initialize the compile stack. */ | |
2218 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); | |
2219 if (compile_stack.stack == NULL) | |
2220 return REG_ESPACE; | |
2221 | |
2222 compile_stack.size = INIT_COMPILE_STACK_SIZE; | |
2223 compile_stack.avail = 0; | |
2224 | |
2225 /* Initialize the pattern buffer. */ | |
2226 bufp->syntax = syntax; | |
2227 bufp->fastmap_accurate = 0; | |
2228 bufp->not_bol = bufp->not_eol = 0; | |
2229 | |
2230 /* Set `used' to zero, so that if we return an error, the pattern | |
2231 printer (for debugging) will think there's no pattern. We reset it | |
2232 at the end. */ | |
2233 bufp->used = 0; | |
2234 | |
2235 /* Always count groups, whether or not bufp->no_sub is set. */ | |
2236 bufp->re_nsub = 0; | |
502 | 2237 bufp->re_ngroups = 0; |
2238 | |
2239 bufp->warned_about_incompatible_back_references = 0; | |
2240 | |
2241 if (bufp->external_to_internal_register == 0) | |
2242 { | |
2243 bufp->external_to_internal_register_size = INIT_REG_TRANSLATE_SIZE; | |
2244 RETALLOC (bufp->external_to_internal_register, | |
2245 bufp->external_to_internal_register_size, | |
2246 int); | |
2247 } | |
2248 | |
2249 { | |
2250 int i; | |
2251 | |
2252 bufp->external_to_internal_register[0] = 0; | |
2253 for (i = 1; i < bufp->external_to_internal_register_size; i++) | |
2254 bufp->external_to_internal_register[i] = (int) 0xDEADBEEF; | |
2255 } | |
428 | 2256 |
2257 #if !defined (emacs) && !defined (SYNTAX_TABLE) | |
2258 /* Initialize the syntax table. */ | |
2259 init_syntax_once (); | |
2260 #endif | |
2261 | |
2262 if (bufp->allocated == 0) | |
2263 { | |
2264 if (bufp->buffer) | |
2265 { /* If zero allocated, but buffer is non-null, try to realloc | |
2266 enough space. This loses if buffer's address is bogus, but | |
2267 that is the user's responsibility. */ | |
2268 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char); | |
2269 } | |
2270 else | |
2271 { /* Caller did not allocate a buffer. Do it for them. */ | |
2272 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char); | |
2273 } | |
2274 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE); | |
2275 | |
2276 bufp->allocated = INIT_BUF_SIZE; | |
2277 } | |
2278 | |
446 | 2279 begalt = buf_end = bufp->buffer; |
428 | 2280 |
2281 /* Loop through the uncompiled pattern until we're at the end. */ | |
2282 while (p != pend) | |
2283 { | |
2284 PATFETCH (c); | |
2285 | |
2286 switch (c) | |
2287 { | |
2288 case '^': | |
2289 { | |
2290 if ( /* If at start of pattern, it's an operator. */ | |
2291 p == pattern + 1 | |
2292 /* If context independent, it's an operator. */ | |
2293 || syntax & RE_CONTEXT_INDEP_ANCHORS | |
2294 /* Otherwise, depends on what's come before. */ | |
2295 || at_begline_loc_p (pattern, p, syntax)) | |
2296 BUF_PUSH (begline); | |
2297 else | |
2298 goto normal_char; | |
2299 } | |
2300 break; | |
2301 | |
2302 | |
2303 case '$': | |
2304 { | |
2305 if ( /* If at end of pattern, it's an operator. */ | |
2306 p == pend | |
2307 /* If context independent, it's an operator. */ | |
2308 || syntax & RE_CONTEXT_INDEP_ANCHORS | |
2309 /* Otherwise, depends on what's next. */ | |
2310 || at_endline_loc_p (p, pend, syntax)) | |
2311 BUF_PUSH (endline); | |
2312 else | |
2313 goto normal_char; | |
2314 } | |
2315 break; | |
2316 | |
2317 | |
2318 case '+': | |
2319 case '?': | |
2320 if ((syntax & RE_BK_PLUS_QM) | |
2321 || (syntax & RE_LIMITED_OPS)) | |
2322 goto normal_char; | |
2323 handle_plus: | |
2324 case '*': | |
2325 /* If there is no previous pattern... */ | |
2326 if (!laststart) | |
2327 { | |
2328 if (syntax & RE_CONTEXT_INVALID_OPS) | |
2329 FREE_STACK_RETURN (REG_BADRPT); | |
2330 else if (!(syntax & RE_CONTEXT_INDEP_OPS)) | |
2331 goto normal_char; | |
2332 } | |
2333 | |
2334 { | |
2335 /* true means zero/many matches are allowed. */ | |
460 | 2336 re_bool zero_times_ok = c != '+'; |
2337 re_bool many_times_ok = c != '?'; | |
428 | 2338 |
2339 /* true means match shortest string possible. */ | |
460 | 2340 re_bool minimal = false; |
428 | 2341 |
2342 /* If there is a sequence of repetition chars, collapse it | |
2343 down to just one (the right one). We can't combine | |
2344 interval operators with these because of, e.g., `a{2}*', | |
2345 which should only match an even number of `a's. */ | |
2346 while (p != pend) | |
2347 { | |
2348 PATFETCH (c); | |
2349 | |
2350 if (c == '*' || (!(syntax & RE_BK_PLUS_QM) | |
2351 && (c == '+' || c == '?'))) | |
2352 ; | |
2353 | |
2354 else if (syntax & RE_BK_PLUS_QM && c == '\\') | |
2355 { | |
2356 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); | |
2357 | |
2358 PATFETCH (c1); | |
2359 if (!(c1 == '+' || c1 == '?')) | |
2360 { | |
2361 PATUNFETCH; | |
2362 PATUNFETCH; | |
2363 break; | |
2364 } | |
2365 | |
2366 c = c1; | |
2367 } | |
2368 else | |
2369 { | |
2370 PATUNFETCH; | |
2371 break; | |
2372 } | |
2373 | |
2374 /* If we get here, we found another repeat character. */ | |
2375 if (!(syntax & RE_NO_MINIMAL_MATCHING)) | |
2376 { | |
440 | 2377 /* "*?" and "+?" and "??" are okay (and mean match |
2378 minimally), but other sequences (such as "*??" and | |
2379 "+++") are rejected (reserved for future use). */ | |
428 | 2380 if (minimal || c != '?') |
2381 FREE_STACK_RETURN (REG_BADRPT); | |
2382 minimal = true; | |
2383 } | |
2384 else | |
2385 { | |
2386 zero_times_ok |= c != '+'; | |
2387 many_times_ok |= c != '?'; | |
2388 } | |
2389 } | |
2390 | |
2391 /* Star, etc. applied to an empty pattern is equivalent | |
2392 to an empty pattern. */ | |
2393 if (!laststart) | |
2394 break; | |
2395 | |
2396 /* Now we know whether zero matches is allowed | |
2397 and whether two or more matches is allowed | |
2398 and whether we want minimal or maximal matching. */ | |
2399 if (minimal) | |
2400 { | |
2401 if (!many_times_ok) | |
2402 { | |
2403 /* "a??" becomes: | |
2404 0: /on_failure_jump to 6 | |
2405 3: /jump to 9 | |
2406 6: /exactn/1/A | |
2407 9: end of pattern. | |
2408 */ | |
2409 GET_BUFFER_SPACE (6); | |
446 | 2410 INSERT_JUMP (jump, laststart, buf_end + 3); |
2411 buf_end += 3; | |
428 | 2412 INSERT_JUMP (on_failure_jump, laststart, laststart + 6); |
446 | 2413 buf_end += 3; |
428 | 2414 } |
2415 else if (zero_times_ok) | |
2416 { | |
2417 /* "a*?" becomes: | |
2418 0: /jump to 6 | |
2419 3: /exactn/1/A | |
2420 6: /on_failure_jump to 3 | |
2421 9: end of pattern. | |
2422 */ | |
2423 GET_BUFFER_SPACE (6); | |
446 | 2424 INSERT_JUMP (jump, laststart, buf_end + 3); |
2425 buf_end += 3; | |
2426 STORE_JUMP (on_failure_jump, buf_end, laststart + 3); | |
2427 buf_end += 3; | |
428 | 2428 } |
2429 else | |
2430 { | |
2431 /* "a+?" becomes: | |
2432 0: /exactn/1/A | |
2433 3: /on_failure_jump to 0 | |
2434 6: end of pattern. | |
2435 */ | |
2436 GET_BUFFER_SPACE (3); | |
446 | 2437 STORE_JUMP (on_failure_jump, buf_end, laststart); |
2438 buf_end += 3; | |
428 | 2439 } |
2440 } | |
2441 else | |
2442 { | |
2443 /* Are we optimizing this jump? */ | |
460 | 2444 re_bool keep_string_p = false; |
428 | 2445 |
2446 if (many_times_ok) | |
446 | 2447 { /* More than one repetition is allowed, so put in |
2448 at the end a backward relative jump from | |
2449 `buf_end' to before the next jump we're going | |
2450 to put in below (which jumps from laststart to | |
2451 after this jump). | |
428 | 2452 |
2453 But if we are at the `*' in the exact sequence `.*\n', | |
2454 insert an unconditional jump backwards to the ., | |
2455 instead of the beginning of the loop. This way we only | |
2456 push a failure point once, instead of every time | |
2457 through the loop. */ | |
2458 assert (p - 1 > pattern); | |
2459 | |
2460 /* Allocate the space for the jump. */ | |
2461 GET_BUFFER_SPACE (3); | |
2462 | |
2463 /* We know we are not at the first character of the | |
2464 pattern, because laststart was nonzero. And we've | |
2465 already incremented `p', by the way, to be the | |
2466 character after the `*'. Do we have to do something | |
2467 analogous here for null bytes, because of | |
2468 RE_DOT_NOT_NULL? */ | |
446 | 2469 if (*(p - 2) == '.' |
428 | 2470 && zero_times_ok |
446 | 2471 && p < pend && *p == '\n' |
428 | 2472 && !(syntax & RE_DOT_NEWLINE)) |
2473 { /* We have .*\n. */ | |
446 | 2474 STORE_JUMP (jump, buf_end, laststart); |
428 | 2475 keep_string_p = true; |
2476 } | |
2477 else | |
2478 /* Anything else. */ | |
446 | 2479 STORE_JUMP (maybe_pop_jump, buf_end, laststart - 3); |
428 | 2480 |
2481 /* We've added more stuff to the buffer. */ | |
446 | 2482 buf_end += 3; |
428 | 2483 } |
2484 | |
446 | 2485 /* On failure, jump from laststart to buf_end + 3, |
2486 which will be the end of the buffer after this jump | |
2487 is inserted. */ | |
428 | 2488 GET_BUFFER_SPACE (3); |
2489 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump | |
2490 : on_failure_jump, | |
446 | 2491 laststart, buf_end + 3); |
2492 buf_end += 3; | |
428 | 2493 |
2494 if (!zero_times_ok) | |
2495 { | |
2496 /* At least one repetition is required, so insert a | |
2497 `dummy_failure_jump' before the initial | |
2498 `on_failure_jump' instruction of the loop. This | |
2499 effects a skip over that instruction the first time | |
2500 we hit that loop. */ | |
2501 GET_BUFFER_SPACE (3); | |
2502 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6); | |
446 | 2503 buf_end += 3; |
428 | 2504 } |
2505 } | |
2506 pending_exact = 0; | |
2507 } | |
2508 break; | |
2509 | |
2510 | |
2511 case '.': | |
446 | 2512 laststart = buf_end; |
428 | 2513 BUF_PUSH (anychar); |
2514 break; | |
2515 | |
2516 | |
2517 case '[': | |
2518 { | |
2519 /* XEmacs change: this whole section */ | |
460 | 2520 re_bool had_char_class = false; |
428 | 2521 #ifdef MULE |
460 | 2522 re_bool has_extended_chars = false; |
428 | 2523 REGISTER Lisp_Object rtab = Qnil; |
2524 #endif | |
2525 | |
2526 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | |
2527 | |
2528 /* Ensure that we have enough space to push a charset: the | |
2529 opcode, the length count, and the bitset; 34 bytes in all. */ | |
2530 GET_BUFFER_SPACE (34); | |
2531 | |
446 | 2532 laststart = buf_end; |
428 | 2533 |
2534 /* We test `*p == '^' twice, instead of using an if | |
2535 statement, so we only need one BUF_PUSH. */ | |
2536 BUF_PUSH (*p == '^' ? charset_not : charset); | |
2537 if (*p == '^') | |
2538 p++; | |
2539 | |
2540 /* Remember the first position in the bracket expression. */ | |
2541 p1 = p; | |
2542 | |
2543 /* Push the number of bytes in the bitmap. */ | |
2544 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH); | |
2545 | |
2546 /* Clear the whole map. */ | |
446 | 2547 memset (buf_end, 0, (1 << BYTEWIDTH) / BYTEWIDTH); |
428 | 2548 |
2549 /* charset_not matches newline according to a syntax bit. */ | |
446 | 2550 if ((re_opcode_t) buf_end[-2] == charset_not |
428 | 2551 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) |
2552 SET_LIST_BIT ('\n'); | |
2553 | |
2554 #ifdef MULE | |
2555 start_over_with_extended: | |
2556 if (has_extended_chars) | |
2557 { | |
2558 /* There are extended chars here, which means we need to start | |
2559 over and shift to unified range-table format. */ | |
446 | 2560 if (buf_end[-2] == charset) |
2561 buf_end[-2] = charset_mule; | |
428 | 2562 else |
446 | 2563 buf_end[-2] = charset_mule_not; |
2564 buf_end--; | |
428 | 2565 p = p1; /* go back to the beginning of the charset, after |
2566 a possible ^. */ | |
2567 rtab = Vthe_lisp_rangetab; | |
2568 Fclear_range_table (rtab); | |
2569 | |
2570 /* charset_not matches newline according to a syntax bit. */ | |
446 | 2571 if ((re_opcode_t) buf_end[-1] == charset_mule_not |
428 | 2572 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) |
2573 SET_EITHER_BIT ('\n'); | |
2574 } | |
2575 #endif /* MULE */ | |
2576 | |
2577 /* Read in characters and ranges, setting map bits. */ | |
2578 for (;;) | |
2579 { | |
2580 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | |
2581 | |
446 | 2582 PATFETCH (c); |
428 | 2583 |
2584 #ifdef MULE | |
2585 if (c >= 0x80 && !has_extended_chars) | |
2586 { | |
2587 has_extended_chars = 1; | |
2588 /* Frumble-bumble, we've found some extended chars. | |
2589 Need to start over, process everything using | |
2590 the general extended-char mechanism, and need | |
2591 to use charset_mule and charset_mule_not instead | |
2592 of charset and charset_not. */ | |
2593 goto start_over_with_extended; | |
2594 } | |
2595 #endif /* MULE */ | |
2596 /* \ might escape characters inside [...] and [^...]. */ | |
2597 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') | |
2598 { | |
2599 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); | |
2600 | |
446 | 2601 PATFETCH (c1); |
428 | 2602 #ifdef MULE |
2603 if (c1 >= 0x80 && !has_extended_chars) | |
2604 { | |
2605 has_extended_chars = 1; | |
2606 goto start_over_with_extended; | |
2607 } | |
2608 #endif /* MULE */ | |
2609 SET_EITHER_BIT (c1); | |
2610 continue; | |
2611 } | |
2612 | |
2613 /* Could be the end of the bracket expression. If it's | |
2614 not (i.e., when the bracket expression is `[]' so | |
2615 far), the ']' character bit gets set way below. */ | |
2616 if (c == ']' && p != p1 + 1) | |
2617 break; | |
2618 | |
2619 /* Look ahead to see if it's a range when the last thing | |
2620 was a character class. */ | |
2621 if (had_char_class && c == '-' && *p != ']') | |
2622 FREE_STACK_RETURN (REG_ERANGE); | |
2623 | |
2624 /* Look ahead to see if it's a range when the last thing | |
2625 was a character: if this is a hyphen not at the | |
2626 beginning or the end of a list, then it's the range | |
2627 operator. */ | |
2628 if (c == '-' | |
2629 && !(p - 2 >= pattern && p[-2] == '[') | |
446 | 2630 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') |
428 | 2631 && *p != ']') |
2632 { | |
2633 reg_errcode_t ret; | |
2634 | |
2635 #ifdef MULE | |
2636 if (* (unsigned char *) p >= 0x80 && !has_extended_chars) | |
2637 { | |
2638 has_extended_chars = 1; | |
2639 goto start_over_with_extended; | |
2640 } | |
2641 if (has_extended_chars) | |
2642 ret = compile_extended_range (&p, pend, translate, | |
2643 syntax, rtab); | |
2644 else | |
2645 #endif /* MULE */ | |
446 | 2646 ret = compile_range (&p, pend, translate, syntax, buf_end); |
428 | 2647 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); |
2648 } | |
2649 | |
2650 else if (p[0] == '-' && p[1] != ']') | |
2651 { /* This handles ranges made up of characters only. */ | |
2652 reg_errcode_t ret; | |
2653 | |
2654 /* Move past the `-'. */ | |
2655 PATFETCH (c1); | |
2656 | |
2657 #ifdef MULE | |
2658 if (* (unsigned char *) p >= 0x80 && !has_extended_chars) | |
2659 { | |
2660 has_extended_chars = 1; | |
2661 goto start_over_with_extended; | |
2662 } | |
2663 if (has_extended_chars) | |
2664 ret = compile_extended_range (&p, pend, translate, | |
2665 syntax, rtab); | |
2666 else | |
2667 #endif /* MULE */ | |
446 | 2668 ret = compile_range (&p, pend, translate, syntax, buf_end); |
428 | 2669 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); |
2670 } | |
2671 | |
2672 /* See if we're at the beginning of a possible character | |
2673 class. */ | |
2674 | |
2675 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') | |
2676 { /* Leave room for the null. */ | |
2677 char str[CHAR_CLASS_MAX_LENGTH + 1]; | |
2678 | |
2679 PATFETCH (c); | |
2680 c1 = 0; | |
2681 | |
2682 /* If pattern is `[[:'. */ | |
2683 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | |
2684 | |
2685 for (;;) | |
2686 { | |
446 | 2687 /* #### This code is unused. |
2688 Correctness is not checked after TRT | |
2689 table change. */ | |
428 | 2690 PATFETCH (c); |
2691 if (c == ':' || c == ']' || p == pend | |
2692 || c1 == CHAR_CLASS_MAX_LENGTH) | |
2693 break; | |
442 | 2694 str[c1++] = (char) c; |
428 | 2695 } |
2696 str[c1] = '\0'; | |
2697 | |
446 | 2698 /* If isn't a word bracketed by `[:' and `:]': |
428 | 2699 undo the ending character, the letters, and leave |
2700 the leading `:' and `[' (but set bits for them). */ | |
2701 if (c == ':' && *p == ']') | |
2702 { | |
2703 int ch; | |
460 | 2704 re_bool is_alnum = STREQ (str, "alnum"); |
2705 re_bool is_alpha = STREQ (str, "alpha"); | |
2706 re_bool is_blank = STREQ (str, "blank"); | |
2707 re_bool is_cntrl = STREQ (str, "cntrl"); | |
2708 re_bool is_digit = STREQ (str, "digit"); | |
2709 re_bool is_graph = STREQ (str, "graph"); | |
2710 re_bool is_lower = STREQ (str, "lower"); | |
2711 re_bool is_print = STREQ (str, "print"); | |
2712 re_bool is_punct = STREQ (str, "punct"); | |
2713 re_bool is_space = STREQ (str, "space"); | |
2714 re_bool is_upper = STREQ (str, "upper"); | |
2715 re_bool is_xdigit = STREQ (str, "xdigit"); | |
428 | 2716 |
2717 if (!IS_CHAR_CLASS (str)) | |
2718 FREE_STACK_RETURN (REG_ECTYPE); | |
2719 | |
2720 /* Throw away the ] at the end of the character | |
2721 class. */ | |
2722 PATFETCH (c); | |
2723 | |
2724 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | |
2725 | |
2726 for (ch = 0; ch < 1 << BYTEWIDTH; ch++) | |
2727 { | |
2728 /* This was split into 3 if's to | |
2729 avoid an arbitrary limit in some compiler. */ | |
2730 if ( (is_alnum && ISALNUM (ch)) | |
2731 || (is_alpha && ISALPHA (ch)) | |
2732 || (is_blank && ISBLANK (ch)) | |
2733 || (is_cntrl && ISCNTRL (ch))) | |
2734 SET_EITHER_BIT (ch); | |
2735 if ( (is_digit && ISDIGIT (ch)) | |
2736 || (is_graph && ISGRAPH (ch)) | |
2737 || (is_lower && ISLOWER (ch)) | |
2738 || (is_print && ISPRINT (ch))) | |
2739 SET_EITHER_BIT (ch); | |
2740 if ( (is_punct && ISPUNCT (ch)) | |
2741 || (is_space && ISSPACE (ch)) | |
2742 || (is_upper && ISUPPER (ch)) | |
2743 || (is_xdigit && ISXDIGIT (ch))) | |
2744 SET_EITHER_BIT (ch); | |
2745 } | |
2746 had_char_class = true; | |
2747 } | |
2748 else | |
2749 { | |
2750 c1++; | |
2751 while (c1--) | |
2752 PATUNFETCH; | |
2753 SET_EITHER_BIT ('['); | |
2754 SET_EITHER_BIT (':'); | |
2755 had_char_class = false; | |
2756 } | |
2757 } | |
2758 else | |
2759 { | |
2760 had_char_class = false; | |
2761 SET_EITHER_BIT (c); | |
2762 } | |
2763 } | |
2764 | |
2765 #ifdef MULE | |
2766 if (has_extended_chars) | |
2767 { | |
2768 /* We have a range table, not a bit vector. */ | |
2769 int bytes_needed = | |
2770 unified_range_table_bytes_needed (rtab); | |
2771 GET_BUFFER_SPACE (bytes_needed); | |
446 | 2772 unified_range_table_copy_data (rtab, buf_end); |
2773 buf_end += unified_range_table_bytes_used (buf_end); | |
428 | 2774 break; |
2775 } | |
2776 #endif /* MULE */ | |
2777 /* Discard any (non)matching list bytes that are all 0 at the | |
2778 end of the map. Decrease the map-length byte too. */ | |
446 | 2779 while ((int) buf_end[-1] > 0 && buf_end[buf_end[-1] - 1] == 0) |
2780 buf_end[-1]--; | |
2781 buf_end += buf_end[-1]; | |
428 | 2782 } |
2783 break; | |
2784 | |
2785 | |
2786 case '(': | |
2787 if (syntax & RE_NO_BK_PARENS) | |
2788 goto handle_open; | |
2789 else | |
2790 goto normal_char; | |
2791 | |
2792 | |
2793 case ')': | |
2794 if (syntax & RE_NO_BK_PARENS) | |
2795 goto handle_close; | |
2796 else | |
2797 goto normal_char; | |
2798 | |
2799 | |
2800 case '\n': | |
2801 if (syntax & RE_NEWLINE_ALT) | |
2802 goto handle_alt; | |
2803 else | |
2804 goto normal_char; | |
2805 | |
2806 | |
2807 case '|': | |
2808 if (syntax & RE_NO_BK_VBAR) | |
2809 goto handle_alt; | |
2810 else | |
2811 goto normal_char; | |
2812 | |
2813 | |
2814 case '{': | |
2815 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES) | |
2816 goto handle_interval; | |
2817 else | |
2818 goto normal_char; | |
2819 | |
2820 | |
2821 case '\\': | |
2822 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); | |
2823 | |
2824 /* Do not translate the character after the \, so that we can | |
2825 distinguish, e.g., \B from \b, even if we normally would | |
2826 translate, e.g., B to b. */ | |
2827 PATFETCH_RAW (c); | |
2828 | |
2829 switch (c) | |
2830 { | |
2831 case '(': | |
2832 if (syntax & RE_NO_BK_PARENS) | |
2833 goto normal_backslash; | |
2834 | |
2835 handle_open: | |
2836 { | |
2837 regnum_t r; | |
502 | 2838 int shy = 0; |
428 | 2839 |
2840 if (!(syntax & RE_NO_SHY_GROUPS) | |
2841 && p != pend | |
446 | 2842 && *p == '?') |
428 | 2843 { |
2844 p++; | |
446 | 2845 PATFETCH (c); |
428 | 2846 switch (c) |
2847 { | |
2848 case ':': /* shy groups */ | |
502 | 2849 shy = 1; |
428 | 2850 break; |
2851 | |
2852 /* All others are reserved for future constructs. */ | |
2853 default: | |
2854 FREE_STACK_RETURN (REG_BADPAT); | |
2855 } | |
2856 } | |
502 | 2857 |
2858 r = ++regnum; | |
2859 bufp->re_ngroups++; | |
2860 if (!shy) | |
2861 { | |
2862 bufp->re_nsub++; | |
2863 while (bufp->external_to_internal_register_size <= | |
2864 bufp->re_nsub) | |
2865 { | |
2866 int i; | |
2867 int old_size = | |
2868 bufp->external_to_internal_register_size; | |
2869 bufp->external_to_internal_register_size += 5; | |
2870 RETALLOC (bufp->external_to_internal_register, | |
2871 bufp->external_to_internal_register_size, | |
2872 int); | |
2873 /* debugging */ | |
2874 for (i = old_size; | |
2875 i < bufp->external_to_internal_register_size; i++) | |
2876 bufp->external_to_internal_register[i] = | |
2877 (int) 0xDEADBEEF; | |
2878 } | |
2879 | |
2880 bufp->external_to_internal_register[bufp->re_nsub] = | |
2881 bufp->re_ngroups; | |
2882 } | |
428 | 2883 |
2884 if (COMPILE_STACK_FULL) | |
2885 { | |
2886 RETALLOC (compile_stack.stack, compile_stack.size << 1, | |
2887 compile_stack_elt_t); | |
2888 if (compile_stack.stack == NULL) return REG_ESPACE; | |
2889 | |
2890 compile_stack.size <<= 1; | |
2891 } | |
2892 | |
2893 /* These are the values to restore when we hit end of this | |
2894 group. They are all relative offsets, so that if the | |
2895 whole pattern moves because of realloc, they will still | |
2896 be valid. */ | |
2897 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer; | |
2898 COMPILE_STACK_TOP.fixup_alt_jump | |
2899 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; | |
446 | 2900 COMPILE_STACK_TOP.laststart_offset = buf_end - bufp->buffer; |
428 | 2901 COMPILE_STACK_TOP.regnum = r; |
2902 | |
2903 /* We will eventually replace the 0 with the number of | |
2904 groups inner to this one. But do not push a | |
2905 start_memory for groups beyond the last one we can | |
502 | 2906 represent in the compiled pattern. |
2907 #### bad bad bad. this will fail in lots of ways, if we | |
2908 ever have to backtrack for these groups. | |
2909 */ | |
428 | 2910 if (r <= MAX_REGNUM) |
2911 { | |
2912 COMPILE_STACK_TOP.inner_group_offset | |
446 | 2913 = buf_end - bufp->buffer + 2; |
428 | 2914 BUF_PUSH_3 (start_memory, r, 0); |
2915 } | |
2916 | |
2917 compile_stack.avail++; | |
2918 | |
2919 fixup_alt_jump = 0; | |
2920 laststart = 0; | |
446 | 2921 begalt = buf_end; |
428 | 2922 /* If we've reached MAX_REGNUM groups, then this open |
2923 won't actually generate any code, so we'll have to | |
2924 clear pending_exact explicitly. */ | |
2925 pending_exact = 0; | |
2926 } | |
2927 break; | |
2928 | |
2929 | |
2930 case ')': | |
2931 if (syntax & RE_NO_BK_PARENS) goto normal_backslash; | |
2932 | |
2933 if (COMPILE_STACK_EMPTY) { | |
2934 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) | |
2935 goto normal_backslash; | |
2936 else | |
2937 FREE_STACK_RETURN (REG_ERPAREN); | |
2938 } | |
2939 | |
2940 handle_close: | |
2941 if (fixup_alt_jump) | |
2942 { /* Push a dummy failure point at the end of the | |
2943 alternative for a possible future | |
2944 `pop_failure_jump' to pop. See comments at | |
2945 `push_dummy_failure' in `re_match_2'. */ | |
2946 BUF_PUSH (push_dummy_failure); | |
2947 | |
2948 /* We allocated space for this jump when we assigned | |
2949 to `fixup_alt_jump', in the `handle_alt' case below. */ | |
446 | 2950 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end - 1); |
428 | 2951 } |
2952 | |
2953 /* See similar code for backslashed left paren above. */ | |
2954 if (COMPILE_STACK_EMPTY) { | |
2955 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) | |
2956 goto normal_char; | |
2957 else | |
2958 FREE_STACK_RETURN (REG_ERPAREN); | |
2959 } | |
2960 | |
2961 /* Since we just checked for an empty stack above, this | |
2962 ``can't happen''. */ | |
2963 assert (compile_stack.avail != 0); | |
2964 { | |
2965 /* We don't just want to restore into `regnum', because | |
2966 later groups should continue to be numbered higher, | |
2967 as in `(ab)c(de)' -- the second group is #2. */ | |
2968 regnum_t this_group_regnum; | |
2969 | |
2970 compile_stack.avail--; | |
2971 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset; | |
2972 fixup_alt_jump | |
2973 = COMPILE_STACK_TOP.fixup_alt_jump | |
2974 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1 | |
2975 : 0; | |
2976 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset; | |
2977 this_group_regnum = COMPILE_STACK_TOP.regnum; | |
2978 /* If we've reached MAX_REGNUM groups, then this open | |
2979 won't actually generate any code, so we'll have to | |
2980 clear pending_exact explicitly. */ | |
2981 pending_exact = 0; | |
2982 | |
2983 /* We're at the end of the group, so now we know how many | |
2984 groups were inside this one. */ | |
2985 if (this_group_regnum <= MAX_REGNUM) | |
2986 { | |
2987 unsigned char *inner_group_loc | |
2988 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset; | |
2989 | |
2990 *inner_group_loc = regnum - this_group_regnum; | |
2991 BUF_PUSH_3 (stop_memory, this_group_regnum, | |
2992 regnum - this_group_regnum); | |
2993 } | |
2994 } | |
2995 break; | |
2996 | |
2997 | |
2998 case '|': /* `\|'. */ | |
2999 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR) | |
3000 goto normal_backslash; | |
3001 handle_alt: | |
3002 if (syntax & RE_LIMITED_OPS) | |
3003 goto normal_char; | |
3004 | |
3005 /* Insert before the previous alternative a jump which | |
3006 jumps to this alternative if the former fails. */ | |
3007 GET_BUFFER_SPACE (3); | |
446 | 3008 INSERT_JUMP (on_failure_jump, begalt, buf_end + 6); |
428 | 3009 pending_exact = 0; |
446 | 3010 buf_end += 3; |
428 | 3011 |
3012 /* The alternative before this one has a jump after it | |
3013 which gets executed if it gets matched. Adjust that | |
3014 jump so it will jump to this alternative's analogous | |
3015 jump (put in below, which in turn will jump to the next | |
3016 (if any) alternative's such jump, etc.). The last such | |
3017 jump jumps to the correct final destination. A picture: | |
3018 _____ _____ | |
3019 | | | | | |
3020 | v | v | |
3021 a | b | c | |
3022 | |
3023 If we are at `b', then fixup_alt_jump right now points to a | |
3024 three-byte space after `a'. We'll put in the jump, set | |
3025 fixup_alt_jump to right after `b', and leave behind three | |
3026 bytes which we'll fill in when we get to after `c'. */ | |
3027 | |
3028 if (fixup_alt_jump) | |
446 | 3029 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end); |
428 | 3030 |
3031 /* Mark and leave space for a jump after this alternative, | |
3032 to be filled in later either by next alternative or | |
3033 when we know we're at the end of a series of alternatives. */ | |
446 | 3034 fixup_alt_jump = buf_end; |
428 | 3035 GET_BUFFER_SPACE (3); |
446 | 3036 buf_end += 3; |
428 | 3037 |
3038 laststart = 0; | |
446 | 3039 begalt = buf_end; |
428 | 3040 break; |
3041 | |
3042 | |
3043 case '{': | |
3044 /* If \{ is a literal. */ | |
3045 if (!(syntax & RE_INTERVALS) | |
3046 /* If we're at `\{' and it's not the open-interval | |
3047 operator. */ | |
3048 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) | |
3049 || (p - 2 == pattern && p == pend)) | |
3050 goto normal_backslash; | |
3051 | |
3052 handle_interval: | |
3053 { | |
3054 /* If got here, then the syntax allows intervals. */ | |
3055 | |
3056 /* At least (most) this many matches must be made. */ | |
3057 int lower_bound = -1, upper_bound = -1; | |
3058 | |
3059 beg_interval = p - 1; | |
3060 | |
3061 if (p == pend) | |
3062 { | |
3063 if (syntax & RE_NO_BK_BRACES) | |
3064 goto unfetch_interval; | |
3065 else | |
3066 FREE_STACK_RETURN (REG_EBRACE); | |
3067 } | |
3068 | |
3069 GET_UNSIGNED_NUMBER (lower_bound); | |
3070 | |
3071 if (c == ',') | |
3072 { | |
3073 GET_UNSIGNED_NUMBER (upper_bound); | |
3074 if (upper_bound < 0) upper_bound = RE_DUP_MAX; | |
3075 } | |
3076 else | |
3077 /* Interval such as `{1}' => match exactly once. */ | |
3078 upper_bound = lower_bound; | |
3079 | |
3080 if (lower_bound < 0 || upper_bound > RE_DUP_MAX | |
3081 || lower_bound > upper_bound) | |
3082 { | |
3083 if (syntax & RE_NO_BK_BRACES) | |
3084 goto unfetch_interval; | |
3085 else | |
3086 FREE_STACK_RETURN (REG_BADBR); | |
3087 } | |
3088 | |
3089 if (!(syntax & RE_NO_BK_BRACES)) | |
3090 { | |
3091 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE); | |
3092 | |
3093 PATFETCH (c); | |
3094 } | |
3095 | |
3096 if (c != '}') | |
3097 { | |
3098 if (syntax & RE_NO_BK_BRACES) | |
3099 goto unfetch_interval; | |
3100 else | |
3101 FREE_STACK_RETURN (REG_BADBR); | |
3102 } | |
3103 | |
3104 /* We just parsed a valid interval. */ | |
3105 | |
3106 /* If it's invalid to have no preceding re. */ | |
3107 if (!laststart) | |
3108 { | |
3109 if (syntax & RE_CONTEXT_INVALID_OPS) | |
3110 FREE_STACK_RETURN (REG_BADRPT); | |
3111 else if (syntax & RE_CONTEXT_INDEP_OPS) | |
446 | 3112 laststart = buf_end; |
428 | 3113 else |
3114 goto unfetch_interval; | |
3115 } | |
3116 | |
3117 /* If the upper bound is zero, don't want to succeed at | |
3118 all; jump from `laststart' to `buf_end + 3', which will be | |
3119 the end of the buffer after we insert the jump. */ | |
3120 if (upper_bound == 0) | |
3121 { | |
3122 GET_BUFFER_SPACE (3); | |
446 | 3123 INSERT_JUMP (jump, laststart, buf_end + 3); |
3124 buf_end += 3; | |
428 | 3125 } |
3126 | |
3127 /* Otherwise, we have a nontrivial interval. When | |
3128 we're all done, the pattern will look like: | |
3129 set_number_at <jump count> <upper bound> | |
3130 set_number_at <succeed_n count> <lower bound> | |
3131 succeed_n <after jump addr> <succeed_n count> | |
3132 <body of loop> | |
3133 jump_n <succeed_n addr> <jump count> | |
3134 (The upper bound and `jump_n' are omitted if | |
3135 `upper_bound' is 1, though.) */ | |
3136 else | |
3137 { /* If the upper bound is > 1, we need to insert | |
3138 more at the end of the loop. */ | |
647 | 3139 int nbytes = 10 + (upper_bound > 1) * 10; |
428 | 3140 |
3141 GET_BUFFER_SPACE (nbytes); | |
3142 | |
3143 /* Initialize lower bound of the `succeed_n', even | |
3144 though it will be set during matching by its | |
3145 attendant `set_number_at' (inserted next), | |
3146 because `re_compile_fastmap' needs to know. | |
3147 Jump to the `jump_n' we might insert below. */ | |
3148 INSERT_JUMP2 (succeed_n, laststart, | |
446 | 3149 buf_end + 5 + (upper_bound > 1) * 5, |
428 | 3150 lower_bound); |
446 | 3151 buf_end += 5; |
428 | 3152 |
3153 /* Code to initialize the lower bound. Insert | |
3154 before the `succeed_n'. The `5' is the last two | |
3155 bytes of this `set_number_at', plus 3 bytes of | |
3156 the following `succeed_n'. */ | |
446 | 3157 insert_op2 (set_number_at, laststart, 5, lower_bound, buf_end); |
3158 buf_end += 5; | |
428 | 3159 |
3160 if (upper_bound > 1) | |
3161 { /* More than one repetition is allowed, so | |
3162 append a backward jump to the `succeed_n' | |
3163 that starts this interval. | |
3164 | |
3165 When we've reached this during matching, | |
3166 we'll have matched the interval once, so | |
3167 jump back only `upper_bound - 1' times. */ | |
446 | 3168 STORE_JUMP2 (jump_n, buf_end, laststart + 5, |
428 | 3169 upper_bound - 1); |
446 | 3170 buf_end += 5; |
428 | 3171 |
3172 /* The location we want to set is the second | |
3173 parameter of the `jump_n'; that is `buf_end - 2' as | |
3174 an absolute address. `laststart' will be | |
3175 the `set_number_at' we're about to insert; | |
3176 `laststart+3' the number to set, the source | |
3177 for the relative address. But we are | |
3178 inserting into the middle of the pattern -- | |
3179 so everything is getting moved up by 5. | |
3180 Conclusion: (buf_end - 2) - (laststart + 3) + 5, | |
3181 i.e., buf_end - laststart. | |
3182 | |
3183 We insert this at the beginning of the loop | |
3184 so that if we fail during matching, we'll | |
3185 reinitialize the bounds. */ | |
446 | 3186 insert_op2 (set_number_at, laststart, |
3187 buf_end - laststart, | |
3188 upper_bound - 1, buf_end); | |
3189 buf_end += 5; | |
428 | 3190 } |
3191 } | |
3192 pending_exact = 0; | |
3193 beg_interval = NULL; | |
3194 } | |
3195 break; | |
3196 | |
3197 unfetch_interval: | |
3198 /* If an invalid interval, match the characters as literals. */ | |
3199 assert (beg_interval); | |
3200 p = beg_interval; | |
3201 beg_interval = NULL; | |
3202 | |
3203 /* normal_char and normal_backslash need `c'. */ | |
3204 PATFETCH (c); | |
3205 | |
3206 if (!(syntax & RE_NO_BK_BRACES)) | |
3207 { | |
3208 if (p > pattern && p[-1] == '\\') | |
3209 goto normal_backslash; | |
3210 } | |
3211 goto normal_char; | |
3212 | |
3213 #ifdef emacs | |
3214 /* There is no way to specify the before_dot and after_dot | |
3215 operators. rms says this is ok. --karl */ | |
3216 case '=': | |
3217 BUF_PUSH (at_dot); | |
3218 break; | |
3219 | |
3220 case 's': | |
446 | 3221 laststart = buf_end; |
428 | 3222 PATFETCH (c); |
3223 /* XEmacs addition */ | |
3224 if (c >= 0x80 || syntax_spec_code[c] == 0377) | |
3225 FREE_STACK_RETURN (REG_ESYNTAX); | |
3226 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]); | |
3227 break; | |
3228 | |
3229 case 'S': | |
446 | 3230 laststart = buf_end; |
428 | 3231 PATFETCH (c); |
3232 /* XEmacs addition */ | |
3233 if (c >= 0x80 || syntax_spec_code[c] == 0377) | |
3234 FREE_STACK_RETURN (REG_ESYNTAX); | |
3235 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]); | |
3236 break; | |
3237 | |
3238 #ifdef MULE | |
3239 /* 97.2.17 jhod merged in to XEmacs from mule-2.3 */ | |
3240 case 'c': | |
446 | 3241 laststart = buf_end; |
428 | 3242 PATFETCH_RAW (c); |
3243 if (c < 32 || c > 127) | |
3244 FREE_STACK_RETURN (REG_ECATEGORY); | |
3245 BUF_PUSH_2 (categoryspec, c); | |
3246 break; | |
3247 | |
3248 case 'C': | |
446 | 3249 laststart = buf_end; |
428 | 3250 PATFETCH_RAW (c); |
3251 if (c < 32 || c > 127) | |
3252 FREE_STACK_RETURN (REG_ECATEGORY); | |
3253 BUF_PUSH_2 (notcategoryspec, c); | |
3254 break; | |
3255 /* end of category patch */ | |
3256 #endif /* MULE */ | |
3257 #endif /* emacs */ | |
3258 | |
3259 | |
3260 case 'w': | |
446 | 3261 laststart = buf_end; |
428 | 3262 BUF_PUSH (wordchar); |
3263 break; | |
3264 | |
3265 | |
3266 case 'W': | |
446 | 3267 laststart = buf_end; |
428 | 3268 BUF_PUSH (notwordchar); |
3269 break; | |
3270 | |
3271 | |
3272 case '<': | |
3273 BUF_PUSH (wordbeg); | |
3274 break; | |
3275 | |
3276 case '>': | |
3277 BUF_PUSH (wordend); | |
3278 break; | |
3279 | |
3280 case 'b': | |
3281 BUF_PUSH (wordbound); | |
3282 break; | |
3283 | |
3284 case 'B': | |
3285 BUF_PUSH (notwordbound); | |
3286 break; | |
3287 | |
3288 case '`': | |
3289 BUF_PUSH (begbuf); | |
3290 break; | |
3291 | |
3292 case '\'': | |
3293 BUF_PUSH (endbuf); | |
3294 break; | |
3295 | |
3296 case '1': case '2': case '3': case '4': case '5': | |
3297 case '6': case '7': case '8': case '9': | |
446 | 3298 { |
502 | 3299 regnum_t reg, regint; |
3300 int may_need_to_unfetch = 0; | |
446 | 3301 if (syntax & RE_NO_BK_REFS) |
3302 goto normal_char; | |
3303 | |
502 | 3304 /* This only goes up to 99. It could be extended to work |
3305 up to 255 (the maximum number of registers that can be | |
3306 handled by the current regexp engine, because it stores | |
3307 its register numbers in the compiled pattern as one byte, | |
3308 ugh). Doing that's a bit trickier, because you might | |
3309 have the case where \25 is a back-ref but \255 is not, ... */ | |
446 | 3310 reg = c - '0'; |
502 | 3311 if (p < pend) |
3312 { | |
3313 PATFETCH (c); | |
3314 if (c >= '0' && c <= '9') | |
3315 { | |
3316 regnum_t new_reg = reg * 10 + c - '0'; | |
3317 if (new_reg <= bufp->re_nsub) | |
3318 { | |
3319 reg = new_reg; | |
3320 may_need_to_unfetch = 1; | |
3321 } | |
3322 else | |
3323 PATUNFETCH; | |
3324 } | |
523 | 3325 else |
3326 PATUNFETCH; | |
502 | 3327 } |
3328 | |
3329 if (reg > bufp->re_nsub) | |
446 | 3330 FREE_STACK_RETURN (REG_ESUBREG); |
3331 | |
502 | 3332 regint = bufp->external_to_internal_register[reg]; |
446 | 3333 /* Can't back reference to a subexpression if inside of it. */ |
502 | 3334 if (group_in_compile_stack (compile_stack, regint)) |
3335 { | |
3336 if (may_need_to_unfetch) | |
3337 PATUNFETCH; | |
3338 goto normal_char; | |
3339 } | |
3340 | |
3341 #ifdef emacs | |
3342 if (reg > 9 && | |
3343 bufp->warned_about_incompatible_back_references == 0) | |
3344 { | |
3345 bufp->warned_about_incompatible_back_references = 1; | |
3346 warn_when_safe (intern ("regex"), Qinfo, | |
3347 "Back reference \\%d now has new " | |
3348 "semantics in %s", reg, pattern); | |
3349 } | |
3350 #endif | |
446 | 3351 |
3352 laststart = buf_end; | |
502 | 3353 BUF_PUSH_2 (duplicate, regint); |
446 | 3354 } |
428 | 3355 break; |
3356 | |
3357 | |
3358 case '+': | |
3359 case '?': | |
3360 if (syntax & RE_BK_PLUS_QM) | |
3361 goto handle_plus; | |
3362 else | |
3363 goto normal_backslash; | |
3364 | |
3365 default: | |
3366 normal_backslash: | |
3367 /* You might think it would be useful for \ to mean | |
3368 not to translate; but if we don't translate it, | |
3369 it will never match anything. */ | |
826 | 3370 c = RE_TRANSLATE (c); |
428 | 3371 goto normal_char; |
3372 } | |
3373 break; | |
3374 | |
3375 | |
3376 default: | |
3377 /* Expects the character in `c'. */ | |
3378 /* `p' points to the location after where `c' came from. */ | |
3379 normal_char: | |
3380 { | |
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3381 /* The following conditional synced to GNU Emacs 22.1. */ |
428 | 3382 /* If no exactn currently being built. */ |
3383 if (!pending_exact | |
3384 | |
3385 /* If last exactn not at current position. */ | |
446 | 3386 || pending_exact + *pending_exact + 1 != buf_end |
428 | 3387 |
3388 /* We have only one byte following the exactn for the count. */ | |
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3389 || *pending_exact >= (1 << BYTEWIDTH) - MAX_ICHAR_LEN |
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3390 |
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3391 /* If followed by a repetition operator. |
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3392 If the lookahead fails because of end of pattern, any |
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3393 trailing backslash will get caught later. */ |
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3394 || (p != pend && (*p == '*' || *p == '^')) |
428 | 3395 || ((syntax & RE_BK_PLUS_QM) |
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3396 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?') |
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3397 : p != pend && (*p == '+' || *p == '?')) |
428 | 3398 || ((syntax & RE_INTERVALS) |
3399 && ((syntax & RE_NO_BK_BRACES) | |
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3400 ? p != pend && *p == '{' |
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3401 : p + 1 < pend && (p[0] == '\\' && p[1] == '{')))) |
428 | 3402 { |
3403 /* Start building a new exactn. */ | |
3404 | |
446 | 3405 laststart = buf_end; |
428 | 3406 |
3407 BUF_PUSH_2 (exactn, 0); | |
446 | 3408 pending_exact = buf_end - 1; |
428 | 3409 } |
3410 | |
446 | 3411 #ifndef MULE |
428 | 3412 BUF_PUSH (c); |
3413 (*pending_exact)++; | |
446 | 3414 #else |
3415 { | |
3416 Bytecount bt_count; | |
867 | 3417 Ibyte tmp_buf[MAX_ICHAR_LEN]; |
446 | 3418 int i; |
3419 | |
867 | 3420 bt_count = set_itext_ichar (tmp_buf, c); |
446 | 3421 |
3422 for (i = 0; i < bt_count; i++) | |
3423 { | |
3424 BUF_PUSH (tmp_buf[i]); | |
3425 (*pending_exact)++; | |
3426 } | |
3427 } | |
3428 #endif | |
428 | 3429 break; |
3430 } | |
3431 } /* switch (c) */ | |
3432 } /* while p != pend */ | |
3433 | |
3434 | |
3435 /* Through the pattern now. */ | |
3436 | |
3437 if (fixup_alt_jump) | |
446 | 3438 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end); |
428 | 3439 |
3440 if (!COMPILE_STACK_EMPTY) | |
3441 FREE_STACK_RETURN (REG_EPAREN); | |
3442 | |
3443 /* If we don't want backtracking, force success | |
3444 the first time we reach the end of the compiled pattern. */ | |
3445 if (syntax & RE_NO_POSIX_BACKTRACKING) | |
3446 BUF_PUSH (succeed); | |
3447 | |
4976
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4832
diff
changeset
|
3448 xfree (compile_stack.stack); |
428 | 3449 |
3450 /* We have succeeded; set the length of the buffer. */ | |
446 | 3451 bufp->used = buf_end - bufp->buffer; |
428 | 3452 |
3453 #ifdef DEBUG | |
5041 | 3454 if (debug_regexps & RE_DEBUG_COMPILATION) |
428 | 3455 { |
3456 DEBUG_PRINT1 ("\nCompiled pattern: \n"); | |
3457 print_compiled_pattern (bufp); | |
3458 } | |
3459 #endif /* DEBUG */ | |
3460 | |
3461 #ifndef MATCH_MAY_ALLOCATE | |
3462 /* Initialize the failure stack to the largest possible stack. This | |
3463 isn't necessary unless we're trying to avoid calling alloca in | |
3464 the search and match routines. */ | |
3465 { | |
502 | 3466 int num_regs = bufp->re_ngroups + 1; |
428 | 3467 |
3468 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size | |
3469 is strictly greater than re_max_failures, the largest possible stack | |
3470 is 2 * re_max_failures failure points. */ | |
3471 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS)) | |
3472 { | |
3473 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS); | |
3474 | |
3475 if (! fail_stack.stack) | |
3476 fail_stack.stack | |
3477 = (fail_stack_elt_t *) xmalloc (fail_stack.size | |
3478 * sizeof (fail_stack_elt_t)); | |
3479 else | |
3480 fail_stack.stack | |
3481 = (fail_stack_elt_t *) xrealloc (fail_stack.stack, | |
3482 (fail_stack.size | |
3483 * sizeof (fail_stack_elt_t))); | |
3484 } | |
3485 | |
3486 regex_grow_registers (num_regs); | |
3487 } | |
3488 #endif /* not MATCH_MAY_ALLOCATE */ | |
3489 | |
3490 return REG_NOERROR; | |
3491 } /* regex_compile */ | |
3492 | |
3493 /* Subroutines for `regex_compile'. */ | |
3494 | |
3495 /* Store OP at LOC followed by two-byte integer parameter ARG. */ | |
3496 | |
3497 static void | |
3498 store_op1 (re_opcode_t op, unsigned char *loc, int arg) | |
3499 { | |
3500 *loc = (unsigned char) op; | |
3501 STORE_NUMBER (loc + 1, arg); | |
3502 } | |
3503 | |
3504 | |
3505 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */ | |
3506 | |
3507 static void | |
3508 store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2) | |
3509 { | |
3510 *loc = (unsigned char) op; | |
3511 STORE_NUMBER (loc + 1, arg1); | |
3512 STORE_NUMBER (loc + 3, arg2); | |
3513 } | |
3514 | |
3515 | |
3516 /* Copy the bytes from LOC to END to open up three bytes of space at LOC | |
3517 for OP followed by two-byte integer parameter ARG. */ | |
3518 | |
3519 static void | |
3520 insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end) | |
3521 { | |
3522 REGISTER unsigned char *pfrom = end; | |
3523 REGISTER unsigned char *pto = end + 3; | |
3524 | |
3525 while (pfrom != loc) | |
3526 *--pto = *--pfrom; | |
3527 | |
3528 store_op1 (op, loc, arg); | |
3529 } | |
3530 | |
3531 | |
3532 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */ | |
3533 | |
3534 static void | |
3535 insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, | |
3536 unsigned char *end) | |
3537 { | |
3538 REGISTER unsigned char *pfrom = end; | |
3539 REGISTER unsigned char *pto = end + 5; | |
3540 | |
3541 while (pfrom != loc) | |
3542 *--pto = *--pfrom; | |
3543 | |
3544 store_op2 (op, loc, arg1, arg2); | |
3545 } | |
3546 | |
3547 | |
3548 /* P points to just after a ^ in PATTERN. Return true if that ^ comes | |
3549 after an alternative or a begin-subexpression. We assume there is at | |
3550 least one character before the ^. */ | |
3551 | |
460 | 3552 static re_bool |
446 | 3553 at_begline_loc_p (re_char *pattern, re_char *p, reg_syntax_t syntax) |
428 | 3554 { |
446 | 3555 re_char *prev = p - 2; |
460 | 3556 re_bool prev_prev_backslash = prev > pattern && prev[-1] == '\\'; |
428 | 3557 |
3558 return | |
3559 /* After a subexpression? */ | |
3560 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) | |
3561 /* After an alternative? */ | |
3562 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash)); | |
3563 } | |
3564 | |
3565 | |
3566 /* The dual of at_begline_loc_p. This one is for $. We assume there is | |
3567 at least one character after the $, i.e., `P < PEND'. */ | |
3568 | |
460 | 3569 static re_bool |
446 | 3570 at_endline_loc_p (re_char *p, re_char *pend, int syntax) |
428 | 3571 { |
446 | 3572 re_char *next = p; |
460 | 3573 re_bool next_backslash = *next == '\\'; |
446 | 3574 re_char *next_next = p + 1 < pend ? p + 1 : 0; |
428 | 3575 |
3576 return | |
3577 /* Before a subexpression? */ | |
3578 (syntax & RE_NO_BK_PARENS ? *next == ')' | |
3579 : next_backslash && next_next && *next_next == ')') | |
3580 /* Before an alternative? */ | |
3581 || (syntax & RE_NO_BK_VBAR ? *next == '|' | |
3582 : next_backslash && next_next && *next_next == '|'); | |
3583 } | |
3584 | |
3585 | |
3586 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and | |
3587 false if it's not. */ | |
3588 | |
460 | 3589 static re_bool |
428 | 3590 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum) |
3591 { | |
3592 int this_element; | |
3593 | |
3594 for (this_element = compile_stack.avail - 1; | |
3595 this_element >= 0; | |
3596 this_element--) | |
3597 if (compile_stack.stack[this_element].regnum == regnum) | |
3598 return true; | |
3599 | |
3600 return false; | |
3601 } | |
3602 | |
3603 | |
3604 /* Read the ending character of a range (in a bracket expression) from the | |
3605 uncompiled pattern *P_PTR (which ends at PEND). We assume the | |
3606 starting character is in `P[-2]'. (`P[-1]' is the character `-'.) | |
3607 Then we set the translation of all bits between the starting and | |
3608 ending characters (inclusive) in the compiled pattern B. | |
3609 | |
3610 Return an error code. | |
3611 | |
3612 We use these short variable names so we can use the same macros as | |
826 | 3613 `regex_compile' itself. |
3614 | |
3615 Under Mule, this is only called when both chars of the range are | |
3616 ASCII. */ | |
428 | 3617 |
3618 static reg_errcode_t | |
446 | 3619 compile_range (re_char **p_ptr, re_char *pend, RE_TRANSLATE_TYPE translate, |
3620 reg_syntax_t syntax, unsigned char *buf_end) | |
428 | 3621 { |
867 | 3622 Ichar this_char; |
428 | 3623 |
446 | 3624 re_char *p = *p_ptr; |
428 | 3625 int range_start, range_end; |
3626 | |
3627 if (p == pend) | |
3628 return REG_ERANGE; | |
3629 | |
3630 /* Even though the pattern is a signed `char *', we need to fetch | |
3631 with unsigned char *'s; if the high bit of the pattern character | |
3632 is set, the range endpoints will be negative if we fetch using a | |
3633 signed char *. | |
3634 | |
3635 We also want to fetch the endpoints without translating them; the | |
3636 appropriate translation is done in the bit-setting loop below. */ | |
442 | 3637 /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */ |
3638 range_start = ((const unsigned char *) p)[-2]; | |
3639 range_end = ((const unsigned char *) p)[0]; | |
428 | 3640 |
3641 /* Have to increment the pointer into the pattern string, so the | |
3642 caller isn't still at the ending character. */ | |
3643 (*p_ptr)++; | |
3644 | |
3645 /* If the start is after the end, the range is empty. */ | |
3646 if (range_start > range_end) | |
3647 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; | |
3648 | |
3649 /* Here we see why `this_char' has to be larger than an `unsigned | |
3650 char' -- the range is inclusive, so if `range_end' == 0xff | |
3651 (assuming 8-bit characters), we would otherwise go into an infinite | |
3652 loop, since all characters <= 0xff. */ | |
3653 for (this_char = range_start; this_char <= range_end; this_char++) | |
3654 { | |
826 | 3655 SET_LIST_BIT (RE_TRANSLATE (this_char)); |
428 | 3656 } |
3657 | |
3658 return REG_NOERROR; | |
3659 } | |
3660 | |
3661 #ifdef MULE | |
3662 | |
3663 static reg_errcode_t | |
446 | 3664 compile_extended_range (re_char **p_ptr, re_char *pend, |
3665 RE_TRANSLATE_TYPE translate, | |
428 | 3666 reg_syntax_t syntax, Lisp_Object rtab) |
3667 { | |
867 | 3668 Ichar this_char, range_start, range_end; |
3669 const Ibyte *p; | |
428 | 3670 |
3671 if (*p_ptr == pend) | |
3672 return REG_ERANGE; | |
3673 | |
867 | 3674 p = (const Ibyte *) *p_ptr; |
3675 range_end = itext_ichar (p); | |
428 | 3676 p--; /* back to '-' */ |
867 | 3677 DEC_IBYTEPTR (p); /* back to start of range */ |
428 | 3678 /* We also want to fetch the endpoints without translating them; the |
3679 appropriate translation is done in the bit-setting loop below. */ | |
867 | 3680 range_start = itext_ichar (p); |
3681 INC_IBYTEPTR (*p_ptr); | |
428 | 3682 |
3683 /* If the start is after the end, the range is empty. */ | |
3684 if (range_start > range_end) | |
3685 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; | |
3686 | |
3687 /* Can't have ranges spanning different charsets, except maybe for | |
3688 ranges entirely within the first 256 chars. */ | |
3689 | |
3690 if ((range_start >= 0x100 || range_end >= 0x100) | |
867 | 3691 && ichar_leading_byte (range_start) != |
3692 ichar_leading_byte (range_end)) | |
428 | 3693 return REG_ERANGESPAN; |
3694 | |
826 | 3695 /* #### This might be way inefficient if the range encompasses 10,000 |
3696 chars or something. To be efficient, you'd have to do something like | |
3697 this: | |
428 | 3698 |
3699 range_table a; | |
3700 range_table b; | |
3701 map over translation table in [range_start, range_end] of | |
3702 (put the mapped range in a; | |
3703 put the translation in b) | |
3704 invert the range in a and truncate to [range_start, range_end] | |
3705 compute the union of a, b | |
3706 union the result into rtab | |
3707 */ | |
826 | 3708 for (this_char = range_start; this_char <= range_end; this_char++) |
428 | 3709 { |
826 | 3710 SET_RANGETAB_BIT (RE_TRANSLATE (this_char)); |
428 | 3711 } |
3712 | |
3713 if (this_char <= range_end) | |
3714 put_range_table (rtab, this_char, range_end, Qt); | |
3715 | |
3716 return REG_NOERROR; | |
3717 } | |
3718 | |
3719 #endif /* MULE */ | |
3720 | |
3721 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in | |
3722 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible | |
3723 characters can start a string that matches the pattern. This fastmap | |
3724 is used by re_search to skip quickly over impossible starting points. | |
3725 | |
3726 The caller must supply the address of a (1 << BYTEWIDTH)-byte data | |
3727 area as BUFP->fastmap. | |
3728 | |
3729 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in | |
3730 the pattern buffer. | |
3731 | |
3732 Returns 0 if we succeed, -2 if an internal error. */ | |
3733 | |
3734 int | |
826 | 3735 re_compile_fastmap (struct re_pattern_buffer *bufp |
3736 RE_LISP_SHORT_CONTEXT_ARGS_DECL) | |
428 | 3737 { |
3738 int j, k; | |
3739 #ifdef MATCH_MAY_ALLOCATE | |
3740 fail_stack_type fail_stack; | |
3741 #endif | |
456 | 3742 DECLARE_DESTINATION; |
428 | 3743 /* We don't push any register information onto the failure stack. */ |
3744 | |
826 | 3745 /* &&#### this should be changed for 8-bit-fixed, for efficiency. see |
3746 comment marked with &&#### in re_search_2. */ | |
3747 | |
428 | 3748 REGISTER char *fastmap = bufp->fastmap; |
3749 unsigned char *pattern = bufp->buffer; | |
647 | 3750 long size = bufp->used; |
428 | 3751 unsigned char *p = pattern; |
3752 REGISTER unsigned char *pend = pattern + size; | |
3753 | |
771 | 3754 #ifdef REGEX_REL_ALLOC |
428 | 3755 /* This holds the pointer to the failure stack, when |
3756 it is allocated relocatably. */ | |
3757 fail_stack_elt_t *failure_stack_ptr; | |
3758 #endif | |
3759 | |
3760 /* Assume that each path through the pattern can be null until | |
3761 proven otherwise. We set this false at the bottom of switch | |
3762 statement, to which we get only if a particular path doesn't | |
3763 match the empty string. */ | |
460 | 3764 re_bool path_can_be_null = true; |
428 | 3765 |
3766 /* We aren't doing a `succeed_n' to begin with. */ | |
460 | 3767 re_bool succeed_n_p = false; |
428 | 3768 |
1333 | 3769 #ifdef ERROR_CHECK_MALLOC |
3770 /* The pattern comes from string data, not buffer data. We don't access | |
3771 any buffer data, so we don't have to worry about malloc() (but the | |
3772 disallowed flag may have been set by a caller). */ | |
3773 int depth = bind_regex_malloc_disallowed (0); | |
3774 #endif | |
3775 | |
428 | 3776 assert (fastmap != NULL && p != NULL); |
3777 | |
3778 INIT_FAIL_STACK (); | |
3779 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */ | |
3780 bufp->fastmap_accurate = 1; /* It will be when we're done. */ | |
3781 bufp->can_be_null = 0; | |
3782 | |
3783 while (1) | |
3784 { | |
3785 if (p == pend || *p == succeed) | |
3786 { | |
3787 /* We have reached the (effective) end of pattern. */ | |
3788 if (!FAIL_STACK_EMPTY ()) | |
3789 { | |
3790 bufp->can_be_null |= path_can_be_null; | |
3791 | |
3792 /* Reset for next path. */ | |
3793 path_can_be_null = true; | |
3794 | |
446 | 3795 p = (unsigned char *) fail_stack.stack[--fail_stack.avail].pointer; |
428 | 3796 |
3797 continue; | |
3798 } | |
3799 else | |
3800 break; | |
3801 } | |
3802 | |
3803 /* We should never be about to go beyond the end of the pattern. */ | |
3804 assert (p < pend); | |
3805 | |
4759
aa5ed11f473b
Remove support for obsolete systems. See xemacs-patches message with ID
Jerry James <james@xemacs.org>
parents:
4750
diff
changeset
|
3806 switch ((re_opcode_t) *p++) |
428 | 3807 { |
3808 | |
3809 /* I guess the idea here is to simply not bother with a fastmap | |
3810 if a backreference is used, since it's too hard to figure out | |
3811 the fastmap for the corresponding group. Setting | |
3812 `can_be_null' stops `re_search_2' from using the fastmap, so | |
3813 that is all we do. */ | |
3814 case duplicate: | |
3815 bufp->can_be_null = 1; | |
3816 goto done; | |
3817 | |
3818 | |
3819 /* Following are the cases which match a character. These end | |
3820 with `break'. */ | |
3821 | |
3822 case exactn: | |
3823 fastmap[p[1]] = 1; | |
3824 break; | |
3825 | |
3826 | |
3827 case charset: | |
3828 /* XEmacs: Under Mule, these bit vectors will | |
3829 only contain values for characters below 0x80. */ | |
3830 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) | |
3831 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) | |
3832 fastmap[j] = 1; | |
3833 break; | |
3834 | |
3835 | |
3836 case charset_not: | |
3837 /* Chars beyond end of map must be allowed. */ | |
3838 #ifdef MULE | |
3839 for (j = *p * BYTEWIDTH; j < 0x80; j++) | |
3840 fastmap[j] = 1; | |
3841 /* And all extended characters must be allowed, too. */ | |
3842 for (j = 0x80; j < 0xA0; j++) | |
3843 fastmap[j] = 1; | |
446 | 3844 #else /* not MULE */ |
428 | 3845 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) |
3846 fastmap[j] = 1; | |
446 | 3847 #endif /* MULE */ |
428 | 3848 |
3849 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) | |
3850 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) | |
3851 fastmap[j] = 1; | |
3852 break; | |
3853 | |
3854 #ifdef MULE | |
3855 case charset_mule: | |
3856 { | |
3857 int nentries; | |
3858 int i; | |
3859 | |
3860 nentries = unified_range_table_nentries (p); | |
3861 for (i = 0; i < nentries; i++) | |
3862 { | |
3863 EMACS_INT first, last; | |
3864 Lisp_Object dummy_val; | |
3865 int jj; | |
867 | 3866 Ibyte strr[MAX_ICHAR_LEN]; |
428 | 3867 |
3868 unified_range_table_get_range (p, i, &first, &last, | |
3869 &dummy_val); | |
3870 for (jj = first; jj <= last && jj < 0x80; jj++) | |
3871 fastmap[jj] = 1; | |
3872 /* Ranges below 0x100 can span charsets, but there | |
3873 are only two (Control-1 and Latin-1), and | |
3874 either first or last has to be in them. */ | |
867 | 3875 set_itext_ichar (strr, first); |
428 | 3876 fastmap[*strr] = 1; |
3877 if (last < 0x100) | |
3878 { | |
867 | 3879 set_itext_ichar (strr, last); |
428 | 3880 fastmap[*strr] = 1; |
3881 } | |
3882 } | |
3883 } | |
3884 break; | |
3885 | |
3886 case charset_mule_not: | |
3887 { | |
3888 int nentries; | |
3889 int i; | |
4832
07fa38c30fdf
fix messed-up fastmap calculation in charset_mule_not
Ben Wing <ben@xemacs.org>
parents:
4759
diff
changeset
|
3890 int smallest_prev = 0; |
428 | 3891 |
3892 nentries = unified_range_table_nentries (p); | |
3893 for (i = 0; i < nentries; i++) | |
3894 { | |
3895 EMACS_INT first, last; | |
3896 Lisp_Object dummy_val; | |
3897 int jj; | |
3898 | |
3899 unified_range_table_get_range (p, i, &first, &last, | |
3900 &dummy_val); | |
3901 for (jj = smallest_prev; jj < first && jj < 0x80; jj++) | |
3902 fastmap[jj] = 1; | |
3903 smallest_prev = last + 1; | |
3904 if (smallest_prev >= 0x80) | |
3905 break; | |
3906 } | |
4832
07fa38c30fdf
fix messed-up fastmap calculation in charset_mule_not
Ben Wing <ben@xemacs.org>
parents:
4759
diff
changeset
|
3907 |
07fa38c30fdf
fix messed-up fastmap calculation in charset_mule_not
Ben Wing <ben@xemacs.org>
parents:
4759
diff
changeset
|
3908 /* Also set lead bytes after the end */ |
07fa38c30fdf
fix messed-up fastmap calculation in charset_mule_not
Ben Wing <ben@xemacs.org>
parents:
4759
diff
changeset
|
3909 for (i = smallest_prev; i < 0x80; i++) |
07fa38c30fdf
fix messed-up fastmap calculation in charset_mule_not
Ben Wing <ben@xemacs.org>
parents:
4759
diff
changeset
|
3910 fastmap[i] = 1; |
07fa38c30fdf
fix messed-up fastmap calculation in charset_mule_not
Ben Wing <ben@xemacs.org>
parents:
4759
diff
changeset
|
3911 |
428 | 3912 /* Calculating which leading bytes are actually allowed |
3913 here is rather difficult, so we just punt and allow | |
3914 all of them. */ | |
3915 for (i = 0x80; i < 0xA0; i++) | |
3916 fastmap[i] = 1; | |
3917 } | |
3918 break; | |
3919 #endif /* MULE */ | |
3920 | |
3921 | |
3922 case anychar: | |
3923 { | |
3924 int fastmap_newline = fastmap['\n']; | |
3925 | |
3926 /* `.' matches anything ... */ | |
3927 #ifdef MULE | |
3928 /* "anything" only includes bytes that can be the | |
3929 first byte of a character. */ | |
3930 for (j = 0; j < 0xA0; j++) | |
3931 fastmap[j] = 1; | |
3932 #else | |
3933 for (j = 0; j < (1 << BYTEWIDTH); j++) | |
3934 fastmap[j] = 1; | |
3935 #endif | |
3936 | |
3937 /* ... except perhaps newline. */ | |
3938 if (!(bufp->syntax & RE_DOT_NEWLINE)) | |
3939 fastmap['\n'] = fastmap_newline; | |
3940 | |
3941 /* Return if we have already set `can_be_null'; if we have, | |
3942 then the fastmap is irrelevant. Something's wrong here. */ | |
3943 else if (bufp->can_be_null) | |
3944 goto done; | |
3945 | |
3946 /* Otherwise, have to check alternative paths. */ | |
3947 break; | |
3948 } | |
3949 | |
826 | 3950 #ifndef emacs |
3951 case wordchar: | |
3952 for (j = 0; j < (1 << BYTEWIDTH); j++) | |
3953 if (SYNTAX (ignored, j) == Sword) | |
3954 fastmap[j] = 1; | |
3955 break; | |
3956 | |
3957 case notwordchar: | |
3958 for (j = 0; j < (1 << BYTEWIDTH); j++) | |
3959 if (SYNTAX (ignored, j) != Sword) | |
3960 fastmap[j] = 1; | |
3961 break; | |
3962 #else /* emacs */ | |
3963 case wordchar: | |
3964 case notwordchar: | |
460 | 3965 case wordbound: |
3966 case notwordbound: | |
3967 case wordbeg: | |
3968 case wordend: | |
3969 case notsyntaxspec: | |
3970 case syntaxspec: | |
3971 /* This match depends on text properties. These end with | |
3972 aborting optimizations. */ | |
3973 bufp->can_be_null = 1; | |
3974 goto done; | |
826 | 3975 #if 0 /* all of the following code is unused now that the `syntax-table' |
3976 property exists -- it's trickier to do this than just look in | |
3977 the buffer. &&#### but we could just use the syntax-cache stuff | |
3978 instead; why don't we? --ben */ | |
3979 case wordchar: | |
3980 k = (int) Sword; | |
3981 goto matchsyntax; | |
3982 | |
3983 case notwordchar: | |
3984 k = (int) Sword; | |
3985 goto matchnotsyntax; | |
3986 | |
428 | 3987 case syntaxspec: |
3988 k = *p++; | |
826 | 3989 matchsyntax: |
428 | 3990 #ifdef MULE |
3991 for (j = 0; j < 0x80; j++) | |
826 | 3992 if (SYNTAX |
3993 (XCHAR_TABLE (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf)), j) == | |
428 | 3994 (enum syntaxcode) k) |
3995 fastmap[j] = 1; | |
3996 for (j = 0x80; j < 0xA0; j++) | |
3997 { | |
826 | 3998 if (leading_byte_prefix_p ((unsigned char) j)) |
428 | 3999 /* too complicated to calculate this right */ |
4000 fastmap[j] = 1; | |
4001 else | |
4002 { | |
4003 int multi_p; | |
4004 Lisp_Object cset; | |
4005 | |
826 | 4006 cset = charset_by_leading_byte (j); |
428 | 4007 if (CHARSETP (cset)) |
4008 { | |
826 | 4009 if (charset_syntax (lispbuf, cset, &multi_p) |
428 | 4010 == Sword || multi_p) |
4011 fastmap[j] = 1; | |
4012 } | |
4013 } | |
4014 } | |
446 | 4015 #else /* not MULE */ |
428 | 4016 for (j = 0; j < (1 << BYTEWIDTH); j++) |
826 | 4017 if (SYNTAX |
4018 (XCHAR_TABLE (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf)), j) == | |
428 | 4019 (enum syntaxcode) k) |
4020 fastmap[j] = 1; | |
446 | 4021 #endif /* MULE */ |
428 | 4022 break; |
4023 | |
4024 | |
4025 case notsyntaxspec: | |
4026 k = *p++; | |
826 | 4027 matchnotsyntax: |
428 | 4028 #ifdef MULE |
4029 for (j = 0; j < 0x80; j++) | |
826 | 4030 if (SYNTAX |
428 | 4031 (XCHAR_TABLE |
826 | 4032 (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf)), j) != |
428 | 4033 (enum syntaxcode) k) |
4034 fastmap[j] = 1; | |
4035 for (j = 0x80; j < 0xA0; j++) | |
4036 { | |
826 | 4037 if (leading_byte_prefix_p ((unsigned char) j)) |
428 | 4038 /* too complicated to calculate this right */ |
4039 fastmap[j] = 1; | |
4040 else | |
4041 { | |
4042 int multi_p; | |
4043 Lisp_Object cset; | |
4044 | |
826 | 4045 cset = charset_by_leading_byte (j); |
428 | 4046 if (CHARSETP (cset)) |
4047 { | |
826 | 4048 if (charset_syntax (lispbuf, cset, &multi_p) |
428 | 4049 != Sword || multi_p) |
4050 fastmap[j] = 1; | |
4051 } | |
4052 } | |
4053 } | |
446 | 4054 #else /* not MULE */ |
428 | 4055 for (j = 0; j < (1 << BYTEWIDTH); j++) |
826 | 4056 if (SYNTAX |
428 | 4057 (XCHAR_TABLE |
826 | 4058 (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf)), j) != |
428 | 4059 (enum syntaxcode) k) |
4060 fastmap[j] = 1; | |
446 | 4061 #endif /* MULE */ |
428 | 4062 break; |
826 | 4063 #endif /* 0 */ |
428 | 4064 |
4065 #ifdef MULE | |
4066 /* 97/2/17 jhod category patch */ | |
4067 case categoryspec: | |
4068 case notcategoryspec: | |
4069 bufp->can_be_null = 1; | |
1333 | 4070 UNBIND_REGEX_MALLOC_CHECK (); |
428 | 4071 return 0; |
4072 /* end if category patch */ | |
4073 #endif /* MULE */ | |
4074 | |
4075 /* All cases after this match the empty string. These end with | |
4076 `continue'. */ | |
4077 case before_dot: | |
4078 case at_dot: | |
4079 case after_dot: | |
4080 continue; | |
826 | 4081 #endif /* emacs */ |
428 | 4082 |
4083 | |
4084 case no_op: | |
4085 case begline: | |
4086 case endline: | |
4087 case begbuf: | |
4088 case endbuf: | |
460 | 4089 #ifndef emacs |
428 | 4090 case wordbound: |
4091 case notwordbound: | |
4092 case wordbeg: | |
4093 case wordend: | |
460 | 4094 #endif |
428 | 4095 case push_dummy_failure: |
4096 continue; | |
4097 | |
4098 | |
4099 case jump_n: | |
4100 case pop_failure_jump: | |
4101 case maybe_pop_jump: | |
4102 case jump: | |
4103 case jump_past_alt: | |
4104 case dummy_failure_jump: | |
4105 EXTRACT_NUMBER_AND_INCR (j, p); | |
4106 p += j; | |
4107 if (j > 0) | |
4108 continue; | |
4109 | |
4110 /* Jump backward implies we just went through the body of a | |
4111 loop and matched nothing. Opcode jumped to should be | |
4112 `on_failure_jump' or `succeed_n'. Just treat it like an | |
4113 ordinary jump. For a * loop, it has pushed its failure | |
4114 point already; if so, discard that as redundant. */ | |
4115 if ((re_opcode_t) *p != on_failure_jump | |
4116 && (re_opcode_t) *p != succeed_n) | |
4117 continue; | |
4118 | |
4119 p++; | |
4120 EXTRACT_NUMBER_AND_INCR (j, p); | |
4121 p += j; | |
4122 | |
4123 /* If what's on the stack is where we are now, pop it. */ | |
4124 if (!FAIL_STACK_EMPTY () | |
4125 && fail_stack.stack[fail_stack.avail - 1].pointer == p) | |
4126 fail_stack.avail--; | |
4127 | |
4128 continue; | |
4129 | |
4130 | |
4131 case on_failure_jump: | |
4132 case on_failure_keep_string_jump: | |
4133 handle_on_failure_jump: | |
4134 EXTRACT_NUMBER_AND_INCR (j, p); | |
4135 | |
4136 /* For some patterns, e.g., `(a?)?', `p+j' here points to the | |
4137 end of the pattern. We don't want to push such a point, | |
4138 since when we restore it above, entering the switch will | |
4139 increment `p' past the end of the pattern. We don't need | |
4140 to push such a point since we obviously won't find any more | |
4141 fastmap entries beyond `pend'. Such a pattern can match | |
4142 the null string, though. */ | |
4143 if (p + j < pend) | |
4144 { | |
4145 if (!PUSH_PATTERN_OP (p + j, fail_stack)) | |
4146 { | |
4147 RESET_FAIL_STACK (); | |
1333 | 4148 UNBIND_REGEX_MALLOC_CHECK (); |
428 | 4149 return -2; |
4150 } | |
4151 } | |
4152 else | |
4153 bufp->can_be_null = 1; | |
4154 | |
4155 if (succeed_n_p) | |
4156 { | |
4157 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */ | |
4158 succeed_n_p = false; | |
4159 } | |
4160 | |
4161 continue; | |
4162 | |
4163 | |
4164 case succeed_n: | |
4165 /* Get to the number of times to succeed. */ | |
4166 p += 2; | |
4167 | |
4168 /* Increment p past the n for when k != 0. */ | |
4169 EXTRACT_NUMBER_AND_INCR (k, p); | |
4170 if (k == 0) | |
4171 { | |
4172 p -= 4; | |
4173 succeed_n_p = true; /* Spaghetti code alert. */ | |
4174 goto handle_on_failure_jump; | |
4175 } | |
4176 continue; | |
4177 | |
4178 | |
4179 case set_number_at: | |
4180 p += 4; | |
4181 continue; | |
4182 | |
4183 | |
4184 case start_memory: | |
4185 case stop_memory: | |
4186 p += 2; | |
4187 continue; | |
4188 | |
4189 | |
4190 default: | |
2500 | 4191 ABORT (); /* We have listed all the cases. */ |
428 | 4192 } /* switch *p++ */ |
4193 | |
4194 /* Getting here means we have found the possible starting | |
4195 characters for one path of the pattern -- and that the empty | |
4196 string does not match. We need not follow this path further. | |
4197 Instead, look at the next alternative (remembered on the | |
4198 stack), or quit if no more. The test at the top of the loop | |
4199 does these things. */ | |
4200 path_can_be_null = false; | |
4201 p = pend; | |
4202 } /* while p */ | |
4203 | |
4204 /* Set `can_be_null' for the last path (also the first path, if the | |
4205 pattern is empty). */ | |
4206 bufp->can_be_null |= path_can_be_null; | |
4207 | |
4208 done: | |
4209 RESET_FAIL_STACK (); | |
1333 | 4210 UNBIND_REGEX_MALLOC_CHECK (); |
428 | 4211 return 0; |
4212 } /* re_compile_fastmap */ | |
4213 | |
4214 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and | |
4215 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use | |
4216 this memory for recording register information. STARTS and ENDS | |
4217 must be allocated using the malloc library routine, and must each | |
4218 be at least NUM_REGS * sizeof (regoff_t) bytes long. | |
4219 | |
4220 If NUM_REGS == 0, then subsequent matches should allocate their own | |
4221 register data. | |
4222 | |
4223 Unless this function is called, the first search or match using | |
4224 PATTERN_BUFFER will allocate its own register data, without | |
4225 freeing the old data. */ | |
4226 | |
4227 void | |
4228 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs, | |
647 | 4229 int num_regs, regoff_t *starts, regoff_t *ends) |
428 | 4230 { |
4231 if (num_regs) | |
4232 { | |
4233 bufp->regs_allocated = REGS_REALLOCATE; | |
4234 regs->num_regs = num_regs; | |
4235 regs->start = starts; | |
4236 regs->end = ends; | |
4237 } | |
4238 else | |
4239 { | |
4240 bufp->regs_allocated = REGS_UNALLOCATED; | |
4241 regs->num_regs = 0; | |
4242 regs->start = regs->end = (regoff_t *) 0; | |
4243 } | |
4244 } | |
4245 | |
4246 /* Searching routines. */ | |
4247 | |
4248 /* Like re_search_2, below, but only one string is specified, and | |
4249 doesn't let you say where to stop matching. */ | |
4250 | |
4251 int | |
442 | 4252 re_search (struct re_pattern_buffer *bufp, const char *string, int size, |
826 | 4253 int startpos, int range, struct re_registers *regs |
4254 RE_LISP_CONTEXT_ARGS_DECL) | |
428 | 4255 { |
4256 return re_search_2 (bufp, NULL, 0, string, size, startpos, range, | |
826 | 4257 regs, size RE_LISP_CONTEXT_ARGS); |
428 | 4258 } |
4259 | |
4260 /* Using the compiled pattern in BUFP->buffer, first tries to match the | |
4261 virtual concatenation of STRING1 and STRING2, starting first at index | |
4262 STARTPOS, then at STARTPOS + 1, and so on. | |
4263 | |
4264 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively. | |
4265 | |
4266 RANGE is how far to scan while trying to match. RANGE = 0 means try | |
4267 only at STARTPOS; in general, the last start tried is STARTPOS + | |
4268 RANGE. | |
4269 | |
826 | 4270 All sizes and positions refer to bytes (not chars); under Mule, the code |
4271 knows about the format of the text and will only check at positions | |
4272 where a character starts. | |
4273 | |
428 | 4274 With MULE, RANGE is a byte position, not a char position. The last |
4275 start tried is the character starting <= STARTPOS + RANGE. | |
4276 | |
4277 In REGS, return the indices of the virtual concatenation of STRING1 | |
4278 and STRING2 that matched the entire BUFP->buffer and its contained | |
4279 subexpressions. | |
4280 | |
4281 Do not consider matching one past the index STOP in the virtual | |
4282 concatenation of STRING1 and STRING2. | |
4283 | |
4284 We return either the position in the strings at which the match was | |
4285 found, -1 if no match, or -2 if error (such as failure | |
4286 stack overflow). */ | |
4287 | |
4288 int | |
446 | 4289 re_search_2 (struct re_pattern_buffer *bufp, const char *str1, |
4290 int size1, const char *str2, int size2, int startpos, | |
826 | 4291 int range, struct re_registers *regs, int stop |
4292 RE_LISP_CONTEXT_ARGS_DECL) | |
428 | 4293 { |
4294 int val; | |
446 | 4295 re_char *string1 = (re_char *) str1; |
4296 re_char *string2 = (re_char *) str2; | |
428 | 4297 REGISTER char *fastmap = bufp->fastmap; |
446 | 4298 REGISTER RE_TRANSLATE_TYPE translate = bufp->translate; |
428 | 4299 int total_size = size1 + size2; |
4300 int endpos = startpos + range; | |
4301 #ifdef REGEX_BEGLINE_CHECK | |
4302 int anchored_at_begline = 0; | |
4303 #endif | |
446 | 4304 re_char *d; |
826 | 4305 #ifdef emacs |
4306 Internal_Format fmt = buffer_or_other_internal_format (lispobj); | |
1346 | 4307 #ifdef REL_ALLOC |
4308 Ibyte *orig_buftext = | |
4309 BUFFERP (lispobj) ? | |
4310 BYTE_BUF_BYTE_ADDRESS (XBUFFER (lispobj), | |
4311 BYTE_BUF_BEGV (XBUFFER (lispobj))) : | |
4312 0; | |
4313 #endif | |
1333 | 4314 #ifdef ERROR_CHECK_MALLOC |
4315 int depth; | |
4316 #endif | |
826 | 4317 #endif /* emacs */ |
4318 #if 1 | |
4319 int forward_search_p; | |
4320 #endif | |
428 | 4321 |
4322 /* Check for out-of-range STARTPOS. */ | |
4323 if (startpos < 0 || startpos > total_size) | |
4324 return -1; | |
4325 | |
4326 /* Fix up RANGE if it might eventually take us outside | |
4327 the virtual concatenation of STRING1 and STRING2. */ | |
4328 if (endpos < 0) | |
4329 range = 0 - startpos; | |
4330 else if (endpos > total_size) | |
4331 range = total_size - startpos; | |
4332 | |
826 | 4333 #if 1 |
4334 forward_search_p = range > 0; | |
4335 #endif | |
4336 | |
428 | 4337 /* If the search isn't to be a backwards one, don't waste time in a |
4338 search for a pattern that must be anchored. */ | |
4339 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0) | |
4340 { | |
4341 if (startpos > 0) | |
4342 return -1; | |
4343 else | |
4344 { | |
442 | 4345 d = ((const unsigned char *) |
428 | 4346 (startpos >= size1 ? string2 - size1 : string1) + startpos); |
867 | 4347 range = itext_ichar_len_fmt (d, fmt); |
428 | 4348 } |
4349 } | |
4350 | |
460 | 4351 #ifdef emacs |
4352 /* In a forward search for something that starts with \=. | |
4353 don't keep searching past point. */ | |
4354 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0) | |
4355 { | |
826 | 4356 if (!BUFFERP (lispobj)) |
4357 return -1; | |
4527
8418d1ad4944
Fix at_dot regex under Mule. <87hc6rv53v.fsf@uwakimon.sk.tsukuba.ac.jp>
Stephen J. Turnbull <stephen@xemacs.org>
parents:
3300
diff
changeset
|
4358 range = (BYTE_BUF_PT (XBUFFER (lispobj)) |
8418d1ad4944
Fix at_dot regex under Mule. <87hc6rv53v.fsf@uwakimon.sk.tsukuba.ac.jp>
Stephen J. Turnbull <stephen@xemacs.org>
parents:
3300
diff
changeset
|
4359 - BYTE_BUF_BEGV (XBUFFER (lispobj)) - startpos); |
460 | 4360 if (range < 0) |
4361 return -1; | |
4362 } | |
4363 #endif /* emacs */ | |
4364 | |
1333 | 4365 #ifdef ERROR_CHECK_MALLOC |
4366 /* Do this after the above return()s. */ | |
4367 depth = bind_regex_malloc_disallowed (1); | |
4368 #endif | |
4369 | |
428 | 4370 /* Update the fastmap now if not correct already. */ |
1333 | 4371 BEGIN_REGEX_MALLOC_OK (); |
428 | 4372 if (fastmap && !bufp->fastmap_accurate) |
826 | 4373 if (re_compile_fastmap (bufp RE_LISP_SHORT_CONTEXT_ARGS) == -2) |
1333 | 4374 { |
4375 END_REGEX_MALLOC_OK (); | |
4376 UNBIND_REGEX_MALLOC_CHECK (); | |
4377 return -2; | |
4378 } | |
4379 | |
4380 END_REGEX_MALLOC_OK (); | |
4381 RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
428 | 4382 |
4383 #ifdef REGEX_BEGLINE_CHECK | |
4384 { | |
647 | 4385 long i = 0; |
428 | 4386 |
4387 while (i < bufp->used) | |
4388 { | |
4389 if (bufp->buffer[i] == start_memory || | |
4390 bufp->buffer[i] == stop_memory) | |
4391 i += 2; | |
4392 else | |
4393 break; | |
4394 } | |
4395 anchored_at_begline = i < bufp->used && bufp->buffer[i] == begline; | |
4396 } | |
4397 #endif | |
4398 | |
460 | 4399 #ifdef emacs |
1333 | 4400 BEGIN_REGEX_MALLOC_OK (); |
826 | 4401 scache = setup_syntax_cache (scache, lispobj, lispbuf, |
4402 offset_to_charxpos (lispobj, startpos), | |
4403 1); | |
1333 | 4404 END_REGEX_MALLOC_OK (); |
4405 RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
460 | 4406 #endif |
4407 | |
428 | 4408 /* Loop through the string, looking for a place to start matching. */ |
4409 for (;;) | |
4410 { | |
4411 #ifdef REGEX_BEGLINE_CHECK | |
826 | 4412 /* If the regex is anchored at the beginning of a line (i.e. with a |
4413 ^), then we can speed things up by skipping to the next | |
4414 beginning-of-line. However, to determine "beginning of line" we | |
4415 need to look at the previous char, so can't do this check if at | |
4416 beginning of either string. (Well, we could if at the beginning of | |
4417 the second string, but it would require additional code, and this | |
4418 is just an optimization.) */ | |
4419 if (anchored_at_begline && startpos > 0 && startpos != size1) | |
428 | 4420 { |
826 | 4421 if (range > 0) |
4422 { | |
4423 /* whose stupid idea was it anyway to make this | |
4424 function take two strings to match?? */ | |
4425 int lim = 0; | |
4426 re_char *orig_d; | |
4427 re_char *stop_d; | |
4428 | |
4429 /* Compute limit as below in fastmap code, so we are guaranteed | |
4430 to remain within a single string. */ | |
4431 if (startpos < size1 && startpos + range >= size1) | |
4432 lim = range - (size1 - startpos); | |
4433 | |
4434 d = ((const unsigned char *) | |
4435 (startpos >= size1 ? string2 - size1 : string1) + startpos); | |
4436 orig_d = d; | |
4437 stop_d = d + range - lim; | |
4438 | |
4439 /* We want to find the next location (including the current | |
4440 one) where the previous char is a newline, so back up one | |
4441 and search forward for a newline. */ | |
867 | 4442 DEC_IBYTEPTR_FMT (d, fmt); /* Ok, since startpos != size1. */ |
826 | 4443 |
4444 /* Written out as an if-else to avoid testing `translate' | |
4445 inside the loop. */ | |
4446 if (TRANSLATE_P (translate)) | |
4447 while (d < stop_d && | |
867 | 4448 RE_TRANSLATE_1 (itext_ichar_fmt (d, fmt, lispobj)) |
826 | 4449 != '\n') |
867 | 4450 INC_IBYTEPTR_FMT (d, fmt); |
826 | 4451 else |
4452 while (d < stop_d && | |
867 | 4453 itext_ichar_ascii_fmt (d, fmt, lispobj) != '\n') |
4454 INC_IBYTEPTR_FMT (d, fmt); | |
826 | 4455 |
4456 /* If we were stopped by a newline, skip forward over it. | |
4457 Otherwise we will get in an infloop when our start position | |
4458 was at begline. */ | |
4459 if (d < stop_d) | |
867 | 4460 INC_IBYTEPTR_FMT (d, fmt); |
826 | 4461 range -= d - orig_d; |
4462 startpos += d - orig_d; | |
4463 #if 1 | |
4464 assert (!forward_search_p || range >= 0); | |
4465 #endif | |
4466 } | |
4467 else if (range < 0) | |
4468 { | |
4469 /* We're lazy, like in the fastmap code below */ | |
867 | 4470 Ichar c; |
826 | 4471 |
4472 d = ((const unsigned char *) | |
4473 (startpos >= size1 ? string2 - size1 : string1) + startpos); | |
867 | 4474 DEC_IBYTEPTR_FMT (d, fmt); |
4475 c = itext_ichar_fmt (d, fmt, lispobj); | |
826 | 4476 c = RE_TRANSLATE (c); |
4477 if (c != '\n') | |
4478 goto advance; | |
4479 } | |
428 | 4480 } |
4481 #endif /* REGEX_BEGLINE_CHECK */ | |
4482 | |
4483 /* If a fastmap is supplied, skip quickly over characters that | |
4484 cannot be the start of a match. If the pattern can match the | |
4485 null string, however, we don't need to skip characters; we want | |
4486 the first null string. */ | |
4487 if (fastmap && startpos < total_size && !bufp->can_be_null) | |
4488 { | |
826 | 4489 /* For the moment, fastmap always works as if buffer |
4490 is in default format, so convert chars in the search strings | |
4491 into default format as we go along, if necessary. | |
4492 | |
4493 &&#### fastmap needs rethinking for 8-bit-fixed so | |
4494 it's faster. We need it to reflect the raw | |
4495 8-bit-fixed values. That isn't so hard if we assume | |
4496 that the top 96 bytes represent a single 1-byte | |
4497 charset. For 16-bit/32-bit stuff it's probably not | |
4498 worth it to make the fastmap represent the raw, due to | |
4499 its nature -- we'd have to use the LSB for the | |
4500 fastmap, and that causes lots of problems with Mule | |
4501 chars, where it essentially wipes out the usefulness | |
4502 of the fastmap entirely. */ | |
428 | 4503 if (range > 0) /* Searching forwards. */ |
4504 { | |
4505 int lim = 0; | |
4506 int irange = range; | |
4507 | |
4508 if (startpos < size1 && startpos + range >= size1) | |
4509 lim = range - (size1 - startpos); | |
4510 | |
442 | 4511 d = ((const unsigned char *) |
428 | 4512 (startpos >= size1 ? string2 - size1 : string1) + startpos); |
4513 | |
4514 /* Written out as an if-else to avoid testing `translate' | |
4515 inside the loop. */ | |
446 | 4516 if (TRANSLATE_P (translate)) |
826 | 4517 { |
4518 while (range > lim) | |
4519 { | |
4520 re_char *old_d = d; | |
428 | 4521 #ifdef MULE |
867 | 4522 Ibyte tempch[MAX_ICHAR_LEN]; |
4523 Ichar buf_ch = | |
4524 RE_TRANSLATE_1 (itext_ichar_fmt (d, fmt, lispobj)); | |
4525 set_itext_ichar (tempch, buf_ch); | |
826 | 4526 if (fastmap[*tempch]) |
4527 break; | |
446 | 4528 #else |
826 | 4529 if (fastmap[(unsigned char) RE_TRANSLATE_1 (*d)]) |
4530 break; | |
446 | 4531 #endif /* MULE */ |
867 | 4532 INC_IBYTEPTR_FMT (d, fmt); |
826 | 4533 range -= (d - old_d); |
4534 #if 1 | |
1333 | 4535 assert (!forward_search_p || range >= 0); |
826 | 4536 #endif |
4537 } | |
4538 } | |
4539 #ifdef MULE | |
4540 else if (fmt != FORMAT_DEFAULT) | |
4541 { | |
4542 while (range > lim) | |
4543 { | |
4544 re_char *old_d = d; | |
867 | 4545 Ibyte tempch[MAX_ICHAR_LEN]; |
4546 Ichar buf_ch = itext_ichar_fmt (d, fmt, lispobj); | |
4547 set_itext_ichar (tempch, buf_ch); | |
826 | 4548 if (fastmap[*tempch]) |
4549 break; | |
867 | 4550 INC_IBYTEPTR_FMT (d, fmt); |
826 | 4551 range -= (d - old_d); |
4552 #if 1 | |
1333 | 4553 assert (!forward_search_p || range >= 0); |
826 | 4554 #endif |
4555 } | |
4556 } | |
4557 #endif /* MULE */ | |
428 | 4558 else |
826 | 4559 { |
4560 while (range > lim && !fastmap[*d]) | |
4561 { | |
4562 re_char *old_d = d; | |
867 | 4563 INC_IBYTEPTR (d); |
826 | 4564 range -= (d - old_d); |
4565 #if 1 | |
4566 assert (!forward_search_p || range >= 0); | |
4567 #endif | |
4568 } | |
4569 } | |
428 | 4570 |
4571 startpos += irange - range; | |
4572 } | |
4573 else /* Searching backwards. */ | |
4574 { | |
826 | 4575 /* #### It's not clear why we don't just write a loop, like |
4576 for the moving-forward case. Perhaps the writer got lazy, | |
4577 since backward searches aren't so common. */ | |
4578 d = ((const unsigned char *) | |
4579 (startpos >= size1 ? string2 - size1 : string1) + startpos); | |
428 | 4580 #ifdef MULE |
826 | 4581 { |
867 | 4582 Ibyte tempch[MAX_ICHAR_LEN]; |
4583 Ichar buf_ch = | |
4584 RE_TRANSLATE (itext_ichar_fmt (d, fmt, lispobj)); | |
4585 set_itext_ichar (tempch, buf_ch); | |
826 | 4586 if (!fastmap[*tempch]) |
4587 goto advance; | |
4588 } | |
428 | 4589 #else |
826 | 4590 if (!fastmap[(unsigned char) RE_TRANSLATE (*d)]) |
446 | 4591 goto advance; |
826 | 4592 #endif /* MULE */ |
428 | 4593 } |
4594 } | |
4595 | |
4596 /* If can't match the null string, and that's all we have left, fail. */ | |
4597 if (range >= 0 && startpos == total_size && fastmap | |
4598 && !bufp->can_be_null) | |
1333 | 4599 { |
4600 UNBIND_REGEX_MALLOC_CHECK (); | |
4601 return -1; | |
4602 } | |
428 | 4603 |
4604 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */ | |
4605 if (!no_quit_in_re_search) | |
1333 | 4606 { |
4607 BEGIN_REGEX_MALLOC_OK (); | |
4608 QUIT; | |
4609 END_REGEX_MALLOC_OK (); | |
4610 RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
4611 } | |
4612 | |
428 | 4613 #endif |
1333 | 4614 BEGIN_REGEX_MALLOC_OK (); |
428 | 4615 val = re_match_2_internal (bufp, string1, size1, string2, size2, |
826 | 4616 startpos, regs, stop |
4617 RE_LISP_CONTEXT_ARGS); | |
428 | 4618 #ifndef REGEX_MALLOC |
1333 | 4619 ALLOCA_GARBAGE_COLLECT (); |
428 | 4620 #endif |
1333 | 4621 END_REGEX_MALLOC_OK (); |
4622 RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
428 | 4623 |
4624 if (val >= 0) | |
1333 | 4625 { |
4626 UNBIND_REGEX_MALLOC_CHECK (); | |
4627 return startpos; | |
4628 } | |
428 | 4629 |
4630 if (val == -2) | |
1333 | 4631 { |
4632 UNBIND_REGEX_MALLOC_CHECK (); | |
4633 return -2; | |
4634 } | |
4635 | |
4636 RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
428 | 4637 advance: |
4638 if (!range) | |
4639 break; | |
4640 else if (range > 0) | |
4641 { | |
826 | 4642 Bytecount d_size; |
442 | 4643 d = ((const unsigned char *) |
428 | 4644 (startpos >= size1 ? string2 - size1 : string1) + startpos); |
867 | 4645 d_size = itext_ichar_len_fmt (d, fmt); |
428 | 4646 range -= d_size; |
826 | 4647 #if 1 |
4648 assert (!forward_search_p || range >= 0); | |
4649 #endif | |
428 | 4650 startpos += d_size; |
4651 } | |
4652 else | |
4653 { | |
826 | 4654 Bytecount d_size; |
428 | 4655 /* Note startpos > size1 not >=. If we are on the |
4656 string1/string2 boundary, we want to backup into string1. */ | |
442 | 4657 d = ((const unsigned char *) |
428 | 4658 (startpos > size1 ? string2 - size1 : string1) + startpos); |
867 | 4659 DEC_IBYTEPTR_FMT (d, fmt); |
4660 d_size = itext_ichar_len_fmt (d, fmt); | |
428 | 4661 range += d_size; |
826 | 4662 #if 1 |
4663 assert (!forward_search_p || range >= 0); | |
4664 #endif | |
428 | 4665 startpos -= d_size; |
4666 } | |
4667 } | |
1333 | 4668 UNBIND_REGEX_MALLOC_CHECK (); |
428 | 4669 return -1; |
4670 } /* re_search_2 */ | |
826 | 4671 |
428 | 4672 |
4673 /* Declarations and macros for re_match_2. */ | |
4674 | |
/* Turn PTR, a pointer into one of the search strings `string1' and
   `string2', into a regoff_t offset.  When FIRST_STRING_P (PTR) holds,
   the offset is measured from `string1'; otherwise it is measured from
   `string2' and then shifted up by `size1'.  PTR is evaluated more than
   once.  */
#define POINTER_TO_OFFSET(ptr)						\
  ((regoff_t) (FIRST_STRING_P (ptr)					\
	       ? (ptr) - string1					\
	       : (ptr) - string2 + size1))
4681 | |
/* Macros for dealing with the split strings in re_match_2.  */

/* Nonzero while the current scan limit `dend' is `end_match_1'.  */
#define MATCHING_IN_FIRST_STRING  (dend == end_match_1)

/* Call before fetching a character with *d.  This switches over to
   string2 if necessary, and jumps to the `fail' label when `d' has
   reached `end_match_2'.

   The body is wrapped in do/while (0) so that `REGEX_PREFETCH ();' is
   exactly one statement and can safely be used as the unbraced body of
   an `if' that has an `else'.  */
#define REGEX_PREFETCH()						\
  do {									\
    while (d == dend)							\
      {									\
	/* End of string2 => fail.  */					\
	if (dend == end_match_2)					\
	  goto fail;							\
	/* End of string1 => advance to string2.  */			\
	d = string2;							\
	dend = end_match_2;						\
      }									\
  } while (0)
4698 | |
4699 | |
/* Predicates for the two ends of the virtual concatenation of `string1'
   and `string2'.  The beginning is `string1' when `size1' is nonzero and
   `string2' otherwise (with only one string, it is `string2');
   AT_STRINGS_BEG is also true whenever `size2' is zero.  The end is
   `end2'.  */
#define AT_STRINGS_BEG(d) \
  ((d) == (size1 != 0 ? string1 : string2) || size2 == 0)
#define AT_STRINGS_END(d) (end2 == (d))
4704 | |
/* XEmacs change:
   Normalize a position that sits on the gap between the two strings.
   POS_BEFORE_GAP_UNSAFE maps `string2' to `end1'; POS_AFTER_GAP_UNSAFE
   maps `end1' to `string2'.  Any other position is returned unchanged.
   D is evaluated more than once.  */
#define POS_BEFORE_GAP_UNSAFE(d) ((d) != string2 ? (d) : end1)
#define POS_AFTER_GAP_UNSAFE(d) ((d) != end1 ? (d) : string2)
4711 | |
/* Nonzero if CH is a word-constituent character, i.e. its syntax in the
   mirror syntax table of `lispbuf' is Sword.  (XEmacs change) */
#define WORDCHAR_P(ch)							\
  (Sword == SYNTAX (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf), ch))
428 | 4715 |
/* Free everything we malloc.  */
#ifdef MATCH_MAY_ALLOCATE
/* Release VAR with REGEX_FREE if it is non-null, then reset it to NULL.
   Wrapped in do/while (0) so the test-and-free and the reset always act
   as one statement, even as the unbraced body of an `if' or `else'.
   VAR is evaluated more than once.  */
#define FREE_VAR(var,type)						\
  do {									\
    if (var)								\
      REGEX_FREE (var, type);						\
    var = NULL;								\
  } while (0)
/* Undo the regex-malloc check binding, free the failure stack, and
   release every register vector used by the matcher.  */
#define FREE_VARIABLES()						\
  do {									\
    UNBIND_REGEX_MALLOC_CHECK ();					\
    REGEX_FREE_STACK (fail_stack.stack);				\
    FREE_VAR (regstart, re_char **);					\
    FREE_VAR (regend, re_char **);					\
    FREE_VAR (old_regstart, re_char **);				\
    FREE_VAR (old_regend, re_char **);					\
    FREE_VAR (best_regstart, re_char **);				\
    FREE_VAR (best_regend, re_char **);					\
    FREE_VAR (reg_info, register_info_type *);				\
    FREE_VAR (reg_dummy, re_char **);					\
    FREE_VAR (reg_info_dummy, register_info_type *);			\
  } while (0)
#else /* not MATCH_MAY_ALLOCATE */
/* Nothing is allocated per match here; only the regex-malloc check
   binding has to be undone.  */
#define FREE_VARIABLES()						\
  do {									\
    UNBIND_REGEX_MALLOC_CHECK ();					\
  } while (0)
#endif /* MATCH_MAY_ALLOCATE */
428 | 4739 |
/* Sentinels meaning "no register is active".  Constraints on the values:
   - They must not be valid register numbers.  Register numbers occupy a
     single byte in the pattern, so at most 255 registers exist and any
     number above 255 is safe.
   - They must differ by exactly 1, because of NUM_FAILURE_ITEMS above.
   - The "lowest" sentinel must exceed the "highest" one, so that we never
     try to save any registers when none are active.  */
#define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
#define NO_LOWEST_ACTIVE_REG (1 + NO_HIGHEST_ACTIVE_REG)
4749 | |
4750 /* Matching routines. */ | |
4751 | |
826 | 4752 #ifndef emacs /* XEmacs never uses this. */ |
428 | 4753 /* re_match is like re_match_2 except it takes only a single string. */ |
4754 | |
4755 int | |
442 | 4756 re_match (struct re_pattern_buffer *bufp, const char *string, int size, |
826 | 4757 int pos, struct re_registers *regs |
4758 RE_LISP_CONTEXT_ARGS_DECL) | |
428 | 4759 { |
446 | 4760 int result = re_match_2_internal (bufp, NULL, 0, (re_char *) string, size, |
826 | 4761 pos, regs, size |
4762 RE_LISP_CONTEXT_ARGS); | |
1333 | 4763 ALLOCA_GARBAGE_COLLECT (); |
428 | 4764 return result; |
4765 } | |
4766 #endif /* not emacs */ | |
4767 | |
4768 /* re_match_2 matches the compiled pattern in BUFP against the | |
4769 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 and | |
4770 SIZE2, respectively). We start matching at POS, and stop matching | |
4771 at STOP. | |
4772 | |
4773 If REGS is non-null and the `no_sub' field of BUFP is zero, we | |
4774 store offsets for the substring each group matched in REGS. See the | |
4775 documentation for exactly how many groups we fill. | |
4776 | |
4777 We return -1 if no match, -2 if an internal error (such as the | |
4778 failure stack overflowing). Otherwise, we return the length of the | |
4779 matched substring. */ | |
4780 | |
4781 int | |
442 | 4782 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, |
4783 int size1, const char *string2, int size2, int pos, | |
826 | 4784 struct re_registers *regs, int stop |
4785 RE_LISP_CONTEXT_ARGS_DECL) | |
428 | 4786 { |
460 | 4787 int result; |
4788 | |
4789 #ifdef emacs | |
826 | 4790 scache = setup_syntax_cache (scache, lispobj, lispbuf, |
4791 offset_to_charxpos (lispobj, pos), | |
4792 1); | |
460 | 4793 #endif |
4794 | |
4795 result = re_match_2_internal (bufp, (re_char *) string1, size1, | |
4796 (re_char *) string2, size2, | |
826 | 4797 pos, regs, stop |
4798 RE_LISP_CONTEXT_ARGS); | |
460 | 4799 |
1333 | 4800 ALLOCA_GARBAGE_COLLECT (); |
428 | 4801 return result; |
4802 } | |
4803 | |
4804 /* This is a separate function so that we can force an alloca cleanup | |
4805 afterwards. */ | |
4806 static int | |
446 | 4807 re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1, |
4808 int size1, re_char *string2, int size2, int pos, | |
826 | 4809 struct re_registers *regs, int stop |
2333 | 4810 RE_LISP_CONTEXT_ARGS_MULE_DECL) |
428 | 4811 { |
4812 /* General temporaries. */ | |
4813 int mcnt; | |
4814 unsigned char *p1; | |
4815 int should_succeed; /* XEmacs change */ | |
4816 | |
4817 /* Just past the end of the corresponding string. */ | |
446 | 4818 re_char *end1, *end2; |
428 | 4819 |
4820 /* Pointers into string1 and string2, just past the last characters in | |
4821 each to consider matching. */ | |
446 | 4822 re_char *end_match_1, *end_match_2; |
428 | 4823 |
4824 /* Where we are in the data, and the end of the current string. */ | |
446 | 4825 re_char *d, *dend; |
428 | 4826 |
4827 /* Where we are in the pattern, and the end of the pattern. */ | |
4828 unsigned char *p = bufp->buffer; | |
4829 REGISTER unsigned char *pend = p + bufp->used; | |
4830 | |
4831 /* Mark the opcode just after a start_memory, so we can test for an | |
4832 empty subpattern when we get to the stop_memory. */ | |
446 | 4833 re_char *just_past_start_mem = 0; |
428 | 4834 |
4835 /* We use this to map every character in the string. */ | |
446 | 4836 RE_TRANSLATE_TYPE translate = bufp->translate; |
428 | 4837 |
4838 /* Failure point stack. Each place that can handle a failure further | |
4839 down the line pushes a failure point on this stack. It consists of | |
4840 restart, regend, and reg_info for all registers corresponding to | |
4841 the subexpressions we're currently inside, plus the number of such | |
4842 registers, and, finally, two char *'s. The first char * is where | |
4843 to resume scanning the pattern; the second one is where to resume | |
4844 scanning the strings. If the latter is zero, the failure point is | |
4845 a ``dummy''; if a failure happens and the failure point is a dummy, | |
4846 it gets discarded and the next one is tried. */ | |
4847 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ | |
4848 fail_stack_type fail_stack; | |
4849 #endif | |
4850 #ifdef DEBUG | |
647 | 4851 static int failure_id; |
4852 int nfailure_points_pushed = 0, nfailure_points_popped = 0; | |
428 | 4853 #endif |
4854 | |
771 | 4855 #ifdef REGEX_REL_ALLOC |
428 | 4856 /* This holds the pointer to the failure stack, when |
4857 it is allocated relocatably. */ | |
4858 fail_stack_elt_t *failure_stack_ptr; | |
4859 #endif | |
4860 | |
4861 /* We fill all the registers internally, independent of what we | |
4862 return, for use in backreferences. The number here includes | |
4863 an element for register zero. */ | |
647 | 4864 int num_regs = bufp->re_ngroups + 1; |
428 | 4865 |
4866 /* The currently active registers. */ | |
647 | 4867 int lowest_active_reg = NO_LOWEST_ACTIVE_REG; |
4868 int highest_active_reg = NO_HIGHEST_ACTIVE_REG; | |
428 | 4869 |
4870 /* Information on the contents of registers. These are pointers into | |
4871 the input strings; they record just what was matched (on this | |
4872 attempt) by a subexpression part of the pattern, that is, the | |
4873 regnum-th regstart pointer points to where in the pattern we began | |
4874 matching and the regnum-th regend points to right after where we | |
4875 stopped matching the regnum-th subexpression. (The zeroth register | |
4876 keeps track of what the whole pattern matches.) */ | |
4877 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ | |
446 | 4878 re_char **regstart, **regend; |
428 | 4879 #endif |
4880 | |
4881 /* If a group that's operated upon by a repetition operator fails to | |
4882 match anything, then the register for its start will need to be | |
4883 restored because it will have been set to wherever in the string we | |
4884 are when we last see its open-group operator. Similarly for a | |
4885 register's end. */ | |
4886 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ | |
446 | 4887 re_char **old_regstart, **old_regend; |
428 | 4888 #endif |
4889 | |
4890 /* The is_active field of reg_info helps us keep track of which (possibly | |
4891 nested) subexpressions we are currently in. The matched_something | |
4892 field of reg_info[reg_num] helps us tell whether or not we have | |
4893 matched any of the pattern so far this time through the reg_num-th | |
4894 subexpression. These two fields get reset each time through any | |
4895 loop their register is in. */ | |
4896 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ | |
4897 register_info_type *reg_info; | |
4898 #endif | |
4899 | |
4900 /* The following record the register info as found in the above | |
4901 variables when we find a match better than any we've seen before. | |
4902 This happens as we backtrack through the failure points, which in | |
4903 turn happens only if we have not yet matched the entire string. */ | |
647 | 4904 int best_regs_set = false; |
428 | 4905 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ |
446 | 4906 re_char **best_regstart, **best_regend; |
428 | 4907 #endif |
4908 | |
4909 /* Logically, this is `best_regend[0]'. But we don't want to have to | |
4910 allocate space for that if we're not allocating space for anything | |
4911 else (see below). Also, we never need info about register 0 for | |
4912 any of the other register vectors, and it seems rather a kludge to | |
4913 treat `best_regend' differently than the rest. So we keep track of | |
4914 the end of the best match so far in a separate variable. We | |
4915 initialize this to NULL so that when we backtrack the first time | |
4916 and need to test it, it's not garbage. */ | |
446 | 4917 re_char *match_end = NULL; |
428 | 4918 |
4919 /* This helps SET_REGS_MATCHED avoid doing redundant work. */ | |
4920 int set_regs_matched_done = 0; | |
4921 | |
4922 /* Used when we pop values we don't care about. */ | |
4923 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ | |
446 | 4924 re_char **reg_dummy; |
428 | 4925 register_info_type *reg_info_dummy; |
4926 #endif | |
4927 | |
4928 #ifdef DEBUG | |
4929 /* Counts the total number of registers pushed. */ | |
647 | 4930 int num_regs_pushed = 0; |
428 | 4931 #endif |
4932 | |
4933 /* 1 if this match ends in the same string (string1 or string2) | |
4934 as the best previous match. */ | |
460 | 4935 re_bool same_str_p; |
428 | 4936 |
4937 /* 1 if this match is the best seen so far. */ | |
460 | 4938 re_bool best_match_p; |
428 | 4939 |
826 | 4940 #ifdef emacs |
4941 Internal_Format fmt = buffer_or_other_internal_format (lispobj); | |
1346 | 4942 #ifdef REL_ALLOC |
4943 Ibyte *orig_buftext = | |
4944 BUFFERP (lispobj) ? | |
4945 BYTE_BUF_BYTE_ADDRESS (XBUFFER (lispobj), | |
4946 BYTE_BUF_BEGV (XBUFFER (lispobj))) : | |
4947 0; | |
4948 #endif | |
4949 | |
1333 | 4950 #ifdef ERROR_CHECK_MALLOC |
4951 int depth = bind_regex_malloc_disallowed (1); | |
4952 #endif | |
826 | 4953 #endif /* emacs */ |
771 | 4954 |
5041 | 4955 DEBUG_MATCH_PRINT1 ("\n\nEntering re_match_2.\n"); |
428 | 4956 |
1333 | 4957 BEGIN_REGEX_MALLOC_OK (); |
428 | 4958 INIT_FAIL_STACK (); |
1333 | 4959 END_REGEX_MALLOC_OK (); |
428 | 4960 |
4961 #ifdef MATCH_MAY_ALLOCATE | |
4962 /* Do not bother to initialize all the register variables if there are | |
4963 no groups in the pattern, as it takes a fair amount of time. If | |
4964 there are groups, we include space for register 0 (the whole | |
4965 pattern), even though we never use it, since it simplifies the | |
4966 array indexing. We should fix this. */ | |
502 | 4967 if (bufp->re_ngroups) |
428 | 4968 { |
1333 | 4969 BEGIN_REGEX_MALLOC_OK (); |
446 | 4970 regstart = REGEX_TALLOC (num_regs, re_char *); |
4971 regend = REGEX_TALLOC (num_regs, re_char *); | |
4972 old_regstart = REGEX_TALLOC (num_regs, re_char *); | |
4973 old_regend = REGEX_TALLOC (num_regs, re_char *); | |
4974 best_regstart = REGEX_TALLOC (num_regs, re_char *); | |
4975 best_regend = REGEX_TALLOC (num_regs, re_char *); | |
428 | 4976 reg_info = REGEX_TALLOC (num_regs, register_info_type); |
446 | 4977 reg_dummy = REGEX_TALLOC (num_regs, re_char *); |
428 | 4978 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type); |
1333 | 4979 END_REGEX_MALLOC_OK (); |
428 | 4980 |
4981 if (!(regstart && regend && old_regstart && old_regend && reg_info | |
4982 && best_regstart && best_regend && reg_dummy && reg_info_dummy)) | |
4983 { | |
4984 FREE_VARIABLES (); | |
4985 return -2; | |
4986 } | |
4987 } | |
4988 else | |
4989 { | |
4990 /* We must initialize all our variables to NULL, so that | |
4991 `FREE_VARIABLES' doesn't try to free them. */ | |
4992 regstart = regend = old_regstart = old_regend = best_regstart | |
4993 = best_regend = reg_dummy = NULL; | |
4994 reg_info = reg_info_dummy = (register_info_type *) NULL; | |
4995 } | |
4996 #endif /* MATCH_MAY_ALLOCATE */ | |
4997 | |
1333 | 4998 #if defined (emacs) && defined (REL_ALLOC) |
4999 { | |
5000 /* If the allocations above (or the call to setup_syntax_cache() in | |
5001 re_match_2) caused a rel-alloc relocation, then fix up the data | |
5002 pointers */ | |
1346 | 5003 Bytecount offset = offset_post_relocation (lispobj, orig_buftext); |
1333 | 5004 if (offset) |
5005 { | |
5006 string1 += offset; | |
5007 string2 += offset; | |
5008 } | |
5009 } | |
5010 #endif /* defined (emacs) && defined (REL_ALLOC) */ | |
5011 | |
428 | 5012 /* The starting position is bogus. */ |
5013 if (pos < 0 || pos > size1 + size2) | |
5014 { | |
5015 FREE_VARIABLES (); | |
5016 return -1; | |
5017 } | |
5018 | |
5019 /* Initialize subexpression text positions to -1 to mark ones that no | |
5020 start_memory/stop_memory has been seen for. Also initialize the | |
5021 register information struct. */ | |
5022 for (mcnt = 1; mcnt < num_regs; mcnt++) | |
5023 { | |
5024 regstart[mcnt] = regend[mcnt] | |
5025 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE; | |
5026 | |
5027 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE; | |
5028 IS_ACTIVE (reg_info[mcnt]) = 0; | |
5029 MATCHED_SOMETHING (reg_info[mcnt]) = 0; | |
5030 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0; | |
5031 } | |
5032 /* We move `string1' into `string2' if the latter's empty -- but not if | |
5033 `string1' is null. */ | |
5034 if (size2 == 0 && string1 != NULL) | |
5035 { | |
5036 string2 = string1; | |
5037 size2 = size1; | |
5038 string1 = 0; | |
5039 size1 = 0; | |
5040 } | |
5041 end1 = string1 + size1; | |
5042 end2 = string2 + size2; | |
5043 | |
5044 /* Compute where to stop matching, within the two strings. */ | |
5045 if (stop <= size1) | |
5046 { | |
5047 end_match_1 = string1 + stop; | |
5048 end_match_2 = string2; | |
5049 } | |
5050 else | |
5051 { | |
5052 end_match_1 = end1; | |
5053 end_match_2 = string2 + stop - size1; | |
5054 } | |
5055 | |
5056 /* `p' scans through the pattern as `d' scans through the data. | |
5057 `dend' is the end of the input string that `d' points within. `d' | |
5058 is advanced into the following input string whenever necessary, but | |
5059 this happens before fetching; therefore, at the beginning of the | |
5060 loop, `d' can be pointing at the end of a string, but it cannot | |
5061 equal `string2'. */ | |
5062 if (size1 > 0 && pos <= size1) | |
5063 { | |
5064 d = string1 + pos; | |
5065 dend = end_match_1; | |
5066 } | |
5067 else | |
5068 { | |
5069 d = string2 + pos - size1; | |
5070 dend = end_match_2; | |
5071 } | |
5072 | |
5041 | 5073 DEBUG_MATCH_PRINT1 ("The compiled pattern is: \n"); |
5074 DEBUG_MATCH_PRINT_COMPILED_PATTERN (bufp, p, pend); | |
5075 DEBUG_MATCH_PRINT1 ("The string to match is: `"); | |
5076 DEBUG_MATCH_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2); | |
5077 DEBUG_MATCH_PRINT1 ("'\n"); | |
428 | 5078 |
5079 /* This loops over pattern commands. It exits by returning from the | |
5080 function if the match is complete, or it drops through if the match | |
5081 fails at this starting point in the input data. */ | |
5082 for (;;) | |
5083 { | |
5041 | 5084 DEBUG_MATCH_PRINT2 ("\n0x%lx: ", (long) p); |
428 | 5085 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */ |
5086 if (!no_quit_in_re_search) | |
1333 | 5087 { |
5088 BEGIN_REGEX_MALLOC_OK (); | |
5089 QUIT; | |
5090 END_REGEX_MALLOC_OK (); | |
1346 | 5091 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); |
1333 | 5092 } |
428 | 5093 #endif |
5094 | |
5095 if (p == pend) | |
5096 { /* End of pattern means we might have succeeded. */ | |
5041 | 5097 DEBUG_MATCH_PRINT1 ("end of pattern ... "); |
428 | 5098 |
5099 /* If we haven't matched the entire string, and we want the | |
5100 longest match, try backtracking. */ | |
5101 if (d != end_match_2) | |
5102 { | |
5103 same_str_p = (FIRST_STRING_P (match_end) | |
5104 == MATCHING_IN_FIRST_STRING); | |
5105 | |
5106 /* AIX compiler got confused when this was combined | |
5107 with the previous declaration. */ | |
5108 if (same_str_p) | |
5109 best_match_p = d > match_end; | |
5110 else | |
5111 best_match_p = !MATCHING_IN_FIRST_STRING; | |
5112 | |
5041 | 5113 DEBUG_MATCH_PRINT1 ("backtracking.\n"); |
428 | 5114 |
5115 if (!FAIL_STACK_EMPTY ()) | |
5116 { /* More failure points to try. */ | |
5117 | |
5118 /* If exceeds best match so far, save it. */ | |
5119 if (!best_regs_set || best_match_p) | |
5120 { | |
5121 best_regs_set = true; | |
5122 match_end = d; | |
5123 | |
5041 | 5124 DEBUG_MATCH_PRINT1 ("\nSAVING match as best so far.\n"); |
428 | 5125 |
5126 for (mcnt = 1; mcnt < num_regs; mcnt++) | |
5127 { | |
5128 best_regstart[mcnt] = regstart[mcnt]; | |
5129 best_regend[mcnt] = regend[mcnt]; | |
5130 } | |
5131 } | |
5132 goto fail; | |
5133 } | |
5134 | |
5135 /* If no failure points, don't restore garbage. And if | |
5136 last match is real best match, don't restore second | |
5137 best one. */ | |
5138 else if (best_regs_set && !best_match_p) | |
5139 { | |
5140 restore_best_regs: | |
5141 /* Restore best match. It may happen that `dend == | |
5142 end_match_1' while the restored d is in string2. | |
5143 For example, the pattern `x.*y.*z' against the | |
5144 strings `x-' and `y-z-', if the two strings are | |
5145 not consecutive in memory. */ | |
5041 | 5146 DEBUG_MATCH_PRINT1 ("Restoring best registers.\n"); |
428 | 5147 |
5148 d = match_end; | |
5149 dend = ((d >= string1 && d <= end1) | |
5150 ? end_match_1 : end_match_2); | |
5151 | |
5152 for (mcnt = 1; mcnt < num_regs; mcnt++) | |
5153 { | |
5154 regstart[mcnt] = best_regstart[mcnt]; | |
5155 regend[mcnt] = best_regend[mcnt]; | |
5156 } | |
5157 } | |
5158 } /* d != end_match_2 */ | |
5159 | |
5160 succeed_label: | |
5041 | 5161 DEBUG_MATCH_PRINT1 ("Accepting match.\n"); |
428 | 5162 |
5163 /* If caller wants register contents data back, do it. */ | |
1028 | 5164 { |
5165 int num_nonshy_regs = bufp->re_nsub + 1; | |
5166 if (regs && !bufp->no_sub) | |
5167 { | |
5168 /* Have the register data arrays been allocated? */ | |
5169 if (bufp->regs_allocated == REGS_UNALLOCATED) | |
5170 { /* No. So allocate them with malloc. We need one | |
5171 extra element beyond `num_regs' for the `-1' marker | |
5172 GNU code uses. */ | |
5173 regs->num_regs = MAX (RE_NREGS, num_nonshy_regs + 1); | |
1333 | 5174 BEGIN_REGEX_MALLOC_OK (); |
1028 | 5175 regs->start = TALLOC (regs->num_regs, regoff_t); |
5176 regs->end = TALLOC (regs->num_regs, regoff_t); | |
1333 | 5177 END_REGEX_MALLOC_OK (); |
5178 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
1028 | 5179 if (regs->start == NULL || regs->end == NULL) |
5180 { | |
5181 FREE_VARIABLES (); | |
5182 return -2; | |
5183 } | |
5184 bufp->regs_allocated = REGS_REALLOCATE; | |
5185 } | |
5186 else if (bufp->regs_allocated == REGS_REALLOCATE) | |
5187 { /* Yes. If we need more elements than were already | |
5188 allocated, reallocate them. If we need fewer, just | |
5189 leave it alone. */ | |
5190 if (regs->num_regs < num_nonshy_regs + 1) | |
5191 { | |
5192 regs->num_regs = num_nonshy_regs + 1; | |
1333 | 5193 BEGIN_REGEX_MALLOC_OK (); |
1028 | 5194 RETALLOC (regs->start, regs->num_regs, regoff_t); |
5195 RETALLOC (regs->end, regs->num_regs, regoff_t); | |
1333 | 5196 END_REGEX_MALLOC_OK (); |
5197 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
1028 | 5198 if (regs->start == NULL || regs->end == NULL) |
5199 { | |
5200 FREE_VARIABLES (); | |
5201 return -2; | |
5202 } | |
5203 } | |
5204 } | |
5205 else | |
5206 { | |
5207 /* The braces fend off an "empty body in an else-statement" | |
5208 warning under GCC when assert expands to nothing. */ | |
5209 assert (bufp->regs_allocated == REGS_FIXED); | |
5210 } | |
5211 | |
5212 /* Convert the pointer data in `regstart' and `regend' to | |
5213 indices. Register zero has to be set differently, | |
5214 since we haven't kept track of any info for it. */ | |
5215 if (regs->num_regs > 0) | |
5216 { | |
5217 regs->start[0] = pos; | |
5218 regs->end[0] = (MATCHING_IN_FIRST_STRING | |
5219 ? ((regoff_t) (d - string1)) | |
5220 : ((regoff_t) (d - string2 + size1))); | |
5221 } | |
5222 | |
2639 | 5223 /* Map over the NUM_NONSHY_REGS non-shy internal registers. |
5224 Copy each into the corresponding external register. | |
5225 MCNT indexes external registers. */ | |
1028 | 5226 for (mcnt = 1; mcnt < MIN (num_nonshy_regs, regs->num_regs); |
5227 mcnt++) | |
5228 { | |
5229 int internal_reg = bufp->external_to_internal_register[mcnt]; | |
5230 if (REG_UNSET (regstart[internal_reg]) || | |
5231 REG_UNSET (regend[internal_reg])) | |
5232 regs->start[mcnt] = regs->end[mcnt] = -1; | |
5233 else | |
5234 { | |
5235 regs->start[mcnt] = | |
5236 (regoff_t) POINTER_TO_OFFSET (regstart[internal_reg]); | |
5237 regs->end[mcnt] = | |
5238 (regoff_t) POINTER_TO_OFFSET (regend[internal_reg]); | |
5239 } | |
5240 } | |
5241 } /* regs && !bufp->no_sub */ | |
5242 | |
5243 /* If we have regs and the regs structure has more elements than | |
2639 | 5244 were in the pattern, set the extra elements starting with |
5245 NUM_NONSHY_REGS to -1. If we (re)allocated the registers, | |
5246 this is the case, because we always allocate enough to have | |
5247 at least one -1 at the end. | |
1028 | 5248 |
5249 We do this even when no_sub is set because some applications | |
5250 (XEmacs) reuse register structures which may contain stale | |
5251 information, and permit attempts to access those registers. | |
5252 | |
5253 It would be possible to require the caller to do this, but we'd | |
5254 have to change the API for this function to reflect that, and | |
1425 | 5255 audit all callers. Note: as of 2003-04-17 callers in XEmacs |
5256 do clear the registers, but it's safer to leave this code in | |
5257 because of reallocation. | |
5258 */ | |
1028 | 5259 if (regs && regs->num_regs > 0) |
5260 for (mcnt = num_nonshy_regs; mcnt < regs->num_regs; mcnt++) | |
5261 regs->start[mcnt] = regs->end[mcnt] = -1; | |
5262 } | |
5041 | 5263 DEBUG_MATCH_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n", |
428 | 5264 nfailure_points_pushed, nfailure_points_popped, |
5265 nfailure_points_pushed - nfailure_points_popped); | |
5041 | 5266 DEBUG_MATCH_PRINT2 ("%u registers pushed.\n", num_regs_pushed); |
428 | 5267 |
5268 mcnt = d - pos - (MATCHING_IN_FIRST_STRING | |
5269 ? string1 | |
5270 : string2 - size1); | |
5271 | |
5041 | 5272 DEBUG_MATCH_PRINT2 ("Returning %d from re_match_2.\n", mcnt); |
428 | 5273 |
5274 FREE_VARIABLES (); | |
5275 return mcnt; | |
5276 } | |
5277 | |
5278 /* Otherwise match next pattern command. */ | |
4759
aa5ed11f473b
Remove support for obsolete systems. See xemacs-patches message with ID
Jerry James <james@xemacs.org>
parents:
4750
diff
changeset
|
5279 switch ((re_opcode_t) *p++) |
428 | 5280 { |
5281 /* Ignore these. Used to ignore the n of succeed_n's which | |
5282 currently have n == 0. */ | |
5283 case no_op: | |
5041 | 5284 DEBUG_MATCH_PRINT1 ("EXECUTING no_op.\n"); |
428 | 5285 break; |
5286 | |
5287 case succeed: | |
5041 | 5288 DEBUG_MATCH_PRINT1 ("EXECUTING succeed.\n"); |
428 | 5289 goto succeed_label; |
5290 | |
826 | 5291 /* Match exactly a string of length n in the pattern. The |
5292 following byte in the pattern defines n, and the n bytes after | |
5293 that make up the string to match. (Under Mule, this will be in | |
5294 the default internal format.) */ | |
428 | 5295 case exactn: |
5296 mcnt = *p++; | |
5041 | 5297 DEBUG_MATCH_PRINT2 ("EXECUTING exactn %d.\n", mcnt); |
428 | 5298 |
5299 /* This is written out as an if-else so we don't waste time | |
5300 testing `translate' inside the loop. */ | |
446 | 5301 if (TRANSLATE_P (translate)) |
428 | 5302 { |
5303 do | |
5304 { | |
446 | 5305 #ifdef MULE |
5306 Bytecount pat_len; | |
5307 | |
450 | 5308 REGEX_PREFETCH (); |
867 | 5309 if (RE_TRANSLATE_1 (itext_ichar_fmt (d, fmt, lispobj)) |
5310 != itext_ichar (p)) | |
428 | 5311 goto fail; |
446 | 5312 |
867 | 5313 pat_len = itext_ichar_len (p); |
446 | 5314 p += pat_len; |
867 | 5315 INC_IBYTEPTR_FMT (d, fmt); |
446 | 5316 |
5317 mcnt -= pat_len; | |
5318 #else /* not MULE */ | |
450 | 5319 REGEX_PREFETCH (); |
826 | 5320 if ((unsigned char) RE_TRANSLATE_1 (*d++) != *p++) |
446 | 5321 goto fail; |
5322 mcnt--; | |
5323 #endif | |
428 | 5324 } |
446 | 5325 while (mcnt > 0); |
428 | 5326 } |
5327 else | |
5328 { | |
826 | 5329 #ifdef MULE |
5330 /* If buffer format is default, then we can shortcut and just | |
5331 compare the text directly, byte by byte. Otherwise, we | |
5332 need to go character by character. */ | |
5333 if (fmt != FORMAT_DEFAULT) | |
428 | 5334 { |
826 | 5335 do |
5336 { | |
5337 Bytecount pat_len; | |
5338 | |
5339 REGEX_PREFETCH (); | |
867 | 5340 if (itext_ichar_fmt (d, fmt, lispobj) != |
5341 itext_ichar (p)) | |
826 | 5342 goto fail; |
5343 | |
867 | 5344 pat_len = itext_ichar_len (p); |
826 | 5345 p += pat_len; |
867 | 5346 INC_IBYTEPTR_FMT (d, fmt); |
826 | 5347 |
5348 mcnt -= pat_len; | |
5349 } | |
5350 while (mcnt > 0); | |
428 | 5351 } |
826 | 5352 else |
5353 #endif | |
5354 { | |
5355 do | |
5356 { | |
5357 REGEX_PREFETCH (); | |
5358 if (*d++ != *p++) goto fail; | |
5359 mcnt--; | |
5360 } | |
5361 while (mcnt > 0); | |
5362 } | |
428 | 5363 } |
5364 SET_REGS_MATCHED (); | |
5365 break; | |
5366 | |
5367 | |
5368 /* Match any character except possibly a newline or a null. */ | |
5369 case anychar: | |
5041 | 5370 DEBUG_MATCH_PRINT1 ("EXECUTING anychar.\n"); |
428 | 5371 |
450 | 5372 REGEX_PREFETCH (); |
428 | 5373 |
826 | 5374 if ((!(bufp->syntax & RE_DOT_NEWLINE) && |
867 | 5375 RE_TRANSLATE (itext_ichar_fmt (d, fmt, lispobj)) == '\n') |
826 | 5376 || (bufp->syntax & RE_DOT_NOT_NULL && |
867 | 5377 RE_TRANSLATE (itext_ichar_fmt (d, fmt, lispobj)) == |
826 | 5378 '\000')) |
428 | 5379 goto fail; |
5380 | |
5381 SET_REGS_MATCHED (); | |
5041 | 5382 DEBUG_MATCH_PRINT2 (" Matched `%d'.\n", *d); |
867 | 5383 INC_IBYTEPTR_FMT (d, fmt); /* XEmacs change */ |
428 | 5384 break; |
5385 | |
5386 | |
5387 case charset: | |
5388 case charset_not: | |
5389 { | |
1414 | 5390 REGISTER Ichar c; |
460 | 5391 re_bool not_p = (re_opcode_t) *(p - 1) == charset_not; |
458 | 5392 |
5041 | 5393 DEBUG_MATCH_PRINT2 ("EXECUTING charset%s.\n", not_p ? "_not" : ""); |
428 | 5394 |
450 | 5395 REGEX_PREFETCH (); |
867 | 5396 c = itext_ichar_fmt (d, fmt, lispobj); |
826 | 5397 c = RE_TRANSLATE (c); /* The character to match. */ |
428 | 5398 |
647 | 5399 /* Cast to `unsigned int' instead of `unsigned char' in case the |
428 | 5400 bit list is a full 32 bytes long. */ |
1414 | 5401 if ((unsigned int)c < (unsigned int) (*p * BYTEWIDTH) |
428 | 5402 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) |
458 | 5403 not_p = !not_p; |
428 | 5404 |
5405 p += 1 + *p; | |
5406 | |
458 | 5407 if (!not_p) goto fail; |
428 | 5408 |
5409 SET_REGS_MATCHED (); | |
867 | 5410 INC_IBYTEPTR_FMT (d, fmt); /* XEmacs change */ |
428 | 5411 break; |
5412 } | |
5413 | |
5414 #ifdef MULE | |
5415 case charset_mule: | |
5416 case charset_mule_not: | |
5417 { | |
867 | 5418 REGISTER Ichar c; |
460 | 5419 re_bool not_p = (re_opcode_t) *(p - 1) == charset_mule_not; |
458 | 5420 |
5041 | 5421 DEBUG_MATCH_PRINT2 ("EXECUTING charset_mule%s.\n", not_p ? "_not" : ""); |
428 | 5422 |
450 | 5423 REGEX_PREFETCH (); |
867 | 5424 c = itext_ichar_fmt (d, fmt, lispobj); |
826 | 5425 c = RE_TRANSLATE (c); /* The character to match. */ |
428 | 5426 |
5427 if (EQ (Qt, unified_range_table_lookup (p, c, Qnil))) | |
458 | 5428 not_p = !not_p; |
428 | 5429 |
5430 p += unified_range_table_bytes_used (p); | |
5431 | |
458 | 5432 if (!not_p) goto fail; |
428 | 5433 |
5434 SET_REGS_MATCHED (); | |
867 | 5435 INC_IBYTEPTR_FMT (d, fmt); |
428 | 5436 break; |
5437 } | |
5438 #endif /* MULE */ | |
5439 | |
5440 | |
5441 /* The beginning of a group is represented by start_memory. | |
5442 The arguments are the register number in the next byte, and the | |
5443 number of groups inner to this one in the next. The text | |
5444 matched within the group is recorded (in the internal | |
5445 registers data structure) under the register number. */ | |
5446 case start_memory: | |
5041 | 5447 DEBUG_MATCH_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]); |
428 | 5448 |
5449 /* Find out if this group can match the empty string. */ | |
5450 p1 = p; /* To send to group_match_null_string_p. */ | |
5451 | |
5452 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE) | |
2639 | 5453 REG_MATCH_NULL_STRING_P (reg_info[*p]) |
5454 = group_match_null_string_p (&p1, pend, reg_info); | |
5455 | |
5041 | 5456 DEBUG_MATCH_PRINT2 (" group CAN%s match null string\n", |
2639 | 5457 REG_MATCH_NULL_STRING_P (reg_info[*p]) ? "NOT" : ""); |
428 | 5458 |
5459 /* Save the position in the string where we were the last time | |
5460 we were at this open-group operator in case the group is | |
5461 operated upon by a repetition operator, e.g., with `(a*)*b' | |
5462 against `ab'; then we want to ignore where we are now in | |
5463 the string in case this attempt to match fails. */ | |
5464 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) | |
5465 ? REG_UNSET (regstart[*p]) ? d : regstart[*p] | |
5466 : regstart[*p]; | |
5041 | 5467 DEBUG_MATCH_PRINT2 (" old_regstart: %d\n", |
428 | 5468 POINTER_TO_OFFSET (old_regstart[*p])); |
5469 | |
5470 regstart[*p] = d; | |
5041 | 5471 DEBUG_MATCH_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p])); |
428 | 5472 |
5473 IS_ACTIVE (reg_info[*p]) = 1; | |
5474 MATCHED_SOMETHING (reg_info[*p]) = 0; | |
5475 | |
5476 /* Clear this whenever we change the register activity status. */ | |
5477 set_regs_matched_done = 0; | |
5478 | |
5479 /* This is the new highest active register. */ | |
5480 highest_active_reg = *p; | |
5481 | |
5482 /* If nothing was active before, this is the new lowest active | |
5483 register. */ | |
5484 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) | |
5485 lowest_active_reg = *p; | |
5486 | |
5487 /* Move past the register number and inner group count. */ | |
5488 p += 2; | |
5489 just_past_start_mem = p; | |
5490 | |
5491 break; | |
5492 | |
5493 | |
5494 /* The stop_memory opcode represents the end of a group. Its | |
5495 arguments are the same as start_memory's: the register | |
5496 number, and the number of inner groups. */ | |
5497 case stop_memory: | |
5041 | 5498 DEBUG_MATCH_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]); |
428 | 5499 |
5500 /* We need to save the string position the last time we were at | |
5501 this close-group operator in case the group is operated | |
5502 upon by a repetition operator, e.g., with `((a*)*(b*)*)*' | |
5503 against `aba'; then we want to ignore where we are now in | |
5504 the string in case this attempt to match fails. */ | |
5505 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) | |
5506 ? REG_UNSET (regend[*p]) ? d : regend[*p] | |
5507 : regend[*p]; | |
5041 | 5508 DEBUG_MATCH_PRINT2 (" old_regend: %d\n", |
428 | 5509 POINTER_TO_OFFSET (old_regend[*p])); |
5510 | |
5511 regend[*p] = d; | |
5041 | 5512 DEBUG_MATCH_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p])); |
428 | 5513 |
5514 /* This register isn't active anymore. */ | |
5515 IS_ACTIVE (reg_info[*p]) = 0; | |
5516 | |
5517 /* Clear this whenever we change the register activity status. */ | |
5518 set_regs_matched_done = 0; | |
5519 | |
5520 /* If this was the only register active, nothing is active | |
5521 anymore. */ | |
5522 if (lowest_active_reg == highest_active_reg) | |
5523 { | |
5524 lowest_active_reg = NO_LOWEST_ACTIVE_REG; | |
5525 highest_active_reg = NO_HIGHEST_ACTIVE_REG; | |
5526 } | |
5527 else | |
5528 { /* We must scan for the new highest active register, since | |
5529 it isn't necessarily one less than now: consider | |
5530 (a(b)c(d(e)f)g). When group 3 ends, after the f), the | |
5531 new highest active register is 1. */ | |
5532 unsigned char r = *p - 1; | |
5533 while (r > 0 && !IS_ACTIVE (reg_info[r])) | |
5534 r--; | |
5535 | |
5536 /* If we end up at register zero, that means that we saved | |
5537 the registers as the result of an `on_failure_jump', not | |
5538 a `start_memory', and we jumped to past the innermost | |
5539 `stop_memory'. For example, in ((.)*) we save | |
5540 registers 1 and 2 as a result of the *, but when we pop | |
5541 back to the second ), we are at the stop_memory 1. | |
5542 Thus, nothing is active. */ | |
5543 if (r == 0) | |
5544 { | |
5545 lowest_active_reg = NO_LOWEST_ACTIVE_REG; | |
5546 highest_active_reg = NO_HIGHEST_ACTIVE_REG; | |
5547 } | |
5548 else | |
5549 { | |
5550 highest_active_reg = r; | |
5551 | |
5552 /* 98/9/21 jhod: We've also gotta set lowest_active_reg, don't we? */ | |
5553 r = 1; | |
5554 while (r < highest_active_reg && !IS_ACTIVE(reg_info[r])) | |
5555 r++; | |
5556 lowest_active_reg = r; | |
5557 } | |
5558 } | |
5559 | |
5560 /* If we just failed to match something this time around with a | |
5561 group that's operated on by a repetition operator, try to | |
5562 force exit from the ``loop'', and restore the register | |
5563 information for this group that we had before trying this | |
5564 last match. */ | |
5565 if ((!MATCHED_SOMETHING (reg_info[*p]) | |
5566 || just_past_start_mem == p - 1) | |
5567 && (p + 2) < pend) | |
5568 { | |
460 | 5569 re_bool is_a_jump_n = false; |
428 | 5570 |
5571 p1 = p + 2; | |
5572 mcnt = 0; | |
5573 switch ((re_opcode_t) *p1++) | |
5574 { | |
5575 case jump_n: | |
5576 is_a_jump_n = true; | |
5577 case pop_failure_jump: | |
5578 case maybe_pop_jump: | |
5579 case jump: | |
5580 case dummy_failure_jump: | |
5581 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
5582 if (is_a_jump_n) | |
5583 p1 += 2; | |
5584 break; | |
5585 | |
5586 default: | |
5587 /* do nothing */ ; | |
5588 } | |
5589 p1 += mcnt; | |
5590 | |
5591 /* If the next operation is a jump backwards in the pattern | |
5592 to an on_failure_jump right before the start_memory | |
5593 corresponding to this stop_memory, exit from the loop | |
5594 by forcing a failure after pushing on the stack the | |
5595 on_failure_jump's jump in the pattern, and d. */ | |
5596 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump | |
5597 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p) | |
5598 { | |
5599 /* If this group ever matched anything, then restore | |
5600 what its registers were before trying this last | |
5601 failed match, e.g., with `(a*)*b' against `ab' for | |
5602 regstart[1], and, e.g., with `((a*)*(b*)*)*' | |
5603 against `aba' for regend[3]. | |
5604 | |
5605 Also restore the registers for inner groups for, | |
5606 e.g., `((a*)(b*))*' against `aba' (register 3 would | |
5607 otherwise get trashed). */ | |
5608 | |
5609 if (EVER_MATCHED_SOMETHING (reg_info[*p])) | |
5610 { | |
647 | 5611 int r; |
428 | 5612 |
5613 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0; | |
5614 | |
5615 /* Restore this and inner groups' (if any) registers. */ | |
5616 for (r = *p; r < *p + *(p + 1); r++) | |
5617 { | |
5618 regstart[r] = old_regstart[r]; | |
5619 | |
5620 /* xx why this test? */ | |
5621 if (old_regend[r] >= regstart[r]) | |
5622 regend[r] = old_regend[r]; | |
5623 } | |
5624 } | |
5625 p1++; | |
5626 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
5627 PUSH_FAILURE_POINT (p1 + mcnt, d, -2); | |
5628 | |
5629 goto fail; | |
5630 } | |
5631 } | |
5632 | |
5633 /* Move past the register number and the inner group count. */ | |
5634 p += 2; | |
5635 break; | |
5636 | |
5637 | |
5638 /* \<digit> has been turned into a `duplicate' command which is | |
502 | 5639 followed by the numeric value of <digit> as the register number. |
5640 (Already passed through external-to-internal-register mapping, | |
5641 so it refers to the actual group number, not the non-shy-only | |
5642 numbering used in the external world.) */ | |
428 | 5643 case duplicate: |
5644 { | |
446 | 5645 REGISTER re_char *d2, *dend2; |
502 | 5646 /* Get which register to match against. */ |
5647 int regno = *p++; | |
5041 | 5648 DEBUG_MATCH_PRINT2 ("EXECUTING duplicate %d.\n", regno); |
428 | 5649 |
5650 /* Can't back reference a group which we've never matched. */ | |
5651 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno])) | |
5652 goto fail; | |
5653 | |
5654 /* Where in input to try to start matching. */ | |
5655 d2 = regstart[regno]; | |
5656 | |
5657 /* Where to stop matching; if both the place to start and | |
5658 the place to stop matching are in the same string, then | |
5659 set to the place to stop, otherwise, for now have to use | |
5660 the end of the first string. */ | |
5661 | |
5662 dend2 = ((FIRST_STRING_P (regstart[regno]) | |
5663 == FIRST_STRING_P (regend[regno])) | |
5664 ? regend[regno] : end_match_1); | |
5665 for (;;) | |
5666 { | |
5667 /* If necessary, advance to next segment in register | |
5668 contents. */ | |
5669 while (d2 == dend2) | |
5670 { | |
5671 if (dend2 == end_match_2) break; | |
5672 if (dend2 == regend[regno]) break; | |
5673 | |
5674 /* End of string1 => advance to string2. */ | |
5675 d2 = string2; | |
5676 dend2 = regend[regno]; | |
5677 } | |
5678 /* At end of register contents => success */ | |
5679 if (d2 == dend2) break; | |
5680 | |
5681 /* If necessary, advance to next segment in data. */ | |
450 | 5682 REGEX_PREFETCH (); |
428 | 5683 |
5684 /* How many characters left in this segment to match. */ | |
5685 mcnt = dend - d; | |
5686 | |
5687 /* Want how many consecutive characters we can match in | |
5688 one shot, so, if necessary, adjust the count. */ | |
5689 if (mcnt > dend2 - d2) | |
5690 mcnt = dend2 - d2; | |
5691 | |
5692 /* Compare that many; failure if mismatch, else move | |
5693 past them. */ | |
446 | 5694 if (TRANSLATE_P (translate) |
826 | 5695 ? bcmp_translate (d, d2, mcnt, translate |
5696 #ifdef emacs | |
5697 , fmt, lispobj | |
5698 #endif | |
5699 ) | |
428 | 5700 : memcmp (d, d2, mcnt)) |
5701 goto fail; | |
5702 d += mcnt, d2 += mcnt; | |
5703 | |
5704 /* Do this because we've match some characters. */ | |
5705 SET_REGS_MATCHED (); | |
5706 } | |
5707 } | |
5708 break; | |
5709 | |
5710 | |
5711 /* begline matches the empty string at the beginning of the string | |
5712 (unless `not_bol' is set in `bufp'), and, if | |
5713 `newline_anchor' is set, after newlines. */ | |
5714 case begline: | |
5041 | 5715 DEBUG_MATCH_PRINT1 ("EXECUTING begline.\n"); |
428 | 5716 |
5717 if (AT_STRINGS_BEG (d)) | |
5718 { | |
5719 if (!bufp->not_bol) break; | |
5720 } | |
826 | 5721 else |
5722 { | |
5723 re_char *d2 = d; | |
867 | 5724 DEC_IBYTEPTR (d2); |
5725 if (itext_ichar_ascii_fmt (d2, fmt, lispobj) == '\n' && | |
826 | 5726 bufp->newline_anchor) |
5727 break; | |
5728 } | |
428 | 5729 /* In all other cases, we fail. */ |
5730 goto fail; | |
5731 | |
5732 | |
5733 /* endline is the dual of begline. */ | |
5734 case endline: | |
5041 | 5735 DEBUG_MATCH_PRINT1 ("EXECUTING endline.\n"); |
428 | 5736 |
5737 if (AT_STRINGS_END (d)) | |
5738 { | |
5739 if (!bufp->not_eol) break; | |
5740 } | |
5741 | |
5742 /* We have to ``prefetch'' the next character. */ | |
826 | 5743 else if ((d == end1 ? |
867 | 5744 itext_ichar_ascii_fmt (string2, fmt, lispobj) : |
5745 itext_ichar_ascii_fmt (d, fmt, lispobj)) == '\n' | |
428 | 5746 && bufp->newline_anchor) |
5747 { | |
5748 break; | |
5749 } | |
5750 goto fail; | |
5751 | |
5752 | |
5753 /* Match at the very beginning of the data. */ | |
5754 case begbuf: | |
5041 | 5755 DEBUG_MATCH_PRINT1 ("EXECUTING begbuf.\n"); |
428 | 5756 if (AT_STRINGS_BEG (d)) |
5757 break; | |
5758 goto fail; | |
5759 | |
5760 | |
5761 /* Match at the very end of the data. */ | |
5762 case endbuf: | |
5041 | 5763 DEBUG_MATCH_PRINT1 ("EXECUTING endbuf.\n"); |
428 | 5764 if (AT_STRINGS_END (d)) |
5765 break; | |
5766 goto fail; | |
5767 | |
5768 | |
5769 /* on_failure_keep_string_jump is used to optimize `.*\n'. It | |
5770 pushes NULL as the value for the string on the stack. Then | |
5771 `pop_failure_point' will keep the current value for the | |
5772 string, instead of restoring it. To see why, consider | |
5773 matching `foo\nbar' against `.*\n'. The .* matches the foo; | |
5774 then the . fails against the \n. But the next thing we want | |
5775 to do is match the \n against the \n; if we restored the | |
5776 string value, we would be back at the foo. | |
5777 | |
5778 Because this is used only in specific cases, we don't need to | |
5779 check all the things that `on_failure_jump' does, to make | |
5780 sure the right things get saved on the stack. Hence we don't | |
5781 share its code. The only reason to push anything on the | |
5782 stack at all is that otherwise we would have to change | |
5783 `anychar's code to do something besides goto fail in this | |
5784 case; that seems worse than this. */ | |
5785 case on_failure_keep_string_jump: | |
5041 | 5786 DEBUG_MATCH_PRINT1 ("EXECUTING on_failure_keep_string_jump"); |
428 | 5787 |
5788 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
5041 | 5789 DEBUG_MATCH_PRINT3 (" %d (to 0x%lx):\n", mcnt, (long) (p + mcnt)); |
428 | 5790 |
446 | 5791 PUSH_FAILURE_POINT (p + mcnt, (unsigned char *) 0, -2); |
428 | 5792 break; |
5793 | |
5794 | |
5795 /* Uses of on_failure_jump: | |
5796 | |
5797 Each alternative starts with an on_failure_jump that points | |
5798 to the beginning of the next alternative. Each alternative | |
5799 except the last ends with a jump that in effect jumps past | |
5800 the rest of the alternatives. (They really jump to the | |
5801 ending jump of the following alternative, because tensioning | |
5802 these jumps is a hassle.) | |
5803 | |
5804 Repeats start with an on_failure_jump that points past both | |
5805 the repetition text and either the following jump or | |
5806 pop_failure_jump back to this on_failure_jump. */ | |
5807 case on_failure_jump: | |
5808 on_failure: | |
5041 | 5809 DEBUG_MATCH_PRINT1 ("EXECUTING on_failure_jump"); |
428 | 5810 |
5811 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
5041 | 5812 DEBUG_MATCH_PRINT3 (" %d (to 0x%lx)", mcnt, (long) (p + mcnt)); |
428 | 5813 |
5814 /* If this on_failure_jump comes right before a group (i.e., | |
5815 the original * applied to a group), save the information | |
5816 for that group and all inner ones, so that if we fail back | |
5817 to this point, the group's information will be correct. | |
5818 For example, in \(a*\)*\1, we need the preceding group, | |
5819 and in \(\(a*\)b*\)\2, we need the inner group. */ | |
5820 | |
5821 /* We can't use `p' to check ahead because we push | |
5822 a failure point to `p + mcnt' after we do this. */ | |
5823 p1 = p; | |
5824 | |
5825 /* We need to skip no_op's before we look for the | |
5826 start_memory in case this on_failure_jump is happening as | |
5827 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1 | |
5828 against aba. */ | |
5829 while (p1 < pend && (re_opcode_t) *p1 == no_op) | |
5830 p1++; | |
5831 | |
5832 if (p1 < pend && (re_opcode_t) *p1 == start_memory) | |
5833 { | |
5834 /* We have a new highest active register now. This will | |
5835 get reset at the start_memory we are about to get to, | |
5836 but we will have saved all the registers relevant to | |
5837 this repetition op, as described above. */ | |
5838 highest_active_reg = *(p1 + 1) + *(p1 + 2); | |
5839 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) | |
5840 lowest_active_reg = *(p1 + 1); | |
5841 } | |
5842 | |
5041 | 5843 DEBUG_MATCH_PRINT1 (":\n"); |
428 | 5844 PUSH_FAILURE_POINT (p + mcnt, d, -2); |
5845 break; | |
5846 | |
5847 | |
5848 /* A smart repeat ends with `maybe_pop_jump'. | |
5849 We change it to either `pop_failure_jump' or `jump'. */ | |
5850 case maybe_pop_jump: | |
5851 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
5041 | 5852 DEBUG_MATCH_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); |
428 | 5853 { |
5854 REGISTER unsigned char *p2 = p; | |
5855 | |
5856 /* Compare the beginning of the repeat with what in the | |
5857 pattern follows its end. If we can establish that there | |
5858 is nothing that they would both match, i.e., that we | |
5859 would never have to backtrack (unlike with, e.g., `a*a'), | |
5860 then we can change to pop_failure_jump, because we'll | |
5861 never have to backtrack. | |
5862 | |
5863 This is not true in the case of alternatives: in | |
5864 `(a|ab)*' we do need to backtrack to the `ab' alternative | |
5865 (e.g., if the string was `ab'). But instead of trying to | |
5866 detect that here, the alternative has put on a dummy | |
5867 failure point which is what we will end up popping. */ | |
5868 | |
5869 /* Skip over open/close-group commands. | |
5870 If what follows this loop is a ...+ construct, | |
5871 look at what begins its body, since we will have to | |
5872 match at least one of that. */ | |
5873 while (1) | |
5874 { | |
5875 if (p2 + 2 < pend | |
5876 && ((re_opcode_t) *p2 == stop_memory | |
5877 || (re_opcode_t) *p2 == start_memory)) | |
5878 p2 += 3; | |
5879 else if (p2 + 6 < pend | |
5880 && (re_opcode_t) *p2 == dummy_failure_jump) | |
5881 p2 += 6; | |
5882 else | |
5883 break; | |
5884 } | |
5885 | |
5886 p1 = p + mcnt; | |
5887 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding | |
5888 to the `maybe_pop_jump' of this case. Examine what | |
5889 follows. */ | |
5890 | |
5891 /* If we're at the end of the pattern, we can change. */ | |
5892 if (p2 == pend) | |
5893 { | |
5894 /* Consider what happens when matching ":\(.*\)" | |
5895 against ":/". I don't really understand this code | |
5896 yet. */ | |
5897 p[-3] = (unsigned char) pop_failure_jump; | |
5041 | 5898 DEBUG_MATCH_PRINT1 |
428 | 5899 (" End of pattern: change to `pop_failure_jump'.\n"); |
5900 } | |
5901 | |
5902 else if ((re_opcode_t) *p2 == exactn | |
5903 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) | |
5904 { | |
5905 REGISTER unsigned char c | |
5906 = *p2 == (unsigned char) endline ? '\n' : p2[2]; | |
5907 | |
5908 if ((re_opcode_t) p1[3] == exactn && p1[5] != c) | |
5909 { | |
5910 p[-3] = (unsigned char) pop_failure_jump; | |
5041 | 5911 DEBUG_MATCH_PRINT3 (" %c != %c => pop_failure_jump.\n", |
428 | 5912 c, p1[5]); |
5913 } | |
5914 | |
5915 else if ((re_opcode_t) p1[3] == charset | |
5916 || (re_opcode_t) p1[3] == charset_not) | |
5917 { | |
458 | 5918 int not_p = (re_opcode_t) p1[3] == charset_not; |
428 | 5919 |
5920 if (c < (unsigned char) (p1[4] * BYTEWIDTH) | |
5921 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) | |
458 | 5922 not_p = !not_p; |
5923 | |
5924 /* `not_p' is equal to 1 if c would match, which means | |
428 | 5925 that we can't change to pop_failure_jump. */ |
458 | 5926 if (!not_p) |
428 | 5927 { |
5928 p[-3] = (unsigned char) pop_failure_jump; | |
5041 | 5929 DEBUG_MATCH_PRINT1 (" No match => pop_failure_jump.\n"); |
428 | 5930 } |
5931 } | |
5932 } | |
5933 else if ((re_opcode_t) *p2 == charset) | |
5934 { | |
5935 #ifdef DEBUG | |
5936 REGISTER unsigned char c | |
5937 = *p2 == (unsigned char) endline ? '\n' : p2[2]; | |
5938 #endif | |
5939 | |
5940 if ((re_opcode_t) p1[3] == exactn | |
5941 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5] | |
5942 && (p2[2 + p1[5] / BYTEWIDTH] | |
5943 & (1 << (p1[5] % BYTEWIDTH))))) | |
5944 { | |
5945 p[-3] = (unsigned char) pop_failure_jump; | |
5041 | 5946 DEBUG_MATCH_PRINT3 (" %c != %c => pop_failure_jump.\n", |
428 | 5947 c, p1[5]); |
5948 } | |
5949 | |
5950 else if ((re_opcode_t) p1[3] == charset_not) | |
5951 { | |
5952 int idx; | |
5953 /* We win if the charset_not inside the loop | |
5954 lists every character listed in the charset after. */ | |
5955 for (idx = 0; idx < (int) p2[1]; idx++) | |
5956 if (! (p2[2 + idx] == 0 | |
5957 || (idx < (int) p1[4] | |
5958 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0)))) | |
5959 break; | |
5960 | |
5961 if (idx == p2[1]) | |
5962 { | |
5963 p[-3] = (unsigned char) pop_failure_jump; | |
5041 | 5964 DEBUG_MATCH_PRINT1 (" No match => pop_failure_jump.\n"); |
428 | 5965 } |
5966 } | |
5967 else if ((re_opcode_t) p1[3] == charset) | |
5968 { | |
5969 int idx; | |
5970 /* We win if the charset inside the loop | |
5971 has no overlap with the one after the loop. */ | |
5972 for (idx = 0; | |
5973 idx < (int) p2[1] && idx < (int) p1[4]; | |
5974 idx++) | |
5975 if ((p2[2 + idx] & p1[5 + idx]) != 0) | |
5976 break; | |
5977 | |
5978 if (idx == p2[1] || idx == p1[4]) | |
5979 { | |
5980 p[-3] = (unsigned char) pop_failure_jump; | |
5041 | 5981 DEBUG_MATCH_PRINT1 (" No match => pop_failure_jump.\n"); |
428 | 5982 } |
5983 } | |
5984 } | |
5985 } | |
5986 p -= 2; /* Point at relative address again. */ | |
5987 if ((re_opcode_t) p[-1] != pop_failure_jump) | |
5988 { | |
5989 p[-1] = (unsigned char) jump; | |
5041 | 5990 DEBUG_MATCH_PRINT1 (" Match => jump.\n"); |
428 | 5991 goto unconditional_jump; |
5992 } | |
5993 /* Note fall through. */ | |
5994 | |
5995 | |
5996 /* The end of a simple repeat has a pop_failure_jump back to | |
5997 its matching on_failure_jump, where the latter will push a | |
5998 failure point. The pop_failure_jump takes off failure | |
5999 points put on by this pop_failure_jump's matching | |
6000 on_failure_jump; we got through the pattern to here from the | |
6001 matching on_failure_jump, so didn't fail. */ | |
6002 case pop_failure_jump: | |
6003 { | |
6004 /* We need to pass separate storage for the lowest and | |
6005 highest registers, even though we don't care about the | |
6006 actual values. Otherwise, we will restore only one | |
6007 register from the stack, since lowest will == highest in | |
6008 `pop_failure_point'. */ | |
647 | 6009 int dummy_low_reg, dummy_high_reg; |
428 | 6010 unsigned char *pdummy; |
446 | 6011 re_char *sdummy = NULL; |
428 | 6012 |
5041 | 6013 DEBUG_MATCH_PRINT1 ("EXECUTING pop_failure_jump.\n"); |
428 | 6014 POP_FAILURE_POINT (sdummy, pdummy, |
6015 dummy_low_reg, dummy_high_reg, | |
6016 reg_dummy, reg_dummy, reg_info_dummy); | |
6017 } | |
6018 /* Note fall through. */ | |
6019 | |
6020 | |
6021 /* Unconditionally jump (without popping any failure points). */ | |
6022 case jump: | |
6023 unconditional_jump: | |
6024 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */ | |
5041 | 6025 DEBUG_MATCH_PRINT2 ("EXECUTING jump %d ", mcnt); |
428 | 6026 p += mcnt; /* Do the jump. */ |
5041 | 6027 DEBUG_MATCH_PRINT2 ("(to 0x%lx).\n", (long) p); |
428 | 6028 break; |
6029 | |
6030 | |
6031 /* We need this opcode so we can detect where alternatives end | |
6032 in `group_match_null_string_p' et al. */ | |
6033 case jump_past_alt: | |
5041 | 6034 DEBUG_MATCH_PRINT1 ("EXECUTING jump_past_alt.\n"); |
428 | 6035 goto unconditional_jump; |
6036 | |
6037 | |
6038 /* Normally, the on_failure_jump pushes a failure point, which | |
6039 then gets popped at pop_failure_jump. We will end up at | |
6040 pop_failure_jump, also, and with a pattern of, say, `a+', we | |
6041 are skipping over the on_failure_jump, so we have to push | |
6042 something meaningless for pop_failure_jump to pop. */ | |
6043 case dummy_failure_jump: | |
5041 | 6044 DEBUG_MATCH_PRINT1 ("EXECUTING dummy_failure_jump.\n"); |
428 | 6045 /* It doesn't matter what we push for the string here. What |
6046 the code at `fail' tests is the value for the pattern. */ | |
446 | 6047 PUSH_FAILURE_POINT ((unsigned char *) 0, (unsigned char *) 0, -2); |
428 | 6048 goto unconditional_jump; |
6049 | |
6050 | |
6051 /* At the end of an alternative, we need to push a dummy failure | |
6052 point in case we are followed by a `pop_failure_jump', because | |
6053 we don't want the failure point for the alternative to be | |
6054 popped. For example, matching `(a|ab)*' against `aab' | |
6055 requires that we match the `ab' alternative. */ | |
6056 case push_dummy_failure: | |
5041 | 6057 DEBUG_MATCH_PRINT1 ("EXECUTING push_dummy_failure.\n"); |
428 | 6058 /* See comments just above at `dummy_failure_jump' about the |
6059 two zeroes. */ | |
446 | 6060 PUSH_FAILURE_POINT ((unsigned char *) 0, (unsigned char *) 0, -2); |
428 | 6061 break; |
6062 | |
6063 /* Have to succeed matching what follows at least n times. | |
6064 After that, handle like `on_failure_jump'. */ | |
6065 case succeed_n: | |
6066 EXTRACT_NUMBER (mcnt, p + 2); | |
5041 | 6067 DEBUG_MATCH_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt); |
428 | 6068 |
6069 assert (mcnt >= 0); | |
6070 /* Originally, this is how many times we HAVE to succeed. */ | |
6071 if (mcnt > 0) | |
6072 { | |
6073 mcnt--; | |
6074 p += 2; | |
6075 STORE_NUMBER_AND_INCR (p, mcnt); | |
5041 | 6076 DEBUG_MATCH_PRINT3 (" Setting 0x%lx to %d.\n", (long) p, mcnt); |
428 | 6077 } |
6078 else if (mcnt == 0) | |
6079 { | |
5041 | 6080 DEBUG_MATCH_PRINT2 (" Setting two bytes from 0x%lx to no_op.\n", |
428 | 6081 (long) (p+2)); |
6082 p[2] = (unsigned char) no_op; | |
6083 p[3] = (unsigned char) no_op; | |
6084 goto on_failure; | |
6085 } | |
6086 break; | |
6087 | |
6088 case jump_n: | |
6089 EXTRACT_NUMBER (mcnt, p + 2); | |
5041 | 6090 DEBUG_MATCH_PRINT2 ("EXECUTING jump_n %d.\n", mcnt); |
428 | 6091 |
6092 /* Originally, this is how many times we CAN jump. */ | |
6093 if (mcnt) | |
6094 { | |
6095 mcnt--; | |
6096 STORE_NUMBER (p + 2, mcnt); | |
6097 goto unconditional_jump; | |
6098 } | |
6099 /* If don't have to jump any more, skip over the rest of command. */ | |
6100 else | |
6101 p += 4; | |
6102 break; | |
6103 | |
6104 case set_number_at: | |
6105 { | |
5041 | 6106 DEBUG_MATCH_PRINT1 ("EXECUTING set_number_at.\n"); |
428 | 6107 |
6108 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
6109 p1 = p + mcnt; | |
6110 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
5041 | 6111 DEBUG_MATCH_PRINT3 (" Setting 0x%lx to %d.\n", (long) p1, mcnt); |
428 | 6112 STORE_NUMBER (p1, mcnt); |
6113 break; | |
6114 } | |
6115 | |
6116 case wordbound: | |
5041 | 6117 DEBUG_MATCH_PRINT1 ("EXECUTING wordbound.\n"); |
428 | 6118 should_succeed = 1; |
6119 matchwordbound: | |
6120 { | |
6121 /* XEmacs change */ | |
1377 | 6122 /* Straightforward and (I hope) correct implementation. |
6123 Probably should be optimized by arranging to compute | |
1497 | 6124 charpos only once. */ |
1377 | 6125 /* emch1 is the character before d, syn1 is the syntax of |
6126 emch1, emch2 is the character at d, and syn2 is the | |
6127 syntax of emch2. */ | |
6128 Ichar emch1, emch2; | |
1468 | 6129 int syn1 = 0, |
6130 syn2 = 0; | |
1377 | 6131 re_char *d_before, *d_after; |
6132 int result, | |
6133 at_beg = AT_STRINGS_BEG (d), | |
6134 at_end = AT_STRINGS_END (d); | |
6135 #ifdef emacs | |
1497 | 6136 Charxpos charpos; |
1377 | 6137 #endif |
6138 | |
6139 if (at_beg && at_end) | |
6140 { | |
6141 result = 0; | |
6142 } | |
428 | 6143 else |
6144 { | |
1377 | 6145 if (!at_beg) |
6146 { | |
6147 d_before = POS_BEFORE_GAP_UNSAFE (d); | |
6148 DEC_IBYTEPTR_FMT (d_before, fmt); | |
6149 emch1 = itext_ichar_fmt (d_before, fmt, lispobj); | |
460 | 6150 #ifdef emacs |
1497 | 6151 charpos = offset_to_charxpos (lispobj, |
6152 PTR_TO_OFFSET (d)) - 1; | |
1377 | 6153 BEGIN_REGEX_MALLOC_OK (); |
1497 | 6154 UPDATE_SYNTAX_CACHE (scache, charpos); |
460 | 6155 #endif |
1377 | 6156 syn1 = SYNTAX_FROM_CACHE (scache, emch1); |
6157 END_REGEX_MALLOC_OK (); | |
6158 } | |
6159 if (!at_end) | |
6160 { | |
6161 d_after = POS_AFTER_GAP_UNSAFE (d); | |
6162 emch2 = itext_ichar_fmt (d_after, fmt, lispobj); | |
460 | 6163 #ifdef emacs |
1497 | 6164 charpos = offset_to_charxpos (lispobj, PTR_TO_OFFSET (d)); |
1377 | 6165 BEGIN_REGEX_MALLOC_OK (); |
1497 | 6166 UPDATE_SYNTAX_CACHE_FORWARD (scache, charpos); |
460 | 6167 #endif |
1377 | 6168 syn2 = SYNTAX_FROM_CACHE (scache, emch2); |
6169 END_REGEX_MALLOC_OK (); | |
6170 } | |
1333 | 6171 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); |
1377 | 6172 |
6173 if (at_beg) | |
6174 result = (syn2 == Sword); | |
6175 else if (at_end) | |
6176 result = (syn1 == Sword); | |
6177 else | |
6178 result = ((syn1 == Sword) != (syn2 == Sword)); | |
428 | 6179 } |
1377 | 6180 |
428 | 6181 if (result == should_succeed) |
6182 break; | |
6183 goto fail; | |
6184 } | |
6185 | |
6186 case notwordbound: | |
5041 | 6187 DEBUG_MATCH_PRINT1 ("EXECUTING notwordbound.\n"); |
428 | 6188 should_succeed = 0; |
6189 goto matchwordbound; | |
6190 | |
6191 case wordbeg: | |
5041 | 6192 DEBUG_MATCH_PRINT1 ("EXECUTING wordbeg.\n"); |
460 | 6193 if (AT_STRINGS_END (d)) |
6194 goto fail; | |
428 | 6195 { |
6196 /* XEmacs: this originally read: | |
6197 | |
6198 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1))) | |
6199 break; | |
6200 | |
6201 */ | |
460 | 6202 re_char *dtmp = POS_AFTER_GAP_UNSAFE (d); |
867 | 6203 Ichar emch = itext_ichar_fmt (dtmp, fmt, lispobj); |
1333 | 6204 int tempres; |
1347 | 6205 #ifdef emacs |
6206 Charxpos charpos = offset_to_charxpos (lispobj, PTR_TO_OFFSET (d)); | |
6207 #endif | |
1333 | 6208 BEGIN_REGEX_MALLOC_OK (); |
460 | 6209 #ifdef emacs |
826 | 6210 UPDATE_SYNTAX_CACHE (scache, charpos); |
460 | 6211 #endif |
1333 | 6212 tempres = (SYNTAX_FROM_CACHE (scache, emch) != Sword); |
6213 END_REGEX_MALLOC_OK (); | |
6214 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
6215 if (tempres) | |
428 | 6216 goto fail; |
6217 if (AT_STRINGS_BEG (d)) | |
6218 break; | |
460 | 6219 dtmp = POS_BEFORE_GAP_UNSAFE (d); |
867 | 6220 DEC_IBYTEPTR_FMT (dtmp, fmt); |
6221 emch = itext_ichar_fmt (dtmp, fmt, lispobj); | |
1333 | 6222 BEGIN_REGEX_MALLOC_OK (); |
460 | 6223 #ifdef emacs |
826 | 6224 UPDATE_SYNTAX_CACHE_BACKWARD (scache, charpos - 1); |
460 | 6225 #endif |
1333 | 6226 tempres = (SYNTAX_FROM_CACHE (scache, emch) != Sword); |
6227 END_REGEX_MALLOC_OK (); | |
6228 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
6229 if (tempres) | |
428 | 6230 break; |
6231 goto fail; | |
6232 } | |
6233 | |
6234 case wordend: | |
5041 | 6235 DEBUG_MATCH_PRINT1 ("EXECUTING wordend.\n"); |
460 | 6236 if (AT_STRINGS_BEG (d)) |
6237 goto fail; | |
428 | 6238 { |
6239 /* XEmacs: this originally read: | |
6240 | |
6241 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1) | |
6242 && (!WORDCHAR_P (d) || AT_STRINGS_END (d))) | |
6243 break; | |
6244 | |
6245 The or condition is incorrect (reversed). | |
6246 */ | |
460 | 6247 re_char *dtmp; |
867 | 6248 Ichar emch; |
1333 | 6249 int tempres; |
460 | 6250 #ifdef emacs |
826 | 6251 Charxpos charpos = offset_to_charxpos (lispobj, PTR_TO_OFFSET (d)); |
1347 | 6252 BEGIN_REGEX_MALLOC_OK (); |
826 | 6253 UPDATE_SYNTAX_CACHE (scache, charpos); |
1333 | 6254 END_REGEX_MALLOC_OK (); |
6255 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
1347 | 6256 #endif |
460 | 6257 dtmp = POS_BEFORE_GAP_UNSAFE (d); |
867 | 6258 DEC_IBYTEPTR_FMT (dtmp, fmt); |
6259 emch = itext_ichar_fmt (dtmp, fmt, lispobj); | |
1333 | 6260 BEGIN_REGEX_MALLOC_OK (); |
6261 tempres = (SYNTAX_FROM_CACHE (scache, emch) != Sword); | |
6262 END_REGEX_MALLOC_OK (); | |
6263 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
6264 if (tempres) | |
428 | 6265 goto fail; |
6266 if (AT_STRINGS_END (d)) | |
6267 break; | |
460 | 6268 dtmp = POS_AFTER_GAP_UNSAFE (d); |
867 | 6269 emch = itext_ichar_fmt (dtmp, fmt, lispobj); |
1333 | 6270 BEGIN_REGEX_MALLOC_OK (); |
460 | 6271 #ifdef emacs |
826 | 6272 UPDATE_SYNTAX_CACHE_FORWARD (scache, charpos + 1); |
460 | 6273 #endif |
1333 | 6274 tempres = (SYNTAX_FROM_CACHE (scache, emch) != Sword); |
6275 END_REGEX_MALLOC_OK (); | |
6276 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
6277 if (tempres) | |
428 | 6278 break; |
6279 goto fail; | |
6280 } | |
6281 | |
6282 #ifdef emacs | |
6283 case before_dot: | |
5041 | 6284 DEBUG_MATCH_PRINT1 ("EXECUTING before_dot.\n"); |
826 | 6285 if (!BUFFERP (lispobj) |
6286 || (BUF_PTR_BYTE_POS (XBUFFER (lispobj), (unsigned char *) d) | |
6287 >= BUF_PT (XBUFFER (lispobj)))) | |
428 | 6288 goto fail; |
6289 break; | |
6290 | |
6291 case at_dot: | |
5041 | 6292 DEBUG_MATCH_PRINT1 ("EXECUTING at_dot.\n"); |
826 | 6293 if (!BUFFERP (lispobj) |
6294 || (BUF_PTR_BYTE_POS (XBUFFER (lispobj), (unsigned char *) d) | |
6295 != BUF_PT (XBUFFER (lispobj)))) | |
428 | 6296 goto fail; |
6297 break; | |
6298 | |
6299 case after_dot: | |
5041 | 6300 DEBUG_MATCH_PRINT1 ("EXECUTING after_dot.\n"); |
826 | 6301 if (!BUFFERP (lispobj) |
6302 || (BUF_PTR_BYTE_POS (XBUFFER (lispobj), (unsigned char *) d) | |
6303 <= BUF_PT (XBUFFER (lispobj)))) | |
428 | 6304 goto fail; |
6305 break; | |
6306 | |
6307 case syntaxspec: | |
5041 | 6308 DEBUG_MATCH_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt); |
428 | 6309 mcnt = *p++; |
6310 goto matchsyntax; | |
6311 | |
6312 case wordchar: | |
5041 | 6313 DEBUG_MATCH_PRINT1 ("EXECUTING Emacs wordchar.\n"); |
428 | 6314 mcnt = (int) Sword; |
6315 matchsyntax: | |
6316 should_succeed = 1; | |
6317 matchornotsyntax: | |
6318 { | |
6319 int matches; | |
867 | 6320 Ichar emch; |
428 | 6321 |
450 | 6322 REGEX_PREFETCH (); |
1333 | 6323 BEGIN_REGEX_MALLOC_OK (); |
826 | 6324 UPDATE_SYNTAX_CACHE |
6325 (scache, offset_to_charxpos (lispobj, PTR_TO_OFFSET (d))); | |
1333 | 6326 END_REGEX_MALLOC_OK (); |
6327 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
826 | 6328 |
867 | 6329 emch = itext_ichar_fmt (d, fmt, lispobj); |
1333 | 6330 BEGIN_REGEX_MALLOC_OK (); |
826 | 6331 matches = (SYNTAX_FROM_CACHE (scache, emch) == |
6332 (enum syntaxcode) mcnt); | |
1333 | 6333 END_REGEX_MALLOC_OK (); |
6334 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
867 | 6335 INC_IBYTEPTR_FMT (d, fmt); |
428 | 6336 if (matches != should_succeed) |
6337 goto fail; | |
6338 SET_REGS_MATCHED (); | |
6339 } | |
6340 break; | |
6341 | |
6342 case notsyntaxspec: | |
5041 | 6343 DEBUG_MATCH_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt); |
428 | 6344 mcnt = *p++; |
6345 goto matchnotsyntax; | |
6346 | |
6347 case notwordchar: | |
5041 | 6348 DEBUG_MATCH_PRINT1 ("EXECUTING Emacs notwordchar.\n"); |
428 | 6349 mcnt = (int) Sword; |
6350 matchnotsyntax: | |
6351 should_succeed = 0; | |
6352 goto matchornotsyntax; | |
6353 | |
6354 #ifdef MULE | |
6355 /* 97/2/17 jhod Mule category code patch */ | |
6356 case categoryspec: | |
6357 should_succeed = 1; | |
6358 matchornotcategory: | |
6359 { | |
867 | 6360 Ichar emch; |
428 | 6361 |
6362 mcnt = *p++; | |
450 | 6363 REGEX_PREFETCH (); |
867 | 6364 emch = itext_ichar_fmt (d, fmt, lispobj); |
6365 INC_IBYTEPTR_FMT (d, fmt); | |
826 | 6366 if (check_category_char (emch, BUFFER_CATEGORY_TABLE (lispbuf), |
6367 mcnt, should_succeed)) | |
428 | 6368 goto fail; |
6369 SET_REGS_MATCHED (); | |
6370 } | |
6371 break; | |
6372 | |
6373 case notcategoryspec: | |
6374 should_succeed = 0; | |
6375 goto matchornotcategory; | |
6376 /* end of category patch */ | |
6377 #endif /* MULE */ | |
6378 #else /* not emacs */ | |
6379 case wordchar: | |
5041 | 6380 DEBUG_MATCH_PRINT1 ("EXECUTING non-Emacs wordchar.\n"); |
450 | 6381 REGEX_PREFETCH (); |
826 | 6382 if (!WORDCHAR_P ((int) (*d))) |
428 | 6383 goto fail; |
6384 SET_REGS_MATCHED (); | |
6385 d++; | |
6386 break; | |
6387 | |
6388 case notwordchar: | |
5041 | 6389 DEBUG_MATCH_PRINT1 ("EXECUTING non-Emacs notwordchar.\n"); |
450 | 6390 REGEX_PREFETCH (); |
826 | 6391 if (!WORDCHAR_P ((int) (*d))) |
428 | 6392 goto fail; |
6393 SET_REGS_MATCHED (); | |
6394 d++; | |
6395 break; | |
446 | 6396 #endif /* emacs */ |
428 | 6397 |
6398 default: | |
2500 | 6399 ABORT (); |
428 | 6400 } |
6401 continue; /* Successfully executed one pattern command; keep going. */ | |
6402 | |
6403 | |
6404 /* We goto here if a matching operation fails. */ | |
6405 fail: | |
6406 if (!FAIL_STACK_EMPTY ()) | |
6407 { /* A restart point is known. Restore to that state. */ | |
5041 | 6408 DEBUG_MATCH_PRINT1 ("\nFAIL:\n"); |
428 | 6409 POP_FAILURE_POINT (d, p, |
6410 lowest_active_reg, highest_active_reg, | |
6411 regstart, regend, reg_info); | |
6412 | |
6413 /* If this failure point is a dummy, try the next one. */ | |
6414 if (!p) | |
6415 goto fail; | |
6416 | |
6417 /* If we failed to the end of the pattern, don't examine *p. */ | |
6418 assert (p <= pend); | |
6419 if (p < pend) | |
6420 { | |
460 | 6421 re_bool is_a_jump_n = false; |
428 | 6422 |
6423 /* If failed to a backwards jump that's part of a repetition | |
6424 loop, need to pop this failure point and use the next one. */ | |
6425 switch ((re_opcode_t) *p) | |
6426 { | |
6427 case jump_n: | |
6428 is_a_jump_n = true; | |
6429 case maybe_pop_jump: | |
6430 case pop_failure_jump: | |
6431 case jump: | |
6432 p1 = p + 1; | |
6433 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
6434 p1 += mcnt; | |
6435 | |
6436 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n) | |
6437 || (!is_a_jump_n | |
6438 && (re_opcode_t) *p1 == on_failure_jump)) | |
6439 goto fail; | |
6440 break; | |
6441 default: | |
6442 /* do nothing */ ; | |
6443 } | |
6444 } | |
6445 | |
6446 if (d >= string1 && d <= end1) | |
6447 dend = end_match_1; | |
6448 } | |
6449 else | |
6450 break; /* Matching at this starting point really fails. */ | |
6451 } /* for (;;) */ | |
6452 | |
6453 if (best_regs_set) | |
6454 goto restore_best_regs; | |
6455 | |
6456 FREE_VARIABLES (); | |
6457 | |
6458 return -1; /* Failure to match. */ | |
1333 | 6459 } /* re_match_2_internal */ |
428 | 6460 |
6461 /* Subroutine definitions for re_match_2. */ | |
6462 | |
6463 | |
6464 /* We are passed P pointing to a register number after a start_memory. | |
6465 | |
6466 Return true if the pattern up to the corresponding stop_memory can | |
6467 match the empty string, and false otherwise. | |
6468 | |
6469 If we find the matching stop_memory, sets P to point to one past its number. | |
6470 Otherwise, sets P to an undefined byte less than or equal to END. | |
6471 | |
6472 We don't handle duplicates properly (yet). */ | |
6473 | |
460 | 6474 static re_bool |
428 | 6475 group_match_null_string_p (unsigned char **p, unsigned char *end, |
6476 register_info_type *reg_info) | |
6477 { | |
6478 int mcnt; | |
6479 /* Point to after the args to the start_memory. */ | |
6480 unsigned char *p1 = *p + 2; | |
6481 | |
6482 while (p1 < end) | |
6483 { | |
6484 /* Skip over opcodes that can match nothing, and return true or | |
6485 false, as appropriate, when we get to one that can't, or to the | |
6486 matching stop_memory. */ | |
6487 | |
6488 switch ((re_opcode_t) *p1) | |
6489 { | |
6490 /* Could be either a loop or a series of alternatives. */ | |
6491 case on_failure_jump: | |
6492 p1++; | |
6493 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
6494 | |
6495 /* If the next operation is not a jump backwards in the | |
6496 pattern. */ | |
6497 | |
6498 if (mcnt >= 0) | |
6499 { | |
6500 /* Go through the on_failure_jumps of the alternatives, | |
6501 seeing if any of the alternatives cannot match nothing. | |
6502 The last alternative starts with only a jump, | |
6503 whereas the rest start with on_failure_jump and end | |
6504 with a jump, e.g., here is the pattern for `a|b|c': | |
6505 | |
6506 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6 | |
6507 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3 | |
6508 /exactn/1/c | |
6509 | |
6510 So, we have to first go through the first (n-1) | |
6511 alternatives and then deal with the last one separately. */ | |
6512 | |
6513 | |
6514 /* Deal with the first (n-1) alternatives, which start | |
6515 with an on_failure_jump (see above) that jumps to right | |
6516 past a jump_past_alt. */ | |
6517 | |
6518 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt) | |
6519 { | |
6520 /* `mcnt' holds how many bytes long the alternative | |
6521 is, including the ending `jump_past_alt' and | |
6522 its number. */ | |
6523 | |
6524 if (!alt_match_null_string_p (p1, p1 + mcnt - 3, | |
6525 reg_info)) | |
6526 return false; | |
6527 | |
6528 /* Move to right after this alternative, including the | |
6529 jump_past_alt. */ | |
6530 p1 += mcnt; | |
6531 | |
6532 /* Break if it's the beginning of an n-th alternative | |
6533 that doesn't begin with an on_failure_jump. */ | |
6534 if ((re_opcode_t) *p1 != on_failure_jump) | |
6535 break; | |
6536 | |
6537 /* Still have to check that it's not an n-th | |
6538 alternative that starts with an on_failure_jump. */ | |
6539 p1++; | |
6540 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
6541 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt) | |
6542 { | |
6543 /* Get to the beginning of the n-th alternative. */ | |
6544 p1 -= 3; | |
6545 break; | |
6546 } | |
6547 } | |
6548 | |
6549 /* Deal with the last alternative: go back and get number | |
6550 of the `jump_past_alt' just before it. `mcnt' contains | |
6551 the length of the alternative. */ | |
6552 EXTRACT_NUMBER (mcnt, p1 - 2); | |
6553 | |
6554 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info)) | |
6555 return false; | |
6556 | |
6557 p1 += mcnt; /* Get past the n-th alternative. */ | |
6558 } /* if mcnt > 0 */ | |
6559 break; | |
6560 | |
6561 | |
6562 case stop_memory: | |
6563 assert (p1[1] == **p); | |
6564 *p = p1 + 2; | |
6565 return true; | |
6566 | |
6567 | |
6568 default: | |
6569 if (!common_op_match_null_string_p (&p1, end, reg_info)) | |
6570 return false; | |
6571 } | |
6572 } /* while p1 < end */ | |
6573 | |
6574 return false; | |
6575 } /* group_match_null_string_p */ | |
6576 | |
6577 | |
6578 /* Similar to group_match_null_string_p, but doesn't deal with alternatives: | |
6579 It expects P to be the first byte of a single alternative and END one | |
6580 byte past the last. The alternative can contain groups. */ | |
6581 | |
460 | 6582 static re_bool |
428 | 6583 alt_match_null_string_p (unsigned char *p, unsigned char *end, |
6584 register_info_type *reg_info) | |
6585 { | |
6586 int mcnt; | |
6587 unsigned char *p1 = p; | |
6588 | |
6589 while (p1 < end) | |
6590 { | |
6591 /* Skip over opcodes that can match nothing, and break when we get | |
6592 to one that can't. */ | |
6593 | |
6594 switch ((re_opcode_t) *p1) | |
6595 { | |
6596 /* It's a loop. */ | |
6597 case on_failure_jump: | |
6598 p1++; | |
6599 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
6600 p1 += mcnt; | |
6601 break; | |
6602 | |
6603 default: | |
6604 if (!common_op_match_null_string_p (&p1, end, reg_info)) | |
6605 return false; | |
6606 } | |
6607 } /* while p1 < end */ | |
6608 | |
6609 return true; | |
6610 } /* alt_match_null_string_p */ | |
6611 | |
6612 | |
6613 /* Deals with the ops common to group_match_null_string_p and | |
6614 alt_match_null_string_p. | |
6615 | |
6616 Sets P to one after the op and its arguments, if any. */ | |
6617 | |
460 | 6618 static re_bool |
428 | 6619 common_op_match_null_string_p (unsigned char **p, unsigned char *end, |
6620 register_info_type *reg_info) | |
6621 { | |
6622 int mcnt; | |
460 | 6623 re_bool ret; |
428 | 6624 int reg_no; |
6625 unsigned char *p1 = *p; | |
6626 | |
6627 switch ((re_opcode_t) *p1++) | |
6628 { | |
6629 case no_op: | |
6630 case begline: | |
6631 case endline: | |
6632 case begbuf: | |
6633 case endbuf: | |
6634 case wordbeg: | |
6635 case wordend: | |
6636 case wordbound: | |
6637 case notwordbound: | |
6638 #ifdef emacs | |
6639 case before_dot: | |
6640 case at_dot: | |
6641 case after_dot: | |
6642 #endif | |
6643 break; | |
6644 | |
6645 case start_memory: | |
6646 reg_no = *p1; | |
6647 assert (reg_no > 0 && reg_no <= MAX_REGNUM); | |
6648 ret = group_match_null_string_p (&p1, end, reg_info); | |
6649 | |
6650 /* Have to set this here in case we're checking a group which | |
6651 contains a group and a back reference to it. */ | |
6652 | |
6653 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE) | |
6654 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret; | |
6655 | |
6656 if (!ret) | |
6657 return false; | |
6658 break; | |
6659 | |
6660 /* If this is an optimized succeed_n for zero times, make the jump. */ | |
6661 case jump: | |
6662 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
6663 if (mcnt >= 0) | |
6664 p1 += mcnt; | |
6665 else | |
6666 return false; | |
6667 break; | |
6668 | |
6669 case succeed_n: | |
6670 /* Get to the number of times to succeed. */ | |
6671 p1 += 2; | |
6672 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
6673 | |
6674 if (mcnt == 0) | |
6675 { | |
6676 p1 -= 4; | |
6677 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
6678 p1 += mcnt; | |
6679 } | |
6680 else | |
6681 return false; | |
6682 break; | |
6683 | |
6684 case duplicate: | |
6685 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1])) | |
6686 return false; | |
6687 break; | |
6688 | |
6689 case set_number_at: | |
6690 p1 += 4; | |
6691 | |
6692 default: | |
6693 /* All other opcodes mean we cannot match the empty string. */ | |
6694 return false; | |
6695 } | |
6696 | |
6697 *p = p1; | |
6698 return true; | |
6699 } /* common_op_match_null_string_p */ | |
6700 | |
6701 | |
6702 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN | |
6703 bytes; nonzero otherwise. */ | |
6704 | |
6705 static int | |
446 | 6706 bcmp_translate (re_char *s1, re_char *s2, |
826 | 6707 REGISTER int len, RE_TRANSLATE_TYPE translate |
6708 #ifdef emacs | |
2333 | 6709 , Internal_Format USED_IF_MULE (fmt), |
6710 Lisp_Object USED_IF_MULE (lispobj) | |
826 | 6711 #endif |
6712 ) | |
428 | 6713 { |
826 | 6714 REGISTER re_char *p1 = s1, *p2 = s2; |
446 | 6715 #ifdef MULE |
826 | 6716 re_char *p1_end = s1 + len; |
6717 re_char *p2_end = s2 + len; | |
446 | 6718 |
6719 while (p1 != p1_end && p2 != p2_end) | |
6720 { | |
867 | 6721 Ichar p1_ch, p2_ch; |
6722 | |
6723 p1_ch = itext_ichar_fmt (p1, fmt, lispobj); | |
6724 p2_ch = itext_ichar_fmt (p2, fmt, lispobj); | |
826 | 6725 |
6726 if (RE_TRANSLATE_1 (p1_ch) | |
6727 != RE_TRANSLATE_1 (p2_ch)) | |
446 | 6728 return 1; |
867 | 6729 INC_IBYTEPTR_FMT (p1, fmt); |
6730 INC_IBYTEPTR_FMT (p2, fmt); | |
446 | 6731 } |
6732 #else /* not MULE */ | |
428 | 6733 while (len) |
6734 { | |
826 | 6735 if (RE_TRANSLATE_1 (*p1++) != RE_TRANSLATE_1 (*p2++)) return 1; |
428 | 6736 len--; |
6737 } | |
446 | 6738 #endif /* MULE */ |
428 | 6739 return 0; |
6740 } | |
6741 | |
6742 /* Entry points for GNU code. */ | |
6743 | |
6744 /* re_compile_pattern is the GNU regular expression compiler: it | |
6745 compiles PATTERN (of length SIZE) and puts the result in BUFP. | |
6746 Returns 0 if the pattern was valid, otherwise an error string. | |
6747 | |
6748 Assumes the `allocated' (and perhaps `buffer') and `translate' fields | |
6749 are set in BUFP on entry. | |
6750 | |
6751 We call regex_compile to do the actual compilation. */ | |
6752 | |
442 | 6753 const char * |
6754 re_compile_pattern (const char *pattern, int length, | |
428 | 6755 struct re_pattern_buffer *bufp) |
6756 { | |
6757 reg_errcode_t ret; | |
6758 | |
6759 /* GNU code is written to assume at least RE_NREGS registers will be set | |
6760 (and at least one extra will be -1). */ | |
6761 bufp->regs_allocated = REGS_UNALLOCATED; | |
6762 | |
6763 /* And GNU code determines whether or not to get register information | |
6764 by passing null for the REGS argument to re_match, etc., not by | |
6765 setting no_sub. */ | |
6766 bufp->no_sub = 0; | |
6767 | |
6768 /* Match anchors at newline. */ | |
6769 bufp->newline_anchor = 1; | |
6770 | |
826 | 6771 ret = regex_compile ((unsigned char *) pattern, length, re_syntax_options, |
6772 bufp); | |
428 | 6773 |
6774 if (!ret) | |
6775 return NULL; | |
6776 return gettext (re_error_msgid[(int) ret]); | |
6777 } | |
6778 | |
6779 /* Entry points compatible with 4.2 BSD regex library. We don't define | |
6780 them unless specifically requested. */ | |
6781 | |
6782 #ifdef _REGEX_RE_COMP | |
6783 | |
6784 /* BSD has one and only one pattern buffer. */ | |
6785 static struct re_pattern_buffer re_comp_buf; | |
6786 | |
6787 char * | |
442 | 6788 re_comp (const char *s) |
428 | 6789 { |
6790 reg_errcode_t ret; | |
6791 | |
6792 if (!s) | |
6793 { | |
6794 if (!re_comp_buf.buffer) | |
6795 return gettext ("No previous regular expression"); | |
6796 return 0; | |
6797 } | |
6798 | |
6799 if (!re_comp_buf.buffer) | |
6800 { | |
1333 | 6801 re_comp_buf.buffer = (unsigned char *) xmalloc (200); |
428 | 6802 if (re_comp_buf.buffer == NULL) |
6803 return gettext (re_error_msgid[(int) REG_ESPACE]); | |
6804 re_comp_buf.allocated = 200; | |
6805 | |
1333 | 6806 re_comp_buf.fastmap = (char *) xmalloc (1 << BYTEWIDTH); |
428 | 6807 if (re_comp_buf.fastmap == NULL) |
6808 return gettext (re_error_msgid[(int) REG_ESPACE]); | |
6809 } | |
6810 | |
6811 /* Since `re_exec' always passes NULL for the `regs' argument, we | |
6812 don't need to initialize the pattern buffer fields which affect it. */ | |
6813 | |
6814 /* Match anchors at newlines. */ | |
6815 re_comp_buf.newline_anchor = 1; | |
6816 | |
826 | 6817 ret = regex_compile ((unsigned char *)s, strlen (s), re_syntax_options, |
6818 &re_comp_buf); | |
428 | 6819 |
6820 if (!ret) | |
6821 return NULL; | |
6822 | |
442 | 6823 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ |
428 | 6824 return (char *) gettext (re_error_msgid[(int) ret]); |
6825 } | |
6826 | |
6827 | |
6828 int | |
442 | 6829 re_exec (const char *s) |
428 | 6830 { |
442 | 6831 const int len = strlen (s); |
428 | 6832 return |
6833 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0); | |
6834 } | |
6835 #endif /* _REGEX_RE_COMP */ | |
6836 | |
6837 /* POSIX.2 functions. Don't define these for Emacs. */ | |
6838 | |
6839 #ifndef emacs | |
6840 | |
6841 /* regcomp takes a regular expression as a string and compiles it. | |
6842 | |
6843 PREG is a regex_t *. We do not expect any fields to be initialized, | |
6844 since POSIX says we shouldn't. Thus, we set | |
6845 | |
6846 `buffer' to the compiled pattern; | |
6847 `used' to the length of the compiled pattern; | |
6848 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the | |
6849 REG_EXTENDED bit in CFLAGS is set; otherwise, to | |
6850 RE_SYNTAX_POSIX_BASIC; | |
6851 `newline_anchor' to REG_NEWLINE being set in CFLAGS; | |
6852 `fastmap' and `fastmap_accurate' to zero; | |
6853 `re_nsub' to the number of subexpressions in PATTERN. | |
502 | 6854 (non-shy of course. POSIX probably doesn't know about |
6855 shy ones, and in any case they should be invisible.) | |
428 | 6856 |
6857 PATTERN is the address of the pattern string. | |
6858 | |
6859 CFLAGS is a series of bits which affect compilation. | |
6860 | |
6861 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we | |
6862 use POSIX basic syntax. | |
6863 | |
6864 If REG_NEWLINE is set, then . and [^...] don't match newline. | |
6865 Also, regexec will try a match beginning after every newline. | |
6866 | |
6867 If REG_ICASE is set, then we considers upper- and lowercase | |
6868 versions of letters to be equivalent when matching. | |
6869 | |
6870 If REG_NOSUB is set, then when PREG is passed to regexec, that | |
6871 routine will report only success or failure, and nothing about the | |
6872 registers. | |
6873 | |
6874 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for | |
6875 the return codes and their meanings.) */ | |
6876 | |
6877 int | |
442 | 6878 regcomp (regex_t *preg, const char *pattern, int cflags) |
428 | 6879 { |
6880 reg_errcode_t ret; | |
647 | 6881 unsigned int syntax |
428 | 6882 = (cflags & REG_EXTENDED) ? |
6883 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC; | |
6884 | |
6885 /* regex_compile will allocate the space for the compiled pattern. */ | |
6886 preg->buffer = 0; | |
6887 preg->allocated = 0; | |
6888 preg->used = 0; | |
6889 | |
6890 /* Don't bother to use a fastmap when searching. This simplifies the | |
6891 REG_NEWLINE case: if we used a fastmap, we'd have to put all the | |
6892 characters after newlines into the fastmap. This way, we just try | |
6893 every character. */ | |
6894 preg->fastmap = 0; | |
6895 | |
6896 if (cflags & REG_ICASE) | |
6897 { | |
647 | 6898 int i; |
428 | 6899 |
1333 | 6900 preg->translate = (char *) xmalloc (CHAR_SET_SIZE); |
428 | 6901 if (preg->translate == NULL) |
6902 return (int) REG_ESPACE; | |
6903 | |
6904 /* Map uppercase characters to corresponding lowercase ones. */ | |
6905 for (i = 0; i < CHAR_SET_SIZE; i++) | |
6906 preg->translate[i] = ISUPPER (i) ? tolower (i) : i; | |
6907 } | |
6908 else | |
6909 preg->translate = NULL; | |
6910 | |
6911 /* If REG_NEWLINE is set, newlines are treated differently. */ | |
6912 if (cflags & REG_NEWLINE) | |
6913 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ | |
6914 syntax &= ~RE_DOT_NEWLINE; | |
6915 syntax |= RE_HAT_LISTS_NOT_NEWLINE; | |
6916 /* It also changes the matching behavior. */ | |
6917 preg->newline_anchor = 1; | |
6918 } | |
6919 else | |
6920 preg->newline_anchor = 0; | |
6921 | |
6922 preg->no_sub = !!(cflags & REG_NOSUB); | |
6923 | |
6924 /* POSIX says a null character in the pattern terminates it, so we | |
6925 can use strlen here in compiling the pattern. */ | |
446 | 6926 ret = regex_compile ((unsigned char *) pattern, strlen (pattern), syntax, preg); |
428 | 6927 |
6928 /* POSIX doesn't distinguish between an unmatched open-group and an | |
6929 unmatched close-group: both are REG_EPAREN. */ | |
6930 if (ret == REG_ERPAREN) ret = REG_EPAREN; | |
6931 | |
6932 return (int) ret; | |
6933 } | |
6934 | |
6935 | |
6936 /* regexec searches for a given pattern, specified by PREG, in the | |
6937 string STRING. | |
6938 | |
6939 If NMATCH is zero or REG_NOSUB was set in the cflags argument to | |
6940 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at | |
6941 least NMATCH elements, and we set them to the offsets of the | |
6942 corresponding matched substrings. | |
6943 | |
6944 EFLAGS specifies `execution flags' which affect matching: if | |
6945 REG_NOTBOL is set, then ^ does not match at the beginning of the | |
6946 string; if REG_NOTEOL is set, then $ does not match at the end. | |
6947 | |
6948 We return 0 if we find a match and REG_NOMATCH if not. */ | |
6949 | |
6950 int | |
442 | 6951 regexec (const regex_t *preg, const char *string, size_t nmatch, |
428 | 6952 regmatch_t pmatch[], int eflags) |
6953 { | |
6954 int ret; | |
6955 struct re_registers regs; | |
6956 regex_t private_preg; | |
6957 int len = strlen (string); | |
460 | 6958 re_bool want_reg_info = !preg->no_sub && nmatch > 0; |
428 | 6959 |
6960 private_preg = *preg; | |
6961 | |
6962 private_preg.not_bol = !!(eflags & REG_NOTBOL); | |
6963 private_preg.not_eol = !!(eflags & REG_NOTEOL); | |
6964 | |
6965 /* The user has told us exactly how many registers to return | |
6966 information about, via `nmatch'. We have to pass that on to the | |
6967 matching routines. */ | |
6968 private_preg.regs_allocated = REGS_FIXED; | |
6969 | |
6970 if (want_reg_info) | |
6971 { | |
647 | 6972 regs.num_regs = (int) nmatch; |
6973 regs.start = TALLOC ((int) nmatch, regoff_t); | |
6974 regs.end = TALLOC ((int) nmatch, regoff_t); | |
428 | 6975 if (regs.start == NULL || regs.end == NULL) |
6976 return (int) REG_NOMATCH; | |
6977 } | |
6978 | |
6979 /* Perform the searching operation. */ | |
6980 ret = re_search (&private_preg, string, len, | |
6981 /* start: */ 0, /* range: */ len, | |
6982 want_reg_info ? ®s : (struct re_registers *) 0); | |
6983 | |
6984 /* Copy the register information to the POSIX structure. */ | |
6985 if (want_reg_info) | |
6986 { | |
6987 if (ret >= 0) | |
6988 { | |
647 | 6989 int r; |
6990 | |
6991 for (r = 0; r < (int) nmatch; r++) | |
428 | 6992 { |
6993 pmatch[r].rm_so = regs.start[r]; | |
6994 pmatch[r].rm_eo = regs.end[r]; | |
6995 } | |
6996 } | |
6997 | |
6998 /* If we needed the temporary register info, free the space now. */ | |
4976
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4832
diff
changeset
|
6999 xfree (regs.start); |
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4832
diff
changeset
|
7000 xfree (regs.end); |
428 | 7001 } |
7002 | |
7003 /* We want zero return to mean success, unlike `re_search'. */ | |
7004 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH; | |
7005 } | |
7006 | |
7007 | |
7008 /* Returns a message corresponding to an error code, ERRCODE, returned | |
7009 from either regcomp or regexec. We don't use PREG here. */ | |
7010 | |
7011 size_t | |
2286 | 7012 regerror (int errcode, const regex_t *UNUSED (preg), char *errbuf, |
647 | 7013 size_t errbuf_size) |
428 | 7014 { |
442 | 7015 const char *msg; |
665 | 7016 Bytecount msg_size; |
428 | 7017 |
7018 if (errcode < 0 | |
647 | 7019 || errcode >= (int) (sizeof (re_error_msgid) / |
7020 sizeof (re_error_msgid[0]))) | |
428 | 7021 /* Only error codes returned by the rest of the code should be passed |
7022 to this routine. If we are given anything else, or if other regex | |
7023 code generates an invalid error code, then the program has a bug. | |
7024 Dump core so we can fix it. */ | |
2500 | 7025 ABORT (); |
428 | 7026 |
7027 msg = gettext (re_error_msgid[errcode]); | |
7028 | |
7029 msg_size = strlen (msg) + 1; /* Includes the null. */ | |
7030 | |
7031 if (errbuf_size != 0) | |
7032 { | |
665 | 7033 if (msg_size > (Bytecount) errbuf_size) |
428 | 7034 { |
7035 strncpy (errbuf, msg, errbuf_size - 1); | |
7036 errbuf[errbuf_size - 1] = 0; | |
7037 } | |
7038 else | |
7039 strcpy (errbuf, msg); | |
7040 } | |
7041 | |
647 | 7042 return (size_t) msg_size; |
428 | 7043 } |
7044 | |
7045 | |
7046 /* Free dynamically allocated space used by PREG. */ | |
7047 | |
7048 void | |
7049 regfree (regex_t *preg) | |
7050 { | |
7051 if (preg->buffer != NULL) | |
4976
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4832
diff
changeset
|
7052 xfree (preg->buffer); |
428 | 7053 preg->buffer = NULL; |
7054 | |
7055 preg->allocated = 0; | |
7056 preg->used = 0; | |
7057 | |
7058 if (preg->fastmap != NULL) | |
4976
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4832
diff
changeset
|
7059 xfree (preg->fastmap); |
428 | 7060 preg->fastmap = NULL; |
7061 preg->fastmap_accurate = 0; | |
7062 | |
7063 if (preg->translate != NULL) | |
4976
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4832
diff
changeset
|
7064 xfree (preg->translate); |
428 | 7065 preg->translate = NULL; |
7066 } | |
7067 | |
7068 #endif /* not emacs */ | |
7069 |