Mercurial > hg > xemacs-beta
annotate src/regex.c @ 4792:95b04754ea8c
Make #'equalp more compatible with CL; add a compiler macro, test & doc it.
lisp/ChangeLog addition:
2009-11-08 Aidan Kehoe <kehoea@parhasard.net>
* cl-extra.el (cl-string-vector-equalp)
(cl-bit-vector-vector-equalp, cl-vector-array-equalp)
(cl-hash-table-contents-equalp): New functions, to implement
equalp treating arrays with identical contents as equivalent, as
specified by Common Lisp.
(equalp): Revise this function to implement array equivalence,
and the hash-table equalp behaviour specified by CL.
* cl-macs.el (equalp): Add a compiler macro for this function,
used when one of the arguments is constant, and as such, its type
is known at compile time.
man/ChangeLog addition:
2009-11-08 Aidan Kehoe <kehoea@parhasard.net>
* lispref/objects.texi (Equality Predicates):
Document #'equalp here, as well as #'equal and #'eq.
tests/ChangeLog addition:
2009-12-31 Aidan Kehoe <kehoea@parhasard.net>
* automated/lisp-tests.el:
Test much of the functionality of equalp; add a pointer to Paul
Dietz' ANSI test suite for this function, converted to Emacs
Lisp. Not including the tests themselves in XEmacs because who
owns the copyright on the files is unclear and the GCL people
didn't respond to my queries.
author | Aidan Kehoe <kehoea@parhasard.net> |
---|---|
date | Thu, 31 Dec 2009 15:09:41 +0000 |
parents | aa5ed11f473b |
children | 07fa38c30fdf |
rev | line source |
---|---|
428 | 1 /* Extended regular expression matching and search library, |
2 version 0.12, extended for XEmacs. | |
3 (Implements POSIX draft P10003.2/D11.2, except for | |
4 internationalization features.) | |
5 | |
6 Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc. | |
7 Copyright (C) 1995 Sun Microsystems, Inc. | |
1333 | 8 Copyright (C) 1995, 2001, 2002, 2003 Ben Wing. |
428 | 9 |
10 This program is free software; you can redistribute it and/or modify | |
11 it under the terms of the GNU General Public License as published by | |
12 the Free Software Foundation; either version 2, or (at your option) | |
13 any later version. | |
14 | |
15 This program is distributed in the hope that it will be useful, | |
16 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
18 GNU General Public License for more details. | |
19 | |
20 You should have received a copy of the GNU General Public License | |
21 along with this program; see the file COPYING. If not, write to | |
22 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
23 Boston, MA 02111-1307, USA. */ | |
24 | |
25 /* Synched up with: FSF 19.29. */ | |
26 | |
27 #ifdef HAVE_CONFIG_H | |
28 #include <config.h> | |
29 #endif | |
30 | |
31 #ifndef _GNU_SOURCE | |
32 #define _GNU_SOURCE 1 | |
33 #endif | |
34 | |
35 /* We assume non-Mule if emacs isn't defined. */ | |
36 #ifndef emacs | |
37 #undef MULE | |
38 #endif | |
39 | |
771 | 40 /* XEmacs addition */ |
41 #ifdef REL_ALLOC | |
42 #define REGEX_REL_ALLOC /* may be undefined below */ | |
43 #endif | |
44 | |
428 | 45 /* XEmacs: define this to add in a speedup for patterns anchored at |
46 the beginning of a line. Keep the ifdefs so that it's easier to | |
47 tell where/why this code has diverged from v19. */ | |
48 #define REGEX_BEGLINE_CHECK | |
49 | |
50 /* XEmacs: the current mmap-based ralloc handles small blocks very | |
51 poorly, so we disable it here. */ | |
52 | |
771 | 53 #if defined (HAVE_MMAP) || defined (DOUG_LEA_MALLOC) |
54 # undef REGEX_REL_ALLOC | |
428 | 55 #endif |
56 | |
57 /* The `emacs' switch turns on certain matching commands | |
58 that make sense only in Emacs. */ | |
59 #ifdef emacs | |
60 | |
61 #include "lisp.h" | |
62 #include "buffer.h" | |
63 #include "syntax.h" | |
64 | |
65 #if (defined (DEBUG_XEMACS) && !defined (DEBUG)) | |
66 #define DEBUG | |
67 #endif | |
68 | |
867 | 69 #define RE_TRANSLATE_1(ch) TRT_TABLE_OF (translate, (Ichar) ch) |
446 | 70 #define TRANSLATE_P(tr) (!NILP (tr)) |
428 | 71 |
826 | 72 /* Converts the pointer to the char to BEG-based offset from the start. */ |
73 #define PTR_TO_OFFSET(d) (MATCHING_IN_FIRST_STRING \ | |
74 ? (d) - string1 : (d) - (string2 - size1)) | |
75 | |
428 | 76 #else /* not emacs */ |
77 | |
2367 | 78 #include <stdlib.h> |
79 #include <sys/types.h> | |
80 #include <stddef.h> /* needed for ptrdiff_t under Solaris */ | |
81 #include <string.h> | |
82 | |
2286 | 83 #include "compiler.h" /* Get compiler-specific definitions like UNUSED */ |
84 | |
2500 | 85 #define ABORT abort |
86 | |
428 | 87 /* If we are not linking with Emacs proper, |
88 we can't use the relocating allocator | |
89 even if config.h says that we can. */ | |
771 | 90 #undef REGEX_REL_ALLOC |
428 | 91 |
544 | 92 /* defined in lisp.h */ |
93 #ifdef REGEX_MALLOC | |
94 #ifndef DECLARE_NOTHING | |
95 #define DECLARE_NOTHING struct nosuchstruct | |
96 #endif | |
97 #endif | |
98 | |
867 | 99 #define itext_ichar(str) ((Ichar) (str)[0]) |
100 #define itext_ichar_fmt(str, fmt, object) ((Ichar) (str)[0]) | |
101 #define itext_ichar_ascii_fmt(str, fmt, object) ((Ichar) (str)[0]) | |
428 | 102 |
103 #if (LONGBITS > INTBITS) | |
104 # define EMACS_INT long | |
105 #else | |
106 # define EMACS_INT int | |
107 #endif | |
108 | |
867 | 109 typedef int Ichar; |
110 | |
111 #define INC_IBYTEPTR(p) ((p)++) | |
112 #define INC_IBYTEPTR_FMT(p, fmt) ((p)++) | |
113 #define DEC_IBYTEPTR(p) ((p)--) | |
114 #define DEC_IBYTEPTR_FMT(p, fmt) ((p)--) | |
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
115 #define MAX_ICHAR_LEN 1 |
867 | 116 #define itext_ichar_len(ptr) 1 |
117 #define itext_ichar_len_fmt(ptr, fmt) 1 | |
428 | 118 |
/* Syntax support for \<, \>, etc. when not built as part of Emacs.  */

/* Sword has to be nonzero for the wordchar and notwordchar pattern
   commands in re_match_2.  */
#ifndef Sword
#define Sword 1
#endif

#ifdef SYNTAX_TABLE

extern char *re_syntax_table;

#else /* not SYNTAX_TABLE */

/* Number of characters in the character set.  */
#define CHAR_SET_SIZE 256

static char re_syntax_table[CHAR_SET_SIZE];

/* Fill in re_syntax_table the first time this is called: every entry
   is cleared, then the entries for ASCII letters, digits and `_' are
   set to Sword.  Later calls do nothing.  */
static void
init_syntax_once (void)
{
  static int initialized = 0;
  const char *cp;

  if (initialized)
    return;

  memset (re_syntax_table, 0, sizeof (re_syntax_table));

  for (cp = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
       *cp != '\0';
       cp++)
    re_syntax_table[(unsigned int) *cp] = Sword;

  initialized = 1;
}

#endif /* SYNTAX_TABLE */
428 | 158 |
826 | 159 #define SYNTAX(ignored, c) re_syntax_table[c] |
460 | 160 #undef SYNTAX_FROM_CACHE |
826 | 161 #define SYNTAX_FROM_CACHE SYNTAX |
162 | |
163 #define RE_TRANSLATE_1(c) translate[(unsigned char) (c)] | |
446 | 164 #define TRANSLATE_P(tr) tr |
165 | |
166 #endif /* emacs */ | |
428 | 167 |
2201 | 168 /* This is for other GNU distributions with internationalized messages. */ |
169 #if defined (I18N3) && (defined (HAVE_LIBINTL_H) || defined (_LIBC)) | |
170 # include <libintl.h> | |
171 #else | |
172 # define gettext(msgid) (msgid) | |
173 #endif | |
174 | |
428 | 175 |
176 /* Get the interface, including the syntax bits. */ | |
177 #include "regex.h" | |
178 | |
179 /* isalpha etc. are used for the character classes. */ | |
180 #include <ctype.h> | |
181 | |
182 /* Jim Meyering writes: | |
183 | |
184 "... Some ctype macros are valid only for character codes that | |
185 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when | |
186 using /bin/cc or gcc but without giving an ansi option). So, all | |
187 ctype uses should be through macros like ISPRINT... If | |
188 STDC_HEADERS is defined, then autoconf has verified that the ctype | |
189 macros don't need to be guarded with references to isascii. ... | |
190 Defining isascii to 1 should let any compiler worth its salt | |
191 eliminate the && through constant folding." */ | |
192 | |
193 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) | |
194 #define ISASCII_1(c) 1 | |
195 #else | |
196 #define ISASCII_1(c) isascii(c) | |
197 #endif | |
198 | |
199 #ifdef MULE | |
200 /* The IS*() macros can be passed any character, including an extended | |
201 one. We need to make sure there are no crashes, which would occur | |
202 otherwise due to out-of-bounds array references. */ | |
203 #define ISASCII(c) (((EMACS_UINT) (c)) < 0x100 && ISASCII_1 (c)) | |
204 #else | |
205 #define ISASCII(c) ISASCII_1 (c) | |
206 #endif /* MULE */ | |
207 | |
208 #ifdef isblank | |
209 #define ISBLANK(c) (ISASCII (c) && isblank (c)) | |
210 #else | |
211 #define ISBLANK(c) ((c) == ' ' || (c) == '\t') | |
212 #endif | |
213 #ifdef isgraph | |
214 #define ISGRAPH(c) (ISASCII (c) && isgraph (c)) | |
215 #else | |
216 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c)) | |
217 #endif | |
218 | |
219 #define ISPRINT(c) (ISASCII (c) && isprint (c)) | |
220 #define ISDIGIT(c) (ISASCII (c) && isdigit (c)) | |
221 #define ISALNUM(c) (ISASCII (c) && isalnum (c)) | |
222 #define ISALPHA(c) (ISASCII (c) && isalpha (c)) | |
223 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c)) | |
224 #define ISLOWER(c) (ISASCII (c) && islower (c)) | |
225 #define ISPUNCT(c) (ISASCII (c) && ispunct (c)) | |
226 #define ISSPACE(c) (ISASCII (c) && isspace (c)) | |
227 #define ISUPPER(c) (ISASCII (c) && isupper (c)) | |
228 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c)) | |
229 | |
230 #ifndef NULL | |
231 #define NULL (void *)0 | |
232 #endif | |
233 | |
234 /* We remove any previous definition of `SIGN_EXTEND_CHAR', | |
235 since ours (we hope) works properly with all combinations of | |
236 machines, compilers, `char' and `unsigned char' argument types. | |
237 (Per Bothner suggested the basic approach.) */ | |
238 #undef SIGN_EXTEND_CHAR | |
239 #if __STDC__ | |
240 #define SIGN_EXTEND_CHAR(c) ((signed char) (c)) | |
241 #else /* not __STDC__ */ | |
242 /* As in Harbison and Steele. */ | |
243 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) | |
244 #endif | |
245 | |
246 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we | |
247 use `alloca' instead of `malloc'. This is because using malloc in | |
248 re_search* or re_match* could cause memory leaks when C-g is used in | |
249 Emacs; also, malloc is slower and causes storage fragmentation. On | |
250 the other hand, malloc is more portable, and easier to debug. | |
251 | |
252 Because we sometimes use alloca, some routines have to be macros, | |
253 not functions -- `alloca'-allocated space disappears at the end of the | |
254 function it is called in. */ | |
255 | |
1333 | 256 #ifndef emacs |
257 #define ALLOCA alloca | |
258 #define xmalloc malloc | |
259 #define xrealloc realloc | |
1726 | 260 #define xfree(x,type) free (x) |
1333 | 261 #endif |
262 | |
263 #ifdef emacs | |
264 #define ALLOCA_GARBAGE_COLLECT() \ | |
265 do \ | |
266 { \ | |
267 if (need_to_check_c_alloca) \ | |
268 xemacs_c_alloca (0); \ | |
269 } while (0) | |
270 #elif defined (C_ALLOCA) | |
271 #define ALLOCA_GARBAGE_COLLECT() alloca (0) | |
272 #else | |
273 #define ALLOCA_GARBAGE_COLLECT() | |
274 #endif | |
275 | |
276 #ifndef emacs | |
277 /* So we can use just it to conditionalize on */ | |
278 #undef ERROR_CHECK_MALLOC | |
279 #endif | |
280 | |
281 #ifdef ERROR_CHECK_MALLOC | |
282 /* When REL_ALLOC, malloc() is problematic because it could potentially | |
283 cause all rel-alloc()ed data -- including buffer text -- to be relocated. | |
284 We deal with this by checking for such relocation whenever we have | |
285 executed a statement that may call malloc() -- or alloca(), which may | |
286 end up calling malloc() in some circumstances -- and recomputing all | |
287 of our string pointers in re_match_2_internal() and re_search_2(). | |
288 However, if malloc() or alloca() happens and we don't know about it, | |
289 we could still be screwed. So we set up a system where we indicate all | |
290 places where we are prepared for malloc() or alloca(), and in any | |
291 other circumstances, calls to those functions (from anywhere inside of | |
2500 | 292 XEmacs!) will ABORT(). We do this even when REL_ALLOC is not defined |
1333 | 293 so that we catch these problems sooner, since many developers and beta |
294 testers will not be running with REL_ALLOC. */ | |
295 int regex_malloc_disallowed; | |
296 #define BEGIN_REGEX_MALLOC_OK() regex_malloc_disallowed = 0 | |
297 #define END_REGEX_MALLOC_OK() regex_malloc_disallowed = 1 | |
298 #define UNBIND_REGEX_MALLOC_CHECK() unbind_to (depth) | |
299 #else | |
300 #define BEGIN_REGEX_MALLOC_OK() | |
301 #define END_REGEX_MALLOC_OK() | |
302 #define UNBIND_REGEX_MALLOC_CHECK() | |
303 #endif | |
304 | |
305 | |
428 | 306 #ifdef REGEX_MALLOC |
307 | |
1333 | 308 #define REGEX_ALLOCATE xmalloc |
309 #define REGEX_REALLOCATE(source, osize, nsize) xrealloc (source, nsize) | |
310 #define REGEX_FREE xfree | |
428 | 311 |
312 #else /* not REGEX_MALLOC */ | |
313 | |
314 /* Emacs already defines alloca, sometimes. */ | |
315 #ifndef alloca | |
316 | |
317 /* Make alloca work the best possible way. */ | |
318 #ifdef __GNUC__ | |
319 #define alloca __builtin_alloca | |
771 | 320 #elif defined (__DECC) /* XEmacs: added next 3 lines, similar to config.h.in */ |
321 #include <alloca.h> | |
322 #pragma intrinsic(alloca) | |
428 | 323 #else /* not __GNUC__ */ |
324 #if HAVE_ALLOCA_H | |
325 #include <alloca.h> | |
326 #else /* not __GNUC__ or HAVE_ALLOCA_H */ | |
327 #ifndef _AIX /* Already did AIX, up at the top. */ | |
444 | 328 void *alloca (); |
428 | 329 #endif /* not _AIX */ |
446 | 330 #endif /* HAVE_ALLOCA_H */ |
331 #endif /* __GNUC__ */ | |
428 | 332 |
333 #endif /* not alloca */ | |
334 | |
1333 | 335 #define REGEX_ALLOCATE ALLOCA |
428 | 336 |
2367 | 337 /* !!#### Needs review */ |
428 | 338 /* Assumes a `char *destination' variable. */ |
339 #define REGEX_REALLOCATE(source, osize, nsize) \ | |
1333 | 340 (destination = (char *) ALLOCA (nsize), \ |
428 | 341 memmove (destination, source, osize), \ |
342 destination) | |
343 | |
1726 | 344 /* No need to do anything to free, after alloca. |
345 Do nothing! But inhibit gcc warning. */ | |
346 #define REGEX_FREE(arg,type) ((void)0) | |
428 | 347 |
446 | 348 #endif /* REGEX_MALLOC */ |
428 | 349 |
350 /* Define how to allocate the failure stack. */ | |
351 | |
771 | 352 #ifdef REGEX_REL_ALLOC |
428 | 353 #define REGEX_ALLOCATE_STACK(size) \ |
1346 | 354 r_alloc ((unsigned char **) &failure_stack_ptr, (size)) |
428 | 355 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \ |
1346 | 356 r_re_alloc ((unsigned char **) &failure_stack_ptr, (nsize)) |
428 | 357 #define REGEX_FREE_STACK(ptr) \ |
1346 | 358 r_alloc_free ((unsigned char **) &failure_stack_ptr) |
428 | 359 |
771 | 360 #else /* not REGEX_REL_ALLOC */ |
428 | 361 |
362 #ifdef REGEX_MALLOC | |
363 | |
1333 | 364 #define REGEX_ALLOCATE_STACK xmalloc |
365 #define REGEX_REALLOCATE_STACK(source, osize, nsize) xrealloc (source, nsize) | |
1726 | 366 #define REGEX_FREE_STACK(arg) xfree (arg, fail_stack_elt_t *) |
428 | 367 |
368 #else /* not REGEX_MALLOC */ | |
369 | |
1333 | 370 #define REGEX_ALLOCATE_STACK ALLOCA |
428 | 371 |
372 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \ | |
373 REGEX_REALLOCATE (source, osize, nsize) | |
374 /* No need to explicitly free anything. */ | |
375 #define REGEX_FREE_STACK(arg) | |
376 | |
446 | 377 #endif /* REGEX_MALLOC */ |
771 | 378 #endif /* REGEX_REL_ALLOC */ |
428 | 379 |
380 | |
381 /* True if `size1' is non-NULL and PTR is pointing anywhere inside | |
382 `string1' or just past its end. This works if PTR is NULL, which is | |
383 a good thing. */ | |
384 #define FIRST_STRING_P(ptr) \ | |
385 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) | |
386 | |
387 /* (Re)Allocate N items of type T using malloc, or fail. */ | |
1333 | 388 #define TALLOC(n, t) ((t *) xmalloc ((n) * sizeof (t))) |
389 #define RETALLOC(addr, n, t) ((addr) = (t *) xrealloc (addr, (n) * sizeof (t))) | |
428 | 390 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) |
391 | |
392 #define BYTEWIDTH 8 /* In bits. */ | |
393 | |
434 | 394 #define STREQ(s1, s2) (strcmp (s1, s2) == 0) |
428 | 395 |
396 #undef MAX | |
397 #undef MIN | |
398 #define MAX(a, b) ((a) > (b) ? (a) : (b)) | |
399 #define MIN(a, b) ((a) < (b) ? (a) : (b)) | |
400 | |
446 | 401 /* Type of source-pattern and string chars. */ |
402 typedef const unsigned char re_char; | |
403 | |
460 | 404 typedef char re_bool; |
428 | 405 #define false 0 |
406 #define true 1 | |
407 | |
408 | |
1346 | 409 #ifdef emacs |
410 | |
#ifdef MULE
Lisp_Object Vthe_lisp_rangetab;
#endif /* MULE */

/* Set up the variables of this file.  Under MULE this creates the
   range table kept in Vthe_lisp_rangetab and passes its address to
   staticpro; without MULE there is nothing to do.  */
void
vars_of_regex (void)
{
#ifdef MULE
  Vthe_lisp_rangetab = Fmake_range_table (Qstart_closed_end_closed);
  staticpro (&Vthe_lisp_rangetab);
#endif /* MULE */
}
430 | |
431 /* Convert an offset from the start of the logical text string formed by | |
432 concatenating the two strings together into a character position in the | |
433 Lisp buffer or string that the text represents. Knows that | |
434 when handling buffer text, the "string" we're passed in is always | |
435 BEGV - ZV. */ | |
436 | |
437 static Charxpos | |
438 offset_to_charxpos (Lisp_Object lispobj, int off) | |
439 { | |
440 if (STRINGP (lispobj)) | |
441 return string_index_byte_to_char (lispobj, off); | |
442 else if (BUFFERP (lispobj)) | |
443 return bytebpos_to_charbpos (XBUFFER (lispobj), | |
444 off + BYTE_BUF_BEGV (XBUFFER (lispobj))); | |
445 else | |
446 return 0; | |
447 } | |
448 | |
449 #ifdef REL_ALLOC | |
450 | |
451 /* STRING1 is the value of STRING1 given to re_match_2(). LISPOBJ is | |
452 the Lisp object (if any) from which the string is taken. If LISPOBJ | |
453 is a buffer, return a relocation offset to be added to all pointers to | |
454 string data so that they will be accurate again, after an allocation or | |
455 reallocation that potentially relocated the buffer data. | |
456 */ | |
457 static Bytecount | |
458 offset_post_relocation (Lisp_Object lispobj, Ibyte *orig_buftext) | |
459 { | |
460 if (!BUFFERP (lispobj)) | |
461 return 0; | |
462 return (BYTE_BUF_BYTE_ADDRESS (XBUFFER (lispobj), | |
463 BYTE_BUF_BEGV (XBUFFER (lispobj))) - | |
464 orig_buftext); | |
465 } | |
466 | |
467 #endif /* REL_ALLOC */ | |
468 | |
469 #ifdef ERROR_CHECK_MALLOC | |
470 | |
471 /* NOTE that this can run malloc() so you need to adjust afterwards. */ | |
472 | |
473 static int | |
474 bind_regex_malloc_disallowed (int value) | |
475 { | |
476 /* Tricky, because the act of binding can run malloc(). */ | |
477 int old_regex_malloc_disallowed = regex_malloc_disallowed; | |
478 int depth; | |
479 regex_malloc_disallowed = 0; | |
480 depth = record_unwind_protect_restoring_int (®ex_malloc_disallowed, | |
481 old_regex_malloc_disallowed); | |
482 regex_malloc_disallowed = value; | |
483 return depth; | |
484 } | |
485 | |
486 #endif /* ERROR_CHECK_MALLOC */ | |
487 | |
488 #endif /* emacs */ | |
489 | |
490 | |
/* Command codes found in compiled regular expressions.  Some opcodes
   are followed by argument bytes, and a command code is free to
   interpret its arguments in any way whatsoever.  Zero bytes may
   appear in the compiled regular expression.  */

typedef enum
{
  no_op = 0,

  /* Succeed at once, with no more backtracking.  */
  succeed,

  /* Arguments: one byte giving n, then n literal bytes.  */
  exactn,

  /* Match any (more or less) character.  */
  anychar,

  /* Match any one character belonging to the given set.  The first
     argument byte is the number of bitmap bytes; the bitmap bytes
     follow and say which characters are in the set.  Bits within each
     byte are ordered low-bit-first, and a character is in the set if
     its bit is 1.  A character too large to have a bit in the map is
     automatically not in the set.  */
  charset,

  /* Takes the same parameters as charset, but matches any character
     that is not one of those specified.  */
  charset_not,

  /* Begin remembering the matched text, for storing in a register.
     Arguments: one byte with the register number (in the range 1 to
     the pattern buffer's re_ngroups field), then one byte with the
     number of groups inner to this one.  (The latter has to be part
     of start_memory only because it is needed in the on_failure_jump
     of re_match_2.)  */
  start_memory,

  /* Stop remembering the matched text and store it in a memory
     register.  Arguments: one byte with the register number (in the
     range 1 to `re_ngroups' in the pattern buffer) and one byte with
     the number of inner groups, just as for `start_memory'.  (The
     inner-group count is needed here because there is no easy way of
     finding the corresponding start_memory from a stop_memory.)  */
  stop_memory,

  /* Match a duplicate of something remembered.  Argument: one byte
     containing the register number.  */
  duplicate,

  /* Fail unless at beginning of line.  */
  begline,

  /* Fail unless at end of line.  */
  endline,

  /* Succeed if at beginning of buffer (if emacs) or at beginning of
     the string to be matched (if not).  */
  begbuf,

  /* The analogue of begbuf for the end of buffer/string.  */
  endbuf,

  /* Argument: two-byte relative address to which to jump.  */
  jump,

  /* Same as jump, but marks the end of an alternative.  */
  jump_past_alt,

  /* Argument: two-byte relative address of the place to resume at in
     case of failure.  */
  on_failure_jump,

  /* Like on_failure_jump, except that a placeholder is pushed instead
     of the current string position when executed.  */
  on_failure_keep_string_jump,

  /* Discard the latest failure point, then jump to the following
     two-byte relative address.  */
  pop_failure_jump,

  /* Gets changed to pop_failure_jump if we know we won't have to
     backtrack to match, and to jump otherwise.  Used to jump back to
     the beginning of a repeat: if what follows this jump clearly
     won't match what the repeat does, so that there is certainly no
     use backtracking out of repetitions already matched, it becomes a
     pop_failure_jump.  Argument: two-byte address.  */
  maybe_pop_jump,

  /* Jump to the following two-byte address and push a dummy failure
     point, which is thrown away if an attempt is made to use it for a
     failure.  A `+' construct makes this before the first repeat.
     Also used as an intermediary kind of jump when compiling an
     alternative.  */
  dummy_failure_jump,

  /* Push a dummy failure point and continue.  Used at the end of
     alternatives.  */
  push_dummy_failure,

  /* Arguments: two-byte relative address and two-byte number n.
     After matching N times, jump to the address upon failure.  */
  succeed_n,

  /* Arguments: two-byte relative address and two-byte number n.
     Jump to the address N times, then fail.  */
  jump_n,

  /* Set the following two-byte relative address to the subsequent
     two-byte number.  The address *includes* the two bytes of
     number.  */
  set_number_at,

  wordchar,     /* Matches any word-constituent character.  */
  notwordchar,  /* Matches any char that is not a word-constituent.  */

  wordbeg,      /* Succeeds if at word beginning.  */
  wordend,      /* Succeeds if at word end.  */

  wordbound,    /* Succeeds if at a word boundary.  */
  notwordbound  /* Succeeds if not at a word boundary.  */

#ifdef emacs
  ,before_dot,  /* Succeeds if before point.  */
  at_dot,       /* Succeeds if at point.  */
  after_dot,    /* Succeeds if after point.  */

  /* Matches any character whose syntax is the one specified.
     Argument: one byte containing a syntax code, e.g., Sword.  */
  syntaxspec,

  /* Matches any character whose syntax is not the one specified.  */
  notsyntaxspec

#endif /* emacs */

#ifdef MULE
  /* Extra opcodes needed to work properly with XEmacs/Mule
     characters (which may take up more than one byte).  */

  ,charset_mule,    /* Matches any character belonging to the specified
                       set, which is stored in "unified range-table
                       format"; see rangetab.c.  Unlike the `charset'
                       opcode, this can handle arbitrary characters.  */

  charset_mule_not  /* Same parameters as charset_mule, but matches
                       any character that is not one of those
                       specified.  */

  /* 97/2/17 jhod: The following two were merged back in from the Mule
     2.3 code to enable some language specific processing.  */
  ,categoryspec,    /* Matches entries in the character category
                       tables.  */
  notcategoryspec   /* The opposite of the above.  */
#endif /* MULE */

} re_opcode_t;
649 | |
650 /* Common operations on the compiled pattern. */ | |
651 | |
/* Store NUMBER in two contiguous bytes starting at DESTINATION: the
   low-order eight bits go in the first byte, the rest in the
   second.  */

#define STORE_NUMBER(destination, number)                               \
  do {                                                                  \
    (destination)[0] = (number) & 0377;                                 \
    (destination)[1] = (number) >> 8;                                   \
  } while (0)

/* Same as STORE_NUMBER, except increment DESTINATION to
   the byte after where the number is stored.  Therefore, DESTINATION
   must be an lvalue.  */

#define STORE_NUMBER_AND_INCR(destination, number)                      \
  do {                                                                  \
    STORE_NUMBER (destination, number);                                 \
    (destination) += 2;                                                 \
  } while (0)

/* Put into DESTINATION a number stored in two contiguous bytes starting
   at SOURCE.  The second byte is sign-extended, so the result may be
   negative.  The high byte is scaled by multiplying by 256 rather than
   by shifting left by 8, because left-shifting a negative value is
   undefined behavior in C.  */

#define EXTRACT_NUMBER(destination, source)                             \
  do {                                                                  \
    (destination) = *(source) & 0377;                                   \
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * 256;          \
  } while (0)

#ifdef DEBUG
/* Function version of EXTRACT_NUMBER: store in *DEST the number held
   in the two bytes starting at SOURCE.  */
static void
extract_number (int *dest, re_char *source)
{
  int temp = SIGN_EXTEND_CHAR (*(source + 1));
  *dest = *source & 0377;
  /* Multiply instead of `temp << 8': TEMP is negative whenever the
     high byte has its top bit set.  */
  *dest += temp * 256;
}

#ifndef EXTRACT_MACROS /* To debug the macros.  */
#undef EXTRACT_NUMBER
#define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
#endif /* not EXTRACT_MACROS */

#endif /* DEBUG */

/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
   SOURCE must be an lvalue.  */

#define EXTRACT_NUMBER_AND_INCR(destination, source)                    \
  do {                                                                  \
    EXTRACT_NUMBER (destination, source);                               \
    (source) += 2;                                                      \
  } while (0)

#ifdef DEBUG
/* Function version of EXTRACT_NUMBER_AND_INCR: store in *DESTINATION
   the number at *SOURCE, then advance *SOURCE past its two bytes.  */
static void
extract_number_and_incr (int *destination, unsigned char **source)
{
  extract_number (destination, *source);
  *source += 2;
}

#ifndef EXTRACT_MACROS
#undef EXTRACT_NUMBER_AND_INCR
#define EXTRACT_NUMBER_AND_INCR(dest, src) \
  extract_number_and_incr (&dest, &src)
#endif /* not EXTRACT_MACROS */

#endif /* DEBUG */
719 | |
720 /* If DEBUG is defined, Regex prints many voluminous messages about what | |
721 it is doing (if the variable `debug' is nonzero). If linked with the | |
722 main program in `iregex.c', you can enter patterns and strings | |
723 interactively. And if linked with the main program in `main.c' and | |
724 the other test files, you can run the already-written tests. */ | |
725 | |
726 #if defined (DEBUG) | |
727 | |
728 /* We use standard I/O for debugging. */ | |
729 #include <stdio.h> | |
730 | |
731 #ifndef emacs | |
732 /* XEmacs provides its own version of assert() */ | |
733 /* It is useful to test things that ``must'' be true when debugging. */ | |
734 #include <assert.h> | |
735 #endif | |
736 | |
737 static int debug = 0; | |
738 | |
739 #define DEBUG_STATEMENT(e) e | |
740 #define DEBUG_PRINT1(x) if (debug) printf (x) | |
741 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) | |
742 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) | |
743 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4) | |
744 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ | |
745 if (debug) print_partial_compiled_pattern (s, e) | |
746 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ | |
747 if (debug) print_double_string (w, s1, sz1, s2, sz2) | |
748 | |
749 | |
750 /* Print the fastmap in human-readable form. */ | |
751 | |
752 static void | |
753 print_fastmap (char *fastmap) | |
754 { | |
647 | 755 int was_a_range = 0; |
756 int i = 0; | |
428 | 757 |
758 while (i < (1 << BYTEWIDTH)) | |
759 { | |
760 if (fastmap[i++]) | |
761 { | |
762 was_a_range = 0; | |
763 putchar (i - 1); | |
764 while (i < (1 << BYTEWIDTH) && fastmap[i]) | |
765 { | |
766 was_a_range = 1; | |
767 i++; | |
768 } | |
769 if (was_a_range) | |
770 { | |
771 putchar ('-'); | |
772 putchar (i - 1); | |
773 } | |
774 } | |
775 } | |
776 putchar ('\n'); | |
777 } | |
778 | |
779 | |
780 /* Print a compiled pattern string in human-readable form, starting at | |
781 the START pointer into it and ending just before the pointer END. */ | |
782 | |
783 static void | |
446 | 784 print_partial_compiled_pattern (re_char *start, re_char *end) |
428 | 785 { |
786 int mcnt, mcnt2; | |
446 | 787 unsigned char *p = (unsigned char *) start; |
788 re_char *pend = end; | |
428 | 789 |
790 if (start == NULL) | |
791 { | |
792 puts ("(null)"); | |
793 return; | |
794 } | |
795 | |
796 /* Loop over pattern commands. */ | |
797 while (p < pend) | |
798 { | |
799 printf ("%ld:\t", (long)(p - start)); | |
800 | |
801 switch ((re_opcode_t) *p++) | |
802 { | |
803 case no_op: | |
804 printf ("/no_op"); | |
805 break; | |
806 | |
807 case exactn: | |
808 mcnt = *p++; | |
809 printf ("/exactn/%d", mcnt); | |
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
810 while (mcnt--) |
428 | 811 { |
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
812 putchar ('/'); |
428 | 813 putchar (*p++); |
814 } | |
815 break; | |
816 | |
817 case start_memory: | |
818 mcnt = *p++; | |
819 printf ("/start_memory/%d/%d", mcnt, *p++); | |
820 break; | |
821 | |
822 case stop_memory: | |
823 mcnt = *p++; | |
824 printf ("/stop_memory/%d/%d", mcnt, *p++); | |
825 break; | |
826 | |
827 case duplicate: | |
828 printf ("/duplicate/%d", *p++); | |
829 break; | |
830 | |
831 case anychar: | |
832 printf ("/anychar"); | |
833 break; | |
834 | |
835 case charset: | |
836 case charset_not: | |
837 { | |
838 REGISTER int c, last = -100; | |
839 REGISTER int in_range = 0; | |
840 | |
841 printf ("/charset [%s", | |
842 (re_opcode_t) *(p - 1) == charset_not ? "^" : ""); | |
843 | |
844 assert (p + *p < pend); | |
845 | |
846 for (c = 0; c < 256; c++) | |
847 if (((unsigned char) (c / 8) < *p) | |
848 && (p[1 + (c/8)] & (1 << (c % 8)))) | |
849 { | |
850 /* Are we starting a range? */ | |
851 if (last + 1 == c && ! in_range) | |
852 { | |
853 putchar ('-'); | |
854 in_range = 1; | |
855 } | |
856 /* Have we broken a range? */ | |
857 else if (last + 1 != c && in_range) | |
858 { | |
859 putchar (last); | |
860 in_range = 0; | |
861 } | |
862 | |
863 if (! in_range) | |
864 putchar (c); | |
865 | |
866 last = c; | |
867 } | |
868 | |
869 if (in_range) | |
870 putchar (last); | |
871 | |
872 putchar (']'); | |
873 | |
874 p += 1 + *p; | |
875 } | |
876 break; | |
877 | |
878 #ifdef MULE | |
879 case charset_mule: | |
880 case charset_mule_not: | |
881 { | |
882 int nentries, i; | |
883 | |
884 printf ("/charset_mule [%s", | |
885 (re_opcode_t) *(p - 1) == charset_mule_not ? "^" : ""); | |
886 nentries = unified_range_table_nentries (p); | |
887 for (i = 0; i < nentries; i++) | |
888 { | |
889 EMACS_INT first, last; | |
890 Lisp_Object dummy_val; | |
891 | |
892 unified_range_table_get_range (p, i, &first, &last, | |
893 &dummy_val); | |
894 if (first < 0x100) | |
895 putchar (first); | |
896 else | |
897 printf ("(0x%lx)", (long)first); | |
898 if (first != last) | |
899 { | |
900 putchar ('-'); | |
901 if (last < 0x100) | |
902 putchar (last); | |
903 else | |
904 printf ("(0x%lx)", (long)last); | |
905 } | |
906 } | |
907 putchar (']'); | |
908 p += unified_range_table_bytes_used (p); | |
909 } | |
910 break; | |
911 #endif | |
912 | |
913 case begline: | |
914 printf ("/begline"); | |
915 break; | |
916 | |
917 case endline: | |
918 printf ("/endline"); | |
919 break; | |
920 | |
921 case on_failure_jump: | |
922 extract_number_and_incr (&mcnt, &p); | |
923 printf ("/on_failure_jump to %ld", (long)(p + mcnt - start)); | |
924 break; | |
925 | |
926 case on_failure_keep_string_jump: | |
927 extract_number_and_incr (&mcnt, &p); | |
928 printf ("/on_failure_keep_string_jump to %ld", (long)(p + mcnt - start)); | |
929 break; | |
930 | |
931 case dummy_failure_jump: | |
932 extract_number_and_incr (&mcnt, &p); | |
933 printf ("/dummy_failure_jump to %ld", (long)(p + mcnt - start)); | |
934 break; | |
935 | |
936 case push_dummy_failure: | |
937 printf ("/push_dummy_failure"); | |
938 break; | |
939 | |
940 case maybe_pop_jump: | |
941 extract_number_and_incr (&mcnt, &p); | |
942 printf ("/maybe_pop_jump to %ld", (long)(p + mcnt - start)); | |
943 break; | |
944 | |
945 case pop_failure_jump: | |
946 extract_number_and_incr (&mcnt, &p); | |
947 printf ("/pop_failure_jump to %ld", (long)(p + mcnt - start)); | |
948 break; | |
949 | |
950 case jump_past_alt: | |
951 extract_number_and_incr (&mcnt, &p); | |
952 printf ("/jump_past_alt to %ld", (long)(p + mcnt - start)); | |
953 break; | |
954 | |
955 case jump: | |
956 extract_number_and_incr (&mcnt, &p); | |
957 printf ("/jump to %ld", (long)(p + mcnt - start)); | |
958 break; | |
959 | |
960 case succeed_n: | |
961 extract_number_and_incr (&mcnt, &p); | |
962 extract_number_and_incr (&mcnt2, &p); | |
963 printf ("/succeed_n to %ld, %d times", (long)(p + mcnt - start), mcnt2); | |
964 break; | |
965 | |
966 case jump_n: | |
967 extract_number_and_incr (&mcnt, &p); | |
968 extract_number_and_incr (&mcnt2, &p); | |
969 printf ("/jump_n to %ld, %d times", (long)(p + mcnt - start), mcnt2); | |
970 break; | |
971 | |
972 case set_number_at: | |
973 extract_number_and_incr (&mcnt, &p); | |
974 extract_number_and_incr (&mcnt2, &p); | |
975 printf ("/set_number_at location %ld to %d", (long)(p + mcnt - start), mcnt2); | |
976 break; | |
977 | |
978 case wordbound: | |
979 printf ("/wordbound"); | |
980 break; | |
981 | |
982 case notwordbound: | |
983 printf ("/notwordbound"); | |
984 break; | |
985 | |
986 case wordbeg: | |
987 printf ("/wordbeg"); | |
988 break; | |
989 | |
990 case wordend: | |
991 printf ("/wordend"); | |
992 | |
993 #ifdef emacs | |
994 case before_dot: | |
995 printf ("/before_dot"); | |
996 break; | |
997 | |
998 case at_dot: | |
999 printf ("/at_dot"); | |
1000 break; | |
1001 | |
1002 case after_dot: | |
1003 printf ("/after_dot"); | |
1004 break; | |
1005 | |
1006 case syntaxspec: | |
1007 printf ("/syntaxspec"); | |
1008 mcnt = *p++; | |
1009 printf ("/%d", mcnt); | |
1010 break; | |
1011 | |
1012 case notsyntaxspec: | |
1013 printf ("/notsyntaxspec"); | |
1014 mcnt = *p++; | |
1015 printf ("/%d", mcnt); | |
1016 break; | |
1017 | |
1018 #ifdef MULE | |
1019 /* 97/2/17 jhod Mule category patch */ | |
1020 case categoryspec: | |
1021 printf ("/categoryspec"); | |
1022 mcnt = *p++; | |
1023 printf ("/%d", mcnt); | |
1024 break; | |
1025 | |
1026 case notcategoryspec: | |
1027 printf ("/notcategoryspec"); | |
1028 mcnt = *p++; | |
1029 printf ("/%d", mcnt); | |
1030 break; | |
1031 /* end of category patch */ | |
1032 #endif /* MULE */ | |
1033 #endif /* emacs */ | |
1034 | |
1035 case wordchar: | |
1036 printf ("/wordchar"); | |
1037 break; | |
1038 | |
1039 case notwordchar: | |
1040 printf ("/notwordchar"); | |
1041 break; | |
1042 | |
1043 case begbuf: | |
1044 printf ("/begbuf"); | |
1045 break; | |
1046 | |
1047 case endbuf: | |
1048 printf ("/endbuf"); | |
1049 break; | |
1050 | |
1051 default: | |
1052 printf ("?%d", *(p-1)); | |
1053 } | |
1054 | |
1055 putchar ('\n'); | |
1056 } | |
1057 | |
1058 printf ("%ld:\tend of pattern.\n", (long)(p - start)); | |
1059 } | |
1060 | |
1061 | |
1062 static void | |
1063 print_compiled_pattern (struct re_pattern_buffer *bufp) | |
1064 { | |
446 | 1065 re_char *buffer = bufp->buffer; |
428 | 1066 |
1067 print_partial_compiled_pattern (buffer, buffer + bufp->used); | |
1068 printf ("%ld bytes used/%ld bytes allocated.\n", bufp->used, | |
1069 bufp->allocated); | |
1070 | |
1071 if (bufp->fastmap_accurate && bufp->fastmap) | |
1072 { | |
1073 printf ("fastmap: "); | |
1074 print_fastmap (bufp->fastmap); | |
1075 } | |
1076 | |
1077 printf ("re_nsub: %ld\t", (long)bufp->re_nsub); | |
502 | 1078 printf ("re_ngroups: %ld\t", (long)bufp->re_ngroups); |
428 | 1079 printf ("regs_alloc: %d\t", bufp->regs_allocated); |
1080 printf ("can_be_null: %d\t", bufp->can_be_null); | |
1081 printf ("newline_anchor: %d\n", bufp->newline_anchor); | |
1082 printf ("no_sub: %d\t", bufp->no_sub); | |
1083 printf ("not_bol: %d\t", bufp->not_bol); | |
1084 printf ("not_eol: %d\t", bufp->not_eol); | |
1085 printf ("syntax: %d\n", bufp->syntax); | |
1086 /* Perhaps we should print the translate table? */ | |
1087 /* and maybe the category table? */ | |
502 | 1088 |
1089 if (bufp->external_to_internal_register) | |
1090 { | |
1091 int i; | |
1092 | |
1093 printf ("external_to_internal_register:\n"); | |
1094 for (i = 0; i <= bufp->re_nsub; i++) | |
1095 { | |
1096 if (i > 0) | |
1097 printf (", "); | |
1098 printf ("%d -> %d", i, bufp->external_to_internal_register[i]); | |
1099 } | |
1100 printf ("\n"); | |
1101 } | |
428 | 1102 } |
1103 | |
1104 | |
1105 static void | |
446 | 1106 print_double_string (re_char *where, re_char *string1, int size1, |
1107 re_char *string2, int size2) | |
428 | 1108 { |
1109 if (where == NULL) | |
1110 printf ("(null)"); | |
1111 else | |
1112 { | |
647 | 1113 int this_char; |
428 | 1114 |
1115 if (FIRST_STRING_P (where)) | |
1116 { | |
1117 for (this_char = where - string1; this_char < size1; this_char++) | |
1118 putchar (string1[this_char]); | |
1119 | |
1120 where = string2; | |
1121 } | |
1122 | |
1123 for (this_char = where - string2; this_char < size2; this_char++) | |
1124 putchar (string2[this_char]); | |
1125 } | |
1126 } | |
1127 | |
1128 #else /* not DEBUG */ | |
1129 | |
771 | 1130 #ifndef emacs |
428 | 1131 #undef assert |
771 | 1132 #define assert(e) ((void) (1)) |
1133 #endif | |
428 | 1134 |
1135 #define DEBUG_STATEMENT(e) | |
1136 #define DEBUG_PRINT1(x) | |
1137 #define DEBUG_PRINT2(x1, x2) | |
1138 #define DEBUG_PRINT3(x1, x2, x3) | |
1139 #define DEBUG_PRINT4(x1, x2, x3, x4) | |
1140 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) | |
1141 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) | |
1142 | |
446 | 1143 #endif /* DEBUG */ |
428 | 1144 |
1145 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can | |
1146 also be assigned to arbitrarily: each pattern buffer stores its own | |
1147 syntax, so it can be changed between regex compilations. */ | |
1148 /* This has no initializer because initialized variables in Emacs | |
1149 become read-only after dumping. */ | |
1150 reg_syntax_t re_syntax_options; | |
1151 | |
1152 | |
1153 /* Specify the precise syntax of regexps for compilation. This provides | |
1154 for compatibility for various utilities which historically have | |
1155 different, incompatible syntaxes. | |
1156 | |
1157 The argument SYNTAX is a bit mask comprised of the various bits | |
1158 defined in regex.h. We return the old syntax. */ | |
1159 | |
1160 reg_syntax_t | |
1161 re_set_syntax (reg_syntax_t syntax) | |
1162 { | |
1163 reg_syntax_t ret = re_syntax_options; | |
1164 | |
1165 re_syntax_options = syntax; | |
1166 return ret; | |
1167 } | |
1168 | |
1169 /* This table gives an error message for each of the error codes listed | |
1170 in regex.h. Obviously the order here has to be the same as there. | |
1171 POSIX doesn't require that we do anything for REG_NOERROR, | |
1172 but why not be nice? */ | |
1173 | |
442 | 1174 static const char *re_error_msgid[] = |
428 | 1175 { |
1176 "Success", /* REG_NOERROR */ | |
1177 "No match", /* REG_NOMATCH */ | |
1178 "Invalid regular expression", /* REG_BADPAT */ | |
1179 "Invalid collation character", /* REG_ECOLLATE */ | |
1180 "Invalid character class name", /* REG_ECTYPE */ | |
1181 "Trailing backslash", /* REG_EESCAPE */ | |
1182 "Invalid back reference", /* REG_ESUBREG */ | |
1183 "Unmatched [ or [^", /* REG_EBRACK */ | |
1184 "Unmatched ( or \\(", /* REG_EPAREN */ | |
1185 "Unmatched \\{", /* REG_EBRACE */ | |
1186 "Invalid content of \\{\\}", /* REG_BADBR */ | |
1187 "Invalid range end", /* REG_ERANGE */ | |
1188 "Memory exhausted", /* REG_ESPACE */ | |
1189 "Invalid preceding regular expression", /* REG_BADRPT */ | |
1190 "Premature end of regular expression", /* REG_EEND */ | |
1191 "Regular expression too big", /* REG_ESIZE */ | |
1192 "Unmatched ) or \\)", /* REG_ERPAREN */ | |
1193 #ifdef emacs | |
1194 "Invalid syntax designator", /* REG_ESYNTAX */ | |
1195 #endif | |
1196 #ifdef MULE | |
1197 "Ranges may not span charsets", /* REG_ERANGESPAN */ | |
1198 "Invalid category designator", /* REG_ECATEGORY */ | |
1199 #endif | |
1200 }; | |
1201 | |
1202 /* Avoiding alloca during matching, to placate r_alloc. */ | |
1203 | |
1333 | 1204 /* About these various flags: |
1205 | |
1206 MATCH_MAY_ALLOCATE indicates that it's OK to do allocation in the | |
1207 searching and matching functions. In this case, we use local variables | |
1208 to hold the values allocated. If not, we use *global* variables, which | |
1209 are pre-allocated. NOTE: XEmacs ***MUST*** run with MATCH_MAY_ALLOCATE, | |
1210 because the regexp routines may get called reentrantly as a result of | |
1211 QUIT processing (e.g. under Windows: re_match -> QUIT -> quit_p -> drain | |
1212 events -> process WM_INITMENU -> call filter -> re_match; see stack | |
1213 trace in signal.c), so we cannot have any global variables (unless we do | |
1214 lots of trickiness including some unwind-protects, which isn't worth it | |
1215 at this point). | |
1216 | |
1217 REL_ALLOC means that the relocating allocator is in use, for buffers | |
1218 and such. REGEX_REL_ALLOC means that we use rel-alloc to manage the | |
1219 fail stack, which may grow quite large. REGEX_MALLOC means we use | |
1220 malloc() in place of alloca() to allocate the fail stack -- only | |
1221 applicable if REGEX_REL_ALLOC is not defined. | |
1222 */ | |
1223 | |
428 | 1224 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the |
1225 searching and matching functions should not call alloca. On some | |
1226 systems, alloca is implemented in terms of malloc, and if we're | |
1227 using the relocating allocator routines, then malloc could cause a | |
1228 relocation, which might (if the strings being searched are in the | |
1229 ralloc heap) shift the data out from underneath the regexp | |
771 | 1230 routines. [To clarify: The purpose of rel-alloc is to allow data to |
1231 be moved in memory from one place to another so that all data | |
1232 blocks can be consolidated together and excess memory released back | |
1233 to the operating system. This requires that all the blocks that | |
1234 are managed by rel-alloc go at the very end of the program's heap, | |
1235 after all regularly malloc()ed data. malloc(), however, is used to | |
1236 owning the end of the heap, so that when more memory is needed, it | |
1237 just expands the heap using sbrk(). This is reconciled by using a | |
1238 malloc() (such as malloc.c, gmalloc.c, or recent versions of | |
1239 malloc() in libc) where the sbrk() call can be replaced with a | |
1240 user-specified call -- in this case, to rel-alloc's r_alloc_sbrk() | |
1241 routine. This routine calls the real sbrk(), but then shifts all | |
1242 the rel-alloc-managed blocks forward to the end of the heap again, | |
1243 so that malloc() gets the memory it needs in the location it needs | |
1244 it at. The regex routines may well have pointers to buffer data as | |
1245 their arguments, and buffers are managed by rel-alloc if rel-alloc | |
1246 has been enabled, so calling malloc() may potentially screw things | |
1247 up badly if it runs out of space and asks for more from the OS.] | |
1248 | |
1249 [[Here's another reason to avoid allocation: Emacs processes input | |
1250 from X in a signal handler; processing X input may call malloc; if | |
1251 input arrives while a matching routine is calling malloc, then | |
1252 we're scrod. But Emacs can't just block input while calling | |
1253 matching routines; then we don't notice interrupts when they come | |
1254 in. So, Emacs blocks input around all regexp calls except the | |
1255 matching calls, which it leaves unprotected, in the faith that they | |
1333 | 1256 will not malloc.]] This previous paragraph is irrelevant under XEmacs, |
1257 as we *do not* do anything so stupid as process input from within a | |
1258 signal handler. | |
1259 | |
1260 However, the regexp routines may get called reentrantly as a result of | |
1261 QUIT processing (e.g. under Windows: re_match -> QUIT -> quit_p -> drain | |
1262 events -> process WM_INITMENU -> call filter -> re_match; see stack | |
1263 trace in signal.c), so we cannot have any global variables (unless we do | |
1264 lots of trickiness including some unwind-protects, which isn't worth it | |
1265 at this point). Hence we MUST have MATCH_MAY_ALLOCATE defined. | |
1266 | |
1267 Also, the first paragraph does not make complete sense to me -- what | |
1268 about the use of rel-alloc to handle the fail stacks? Shouldn't these | |
1269 reallocations potentially cause buffer data to be relocated as well? I | |
826 | 1270 must be missing something, though -- perhaps the writer above is |
1271 assuming that the failure stack(s) will always be allocated after the | |
1272 buffer data, and thus reallocating them with rel-alloc won't move buffer | |
1333 | 1273 data. (In fact, a cursory glance at the code in ralloc.c seems to |
1274 confirm this.) --ben */ | |
428 | 1275 |
/* Normally, this is fine.  */
#define MATCH_MAY_ALLOCATE

/* When using GNU C, we are not REALLY using the C alloca, no matter
   what config.h may say.  So don't take precautions for it.  */
#ifdef __GNUC__
#undef C_ALLOCA
#endif

/* The match routines may not allocate if (1) they would do it with malloc
   and (2) it's not safe for them to use malloc.
   Note that if REL_ALLOC is defined, matching would not use malloc for the
   failure stack, but we would still use it for the register vectors;
   so REL_ALLOC should not affect this.  */

/* XEmacs can handle REL_ALLOC and malloc() OK, so MATCH_MAY_ALLOCATE is
   only withdrawn for builds where `emacs' is not defined.  */
#if !defined (emacs) && (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (REL_ALLOC)
#undef MATCH_MAY_ALLOCATE
#endif

/* Safety net: an `emacs' build without MATCH_MAY_ALLOCATE is rejected at
   compile time.  */
#if !defined (MATCH_MAY_ALLOCATE) && defined (emacs)
#error regex must handle reentrancy; MATCH_MAY_ALLOCATE must be defined
#endif
1299 | |
428 | 1300 |
1301 /* Failure stack declarations and macros; both re_compile_fastmap and | |
1302 re_match_2 use a failure stack. These have to be macros because of | |
1303 REGEX_ALLOCATE_STACK. */ | |
1304 | |
1305 | |
1306 /* Number of failure points for which to initially allocate space | |
1307 when matching. If this number is exceeded, we allocate more | |
1308 space, so it is not a hard limit. */ | |
1309 #ifndef INIT_FAILURE_ALLOC | |
3300 | 1310 #define INIT_FAILURE_ALLOC 20 |
428 | 1311 #endif |
1312 | |
1313 /* Roughly the maximum number of failure points on the stack. Would be | |
1314 exactly that if we always used MAX_FAILURE_ITEMS each time we failed. | |
1315 This is a variable only so users of regex can assign to it; we never | |
1316 change it ourselves. */ | |
1317 #if defined (MATCH_MAY_ALLOCATE) | |
1318 /* 4400 was enough to cause a crash on Alpha OSF/1, | |
1319 whose default stack limit is 2mb. */ | |
3300 | 1320 int re_max_failures = 40000; |
428 | 1321 #else |
3300 | 1322 int re_max_failures = 4000; |
428 | 1323 #endif |
1324 | |
1325 union fail_stack_elt | |
1326 { | |
446 | 1327 re_char *pointer; |
428 | 1328 int integer; |
1329 }; | |
1330 | |
1331 typedef union fail_stack_elt fail_stack_elt_t; | |
1332 | |
1333 typedef struct | |
1334 { | |
1335 fail_stack_elt_t *stack; | |
665 | 1336 Elemcount size; |
1337 Elemcount avail; /* Offset of next open position. */ | |
428 | 1338 } fail_stack_type; |
1339 | |
1340 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0) | |
1341 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0) | |
1342 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size) | |
1343 | |
1344 | |
1345 /* Define macros to initialize and free the failure stack. | |
1346 Do `return -2' if the alloc fails. */ | |
1347 | |
1348 #ifdef MATCH_MAY_ALLOCATE | |
1333 | 1349 #define INIT_FAIL_STACK() \ |
1350 do { \ | |
1351 fail_stack.stack = (fail_stack_elt_t *) \ | |
1352 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * \ | |
1353 sizeof (fail_stack_elt_t)); \ | |
1354 \ | |
1355 if (fail_stack.stack == NULL) \ | |
1356 { \ | |
1357 UNBIND_REGEX_MALLOC_CHECK (); \ | |
1358 return -2; \ | |
1359 } \ | |
1360 \ | |
1361 fail_stack.size = INIT_FAILURE_ALLOC; \ | |
1362 fail_stack.avail = 0; \ | |
428 | 1363 } while (0) |
1364 | |
1365 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack) | |
1366 #else | |
1367 #define INIT_FAIL_STACK() \ | |
1368 do { \ | |
1369 fail_stack.avail = 0; \ | |
1370 } while (0) | |
1371 | |
1372 #define RESET_FAIL_STACK() | |
1373 #endif | |
1374 | |
1375 | |
1376 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items. | |
1377 | |
1378 Return 1 if succeeds, and 0 if either ran out of memory | |
1379 allocating space for it or it was already too large. | |
1380 | |
1381 REGEX_REALLOCATE_STACK requires `destination' be declared. */ | |
1382 | |
/* The whole macro is one conditional expression.  It yields 0 without
   touching the stack when the current size already exceeds
   re_max_failures * MAX_FAILURE_ITEMS.  Otherwise it asks
   REGEX_REALLOCATE_STACK for twice the current number of elements and
   stores the result straight into `.stack' before testing it for NULL;
   `.size' is doubled, and 1 yielded, only when that result is non-NULL.
   The argument is expanded several times, so it must be free of side
   effects.  */
1383 #define DOUBLE_FAIL_STACK(fail_stack) \ | |
1384 ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \ | |
1385 ? 0 \ | |
1386 : ((fail_stack).stack = (fail_stack_elt_t *) \ | |
1387 REGEX_REALLOCATE_STACK ((fail_stack).stack, \ | |
1388 (fail_stack).size * sizeof (fail_stack_elt_t), \ | |
1389 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \ | |
1390 \ | |
1391 (fail_stack).stack == NULL \ | |
1392 ? 0 \ | |
1393 : ((fail_stack).size <<= 1, \ | |
1394 1))) | |
1395 | |
/* RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS adds the value returned by
   offset_post_relocation (lispobj, orig_buftext) to every non-NULL pointer
   the matcher keeps into the subject text: the string bounds, the current
   position, the match ends and, when the pattern has groups, all of the
   per-register arrays.  Nothing is done when that offset is zero.  It
   expands to nothing unless both `emacs' and REL_ALLOC are defined.
   NOTE(review): presumably the offset is the distance the text moved when
   it was relocated -- confirm against offset_post_relocation.  */
#if !defined (emacs) || !defined (REL_ALLOC)
#define RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS()
#else
/* Don't change NULL pointers.  Wrapped in do/while so that the expansion
   is a single statement and cannot capture a following `else'.  Expects a
   variable `rmdp_offset' to be in scope.  */
#define ADD_IF_NZ(val) do { if (val) (val) += rmdp_offset; } while (0)
#define RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS() \
do \
{ \
  Bytecount rmdp_offset = offset_post_relocation (lispobj, orig_buftext); \
  \
  if (rmdp_offset) \
    { \
      int i; \
      \
      ADD_IF_NZ (string1); \
      ADD_IF_NZ (string2); \
      ADD_IF_NZ (d); \
      ADD_IF_NZ (dend); \
      ADD_IF_NZ (end1); \
      ADD_IF_NZ (end2); \
      ADD_IF_NZ (end_match_1); \
      ADD_IF_NZ (end_match_2); \
      \
      if (bufp->re_ngroups) \
        { \
          for (i = 0; i < num_regs; i++) \
            { \
              ADD_IF_NZ (regstart[i]); \
              ADD_IF_NZ (regend[i]); \
              ADD_IF_NZ (old_regstart[i]); \
              ADD_IF_NZ (old_regend[i]); \
              ADD_IF_NZ (best_regstart[i]); \
              ADD_IF_NZ (best_regend[i]); \
              ADD_IF_NZ (reg_dummy[i]); \
            } \
        } \
      \
      ADD_IF_NZ (match_end); \
    } \
} while (0)
#endif /* !defined (emacs) || !defined (REL_ALLOC) */

/* RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS is the counterpart for the
   search routine's smaller set of pointers (str1, str2, string1, string2
   and d).  It relies on the ADD_IF_NZ defined above under the same
   condition.  */
#if !defined (emacs) || !defined (REL_ALLOC)
#define RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS()
#else
#define RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS() \
do \
{ \
  Bytecount rmdp_offset = offset_post_relocation (lispobj, orig_buftext); \
  \
  if (rmdp_offset) \
    { \
      ADD_IF_NZ (str1); \
      ADD_IF_NZ (str2); \
      ADD_IF_NZ (string1); \
      ADD_IF_NZ (string2); \
      ADD_IF_NZ (d); \
    } \
} while (0)

#endif /* !defined (emacs) || !defined (REL_ALLOC) */
428 | 1457 |
1458 /* Push pointer POINTER on FAIL_STACK. | |
1459 Return 1 if was able to do so and 0 if ran out of memory allocating | |
1460 space to do so. */ | |
1461 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \ | |
1462 ((FAIL_STACK_FULL () \ | |
1463 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \ | |
1464 ? 0 \ | |
1465 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \ | |
1466 1)) | |
1467 | |
1468 /* Push a pointer value onto the failure stack. | |
1469 Assumes the variable `fail_stack'. Probably should only | |
1470 be called from within `PUSH_FAILURE_POINT'. */ | |
1471 #define PUSH_FAILURE_POINTER(item) \ | |
1472 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item) | |
1473 | |
1474 /* This pushes an integer-valued item onto the failure stack. | |
1475 Assumes the variable `fail_stack'. Probably should only | |
1476 be called from within `PUSH_FAILURE_POINT'. */ | |
1477 #define PUSH_FAILURE_INT(item) \ | |
1478 fail_stack.stack[fail_stack.avail++].integer = (item) | |
1479 | |
1480 /* Push a fail_stack_elt_t value onto the failure stack. | |
1481 Assumes the variable `fail_stack'. Probably should only | |
1482 be called from within `PUSH_FAILURE_POINT'. */ | |
1483 #define PUSH_FAILURE_ELT(item) \ | |
1484 fail_stack.stack[fail_stack.avail++] = (item) | |
1485 | |
1486 /* These three POP... operations complement the three PUSH... operations. | |
1487 All assume that `fail_stack' is nonempty. */ | |
1488 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer | |
1489 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer | |
1490 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail] | |
1491 | |
1492 /* Used to omit pushing failure point id's when we're not debugging. */ | |
1493 #ifdef DEBUG | |
1494 #define DEBUG_PUSH PUSH_FAILURE_INT | |
1495 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT () | |
1496 #else | |
1497 #define DEBUG_PUSH(item) | |
1498 #define DEBUG_POP(item_addr) | |
1499 #endif | |
1500 | |
1501 | |
1502 /* Push the information about the state we will need | |
1503 if we ever fail back to it. | |
1504 | |
1505 Requires variables fail_stack, regstart, regend, reg_info, and | |
1506 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be | |
1507 declared. | |
1508 | |
1509 Does `return FAILURE_CODE' if runs out of memory. */ | |
1510 | |
/* DOUBLE_FAIL_STACK's documentation asks for a variable `destination';
   declare one only when neither REGEX_MALLOC nor REGEX_REL_ALLOC is
   defined.  */
#if !defined (REGEX_MALLOC) && !defined (REGEX_REL_ALLOC)
#define DECLARE_DESTINATION char *destination
#else
#define DECLARE_DESTINATION DECLARE_NOTHING
#endif

/* Save a failure point on `fail_stack'.

   The stack is first doubled as often as needed to hold
   NUM_FAILURE_ITEMS more entries; if doubling fails, the enclosing
   function returns FAILURE_CODE.  After each successful doubling,
   RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS is run.

   The items are then pushed in this order, which POP_FAILURE_POINT undoes
   in reverse: for every register from lowest_active_reg through
   highest_active_reg its start, end and info word; then
   lowest_active_reg, highest_active_reg, PATTERN_PLACE, STRING_PLACE and,
   in DEBUG builds only, the failure id.  */
#define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
do { \
  DECLARE_DESTINATION; \
  /* Must be int, so when we don't save any registers, the arithmetic \
     of 0 + -1 isn't done as unsigned. */ \
  int this_reg; \
  \
  DEBUG_STATEMENT (failure_id++); \
  DEBUG_STATEMENT (nfailure_points_pushed++); \
  DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%d:\n", failure_id); \
  DEBUG_PRINT2 (" Before push, next avail: %ld\n", \
                (long) (fail_stack).avail); \
  DEBUG_PRINT2 (" size: %ld\n", \
                (long) (fail_stack).size); \
  \
  DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
  DEBUG_PRINT2 (" available: %ld\n", \
                (long) REMAINING_AVAIL_SLOTS); \
  \
  /* Ensure we have enough space allocated for what we will push. */ \
  while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
    { \
      BEGIN_REGEX_MALLOC_OK (); \
      if (!DOUBLE_FAIL_STACK (fail_stack)) \
        { \
          END_REGEX_MALLOC_OK (); \
          UNBIND_REGEX_MALLOC_CHECK (); \
          return failure_code; \
        } \
      END_REGEX_MALLOC_OK (); \
      DEBUG_PRINT2 ("\n Doubled stack; size now: %ld\n", \
                    (long) (fail_stack).size); \
      DEBUG_PRINT2 (" slots available: %ld\n", \
                    (long) REMAINING_AVAIL_SLOTS); \
      \
      RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); \
    } \
  \
  /* Push the info, starting with the registers. */ \
  DEBUG_PRINT1 ("\n"); \
  \
  for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
       this_reg++) \
    { \
      DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
      DEBUG_STATEMENT (num_regs_pushed++); \
      \
      DEBUG_PRINT2 (" start: 0x%lx\n", (long) regstart[this_reg]); \
      PUSH_FAILURE_POINTER (regstart[this_reg]); \
      \
      DEBUG_PRINT2 (" end: 0x%lx\n", (long) regend[this_reg]); \
      PUSH_FAILURE_POINTER (regend[this_reg]); \
      \
      DEBUG_PRINT2 (" info: 0x%lx\n ", \
                    * (long *) (&reg_info[this_reg])); \
      DEBUG_PRINT2 (" match_null=%d", \
                    REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
      DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
      DEBUG_PRINT2 (" matched_something=%d", \
                    MATCHED_SOMETHING (reg_info[this_reg])); \
      DEBUG_PRINT2 (" ever_matched_something=%d", \
                    EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
      DEBUG_PRINT1 ("\n"); \
      PUSH_FAILURE_ELT (reg_info[this_reg].word); \
    } \
  \
  DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg); \
  PUSH_FAILURE_INT (lowest_active_reg); \
  \
  DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg); \
  PUSH_FAILURE_INT (highest_active_reg); \
  \
  DEBUG_PRINT2 (" Pushing pattern 0x%lx: \n", (long) pattern_place); \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
  PUSH_FAILURE_POINTER (pattern_place); \
  \
  DEBUG_PRINT2 (" Pushing string 0x%lx: `", (long) string_place); \
  DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
                             size2); \
  DEBUG_PRINT1 ("'\n"); \
  PUSH_FAILURE_POINTER (string_place); \
  \
  DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
  DEBUG_PUSH (failure_id); \
} while (0)
428 | 1602 |
1603 /* This is the number of items that are pushed and popped on the stack | |
1604 for each register. */ | |
1605 #define NUM_REG_ITEMS 3 | |
1606 | |
1607 /* Individual items aside from the registers. */ | |
1608 #ifdef DEBUG | |
1609 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */ | |
1610 #else | |
1611 #define NUM_NONREG_ITEMS 4 | |
1612 #endif | |
1613 | |
1614 /* We push at most this many items on the stack. */ | |
1615 /* We used to use (num_regs - 1), which is the number of registers | |
1616 this regexp will save; but that was changed to 5 | |
1617 to avoid stack overflow for a regexp with lots of parens. */ | |
1618 #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS) | |
1619 | |
1620 /* We actually push this many items. */ | |
1621 #define NUM_FAILURE_ITEMS \ | |
1622 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \ | |
1623 + NUM_NONREG_ITEMS) | |
1624 | |
1625 /* How many items can still be added to the stack without overflowing it. */ | |
1626 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail) | |
1627 | |
1628 | |
/* Pops what PUSH_FAILURE_POINT pushes.

   We restore into the parameters, all of which should be lvalues:
     STR -- the saved data position.
     PAT -- the saved pattern position.
     LOW_REG, HIGH_REG -- the lowest and highest active registers.
     REGSTART, REGEND -- arrays of string positions.
     REG_INFO -- array of information about each subexpression.

   Items come off in the reverse of the order in which they were pushed:
   the failure id (DEBUG builds only), the string position, the pattern
   position, HIGH_REG, LOW_REG, and then, for each register from HIGH_REG
   down to LOW_REG, its info word, end and start.  A saved string position
   of NULL leaves STR unchanged.  `set_regs_matched_done' is reset to 0.

   Also assumes the variables `fail_stack' and (if debugging), `bufp',
   `pend', `string1', `size1', `string2', and `size2'.  */

#define POP_FAILURE_POINT(str, pat, low_reg, high_reg, \
                          regstart, regend, reg_info) \
do { \
  DEBUG_STATEMENT (fail_stack_elt_t ffailure_id;) \
  int this_reg; \
  const unsigned char *string_temp; \
  \
  assert (!FAIL_STACK_EMPTY ()); \
  \
  /* Remove failure points and point to how many regs pushed. */ \
  DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
  DEBUG_PRINT2 (" Before pop, next avail: %ld\n", \
                (long) fail_stack.avail); \
  DEBUG_PRINT2 (" size: %ld\n", \
                (long) fail_stack.size); \
  \
  assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
  \
  DEBUG_POP (&ffailure_id.integer); \
  DEBUG_PRINT2 (" Popping failure id: %d\n", \
                * (int *) &ffailure_id); \
  \
  /* If the saved string location is NULL, it came from an \
     on_failure_keep_string_jump opcode, and we want to throw away the \
     saved NULL, thus retaining our current position in the string. */ \
  string_temp = POP_FAILURE_POINTER (); \
  if (string_temp != NULL) \
    str = string_temp; \
  \
  DEBUG_PRINT2 (" Popping string 0x%lx: `", (long) str); \
  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
  DEBUG_PRINT1 ("'\n"); \
  \
  pat = (unsigned char *) POP_FAILURE_POINTER (); \
  DEBUG_PRINT2 (" Popping pattern 0x%lx: ", (long) pat); \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
  \
  /* Restore register info. */ \
  high_reg = POP_FAILURE_INT (); \
  DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
  \
  low_reg = POP_FAILURE_INT (); \
  DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
  \
  for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
    { \
      DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
      \
      reg_info[this_reg].word = POP_FAILURE_ELT (); \
      DEBUG_PRINT2 (" info: 0x%lx\n", \
                    * (long *) &reg_info[this_reg]); \
      \
      regend[this_reg] = POP_FAILURE_POINTER (); \
      DEBUG_PRINT2 (" end: 0x%lx\n", (long) regend[this_reg]); \
      \
      regstart[this_reg] = POP_FAILURE_POINTER (); \
      DEBUG_PRINT2 (" start: 0x%lx\n", (long) regstart[this_reg]); \
    } \
  \
  set_regs_matched_done = 0; \
  DEBUG_STATEMENT (nfailure_points_popped++); \
} while (0) /* POP_FAILURE_POINT */
428 | 1703 |
1704 | |
1705 | |
1706 /* Structure for per-register (a.k.a. per-group) information. | |
1707 Other register information, such as the | |
1708 starting and ending positions (which are addresses), and the list of | |
1709 inner groups (which is a bit list), is maintained in separate | |
1710 variables. | |
1711 | |
1712 We are making a (strictly speaking) nonportable assumption here: that | |
1713 the compiler will pack our bit fields into something that fits into | |
1714 the type of `word', i.e., into one item on the failure stack; | |
1715 POP_FAILURE_POINT restores a whole register_info_type through `word'. */ | |
1716 | |
1717 typedef union | |
1718 { | |
1719 fail_stack_elt_t word; /* The whole union viewed as one failure-stack item. */ | |
1720 struct | |
1721 { | |
1722 /* This field is one if this group can match the empty string, | |
1723 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */ | |
1724 #define MATCH_NULL_UNSET_VALUE 3 | |
647 | 1725 unsigned int match_null_string_p : 2; |
1726 unsigned int is_active : 1; | |
1727 unsigned int matched_something : 1; /* Set to 1 by SET_REGS_MATCHED. */ | |
1728 unsigned int ever_matched_something : 1; /* Likewise set to 1 by SET_REGS_MATCHED. */ | |
428 | 1729 } bits; |
1730 } register_info_type; | |
1731 /* Accessors for the individual bit fields of a register_info_type R. */ | |
1732 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p) | |
1733 #define IS_ACTIVE(R) ((R).bits.is_active) | |
1734 #define MATCHED_SOMETHING(R) ((R).bits.matched_something) | |
1735 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something) | |
1736 | |
1737 | |
1738 /* Call this when we have matched a real character; it sets the | |
1739 MATCHED_SOMETHING and EVER_MATCHED_SOMETHING flags of every register from | |
1740 lowest_active_reg through highest_active_reg. The work is skipped while `set_regs_matched_done' is nonzero; POP_FAILURE_POINT resets that flag to 0. */ | |
1741 #define SET_REGS_MATCHED() \ | |
1742 do \ | |
1743 { \ | |
1744 if (!set_regs_matched_done) \ | |
1745 { \ | |
647 | 1746 int r; \ |
428 | 1747 set_regs_matched_done = 1; \ |
1748 for (r = lowest_active_reg; r <= highest_active_reg; r++) \ | |
1749 { \ | |
1750 MATCHED_SOMETHING (reg_info[r]) \ | |
1751 = EVER_MATCHED_SOMETHING (reg_info[r]) \ | |
1752 = 1; \ | |
1753 } \ | |
1754 } \ | |
1755 } \ | |
1756 while (0) | |
1757 | |
1758 /* Registers are set to a sentinel when they haven't yet matched. The sentinel is the address of a private dummy byte; REG_UNSET (E) tests a register pointer E against it. */ | |
446 | 1759 static unsigned char reg_unset_dummy; |
428 | 1760 #define REG_UNSET_VALUE (&reg_unset_dummy) |
1761 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE) | |
1762 | |
1763 /* Subroutine declarations and macros for regex_compile. */ | |
1764 | |
1765 /* Fetch the next character in the uncompiled pattern---translating it | |
826 | 1766 if necessary. Like PATFETCH_RAW, may return REG_EEND from the enclosing function. */ |
428 | 1767 #define PATFETCH(c) \ |
446 | 1768 do { \ |
1769 PATFETCH_RAW (c); \ | |
826 | 1770 c = RE_TRANSLATE (c); \ |
428 | 1771 } while (0) |
1772 | |
1773 /* Fetch the next character in the uncompiled pattern, with no | |
1774 translation, and advance `p' past it. If the pattern is already exhausted (p == pend), return REG_EEND from the enclosing function. */ | |
1775 #define PATFETCH_RAW(c) \ | |
1776 do {if (p == pend) return REG_EEND; \ | |
1777 assert (p < pend); \ | |
867 | 1778 c = itext_ichar (p); \ |
1779 INC_IBYTEPTR (p); \ | |
428 | 1780 } while (0) |
1781 | |
1782 /* Go backwards one character in the pattern. */ | |
867 | 1783 #define PATUNFETCH DEC_IBYTEPTR (p) |
428 | 1784 |
1785 /* If a translation is in effect (TRANSLATE_P (translate)), return | |
1786 RE_TRANSLATE_1 (D), else just D. NOTE(review): earlier wording here | |
1787 spoke of indexing translate[D] with an unsigned subscript; that detail, | |
1788 if still relevant, now lives behind RE_TRANSLATE_1 -- confirm there. */ | |
826 | 1789 #define RE_TRANSLATE(d) \ |
1790 (TRANSLATE_P (translate) ? RE_TRANSLATE_1 (d) : (d)) | |
428 | 1791 |
1792 /* Macros for outputting the compiled pattern into `buffer'. */ | |
1793 | |
1794 /* Initial size of the compiled-pattern buffer; regex_compile uses it when bufp->allocated is 0 on entry. */ | |
1795 #define INIT_BUF_SIZE 32 | |
1796 | |
1797 /* Make sure we have at least N more bytes of space in buffer, calling EXTEND_BUFFER as often as needed. EXTEND_BUFFER may move the buffer and may return an error code from the enclosing function. */ | |
1798 #define GET_BUFFER_SPACE(n) \ | |
647 | 1799 while (buf_end - bufp->buffer + (n) > (ptrdiff_t) bufp->allocated) \ |
428 | 1800 EXTEND_BUFFER () |
1801 | |
1802 /* Make sure we have one more byte of buffer space and then add C to it, converted to unsigned char, advancing `buf_end'. */ | |
1803 #define BUF_PUSH(c) \ | |
1804 do { \ | |
1805 GET_BUFFER_SPACE (1); \ | |
446 | 1806 *buf_end++ = (unsigned char) (c); \ |
428 | 1807 } while (0) |
1808 | |
1809 | |
1810 /* Ensure we have two more bytes of buffer space and then append C1 and C2, in that order. */ | |
1811 #define BUF_PUSH_2(c1, c2) \ | |
1812 do { \ | |
1813 GET_BUFFER_SPACE (2); \ | |
446 | 1814 *buf_end++ = (unsigned char) (c1); \ |
1815 *buf_end++ = (unsigned char) (c2); \ | |
428 | 1816 } while (0) |
1817 | |
1818 | |
1819 /* As with BUF_PUSH_2, except for three bytes: C1, C2, then C3. */ | |
1820 #define BUF_PUSH_3(c1, c2, c3) \ | |
1821 do { \ | |
1822 GET_BUFFER_SPACE (3); \ | |
446 | 1823 *buf_end++ = (unsigned char) (c1); \ |
1824 *buf_end++ = (unsigned char) (c2); \ | |
1825 *buf_end++ = (unsigned char) (c3); \ | |
428 | 1826 } while (0) |
1827 | |
1828 | |
1829 /* Store a jump with opcode OP at LOC to location TO. We store a | |
1830 relative address offset by the three bytes the jump itself occupies, i.e. TO - LOC - 3. LOC appears twice in the expansion, so it must have no side effects. */ | |
1831 #define STORE_JUMP(op, loc, to) \ | |
1832 store_op1 (op, loc, (to) - (loc) - 3) | |
1833 | |
1834 /* Likewise, for a two-argument jump; ARG is passed through as the second operand. */ | |
1835 #define STORE_JUMP2(op, loc, to, arg) \ | |
1836 store_op2 (op, loc, (to) - (loc) - 3, arg) | |
1837 | |
446 | 1838 /* Like `STORE_JUMP', but for inserting. Assume `buf_end' is the |
1839 buffer end. The macro does not advance `buf_end'; regex_compile adds 3 to it after each use. */ | |
428 | 1840 #define INSERT_JUMP(op, loc, to) \ |
446 | 1841 insert_op1 (op, loc, (to) - (loc) - 3, buf_end) |
1842 | |
1843 /* Like `STORE_JUMP2', but for inserting. Assume `buf_end' is the | |
1844 buffer end; as with INSERT_JUMP, `buf_end' itself is not advanced here. */ | |
428 | 1845 #define INSERT_JUMP2(op, loc, to, arg) \ |
446 | 1846 insert_op2 (op, loc, (to) - (loc) - 3, arg, buf_end) |
428 | 1847 |
1848 | |
1849 /* This is not an arbitrary limit: the arguments which represent offsets | |
1850 into the pattern are two bytes long. So if 2^16 bytes turns out to | |
1851 be too small, many things would have to change. */ | |
1852 #define MAX_BUF_SIZE (1L << 16) | |
1853 | |
1854 | |
1855 /* Double the allocated size of the buffer (capped at MAX_BUF_SIZE) via | |
1856 xrealloc and reset the pointers that pointed into the old block to point | |
1857 to the correct places in the new one. If the buffer is already | |
1858 MAX_BUF_SIZE bytes, return REG_ESIZE from the enclosing function; if xrealloc yields NULL, return REG_ESPACE. NOTE(review): these returns do not go through FREE_STACK_RETURN -- confirm the compile stack is not leaked on these paths. */ | |
1333 | 1859 #define EXTEND_BUFFER() \ |
1860 do { \ | |
1861 re_char *old_buffer = bufp->buffer; \ | |
1862 if (bufp->allocated == MAX_BUF_SIZE) \ | |
1863 return REG_ESIZE; \ | |
1864 bufp->allocated <<= 1; \ | |
1865 if (bufp->allocated > MAX_BUF_SIZE) \ | |
1866 bufp->allocated = MAX_BUF_SIZE; \ | |
1867 bufp->buffer = \ | |
1868 (unsigned char *) xrealloc (bufp->buffer, bufp->allocated); \ | |
1869 if (bufp->buffer == NULL) \ | |
1870 return REG_ESPACE; \ | |
1871 /* If the buffer moved, rebase every pointer into it by its old offset. */ \ | |
1872 if (old_buffer != bufp->buffer) \ | |
1873 { \ | |
1874 buf_end = (buf_end - old_buffer) + bufp->buffer; \ | |
1875 begalt = (begalt - old_buffer) + bufp->buffer; \ | |
1876 if (fixup_alt_jump) \ | |
1877 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer; \ | |
1878 if (laststart) \ | |
1879 laststart = (laststart - old_buffer) + bufp->buffer; \ | |
1880 if (pending_exact) \ | |
1881 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \ | |
1882 } \ | |
428 | 1883 } while (0) |
1884 | |
1885 | |
1886 /* Since we have one byte reserved for the register number argument to | |
1887 {start,stop}_memory, the maximum number of groups we can report | |
1888 things about is what fits in that byte. */ | |
1889 #define MAX_REGNUM 255 | |
1890 | |
1891 /* But patterns can have more than `MAX_REGNUM' registers. We just | |
502 | 1892 ignore the excess. |
1893 #### not true! groups past this will fail in lots of ways, if we | |
1894 ever have to backtrack. | |
1895 */ | |
647 | 1896 typedef int regnum_t; |
428 | 1897 /* Initial element count of bufp->external_to_internal_register, allocated in regex_compile when that pointer is 0. */ |
502 | 1898 #define INIT_REG_TRANSLATE_SIZE 5 |
428 | 1899 |
1900 /* Macros for the compile stack. */ | |
1901 | |
1902 /* Since offsets can go either forwards or backwards, this type needs to | |
1903 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */ | |
1904 typedef int pattern_offset_t; | |
1905 /* One entry of the compile stack, which regex_compile uses to keep track of unclosed groups. NOTE(review): the offset fields presumably index the compiled-pattern buffer -- confirm at the push/pop sites. */ | |
1906 typedef struct | |
1907 { | |
1908 pattern_offset_t begalt_offset; | |
1909 pattern_offset_t fixup_alt_jump; | |
1910 pattern_offset_t inner_group_offset; | |
1911 pattern_offset_t laststart_offset; | |
1912 regnum_t regnum; | |
1913 } compile_stack_elt_t; | |
1914 | |
1915 | |
1916 typedef struct | |
1917 { | |
1918 compile_stack_elt_t *stack; | |
647 | 1919 int size; /* Number of elements allocated in `stack'. */ |
1920 int avail; /* Offset of next open position. */ | |
428 | 1921 } compile_stack_type; |
1922 | |
1923 /* Number of elements regex_compile allocates for the compile stack initially. */ | |
1924 #define INIT_COMPILE_STACK_SIZE 32 | |
1925 /* The three macros below operate on a variable named `compile_stack' in the caller's scope. */ | |
1926 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0) | |
1927 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) | |
1928 | |
1929 /* The next available element. */ | |
1930 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) | |
1931 | |
1932 | |
1933 /* Set the bit for character C in the bit vector that starts at `buf_end'. C is converted to unsigned char; it is parenthesized in both uses so that an expression argument selects the same byte and bit. */ | |
1934 #define SET_LIST_BIT(c) \ | |
446 | 1935 (buf_end[((unsigned char) (c)) / BYTEWIDTH] \ |
428 | 1936 |= 1 << (((unsigned char) (c)) % BYTEWIDTH)) |
1937 | |
1938 #ifdef MULE | |
1939 | |
1940 /* Set the "bit" for character C in a range table: map the one-character range C..C to Qt in `rtab', a variable in the caller's scope. */ | |
1941 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt) | |
1942 | |
1943 /* Set the "bit" for character c in the appropriate table: the range table when the caller's `has_extended_chars' is set, otherwise the bit vector handled by SET_LIST_BIT. */ | |
1944 #define SET_EITHER_BIT(c) \ | |
1945 do { \ | |
1946 if (has_extended_chars) \ | |
1947 SET_RANGETAB_BIT (c); \ | |
1948 else \ | |
1949 SET_LIST_BIT (c); \ | |
1950 } while (0) | |
1951 | |
1952 #else /* not MULE */ | |
1953 /* Without MULE there is only the bit vector. */ | |
1954 #define SET_EITHER_BIT(c) SET_LIST_BIT (c) | |
1955 | |
1956 #endif /* MULE */ | |
1957 | |
1958 | |
1959 /* Get the next unsigned number in the uncompiled pattern. Uses the caller's `p', `pend' and `c'. NUM is left untouched if the next character is not a digit (or the pattern is exhausted); a negative NUM is reset to 0 before the first digit is accumulated. Afterwards `c' holds the last character fetched. NOTE(review): NUM is not checked for overflow, and the expansion is a bare brace block rather than do ... while (0). */ | |
1960 #define GET_UNSIGNED_NUMBER(num) \ | |
1961 { if (p != pend) \ | |
1962 { \ | |
1963 PATFETCH (c); \ | |
1964 while (ISDIGIT (c)) \ | |
1965 { \ | |
1966 if (num < 0) \ | |
1967 num = 0; \ | |
1968 num = num * 10 + c - '0'; \ | |
1969 if (p == pend) \ | |
1970 break; \ | |
1971 PATFETCH (c); \ | |
1972 } \ | |
1973 } \ | |
1974 } | |
1975 | |
1976 #define CHAR_CLASS_MAX_LENGTH 6 /* Length of the longest class name below, namely `xdigit'. */ | |
1977 /* Nonzero if STRING is one of the twelve character-class names listed here. STRING appears in every comparison, so it must have no side effects. */ | |
1978 #define IS_CHAR_CLASS(string) \ | |
1979 (STREQ (string, "alpha") || STREQ (string, "upper") \ | |
1980 || STREQ (string, "lower") || STREQ (string, "digit") \ | |
1981 || STREQ (string, "alnum") || STREQ (string, "xdigit") \ | |
1982 || STREQ (string, "space") || STREQ (string, "print") \ | |
1983 || STREQ (string, "punct") || STREQ (string, "graph") \ | |
1984 || STREQ (string, "cntrl") || STREQ (string, "blank")) | |
1985 | |
1986 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg); /* Used by STORE_JUMP. */ | |
1987 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2); /* Used by STORE_JUMP2. */ | |
1988 static void insert_op1 (re_opcode_t op, unsigned char *loc, int arg, | |
1989 unsigned char *end); /* Used by INSERT_JUMP. */ | |
1990 static void insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, | |
1991 unsigned char *end); /* Used by INSERT_JUMP2. */ | |
460 | 1992 static re_bool at_begline_loc_p (re_char *pattern, re_char *p, |
428 | 1993 reg_syntax_t syntax); |
460 | 1994 static re_bool at_endline_loc_p (re_char *p, re_char *pend, int syntax); /* NOTE(review): `int' here but reg_syntax_t in at_begline_loc_p -- confirm this is intended. */ |
1995 static re_bool group_in_compile_stack (compile_stack_type compile_stack, | |
428 | 1996 regnum_t regnum); |
446 | 1997 static reg_errcode_t compile_range (re_char **p_ptr, re_char *pend, |
1998 RE_TRANSLATE_TYPE translate, | |
1999 reg_syntax_t syntax, | |
428 | 2000 unsigned char *b); |
2001 #ifdef MULE | |
446 | 2002 static reg_errcode_t compile_extended_range (re_char **p_ptr, |
2003 re_char *pend, | |
2004 RE_TRANSLATE_TYPE translate, | |
428 | 2005 reg_syntax_t syntax, |
2006 Lisp_Object rtab); | |
2007 #endif /* MULE */ | |
460 | 2008 static re_bool group_match_null_string_p (unsigned char **p, |
428 | 2009 unsigned char *end, |
2010 register_info_type *reg_info); | |
460 | 2011 static re_bool alt_match_null_string_p (unsigned char *p, unsigned char *end, |
428 | 2012 register_info_type *reg_info); |
460 | 2013 static re_bool common_op_match_null_string_p (unsigned char **p, |
428 | 2014 unsigned char *end, |
2015 register_info_type *reg_info); | |
826 | 2016 static int bcmp_translate (re_char *s1, re_char *s2, |
2017 REGISTER int len, RE_TRANSLATE_TYPE translate | |
2018 #ifdef emacs | |
2019 , Internal_Format fmt, Lisp_Object lispobj | |
2020 #endif /* emacs */ | |
2021 ); | |
428 | 2022 static int re_match_2_internal (struct re_pattern_buffer *bufp, |
446 | 2023 re_char *string1, int size1, |
2024 re_char *string2, int size2, int pos, | |
826 | 2025 struct re_registers *regs, int stop |
2026 RE_LISP_CONTEXT_ARGS_DECL); | |
428 | 2027 |
2028 #ifndef MATCH_MAY_ALLOCATE | |
2029 | |
2030 /* If we cannot allocate large objects within re_match_2_internal, | |
2031 we make the fail stack and register vectors global. | |
2032 The fail stack, we grow to the maximum size when a regexp | |
2033 is compiled. | |
2034 The register vectors, we adjust in size each time we | |
2035 compile a regexp, according to the number of registers it needs. */ | |
2036 | |
2037 static fail_stack_type fail_stack; | |
2038 | |
2039 /* Size with which the following vectors are currently allocated. | |
2040 That is so we can make them bigger as needed, | |
2041 but never make them smaller. */ | |
2042 static int regs_allocated_size; | |
2043 | |
446 | 2044 static re_char ** regstart, ** regend; |
2045 static re_char ** old_regstart, ** old_regend; | |
2046 static re_char **best_regstart, **best_regend; | |
428 | 2047 static register_info_type *reg_info; |
446 | 2048 static re_char **reg_dummy; |
428 | 2049 static register_info_type *reg_info_dummy; |
2050 | |
2051 /* Make the register vectors big enough for NUM_REGS registers, | |
2052 but don't make them smaller. All nine vectors are resized together and regs_allocated_size records the new size. Returns nothing, hence the explicit `void'. */ | |
2053 | |
2054 static void | |
2055 regex_grow_registers (int num_regs) | |
2056 { | |
2057 if (num_regs > regs_allocated_size) | |
2058 { | |
551 | 2059 RETALLOC (regstart, num_regs, re_char *); |
2060 RETALLOC (regend, num_regs, re_char *); | |
2061 RETALLOC (old_regstart, num_regs, re_char *); | |
2062 RETALLOC (old_regend, num_regs, re_char *); | |
2063 RETALLOC (best_regstart, num_regs, re_char *); | |
2064 RETALLOC (best_regend, num_regs, re_char *); | |
2065 RETALLOC (reg_info, num_regs, register_info_type); | |
2066 RETALLOC (reg_dummy, num_regs, re_char *); | |
2067 RETALLOC (reg_info_dummy, num_regs, register_info_type); | |
428 | 2068 |
2069 regs_allocated_size = num_regs; | |
2070 } | |
2071 } | |
2072 | |
2073 #endif /* not MATCH_MAY_ALLOCATE */ | |
2074 | |
2075 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. | |
2076 Returns one of error codes defined in `regex.h', or zero for success. | |
2077 | |
2078 Assumes the `allocated' (and perhaps `buffer') and `translate' | |
2079 fields are set in BUFP on entry. | |
2080 | |
2081 If it succeeds, results are put in BUFP (if it returns an error, the | |
2082 contents of BUFP are undefined): | |
2083 `buffer' is the compiled pattern; | |
2084 `syntax' is set to SYNTAX; | |
2085 `used' is set to the length of the compiled pattern; | |
2086 `fastmap_accurate' is zero; | |
502 | 2087 `re_ngroups' is the number of groups/subexpressions (including shy |
2088 groups) in PATTERN; | |
2089 `re_nsub' is the number of non-shy groups in PATTERN; | |
428 | 2090 `not_bol' and `not_eol' are zero; |
2091 | |
2092 The `fastmap' and `newline_anchor' fields are neither | |
2093 examined nor set. */ | |
2094 | |
2095 /* Return VALUE from the enclosing function (regex_compile), first freeing the compile stack that regex_compile allocated. */ | |
1726 | 2096 #define FREE_STACK_RETURN(value) \ |
2097 do \ | |
2098 { \ | |
2099 xfree (compile_stack.stack, compile_stack_elt_t *); \ | |
2100 return value; \ | |
1333 | 2101 } while (0) |
428 | 2102 |
2103 static reg_errcode_t | |
446 | 2104 regex_compile (re_char *pattern, int size, reg_syntax_t syntax, |
428 | 2105 struct re_pattern_buffer *bufp) |
2106 { | |
2107 /* We fetch characters from PATTERN here. We declare these as int | |
2108 (or possibly long) so that chars above 127 can be used as | |
2109 array indices. The macros that fetch a character from the pattern | |
2110 make sure to coerce to unsigned char before assigning, so we won't | |
2111 get bitten by negative numbers here. */ | |
2112 /* XEmacs change: used to be unsigned char. */ | |
2113 REGISTER EMACS_INT c, c1; | |
2114 | |
2115 /* A random temporary spot in PATTERN. */ | |
446 | 2116 re_char *p1; |
428 | 2117 |
2118 /* Points to the end of the buffer, where we should append. */ | |
446 | 2119 REGISTER unsigned char *buf_end; |
428 | 2120 |
2121 /* Keeps track of unclosed groups. */ | |
2122 compile_stack_type compile_stack; | |
2123 | |
2124 /* Points to the current (ending) position in the pattern. */ | |
446 | 2125 re_char *p = pattern; |
2126 re_char *pend = pattern + size; | |
428 | 2127 |
2128 /* How to translate the characters in the pattern. */ | |
446 | 2129 RE_TRANSLATE_TYPE translate = bufp->translate; |
428 | 2130 |
2131 /* Address of the count-byte of the most recently inserted `exactn' | |
2132 command. This makes it possible to tell if a new exact-match | |
2133 character can be added to that command or if the character requires | |
2134 a new `exactn' command. */ | |
2135 unsigned char *pending_exact = 0; | |
2136 | |
2137 /* Address of start of the most recently finished expression. | |
2138 This tells, e.g., postfix * where to find the start of its | |
2139 operand. Reset at the beginning of groups and alternatives. */ | |
2140 unsigned char *laststart = 0; | |
2141 | |
2142 /* Address of beginning of regexp, or inside of last group. */ | |
2143 unsigned char *begalt; | |
2144 | |
2145 /* Place in the uncompiled pattern (i.e., the {) to | |
2146 which to go back if the interval is invalid. */ | |
446 | 2147 re_char *beg_interval; |
428 | 2148 |
2149 /* Address of the place where a forward jump should go to the end of | |
2150 the containing expression. Each alternative of an `or' -- except the | |
2151 last -- ends with a forward jump of this sort. */ | |
2152 unsigned char *fixup_alt_jump = 0; | |
2153 | |
2154 /* Counts open-groups as they are encountered. Remembered for the | |
2155 matching close-group on the compile stack, so the same register | |
2156 number is put in the stop_memory as the start_memory. */ | |
2157 regnum_t regnum = 0; | |
2158 | |
2159 #ifdef DEBUG | |
2160 DEBUG_PRINT1 ("\nCompiling pattern: "); | |
2161 if (debug) | |
2162 { | |
647 | 2163 int debug_count; |
428 | 2164 |
2165 for (debug_count = 0; debug_count < size; debug_count++) | |
2166 putchar (pattern[debug_count]); | |
2167 putchar ('\n'); | |
2168 } | |
2169 #endif /* DEBUG */ | |
2170 | |
2171 /* Initialize the compile stack. */ | |
2172 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); | |
2173 if (compile_stack.stack == NULL) | |
2174 return REG_ESPACE; | |
2175 | |
2176 compile_stack.size = INIT_COMPILE_STACK_SIZE; | |
2177 compile_stack.avail = 0; | |
2178 | |
2179 /* Initialize the pattern buffer. */ | |
2180 bufp->syntax = syntax; | |
2181 bufp->fastmap_accurate = 0; | |
2182 bufp->not_bol = bufp->not_eol = 0; | |
2183 | |
2184 /* Set `used' to zero, so that if we return an error, the pattern | |
2185 printer (for debugging) will think there's no pattern. We reset it | |
2186 at the end. */ | |
2187 bufp->used = 0; | |
2188 | |
2189 /* Always count groups, whether or not bufp->no_sub is set. */ | |
2190 bufp->re_nsub = 0; | |
502 | 2191 bufp->re_ngroups = 0; |
2192 | |
2193 bufp->warned_about_incompatible_back_references = 0; | |
2194 | |
2195 if (bufp->external_to_internal_register == 0) | |
2196 { | |
2197 bufp->external_to_internal_register_size = INIT_REG_TRANSLATE_SIZE; | |
2198 RETALLOC (bufp->external_to_internal_register, | |
2199 bufp->external_to_internal_register_size, | |
2200 int); | |
2201 } | |
2202 | |
2203 { | |
2204 int i; | |
2205 | |
2206 bufp->external_to_internal_register[0] = 0; | |
2207 for (i = 1; i < bufp->external_to_internal_register_size; i++) | |
2208 bufp->external_to_internal_register[i] = (int) 0xDEADBEEF; | |
2209 } | |
428 | 2210 |
2211 #if !defined (emacs) && !defined (SYNTAX_TABLE) | |
2212 /* Initialize the syntax table. */ | |
2213 init_syntax_once (); | |
2214 #endif | |
2215 | |
2216 if (bufp->allocated == 0) | |
2217 { | |
2218 if (bufp->buffer) | |
2219 { /* If zero allocated, but buffer is non-null, try to realloc | |
2220 enough space. This loses if buffer's address is bogus, but | |
2221 that is the user's responsibility. */ | |
2222 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char); | |
2223 } | |
2224 else | |
2225 { /* Caller did not allocate a buffer. Do it for them. */ | |
2226 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char); | |
2227 } | |
2228 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE); | |
2229 | |
2230 bufp->allocated = INIT_BUF_SIZE; | |
2231 } | |
2232 | |
446 | 2233 begalt = buf_end = bufp->buffer; |
428 | 2234 |
2235 /* Loop through the uncompiled pattern until we're at the end. */ | |
2236 while (p != pend) | |
2237 { | |
2238 PATFETCH (c); | |
2239 | |
2240 switch (c) | |
2241 { | |
2242 case '^': | |
2243 { | |
2244 if ( /* If at start of pattern, it's an operator. */ | |
2245 p == pattern + 1 | |
2246 /* If context independent, it's an operator. */ | |
2247 || syntax & RE_CONTEXT_INDEP_ANCHORS | |
2248 /* Otherwise, depends on what's come before. */ | |
2249 || at_begline_loc_p (pattern, p, syntax)) | |
2250 BUF_PUSH (begline); | |
2251 else | |
2252 goto normal_char; | |
2253 } | |
2254 break; | |
2255 | |
2256 | |
2257 case '$': | |
2258 { | |
2259 if ( /* If at end of pattern, it's an operator. */ | |
2260 p == pend | |
2261 /* If context independent, it's an operator. */ | |
2262 || syntax & RE_CONTEXT_INDEP_ANCHORS | |
2263 /* Otherwise, depends on what's next. */ | |
2264 || at_endline_loc_p (p, pend, syntax)) | |
2265 BUF_PUSH (endline); | |
2266 else | |
2267 goto normal_char; | |
2268 } | |
2269 break; | |
2270 | |
2271 | |
2272 case '+': | |
2273 case '?': | |
2274 if ((syntax & RE_BK_PLUS_QM) | |
2275 || (syntax & RE_LIMITED_OPS)) | |
2276 goto normal_char; | |
2277 handle_plus: | |
2278 case '*': | |
2279 /* If there is no previous pattern... */ | |
2280 if (!laststart) | |
2281 { | |
2282 if (syntax & RE_CONTEXT_INVALID_OPS) | |
2283 FREE_STACK_RETURN (REG_BADRPT); | |
2284 else if (!(syntax & RE_CONTEXT_INDEP_OPS)) | |
2285 goto normal_char; | |
2286 } | |
2287 | |
2288 { | |
2289 /* true means zero/many matches are allowed. */ | |
460 | 2290 re_bool zero_times_ok = c != '+'; |
2291 re_bool many_times_ok = c != '?'; | |
428 | 2292 |
2293 /* true means match shortest string possible. */ | |
460 | 2294 re_bool minimal = false; |
428 | 2295 |
2296 /* If there is a sequence of repetition chars, collapse it | |
2297 down to just one (the right one). We can't combine | |
2298 interval operators with these because of, e.g., `a{2}*', | |
2299 which should only match an even number of `a's. */ | |
2300 while (p != pend) | |
2301 { | |
2302 PATFETCH (c); | |
2303 | |
2304 if (c == '*' || (!(syntax & RE_BK_PLUS_QM) | |
2305 && (c == '+' || c == '?'))) | |
2306 ; | |
2307 | |
2308 else if (syntax & RE_BK_PLUS_QM && c == '\\') | |
2309 { | |
2310 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); | |
2311 | |
2312 PATFETCH (c1); | |
2313 if (!(c1 == '+' || c1 == '?')) | |
2314 { | |
2315 PATUNFETCH; | |
2316 PATUNFETCH; | |
2317 break; | |
2318 } | |
2319 | |
2320 c = c1; | |
2321 } | |
2322 else | |
2323 { | |
2324 PATUNFETCH; | |
2325 break; | |
2326 } | |
2327 | |
2328 /* If we get here, we found another repeat character. */ | |
2329 if (!(syntax & RE_NO_MINIMAL_MATCHING)) | |
2330 { | |
440 | 2331 /* "*?" and "+?" and "??" are okay (and mean match |
2332 minimally), but other sequences (such as "*??" and | |
2333 "+++") are rejected (reserved for future use). */ | |
428 | 2334 if (minimal || c != '?') |
2335 FREE_STACK_RETURN (REG_BADRPT); | |
2336 minimal = true; | |
2337 } | |
2338 else | |
2339 { | |
2340 zero_times_ok |= c != '+'; | |
2341 many_times_ok |= c != '?'; | |
2342 } | |
2343 } | |
2344 | |
2345 /* Star, etc. applied to an empty pattern is equivalent | |
2346 to an empty pattern. */ | |
2347 if (!laststart) | |
2348 break; | |
2349 | |
2350 /* Now we know whether zero matches is allowed | |
2351 and whether two or more matches is allowed | |
2352 and whether we want minimal or maximal matching. */ | |
2353 if (minimal) | |
2354 { | |
2355 if (!many_times_ok) | |
2356 { | |
2357 /* "a??" becomes: | |
2358 0: /on_failure_jump to 6 | |
2359 3: /jump to 9 | |
2360 6: /exactn/1/A | |
2361 9: end of pattern. | |
2362 */ | |
2363 GET_BUFFER_SPACE (6); | |
446 | 2364 INSERT_JUMP (jump, laststart, buf_end + 3); |
2365 buf_end += 3; | |
428 | 2366 INSERT_JUMP (on_failure_jump, laststart, laststart + 6); |
446 | 2367 buf_end += 3; |
428 | 2368 } |
2369 else if (zero_times_ok) | |
2370 { | |
2371 /* "a*?" becomes: | |
2372 0: /jump to 6 | |
2373 3: /exactn/1/A | |
2374 6: /on_failure_jump to 3 | |
2375 9: end of pattern. | |
2376 */ | |
2377 GET_BUFFER_SPACE (6); | |
446 | 2378 INSERT_JUMP (jump, laststart, buf_end + 3); |
2379 buf_end += 3; | |
2380 STORE_JUMP (on_failure_jump, buf_end, laststart + 3); | |
2381 buf_end += 3; | |
428 | 2382 } |
2383 else | |
2384 { | |
2385 /* "a+?" becomes: | |
2386 0: /exactn/1/A | |
2387 3: /on_failure_jump to 0 | |
2388 6: end of pattern. | |
2389 */ | |
2390 GET_BUFFER_SPACE (3); | |
446 | 2391 STORE_JUMP (on_failure_jump, buf_end, laststart); |
2392 buf_end += 3; | |
428 | 2393 } |
2394 } | |
2395 else | |
2396 { | |
2397 /* Are we optimizing this jump? */ | |
460 | 2398 re_bool keep_string_p = false; |
428 | 2399 |
2400 if (many_times_ok) | |
446 | 2401 { /* More than one repetition is allowed, so put in |
2402 at the end a backward relative jump from | |
2403 `buf_end' to before the next jump we're going | |
2404 to put in below (which jumps from laststart to | |
2405 after this jump). | |
428 | 2406 |
2407 But if we are at the `*' in the exact sequence `.*\n', | |
2408 insert an unconditional jump backwards to the ., | |
2409 instead of the beginning of the loop. This way we only | |
2410 push a failure point once, instead of every time | |
2411 through the loop. */ | |
2412 assert (p - 1 > pattern); | |
2413 | |
2414 /* Allocate the space for the jump. */ | |
2415 GET_BUFFER_SPACE (3); | |
2416 | |
2417 /* We know we are not at the first character of the | |
2418 pattern, because laststart was nonzero. And we've | |
2419 already incremented `p', by the way, to be the | |
2420 character after the `*'. Do we have to do something | |
2421 analogous here for null bytes, because of | |
2422 RE_DOT_NOT_NULL? */ | |
446 | 2423 if (*(p - 2) == '.' |
428 | 2424 && zero_times_ok |
446 | 2425 && p < pend && *p == '\n' |
428 | 2426 && !(syntax & RE_DOT_NEWLINE)) |
2427 { /* We have .*\n. */ | |
446 | 2428 STORE_JUMP (jump, buf_end, laststart); |
428 | 2429 keep_string_p = true; |
2430 } | |
2431 else | |
2432 /* Anything else. */ | |
446 | 2433 STORE_JUMP (maybe_pop_jump, buf_end, laststart - 3); |
428 | 2434 |
2435 /* We've added more stuff to the buffer. */ | |
446 | 2436 buf_end += 3; |
428 | 2437 } |
2438 | |
446 | 2439 /* On failure, jump from laststart to buf_end + 3, |
2440 which will be the end of the buffer after this jump | |
2441 is inserted. */ | |
428 | 2442 GET_BUFFER_SPACE (3); |
2443 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump | |
2444 : on_failure_jump, | |
446 | 2445 laststart, buf_end + 3); |
2446 buf_end += 3; | |
428 | 2447 |
2448 if (!zero_times_ok) | |
2449 { | |
2450 /* At least one repetition is required, so insert a | |
2451 `dummy_failure_jump' before the initial | |
2452 `on_failure_jump' instruction of the loop. This | |
2453 effects a skip over that instruction the first time | |
2454 we hit that loop. */ | |
2455 GET_BUFFER_SPACE (3); | |
2456 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6); | |
446 | 2457 buf_end += 3; |
428 | 2458 } |
2459 } | |
2460 pending_exact = 0; | |
2461 } | |
2462 break; | |
2463 | |
2464 | |
2465 case '.': | |
446 | 2466 laststart = buf_end; |
428 | 2467 BUF_PUSH (anychar); |
2468 break; | |
2469 | |
2470 | |
2471 case '[': | |
2472 { | |
2473 /* XEmacs change: this whole section */ | |
460 | 2474 re_bool had_char_class = false; |
428 | 2475 #ifdef MULE |
460 | 2476 re_bool has_extended_chars = false; |
428 | 2477 REGISTER Lisp_Object rtab = Qnil; |
2478 #endif | |
2479 | |
2480 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | |
2481 | |
2482 /* Ensure that we have enough space to push a charset: the | |
2483 opcode, the length count, and the bitset; 34 bytes in all. */ | |
2484 GET_BUFFER_SPACE (34); | |
2485 | |
446 | 2486 laststart = buf_end; |
428 | 2487 |
2488 /* We test `*p == '^' twice, instead of using an if | |
2489 statement, so we only need one BUF_PUSH. */ | |
2490 BUF_PUSH (*p == '^' ? charset_not : charset); | |
2491 if (*p == '^') | |
2492 p++; | |
2493 | |
2494 /* Remember the first position in the bracket expression. */ | |
2495 p1 = p; | |
2496 | |
2497 /* Push the number of bytes in the bitmap. */ | |
2498 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH); | |
2499 | |
2500 /* Clear the whole map. */ | |
446 | 2501 memset (buf_end, 0, (1 << BYTEWIDTH) / BYTEWIDTH); |
428 | 2502 |
2503 /* charset_not matches newline according to a syntax bit. */ | |
446 | 2504 if ((re_opcode_t) buf_end[-2] == charset_not |
428 | 2505 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) |
2506 SET_LIST_BIT ('\n'); | |
2507 | |
2508 #ifdef MULE | |
2509 start_over_with_extended: | |
2510 if (has_extended_chars) | |
2511 { | |
2512 /* There are extended chars here, which means we need to start | |
2513 over and shift to unified range-table format. */ | |
446 | 2514 if (buf_end[-2] == charset) |
2515 buf_end[-2] = charset_mule; | |
428 | 2516 else |
446 | 2517 buf_end[-2] = charset_mule_not; |
2518 buf_end--; | |
428 | 2519 p = p1; /* go back to the beginning of the charset, after |
2520 a possible ^. */ | |
2521 rtab = Vthe_lisp_rangetab; | |
2522 Fclear_range_table (rtab); | |
2523 | |
2524 /* charset_not matches newline according to a syntax bit. */ | |
446 | 2525 if ((re_opcode_t) buf_end[-1] == charset_mule_not |
428 | 2526 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) |
2527 SET_EITHER_BIT ('\n'); | |
2528 } | |
2529 #endif /* MULE */ | |
2530 | |
2531 /* Read in characters and ranges, setting map bits. */ | |
2532 for (;;) | |
2533 { | |
2534 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | |
2535 | |
446 | 2536 PATFETCH (c); |
428 | 2537 |
2538 #ifdef MULE | |
2539 if (c >= 0x80 && !has_extended_chars) | |
2540 { | |
2541 has_extended_chars = 1; | |
2542 /* Frumble-bumble, we've found some extended chars. | |
2543 Need to start over, process everything using | |
2544 the general extended-char mechanism, and need | |
2545 to use charset_mule and charset_mule_not instead | |
2546 of charset and charset_not. */ | |
2547 goto start_over_with_extended; | |
2548 } | |
2549 #endif /* MULE */ | |
2550 /* \ might escape characters inside [...] and [^...]. */ | |
2551 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') | |
2552 { | |
2553 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); | |
2554 | |
446 | 2555 PATFETCH (c1); |
428 | 2556 #ifdef MULE |
2557 if (c1 >= 0x80 && !has_extended_chars) | |
2558 { | |
2559 has_extended_chars = 1; | |
2560 goto start_over_with_extended; | |
2561 } | |
2562 #endif /* MULE */ | |
2563 SET_EITHER_BIT (c1); | |
2564 continue; | |
2565 } | |
2566 | |
2567 /* Could be the end of the bracket expression. If it's | |
2568 not (i.e., when the bracket expression is `[]' so | |
2569 far), the ']' character bit gets set way below. */ | |
2570 if (c == ']' && p != p1 + 1) | |
2571 break; | |
2572 | |
2573 /* Look ahead to see if it's a range when the last thing | |
2574 was a character class. */ | |
2575 if (had_char_class && c == '-' && *p != ']') | |
2576 FREE_STACK_RETURN (REG_ERANGE); | |
2577 | |
2578 /* Look ahead to see if it's a range when the last thing | |
2579 was a character: if this is a hyphen not at the | |
2580 beginning or the end of a list, then it's the range | |
2581 operator. */ | |
2582 if (c == '-' | |
2583 && !(p - 2 >= pattern && p[-2] == '[') | |
446 | 2584 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') |
428 | 2585 && *p != ']') |
2586 { | |
2587 reg_errcode_t ret; | |
2588 | |
2589 #ifdef MULE | |
2590 if (* (unsigned char *) p >= 0x80 && !has_extended_chars) | |
2591 { | |
2592 has_extended_chars = 1; | |
2593 goto start_over_with_extended; | |
2594 } | |
2595 if (has_extended_chars) | |
2596 ret = compile_extended_range (&p, pend, translate, | |
2597 syntax, rtab); | |
2598 else | |
2599 #endif /* MULE */ | |
446 | 2600 ret = compile_range (&p, pend, translate, syntax, buf_end); |
428 | 2601 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); |
2602 } | |
2603 | |
2604 else if (p[0] == '-' && p[1] != ']') | |
2605 { /* This handles ranges made up of characters only. */ | |
2606 reg_errcode_t ret; | |
2607 | |
2608 /* Move past the `-'. */ | |
2609 PATFETCH (c1); | |
2610 | |
2611 #ifdef MULE | |
2612 if (* (unsigned char *) p >= 0x80 && !has_extended_chars) | |
2613 { | |
2614 has_extended_chars = 1; | |
2615 goto start_over_with_extended; | |
2616 } | |
2617 if (has_extended_chars) | |
2618 ret = compile_extended_range (&p, pend, translate, | |
2619 syntax, rtab); | |
2620 else | |
2621 #endif /* MULE */ | |
446 | 2622 ret = compile_range (&p, pend, translate, syntax, buf_end); |
428 | 2623 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); |
2624 } | |
2625 | |
2626 /* See if we're at the beginning of a possible character | |
2627 class. */ | |
2628 | |
2629 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') | |
2630 { /* Leave room for the null. */ | |
2631 char str[CHAR_CLASS_MAX_LENGTH + 1]; | |
2632 | |
2633 PATFETCH (c); | |
2634 c1 = 0; | |
2635 | |
2636 /* If pattern is `[[:'. */ | |
2637 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | |
2638 | |
2639 for (;;) | |
2640 { | |
446 | 2641 /* #### This code is unused. |
2642 Correctness is not checked after TRT | |
2643 table change. */ | |
428 | 2644 PATFETCH (c); |
2645 if (c == ':' || c == ']' || p == pend | |
2646 || c1 == CHAR_CLASS_MAX_LENGTH) | |
2647 break; | |
442 | 2648 str[c1++] = (char) c; |
428 | 2649 } |
2650 str[c1] = '\0'; | |
2651 | |
446 | 2652 /* If isn't a word bracketed by `[:' and `:]': |
428 | 2653 undo the ending character, the letters, and leave |
2654 the leading `:' and `[' (but set bits for them). */ | |
2655 if (c == ':' && *p == ']') | |
2656 { | |
2657 int ch; | |
460 | 2658 re_bool is_alnum = STREQ (str, "alnum"); |
2659 re_bool is_alpha = STREQ (str, "alpha"); | |
2660 re_bool is_blank = STREQ (str, "blank"); | |
2661 re_bool is_cntrl = STREQ (str, "cntrl"); | |
2662 re_bool is_digit = STREQ (str, "digit"); | |
2663 re_bool is_graph = STREQ (str, "graph"); | |
2664 re_bool is_lower = STREQ (str, "lower"); | |
2665 re_bool is_print = STREQ (str, "print"); | |
2666 re_bool is_punct = STREQ (str, "punct"); | |
2667 re_bool is_space = STREQ (str, "space"); | |
2668 re_bool is_upper = STREQ (str, "upper"); | |
2669 re_bool is_xdigit = STREQ (str, "xdigit"); | |
428 | 2670 |
2671 if (!IS_CHAR_CLASS (str)) | |
2672 FREE_STACK_RETURN (REG_ECTYPE); | |
2673 | |
2674 /* Throw away the ] at the end of the character | |
2675 class. */ | |
2676 PATFETCH (c); | |
2677 | |
2678 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | |
2679 | |
2680 for (ch = 0; ch < 1 << BYTEWIDTH; ch++) | |
2681 { | |
2682 /* This was split into 3 if's to | |
2683 avoid an arbitrary limit in some compiler. */ | |
2684 if ( (is_alnum && ISALNUM (ch)) | |
2685 || (is_alpha && ISALPHA (ch)) | |
2686 || (is_blank && ISBLANK (ch)) | |
2687 || (is_cntrl && ISCNTRL (ch))) | |
2688 SET_EITHER_BIT (ch); | |
2689 if ( (is_digit && ISDIGIT (ch)) | |
2690 || (is_graph && ISGRAPH (ch)) | |
2691 || (is_lower && ISLOWER (ch)) | |
2692 || (is_print && ISPRINT (ch))) | |
2693 SET_EITHER_BIT (ch); | |
2694 if ( (is_punct && ISPUNCT (ch)) | |
2695 || (is_space && ISSPACE (ch)) | |
2696 || (is_upper && ISUPPER (ch)) | |
2697 || (is_xdigit && ISXDIGIT (ch))) | |
2698 SET_EITHER_BIT (ch); | |
2699 } | |
2700 had_char_class = true; | |
2701 } | |
2702 else | |
2703 { | |
2704 c1++; | |
2705 while (c1--) | |
2706 PATUNFETCH; | |
2707 SET_EITHER_BIT ('['); | |
2708 SET_EITHER_BIT (':'); | |
2709 had_char_class = false; | |
2710 } | |
2711 } | |
2712 else | |
2713 { | |
2714 had_char_class = false; | |
2715 SET_EITHER_BIT (c); | |
2716 } | |
2717 } | |
2718 | |
2719 #ifdef MULE | |
2720 if (has_extended_chars) | |
2721 { | |
2722 /* We have a range table, not a bit vector. */ | |
2723 int bytes_needed = | |
2724 unified_range_table_bytes_needed (rtab); | |
2725 GET_BUFFER_SPACE (bytes_needed); | |
446 | 2726 unified_range_table_copy_data (rtab, buf_end); |
2727 buf_end += unified_range_table_bytes_used (buf_end); | |
428 | 2728 break; |
2729 } | |
2730 #endif /* MULE */ | |
2731 /* Discard any (non)matching list bytes that are all 0 at the | |
2732 end of the map. Decrease the map-length byte too. */ | |
446 | 2733 while ((int) buf_end[-1] > 0 && buf_end[buf_end[-1] - 1] == 0) |
2734 buf_end[-1]--; | |
2735 buf_end += buf_end[-1]; | |
428 | 2736 } |
2737 break; | |
2738 | |
2739 | |
2740 case '(': | |
2741 if (syntax & RE_NO_BK_PARENS) | |
2742 goto handle_open; | |
2743 else | |
2744 goto normal_char; | |
2745 | |
2746 | |
2747 case ')': | |
2748 if (syntax & RE_NO_BK_PARENS) | |
2749 goto handle_close; | |
2750 else | |
2751 goto normal_char; | |
2752 | |
2753 | |
2754 case '\n': | |
2755 if (syntax & RE_NEWLINE_ALT) | |
2756 goto handle_alt; | |
2757 else | |
2758 goto normal_char; | |
2759 | |
2760 | |
2761 case '|': | |
2762 if (syntax & RE_NO_BK_VBAR) | |
2763 goto handle_alt; | |
2764 else | |
2765 goto normal_char; | |
2766 | |
2767 | |
2768 case '{': | |
2769 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES) | |
2770 goto handle_interval; | |
2771 else | |
2772 goto normal_char; | |
2773 | |
2774 | |
2775 case '\\': | |
2776 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); | |
2777 | |
2778 /* Do not translate the character after the \, so that we can | |
2779 distinguish, e.g., \B from \b, even if we normally would | |
2780 translate, e.g., B to b. */ | |
2781 PATFETCH_RAW (c); | |
2782 | |
2783 switch (c) | |
2784 { | |
2785 case '(': | |
2786 if (syntax & RE_NO_BK_PARENS) | |
2787 goto normal_backslash; | |
2788 | |
2789 handle_open: | |
2790 { | |
2791 regnum_t r; | |
502 | 2792 int shy = 0; |
428 | 2793 |
2794 if (!(syntax & RE_NO_SHY_GROUPS) | |
2795 && p != pend | |
446 | 2796 && *p == '?') |
428 | 2797 { |
2798 p++; | |
446 | 2799 PATFETCH (c); |
428 | 2800 switch (c) |
2801 { | |
2802 case ':': /* shy groups */ | |
502 | 2803 shy = 1; |
428 | 2804 break; |
2805 | |
2806 /* All others are reserved for future constructs. */ | |
2807 default: | |
2808 FREE_STACK_RETURN (REG_BADPAT); | |
2809 } | |
2810 } | |
502 | 2811 |
2812 r = ++regnum; | |
2813 bufp->re_ngroups++; | |
2814 if (!shy) | |
2815 { | |
2816 bufp->re_nsub++; | |
2817 while (bufp->external_to_internal_register_size <= | |
2818 bufp->re_nsub) | |
2819 { | |
2820 int i; | |
2821 int old_size = | |
2822 bufp->external_to_internal_register_size; | |
2823 bufp->external_to_internal_register_size += 5; | |
2824 RETALLOC (bufp->external_to_internal_register, | |
2825 bufp->external_to_internal_register_size, | |
2826 int); | |
2827 /* debugging */ | |
2828 for (i = old_size; | |
2829 i < bufp->external_to_internal_register_size; i++) | |
2830 bufp->external_to_internal_register[i] = | |
2831 (int) 0xDEADBEEF; | |
2832 } | |
2833 | |
2834 bufp->external_to_internal_register[bufp->re_nsub] = | |
2835 bufp->re_ngroups; | |
2836 } | |
428 | 2837 |
2838 if (COMPILE_STACK_FULL) | |
2839 { | |
2840 RETALLOC (compile_stack.stack, compile_stack.size << 1, | |
2841 compile_stack_elt_t); | |
2842 if (compile_stack.stack == NULL) return REG_ESPACE; | |
2843 | |
2844 compile_stack.size <<= 1; | |
2845 } | |
2846 | |
2847 /* These are the values to restore when we hit end of this | |
2848 group. They are all relative offsets, so that if the | |
2849 whole pattern moves because of realloc, they will still | |
2850 be valid. */ | |
2851 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer; | |
2852 COMPILE_STACK_TOP.fixup_alt_jump | |
2853 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; | |
446 | 2854 COMPILE_STACK_TOP.laststart_offset = buf_end - bufp->buffer; |
428 | 2855 COMPILE_STACK_TOP.regnum = r; |
2856 | |
2857 /* We will eventually replace the 0 with the number of | |
2858 groups inner to this one. But do not push a | |
2859 start_memory for groups beyond the last one we can | |
502 | 2860 represent in the compiled pattern. |
2861 #### bad bad bad. this will fail in lots of ways, if we | |
2862 ever have to backtrack for these groups. | |
2863 */ | |
428 | 2864 if (r <= MAX_REGNUM) |
2865 { | |
2866 COMPILE_STACK_TOP.inner_group_offset | |
446 | 2867 = buf_end - bufp->buffer + 2; |
428 | 2868 BUF_PUSH_3 (start_memory, r, 0); |
2869 } | |
2870 | |
2871 compile_stack.avail++; | |
2872 | |
2873 fixup_alt_jump = 0; | |
2874 laststart = 0; | |
446 | 2875 begalt = buf_end; |
428 | 2876 /* If we've reached MAX_REGNUM groups, then this open |
2877 won't actually generate any code, so we'll have to | |
2878 clear pending_exact explicitly. */ | |
2879 pending_exact = 0; | |
2880 } | |
2881 break; | |
2882 | |
2883 | |
2884 case ')': | |
2885 if (syntax & RE_NO_BK_PARENS) goto normal_backslash; | |
2886 | |
2887 if (COMPILE_STACK_EMPTY) { | |
2888 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) | |
2889 goto normal_backslash; | |
2890 else | |
2891 FREE_STACK_RETURN (REG_ERPAREN); | |
2892 } | |
2893 | |
2894 handle_close: | |
2895 if (fixup_alt_jump) | |
2896 { /* Push a dummy failure point at the end of the | |
2897 alternative for a possible future | |
2898 `pop_failure_jump' to pop. See comments at | |
2899 `push_dummy_failure' in `re_match_2'. */ | |
2900 BUF_PUSH (push_dummy_failure); | |
2901 | |
2902 /* We allocated space for this jump when we assigned | |
2903 to `fixup_alt_jump', in the `handle_alt' case below. */ | |
446 | 2904 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end - 1); |
428 | 2905 } |
2906 | |
2907 /* See similar code for backslashed left paren above. */ | |
2908 if (COMPILE_STACK_EMPTY) { | |
2909 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) | |
2910 goto normal_char; | |
2911 else | |
2912 FREE_STACK_RETURN (REG_ERPAREN); | |
2913 } | |
2914 | |
2915 /* Since we just checked for an empty stack above, this | |
2916 ``can't happen''. */ | |
2917 assert (compile_stack.avail != 0); | |
2918 { | |
2919 /* We don't just want to restore into `regnum', because | |
2920 later groups should continue to be numbered higher, | |
2921 as in `(ab)c(de)' -- the second group is #2. */ | |
2922 regnum_t this_group_regnum; | |
2923 | |
2924 compile_stack.avail--; | |
2925 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset; | |
2926 fixup_alt_jump | |
2927 = COMPILE_STACK_TOP.fixup_alt_jump | |
2928 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1 | |
2929 : 0; | |
2930 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset; | |
2931 this_group_regnum = COMPILE_STACK_TOP.regnum; | |
2932 /* If we've reached MAX_REGNUM groups, then this close | |
2933 won't actually generate any code, so we'll have to | |
2934 clear pending_exact explicitly. */ | |
2935 pending_exact = 0; | |
2936 | |
2937 /* We're at the end of the group, so now we know how many | |
2938 groups were inside this one. */ | |
2939 if (this_group_regnum <= MAX_REGNUM) | |
2940 { | |
2941 unsigned char *inner_group_loc | |
2942 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset; | |
2943 | |
2944 *inner_group_loc = regnum - this_group_regnum; | |
2945 BUF_PUSH_3 (stop_memory, this_group_regnum, | |
2946 regnum - this_group_regnum); | |
2947 } | |
2948 } | |
2949 break; | |
2950 | |
2951 | |
2952 case '|': /* `\|'. */ | |
2953 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR) | |
2954 goto normal_backslash; | |
2955 handle_alt: | |
2956 if (syntax & RE_LIMITED_OPS) | |
2957 goto normal_char; | |
2958 | |
2959 /* Insert before the previous alternative a jump which | |
2960 jumps to this alternative if the former fails. */ | |
2961 GET_BUFFER_SPACE (3); | |
446 | 2962 INSERT_JUMP (on_failure_jump, begalt, buf_end + 6); |
428 | 2963 pending_exact = 0; |
446 | 2964 buf_end += 3; |
428 | 2965 |
2966 /* The alternative before this one has a jump after it | |
2967 which gets executed if it gets matched. Adjust that | |
2968 jump so it will jump to this alternative's analogous | |
2969 jump (put in below, which in turn will jump to the next | |
2970 (if any) alternative's such jump, etc.). The last such | |
2971 jump jumps to the correct final destination. A picture: | |
2972 _____ _____ | |
2973 | | | | | |
2974 | v | v | |
2975 a | b | c | |
2976 | |
2977 If we are at `b', then fixup_alt_jump right now points to a | |
2978 three-byte space after `a'. We'll put in the jump, set | |
2979 fixup_alt_jump to right after `b', and leave behind three | |
2980 bytes which we'll fill in when we get to after `c'. */ | |
2981 | |
2982 if (fixup_alt_jump) | |
446 | 2983 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end); |
428 | 2984 |
2985 /* Mark and leave space for a jump after this alternative, | |
2986 to be filled in later either by next alternative or | |
2987 when we know we're at the end of a series of alternatives. */ | |
446 | 2988 fixup_alt_jump = buf_end; |
428 | 2989 GET_BUFFER_SPACE (3); |
446 | 2990 buf_end += 3; |
428 | 2991 |
2992 laststart = 0; | |
446 | 2993 begalt = buf_end; |
428 | 2994 break; |
2995 | |
2996 | |
2997 case '{': | |
2998 /* If \{ is a literal. */ | |
2999 if (!(syntax & RE_INTERVALS) | |
3000 /* If we're at `\{' and it's not the open-interval | |
3001 operator. */ | |
3002 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) | |
3003 || (p - 2 == pattern && p == pend)) | |
3004 goto normal_backslash; | |
3005 | |
3006 handle_interval: | |
3007 { | |
3008 /* If got here, then the syntax allows intervals. */ | |
3009 | |
3010 /* At least (most) this many matches must be made. */ | |
3011 int lower_bound = -1, upper_bound = -1; | |
3012 | |
3013 beg_interval = p - 1; | |
3014 | |
3015 if (p == pend) | |
3016 { | |
3017 if (syntax & RE_NO_BK_BRACES) | |
3018 goto unfetch_interval; | |
3019 else | |
3020 FREE_STACK_RETURN (REG_EBRACE); | |
3021 } | |
3022 | |
3023 GET_UNSIGNED_NUMBER (lower_bound); | |
3024 | |
3025 if (c == ',') | |
3026 { | |
3027 GET_UNSIGNED_NUMBER (upper_bound); | |
3028 if (upper_bound < 0) upper_bound = RE_DUP_MAX; | |
3029 } | |
3030 else | |
3031 /* Interval such as `{1}' => match exactly once. */ | |
3032 upper_bound = lower_bound; | |
3033 | |
3034 if (lower_bound < 0 || upper_bound > RE_DUP_MAX | |
3035 || lower_bound > upper_bound) | |
3036 { | |
3037 if (syntax & RE_NO_BK_BRACES) | |
3038 goto unfetch_interval; | |
3039 else | |
3040 FREE_STACK_RETURN (REG_BADBR); | |
3041 } | |
3042 | |
3043 if (!(syntax & RE_NO_BK_BRACES)) | |
3044 { | |
3045 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE); | |
3046 | |
3047 PATFETCH (c); | |
3048 } | |
3049 | |
3050 if (c != '}') | |
3051 { | |
3052 if (syntax & RE_NO_BK_BRACES) | |
3053 goto unfetch_interval; | |
3054 else | |
3055 FREE_STACK_RETURN (REG_BADBR); | |
3056 } | |
3057 | |
3058 /* We just parsed a valid interval. */ | |
3059 | |
3060 /* If it's invalid to have no preceding re. */ | |
3061 if (!laststart) | |
3062 { | |
3063 if (syntax & RE_CONTEXT_INVALID_OPS) | |
3064 FREE_STACK_RETURN (REG_BADRPT); | |
3065 else if (syntax & RE_CONTEXT_INDEP_OPS) | |
446 | 3066 laststart = buf_end; |
428 | 3067 else |
3068 goto unfetch_interval; | |
3069 } | |
3070 | |
3071 /* If the upper bound is zero, don't want to succeed at | |
3072 all; jump from `laststart' to `buf_end + 3', which will be | |
3073 the end of the buffer after we insert the jump. */ | |
3074 if (upper_bound == 0) | |
3075 { | |
3076 GET_BUFFER_SPACE (3); | |
446 | 3077 INSERT_JUMP (jump, laststart, buf_end + 3); |
3078 buf_end += 3; | |
428 | 3079 } |
3080 | |
3081 /* Otherwise, we have a nontrivial interval. When | |
3082 we're all done, the pattern will look like: | |
3083 set_number_at <jump count> <upper bound> | |
3084 set_number_at <succeed_n count> <lower bound> | |
3085 succeed_n <after jump addr> <succeed_n count> | |
3086 <body of loop> | |
3087 jump_n <succeed_n addr> <jump count> | |
3088 (The upper bound and `jump_n' are omitted if | |
3089 `upper_bound' is 1, though.) */ | |
3090 else | |
3091 { /* If the upper bound is > 1, we need to insert | |
3092 more at the end of the loop. */ | |
647 | 3093 int nbytes = 10 + (upper_bound > 1) * 10; |
428 | 3094 |
3095 GET_BUFFER_SPACE (nbytes); | |
3096 | |
3097 /* Initialize lower bound of the `succeed_n', even | |
3098 though it will be set during matching by its | |
3099 attendant `set_number_at' (inserted next), | |
3100 because `re_compile_fastmap' needs to know. | |
3101 Jump to the `jump_n' we might insert below. */ | |
3102 INSERT_JUMP2 (succeed_n, laststart, | |
446 | 3103 buf_end + 5 + (upper_bound > 1) * 5, |
428 | 3104 lower_bound); |
446 | 3105 buf_end += 5; |
428 | 3106 |
3107 /* Code to initialize the lower bound. Insert | |
3108 before the `succeed_n'. The `5' is the last two | |
3109 bytes of this `set_number_at', plus 3 bytes of | |
3110 the following `succeed_n'. */ | |
446 | 3111 insert_op2 (set_number_at, laststart, 5, lower_bound, buf_end); |
3112 buf_end += 5; | |
428 | 3113 |
3114 if (upper_bound > 1) | |
3115 { /* More than one repetition is allowed, so | |
3116 append a backward jump to the `succeed_n' | |
3117 that starts this interval. | |
3118 | |
3119 When we've reached this during matching, | |
3120 we'll have matched the interval once, so | |
3121 jump back only `upper_bound - 1' times. */ | |
446 | 3122 STORE_JUMP2 (jump_n, buf_end, laststart + 5, |
428 | 3123 upper_bound - 1); |
446 | 3124 buf_end += 5; |
428 | 3125 |
3126 /* The location we want to set is the second | |
3127 parameter of the `jump_n'; that is `b-2' as | |
3128 an absolute address. `laststart' will be | |
3129 the `set_number_at' we're about to insert; | |
3130 `laststart+3' the number to set, the source | |
3131 for the relative address. But we are | |
3132 inserting into the middle of the pattern -- | |
3133 so everything is getting moved up by 5. | |
3134 Conclusion: (b - 2) - (laststart + 3) + 5, | |
3135 i.e., b - laststart. | |
3136 | |
3137 We insert this at the beginning of the loop | |
3138 so that if we fail during matching, we'll | |
3139 reinitialize the bounds. */ | |
446 | 3140 insert_op2 (set_number_at, laststart, |
3141 buf_end - laststart, | |
3142 upper_bound - 1, buf_end); | |
3143 buf_end += 5; | |
428 | 3144 } |
3145 } | |
3146 pending_exact = 0; | |
3147 beg_interval = NULL; | |
3148 } | |
3149 break; | |
3150 | |
3151 unfetch_interval: | |
3152 /* If an invalid interval, match the characters as literals. */ | |
3153 assert (beg_interval); | |
3154 p = beg_interval; | |
3155 beg_interval = NULL; | |
3156 | |
3157 /* normal_char and normal_backslash need `c'. */ | |
3158 PATFETCH (c); | |
3159 | |
3160 if (!(syntax & RE_NO_BK_BRACES)) | |
3161 { | |
3162 if (p > pattern && p[-1] == '\\') | |
3163 goto normal_backslash; | |
3164 } | |
3165 goto normal_char; | |
3166 | |
3167 #ifdef emacs | |
3168 /* There is no way to specify the before_dot and after_dot | |
3169 operators. rms says this is ok. --karl */ | |
3170 case '=': | |
3171 BUF_PUSH (at_dot); | |
3172 break; | |
3173 | |
3174 case 's': | |
446 | 3175 laststart = buf_end; |
428 | 3176 PATFETCH (c); |
3177 /* XEmacs addition */ | |
3178 if (c >= 0x80 || syntax_spec_code[c] == 0377) | |
3179 FREE_STACK_RETURN (REG_ESYNTAX); | |
3180 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]); | |
3181 break; | |
3182 | |
3183 case 'S': | |
446 | 3184 laststart = buf_end; |
428 | 3185 PATFETCH (c); |
3186 /* XEmacs addition */ | |
3187 if (c >= 0x80 || syntax_spec_code[c] == 0377) | |
3188 FREE_STACK_RETURN (REG_ESYNTAX); | |
3189 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]); | |
3190 break; | |
3191 | |
3192 #ifdef MULE | |
3193 /* 97.2.17 jhod merged in to XEmacs from mule-2.3 */ | |
3194 case 'c': | |
446 | 3195 laststart = buf_end; |
428 | 3196 PATFETCH_RAW (c); |
3197 if (c < 32 || c > 127) | |
3198 FREE_STACK_RETURN (REG_ECATEGORY); | |
3199 BUF_PUSH_2 (categoryspec, c); | |
3200 break; | |
3201 | |
3202 case 'C': | |
446 | 3203 laststart = buf_end; |
428 | 3204 PATFETCH_RAW (c); |
3205 if (c < 32 || c > 127) | |
3206 FREE_STACK_RETURN (REG_ECATEGORY); | |
3207 BUF_PUSH_2 (notcategoryspec, c); | |
3208 break; | |
3209 /* end of category patch */ | |
3210 #endif /* MULE */ | |
3211 #endif /* emacs */ | |
3212 | |
3213 | |
3214 case 'w': | |
446 | 3215 laststart = buf_end; |
428 | 3216 BUF_PUSH (wordchar); |
3217 break; | |
3218 | |
3219 | |
3220 case 'W': | |
446 | 3221 laststart = buf_end; |
428 | 3222 BUF_PUSH (notwordchar); |
3223 break; | |
3224 | |
3225 | |
3226 case '<': | |
3227 BUF_PUSH (wordbeg); | |
3228 break; | |
3229 | |
3230 case '>': | |
3231 BUF_PUSH (wordend); | |
3232 break; | |
3233 | |
3234 case 'b': | |
3235 BUF_PUSH (wordbound); | |
3236 break; | |
3237 | |
3238 case 'B': | |
3239 BUF_PUSH (notwordbound); | |
3240 break; | |
3241 | |
3242 case '`': | |
3243 BUF_PUSH (begbuf); | |
3244 break; | |
3245 | |
3246 case '\'': | |
3247 BUF_PUSH (endbuf); | |
3248 break; | |
3249 | |
3250 case '1': case '2': case '3': case '4': case '5': | |
3251 case '6': case '7': case '8': case '9': | |
446 | 3252 { |
502 | 3253 regnum_t reg, regint; |
3254 int may_need_to_unfetch = 0; | |
446 | 3255 if (syntax & RE_NO_BK_REFS) |
3256 goto normal_char; | |
3257 | |
502 | 3258 /* This only goes up to 99. It could be extended to work |
3259 up to 255 (the maximum number of registers that can be | |
3260 handled by the current regexp engine, because it stores | |
3261 its register numbers in the compiled pattern as one byte, | |
3262 ugh). Doing that's a bit trickier, because you might | |
3263 have the case where \25 is a back-ref but \255 is not, ... */ | |
446 | 3264 reg = c - '0'; |
502 | 3265 if (p < pend) |
3266 { | |
3267 PATFETCH (c); | |
3268 if (c >= '0' && c <= '9') | |
3269 { | |
3270 regnum_t new_reg = reg * 10 + c - '0'; | |
3271 if (new_reg <= bufp->re_nsub) | |
3272 { | |
3273 reg = new_reg; | |
3274 may_need_to_unfetch = 1; | |
3275 } | |
3276 else | |
3277 PATUNFETCH; | |
3278 } | |
523 | 3279 else |
3280 PATUNFETCH; | |
502 | 3281 } |
3282 | |
3283 if (reg > bufp->re_nsub) | |
446 | 3284 FREE_STACK_RETURN (REG_ESUBREG); |
3285 | |
502 | 3286 regint = bufp->external_to_internal_register[reg]; |
446 | 3287 /* Can't back reference to a subexpression if inside of it. */ |
502 | 3288 if (group_in_compile_stack (compile_stack, regint)) |
3289 { | |
3290 if (may_need_to_unfetch) | |
3291 PATUNFETCH; | |
3292 goto normal_char; | |
3293 } | |
3294 | |
3295 #ifdef emacs | |
3296 if (reg > 9 && | |
3297 bufp->warned_about_incompatible_back_references == 0) | |
3298 { | |
3299 bufp->warned_about_incompatible_back_references = 1; | |
3300 warn_when_safe (intern ("regex"), Qinfo, | |
3301 "Back reference \\%d now has new " | |
3302 "semantics in %s", reg, pattern); | |
3303 } | |
3304 #endif | |
446 | 3305 |
3306 laststart = buf_end; | |
502 | 3307 BUF_PUSH_2 (duplicate, regint); |
446 | 3308 } |
428 | 3309 break; |
3310 | |
3311 | |
3312 case '+': | |
3313 case '?': | |
3314 if (syntax & RE_BK_PLUS_QM) | |
3315 goto handle_plus; | |
3316 else | |
3317 goto normal_backslash; | |
3318 | |
3319 default: | |
3320 normal_backslash: | |
3321 /* You might think it would be useful for \ to mean | |
3322 not to translate; but if we don't translate it, | |
3323 it will never match anything. */ | |
826 | 3324 c = RE_TRANSLATE (c); |
428 | 3325 goto normal_char; |
3326 } | |
3327 break; | |
3328 | |
3329 | |
3330 default: | |
3331 /* Expects the character in `c'. */ | |
3332 /* `p' points to the location after where `c' came from. */ | |
3333 normal_char: | |
3334 { | |
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3335 /* The following conditional synced to GNU Emacs 22.1. */ |
428 | 3336 /* If no exactn currently being built. */ |
3337 if (!pending_exact | |
3338 | |
3339 /* If last exactn not at current position. */ | |
446 | 3340 || pending_exact + *pending_exact + 1 != buf_end |
428 | 3341 |
3342 /* We have only one byte following the exactn for the count. */ | |
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3343 || *pending_exact >= (1 << BYTEWIDTH) - MAX_ICHAR_LEN |
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3344 |
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3345 /* If followed by a repetition operator. |
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3346 If the lookahead fails because of end of pattern, any |
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3347 trailing backslash will get caught later. */ |
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3348 || (p != pend && (*p == '*' || *p == '^')) |
428 | 3349 || ((syntax & RE_BK_PLUS_QM) |
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3350 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?') |
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3351 : p != pend && (*p == '+' || *p == '?')) |
428 | 3352 || ((syntax & RE_INTERVALS) |
3353 && ((syntax & RE_NO_BK_BRACES) | |
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3354 ? p != pend && *p == '{' |
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3355 : p + 1 < pend && (p[0] == '\\' && p[1] == '{')))) |
428 | 3356 { |
3357 /* Start building a new exactn. */ | |
3358 | |
446 | 3359 laststart = buf_end; |
428 | 3360 |
3361 BUF_PUSH_2 (exactn, 0); | |
446 | 3362 pending_exact = buf_end - 1; |
428 | 3363 } |
3364 | |
446 | 3365 #ifndef MULE |
428 | 3366 BUF_PUSH (c); |
3367 (*pending_exact)++; | |
446 | 3368 #else |
3369 { | |
3370 Bytecount bt_count; | |
867 | 3371 Ibyte tmp_buf[MAX_ICHAR_LEN]; |
446 | 3372 int i; |
3373 | |
867 | 3374 bt_count = set_itext_ichar (tmp_buf, c); |
446 | 3375 |
3376 for (i = 0; i < bt_count; i++) | |
3377 { | |
3378 BUF_PUSH (tmp_buf[i]); | |
3379 (*pending_exact)++; | |
3380 } | |
3381 } | |
3382 #endif | |
428 | 3383 break; |
3384 } | |
3385 } /* switch (c) */ | |
3386 } /* while p != pend */ | |
3387 | |
3388 | |
3389 /* Through the pattern now. */ | |
3390 | |
3391 if (fixup_alt_jump) | |
446 | 3392 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end); |
428 | 3393 |
3394 if (!COMPILE_STACK_EMPTY) | |
3395 FREE_STACK_RETURN (REG_EPAREN); | |
3396 | |
3397 /* If we don't want backtracking, force success | |
3398 the first time we reach the end of the compiled pattern. */ | |
3399 if (syntax & RE_NO_POSIX_BACKTRACKING) | |
3400 BUF_PUSH (succeed); | |
3401 | |
1726 | 3402 xfree (compile_stack.stack, compile_stack_elt_t *); |
428 | 3403 |
3404 /* We have succeeded; set the length of the buffer. */ | |
446 | 3405 bufp->used = buf_end - bufp->buffer; |
428 | 3406 |
3407 #ifdef DEBUG | |
3408 if (debug) | |
3409 { | |
3410 DEBUG_PRINT1 ("\nCompiled pattern: \n"); | |
3411 print_compiled_pattern (bufp); | |
3412 } | |
3413 #endif /* DEBUG */ | |
3414 | |
3415 #ifndef MATCH_MAY_ALLOCATE | |
3416 /* Initialize the failure stack to the largest possible stack. This | |
3417 isn't necessary unless we're trying to avoid calling alloca in | |
3418 the search and match routines. */ | |
3419 { | |
502 | 3420 int num_regs = bufp->re_ngroups + 1; |
428 | 3421 |
3422 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size | |
3423 is strictly greater than re_max_failures, the largest possible stack | |
3424 is 2 * re_max_failures failure points. */ | |
3425 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS)) | |
3426 { | |
3427 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS); | |
3428 | |
3429 if (! fail_stack.stack) | |
3430 fail_stack.stack | |
3431 = (fail_stack_elt_t *) xmalloc (fail_stack.size | |
3432 * sizeof (fail_stack_elt_t)); | |
3433 else | |
3434 fail_stack.stack | |
3435 = (fail_stack_elt_t *) xrealloc (fail_stack.stack, | |
3436 (fail_stack.size | |
3437 * sizeof (fail_stack_elt_t))); | |
3438 } | |
3439 | |
3440 regex_grow_registers (num_regs); | |
3441 } | |
3442 #endif /* not MATCH_MAY_ALLOCATE */ | |
3443 | |
3444 return REG_NOERROR; | |
3445 } /* regex_compile */ | |
3446 | |
3447 /* Subroutines for `regex_compile'. */ | |
3448 | |
3449 /* Store OP at LOC followed by two-byte integer parameter ARG. */ | |
3450 | |
3451 static void | |
3452 store_op1 (re_opcode_t op, unsigned char *loc, int arg) | |
3453 { | |
3454 *loc = (unsigned char) op; | |
3455 STORE_NUMBER (loc + 1, arg); | |
3456 } | |
3457 | |
3458 | |
3459 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */ | |
3460 | |
3461 static void | |
3462 store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2) | |
3463 { | |
3464 *loc = (unsigned char) op; | |
3465 STORE_NUMBER (loc + 1, arg1); | |
3466 STORE_NUMBER (loc + 3, arg2); | |
3467 } | |
3468 | |
3469 | |
3470 /* Copy the bytes from LOC to END to open up three bytes of space at LOC | |
3471 for OP followed by two-byte integer parameter ARG. */ | |
3472 | |
3473 static void | |
3474 insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end) | |
3475 { | |
3476 REGISTER unsigned char *pfrom = end; | |
3477 REGISTER unsigned char *pto = end + 3; | |
3478 | |
3479 while (pfrom != loc) | |
3480 *--pto = *--pfrom; | |
3481 | |
3482 store_op1 (op, loc, arg); | |
3483 } | |
3484 | |
3485 | |
3486 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */ | |
3487 | |
3488 static void | |
3489 insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, | |
3490 unsigned char *end) | |
3491 { | |
3492 REGISTER unsigned char *pfrom = end; | |
3493 REGISTER unsigned char *pto = end + 5; | |
3494 | |
3495 while (pfrom != loc) | |
3496 *--pto = *--pfrom; | |
3497 | |
3498 store_op2 (op, loc, arg1, arg2); | |
3499 } | |
3500 | |
3501 | |
3502 /* P points to just after a ^ in PATTERN. Return true if that ^ comes | |
3503 after an alternative or a begin-subexpression. We assume there is at | |
3504 least one character before the ^. */ | |
3505 | |
460 | 3506 static re_bool |
446 | 3507 at_begline_loc_p (re_char *pattern, re_char *p, reg_syntax_t syntax) |
428 | 3508 { |
446 | 3509 re_char *prev = p - 2; |
460 | 3510 re_bool prev_prev_backslash = prev > pattern && prev[-1] == '\\'; |
428 | 3511 |
3512 return | |
3513 /* After a subexpression? */ | |
3514 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) | |
3515 /* After an alternative? */ | |
3516 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash)); | |
3517 } | |
3518 | |
3519 | |
3520 /* The dual of at_begline_loc_p. This one is for $. We assume there is | |
3521 at least one character after the $, i.e., `P < PEND'. */ | |
3522 | |
460 | 3523 static re_bool |
446 | 3524 at_endline_loc_p (re_char *p, re_char *pend, int syntax) |
428 | 3525 { |
446 | 3526 re_char *next = p; |
460 | 3527 re_bool next_backslash = *next == '\\'; |
446 | 3528 re_char *next_next = p + 1 < pend ? p + 1 : 0; |
428 | 3529 |
3530 return | |
3531 /* Before a subexpression? */ | |
3532 (syntax & RE_NO_BK_PARENS ? *next == ')' | |
3533 : next_backslash && next_next && *next_next == ')') | |
3534 /* Before an alternative? */ | |
3535 || (syntax & RE_NO_BK_VBAR ? *next == '|' | |
3536 : next_backslash && next_next && *next_next == '|'); | |
3537 } | |
3538 | |
3539 | |
3540 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and | |
3541 false if it's not. */ | |
3542 | |
460 | 3543 static re_bool |
428 | 3544 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum) |
3545 { | |
3546 int this_element; | |
3547 | |
3548 for (this_element = compile_stack.avail - 1; | |
3549 this_element >= 0; | |
3550 this_element--) | |
3551 if (compile_stack.stack[this_element].regnum == regnum) | |
3552 return true; | |
3553 | |
3554 return false; | |
3555 } | |
3556 | |
3557 | |
3558 /* Read the ending character of a range (in a bracket expression) from the | |
3559 uncompiled pattern *P_PTR (which ends at PEND). We assume the | |
3560 starting character is in `P[-2]'. (`P[-1]' is the character `-'.) | |
3561 Then we set the translation of all bits between the starting and | |
3562 ending characters (inclusive) in the compiled pattern B. | |
3563 | |
3564 Return an error code. | |
3565 | |
3566 We use these short variable names so we can use the same macros as | |
826 | 3567 `regex_compile' itself. |
3568 | |
3569 Under Mule, this is only called when both chars of the range are | |
3570 ASCII. */ | |
428 | 3571 |
3572 static reg_errcode_t | |
446 | 3573 compile_range (re_char **p_ptr, re_char *pend, RE_TRANSLATE_TYPE translate, |
3574 reg_syntax_t syntax, unsigned char *buf_end) | |
428 | 3575 { |
867 | 3576 Ichar this_char; |
428 | 3577 |
446 | 3578 re_char *p = *p_ptr; |
428 | 3579 int range_start, range_end; |
3580 | |
3581 if (p == pend) | |
3582 return REG_ERANGE; | |
3583 | |
3584 /* Even though the pattern is a signed `char *', we need to fetch | |
3585 with unsigned char *'s; if the high bit of the pattern character | |
3586 is set, the range endpoints will be negative if we fetch using a | |
3587 signed char *. | |
3588 | |
3589 We also want to fetch the endpoints without translating them; the | |
3590 appropriate translation is done in the bit-setting loop below. */ | |
442 | 3591 /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */ |
3592 range_start = ((const unsigned char *) p)[-2]; | |
3593 range_end = ((const unsigned char *) p)[0]; | |
428 | 3594 |
3595 /* Have to increment the pointer into the pattern string, so the | |
3596 caller isn't still at the ending character. */ | |
3597 (*p_ptr)++; | |
3598 | |
3599 /* If the start is after the end, the range is empty. */ | |
3600 if (range_start > range_end) | |
3601 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; | |
3602 | |
3603 /* Here we see why `this_char' has to be larger than an `unsigned | |
3604 char' -- the range is inclusive, so if `range_end' == 0xff | |
3605 (assuming 8-bit characters), we would otherwise go into an infinite | |
3606 loop, since all characters <= 0xff. */ | |
3607 for (this_char = range_start; this_char <= range_end; this_char++) | |
3608 { | |
826 | 3609 SET_LIST_BIT (RE_TRANSLATE (this_char)); |
428 | 3610 } |
3611 | |
3612 return REG_NOERROR; | |
3613 } | |
3614 | |
3615 #ifdef MULE | |
3616 | |
3617 static reg_errcode_t | |
446 | 3618 compile_extended_range (re_char **p_ptr, re_char *pend, |
3619 RE_TRANSLATE_TYPE translate, | |
428 | 3620 reg_syntax_t syntax, Lisp_Object rtab) |
3621 { | |
867 | 3622 Ichar this_char, range_start, range_end; |
3623 const Ibyte *p; | |
428 | 3624 |
3625 if (*p_ptr == pend) | |
3626 return REG_ERANGE; | |
3627 | |
867 | 3628 p = (const Ibyte *) *p_ptr; |
3629 range_end = itext_ichar (p); | |
428 | 3630 p--; /* back to '-' */ |
867 | 3631 DEC_IBYTEPTR (p); /* back to start of range */ |
428 | 3632 /* We also want to fetch the endpoints without translating them; the |
3633 appropriate translation is done in the bit-setting loop below. */ | |
867 | 3634 range_start = itext_ichar (p); |
3635 INC_IBYTEPTR (*p_ptr); | |
428 | 3636 |
3637 /* If the start is after the end, the range is empty. */ | |
3638 if (range_start > range_end) | |
3639 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; | |
3640 | |
3641 /* Can't have ranges spanning different charsets, except maybe for | |
3642 ranges entirely within the first 256 chars. */ | |
3643 | |
3644 if ((range_start >= 0x100 || range_end >= 0x100) | |
867 | 3645 && ichar_leading_byte (range_start) != |
3646 ichar_leading_byte (range_end)) | |
428 | 3647 return REG_ERANGESPAN; |
3648 | |
826 | 3649 /* #### This might be way inefficient if the range encompasses 10,000 |
3650 chars or something. To be efficient, you'd have to do something like | |
3651 this: | |
428 | 3652 |
3653 range_table a; | |
3654 range_table b; | |
3655 map over translation table in [range_start, range_end] of | |
3656 (put the mapped range in a; | |
3657 put the translation in b) | |
3658 invert the range in a and truncate to [range_start, range_end] | |
3659 compute the union of a, b | |
3660 union the result into rtab | |
3661 */ | |
826 | 3662 for (this_char = range_start; this_char <= range_end; this_char++) |
428 | 3663 { |
826 | 3664 SET_RANGETAB_BIT (RE_TRANSLATE (this_char)); |
428 | 3665 } |
3666 | |
3667 if (this_char <= range_end) | |
3668 put_range_table (rtab, this_char, range_end, Qt); | |
3669 | |
3670 return REG_NOERROR; | |
3671 } | |
3672 | |
3673 #endif /* MULE */ | |
3674 | |
3675 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in | |
3676 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible | |
3677 characters can start a string that matches the pattern. This fastmap | |
3678 is used by re_search to skip quickly over impossible starting points. | |
3679 | |
3680 The caller must supply the address of a (1 << BYTEWIDTH)-byte data | |
3681 area as BUFP->fastmap. | |
3682 | |
3683 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in | |
3684 the pattern buffer. | |
3685 | |
3686 Returns 0 if we succeed, -2 if an internal error. */ | |
3687 | |
3688 int | |
826 | 3689 re_compile_fastmap (struct re_pattern_buffer *bufp |
3690 RE_LISP_SHORT_CONTEXT_ARGS_DECL) | |
428 | 3691 { |
3692 int j, k; | |
3693 #ifdef MATCH_MAY_ALLOCATE | |
3694 fail_stack_type fail_stack; | |
3695 #endif | |
456 | 3696 DECLARE_DESTINATION; |
428 | 3697 /* We don't push any register information onto the failure stack. */ |
3698 | |
826 | 3699 /* &&#### this should be changed for 8-bit-fixed, for efficiency. see |
3700 comment marked with &&#### in re_search_2. */ | |
3701 | |
428 | 3702 REGISTER char *fastmap = bufp->fastmap; |
3703 unsigned char *pattern = bufp->buffer; | |
647 | 3704 long size = bufp->used; |
428 | 3705 unsigned char *p = pattern; |
3706 REGISTER unsigned char *pend = pattern + size; | |
3707 | |
771 | 3708 #ifdef REGEX_REL_ALLOC |
428 | 3709 /* This holds the pointer to the failure stack, when |
3710 it is allocated relocatably. */ | |
3711 fail_stack_elt_t *failure_stack_ptr; | |
3712 #endif | |
3713 | |
3714 /* Assume that each path through the pattern can be null until | |
3715 proven otherwise. We set this false at the bottom of switch | |
3716 statement, to which we get only if a particular path doesn't | |
3717 match the empty string. */ | |
460 | 3718 re_bool path_can_be_null = true; |
428 | 3719 |
3720 /* We aren't doing a `succeed_n' to begin with. */ | |
460 | 3721 re_bool succeed_n_p = false; |
428 | 3722 |
1333 | 3723 #ifdef ERROR_CHECK_MALLOC |
3724 /* The pattern comes from string data, not buffer data. We don't access | |
3725 any buffer data, so we don't have to worry about malloc() (but the | |
3726 disallowed flag may have been set by a caller). */ | |
3727 int depth = bind_regex_malloc_disallowed (0); | |
3728 #endif | |
3729 | |
428 | 3730 assert (fastmap != NULL && p != NULL); |
3731 | |
3732 INIT_FAIL_STACK (); | |
3733 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */ | |
3734 bufp->fastmap_accurate = 1; /* It will be when we're done. */ | |
3735 bufp->can_be_null = 0; | |
3736 | |
3737 while (1) | |
3738 { | |
3739 if (p == pend || *p == succeed) | |
3740 { | |
3741 /* We have reached the (effective) end of pattern. */ | |
3742 if (!FAIL_STACK_EMPTY ()) | |
3743 { | |
3744 bufp->can_be_null |= path_can_be_null; | |
3745 | |
3746 /* Reset for next path. */ | |
3747 path_can_be_null = true; | |
3748 | |
446 | 3749 p = (unsigned char *) fail_stack.stack[--fail_stack.avail].pointer; |
428 | 3750 |
3751 continue; | |
3752 } | |
3753 else | |
3754 break; | |
3755 } | |
3756 | |
3757 /* We should never be about to go beyond the end of the pattern. */ | |
3758 assert (p < pend); | |
3759 | |
4759
aa5ed11f473b
Remove support for obsolete systems. See xemacs-patches message with ID
Jerry James <james@xemacs.org>
parents:
4750
diff
changeset
|
3760 switch ((re_opcode_t) *p++) |
428 | 3761 { |
3762 | |
3763 /* I guess the idea here is to simply not bother with a fastmap | |
3764 if a backreference is used, since it's too hard to figure out | |
3765 the fastmap for the corresponding group. Setting | |
3766 `can_be_null' stops `re_search_2' from using the fastmap, so | |
3767 that is all we do. */ | |
3768 case duplicate: | |
3769 bufp->can_be_null = 1; | |
3770 goto done; | |
3771 | |
3772 | |
3773 /* Following are the cases which match a character. These end | |
3774 with `break'. */ | |
3775 | |
3776 case exactn: | |
3777 fastmap[p[1]] = 1; | |
3778 break; | |
3779 | |
3780 | |
3781 case charset: | |
3782 /* XEmacs: Under Mule, these bit vectors will | |
3783 only contain values for characters below 0x80. */ | |
3784 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) | |
3785 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) | |
3786 fastmap[j] = 1; | |
3787 break; | |
3788 | |
3789 | |
3790 case charset_not: | |
3791 /* Chars beyond end of map must be allowed. */ | |
3792 #ifdef MULE | |
3793 for (j = *p * BYTEWIDTH; j < 0x80; j++) | |
3794 fastmap[j] = 1; | |
3795 /* And all extended characters must be allowed, too. */ | |
3796 for (j = 0x80; j < 0xA0; j++) | |
3797 fastmap[j] = 1; | |
446 | 3798 #else /* not MULE */ |
428 | 3799 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) |
3800 fastmap[j] = 1; | |
446 | 3801 #endif /* MULE */ |
428 | 3802 |
3803 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) | |
3804 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) | |
3805 fastmap[j] = 1; | |
3806 break; | |
3807 | |
3808 #ifdef MULE | |
3809 case charset_mule: | |
3810 { | |
3811 int nentries; | |
3812 int i; | |
3813 | |
3814 nentries = unified_range_table_nentries (p); | |
3815 for (i = 0; i < nentries; i++) | |
3816 { | |
3817 EMACS_INT first, last; | |
3818 Lisp_Object dummy_val; | |
3819 int jj; | |
867 | 3820 Ibyte strr[MAX_ICHAR_LEN]; |
428 | 3821 |
3822 unified_range_table_get_range (p, i, &first, &last, | |
3823 &dummy_val); | |
3824 for (jj = first; jj <= last && jj < 0x80; jj++) | |
3825 fastmap[jj] = 1; | |
3826 /* Ranges below 0x100 can span charsets, but there | |
3827 are only two (Control-1 and Latin-1), and | |
3828 either first or last has to be in them. */ | |
867 | 3829 set_itext_ichar (strr, first); |
428 | 3830 fastmap[*strr] = 1; |
3831 if (last < 0x100) | |
3832 { | |
867 | 3833 set_itext_ichar (strr, last); |
428 | 3834 fastmap[*strr] = 1; |
3835 } | |
3836 } | |
3837 } | |
3838 break; | |
3839 | |
3840 case charset_mule_not: | |
3841 { | |
3842 int nentries; | |
3843 int i; | |
3844 | |
3845 nentries = unified_range_table_nentries (p); | |
3846 for (i = 0; i < nentries; i++) | |
3847 { | |
3848 EMACS_INT first, last; | |
3849 Lisp_Object dummy_val; | |
3850 int jj; | |
3851 int smallest_prev = 0; | |
3852 | |
3853 unified_range_table_get_range (p, i, &first, &last, | |
3854 &dummy_val); | |
3855 for (jj = smallest_prev; jj < first && jj < 0x80; jj++) | |
3856 fastmap[jj] = 1; | |
3857 smallest_prev = last + 1; | |
3858 if (smallest_prev >= 0x80) | |
3859 break; | |
3860 } | |
3861 /* Calculating which leading bytes are actually allowed | |
3862 here is rather difficult, so we just punt and allow | |
3863 all of them. */ | |
3864 for (i = 0x80; i < 0xA0; i++) | |
3865 fastmap[i] = 1; | |
3866 } | |
3867 break; | |
3868 #endif /* MULE */ | |
3869 | |
3870 | |
3871 case anychar: | |
3872 { | |
3873 int fastmap_newline = fastmap['\n']; | |
3874 | |
3875 /* `.' matches anything ... */ | |
3876 #ifdef MULE | |
3877 /* "anything" only includes bytes that can be the | |
3878 first byte of a character. */ | |
3879 for (j = 0; j < 0xA0; j++) | |
3880 fastmap[j] = 1; | |
3881 #else | |
3882 for (j = 0; j < (1 << BYTEWIDTH); j++) | |
3883 fastmap[j] = 1; | |
3884 #endif | |
3885 | |
3886 /* ... except perhaps newline. */ | |
3887 if (!(bufp->syntax & RE_DOT_NEWLINE)) | |
3888 fastmap['\n'] = fastmap_newline; | |
3889 | |
3890 /* Return if we have already set `can_be_null'; if we have, | |
3891 then the fastmap is irrelevant. Something's wrong here. */ | |
3892 else if (bufp->can_be_null) | |
3893 goto done; | |
3894 | |
3895 /* Otherwise, have to check alternative paths. */ | |
3896 break; | |
3897 } | |
3898 | |
826 | 3899 #ifndef emacs |
3900 case wordchar: | |
3901 for (j = 0; j < (1 << BYTEWIDTH); j++) | |
3902 if (SYNTAX (ignored, j) == Sword) | |
3903 fastmap[j] = 1; | |
3904 break; | |
3905 | |
3906 case notwordchar: | |
3907 for (j = 0; j < (1 << BYTEWIDTH); j++) | |
3908 if (SYNTAX (ignored, j) != Sword) | |
3909 fastmap[j] = 1; | |
3910 break; | |
3911 #else /* emacs */ | |
3912 case wordchar: | |
3913 case notwordchar: | |
460 | 3914 case wordbound: |
3915 case notwordbound: | |
3916 case wordbeg: | |
3917 case wordend: | |
3918 case notsyntaxspec: | |
3919 case syntaxspec: | |
3920 /* This match depends on text properties. These end with | |
3921 aborting optimizations. */ | |
3922 bufp->can_be_null = 1; | |
3923 goto done; | |
826 | 3924 #if 0 /* all of the following code is unused now that the `syntax-table' |
3925 property exists -- it's trickier to do this than just look in | |
3926 the buffer. &&#### but we could just use the syntax-cache stuff | |
3927 instead; why don't we? --ben */ | |
3928 case wordchar: | |
3929 k = (int) Sword; | |
3930 goto matchsyntax; | |
3931 | |
3932 case notwordchar: | |
3933 k = (int) Sword; | |
3934 goto matchnotsyntax; | |
3935 | |
428 | 3936 case syntaxspec: |
3937 k = *p++; | |
826 | 3938 matchsyntax: |
428 | 3939 #ifdef MULE |
3940 for (j = 0; j < 0x80; j++) | |
826 | 3941 if (SYNTAX |
3942 (XCHAR_TABLE (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf)), j) == | |
428 | 3943 (enum syntaxcode) k) |
3944 fastmap[j] = 1; | |
3945 for (j = 0x80; j < 0xA0; j++) | |
3946 { | |
826 | 3947 if (leading_byte_prefix_p ((unsigned char) j)) |
428 | 3948 /* too complicated to calculate this right */ |
3949 fastmap[j] = 1; | |
3950 else | |
3951 { | |
3952 int multi_p; | |
3953 Lisp_Object cset; | |
3954 | |
826 | 3955 cset = charset_by_leading_byte (j); |
428 | 3956 if (CHARSETP (cset)) |
3957 { | |
826 | 3958 if (charset_syntax (lispbuf, cset, &multi_p) |
428 | 3959 == Sword || multi_p) |
3960 fastmap[j] = 1; | |
3961 } | |
3962 } | |
3963 } | |
446 | 3964 #else /* not MULE */ |
428 | 3965 for (j = 0; j < (1 << BYTEWIDTH); j++) |
826 | 3966 if (SYNTAX |
3967 (XCHAR_TABLE (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf)), j) == | |
428 | 3968 (enum syntaxcode) k) |
3969 fastmap[j] = 1; | |
446 | 3970 #endif /* MULE */ |
428 | 3971 break; |
3972 | |
3973 | |
3974 case notsyntaxspec: | |
3975 k = *p++; | |
826 | 3976 matchnotsyntax: |
428 | 3977 #ifdef MULE |
3978 for (j = 0; j < 0x80; j++) | |
826 | 3979 if (SYNTAX |
428 | 3980 (XCHAR_TABLE |
826 | 3981 (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf)), j) != |
428 | 3982 (enum syntaxcode) k) |
3983 fastmap[j] = 1; | |
3984 for (j = 0x80; j < 0xA0; j++) | |
3985 { | |
826 | 3986 if (leading_byte_prefix_p ((unsigned char) j)) |
428 | 3987 /* too complicated to calculate this right */ |
3988 fastmap[j] = 1; | |
3989 else | |
3990 { | |
3991 int multi_p; | |
3992 Lisp_Object cset; | |
3993 | |
826 | 3994 cset = charset_by_leading_byte (j); |
428 | 3995 if (CHARSETP (cset)) |
3996 { | |
826 | 3997 if (charset_syntax (lispbuf, cset, &multi_p) |
428 | 3998 != Sword || multi_p) |
3999 fastmap[j] = 1; | |
4000 } | |
4001 } | |
4002 } | |
446 | 4003 #else /* not MULE */ |
428 | 4004 for (j = 0; j < (1 << BYTEWIDTH); j++) |
826 | 4005 if (SYNTAX |
428 | 4006 (XCHAR_TABLE |
826 | 4007 (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf)), j) != |
428 | 4008 (enum syntaxcode) k) |
4009 fastmap[j] = 1; | |
446 | 4010 #endif /* MULE */ |
428 | 4011 break; |
826 | 4012 #endif /* 0 */ |
428 | 4013 |
4014 #ifdef MULE | |
4015 /* 97/2/17 jhod category patch */ | |
4016 case categoryspec: | |
4017 case notcategoryspec: | |
4018 bufp->can_be_null = 1; | |
1333 | 4019 UNBIND_REGEX_MALLOC_CHECK (); |
428 | 4020 return 0; |
4021 /* end if category patch */ | |
4022 #endif /* MULE */ | |
4023 | |
4024 /* All cases after this match the empty string. These end with | |
4025 `continue'. */ | |
4026 case before_dot: | |
4027 case at_dot: | |
4028 case after_dot: | |
4029 continue; | |
826 | 4030 #endif /* emacs */ |
428 | 4031 |
4032 | |
4033 case no_op: | |
4034 case begline: | |
4035 case endline: | |
4036 case begbuf: | |
4037 case endbuf: | |
460 | 4038 #ifndef emacs |
428 | 4039 case wordbound: |
4040 case notwordbound: | |
4041 case wordbeg: | |
4042 case wordend: | |
460 | 4043 #endif |
428 | 4044 case push_dummy_failure: |
4045 continue; | |
4046 | |
4047 | |
4048 case jump_n: | |
4049 case pop_failure_jump: | |
4050 case maybe_pop_jump: | |
4051 case jump: | |
4052 case jump_past_alt: | |
4053 case dummy_failure_jump: | |
4054 EXTRACT_NUMBER_AND_INCR (j, p); | |
4055 p += j; | |
4056 if (j > 0) | |
4057 continue; | |
4058 | |
4059 /* Jump backward implies we just went through the body of a | |
4060 loop and matched nothing. Opcode jumped to should be | |
4061 `on_failure_jump' or `succeed_n'. Just treat it like an | |
4062 ordinary jump. For a * loop, it has pushed its failure | |
4063 point already; if so, discard that as redundant. */ | |
4064 if ((re_opcode_t) *p != on_failure_jump | |
4065 && (re_opcode_t) *p != succeed_n) | |
4066 continue; | |
4067 | |
4068 p++; | |
4069 EXTRACT_NUMBER_AND_INCR (j, p); | |
4070 p += j; | |
4071 | |
4072 /* If what's on the stack is where we are now, pop it. */ | |
4073 if (!FAIL_STACK_EMPTY () | |
4074 && fail_stack.stack[fail_stack.avail - 1].pointer == p) | |
4075 fail_stack.avail--; | |
4076 | |
4077 continue; | |
4078 | |
4079 | |
4080 case on_failure_jump: | |
4081 case on_failure_keep_string_jump: | |
4082 handle_on_failure_jump: | |
4083 EXTRACT_NUMBER_AND_INCR (j, p); | |
4084 | |
4085 /* For some patterns, e.g., `(a?)?', `p+j' here points to the | |
4086 end of the pattern. We don't want to push such a point, | |
4087 since when we restore it above, entering the switch will | |
4088 increment `p' past the end of the pattern. We don't need | |
4089 to push such a point since we obviously won't find any more | |
4090 fastmap entries beyond `pend'. Such a pattern can match | |
4091 the null string, though. */ | |
4092 if (p + j < pend) | |
4093 { | |
4094 if (!PUSH_PATTERN_OP (p + j, fail_stack)) | |
4095 { | |
4096 RESET_FAIL_STACK (); | |
1333 | 4097 UNBIND_REGEX_MALLOC_CHECK (); |
428 | 4098 return -2; |
4099 } | |
4100 } | |
4101 else | |
4102 bufp->can_be_null = 1; | |
4103 | |
4104 if (succeed_n_p) | |
4105 { | |
4106 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */ | |
4107 succeed_n_p = false; | |
4108 } | |
4109 | |
4110 continue; | |
4111 | |
4112 | |
4113 case succeed_n: | |
4114 /* Get to the number of times to succeed. */ | |
4115 p += 2; | |
4116 | |
4117 /* Increment p past the n for when k != 0. */ | |
4118 EXTRACT_NUMBER_AND_INCR (k, p); | |
4119 if (k == 0) | |
4120 { | |
4121 p -= 4; | |
4122 succeed_n_p = true; /* Spaghetti code alert. */ | |
4123 goto handle_on_failure_jump; | |
4124 } | |
4125 continue; | |
4126 | |
4127 | |
4128 case set_number_at: | |
4129 p += 4; | |
4130 continue; | |
4131 | |
4132 | |
4133 case start_memory: | |
4134 case stop_memory: | |
4135 p += 2; | |
4136 continue; | |
4137 | |
4138 | |
4139 default: | |
2500 | 4140 ABORT (); /* We have listed all the cases. */ |
428 | 4141 } /* switch *p++ */ |
4142 | |
4143 /* Getting here means we have found the possible starting | |
4144 characters for one path of the pattern -- and that the empty | |
4145 string does not match. We need not follow this path further. | |
4146 Instead, look at the next alternative (remembered on the | |
4147 stack), or quit if no more. The test at the top of the loop | |
4148 does these things. */ | |
4149 path_can_be_null = false; | |
4150 p = pend; | |
4151 } /* while p */ | |
4152 | |
4153 /* Set `can_be_null' for the last path (also the first path, if the | |
4154 pattern is empty). */ | |
4155 bufp->can_be_null |= path_can_be_null; | |
4156 | |
4157 done: | |
4158 RESET_FAIL_STACK (); | |
1333 | 4159 UNBIND_REGEX_MALLOC_CHECK (); |
428 | 4160 return 0; |
4161 } /* re_compile_fastmap */ | |
4162 | |
4163 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and | |
4164 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use | |
4165 this memory for recording register information. STARTS and ENDS | |
4166 must be allocated using the malloc library routine, and must each | |
4167 be at least NUM_REGS * sizeof (regoff_t) bytes long. | |
4168 | |
4169 If NUM_REGS == 0, then subsequent matches should allocate their own | |
4170 register data. | |
4171 | |
4172 Unless this function is called, the first search or match using | |
4173 PATTERN_BUFFER will allocate its own register data, without | |
4174 freeing the old data. */ | |
4175 | |
4176 void | |
4177 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs, | |
647 | 4178 int num_regs, regoff_t *starts, regoff_t *ends) |
428 | 4179 { |
4180 if (num_regs) | |
4181 { | |
4182 bufp->regs_allocated = REGS_REALLOCATE; | |
4183 regs->num_regs = num_regs; | |
4184 regs->start = starts; | |
4185 regs->end = ends; | |
4186 } | |
4187 else | |
4188 { | |
4189 bufp->regs_allocated = REGS_UNALLOCATED; | |
4190 regs->num_regs = 0; | |
4191 regs->start = regs->end = (regoff_t *) 0; | |
4192 } | |
4193 } | |
4194 | |
4195 /* Searching routines. */ | |
4196 | |
4197 /* Like re_search_2, below, but only one string is specified, and | |
4198 doesn't let you say where to stop matching. */ | |
4199 | |
4200 int | |
442 | 4201 re_search (struct re_pattern_buffer *bufp, const char *string, int size, |
826 | 4202 int startpos, int range, struct re_registers *regs |
4203 RE_LISP_CONTEXT_ARGS_DECL) | |
428 | 4204 { |
4205 return re_search_2 (bufp, NULL, 0, string, size, startpos, range, | |
826 | 4206 regs, size RE_LISP_CONTEXT_ARGS); |
428 | 4207 } |
4208 | |
4209 /* Using the compiled pattern in BUFP->buffer, first tries to match the | |
4210 virtual concatenation of STRING1 and STRING2, starting first at index | |
4211 STARTPOS, then at STARTPOS + 1, and so on. | |
4212 | |
4213 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively. | |
4214 | |
4215 RANGE is how far to scan while trying to match. RANGE = 0 means try | |
4216 only at STARTPOS; in general, the last start tried is STARTPOS + | |
4217 RANGE. | |
4218 | |
826 | 4219 All sizes and positions refer to bytes (not chars); under Mule, the code |
4220 knows about the format of the text and will only check at positions | |
4221 where a character starts. | |
4222 | |
428 | 4223 With MULE, RANGE is a byte position, not a char position. The last |
4224 start tried is the character starting <= STARTPOS + RANGE. | |
4225 | |
4226 In REGS, return the indices of the virtual concatenation of STRING1 | |
4227 and STRING2 that matched the entire BUFP->buffer and its contained | |
4228 subexpressions. | |
4229 | |
4230 Do not consider matching one past the index STOP in the virtual | |
4231 concatenation of STRING1 and STRING2. | |
4232 | |
4233 We return either the position in the strings at which the match was | |
4234 found, -1 if no match, or -2 if error (such as failure | |
4235 stack overflow). */ | |
4236 | |
4237 int | |
446 | 4238 re_search_2 (struct re_pattern_buffer *bufp, const char *str1, |
4239 int size1, const char *str2, int size2, int startpos, | |
826 | 4240 int range, struct re_registers *regs, int stop |
4241 RE_LISP_CONTEXT_ARGS_DECL) | |
428 | 4242 { |
4243 int val; | |
446 | 4244 re_char *string1 = (re_char *) str1; |
4245 re_char *string2 = (re_char *) str2; | |
428 | 4246 REGISTER char *fastmap = bufp->fastmap; |
446 | 4247 REGISTER RE_TRANSLATE_TYPE translate = bufp->translate; |
428 | 4248 int total_size = size1 + size2; |
4249 int endpos = startpos + range; | |
4250 #ifdef REGEX_BEGLINE_CHECK | |
4251 int anchored_at_begline = 0; | |
4252 #endif | |
446 | 4253 re_char *d; |
826 | 4254 #ifdef emacs |
4255 Internal_Format fmt = buffer_or_other_internal_format (lispobj); | |
1346 | 4256 #ifdef REL_ALLOC |
4257 Ibyte *orig_buftext = | |
4258 BUFFERP (lispobj) ? | |
4259 BYTE_BUF_BYTE_ADDRESS (XBUFFER (lispobj), | |
4260 BYTE_BUF_BEGV (XBUFFER (lispobj))) : | |
4261 0; | |
4262 #endif | |
1333 | 4263 #ifdef ERROR_CHECK_MALLOC |
4264 int depth; | |
4265 #endif | |
826 | 4266 #endif /* emacs */ |
4267 #if 1 | |
4268 int forward_search_p; | |
4269 #endif | |
428 | 4270 |
4271 /* Check for out-of-range STARTPOS. */ | |
4272 if (startpos < 0 || startpos > total_size) | |
4273 return -1; | |
4274 | |
4275 /* Fix up RANGE if it might eventually take us outside | |
4276 the virtual concatenation of STRING1 and STRING2. */ | |
4277 if (endpos < 0) | |
4278 range = 0 - startpos; | |
4279 else if (endpos > total_size) | |
4280 range = total_size - startpos; | |
4281 | |
826 | 4282 #if 1 |
4283 forward_search_p = range > 0; | |
4284 #endif | |
4285 | |
428 | 4286 /* If the search isn't to be a backwards one, don't waste time in a |
4287 search for a pattern that must be anchored. */ | |
4288 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0) | |
4289 { | |
4290 if (startpos > 0) | |
4291 return -1; | |
4292 else | |
4293 { | |
442 | 4294 d = ((const unsigned char *) |
428 | 4295 (startpos >= size1 ? string2 - size1 : string1) + startpos); |
867 | 4296 range = itext_ichar_len_fmt (d, fmt); |
428 | 4297 } |
4298 } | |
4299 | |
460 | 4300 #ifdef emacs |
4301 /* In a forward search for something that starts with \=. | |
4302 don't keep searching past point. */ | |
4303 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0) | |
4304 { | |
826 | 4305 if (!BUFFERP (lispobj)) |
4306 return -1; | |
4527
8418d1ad4944
Fix at_dot regex under Mule. <87hc6rv53v.fsf@uwakimon.sk.tsukuba.ac.jp>
Stephen J. Turnbull <stephen@xemacs.org>
parents:
3300
diff
changeset
|
4307 range = (BYTE_BUF_PT (XBUFFER (lispobj)) |
8418d1ad4944
Fix at_dot regex under Mule. <87hc6rv53v.fsf@uwakimon.sk.tsukuba.ac.jp>
Stephen J. Turnbull <stephen@xemacs.org>
parents:
3300
diff
changeset
|
4308 - BYTE_BUF_BEGV (XBUFFER (lispobj)) - startpos); |
460 | 4309 if (range < 0) |
4310 return -1; | |
4311 } | |
4312 #endif /* emacs */ | |
4313 | |
1333 | 4314 #ifdef ERROR_CHECK_MALLOC |
4315 /* Do this after the above return()s. */ | |
4316 depth = bind_regex_malloc_disallowed (1); | |
4317 #endif | |
4318 | |
428 | 4319 /* Update the fastmap now if not correct already. */ |
1333 | 4320 BEGIN_REGEX_MALLOC_OK (); |
428 | 4321 if (fastmap && !bufp->fastmap_accurate) |
826 | 4322 if (re_compile_fastmap (bufp RE_LISP_SHORT_CONTEXT_ARGS) == -2) |
1333 | 4323 { |
4324 END_REGEX_MALLOC_OK (); | |
4325 UNBIND_REGEX_MALLOC_CHECK (); | |
4326 return -2; | |
4327 } | |
4328 | |
4329 END_REGEX_MALLOC_OK (); | |
4330 RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
428 | 4331 |
4332 #ifdef REGEX_BEGLINE_CHECK | |
4333 { | |
647 | 4334 long i = 0; |
428 | 4335 |
4336 while (i < bufp->used) | |
4337 { | |
4338 if (bufp->buffer[i] == start_memory || | |
4339 bufp->buffer[i] == stop_memory) | |
4340 i += 2; | |
4341 else | |
4342 break; | |
4343 } | |
4344 anchored_at_begline = i < bufp->used && bufp->buffer[i] == begline; | |
4345 } | |
4346 #endif | |
4347 | |
460 | 4348 #ifdef emacs |
1333 | 4349 BEGIN_REGEX_MALLOC_OK (); |
826 | 4350 scache = setup_syntax_cache (scache, lispobj, lispbuf, |
4351 offset_to_charxpos (lispobj, startpos), | |
4352 1); | |
1333 | 4353 END_REGEX_MALLOC_OK (); |
4354 RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
460 | 4355 #endif |
4356 | |
428 | 4357 /* Loop through the string, looking for a place to start matching. */ |
4358 for (;;) | |
4359 { | |
4360 #ifdef REGEX_BEGLINE_CHECK | |
826 | 4361 /* If the regex is anchored at the beginning of a line (i.e. with a |
4362 ^), then we can speed things up by skipping to the next | |
4363 beginning-of-line. However, to determine "beginning of line" we | |
4364 need to look at the previous char, so can't do this check if at | |
4365 beginning of either string. (Well, we could if at the beginning of | |
4366 the second string, but it would require additional code, and this | |
4367 is just an optimization.) */ | |
4368 if (anchored_at_begline && startpos > 0 && startpos != size1) | |
428 | 4369 { |
826 | 4370 if (range > 0) |
4371 { | |
4372 /* whose stupid idea was it anyway to make this | |
4373 function take two strings to match?? */ | |
4374 int lim = 0; | |
4375 re_char *orig_d; | |
4376 re_char *stop_d; | |
4377 | |
4378 /* Compute limit as below in fastmap code, so we are guaranteed | |
4379 to remain within a single string. */ | |
4380 if (startpos < size1 && startpos + range >= size1) | |
4381 lim = range - (size1 - startpos); | |
4382 | |
4383 d = ((const unsigned char *) | |
4384 (startpos >= size1 ? string2 - size1 : string1) + startpos); | |
4385 orig_d = d; | |
4386 stop_d = d + range - lim; | |
4387 | |
4388 /* We want to find the next location (including the current | |
4389 one) where the previous char is a newline, so back up one | |
4390 and search forward for a newline. */ | |
867 | 4391 DEC_IBYTEPTR_FMT (d, fmt); /* Ok, since startpos != size1. */ |
826 | 4392 |
4393 /* Written out as an if-else to avoid testing `translate' | |
4394 inside the loop. */ | |
4395 if (TRANSLATE_P (translate)) | |
4396 while (d < stop_d && | |
867 | 4397 RE_TRANSLATE_1 (itext_ichar_fmt (d, fmt, lispobj)) |
826 | 4398 != '\n') |
867 | 4399 INC_IBYTEPTR_FMT (d, fmt); |
826 | 4400 else |
4401 while (d < stop_d && | |
867 | 4402 itext_ichar_ascii_fmt (d, fmt, lispobj) != '\n') |
4403 INC_IBYTEPTR_FMT (d, fmt); | |
826 | 4404 |
4405 /* If we were stopped by a newline, skip forward over it. | |
4406 Otherwise we will get in an infloop when our start position | |
4407 was at begline. */ | |
4408 if (d < stop_d) | |
867 | 4409 INC_IBYTEPTR_FMT (d, fmt); |
826 | 4410 range -= d - orig_d; |
4411 startpos += d - orig_d; | |
4412 #if 1 | |
4413 assert (!forward_search_p || range >= 0); | |
4414 #endif | |
4415 } | |
4416 else if (range < 0) | |
4417 { | |
4418 /* We're lazy, like in the fastmap code below */ | |
867 | 4419 Ichar c; |
826 | 4420 |
4421 d = ((const unsigned char *) | |
4422 (startpos >= size1 ? string2 - size1 : string1) + startpos); | |
867 | 4423 DEC_IBYTEPTR_FMT (d, fmt); |
4424 c = itext_ichar_fmt (d, fmt, lispobj); | |
826 | 4425 c = RE_TRANSLATE (c); |
4426 if (c != '\n') | |
4427 goto advance; | |
4428 } | |
428 | 4429 } |
4430 #endif /* REGEX_BEGLINE_CHECK */ | |
4431 | |
4432 /* If a fastmap is supplied, skip quickly over characters that | |
4433 cannot be the start of a match. If the pattern can match the | |
4434 null string, however, we don't need to skip characters; we want | |
4435 the first null string. */ | |
4436 if (fastmap && startpos < total_size && !bufp->can_be_null) | |
4437 { | |
826 | 4438 /* For the moment, fastmap always works as if buffer |
4439 is in default format, so convert chars in the search strings | |
4440 into default format as we go along, if necessary. | |
4441 | |
4442 &&#### fastmap needs rethinking for 8-bit-fixed so | |
4443 it's faster. We need it to reflect the raw | |
4444 8-bit-fixed values. That isn't so hard if we assume | |
4445 that the top 96 bytes represent a single 1-byte | |
4446 charset. For 16-bit/32-bit stuff it's probably not | |
4447 worth it to make the fastmap represent the raw, due to | |
4448 its nature -- we'd have to use the LSB for the | |
4449 fastmap, and that causes lots of problems with Mule | |
4450 chars, where it essentially wipes out the usefulness | |
4451 of the fastmap entirely. */ | |
428 | 4452 if (range > 0) /* Searching forwards. */ |
4453 { | |
4454 int lim = 0; | |
4455 int irange = range; | |
4456 | |
4457 if (startpos < size1 && startpos + range >= size1) | |
4458 lim = range - (size1 - startpos); | |
4459 | |
442 | 4460 d = ((const unsigned char *) |
428 | 4461 (startpos >= size1 ? string2 - size1 : string1) + startpos); |
4462 | |
4463 /* Written out as an if-else to avoid testing `translate' | |
4464 inside the loop. */ | |
446 | 4465 if (TRANSLATE_P (translate)) |
826 | 4466 { |
4467 while (range > lim) | |
4468 { | |
4469 re_char *old_d = d; | |
428 | 4470 #ifdef MULE |
867 | 4471 Ibyte tempch[MAX_ICHAR_LEN]; |
4472 Ichar buf_ch = | |
4473 RE_TRANSLATE_1 (itext_ichar_fmt (d, fmt, lispobj)); | |
4474 set_itext_ichar (tempch, buf_ch); | |
826 | 4475 if (fastmap[*tempch]) |
4476 break; | |
446 | 4477 #else |
826 | 4478 if (fastmap[(unsigned char) RE_TRANSLATE_1 (*d)]) |
4479 break; | |
446 | 4480 #endif /* MULE */ |
867 | 4481 INC_IBYTEPTR_FMT (d, fmt); |
826 | 4482 range -= (d - old_d); |
4483 #if 1 | |
1333 | 4484 assert (!forward_search_p || range >= 0); |
826 | 4485 #endif |
4486 } | |
4487 } | |
4488 #ifdef MULE | |
4489 else if (fmt != FORMAT_DEFAULT) | |
4490 { | |
4491 while (range > lim) | |
4492 { | |
4493 re_char *old_d = d; | |
867 | 4494 Ibyte tempch[MAX_ICHAR_LEN]; |
4495 Ichar buf_ch = itext_ichar_fmt (d, fmt, lispobj); | |
4496 set_itext_ichar (tempch, buf_ch); | |
826 | 4497 if (fastmap[*tempch]) |
4498 break; | |
867 | 4499 INC_IBYTEPTR_FMT (d, fmt); |
826 | 4500 range -= (d - old_d); |
4501 #if 1 | |
1333 | 4502 assert (!forward_search_p || range >= 0); |
826 | 4503 #endif |
4504 } | |
4505 } | |
4506 #endif /* MULE */ | |
428 | 4507 else |
826 | 4508 { |
4509 while (range > lim && !fastmap[*d]) | |
4510 { | |
4511 re_char *old_d = d; | |
867 | 4512 INC_IBYTEPTR (d); |
826 | 4513 range -= (d - old_d); |
4514 #if 1 | |
4515 assert (!forward_search_p || range >= 0); | |
4516 #endif | |
4517 } | |
4518 } | |
428 | 4519 |
4520 startpos += irange - range; | |
4521 } | |
4522 else /* Searching backwards. */ | |
4523 { | |
826 | 4524 /* #### It's not clear why we don't just write a loop, like |
4525 for the moving-forward case. Perhaps the writer got lazy, | |
4526 since backward searches aren't so common. */ | |
4527 d = ((const unsigned char *) | |
4528 (startpos >= size1 ? string2 - size1 : string1) + startpos); | |
428 | 4529 #ifdef MULE |
826 | 4530 { |
867 | 4531 Ibyte tempch[MAX_ICHAR_LEN]; |
4532 Ichar buf_ch = | |
4533 RE_TRANSLATE (itext_ichar_fmt (d, fmt, lispobj)); | |
4534 set_itext_ichar (tempch, buf_ch); | |
826 | 4535 if (!fastmap[*tempch]) |
4536 goto advance; | |
4537 } | |
428 | 4538 #else |
826 | 4539 if (!fastmap[(unsigned char) RE_TRANSLATE (*d)]) |
446 | 4540 goto advance; |
826 | 4541 #endif /* MULE */ |
428 | 4542 } |
4543 } | |
4544 | |
4545 /* If can't match the null string, and that's all we have left, fail. */ | |
4546 if (range >= 0 && startpos == total_size && fastmap | |
4547 && !bufp->can_be_null) | |
1333 | 4548 { |
4549 UNBIND_REGEX_MALLOC_CHECK (); | |
4550 return -1; | |
4551 } | |
428 | 4552 |
4553 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */ | |
4554 if (!no_quit_in_re_search) | |
1333 | 4555 { |
4556 BEGIN_REGEX_MALLOC_OK (); | |
4557 QUIT; | |
4558 END_REGEX_MALLOC_OK (); | |
4559 RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
4560 } | |
4561 | |
428 | 4562 #endif |
1333 | 4563 BEGIN_REGEX_MALLOC_OK (); |
428 | 4564 val = re_match_2_internal (bufp, string1, size1, string2, size2, |
826 | 4565 startpos, regs, stop |
4566 RE_LISP_CONTEXT_ARGS); | |
428 | 4567 #ifndef REGEX_MALLOC |
1333 | 4568 ALLOCA_GARBAGE_COLLECT (); |
428 | 4569 #endif |
1333 | 4570 END_REGEX_MALLOC_OK (); |
4571 RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
428 | 4572 |
4573 if (val >= 0) | |
1333 | 4574 { |
4575 UNBIND_REGEX_MALLOC_CHECK (); | |
4576 return startpos; | |
4577 } | |
428 | 4578 |
4579 if (val == -2) | |
1333 | 4580 { |
4581 UNBIND_REGEX_MALLOC_CHECK (); | |
4582 return -2; | |
4583 } | |
4584 | |
4585 RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
428 | 4586 advance: |
4587 if (!range) | |
4588 break; | |
4589 else if (range > 0) | |
4590 { | |
826 | 4591 Bytecount d_size; |
442 | 4592 d = ((const unsigned char *) |
428 | 4593 (startpos >= size1 ? string2 - size1 : string1) + startpos); |
867 | 4594 d_size = itext_ichar_len_fmt (d, fmt); |
428 | 4595 range -= d_size; |
826 | 4596 #if 1 |
4597 assert (!forward_search_p || range >= 0); | |
4598 #endif | |
428 | 4599 startpos += d_size; |
4600 } | |
4601 else | |
4602 { | |
826 | 4603 Bytecount d_size; |
428 | 4604 /* Note startpos > size1 not >=. If we are on the |
4605 string1/string2 boundary, we want to backup into string1. */ | |
442 | 4606 d = ((const unsigned char *) |
428 | 4607 (startpos > size1 ? string2 - size1 : string1) + startpos); |
867 | 4608 DEC_IBYTEPTR_FMT (d, fmt); |
4609 d_size = itext_ichar_len_fmt (d, fmt); | |
428 | 4610 range += d_size; |
826 | 4611 #if 1 |
4612 assert (!forward_search_p || range >= 0); | |
4613 #endif | |
428 | 4614 startpos -= d_size; |
4615 } | |
4616 } | |
1333 | 4617 UNBIND_REGEX_MALLOC_CHECK (); |
428 | 4618 return -1; |
4619 } /* re_search_2 */ | |
826 | 4620 |
428 | 4621 |
4622 /* Declarations and macros for re_match_2. */ | |
4623 | |
4624 /* This converts PTR, a pointer into one of the search strings `string1' | |
4625 and `string2', into an offset from the beginning of the virtual
   concatenation of the two strings: a pointer into `string1' yields its
   offset within `string1', while a pointer into `string2' yields its
   offset within `string2' plus `size1'.  The expansion refers to
   `string1', `string2' and `size1', which must be in scope at the point
   of use, and evaluates PTR more than once. */ | |
4626 #define POINTER_TO_OFFSET(ptr) \ | |
4627 (FIRST_STRING_P (ptr) \ | |
4628 ? ((regoff_t) ((ptr) - string1)) \ | |
4629 : ((regoff_t) ((ptr) - string2 + size1))) | |
4630 | |
4631 /* Macros for dealing with the split strings in re_match_2. */ | |
4632 | |
/* True while the end of the current input segment, `dend', is still
   `end_match_1', i.e. before the matcher has switched `dend' over to
   `end_match_2'.  Both names are taken from the enclosing scope. */
4633 #define MATCHING_IN_FIRST_STRING (dend == end_match_1) | |
4634 | |
4635 /* Call before fetching a character with *d. This switches over to | |
4636 string2 if necessary. */ | |
/* Uses `d', `dend', `end_match_2' and `string2' from the enclosing scope
   and jumps to the enclosing `fail' label when the input is exhausted.
   It is a `while' rather than an `if': after switching to string2 the
   condition is tested again, so if `string2 == end_match_2' the second
   iteration takes the `goto fail'. */
826 | 4637 #define REGEX_PREFETCH() \ |
428 | 4638 while (d == dend) \ |
4639 { \ | |
4640 /* End of string2 => fail. */ \ | |
4641 if (dend == end_match_2) \ | |
4642 goto fail; \ | |
4643 /* End of string1 => advance to string2. */ \ | |
4644 d = string2; \ | |
4645 dend = end_match_2; \ | |
4646 } | |
4647 | |
4648 | |
4649 /* Test if at very beginning or at very end of the virtual concatenation | |
4650 of `string1' and `string2'. If only one string, it's `string2'. */ | |
/* Both macros read `string1', `string2', `size1', `size2' and `end2'
   from the enclosing scope.  Note that AT_STRINGS_BEG is true for any D
   whenever `size2' is zero (the `|| !size2' term).
   NOTE(review): confirm that this is intended. */
4651 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2) | |
4652 #define AT_STRINGS_END(d) ((d) == end2) | |
4653 | |
4654 /* XEmacs change: | |
4655 If the given position straddles the string gap, return the equivalent | |
4656 position that is before or after the gap, respectively; otherwise, | |
4657 return the same position. */ | |
/* Only the exact boundary pointers are remapped: `string2' maps back to
   `end1', and `end1' maps forward to `string2'.  Each macro mentions D
   twice, so D may be evaluated more than once -- presumably the reason
   for the _UNSAFE suffix (TODO confirm). */
4658 #define POS_BEFORE_GAP_UNSAFE(d) ((d) == string2 ? end1 : (d)) | |
4659 #define POS_AFTER_GAP_UNSAFE(d) ((d) == end1 ? string2 : (d)) | |
4660 | |
4661 /* Test if CH is a word-constituent character. (XEmacs change) */ | |
/* Looks CH up in the mirror syntax table of `lispbuf' and compares the
   result with `Sword'; `lispbuf' must be in scope at the point of use. */
826 | 4662 #define WORDCHAR_P(ch) \ |
4663 (SYNTAX (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf), ch) == Sword) | |
428 | 4664 |
4665 /* Free everything we malloc. */ | |
4666 #ifdef MATCH_MAY_ALLOCATE | |
/* Free VAR (whose type is TYPE) if it is non-null, then reset it to NULL.
   The body is wrapped in do { ... } while (0) so that the expansion is a
   single statement: `if (cond) FREE_VAR (v, t);' must not reset V when
   COND is false, and a following `else' must still parse.  The caller
   supplies the trailing semicolon.  VAR is evaluated more than once. */
#define FREE_VAR(var,type) \
  do { if (var) REGEX_FREE (var, type); (var) = NULL; } while (0)
/* MATCH_MAY_ALLOCATE variant: unbind the regex malloc check, release the
   failure stack, then hand each per-match register vector to FREE_VAR.
   The expansion refers to `fail_stack' and the register variables by
   name, so they must all be in scope (and either allocated or NULL). */
428 | 4668 #define FREE_VARIABLES() \ |
4669 do { \ | |
1333 | 4670 UNBIND_REGEX_MALLOC_CHECK (); \ |
428 | 4671 REGEX_FREE_STACK (fail_stack.stack); \ |
1726 | 4672 FREE_VAR (regstart, re_char **); \ |
4673 FREE_VAR (regend, re_char **); \ | |
4674 FREE_VAR (old_regstart, re_char **); \ | |
4675 FREE_VAR (old_regend, re_char **); \ | |
4676 FREE_VAR (best_regstart, re_char **); \ | |
4677 FREE_VAR (best_regend, re_char **); \ | |
4678 FREE_VAR (reg_info, register_info_type *); \ | |
4679 FREE_VAR (reg_dummy, re_char **); \ | |
4680 FREE_VAR (reg_info_dummy, register_info_type *); \ | |
428 | 4681 } while (0) |
446 | 4682 #else /* not MATCH_MAY_ALLOCATE */ |
/* Variant used when MATCH_MAY_ALLOCATE is not defined: nothing is freed
   here; only the regex malloc check is unbound. */
1333 | 4683 #define FREE_VARIABLES() \ |
4684 do { \ | |
4685 UNBIND_REGEX_MALLOC_CHECK (); \ | |
4686 } while (0) | |
446 | 4687 #endif /* MATCH_MAY_ALLOCATE */ |
428 | 4688 |
4689 /* These values must meet several constraints. They must not be valid | |
4690 register values; since we have a limit of 255 registers (because | |
4691 we use only one byte in the pattern for the register number), we can | |
4692 use numbers larger than 255. They must differ by 1, because of | |
4693 NUM_FAILURE_ITEMS above. And the value for the lowest register must | |
4694 be larger than the value for the highest register, so we do not try | |
4695 to actually save any registers when none are active. */ | |
/* NO_HIGHEST_ACTIVE_REG is 1 << BYTEWIDTH, and NO_LOWEST_ACTIVE_REG is
   exactly one more, so "lowest > highest" holds while no register is
   active. */
4696 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH) | |
4697 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1) | |
4698 | |
4699 /* Matching routines. */ | |
4700 | |
826 | 4701 #ifndef emacs /* XEmacs never uses this. */ |
428 | 4702 /* re_match is like re_match_2 except it takes only a single string. */ |
4703 | |
/* Single-string convenience wrapper (compiled only when `emacs' is not
   defined).  Returns whatever re_match_2_internal returns. */
4704 int | |
442 | 4705 re_match (struct re_pattern_buffer *bufp, const char *string, int size, |
826 | 4706 int pos, struct re_registers *regs |
4707 RE_LISP_CONTEXT_ARGS_DECL) | |
428 | 4708 { |
  /* STRING is passed as string2 with a null, zero-length string1, and
     SIZE is passed again as the `stop' argument, so matching may run to
     the end of STRING. */
446 | 4709 int result = re_match_2_internal (bufp, NULL, 0, (re_char *) string, size, |
826 | 4710 pos, regs, size |
4711 RE_LISP_CONTEXT_ARGS); | |
  /* Clean up alloca'd storage now that the matcher has returned. */
1333 | 4712 ALLOCA_GARBAGE_COLLECT (); |
428 | 4713 return result; |
4714 } | |
4715 #endif /* not emacs */ | |
4716 | |
4717 /* re_match_2 matches the compiled pattern in BUFP against the | |
4718 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 and | |
4719 SIZE2, respectively). We start matching at POS, and stop matching | |
4720 at STOP. | |
4721 | |
4722 If REGS is non-null and the `no_sub' field of BUFP is zero, we | |
4723 store offsets for the substring each group matched in REGS. See the | |
4724 documentation for exactly how many groups we fill. | |
4725 | |
4726 We return -1 if no match, -2 if an internal error (such as the | |
4727 failure stack overflowing). Otherwise, we return the length of the | |
4728 matched substring. */ | |
4729 | |
/* Public entry point: a thin wrapper that forwards to
   re_match_2_internal and returns its result unchanged. */
4730 int | |
442 | 4731 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, |
4732 int size1, const char *string2, int size2, int pos, | |
826 | 4733 struct re_registers *regs, int stop |
4734 RE_LISP_CONTEXT_ARGS_DECL) | |
428 | 4735 { |
460 | 4736 int result; |
4737 | |
  /* Under `emacs', set up the syntax cache for the character position
     that corresponds to POS before matching starts.
     NOTE(review): the meaning of the final argument `1' is not visible
     from here -- check setup_syntax_cache. */
4738 #ifdef emacs | |
826 | 4739 scache = setup_syntax_cache (scache, lispobj, lispbuf, |
4740 offset_to_charxpos (lispobj, pos), | |
4741 1); | |
460 | 4742 #endif |
4743 | |
  /* The casts only convert the `const char *' arguments to `re_char *';
     all other arguments are forwarded as given. */
4744 result = re_match_2_internal (bufp, (re_char *) string1, size1, | |
4745 (re_char *) string2, size2, | |
826 | 4746 pos, regs, stop |
4747 RE_LISP_CONTEXT_ARGS); | |
460 | 4748 |
  /* Clean up alloca'd storage now that the matcher has returned. */
1333 | 4749 ALLOCA_GARBAGE_COLLECT (); |
428 | 4750 return result; |
4751 } | |
4752 | |
4753 /* This is a separate function so that we can force an alloca cleanup | |
4754 afterwards. */ | |
4755 static int | |
446 | 4756 re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1, |
4757 int size1, re_char *string2, int size2, int pos, | |
826 | 4758 struct re_registers *regs, int stop |
2333 | 4759 RE_LISP_CONTEXT_ARGS_MULE_DECL) |
428 | 4760 { |
4761 /* General temporaries. */ | |
4762 int mcnt; | |
4763 unsigned char *p1; | |
4764 int should_succeed; /* XEmacs change */ | |
4765 | |
4766 /* Just past the end of the corresponding string. */ | |
446 | 4767 re_char *end1, *end2; |
428 | 4768 |
4769 /* Pointers into string1 and string2, just past the last characters in | |
4770 each to consider matching. */ | |
446 | 4771 re_char *end_match_1, *end_match_2; |
428 | 4772 |
4773 /* Where we are in the data, and the end of the current string. */ | |
446 | 4774 re_char *d, *dend; |
428 | 4775 |
4776 /* Where we are in the pattern, and the end of the pattern. */ | |
4777 unsigned char *p = bufp->buffer; | |
4778 REGISTER unsigned char *pend = p + bufp->used; | |
4779 | |
4780 /* Mark the opcode just after a start_memory, so we can test for an | |
4781 empty subpattern when we get to the stop_memory. */ | |
446 | 4782 re_char *just_past_start_mem = 0; |
428 | 4783 |
4784 /* We use this to map every character in the string. */ | |
446 | 4785 RE_TRANSLATE_TYPE translate = bufp->translate; |
428 | 4786 |
4787 /* Failure point stack. Each place that can handle a failure further | |
4788 down the line pushes a failure point on this stack. It consists of | |
4789 restart, regend, and reg_info for all registers corresponding to | |
4790 the subexpressions we're currently inside, plus the number of such | |
4791 registers, and, finally, two char *'s. The first char * is where | |
4792 to resume scanning the pattern; the second one is where to resume | |
4793 scanning the strings. If the latter is zero, the failure point is | |
4794 a ``dummy''; if a failure happens and the failure point is a dummy, | |
4795 it gets discarded and the next one is tried. */ | |
4796 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ | |
4797 fail_stack_type fail_stack; | |
4798 #endif | |
4799 #ifdef DEBUG | |
647 | 4800 static int failure_id; |
4801 int nfailure_points_pushed = 0, nfailure_points_popped = 0; | |
428 | 4802 #endif |
4803 | |
771 | 4804 #ifdef REGEX_REL_ALLOC |
428 | 4805 /* This holds the pointer to the failure stack, when |
4806 it is allocated relocatably. */ | |
4807 fail_stack_elt_t *failure_stack_ptr; | |
4808 #endif | |
4809 | |
4810 /* We fill all the registers internally, independent of what we | |
4811 return, for use in backreferences. The number here includes | |
4812 an element for register zero. */ | |
647 | 4813 int num_regs = bufp->re_ngroups + 1; |
428 | 4814 |
4815 /* The currently active registers. */ | |
647 | 4816 int lowest_active_reg = NO_LOWEST_ACTIVE_REG; |
4817 int highest_active_reg = NO_HIGHEST_ACTIVE_REG; | |
428 | 4818 |
4819 /* Information on the contents of registers. These are pointers into | |
4820 the input strings; they record just what was matched (on this | |
4821 attempt) by a subexpression part of the pattern, that is, the | |
4822 regnum-th regstart pointer points to where in the string we began | |
4823 matching and the regnum-th regend points to right after where we | |
4824 stopped matching the regnum-th subexpression. (The zeroth register | |
4825 keeps track of what the whole pattern matches.) */ | |
4826 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ | |
446 | 4827 re_char **regstart, **regend; |
428 | 4828 #endif |
4829 | |
4830 /* If a group that's operated upon by a repetition operator fails to | |
4831 match anything, then the register for its start will need to be | |
4832 restored because it will have been set to wherever in the string we | |
4833 are when we last see its open-group operator. Similarly for a | |
4834 register's end. */ | |
4835 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ | |
446 | 4836 re_char **old_regstart, **old_regend; |
428 | 4837 #endif |
4838 | |
4839 /* The is_active field of reg_info helps us keep track of which (possibly | |
4840 nested) subexpressions we are currently in. The matched_something | |
4841 field of reg_info[reg_num] helps us tell whether or not we have | |
4842 matched any of the pattern so far this time through the reg_num-th | |
4843 subexpression. These two fields get reset each time through any | |
4844 loop their register is in. */ | |
4845 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ | |
4846 register_info_type *reg_info; | |
4847 #endif | |
4848 | |
4849 /* The following record the register info as found in the above | |
4850 variables when we find a match better than any we've seen before. | |
4851 This happens as we backtrack through the failure points, which in | |
4852 turn happens only if we have not yet matched the entire string. */ | |
647 | 4853 int best_regs_set = false; |
428 | 4854 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ |
446 | 4855 re_char **best_regstart, **best_regend; |
428 | 4856 #endif |
4857 | |
4858 /* Logically, this is `best_regend[0]'. But we don't want to have to | |
4859 allocate space for that if we're not allocating space for anything | |
4860 else (see below). Also, we never need info about register 0 for | |
4861 any of the other register vectors, and it seems rather a kludge to | |
4862 treat `best_regend' differently than the rest. So we keep track of | |
4863 the end of the best match so far in a separate variable. We | |
4864 initialize this to NULL so that when we backtrack the first time | |
4865 and need to test it, it's not garbage. */ | |
446 | 4866 re_char *match_end = NULL; |
428 | 4867 |
4868 /* This helps SET_REGS_MATCHED avoid doing redundant work. */ | |
4869 int set_regs_matched_done = 0; | |
4870 | |
4871 /* Used when we pop values we don't care about. */ | |
4872 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ | |
446 | 4873 re_char **reg_dummy; |
428 | 4874 register_info_type *reg_info_dummy; |
4875 #endif | |
4876 | |
4877 #ifdef DEBUG | |
4878 /* Counts the total number of registers pushed. */ | |
647 | 4879 int num_regs_pushed = 0; |
428 | 4880 #endif |
4881 | |
4882 /* 1 if this match ends in the same string (string1 or string2) | |
4883 as the best previous match. */ | |
460 | 4884 re_bool same_str_p; |
428 | 4885 |
4886 /* 1 if this match is the best seen so far. */ | |
460 | 4887 re_bool best_match_p; |
428 | 4888 |
826 | 4889 #ifdef emacs |
4890 Internal_Format fmt = buffer_or_other_internal_format (lispobj); | |
1346 | 4891 #ifdef REL_ALLOC |
4892 Ibyte *orig_buftext = | |
4893 BUFFERP (lispobj) ? | |
4894 BYTE_BUF_BYTE_ADDRESS (XBUFFER (lispobj), | |
4895 BYTE_BUF_BEGV (XBUFFER (lispobj))) : | |
4896 0; | |
4897 #endif | |
4898 | |
1333 | 4899 #ifdef ERROR_CHECK_MALLOC |
4900 int depth = bind_regex_malloc_disallowed (1); | |
4901 #endif | |
826 | 4902 #endif /* emacs */ |
771 | 4903 |
428 | 4904 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n"); |
4905 | |
1333 | 4906 BEGIN_REGEX_MALLOC_OK (); |
428 | 4907 INIT_FAIL_STACK (); |
1333 | 4908 END_REGEX_MALLOC_OK (); |
428 | 4909 |
4910 #ifdef MATCH_MAY_ALLOCATE | |
4911 /* Do not bother to initialize all the register variables if there are | |
4912 no groups in the pattern, as it takes a fair amount of time. If | |
4913 there are groups, we include space for register 0 (the whole | |
4914 pattern), even though we never use it, since it simplifies the | |
4915 array indexing. We should fix this. */ | |
502 | 4916 if (bufp->re_ngroups) |
428 | 4917 { |
1333 | 4918 BEGIN_REGEX_MALLOC_OK (); |
446 | 4919 regstart = REGEX_TALLOC (num_regs, re_char *); |
4920 regend = REGEX_TALLOC (num_regs, re_char *); | |
4921 old_regstart = REGEX_TALLOC (num_regs, re_char *); | |
4922 old_regend = REGEX_TALLOC (num_regs, re_char *); | |
4923 best_regstart = REGEX_TALLOC (num_regs, re_char *); | |
4924 best_regend = REGEX_TALLOC (num_regs, re_char *); | |
428 | 4925 reg_info = REGEX_TALLOC (num_regs, register_info_type); |
446 | 4926 reg_dummy = REGEX_TALLOC (num_regs, re_char *); |
428 | 4927 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type); |
1333 | 4928 END_REGEX_MALLOC_OK (); |
428 | 4929 |
4930 if (!(regstart && regend && old_regstart && old_regend && reg_info | |
4931 && best_regstart && best_regend && reg_dummy && reg_info_dummy)) | |
4932 { | |
4933 FREE_VARIABLES (); | |
4934 return -2; | |
4935 } | |
4936 } | |
4937 else | |
4938 { | |
4939 /* We must initialize all our variables to NULL, so that | |
4940 `FREE_VARIABLES' doesn't try to free them. */ | |
4941 regstart = regend = old_regstart = old_regend = best_regstart | |
4942 = best_regend = reg_dummy = NULL; | |
4943 reg_info = reg_info_dummy = (register_info_type *) NULL; | |
4944 } | |
4945 #endif /* MATCH_MAY_ALLOCATE */ | |
4946 | |
1333 | 4947 #if defined (emacs) && defined (REL_ALLOC) |
4948 { | |
4949 /* If the allocations above (or the call to setup_syntax_cache() in | |
4950 re_match_2) caused a rel-alloc relocation, then fix up the data | |
4951 pointers */ | |
1346 | 4952 Bytecount offset = offset_post_relocation (lispobj, orig_buftext); |
1333 | 4953 if (offset) |
4954 { | |
4955 string1 += offset; | |
4956 string2 += offset; | |
4957 } | |
4958 } | |
4959 #endif /* defined (emacs) && defined (REL_ALLOC) */ | |
4960 | |
428 | 4961 /* The starting position is bogus. */ |
4962 if (pos < 0 || pos > size1 + size2) | |
4963 { | |
4964 FREE_VARIABLES (); | |
4965 return -1; | |
4966 } | |
4967 | |
4968 /* Initialize subexpression text positions to -1 to mark ones that no | |
4969 start_memory/stop_memory has been seen for. Also initialize the | |
4970 register information struct. */ | |
4971 for (mcnt = 1; mcnt < num_regs; mcnt++) | |
4972 { | |
4973 regstart[mcnt] = regend[mcnt] | |
4974 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE; | |
4975 | |
4976 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE; | |
4977 IS_ACTIVE (reg_info[mcnt]) = 0; | |
4978 MATCHED_SOMETHING (reg_info[mcnt]) = 0; | |
4979 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0; | |
4980 } | |
4981 /* We move `string1' into `string2' if the latter's empty -- but not if | |
4982 `string1' is null. */ | |
4983 if (size2 == 0 && string1 != NULL) | |
4984 { | |
4985 string2 = string1; | |
4986 size2 = size1; | |
4987 string1 = 0; | |
4988 size1 = 0; | |
4989 } | |
4990 end1 = string1 + size1; | |
4991 end2 = string2 + size2; | |
4992 | |
4993 /* Compute where to stop matching, within the two strings. */ | |
4994 if (stop <= size1) | |
4995 { | |
4996 end_match_1 = string1 + stop; | |
4997 end_match_2 = string2; | |
4998 } | |
4999 else | |
5000 { | |
5001 end_match_1 = end1; | |
5002 end_match_2 = string2 + stop - size1; | |
5003 } | |
5004 | |
5005 /* `p' scans through the pattern as `d' scans through the data. | |
5006 `dend' is the end of the input string that `d' points within. `d' | |
5007 is advanced into the following input string whenever necessary, but | |
5008 this happens before fetching; therefore, at the beginning of the | |
5009 loop, `d' can be pointing at the end of a string, but it cannot | |
5010 equal `string2'. */ | |
5011 if (size1 > 0 && pos <= size1) | |
5012 { | |
5013 d = string1 + pos; | |
5014 dend = end_match_1; | |
5015 } | |
5016 else | |
5017 { | |
5018 d = string2 + pos - size1; | |
5019 dend = end_match_2; | |
5020 } | |
5021 | |
446 | 5022 DEBUG_PRINT1 ("The compiled pattern is: \n"); |
428 | 5023 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend); |
5024 DEBUG_PRINT1 ("The string to match is: `"); | |
5025 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2); | |
5026 DEBUG_PRINT1 ("'\n"); | |
5027 | |
5028 /* This loops over pattern commands. It exits by returning from the | |
5029 function if the match is complete, or it drops through if the match | |
5030 fails at this starting point in the input data. */ | |
5031 for (;;) | |
5032 { | |
5033 DEBUG_PRINT2 ("\n0x%lx: ", (long) p); | |
5034 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */ | |
5035 if (!no_quit_in_re_search) | |
1333 | 5036 { |
5037 BEGIN_REGEX_MALLOC_OK (); | |
5038 QUIT; | |
5039 END_REGEX_MALLOC_OK (); | |
1346 | 5040 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); |
1333 | 5041 } |
428 | 5042 #endif |
5043 | |
5044 if (p == pend) | |
5045 { /* End of pattern means we might have succeeded. */ | |
5046 DEBUG_PRINT1 ("end of pattern ... "); | |
5047 | |
5048 /* If we haven't matched the entire string, and we want the | |
5049 longest match, try backtracking. */ | |
5050 if (d != end_match_2) | |
5051 { | |
5052 same_str_p = (FIRST_STRING_P (match_end) | |
5053 == MATCHING_IN_FIRST_STRING); | |
5054 | |
5055 /* AIX compiler got confused when this was combined | |
5056 with the previous declaration. */ | |
5057 if (same_str_p) | |
5058 best_match_p = d > match_end; | |
5059 else | |
5060 best_match_p = !MATCHING_IN_FIRST_STRING; | |
5061 | |
5062 DEBUG_PRINT1 ("backtracking.\n"); | |
5063 | |
5064 if (!FAIL_STACK_EMPTY ()) | |
5065 { /* More failure points to try. */ | |
5066 | |
5067 /* If exceeds best match so far, save it. */ | |
5068 if (!best_regs_set || best_match_p) | |
5069 { | |
5070 best_regs_set = true; | |
5071 match_end = d; | |
5072 | |
5073 DEBUG_PRINT1 ("\nSAVING match as best so far.\n"); | |
5074 | |
5075 for (mcnt = 1; mcnt < num_regs; mcnt++) | |
5076 { | |
5077 best_regstart[mcnt] = regstart[mcnt]; | |
5078 best_regend[mcnt] = regend[mcnt]; | |
5079 } | |
5080 } | |
5081 goto fail; | |
5082 } | |
5083 | |
5084 /* If no failure points, don't restore garbage. And if | |
5085 last match is real best match, don't restore second | |
5086 best one. */ | |
5087 else if (best_regs_set && !best_match_p) | |
5088 { | |
5089 restore_best_regs: | |
5090 /* Restore best match. It may happen that `dend == | |
5091 end_match_1' while the restored d is in string2. | |
5092 For example, the pattern `x.*y.*z' against the | |
5093 strings `x-' and `y-z-', if the two strings are | |
5094 not consecutive in memory. */ | |
5095 DEBUG_PRINT1 ("Restoring best registers.\n"); | |
5096 | |
5097 d = match_end; | |
5098 dend = ((d >= string1 && d <= end1) | |
5099 ? end_match_1 : end_match_2); | |
5100 | |
5101 for (mcnt = 1; mcnt < num_regs; mcnt++) | |
5102 { | |
5103 regstart[mcnt] = best_regstart[mcnt]; | |
5104 regend[mcnt] = best_regend[mcnt]; | |
5105 } | |
5106 } | |
5107 } /* d != end_match_2 */ | |
5108 | |
5109 succeed_label: | |
5110 DEBUG_PRINT1 ("Accepting match.\n"); | |
5111 | |
5112 /* If caller wants register contents data back, do it. */ | |
1028 | 5113 { |
5114 int num_nonshy_regs = bufp->re_nsub + 1; | |
5115 if (regs && !bufp->no_sub) | |
5116 { | |
5117 /* Have the register data arrays been allocated? */ | |
5118 if (bufp->regs_allocated == REGS_UNALLOCATED) | |
5119 { /* No. So allocate them with malloc. We need one | |
5120 extra element beyond `num_nonshy_regs' for the `-1' marker | |
5121 GNU code uses. */ | |
5122 regs->num_regs = MAX (RE_NREGS, num_nonshy_regs + 1); | |
1333 | 5123 BEGIN_REGEX_MALLOC_OK (); |
1028 | 5124 regs->start = TALLOC (regs->num_regs, regoff_t); |
5125 regs->end = TALLOC (regs->num_regs, regoff_t); | |
1333 | 5126 END_REGEX_MALLOC_OK (); |
5127 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
1028 | 5128 if (regs->start == NULL || regs->end == NULL) |
5129 { | |
5130 FREE_VARIABLES (); | |
5131 return -2; | |
5132 } | |
5133 bufp->regs_allocated = REGS_REALLOCATE; | |
5134 } | |
5135 else if (bufp->regs_allocated == REGS_REALLOCATE) | |
5136 { /* Yes. If we need more elements than were already | |
5137 allocated, reallocate them. If we need fewer, just | |
5138 leave it alone. */ | |
5139 if (regs->num_regs < num_nonshy_regs + 1) | |
5140 { | |
5141 regs->num_regs = num_nonshy_regs + 1; | |
1333 | 5142 BEGIN_REGEX_MALLOC_OK (); |
1028 | 5143 RETALLOC (regs->start, regs->num_regs, regoff_t); |
5144 RETALLOC (regs->end, regs->num_regs, regoff_t); | |
1333 | 5145 END_REGEX_MALLOC_OK (); |
5146 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
1028 | 5147 if (regs->start == NULL || regs->end == NULL) |
5148 { | |
5149 FREE_VARIABLES (); | |
5150 return -2; | |
5151 } | |
5152 } | |
5153 } | |
5154 else | |
5155 { | |
5156 /* The braces fend off an "empty body in an else-statement" | |
5157 warning under GCC when assert expands to nothing. */ | |
5158 assert (bufp->regs_allocated == REGS_FIXED); | |
5159 } | |
5160 | |
5161 /* Convert the pointer data in `regstart' and `regend' to | |
5162 indices. Register zero has to be set differently, | |
5163 since we haven't kept track of any info for it. */ | |
5164 if (regs->num_regs > 0) | |
5165 { | |
5166 regs->start[0] = pos; | |
5167 regs->end[0] = (MATCHING_IN_FIRST_STRING | |
5168 ? ((regoff_t) (d - string1)) | |
5169 : ((regoff_t) (d - string2 + size1))); | |
5170 } | |
5171 | |
2639 | 5172 /* Map over the NUM_NONSHY_REGS non-shy internal registers. |
5173 Copy each into the corresponding external register. | |
5174 MCNT indexes external registers. */ | |
1028 | 5175 for (mcnt = 1; mcnt < MIN (num_nonshy_regs, regs->num_regs); |
5176 mcnt++) | |
5177 { | |
5178 int internal_reg = bufp->external_to_internal_register[mcnt]; | |
5179 if (REG_UNSET (regstart[internal_reg]) || | |
5180 REG_UNSET (regend[internal_reg])) | |
5181 regs->start[mcnt] = regs->end[mcnt] = -1; | |
5182 else | |
5183 { | |
5184 regs->start[mcnt] = | |
5185 (regoff_t) POINTER_TO_OFFSET (regstart[internal_reg]); | |
5186 regs->end[mcnt] = | |
5187 (regoff_t) POINTER_TO_OFFSET (regend[internal_reg]); | |
5188 } | |
5189 } | |
5190 } /* regs && !bufp->no_sub */ | |
5191 | |
5192 /* If we have regs and the regs structure has more elements than | |
2639 | 5193 were in the pattern, set the extra elements starting with |
5194 NUM_NONSHY_REGS to -1. If we (re)allocated the registers, | |
5195 this is the case, because we always allocate enough to have | |
5196 at least one -1 at the end. | |
1028 | 5197 |
5198 We do this even when no_sub is set because some applications | |
5199 (XEmacs) reuse register structures which may contain stale | |
5200 information, and permit attempts to access those registers. | |
5201 | |
5202 It would be possible to require the caller to do this, but we'd | |
5203 have to change the API for this function to reflect that, and | |
1425 | 5204 audit all callers. Note: as of 2003-04-17 callers in XEmacs |
5205 do clear the registers, but it's safer to leave this code in | |
5206 because of reallocation. | |
5207 */ | |
1028 | 5208 if (regs && regs->num_regs > 0) |
5209 for (mcnt = num_nonshy_regs; mcnt < regs->num_regs; mcnt++) | |
5210 regs->start[mcnt] = regs->end[mcnt] = -1; | |
5211 } | |
428 | 5212 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n", |
5213 nfailure_points_pushed, nfailure_points_popped, | |
5214 nfailure_points_pushed - nfailure_points_popped); | |
5215 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed); | |
5216 | |
5217 mcnt = d - pos - (MATCHING_IN_FIRST_STRING | |
5218 ? string1 | |
5219 : string2 - size1); | |
5220 | |
5221 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt); | |
5222 | |
5223 FREE_VARIABLES (); | |
5224 return mcnt; | |
5225 } | |
5226 | |
5227 /* Otherwise match next pattern command. */ | |
4759
aa5ed11f473b
Remove support for obsolete systems. See xemacs-patches message with ID
Jerry James <james@xemacs.org>
parents:
4750
diff
changeset
|
5228 switch ((re_opcode_t) *p++) |
428 | 5229 { |
5230 /* Ignore these. Used to ignore the n of succeed_n's which | |
5231 currently have n == 0. */ | |
5232 case no_op: | |
5233 DEBUG_PRINT1 ("EXECUTING no_op.\n"); | |
5234 break; | |
5235 | |
5236 case succeed: | |
5237 DEBUG_PRINT1 ("EXECUTING succeed.\n"); | |
5238 goto succeed_label; | |
5239 | |
826 | 5240 /* Match exactly a string of length n in the pattern. The |
5241 following byte in the pattern defines n, and the n bytes after | |
5242 that make up the string to match. (Under Mule, this will be in | |
5243 the default internal format.) */ | |
428 | 5244 case exactn: |
5245 mcnt = *p++; | |
5246 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt); | |
5247 | |
5248 /* This is written out as an if-else so we don't waste time | |
5249 testing `translate' inside the loop. */ | |
446 | 5250 if (TRANSLATE_P (translate)) |
428 | 5251 { |
5252 do | |
5253 { | |
446 | 5254 #ifdef MULE |
5255 Bytecount pat_len; | |
5256 | |
450 | 5257 REGEX_PREFETCH (); |
867 | 5258 if (RE_TRANSLATE_1 (itext_ichar_fmt (d, fmt, lispobj)) |
5259 != itext_ichar (p)) | |
428 | 5260 goto fail; |
446 | 5261 |
867 | 5262 pat_len = itext_ichar_len (p); |
446 | 5263 p += pat_len; |
867 | 5264 INC_IBYTEPTR_FMT (d, fmt); |
446 | 5265 |
5266 mcnt -= pat_len; | |
5267 #else /* not MULE */ | |
450 | 5268 REGEX_PREFETCH (); |
826 | 5269 if ((unsigned char) RE_TRANSLATE_1 (*d++) != *p++) |
446 | 5270 goto fail; |
5271 mcnt--; | |
5272 #endif | |
428 | 5273 } |
446 | 5274 while (mcnt > 0); |
428 | 5275 } |
5276 else | |
5277 { | |
826 | 5278 #ifdef MULE |
5279 /* If buffer format is default, then we can shortcut and just | |
5280 compare the text directly, byte by byte. Otherwise, we | |
5281 need to go character by character. */ | |
5282 if (fmt != FORMAT_DEFAULT) | |
428 | 5283 { |
826 | 5284 do |
5285 { | |
5286 Bytecount pat_len; | |
5287 | |
5288 REGEX_PREFETCH (); | |
867 | 5289 if (itext_ichar_fmt (d, fmt, lispobj) != |
5290 itext_ichar (p)) | |
826 | 5291 goto fail; |
5292 | |
867 | 5293 pat_len = itext_ichar_len (p); |
826 | 5294 p += pat_len; |
867 | 5295 INC_IBYTEPTR_FMT (d, fmt); |
826 | 5296 |
5297 mcnt -= pat_len; | |
5298 } | |
5299 while (mcnt > 0); | |
428 | 5300 } |
826 | 5301 else |
5302 #endif | |
5303 { | |
5304 do | |
5305 { | |
5306 REGEX_PREFETCH (); | |
5307 if (*d++ != *p++) goto fail; | |
5308 mcnt--; | |
5309 } | |
5310 while (mcnt > 0); | |
5311 } | |
428 | 5312 } |
5313 SET_REGS_MATCHED (); | |
5314 break; | |
5315 | |
5316 | |
5317 /* Match any character except possibly a newline or a null. */ | |
5318 case anychar: | |
5319 DEBUG_PRINT1 ("EXECUTING anychar.\n"); | |
5320 | |
450 | 5321 REGEX_PREFETCH (); |
428 | 5322 |
826 | 5323 if ((!(bufp->syntax & RE_DOT_NEWLINE) && |
867 | 5324 RE_TRANSLATE (itext_ichar_fmt (d, fmt, lispobj)) == '\n') |
826 | 5325 || (bufp->syntax & RE_DOT_NOT_NULL && |
867 | 5326 RE_TRANSLATE (itext_ichar_fmt (d, fmt, lispobj)) == |
826 | 5327 '\000')) |
428 | 5328 goto fail; |
5329 | |
5330 SET_REGS_MATCHED (); | |
5331 DEBUG_PRINT2 (" Matched `%d'.\n", *d); | |
867 | 5332 INC_IBYTEPTR_FMT (d, fmt); /* XEmacs change */ |
428 | 5333 break; |
5334 | |
5335 | |
5336 case charset: | |
5337 case charset_not: | |
5338 { | |
1414 | 5339 REGISTER Ichar c; |
460 | 5340 re_bool not_p = (re_opcode_t) *(p - 1) == charset_not; |
458 | 5341 |
5342 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not_p ? "_not" : ""); | |
428 | 5343 |
450 | 5344 REGEX_PREFETCH (); |
867 | 5345 c = itext_ichar_fmt (d, fmt, lispobj); |
826 | 5346 c = RE_TRANSLATE (c); /* The character to match. */ |
428 | 5347 |
647 | 5348 /* Cast to `unsigned int' instead of `unsigned char' in case the |
428 | 5349 bit list is a full 32 bytes long. */ |
1414 | 5350 if ((unsigned int)c < (unsigned int) (*p * BYTEWIDTH) |
428 | 5351 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) |
458 | 5352 not_p = !not_p; |
428 | 5353 |
5354 p += 1 + *p; | |
5355 | |
458 | 5356 if (!not_p) goto fail; |
428 | 5357 |
5358 SET_REGS_MATCHED (); | |
867 | 5359 INC_IBYTEPTR_FMT (d, fmt); /* XEmacs change */ |
428 | 5360 break; |
5361 } | |
5362 | |
5363 #ifdef MULE | |
5364 case charset_mule: | |
5365 case charset_mule_not: | |
5366 { | |
867 | 5367 REGISTER Ichar c; |
460 | 5368 re_bool not_p = (re_opcode_t) *(p - 1) == charset_mule_not; |
458 | 5369 |
5370 DEBUG_PRINT2 ("EXECUTING charset_mule%s.\n", not_p ? "_not" : ""); | |
428 | 5371 |
450 | 5372 REGEX_PREFETCH (); |
867 | 5373 c = itext_ichar_fmt (d, fmt, lispobj); |
826 | 5374 c = RE_TRANSLATE (c); /* The character to match. */ |
428 | 5375 |
5376 if (EQ (Qt, unified_range_table_lookup (p, c, Qnil))) | |
458 | 5377 not_p = !not_p; |
428 | 5378 |
5379 p += unified_range_table_bytes_used (p); | |
5380 | |
458 | 5381 if (!not_p) goto fail; |
428 | 5382 |
5383 SET_REGS_MATCHED (); | |
867 | 5384 INC_IBYTEPTR_FMT (d, fmt); |
428 | 5385 break; |
5386 } | |
5387 #endif /* MULE */ | |
5388 | |
5389 | |
5390 /* The beginning of a group is represented by start_memory. | |
5391 The arguments are the register number in the next byte, and the | |
5392 number of groups inner to this one in the next. The text | |
5393 matched within the group is recorded (in the internal | |
5394 registers data structure) under the register number. */ | |
5395 case start_memory: | |
5396 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]); | |
5397 | |
5398 /* Find out if this group can match the empty string. */ | |
5399 p1 = p; /* To send to group_match_null_string_p. */ | |
5400 | |
5401 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE) | |
2639 | 5402 REG_MATCH_NULL_STRING_P (reg_info[*p]) |
5403 = group_match_null_string_p (&p1, pend, reg_info); | |
5404 | |
5405 DEBUG_PRINT2 (" group CAN%s match null string\n", | |
5406 REG_MATCH_NULL_STRING_P (reg_info[*p]) ? "NOT" : ""); | |
428 | 5407 |
5408 /* Save the position in the string where we were the last time | |
5409 we were at this open-group operator in case the group is | |
5410 operated upon by a repetition operator, e.g., with `(a*)*b' | |
5411 against `ab'; then we want to ignore where we are now in | |
5412 the string in case this attempt to match fails. */ | |
5413 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) | |
5414 ? REG_UNSET (regstart[*p]) ? d : regstart[*p] | |
5415 : regstart[*p]; | |
5416 DEBUG_PRINT2 (" old_regstart: %d\n", | |
5417 POINTER_TO_OFFSET (old_regstart[*p])); | |
5418 | |
5419 regstart[*p] = d; | |
5420 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p])); | |
5421 | |
5422 IS_ACTIVE (reg_info[*p]) = 1; | |
5423 MATCHED_SOMETHING (reg_info[*p]) = 0; | |
5424 | |
5425 /* Clear this whenever we change the register activity status. */ | |
5426 set_regs_matched_done = 0; | |
5427 | |
5428 /* This is the new highest active register. */ | |
5429 highest_active_reg = *p; | |
5430 | |
5431 /* If nothing was active before, this is the new lowest active | |
5432 register. */ | |
5433 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) | |
5434 lowest_active_reg = *p; | |
5435 | |
5436 /* Move past the register number and inner group count. */ | |
5437 p += 2; | |
5438 just_past_start_mem = p; | |
5439 | |
5440 break; | |
5441 | |
5442 | |
5443 /* The stop_memory opcode represents the end of a group. Its | |
5444 arguments are the same as start_memory's: the register | |
5445 number, and the number of inner groups. */ | |
5446 case stop_memory: | |
5447 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]); | |
5448 | |
5449 /* We need to save the string position the last time we were at | |
5450 this close-group operator in case the group is operated | |
5451 upon by a repetition operator, e.g., with `((a*)*(b*)*)*' | |
5452 against `aba'; then we want to ignore where we are now in | |
5453 the string in case this attempt to match fails. */ | |
5454 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) | |
5455 ? REG_UNSET (regend[*p]) ? d : regend[*p] | |
5456 : regend[*p]; | |
5457 DEBUG_PRINT2 (" old_regend: %d\n", | |
5458 POINTER_TO_OFFSET (old_regend[*p])); | |
5459 | |
5460 regend[*p] = d; | |
5461 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p])); | |
5462 | |
5463 /* This register isn't active anymore. */ | |
5464 IS_ACTIVE (reg_info[*p]) = 0; | |
5465 | |
5466 /* Clear this whenever we change the register activity status. */ | |
5467 set_regs_matched_done = 0; | |
5468 | |
5469 /* If this was the only register active, nothing is active | |
5470 anymore. */ | |
5471 if (lowest_active_reg == highest_active_reg) | |
5472 { | |
5473 lowest_active_reg = NO_LOWEST_ACTIVE_REG; | |
5474 highest_active_reg = NO_HIGHEST_ACTIVE_REG; | |
5475 } | |
5476 else | |
5477 { /* We must scan for the new highest active register, since | |
5478 it isn't necessarily one less than now: consider | |
5479 (a(b)c(d(e)f)g). When group 3 ends, after the f), the | |
5480 new highest active register is 1. */ | |
5481 unsigned char r = *p - 1; | |
5482 while (r > 0 && !IS_ACTIVE (reg_info[r])) | |
5483 r--; | |
5484 | |
5485 /* If we end up at register zero, that means that we saved | |
5486 the registers as the result of an `on_failure_jump', not | |
5487 a `start_memory', and we jumped to past the innermost | |
5488 `stop_memory'. For example, in ((.)*) we save | |
5489 registers 1 and 2 as a result of the *, but when we pop | |
5490 back to the second ), we are at the stop_memory 1. | |
5491 Thus, nothing is active. */ | |
5492 if (r == 0) | |
5493 { | |
5494 lowest_active_reg = NO_LOWEST_ACTIVE_REG; | |
5495 highest_active_reg = NO_HIGHEST_ACTIVE_REG; | |
5496 } | |
5497 else | |
5498 { | |
5499 highest_active_reg = r; | |
5500 | |
5501 /* 98/9/21 jhod: We've also gotta set lowest_active_reg, don't we? */ | |
5502 r = 1; | |
5503 while (r < highest_active_reg && !IS_ACTIVE(reg_info[r])) | |
5504 r++; | |
5505 lowest_active_reg = r; | |
5506 } | |
5507 } | |
5508 | |
5509 /* If just failed to match something this time around with a | |
5510 group that's operated on by a repetition operator, try to | |
5511 force exit from the ``loop'', and restore the register | |
5512 information for this group that we had before trying this | |
5513 last match. */ | |
5514 if ((!MATCHED_SOMETHING (reg_info[*p]) | |
5515 || just_past_start_mem == p - 1) | |
5516 && (p + 2) < pend) | |
5517 { | |
460 | 5518 re_bool is_a_jump_n = false; |
428 | 5519 |
5520 p1 = p + 2; | |
5521 mcnt = 0; | |
5522 switch ((re_opcode_t) *p1++) | |
5523 { | |
5524 case jump_n: | |
5525 is_a_jump_n = true; | |
5526 case pop_failure_jump: | |
5527 case maybe_pop_jump: | |
5528 case jump: | |
5529 case dummy_failure_jump: | |
5530 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
5531 if (is_a_jump_n) | |
5532 p1 += 2; | |
5533 break; | |
5534 | |
5535 default: | |
5536 /* do nothing */ ; | |
5537 } | |
5538 p1 += mcnt; | |
5539 | |
5540 /* If the next operation is a jump backwards in the pattern | |
5541 to an on_failure_jump right before the start_memory | |
5542 corresponding to this stop_memory, exit from the loop | |
5543 by forcing a failure after pushing on the stack the | |
5544 on_failure_jump's jump in the pattern, and d. */ | |
5545 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump | |
5546 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p) | |
5547 { | |
5548 /* If this group ever matched anything, then restore | |
5549 what its registers were before trying this last | |
5550 failed match, e.g., with `(a*)*b' against `ab' for | |
5551 regstart[1], and, e.g., with `((a*)*(b*)*)*' | |
5552 against `aba' for regend[3]. | |
5553 | |
5554 Also restore the registers for inner groups for, | |
5555 e.g., `((a*)(b*))*' against `aba' (register 3 would | |
5556 otherwise get trashed). */ | |
5557 | |
5558 if (EVER_MATCHED_SOMETHING (reg_info[*p])) | |
5559 { | |
647 | 5560 int r; |
428 | 5561 |
5562 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0; | |
5563 | |
5564 /* Restore this and inner groups' (if any) registers. */ | |
5565 for (r = *p; r < *p + *(p + 1); r++) | |
5566 { | |
5567 regstart[r] = old_regstart[r]; | |
5568 | |
5569 /* xx why this test? */ | |
5570 if (old_regend[r] >= regstart[r]) | |
5571 regend[r] = old_regend[r]; | |
5572 } | |
5573 } | |
5574 p1++; | |
5575 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
5576 PUSH_FAILURE_POINT (p1 + mcnt, d, -2); | |
5577 | |
5578 goto fail; | |
5579 } | |
5580 } | |
5581 | |
5582 /* Move past the register number and the inner group count. */ | |
5583 p += 2; | |
5584 break; | |
5585 | |
5586 | |
5587 /* \<digit> has been turned into a `duplicate' command which is | |
502 | 5588 followed by the numeric value of <digit> as the register number. |
5589 (Already passed through external-to-internal-register mapping, | |
5590 so it refers to the actual group number, not the non-shy-only | |
5591 numbering used in the external world.) */ | |
428 | 5592 case duplicate: |
5593 { | |
446 | 5594 REGISTER re_char *d2, *dend2; |
502 | 5595 /* Get which register to match against. */ |
5596 int regno = *p++; | |
428 | 5597 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno); |
5598 | |
5599 /* Can't back reference a group which we've never matched. */ | |
5600 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno])) | |
5601 goto fail; | |
5602 | |
5603 /* Where in input to try to start matching. */ | |
5604 d2 = regstart[regno]; | |
5605 | |
5606 /* Where to stop matching; if both the place to start and | |
5607 the place to stop matching are in the same string, then | |
5608 set to the place to stop, otherwise, for now have to use | |
5609 the end of the first string. */ | |
5610 | |
5611 dend2 = ((FIRST_STRING_P (regstart[regno]) | |
5612 == FIRST_STRING_P (regend[regno])) | |
5613 ? regend[regno] : end_match_1); | |
5614 for (;;) | |
5615 { | |
5616 /* If necessary, advance to next segment in register | |
5617 contents. */ | |
5618 while (d2 == dend2) | |
5619 { | |
5620 if (dend2 == end_match_2) break; | |
5621 if (dend2 == regend[regno]) break; | |
5622 | |
5623 /* End of string1 => advance to string2. */ | |
5624 d2 = string2; | |
5625 dend2 = regend[regno]; | |
5626 } | |
5627 /* At end of register contents => success */ | |
5628 if (d2 == dend2) break; | |
5629 | |
5630 /* If necessary, advance to next segment in data. */ | |
450 | 5631 REGEX_PREFETCH (); |
428 | 5632 |
5633 /* How many characters left in this segment to match. */ | |
5634 mcnt = dend - d; | |
5635 | |
5636 /* Want how many consecutive characters we can match in | |
5637 one shot, so, if necessary, adjust the count. */ | |
5638 if (mcnt > dend2 - d2) | |
5639 mcnt = dend2 - d2; | |
5640 | |
5641 /* Compare that many; failure if mismatch, else move | |
5642 past them. */ | |
446 | 5643 if (TRANSLATE_P (translate) |
826 | 5644 ? bcmp_translate (d, d2, mcnt, translate |
5645 #ifdef emacs | |
5646 , fmt, lispobj | |
5647 #endif | |
5648 ) | |
428 | 5649 : memcmp (d, d2, mcnt)) |
5650 goto fail; | |
5651 d += mcnt, d2 += mcnt; | |
5652 | |
5653 /* Do this because we've matched some characters. */ | |
5654 SET_REGS_MATCHED (); | |
5655 } | |
5656 } | |
5657 break; | |
5658 | |
5659 | |
5660 /* begline matches the empty string at the beginning of the string | |
5661 (unless `not_bol' is set in `bufp'), and, if | |
5662 `newline_anchor' is set, after newlines. */ | |
5663 case begline: | |
5664 DEBUG_PRINT1 ("EXECUTING begline.\n"); | |
5665 | |
5666 if (AT_STRINGS_BEG (d)) | |
5667 { | |
5668 if (!bufp->not_bol) break; | |
5669 } | |
826 | 5670 else |
5671 { | |
5672 re_char *d2 = d; | |
867 | 5673 DEC_IBYTEPTR (d2); |
5674 if (itext_ichar_ascii_fmt (d2, fmt, lispobj) == '\n' && | |
826 | 5675 bufp->newline_anchor) |
5676 break; | |
5677 } | |
428 | 5678 /* In all other cases, we fail. */ |
5679 goto fail; | |
5680 | |
5681 | |
5682 /* endline is the dual of begline. */ | |
5683 case endline: | |
5684 DEBUG_PRINT1 ("EXECUTING endline.\n"); | |
5685 | |
5686 if (AT_STRINGS_END (d)) | |
5687 { | |
5688 if (!bufp->not_eol) break; | |
5689 } | |
5690 | |
5691 /* We have to ``prefetch'' the next character. */ | |
826 | 5692 else if ((d == end1 ? |
867 | 5693 itext_ichar_ascii_fmt (string2, fmt, lispobj) : |
5694 itext_ichar_ascii_fmt (d, fmt, lispobj)) == '\n' | |
428 | 5695 && bufp->newline_anchor) |
5696 { | |
5697 break; | |
5698 } | |
5699 goto fail; | |
5700 | |
5701 | |
5702 /* Match at the very beginning of the data. */ | |
5703 case begbuf: | |
5704 DEBUG_PRINT1 ("EXECUTING begbuf.\n"); | |
5705 if (AT_STRINGS_BEG (d)) | |
5706 break; | |
5707 goto fail; | |
5708 | |
5709 | |
5710 /* Match at the very end of the data. */ | |
5711 case endbuf: | |
5712 DEBUG_PRINT1 ("EXECUTING endbuf.\n"); | |
5713 if (AT_STRINGS_END (d)) | |
5714 break; | |
5715 goto fail; | |
5716 | |
5717 | |
5718 /* on_failure_keep_string_jump is used to optimize `.*\n'. It | |
5719 pushes NULL as the value for the string on the stack. Then | |
5720 `pop_failure_point' will keep the current value for the | |
5721 string, instead of restoring it. To see why, consider | |
5722 matching `foo\nbar' against `.*\n'. The .* matches the foo; | |
5723 then the . fails against the \n. But the next thing we want | |
5724 to do is match the \n against the \n; if we restored the | |
5725 string value, we would be back at the foo. | |
5726 | |
5727 Because this is used only in specific cases, we don't need to | |
5728 check all the things that `on_failure_jump' does, to make | |
5729 sure the right things get saved on the stack. Hence we don't | |
5730 share its code. The only reason to push anything on the | |
5731 stack at all is that otherwise we would have to change | |
5732 `anychar's code to do something besides goto fail in this | |
5733 case; that seems worse than this. */ | |
5734 case on_failure_keep_string_jump: | |
5735 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump"); | |
5736 | |
5737 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
5738 DEBUG_PRINT3 (" %d (to 0x%lx):\n", mcnt, (long) (p + mcnt)); | |
5739 | |
446 | 5740 PUSH_FAILURE_POINT (p + mcnt, (unsigned char *) 0, -2); |
428 | 5741 break; |
5742 | |
5743 | |
5744 /* Uses of on_failure_jump: | |
5745 | |
5746 Each alternative starts with an on_failure_jump that points | |
5747 to the beginning of the next alternative. Each alternative | |
5748 except the last ends with a jump that in effect jumps past | |
5749 the rest of the alternatives. (They really jump to the | |
5750 ending jump of the following alternative, because tensioning | |
5751 these jumps is a hassle.) | |
5752 | |
5753 Repeats start with an on_failure_jump that points past both | |
5754 the repetition text and either the following jump or | |
5755 pop_failure_jump back to this on_failure_jump. */ | |
5756 case on_failure_jump: | |
5757 on_failure: | |
5758 DEBUG_PRINT1 ("EXECUTING on_failure_jump"); | |
5759 | |
5760 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
5761 DEBUG_PRINT3 (" %d (to 0x%lx)", mcnt, (long) (p + mcnt)); | |
5762 | |
5763 /* If this on_failure_jump comes right before a group (i.e., | |
5764 the original * applied to a group), save the information | |
5765 for that group and all inner ones, so that if we fail back | |
5766 to this point, the group's information will be correct. | |
5767 For example, in \(a*\)*\1, we need the preceding group, | |
5768 and in \(\(a*\)b*\)\2, we need the inner group. */ | |
5769 | |
5770 /* We can't use `p' to check ahead because we push | |
5771 a failure point to `p + mcnt' after we do this. */ | |
5772 p1 = p; | |
5773 | |
5774 /* We need to skip no_op's before we look for the | |
5775 start_memory in case this on_failure_jump is happening as | |
5776 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1 | |
5777 against aba. */ | |
5778 while (p1 < pend && (re_opcode_t) *p1 == no_op) | |
5779 p1++; | |
5780 | |
5781 if (p1 < pend && (re_opcode_t) *p1 == start_memory) | |
5782 { | |
5783 /* We have a new highest active register now. This will | |
5784 get reset at the start_memory we are about to get to, | |
5785 but we will have saved all the registers relevant to | |
5786 this repetition op, as described above. */ | |
5787 highest_active_reg = *(p1 + 1) + *(p1 + 2); | |
5788 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) | |
5789 lowest_active_reg = *(p1 + 1); | |
5790 } | |
5791 | |
5792 DEBUG_PRINT1 (":\n"); | |
5793 PUSH_FAILURE_POINT (p + mcnt, d, -2); | |
5794 break; | |
5795 | |
5796 | |
5797 /* A smart repeat ends with `maybe_pop_jump'. | |
5798 We change it to either `pop_failure_jump' or `jump'. */ | |
5799 case maybe_pop_jump: | |
5800 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
5801 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); | |
5802 { | |
5803 REGISTER unsigned char *p2 = p; | |
5804 | |
5805 /* Compare the beginning of the repeat with what in the | |
5806 pattern follows its end. If we can establish that there | |
5807 is nothing that they would both match, i.e., that we | |
5808 would have to backtrack because of (as in, e.g., `a*a') | |
5809 then we can change to pop_failure_jump, because we'll | |
5810 never have to backtrack. | |
5811 | |
5812 This is not true in the case of alternatives: in | |
5813 `(a|ab)*' we do need to backtrack to the `ab' alternative | |
5814 (e.g., if the string was `ab'). But instead of trying to | |
5815 detect that here, the alternative has put on a dummy | |
5816 failure point which is what we will end up popping. */ | |
5817 | |
5818 /* Skip over open/close-group commands. | |
5819 If what follows this loop is a ...+ construct, | |
5820 look at what begins its body, since we will have to | |
5821 match at least one of that. */ | |
5822 while (1) | |
5823 { | |
5824 if (p2 + 2 < pend | |
5825 && ((re_opcode_t) *p2 == stop_memory | |
5826 || (re_opcode_t) *p2 == start_memory)) | |
5827 p2 += 3; | |
5828 else if (p2 + 6 < pend | |
5829 && (re_opcode_t) *p2 == dummy_failure_jump) | |
5830 p2 += 6; | |
5831 else | |
5832 break; | |
5833 } | |
5834 | |
5835 p1 = p + mcnt; | |
5836 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding | |
5837 to the `maybe_pop_jump' of this case. Examine what | |
5838 follows. */ | |
5839 | |
5840 /* If we're at the end of the pattern, we can change. */ | |
5841 if (p2 == pend) | |
5842 { | |
5843 /* Consider what happens when matching ":\(.*\)" | |
5844 against ":/". I don't really understand this code | |
5845 yet. */ | |
5846 p[-3] = (unsigned char) pop_failure_jump; | |
5847 DEBUG_PRINT1 | |
5848 (" End of pattern: change to `pop_failure_jump'.\n"); | |
5849 } | |
5850 | |
5851 else if ((re_opcode_t) *p2 == exactn | |
5852 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) | |
5853 { | |
5854 REGISTER unsigned char c | |
5855 = *p2 == (unsigned char) endline ? '\n' : p2[2]; | |
5856 | |
5857 if ((re_opcode_t) p1[3] == exactn && p1[5] != c) | |
5858 { | |
5859 p[-3] = (unsigned char) pop_failure_jump; | |
5860 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", | |
5861 c, p1[5]); | |
5862 } | |
5863 | |
5864 else if ((re_opcode_t) p1[3] == charset | |
5865 || (re_opcode_t) p1[3] == charset_not) | |
5866 { | |
458 | 5867 int not_p = (re_opcode_t) p1[3] == charset_not; |
428 | 5868 |
5869 if (c < (unsigned char) (p1[4] * BYTEWIDTH) | |
5870 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) | |
458 | 5871 not_p = !not_p; |
5872 | |
5873 /* `not_p' is equal to 1 if c would match, which means | |
428 | 5874 that we can't change to pop_failure_jump. */ |
458 | 5875 if (!not_p) |
428 | 5876 { |
5877 p[-3] = (unsigned char) pop_failure_jump; | |
5878 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); | |
5879 } | |
5880 } | |
5881 } | |
5882 else if ((re_opcode_t) *p2 == charset) | |
5883 { | |
5884 #ifdef DEBUG | |
5885 REGISTER unsigned char c | |
5886 = *p2 == (unsigned char) endline ? '\n' : p2[2]; | |
5887 #endif | |
5888 | |
5889 if ((re_opcode_t) p1[3] == exactn | |
5890 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5] | |
5891 && (p2[2 + p1[5] / BYTEWIDTH] | |
5892 & (1 << (p1[5] % BYTEWIDTH))))) | |
5893 { | |
5894 p[-3] = (unsigned char) pop_failure_jump; | |
5895 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", | |
5896 c, p1[5]); | |
5897 } | |
5898 | |
5899 else if ((re_opcode_t) p1[3] == charset_not) | |
5900 { | |
5901 int idx; | |
5902 /* We win if the charset_not inside the loop | |
5903 lists every character listed in the charset after. */ | |
5904 for (idx = 0; idx < (int) p2[1]; idx++) | |
5905 if (! (p2[2 + idx] == 0 | |
5906 || (idx < (int) p1[4] | |
5907 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0)))) | |
5908 break; | |
5909 | |
5910 if (idx == p2[1]) | |
5911 { | |
5912 p[-3] = (unsigned char) pop_failure_jump; | |
5913 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); | |
5914 } | |
5915 } | |
5916 else if ((re_opcode_t) p1[3] == charset) | |
5917 { | |
5918 int idx; | |
5919 /* We win if the charset inside the loop | |
5920 has no overlap with the one after the loop. */ | |
5921 for (idx = 0; | |
5922 idx < (int) p2[1] && idx < (int) p1[4]; | |
5923 idx++) | |
5924 if ((p2[2 + idx] & p1[5 + idx]) != 0) | |
5925 break; | |
5926 | |
5927 if (idx == p2[1] || idx == p1[4]) | |
5928 { | |
5929 p[-3] = (unsigned char) pop_failure_jump; | |
5930 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); | |
5931 } | |
5932 } | |
5933 } | |
5934 } | |
5935 p -= 2; /* Point at relative address again. */ | |
5936 if ((re_opcode_t) p[-1] != pop_failure_jump) | |
5937 { | |
5938 p[-1] = (unsigned char) jump; | |
5939 DEBUG_PRINT1 (" Match => jump.\n"); | |
5940 goto unconditional_jump; | |
5941 } | |
5942 /* Note fall through. */ | |
5943 | |
5944 | |
5945 /* The end of a simple repeat has a pop_failure_jump back to | |
5946 its matching on_failure_jump, where the latter will push a | |
5947 failure point. The pop_failure_jump takes off failure | |
5948 points put on by this pop_failure_jump's matching | |
5949 on_failure_jump; we got through the pattern to here from the | |
5950 matching on_failure_jump, so didn't fail. */ | |
5951 case pop_failure_jump: | |
5952 { | |
5953 /* We need to pass separate storage for the lowest and | |
5954 highest registers, even though we don't care about the | |
5955 actual values. Otherwise, we will restore only one | |
5956 register from the stack, since lowest will == highest in | |
5957 `pop_failure_point'. */ | |
647 | 5958 int dummy_low_reg, dummy_high_reg; |
428 | 5959 unsigned char *pdummy; |
446 | 5960 re_char *sdummy = NULL; |
428 | 5961 |
5962 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n"); | |
5963 POP_FAILURE_POINT (sdummy, pdummy, | |
5964 dummy_low_reg, dummy_high_reg, | |
5965 reg_dummy, reg_dummy, reg_info_dummy); | |
5966 } | |
5967 /* Note fall through. */ | |
5968 | |
5969 | |
5970 /* Unconditionally jump (without popping any failure points). */ | |
5971 case jump: | |
5972 unconditional_jump: | |
5973 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */ | |
5974 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt); | |
5975 p += mcnt; /* Do the jump. */ | |
5976 DEBUG_PRINT2 ("(to 0x%lx).\n", (long) p); | |
5977 break; | |
5978 | |
5979 | |
5980 /* We need this opcode so we can detect where alternatives end | |
5981 in `group_match_null_string_p' et al. */ | |
5982 case jump_past_alt: | |
5983 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n"); | |
5984 goto unconditional_jump; | |
5985 | |
5986 | |
5987 /* Normally, the on_failure_jump pushes a failure point, which | |
5988 then gets popped at pop_failure_jump. We will end up at | |
5989 pop_failure_jump, also, and with a pattern of, say, `a+', we | |
5990 are skipping over the on_failure_jump, so we have to push | |
5991 something meaningless for pop_failure_jump to pop. */ | |
5992 case dummy_failure_jump: | |
5993 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n"); | |
5994 /* It doesn't matter what we push for the string here. What | |
5995 the code at `fail' tests is the value for the pattern. */ | |
446 | 5996 PUSH_FAILURE_POINT ((unsigned char *) 0, (unsigned char *) 0, -2); |
428 | 5997 goto unconditional_jump; |
5998 | |
5999 | |
6000 /* At the end of an alternative, we need to push a dummy failure | |
6001 point in case we are followed by a `pop_failure_jump', because | |
6002 we don't want the failure point for the alternative to be | |
6003 popped. For example, matching `(a|ab)*' against `aab' | |
6004 requires that we match the `ab' alternative. */ | |
6005 case push_dummy_failure: | |
6006 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n"); | |
6007 /* See comments just above at `dummy_failure_jump' about the | |
6008 two zeroes. */ | |
446 | 6009 PUSH_FAILURE_POINT ((unsigned char *) 0, (unsigned char *) 0, -2); |
428 | 6010 break; |
6011 | |
6012 /* Have to succeed matching what follows at least n times. | |
6013 After that, handle like `on_failure_jump'. */ | |
6014 case succeed_n: | |
6015 EXTRACT_NUMBER (mcnt, p + 2); | |
6016 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt); | |
6017 | |
6018 assert (mcnt >= 0); | |
6019 /* Originally, this is how many times we HAVE to succeed. */ | |
6020 if (mcnt > 0) | |
6021 { | |
6022 mcnt--; | |
6023 p += 2; | |
6024 STORE_NUMBER_AND_INCR (p, mcnt); | |
6025 DEBUG_PRINT3 (" Setting 0x%lx to %d.\n", (long) p, mcnt); | |
6026 } | |
6027 else if (mcnt == 0) | |
6028 { | |
6029 DEBUG_PRINT2 (" Setting two bytes from 0x%lx to no_op.\n", | |
6030 (long) (p+2)); | |
6031 p[2] = (unsigned char) no_op; | |
6032 p[3] = (unsigned char) no_op; | |
6033 goto on_failure; | |
6034 } | |
6035 break; | |
6036 | |
6037 case jump_n: | |
6038 EXTRACT_NUMBER (mcnt, p + 2); | |
6039 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt); | |
6040 | |
6041 /* Originally, this is how many times we CAN jump. */ | |
6042 if (mcnt) | |
6043 { | |
6044 mcnt--; | |
6045 STORE_NUMBER (p + 2, mcnt); | |
6046 goto unconditional_jump; | |
6047 } | |
6048 /* If we don't have to jump any more, skip over the rest of command. */ | |
6049 else | |
6050 p += 4; | |
6051 break; | |
6052 | |
6053 case set_number_at: | |
6054 { | |
6055 DEBUG_PRINT1 ("EXECUTING set_number_at.\n"); | |
6056 | |
6057 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
6058 p1 = p + mcnt; | |
6059 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
6060 DEBUG_PRINT3 (" Setting 0x%lx to %d.\n", (long) p1, mcnt); | |
6061 STORE_NUMBER (p1, mcnt); | |
6062 break; | |
6063 } | |
6064 | |
6065 case wordbound: | |
6066 DEBUG_PRINT1 ("EXECUTING wordbound.\n"); | |
6067 should_succeed = 1; | |
6068 matchwordbound: | |
6069 { | |
6070 /* XEmacs change */ | |
1377 | 6071 /* Straightforward and (I hope) correct implementation. |
6072 Probably should be optimized by arranging to compute | |
1497 | 6073 charpos only once. */ |
1377 | 6074 /* emch1 is the character before d, syn1 is the syntax of |
6075 emch1, emch2 is the character at d, and syn2 is the | |
6076 syntax of emch2. */ | |
6077 Ichar emch1, emch2; | |
1468 | 6078 int syn1 = 0, |
6079 syn2 = 0; | |
1377 | 6080 re_char *d_before, *d_after; |
6081 int result, | |
6082 at_beg = AT_STRINGS_BEG (d), | |
6083 at_end = AT_STRINGS_END (d); | |
6084 #ifdef emacs | |
1497 | 6085 Charxpos charpos; |
1377 | 6086 #endif |
6087 | |
6088 if (at_beg && at_end) | |
6089 { | |
6090 result = 0; | |
6091 } | |
428 | 6092 else |
6093 { | |
1377 | 6094 if (!at_beg) |
6095 { | |
6096 d_before = POS_BEFORE_GAP_UNSAFE (d); | |
6097 DEC_IBYTEPTR_FMT (d_before, fmt); | |
6098 emch1 = itext_ichar_fmt (d_before, fmt, lispobj); | |
460 | 6099 #ifdef emacs |
1497 | 6100 charpos = offset_to_charxpos (lispobj, |
6101 PTR_TO_OFFSET (d)) - 1; | |
1377 | 6102 BEGIN_REGEX_MALLOC_OK (); |
1497 | 6103 UPDATE_SYNTAX_CACHE (scache, charpos); |
460 | 6104 #endif |
1377 | 6105 syn1 = SYNTAX_FROM_CACHE (scache, emch1); |
6106 END_REGEX_MALLOC_OK (); | |
6107 } | |
6108 if (!at_end) | |
6109 { | |
6110 d_after = POS_AFTER_GAP_UNSAFE (d); | |
6111 emch2 = itext_ichar_fmt (d_after, fmt, lispobj); | |
460 | 6112 #ifdef emacs |
1497 | 6113 charpos = offset_to_charxpos (lispobj, PTR_TO_OFFSET (d)); |
1377 | 6114 BEGIN_REGEX_MALLOC_OK (); |
1497 | 6115 UPDATE_SYNTAX_CACHE_FORWARD (scache, charpos); |
460 | 6116 #endif |
1377 | 6117 syn2 = SYNTAX_FROM_CACHE (scache, emch2); |
6118 END_REGEX_MALLOC_OK (); | |
6119 } | |
1333 | 6120 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); |
1377 | 6121 |
6122 if (at_beg) | |
6123 result = (syn2 == Sword); | |
6124 else if (at_end) | |
6125 result = (syn1 == Sword); | |
6126 else | |
6127 result = ((syn1 == Sword) != (syn2 == Sword)); | |
428 | 6128 } |
1377 | 6129 |
428 | 6130 if (result == should_succeed) |
6131 break; | |
6132 goto fail; | |
6133 } | |
6134 | |
6135 case notwordbound: | |
6136 DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); | |
6137 should_succeed = 0; | |
6138 goto matchwordbound; | |
6139 | |
6140 case wordbeg: | |
6141 DEBUG_PRINT1 ("EXECUTING wordbeg.\n"); | |
460 | 6142 if (AT_STRINGS_END (d)) |
6143 goto fail; | |
428 | 6144 { |
6145 /* XEmacs: this originally read: | |
6146 | |
6147 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1))) | |
6148 break; | |
6149 | |
6150 */ | |
460 | 6151 re_char *dtmp = POS_AFTER_GAP_UNSAFE (d); |
867 | 6152 Ichar emch = itext_ichar_fmt (dtmp, fmt, lispobj); |
1333 | 6153 int tempres; |
1347 | 6154 #ifdef emacs |
6155 Charxpos charpos = offset_to_charxpos (lispobj, PTR_TO_OFFSET (d)); | |
6156 #endif | |
1333 | 6157 BEGIN_REGEX_MALLOC_OK (); |
460 | 6158 #ifdef emacs |
826 | 6159 UPDATE_SYNTAX_CACHE (scache, charpos); |
460 | 6160 #endif |
1333 | 6161 tempres = (SYNTAX_FROM_CACHE (scache, emch) != Sword); |
6162 END_REGEX_MALLOC_OK (); | |
6163 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
6164 if (tempres) | |
428 | 6165 goto fail; |
6166 if (AT_STRINGS_BEG (d)) | |
6167 break; | |
460 | 6168 dtmp = POS_BEFORE_GAP_UNSAFE (d); |
867 | 6169 DEC_IBYTEPTR_FMT (dtmp, fmt); |
6170 emch = itext_ichar_fmt (dtmp, fmt, lispobj); | |
1333 | 6171 BEGIN_REGEX_MALLOC_OK (); |
460 | 6172 #ifdef emacs |
826 | 6173 UPDATE_SYNTAX_CACHE_BACKWARD (scache, charpos - 1); |
460 | 6174 #endif |
1333 | 6175 tempres = (SYNTAX_FROM_CACHE (scache, emch) != Sword); |
6176 END_REGEX_MALLOC_OK (); | |
6177 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
6178 if (tempres) | |
428 | 6179 break; |
6180 goto fail; | |
6181 } | |
6182 | |
6183 case wordend: | |
6184 DEBUG_PRINT1 ("EXECUTING wordend.\n"); | |
460 | 6185 if (AT_STRINGS_BEG (d)) |
6186 goto fail; | |
428 | 6187 { |
6188 /* XEmacs: this originally read: | |
6189 | |
6190 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1) | |
6191 && (!WORDCHAR_P (d) || AT_STRINGS_END (d))) | |
6192 break; | |
6193 | |
6194 The or condition is incorrect (reversed). | |
6195 */ | |
460 | 6196 re_char *dtmp; |
867 | 6197 Ichar emch; |
1333 | 6198 int tempres; |
460 | 6199 #ifdef emacs |
826 | 6200 Charxpos charpos = offset_to_charxpos (lispobj, PTR_TO_OFFSET (d)); |
1347 | 6201 BEGIN_REGEX_MALLOC_OK (); |
826 | 6202 UPDATE_SYNTAX_CACHE (scache, charpos); |
1333 | 6203 END_REGEX_MALLOC_OK (); |
6204 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
1347 | 6205 #endif |
460 | 6206 dtmp = POS_BEFORE_GAP_UNSAFE (d); |
867 | 6207 DEC_IBYTEPTR_FMT (dtmp, fmt); |
6208 emch = itext_ichar_fmt (dtmp, fmt, lispobj); | |
1333 | 6209 BEGIN_REGEX_MALLOC_OK (); |
6210 tempres = (SYNTAX_FROM_CACHE (scache, emch) != Sword); | |
6211 END_REGEX_MALLOC_OK (); | |
6212 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
6213 if (tempres) | |
428 | 6214 goto fail; |
6215 if (AT_STRINGS_END (d)) | |
6216 break; | |
460 | 6217 dtmp = POS_AFTER_GAP_UNSAFE (d); |
867 | 6218 emch = itext_ichar_fmt (dtmp, fmt, lispobj); |
1333 | 6219 BEGIN_REGEX_MALLOC_OK (); |
460 | 6220 #ifdef emacs |
826 | 6221 UPDATE_SYNTAX_CACHE_FORWARD (scache, charpos + 1); |
460 | 6222 #endif |
1333 | 6223 tempres = (SYNTAX_FROM_CACHE (scache, emch) != Sword); |
6224 END_REGEX_MALLOC_OK (); | |
6225 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
6226 if (tempres) | |
428 | 6227 break; |
6228 goto fail; | |
6229 } | |
6230 | |
6231 #ifdef emacs | |
6232 case before_dot: | |
6233 DEBUG_PRINT1 ("EXECUTING before_dot.\n"); | |
826 | 6234 if (!BUFFERP (lispobj) |
6235 || (BUF_PTR_BYTE_POS (XBUFFER (lispobj), (unsigned char *) d) | |
6236 >= BUF_PT (XBUFFER (lispobj)))) | |
428 | 6237 goto fail; |
6238 break; | |
6239 | |
6240 case at_dot: | |
6241 DEBUG_PRINT1 ("EXECUTING at_dot.\n"); | |
826 | 6242 if (!BUFFERP (lispobj) |
6243 || (BUF_PTR_BYTE_POS (XBUFFER (lispobj), (unsigned char *) d) | |
6244 != BUF_PT (XBUFFER (lispobj)))) | |
428 | 6245 goto fail; |
6246 break; | |
6247 | |
6248 case after_dot: | |
6249 DEBUG_PRINT1 ("EXECUTING after_dot.\n"); | |
826 | 6250 if (!BUFFERP (lispobj) |
6251 || (BUF_PTR_BYTE_POS (XBUFFER (lispobj), (unsigned char *) d) | |
6252 <= BUF_PT (XBUFFER (lispobj)))) | |
428 | 6253 goto fail; |
6254 break; | |
6255 | |
6256 case syntaxspec: | |
6257 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt); | |
6258 mcnt = *p++; | |
6259 goto matchsyntax; | |
6260 | |
6261 case wordchar: | |
6262 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n"); | |
6263 mcnt = (int) Sword; | |
6264 matchsyntax: | |
6265 should_succeed = 1; | |
6266 matchornotsyntax: | |
6267 { | |
6268 int matches; | |
867 | 6269 Ichar emch; |
428 | 6270 |
450 | 6271 REGEX_PREFETCH (); |
1333 | 6272 BEGIN_REGEX_MALLOC_OK (); |
826 | 6273 UPDATE_SYNTAX_CACHE |
6274 (scache, offset_to_charxpos (lispobj, PTR_TO_OFFSET (d))); | |
1333 | 6275 END_REGEX_MALLOC_OK (); |
6276 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
826 | 6277 |
867 | 6278 emch = itext_ichar_fmt (d, fmt, lispobj); |
1333 | 6279 BEGIN_REGEX_MALLOC_OK (); |
826 | 6280 matches = (SYNTAX_FROM_CACHE (scache, emch) == |
6281 (enum syntaxcode) mcnt); | |
1333 | 6282 END_REGEX_MALLOC_OK (); |
6283 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
867 | 6284 INC_IBYTEPTR_FMT (d, fmt); |
428 | 6285 if (matches != should_succeed) |
6286 goto fail; | |
6287 SET_REGS_MATCHED (); | |
6288 } | |
6289 break; | |
6290 | |
6291 case notsyntaxspec: | |
6292 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt); | |
6293 mcnt = *p++; | |
6294 goto matchnotsyntax; | |
6295 | |
6296 case notwordchar: | |
6297 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n"); | |
6298 mcnt = (int) Sword; | |
6299 matchnotsyntax: | |
6300 should_succeed = 0; | |
6301 goto matchornotsyntax; | |
6302 | |
6303 #ifdef MULE | |
6304 /* 97/2/17 jhod Mule category code patch */ | |
6305 case categoryspec: | |
6306 should_succeed = 1; | |
6307 matchornotcategory: | |
6308 { | |
867 | 6309 Ichar emch; |
428 | 6310 |
6311 mcnt = *p++; | |
450 | 6312 REGEX_PREFETCH (); |
867 | 6313 emch = itext_ichar_fmt (d, fmt, lispobj); |
6314 INC_IBYTEPTR_FMT (d, fmt); | |
826 | 6315 if (check_category_char (emch, BUFFER_CATEGORY_TABLE (lispbuf), |
6316 mcnt, should_succeed)) | |
428 | 6317 goto fail; |
6318 SET_REGS_MATCHED (); | |
6319 } | |
6320 break; | |
6321 | |
6322 case notcategoryspec: | |
6323 should_succeed = 0; | |
6324 goto matchornotcategory; | |
6325 /* end of category patch */ | |
6326 #endif /* MULE */ | |
6327 #else /* not emacs */ | |
6328 case wordchar: | |
6329 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n"); | |
450 | 6330 REGEX_PREFETCH (); |
826 | 6331 if (!WORDCHAR_P ((int) (*d))) |
428 | 6332 goto fail; |
6333 SET_REGS_MATCHED (); | |
6334 d++; | |
6335 break; | |
6336 | |
6337 case notwordchar: | |
6338 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n"); | |
450 | 6339 REGEX_PREFETCH (); |
826 | 6340 if (!WORDCHAR_P ((int) (*d))) |
428 | 6341 goto fail; |
6342 SET_REGS_MATCHED (); | |
6343 d++; | |
6344 break; | |
446 | 6345 #endif /* emacs */ |
428 | 6346 |
6347 default: | |
2500 | 6348 ABORT (); |
428 | 6349 } |
6350 continue; /* Successfully executed one pattern command; keep going. */ | |
6351 | |
6352 | |
6353 /* We goto here if a matching operation fails. */ | |
6354 fail: | |
6355 if (!FAIL_STACK_EMPTY ()) | |
6356 { /* A restart point is known. Restore to that state. */ | |
6357 DEBUG_PRINT1 ("\nFAIL:\n"); | |
6358 POP_FAILURE_POINT (d, p, | |
6359 lowest_active_reg, highest_active_reg, | |
6360 regstart, regend, reg_info); | |
6361 | |
6362 /* If this failure point is a dummy, try the next one. */ | |
6363 if (!p) | |
6364 goto fail; | |
6365 | |
6366 /* If we failed to the end of the pattern, don't examine *p. */ | |
6367 assert (p <= pend); | |
6368 if (p < pend) | |
6369 { | |
460 | 6370 re_bool is_a_jump_n = false; |
428 | 6371 |
6372 /* If failed to a backwards jump that's part of a repetition | |
6373 loop, need to pop this failure point and use the next one. */ | |
6374 switch ((re_opcode_t) *p) | |
6375 { | |
6376 case jump_n: | |
6377 is_a_jump_n = true; | |
6378 case maybe_pop_jump: | |
6379 case pop_failure_jump: | |
6380 case jump: | |
6381 p1 = p + 1; | |
6382 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
6383 p1 += mcnt; | |
6384 | |
6385 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n) | |
6386 || (!is_a_jump_n | |
6387 && (re_opcode_t) *p1 == on_failure_jump)) | |
6388 goto fail; | |
6389 break; | |
6390 default: | |
6391 /* do nothing */ ; | |
6392 } | |
6393 } | |
6394 | |
6395 if (d >= string1 && d <= end1) | |
6396 dend = end_match_1; | |
6397 } | |
6398 else | |
6399 break; /* Matching at this starting point really fails. */ | |
6400 } /* for (;;) */ | |
6401 | |
6402 if (best_regs_set) | |
6403 goto restore_best_regs; | |
6404 | |
6405 FREE_VARIABLES (); | |
6406 | |
6407 return -1; /* Failure to match. */ | |
1333 | 6408 } /* re_match_2_internal */ |
428 | 6409 |
6410 /* Subroutine definitions for re_match_2. */ | |
6411 | |
6412 | |
6413 /* We are passed P pointing to a register number after a start_memory. | |
6414 | |
6415 Return true if the pattern up to the corresponding stop_memory can | |
6416 match the empty string, and false otherwise. | |
6417 | |
6418 If we find the matching stop_memory, sets P to point to one past its number. | |
6419 Otherwise, sets P to an undefined byte less than or equal to END. | |
6420 | |
6421 We don't handle duplicates properly (yet). */ | |
6422 | |
460 | 6423 static re_bool |
428 | 6424 group_match_null_string_p (unsigned char **p, unsigned char *end, |
6425 register_info_type *reg_info) | |
6426 { | |
6427 int mcnt; | |
6428 /* Point to after the args to the start_memory. */ | |
6429 unsigned char *p1 = *p + 2; | |
6430 | |
6431 while (p1 < end) | |
6432 { | |
6433 /* Skip over opcodes that can match nothing, and return true or | |
6434 false, as appropriate, when we get to one that can't, or to the | |
6435 matching stop_memory. */ | |
6436 | |
6437 switch ((re_opcode_t) *p1) | |
6438 { | |
6439 /* Could be either a loop or a series of alternatives. */ | |
6440 case on_failure_jump: | |
6441 p1++; | |
6442 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
6443 | |
6444 /* If the next operation is not a jump backwards in the | |
6445 pattern. */ | |
6446 | |
6447 if (mcnt >= 0) | |
6448 { | |
6449 /* Go through the on_failure_jumps of the alternatives, | |
6450 seeing if any of the alternatives cannot match nothing. | |
6451 The last alternative starts with only a jump, | |
6452 whereas the rest start with on_failure_jump and end | |
6453 with a jump, e.g., here is the pattern for `a|b|c': | |
6454 | |
6455 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6 | |
6456 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3 | |
6457 /exactn/1/c | |
6458 | |
6459 So, we have to first go through the first (n-1) | |
6460 alternatives and then deal with the last one separately. */ | |
6461 | |
6462 | |
6463 /* Deal with the first (n-1) alternatives, which start | |
6464 with an on_failure_jump (see above) that jumps to right | |
6465 past a jump_past_alt. */ | |
6466 | |
6467 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt) | |
6468 { | |
6469 /* `mcnt' holds how many bytes long the alternative | |
6470 is, including the ending `jump_past_alt' and | |
6471 its number. */ | |
6472 | |
6473 if (!alt_match_null_string_p (p1, p1 + mcnt - 3, | |
6474 reg_info)) | |
6475 return false; | |
6476 | |
6477 /* Move to right after this alternative, including the | |
6478 jump_past_alt. */ | |
6479 p1 += mcnt; | |
6480 | |
6481 /* Break if it's the beginning of an n-th alternative | |
6482 that doesn't begin with an on_failure_jump. */ | |
6483 if ((re_opcode_t) *p1 != on_failure_jump) | |
6484 break; | |
6485 | |
6486 /* Still have to check that it's not an n-th | |
6487 alternative that starts with an on_failure_jump. */ | |
6488 p1++; | |
6489 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
6490 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt) | |
6491 { | |
6492 /* Get to the beginning of the n-th alternative. */ | |
6493 p1 -= 3; | |
6494 break; | |
6495 } | |
6496 } | |
6497 | |
6498 /* Deal with the last alternative: go back and get number | |
6499 of the `jump_past_alt' just before it. `mcnt' contains | |
6500 the length of the alternative. */ | |
6501 EXTRACT_NUMBER (mcnt, p1 - 2); | |
6502 | |
6503 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info)) | |
6504 return false; | |
6505 | |
6506 p1 += mcnt; /* Get past the n-th alternative. */ | |
6507 } /* if mcnt > 0 */ | |
6508 break; | |
6509 | |
6510 | |
6511 case stop_memory: | |
6512 assert (p1[1] == **p); | |
6513 *p = p1 + 2; | |
6514 return true; | |
6515 | |
6516 | |
6517 default: | |
6518 if (!common_op_match_null_string_p (&p1, end, reg_info)) | |
6519 return false; | |
6520 } | |
6521 } /* while p1 < end */ | |
6522 | |
6523 return false; | |
6524 } /* group_match_null_string_p */ | |
6525 | |
6526 | |
6527 /* Similar to group_match_null_string_p, but doesn't deal with alternatives: | |
6528 It expects P to be the first byte of a single alternative and END one | |
6529 byte past the last. The alternative can contain groups. */ | |
6530 | |
460 | 6531 static re_bool |
428 | 6532 alt_match_null_string_p (unsigned char *p, unsigned char *end, |
6533 register_info_type *reg_info) | |
6534 { | |
6535 int mcnt; | |
6536 unsigned char *p1 = p; | |
6537 | |
6538 while (p1 < end) | |
6539 { | |
6540 /* Skip over opcodes that can match nothing, and break when we get | |
6541 to one that can't. */ | |
6542 | |
6543 switch ((re_opcode_t) *p1) | |
6544 { | |
6545 /* It's a loop. */ | |
6546 case on_failure_jump: | |
6547 p1++; | |
6548 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
6549 p1 += mcnt; | |
6550 break; | |
6551 | |
6552 default: | |
6553 if (!common_op_match_null_string_p (&p1, end, reg_info)) | |
6554 return false; | |
6555 } | |
6556 } /* while p1 < end */ | |
6557 | |
6558 return true; | |
6559 } /* alt_match_null_string_p */ | |
6560 | |
6561 | |
6562 /* Deals with the ops common to group_match_null_string_p and | |
6563 alt_match_null_string_p. | |
6564 | |
6565 Sets P to one after the op and its arguments, if any. */ | |
6566 | |
460 | 6567 static re_bool |
428 | 6568 common_op_match_null_string_p (unsigned char **p, unsigned char *end, |
6569 register_info_type *reg_info) | |
6570 { | |
6571 int mcnt; | |
460 | 6572 re_bool ret; |
428 | 6573 int reg_no; |
6574 unsigned char *p1 = *p; | |
6575 | |
6576 switch ((re_opcode_t) *p1++) | |
6577 { | |
6578 case no_op: | |
6579 case begline: | |
6580 case endline: | |
6581 case begbuf: | |
6582 case endbuf: | |
6583 case wordbeg: | |
6584 case wordend: | |
6585 case wordbound: | |
6586 case notwordbound: | |
6587 #ifdef emacs | |
6588 case before_dot: | |
6589 case at_dot: | |
6590 case after_dot: | |
6591 #endif | |
6592 break; | |
6593 | |
6594 case start_memory: | |
6595 reg_no = *p1; | |
6596 assert (reg_no > 0 && reg_no <= MAX_REGNUM); | |
6597 ret = group_match_null_string_p (&p1, end, reg_info); | |
6598 | |
6599 /* Have to set this here in case we're checking a group which | |
6600 contains a group and a back reference to it. */ | |
6601 | |
6602 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE) | |
6603 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret; | |
6604 | |
6605 if (!ret) | |
6606 return false; | |
6607 break; | |
6608 | |
6609 /* If this is an optimized succeed_n for zero times, make the jump. */ | |
6610 case jump: | |
6611 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
6612 if (mcnt >= 0) | |
6613 p1 += mcnt; | |
6614 else | |
6615 return false; | |
6616 break; | |
6617 | |
6618 case succeed_n: | |
6619 /* Get to the number of times to succeed. */ | |
6620 p1 += 2; | |
6621 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
6622 | |
6623 if (mcnt == 0) | |
6624 { | |
6625 p1 -= 4; | |
6626 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
6627 p1 += mcnt; | |
6628 } | |
6629 else | |
6630 return false; | |
6631 break; | |
6632 | |
6633 case duplicate: | |
6634 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1])) | |
6635 return false; | |
6636 break; | |
6637 | |
6638 case set_number_at: | |
6639 p1 += 4; | |
6640 | |
6641 default: | |
6642 /* All other opcodes mean we cannot match the empty string. */ | |
6643 return false; | |
6644 } | |
6645 | |
6646 *p = p1; | |
6647 return true; | |
6648 } /* common_op_match_null_string_p */ | |
6649 | |
6650 | |
6651 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN | |
6652 bytes; nonzero otherwise. */ | |
6653 | |
6654 static int | |
446 | 6655 bcmp_translate (re_char *s1, re_char *s2, |
826 | 6656 REGISTER int len, RE_TRANSLATE_TYPE translate |
6657 #ifdef emacs | |
2333 | 6658 , Internal_Format USED_IF_MULE (fmt), |
6659 Lisp_Object USED_IF_MULE (lispobj) | |
826 | 6660 #endif |
6661 ) | |
428 | 6662 { |
826 | 6663 REGISTER re_char *p1 = s1, *p2 = s2; |
446 | 6664 #ifdef MULE |
826 | 6665 re_char *p1_end = s1 + len; |
6666 re_char *p2_end = s2 + len; | |
446 | 6667 |
6668 while (p1 != p1_end && p2 != p2_end) | |
6669 { | |
867 | 6670 Ichar p1_ch, p2_ch; |
6671 | |
6672 p1_ch = itext_ichar_fmt (p1, fmt, lispobj); | |
6673 p2_ch = itext_ichar_fmt (p2, fmt, lispobj); | |
826 | 6674 |
6675 if (RE_TRANSLATE_1 (p1_ch) | |
6676 != RE_TRANSLATE_1 (p2_ch)) | |
446 | 6677 return 1; |
867 | 6678 INC_IBYTEPTR_FMT (p1, fmt); |
6679 INC_IBYTEPTR_FMT (p2, fmt); | |
446 | 6680 } |
6681 #else /* not MULE */ | |
428 | 6682 while (len) |
6683 { | |
826 | 6684 if (RE_TRANSLATE_1 (*p1++) != RE_TRANSLATE_1 (*p2++)) return 1; |
428 | 6685 len--; |
6686 } | |
446 | 6687 #endif /* MULE */ |
428 | 6688 return 0; |
6689 } | |
6690 | |
6691 /* Entry points for GNU code. */ | |
6692 | |
6693 /* re_compile_pattern is the GNU regular expression compiler: it | |
6694 compiles PATTERN (of length SIZE) and puts the result in BUFP. | |
6695 Returns 0 if the pattern was valid, otherwise an error string. | |
6696 | |
6697 Assumes the `allocated' (and perhaps `buffer') and `translate' fields | |
6698 are set in BUFP on entry. | |
6699 | |
6700 We call regex_compile to do the actual compilation. */ | |
6701 | |
442 | 6702 const char * |
6703 re_compile_pattern (const char *pattern, int length, | |
428 | 6704 struct re_pattern_buffer *bufp) |
6705 { | |
6706 reg_errcode_t ret; | |
6707 | |
6708 /* GNU code is written to assume at least RE_NREGS registers will be set | |
6709 (and at least one extra will be -1). */ | |
6710 bufp->regs_allocated = REGS_UNALLOCATED; | |
6711 | |
6712 /* And GNU code determines whether or not to get register information | |
6713 by passing null for the REGS argument to re_match, etc., not by | |
6714 setting no_sub. */ | |
6715 bufp->no_sub = 0; | |
6716 | |
6717 /* Match anchors at newline. */ | |
6718 bufp->newline_anchor = 1; | |
6719 | |
826 | 6720 ret = regex_compile ((unsigned char *) pattern, length, re_syntax_options, |
6721 bufp); | |
428 | 6722 |
6723 if (!ret) | |
6724 return NULL; | |
6725 return gettext (re_error_msgid[(int) ret]); | |
6726 } | |
6727 | |
6728 /* Entry points compatible with 4.2 BSD regex library. We don't define | |
6729 them unless specifically requested. */ | |
6730 | |
6731 #ifdef _REGEX_RE_COMP | |
6732 | |
6733 /* BSD has one and only one pattern buffer. */ | |
6734 static struct re_pattern_buffer re_comp_buf; | |
6735 | |
6736 char * | |
442 | 6737 re_comp (const char *s) |
428 | 6738 { |
6739 reg_errcode_t ret; | |
6740 | |
6741 if (!s) | |
6742 { | |
6743 if (!re_comp_buf.buffer) | |
6744 return gettext ("No previous regular expression"); | |
6745 return 0; | |
6746 } | |
6747 | |
6748 if (!re_comp_buf.buffer) | |
6749 { | |
1333 | 6750 re_comp_buf.buffer = (unsigned char *) xmalloc (200); |
428 | 6751 if (re_comp_buf.buffer == NULL) |
6752 return gettext (re_error_msgid[(int) REG_ESPACE]); | |
6753 re_comp_buf.allocated = 200; | |
6754 | |
1333 | 6755 re_comp_buf.fastmap = (char *) xmalloc (1 << BYTEWIDTH); |
428 | 6756 if (re_comp_buf.fastmap == NULL) |
6757 return gettext (re_error_msgid[(int) REG_ESPACE]); | |
6758 } | |
6759 | |
6760 /* Since `re_exec' always passes NULL for the `regs' argument, we | |
6761 don't need to initialize the pattern buffer fields which affect it. */ | |
6762 | |
6763 /* Match anchors at newlines. */ | |
6764 re_comp_buf.newline_anchor = 1; | |
6765 | |
826 | 6766 ret = regex_compile ((unsigned char *)s, strlen (s), re_syntax_options, |
6767 &re_comp_buf); | |
428 | 6768 |
6769 if (!ret) | |
6770 return NULL; | |
6771 | |
442 | 6772 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ |
428 | 6773 return (char *) gettext (re_error_msgid[(int) ret]); |
6774 } | |
6775 | |
6776 | |
6777 int | |
442 | 6778 re_exec (const char *s) |
428 | 6779 { |
442 | 6780 const int len = strlen (s); |
428 | 6781 return |
6782 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0); | |
6783 } | |
6784 #endif /* _REGEX_RE_COMP */ | |
6785 | |
6786 /* POSIX.2 functions. Don't define these for Emacs. */ | |
6787 | |
6788 #ifndef emacs | |
6789 | |
6790 /* regcomp takes a regular expression as a string and compiles it. | |
6791 | |
6792 PREG is a regex_t *. We do not expect any fields to be initialized, | |
6793 since POSIX says we shouldn't. Thus, we set | |
6794 | |
6795 `buffer' to the compiled pattern; | |
6796 `used' to the length of the compiled pattern; | |
6797 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the | |
6798 REG_EXTENDED bit in CFLAGS is set; otherwise, to | |
6799 RE_SYNTAX_POSIX_BASIC; | |
6800 `newline_anchor' to REG_NEWLINE being set in CFLAGS; | |
6801 `fastmap' and `fastmap_accurate' to zero; | |
6802 `re_nsub' to the number of subexpressions in PATTERN. | |
502 | 6803 (non-shy of course. POSIX probably doesn't know about |
6804 shy ones, and in any case they should be invisible.) | |
428 | 6805 |
6806 PATTERN is the address of the pattern string. | |
6807 | |
6808 CFLAGS is a series of bits which affect compilation. | |
6809 | |
6810 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we | |
6811 use POSIX basic syntax. | |
6812 | |
6813 If REG_NEWLINE is set, then . and [^...] don't match newline. | |
6814 Also, regexec will try a match beginning after every newline. | |
6815 | |
6816 If REG_ICASE is set, then we consider upper- and lowercase | |
6817 versions of letters to be equivalent when matching. | |
6818 | |
6819 If REG_NOSUB is set, then when PREG is passed to regexec, that | |
6820 routine will report only success or failure, and nothing about the | |
6821 registers. | |
6822 | |
6823 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for | |
6824 the return codes and their meanings.) */ | |
6825 | |
6826 int | |
442 | 6827 regcomp (regex_t *preg, const char *pattern, int cflags) |
428 | 6828 { |
6829 reg_errcode_t ret; | |
647 | 6830 unsigned int syntax |
428 | 6831 = (cflags & REG_EXTENDED) ? |
6832 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC; | |
6833 | |
6834 /* regex_compile will allocate the space for the compiled pattern. */ | |
6835 preg->buffer = 0; | |
6836 preg->allocated = 0; | |
6837 preg->used = 0; | |
6838 | |
6839 /* Don't bother to use a fastmap when searching. This simplifies the | |
6840 REG_NEWLINE case: if we used a fastmap, we'd have to put all the | |
6841 characters after newlines into the fastmap. This way, we just try | |
6842 every character. */ | |
6843 preg->fastmap = 0; | |
6844 | |
6845 if (cflags & REG_ICASE) | |
6846 { | |
647 | 6847 int i; |
428 | 6848 |
1333 | 6849 preg->translate = (char *) xmalloc (CHAR_SET_SIZE); |
428 | 6850 if (preg->translate == NULL) |
6851 return (int) REG_ESPACE; | |
6852 | |
6853 /* Map uppercase characters to corresponding lowercase ones. */ | |
6854 for (i = 0; i < CHAR_SET_SIZE; i++) | |
6855 preg->translate[i] = ISUPPER (i) ? tolower (i) : i; | |
6856 } | |
6857 else | |
6858 preg->translate = NULL; | |
6859 | |
6860 /* If REG_NEWLINE is set, newlines are treated differently. */ | |
6861 if (cflags & REG_NEWLINE) | |
6862 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ | |
6863 syntax &= ~RE_DOT_NEWLINE; | |
6864 syntax |= RE_HAT_LISTS_NOT_NEWLINE; | |
6865 /* It also changes the matching behavior. */ | |
6866 preg->newline_anchor = 1; | |
6867 } | |
6868 else | |
6869 preg->newline_anchor = 0; | |
6870 | |
6871 preg->no_sub = !!(cflags & REG_NOSUB); | |
6872 | |
6873 /* POSIX says a null character in the pattern terminates it, so we | |
6874 can use strlen here in compiling the pattern. */ | |
446 | 6875 ret = regex_compile ((unsigned char *) pattern, strlen (pattern), syntax, preg); |
428 | 6876 |
6877 /* POSIX doesn't distinguish between an unmatched open-group and an | |
6878 unmatched close-group: both are REG_EPAREN. */ | |
6879 if (ret == REG_ERPAREN) ret = REG_EPAREN; | |
6880 | |
6881 return (int) ret; | |
6882 } | |
6883 | |
6884 | |
6885 /* regexec searches for a given pattern, specified by PREG, in the | |
6886 string STRING. | |
6887 | |
6888 If NMATCH is zero or REG_NOSUB was set in the cflags argument to | |
6889 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at | |
6890 least NMATCH elements, and we set them to the offsets of the | |
6891 corresponding matched substrings. | |
6892 | |
6893 EFLAGS specifies `execution flags' which affect matching: if | |
6894 REG_NOTBOL is set, then ^ does not match at the beginning of the | |
6895 string; if REG_NOTEOL is set, then $ does not match at the end. | |
6896 | |
6897 We return 0 if we find a match and REG_NOMATCH if not. */ | |
6898 | |
6899 int | |
442 | 6900 regexec (const regex_t *preg, const char *string, size_t nmatch, |
428 | 6901 regmatch_t pmatch[], int eflags) |
6902 { | |
6903 int ret; | |
6904 struct re_registers regs; | |
6905 regex_t private_preg; | |
6906 int len = strlen (string); | |
460 | 6907 re_bool want_reg_info = !preg->no_sub && nmatch > 0; |
428 | 6908 |
6909 private_preg = *preg; | |
6910 | |
6911 private_preg.not_bol = !!(eflags & REG_NOTBOL); | |
6912 private_preg.not_eol = !!(eflags & REG_NOTEOL); | |
6913 | |
6914 /* The user has told us exactly how many registers to return | |
6915 information about, via `nmatch'. We have to pass that on to the | |
6916 matching routines. */ | |
6917 private_preg.regs_allocated = REGS_FIXED; | |
6918 | |
6919 if (want_reg_info) | |
6920 { | |
647 | 6921 regs.num_regs = (int) nmatch; |
6922 regs.start = TALLOC ((int) nmatch, regoff_t); | |
6923 regs.end = TALLOC ((int) nmatch, regoff_t); | |
428 | 6924 if (regs.start == NULL || regs.end == NULL) |
6925 return (int) REG_NOMATCH; | |
6926 } | |
6927 | |
6928 /* Perform the searching operation. */ | |
6929 ret = re_search (&private_preg, string, len, | |
6930 /* start: */ 0, /* range: */ len, | |
6931 want_reg_info ? ®s : (struct re_registers *) 0); | |
6932 | |
6933 /* Copy the register information to the POSIX structure. */ | |
6934 if (want_reg_info) | |
6935 { | |
6936 if (ret >= 0) | |
6937 { | |
647 | 6938 int r; |
6939 | |
6940 for (r = 0; r < (int) nmatch; r++) | |
428 | 6941 { |
6942 pmatch[r].rm_so = regs.start[r]; | |
6943 pmatch[r].rm_eo = regs.end[r]; | |
6944 } | |
6945 } | |
6946 | |
6947 /* If we needed the temporary register info, free the space now. */ | |
1726 | 6948 xfree (regs.start, regoff_t *); |
6949 xfree (regs.end, regoff_t *); | |
428 | 6950 } |
6951 | |
6952 /* We want zero return to mean success, unlike `re_search'. */ | |
6953 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH; | |
6954 } | |
6955 | |
6956 | |
6957 /* Returns a message corresponding to an error code, ERRCODE, returned | |
6958 from either regcomp or regexec. We don't use PREG here. */ | |
6959 | |
6960 size_t | |
2286 | 6961 regerror (int errcode, const regex_t *UNUSED (preg), char *errbuf, |
647 | 6962 size_t errbuf_size) |
428 | 6963 { |
442 | 6964 const char *msg; |
665 | 6965 Bytecount msg_size; |
428 | 6966 |
6967 if (errcode < 0 | |
647 | 6968 || errcode >= (int) (sizeof (re_error_msgid) / |
6969 sizeof (re_error_msgid[0]))) | |
428 | 6970 /* Only error codes returned by the rest of the code should be passed |
6971 to this routine. If we are given anything else, or if other regex | |
6972 code generates an invalid error code, then the program has a bug. | |
6973 Dump core so we can fix it. */ | |
2500 | 6974 ABORT (); |
428 | 6975 |
6976 msg = gettext (re_error_msgid[errcode]); | |
6977 | |
6978 msg_size = strlen (msg) + 1; /* Includes the null. */ | |
6979 | |
6980 if (errbuf_size != 0) | |
6981 { | |
665 | 6982 if (msg_size > (Bytecount) errbuf_size) |
428 | 6983 { |
6984 strncpy (errbuf, msg, errbuf_size - 1); | |
6985 errbuf[errbuf_size - 1] = 0; | |
6986 } | |
6987 else | |
6988 strcpy (errbuf, msg); | |
6989 } | |
6990 | |
647 | 6991 return (size_t) msg_size; |
428 | 6992 } |
6993 | |
6994 | |
6995 /* Free dynamically allocated space used by PREG. */ | |
6996 | |
6997 void | |
6998 regfree (regex_t *preg) | |
6999 { | |
7000 if (preg->buffer != NULL) | |
1726 | 7001 xfree (preg->buffer, unsigned char *); |
428 | 7002 preg->buffer = NULL; |
7003 | |
7004 preg->allocated = 0; | |
7005 preg->used = 0; | |
7006 | |
7007 if (preg->fastmap != NULL) | |
1726 | 7008 xfree (preg->fastmap, char *); |
428 | 7009 preg->fastmap = NULL; |
7010 preg->fastmap_accurate = 0; | |
7011 | |
7012 if (preg->translate != NULL) | |
1726 | 7013 xfree (preg->translate, RE_TRANSLATE_TYPE); |
428 | 7014 preg->translate = NULL; |
7015 } | |
7016 | |
7017 #endif /* not emacs */ | |
7018 |