comparison src/regex.c @ 0:376386a54a3c r19-14

Import from CVS: tag r19-14
author cvs
date Mon, 13 Aug 2007 08:45:50 +0200
parents
children ac2d302a0011
comparison
equal deleted inserted replaced
-1:000000000000 0:376386a54a3c
1 /* Extended regular expression matching and search library,
2 version 0.12, extended for XEmacs.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
5
6 Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
7 Copyright (C) 1995 Sun Microsystems, Inc.
8 Copyright (C) 1995 Ben Wing.
9
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with this program; see the file COPYING. If not, write to
22 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 Boston, MA 02111-1307, USA. */
24
25 /* Synched up with: FSF 19.29. */
26
27 /* Changes made for XEmacs:
28
29 (1) the REGEX_BEGLINE_CHECK code from the XEmacs v18 regex routines
30 was added. This causes a huge speedup in font-locking.
31 (2) Rel-alloc is disabled when the MMAP version of rel-alloc is
32 being used, because it's too slow -- all those calls to mmap()
33 add humongous overhead.
34 */
35
36 /* AIX requires this to be the first thing in the file. */
37 #if defined (_AIX) && !defined (REGEX_MALLOC)
38 #pragma alloca
39 #endif
40
41 #define _GNU_SOURCE
42
43 #ifdef HAVE_CONFIG_H
44 #include <config.h>
45 #endif
46
47 /* We need this for `regex.h', and perhaps for the Emacs include files. */
48 #include <sys/types.h>
49
50 /* This is for other GNU distributions with internationalized messages. */
51 #if defined (I18N3) && (defined (HAVE_LIBINTL_H) || defined (_LIBC))
52 # include <libintl.h>
53 #else
54 # define gettext(msgid) (msgid)
55 #endif
56
57 /* XEmacs: define this to add in a speedup for patterns anchored at
58 the beginning of a line. Keep the ifdefs so that it's easier to
59 tell where/why this code has diverged from v19. */
60 #define REGEX_BEGLINE_CHECK
61
62 /* XEmacs: the current mmap-based ralloc handles small blocks very
63 poorly, so we disable it here. */
64
65 #if defined (REL_ALLOC) && defined (HAVE_MMAP)
66 # undef REL_ALLOC
67 #endif
68
69 /* The `emacs' switch turns on certain matching commands
70 that make sense only in Emacs. */
71 #ifdef emacs
72
73 #include "lisp.h"
74 #include "buffer.h"
75 #include "syntax.h"
76
77 #if (defined (DEBUG_XEMACS) && !defined (DEBUG))
78 #define DEBUG
79 #endif
80
81 void
82 complex_vars_of_regex (void)
83 {
84 }
85
86 #else /* not emacs */
87
88 /* If we are not linking with Emacs proper,
89 we can't use the relocating allocator
90 even if config.h says that we can. */
91 #undef REL_ALLOC
92
93 #if defined (STDC_HEADERS) || defined (_LIBC)
94 #include <stdlib.h>
95 #else
96 char *malloc ();
97 char *realloc ();
98 #endif
99
100 #define charptr_emchar(str) ((Emchar) (str)[0])
101
102 #if (LONGBITS > INTBITS)
103 # define EMACS_INT long
104 #else
105 # define EMACS_INT int
106 #endif
107
108 typedef int Emchar;
109
110 #define INC_CHARPTR(p) ((p)++)
111 #define DEC_CHARPTR(p) ((p)--)
112
113 /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
114 If nothing else has been done, use the method below. */
115 #ifdef INHIBIT_STRING_HEADER
116 #if !(defined (HAVE_BZERO) && defined (HAVE_BCOPY))
117 #if !defined (bzero) && !defined (bcopy)
118 #undef INHIBIT_STRING_HEADER
119 #endif
120 #endif
121 #endif
122
123 /* This is the normal way of making sure we have a bcopy and a bzero.
124 This is used in most programs--a few other programs avoid this
125 by defining INHIBIT_STRING_HEADER. */
126 #ifndef INHIBIT_STRING_HEADER
127 #if defined (HAVE_STRING_H) || defined (STDC_HEADERS) || defined (_LIBC)
128 #include <string.h>
129 #ifndef bcmp
130 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
131 #endif
132 #ifndef bcopy
133 #define bcopy(s, d, n) memcpy ((d), (s), (n))
134 #endif
135 #ifndef bzero
136 #define bzero(s, n) memset ((s), 0, (n))
137 #endif
138 #else
139 #include <strings.h>
140 #endif
141 #endif
142
143
144 /* Define the syntax stuff for \<, \>, etc. */
145
146 /* This must be nonzero for the wordchar and notwordchar pattern
147 commands in re_match_2. */
148 #ifndef Sword
149 #define Sword 1
150 #endif
151
152 #ifdef SYNTAX_TABLE
153
154 extern char *re_syntax_table;
155
156 #else /* not SYNTAX_TABLE */
157
158 /* How many characters in the character set. */
159 #define CHAR_SET_SIZE 256
160
161 static char re_syntax_table[CHAR_SET_SIZE];
162
163 static void
164 init_syntax_once (void)
165 {
166 register int c;
167 static int done = 0;
168
169 if (done)
170 return;
171
172 bzero (re_syntax_table, sizeof re_syntax_table);
173
174 for (c = 'a'; c <= 'z'; c++)
175 re_syntax_table[c] = Sword;
176
177 for (c = 'A'; c <= 'Z'; c++)
178 re_syntax_table[c] = Sword;
179
180 for (c = '0'; c <= '9'; c++)
181 re_syntax_table[c] = Sword;
182
183 re_syntax_table['_'] = Sword;
184
185 done = 1;
186 }
187
188 #endif /* not SYNTAX_TABLE */
189
190 #define SYNTAX_UNSAFE(ignored, c) re_syntax_table[c]
191
192 #endif /* not emacs */
193
194 /* Under XEmacs, this is needed because we don't define it elsewhere. */
195 #ifdef SWITCH_ENUM_BUG
196 #define SWITCH_ENUM_CAST(x) ((int)(x))
197 #else
198 #define SWITCH_ENUM_CAST(x) (x)
199 #endif
200
201
202 /* Get the interface, including the syntax bits. */
203 #include "regex.h"
204
205 /* isalpha etc. are used for the character classes. */
206 #include <ctype.h>
207
208 /* Jim Meyering writes:
209
210 "... Some ctype macros are valid only for character codes that
211 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
212 using /bin/cc or gcc but without giving an ansi option). So, all
213 ctype uses should be through macros like ISPRINT... If
214 STDC_HEADERS is defined, then autoconf has verified that the ctype
215 macros don't need to be guarded with references to isascii. ...
216 Defining isascii to 1 should let any compiler worth its salt
217 eliminate the && through constant folding." */
218
219 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
220 #define ISASCII_1(c) 1
221 #else
222 #define ISASCII_1(c) isascii(c)
223 #endif
224
225 #define ISASCII(c) ISASCII_1 (c)
226
227 #ifdef isblank
228 #define ISBLANK(c) (ISASCII (c) && isblank (c))
229 #else
230 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
231 #endif
232 #ifdef isgraph
233 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
234 #else
235 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
236 #endif
237
238 #define ISPRINT(c) (ISASCII (c) && isprint (c))
239 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
240 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
241 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
242 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
243 #define ISLOWER(c) (ISASCII (c) && islower (c))
244 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
245 #define ISSPACE(c) (ISASCII (c) && isspace (c))
246 #define ISUPPER(c) (ISASCII (c) && isupper (c))
247 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
248
249 #ifndef NULL
250 #define NULL (void *)0
251 #endif
252
253 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
254 since ours (we hope) works properly with all combinations of
255 machines, compilers, `char' and `unsigned char' argument types.
256 (Per Bothner suggested the basic approach.) */
257 #undef SIGN_EXTEND_CHAR
258 #if __STDC__
259 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
260 #else /* not __STDC__ */
261 /* As in Harbison and Steele. */
262 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
263 #endif
264
265 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
266 use `alloca' instead of `malloc'. This is because using malloc in
267 re_search* or re_match* could cause memory leaks when C-g is used in
268 Emacs; also, malloc is slower and causes storage fragmentation. On
269 the other hand, malloc is more portable, and easier to debug.
270
271 Because we sometimes use alloca, some routines have to be macros,
272 not functions -- `alloca'-allocated space disappears at the end of the
273 function it is called in. */
274
275 #ifdef REGEX_MALLOC
276
277 #define REGEX_ALLOCATE malloc
278 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
279 #define REGEX_FREE free
280
281 #else /* not REGEX_MALLOC */
282
283 /* Emacs already defines alloca, sometimes. */
284 #ifndef alloca
285
286 /* Make alloca work the best possible way. */
287 #ifdef __GNUC__
288 #define alloca __builtin_alloca
289 #else /* not __GNUC__ */
290 #if HAVE_ALLOCA_H
291 #include <alloca.h>
292 #else /* not __GNUC__ or HAVE_ALLOCA_H */
293 #ifndef _AIX /* Already did AIX, up at the top. */
294 char *alloca ();
295 #endif /* not _AIX */
296 #endif /* not HAVE_ALLOCA_H */
297 #endif /* not __GNUC__ */
298
299 #endif /* not alloca */
300
301 #define REGEX_ALLOCATE alloca
302
303 /* Assumes a `char *destination' variable. */
304 #define REGEX_REALLOCATE(source, osize, nsize) \
305 (destination = (char *) alloca (nsize), \
306 memmove (destination, source, osize), \
307 destination)
308
309 /* No need to do anything to free, after alloca. */
310 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
311
312 #endif /* not REGEX_MALLOC */
313
314 /* Define how to allocate the failure stack. */
315
316 #ifdef REL_ALLOC
317 #define REGEX_ALLOCATE_STACK(size) \
318 r_alloc ((char **) &failure_stack_ptr, (size))
319 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
320 r_re_alloc ((char **) &failure_stack_ptr, (nsize))
321 #define REGEX_FREE_STACK(ptr) \
322 r_alloc_free ((void **) &failure_stack_ptr)
323
324 #else /* not REL_ALLOC */
325
326 #ifdef REGEX_MALLOC
327
328 #define REGEX_ALLOCATE_STACK malloc
329 #define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
330 #define REGEX_FREE_STACK free
331
332 #else /* not REGEX_MALLOC */
333
334 #define REGEX_ALLOCATE_STACK alloca
335
336 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
337 REGEX_REALLOCATE (source, osize, nsize)
338 /* No need to explicitly free anything. */
339 #define REGEX_FREE_STACK(arg)
340
341 #endif /* not REGEX_MALLOC */
342 #endif /* not REL_ALLOC */
343
344
345 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
346 `string1' or just past its end. This works if PTR is NULL, which is
347 a good thing. */
348 #define FIRST_STRING_P(ptr) \
349 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
350
351 /* (Re)Allocate N items of type T using malloc, or fail. */
352 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
353 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
354 #define RETALLOC_IF(addr, n, t) \
355 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
356 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
357
358 #define BYTEWIDTH 8 /* In bits. */
359
360 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
361
362 #undef MAX
363 #undef MIN
364 #define MAX(a, b) ((a) > (b) ? (a) : (b))
365 #define MIN(a, b) ((a) < (b) ? (a) : (b))
366
367 typedef char boolean;
368 #define false 0
369 #define true 1
370
371
372 /* These are the command codes that appear in compiled regular
373 expressions. Some opcodes are followed by argument bytes. A
374 command code can specify any interpretation whatsoever for its
375 arguments. Zero bytes may appear in the compiled regular expression. */
376
377 typedef enum
378 {
379 no_op = 0,
380
381 /* Succeed right away--no more backtracking. */
382 succeed,
383
384 /* Followed by one byte giving n, then by n literal bytes. */
385 exactn,
386
387 /* Matches any (more or less) character. */
388 anychar,
389
390 /* Matches any one char belonging to specified set. First
391 following byte is number of bitmap bytes. Then come bytes
392 for a bitmap saying which chars are in. Bits in each byte
393 are ordered low-bit-first. A character is in the set if its
394 bit is 1. A character too large to have a bit in the map is
395 automatically not in the set. */
396 charset,
397
398 /* Same parameters as charset, but match any character that is
399 not one of those specified. */
400 charset_not,
401
402 /* Start remembering the text that is matched, for storing in a
403 register. Followed by one byte with the register number, in
404 the range 0 to one less than the pattern buffer's re_nsub
405 field. Then followed by one byte with the number of groups
406 inner to this one. (This last has to be part of the
407 start_memory only because we need it in the on_failure_jump
408 of re_match_2.) */
409 start_memory,
410
411 /* Stop remembering the text that is matched and store it in a
412 memory register. Followed by one byte with the register
413 number, in the range 0 to one less than `re_nsub' in the
414 pattern buffer, and one byte with the number of inner groups,
415 just like `start_memory'. (We need the number of inner
416 groups here because we don't have any easy way of finding the
417 corresponding start_memory when we're at a stop_memory.) */
418 stop_memory,
419
420 /* Match a duplicate of something remembered. Followed by one
421 byte containing the register number. */
422 duplicate,
423
424 /* Fail unless at beginning of line. */
425 begline,
426
427 /* Fail unless at end of line. */
428 endline,
429
430 /* Succeeds if at beginning of buffer (if emacs) or at beginning
431 of string to be matched (if not). */
432 begbuf,
433
434 /* Analogously, for end of buffer/string. */
435 endbuf,
436
437 /* Followed by two byte relative address to which to jump. */
438 jump,
439
440 /* Same as jump, but marks the end of an alternative. */
441 jump_past_alt,
442
443 /* Followed by two-byte relative address of place to resume at
444 in case of failure. */
445 on_failure_jump,
446
447 /* Like on_failure_jump, but pushes a placeholder instead of the
448 current string position when executed. */
449 on_failure_keep_string_jump,
450
451 /* Throw away latest failure point and then jump to following
452 two-byte relative address. */
453 pop_failure_jump,
454
455 /* Change to pop_failure_jump if know won't have to backtrack to
456 match; otherwise change to jump. This is used to jump
457 back to the beginning of a repeat. If what follows this jump
458 clearly won't match what the repeat does, such that we can be
459 sure that there is no use backtracking out of repetitions
460 already matched, then we change it to a pop_failure_jump.
461 Followed by two-byte address. */
462 maybe_pop_jump,
463
464 /* Jump to following two-byte address, and push a dummy failure
465 point. This failure point will be thrown away if an attempt
466 is made to use it for a failure. A `+' construct makes this
467 before the first repeat. Also used as an intermediary kind
468 of jump when compiling an alternative. */
469 dummy_failure_jump,
470
471 /* Push a dummy failure point and continue. Used at the end of
472 alternatives. */
473 push_dummy_failure,
474
475 /* Followed by two-byte relative address and two-byte number n.
476 After matching N times, jump to the address upon failure. */
477 succeed_n,
478
479 /* Followed by two-byte relative address, and two-byte number n.
480 Jump to the address N times, then fail. */
481 jump_n,
482
483 /* Set the following two-byte relative address to the
484 subsequent two-byte number. The address *includes* the two
485 bytes of number. */
486 set_number_at,
487
488 wordchar, /* Matches any word-constituent character. */
489 notwordchar, /* Matches any char that is not a word-constituent. */
490
491 wordbeg, /* Succeeds if at word beginning. */
492 wordend, /* Succeeds if at word end. */
493
494 wordbound, /* Succeeds if at a word boundary. */
495 notwordbound /* Succeeds if not at a word boundary. */
496
497 #ifdef emacs
498 ,before_dot, /* Succeeds if before point. */
499 at_dot, /* Succeeds if at point. */
500 after_dot, /* Succeeds if after point. */
501
502 /* Matches any character whose syntax is specified. Followed by
503 a byte which contains a syntax code, e.g., Sword. */
504 syntaxspec,
505
506 /* Matches any character whose syntax is not that specified. */
507 notsyntaxspec
508 #endif /* emacs */
509 } re_opcode_t;
510
511 /* Common operations on the compiled pattern. */
512
513 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
514
515 #define STORE_NUMBER(destination, number) \
516 do { \
517 (destination)[0] = (number) & 0377; \
518 (destination)[1] = (number) >> 8; \
519 } while (0)
520
521 /* Same as STORE_NUMBER, except increment DESTINATION to
522 the byte after where the number is stored. Therefore, DESTINATION
523 must be an lvalue. */
524
525 #define STORE_NUMBER_AND_INCR(destination, number) \
526 do { \
527 STORE_NUMBER (destination, number); \
528 (destination) += 2; \
529 } while (0)
530
531 /* Put into DESTINATION a number stored in two contiguous bytes starting
532 at SOURCE. */
533
534 #define EXTRACT_NUMBER(destination, source) \
535 do { \
536 (destination) = *(source) & 0377; \
537 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
538 } while (0)
539
540 #ifdef DEBUG
541 static void
542 extract_number (int *dest, unsigned char *source)
543 {
544 int temp = SIGN_EXTEND_CHAR (*(source + 1));
545 *dest = *source & 0377;
546 *dest += temp << 8;
547 }
548
549 #ifndef EXTRACT_MACROS /* To debug the macros. */
550 #undef EXTRACT_NUMBER
551 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
552 #endif /* not EXTRACT_MACROS */
553
554 #endif /* DEBUG */
555
556 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
557 SOURCE must be an lvalue. */
558
559 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
560 do { \
561 EXTRACT_NUMBER (destination, source); \
562 (source) += 2; \
563 } while (0)
564
565 #ifdef DEBUG
566 static void
567 extract_number_and_incr (int *destination, unsigned char **source)
568 {
569 extract_number (destination, *source);
570 *source += 2;
571 }
572
573 #ifndef EXTRACT_MACROS
574 #undef EXTRACT_NUMBER_AND_INCR
575 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
576 extract_number_and_incr (&dest, &src)
577 #endif /* not EXTRACT_MACROS */
578
579 #endif /* DEBUG */
580
581 /* If DEBUG is defined, Regex prints many voluminous messages about what
582 it is doing (if the variable `debug' is nonzero). If linked with the
583 main program in `iregex.c', you can enter patterns and strings
584 interactively. And if linked with the main program in `main.c' and
585 the other test files, you can run the already-written tests. */
586
587 #if defined (DEBUG)
588
589 /* We use standard I/O for debugging. */
590 #include <stdio.h>
591
592 #ifndef emacs
593 /* XEmacs provides its own version of assert() */
594 /* It is useful to test things that ``must'' be true when debugging. */
595 #include <assert.h>
596 #endif
597
598 static int debug = 0;
599
600 #define DEBUG_STATEMENT(e) e
601 #define DEBUG_PRINT1(x) if (debug) printf (x)
602 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
603 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
604 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
605 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
606 if (debug) print_partial_compiled_pattern (s, e)
607 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
608 if (debug) print_double_string (w, s1, sz1, s2, sz2)
609
610
611 /* Print the fastmap in human-readable form. */
612
613 static void
614 print_fastmap (char *fastmap)
615 {
616 unsigned was_a_range = 0;
617 unsigned i = 0;
618
619 while (i < (1 << BYTEWIDTH))
620 {
621 if (fastmap[i++])
622 {
623 was_a_range = 0;
624 putchar (i - 1);
625 while (i < (1 << BYTEWIDTH) && fastmap[i])
626 {
627 was_a_range = 1;
628 i++;
629 }
630 if (was_a_range)
631 {
632 printf ("-");
633 putchar (i - 1);
634 }
635 }
636 }
637 putchar ('\n');
638 }
639
640
641 /* Print a compiled pattern string in human-readable form, starting at
642 the START pointer into it and ending just before the pointer END. */
643
644 static void
645 print_partial_compiled_pattern (unsigned char *start, unsigned char *end)
646 {
647 int mcnt, mcnt2;
648 unsigned char *p = start;
649 unsigned char *pend = end;
650
651 if (start == NULL)
652 {
653 printf ("(null)\n");
654 return;
655 }
656
657 /* Loop over pattern commands. */
658 while (p < pend)
659 {
660 printf ("%d:\t", p - start);
661
662 switch ((re_opcode_t) *p++)
663 {
664 case no_op:
665 printf ("/no_op");
666 break;
667
668 case exactn:
669 mcnt = *p++;
670 printf ("/exactn/%d", mcnt);
671 do
672 {
673 putchar ('/');
674 putchar (*p++);
675 }
676 while (--mcnt);
677 break;
678
679 case start_memory:
680 mcnt = *p++;
681 printf ("/start_memory/%d/%d", mcnt, *p++);
682 break;
683
684 case stop_memory:
685 mcnt = *p++;
686 printf ("/stop_memory/%d/%d", mcnt, *p++);
687 break;
688
689 case duplicate:
690 printf ("/duplicate/%d", *p++);
691 break;
692
693 case anychar:
694 printf ("/anychar");
695 break;
696
697 case charset:
698 case charset_not:
699 {
700 register int c, last = -100;
701 register int in_range = 0;
702
703 printf ("/charset [%s",
704 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
705
706 assert (p + *p < pend);
707
708 for (c = 0; c < 256; c++)
709 if (((unsigned char) (c / 8) < *p)
710 && (p[1 + (c/8)] & (1 << (c % 8))))
711 {
712 /* Are we starting a range? */
713 if (last + 1 == c && ! in_range)
714 {
715 putchar ('-');
716 in_range = 1;
717 }
718 /* Have we broken a range? */
719 else if (last + 1 != c && in_range)
720 {
721 putchar (last);
722 in_range = 0;
723 }
724
725 if (! in_range)
726 putchar (c);
727
728 last = c;
729 }
730
731 if (in_range)
732 putchar (last);
733
734 putchar (']');
735
736 p += 1 + *p;
737 }
738 break;
739
740 case begline:
741 printf ("/begline");
742 break;
743
744 case endline:
745 printf ("/endline");
746 break;
747
748 case on_failure_jump:
749 extract_number_and_incr (&mcnt, &p);
750 printf ("/on_failure_jump to %d", p + mcnt - start);
751 break;
752
753 case on_failure_keep_string_jump:
754 extract_number_and_incr (&mcnt, &p);
755 printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
756 break;
757
758 case dummy_failure_jump:
759 extract_number_and_incr (&mcnt, &p);
760 printf ("/dummy_failure_jump to %d", p + mcnt - start);
761 break;
762
763 case push_dummy_failure:
764 printf ("/push_dummy_failure");
765 break;
766
767 case maybe_pop_jump:
768 extract_number_and_incr (&mcnt, &p);
769 printf ("/maybe_pop_jump to %d", p + mcnt - start);
770 break;
771
772 case pop_failure_jump:
773 extract_number_and_incr (&mcnt, &p);
774 printf ("/pop_failure_jump to %d", p + mcnt - start);
775 break;
776
777 case jump_past_alt:
778 extract_number_and_incr (&mcnt, &p);
779 printf ("/jump_past_alt to %d", p + mcnt - start);
780 break;
781
782 case jump:
783 extract_number_and_incr (&mcnt, &p);
784 printf ("/jump to %d", p + mcnt - start);
785 break;
786
787 case succeed_n:
788 extract_number_and_incr (&mcnt, &p);
789 extract_number_and_incr (&mcnt2, &p);
790 printf ("/succeed_n to %d, %d times", p + mcnt - start, mcnt2);
791 break;
792
793 case jump_n:
794 extract_number_and_incr (&mcnt, &p);
795 extract_number_and_incr (&mcnt2, &p);
796 printf ("/jump_n to %d, %d times", p + mcnt - start, mcnt2);
797 break;
798
799 case set_number_at:
800 extract_number_and_incr (&mcnt, &p);
801 extract_number_and_incr (&mcnt2, &p);
802 printf ("/set_number_at location %d to %d", p + mcnt - start, mcnt2);
803 break;
804
805 case wordbound:
806 printf ("/wordbound");
807 break;
808
809 case notwordbound:
810 printf ("/notwordbound");
811 break;
812
813 case wordbeg:
814 printf ("/wordbeg");
815 break;
816
817 case wordend:
818 printf ("/wordend");
819
820 #ifdef emacs
821 case before_dot:
822 printf ("/before_dot");
823 break;
824
825 case at_dot:
826 printf ("/at_dot");
827 break;
828
829 case after_dot:
830 printf ("/after_dot");
831 break;
832
833 case syntaxspec:
834 printf ("/syntaxspec");
835 mcnt = *p++;
836 printf ("/%d", mcnt);
837 break;
838
839 case notsyntaxspec:
840 printf ("/notsyntaxspec");
841 mcnt = *p++;
842 printf ("/%d", mcnt);
843 break;
844 #endif /* emacs */
845
846 case wordchar:
847 printf ("/wordchar");
848 break;
849
850 case notwordchar:
851 printf ("/notwordchar");
852 break;
853
854 case begbuf:
855 printf ("/begbuf");
856 break;
857
858 case endbuf:
859 printf ("/endbuf");
860 break;
861
862 default:
863 printf ("?%d", *(p-1));
864 }
865
866 putchar ('\n');
867 }
868
869 printf ("%d:\tend of pattern.\n", p - start);
870 }
871
872
873 static void
874 print_compiled_pattern (struct re_pattern_buffer *bufp)
875 {
876 unsigned char *buffer = bufp->buffer;
877
878 print_partial_compiled_pattern (buffer, buffer + bufp->used);
879 printf ("%ld bytes used/%ld bytes allocated.\n", bufp->used,
880 bufp->allocated);
881
882 if (bufp->fastmap_accurate && bufp->fastmap)
883 {
884 printf ("fastmap: ");
885 print_fastmap (bufp->fastmap);
886 }
887
888 printf ("re_nsub: %ld\t", (long)bufp->re_nsub);
889 printf ("regs_alloc: %d\t", bufp->regs_allocated);
890 printf ("can_be_null: %d\t", bufp->can_be_null);
891 printf ("newline_anchor: %d\n", bufp->newline_anchor);
892 printf ("no_sub: %d\t", bufp->no_sub);
893 printf ("not_bol: %d\t", bufp->not_bol);
894 printf ("not_eol: %d\t", bufp->not_eol);
895 printf ("syntax: %d\n", bufp->syntax);
896 /* Perhaps we should print the translate table? */
897 }
898
899
900 static void
901 print_double_string (CONST char *where, CONST char *string1, int size1,
902 CONST char *string2, int size2)
903 {
904 unsigned this_char;
905
906 if (where == NULL)
907 printf ("(null)");
908 else
909 {
910 if (FIRST_STRING_P (where))
911 {
912 for (this_char = where - string1; this_char < size1; this_char++)
913 putchar (string1[this_char]);
914
915 where = string2;
916 }
917
918 for (this_char = where - string2; this_char < size2; this_char++)
919 putchar (string2[this_char]);
920 }
921 }
922
923 #else /* not DEBUG */
924
925 #undef assert
926 #define assert(e)
927
928 #define DEBUG_STATEMENT(e)
929 #define DEBUG_PRINT1(x)
930 #define DEBUG_PRINT2(x1, x2)
931 #define DEBUG_PRINT3(x1, x2, x3)
932 #define DEBUG_PRINT4(x1, x2, x3, x4)
933 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
934 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
935
936 #endif /* not DEBUG */
937
938 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
939 also be assigned to arbitrarily: each pattern buffer stores its own
940 syntax, so it can be changed between regex compilations. */
941 /* This has no initializer because initialized variables in Emacs
942 become read-only after dumping. */
943 reg_syntax_t re_syntax_options;
944
945
946 /* Specify the precise syntax of regexps for compilation. This provides
947 for compatibility for various utilities which historically have
948 different, incompatible syntaxes.
949
950 The argument SYNTAX is a bit mask comprised of the various bits
951 defined in regex.h. We return the old syntax. */
952
953 reg_syntax_t
954 re_set_syntax (reg_syntax_t syntax)
955 {
956 reg_syntax_t ret = re_syntax_options;
957
958 re_syntax_options = syntax;
959 return ret;
960 }
961
962 /* This table gives an error message for each of the error codes listed
963 in regex.h. Obviously the order here has to be same as there.
964 POSIX doesn't require that we do anything for REG_NOERROR,
965 but why not be nice? */
966
967 static CONST char *re_error_msgid[] =
968 { "Success", /* REG_NOERROR */
969 "No match", /* REG_NOMATCH */
970 "Invalid regular expression", /* REG_BADPAT */
971 "Invalid collation character", /* REG_ECOLLATE */
972 "Invalid character class name", /* REG_ECTYPE */
973 "Trailing backslash", /* REG_EESCAPE */
974 "Invalid back reference", /* REG_ESUBREG */
975 "Unmatched [ or [^", /* REG_EBRACK */
976 "Unmatched ( or \\(", /* REG_EPAREN */
977 "Unmatched \\{", /* REG_EBRACE */
978 "Invalid content of \\{\\}", /* REG_BADBR */
979 "Invalid range end", /* REG_ERANGE */
980 "Memory exhausted", /* REG_ESPACE */
981 "Invalid preceding regular expression", /* REG_BADRPT */
982 "Premature end of regular expression", /* REG_EEND */
983 "Regular expression too big", /* REG_ESIZE */
984 "Unmatched ) or \\)", /* REG_ERPAREN */
985 #ifdef emacs
986 "Invalid syntax designator", /* REG_ESYNTAX */
987 #endif
988 };
989
990 /* Avoiding alloca during matching, to placate r_alloc. */
991
992 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
993 searching and matching functions should not call alloca. On some
994 systems, alloca is implemented in terms of malloc, and if we're
995 using the relocating allocator routines, then malloc could cause a
996 relocation, which might (if the strings being searched are in the
997 ralloc heap) shift the data out from underneath the regexp
998 routines.
999
1000 Here's another reason to avoid allocation: Emacs
1001 processes input from X in a signal handler; processing X input may
1002 call malloc; if input arrives while a matching routine is calling
1003 malloc, then we're scrod. But Emacs can't just block input while
1004 calling matching routines; then we don't notice interrupts when
1005 they come in. So, Emacs blocks input around all regexp calls
1006 except the matching calls, which it leaves unprotected, in the
1007 faith that they will not malloc. */
1008
1009 /* Normally, this is fine. */
1010 #define MATCH_MAY_ALLOCATE
1011
1012 /* When using GNU C, we are not REALLY using the C alloca, no matter
1013 what config.h may say. So don't take precautions for it. */
1014 #ifdef __GNUC__
1015 #undef C_ALLOCA
1016 #endif
1017
1018 /* The match routines may not allocate if (1) they would do it with malloc
1019 and (2) it's not safe for them to use malloc.
1020 Note that if REL_ALLOC is defined, matching would not use malloc for the
1021 failure stack, but we would still use it for the register vectors;
1022 so REL_ALLOC should not affect this. */
1023 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1024 #undef MATCH_MAY_ALLOCATE
1025 #endif
1026
1027
1028 /* Failure stack declarations and macros; both re_compile_fastmap and
1029 re_match_2 use a failure stack. These have to be macros because of
1030 REGEX_ALLOCATE_STACK. */
1031
1032
1033 /* Number of failure points for which to initially allocate space
1034 when matching. If this number is exceeded, we allocate more
1035 space, so it is not a hard limit. */
1036 #ifndef INIT_FAILURE_ALLOC
1037 #define INIT_FAILURE_ALLOC 5
1038 #endif
1039
1040 /* Roughly the maximum number of failure points on the stack. Would be
1041 exactly that if always used MAX_FAILURE_SPACE each time we failed.
1042 This is a variable only so users of regex can assign to it; we never
1043 change it ourselves. */
1044 #if defined (MATCH_MAY_ALLOCATE)
1045 int re_max_failures = 200000;
1046 #else
1047 int re_max_failures = 2000;
1048 #endif
1049
1050 union fail_stack_elt
1051 {
1052 unsigned char *pointer;
1053 int integer;
1054 };
1055
1056 typedef union fail_stack_elt fail_stack_elt_t;
1057
1058 typedef struct
1059 {
1060 fail_stack_elt_t *stack;
1061 unsigned size;
1062 unsigned avail; /* Offset of next open position. */
1063 } fail_stack_type;
1064
1065 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1066 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1067 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1068
1069
1070 /* Define macros to initialize and free the failure stack.
1071 Do `return -2' if the alloc fails. */
1072
1073 #ifdef MATCH_MAY_ALLOCATE
1074 #define INIT_FAIL_STACK() \
1075 do { \
1076 fail_stack.stack = (fail_stack_elt_t *) \
1077 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
1078 \
1079 if (fail_stack.stack == NULL) \
1080 return -2; \
1081 \
1082 fail_stack.size = INIT_FAILURE_ALLOC; \
1083 fail_stack.avail = 0; \
1084 } while (0)
1085
1086 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1087 #else
1088 #define INIT_FAIL_STACK() \
1089 do { \
1090 fail_stack.avail = 0; \
1091 } while (0)
1092
1093 #define RESET_FAIL_STACK()
1094 #endif
1095
1096
1097 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1098
1099 Return 1 if succeeds, and 0 if either ran out of memory
1100 allocating space for it or it was already too large.
1101
1102 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1103
1104 #define DOUBLE_FAIL_STACK(fail_stack) \
1105 ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
1106 ? 0 \
1107 : ((fail_stack).stack = (fail_stack_elt_t *) \
1108 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1109 (fail_stack).size * sizeof (fail_stack_elt_t), \
1110 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1111 \
1112 (fail_stack).stack == NULL \
1113 ? 0 \
1114 : ((fail_stack).size <<= 1, \
1115 1)))
1116
1117
1118 /* Push pointer POINTER on FAIL_STACK.
1119 Return 1 if was able to do so and 0 if ran out of memory allocating
1120 space to do so. */
1121 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1122 ((FAIL_STACK_FULL () \
1123 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1124 ? 0 \
1125 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1126 1))
1127
1128 /* Push a pointer value onto the failure stack.
1129 Assumes the variable `fail_stack'. Probably should only
1130 be called from within `PUSH_FAILURE_POINT'. */
1131 #define PUSH_FAILURE_POINTER(item) \
1132 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1133
1134 /* This pushes an integer-valued item onto the failure stack.
1135 Assumes the variable `fail_stack'. Probably should only
1136 be called from within `PUSH_FAILURE_POINT'. */
1137 #define PUSH_FAILURE_INT(item) \
1138 fail_stack.stack[fail_stack.avail++].integer = (item)
1139
1140 /* Push a fail_stack_elt_t value onto the failure stack.
1141 Assumes the variable `fail_stack'. Probably should only
1142 be called from within `PUSH_FAILURE_POINT'. */
1143 #define PUSH_FAILURE_ELT(item) \
1144 fail_stack.stack[fail_stack.avail++] = (item)
1145
1146 /* These three POP... operations complement the three PUSH... operations.
1147 All assume that `fail_stack' is nonempty. */
1148 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1149 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1150 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1151
1152 /* Used to omit pushing failure point id's when we're not debugging. */
1153 #ifdef DEBUG
1154 #define DEBUG_PUSH PUSH_FAILURE_INT
1155 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1156 #else
1157 #define DEBUG_PUSH(item)
1158 #define DEBUG_POP(item_addr)
1159 #endif
1160
1161
1162 /* Push the information about the state we will need
1163 if we ever fail back to it.
1164
1165 Requires variables fail_stack, regstart, regend, reg_info, and
1166 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
1167 declared.
1168
1169 Does `return FAILURE_CODE' if runs out of memory. */
1170
1171 #if !defined (REGEX_MALLOC) && !defined (REL_ALLOC)
1172 #define DECLARE_DESTINATION char *destination;
1173 #else
1174 #define DECLARE_DESTINATION
1175 #endif
1176
1177 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1178 do { \
1179 DECLARE_DESTINATION \
1180 /* Must be int, so when we don't save any registers, the arithmetic \
1181 of 0 + -1 isn't done as unsigned. */ \
1182 int this_reg; \
1183 \
1184 DEBUG_STATEMENT (failure_id++); \
1185 DEBUG_STATEMENT (nfailure_points_pushed++); \
1186 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1187 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
1188 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1189 \
1190 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
1191 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
1192 \
1193 /* Ensure we have enough space allocated for what we will push. */ \
1194 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1195 { \
1196 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1197 return failure_code; \
1198 \
1199 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
1200 (fail_stack).size); \
1201 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1202 } \
1203 \
1204 /* Push the info, starting with the registers. */ \
1205 DEBUG_PRINT1 ("\n"); \
1206 \
1207 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1208 this_reg++) \
1209 { \
1210 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
1211 DEBUG_STATEMENT (num_regs_pushed++); \
1212 \
1213 DEBUG_PRINT2 (" start: 0x%lx\n", \
1214 (unsigned long) regstart[this_reg]); \
1215 PUSH_FAILURE_POINTER (regstart[this_reg]); \
1216 \
1217 DEBUG_PRINT2 (" end: 0x%lx\n", \
1218 (unsigned long) regend[this_reg]); \
1219 PUSH_FAILURE_POINTER (regend[this_reg]); \
1220 \
1221 DEBUG_PRINT2 (" info: 0x%lx\n ", \
1222 * (unsigned long *) (&reg_info[this_reg])); \
1223 DEBUG_PRINT2 (" match_null=%d", \
1224 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1225 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1226 DEBUG_PRINT2 (" matched_something=%d", \
1227 MATCHED_SOMETHING (reg_info[this_reg])); \
1228 DEBUG_PRINT2 (" ever_matched=%d", \
1229 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1230 DEBUG_PRINT1 ("\n"); \
1231 PUSH_FAILURE_ELT (reg_info[this_reg].word); \
1232 } \
1233 \
1234 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\
1235 PUSH_FAILURE_INT (lowest_active_reg); \
1236 \
1237 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\
1238 PUSH_FAILURE_INT (highest_active_reg); \
1239 \
1240 DEBUG_PRINT2 (" Pushing pattern 0x%lx: ", \
1241 (unsigned long) pattern_place); \
1242 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1243 PUSH_FAILURE_POINTER (pattern_place); \
1244 \
1245 DEBUG_PRINT2 (" Pushing string 0x%lx: `", \
1246 (unsigned long) string_place); \
1247 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
1248 size2); \
1249 DEBUG_PRINT1 ("'\n"); \
1250 PUSH_FAILURE_POINTER (string_place); \
1251 \
1252 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1253 DEBUG_PUSH (failure_id); \
1254 } while (0)
1255
1256 /* This is the number of items that are pushed and popped on the stack
1257 for each register. */
1258 #define NUM_REG_ITEMS 3
1259
1260 /* Individual items aside from the registers. */
1261 #ifdef DEBUG
1262 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1263 #else
1264 #define NUM_NONREG_ITEMS 4
1265 #endif
1266
1267 /* We push at most this many items on the stack. */
1268 #define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
1269
1270 /* We actually push this many items. */
1271 #define NUM_FAILURE_ITEMS \
1272 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
1273 + NUM_NONREG_ITEMS)
1274
1275 /* How many items can still be added to the stack without overflowing it. */
1276 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1277
1278
1279 /* Pops what PUSH_FAIL_STACK pushes.
1280
1281 We restore into the parameters, all of which should be lvalues:
1282 STR -- the saved data position.
1283 PAT -- the saved pattern position.
1284 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1285 REGSTART, REGEND -- arrays of string positions.
1286 REG_INFO -- array of information about each subexpression.
1287
1288 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1289 `pend', `string1', `size1', `string2', and `size2'. */
1290
1291 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
1292 { \
1293 DEBUG_STATEMENT (fail_stack_elt_t ffailure_id;) \
1294 int this_reg; \
1295 CONST unsigned char *string_temp; \
1296 \
1297 assert (!FAIL_STACK_EMPTY ()); \
1298 \
1299 /* Remove failure points and point to how many regs pushed. */ \
1300 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1301 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
1302 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
1303 \
1304 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1305 \
1306 DEBUG_POP (&ffailure_id.integer); \
1307 DEBUG_PRINT2 (" Popping failure id: %u\n", \
1308 * (unsigned int *) &ffailure_id); \
1309 \
1310 /* If the saved string location is NULL, it came from an \
1311 on_failure_keep_string_jump opcode, and we want to throw away the \
1312 saved NULL, thus retaining our current position in the string. */ \
1313 string_temp = POP_FAILURE_POINTER (); \
1314 if (string_temp != NULL) \
1315 str = (CONST char *) string_temp; \
1316 \
1317 DEBUG_PRINT2 (" Popping string 0x%lx: `", (unsigned long) str); \
1318 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1319 DEBUG_PRINT1 ("'\n"); \
1320 \
1321 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1322 DEBUG_PRINT2 (" Popping pattern 0x%lx: ", (unsigned long) pat); \
1323 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1324 \
1325 /* Restore register info. */ \
1326 high_reg = (unsigned) POP_FAILURE_INT (); \
1327 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
1328 \
1329 low_reg = (unsigned) POP_FAILURE_INT (); \
1330 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
1331 \
1332 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1333 { \
1334 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
1335 \
1336 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1337 DEBUG_PRINT2 (" info: 0x%lx\n", \
1338 * (unsigned long *) &reg_info[this_reg]); \
1339 \
1340 regend[this_reg] = (CONST char *) POP_FAILURE_POINTER (); \
1341 DEBUG_PRINT2 (" end: 0x%lx\n", \
1342 (unsigned long) regend[this_reg]); \
1343 \
1344 regstart[this_reg] = (CONST char *) POP_FAILURE_POINTER (); \
1345 DEBUG_PRINT2 (" start: 0x%lx\n", \
1346 (unsigned long) regstart[this_reg]); \
1347 } \
1348 \
1349 set_regs_matched_done = 0; \
1350 DEBUG_STATEMENT (nfailure_points_popped++); \
1351 } /* POP_FAILURE_POINT */
1352
1353
1354
1355 /* Structure for per-register (a.k.a. per-group) information.
1356 Other register information, such as the
1357 starting and ending positions (which are addresses), and the list of
1358 inner groups (which is a bits list) are maintained in separate
1359 variables.
1360
1361 We are making a (strictly speaking) nonportable assumption here: that
1362 the compiler will pack our bit fields into something that fits into
1363 the type of `word', i.e., is something that fits into one item on the
1364 failure stack. */
1365
1366 typedef union
1367 {
1368 fail_stack_elt_t word;
1369 struct
1370 {
1371 /* This field is one if this group can match the empty string,
1372 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1373 #define MATCH_NULL_UNSET_VALUE 3
1374 unsigned match_null_string_p : 2;
1375 unsigned is_active : 1;
1376 unsigned matched_something : 1;
1377 unsigned ever_matched_something : 1;
1378 } bits;
1379 } register_info_type;
1380
1381 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1382 #define IS_ACTIVE(R) ((R).bits.is_active)
1383 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1384 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1385
1386
1387 /* Call this when have matched a real character; it sets `matched' flags
1388 for the subexpressions which we are currently inside. Also records
1389 that those subexprs have matched. */
1390 #define SET_REGS_MATCHED() \
1391 do \
1392 { \
1393 if (!set_regs_matched_done) \
1394 { \
1395 unsigned r; \
1396 set_regs_matched_done = 1; \
1397 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1398 { \
1399 MATCHED_SOMETHING (reg_info[r]) \
1400 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1401 = 1; \
1402 } \
1403 } \
1404 } \
1405 while (0)
1406
1407 /* Registers are set to a sentinel when they haven't yet matched. */
1408 static char reg_unset_dummy;
1409 #define REG_UNSET_VALUE (&reg_unset_dummy)
1410 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1411
1412 /* Subroutine declarations and macros for regex_compile. */
1413
1414 /* Fetch the next character in the uncompiled pattern---translating it
1415 if necessary. Also cast from a signed character in the constant
1416 string passed to us by the user to an unsigned char that we can use
1417 as an array index (in, e.g., `translate'). */
1418 #define PATFETCH(c) \
1419 do {if (p == pend) return REG_EEND; \
1420 assert (p < pend); \
1421 c = (unsigned char) *p++; \
1422 if (translate) c = (unsigned char) translate[c]; \
1423 } while (0)
1424
1425 /* Fetch the next character in the uncompiled pattern, with no
1426 translation. */
1427 #define PATFETCH_RAW(c) \
1428 do {if (p == pend) return REG_EEND; \
1429 assert (p < pend); \
1430 c = (unsigned char) *p++; \
1431 } while (0)
1432
1433 /* Go backwards one character in the pattern. */
1434 #define PATUNFETCH p--
1435
1436 #define PATFETCH_EITHER(emch) PATFETCH (emch)
1437 #define PATFETCH_RAW_EITHER(emch) PATFETCH_RAW (emch)
1438 #define PATUNFETCH_EITHER PATUNFETCH
1439
1440
1441 /* If `translate' is non-null, return translate[D], else just D. We
1442 cast the subscript to translate because some data is declared as
1443 `char *', to avoid warnings when a string constant is passed. But
1444 when we use a character as a subscript we must make it unsigned. */
1445 #define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d))
1446
1447
1448 /* Macros for outputting the compiled pattern into `buffer'. */
1449
1450 /* If the buffer isn't allocated when it comes in, use this. */
1451 #define INIT_BUF_SIZE 32
1452
1453 /* Make sure we have at least N more bytes of space in buffer. */
1454 #define GET_BUFFER_SPACE(n) \
1455 while (b - bufp->buffer + (n) > bufp->allocated) \
1456 EXTEND_BUFFER ()
1457
1458 /* Make sure we have one more byte of buffer space and then add C to it. */
1459 #define BUF_PUSH(c) \
1460 do { \
1461 GET_BUFFER_SPACE (1); \
1462 *b++ = (unsigned char) (c); \
1463 } while (0)
1464
1465
1466 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1467 #define BUF_PUSH_2(c1, c2) \
1468 do { \
1469 GET_BUFFER_SPACE (2); \
1470 *b++ = (unsigned char) (c1); \
1471 *b++ = (unsigned char) (c2); \
1472 } while (0)
1473
1474
1475 /* As with BUF_PUSH_2, except for three bytes. */
1476 #define BUF_PUSH_3(c1, c2, c3) \
1477 do { \
1478 GET_BUFFER_SPACE (3); \
1479 *b++ = (unsigned char) (c1); \
1480 *b++ = (unsigned char) (c2); \
1481 *b++ = (unsigned char) (c3); \
1482 } while (0)
1483
1484
1485 /* Store a jump with opcode OP at LOC to location TO. We store a
1486 relative address offset by the three bytes the jump itself occupies. */
1487 #define STORE_JUMP(op, loc, to) \
1488 store_op1 (op, loc, (to) - (loc) - 3)
1489
1490 /* Likewise, for a two-argument jump. */
1491 #define STORE_JUMP2(op, loc, to, arg) \
1492 store_op2 (op, loc, (to) - (loc) - 3, arg)
1493
1494 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
1495 #define INSERT_JUMP(op, loc, to) \
1496 insert_op1 (op, loc, (to) - (loc) - 3, b)
1497
1498 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1499 #define INSERT_JUMP2(op, loc, to, arg) \
1500 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1501
1502
1503 /* This is not an arbitrary limit: the arguments which represent offsets
1504 into the pattern are two bytes long. So if 2^16 bytes turns out to
1505 be too small, many things would have to change. */
1506 #define MAX_BUF_SIZE (1L << 16)
1507
1508
1509 /* Extend the buffer by twice its current size via realloc and
1510 reset the pointers that pointed into the old block to point to the
1511 correct places in the new one. If extending the buffer results in it
1512 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1513 #define EXTEND_BUFFER() \
1514 do { \
1515 unsigned char *old_buffer = bufp->buffer; \
1516 if (bufp->allocated == MAX_BUF_SIZE) \
1517 return REG_ESIZE; \
1518 bufp->allocated <<= 1; \
1519 if (bufp->allocated > MAX_BUF_SIZE) \
1520 bufp->allocated = MAX_BUF_SIZE; \
1521 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
1522 if (bufp->buffer == NULL) \
1523 return REG_ESPACE; \
1524 /* If the buffer moved, move all the pointers into it. */ \
1525 if (old_buffer != bufp->buffer) \
1526 { \
1527 b = (b - old_buffer) + bufp->buffer; \
1528 begalt = (begalt - old_buffer) + bufp->buffer; \
1529 if (fixup_alt_jump) \
1530 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
1531 if (laststart) \
1532 laststart = (laststart - old_buffer) + bufp->buffer; \
1533 if (pending_exact) \
1534 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
1535 } \
1536 } while (0)
1537
1538
1539 /* Since we have one byte reserved for the register number argument to
1540 {start,stop}_memory, the maximum number of groups we can report
1541 things about is what fits in that byte. */
1542 #define MAX_REGNUM 255
1543
1544 /* But patterns can have more than `MAX_REGNUM' registers. We just
1545 ignore the excess. */
1546 typedef unsigned regnum_t;
1547
1548
1549 /* Macros for the compile stack. */
1550
1551 /* Since offsets can go either forwards or backwards, this type needs to
1552 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1553 typedef int pattern_offset_t;
1554
1555 typedef struct
1556 {
1557 pattern_offset_t begalt_offset;
1558 pattern_offset_t fixup_alt_jump;
1559 pattern_offset_t inner_group_offset;
1560 pattern_offset_t laststart_offset;
1561 regnum_t regnum;
1562 } compile_stack_elt_t;
1563
1564
1565 typedef struct
1566 {
1567 compile_stack_elt_t *stack;
1568 unsigned size;
1569 unsigned avail; /* Offset of next open position. */
1570 } compile_stack_type;
1571
1572
1573 #define INIT_COMPILE_STACK_SIZE 32
1574
1575 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1576 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1577
1578 /* The next available element. */
1579 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1580
1581
1582 /* Set the bit for character C in a bit vector. */
1583 #define SET_LIST_BIT(c) \
1584 (b[((unsigned char) (c)) / BYTEWIDTH] \
1585 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1586
1587 #define SET_EITHER_BIT(c) SET_LIST_BIT (c)
1588
1589
1590
1591 /* Get the next unsigned number in the uncompiled pattern. */
1592 #define GET_UNSIGNED_NUMBER(num) \
1593 { if (p != pend) \
1594 { \
1595 PATFETCH (c); \
1596 while (ISDIGIT (c)) \
1597 { \
1598 if (num < 0) \
1599 num = 0; \
1600 num = num * 10 + c - '0'; \
1601 if (p == pend) \
1602 break; \
1603 PATFETCH (c); \
1604 } \
1605 } \
1606 }
1607
1608 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1609
1610 #define IS_CHAR_CLASS(string) \
1611 (STREQ (string, "alpha") || STREQ (string, "upper") \
1612 || STREQ (string, "lower") || STREQ (string, "digit") \
1613 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1614 || STREQ (string, "space") || STREQ (string, "print") \
1615 || STREQ (string, "punct") || STREQ (string, "graph") \
1616 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1617
1618 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
1619 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1620 static void insert_op1 (re_opcode_t op, unsigned char *loc, int arg,
1621 unsigned char *end);
1622 static void insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2,
1623 unsigned char *end);
1624 static boolean at_begline_loc_p (CONST char *pattern, CONST char *p,
1625 reg_syntax_t syntax);
1626 static boolean at_endline_loc_p (CONST char *p, CONST char *pend, int syntax);
1627 static boolean group_in_compile_stack (compile_stack_type compile_stack,
1628 regnum_t regnum);
1629 static reg_errcode_t compile_range (CONST char **p_ptr, CONST char *pend,
1630 char *translate, reg_syntax_t syntax,
1631 unsigned char *b);
1632 static boolean group_match_null_string_p (unsigned char **p,
1633 unsigned char *end,
1634 register_info_type *reg_info);
1635 static boolean alt_match_null_string_p (unsigned char *p, unsigned char *end,
1636 register_info_type *reg_info);
1637 static boolean common_op_match_null_string_p (unsigned char **p,
1638 unsigned char *end,
1639 register_info_type *reg_info);
1640 static int bcmp_translate (CONST unsigned char *s1, CONST unsigned char *s2,
1641 register int len, char *translate);
1642 static int re_match_2_internal (struct re_pattern_buffer *bufp,
1643 CONST char *string1, int size1,
1644 CONST char *string2, int size2, int pos,
1645 struct re_registers *regs, int stop);
1646
1647 #ifndef MATCH_MAY_ALLOCATE
1648
1649 /* If we cannot allocate large objects within re_match_2_internal,
1650 we make the fail stack and register vectors global.
1651 The fail stack, we grow to the maximum size when a regexp
1652 is compiled.
1653 The register vectors, we adjust in size each time we
1654 compile a regexp, according to the number of registers it needs. */
1655
1656 static fail_stack_type fail_stack;
1657
1658 /* Size with which the following vectors are currently allocated.
1659 That is so we can make them bigger as needed,
1660 but never make them smaller. */
1661 static int regs_allocated_size;
1662
1663 static CONST char ** regstart, ** regend;
1664 static CONST char ** old_regstart, ** old_regend;
1665 static CONST char **best_regstart, **best_regend;
1666 static register_info_type *reg_info;
1667 static CONST char **reg_dummy;
1668 static register_info_type *reg_info_dummy;
1669
1670 /* Make the register vectors big enough for NUM_REGS registers,
1671 but don't make them smaller. */
1672
1673 static
1674 regex_grow_registers (int num_regs)
1675 {
1676 if (num_regs > regs_allocated_size)
1677 {
1678 RETALLOC_IF (regstart, num_regs, CONST char *);
1679 RETALLOC_IF (regend, num_regs, CONST char *);
1680 RETALLOC_IF (old_regstart, num_regs, CONST char *);
1681 RETALLOC_IF (old_regend, num_regs, CONST char *);
1682 RETALLOC_IF (best_regstart, num_regs, CONST char *);
1683 RETALLOC_IF (best_regend, num_regs, CONST char *);
1684 RETALLOC_IF (reg_info, num_regs, register_info_type);
1685 RETALLOC_IF (reg_dummy, num_regs, CONST char *);
1686 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);
1687
1688 regs_allocated_size = num_regs;
1689 }
1690 }
1691
1692 #endif /* not MATCH_MAY_ALLOCATE */
1693
1694 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1695 Returns one of error codes defined in `regex.h', or zero for success.
1696
1697 Assumes the `allocated' (and perhaps `buffer') and `translate'
1698 fields are set in BUFP on entry.
1699
1700 If it succeeds, results are put in BUFP (if it returns an error, the
1701 contents of BUFP are undefined):
1702 `buffer' is the compiled pattern;
1703 `syntax' is set to SYNTAX;
1704 `used' is set to the length of the compiled pattern;
1705 `fastmap_accurate' is zero;
1706 `re_nsub' is the number of subexpressions in PATTERN;
1707 `not_bol' and `not_eol' are zero;
1708
1709 The `fastmap' and `newline_anchor' fields are neither
1710 examined nor set. */
1711
1712 /* Return, freeing storage we allocated. */
1713 #define FREE_STACK_RETURN(value) \
1714 return (free (compile_stack.stack), value)
1715
1716 static reg_errcode_t
1717 regex_compile (CONST char *pattern, int size, reg_syntax_t syntax,
1718 struct re_pattern_buffer *bufp)
1719 {
1720 /* We fetch characters from PATTERN here. We declare these as int
1721 (or possibly long) so that chars above 127 can be used as
1722 array indices. The macros that fetch a character from the pattern
1723 make sure to coerce to unsigned char before assigning, so we won't
1724 get bitten by negative numbers here. */
1725 /* XEmacs change: used to be unsigned char. */
1726 register EMACS_INT c, c1;
1727
1728 /* A random temporary spot in PATTERN. */
1729 CONST char *p1;
1730
1731 /* Points to the end of the buffer, where we should append. */
1732 register unsigned char *b;
1733
1734 /* Keeps track of unclosed groups. */
1735 compile_stack_type compile_stack;
1736
1737 /* Points to the current (ending) position in the pattern. */
1738 CONST char *p = pattern;
1739 CONST char *pend = pattern + size;
1740
1741 /* How to translate the characters in the pattern. */
1742 char *translate = bufp->translate;
1743
1744 /* Address of the count-byte of the most recently inserted `exactn'
1745 command. This makes it possible to tell if a new exact-match
1746 character can be added to that command or if the character requires
1747 a new `exactn' command. */
1748 unsigned char *pending_exact = 0;
1749
1750 /* Address of start of the most recently finished expression.
1751 This tells, e.g., postfix * where to find the start of its
1752 operand. Reset at the beginning of groups and alternatives. */
1753 unsigned char *laststart = 0;
1754
1755 /* Address of beginning of regexp, or inside of last group. */
1756 unsigned char *begalt;
1757
1758 /* Place in the uncompiled pattern (i.e., the {) to
1759 which to go back if the interval is invalid. */
1760 CONST char *beg_interval;
1761
1762 /* Address of the place where a forward jump should go to the end of
1763 the containing expression. Each alternative of an `or' -- except the
1764 last -- ends with a forward jump of this sort. */
1765 unsigned char *fixup_alt_jump = 0;
1766
1767 /* Counts open-groups as they are encountered. Remembered for the
1768 matching close-group on the compile stack, so the same register
1769 number is put in the stop_memory as the start_memory. */
1770 regnum_t regnum = 0;
1771
1772 #ifdef DEBUG
1773 DEBUG_PRINT1 ("\nCompiling pattern: ");
1774 if (debug)
1775 {
1776 unsigned debug_count;
1777
1778 for (debug_count = 0; debug_count < size; debug_count++)
1779 putchar (pattern[debug_count]);
1780 putchar ('\n');
1781 }
1782 #endif /* DEBUG */
1783
1784 /* Initialize the compile stack. */
1785 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1786 if (compile_stack.stack == NULL)
1787 return REG_ESPACE;
1788
1789 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1790 compile_stack.avail = 0;
1791
1792 /* Initialize the pattern buffer. */
1793 bufp->syntax = syntax;
1794 bufp->fastmap_accurate = 0;
1795 bufp->not_bol = bufp->not_eol = 0;
1796
1797 /* Set `used' to zero, so that if we return an error, the pattern
1798 printer (for debugging) will think there's no pattern. We reset it
1799 at the end. */
1800 bufp->used = 0;
1801
1802 /* Always count groups, whether or not bufp->no_sub is set. */
1803 bufp->re_nsub = 0;
1804
1805 #if !defined (emacs) && !defined (SYNTAX_TABLE)
1806 /* Initialize the syntax table. */
1807 init_syntax_once ();
1808 #endif
1809
1810 if (bufp->allocated == 0)
1811 {
1812 if (bufp->buffer)
1813 { /* If zero allocated, but buffer is non-null, try to realloc
1814 enough space. This loses if buffer's address is bogus, but
1815 that is the user's responsibility. */
1816 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
1817 }
1818 else
1819 { /* Caller did not allocate a buffer. Do it for them. */
1820 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
1821 }
1822 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
1823
1824 bufp->allocated = INIT_BUF_SIZE;
1825 }
1826
1827 begalt = b = bufp->buffer;
1828
1829 /* Loop through the uncompiled pattern until we're at the end. */
1830 while (p != pend)
1831 {
1832 PATFETCH (c);
1833
1834 switch (c)
1835 {
1836 case '^':
1837 {
1838 if ( /* If at start of pattern, it's an operator. */
1839 p == pattern + 1
1840 /* If context independent, it's an operator. */
1841 || syntax & RE_CONTEXT_INDEP_ANCHORS
1842 /* Otherwise, depends on what's come before. */
1843 || at_begline_loc_p (pattern, p, syntax))
1844 BUF_PUSH (begline);
1845 else
1846 goto normal_char;
1847 }
1848 break;
1849
1850
1851 case '$':
1852 {
1853 if ( /* If at end of pattern, it's an operator. */
1854 p == pend
1855 /* If context independent, it's an operator. */
1856 || syntax & RE_CONTEXT_INDEP_ANCHORS
1857 /* Otherwise, depends on what's next. */
1858 || at_endline_loc_p (p, pend, syntax))
1859 BUF_PUSH (endline);
1860 else
1861 goto normal_char;
1862 }
1863 break;
1864
1865
1866 case '+':
1867 case '?':
1868 if ((syntax & RE_BK_PLUS_QM)
1869 || (syntax & RE_LIMITED_OPS))
1870 goto normal_char;
1871 handle_plus:
1872 case '*':
1873 /* If there is no previous pattern... */
1874 if (!laststart)
1875 {
1876 if (syntax & RE_CONTEXT_INVALID_OPS)
1877 FREE_STACK_RETURN (REG_BADRPT);
1878 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
1879 goto normal_char;
1880 }
1881
1882 {
1883 /* Are we optimizing this jump? */
1884 boolean keep_string_p = false;
1885
1886 /* 1 means zero (many) matches is allowed. */
1887 char zero_times_ok = 0, many_times_ok = 0;
1888
1889 /* If there is a sequence of repetition chars, collapse it
1890 down to just one (the right one). We can't combine
1891 interval operators with these because of, e.g., `a{2}*',
1892 which should only match an even number of `a's. */
1893
1894 for (;;)
1895 {
1896 zero_times_ok |= c != '+';
1897 many_times_ok |= c != '?';
1898
1899 if (p == pend)
1900 break;
1901
1902 PATFETCH (c);
1903
1904 if (c == '*'
1905 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
1906 ;
1907
1908 else if (syntax & RE_BK_PLUS_QM && c == '\\')
1909 {
1910 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
1911
1912 PATFETCH (c1);
1913 if (!(c1 == '+' || c1 == '?'))
1914 {
1915 PATUNFETCH;
1916 PATUNFETCH;
1917 break;
1918 }
1919
1920 c = c1;
1921 }
1922 else
1923 {
1924 PATUNFETCH;
1925 break;
1926 }
1927
1928 /* If we get here, we found another repeat character. */
1929 }
1930
1931 /* Star, etc. applied to an empty pattern is equivalent
1932 to an empty pattern. */
1933 if (!laststart)
1934 break;
1935
1936 /* Now we know whether or not zero matches is allowed
1937 and also whether or not two or more matches is allowed. */
1938 if (many_times_ok)
1939 { /* More than one repetition is allowed, so put in at the
1940 end a backward relative jump from `b' to before the next
1941 jump we're going to put in below (which jumps from
1942 laststart to after this jump).
1943
1944 But if we are at the `*' in the exact sequence `.*\n',
1945 insert an unconditional jump backwards to the .,
1946 instead of the beginning of the loop. This way we only
1947 push a failure point once, instead of every time
1948 through the loop. */
1949 assert (p - 1 > pattern);
1950
1951 /* Allocate the space for the jump. */
1952 GET_BUFFER_SPACE (3);
1953
1954 /* We know we are not at the first character of the pattern,
1955 because laststart was nonzero. And we've already
1956 incremented `p', by the way, to be the character after
1957 the `*'. Do we have to do something analogous here
1958 for null bytes, because of RE_DOT_NOT_NULL? */
1959 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
1960 && zero_times_ok
1961 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
1962 && !(syntax & RE_DOT_NEWLINE))
1963 { /* We have .*\n. */
1964 STORE_JUMP (jump, b, laststart);
1965 keep_string_p = true;
1966 }
1967 else
1968 /* Anything else. */
1969 STORE_JUMP (maybe_pop_jump, b, laststart - 3);
1970
1971 /* We've added more stuff to the buffer. */
1972 b += 3;
1973 }
1974
1975 /* On failure, jump from laststart to b + 3, which will be the
1976 end of the buffer after this jump is inserted. */
1977 GET_BUFFER_SPACE (3);
1978 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
1979 : on_failure_jump,
1980 laststart, b + 3);
1981 pending_exact = 0;
1982 b += 3;
1983
1984 if (!zero_times_ok)
1985 {
1986 /* At least one repetition is required, so insert a
1987 `dummy_failure_jump' before the initial
1988 `on_failure_jump' instruction of the loop. This
1989 effects a skip over that instruction the first time
1990 we hit that loop. */
1991 GET_BUFFER_SPACE (3);
1992 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
1993 b += 3;
1994 }
1995 }
1996 break;
1997
1998
1999 case '.':
2000 laststart = b;
2001 BUF_PUSH (anychar);
2002 break;
2003
2004
2005 case '[':
2006 {
2007 /* XEmacs change: this whole section */
2008 boolean had_char_class = false;
2009
2010 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2011
2012 /* Ensure that we have enough space to push a charset: the
2013 opcode, the length count, and the bitset; 34 bytes in all. */
2014 GET_BUFFER_SPACE (34);
2015
2016 laststart = b;
2017
2018 /* We test `*p == '^' twice, instead of using an if
2019 statement, so we only need one BUF_PUSH. */
2020 BUF_PUSH (*p == '^' ? charset_not : charset);
2021 if (*p == '^')
2022 p++;
2023
2024 /* Remember the first position in the bracket expression. */
2025 p1 = p;
2026
2027 /* Push the number of bytes in the bitmap. */
2028 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2029
2030 /* Clear the whole map. */
2031 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
2032
2033 /* charset_not matches newline according to a syntax bit. */
2034 if ((re_opcode_t) b[-2] == charset_not
2035 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2036 SET_LIST_BIT ('\n');
2037
2038 /* Read in characters and ranges, setting map bits. */
2039 for (;;)
2040 {
2041 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2042
2043 PATFETCH_EITHER (c);
2044
2045 /* \ might escape characters inside [...] and [^...]. */
2046 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2047 {
2048 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2049
2050 PATFETCH_EITHER (c1);
2051 SET_EITHER_BIT (c1);
2052 continue;
2053 }
2054
2055 /* Could be the end of the bracket expression. If it's
2056 not (i.e., when the bracket expression is `[]' so
2057 far), the ']' character bit gets set way below. */
2058 if (c == ']' && p != p1 + 1)
2059 break;
2060
2061 /* Look ahead to see if it's a range when the last thing
2062 was a character class. */
2063 if (had_char_class && c == '-' && *p != ']')
2064 FREE_STACK_RETURN (REG_ERANGE);
2065
2066 /* Look ahead to see if it's a range when the last thing
2067 was a character: if this is a hyphen not at the
2068 beginning or the end of a list, then it's the range
2069 operator. */
2070 if (c == '-'
2071 && !(p - 2 >= pattern && p[-2] == '[')
2072 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2073 && *p != ']')
2074 {
2075 reg_errcode_t ret;
2076
2077 ret = compile_range (&p, pend, translate, syntax, b);
2078 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2079 }
2080
2081 else if (p[0] == '-' && p[1] != ']')
2082 { /* This handles ranges made up of characters only. */
2083 reg_errcode_t ret;
2084
2085 /* Move past the `-'. */
2086 PATFETCH (c1);
2087
2088 ret = compile_range (&p, pend, translate, syntax, b);
2089 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2090 }
2091
2092 /* See if we're at the beginning of a possible character
2093 class. */
2094
2095 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2096 { /* Leave room for the null. */
2097 char str[CHAR_CLASS_MAX_LENGTH + 1];
2098
2099 PATFETCH (c);
2100 c1 = 0;
2101
2102 /* If pattern is `[[:'. */
2103 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2104
2105 for (;;)
2106 {
2107 /* Do not do PATFETCH_EITHER() here. We want
2108 to just see if the bytes match particular
2109 strings, and we put them all back if not.
2110
2111 #### May need to be changed once trt tables
2112 are working. */
2113 PATFETCH (c);
2114 if (c == ':' || c == ']' || p == pend
2115 || c1 == CHAR_CLASS_MAX_LENGTH)
2116 break;
2117 str[c1++] = c;
2118 }
2119 str[c1] = '\0';
2120
2121 /* If isn't a word bracketed by `[:' and:`]':
2122 undo the ending character, the letters, and leave
2123 the leading `:' and `[' (but set bits for them). */
2124 if (c == ':' && *p == ']')
2125 {
2126 int ch;
2127 boolean is_alnum = STREQ (str, "alnum");
2128 boolean is_alpha = STREQ (str, "alpha");
2129 boolean is_blank = STREQ (str, "blank");
2130 boolean is_cntrl = STREQ (str, "cntrl");
2131 boolean is_digit = STREQ (str, "digit");
2132 boolean is_graph = STREQ (str, "graph");
2133 boolean is_lower = STREQ (str, "lower");
2134 boolean is_print = STREQ (str, "print");
2135 boolean is_punct = STREQ (str, "punct");
2136 boolean is_space = STREQ (str, "space");
2137 boolean is_upper = STREQ (str, "upper");
2138 boolean is_xdigit = STREQ (str, "xdigit");
2139
2140 if (!IS_CHAR_CLASS (str))
2141 FREE_STACK_RETURN (REG_ECTYPE);
2142
2143 /* Throw away the ] at the end of the character
2144 class. */
2145 PATFETCH (c);
2146
2147 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2148
2149 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2150 {
2151 /* This was split into 3 if's to
2152 avoid an arbitrary limit in some compiler. */
2153 if ( (is_alnum && ISALNUM (ch))
2154 || (is_alpha && ISALPHA (ch))
2155 || (is_blank && ISBLANK (ch))
2156 || (is_cntrl && ISCNTRL (ch)))
2157 SET_EITHER_BIT (ch);
2158 if ( (is_digit && ISDIGIT (ch))
2159 || (is_graph && ISGRAPH (ch))
2160 || (is_lower && ISLOWER (ch))
2161 || (is_print && ISPRINT (ch)))
2162 SET_EITHER_BIT (ch);
2163 if ( (is_punct && ISPUNCT (ch))
2164 || (is_space && ISSPACE (ch))
2165 || (is_upper && ISUPPER (ch))
2166 || (is_xdigit && ISXDIGIT (ch)))
2167 SET_EITHER_BIT (ch);
2168 }
2169 had_char_class = true;
2170 }
2171 else
2172 {
2173 c1++;
2174 while (c1--)
2175 PATUNFETCH;
2176 SET_EITHER_BIT ('[');
2177 SET_EITHER_BIT (':');
2178 had_char_class = false;
2179 }
2180 }
2181 else
2182 {
2183 had_char_class = false;
2184 SET_EITHER_BIT (c);
2185 }
2186 }
2187
2188 /* Discard any (non)matching list bytes that are all 0 at the
2189 end of the map. Decrease the map-length byte too. */
2190 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2191 b[-1]--;
2192 b += b[-1];
2193 }
2194 break;
2195
2196
2197 case '(':
2198 if (syntax & RE_NO_BK_PARENS)
2199 goto handle_open;
2200 else
2201 goto normal_char;
2202
2203
2204 case ')':
2205 if (syntax & RE_NO_BK_PARENS)
2206 goto handle_close;
2207 else
2208 goto normal_char;
2209
2210
2211 case '\n':
2212 if (syntax & RE_NEWLINE_ALT)
2213 goto handle_alt;
2214 else
2215 goto normal_char;
2216
2217
2218 case '|':
2219 if (syntax & RE_NO_BK_VBAR)
2220 goto handle_alt;
2221 else
2222 goto normal_char;
2223
2224
2225 case '{':
2226 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2227 goto handle_interval;
2228 else
2229 goto normal_char;
2230
2231
2232 case '\\':
2233 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2234
2235 /* Do not translate the character after the \, so that we can
2236 distinguish, e.g., \B from \b, even if we normally would
2237 translate, e.g., B to b. */
2238 PATFETCH_RAW (c);
2239
2240 switch (c)
2241 {
2242 case '(':
2243 if (syntax & RE_NO_BK_PARENS)
2244 goto normal_backslash;
2245
2246 handle_open:
2247 bufp->re_nsub++;
2248 regnum++;
2249
2250 if (COMPILE_STACK_FULL)
2251 {
2252 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2253 compile_stack_elt_t);
2254 if (compile_stack.stack == NULL) return REG_ESPACE;
2255
2256 compile_stack.size <<= 1;
2257 }
2258
2259 /* These are the values to restore when we hit end of this
2260 group. They are all relative offsets, so that if the
2261 whole pattern moves because of realloc, they will still
2262 be valid. */
2263 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2264 COMPILE_STACK_TOP.fixup_alt_jump
2265 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2266 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
2267 COMPILE_STACK_TOP.regnum = regnum;
2268
2269 /* We will eventually replace the 0 with the number of
2270 groups inner to this one. But do not push a
2271 start_memory for groups beyond the last one we can
2272 represent in the compiled pattern. */
2273 if (regnum <= MAX_REGNUM)
2274 {
2275 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
2276 BUF_PUSH_3 (start_memory, regnum, 0);
2277 }
2278
2279 compile_stack.avail++;
2280
2281 fixup_alt_jump = 0;
2282 laststart = 0;
2283 begalt = b;
2284 /* If we've reached MAX_REGNUM groups, then this open
2285 won't actually generate any code, so we'll have to
2286 clear pending_exact explicitly. */
2287 pending_exact = 0;
2288 break;
2289
2290
2291 case ')':
2292 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2293
2294 if (COMPILE_STACK_EMPTY)
2295 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2296 goto normal_backslash;
2297 else
2298 FREE_STACK_RETURN (REG_ERPAREN);
2299
2300 handle_close:
2301 if (fixup_alt_jump)
2302 { /* Push a dummy failure point at the end of the
2303 alternative for a possible future
2304 `pop_failure_jump' to pop. See comments at
2305 `push_dummy_failure' in `re_match_2'. */
2306 BUF_PUSH (push_dummy_failure);
2307
2308 /* We allocated space for this jump when we assigned
2309 to `fixup_alt_jump', in the `handle_alt' case below. */
2310 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
2311 }
2312
2313 /* See similar code for backslashed left paren above. */
2314 if (COMPILE_STACK_EMPTY)
2315 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2316 goto normal_char;
2317 else
2318 FREE_STACK_RETURN (REG_ERPAREN);
2319
2320 /* Since we just checked for an empty stack above, this
2321 ``can't happen''. */
2322 assert (compile_stack.avail != 0);
2323 {
2324 /* We don't just want to restore into `regnum', because
2325 later groups should continue to be numbered higher,
2326 as in `(ab)c(de)' -- the second group is #2. */
2327 regnum_t this_group_regnum;
2328
2329 compile_stack.avail--;
2330 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2331 fixup_alt_jump
2332 = COMPILE_STACK_TOP.fixup_alt_jump
2333 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2334 : 0;
2335 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2336 this_group_regnum = COMPILE_STACK_TOP.regnum;
2337 /* If we've reached MAX_REGNUM groups, then this open
2338 won't actually generate any code, so we'll have to
2339 clear pending_exact explicitly. */
2340 pending_exact = 0;
2341
2342 /* We're at the end of the group, so now we know how many
2343 groups were inside this one. */
2344 if (this_group_regnum <= MAX_REGNUM)
2345 {
2346 unsigned char *inner_group_loc
2347 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
2348
2349 *inner_group_loc = regnum - this_group_regnum;
2350 BUF_PUSH_3 (stop_memory, this_group_regnum,
2351 regnum - this_group_regnum);
2352 }
2353 }
2354 break;
2355
2356
2357 case '|': /* `\|'. */
2358 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2359 goto normal_backslash;
2360 handle_alt:
2361 if (syntax & RE_LIMITED_OPS)
2362 goto normal_char;
2363
2364 /* Insert before the previous alternative a jump which
2365 jumps to this alternative if the former fails. */
2366 GET_BUFFER_SPACE (3);
2367 INSERT_JUMP (on_failure_jump, begalt, b + 6);
2368 pending_exact = 0;
2369 b += 3;
2370
2371 /* The alternative before this one has a jump after it
2372 which gets executed if it gets matched. Adjust that
2373 jump so it will jump to this alternative's analogous
2374 jump (put in below, which in turn will jump to the next
2375 (if any) alternative's such jump, etc.). The last such
2376 jump jumps to the correct final destination. A picture:
2377 _____ _____
2378 | | | |
2379 | v | v
2380 a | b | c
2381
2382 If we are at `b', then fixup_alt_jump right now points to a
2383 three-byte space after `a'. We'll put in the jump, set
2384 fixup_alt_jump to right after `b', and leave behind three
2385 bytes which we'll fill in when we get to after `c'. */
2386
2387 if (fixup_alt_jump)
2388 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2389
2390 /* Mark and leave space for a jump after this alternative,
2391 to be filled in later either by next alternative or
2392 when know we're at the end of a series of alternatives. */
2393 fixup_alt_jump = b;
2394 GET_BUFFER_SPACE (3);
2395 b += 3;
2396
2397 laststart = 0;
2398 begalt = b;
2399 break;
2400
2401
2402 case '{':
2403 /* If \{ is a literal. */
2404 if (!(syntax & RE_INTERVALS)
2405 /* If we're at `\{' and it's not the open-interval
2406 operator. */
2407 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2408 || (p - 2 == pattern && p == pend))
2409 goto normal_backslash;
2410
2411 handle_interval:
2412 {
2413 /* If got here, then the syntax allows intervals. */
2414
2415 /* At least (most) this many matches must be made. */
2416 int lower_bound = -1, upper_bound = -1;
2417
2418 beg_interval = p - 1;
2419
2420 if (p == pend)
2421 {
2422 if (syntax & RE_NO_BK_BRACES)
2423 goto unfetch_interval;
2424 else
2425 FREE_STACK_RETURN (REG_EBRACE);
2426 }
2427
2428 GET_UNSIGNED_NUMBER (lower_bound);
2429
2430 if (c == ',')
2431 {
2432 GET_UNSIGNED_NUMBER (upper_bound);
2433 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
2434 }
2435 else
2436 /* Interval such as `{1}' => match exactly once. */
2437 upper_bound = lower_bound;
2438
2439 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
2440 || lower_bound > upper_bound)
2441 {
2442 if (syntax & RE_NO_BK_BRACES)
2443 goto unfetch_interval;
2444 else
2445 FREE_STACK_RETURN (REG_BADBR);
2446 }
2447
2448 if (!(syntax & RE_NO_BK_BRACES))
2449 {
2450 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
2451
2452 PATFETCH (c);
2453 }
2454
2455 if (c != '}')
2456 {
2457 if (syntax & RE_NO_BK_BRACES)
2458 goto unfetch_interval;
2459 else
2460 FREE_STACK_RETURN (REG_BADBR);
2461 }
2462
2463 /* We just parsed a valid interval. */
2464
2465 /* If it's invalid to have no preceding re. */
2466 if (!laststart)
2467 {
2468 if (syntax & RE_CONTEXT_INVALID_OPS)
2469 FREE_STACK_RETURN (REG_BADRPT);
2470 else if (syntax & RE_CONTEXT_INDEP_OPS)
2471 laststart = b;
2472 else
2473 goto unfetch_interval;
2474 }
2475
2476 /* If the upper bound is zero, don't want to succeed at
2477 all; jump from `laststart' to `b + 3', which will be
2478 the end of the buffer after we insert the jump. */
2479 if (upper_bound == 0)
2480 {
2481 GET_BUFFER_SPACE (3);
2482 INSERT_JUMP (jump, laststart, b + 3);
2483 b += 3;
2484 }
2485
2486 /* Otherwise, we have a nontrivial interval. When
2487 we're all done, the pattern will look like:
2488 set_number_at <jump count> <upper bound>
2489 set_number_at <succeed_n count> <lower bound>
2490 succeed_n <after jump addr> <succeed_n count>
2491 <body of loop>
2492 jump_n <succeed_n addr> <jump count>
2493 (The upper bound and `jump_n' are omitted if
2494 `upper_bound' is 1, though.) */
2495 else
2496 { /* If the upper bound is > 1, we need to insert
2497 more at the end of the loop. */
2498 unsigned nbytes = 10 + (upper_bound > 1) * 10;
2499
2500 GET_BUFFER_SPACE (nbytes);
2501
2502 /* Initialize lower bound of the `succeed_n', even
2503 though it will be set during matching by its
2504 attendant `set_number_at' (inserted next),
2505 because `re_compile_fastmap' needs to know.
2506 Jump to the `jump_n' we might insert below. */
2507 INSERT_JUMP2 (succeed_n, laststart,
2508 b + 5 + (upper_bound > 1) * 5,
2509 lower_bound);
2510 b += 5;
2511
2512 /* Code to initialize the lower bound. Insert
2513 before the `succeed_n'. The `5' is the last two
2514 bytes of this `set_number_at', plus 3 bytes of
2515 the following `succeed_n'. */
2516 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
2517 b += 5;
2518
2519 if (upper_bound > 1)
2520 { /* More than one repetition is allowed, so
2521 append a backward jump to the `succeed_n'
2522 that starts this interval.
2523
2524 When we've reached this during matching,
2525 we'll have matched the interval once, so
2526 jump back only `upper_bound - 1' times. */
2527 STORE_JUMP2 (jump_n, b, laststart + 5,
2528 upper_bound - 1);
2529 b += 5;
2530
2531 /* The location we want to set is the second
2532 parameter of the `jump_n'; that is `b-2' as
2533 an absolute address. `laststart' will be
2534 the `set_number_at' we're about to insert;
2535 `laststart+3' the number to set, the source
2536 for the relative address. But we are
2537 inserting into the middle of the pattern --
2538 so everything is getting moved up by 5.
2539 Conclusion: (b - 2) - (laststart + 3) + 5,
2540 i.e., b - laststart.
2541
2542 We insert this at the beginning of the loop
2543 so that if we fail during matching, we'll
2544 reinitialize the bounds. */
2545 insert_op2 (set_number_at, laststart, b - laststart,
2546 upper_bound - 1, b);
2547 b += 5;
2548 }
2549 }
2550 pending_exact = 0;
2551 beg_interval = NULL;
2552 }
2553 break;
2554
2555 unfetch_interval:
2556 /* If an invalid interval, match the characters as literals. */
2557 assert (beg_interval);
2558 p = beg_interval;
2559 beg_interval = NULL;
2560
2561 /* normal_char and normal_backslash need `c'. */
2562 PATFETCH (c);
2563
2564 if (!(syntax & RE_NO_BK_BRACES))
2565 {
2566 if (p > pattern && p[-1] == '\\')
2567 goto normal_backslash;
2568 }
2569 goto normal_char;
2570
2571 #ifdef emacs
2572 /* There is no way to specify the before_dot and after_dot
2573 operators. rms says this is ok. --karl */
2574 case '=':
2575 BUF_PUSH (at_dot);
2576 break;
2577
2578 case 's':
2579 laststart = b;
2580 PATFETCH (c);
2581 /* XEmacs addition */
2582 if (c >= 0x80 || syntax_spec_code[c] == 0377)
2583 FREE_STACK_RETURN (REG_ESYNTAX);
2584 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2585 break;
2586
2587 case 'S':
2588 laststart = b;
2589 PATFETCH (c);
2590 /* XEmacs addition */
2591 if (c >= 0x80 || syntax_spec_code[c] == 0377)
2592 FREE_STACK_RETURN (REG_ESYNTAX);
2593 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2594 break;
2595 #endif /* emacs */
2596
2597
2598 case 'w':
2599 laststart = b;
2600 BUF_PUSH (wordchar);
2601 break;
2602
2603
2604 case 'W':
2605 laststart = b;
2606 BUF_PUSH (notwordchar);
2607 break;
2608
2609
2610 case '<':
2611 BUF_PUSH (wordbeg);
2612 break;
2613
2614 case '>':
2615 BUF_PUSH (wordend);
2616 break;
2617
2618 case 'b':
2619 BUF_PUSH (wordbound);
2620 break;
2621
2622 case 'B':
2623 BUF_PUSH (notwordbound);
2624 break;
2625
2626 case '`':
2627 BUF_PUSH (begbuf);
2628 break;
2629
2630 case '\'':
2631 BUF_PUSH (endbuf);
2632 break;
2633
2634 case '1': case '2': case '3': case '4': case '5':
2635 case '6': case '7': case '8': case '9':
2636 if (syntax & RE_NO_BK_REFS)
2637 goto normal_char;
2638
2639 c1 = c - '0';
2640
2641 if (c1 > regnum)
2642 FREE_STACK_RETURN (REG_ESUBREG);
2643
2644 /* Can't back reference to a subexpression if inside of it. */
2645 if (group_in_compile_stack (compile_stack, c1))
2646 goto normal_char;
2647
2648 laststart = b;
2649 BUF_PUSH_2 (duplicate, c1);
2650 break;
2651
2652
2653 case '+':
2654 case '?':
2655 if (syntax & RE_BK_PLUS_QM)
2656 goto handle_plus;
2657 else
2658 goto normal_backslash;
2659
2660 default:
2661 normal_backslash:
2662 /* You might think it would be useful for \ to mean
2663 not to translate; but if we don't translate it
2664 it will never match anything. */
2665 c = TRANSLATE (c);
2666 goto normal_char;
2667 }
2668 break;
2669
2670
2671 default:
2672 /* Expects the character in `c'. */
2673 /* `p' points to the location after where `c' came from. */
2674 normal_char:
2675 {
2676 /* XEmacs: modifications here for Mule. */
2677 /* `q' points to the beginning of the next char. */
2678 CONST char *q = p - 1;
2679 INC_CHARPTR (q);
2680
2681 /* If no exactn currently being built. */
2682 if (!pending_exact
2683
2684 /* If last exactn not at current position. */
2685 || pending_exact + *pending_exact + 1 != b
2686
2687 /* We have only one byte following the exactn for the count. */
2688 || ((unsigned int) (*pending_exact + (q - p)) >=
2689 ((unsigned int) (1 << BYTEWIDTH) - 1))
2690
2691 /* If followed by a repetition operator. */
2692 || *q == '*' || *q == '^'
2693 || ((syntax & RE_BK_PLUS_QM)
2694 ? *q == '\\' && (q[1] == '+' || q[1] == '?')
2695 : (*q == '+' || *q == '?'))
2696 || ((syntax & RE_INTERVALS)
2697 && ((syntax & RE_NO_BK_BRACES)
2698 ? *q == '{'
2699 : (q[0] == '\\' && q[1] == '{'))))
2700 {
2701 /* Start building a new exactn. */
2702
2703 laststart = b;
2704
2705 BUF_PUSH_2 (exactn, 0);
2706 pending_exact = b - 1;
2707 }
2708
2709 BUF_PUSH (c);
2710 (*pending_exact)++;
2711
2712 while (p < q)
2713 {
2714 PATFETCH (c);
2715 BUF_PUSH (c);
2716 (*pending_exact)++;
2717 }
2718 break;
2719 }
2720 } /* switch (c) */
2721 } /* while p != pend */
2722
2723
2724 /* Through the pattern now. */
2725
2726 if (fixup_alt_jump)
2727 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2728
2729 if (!COMPILE_STACK_EMPTY)
2730 FREE_STACK_RETURN (REG_EPAREN);
2731
2732 /* If we don't want backtracking, force success
2733 the first time we reach the end of the compiled pattern. */
2734 if (syntax & RE_NO_POSIX_BACKTRACKING)
2735 BUF_PUSH (succeed);
2736
2737 free (compile_stack.stack);
2738
2739 /* We have succeeded; set the length of the buffer. */
2740 bufp->used = b - bufp->buffer;
2741
2742 #ifdef DEBUG
2743 if (debug)
2744 {
2745 DEBUG_PRINT1 ("\nCompiled pattern: \n");
2746 print_compiled_pattern (bufp);
2747 }
2748 #endif /* DEBUG */
2749
2750 #ifndef MATCH_MAY_ALLOCATE
2751 /* Initialize the failure stack to the largest possible stack. This
2752 isn't necessary unless we're trying to avoid calling alloca in
2753 the search and match routines. */
2754 {
2755 int num_regs = bufp->re_nsub + 1;
2756
2757 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
2758 is strictly greater than re_max_failures, the largest possible stack
2759 is 2 * re_max_failures failure points. */
2760 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
2761 {
2762 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
2763
2764 #ifdef emacs
2765 if (! fail_stack.stack)
2766 fail_stack.stack
2767 = (fail_stack_elt_t *) xmalloc (fail_stack.size
2768 * sizeof (fail_stack_elt_t));
2769 else
2770 fail_stack.stack
2771 = (fail_stack_elt_t *) xrealloc (fail_stack.stack,
2772 (fail_stack.size
2773 * sizeof (fail_stack_elt_t)));
2774 #else /* not emacs */
2775 if (! fail_stack.stack)
2776 fail_stack.stack
2777 = (fail_stack_elt_t *) malloc (fail_stack.size
2778 * sizeof (fail_stack_elt_t));
2779 else
2780 fail_stack.stack
2781 = (fail_stack_elt_t *) realloc (fail_stack.stack,
2782 (fail_stack.size
2783 * sizeof (fail_stack_elt_t)));
2784 #endif /* not emacs */
2785 }
2786
2787 regex_grow_registers (num_regs);
2788 }
2789 #endif /* not MATCH_MAY_ALLOCATE */
2790
2791 return REG_NOERROR;
2792 } /* regex_compile */
2793
2794 /* Subroutines for `regex_compile'. */
2795
2796 /* Store OP at LOC followed by two-byte integer parameter ARG. */
2797
2798 static void
2799 store_op1 (re_opcode_t op, unsigned char *loc, int arg)
2800 {
2801 *loc = (unsigned char) op;
2802 STORE_NUMBER (loc + 1, arg);
2803 }
2804
2805
2806 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
2807
2808 static void
2809 store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
2810 {
2811 *loc = (unsigned char) op;
2812 STORE_NUMBER (loc + 1, arg1);
2813 STORE_NUMBER (loc + 3, arg2);
2814 }
2815
2816
2817 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
2818 for OP followed by two-byte integer parameter ARG. */
2819
2820 static void
2821 insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
2822 {
2823 register unsigned char *pfrom = end;
2824 register unsigned char *pto = end + 3;
2825
2826 while (pfrom != loc)
2827 *--pto = *--pfrom;
2828
2829 store_op1 (op, loc, arg);
2830 }
2831
2832
2833 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
2834
2835 static void
2836 insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2,
2837 unsigned char *end)
2838 {
2839 register unsigned char *pfrom = end;
2840 register unsigned char *pto = end + 5;
2841
2842 while (pfrom != loc)
2843 *--pto = *--pfrom;
2844
2845 store_op2 (op, loc, arg1, arg2);
2846 }
2847
2848
2849 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
2850 after an alternative or a begin-subexpression. We assume there is at
2851 least one character before the ^. */
2852
2853 static boolean
2854 at_begline_loc_p (CONST char *pattern, CONST char *p, reg_syntax_t syntax)
2855 {
2856 CONST char *prev = p - 2;
2857 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
2858
2859 return
2860 /* After a subexpression? */
2861 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
2862 /* After an alternative? */
2863 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
2864 }
2865
2866
2867 /* The dual of at_begline_loc_p. This one is for $. We assume there is
2868 at least one character after the $, i.e., `P < PEND'. */
2869
2870 static boolean
2871 at_endline_loc_p (CONST char *p, CONST char *pend, int syntax)
2872 {
2873 CONST char *next = p;
2874 boolean next_backslash = *next == '\\';
2875 CONST char *next_next = p + 1 < pend ? p + 1 : 0;
2876
2877 return
2878 /* Before a subexpression? */
2879 (syntax & RE_NO_BK_PARENS ? *next == ')'
2880 : next_backslash && next_next && *next_next == ')')
2881 /* Before an alternative? */
2882 || (syntax & RE_NO_BK_VBAR ? *next == '|'
2883 : next_backslash && next_next && *next_next == '|');
2884 }
2885
2886
2887 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
2888 false if it's not. */
2889
2890 static boolean
2891 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
2892 {
2893 int this_element;
2894
2895 for (this_element = compile_stack.avail - 1;
2896 this_element >= 0;
2897 this_element--)
2898 if (compile_stack.stack[this_element].regnum == regnum)
2899 return true;
2900
2901 return false;
2902 }
2903
2904
2905 /* Read the ending character of a range (in a bracket expression) from the
2906 uncompiled pattern *P_PTR (which ends at PEND). We assume the
2907 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
2908 Then we set the translation of all bits between the starting and
2909 ending characters (inclusive) in the compiled pattern B.
2910
2911 Return an error code.
2912
2913 We use these short variable names so we can use the same macros as
2914 `regex_compile' itself. */
2915
2916 static reg_errcode_t
2917 compile_range (CONST char **p_ptr, CONST char *pend, char *translate,
2918 reg_syntax_t syntax, unsigned char *b)
2919 {
2920 unsigned this_char;
2921
2922 CONST char *p = *p_ptr;
2923 int range_start, range_end;
2924
2925 if (p == pend)
2926 return REG_ERANGE;
2927
2928 /* Even though the pattern is a signed `char *', we need to fetch
2929 with unsigned char *'s; if the high bit of the pattern character
2930 is set, the range endpoints will be negative if we fetch using a
2931 signed char *.
2932
2933 We also want to fetch the endpoints without translating them; the
2934 appropriate translation is done in the bit-setting loop below. */
2935 /* The SVR4 compiler on the 3B2 had trouble with unsigned CONST char *. */
2936 range_start = ((CONST unsigned char *) p)[-2];
2937 range_end = ((CONST unsigned char *) p)[0];
2938
2939 /* Have to increment the pointer into the pattern string, so the
2940 caller isn't still at the ending character. */
2941 (*p_ptr)++;
2942
2943 /* If the start is after the end, the range is empty. */
2944 if (range_start > range_end)
2945 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
2946
2947 /* Here we see why `this_char' has to be larger than an `unsigned
2948 char' -- the range is inclusive, so if `range_end' == 0xff
2949 (assuming 8-bit characters), we would otherwise go into an infinite
2950 loop, since all characters <= 0xff. */
2951 for (this_char = range_start; this_char <= range_end; this_char++)
2952 {
2953 SET_LIST_BIT (TRANSLATE (this_char));
2954 }
2955
2956 return REG_NOERROR;
2957 }
2958
2959 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
2960 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
2961 characters can start a string that matches the pattern. This fastmap
2962 is used by re_search to skip quickly over impossible starting points.
2963
2964 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
2965 area as BUFP->fastmap.
2966
2967 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
2968 the pattern buffer.
2969
2970 Returns 0 if we succeed, -2 if an internal error. */
2971
2972 int
2973 re_compile_fastmap (struct re_pattern_buffer *bufp)
2974 {
2975 int j, k;
2976 #ifdef MATCH_MAY_ALLOCATE
2977 fail_stack_type fail_stack;
2978 #endif
2979 DECLARE_DESTINATION
2980 /* We don't push any register information onto the failure stack. */
2981 unsigned num_regs = 0;
2982
2983 register char *fastmap = bufp->fastmap;
2984 unsigned char *pattern = bufp->buffer;
2985 unsigned long size = bufp->used;
2986 unsigned char *p = pattern;
2987 register unsigned char *pend = pattern + size;
2988
2989 #ifdef REL_ALLOC
2990 /* This holds the pointer to the failure stack, when
2991 it is allocated relocatably. */
2992 fail_stack_elt_t *failure_stack_ptr;
2993 #endif
2994
2995 /* Assume that each path through the pattern can be null until
2996 proven otherwise. We set this false at the bottom of switch
2997 statement, to which we get only if a particular path doesn't
2998 match the empty string. */
2999 boolean path_can_be_null = true;
3000
3001 /* We aren't doing a `succeed_n' to begin with. */
3002 boolean succeed_n_p = false;
3003
3004 assert (fastmap != NULL && p != NULL);
3005
3006 INIT_FAIL_STACK ();
3007 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3008 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3009 bufp->can_be_null = 0;
3010
3011 while (1)
3012 {
3013 if (p == pend || *p == succeed)
3014 {
3015 /* We have reached the (effective) end of pattern. */
3016 if (!FAIL_STACK_EMPTY ())
3017 {
3018 bufp->can_be_null |= path_can_be_null;
3019
3020 /* Reset for next path. */
3021 path_can_be_null = true;
3022
3023 p = fail_stack.stack[--fail_stack.avail].pointer;
3024
3025 continue;
3026 }
3027 else
3028 break;
3029 }
3030
3031 /* We should never be about to go beyond the end of the pattern. */
3032 assert (p < pend);
3033
3034 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3035 {
3036
3037 /* I guess the idea here is to simply not bother with a fastmap
3038 if a backreference is used, since it's too hard to figure out
3039 the fastmap for the corresponding group. Setting
3040 `can_be_null' stops `re_search_2' from using the fastmap, so
3041 that is all we do. */
3042 case duplicate:
3043 bufp->can_be_null = 1;
3044 goto done;
3045
3046
3047 /* Following are the cases which match a character. These end
3048 with `break'. */
3049
3050 case exactn:
3051 fastmap[p[1]] = 1;
3052 break;
3053
3054
3055 case charset:
3056 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3057 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3058 fastmap[j] = 1;
3059 break;
3060
3061
3062 case charset_not:
3063 /* Chars beyond end of map must be allowed. */
3064 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3065 fastmap[j] = 1;
3066
3067 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3068 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3069 fastmap[j] = 1;
3070 break;
3071
3072
3073 case wordchar:
3074 #ifdef emacs
3075 k = (int) Sword;
3076 goto matchsyntax;
3077 #else
3078 for (j = 0; j < (1 << BYTEWIDTH); j++)
3079 if (SYNTAX_UNSAFE (regex_emacs_buffer->syntax_table, j) == Sword)
3080 fastmap[j] = 1;
3081 break;
3082 #endif
3083
3084
3085 case notwordchar:
3086 #ifdef emacs
3087 k = (int) Sword;
3088 goto matchnotsyntax;
3089 #else
3090 for (j = 0; j < (1 << BYTEWIDTH); j++)
3091 if (SYNTAX_UNSAFE (regex_emacs_buffer->syntax_table, j) != Sword)
3092 fastmap[j] = 1;
3093 break;
3094 #endif
3095
3096
3097 case anychar:
3098 {
3099 int fastmap_newline = fastmap['\n'];
3100
3101 /* `.' matches anything ... */
3102 for (j = 0; j < (1 << BYTEWIDTH); j++)
3103 fastmap[j] = 1;
3104
3105 /* ... except perhaps newline. */
3106 if (!(bufp->syntax & RE_DOT_NEWLINE))
3107 fastmap['\n'] = fastmap_newline;
3108
3109 /* Return if we have already set `can_be_null'; if we have,
3110 then the fastmap is irrelevant. Something's wrong here. */
3111 else if (bufp->can_be_null)
3112 goto done;
3113
3114 /* Otherwise, have to check alternative paths. */
3115 break;
3116 }
3117
3118 #ifdef emacs
3119 case syntaxspec:
3120 k = *p++;
3121 matchsyntax:
3122 for (j = 0; j < (1 << BYTEWIDTH); j++)
3123 if (SYNTAX_UNSAFE (regex_emacs_buffer->syntax_table, j) ==
3124 (enum syntaxcode) k)
3125 fastmap[j] = 1;
3126 break;
3127
3128
3129 case notsyntaxspec:
3130 k = *p++;
3131 matchnotsyntax:
3132 for (j = 0; j < (1 << BYTEWIDTH); j++)
3133 if (SYNTAX_UNSAFE (regex_emacs_buffer->syntax_table, j) !=
3134 (enum syntaxcode) k)
3135 fastmap[j] = 1;
3136 break;
3137
3138
3139 /* All cases after this match the empty string. These end with
3140 `continue'. */
3141
3142
3143 case before_dot:
3144 case at_dot:
3145 case after_dot:
3146 continue;
3147 #endif /* not emacs */
3148
3149
3150 case no_op:
3151 case begline:
3152 case endline:
3153 case begbuf:
3154 case endbuf:
3155 case wordbound:
3156 case notwordbound:
3157 case wordbeg:
3158 case wordend:
3159 case push_dummy_failure:
3160 continue;
3161
3162
3163 case jump_n:
3164 case pop_failure_jump:
3165 case maybe_pop_jump:
3166 case jump:
3167 case jump_past_alt:
3168 case dummy_failure_jump:
3169 EXTRACT_NUMBER_AND_INCR (j, p);
3170 p += j;
3171 if (j > 0)
3172 continue;
3173
3174 /* Jump backward implies we just went through the body of a
3175 loop and matched nothing. Opcode jumped to should be
3176 `on_failure_jump' or `succeed_n'. Just treat it like an
3177 ordinary jump. For a * loop, it has pushed its failure
3178 point already; if so, discard that as redundant. */
3179 if ((re_opcode_t) *p != on_failure_jump
3180 && (re_opcode_t) *p != succeed_n)
3181 continue;
3182
3183 p++;
3184 EXTRACT_NUMBER_AND_INCR (j, p);
3185 p += j;
3186
3187 /* If what's on the stack is where we are now, pop it. */
3188 if (!FAIL_STACK_EMPTY ()
3189 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
3190 fail_stack.avail--;
3191
3192 continue;
3193
3194
3195 case on_failure_jump:
3196 case on_failure_keep_string_jump:
3197 handle_on_failure_jump:
3198 EXTRACT_NUMBER_AND_INCR (j, p);
3199
3200 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
3201 end of the pattern. We don't want to push such a point,
3202 since when we restore it above, entering the switch will
3203 increment `p' past the end of the pattern. We don't need
3204 to push such a point since we obviously won't find any more
3205 fastmap entries beyond `pend'. Such a pattern can match
3206 the null string, though. */
3207 if (p + j < pend)
3208 {
3209 if (!PUSH_PATTERN_OP (p + j, fail_stack))
3210 {
3211 RESET_FAIL_STACK ();
3212 return -2;
3213 }
3214 }
3215 else
3216 bufp->can_be_null = 1;
3217
3218 if (succeed_n_p)
3219 {
3220 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
3221 succeed_n_p = false;
3222 }
3223
3224 continue;
3225
3226
3227 case succeed_n:
3228 /* Get to the number of times to succeed. */
3229 p += 2;
3230
3231 /* Increment p past the n for when k != 0. */
3232 EXTRACT_NUMBER_AND_INCR (k, p);
3233 if (k == 0)
3234 {
3235 p -= 4;
3236 succeed_n_p = true; /* Spaghetti code alert. */
3237 goto handle_on_failure_jump;
3238 }
3239 continue;
3240
3241
3242 case set_number_at:
3243 p += 4;
3244 continue;
3245
3246
3247 case start_memory:
3248 case stop_memory:
3249 p += 2;
3250 continue;
3251
3252
3253 default:
3254 abort (); /* We have listed all the cases. */
3255 } /* switch *p++ */
3256
3257 /* Getting here means we have found the possible starting
3258 characters for one path of the pattern -- and that the empty
3259 string does not match. We need not follow this path further.
3260 Instead, look at the next alternative (remembered on the
3261 stack), or quit if no more. The test at the top of the loop
3262 does these things. */
3263 path_can_be_null = false;
3264 p = pend;
3265 } /* while p */
3266
3267 /* Set `can_be_null' for the last path (also the first path, if the
3268 pattern is empty). */
3269 bufp->can_be_null |= path_can_be_null;
3270
3271 done:
3272 RESET_FAIL_STACK ();
3273 return 0;
3274 } /* re_compile_fastmap */
3275
3276 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3277 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3278 this memory for recording register information. STARTS and ENDS
3279 must be allocated using the malloc library routine, and must each
3280 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3281
3282 If NUM_REGS == 0, then subsequent matches should allocate their own
3283 register data.
3284
3285 Unless this function is called, the first search or match using
3286 PATTERN_BUFFER will allocate its own register data, without
3287 freeing the old data. */
3288
3289 void
3290 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
3291 unsigned num_regs, regoff_t *starts, regoff_t *ends)
3292 {
3293 if (num_regs)
3294 {
3295 bufp->regs_allocated = REGS_REALLOCATE;
3296 regs->num_regs = num_regs;
3297 regs->start = starts;
3298 regs->end = ends;
3299 }
3300 else
3301 {
3302 bufp->regs_allocated = REGS_UNALLOCATED;
3303 regs->num_regs = 0;
3304 regs->start = regs->end = (regoff_t *) 0;
3305 }
3306 }
3307
3308 /* Searching routines. */
3309
3310 /* Like re_search_2, below, but only one string is specified, and
3311 doesn't let you say where to stop matching. */
3312
3313 int
3314 re_search (struct re_pattern_buffer *bufp, CONST char *string, int size,
3315 int startpos, int range, struct re_registers *regs)
3316 {
3317 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3318 regs, size);
3319 }
3320
3321
3322 /* Using the compiled pattern in BUFP->buffer, first tries to match the
3323 virtual concatenation of STRING1 and STRING2, starting first at index
3324 STARTPOS, then at STARTPOS + 1, and so on.
3325
3326 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3327
3328 RANGE is how far to scan while trying to match. RANGE = 0 means try
3329 only at STARTPOS; in general, the last start tried is STARTPOS +
3330 RANGE.
3331
3332 In REGS, return the indices of the virtual concatenation of STRING1
3333 and STRING2 that matched the entire BUFP->buffer and its contained
3334 subexpressions.
3335
3336 Do not consider matching one past the index STOP in the virtual
3337 concatenation of STRING1 and STRING2.
3338
3339 We return either the position in the strings at which the match was
3340 found, -1 if no match, or -2 if error (such as failure
3341 stack overflow). */
3342
3343 int
3344 re_search_2 (struct re_pattern_buffer *bufp, CONST char *string1,
3345 int size1, CONST char *string2, int size2, int startpos,
3346 int range, struct re_registers *regs, int stop)
3347 {
3348 int val;
3349 register char *fastmap = bufp->fastmap;
3350 register char *translate = bufp->translate;
3351 int total_size = size1 + size2;
3352 int endpos = startpos + range;
3353 #ifdef REGEX_BEGLINE_CHECK
3354 int anchored_at_begline = 0;
3355 #endif
3356
3357 /* Check for out-of-range STARTPOS. */
3358 if (startpos < 0 || startpos > total_size)
3359 return -1;
3360
3361 /* Fix up RANGE if it might eventually take us outside
3362 the virtual concatenation of STRING1 and STRING2. */
3363 if (endpos < -1)
3364 range = -1 - startpos;
3365 else if (endpos > total_size)
3366 range = total_size - startpos;
3367
3368 /* If the search isn't to be a backwards one, don't waste time in a
3369 search for a pattern that must be anchored. */
3370 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
3371 {
3372 if (startpos > 0)
3373 return -1;
3374 else
3375 range = 1;
3376 }
3377
3378 /* Update the fastmap now if not correct already. */
3379 if (fastmap && !bufp->fastmap_accurate)
3380 if (re_compile_fastmap (bufp) == -2)
3381 return -2;
3382
3383 #ifdef REGEX_BEGLINE_CHECK
3384 {
3385 int i = 0;
3386
3387 while (i < bufp->used)
3388 {
3389 if (bufp->buffer[i] == start_memory ||
3390 bufp->buffer[i] == stop_memory)
3391 i += 2;
3392 else
3393 break;
3394 }
3395 anchored_at_begline = i < bufp->used && bufp->buffer[i] == begline;
3396 }
3397 #endif
3398
3399 /* Loop through the string, looking for a place to start matching. */
3400 for (;;)
3401 {
3402 #ifdef REGEX_BEGLINE_CHECK
3403 /* If the regex is anchored at the beginning of a line (i.e. with a ^),
3404 then we can speed things up by skipping to the next beginning-of-
3405 line. */
3406 if (anchored_at_begline && startpos > 0 && startpos != size1 &&
3407 range > 0)
3408 {
3409 /* whose stupid idea was it anyway to make this
3410 function take two strings to match?? */
3411 int lim = 0;
3412 unsigned char *p;
3413 int irange = range;
3414 if (startpos < size1 && startpos + range >= size1)
3415 lim = range - (size1 - startpos);
3416
3417 p = ((unsigned char *)
3418 &(startpos >= size1 ? string2 - size1 : string1)[startpos]);
3419 p--;
3420
3421 if (translate)
3422 {
3423 while (range > lim && translate[*p++] != '\n')
3424 range--;
3425 }
3426 else
3427 {
3428 while (range > lim && *p++ != '\n')
3429 range--;
3430 }
3431 startpos += irange - range;
3432 }
3433 #endif /* REGEX_BEGLINE_CHECK */
3434
3435 /* If a fastmap is supplied, skip quickly over characters that
3436 cannot be the start of a match. If the pattern can match the
3437 null string, however, we don't need to skip characters; we want
3438 the first null string. */
3439 if (fastmap && startpos < total_size && !bufp->can_be_null)
3440 {
3441 if (range > 0) /* Searching forwards. */
3442 {
3443 register CONST char *d;
3444 register int lim = 0;
3445 int irange = range;
3446
3447 if (startpos < size1 && startpos + range >= size1)
3448 lim = range - (size1 - startpos);
3449
3450 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
3451
3452 /* Written out as an if-else to avoid testing `translate'
3453 inside the loop. */
3454 if (translate)
3455 while (range > lim
3456 && !fastmap[(unsigned char)
3457 translate[(unsigned char) *d++]])
3458 range--;
3459 else
3460 while (range > lim && !fastmap[(unsigned char) *d++])
3461 range--;
3462
3463 startpos += irange - range;
3464 }
3465 else /* Searching backwards. */
3466 {
3467 register char c = (size1 == 0 || startpos >= size1
3468 ? string2[startpos - size1]
3469 : string1[startpos]);
3470
3471 if (!fastmap[(unsigned char) TRANSLATE (c)])
3472 goto advance;
3473 }
3474 }
3475
3476 /* If can't match the null string, and that's all we have left, fail. */
3477 if (range >= 0 && startpos == total_size && fastmap
3478 && !bufp->can_be_null)
3479 return -1;
3480
3481 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
3482 if (!no_quit_in_re_search)
3483 QUIT;
3484 #endif
3485 val = re_match_2_internal (bufp, string1, size1, string2, size2,
3486 startpos, regs, stop);
3487 #ifndef REGEX_MALLOC
3488 #ifdef C_ALLOCA
3489 alloca (0);
3490 #endif
3491 #endif
3492
3493 if (val >= 0)
3494 return startpos;
3495
3496 if (val == -2)
3497 return -2;
3498
3499 advance:
3500 if (!range)
3501 break;
3502 else if (range > 0)
3503 {
3504 range--;
3505 startpos++;
3506 }
3507 else
3508 {
3509 range++;
3510 startpos--;
3511 }
3512 }
3513 return -1;
3514 } /* re_search_2 */
3515
3516 /* Declarations and macros for re_match_2. */
3517
3518 /* This converts PTR, a pointer into one of the search strings `string1'
3519 and `string2' into an offset from the beginning of that string. */
3520 #define POINTER_TO_OFFSET(ptr) \
3521 (FIRST_STRING_P (ptr) \
3522 ? ((regoff_t) ((ptr) - string1)) \
3523 : ((regoff_t) ((ptr) - string2 + size1)))
3524
3525 /* Macros for dealing with the split strings in re_match_2. */
3526
3527 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
3528
3529 /* Call before fetching a character with *d. This switches over to
3530 string2 if necessary. */
3531 #define PREFETCH() \
3532 while (d == dend) \
3533 { \
3534 /* End of string2 => fail. */ \
3535 if (dend == end_match_2) \
3536 goto fail; \
3537 /* End of string1 => advance to string2. */ \
3538 d = string2; \
3539 dend = end_match_2; \
3540 }
3541
3542
3543 /* Test if at very beginning or at very end of the virtual concatenation
3544 of `string1' and `string2'. If only one string, it's `string2'. */
3545 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
3546 #define AT_STRINGS_END(d) ((d) == end2)
3547
3548 /* XEmacs change:
3549 If the given position straddles the string gap, return the equivalent
3550 position that is before or after the gap, respectively; otherwise,
3551 return the same position. */
3552 #define POS_BEFORE_GAP_UNSAFE(d) ((d) == string2 ? end1 : (d))
3553 #define POS_AFTER_GAP_UNSAFE(d) ((d) == end1 ? string2 : (d))
3554
3555 /* Test if CH is a word-constituent character. (XEmacs change) */
3556 #define WORDCHAR_P_UNSAFE(ch) \
3557 (SYNTAX_UNSAFE (regex_emacs_buffer->syntax_table, ch) == Sword)
3558
3559 /* Free everything we malloc. */
3560 #ifdef MATCH_MAY_ALLOCATE
3561 #define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
3562 #define FREE_VARIABLES() \
3563 do { \
3564 REGEX_FREE_STACK (fail_stack.stack); \
3565 FREE_VAR (regstart); \
3566 FREE_VAR (regend); \
3567 FREE_VAR (old_regstart); \
3568 FREE_VAR (old_regend); \
3569 FREE_VAR (best_regstart); \
3570 FREE_VAR (best_regend); \
3571 FREE_VAR (reg_info); \
3572 FREE_VAR (reg_dummy); \
3573 FREE_VAR (reg_info_dummy); \
3574 } while (0)
3575 #else
3576 #define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
3577 #endif /* not MATCH_MAY_ALLOCATE */
3578
3579 /* These values must meet several constraints. They must not be valid
3580 register values; since we have a limit of 255 registers (because
3581 we use only one byte in the pattern for the register number), we can
3582 use numbers larger than 255. They must differ by 1, because of
3583 NUM_FAILURE_ITEMS above. And the value for the lowest register must
3584 be larger than the value for the highest register, so we do not try
3585 to actually save any registers when none are active. */
3586 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
3587 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
3588
3589 /* Matching routines. */
3590
3591 #ifndef emacs /* Emacs never uses this. */
3592 /* re_match is like re_match_2 except it takes only a single string. */
3593
3594 int
3595 re_match (struct re_pattern_buffer *bufp, CONST char *string, int size,
3596 int pos, struct re_registers *regs)
3597 {
3598 int result = re_match_2_internal (bufp, NULL, 0, string, size,
3599 pos, regs, size);
3600 alloca (0);
3601 return result;
3602 }
3603 #endif /* not emacs */
3604
3605
3606 /* re_match_2 matches the compiled pattern in BUFP against the
3607 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
3608 and SIZE2, respectively). We start matching at POS, and stop
3609 matching at STOP.
3610
3611 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
3612 store offsets for the substring each group matched in REGS. See the
3613 documentation for exactly how many groups we fill.
3614
3615 We return -1 if no match, -2 if an internal error (such as the
3616 failure stack overflowing). Otherwise, we return the length of the
3617 matched substring. */
3618
3619 int
3620 re_match_2 (struct re_pattern_buffer *bufp, CONST char *string1,
3621 int size1, CONST char *string2, int size2, int pos,
3622 struct re_registers *regs, int stop)
3623 {
3624 int result = re_match_2_internal (bufp, string1, size1, string2, size2,
3625 pos, regs, stop);
3626 alloca (0);
3627 return result;
3628 }
3629
3630 /* This is a separate function so that we can force an alloca cleanup
3631 afterwards. */
3632 static int
3633 re_match_2_internal (struct re_pattern_buffer *bufp, CONST char *string1,
3634 int size1, CONST char *string2, int size2, int pos,
3635 struct re_registers *regs, int stop)
3636 {
3637 /* General temporaries. */
3638 int mcnt;
3639 unsigned char *p1;
3640 int should_succeed; /* XEmacs change */
3641
3642 /* Just past the end of the corresponding string. */
3643 CONST char *end1, *end2;
3644
3645 /* Pointers into string1 and string2, just past the last characters in
3646 each to consider matching. */
3647 CONST char *end_match_1, *end_match_2;
3648
3649 /* Where we are in the data, and the end of the current string. */
3650 CONST char *d, *dend;
3651
3652 /* Where we are in the pattern, and the end of the pattern. */
3653 unsigned char *p = bufp->buffer;
3654 register unsigned char *pend = p + bufp->used;
3655
3656 /* Mark the opcode just after a start_memory, so we can test for an
3657 empty subpattern when we get to the stop_memory. */
3658 unsigned char *just_past_start_mem = 0;
3659
3660 /* We use this to map every character in the string. */
3661 char *translate = bufp->translate;
3662
3663 /* Failure point stack. Each place that can handle a failure further
3664 down the line pushes a failure point on this stack. It consists of
3665 restart, regend, and reg_info for all registers corresponding to
3666 the subexpressions we're currently inside, plus the number of such
3667 registers, and, finally, two char *'s. The first char * is where
3668 to resume scanning the pattern; the second one is where to resume
3669 scanning the strings. If the latter is zero, the failure point is
3670 a ``dummy''; if a failure happens and the failure point is a dummy,
3671 it gets discarded and the next next one is tried. */
3672 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
3673 fail_stack_type fail_stack;
3674 #endif
3675 #ifdef DEBUG
3676 static unsigned failure_id;
3677 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
3678 #endif
3679
3680 #ifdef REL_ALLOC
3681 /* This holds the pointer to the failure stack, when
3682 it is allocated relocatably. */
3683 fail_stack_elt_t *failure_stack_ptr;
3684 #endif
3685
3686 /* We fill all the registers internally, independent of what we
3687 return, for use in backreferences. The number here includes
3688 an element for register zero. */
3689 unsigned num_regs = bufp->re_nsub + 1;
3690
3691 /* The currently active registers. */
3692 unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3693 unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3694
3695 /* Information on the contents of registers. These are pointers into
3696 the input strings; they record just what was matched (on this
3697 attempt) by a subexpression part of the pattern, that is, the
3698 regnum-th regstart pointer points to where in the pattern we began
3699 matching and the regnum-th regend points to right after where we
3700 stopped matching the regnum-th subexpression. (The zeroth register
3701 keeps track of what the whole pattern matches.) */
3702 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3703 CONST char **regstart, **regend;
3704 #endif
3705
3706 /* If a group that's operated upon by a repetition operator fails to
3707 match anything, then the register for its start will need to be
3708 restored because it will have been set to wherever in the string we
3709 are when we last see its open-group operator. Similarly for a
3710 register's end. */
3711 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3712 CONST char **old_regstart, **old_regend;
3713 #endif
3714
3715 /* The is_active field of reg_info helps us keep track of which (possibly
3716 nested) subexpressions we are currently in. The matched_something
3717 field of reg_info[reg_num] helps us tell whether or not we have
3718 matched any of the pattern so far this time through the reg_num-th
3719 subexpression. These two fields get reset each time through any
3720 loop their register is in. */
3721 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
3722 register_info_type *reg_info;
3723 #endif
3724
3725 /* The following record the register info as found in the above
3726 variables when we find a match better than any we've seen before.
3727 This happens as we backtrack through the failure points, which in
3728 turn happens only if we have not yet matched the entire string. */
3729 unsigned best_regs_set = false;
3730 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3731 CONST char **best_regstart, **best_regend;
3732 #endif
3733
3734 /* Logically, this is `best_regend[0]'. But we don't want to have to
3735 allocate space for that if we're not allocating space for anything
3736 else (see below). Also, we never need info about register 0 for
3737 any of the other register vectors, and it seems rather a kludge to
3738 treat `best_regend' differently than the rest. So we keep track of
3739 the end of the best match so far in a separate variable. We
3740 initialize this to NULL so that when we backtrack the first time
3741 and need to test it, it's not garbage. */
3742 CONST char *match_end = NULL;
3743
3744 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
3745 int set_regs_matched_done = 0;
3746
3747 /* Used when we pop values we don't care about. */
3748 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3749 CONST char **reg_dummy;
3750 register_info_type *reg_info_dummy;
3751 #endif
3752
3753 #ifdef DEBUG
3754 /* Counts the total number of registers pushed. */
3755 unsigned num_regs_pushed = 0;
3756 #endif
3757
3758 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
3759
3760 INIT_FAIL_STACK ();
3761
3762 #ifdef MATCH_MAY_ALLOCATE
3763 /* Do not bother to initialize all the register variables if there are
3764 no groups in the pattern, as it takes a fair amount of time. If
3765 there are groups, we include space for register 0 (the whole
3766 pattern), even though we never use it, since it simplifies the
3767 array indexing. We should fix this. */
3768 if (bufp->re_nsub)
3769 {
3770 regstart = REGEX_TALLOC (num_regs, CONST char *);
3771 regend = REGEX_TALLOC (num_regs, CONST char *);
3772 old_regstart = REGEX_TALLOC (num_regs, CONST char *);
3773 old_regend = REGEX_TALLOC (num_regs, CONST char *);
3774 best_regstart = REGEX_TALLOC (num_regs, CONST char *);
3775 best_regend = REGEX_TALLOC (num_regs, CONST char *);
3776 reg_info = REGEX_TALLOC (num_regs, register_info_type);
3777 reg_dummy = REGEX_TALLOC (num_regs, CONST char *);
3778 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
3779
3780 if (!(regstart && regend && old_regstart && old_regend && reg_info
3781 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
3782 {
3783 FREE_VARIABLES ();
3784 return -2;
3785 }
3786 }
3787 else
3788 {
3789 /* We must initialize all our variables to NULL, so that
3790 `FREE_VARIABLES' doesn't try to free them. */
3791 regstart = regend = old_regstart = old_regend = best_regstart
3792 = best_regend = reg_dummy = NULL;
3793 reg_info = reg_info_dummy = (register_info_type *) NULL;
3794 }
3795 #endif /* MATCH_MAY_ALLOCATE */
3796
3797 /* The starting position is bogus. */
3798 if (pos < 0 || pos > size1 + size2)
3799 {
3800 FREE_VARIABLES ();
3801 return -1;
3802 }
3803
3804 /* Initialize subexpression text positions to -1 to mark ones that no
3805 start_memory/stop_memory has been seen for. Also initialize the
3806 register information struct. */
3807 for (mcnt = 1; mcnt < num_regs; mcnt++)
3808 {
3809 regstart[mcnt] = regend[mcnt]
3810 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
3811
3812 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
3813 IS_ACTIVE (reg_info[mcnt]) = 0;
3814 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3815 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3816 }
3817
3818 /* We move `string1' into `string2' if the latter's empty -- but not if
3819 `string1' is null. */
3820 if (size2 == 0 && string1 != NULL)
3821 {
3822 string2 = string1;
3823 size2 = size1;
3824 string1 = 0;
3825 size1 = 0;
3826 }
3827 end1 = string1 + size1;
3828 end2 = string2 + size2;
3829
3830 /* Compute where to stop matching, within the two strings. */
3831 if (stop <= size1)
3832 {
3833 end_match_1 = string1 + stop;
3834 end_match_2 = string2;
3835 }
3836 else
3837 {
3838 end_match_1 = end1;
3839 end_match_2 = string2 + stop - size1;
3840 }
3841
3842 /* `p' scans through the pattern as `d' scans through the data.
3843 `dend' is the end of the input string that `d' points within. `d'
3844 is advanced into the following input string whenever necessary, but
3845 this happens before fetching; therefore, at the beginning of the
3846 loop, `d' can be pointing at the end of a string, but it cannot
3847 equal `string2'. */
3848 if (size1 > 0 && pos <= size1)
3849 {
3850 d = string1 + pos;
3851 dend = end_match_1;
3852 }
3853 else
3854 {
3855 d = string2 + pos - size1;
3856 dend = end_match_2;
3857 }
3858
3859 DEBUG_PRINT1 ("The compiled pattern is: ");
3860 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
3861 DEBUG_PRINT1 ("The string to match is: `");
3862 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
3863 DEBUG_PRINT1 ("'\n");
3864
3865 /* This loops over pattern commands. It exits by returning from the
3866 function if the match is complete, or it drops through if the match
3867 fails at this starting point in the input data. */
3868 for (;;)
3869 {
3870 DEBUG_PRINT2 ("\n0x%lx: ", (unsigned long) p);
3871
3872 if (p == pend)
3873 { /* End of pattern means we might have succeeded. */
3874 DEBUG_PRINT1 ("end of pattern ... ");
3875
3876 /* If we haven't matched the entire string, and we want the
3877 longest match, try backtracking. */
3878 if (d != end_match_2)
3879 {
3880 /* 1 if this match ends in the same string (string1 or string2)
3881 as the best previous match. */
3882 boolean same_str_p = (FIRST_STRING_P (match_end)
3883 == MATCHING_IN_FIRST_STRING);
3884 /* 1 if this match is the best seen so far. */
3885 boolean best_match_p;
3886
3887 /* AIX compiler got confused when this was combined
3888 with the previous declaration. */
3889 if (same_str_p)
3890 best_match_p = d > match_end;
3891 else
3892 best_match_p = !MATCHING_IN_FIRST_STRING;
3893
3894 DEBUG_PRINT1 ("backtracking.\n");
3895
3896 if (!FAIL_STACK_EMPTY ())
3897 { /* More failure points to try. */
3898
3899 /* If exceeds best match so far, save it. */
3900 if (!best_regs_set || best_match_p)
3901 {
3902 best_regs_set = true;
3903 match_end = d;
3904
3905 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
3906
3907 for (mcnt = 1; mcnt < num_regs; mcnt++)
3908 {
3909 best_regstart[mcnt] = regstart[mcnt];
3910 best_regend[mcnt] = regend[mcnt];
3911 }
3912 }
3913 goto fail;
3914 }
3915
3916 /* If no failure points, don't restore garbage. And if
3917 last match is real best match, don't restore second
3918 best one. */
3919 else if (best_regs_set && !best_match_p)
3920 {
3921 restore_best_regs:
3922 /* Restore best match. It may happen that `dend ==
3923 end_match_1' while the restored d is in string2.
3924 For example, the pattern `x.*y.*z' against the
3925 strings `x-' and `y-z-', if the two strings are
3926 not consecutive in memory. */
3927 DEBUG_PRINT1 ("Restoring best registers.\n");
3928
3929 d = match_end;
3930 dend = ((d >= string1 && d <= end1)
3931 ? end_match_1 : end_match_2);
3932
3933 for (mcnt = 1; mcnt < num_regs; mcnt++)
3934 {
3935 regstart[mcnt] = best_regstart[mcnt];
3936 regend[mcnt] = best_regend[mcnt];
3937 }
3938 }
3939 } /* d != end_match_2 */
3940
3941 succeed_label:
3942 DEBUG_PRINT1 ("Accepting match.\n");
3943
3944 /* If caller wants register contents data back, do it. */
3945 if (regs && !bufp->no_sub)
3946 {
3947 /* Have the register data arrays been allocated? */
3948 if (bufp->regs_allocated == REGS_UNALLOCATED)
3949 { /* No. So allocate them with malloc. We need one
3950 extra element beyond `num_regs' for the `-1' marker
3951 GNU code uses. */
3952 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
3953 regs->start = TALLOC (regs->num_regs, regoff_t);
3954 regs->end = TALLOC (regs->num_regs, regoff_t);
3955 if (regs->start == NULL || regs->end == NULL)
3956 {
3957 FREE_VARIABLES ();
3958 return -2;
3959 }
3960 bufp->regs_allocated = REGS_REALLOCATE;
3961 }
3962 else if (bufp->regs_allocated == REGS_REALLOCATE)
3963 { /* Yes. If we need more elements than were already
3964 allocated, reallocate them. If we need fewer, just
3965 leave it alone. */
3966 if (regs->num_regs < num_regs + 1)
3967 {
3968 regs->num_regs = num_regs + 1;
3969 RETALLOC (regs->start, regs->num_regs, regoff_t);
3970 RETALLOC (regs->end, regs->num_regs, regoff_t);
3971 if (regs->start == NULL || regs->end == NULL)
3972 {
3973 FREE_VARIABLES ();
3974 return -2;
3975 }
3976 }
3977 }
3978 else
3979 {
3980 /* These braces fend off a "empty body in an else-statement"
3981 warning under GCC when assert expands to nothing. */
3982 assert (bufp->regs_allocated == REGS_FIXED);
3983 }
3984
3985 /* Convert the pointer data in `regstart' and `regend' to
3986 indices. Register zero has to be set differently,
3987 since we haven't kept track of any info for it. */
3988 if (regs->num_regs > 0)
3989 {
3990 regs->start[0] = pos;
3991 regs->end[0] = (MATCHING_IN_FIRST_STRING
3992 ? ((regoff_t) (d - string1))
3993 : ((regoff_t) (d - string2 + size1)));
3994 }
3995
3996 /* Go through the first `min (num_regs, regs->num_regs)'
3997 registers, since that is all we initialized. */
3998 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
3999 {
4000 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
4001 regs->start[mcnt] = regs->end[mcnt] = -1;
4002 else
4003 {
4004 regs->start[mcnt]
4005 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
4006 regs->end[mcnt]
4007 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
4008 }
4009 }
4010
4011 /* If the regs structure we return has more elements than
4012 were in the pattern, set the extra elements to -1. If
4013 we (re)allocated the registers, this is the case,
4014 because we always allocate enough to have at least one
4015 -1 at the end. */
4016 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
4017 regs->start[mcnt] = regs->end[mcnt] = -1;
4018 } /* regs && !bufp->no_sub */
4019
4020 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4021 nfailure_points_pushed, nfailure_points_popped,
4022 nfailure_points_pushed - nfailure_points_popped);
4023 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
4024
4025 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
4026 ? string1
4027 : string2 - size1);
4028
4029 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
4030
4031 FREE_VARIABLES ();
4032 return mcnt;
4033 }
4034
4035 /* Otherwise match next pattern command. */
4036 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4037 {
4038 /* Ignore these. Used to ignore the n of succeed_n's which
4039 currently have n == 0. */
4040 case no_op:
4041 DEBUG_PRINT1 ("EXECUTING no_op.\n");
4042 break;
4043
4044 case succeed:
4045 DEBUG_PRINT1 ("EXECUTING succeed.\n");
4046 goto succeed_label;
4047
4048 /* Match the next n pattern characters exactly. The following
4049 byte in the pattern defines n, and the n bytes after that
4050 are the characters to match. */
4051 case exactn:
4052 mcnt = *p++;
4053 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
4054
4055 /* This is written out as an if-else so we don't waste time
4056 testing `translate' inside the loop. */
4057 if (translate)
4058 {
4059 do
4060 {
4061 PREFETCH ();
4062 if (translate[(unsigned char) *d++] != (char) *p++)
4063 goto fail;
4064 }
4065 while (--mcnt);
4066 }
4067 else
4068 {
4069 do
4070 {
4071 PREFETCH ();
4072 if (*d++ != (char) *p++) goto fail;
4073 }
4074 while (--mcnt);
4075 }
4076 SET_REGS_MATCHED ();
4077 break;
4078
4079
4080 /* Match any character except possibly a newline or a null. */
4081 case anychar:
4082 DEBUG_PRINT1 ("EXECUTING anychar.\n");
4083
4084 PREFETCH ();
4085
4086 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
4087 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
4088 goto fail;
4089
4090 SET_REGS_MATCHED ();
4091 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
4092 INC_CHARPTR (d); /* XEmacs change */
4093 break;
4094
4095
4096 case charset:
4097 case charset_not:
4098 {
4099 register unsigned char c;
4100 boolean not = (re_opcode_t) *(p - 1) == charset_not;
4101
4102 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
4103
4104 PREFETCH ();
4105 c = TRANSLATE (*d); /* The character to match. */
4106
4107 /* Cast to `unsigned' instead of `unsigned char' in case the
4108 bit list is a full 32 bytes long. */
4109 if (c < (unsigned) (*p * BYTEWIDTH)
4110 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4111 not = !not;
4112
4113 p += 1 + *p;
4114
4115 if (!not) goto fail;
4116
4117 SET_REGS_MATCHED ();
4118 d++;
4119 break;
4120 }
4121
4122
4123 /* The beginning of a group is represented by start_memory.
4124 The arguments are the register number in the next byte, and the
4125 number of groups inner to this one in the next. The text
4126 matched within the group is recorded (in the internal
4127 registers data structure) under the register number. */
4128 case start_memory:
4129 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
4130
4131 /* Find out if this group can match the empty string. */
4132 p1 = p; /* To send to group_match_null_string_p. */
4133
4134 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
4135 REG_MATCH_NULL_STRING_P (reg_info[*p])
4136 = group_match_null_string_p (&p1, pend, reg_info);
4137
4138 /* Save the position in the string where we were the last time
4139 we were at this open-group operator in case the group is
4140 operated upon by a repetition operator, e.g., with `(a*)*b'
4141 against `ab'; then we want to ignore where we are now in
4142 the string in case this attempt to match fails. */
4143 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4144 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
4145 : regstart[*p];
4146 DEBUG_PRINT2 (" old_regstart: %d\n",
4147 POINTER_TO_OFFSET (old_regstart[*p]));
4148
4149 regstart[*p] = d;
4150 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
4151
4152 IS_ACTIVE (reg_info[*p]) = 1;
4153 MATCHED_SOMETHING (reg_info[*p]) = 0;
4154
4155 /* Clear this whenever we change the register activity status. */
4156 set_regs_matched_done = 0;
4157
4158 /* This is the new highest active register. */
4159 highest_active_reg = *p;
4160
4161 /* If nothing was active before, this is the new lowest active
4162 register. */
4163 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4164 lowest_active_reg = *p;
4165
4166 /* Move past the register number and inner group count. */
4167 p += 2;
4168 just_past_start_mem = p;
4169
4170 break;
4171
4172
4173 /* The stop_memory opcode represents the end of a group. Its
4174 arguments are the same as start_memory's: the register
4175 number, and the number of inner groups. */
4176 case stop_memory:
4177 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
4178
4179 /* We need to save the string position the last time we were at
4180 this close-group operator in case the group is operated
4181 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
4182 against `aba'; then we want to ignore where we are now in
4183 the string in case this attempt to match fails. */
4184 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4185 ? REG_UNSET (regend[*p]) ? d : regend[*p]
4186 : regend[*p];
4187 DEBUG_PRINT2 (" old_regend: %d\n",
4188 POINTER_TO_OFFSET (old_regend[*p]));
4189
4190 regend[*p] = d;
4191 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
4192
4193 /* This register isn't active anymore. */
4194 IS_ACTIVE (reg_info[*p]) = 0;
4195
4196 /* Clear this whenever we change the register activity status. */
4197 set_regs_matched_done = 0;
4198
4199 /* If this was the only register active, nothing is active
4200 anymore. */
4201 if (lowest_active_reg == highest_active_reg)
4202 {
4203 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4204 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4205 }
4206 else
4207 { /* We must scan for the new highest active register, since
4208 it isn't necessarily one less than now: consider
4209 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
4210 new highest active register is 1. */
4211 unsigned char r = *p - 1;
4212 while (r > 0 && !IS_ACTIVE (reg_info[r]))
4213 r--;
4214
4215 /* If we end up at register zero, that means that we saved
4216 the registers as the result of an `on_failure_jump', not
4217 a `start_memory', and we jumped to past the innermost
4218 `stop_memory'. For example, in ((.)*) we save
4219 registers 1 and 2 as a result of the *, but when we pop
4220 back to the second ), we are at the stop_memory 1.
4221 Thus, nothing is active. */
4222 if (r == 0)
4223 {
4224 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4225 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4226 }
4227 else
4228 highest_active_reg = r;
4229 }
4230
4231 /* If just failed to match something this time around with a
4232 group that's operated on by a repetition operator, try to
4233 force exit from the ``loop'', and restore the register
4234 information for this group that we had before trying this
4235 last match. */
4236 if ((!MATCHED_SOMETHING (reg_info[*p])
4237 || just_past_start_mem == p - 1)
4238 && (p + 2) < pend)
4239 {
4240 boolean is_a_jump_n = false;
4241
4242 p1 = p + 2;
4243 mcnt = 0;
4244 switch ((re_opcode_t) *p1++)
4245 {
4246 case jump_n:
4247 is_a_jump_n = true;
4248 case pop_failure_jump:
4249 case maybe_pop_jump:
4250 case jump:
4251 case dummy_failure_jump:
4252 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4253 if (is_a_jump_n)
4254 p1 += 2;
4255 break;
4256
4257 default:
4258 /* do nothing */ ;
4259 }
4260 p1 += mcnt;
4261
4262 /* If the next operation is a jump backwards in the pattern
4263 to an on_failure_jump right before the start_memory
4264 corresponding to this stop_memory, exit from the loop
4265 by forcing a failure after pushing on the stack the
4266 on_failure_jump's jump in the pattern, and d. */
4267 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
4268 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
4269 {
4270 /* If this group ever matched anything, then restore
4271 what its registers were before trying this last
4272 failed match, e.g., with `(a*)*b' against `ab' for
4273 regstart[1], and, e.g., with `((a*)*(b*)*)*'
4274 against `aba' for regend[3].
4275
4276 Also restore the registers for inner groups for,
4277 e.g., `((a*)(b*))*' against `aba' (register 3 would
4278 otherwise get trashed). */
4279
4280 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
4281 {
4282 unsigned r;
4283
4284 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
4285
4286 /* Restore this and inner groups' (if any) registers. */
4287 for (r = *p; r < *p + *(p + 1); r++)
4288 {
4289 regstart[r] = old_regstart[r];
4290
4291 /* xx why this test? */
4292 if (old_regend[r] >= regstart[r])
4293 regend[r] = old_regend[r];
4294 }
4295 }
4296 p1++;
4297 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4298 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
4299
4300 goto fail;
4301 }
4302 }
4303
4304 /* Move past the register number and the inner group count. */
4305 p += 2;
4306 break;
4307
4308
4309 /* \<digit> has been turned into a `duplicate' command which is
4310 followed by the numeric value of <digit> as the register number. */
4311 case duplicate:
4312 {
4313 register CONST char *d2, *dend2;
4314 int regno = *p++; /* Get which register to match against. */
4315 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
4316
4317 /* Can't back reference a group which we've never matched. */
4318 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
4319 goto fail;
4320
4321 /* Where in input to try to start matching. */
4322 d2 = regstart[regno];
4323
4324 /* Where to stop matching; if both the place to start and
4325 the place to stop matching are in the same string, then
4326 set to the place to stop, otherwise, for now have to use
4327 the end of the first string. */
4328
4329 dend2 = ((FIRST_STRING_P (regstart[regno])
4330 == FIRST_STRING_P (regend[regno]))
4331 ? regend[regno] : end_match_1);
4332 for (;;)
4333 {
4334 /* If necessary, advance to next segment in register
4335 contents. */
4336 while (d2 == dend2)
4337 {
4338 if (dend2 == end_match_2) break;
4339 if (dend2 == regend[regno]) break;
4340
4341 /* End of string1 => advance to string2. */
4342 d2 = string2;
4343 dend2 = regend[regno];
4344 }
4345 /* At end of register contents => success */
4346 if (d2 == dend2) break;
4347
4348 /* If necessary, advance to next segment in data. */
4349 PREFETCH ();
4350
4351 /* How many characters left in this segment to match. */
4352 mcnt = dend - d;
4353
4354 /* Want how many consecutive characters we can match in
4355 one shot, so, if necessary, adjust the count. */
4356 if (mcnt > dend2 - d2)
4357 mcnt = dend2 - d2;
4358
4359 /* Compare that many; failure if mismatch, else move
4360 past them. */
4361 if (translate
4362 ? bcmp_translate ((unsigned char *) d,
4363 (unsigned char *) d2, mcnt, translate)
4364 : memcmp (d, d2, mcnt))
4365 goto fail;
4366 d += mcnt, d2 += mcnt;
4367
4368 /* Do this because we've match some characters. */
4369 SET_REGS_MATCHED ();
4370 }
4371 }
4372 break;
4373
4374
4375 /* begline matches the empty string at the beginning of the string
4376 (unless `not_bol' is set in `bufp'), and, if
4377 `newline_anchor' is set, after newlines. */
4378 case begline:
4379 DEBUG_PRINT1 ("EXECUTING begline.\n");
4380
4381 if (AT_STRINGS_BEG (d))
4382 {
4383 if (!bufp->not_bol) break;
4384 }
4385 else if (d[-1] == '\n' && bufp->newline_anchor)
4386 {
4387 break;
4388 }
4389 /* In all other cases, we fail. */
4390 goto fail;
4391
4392
4393 /* endline is the dual of begline. */
4394 case endline:
4395 DEBUG_PRINT1 ("EXECUTING endline.\n");
4396
4397 if (AT_STRINGS_END (d))
4398 {
4399 if (!bufp->not_eol) break;
4400 }
4401
4402 /* We have to ``prefetch'' the next character. */
4403 else if ((d == end1 ? *string2 : *d) == '\n'
4404 && bufp->newline_anchor)
4405 {
4406 break;
4407 }
4408 goto fail;
4409
4410
4411 /* Match at the very beginning of the data. */
4412 case begbuf:
4413 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
4414 if (AT_STRINGS_BEG (d))
4415 break;
4416 goto fail;
4417
4418
4419 /* Match at the very end of the data. */
4420 case endbuf:
4421 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
4422 if (AT_STRINGS_END (d))
4423 break;
4424 goto fail;
4425
4426
4427 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
4428 pushes NULL as the value for the string on the stack. Then
4429 `pop_failure_point' will keep the current value for the
4430 string, instead of restoring it. To see why, consider
4431 matching `foo\nbar' against `.*\n'. The .* matches the foo;
4432 then the . fails against the \n. But the next thing we want
4433 to do is match the \n against the \n; if we restored the
4434 string value, we would be back at the foo.
4435
4436 Because this is used only in specific cases, we don't need to
4437 check all the things that `on_failure_jump' does, to make
4438 sure the right things get saved on the stack. Hence we don't
4439 share its code. The only reason to push anything on the
4440 stack at all is that otherwise we would have to change
4441 `anychar's code to do something besides goto fail in this
4442 case; that seems worse than this. */
4443 case on_failure_keep_string_jump:
4444 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
4445
4446 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4447 DEBUG_PRINT3 (" %d (to 0x%lx):\n", mcnt, (unsigned long) (p + mcnt));
4448
4449 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
4450 break;
4451
4452
4453 /* Uses of on_failure_jump:
4454
4455 Each alternative starts with an on_failure_jump that points
4456 to the beginning of the next alternative. Each alternative
4457 except the last ends with a jump that in effect jumps past
4458 the rest of the alternatives. (They really jump to the
4459 ending jump of the following alternative, because tensioning
4460 these jumps is a hassle.)
4461
4462 Repeats start with an on_failure_jump that points past both
4463 the repetition text and either the following jump or
4464 pop_failure_jump back to this on_failure_jump. */
4465 case on_failure_jump:
4466 on_failure:
4467 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
4468
4469 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4470 DEBUG_PRINT3 (" %d (to 0x%lx)", mcnt, (unsigned long) (p + mcnt));
4471
4472 /* If this on_failure_jump comes right before a group (i.e.,
4473 the original * applied to a group), save the information
4474 for that group and all inner ones, so that if we fail back
4475 to this point, the group's information will be correct.
4476 For example, in \(a*\)*\1, we need the preceding group,
4477 and in \(\(a*\)b*\)\2, we need the inner group. */
4478
4479 /* We can't use `p' to check ahead because we push
4480 a failure point to `p + mcnt' after we do this. */
4481 p1 = p;
4482
4483 /* We need to skip no_op's before we look for the
4484 start_memory in case this on_failure_jump is happening as
4485 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
4486 against aba. */
4487 while (p1 < pend && (re_opcode_t) *p1 == no_op)
4488 p1++;
4489
4490 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
4491 {
4492 /* We have a new highest active register now. This will
4493 get reset at the start_memory we are about to get to,
4494 but we will have saved all the registers relevant to
4495 this repetition op, as described above. */
4496 highest_active_reg = *(p1 + 1) + *(p1 + 2);
4497 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4498 lowest_active_reg = *(p1 + 1);
4499 }
4500
4501 DEBUG_PRINT1 (":\n");
4502 PUSH_FAILURE_POINT (p + mcnt, d, -2);
4503 break;
4504
4505
4506 /* A smart repeat ends with `maybe_pop_jump'.
4507 We change it to either `pop_failure_jump' or `jump'. */
4508 case maybe_pop_jump:
4509 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4510 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
4511 {
4512 register unsigned char *p2 = p;
4513
4514 /* Compare the beginning of the repeat with what in the
4515 pattern follows its end. If we can establish that there
4516 is nothing that they would both match, i.e., that we
4517 would have to backtrack because of (as in, e.g., `a*a')
4518 then we can change to pop_failure_jump, because we'll
4519 never have to backtrack.
4520
4521 This is not true in the case of alternatives: in
4522 `(a|ab)*' we do need to backtrack to the `ab' alternative
4523 (e.g., if the string was `ab'). But instead of trying to
4524 detect that here, the alternative has put on a dummy
4525 failure point which is what we will end up popping. */
4526
4527 /* Skip over open/close-group commands.
4528 If what follows this loop is a ...+ construct,
4529 look at what begins its body, since we will have to
4530 match at least one of that. */
4531 while (1)
4532 {
4533 if (p2 + 2 < pend
4534 && ((re_opcode_t) *p2 == stop_memory
4535 || (re_opcode_t) *p2 == start_memory))
4536 p2 += 3;
4537 else if (p2 + 6 < pend
4538 && (re_opcode_t) *p2 == dummy_failure_jump)
4539 p2 += 6;
4540 else
4541 break;
4542 }
4543
4544 p1 = p + mcnt;
4545 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
4546 to the `maybe_finalize_jump' of this case. Examine what
4547 follows. */
4548
4549 /* If we're at the end of the pattern, we can change. */
4550 if (p2 == pend)
4551 {
4552 /* Consider what happens when matching ":\(.*\)"
4553 against ":/". I don't really understand this code
4554 yet. */
4555 p[-3] = (unsigned char) pop_failure_jump;
4556 DEBUG_PRINT1
4557 (" End of pattern: change to `pop_failure_jump'.\n");
4558 }
4559
4560 else if ((re_opcode_t) *p2 == exactn
4561 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
4562 {
4563 register unsigned char c
4564 = *p2 == (unsigned char) endline ? '\n' : p2[2];
4565
4566 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
4567 {
4568 p[-3] = (unsigned char) pop_failure_jump;
4569 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
4570 c, p1[5]);
4571 }
4572
4573 else if ((re_opcode_t) p1[3] == charset
4574 || (re_opcode_t) p1[3] == charset_not)
4575 {
4576 int not = (re_opcode_t) p1[3] == charset_not;
4577
4578 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
4579 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4580 not = !not;
4581
4582 /* `not' is equal to 1 if c would match, which means
4583 that we can't change to pop_failure_jump. */
4584 if (!not)
4585 {
4586 p[-3] = (unsigned char) pop_failure_jump;
4587 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4588 }
4589 }
4590 }
4591 else if ((re_opcode_t) *p2 == charset)
4592 {
4593 #ifdef DEBUG
4594 register unsigned char c
4595 = *p2 == (unsigned char) endline ? '\n' : p2[2];
4596 #endif
4597
4598 if ((re_opcode_t) p1[3] == exactn
4599 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[4]
4600 && (p2[1 + p1[4] / BYTEWIDTH]
4601 & (1 << (p1[4] % BYTEWIDTH)))))
4602 {
4603 p[-3] = (unsigned char) pop_failure_jump;
4604 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
4605 c, p1[5]);
4606 }
4607
4608 else if ((re_opcode_t) p1[3] == charset_not)
4609 {
4610 int idx;
4611 /* We win if the charset_not inside the loop
4612 lists every character listed in the charset after. */
4613 for (idx = 0; idx < (int) p2[1]; idx++)
4614 if (! (p2[2 + idx] == 0
4615 || (idx < (int) p1[4]
4616 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
4617 break;
4618
4619 if (idx == p2[1])
4620 {
4621 p[-3] = (unsigned char) pop_failure_jump;
4622 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4623 }
4624 }
4625 else if ((re_opcode_t) p1[3] == charset)
4626 {
4627 int idx;
4628 /* We win if the charset inside the loop
4629 has no overlap with the one after the loop. */
4630 for (idx = 0;
4631 idx < (int) p2[1] && idx < (int) p1[4];
4632 idx++)
4633 if ((p2[2 + idx] & p1[5 + idx]) != 0)
4634 break;
4635
4636 if (idx == p2[1] || idx == p1[4])
4637 {
4638 p[-3] = (unsigned char) pop_failure_jump;
4639 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4640 }
4641 }
4642 }
4643 }
4644 p -= 2; /* Point at relative address again. */
4645 if ((re_opcode_t) p[-1] != pop_failure_jump)
4646 {
4647 p[-1] = (unsigned char) jump;
4648 DEBUG_PRINT1 (" Match => jump.\n");
4649 goto unconditional_jump;
4650 }
4651 /* Note fall through. */
4652
4653
4654 /* The end of a simple repeat has a pop_failure_jump back to
4655 its matching on_failure_jump, where the latter will push a
4656 failure point. The pop_failure_jump takes off failure
4657 points put on by this pop_failure_jump's matching
4658 on_failure_jump; we got through the pattern to here from the
4659 matching on_failure_jump, so didn't fail. */
4660 case pop_failure_jump:
4661 {
4662 /* We need to pass separate storage for the lowest and
4663 highest registers, even though we don't care about the
4664 actual values. Otherwise, we will restore only one
4665 register from the stack, since lowest will == highest in
4666 `pop_failure_point'. */
4667 unsigned dummy_low_reg, dummy_high_reg;
4668 unsigned char *pdummy;
4669 CONST char *sdummy;
4670
4671 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
4672 POP_FAILURE_POINT (sdummy, pdummy,
4673 dummy_low_reg, dummy_high_reg,
4674 reg_dummy, reg_dummy, reg_info_dummy);
4675 }
4676 /* Note fall through. */
4677
4678
4679 /* Unconditionally jump (without popping any failure points). */
4680 case jump:
4681 unconditional_jump:
4682 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
4683 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
4684 p += mcnt; /* Do the jump. */
4685 DEBUG_PRINT2 ("(to 0x%lx).\n", (unsigned long) p);
4686 break;
4687
4688
4689 /* We need this opcode so we can detect where alternatives end
4690 in `group_match_null_string_p' et al. */
4691 case jump_past_alt:
4692 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
4693 goto unconditional_jump;
4694
4695
4696 /* Normally, the on_failure_jump pushes a failure point, which
4697 then gets popped at pop_failure_jump. We will end up at
4698 pop_failure_jump, also, and with a pattern of, say, `a+', we
4699 are skipping over the on_failure_jump, so we have to push
4700 something meaningless for pop_failure_jump to pop. */
4701 case dummy_failure_jump:
4702 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
4703 /* It doesn't matter what we push for the string here. What
4704 the code at `fail' tests is the value for the pattern. */
4705 PUSH_FAILURE_POINT (0, 0, -2);
4706 goto unconditional_jump;
4707
4708
4709 /* At the end of an alternative, we need to push a dummy failure
4710 point in case we are followed by a `pop_failure_jump', because
4711 we don't want the failure point for the alternative to be
4712 popped. For example, matching `(a|ab)*' against `aab'
4713 requires that we match the `ab' alternative. */
4714 case push_dummy_failure:
4715 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
4716 /* See comments just above at `dummy_failure_jump' about the
4717 two zeroes. */
4718 PUSH_FAILURE_POINT (0, 0, -2);
4719 break;
4720
4721 /* Have to succeed matching what follows at least n times.
4722 After that, handle like `on_failure_jump'. */
4723 case succeed_n:
4724 EXTRACT_NUMBER (mcnt, p + 2);
4725 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
4726
4727 assert (mcnt >= 0);
4728 /* Originally, this is how many times we HAVE to succeed. */
4729 if (mcnt > 0)
4730 {
4731 mcnt--;
4732 p += 2;
4733 STORE_NUMBER_AND_INCR (p, mcnt);
4734 DEBUG_PRINT3 (" Setting 0x%lx to %d.\n", (unsigned long) p,
4735 mcnt);
4736 }
4737 else if (mcnt == 0)
4738 {
4739 DEBUG_PRINT2 (" Setting two bytes from 0x%lx to no_op.\n",
4740 (unsigned long) p+2);
4741 p[2] = (unsigned char) no_op;
4742 p[3] = (unsigned char) no_op;
4743 goto on_failure;
4744 }
4745 break;
4746
4747 case jump_n:
4748 EXTRACT_NUMBER (mcnt, p + 2);
4749 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
4750
4751 /* Originally, this is how many times we CAN jump. */
4752 if (mcnt)
4753 {
4754 mcnt--;
4755 STORE_NUMBER (p + 2, mcnt);
4756 goto unconditional_jump;
4757 }
4758 /* If don't have to jump any more, skip over the rest of command. */
4759 else
4760 p += 4;
4761 break;
4762
4763 case set_number_at:
4764 {
4765 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
4766
4767 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4768 p1 = p + mcnt;
4769 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4770 DEBUG_PRINT3 (" Setting 0x%lx to %d.\n", (unsigned long) p1,
4771 mcnt);
4772 STORE_NUMBER (p1, mcnt);
4773 break;
4774 }
4775
4776 case wordbound:
4777 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
4778 should_succeed = 1;
4779 matchwordbound:
4780 {
4781 /* XEmacs change */
4782 int result;
4783 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
4784 result = 1;
4785 else
4786 {
4787 CONST unsigned char *d_before =
4788 (CONST unsigned char *) POS_BEFORE_GAP_UNSAFE (d);
4789 CONST unsigned char *d_after =
4790 (CONST unsigned char *) POS_AFTER_GAP_UNSAFE (d);
4791 Emchar emch1, emch2;
4792
4793 DEC_CHARPTR (d_before);
4794 emch1 = charptr_emchar (d_before);
4795 emch2 = charptr_emchar (d_after);
4796 result = (WORDCHAR_P_UNSAFE (emch1) !=
4797 WORDCHAR_P_UNSAFE (emch2));
4798 }
4799 if (result == should_succeed)
4800 break;
4801 goto fail;
4802 }
4803
4804 case notwordbound:
4805 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
4806 should_succeed = 0;
4807 goto matchwordbound;
4808
4809 case wordbeg:
4810 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
4811 {
4812 /* XEmacs: this originally read:
4813
4814 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
4815 break;
4816
4817 */
4818 CONST unsigned char *dtmp =
4819 (CONST unsigned char *) POS_AFTER_GAP_UNSAFE (d);
4820 Emchar emch = charptr_emchar (dtmp);
4821 if (!WORDCHAR_P_UNSAFE (emch))
4822 goto fail;
4823 if (AT_STRINGS_BEG (d))
4824 break;
4825 dtmp = (CONST unsigned char *) POS_BEFORE_GAP_UNSAFE (d);
4826 DEC_CHARPTR (dtmp);
4827 emch = charptr_emchar (dtmp);
4828 if (!WORDCHAR_P_UNSAFE (emch))
4829 break;
4830 goto fail;
4831 }
4832
4833 case wordend:
4834 DEBUG_PRINT1 ("EXECUTING wordend.\n");
4835 {
4836 /* XEmacs: this originally read:
4837
4838 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
4839 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
4840 break;
4841
4842 The or condition is incorrect (reversed).
4843 */
4844 CONST unsigned char *dtmp;
4845 Emchar emch;
4846 if (AT_STRINGS_BEG (d))
4847 goto fail;
4848 dtmp = (CONST unsigned char *) POS_BEFORE_GAP_UNSAFE (d);
4849 DEC_CHARPTR (dtmp);
4850 emch = charptr_emchar (dtmp);
4851 if (!WORDCHAR_P_UNSAFE (emch))
4852 goto fail;
4853 if (AT_STRINGS_END (d))
4854 break;
4855 dtmp = (CONST unsigned char *) POS_AFTER_GAP_UNSAFE (d);
4856 emch = charptr_emchar (dtmp);
4857 if (!WORDCHAR_P_UNSAFE (emch))
4858 break;
4859 goto fail;
4860 }
4861
4862 #ifdef emacs
4863 case before_dot:
4864 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
4865 if (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d) >=
4866 BUF_PT (regex_emacs_buffer))
4867 goto fail;
4868 break;
4869
4870 case at_dot:
4871 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4872 if (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
4873 != BUF_PT (regex_emacs_buffer))
4874 goto fail;
4875 break;
4876
4877 case after_dot:
4878 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
4879 if (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
4880 <= BUF_PT (regex_emacs_buffer))
4881 goto fail;
4882 break;
4883 #if 0 /* not emacs19 */
4884 case at_dot:
4885 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4886 if (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d) + 1
4887 != BUF_PT (regex_emacs_buffer))
4888 goto fail;
4889 break;
4890 #endif /* not emacs19 */
4891
4892 case syntaxspec:
4893 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
4894 mcnt = *p++;
4895 goto matchsyntax;
4896
4897 case wordchar:
4898 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
4899 mcnt = (int) Sword;
4900 matchsyntax:
4901 should_succeed = 1;
4902 matchornotsyntax:
4903 {
4904 int matches;
4905 Emchar emch;
4906
4907 PREFETCH ();
4908 emch = charptr_emchar ((CONST Bufbyte *) d);
4909 matches = (SYNTAX_UNSAFE (regex_emacs_buffer->syntax_table,
4910 emch) == (enum syntaxcode) mcnt);
4911 INC_CHARPTR (d);
4912 if (matches != should_succeed)
4913 goto fail;
4914 SET_REGS_MATCHED ();
4915 }
4916 break;
4917
4918 case notsyntaxspec:
4919 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
4920 mcnt = *p++;
4921 goto matchnotsyntax;
4922
4923 case notwordchar:
4924 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
4925 mcnt = (int) Sword;
4926 matchnotsyntax:
4927 should_succeed = 0;
4928 goto matchornotsyntax;
4929
4930 #else /* not emacs */
4931 case wordchar:
4932 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
4933 PREFETCH ();
4934 if (!WORDCHAR_P_UNSAFE ((int) (*d)))
4935 goto fail;
4936 SET_REGS_MATCHED ();
4937 d++;
4938 break;
4939
4940 case notwordchar:
4941 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
4942 PREFETCH ();
4943 if (!WORDCHAR_P_UNSAFE ((int) (*d)))
4944 goto fail;
4945 SET_REGS_MATCHED ();
4946 d++;
4947 break;
4948 #endif /* not emacs */
4949
4950 default:
4951 abort ();
4952 }
4953 continue; /* Successfully executed one pattern command; keep going. */
4954
4955
4956 /* We goto here if a matching operation fails. */
4957 fail:
4958 if (!FAIL_STACK_EMPTY ())
4959 { /* A restart point is known. Restore to that state. */
4960 DEBUG_PRINT1 ("\nFAIL:\n");
4961 POP_FAILURE_POINT (d, p,
4962 lowest_active_reg, highest_active_reg,
4963 regstart, regend, reg_info);
4964
4965 /* If this failure point is a dummy, try the next one. */
4966 if (!p)
4967 goto fail;
4968
4969 /* If we failed to the end of the pattern, don't examine *p. */
4970 assert (p <= pend);
4971 if (p < pend)
4972 {
4973 boolean is_a_jump_n = false;
4974
4975 /* If failed to a backwards jump that's part of a repetition
4976 loop, need to pop this failure point and use the next one. */
4977 switch ((re_opcode_t) *p)
4978 {
4979 case jump_n:
4980 is_a_jump_n = true;
4981 case maybe_pop_jump:
4982 case pop_failure_jump:
4983 case jump:
4984 p1 = p + 1;
4985 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4986 p1 += mcnt;
4987
4988 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
4989 || (!is_a_jump_n
4990 && (re_opcode_t) *p1 == on_failure_jump))
4991 goto fail;
4992 break;
4993 default:
4994 /* do nothing */ ;
4995 }
4996 }
4997
4998 if (d >= string1 && d <= end1)
4999 dend = end_match_1;
5000 }
5001 else
5002 break; /* Matching at this starting point really fails. */
5003 } /* for (;;) */
5004
5005 if (best_regs_set)
5006 goto restore_best_regs;
5007
5008 FREE_VARIABLES ();
5009
5010 return -1; /* Failure to match. */
5011 } /* re_match_2 */
5012
5013 /* Subroutine definitions for re_match_2. */
5014
5015
5016 /* We are passed P pointing to a register number after a start_memory.
5017
5018 Return true if the pattern up to the corresponding stop_memory can
5019 match the empty string, and false otherwise.
5020
5021 If we find the matching stop_memory, sets P to point to one past its number.
5022 Otherwise, sets P to an undefined byte less than or equal to END.
5023
5024 We don't handle duplicates properly (yet). */
5025
5026 static boolean
5027 group_match_null_string_p (unsigned char **p, unsigned char *end,
5028 register_info_type *reg_info)
5029 {
5030 int mcnt;
5031 /* Point to after the args to the start_memory. */
5032 unsigned char *p1 = *p + 2;
5033
5034 while (p1 < end)
5035 {
5036 /* Skip over opcodes that can match nothing, and return true or
5037 false, as appropriate, when we get to one that can't, or to the
5038 matching stop_memory. */
5039
5040 switch ((re_opcode_t) *p1)
5041 {
5042 /* Could be either a loop or a series of alternatives. */
5043 case on_failure_jump:
5044 p1++;
5045 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5046
5047 /* If the next operation is not a jump backwards in the
5048 pattern. */
5049
5050 if (mcnt >= 0)
5051 {
5052 /* Go through the on_failure_jumps of the alternatives,
5053 seeing if any of the alternatives cannot match nothing.
5054 The last alternative starts with only a jump,
5055 whereas the rest start with on_failure_jump and end
5056 with a jump, e.g., here is the pattern for `a|b|c':
5057
5058 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
5059 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
5060 /exactn/1/c
5061
5062 So, we have to first go through the first (n-1)
5063 alternatives and then deal with the last one separately. */
5064
5065
5066 /* Deal with the first (n-1) alternatives, which start
5067 with an on_failure_jump (see above) that jumps to right
5068 past a jump_past_alt. */
5069
5070 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
5071 {
5072 /* `mcnt' holds how many bytes long the alternative
5073 is, including the ending `jump_past_alt' and
5074 its number. */
5075
5076 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
5077 reg_info))
5078 return false;
5079
5080 /* Move to right after this alternative, including the
5081 jump_past_alt. */
5082 p1 += mcnt;
5083
5084 /* Break if it's the beginning of an n-th alternative
5085 that doesn't begin with an on_failure_jump. */
5086 if ((re_opcode_t) *p1 != on_failure_jump)
5087 break;
5088
5089 /* Still have to check that it's not an n-th
5090 alternative that starts with an on_failure_jump. */
5091 p1++;
5092 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5093 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
5094 {
5095 /* Get to the beginning of the n-th alternative. */
5096 p1 -= 3;
5097 break;
5098 }
5099 }
5100
5101 /* Deal with the last alternative: go back and get number
5102 of the `jump_past_alt' just before it. `mcnt' contains
5103 the length of the alternative. */
5104 EXTRACT_NUMBER (mcnt, p1 - 2);
5105
5106 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
5107 return false;
5108
5109 p1 += mcnt; /* Get past the n-th alternative. */
5110 } /* if mcnt > 0 */
5111 break;
5112
5113
5114 case stop_memory:
5115 assert (p1[1] == **p);
5116 *p = p1 + 2;
5117 return true;
5118
5119
5120 default:
5121 if (!common_op_match_null_string_p (&p1, end, reg_info))
5122 return false;
5123 }
5124 } /* while p1 < end */
5125
5126 return false;
5127 } /* group_match_null_string_p */
5128
5129
5130 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
5131 It expects P to be the first byte of a single alternative and END one
5132 byte past the last. The alternative can contain groups. */
5133
5134 static boolean
5135 alt_match_null_string_p (unsigned char *p, unsigned char *end,
5136 register_info_type *reg_info)
5137 {
5138 int mcnt;
5139 unsigned char *p1 = p;
5140
5141 while (p1 < end)
5142 {
5143 /* Skip over opcodes that can match nothing, and break when we get
5144 to one that can't. */
5145
5146 switch ((re_opcode_t) *p1)
5147 {
5148 /* It's a loop. */
5149 case on_failure_jump:
5150 p1++;
5151 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5152 p1 += mcnt;
5153 break;
5154
5155 default:
5156 if (!common_op_match_null_string_p (&p1, end, reg_info))
5157 return false;
5158 }
5159 } /* while p1 < end */
5160
5161 return true;
5162 } /* alt_match_null_string_p */
5163
5164
5165 /* Deals with the ops common to group_match_null_string_p and
5166 alt_match_null_string_p.
5167
5168 Sets P to one after the op and its arguments, if any. */
5169
5170 static boolean
5171 common_op_match_null_string_p (unsigned char **p, unsigned char *end,
5172 register_info_type *reg_info)
5173 {
5174 int mcnt;
5175 boolean ret;
5176 int reg_no;
5177 unsigned char *p1 = *p;
5178
5179 switch ((re_opcode_t) *p1++)
5180 {
5181 case no_op:
5182 case begline:
5183 case endline:
5184 case begbuf:
5185 case endbuf:
5186 case wordbeg:
5187 case wordend:
5188 case wordbound:
5189 case notwordbound:
5190 #ifdef emacs
5191 case before_dot:
5192 case at_dot:
5193 case after_dot:
5194 #endif
5195 break;
5196
5197 case start_memory:
5198 reg_no = *p1;
5199 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
5200 ret = group_match_null_string_p (&p1, end, reg_info);
5201
5202 /* Have to set this here in case we're checking a group which
5203 contains a group and a back reference to it. */
5204
5205 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
5206 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
5207
5208 if (!ret)
5209 return false;
5210 break;
5211
5212 /* If this is an optimized succeed_n for zero times, make the jump. */
5213 case jump:
5214 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5215 if (mcnt >= 0)
5216 p1 += mcnt;
5217 else
5218 return false;
5219 break;
5220
5221 case succeed_n:
5222 /* Get to the number of times to succeed. */
5223 p1 += 2;
5224 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5225
5226 if (mcnt == 0)
5227 {
5228 p1 -= 4;
5229 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5230 p1 += mcnt;
5231 }
5232 else
5233 return false;
5234 break;
5235
5236 case duplicate:
5237 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
5238 return false;
5239 break;
5240
5241 case set_number_at:
5242 p1 += 4;
5243
5244 default:
5245 /* All other opcodes mean we cannot match the empty string. */
5246 return false;
5247 }
5248
5249 *p = p1;
5250 return true;
5251 } /* common_op_match_null_string_p */
5252
5253
5254 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
5255 bytes; nonzero otherwise. */
5256
5257 static int
5258 bcmp_translate (CONST unsigned char *s1, CONST unsigned char *s2,
5259 register int len, char *translate)
5260 {
5261 register CONST unsigned char *p1 = s1, *p2 = s2;
5262 while (len)
5263 {
5264 if (translate[*p1++] != translate[*p2++]) return 1;
5265 len--;
5266 }
5267 return 0;
5268 }
5269
5270 /* Entry points for GNU code. */
5271
5272 /* re_compile_pattern is the GNU regular expression compiler: it
5273 compiles PATTERN (of length SIZE) and puts the result in BUFP.
5274 Returns 0 if the pattern was valid, otherwise an error string.
5275
5276 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
5277 are set in BUFP on entry.
5278
5279 We call regex_compile to do the actual compilation. */
5280
5281 CONST char *
5282 re_compile_pattern (CONST char *pattern, int length,
5283 struct re_pattern_buffer *bufp)
5284 {
5285 reg_errcode_t ret;
5286
5287 /* GNU code is written to assume at least RE_NREGS registers will be set
5288 (and at least one extra will be -1). */
5289 bufp->regs_allocated = REGS_UNALLOCATED;
5290
5291 /* And GNU code determines whether or not to get register information
5292 by passing null for the REGS argument to re_match, etc., not by
5293 setting no_sub. */
5294 bufp->no_sub = 0;
5295
5296 /* Match anchors at newline. */
5297 bufp->newline_anchor = 1;
5298
5299 ret = regex_compile (pattern, length, re_syntax_options, bufp);
5300
5301 if (!ret)
5302 return NULL;
5303 return gettext (re_error_msgid[(int) ret]);
5304 }
5305
5306 /* Entry points compatible with 4.2 BSD regex library. We don't define
5307 them unless specifically requested. */
5308
5309 #ifdef _REGEX_RE_COMP
5310
5311 /* BSD has one and only one pattern buffer. */
5312 static struct re_pattern_buffer re_comp_buf;
5313
5314 char *
5315 re_comp (CONST char *s)
5316 {
5317 reg_errcode_t ret;
5318
5319 if (!s)
5320 {
5321 if (!re_comp_buf.buffer)
5322 return gettext ("No previous regular expression");
5323 return 0;
5324 }
5325
5326 if (!re_comp_buf.buffer)
5327 {
5328 re_comp_buf.buffer = (unsigned char *) malloc (200);
5329 if (re_comp_buf.buffer == NULL)
5330 return gettext (re_error_msgid[(int) REG_ESPACE]);
5331 re_comp_buf.allocated = 200;
5332
5333 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
5334 if (re_comp_buf.fastmap == NULL)
5335 return gettext (re_error_msgid[(int) REG_ESPACE]);
5336 }
5337
5338 /* Since `re_exec' always passes NULL for the `regs' argument, we
5339 don't need to initialize the pattern buffer fields which affect it. */
5340
5341 /* Match anchors at newlines. */
5342 re_comp_buf.newline_anchor = 1;
5343
5344 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5345
5346 if (!ret)
5347 return NULL;
5348
5349 /* Yes, we're discarding `CONST' here if !HAVE_LIBINTL. */
5350 return (char *) gettext (re_error_msgid[(int) ret]);
5351 }
5352
5353
5354 int
5355 re_exec (CONST char *s)
5356 {
5357 CONST int len = strlen (s);
5358 return
5359 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
5360 }
5361 #endif /* _REGEX_RE_COMP */
5362
5363 /* POSIX.2 functions. Don't define these for Emacs. */
5364
5365 #ifndef emacs
5366
5367 /* regcomp takes a regular expression as a string and compiles it.
5368
5369 PREG is a regex_t *. We do not expect any fields to be initialized,
5370 since POSIX says we shouldn't. Thus, we set
5371
5372 `buffer' to the compiled pattern;
5373 `used' to the length of the compiled pattern;
5374 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
5375 REG_EXTENDED bit in CFLAGS is set; otherwise, to
5376 RE_SYNTAX_POSIX_BASIC;
5377 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
5378 `fastmap' and `fastmap_accurate' to zero;
5379 `re_nsub' to the number of subexpressions in PATTERN.
5380
5381 PATTERN is the address of the pattern string.
5382
5383 CFLAGS is a series of bits which affect compilation.
5384
5385 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
5386 use POSIX basic syntax.
5387
5388 If REG_NEWLINE is set, then . and [^...] don't match newline.
5389 Also, regexec will try a match beginning after every newline.
5390
5391 If REG_ICASE is set, then we considers upper- and lowercase
5392 versions of letters to be equivalent when matching.
5393
5394 If REG_NOSUB is set, then when PREG is passed to regexec, that
5395 routine will report only success or failure, and nothing about the
5396 registers.
5397
5398 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
5399 the return codes and their meanings.) */
5400
5401 int
5402 regcomp (regex_t *preg, CONST char *pattern, int cflags)
5403 {
5404 reg_errcode_t ret;
5405 unsigned syntax
5406 = (cflags & REG_EXTENDED) ?
5407 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
5408
5409 /* regex_compile will allocate the space for the compiled pattern. */
5410 preg->buffer = 0;
5411 preg->allocated = 0;
5412 preg->used = 0;
5413
5414 /* Don't bother to use a fastmap when searching. This simplifies the
5415 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
5416 characters after newlines into the fastmap. This way, we just try
5417 every character. */
5418 preg->fastmap = 0;
5419
5420 if (cflags & REG_ICASE)
5421 {
5422 unsigned i;
5423
5424 preg->translate = (char *) malloc (CHAR_SET_SIZE);
5425 if (preg->translate == NULL)
5426 return (int) REG_ESPACE;
5427
5428 /* Map uppercase characters to corresponding lowercase ones. */
5429 for (i = 0; i < CHAR_SET_SIZE; i++)
5430 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
5431 }
5432 else
5433 preg->translate = NULL;
5434
5435 /* If REG_NEWLINE is set, newlines are treated differently. */
5436 if (cflags & REG_NEWLINE)
5437 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
5438 syntax &= ~RE_DOT_NEWLINE;
5439 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
5440 /* It also changes the matching behavior. */
5441 preg->newline_anchor = 1;
5442 }
5443 else
5444 preg->newline_anchor = 0;
5445
5446 preg->no_sub = !!(cflags & REG_NOSUB);
5447
5448 /* POSIX says a null character in the pattern terminates it, so we
5449 can use strlen here in compiling the pattern. */
5450 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
5451
5452 /* POSIX doesn't distinguish between an unmatched open-group and an
5453 unmatched close-group: both are REG_EPAREN. */
5454 if (ret == REG_ERPAREN) ret = REG_EPAREN;
5455
5456 return (int) ret;
5457 }
5458
5459
5460 /* regexec searches for a given pattern, specified by PREG, in the
5461 string STRING.
5462
5463 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
5464 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
5465 least NMATCH elements, and we set them to the offsets of the
5466 corresponding matched substrings.
5467
5468 EFLAGS specifies `execution flags' which affect matching: if
5469 REG_NOTBOL is set, then ^ does not match at the beginning of the
5470 string; if REG_NOTEOL is set, then $ does not match at the end.
5471
5472 We return 0 if we find a match and REG_NOMATCH if not. */
5473
5474 int
5475 regexec (CONST regex_t *preg, CONST char *string, size_t nmatch,
5476 regmatch_t pmatch[], int eflags)
5477 {
5478 int ret;
5479 struct re_registers regs;
5480 regex_t private_preg;
5481 int len = strlen (string);
5482 boolean want_reg_info = !preg->no_sub && nmatch > 0;
5483
5484 private_preg = *preg;
5485
5486 private_preg.not_bol = !!(eflags & REG_NOTBOL);
5487 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5488
5489 /* The user has told us exactly how many registers to return
5490 information about, via `nmatch'. We have to pass that on to the
5491 matching routines. */
5492 private_preg.regs_allocated = REGS_FIXED;
5493
5494 if (want_reg_info)
5495 {
5496 regs.num_regs = nmatch;
5497 regs.start = TALLOC (nmatch, regoff_t);
5498 regs.end = TALLOC (nmatch, regoff_t);
5499 if (regs.start == NULL || regs.end == NULL)
5500 return (int) REG_NOMATCH;
5501 }
5502
5503 /* Perform the searching operation. */
5504 ret = re_search (&private_preg, string, len,
5505 /* start: */ 0, /* range: */ len,
5506 want_reg_info ? &regs : (struct re_registers *) 0);
5507
5508 /* Copy the register information to the POSIX structure. */
5509 if (want_reg_info)
5510 {
5511 if (ret >= 0)
5512 {
5513 unsigned r;
5514
5515 for (r = 0; r < nmatch; r++)
5516 {
5517 pmatch[r].rm_so = regs.start[r];
5518 pmatch[r].rm_eo = regs.end[r];
5519 }
5520 }
5521
5522 /* If we needed the temporary register info, free the space now. */
5523 free (regs.start);
5524 free (regs.end);
5525 }
5526
5527 /* We want zero return to mean success, unlike `re_search'. */
5528 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
5529 }
5530
5531
5532 /* Returns a message corresponding to an error code, ERRCODE, returned
5533 from either regcomp or regexec. We don't use PREG here. */
5534
5535 size_t
5536 regerror (int errcode, CONST regex_t *preg, char *errbuf, size_t errbuf_size)
5537 {
5538 CONST char *msg;
5539 size_t msg_size;
5540
5541 if (errcode < 0
5542 || errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5543 /* Only error codes returned by the rest of the code should be passed
5544 to this routine. If we are given anything else, or if other regex
5545 code generates an invalid error code, then the program has a bug.
5546 Dump core so we can fix it. */
5547 abort ();
5548
5549 msg = gettext (re_error_msgid[errcode]);
5550
5551 msg_size = strlen (msg) + 1; /* Includes the null. */
5552
5553 if (errbuf_size != 0)
5554 {
5555 if (msg_size > errbuf_size)
5556 {
5557 strncpy (errbuf, msg, errbuf_size - 1);
5558 errbuf[errbuf_size - 1] = 0;
5559 }
5560 else
5561 strcpy (errbuf, msg);
5562 }
5563
5564 return msg_size;
5565 }
5566
5567
5568 /* Free dynamically allocated space used by PREG. */
5569
5570 void
5571 regfree (regex_t *preg)
5572 {
5573 if (preg->buffer != NULL)
5574 free (preg->buffer);
5575 preg->buffer = NULL;
5576
5577 preg->allocated = 0;
5578 preg->used = 0;
5579
5580 if (preg->fastmap != NULL)
5581 free (preg->fastmap);
5582 preg->fastmap = NULL;
5583 preg->fastmap_accurate = 0;
5584
5585 if (preg->translate != NULL)
5586 free (preg->translate);
5587 preg->translate = NULL;
5588 }
5589
5590 #endif /* not emacs */
5591
5592 /*
5593 Local variables:
5594 make-backup-files: t
5595 version-control: t
5596 trim-versions-without-asking: nil
5597 End:
5598 */