comparison src/regex.c @ 428:3ecd8885ac67 r21-2-22

Import from CVS: tag r21-2-22
author cvs
date Mon, 13 Aug 2007 11:28:15 +0200
parents
children 9d177e8d4150
comparison
equal deleted inserted replaced
427:0a0253eac470 428:3ecd8885ac67
1 /* Extended regular expression matching and search library,
2 version 0.12, extended for XEmacs.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
5
6 Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
7 Copyright (C) 1995 Sun Microsystems, Inc.
8 Copyright (C) 1995 Ben Wing.
9
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with this program; see the file COPYING. If not, write to
22 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 Boston, MA 02111-1307, USA. */
24
25 /* Synched up with: FSF 19.29. */
26
27 /* Changes made for XEmacs:
28
29 (1) the REGEX_BEGLINE_CHECK code from the XEmacs v18 regex routines
30 was added. This causes a huge speedup in font-locking.
31 (2) Rel-alloc is disabled when the MMAP version of rel-alloc is
32 being used, because it's too slow -- all those calls to mmap()
33 add humongous overhead.
34 (3) Lots and lots of changes for Mule. They are bracketed by
35 `#ifdef MULE' or with comments that have `XEmacs' in them.
36 */
37
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #ifndef REGISTER /* Rigidly enforced as of 20.3 */
43 #define REGISTER
44 #endif
45
46 #ifndef _GNU_SOURCE
47 #define _GNU_SOURCE 1
48 #endif
49
50 /* We assume non-Mule if emacs isn't defined. */
51 #ifndef emacs
52 #undef MULE
53 #endif
54
55 /* We need this for `regex.h', and perhaps for the Emacs include files. */
56 #include <sys/types.h>
57
58 /* This is for other GNU distributions with internationalized messages. */
59 #if defined (I18N3) && (defined (HAVE_LIBINTL_H) || defined (_LIBC))
60 # include <libintl.h>
61 #else
62 # define gettext(msgid) (msgid)
63 #endif
64
65 /* XEmacs: define this to add in a speedup for patterns anchored at
66 the beginning of a line. Keep the ifdefs so that it's easier to
67 tell where/why this code has diverged from v19. */
68 #define REGEX_BEGLINE_CHECK
69
70 /* XEmacs: the current mmap-based ralloc handles small blocks very
71 poorly, so we disable it here. */
72
73 #if (defined (REL_ALLOC) && defined (HAVE_MMAP)) || defined(DOUG_LEA_MALLOC)
74 # undef REL_ALLOC
75 #endif
76
77 /* The `emacs' switch turns on certain matching commands
78 that make sense only in Emacs. */
79 #ifdef emacs
80
81 #include "lisp.h"
82 #include "buffer.h"
83 #include "syntax.h"
84
85 #if (defined (DEBUG_XEMACS) && !defined (DEBUG))
86 #define DEBUG
87 #endif
88
89 #ifdef MULE
90
91 Lisp_Object Vthe_lisp_rangetab;
92
93 void
94 complex_vars_of_regex (void)
95 {
96 Vthe_lisp_rangetab = Fmake_range_table ();
97 staticpro (&Vthe_lisp_rangetab);
98 }
99
100 #else /* not MULE */
101
102 void
103 complex_vars_of_regex (void)
104 {
105 }
106
107 #endif /* not MULE */
108
109 #else /* not emacs */
110
111 /* If we are not linking with Emacs proper,
112 we can't use the relocating allocator
113 even if config.h says that we can. */
114 #undef REL_ALLOC
115
116 #if defined (STDC_HEADERS) || defined (_LIBC)
117 #include <stdlib.h>
118 #else
119 char *malloc ();
120 char *realloc ();
121 #endif
122
123 #define charptr_emchar(str) ((Emchar) (str)[0])
124
125 #if (LONGBITS > INTBITS)
126 # define EMACS_INT long
127 #else
128 # define EMACS_INT int
129 #endif
130
131 typedef int Emchar;
132
133 #define INC_CHARPTR(p) ((p)++)
134 #define DEC_CHARPTR(p) ((p)--)
135
136 #include <string.h>
137
138 /* Define the syntax stuff for \<, \>, etc. */
139
140 /* This must be nonzero for the wordchar and notwordchar pattern
141 commands in re_match_2. */
142 #ifndef Sword
143 #define Sword 1
144 #endif
145
146 #ifdef SYNTAX_TABLE
147
148 extern char *re_syntax_table;
149
150 #else /* not SYNTAX_TABLE */
151
152 /* How many characters in the character set. */
153 #define CHAR_SET_SIZE 256
154
155 static char re_syntax_table[CHAR_SET_SIZE];
156
157 static void
158 init_syntax_once (void)
159 {
160 static int done = 0;
161
162 if (!done)
163 {
164 CONST char *word_syntax_chars =
165 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
166
167 memset (re_syntax_table, 0, sizeof (re_syntax_table));
168
169 while (*word_syntax_chars)
170 re_syntax_table[(unsigned int)(*word_syntax_chars++)] = Sword;
171
172 done = 1;
173 }
174 }
175
176 #endif /* not SYNTAX_TABLE */
177
178 #define SYNTAX_UNSAFE(ignored, c) re_syntax_table[c]
179
180 #endif /* not emacs */
181
182 /* Under XEmacs, this is needed because we don't define it elsewhere. */
183 #ifdef SWITCH_ENUM_BUG
184 #define SWITCH_ENUM_CAST(x) ((int)(x))
185 #else
186 #define SWITCH_ENUM_CAST(x) (x)
187 #endif
188
189
190 /* Get the interface, including the syntax bits. */
191 #include "regex.h"
192
193 /* isalpha etc. are used for the character classes. */
194 #include <ctype.h>
195
196 /* Jim Meyering writes:
197
198 "... Some ctype macros are valid only for character codes that
199 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
200 using /bin/cc or gcc but without giving an ansi option). So, all
201 ctype uses should be through macros like ISPRINT... If
202 STDC_HEADERS is defined, then autoconf has verified that the ctype
203 macros don't need to be guarded with references to isascii. ...
204 Defining isascii to 1 should let any compiler worth its salt
205 eliminate the && through constant folding." */
206
207 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
208 #define ISASCII_1(c) 1
209 #else
210 #define ISASCII_1(c) isascii(c)
211 #endif
212
213 #ifdef MULE
214 /* The IS*() macros can be passed any character, including an extended
215 one. We need to make sure there are no crashes, which would occur
216 otherwise due to out-of-bounds array references. */
217 #define ISASCII(c) (((EMACS_UINT) (c)) < 0x100 && ISASCII_1 (c))
218 #else
219 #define ISASCII(c) ISASCII_1 (c)
220 #endif /* MULE */
221
222 #ifdef isblank
223 #define ISBLANK(c) (ISASCII (c) && isblank (c))
224 #else
225 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
226 #endif
227 #ifdef isgraph
228 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
229 #else
230 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
231 #endif
232
233 #define ISPRINT(c) (ISASCII (c) && isprint (c))
234 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
235 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
236 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
237 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
238 #define ISLOWER(c) (ISASCII (c) && islower (c))
239 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
240 #define ISSPACE(c) (ISASCII (c) && isspace (c))
241 #define ISUPPER(c) (ISASCII (c) && isupper (c))
242 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
243
244 #ifndef NULL
245 #define NULL (void *)0
246 #endif
247
248 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
249 since ours (we hope) works properly with all combinations of
250 machines, compilers, `char' and `unsigned char' argument types.
251 (Per Bothner suggested the basic approach.) */
252 #undef SIGN_EXTEND_CHAR
253 #if __STDC__
254 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
255 #else /* not __STDC__ */
256 /* As in Harbison and Steele. */
257 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
258 #endif
259
260 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
261 use `alloca' instead of `malloc'. This is because using malloc in
262 re_search* or re_match* could cause memory leaks when C-g is used in
263 Emacs; also, malloc is slower and causes storage fragmentation. On
264 the other hand, malloc is more portable, and easier to debug.
265
266 Because we sometimes use alloca, some routines have to be macros,
267 not functions -- `alloca'-allocated space disappears at the end of the
268 function it is called in. */
269
270 #ifdef REGEX_MALLOC
271
272 #define REGEX_ALLOCATE malloc
273 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
274 #define REGEX_FREE free
275
276 #else /* not REGEX_MALLOC */
277
278 /* Emacs already defines alloca, sometimes. */
279 #ifndef alloca
280
281 /* Make alloca work the best possible way. */
282 #ifdef __GNUC__
283 #define alloca __builtin_alloca
284 #else /* not __GNUC__ */
285 #if HAVE_ALLOCA_H
286 #include <alloca.h>
287 #else /* not __GNUC__ or HAVE_ALLOCA_H */
288 #ifndef _AIX /* Already did AIX, up at the top. */
289 char *alloca ();
290 #endif /* not _AIX */
291 #endif /* not HAVE_ALLOCA_H */
292 #endif /* not __GNUC__ */
293
294 #endif /* not alloca */
295
296 #define REGEX_ALLOCATE alloca
297
298 /* Assumes a `char *destination' variable. */
299 #define REGEX_REALLOCATE(source, osize, nsize) \
300 (destination = (char *) alloca (nsize), \
301 memmove (destination, source, osize), \
302 destination)
303
304 /* No need to do anything to free, after alloca. */
305 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
306
307 #endif /* not REGEX_MALLOC */
308
309 /* Define how to allocate the failure stack. */
310
311 #ifdef REL_ALLOC
312 #define REGEX_ALLOCATE_STACK(size) \
313 r_alloc ((char **) &failure_stack_ptr, (size))
314 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
315 r_re_alloc ((char **) &failure_stack_ptr, (nsize))
316 #define REGEX_FREE_STACK(ptr) \
317 r_alloc_free ((void **) &failure_stack_ptr)
318
319 #else /* not REL_ALLOC */
320
321 #ifdef REGEX_MALLOC
322
323 #define REGEX_ALLOCATE_STACK malloc
324 #define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
325 #define REGEX_FREE_STACK free
326
327 #else /* not REGEX_MALLOC */
328
329 #define REGEX_ALLOCATE_STACK alloca
330
331 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
332 REGEX_REALLOCATE (source, osize, nsize)
333 /* No need to explicitly free anything. */
334 #define REGEX_FREE_STACK(arg)
335
336 #endif /* not REGEX_MALLOC */
337 #endif /* not REL_ALLOC */
338
339
340 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
341 `string1' or just past its end. This works if PTR is NULL, which is
342 a good thing. */
343 #define FIRST_STRING_P(ptr) \
344 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
345
346 /* (Re)Allocate N items of type T using malloc, or fail. */
347 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
348 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
349 #define RETALLOC_IF(addr, n, t) \
350 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
351 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
352
353 #define BYTEWIDTH 8 /* In bits. */
354
355 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
356
357 #undef MAX
358 #undef MIN
359 #define MAX(a, b) ((a) > (b) ? (a) : (b))
360 #define MIN(a, b) ((a) < (b) ? (a) : (b))
361
362 typedef char boolean;
363 #define false 0
364 #define true 1
365
366
367 /* These are the command codes that appear in compiled regular
368 expressions. Some opcodes are followed by argument bytes. A
369 command code can specify any interpretation whatsoever for its
370 arguments. Zero bytes may appear in the compiled regular expression. */
371
372 typedef enum
373 {
374 no_op = 0,
375
376 /* Succeed right away--no more backtracking. */
377 succeed,
378
379 /* Followed by one byte giving n, then by n literal bytes. */
380 exactn,
381
382 /* Matches any (more or less) character. */
383 anychar,
384
385 /* Matches any one char belonging to specified set. First
386 following byte is number of bitmap bytes. Then come bytes
387 for a bitmap saying which chars are in. Bits in each byte
388 are ordered low-bit-first. A character is in the set if its
389 bit is 1. A character too large to have a bit in the map is
390 automatically not in the set. */
391 charset,
392
393 /* Same parameters as charset, but match any character that is
394 not one of those specified. */
395 charset_not,
396
397 /* Start remembering the text that is matched, for storing in a
398 register. Followed by one byte with the register number, in
399 the range 0 to one less than the pattern buffer's re_nsub
400 field. Then followed by one byte with the number of groups
401 inner to this one. (This last has to be part of the
402 start_memory only because we need it in the on_failure_jump
403 of re_match_2.) */
404 start_memory,
405
406 /* Stop remembering the text that is matched and store it in a
407 memory register. Followed by one byte with the register
408 number, in the range 0 to one less than `re_nsub' in the
409 pattern buffer, and one byte with the number of inner groups,
410 just like `start_memory'. (We need the number of inner
411 groups here because we don't have any easy way of finding the
412 corresponding start_memory when we're at a stop_memory.) */
413 stop_memory,
414
415 /* Match a duplicate of something remembered. Followed by one
416 byte containing the register number. */
417 duplicate,
418
419 /* Fail unless at beginning of line. */
420 begline,
421
422 /* Fail unless at end of line. */
423 endline,
424
425 /* Succeeds if at beginning of buffer (if emacs) or at beginning
426 of string to be matched (if not). */
427 begbuf,
428
429 /* Analogously, for end of buffer/string. */
430 endbuf,
431
432 /* Followed by two byte relative address to which to jump. */
433 jump,
434
435 /* Same as jump, but marks the end of an alternative. */
436 jump_past_alt,
437
438 /* Followed by two-byte relative address of place to resume at
439 in case of failure. */
440 on_failure_jump,
441
442 /* Like on_failure_jump, but pushes a placeholder instead of the
443 current string position when executed. */
444 on_failure_keep_string_jump,
445
446 /* Throw away latest failure point and then jump to following
447 two-byte relative address. */
448 pop_failure_jump,
449
450 /* Change to pop_failure_jump if know won't have to backtrack to
451 match; otherwise change to jump. This is used to jump
452 back to the beginning of a repeat. If what follows this jump
453 clearly won't match what the repeat does, such that we can be
454 sure that there is no use backtracking out of repetitions
455 already matched, then we change it to a pop_failure_jump.
456 Followed by two-byte address. */
457 maybe_pop_jump,
458
459 /* Jump to following two-byte address, and push a dummy failure
460 point. This failure point will be thrown away if an attempt
461 is made to use it for a failure. A `+' construct makes this
462 before the first repeat. Also used as an intermediary kind
463 of jump when compiling an alternative. */
464 dummy_failure_jump,
465
466 /* Push a dummy failure point and continue. Used at the end of
467 alternatives. */
468 push_dummy_failure,
469
470 /* Followed by two-byte relative address and two-byte number n.
471 After matching N times, jump to the address upon failure. */
472 succeed_n,
473
474 /* Followed by two-byte relative address, and two-byte number n.
475 Jump to the address N times, then fail. */
476 jump_n,
477
478 /* Set the following two-byte relative address to the
479 subsequent two-byte number. The address *includes* the two
480 bytes of number. */
481 set_number_at,
482
483 wordchar, /* Matches any word-constituent character. */
484 notwordchar, /* Matches any char that is not a word-constituent. */
485
486 wordbeg, /* Succeeds if at word beginning. */
487 wordend, /* Succeeds if at word end. */
488
489 wordbound, /* Succeeds if at a word boundary. */
490 notwordbound /* Succeeds if not at a word boundary. */
491
492 #ifdef emacs
493 ,before_dot, /* Succeeds if before point. */
494 at_dot, /* Succeeds if at point. */
495 after_dot, /* Succeeds if after point. */
496
497 /* Matches any character whose syntax is specified. Followed by
498 a byte which contains a syntax code, e.g., Sword. */
499 syntaxspec,
500
501 /* Matches any character whose syntax is not that specified. */
502 notsyntaxspec
503
504 #endif /* emacs */
505
506 #ifdef MULE
507 /* need extra stuff to be able to properly work with XEmacs/Mule
508 characters (which may take up more than one byte) */
509
510 ,charset_mule, /* Matches any character belonging to specified set.
511 The set is stored in "unified range-table
512 format"; see rangetab.c. Unlike the `charset'
513 opcode, this can handle arbitrary characters. */
514
515 charset_mule_not /* Same parameters as charset_mule, but match any
516 character that is not one of those specified. */
517
518 /* 97/2/17 jhod: The following two were merged back in from the Mule
519 2.3 code to enable some language specific processing */
520 ,categoryspec, /* Matches entries in the character category tables */
521 notcategoryspec /* The opposite of the above */
522 #endif /* MULE */
523
524 } re_opcode_t;
525
526 /* Common operations on the compiled pattern. */
527
528 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
529
530 #define STORE_NUMBER(destination, number) \
531 do { \
532 (destination)[0] = (number) & 0377; \
533 (destination)[1] = (number) >> 8; \
534 } while (0)
535
536 /* Same as STORE_NUMBER, except increment DESTINATION to
537 the byte after where the number is stored. Therefore, DESTINATION
538 must be an lvalue. */
539
540 #define STORE_NUMBER_AND_INCR(destination, number) \
541 do { \
542 STORE_NUMBER (destination, number); \
543 (destination) += 2; \
544 } while (0)
545
546 /* Put into DESTINATION a number stored in two contiguous bytes starting
547 at SOURCE. */
548
549 #define EXTRACT_NUMBER(destination, source) \
550 do { \
551 (destination) = *(source) & 0377; \
552 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
553 } while (0)
554
555 #ifdef DEBUG
556 static void
557 extract_number (int *dest, unsigned char *source)
558 {
559 int temp = SIGN_EXTEND_CHAR (*(source + 1));
560 *dest = *source & 0377;
561 *dest += temp << 8;
562 }
563
564 #ifndef EXTRACT_MACROS /* To debug the macros. */
565 #undef EXTRACT_NUMBER
566 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
567 #endif /* not EXTRACT_MACROS */
568
569 #endif /* DEBUG */
570
571 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
572 SOURCE must be an lvalue. */
573
574 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
575 do { \
576 EXTRACT_NUMBER (destination, source); \
577 (source) += 2; \
578 } while (0)
579
580 #ifdef DEBUG
581 static void
582 extract_number_and_incr (int *destination, unsigned char **source)
583 {
584 extract_number (destination, *source);
585 *source += 2;
586 }
587
588 #ifndef EXTRACT_MACROS
589 #undef EXTRACT_NUMBER_AND_INCR
590 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
591 extract_number_and_incr (&dest, &src)
592 #endif /* not EXTRACT_MACROS */
593
594 #endif /* DEBUG */
595
596 /* If DEBUG is defined, Regex prints many voluminous messages about what
597 it is doing (if the variable `debug' is nonzero). If linked with the
598 main program in `iregex.c', you can enter patterns and strings
599 interactively. And if linked with the main program in `main.c' and
600 the other test files, you can run the already-written tests. */
601
602 #if defined (DEBUG)
603
604 /* We use standard I/O for debugging. */
605 #include <stdio.h>
606
607 #ifndef emacs
608 /* XEmacs provides its own version of assert() */
609 /* It is useful to test things that ``must'' be true when debugging. */
610 #include <assert.h>
611 #endif
612
613 static int debug = 0;
614
615 #define DEBUG_STATEMENT(e) e
616 #define DEBUG_PRINT1(x) if (debug) printf (x)
617 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
618 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
619 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
620 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
621 if (debug) print_partial_compiled_pattern (s, e)
622 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
623 if (debug) print_double_string (w, s1, sz1, s2, sz2)
624
625
626 /* Print the fastmap in human-readable form. */
627
628 static void
629 print_fastmap (char *fastmap)
630 {
631 unsigned was_a_range = 0;
632 unsigned i = 0;
633
634 while (i < (1 << BYTEWIDTH))
635 {
636 if (fastmap[i++])
637 {
638 was_a_range = 0;
639 putchar (i - 1);
640 while (i < (1 << BYTEWIDTH) && fastmap[i])
641 {
642 was_a_range = 1;
643 i++;
644 }
645 if (was_a_range)
646 {
647 putchar ('-');
648 putchar (i - 1);
649 }
650 }
651 }
652 putchar ('\n');
653 }
654
655
656 /* Print a compiled pattern string in human-readable form, starting at
657 the START pointer into it and ending just before the pointer END. */
658
659 static void
660 print_partial_compiled_pattern (unsigned char *start, unsigned char *end)
661 {
662 int mcnt, mcnt2;
663 unsigned char *p = start;
664 unsigned char *pend = end;
665
666 if (start == NULL)
667 {
668 puts ("(null)");
669 return;
670 }
671
672 /* Loop over pattern commands. */
673 while (p < pend)
674 {
675 printf ("%ld:\t", (long)(p - start));
676
677 switch ((re_opcode_t) *p++)
678 {
679 case no_op:
680 printf ("/no_op");
681 break;
682
683 case exactn:
684 mcnt = *p++;
685 printf ("/exactn/%d", mcnt);
686 do
687 {
688 putchar ('/');
689 putchar (*p++);
690 }
691 while (--mcnt);
692 break;
693
694 case start_memory:
695 mcnt = *p++;
696 printf ("/start_memory/%d/%d", mcnt, *p++);
697 break;
698
699 case stop_memory:
700 mcnt = *p++;
701 printf ("/stop_memory/%d/%d", mcnt, *p++);
702 break;
703
704 case duplicate:
705 printf ("/duplicate/%d", *p++);
706 break;
707
708 case anychar:
709 printf ("/anychar");
710 break;
711
712 case charset:
713 case charset_not:
714 {
715 REGISTER int c, last = -100;
716 REGISTER int in_range = 0;
717
718 printf ("/charset [%s",
719 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
720
721 assert (p + *p < pend);
722
723 for (c = 0; c < 256; c++)
724 if (((unsigned char) (c / 8) < *p)
725 && (p[1 + (c/8)] & (1 << (c % 8))))
726 {
727 /* Are we starting a range? */
728 if (last + 1 == c && ! in_range)
729 {
730 putchar ('-');
731 in_range = 1;
732 }
733 /* Have we broken a range? */
734 else if (last + 1 != c && in_range)
735 {
736 putchar (last);
737 in_range = 0;
738 }
739
740 if (! in_range)
741 putchar (c);
742
743 last = c;
744 }
745
746 if (in_range)
747 putchar (last);
748
749 putchar (']');
750
751 p += 1 + *p;
752 }
753 break;
754
755 #ifdef MULE
756 case charset_mule:
757 case charset_mule_not:
758 {
759 int nentries, i;
760
761 printf ("/charset_mule [%s",
762 (re_opcode_t) *(p - 1) == charset_mule_not ? "^" : "");
763 nentries = unified_range_table_nentries (p);
764 for (i = 0; i < nentries; i++)
765 {
766 EMACS_INT first, last;
767 Lisp_Object dummy_val;
768
769 unified_range_table_get_range (p, i, &first, &last,
770 &dummy_val);
771 if (first < 0x100)
772 putchar (first);
773 else
774 printf ("(0x%lx)", (long)first);
775 if (first != last)
776 {
777 putchar ('-');
778 if (last < 0x100)
779 putchar (last);
780 else
781 printf ("(0x%lx)", (long)last);
782 }
783 }
784 putchar (']');
785 p += unified_range_table_bytes_used (p);
786 }
787 break;
788 #endif
789
790 case begline:
791 printf ("/begline");
792 break;
793
794 case endline:
795 printf ("/endline");
796 break;
797
798 case on_failure_jump:
799 extract_number_and_incr (&mcnt, &p);
800 printf ("/on_failure_jump to %ld", (long)(p + mcnt - start));
801 break;
802
803 case on_failure_keep_string_jump:
804 extract_number_and_incr (&mcnt, &p);
805 printf ("/on_failure_keep_string_jump to %ld", (long)(p + mcnt - start));
806 break;
807
808 case dummy_failure_jump:
809 extract_number_and_incr (&mcnt, &p);
810 printf ("/dummy_failure_jump to %ld", (long)(p + mcnt - start));
811 break;
812
813 case push_dummy_failure:
814 printf ("/push_dummy_failure");
815 break;
816
817 case maybe_pop_jump:
818 extract_number_and_incr (&mcnt, &p);
819 printf ("/maybe_pop_jump to %ld", (long)(p + mcnt - start));
820 break;
821
822 case pop_failure_jump:
823 extract_number_and_incr (&mcnt, &p);
824 printf ("/pop_failure_jump to %ld", (long)(p + mcnt - start));
825 break;
826
827 case jump_past_alt:
828 extract_number_and_incr (&mcnt, &p);
829 printf ("/jump_past_alt to %ld", (long)(p + mcnt - start));
830 break;
831
832 case jump:
833 extract_number_and_incr (&mcnt, &p);
834 printf ("/jump to %ld", (long)(p + mcnt - start));
835 break;
836
837 case succeed_n:
838 extract_number_and_incr (&mcnt, &p);
839 extract_number_and_incr (&mcnt2, &p);
840 printf ("/succeed_n to %ld, %d times", (long)(p + mcnt - start), mcnt2);
841 break;
842
843 case jump_n:
844 extract_number_and_incr (&mcnt, &p);
845 extract_number_and_incr (&mcnt2, &p);
846 printf ("/jump_n to %ld, %d times", (long)(p + mcnt - start), mcnt2);
847 break;
848
849 case set_number_at:
850 extract_number_and_incr (&mcnt, &p);
851 extract_number_and_incr (&mcnt2, &p);
852 printf ("/set_number_at location %ld to %d", (long)(p + mcnt - start), mcnt2);
853 break;
854
855 case wordbound:
856 printf ("/wordbound");
857 break;
858
859 case notwordbound:
860 printf ("/notwordbound");
861 break;
862
863 case wordbeg:
864 printf ("/wordbeg");
865 break;
866
867 case wordend:
868 printf ("/wordend");
869
870 #ifdef emacs
871 case before_dot:
872 printf ("/before_dot");
873 break;
874
875 case at_dot:
876 printf ("/at_dot");
877 break;
878
879 case after_dot:
880 printf ("/after_dot");
881 break;
882
883 case syntaxspec:
884 printf ("/syntaxspec");
885 mcnt = *p++;
886 printf ("/%d", mcnt);
887 break;
888
889 case notsyntaxspec:
890 printf ("/notsyntaxspec");
891 mcnt = *p++;
892 printf ("/%d", mcnt);
893 break;
894
895 #ifdef MULE
896 /* 97/2/17 jhod Mule category patch */
897 case categoryspec:
898 printf ("/categoryspec");
899 mcnt = *p++;
900 printf ("/%d", mcnt);
901 break;
902
903 case notcategoryspec:
904 printf ("/notcategoryspec");
905 mcnt = *p++;
906 printf ("/%d", mcnt);
907 break;
908 /* end of category patch */
909 #endif /* MULE */
910 #endif /* emacs */
911
912 case wordchar:
913 printf ("/wordchar");
914 break;
915
916 case notwordchar:
917 printf ("/notwordchar");
918 break;
919
920 case begbuf:
921 printf ("/begbuf");
922 break;
923
924 case endbuf:
925 printf ("/endbuf");
926 break;
927
928 default:
929 printf ("?%d", *(p-1));
930 }
931
932 putchar ('\n');
933 }
934
935 printf ("%ld:\tend of pattern.\n", (long)(p - start));
936 }
937
938
939 static void
940 print_compiled_pattern (struct re_pattern_buffer *bufp)
941 {
942 unsigned char *buffer = bufp->buffer;
943
944 print_partial_compiled_pattern (buffer, buffer + bufp->used);
945 printf ("%ld bytes used/%ld bytes allocated.\n", bufp->used,
946 bufp->allocated);
947
948 if (bufp->fastmap_accurate && bufp->fastmap)
949 {
950 printf ("fastmap: ");
951 print_fastmap (bufp->fastmap);
952 }
953
954 printf ("re_nsub: %ld\t", (long)bufp->re_nsub);
955 printf ("regs_alloc: %d\t", bufp->regs_allocated);
956 printf ("can_be_null: %d\t", bufp->can_be_null);
957 printf ("newline_anchor: %d\n", bufp->newline_anchor);
958 printf ("no_sub: %d\t", bufp->no_sub);
959 printf ("not_bol: %d\t", bufp->not_bol);
960 printf ("not_eol: %d\t", bufp->not_eol);
961 printf ("syntax: %d\n", bufp->syntax);
962 /* Perhaps we should print the translate table? */
963 /* and maybe the category table? */
964 }
965
966
967 static void
968 print_double_string (CONST char *where, CONST char *string1, int size1,
969 CONST char *string2, int size2)
970 {
971 if (where == NULL)
972 printf ("(null)");
973 else
974 {
975 unsigned int this_char;
976
977 if (FIRST_STRING_P (where))
978 {
979 for (this_char = where - string1; this_char < size1; this_char++)
980 putchar (string1[this_char]);
981
982 where = string2;
983 }
984
985 for (this_char = where - string2; this_char < size2; this_char++)
986 putchar (string2[this_char]);
987 }
988 }
989
990 #else /* not DEBUG */
991
992 #undef assert
993 #define assert(e)
994
995 #define DEBUG_STATEMENT(e)
996 #define DEBUG_PRINT1(x)
997 #define DEBUG_PRINT2(x1, x2)
998 #define DEBUG_PRINT3(x1, x2, x3)
999 #define DEBUG_PRINT4(x1, x2, x3, x4)
1000 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1001 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1002
1003 #endif /* not DEBUG */
1004
1005 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1006 also be assigned to arbitrarily: each pattern buffer stores its own
1007 syntax, so it can be changed between regex compilations. */
1008 /* This has no initializer because initialized variables in Emacs
1009 become read-only after dumping. */
1010 reg_syntax_t re_syntax_options;
1011
1012
1013 /* Specify the precise syntax of regexps for compilation. This provides
1014 for compatibility for various utilities which historically have
1015 different, incompatible syntaxes.
1016
1017 The argument SYNTAX is a bit mask comprised of the various bits
1018 defined in regex.h. We return the old syntax. */
1019
1020 reg_syntax_t
1021 re_set_syntax (reg_syntax_t syntax)
1022 {
1023 reg_syntax_t ret = re_syntax_options;
1024
1025 re_syntax_options = syntax;
1026 return ret;
1027 }
1028
1029 /* This table gives an error message for each of the error codes listed
1030 in regex.h. Obviously the order here has to be same as there.
1031 POSIX doesn't require that we do anything for REG_NOERROR,
1032 but why not be nice? */
1033
1034 static CONST char *re_error_msgid[] =
1035 {
1036 "Success", /* REG_NOERROR */
1037 "No match", /* REG_NOMATCH */
1038 "Invalid regular expression", /* REG_BADPAT */
1039 "Invalid collation character", /* REG_ECOLLATE */
1040 "Invalid character class name", /* REG_ECTYPE */
1041 "Trailing backslash", /* REG_EESCAPE */
1042 "Invalid back reference", /* REG_ESUBREG */
1043 "Unmatched [ or [^", /* REG_EBRACK */
1044 "Unmatched ( or \\(", /* REG_EPAREN */
1045 "Unmatched \\{", /* REG_EBRACE */
1046 "Invalid content of \\{\\}", /* REG_BADBR */
1047 "Invalid range end", /* REG_ERANGE */
1048 "Memory exhausted", /* REG_ESPACE */
1049 "Invalid preceding regular expression", /* REG_BADRPT */
1050 "Premature end of regular expression", /* REG_EEND */
1051 "Regular expression too big", /* REG_ESIZE */
1052 "Unmatched ) or \\)", /* REG_ERPAREN */
1053 #ifdef emacs
1054 "Invalid syntax designator", /* REG_ESYNTAX */
1055 #endif
1056 #ifdef MULE
1057 "Ranges may not span charsets", /* REG_ERANGESPAN */
1058 "Invalid category designator", /* REG_ECATEGORY */
1059 #endif
1060 };
1061
1062 /* Avoiding alloca during matching, to placate r_alloc. */
1063
1064 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1065 searching and matching functions should not call alloca. On some
1066 systems, alloca is implemented in terms of malloc, and if we're
1067 using the relocating allocator routines, then malloc could cause a
1068 relocation, which might (if the strings being searched are in the
1069 ralloc heap) shift the data out from underneath the regexp
1070 routines.
1071
1072 Here's another reason to avoid allocation: Emacs
1073 processes input from X in a signal handler; processing X input may
1074 call malloc; if input arrives while a matching routine is calling
1075 malloc, then we're scrod. But Emacs can't just block input while
1076 calling matching routines; then we don't notice interrupts when
1077 they come in. So, Emacs blocks input around all regexp calls
1078 except the matching calls, which it leaves unprotected, in the
1079 faith that they will not malloc. */
1080
1081 /* Normally, this is fine. */
1082 #define MATCH_MAY_ALLOCATE
1083
1084 /* When using GNU C, we are not REALLY using the C alloca, no matter
1085 what config.h may say. So don't take precautions for it. */
1086 #ifdef __GNUC__
1087 #undef C_ALLOCA
1088 #endif
1089
1090 /* The match routines may not allocate if (1) they would do it with malloc
1091 and (2) it's not safe for them to use malloc.
1092 Note that if REL_ALLOC is defined, matching would not use malloc for the
1093 failure stack, but we would still use it for the register vectors;
1094 so REL_ALLOC should not affect this. */
1095 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1096 #undef MATCH_MAY_ALLOCATE
1097 #endif
1098
1099
1100 /* Failure stack declarations and macros; both re_compile_fastmap and
1101 re_match_2 use a failure stack. These have to be macros because of
1102 REGEX_ALLOCATE_STACK. */
1103
1104
1105 /* Number of failure points for which to initially allocate space
1106 when matching. If this number is exceeded, we allocate more
1107 space, so it is not a hard limit. */
1108 #ifndef INIT_FAILURE_ALLOC
1109 #define INIT_FAILURE_ALLOC 5
1110 #endif
1111
1112 /* Roughly the maximum number of failure points on the stack. Would be
1113 exactly that if always used MAX_FAILURE_SPACE each time we failed.
1114 This is a variable only so users of regex can assign to it; we never
1115 change it ourselves. */
1116 #if defined (MATCH_MAY_ALLOCATE)
1117 /* 4400 was enough to cause a crash on Alpha OSF/1,
1118 whose default stack limit is 2mb. */
1119 int re_max_failures = 20000;
1120 #else
1121 int re_max_failures = 2000;
1122 #endif
1123
1124 union fail_stack_elt
1125 {
1126 unsigned char *pointer;
1127 int integer;
1128 };
1129
1130 typedef union fail_stack_elt fail_stack_elt_t;
1131
1132 typedef struct
1133 {
1134 fail_stack_elt_t *stack;
1135 unsigned size;
1136 unsigned avail; /* Offset of next open position. */
1137 } fail_stack_type;
1138
1139 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1140 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1141 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1142
1143
1144 /* Define macros to initialize and free the failure stack.
1145 Do `return -2' if the alloc fails. */
1146
1147 #ifdef MATCH_MAY_ALLOCATE
1148 #define INIT_FAIL_STACK() \
1149 do { \
1150 fail_stack.stack = (fail_stack_elt_t *) \
1151 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
1152 \
1153 if (fail_stack.stack == NULL) \
1154 return -2; \
1155 \
1156 fail_stack.size = INIT_FAILURE_ALLOC; \
1157 fail_stack.avail = 0; \
1158 } while (0)
1159
1160 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1161 #else
1162 #define INIT_FAIL_STACK() \
1163 do { \
1164 fail_stack.avail = 0; \
1165 } while (0)
1166
1167 #define RESET_FAIL_STACK()
1168 #endif
1169
1170
1171 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1172
1173 Return 1 if succeeds, and 0 if either ran out of memory
1174 allocating space for it or it was already too large.
1175
1176 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1177
1178 #define DOUBLE_FAIL_STACK(fail_stack) \
1179 ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
1180 ? 0 \
1181 : ((fail_stack).stack = (fail_stack_elt_t *) \
1182 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1183 (fail_stack).size * sizeof (fail_stack_elt_t), \
1184 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1185 \
1186 (fail_stack).stack == NULL \
1187 ? 0 \
1188 : ((fail_stack).size <<= 1, \
1189 1)))
1190
1191
1192 /* Push pointer POINTER on FAIL_STACK.
1193 Return 1 if was able to do so and 0 if ran out of memory allocating
1194 space to do so. */
1195 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1196 ((FAIL_STACK_FULL () \
1197 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1198 ? 0 \
1199 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1200 1))
1201
1202 /* Push a pointer value onto the failure stack.
1203 Assumes the variable `fail_stack'. Probably should only
1204 be called from within `PUSH_FAILURE_POINT'. */
1205 #define PUSH_FAILURE_POINTER(item) \
1206 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1207
1208 /* This pushes an integer-valued item onto the failure stack.
1209 Assumes the variable `fail_stack'. Probably should only
1210 be called from within `PUSH_FAILURE_POINT'. */
1211 #define PUSH_FAILURE_INT(item) \
1212 fail_stack.stack[fail_stack.avail++].integer = (item)
1213
1214 /* Push a fail_stack_elt_t value onto the failure stack.
1215 Assumes the variable `fail_stack'. Probably should only
1216 be called from within `PUSH_FAILURE_POINT'. */
1217 #define PUSH_FAILURE_ELT(item) \
1218 fail_stack.stack[fail_stack.avail++] = (item)
1219
1220 /* These three POP... operations complement the three PUSH... operations.
1221 All assume that `fail_stack' is nonempty. */
1222 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1223 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1224 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1225
1226 /* Used to omit pushing failure point id's when we're not debugging. */
1227 #ifdef DEBUG
1228 #define DEBUG_PUSH PUSH_FAILURE_INT
1229 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1230 #else
1231 #define DEBUG_PUSH(item)
1232 #define DEBUG_POP(item_addr)
1233 #endif
1234
1235
1236 /* Push the information about the state we will need
1237 if we ever fail back to it.
1238
1239 Requires variables fail_stack, regstart, regend, reg_info, and
1240 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
1241 declared.
1242
1243 Does `return FAILURE_CODE' if runs out of memory. */
1244
1245 #if !defined (REGEX_MALLOC) && !defined (REL_ALLOC)
1246 #define DECLARE_DESTINATION char *destination;
1247 #else
1248 #define DECLARE_DESTINATION
1249 #endif
1250
1251 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1252 do { \
1253 DECLARE_DESTINATION \
1254 /* Must be int, so when we don't save any registers, the arithmetic \
1255 of 0 + -1 isn't done as unsigned. */ \
1256 int this_reg; \
1257 \
1258 DEBUG_STATEMENT (failure_id++); \
1259 DEBUG_STATEMENT (nfailure_points_pushed++); \
1260 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1261 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
1262 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1263 \
1264 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
1265 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
1266 \
1267 /* Ensure we have enough space allocated for what we will push. */ \
1268 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1269 { \
1270 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1271 return failure_code; \
1272 \
1273 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
1274 (fail_stack).size); \
1275 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1276 } \
1277 \
1278 /* Push the info, starting with the registers. */ \
1279 DEBUG_PRINT1 ("\n"); \
1280 \
1281 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1282 this_reg++) \
1283 { \
1284 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
1285 DEBUG_STATEMENT (num_regs_pushed++); \
1286 \
1287 DEBUG_PRINT2 (" start: 0x%lx\n", (long) regstart[this_reg]); \
1288 PUSH_FAILURE_POINTER (regstart[this_reg]); \
1289 \
1290 DEBUG_PRINT2 (" end: 0x%lx\n", (long) regend[this_reg]); \
1291 PUSH_FAILURE_POINTER (regend[this_reg]); \
1292 \
1293 DEBUG_PRINT2 (" info: 0x%lx\n ", \
1294 * (long *) (&reg_info[this_reg])); \
1295 DEBUG_PRINT2 (" match_null=%d", \
1296 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1297 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1298 DEBUG_PRINT2 (" matched_something=%d", \
1299 MATCHED_SOMETHING (reg_info[this_reg])); \
1300 DEBUG_PRINT2 (" ever_matched=%d", \
1301 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1302 DEBUG_PRINT1 ("\n"); \
1303 PUSH_FAILURE_ELT (reg_info[this_reg].word); \
1304 } \
1305 \
1306 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\
1307 PUSH_FAILURE_INT (lowest_active_reg); \
1308 \
1309 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\
1310 PUSH_FAILURE_INT (highest_active_reg); \
1311 \
1312 DEBUG_PRINT2 (" Pushing pattern 0x%lx: ", (long) pattern_place); \
1313 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1314 PUSH_FAILURE_POINTER (pattern_place); \
1315 \
1316 DEBUG_PRINT2 (" Pushing string 0x%lx: `", (long) string_place); \
1317 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
1318 size2); \
1319 DEBUG_PRINT1 ("'\n"); \
1320 PUSH_FAILURE_POINTER (string_place); \
1321 \
1322 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1323 DEBUG_PUSH (failure_id); \
1324 } while (0)
1325
1326 /* This is the number of items that are pushed and popped on the stack
1327 for each register. */
1328 #define NUM_REG_ITEMS 3
1329
1330 /* Individual items aside from the registers. */
1331 #ifdef DEBUG
1332 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1333 #else
1334 #define NUM_NONREG_ITEMS 4
1335 #endif
1336
1337 /* We push at most this many items on the stack. */
1338 /* We used to use (num_regs - 1), which is the number of registers
1339 this regexp will save; but that was changed to 5
1340 to avoid stack overflow for a regexp with lots of parens. */
1341 #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
1342
1343 /* We actually push this many items. */
1344 #define NUM_FAILURE_ITEMS \
1345 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
1346 + NUM_NONREG_ITEMS)
1347
1348 /* How many items can still be added to the stack without overflowing it. */
1349 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1350
1351
1352 /* Pops what PUSH_FAIL_STACK pushes.
1353
1354 We restore into the parameters, all of which should be lvalues:
1355 STR -- the saved data position.
1356 PAT -- the saved pattern position.
1357 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1358 REGSTART, REGEND -- arrays of string positions.
1359 REG_INFO -- array of information about each subexpression.
1360
1361 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1362 `pend', `string1', `size1', `string2', and `size2'. */
1363
1364 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
1365 { \
1366 DEBUG_STATEMENT (fail_stack_elt_t ffailure_id;) \
1367 int this_reg; \
1368 CONST unsigned char *string_temp; \
1369 \
1370 assert (!FAIL_STACK_EMPTY ()); \
1371 \
1372 /* Remove failure points and point to how many regs pushed. */ \
1373 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1374 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
1375 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
1376 \
1377 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1378 \
1379 DEBUG_POP (&ffailure_id.integer); \
1380 DEBUG_PRINT2 (" Popping failure id: %u\n", \
1381 * (unsigned int *) &ffailure_id); \
1382 \
1383 /* If the saved string location is NULL, it came from an \
1384 on_failure_keep_string_jump opcode, and we want to throw away the \
1385 saved NULL, thus retaining our current position in the string. */ \
1386 string_temp = POP_FAILURE_POINTER (); \
1387 if (string_temp != NULL) \
1388 str = (CONST char *) string_temp; \
1389 \
1390 DEBUG_PRINT2 (" Popping string 0x%lx: `", (long) str); \
1391 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1392 DEBUG_PRINT1 ("'\n"); \
1393 \
1394 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1395 DEBUG_PRINT2 (" Popping pattern 0x%lx: ", (long) pat); \
1396 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1397 \
1398 /* Restore register info. */ \
1399 high_reg = (unsigned) POP_FAILURE_INT (); \
1400 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
1401 \
1402 low_reg = (unsigned) POP_FAILURE_INT (); \
1403 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
1404 \
1405 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1406 { \
1407 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
1408 \
1409 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1410 DEBUG_PRINT2 (" info: 0x%lx\n", \
1411 * (long *) &reg_info[this_reg]); \
1412 \
1413 regend[this_reg] = (CONST char *) POP_FAILURE_POINTER (); \
1414 DEBUG_PRINT2 (" end: 0x%lx\n", (long) regend[this_reg]); \
1415 \
1416 regstart[this_reg] = (CONST char *) POP_FAILURE_POINTER (); \
1417 DEBUG_PRINT2 (" start: 0x%lx\n", (long) regstart[this_reg]); \
1418 } \
1419 \
1420 set_regs_matched_done = 0; \
1421 DEBUG_STATEMENT (nfailure_points_popped++); \
1422 } /* POP_FAILURE_POINT */
1423
1424
1425
1426 /* Structure for per-register (a.k.a. per-group) information.
1427 Other register information, such as the
1428 starting and ending positions (which are addresses), and the list of
1429 inner groups (which is a bits list) are maintained in separate
1430 variables.
1431
1432 We are making a (strictly speaking) nonportable assumption here: that
1433 the compiler will pack our bit fields into something that fits into
1434 the type of `word', i.e., is something that fits into one item on the
1435 failure stack. */
1436
1437 typedef union
1438 {
1439 fail_stack_elt_t word;
1440 struct
1441 {
1442 /* This field is one if this group can match the empty string,
1443 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1444 #define MATCH_NULL_UNSET_VALUE 3
1445 unsigned match_null_string_p : 2;
1446 unsigned is_active : 1;
1447 unsigned matched_something : 1;
1448 unsigned ever_matched_something : 1;
1449 } bits;
1450 } register_info_type;
1451
1452 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1453 #define IS_ACTIVE(R) ((R).bits.is_active)
1454 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1455 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1456
1457
1458 /* Call this when have matched a real character; it sets `matched' flags
1459 for the subexpressions which we are currently inside. Also records
1460 that those subexprs have matched. */
1461 #define SET_REGS_MATCHED() \
1462 do \
1463 { \
1464 if (!set_regs_matched_done) \
1465 { \
1466 unsigned r; \
1467 set_regs_matched_done = 1; \
1468 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1469 { \
1470 MATCHED_SOMETHING (reg_info[r]) \
1471 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1472 = 1; \
1473 } \
1474 } \
1475 } \
1476 while (0)
1477
1478 /* Registers are set to a sentinel when they haven't yet matched. */
1479 static char reg_unset_dummy;
1480 #define REG_UNSET_VALUE (&reg_unset_dummy)
1481 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1482
1483 /* Subroutine declarations and macros for regex_compile. */
1484
1485 /* Fetch the next character in the uncompiled pattern---translating it
1486 if necessary. Also cast from a signed character in the constant
1487 string passed to us by the user to an unsigned char that we can use
1488 as an array index (in, e.g., `translate'). */
1489 #define PATFETCH(c) \
1490 do {if (p == pend) return REG_EEND; \
1491 assert (p < pend); \
1492 c = (unsigned char) *p++; \
1493 if (translate) c = (unsigned char) translate[c]; \
1494 } while (0)
1495
1496 /* Fetch the next character in the uncompiled pattern, with no
1497 translation. */
1498 #define PATFETCH_RAW(c) \
1499 do {if (p == pend) return REG_EEND; \
1500 assert (p < pend); \
1501 c = (unsigned char) *p++; \
1502 } while (0)
1503
1504 /* Go backwards one character in the pattern. */
1505 #define PATUNFETCH p--
1506
1507 #ifdef MULE
1508
1509 #define PATFETCH_EXTENDED(emch) \
1510 do {if (p == pend) return REG_EEND; \
1511 assert (p < pend); \
1512 emch = charptr_emchar ((CONST Bufbyte *) p); \
1513 INC_CHARPTR (p); \
1514 if (translate && emch < 0x80) \
1515 emch = (Emchar) (unsigned char) translate[emch]; \
1516 } while (0)
1517
1518 #define PATFETCH_RAW_EXTENDED(emch) \
1519 do {if (p == pend) return REG_EEND; \
1520 assert (p < pend); \
1521 emch = charptr_emchar ((CONST Bufbyte *) p); \
1522 INC_CHARPTR (p); \
1523 } while (0)
1524
1525 #define PATUNFETCH_EXTENDED DEC_CHARPTR (p)
1526
1527 #define PATFETCH_EITHER(emch) \
1528 do { \
1529 if (has_extended_chars) \
1530 PATFETCH_EXTENDED (emch); \
1531 else \
1532 PATFETCH (emch); \
1533 } while (0)
1534
1535 #define PATFETCH_RAW_EITHER(emch) \
1536 do { \
1537 if (has_extended_chars) \
1538 PATFETCH_RAW_EXTENDED (emch); \
1539 else \
1540 PATFETCH_RAW (emch); \
1541 } while (0)
1542
1543 #define PATUNFETCH_EITHER \
1544 do { \
1545 if (has_extended_chars) \
1546 PATUNFETCH_EXTENDED (emch); \
1547 else \
1548 PATUNFETCH (emch); \
1549 } while (0)
1550
1551 #else /* not MULE */
1552
1553 #define PATFETCH_EITHER(emch) PATFETCH (emch)
1554 #define PATFETCH_RAW_EITHER(emch) PATFETCH_RAW (emch)
1555 #define PATUNFETCH_EITHER PATUNFETCH
1556
1557 #endif /* not MULE */
1558
1559 /* If `translate' is non-null, return translate[D], else just D. We
1560 cast the subscript to translate because some data is declared as
1561 `char *', to avoid warnings when a string constant is passed. But
1562 when we use a character as a subscript we must make it unsigned. */
1563 #define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d))
1564
1565 #ifdef MULE
1566
1567 #define TRANSLATE_EXTENDED_UNSAFE(emch) \
1568 (translate && emch < 0x80 ? translate[emch] : (emch))
1569
1570 #endif
1571
1572 /* Macros for outputting the compiled pattern into `buffer'. */
1573
1574 /* If the buffer isn't allocated when it comes in, use this. */
1575 #define INIT_BUF_SIZE 32
1576
1577 /* Make sure we have at least N more bytes of space in buffer. */
1578 #define GET_BUFFER_SPACE(n) \
1579 while (b - bufp->buffer + (n) > bufp->allocated) \
1580 EXTEND_BUFFER ()
1581
1582 /* Make sure we have one more byte of buffer space and then add C to it. */
1583 #define BUF_PUSH(c) \
1584 do { \
1585 GET_BUFFER_SPACE (1); \
1586 *b++ = (unsigned char) (c); \
1587 } while (0)
1588
1589
1590 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1591 #define BUF_PUSH_2(c1, c2) \
1592 do { \
1593 GET_BUFFER_SPACE (2); \
1594 *b++ = (unsigned char) (c1); \
1595 *b++ = (unsigned char) (c2); \
1596 } while (0)
1597
1598
1599 /* As with BUF_PUSH_2, except for three bytes. */
1600 #define BUF_PUSH_3(c1, c2, c3) \
1601 do { \
1602 GET_BUFFER_SPACE (3); \
1603 *b++ = (unsigned char) (c1); \
1604 *b++ = (unsigned char) (c2); \
1605 *b++ = (unsigned char) (c3); \
1606 } while (0)
1607
1608
1609 /* Store a jump with opcode OP at LOC to location TO. We store a
1610 relative address offset by the three bytes the jump itself occupies. */
1611 #define STORE_JUMP(op, loc, to) \
1612 store_op1 (op, loc, (to) - (loc) - 3)
1613
1614 /* Likewise, for a two-argument jump. */
1615 #define STORE_JUMP2(op, loc, to, arg) \
1616 store_op2 (op, loc, (to) - (loc) - 3, arg)
1617
1618 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
1619 #define INSERT_JUMP(op, loc, to) \
1620 insert_op1 (op, loc, (to) - (loc) - 3, b)
1621
1622 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1623 #define INSERT_JUMP2(op, loc, to, arg) \
1624 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1625
1626
1627 /* This is not an arbitrary limit: the arguments which represent offsets
1628 into the pattern are two bytes long. So if 2^16 bytes turns out to
1629 be too small, many things would have to change. */
1630 #define MAX_BUF_SIZE (1L << 16)
1631
1632
1633 /* Extend the buffer by twice its current size via realloc and
1634 reset the pointers that pointed into the old block to point to the
1635 correct places in the new one. If extending the buffer results in it
1636 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1637 #define EXTEND_BUFFER() \
1638 do { \
1639 unsigned char *old_buffer = bufp->buffer; \
1640 if (bufp->allocated == MAX_BUF_SIZE) \
1641 return REG_ESIZE; \
1642 bufp->allocated <<= 1; \
1643 if (bufp->allocated > MAX_BUF_SIZE) \
1644 bufp->allocated = MAX_BUF_SIZE; \
1645 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
1646 if (bufp->buffer == NULL) \
1647 return REG_ESPACE; \
1648 /* If the buffer moved, move all the pointers into it. */ \
1649 if (old_buffer != bufp->buffer) \
1650 { \
1651 b = (b - old_buffer) + bufp->buffer; \
1652 begalt = (begalt - old_buffer) + bufp->buffer; \
1653 if (fixup_alt_jump) \
1654 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
1655 if (laststart) \
1656 laststart = (laststart - old_buffer) + bufp->buffer; \
1657 if (pending_exact) \
1658 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
1659 } \
1660 } while (0)
1661
1662
1663 /* Since we have one byte reserved for the register number argument to
1664 {start,stop}_memory, the maximum number of groups we can report
1665 things about is what fits in that byte. */
1666 #define MAX_REGNUM 255
1667
1668 /* But patterns can have more than `MAX_REGNUM' registers. We just
1669 ignore the excess. */
1670 typedef unsigned regnum_t;
1671
1672
1673 /* Macros for the compile stack. */
1674
1675 /* Since offsets can go either forwards or backwards, this type needs to
1676 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1677 typedef int pattern_offset_t;
1678
1679 typedef struct
1680 {
1681 pattern_offset_t begalt_offset;
1682 pattern_offset_t fixup_alt_jump;
1683 pattern_offset_t inner_group_offset;
1684 pattern_offset_t laststart_offset;
1685 regnum_t regnum;
1686 } compile_stack_elt_t;
1687
1688
1689 typedef struct
1690 {
1691 compile_stack_elt_t *stack;
1692 unsigned size;
1693 unsigned avail; /* Offset of next open position. */
1694 } compile_stack_type;
1695
1696
1697 #define INIT_COMPILE_STACK_SIZE 32
1698
1699 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1700 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1701
1702 /* The next available element. */
1703 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1704
1705
1706 /* Set the bit for character C in a bit vector. */
1707 #define SET_LIST_BIT(c) \
1708 (b[((unsigned char) (c)) / BYTEWIDTH] \
1709 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1710
1711 #ifdef MULE
1712
1713 /* Set the "bit" for character C in a range table. */
1714 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt)
1715
1716 /* Set the "bit" for character c in the appropriate table. */
1717 #define SET_EITHER_BIT(c) \
1718 do { \
1719 if (has_extended_chars) \
1720 SET_RANGETAB_BIT (c); \
1721 else \
1722 SET_LIST_BIT (c); \
1723 } while (0)
1724
1725 #else /* not MULE */
1726
1727 #define SET_EITHER_BIT(c) SET_LIST_BIT (c)
1728
1729 #endif
1730
1731
1732 /* Get the next unsigned number in the uncompiled pattern. */
1733 #define GET_UNSIGNED_NUMBER(num) \
1734 { if (p != pend) \
1735 { \
1736 PATFETCH (c); \
1737 while (ISDIGIT (c)) \
1738 { \
1739 if (num < 0) \
1740 num = 0; \
1741 num = num * 10 + c - '0'; \
1742 if (p == pend) \
1743 break; \
1744 PATFETCH (c); \
1745 } \
1746 } \
1747 }
1748
1749 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1750
1751 #define IS_CHAR_CLASS(string) \
1752 (STREQ (string, "alpha") || STREQ (string, "upper") \
1753 || STREQ (string, "lower") || STREQ (string, "digit") \
1754 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1755 || STREQ (string, "space") || STREQ (string, "print") \
1756 || STREQ (string, "punct") || STREQ (string, "graph") \
1757 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1758
1759 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
1760 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1761 static void insert_op1 (re_opcode_t op, unsigned char *loc, int arg,
1762 unsigned char *end);
1763 static void insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2,
1764 unsigned char *end);
1765 static boolean at_begline_loc_p (CONST char *pattern, CONST char *p,
1766 reg_syntax_t syntax);
1767 static boolean at_endline_loc_p (CONST char *p, CONST char *pend, int syntax);
1768 static boolean group_in_compile_stack (compile_stack_type compile_stack,
1769 regnum_t regnum);
1770 static reg_errcode_t compile_range (CONST char **p_ptr, CONST char *pend,
1771 char *translate, reg_syntax_t syntax,
1772 unsigned char *b);
1773 #ifdef MULE
1774 static reg_errcode_t compile_extended_range (CONST char **p_ptr,
1775 CONST char *pend,
1776 char *translate,
1777 reg_syntax_t syntax,
1778 Lisp_Object rtab);
1779 #endif /* MULE */
1780 static boolean group_match_null_string_p (unsigned char **p,
1781 unsigned char *end,
1782 register_info_type *reg_info);
1783 static boolean alt_match_null_string_p (unsigned char *p, unsigned char *end,
1784 register_info_type *reg_info);
1785 static boolean common_op_match_null_string_p (unsigned char **p,
1786 unsigned char *end,
1787 register_info_type *reg_info);
1788 static int bcmp_translate (CONST unsigned char *s1, CONST unsigned char *s2,
1789 REGISTER int len, char *translate);
1790 static int re_match_2_internal (struct re_pattern_buffer *bufp,
1791 CONST char *string1, int size1,
1792 CONST char *string2, int size2, int pos,
1793 struct re_registers *regs, int stop);
1794
1795 #ifndef MATCH_MAY_ALLOCATE
1796
1797 /* If we cannot allocate large objects within re_match_2_internal,
1798 we make the fail stack and register vectors global.
1799 The fail stack, we grow to the maximum size when a regexp
1800 is compiled.
1801 The register vectors, we adjust in size each time we
1802 compile a regexp, according to the number of registers it needs. */
1803
1804 static fail_stack_type fail_stack;
1805
1806 /* Size with which the following vectors are currently allocated.
1807 That is so we can make them bigger as needed,
1808 but never make them smaller. */
1809 static int regs_allocated_size;
1810
1811 static CONST char ** regstart, ** regend;
1812 static CONST char ** old_regstart, ** old_regend;
1813 static CONST char **best_regstart, **best_regend;
1814 static register_info_type *reg_info;
1815 static CONST char **reg_dummy;
1816 static register_info_type *reg_info_dummy;
1817
1818 /* Make the register vectors big enough for NUM_REGS registers,
1819 but don't make them smaller. */
1820
1821 static
1822 regex_grow_registers (int num_regs)
1823 {
1824 if (num_regs > regs_allocated_size)
1825 {
1826 RETALLOC_IF (regstart, num_regs, CONST char *);
1827 RETALLOC_IF (regend, num_regs, CONST char *);
1828 RETALLOC_IF (old_regstart, num_regs, CONST char *);
1829 RETALLOC_IF (old_regend, num_regs, CONST char *);
1830 RETALLOC_IF (best_regstart, num_regs, CONST char *);
1831 RETALLOC_IF (best_regend, num_regs, CONST char *);
1832 RETALLOC_IF (reg_info, num_regs, register_info_type);
1833 RETALLOC_IF (reg_dummy, num_regs, CONST char *);
1834 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);
1835
1836 regs_allocated_size = num_regs;
1837 }
1838 }
1839
1840 #endif /* not MATCH_MAY_ALLOCATE */
1841
1842 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1843 Returns one of error codes defined in `regex.h', or zero for success.
1844
1845 Assumes the `allocated' (and perhaps `buffer') and `translate'
1846 fields are set in BUFP on entry.
1847
1848 If it succeeds, results are put in BUFP (if it returns an error, the
1849 contents of BUFP are undefined):
1850 `buffer' is the compiled pattern;
1851 `syntax' is set to SYNTAX;
1852 `used' is set to the length of the compiled pattern;
1853 `fastmap_accurate' is zero;
1854 `re_nsub' is the number of subexpressions in PATTERN;
1855 `not_bol' and `not_eol' are zero;
1856
1857 The `fastmap' and `newline_anchor' fields are neither
1858 examined nor set. */
1859
1860 /* Return, freeing storage we allocated. */
1861 #define FREE_STACK_RETURN(value) \
1862 return (free (compile_stack.stack), value)
1863
1864 static reg_errcode_t
1865 regex_compile (CONST char *pattern, int size, reg_syntax_t syntax,
1866 struct re_pattern_buffer *bufp)
1867 {
1868 /* We fetch characters from PATTERN here. We declare these as int
1869 (or possibly long) so that chars above 127 can be used as
1870 array indices. The macros that fetch a character from the pattern
1871 make sure to coerce to unsigned char before assigning, so we won't
1872 get bitten by negative numbers here. */
1873 /* XEmacs change: used to be unsigned char. */
1874 REGISTER EMACS_INT c, c1;
1875
1876 /* A random temporary spot in PATTERN. */
1877 CONST char *p1;
1878
1879 /* Points to the end of the buffer, where we should append. */
1880 REGISTER unsigned char *b;
1881
1882 /* Keeps track of unclosed groups. */
1883 compile_stack_type compile_stack;
1884
1885 /* Points to the current (ending) position in the pattern. */
1886 CONST char *p = pattern;
1887 CONST char *pend = pattern + size;
1888
1889 /* How to translate the characters in the pattern. */
1890 char *translate = bufp->translate;
1891
1892 /* Address of the count-byte of the most recently inserted `exactn'
1893 command. This makes it possible to tell if a new exact-match
1894 character can be added to that command or if the character requires
1895 a new `exactn' command. */
1896 unsigned char *pending_exact = 0;
1897
1898 /* Address of start of the most recently finished expression.
1899 This tells, e.g., postfix * where to find the start of its
1900 operand. Reset at the beginning of groups and alternatives. */
1901 unsigned char *laststart = 0;
1902
1903 /* Address of beginning of regexp, or inside of last group. */
1904 unsigned char *begalt;
1905
1906 /* Place in the uncompiled pattern (i.e., the {) to
1907 which to go back if the interval is invalid. */
1908 CONST char *beg_interval;
1909
1910 /* Address of the place where a forward jump should go to the end of
1911 the containing expression. Each alternative of an `or' -- except the
1912 last -- ends with a forward jump of this sort. */
1913 unsigned char *fixup_alt_jump = 0;
1914
1915 /* Counts open-groups as they are encountered. Remembered for the
1916 matching close-group on the compile stack, so the same register
1917 number is put in the stop_memory as the start_memory. */
1918 regnum_t regnum = 0;
1919
1920 #ifdef DEBUG
1921 DEBUG_PRINT1 ("\nCompiling pattern: ");
1922 if (debug)
1923 {
1924 unsigned debug_count;
1925
1926 for (debug_count = 0; debug_count < size; debug_count++)
1927 putchar (pattern[debug_count]);
1928 putchar ('\n');
1929 }
1930 #endif /* DEBUG */
1931
1932 /* Initialize the compile stack. */
1933 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1934 if (compile_stack.stack == NULL)
1935 return REG_ESPACE;
1936
1937 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1938 compile_stack.avail = 0;
1939
1940 /* Initialize the pattern buffer. */
1941 bufp->syntax = syntax;
1942 bufp->fastmap_accurate = 0;
1943 bufp->not_bol = bufp->not_eol = 0;
1944
1945 /* Set `used' to zero, so that if we return an error, the pattern
1946 printer (for debugging) will think there's no pattern. We reset it
1947 at the end. */
1948 bufp->used = 0;
1949
1950 /* Always count groups, whether or not bufp->no_sub is set. */
1951 bufp->re_nsub = 0;
1952
1953 #if !defined (emacs) && !defined (SYNTAX_TABLE)
1954 /* Initialize the syntax table. */
1955 init_syntax_once ();
1956 #endif
1957
1958 if (bufp->allocated == 0)
1959 {
1960 if (bufp->buffer)
1961 { /* If zero allocated, but buffer is non-null, try to realloc
1962 enough space. This loses if buffer's address is bogus, but
1963 that is the user's responsibility. */
1964 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
1965 }
1966 else
1967 { /* Caller did not allocate a buffer. Do it for them. */
1968 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
1969 }
1970 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
1971
1972 bufp->allocated = INIT_BUF_SIZE;
1973 }
1974
1975 begalt = b = bufp->buffer;
1976
1977 /* Loop through the uncompiled pattern until we're at the end. */
1978 while (p != pend)
1979 {
1980 PATFETCH (c);
1981
1982 switch (c)
1983 {
1984 case '^':
1985 {
1986 if ( /* If at start of pattern, it's an operator. */
1987 p == pattern + 1
1988 /* If context independent, it's an operator. */
1989 || syntax & RE_CONTEXT_INDEP_ANCHORS
1990 /* Otherwise, depends on what's come before. */
1991 || at_begline_loc_p (pattern, p, syntax))
1992 BUF_PUSH (begline);
1993 else
1994 goto normal_char;
1995 }
1996 break;
1997
1998
1999 case '$':
2000 {
2001 if ( /* If at end of pattern, it's an operator. */
2002 p == pend
2003 /* If context independent, it's an operator. */
2004 || syntax & RE_CONTEXT_INDEP_ANCHORS
2005 /* Otherwise, depends on what's next. */
2006 || at_endline_loc_p (p, pend, syntax))
2007 BUF_PUSH (endline);
2008 else
2009 goto normal_char;
2010 }
2011 break;
2012
2013
2014 case '+':
2015 case '?':
2016 if ((syntax & RE_BK_PLUS_QM)
2017 || (syntax & RE_LIMITED_OPS))
2018 goto normal_char;
2019 handle_plus:
2020 case '*':
2021 /* If there is no previous pattern... */
2022 if (!laststart)
2023 {
2024 if (syntax & RE_CONTEXT_INVALID_OPS)
2025 FREE_STACK_RETURN (REG_BADRPT);
2026 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2027 goto normal_char;
2028 }
2029
2030 {
2031 /* true means zero/many matches are allowed. */
2032 boolean zero_times_ok = c != '+';
2033 boolean many_times_ok = c != '?';
2034
2035 /* true means match shortest string possible. */
2036 boolean minimal = false;
2037
2038 /* If there is a sequence of repetition chars, collapse it
2039 down to just one (the right one). We can't combine
2040 interval operators with these because of, e.g., `a{2}*',
2041 which should only match an even number of `a's. */
2042 while (p != pend)
2043 {
2044 PATFETCH (c);
2045
2046 if (c == '*' || (!(syntax & RE_BK_PLUS_QM)
2047 && (c == '+' || c == '?')))
2048 ;
2049
2050 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2051 {
2052 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2053
2054 PATFETCH (c1);
2055 if (!(c1 == '+' || c1 == '?'))
2056 {
2057 PATUNFETCH;
2058 PATUNFETCH;
2059 break;
2060 }
2061
2062 c = c1;
2063 }
2064 else
2065 {
2066 PATUNFETCH;
2067 break;
2068 }
2069
2070 /* If we get here, we found another repeat character. */
2071 if (!(syntax & RE_NO_MINIMAL_MATCHING))
2072 {
2073 /* `*?' and `+?' and `??' are okay (and mean match
2074 minimally), but other sequences (such as `*??' and
2075 `+++') are rejected (reserved for future use). */
2076 if (minimal || c != '?')
2077 FREE_STACK_RETURN (REG_BADRPT);
2078 minimal = true;
2079 }
2080 else
2081 {
2082 zero_times_ok |= c != '+';
2083 many_times_ok |= c != '?';
2084 }
2085 }
2086
2087 /* Star, etc. applied to an empty pattern is equivalent
2088 to an empty pattern. */
2089 if (!laststart)
2090 break;
2091
2092 /* Now we know whether zero matches is allowed
2093 and whether two or more matches is allowed
2094 and whether we want minimal or maximal matching. */
2095 if (minimal)
2096 {
2097 if (!many_times_ok)
2098 {
2099 /* "a??" becomes:
2100 0: /on_failure_jump to 6
2101 3: /jump to 9
2102 6: /exactn/1/A
2103 9: end of pattern.
2104 */
2105 GET_BUFFER_SPACE (6);
2106 INSERT_JUMP (jump, laststart, b + 3);
2107 b += 3;
2108 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2109 b += 3;
2110 }
2111 else if (zero_times_ok)
2112 {
2113 /* "a*?" becomes:
2114 0: /jump to 6
2115 3: /exactn/1/A
2116 6: /on_failure_jump to 3
2117 9: end of pattern.
2118 */
2119 GET_BUFFER_SPACE (6);
2120 INSERT_JUMP (jump, laststart, b + 3);
2121 b += 3;
2122 STORE_JUMP (on_failure_jump, b, laststart + 3);
2123 b += 3;
2124 }
2125 else
2126 {
2127 /* "a+?" becomes:
2128 0: /exactn/1/A
2129 3: /on_failure_jump to 0
2130 6: end of pattern.
2131 */
2132 GET_BUFFER_SPACE (3);
2133 STORE_JUMP (on_failure_jump, b, laststart);
2134 b += 3;
2135 }
2136 }
2137 else
2138 {
2139 /* Are we optimizing this jump? */
2140 boolean keep_string_p = false;
2141
2142 if (many_times_ok)
2143 { /* More than one repetition is allowed, so put in at the
2144 end a backward relative jump from `b' to before the next
2145 jump we're going to put in below (which jumps from
2146 laststart to after this jump).
2147
2148 But if we are at the `*' in the exact sequence `.*\n',
2149 insert an unconditional jump backwards to the .,
2150 instead of the beginning of the loop. This way we only
2151 push a failure point once, instead of every time
2152 through the loop. */
2153 assert (p - 1 > pattern);
2154
2155 /* Allocate the space for the jump. */
2156 GET_BUFFER_SPACE (3);
2157
2158 /* We know we are not at the first character of the
2159 pattern, because laststart was nonzero. And we've
2160 already incremented `p', by the way, to be the
2161 character after the `*'. Do we have to do something
2162 analogous here for null bytes, because of
2163 RE_DOT_NOT_NULL? */
2164 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
2165 && zero_times_ok
2166 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
2167 && !(syntax & RE_DOT_NEWLINE))
2168 { /* We have .*\n. */
2169 STORE_JUMP (jump, b, laststart);
2170 keep_string_p = true;
2171 }
2172 else
2173 /* Anything else. */
2174 STORE_JUMP (maybe_pop_jump, b, laststart - 3);
2175
2176 /* We've added more stuff to the buffer. */
2177 b += 3;
2178 }
2179
2180 /* On failure, jump from laststart to b + 3, which will be the
2181 end of the buffer after this jump is inserted. */
2182 GET_BUFFER_SPACE (3);
2183 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2184 : on_failure_jump,
2185 laststart, b + 3);
2186 b += 3;
2187
2188 if (!zero_times_ok)
2189 {
2190 /* At least one repetition is required, so insert a
2191 `dummy_failure_jump' before the initial
2192 `on_failure_jump' instruction of the loop. This
2193 effects a skip over that instruction the first time
2194 we hit that loop. */
2195 GET_BUFFER_SPACE (3);
2196 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
2197 b += 3;
2198 }
2199 }
2200 pending_exact = 0;
2201 }
2202 break;
2203
2204
2205 case '.':
2206 laststart = b;
2207 BUF_PUSH (anychar);
2208 break;
2209
2210
2211 case '[':
2212 {
2213 /* XEmacs change: this whole section */
2214 boolean had_char_class = false;
2215 #ifdef MULE
2216 boolean has_extended_chars = false;
2217 REGISTER Lisp_Object rtab = Qnil;
2218 #endif
2219
2220 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2221
2222 /* Ensure that we have enough space to push a charset: the
2223 opcode, the length count, and the bitset; 34 bytes in all. */
2224 GET_BUFFER_SPACE (34);
2225
2226 laststart = b;
2227
2228 /* We test `*p == '^' twice, instead of using an if
2229 statement, so we only need one BUF_PUSH. */
2230 BUF_PUSH (*p == '^' ? charset_not : charset);
2231 if (*p == '^')
2232 p++;
2233
2234 /* Remember the first position in the bracket expression. */
2235 p1 = p;
2236
2237 /* Push the number of bytes in the bitmap. */
2238 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2239
2240 /* Clear the whole map. */
2241 memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2242
2243 /* charset_not matches newline according to a syntax bit. */
2244 if ((re_opcode_t) b[-2] == charset_not
2245 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2246 SET_LIST_BIT ('\n');
2247
2248 #ifdef MULE
2249 start_over_with_extended:
2250 if (has_extended_chars)
2251 {
2252 /* There are extended chars here, which means we need to start
2253 over and shift to unified range-table format. */
2254 if (b[-2] == charset)
2255 b[-2] = charset_mule;
2256 else
2257 b[-2] = charset_mule_not;
2258 b--;
2259 p = p1; /* go back to the beginning of the charset, after
2260 a possible ^. */
2261 rtab = Vthe_lisp_rangetab;
2262 Fclear_range_table (rtab);
2263
2264 /* charset_not matches newline according to a syntax bit. */
2265 if ((re_opcode_t) b[-1] == charset_mule_not
2266 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2267 SET_EITHER_BIT ('\n');
2268 }
2269 #endif /* MULE */
2270
2271 /* Read in characters and ranges, setting map bits. */
2272 for (;;)
2273 {
2274 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2275
2276 PATFETCH_EITHER (c);
2277
2278 #ifdef MULE
2279 if (c >= 0x80 && !has_extended_chars)
2280 {
2281 has_extended_chars = 1;
2282 /* Frumble-bumble, we've found some extended chars.
2283 Need to start over, process everything using
2284 the general extended-char mechanism, and need
2285 to use charset_mule and charset_mule_not instead
2286 of charset and charset_not. */
2287 goto start_over_with_extended;
2288 }
2289 #endif /* MULE */
2290 /* \ might escape characters inside [...] and [^...]. */
2291 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2292 {
2293 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2294
2295 PATFETCH_EITHER (c1);
2296 #ifdef MULE
2297 if (c1 >= 0x80 && !has_extended_chars)
2298 {
2299 has_extended_chars = 1;
2300 goto start_over_with_extended;
2301 }
2302 #endif /* MULE */
2303 SET_EITHER_BIT (c1);
2304 continue;
2305 }
2306
2307 /* Could be the end of the bracket expression. If it's
2308 not (i.e., when the bracket expression is `[]' so
2309 far), the ']' character bit gets set way below. */
2310 if (c == ']' && p != p1 + 1)
2311 break;
2312
2313 /* Look ahead to see if it's a range when the last thing
2314 was a character class. */
2315 if (had_char_class && c == '-' && *p != ']')
2316 FREE_STACK_RETURN (REG_ERANGE);
2317
2318 /* Look ahead to see if it's a range when the last thing
2319 was a character: if this is a hyphen not at the
2320 beginning or the end of a list, then it's the range
2321 operator. */
2322 if (c == '-'
2323 && !(p - 2 >= pattern && p[-2] == '[')
2324 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2325 && *p != ']')
2326 {
2327 reg_errcode_t ret;
2328
2329 #ifdef MULE
2330 if (* (unsigned char *) p >= 0x80 && !has_extended_chars)
2331 {
2332 has_extended_chars = 1;
2333 goto start_over_with_extended;
2334 }
2335 if (has_extended_chars)
2336 ret = compile_extended_range (&p, pend, translate,
2337 syntax, rtab);
2338 else
2339 #endif /* MULE */
2340 ret = compile_range (&p, pend, translate, syntax, b);
2341 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2342 }
2343
2344 else if (p[0] == '-' && p[1] != ']')
2345 { /* This handles ranges made up of characters only. */
2346 reg_errcode_t ret;
2347
2348 /* Move past the `-'. */
2349 PATFETCH (c1);
2350
2351 #ifdef MULE
2352 if (* (unsigned char *) p >= 0x80 && !has_extended_chars)
2353 {
2354 has_extended_chars = 1;
2355 goto start_over_with_extended;
2356 }
2357 if (has_extended_chars)
2358 ret = compile_extended_range (&p, pend, translate,
2359 syntax, rtab);
2360 else
2361 #endif /* MULE */
2362 ret = compile_range (&p, pend, translate, syntax, b);
2363 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2364 }
2365
2366 /* See if we're at the beginning of a possible character
2367 class. */
2368
2369 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2370 { /* Leave room for the null. */
2371 char str[CHAR_CLASS_MAX_LENGTH + 1];
2372
2373 PATFETCH (c);
2374 c1 = 0;
2375
2376 /* If pattern is `[[:'. */
2377 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2378
2379 for (;;)
2380 {
2381 /* Do not do PATFETCH_EITHER() here. We want
2382 to just see if the bytes match particular
2383 strings, and we put them all back if not.
2384
2385 #### May need to be changed once trt tables
2386 are working. */
2387 PATFETCH (c);
2388 if (c == ':' || c == ']' || p == pend
2389 || c1 == CHAR_CLASS_MAX_LENGTH)
2390 break;
2391 str[c1++] = c;
2392 }
2393 str[c1] = '\0';
2394
2395 /* If isn't a word bracketed by `[:' and:`]':
2396 undo the ending character, the letters, and leave
2397 the leading `:' and `[' (but set bits for them). */
2398 if (c == ':' && *p == ']')
2399 {
2400 int ch;
2401 boolean is_alnum = STREQ (str, "alnum");
2402 boolean is_alpha = STREQ (str, "alpha");
2403 boolean is_blank = STREQ (str, "blank");
2404 boolean is_cntrl = STREQ (str, "cntrl");
2405 boolean is_digit = STREQ (str, "digit");
2406 boolean is_graph = STREQ (str, "graph");
2407 boolean is_lower = STREQ (str, "lower");
2408 boolean is_print = STREQ (str, "print");
2409 boolean is_punct = STREQ (str, "punct");
2410 boolean is_space = STREQ (str, "space");
2411 boolean is_upper = STREQ (str, "upper");
2412 boolean is_xdigit = STREQ (str, "xdigit");
2413
2414 if (!IS_CHAR_CLASS (str))
2415 FREE_STACK_RETURN (REG_ECTYPE);
2416
2417 /* Throw away the ] at the end of the character
2418 class. */
2419 PATFETCH (c);
2420
2421 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2422
2423 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2424 {
2425 /* This was split into 3 if's to
2426 avoid an arbitrary limit in some compiler. */
2427 if ( (is_alnum && ISALNUM (ch))
2428 || (is_alpha && ISALPHA (ch))
2429 || (is_blank && ISBLANK (ch))
2430 || (is_cntrl && ISCNTRL (ch)))
2431 SET_EITHER_BIT (ch);
2432 if ( (is_digit && ISDIGIT (ch))
2433 || (is_graph && ISGRAPH (ch))
2434 || (is_lower && ISLOWER (ch))
2435 || (is_print && ISPRINT (ch)))
2436 SET_EITHER_BIT (ch);
2437 if ( (is_punct && ISPUNCT (ch))
2438 || (is_space && ISSPACE (ch))
2439 || (is_upper && ISUPPER (ch))
2440 || (is_xdigit && ISXDIGIT (ch)))
2441 SET_EITHER_BIT (ch);
2442 }
2443 had_char_class = true;
2444 }
2445 else
2446 {
2447 c1++;
2448 while (c1--)
2449 PATUNFETCH;
2450 SET_EITHER_BIT ('[');
2451 SET_EITHER_BIT (':');
2452 had_char_class = false;
2453 }
2454 }
2455 else
2456 {
2457 had_char_class = false;
2458 SET_EITHER_BIT (c);
2459 }
2460 }
2461
2462 #ifdef MULE
2463 if (has_extended_chars)
2464 {
2465 /* We have a range table, not a bit vector. */
2466 int bytes_needed =
2467 unified_range_table_bytes_needed (rtab);
2468 GET_BUFFER_SPACE (bytes_needed);
2469 unified_range_table_copy_data (rtab, b);
2470 b += unified_range_table_bytes_used (b);
2471 break;
2472 }
2473 #endif /* MULE */
2474 /* Discard any (non)matching list bytes that are all 0 at the
2475 end of the map. Decrease the map-length byte too. */
2476 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2477 b[-1]--;
2478 b += b[-1];
2479 }
2480 break;
2481
2482
2483 case '(':
2484 if (syntax & RE_NO_BK_PARENS)
2485 goto handle_open;
2486 else
2487 goto normal_char;
2488
2489
2490 case ')':
2491 if (syntax & RE_NO_BK_PARENS)
2492 goto handle_close;
2493 else
2494 goto normal_char;
2495
2496
2497 case '\n':
2498 if (syntax & RE_NEWLINE_ALT)
2499 goto handle_alt;
2500 else
2501 goto normal_char;
2502
2503
2504 case '|':
2505 if (syntax & RE_NO_BK_VBAR)
2506 goto handle_alt;
2507 else
2508 goto normal_char;
2509
2510
2511 case '{':
2512 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2513 goto handle_interval;
2514 else
2515 goto normal_char;
2516
2517
2518 case '\\':
2519 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2520
2521 /* Do not translate the character after the \, so that we can
2522 distinguish, e.g., \B from \b, even if we normally would
2523 translate, e.g., B to b. */
2524 PATFETCH_RAW (c);
2525
2526 switch (c)
2527 {
2528 case '(':
2529 if (syntax & RE_NO_BK_PARENS)
2530 goto normal_backslash;
2531
2532 handle_open:
2533 {
2534 regnum_t r;
2535
2536 if (!(syntax & RE_NO_SHY_GROUPS)
2537 && p != pend
2538 && TRANSLATE(*p) == TRANSLATE('?'))
2539 {
2540 p++;
2541 PATFETCH(c);
2542 switch (c)
2543 {
2544 case ':': /* shy groups */
2545 r = MAX_REGNUM + 1;
2546 break;
2547
2548 /* All others are reserved for future constructs. */
2549 default:
2550 FREE_STACK_RETURN (REG_BADPAT);
2551 }
2552 }
2553 else
2554 {
2555 bufp->re_nsub++;
2556 r = ++regnum;
2557 }
2558
2559 if (COMPILE_STACK_FULL)
2560 {
2561 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2562 compile_stack_elt_t);
2563 if (compile_stack.stack == NULL) return REG_ESPACE;
2564
2565 compile_stack.size <<= 1;
2566 }
2567
2568 /* These are the values to restore when we hit end of this
2569 group. They are all relative offsets, so that if the
2570 whole pattern moves because of realloc, they will still
2571 be valid. */
2572 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2573 COMPILE_STACK_TOP.fixup_alt_jump
2574 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2575 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
2576 COMPILE_STACK_TOP.regnum = r;
2577
2578 /* We will eventually replace the 0 with the number of
2579 groups inner to this one. But do not push a
2580 start_memory for groups beyond the last one we can
2581 represent in the compiled pattern. */
2582 if (r <= MAX_REGNUM)
2583 {
2584 COMPILE_STACK_TOP.inner_group_offset
2585 = b - bufp->buffer + 2;
2586 BUF_PUSH_3 (start_memory, r, 0);
2587 }
2588
2589 compile_stack.avail++;
2590
2591 fixup_alt_jump = 0;
2592 laststart = 0;
2593 begalt = b;
2594 /* If we've reached MAX_REGNUM groups, then this open
2595 won't actually generate any code, so we'll have to
2596 clear pending_exact explicitly. */
2597 pending_exact = 0;
2598 }
2599 break;
2600
2601
2602 case ')':
2603 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2604
2605 if (COMPILE_STACK_EMPTY) {
2606 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2607 goto normal_backslash;
2608 else
2609 FREE_STACK_RETURN (REG_ERPAREN);
2610 }
2611
2612 handle_close:
2613 if (fixup_alt_jump)
2614 { /* Push a dummy failure point at the end of the
2615 alternative for a possible future
2616 `pop_failure_jump' to pop. See comments at
2617 `push_dummy_failure' in `re_match_2'. */
2618 BUF_PUSH (push_dummy_failure);
2619
2620 /* We allocated space for this jump when we assigned
2621 to `fixup_alt_jump', in the `handle_alt' case below. */
2622 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
2623 }
2624
2625 /* See similar code for backslashed left paren above. */
2626 if (COMPILE_STACK_EMPTY) {
2627 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2628 goto normal_char;
2629 else
2630 FREE_STACK_RETURN (REG_ERPAREN);
2631 }
2632
2633 /* Since we just checked for an empty stack above, this
2634 ``can't happen''. */
2635 assert (compile_stack.avail != 0);
2636 {
2637 /* We don't just want to restore into `regnum', because
2638 later groups should continue to be numbered higher,
2639 as in `(ab)c(de)' -- the second group is #2. */
2640 regnum_t this_group_regnum;
2641
2642 compile_stack.avail--;
2643 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2644 fixup_alt_jump
2645 = COMPILE_STACK_TOP.fixup_alt_jump
2646 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2647 : 0;
2648 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2649 this_group_regnum = COMPILE_STACK_TOP.regnum;
2650 /* If we've reached MAX_REGNUM groups, then this open
2651 won't actually generate any code, so we'll have to
2652 clear pending_exact explicitly. */
2653 pending_exact = 0;
2654
2655 /* We're at the end of the group, so now we know how many
2656 groups were inside this one. */
2657 if (this_group_regnum <= MAX_REGNUM)
2658 {
2659 unsigned char *inner_group_loc
2660 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
2661
2662 *inner_group_loc = regnum - this_group_regnum;
2663 BUF_PUSH_3 (stop_memory, this_group_regnum,
2664 regnum - this_group_regnum);
2665 }
2666 }
2667 break;
2668
2669
2670 case '|': /* `\|'. */
2671 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2672 goto normal_backslash;
2673 handle_alt:
2674 if (syntax & RE_LIMITED_OPS)
2675 goto normal_char;
2676
2677 /* Insert before the previous alternative a jump which
2678 jumps to this alternative if the former fails. */
2679 GET_BUFFER_SPACE (3);
2680 INSERT_JUMP (on_failure_jump, begalt, b + 6);
2681 pending_exact = 0;
2682 b += 3;
2683
2684 /* The alternative before this one has a jump after it
2685 which gets executed if it gets matched. Adjust that
2686 jump so it will jump to this alternative's analogous
2687 jump (put in below, which in turn will jump to the next
2688 (if any) alternative's such jump, etc.). The last such
2689 jump jumps to the correct final destination. A picture:
2690 _____ _____
2691 | | | |
2692 | v | v
2693 a | b | c
2694
2695 If we are at `b', then fixup_alt_jump right now points to a
2696 three-byte space after `a'. We'll put in the jump, set
2697 fixup_alt_jump to right after `b', and leave behind three
2698 bytes which we'll fill in when we get to after `c'. */
2699
2700 if (fixup_alt_jump)
2701 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2702
2703 /* Mark and leave space for a jump after this alternative,
2704 to be filled in later either by next alternative or
2705 when know we're at the end of a series of alternatives. */
2706 fixup_alt_jump = b;
2707 GET_BUFFER_SPACE (3);
2708 b += 3;
2709
2710 laststart = 0;
2711 begalt = b;
2712 break;
2713
2714
2715 case '{':
2716 /* If \{ is a literal. */
2717 if (!(syntax & RE_INTERVALS)
2718 /* If we're at `\{' and it's not the open-interval
2719 operator. */
2720 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2721 || (p - 2 == pattern && p == pend))
2722 goto normal_backslash;
2723
2724 handle_interval:
2725 {
2726 /* If got here, then the syntax allows intervals. */
2727
2728 /* At least (most) this many matches must be made. */
2729 int lower_bound = -1, upper_bound = -1;
2730
2731 beg_interval = p - 1;
2732
2733 if (p == pend)
2734 {
2735 if (syntax & RE_NO_BK_BRACES)
2736 goto unfetch_interval;
2737 else
2738 FREE_STACK_RETURN (REG_EBRACE);
2739 }
2740
2741 GET_UNSIGNED_NUMBER (lower_bound);
2742
2743 if (c == ',')
2744 {
2745 GET_UNSIGNED_NUMBER (upper_bound);
2746 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
2747 }
2748 else
2749 /* Interval such as `{1}' => match exactly once. */
2750 upper_bound = lower_bound;
2751
2752 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
2753 || lower_bound > upper_bound)
2754 {
2755 if (syntax & RE_NO_BK_BRACES)
2756 goto unfetch_interval;
2757 else
2758 FREE_STACK_RETURN (REG_BADBR);
2759 }
2760
2761 if (!(syntax & RE_NO_BK_BRACES))
2762 {
2763 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
2764
2765 PATFETCH (c);
2766 }
2767
2768 if (c != '}')
2769 {
2770 if (syntax & RE_NO_BK_BRACES)
2771 goto unfetch_interval;
2772 else
2773 FREE_STACK_RETURN (REG_BADBR);
2774 }
2775
2776 /* We just parsed a valid interval. */
2777
2778 /* If it's invalid to have no preceding re. */
2779 if (!laststart)
2780 {
2781 if (syntax & RE_CONTEXT_INVALID_OPS)
2782 FREE_STACK_RETURN (REG_BADRPT);
2783 else if (syntax & RE_CONTEXT_INDEP_OPS)
2784 laststart = b;
2785 else
2786 goto unfetch_interval;
2787 }
2788
2789 /* If the upper bound is zero, don't want to succeed at
2790 all; jump from `laststart' to `b + 3', which will be
2791 the end of the buffer after we insert the jump. */
2792 if (upper_bound == 0)
2793 {
2794 GET_BUFFER_SPACE (3);
2795 INSERT_JUMP (jump, laststart, b + 3);
2796 b += 3;
2797 }
2798
2799 /* Otherwise, we have a nontrivial interval. When
2800 we're all done, the pattern will look like:
2801 set_number_at <jump count> <upper bound>
2802 set_number_at <succeed_n count> <lower bound>
2803 succeed_n <after jump addr> <succeed_n count>
2804 <body of loop>
2805 jump_n <succeed_n addr> <jump count>
2806 (The upper bound and `jump_n' are omitted if
2807 `upper_bound' is 1, though.) */
2808 else
2809 { /* If the upper bound is > 1, we need to insert
2810 more at the end of the loop. */
2811 unsigned nbytes = 10 + (upper_bound > 1) * 10;
2812
2813 GET_BUFFER_SPACE (nbytes);
2814
2815 /* Initialize lower bound of the `succeed_n', even
2816 though it will be set during matching by its
2817 attendant `set_number_at' (inserted next),
2818 because `re_compile_fastmap' needs to know.
2819 Jump to the `jump_n' we might insert below. */
2820 INSERT_JUMP2 (succeed_n, laststart,
2821 b + 5 + (upper_bound > 1) * 5,
2822 lower_bound);
2823 b += 5;
2824
2825 /* Code to initialize the lower bound. Insert
2826 before the `succeed_n'. The `5' is the last two
2827 bytes of this `set_number_at', plus 3 bytes of
2828 the following `succeed_n'. */
2829 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
2830 b += 5;
2831
2832 if (upper_bound > 1)
2833 { /* More than one repetition is allowed, so
2834 append a backward jump to the `succeed_n'
2835 that starts this interval.
2836
2837 When we've reached this during matching,
2838 we'll have matched the interval once, so
2839 jump back only `upper_bound - 1' times. */
2840 STORE_JUMP2 (jump_n, b, laststart + 5,
2841 upper_bound - 1);
2842 b += 5;
2843
2844 /* The location we want to set is the second
2845 parameter of the `jump_n'; that is `b-2' as
2846 an absolute address. `laststart' will be
2847 the `set_number_at' we're about to insert;
2848 `laststart+3' the number to set, the source
2849 for the relative address. But we are
2850 inserting into the middle of the pattern --
2851 so everything is getting moved up by 5.
2852 Conclusion: (b - 2) - (laststart + 3) + 5,
2853 i.e., b - laststart.
2854
2855 We insert this at the beginning of the loop
2856 so that if we fail during matching, we'll
2857 reinitialize the bounds. */
2858 insert_op2 (set_number_at, laststart, b - laststart,
2859 upper_bound - 1, b);
2860 b += 5;
2861 }
2862 }
2863 pending_exact = 0;
2864 beg_interval = NULL;
2865 }
2866 break;
2867
2868 unfetch_interval:
2869 /* If an invalid interval, match the characters as literals. */
2870 assert (beg_interval);
2871 p = beg_interval;
2872 beg_interval = NULL;
2873
2874 /* normal_char and normal_backslash need `c'. */
2875 PATFETCH (c);
2876
2877 if (!(syntax & RE_NO_BK_BRACES))
2878 {
2879 if (p > pattern && p[-1] == '\\')
2880 goto normal_backslash;
2881 }
2882 goto normal_char;
2883
2884 #ifdef emacs
2885 /* There is no way to specify the before_dot and after_dot
2886 operators. rms says this is ok. --karl */
2887 case '=':
2888 BUF_PUSH (at_dot);
2889 break;
2890
2891 case 's':
2892 laststart = b;
2893 PATFETCH (c);
2894 /* XEmacs addition */
2895 if (c >= 0x80 || syntax_spec_code[c] == 0377)
2896 FREE_STACK_RETURN (REG_ESYNTAX);
2897 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2898 break;
2899
2900 case 'S':
2901 laststart = b;
2902 PATFETCH (c);
2903 /* XEmacs addition */
2904 if (c >= 0x80 || syntax_spec_code[c] == 0377)
2905 FREE_STACK_RETURN (REG_ESYNTAX);
2906 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2907 break;
2908
2909 #ifdef MULE
2910 /* 97.2.17 jhod merged in to XEmacs from mule-2.3 */
2911 case 'c':
2912 laststart = b;
2913 PATFETCH_RAW (c);
2914 if (c < 32 || c > 127)
2915 FREE_STACK_RETURN (REG_ECATEGORY);
2916 BUF_PUSH_2 (categoryspec, c);
2917 break;
2918
2919 case 'C':
2920 laststart = b;
2921 PATFETCH_RAW (c);
2922 if (c < 32 || c > 127)
2923 FREE_STACK_RETURN (REG_ECATEGORY);
2924 BUF_PUSH_2 (notcategoryspec, c);
2925 break;
2926 /* end of category patch */
2927 #endif /* MULE */
2928 #endif /* emacs */
2929
2930
2931 case 'w':
2932 laststart = b;
2933 BUF_PUSH (wordchar);
2934 break;
2935
2936
2937 case 'W':
2938 laststart = b;
2939 BUF_PUSH (notwordchar);
2940 break;
2941
2942
2943 case '<':
2944 BUF_PUSH (wordbeg);
2945 break;
2946
2947 case '>':
2948 BUF_PUSH (wordend);
2949 break;
2950
2951 case 'b':
2952 BUF_PUSH (wordbound);
2953 break;
2954
2955 case 'B':
2956 BUF_PUSH (notwordbound);
2957 break;
2958
2959 case '`':
2960 BUF_PUSH (begbuf);
2961 break;
2962
2963 case '\'':
2964 BUF_PUSH (endbuf);
2965 break;
2966
2967 case '1': case '2': case '3': case '4': case '5':
2968 case '6': case '7': case '8': case '9':
2969 if (syntax & RE_NO_BK_REFS)
2970 goto normal_char;
2971
2972 c1 = c - '0';
2973
2974 if (c1 > regnum)
2975 FREE_STACK_RETURN (REG_ESUBREG);
2976
2977 /* Can't back reference to a subexpression if inside of it. */
2978 if (group_in_compile_stack (compile_stack, c1))
2979 goto normal_char;
2980
2981 laststart = b;
2982 BUF_PUSH_2 (duplicate, c1);
2983 break;
2984
2985
2986 case '+':
2987 case '?':
2988 if (syntax & RE_BK_PLUS_QM)
2989 goto handle_plus;
2990 else
2991 goto normal_backslash;
2992
2993 default:
2994 normal_backslash:
2995 /* You might think it would be useful for \ to mean
2996 not to translate; but if we don't translate it,
2997 it will never match anything. */
2998 c = TRANSLATE (c);
2999 goto normal_char;
3000 }
3001 break;
3002
3003
3004 default:
3005 /* Expects the character in `c'. */
3006 /* `p' points to the location after where `c' came from. */
3007 normal_char:
3008 {
3009 /* XEmacs: modifications here for Mule. */
3010 /* `q' points to the beginning of the next char. */
3011 CONST char *q = p - 1;
3012 INC_CHARPTR (q);
3013
3014 /* If no exactn currently being built. */
3015 if (!pending_exact
3016
3017 /* If last exactn not at current position. */
3018 || pending_exact + *pending_exact + 1 != b
3019
3020 /* We have only one byte following the exactn for the count. */
3021 || ((unsigned int) (*pending_exact + (q - p)) >=
3022 ((unsigned int) (1 << BYTEWIDTH) - 1))
3023
3024 /* If followed by a repetition operator. */
3025 || *q == '*' || *q == '^'
3026 || ((syntax & RE_BK_PLUS_QM)
3027 ? *q == '\\' && (q[1] == '+' || q[1] == '?')
3028 : (*q == '+' || *q == '?'))
3029 || ((syntax & RE_INTERVALS)
3030 && ((syntax & RE_NO_BK_BRACES)
3031 ? *q == '{'
3032 : (q[0] == '\\' && q[1] == '{'))))
3033 {
3034 /* Start building a new exactn. */
3035
3036 laststart = b;
3037
3038 BUF_PUSH_2 (exactn, 0);
3039 pending_exact = b - 1;
3040 }
3041
3042 BUF_PUSH (c);
3043 (*pending_exact)++;
3044
3045 while (p < q)
3046 {
3047 PATFETCH (c);
3048 BUF_PUSH (c);
3049 (*pending_exact)++;
3050 }
3051 break;
3052 }
3053 } /* switch (c) */
3054 } /* while p != pend */
3055
3056
3057 /* Through the pattern now. */
3058
3059 if (fixup_alt_jump)
3060 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
3061
3062 if (!COMPILE_STACK_EMPTY)
3063 FREE_STACK_RETURN (REG_EPAREN);
3064
3065 /* If we don't want backtracking, force success
3066 the first time we reach the end of the compiled pattern. */
3067 if (syntax & RE_NO_POSIX_BACKTRACKING)
3068 BUF_PUSH (succeed);
3069
3070 free (compile_stack.stack);
3071
3072 /* We have succeeded; set the length of the buffer. */
3073 bufp->used = b - bufp->buffer;
3074
3075 #ifdef DEBUG
3076 if (debug)
3077 {
3078 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3079 print_compiled_pattern (bufp);
3080 }
3081 #endif /* DEBUG */
3082
3083 #ifndef MATCH_MAY_ALLOCATE
3084 /* Initialize the failure stack to the largest possible stack. This
3085 isn't necessary unless we're trying to avoid calling alloca in
3086 the search and match routines. */
3087 {
3088 int num_regs = bufp->re_nsub + 1;
3089
3090 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
3091 is strictly greater than re_max_failures, the largest possible stack
3092 is 2 * re_max_failures failure points. */
3093 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
3094 {
3095 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
3096
3097 #ifdef emacs
3098 if (! fail_stack.stack)
3099 fail_stack.stack
3100 = (fail_stack_elt_t *) xmalloc (fail_stack.size
3101 * sizeof (fail_stack_elt_t));
3102 else
3103 fail_stack.stack
3104 = (fail_stack_elt_t *) xrealloc (fail_stack.stack,
3105 (fail_stack.size
3106 * sizeof (fail_stack_elt_t)));
3107 #else /* not emacs */
3108 if (! fail_stack.stack)
3109 fail_stack.stack
3110 = (fail_stack_elt_t *) malloc (fail_stack.size
3111 * sizeof (fail_stack_elt_t));
3112 else
3113 fail_stack.stack
3114 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3115 (fail_stack.size
3116 * sizeof (fail_stack_elt_t)));
3117 #endif /* not emacs */
3118 }
3119
3120 regex_grow_registers (num_regs);
3121 }
3122 #endif /* not MATCH_MAY_ALLOCATE */
3123
3124 return REG_NOERROR;
3125 } /* regex_compile */
3126
3127 /* Subroutines for `regex_compile'. */
3128
3129 /* Store OP at LOC followed by two-byte integer parameter ARG. */
3130
3131 static void
3132 store_op1 (re_opcode_t op, unsigned char *loc, int arg)
3133 {
3134 *loc = (unsigned char) op;
3135 STORE_NUMBER (loc + 1, arg);
3136 }
3137
3138
3139 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3140
3141 static void
3142 store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
3143 {
3144 *loc = (unsigned char) op;
3145 STORE_NUMBER (loc + 1, arg1);
3146 STORE_NUMBER (loc + 3, arg2);
3147 }
3148
3149
3150 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3151 for OP followed by two-byte integer parameter ARG. */
3152
3153 static void
3154 insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
3155 {
3156 REGISTER unsigned char *pfrom = end;
3157 REGISTER unsigned char *pto = end + 3;
3158
3159 while (pfrom != loc)
3160 *--pto = *--pfrom;
3161
3162 store_op1 (op, loc, arg);
3163 }
3164
3165
3166 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3167
3168 static void
3169 insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2,
3170 unsigned char *end)
3171 {
3172 REGISTER unsigned char *pfrom = end;
3173 REGISTER unsigned char *pto = end + 5;
3174
3175 while (pfrom != loc)
3176 *--pto = *--pfrom;
3177
3178 store_op2 (op, loc, arg1, arg2);
3179 }
3180
3181
3182 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
3183 after an alternative or a begin-subexpression. We assume there is at
3184 least one character before the ^. */
3185
3186 static boolean
3187 at_begline_loc_p (CONST char *pattern, CONST char *p, reg_syntax_t syntax)
3188 {
3189 CONST char *prev = p - 2;
3190 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
3191
3192 return
3193 /* After a subexpression? */
3194 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
3195 /* After an alternative? */
3196 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
3197 }
3198
3199
3200 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3201 at least one character after the $, i.e., `P < PEND'. */
3202
3203 static boolean
3204 at_endline_loc_p (CONST char *p, CONST char *pend, int syntax)
3205 {
3206 CONST char *next = p;
3207 boolean next_backslash = *next == '\\';
3208 CONST char *next_next = p + 1 < pend ? p + 1 : 0;
3209
3210 return
3211 /* Before a subexpression? */
3212 (syntax & RE_NO_BK_PARENS ? *next == ')'
3213 : next_backslash && next_next && *next_next == ')')
3214 /* Before an alternative? */
3215 || (syntax & RE_NO_BK_VBAR ? *next == '|'
3216 : next_backslash && next_next && *next_next == '|');
3217 }
3218
3219
3220 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3221 false if it's not. */
3222
3223 static boolean
3224 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
3225 {
3226 int this_element;
3227
3228 for (this_element = compile_stack.avail - 1;
3229 this_element >= 0;
3230 this_element--)
3231 if (compile_stack.stack[this_element].regnum == regnum)
3232 return true;
3233
3234 return false;
3235 }
3236
3237
3238 /* Read the ending character of a range (in a bracket expression) from the
3239 uncompiled pattern *P_PTR (which ends at PEND). We assume the
3240 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
3241 Then we set the translation of all bits between the starting and
3242 ending characters (inclusive) in the compiled pattern B.
3243
3244 Return an error code.
3245
3246 We use these short variable names so we can use the same macros as
3247 `regex_compile' itself. */
3248
3249 static reg_errcode_t
3250 compile_range (CONST char **p_ptr, CONST char *pend, char *translate,
3251 reg_syntax_t syntax, unsigned char *b)
3252 {
3253 unsigned this_char;
3254
3255 CONST char *p = *p_ptr;
3256 int range_start, range_end;
3257
3258 if (p == pend)
3259 return REG_ERANGE;
3260
3261 /* Even though the pattern is a signed `char *', we need to fetch
3262 with unsigned char *'s; if the high bit of the pattern character
3263 is set, the range endpoints will be negative if we fetch using a
3264 signed char *.
3265
3266 We also want to fetch the endpoints without translating them; the
3267 appropriate translation is done in the bit-setting loop below. */
3268 /* The SVR4 compiler on the 3B2 had trouble with unsigned CONST char *. */
3269 range_start = ((CONST unsigned char *) p)[-2];
3270 range_end = ((CONST unsigned char *) p)[0];
3271
3272 /* Have to increment the pointer into the pattern string, so the
3273 caller isn't still at the ending character. */
3274 (*p_ptr)++;
3275
3276 /* If the start is after the end, the range is empty. */
3277 if (range_start > range_end)
3278 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3279
3280 /* Here we see why `this_char' has to be larger than an `unsigned
3281 char' -- the range is inclusive, so if `range_end' == 0xff
3282 (assuming 8-bit characters), we would otherwise go into an infinite
3283 loop, since all characters <= 0xff. */
3284 for (this_char = range_start; this_char <= range_end; this_char++)
3285 {
3286 SET_LIST_BIT (TRANSLATE (this_char));
3287 }
3288
3289 return REG_NOERROR;
3290 }
3291
3292 #ifdef MULE
3293
3294 static reg_errcode_t
3295 compile_extended_range (CONST char **p_ptr, CONST char *pend, char *translate,
3296 reg_syntax_t syntax, Lisp_Object rtab)
3297 {
3298 Emchar this_char, range_start, range_end;
3299 CONST Bufbyte *p;
3300
3301 if (*p_ptr == pend)
3302 return REG_ERANGE;
3303
3304 p = (CONST Bufbyte *) *p_ptr;
3305 range_end = charptr_emchar (p);
3306 p--; /* back to '-' */
3307 DEC_CHARPTR (p); /* back to start of range */
3308 /* We also want to fetch the endpoints without translating them; the
3309 appropriate translation is done in the bit-setting loop below. */
3310 range_start = charptr_emchar (p);
3311 INC_CHARPTR (*p_ptr);
3312
3313 /* If the start is after the end, the range is empty. */
3314 if (range_start > range_end)
3315 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3316
3317 /* Can't have ranges spanning different charsets, except maybe for
3318 ranges entirely within the first 256 chars. */
3319
3320 if ((range_start >= 0x100 || range_end >= 0x100)
3321 && CHAR_LEADING_BYTE (range_start) !=
3322 CHAR_LEADING_BYTE (range_end))
3323 return REG_ERANGESPAN;
3324
3325 /* As advertised, translations only work over the 0 - 0x7F range.
3326 Making this kind of stuff work generally is much harder.
3327 Iterating over the whole range like this would be way efficient
3328 if the range encompasses 10,000 chars or something. You'd have
3329 to do something like this:
3330
3331 range_table a;
3332 range_table b;
3333 map over translation table in [range_start, range_end] of
3334 (put the mapped range in a;
3335 put the translation in b)
3336 invert the range in a and truncate to [range_start, range_end]
3337 compute the union of a, b
3338 union the result into rtab
3339 */
3340 for (this_char = range_start;
3341 this_char <= range_end && this_char < 0x80; this_char++)
3342 {
3343 SET_RANGETAB_BIT (TRANSLATE (this_char));
3344 }
3345
3346 if (this_char <= range_end)
3347 put_range_table (rtab, this_char, range_end, Qt);
3348
3349 return REG_NOERROR;
3350 }
3351
3352 #endif /* MULE */
3353
3354 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3355 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3356 characters can start a string that matches the pattern. This fastmap
3357 is used by re_search to skip quickly over impossible starting points.
3358
3359 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3360 area as BUFP->fastmap.
3361
3362 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3363 the pattern buffer.
3364
3365 Returns 0 if we succeed, -2 if an internal error. */
3366
3367 int
3368 re_compile_fastmap (struct re_pattern_buffer *bufp)
3369 {
3370 int j, k;
3371 #ifdef MATCH_MAY_ALLOCATE
3372 fail_stack_type fail_stack;
3373 #endif
3374 DECLARE_DESTINATION
3375 /* We don't push any register information onto the failure stack. */
3376
3377 REGISTER char *fastmap = bufp->fastmap;
3378 unsigned char *pattern = bufp->buffer;
3379 unsigned long size = bufp->used;
3380 unsigned char *p = pattern;
3381 REGISTER unsigned char *pend = pattern + size;
3382
3383 #ifdef REL_ALLOC
3384 /* This holds the pointer to the failure stack, when
3385 it is allocated relocatably. */
3386 fail_stack_elt_t *failure_stack_ptr;
3387 #endif
3388
3389 /* Assume that each path through the pattern can be null until
3390 proven otherwise. We set this false at the bottom of switch
3391 statement, to which we get only if a particular path doesn't
3392 match the empty string. */
3393 boolean path_can_be_null = true;
3394
3395 /* We aren't doing a `succeed_n' to begin with. */
3396 boolean succeed_n_p = false;
3397
3398 assert (fastmap != NULL && p != NULL);
3399
3400 INIT_FAIL_STACK ();
3401 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3402 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3403 bufp->can_be_null = 0;
3404
3405 while (1)
3406 {
3407 if (p == pend || *p == succeed)
3408 {
3409 /* We have reached the (effective) end of pattern. */
3410 if (!FAIL_STACK_EMPTY ())
3411 {
3412 bufp->can_be_null |= path_can_be_null;
3413
3414 /* Reset for next path. */
3415 path_can_be_null = true;
3416
3417 p = fail_stack.stack[--fail_stack.avail].pointer;
3418
3419 continue;
3420 }
3421 else
3422 break;
3423 }
3424
3425 /* We should never be about to go beyond the end of the pattern. */
3426 assert (p < pend);
3427
3428 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3429 {
3430
3431 /* I guess the idea here is to simply not bother with a fastmap
3432 if a backreference is used, since it's too hard to figure out
3433 the fastmap for the corresponding group. Setting
3434 `can_be_null' stops `re_search_2' from using the fastmap, so
3435 that is all we do. */
3436 case duplicate:
3437 bufp->can_be_null = 1;
3438 goto done;
3439
3440
3441 /* Following are the cases which match a character. These end
3442 with `break'. */
3443
3444 case exactn:
3445 fastmap[p[1]] = 1;
3446 break;
3447
3448
3449 case charset:
3450 /* XEmacs: Under Mule, these bit vectors will
3451 only contain values for characters below 0x80. */
3452 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3453 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3454 fastmap[j] = 1;
3455 break;
3456
3457
3458 case charset_not:
3459 /* Chars beyond end of map must be allowed. */
3460 #ifdef MULE
3461 for (j = *p * BYTEWIDTH; j < 0x80; j++)
3462 fastmap[j] = 1;
3463 /* And all extended characters must be allowed, too. */
3464 for (j = 0x80; j < 0xA0; j++)
3465 fastmap[j] = 1;
3466 #else /* ! MULE */
3467 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3468 fastmap[j] = 1;
3469 #endif /* ! MULE */
3470
3471 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3472 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3473 fastmap[j] = 1;
3474 break;
3475
3476 #ifdef MULE
3477 case charset_mule:
3478 {
3479 int nentries;
3480 int i;
3481
3482 nentries = unified_range_table_nentries (p);
3483 for (i = 0; i < nentries; i++)
3484 {
3485 EMACS_INT first, last;
3486 Lisp_Object dummy_val;
3487 int jj;
3488 Bufbyte strr[MAX_EMCHAR_LEN];
3489
3490 unified_range_table_get_range (p, i, &first, &last,
3491 &dummy_val);
3492 for (jj = first; jj <= last && jj < 0x80; jj++)
3493 fastmap[jj] = 1;
3494 /* Ranges below 0x100 can span charsets, but there
3495 are only two (Control-1 and Latin-1), and
3496 either first or last has to be in them. */
3497 set_charptr_emchar (strr, first);
3498 fastmap[*strr] = 1;
3499 if (last < 0x100)
3500 {
3501 set_charptr_emchar (strr, last);
3502 fastmap[*strr] = 1;
3503 }
3504 }
3505 }
3506 break;
3507
3508 case charset_mule_not:
3509 {
3510 int nentries;
3511 int i;
3512
3513 nentries = unified_range_table_nentries (p);
3514 for (i = 0; i < nentries; i++)
3515 {
3516 EMACS_INT first, last;
3517 Lisp_Object dummy_val;
3518 int jj;
3519 int smallest_prev = 0;
3520
3521 unified_range_table_get_range (p, i, &first, &last,
3522 &dummy_val);
3523 for (jj = smallest_prev; jj < first && jj < 0x80; jj++)
3524 fastmap[jj] = 1;
3525 smallest_prev = last + 1;
3526 if (smallest_prev >= 0x80)
3527 break;
3528 }
3529 /* Calculating which leading bytes are actually allowed
3530 here is rather difficult, so we just punt and allow
3531 all of them. */
3532 for (i = 0x80; i < 0xA0; i++)
3533 fastmap[i] = 1;
3534 }
3535 break;
3536 #endif /* MULE */
3537
3538
3539 case wordchar:
3540 #ifdef emacs
3541 k = (int) Sword;
3542 goto matchsyntax;
3543 #else
3544 for (j = 0; j < (1 << BYTEWIDTH); j++)
3545 if (SYNTAX_UNSAFE
3546 (XCHAR_TABLE
3547 (regex_emacs_buffer->mirror_syntax_table), j) == Sword)
3548 fastmap[j] = 1;
3549 break;
3550 #endif
3551
3552
3553 case notwordchar:
3554 #ifdef emacs
3555 k = (int) Sword;
3556 goto matchnotsyntax;
3557 #else
3558 for (j = 0; j < (1 << BYTEWIDTH); j++)
3559 if (SYNTAX_UNSAFE
3560 (XCHAR_TABLE
3561 (regex_emacs_buffer->mirror_syntax_table), j) != Sword)
3562 fastmap[j] = 1;
3563 break;
3564 #endif
3565
3566
3567 case anychar:
3568 {
3569 int fastmap_newline = fastmap['\n'];
3570
3571 /* `.' matches anything ... */
3572 #ifdef MULE
3573 /* "anything" only includes bytes that can be the
3574 first byte of a character. */
3575 for (j = 0; j < 0xA0; j++)
3576 fastmap[j] = 1;
3577 #else
3578 for (j = 0; j < (1 << BYTEWIDTH); j++)
3579 fastmap[j] = 1;
3580 #endif
3581
3582 /* ... except perhaps newline. */
3583 if (!(bufp->syntax & RE_DOT_NEWLINE))
3584 fastmap['\n'] = fastmap_newline;
3585
3586 /* Return if we have already set `can_be_null'; if we have,
3587 then the fastmap is irrelevant. Something's wrong here. */
3588 else if (bufp->can_be_null)
3589 goto done;
3590
3591 /* Otherwise, have to check alternative paths. */
3592 break;
3593 }
3594
3595 #ifdef emacs
3596 case syntaxspec:
3597 k = *p++;
3598 matchsyntax:
3599 #ifdef MULE
3600 for (j = 0; j < 0x80; j++)
3601 if (SYNTAX_UNSAFE
3602 (XCHAR_TABLE
3603 (regex_emacs_buffer->mirror_syntax_table), j) ==
3604 (enum syntaxcode) k)
3605 fastmap[j] = 1;
3606 for (j = 0x80; j < 0xA0; j++)
3607 {
3608 if (LEADING_BYTE_PREFIX_P(j))
3609 /* too complicated to calculate this right */
3610 fastmap[j] = 1;
3611 else
3612 {
3613 int multi_p;
3614 Lisp_Object cset;
3615
3616 cset = CHARSET_BY_LEADING_BYTE (j);
3617 if (CHARSETP (cset))
3618 {
3619 if (charset_syntax (regex_emacs_buffer, cset,
3620 &multi_p)
3621 == Sword || multi_p)
3622 fastmap[j] = 1;
3623 }
3624 }
3625 }
3626 #else /* ! MULE */
3627 for (j = 0; j < (1 << BYTEWIDTH); j++)
3628 if (SYNTAX_UNSAFE
3629 (XCHAR_TABLE
3630 (regex_emacs_buffer->mirror_syntax_table), j) ==
3631 (enum syntaxcode) k)
3632 fastmap[j] = 1;
3633 #endif /* ! MULE */
3634 break;
3635
3636
3637 case notsyntaxspec:
3638 k = *p++;
3639 matchnotsyntax:
3640 #ifdef MULE
3641 for (j = 0; j < 0x80; j++)
3642 if (SYNTAX_UNSAFE
3643 (XCHAR_TABLE
3644 (regex_emacs_buffer->mirror_syntax_table), j) !=
3645 (enum syntaxcode) k)
3646 fastmap[j] = 1;
3647 for (j = 0x80; j < 0xA0; j++)
3648 {
3649 if (LEADING_BYTE_PREFIX_P(j))
3650 /* too complicated to calculate this right */
3651 fastmap[j] = 1;
3652 else
3653 {
3654 int multi_p;
3655 Lisp_Object cset;
3656
3657 cset = CHARSET_BY_LEADING_BYTE (j);
3658 if (CHARSETP (cset))
3659 {
3660 if (charset_syntax (regex_emacs_buffer, cset,
3661 &multi_p)
3662 != Sword || multi_p)
3663 fastmap[j] = 1;
3664 }
3665 }
3666 }
3667 #else /* ! MULE */
3668 for (j = 0; j < (1 << BYTEWIDTH); j++)
3669 if (SYNTAX_UNSAFE
3670 (XCHAR_TABLE
3671 (regex_emacs_buffer->mirror_syntax_table), j) !=
3672 (enum syntaxcode) k)
3673 fastmap[j] = 1;
3674 #endif /* ! MULE */
3675 break;
3676
3677 #ifdef MULE
3678 /* 97/2/17 jhod category patch */
3679 case categoryspec:
3680 case notcategoryspec:
3681 bufp->can_be_null = 1;
3682 return 0;
3683 /* end if category patch */
3684 #endif /* MULE */
3685
3686 /* All cases after this match the empty string. These end with
3687 `continue'. */
3688
3689
3690 case before_dot:
3691 case at_dot:
3692 case after_dot:
3693 continue;
3694 #endif /* not emacs */
3695
3696
3697 case no_op:
3698 case begline:
3699 case endline:
3700 case begbuf:
3701 case endbuf:
3702 case wordbound:
3703 case notwordbound:
3704 case wordbeg:
3705 case wordend:
3706 case push_dummy_failure:
3707 continue;
3708
3709
3710 case jump_n:
3711 case pop_failure_jump:
3712 case maybe_pop_jump:
3713 case jump:
3714 case jump_past_alt:
3715 case dummy_failure_jump:
3716 EXTRACT_NUMBER_AND_INCR (j, p);
3717 p += j;
3718 if (j > 0)
3719 continue;
3720
3721 /* Jump backward implies we just went through the body of a
3722 loop and matched nothing. Opcode jumped to should be
3723 `on_failure_jump' or `succeed_n'. Just treat it like an
3724 ordinary jump. For a * loop, it has pushed its failure
3725 point already; if so, discard that as redundant. */
3726 if ((re_opcode_t) *p != on_failure_jump
3727 && (re_opcode_t) *p != succeed_n)
3728 continue;
3729
3730 p++;
3731 EXTRACT_NUMBER_AND_INCR (j, p);
3732 p += j;
3733
3734 /* If what's on the stack is where we are now, pop it. */
3735 if (!FAIL_STACK_EMPTY ()
3736 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
3737 fail_stack.avail--;
3738
3739 continue;
3740
3741
3742 case on_failure_jump:
3743 case on_failure_keep_string_jump:
3744 handle_on_failure_jump:
3745 EXTRACT_NUMBER_AND_INCR (j, p);
3746
3747 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
3748 end of the pattern. We don't want to push such a point,
3749 since when we restore it above, entering the switch will
3750 increment `p' past the end of the pattern. We don't need
3751 to push such a point since we obviously won't find any more
3752 fastmap entries beyond `pend'. Such a pattern can match
3753 the null string, though. */
3754 if (p + j < pend)
3755 {
3756 if (!PUSH_PATTERN_OP (p + j, fail_stack))
3757 {
3758 RESET_FAIL_STACK ();
3759 return -2;
3760 }
3761 }
3762 else
3763 bufp->can_be_null = 1;
3764
3765 if (succeed_n_p)
3766 {
3767 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
3768 succeed_n_p = false;
3769 }
3770
3771 continue;
3772
3773
3774 case succeed_n:
3775 /* Get to the number of times to succeed. */
3776 p += 2;
3777
3778 /* Increment p past the n for when k != 0. */
3779 EXTRACT_NUMBER_AND_INCR (k, p);
3780 if (k == 0)
3781 {
3782 p -= 4;
3783 succeed_n_p = true; /* Spaghetti code alert. */
3784 goto handle_on_failure_jump;
3785 }
3786 continue;
3787
3788
3789 case set_number_at:
3790 p += 4;
3791 continue;
3792
3793
3794 case start_memory:
3795 case stop_memory:
3796 p += 2;
3797 continue;
3798
3799
3800 default:
3801 abort (); /* We have listed all the cases. */
3802 } /* switch *p++ */
3803
3804 /* Getting here means we have found the possible starting
3805 characters for one path of the pattern -- and that the empty
3806 string does not match. We need not follow this path further.
3807 Instead, look at the next alternative (remembered on the
3808 stack), or quit if no more. The test at the top of the loop
3809 does these things. */
3810 path_can_be_null = false;
3811 p = pend;
3812 } /* while p */
3813
3814 /* Set `can_be_null' for the last path (also the first path, if the
3815 pattern is empty). */
3816 bufp->can_be_null |= path_can_be_null;
3817
3818 done:
3819 RESET_FAIL_STACK ();
3820 return 0;
3821 } /* re_compile_fastmap */
3822
3823 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3824 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3825 this memory for recording register information. STARTS and ENDS
3826 must be allocated using the malloc library routine, and must each
3827 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3828
3829 If NUM_REGS == 0, then subsequent matches should allocate their own
3830 register data.
3831
3832 Unless this function is called, the first search or match using
3833 PATTERN_BUFFER will allocate its own register data, without
3834 freeing the old data. */
3835
3836 void
3837 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
3838 unsigned num_regs, regoff_t *starts, regoff_t *ends)
3839 {
3840 if (num_regs)
3841 {
3842 bufp->regs_allocated = REGS_REALLOCATE;
3843 regs->num_regs = num_regs;
3844 regs->start = starts;
3845 regs->end = ends;
3846 }
3847 else
3848 {
3849 bufp->regs_allocated = REGS_UNALLOCATED;
3850 regs->num_regs = 0;
3851 regs->start = regs->end = (regoff_t *) 0;
3852 }
3853 }
3854
3855 /* Searching routines. */
3856
3857 /* Like re_search_2, below, but only one string is specified, and
3858 doesn't let you say where to stop matching. */
3859
3860 int
3861 re_search (struct re_pattern_buffer *bufp, CONST char *string, int size,
3862 int startpos, int range, struct re_registers *regs)
3863 {
3864 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3865 regs, size);
3866 }
3867
3868 #ifndef emacs
3869 /* Snarfed from src/lisp.h, needed for compiling [ce]tags. */
3870 # define bytecount_to_charcount(ptr, len) (len)
3871 # define charcount_to_bytecount(ptr, len) (len)
3872 typedef int Charcount;
3873 #endif
3874
3875 /* Using the compiled pattern in BUFP->buffer, first tries to match the
3876 virtual concatenation of STRING1 and STRING2, starting first at index
3877 STARTPOS, then at STARTPOS + 1, and so on.
3878
3879 With MULE, STARTPOS is a byte position, not a char position. And the
3880 search will increment STARTPOS by the width of the current leading
3881 character.
3882
3883 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3884
3885 RANGE is how far to scan while trying to match. RANGE = 0 means try
3886 only at STARTPOS; in general, the last start tried is STARTPOS +
3887 RANGE.
3888
3889 With MULE, RANGE is a byte position, not a char position. The last
3890 start tried is the character starting <= STARTPOS + RANGE.
3891
3892 In REGS, return the indices of the virtual concatenation of STRING1
3893 and STRING2 that matched the entire BUFP->buffer and its contained
3894 subexpressions.
3895
3896 Do not consider matching one past the index STOP in the virtual
3897 concatenation of STRING1 and STRING2.
3898
3899 We return either the position in the strings at which the match was
3900 found, -1 if no match, or -2 if error (such as failure
3901 stack overflow). */
3902
3903 int
3904 re_search_2 (struct re_pattern_buffer *bufp, CONST char *string1,
3905 int size1, CONST char *string2, int size2, int startpos,
3906 int range, struct re_registers *regs, int stop)
3907 {
3908 int val;
3909 REGISTER char *fastmap = bufp->fastmap;
3910 REGISTER char *translate = bufp->translate;
3911 int total_size = size1 + size2;
3912 int endpos = startpos + range;
3913 #ifdef REGEX_BEGLINE_CHECK
3914 int anchored_at_begline = 0;
3915 #endif
3916 CONST unsigned char *d;
3917 Charcount d_size;
3918
3919 /* Check for out-of-range STARTPOS. */
3920 if (startpos < 0 || startpos > total_size)
3921 return -1;
3922
3923 /* Fix up RANGE if it might eventually take us outside
3924 the virtual concatenation of STRING1 and STRING2. */
3925 if (endpos < 0)
3926 range = 0 - startpos;
3927 else if (endpos > total_size)
3928 range = total_size - startpos;
3929
3930 /* If the search isn't to be a backwards one, don't waste time in a
3931 search for a pattern that must be anchored. */
3932 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
3933 {
3934 if (startpos > 0)
3935 return -1;
3936 else
3937 {
3938 d = ((CONST unsigned char *)
3939 (startpos >= size1 ? string2 - size1 : string1) + startpos);
3940 range = charcount_to_bytecount (d, 1);
3941 }
3942 }
3943
3944 /* Update the fastmap now if not correct already. */
3945 if (fastmap && !bufp->fastmap_accurate)
3946 if (re_compile_fastmap (bufp) == -2)
3947 return -2;
3948
3949 #ifdef REGEX_BEGLINE_CHECK
3950 {
3951 int i = 0;
3952
3953 while (i < bufp->used)
3954 {
3955 if (bufp->buffer[i] == start_memory ||
3956 bufp->buffer[i] == stop_memory)
3957 i += 2;
3958 else
3959 break;
3960 }
3961 anchored_at_begline = i < bufp->used && bufp->buffer[i] == begline;
3962 }
3963 #endif
3964
3965 /* Loop through the string, looking for a place to start matching. */
3966 for (;;)
3967 {
3968 #ifdef REGEX_BEGLINE_CHECK
3969 /* If the regex is anchored at the beginning of a line (i.e. with a ^),
3970 then we can speed things up by skipping to the next beginning-of-
3971 line. */
3972 if (anchored_at_begline && startpos > 0 && startpos != size1 &&
3973 range > 0)
3974 {
3975 /* whose stupid idea was it anyway to make this
3976 function take two strings to match?? */
3977 int lim = 0;
3978 int irange = range;
3979
3980 if (startpos < size1 && startpos + range >= size1)
3981 lim = range - (size1 - startpos);
3982
3983 d = ((CONST unsigned char *)
3984 (startpos >= size1 ? string2 - size1 : string1) + startpos);
3985 DEC_CHARPTR(d); /* Ok, since startpos != size1. */
3986 d_size = charcount_to_bytecount (d, 1);
3987
3988 if (translate)
3989 #ifdef MULE
3990 while (range > lim && (*d >= 0x80 || translate[*d] != '\n'))
3991 #else
3992 while (range > lim && translate[*d] != '\n')
3993 #endif
3994 {
3995 d += d_size; /* Speedier INC_CHARPTR(d) */
3996 d_size = charcount_to_bytecount (d, 1);
3997 range -= d_size;
3998 }
3999 else
4000 while (range > lim && *d != '\n')
4001 {
4002 d += d_size; /* Speedier INC_CHARPTR(d) */
4003 d_size = charcount_to_bytecount (d, 1);
4004 range -= d_size;
4005 }
4006
4007 startpos += irange - range;
4008 }
4009 #endif /* REGEX_BEGLINE_CHECK */
4010
4011 /* If a fastmap is supplied, skip quickly over characters that
4012 cannot be the start of a match. If the pattern can match the
4013 null string, however, we don't need to skip characters; we want
4014 the first null string. */
4015 if (fastmap && startpos < total_size && !bufp->can_be_null)
4016 {
4017 if (range > 0) /* Searching forwards. */
4018 {
4019 int lim = 0;
4020 int irange = range;
4021
4022 if (startpos < size1 && startpos + range >= size1)
4023 lim = range - (size1 - startpos);
4024
4025 d = ((CONST unsigned char *)
4026 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4027
4028 /* Written out as an if-else to avoid testing `translate'
4029 inside the loop. */
4030 if (translate)
4031 while (range > lim &&
4032 #ifdef MULE
4033 *d < 0x80 &&
4034 #endif
4035 !fastmap[(unsigned char)translate[*d]])
4036 {
4037 d_size = charcount_to_bytecount (d, 1);
4038 range -= d_size;
4039 d += d_size; /* Speedier INC_CHARPTR(d) */
4040 }
4041 else
4042 while (range > lim && !fastmap[*d])
4043 {
4044 d_size = charcount_to_bytecount (d, 1);
4045 range -= d_size;
4046 d += d_size; /* Speedier INC_CHARPTR(d) */
4047 }
4048
4049 startpos += irange - range;
4050 }
4051 else /* Searching backwards. */
4052 {
4053 unsigned char c = (size1 == 0 || startpos >= size1
4054 ? string2[startpos - size1]
4055 : string1[startpos]);
4056 #ifdef MULE
4057 if (c < 0x80 && !fastmap[(unsigned char) TRANSLATE (c)])
4058 #else
4059 if (!fastmap[(unsigned char) TRANSLATE (c)])
4060 #endif
4061 goto advance;
4062 }
4063 }
4064
4065 /* If can't match the null string, and that's all we have left, fail. */
4066 if (range >= 0 && startpos == total_size && fastmap
4067 && !bufp->can_be_null)
4068 return -1;
4069
4070 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4071 if (!no_quit_in_re_search)
4072 QUIT;
4073 #endif
4074 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4075 startpos, regs, stop);
4076 #ifndef REGEX_MALLOC
4077 #ifdef C_ALLOCA
4078 alloca (0);
4079 #endif
4080 #endif
4081
4082 if (val >= 0)
4083 return startpos;
4084
4085 if (val == -2)
4086 return -2;
4087
4088 advance:
4089 if (!range)
4090 break;
4091 else if (range > 0)
4092 {
4093 d = ((CONST unsigned char *)
4094 (startpos >= size1 ? string2 - size1 : string1) + startpos);
4095 d_size = charcount_to_bytecount (d, 1);
4096 range -= d_size;
4097 startpos += d_size;
4098 }
4099 else
4100 {
4101 /* Note startpos > size1 not >=. If we are on the
4102 string1/string2 boundary, we want to backup into string1. */
4103 d = ((CONST unsigned char *)
4104 (startpos > size1 ? string2 - size1 : string1) + startpos);
4105 DEC_CHARPTR(d);
4106 d_size = charcount_to_bytecount (d, 1);
4107 range += d_size;
4108 startpos -= d_size;
4109 }
4110 }
4111 return -1;
4112 } /* re_search_2 */
4113
4114 /* Declarations and macros for re_match_2. */
4115
4116 /* This converts PTR, a pointer into one of the search strings `string1'
4117 and `string2' into an offset from the beginning of that string. */
4118 #define POINTER_TO_OFFSET(ptr) \
4119 (FIRST_STRING_P (ptr) \
4120 ? ((regoff_t) ((ptr) - string1)) \
4121 : ((regoff_t) ((ptr) - string2 + size1)))
4122
4123 /* Macros for dealing with the split strings in re_match_2. */
4124
4125 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
4126
4127 /* Call before fetching a character with *d. This switches over to
4128 string2 if necessary. */
4129 #define PREFETCH() \
4130 while (d == dend) \
4131 { \
4132 /* End of string2 => fail. */ \
4133 if (dend == end_match_2) \
4134 goto fail; \
4135 /* End of string1 => advance to string2. */ \
4136 d = string2; \
4137 dend = end_match_2; \
4138 }
4139
4140
4141 /* Test if at very beginning or at very end of the virtual concatenation
4142 of `string1' and `string2'. If only one string, it's `string2'. */
4143 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
4144 #define AT_STRINGS_END(d) ((d) == end2)
4145
4146 /* XEmacs change:
4147 If the given position straddles the string gap, return the equivalent
4148 position that is before or after the gap, respectively; otherwise,
4149 return the same position. */
4150 #define POS_BEFORE_GAP_UNSAFE(d) ((d) == string2 ? end1 : (d))
4151 #define POS_AFTER_GAP_UNSAFE(d) ((d) == end1 ? string2 : (d))
4152
4153 /* Test if CH is a word-constituent character. (XEmacs change) */
4154 #define WORDCHAR_P_UNSAFE(ch) \
4155 (SYNTAX_UNSAFE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), \
4156 ch) == Sword)
4157
4158 /* Free everything we malloc. */
4159 #ifdef MATCH_MAY_ALLOCATE
4160 #define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
4161 #define FREE_VARIABLES() \
4162 do { \
4163 REGEX_FREE_STACK (fail_stack.stack); \
4164 FREE_VAR (regstart); \
4165 FREE_VAR (regend); \
4166 FREE_VAR (old_regstart); \
4167 FREE_VAR (old_regend); \
4168 FREE_VAR (best_regstart); \
4169 FREE_VAR (best_regend); \
4170 FREE_VAR (reg_info); \
4171 FREE_VAR (reg_dummy); \
4172 FREE_VAR (reg_info_dummy); \
4173 } while (0)
4174 #else
4175 #define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4176 #endif /* not MATCH_MAY_ALLOCATE */
4177
4178 /* These values must meet several constraints. They must not be valid
4179 register values; since we have a limit of 255 registers (because
4180 we use only one byte in the pattern for the register number), we can
4181 use numbers larger than 255. They must differ by 1, because of
4182 NUM_FAILURE_ITEMS above. And the value for the lowest register must
4183 be larger than the value for the highest register, so we do not try
4184 to actually save any registers when none are active. */
4185 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
4186 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
4187
4188 /* Matching routines. */
4189
4190 #ifndef emacs /* Emacs never uses this. */
4191 /* re_match is like re_match_2 except it takes only a single string. */
4192
4193 int
4194 re_match (struct re_pattern_buffer *bufp, CONST char *string, int size,
4195 int pos, struct re_registers *regs)
4196 {
4197 int result = re_match_2_internal (bufp, NULL, 0, string, size,
4198 pos, regs, size);
4199 alloca (0);
4200 return result;
4201 }
4202 #endif /* not emacs */
4203
4204
4205 /* re_match_2 matches the compiled pattern in BUFP against the
4206 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 and
4207 SIZE2, respectively). We start matching at POS, and stop matching
4208 at STOP.
4209
4210 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
4211 store offsets for the substring each group matched in REGS. See the
4212 documentation for exactly how many groups we fill.
4213
4214 We return -1 if no match, -2 if an internal error (such as the
4215 failure stack overflowing). Otherwise, we return the length of the
4216 matched substring. */
4217
4218 int
4219 re_match_2 (struct re_pattern_buffer *bufp, CONST char *string1,
4220 int size1, CONST char *string2, int size2, int pos,
4221 struct re_registers *regs, int stop)
4222 {
4223 int result = re_match_2_internal (bufp, string1, size1, string2, size2,
4224 pos, regs, stop);
4225 alloca (0);
4226 return result;
4227 }
4228
4229 /* This is a separate function so that we can force an alloca cleanup
4230 afterwards. */
4231 static int
4232 re_match_2_internal (struct re_pattern_buffer *bufp, CONST char *string1,
4233 int size1, CONST char *string2, int size2, int pos,
4234 struct re_registers *regs, int stop)
4235 {
4236 /* General temporaries. */
4237 int mcnt;
4238 unsigned char *p1;
4239 int should_succeed; /* XEmacs change */
4240
4241 /* Just past the end of the corresponding string. */
4242 CONST char *end1, *end2;
4243
4244 /* Pointers into string1 and string2, just past the last characters in
4245 each to consider matching. */
4246 CONST char *end_match_1, *end_match_2;
4247
4248 /* Where we are in the data, and the end of the current string. */
4249 CONST char *d, *dend;
4250
4251 /* Where we are in the pattern, and the end of the pattern. */
4252 unsigned char *p = bufp->buffer;
4253 REGISTER unsigned char *pend = p + bufp->used;
4254
4255 /* Mark the opcode just after a start_memory, so we can test for an
4256 empty subpattern when we get to the stop_memory. */
4257 unsigned char *just_past_start_mem = 0;
4258
4259 /* We use this to map every character in the string. */
4260 char *translate = bufp->translate;
4261
4262 /* Failure point stack. Each place that can handle a failure further
4263 down the line pushes a failure point on this stack. It consists of
4264 restart, regend, and reg_info for all registers corresponding to
4265 the subexpressions we're currently inside, plus the number of such
4266 registers, and, finally, two char *'s. The first char * is where
4267 to resume scanning the pattern; the second one is where to resume
4268 scanning the strings. If the latter is zero, the failure point is
4269 a ``dummy''; if a failure happens and the failure point is a dummy,
4270 it gets discarded and the next one is tried. */
4271 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4272 fail_stack_type fail_stack;
4273 #endif
4274 #ifdef DEBUG
4275 static unsigned failure_id;
4276 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4277 #endif
4278
4279 #ifdef REL_ALLOC
4280 /* This holds the pointer to the failure stack, when
4281 it is allocated relocatably. */
4282 fail_stack_elt_t *failure_stack_ptr;
4283 #endif
4284
4285 /* We fill all the registers internally, independent of what we
4286 return, for use in backreferences. The number here includes
4287 an element for register zero. */
4288 unsigned num_regs = bufp->re_nsub + 1;
4289
4290 /* The currently active registers. */
4291 unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4292 unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4293
4294 /* Information on the contents of registers. These are pointers into
4295 the input strings; they record just what was matched (on this
4296 attempt) by a subexpression part of the pattern, that is, the
4297 regnum-th regstart pointer points to where in the pattern we began
4298 matching and the regnum-th regend points to right after where we
4299 stopped matching the regnum-th subexpression. (The zeroth register
4300 keeps track of what the whole pattern matches.) */
4301 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4302 CONST char **regstart, **regend;
4303 #endif
4304
4305 /* If a group that's operated upon by a repetition operator fails to
4306 match anything, then the register for its start will need to be
4307 restored because it will have been set to wherever in the string we
4308 are when we last see its open-group operator. Similarly for a
4309 register's end. */
4310 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4311 CONST char **old_regstart, **old_regend;
4312 #endif
4313
4314 /* The is_active field of reg_info helps us keep track of which (possibly
4315 nested) subexpressions we are currently in. The matched_something
4316 field of reg_info[reg_num] helps us tell whether or not we have
4317 matched any of the pattern so far this time through the reg_num-th
4318 subexpression. These two fields get reset each time through any
4319 loop their register is in. */
4320 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4321 register_info_type *reg_info;
4322 #endif
4323
4324 /* The following record the register info as found in the above
4325 variables when we find a match better than any we've seen before.
4326 This happens as we backtrack through the failure points, which in
4327 turn happens only if we have not yet matched the entire string. */
4328 unsigned best_regs_set = false;
4329 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4330 CONST char **best_regstart, **best_regend;
4331 #endif
4332
4333 /* Logically, this is `best_regend[0]'. But we don't want to have to
4334 allocate space for that if we're not allocating space for anything
4335 else (see below). Also, we never need info about register 0 for
4336 any of the other register vectors, and it seems rather a kludge to
4337 treat `best_regend' differently than the rest. So we keep track of
4338 the end of the best match so far in a separate variable. We
4339 initialize this to NULL so that when we backtrack the first time
4340 and need to test it, it's not garbage. */
4341 CONST char *match_end = NULL;
4342
4343 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
4344 int set_regs_matched_done = 0;
4345
4346 /* Used when we pop values we don't care about. */
4347 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4348 CONST char **reg_dummy;
4349 register_info_type *reg_info_dummy;
4350 #endif
4351
4352 #ifdef DEBUG
4353 /* Counts the total number of registers pushed. */
4354 unsigned num_regs_pushed = 0;
4355 #endif
4356
4357 /* 1 if this match ends in the same string (string1 or string2)
4358 as the best previous match. */
4359 boolean same_str_p;
4360
4361 /* 1 if this match is the best seen so far. */
4362 boolean best_match_p;
4363
4364 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
4365
4366 INIT_FAIL_STACK ();
4367
4368 #ifdef MATCH_MAY_ALLOCATE
4369 /* Do not bother to initialize all the register variables if there are
4370 no groups in the pattern, as it takes a fair amount of time. If
4371 there are groups, we include space for register 0 (the whole
4372 pattern), even though we never use it, since it simplifies the
4373 array indexing. We should fix this. */
4374 if (bufp->re_nsub)
4375 {
4376 regstart = REGEX_TALLOC (num_regs, CONST char *);
4377 regend = REGEX_TALLOC (num_regs, CONST char *);
4378 old_regstart = REGEX_TALLOC (num_regs, CONST char *);
4379 old_regend = REGEX_TALLOC (num_regs, CONST char *);
4380 best_regstart = REGEX_TALLOC (num_regs, CONST char *);
4381 best_regend = REGEX_TALLOC (num_regs, CONST char *);
4382 reg_info = REGEX_TALLOC (num_regs, register_info_type);
4383 reg_dummy = REGEX_TALLOC (num_regs, CONST char *);
4384 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
4385
4386 if (!(regstart && regend && old_regstart && old_regend && reg_info
4387 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
4388 {
4389 FREE_VARIABLES ();
4390 return -2;
4391 }
4392 }
4393 else
4394 {
4395 /* We must initialize all our variables to NULL, so that
4396 `FREE_VARIABLES' doesn't try to free them. */
4397 regstart = regend = old_regstart = old_regend = best_regstart
4398 = best_regend = reg_dummy = NULL;
4399 reg_info = reg_info_dummy = (register_info_type *) NULL;
4400 }
4401 #endif /* MATCH_MAY_ALLOCATE */
4402
4403 /* The starting position is bogus. */
4404 if (pos < 0 || pos > size1 + size2)
4405 {
4406 FREE_VARIABLES ();
4407 return -1;
4408 }
4409
4410 /* Initialize subexpression text positions to -1 to mark ones that no
4411 start_memory/stop_memory has been seen for. Also initialize the
4412 register information struct. */
4413 for (mcnt = 1; mcnt < num_regs; mcnt++)
4414 {
4415 regstart[mcnt] = regend[mcnt]
4416 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
4417
4418 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
4419 IS_ACTIVE (reg_info[mcnt]) = 0;
4420 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4421 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4422 }
4423
4424 /* We move `string1' into `string2' if the latter's empty -- but not if
4425 `string1' is null. */
4426 if (size2 == 0 && string1 != NULL)
4427 {
4428 string2 = string1;
4429 size2 = size1;
4430 string1 = 0;
4431 size1 = 0;
4432 }
4433 end1 = string1 + size1;
4434 end2 = string2 + size2;
4435
4436 /* Compute where to stop matching, within the two strings. */
4437 if (stop <= size1)
4438 {
4439 end_match_1 = string1 + stop;
4440 end_match_2 = string2;
4441 }
4442 else
4443 {
4444 end_match_1 = end1;
4445 end_match_2 = string2 + stop - size1;
4446 }
4447
4448 /* `p' scans through the pattern as `d' scans through the data.
4449 `dend' is the end of the input string that `d' points within. `d'
4450 is advanced into the following input string whenever necessary, but
4451 this happens before fetching; therefore, at the beginning of the
4452 loop, `d' can be pointing at the end of a string, but it cannot
4453 equal `string2'. */
4454 if (size1 > 0 && pos <= size1)
4455 {
4456 d = string1 + pos;
4457 dend = end_match_1;
4458 }
4459 else
4460 {
4461 d = string2 + pos - size1;
4462 dend = end_match_2;
4463 }
4464
4465 DEBUG_PRINT1 ("The compiled pattern is: ");
4466 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4467 DEBUG_PRINT1 ("The string to match is: `");
4468 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4469 DEBUG_PRINT1 ("'\n");
4470
4471 /* This loops over pattern commands. It exits by returning from the
4472 function if the match is complete, or it drops through if the match
4473 fails at this starting point in the input data. */
4474 for (;;)
4475 {
4476 DEBUG_PRINT2 ("\n0x%lx: ", (long) p);
4477 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4478 if (!no_quit_in_re_search)
4479 QUIT;
4480 #endif
4481
4482 if (p == pend)
4483 { /* End of pattern means we might have succeeded. */
4484 DEBUG_PRINT1 ("end of pattern ... ");
4485
4486 /* If we haven't matched the entire string, and we want the
4487 longest match, try backtracking. */
4488 if (d != end_match_2)
4489 {
4490 same_str_p = (FIRST_STRING_P (match_end)
4491 == MATCHING_IN_FIRST_STRING);
4492
4493 /* AIX compiler got confused when this was combined
4494 with the previous declaration. */
4495 if (same_str_p)
4496 best_match_p = d > match_end;
4497 else
4498 best_match_p = !MATCHING_IN_FIRST_STRING;
4499
4500 DEBUG_PRINT1 ("backtracking.\n");
4501
4502 if (!FAIL_STACK_EMPTY ())
4503 { /* More failure points to try. */
4504
4505 /* If exceeds best match so far, save it. */
4506 if (!best_regs_set || best_match_p)
4507 {
4508 best_regs_set = true;
4509 match_end = d;
4510
4511 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
4512
4513 for (mcnt = 1; mcnt < num_regs; mcnt++)
4514 {
4515 best_regstart[mcnt] = regstart[mcnt];
4516 best_regend[mcnt] = regend[mcnt];
4517 }
4518 }
4519 goto fail;
4520 }
4521
4522 /* If no failure points, don't restore garbage. And if
4523 last match is real best match, don't restore second
4524 best one. */
4525 else if (best_regs_set && !best_match_p)
4526 {
4527 restore_best_regs:
4528 /* Restore best match. It may happen that `dend ==
4529 end_match_1' while the restored d is in string2.
4530 For example, the pattern `x.*y.*z' against the
4531 strings `x-' and `y-z-', if the two strings are
4532 not consecutive in memory. */
4533 DEBUG_PRINT1 ("Restoring best registers.\n");
4534
4535 d = match_end;
4536 dend = ((d >= string1 && d <= end1)
4537 ? end_match_1 : end_match_2);
4538
4539 for (mcnt = 1; mcnt < num_regs; mcnt++)
4540 {
4541 regstart[mcnt] = best_regstart[mcnt];
4542 regend[mcnt] = best_regend[mcnt];
4543 }
4544 }
4545 } /* d != end_match_2 */
4546
4547 succeed_label:
4548 DEBUG_PRINT1 ("Accepting match.\n");
4549
4550 /* If caller wants register contents data back, do it. */
4551 if (regs && !bufp->no_sub)
4552 {
4553 /* Have the register data arrays been allocated? */
4554 if (bufp->regs_allocated == REGS_UNALLOCATED)
4555 { /* No. So allocate them with malloc. We need one
4556 extra element beyond `num_regs' for the `-1' marker
4557 GNU code uses. */
4558 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
4559 regs->start = TALLOC (regs->num_regs, regoff_t);
4560 regs->end = TALLOC (regs->num_regs, regoff_t);
4561 if (regs->start == NULL || regs->end == NULL)
4562 {
4563 FREE_VARIABLES ();
4564 return -2;
4565 }
4566 bufp->regs_allocated = REGS_REALLOCATE;
4567 }
4568 else if (bufp->regs_allocated == REGS_REALLOCATE)
4569 { /* Yes. If we need more elements than were already
4570 allocated, reallocate them. If we need fewer, just
4571 leave it alone. */
4572 if (regs->num_regs < num_regs + 1)
4573 {
4574 regs->num_regs = num_regs + 1;
4575 RETALLOC (regs->start, regs->num_regs, regoff_t);
4576 RETALLOC (regs->end, regs->num_regs, regoff_t);
4577 if (regs->start == NULL || regs->end == NULL)
4578 {
4579 FREE_VARIABLES ();
4580 return -2;
4581 }
4582 }
4583 }
4584 else
4585 {
4586 /* These braces fend off a "empty body in an else-statement"
4587 warning under GCC when assert expands to nothing. */
4588 assert (bufp->regs_allocated == REGS_FIXED);
4589 }
4590
4591 /* Convert the pointer data in `regstart' and `regend' to
4592 indices. Register zero has to be set differently,
4593 since we haven't kept track of any info for it. */
4594 if (regs->num_regs > 0)
4595 {
4596 regs->start[0] = pos;
4597 regs->end[0] = (MATCHING_IN_FIRST_STRING
4598 ? ((regoff_t) (d - string1))
4599 : ((regoff_t) (d - string2 + size1)));
4600 }
4601
4602 /* Go through the first `min (num_regs, regs->num_regs)'
4603 registers, since that is all we initialized. */
4604 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
4605 {
4606 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
4607 regs->start[mcnt] = regs->end[mcnt] = -1;
4608 else
4609 {
4610 regs->start[mcnt]
4611 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
4612 regs->end[mcnt]
4613 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
4614 }
4615 }
4616
4617 /* If the regs structure we return has more elements than
4618 were in the pattern, set the extra elements to -1. If
4619 we (re)allocated the registers, this is the case,
4620 because we always allocate enough to have at least one
4621 -1 at the end. */
4622 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
4623 regs->start[mcnt] = regs->end[mcnt] = -1;
4624 } /* regs && !bufp->no_sub */
4625
4626 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4627 nfailure_points_pushed, nfailure_points_popped,
4628 nfailure_points_pushed - nfailure_points_popped);
4629 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
4630
4631 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
4632 ? string1
4633 : string2 - size1);
4634
4635 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
4636
4637 FREE_VARIABLES ();
4638 return mcnt;
4639 }
4640
4641 /* Otherwise match next pattern command. */
4642 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4643 {
4644 /* Ignore these. Used to ignore the n of succeed_n's which
4645 currently have n == 0. */
4646 case no_op:
4647 DEBUG_PRINT1 ("EXECUTING no_op.\n");
4648 break;
4649
4650 case succeed:
4651 DEBUG_PRINT1 ("EXECUTING succeed.\n");
4652 goto succeed_label;
4653
4654 /* Match the next n pattern characters exactly. The following
4655 byte in the pattern defines n, and the n bytes after that
4656 are the characters to match. */
4657 case exactn:
4658 mcnt = *p++;
4659 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
4660
4661 /* This is written out as an if-else so we don't waste time
4662 testing `translate' inside the loop. */
4663 if (translate)
4664 {
4665 do
4666 {
4667 PREFETCH ();
4668 if (translate[(unsigned char) *d++] != (char) *p++)
4669 goto fail;
4670 }
4671 while (--mcnt);
4672 }
4673 else
4674 {
4675 do
4676 {
4677 PREFETCH ();
4678 if (*d++ != (char) *p++) goto fail;
4679 }
4680 while (--mcnt);
4681 }
4682 SET_REGS_MATCHED ();
4683 break;
4684
4685
4686 /* Match any character except possibly a newline or a null. */
4687 case anychar:
4688 DEBUG_PRINT1 ("EXECUTING anychar.\n");
4689
4690 PREFETCH ();
4691
4692 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
4693 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
4694 goto fail;
4695
4696 SET_REGS_MATCHED ();
4697 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
4698 INC_CHARPTR (d); /* XEmacs change */
4699 break;
4700
4701
4702 case charset:
4703 case charset_not:
4704 {
4705 REGISTER unsigned char c;
4706 boolean not = (re_opcode_t) *(p - 1) == charset_not;
4707
4708 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
4709
4710 PREFETCH ();
4711 c = TRANSLATE (*d); /* The character to match. */
4712
4713 /* Cast to `unsigned' instead of `unsigned char' in case the
4714 bit list is a full 32 bytes long. */
4715 if (c < (unsigned) (*p * BYTEWIDTH)
4716 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4717 not = !not;
4718
4719 p += 1 + *p;
4720
4721 if (!not) goto fail;
4722
4723 SET_REGS_MATCHED ();
4724 INC_CHARPTR (d); /* XEmacs change */
4725 break;
4726 }
4727
4728 #ifdef MULE
4729 case charset_mule:
4730 case charset_mule_not:
4731 {
4732 REGISTER Emchar c;
4733 boolean not = (re_opcode_t) *(p - 1) == charset_mule_not;
4734
4735 DEBUG_PRINT2 ("EXECUTING charset_mule%s.\n", not ? "_not" : "");
4736
4737 PREFETCH ();
4738 c = charptr_emchar ((CONST Bufbyte *) d);
4739 c = TRANSLATE_EXTENDED_UNSAFE (c); /* The character to match. */
4740
4741 if (EQ (Qt, unified_range_table_lookup (p, c, Qnil)))
4742 not = !not;
4743
4744 p += unified_range_table_bytes_used (p);
4745
4746 if (!not) goto fail;
4747
4748 SET_REGS_MATCHED ();
4749 INC_CHARPTR (d);
4750 break;
4751 }
4752 #endif /* MULE */
4753
4754
4755 /* The beginning of a group is represented by start_memory.
4756 The arguments are the register number in the next byte, and the
4757 number of groups inner to this one in the next. The text
4758 matched within the group is recorded (in the internal
4759 registers data structure) under the register number. */
4760 case start_memory:
4761 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
4762
4763 /* Find out if this group can match the empty string. */
4764 p1 = p; /* To send to group_match_null_string_p. */
4765
4766 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
4767 REG_MATCH_NULL_STRING_P (reg_info[*p])
4768 = group_match_null_string_p (&p1, pend, reg_info);
4769
4770 /* Save the position in the string where we were the last time
4771 we were at this open-group operator in case the group is
4772 operated upon by a repetition operator, e.g., with `(a*)*b'
4773 against `ab'; then we want to ignore where we are now in
4774 the string in case this attempt to match fails. */
4775 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4776 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
4777 : regstart[*p];
4778 DEBUG_PRINT2 (" old_regstart: %d\n",
4779 POINTER_TO_OFFSET (old_regstart[*p]));
4780
4781 regstart[*p] = d;
4782 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
4783
4784 IS_ACTIVE (reg_info[*p]) = 1;
4785 MATCHED_SOMETHING (reg_info[*p]) = 0;
4786
4787 /* Clear this whenever we change the register activity status. */
4788 set_regs_matched_done = 0;
4789
4790 /* This is the new highest active register. */
4791 highest_active_reg = *p;
4792
4793 /* If nothing was active before, this is the new lowest active
4794 register. */
4795 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4796 lowest_active_reg = *p;
4797
4798 /* Move past the register number and inner group count. */
4799 p += 2;
4800 just_past_start_mem = p;
4801
4802 break;
4803
4804
4805 /* The stop_memory opcode represents the end of a group. Its
4806 arguments are the same as start_memory's: the register
4807 number, and the number of inner groups. */
4808 case stop_memory:
4809 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
4810
4811 /* We need to save the string position the last time we were at
4812 this close-group operator in case the group is operated
4813 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
4814 against `aba'; then we want to ignore where we are now in
4815 the string in case this attempt to match fails. */
4816 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4817 ? REG_UNSET (regend[*p]) ? d : regend[*p]
4818 : regend[*p];
4819 DEBUG_PRINT2 (" old_regend: %d\n",
4820 POINTER_TO_OFFSET (old_regend[*p]));
4821
4822 regend[*p] = d;
4823 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
4824
4825 /* This register isn't active anymore. */
4826 IS_ACTIVE (reg_info[*p]) = 0;
4827
4828 /* Clear this whenever we change the register activity status. */
4829 set_regs_matched_done = 0;
4830
4831 /* If this was the only register active, nothing is active
4832 anymore. */
4833 if (lowest_active_reg == highest_active_reg)
4834 {
4835 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4836 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4837 }
4838 else
4839 { /* We must scan for the new highest active register, since
4840 it isn't necessarily one less than now: consider
4841 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
4842 new highest active register is 1. */
4843 unsigned char r = *p - 1;
4844 while (r > 0 && !IS_ACTIVE (reg_info[r]))
4845 r--;
4846
4847 /* If we end up at register zero, that means that we saved
4848 the registers as the result of an `on_failure_jump', not
4849 a `start_memory', and we jumped to past the innermost
4850 `stop_memory'. For example, in ((.)*) we save
4851 registers 1 and 2 as a result of the *, but when we pop
4852 back to the second ), we are at the stop_memory 1.
4853 Thus, nothing is active. */
4854 if (r == 0)
4855 {
4856 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4857 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4858 }
4859 else
4860 {
4861 highest_active_reg = r;
4862
4863 /* 98/9/21 jhod: We've also gotta set lowest_active_reg, don't we? */
4864 r = 1;
4865 while (r < highest_active_reg && !IS_ACTIVE(reg_info[r]))
4866 r++;
4867 lowest_active_reg = r;
4868 }
4869 }
4870
4871 /* If just failed to match something this time around with a
4872 group that's operated on by a repetition operator, try to
4873 force exit from the ``loop'', and restore the register
4874 information for this group that we had before trying this
4875 last match. */
4876 if ((!MATCHED_SOMETHING (reg_info[*p])
4877 || just_past_start_mem == p - 1)
4878 && (p + 2) < pend)
4879 {
4880 boolean is_a_jump_n = false;
4881
4882 p1 = p + 2;
4883 mcnt = 0;
4884 switch ((re_opcode_t) *p1++)
4885 {
4886 case jump_n:
4887 is_a_jump_n = true;
4888 case pop_failure_jump:
4889 case maybe_pop_jump:
4890 case jump:
4891 case dummy_failure_jump:
4892 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4893 if (is_a_jump_n)
4894 p1 += 2;
4895 break;
4896
4897 default:
4898 /* do nothing */ ;
4899 }
4900 p1 += mcnt;
4901
4902 /* If the next operation is a jump backwards in the pattern
4903 to an on_failure_jump right before the start_memory
4904 corresponding to this stop_memory, exit from the loop
4905 by forcing a failure after pushing on the stack the
4906 on_failure_jump's jump in the pattern, and d. */
4907 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
4908 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
4909 {
4910 /* If this group ever matched anything, then restore
4911 what its registers were before trying this last
4912 failed match, e.g., with `(a*)*b' against `ab' for
4913 regstart[1], and, e.g., with `((a*)*(b*)*)*'
4914 against `aba' for regend[3].
4915
4916 Also restore the registers for inner groups for,
4917 e.g., `((a*)(b*))*' against `aba' (register 3 would
4918 otherwise get trashed). */
4919
4920 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
4921 {
4922 unsigned r;
4923
4924 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
4925
4926 /* Restore this and inner groups' (if any) registers. */
4927 for (r = *p; r < *p + *(p + 1); r++)
4928 {
4929 regstart[r] = old_regstart[r];
4930
4931 /* xx why this test? */
4932 if (old_regend[r] >= regstart[r])
4933 regend[r] = old_regend[r];
4934 }
4935 }
4936 p1++;
4937 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4938 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
4939
4940 goto fail;
4941 }
4942 }
4943
4944 /* Move past the register number and the inner group count. */
4945 p += 2;
4946 break;
4947
4948
4949 /* \<digit> has been turned into a `duplicate' command which is
4950 followed by the numeric value of <digit> as the register number. */
4951 case duplicate:
4952 {
4953 REGISTER CONST char *d2, *dend2;
4954 int regno = *p++; /* Get which register to match against. */
4955 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
4956
4957 /* Can't back reference a group which we've never matched. */
4958 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
4959 goto fail;
4960
4961 /* Where in input to try to start matching. */
4962 d2 = regstart[regno];
4963
4964 /* Where to stop matching; if both the place to start and
4965 the place to stop matching are in the same string, then
4966 set to the place to stop, otherwise, for now have to use
4967 the end of the first string. */
4968
4969 dend2 = ((FIRST_STRING_P (regstart[regno])
4970 == FIRST_STRING_P (regend[regno]))
4971 ? regend[regno] : end_match_1);
4972 for (;;)
4973 {
4974 /* If necessary, advance to next segment in register
4975 contents. */
4976 while (d2 == dend2)
4977 {
4978 if (dend2 == end_match_2) break;
4979 if (dend2 == regend[regno]) break;
4980
4981 /* End of string1 => advance to string2. */
4982 d2 = string2;
4983 dend2 = regend[regno];
4984 }
4985 /* At end of register contents => success */
4986 if (d2 == dend2) break;
4987
4988 /* If necessary, advance to next segment in data. */
4989 PREFETCH ();
4990
4991 /* How many characters left in this segment to match. */
4992 mcnt = dend - d;
4993
4994 /* Want how many consecutive characters we can match in
4995 one shot, so, if necessary, adjust the count. */
4996 if (mcnt > dend2 - d2)
4997 mcnt = dend2 - d2;
4998
4999 /* Compare that many; failure if mismatch, else move
5000 past them. */
5001 if (translate
5002 ? bcmp_translate ((unsigned char *) d,
5003 (unsigned char *) d2, mcnt, translate)
5004 : memcmp (d, d2, mcnt))
5005 goto fail;
5006 d += mcnt, d2 += mcnt;
5007
5008 /* Do this because we've match some characters. */
5009 SET_REGS_MATCHED ();
5010 }
5011 }
5012 break;
5013
5014
5015 /* begline matches the empty string at the beginning of the string
5016 (unless `not_bol' is set in `bufp'), and, if
5017 `newline_anchor' is set, after newlines. */
5018 case begline:
5019 DEBUG_PRINT1 ("EXECUTING begline.\n");
5020
5021 if (AT_STRINGS_BEG (d))
5022 {
5023 if (!bufp->not_bol) break;
5024 }
5025 else if (d[-1] == '\n' && bufp->newline_anchor)
5026 {
5027 break;
5028 }
5029 /* In all other cases, we fail. */
5030 goto fail;
5031
5032
5033 /* endline is the dual of begline. */
5034 case endline:
5035 DEBUG_PRINT1 ("EXECUTING endline.\n");
5036
5037 if (AT_STRINGS_END (d))
5038 {
5039 if (!bufp->not_eol) break;
5040 }
5041
5042 /* We have to ``prefetch'' the next character. */
5043 else if ((d == end1 ? *string2 : *d) == '\n'
5044 && bufp->newline_anchor)
5045 {
5046 break;
5047 }
5048 goto fail;
5049
5050
5051 /* Match at the very beginning of the data. */
5052 case begbuf:
5053 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5054 if (AT_STRINGS_BEG (d))
5055 break;
5056 goto fail;
5057
5058
5059 /* Match at the very end of the data. */
5060 case endbuf:
5061 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
5062 if (AT_STRINGS_END (d))
5063 break;
5064 goto fail;
5065
5066
5067 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5068 pushes NULL as the value for the string on the stack. Then
5069 `pop_failure_point' will keep the current value for the
5070 string, instead of restoring it. To see why, consider
5071 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5072 then the . fails against the \n. But the next thing we want
5073 to do is match the \n against the \n; if we restored the
5074 string value, we would be back at the foo.
5075
5076 Because this is used only in specific cases, we don't need to
5077 check all the things that `on_failure_jump' does, to make
5078 sure the right things get saved on the stack. Hence we don't
5079 share its code. The only reason to push anything on the
5080 stack at all is that otherwise we would have to change
5081 `anychar's code to do something besides goto fail in this
5082 case; that seems worse than this. */
5083 case on_failure_keep_string_jump:
5084 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
5085
5086 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5087 DEBUG_PRINT3 (" %d (to 0x%lx):\n", mcnt, (long) (p + mcnt));
5088
5089 PUSH_FAILURE_POINT (p + mcnt, (char *) 0, -2);
5090 break;
5091
5092
5093 /* Uses of on_failure_jump:
5094
5095 Each alternative starts with an on_failure_jump that points
5096 to the beginning of the next alternative. Each alternative
5097 except the last ends with a jump that in effect jumps past
5098 the rest of the alternatives. (They really jump to the
5099 ending jump of the following alternative, because tensioning
5100 these jumps is a hassle.)
5101
5102 Repeats start with an on_failure_jump that points past both
5103 the repetition text and either the following jump or
5104 pop_failure_jump back to this on_failure_jump. */
5105 case on_failure_jump:
5106 on_failure:
5107 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
5108
5109 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5110 DEBUG_PRINT3 (" %d (to 0x%lx)", mcnt, (long) (p + mcnt));
5111
5112 /* If this on_failure_jump comes right before a group (i.e.,
5113 the original * applied to a group), save the information
5114 for that group and all inner ones, so that if we fail back
5115 to this point, the group's information will be correct.
5116 For example, in \(a*\)*\1, we need the preceding group,
5117 and in \(\(a*\)b*\)\2, we need the inner group. */
5118
5119 /* We can't use `p' to check ahead because we push
5120 a failure point to `p + mcnt' after we do this. */
5121 p1 = p;
5122
5123 /* We need to skip no_op's before we look for the
5124 start_memory in case this on_failure_jump is happening as
5125 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
5126 against aba. */
5127 while (p1 < pend && (re_opcode_t) *p1 == no_op)
5128 p1++;
5129
5130 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
5131 {
5132 /* We have a new highest active register now. This will
5133 get reset at the start_memory we are about to get to,
5134 but we will have saved all the registers relevant to
5135 this repetition op, as described above. */
5136 highest_active_reg = *(p1 + 1) + *(p1 + 2);
5137 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
5138 lowest_active_reg = *(p1 + 1);
5139 }
5140
5141 DEBUG_PRINT1 (":\n");
5142 PUSH_FAILURE_POINT (p + mcnt, d, -2);
5143 break;
5144
5145
5146 /* A smart repeat ends with `maybe_pop_jump'.
5147 We change it to either `pop_failure_jump' or `jump'. */
5148 case maybe_pop_jump:
5149 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5150 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
5151 {
5152 REGISTER unsigned char *p2 = p;
5153
5154 /* Compare the beginning of the repeat with what in the
5155 pattern follows its end. If we can establish that there
5156 is nothing that they would both match, i.e., that we
5157 would have to backtrack because of (as in, e.g., `a*a')
5158 then we can change to pop_failure_jump, because we'll
5159 never have to backtrack.
5160
5161 This is not true in the case of alternatives: in
5162 `(a|ab)*' we do need to backtrack to the `ab' alternative
5163 (e.g., if the string was `ab'). But instead of trying to
5164 detect that here, the alternative has put on a dummy
5165 failure point which is what we will end up popping. */
5166
5167 /* Skip over open/close-group commands.
5168 If what follows this loop is a ...+ construct,
5169 look at what begins its body, since we will have to
5170 match at least one of that. */
5171 while (1)
5172 {
5173 if (p2 + 2 < pend
5174 && ((re_opcode_t) *p2 == stop_memory
5175 || (re_opcode_t) *p2 == start_memory))
5176 p2 += 3;
5177 else if (p2 + 6 < pend
5178 && (re_opcode_t) *p2 == dummy_failure_jump)
5179 p2 += 6;
5180 else
5181 break;
5182 }
5183
5184 p1 = p + mcnt;
5185 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
5186 to the `maybe_finalize_jump' of this case. Examine what
5187 follows. */
5188
5189 /* If we're at the end of the pattern, we can change. */
5190 if (p2 == pend)
5191 {
5192 /* Consider what happens when matching ":\(.*\)"
5193 against ":/". I don't really understand this code
5194 yet. */
5195 p[-3] = (unsigned char) pop_failure_jump;
5196 DEBUG_PRINT1
5197 (" End of pattern: change to `pop_failure_jump'.\n");
5198 }
5199
5200 else if ((re_opcode_t) *p2 == exactn
5201 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
5202 {
5203 REGISTER unsigned char c
5204 = *p2 == (unsigned char) endline ? '\n' : p2[2];
5205
5206 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
5207 {
5208 p[-3] = (unsigned char) pop_failure_jump;
5209 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
5210 c, p1[5]);
5211 }
5212
5213 else if ((re_opcode_t) p1[3] == charset
5214 || (re_opcode_t) p1[3] == charset_not)
5215 {
5216 int not = (re_opcode_t) p1[3] == charset_not;
5217
5218 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
5219 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5220 not = !not;
5221
5222 /* `not' is equal to 1 if c would match, which means
5223 that we can't change to pop_failure_jump. */
5224 if (!not)
5225 {
5226 p[-3] = (unsigned char) pop_failure_jump;
5227 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5228 }
5229 }
5230 }
5231 else if ((re_opcode_t) *p2 == charset)
5232 {
5233 #ifdef DEBUG
5234 REGISTER unsigned char c
5235 = *p2 == (unsigned char) endline ? '\n' : p2[2];
5236 #endif
5237
5238 if ((re_opcode_t) p1[3] == exactn
5239 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
5240 && (p2[2 + p1[5] / BYTEWIDTH]
5241 & (1 << (p1[5] % BYTEWIDTH)))))
5242 {
5243 p[-3] = (unsigned char) pop_failure_jump;
5244 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
5245 c, p1[5]);
5246 }
5247
5248 else if ((re_opcode_t) p1[3] == charset_not)
5249 {
5250 int idx;
5251 /* We win if the charset_not inside the loop
5252 lists every character listed in the charset after. */
5253 for (idx = 0; idx < (int) p2[1]; idx++)
5254 if (! (p2[2 + idx] == 0
5255 || (idx < (int) p1[4]
5256 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
5257 break;
5258
5259 if (idx == p2[1])
5260 {
5261 p[-3] = (unsigned char) pop_failure_jump;
5262 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5263 }
5264 }
5265 else if ((re_opcode_t) p1[3] == charset)
5266 {
5267 int idx;
5268 /* We win if the charset inside the loop
5269 has no overlap with the one after the loop. */
5270 for (idx = 0;
5271 idx < (int) p2[1] && idx < (int) p1[4];
5272 idx++)
5273 if ((p2[2 + idx] & p1[5 + idx]) != 0)
5274 break;
5275
5276 if (idx == p2[1] || idx == p1[4])
5277 {
5278 p[-3] = (unsigned char) pop_failure_jump;
5279 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5280 }
5281 }
5282 }
5283 }
5284 p -= 2; /* Point at relative address again. */
5285 if ((re_opcode_t) p[-1] != pop_failure_jump)
5286 {
5287 p[-1] = (unsigned char) jump;
5288 DEBUG_PRINT1 (" Match => jump.\n");
5289 goto unconditional_jump;
5290 }
5291 /* Note fall through. */
5292
5293
5294 /* The end of a simple repeat has a pop_failure_jump back to
5295 its matching on_failure_jump, where the latter will push a
5296 failure point. The pop_failure_jump takes off failure
5297 points put on by this pop_failure_jump's matching
5298 on_failure_jump; we got through the pattern to here from the
5299 matching on_failure_jump, so didn't fail. */
5300 case pop_failure_jump:
5301 {
5302 /* We need to pass separate storage for the lowest and
5303 highest registers, even though we don't care about the
5304 actual values. Otherwise, we will restore only one
5305 register from the stack, since lowest will == highest in
5306 `pop_failure_point'. */
5307 unsigned dummy_low_reg, dummy_high_reg;
5308 unsigned char *pdummy;
5309 CONST char *sdummy = NULL;
5310
5311 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
5312 POP_FAILURE_POINT (sdummy, pdummy,
5313 dummy_low_reg, dummy_high_reg,
5314 reg_dummy, reg_dummy, reg_info_dummy);
5315 }
5316 /* Note fall through. */
5317
5318
5319 /* Unconditionally jump (without popping any failure points). */
5320 case jump:
5321 unconditional_jump:
5322 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
5323 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5324 p += mcnt; /* Do the jump. */
5325 DEBUG_PRINT2 ("(to 0x%lx).\n", (long) p);
5326 break;
5327
5328
5329 /* We need this opcode so we can detect where alternatives end
5330 in `group_match_null_string_p' et al. */
5331 case jump_past_alt:
5332 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
5333 goto unconditional_jump;
5334
5335
5336 /* Normally, the on_failure_jump pushes a failure point, which
5337 then gets popped at pop_failure_jump. We will end up at
5338 pop_failure_jump, also, and with a pattern of, say, `a+', we
5339 are skipping over the on_failure_jump, so we have to push
5340 something meaningless for pop_failure_jump to pop. */
5341 case dummy_failure_jump:
5342 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
5343 /* It doesn't matter what we push for the string here. What
5344 the code at `fail' tests is the value for the pattern. */
5345 PUSH_FAILURE_POINT ((unsigned char *) 0, (char *) 0, -2);
5346 goto unconditional_jump;
5347
5348
5349 /* At the end of an alternative, we need to push a dummy failure
5350 point in case we are followed by a `pop_failure_jump', because
5351 we don't want the failure point for the alternative to be
5352 popped. For example, matching `(a|ab)*' against `aab'
5353 requires that we match the `ab' alternative. */
5354 case push_dummy_failure:
5355 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
5356 /* See comments just above at `dummy_failure_jump' about the
5357 two zeroes. */
5358 PUSH_FAILURE_POINT ((unsigned char *) 0, (char *) 0, -2);
5359 break;
5360
5361 /* Have to succeed matching what follows at least n times.
5362 After that, handle like `on_failure_jump'. */
5363 case succeed_n:
5364 EXTRACT_NUMBER (mcnt, p + 2);
5365 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5366
5367 assert (mcnt >= 0);
5368 /* Originally, this is how many times we HAVE to succeed. */
5369 if (mcnt > 0)
5370 {
5371 mcnt--;
5372 p += 2;
5373 STORE_NUMBER_AND_INCR (p, mcnt);
5374 DEBUG_PRINT3 (" Setting 0x%lx to %d.\n", (long) p, mcnt);
5375 }
5376 else if (mcnt == 0)
5377 {
5378 DEBUG_PRINT2 (" Setting two bytes from 0x%lx to no_op.\n",
5379 (long) (p+2));
5380 p[2] = (unsigned char) no_op;
5381 p[3] = (unsigned char) no_op;
5382 goto on_failure;
5383 }
5384 break;
5385
5386 case jump_n:
5387 EXTRACT_NUMBER (mcnt, p + 2);
5388 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5389
5390 /* Originally, this is how many times we CAN jump. */
5391 if (mcnt)
5392 {
5393 mcnt--;
5394 STORE_NUMBER (p + 2, mcnt);
5395 goto unconditional_jump;
5396 }
5397 /* If don't have to jump any more, skip over the rest of command. */
5398 else
5399 p += 4;
5400 break;
5401
5402 case set_number_at:
5403 {
5404 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
5405
5406 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5407 p1 = p + mcnt;
5408 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5409 DEBUG_PRINT3 (" Setting 0x%lx to %d.\n", (long) p1, mcnt);
5410 STORE_NUMBER (p1, mcnt);
5411 break;
5412 }
5413
5414 case wordbound:
5415 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
5416 should_succeed = 1;
5417 matchwordbound:
5418 {
5419 /* XEmacs change */
5420 int result;
5421 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5422 result = 1;
5423 else
5424 {
5425 CONST unsigned char *d_before =
5426 (CONST unsigned char *) POS_BEFORE_GAP_UNSAFE (d);
5427 CONST unsigned char *d_after =
5428 (CONST unsigned char *) POS_AFTER_GAP_UNSAFE (d);
5429 Emchar emch1, emch2;
5430
5431 DEC_CHARPTR (d_before);
5432 emch1 = charptr_emchar (d_before);
5433 emch2 = charptr_emchar (d_after);
5434 result = (WORDCHAR_P_UNSAFE (emch1) !=
5435 WORDCHAR_P_UNSAFE (emch2));
5436 }
5437 if (result == should_succeed)
5438 break;
5439 goto fail;
5440 }
5441
5442 case notwordbound:
5443 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
5444 should_succeed = 0;
5445 goto matchwordbound;
5446
5447 case wordbeg:
5448 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
5449 {
5450 /* XEmacs: this originally read:
5451
5452 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
5453 break;
5454
5455 */
5456 CONST unsigned char *dtmp =
5457 (CONST unsigned char *) POS_AFTER_GAP_UNSAFE (d);
5458 Emchar emch = charptr_emchar (dtmp);
5459 if (!WORDCHAR_P_UNSAFE (emch))
5460 goto fail;
5461 if (AT_STRINGS_BEG (d))
5462 break;
5463 dtmp = (CONST unsigned char *) POS_BEFORE_GAP_UNSAFE (d);
5464 DEC_CHARPTR (dtmp);
5465 emch = charptr_emchar (dtmp);
5466 if (!WORDCHAR_P_UNSAFE (emch))
5467 break;
5468 goto fail;
5469 }
5470
5471 case wordend:
5472 DEBUG_PRINT1 ("EXECUTING wordend.\n");
5473 {
5474 /* XEmacs: this originally read:
5475
5476 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
5477 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
5478 break;
5479
5480 The or condition is incorrect (reversed).
5481 */
5482 CONST unsigned char *dtmp;
5483 Emchar emch;
5484 if (AT_STRINGS_BEG (d))
5485 goto fail;
5486 dtmp = (CONST unsigned char *) POS_BEFORE_GAP_UNSAFE (d);
5487 DEC_CHARPTR (dtmp);
5488 emch = charptr_emchar (dtmp);
5489 if (!WORDCHAR_P_UNSAFE (emch))
5490 goto fail;
5491 if (AT_STRINGS_END (d))
5492 break;
5493 dtmp = (CONST unsigned char *) POS_AFTER_GAP_UNSAFE (d);
5494 emch = charptr_emchar (dtmp);
5495 if (!WORDCHAR_P_UNSAFE (emch))
5496 break;
5497 goto fail;
5498 }
5499
5500 #ifdef emacs
5501 case before_dot:
5502 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5503 if (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d) >=
5504 BUF_PT (regex_emacs_buffer))
5505 goto fail;
5506 break;
5507
5508 case at_dot:
5509 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5510 if (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
5511 != BUF_PT (regex_emacs_buffer))
5512 goto fail;
5513 break;
5514
5515 case after_dot:
5516 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5517 if (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d)
5518 <= BUF_PT (regex_emacs_buffer))
5519 goto fail;
5520 break;
5521 #if 0 /* not emacs19 */
5522 case at_dot:
5523 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5524 if (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d) + 1
5525 != BUF_PT (regex_emacs_buffer))
5526 goto fail;
5527 break;
5528 #endif /* not emacs19 */
5529
5530 case syntaxspec:
5531 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
5532 mcnt = *p++;
5533 goto matchsyntax;
5534
5535 case wordchar:
5536 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
5537 mcnt = (int) Sword;
5538 matchsyntax:
5539 should_succeed = 1;
5540 matchornotsyntax:
5541 {
5542 int matches;
5543 Emchar emch;
5544
5545 PREFETCH ();
5546 emch = charptr_emchar ((CONST Bufbyte *) d);
5547 matches = (SYNTAX_UNSAFE
5548 (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5549 emch) == (enum syntaxcode) mcnt);
5550 INC_CHARPTR (d);
5551 if (matches != should_succeed)
5552 goto fail;
5553 SET_REGS_MATCHED ();
5554 }
5555 break;
5556
5557 case notsyntaxspec:
5558 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
5559 mcnt = *p++;
5560 goto matchnotsyntax;
5561
5562 case notwordchar:
5563 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
5564 mcnt = (int) Sword;
5565 matchnotsyntax:
5566 should_succeed = 0;
5567 goto matchornotsyntax;
5568
5569 #ifdef MULE
5570 /* 97/2/17 jhod Mule category code patch */
5571 case categoryspec:
5572 should_succeed = 1;
5573 matchornotcategory:
5574 {
5575 Emchar emch;
5576
5577 mcnt = *p++;
5578 PREFETCH ();
5579 emch = charptr_emchar ((CONST Bufbyte *) d);
5580 INC_CHARPTR (d);
5581 if (check_category_char(emch, regex_emacs_buffer->category_table,
5582 mcnt, should_succeed))
5583 goto fail;
5584 SET_REGS_MATCHED ();
5585 }
5586 break;
5587
5588 case notcategoryspec:
5589 should_succeed = 0;
5590 goto matchornotcategory;
5591 /* end of category patch */
5592 #endif /* MULE */
5593 #else /* not emacs */
5594 case wordchar:
5595 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
5596 PREFETCH ();
5597 if (!WORDCHAR_P_UNSAFE ((int) (*d)))
5598 goto fail;
5599 SET_REGS_MATCHED ();
5600 d++;
5601 break;
5602
5603 case notwordchar:
5604 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
5605 PREFETCH ();
5606 if (!WORDCHAR_P_UNSAFE ((int) (*d)))
5607 goto fail;
5608 SET_REGS_MATCHED ();
5609 d++;
5610 break;
5611 #endif /* not emacs */
5612
5613 default:
5614 abort ();
5615 }
5616 continue; /* Successfully executed one pattern command; keep going. */
5617
5618
5619 /* We goto here if a matching operation fails. */
5620 fail:
5621 if (!FAIL_STACK_EMPTY ())
5622 { /* A restart point is known. Restore to that state. */
5623 DEBUG_PRINT1 ("\nFAIL:\n");
5624 POP_FAILURE_POINT (d, p,
5625 lowest_active_reg, highest_active_reg,
5626 regstart, regend, reg_info);
5627
5628 /* If this failure point is a dummy, try the next one. */
5629 if (!p)
5630 goto fail;
5631
5632 /* If we failed to the end of the pattern, don't examine *p. */
5633 assert (p <= pend);
5634 if (p < pend)
5635 {
5636 boolean is_a_jump_n = false;
5637
5638 /* If failed to a backwards jump that's part of a repetition
5639 loop, need to pop this failure point and use the next one. */
5640 switch ((re_opcode_t) *p)
5641 {
5642 case jump_n:
5643 is_a_jump_n = true;
5644 case maybe_pop_jump:
5645 case pop_failure_jump:
5646 case jump:
5647 p1 = p + 1;
5648 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5649 p1 += mcnt;
5650
5651 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
5652 || (!is_a_jump_n
5653 && (re_opcode_t) *p1 == on_failure_jump))
5654 goto fail;
5655 break;
5656 default:
5657 /* do nothing */ ;
5658 }
5659 }
5660
5661 if (d >= string1 && d <= end1)
5662 dend = end_match_1;
5663 }
5664 else
5665 break; /* Matching at this starting point really fails. */
5666 } /* for (;;) */
5667
5668 if (best_regs_set)
5669 goto restore_best_regs;
5670
5671 FREE_VARIABLES ();
5672
5673 return -1; /* Failure to match. */
5674 } /* re_match_2 */
5675
5676 /* Subroutine definitions for re_match_2. */
5677
5678
5679 /* We are passed P pointing to a register number after a start_memory.
5680
5681 Return true if the pattern up to the corresponding stop_memory can
5682 match the empty string, and false otherwise.
5683
5684 If we find the matching stop_memory, sets P to point to one past its number.
5685 Otherwise, sets P to an undefined byte less than or equal to END.
5686
5687 We don't handle duplicates properly (yet). */
5688
5689 static boolean
5690 group_match_null_string_p (unsigned char **p, unsigned char *end,
5691 register_info_type *reg_info)
5692 {
5693 int mcnt;
5694 /* Point to after the args to the start_memory. */
5695 unsigned char *p1 = *p + 2;
5696
5697 while (p1 < end)
5698 {
5699 /* Skip over opcodes that can match nothing, and return true or
5700 false, as appropriate, when we get to one that can't, or to the
5701 matching stop_memory. */
5702
5703 switch ((re_opcode_t) *p1)
5704 {
5705 /* Could be either a loop or a series of alternatives. */
5706 case on_failure_jump:
5707 p1++;
5708 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5709
5710 /* If the next operation is not a jump backwards in the
5711 pattern. */
5712
5713 if (mcnt >= 0)
5714 {
5715 /* Go through the on_failure_jumps of the alternatives,
5716 seeing if any of the alternatives cannot match nothing.
5717 The last alternative starts with only a jump,
5718 whereas the rest start with on_failure_jump and end
5719 with a jump, e.g., here is the pattern for `a|b|c':
5720
5721 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
5722 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
5723 /exactn/1/c
5724
5725 So, we have to first go through the first (n-1)
5726 alternatives and then deal with the last one separately. */
5727
5728
5729 /* Deal with the first (n-1) alternatives, which start
5730 with an on_failure_jump (see above) that jumps to right
5731 past a jump_past_alt. */
5732
5733 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
5734 {
5735 /* `mcnt' holds how many bytes long the alternative
5736 is, including the ending `jump_past_alt' and
5737 its number. */
5738
5739 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
5740 reg_info))
5741 return false;
5742
5743 /* Move to right after this alternative, including the
5744 jump_past_alt. */
5745 p1 += mcnt;
5746
5747 /* Break if it's the beginning of an n-th alternative
5748 that doesn't begin with an on_failure_jump. */
5749 if ((re_opcode_t) *p1 != on_failure_jump)
5750 break;
5751
5752 /* Still have to check that it's not an n-th
5753 alternative that starts with an on_failure_jump. */
5754 p1++;
5755 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5756 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
5757 {
5758 /* Get to the beginning of the n-th alternative. */
5759 p1 -= 3;
5760 break;
5761 }
5762 }
5763
5764 /* Deal with the last alternative: go back and get number
5765 of the `jump_past_alt' just before it. `mcnt' contains
5766 the length of the alternative. */
5767 EXTRACT_NUMBER (mcnt, p1 - 2);
5768
5769 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
5770 return false;
5771
5772 p1 += mcnt; /* Get past the n-th alternative. */
5773 } /* if mcnt > 0 */
5774 break;
5775
5776
5777 case stop_memory:
5778 assert (p1[1] == **p);
5779 *p = p1 + 2;
5780 return true;
5781
5782
5783 default:
5784 if (!common_op_match_null_string_p (&p1, end, reg_info))
5785 return false;
5786 }
5787 } /* while p1 < end */
5788
5789 return false;
5790 } /* group_match_null_string_p */
5791
5792
5793 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
5794 It expects P to be the first byte of a single alternative and END one
5795 byte past the last. The alternative can contain groups. */
5796
5797 static boolean
5798 alt_match_null_string_p (unsigned char *p, unsigned char *end,
5799 register_info_type *reg_info)
5800 {
5801 int mcnt;
5802 unsigned char *p1 = p;
5803
5804 while (p1 < end)
5805 {
5806 /* Skip over opcodes that can match nothing, and break when we get
5807 to one that can't. */
5808
5809 switch ((re_opcode_t) *p1)
5810 {
5811 /* It's a loop. */
5812 case on_failure_jump:
5813 p1++;
5814 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5815 p1 += mcnt;
5816 break;
5817
5818 default:
5819 if (!common_op_match_null_string_p (&p1, end, reg_info))
5820 return false;
5821 }
5822 } /* while p1 < end */
5823
5824 return true;
5825 } /* alt_match_null_string_p */
5826
5827
5828 /* Deals with the ops common to group_match_null_string_p and
5829 alt_match_null_string_p.
5830
5831 Sets P to one after the op and its arguments, if any. */
5832
5833 static boolean
5834 common_op_match_null_string_p (unsigned char **p, unsigned char *end,
5835 register_info_type *reg_info)
5836 {
5837 int mcnt;
5838 boolean ret;
5839 int reg_no;
5840 unsigned char *p1 = *p;
5841
5842 switch ((re_opcode_t) *p1++)
5843 {
5844 case no_op:
5845 case begline:
5846 case endline:
5847 case begbuf:
5848 case endbuf:
5849 case wordbeg:
5850 case wordend:
5851 case wordbound:
5852 case notwordbound:
5853 #ifdef emacs
5854 case before_dot:
5855 case at_dot:
5856 case after_dot:
5857 #endif
5858 break;
5859
5860 case start_memory:
5861 reg_no = *p1;
5862 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
5863 ret = group_match_null_string_p (&p1, end, reg_info);
5864
5865 /* Have to set this here in case we're checking a group which
5866 contains a group and a back reference to it. */
5867
5868 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
5869 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
5870
5871 if (!ret)
5872 return false;
5873 break;
5874
5875 /* If this is an optimized succeed_n for zero times, make the jump. */
5876 case jump:
5877 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5878 if (mcnt >= 0)
5879 p1 += mcnt;
5880 else
5881 return false;
5882 break;
5883
5884 case succeed_n:
5885 /* Get to the number of times to succeed. */
5886 p1 += 2;
5887 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5888
5889 if (mcnt == 0)
5890 {
5891 p1 -= 4;
5892 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5893 p1 += mcnt;
5894 }
5895 else
5896 return false;
5897 break;
5898
5899 case duplicate:
5900 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
5901 return false;
5902 break;
5903
5904 case set_number_at:
5905 p1 += 4;
5906
5907 default:
5908 /* All other opcodes mean we cannot match the empty string. */
5909 return false;
5910 }
5911
5912 *p = p1;
5913 return true;
5914 } /* common_op_match_null_string_p */
5915
5916
5917 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
5918 bytes; nonzero otherwise. */
5919
5920 static int
5921 bcmp_translate (CONST unsigned char *s1, CONST unsigned char *s2,
5922 REGISTER int len, char *translate)
5923 {
5924 REGISTER CONST unsigned char *p1 = s1, *p2 = s2;
5925 while (len)
5926 {
5927 if (translate[*p1++] != translate[*p2++]) return 1;
5928 len--;
5929 }
5930 return 0;
5931 }
5932
5933 /* Entry points for GNU code. */
5934
5935 /* re_compile_pattern is the GNU regular expression compiler: it
5936 compiles PATTERN (of length SIZE) and puts the result in BUFP.
5937 Returns 0 if the pattern was valid, otherwise an error string.
5938
5939 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
5940 are set in BUFP on entry.
5941
5942 We call regex_compile to do the actual compilation. */
5943
5944 CONST char *
5945 re_compile_pattern (CONST char *pattern, int length,
5946 struct re_pattern_buffer *bufp)
5947 {
5948 reg_errcode_t ret;
5949
5950 /* GNU code is written to assume at least RE_NREGS registers will be set
5951 (and at least one extra will be -1). */
5952 bufp->regs_allocated = REGS_UNALLOCATED;
5953
5954 /* And GNU code determines whether or not to get register information
5955 by passing null for the REGS argument to re_match, etc., not by
5956 setting no_sub. */
5957 bufp->no_sub = 0;
5958
5959 /* Match anchors at newline. */
5960 bufp->newline_anchor = 1;
5961
5962 ret = regex_compile (pattern, length, re_syntax_options, bufp);
5963
5964 if (!ret)
5965 return NULL;
5966 return gettext (re_error_msgid[(int) ret]);
5967 }
5968
5969 /* Entry points compatible with 4.2 BSD regex library. We don't define
5970 them unless specifically requested. */
5971
5972 #ifdef _REGEX_RE_COMP
5973
5974 /* BSD has one and only one pattern buffer. */
5975 static struct re_pattern_buffer re_comp_buf;
5976
5977 char *
5978 re_comp (CONST char *s)
5979 {
5980 reg_errcode_t ret;
5981
5982 if (!s)
5983 {
5984 if (!re_comp_buf.buffer)
5985 return gettext ("No previous regular expression");
5986 return 0;
5987 }
5988
5989 if (!re_comp_buf.buffer)
5990 {
5991 re_comp_buf.buffer = (unsigned char *) malloc (200);
5992 if (re_comp_buf.buffer == NULL)
5993 return gettext (re_error_msgid[(int) REG_ESPACE]);
5994 re_comp_buf.allocated = 200;
5995
5996 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
5997 if (re_comp_buf.fastmap == NULL)
5998 return gettext (re_error_msgid[(int) REG_ESPACE]);
5999 }
6000
6001 /* Since `re_exec' always passes NULL for the `regs' argument, we
6002 don't need to initialize the pattern buffer fields which affect it. */
6003
6004 /* Match anchors at newlines. */
6005 re_comp_buf.newline_anchor = 1;
6006
6007 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
6008
6009 if (!ret)
6010 return NULL;
6011
6012 /* Yes, we're discarding `CONST' here if !HAVE_LIBINTL. */
6013 return (char *) gettext (re_error_msgid[(int) ret]);
6014 }
6015
6016
6017 int
6018 re_exec (CONST char *s)
6019 {
6020 CONST int len = strlen (s);
6021 return
6022 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6023 }
6024 #endif /* _REGEX_RE_COMP */
6025
6026 /* POSIX.2 functions. Don't define these for Emacs. */
6027
6028 #ifndef emacs
6029
6030 /* regcomp takes a regular expression as a string and compiles it.
6031
6032 PREG is a regex_t *. We do not expect any fields to be initialized,
6033 since POSIX says we shouldn't. Thus, we set
6034
6035 `buffer' to the compiled pattern;
6036 `used' to the length of the compiled pattern;
6037 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6038 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6039 RE_SYNTAX_POSIX_BASIC;
6040 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
6041 `fastmap' and `fastmap_accurate' to zero;
6042 `re_nsub' to the number of subexpressions in PATTERN.
6043
6044 PATTERN is the address of the pattern string.
6045
6046 CFLAGS is a series of bits which affect compilation.
6047
6048 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6049 use POSIX basic syntax.
6050
6051 If REG_NEWLINE is set, then . and [^...] don't match newline.
6052 Also, regexec will try a match beginning after every newline.
6053
6054 If REG_ICASE is set, then we considers upper- and lowercase
6055 versions of letters to be equivalent when matching.
6056
6057 If REG_NOSUB is set, then when PREG is passed to regexec, that
6058 routine will report only success or failure, and nothing about the
6059 registers.
6060
6061 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
6062 the return codes and their meanings.) */
6063
6064 int
6065 regcomp (regex_t *preg, CONST char *pattern, int cflags)
6066 {
6067 reg_errcode_t ret;
6068 unsigned syntax
6069 = (cflags & REG_EXTENDED) ?
6070 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6071
6072 /* regex_compile will allocate the space for the compiled pattern. */
6073 preg->buffer = 0;
6074 preg->allocated = 0;
6075 preg->used = 0;
6076
6077 /* Don't bother to use a fastmap when searching. This simplifies the
6078 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
6079 characters after newlines into the fastmap. This way, we just try
6080 every character. */
6081 preg->fastmap = 0;
6082
6083 if (cflags & REG_ICASE)
6084 {
6085 unsigned i;
6086
6087 preg->translate = (char *) malloc (CHAR_SET_SIZE);
6088 if (preg->translate == NULL)
6089 return (int) REG_ESPACE;
6090
6091 /* Map uppercase characters to corresponding lowercase ones. */
6092 for (i = 0; i < CHAR_SET_SIZE; i++)
6093 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
6094 }
6095 else
6096 preg->translate = NULL;
6097
6098 /* If REG_NEWLINE is set, newlines are treated differently. */
6099 if (cflags & REG_NEWLINE)
6100 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6101 syntax &= ~RE_DOT_NEWLINE;
6102 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
6103 /* It also changes the matching behavior. */
6104 preg->newline_anchor = 1;
6105 }
6106 else
6107 preg->newline_anchor = 0;
6108
6109 preg->no_sub = !!(cflags & REG_NOSUB);
6110
6111 /* POSIX says a null character in the pattern terminates it, so we
6112 can use strlen here in compiling the pattern. */
6113 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
6114
6115 /* POSIX doesn't distinguish between an unmatched open-group and an
6116 unmatched close-group: both are REG_EPAREN. */
6117 if (ret == REG_ERPAREN) ret = REG_EPAREN;
6118
6119 return (int) ret;
6120 }
6121
6122
6123 /* regexec searches for a given pattern, specified by PREG, in the
6124 string STRING.
6125
6126 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6127 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
6128 least NMATCH elements, and we set them to the offsets of the
6129 corresponding matched substrings.
6130
6131 EFLAGS specifies `execution flags' which affect matching: if
6132 REG_NOTBOL is set, then ^ does not match at the beginning of the
6133 string; if REG_NOTEOL is set, then $ does not match at the end.
6134
6135 We return 0 if we find a match and REG_NOMATCH if not. */
6136
6137 int
6138 regexec (CONST regex_t *preg, CONST char *string, size_t nmatch,
6139 regmatch_t pmatch[], int eflags)
6140 {
6141 int ret;
6142 struct re_registers regs;
6143 regex_t private_preg;
6144 int len = strlen (string);
6145 boolean want_reg_info = !preg->no_sub && nmatch > 0;
6146
6147 private_preg = *preg;
6148
6149 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6150 private_preg.not_eol = !!(eflags & REG_NOTEOL);
6151
6152 /* The user has told us exactly how many registers to return
6153 information about, via `nmatch'. We have to pass that on to the
6154 matching routines. */
6155 private_preg.regs_allocated = REGS_FIXED;
6156
6157 if (want_reg_info)
6158 {
6159 regs.num_regs = nmatch;
6160 regs.start = TALLOC (nmatch, regoff_t);
6161 regs.end = TALLOC (nmatch, regoff_t);
6162 if (regs.start == NULL || regs.end == NULL)
6163 return (int) REG_NOMATCH;
6164 }
6165
6166 /* Perform the searching operation. */
6167 ret = re_search (&private_preg, string, len,
6168 /* start: */ 0, /* range: */ len,
6169 want_reg_info ? &regs : (struct re_registers *) 0);
6170
6171 /* Copy the register information to the POSIX structure. */
6172 if (want_reg_info)
6173 {
6174 if (ret >= 0)
6175 {
6176 unsigned r;
6177
6178 for (r = 0; r < nmatch; r++)
6179 {
6180 pmatch[r].rm_so = regs.start[r];
6181 pmatch[r].rm_eo = regs.end[r];
6182 }
6183 }
6184
6185 /* If we needed the temporary register info, free the space now. */
6186 free (regs.start);
6187 free (regs.end);
6188 }
6189
6190 /* We want zero return to mean success, unlike `re_search'. */
6191 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6192 }
6193
6194
6195 /* Returns a message corresponding to an error code, ERRCODE, returned
6196 from either regcomp or regexec. We don't use PREG here. */
6197
6198 size_t
6199 regerror (int errcode, CONST regex_t *preg, char *errbuf, size_t errbuf_size)
6200 {
6201 CONST char *msg;
6202 size_t msg_size;
6203
6204 if (errcode < 0
6205 || errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
6206 /* Only error codes returned by the rest of the code should be passed
6207 to this routine. If we are given anything else, or if other regex
6208 code generates an invalid error code, then the program has a bug.
6209 Dump core so we can fix it. */
6210 abort ();
6211
6212 msg = gettext (re_error_msgid[errcode]);
6213
6214 msg_size = strlen (msg) + 1; /* Includes the null. */
6215
6216 if (errbuf_size != 0)
6217 {
6218 if (msg_size > errbuf_size)
6219 {
6220 strncpy (errbuf, msg, errbuf_size - 1);
6221 errbuf[errbuf_size - 1] = 0;
6222 }
6223 else
6224 strcpy (errbuf, msg);
6225 }
6226
6227 return msg_size;
6228 }
6229
6230
6231 /* Free dynamically allocated space used by PREG. */
6232
6233 void
6234 regfree (regex_t *preg)
6235 {
6236 if (preg->buffer != NULL)
6237 free (preg->buffer);
6238 preg->buffer = NULL;
6239
6240 preg->allocated = 0;
6241 preg->used = 0;
6242
6243 if (preg->fastmap != NULL)
6244 free (preg->fastmap);
6245 preg->fastmap = NULL;
6246 preg->fastmap_accurate = 0;
6247
6248 if (preg->translate != NULL)
6249 free (preg->translate);
6250 preg->translate = NULL;
6251 }
6252
6253 #endif /* not emacs */
6254
6255 /*
6256 Local variables:
6257 make-backup-files: t
6258 version-control: t
6259 trim-versions-without-asking: nil
6260 End:
6261 */