Mercurial > hg > xemacs-beta
annotate src/regex.c @ 5569:d19b6e3bdf91
#'cl-defsubst-expand; avoid mutually-recursive symbol macros.
lisp/ChangeLog addition:
2011-09-10 Aidan Kehoe <kehoea@parhasard.net>
* cl-macs.el (cl-defsubst-expand):
Change set 2a6a8da4dd7c of
http://mid.gmane.org/19966.17522.332164.615228@parhasard.net
wasn't sufficiently comprehensive, symbol macros can be mutually
rather than simply recursive, and they can equally hang. Thanks
for the bug report, Michael Sperber, and for the test case,
Stephen Turnbull.
| author | Aidan Kehoe <kehoea@parhasard.net> |
|---|---|
| date | Sat, 10 Sep 2011 13:17:29 +0100 |
| parents | 308d34e9f07d |
| children | 3f4a234f4672 |
| rev | line source |
|---|---|
| 428 | 1 /* Extended regular expression matching and search library, |
| 2 version 0.12, extended for XEmacs. | |
| 3 (Implements POSIX draft P10003.2/D11.2, except for | |
| 4 internationalization features.) | |
| 5 | |
| 6 Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc. | |
| 7 Copyright (C) 1995 Sun Microsystems, Inc. | |
| 5041 | 8 Copyright (C) 1995, 2001, 2002, 2003, 2010 Ben Wing. |
| 428 | 9 |
|
5402
308d34e9f07d
Changed bulk of GPLv2 or later files identified by script
Mats Lidell <matsl@xemacs.org>
parents:
5041
diff
changeset
|
10 This file is part of XEmacs. |
|
308d34e9f07d
Changed bulk of GPLv2 or later files identified by script
Mats Lidell <matsl@xemacs.org>
parents:
5041
diff
changeset
|
11 |
|
308d34e9f07d
Changed bulk of GPLv2 or later files identified by script
Mats Lidell <matsl@xemacs.org>
parents:
5041
diff
changeset
|
12 XEmacs is free software: you can redistribute it and/or modify it |
|
308d34e9f07d
Changed bulk of GPLv2 or later files identified by script
Mats Lidell <matsl@xemacs.org>
parents:
5041
diff
changeset
|
13 under the terms of the GNU General Public License as published by the |
|
308d34e9f07d
Changed bulk of GPLv2 or later files identified by script
Mats Lidell <matsl@xemacs.org>
parents:
5041
diff
changeset
|
14 Free Software Foundation, either version 3 of the License, or (at your |
|
308d34e9f07d
Changed bulk of GPLv2 or later files identified by script
Mats Lidell <matsl@xemacs.org>
parents:
5041
diff
changeset
|
15 option) any later version. |
|
308d34e9f07d
Changed bulk of GPLv2 or later files identified by script
Mats Lidell <matsl@xemacs.org>
parents:
5041
diff
changeset
|
16 |
|
308d34e9f07d
Changed bulk of GPLv2 or later files identified by script
Mats Lidell <matsl@xemacs.org>
parents:
5041
diff
changeset
|
17 XEmacs is distributed in the hope that it will be useful, but WITHOUT |
|
308d34e9f07d
Changed bulk of GPLv2 or later files identified by script
Mats Lidell <matsl@xemacs.org>
parents:
5041
diff
changeset
|
18 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|
308d34e9f07d
Changed bulk of GPLv2 or later files identified by script
Mats Lidell <matsl@xemacs.org>
parents:
5041
diff
changeset
|
19 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
|
308d34e9f07d
Changed bulk of GPLv2 or later files identified by script
Mats Lidell <matsl@xemacs.org>
parents:
5041
diff
changeset
|
20 for more details. |
|
308d34e9f07d
Changed bulk of GPLv2 or later files identified by script
Mats Lidell <matsl@xemacs.org>
parents:
5041
diff
changeset
|
21 |
| 428 | 22 You should have received a copy of the GNU General Public License |
|
5402
308d34e9f07d
Changed bulk of GPLv2 or later files identified by script
Mats Lidell <matsl@xemacs.org>
parents:
5041
diff
changeset
|
23 along with XEmacs. If not, see <http://www.gnu.org/licenses/>. */ |
| 428 | 24 /* Synched up with: FSF 19.29. */ |
| 25 | |
| 26 #ifdef HAVE_CONFIG_H | |
| 27 #include <config.h> | |
| 28 #endif | |
| 29 | |
| 30 #ifndef _GNU_SOURCE | |
| 31 #define _GNU_SOURCE 1 | |
| 32 #endif | |
| 33 | |
| 34 /* We assume non-Mule if emacs isn't defined. */ | |
| 35 #ifndef emacs | |
| 36 #undef MULE | |
| 37 #endif | |
| 38 | |
| 771 | 39 /* XEmacs addition */ |
| 40 #ifdef REL_ALLOC | |
| 41 #define REGEX_REL_ALLOC /* may be undefined below */ | |
| 42 #endif | |
| 43 | |
| 428 | 44 /* XEmacs: define this to add in a speedup for patterns anchored at |
| 45 the beginning of a line. Keep the ifdefs so that it's easier to | |
| 46 tell where/why this code has diverged from v19. */ | |
| 47 #define REGEX_BEGLINE_CHECK | |
| 48 | |
| 49 /* XEmacs: the current mmap-based ralloc handles small blocks very | |
| 50 poorly, so we disable it here. */ | |
| 51 | |
| 771 | 52 #if defined (HAVE_MMAP) || defined (DOUG_LEA_MALLOC) |
| 53 # undef REGEX_REL_ALLOC | |
| 428 | 54 #endif |
| 55 | |
| 56 /* The `emacs' switch turns on certain matching commands | |
| 57 that make sense only in Emacs. */ | |
| 58 #ifdef emacs | |
| 59 | |
| 60 #include "lisp.h" | |
| 61 #include "buffer.h" | |
| 62 #include "syntax.h" | |
| 63 | |
| 64 #if (defined (DEBUG_XEMACS) && !defined (DEBUG)) | |
| 65 #define DEBUG | |
| 66 #endif | |
| 67 | |
| 867 | 68 #define RE_TRANSLATE_1(ch) TRT_TABLE_OF (translate, (Ichar) ch) |
| 446 | 69 #define TRANSLATE_P(tr) (!NILP (tr)) |
| 428 | 70 |
| 826 | 71 /* Converts the pointer to the char to BEG-based offset from the start. */ |
| 72 #define PTR_TO_OFFSET(d) (MATCHING_IN_FIRST_STRING \ | |
| 73 ? (d) - string1 : (d) - (string2 - size1)) | |
| 74 | |
| 428 | 75 #else /* not emacs */ |
| 76 | |
| 2367 | 77 #include <stdlib.h> |
| 78 #include <sys/types.h> | |
| 79 #include <stddef.h> /* needed for ptrdiff_t under Solaris */ | |
| 80 #include <string.h> | |
| 81 | |
| 2286 | 82 #include "compiler.h" /* Get compiler-specific definitions like UNUSED */ |
| 83 | |
| 2500 | 84 #define ABORT abort |
| 85 | |
| 428 | 86 /* If we are not linking with Emacs proper, |
| 87 we can't use the relocating allocator | |
| 88 even if config.h says that we can. */ | |
| 771 | 89 #undef REGEX_REL_ALLOC |
| 428 | 90 |
| 544 | 91 /* defined in lisp.h */ |
| 92 #ifdef REGEX_MALLOC | |
| 93 #ifndef DECLARE_NOTHING | |
| 94 #define DECLARE_NOTHING struct nosuchstruct | |
| 95 #endif | |
| 96 #endif | |
| 97 | |
| 867 | 98 #define itext_ichar(str) ((Ichar) (str)[0]) |
| 99 #define itext_ichar_fmt(str, fmt, object) ((Ichar) (str)[0]) | |
| 100 #define itext_ichar_ascii_fmt(str, fmt, object) ((Ichar) (str)[0]) | |
| 428 | 101 |
| 102 #if (LONGBITS > INTBITS) | |
| 103 # define EMACS_INT long | |
| 104 #else | |
| 105 # define EMACS_INT int | |
| 106 #endif | |
| 107 | |
| 867 | 108 typedef int Ichar; |
| 109 | |
| 110 #define INC_IBYTEPTR(p) ((p)++) | |
| 111 #define INC_IBYTEPTR_FMT(p, fmt) ((p)++) | |
| 112 #define DEC_IBYTEPTR(p) ((p)--) | |
| 113 #define DEC_IBYTEPTR_FMT(p, fmt) ((p)--) | |
|
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
114 #define MAX_ICHAR_LEN 1 |
| 867 | 115 #define itext_ichar_len(ptr) 1 |
| 116 #define itext_ichar_len_fmt(ptr, fmt) 1 | |
| 428 | 117 |
| 118 /* Define the syntax stuff for \<, \>, etc. */ | |
| 119 | |
| 120 /* This must be nonzero for the wordchar and notwordchar pattern | |
| 121 commands in re_match_2. */ | |
#ifndef Sword
#define Sword 1
#endif

#ifdef SYNTAX_TABLE

extern char *re_syntax_table;

#else /* not SYNTAX_TABLE */

/* How many characters in the character set.  */
#define CHAR_SET_SIZE 256

/* Syntax class of each character code: Sword for the word-constituent
   characters listed in init_syntax_once(), 0 for everything else.  */
static char re_syntax_table[CHAR_SET_SIZE];

/* Fill in re_syntax_table.  Only the first call does any work; a static
   flag turns every later call into a no-op.

   The table is cleared and then the entries for the ASCII letters,
   digits and underscore are set to Sword.  */
static void
init_syntax_once (void)
{
  static int done = 0;
  const char *p;

  if (done)
    return;

  memset (re_syntax_table, 0, sizeof (re_syntax_table));

  /* Index through `unsigned char': converting a plain `char' straight
     to `unsigned int' would sign-extend on signed-char machines and
     produce an out-of-range index for any character above 127.  */
  for (p = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
       *p != '\0';
       p++)
    re_syntax_table[(unsigned char) *p] = Sword;

  done = 1;
}

#endif /* SYNTAX_TABLE */
| 428 | 157 |
| 826 | 158 #define SYNTAX(ignored, c) re_syntax_table[c] |
| 460 | 159 #undef SYNTAX_FROM_CACHE |
| 826 | 160 #define SYNTAX_FROM_CACHE SYNTAX |
| 161 | |
| 162 #define RE_TRANSLATE_1(c) translate[(unsigned char) (c)] | |
| 446 | 163 #define TRANSLATE_P(tr) tr |
| 164 | |
| 165 #endif /* emacs */ | |
| 428 | 166 |
| 2201 | 167 /* This is for other GNU distributions with internationalized messages. */ |
| 168 #if defined (I18N3) && (defined (HAVE_LIBINTL_H) || defined (_LIBC)) | |
| 169 # include <libintl.h> | |
| 170 #else | |
| 171 # define gettext(msgid) (msgid) | |
| 172 #endif | |
| 173 | |
| 428 | 174 |
| 175 /* Get the interface, including the syntax bits. */ | |
| 176 #include "regex.h" | |
| 177 | |
| 178 /* isalpha etc. are used for the character classes. */ | |
| 179 #include <ctype.h> | |
| 180 | |
| 181 /* Jim Meyering writes: | |
| 182 | |
| 183 "... Some ctype macros are valid only for character codes that | |
| 184 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when | |
| 185 using /bin/cc or gcc but without giving an ansi option). So, all | |
| 186 ctype uses should be through macros like ISPRINT... If | |
| 187 STDC_HEADERS is defined, then autoconf has verified that the ctype | |
| 188 macros don't need to be guarded with references to isascii. ... | |
| 189 Defining isascii to 1 should let any compiler worth its salt | |
| 190 eliminate the && through constant folding." */ | |
| 191 | |
| 192 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) | |
| 193 #define ISASCII_1(c) 1 | |
| 194 #else | |
| 195 #define ISASCII_1(c) isascii(c) | |
| 196 #endif | |
| 197 | |
| 198 #ifdef MULE | |
| 199 /* The IS*() macros can be passed any character, including an extended | |
| 200 one. We need to make sure there are no crashes, which would occur | |
| 201 otherwise due to out-of-bounds array references. */ | |
| 202 #define ISASCII(c) (((EMACS_UINT) (c)) < 0x100 && ISASCII_1 (c)) | |
| 203 #else | |
| 204 #define ISASCII(c) ISASCII_1 (c) | |
| 205 #endif /* MULE */ | |
| 206 | |
| 207 #ifdef isblank | |
| 208 #define ISBLANK(c) (ISASCII (c) && isblank (c)) | |
| 209 #else | |
| 210 #define ISBLANK(c) ((c) == ' ' || (c) == '\t') | |
| 211 #endif | |
| 212 #ifdef isgraph | |
| 213 #define ISGRAPH(c) (ISASCII (c) && isgraph (c)) | |
| 214 #else | |
| 215 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c)) | |
| 216 #endif | |
| 217 | |
| 218 #define ISPRINT(c) (ISASCII (c) && isprint (c)) | |
| 219 #define ISDIGIT(c) (ISASCII (c) && isdigit (c)) | |
| 220 #define ISALNUM(c) (ISASCII (c) && isalnum (c)) | |
| 221 #define ISALPHA(c) (ISASCII (c) && isalpha (c)) | |
| 222 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c)) | |
| 223 #define ISLOWER(c) (ISASCII (c) && islower (c)) | |
| 224 #define ISPUNCT(c) (ISASCII (c) && ispunct (c)) | |
| 225 #define ISSPACE(c) (ISASCII (c) && isspace (c)) | |
| 226 #define ISUPPER(c) (ISASCII (c) && isupper (c)) | |
| 227 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c)) | |
| 228 | |
| 229 #ifndef NULL | |
| 230 #define NULL (void *)0 | |
| 231 #endif | |
| 232 | |
| 233 /* We remove any previous definition of `SIGN_EXTEND_CHAR', | |
| 234 since ours (we hope) works properly with all combinations of | |
| 235 machines, compilers, `char' and `unsigned char' argument types. | |
| 236 (Per Bothner suggested the basic approach.) */ | |
| 237 #undef SIGN_EXTEND_CHAR | |
| 238 #if __STDC__ | |
| 239 #define SIGN_EXTEND_CHAR(c) ((signed char) (c)) | |
| 240 #else /* not __STDC__ */ | |
| 241 /* As in Harbison and Steele. */ | |
| 242 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) | |
| 243 #endif | |
| 244 | |
| 245 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we | |
| 246 use `alloca' instead of `malloc'. This is because using malloc in | |
| 247 re_search* or re_match* could cause memory leaks when C-g is used in | |
| 248 Emacs; also, malloc is slower and causes storage fragmentation. On | |
| 249 the other hand, malloc is more portable, and easier to debug. | |
| 250 | |
| 251 Because we sometimes use alloca, some routines have to be macros, | |
| 252 not functions -- `alloca'-allocated space disappears at the end of the | |
| 253 function it is called in. */ | |
| 254 | |
| 1333 | 255 #ifndef emacs |
| 256 #define ALLOCA alloca | |
| 257 #define xmalloc malloc | |
| 258 #define xrealloc realloc | |
|
4976
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4832
diff
changeset
|
259 #define xfree free |
| 1333 | 260 #endif |
| 261 | |
| 262 #ifdef emacs | |
| 263 #define ALLOCA_GARBAGE_COLLECT() \ | |
| 264 do \ | |
| 265 { \ | |
| 266 if (need_to_check_c_alloca) \ | |
| 267 xemacs_c_alloca (0); \ | |
| 268 } while (0) | |
| 269 #elif defined (C_ALLOCA) | |
| 270 #define ALLOCA_GARBAGE_COLLECT() alloca (0) | |
| 271 #else | |
| 272 #define ALLOCA_GARBAGE_COLLECT() | |
| 273 #endif | |
| 274 | |
| 275 #ifndef emacs | |
| 276 | /* So that we can conditionalize on just this one macro. */ | |
| 277 #undef ERROR_CHECK_MALLOC | |
| 278 #endif | |
| 279 | |
| 280 #ifdef ERROR_CHECK_MALLOC | |
| 281 /* When REL_ALLOC, malloc() is problematic because it could potentially | |
| 282 cause all rel-alloc()ed data -- including buffer text -- to be relocated. | |
| 283 We deal with this by checking for such relocation whenever we have | |
| 284 executed a statement that may call malloc() -- or alloca(), which may | |
| 285 end up calling malloc() in some circumstances -- and recomputing all | |
| 286 of our string pointers in re_match_2_internal() and re_search_2(). | |
| 287 However, if malloc() or alloca() happens and we don't know about it, | |
| 288 we could still be screwed. So we set up a system where we indicate all | |
| 289 places where we are prepared for malloc() or alloca(), and in any | |
| 290 other circumstances, calls to those functions (from anywhere inside of | |
| 2500 | 291 XEmacs!) will ABORT(). We do this even when REL_ALLOC is not defined |
| 1333 | 292 so that we catch these problems sooner, since many developers and beta |
| 293 testers will not be running with REL_ALLOC. */ | |
| 294 int regex_malloc_disallowed; | |
| 295 #define BEGIN_REGEX_MALLOC_OK() regex_malloc_disallowed = 0 | |
| 296 #define END_REGEX_MALLOC_OK() regex_malloc_disallowed = 1 | |
| 297 #define UNBIND_REGEX_MALLOC_CHECK() unbind_to (depth) | |
| 298 #else | |
| 299 #define BEGIN_REGEX_MALLOC_OK() | |
| 300 #define END_REGEX_MALLOC_OK() | |
| 301 #define UNBIND_REGEX_MALLOC_CHECK() | |
| 302 #endif | |
| 303 | |
| 304 | |
| 428 | 305 #ifdef REGEX_MALLOC |
| 306 | |
| 1333 | 307 #define REGEX_ALLOCATE xmalloc |
| 308 #define REGEX_REALLOCATE(source, osize, nsize) xrealloc (source, nsize) | |
| 309 #define REGEX_FREE xfree | |
| 428 | 310 |
| 311 #else /* not REGEX_MALLOC */ | |
| 312 | |
| 313 /* Emacs already defines alloca, sometimes. */ | |
| 314 #ifndef alloca | |
| 315 | |
| 316 /* Make alloca work the best possible way. */ | |
| 317 #ifdef __GNUC__ | |
| 318 #define alloca __builtin_alloca | |
| 771 | 319 #elif defined (__DECC) /* XEmacs: added next 3 lines, similar to config.h.in */ |
| 320 #include <alloca.h> | |
| 321 #pragma intrinsic(alloca) | |
| 428 | 322 #else /* not __GNUC__ */ |
| 323 #if HAVE_ALLOCA_H | |
| 324 #include <alloca.h> | |
| 325 #else /* not __GNUC__ or HAVE_ALLOCA_H */ | |
| 326 #ifndef _AIX /* Already did AIX, up at the top. */ | |
| 444 | 327 void *alloca (); |
| 428 | 328 #endif /* not _AIX */ |
| 446 | 329 #endif /* HAVE_ALLOCA_H */ |
| 330 #endif /* __GNUC__ */ | |
| 428 | 331 |
| 332 #endif /* not alloca */ | |
| 333 | |
| 1333 | 334 #define REGEX_ALLOCATE ALLOCA |
| 428 | 335 |
| 2367 | 336 /* !!#### Needs review */ |
| 428 | 337 /* Assumes a `char *destination' variable. */ |
| 338 #define REGEX_REALLOCATE(source, osize, nsize) \ | |
| 1333 | 339 (destination = (char *) ALLOCA (nsize), \ |
| 428 | 340 memmove (destination, source, osize), \ |
| 341 destination) | |
| 342 | |
| 1726 | 343 /* No need to do anything to free, after alloca. |
| 344 Do nothing! But inhibit gcc warning. */ | |
| 345 #define REGEX_FREE(arg,type) ((void)0) | |
| 428 | 346 |
| 446 | 347 #endif /* REGEX_MALLOC */ |
| 428 | 348 |
| 349 /* Define how to allocate the failure stack. */ | |
| 350 | |
| 771 | 351 #ifdef REGEX_REL_ALLOC |
| 428 | 352 #define REGEX_ALLOCATE_STACK(size) \ |
| 1346 | 353 r_alloc ((unsigned char **) &failure_stack_ptr, (size)) |
| 428 | 354 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \ |
| 1346 | 355 r_re_alloc ((unsigned char **) &failure_stack_ptr, (nsize)) |
| 428 | 356 #define REGEX_FREE_STACK(ptr) \ |
| 1346 | 357 r_alloc_free ((unsigned char **) &failure_stack_ptr) |
| 428 | 358 |
| 771 | 359 #else /* not REGEX_REL_ALLOC */ |
| 428 | 360 |
| 361 #ifdef REGEX_MALLOC | |
| 362 | |
| 1333 | 363 #define REGEX_ALLOCATE_STACK xmalloc |
| 364 #define REGEX_REALLOCATE_STACK(source, osize, nsize) xrealloc (source, nsize) | |
|
4976
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4832
diff
changeset
|
365 #define REGEX_FREE_STACK(arg) xfree (arg) |
| 428 | 366 |
| 367 #else /* not REGEX_MALLOC */ | |
| 368 | |
| 1333 | 369 #define REGEX_ALLOCATE_STACK ALLOCA |
| 428 | 370 |
| 371 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \ | |
| 372 REGEX_REALLOCATE (source, osize, nsize) | |
| 373 /* No need to explicitly free anything. */ | |
| 374 #define REGEX_FREE_STACK(arg) | |
| 375 | |
| 446 | 376 #endif /* REGEX_MALLOC */ |
| 771 | 377 #endif /* REGEX_REL_ALLOC */ |
| 428 | 378 |
| 379 | |
| 380 | /* True if `size1' is nonzero and PTR is pointing anywhere inside | |
| 381 `string1' or just past its end. This works if PTR is NULL, which is | |
| 382 a good thing. */ | |
| 383 #define FIRST_STRING_P(ptr) \ | |
| 384 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) | |
| 385 | |
| 386 /* (Re)Allocate N items of type T using malloc, or fail. */ | |
| 1333 | 387 #define TALLOC(n, t) ((t *) xmalloc ((n) * sizeof (t))) |
| 388 #define RETALLOC(addr, n, t) ((addr) = (t *) xrealloc (addr, (n) * sizeof (t))) | |
| 428 | 389 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) |
| 390 | |
| 391 #define BYTEWIDTH 8 /* In bits. */ | |
| 392 | |
| 434 | 393 #define STREQ(s1, s2) (strcmp (s1, s2) == 0) |
| 428 | 394 |
| 395 #undef MAX | |
| 396 #undef MIN | |
| 397 #define MAX(a, b) ((a) > (b) ? (a) : (b)) | |
| 398 #define MIN(a, b) ((a) < (b) ? (a) : (b)) | |
| 399 | |
| 446 | 400 /* Type of source-pattern and string chars. */ |
| 401 typedef const unsigned char re_char; | |
| 402 | |
| 460 | 403 typedef char re_bool; |
| 428 | 404 #define false 0 |
| 405 #define true 1 | |
| 406 | |
| 407 | |
| 1346 | 408 #ifdef emacs |
| 409 | |
#ifdef MULE

/* Range table object; it is created and registered with staticpro()
   in vars_of_regex().  */
Lisp_Object Vthe_lisp_rangetab;

/* Set up the Lisp-visible state owned by this file: build the range
   table (with Qstart_closed_end_closed) and hand its address to
   staticpro().  */
void vars_of_regex (void)
{
  Vthe_lisp_rangetab = Fmake_range_table (Qstart_closed_end_closed);
  staticpro (&Vthe_lisp_rangetab);
}

#else /* not MULE */

/* Without MULE there is no state to set up; the function is still
   defined, with an empty body.  */
void vars_of_regex (void)
{
}

#endif /* MULE */
| 429 | |
| 430 /* Convert an offset from the start of the logical text string formed by | |
| 431 concatenating the two strings together into a character position in the | |
| 432 Lisp buffer or string that the text represents. Knows that | |
| 433 when handling buffer text, the "string" we're passed in is always | |
| 434 BEGV - ZV. */ | |
| 435 | |
| 436 static Charxpos | |
| 437 offset_to_charxpos (Lisp_Object lispobj, int off) | |
| 438 { | |
| 439 if (STRINGP (lispobj)) | |
| 440 return string_index_byte_to_char (lispobj, off); | |
| 441 else if (BUFFERP (lispobj)) | |
| 442 return bytebpos_to_charbpos (XBUFFER (lispobj), | |
| 443 off + BYTE_BUF_BEGV (XBUFFER (lispobj))); | |
| 444 else | |
| 445 return 0; | |
| 446 } | |
| 447 | |
| 448 #ifdef REL_ALLOC | |
| 449 | |
| 450 /* STRING1 is the value of STRING1 given to re_match_2(). LISPOBJ is | |
| 451 the Lisp object (if any) from which the string is taken. If LISPOBJ | |
| 452 is a buffer, return a relocation offset to be added to all pointers to | |
| 453 string data so that they will be accurate again, after an allocation or | |
| 454 reallocation that potentially relocated the buffer data. | |
| 455 */ | |
| 456 static Bytecount | |
| 457 offset_post_relocation (Lisp_Object lispobj, Ibyte *orig_buftext) | |
| 458 { | |
| 459 if (!BUFFERP (lispobj)) | |
| 460 return 0; | |
| 461 return (BYTE_BUF_BYTE_ADDRESS (XBUFFER (lispobj), | |
| 462 BYTE_BUF_BEGV (XBUFFER (lispobj))) - | |
| 463 orig_buftext); | |
| 464 } | |
| 465 | |
| 466 #endif /* REL_ALLOC */ | |
| 467 | |
| 468 #ifdef ERROR_CHECK_MALLOC | |
| 469 | |
| 470 /* NOTE that this can run malloc() so you need to adjust afterwards. */ | |
| 471 | |
| 472 static int | |
| 473 bind_regex_malloc_disallowed (int value) | |
| 474 { | |
| 475 /* Tricky, because the act of binding can run malloc(). */ | |
| 476 int old_regex_malloc_disallowed = regex_malloc_disallowed; | |
| 477 int depth; | |
| 478 regex_malloc_disallowed = 0; | |
| 479 depth = record_unwind_protect_restoring_int (®ex_malloc_disallowed, | |
| 480 old_regex_malloc_disallowed); | |
| 481 regex_malloc_disallowed = value; | |
| 482 return depth; | |
| 483 } | |
| 484 | |
| 485 #endif /* ERROR_CHECK_MALLOC */ | |
| 486 | |
| 487 #endif /* emacs */ | |
| 488 | |
| 489 | |
/* Command codes appearing in a compiled regular expression.  Some
   opcodes are followed by argument bytes, and each opcode is free to
   interpret its arguments however it likes.  Zero bytes may appear in
   the compiled regular expression.  */

typedef enum
{
  no_op = 0,

  /* Succeed right away -- no more backtracking.  */
  succeed,

  /* Followed by one byte giving n, then by n literal bytes.  */
  exactn,

  /* Matches any (more or less) character.  */
  anychar,

  /* Matches any one char belonging to the specified set.  The first
     following byte is the number of bitmap bytes; then come the bytes
     of a bitmap saying which chars are in the set.  Bits within each
     byte are ordered low-bit-first, and a character is in the set if
     its bit is 1.  A character too large to have a bit in the map is
     automatically not in the set.  */
  charset,

  /* Same parameters as charset, but matches any character that is
     not one of those specified.  */
  charset_not,

  /* Start remembering the text that is matched, for storing in a
     register.  Followed by one byte with the register number, in the
     range 1 to the pattern buffer's re_ngroups field, and then by one
     byte with the number of groups inner to this one.  (This last has
     to be part of the start_memory only because we need it in the
     on_failure_jump of re_match_2.)  */
  start_memory,

  /* Stop remembering the text that is matched and store it in a
     memory register.  Followed by one byte with the register number,
     in the range 1 to `re_ngroups' in the pattern buffer, and one byte
     with the number of inner groups, just like `start_memory'.  (We
     need the number of inner groups here because we don't have any
     easy way of finding the corresponding start_memory when we're at
     a stop_memory.)  */
  stop_memory,

  /* Match a duplicate of something remembered.  Followed by one byte
     containing the register number.  */
  duplicate,

  /* Fail unless at beginning of line.  */
  begline,

  /* Fail unless at end of line.  */
  endline,

  /* Succeeds if at beginning of buffer (if emacs) or at beginning of
     string to be matched (if not).  */
  begbuf,

  /* Analogously, for end of buffer/string.  */
  endbuf,

  /* Followed by a two-byte relative address to which to jump.  */
  jump,

  /* Same as jump, but marks the end of an alternative.  */
  jump_past_alt,

  /* Followed by the two-byte relative address of the place to resume
     at in case of failure.  */
  on_failure_jump,

  /* Like on_failure_jump, but pushes a placeholder instead of the
     current string position when executed.  */
  on_failure_keep_string_jump,

  /* Throw away the latest failure point and then jump to the
     following two-byte relative address.  */
  pop_failure_jump,

  /* Change to pop_failure_jump if we know we won't have to backtrack
     to match; otherwise change to jump.  This is used to jump back to
     the beginning of a repeat.  If what follows this jump clearly
     won't match what the repeat does, such that we can be sure that
     there is no use backtracking out of repetitions already matched,
     then we change it to a pop_failure_jump.  Followed by a two-byte
     address.  */
  maybe_pop_jump,

  /* Jump to the following two-byte address, and push a dummy failure
     point.  This failure point will be thrown away if an attempt is
     made to use it for a failure.  A `+' construct makes this before
     the first repeat.  Also used as an intermediary kind of jump when
     compiling an alternative.  */
  dummy_failure_jump,

  /* Push a dummy failure point and continue.  Used at the end of
     alternatives.  */
  push_dummy_failure,

  /* Followed by a two-byte relative address and a two-byte number n.
     After matching N times, jump to the address upon failure.  */
  succeed_n,

  /* Followed by a two-byte relative address and a two-byte number n.
     Jump to the address N times, then fail.  */
  jump_n,

  /* Set the following two-byte relative address to the subsequent
     two-byte number.  The address *includes* the two bytes of
     number.  */
  set_number_at,

  /* Matches any word-constituent character.  */
  wordchar,

  /* Matches any char that is not a word-constituent.  */
  notwordchar,

  /* Succeeds if at word beginning.  */
  wordbeg,

  /* Succeeds if at word end.  */
  wordend,

  /* Succeeds if at a word boundary.  */
  wordbound,

  /* Succeeds if not at a word boundary.  */
  notwordbound

#ifdef emacs
  /* Succeeds if before point.  */
  ,before_dot,

  /* Succeeds if at point.  */
  at_dot,

  /* Succeeds if after point.  */
  after_dot,

  /* Matches any character whose syntax is specified.  Followed by a
     byte which contains a syntax code, e.g., Sword.  */
  syntaxspec,

  /* Matches any character whose syntax is not that specified.  */
  notsyntaxspec

#endif /* emacs */

#ifdef MULE
  /* Extra opcodes needed to work properly with XEmacs/Mule characters
     (which may take up more than one byte).  */

  /* Matches any character belonging to the specified set.  The set is
     stored in "unified range-table format"; see rangetab.c.  Unlike
     the `charset' opcode, this can handle arbitrary characters.  */
  ,charset_mule,

  /* Same parameters as charset_mule, but matches any character that
     is not one of those specified.  */
  charset_mule_not

  /* 97/2/17 jhod: The following two were merged back in from the Mule
     2.3 code to enable some language specific processing.  */

  /* Matches entries in the character category tables.  */
  ,categoryspec,

  /* The opposite of the above.  */
  notcategoryspec
#endif /* MULE */

} re_opcode_t;
| 648 | |
| 649 /* Common operations on the compiled pattern. */ | |
| 650 | |
/* Write NUMBER into the two bytes DESTINATION[0] and DESTINATION[1]:
   the low-order eight bits go first, the remaining high-order bits
   second.  NUMBER is evaluated twice.  */

#define STORE_NUMBER(destination, number)				\
  do									\
    {									\
      (destination)[0] = (number) & 0xFF;				\
      (destination)[1] = (number) >> 8;					\
    }									\
  while (0)

/* Like STORE_NUMBER, but afterwards advance DESTINATION past the two
   bytes just written.  DESTINATION must therefore be an lvalue.  */

#define STORE_NUMBER_AND_INCR(destination, number)			\
  do									\
    {									\
      STORE_NUMBER (destination, number);				\
      (destination) += 2;						\
    }									\
  while (0)
| 668 | |
/* Put into DESTINATION a number stored in two contiguous bytes starting
   at SOURCE.  The first byte supplies the low-order eight bits; the
   second byte is sign-extended and supplies the high-order part, so the
   result may be negative.

   The high byte is scaled by multiplying by 256 rather than with `<< 8',
   because left-shifting a negative value is undefined behavior in
   ISO C.  */

#define EXTRACT_NUMBER(destination, source)				\
  do {									\
    (destination) = *(source) & 0377;					\
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * 256;		\
  } while (0)
| 677 | |
#ifdef DEBUG
/* Function form of EXTRACT_NUMBER, compiled only when DEBUG is defined:
   store in *DEST the number held in the two bytes starting at SOURCE
   (low-order byte first, sign-extended high-order byte second).

   The high byte is scaled by multiplying by 256 rather than with `<< 8',
   because left-shifting a negative value is undefined behavior in
   ISO C.  */
static void
extract_number (int *dest, re_char *source)
{
  int temp = SIGN_EXTEND_CHAR (*(source + 1));
  *dest = *source & 0377;
  *dest += temp * 256;
}

/* Unless EXTRACT_MACROS is defined, route EXTRACT_NUMBER through the
   function above.  */
#ifndef EXTRACT_MACROS /* To debug the macros.  */
#undef EXTRACT_NUMBER
#define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
#endif /* not EXTRACT_MACROS */

#endif /* DEBUG */
| 693 | |
/* Like EXTRACT_NUMBER, but afterwards advance SOURCE past the two bytes
   that were read.  SOURCE must therefore be an lvalue.  */

#define EXTRACT_NUMBER_AND_INCR(destination, source)			\
  do									\
    {									\
      EXTRACT_NUMBER (destination, source);				\
      (source) += 2;							\
    }									\
  while (0)

#ifdef DEBUG
/* Function form of EXTRACT_NUMBER_AND_INCR, compiled only when DEBUG is
   defined: read the number at *SOURCE into *DESTINATION via
   extract_number(), then step *SOURCE forward by two bytes.  */
static void
extract_number_and_incr (int *destination, unsigned char **source)
{
  unsigned char *from = *source;

  extract_number (destination, from);
  *source = from + 2;
}

/* Unless EXTRACT_MACROS is defined, route EXTRACT_NUMBER_AND_INCR
   through the function above.  */
#ifndef EXTRACT_MACROS
#undef EXTRACT_NUMBER_AND_INCR
#define EXTRACT_NUMBER_AND_INCR(dest, src) \
  extract_number_and_incr (&dest, &src)
#endif /* not EXTRACT_MACROS */

#endif /* DEBUG */
| 718 | |
| 719 /* If DEBUG is defined, Regex prints many voluminous messages about what | |
| 720 it is doing (if the variable `debug' is nonzero). If linked with the | |
| 721 main program in `iregex.c', you can enter patterns and strings | |
| 722 interactively. And if linked with the main program in `main.c' and | |
| 723 the other test files, you can run the already-written tests. */ | |
| 724 | |
| 725 #if defined (DEBUG) | |
| 726 | |
| 727 /* We use standard I/O for debugging. */ | |
| 728 #include <stdio.h> | |
| 729 | |
| 730 #ifndef emacs | |
| 731 /* XEmacs provides its own version of assert() */ | |
| 732 /* It is useful to test things that ``must'' be true when debugging. */ | |
| 733 #include <assert.h> | |
| 734 #endif | |
| 735 | |
| 5041 | 736 extern int debug_regexps; |
| 428 | 737 |
/* DEBUG_STATEMENT (e) expands to E itself in DEBUG builds.  */
#define DEBUG_STATEMENT(e) e

/* Debug printing macros.

   Every macro below is wrapped in `do { ... } while (0)' so that it
   expands to exactly one statement.  A bare `if (cond) printf (...)'
   expansion would capture an `else' written after the macro call at the
   use site, silently changing the caller's control flow.

   DEBUG_PRINTn print whenever debug_regexps is nonzero.  */
#define DEBUG_PRINT1(x) \
  do { if (debug_regexps) printf (x); } while (0)
#define DEBUG_PRINT2(x1, x2) \
  do { if (debug_regexps) printf (x1, x2); } while (0)
#define DEBUG_PRINT3(x1, x2, x3) \
  do { if (debug_regexps) printf (x1, x2, x3); } while (0)
#define DEBUG_PRINT4(x1, x2, x3, x4) \
  do { if (debug_regexps) printf (x1, x2, x3, x4); } while (0)
/* The first argument (P) is accepted but not used.  */
#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
  do { if (debug_regexps) print_partial_compiled_pattern (s, e); } while (0)
#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
  do { if (debug_regexps) print_double_string (w, s1, sz1, s2, sz2); } while (0)

/* DEBUG_FAIL_PRINTn print only when the RE_DEBUG_FAILURE_POINT bit is set
   in debug_regexps.  */
#define DEBUG_FAIL_PRINT1(x) \
  do { if (debug_regexps & RE_DEBUG_FAILURE_POINT) printf (x); } while (0)
#define DEBUG_FAIL_PRINT2(x1, x2) \
  do { if (debug_regexps & RE_DEBUG_FAILURE_POINT) printf (x1, x2); } while (0)
#define DEBUG_FAIL_PRINT3(x1, x2, x3) \
  do { if (debug_regexps & RE_DEBUG_FAILURE_POINT) printf (x1, x2, x3); } while (0)
#define DEBUG_FAIL_PRINT4(x1, x2, x3, x4) \
  do { if (debug_regexps & RE_DEBUG_FAILURE_POINT) printf (x1, x2, x3, x4); } while (0)
#define DEBUG_FAIL_PRINT_COMPILED_PATTERN(p, s, e) \
  do { \
    if (debug_regexps & RE_DEBUG_FAILURE_POINT) \
      print_partial_compiled_pattern (s, e); \
  } while (0)
#define DEBUG_FAIL_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
  do { \
    if (debug_regexps & RE_DEBUG_FAILURE_POINT) \
      print_double_string (w, s1, sz1, s2, sz2); \
  } while (0)

/* DEBUG_MATCH_PRINTn print only when the RE_DEBUG_MATCHING bit is set in
   debug_regexps.  */
#define DEBUG_MATCH_PRINT1(x) \
  do { if (debug_regexps & RE_DEBUG_MATCHING) printf (x); } while (0)
#define DEBUG_MATCH_PRINT2(x1, x2) \
  do { if (debug_regexps & RE_DEBUG_MATCHING) printf (x1, x2); } while (0)
#define DEBUG_MATCH_PRINT3(x1, x2, x3) \
  do { if (debug_regexps & RE_DEBUG_MATCHING) printf (x1, x2, x3); } while (0)
#define DEBUG_MATCH_PRINT4(x1, x2, x3, x4) \
  do { if (debug_regexps & RE_DEBUG_MATCHING) printf (x1, x2, x3, x4); } while (0)
#define DEBUG_MATCH_PRINT_COMPILED_PATTERN(p, s, e) \
  do { \
    if (debug_regexps & RE_DEBUG_MATCHING) \
      print_partial_compiled_pattern (s, e); \
  } while (0)
#define DEBUG_MATCH_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
  do { \
    if (debug_regexps & RE_DEBUG_MATCHING) \
      print_double_string (w, s1, sz1, s2, sz2); \
  } while (0)
| 428 | 778 |
| 779 | |
| 780 /* Print the fastmap in human-readable form. */ | |
| 781 | |
| 782 static void | |
| 783 print_fastmap (char *fastmap) | |
| 784 { | |
| 647 | 785 int was_a_range = 0; |
| 786 int i = 0; | |
| 428 | 787 |
| 788 while (i < (1 << BYTEWIDTH)) | |
| 789 { | |
| 790 if (fastmap[i++]) | |
| 791 { | |
| 792 was_a_range = 0; | |
| 793 putchar (i - 1); | |
| 794 while (i < (1 << BYTEWIDTH) && fastmap[i]) | |
| 795 { | |
| 796 was_a_range = 1; | |
| 797 i++; | |
| 798 } | |
| 799 if (was_a_range) | |
| 800 { | |
| 801 putchar ('-'); | |
| 802 putchar (i - 1); | |
| 803 } | |
| 804 } | |
| 805 } | |
| 806 putchar ('\n'); | |
| 807 } | |
| 808 | |
| 809 | |
| 810 /* Print a compiled pattern string in human-readable form, starting at | |
| 811 the START pointer into it and ending just before the pointer END. */ | |
| 812 | |
| 813 static void | |
| 446 | 814 print_partial_compiled_pattern (re_char *start, re_char *end) |
| 428 | 815 { |
| 816 int mcnt, mcnt2; | |
| 446 | 817 unsigned char *p = (unsigned char *) start; |
| 818 re_char *pend = end; | |
| 428 | 819 |
| 820 if (start == NULL) | |
| 821 { | |
| 822 puts ("(null)"); | |
| 823 return; | |
| 824 } | |
| 825 | |
| 826 /* Loop over pattern commands. */ | |
| 827 while (p < pend) | |
| 828 { | |
| 829 printf ("%ld:\t", (long)(p - start)); | |
| 830 | |
| 831 switch ((re_opcode_t) *p++) | |
| 832 { | |
| 833 case no_op: | |
| 834 printf ("/no_op"); | |
| 835 break; | |
| 836 | |
| 837 case exactn: | |
| 838 mcnt = *p++; | |
| 839 printf ("/exactn/%d", mcnt); | |
|
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
840 while (mcnt--) |
| 428 | 841 { |
|
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
842 putchar ('/'); |
| 428 | 843 putchar (*p++); |
| 844 } | |
| 845 break; | |
| 846 | |
| 847 case start_memory: | |
| 848 mcnt = *p++; | |
| 849 printf ("/start_memory/%d/%d", mcnt, *p++); | |
| 850 break; | |
| 851 | |
| 852 case stop_memory: | |
| 853 mcnt = *p++; | |
| 854 printf ("/stop_memory/%d/%d", mcnt, *p++); | |
| 855 break; | |
| 856 | |
| 857 case duplicate: | |
| 858 printf ("/duplicate/%d", *p++); | |
| 859 break; | |
| 860 | |
| 861 case anychar: | |
| 862 printf ("/anychar"); | |
| 863 break; | |
| 864 | |
| 865 case charset: | |
| 866 case charset_not: | |
| 867 { | |
| 868 REGISTER int c, last = -100; | |
| 869 REGISTER int in_range = 0; | |
| 870 | |
| 871 printf ("/charset [%s", | |
| 872 (re_opcode_t) *(p - 1) == charset_not ? "^" : ""); | |
| 873 | |
| 874 assert (p + *p < pend); | |
| 875 | |
| 876 for (c = 0; c < 256; c++) | |
| 877 if (((unsigned char) (c / 8) < *p) | |
| 878 && (p[1 + (c/8)] & (1 << (c % 8)))) | |
| 879 { | |
| 880 /* Are we starting a range? */ | |
| 881 if (last + 1 == c && ! in_range) | |
| 882 { | |
| 883 putchar ('-'); | |
| 884 in_range = 1; | |
| 885 } | |
| 886 /* Have we broken a range? */ | |
| 887 else if (last + 1 != c && in_range) | |
| 888 { | |
| 889 putchar (last); | |
| 890 in_range = 0; | |
| 891 } | |
| 892 | |
| 893 if (! in_range) | |
| 894 putchar (c); | |
| 895 | |
| 896 last = c; | |
| 897 } | |
| 898 | |
| 899 if (in_range) | |
| 900 putchar (last); | |
| 901 | |
| 902 putchar (']'); | |
| 903 | |
| 904 p += 1 + *p; | |
| 905 } | |
| 906 break; | |
| 907 | |
| 908 #ifdef MULE | |
| 909 case charset_mule: | |
| 910 case charset_mule_not: | |
| 911 { | |
| 912 int nentries, i; | |
| 913 | |
| 914 printf ("/charset_mule [%s", | |
| 915 (re_opcode_t) *(p - 1) == charset_mule_not ? "^" : ""); | |
| 916 nentries = unified_range_table_nentries (p); | |
| 917 for (i = 0; i < nentries; i++) | |
| 918 { | |
| 919 EMACS_INT first, last; | |
| 920 Lisp_Object dummy_val; | |
| 921 | |
| 922 unified_range_table_get_range (p, i, &first, &last, | |
| 923 &dummy_val); | |
| 924 if (first < 0x100) | |
| 925 putchar (first); | |
| 926 else | |
| 927 printf ("(0x%lx)", (long)first); | |
| 928 if (first != last) | |
| 929 { | |
| 930 putchar ('-'); | |
| 931 if (last < 0x100) | |
| 932 putchar (last); | |
| 933 else | |
| 934 printf ("(0x%lx)", (long)last); | |
| 935 } | |
| 936 } | |
| 937 putchar (']'); | |
| 938 p += unified_range_table_bytes_used (p); | |
| 939 } | |
| 940 break; | |
| 941 #endif | |
| 942 | |
| 943 case begline: | |
| 944 printf ("/begline"); | |
| 945 break; | |
| 946 | |
| 947 case endline: | |
| 948 printf ("/endline"); | |
| 949 break; | |
| 950 | |
| 951 case on_failure_jump: | |
| 952 extract_number_and_incr (&mcnt, &p); | |
| 953 printf ("/on_failure_jump to %ld", (long)(p + mcnt - start)); | |
| 954 break; | |
| 955 | |
| 956 case on_failure_keep_string_jump: | |
| 957 extract_number_and_incr (&mcnt, &p); | |
| 958 printf ("/on_failure_keep_string_jump to %ld", (long)(p + mcnt - start)); | |
| 959 break; | |
| 960 | |
| 961 case dummy_failure_jump: | |
| 962 extract_number_and_incr (&mcnt, &p); | |
| 963 printf ("/dummy_failure_jump to %ld", (long)(p + mcnt - start)); | |
| 964 break; | |
| 965 | |
| 966 case push_dummy_failure: | |
| 967 printf ("/push_dummy_failure"); | |
| 968 break; | |
| 969 | |
| 970 case maybe_pop_jump: | |
| 971 extract_number_and_incr (&mcnt, &p); | |
| 972 printf ("/maybe_pop_jump to %ld", (long)(p + mcnt - start)); | |
| 973 break; | |
| 974 | |
| 975 case pop_failure_jump: | |
| 976 extract_number_and_incr (&mcnt, &p); | |
| 977 printf ("/pop_failure_jump to %ld", (long)(p + mcnt - start)); | |
| 978 break; | |
| 979 | |
| 980 case jump_past_alt: | |
| 981 extract_number_and_incr (&mcnt, &p); | |
| 982 printf ("/jump_past_alt to %ld", (long)(p + mcnt - start)); | |
| 983 break; | |
| 984 | |
| 985 case jump: | |
| 986 extract_number_and_incr (&mcnt, &p); | |
| 987 printf ("/jump to %ld", (long)(p + mcnt - start)); | |
| 988 break; | |
| 989 | |
| 990 case succeed_n: | |
| 991 extract_number_and_incr (&mcnt, &p); | |
| 992 extract_number_and_incr (&mcnt2, &p); | |
| 993 printf ("/succeed_n to %ld, %d times", (long)(p + mcnt - start), mcnt2); | |
| 994 break; | |
| 995 | |
| 996 case jump_n: | |
| 997 extract_number_and_incr (&mcnt, &p); | |
| 998 extract_number_and_incr (&mcnt2, &p); | |
| 999 printf ("/jump_n to %ld, %d times", (long)(p + mcnt - start), mcnt2); | |
| 1000 break; | |
| 1001 | |
| 1002 case set_number_at: | |
| 1003 extract_number_and_incr (&mcnt, &p); | |
| 1004 extract_number_and_incr (&mcnt2, &p); | |
| 1005 printf ("/set_number_at location %ld to %d", (long)(p + mcnt - start), mcnt2); | |
| 1006 break; | |
| 1007 | |
| 1008 case wordbound: | |
| 1009 printf ("/wordbound"); | |
| 1010 break; | |
| 1011 | |
| 1012 case notwordbound: | |
| 1013 printf ("/notwordbound"); | |
| 1014 break; | |
| 1015 | |
| 1016 case wordbeg: | |
| 1017 printf ("/wordbeg"); | |
| 1018 break; | |
| 1019 | |
| 1020 case wordend: | |
| 1021 printf ("/wordend"); | |
| 1022 | |
| 1023 #ifdef emacs | |
| 1024 case before_dot: | |
| 1025 printf ("/before_dot"); | |
| 1026 break; | |
| 1027 | |
| 1028 case at_dot: | |
| 1029 printf ("/at_dot"); | |
| 1030 break; | |
| 1031 | |
| 1032 case after_dot: | |
| 1033 printf ("/after_dot"); | |
| 1034 break; | |
| 1035 | |
| 1036 case syntaxspec: | |
| 1037 printf ("/syntaxspec"); | |
| 1038 mcnt = *p++; | |
| 1039 printf ("/%d", mcnt); | |
| 1040 break; | |
| 1041 | |
| 1042 case notsyntaxspec: | |
| 1043 printf ("/notsyntaxspec"); | |
| 1044 mcnt = *p++; | |
| 1045 printf ("/%d", mcnt); | |
| 1046 break; | |
| 1047 | |
| 1048 #ifdef MULE | |
| 1049 /* 97/2/17 jhod Mule category patch */ | |
| 1050 case categoryspec: | |
| 1051 printf ("/categoryspec"); | |
| 1052 mcnt = *p++; | |
| 1053 printf ("/%d", mcnt); | |
| 1054 break; | |
| 1055 | |
| 1056 case notcategoryspec: | |
| 1057 printf ("/notcategoryspec"); | |
| 1058 mcnt = *p++; | |
| 1059 printf ("/%d", mcnt); | |
| 1060 break; | |
| 1061 /* end of category patch */ | |
| 1062 #endif /* MULE */ | |
| 1063 #endif /* emacs */ | |
| 1064 | |
| 1065 case wordchar: | |
| 1066 printf ("/wordchar"); | |
| 1067 break; | |
| 1068 | |
| 1069 case notwordchar: | |
| 1070 printf ("/notwordchar"); | |
| 1071 break; | |
| 1072 | |
| 1073 case begbuf: | |
| 1074 printf ("/begbuf"); | |
| 1075 break; | |
| 1076 | |
| 1077 case endbuf: | |
| 1078 printf ("/endbuf"); | |
| 1079 break; | |
| 1080 | |
| 1081 default: | |
| 1082 printf ("?%d", *(p-1)); | |
| 1083 } | |
| 1084 | |
| 1085 putchar ('\n'); | |
| 1086 } | |
| 1087 | |
| 1088 printf ("%ld:\tend of pattern.\n", (long)(p - start)); | |
| 1089 } | |
| 1090 | |
| 1091 | |
| 1092 static void | |
| 1093 print_compiled_pattern (struct re_pattern_buffer *bufp) | |
| 1094 { | |
| 446 | 1095 re_char *buffer = bufp->buffer; |
| 428 | 1096 |
| 1097 print_partial_compiled_pattern (buffer, buffer + bufp->used); | |
| 1098 printf ("%ld bytes used/%ld bytes allocated.\n", bufp->used, | |
| 1099 bufp->allocated); | |
| 1100 | |
| 1101 if (bufp->fastmap_accurate && bufp->fastmap) | |
| 1102 { | |
| 1103 printf ("fastmap: "); | |
| 1104 print_fastmap (bufp->fastmap); | |
| 1105 } | |
| 1106 | |
| 1107 printf ("re_nsub: %ld\t", (long)bufp->re_nsub); | |
| 502 | 1108 printf ("re_ngroups: %ld\t", (long)bufp->re_ngroups); |
| 428 | 1109 printf ("regs_alloc: %d\t", bufp->regs_allocated); |
| 1110 printf ("can_be_null: %d\t", bufp->can_be_null); | |
| 1111 printf ("newline_anchor: %d\n", bufp->newline_anchor); | |
| 1112 printf ("no_sub: %d\t", bufp->no_sub); | |
| 1113 printf ("not_bol: %d\t", bufp->not_bol); | |
| 1114 printf ("not_eol: %d\t", bufp->not_eol); | |
| 1115 printf ("syntax: %d\n", bufp->syntax); | |
| 1116 /* Perhaps we should print the translate table? */ | |
| 1117 /* and maybe the category table? */ | |
| 502 | 1118 |
| 1119 if (bufp->external_to_internal_register) | |
| 1120 { | |
| 1121 int i; | |
| 1122 | |
| 1123 printf ("external_to_internal_register:\n"); | |
| 1124 for (i = 0; i <= bufp->re_nsub; i++) | |
| 1125 { | |
| 1126 if (i > 0) | |
| 1127 printf (", "); | |
| 1128 printf ("%d -> %d", i, bufp->external_to_internal_register[i]); | |
| 1129 } | |
| 1130 printf ("\n"); | |
| 1131 } | |
| 428 | 1132 } |
| 1133 | |
| 1134 | |
| 1135 static void | |
| 446 | 1136 print_double_string (re_char *where, re_char *string1, int size1, |
| 1137 re_char *string2, int size2) | |
| 428 | 1138 { |
| 1139 if (where == NULL) | |
| 1140 printf ("(null)"); | |
| 1141 else | |
| 1142 { | |
| 647 | 1143 int this_char; |
| 428 | 1144 |
| 1145 if (FIRST_STRING_P (where)) | |
| 1146 { | |
| 1147 for (this_char = where - string1; this_char < size1; this_char++) | |
| 1148 putchar (string1[this_char]); | |
| 1149 | |
| 1150 where = string2; | |
| 1151 } | |
| 1152 | |
| 1153 for (this_char = where - string2; this_char < size2; this_char++) | |
| 1154 putchar (string2[this_char]); | |
| 1155 } | |
| 1156 } | |
| 1157 | |
| 1158 #else /* not DEBUG */ | |
| 1159 | |
/* Outside XEmacs, replace assert with a version that ignores its
   argument and does nothing when DEBUG is off.  */
#ifndef emacs
#undef assert
#define assert(e) ((void) (1))
#endif

/* Non-DEBUG build: every debugging macro expands to nothing, so the
   statements and print calls passed to them are dropped entirely.  */
#define DEBUG_STATEMENT(e)

#define DEBUG_PRINT1(x)
#define DEBUG_PRINT2(x1, x2)
#define DEBUG_PRINT3(x1, x2, x3)
#define DEBUG_PRINT4(x1, x2, x3, x4)
#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)

#define DEBUG_FAIL_PRINT1(x)
#define DEBUG_FAIL_PRINT2(x1, x2)
#define DEBUG_FAIL_PRINT3(x1, x2, x3)
#define DEBUG_FAIL_PRINT4(x1, x2, x3, x4)
#define DEBUG_FAIL_PRINT_COMPILED_PATTERN(p, s, e)
#define DEBUG_FAIL_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)

#define DEBUG_MATCH_PRINT1(x)
#define DEBUG_MATCH_PRINT2(x1, x2)
#define DEBUG_MATCH_PRINT3(x1, x2, x3)
#define DEBUG_MATCH_PRINT4(x1, x2, x3, x4)
#define DEBUG_MATCH_PRINT_COMPILED_PATTERN(p, s, e)
#define DEBUG_MATCH_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
| 1187 | |
| 446 | 1188 #endif /* DEBUG */ |
| 428 | 1189 |
| 1190 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can | |
| 1191 also be assigned to arbitrarily: each pattern buffer stores its own | |
| 1192 syntax, so it can be changed between regex compilations. */ | |
| 1193 /* This has no initializer because initialized variables in Emacs | |
| 1194 become read-only after dumping. */ | |
| 1195 reg_syntax_t re_syntax_options; | |
| 1196 | |
| 1197 | |
| 1198 /* Specify the precise syntax of regexps for compilation. This provides | |
| 1199 for compatibility for various utilities which historically have | |
| 1200 different, incompatible syntaxes. | |
| 1201 | |
| 1202 The argument SYNTAX is a bit mask comprised of the various bits | |
| 1203 defined in regex.h. We return the old syntax. */ | |
| 1204 | |
| 1205 reg_syntax_t | |
| 1206 re_set_syntax (reg_syntax_t syntax) | |
| 1207 { | |
| 1208 reg_syntax_t ret = re_syntax_options; | |
| 1209 | |
| 1210 re_syntax_options = syntax; | |
| 1211 return ret; | |
| 1212 } | |
| 1213 | |
/* This table gives an error message for each of the error codes listed
   in regex.h.  Obviously the order here has to be same as there.
   POSIX doesn't require that we do anything for REG_NOERROR,
   but why not be nice?

   The trailing entries exist only when `emacs' (REG_ESYNTAX) or `MULE'
   (REG_ERANGESPAN, REG_ECATEGORY) is defined.  */

static const char *re_error_msgid[] =
{
  "Success",					/* REG_NOERROR */
  "No match",					/* REG_NOMATCH */
  "Invalid regular expression",			/* REG_BADPAT */
  "Invalid collation character",		/* REG_ECOLLATE */
  "Invalid character class name",		/* REG_ECTYPE */
  "Trailing backslash",				/* REG_EESCAPE */
  "Invalid back reference",			/* REG_ESUBREG */
  "Unmatched [ or [^",				/* REG_EBRACK */
  "Unmatched ( or \\(",				/* REG_EPAREN */
  "Unmatched \\{",				/* REG_EBRACE */
  "Invalid content of \\{\\}",			/* REG_BADBR */
  "Invalid range end",				/* REG_ERANGE */
  "Memory exhausted",				/* REG_ESPACE */
  "Invalid preceding regular expression",	/* REG_BADRPT */
  "Premature end of regular expression",	/* REG_EEND */
  "Regular expression too big",			/* REG_ESIZE */
  "Unmatched ) or \\)",				/* REG_ERPAREN */
#ifdef emacs
  "Invalid syntax designator",			/* REG_ESYNTAX */
#endif
#ifdef MULE
  "Ranges may not span charsets",		/* REG_ERANGESPAN */
  "Invalid category designator",		/* REG_ECATEGORY */
#endif
};
| 1246 | |
| 1247 /* Avoiding alloca during matching, to placate r_alloc. */ | |
| 1248 | |
| 1333 | 1249 /* About these various flags: |
| 1250 | |
| 1251 MATCH_MAY_ALLOCATE indicates that it's OK to do allocation in the | |
| 1252 searching and matching functions. In this case, we use local variables | |
| 1253 to hold the values allocated. If not, we use *global* variables, which | |
| 1254 are pre-allocated. NOTE: XEmacs ***MUST*** run with MATCH_MAY_ALLOCATE, | |
| 1255 because the regexp routines may get called reentrantly as a result of | |
| 1256 QUIT processing (e.g. under Windows: re_match -> QUIT -> quit_p -> drain | |
| 1257 events -> process WM_INITMENU -> call filter -> re_match; see stack | |
| 1258 trace in signal.c), so we cannot have any global variables (unless we do | |
| 1259 lots of trickiness including some unwind-protects, which isn't worth it | |
| 1260 at this point). | |
| 1261 | |
| 1262 REL_ALLOC means that the relocating allocator is in use, for buffers | |
| 1263 and such. REGEX_REL_ALLOC means that we use rel-alloc to manage the | |
| 1264 fail stack, which may grow quite large. REGEX_MALLOC means we use | |
| 1265 malloc() in place of alloca() to allocate the fail stack -- only | |
| 1266 applicable if REGEX_REL_ALLOC is not defined. | |
| 1267 */ | |
| 1268 | |
| 428 | 1269 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the |
| 1270 searching and matching functions should not call alloca. On some | |
| 1271 systems, alloca is implemented in terms of malloc, and if we're | |
| 1272 using the relocating allocator routines, then malloc could cause a | |
| 1273 relocation, which might (if the strings being searched are in the | |
| 1274 ralloc heap) shift the data out from underneath the regexp | |
| 771 | 1275 routines. [To clarify: The purpose of rel-alloc is to allow data to |
| 1276 be moved in memory from one place to another so that all data | |
| 1277 blocks can be consolidated together and excess memory released back | |
| 1278 to the operating system. This requires that all the blocks that | |
| 1279 are managed by rel-alloc go at the very end of the program's heap, | |
| 1280 after all regularly malloc()ed data. malloc(), however, is used to | |
| 1281 owning the end of the heap, so that when more memory is needed, it | |
| 1282 just expands the heap using sbrk(). This is reconciled by using a | |
| 1283 malloc() (such as malloc.c, gmalloc.c, or recent versions of | |
| 1284 malloc() in libc) where the sbrk() call can be replaced with a | |
| 1285 user-specified call -- in this case, to rel-alloc's r_alloc_sbrk() | |
| 1286 routine. This routine calls the real sbrk(), but then shifts all | |
| 1287 the rel-alloc-managed blocks forward to the end of the heap again, | |
| 1288 so that malloc() gets the memory it needs in the location it needs | |
| 1289 it at. The regex routines may well have pointers to buffer data as | |
| 1290 their arguments, and buffers are managed by rel-alloc if rel-alloc | |
| 1291 has been enabled, so calling malloc() may potentially screw things | |
| 1292 up badly if it runs out of space and asks for more from the OS.] | |
| 1293 | |
| 1294 [[Here's another reason to avoid allocation: Emacs processes input | |
| 1295 from X in a signal handler; processing X input may call malloc; if | |
| 1296 input arrives while a matching routine is calling malloc, then | |
| 1297 we're scrod. But Emacs can't just block input while calling | |
| 1298 matching routines; then we don't notice interrupts when they come | |
| 1299 in. So, Emacs blocks input around all regexp calls except the | |
| 1300 matching calls, which it leaves unprotected, in the faith that they | |
| 1333 | 1301 will not malloc.]] This previous paragraph is irrelevant under XEmacs, |
| 1302 as we *do not* do anything so stupid as process input from within a | |
| 1303 signal handler. | |
| 1304 | |
| 1305 However, the regexp routines may get called reentrantly as a result of | |
| 1306 QUIT processing (e.g. under Windows: re_match -> QUIT -> quit_p -> drain | |
| 1307 events -> process WM_INITMENU -> call filter -> re_match; see stack | |
| 1308 trace in signal.c), so we cannot have any global variables (unless we do | |
| 1309 lots of trickiness including some unwind-protects, which isn't worth it | |
| 1310 at this point). Hence we MUST have MATCH_MAY_ALLOCATE defined. | |
| 1311 | |
| 1312 Also, the first paragraph does not make complete sense to me -- what | |
| 1313 about the use of rel-alloc to handle the fail stacks? Shouldn't these | |
| 1314 reallocations potentially cause buffer data to be relocated as well? I | |
| 826 | 1315 must be missing something, though -- perhaps the writer above is |
| 1316 assuming that the failure stack(s) will always be allocated after the | |
| 1317 buffer data, and thus reallocating them with rel-alloc won't move buffer | |
| 1333 | 1318 data. (In fact, a cursory glance at the code in ralloc.c seems to |
| 1319 confirm this.) --ben */ | |
| 428 | 1320 |
| 1321 /* Normally, this is fine. */ | |
| 1322 #define MATCH_MAY_ALLOCATE | |
| 1323 | |
| 1324 /* When using GNU C, we are not REALLY using the C alloca, no matter | |
| 1325 what config.h may say. So don't take precautions for it. */ | |
| 1326 #ifdef __GNUC__ | |
| 1327 #undef C_ALLOCA | |
| 1328 #endif | |
| 1329 | |
| 1330 /* The match routines may not allocate if (1) they would do it with malloc | |
| 1331 and (2) it's not safe for them to use malloc. | |
| 1332 Note that if REL_ALLOC is defined, matching would not use malloc for the | |
| 1333 failure stack, but we would still use it for the register vectors; | |
| 1334 so REL_ALLOC should not affect this. */ | |
| 771 | 1335 |
| 1333 | 1336 /* XEmacs can handle REL_ALLOC and malloc() OK */ |
| 1337 #if !defined (emacs) && (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (REL_ALLOC) | |
| 428 | 1338 #undef MATCH_MAY_ALLOCATE |
| 1339 #endif | |
| 1340 | |
/* Under XEmacs the regex routines can be re-entered, so the build must
   not fall back to the preallocated-globals mode.  Stop compilation if
   MATCH_MAY_ALLOCATE ended up undefined.  */
#if !defined (MATCH_MAY_ALLOCATE) && defined (emacs)
#error regex must handle reentrancy; MATCH_MAY_ALLOCATE must be defined
#endif
| 1344 | |
| 428 | 1345 |
| 1346 /* Failure stack declarations and macros; both re_compile_fastmap and | |
| 1347 re_match_2 use a failure stack. These have to be macros because of | |
| 1348 REGEX_ALLOCATE_STACK. */ | |
| 1349 | |
| 1350 | |
| 1351 /* Number of failure points for which to initially allocate space | |
| 1352 when matching. If this number is exceeded, we allocate more | |
| 1353 space, so it is not a hard limit. */ | |
| 1354 #ifndef INIT_FAILURE_ALLOC | |
| 3300 | 1355 #define INIT_FAILURE_ALLOC 20 |
| 428 | 1356 #endif |
| 1357 | |
| 1358 /* Roughly the maximum number of failure points on the stack. Would be | |
| 1359 exactly that if always used MAX_FAILURE_SPACE each time we failed. | |
| 1360 This is a variable only so users of regex can assign to it; we never | |
| 1361 change it ourselves. */ | |
| 1362 #if defined (MATCH_MAY_ALLOCATE) | |
| 1363 /* 4400 was enough to cause a crash on Alpha OSF/1, | |
| 1364 whose default stack limit is 2mb. */ | |
| 3300 | 1365 int re_max_failures = 40000; |
| 428 | 1366 #else |
| 3300 | 1367 int re_max_failures = 4000; |
| 428 | 1368 #endif |
| 1369 | |
/* One slot of the failure stack.  A slot holds either a pointer or an
   integer; which member is live depends on what was pushed.  */
union fail_stack_elt
{
  re_char *pointer;
  int integer;
};

typedef union fail_stack_elt fail_stack_elt_t;

/* The failure stack itself: an array of slots together with its capacity
   and its current fill level.  */
typedef struct
{
  fail_stack_elt_t *stack;
  Elemcount size;	/* Number of slots in STACK.  */
  Elemcount avail;	/* Offset of next open position.  */
} fail_stack_type;

/* Stack-state predicates.  FAIL_STACK_EMPTY and FAIL_STACK_FULL use a
   variable named `fail_stack' in the caller's scope; FAIL_STACK_PTR_EMPTY
   uses a pointer named `fail_stack_ptr'.  */
#define FAIL_STACK_EMPTY()     (fail_stack.avail == 0)
#define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
#define FAIL_STACK_FULL()      (fail_stack.avail == fail_stack.size)
| 1388 | |
| 1389 | |
/* Define macros to initialize and free the failure stack.
   Do `return -2' if the alloc fails.

   Both macros operate on a variable named `fail_stack' in the caller's
   scope.  Note that INIT_FAIL_STACK contains a `return' statement, so it
   can only be used inside a function whose return type accepts -2.  */

#ifdef MATCH_MAY_ALLOCATE
/* Allocate room for INIT_FAILURE_ALLOC slots and mark the stack empty.
   On allocation failure, run UNBIND_REGEX_MALLOC_CHECK and return -2
   from the enclosing function.  */
#define INIT_FAIL_STACK() \
  do { \
    fail_stack.stack = (fail_stack_elt_t *) \
      REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * \
                            sizeof (fail_stack_elt_t)); \
 \
    if (fail_stack.stack == NULL) \
      { \
        UNBIND_REGEX_MALLOC_CHECK (); \
        return -2; \
      } \
 \
    fail_stack.size = INIT_FAILURE_ALLOC; \
    fail_stack.avail = 0; \
  } while (0)

#define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
#else
/* Without MATCH_MAY_ALLOCATE nothing is allocated here: initialization
   only resets the fill level, and there is nothing to free.  */
#define INIT_FAIL_STACK() \
  do { \
    fail_stack.avail = 0; \
  } while (0)

#define RESET_FAIL_STACK()
#endif
| 1419 | |
| 1420 | |
/* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.

   Return 1 if succeeds, and 0 if either ran out of memory
   allocating space for it or it was already too large.

   REGEX_REALLOCATE_STACK requires `destination' be declared.

   This is an expression macro.  "Too large" means the current size
   already exceeds re_max_failures * MAX_FAILURE_ITEMS; in that case the
   stack is left untouched.  Otherwise the result of
   REGEX_REALLOCATE_STACK is stored straight into (fail_stack).stack, so
   after an allocation failure that member is NULL, and the size field is
   only doubled once the new pointer is known to be non-NULL.  */

#define DOUBLE_FAIL_STACK(fail_stack) \
  ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
   ? 0 \
   : ((fail_stack).stack = (fail_stack_elt_t *) \
        REGEX_REALLOCATE_STACK ((fail_stack).stack, \
          (fail_stack).size * sizeof (fail_stack_elt_t), \
          ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
 \
      (fail_stack).stack == NULL \
      ? 0 \
      : ((fail_stack).size <<= 1, \
         1)))
| 1440 | |
/* RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS () adjusts the matcher's
   pointers into the text being matched.  It asks offset_post_relocation
   (lispobj, orig_buftext) for an offset and, when that offset is
   nonzero, adds it to every non-NULL pointer listed below.  All the
   names it touches (string1, d, regstart, bufp, num_regs, ...) must be
   in scope at the point of use.
   NOTE(review): presumably the offset is how far the text moved since
   orig_buftext was recorded -- confirm against offset_post_relocation.

   Outside XEmacs, or without REL_ALLOC, the macro expands to nothing.  */
#if !defined (emacs) || !defined (REL_ALLOC)
#define RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS()
#else
/* Don't change NULL pointers.  Wrapped in do/while (0) so the expansion
   is a single statement and cannot capture a following `else'.  Requires
   `rmdp_offset' to be in scope.  */
#define ADD_IF_NZ(val) \
  do \
    { \
      if (val) \
        (val) += rmdp_offset; \
    } \
  while (0)
#define RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS() \
do \
{ \
  Bytecount rmdp_offset = offset_post_relocation (lispobj, orig_buftext); \
 \
  if (rmdp_offset) \
    { \
      int i; \
 \
      ADD_IF_NZ (string1); \
      ADD_IF_NZ (string2); \
      ADD_IF_NZ (d); \
      ADD_IF_NZ (dend); \
      ADD_IF_NZ (end1); \
      ADD_IF_NZ (end2); \
      ADD_IF_NZ (end_match_1); \
      ADD_IF_NZ (end_match_2); \
 \
      /* The register arrays are only walked when the pattern has groups.  */ \
      if (bufp->re_ngroups) \
        { \
          for (i = 0; i < num_regs; i++) \
            { \
              ADD_IF_NZ (regstart[i]); \
              ADD_IF_NZ (regend[i]); \
              ADD_IF_NZ (old_regstart[i]); \
              ADD_IF_NZ (old_regend[i]); \
              ADD_IF_NZ (best_regstart[i]); \
              ADD_IF_NZ (best_regend[i]); \
              ADD_IF_NZ (reg_dummy[i]); \
            } \
        } \
 \
      ADD_IF_NZ (match_end); \
    } \
} while (0)
#endif /* !defined (emacs) || !defined (REL_ALLOC) */

/* Same idea for the search routine, which has fewer pointers to fix up:
   str1, str2, string1, string2 and d.  Uses ADD_IF_NZ from above.  */
#if !defined (emacs) || !defined (REL_ALLOC)
#define RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS()
#else
#define RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS() \
do \
{ \
  Bytecount rmdp_offset = offset_post_relocation (lispobj, orig_buftext); \
 \
  if (rmdp_offset) \
    { \
      ADD_IF_NZ (str1); \
      ADD_IF_NZ (str2); \
      ADD_IF_NZ (string1); \
      ADD_IF_NZ (string2); \
      ADD_IF_NZ (d); \
    } \
} while (0)

#endif /* !defined (emacs) || !defined (REL_ALLOC) */
| 428 | 1502 |
/* Push pointer POINTER on FAIL_STACK.
   Return 1 if was able to do so and 0 if ran out of memory allocating
   space to do so.

   The fullness test goes through FAIL_STACK_FULL (), which looks at the
   variable named `fail_stack' rather than at the FAIL_STACK argument.  */
#define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
  ((FAIL_STACK_FULL () \
    && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
   ? 0 \
   : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
      1))

/* Push a pointer value onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.
   No capacity check is made here; the caller must have ensured there is
   room.  */
#define PUSH_FAILURE_POINTER(item) \
  fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)

/* This pushes an integer-valued item onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.  */
#define PUSH_FAILURE_INT(item) \
  fail_stack.stack[fail_stack.avail++].integer = (item)

/* Push a fail_stack_elt_t value onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.  */
#define PUSH_FAILURE_ELT(item) \
  fail_stack.stack[fail_stack.avail++] = (item)

/* These three POP... operations complement the three PUSH... operations.
   All assume that `fail_stack' is nonempty.  Each one decrements
   fail_stack.avail and yields the slot that was on top.  */
#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]

/* Used to omit pushing failure point id's when we're not debugging.
   In DEBUG builds these push/pop an integer slot; otherwise they expand
   to nothing.  */
#ifdef DEBUG
#define DEBUG_PUSH PUSH_FAILURE_INT
#define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
#else
#define DEBUG_PUSH(item)
#define DEBUG_POP(item_addr)
#endif
| 1545 | |
| 1546 | |
| 1547 /* Push the information about the state we will need | |
| 1548 if we ever fail back to it. | |
| 1549 | |
| 1550 Requires variables fail_stack, regstart, regend, reg_info, and | |
| 1551 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be | |
| 1552 declared. | |
| 1553 | |
| 1554 Does `return FAILURE_CODE' if runs out of memory. */ | |
| 1555 | |
| 771 | 1556 #if !defined (REGEX_MALLOC) && !defined (REGEX_REL_ALLOC) |
| 456 | 1557 #define DECLARE_DESTINATION char *destination |
| 428 | 1558 #else |
| 456 | 1559 #define DECLARE_DESTINATION DECLARE_NOTHING |
| 428 | 1560 #endif |
| 1561 | |
/* Save a backtracking point on the failure stack.

   PATTERN_PLACE and STRING_PLACE are the pattern and string positions
   recorded for a later POP_FAILURE_POINT.  FAILURE_CODE is the value
   the *enclosing function* returns if the stack cannot be grown.

   Push order: for each register from lowest_active_reg up to
   highest_active_reg its regstart, regend and reg_info word; then
   lowest_active_reg, highest_active_reg, PATTERN_PLACE, STRING_PLACE,
   and last (through DEBUG_PUSH) the failure id.  */
#define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code)	\
do {									\
  DECLARE_DESTINATION;							\
  /* Must be int, so when we don't save any registers, the arithmetic	\
     of 0 + -1 isn't done as unsigned.  */				\
  int this_reg;								\
									\
  DEBUG_STATEMENT (failure_id++);					\
  DEBUG_STATEMENT (nfailure_points_pushed++);				\
  DEBUG_FAIL_PRINT2 ("\nPUSH_FAILURE_POINT #%d:\n", failure_id);	\
  DEBUG_FAIL_PRINT2 (" Before push, next avail: %ld\n",			\
		     (long) (fail_stack).avail);			\
  DEBUG_FAIL_PRINT2 (" size: %ld\n",					\
		     (long) (fail_stack).size);				\
									\
  DEBUG_FAIL_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS);		\
  DEBUG_FAIL_PRINT2 (" available: %ld\n",				\
		     (long) REMAINING_AVAIL_SLOTS);			\
									\
  /* Ensure we have enough space allocated for what we will push.  */	\
  while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS)			\
    {									\
      BEGIN_REGEX_MALLOC_OK ();						\
      if (!DOUBLE_FAIL_STACK (fail_stack))				\
	{								\
	  /* Could not grow: bail out of the enclosing function.  */	\
	  END_REGEX_MALLOC_OK ();					\
	  UNBIND_REGEX_MALLOC_CHECK ();					\
	  return failure_code;						\
	}								\
      END_REGEX_MALLOC_OK ();						\
      DEBUG_FAIL_PRINT2 ("\n Doubled stack; size now: %ld\n",		\
			 (long) (fail_stack).size);			\
      DEBUG_FAIL_PRINT2 (" slots available: %ld\n",			\
			 (long) REMAINING_AVAIL_SLOTS);			\
									\
      RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS ();			\
    }									\
									\
  /* Push the info, starting with the registers.  */			\
  DEBUG_FAIL_PRINT1 ("\n");						\
									\
  for (this_reg = lowest_active_reg; this_reg <= highest_active_reg;	\
       this_reg++)							\
    {									\
      DEBUG_FAIL_PRINT2 (" Pushing reg: %d\n", this_reg);		\
      DEBUG_STATEMENT (num_regs_pushed++);				\
									\
      DEBUG_FAIL_PRINT2 (" start: 0x%lx\n", (long) regstart[this_reg]);	\
      PUSH_FAILURE_POINTER (regstart[this_reg]);			\
									\
      DEBUG_FAIL_PRINT2 (" end: 0x%lx\n", (long) regend[this_reg]);	\
      PUSH_FAILURE_POINTER (regend[this_reg]);				\
									\
      DEBUG_FAIL_PRINT2 (" info: 0x%lx\n ",				\
			 * (long *) (&reg_info[this_reg]));		\
      DEBUG_FAIL_PRINT2 (" match_null=%d",				\
			 REG_MATCH_NULL_STRING_P (reg_info[this_reg]));	\
      DEBUG_FAIL_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg]));	\
      DEBUG_FAIL_PRINT2 (" matched_something=%d",			\
			 MATCHED_SOMETHING (reg_info[this_reg]));	\
      DEBUG_FAIL_PRINT2 (" ever_matched_something=%d",			\
			 EVER_MATCHED_SOMETHING (reg_info[this_reg]));	\
      DEBUG_FAIL_PRINT1 ("\n");						\
      PUSH_FAILURE_ELT (reg_info[this_reg].word);			\
    }									\
									\
  DEBUG_FAIL_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg); \
  PUSH_FAILURE_INT (lowest_active_reg);					\
									\
  DEBUG_FAIL_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg); \
  PUSH_FAILURE_INT (highest_active_reg);				\
									\
  DEBUG_FAIL_PRINT2 (" Pushing pattern 0x%lx: \n", (long) pattern_place); \
  DEBUG_FAIL_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend);	\
  PUSH_FAILURE_POINTER (pattern_place);					\
									\
  DEBUG_FAIL_PRINT2 (" Pushing string 0x%lx: `", (long) string_place);	\
  DEBUG_FAIL_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
				  size2);				\
  DEBUG_FAIL_PRINT1 ("'\n");						\
  PUSH_FAILURE_POINTER (string_place);					\
									\
  DEBUG_FAIL_PRINT2 (" Pushing failure id: %u\n", failure_id);		\
  DEBUG_PUSH (failure_id);						\
} while (0)
| 428 | 1647 |
| 1648 /* This is the number of items that are pushed and popped on the stack | |
| 1649 for each register. */ | |
| 1650 #define NUM_REG_ITEMS 3 | |
| 1651 | |
| 1652 /* Individual items aside from the registers. */ | |
| 1653 #ifdef DEBUG | |
| 1654 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */ | |
| 1655 #else | |
| 1656 #define NUM_NONREG_ITEMS 4 | |
| 1657 #endif | |
| 1658 | |
| 1659 /* We push at most this many items on the stack. */ | |
| 1660 /* We used to use (num_regs - 1), which is the number of registers | |
| 1661 this regexp will save; but that was changed to 5 | |
| 1662 to avoid stack overflow for a regexp with lots of parens. */ | |
| 1663 #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS) | |
| 1664 | |
| 1665 /* We actually push this many items. */ | |
| 1666 #define NUM_FAILURE_ITEMS \ | |
| 1667 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \ | |
| 1668 + NUM_NONREG_ITEMS) | |
| 1669 | |
| 1670 /* How many items can still be added to the stack without overflowing it. */ | |
| 1671 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail) | |
| 1672 | |
| 1673 | |
| 1674 /* Pops what PUSH_FAILURE_POINT pushes. | |
| 1675 | |
| 1676 We restore into the parameters, all of which should be lvalues: | |
| 1677 STR -- the saved data position. | |
| 1678 PAT -- the saved pattern position. | |
| 1679 LOW_REG, HIGH_REG -- the lowest and highest active registers. | |
| 1680 REGSTART, REGEND -- arrays of string positions. | |
| 1681 REG_INFO -- array of information about each subexpression. | |
| 1682 | |
| 1683 Also assumes the variables `fail_stack' and (if debugging), `bufp', | |
| 1684 `pend', `string1', `size1', `string2', and `size2'. */ | |
| 1685 | |
/* Restore the most recently pushed failure point.

   Items come off the stack in the reverse of the order in which they
   were pushed: failure id (debug builds only, through DEBUG_POP),
   string position, pattern position, HIGH_REG, LOW_REG, and then for
   each register from HIGH_REG down to LOW_REG its reg_info word,
   regend and regstart.  A saved string position of NULL leaves STR
   untouched.  Also clears `set_regs_matched_done'.  */
#define POP_FAILURE_POINT(str, pat, low_reg, high_reg,			\
			  regstart, regend, reg_info)			\
do {									\
  DEBUG_STATEMENT (fail_stack_elt_t ffailure_id;)			\
  int this_reg;								\
  const unsigned char *string_temp;					\
									\
  assert (!FAIL_STACK_EMPTY ());					\
									\
  /* Remove failure points and point to how many regs pushed.  */	\
  DEBUG_FAIL_PRINT1 ("POP_FAILURE_POINT:\n");				\
  DEBUG_FAIL_PRINT2 (" Before pop, next avail: %ld\n",			\
		     (long) fail_stack.avail);				\
  DEBUG_FAIL_PRINT2 (" size: %ld\n",					\
		     (long) fail_stack.size);				\
									\
  assert (fail_stack.avail >= NUM_NONREG_ITEMS);			\
									\
  DEBUG_POP (&ffailure_id.integer);					\
  DEBUG_FAIL_PRINT2 (" Popping failure id: %d\n",			\
		     * (int *) &ffailure_id);				\
									\
  /* If the saved string location is NULL, it came from an		\
     on_failure_keep_string_jump opcode, and we want to throw away the	\
     saved NULL, thus retaining our current position in the string.  */	\
  string_temp = POP_FAILURE_POINTER ();					\
  if (string_temp != NULL)						\
    str = string_temp;							\
									\
  DEBUG_FAIL_PRINT2 (" Popping string 0x%lx: `", (long) str);		\
  DEBUG_FAIL_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2);	\
  DEBUG_FAIL_PRINT1 ("'\n");						\
									\
  pat = (unsigned char *) POP_FAILURE_POINTER ();			\
  DEBUG_FAIL_PRINT2 (" Popping pattern 0x%lx: ", (long) pat);		\
  DEBUG_FAIL_PRINT_COMPILED_PATTERN (bufp, pat, pend);			\
									\
  /* Restore register info.  */						\
  high_reg = POP_FAILURE_INT ();					\
  DEBUG_FAIL_PRINT2 (" Popping high active reg: %d\n", high_reg);	\
									\
  low_reg = POP_FAILURE_INT ();						\
  DEBUG_FAIL_PRINT2 (" Popping low active reg: %d\n", low_reg);		\
									\
  for (this_reg = high_reg; this_reg >= low_reg; this_reg--)		\
    {									\
      DEBUG_FAIL_PRINT2 (" Popping reg: %d\n", this_reg);		\
									\
      reg_info[this_reg].word = POP_FAILURE_ELT ();			\
      DEBUG_FAIL_PRINT2 (" info: 0x%lx\n",				\
			 * (long *) &reg_info[this_reg]);		\
									\
      regend[this_reg] = POP_FAILURE_POINTER ();			\
      DEBUG_FAIL_PRINT2 (" end: 0x%lx\n", (long) regend[this_reg]);	\
									\
      regstart[this_reg] = POP_FAILURE_POINTER ();			\
      DEBUG_FAIL_PRINT2 (" start: 0x%lx\n", (long) regstart[this_reg]);	\
    }									\
									\
  set_regs_matched_done = 0;						\
  DEBUG_STATEMENT (nfailure_points_popped++);				\
} while (0) /* POP_FAILURE_POINT */
| 428 | 1748 |
| 1749 | |
| 1750 | |
| 1751 /* Structure for per-register (a.k.a. per-group) information. | |
| 1752 Other register information, such as the | |
| 1753 starting and ending positions (which are addresses), and the list of | |
| 1754 inner groups (which is a bits list) are maintained in separate | |
| 1755 variables. | |
| 1756 | |
| 1757 We are making a (strictly speaking) nonportable assumption here: that | |
| 1758 the compiler will pack our bit fields into something that fits into | |
| 1759 the type of `word', i.e., is something that fits into one item on the | |
| 1760 failure stack. */ | |
| 1761 | |
| 1762 typedef union | |
| 1763 { | |
| 1764 fail_stack_elt_t word; | |
| 1765 struct | |
| 1766 { | |
| 1767 /* This field is one if this group can match the empty string, | |
| 1768 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */ | |
| 1769 #define MATCH_NULL_UNSET_VALUE 3 | |
| 647 | 1770 unsigned int match_null_string_p : 2; |
| 1771 unsigned int is_active : 1; | |
| 1772 unsigned int matched_something : 1; | |
| 1773 unsigned int ever_matched_something : 1; | |
| 428 | 1774 } bits; |
| 1775 } register_info_type; | |
| 1776 | |
| 1777 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p) | |
| 1778 #define IS_ACTIVE(R) ((R).bits.is_active) | |
| 1779 #define MATCHED_SOMETHING(R) ((R).bits.matched_something) | |
| 1780 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something) | |
| 1781 | |
| 1782 | |
| 1783 /* Call this when have matched a real character; it sets `matched' flags | |
| 1784 for the subexpressions which we are currently inside. Also records | |
| 1785 that those subexprs have matched. */ | |
| 1786 #define SET_REGS_MATCHED() \ | |
| 1787 do \ | |
| 1788 { \ | |
| 1789 if (!set_regs_matched_done) \ | |
| 1790 { \ | |
| 647 | 1791 int r; \ |
| 428 | 1792 set_regs_matched_done = 1; \ |
| 1793 for (r = lowest_active_reg; r <= highest_active_reg; r++) \ | |
| 1794 { \ | |
| 1795 MATCHED_SOMETHING (reg_info[r]) \ | |
| 1796 = EVER_MATCHED_SOMETHING (reg_info[r]) \ | |
| 1797 = 1; \ | |
| 1798 } \ | |
| 1799 } \ | |
| 1800 } \ | |
| 1801 while (0) | |
| 1802 | |
/* Registers are set to a sentinel when they haven't yet matched.  The
   sentinel is the address of a file-private dummy byte; REG_UNSET (E)
   is true exactly when E equals that address.  */
static unsigned char reg_unset_dummy;
#define REG_UNSET_VALUE (&reg_unset_dummy)
#define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
| 1807 | |
| 1808 /* Subroutine declarations and macros for regex_compile. */ | |
| 1809 | |
| 1810 /* Fetch the next character in the uncompiled pattern---translating it | |
| 826 | 1811 if necessary. */ |
| 428 | 1812 #define PATFETCH(c) \ |
| 446 | 1813 do { \ |
| 1814 PATFETCH_RAW (c); \ | |
| 826 | 1815 c = RE_TRANSLATE (c); \ |
| 428 | 1816 } while (0) |
| 1817 | |
| 1818 /* Fetch the next character in the uncompiled pattern, with no | |
| 1819 translation. */ | |
| 1820 #define PATFETCH_RAW(c) \ | |
| 1821 do {if (p == pend) return REG_EEND; \ | |
| 1822 assert (p < pend); \ | |
| 867 | 1823 c = itext_ichar (p); \ |
| 1824 INC_IBYTEPTR (p); \ | |
| 428 | 1825 } while (0) |
| 1826 | |
| 1827 /* Go backwards one character in the pattern. */ | |
| 867 | 1828 #define PATUNFETCH DEC_IBYTEPTR (p) |
| 428 | 1829 |
| 1830 /* If `translate' is non-null, return translate[D], else just D. We | |
| 1831 cast the subscript to translate because some data is declared as | |
| 1832 `char *', to avoid warnings when a string constant is passed. But | |
| 1833 when we use a character as a subscript we must make it unsigned. */ | |
| 826 | 1834 #define RE_TRANSLATE(d) \ |
| 1835 (TRANSLATE_P (translate) ? RE_TRANSLATE_1 (d) : (d)) | |
| 428 | 1836 |
| 1837 /* Macros for outputting the compiled pattern into `buffer'. */ | |
| 1838 | |
| 1839 /* If the buffer isn't allocated when it comes in, use this. */ | |
| 1840 #define INIT_BUF_SIZE 32 | |
| 1841 | |
| 1842 /* Make sure we have at least N more bytes of space in buffer. */ | |
| 1843 #define GET_BUFFER_SPACE(n) \ | |
| 647 | 1844 while (buf_end - bufp->buffer + (n) > (ptrdiff_t) bufp->allocated) \ |
| 428 | 1845 EXTEND_BUFFER () |
| 1846 | |
| 1847 /* Make sure we have one more byte of buffer space and then add C to it. */ | |
| 1848 #define BUF_PUSH(c) \ | |
| 1849 do { \ | |
| 1850 GET_BUFFER_SPACE (1); \ | |
| 446 | 1851 *buf_end++ = (unsigned char) (c); \ |
| 428 | 1852 } while (0) |
| 1853 | |
| 1854 | |
| 1855 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */ | |
| 1856 #define BUF_PUSH_2(c1, c2) \ | |
| 1857 do { \ | |
| 1858 GET_BUFFER_SPACE (2); \ | |
| 446 | 1859 *buf_end++ = (unsigned char) (c1); \ |
| 1860 *buf_end++ = (unsigned char) (c2); \ | |
| 428 | 1861 } while (0) |
| 1862 | |
| 1863 | |
| 1864 /* As with BUF_PUSH_2, except for three bytes. */ | |
| 1865 #define BUF_PUSH_3(c1, c2, c3) \ | |
| 1866 do { \ | |
| 1867 GET_BUFFER_SPACE (3); \ | |
| 446 | 1868 *buf_end++ = (unsigned char) (c1); \ |
| 1869 *buf_end++ = (unsigned char) (c2); \ | |
| 1870 *buf_end++ = (unsigned char) (c3); \ | |
| 428 | 1871 } while (0) |
| 1872 | |
| 1873 | |
| 1874 /* Store a jump with opcode OP at LOC to location TO. We store a | |
| 1875 relative address offset by the three bytes the jump itself occupies. */ | |
| 1876 #define STORE_JUMP(op, loc, to) \ | |
| 1877 store_op1 (op, loc, (to) - (loc) - 3) | |
| 1878 | |
| 1879 /* Likewise, for a two-argument jump. */ | |
| 1880 #define STORE_JUMP2(op, loc, to, arg) \ | |
| 1881 store_op2 (op, loc, (to) - (loc) - 3, arg) | |
| 1882 | |
| 446 | 1883 /* Like `STORE_JUMP', but for inserting. Assume `buf_end' is the |
| 1884 buffer end. */ | |
| 428 | 1885 #define INSERT_JUMP(op, loc, to) \ |
| 446 | 1886 insert_op1 (op, loc, (to) - (loc) - 3, buf_end) |
| 1887 | |
| 1888 /* Like `STORE_JUMP2', but for inserting. Assume `buf_end' is the | |
| 1889 buffer end. */ | |
| 428 | 1890 #define INSERT_JUMP2(op, loc, to, arg) \ |
| 446 | 1891 insert_op2 (op, loc, (to) - (loc) - 3, arg, buf_end) |
| 428 | 1892 |
| 1893 | |
| 1894 /* This is not an arbitrary limit: the arguments which represent offsets | |
| 1895 into the pattern are two bytes long. So if 2^16 bytes turns out to | |
| 1896 be too small, many things would have to change. */ | |
| 1897 #define MAX_BUF_SIZE (1L << 16) | |
| 1898 | |
| 1899 | |
| 1900 /* Extend the buffer by twice its current size via realloc and | |
| 1901 reset the pointers that pointed into the old block to point to the | |
| 1902 correct places in the new one. If extending the buffer results in it | |
| 1903 being larger than MAX_BUF_SIZE, then flag memory exhausted. */ | |
| 1333 | 1904 #define EXTEND_BUFFER() \ |
| 1905 do { \ | |
| 1906 re_char *old_buffer = bufp->buffer; \ | |
| 1907 if (bufp->allocated == MAX_BUF_SIZE) \ | |
| 1908 return REG_ESIZE; \ | |
| 1909 bufp->allocated <<= 1; \ | |
| 1910 if (bufp->allocated > MAX_BUF_SIZE) \ | |
| 1911 bufp->allocated = MAX_BUF_SIZE; \ | |
| 1912 bufp->buffer = \ | |
| 1913 (unsigned char *) xrealloc (bufp->buffer, bufp->allocated); \ | |
| 1914 if (bufp->buffer == NULL) \ | |
| 1915 return REG_ESPACE; \ | |
| 1916 /* If the buffer moved, move all the pointers into it. */ \ | |
| 1917 if (old_buffer != bufp->buffer) \ | |
| 1918 { \ | |
| 1919 buf_end = (buf_end - old_buffer) + bufp->buffer; \ | |
| 1920 begalt = (begalt - old_buffer) + bufp->buffer; \ | |
| 1921 if (fixup_alt_jump) \ | |
| 1922 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer; \ | |
| 1923 if (laststart) \ | |
| 1924 laststart = (laststart - old_buffer) + bufp->buffer; \ | |
| 1925 if (pending_exact) \ | |
| 1926 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \ | |
| 1927 } \ | |
| 428 | 1928 } while (0) |
| 1929 | |
| 1930 | |
| 1931 /* Since we have one byte reserved for the register number argument to | |
| 1932 {start,stop}_memory, the maximum number of groups we can report | |
| 1933 things about is what fits in that byte. */ | |
| 1934 #define MAX_REGNUM 255 | |
| 1935 | |
| 1936 /* But patterns can have more than `MAX_REGNUM' registers. We just | |
| 502 | 1937 ignore the excess. |
| 1938 #### not true! groups past this will fail in lots of ways, if we | |
| 1939 ever have to backtrack. | |
| 1940 */ | |
| 647 | 1941 typedef int regnum_t; |
| 428 | 1942 |
| 502 | 1943 #define INIT_REG_TRANSLATE_SIZE 5 |
| 428 | 1944 |
| 1945 /* Macros for the compile stack. */ | |
| 1946 | |
| 1947 /* Since offsets can go either forwards or backwards, this type needs to | |
| 1948 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */ | |
| 1949 typedef int pattern_offset_t; | |
| 1950 | |
| 1951 typedef struct | |
| 1952 { | |
| 1953 pattern_offset_t begalt_offset; | |
| 1954 pattern_offset_t fixup_alt_jump; | |
| 1955 pattern_offset_t inner_group_offset; | |
| 1956 pattern_offset_t laststart_offset; | |
| 1957 regnum_t regnum; | |
| 1958 } compile_stack_elt_t; | |
| 1959 | |
| 1960 | |
| 1961 typedef struct | |
| 1962 { | |
| 1963 compile_stack_elt_t *stack; | |
| 647 | 1964 int size; |
| 1965 int avail; /* Offset of next open position. */ | |
| 428 | 1966 } compile_stack_type; |
| 1967 | |
| 1968 | |
| 1969 #define INIT_COMPILE_STACK_SIZE 32 | |
| 1970 | |
| 1971 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0) | |
| 1972 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) | |
| 1973 | |
| 1974 /* The next available element. */ | |
| 1975 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) | |
| 1976 | |
| 1977 | |
/* Set the bit for character C in the bit vector that starts at
   `buf_end'.  C is narrowed to unsigned char before being split into
   a byte index (C / BYTEWIDTH) and a bit index (C % BYTEWIDTH).  Both
   uses of C are parenthesized so that an expression argument such as
   `ch - 1' is cast as a whole; C is still evaluated twice, so it must
   not have side effects.  */
#define SET_LIST_BIT(c)					\
  (buf_end[((unsigned char) (c)) / BYTEWIDTH]		\
   |= 1 << (((unsigned char) (c)) % BYTEWIDTH))
| 1982 | |
| 1983 #ifdef MULE | |
| 1984 | |
| 1985 /* Set the "bit" for character C in a range table. */ | |
| 1986 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt) | |
| 1987 | |
| 1988 /* Set the "bit" for character c in the appropriate table. */ | |
| 1989 #define SET_EITHER_BIT(c) \ | |
| 1990 do { \ | |
| 1991 if (has_extended_chars) \ | |
| 1992 SET_RANGETAB_BIT (c); \ | |
| 1993 else \ | |
| 1994 SET_LIST_BIT (c); \ | |
| 1995 } while (0) | |
| 1996 | |
| 1997 #else /* not MULE */ | |
| 1998 | |
| 1999 #define SET_EITHER_BIT(c) SET_LIST_BIT (c) | |
| 2000 | |
| 2001 #endif | |
| 2002 | |
| 2003 | |
| 2004 /* Get the next unsigned number in the uncompiled pattern. */ | |
| 2005 #define GET_UNSIGNED_NUMBER(num) \ | |
| 2006 { if (p != pend) \ | |
| 2007 { \ | |
| 2008 PATFETCH (c); \ | |
| 2009 while (ISDIGIT (c)) \ | |
| 2010 { \ | |
| 2011 if (num < 0) \ | |
| 2012 num = 0; \ | |
| 2013 num = num * 10 + c - '0'; \ | |
| 2014 if (p == pend) \ | |
| 2015 break; \ | |
| 2016 PATFETCH (c); \ | |
| 2017 } \ | |
| 2018 } \ | |
| 2019 } | |
| 2020 | |
| 2021 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ | |
| 2022 | |
| 2023 #define IS_CHAR_CLASS(string) \ | |
| 2024 (STREQ (string, "alpha") || STREQ (string, "upper") \ | |
| 2025 || STREQ (string, "lower") || STREQ (string, "digit") \ | |
| 2026 || STREQ (string, "alnum") || STREQ (string, "xdigit") \ | |
| 2027 || STREQ (string, "space") || STREQ (string, "print") \ | |
| 2028 || STREQ (string, "punct") || STREQ (string, "graph") \ | |
| 2029 || STREQ (string, "cntrl") || STREQ (string, "blank")) | |
| 2030 | |
| 2031 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg); | |
| 2032 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2); | |
| 2033 static void insert_op1 (re_opcode_t op, unsigned char *loc, int arg, | |
| 2034 unsigned char *end); | |
| 2035 static void insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, | |
| 2036 unsigned char *end); | |
| 460 | 2037 static re_bool at_begline_loc_p (re_char *pattern, re_char *p, |
| 428 | 2038 reg_syntax_t syntax); |
| 460 | 2039 static re_bool at_endline_loc_p (re_char *p, re_char *pend, int syntax); |
| 2040 static re_bool group_in_compile_stack (compile_stack_type compile_stack, | |
| 428 | 2041 regnum_t regnum); |
| 446 | 2042 static reg_errcode_t compile_range (re_char **p_ptr, re_char *pend, |
| 2043 RE_TRANSLATE_TYPE translate, | |
| 2044 reg_syntax_t syntax, | |
| 428 | 2045 unsigned char *b); |
| 2046 #ifdef MULE | |
| 446 | 2047 static reg_errcode_t compile_extended_range (re_char **p_ptr, |
| 2048 re_char *pend, | |
| 2049 RE_TRANSLATE_TYPE translate, | |
| 428 | 2050 reg_syntax_t syntax, |
| 2051 Lisp_Object rtab); | |
| 2052 #endif /* MULE */ | |
| 460 | 2053 static re_bool group_match_null_string_p (unsigned char **p, |
| 428 | 2054 unsigned char *end, |
| 2055 register_info_type *reg_info); | |
| 460 | 2056 static re_bool alt_match_null_string_p (unsigned char *p, unsigned char *end, |
| 428 | 2057 register_info_type *reg_info); |
| 460 | 2058 static re_bool common_op_match_null_string_p (unsigned char **p, |
| 428 | 2059 unsigned char *end, |
| 2060 register_info_type *reg_info); | |
| 826 | 2061 static int bcmp_translate (re_char *s1, re_char *s2, |
| 2062 REGISTER int len, RE_TRANSLATE_TYPE translate | |
| 2063 #ifdef emacs | |
| 2064 , Internal_Format fmt, Lisp_Object lispobj | |
| 2065 #endif | |
| 2066 ); | |
| 428 | 2067 static int re_match_2_internal (struct re_pattern_buffer *bufp, |
| 446 | 2068 re_char *string1, int size1, |
| 2069 re_char *string2, int size2, int pos, | |
| 826 | 2070 struct re_registers *regs, int stop |
| 2071 RE_LISP_CONTEXT_ARGS_DECL); | |
| 428 | 2072 |
| 2073 #ifndef MATCH_MAY_ALLOCATE | |
| 2074 | |
| 2075 /* If we cannot allocate large objects within re_match_2_internal, | |
| 2076 we make the fail stack and register vectors global. | |
| 2077 The fail stack, we grow to the maximum size when a regexp | |
| 2078 is compiled. | |
| 2079 The register vectors, we adjust in size each time we | |
| 2080 compile a regexp, according to the number of registers it needs. */ | |
| 2081 | |
| 2082 static fail_stack_type fail_stack; | |
| 2083 | |
| 2084 /* Size with which the following vectors are currently allocated. | |
| 2085 That is so we can make them bigger as needed, | |
| 2086 but never make them smaller. */ | |
| 2087 static int regs_allocated_size; | |
| 2088 | |
| 446 | 2089 static re_char ** regstart, ** regend; |
| 2090 static re_char ** old_regstart, ** old_regend; | |
| 2091 static re_char **best_regstart, **best_regend; | |
| 428 | 2092 static register_info_type *reg_info; |
| 446 | 2093 static re_char **reg_dummy; |
| 428 | 2094 static register_info_type *reg_info_dummy; |
| 2095 | |
| 2096 /* Make the register vectors big enough for NUM_REGS registers, | |
| 2097 but don't make them smaller. */ | |
| 2098 | |
| 2099 static | |
| 2100 regex_grow_registers (int num_regs) | |
| 2101 { | |
| 2102 if (num_regs > regs_allocated_size) | |
| 2103 { | |
| 551 | 2104 RETALLOC (regstart, num_regs, re_char *); |
| 2105 RETALLOC (regend, num_regs, re_char *); | |
| 2106 RETALLOC (old_regstart, num_regs, re_char *); | |
| 2107 RETALLOC (old_regend, num_regs, re_char *); | |
| 2108 RETALLOC (best_regstart, num_regs, re_char *); | |
| 2109 RETALLOC (best_regend, num_regs, re_char *); | |
| 2110 RETALLOC (reg_info, num_regs, register_info_type); | |
| 2111 RETALLOC (reg_dummy, num_regs, re_char *); | |
| 2112 RETALLOC (reg_info_dummy, num_regs, register_info_type); | |
| 428 | 2113 |
| 2114 regs_allocated_size = num_regs; | |
| 2115 } | |
| 2116 } | |
| 2117 | |
| 2118 #endif /* not MATCH_MAY_ALLOCATE */ | |
| 2119 | |
| 2120 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. | |
| 2121 Returns one of error codes defined in `regex.h', or zero for success. | |
| 2122 | |
| 2123 Assumes the `allocated' (and perhaps `buffer') and `translate' | |
| 2124 fields are set in BUFP on entry. | |
| 2125 | |
| 2126 If it succeeds, results are put in BUFP (if it returns an error, the | |
| 2127 contents of BUFP are undefined): | |
| 2128 `buffer' is the compiled pattern; | |
| 2129 `syntax' is set to SYNTAX; | |
| 2130 `used' is set to the length of the compiled pattern; | |
| 2131 `fastmap_accurate' is zero; | |
| 502 | 2132 `re_ngroups' is the number of groups/subexpressions (including shy |
| 2133 groups) in PATTERN; | |
| 2134 `re_nsub' is the number of non-shy groups in PATTERN; | |
| 428 | 2135 `not_bol' and `not_eol' are zero; |
| 2136 | |
| 2137 The `fastmap' and `newline_anchor' fields are neither | |
| 2138 examined nor set. */ | |
| 2139 | |
/* Return, freeing storage we allocated.

   Frees `compile_stack.stack' with xfree and then returns VALUE from
   the *enclosing function*; a `compile_stack' variable must be in
   scope at the point of use.  */
#define FREE_STACK_RETURN(value)		\
do						\
{						\
  xfree (compile_stack.stack);			\
  return value;					\
} while (0)
| 428 | 2147 |
| 2148 static reg_errcode_t | |
| 446 | 2149 regex_compile (re_char *pattern, int size, reg_syntax_t syntax, |
| 428 | 2150 struct re_pattern_buffer *bufp) |
| 2151 { | |
| 2152 /* We fetch characters from PATTERN here. We declare these as int | |
| 2153 (or possibly long) so that chars above 127 can be used as | |
| 2154 array indices. The macros that fetch a character from the pattern | |
| 2155 make sure to coerce to unsigned char before assigning, so we won't | |
| 2156 get bitten by negative numbers here. */ | |
| 2157 /* XEmacs change: used to be unsigned char. */ | |
| 2158 REGISTER EMACS_INT c, c1; | |
| 2159 | |
| 2160 /* A random temporary spot in PATTERN. */ | |
| 446 | 2161 re_char *p1; |
| 428 | 2162 |
| 2163 /* Points to the end of the buffer, where we should append. */ | |
| 446 | 2164 REGISTER unsigned char *buf_end; |
| 428 | 2165 |
| 2166 /* Keeps track of unclosed groups. */ | |
| 2167 compile_stack_type compile_stack; | |
| 2168 | |
| 2169 /* Points to the current (ending) position in the pattern. */ | |
| 446 | 2170 re_char *p = pattern; |
| 2171 re_char *pend = pattern + size; | |
| 428 | 2172 |
| 2173 /* How to translate the characters in the pattern. */ | |
| 446 | 2174 RE_TRANSLATE_TYPE translate = bufp->translate; |
| 428 | 2175 |
| 2176 /* Address of the count-byte of the most recently inserted `exactn' | |
| 2177 command. This makes it possible to tell if a new exact-match | |
| 2178 character can be added to that command or if the character requires | |
| 2179 a new `exactn' command. */ | |
| 2180 unsigned char *pending_exact = 0; | |
| 2181 | |
| 2182 /* Address of start of the most recently finished expression. | |
| 2183 This tells, e.g., postfix * where to find the start of its | |
| 2184 operand. Reset at the beginning of groups and alternatives. */ | |
| 2185 unsigned char *laststart = 0; | |
| 2186 | |
| 2187 /* Address of beginning of regexp, or inside of last group. */ | |
| 2188 unsigned char *begalt; | |
| 2189 | |
| 2190 /* Place in the uncompiled pattern (i.e., the {) to | |
| 2191 which to go back if the interval is invalid. */ | |
| 446 | 2192 re_char *beg_interval; |
| 428 | 2193 |
| 2194 /* Address of the place where a forward jump should go to the end of | |
| 2195 the containing expression. Each alternative of an `or' -- except the | |
| 2196 last -- ends with a forward jump of this sort. */ | |
| 2197 unsigned char *fixup_alt_jump = 0; | |
| 2198 | |
| 2199 /* Counts open-groups as they are encountered. Remembered for the | |
| 2200 matching close-group on the compile stack, so the same register | |
| 2201 number is put in the stop_memory as the start_memory. */ | |
| 2202 regnum_t regnum = 0; | |
| 2203 | |
| 2204 #ifdef DEBUG | |
| 5041 | 2205 if (debug_regexps & RE_DEBUG_COMPILATION) |
| 428 | 2206 { |
| 647 | 2207 int debug_count; |
| 428 | 2208 |
| 5041 | 2209 DEBUG_PRINT1 ("\nCompiling pattern: "); |
| 428 | 2210 for (debug_count = 0; debug_count < size; debug_count++) |
| 2211 putchar (pattern[debug_count]); | |
| 2212 putchar ('\n'); | |
| 2213 } | |
| 2214 #endif /* DEBUG */ | |
| 2215 | |
| 2216 /* Initialize the compile stack. */ | |
| 2217 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); | |
| 2218 if (compile_stack.stack == NULL) | |
| 2219 return REG_ESPACE; | |
| 2220 | |
| 2221 compile_stack.size = INIT_COMPILE_STACK_SIZE; | |
| 2222 compile_stack.avail = 0; | |
| 2223 | |
| 2224 /* Initialize the pattern buffer. */ | |
| 2225 bufp->syntax = syntax; | |
| 2226 bufp->fastmap_accurate = 0; | |
| 2227 bufp->not_bol = bufp->not_eol = 0; | |
| 2228 | |
| 2229 /* Set `used' to zero, so that if we return an error, the pattern | |
| 2230 printer (for debugging) will think there's no pattern. We reset it | |
| 2231 at the end. */ | |
| 2232 bufp->used = 0; | |
| 2233 | |
| 2234 /* Always count groups, whether or not bufp->no_sub is set. */ | |
| 2235 bufp->re_nsub = 0; | |
| 502 | 2236 bufp->re_ngroups = 0; |
| 2237 | |
| 2238 bufp->warned_about_incompatible_back_references = 0; | |
| 2239 | |
| 2240 if (bufp->external_to_internal_register == 0) | |
| 2241 { | |
| 2242 bufp->external_to_internal_register_size = INIT_REG_TRANSLATE_SIZE; | |
| 2243 RETALLOC (bufp->external_to_internal_register, | |
| 2244 bufp->external_to_internal_register_size, | |
| 2245 int); | |
| 2246 } | |
| 2247 | |
| 2248 { | |
| 2249 int i; | |
| 2250 | |
| 2251 bufp->external_to_internal_register[0] = 0; | |
| 2252 for (i = 1; i < bufp->external_to_internal_register_size; i++) | |
| 2253 bufp->external_to_internal_register[i] = (int) 0xDEADBEEF; | |
| 2254 } | |
| 428 | 2255 |
| 2256 #if !defined (emacs) && !defined (SYNTAX_TABLE) | |
| 2257 /* Initialize the syntax table. */ | |
| 2258 init_syntax_once (); | |
| 2259 #endif | |
| 2260 | |
| 2261 if (bufp->allocated == 0) | |
| 2262 { | |
| 2263 if (bufp->buffer) | |
| 2264 { /* If zero allocated, but buffer is non-null, try to realloc | |
| 2265 enough space. This loses if buffer's address is bogus, but | |
| 2266 that is the user's responsibility. */ | |
| 2267 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char); | |
| 2268 } | |
| 2269 else | |
| 2270 { /* Caller did not allocate a buffer. Do it for them. */ | |
| 2271 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char); | |
| 2272 } | |
| 2273 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE); | |
| 2274 | |
| 2275 bufp->allocated = INIT_BUF_SIZE; | |
| 2276 } | |
| 2277 | |
| 446 | 2278 begalt = buf_end = bufp->buffer; |
| 428 | 2279 |
| 2280 /* Loop through the uncompiled pattern until we're at the end. */ | |
| 2281 while (p != pend) | |
| 2282 { | |
| 2283 PATFETCH (c); | |
| 2284 | |
| 2285 switch (c) | |
| 2286 { | |
| 2287 case '^': | |
| 2288 { | |
| 2289 if ( /* If at start of pattern, it's an operator. */ | |
| 2290 p == pattern + 1 | |
| 2291 /* If context independent, it's an operator. */ | |
| 2292 || syntax & RE_CONTEXT_INDEP_ANCHORS | |
| 2293 /* Otherwise, depends on what's come before. */ | |
| 2294 || at_begline_loc_p (pattern, p, syntax)) | |
| 2295 BUF_PUSH (begline); | |
| 2296 else | |
| 2297 goto normal_char; | |
| 2298 } | |
| 2299 break; | |
| 2300 | |
| 2301 | |
| 2302 case '$': | |
| 2303 { | |
| 2304 if ( /* If at end of pattern, it's an operator. */ | |
| 2305 p == pend | |
| 2306 /* If context independent, it's an operator. */ | |
| 2307 || syntax & RE_CONTEXT_INDEP_ANCHORS | |
| 2308 /* Otherwise, depends on what's next. */ | |
| 2309 || at_endline_loc_p (p, pend, syntax)) | |
| 2310 BUF_PUSH (endline); | |
| 2311 else | |
| 2312 goto normal_char; | |
| 2313 } | |
| 2314 break; | |
| 2315 | |
| 2316 | |
| 2317 case '+': | |
| 2318 case '?': | |
| 2319 if ((syntax & RE_BK_PLUS_QM) | |
| 2320 || (syntax & RE_LIMITED_OPS)) | |
| 2321 goto normal_char; | |
| 2322 handle_plus: | |
| 2323 case '*': | |
| 2324 /* If there is no previous pattern... */ | |
| 2325 if (!laststart) | |
| 2326 { | |
| 2327 if (syntax & RE_CONTEXT_INVALID_OPS) | |
| 2328 FREE_STACK_RETURN (REG_BADRPT); | |
| 2329 else if (!(syntax & RE_CONTEXT_INDEP_OPS)) | |
| 2330 goto normal_char; | |
| 2331 } | |
| 2332 | |
| 2333 { | |
| 2334 /* true means zero/many matches are allowed. */ | |
| 460 | 2335 re_bool zero_times_ok = c != '+'; |
| 2336 re_bool many_times_ok = c != '?'; | |
| 428 | 2337 |
| 2338 /* true means match shortest string possible. */ | |
| 460 | 2339 re_bool minimal = false; |
| 428 | 2340 |
| 2341 /* If there is a sequence of repetition chars, collapse it | |
| 2342 down to just one (the right one). We can't combine | |
| 2343 interval operators with these because of, e.g., `a{2}*', | |
| 2344 which should only match an even number of `a's. */ | |
| 2345 while (p != pend) | |
| 2346 { | |
| 2347 PATFETCH (c); | |
| 2348 | |
| 2349 if (c == '*' || (!(syntax & RE_BK_PLUS_QM) | |
| 2350 && (c == '+' || c == '?'))) | |
| 2351 ; | |
| 2352 | |
| 2353 else if (syntax & RE_BK_PLUS_QM && c == '\\') | |
| 2354 { | |
| 2355 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); | |
| 2356 | |
| 2357 PATFETCH (c1); | |
| 2358 if (!(c1 == '+' || c1 == '?')) | |
| 2359 { | |
| 2360 PATUNFETCH; | |
| 2361 PATUNFETCH; | |
| 2362 break; | |
| 2363 } | |
| 2364 | |
| 2365 c = c1; | |
| 2366 } | |
| 2367 else | |
| 2368 { | |
| 2369 PATUNFETCH; | |
| 2370 break; | |
| 2371 } | |
| 2372 | |
| 2373 /* If we get here, we found another repeat character. */ | |
| 2374 if (!(syntax & RE_NO_MINIMAL_MATCHING)) | |
| 2375 { | |
| 440 | 2376 /* "*?" and "+?" and "??" are okay (and mean match |
| 2377 minimally), but other sequences (such as "*??" and | |
| 2378 "+++") are rejected (reserved for future use). */ | |
| 428 | 2379 if (minimal || c != '?') |
| 2380 FREE_STACK_RETURN (REG_BADRPT); | |
| 2381 minimal = true; | |
| 2382 } | |
| 2383 else | |
| 2384 { | |
| 2385 zero_times_ok |= c != '+'; | |
| 2386 many_times_ok |= c != '?'; | |
| 2387 } | |
| 2388 } | |
| 2389 | |
| 2390 /* Star, etc. applied to an empty pattern is equivalent | |
| 2391 to an empty pattern. */ | |
| 2392 if (!laststart) | |
| 2393 break; | |
| 2394 | |
| 2395 /* Now we know whether zero matches is allowed | |
| 2396 and whether two or more matches is allowed | |
| 2397 and whether we want minimal or maximal matching. */ | |
| 2398 if (minimal) | |
| 2399 { | |
| 2400 if (!many_times_ok) | |
| 2401 { | |
| 2402 /* "a??" becomes: | |
| 2403 0: /on_failure_jump to 6 | |
| 2404 3: /jump to 9 | |
| 2405 6: /exactn/1/A | |
| 2406 9: end of pattern. | |
| 2407 */ | |
| 2408 GET_BUFFER_SPACE (6); | |
| 446 | 2409 INSERT_JUMP (jump, laststart, buf_end + 3); |
| 2410 buf_end += 3; | |
| 428 | 2411 INSERT_JUMP (on_failure_jump, laststart, laststart + 6); |
| 446 | 2412 buf_end += 3; |
| 428 | 2413 } |
| 2414 else if (zero_times_ok) | |
| 2415 { | |
| 2416 /* "a*?" becomes: | |
| 2417 0: /jump to 6 | |
| 2418 3: /exactn/1/A | |
| 2419 6: /on_failure_jump to 3 | |
| 2420 9: end of pattern. | |
| 2421 */ | |
| 2422 GET_BUFFER_SPACE (6); | |
| 446 | 2423 INSERT_JUMP (jump, laststart, buf_end + 3); |
| 2424 buf_end += 3; | |
| 2425 STORE_JUMP (on_failure_jump, buf_end, laststart + 3); | |
| 2426 buf_end += 3; | |
| 428 | 2427 } |
| 2428 else | |
| 2429 { | |
| 2430 /* "a+?" becomes: | |
| 2431 0: /exactn/1/A | |
| 2432 3: /on_failure_jump to 0 | |
| 2433 6: end of pattern. | |
| 2434 */ | |
| 2435 GET_BUFFER_SPACE (3); | |
| 446 | 2436 STORE_JUMP (on_failure_jump, buf_end, laststart); |
| 2437 buf_end += 3; | |
| 428 | 2438 } |
| 2439 } | |
| 2440 else | |
| 2441 { | |
| 2442 /* Are we optimizing this jump? */ | |
| 460 | 2443 re_bool keep_string_p = false; |
| 428 | 2444 |
| 2445 if (many_times_ok) | |
| 446 | 2446 { /* More than one repetition is allowed, so put in |
| 2447 at the end a backward relative jump from | |
| 2448 `buf_end' to before the next jump we're going | |
| 2449 to put in below (which jumps from laststart to | |
| 2450 after this jump). | |
| 428 | 2451 |
| 2452 But if we are at the `*' in the exact sequence `.*\n', | |
| 2453 insert an unconditional jump backwards to the ., | |
| 2454 instead of the beginning of the loop. This way we only | |
| 2455 push a failure point once, instead of every time | |
| 2456 through the loop. */ | |
| 2457 assert (p - 1 > pattern); | |
| 2458 | |
| 2459 /* Allocate the space for the jump. */ | |
| 2460 GET_BUFFER_SPACE (3); | |
| 2461 | |
| 2462 /* We know we are not at the first character of the | |
| 2463 pattern, because laststart was nonzero. And we've | |
| 2464 already incremented `p', by the way, to be the | |
| 2465 character after the `*'. Do we have to do something | |
| 2466 analogous here for null bytes, because of | |
| 2467 RE_DOT_NOT_NULL? */ | |
| 446 | 2468 if (*(p - 2) == '.' |
| 428 | 2469 && zero_times_ok |
| 446 | 2470 && p < pend && *p == '\n' |
| 428 | 2471 && !(syntax & RE_DOT_NEWLINE)) |
| 2472 { /* We have .*\n. */ | |
| 446 | 2473 STORE_JUMP (jump, buf_end, laststart); |
| 428 | 2474 keep_string_p = true; |
| 2475 } | |
| 2476 else | |
| 2477 /* Anything else. */ | |
| 446 | 2478 STORE_JUMP (maybe_pop_jump, buf_end, laststart - 3); |
| 428 | 2479 |
| 2480 /* We've added more stuff to the buffer. */ | |
| 446 | 2481 buf_end += 3; |
| 428 | 2482 } |
| 2483 | |
| 446 | 2484 /* On failure, jump from laststart to buf_end + 3, |
| 2485 which will be the end of the buffer after this jump | |
| 2486 is inserted. */ | |
| 428 | 2487 GET_BUFFER_SPACE (3); |
| 2488 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump | |
| 2489 : on_failure_jump, | |
| 446 | 2490 laststart, buf_end + 3); |
| 2491 buf_end += 3; | |
| 428 | 2492 |
| 2493 if (!zero_times_ok) | |
| 2494 { | |
| 2495 /* At least one repetition is required, so insert a | |
| 2496 `dummy_failure_jump' before the initial | |
| 2497 `on_failure_jump' instruction of the loop. This | |
| 2498 effects a skip over that instruction the first time | |
| 2499 we hit that loop. */ | |
| 2500 GET_BUFFER_SPACE (3); | |
| 2501 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6); | |
| 446 | 2502 buf_end += 3; |
| 428 | 2503 } |
| 2504 } | |
| 2505 pending_exact = 0; | |
| 2506 } | |
| 2507 break; | |
| 2508 | |
| 2509 | |
| 2510 case '.': | |
| 446 | 2511 laststart = buf_end; |
| 428 | 2512 BUF_PUSH (anychar); |
| 2513 break; | |
| 2514 | |
| 2515 | |
| 2516 case '[': | |
| 2517 { | |
| 2518 /* XEmacs change: this whole section */ | |
| 460 | 2519 re_bool had_char_class = false; |
| 428 | 2520 #ifdef MULE |
| 460 | 2521 re_bool has_extended_chars = false; |
| 428 | 2522 REGISTER Lisp_Object rtab = Qnil; |
| 2523 #endif | |
| 2524 | |
| 2525 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | |
| 2526 | |
| 2527 /* Ensure that we have enough space to push a charset: the | |
| 2528 opcode, the length count, and the bitset; 34 bytes in all. */ | |
| 2529 GET_BUFFER_SPACE (34); | |
| 2530 | |
| 446 | 2531 laststart = buf_end; |
| 428 | 2532 |
| 2533 /* We test `*p == '^' twice, instead of using an if | |
| 2534 statement, so we only need one BUF_PUSH. */ | |
| 2535 BUF_PUSH (*p == '^' ? charset_not : charset); | |
| 2536 if (*p == '^') | |
| 2537 p++; | |
| 2538 | |
| 2539 /* Remember the first position in the bracket expression. */ | |
| 2540 p1 = p; | |
| 2541 | |
| 2542 /* Push the number of bytes in the bitmap. */ | |
| 2543 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH); | |
| 2544 | |
| 2545 /* Clear the whole map. */ | |
| 446 | 2546 memset (buf_end, 0, (1 << BYTEWIDTH) / BYTEWIDTH); |
| 428 | 2547 |
| 2548 /* charset_not matches newline according to a syntax bit. */ | |
| 446 | 2549 if ((re_opcode_t) buf_end[-2] == charset_not |
| 428 | 2550 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) |
| 2551 SET_LIST_BIT ('\n'); | |
| 2552 | |
| 2553 #ifdef MULE | |
| 2554 start_over_with_extended: | |
| 2555 if (has_extended_chars) | |
| 2556 { | |
| 2557 /* There are extended chars here, which means we need to start | |
| 2558 over and shift to unified range-table format. */ | |
| 446 | 2559 if (buf_end[-2] == charset) |
| 2560 buf_end[-2] = charset_mule; | |
| 428 | 2561 else |
| 446 | 2562 buf_end[-2] = charset_mule_not; |
| 2563 buf_end--; | |
| 428 | 2564 p = p1; /* go back to the beginning of the charset, after |
| 2565 a possible ^. */ | |
| 2566 rtab = Vthe_lisp_rangetab; | |
| 2567 Fclear_range_table (rtab); | |
| 2568 | |
| 2569 /* charset_not matches newline according to a syntax bit. */ | |
| 446 | 2570 if ((re_opcode_t) buf_end[-1] == charset_mule_not |
| 428 | 2571 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) |
| 2572 SET_EITHER_BIT ('\n'); | |
| 2573 } | |
| 2574 #endif /* MULE */ | |
| 2575 | |
| 2576 /* Read in characters and ranges, setting map bits. */ | |
| 2577 for (;;) | |
| 2578 { | |
| 2579 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | |
| 2580 | |
| 446 | 2581 PATFETCH (c); |
| 428 | 2582 |
| 2583 #ifdef MULE | |
| 2584 if (c >= 0x80 && !has_extended_chars) | |
| 2585 { | |
| 2586 has_extended_chars = 1; | |
| 2587 /* Frumble-bumble, we've found some extended chars. | |
| 2588 Need to start over, process everything using | |
| 2589 the general extended-char mechanism, and need | |
| 2590 to use charset_mule and charset_mule_not instead | |
| 2591 of charset and charset_not. */ | |
| 2592 goto start_over_with_extended; | |
| 2593 } | |
| 2594 #endif /* MULE */ | |
| 2595 /* \ might escape characters inside [...] and [^...]. */ | |
| 2596 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') | |
| 2597 { | |
| 2598 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); | |
| 2599 | |
| 446 | 2600 PATFETCH (c1); |
| 428 | 2601 #ifdef MULE |
| 2602 if (c1 >= 0x80 && !has_extended_chars) | |
| 2603 { | |
| 2604 has_extended_chars = 1; | |
| 2605 goto start_over_with_extended; | |
| 2606 } | |
| 2607 #endif /* MULE */ | |
| 2608 SET_EITHER_BIT (c1); | |
| 2609 continue; | |
| 2610 } | |
| 2611 | |
| 2612 /* Could be the end of the bracket expression. If it's | |
| 2613 not (i.e., when the bracket expression is `[]' so | |
| 2614 far), the ']' character bit gets set way below. */ | |
| 2615 if (c == ']' && p != p1 + 1) | |
| 2616 break; | |
| 2617 | |
| 2618 /* Look ahead to see if it's a range when the last thing | |
| 2619 was a character class. */ | |
| 2620 if (had_char_class && c == '-' && *p != ']') | |
| 2621 FREE_STACK_RETURN (REG_ERANGE); | |
| 2622 | |
| 2623 /* Look ahead to see if it's a range when the last thing | |
| 2624 was a character: if this is a hyphen not at the | |
| 2625 beginning or the end of a list, then it's the range | |
| 2626 operator. */ | |
| 2627 if (c == '-' | |
| 2628 && !(p - 2 >= pattern && p[-2] == '[') | |
| 446 | 2629 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') |
| 428 | 2630 && *p != ']') |
| 2631 { | |
| 2632 reg_errcode_t ret; | |
| 2633 | |
| 2634 #ifdef MULE | |
| 2635 if (* (unsigned char *) p >= 0x80 && !has_extended_chars) | |
| 2636 { | |
| 2637 has_extended_chars = 1; | |
| 2638 goto start_over_with_extended; | |
| 2639 } | |
| 2640 if (has_extended_chars) | |
| 2641 ret = compile_extended_range (&p, pend, translate, | |
| 2642 syntax, rtab); | |
| 2643 else | |
| 2644 #endif /* MULE */ | |
| 446 | 2645 ret = compile_range (&p, pend, translate, syntax, buf_end); |
| 428 | 2646 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); |
| 2647 } | |
| 2648 | |
| 2649 else if (p[0] == '-' && p[1] != ']') | |
| 2650 { /* This handles ranges made up of characters only. */ | |
| 2651 reg_errcode_t ret; | |
| 2652 | |
| 2653 /* Move past the `-'. */ | |
| 2654 PATFETCH (c1); | |
| 2655 | |
| 2656 #ifdef MULE | |
| 2657 if (* (unsigned char *) p >= 0x80 && !has_extended_chars) | |
| 2658 { | |
| 2659 has_extended_chars = 1; | |
| 2660 goto start_over_with_extended; | |
| 2661 } | |
| 2662 if (has_extended_chars) | |
| 2663 ret = compile_extended_range (&p, pend, translate, | |
| 2664 syntax, rtab); | |
| 2665 else | |
| 2666 #endif /* MULE */ | |
| 446 | 2667 ret = compile_range (&p, pend, translate, syntax, buf_end); |
| 428 | 2668 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); |
| 2669 } | |
| 2670 | |
| 2671 /* See if we're at the beginning of a possible character | |
| 2672 class. */ | |
| 2673 | |
| 2674 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') | |
| 2675 { /* Leave room for the null. */ | |
| 2676 char str[CHAR_CLASS_MAX_LENGTH + 1]; | |
| 2677 | |
| 2678 PATFETCH (c); | |
| 2679 c1 = 0; | |
| 2680 | |
| 2681 /* If pattern is `[[:'. */ | |
| 2682 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | |
| 2683 | |
| 2684 for (;;) | |
| 2685 { | |
| 446 | 2686 /* #### This code is unused. |
| 2687 Correctness is not checked after TRT | |
| 2688 table change. */ | |
| 428 | 2689 PATFETCH (c); |
| 2690 if (c == ':' || c == ']' || p == pend | |
| 2691 || c1 == CHAR_CLASS_MAX_LENGTH) | |
| 2692 break; | |
| 442 | 2693 str[c1++] = (char) c; |
| 428 | 2694 } |
| 2695 str[c1] = '\0'; | |
| 2696 | |
| 446 | 2697 /* If it isn't a word bracketed by `[:' and `:]': |
| 428 | 2698 undo the ending character, the letters, and leave |
| 2699 the leading `:' and `[' (but set bits for them). */ | |
| 2700 if (c == ':' && *p == ']') | |
| 2701 { | |
| 2702 int ch; | |
| 460 | 2703 re_bool is_alnum = STREQ (str, "alnum"); |
| 2704 re_bool is_alpha = STREQ (str, "alpha"); | |
| 2705 re_bool is_blank = STREQ (str, "blank"); | |
| 2706 re_bool is_cntrl = STREQ (str, "cntrl"); | |
| 2707 re_bool is_digit = STREQ (str, "digit"); | |
| 2708 re_bool is_graph = STREQ (str, "graph"); | |
| 2709 re_bool is_lower = STREQ (str, "lower"); | |
| 2710 re_bool is_print = STREQ (str, "print"); | |
| 2711 re_bool is_punct = STREQ (str, "punct"); | |
| 2712 re_bool is_space = STREQ (str, "space"); | |
| 2713 re_bool is_upper = STREQ (str, "upper"); | |
| 2714 re_bool is_xdigit = STREQ (str, "xdigit"); | |
| 428 | 2715 |
| 2716 if (!IS_CHAR_CLASS (str)) | |
| 2717 FREE_STACK_RETURN (REG_ECTYPE); | |
| 2718 | |
| 2719 /* Throw away the ] at the end of the character | |
| 2720 class. */ | |
| 2721 PATFETCH (c); | |
| 2722 | |
| 2723 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | |
| 2724 | |
| 2725 for (ch = 0; ch < 1 << BYTEWIDTH; ch++) | |
| 2726 { | |
| 2727 /* This was split into 3 if's to | |
| 2728 avoid an arbitrary limit in some compiler. */ | |
| 2729 if ( (is_alnum && ISALNUM (ch)) | |
| 2730 || (is_alpha && ISALPHA (ch)) | |
| 2731 || (is_blank && ISBLANK (ch)) | |
| 2732 || (is_cntrl && ISCNTRL (ch))) | |
| 2733 SET_EITHER_BIT (ch); | |
| 2734 if ( (is_digit && ISDIGIT (ch)) | |
| 2735 || (is_graph && ISGRAPH (ch)) | |
| 2736 || (is_lower && ISLOWER (ch)) | |
| 2737 || (is_print && ISPRINT (ch))) | |
| 2738 SET_EITHER_BIT (ch); | |
| 2739 if ( (is_punct && ISPUNCT (ch)) | |
| 2740 || (is_space && ISSPACE (ch)) | |
| 2741 || (is_upper && ISUPPER (ch)) | |
| 2742 || (is_xdigit && ISXDIGIT (ch))) | |
| 2743 SET_EITHER_BIT (ch); | |
| 2744 } | |
| 2745 had_char_class = true; | |
| 2746 } | |
| 2747 else | |
| 2748 { | |
| 2749 c1++; | |
| 2750 while (c1--) | |
| 2751 PATUNFETCH; | |
| 2752 SET_EITHER_BIT ('['); | |
| 2753 SET_EITHER_BIT (':'); | |
| 2754 had_char_class = false; | |
| 2755 } | |
| 2756 } | |
| 2757 else | |
| 2758 { | |
| 2759 had_char_class = false; | |
| 2760 SET_EITHER_BIT (c); | |
| 2761 } | |
| 2762 } | |
| 2763 | |
| 2764 #ifdef MULE | |
| 2765 if (has_extended_chars) | |
| 2766 { | |
| 2767 /* We have a range table, not a bit vector. */ | |
| 2768 int bytes_needed = | |
| 2769 unified_range_table_bytes_needed (rtab); | |
| 2770 GET_BUFFER_SPACE (bytes_needed); | |
| 446 | 2771 unified_range_table_copy_data (rtab, buf_end); |
| 2772 buf_end += unified_range_table_bytes_used (buf_end); | |
| 428 | 2773 break; |
| 2774 } | |
| 2775 #endif /* MULE */ | |
| 2776 /* Discard any (non)matching list bytes that are all 0 at the | |
| 2777 end of the map. Decrease the map-length byte too. */ | |
| 446 | 2778 while ((int) buf_end[-1] > 0 && buf_end[buf_end[-1] - 1] == 0) |
| 2779 buf_end[-1]--; | |
| 2780 buf_end += buf_end[-1]; | |
| 428 | 2781 } |
| 2782 break; | |
| 2783 | |
| 2784 | |
| 2785 case '(': | |
| 2786 if (syntax & RE_NO_BK_PARENS) | |
| 2787 goto handle_open; | |
| 2788 else | |
| 2789 goto normal_char; | |
| 2790 | |
| 2791 | |
| 2792 case ')': | |
| 2793 if (syntax & RE_NO_BK_PARENS) | |
| 2794 goto handle_close; | |
| 2795 else | |
| 2796 goto normal_char; | |
| 2797 | |
| 2798 | |
| 2799 case '\n': | |
| 2800 if (syntax & RE_NEWLINE_ALT) | |
| 2801 goto handle_alt; | |
| 2802 else | |
| 2803 goto normal_char; | |
| 2804 | |
| 2805 | |
| 2806 case '|': | |
| 2807 if (syntax & RE_NO_BK_VBAR) | |
| 2808 goto handle_alt; | |
| 2809 else | |
| 2810 goto normal_char; | |
| 2811 | |
| 2812 | |
| 2813 case '{': | |
| 2814 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES) | |
| 2815 goto handle_interval; | |
| 2816 else | |
| 2817 goto normal_char; | |
| 2818 | |
| 2819 | |
| 2820 case '\\': | |
| 2821 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); | |
| 2822 | |
| 2823 /* Do not translate the character after the \, so that we can | |
| 2824 distinguish, e.g., \B from \b, even if we normally would | |
| 2825 translate, e.g., B to b. */ | |
| 2826 PATFETCH_RAW (c); | |
| 2827 | |
| 2828 switch (c) | |
| 2829 { | |
| 2830 case '(': | |
| 2831 if (syntax & RE_NO_BK_PARENS) | |
| 2832 goto normal_backslash; | |
| 2833 | |
| 2834 handle_open: | |
| 2835 { | |
| 2836 regnum_t r; | |
| 502 | 2837 int shy = 0; |
| 428 | 2838 |
| 2839 if (!(syntax & RE_NO_SHY_GROUPS) | |
| 2840 && p != pend | |
| 446 | 2841 && *p == '?') |
| 428 | 2842 { |
| 2843 p++; | |
| 446 | 2844 PATFETCH (c); |
| 428 | 2845 switch (c) |
| 2846 { | |
| 2847 case ':': /* shy groups */ | |
| 502 | 2848 shy = 1; |
| 428 | 2849 break; |
| 2850 | |
| 2851 /* All others are reserved for future constructs. */ | |
| 2852 default: | |
| 2853 FREE_STACK_RETURN (REG_BADPAT); | |
| 2854 } | |
| 2855 } | |
| 502 | 2856 |
| 2857 r = ++regnum; | |
| 2858 bufp->re_ngroups++; | |
| 2859 if (!shy) | |
| 2860 { | |
| 2861 bufp->re_nsub++; | |
| 2862 while (bufp->external_to_internal_register_size <= | |
| 2863 bufp->re_nsub) | |
| 2864 { | |
| 2865 int i; | |
| 2866 int old_size = | |
| 2867 bufp->external_to_internal_register_size; | |
| 2868 bufp->external_to_internal_register_size += 5; | |
| 2869 RETALLOC (bufp->external_to_internal_register, | |
| 2870 bufp->external_to_internal_register_size, | |
| 2871 int); | |
| 2872 /* debugging */ | |
| 2873 for (i = old_size; | |
| 2874 i < bufp->external_to_internal_register_size; i++) | |
| 2875 bufp->external_to_internal_register[i] = | |
| 2876 (int) 0xDEADBEEF; | |
| 2877 } | |
| 2878 | |
| 2879 bufp->external_to_internal_register[bufp->re_nsub] = | |
| 2880 bufp->re_ngroups; | |
| 2881 } | |
| 428 | 2882 |
| 2883 if (COMPILE_STACK_FULL) | |
| 2884 { | |
| 2885 RETALLOC (compile_stack.stack, compile_stack.size << 1, | |
| 2886 compile_stack_elt_t); | |
| 2887 if (compile_stack.stack == NULL) return REG_ESPACE; | |
| 2888 | |
| 2889 compile_stack.size <<= 1; | |
| 2890 } | |
| 2891 | |
| 2892 /* These are the values to restore when we hit end of this | |
| 2893 group. They are all relative offsets, so that if the | |
| 2894 whole pattern moves because of realloc, they will still | |
| 2895 be valid. */ | |
| 2896 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer; | |
| 2897 COMPILE_STACK_TOP.fixup_alt_jump | |
| 2898 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; | |
| 446 | 2899 COMPILE_STACK_TOP.laststart_offset = buf_end - bufp->buffer; |
| 428 | 2900 COMPILE_STACK_TOP.regnum = r; |
| 2901 | |
| 2902 /* We will eventually replace the 0 with the number of | |
| 2903 groups inner to this one. But do not push a | |
| 2904 start_memory for groups beyond the last one we can | |
| 502 | 2905 represent in the compiled pattern. |
| 2906 #### bad bad bad. this will fail in lots of ways, if we | |
| 2907 ever have to backtrack for these groups. | |
| 2908 */ | |
| 428 | 2909 if (r <= MAX_REGNUM) |
| 2910 { | |
| 2911 COMPILE_STACK_TOP.inner_group_offset | |
| 446 | 2912 = buf_end - bufp->buffer + 2; |
| 428 | 2913 BUF_PUSH_3 (start_memory, r, 0); |
| 2914 } | |
| 2915 | |
| 2916 compile_stack.avail++; | |
| 2917 | |
| 2918 fixup_alt_jump = 0; | |
| 2919 laststart = 0; | |
| 446 | 2920 begalt = buf_end; |
| 428 | 2921 /* If we've reached MAX_REGNUM groups, then this open |
| 2922 won't actually generate any code, so we'll have to | |
| 2923 clear pending_exact explicitly. */ | |
| 2924 pending_exact = 0; | |
| 2925 } | |
| 2926 break; | |
| 2927 | |
| 2928 | |
| 2929 case ')': | |
| 2930 if (syntax & RE_NO_BK_PARENS) goto normal_backslash; | |
| 2931 | |
| 2932 if (COMPILE_STACK_EMPTY) { | |
| 2933 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) | |
| 2934 goto normal_backslash; | |
| 2935 else | |
| 2936 FREE_STACK_RETURN (REG_ERPAREN); | |
| 2937 } | |
| 2938 | |
| 2939 handle_close: | |
| 2940 if (fixup_alt_jump) | |
| 2941 { /* Push a dummy failure point at the end of the | |
| 2942 alternative for a possible future | |
| 2943 `pop_failure_jump' to pop. See comments at | |
| 2944 `push_dummy_failure' in `re_match_2'. */ | |
| 2945 BUF_PUSH (push_dummy_failure); | |
| 2946 | |
| 2947 /* We allocated space for this jump when we assigned | |
| 2948 to `fixup_alt_jump', in the `handle_alt' case below. */ | |
| 446 | 2949 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end - 1); |
| 428 | 2950 } |
| 2951 | |
| 2952 /* See similar code for backslashed left paren above. */ | |
| 2953 if (COMPILE_STACK_EMPTY) { | |
| 2954 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) | |
| 2955 goto normal_char; | |
| 2956 else | |
| 2957 FREE_STACK_RETURN (REG_ERPAREN); | |
| 2958 } | |
| 2959 | |
| 2960 /* Since we just checked for an empty stack above, this | |
| 2961 ``can't happen''. */ | |
| 2962 assert (compile_stack.avail != 0); | |
| 2963 { | |
| 2964 /* We don't just want to restore into `regnum', because | |
| 2965 later groups should continue to be numbered higher, | |
| 2966 as in `(ab)c(de)' -- the second group is #2. */ | |
| 2967 regnum_t this_group_regnum; | |
| 2968 | |
| 2969 compile_stack.avail--; | |
| 2970 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset; | |
| 2971 fixup_alt_jump | |
| 2972 = COMPILE_STACK_TOP.fixup_alt_jump | |
| 2973 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1 | |
| 2974 : 0; | |
| 2975 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset; | |
| 2976 this_group_regnum = COMPILE_STACK_TOP.regnum; | |
| 2977 /* If we've reached MAX_REGNUM groups, then this close | |
| 2978 won't actually generate any code, so we'll have to | |
| 2979 clear pending_exact explicitly. */ | |
| 2980 pending_exact = 0; | |
| 2981 | |
| 2982 /* We're at the end of the group, so now we know how many | |
| 2983 groups were inside this one. */ | |
| 2984 if (this_group_regnum <= MAX_REGNUM) | |
| 2985 { | |
| 2986 unsigned char *inner_group_loc | |
| 2987 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset; | |
| 2988 | |
| 2989 *inner_group_loc = regnum - this_group_regnum; | |
| 2990 BUF_PUSH_3 (stop_memory, this_group_regnum, | |
| 2991 regnum - this_group_regnum); | |
| 2992 } | |
| 2993 } | |
| 2994 break; | |
| 2995 | |
| 2996 | |
| 2997 case '|': /* `\|'. */ | |
| 2998 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR) | |
| 2999 goto normal_backslash; | |
| 3000 handle_alt: | |
| 3001 if (syntax & RE_LIMITED_OPS) | |
| 3002 goto normal_char; | |
| 3003 | |
| 3004 /* Insert before the previous alternative a jump which | |
| 3005 jumps to this alternative if the former fails. */ | |
| 3006 GET_BUFFER_SPACE (3); | |
| 446 | 3007 INSERT_JUMP (on_failure_jump, begalt, buf_end + 6); |
| 428 | 3008 pending_exact = 0; |
| 446 | 3009 buf_end += 3; |
| 428 | 3010 |
| 3011 /* The alternative before this one has a jump after it | |
| 3012 which gets executed if it gets matched. Adjust that | |
| 3013 jump so it will jump to this alternative's analogous | |
| 3014 jump (put in below, which in turn will jump to the next | |
| 3015 (if any) alternative's such jump, etc.). The last such | |
| 3016 jump jumps to the correct final destination. A picture: | |
| 3017 _____ _____ | |
| 3018 | | | | | |
| 3019 | v | v | |
| 3020 a | b | c | |
| 3021 | |
| 3022 If we are at `b', then fixup_alt_jump right now points to a | |
| 3023 three-byte space after `a'. We'll put in the jump, set | |
| 3024 fixup_alt_jump to right after `b', and leave behind three | |
| 3025 bytes which we'll fill in when we get to after `c'. */ | |
| 3026 | |
| 3027 if (fixup_alt_jump) | |
| 446 | 3028 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end); |
| 428 | 3029 |
| 3030 /* Mark and leave space for a jump after this alternative, | |
| 3031 to be filled in later either by next alternative or | |
| 3032 when we know we're at the end of a series of alternatives. */ | |
| 446 | 3033 fixup_alt_jump = buf_end; |
| 428 | 3034 GET_BUFFER_SPACE (3); |
| 446 | 3035 buf_end += 3; |
| 428 | 3036 |
| 3037 laststart = 0; | |
| 446 | 3038 begalt = buf_end; |
| 428 | 3039 break; |
| 3040 | |
| 3041 | |
| 3042 case '{': | |
| 3043 /* If \{ is a literal. */ | |
| 3044 if (!(syntax & RE_INTERVALS) | |
| 3045 /* If we're at `\{' and it's not the open-interval | |
| 3046 operator. */ | |
| 3047 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) | |
| 3048 || (p - 2 == pattern && p == pend)) | |
| 3049 goto normal_backslash; | |
| 3050 | |
| 3051 handle_interval: | |
| 3052 { | |
| 3053 /* If got here, then the syntax allows intervals. */ | |
| 3054 | |
| 3055 /* At least (most) this many matches must be made. */ | |
| 3056 int lower_bound = -1, upper_bound = -1; | |
| 3057 | |
| 3058 beg_interval = p - 1; | |
| 3059 | |
| 3060 if (p == pend) | |
| 3061 { | |
| 3062 if (syntax & RE_NO_BK_BRACES) | |
| 3063 goto unfetch_interval; | |
| 3064 else | |
| 3065 FREE_STACK_RETURN (REG_EBRACE); | |
| 3066 } | |
| 3067 | |
| 3068 GET_UNSIGNED_NUMBER (lower_bound); | |
| 3069 | |
| 3070 if (c == ',') | |
| 3071 { | |
| 3072 GET_UNSIGNED_NUMBER (upper_bound); | |
| 3073 if (upper_bound < 0) upper_bound = RE_DUP_MAX; | |
| 3074 } | |
| 3075 else | |
| 3076 /* Interval such as `{1}' => match exactly once. */ | |
| 3077 upper_bound = lower_bound; | |
| 3078 | |
| 3079 if (lower_bound < 0 || upper_bound > RE_DUP_MAX | |
| 3080 || lower_bound > upper_bound) | |
| 3081 { | |
| 3082 if (syntax & RE_NO_BK_BRACES) | |
| 3083 goto unfetch_interval; | |
| 3084 else | |
| 3085 FREE_STACK_RETURN (REG_BADBR); | |
| 3086 } | |
| 3087 | |
| 3088 if (!(syntax & RE_NO_BK_BRACES)) | |
| 3089 { | |
| 3090 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE); | |
| 3091 | |
| 3092 PATFETCH (c); | |
| 3093 } | |
| 3094 | |
| 3095 if (c != '}') | |
| 3096 { | |
| 3097 if (syntax & RE_NO_BK_BRACES) | |
| 3098 goto unfetch_interval; | |
| 3099 else | |
| 3100 FREE_STACK_RETURN (REG_BADBR); | |
| 3101 } | |
| 3102 | |
| 3103 /* We just parsed a valid interval. */ | |
| 3104 | |
| 3105 /* If it's invalid to have no preceding re. */ | |
| 3106 if (!laststart) | |
| 3107 { | |
| 3108 if (syntax & RE_CONTEXT_INVALID_OPS) | |
| 3109 FREE_STACK_RETURN (REG_BADRPT); | |
| 3110 else if (syntax & RE_CONTEXT_INDEP_OPS) | |
| 446 | 3111 laststart = buf_end; |
| 428 | 3112 else |
| 3113 goto unfetch_interval; | |
| 3114 } | |
| 3115 | |
| 3116 /* If the upper bound is zero, don't want to succeed at | |
| 3117 all; jump from `laststart' to `buf_end + 3', which will be | |
| 3118 the end of the buffer after we insert the jump. */ | |
| 3119 if (upper_bound == 0) | |
| 3120 { | |
| 3121 GET_BUFFER_SPACE (3); | |
| 446 | 3122 INSERT_JUMP (jump, laststart, buf_end + 3); |
| 3123 buf_end += 3; | |
| 428 | 3124 } |
| 3125 | |
| 3126 /* Otherwise, we have a nontrivial interval. When | |
| 3127 we're all done, the pattern will look like: | |
| 3128 set_number_at <jump count> <upper bound> | |
| 3129 set_number_at <succeed_n count> <lower bound> | |
| 3130 succeed_n <after jump addr> <succeed_n count> | |
| 3131 <body of loop> | |
| 3132 jump_n <succeed_n addr> <jump count> | |
| 3133 (The upper bound and `jump_n' are omitted if | |
| 3134 `upper_bound' is 1, though.) */ | |
| 3135 else | |
| 3136 { /* If the upper bound is > 1, we need to insert | |
| 3137 more at the end of the loop. */ | |
| 647 | 3138 int nbytes = 10 + (upper_bound > 1) * 10; |
| 428 | 3139 |
| 3140 GET_BUFFER_SPACE (nbytes); | |
| 3141 | |
| 3142 /* Initialize lower bound of the `succeed_n', even | |
| 3143 though it will be set during matching by its | |
| 3144 attendant `set_number_at' (inserted next), | |
| 3145 because `re_compile_fastmap' needs to know. | |
| 3146 Jump to the `jump_n' we might insert below. */ | |
| 3147 INSERT_JUMP2 (succeed_n, laststart, | |
| 446 | 3148 buf_end + 5 + (upper_bound > 1) * 5, |
| 428 | 3149 lower_bound); |
| 446 | 3150 buf_end += 5; |
| 428 | 3151 |
| 3152 /* Code to initialize the lower bound. Insert | |
| 3153 before the `succeed_n'. The `5' is the last two | |
| 3154 bytes of this `set_number_at', plus 3 bytes of | |
| 3155 the following `succeed_n'. */ | |
| 446 | 3156 insert_op2 (set_number_at, laststart, 5, lower_bound, buf_end); |
| 3157 buf_end += 5; | |
| 428 | 3158 |
| 3159 if (upper_bound > 1) | |
| 3160 { /* More than one repetition is allowed, so | |
| 3161 append a backward jump to the `succeed_n' | |
| 3162 that starts this interval. | |
| 3163 | |
| 3164 When we've reached this during matching, | |
| 3165 we'll have matched the interval once, so | |
| 3166 jump back only `upper_bound - 1' times. */ | |
| 446 | 3167 STORE_JUMP2 (jump_n, buf_end, laststart + 5, |
| 428 | 3168 upper_bound - 1); |
| 446 | 3169 buf_end += 5; |
| 428 | 3170 |
| 3171 /* The location we want to set is the second | |
| 3172 parameter of the `jump_n'; that is `buf_end - 2' as | |
| 3173 an absolute address. `laststart' will be | |
| 3174 the `set_number_at' we're about to insert; | |
| 3175 `laststart+3' the number to set, the source | |
| 3176 for the relative address. But we are | |
| 3177 inserting into the middle of the pattern -- | |
| 3178 so everything is getting moved up by 5. | |
| 3179 Conclusion: (buf_end - 2) - (laststart + 3) + 5, | |
| 3180 i.e., buf_end - laststart. | |
| 3181 | |
| 3182 We insert this at the beginning of the loop | |
| 3183 so that if we fail during matching, we'll | |
| 3184 reinitialize the bounds. */ | |
| 446 | 3185 insert_op2 (set_number_at, laststart, |
| 3186 buf_end - laststart, | |
| 3187 upper_bound - 1, buf_end); | |
| 3188 buf_end += 5; | |
| 428 | 3189 } |
| 3190 } | |
| 3191 pending_exact = 0; | |
| 3192 beg_interval = NULL; | |
| 3193 } | |
| 3194 break; | |
| 3195 | |
| 3196 unfetch_interval: | |
| 3197 /* If an invalid interval, match the characters as literals. */ | |
| 3198 assert (beg_interval); | |
| 3199 p = beg_interval; | |
| 3200 beg_interval = NULL; | |
| 3201 | |
| 3202 /* normal_char and normal_backslash need `c'. */ | |
| 3203 PATFETCH (c); | |
| 3204 | |
| 3205 if (!(syntax & RE_NO_BK_BRACES)) | |
| 3206 { | |
| 3207 if (p > pattern && p[-1] == '\\') | |
| 3208 goto normal_backslash; | |
| 3209 } | |
| 3210 goto normal_char; | |
| 3211 | |
| 3212 #ifdef emacs | |
| 3213 /* There is no way to specify the before_dot and after_dot | |
| 3214 operators. rms says this is ok. --karl */ | |
| 3215 case '=': | |
| 3216 BUF_PUSH (at_dot); | |
| 3217 break; | |
| 3218 | |
| 3219 case 's': | |
| 446 | 3220 laststart = buf_end; |
| 428 | 3221 PATFETCH (c); |
| 3222 /* XEmacs addition */ | |
| 3223 if (c >= 0x80 || syntax_spec_code[c] == 0377) | |
| 3224 FREE_STACK_RETURN (REG_ESYNTAX); | |
| 3225 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]); | |
| 3226 break; | |
| 3227 | |
| 3228 case 'S': | |
| 446 | 3229 laststart = buf_end; |
| 428 | 3230 PATFETCH (c); |
| 3231 /* XEmacs addition */ | |
| 3232 if (c >= 0x80 || syntax_spec_code[c] == 0377) | |
| 3233 FREE_STACK_RETURN (REG_ESYNTAX); | |
| 3234 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]); | |
| 3235 break; | |
| 3236 | |
| 3237 #ifdef MULE | |
| 3238 /* 97.2.17 jhod merged in to XEmacs from mule-2.3 */ | |
| 3239 case 'c': | |
| 446 | 3240 laststart = buf_end; |
| 428 | 3241 PATFETCH_RAW (c); |
| 3242 if (c < 32 || c > 127) | |
| 3243 FREE_STACK_RETURN (REG_ECATEGORY); | |
| 3244 BUF_PUSH_2 (categoryspec, c); | |
| 3245 break; | |
| 3246 | |
| 3247 case 'C': | |
| 446 | 3248 laststart = buf_end; |
| 428 | 3249 PATFETCH_RAW (c); |
| 3250 if (c < 32 || c > 127) | |
| 3251 FREE_STACK_RETURN (REG_ECATEGORY); | |
| 3252 BUF_PUSH_2 (notcategoryspec, c); | |
| 3253 break; | |
| 3254 /* end of category patch */ | |
| 3255 #endif /* MULE */ | |
| 3256 #endif /* emacs */ | |
| 3257 | |
| 3258 | |
| 3259 case 'w': | |
| 446 | 3260 laststart = buf_end; |
| 428 | 3261 BUF_PUSH (wordchar); |
| 3262 break; | |
| 3263 | |
| 3264 | |
| 3265 case 'W': | |
| 446 | 3266 laststart = buf_end; |
| 428 | 3267 BUF_PUSH (notwordchar); |
| 3268 break; | |
| 3269 | |
| 3270 | |
| 3271 case '<': | |
| 3272 BUF_PUSH (wordbeg); | |
| 3273 break; | |
| 3274 | |
| 3275 case '>': | |
| 3276 BUF_PUSH (wordend); | |
| 3277 break; | |
| 3278 | |
| 3279 case 'b': | |
| 3280 BUF_PUSH (wordbound); | |
| 3281 break; | |
| 3282 | |
| 3283 case 'B': | |
| 3284 BUF_PUSH (notwordbound); | |
| 3285 break; | |
| 3286 | |
| 3287 case '`': | |
| 3288 BUF_PUSH (begbuf); | |
| 3289 break; | |
| 3290 | |
| 3291 case '\'': | |
| 3292 BUF_PUSH (endbuf); | |
| 3293 break; | |
| 3294 | |
| 3295 case '1': case '2': case '3': case '4': case '5': | |
| 3296 case '6': case '7': case '8': case '9': | |
| 446 | 3297 { |
| 502 | 3298 regnum_t reg, regint; |
| 3299 int may_need_to_unfetch = 0; | |
| 446 | 3300 if (syntax & RE_NO_BK_REFS) |
| 3301 goto normal_char; | |
| 3302 | |
| 502 | 3303 /* This only goes up to 99. It could be extended to work |
| 3304 up to 255 (the maximum number of registers that can be | |
| 3305 handled by the current regexp engine, because it stores | |
| 3306 its register numbers in the compiled pattern as one byte, | |
| 3307 ugh). Doing that's a bit trickier, because you might | |
| 3308 have the case where \25 is a back-ref but \255 is not, ... */ |
| 446 | 3309 reg = c - '0'; |
| 502 | 3310 if (p < pend) |
| 3311 { | |
| 3312 PATFETCH (c); | |
| 3313 if (c >= '0' && c <= '9') | |
| 3314 { | |
| 3315 regnum_t new_reg = reg * 10 + c - '0'; | |
| 3316 if (new_reg <= bufp->re_nsub) | |
| 3317 { | |
| 3318 reg = new_reg; | |
| 3319 may_need_to_unfetch = 1; | |
| 3320 } | |
| 3321 else | |
| 3322 PATUNFETCH; | |
| 3323 } | |
| 523 | 3324 else |
| 3325 PATUNFETCH; | |
| 502 | 3326 } |
| 3327 | |
| 3328 if (reg > bufp->re_nsub) | |
| 446 | 3329 FREE_STACK_RETURN (REG_ESUBREG); |
| 3330 | |
| 502 | 3331 regint = bufp->external_to_internal_register[reg]; |
| 446 | 3332 /* Can't back reference to a subexpression if inside of it. */ |
| 502 | 3333 if (group_in_compile_stack (compile_stack, regint)) |
| 3334 { | |
| 3335 if (may_need_to_unfetch) | |
| 3336 PATUNFETCH; | |
| 3337 goto normal_char; | |
| 3338 } | |
| 3339 | |
| 3340 #ifdef emacs | |
| 3341 if (reg > 9 && | |
| 3342 bufp->warned_about_incompatible_back_references == 0) | |
| 3343 { | |
| 3344 bufp->warned_about_incompatible_back_references = 1; | |
| 3345 warn_when_safe (intern ("regex"), Qinfo, | |
| 3346 "Back reference \\%d now has new " | |
| 3347 "semantics in %s", reg, pattern); | |
| 3348 } | |
| 3349 #endif | |
| 446 | 3350 |
| 3351 laststart = buf_end; | |
| 502 | 3352 BUF_PUSH_2 (duplicate, regint); |
| 446 | 3353 } |
| 428 | 3354 break; |
| 3355 | |
| 3356 | |
| 3357 case '+': | |
| 3358 case '?': | |
| 3359 if (syntax & RE_BK_PLUS_QM) | |
| 3360 goto handle_plus; | |
| 3361 else | |
| 3362 goto normal_backslash; | |
| 3363 | |
| 3364 default: | |
| 3365 normal_backslash: | |
| 3366 /* You might think it would be useful for \ to mean | |
| 3367 not to translate; but if we don't translate it, | |
| 3368 it will never match anything. */ | |
| 826 | 3369 c = RE_TRANSLATE (c); |
| 428 | 3370 goto normal_char; |
| 3371 } | |
| 3372 break; | |
| 3373 | |
| 3374 | |
| 3375 default: | |
| 3376 /* Expects the character in `c'. */ | |
| 3377 /* `p' points to the location after where `c' came from. */ | |
| 3378 normal_char: | |
| 3379 { | |
|
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3380 /* The following conditional synced to GNU Emacs 22.1. */ |
| 428 | 3381 /* If no exactn currently being built. */ |
| 3382 if (!pending_exact | |
| 3383 | |
| 3384 /* If last exactn not at current position. */ | |
| 446 | 3385 || pending_exact + *pending_exact + 1 != buf_end |
| 428 | 3386 |
| 3387 /* We have only one byte following the exactn for the count. */ | |
|
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3388 || *pending_exact >= (1 << BYTEWIDTH) - MAX_ICHAR_LEN |
|
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3389 |
|
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3390 /* If followed by a repetition operator. |
|
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3391 If the lookahead fails because of end of pattern, any |
|
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3392 trailing backslash will get caught later. */ |
|
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3393 || (p != pend && (*p == '*' || *p == '^')) |
| 428 | 3394 || ((syntax & RE_BK_PLUS_QM) |
|
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3395 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?') |
|
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3396 : p != pend && (*p == '+' || *p == '?')) |
| 428 | 3397 || ((syntax & RE_INTERVALS) |
| 3398 && ((syntax & RE_NO_BK_BRACES) | |
|
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3399 ? p != pend && *p == '{' |
|
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3400 : p + 1 < pend && (p[0] == '\\' && p[1] == '{')))) |
| 428 | 3401 { |
| 3402 /* Start building a new exactn. */ | |
| 3403 | |
| 446 | 3404 laststart = buf_end; |
| 428 | 3405 |
| 3406 BUF_PUSH_2 (exactn, 0); | |
| 446 | 3407 pending_exact = buf_end - 1; |
| 428 | 3408 } |
| 3409 | |
| 446 | 3410 #ifndef MULE |
| 428 | 3411 BUF_PUSH (c); |
| 3412 (*pending_exact)++; | |
| 446 | 3413 #else |
| 3414 { | |
| 3415 Bytecount bt_count; | |
| 867 | 3416 Ibyte tmp_buf[MAX_ICHAR_LEN]; |
| 446 | 3417 int i; |
| 3418 | |
| 867 | 3419 bt_count = set_itext_ichar (tmp_buf, c); |
| 446 | 3420 |
| 3421 for (i = 0; i < bt_count; i++) | |
| 3422 { | |
| 3423 BUF_PUSH (tmp_buf[i]); | |
| 3424 (*pending_exact)++; | |
| 3425 } | |
| 3426 } | |
| 3427 #endif | |
| 428 | 3428 break; |
| 3429 } | |
| 3430 } /* switch (c) */ | |
| 3431 } /* while p != pend */ | |
| 3432 | |
| 3433 | |
| 3434 /* Through the pattern now. */ | |
| 3435 | |
| 3436 if (fixup_alt_jump) | |
| 446 | 3437 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end); |
| 428 | 3438 |
| 3439 if (!COMPILE_STACK_EMPTY) | |
| 3440 FREE_STACK_RETURN (REG_EPAREN); | |
| 3441 | |
| 3442 /* If we don't want backtracking, force success | |
| 3443 the first time we reach the end of the compiled pattern. */ | |
| 3444 if (syntax & RE_NO_POSIX_BACKTRACKING) | |
| 3445 BUF_PUSH (succeed); | |
| 3446 | |
|
4976
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4832
diff
changeset
|
3447 xfree (compile_stack.stack); |
| 428 | 3448 |
| 3449 /* We have succeeded; set the length of the buffer. */ | |
| 446 | 3450 bufp->used = buf_end - bufp->buffer; |
| 428 | 3451 |
| 3452 #ifdef DEBUG | |
| 5041 | 3453 if (debug_regexps & RE_DEBUG_COMPILATION) |
| 428 | 3454 { |
| 3455 DEBUG_PRINT1 ("\nCompiled pattern: \n"); | |
| 3456 print_compiled_pattern (bufp); | |
| 3457 } | |
| 3458 #endif /* DEBUG */ | |
| 3459 | |
| 3460 #ifndef MATCH_MAY_ALLOCATE | |
| 3461 /* Initialize the failure stack to the largest possible stack. This | |
| 3462 isn't necessary unless we're trying to avoid calling alloca in | |
| 3463 the search and match routines. */ | |
| 3464 { | |
| 502 | 3465 int num_regs = bufp->re_ngroups + 1; |
| 428 | 3466 |
| 3467 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size | |
| 3468 is strictly greater than re_max_failures, the largest possible stack | |
| 3469 is 2 * re_max_failures failure points. */ | |
| 3470 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS)) | |
| 3471 { | |
| 3472 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS); | |
| 3473 | |
| 3474 if (! fail_stack.stack) | |
| 3475 fail_stack.stack | |
| 3476 = (fail_stack_elt_t *) xmalloc (fail_stack.size | |
| 3477 * sizeof (fail_stack_elt_t)); | |
| 3478 else | |
| 3479 fail_stack.stack | |
| 3480 = (fail_stack_elt_t *) xrealloc (fail_stack.stack, | |
| 3481 (fail_stack.size | |
| 3482 * sizeof (fail_stack_elt_t))); | |
| 3483 } | |
| 3484 | |
| 3485 regex_grow_registers (num_regs); | |
| 3486 } | |
| 3487 #endif /* not MATCH_MAY_ALLOCATE */ | |
| 3488 | |
| 3489 return REG_NOERROR; | |
| 3490 } /* regex_compile */ | |
| 3491 | |
| 3492 /* Subroutines for `regex_compile'. */ | |
| 3493 | |
| 3494 /* Store OP at LOC followed by two-byte integer parameter ARG. */ | |
| 3495 | |
| 3496 static void | |
| 3497 store_op1 (re_opcode_t op, unsigned char *loc, int arg) | |
| 3498 { | |
| 3499 *loc = (unsigned char) op; | |
| 3500 STORE_NUMBER (loc + 1, arg); | |
| 3501 } | |
| 3502 | |
| 3503 | |
| 3504 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */ | |
| 3505 | |
| 3506 static void | |
| 3507 store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2) | |
| 3508 { | |
| 3509 *loc = (unsigned char) op; | |
| 3510 STORE_NUMBER (loc + 1, arg1); | |
| 3511 STORE_NUMBER (loc + 3, arg2); | |
| 3512 } | |
| 3513 | |
| 3514 | |
| 3515 /* Copy the bytes from LOC to END to open up three bytes of space at LOC | |
| 3516 for OP followed by two-byte integer parameter ARG. */ | |
| 3517 | |
| 3518 static void | |
| 3519 insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end) | |
| 3520 { | |
| 3521 REGISTER unsigned char *pfrom = end; | |
| 3522 REGISTER unsigned char *pto = end + 3; | |
| 3523 | |
| 3524 while (pfrom != loc) | |
| 3525 *--pto = *--pfrom; | |
| 3526 | |
| 3527 store_op1 (op, loc, arg); | |
| 3528 } | |
| 3529 | |
| 3530 | |
| 3531 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */ | |
| 3532 | |
| 3533 static void | |
| 3534 insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, | |
| 3535 unsigned char *end) | |
| 3536 { | |
| 3537 REGISTER unsigned char *pfrom = end; | |
| 3538 REGISTER unsigned char *pto = end + 5; | |
| 3539 | |
| 3540 while (pfrom != loc) | |
| 3541 *--pto = *--pfrom; | |
| 3542 | |
| 3543 store_op2 (op, loc, arg1, arg2); | |
| 3544 } | |
| 3545 | |
| 3546 | |
| 3547 /* P points to just after a ^ in PATTERN. Return true if that ^ comes | |
| 3548 after an alternative or a begin-subexpression. We assume there is at | |
| 3549 least one character before the ^. */ | |
| 3550 | |
| 460 | 3551 static re_bool |
| 446 | 3552 at_begline_loc_p (re_char *pattern, re_char *p, reg_syntax_t syntax) |
| 428 | 3553 { |
| 446 | 3554 re_char *prev = p - 2; |
| 460 | 3555 re_bool prev_prev_backslash = prev > pattern && prev[-1] == '\\'; |
| 428 | 3556 |
| 3557 return | |
| 3558 /* After a subexpression? */ | |
| 3559 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) | |
| 3560 /* After an alternative? */ | |
| 3561 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash)); | |
| 3562 } | |
| 3563 | |
| 3564 | |
| 3565 /* The dual of at_begline_loc_p. This one is for $. We assume there is | |
| 3566 at least one character after the $, i.e., `P < PEND'. */ | |
| 3567 | |
| 460 | 3568 static re_bool |
| 446 | 3569 at_endline_loc_p (re_char *p, re_char *pend, int syntax) |
| 428 | 3570 { |
| 446 | 3571 re_char *next = p; |
| 460 | 3572 re_bool next_backslash = *next == '\\'; |
| 446 | 3573 re_char *next_next = p + 1 < pend ? p + 1 : 0; |
| 428 | 3574 |
| 3575 return | |
| 3576 /* Before a subexpression? */ | |
| 3577 (syntax & RE_NO_BK_PARENS ? *next == ')' | |
| 3578 : next_backslash && next_next && *next_next == ')') | |
| 3579 /* Before an alternative? */ | |
| 3580 || (syntax & RE_NO_BK_VBAR ? *next == '|' | |
| 3581 : next_backslash && next_next && *next_next == '|'); | |
| 3582 } | |
| 3583 | |
| 3584 | |
| 3585 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and | |
| 3586 false if it's not. */ | |
| 3587 | |
| 460 | 3588 static re_bool |
| 428 | 3589 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum) |
| 3590 { | |
| 3591 int this_element; | |
| 3592 | |
| 3593 for (this_element = compile_stack.avail - 1; | |
| 3594 this_element >= 0; | |
| 3595 this_element--) | |
| 3596 if (compile_stack.stack[this_element].regnum == regnum) | |
| 3597 return true; | |
| 3598 | |
| 3599 return false; | |
| 3600 } | |
| 3601 | |
| 3602 | |
| 3603 /* Read the ending character of a range (in a bracket expression) from the | |
| 3604 uncompiled pattern *P_PTR (which ends at PEND). We assume the | |
| 3605 starting character is in `P[-2]'. (`P[-1]' is the character `-'.) | |
| 3606 Then we set the translation of all bits between the starting and | |
| 3607 ending characters (inclusive) in the compiled pattern B. | |
| 3608 | |
| 3609 Return an error code. | |
| 3610 | |
| 3611 We use these short variable names so we can use the same macros as | |
| 826 | 3612 `regex_compile' itself. |
| 3613 | |
| 3614 Under Mule, this is only called when both chars of the range are | |
| 3615 ASCII. */ | |
| 428 | 3616 |
| 3617 static reg_errcode_t | |
| 446 | 3618 compile_range (re_char **p_ptr, re_char *pend, RE_TRANSLATE_TYPE translate, |
| 3619 reg_syntax_t syntax, unsigned char *buf_end) | |
| 428 | 3620 { |
| 867 | 3621 Ichar this_char; |
| 428 | 3622 |
| 446 | 3623 re_char *p = *p_ptr; |
| 428 | 3624 int range_start, range_end; |
| 3625 | |
| 3626 if (p == pend) | |
| 3627 return REG_ERANGE; | |
| 3628 | |
| 3629 /* Even though the pattern is a signed `char *', we need to fetch | |
| 3630 with unsigned char *'s; if the high bit of the pattern character | |
| 3631 is set, the range endpoints will be negative if we fetch using a | |
| 3632 signed char *. | |
| 3633 | |
| 3634 We also want to fetch the endpoints without translating them; the | |
| 3635 appropriate translation is done in the bit-setting loop below. */ | |
| 442 | 3636 /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */ |
| 3637 range_start = ((const unsigned char *) p)[-2]; | |
| 3638 range_end = ((const unsigned char *) p)[0]; | |
| 428 | 3639 |
| 3640 /* Have to increment the pointer into the pattern string, so the | |
| 3641 caller isn't still at the ending character. */ | |
| 3642 (*p_ptr)++; | |
| 3643 | |
| 3644 /* If the start is after the end, the range is empty. */ | |
| 3645 if (range_start > range_end) | |
| 3646 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; | |
| 3647 | |
| 3648 /* Here we see why `this_char' has to be larger than an `unsigned | |
| 3649 char' -- the range is inclusive, so if `range_end' == 0xff | |
| 3650 (assuming 8-bit characters), we would otherwise go into an infinite | |
| 3651 loop, since all characters <= 0xff. */ | |
| 3652 for (this_char = range_start; this_char <= range_end; this_char++) | |
| 3653 { | |
| 826 | 3654 SET_LIST_BIT (RE_TRANSLATE (this_char)); |
| 428 | 3655 } |
| 3656 | |
| 3657 return REG_NOERROR; | |
| 3658 } | |
| 3659 | |
| 3660 #ifdef MULE | |
| 3661 | |
| 3662 static reg_errcode_t | |
| 446 | 3663 compile_extended_range (re_char **p_ptr, re_char *pend, |
| 3664 RE_TRANSLATE_TYPE translate, | |
| 428 | 3665 reg_syntax_t syntax, Lisp_Object rtab) |
| 3666 { | |
| 867 | 3667 Ichar this_char, range_start, range_end; |
| 3668 const Ibyte *p; | |
| 428 | 3669 |
| 3670 if (*p_ptr == pend) | |
| 3671 return REG_ERANGE; | |
| 3672 | |
| 867 | 3673 p = (const Ibyte *) *p_ptr; |
| 3674 range_end = itext_ichar (p); | |
| 428 | 3675 p--; /* back to '-' */ |
| 867 | 3676 DEC_IBYTEPTR (p); /* back to start of range */ |
| 428 | 3677 /* We also want to fetch the endpoints without translating them; the |
| 3678 appropriate translation is done in the bit-setting loop below. */ | |
| 867 | 3679 range_start = itext_ichar (p); |
| 3680 INC_IBYTEPTR (*p_ptr); | |
| 428 | 3681 |
| 3682 /* If the start is after the end, the range is empty. */ | |
| 3683 if (range_start > range_end) | |
| 3684 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; | |
| 3685 | |
| 3686 /* Can't have ranges spanning different charsets, except maybe for | |
| 3687 ranges entirely within the first 256 chars. */ | |
| 3688 | |
| 3689 if ((range_start >= 0x100 || range_end >= 0x100) | |
| 867 | 3690 && ichar_leading_byte (range_start) != |
| 3691 ichar_leading_byte (range_end)) | |
| 428 | 3692 return REG_ERANGESPAN; |
| 3693 | |
| 826 | 3694 /* #### This might be way inefficient if the range encompasses 10,000 |
| 3695 chars or something. To be efficient, you'd have to do something like | |
| 3696 this: | |
| 428 | 3697 |
| 3698 range_table a; | |
| 3699 range_table b; | |
| 3700 map over translation table in [range_start, range_end] of | |
| 3701 (put the mapped range in a; | |
| 3702 put the translation in b) | |
| 3703 invert the range in a and truncate to [range_start, range_end] | |
| 3704 compute the union of a, b | |
| 3705 union the result into rtab | |
| 3706 */ | |
| 826 | 3707 for (this_char = range_start; this_char <= range_end; this_char++) |
| 428 | 3708 { |
| 826 | 3709 SET_RANGETAB_BIT (RE_TRANSLATE (this_char)); |
| 428 | 3710 } |
| 3711 | |
| 3712 if (this_char <= range_end) | |
| 3713 put_range_table (rtab, this_char, range_end, Qt); | |
| 3714 | |
| 3715 return REG_NOERROR; | |
| 3716 } | |
| 3717 | |
| 3718 #endif /* MULE */ | |
| 3719 | |
| 3720 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in | |
| 3721 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible | |
| 3722 characters can start a string that matches the pattern. This fastmap | |
| 3723 is used by re_search to skip quickly over impossible starting points. | |
| 3724 | |
| 3725 The caller must supply the address of a (1 << BYTEWIDTH)-byte data | |
| 3726 area as BUFP->fastmap. | |
| 3727 | |
| 3728 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in | |
| 3729 the pattern buffer. | |
| 3730 | |
| 3731 Returns 0 if we succeed, -2 if an internal error. */ | |
| 3732 | |
| 3733 int | |
| 826 | 3734 re_compile_fastmap (struct re_pattern_buffer *bufp |
| 3735 RE_LISP_SHORT_CONTEXT_ARGS_DECL) | |
| 428 | 3736 { |
| 3737 int j, k; | |
| 3738 #ifdef MATCH_MAY_ALLOCATE | |
| 3739 fail_stack_type fail_stack; | |
| 3740 #endif | |
| 456 | 3741 DECLARE_DESTINATION; |
| 428 | 3742 /* We don't push any register information onto the failure stack. */ |
| 3743 | |
| 826 | 3744 /* &&#### this should be changed for 8-bit-fixed, for efficiency. see |
| 3745 comment marked with &&#### in re_search_2. */ | |
| 3746 | |
| 428 | 3747 REGISTER char *fastmap = bufp->fastmap; |
| 3748 unsigned char *pattern = bufp->buffer; | |
| 647 | 3749 long size = bufp->used; |
| 428 | 3750 unsigned char *p = pattern; |
| 3751 REGISTER unsigned char *pend = pattern + size; | |
| 3752 | |
| 771 | 3753 #ifdef REGEX_REL_ALLOC |
| 428 | 3754 /* This holds the pointer to the failure stack, when |
| 3755 it is allocated relocatably. */ | |
| 3756 fail_stack_elt_t *failure_stack_ptr; | |
| 3757 #endif | |
| 3758 | |
| 3759 /* Assume that each path through the pattern can be null until | |
| 3760 proven otherwise. We set this false at the bottom of switch | |
| 3761 statement, to which we get only if a particular path doesn't | |
| 3762 match the empty string. */ | |
| 460 | 3763 re_bool path_can_be_null = true; |
| 428 | 3764 |
| 3765 /* We aren't doing a `succeed_n' to begin with. */ | |
| 460 | 3766 re_bool succeed_n_p = false; |
| 428 | 3767 |
| 1333 | 3768 #ifdef ERROR_CHECK_MALLOC |
| 3769 /* The pattern comes from string data, not buffer data. We don't access | |
| 3770 any buffer data, so we don't have to worry about malloc() (but the | |
| 3771 disallowed flag may have been set by a caller). */ | |
| 3772 int depth = bind_regex_malloc_disallowed (0); | |
| 3773 #endif | |
| 3774 | |
| 428 | 3775 assert (fastmap != NULL && p != NULL); |
| 3776 | |
| 3777 INIT_FAIL_STACK (); | |
| 3778 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */ | |
| 3779 bufp->fastmap_accurate = 1; /* It will be when we're done. */ | |
| 3780 bufp->can_be_null = 0; | |
| 3781 | |
| 3782 while (1) | |
| 3783 { | |
| 3784 if (p == pend || *p == succeed) | |
| 3785 { | |
| 3786 /* We have reached the (effective) end of pattern. */ | |
| 3787 if (!FAIL_STACK_EMPTY ()) | |
| 3788 { | |
| 3789 bufp->can_be_null |= path_can_be_null; | |
| 3790 | |
| 3791 /* Reset for next path. */ | |
| 3792 path_can_be_null = true; | |
| 3793 | |
| 446 | 3794 p = (unsigned char *) fail_stack.stack[--fail_stack.avail].pointer; |
| 428 | 3795 |
| 3796 continue; | |
| 3797 } | |
| 3798 else | |
| 3799 break; | |
| 3800 } | |
| 3801 | |
| 3802 /* We should never be about to go beyond the end of the pattern. */ | |
| 3803 assert (p < pend); | |
| 3804 | |
|
4759
aa5ed11f473b
Remove support for obsolete systems. See xemacs-patches message with ID
Jerry James <james@xemacs.org>
parents:
4750
diff
changeset
|
3805 switch ((re_opcode_t) *p++) |
| 428 | 3806 { |
| 3807 | |
| 3808 /* I guess the idea here is to simply not bother with a fastmap | |
| 3809 if a backreference is used, since it's too hard to figure out | |
| 3810 the fastmap for the corresponding group. Setting | |
| 3811 `can_be_null' stops `re_search_2' from using the fastmap, so | |
| 3812 that is all we do. */ | |
| 3813 case duplicate: | |
| 3814 bufp->can_be_null = 1; | |
| 3815 goto done; | |
| 3816 | |
| 3817 | |
| 3818 /* Following are the cases which match a character. These end | |
| 3819 with `break'. */ | |
| 3820 | |
| 3821 case exactn: | |
| 3822 fastmap[p[1]] = 1; | |
| 3823 break; | |
| 3824 | |
| 3825 | |
| 3826 case charset: | |
| 3827 /* XEmacs: Under Mule, these bit vectors will | |
| 3828 only contain values for characters below 0x80. */ | |
| 3829 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) | |
| 3830 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) | |
| 3831 fastmap[j] = 1; | |
| 3832 break; | |
| 3833 | |
| 3834 | |
| 3835 case charset_not: | |
| 3836 /* Chars beyond end of map must be allowed. */ | |
| 3837 #ifdef MULE | |
| 3838 for (j = *p * BYTEWIDTH; j < 0x80; j++) | |
| 3839 fastmap[j] = 1; | |
| 3840 /* And all extended characters must be allowed, too. */ | |
| 3841 for (j = 0x80; j < 0xA0; j++) | |
| 3842 fastmap[j] = 1; | |
| 446 | 3843 #else /* not MULE */ |
| 428 | 3844 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) |
| 3845 fastmap[j] = 1; | |
| 446 | 3846 #endif /* MULE */ |
| 428 | 3847 |
| 3848 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) | |
| 3849 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) | |
| 3850 fastmap[j] = 1; | |
| 3851 break; | |
| 3852 | |
| 3853 #ifdef MULE | |
| 3854 case charset_mule: | |
| 3855 { | |
| 3856 int nentries; | |
| 3857 int i; | |
| 3858 | |
| 3859 nentries = unified_range_table_nentries (p); | |
| 3860 for (i = 0; i < nentries; i++) | |
| 3861 { | |
| 3862 EMACS_INT first, last; | |
| 3863 Lisp_Object dummy_val; | |
| 3864 int jj; | |
| 867 | 3865 Ibyte strr[MAX_ICHAR_LEN]; |
| 428 | 3866 |
| 3867 unified_range_table_get_range (p, i, &first, &last, | |
| 3868 &dummy_val); | |
| 3869 for (jj = first; jj <= last && jj < 0x80; jj++) | |
| 3870 fastmap[jj] = 1; | |
| 3871 /* Ranges below 0x100 can span charsets, but there | |
| 3872 are only two (Control-1 and Latin-1), and | |
| 3873 either first or last has to be in them. */ | |
| 867 | 3874 set_itext_ichar (strr, first); |
| 428 | 3875 fastmap[*strr] = 1; |
| 3876 if (last < 0x100) | |
| 3877 { | |
| 867 | 3878 set_itext_ichar (strr, last); |
| 428 | 3879 fastmap[*strr] = 1; |
| 3880 } | |
| 3881 } | |
| 3882 } | |
| 3883 break; | |
| 3884 | |
| 3885 case charset_mule_not: | |
| 3886 { | |
| 3887 int nentries; | |
| 3888 int i; | |
|
4832
07fa38c30fdf
fix messed-up fastmap calculation in charset_mule_not
Ben Wing <ben@xemacs.org>
parents:
4759
diff
changeset
|
3889 int smallest_prev = 0; |
| 428 | 3890 |
| 3891 nentries = unified_range_table_nentries (p); | |
| 3892 for (i = 0; i < nentries; i++) | |
| 3893 { | |
| 3894 EMACS_INT first, last; | |
| 3895 Lisp_Object dummy_val; | |
| 3896 int jj; | |
| 3897 | |
| 3898 unified_range_table_get_range (p, i, &first, &last, | |
| 3899 &dummy_val); | |
| 3900 for (jj = smallest_prev; jj < first && jj < 0x80; jj++) | |
| 3901 fastmap[jj] = 1; | |
| 3902 smallest_prev = last + 1; | |
| 3903 if (smallest_prev >= 0x80) | |
| 3904 break; | |
| 3905 } | |
|
4832
07fa38c30fdf
fix messed-up fastmap calculation in charset_mule_not
Ben Wing <ben@xemacs.org>
parents:
4759
diff
changeset
|
3906 |
|
07fa38c30fdf
fix messed-up fastmap calculation in charset_mule_not
Ben Wing <ben@xemacs.org>
parents:
4759
diff
changeset
|
3907 /* Also set lead bytes after the end */ |
|
07fa38c30fdf
fix messed-up fastmap calculation in charset_mule_not
Ben Wing <ben@xemacs.org>
parents:
4759
diff
changeset
|
3908 for (i = smallest_prev; i < 0x80; i++) |
|
07fa38c30fdf
fix messed-up fastmap calculation in charset_mule_not
Ben Wing <ben@xemacs.org>
parents:
4759
diff
changeset
|
3909 fastmap[i] = 1; |
|
07fa38c30fdf
fix messed-up fastmap calculation in charset_mule_not
Ben Wing <ben@xemacs.org>
parents:
4759
diff
changeset
|
3910 |
| 428 | 3911 /* Calculating which leading bytes are actually allowed |
| 3912 here is rather difficult, so we just punt and allow | |
| 3913 all of them. */ | |
| 3914 for (i = 0x80; i < 0xA0; i++) | |
| 3915 fastmap[i] = 1; | |
| 3916 } | |
| 3917 break; | |
| 3918 #endif /* MULE */ | |
| 3919 | |
| 3920 | |
| 3921 case anychar: | |
| 3922 { | |
| 3923 int fastmap_newline = fastmap['\n']; | |
| 3924 | |
| 3925 /* `.' matches anything ... */ | |
| 3926 #ifdef MULE | |
| 3927 /* "anything" only includes bytes that can be the | |
| 3928 first byte of a character. */ | |
| 3929 for (j = 0; j < 0xA0; j++) | |
| 3930 fastmap[j] = 1; | |
| 3931 #else | |
| 3932 for (j = 0; j < (1 << BYTEWIDTH); j++) | |
| 3933 fastmap[j] = 1; | |
| 3934 #endif | |
| 3935 | |
| 3936 /* ... except perhaps newline. */ | |
| 3937 if (!(bufp->syntax & RE_DOT_NEWLINE)) | |
| 3938 fastmap['\n'] = fastmap_newline; | |
| 3939 | |
| 3940 /* Return if we have already set `can_be_null'; if we have, | |
| 3941 then the fastmap is irrelevant. Something's wrong here. */ | |
| 3942 else if (bufp->can_be_null) | |
| 3943 goto done; | |
| 3944 | |
| 3945 /* Otherwise, have to check alternative paths. */ | |
| 3946 break; | |
| 3947 } | |
| 3948 | |
| 826 | 3949 #ifndef emacs |
| 3950 case wordchar: | |
| 3951 for (j = 0; j < (1 << BYTEWIDTH); j++) | |
| 3952 if (SYNTAX (ignored, j) == Sword) | |
| 3953 fastmap[j] = 1; | |
| 3954 break; | |
| 3955 | |
| 3956 case notwordchar: | |
| 3957 for (j = 0; j < (1 << BYTEWIDTH); j++) | |
| 3958 if (SYNTAX (ignored, j) != Sword) | |
| 3959 fastmap[j] = 1; | |
| 3960 break; | |
| 3961 #else /* emacs */ | |
| 3962 case wordchar: | |
| 3963 case notwordchar: | |
| 460 | 3964 case wordbound: |
| 3965 case notwordbound: | |
| 3966 case wordbeg: | |
| 3967 case wordend: | |
| 3968 case notsyntaxspec: | |
| 3969 case syntaxspec: | |
| 3970 /* This match depends on text properties. These end with | |
| 3971 aborting optimizations. */ | |
| 3972 bufp->can_be_null = 1; | |
| 3973 goto done; | |
| 826 | 3974 #if 0 /* all of the following code is unused now that the `syntax-table' |
| 3975 property exists -- it's trickier to do this than just look in | |
| 3976 the buffer. &&#### but we could just use the syntax-cache stuff | |
| 3977 instead; why don't we? --ben */ | |
| 3978 case wordchar: | |
| 3979 k = (int) Sword; | |
| 3980 goto matchsyntax; | |
| 3981 | |
| 3982 case notwordchar: | |
| 3983 k = (int) Sword; | |
| 3984 goto matchnotsyntax; | |
| 3985 | |
| 428 | 3986 case syntaxspec: |
| 3987 k = *p++; | |
| 826 | 3988 matchsyntax: |
| 428 | 3989 #ifdef MULE |
| 3990 for (j = 0; j < 0x80; j++) | |
| 826 | 3991 if (SYNTAX |
| 3992 (XCHAR_TABLE (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf)), j) == | |
| 428 | 3993 (enum syntaxcode) k) |
| 3994 fastmap[j] = 1; | |
| 3995 for (j = 0x80; j < 0xA0; j++) | |
| 3996 { | |
| 826 | 3997 if (leading_byte_prefix_p ((unsigned char) j)) |
| 428 | 3998 /* too complicated to calculate this right */ |
| 3999 fastmap[j] = 1; | |
| 4000 else | |
| 4001 { | |
| 4002 int multi_p; | |
| 4003 Lisp_Object cset; | |
| 4004 | |
| 826 | 4005 cset = charset_by_leading_byte (j); |
| 428 | 4006 if (CHARSETP (cset)) |
| 4007 { | |
| 826 | 4008 if (charset_syntax (lispbuf, cset, &multi_p) |
| 428 | 4009 == Sword || multi_p) |
| 4010 fastmap[j] = 1; | |
| 4011 } | |
| 4012 } | |
| 4013 } | |
| 446 | 4014 #else /* not MULE */ |
| 428 | 4015 for (j = 0; j < (1 << BYTEWIDTH); j++) |
| 826 | 4016 if (SYNTAX |
| 4017 (XCHAR_TABLE (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf)), j) == | |
| 428 | 4018 (enum syntaxcode) k) |
| 4019 fastmap[j] = 1; | |
| 446 | 4020 #endif /* MULE */ |
| 428 | 4021 break; |
| 4022 | |
| 4023 | |
| 4024 case notsyntaxspec: | |
| 4025 k = *p++; | |
| 826 | 4026 matchnotsyntax: |
| 428 | 4027 #ifdef MULE |
| 4028 for (j = 0; j < 0x80; j++) | |
| 826 | 4029 if (SYNTAX |
| 428 | 4030 (XCHAR_TABLE |
| 826 | 4031 (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf)), j) != |
| 428 | 4032 (enum syntaxcode) k) |
| 4033 fastmap[j] = 1; | |
| 4034 for (j = 0x80; j < 0xA0; j++) | |
| 4035 { | |
| 826 | 4036 if (leading_byte_prefix_p ((unsigned char) j)) |
| 428 | 4037 /* too complicated to calculate this right */ |
| 4038 fastmap[j] = 1; | |
| 4039 else | |
| 4040 { | |
| 4041 int multi_p; | |
| 4042 Lisp_Object cset; | |
| 4043 | |
| 826 | 4044 cset = charset_by_leading_byte (j); |
| 428 | 4045 if (CHARSETP (cset)) |
| 4046 { | |
| 826 | 4047 if (charset_syntax (lispbuf, cset, &multi_p) |
| 428 | 4048 != Sword || multi_p) |
| 4049 fastmap[j] = 1; | |
| 4050 } | |
| 4051 } | |
| 4052 } | |
| 446 | 4053 #else /* not MULE */ |
| 428 | 4054 for (j = 0; j < (1 << BYTEWIDTH); j++) |
| 826 | 4055 if (SYNTAX |
| 428 | 4056 (XCHAR_TABLE |
| 826 | 4057 (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf)), j) != |
| 428 | 4058 (enum syntaxcode) k) |
| 4059 fastmap[j] = 1; | |
| 446 | 4060 #endif /* MULE */ |
| 428 | 4061 break; |
| 826 | 4062 #endif /* 0 */ |
| 428 | 4063 |
| 4064 #ifdef MULE | |
| 4065 /* 97/2/17 jhod category patch */ | |
| 4066 case categoryspec: | |
| 4067 case notcategoryspec: | |
| 4068 bufp->can_be_null = 1; | |
| 1333 | 4069 UNBIND_REGEX_MALLOC_CHECK (); |
| 428 | 4070 return 0; |
| 4071 /* end of category patch */ | |
| 4072 #endif /* MULE */ | |
| 4073 | |
| 4074 /* All cases after this match the empty string. These end with | |
| 4075 `continue'. */ | |
| 4076 case before_dot: | |
| 4077 case at_dot: | |
| 4078 case after_dot: | |
| 4079 continue; | |
| 826 | 4080 #endif /* emacs */ |
| 428 | 4081 |
| 4082 | |
| 4083 case no_op: | |
| 4084 case begline: | |
| 4085 case endline: | |
| 4086 case begbuf: | |
| 4087 case endbuf: | |
| 460 | 4088 #ifndef emacs |
| 428 | 4089 case wordbound: |
| 4090 case notwordbound: | |
| 4091 case wordbeg: | |
| 4092 case wordend: | |
| 460 | 4093 #endif |
| 428 | 4094 case push_dummy_failure: |
| 4095 continue; | |
| 4096 | |
| 4097 | |
| 4098 case jump_n: | |
| 4099 case pop_failure_jump: | |
| 4100 case maybe_pop_jump: | |
| 4101 case jump: | |
| 4102 case jump_past_alt: | |
| 4103 case dummy_failure_jump: | |
| 4104 EXTRACT_NUMBER_AND_INCR (j, p); | |
| 4105 p += j; | |
| 4106 if (j > 0) | |
| 4107 continue; | |
| 4108 | |
| 4109 /* Jump backward implies we just went through the body of a | |
| 4110 loop and matched nothing. Opcode jumped to should be | |
| 4111 `on_failure_jump' or `succeed_n'. Just treat it like an | |
| 4112 ordinary jump. For a * loop, it has pushed its failure | |
| 4113 point already; if so, discard that as redundant. */ | |
| 4114 if ((re_opcode_t) *p != on_failure_jump | |
| 4115 && (re_opcode_t) *p != succeed_n) | |
| 4116 continue; | |
| 4117 | |
| 4118 p++; | |
| 4119 EXTRACT_NUMBER_AND_INCR (j, p); | |
| 4120 p += j; | |
| 4121 | |
| 4122 /* If what's on the stack is where we are now, pop it. */ | |
| 4123 if (!FAIL_STACK_EMPTY () | |
| 4124 && fail_stack.stack[fail_stack.avail - 1].pointer == p) | |
| 4125 fail_stack.avail--; | |
| 4126 | |
| 4127 continue; | |
| 4128 | |
| 4129 | |
| 4130 case on_failure_jump: | |
| 4131 case on_failure_keep_string_jump: | |
| 4132 handle_on_failure_jump: | |
| 4133 EXTRACT_NUMBER_AND_INCR (j, p); | |
| 4134 | |
| 4135 /* For some patterns, e.g., `(a?)?', `p+j' here points to the | |
| 4136 end of the pattern. We don't want to push such a point, | |
| 4137 since when we restore it above, entering the switch will | |
| 4138 increment `p' past the end of the pattern. We don't need | |
| 4139 to push such a point since we obviously won't find any more | |
| 4140 fastmap entries beyond `pend'. Such a pattern can match | |
| 4141 the null string, though. */ | |
| 4142 if (p + j < pend) | |
| 4143 { | |
| 4144 if (!PUSH_PATTERN_OP (p + j, fail_stack)) | |
| 4145 { | |
| 4146 RESET_FAIL_STACK (); | |
| 1333 | 4147 UNBIND_REGEX_MALLOC_CHECK (); |
| 428 | 4148 return -2; |
| 4149 } | |
| 4150 } | |
| 4151 else | |
| 4152 bufp->can_be_null = 1; | |
| 4153 | |
| 4154 if (succeed_n_p) | |
| 4155 { | |
| 4156 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */ | |
| 4157 succeed_n_p = false; | |
| 4158 } | |
| 4159 | |
| 4160 continue; | |
| 4161 | |
| 4162 | |
| 4163 case succeed_n: | |
| 4164 /* Get to the number of times to succeed. */ | |
| 4165 p += 2; | |
| 4166 | |
| 4167 /* Increment p past the n for when k != 0. */ | |
| 4168 EXTRACT_NUMBER_AND_INCR (k, p); | |
| 4169 if (k == 0) | |
| 4170 { | |
| 4171 p -= 4; | |
| 4172 succeed_n_p = true; /* Spaghetti code alert. */ | |
| 4173 goto handle_on_failure_jump; | |
| 4174 } | |
| 4175 continue; | |
| 4176 | |
| 4177 | |
| 4178 case set_number_at: | |
| 4179 p += 4; | |
| 4180 continue; | |
| 4181 | |
| 4182 | |
| 4183 case start_memory: | |
| 4184 case stop_memory: | |
| 4185 p += 2; | |
| 4186 continue; | |
| 4187 | |
| 4188 | |
| 4189 default: | |
| 2500 | 4190 ABORT (); /* We have listed all the cases. */ |
| 428 | 4191 } /* switch *p++ */ |
| 4192 | |
| 4193 /* Getting here means we have found the possible starting | |
| 4194 characters for one path of the pattern -- and that the empty | |
| 4195 string does not match. We need not follow this path further. | |
| 4196 Instead, look at the next alternative (remembered on the | |
| 4197 stack), or quit if no more. The test at the top of the loop | |
| 4198 does these things. */ | |
| 4199 path_can_be_null = false; | |
| 4200 p = pend; | |
| 4201 } /* while p */ | |
| 4202 | |
| 4203 /* Set `can_be_null' for the last path (also the first path, if the | |
| 4204 pattern is empty). */ | |
| 4205 bufp->can_be_null |= path_can_be_null; | |
| 4206 | |
| 4207 done: | |
| 4208 RESET_FAIL_STACK (); | |
| 1333 | 4209 UNBIND_REGEX_MALLOC_CHECK (); |
| 428 | 4210 return 0; |
| 4211 } /* re_compile_fastmap */ | |
| 4212 | |
| 4213 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and | |
| 4214 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use | |
| 4215 this memory for recording register information. STARTS and ENDS | |
| 4216 must be allocated using the malloc library routine, and must each | |
| 4217 be at least NUM_REGS * sizeof (regoff_t) bytes long. | |
| 4218 | |
| 4219 If NUM_REGS == 0, then subsequent matches should allocate their own | |
| 4220 register data. | |
| 4221 | |
| 4222 Unless this function is called, the first search or match using | |
| 4223 PATTERN_BUFFER will allocate its own register data, without | |
| 4224 freeing the old data. */ | |
| 4225 | |
| 4226 void | |
| 4227 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs, | |
| 647 | 4228 int num_regs, regoff_t *starts, regoff_t *ends) |
| 428 | 4229 { |
| 4230 if (num_regs) | |
| 4231 { | |
| 4232 bufp->regs_allocated = REGS_REALLOCATE; | |
| 4233 regs->num_regs = num_regs; | |
| 4234 regs->start = starts; | |
| 4235 regs->end = ends; | |
| 4236 } | |
| 4237 else | |
| 4238 { | |
| 4239 bufp->regs_allocated = REGS_UNALLOCATED; | |
| 4240 regs->num_regs = 0; | |
| 4241 regs->start = regs->end = (regoff_t *) 0; | |
| 4242 } | |
| 4243 } | |
| 4244 | |
| 4245 /* Searching routines. */ | |
| 4246 | |
| 4247 /* Like re_search_2, below, but only one string is specified, and | |
| 4248 doesn't let you say where to stop matching. */ | |
| 4249 | |
| 4250 int | |
| 442 | 4251 re_search (struct re_pattern_buffer *bufp, const char *string, int size, |
| 826 | 4252 int startpos, int range, struct re_registers *regs |
| 4253 RE_LISP_CONTEXT_ARGS_DECL) | |
| 428 | 4254 { |
| 4255 return re_search_2 (bufp, NULL, 0, string, size, startpos, range, | |
| 826 | 4256 regs, size RE_LISP_CONTEXT_ARGS); |
| 428 | 4257 } |
| 4258 | |
| 4259 /* Using the compiled pattern in BUFP->buffer, first tries to match the | |
| 4260 virtual concatenation of STRING1 and STRING2, starting first at index | |
| 4261 STARTPOS, then at STARTPOS + 1, and so on. | |
| 4262 | |
| 4263 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively. | |
| 4264 | |
| 4265 RANGE is how far to scan while trying to match. RANGE = 0 means try | |
| 4266 only at STARTPOS; in general, the last start tried is STARTPOS + | |
| 4267 RANGE. | |
| 4268 | |
| 826 | 4269 All sizes and positions refer to bytes (not chars); under Mule, the code |
| 4270 knows about the format of the text and will only check at positions | |
| 4271 where a character starts. | |
| 4272 | |
| 428 | 4273 With MULE, RANGE is a byte position, not a char position. The last |
| 4274 start tried is the character starting <= STARTPOS + RANGE. | |
| 4275 | |
| 4276 In REGS, return the indices of the virtual concatenation of STRING1 | |
| 4277 and STRING2 that matched the entire BUFP->buffer and its contained | |
| 4278 subexpressions. | |
| 4279 | |
| 4280 Do not consider matching one past the index STOP in the virtual | |
| 4281 concatenation of STRING1 and STRING2. | |
| 4282 | |
| 4283 We return either the position in the strings at which the match was | |
| 4284 found, -1 if no match, or -2 if error (such as failure | |
| 4285 stack overflow). */ | |
| 4286 | |
/* Search the virtual concatenation of STR1 and STR2 for a match of the
   pattern compiled in BUFP, trying start positions from STARTPOS
   through STARTPOS + RANGE.  Returns the start position of the match,
   -1 if no match is found, or -2 on error.  */
| 4287 int | |
| 446 | 4288 re_search_2 (struct re_pattern_buffer *bufp, const char *str1, |
| 4289 int size1, const char *str2, int size2, int startpos, | |
| 826 | 4290 int range, struct re_registers *regs, int stop |
| 4291 RE_LISP_CONTEXT_ARGS_DECL) | |
| 428 | 4292 { |
| 4293 int val; | |
| 446 | 4294 re_char *string1 = (re_char *) str1; |
| 4295 re_char *string2 = (re_char *) str2; | |
| 428 | 4296 REGISTER char *fastmap = bufp->fastmap; |
| 446 | 4297 REGISTER RE_TRANSLATE_TYPE translate = bufp->translate; |
| 428 | 4298 int total_size = size1 + size2; |
| 4299 int endpos = startpos + range; | |
| 4300 #ifdef REGEX_BEGLINE_CHECK | |
| 4301 int anchored_at_begline = 0; | |
| 4302 #endif | |
| 446 | 4303 re_char *d; |
| 826 | 4304 #ifdef emacs |
| 4305 Internal_Format fmt = buffer_or_other_internal_format (lispobj); | |
| 1346 | 4306 #ifdef REL_ALLOC |
| 4307 Ibyte *orig_buftext = | |
| 4308 BUFFERP (lispobj) ? | |
| 4309 BYTE_BUF_BYTE_ADDRESS (XBUFFER (lispobj), | |
| 4310 BYTE_BUF_BEGV (XBUFFER (lispobj))) : | |
| 4311 0; | |
| 4312 #endif | |
| 1333 | 4313 #ifdef ERROR_CHECK_MALLOC |
| 4314 int depth; | |
| 4315 #endif | |
| 826 | 4316 #endif /* emacs */ |
| 4317 #if 1 | |
| 4318 int forward_search_p; | |
| 4319 #endif | |
| 428 | 4320 |
| 4321 /* Check for out-of-range STARTPOS. */ | |
| 4322 if (startpos < 0 || startpos > total_size) | |
| 4323 return -1; | |
| 4324 | |
| 4325 /* Fix up RANGE if it might eventually take us outside | |
| 4326 the virtual concatenation of STRING1 and STRING2. */ | |
| 4327 if (endpos < 0) | |
| 4328 range = 0 - startpos; | |
| 4329 else if (endpos > total_size) | |
| 4330 range = total_size - startpos; | |
| 4331 | |
/* Remember the direction of the search as requested; the assert()s
   further down check that a forward search never drives RANGE
   negative.  */
| 826 | 4332 #if 1 |
| 4333 forward_search_p = range > 0; | |
| 4334 #endif | |
| 4335 | |
| 428 | 4336 /* If the search isn't to be a backwards one, don't waste time in a |
| 4337 search for a pattern that must be anchored. */ | |
| 4338 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0) | |
| 4339 { | |
| 4340 if (startpos > 0) | |
| 4341 return -1; | |
| 4342 else | |
| 4343 { | |
/* STARTPOS is 0 here.  RANGE is replaced by the length that
   itext_ichar_len_fmt reports for the character at D.  */
| 442 | 4344 d = ((const unsigned char *) |
| 428 | 4345 (startpos >= size1 ? string2 - size1 : string1) + startpos); |
| 867 | 4346 range = itext_ichar_len_fmt (d, fmt); |
| 428 | 4347 } |
| 4348 } | |
| 4349 | |
| 460 | 4350 #ifdef emacs |
| 4351 /* In a forward search for something that starts with \=, | |
| 4352 don't keep searching past point. */ | |
| 4353 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0) | |
| 4354 { | |
/* A pattern starting with at_dot is only searched for in a buffer;
   any other LISPOBJ fails immediately.  */
| 826 | 4355 if (!BUFFERP (lispobj)) |
| 4356 return -1; | |

4527
8418d1ad4944
Fix at_dot regex under Mule. <87hc6rv53v.fsf@uwakimon.sk.tsukuba.ac.jp>
Stephen J. Turnbull <stephen@xemacs.org>
parents:
3300
diff
changeset
|
4357 range = (BYTE_BUF_PT (XBUFFER (lispobj)) |
|
8418d1ad4944
Fix at_dot regex under Mule. <87hc6rv53v.fsf@uwakimon.sk.tsukuba.ac.jp>
Stephen J. Turnbull <stephen@xemacs.org>
parents:
3300
diff
changeset
|
4358 - BYTE_BUF_BEGV (XBUFFER (lispobj)) - startpos); |
| 460 | 4359 if (range < 0) |
| 4360 return -1; | |
| 4361 } | |
| 4362 #endif /* emacs */ | |
| 4363 | |
| 1333 | 4364 #ifdef ERROR_CHECK_MALLOC |
| 4365 /* Do this after the above return()s. */ | |
| 4366 depth = bind_regex_malloc_disallowed (1); | |
| 4367 #endif | |
| 4368 | |
| 428 | 4369 /* Update the fastmap now if not correct already. */ |
| 1333 | 4370 BEGIN_REGEX_MALLOC_OK (); |
| 428 | 4371 if (fastmap && !bufp->fastmap_accurate) |
| 826 | 4372 if (re_compile_fastmap (bufp RE_LISP_SHORT_CONTEXT_ARGS) == -2) |
| 1333 | 4373 { |
| 4374 END_REGEX_MALLOC_OK (); | |
| 4375 UNBIND_REGEX_MALLOC_CHECK (); | |
| 4376 return -2; | |
| 4377 } | |
| 4378 | |
| 4379 END_REGEX_MALLOC_OK (); | |
| 4380 RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
| 428 | 4381 |
| 4382 #ifdef REGEX_BEGLINE_CHECK | |
| 4383 { | |
/* Step over any leading start_memory / stop_memory opcodes (2 bytes
   each here) so that a pattern whose first other opcode is begline is
   still recognized as anchored at the beginning of a line.  */
| 647 | 4384 long i = 0; |
| 428 | 4385 |
| 4386 while (i < bufp->used) | |
| 4387 { | |
| 4388 if (bufp->buffer[i] == start_memory || | |
| 4389 bufp->buffer[i] == stop_memory) | |
| 4390 i += 2; | |
| 4391 else | |
| 4392 break; | |
| 4393 } | |
| 4394 anchored_at_begline = i < bufp->used && bufp->buffer[i] == begline; | |
| 4395 } | |
| 4396 #endif | |
| 4397 | |
| 460 | 4398 #ifdef emacs |
| 1333 | 4399 BEGIN_REGEX_MALLOC_OK (); |
| 826 | 4400 scache = setup_syntax_cache (scache, lispobj, lispbuf, |
| 4401 offset_to_charxpos (lispobj, startpos), | |
| 4402 1); | |
| 1333 | 4403 END_REGEX_MALLOC_OK (); |
| 4404 RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
| 460 | 4405 #endif |
| 4406 | |
| 428 | 4407 /* Loop through the string, looking for a place to start matching. */ |
| 4408 for (;;) | |
| 4409 { | |
| 4410 #ifdef REGEX_BEGLINE_CHECK | |
| 826 | 4411 /* If the regex is anchored at the beginning of a line (i.e. with a |
| 4412 ^), then we can speed things up by skipping to the next | |
| 4413 beginning-of-line. However, to determine "beginning of line" we | |
| 4414 need to look at the previous char, so can't do this check if at | |
| 4415 beginning of either string. (Well, we could if at the beginning of | |
| 4416 the second string, but it would require additional code, and this | |
| 4417 is just an optimization.) */ | |
| 4418 if (anchored_at_begline && startpos > 0 && startpos != size1) | |
| 428 | 4419 { |
| 826 | 4420 if (range > 0) |
| 4421 { | |
| 4422 /* whose stupid idea was it anyway to make this | |
| 4423 function take two strings to match?? */ | |
| 4424 int lim = 0; | |
| 4425 re_char *orig_d; | |
| 4426 re_char *stop_d; | |
| 4427 | |
| 4428 /* Compute limit as below in fastmap code, so we are guaranteed | |
| 4429 to remain within a single string. */ | |
| 4430 if (startpos < size1 && startpos + range >= size1) | |
| 4431 lim = range - (size1 - startpos); | |
| 4432 | |
| 4433 d = ((const unsigned char *) | |
| 4434 (startpos >= size1 ? string2 - size1 : string1) + startpos); | |
| 4435 orig_d = d; | |
| 4436 stop_d = d + range - lim; | |
| 4437 | |
| 4438 /* We want to find the next location (including the current | |
| 4439 one) where the previous char is a newline, so back up one | |
| 4440 and search forward for a newline. */ | |
| 867 | 4441 DEC_IBYTEPTR_FMT (d, fmt); /* Ok, since startpos != size1. */ |
| 826 | 4442 |
| 4443 /* Written out as an if-else to avoid testing `translate' | |
| 4444 inside the loop. */ | |
| 4445 if (TRANSLATE_P (translate)) | |
| 4446 while (d < stop_d && | |
| 867 | 4447 RE_TRANSLATE_1 (itext_ichar_fmt (d, fmt, lispobj)) |
| 826 | 4448 != '\n') |
| 867 | 4449 INC_IBYTEPTR_FMT (d, fmt); |
| 826 | 4450 else |
| 4451 while (d < stop_d && | |
| 867 | 4452 itext_ichar_ascii_fmt (d, fmt, lispobj) != '\n') |
| 4453 INC_IBYTEPTR_FMT (d, fmt); | |
| 826 | 4454 |
| 4455 /* If we were stopped by a newline, skip forward over it. | |
| 4456 Otherwise we will get in an infloop when our start position | |
| 4457 was at begline. */ | |
| 4458 if (d < stop_d) | |
| 867 | 4459 INC_IBYTEPTR_FMT (d, fmt); |
/* Charge the distance moved (which may be negative, since D was
   backed up one character first) against RANGE and STARTPOS.  */
| 826 | 4460 range -= d - orig_d; |
| 4461 startpos += d - orig_d; | |
| 4462 #if 1 | |
| 4463 assert (!forward_search_p || range >= 0); | |
| 4464 #endif | |
| 4465 } | |
| 4466 else if (range < 0) | |
| 4467 { | |
| 4468 /* We're lazy, like in the fastmap code below */ | |
| 867 | 4469 Ichar c; |
| 826 | 4470 |
| 4471 d = ((const unsigned char *) | |
| 4472 (startpos >= size1 ? string2 - size1 : string1) + startpos); | |
| 867 | 4473 DEC_IBYTEPTR_FMT (d, fmt); |
| 4474 c = itext_ichar_fmt (d, fmt, lispobj); | |
| 826 | 4475 c = RE_TRANSLATE (c); |
| 4476 if (c != '\n') | |
| 4477 goto advance; | |
| 4478 } | |
| 428 | 4479 } |
| 4480 #endif /* REGEX_BEGLINE_CHECK */ | |
| 4481 | |
| 4482 /* If a fastmap is supplied, skip quickly over characters that | |
| 4483 cannot be the start of a match. If the pattern can match the | |
| 4484 null string, however, we don't need to skip characters; we want | |
| 4485 the first null string. */ | |
| 4486 if (fastmap && startpos < total_size && !bufp->can_be_null) | |
| 4487 { | |
| 826 | 4488 /* For the moment, fastmap always works as if buffer |
| 4489 is in default format, so convert chars in the search strings | |
| 4490 into default format as we go along, if necessary. | |
| 4491 | |
| 4492 &&#### fastmap needs rethinking for 8-bit-fixed so | |
| 4493 it's faster. We need it to reflect the raw | |
| 4494 8-bit-fixed values. That isn't so hard if we assume | |
| 4495 that the top 96 bytes represent a single 1-byte | |
| 4496 charset. For 16-bit/32-bit stuff it's probably not | |
| 4497 worth it to make the fastmap represent the raw, due to | |
| 4498 its nature -- we'd have to use the LSB for the | |
| 4499 fastmap, and that causes lots of problems with Mule | |
| 4500 chars, where it essentially wipes out the usefulness | |
| 4501 of the fastmap entirely. */ | |
| 428 | 4502 if (range > 0) /* Searching forwards. */ |
| 4503 { | |
| 4504 int lim = 0; | |
| 4505 int irange = range; | |
| 4506 | |
| 4507 if (startpos < size1 && startpos + range >= size1) | |
| 4508 lim = range - (size1 - startpos); | |
| 4509 | |
| 442 | 4510 d = ((const unsigned char *) |
| 428 | 4511 (startpos >= size1 ? string2 - size1 : string1) + startpos); |
| 4512 | |
| 4513 /* Written out as an if-else to avoid testing `translate' | |
| 4514 inside the loop. */ | |
| 446 | 4515 if (TRANSLATE_P (translate)) |
| 826 | 4516 { |
| 4517 while (range > lim) | |
| 4518 { | |
| 4519 re_char *old_d = d; | |
| 428 | 4520 #ifdef MULE |
/* The fastmap is indexed by the first byte that set_itext_ichar
   stores into TEMPCH for the (translated) character.  */
| 867 | 4521 Ibyte tempch[MAX_ICHAR_LEN]; |
| 4522 Ichar buf_ch = | |
| 4523 RE_TRANSLATE_1 (itext_ichar_fmt (d, fmt, lispobj)); | |
| 4524 set_itext_ichar (tempch, buf_ch); | |
| 826 | 4525 if (fastmap[*tempch]) |
| 4526 break; | |
| 446 | 4527 #else |
| 826 | 4528 if (fastmap[(unsigned char) RE_TRANSLATE_1 (*d)]) |
| 4529 break; | |
| 446 | 4530 #endif /* MULE */ |
| 867 | 4531 INC_IBYTEPTR_FMT (d, fmt); |
| 826 | 4532 range -= (d - old_d); |
| 4533 #if 1 | |
| 1333 | 4534 assert (!forward_search_p || range >= 0); |
| 826 | 4535 #endif |
| 4536 } | |
| 4537 } | |
| 4538 #ifdef MULE | |
| 4539 else if (fmt != FORMAT_DEFAULT) | |
| 4540 { | |
| 4541 while (range > lim) | |
| 4542 { | |
| 4543 re_char *old_d = d; | |
| 867 | 4544 Ibyte tempch[MAX_ICHAR_LEN]; |
| 4545 Ichar buf_ch = itext_ichar_fmt (d, fmt, lispobj); | |
| 4546 set_itext_ichar (tempch, buf_ch); | |
| 826 | 4547 if (fastmap[*tempch]) |
| 4548 break; | |
| 867 | 4549 INC_IBYTEPTR_FMT (d, fmt); |
| 826 | 4550 range -= (d - old_d); |
| 4551 #if 1 | |
| 1333 | 4552 assert (!forward_search_p || range >= 0); |
| 826 | 4553 #endif |
| 4554 } | |
| 4555 } | |
| 4556 #endif /* MULE */ | |
| 428 | 4557 else |
| 826 | 4558 { |
| 4559 while (range > lim && !fastmap[*d]) | |
| 4560 { | |
| 4561 re_char *old_d = d; | |
| 867 | 4562 INC_IBYTEPTR (d); |
| 826 | 4563 range -= (d - old_d); |
| 4564 #if 1 | |
| 4565 assert (!forward_search_p || range >= 0); | |
| 4566 #endif | |
| 4567 } | |
| 4568 } | |
| 428 | 4569 |
/* Advance STARTPOS by however much of RANGE the skip loops above
   consumed.  */
| 4570 startpos += irange - range; | |
| 4571 } | |
| 4572 else /* Searching backwards. */ | |
| 4573 { | |
| 826 | 4574 /* #### It's not clear why we don't just write a loop, like |
| 4575 for the moving-forward case. Perhaps the writer got lazy, | |
| 4576 since backward searches aren't so common. */ | |
| 4577 d = ((const unsigned char *) | |
| 4578 (startpos >= size1 ? string2 - size1 : string1) + startpos); | |
| 428 | 4579 #ifdef MULE |
| 826 | 4580 { |
| 867 | 4581 Ibyte tempch[MAX_ICHAR_LEN]; |
| 4582 Ichar buf_ch = | |
| 4583 RE_TRANSLATE (itext_ichar_fmt (d, fmt, lispobj)); | |
| 4584 set_itext_ichar (tempch, buf_ch); | |
| 826 | 4585 if (!fastmap[*tempch]) |
| 4586 goto advance; | |
| 4587 } | |
| 428 | 4588 #else |
| 826 | 4589 if (!fastmap[(unsigned char) RE_TRANSLATE (*d)]) |
| 446 | 4590 goto advance; |
| 826 | 4591 #endif /* MULE */ |
| 428 | 4592 } |
| 4593 } | |
| 4594 | |
| 4595 /* If can't match the null string, and that's all we have left, fail. */ | |
| 4596 if (range >= 0 && startpos == total_size && fastmap | |
| 4597 && !bufp->can_be_null) | |
| 1333 | 4598 { |
| 4599 UNBIND_REGEX_MALLOC_CHECK (); | |
| 4600 return -1; | |
| 4601 } | |
| 428 | 4602 |
| 4603 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */ | |
| 4604 if (!no_quit_in_re_search) | |
| 1333 | 4605 { |
| 4606 BEGIN_REGEX_MALLOC_OK (); | |
| 4607 QUIT; | |
| 4608 END_REGEX_MALLOC_OK (); | |
| 4609 RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
| 4610 } | |
| 4611 | |
| 428 | 4612 #endif |
/* Try an actual match at the current STARTPOS.  */
| 1333 | 4613 BEGIN_REGEX_MALLOC_OK (); |
| 428 | 4614 val = re_match_2_internal (bufp, string1, size1, string2, size2, |
| 826 | 4615 startpos, regs, stop |
| 4616 RE_LISP_CONTEXT_ARGS); | |
| 428 | 4617 #ifndef REGEX_MALLOC |
| 1333 | 4618 ALLOCA_GARBAGE_COLLECT (); |
| 428 | 4619 #endif |
| 1333 | 4620 END_REGEX_MALLOC_OK (); |
| 4621 RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
| 428 | 4622 |
/* A non-negative VAL means a match was found at STARTPOS, which is
   what we return; -2 is passed straight back to the caller.  */
| 4623 if (val >= 0) | |
| 1333 | 4624 { |
| 4625 UNBIND_REGEX_MALLOC_CHECK (); | |
| 4626 return startpos; | |
| 4627 } | |
| 428 | 4628 |
| 4629 if (val == -2) | |
| 1333 | 4630 { |
| 4631 UNBIND_REGEX_MALLOC_CHECK (); | |
| 4632 return -2; | |
| 4633 } | |
| 4634 | |
/* NOTE(review): this repeats the relocation already done right after
   re_match_2_internal returned above; it looks redundant -- confirm
   before removing.  */
| 4635 RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
| 428 | 4636 advance: |
/* Stop once RANGE is used up.  Otherwise move STARTPOS one character
   in the direction of the search and adjust RANGE by the same number
   of bytes.  */
| 4637 if (!range) | |
| 4638 break; | |
| 4639 else if (range > 0) | |
| 4640 { | |
| 826 | 4641 Bytecount d_size; |
| 442 | 4642 d = ((const unsigned char *) |
| 428 | 4643 (startpos >= size1 ? string2 - size1 : string1) + startpos); |
| 867 | 4644 d_size = itext_ichar_len_fmt (d, fmt); |
| 428 | 4645 range -= d_size; |
| 826 | 4646 #if 1 |
| 4647 assert (!forward_search_p || range >= 0); | |
| 4648 #endif | |
| 428 | 4649 startpos += d_size; |
| 4650 } | |
| 4651 else | |
| 4652 { | |
| 826 | 4653 Bytecount d_size; |
| 428 | 4654 /* Note startpos > size1 not >=. If we are on the |
| 4655 string1/string2 boundary, we want to backup into string1. */ | |
| 442 | 4656 d = ((const unsigned char *) |
| 428 | 4657 (startpos > size1 ? string2 - size1 : string1) + startpos); |
| 867 | 4658 DEC_IBYTEPTR_FMT (d, fmt); |
| 4659 d_size = itext_ichar_len_fmt (d, fmt); | |
| 428 | 4660 range += d_size; |
| 826 | 4661 #if 1 |
| 4662 assert (!forward_search_p || range >= 0); | |
| 4663 #endif | |
| 428 | 4664 startpos -= d_size; |
| 4665 } | |
| 4666 } | |
/* The loop ended without a match at any start position.  */
| 1333 | 4667 UNBIND_REGEX_MALLOC_CHECK (); |
| 428 | 4668 return -1; |
| 4669 } /* re_search_2 */ | |
| 826 | 4670 |
| 428 | 4671 |
| 4672 /* Declarations and macros for re_match_2. */ | |
| 4673 | |
/* The macros below refer by name to variables of the function that
   uses them (string1, string2, size1, size2, end1, end2, d, dend,
   end_match_1, end_match_2, lispbuf) and to the label `fail'.  */
| 4674 /* This converts PTR, a pointer into one of the search strings `string1' | |
| 4675 and `string2' into an offset from the beginning of that string. */ | |
| 4676 #define POINTER_TO_OFFSET(ptr) \ | |
| 4677 (FIRST_STRING_P (ptr) \ | |
| 4678 ? ((regoff_t) ((ptr) - string1)) \ | |
| 4679 : ((regoff_t) ((ptr) - string2 + size1))) | |
| 4680 | |
| 4681 /* Macros for dealing with the split strings in re_match_2. */ | |
| 4682 | |
/* True while DEND is still END_MATCH_1, i.e. before REGEX_PREFETCH has
   switched DEND over to END_MATCH_2.  */
| 4683 #define MATCHING_IN_FIRST_STRING (dend == end_match_1) | |
| 4684 | |
| 4685 /* Call before fetching a character with *d. This switches over to | |
| 4686 string2 if necessary. */ | |
| 826 | 4687 #define REGEX_PREFETCH() \ |
| 428 | 4688 while (d == dend) \ |
| 4689 { \ | |
| 4690 /* End of string2 => fail. */ \ | |
| 4691 if (dend == end_match_2) \ | |
| 4692 goto fail; \ | |
| 4693 /* End of string1 => advance to string2. */ \ | |
| 4694 d = string2; \ | |
| 4695 dend = end_match_2; \ | |
| 4696 } | |
| 4697 | |
| 4698 | |
/* NOTE(review): as written, AT_STRINGS_BEG is true for every D whenever
   SIZE2 is zero (the `|| !size2' term) -- confirm that this is
   intended.  */
| 4699 /* Test if at very beginning or at very end of the virtual concatenation | |
| 4700 of `string1' and `string2'. If only one string, it's `string2'. */ | |
| 4701 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2) | |
| 4702 #define AT_STRINGS_END(d) ((d) == end2) | |
| 4703 | |
| 4704 /* XEmacs change: | |
| 4705 If the given position straddles the string gap, return the equivalent | |
| 4706 position that is before or after the gap, respectively; otherwise, | |
| 4707 return the same position. */ | |
| 4708 #define POS_BEFORE_GAP_UNSAFE(d) ((d) == string2 ? end1 : (d)) | |
| 4709 #define POS_AFTER_GAP_UNSAFE(d) ((d) == end1 ? string2 : (d)) | |
| 4710 | |
| 4711 /* Test if CH is a word-constituent character. (XEmacs change) */ | |
| 826 | 4712 #define WORDCHAR_P(ch) \ |
| 4713 (SYNTAX (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf), ch) == Sword) | |
| 428 | 4714 |
/* Free everything we malloc.  */
#ifdef MATCH_MAY_ALLOCATE
/* Release VAR through REGEX_FREE if it is non-null, then reset it to
   NULL.  The body is wrapped in do/while (0) so that the two statements
   act as one: used as the body of an unbraced `if', the assignment must
   not run unconditionally, and an `else' must be able to follow.  */
#define FREE_VAR(var,type) \
  do { \
    if (var) \
      REGEX_FREE (var, type); \
    var = NULL; \
  } while (0)
/* Undo the regex-malloc check binding, then release the failure stack
   and every register vector used by the matcher.  */
#define FREE_VARIABLES() \
  do { \
    UNBIND_REGEX_MALLOC_CHECK (); \
    REGEX_FREE_STACK (fail_stack.stack); \
    FREE_VAR (regstart, re_char **); \
    FREE_VAR (regend, re_char **); \
    FREE_VAR (old_regstart, re_char **); \
    FREE_VAR (old_regend, re_char **); \
    FREE_VAR (best_regstart, re_char **); \
    FREE_VAR (best_regend, re_char **); \
    FREE_VAR (reg_info, register_info_type *); \
    FREE_VAR (reg_dummy, re_char **); \
    FREE_VAR (reg_info_dummy, register_info_type *); \
  } while (0)
#else /* not MATCH_MAY_ALLOCATE */
/* Nothing was allocated; only the malloc-check binding is undone.  */
#define FREE_VARIABLES() \
  do { \
    UNBIND_REGEX_MALLOC_CHECK (); \
  } while (0)
#endif /* MATCH_MAY_ALLOCATE */
| 428 | 4738 |
| 4739 /* These values must meet several constraints. They must not be valid | |
| 4740 register values; since we have a limit of 255 registers (because | |
| 4741 we use only one byte in the pattern for the register number), we can | |
| 4742 use numbers larger than 255. They must differ by 1, because of | |
| 4743 NUM_FAILURE_ITEMS above. And the value for the lowest register must | |
| 4744 be larger than the value for the highest register, so we do not try | |
| 4745 to actually save any registers when none are active. */ | |
/* 1 << BYTEWIDTH is one more than the largest value that fits in
   BYTEWIDTH bits; the "lowest" sentinel is that value plus one.  */
| 4746 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH) | |
| 4747 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1) | |
| 4748 | |
| 4749 /* Matching routines. */ | |
| 4750 | |
| 826 | 4751 #ifndef emacs /* XEmacs never uses this. */ |
| 428 | 4752 /* re_match is like re_match_2 except it takes only a single string. */ |
| 4753 | |
| 4754 int | |
| 442 | 4755 re_match (struct re_pattern_buffer *bufp, const char *string, int size, |
| 826 | 4756 int pos, struct re_registers *regs |
| 4757 RE_LISP_CONTEXT_ARGS_DECL) | |
| 428 | 4758 { |
| 446 | 4759 int result = re_match_2_internal (bufp, NULL, 0, (re_char *) string, size, |
| 826 | 4760 pos, regs, size |
| 4761 RE_LISP_CONTEXT_ARGS); | |
| 1333 | 4762 ALLOCA_GARBAGE_COLLECT (); |
| 428 | 4763 return result; |
| 4764 } | |
| 4765 #endif /* not emacs */ | |
| 4766 | |
| 4767 /* re_match_2 matches the compiled pattern in BUFP against the | |
| 4768 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 and | |
| 4769 SIZE2, respectively). We start matching at POS, and stop matching | |
| 4770 at STOP. | |
| 4771 | |
| 4772 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we | |
| 4773 store offsets for the substring each group matched in REGS. See the | |
| 4774 documentation for exactly how many groups we fill. | |
| 4775 | |
| 4776 We return -1 if no match, -2 if an internal error (such as the | |
| 4777 failure stack overflowing). Otherwise, we return the length of the | |
| 4778 matched substring. */ | |
| 4779 | |
| 4780 int | |
| 442 | 4781 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, |
| 4782 int size1, const char *string2, int size2, int pos, | |
| 826 | 4783 struct re_registers *regs, int stop |
| 4784 RE_LISP_CONTEXT_ARGS_DECL) | |
| 428 | 4785 { |
| 460 | 4786 int result; |
| 4787 | |
| 4788 #ifdef emacs | |
| 826 | 4789 scache = setup_syntax_cache (scache, lispobj, lispbuf, |
| 4790 offset_to_charxpos (lispobj, pos), | |
| 4791 1); | |
| 460 | 4792 #endif |
| 4793 | |
| 4794 result = re_match_2_internal (bufp, (re_char *) string1, size1, | |
| 4795 (re_char *) string2, size2, | |
| 826 | 4796 pos, regs, stop |
| 4797 RE_LISP_CONTEXT_ARGS); | |
| 460 | 4798 |
| 1333 | 4799 ALLOCA_GARBAGE_COLLECT (); |
| 428 | 4800 return result; |
| 4801 } | |
| 4802 | |
| 4803 /* This is a separate function so that we can force an alloca cleanup | |
| 4804 afterwards. */ | |
| 4805 static int | |
| 446 | 4806 re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1, |
| 4807 int size1, re_char *string2, int size2, int pos, | |
| 826 | 4808 struct re_registers *regs, int stop |
| 2333 | 4809 RE_LISP_CONTEXT_ARGS_MULE_DECL) |
| 428 | 4810 { |
| 4811 /* General temporaries. */ | |
| 4812 int mcnt; | |
| 4813 unsigned char *p1; | |
| 4814 int should_succeed; /* XEmacs change */ | |
| 4815 | |
| 4816 /* Just past the end of the corresponding string. */ | |
| 446 | 4817 re_char *end1, *end2; |
| 428 | 4818 |
| 4819 /* Pointers into string1 and string2, just past the last characters in | |
| 4820 each to consider matching. */ | |
| 446 | 4821 re_char *end_match_1, *end_match_2; |
| 428 | 4822 |
| 4823 /* Where we are in the data, and the end of the current string. */ | |
| 446 | 4824 re_char *d, *dend; |
| 428 | 4825 |
| 4826 /* Where we are in the pattern, and the end of the pattern. */ | |
| 4827 unsigned char *p = bufp->buffer; | |
| 4828 REGISTER unsigned char *pend = p + bufp->used; | |
| 4829 | |
| 4830 /* Mark the opcode just after a start_memory, so we can test for an | |
| 4831 empty subpattern when we get to the stop_memory. */ | |
| 446 | 4832 re_char *just_past_start_mem = 0; |
| 428 | 4833 |
| 4834 /* We use this to map every character in the string. */ | |
| 446 | 4835 RE_TRANSLATE_TYPE translate = bufp->translate; |
| 428 | 4836 |
| 4837 /* Failure point stack. Each place that can handle a failure further | |
| 4838 down the line pushes a failure point on this stack. It consists of | |
| 4839 restart, regend, and reg_info for all registers corresponding to | |
| 4840 the subexpressions we're currently inside, plus the number of such | |
| 4841 registers, and, finally, two char *'s. The first char * is where | |
| 4842 to resume scanning the pattern; the second one is where to resume | |
| 4843 scanning the strings. If the latter is zero, the failure point is | |
| 4844 a ``dummy''; if a failure happens and the failure point is a dummy, | |
| 4845 it gets discarded and the next one is tried. */ | |
| 4846 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ | |
| 4847 fail_stack_type fail_stack; | |
| 4848 #endif | |
| 4849 #ifdef DEBUG | |
| 647 | 4850 static int failure_id; |
| 4851 int nfailure_points_pushed = 0, nfailure_points_popped = 0; | |
| 428 | 4852 #endif |
| 4853 | |
| 771 | 4854 #ifdef REGEX_REL_ALLOC |
| 428 | 4855 /* This holds the pointer to the failure stack, when |
| 4856 it is allocated relocatably. */ | |
| 4857 fail_stack_elt_t *failure_stack_ptr; | |
| 4858 #endif | |
| 4859 | |
| 4860 /* We fill all the registers internally, independent of what we | |
| 4861 return, for use in backreferences. The number here includes | |
| 4862 an element for register zero. */ | |
| 647 | 4863 int num_regs = bufp->re_ngroups + 1; |
| 428 | 4864 |
| 4865 /* The currently active registers. */ | |
| 647 | 4866 int lowest_active_reg = NO_LOWEST_ACTIVE_REG; |
| 4867 int highest_active_reg = NO_HIGHEST_ACTIVE_REG; | |
| 428 | 4868 |
| 4869 /* Information on the contents of registers. These are pointers into | |
| 4870 the input strings; they record just what was matched (on this | |
| 4871 attempt) by a subexpression part of the pattern, that is, the | |
| 4872 regnum-th regstart pointer points to where in the pattern we began | |
| 4873 matching and the regnum-th regend points to right after where we | |
| 4874 stopped matching the regnum-th subexpression. (The zeroth register | |
| 4875 keeps track of what the whole pattern matches.) */ | |
| 4876 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ | |
| 446 | 4877 re_char **regstart, **regend; |
| 428 | 4878 #endif |
| 4879 | |
| 4880 /* If a group that's operated upon by a repetition operator fails to | |
| 4881 match anything, then the register for its start will need to be | |
| 4882 restored because it will have been set to wherever in the string we | |
| 4883 are when we last see its open-group operator. Similarly for a | |
| 4884 register's end. */ | |
| 4885 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ | |
| 446 | 4886 re_char **old_regstart, **old_regend; |
| 428 | 4887 #endif |
| 4888 | |
| 4889 /* The is_active field of reg_info helps us keep track of which (possibly | |
| 4890 nested) subexpressions we are currently in. The matched_something | |
| 4891 field of reg_info[reg_num] helps us tell whether or not we have | |
| 4892 matched any of the pattern so far this time through the reg_num-th | |
| 4893 subexpression. These two fields get reset each time through any | |
| 4894 loop their register is in. */ | |
| 4895 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ | |
| 4896 register_info_type *reg_info; | |
| 4897 #endif | |
| 4898 | |
| 4899 /* The following record the register info as found in the above | |
| 4900 variables when we find a match better than any we've seen before. | |
| 4901 This happens as we backtrack through the failure points, which in | |
| 4902 turn happens only if we have not yet matched the entire string. */ | |
| 647 | 4903 int best_regs_set = false; |
| 428 | 4904 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ |
| 446 | 4905 re_char **best_regstart, **best_regend; |
| 428 | 4906 #endif |
| 4907 | |
| 4908 /* Logically, this is `best_regend[0]'. But we don't want to have to | |
| 4909 allocate space for that if we're not allocating space for anything | |
| 4910 else (see below). Also, we never need info about register 0 for | |
| 4911 any of the other register vectors, and it seems rather a kludge to | |
| 4912 treat `best_regend' differently than the rest. So we keep track of | |
| 4913 the end of the best match so far in a separate variable. We | |
| 4914 initialize this to NULL so that when we backtrack the first time | |
| 4915 and need to test it, it's not garbage. */ | |
| 446 | 4916 re_char *match_end = NULL; |
| 428 | 4917 |
| 4918 /* This helps SET_REGS_MATCHED avoid doing redundant work. */ | |
| 4919 int set_regs_matched_done = 0; | |
| 4920 | |
| 4921 /* Used when we pop values we don't care about. */ | |
| 4922 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ | |
| 446 | 4923 re_char **reg_dummy; |
| 428 | 4924 register_info_type *reg_info_dummy; |
| 4925 #endif | |
| 4926 | |
| 4927 #ifdef DEBUG | |
| 4928 /* Counts the total number of registers pushed. */ | |
| 647 | 4929 int num_regs_pushed = 0; |
| 428 | 4930 #endif |
| 4931 | |
| 4932 /* 1 if this match ends in the same string (string1 or string2) | |
| 4933 as the best previous match. */ | |
| 460 | 4934 re_bool same_str_p; |
| 428 | 4935 |
| 4936 /* 1 if this match is the best seen so far. */ | |
| 460 | 4937 re_bool best_match_p; |
| 428 | 4938 |
| 826 | 4939 #ifdef emacs |
| 4940 Internal_Format fmt = buffer_or_other_internal_format (lispobj); | |
| 1346 | 4941 #ifdef REL_ALLOC |
| 4942 Ibyte *orig_buftext = | |
| 4943 BUFFERP (lispobj) ? | |
| 4944 BYTE_BUF_BYTE_ADDRESS (XBUFFER (lispobj), | |
| 4945 BYTE_BUF_BEGV (XBUFFER (lispobj))) : | |
| 4946 0; | |
| 4947 #endif | |
| 4948 | |
| 1333 | 4949 #ifdef ERROR_CHECK_MALLOC |
| 4950 int depth = bind_regex_malloc_disallowed (1); | |
| 4951 #endif | |
| 826 | 4952 #endif /* emacs */ |
| 771 | 4953 |
| 5041 | 4954 DEBUG_MATCH_PRINT1 ("\n\nEntering re_match_2.\n"); |
| 428 | 4955 |
| 1333 | 4956 BEGIN_REGEX_MALLOC_OK (); |
| 428 | 4957 INIT_FAIL_STACK (); |
| 1333 | 4958 END_REGEX_MALLOC_OK (); |
| 428 | 4959 |
| 4960 #ifdef MATCH_MAY_ALLOCATE | |
| 4961 /* Do not bother to initialize all the register variables if there are | |
| 4962 no groups in the pattern, as it takes a fair amount of time. If | |
| 4963 there are groups, we include space for register 0 (the whole | |
| 4964 pattern), even though we never use it, since it simplifies the | |
| 4965 array indexing. We should fix this. */ | |
| 502 | 4966 if (bufp->re_ngroups) |
| 428 | 4967 { |
| 1333 | 4968 BEGIN_REGEX_MALLOC_OK (); |
| 446 | 4969 regstart = REGEX_TALLOC (num_regs, re_char *); |
| 4970 regend = REGEX_TALLOC (num_regs, re_char *); | |
| 4971 old_regstart = REGEX_TALLOC (num_regs, re_char *); | |
| 4972 old_regend = REGEX_TALLOC (num_regs, re_char *); | |
| 4973 best_regstart = REGEX_TALLOC (num_regs, re_char *); | |
| 4974 best_regend = REGEX_TALLOC (num_regs, re_char *); | |
| 428 | 4975 reg_info = REGEX_TALLOC (num_regs, register_info_type); |
| 446 | 4976 reg_dummy = REGEX_TALLOC (num_regs, re_char *); |
| 428 | 4977 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type); |
| 1333 | 4978 END_REGEX_MALLOC_OK (); |
| 428 | 4979 |
| 4980 if (!(regstart && regend && old_regstart && old_regend && reg_info | |
| 4981 && best_regstart && best_regend && reg_dummy && reg_info_dummy)) | |
| 4982 { | |
| 4983 FREE_VARIABLES (); | |
| 4984 return -2; | |
| 4985 } | |
| 4986 } | |
| 4987 else | |
| 4988 { | |
| 4989 /* We must initialize all our variables to NULL, so that | |
| 4990 `FREE_VARIABLES' doesn't try to free them. */ | |
| 4991 regstart = regend = old_regstart = old_regend = best_regstart | |
| 4992 = best_regend = reg_dummy = NULL; | |
| 4993 reg_info = reg_info_dummy = (register_info_type *) NULL; | |
| 4994 } | |
| 4995 #endif /* MATCH_MAY_ALLOCATE */ | |
| 4996 | |
| 1333 | 4997 #if defined (emacs) && defined (REL_ALLOC) |
| 4998 { | |
| 4999 /* If the allocations above (or the call to setup_syntax_cache() in | |
| 5000 re_match_2) caused a rel-alloc relocation, then fix up the data | |
| 5001 pointers */ | |
| 1346 | 5002 Bytecount offset = offset_post_relocation (lispobj, orig_buftext); |
| 1333 | 5003 if (offset) |
| 5004 { | |
| 5005 string1 += offset; | |
| 5006 string2 += offset; | |
| 5007 } | |
| 5008 } | |
| 5009 #endif /* defined (emacs) && defined (REL_ALLOC) */ | |
| 5010 | |
| 428 | 5011 /* The starting position is bogus. */ |
| 5012 if (pos < 0 || pos > size1 + size2) | |
| 5013 { | |
| 5014 FREE_VARIABLES (); | |
| 5015 return -1; | |
| 5016 } | |
| 5017 | |
| 5018 /* Initialize subexpression text positions to -1 to mark ones that no | |
| 5019 start_memory/stop_memory has been seen for. Also initialize the | |
| 5020 register information struct. */ | |
| 5021 for (mcnt = 1; mcnt < num_regs; mcnt++) | |
| 5022 { | |
| 5023 regstart[mcnt] = regend[mcnt] | |
| 5024 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE; | |
| 5025 | |
| 5026 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE; | |
| 5027 IS_ACTIVE (reg_info[mcnt]) = 0; | |
| 5028 MATCHED_SOMETHING (reg_info[mcnt]) = 0; | |
| 5029 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0; | |
| 5030 } | |
| 5031 /* We move `string1' into `string2' if the latter's empty -- but not if | |
| 5032 `string1' is null. */ | |
| 5033 if (size2 == 0 && string1 != NULL) | |
| 5034 { | |
| 5035 string2 = string1; | |
| 5036 size2 = size1; | |
| 5037 string1 = 0; | |
| 5038 size1 = 0; | |
| 5039 } | |
| 5040 end1 = string1 + size1; | |
| 5041 end2 = string2 + size2; | |
| 5042 | |
| 5043 /* Compute where to stop matching, within the two strings. */ | |
| 5044 if (stop <= size1) | |
| 5045 { | |
| 5046 end_match_1 = string1 + stop; | |
| 5047 end_match_2 = string2; | |
| 5048 } | |
| 5049 else | |
| 5050 { | |
| 5051 end_match_1 = end1; | |
| 5052 end_match_2 = string2 + stop - size1; | |
| 5053 } | |
| 5054 | |
| 5055 /* `p' scans through the pattern as `d' scans through the data. | |
| 5056 `dend' is the end of the input string that `d' points within. `d' | |
| 5057 is advanced into the following input string whenever necessary, but | |
| 5058 this happens before fetching; therefore, at the beginning of the | |
| 5059 loop, `d' can be pointing at the end of a string, but it cannot | |
| 5060 equal `string2'. */ | |
| 5061 if (size1 > 0 && pos <= size1) | |
| 5062 { | |
| 5063 d = string1 + pos; | |
| 5064 dend = end_match_1; | |
| 5065 } | |
| 5066 else | |
| 5067 { | |
| 5068 d = string2 + pos - size1; | |
| 5069 dend = end_match_2; | |
| 5070 } | |
| 5071 | |
| 5041 | 5072 DEBUG_MATCH_PRINT1 ("The compiled pattern is: \n"); |
| 5073 DEBUG_MATCH_PRINT_COMPILED_PATTERN (bufp, p, pend); | |
| 5074 DEBUG_MATCH_PRINT1 ("The string to match is: `"); | |
| 5075 DEBUG_MATCH_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2); | |
| 5076 DEBUG_MATCH_PRINT1 ("'\n"); | |
| 428 | 5077 |
| 5078 /* This loops over pattern commands. It exits by returning from the | |
| 5079 function if the match is complete, or it drops through if the match | |
| 5080 fails at this starting point in the input data. */ | |
| 5081 for (;;) | |
| 5082 { | |
| 5041 | 5083 DEBUG_MATCH_PRINT2 ("\n0x%lx: ", (long) p); |
| 428 | 5084 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */ |
| 5085 if (!no_quit_in_re_search) | |
| 1333 | 5086 { |
| 5087 BEGIN_REGEX_MALLOC_OK (); | |
| 5088 QUIT; | |
| 5089 END_REGEX_MALLOC_OK (); | |
| 1346 | 5090 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); |
| 1333 | 5091 } |
| 428 | 5092 #endif |
| 5093 | |
| 5094 if (p == pend) | |
| 5095 { /* End of pattern means we might have succeeded. */ | |
| 5041 | 5096 DEBUG_MATCH_PRINT1 ("end of pattern ... "); |
| 428 | 5097 |
| 5098 /* If we haven't matched the entire string, and we want the | |
| 5099 longest match, try backtracking. */ | |
| 5100 if (d != end_match_2) | |
| 5101 { | |
| 5102 same_str_p = (FIRST_STRING_P (match_end) | |
| 5103 == MATCHING_IN_FIRST_STRING); | |
| 5104 | |
| 5105 /* AIX compiler got confused when this was combined | |
| 5106 with the previous declaration. */ | |
| 5107 if (same_str_p) | |
| 5108 best_match_p = d > match_end; | |
| 5109 else | |
| 5110 best_match_p = !MATCHING_IN_FIRST_STRING; | |
| 5111 | |
| 5041 | 5112 DEBUG_MATCH_PRINT1 ("backtracking.\n"); |
| 428 | 5113 |
| 5114 if (!FAIL_STACK_EMPTY ()) | |
| 5115 { /* More failure points to try. */ | |
| 5116 | |
| 5117 /* If exceeds best match so far, save it. */ | |
| 5118 if (!best_regs_set || best_match_p) | |
| 5119 { | |
| 5120 best_regs_set = true; | |
| 5121 match_end = d; | |
| 5122 | |
| 5041 | 5123 DEBUG_MATCH_PRINT1 ("\nSAVING match as best so far.\n"); |
| 428 | 5124 |
| 5125 for (mcnt = 1; mcnt < num_regs; mcnt++) | |
| 5126 { | |
| 5127 best_regstart[mcnt] = regstart[mcnt]; | |
| 5128 best_regend[mcnt] = regend[mcnt]; | |
| 5129 } | |
| 5130 } | |
| 5131 goto fail; | |
| 5132 } | |
| 5133 | |
| 5134 /* If no failure points, don't restore garbage. And if | |
| 5135 last match is real best match, don't restore second | |
| 5136 best one. */ | |
| 5137 else if (best_regs_set && !best_match_p) | |
| 5138 { | |
| 5139 restore_best_regs: | |
| 5140 /* Restore best match. It may happen that `dend == | |
| 5141 end_match_1' while the restored d is in string2. | |
| 5142 For example, the pattern `x.*y.*z' against the | |
| 5143 strings `x-' and `y-z-', if the two strings are | |
| 5144 not consecutive in memory. */ | |
| 5041 | 5145 DEBUG_MATCH_PRINT1 ("Restoring best registers.\n"); |
| 428 | 5146 |
| 5147 d = match_end; | |
| 5148 dend = ((d >= string1 && d <= end1) | |
| 5149 ? end_match_1 : end_match_2); | |
| 5150 | |
| 5151 for (mcnt = 1; mcnt < num_regs; mcnt++) | |
| 5152 { | |
| 5153 regstart[mcnt] = best_regstart[mcnt]; | |
| 5154 regend[mcnt] = best_regend[mcnt]; | |
| 5155 } | |
| 5156 } | |
| 5157 } /* d != end_match_2 */ | |
| 5158 | |
| 5159 succeed_label: | |
| 5041 | 5160 DEBUG_MATCH_PRINT1 ("Accepting match.\n"); |
| 428 | 5161 |
| 5162 /* If caller wants register contents data back, do it. */ | |
| 1028 | 5163 { |
| 5164 int num_nonshy_regs = bufp->re_nsub + 1; | |
| 5165 if (regs && !bufp->no_sub) | |
| 5166 { | |
| 5167 /* Have the register data arrays been allocated? */ | |
| 5168 if (bufp->regs_allocated == REGS_UNALLOCATED) | |
| 5169 { /* No. So allocate them with malloc. We need one | |
| 5170 extra element beyond `num_regs' for the `-1' marker | |
| 5171 GNU code uses. */ | |
| 5172 regs->num_regs = MAX (RE_NREGS, num_nonshy_regs + 1); | |
| 1333 | 5173 BEGIN_REGEX_MALLOC_OK (); |
| 1028 | 5174 regs->start = TALLOC (regs->num_regs, regoff_t); |
| 5175 regs->end = TALLOC (regs->num_regs, regoff_t); | |
| 1333 | 5176 END_REGEX_MALLOC_OK (); |
| 5177 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
| 1028 | 5178 if (regs->start == NULL || regs->end == NULL) |
| 5179 { | |
| 5180 FREE_VARIABLES (); | |
| 5181 return -2; | |
| 5182 } | |
| 5183 bufp->regs_allocated = REGS_REALLOCATE; | |
| 5184 } | |
| 5185 else if (bufp->regs_allocated == REGS_REALLOCATE) | |
| 5186 { /* Yes. If we need more elements than were already | |
| 5187 allocated, reallocate them. If we need fewer, just | |
| 5188 leave it alone. */ | |
| 5189 if (regs->num_regs < num_nonshy_regs + 1) | |
| 5190 { | |
| 5191 regs->num_regs = num_nonshy_regs + 1; | |
| 1333 | 5192 BEGIN_REGEX_MALLOC_OK (); |
| 1028 | 5193 RETALLOC (regs->start, regs->num_regs, regoff_t); |
| 5194 RETALLOC (regs->end, regs->num_regs, regoff_t); | |
| 1333 | 5195 END_REGEX_MALLOC_OK (); |
| 5196 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
| 1028 | 5197 if (regs->start == NULL || regs->end == NULL) |
| 5198 { | |
| 5199 FREE_VARIABLES (); | |
| 5200 return -2; | |
| 5201 } | |
| 5202 } | |
| 5203 } | |
| 5204 else | |
| 5205 { | |
| 5206 /* The braces fend off an "empty body in an else-statement" | |
| 5207 warning under GCC when assert expands to nothing. */ | |
| 5208 assert (bufp->regs_allocated == REGS_FIXED); | |
| 5209 } | |
| 5210 | |
| 5211 /* Convert the pointer data in `regstart' and `regend' to | |
| 5212 indices. Register zero has to be set differently, | |
| 5213 since we haven't kept track of any info for it. */ | |
| 5214 if (regs->num_regs > 0) | |
| 5215 { | |
| 5216 regs->start[0] = pos; | |
| 5217 regs->end[0] = (MATCHING_IN_FIRST_STRING | |
| 5218 ? ((regoff_t) (d - string1)) | |
| 5219 : ((regoff_t) (d - string2 + size1))); | |
| 5220 } | |
| 5221 | |
| 2639 | 5222 /* Map over the NUM_NONSHY_REGS non-shy internal registers. |
| 5223 Copy each into the corresponding external register. | |
| 5224 MCNT indexes external registers. */ | |
| 1028 | 5225 for (mcnt = 1; mcnt < MIN (num_nonshy_regs, regs->num_regs); |
| 5226 mcnt++) | |
| 5227 { | |
| 5228 int internal_reg = bufp->external_to_internal_register[mcnt]; | |
| 5229 if (REG_UNSET (regstart[internal_reg]) || | |
| 5230 REG_UNSET (regend[internal_reg])) | |
| 5231 regs->start[mcnt] = regs->end[mcnt] = -1; | |
| 5232 else | |
| 5233 { | |
| 5234 regs->start[mcnt] = | |
| 5235 (regoff_t) POINTER_TO_OFFSET (regstart[internal_reg]); | |
| 5236 regs->end[mcnt] = | |
| 5237 (regoff_t) POINTER_TO_OFFSET (regend[internal_reg]); | |
| 5238 } | |
| 5239 } | |
| 5240 } /* regs && !bufp->no_sub */ | |
| 5241 | |
| 5242 /* If we have regs and the regs structure has more elements than | |
| 2639 | 5243 were in the pattern, set the extra elements starting with |
| 5244 NUM_NONSHY_REGS to -1. If we (re)allocated the registers, | |
| 5245 this is the case, because we always allocate enough to have | |
| 5246 at least one -1 at the end. | |
| 1028 | 5247 |
| 5248 We do this even when no_sub is set because some applications | |
| 5249 (XEmacs) reuse register structures which may contain stale | |
| 5250 information, and permit attempts to access those registers. | |
| 5251 | |
| 5252 It would be possible to require the caller to do this, but we'd | |
| 5253 have to change the API for this function to reflect that, and | |
| 1425 | 5254 audit all callers. Note: as of 2003-04-17 callers in XEmacs |
| 5255 do clear the registers, but it's safer to leave this code in | |
| 5256 because of reallocation. | |
| 5257 */ | |
| 1028 | 5258 if (regs && regs->num_regs > 0) |
| 5259 for (mcnt = num_nonshy_regs; mcnt < regs->num_regs; mcnt++) | |
| 5260 regs->start[mcnt] = regs->end[mcnt] = -1; | |
| 5261 } | |
| 5041 | 5262 DEBUG_MATCH_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n", |
| 428 | 5263 nfailure_points_pushed, nfailure_points_popped, |
| 5264 nfailure_points_pushed - nfailure_points_popped); | |
| 5041 | 5265 DEBUG_MATCH_PRINT2 ("%u registers pushed.\n", num_regs_pushed); |
| 428 | 5266 |
| 5267 mcnt = d - pos - (MATCHING_IN_FIRST_STRING | |
| 5268 ? string1 | |
| 5269 : string2 - size1); | |
| 5270 | |
| 5041 | 5271 DEBUG_MATCH_PRINT2 ("Returning %d from re_match_2.\n", mcnt); |
| 428 | 5272 |
| 5273 FREE_VARIABLES (); | |
| 5274 return mcnt; | |
| 5275 } | |
| 5276 | |
| 5277 /* Otherwise match next pattern command. */ | |
|
4759
aa5ed11f473b
Remove support for obsolete systems. See xemacs-patches message with ID
Jerry James <james@xemacs.org>
parents:
4750
diff
changeset
|
5278 switch ((re_opcode_t) *p++) |
| 428 | 5279 { |
| 5280 /* Ignore these. Used to ignore the n of succeed_n's which | |
| 5281 currently have n == 0. */ | |
| 5282 case no_op: | |
| 5041 | 5283 DEBUG_MATCH_PRINT1 ("EXECUTING no_op.\n"); |
| 428 | 5284 break; |
| 5285 | |
| 5286 case succeed: | |
| 5041 | 5287 DEBUG_MATCH_PRINT1 ("EXECUTING succeed.\n"); |
| 428 | 5288 goto succeed_label; |
| 5289 | |
| 826 | 5290 /* Match exactly a string of length n in the pattern. The |
| 5291 following byte in the pattern defines n, and the n bytes after | |
| 5292 that make up the string to match. (Under Mule, this will be in | |
| 5293 the default internal format.) */ | |
| 428 | 5294 case exactn: |
| 5295 mcnt = *p++; | |
| 5041 | 5296 DEBUG_MATCH_PRINT2 ("EXECUTING exactn %d.\n", mcnt); |
| 428 | 5297 |
| 5298 /* This is written out as an if-else so we don't waste time | |
| 5299 testing `translate' inside the loop. */ | |
| 446 | 5300 if (TRANSLATE_P (translate)) |
| 428 | 5301 { |
| 5302 do | |
| 5303 { | |
| 446 | 5304 #ifdef MULE |
| 5305 Bytecount pat_len; | |
| 5306 | |
| 450 | 5307 REGEX_PREFETCH (); |
| 867 | 5308 if (RE_TRANSLATE_1 (itext_ichar_fmt (d, fmt, lispobj)) |
| 5309 != itext_ichar (p)) | |
| 428 | 5310 goto fail; |
| 446 | 5311 |
| 867 | 5312 pat_len = itext_ichar_len (p); |
| 446 | 5313 p += pat_len; |
| 867 | 5314 INC_IBYTEPTR_FMT (d, fmt); |
| 446 | 5315 |
| 5316 mcnt -= pat_len; | |
| 5317 #else /* not MULE */ | |
| 450 | 5318 REGEX_PREFETCH (); |
| 826 | 5319 if ((unsigned char) RE_TRANSLATE_1 (*d++) != *p++) |
| 446 | 5320 goto fail; |
| 5321 mcnt--; | |
| 5322 #endif | |
| 428 | 5323 } |
| 446 | 5324 while (mcnt > 0); |
| 428 | 5325 } |
| 5326 else | |
| 5327 { | |
| 826 | 5328 #ifdef MULE |
| 5329 /* If buffer format is default, then we can shortcut and just | |
| 5330 compare the text directly, byte by byte. Otherwise, we | |
| 5331 need to go character by character. */ | |
| 5332 if (fmt != FORMAT_DEFAULT) | |
| 428 | 5333 { |
| 826 | 5334 do |
| 5335 { | |
| 5336 Bytecount pat_len; | |
| 5337 | |
| 5338 REGEX_PREFETCH (); | |
| 867 | 5339 if (itext_ichar_fmt (d, fmt, lispobj) != |
| 5340 itext_ichar (p)) | |
| 826 | 5341 goto fail; |
| 5342 | |
| 867 | 5343 pat_len = itext_ichar_len (p); |
| 826 | 5344 p += pat_len; |
| 867 | 5345 INC_IBYTEPTR_FMT (d, fmt); |
| 826 | 5346 |
| 5347 mcnt -= pat_len; | |
| 5348 } | |
| 5349 while (mcnt > 0); | |
| 428 | 5350 } |
| 826 | 5351 else |
| 5352 #endif | |
| 5353 { | |
| 5354 do | |
| 5355 { | |
| 5356 REGEX_PREFETCH (); | |
| 5357 if (*d++ != *p++) goto fail; | |
| 5358 mcnt--; | |
| 5359 } | |
| 5360 while (mcnt > 0); | |
| 5361 } | |
| 428 | 5362 } |
| 5363 SET_REGS_MATCHED (); | |
| 5364 break; | |
| 5365 | |
| 5366 | |
| 5367 /* Match any character except possibly a newline or a null. */ | |
| 5368 case anychar: | |
| 5041 | 5369 DEBUG_MATCH_PRINT1 ("EXECUTING anychar.\n"); |
| 428 | 5370 |
| 450 | 5371 REGEX_PREFETCH (); |
| 428 | 5372 |
| 826 | 5373 if ((!(bufp->syntax & RE_DOT_NEWLINE) && |
| 867 | 5374 RE_TRANSLATE (itext_ichar_fmt (d, fmt, lispobj)) == '\n') |
| 826 | 5375 || (bufp->syntax & RE_DOT_NOT_NULL && |
| 867 | 5376 RE_TRANSLATE (itext_ichar_fmt (d, fmt, lispobj)) == |
| 826 | 5377 '\000')) |
| 428 | 5378 goto fail; |
| 5379 | |
| 5380 SET_REGS_MATCHED (); | |
| 5041 | 5381 DEBUG_MATCH_PRINT2 (" Matched `%d'.\n", *d); |
| 867 | 5382 INC_IBYTEPTR_FMT (d, fmt); /* XEmacs change */ |
| 428 | 5383 break; |
| 5384 | |
| 5385 | |
| 5386 case charset: | |
| 5387 case charset_not: | |
| 5388 { | |
| 1414 | 5389 REGISTER Ichar c; |
| 460 | 5390 re_bool not_p = (re_opcode_t) *(p - 1) == charset_not; |
| 458 | 5391 |
| 5041 | 5392 DEBUG_MATCH_PRINT2 ("EXECUTING charset%s.\n", not_p ? "_not" : ""); |
| 428 | 5393 |
| 450 | 5394 REGEX_PREFETCH (); |
| 867 | 5395 c = itext_ichar_fmt (d, fmt, lispobj); |
| 826 | 5396 c = RE_TRANSLATE (c); /* The character to match. */ |
| 428 | 5397 |
| 647 | 5398 /* Cast to `unsigned int' instead of `unsigned char' in case the |
| 428 | 5399 bit list is a full 32 bytes long. */ |
| 1414 | 5400 if ((unsigned int)c < (unsigned int) (*p * BYTEWIDTH) |
| 428 | 5401 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) |
| 458 | 5402 not_p = !not_p; |
| 428 | 5403 |
| 5404 p += 1 + *p; | |
| 5405 | |
| 458 | 5406 if (!not_p) goto fail; |
| 428 | 5407 |
| 5408 SET_REGS_MATCHED (); | |
| 867 | 5409 INC_IBYTEPTR_FMT (d, fmt); /* XEmacs change */ |
| 428 | 5410 break; |
| 5411 } | |
| 5412 | |
| 5413 #ifdef MULE | |
| 5414 case charset_mule: | |
| 5415 case charset_mule_not: | |
| 5416 { | |
| 867 | 5417 REGISTER Ichar c; |
| 460 | 5418 re_bool not_p = (re_opcode_t) *(p - 1) == charset_mule_not; |
| 458 | 5419 |
| 5041 | 5420 DEBUG_MATCH_PRINT2 ("EXECUTING charset_mule%s.\n", not_p ? "_not" : ""); |
| 428 | 5421 |
| 450 | 5422 REGEX_PREFETCH (); |
| 867 | 5423 c = itext_ichar_fmt (d, fmt, lispobj); |
| 826 | 5424 c = RE_TRANSLATE (c); /* The character to match. */ |
| 428 | 5425 |
| 5426 if (EQ (Qt, unified_range_table_lookup (p, c, Qnil))) | |
| 458 | 5427 not_p = !not_p; |
| 428 | 5428 |
| 5429 p += unified_range_table_bytes_used (p); | |
| 5430 | |
| 458 | 5431 if (!not_p) goto fail; |
| 428 | 5432 |
| 5433 SET_REGS_MATCHED (); | |
| 867 | 5434 INC_IBYTEPTR_FMT (d, fmt); |
| 428 | 5435 break; |
| 5436 } | |
| 5437 #endif /* MULE */ | |
| 5438 | |
| 5439 | |
| 5440 /* The beginning of a group is represented by start_memory. | |
| 5441 The arguments are the register number in the next byte, and the | |
| 5442 number of groups inner to this one in the next. The text | |
| 5443 matched within the group is recorded (in the internal | |
| 5444 registers data structure) under the register number. */ | |
| 5445 case start_memory: | |
| 5041 | 5446 DEBUG_MATCH_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]); |
| 428 | 5447 |
| 5448 /* Find out if this group can match the empty string. */ | |
| 5449 p1 = p; /* To send to group_match_null_string_p. */ | |
| 5450 | |
| 5451 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE) | |
| 2639 | 5452 REG_MATCH_NULL_STRING_P (reg_info[*p]) |
| 5453 = group_match_null_string_p (&p1, pend, reg_info); | |
| 5454 | |
| 5041 | 5455 DEBUG_MATCH_PRINT2 (" group CAN%s match null string\n", |
| 2639 | 5456 REG_MATCH_NULL_STRING_P (reg_info[*p]) ? "NOT" : ""); |
| 428 | 5457 |
| 5458 /* Save the position in the string where we were the last time | |
| 5459 we were at this open-group operator in case the group is | |
| 5460 operated upon by a repetition operator, e.g., with `(a*)*b' | |
| 5461 against `ab'; then we want to ignore where we are now in | |
| 5462 the string in case this attempt to match fails. */ | |
| 5463 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) | |
| 5464 ? REG_UNSET (regstart[*p]) ? d : regstart[*p] | |
| 5465 : regstart[*p]; | |
| 5041 | 5466 DEBUG_MATCH_PRINT2 (" old_regstart: %d\n", |
| 428 | 5467 POINTER_TO_OFFSET (old_regstart[*p])); |
| 5468 | |
| 5469 regstart[*p] = d; | |
| 5041 | 5470 DEBUG_MATCH_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p])); |
| 428 | 5471 |
| 5472 IS_ACTIVE (reg_info[*p]) = 1; | |
| 5473 MATCHED_SOMETHING (reg_info[*p]) = 0; | |
| 5474 | |
| 5475 /* Clear this whenever we change the register activity status. */ | |
| 5476 set_regs_matched_done = 0; | |
| 5477 | |
| 5478 /* This is the new highest active register. */ | |
| 5479 highest_active_reg = *p; | |
| 5480 | |
| 5481 /* If nothing was active before, this is the new lowest active | |
| 5482 register. */ | |
| 5483 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) | |
| 5484 lowest_active_reg = *p; | |
| 5485 | |
| 5486 /* Move past the register number and inner group count. */ | |
| 5487 p += 2; | |
| 5488 just_past_start_mem = p; | |
| 5489 | |
| 5490 break; | |
| 5491 | |
| 5492 | |
| 5493 /* The stop_memory opcode represents the end of a group. Its | |
| 5494 arguments are the same as start_memory's: the register | |
| 5495 number, and the number of inner groups. */ | |
| 5496 case stop_memory: | |
| 5041 | 5497 DEBUG_MATCH_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]); |
| 428 | 5498 |
| 5499 /* We need to save the string position the last time we were at | |
| 5500 this close-group operator in case the group is operated | |
| 5501 upon by a repetition operator, e.g., with `((a*)*(b*)*)*' | |
| 5502 against `aba'; then we want to ignore where we are now in | |
| 5503 the string in case this attempt to match fails. */ | |
| 5504 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) | |
| 5505 ? REG_UNSET (regend[*p]) ? d : regend[*p] | |
| 5506 : regend[*p]; | |
| 5041 | 5507 DEBUG_MATCH_PRINT2 (" old_regend: %d\n", |
| 428 | 5508 POINTER_TO_OFFSET (old_regend[*p])); |
| 5509 | |
| 5510 regend[*p] = d; | |
| 5041 | 5511 DEBUG_MATCH_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p])); |
| 428 | 5512 |
| 5513 /* This register isn't active anymore. */ | |
| 5514 IS_ACTIVE (reg_info[*p]) = 0; | |
| 5515 | |
| 5516 /* Clear this whenever we change the register activity status. */ | |
| 5517 set_regs_matched_done = 0; | |
| 5518 | |
| 5519 /* If this was the only register active, nothing is active | |
| 5520 anymore. */ | |
| 5521 if (lowest_active_reg == highest_active_reg) | |
| 5522 { | |
| 5523 lowest_active_reg = NO_LOWEST_ACTIVE_REG; | |
| 5524 highest_active_reg = NO_HIGHEST_ACTIVE_REG; | |
| 5525 } | |
| 5526 else | |
| 5527 { /* We must scan for the new highest active register, since | |
| 5528 it isn't necessarily one less than now: consider | |
| 5529 (a(b)c(d(e)f)g). When group 3 ends, after the f), the | |
| 5530 new highest active register is 1. */ | |
| 5531 unsigned char r = *p - 1; | |
| 5532 while (r > 0 && !IS_ACTIVE (reg_info[r])) | |
| 5533 r--; | |
| 5534 | |
| 5535 /* If we end up at register zero, that means that we saved | |
| 5536 the registers as the result of an `on_failure_jump', not | |
| 5537 a `start_memory', and we jumped to past the innermost | |
| 5538 `stop_memory'. For example, in ((.)*) we save | |
| 5539 registers 1 and 2 as a result of the *, but when we pop | |
| 5540 back to the second ), we are at the stop_memory 1. | |
| 5541 Thus, nothing is active. */ | |
| 5542 if (r == 0) | |
| 5543 { | |
| 5544 lowest_active_reg = NO_LOWEST_ACTIVE_REG; | |
| 5545 highest_active_reg = NO_HIGHEST_ACTIVE_REG; | |
| 5546 } | |
| 5547 else | |
| 5548 { | |
| 5549 highest_active_reg = r; | |
| 5550 | |
| 5551 /* 98/9/21 jhod: We've also gotta set lowest_active_reg, don't we? */ | |
| 5552 r = 1; | |
| 5553 while (r < highest_active_reg && !IS_ACTIVE(reg_info[r])) | |
| 5554 r++; | |
| 5555 lowest_active_reg = r; | |
| 5556 } | |
| 5557 } | |
| 5558 | |
| 5559 /* If just failed to match something this time around with a | |
| 5560 group that's operated on by a repetition operator, try to | |
| 5561 force exit from the ``loop'', and restore the register | |
| 5562 information for this group that we had before trying this | |
| 5563 last match. */ | |
| 5564 if ((!MATCHED_SOMETHING (reg_info[*p]) | |
| 5565 || just_past_start_mem == p - 1) | |
| 5566 && (p + 2) < pend) | |
| 5567 { | |
| 460 | 5568 re_bool is_a_jump_n = false; |
| 428 | 5569 |
| 5570 p1 = p + 2; | |
| 5571 mcnt = 0; | |
| 5572 switch ((re_opcode_t) *p1++) | |
| 5573 { | |
| 5574 case jump_n: | |
| 5575 is_a_jump_n = true; | |
| 5576 case pop_failure_jump: | |
| 5577 case maybe_pop_jump: | |
| 5578 case jump: | |
| 5579 case dummy_failure_jump: | |
| 5580 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
| 5581 if (is_a_jump_n) | |
| 5582 p1 += 2; | |
| 5583 break; | |
| 5584 | |
| 5585 default: | |
| 5586 /* do nothing */ ; | |
| 5587 } | |
| 5588 p1 += mcnt; | |
| 5589 | |
| 5590 /* If the next operation is a jump backwards in the pattern | |
| 5591 to an on_failure_jump right before the start_memory | |
| 5592 corresponding to this stop_memory, exit from the loop | |
| 5593 by forcing a failure after pushing on the stack the | |
| 5594 on_failure_jump's jump in the pattern, and d. */ | |
| 5595 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump | |
| 5596 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p) | |
| 5597 { | |
| 5598 /* If this group ever matched anything, then restore | |
| 5599 what its registers were before trying this last | |
| 5600 failed match, e.g., with `(a*)*b' against `ab' for | |
| 5601 regstart[1], and, e.g., with `((a*)*(b*)*)*' | |
| 5602 against `aba' for regend[3]. | |
| 5603 | |
| 5604 Also restore the registers for inner groups for, | |
| 5605 e.g., `((a*)(b*))*' against `aba' (register 3 would | |
| 5606 otherwise get trashed). */ | |
| 5607 | |
| 5608 if (EVER_MATCHED_SOMETHING (reg_info[*p])) | |
| 5609 { | |
| 647 | 5610 int r; |
| 428 | 5611 |
| 5612 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0; | |
| 5613 | |
| 5614 /* Restore this and inner groups' (if any) registers. */ | |
| 5615 for (r = *p; r < *p + *(p + 1); r++) | |
| 5616 { | |
| 5617 regstart[r] = old_regstart[r]; | |
| 5618 | |
| 5619 /* xx why this test? */ | |
| 5620 if (old_regend[r] >= regstart[r]) | |
| 5621 regend[r] = old_regend[r]; | |
| 5622 } | |
| 5623 } | |
| 5624 p1++; | |
| 5625 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
| 5626 PUSH_FAILURE_POINT (p1 + mcnt, d, -2); | |
| 5627 | |
| 5628 goto fail; | |
| 5629 } | |
| 5630 } | |
| 5631 | |
| 5632 /* Move past the register number and the inner group count. */ | |
| 5633 p += 2; | |
| 5634 break; | |
| 5635 | |
| 5636 | |
| 5637 /* \<digit> has been turned into a `duplicate' command which is | |
| 502 | 5638 followed by the numeric value of <digit> as the register number. |
| 5639 (Already passed through external-to-internal-register mapping, | |
| 5640 so it refers to the actual group number, not the non-shy-only | |
| 5641 numbering used in the external world.) */ | |
| 428 | 5642 case duplicate: |
| 5643 { | |
| 446 | 5644 REGISTER re_char *d2, *dend2; |
| 502 | 5645 /* Get which register to match against. */ |
| 5646 int regno = *p++; | |
| 5041 | 5647 DEBUG_MATCH_PRINT2 ("EXECUTING duplicate %d.\n", regno); |
| 428 | 5648 |
| 5649 /* Can't back reference a group which we've never matched. */ | |
| 5650 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno])) | |
| 5651 goto fail; | |
| 5652 | |
| 5653 /* Where in input to try to start matching. */ | |
| 5654 d2 = regstart[regno]; | |
| 5655 | |
| 5656 /* Where to stop matching; if both the place to start and | |
| 5657 the place to stop matching are in the same string, then | |
| 5658 set to the place to stop, otherwise, for now have to use | |
| 5659 the end of the first string. */ | |
| 5660 | |
| 5661 dend2 = ((FIRST_STRING_P (regstart[regno]) | |
| 5662 == FIRST_STRING_P (regend[regno])) | |
| 5663 ? regend[regno] : end_match_1); | |
| 5664 for (;;) | |
| 5665 { | |
| 5666 /* If necessary, advance to next segment in register | |
| 5667 contents. */ | |
| 5668 while (d2 == dend2) | |
| 5669 { | |
| 5670 if (dend2 == end_match_2) break; | |
| 5671 if (dend2 == regend[regno]) break; | |
| 5672 | |
| 5673 /* End of string1 => advance to string2. */ | |
| 5674 d2 = string2; | |
| 5675 dend2 = regend[regno]; | |
| 5676 } | |
| 5677 /* At end of register contents => success */ | |
| 5678 if (d2 == dend2) break; | |
| 5679 | |
| 5680 /* If necessary, advance to next segment in data. */ | |
| 450 | 5681 REGEX_PREFETCH (); |
| 428 | 5682 |
| 5683 /* How many characters left in this segment to match. */ | |
| 5684 mcnt = dend - d; | |
| 5685 | |
| 5686 /* Want how many consecutive characters we can match in | |
| 5687 one shot, so, if necessary, adjust the count. */ | |
| 5688 if (mcnt > dend2 - d2) | |
| 5689 mcnt = dend2 - d2; | |
| 5690 | |
| 5691 /* Compare that many; failure if mismatch, else move | |
| 5692 past them. */ | |
| 446 | 5693 if (TRANSLATE_P (translate) |
| 826 | 5694 ? bcmp_translate (d, d2, mcnt, translate |
| 5695 #ifdef emacs | |
| 5696 , fmt, lispobj | |
| 5697 #endif | |
| 5698 ) | |
| 428 | 5699 : memcmp (d, d2, mcnt)) |
| 5700 goto fail; | |
| 5701 d += mcnt, d2 += mcnt; | |
| 5702 | |
| 5703 /* Do this because we've matched some characters. */ | |
| 5704 SET_REGS_MATCHED (); | |
| 5705 } | |
| 5706 } | |
| 5707 break; | |
| 5708 | |
| 5709 | |
| 5710 /* begline matches the empty string at the beginning of the string | |
| 5711 (unless `not_bol' is set in `bufp'), and, if | |
| 5712 `newline_anchor' is set, after newlines. */ | |
| 5713 case begline: | |
| 5041 | 5714 DEBUG_MATCH_PRINT1 ("EXECUTING begline.\n"); |
| 428 | 5715 |
| 5716 if (AT_STRINGS_BEG (d)) | |
| 5717 { | |
| 5718 if (!bufp->not_bol) break; | |
| 5719 } | |
| 826 | 5720 else |
| 5721 { | |
| 5722 re_char *d2 = d; | |
| 867 | 5723 DEC_IBYTEPTR (d2); |
| 5724 if (itext_ichar_ascii_fmt (d2, fmt, lispobj) == '\n' && | |
| 826 | 5725 bufp->newline_anchor) |
| 5726 break; | |
| 5727 } | |
| 428 | 5728 /* In all other cases, we fail. */ |
| 5729 goto fail; | |
| 5730 | |
| 5731 | |
| 5732 /* endline is the dual of begline. */ | |
| 5733 case endline: | |
| 5041 | 5734 DEBUG_MATCH_PRINT1 ("EXECUTING endline.\n"); |
| 428 | 5735 |
| 5736 if (AT_STRINGS_END (d)) | |
| 5737 { | |
| 5738 if (!bufp->not_eol) break; | |
| 5739 } | |
| 5740 | |
| 5741 /* We have to ``prefetch'' the next character. */ | |
| 826 | 5742 else if ((d == end1 ? |
| 867 | 5743 itext_ichar_ascii_fmt (string2, fmt, lispobj) : |
| 5744 itext_ichar_ascii_fmt (d, fmt, lispobj)) == '\n' | |
| 428 | 5745 && bufp->newline_anchor) |
| 5746 { | |
| 5747 break; | |
| 5748 } | |
| 5749 goto fail; | |
| 5750 | |
| 5751 | |
| 5752 /* Match at the very beginning of the data. */ | |
| 5753 case begbuf: | |
| 5041 | 5754 DEBUG_MATCH_PRINT1 ("EXECUTING begbuf.\n"); |
| 428 | 5755 if (AT_STRINGS_BEG (d)) |
| 5756 break; | |
| 5757 goto fail; | |
| 5758 | |
| 5759 | |
| 5760 /* Match at the very end of the data. */ | |
| 5761 case endbuf: | |
| 5041 | 5762 DEBUG_MATCH_PRINT1 ("EXECUTING endbuf.\n"); |
| 428 | 5763 if (AT_STRINGS_END (d)) |
| 5764 break; | |
| 5765 goto fail; | |
| 5766 | |
| 5767 | |
| 5768 /* on_failure_keep_string_jump is used to optimize `.*\n'. It | |
| 5769 pushes NULL as the value for the string on the stack. Then | |
| 5770 `pop_failure_point' will keep the current value for the | |
| 5771 string, instead of restoring it. To see why, consider | |
| 5772 matching `foo\nbar' against `.*\n'. The .* matches the foo; | |
| 5773 then the . fails against the \n. But the next thing we want | |
| 5774 to do is match the \n against the \n; if we restored the | |
| 5775 string value, we would be back at the foo. | |
| 5776 | |
| 5777 Because this is used only in specific cases, we don't need to | |
| 5778 check all the things that `on_failure_jump' does, to make | |
| 5779 sure the right things get saved on the stack. Hence we don't | |
| 5780 share its code. The only reason to push anything on the | |
| 5781 stack at all is that otherwise we would have to change | |
| 5782 `anychar's code to do something besides goto fail in this | |
| 5783 case; that seems worse than this. */ | |
| 5784 case on_failure_keep_string_jump: | |
| 5041 | 5785 DEBUG_MATCH_PRINT1 ("EXECUTING on_failure_keep_string_jump"); |
| 428 | 5786 |
| 5787 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
| 5041 | 5788 DEBUG_MATCH_PRINT3 (" %d (to 0x%lx):\n", mcnt, (long) (p + mcnt)); |
| 428 | 5789 |
| 446 | 5790 PUSH_FAILURE_POINT (p + mcnt, (unsigned char *) 0, -2); |
| 428 | 5791 break; |
| 5792 | |
| 5793 | |
| 5794 /* Uses of on_failure_jump: | |
| 5795 | |
| 5796 Each alternative starts with an on_failure_jump that points | |
| 5797 to the beginning of the next alternative. Each alternative | |
| 5798 except the last ends with a jump that in effect jumps past | |
| 5799 the rest of the alternatives. (They really jump to the | |
| 5800 ending jump of the following alternative, because tensioning | |
| 5801 these jumps is a hassle.) | |
| 5802 | |
| 5803 Repeats start with an on_failure_jump that points past both | |
| 5804 the repetition text and either the following jump or | |
| 5805 pop_failure_jump back to this on_failure_jump. */ | |
| 5806 case on_failure_jump: | |
| 5807 on_failure: | |
| 5041 | 5808 DEBUG_MATCH_PRINT1 ("EXECUTING on_failure_jump"); |
| 428 | 5809 |
| 5810 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
| 5041 | 5811 DEBUG_MATCH_PRINT3 (" %d (to 0x%lx)", mcnt, (long) (p + mcnt)); |
| 428 | 5812 |
| 5813 /* If this on_failure_jump comes right before a group (i.e., | |
| 5814 the original * applied to a group), save the information | |
| 5815 for that group and all inner ones, so that if we fail back | |
| 5816 to this point, the group's information will be correct. | |
| 5817 For example, in \(a*\)*\1, we need the preceding group, | |
| 5818 and in \(\(a*\)b*\)\2, we need the inner group. */ | |
| 5819 | |
| 5820 /* We can't use `p' to check ahead because we push | |
| 5821 a failure point to `p + mcnt' after we do this. */ | |
| 5822 p1 = p; | |
| 5823 | |
| 5824 /* We need to skip no_op's before we look for the | |
| 5825 start_memory in case this on_failure_jump is happening as | |
| 5826 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1 | |
| 5827 against aba. */ | |
| 5828 while (p1 < pend && (re_opcode_t) *p1 == no_op) | |
| 5829 p1++; | |
| 5830 | |
| 5831 if (p1 < pend && (re_opcode_t) *p1 == start_memory) | |
| 5832 { | |
| 5833 /* We have a new highest active register now. This will | |
| 5834 get reset at the start_memory we are about to get to, | |
| 5835 but we will have saved all the registers relevant to | |
| 5836 this repetition op, as described above. */ | |
| 5837 highest_active_reg = *(p1 + 1) + *(p1 + 2); | |
| 5838 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) | |
| 5839 lowest_active_reg = *(p1 + 1); | |
| 5840 } | |
| 5841 | |
| 5041 | 5842 DEBUG_MATCH_PRINT1 (":\n"); |
| 428 | 5843 PUSH_FAILURE_POINT (p + mcnt, d, -2); |
| 5844 break; | |
| 5845 | |
| 5846 | |
| 5847 /* A smart repeat ends with `maybe_pop_jump'. | |
| 5848 We change it to either `pop_failure_jump' or `jump'. */ | |
| 5849 case maybe_pop_jump: | |
| 5850 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
| 5041 | 5851 DEBUG_MATCH_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); |
| 428 | 5852 { |
| 5853 REGISTER unsigned char *p2 = p; | |
| 5854 | |
| 5855 /* Compare the beginning of the repeat with what in the | |
| 5856 pattern follows its end. If we can establish that there | |
| 5857 is nothing that they would both match, i.e., nothing that | |
| 5858 would force us to backtrack (unlike, e.g., `a*a'), | |
| 5859 then we can change to pop_failure_jump, because we'll | |
| 5860 never have to backtrack. | |
| 5861 | |
| 5862 This is not true in the case of alternatives: in | |
| 5863 `(a|ab)*' we do need to backtrack to the `ab' alternative | |
| 5864 (e.g., if the string was `ab'). But instead of trying to | |
| 5865 detect that here, the alternative has put on a dummy | |
| 5866 failure point which is what we will end up popping. */ | |
| 5867 | |
| 5868 /* Skip over open/close-group commands. | |
| 5869 If what follows this loop is a ...+ construct, | |
| 5870 look at what begins its body, since we will have to | |
| 5871 match at least one of that. */ | |
| 5872 while (1) | |
| 5873 { | |
| 5874 if (p2 + 2 < pend | |
| 5875 && ((re_opcode_t) *p2 == stop_memory | |
| 5876 || (re_opcode_t) *p2 == start_memory)) | |
| 5877 p2 += 3; | |
| 5878 else if (p2 + 6 < pend | |
| 5879 && (re_opcode_t) *p2 == dummy_failure_jump) | |
| 5880 p2 += 6; | |
| 5881 else | |
| 5882 break; | |
| 5883 } | |
| 5884 | |
| 5885 p1 = p + mcnt; | |
| 5886 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding | |
| 5887 to the `maybe_pop_jump' of this case. Examine what | |
| 5888 follows. */ | |
| 5889 | |
| 5890 /* If we're at the end of the pattern, we can change. */ | |
| 5891 if (p2 == pend) | |
| 5892 { | |
| 5893 /* Consider what happens when matching ":\(.*\)" | |
| 5894 against ":/". I don't really understand this code | |
| 5895 yet. */ | |
| 5896 p[-3] = (unsigned char) pop_failure_jump; | |
| 5041 | 5897 DEBUG_MATCH_PRINT1 |
| 428 | 5898 (" End of pattern: change to `pop_failure_jump'.\n"); |
| 5899 } | |
| 5900 | |
| 5901 else if ((re_opcode_t) *p2 == exactn | |
| 5902 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) | |
| 5903 { | |
| 5904 REGISTER unsigned char c | |
| 5905 = *p2 == (unsigned char) endline ? '\n' : p2[2]; | |
| 5906 | |
| 5907 if ((re_opcode_t) p1[3] == exactn && p1[5] != c) | |
| 5908 { | |
| 5909 p[-3] = (unsigned char) pop_failure_jump; | |
| 5041 | 5910 DEBUG_MATCH_PRINT3 (" %c != %c => pop_failure_jump.\n", |
| 428 | 5911 c, p1[5]); |
| 5912 } | |
| 5913 | |
| 5914 else if ((re_opcode_t) p1[3] == charset | |
| 5915 || (re_opcode_t) p1[3] == charset_not) | |
| 5916 { | |
| 458 | 5917 int not_p = (re_opcode_t) p1[3] == charset_not; |
| 428 | 5918 |
| 5919 if (c < (unsigned char) (p1[4] * BYTEWIDTH) | |
| 5920 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) | |
| 458 | 5921 not_p = !not_p; |
| 5922 | |
| 5923 /* `not_p' is equal to 1 if c would match, which means | |
| 428 | 5924 that we can't change to pop_failure_jump. */ |
| 458 | 5925 if (!not_p) |
| 428 | 5926 { |
| 5927 p[-3] = (unsigned char) pop_failure_jump; | |
| 5041 | 5928 DEBUG_MATCH_PRINT1 (" No match => pop_failure_jump.\n"); |
| 428 | 5929 } |
| 5930 } | |
| 5931 } | |
| 5932 else if ((re_opcode_t) *p2 == charset) | |
| 5933 { | |
| 5934 #ifdef DEBUG | |
| 5935 REGISTER unsigned char c | |
| 5936 = *p2 == (unsigned char) endline ? '\n' : p2[2]; | |
| 5937 #endif | |
| 5938 | |
| 5939 if ((re_opcode_t) p1[3] == exactn | |
| 5940 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5] | |
| 5941 && (p2[2 + p1[5] / BYTEWIDTH] | |
| 5942 & (1 << (p1[5] % BYTEWIDTH))))) | |
| 5943 { | |
| 5944 p[-3] = (unsigned char) pop_failure_jump; | |
| 5041 | 5945 DEBUG_MATCH_PRINT3 (" %c != %c => pop_failure_jump.\n", |
| 428 | 5946 c, p1[5]); |
| 5947 } | |
| 5948 | |
| 5949 else if ((re_opcode_t) p1[3] == charset_not) | |
| 5950 { | |
| 5951 int idx; | |
| 5952 /* We win if the charset_not inside the loop | |
| 5953 lists every character listed in the charset after. */ | |
| 5954 for (idx = 0; idx < (int) p2[1]; idx++) | |
| 5955 if (! (p2[2 + idx] == 0 | |
| 5956 || (idx < (int) p1[4] | |
| 5957 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0)))) | |
| 5958 break; | |
| 5959 | |
| 5960 if (idx == p2[1]) | |
| 5961 { | |
| 5962 p[-3] = (unsigned char) pop_failure_jump; | |
| 5041 | 5963 DEBUG_MATCH_PRINT1 (" No match => pop_failure_jump.\n"); |
| 428 | 5964 } |
| 5965 } | |
| 5966 else if ((re_opcode_t) p1[3] == charset) | |
| 5967 { | |
| 5968 int idx; | |
| 5969 /* We win if the charset inside the loop | |
| 5970 has no overlap with the one after the loop. */ | |
| 5971 for (idx = 0; | |
| 5972 idx < (int) p2[1] && idx < (int) p1[4]; | |
| 5973 idx++) | |
| 5974 if ((p2[2 + idx] & p1[5 + idx]) != 0) | |
| 5975 break; | |
| 5976 | |
| 5977 if (idx == p2[1] || idx == p1[4]) | |
| 5978 { | |
| 5979 p[-3] = (unsigned char) pop_failure_jump; | |
| 5041 | 5980 DEBUG_MATCH_PRINT1 (" No match => pop_failure_jump.\n"); |
| 428 | 5981 } |
| 5982 } | |
| 5983 } | |
| 5984 } | |
| 5985 p -= 2; /* Point at relative address again. */ | |
| 5986 if ((re_opcode_t) p[-1] != pop_failure_jump) | |
| 5987 { | |
| 5988 p[-1] = (unsigned char) jump; | |
| 5041 | 5989 DEBUG_MATCH_PRINT1 (" Match => jump.\n"); |
| 428 | 5990 goto unconditional_jump; |
| 5991 } | |
| 5992 /* Note fall through. */ | |
| 5993 | |
| 5994 | |
| 5995 /* The end of a simple repeat has a pop_failure_jump back to | |
| 5996 its matching on_failure_jump, where the latter will push a | |
| 5997 failure point. The pop_failure_jump takes off failure | |
| 5998 points put on by this pop_failure_jump's matching | |
| 5999 on_failure_jump; we got through the pattern to here from the | |
| 6000 matching on_failure_jump, so didn't fail. */ | |
| 6001 case pop_failure_jump: | |
| 6002 { | |
| 6003 /* We need to pass separate storage for the lowest and | |
| 6004 highest registers, even though we don't care about the | |
| 6005 actual values. Otherwise, we will restore only one | |
| 6006 register from the stack, since lowest will == highest in | |
| 6007 `pop_failure_point'. */ | |
| 647 | 6008 int dummy_low_reg, dummy_high_reg; |
| 428 | 6009 unsigned char *pdummy; |
| 446 | 6010 re_char *sdummy = NULL; |
| 428 | 6011 |
| 5041 | 6012 DEBUG_MATCH_PRINT1 ("EXECUTING pop_failure_jump.\n"); |
| 428 | 6013 POP_FAILURE_POINT (sdummy, pdummy, |
| 6014 dummy_low_reg, dummy_high_reg, | |
| 6015 reg_dummy, reg_dummy, reg_info_dummy); | |
| 6016 } | |
| 6017 /* Note fall through. */ | |
| 6018 | |
| 6019 | |
| 6020 /* Unconditionally jump (without popping any failure points). */ | |
| 6021 case jump: | |
| 6022 unconditional_jump: | |
| 6023 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */ | |
| 5041 | 6024 DEBUG_MATCH_PRINT2 ("EXECUTING jump %d ", mcnt); |
| 428 | 6025 p += mcnt; /* Do the jump. */ |
| 5041 | 6026 DEBUG_MATCH_PRINT2 ("(to 0x%lx).\n", (long) p); |
| 428 | 6027 break; |
| 6028 | |
| 6029 | |
| 6030 /* We need this opcode so we can detect where alternatives end | |
| 6031 in `group_match_null_string_p' et al. */ | |
| 6032 case jump_past_alt: | |
| 5041 | 6033 DEBUG_MATCH_PRINT1 ("EXECUTING jump_past_alt.\n"); |
| 428 | 6034 goto unconditional_jump; |
| 6035 | |
| 6036 | |
| 6037 /* Normally, the on_failure_jump pushes a failure point, which | |
| 6038 then gets popped at pop_failure_jump. We will end up at | |
| 6039 pop_failure_jump, also, and with a pattern of, say, `a+', we | |
| 6040 are skipping over the on_failure_jump, so we have to push | |
| 6041 something meaningless for pop_failure_jump to pop. */ | |
| 6042 case dummy_failure_jump: | |
| 5041 | 6043 DEBUG_MATCH_PRINT1 ("EXECUTING dummy_failure_jump.\n"); |
| 428 | 6044 /* It doesn't matter what we push for the string here. What |
| 6045 the code at `fail' tests is the value for the pattern. */ | |
| 446 | 6046 PUSH_FAILURE_POINT ((unsigned char *) 0, (unsigned char *) 0, -2); |
| 428 | 6047 goto unconditional_jump; |
| 6048 | |
| 6049 | |
| 6050 /* At the end of an alternative, we need to push a dummy failure | |
| 6051 point in case we are followed by a `pop_failure_jump', because | |
| 6052 we don't want the failure point for the alternative to be | |
| 6053 popped. For example, matching `(a|ab)*' against `aab' | |
| 6054 requires that we match the `ab' alternative. */ | |
| 6055 case push_dummy_failure: | |
| 5041 | 6056 DEBUG_MATCH_PRINT1 ("EXECUTING push_dummy_failure.\n"); |
| 428 | 6057 /* See comments just above at `dummy_failure_jump' about the |
| 6058 two zeroes. */ | |
| 446 | 6059 PUSH_FAILURE_POINT ((unsigned char *) 0, (unsigned char *) 0, -2); |
| 428 | 6060 break; |
| 6061 | |
| 6062 /* Have to succeed matching what follows at least n times. | |
| 6063 After that, handle like `on_failure_jump'. */ | |
| 6064 case succeed_n: | |
| 6065 EXTRACT_NUMBER (mcnt, p + 2); | |
| 5041 | 6066 DEBUG_MATCH_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt); |
| 428 | 6067 |
| 6068 assert (mcnt >= 0); | |
| 6069 /* Originally, this is how many times we HAVE to succeed. */ | |
| 6070 if (mcnt > 0) | |
| 6071 { | |
| 6072 mcnt--; | |
| 6073 p += 2; | |
| 6074 STORE_NUMBER_AND_INCR (p, mcnt); | |
| 5041 | 6075 DEBUG_MATCH_PRINT3 (" Setting 0x%lx to %d.\n", (long) p, mcnt); |
| 428 | 6076 } |
| 6077 else if (mcnt == 0) | |
| 6078 { | |
| 5041 | 6079 DEBUG_MATCH_PRINT2 (" Setting two bytes from 0x%lx to no_op.\n", |
| 428 | 6080 (long) (p+2)); |
| 6081 p[2] = (unsigned char) no_op; | |
| 6082 p[3] = (unsigned char) no_op; | |
| 6083 goto on_failure; | |
| 6084 } | |
| 6085 break; | |
| 6086 | |
| 6087 case jump_n: | |
| 6088 EXTRACT_NUMBER (mcnt, p + 2); | |
| 5041 | 6089 DEBUG_MATCH_PRINT2 ("EXECUTING jump_n %d.\n", mcnt); |
| 428 | 6090 |
| 6091 /* Originally, this is how many times we CAN jump. */ | |
| 6092 if (mcnt) | |
| 6093 { | |
| 6094 mcnt--; | |
| 6095 STORE_NUMBER (p + 2, mcnt); | |
| 6096 goto unconditional_jump; | |
| 6097 } | |
| 6098 /* If we don't have to jump any more, skip over the rest of the command. */ | |
| 6099 else | |
| 6100 p += 4; | |
| 6101 break; | |
| 6102 | |
| 6103 case set_number_at: | |
| 6104 { | |
| 5041 | 6105 DEBUG_MATCH_PRINT1 ("EXECUTING set_number_at.\n"); |
| 428 | 6106 |
| 6107 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
| 6108 p1 = p + mcnt; | |
| 6109 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
| 5041 | 6110 DEBUG_MATCH_PRINT3 (" Setting 0x%lx to %d.\n", (long) p1, mcnt); |
| 428 | 6111 STORE_NUMBER (p1, mcnt); |
| 6112 break; | |
| 6113 } | |
| 6114 | |
| 6115 case wordbound: | |
| 5041 | 6116 DEBUG_MATCH_PRINT1 ("EXECUTING wordbound.\n"); |
| 428 | 6117 should_succeed = 1; |
| 6118 matchwordbound: | |
| 6119 { | |
| 6120 /* XEmacs change */ | |
| 1377 | 6121 /* Straightforward and (I hope) correct implementation. |
| 6122 Probably should be optimized by arranging to compute | |
| 1497 | 6123 charpos only once. */ |
| 1377 | 6124 /* emch1 is the character before d, syn1 is the syntax of |
| 6125 emch1, emch2 is the character at d, and syn2 is the | |
| 6126 syntax of emch2. */ | |
| 6127 Ichar emch1, emch2; | |
| 1468 | 6128 int syn1 = 0, |
| 6129 syn2 = 0; | |
| 1377 | 6130 re_char *d_before, *d_after; |
| 6131 int result, | |
| 6132 at_beg = AT_STRINGS_BEG (d), | |
| 6133 at_end = AT_STRINGS_END (d); | |
| 6134 #ifdef emacs | |
| 1497 | 6135 Charxpos charpos; |
| 1377 | 6136 #endif |
| 6137 | |
| 6138 if (at_beg && at_end) | |
| 6139 { | |
| 6140 result = 0; | |
| 6141 } | |
| 428 | 6142 else |
| 6143 { | |
| 1377 | 6144 if (!at_beg) |
| 6145 { | |
| 6146 d_before = POS_BEFORE_GAP_UNSAFE (d); | |
| 6147 DEC_IBYTEPTR_FMT (d_before, fmt); | |
| 6148 emch1 = itext_ichar_fmt (d_before, fmt, lispobj); | |
| 460 | 6149 #ifdef emacs |
| 1497 | 6150 charpos = offset_to_charxpos (lispobj, |
| 6151 PTR_TO_OFFSET (d)) - 1; | |
| 1377 | 6152 BEGIN_REGEX_MALLOC_OK (); |
| 1497 | 6153 UPDATE_SYNTAX_CACHE (scache, charpos); |
| 460 | 6154 #endif |
| 1377 | 6155 syn1 = SYNTAX_FROM_CACHE (scache, emch1); |
| 6156 END_REGEX_MALLOC_OK (); | |
| 6157 } | |
| 6158 if (!at_end) | |
| 6159 { | |
| 6160 d_after = POS_AFTER_GAP_UNSAFE (d); | |
| 6161 emch2 = itext_ichar_fmt (d_after, fmt, lispobj); | |
| 460 | 6162 #ifdef emacs |
| 1497 | 6163 charpos = offset_to_charxpos (lispobj, PTR_TO_OFFSET (d)); |
| 1377 | 6164 BEGIN_REGEX_MALLOC_OK (); |
| 1497 | 6165 UPDATE_SYNTAX_CACHE_FORWARD (scache, charpos); |
| 460 | 6166 #endif |
| 1377 | 6167 syn2 = SYNTAX_FROM_CACHE (scache, emch2); |
| 6168 END_REGEX_MALLOC_OK (); | |
| 6169 } | |
| 1333 | 6170 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); |
| 1377 | 6171 |
| 6172 if (at_beg) | |
| 6173 result = (syn2 == Sword); | |
| 6174 else if (at_end) | |
| 6175 result = (syn1 == Sword); | |
| 6176 else | |
| 6177 result = ((syn1 == Sword) != (syn2 == Sword)); | |
| 428 | 6178 } |
| 1377 | 6179 |
| 428 | 6180 if (result == should_succeed) |
| 6181 break; | |
| 6182 goto fail; | |
| 6183 } | |
| 6184 | |
| 6185 case notwordbound: | |
| 5041 | 6186 DEBUG_MATCH_PRINT1 ("EXECUTING notwordbound.\n"); |
| 428 | 6187 should_succeed = 0; |
| 6188 goto matchwordbound; | |
| 6189 | |
| 6190 case wordbeg: | |
| 5041 | 6191 DEBUG_MATCH_PRINT1 ("EXECUTING wordbeg.\n"); |
| 460 | 6192 if (AT_STRINGS_END (d)) |
| 6193 goto fail; | |
| 428 | 6194 { |
| 6195 /* XEmacs: this originally read: | |
| 6196 | |
| 6197 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1))) | |
| 6198 break; | |
| 6199 | |
| 6200 */ | |
| 460 | 6201 re_char *dtmp = POS_AFTER_GAP_UNSAFE (d); |
| 867 | 6202 Ichar emch = itext_ichar_fmt (dtmp, fmt, lispobj); |
| 1333 | 6203 int tempres; |
| 1347 | 6204 #ifdef emacs |
| 6205 Charxpos charpos = offset_to_charxpos (lispobj, PTR_TO_OFFSET (d)); | |
| 6206 #endif | |
| 1333 | 6207 BEGIN_REGEX_MALLOC_OK (); |
| 460 | 6208 #ifdef emacs |
| 826 | 6209 UPDATE_SYNTAX_CACHE (scache, charpos); |
| 460 | 6210 #endif |
| 1333 | 6211 tempres = (SYNTAX_FROM_CACHE (scache, emch) != Sword); |
| 6212 END_REGEX_MALLOC_OK (); | |
| 6213 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
| 6214 if (tempres) | |
| 428 | 6215 goto fail; |
| 6216 if (AT_STRINGS_BEG (d)) | |
| 6217 break; | |
| 460 | 6218 dtmp = POS_BEFORE_GAP_UNSAFE (d); |
| 867 | 6219 DEC_IBYTEPTR_FMT (dtmp, fmt); |
| 6220 emch = itext_ichar_fmt (dtmp, fmt, lispobj); | |
| 1333 | 6221 BEGIN_REGEX_MALLOC_OK (); |
| 460 | 6222 #ifdef emacs |
| 826 | 6223 UPDATE_SYNTAX_CACHE_BACKWARD (scache, charpos - 1); |
| 460 | 6224 #endif |
| 1333 | 6225 tempres = (SYNTAX_FROM_CACHE (scache, emch) != Sword); |
| 6226 END_REGEX_MALLOC_OK (); | |
| 6227 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
| 6228 if (tempres) | |
| 428 | 6229 break; |
| 6230 goto fail; | |
| 6231 } | |
| 6232 | |
| 6233 case wordend: | |
| 5041 | 6234 DEBUG_MATCH_PRINT1 ("EXECUTING wordend.\n"); |
| 460 | 6235 if (AT_STRINGS_BEG (d)) |
| 6236 goto fail; | |
| 428 | 6237 { |
| 6238 /* XEmacs: this originally read: | |
| 6239 | |
| 6240 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1) | |
| 6241 && (!WORDCHAR_P (d) || AT_STRINGS_END (d))) | |
| 6242 break; | |
| 6243 | |
| 6244 The or condition is incorrect (reversed). | |
| 6245 */ | |
| 460 | 6246 re_char *dtmp; |
| 867 | 6247 Ichar emch; |
| 1333 | 6248 int tempres; |
| 460 | 6249 #ifdef emacs |
| 826 | 6250 Charxpos charpos = offset_to_charxpos (lispobj, PTR_TO_OFFSET (d)); |
| 1347 | 6251 BEGIN_REGEX_MALLOC_OK (); |
| 826 | 6252 UPDATE_SYNTAX_CACHE (scache, charpos); |
| 1333 | 6253 END_REGEX_MALLOC_OK (); |
| 6254 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
| 1347 | 6255 #endif |
| 460 | 6256 dtmp = POS_BEFORE_GAP_UNSAFE (d); |
| 867 | 6257 DEC_IBYTEPTR_FMT (dtmp, fmt); |
| 6258 emch = itext_ichar_fmt (dtmp, fmt, lispobj); | |
| 1333 | 6259 BEGIN_REGEX_MALLOC_OK (); |
| 6260 tempres = (SYNTAX_FROM_CACHE (scache, emch) != Sword); | |
| 6261 END_REGEX_MALLOC_OK (); | |
| 6262 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
| 6263 if (tempres) | |
| 428 | 6264 goto fail; |
| 6265 if (AT_STRINGS_END (d)) | |
| 6266 break; | |
| 460 | 6267 dtmp = POS_AFTER_GAP_UNSAFE (d); |
| 867 | 6268 emch = itext_ichar_fmt (dtmp, fmt, lispobj); |
| 1333 | 6269 BEGIN_REGEX_MALLOC_OK (); |
| 460 | 6270 #ifdef emacs |
| 826 | 6271 UPDATE_SYNTAX_CACHE_FORWARD (scache, charpos + 1); |
| 460 | 6272 #endif |
| 1333 | 6273 tempres = (SYNTAX_FROM_CACHE (scache, emch) != Sword); |
| 6274 END_REGEX_MALLOC_OK (); | |
| 6275 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
| 6276 if (tempres) | |
| 428 | 6277 break; |
| 6278 goto fail; | |
| 6279 } | |
| 6280 | |
| 6281 #ifdef emacs | |
| 6282 case before_dot: | |
| 5041 | 6283 DEBUG_MATCH_PRINT1 ("EXECUTING before_dot.\n"); |
| 826 | 6284 if (!BUFFERP (lispobj) |
| 6285 || (BUF_PTR_BYTE_POS (XBUFFER (lispobj), (unsigned char *) d) | |
| 6286 >= BUF_PT (XBUFFER (lispobj)))) | |
| 428 | 6287 goto fail; |
| 6288 break; | |
| 6289 | |
| 6290 case at_dot: | |
| 5041 | 6291 DEBUG_MATCH_PRINT1 ("EXECUTING at_dot.\n"); |
| 826 | 6292 if (!BUFFERP (lispobj) |
| 6293 || (BUF_PTR_BYTE_POS (XBUFFER (lispobj), (unsigned char *) d) | |
| 6294 != BUF_PT (XBUFFER (lispobj)))) | |
| 428 | 6295 goto fail; |
| 6296 break; | |
| 6297 | |
| 6298 case after_dot: | |
| 5041 | 6299 DEBUG_MATCH_PRINT1 ("EXECUTING after_dot.\n"); |
| 826 | 6300 if (!BUFFERP (lispobj) |
| 6301 || (BUF_PTR_BYTE_POS (XBUFFER (lispobj), (unsigned char *) d) | |
| 6302 <= BUF_PT (XBUFFER (lispobj)))) | |
| 428 | 6303 goto fail; |
| 6304 break; | |
| 6305 | |
| 6306 case syntaxspec: | |
| 5041 | 6307 DEBUG_MATCH_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt); |
| 428 | 6308 mcnt = *p++; |
| 6309 goto matchsyntax; | |
| 6310 | |
| 6311 case wordchar: | |
| 5041 | 6312 DEBUG_MATCH_PRINT1 ("EXECUTING Emacs wordchar.\n"); |
| 428 | 6313 mcnt = (int) Sword; |
| 6314 matchsyntax: | |
| 6315 should_succeed = 1; | |
| 6316 matchornotsyntax: | |
| 6317 { | |
| 6318 int matches; | |
| 867 | 6319 Ichar emch; |
| 428 | 6320 |
| 450 | 6321 REGEX_PREFETCH (); |
| 1333 | 6322 BEGIN_REGEX_MALLOC_OK (); |
| 826 | 6323 UPDATE_SYNTAX_CACHE |
| 6324 (scache, offset_to_charxpos (lispobj, PTR_TO_OFFSET (d))); | |
| 1333 | 6325 END_REGEX_MALLOC_OK (); |
| 6326 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
| 826 | 6327 |
| 867 | 6328 emch = itext_ichar_fmt (d, fmt, lispobj); |
| 1333 | 6329 BEGIN_REGEX_MALLOC_OK (); |
| 826 | 6330 matches = (SYNTAX_FROM_CACHE (scache, emch) == |
| 6331 (enum syntaxcode) mcnt); | |
| 1333 | 6332 END_REGEX_MALLOC_OK (); |
| 6333 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
| 867 | 6334 INC_IBYTEPTR_FMT (d, fmt); |
| 428 | 6335 if (matches != should_succeed) |
| 6336 goto fail; | |
| 6337 SET_REGS_MATCHED (); | |
| 6338 } | |
| 6339 break; | |
| 6340 | |
| 6341 case notsyntaxspec: | |
| 5041 | 6342 DEBUG_MATCH_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt); |
| 428 | 6343 mcnt = *p++; |
| 6344 goto matchnotsyntax; | |
| 6345 | |
| 6346 case notwordchar: | |
| 5041 | 6347 DEBUG_MATCH_PRINT1 ("EXECUTING Emacs notwordchar.\n"); |
| 428 | 6348 mcnt = (int) Sword; |
| 6349 matchnotsyntax: | |
| 6350 should_succeed = 0; | |
| 6351 goto matchornotsyntax; | |
| 6352 | |
| 6353 #ifdef MULE | |
| 6354 /* 97/2/17 jhod Mule category code patch */ | |
| 6355 case categoryspec: | |
| 6356 should_succeed = 1; | |
| 6357 matchornotcategory: | |
| 6358 { | |
| 867 | 6359 Ichar emch; |
| 428 | 6360 |
| 6361 mcnt = *p++; | |
| 450 | 6362 REGEX_PREFETCH (); |
| 867 | 6363 emch = itext_ichar_fmt (d, fmt, lispobj); |
| 6364 INC_IBYTEPTR_FMT (d, fmt); | |
| 826 | 6365 if (check_category_char (emch, BUFFER_CATEGORY_TABLE (lispbuf), |
| 6366 mcnt, should_succeed)) | |
| 428 | 6367 goto fail; |
| 6368 SET_REGS_MATCHED (); | |
| 6369 } | |
| 6370 break; | |
| 6371 | |
| 6372 case notcategoryspec: | |
| 6373 should_succeed = 0; | |
| 6374 goto matchornotcategory; | |
| 6375 /* end of category patch */ | |
| 6376 #endif /* MULE */ | |
| 6377 #else /* not emacs */ | |
| 6378 case wordchar: | |
| 5041 | 6379 DEBUG_MATCH_PRINT1 ("EXECUTING non-Emacs wordchar.\n"); |
| 450 | 6380 REGEX_PREFETCH (); |
| 826 | 6381 if (!WORDCHAR_P ((int) (*d))) |
| 428 | 6382 goto fail; |
| 6383 SET_REGS_MATCHED (); | |
| 6384 d++; | |
| 6385 break; | |
| 6386 | |
| 6387 case notwordchar: /* Non-Emacs build: dual of the wordchar case above -- match one byte that is NOT a word constituent, i.e. fail when WORDCHAR_P holds. */ | |
| 5041 | 6388 DEBUG_MATCH_PRINT1 ("EXECUTING non-Emacs notwordchar.\n"); |
| 450 | 6389 REGEX_PREFETCH (); |
| 826 | 6390 if (WORDCHAR_P ((int) (*d))) |
| 428 | 6391 goto fail; |
| 6392 SET_REGS_MATCHED (); | |
| 6393 d++; | |
| 6394 break; | |
| 446 | 6395 #endif /* emacs */ |
| 428 | 6396 |
| 6397 default: | |
| 2500 | 6398 ABORT (); |
| 428 | 6399 } |
| 6400 continue; /* Successfully executed one pattern command; keep going. */ | |
| 6401 | |
| 6402 | |
| 6403 /* We goto here if a matching operation fails. */ | |
| 6404 fail: | |
| 6405 if (!FAIL_STACK_EMPTY ()) | |
| 6406 { /* A restart point is known. Restore to that state. */ | |
| 5041 | 6407 DEBUG_MATCH_PRINT1 ("\nFAIL:\n"); |
| 428 | 6408 POP_FAILURE_POINT (d, p, |
| 6409 lowest_active_reg, highest_active_reg, | |
| 6410 regstart, regend, reg_info); | |
| 6411 | |
| 6412 /* If this failure point is a dummy, try the next one. */ | |
| 6413 if (!p) | |
| 6414 goto fail; | |
| 6415 | |
| 6416 /* If we failed to the end of the pattern, don't examine *p. */ | |
| 6417 assert (p <= pend); | |
| 6418 if (p < pend) | |
| 6419 { | |
| 460 | 6420 re_bool is_a_jump_n = false; |
| 428 | 6421 |
| 6422 /* If failed to a backwards jump that's part of a repetition | |
| 6423 loop, need to pop this failure point and use the next one. */ | |
| 6424 switch ((re_opcode_t) *p) | |
| 6425 { | |
| 6426 case jump_n: | |
| 6427 is_a_jump_n = true; | |
| 6428 case maybe_pop_jump: | |
| 6429 case pop_failure_jump: | |
| 6430 case jump: | |
| 6431 p1 = p + 1; | |
| 6432 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
| 6433 p1 += mcnt; | |
| 6434 | |
| 6435 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n) | |
| 6436 || (!is_a_jump_n | |
| 6437 && (re_opcode_t) *p1 == on_failure_jump)) | |
| 6438 goto fail; | |
| 6439 break; | |
| 6440 default: | |
| 6441 /* do nothing */ ; | |
| 6442 } | |
| 6443 } | |
| 6444 | |
| 6445 if (d >= string1 && d <= end1) | |
| 6446 dend = end_match_1; | |
| 6447 } | |
| 6448 else | |
| 6449 break; /* Matching at this starting point really fails. */ | |
| 6450 } /* for (;;) */ | |
| 6451 | |
| 6452 if (best_regs_set) | |
| 6453 goto restore_best_regs; | |
| 6454 | |
| 6455 FREE_VARIABLES (); | |
| 6456 | |
| 6457 return -1; /* Failure to match. */ | |
| 1333 | 6458 } /* re_match_2_internal */ |
| 428 | 6459 |
| 6460 /* Subroutine definitions for re_match_2. */ | |
| 6461 | |
| 6462 | |
| 6463 /* We are passed P pointing to a register number after a start_memory. | |
| 6464 | |
| 6465 Return true if the pattern up to the corresponding stop_memory can | |
| 6466 match the empty string, and false otherwise. | |
| 6467 | |
| 6468 If we find the matching stop_memory, set *P to point one past its register number and return true. | |
| 6469 On every other path (including reaching END first) *P is left unmodified and we return false. | |
| 6470 | |
| 6471 We don't handle duplicates properly (yet). */ | |
| 6472 | |
| 460 | 6473 static re_bool |
| 428 | 6474 group_match_null_string_p (unsigned char **p, unsigned char *end, |
| 6475 register_info_type *reg_info) | |
| 6476 { | |
| 6477 int mcnt; | |
| 6478 /* Skip the two argument bytes that follow the start_memory opcode. */ | |
| 6479 unsigned char *p1 = *p + 2; | |
| 6480 | |
| 6481 while (p1 < end) | |
| 6482 { | |
| 6483 /* Skip over opcodes that can match the empty string, and return true | |
| 6484 or false, as appropriate, when we get to one that can't, or to the | |
| 6485 matching stop_memory. */ | |
| 6486 | |
| 6487 switch ((re_opcode_t) *p1) | |
| 6488 { | |
| 6489 /* Could be either a loop or a series of alternatives. */ | |
| 6490 case on_failure_jump: | |
| 6491 p1++; | |
| 6492 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
| 6493 | |
| 6494 /* Only examine further if this is not a jump backwards in the pattern; | |
| 6495 for a negative offset we have merely stepped past the opcode and its number. */ | |
| 6496 | |
| 6497 if (mcnt >= 0) | |
| 6498 { | |
| 6499 /* Go through the on_failure_jumps of the alternatives, | |
| 6500 seeing if any of the alternatives cannot match the empty string. | |
| 6501 The last alternative starts with only a jump, | |
| 6502 whereas the rest start with on_failure_jump and end | |
| 6503 with a jump, e.g., here is the pattern for `a|b|c': | |
| 6504 | |
| 6505 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6 | |
| 6506 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3 | |
| 6507 /exactn/1/c | |
| 6508 | |
| 6509 So, we have to first go through the first (n-1) | |
| 6510 alternatives and then deal with the last one separately. */ | |
| 6511 | |
| 6512 | |
| 6513 /* Deal with the first (n-1) alternatives, which start | |
| 6514 with an on_failure_jump (see above) that jumps to right | |
| 6515 past a jump_past_alt. */ | |
| 6516 | |
| 6517 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt) | |
| 6518 { | |
| 6519 /* `mcnt' holds how many bytes long the alternative | |
| 6520 is, including the ending `jump_past_alt' and | |
| 6521 its number; the `- 3' below leaves that trailing jump out. */ | |
| 6522 | |
| 6523 if (!alt_match_null_string_p (p1, p1 + mcnt - 3, | |
| 6524 reg_info)) | |
| 6525 return false; | |
| 6526 | |
| 6527 /* Move to right after this alternative, including the | |
| 6528 jump_past_alt. */ | |
| 6529 p1 += mcnt; | |
| 6530 | |
| 6531 /* Break if it's the beginning of an n-th alternative | |
| 6532 that doesn't begin with an on_failure_jump. */ | |
| 6533 if ((re_opcode_t) *p1 != on_failure_jump) | |
| 6534 break; | |
| 6535 | |
| 6536 /* This on_failure_jump may itself belong to the n-th alternative rather than | |
| 6537 head another one: check whether its target is preceded by a jump_past_alt. */ | |
| 6538 p1++; | |
| 6539 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
| 6540 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt) | |
| 6541 { | |
| 6542 /* Back up over the opcode and number just read, to the beginning of the n-th alternative. */ | |
| 6543 p1 -= 3; | |
| 6544 break; | |
| 6545 } | |
| 6546 } | |
| 6547 | |
| 6548 /* Deal with the last alternative: go back and read the number | |
| 6549 of the `jump_past_alt' just before it (at p1 - 2). `mcnt' then | |
| 6550 contains the length of the alternative. */ | |
| 6551 EXTRACT_NUMBER (mcnt, p1 - 2); | |
| 6552 | |
| 6553 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info)) | |
| 6554 return false; | |
| 6555 | |
| 6556 p1 += mcnt; /* Get past the n-th alternative. */ | |
| 6557 } /* if mcnt >= 0 */ | |
| 6558 break; | |
| 6559 | |
| 6560 | |
| 6561 case stop_memory: | |
| 6562 assert (p1[1] == **p); /* must close the same register number P points at */ | |
| 6563 *p = p1 + 2; | |
| 6564 return true; | |
| 6565 | |
| 6566 | |
| 6567 default: | |
| 6568 if (!common_op_match_null_string_p (&p1, end, reg_info)) | |
| 6569 return false; | |
| 6570 } | |
| 6571 } /* while p1 < end */ | |
| 6572 | |
| 6573 return false; | |
| 6574 } /* group_match_null_string_p */ | |
| 6575 | |
| 6576 | |
| 6577 /* Similar to group_match_null_string_p, but doesn't deal with alternatives: | |
| 6578 It expects P to be the first byte of a single alternative and END one | |
| 6579 byte past the last. The alternative can contain groups. Returns true if the scan reaches END without meeting an opcode that cannot match the empty string, false otherwise. */ | |
| 6580 | |
| 460 | 6581 static re_bool |
| 428 | 6582 alt_match_null_string_p (unsigned char *p, unsigned char *end, |
| 6583 register_info_type *reg_info) | |
| 6584 { | |
| 6585 int mcnt; | |
| 6586 unsigned char *p1 = p; | |
| 6587 | |
| 6588 while (p1 < end) | |
| 6589 { | |
| 6590 /* Skip over opcodes that can match the empty string, and return false | |
| 6591 when we get to one that can't. */ | |
| 6592 | |
| 6593 switch ((re_opcode_t) *p1) | |
| 6594 { | |
| 6595 /* It's a loop: step past the opcode and its number, then past the MCNT bytes the jump skips. */ | |
| 6596 case on_failure_jump: | |
| 6597 p1++; | |
| 6598 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
| 6599 p1 += mcnt; | |
| 6600 break; | |
| 6601 | |
| 6602 default: | |
| 6603 if (!common_op_match_null_string_p (&p1, end, reg_info)) | |
| 6604 return false; | |
| 6605 } | |
| 6606 } /* while p1 < end */ | |
| 6607 | |
| 6608 return true; | |
| 6609 } /* alt_match_null_string_p */ | |
| 6610 | |
| 6611 | |
| 6612 /* Deals with the ops common to group_match_null_string_p and | |
| 6613 alt_match_null_string_p. | |
| 6614 | |
| 6615 Sets P to one after the op and its arguments, if any. */ | |
| 6616 | |
| 460 | 6617 static re_bool |
| 428 | 6618 common_op_match_null_string_p (unsigned char **p, unsigned char *end, |
| 6619 register_info_type *reg_info) | |
| 6620 { | |
| 6621 int mcnt; | |
| 460 | 6622 re_bool ret; |
| 428 | 6623 int reg_no; |
| 6624 unsigned char *p1 = *p; | |
| 6625 | |
| 6626 switch ((re_opcode_t) *p1++) | |
| 6627 { | |
| 6628 case no_op: | |
| 6629 case begline: | |
| 6630 case endline: | |
| 6631 case begbuf: | |
| 6632 case endbuf: | |
| 6633 case wordbeg: | |
| 6634 case wordend: | |
| 6635 case wordbound: | |
| 6636 case notwordbound: | |
| 6637 #ifdef emacs | |
| 6638 case before_dot: | |
| 6639 case at_dot: | |
| 6640 case after_dot: | |
| 6641 #endif | |
| 6642 break; | |
| 6643 | |
| 6644 case start_memory: | |
| 6645 reg_no = *p1; | |
| 6646 assert (reg_no > 0 && reg_no <= MAX_REGNUM); | |
| 6647 ret = group_match_null_string_p (&p1, end, reg_info); | |
| 6648 | |
| 6649 /* Have to set this here in case we're checking a group which | |
| 6650 contains a group and a back reference to it. */ | |
| 6651 | |
| 6652 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE) | |
| 6653 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret; | |
| 6654 | |
| 6655 if (!ret) | |
| 6656 return false; | |
| 6657 break; | |
| 6658 | |
| 6659 /* If this is an optimized succeed_n for zero times, make the jump. */ | |
| 6660 case jump: | |
| 6661 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
| 6662 if (mcnt >= 0) | |
| 6663 p1 += mcnt; | |
| 6664 else | |
| 6665 return false; | |
| 6666 break; | |
| 6667 | |
| 6668 case succeed_n: | |
| 6669 /* Get to the number of times to succeed. */ | |
| 6670 p1 += 2; | |
| 6671 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
| 6672 | |
| 6673 if (mcnt == 0) | |
| 6674 { | |
| 6675 p1 -= 4; | |
| 6676 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
| 6677 p1 += mcnt; | |
| 6678 } | |
| 6679 else | |
| 6680 return false; | |
| 6681 break; | |
| 6682 | |
| 6683 case duplicate: | |
| 6684 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1])) | |
| 6685 return false; | |
| 6686 break; | |
| 6687 | |
| 6688 case set_number_at: | |
| 6689 p1 += 4; | |
| 6690 | |
| 6691 default: | |
| 6692 /* All other opcodes mean we cannot match the empty string. */ | |
| 6693 return false; | |
| 6694 } | |
| 6695 | |
| 6696 *p = p1; | |
| 6697 return true; | |
| 6698 } /* common_op_match_null_string_p */ | |
| 6699 | |
| 6700 | |
| 6701 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN | |
| 6702 bytes; nonzero otherwise. */ | |
| 6703 | |
| 6704 static int | |
| 446 | 6705 bcmp_translate (re_char *s1, re_char *s2, |
| 826 | 6706 REGISTER int len, RE_TRANSLATE_TYPE translate |
| 6707 #ifdef emacs | |
| 2333 | 6708 , Internal_Format USED_IF_MULE (fmt), |
| 6709 Lisp_Object USED_IF_MULE (lispobj) | |
| 826 | 6710 #endif |
| 6711 ) | |
| 428 | 6712 { |
| 826 | 6713 REGISTER re_char *p1 = s1, *p2 = s2; |
| 446 | 6714 #ifdef MULE |
| 826 | 6715 re_char *p1_end = s1 + len; |
| 6716 re_char *p2_end = s2 + len; | |
| 446 | 6717 |
| 6718 while (p1 != p1_end && p2 != p2_end) | |
| 6719 { | |
| 867 | 6720 Ichar p1_ch, p2_ch; |
| 6721 | |
| 6722 p1_ch = itext_ichar_fmt (p1, fmt, lispobj); | |
| 6723 p2_ch = itext_ichar_fmt (p2, fmt, lispobj); | |
| 826 | 6724 |
| 6725 if (RE_TRANSLATE_1 (p1_ch) | |
| 6726 != RE_TRANSLATE_1 (p2_ch)) | |
| 446 | 6727 return 1; |
| 867 | 6728 INC_IBYTEPTR_FMT (p1, fmt); |
| 6729 INC_IBYTEPTR_FMT (p2, fmt); | |
| 446 | 6730 } |
| 6731 #else /* not MULE */ | |
| 428 | 6732 while (len) |
| 6733 { | |
| 826 | 6734 if (RE_TRANSLATE_1 (*p1++) != RE_TRANSLATE_1 (*p2++)) return 1; |
| 428 | 6735 len--; |
| 6736 } | |
| 446 | 6737 #endif /* MULE */ |
| 428 | 6738 return 0; |
| 6739 } | |
| 6740 | |
| 6741 /* Entry points for GNU code. */ | |
| 6742 | |
| 6743 /* re_compile_pattern is the GNU regular expression compiler: it | |
| 6744 compiles PATTERN (of length SIZE) and puts the result in BUFP. | |
| 6745 Returns 0 if the pattern was valid, otherwise an error string. | |
| 6746 | |
| 6747 Assumes the `allocated' (and perhaps `buffer') and `translate' fields | |
| 6748 are set in BUFP on entry. | |
| 6749 | |
| 6750 We call regex_compile to do the actual compilation. */ | |
| 6751 | |
| 442 | 6752 const char * |
| 6753 re_compile_pattern (const char *pattern, int length, | |
| 428 | 6754 struct re_pattern_buffer *bufp) |
| 6755 { | |
| 6756 reg_errcode_t ret; | |
| 6757 | |
| 6758 /* GNU code is written to assume at least RE_NREGS registers will be set | |
| 6759 (and at least one extra will be -1). */ | |
| 6760 bufp->regs_allocated = REGS_UNALLOCATED; | |
| 6761 | |
| 6762 /* And GNU code determines whether or not to get register information | |
| 6763 by passing null for the REGS argument to re_match, etc., not by | |
| 6764 setting no_sub. */ | |
| 6765 bufp->no_sub = 0; | |
| 6766 | |
| 6767 /* Match anchors at newline. */ | |
| 6768 bufp->newline_anchor = 1; | |
| 6769 | |
| 826 | 6770 ret = regex_compile ((unsigned char *) pattern, length, re_syntax_options, |
| 6771 bufp); | |
| 428 | 6772 |
| 6773 if (!ret) | |
| 6774 return NULL; | |
| 6775 return gettext (re_error_msgid[(int) ret]); | |
| 6776 } | |
| 6777 | |
| 6778 /* Entry points compatible with 4.2 BSD regex library. We don't define | |
| 6779 them unless specifically requested. */ | |
| 6780 | |
| 6781 #ifdef _REGEX_RE_COMP | |
| 6782 | |
| 6783 /* BSD has one and only one pattern buffer. */ | |
| 6784 static struct re_pattern_buffer re_comp_buf; | |
| 6785 | |
| 6786 char * | |
| 442 | 6787 re_comp (const char *s) |
| 428 | 6788 { |
| 6789 reg_errcode_t ret; | |
| 6790 | |
| 6791 if (!s) | |
| 6792 { | |
| 6793 if (!re_comp_buf.buffer) | |
| 6794 return gettext ("No previous regular expression"); | |
| 6795 return 0; | |
| 6796 } | |
| 6797 | |
| 6798 if (!re_comp_buf.buffer) | |
| 6799 { | |
| 1333 | 6800 re_comp_buf.buffer = (unsigned char *) xmalloc (200); |
| 428 | 6801 if (re_comp_buf.buffer == NULL) |
| 6802 return gettext (re_error_msgid[(int) REG_ESPACE]); | |
| 6803 re_comp_buf.allocated = 200; | |
| 6804 | |
| 1333 | 6805 re_comp_buf.fastmap = (char *) xmalloc (1 << BYTEWIDTH); |
| 428 | 6806 if (re_comp_buf.fastmap == NULL) |
| 6807 return gettext (re_error_msgid[(int) REG_ESPACE]); | |
| 6808 } | |
| 6809 | |
| 6810 /* Since `re_exec' always passes NULL for the `regs' argument, we | |
| 6811 don't need to initialize the pattern buffer fields which affect it. */ | |
| 6812 | |
| 6813 /* Match anchors at newlines. */ | |
| 6814 re_comp_buf.newline_anchor = 1; | |
| 6815 | |
| 826 | 6816 ret = regex_compile ((unsigned char *)s, strlen (s), re_syntax_options, |
| 6817 &re_comp_buf); | |
| 428 | 6818 |
| 6819 if (!ret) | |
| 6820 return NULL; | |
| 6821 | |
| 442 | 6822 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ |
| 428 | 6823 return (char *) gettext (re_error_msgid[(int) ret]); |
| 6824 } | |
| 6825 | |
| 6826 | |
| 6827 int | |
| 442 | 6828 re_exec (const char *s) |
| 428 | 6829 { |
| 442 | 6830 const int len = strlen (s); |
| 428 | 6831 return |
| 6832 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0); | |
| 6833 } | |
| 6834 #endif /* _REGEX_RE_COMP */ | |
| 6835 | |
| 6836 /* POSIX.2 functions. Don't define these for Emacs. */ | |
| 6837 | |
| 6838 #ifndef emacs | |
| 6839 | |
| 6840 /* regcomp takes a regular expression as a string and compiles it. | |
| 6841 | |
| 6842 PREG is a regex_t *. We do not expect any fields to be initialized, | |
| 6843 since POSIX says we shouldn't. Thus, we set | |
| 6844 | |
| 6845 `buffer' to the compiled pattern; | |
| 6846 `used' to the length of the compiled pattern; | |
| 6847 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the | |
| 6848 REG_EXTENDED bit in CFLAGS is set; otherwise, to | |
| 6849 RE_SYNTAX_POSIX_BASIC; | |
| 6850 `newline_anchor' to REG_NEWLINE being set in CFLAGS; | |
| 6851 `fastmap' and `fastmap_accurate' to zero; | |
| 6852 `re_nsub' to the number of subexpressions in PATTERN. | |
| 502 | 6853 (non-shy of course. POSIX probably doesn't know about |
| 6854 shy ones, and in any case they should be invisible.) | |
| 428 | 6855 |
| 6856 PATTERN is the address of the pattern string. | |
| 6857 | |
| 6858 CFLAGS is a series of bits which affect compilation. | |
| 6859 | |
| 6860 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we | |
| 6861 use POSIX basic syntax. | |
| 6862 | |
| 6863 If REG_NEWLINE is set, then . and [^...] don't match newline. | |
| 6864 Also, regexec will try a match beginning after every newline. | |
| 6865 | |
| 6866 If REG_ICASE is set, then we considers upper- and lowercase | |
| 6867 versions of letters to be equivalent when matching. | |
| 6868 | |
| 6869 If REG_NOSUB is set, then when PREG is passed to regexec, that | |
| 6870 routine will report only success or failure, and nothing about the | |
| 6871 registers. | |
| 6872 | |
| 6873 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for | |
| 6874 the return codes and their meanings.) */ | |
| 6875 | |
| 6876 int | |
| 442 | 6877 regcomp (regex_t *preg, const char *pattern, int cflags) |
| 428 | 6878 { |
| 6879 reg_errcode_t ret; | |
| 647 | 6880 unsigned int syntax |
| 428 | 6881 = (cflags & REG_EXTENDED) ? |
| 6882 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC; | |
| 6883 | |
| 6884 /* regex_compile will allocate the space for the compiled pattern. */ | |
| 6885 preg->buffer = 0; | |
| 6886 preg->allocated = 0; | |
| 6887 preg->used = 0; | |
| 6888 | |
| 6889 /* Don't bother to use a fastmap when searching. This simplifies the | |
| 6890 REG_NEWLINE case: if we used a fastmap, we'd have to put all the | |
| 6891 characters after newlines into the fastmap. This way, we just try | |
| 6892 every character. */ | |
| 6893 preg->fastmap = 0; | |
| 6894 | |
| 6895 if (cflags & REG_ICASE) | |
| 6896 { | |
| 647 | 6897 int i; |
| 428 | 6898 |
| 1333 | 6899 preg->translate = (char *) xmalloc (CHAR_SET_SIZE); |
| 428 | 6900 if (preg->translate == NULL) |
| 6901 return (int) REG_ESPACE; | |
| 6902 | |
| 6903 /* Map uppercase characters to corresponding lowercase ones. */ | |
| 6904 for (i = 0; i < CHAR_SET_SIZE; i++) | |
| 6905 preg->translate[i] = ISUPPER (i) ? tolower (i) : i; | |
| 6906 } | |
| 6907 else | |
| 6908 preg->translate = NULL; | |
| 6909 | |
| 6910 /* If REG_NEWLINE is set, newlines are treated differently. */ | |
| 6911 if (cflags & REG_NEWLINE) | |
| 6912 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ | |
| 6913 syntax &= ~RE_DOT_NEWLINE; | |
| 6914 syntax |= RE_HAT_LISTS_NOT_NEWLINE; | |
| 6915 /* It also changes the matching behavior. */ | |
| 6916 preg->newline_anchor = 1; | |
| 6917 } | |
| 6918 else | |
| 6919 preg->newline_anchor = 0; | |
| 6920 | |
| 6921 preg->no_sub = !!(cflags & REG_NOSUB); | |
| 6922 | |
| 6923 /* POSIX says a null character in the pattern terminates it, so we | |
| 6924 can use strlen here in compiling the pattern. */ | |
| 446 | 6925 ret = regex_compile ((unsigned char *) pattern, strlen (pattern), syntax, preg); |
| 428 | 6926 |
| 6927 /* POSIX doesn't distinguish between an unmatched open-group and an | |
| 6928 unmatched close-group: both are REG_EPAREN. */ | |
| 6929 if (ret == REG_ERPAREN) ret = REG_EPAREN; | |
| 6930 | |
| 6931 return (int) ret; | |
| 6932 } | |
| 6933 | |
| 6934 | |
| 6935 /* regexec searches for a given pattern, specified by PREG, in the | |
| 6936 string STRING. | |
| 6937 | |
| 6938 If NMATCH is zero or REG_NOSUB was set in the cflags argument to | |
| 6939 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at | |
| 6940 least NMATCH elements, and we set them to the offsets of the | |
| 6941 corresponding matched substrings. | |
| 6942 | |
| 6943 EFLAGS specifies `execution flags' which affect matching: if | |
| 6944 REG_NOTBOL is set, then ^ does not match at the beginning of the | |
| 6945 string; if REG_NOTEOL is set, then $ does not match at the end. | |
| 6946 | |
| 6947 We return 0 if we find a match and REG_NOMATCH if not. */ | |
| 6948 | |
| 6949 int | |
| 442 | 6950 regexec (const regex_t *preg, const char *string, size_t nmatch, |
| 428 | 6951 regmatch_t pmatch[], int eflags) |
| 6952 { | |
| 6953 int ret; | |
| 6954 struct re_registers regs; | |
| 6955 regex_t private_preg; | |
| 6956 int len = strlen (string); | |
| 460 | 6957 re_bool want_reg_info = !preg->no_sub && nmatch > 0; |
| 428 | 6958 |
| 6959 private_preg = *preg; | |
| 6960 | |
| 6961 private_preg.not_bol = !!(eflags & REG_NOTBOL); | |
| 6962 private_preg.not_eol = !!(eflags & REG_NOTEOL); | |
| 6963 | |
| 6964 /* The user has told us exactly how many registers to return | |
| 6965 information about, via `nmatch'. We have to pass that on to the | |
| 6966 matching routines. */ | |
| 6967 private_preg.regs_allocated = REGS_FIXED; | |
| 6968 | |
| 6969 if (want_reg_info) | |
| 6970 { | |
| 647 | 6971 regs.num_regs = (int) nmatch; |
| 6972 regs.start = TALLOC ((int) nmatch, regoff_t); | |
| 6973 regs.end = TALLOC ((int) nmatch, regoff_t); | |
| 428 | 6974 if (regs.start == NULL || regs.end == NULL) |
| 6975 return (int) REG_NOMATCH; | |
| 6976 } | |
| 6977 | |
| 6978 /* Perform the searching operation. */ | |
| 6979 ret = re_search (&private_preg, string, len, | |
| 6980 /* start: */ 0, /* range: */ len, | |
| 6981 want_reg_info ? ®s : (struct re_registers *) 0); | |
| 6982 | |
| 6983 /* Copy the register information to the POSIX structure. */ | |
| 6984 if (want_reg_info) | |
| 6985 { | |
| 6986 if (ret >= 0) | |
| 6987 { | |
| 647 | 6988 int r; |
| 6989 | |
| 6990 for (r = 0; r < (int) nmatch; r++) | |
| 428 | 6991 { |
| 6992 pmatch[r].rm_so = regs.start[r]; | |
| 6993 pmatch[r].rm_eo = regs.end[r]; | |
| 6994 } | |
| 6995 } | |
| 6996 | |
| 6997 /* If we needed the temporary register info, free the space now. */ | |
|
4976
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4832
diff
changeset
|
6998 xfree (regs.start); |
|
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4832
diff
changeset
|
6999 xfree (regs.end); |
| 428 | 7000 } |
| 7001 | |
| 7002 /* We want zero return to mean success, unlike `re_search'. */ | |
| 7003 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH; | |
| 7004 } | |
| 7005 | |
| 7006 | |
| 7007 /* Returns a message corresponding to an error code, ERRCODE, returned | |
| 7008 from either regcomp or regexec. We don't use PREG here. */ | |
| 7009 | |
| 7010 size_t | |
| 2286 | 7011 regerror (int errcode, const regex_t *UNUSED (preg), char *errbuf, |
| 647 | 7012 size_t errbuf_size) |
| 428 | 7013 { |
| 442 | 7014 const char *msg; |
| 665 | 7015 Bytecount msg_size; |
| 428 | 7016 |
| 7017 if (errcode < 0 | |
| 647 | 7018 || errcode >= (int) (sizeof (re_error_msgid) / |
| 7019 sizeof (re_error_msgid[0]))) | |
| 428 | 7020 /* Only error codes returned by the rest of the code should be passed |
| 7021 to this routine. If we are given anything else, or if other regex | |
| 7022 code generates an invalid error code, then the program has a bug. | |
| 7023 Dump core so we can fix it. */ | |
| 2500 | 7024 ABORT (); |
| 428 | 7025 |
| 7026 msg = gettext (re_error_msgid[errcode]); | |
| 7027 | |
| 7028 msg_size = strlen (msg) + 1; /* Includes the null. */ | |
| 7029 | |
| 7030 if (errbuf_size != 0) | |
| 7031 { | |
| 665 | 7032 if (msg_size > (Bytecount) errbuf_size) |
| 428 | 7033 { |
| 7034 strncpy (errbuf, msg, errbuf_size - 1); | |
| 7035 errbuf[errbuf_size - 1] = 0; | |
| 7036 } | |
| 7037 else | |
| 7038 strcpy (errbuf, msg); | |
| 7039 } | |
| 7040 | |
| 647 | 7041 return (size_t) msg_size; |
| 428 | 7042 } |
| 7043 | |
| 7044 | |
| 7045 /* Free dynamically allocated space used by PREG. */ | |
| 7046 | |
| 7047 void | |
| 7048 regfree (regex_t *preg) | |
| 7049 { | |
| 7050 if (preg->buffer != NULL) | |
|
4976
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4832
diff
changeset
|
7051 xfree (preg->buffer); |
| 428 | 7052 preg->buffer = NULL; |
| 7053 | |
| 7054 preg->allocated = 0; | |
| 7055 preg->used = 0; | |
| 7056 | |
| 7057 if (preg->fastmap != NULL) | |
|
4976
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4832
diff
changeset
|
7058 xfree (preg->fastmap); |
| 428 | 7059 preg->fastmap = NULL; |
| 7060 preg->fastmap_accurate = 0; | |
| 7061 | |
| 7062 if (preg->translate != NULL) | |
|
4976
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4832
diff
changeset
|
7063 xfree (preg->translate); |
| 428 | 7064 preg->translate = NULL; |
| 7065 } | |
| 7066 | |
| 7067 #endif /* not emacs */ | |
| 7068 |
