Mercurial > hg > xemacs-beta
annotate src/regex.c @ 4976:16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
-------------------- ChangeLog entries follow: --------------------
src/ChangeLog addition:
2010-02-04 Ben Wing <ben@xemacs.org>
* alloc.c (release_breathing_space):
* alloc.c (resize_string):
* alloc.c (sweep_lcrecords_1):
* alloc.c (SWEEP_FIXED_TYPE_BLOCK_1):
* alloc.c (ADDITIONAL_FREE_compiled_function):
* alloc.c (compact_string_chars):
* alloc.c (ADDITIONAL_FREE_string):
* alloc.c (sweep_strings):
* alloca.c (xemacs_c_alloca):
* alsaplay.c (alsa_play_sound_file):
* buffer.c (init_initial_directory):
* buffer.h:
* buffer.h (BUFFER_FREE):
* console-stream.c (stream_delete_console):
* console-tty.c (free_tty_console_struct):
* data.c (Fnumber_to_string):
* device-gtk.c (gtk_init_device):
* device-gtk.c (free_gtk_device_struct):
* device-gtk.c (gtk_delete_device):
* device-msw.c (mswindows_delete_device):
* device-msw.c (msprinter_delete_device):
* device-tty.c (free_tty_device_struct):
* device-tty.c (tty_delete_device):
* device-x.c (x_init_device):
* device-x.c (free_x_device_struct):
* device-x.c (x_delete_device):
* dialog-msw.c (handle_directory_dialog_box):
* dialog-x.c (dbox_descriptor_to_widget_value):
* dired-msw.c (Fmswindows_insert_directory):
* dired.c (free_user_cache):
* dired.c (user_name_completion_unwind):
* doc.c (unparesseuxify_doc_string):
* doc.c (Fsubstitute_command_keys):
* doprnt.c (emacs_doprnt_1):
* dumper.c (pdump_load_finish):
* dumper.c (pdump_file_free):
* dumper.c (pdump_file_unmap):
* dynarr.c:
* dynarr.c (Dynarr_free):
* editfns.c (uncache_home_directory):
* editfns.c (Fset_time_zone_rule):
* elhash.c:
* elhash.c (pdump_reorganize_hash_table):
* elhash.c (maphash_unwind):
* emacs.c (make_arg_list_1):
* emacs.c (free_argc_argv):
* emacs.c (sort_args):
* emacs.c (Frunning_temacs_p):
* emodules.c (attempt_module_delete):
* eval.c (free_pointer):
* event-Xt.c (unselect_filedesc):
* event-Xt.c (emacs_Xt_select_process):
* event-gtk.c (unselect_filedesc):
* event-gtk.c (dragndrop_data_received):
* event-msw.c (winsock_closer):
* event-msw.c (mswindows_dde_callback):
* event-msw.c (mswindows_wnd_proc):
* event-stream.c (finalize_command_builder):
* event-stream.c (free_command_builder):
* extents.c (free_gap_array):
* extents.c (free_extent_list):
* extents.c (free_soe):
* extents.c (extent_fragment_delete):
* extents.c (extent_priority_sort_function):
* file-coding.c (make_coding_system_1):
* file-coding.c (coding_finalizer):
* file-coding.c (set_coding_stream_coding_system):
* file-coding.c (chain_finalize_coding_stream_1):
* file-coding.c (chain_finalize):
* file-coding.c (free_detection_state):
* file-coding.c (coding_category_symbol_to_id):
* fileio.c:
* fileio.c (Ffile_name_directory):
* fileio.c (if):
* fileio.c (Ffile_symlink_p):
* filelock.c (FREE_LOCK_INFO):
* filelock.c (current_lock_owner):
* font-mgr.c (Ffc_name_unparse):
* font-mgr.c (Ffc_pattern_duplicate):
* frame-gtk.c (gtk_delete_frame):
* frame-msw.c (mswindows_delete_frame):
* frame-msw.c (msprinter_delete_frame):
* frame-x.c (x_cde_destroy_callback):
* frame-x.c (Fcde_start_drag_internal):
* frame-x.c (x_cde_transfer_callback):
* frame-x.c (x_delete_frame):
* frame.c (update_frame_title):
* frame.c (Fset_frame_pointer):
* gc.c (register_for_finalization):
* gccache-gtk.c (free_gc_cache):
* gccache-gtk.c (gc_cache_lookup):
* gccache-x.c (free_gc_cache):
* gccache-x.c (gc_cache_lookup):
* glyphs-eimage.c:
* glyphs-eimage.c (jpeg_instantiate_unwind):
* glyphs-eimage.c (gif_instantiate_unwind):
* glyphs-eimage.c (png_instantiate_unwind):
* glyphs-eimage.c (png_instantiate):
* glyphs-eimage.c (tiff_instantiate_unwind):
* glyphs-gtk.c (convert_EImage_to_GDKImage):
* glyphs-gtk.c (gtk_finalize_image_instance):
* glyphs-gtk.c (gtk_init_image_instance_from_eimage):
* glyphs-gtk.c (gtk_xpm_instantiate):
* glyphs-msw.c (convert_EImage_to_DIBitmap):
* glyphs-msw.c (mswindows_init_image_instance_from_eimage):
* glyphs-msw.c (mswindows_initialize_image_instance_mask):
* glyphs-msw.c (xpm_to_eimage):
* glyphs-msw.c (mswindows_xpm_instantiate):
* glyphs-msw.c (xbm_create_bitmap_from_data):
* glyphs-msw.c (mswindows_finalize_image_instance):
* glyphs-x.c (convert_EImage_to_XImage):
* glyphs-x.c (x_finalize_image_instance):
* glyphs-x.c (x_init_image_instance_from_eimage):
* glyphs-x.c (x_xpm_instantiate):
* gui-x.c (free_popup_widget_value_tree):
* hash.c (free_hash_table):
* hash.c (grow_hash_table):
* hash.c (pregrow_hash_table_if_necessary):
* imgproc.c (build_EImage_quantable):
* insdel.c (uninit_buffer_text):
* intl-win32.c (convert_multibyte_to_internal_malloc):
* intl.c:
* intl.c (Fset_current_locale):
* keymap.c:
* keymap.c (where_is_recursive_mapper):
* keymap.c (where_is_internal):
* lisp.h:
* lisp.h (xfree):
* lstream.c (Lstream_close):
* lstream.c (resizing_buffer_closer):
* mule-coding.c:
* mule-coding.c (iso2022_finalize_detection_state):
* nt.c:
* nt.c (mswindows_get_long_filename):
* nt.c (nt_get_resource):
* nt.c (init_mswindows_environment):
* nt.c (get_cached_volume_information):
* nt.c (mswindows_opendir):
* nt.c (mswindows_closedir):
* nt.c (mswindows_readdir):
* nt.c (mswindows_stat):
* nt.c (mswindows_getdcwd):
* nt.c (Fmswindows_long_file_name):
* ntplay.c (nt_play_sound_file):
* ntplay.c (play_sound_data_1):
* number-gmp.c (gmp_free):
* number-gmp.c (init_number_gmp):
* number-mp.c (bignum_to_string):
* number-mp.c (BIGNUM_TO_TYPE):
* number.c (bignum_print):
* number.c (bignum_convfree):
* number.c (ratio_print):
* number.c (bigfloat_print):
* number.c (bigfloat_finalize):
* objects-gtk.c (gtk_finalize_color_instance):
* objects-gtk.c (gtk_finalize_font_instance):
* objects-msw.c (mswindows_finalize_color_instance):
* objects-msw.c (mswindows_finalize_font_instance):
* objects-tty.c (tty_finalize_color_instance):
* objects-tty.c (tty_finalize_font_instance):
* objects-tty.c (tty_font_list):
* objects-x.c (x_finalize_color_instance):
* objects-x.c (x_finalize_font_instance):
* process.c:
* process.c (finalize_process):
* realpath.c:
* redisplay.c (add_propagation_runes):
* regex.c:
* regex.c (xfree):
* regex.c (REGEX_FREE_STACK):
* regex.c (FREE_STACK_RETURN):
* regex.c (regex_compile):
* regex.c (regexec):
* regex.c (regfree):
* scrollbar-gtk.c (gtk_free_scrollbar_instance):
* scrollbar-gtk.c (gtk_release_scrollbar_instance):
* scrollbar-msw.c (mswindows_free_scrollbar_instance):
* scrollbar-msw.c (unshow_that_mofo):
* scrollbar-x.c (x_free_scrollbar_instance):
* scrollbar-x.c (x_release_scrollbar_instance):
* select-gtk.c (emacs_gtk_selection_handle):
* select-msw.c (mswindows_own_selection):
* select-x.c:
* select-x.c (x_handle_selection_request):
* select-x.c (unexpect_property_change):
* select-x.c (x_handle_property_notify):
* select-x.c (receive_incremental_selection):
* select-x.c (x_get_window_property_as_lisp_data):
* select-x.c (Fx_get_cutbuffer_internal):
* specifier.c (finalize_specifier):
* syntax.c (uninit_buffer_syntax_cache):
* sysdep.c (qxe_allocating_getcwd):
* sysdep.c (qxe_lstat):
* sysdep.c (copy_in_passwd):
* sysdep.c (qxe_ctime):
* sysdep.c (closedir):
* sysdep.c (DIRSIZ):
* termcap.c (tgetent):
* termcap.c (tprint):
* tests.c (Ftest_data_format_conversion):
* text.c (new_dfc_convert_copy_data):
* text.h (eifree):
* text.h (eito_alloca):
* text.h (eito_external):
* toolbar-msw.c (mswindows_output_toolbar):
* ui-gtk.c (CONVERT_RETVAL):
* ui-gtk.c (__allocate_object_storage):
* unicode.c (free_from_unicode_table):
* unicode.c (free_to_unicode_table):
* unicode.c (free_charset_unicode_tables):
* win32.c (mswindows_read_link_1):
Rename: xfree(VAL, TYPE)->xfree(VAL)
Command used:
gr 'xfree *\((.*),.*\);' 'xfree (\1);' *.[ch]
Followed by grepping for 'xfree.*,' and fixing anything left.
Rationale: Having to specify the TYPE argument is annoying and
error-prone. It was originally put in to work around warnings
due to strict aliasing but years and years ago I rewrote it
in a way that doesn't use the TYPE argument at all and no one
has complained since then. (And anyway, XEmacs is far from
ever being in compliance with strict aliasing and would require
far-reaching changes to get that way.)
author | Ben Wing <ben@xemacs.org> |
---|---|
date | Thu, 04 Feb 2010 07:28:14 -0600 |
parents | 07fa38c30fdf |
children | efaa6cd845e5 |
rev | line source |
---|---|
428 | 1 /* Extended regular expression matching and search library, |
2 version 0.12, extended for XEmacs. | |
3 (Implements POSIX draft P10003.2/D11.2, except for | |
4 internationalization features.) | |
5 | |
6 Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc. | |
7 Copyright (C) 1995 Sun Microsystems, Inc. | |
1333 | 8 Copyright (C) 1995, 2001, 2002, 2003 Ben Wing. |
428 | 9 |
10 This program is free software; you can redistribute it and/or modify | |
11 it under the terms of the GNU General Public License as published by | |
12 the Free Software Foundation; either version 2, or (at your option) | |
13 any later version. | |
14 | |
15 This program is distributed in the hope that it will be useful, | |
16 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
18 GNU General Public License for more details. | |
19 | |
20 You should have received a copy of the GNU General Public License | |
21 along with this program; see the file COPYING. If not, write to | |
22 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
23 Boston, MA 02111-1307, USA. */ | |
24 | |
25 /* Synched up with: FSF 19.29. */ | |
26 | |
27 #ifdef HAVE_CONFIG_H | |
28 #include <config.h> | |
29 #endif | |
30 | |
31 #ifndef _GNU_SOURCE | |
32 #define _GNU_SOURCE 1 | |
33 #endif | |
34 | |
35 /* We assume non-Mule if emacs isn't defined. */ | |
36 #ifndef emacs | |
37 #undef MULE | |
38 #endif | |
39 | |
771 | 40 /* XEmacs addition */ |
41 #ifdef REL_ALLOC | |
42 #define REGEX_REL_ALLOC /* may be undefined below */ | |
43 #endif | |
44 | |
428 | 45 /* XEmacs: define this to add in a speedup for patterns anchored at |
46 the beginning of a line. Keep the ifdefs so that it's easier to | |
47 tell where/why this code has diverged from v19. */ | |
48 #define REGEX_BEGLINE_CHECK | |
49 | |
50 /* XEmacs: the current mmap-based ralloc handles small blocks very | |
51 poorly, so we disable it here. */ | |
52 | |
771 | 53 #if defined (HAVE_MMAP) || defined (DOUG_LEA_MALLOC) |
54 # undef REGEX_REL_ALLOC | |
428 | 55 #endif |
56 | |
57 /* The `emacs' switch turns on certain matching commands | |
58 that make sense only in Emacs. */ | |
59 #ifdef emacs | |
60 | |
61 #include "lisp.h" | |
62 #include "buffer.h" | |
63 #include "syntax.h" | |
64 | |
65 #if (defined (DEBUG_XEMACS) && !defined (DEBUG)) | |
66 #define DEBUG | |
67 #endif | |
68 | |
867 | 69 #define RE_TRANSLATE_1(ch) TRT_TABLE_OF (translate, (Ichar) ch) |
446 | 70 #define TRANSLATE_P(tr) (!NILP (tr)) |
428 | 71 |
826 | 72 /* Converts the pointer to the char to BEG-based offset from the start. */ |
73 #define PTR_TO_OFFSET(d) (MATCHING_IN_FIRST_STRING \ | |
74 ? (d) - string1 : (d) - (string2 - size1)) | |
75 | |
428 | 76 #else /* not emacs */ |
77 | |
2367 | 78 #include <stdlib.h> |
79 #include <sys/types.h> | |
80 #include <stddef.h> /* needed for ptrdiff_t under Solaris */ | |
81 #include <string.h> | |
82 | |
2286 | 83 #include "compiler.h" /* Get compiler-specific definitions like UNUSED */ |
84 | |
2500 | 85 #define ABORT abort |
86 | |
428 | 87 /* If we are not linking with Emacs proper, |
88 we can't use the relocating allocator | |
89 even if config.h says that we can. */ | |
771 | 90 #undef REGEX_REL_ALLOC |
428 | 91 |
544 | 92 /* defined in lisp.h */ |
93 #ifdef REGEX_MALLOC | |
94 #ifndef DECLARE_NOTHING | |
95 #define DECLARE_NOTHING struct nosuchstruct | |
96 #endif | |
97 #endif | |
98 | |
867 | 99 #define itext_ichar(str) ((Ichar) (str)[0]) |
100 #define itext_ichar_fmt(str, fmt, object) ((Ichar) (str)[0]) | |
101 #define itext_ichar_ascii_fmt(str, fmt, object) ((Ichar) (str)[0]) | |
428 | 102 |
103 #if (LONGBITS > INTBITS) | |
104 # define EMACS_INT long | |
105 #else | |
106 # define EMACS_INT int | |
107 #endif | |
108 | |
867 | 109 typedef int Ichar; |
110 | |
111 #define INC_IBYTEPTR(p) ((p)++) | |
112 #define INC_IBYTEPTR_FMT(p, fmt) ((p)++) | |
113 #define DEC_IBYTEPTR(p) ((p)--) | |
114 #define DEC_IBYTEPTR_FMT(p, fmt) ((p)--) | |
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
115 #define MAX_ICHAR_LEN 1 |
867 | 116 #define itext_ichar_len(ptr) 1 |
117 #define itext_ichar_len_fmt(ptr, fmt) 1 | |
428 | 118 |
119 /* Define the syntax stuff for \<, \>, etc. */ | |
120 | |
/* This must be nonzero for the wordchar and notwordchar pattern
   commands in re_match_2.  */
#ifndef Sword
#define Sword 1
#endif

#ifdef SYNTAX_TABLE

extern char *re_syntax_table;

#else /* not SYNTAX_TABLE */

/* How many characters in the character set.  */
#define CHAR_SET_SIZE 256

static char re_syntax_table[CHAR_SET_SIZE];

/* Fill in re_syntax_table the first time this is called: every entry is
   cleared, then the entries for the ASCII letters, the digits and `_'
   are set to Sword.  Subsequent calls return without touching the table.  */
static void
init_syntax_once (void)
{
  static int done = 0;

  if (!done)
    {
      const char *word_syntax_chars =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";

      memset (re_syntax_table, 0, sizeof (re_syntax_table));

      /* Index through `unsigned char': going from a plain `char' straight
         to `unsigned int' would sign-extend a negative char into an
         out-of-range index.  */
      while (*word_syntax_chars)
        re_syntax_table[(unsigned char) (*word_syntax_chars++)] = Sword;

      done = 1;
    }
}

#endif /* SYNTAX_TABLE */
428 | 158 |
826 | 159 #define SYNTAX(ignored, c) re_syntax_table[c] |
460 | 160 #undef SYNTAX_FROM_CACHE |
826 | 161 #define SYNTAX_FROM_CACHE SYNTAX |
162 | |
163 #define RE_TRANSLATE_1(c) translate[(unsigned char) (c)] | |
446 | 164 #define TRANSLATE_P(tr) tr |
165 | |
166 #endif /* emacs */ | |
428 | 167 |
2201 | 168 /* This is for other GNU distributions with internationalized messages. */ |
169 #if defined (I18N3) && (defined (HAVE_LIBINTL_H) || defined (_LIBC)) | |
170 # include <libintl.h> | |
171 #else | |
172 # define gettext(msgid) (msgid) | |
173 #endif | |
174 | |
428 | 175 |
176 /* Get the interface, including the syntax bits. */ | |
177 #include "regex.h" | |
178 | |
179 /* isalpha etc. are used for the character classes. */ | |
180 #include <ctype.h> | |
181 | |
182 /* Jim Meyering writes: | |
183 | |
184 "... Some ctype macros are valid only for character codes that | |
185 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when | |
186 using /bin/cc or gcc but without giving an ansi option). So, all | |
187 ctype uses should be through macros like ISPRINT... If | |
188 STDC_HEADERS is defined, then autoconf has verified that the ctype | |
189 macros don't need to be guarded with references to isascii. ... | |
190 Defining isascii to 1 should let any compiler worth its salt | |
191 eliminate the && through constant folding." */ | |
192 | |
/* ISASCII_1 is the guard placed in front of every ctype call below.  It is
   the constant 1 when STDC_HEADERS is defined or when no isascii is
   available, and isascii otherwise.  */
#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
#define ISASCII_1(c) 1
#else
#define ISASCII_1(c) isascii(c)
#endif

#ifdef MULE
/* The IS*() macros can be passed any character, including an extended
   one.  We need to make sure there are no crashes, which would occur
   otherwise due to out-of-bounds array references. */
#define ISASCII(c) (((EMACS_UINT) (c)) < 0x100 && ISASCII_1 (c))
#else
#define ISASCII(c) ISASCII_1 (c)
#endif /* MULE */

/* isblank and isgraph are used only when the host <ctype.h> provides them
   as macros; otherwise an equivalent test is spelled out here.  */
#ifdef isblank
#define ISBLANK(c) (ISASCII (c) && isblank (c))
#else
#define ISBLANK(c) ((c) == ' ' || (c) == '\t')
#endif
#ifdef isgraph
#define ISGRAPH(c) (ISASCII (c) && isgraph (c))
#else
#define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
#endif

/* Each wrapper evaluates its argument more than once, so do not pass an
   expression with side effects.  */
#define ISPRINT(c) (ISASCII (c) && isprint (c))
#define ISDIGIT(c) (ISASCII (c) && isdigit (c))
#define ISALNUM(c) (ISASCII (c) && isalnum (c))
#define ISALPHA(c) (ISASCII (c) && isalpha (c))
#define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
#define ISLOWER(c) (ISASCII (c) && islower (c))
#define ISPUNCT(c) (ISASCII (c) && ispunct (c))
#define ISSPACE(c) (ISASCII (c) && isspace (c))
#define ISUPPER(c) (ISASCII (c) && isupper (c))
#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
229 | |
230 #ifndef NULL | |
231 #define NULL (void *)0 | |
232 #endif | |
233 | |
/* We remove any previous definition of `SIGN_EXTEND_CHAR',
   since ours (we hope) works properly with all combinations of
   machines, compilers, `char' and `unsigned char' argument types.
   (Per Bothner suggested the basic approach.)

   The result is the value of the low eight bits of C read as a signed
   quantity, e.g. 0xff gives -1 and 0x7f gives 127.  */
#undef SIGN_EXTEND_CHAR
#if __STDC__
#define SIGN_EXTEND_CHAR(c) ((signed char) (c))
#else  /* not __STDC__ */
/* As in Harbison and Steele.  */
#define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
#endif
245 | |
246 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we | |
247 use `alloca' instead of `malloc'. This is because using malloc in | |
248 re_search* or re_match* could cause memory leaks when C-g is used in | |
249 Emacs; also, malloc is slower and causes storage fragmentation. On | |
250 the other hand, malloc is more portable, and easier to debug. | |
251 | |
252 Because we sometimes use alloca, some routines have to be macros, | |
253 not functions -- `alloca'-allocated space disappears at the end of the | |
254 function it is called in. */ | |
255 | |
/* When not compiled as part of Emacs, ALLOCA and the x* allocation names
   used throughout this file are plain aliases for alloca and the C
   library allocator.  */
#ifndef emacs
#define ALLOCA alloca
#define xmalloc malloc
#define xrealloc realloc
#define xfree free
#endif
262 | |
/* ALLOCA_GARBAGE_COLLECT() issues a zero-size alloca request:
   xemacs_c_alloca (0) inside Emacs (only when need_to_check_c_alloca is
   set), alloca (0) when C_ALLOCA is defined, and nothing at all otherwise.
   NOTE(review): presumably the zero-size request lets an emulated alloca
   reclaim storage -- confirm against the alloca implementation.  */
#ifdef emacs
#define ALLOCA_GARBAGE_COLLECT()                \
do                                              \
{                                               \
  if (need_to_check_c_alloca)                   \
    xemacs_c_alloca (0);                        \
} while (0)
#elif defined (C_ALLOCA)
#define ALLOCA_GARBAGE_COLLECT() alloca (0)
#else
#define ALLOCA_GARBAGE_COLLECT()
#endif
275 | |
#ifndef emacs
/* So we can use just it to conditionalize on */
#undef ERROR_CHECK_MALLOC
#endif

#ifdef ERROR_CHECK_MALLOC
/* When REL_ALLOC, malloc() is problematic because it could potentially
   cause all rel-alloc()ed data -- including buffer text -- to be relocated.
   We deal with this by checking for such relocation whenever we have
   executed a statement that may call malloc() -- or alloca(), which may
   end up calling malloc() in some circumstances -- and recomputing all
   of our string pointers in re_match_2_internal() and re_search_2().
   However, if malloc() or alloca() happens and we don't know about it,
   we could still be screwed.  So we set up a system where we indicate all
   places where we are prepared for malloc() or alloca(), and in any
   other circumstances, calls to those functions (from anywhere inside of
   XEmacs!) will ABORT().  We do this even when REL_ALLOC is not defined
   so that we catch these problems sooner, since many developers and beta
   testers will not be running with REL_ALLOC.  */
int regex_malloc_disallowed;
/* Bracket a region in which allocation is expected: BEGIN clears the flag,
   END sets it again.  UNBIND_REGEX_MALLOC_CHECK() requires a variable
   named `depth' in the scope where it is used.  */
#define BEGIN_REGEX_MALLOC_OK() regex_malloc_disallowed = 0
#define END_REGEX_MALLOC_OK() regex_malloc_disallowed = 1
#define UNBIND_REGEX_MALLOC_CHECK() unbind_to (depth)
#else
/* Without ERROR_CHECK_MALLOC all three expand to nothing.  */
#define BEGIN_REGEX_MALLOC_OK()
#define END_REGEX_MALLOC_OK()
#define UNBIND_REGEX_MALLOC_CHECK()
#endif
304 | |
305 | |
/* REGEX_ALLOCATE, REGEX_REALLOCATE and REGEX_FREE are the allocation
   primitives for temporary storage: the x* heap functions when
   REGEX_MALLOC is defined, ALLOCA otherwise.  */
#ifdef REGEX_MALLOC

#define REGEX_ALLOCATE xmalloc
#define REGEX_REALLOCATE(source, osize, nsize) xrealloc (source, nsize)
#define REGEX_FREE xfree

#else /* not REGEX_MALLOC */

/* Emacs already defines alloca, sometimes.  */
#ifndef alloca

/* Make alloca work the best possible way.  */
#ifdef __GNUC__
#define alloca __builtin_alloca
#elif defined (__DECC) /* XEmacs: added next 3 lines, similar to config.h.in */
#include <alloca.h>
#pragma intrinsic(alloca)
#else /* not __GNUC__ */
#if HAVE_ALLOCA_H
#include <alloca.h>
#else /* not __GNUC__ or HAVE_ALLOCA_H */
#ifndef _AIX /* Already did AIX, up at the top.  */
void *alloca ();
#endif /* not _AIX */
#endif /* HAVE_ALLOCA_H */
#endif /* __GNUC__ */

#endif /* not alloca */

#define REGEX_ALLOCATE ALLOCA

/* !!#### Needs review */
/* Assumes a `char *destination' variable.  Allocates NSIZE bytes with
   ALLOCA, copies the first OSIZE bytes of SOURCE into them, and yields
   the new block.  */
#define REGEX_REALLOCATE(source, osize, nsize)                          \
  (destination = (char *) ALLOCA (nsize),                               \
   memmove (destination, source, osize),                                \
   destination)

/* No need to do anything to free, after alloca.
   Do nothing!  But inhibit gcc warning.
   NOTE(review): this variant still takes two arguments (arg, type), while
   the REGEX_MALLOC variant above is an alias for xfree; confirm which
   arity the callers of REGEX_FREE, if any, actually use.  */
#define REGEX_FREE(arg,type) ((void)0)

#endif /* REGEX_MALLOC */
428 | 349 |
/* Define how to allocate the failure stack.  */

#ifdef REGEX_REL_ALLOC
/* These three always operate on the variable `failure_stack_ptr' that is
   in scope at the point of use; the SOURCE, OSIZE and PTR arguments are
   ignored.  */
#define REGEX_ALLOCATE_STACK(size)                              \
  r_alloc ((unsigned char **) &failure_stack_ptr, (size))
#define REGEX_REALLOCATE_STACK(source, osize, nsize)            \
  r_re_alloc ((unsigned char **) &failure_stack_ptr, (nsize))
#define REGEX_FREE_STACK(ptr)                                   \
  r_alloc_free ((unsigned char **) &failure_stack_ptr)

#else /* not REGEX_REL_ALLOC */

#ifdef REGEX_MALLOC

#define REGEX_ALLOCATE_STACK xmalloc
#define REGEX_REALLOCATE_STACK(source, osize, nsize) xrealloc (source, nsize)
#define REGEX_FREE_STACK(arg) xfree (arg)

#else /* not REGEX_MALLOC */

#define REGEX_ALLOCATE_STACK ALLOCA

#define REGEX_REALLOCATE_STACK(source, osize, nsize)                    \
  REGEX_REALLOCATE (source, osize, nsize)
/* No need to explicitly free anything.  */
#define REGEX_FREE_STACK(arg)

#endif /* REGEX_MALLOC */
#endif /* REGEX_REL_ALLOC */
428 | 379 |
380 | |
/* True if `size1' is nonzero and PTR is pointing anywhere inside
   `string1' or just past its end.  This works if PTR is NULL, which is
   a good thing.  Uses the variables `size1' and `string1' that are in
   scope at the point of use.  */
#define FIRST_STRING_P(ptr)                                     \
  (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)

/* (Re)Allocate N items of type T using malloc, or fail.  */
#define TALLOC(n, t) ((t *) xmalloc ((n) * sizeof (t)))
#define RETALLOC(addr, n, t) ((addr) = (t *) xrealloc (addr, (n) * sizeof (t)))
#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))

#define BYTEWIDTH 8 /* In bits.  */

#define STREQ(s1, s2) (strcmp (s1, s2) == 0)

/* MAX and MIN evaluate one of their arguments twice; do not pass
   expressions with side effects.  */
#undef MAX
#undef MIN
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Type of source-pattern and string chars.  */
typedef const unsigned char re_char;

typedef char re_bool;
#define false 0
#define true 1
407 | |
408 | |
1346 | 409 #ifdef emacs |
410 | |
#ifdef MULE
Lisp_Object Vthe_lisp_rangetab;
#endif /* MULE */

/* Set up the variables owned by this file.  With MULE this stores a new
   range table (created with Qstart_closed_end_closed) in
   Vthe_lisp_rangetab and passes that variable's address to staticpro();
   without MULE there is nothing to do.  */
void
vars_of_regex (void)
{
#ifdef MULE
  Vthe_lisp_rangetab = Fmake_range_table (Qstart_closed_end_closed);
  staticpro (&Vthe_lisp_rangetab);
#endif /* MULE */
}
430 | |
431 /* Convert an offset from the start of the logical text string formed by | |
432 concatenating the two strings together into a character position in the | |
433 Lisp buffer or string that the text represents. Knows that | |
434 when handling buffer text, the "string" we're passed in is always | |
435 BEGV - ZV. */ | |
436 | |
437 static Charxpos | |
438 offset_to_charxpos (Lisp_Object lispobj, int off) | |
439 { | |
440 if (STRINGP (lispobj)) | |
441 return string_index_byte_to_char (lispobj, off); | |
442 else if (BUFFERP (lispobj)) | |
443 return bytebpos_to_charbpos (XBUFFER (lispobj), | |
444 off + BYTE_BUF_BEGV (XBUFFER (lispobj))); | |
445 else | |
446 return 0; | |
447 } | |
448 | |
449 #ifdef REL_ALLOC | |
450 | |
451 /* STRING1 is the value of STRING1 given to re_match_2(). LISPOBJ is | |
452 the Lisp object (if any) from which the string is taken. If LISPOBJ | |
453 is a buffer, return a relocation offset to be added to all pointers to | |
454 string data so that they will be accurate again, after an allocation or | |
455 reallocation that potentially relocated the buffer data. | |
456 */ | |
457 static Bytecount | |
458 offset_post_relocation (Lisp_Object lispobj, Ibyte *orig_buftext) | |
459 { | |
460 if (!BUFFERP (lispobj)) | |
461 return 0; | |
462 return (BYTE_BUF_BYTE_ADDRESS (XBUFFER (lispobj), | |
463 BYTE_BUF_BEGV (XBUFFER (lispobj))) - | |
464 orig_buftext); | |
465 } | |
466 | |
467 #endif /* REL_ALLOC */ | |
468 | |
469 #ifdef ERROR_CHECK_MALLOC | |
470 | |
471 /* NOTE that this can run malloc() so you need to adjust afterwards. */ | |
472 | |
473 static int | |
474 bind_regex_malloc_disallowed (int value) | |
475 { | |
476 /* Tricky, because the act of binding can run malloc(). */ | |
477 int old_regex_malloc_disallowed = regex_malloc_disallowed; | |
478 int depth; | |
479 regex_malloc_disallowed = 0; | |
480 depth = record_unwind_protect_restoring_int (®ex_malloc_disallowed, | |
481 old_regex_malloc_disallowed); | |
482 regex_malloc_disallowed = value; | |
483 return depth; | |
484 } | |
485 | |
486 #endif /* ERROR_CHECK_MALLOC */ | |
487 | |
488 #endif /* emacs */ | |
489 | |
490 | |
/* These are the command codes that appear in compiled regular
   expressions.  Some opcodes are followed by argument bytes.  A
   command code can specify any interpretation whatsoever for its
   arguments.  Zero bytes may appear in the compiled regular expression.

   The conditional sections are appended after the unconditional ones, so
   the values of the opcodes up to `notwordbound' do not depend on whether
   `emacs' or MULE is defined.  */

typedef enum
{
  no_op = 0,

  /* Succeed right away--no more backtracking.  */
  succeed,

  /* Followed by one byte giving n, then by n literal bytes.  */
  exactn,

  /* Matches any (more or less) character.  */
  anychar,

  /* Matches any one char belonging to specified set.  First
     following byte is number of bitmap bytes.  Then come bytes
     for a bitmap saying which chars are in.  Bits in each byte
     are ordered low-bit-first.  A character is in the set if its
     bit is 1.  A character too large to have a bit in the map is
     automatically not in the set.  */
  charset,

  /* Same parameters as charset, but match any character that is
     not one of those specified.  */
  charset_not,

  /* Start remembering the text that is matched, for storing in a
     register.  Followed by one byte with the register number, in
     the range 1 to the pattern buffer's re_ngroups
     field.  Then followed by one byte with the number of groups
     inner to this one.  (This last has to be part of the
     start_memory only because we need it in the on_failure_jump
     of re_match_2.)  */
  start_memory,

  /* Stop remembering the text that is matched and store it in a
     memory register.  Followed by one byte with the register
     number, in the range 1 to `re_ngroups' in the
     pattern buffer, and one byte with the number of inner groups,
     just like `start_memory'.  (We need the number of inner
     groups here because we don't have any easy way of finding the
     corresponding start_memory when we're at a stop_memory.)  */
  stop_memory,

  /* Match a duplicate of something remembered. Followed by one
     byte containing the register number.  */
  duplicate,

  /* Fail unless at beginning of line.  */
  begline,

  /* Fail unless at end of line.  */
  endline,

  /* Succeeds if at beginning of buffer (if emacs) or at beginning
     of string to be matched (if not).  */
  begbuf,

  /* Analogously, for end of buffer/string.  */
  endbuf,

  /* Followed by two byte relative address to which to jump.  */
  jump,

  /* Same as jump, but marks the end of an alternative.  */
  jump_past_alt,

  /* Followed by two-byte relative address of place to resume at
     in case of failure.  */
  on_failure_jump,

  /* Like on_failure_jump, but pushes a placeholder instead of the
     current string position when executed.  */
  on_failure_keep_string_jump,

  /* Throw away latest failure point and then jump to following
     two-byte relative address.  */
  pop_failure_jump,

  /* Change to pop_failure_jump if know won't have to backtrack to
     match; otherwise change to jump.  This is used to jump
     back to the beginning of a repeat.  If what follows this jump
     clearly won't match what the repeat does, such that we can be
     sure that there is no use backtracking out of repetitions
     already matched, then we change it to a pop_failure_jump.
     Followed by two-byte address.  */
  maybe_pop_jump,

  /* Jump to following two-byte address, and push a dummy failure
     point. This failure point will be thrown away if an attempt
     is made to use it for a failure.  A `+' construct makes this
     before the first repeat.  Also used as an intermediary kind
     of jump when compiling an alternative.  */
  dummy_failure_jump,

  /* Push a dummy failure point and continue.  Used at the end of
     alternatives.  */
  push_dummy_failure,

  /* Followed by two-byte relative address and two-byte number n.
     After matching N times, jump to the address upon failure.  */
  succeed_n,

  /* Followed by two-byte relative address, and two-byte number n.
     Jump to the address N times, then fail.  */
  jump_n,

  /* Set the following two-byte relative address to the
     subsequent two-byte number.  The address *includes* the two
     bytes of number.  */
  set_number_at,

  wordchar,     /* Matches any word-constituent character.  */
  notwordchar,  /* Matches any char that is not a word-constituent.  */

  wordbeg,      /* Succeeds if at word beginning.  */
  wordend,      /* Succeeds if at word end.  */

  wordbound,    /* Succeeds if at a word boundary.  */
  notwordbound  /* Succeeds if not at a word boundary.  */

#ifdef emacs
  ,before_dot,  /* Succeeds if before point.  */
  at_dot,       /* Succeeds if at point.  */
  after_dot,    /* Succeeds if after point.  */

  /* Matches any character whose syntax is specified.  Followed by
     a byte which contains a syntax code, e.g., Sword.  */
  syntaxspec,

  /* Matches any character whose syntax is not that specified.  */
  notsyntaxspec

#endif /* emacs */

#ifdef MULE
  /* need extra stuff to be able to properly work with XEmacs/Mule
     characters (which may take up more than one byte) */

  ,charset_mule, /* Matches any character belonging to specified set.
                    The set is stored in "unified range-table
                    format"; see rangetab.c.  Unlike the `charset'
                    opcode, this can handle arbitrary characters. */

  charset_mule_not  /* Same parameters as charset_mule, but match any
                       character that is not one of those specified.  */

  /* 97/2/17 jhod: The following two were merged back in from the Mule
     2.3 code to enable some language specific processing */
  ,categoryspec,     /* Matches entries in the character category tables */
  notcategoryspec    /* The opposite of the above */
#endif /* MULE */

} re_opcode_t;
649 | |
650 /* Common operations on the compiled pattern. */ | |
651 | |
652 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */ | |
653 | |
654 #define STORE_NUMBER(destination, number) \ | |
655 do { \ | |
656 (destination)[0] = (number) & 0377; \ | |
657 (destination)[1] = (number) >> 8; \ | |
658 } while (0) | |
659 | |
660 /* Same as STORE_NUMBER, except increment DESTINATION to | |
661 the byte after where the number is stored. Therefore, DESTINATION | |
662 must be an lvalue. */ | |
663 | |
664 #define STORE_NUMBER_AND_INCR(destination, number) \ | |
665 do { \ | |
666 STORE_NUMBER (destination, number); \ | |
667 (destination) += 2; \ | |
668 } while (0) | |
669 | |
/* Put into DESTINATION a number stored in two contiguous bytes starting
   at SOURCE.  The first byte is the low-order byte; the second byte is
   sign-extended (via SIGN_EXTEND_CHAR) and supplies the high-order part,
   so the result can be negative.

   DESTINATION must be an lvalue and is evaluated twice; SOURCE is also
   evaluated twice, so neither argument may have side effects.

   The high byte is scaled by multiplication rather than `<< 8' because
   left-shifting a negative value is undefined behavior in C.  */

#define EXTRACT_NUMBER(destination, source)                             \
  do {                                                                  \
    (destination) = *(source) & 0377;                                   \
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * (1 << 8);     \
  } while (0)
678 | |
679 #ifdef DEBUG | |
680 static void | |
446 | 681 extract_number (int *dest, re_char *source) |
428 | 682 { |
683 int temp = SIGN_EXTEND_CHAR (*(source + 1)); | |
684 *dest = *source & 0377; | |
685 *dest += temp << 8; | |
686 } | |
687 | |
688 #ifndef EXTRACT_MACROS /* To debug the macros. */ | |
689 #undef EXTRACT_NUMBER | |
690 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src) | |
691 #endif /* not EXTRACT_MACROS */ | |
692 | |
693 #endif /* DEBUG */ | |
694 | |
695 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number. | |
696 SOURCE must be an lvalue. */ | |
697 | |
698 #define EXTRACT_NUMBER_AND_INCR(destination, source) \ | |
699 do { \ | |
700 EXTRACT_NUMBER (destination, source); \ | |
701 (source) += 2; \ | |
702 } while (0) | |
703 | |
704 #ifdef DEBUG | |
/* Function form of EXTRACT_NUMBER_AND_INCR, compiled only when DEBUG is
   defined.  Decode the two-byte number at *SOURCE into *DESTINATION,
   then move *SOURCE past those two bytes.  */
static void
extract_number_and_incr (int *destination, unsigned char **source)
{
  unsigned char *number_bytes = *source;

  extract_number (destination, number_bytes);
  *source = number_bytes + 2;
}
711 | |
712 #ifndef EXTRACT_MACROS | |
713 #undef EXTRACT_NUMBER_AND_INCR | |
714 #define EXTRACT_NUMBER_AND_INCR(dest, src) \ | |
715 extract_number_and_incr (&dest, &src) | |
716 #endif /* not EXTRACT_MACROS */ | |
717 | |
718 #endif /* DEBUG */ | |
719 | |
720 /* If DEBUG is defined, Regex prints many voluminous messages about what | |
721 it is doing (if the variable `debug' is nonzero). If linked with the | |
722 main program in `iregex.c', you can enter patterns and strings | |
723 interactively. And if linked with the main program in `main.c' and | |
724 the other test files, you can run the already-written tests. */ | |
725 | |
726 #if defined (DEBUG) | |
727 | |
728 /* We use standard I/O for debugging. */ | |
729 #include <stdio.h> | |
730 | |
731 #ifndef emacs | |
732 /* XEmacs provides its own version of assert() */ | |
733 /* It is useful to test things that ``must'' be true when debugging. */ | |
734 #include <assert.h> | |
735 #endif | |
736 | |
737 static int debug = 0; | |
738 | |
/* Debugging helpers for builds with DEBUG defined.  DEBUG_STATEMENT
   passes its argument through unchanged; every other macro does its
   work only when the file-level variable `debug' is nonzero.

   Each conditional macro is wrapped in do { ... } while (0) so that it
   expands to exactly one statement.  A bare `if (debug) ...' expansion
   would capture the `else' of an enclosing if/else at the call site.  */
#define DEBUG_STATEMENT(e) e
#define DEBUG_PRINT1(x)                                                 \
  do { if (debug) printf (x); } while (0)
#define DEBUG_PRINT2(x1, x2)                                            \
  do { if (debug) printf (x1, x2); } while (0)
#define DEBUG_PRINT3(x1, x2, x3)                                        \
  do { if (debug) printf (x1, x2, x3); } while (0)
#define DEBUG_PRINT4(x1, x2, x3, x4)                                    \
  do { if (debug) printf (x1, x2, x3, x4); } while (0)
#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)                           \
  do { if (debug) print_partial_compiled_pattern (s, e); } while (0)
#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)                  \
  do { if (debug) print_double_string (w, s1, sz1, s2, sz2); } while (0)
748 | |
749 | |
750 /* Print the fastmap in human-readable form. */ | |
751 | |
752 static void | |
753 print_fastmap (char *fastmap) | |
754 { | |
647 | 755 int was_a_range = 0; |
756 int i = 0; | |
428 | 757 |
758 while (i < (1 << BYTEWIDTH)) | |
759 { | |
760 if (fastmap[i++]) | |
761 { | |
762 was_a_range = 0; | |
763 putchar (i - 1); | |
764 while (i < (1 << BYTEWIDTH) && fastmap[i]) | |
765 { | |
766 was_a_range = 1; | |
767 i++; | |
768 } | |
769 if (was_a_range) | |
770 { | |
771 putchar ('-'); | |
772 putchar (i - 1); | |
773 } | |
774 } | |
775 } | |
776 putchar ('\n'); | |
777 } | |
778 | |
779 | |
780 /* Print a compiled pattern string in human-readable form, starting at | |
781 the START pointer into it and ending just before the pointer END. */ | |
782 | |
783 static void | |
446 | 784 print_partial_compiled_pattern (re_char *start, re_char *end) |
428 | 785 { |
786 int mcnt, mcnt2; | |
446 | 787 unsigned char *p = (unsigned char *) start; |
788 re_char *pend = end; | |
428 | 789 |
790 if (start == NULL) | |
791 { | |
792 puts ("(null)"); | |
793 return; | |
794 } | |
795 | |
796 /* Loop over pattern commands. */ | |
797 while (p < pend) | |
798 { | |
799 printf ("%ld:\t", (long)(p - start)); | |
800 | |
801 switch ((re_opcode_t) *p++) | |
802 { | |
803 case no_op: | |
804 printf ("/no_op"); | |
805 break; | |
806 | |
807 case exactn: | |
808 mcnt = *p++; | |
809 printf ("/exactn/%d", mcnt); | |
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
810 while (mcnt--) |
428 | 811 { |
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
812 putchar ('/'); |
428 | 813 putchar (*p++); |
814 } | |
815 break; | |
816 | |
817 case start_memory: | |
818 mcnt = *p++; | |
819 printf ("/start_memory/%d/%d", mcnt, *p++); | |
820 break; | |
821 | |
822 case stop_memory: | |
823 mcnt = *p++; | |
824 printf ("/stop_memory/%d/%d", mcnt, *p++); | |
825 break; | |
826 | |
827 case duplicate: | |
828 printf ("/duplicate/%d", *p++); | |
829 break; | |
830 | |
831 case anychar: | |
832 printf ("/anychar"); | |
833 break; | |
834 | |
835 case charset: | |
836 case charset_not: | |
837 { | |
838 REGISTER int c, last = -100; | |
839 REGISTER int in_range = 0; | |
840 | |
841 printf ("/charset [%s", | |
842 (re_opcode_t) *(p - 1) == charset_not ? "^" : ""); | |
843 | |
844 assert (p + *p < pend); | |
845 | |
846 for (c = 0; c < 256; c++) | |
847 if (((unsigned char) (c / 8) < *p) | |
848 && (p[1 + (c/8)] & (1 << (c % 8)))) | |
849 { | |
850 /* Are we starting a range? */ | |
851 if (last + 1 == c && ! in_range) | |
852 { | |
853 putchar ('-'); | |
854 in_range = 1; | |
855 } | |
856 /* Have we broken a range? */ | |
857 else if (last + 1 != c && in_range) | |
858 { | |
859 putchar (last); | |
860 in_range = 0; | |
861 } | |
862 | |
863 if (! in_range) | |
864 putchar (c); | |
865 | |
866 last = c; | |
867 } | |
868 | |
869 if (in_range) | |
870 putchar (last); | |
871 | |
872 putchar (']'); | |
873 | |
874 p += 1 + *p; | |
875 } | |
876 break; | |
877 | |
878 #ifdef MULE | |
879 case charset_mule: | |
880 case charset_mule_not: | |
881 { | |
882 int nentries, i; | |
883 | |
884 printf ("/charset_mule [%s", | |
885 (re_opcode_t) *(p - 1) == charset_mule_not ? "^" : ""); | |
886 nentries = unified_range_table_nentries (p); | |
887 for (i = 0; i < nentries; i++) | |
888 { | |
889 EMACS_INT first, last; | |
890 Lisp_Object dummy_val; | |
891 | |
892 unified_range_table_get_range (p, i, &first, &last, | |
893 &dummy_val); | |
894 if (first < 0x100) | |
895 putchar (first); | |
896 else | |
897 printf ("(0x%lx)", (long)first); | |
898 if (first != last) | |
899 { | |
900 putchar ('-'); | |
901 if (last < 0x100) | |
902 putchar (last); | |
903 else | |
904 printf ("(0x%lx)", (long)last); | |
905 } | |
906 } | |
907 putchar (']'); | |
908 p += unified_range_table_bytes_used (p); | |
909 } | |
910 break; | |
911 #endif | |
912 | |
913 case begline: | |
914 printf ("/begline"); | |
915 break; | |
916 | |
917 case endline: | |
918 printf ("/endline"); | |
919 break; | |
920 | |
921 case on_failure_jump: | |
922 extract_number_and_incr (&mcnt, &p); | |
923 printf ("/on_failure_jump to %ld", (long)(p + mcnt - start)); | |
924 break; | |
925 | |
926 case on_failure_keep_string_jump: | |
927 extract_number_and_incr (&mcnt, &p); | |
928 printf ("/on_failure_keep_string_jump to %ld", (long)(p + mcnt - start)); | |
929 break; | |
930 | |
931 case dummy_failure_jump: | |
932 extract_number_and_incr (&mcnt, &p); | |
933 printf ("/dummy_failure_jump to %ld", (long)(p + mcnt - start)); | |
934 break; | |
935 | |
936 case push_dummy_failure: | |
937 printf ("/push_dummy_failure"); | |
938 break; | |
939 | |
940 case maybe_pop_jump: | |
941 extract_number_and_incr (&mcnt, &p); | |
942 printf ("/maybe_pop_jump to %ld", (long)(p + mcnt - start)); | |
943 break; | |
944 | |
945 case pop_failure_jump: | |
946 extract_number_and_incr (&mcnt, &p); | |
947 printf ("/pop_failure_jump to %ld", (long)(p + mcnt - start)); | |
948 break; | |
949 | |
950 case jump_past_alt: | |
951 extract_number_and_incr (&mcnt, &p); | |
952 printf ("/jump_past_alt to %ld", (long)(p + mcnt - start)); | |
953 break; | |
954 | |
955 case jump: | |
956 extract_number_and_incr (&mcnt, &p); | |
957 printf ("/jump to %ld", (long)(p + mcnt - start)); | |
958 break; | |
959 | |
960 case succeed_n: | |
961 extract_number_and_incr (&mcnt, &p); | |
962 extract_number_and_incr (&mcnt2, &p); | |
963 printf ("/succeed_n to %ld, %d times", (long)(p + mcnt - start), mcnt2); | |
964 break; | |
965 | |
966 case jump_n: | |
967 extract_number_and_incr (&mcnt, &p); | |
968 extract_number_and_incr (&mcnt2, &p); | |
969 printf ("/jump_n to %ld, %d times", (long)(p + mcnt - start), mcnt2); | |
970 break; | |
971 | |
972 case set_number_at: | |
973 extract_number_and_incr (&mcnt, &p); | |
974 extract_number_and_incr (&mcnt2, &p); | |
975 printf ("/set_number_at location %ld to %d", (long)(p + mcnt - start), mcnt2); | |
976 break; | |
977 | |
978 case wordbound: | |
979 printf ("/wordbound"); | |
980 break; | |
981 | |
982 case notwordbound: | |
983 printf ("/notwordbound"); | |
984 break; | |
985 | |
986 case wordbeg: | |
987 printf ("/wordbeg"); | |
988 break; | |
989 | |
990 case wordend: | |
991 printf ("/wordend"); | |
992 | |
993 #ifdef emacs | |
994 case before_dot: | |
995 printf ("/before_dot"); | |
996 break; | |
997 | |
998 case at_dot: | |
999 printf ("/at_dot"); | |
1000 break; | |
1001 | |
1002 case after_dot: | |
1003 printf ("/after_dot"); | |
1004 break; | |
1005 | |
1006 case syntaxspec: | |
1007 printf ("/syntaxspec"); | |
1008 mcnt = *p++; | |
1009 printf ("/%d", mcnt); | |
1010 break; | |
1011 | |
1012 case notsyntaxspec: | |
1013 printf ("/notsyntaxspec"); | |
1014 mcnt = *p++; | |
1015 printf ("/%d", mcnt); | |
1016 break; | |
1017 | |
1018 #ifdef MULE | |
1019 /* 97/2/17 jhod Mule category patch */ | |
1020 case categoryspec: | |
1021 printf ("/categoryspec"); | |
1022 mcnt = *p++; | |
1023 printf ("/%d", mcnt); | |
1024 break; | |
1025 | |
1026 case notcategoryspec: | |
1027 printf ("/notcategoryspec"); | |
1028 mcnt = *p++; | |
1029 printf ("/%d", mcnt); | |
1030 break; | |
1031 /* end of category patch */ | |
1032 #endif /* MULE */ | |
1033 #endif /* emacs */ | |
1034 | |
1035 case wordchar: | |
1036 printf ("/wordchar"); | |
1037 break; | |
1038 | |
1039 case notwordchar: | |
1040 printf ("/notwordchar"); | |
1041 break; | |
1042 | |
1043 case begbuf: | |
1044 printf ("/begbuf"); | |
1045 break; | |
1046 | |
1047 case endbuf: | |
1048 printf ("/endbuf"); | |
1049 break; | |
1050 | |
1051 default: | |
1052 printf ("?%d", *(p-1)); | |
1053 } | |
1054 | |
1055 putchar ('\n'); | |
1056 } | |
1057 | |
1058 printf ("%ld:\tend of pattern.\n", (long)(p - start)); | |
1059 } | |
1060 | |
1061 | |
1062 static void | |
1063 print_compiled_pattern (struct re_pattern_buffer *bufp) | |
1064 { | |
446 | 1065 re_char *buffer = bufp->buffer; |
428 | 1066 |
1067 print_partial_compiled_pattern (buffer, buffer + bufp->used); | |
1068 printf ("%ld bytes used/%ld bytes allocated.\n", bufp->used, | |
1069 bufp->allocated); | |
1070 | |
1071 if (bufp->fastmap_accurate && bufp->fastmap) | |
1072 { | |
1073 printf ("fastmap: "); | |
1074 print_fastmap (bufp->fastmap); | |
1075 } | |
1076 | |
1077 printf ("re_nsub: %ld\t", (long)bufp->re_nsub); | |
502 | 1078 printf ("re_ngroups: %ld\t", (long)bufp->re_ngroups); |
428 | 1079 printf ("regs_alloc: %d\t", bufp->regs_allocated); |
1080 printf ("can_be_null: %d\t", bufp->can_be_null); | |
1081 printf ("newline_anchor: %d\n", bufp->newline_anchor); | |
1082 printf ("no_sub: %d\t", bufp->no_sub); | |
1083 printf ("not_bol: %d\t", bufp->not_bol); | |
1084 printf ("not_eol: %d\t", bufp->not_eol); | |
1085 printf ("syntax: %d\n", bufp->syntax); | |
1086 /* Perhaps we should print the translate table? */ | |
1087 /* and maybe the category table? */ | |
502 | 1088 |
1089 if (bufp->external_to_internal_register) | |
1090 { | |
1091 int i; | |
1092 | |
1093 printf ("external_to_internal_register:\n"); | |
1094 for (i = 0; i <= bufp->re_nsub; i++) | |
1095 { | |
1096 if (i > 0) | |
1097 printf (", "); | |
1098 printf ("%d -> %d", i, bufp->external_to_internal_register[i]); | |
1099 } | |
1100 printf ("\n"); | |
1101 } | |
428 | 1102 } |
1103 | |
1104 | |
1105 static void | |
446 | 1106 print_double_string (re_char *where, re_char *string1, int size1, |
1107 re_char *string2, int size2) | |
428 | 1108 { |
1109 if (where == NULL) | |
1110 printf ("(null)"); | |
1111 else | |
1112 { | |
647 | 1113 int this_char; |
428 | 1114 |
1115 if (FIRST_STRING_P (where)) | |
1116 { | |
1117 for (this_char = where - string1; this_char < size1; this_char++) | |
1118 putchar (string1[this_char]); | |
1119 | |
1120 where = string2; | |
1121 } | |
1122 | |
1123 for (this_char = where - string2; this_char < size2; this_char++) | |
1124 putchar (string2[this_char]); | |
1125 } | |
1126 } | |
1127 | |
1128 #else /* not DEBUG */ | |
1129 | |
771 | 1130 #ifndef emacs |
428 | 1131 #undef assert |
771 | 1132 #define assert(e) ((void) (1)) |
1133 #endif | |
428 | 1134 |
1135 #define DEBUG_STATEMENT(e) | |
1136 #define DEBUG_PRINT1(x) | |
1137 #define DEBUG_PRINT2(x1, x2) | |
1138 #define DEBUG_PRINT3(x1, x2, x3) | |
1139 #define DEBUG_PRINT4(x1, x2, x3, x4) | |
1140 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) | |
1141 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) | |
1142 | |
446 | 1143 #endif /* DEBUG */ |
428 | 1144 |
1145 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can | |
1146 also be assigned to arbitrarily: each pattern buffer stores its own | |
1147 syntax, so it can be changed between regex compilations. */ | |
1148 /* This has no initializer because initialized variables in Emacs | |
1149 become read-only after dumping. */ | |
1150 reg_syntax_t re_syntax_options; | |
1151 | |
1152 | |
1153 /* Specify the precise syntax of regexps for compilation. This provides | |
1154 for compatibility for various utilities which historically have | |
1155 different, incompatible syntaxes. | |
1156 | |
1157 The argument SYNTAX is a bit mask comprised of the various bits | |
1158 defined in regex.h. We return the old syntax. */ | |
1159 | |
1160 reg_syntax_t | |
1161 re_set_syntax (reg_syntax_t syntax) | |
1162 { | |
1163 reg_syntax_t ret = re_syntax_options; | |
1164 | |
1165 re_syntax_options = syntax; | |
1166 return ret; | |
1167 } | |
1168 | |
1169 /* This table gives an error message for each of the error codes listed | |
1170 in regex.h. Obviously the order here has to be same as there. | |
1171 POSIX doesn't require that we do anything for REG_NOERROR, | |
1172 but why not be nice? */ | |
1173 | |
442 | 1174 static const char *re_error_msgid[] = |
428 | 1175 { |
1176 "Success", /* REG_NOERROR */ | |
1177 "No match", /* REG_NOMATCH */ | |
1178 "Invalid regular expression", /* REG_BADPAT */ | |
1179 "Invalid collation character", /* REG_ECOLLATE */ | |
1180 "Invalid character class name", /* REG_ECTYPE */ | |
1181 "Trailing backslash", /* REG_EESCAPE */ | |
1182 "Invalid back reference", /* REG_ESUBREG */ | |
1183 "Unmatched [ or [^", /* REG_EBRACK */ | |
1184 "Unmatched ( or \\(", /* REG_EPAREN */ | |
1185 "Unmatched \\{", /* REG_EBRACE */ | |
1186 "Invalid content of \\{\\}", /* REG_BADBR */ | |
1187 "Invalid range end", /* REG_ERANGE */ | |
1188 "Memory exhausted", /* REG_ESPACE */ | |
1189 "Invalid preceding regular expression", /* REG_BADRPT */ | |
1190 "Premature end of regular expression", /* REG_EEND */ | |
1191 "Regular expression too big", /* REG_ESIZE */ | |
1192 "Unmatched ) or \\)", /* REG_ERPAREN */ | |
1193 #ifdef emacs | |
1194 "Invalid syntax designator", /* REG_ESYNTAX */ | |
1195 #endif | |
1196 #ifdef MULE | |
1197 "Ranges may not span charsets", /* REG_ERANGESPAN */ | |
1198 "Invalid category designator", /* REG_ECATEGORY */ | |
1199 #endif | |
1200 }; | |
1201 | |
1202 /* Avoiding alloca during matching, to placate r_alloc. */ | |
1203 | |
1333 | 1204 /* About these various flags: |
1205 | |
1206 MATCH_MAY_ALLOCATE indicates that it's OK to do allocation in the | |
1207 searching and matching functions. In this case, we use local variables | |
1208 to hold the values allocated. If not, we use *global* variables, which | |
1209 are pre-allocated. NOTE: XEmacs ***MUST*** run with MATCH_MAY_ALLOCATE, | |
1210 because the regexp routines may get called reentrantly as a result of | |
1211 QUIT processing (e.g. under Windows: re_match -> QUIT -> quit_p -> drain | |
1212 events -> process WM_INITMENU -> call filter -> re_match; see stack | |
1213 trace in signal.c), so we cannot have any global variables (unless we do | |
1214 lots of trickiness including some unwind-protects, which isn't worth it | |
1215 at this point). | |
1216 | |
1217 REL_ALLOC means that the relocating allocator is in use, for buffers | |
1218 and such. REGEX_REL_ALLOC means that we use rel-alloc to manage the | |
1219 fail stack, which may grow quite large. REGEX_MALLOC means we use | |
1220 malloc() in place of alloca() to allocate the fail stack -- only | |
1221 applicable if REGEX_REL_ALLOC is not defined. | |
1222 */ | |
1223 | |
428 | 1224 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the |
1225 searching and matching functions should not call alloca. On some | |
1226 systems, alloca is implemented in terms of malloc, and if we're | |
1227 using the relocating allocator routines, then malloc could cause a | |
1228 relocation, which might (if the strings being searched are in the | |
1229 ralloc heap) shift the data out from underneath the regexp | |
771 | 1230 routines. [To clarify: The purpose of rel-alloc is to allow data to |
1231 be moved in memory from one place to another so that all data | |
1232 blocks can be consolidated together and excess memory released back | |
1233 to the operating system. This requires that all the blocks that | |
1234 are managed by rel-alloc go at the very end of the program's heap, | |
1235 after all regularly malloc()ed data. malloc(), however, is used to | |
1236 owning the end of the heap, so that when more memory is needed, it | |
1237 just expands the heap using sbrk(). This is reconciled by using a | |
1238 malloc() (such as malloc.c, gmalloc.c, or recent versions of | |
1239 malloc() in libc) where the sbrk() call can be replaced with a | |
1240 user-specified call -- in this case, to rel-alloc's r_alloc_sbrk() | |
1241 routine. This routine calls the real sbrk(), but then shifts all | |
1242 the rel-alloc-managed blocks forward to the end of the heap again, | |
1243 so that malloc() gets the memory it needs in the location it needs | |
1244 it at. The regex routines may well have pointers to buffer data as | |
1245 their arguments, and buffers are managed by rel-alloc if rel-alloc | |
1246 has been enabled, so calling malloc() may potentially screw things | |
1247 up badly if it runs out of space and asks for more from the OS.] | |
1248 | |
1249 [[Here's another reason to avoid allocation: Emacs processes input | |
1250 from X in a signal handler; processing X input may call malloc; if | |
1251 input arrives while a matching routine is calling malloc, then | |
1252 we're scrod. But Emacs can't just block input while calling | |
1253 matching routines; then we don't notice interrupts when they come | |
1254 in. So, Emacs blocks input around all regexp calls except the | |
1255 matching calls, which it leaves unprotected, in the faith that they | |
1333 | 1256 will not malloc.]] This previous paragraph is irrelevant under XEmacs, |
1257 as we *do not* do anything so stupid as process input from within a | |
1258 signal handler. | |
1259 | |
1260 However, the regexp routines may get called reentrantly as a result of | |
1261 QUIT processing (e.g. under Windows: re_match -> QUIT -> quit_p -> drain | |
1262 events -> process WM_INITMENU -> call filter -> re_match; see stack | |
1263 trace in signal.c), so we cannot have any global variables (unless we do | |
1264 lots of trickiness including some unwind-protects, which isn't worth it | |
1265 at this point). Hence we MUST have MATCH_MAY_ALLOCATE defined. | |
1266 | |
1267 Also, the first paragraph does not make complete sense to me -- what | |
1268 about the use of rel-alloc to handle the fail stacks? Shouldn't these | |
1269 reallocations potentially cause buffer data to be relocated as well? I | |
826 | 1270 must be missing something, though -- perhaps the writer above is |
1271 assuming that the failure stack(s) will always be allocated after the | |
1272 buffer data, and thus reallocating them with rel-alloc won't move buffer | |
1333 | 1273 data. (In fact, a cursory glance at the code in ralloc.c seems to |
1274 confirm this.) --ben */ | |
428 | 1275 |
/* Normally, this is fine.  */
#define MATCH_MAY_ALLOCATE

/* When using GNU C, we are not REALLY using the C alloca, no matter
   what config.h may say.  So don't take precautions for it.  */
#ifdef __GNUC__
#undef C_ALLOCA
#endif

/* The match routines may not allocate if (1) they would do it with malloc
   and (2) it's not safe for them to use malloc.
   Note that if REL_ALLOC is defined, matching would not use malloc for the
   failure stack, but we would still use it for the register vectors;
   so REL_ALLOC should not affect this.  */

/* XEmacs can handle REL_ALLOC and malloc() OK, so MATCH_MAY_ALLOCATE is
   only ever withdrawn for non-emacs builds.  */
#if !defined (emacs) && (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (REL_ALLOC)
#undef MATCH_MAY_ALLOCATE
#endif

/* Refuse to build an emacs configuration in which MATCH_MAY_ALLOCATE
   ended up undefined.  */
#if !defined (MATCH_MAY_ALLOCATE) && defined (emacs)
#error regex must handle reentrancy; MATCH_MAY_ALLOCATE must be defined
#endif
1299 | |
428 | 1300 |
1301 /* Failure stack declarations and macros; both re_compile_fastmap and | |
1302 re_match_2 use a failure stack. These have to be macros because of | |
1303 REGEX_ALLOCATE_STACK. */ | |
1304 | |
1305 | |
1306 /* Number of failure points for which to initially allocate space | |
1307 when matching. If this number is exceeded, we allocate more | |
1308 space, so it is not a hard limit. */ | |
1309 #ifndef INIT_FAILURE_ALLOC | |
3300 | 1310 #define INIT_FAILURE_ALLOC 20 |
428 | 1311 #endif |
1312 | |
/* Roughly the maximum number of failure points on the stack.  It would
   be exactly that if we always used MAX_FAILURE_SPACE each time we
   failed.  This is a variable only so that users of regex can assign to
   it; we never change it ourselves.  */
1317 #if defined (MATCH_MAY_ALLOCATE) | |
1318 /* 4400 was enough to cause a crash on Alpha OSF/1, | |
1319 whose default stack limit is 2mb. */ | |
3300 | 1320 int re_max_failures = 40000; |
428 | 1321 #else |
3300 | 1322 int re_max_failures = 4000; |
428 | 1323 #endif |
1324 | |
1325 union fail_stack_elt | |
1326 { | |
446 | 1327 re_char *pointer; |
428 | 1328 int integer; |
1329 }; | |
1330 | |
1331 typedef union fail_stack_elt fail_stack_elt_t; | |
1332 | |
1333 typedef struct | |
1334 { | |
1335 fail_stack_elt_t *stack; | |
665 | 1336 Elemcount size; |
1337 Elemcount avail; /* Offset of next open position. */ | |
428 | 1338 } fail_stack_type; |
1339 | |
1340 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0) | |
1341 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0) | |
1342 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size) | |
1343 | |
1344 | |
1345 /* Define macros to initialize and free the failure stack. | |
1346 Do `return -2' if the alloc fails. */ | |
1347 | |
1348 #ifdef MATCH_MAY_ALLOCATE | |
1333 | 1349 #define INIT_FAIL_STACK() \ |
1350 do { \ | |
1351 fail_stack.stack = (fail_stack_elt_t *) \ | |
1352 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * \ | |
1353 sizeof (fail_stack_elt_t)); \ | |
1354 \ | |
1355 if (fail_stack.stack == NULL) \ | |
1356 { \ | |
1357 UNBIND_REGEX_MALLOC_CHECK (); \ | |
1358 return -2; \ | |
1359 } \ | |
1360 \ | |
1361 fail_stack.size = INIT_FAILURE_ALLOC; \ | |
1362 fail_stack.avail = 0; \ | |
428 | 1363 } while (0) |
1364 | |
1365 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack) | |
1366 #else | |
1367 #define INIT_FAIL_STACK() \ | |
1368 do { \ | |
1369 fail_stack.avail = 0; \ | |
1370 } while (0) | |
1371 | |
1372 #define RESET_FAIL_STACK() | |
1373 #endif | |
1374 | |
1375 | |
1376 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items. | |
1377 | |
1378 Return 1 if succeeds, and 0 if either ran out of memory | |
1379 allocating space for it or it was already too large. | |
1380 | |
1381 REGEX_REALLOCATE_STACK requires `destination' be declared. */ | |
1382 | |
1383 #define DOUBLE_FAIL_STACK(fail_stack) \ | |
1384 ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \ | |
1385 ? 0 \ | |
1386 : ((fail_stack).stack = (fail_stack_elt_t *) \ | |
1387 REGEX_REALLOCATE_STACK ((fail_stack).stack, \ | |
1388 (fail_stack).size * sizeof (fail_stack_elt_t), \ | |
1389 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \ | |
1390 \ | |
1391 (fail_stack).stack == NULL \ | |
1392 ? 0 \ | |
1393 : ((fail_stack).size <<= 1, \ | |
1394 1))) | |
1395 | |
1333 | 1396 #if !defined (emacs) || !defined (REL_ALLOC) |
1397 #define RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS() | |
1398 #else | |
/* Add `rmdp_offset' (which must be in scope at the point of use) to
   VAL, leaving NULL pointers unchanged.  VAL must be an lvalue and is
   evaluated twice.  The do/while wrapper makes the expansion a single
   statement, so it cannot capture the `else' of an enclosing if/else.  */
#define ADD_IF_NZ(val) do { if (val) (val) += rmdp_offset; } while (0)
1346 | 1401 #define RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS() \ |
1402 do \ | |
1403 { \ | |
1404 Bytecount rmdp_offset = offset_post_relocation (lispobj, orig_buftext); \ | |
1405 \ | |
1406 if (rmdp_offset) \ | |
1407 { \ | |
1408 int i; \ | |
1409 \ | |
1410 ADD_IF_NZ (string1); \ | |
1411 ADD_IF_NZ (string2); \ | |
1412 ADD_IF_NZ (d); \ | |
1413 ADD_IF_NZ (dend); \ | |
1414 ADD_IF_NZ (end1); \ | |
1415 ADD_IF_NZ (end2); \ | |
1416 ADD_IF_NZ (end_match_1); \ | |
1417 ADD_IF_NZ (end_match_2); \ | |
1418 \ | |
1419 if (bufp->re_ngroups) \ | |
1420 { \ | |
1421 for (i = 0; i < num_regs; i++) \ | |
1422 { \ | |
1423 ADD_IF_NZ (regstart[i]); \ | |
1424 ADD_IF_NZ (regend[i]); \ | |
1425 ADD_IF_NZ (old_regstart[i]); \ | |
1426 ADD_IF_NZ (old_regend[i]); \ | |
1427 ADD_IF_NZ (best_regstart[i]); \ | |
1428 ADD_IF_NZ (best_regend[i]); \ | |
1429 ADD_IF_NZ (reg_dummy[i]); \ | |
1430 } \ | |
1431 } \ | |
1432 \ | |
1433 ADD_IF_NZ (match_end); \ | |
1434 } \ | |
1333 | 1435 } while (0) |
1436 #endif /* !defined (emacs) || !defined (REL_ALLOC) */ | |
1437 | |
1438 #if !defined (emacs) || !defined (REL_ALLOC) | |
1439 #define RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS() | |
1440 #else | |
1346 | 1441 #define RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS() \ |
1442 do \ | |
1443 { \ | |
1444 Bytecount rmdp_offset = offset_post_relocation (lispobj, orig_buftext); \ | |
1445 \ | |
1446 if (rmdp_offset) \ | |
1447 { \ | |
1448 ADD_IF_NZ (str1); \ | |
1449 ADD_IF_NZ (str2); \ | |
1450 ADD_IF_NZ (string1); \ | |
1451 ADD_IF_NZ (string2); \ | |
1452 ADD_IF_NZ (d); \ | |
1453 } \ | |
1333 | 1454 } while (0) |
1455 | |
1456 #endif /* emacs */ | |
428 | 1457 |
1458 /* Push pointer POINTER on FAIL_STACK. | |
1459 Return 1 if was able to do so and 0 if ran out of memory allocating | |
1460 space to do so. */ | |
1461 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \ | |
1462 ((FAIL_STACK_FULL () \ | |
1463 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \ | |
1464 ? 0 \ | |
1465 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \ | |
1466 1)) | |
1467 | |
1468 /* Push a pointer value onto the failure stack. | |
1469 Assumes the variable `fail_stack'. Probably should only | |
1470 be called from within `PUSH_FAILURE_POINT'. */ | |
1471 #define PUSH_FAILURE_POINTER(item) \ | |
1472 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item) | |
1473 | |
1474 /* This pushes an integer-valued item onto the failure stack. | |
1475 Assumes the variable `fail_stack'. Probably should only | |
1476 be called from within `PUSH_FAILURE_POINT'. */ | |
1477 #define PUSH_FAILURE_INT(item) \ | |
1478 fail_stack.stack[fail_stack.avail++].integer = (item) | |
1479 | |
1480 /* Push a fail_stack_elt_t value onto the failure stack. | |
1481 Assumes the variable `fail_stack'. Probably should only | |
1482 be called from within `PUSH_FAILURE_POINT'. */ | |
1483 #define PUSH_FAILURE_ELT(item) \ | |
1484 fail_stack.stack[fail_stack.avail++] = (item) | |
1485 | |
1486 /* These three POP... operations complement the three PUSH... operations. | |
1487 All assume that `fail_stack' is nonempty. */ | |
1488 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer | |
1489 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer | |
1490 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail] | |
1491 | |
1492 /* Used to omit pushing failure point id's when we're not debugging. */ | |
1493 #ifdef DEBUG | |
1494 #define DEBUG_PUSH PUSH_FAILURE_INT | |
1495 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT () | |
1496 #else | |
1497 #define DEBUG_PUSH(item) | |
1498 #define DEBUG_POP(item_addr) | |
1499 #endif | |
1500 | |
1501 | |
1502 /* Push the information about the state we will need | |
1503 if we ever fail back to it. | |
1504 | |
1505 Requires variables fail_stack, regstart, regend, reg_info, and | |
1506 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be | |
1507 declared. | |
1508 | |
1509 Does `return FAILURE_CODE' if runs out of memory. */ | |
1510 | |
771 | 1511 #if !defined (REGEX_MALLOC) && !defined (REGEX_REL_ALLOC) |
456 | 1512 #define DECLARE_DESTINATION char *destination |
428 | 1513 #else |
456 | 1514 #define DECLARE_DESTINATION DECLARE_NOTHING |
428 | 1515 #endif |
1516 | |
/* Record a backtracking point on the failure stack.

   PATTERN_PLACE and STRING_PLACE are the pattern and string positions
   to resume from.  For every register from lowest_active_reg through
   highest_active_reg the start pointer, end pointer and info word are
   pushed first; then the two register bounds, the pattern place, the
   string place and (only when DEBUG is defined) the failure id.

   The stack is doubled until NUM_FAILURE_ITEMS slots are free.  If
   DOUBLE_FAIL_STACK fails, the enclosing function executes
   `return FAILURE_CODE'.  */
#define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code)	\
do {									\
  DECLARE_DESTINATION;							\
  /* Must be int, so when we don't save any registers, the arithmetic	\
     of 0 + -1 isn't done as unsigned.  */				\
  int this_reg;								\
									\
  DEBUG_STATEMENT (failure_id++);					\
  DEBUG_STATEMENT (nfailure_points_pushed++);				\
  DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%d:\n", failure_id);		\
  DEBUG_PRINT2 (" Before push, next avail: %ld\n",			\
		(long) (fail_stack).avail);				\
  DEBUG_PRINT2 (" size: %ld\n",						\
		(long) (fail_stack).size);				\
									\
  DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS);		\
  DEBUG_PRINT2 (" available: %ld\n",					\
		(long) REMAINING_AVAIL_SLOTS);				\
									\
  /* Ensure we have enough space allocated for what we will push.  */	\
  while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS)			\
    {									\
      BEGIN_REGEX_MALLOC_OK ();						\
      if (!DOUBLE_FAIL_STACK (fail_stack))				\
	{								\
	  /* Out of memory: undo the malloc bookkeeping and bail out	\
	     of the enclosing function.  */				\
	  END_REGEX_MALLOC_OK ();					\
	  UNBIND_REGEX_MALLOC_CHECK ();					\
	  return failure_code;						\
	}								\
      END_REGEX_MALLOC_OK ();						\
      DEBUG_PRINT2 ("\n Doubled stack; size now: %ld\n",		\
		    (long) (fail_stack).size);				\
      DEBUG_PRINT2 (" slots available: %ld\n",				\
		    (long) REMAINING_AVAIL_SLOTS);			\
									\
      RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS ();			\
    }									\
									\
  /* Push the info, starting with the registers.  */			\
  DEBUG_PRINT1 ("\n");							\
									\
  for (this_reg = lowest_active_reg; this_reg <= highest_active_reg;	\
       this_reg++)							\
    {									\
      DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg);			\
      DEBUG_STATEMENT (num_regs_pushed++);				\
									\
      DEBUG_PRINT2 (" start: 0x%lx\n", (long) regstart[this_reg]);	\
      PUSH_FAILURE_POINTER (regstart[this_reg]);			\
									\
      DEBUG_PRINT2 (" end: 0x%lx\n", (long) regend[this_reg]);		\
      PUSH_FAILURE_POINTER (regend[this_reg]);				\
									\
      DEBUG_PRINT2 (" info: 0x%lx\n ",					\
		    * (long *) (&reg_info[this_reg]));			\
      DEBUG_PRINT2 (" match_null=%d",					\
		    REG_MATCH_NULL_STRING_P (reg_info[this_reg]));	\
      DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg]));	\
      DEBUG_PRINT2 (" matched_something=%d",				\
		    MATCHED_SOMETHING (reg_info[this_reg]));		\
      DEBUG_PRINT2 (" ever_matched_something=%d",			\
		    EVER_MATCHED_SOMETHING (reg_info[this_reg]));	\
      DEBUG_PRINT1 ("\n");						\
      PUSH_FAILURE_ELT (reg_info[this_reg].word);			\
    }									\
									\
  DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);	\
  PUSH_FAILURE_INT (lowest_active_reg);					\
									\
  DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);	\
  PUSH_FAILURE_INT (highest_active_reg);				\
									\
  DEBUG_PRINT2 (" Pushing pattern 0x%lx: \n", (long) (pattern_place));	\
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend);		\
  PUSH_FAILURE_POINTER (pattern_place);					\
									\
  DEBUG_PRINT2 (" Pushing string 0x%lx: `", (long) (string_place));	\
  DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2,	\
			     size2);					\
  DEBUG_PRINT1 ("'\n");							\
  PUSH_FAILURE_POINTER (string_place);					\
									\
  DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id);		\
  DEBUG_PUSH (failure_id);						\
} while (0)
428 | 1602 |
1603 /* This is the number of items that are pushed and popped on the stack | |
1604 for each register. */ | |
1605 #define NUM_REG_ITEMS 3 | |
1606 | |
1607 /* Individual items aside from the registers. */ | |
1608 #ifdef DEBUG | |
1609 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */ | |
1610 #else | |
1611 #define NUM_NONREG_ITEMS 4 | |
1612 #endif | |
1613 | |
1614 /* We push at most this many items on the stack. */ | |
1615 /* We used to use (num_regs - 1), which is the number of registers | |
1616 this regexp will save; but that was changed to 5 | |
1617 to avoid stack overflow for a regexp with lots of parens. */ | |
1618 #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS) | |
1619 | |
1620 /* We actually push this many items. */ | |
1621 #define NUM_FAILURE_ITEMS \ | |
1622 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \ | |
1623 + NUM_NONREG_ITEMS) | |
1624 | |
1625 /* How many items can still be added to the stack without overflowing it. */ | |
1626 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail) | |
1627 | |
1628 | |
1629 /* Pops what PUSH_FAILURE_POINT pushes. | |
1630 | |
1631 We restore into the parameters, all of which should be lvalues: | |
1632 STR -- the saved data position. | |
1633 PAT -- the saved pattern position. | |
1634 LOW_REG, HIGH_REG -- the lowest and highest active registers. | |
1635 REGSTART, REGEND -- arrays of string positions. | |
1636 REG_INFO -- array of information about each subexpression. | |
1637 | |
1638 Also assumes the variables `fail_stack' and (if debugging), `bufp', | |
1639 `pend', `string1', `size1', `string2', and `size2'. */ | |
1640 | |
/* Items come off the failure stack in exactly the reverse of the order
   PUSH_FAILURE_POINT stored them: failure id (DEBUG only), string
   place, pattern place, high register, low register, then for each
   register from HIGH_REG down to LOW_REG its info word, end and
   start.  Also clears `set_regs_matched_done'.  */
#define POP_FAILURE_POINT(str, pat, low_reg, high_reg,			\
			  regstart, regend, reg_info)			\
do {									\
  DEBUG_STATEMENT (fail_stack_elt_t ffailure_id;)			\
  int this_reg;								\
  const unsigned char *string_temp;					\
									\
  assert (!FAIL_STACK_EMPTY ());					\
									\
  /* Remove failure points and point to how many regs pushed.  */	\
  DEBUG_PRINT1 ("POP_FAILURE_POINT:\n");				\
  DEBUG_PRINT2 (" Before pop, next avail: %ld\n",			\
		(long) fail_stack.avail);				\
  DEBUG_PRINT2 (" size: %ld\n",						\
		(long) fail_stack.size);				\
									\
  assert (fail_stack.avail >= NUM_NONREG_ITEMS);			\
									\
  DEBUG_POP (&ffailure_id.integer);					\
  DEBUG_PRINT2 (" Popping failure id: %d\n",				\
		* (int *) &ffailure_id);				\
									\
  /* If the saved string location is NULL, it came from an		\
     on_failure_keep_string_jump opcode, and we want to throw away the	\
     saved NULL, thus retaining our current position in the string.  */	\
  string_temp = POP_FAILURE_POINTER ();					\
  if (string_temp != NULL)						\
    str = string_temp;							\
									\
  DEBUG_PRINT2 (" Popping string 0x%lx: `", (long) (str));		\
  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2);	\
  DEBUG_PRINT1 ("'\n");							\
									\
  pat = (unsigned char *) POP_FAILURE_POINTER ();			\
  DEBUG_PRINT2 (" Popping pattern 0x%lx: ", (long) (pat));		\
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend);			\
									\
  /* Restore register info.  */						\
  high_reg = POP_FAILURE_INT ();					\
  DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg);		\
									\
  low_reg = POP_FAILURE_INT ();						\
  DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg);		\
									\
  /* Registers were pushed low-to-high, so pop them high-to-low.  */	\
  for (this_reg = high_reg; this_reg >= low_reg; this_reg--)		\
    {									\
      DEBUG_PRINT2 (" Popping reg: %d\n", this_reg);			\
									\
      reg_info[this_reg].word = POP_FAILURE_ELT ();			\
      DEBUG_PRINT2 (" info: 0x%lx\n",					\
		    * (long *) &reg_info[this_reg]);			\
									\
      regend[this_reg] = POP_FAILURE_POINTER ();			\
      DEBUG_PRINT2 (" end: 0x%lx\n", (long) regend[this_reg]);		\
									\
      regstart[this_reg] = POP_FAILURE_POINTER ();			\
      DEBUG_PRINT2 (" start: 0x%lx\n", (long) regstart[this_reg]);	\
    }									\
									\
  set_regs_matched_done = 0;						\
  DEBUG_STATEMENT (nfailure_points_popped++);				\
} while (0) /* POP_FAILURE_POINT */
428 | 1703 |
1704 | |
1705 | |
1706 /* Structure for per-register (a.k.a. per-group) information. | |
1707 Other register information, such as the | |
1708 starting and ending positions (which are addresses), and the list of | |
1709 inner groups (which is a bits list) are maintained in separate | |
1710 variables. | |
1711 | |
1712 We are making a (strictly speaking) nonportable assumption here: that | |
1713 the compiler will pack our bit fields into something that fits into | |
1714 the type of `word', i.e., is something that fits into one item on the | |
1715 failure stack. */ | |
1716 | |
1717 typedef union | |
1718 { | |
1719 fail_stack_elt_t word; | |
1720 struct | |
1721 { | |
1722 /* This field is one if this group can match the empty string, | |
1723 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */ | |
1724 #define MATCH_NULL_UNSET_VALUE 3 | |
647 | 1725 unsigned int match_null_string_p : 2; |
1726 unsigned int is_active : 1; | |
1727 unsigned int matched_something : 1; | |
1728 unsigned int ever_matched_something : 1; | |
428 | 1729 } bits; |
1730 } register_info_type; | |
1731 | |
1732 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p) | |
1733 #define IS_ACTIVE(R) ((R).bits.is_active) | |
1734 #define MATCHED_SOMETHING(R) ((R).bits.matched_something) | |
1735 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something) | |
1736 | |
1737 | |
1738 /* Call this when have matched a real character; it sets `matched' flags | |
1739 for the subexpressions which we are currently inside. Also records | |
1740 that those subexprs have matched. */ | |
1741 #define SET_REGS_MATCHED() \ | |
1742 do \ | |
1743 { \ | |
1744 if (!set_regs_matched_done) \ | |
1745 { \ | |
647 | 1746 int r; \ |
428 | 1747 set_regs_matched_done = 1; \ |
1748 for (r = lowest_active_reg; r <= highest_active_reg; r++) \ | |
1749 { \ | |
1750 MATCHED_SOMETHING (reg_info[r]) \ | |
1751 = EVER_MATCHED_SOMETHING (reg_info[r]) \ | |
1752 = 1; \ | |
1753 } \ | |
1754 } \ | |
1755 } \ | |
1756 while (0) | |
1757 | |
1758 /* Registers are set to a sentinel when they haven't yet matched. */ | |
/* The address of this object serves as the "not yet matched" marker;
   its contents are never examined by these macros.  */
static unsigned char reg_unset_dummy;
/* Sentinel pointer stored in a register that has not matched.  */
#define REG_UNSET_VALUE (&reg_unset_dummy)
/* Nonzero if register pointer E still holds the sentinel.  */
#define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1762 | |
1763 /* Subroutine declarations and macros for regex_compile. */ | |
1764 | |
1765 /* Fetch the next character in the uncompiled pattern---translating it | |
826 | 1766 if necessary. */ |
428 | 1767 #define PATFETCH(c) \ |
446 | 1768 do { \ |
1769 PATFETCH_RAW (c); \ | |
826 | 1770 c = RE_TRANSLATE (c); \ |
428 | 1771 } while (0) |
1772 | |
1773 /* Fetch the next character in the uncompiled pattern, with no | |
1774 translation. */ | |
1775 #define PATFETCH_RAW(c) \ | |
1776 do {if (p == pend) return REG_EEND; \ | |
1777 assert (p < pend); \ | |
867 | 1778 c = itext_ichar (p); \ |
1779 INC_IBYTEPTR (p); \ | |
428 | 1780 } while (0) |
1781 | |
1782 /* Go backwards one character in the pattern. */ | |
867 | 1783 #define PATUNFETCH DEC_IBYTEPTR (p) |
428 | 1784 |
1785 /* If `translate' is non-null, return translate[D], else just D. We | |
1786 cast the subscript to translate because some data is declared as | |
1787 `char *', to avoid warnings when a string constant is passed. But | |
1788 when we use a character as a subscript we must make it unsigned. */ | |
826 | 1789 #define RE_TRANSLATE(d) \ |
1790 (TRANSLATE_P (translate) ? RE_TRANSLATE_1 (d) : (d)) | |
428 | 1791 |
1792 /* Macros for outputting the compiled pattern into `buffer'. */ | |
1793 | |
1794 /* If the buffer isn't allocated when it comes in, use this. */ | |
1795 #define INIT_BUF_SIZE 32 | |
1796 | |
1797 /* Make sure we have at least N more bytes of space in buffer. */ | |
1798 #define GET_BUFFER_SPACE(n) \ | |
647 | 1799 while (buf_end - bufp->buffer + (n) > (ptrdiff_t) bufp->allocated) \ |
428 | 1800 EXTEND_BUFFER () |
1801 | |
1802 /* Make sure we have one more byte of buffer space and then add C to it. */ | |
1803 #define BUF_PUSH(c) \ | |
1804 do { \ | |
1805 GET_BUFFER_SPACE (1); \ | |
446 | 1806 *buf_end++ = (unsigned char) (c); \ |
428 | 1807 } while (0) |
1808 | |
1809 | |
1810 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */ | |
1811 #define BUF_PUSH_2(c1, c2) \ | |
1812 do { \ | |
1813 GET_BUFFER_SPACE (2); \ | |
446 | 1814 *buf_end++ = (unsigned char) (c1); \ |
1815 *buf_end++ = (unsigned char) (c2); \ | |
428 | 1816 } while (0) |
1817 | |
1818 | |
1819 /* As with BUF_PUSH_2, except for three bytes. */ | |
1820 #define BUF_PUSH_3(c1, c2, c3) \ | |
1821 do { \ | |
1822 GET_BUFFER_SPACE (3); \ | |
446 | 1823 *buf_end++ = (unsigned char) (c1); \ |
1824 *buf_end++ = (unsigned char) (c2); \ | |
1825 *buf_end++ = (unsigned char) (c3); \ | |
428 | 1826 } while (0) |
1827 | |
1828 | |
1829 /* Store a jump with opcode OP at LOC to location TO. We store a | |
1830 relative address offset by the three bytes the jump itself occupies. */ | |
1831 #define STORE_JUMP(op, loc, to) \ | |
1832 store_op1 (op, loc, (to) - (loc) - 3) | |
1833 | |
1834 /* Likewise, for a two-argument jump. */ | |
1835 #define STORE_JUMP2(op, loc, to, arg) \ | |
1836 store_op2 (op, loc, (to) - (loc) - 3, arg) | |
1837 | |
446 | 1838 /* Like `STORE_JUMP', but for inserting. Assume `buf_end' is the |
1839 buffer end. */ | |
428 | 1840 #define INSERT_JUMP(op, loc, to) \ |
446 | 1841 insert_op1 (op, loc, (to) - (loc) - 3, buf_end) |
1842 | |
1843 /* Like `STORE_JUMP2', but for inserting. Assume `buf_end' is the | |
1844 buffer end. */ | |
428 | 1845 #define INSERT_JUMP2(op, loc, to, arg) \ |
446 | 1846 insert_op2 (op, loc, (to) - (loc) - 3, arg, buf_end) |
428 | 1847 |
1848 | |
1849 /* This is not an arbitrary limit: the arguments which represent offsets | |
1850 into the pattern are two bytes long. So if 2^16 bytes turns out to | |
1851 be too small, many things would have to change. */ | |
1852 #define MAX_BUF_SIZE (1L << 16) | |
1853 | |
1854 | |
1855 /* Extend the buffer to twice its current size (capped at MAX_BUF_SIZE) | |
1856 via realloc and reset the pointers that pointed into the old block to | |
1857 point to the correct places in the new one. If the buffer is already | |
1858 MAX_BUF_SIZE bytes long, return REG_ESIZE instead. */ | |
1333 | 1859 #define EXTEND_BUFFER() \ |
1860 do { \ | |
1861 re_char *old_buffer = bufp->buffer; \ | |
1862 if (bufp->allocated == MAX_BUF_SIZE) \ | |
1863 return REG_ESIZE; \ | |
1864 bufp->allocated <<= 1; \ | |
1865 if (bufp->allocated > MAX_BUF_SIZE) \ | |
1866 bufp->allocated = MAX_BUF_SIZE; \ | |
1867 bufp->buffer = \ | |
1868 (unsigned char *) xrealloc (bufp->buffer, bufp->allocated); \ | |
1869 if (bufp->buffer == NULL) \ | |
1870 return REG_ESPACE; \ | |
1871 /* If the buffer moved, move all the pointers into it. */ \ | |
1872 if (old_buffer != bufp->buffer) \ | |
1873 { \ | |
1874 buf_end = (buf_end - old_buffer) + bufp->buffer; \ | |
1875 begalt = (begalt - old_buffer) + bufp->buffer; \ | |
1876 if (fixup_alt_jump) \ | |
1877 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer; \ | |
1878 if (laststart) \ | |
1879 laststart = (laststart - old_buffer) + bufp->buffer; \ | |
1880 if (pending_exact) \ | |
1881 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \ | |
1882 } \ | |
428 | 1883 } while (0) |
1884 | |
1885 | |
1886 /* Since we have one byte reserved for the register number argument to | |
1887 {start,stop}_memory, the maximum number of groups we can report | |
1888 things about is what fits in that byte. */ | |
1889 #define MAX_REGNUM 255 | |
1890 | |
1891 /* But patterns can have more than `MAX_REGNUM' registers. We just | |
502 | 1892 ignore the excess. |
1893 #### not true! groups past this will fail in lots of ways, if we | |
1894 ever have to backtrack. | |
1895 */ | |
647 | 1896 typedef int regnum_t; |
428 | 1897 |
502 | 1898 #define INIT_REG_TRANSLATE_SIZE 5 |
428 | 1899 |
1900 /* Macros for the compile stack. */ | |
1901 | |
1902 /* Since offsets can go either forwards or backwards, this type needs to | |
1903 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */ | |
1904 typedef int pattern_offset_t; | |
1905 | |
1906 typedef struct | |
1907 { | |
1908 pattern_offset_t begalt_offset; | |
1909 pattern_offset_t fixup_alt_jump; | |
1910 pattern_offset_t inner_group_offset; | |
1911 pattern_offset_t laststart_offset; | |
1912 regnum_t regnum; | |
1913 } compile_stack_elt_t; | |
1914 | |
1915 | |
1916 typedef struct | |
1917 { | |
1918 compile_stack_elt_t *stack; | |
647 | 1919 int size; |
1920 int avail; /* Offset of next open position. */ | |
428 | 1921 } compile_stack_type; |
1922 | |
1923 | |
1924 #define INIT_COMPILE_STACK_SIZE 32 | |
1925 | |
1926 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0) | |
1927 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) | |
1928 | |
1929 /* The next available element. */ | |
1930 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) | |
1931 | |
1932 | |
1933 /* Set the bit for character C in a bit vector. */ | |
/* C is reduced to an unsigned char; the byte index is C / BYTEWIDTH and
   the bit within that byte is C % BYTEWIDTH, both relative to `buf_end'.
   C is parenthesized at each use so that an expression argument is
   evaluated in full before the cast is applied.  */
#define SET_LIST_BIT(c)					\
  (buf_end[((unsigned char) (c)) / BYTEWIDTH]		\
   |= 1 << (((unsigned char) (c)) % BYTEWIDTH))
1937 | |
1938 #ifdef MULE | |
1939 | |
1940 /* Set the "bit" for character C in a range table. */ | |
1941 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt) | |
1942 | |
1943 /* Set the "bit" for character c in the appropriate table. */ | |
1944 #define SET_EITHER_BIT(c) \ | |
1945 do { \ | |
1946 if (has_extended_chars) \ | |
1947 SET_RANGETAB_BIT (c); \ | |
1948 else \ | |
1949 SET_LIST_BIT (c); \ | |
1950 } while (0) | |
1951 | |
1952 #else /* not MULE */ | |
1953 | |
1954 #define SET_EITHER_BIT(c) SET_LIST_BIT (c) | |
1955 | |
1956 #endif | |
1957 | |
1958 | |
1959 /* Get the next unsigned number in the uncompiled pattern. */ | |
1960 #define GET_UNSIGNED_NUMBER(num) \ | |
1961 { if (p != pend) \ | |
1962 { \ | |
1963 PATFETCH (c); \ | |
1964 while (ISDIGIT (c)) \ | |
1965 { \ | |
1966 if (num < 0) \ | |
1967 num = 0; \ | |
1968 num = num * 10 + c - '0'; \ | |
1969 if (p == pend) \ | |
1970 break; \ | |
1971 PATFETCH (c); \ | |
1972 } \ | |
1973 } \ | |
1974 } | |
1975 | |
1976 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ | |
1977 | |
1978 #define IS_CHAR_CLASS(string) \ | |
1979 (STREQ (string, "alpha") || STREQ (string, "upper") \ | |
1980 || STREQ (string, "lower") || STREQ (string, "digit") \ | |
1981 || STREQ (string, "alnum") || STREQ (string, "xdigit") \ | |
1982 || STREQ (string, "space") || STREQ (string, "print") \ | |
1983 || STREQ (string, "punct") || STREQ (string, "graph") \ | |
1984 || STREQ (string, "cntrl") || STREQ (string, "blank")) | |
1985 | |
1986 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg); | |
1987 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2); | |
1988 static void insert_op1 (re_opcode_t op, unsigned char *loc, int arg, | |
1989 unsigned char *end); | |
1990 static void insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, | |
1991 unsigned char *end); | |
460 | 1992 static re_bool at_begline_loc_p (re_char *pattern, re_char *p, |
428 | 1993 reg_syntax_t syntax); |
460 | 1994 static re_bool at_endline_loc_p (re_char *p, re_char *pend, int syntax); |
1995 static re_bool group_in_compile_stack (compile_stack_type compile_stack, | |
428 | 1996 regnum_t regnum); |
446 | 1997 static reg_errcode_t compile_range (re_char **p_ptr, re_char *pend, |
1998 RE_TRANSLATE_TYPE translate, | |
1999 reg_syntax_t syntax, | |
428 | 2000 unsigned char *b); |
2001 #ifdef MULE | |
446 | 2002 static reg_errcode_t compile_extended_range (re_char **p_ptr, |
2003 re_char *pend, | |
2004 RE_TRANSLATE_TYPE translate, | |
428 | 2005 reg_syntax_t syntax, |
2006 Lisp_Object rtab); | |
2007 #endif /* MULE */ | |
460 | 2008 static re_bool group_match_null_string_p (unsigned char **p, |
428 | 2009 unsigned char *end, |
2010 register_info_type *reg_info); | |
460 | 2011 static re_bool alt_match_null_string_p (unsigned char *p, unsigned char *end, |
428 | 2012 register_info_type *reg_info); |
460 | 2013 static re_bool common_op_match_null_string_p (unsigned char **p, |
428 | 2014 unsigned char *end, |
2015 register_info_type *reg_info); | |
826 | 2016 static int bcmp_translate (re_char *s1, re_char *s2, |
2017 REGISTER int len, RE_TRANSLATE_TYPE translate | |
2018 #ifdef emacs | |
2019 , Internal_Format fmt, Lisp_Object lispobj | |
2020 #endif | |
2021 ); | |
428 | 2022 static int re_match_2_internal (struct re_pattern_buffer *bufp, |
446 | 2023 re_char *string1, int size1, |
2024 re_char *string2, int size2, int pos, | |
826 | 2025 struct re_registers *regs, int stop |
2026 RE_LISP_CONTEXT_ARGS_DECL); | |
428 | 2027 |
2028 #ifndef MATCH_MAY_ALLOCATE | |
2029 | |
2030 /* If we cannot allocate large objects within re_match_2_internal, | |
2031 we make the fail stack and register vectors global. | |
2032 The fail stack, we grow to the maximum size when a regexp | |
2033 is compiled. | |
2034 The register vectors, we adjust in size each time we | |
2035 compile a regexp, according to the number of registers it needs. */ | |
2036 | |
2037 static fail_stack_type fail_stack; | |
2038 | |
2039 /* Size with which the following vectors are currently allocated. | |
2040 That is so we can make them bigger as needed, | |
2041 but never make them smaller. */ | |
2042 static int regs_allocated_size; | |
2043 | |
446 | 2044 static re_char ** regstart, ** regend; |
2045 static re_char ** old_regstart, ** old_regend; | |
2046 static re_char **best_regstart, **best_regend; | |
428 | 2047 static register_info_type *reg_info; |
446 | 2048 static re_char **reg_dummy; |
428 | 2049 static register_info_type *reg_info_dummy; |
2050 | |
2051 /* Make the register vectors big enough for NUM_REGS registers, | |
2052 but don't make them smaller. */ | |
2053 | |
2054 static | |
2055 regex_grow_registers (int num_regs) | |
2056 { | |
2057 if (num_regs > regs_allocated_size) | |
2058 { | |
551 | 2059 RETALLOC (regstart, num_regs, re_char *); |
2060 RETALLOC (regend, num_regs, re_char *); | |
2061 RETALLOC (old_regstart, num_regs, re_char *); | |
2062 RETALLOC (old_regend, num_regs, re_char *); | |
2063 RETALLOC (best_regstart, num_regs, re_char *); | |
2064 RETALLOC (best_regend, num_regs, re_char *); | |
2065 RETALLOC (reg_info, num_regs, register_info_type); | |
2066 RETALLOC (reg_dummy, num_regs, re_char *); | |
2067 RETALLOC (reg_info_dummy, num_regs, register_info_type); | |
428 | 2068 |
2069 regs_allocated_size = num_regs; | |
2070 } | |
2071 } | |
2072 | |
2073 #endif /* not MATCH_MAY_ALLOCATE */ | |
2074 | |
2075 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. | |
2076 Returns one of error codes defined in `regex.h', or zero for success. | |
2077 | |
2078 Assumes the `allocated' (and perhaps `buffer') and `translate' | |
2079 fields are set in BUFP on entry. | |
2080 | |
2081 If it succeeds, results are put in BUFP (if it returns an error, the | |
2082 contents of BUFP are undefined): | |
2083 `buffer' is the compiled pattern; | |
2084 `syntax' is set to SYNTAX; | |
2085 `used' is set to the length of the compiled pattern; | |
2086 `fastmap_accurate' is zero; | |
502 | 2087 `re_ngroups' is the number of groups/subexpressions (including shy |
2088 groups) in PATTERN; | |
2089 `re_nsub' is the number of non-shy groups in PATTERN; | |
428 | 2090 `not_bol' and `not_eol' are zero; |
2091 | |
2092 The `fastmap' and `newline_anchor' fields are neither | |
2093 examined nor set. */ | |
2094 | |
2095 /* Return, freeing storage we allocated. */ | |
/* Releases compile_stack.stack with xfree, then makes the enclosing
   function return VALUE.  Expects a variable named `compile_stack'
   in scope at the point of use.  */
#define FREE_STACK_RETURN(value)			\
do							\
{							\
  xfree (compile_stack.stack);				\
  return value;						\
} while (0)
428 | 2102 |
2103 static reg_errcode_t | |
446 | 2104 regex_compile (re_char *pattern, int size, reg_syntax_t syntax, |
428 | 2105 struct re_pattern_buffer *bufp) |
2106 { | |
2107 /* We fetch characters from PATTERN here. We declare these as int | |
2108 (or possibly long) so that chars above 127 can be used as | |
2109 array indices. The macros that fetch a character from the pattern | |
2110 make sure to coerce to unsigned char before assigning, so we won't | |
2111 get bitten by negative numbers here. */ | |
2112 /* XEmacs change: used to be unsigned char. */ | |
2113 REGISTER EMACS_INT c, c1; | |
2114 | |
2115 /* A random temporary spot in PATTERN. */ | |
446 | 2116 re_char *p1; |
428 | 2117 |
2118 /* Points to the end of the buffer, where we should append. */ | |
446 | 2119 REGISTER unsigned char *buf_end; |
428 | 2120 |
2121 /* Keeps track of unclosed groups. */ | |
2122 compile_stack_type compile_stack; | |
2123 | |
2124 /* Points to the current (ending) position in the pattern. */ | |
446 | 2125 re_char *p = pattern; |
2126 re_char *pend = pattern + size; | |
428 | 2127 |
2128 /* How to translate the characters in the pattern. */ | |
446 | 2129 RE_TRANSLATE_TYPE translate = bufp->translate; |
428 | 2130 |
2131 /* Address of the count-byte of the most recently inserted `exactn' | |
2132 command. This makes it possible to tell if a new exact-match | |
2133 character can be added to that command or if the character requires | |
2134 a new `exactn' command. */ | |
2135 unsigned char *pending_exact = 0; | |
2136 | |
2137 /* Address of start of the most recently finished expression. | |
2138 This tells, e.g., postfix * where to find the start of its | |
2139 operand. Reset at the beginning of groups and alternatives. */ | |
2140 unsigned char *laststart = 0; | |
2141 | |
2142 /* Address of beginning of regexp, or inside of last group. */ | |
2143 unsigned char *begalt; | |
2144 | |
2145 /* Place in the uncompiled pattern (i.e., the {) to | |
2146 which to go back if the interval is invalid. */ | |
446 | 2147 re_char *beg_interval; |
428 | 2148 |
2149 /* Address of the place where a forward jump should go to the end of | |
2150 the containing expression. Each alternative of an `or' -- except the | |
2151 last -- ends with a forward jump of this sort. */ | |
2152 unsigned char *fixup_alt_jump = 0; | |
2153 | |
2154 /* Counts open-groups as they are encountered. Remembered for the | |
2155 matching close-group on the compile stack, so the same register | |
2156 number is put in the stop_memory as the start_memory. */ | |
2157 regnum_t regnum = 0; | |
2158 | |
2159 #ifdef DEBUG | |
2160 DEBUG_PRINT1 ("\nCompiling pattern: "); | |
2161 if (debug) | |
2162 { | |
647 | 2163 int debug_count; |
428 | 2164 |
2165 for (debug_count = 0; debug_count < size; debug_count++) | |
2166 putchar (pattern[debug_count]); | |
2167 putchar ('\n'); | |
2168 } | |
2169 #endif /* DEBUG */ | |
2170 | |
2171 /* Initialize the compile stack. */ | |
2172 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); | |
2173 if (compile_stack.stack == NULL) | |
2174 return REG_ESPACE; | |
2175 | |
2176 compile_stack.size = INIT_COMPILE_STACK_SIZE; | |
2177 compile_stack.avail = 0; | |
2178 | |
2179 /* Initialize the pattern buffer. */ | |
2180 bufp->syntax = syntax; | |
2181 bufp->fastmap_accurate = 0; | |
2182 bufp->not_bol = bufp->not_eol = 0; | |
2183 | |
2184 /* Set `used' to zero, so that if we return an error, the pattern | |
2185 printer (for debugging) will think there's no pattern. We reset it | |
2186 at the end. */ | |
2187 bufp->used = 0; | |
2188 | |
2189 /* Always count groups, whether or not bufp->no_sub is set. */ | |
2190 bufp->re_nsub = 0; | |
502 | 2191 bufp->re_ngroups = 0; |
2192 | |
2193 bufp->warned_about_incompatible_back_references = 0; | |
2194 | |
2195 if (bufp->external_to_internal_register == 0) | |
2196 { | |
2197 bufp->external_to_internal_register_size = INIT_REG_TRANSLATE_SIZE; | |
2198 RETALLOC (bufp->external_to_internal_register, | |
2199 bufp->external_to_internal_register_size, | |
2200 int); | |
2201 } | |
2202 | |
2203 { | |
2204 int i; | |
2205 | |
2206 bufp->external_to_internal_register[0] = 0; | |
2207 for (i = 1; i < bufp->external_to_internal_register_size; i++) | |
2208 bufp->external_to_internal_register[i] = (int) 0xDEADBEEF; | |
2209 } | |
428 | 2210 |
2211 #if !defined (emacs) && !defined (SYNTAX_TABLE) | |
2212 /* Initialize the syntax table. */ | |
2213 init_syntax_once (); | |
2214 #endif | |
2215 | |
2216 if (bufp->allocated == 0) | |
2217 { | |
2218 if (bufp->buffer) | |
2219 { /* If zero allocated, but buffer is non-null, try to realloc | |
2220 enough space. This loses if buffer's address is bogus, but | |
2221 that is the user's responsibility. */ | |
2222 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char); | |
2223 } | |
2224 else | |
2225 { /* Caller did not allocate a buffer. Do it for them. */ | |
2226 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char); | |
2227 } | |
2228 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE); | |
2229 | |
2230 bufp->allocated = INIT_BUF_SIZE; | |
2231 } | |
2232 | |
446 | 2233 begalt = buf_end = bufp->buffer; |
428 | 2234 |
2235 /* Loop through the uncompiled pattern until we're at the end. */ | |
2236 while (p != pend) | |
2237 { | |
2238 PATFETCH (c); | |
2239 | |
2240 switch (c) | |
2241 { | |
2242 case '^': | |
2243 { | |
2244 if ( /* If at start of pattern, it's an operator. */ | |
2245 p == pattern + 1 | |
2246 /* If context independent, it's an operator. */ | |
2247 || syntax & RE_CONTEXT_INDEP_ANCHORS | |
2248 /* Otherwise, depends on what's come before. */ | |
2249 || at_begline_loc_p (pattern, p, syntax)) | |
2250 BUF_PUSH (begline); | |
2251 else | |
2252 goto normal_char; | |
2253 } | |
2254 break; | |
2255 | |
2256 | |
2257 case '$': | |
2258 { | |
2259 if ( /* If at end of pattern, it's an operator. */ | |
2260 p == pend | |
2261 /* If context independent, it's an operator. */ | |
2262 || syntax & RE_CONTEXT_INDEP_ANCHORS | |
2263 /* Otherwise, depends on what's next. */ | |
2264 || at_endline_loc_p (p, pend, syntax)) | |
2265 BUF_PUSH (endline); | |
2266 else | |
2267 goto normal_char; | |
2268 } | |
2269 break; | |
2270 | |
2271 | |
2272 case '+': | |
2273 case '?': | |
2274 if ((syntax & RE_BK_PLUS_QM) | |
2275 || (syntax & RE_LIMITED_OPS)) | |
2276 goto normal_char; | |
2277 handle_plus: | |
2278 case '*': | |
2279 /* If there is no previous pattern... */ | |
2280 if (!laststart) | |
2281 { | |
2282 if (syntax & RE_CONTEXT_INVALID_OPS) | |
2283 FREE_STACK_RETURN (REG_BADRPT); | |
2284 else if (!(syntax & RE_CONTEXT_INDEP_OPS)) | |
2285 goto normal_char; | |
2286 } | |
2287 | |
2288 { | |
2289 /* true means zero/many matches are allowed. */ | |
460 | 2290 re_bool zero_times_ok = c != '+'; |
2291 re_bool many_times_ok = c != '?'; | |
428 | 2292 |
2293 /* true means match shortest string possible. */ | |
460 | 2294 re_bool minimal = false; |
428 | 2295 |
2296 /* If there is a sequence of repetition chars, collapse it | |
2297 down to just one (the right one). We can't combine | |
2298 interval operators with these because of, e.g., `a{2}*', | |
2299 which should only match an even number of `a's. */ | |
2300 while (p != pend) | |
2301 { | |
2302 PATFETCH (c); | |
2303 | |
2304 if (c == '*' || (!(syntax & RE_BK_PLUS_QM) | |
2305 && (c == '+' || c == '?'))) | |
2306 ; | |
2307 | |
2308 else if (syntax & RE_BK_PLUS_QM && c == '\\') | |
2309 { | |
2310 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); | |
2311 | |
2312 PATFETCH (c1); | |
2313 if (!(c1 == '+' || c1 == '?')) | |
2314 { | |
2315 PATUNFETCH; | |
2316 PATUNFETCH; | |
2317 break; | |
2318 } | |
2319 | |
2320 c = c1; | |
2321 } | |
2322 else | |
2323 { | |
2324 PATUNFETCH; | |
2325 break; | |
2326 } | |
2327 | |
2328 /* If we get here, we found another repeat character. */ | |
2329 if (!(syntax & RE_NO_MINIMAL_MATCHING)) | |
2330 { | |
440 | 2331 /* "*?" and "+?" and "??" are okay (and mean match |
2332 minimally), but other sequences (such as "*??" and | |
2333 "+++") are rejected (reserved for future use). */ | |
428 | 2334 if (minimal || c != '?') |
2335 FREE_STACK_RETURN (REG_BADRPT); | |
2336 minimal = true; | |
2337 } | |
2338 else | |
2339 { | |
2340 zero_times_ok |= c != '+'; | |
2341 many_times_ok |= c != '?'; | |
2342 } | |
2343 } | |
2344 | |
2345 /* Star, etc. applied to an empty pattern is equivalent | |
2346 to an empty pattern. */ | |
2347 if (!laststart) | |
2348 break; | |
2349 | |
2350 /* Now we know whether zero matches is allowed | |
2351 and whether two or more matches is allowed | |
2352 and whether we want minimal or maximal matching. */ | |
2353 if (minimal) | |
2354 { | |
2355 if (!many_times_ok) | |
2356 { | |
2357 /* "a??" becomes: | |
2358 0: /on_failure_jump to 6 | |
2359 3: /jump to 9 | |
2360 6: /exactn/1/A | |
2361 9: end of pattern. | |
2362 */ | |
2363 GET_BUFFER_SPACE (6); | |
446 | 2364 INSERT_JUMP (jump, laststart, buf_end + 3); |
2365 buf_end += 3; | |
428 | 2366 INSERT_JUMP (on_failure_jump, laststart, laststart + 6); |
446 | 2367 buf_end += 3; |
428 | 2368 } |
2369 else if (zero_times_ok) | |
2370 { | |
2371 /* "a*?" becomes: | |
2372 0: /jump to 6 | |
2373 3: /exactn/1/A | |
2374 6: /on_failure_jump to 3 | |
2375 9: end of pattern. | |
2376 */ | |
2377 GET_BUFFER_SPACE (6); | |
446 | 2378 INSERT_JUMP (jump, laststart, buf_end + 3); |
2379 buf_end += 3; | |
2380 STORE_JUMP (on_failure_jump, buf_end, laststart + 3); | |
2381 buf_end += 3; | |
428 | 2382 } |
2383 else | |
2384 { | |
2385 /* "a+?" becomes: | |
2386 0: /exactn/1/A | |
2387 3: /on_failure_jump to 0 | |
2388 6: end of pattern. | |
2389 */ | |
2390 GET_BUFFER_SPACE (3); | |
446 | 2391 STORE_JUMP (on_failure_jump, buf_end, laststart); |
2392 buf_end += 3; | |
428 | 2393 } |
2394 } | |
2395 else | |
2396 { | |
2397 /* Are we optimizing this jump? */ | |
460 | 2398 re_bool keep_string_p = false; |
428 | 2399 |
2400 if (many_times_ok) | |
446 | 2401 { /* More than one repetition is allowed, so put in |
2402 at the end a backward relative jump from | |
2403 `buf_end' to before the next jump we're going | |
2404 to put in below (which jumps from laststart to | |
2405 after this jump). | |
428 | 2406 |
2407 But if we are at the `*' in the exact sequence `.*\n', | |
2408 insert an unconditional jump backwards to the ., | |
2409 instead of the beginning of the loop. This way we only | |
2410 push a failure point once, instead of every time | |
2411 through the loop. */ | |
2412 assert (p - 1 > pattern); | |
2413 | |
2414 /* Allocate the space for the jump. */ | |
2415 GET_BUFFER_SPACE (3); | |
2416 | |
2417 /* We know we are not at the first character of the | |
2418 pattern, because laststart was nonzero. And we've | |
2419 already incremented `p', by the way, to be the | |
2420 character after the `*'. Do we have to do something | |
2421 analogous here for null bytes, because of | |
2422 RE_DOT_NOT_NULL? */ | |
446 | 2423 if (*(p - 2) == '.' |
428 | 2424 && zero_times_ok |
446 | 2425 && p < pend && *p == '\n' |
428 | 2426 && !(syntax & RE_DOT_NEWLINE)) |
2427 { /* We have .*\n. */ | |
446 | 2428 STORE_JUMP (jump, buf_end, laststart); |
428 | 2429 keep_string_p = true; |
2430 } | |
2431 else | |
2432 /* Anything else. */ | |
446 | 2433 STORE_JUMP (maybe_pop_jump, buf_end, laststart - 3); |
428 | 2434 |
2435 /* We've added more stuff to the buffer. */ | |
446 | 2436 buf_end += 3; |
428 | 2437 } |
2438 | |
446 | 2439 /* On failure, jump from laststart to buf_end + 3, |
2440 which will be the end of the buffer after this jump | |
2441 is inserted. */ | |
428 | 2442 GET_BUFFER_SPACE (3); |
2443 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump | |
2444 : on_failure_jump, | |
446 | 2445 laststart, buf_end + 3); |
2446 buf_end += 3; | |
428 | 2447 |
2448 if (!zero_times_ok) | |
2449 { | |
2450 /* At least one repetition is required, so insert a | |
2451 `dummy_failure_jump' before the initial | |
2452 `on_failure_jump' instruction of the loop. This | |
2453 effects a skip over that instruction the first time | |
2454 we hit that loop. */ | |
2455 GET_BUFFER_SPACE (3); | |
2456 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6); | |
446 | 2457 buf_end += 3; |
428 | 2458 } |
2459 } | |
2460 pending_exact = 0; | |
2461 } | |
2462 break; | |
2463 | |
2464 | |
2465 case '.': | |
446 | 2466 laststart = buf_end; |
428 | 2467 BUF_PUSH (anychar); |
2468 break; | |
2469 | |
2470 | |
2471 case '[': | |
2472 { | |
2473 /* XEmacs change: this whole section */ | |
460 | 2474 re_bool had_char_class = false; |
428 | 2475 #ifdef MULE |
460 | 2476 re_bool has_extended_chars = false; |
428 | 2477 REGISTER Lisp_Object rtab = Qnil; |
2478 #endif | |
2479 | |
2480 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | |
2481 | |
2482 /* Ensure that we have enough space to push a charset: the | |
2483 opcode, the length count, and the bitset; 34 bytes in all. */ | |
2484 GET_BUFFER_SPACE (34); | |
2485 | |
446 | 2486 laststart = buf_end; |
428 | 2487 |
2488 /* We test `*p == '^' twice, instead of using an if | |
2489 statement, so we only need one BUF_PUSH. */ | |
2490 BUF_PUSH (*p == '^' ? charset_not : charset); | |
2491 if (*p == '^') | |
2492 p++; | |
2493 | |
2494 /* Remember the first position in the bracket expression. */ | |
2495 p1 = p; | |
2496 | |
2497 /* Push the number of bytes in the bitmap. */ | |
2498 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH); | |
2499 | |
2500 /* Clear the whole map. */ | |
446 | 2501 memset (buf_end, 0, (1 << BYTEWIDTH) / BYTEWIDTH); |
428 | 2502 |
2503 /* charset_not matches newline according to a syntax bit. */ | |
446 | 2504 if ((re_opcode_t) buf_end[-2] == charset_not |
428 | 2505 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) |
2506 SET_LIST_BIT ('\n'); | |
2507 | |
2508 #ifdef MULE | |
2509 start_over_with_extended: | |
2510 if (has_extended_chars) | |
2511 { | |
2512 /* There are extended chars here, which means we need to start | |
2513 over and shift to unified range-table format. */ | |
446 | 2514 if (buf_end[-2] == charset) |
2515 buf_end[-2] = charset_mule; | |
428 | 2516 else |
446 | 2517 buf_end[-2] = charset_mule_not; |
2518 buf_end--; | |
428 | 2519 p = p1; /* go back to the beginning of the charset, after |
2520 a possible ^. */ | |
2521 rtab = Vthe_lisp_rangetab; | |
2522 Fclear_range_table (rtab); | |
2523 | |
2524 /* charset_not matches newline according to a syntax bit. */ | |
446 | 2525 if ((re_opcode_t) buf_end[-1] == charset_mule_not |
428 | 2526 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) |
2527 SET_EITHER_BIT ('\n'); | |
2528 } | |
2529 #endif /* MULE */ | |
2530 | |
2531 /* Read in characters and ranges, setting map bits. */ | |
2532 for (;;) | |
2533 { | |
2534 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | |
2535 | |
446 | 2536 PATFETCH (c); |
428 | 2537 |
2538 #ifdef MULE | |
2539 if (c >= 0x80 && !has_extended_chars) | |
2540 { | |
2541 has_extended_chars = 1; | |
2542 /* Frumble-bumble, we've found some extended chars. | |
2543 Need to start over, process everything using | |
2544 the general extended-char mechanism, and need | |
2545 to use charset_mule and charset_mule_not instead | |
2546 of charset and charset_not. */ | |
2547 goto start_over_with_extended; | |
2548 } | |
2549 #endif /* MULE */ | |
2550 /* \ might escape characters inside [...] and [^...]. */ | |
2551 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') | |
2552 { | |
2553 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); | |
2554 | |
446 | 2555 PATFETCH (c1); |
428 | 2556 #ifdef MULE |
2557 if (c1 >= 0x80 && !has_extended_chars) | |
2558 { | |
2559 has_extended_chars = 1; | |
2560 goto start_over_with_extended; | |
2561 } | |
2562 #endif /* MULE */ | |
2563 SET_EITHER_BIT (c1); | |
2564 continue; | |
2565 } | |
2566 | |
2567 /* Could be the end of the bracket expression. If it's | |
2568 not (i.e., when the bracket expression is `[]' so | |
2569 far), the ']' character bit gets set way below. */ | |
2570 if (c == ']' && p != p1 + 1) | |
2571 break; | |
2572 | |
2573 /* Look ahead to see if it's a range when the last thing | |
2574 was a character class. */ | |
2575 if (had_char_class && c == '-' && *p != ']') | |
2576 FREE_STACK_RETURN (REG_ERANGE); | |
2577 | |
2578 /* Look ahead to see if it's a range when the last thing | |
2579 was a character: if this is a hyphen not at the | |
2580 beginning or the end of a list, then it's the range | |
2581 operator. */ | |
2582 if (c == '-' | |
2583 && !(p - 2 >= pattern && p[-2] == '[') | |
446 | 2584 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') |
428 | 2585 && *p != ']') |
2586 { | |
2587 reg_errcode_t ret; | |
2588 | |
2589 #ifdef MULE | |
2590 if (* (unsigned char *) p >= 0x80 && !has_extended_chars) | |
2591 { | |
2592 has_extended_chars = 1; | |
2593 goto start_over_with_extended; | |
2594 } | |
2595 if (has_extended_chars) | |
2596 ret = compile_extended_range (&p, pend, translate, | |
2597 syntax, rtab); | |
2598 else | |
2599 #endif /* MULE */ | |
446 | 2600 ret = compile_range (&p, pend, translate, syntax, buf_end); |
428 | 2601 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); |
2602 } | |
2603 | |
2604 else if (p[0] == '-' && p[1] != ']') | |
2605 { /* This handles ranges made up of characters only. */ | |
2606 reg_errcode_t ret; | |
2607 | |
2608 /* Move past the `-'. */ | |
2609 PATFETCH (c1); | |
2610 | |
2611 #ifdef MULE | |
2612 if (* (unsigned char *) p >= 0x80 && !has_extended_chars) | |
2613 { | |
2614 has_extended_chars = 1; | |
2615 goto start_over_with_extended; | |
2616 } | |
2617 if (has_extended_chars) | |
2618 ret = compile_extended_range (&p, pend, translate, | |
2619 syntax, rtab); | |
2620 else | |
2621 #endif /* MULE */ | |
446 | 2622 ret = compile_range (&p, pend, translate, syntax, buf_end); |
428 | 2623 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); |
2624 } | |
2625 | |
2626 /* See if we're at the beginning of a possible character | |
2627 class. */ | |
2628 | |
2629 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') | |
2630 { /* Leave room for the null. */ | |
2631 char str[CHAR_CLASS_MAX_LENGTH + 1]; | |
2632 | |
2633 PATFETCH (c); | |
2634 c1 = 0; | |
2635 | |
2636 /* If pattern is `[[:'. */ | |
2637 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | |
2638 | |
2639 for (;;) | |
2640 { | |
446 | 2641 /* #### This code is unused. |
2642 Correctness is not checked after TRT | |
2643 table change. */ | |
428 | 2644 PATFETCH (c); |
2645 if (c == ':' || c == ']' || p == pend | |
2646 || c1 == CHAR_CLASS_MAX_LENGTH) | |
2647 break; | |
442 | 2648 str[c1++] = (char) c; |
428 | 2649 } |
2650 str[c1] = '\0'; | |
2651 | |
446 | 2652 /* If it isn't a word bracketed by `[:' and `:]': |
428 | 2653 undo the ending character, the letters, and leave |
2654 the leading `:' and `[' (but set bits for them). */ | |
2655 if (c == ':' && *p == ']') | |
2656 { | |
2657 int ch; | |
460 | 2658 re_bool is_alnum = STREQ (str, "alnum"); |
2659 re_bool is_alpha = STREQ (str, "alpha"); | |
2660 re_bool is_blank = STREQ (str, "blank"); | |
2661 re_bool is_cntrl = STREQ (str, "cntrl"); | |
2662 re_bool is_digit = STREQ (str, "digit"); | |
2663 re_bool is_graph = STREQ (str, "graph"); | |
2664 re_bool is_lower = STREQ (str, "lower"); | |
2665 re_bool is_print = STREQ (str, "print"); | |
2666 re_bool is_punct = STREQ (str, "punct"); | |
2667 re_bool is_space = STREQ (str, "space"); | |
2668 re_bool is_upper = STREQ (str, "upper"); | |
2669 re_bool is_xdigit = STREQ (str, "xdigit"); | |
428 | 2670 |
2671 if (!IS_CHAR_CLASS (str)) | |
2672 FREE_STACK_RETURN (REG_ECTYPE); | |
2673 | |
2674 /* Throw away the ] at the end of the character | |
2675 class. */ | |
2676 PATFETCH (c); | |
2677 | |
2678 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | |
2679 | |
2680 for (ch = 0; ch < 1 << BYTEWIDTH; ch++) | |
2681 { | |
2682 /* This was split into 3 if's to | |
2683 avoid an arbitrary limit in some compiler. */ | |
2684 if ( (is_alnum && ISALNUM (ch)) | |
2685 || (is_alpha && ISALPHA (ch)) | |
2686 || (is_blank && ISBLANK (ch)) | |
2687 || (is_cntrl && ISCNTRL (ch))) | |
2688 SET_EITHER_BIT (ch); | |
2689 if ( (is_digit && ISDIGIT (ch)) | |
2690 || (is_graph && ISGRAPH (ch)) | |
2691 || (is_lower && ISLOWER (ch)) | |
2692 || (is_print && ISPRINT (ch))) | |
2693 SET_EITHER_BIT (ch); | |
2694 if ( (is_punct && ISPUNCT (ch)) | |
2695 || (is_space && ISSPACE (ch)) | |
2696 || (is_upper && ISUPPER (ch)) | |
2697 || (is_xdigit && ISXDIGIT (ch))) | |
2698 SET_EITHER_BIT (ch); | |
2699 } | |
2700 had_char_class = true; | |
2701 } | |
2702 else | |
2703 { | |
2704 c1++; | |
2705 while (c1--) | |
2706 PATUNFETCH; | |
2707 SET_EITHER_BIT ('['); | |
2708 SET_EITHER_BIT (':'); | |
2709 had_char_class = false; | |
2710 } | |
2711 } | |
2712 else | |
2713 { | |
2714 had_char_class = false; | |
2715 SET_EITHER_BIT (c); | |
2716 } | |
2717 } | |
2718 | |
2719 #ifdef MULE | |
2720 if (has_extended_chars) | |
2721 { | |
2722 /* We have a range table, not a bit vector. */ | |
2723 int bytes_needed = | |
2724 unified_range_table_bytes_needed (rtab); | |
2725 GET_BUFFER_SPACE (bytes_needed); | |
446 | 2726 unified_range_table_copy_data (rtab, buf_end); |
2727 buf_end += unified_range_table_bytes_used (buf_end); | |
428 | 2728 break; |
2729 } | |
2730 #endif /* MULE */ | |
2731 /* Discard any (non)matching list bytes that are all 0 at the | |
2732 end of the map. Decrease the map-length byte too. */ | |
446 | 2733 while ((int) buf_end[-1] > 0 && buf_end[buf_end[-1] - 1] == 0) |
2734 buf_end[-1]--; | |
2735 buf_end += buf_end[-1]; | |
428 | 2736 } |
2737 break; | |
2738 | |
2739 | |
2740 case '(': | |
2741 if (syntax & RE_NO_BK_PARENS) | |
2742 goto handle_open; | |
2743 else | |
2744 goto normal_char; | |
2745 | |
2746 | |
2747 case ')': | |
2748 if (syntax & RE_NO_BK_PARENS) | |
2749 goto handle_close; | |
2750 else | |
2751 goto normal_char; | |
2752 | |
2753 | |
2754 case '\n': | |
2755 if (syntax & RE_NEWLINE_ALT) | |
2756 goto handle_alt; | |
2757 else | |
2758 goto normal_char; | |
2759 | |
2760 | |
2761 case '|': | |
2762 if (syntax & RE_NO_BK_VBAR) | |
2763 goto handle_alt; | |
2764 else | |
2765 goto normal_char; | |
2766 | |
2767 | |
2768 case '{': | |
2769 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES) | |
2770 goto handle_interval; | |
2771 else | |
2772 goto normal_char; | |
2773 | |
2774 | |
2775 case '\\': | |
2776 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); | |
2777 | |
2778 /* Do not translate the character after the \, so that we can | |
2779 distinguish, e.g., \B from \b, even if we normally would | |
2780 translate, e.g., B to b. */ | |
2781 PATFETCH_RAW (c); | |
2782 | |
2783 switch (c) | |
2784 { | |
2785 case '(': | |
2786 if (syntax & RE_NO_BK_PARENS) | |
2787 goto normal_backslash; | |
2788 | |
2789 handle_open: | |
2790 { | |
2791 regnum_t r; | |
502 | 2792 int shy = 0; |
428 | 2793 |
2794 if (!(syntax & RE_NO_SHY_GROUPS) | |
2795 && p != pend | |
446 | 2796 && *p == '?') |
428 | 2797 { |
2798 p++; | |
446 | 2799 PATFETCH (c); |
428 | 2800 switch (c) |
2801 { | |
2802 case ':': /* shy groups */ | |
502 | 2803 shy = 1; |
428 | 2804 break; |
2805 | |
2806 /* All others are reserved for future constructs. */ | |
2807 default: | |
2808 FREE_STACK_RETURN (REG_BADPAT); | |
2809 } | |
2810 } | |
502 | 2811 |
2812 r = ++regnum; | |
2813 bufp->re_ngroups++; | |
2814 if (!shy) | |
2815 { | |
2816 bufp->re_nsub++; | |
2817 while (bufp->external_to_internal_register_size <= | |
2818 bufp->re_nsub) | |
2819 { | |
2820 int i; | |
2821 int old_size = | |
2822 bufp->external_to_internal_register_size; | |
2823 bufp->external_to_internal_register_size += 5; | |
2824 RETALLOC (bufp->external_to_internal_register, | |
2825 bufp->external_to_internal_register_size, | |
2826 int); | |
2827 /* debugging */ | |
2828 for (i = old_size; | |
2829 i < bufp->external_to_internal_register_size; i++) | |
2830 bufp->external_to_internal_register[i] = | |
2831 (int) 0xDEADBEEF; | |
2832 } | |
2833 | |
2834 bufp->external_to_internal_register[bufp->re_nsub] = | |
2835 bufp->re_ngroups; | |
2836 } | |
428 | 2837 |
2838 if (COMPILE_STACK_FULL) | |
2839 { | |
2840 RETALLOC (compile_stack.stack, compile_stack.size << 1, | |
2841 compile_stack_elt_t); | |
2842 if (compile_stack.stack == NULL) return REG_ESPACE; | |
2843 | |
2844 compile_stack.size <<= 1; | |
2845 } | |
2846 | |
2847 /* These are the values to restore when we hit end of this | |
2848 group. They are all relative offsets, so that if the | |
2849 whole pattern moves because of realloc, they will still | |
2850 be valid. */ | |
2851 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer; | |
2852 COMPILE_STACK_TOP.fixup_alt_jump | |
2853 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; | |
446 | 2854 COMPILE_STACK_TOP.laststart_offset = buf_end - bufp->buffer; |
428 | 2855 COMPILE_STACK_TOP.regnum = r; |
2856 | |
2857 /* We will eventually replace the 0 with the number of | |
2858 groups inner to this one. But do not push a | |
2859 start_memory for groups beyond the last one we can | |
502 | 2860 represent in the compiled pattern. |
2861 #### bad bad bad. this will fail in lots of ways, if we | |
2862 ever have to backtrack for these groups. | |
2863 */ | |
428 | 2864 if (r <= MAX_REGNUM) |
2865 { | |
2866 COMPILE_STACK_TOP.inner_group_offset | |
446 | 2867 = buf_end - bufp->buffer + 2; |
428 | 2868 BUF_PUSH_3 (start_memory, r, 0); |
2869 } | |
2870 | |
2871 compile_stack.avail++; | |
2872 | |
2873 fixup_alt_jump = 0; | |
2874 laststart = 0; | |
446 | 2875 begalt = buf_end; |
428 | 2876 /* If we've reached MAX_REGNUM groups, then this open |
2877 won't actually generate any code, so we'll have to | |
2878 clear pending_exact explicitly. */ | |
2879 pending_exact = 0; | |
2880 } | |
2881 break; | |
2882 | |
2883 | |
2884 case ')': | |
2885 if (syntax & RE_NO_BK_PARENS) goto normal_backslash; | |
2886 | |
2887 if (COMPILE_STACK_EMPTY) { | |
2888 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) | |
2889 goto normal_backslash; | |
2890 else | |
2891 FREE_STACK_RETURN (REG_ERPAREN); | |
2892 } | |
2893 | |
2894 handle_close: | |
2895 if (fixup_alt_jump) | |
2896 { /* Push a dummy failure point at the end of the | |
2897 alternative for a possible future | |
2898 `pop_failure_jump' to pop. See comments at | |
2899 `push_dummy_failure' in `re_match_2'. */ | |
2900 BUF_PUSH (push_dummy_failure); | |
2901 | |
2902 /* We allocated space for this jump when we assigned | |
2903 to `fixup_alt_jump', in the `handle_alt' case below. */ | |
446 | 2904 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end - 1); |
428 | 2905 } |
2906 | |
2907 /* See similar code for backslashed left paren above. */ | |
2908 if (COMPILE_STACK_EMPTY) { | |
2909 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) | |
2910 goto normal_char; | |
2911 else | |
2912 FREE_STACK_RETURN (REG_ERPAREN); | |
2913 } | |
2914 | |
2915 /* Since we just checked for an empty stack above, this | |
2916 ``can't happen''. */ | |
2917 assert (compile_stack.avail != 0); | |
2918 { | |
2919 /* We don't just want to restore into `regnum', because | |
2920 later groups should continue to be numbered higher, | |
2921 as in `(ab)c(de)' -- the second group is #2. */ | |
2922 regnum_t this_group_regnum; | |
2923 | |
2924 compile_stack.avail--; | |
2925 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset; | |
2926 fixup_alt_jump | |
2927 = COMPILE_STACK_TOP.fixup_alt_jump | |
2928 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1 | |
2929 : 0; | |
2930 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset; | |
2931 this_group_regnum = COMPILE_STACK_TOP.regnum; | |
2932 /* If we've reached MAX_REGNUM groups, then this open | |
2933 won't actually generate any code, so we'll have to | |
2934 clear pending_exact explicitly. */ | |
2935 pending_exact = 0; | |
2936 | |
2937 /* We're at the end of the group, so now we know how many | |
2938 groups were inside this one. */ | |
2939 if (this_group_regnum <= MAX_REGNUM) | |
2940 { | |
2941 unsigned char *inner_group_loc | |
2942 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset; | |
2943 | |
2944 *inner_group_loc = regnum - this_group_regnum; | |
2945 BUF_PUSH_3 (stop_memory, this_group_regnum, | |
2946 regnum - this_group_regnum); | |
2947 } | |
2948 } | |
2949 break; | |
2950 | |
2951 | |
2952 case '|': /* `\|'. */ | |
2953 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR) | |
2954 goto normal_backslash; | |
2955 handle_alt: | |
2956 if (syntax & RE_LIMITED_OPS) | |
2957 goto normal_char; | |
2958 | |
2959 /* Insert before the previous alternative a jump which | |
2960 jumps to this alternative if the former fails. */ | |
2961 GET_BUFFER_SPACE (3); | |
446 | 2962 INSERT_JUMP (on_failure_jump, begalt, buf_end + 6); |
428 | 2963 pending_exact = 0; |
446 | 2964 buf_end += 3; |
428 | 2965 |
2966 /* The alternative before this one has a jump after it | |
2967 which gets executed if it gets matched. Adjust that | |
2968 jump so it will jump to this alternative's analogous | |
2969 jump (put in below, which in turn will jump to the next | |
2970 (if any) alternative's such jump, etc.). The last such | |
2971 jump jumps to the correct final destination. A picture: | |
2972 _____ _____ | |
2973 | | | | | |
2974 | v | v | |
2975 a | b | c | |
2976 | |
2977 If we are at `b', then fixup_alt_jump right now points to a | |
2978 three-byte space after `a'. We'll put in the jump, set | |
2979 fixup_alt_jump to right after `b', and leave behind three | |
2980 bytes which we'll fill in when we get to after `c'. */ | |
2981 | |
2982 if (fixup_alt_jump) | |
446 | 2983 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end); |
428 | 2984 |
2985 /* Mark and leave space for a jump after this alternative, | |
2986 to be filled in later either by next alternative or | |
2987 when we know we're at the end of a series of alternatives. */ | |
446 | 2988 fixup_alt_jump = buf_end; |
428 | 2989 GET_BUFFER_SPACE (3); |
446 | 2990 buf_end += 3; |
428 | 2991 |
2992 laststart = 0; | |
446 | 2993 begalt = buf_end; |
428 | 2994 break; |
2995 | |
2996 | |
2997 case '{': | |
2998 /* If \{ is a literal. */ | |
2999 if (!(syntax & RE_INTERVALS) | |
3000 /* If we're at `\{' and it's not the open-interval | |
3001 operator. */ | |
3002 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) | |
3003 || (p - 2 == pattern && p == pend)) | |
3004 goto normal_backslash; | |
3005 | |
3006 handle_interval: | |
3007 { | |
3008 /* If got here, then the syntax allows intervals. */ | |
3009 | |
3010 /* At least (most) this many matches must be made. */ | |
3011 int lower_bound = -1, upper_bound = -1; | |
3012 | |
3013 beg_interval = p - 1; | |
3014 | |
3015 if (p == pend) | |
3016 { | |
3017 if (syntax & RE_NO_BK_BRACES) | |
3018 goto unfetch_interval; | |
3019 else | |
3020 FREE_STACK_RETURN (REG_EBRACE); | |
3021 } | |
3022 | |
3023 GET_UNSIGNED_NUMBER (lower_bound); | |
3024 | |
3025 if (c == ',') | |
3026 { | |
3027 GET_UNSIGNED_NUMBER (upper_bound); | |
3028 if (upper_bound < 0) upper_bound = RE_DUP_MAX; | |
3029 } | |
3030 else | |
3031 /* Interval such as `{1}' => match exactly once. */ | |
3032 upper_bound = lower_bound; | |
3033 | |
3034 if (lower_bound < 0 || upper_bound > RE_DUP_MAX | |
3035 || lower_bound > upper_bound) | |
3036 { | |
3037 if (syntax & RE_NO_BK_BRACES) | |
3038 goto unfetch_interval; | |
3039 else | |
3040 FREE_STACK_RETURN (REG_BADBR); | |
3041 } | |
3042 | |
3043 if (!(syntax & RE_NO_BK_BRACES)) | |
3044 { | |
3045 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE); | |
3046 | |
3047 PATFETCH (c); | |
3048 } | |
3049 | |
3050 if (c != '}') | |
3051 { | |
3052 if (syntax & RE_NO_BK_BRACES) | |
3053 goto unfetch_interval; | |
3054 else | |
3055 FREE_STACK_RETURN (REG_BADBR); | |
3056 } | |
3057 | |
3058 /* We just parsed a valid interval. */ | |
3059 | |
3060 /* If it's invalid to have no preceding re. */ | |
3061 if (!laststart) | |
3062 { | |
3063 if (syntax & RE_CONTEXT_INVALID_OPS) | |
3064 FREE_STACK_RETURN (REG_BADRPT); | |
3065 else if (syntax & RE_CONTEXT_INDEP_OPS) | |
446 | 3066 laststart = buf_end; |
428 | 3067 else |
3068 goto unfetch_interval; | |
3069 } | |
3070 | |
3071 /* If the upper bound is zero, don't want to succeed at | |
3072 all; jump from `laststart' to `buf_end + 3', which will be | |
3073 the end of the buffer after we insert the jump. */ | |
3074 if (upper_bound == 0) | |
3075 { | |
3076 GET_BUFFER_SPACE (3); | |
446 | 3077 INSERT_JUMP (jump, laststart, buf_end + 3); |
3078 buf_end += 3; | |
428 | 3079 } |
3080 | |
3081 /* Otherwise, we have a nontrivial interval. When | |
3082 we're all done, the pattern will look like: | |
3083 set_number_at <jump count> <upper bound> | |
3084 set_number_at <succeed_n count> <lower bound> | |
3085 succeed_n <after jump addr> <succeed_n count> | |
3086 <body of loop> | |
3087 jump_n <succeed_n addr> <jump count> | |
3088 (The upper bound and `jump_n' are omitted if | |
3089 `upper_bound' is 1, though.) */ | |
3090 else | |
3091 { /* If the upper bound is > 1, we need to insert | |
3092 more at the end of the loop. */ | |
647 | 3093 int nbytes = 10 + (upper_bound > 1) * 10; |
428 | 3094 |
3095 GET_BUFFER_SPACE (nbytes); | |
3096 | |
3097 /* Initialize lower bound of the `succeed_n', even | |
3098 though it will be set during matching by its | |
3099 attendant `set_number_at' (inserted next), | |
3100 because `re_compile_fastmap' needs to know. | |
3101 Jump to the `jump_n' we might insert below. */ | |
3102 INSERT_JUMP2 (succeed_n, laststart, | |
446 | 3103 buf_end + 5 + (upper_bound > 1) * 5, |
428 | 3104 lower_bound); |
446 | 3105 buf_end += 5; |
428 | 3106 |
3107 /* Code to initialize the lower bound. Insert | |
3108 before the `succeed_n'. The `5' is the last two | |
3109 bytes of this `set_number_at', plus 3 bytes of | |
3110 the following `succeed_n'. */ | |
446 | 3111 insert_op2 (set_number_at, laststart, 5, lower_bound, buf_end); |
3112 buf_end += 5; | |
428 | 3113 |
3114 if (upper_bound > 1) | |
3115 { /* More than one repetition is allowed, so | |
3116 append a backward jump to the `succeed_n' | |
3117 that starts this interval. | |
3118 | |
3119 When we've reached this during matching, | |
3120 we'll have matched the interval once, so | |
3121 jump back only `upper_bound - 1' times. */ | |
446 | 3122 STORE_JUMP2 (jump_n, buf_end, laststart + 5, |
428 | 3123 upper_bound - 1); |
446 | 3124 buf_end += 5; |
428 | 3125 |
3126 /* The location we want to set is the second | |
3127 parameter of the `jump_n'; that is `buf_end - 2' as | |
3128 an absolute address. `laststart' will be | |
3129 the `set_number_at' we're about to insert; | |
3130 `laststart+3' the number to set, the source | |
3131 for the relative address. But we are | |
3132 inserting into the middle of the pattern -- | |
3133 so everything is getting moved up by 5. | |
3134 Conclusion: (buf_end - 2) - (laststart + 3) + 5, | |
3135 i.e., buf_end - laststart. | |
3136 | |
3137 We insert this at the beginning of the loop | |
3138 so that if we fail during matching, we'll | |
3139 reinitialize the bounds. */ | |
446 | 3140 insert_op2 (set_number_at, laststart, |
3141 buf_end - laststart, | |
3142 upper_bound - 1, buf_end); | |
3143 buf_end += 5; | |
428 | 3144 } |
3145 } | |
3146 pending_exact = 0; | |
3147 beg_interval = NULL; | |
3148 } | |
3149 break; | |
3150 | |
3151 unfetch_interval: | |
3152 /* If an invalid interval, match the characters as literals. */ | |
3153 assert (beg_interval); | |
3154 p = beg_interval; | |
3155 beg_interval = NULL; | |
3156 | |
3157 /* normal_char and normal_backslash need `c'. */ | |
3158 PATFETCH (c); | |
3159 | |
3160 if (!(syntax & RE_NO_BK_BRACES)) | |
3161 { | |
3162 if (p > pattern && p[-1] == '\\') | |
3163 goto normal_backslash; | |
3164 } | |
3165 goto normal_char; | |
3166 | |
3167 #ifdef emacs | |
3168 /* There is no way to specify the before_dot and after_dot | |
3169 operators. rms says this is ok. --karl */ | |
3170 case '=': | |
3171 BUF_PUSH (at_dot); | |
3172 break; | |
3173 | |
3174 case 's': | |
446 | 3175 laststart = buf_end; |
428 | 3176 PATFETCH (c); |
3177 /* XEmacs addition */ | |
3178 if (c >= 0x80 || syntax_spec_code[c] == 0377) | |
3179 FREE_STACK_RETURN (REG_ESYNTAX); | |
3180 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]); | |
3181 break; | |
3182 | |
3183 case 'S': | |
446 | 3184 laststart = buf_end; |
428 | 3185 PATFETCH (c); |
3186 /* XEmacs addition */ | |
3187 if (c >= 0x80 || syntax_spec_code[c] == 0377) | |
3188 FREE_STACK_RETURN (REG_ESYNTAX); | |
3189 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]); | |
3190 break; | |
3191 | |
3192 #ifdef MULE | |
3193 /* 97.2.17 jhod merged in to XEmacs from mule-2.3 */ | |
3194 case 'c': | |
446 | 3195 laststart = buf_end; |
428 | 3196 PATFETCH_RAW (c); |
3197 if (c < 32 || c > 127) | |
3198 FREE_STACK_RETURN (REG_ECATEGORY); | |
3199 BUF_PUSH_2 (categoryspec, c); | |
3200 break; | |
3201 | |
3202 case 'C': | |
446 | 3203 laststart = buf_end; |
428 | 3204 PATFETCH_RAW (c); |
3205 if (c < 32 || c > 127) | |
3206 FREE_STACK_RETURN (REG_ECATEGORY); | |
3207 BUF_PUSH_2 (notcategoryspec, c); | |
3208 break; | |
3209 /* end of category patch */ | |
3210 #endif /* MULE */ | |
3211 #endif /* emacs */ | |
3212 | |
3213 | |
3214 case 'w': | |
446 | 3215 laststart = buf_end; |
428 | 3216 BUF_PUSH (wordchar); |
3217 break; | |
3218 | |
3219 | |
3220 case 'W': | |
446 | 3221 laststart = buf_end; |
428 | 3222 BUF_PUSH (notwordchar); |
3223 break; | |
3224 | |
3225 | |
3226 case '<': | |
3227 BUF_PUSH (wordbeg); | |
3228 break; | |
3229 | |
3230 case '>': | |
3231 BUF_PUSH (wordend); | |
3232 break; | |
3233 | |
3234 case 'b': | |
3235 BUF_PUSH (wordbound); | |
3236 break; | |
3237 | |
3238 case 'B': | |
3239 BUF_PUSH (notwordbound); | |
3240 break; | |
3241 | |
3242 case '`': | |
3243 BUF_PUSH (begbuf); | |
3244 break; | |
3245 | |
3246 case '\'': | |
3247 BUF_PUSH (endbuf); | |
3248 break; | |
3249 | |
3250 case '1': case '2': case '3': case '4': case '5': | |
3251 case '6': case '7': case '8': case '9': | |
446 | 3252 { |
502 | 3253 regnum_t reg, regint; |
3254 int may_need_to_unfetch = 0; | |
446 | 3255 if (syntax & RE_NO_BK_REFS) |
3256 goto normal_char; | |
3257 | |
502 | 3258 /* This only goes up to 99. It could be extended to work |
3259 up to 255 (the maximum number of registers that can be | |
3260 handled by the current regexp engine, because it stores | |
3261 its register numbers in the compiled pattern as one byte, | |
3262 ugh). Doing that's a bit trickier, because you might | |
3263 have the case where \25 is a back-ref but \255 is not, ... */ | |
446 | 3264 reg = c - '0'; |
502 | 3265 if (p < pend) |
3266 { | |
3267 PATFETCH (c); | |
3268 if (c >= '0' && c <= '9') | |
3269 { | |
3270 regnum_t new_reg = reg * 10 + c - '0'; | |
3271 if (new_reg <= bufp->re_nsub) | |
3272 { | |
3273 reg = new_reg; | |
3274 may_need_to_unfetch = 1; | |
3275 } | |
3276 else | |
3277 PATUNFETCH; | |
3278 } | |
523 | 3279 else |
3280 PATUNFETCH; | |
502 | 3281 } |
3282 | |
3283 if (reg > bufp->re_nsub) | |
446 | 3284 FREE_STACK_RETURN (REG_ESUBREG); |
3285 | |
502 | 3286 regint = bufp->external_to_internal_register[reg]; |
446 | 3287 /* Can't back reference to a subexpression if inside of it. */ |
502 | 3288 if (group_in_compile_stack (compile_stack, regint)) |
3289 { | |
3290 if (may_need_to_unfetch) | |
3291 PATUNFETCH; | |
3292 goto normal_char; | |
3293 } | |
3294 | |
3295 #ifdef emacs | |
3296 if (reg > 9 && | |
3297 bufp->warned_about_incompatible_back_references == 0) | |
3298 { | |
3299 bufp->warned_about_incompatible_back_references = 1; | |
3300 warn_when_safe (intern ("regex"), Qinfo, | |
3301 "Back reference \\%d now has new " | |
3302 "semantics in %s", reg, pattern); | |
3303 } | |
3304 #endif | |
446 | 3305 |
3306 laststart = buf_end; | |
502 | 3307 BUF_PUSH_2 (duplicate, regint); |
446 | 3308 } |
428 | 3309 break; |
3310 | |
3311 | |
3312 case '+': | |
3313 case '?': | |
3314 if (syntax & RE_BK_PLUS_QM) | |
3315 goto handle_plus; | |
3316 else | |
3317 goto normal_backslash; | |
3318 | |
3319 default: | |
3320 normal_backslash: | |
3321 /* You might think it would be useful for \ to mean | |
3322 not to translate; but if we don't translate it, | |
3323 it will never match anything. */ | |
826 | 3324 c = RE_TRANSLATE (c); |
428 | 3325 goto normal_char; |
3326 } | |
3327 break; | |
3328 | |
3329 | |
3330 default: | |
3331 /* Expects the character in `c'. */ | |
3332 /* `p' points to the location after where `c' came from. */ | |
3333 normal_char: | |
3334 { | |
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3335 /* The following conditional synced to GNU Emacs 22.1. */ |
428 | 3336 /* If no exactn currently being built. */ |
3337 if (!pending_exact | |
3338 | |
3339 /* If last exactn not at current position. */ | |
446 | 3340 || pending_exact + *pending_exact + 1 != buf_end |
428 | 3341 |
3342 /* We have only one byte following the exactn for the count. */ | |
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3343 || *pending_exact >= (1 << BYTEWIDTH) - MAX_ICHAR_LEN |
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3344 |
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3345 /* If followed by a repetition operator. |
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3346 If the lookahead fails because of end of pattern, any |
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3347 trailing backslash will get caught later. */ |
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3348 || (p != pend && (*p == '*' || *p == '^')) |
428 | 3349 || ((syntax & RE_BK_PLUS_QM) |
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3350 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?') |
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3351 : p != pend && (*p == '+' || *p == '?')) |
428 | 3352 || ((syntax & RE_INTERVALS) |
3353 && ((syntax & RE_NO_BK_BRACES) | |
4750
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3354 ? p != pend && *p == '{' |
b5f21bb36684
Fix crash in regex.c (closes issue630).
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4527
diff
changeset
|
3355 : p + 1 < pend && (p[0] == '\\' && p[1] == '{')))) |
428 | 3356 { |
3357 /* Start building a new exactn. */ | |
3358 | |
446 | 3359 laststart = buf_end; |
428 | 3360 |
3361 BUF_PUSH_2 (exactn, 0); | |
446 | 3362 pending_exact = buf_end - 1; |
428 | 3363 } |
3364 | |
446 | 3365 #ifndef MULE |
428 | 3366 BUF_PUSH (c); |
3367 (*pending_exact)++; | |
446 | 3368 #else |
3369 { | |
3370 Bytecount bt_count; | |
867 | 3371 Ibyte tmp_buf[MAX_ICHAR_LEN]; |
446 | 3372 int i; |
3373 | |
867 | 3374 bt_count = set_itext_ichar (tmp_buf, c); |
446 | 3375 |
3376 for (i = 0; i < bt_count; i++) | |
3377 { | |
3378 BUF_PUSH (tmp_buf[i]); | |
3379 (*pending_exact)++; | |
3380 } | |
3381 } | |
3382 #endif | |
428 | 3383 break; |
3384 } | |
3385 } /* switch (c) */ | |
3386 } /* while p != pend */ | |
3387 | |
3388 | |
3389 /* Through the pattern now. */ | |
3390 | |
3391 if (fixup_alt_jump) | |
446 | 3392 STORE_JUMP (jump_past_alt, fixup_alt_jump, buf_end); |
428 | 3393 |
3394 if (!COMPILE_STACK_EMPTY) | |
3395 FREE_STACK_RETURN (REG_EPAREN); | |
3396 | |
3397 /* If we don't want backtracking, force success | |
3398 the first time we reach the end of the compiled pattern. */ | |
3399 if (syntax & RE_NO_POSIX_BACKTRACKING) | |
3400 BUF_PUSH (succeed); | |
3401 | |
4976
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4832
diff
changeset
|
3402 xfree (compile_stack.stack); |
428 | 3403 |
3404 /* We have succeeded; set the length of the buffer. */ | |
446 | 3405 bufp->used = buf_end - bufp->buffer; |
428 | 3406 |
3407 #ifdef DEBUG | |
3408 if (debug) | |
3409 { | |
3410 DEBUG_PRINT1 ("\nCompiled pattern: \n"); | |
3411 print_compiled_pattern (bufp); | |
3412 } | |
3413 #endif /* DEBUG */ | |
3414 | |
3415 #ifndef MATCH_MAY_ALLOCATE | |
3416 /* Initialize the failure stack to the largest possible stack. This | |
3417 isn't necessary unless we're trying to avoid calling alloca in | |
3418 the search and match routines. */ | |
3419 { | |
502 | 3420 int num_regs = bufp->re_ngroups + 1; |
428 | 3421 |
3422 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size | |
3423 is strictly greater than re_max_failures, the largest possible stack | |
3424 is 2 * re_max_failures failure points. */ | |
3425 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS)) | |
3426 { | |
3427 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS); | |
3428 | |
3429 if (! fail_stack.stack) | |
3430 fail_stack.stack | |
3431 = (fail_stack_elt_t *) xmalloc (fail_stack.size | |
3432 * sizeof (fail_stack_elt_t)); | |
3433 else | |
3434 fail_stack.stack | |
3435 = (fail_stack_elt_t *) xrealloc (fail_stack.stack, | |
3436 (fail_stack.size | |
3437 * sizeof (fail_stack_elt_t))); | |
3438 } | |
3439 | |
3440 regex_grow_registers (num_regs); | |
3441 } | |
3442 #endif /* not MATCH_MAY_ALLOCATE */ | |
3443 | |
3444 return REG_NOERROR; | |
3445 } /* regex_compile */ | |
3446 | |
3447 /* Subroutines for `regex_compile'. */ | |
3448 | |
3449 /* Store OP at LOC followed by two-byte integer parameter ARG. */ | |
3450 | |
3451 static void | |
3452 store_op1 (re_opcode_t op, unsigned char *loc, int arg) | |
3453 { | |
3454 *loc = (unsigned char) op; | |
3455 STORE_NUMBER (loc + 1, arg); | |
3456 } | |
3457 | |
3458 | |
3459 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */ | |
3460 | |
3461 static void | |
3462 store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2) | |
3463 { | |
3464 *loc = (unsigned char) op; | |
3465 STORE_NUMBER (loc + 1, arg1); | |
3466 STORE_NUMBER (loc + 3, arg2); | |
3467 } | |
3468 | |
3469 | |
3470 /* Copy the bytes from LOC to END to open up three bytes of space at LOC | |
3471 for OP followed by two-byte integer parameter ARG. */ | |
3472 | |
3473 static void | |
3474 insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end) | |
3475 { | |
3476 REGISTER unsigned char *pfrom = end; | |
3477 REGISTER unsigned char *pto = end + 3; | |
3478 | |
3479 while (pfrom != loc) | |
3480 *--pto = *--pfrom; | |
3481 | |
3482 store_op1 (op, loc, arg); | |
3483 } | |
3484 | |
3485 | |
3486 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */ | |
3487 | |
3488 static void | |
3489 insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, | |
3490 unsigned char *end) | |
3491 { | |
3492 REGISTER unsigned char *pfrom = end; | |
3493 REGISTER unsigned char *pto = end + 5; | |
3494 | |
3495 while (pfrom != loc) | |
3496 *--pto = *--pfrom; | |
3497 | |
3498 store_op2 (op, loc, arg1, arg2); | |
3499 } | |
3500 | |
3501 | |
3502 /* P points to just after a ^ in PATTERN. Return true if that ^ comes | |
3503 after an alternative or a begin-subexpression. We assume there is at | |
3504 least one character before the ^. */ | |
3505 | |
460 | 3506 static re_bool |
446 | 3507 at_begline_loc_p (re_char *pattern, re_char *p, reg_syntax_t syntax) |
428 | 3508 { |
446 | 3509 re_char *prev = p - 2; |
460 | 3510 re_bool prev_prev_backslash = prev > pattern && prev[-1] == '\\'; |
428 | 3511 |
3512 return | |
3513 /* After a subexpression? */ | |
3514 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) | |
3515 /* After an alternative? */ | |
3516 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash)); | |
3517 } | |
3518 | |
3519 | |
3520 /* The dual of at_begline_loc_p. This one is for $. We assume there is | |
3521 at least one character after the $, i.e., `P < PEND'. */ | |
3522 | |
460 | 3523 static re_bool |
446 | 3524 at_endline_loc_p (re_char *p, re_char *pend, int syntax) |
428 | 3525 { |
446 | 3526 re_char *next = p; |
460 | 3527 re_bool next_backslash = *next == '\\'; |
446 | 3528 re_char *next_next = p + 1 < pend ? p + 1 : 0; |
428 | 3529 |
3530 return | |
3531 /* Before a subexpression? */ | |
3532 (syntax & RE_NO_BK_PARENS ? *next == ')' | |
3533 : next_backslash && next_next && *next_next == ')') | |
3534 /* Before an alternative? */ | |
3535 || (syntax & RE_NO_BK_VBAR ? *next == '|' | |
3536 : next_backslash && next_next && *next_next == '|'); | |
3537 } | |
3538 | |
3539 | |
3540 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and | |
3541 false if it's not. */ | |
3542 | |
460 | 3543 static re_bool |
428 | 3544 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum) |
3545 { | |
3546 int this_element; | |
3547 | |
3548 for (this_element = compile_stack.avail - 1; | |
3549 this_element >= 0; | |
3550 this_element--) | |
3551 if (compile_stack.stack[this_element].regnum == regnum) | |
3552 return true; | |
3553 | |
3554 return false; | |
3555 } | |
3556 | |
3557 | |
3558 /* Read the ending character of a range (in a bracket expression) from the | |
3559 uncompiled pattern *P_PTR (which ends at PEND). We assume the | |
3560 starting character is in `P[-2]'. (`P[-1]' is the character `-'.) | |
3561 Then we set the translation of all bits between the starting and | |
3562 ending characters (inclusive) in the compiled pattern B. | |
3563 | |
3564 Return an error code. | |
3565 | |
3566 We use these short variable names so we can use the same macros as | |
826 | 3567 `regex_compile' itself. |
3568 | |
3569 Under Mule, this is only called when both chars of the range are | |
3570 ASCII. */ | |
428 | 3571 |
3572 static reg_errcode_t | |
446 | 3573 compile_range (re_char **p_ptr, re_char *pend, RE_TRANSLATE_TYPE translate, |
3574 reg_syntax_t syntax, unsigned char *buf_end) | |
428 | 3575 { |
867 | 3576 Ichar this_char; |
428 | 3577 |
446 | 3578 re_char *p = *p_ptr; |
428 | 3579 int range_start, range_end; |
3580 | |
3581 if (p == pend) | |
3582 return REG_ERANGE; | |
3583 | |
3584 /* Even though the pattern is a signed `char *', we need to fetch | |
3585 with unsigned char *'s; if the high bit of the pattern character | |
3586 is set, the range endpoints will be negative if we fetch using a | |
3587 signed char *. | |
3588 | |
3589 We also want to fetch the endpoints without translating them; the | |
3590 appropriate translation is done in the bit-setting loop below. */ | |
442 | 3591 /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */ |
3592 range_start = ((const unsigned char *) p)[-2]; | |
3593 range_end = ((const unsigned char *) p)[0]; | |
428 | 3594 |
3595 /* Have to increment the pointer into the pattern string, so the | |
3596 caller isn't still at the ending character. */ | |
3597 (*p_ptr)++; | |
3598 | |
3599 /* If the start is after the end, the range is empty. */ | |
3600 if (range_start > range_end) | |
3601 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; | |
3602 | |
3603 /* Here we see why `this_char' has to be larger than an `unsigned | |
3604 char' -- the range is inclusive, so if `range_end' == 0xff | |
3605 (assuming 8-bit characters), we would otherwise go into an infinite | |
3606 loop, since all characters <= 0xff. */ | |
3607 for (this_char = range_start; this_char <= range_end; this_char++) | |
3608 { | |
826 | 3609 SET_LIST_BIT (RE_TRANSLATE (this_char)); |
428 | 3610 } |
3611 | |
3612 return REG_NOERROR; | |
3613 } | |
3614 | |
3615 #ifdef MULE | |
3616 | |
3617 static reg_errcode_t | |
446 | 3618 compile_extended_range (re_char **p_ptr, re_char *pend, |
3619 RE_TRANSLATE_TYPE translate, | |
428 | 3620 reg_syntax_t syntax, Lisp_Object rtab) |
3621 { | |
867 | 3622 Ichar this_char, range_start, range_end; |
3623 const Ibyte *p; | |
428 | 3624 |
3625 if (*p_ptr == pend) | |
3626 return REG_ERANGE; | |
3627 | |
867 | 3628 p = (const Ibyte *) *p_ptr; |
3629 range_end = itext_ichar (p); | |
428 | 3630 p--; /* back to '-' */ |
867 | 3631 DEC_IBYTEPTR (p); /* back to start of range */ |
428 | 3632 /* We also want to fetch the endpoints without translating them; the |
3633 appropriate translation is done in the bit-setting loop below. */ | |
867 | 3634 range_start = itext_ichar (p); |
3635 INC_IBYTEPTR (*p_ptr); | |
428 | 3636 |
3637 /* If the start is after the end, the range is empty. */ | |
3638 if (range_start > range_end) | |
3639 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; | |
3640 | |
3641 /* Can't have ranges spanning different charsets, except maybe for | |
3642 ranges entirely within the first 256 chars. */ | |
3643 | |
3644 if ((range_start >= 0x100 || range_end >= 0x100) | |
867 | 3645 && ichar_leading_byte (range_start) != |
3646 ichar_leading_byte (range_end)) | |
428 | 3647 return REG_ERANGESPAN; |
3648 | |
826 | 3649 /* #### This might be way inefficient if the range encompasses 10,000 |
3650 chars or something. To be efficient, you'd have to do something like | |
3651 this: | |
428 | 3652 |
3653 range_table a; | |
3654 range_table b; | |
3655 map over translation table in [range_start, range_end] of | |
3656 (put the mapped range in a; | |
3657 put the translation in b) | |
3658 invert the range in a and truncate to [range_start, range_end] | |
3659 compute the union of a, b | |
3660 union the result into rtab | |
3661 */ | |
826 | 3662 for (this_char = range_start; this_char <= range_end; this_char++) |
428 | 3663 { |
826 | 3664 SET_RANGETAB_BIT (RE_TRANSLATE (this_char)); |
428 | 3665 } |
3666 | |
3667 if (this_char <= range_end) | |
3668 put_range_table (rtab, this_char, range_end, Qt); | |
3669 | |
3670 return REG_NOERROR; | |
3671 } | |
3672 | |
3673 #endif /* MULE */ | |
3674 | |
3675 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in | |
3676 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible | |
3677 characters can start a string that matches the pattern. This fastmap | |
3678 is used by re_search to skip quickly over impossible starting points. | |
3679 | |
3680 The caller must supply the address of a (1 << BYTEWIDTH)-byte data | |
3681 area as BUFP->fastmap. | |
3682 | |
3683 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in | |
3684 the pattern buffer. | |
3685 | |
3686 Returns 0 if we succeed, -2 if an internal error. */ | |
3687 | |
3688 int | |
826 | 3689 re_compile_fastmap (struct re_pattern_buffer *bufp |
3690 RE_LISP_SHORT_CONTEXT_ARGS_DECL) | |
428 | 3691 { |
3692 int j, k; | |
3693 #ifdef MATCH_MAY_ALLOCATE | |
3694 fail_stack_type fail_stack; | |
3695 #endif | |
456 | 3696 DECLARE_DESTINATION; |
428 | 3697 /* We don't push any register information onto the failure stack. */ |
3698 | |
826 | 3699 /* &&#### this should be changed for 8-bit-fixed, for efficiency. see |
3700 comment marked with &&#### in re_search_2. */ | |
3701 | |
428 | 3702 REGISTER char *fastmap = bufp->fastmap; |
3703 unsigned char *pattern = bufp->buffer; | |
647 | 3704 long size = bufp->used; |
428 | 3705 unsigned char *p = pattern; |
3706 REGISTER unsigned char *pend = pattern + size; | |
3707 | |
771 | 3708 #ifdef REGEX_REL_ALLOC |
428 | 3709 /* This holds the pointer to the failure stack, when |
3710 it is allocated relocatably. */ | |
3711 fail_stack_elt_t *failure_stack_ptr; | |
3712 #endif | |
3713 | |
3714 /* Assume that each path through the pattern can be null until | |
3715 proven otherwise. We set this false at the bottom of switch | |
3716 statement, to which we get only if a particular path doesn't | |
3717 match the empty string. */ | |
460 | 3718 re_bool path_can_be_null = true; |
428 | 3719 |
3720 /* We aren't doing a `succeed_n' to begin with. */ | |
460 | 3721 re_bool succeed_n_p = false; |
428 | 3722 |
1333 | 3723 #ifdef ERROR_CHECK_MALLOC |
3724 /* The pattern comes from string data, not buffer data. We don't access | |
3725 any buffer data, so we don't have to worry about malloc() (but the | |
3726 disallowed flag may have been set by a caller). */ | |
3727 int depth = bind_regex_malloc_disallowed (0); | |
3728 #endif | |
3729 | |
428 | 3730 assert (fastmap != NULL && p != NULL); |
3731 | |
3732 INIT_FAIL_STACK (); | |
3733 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */ | |
3734 bufp->fastmap_accurate = 1; /* It will be when we're done. */ | |
3735 bufp->can_be_null = 0; | |
3736 | |
3737 while (1) | |
3738 { | |
3739 if (p == pend || *p == succeed) | |
3740 { | |
3741 /* We have reached the (effective) end of pattern. */ | |
3742 if (!FAIL_STACK_EMPTY ()) | |
3743 { | |
3744 bufp->can_be_null |= path_can_be_null; | |
3745 | |
3746 /* Reset for next path. */ | |
3747 path_can_be_null = true; | |
3748 | |
446 | 3749 p = (unsigned char *) fail_stack.stack[--fail_stack.avail].pointer; |
428 | 3750 |
3751 continue; | |
3752 } | |
3753 else | |
3754 break; | |
3755 } | |
3756 | |
3757 /* We should never be about to go beyond the end of the pattern. */ | |
3758 assert (p < pend); | |
3759 | |
4759
aa5ed11f473b
Remove support for obsolete systems. See xemacs-patches message with ID
Jerry James <james@xemacs.org>
parents:
4750
diff
changeset
|
3760 switch ((re_opcode_t) *p++) |
428 | 3761 { |
3762 | |
3763 /* I guess the idea here is to simply not bother with a fastmap | |
3764 if a backreference is used, since it's too hard to figure out | |
3765 the fastmap for the corresponding group. Setting | |
3766 `can_be_null' stops `re_search_2' from using the fastmap, so | |
3767 that is all we do. */ | |
3768 case duplicate: | |
3769 bufp->can_be_null = 1; | |
3770 goto done; | |
3771 | |
3772 | |
3773 /* Following are the cases which match a character. These end | |
3774 with `break'. */ | |
3775 | |
3776 case exactn: | |
3777 fastmap[p[1]] = 1; | |
3778 break; | |
3779 | |
3780 | |
3781 case charset: | |
3782 /* XEmacs: Under Mule, these bit vectors will | |
3783 only contain values for characters below 0x80. */ | |
3784 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) | |
3785 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) | |
3786 fastmap[j] = 1; | |
3787 break; | |
3788 | |
3789 | |
3790 case charset_not: | |
3791 /* Chars beyond end of map must be allowed. */ | |
3792 #ifdef MULE | |
3793 for (j = *p * BYTEWIDTH; j < 0x80; j++) | |
3794 fastmap[j] = 1; | |
3795 /* And all extended characters must be allowed, too. */ | |
3796 for (j = 0x80; j < 0xA0; j++) | |
3797 fastmap[j] = 1; | |
446 | 3798 #else /* not MULE */ |
428 | 3799 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) |
3800 fastmap[j] = 1; | |
446 | 3801 #endif /* MULE */ |
428 | 3802 |
3803 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) | |
3804 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) | |
3805 fastmap[j] = 1; | |
3806 break; | |
3807 | |
3808 #ifdef MULE | |
3809 case charset_mule: | |
3810 { | |
3811 int nentries; | |
3812 int i; | |
3813 | |
3814 nentries = unified_range_table_nentries (p); | |
3815 for (i = 0; i < nentries; i++) | |
3816 { | |
3817 EMACS_INT first, last; | |
3818 Lisp_Object dummy_val; | |
3819 int jj; | |
867 | 3820 Ibyte strr[MAX_ICHAR_LEN]; |
428 | 3821 |
3822 unified_range_table_get_range (p, i, &first, &last, | |
3823 &dummy_val); | |
3824 for (jj = first; jj <= last && jj < 0x80; jj++) | |
3825 fastmap[jj] = 1; | |
3826 /* Ranges below 0x100 can span charsets, but there | |
3827 are only two (Control-1 and Latin-1), and | |
3828 either first or last has to be in them. */ | |
867 | 3829 set_itext_ichar (strr, first); |
428 | 3830 fastmap[*strr] = 1; |
3831 if (last < 0x100) | |
3832 { | |
867 | 3833 set_itext_ichar (strr, last); |
428 | 3834 fastmap[*strr] = 1; |
3835 } | |
3836 } | |
3837 } | |
3838 break; | |
3839 | |
3840 case charset_mule_not: | |
3841 { | |
3842 int nentries; | |
3843 int i; | |
4832
07fa38c30fdf
fix messed-up fastmap calculation in charset_mule_not
Ben Wing <ben@xemacs.org>
parents:
4759
diff
changeset
|
3844 int smallest_prev = 0; |
428 | 3845 |
3846 nentries = unified_range_table_nentries (p); | |
3847 for (i = 0; i < nentries; i++) | |
3848 { | |
3849 EMACS_INT first, last; | |
3850 Lisp_Object dummy_val; | |
3851 int jj; | |
3852 | |
3853 unified_range_table_get_range (p, i, &first, &last, | |
3854 &dummy_val); | |
3855 for (jj = smallest_prev; jj < first && jj < 0x80; jj++) | |
3856 fastmap[jj] = 1; | |
3857 smallest_prev = last + 1; | |
3858 if (smallest_prev >= 0x80) | |
3859 break; | |
3860 } | |
4832
07fa38c30fdf
fix messed-up fastmap calculation in charset_mule_not
Ben Wing <ben@xemacs.org>
parents:
4759
diff
changeset
|
3861 |
07fa38c30fdf
fix messed-up fastmap calculation in charset_mule_not
Ben Wing <ben@xemacs.org>
parents:
4759
diff
changeset
|
3862 /* Also set lead bytes after the end */ |
07fa38c30fdf
fix messed-up fastmap calculation in charset_mule_not
Ben Wing <ben@xemacs.org>
parents:
4759
diff
changeset
|
3863 for (i = smallest_prev; i < 0x80; i++) |
07fa38c30fdf
fix messed-up fastmap calculation in charset_mule_not
Ben Wing <ben@xemacs.org>
parents:
4759
diff
changeset
|
3864 fastmap[i] = 1; |
07fa38c30fdf
fix messed-up fastmap calculation in charset_mule_not
Ben Wing <ben@xemacs.org>
parents:
4759
diff
changeset
|
3865 |
428 | 3866 /* Calculating which leading bytes are actually allowed |
3867 here is rather difficult, so we just punt and allow | |
3868 all of them. */ | |
3869 for (i = 0x80; i < 0xA0; i++) | |
3870 fastmap[i] = 1; | |
3871 } | |
3872 break; | |
3873 #endif /* MULE */ | |
3874 | |
3875 | |
3876 case anychar: | |
3877 { | |
3878 int fastmap_newline = fastmap['\n']; | |
3879 | |
3880 /* `.' matches anything ... */ | |
3881 #ifdef MULE | |
3882 /* "anything" only includes bytes that can be the | |
3883 first byte of a character. */ | |
3884 for (j = 0; j < 0xA0; j++) | |
3885 fastmap[j] = 1; | |
3886 #else | |
3887 for (j = 0; j < (1 << BYTEWIDTH); j++) | |
3888 fastmap[j] = 1; | |
3889 #endif | |
3890 | |
3891 /* ... except perhaps newline. */ | |
3892 if (!(bufp->syntax & RE_DOT_NEWLINE)) | |
3893 fastmap['\n'] = fastmap_newline; | |
3894 | |
3895 /* Return if we have already set `can_be_null'; if we have, | |
3896 then the fastmap is irrelevant. Something's wrong here. */ | |
3897 else if (bufp->can_be_null) | |
3898 goto done; | |
3899 | |
3900 /* Otherwise, have to check alternative paths. */ | |
3901 break; | |
3902 } | |
3903 | |
826 | 3904 #ifndef emacs |
3905 case wordchar: | |
3906 for (j = 0; j < (1 << BYTEWIDTH); j++) | |
3907 if (SYNTAX (ignored, j) == Sword) | |
3908 fastmap[j] = 1; | |
3909 break; | |
3910 | |
3911 case notwordchar: | |
3912 for (j = 0; j < (1 << BYTEWIDTH); j++) | |
3913 if (SYNTAX (ignored, j) != Sword) | |
3914 fastmap[j] = 1; | |
3915 break; | |
3916 #else /* emacs */ | |
3917 case wordchar: | |
3918 case notwordchar: | |
460 | 3919 case wordbound: |
3920 case notwordbound: | |
3921 case wordbeg: | |
3922 case wordend: | |
3923 case notsyntaxspec: | |
3924 case syntaxspec: | |
3925 /* This match depends on text properties. These end with | |
3926 aborting optimizations. */ | |
3927 bufp->can_be_null = 1; | |
3928 goto done; | |
826 | 3929 #if 0 /* all of the following code is unused now that the `syntax-table' |
3930 property exists -- it's trickier to do this than just look in | |
3931 the buffer. &&#### but we could just use the syntax-cache stuff | |
3932 instead; why don't we? --ben */ | |
3933 case wordchar: | |
3934 k = (int) Sword; | |
3935 goto matchsyntax; | |
3936 | |
3937 case notwordchar: | |
3938 k = (int) Sword; | |
3939 goto matchnotsyntax; | |
3940 | |
428 | 3941 case syntaxspec: |
3942 k = *p++; | |
826 | 3943 matchsyntax: |
428 | 3944 #ifdef MULE |
3945 for (j = 0; j < 0x80; j++) | |
826 | 3946 if (SYNTAX |
3947 (XCHAR_TABLE (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf)), j) == | |
428 | 3948 (enum syntaxcode) k) |
3949 fastmap[j] = 1; | |
3950 for (j = 0x80; j < 0xA0; j++) | |
3951 { | |
826 | 3952 if (leading_byte_prefix_p ((unsigned char) j)) |
428 | 3953 /* too complicated to calculate this right */ |
3954 fastmap[j] = 1; | |
3955 else | |
3956 { | |
3957 int multi_p; | |
3958 Lisp_Object cset; | |
3959 | |
826 | 3960 cset = charset_by_leading_byte (j); |
428 | 3961 if (CHARSETP (cset)) |
3962 { | |
826 | 3963 if (charset_syntax (lispbuf, cset, &multi_p) |
428 | 3964 == Sword || multi_p) |
3965 fastmap[j] = 1; | |
3966 } | |
3967 } | |
3968 } | |
446 | 3969 #else /* not MULE */ |
428 | 3970 for (j = 0; j < (1 << BYTEWIDTH); j++) |
826 | 3971 if (SYNTAX |
3972 (XCHAR_TABLE (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf)), j) == | |
428 | 3973 (enum syntaxcode) k) |
3974 fastmap[j] = 1; | |
446 | 3975 #endif /* MULE */ |
428 | 3976 break; |
3977 | |
3978 | |
3979 case notsyntaxspec: | |
3980 k = *p++; | |
826 | 3981 matchnotsyntax: |
428 | 3982 #ifdef MULE |
3983 for (j = 0; j < 0x80; j++) | |
826 | 3984 if (SYNTAX |
428 | 3985 (XCHAR_TABLE |
826 | 3986 (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf)), j) != |
428 | 3987 (enum syntaxcode) k) |
3988 fastmap[j] = 1; | |
3989 for (j = 0x80; j < 0xA0; j++) | |
3990 { | |
826 | 3991 if (leading_byte_prefix_p ((unsigned char) j)) |
428 | 3992 /* too complicated to calculate this right */ |
3993 fastmap[j] = 1; | |
3994 else | |
3995 { | |
3996 int multi_p; | |
3997 Lisp_Object cset; | |
3998 | |
826 | 3999 cset = charset_by_leading_byte (j); |
428 | 4000 if (CHARSETP (cset)) |
4001 { | |
826 | 4002 if (charset_syntax (lispbuf, cset, &multi_p) |
428 | 4003 != Sword || multi_p) |
4004 fastmap[j] = 1; | |
4005 } | |
4006 } | |
4007 } | |
446 | 4008 #else /* not MULE */ |
428 | 4009 for (j = 0; j < (1 << BYTEWIDTH); j++) |
826 | 4010 if (SYNTAX |
428 | 4011 (XCHAR_TABLE |
826 | 4012 (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf)), j) != |
428 | 4013 (enum syntaxcode) k) |
4014 fastmap[j] = 1; | |
446 | 4015 #endif /* MULE */ |
428 | 4016 break; |
826 | 4017 #endif /* 0 */ |
428 | 4018 |
4019 #ifdef MULE | |
4020 /* 97/2/17 jhod category patch */ | |
4021 case categoryspec: | |
4022 case notcategoryspec: | |
4023 bufp->can_be_null = 1; | |
1333 | 4024 UNBIND_REGEX_MALLOC_CHECK (); |
428 | 4025 return 0; |
4026 /* end if category patch */ | |
4027 #endif /* MULE */ | |
4028 | |
4029 /* All cases after this match the empty string. These end with | |
4030 `continue'. */ | |
4031 case before_dot: | |
4032 case at_dot: | |
4033 case after_dot: | |
4034 continue; | |
826 | 4035 #endif /* emacs */ |
428 | 4036 |
4037 | |
4038 case no_op: | |
4039 case begline: | |
4040 case endline: | |
4041 case begbuf: | |
4042 case endbuf: | |
460 | 4043 #ifndef emacs |
428 | 4044 case wordbound: |
4045 case notwordbound: | |
4046 case wordbeg: | |
4047 case wordend: | |
460 | 4048 #endif |
428 | 4049 case push_dummy_failure: |
4050 continue; | |
4051 | |
4052 | |
4053 case jump_n: | |
4054 case pop_failure_jump: | |
4055 case maybe_pop_jump: | |
4056 case jump: | |
4057 case jump_past_alt: | |
4058 case dummy_failure_jump: | |
4059 EXTRACT_NUMBER_AND_INCR (j, p); | |
4060 p += j; | |
4061 if (j > 0) | |
4062 continue; | |
4063 | |
4064 /* Jump backward implies we just went through the body of a | |
4065 loop and matched nothing. Opcode jumped to should be | |
4066 `on_failure_jump' or `succeed_n'. Just treat it like an | |
4067 ordinary jump. For a * loop, it has pushed its failure | |
4068 point already; if so, discard that as redundant. */ | |
4069 if ((re_opcode_t) *p != on_failure_jump | |
4070 && (re_opcode_t) *p != succeed_n) | |
4071 continue; | |
4072 | |
4073 p++; | |
4074 EXTRACT_NUMBER_AND_INCR (j, p); | |
4075 p += j; | |
4076 | |
4077 /* If what's on the stack is where we are now, pop it. */ | |
4078 if (!FAIL_STACK_EMPTY () | |
4079 && fail_stack.stack[fail_stack.avail - 1].pointer == p) | |
4080 fail_stack.avail--; | |
4081 | |
4082 continue; | |
4083 | |
4084 | |
4085 case on_failure_jump: | |
4086 case on_failure_keep_string_jump: | |
4087 handle_on_failure_jump: | |
4088 EXTRACT_NUMBER_AND_INCR (j, p); | |
4089 | |
4090 /* For some patterns, e.g., `(a?)?', `p+j' here points to the | |
4091 end of the pattern. We don't want to push such a point, | |
4092 since when we restore it above, entering the switch will | |
4093 increment `p' past the end of the pattern. We don't need | |
4094 to push such a point since we obviously won't find any more | |
4095 fastmap entries beyond `pend'. Such a pattern can match | |
4096 the null string, though. */ | |
4097 if (p + j < pend) | |
4098 { | |
4099 if (!PUSH_PATTERN_OP (p + j, fail_stack)) | |
4100 { | |
4101 RESET_FAIL_STACK (); | |
1333 | 4102 UNBIND_REGEX_MALLOC_CHECK (); |
428 | 4103 return -2; |
4104 } | |
4105 } | |
4106 else | |
4107 bufp->can_be_null = 1; | |
4108 | |
4109 if (succeed_n_p) | |
4110 { | |
4111 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */ | |
4112 succeed_n_p = false; | |
4113 } | |
4114 | |
4115 continue; | |
4116 | |
4117 | |
4118 case succeed_n: | |
4119 /* Get to the number of times to succeed. */ | |
4120 p += 2; | |
4121 | |
4122 /* Increment p past the n for when k != 0. */ | |
4123 EXTRACT_NUMBER_AND_INCR (k, p); | |
4124 if (k == 0) | |
4125 { | |
4126 p -= 4; | |
4127 succeed_n_p = true; /* Spaghetti code alert. */ | |
4128 goto handle_on_failure_jump; | |
4129 } | |
4130 continue; | |
4131 | |
4132 | |
4133 case set_number_at: | |
4134 p += 4; | |
4135 continue; | |
4136 | |
4137 | |
4138 case start_memory: | |
4139 case stop_memory: | |
4140 p += 2; | |
4141 continue; | |
4142 | |
4143 | |
4144 default: | |
2500 | 4145 ABORT (); /* We have listed all the cases. */ |
428 | 4146 } /* switch *p++ */ |
4147 | |
4148 /* Getting here means we have found the possible starting | |
4149 characters for one path of the pattern -- and that the empty | |
4150 string does not match. We need not follow this path further. | |
4151 Instead, look at the next alternative (remembered on the | |
4152 stack), or quit if no more. The test at the top of the loop | |
4153 does these things. */ | |
4154 path_can_be_null = false; | |
4155 p = pend; | |
4156 } /* while p */ | |
4157 | |
4158 /* Set `can_be_null' for the last path (also the first path, if the | |
4159 pattern is empty). */ | |
4160 bufp->can_be_null |= path_can_be_null; | |
4161 | |
4162 done: | |
4163 RESET_FAIL_STACK (); | |
1333 | 4164 UNBIND_REGEX_MALLOC_CHECK (); |
428 | 4165 return 0; |
4166 } /* re_compile_fastmap */ | |
4167 | |
4168 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and | |
4169 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use | |
4170 this memory for recording register information. STARTS and ENDS | |
4171 must be allocated using the malloc library routine, and must each | |
4172 be at least NUM_REGS * sizeof (regoff_t) bytes long. | |
4173 | |
4174 If NUM_REGS == 0, then subsequent matches should allocate their own | |
4175 register data. | |
4176 | |
4177 Unless this function is called, the first search or match using | |
4178 PATTERN_BUFFER will allocate its own register data, without | |
4179 freeing the old data. */ | |
4180 | |
4181 void | |
4182 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs, | |
647 | 4183 int num_regs, regoff_t *starts, regoff_t *ends) |
428 | 4184 { |
4185 if (num_regs) | |
4186 { | |
4187 bufp->regs_allocated = REGS_REALLOCATE; | |
4188 regs->num_regs = num_regs; | |
4189 regs->start = starts; | |
4190 regs->end = ends; | |
4191 } | |
4192 else | |
4193 { | |
4194 bufp->regs_allocated = REGS_UNALLOCATED; | |
4195 regs->num_regs = 0; | |
4196 regs->start = regs->end = (regoff_t *) 0; | |
4197 } | |
4198 } | |
4199 | |
4200 /* Searching routines. */ | |
4201 | |
4202 /* Like re_search_2, below, but only one string is specified, and | |
4203 doesn't let you say where to stop matching. */ | |
4204 | |
4205 int | |
442 | 4206 re_search (struct re_pattern_buffer *bufp, const char *string, int size, |
826 | 4207 int startpos, int range, struct re_registers *regs |
4208 RE_LISP_CONTEXT_ARGS_DECL) | |
428 | 4209 { |
4210 return re_search_2 (bufp, NULL, 0, string, size, startpos, range, | |
826 | 4211 regs, size RE_LISP_CONTEXT_ARGS); |
428 | 4212 } |
4213 | |
4214 /* Using the compiled pattern in BUFP->buffer, first tries to match the | |
4215 virtual concatenation of STRING1 and STRING2, starting first at index | |
4216 STARTPOS, then at STARTPOS + 1, and so on. | |
4217 | |
4218 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively. | |
4219 | |
4220 RANGE is how far to scan while trying to match. RANGE = 0 means try | |
4221 only at STARTPOS; in general, the last start tried is STARTPOS + | |
4222 RANGE. | |
4223 | |
826 | 4224 All sizes and positions refer to bytes (not chars); under Mule, the code |
4225 knows about the format of the text and will only check at positions | |
4226 where a character starts. | |
4227 | |
428 | 4228 With MULE, RANGE is a byte position, not a char position. The last |
4229 start tried is the character starting <= STARTPOS + RANGE. | |
4230 | |
4231 In REGS, return the indices of the virtual concatenation of STRING1 | |
4232 and STRING2 that matched the entire BUFP->buffer and its contained | |
4233 subexpressions. | |
4234 | |
4235 Do not consider matching one past the index STOP in the virtual | |
4236 concatenation of STRING1 and STRING2. | |
4237 | |
4238 We return either the position in the strings at which the match was | |
4239 found, -1 if no match, or -2 if error (such as failure | |
4240 stack overflow). */ | |
4241 | |
4242 int | |
446 | 4243 re_search_2 (struct re_pattern_buffer *bufp, const char *str1, |
4244 int size1, const char *str2, int size2, int startpos, | |
826 | 4245 int range, struct re_registers *regs, int stop |
4246 RE_LISP_CONTEXT_ARGS_DECL) | |
428 | 4247 { |
  /* STRING1/STRING2 are re_char views of STR1/STR2.  ENDPOS is the last
     start position requested (STARTPOS + RANGE), computed before RANGE
     is clamped below. */
4248 int val; | |
446 | 4249 re_char *string1 = (re_char *) str1; |
4250 re_char *string2 = (re_char *) str2; | |
428 | 4251 REGISTER char *fastmap = bufp->fastmap; |
446 | 4252 REGISTER RE_TRANSLATE_TYPE translate = bufp->translate; |
428 | 4253 int total_size = size1 + size2; |
4254 int endpos = startpos + range; | |
4255 #ifdef REGEX_BEGLINE_CHECK | |
4256 int anchored_at_begline = 0; | |
4257 #endif | |
446 | 4258 re_char *d; |
826 | 4259 #ifdef emacs |
4260 Internal_Format fmt = buffer_or_other_internal_format (lispobj); | |
1346 | 4261 #ifdef REL_ALLOC |
4262 Ibyte *orig_buftext = | |
4263 BUFFERP (lispobj) ? | |
4264 BYTE_BUF_BYTE_ADDRESS (XBUFFER (lispobj), | |
4265 BYTE_BUF_BEGV (XBUFFER (lispobj))) : | |
4266 0; | |
4267 #endif | |
1333 | 4268 #ifdef ERROR_CHECK_MALLOC |
4269 int depth; | |
4270 #endif | |
826 | 4271 #endif /* emacs */ |
4272 #if 1 | |
4273 int forward_search_p; | |
4274 #endif | |
428 | 4275 |
4276 /* Check for out-of-range STARTPOS. */ | |
4277 if (startpos < 0 || startpos > total_size) | |
4278 return -1; | |
4279 | |
4280 /* Fix up RANGE if it might eventually take us outside | |
4281 the virtual concatenation of STRING1 and STRING2. */ | |
4282 if (endpos < 0) | |
4283 range = 0 - startpos; | |
4284 else if (endpos > total_size) | |
4285 range = total_size - startpos; | |
4286 | |
  /* Remember whether the (clamped) search is a forward one.  In this
     function the flag is consulted only by the assert()s that check
     RANGE never goes negative during a forward search. */
826 | 4287 #if 1 |
4288 forward_search_p = range > 0; | |
4289 #endif | |
4290 | |
428 | 4291 /* If the search isn't to be a backwards one, don't waste time in a |
4292 search for a pattern that must be anchored. */ | |
4293 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0) | |
4294 { | |
4295 if (startpos > 0) | |
4296 return -1; | |
4297 else | |
4298 { | |
      /* STARTPOS is necessarily 0 here.  Shrink RANGE to the value
         returned by itext_ichar_len_fmt for the character at D
         (presumably its length in bytes -- confirm), so only that one
         start position is tried. */
442 | 4299 d = ((const unsigned char *) |
428 | 4300 (startpos >= size1 ? string2 - size1 : string1) + startpos); |
867 | 4301 range = itext_ichar_len_fmt (d, fmt); |
428 | 4302 } |
4303 } | |
4304 | |
460 | 4305 #ifdef emacs |
4306 /* In a forward search for something that starts with \=, | |
4307 don't keep searching past point. */ | |
4308 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0) | |
4309 { | |
826 | 4310 if (!BUFFERP (lispobj)) |
4311 return -1; | |
      /* Recompute RANGE as BYTE_BUF_PT - BYTE_BUF_BEGV - STARTPOS
         (presumably the byte distance from STARTPOS to point, with
         STARTPOS measured from BEGV -- confirm).
         NOTE(review): the non-C lines interleaved with the next statement
         (changeset id, commit message, author, "parents:", "diff",
         "changeset") are changeset-header residue from the annotate
         listing this text was taken from, not part of the C source. */
4527
8418d1ad4944
Fix at_dot regex under Mule. <87hc6rv53v.fsf@uwakimon.sk.tsukuba.ac.jp>
Stephen J. Turnbull <stephen@xemacs.org>
parents:
3300
diff
changeset
|
4312 range = (BYTE_BUF_PT (XBUFFER (lispobj)) |
8418d1ad4944
Fix at_dot regex under Mule. <87hc6rv53v.fsf@uwakimon.sk.tsukuba.ac.jp>
Stephen J. Turnbull <stephen@xemacs.org>
parents:
3300
diff
changeset
|
4313 - BYTE_BUF_BEGV (XBUFFER (lispobj)) - startpos); |
460 | 4314 if (range < 0) |
4315 return -1; | |
4316 } | |
4317 #endif /* emacs */ | |
4318 | |
1333 | 4319 #ifdef ERROR_CHECK_MALLOC |
4320 /* Do this after the above return()s. */ | |
4321 depth = bind_regex_malloc_disallowed (1); | |
4322 #endif | |
4323 | |
428 | 4324 /* Update the fastmap now if not correct already. */ |
1333 | 4325 BEGIN_REGEX_MALLOC_OK (); |
428 | 4326 if (fastmap && !bufp->fastmap_accurate) |
826 | 4327 if (re_compile_fastmap (bufp RE_LISP_SHORT_CONTEXT_ARGS) == -2) |
1333 | 4328 { |
4329 END_REGEX_MALLOC_OK (); | |
4330 UNBIND_REGEX_MALLOC_CHECK (); | |
4331 return -2; | |
4332 } | |
4333 | |
4334 END_REGEX_MALLOC_OK (); | |
4335 RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
428 | 4336 |
4337 #ifdef REGEX_BEGLINE_CHECK | |
  /* Step over any leading start_memory/stop_memory opcodes (advancing 2
     bytes for each) and record whether the first other opcode is
     begline. */
4338 { | |
647 | 4339 long i = 0; |
428 | 4340 |
4341 while (i < bufp->used) | |
4342 { | |
4343 if (bufp->buffer[i] == start_memory || | |
4344 bufp->buffer[i] == stop_memory) | |
4345 i += 2; | |
4346 else | |
4347 break; | |
4348 } | |
4349 anchored_at_begline = i < bufp->used && bufp->buffer[i] == begline; | |
4350 } | |
4351 #endif | |
4352 | |
460 | 4353 #ifdef emacs |
1333 | 4354 BEGIN_REGEX_MALLOC_OK (); |
826 | 4355 scache = setup_syntax_cache (scache, lispobj, lispbuf, |
4356 offset_to_charxpos (lispobj, startpos), | |
4357 1); | |
1333 | 4358 END_REGEX_MALLOC_OK (); |
4359 RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
460 | 4360 #endif |
4361 | |
428 | 4362 /* Loop through the string, looking for a place to start matching. */ |
4363 for (;;) | |
4364 { | |
4365 #ifdef REGEX_BEGLINE_CHECK | |
826 | 4366 /* If the regex is anchored at the beginning of a line (i.e. with a |
4367 ^), then we can speed things up by skipping to the next | |
4368 beginning-of-line. However, to determine "beginning of line" we | |
4369 need to look at the previous char, so can't do this check if at | |
4370 beginning of either string. (Well, we could if at the beginning of | |
4371 the second string, but it would require additional code, and this | |
4372 is just an optimization.) */ | |
4373 if (anchored_at_begline && startpos > 0 && startpos != size1) | |
428 | 4374 { |
826 | 4375 if (range > 0) |
4376 { | |
4377 /* whose stupid idea was it anyway to make this | |
4378 function take two strings to match?? */ | |
4379 int lim = 0; | |
4380 re_char *orig_d; | |
4381 re_char *stop_d; | |
4382 | |
4383 /* Compute limit as below in fastmap code, so we are guaranteed | |
4384 to remain within a single string. */ | |
4385 if (startpos < size1 && startpos + range >= size1) | |
4386 lim = range - (size1 - startpos); | |
4387 | |
4388 d = ((const unsigned char *) | |
4389 (startpos >= size1 ? string2 - size1 : string1) + startpos); | |
4390 orig_d = d; | |
4391 stop_d = d + range - lim; | |
4392 | |
4393 /* We want to find the next location (including the current | |
4394 one) where the previous char is a newline, so back up one | |
4395 and search forward for a newline. */ | |
867 | 4396 DEC_IBYTEPTR_FMT (d, fmt); /* Ok, since startpos != size1. */ |
826 | 4397 |
4398 /* Written out as an if-else to avoid testing `translate' | |
4399 inside the loop. */ | |
4400 if (TRANSLATE_P (translate)) | |
4401 while (d < stop_d && | |
867 | 4402 RE_TRANSLATE_1 (itext_ichar_fmt (d, fmt, lispobj)) |
826 | 4403 != '\n') |
867 | 4404 INC_IBYTEPTR_FMT (d, fmt); |
826 | 4405 else |
4406 while (d < stop_d && | |
867 | 4407 itext_ichar_ascii_fmt (d, fmt, lispobj) != '\n') |
4408 INC_IBYTEPTR_FMT (d, fmt); | |
826 | 4409 |
4410 /* If we were stopped by a newline, skip forward over it. | |
4411 Otherwise we will get in an infloop when our start position | |
4412 was at begline. */ | |
4413 if (d < stop_d) | |
867 | 4414 INC_IBYTEPTR_FMT (d, fmt); |
          /* Account for the bytes skipped: D - ORIG_D may be negative if
             the scan stopped on the character we backed up onto. */
826 | 4415 range -= d - orig_d; |
4416 startpos += d - orig_d; | |
4417 #if 1 | |
4418 assert (!forward_search_p || range >= 0); | |
4419 #endif | |
4420 } | |
4421 else if (range < 0) | |
4422 { | |
4423 /* We're lazy, like in the fastmap code below */ | |
867 | 4424 Ichar c; |
826 | 4425 |
          /* Backward search: only test this one position.  If the
             previous character is not a newline, this position cannot
             match, so go straight to the next candidate. */
4426 d = ((const unsigned char *) | |
4427 (startpos >= size1 ? string2 - size1 : string1) + startpos); | |
867 | 4428 DEC_IBYTEPTR_FMT (d, fmt); |
4429 c = itext_ichar_fmt (d, fmt, lispobj); | |
826 | 4430 c = RE_TRANSLATE (c); |
4431 if (c != '\n') | |
4432 goto advance; | |
4433 } | |
428 | 4434 } |
4435 #endif /* REGEX_BEGLINE_CHECK */ | |
4436 | |
4437 /* If a fastmap is supplied, skip quickly over characters that | |
4438 cannot be the start of a match. If the pattern can match the | |
4439 null string, however, we don't need to skip characters; we want | |
4440 the first null string. */ | |
4441 if (fastmap && startpos < total_size && !bufp->can_be_null) | |
4442 { | |
826 | 4443 /* For the moment, fastmap always works as if buffer |
4444 is in default format, so convert chars in the search strings | |
4445 into default format as we go along, if necessary. | |
4446 | |
4447 &&#### fastmap needs rethinking for 8-bit-fixed so | |
4448 it's faster. We need it to reflect the raw | |
4449 8-bit-fixed values. That isn't so hard if we assume | |
4450 that the top 96 bytes represent a single 1-byte | |
4451 charset. For 16-bit/32-bit stuff it's probably not | |
4452 worth it to make the fastmap represent the raw, due to | |
4453 its nature -- we'd have to use the LSB for the | |
4454 fastmap, and that causes lots of problems with Mule | |
4455 chars, where it essentially wipes out the usefulness | |
4456 of the fastmap entirely. */ | |
428 | 4457 if (range > 0) /* Searching forwards. */ |
4458 { | |
          /* LIM is the part of RANGE that lies beyond the end of string1
             when the scan starts in string1, so each loop below stops at
             the string boundary.  IRANGE remembers the incoming RANGE so
             that STARTPOS can be advanced by the amount consumed. */
4459 int lim = 0; | |
4460 int irange = range; | |
4461 | |
4462 if (startpos < size1 && startpos + range >= size1) | |
4463 lim = range - (size1 - startpos); | |
4464 | |
442 | 4465 d = ((const unsigned char *) |
428 | 4466 (startpos >= size1 ? string2 - size1 : string1) + startpos); |
4467 | |
4468 /* Written out as an if-else to avoid testing `translate' | |
4469 inside the loop. */ | |
446 | 4470 if (TRANSLATE_P (translate)) |
826 | 4471 { |
4472 while (range > lim) | |
4473 { | |
4474 re_char *old_d = d; | |
428 | 4475 #ifdef MULE |
                  /* Translate the character, write it into TEMPCH with
                     set_itext_ichar, and index the fastmap by the first
                     byte of that representation. */
867 | 4476 Ibyte tempch[MAX_ICHAR_LEN]; |
4477 Ichar buf_ch = | |
4478 RE_TRANSLATE_1 (itext_ichar_fmt (d, fmt, lispobj)); | |
4479 set_itext_ichar (tempch, buf_ch); | |
826 | 4480 if (fastmap[*tempch]) |
4481 break; | |
446 | 4482 #else |
826 | 4483 if (fastmap[(unsigned char) RE_TRANSLATE_1 (*d)]) |
4484 break; | |
446 | 4485 #endif /* MULE */ |
867 | 4486 INC_IBYTEPTR_FMT (d, fmt); |
826 | 4487 range -= (d - old_d); |
4488 #if 1 | |
1333 | 4489 assert (!forward_search_p || range >= 0); |
826 | 4490 #endif |
4491 } | |
4492 } | |
4493 #ifdef MULE | |
4494 else if (fmt != FORMAT_DEFAULT) | |
4495 { | |
4496 while (range > lim) | |
4497 { | |
4498 re_char *old_d = d; | |
867 | 4499 Ibyte tempch[MAX_ICHAR_LEN]; |
4500 Ichar buf_ch = itext_ichar_fmt (d, fmt, lispobj); | |
4501 set_itext_ichar (tempch, buf_ch); | |
826 | 4502 if (fastmap[*tempch]) |
4503 break; | |
867 | 4504 INC_IBYTEPTR_FMT (d, fmt); |
826 | 4505 range -= (d - old_d); |
4506 #if 1 | |
1333 | 4507 assert (!forward_search_p || range >= 0); |
826 | 4508 #endif |
4509 } | |
4510 } | |
4511 #endif /* MULE */ | |
428 | 4512 else |
826 | 4513 { |
              /* No translation and (under MULE) default format: index
                 the fastmap directly with the byte at D. */
4514 while (range > lim && !fastmap[*d]) | |
4515 { | |
4516 re_char *old_d = d; | |
867 | 4517 INC_IBYTEPTR (d); |
826 | 4518 range -= (d - old_d); |
4519 #if 1 | |
4520 assert (!forward_search_p || range >= 0); | |
4521 #endif | |
4522 } | |
4523 } | |
428 | 4524 |
4525 startpos += irange - range; | |
4526 } | |
4527 else /* Searching backwards. */ | |
4528 { | |
826 | 4529 /* #### It's not clear why we don't just write a loop, like |
4530 for the moving-forward case. Perhaps the writer got lazy, | |
4531 since backward searches aren't so common. */ | |
4532 d = ((const unsigned char *) | |
4533 (startpos >= size1 ? string2 - size1 : string1) + startpos); | |
428 | 4534 #ifdef MULE |
826 | 4535 { |
867 | 4536 Ibyte tempch[MAX_ICHAR_LEN]; |
4537 Ichar buf_ch = | |
4538 RE_TRANSLATE (itext_ichar_fmt (d, fmt, lispobj)); | |
4539 set_itext_ichar (tempch, buf_ch); | |
826 | 4540 if (!fastmap[*tempch]) |
4541 goto advance; | |
4542 } | |
428 | 4543 #else |
826 | 4544 if (!fastmap[(unsigned char) RE_TRANSLATE (*d)]) |
446 | 4545 goto advance; |
826 | 4546 #endif /* MULE */ |
428 | 4547 } |
4548 } | |
4549 | |
4550 /* If can't match the null string, and that's all we have left, fail. */ | |
4551 if (range >= 0 && startpos == total_size && fastmap | |
4552 && !bufp->can_be_null) | |
1333 | 4553 { |
4554 UNBIND_REGEX_MALLOC_CHECK (); | |
4555 return -1; | |
4556 } | |
428 | 4557 |
4558 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */ | |
4559 if (!no_quit_in_re_search) | |
1333 | 4560 { |
4561 BEGIN_REGEX_MALLOC_OK (); | |
4562 QUIT; | |
4563 END_REGEX_MALLOC_OK (); | |
4564 RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
4565 } | |
4566 | |
428 | 4567 #endif |
      /* Try an actual match at STARTPOS. */
1333 | 4568 BEGIN_REGEX_MALLOC_OK (); |
428 | 4569 val = re_match_2_internal (bufp, string1, size1, string2, size2, |
826 | 4570 startpos, regs, stop |
4571 RE_LISP_CONTEXT_ARGS); | |
428 | 4572 #ifndef REGEX_MALLOC |
1333 | 4573 ALLOCA_GARBAGE_COLLECT (); |
428 | 4574 #endif |
1333 | 4575 END_REGEX_MALLOC_OK (); |
4576 RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
428 | 4577 |
      /* A non-negative VAL means the match succeeded; the caller gets
         the position where it started, not VAL itself. */
4578 if (val >= 0) | |
1333 | 4579 { |
4580 UNBIND_REGEX_MALLOC_CHECK (); | |
4581 return startpos; | |
4582 } | |
428 | 4583 |
      /* -2 is the matcher's error value; propagate it unchanged. */
4584 if (val == -2) | |
1333 | 4585 { |
4586 UNBIND_REGEX_MALLOC_CHECK (); | |
4587 return -2; | |
4588 } | |
4589 | |
      /* NOTE(review): the same relocation macro was already invoked right
         after re_match_2_internal returned, with no intervening calls on
         this path; this second invocation looks redundant -- confirm. */
4590 RE_SEARCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
428 | 4591 advance: |
      /* Move to the next candidate start position.  RANGE == 0 means
         there are no more positions to try. */
4592 if (!range) | |
4593 break; | |
4594 else if (range > 0) | |
4595 { | |
          /* Forward: step over the character at STARTPOS, moving
             STARTPOS up and RANGE down by D_SIZE. */
826 | 4596 Bytecount d_size; |
442 | 4597 d = ((const unsigned char *) |
428 | 4598 (startpos >= size1 ? string2 - size1 : string1) + startpos); |
867 | 4599 d_size = itext_ichar_len_fmt (d, fmt); |
428 | 4600 range -= d_size; |
826 | 4601 #if 1 |
4602 assert (!forward_search_p || range >= 0); | |
4603 #endif | |
428 | 4604 startpos += d_size; |
4605 } | |
4606 else | |
4607 { | |
          /* Backward: back D up by one character, then move STARTPOS
             down and RANGE up (toward zero) by D_SIZE. */
826 | 4608 Bytecount d_size; |
428 | 4609 /* Note startpos > size1 not >=. If we are on the |
4610 string1/string2 boundary, we want to backup into string1. */ | |
442 | 4611 d = ((const unsigned char *) |
428 | 4612 (startpos > size1 ? string2 - size1 : string1) + startpos); |
867 | 4613 DEC_IBYTEPTR_FMT (d, fmt); |
4614 d_size = itext_ichar_len_fmt (d, fmt); | |
428 | 4615 range += d_size; |
826 | 4616 #if 1 |
4617 assert (!forward_search_p || range >= 0); | |
4618 #endif | |
428 | 4619 startpos -= d_size; |
4620 } | |
4621 } | |
  /* Every candidate start position was tried without a match. */
1333 | 4622 UNBIND_REGEX_MALLOC_CHECK (); |
428 | 4623 return -1; |
4624 } /* re_search_2 */ | |
826 | 4625 |
428 | 4626 |
4627 /* Declarations and macros for re_match_2. */ | |
4628 | |
4629 /* This converts PTR, a pointer into one of the search strings `string1' | |
4630 and `string2' into an offset from the beginning of that string. */ | |
/* For a pointer into `string2' the result additionally includes `size1'. */
4631 #define POINTER_TO_OFFSET(ptr) \ | |
4632 (FIRST_STRING_P (ptr) \ | |
4633 ? ((regoff_t) ((ptr) - string1)) \ | |
4634 : ((regoff_t) ((ptr) - string2 + size1))) | |
4635 | |
4636 /* Macros for dealing with the split strings in re_match_2. */ | |
4637 | |
/* True when the current scan limit `dend' equals the string1 limit
   `end_match_1'. */
4638 #define MATCHING_IN_FIRST_STRING (dend == end_match_1) | |
4639 | |
4640 /* Call before fetching a character with *d. This switches over to | |
4641 string2 if necessary. */ | |
/* Uses `goto fail', so it is only usable where a `fail' label is in
   scope. */
826 | 4642 #define REGEX_PREFETCH() \ |
428 | 4643 while (d == dend) \ |
4644 { \ | |
4645 /* End of string2 => fail. */ \ | |
4646 if (dend == end_match_2) \ | |
4647 goto fail; \ | |
4648 /* End of string1 => advance to string2. */ \ | |
4649 d = string2; \ | |
4650 dend = end_match_2; \ | |
4651 } | |
4652 | |
4653 | |
4654 /* Test if at very beginning or at very end of the virtual concatenation | |
4655 of `string1' and `string2'. If only one string, it's `string2'. */ | |
/* Note that AT_STRINGS_BEG is also true whenever `size2' is zero,
   whatever D is. */
4656 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2) | |
4657 #define AT_STRINGS_END(d) ((d) == end2) | |
4658 | |
4659 /* XEmacs change: | |
4660 If the given position straddles the string gap, return the equivalent | |
4661 position that is before or after the gap, respectively; otherwise, | |
4662 return the same position. */ | |
/* Both macros evaluate D twice and compare it only against the boundary
   pointers `string2' and `end1'. */
4663 #define POS_BEFORE_GAP_UNSAFE(d) ((d) == string2 ? end1 : (d)) | |
4664 #define POS_AFTER_GAP_UNSAFE(d) ((d) == end1 ? string2 : (d)) | |
4665 | |
4666 /* Test if CH is a word-constituent character. (XEmacs change) */ | |
/* Reads the variable `lispbuf' from the enclosing scope. */
826 | 4667 #define WORDCHAR_P(ch) \ |
4668 (SYNTAX (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf), ch) == Sword) | |
428 | 4669 |
4670 /* Free everything we malloc. */ | |
4671 #ifdef MATCH_MAY_ALLOCATE | |
/* FREE_VAR expands to two statements: a REGEX_FREE guarded by `if (var)'
   followed by an unconditional `var = NULL'.  It is therefore only safe
   where a statement sequence is valid, as inside the do/while body of
   FREE_VARIABLES below. */
1726 | 4672 #define FREE_VAR(var,type) if (var) REGEX_FREE (var, type); var = NULL |
/* Unbinds the regex malloc check, frees the failure stack, then frees and
   nulls each register vector.  Call on every exit path of the matcher. */
428 | 4673 #define FREE_VARIABLES() \ |
4674 do { \ | |
1333 | 4675 UNBIND_REGEX_MALLOC_CHECK (); \ |
428 | 4676 REGEX_FREE_STACK (fail_stack.stack); \ |
1726 | 4677 FREE_VAR (regstart, re_char **); \ |
4678 FREE_VAR (regend, re_char **); \ | |
4679 FREE_VAR (old_regstart, re_char **); \ | |
4680 FREE_VAR (old_regend, re_char **); \ | |
4681 FREE_VAR (best_regstart, re_char **); \ | |
4682 FREE_VAR (best_regend, re_char **); \ | |
4683 FREE_VAR (reg_info, register_info_type *); \ | |
4684 FREE_VAR (reg_dummy, re_char **); \ | |
4685 FREE_VAR (reg_info_dummy, register_info_type *); \ | |
428 | 4686 } while (0) |
446 | 4687 #else /* not MATCH_MAY_ALLOCATE */ |
/* Without MATCH_MAY_ALLOCATE there is nothing to free here; only the
   malloc check is unbound. */
1333 | 4688 #define FREE_VARIABLES() \ |
4689 do { \ | |
4690 UNBIND_REGEX_MALLOC_CHECK (); \ | |
4691 } while (0) | |
446 | 4692 #endif /* MATCH_MAY_ALLOCATE */ |
428 | 4693 |
4694 /* These values must meet several constraints. They must not be valid | |
4695 register values; since we have a limit of 255 registers (because | |
4696 we use only one byte in the pattern for the register number), we can | |
4697 use numbers larger than 255. They must differ by 1, because of | |
4698 NUM_FAILURE_ITEMS above. And the value for the lowest register must | |
4699 be larger than the value for the highest register, so we do not try | |
4700 to actually save any registers when none are active. */ | |
/* NO_LOWEST_ACTIVE_REG is defined as NO_HIGHEST_ACTIVE_REG + 1, which
   satisfies both the differ-by-1 and the lowest > highest constraints. */
4701 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH) | |
4702 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1) | |
4703 | |
4704 /* Matching routines. */ | |
4705 | |
826 | 4706 #ifndef emacs /* XEmacs never uses this. */ |
428 | 4707 /* re_match is like re_match_2 except it takes only a single string. */ |
4708 | |
4709 int | |
442 | 4710 re_match (struct re_pattern_buffer *bufp, const char *string, int size, |
826 | 4711 int pos, struct re_registers *regs |
4712 RE_LISP_CONTEXT_ARGS_DECL) | |
428 | 4713 { |
  /* Pass STRING as the second string with an empty first string
     (NULL, 0), and use SIZE as the STOP position.  The matcher's result
     is returned unchanged after the alloca cleanup. */
446 | 4714 int result = re_match_2_internal (bufp, NULL, 0, (re_char *) string, size, |
826 | 4715 pos, regs, size |
4716 RE_LISP_CONTEXT_ARGS); | |
1333 | 4717 ALLOCA_GARBAGE_COLLECT (); |
428 | 4718 return result; |
4719 } | |
4720 #endif /* not emacs */ | |
4721 | |
4722 /* re_match_2 matches the compiled pattern in BUFP against the | |
4723 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 and | |
4724 SIZE2, respectively). We start matching at POS, and stop matching | |
4725 at STOP. | |
4726 | |
4727 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we | |
4728 store offsets for the substring each group matched in REGS. See the | |
4729 documentation for exactly how many groups we fill. | |
   NOTE(review): "nonzero" above looks inverted -- a set `no_sub' field
   normally means subexpression information is NOT returned.  Confirm
   against the code that fills REGS in re_match_2_internal.
4730 | |
4731 We return -1 if no match, -2 if an internal error (such as the | |
4732 failure stack overflowing). Otherwise, we return the length of the | |
4733 matched substring. */ | |
4734 | |
4735 int | |
442 | 4736 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, |
4737 int size1, const char *string2, int size2, int pos, | |
826 | 4738 struct re_registers *regs, int stop |
4739 RE_LISP_CONTEXT_ARGS_DECL) | |
428 | 4740 { |
460 | 4741 int result; |
4742 | |
4743 #ifdef emacs | |
  /* Set up the syntax cache for the position corresponding to POS before
     matching. */
826 | 4744 scache = setup_syntax_cache (scache, lispobj, lispbuf, |
4745 offset_to_charxpos (lispobj, pos), | |
4746 1); | |
460 | 4747 #endif |
4748 | |
  /* The real work happens in re_match_2_internal; this wrapper only adds
     the syntax-cache setup above and the alloca cleanup below. */
4749 result = re_match_2_internal (bufp, (re_char *) string1, size1, | |
4750 (re_char *) string2, size2, | |
826 | 4751 pos, regs, stop |
4752 RE_LISP_CONTEXT_ARGS); | |
460 | 4753 |
1333 | 4754 ALLOCA_GARBAGE_COLLECT (); |
428 | 4755 return result; |
4756 } | |
4757 | |
4758 /* This is a separate function so that we can force an alloca cleanup | |
4759 afterwards. */ | |
4760 static int | |
446 | 4761 re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1, |
4762 int size1, re_char *string2, int size2, int pos, | |
826 | 4763 struct re_registers *regs, int stop |
2333 | 4764 RE_LISP_CONTEXT_ARGS_MULE_DECL) |
428 | 4765 { |
4766 /* General temporaries. */ | |
4767 int mcnt; | |
4768 unsigned char *p1; | |
4769 int should_succeed; /* XEmacs change */ | |
4770 | |
4771 /* Just past the end of the corresponding string. */ | |
446 | 4772 re_char *end1, *end2; |
428 | 4773 |
4774 /* Pointers into string1 and string2, just past the last characters in | |
4775 each to consider matching. */ | |
446 | 4776 re_char *end_match_1, *end_match_2; |
428 | 4777 |
4778 /* Where we are in the data, and the end of the current string. */ | |
446 | 4779 re_char *d, *dend; |
428 | 4780 |
4781 /* Where we are in the pattern, and the end of the pattern. */ | |
4782 unsigned char *p = bufp->buffer; | |
4783 REGISTER unsigned char *pend = p + bufp->used; | |
4784 | |
4785 /* Mark the opcode just after a start_memory, so we can test for an | |
4786 empty subpattern when we get to the stop_memory. */ | |
446 | 4787 re_char *just_past_start_mem = 0; |
428 | 4788 |
4789 /* We use this to map every character in the string. */ | |
446 | 4790 RE_TRANSLATE_TYPE translate = bufp->translate; |
428 | 4791 |
4792 /* Failure point stack. Each place that can handle a failure further | |
4793 down the line pushes a failure point on this stack. It consists of | |
4794 regstart, regend, and reg_info for all registers corresponding to | |
4795 the subexpressions we're currently inside, plus the number of such | |
4796 registers, and, finally, two char *'s. The first char * is where | |
4797 to resume scanning the pattern; the second one is where to resume | |
4798 scanning the strings. If the latter is zero, the failure point is | |
4799 a ``dummy''; if a failure happens and the failure point is a dummy, | |
4800 it gets discarded and the next one is tried. */ | |
4801 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ | |
4802 fail_stack_type fail_stack; | |
4803 #endif | |
4804 #ifdef DEBUG | |
647 | 4805 static int failure_id; |
4806 int nfailure_points_pushed = 0, nfailure_points_popped = 0; | |
428 | 4807 #endif |
4808 | |
771 | 4809 #ifdef REGEX_REL_ALLOC |
428 | 4810 /* This holds the pointer to the failure stack, when |
4811 it is allocated relocatably. */ | |
4812 fail_stack_elt_t *failure_stack_ptr; | |
4813 #endif | |
4814 | |
4815 /* We fill all the registers internally, independent of what we | |
4816 return, for use in backreferences. The number here includes | |
4817 an element for register zero. */ | |
647 | 4818 int num_regs = bufp->re_ngroups + 1; |
428 | 4819 |
4820 /* The currently active registers. */ | |
647 | 4821 int lowest_active_reg = NO_LOWEST_ACTIVE_REG; |
4822 int highest_active_reg = NO_HIGHEST_ACTIVE_REG; | |
428 | 4823 |
4824 /* Information on the contents of registers. These are pointers into | |
4825 the input strings; they record just what was matched (on this | |
4826 attempt) by a subexpression part of the pattern, that is, the | |
4827 regnum-th regstart pointer points to where in the input we began | |
4828 matching and the regnum-th regend points to right after where we | |
4829 stopped matching the regnum-th subexpression. (The zeroth register | |
4830 keeps track of what the whole pattern matches.) */ | |
4831 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ | |
446 | 4832 re_char **regstart, **regend; |
428 | 4833 #endif |
4834 | |
4835 /* If a group that's operated upon by a repetition operator fails to | |
4836 match anything, then the register for its start will need to be | |
4837 restored because it will have been set to wherever in the string we | |
4838 are when we last see its open-group operator. Similarly for a | |
4839 register's end. */ | |
4840 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ | |
446 | 4841 re_char **old_regstart, **old_regend; |
428 | 4842 #endif |
4843 | |
4844 /* The is_active field of reg_info helps us keep track of which (possibly | |
4845 nested) subexpressions we are currently in. The matched_something | |
4846 field of reg_info[reg_num] helps us tell whether or not we have | |
4847 matched any of the pattern so far this time through the reg_num-th | |
4848 subexpression. These two fields get reset each time through any | |
4849 loop their register is in. */ | |
4850 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ | |
4851 register_info_type *reg_info; | |
4852 #endif | |
4853 | |
4854 /* The following record the register info as found in the above | |
4855 variables when we find a match better than any we've seen before. | |
4856 This happens as we backtrack through the failure points, which in | |
4857 turn happens only if we have not yet matched the entire string. */ | |
647 | 4858 int best_regs_set = false; |
428 | 4859 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ |
446 | 4860 re_char **best_regstart, **best_regend; |
428 | 4861 #endif |
4862 | |
4863 /* Logically, this is `best_regend[0]'. But we don't want to have to | |
4864 allocate space for that if we're not allocating space for anything | |
4865 else (see below). Also, we never need info about register 0 for | |
4866 any of the other register vectors, and it seems rather a kludge to | |
4867 treat `best_regend' differently than the rest. So we keep track of | |
4868 the end of the best match so far in a separate variable. We | |
4869 initialize this to NULL so that when we backtrack the first time | |
4870 and need to test it, it's not garbage. */ | |
446 | 4871 re_char *match_end = NULL; |
428 | 4872 |
4873 /* This helps SET_REGS_MATCHED avoid doing redundant work. */ | |
4874 int set_regs_matched_done = 0; | |
4875 | |
4876 /* Used when we pop values we don't care about. */ | |
4877 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ | |
446 | 4878 re_char **reg_dummy; |
428 | 4879 register_info_type *reg_info_dummy; |
4880 #endif | |
4881 | |
4882 #ifdef DEBUG | |
4883 /* Counts the total number of registers pushed. */ | |
647 | 4884 int num_regs_pushed = 0; |
428 | 4885 #endif |
4886 | |
4887 /* 1 if this match ends in the same string (string1 or string2) | |
4888 as the best previous match. */ | |
460 | 4889 re_bool same_str_p; |
428 | 4890 |
4891 /* 1 if this match is the best seen so far. */ | |
460 | 4892 re_bool best_match_p; |
428 | 4893 |
826 | 4894 #ifdef emacs |
4895 Internal_Format fmt = buffer_or_other_internal_format (lispobj); | |
1346 | 4896 #ifdef REL_ALLOC |
4897 Ibyte *orig_buftext = | |
4898 BUFFERP (lispobj) ? | |
4899 BYTE_BUF_BYTE_ADDRESS (XBUFFER (lispobj), | |
4900 BYTE_BUF_BEGV (XBUFFER (lispobj))) : | |
4901 0; | |
4902 #endif | |
4903 | |
1333 | 4904 #ifdef ERROR_CHECK_MALLOC |
4905 int depth = bind_regex_malloc_disallowed (1); | |
4906 #endif | |
826 | 4907 #endif /* emacs */ |
771 | 4908 |
428 | 4909 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n"); |
4910 | |
1333 | 4911 BEGIN_REGEX_MALLOC_OK (); |
428 | 4912 INIT_FAIL_STACK (); |
1333 | 4913 END_REGEX_MALLOC_OK (); |
428 | 4914 |
4915 #ifdef MATCH_MAY_ALLOCATE | |
4916 /* Do not bother to initialize all the register variables if there are | |
4917 no groups in the pattern, as it takes a fair amount of time. If | |
4918 there are groups, we include space for register 0 (the whole | |
4919 pattern), even though we never use it, since it simplifies the | |
4920 array indexing. We should fix this. */ | |
502 | 4921 if (bufp->re_ngroups) |
428 | 4922 { |
1333 | 4923 BEGIN_REGEX_MALLOC_OK (); |
446 | 4924 regstart = REGEX_TALLOC (num_regs, re_char *); |
4925 regend = REGEX_TALLOC (num_regs, re_char *); | |
4926 old_regstart = REGEX_TALLOC (num_regs, re_char *); | |
4927 old_regend = REGEX_TALLOC (num_regs, re_char *); | |
4928 best_regstart = REGEX_TALLOC (num_regs, re_char *); | |
4929 best_regend = REGEX_TALLOC (num_regs, re_char *); | |
428 | 4930 reg_info = REGEX_TALLOC (num_regs, register_info_type); |
446 | 4931 reg_dummy = REGEX_TALLOC (num_regs, re_char *); |
428 | 4932 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type); |
1333 | 4933 END_REGEX_MALLOC_OK (); |
428 | 4934 |
4935 if (!(regstart && regend && old_regstart && old_regend && reg_info | |
4936 && best_regstart && best_regend && reg_dummy && reg_info_dummy)) | |
4937 { | |
4938 FREE_VARIABLES (); | |
4939 return -2; | |
4940 } | |
4941 } | |
4942 else | |
4943 { | |
4944 /* We must initialize all our variables to NULL, so that | |
4945 `FREE_VARIABLES' doesn't try to free them. */ | |
4946 regstart = regend = old_regstart = old_regend = best_regstart | |
4947 = best_regend = reg_dummy = NULL; | |
4948 reg_info = reg_info_dummy = (register_info_type *) NULL; | |
4949 } | |
4950 #endif /* MATCH_MAY_ALLOCATE */ | |
4951 | |
1333 | 4952 #if defined (emacs) && defined (REL_ALLOC) |
4953 { | |
4954 /* If the allocations above (or the call to setup_syntax_cache() in | |
4955 re_match_2) caused a rel-alloc relocation, then fix up the data | |
4956 pointers */ | |
1346 | 4957 Bytecount offset = offset_post_relocation (lispobj, orig_buftext); |
1333 | 4958 if (offset) |
4959 { | |
4960 string1 += offset; | |
4961 string2 += offset; | |
4962 } | |
4963 } | |
4964 #endif /* defined (emacs) && defined (REL_ALLOC) */ | |
4965 | |
428 | 4966 /* The starting position is bogus. */ |
4967 if (pos < 0 || pos > size1 + size2) | |
4968 { | |
4969 FREE_VARIABLES (); | |
4970 return -1; | |
4971 } | |
4972 | |
4973 /* Initialize subexpression text positions to -1 to mark ones that no | |
4974 start_memory/stop_memory has been seen for. Also initialize the | |
4975 register information struct. */ | |
4976 for (mcnt = 1; mcnt < num_regs; mcnt++) | |
4977 { | |
4978 regstart[mcnt] = regend[mcnt] | |
4979 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE; | |
4980 | |
4981 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE; | |
4982 IS_ACTIVE (reg_info[mcnt]) = 0; | |
4983 MATCHED_SOMETHING (reg_info[mcnt]) = 0; | |
4984 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0; | |
4985 } | |
4986 /* We move `string1' into `string2' if the latter's empty -- but not if | |
4987 `string1' is null. */ | |
4988 if (size2 == 0 && string1 != NULL) | |
4989 { | |
4990 string2 = string1; | |
4991 size2 = size1; | |
4992 string1 = 0; | |
4993 size1 = 0; | |
4994 } | |
4995 end1 = string1 + size1; | |
4996 end2 = string2 + size2; | |
4997 | |
4998 /* Compute where to stop matching, within the two strings. */ | |
4999 if (stop <= size1) | |
5000 { | |
5001 end_match_1 = string1 + stop; | |
5002 end_match_2 = string2; | |
5003 } | |
5004 else | |
5005 { | |
5006 end_match_1 = end1; | |
5007 end_match_2 = string2 + stop - size1; | |
5008 } | |
5009 | |
5010 /* `p' scans through the pattern as `d' scans through the data. | |
5011 `dend' is the end of the input string that `d' points within. `d' | |
5012 is advanced into the following input string whenever necessary, but | |
5013 this happens before fetching; therefore, at the beginning of the | |
5014 loop, `d' can be pointing at the end of a string, but it cannot | |
5015 equal `string2'. */ | |
5016 if (size1 > 0 && pos <= size1) | |
5017 { | |
5018 d = string1 + pos; | |
5019 dend = end_match_1; | |
5020 } | |
5021 else | |
5022 { | |
5023 d = string2 + pos - size1; | |
5024 dend = end_match_2; | |
5025 } | |
5026 | |
446 | 5027 DEBUG_PRINT1 ("The compiled pattern is: \n"); |
428 | 5028 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend); |
5029 DEBUG_PRINT1 ("The string to match is: `"); | |
5030 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2); | |
5031 DEBUG_PRINT1 ("'\n"); | |
5032 | |
5033 /* This loops over pattern commands. It exits by returning from the | |
5034 function if the match is complete, or it drops through if the match | |
5035 fails at this starting point in the input data. */ | |
5036 for (;;) | |
5037 { | |
5038 DEBUG_PRINT2 ("\n0x%lx: ", (long) p); | |
5039 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */ | |
5040 if (!no_quit_in_re_search) | |
1333 | 5041 { |
5042 BEGIN_REGEX_MALLOC_OK (); | |
5043 QUIT; | |
5044 END_REGEX_MALLOC_OK (); | |
1346 | 5045 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); |
1333 | 5046 } |
428 | 5047 #endif |
5048 | |
5049 if (p == pend) | |
5050 { /* End of pattern means we might have succeeded. */ | |
5051 DEBUG_PRINT1 ("end of pattern ... "); | |
5052 | |
5053 /* If we haven't matched the entire string, and we want the | |
5054 longest match, try backtracking. */ | |
5055 if (d != end_match_2) | |
5056 { | |
5057 same_str_p = (FIRST_STRING_P (match_end) | |
5058 == MATCHING_IN_FIRST_STRING); | |
5059 | |
5060 /* AIX compiler got confused when this was combined | |
5061 with the previous declaration. */ | |
5062 if (same_str_p) | |
5063 best_match_p = d > match_end; | |
5064 else | |
5065 best_match_p = !MATCHING_IN_FIRST_STRING; | |
5066 | |
5067 DEBUG_PRINT1 ("backtracking.\n"); | |
5068 | |
5069 if (!FAIL_STACK_EMPTY ()) | |
5070 { /* More failure points to try. */ | |
5071 | |
5072 /* If exceeds best match so far, save it. */ | |
5073 if (!best_regs_set || best_match_p) | |
5074 { | |
5075 best_regs_set = true; | |
5076 match_end = d; | |
5077 | |
5078 DEBUG_PRINT1 ("\nSAVING match as best so far.\n"); | |
5079 | |
5080 for (mcnt = 1; mcnt < num_regs; mcnt++) | |
5081 { | |
5082 best_regstart[mcnt] = regstart[mcnt]; | |
5083 best_regend[mcnt] = regend[mcnt]; | |
5084 } | |
5085 } | |
5086 goto fail; | |
5087 } | |
5088 | |
5089 /* If no failure points, don't restore garbage. And if | |
5090 last match is real best match, don't restore second | |
5091 best one. */ | |
5092 else if (best_regs_set && !best_match_p) | |
5093 { | |
5094 restore_best_regs: | |
5095 /* Restore best match. It may happen that `dend == | |
5096 end_match_1' while the restored d is in string2. | |
5097 For example, the pattern `x.*y.*z' against the | |
5098 strings `x-' and `y-z-', if the two strings are | |
5099 not consecutive in memory. */ | |
5100 DEBUG_PRINT1 ("Restoring best registers.\n"); | |
5101 | |
5102 d = match_end; | |
5103 dend = ((d >= string1 && d <= end1) | |
5104 ? end_match_1 : end_match_2); | |
5105 | |
5106 for (mcnt = 1; mcnt < num_regs; mcnt++) | |
5107 { | |
5108 regstart[mcnt] = best_regstart[mcnt]; | |
5109 regend[mcnt] = best_regend[mcnt]; | |
5110 } | |
5111 } | |
5112 } /* d != end_match_2 */ | |
5113 | |
5114 succeed_label: | |
5115 DEBUG_PRINT1 ("Accepting match.\n"); | |
5116 | |
5117 /* If caller wants register contents data back, do it. */ | |
1028 | 5118 { |
5119 int num_nonshy_regs = bufp->re_nsub + 1; | |
5120 if (regs && !bufp->no_sub) | |
5121 { | |
5122 /* Have the register data arrays been allocated? */ | |
5123 if (bufp->regs_allocated == REGS_UNALLOCATED) | |
5124 { /* No. So allocate them with malloc. We need one | |
5125 extra element beyond `num_regs' for the `-1' marker | |
5126 GNU code uses. */ | |
5127 regs->num_regs = MAX (RE_NREGS, num_nonshy_regs + 1); | |
1333 | 5128 BEGIN_REGEX_MALLOC_OK (); |
1028 | 5129 regs->start = TALLOC (regs->num_regs, regoff_t); |
5130 regs->end = TALLOC (regs->num_regs, regoff_t); | |
1333 | 5131 END_REGEX_MALLOC_OK (); |
5132 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
1028 | 5133 if (regs->start == NULL || regs->end == NULL) |
5134 { | |
5135 FREE_VARIABLES (); | |
5136 return -2; | |
5137 } | |
5138 bufp->regs_allocated = REGS_REALLOCATE; | |
5139 } | |
5140 else if (bufp->regs_allocated == REGS_REALLOCATE) | |
5141 { /* Yes. If we need more elements than were already | |
5142 allocated, reallocate them. If we need fewer, just | |
5143 leave it alone. */ | |
5144 if (regs->num_regs < num_nonshy_regs + 1) | |
5145 { | |
5146 regs->num_regs = num_nonshy_regs + 1; | |
1333 | 5147 BEGIN_REGEX_MALLOC_OK (); |
1028 | 5148 RETALLOC (regs->start, regs->num_regs, regoff_t); |
5149 RETALLOC (regs->end, regs->num_regs, regoff_t); | |
1333 | 5150 END_REGEX_MALLOC_OK (); |
5151 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
1028 | 5152 if (regs->start == NULL || regs->end == NULL) |
5153 { | |
5154 FREE_VARIABLES (); | |
5155 return -2; | |
5156 } | |
5157 } | |
5158 } | |
5159 else | |
5160 { | |
5161 /* The braces fend off an "empty body in an else-statement" | |
5162 warning under GCC when assert expands to nothing. */ | |
5163 assert (bufp->regs_allocated == REGS_FIXED); | |
5164 } | |
5165 | |
5166 /* Convert the pointer data in `regstart' and `regend' to | |
5167 indices. Register zero has to be set differently, | |
5168 since we haven't kept track of any info for it. */ | |
5169 if (regs->num_regs > 0) | |
5170 { | |
5171 regs->start[0] = pos; | |
5172 regs->end[0] = (MATCHING_IN_FIRST_STRING | |
5173 ? ((regoff_t) (d - string1)) | |
5174 : ((regoff_t) (d - string2 + size1))); | |
5175 } | |
5176 | |
2639 | 5177 /* Map over the NUM_NONSHY_REGS non-shy internal registers. |
5178 Copy each into the corresponding external register. | |
5179 MCNT indexes external registers. */ | |
1028 | 5180 for (mcnt = 1; mcnt < MIN (num_nonshy_regs, regs->num_regs); |
5181 mcnt++) | |
5182 { | |
5183 int internal_reg = bufp->external_to_internal_register[mcnt]; | |
5184 if (REG_UNSET (regstart[internal_reg]) || | |
5185 REG_UNSET (regend[internal_reg])) | |
5186 regs->start[mcnt] = regs->end[mcnt] = -1; | |
5187 else | |
5188 { | |
5189 regs->start[mcnt] = | |
5190 (regoff_t) POINTER_TO_OFFSET (regstart[internal_reg]); | |
5191 regs->end[mcnt] = | |
5192 (regoff_t) POINTER_TO_OFFSET (regend[internal_reg]); | |
5193 } | |
5194 } | |
5195 } /* regs && !bufp->no_sub */ | |
5196 | |
5197 /* If we have regs and the regs structure has more elements than | |
2639 | 5198 were in the pattern, set the extra elements starting with |
5199 NUM_NONSHY_REGS to -1. If we (re)allocated the registers, | |
5200 this is the case, because we always allocate enough to have | |
5201 at least one -1 at the end. | |
1028 | 5202 |
5203 We do this even when no_sub is set because some applications | |
5204 (XEmacs) reuse register structures which may contain stale | |
5205 information, and permit attempts to access those registers. | |
5206 | |
5207 It would be possible to require the caller to do this, but we'd | |
5208 have to change the API for this function to reflect that, and | |
1425 | 5209 audit all callers. Note: as of 2003-04-17 callers in XEmacs |
5210 do clear the registers, but it's safer to leave this code in | |
5211 because of reallocation. | |
5212 */ | |
1028 | 5213 if (regs && regs->num_regs > 0) |
5214 for (mcnt = num_nonshy_regs; mcnt < regs->num_regs; mcnt++) | |
5215 regs->start[mcnt] = regs->end[mcnt] = -1; | |
5216 } | |
428 | 5217 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n", |
5218 nfailure_points_pushed, nfailure_points_popped, | |
5219 nfailure_points_pushed - nfailure_points_popped); | |
5220 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed); | |
5221 | |
5222 mcnt = d - pos - (MATCHING_IN_FIRST_STRING | |
5223 ? string1 | |
5224 : string2 - size1); | |
5225 | |
5226 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt); | |
5227 | |
5228 FREE_VARIABLES (); | |
5229 return mcnt; | |
5230 } | |
5231 | |
5232 /* Otherwise match next pattern command. */ | |
4759
aa5ed11f473b
Remove support for obsolete systems. See xemacs-patches message with ID
Jerry James <james@xemacs.org>
parents:
4750
diff
changeset
|
5233 switch ((re_opcode_t) *p++) |
428 | 5234 { |
5235 /* Ignore these. Used to ignore the n of succeed_n's which | |
5236 currently have n == 0. */ | |
5237 case no_op: | |
5238 DEBUG_PRINT1 ("EXECUTING no_op.\n"); | |
5239 break; | |
5240 | |
5241 case succeed: | |
5242 DEBUG_PRINT1 ("EXECUTING succeed.\n"); | |
5243 goto succeed_label; | |
5244 | |
826 | 5245 /* Match exactly a string of length n in the pattern. The |
5246 following byte in the pattern defines n, and the n bytes after | |
5247 that make up the string to match. (Under Mule, this will be in | |
5248 the default internal format.) */ | |
428 | 5249 case exactn: |
5250 mcnt = *p++; | |
5251 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt); | |
5252 | |
5253 /* This is written out as an if-else so we don't waste time | |
5254 testing `translate' inside the loop. */ | |
446 | 5255 if (TRANSLATE_P (translate)) |
428 | 5256 { |
5257 do | |
5258 { | |
446 | 5259 #ifdef MULE |
5260 Bytecount pat_len; | |
5261 | |
450 | 5262 REGEX_PREFETCH (); |
867 | 5263 if (RE_TRANSLATE_1 (itext_ichar_fmt (d, fmt, lispobj)) |
5264 != itext_ichar (p)) | |
428 | 5265 goto fail; |
446 | 5266 |
867 | 5267 pat_len = itext_ichar_len (p); |
446 | 5268 p += pat_len; |
867 | 5269 INC_IBYTEPTR_FMT (d, fmt); |
446 | 5270 |
5271 mcnt -= pat_len; | |
5272 #else /* not MULE */ | |
450 | 5273 REGEX_PREFETCH (); |
826 | 5274 if ((unsigned char) RE_TRANSLATE_1 (*d++) != *p++) |
446 | 5275 goto fail; |
5276 mcnt--; | |
5277 #endif | |
428 | 5278 } |
446 | 5279 while (mcnt > 0); |
428 | 5280 } |
5281 else | |
5282 { | |
826 | 5283 #ifdef MULE |
5284 /* If buffer format is default, then we can shortcut and just | |
5285 compare the text directly, byte by byte. Otherwise, we | |
5286 need to go character by character. */ | |
5287 if (fmt != FORMAT_DEFAULT) | |
428 | 5288 { |
826 | 5289 do |
5290 { | |
5291 Bytecount pat_len; | |
5292 | |
5293 REGEX_PREFETCH (); | |
867 | 5294 if (itext_ichar_fmt (d, fmt, lispobj) != |
5295 itext_ichar (p)) | |
826 | 5296 goto fail; |
5297 | |
867 | 5298 pat_len = itext_ichar_len (p); |
826 | 5299 p += pat_len; |
867 | 5300 INC_IBYTEPTR_FMT (d, fmt); |
826 | 5301 |
5302 mcnt -= pat_len; | |
5303 } | |
5304 while (mcnt > 0); | |
428 | 5305 } |
826 | 5306 else |
5307 #endif | |
5308 { | |
5309 do | |
5310 { | |
5311 REGEX_PREFETCH (); | |
5312 if (*d++ != *p++) goto fail; | |
5313 mcnt--; | |
5314 } | |
5315 while (mcnt > 0); | |
5316 } | |
428 | 5317 } |
5318 SET_REGS_MATCHED (); | |
5319 break; | |
5320 | |
5321 | |
5322 /* Match any character except possibly a newline or a null. */ | |
5323 case anychar: | |
5324 DEBUG_PRINT1 ("EXECUTING anychar.\n"); | |
5325 | |
450 | 5326 REGEX_PREFETCH (); |
428 | 5327 |
826 | 5328 if ((!(bufp->syntax & RE_DOT_NEWLINE) && |
867 | 5329 RE_TRANSLATE (itext_ichar_fmt (d, fmt, lispobj)) == '\n') |
826 | 5330 || (bufp->syntax & RE_DOT_NOT_NULL && |
867 | 5331 RE_TRANSLATE (itext_ichar_fmt (d, fmt, lispobj)) == |
826 | 5332 '\000')) |
428 | 5333 goto fail; |
5334 | |
5335 SET_REGS_MATCHED (); | |
5336 DEBUG_PRINT2 (" Matched `%d'.\n", *d); | |
867 | 5337 INC_IBYTEPTR_FMT (d, fmt); /* XEmacs change */ |
428 | 5338 break; |
5339 | |
5340 | |
5341 case charset: | |
5342 case charset_not: | |
5343 { | |
1414 | 5344 REGISTER Ichar c; |
460 | 5345 re_bool not_p = (re_opcode_t) *(p - 1) == charset_not; |
458 | 5346 |
5347 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not_p ? "_not" : ""); | |
428 | 5348 |
450 | 5349 REGEX_PREFETCH (); |
867 | 5350 c = itext_ichar_fmt (d, fmt, lispobj); |
826 | 5351 c = RE_TRANSLATE (c); /* The character to match. */ |
428 | 5352 |
647 | 5353 /* Cast to `unsigned int' instead of `unsigned char' in case the |
428 | 5354 bit list is a full 32 bytes long. */ |
1414 | 5355 if ((unsigned int)c < (unsigned int) (*p * BYTEWIDTH) |
428 | 5356 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) |
458 | 5357 not_p = !not_p; |
428 | 5358 |
5359 p += 1 + *p; | |
5360 | |
458 | 5361 if (!not_p) goto fail; |
428 | 5362 |
5363 SET_REGS_MATCHED (); | |
867 | 5364 INC_IBYTEPTR_FMT (d, fmt); /* XEmacs change */ |
428 | 5365 break; |
5366 } | |
5367 | |
5368 #ifdef MULE | |
5369 case charset_mule: | |
5370 case charset_mule_not: | |
5371 { | |
867 | 5372 REGISTER Ichar c; |
460 | 5373 re_bool not_p = (re_opcode_t) *(p - 1) == charset_mule_not; |
458 | 5374 |
5375 DEBUG_PRINT2 ("EXECUTING charset_mule%s.\n", not_p ? "_not" : ""); | |
428 | 5376 |
450 | 5377 REGEX_PREFETCH (); |
867 | 5378 c = itext_ichar_fmt (d, fmt, lispobj); |
826 | 5379 c = RE_TRANSLATE (c); /* The character to match. */ |
428 | 5380 |
5381 if (EQ (Qt, unified_range_table_lookup (p, c, Qnil))) | |
458 | 5382 not_p = !not_p; |
428 | 5383 |
5384 p += unified_range_table_bytes_used (p); | |
5385 | |
458 | 5386 if (!not_p) goto fail; |
428 | 5387 |
5388 SET_REGS_MATCHED (); | |
867 | 5389 INC_IBYTEPTR_FMT (d, fmt); |
428 | 5390 break; |
5391 } | |
5392 #endif /* MULE */ | |
5393 | |
5394 | |
5395 /* The beginning of a group is represented by start_memory. | |
5396 The arguments are the register number in the next byte, and the | |
5397 number of groups inner to this one in the next. The text | |
5398 matched within the group is recorded (in the internal | |
5399 registers data structure) under the register number. */ | |
5400 case start_memory: | |
5401 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]); | |
5402 | |
5403 /* Find out if this group can match the empty string. */ | |
5404 p1 = p; /* To send to group_match_null_string_p. */ | |
5405 | |
5406 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE) | |
2639 | 5407 REG_MATCH_NULL_STRING_P (reg_info[*p]) |
5408 = group_match_null_string_p (&p1, pend, reg_info); | |
5409 | |
5410 DEBUG_PRINT2 (" group CAN%s match null string\n", | |
5411 REG_MATCH_NULL_STRING_P (reg_info[*p]) ? "NOT" : ""); | |
428 | 5412 |
5413 /* Save the position in the string where we were the last time | |
5414 we were at this open-group operator in case the group is | |
5415 operated upon by a repetition operator, e.g., with `(a*)*b' | |
5416 against `ab'; then we want to ignore where we are now in | |
5417 the string in case this attempt to match fails. */ | |
5418 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) | |
5419 ? REG_UNSET (regstart[*p]) ? d : regstart[*p] | |
5420 : regstart[*p]; | |
5421 DEBUG_PRINT2 (" old_regstart: %d\n", | |
5422 POINTER_TO_OFFSET (old_regstart[*p])); | |
5423 | |
5424 regstart[*p] = d; | |
5425 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p])); | |
5426 | |
5427 IS_ACTIVE (reg_info[*p]) = 1; | |
5428 MATCHED_SOMETHING (reg_info[*p]) = 0; | |
5429 | |
5430 /* Clear this whenever we change the register activity status. */ | |
5431 set_regs_matched_done = 0; | |
5432 | |
5433 /* This is the new highest active register. */ | |
5434 highest_active_reg = *p; | |
5435 | |
5436 /* If nothing was active before, this is the new lowest active | |
5437 register. */ | |
5438 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) | |
5439 lowest_active_reg = *p; | |
5440 | |
5441 /* Move past the register number and inner group count. */ | |
5442 p += 2; | |
5443 just_past_start_mem = p; | |
5444 | |
5445 break; | |
5446 | |
5447 | |
5448 /* The stop_memory opcode represents the end of a group. Its | |
5449 arguments are the same as start_memory's: the register | |
5450 number, and the number of inner groups. */ | |
5451 case stop_memory: | |
5452 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]); | |
5453 | |
5454 /* We need to save the string position the last time we were at | |
5455 this close-group operator in case the group is operated | |
5456 upon by a repetition operator, e.g., with `((a*)*(b*)*)*' | |
5457 against `aba'; then we want to ignore where we are now in | |
5458 the string in case this attempt to match fails. */ | |
5459 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) | |
5460 ? REG_UNSET (regend[*p]) ? d : regend[*p] | |
5461 : regend[*p]; | |
5462 DEBUG_PRINT2 (" old_regend: %d\n", | |
5463 POINTER_TO_OFFSET (old_regend[*p])); | |
5464 | |
5465 regend[*p] = d; | |
5466 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p])); | |
5467 | |
5468 /* This register isn't active anymore. */ | |
5469 IS_ACTIVE (reg_info[*p]) = 0; | |
5470 | |
5471 /* Clear this whenever we change the register activity status. */ | |
5472 set_regs_matched_done = 0; | |
5473 | |
5474 /* If this was the only register active, nothing is active | |
5475 anymore. */ | |
5476 if (lowest_active_reg == highest_active_reg) | |
5477 { | |
5478 lowest_active_reg = NO_LOWEST_ACTIVE_REG; | |
5479 highest_active_reg = NO_HIGHEST_ACTIVE_REG; | |
5480 } | |
5481 else | |
5482 { /* We must scan for the new highest active register, since | |
5483 it isn't necessarily one less than now: consider | |
5484 (a(b)c(d(e)f)g). When group 3 ends, after the f), the | |
5485 new highest active register is 1. */ | |
5486 unsigned char r = *p - 1; | |
5487 while (r > 0 && !IS_ACTIVE (reg_info[r])) | |
5488 r--; | |
5489 | |
5490 /* If we end up at register zero, that means that we saved | |
5491 the registers as the result of an `on_failure_jump', not | |
5492 a `start_memory', and we jumped to past the innermost | |
5493 `stop_memory'. For example, in ((.)*) we save | |
5494 registers 1 and 2 as a result of the *, but when we pop | |
5495 back to the second ), we are at the stop_memory 1. | |
5496 Thus, nothing is active. */ | |
5497 if (r == 0) | |
5498 { | |
5499 lowest_active_reg = NO_LOWEST_ACTIVE_REG; | |
5500 highest_active_reg = NO_HIGHEST_ACTIVE_REG; | |
5501 } | |
5502 else | |
5503 { | |
5504 highest_active_reg = r; | |
5505 | |
5506 /* 98/9/21 jhod: We've also gotta set lowest_active_reg, don't we? */ | |
5507 r = 1; | |
5508 while (r < highest_active_reg && !IS_ACTIVE(reg_info[r])) | |
5509 r++; | |
5510 lowest_active_reg = r; | |
5511 } | |
5512 } | |
5513 | |
5514 /* If just failed to match something this time around with a | |
5515 group that's operated on by a repetition operator, try to | |
5516 force exit from the ``loop'', and restore the register | |
5517 information for this group that we had before trying this | |
5518 last match. */ | |
5519 if ((!MATCHED_SOMETHING (reg_info[*p]) | |
5520 || just_past_start_mem == p - 1) | |
5521 && (p + 2) < pend) | |
5522 { | |
460 | 5523 re_bool is_a_jump_n = false; |
428 | 5524 |
5525 p1 = p + 2; | |
5526 mcnt = 0; | |
5527 switch ((re_opcode_t) *p1++) | |
5528 { | |
5529 case jump_n: | |
5530 is_a_jump_n = true; | |
5531 case pop_failure_jump: | |
5532 case maybe_pop_jump: | |
5533 case jump: | |
5534 case dummy_failure_jump: | |
5535 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
5536 if (is_a_jump_n) | |
5537 p1 += 2; | |
5538 break; | |
5539 | |
5540 default: | |
5541 /* do nothing */ ; | |
5542 } | |
5543 p1 += mcnt; | |
5544 | |
5545 /* If the next operation is a jump backwards in the pattern | |
5546 to an on_failure_jump right before the start_memory | |
5547 corresponding to this stop_memory, exit from the loop | |
5548 by forcing a failure after pushing on the stack the | |
5549 on_failure_jump's jump in the pattern, and d. */ | |
5550 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump | |
5551 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p) | |
5552 { | |
5553 /* If this group ever matched anything, then restore | |
5554 what its registers were before trying this last | |
5555 failed match, e.g., with `(a*)*b' against `ab' for | |
5556 regstart[1], and, e.g., with `((a*)*(b*)*)*' | |
5557 against `aba' for regend[3]. | |
5558 | |
5559 Also restore the registers for inner groups for, | |
5560 e.g., `((a*)(b*))*' against `aba' (register 3 would | |
5561 otherwise get trashed). */ | |
5562 | |
5563 if (EVER_MATCHED_SOMETHING (reg_info[*p])) | |
5564 { | |
647 | 5565 int r; |
428 | 5566 |
5567 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0; | |
5568 | |
5569 /* Restore this and inner groups' (if any) registers. */ | |
5570 for (r = *p; r < *p + *(p + 1); r++) | |
5571 { | |
5572 regstart[r] = old_regstart[r]; | |
5573 | |
5574 /* xx why this test? */ | |
5575 if (old_regend[r] >= regstart[r]) | |
5576 regend[r] = old_regend[r]; | |
5577 } | |
5578 } | |
5579 p1++; | |
5580 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
5581 PUSH_FAILURE_POINT (p1 + mcnt, d, -2); | |
5582 | |
5583 goto fail; | |
5584 } | |
5585 } | |
5586 | |
5587 /* Move past the register number and the inner group count. */ | |
5588 p += 2; | |
5589 break; | |
5590 | |
5591 | |
5592 /* \<digit> has been turned into a `duplicate' command which is | |
502 | 5593 followed by the numeric value of <digit> as the register number. |
5594 (Already passed through external-to-internal-register mapping, | |
5595 so it refers to the actual group number, not the non-shy-only | |
5596 numbering used in the external world.) */ | |
428 | 5597 case duplicate: |
5598 { | |
446 | 5599 REGISTER re_char *d2, *dend2; |
502 | 5600 /* Get which register to match against. */ |
5601 int regno = *p++; | |
428 | 5602 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno); |
5603 | |
5604 /* Can't back reference a group which we've never matched. */ | |
5605 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno])) | |
5606 goto fail; | |
5607 | |
5608 /* Where in input to try to start matching. */ | |
5609 d2 = regstart[regno]; | |
5610 | |
5611 /* Where to stop matching; if both the place to start and | |
5612 the place to stop matching are in the same string, then | |
5613 set to the place to stop, otherwise, for now have to use | |
5614 the end of the first string. */ | |
5615 | |
5616 dend2 = ((FIRST_STRING_P (regstart[regno]) | |
5617 == FIRST_STRING_P (regend[regno])) | |
5618 ? regend[regno] : end_match_1); | |
5619 for (;;) | |
5620 { | |
5621 /* If necessary, advance to next segment in register | |
5622 contents. */ | |
5623 while (d2 == dend2) | |
5624 { | |
5625 if (dend2 == end_match_2) break; | |
5626 if (dend2 == regend[regno]) break; | |
5627 | |
5628 /* End of string1 => advance to string2. */ | |
5629 d2 = string2; | |
5630 dend2 = regend[regno]; | |
5631 } | |
5632 /* At end of register contents => success */ | |
5633 if (d2 == dend2) break; | |
5634 | |
5635 /* If necessary, advance to next segment in data. */ | |
450 | 5636 REGEX_PREFETCH (); |
428 | 5637 |
5638 /* How many characters left in this segment to match. */ | |
5639 mcnt = dend - d; | |
5640 | |
5641 /* Want how many consecutive characters we can match in | |
5642 one shot, so, if necessary, adjust the count. */ | |
5643 if (mcnt > dend2 - d2) | |
5644 mcnt = dend2 - d2; | |
5645 | |
5646 /* Compare that many; failure if mismatch, else move | |
5647 past them. */ | |
446 | 5648 if (TRANSLATE_P (translate) |
826 | 5649 ? bcmp_translate (d, d2, mcnt, translate |
5650 #ifdef emacs | |
5651 , fmt, lispobj | |
5652 #endif | |
5653 ) | |
428 | 5654 : memcmp (d, d2, mcnt)) |
5655 goto fail; | |
5656 d += mcnt, d2 += mcnt; | |
5657 | |
5658 /* Do this because we've matched some characters. */ | |
5659 SET_REGS_MATCHED (); | |
5660 } | |
5661 } | |
5662 break; | |
5663 | |
5664 | |
5665 /* begline matches the empty string at the beginning of the string | |
5666 (unless `not_bol' is set in `bufp'), and, if | |
5667 `newline_anchor' is set, after newlines. */ | |
5668 case begline: | |
5669 DEBUG_PRINT1 ("EXECUTING begline.\n"); | |
5670 | |
5671 if (AT_STRINGS_BEG (d)) | |
5672 { | |
5673 if (!bufp->not_bol) break; | |
5674 } | |
826 | 5675 else |
5676 { | |
5677 re_char *d2 = d; | |
867 | 5678 DEC_IBYTEPTR (d2); |
5679 if (itext_ichar_ascii_fmt (d2, fmt, lispobj) == '\n' && | |
826 | 5680 bufp->newline_anchor) |
5681 break; | |
5682 } | |
428 | 5683 /* In all other cases, we fail. */ |
5684 goto fail; | |
5685 | |
5686 | |
5687 /* endline is the dual of begline. */ | |
5688 case endline: | |
5689 DEBUG_PRINT1 ("EXECUTING endline.\n"); | |
5690 | |
5691 if (AT_STRINGS_END (d)) | |
5692 { | |
5693 if (!bufp->not_eol) break; | |
5694 } | |
5695 | |
5696 /* We have to ``prefetch'' the next character. */ | |
826 | 5697 else if ((d == end1 ? |
867 | 5698 itext_ichar_ascii_fmt (string2, fmt, lispobj) : |
5699 itext_ichar_ascii_fmt (d, fmt, lispobj)) == '\n' | |
428 | 5700 && bufp->newline_anchor) |
5701 { | |
5702 break; | |
5703 } | |
5704 goto fail; | |
5705 | |
5706 | |
5707 /* Match at the very beginning of the data. */ | |
5708 case begbuf: | |
5709 DEBUG_PRINT1 ("EXECUTING begbuf.\n"); | |
5710 if (AT_STRINGS_BEG (d)) | |
5711 break; | |
5712 goto fail; | |
5713 | |
5714 | |
5715 /* Match at the very end of the data. */ | |
5716 case endbuf: | |
5717 DEBUG_PRINT1 ("EXECUTING endbuf.\n"); | |
5718 if (AT_STRINGS_END (d)) | |
5719 break; | |
5720 goto fail; | |
5721 | |
5722 | |
5723 /* on_failure_keep_string_jump is used to optimize `.*\n'. It | |
5724 pushes NULL as the value for the string on the stack. Then | |
5725 `pop_failure_point' will keep the current value for the | |
5726 string, instead of restoring it. To see why, consider | |
5727 matching `foo\nbar' against `.*\n'. The .* matches the foo; | |
5728 then the . fails against the \n. But the next thing we want | |
5729 to do is match the \n against the \n; if we restored the | |
5730 string value, we would be back at the foo. | |
5731 | |
5732 Because this is used only in specific cases, we don't need to | |
5733 check all the things that `on_failure_jump' does, to make | |
5734 sure the right things get saved on the stack. Hence we don't | |
5735 share its code. The only reason to push anything on the | |
5736 stack at all is that otherwise we would have to change | |
5737 `anychar's code to do something besides goto fail in this | |
5738 case; that seems worse than this. */ | |
5739 case on_failure_keep_string_jump: | |
5740 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump"); | |
5741 | |
5742 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
5743 DEBUG_PRINT3 (" %d (to 0x%lx):\n", mcnt, (long) (p + mcnt)); | |
5744 | |
446 | 5745 PUSH_FAILURE_POINT (p + mcnt, (unsigned char *) 0, -2); |
428 | 5746 break; |
5747 | |
5748 | |
5749 /* Uses of on_failure_jump: | |
5750 | |
5751 Each alternative starts with an on_failure_jump that points | |
5752 to the beginning of the next alternative. Each alternative | |
5753 except the last ends with a jump that in effect jumps past | |
5754 the rest of the alternatives. (They really jump to the | |
5755 ending jump of the following alternative, because tensioning | |
5756 these jumps is a hassle.) | |
5757 | |
5758 Repeats start with an on_failure_jump that points past both | |
5759 the repetition text and either the following jump or | |
5760 pop_failure_jump back to this on_failure_jump. */ | |
5761 case on_failure_jump: | |
5762 on_failure: | |
5763 DEBUG_PRINT1 ("EXECUTING on_failure_jump"); | |
5764 | |
5765 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
5766 DEBUG_PRINT3 (" %d (to 0x%lx)", mcnt, (long) (p + mcnt)); | |
5767 | |
5768 /* If this on_failure_jump comes right before a group (i.e., | |
5769 the original * applied to a group), save the information | |
5770 for that group and all inner ones, so that if we fail back | |
5771 to this point, the group's information will be correct. | |
5772 For example, in \(a*\)*\1, we need the preceding group, | |
5773 and in \(\(a*\)b*\)\2, we need the inner group. */ | |
5774 | |
5775 /* We can't use `p' to check ahead because we push | |
5776 a failure point to `p + mcnt' after we do this. */ | |
5777 p1 = p; | |
5778 | |
5779 /* We need to skip no_op's before we look for the | |
5780 start_memory in case this on_failure_jump is happening as | |
5781 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1 | |
5782 against aba. */ | |
5783 while (p1 < pend && (re_opcode_t) *p1 == no_op) | |
5784 p1++; | |
5785 | |
5786 if (p1 < pend && (re_opcode_t) *p1 == start_memory) | |
5787 { | |
5788 /* We have a new highest active register now. This will | |
5789 get reset at the start_memory we are about to get to, | |
5790 but we will have saved all the registers relevant to | |
5791 this repetition op, as described above. */ | |
5792 highest_active_reg = *(p1 + 1) + *(p1 + 2); | |
5793 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) | |
5794 lowest_active_reg = *(p1 + 1); | |
5795 } | |
5796 | |
5797 DEBUG_PRINT1 (":\n"); | |
5798 PUSH_FAILURE_POINT (p + mcnt, d, -2); | |
5799 break; | |
5800 | |
5801 | |
5802 /* A smart repeat ends with `maybe_pop_jump'. | |
5803 We change it to either `pop_failure_jump' or `jump'. */ | |
5804 case maybe_pop_jump: | |
5805 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
5806 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); | |
5807 { | |
5808 REGISTER unsigned char *p2 = p; | |
5809 | |
5810 /* Compare the beginning of the repeat with what in the | |
5811 pattern follows its end. If we can establish that there | |
5812 is nothing that they would both match (unlike, e.g., | |
5813 `a*a', where a shared match forces backtracking), | |
5814 then we can change to pop_failure_jump, because we'll | |
5815 never have to backtrack. | |
5816 | |
5817 This is not true in the case of alternatives: in | |
5818 `(a|ab)*' we do need to backtrack to the `ab' alternative | |
5819 (e.g., if the string was `ab'). But instead of trying to | |
5820 detect that here, the alternative has put on a dummy | |
5821 failure point which is what we will end up popping. */ | |
5822 | |
5823 /* Skip over open/close-group commands. | |
5824 If what follows this loop is a ...+ construct, | |
5825 look at what begins its body, since we will have to | |
5826 match at least one of that. */ | |
5827 while (1) | |
5828 { | |
5829 if (p2 + 2 < pend | |
5830 && ((re_opcode_t) *p2 == stop_memory | |
5831 || (re_opcode_t) *p2 == start_memory)) | |
5832 p2 += 3; | |
5833 else if (p2 + 6 < pend | |
5834 && (re_opcode_t) *p2 == dummy_failure_jump) | |
5835 p2 += 6; | |
5836 else | |
5837 break; | |
5838 } | |
5839 | |
5840 p1 = p + mcnt; | |
5841 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding | |
5842 to the `maybe_pop_jump' of this case. Examine what | |
5843 follows. */ | |
5844 | |
5845 /* If we're at the end of the pattern, we can change. */ | |
5846 if (p2 == pend) | |
5847 { | |
5848 /* Consider what happens when matching ":\(.*\)" | |
5849 against ":/". I don't really understand this code | |
5850 yet. */ | |
5851 p[-3] = (unsigned char) pop_failure_jump; | |
5852 DEBUG_PRINT1 | |
5853 (" End of pattern: change to `pop_failure_jump'.\n"); | |
5854 } | |
5855 | |
5856 else if ((re_opcode_t) *p2 == exactn | |
5857 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) | |
5858 { | |
5859 REGISTER unsigned char c | |
5860 = *p2 == (unsigned char) endline ? '\n' : p2[2]; | |
5861 | |
5862 if ((re_opcode_t) p1[3] == exactn && p1[5] != c) | |
5863 { | |
5864 p[-3] = (unsigned char) pop_failure_jump; | |
5865 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", | |
5866 c, p1[5]); | |
5867 } | |
5868 | |
5869 else if ((re_opcode_t) p1[3] == charset | |
5870 || (re_opcode_t) p1[3] == charset_not) | |
5871 { | |
458 | 5872 int not_p = (re_opcode_t) p1[3] == charset_not; |
428 | 5873 |
5874 if (c < (unsigned char) (p1[4] * BYTEWIDTH) | |
5875 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) | |
458 | 5876 not_p = !not_p; |
5877 | |
5878 /* `not_p' is equal to 1 if c would match, which means | |
428 | 5879 that we can't change to pop_failure_jump. */ |
458 | 5880 if (!not_p) |
428 | 5881 { |
5882 p[-3] = (unsigned char) pop_failure_jump; | |
5883 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); | |
5884 } | |
5885 } | |
5886 } | |
5887 else if ((re_opcode_t) *p2 == charset) | |
5888 { | |
5889 #ifdef DEBUG | |
5890 REGISTER unsigned char c | |
5891 = *p2 == (unsigned char) endline ? '\n' : p2[2]; | |
5892 #endif | |
5893 | |
5894 if ((re_opcode_t) p1[3] == exactn | |
5895 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5] | |
5896 && (p2[2 + p1[5] / BYTEWIDTH] | |
5897 & (1 << (p1[5] % BYTEWIDTH))))) | |
5898 { | |
5899 p[-3] = (unsigned char) pop_failure_jump; | |
5900 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", | |
5901 c, p1[5]); | |
5902 } | |
5903 | |
5904 else if ((re_opcode_t) p1[3] == charset_not) | |
5905 { | |
5906 int idx; | |
5907 /* We win if the charset_not inside the loop | |
5908 lists every character listed in the charset after. */ | |
5909 for (idx = 0; idx < (int) p2[1]; idx++) | |
5910 if (! (p2[2 + idx] == 0 | |
5911 || (idx < (int) p1[4] | |
5912 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0)))) | |
5913 break; | |
5914 | |
5915 if (idx == p2[1]) | |
5916 { | |
5917 p[-3] = (unsigned char) pop_failure_jump; | |
5918 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); | |
5919 } | |
5920 } | |
5921 else if ((re_opcode_t) p1[3] == charset) | |
5922 { | |
5923 int idx; | |
5924 /* We win if the charset inside the loop | |
5925 has no overlap with the one after the loop. */ | |
5926 for (idx = 0; | |
5927 idx < (int) p2[1] && idx < (int) p1[4]; | |
5928 idx++) | |
5929 if ((p2[2 + idx] & p1[5 + idx]) != 0) | |
5930 break; | |
5931 | |
5932 if (idx == p2[1] || idx == p1[4]) | |
5933 { | |
5934 p[-3] = (unsigned char) pop_failure_jump; | |
5935 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); | |
5936 } | |
5937 } | |
5938 } | |
5939 } | |
5940 p -= 2; /* Point at relative address again. */ | |
5941 if ((re_opcode_t) p[-1] != pop_failure_jump) | |
5942 { | |
5943 p[-1] = (unsigned char) jump; | |
5944 DEBUG_PRINT1 (" Match => jump.\n"); | |
5945 goto unconditional_jump; | |
5946 } | |
5947 /* Note fall through. */ | |
5948 | |
5949 | |
5950 /* The end of a simple repeat has a pop_failure_jump back to | |
5951 its matching on_failure_jump, where the latter will push a | |
5952 failure point. The pop_failure_jump takes off failure | |
5953 points put on by this pop_failure_jump's matching | |
5954 on_failure_jump; we got through the pattern to here from the | |
5955 matching on_failure_jump, so didn't fail. */ | |
5956 case pop_failure_jump: | |
5957 { | |
5958 /* We need to pass separate storage for the lowest and | |
5959 highest registers, even though we don't care about the | |
5960 actual values. Otherwise, we will restore only one | |
5961 register from the stack, since lowest will == highest in | |
5962 `pop_failure_point'. */ | |
647 | 5963 int dummy_low_reg, dummy_high_reg; |
428 | 5964 unsigned char *pdummy; |
446 | 5965 re_char *sdummy = NULL; |
428 | 5966 |
5967 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n"); | |
5968 POP_FAILURE_POINT (sdummy, pdummy, | |
5969 dummy_low_reg, dummy_high_reg, | |
5970 reg_dummy, reg_dummy, reg_info_dummy); | |
5971 } | |
5972 /* Note fall through. */ | |
5973 | |
5974 | |
5975 /* Unconditionally jump (without popping any failure points). */ | |
5976 case jump: | |
5977 unconditional_jump: | |
5978 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */ | |
5979 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt); | |
5980 p += mcnt; /* Do the jump. */ | |
5981 DEBUG_PRINT2 ("(to 0x%lx).\n", (long) p); | |
5982 break; | |
5983 | |
5984 | |
5985 /* We need this opcode so we can detect where alternatives end | |
5986 in `group_match_null_string_p' et al. */ | |
5987 case jump_past_alt: | |
5988 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n"); | |
5989 goto unconditional_jump; | |
5990 | |
5991 | |
5992 /* Normally, the on_failure_jump pushes a failure point, which | |
5993 then gets popped at pop_failure_jump. We will end up at | |
5994 pop_failure_jump, also, and with a pattern of, say, `a+', we | |
5995 are skipping over the on_failure_jump, so we have to push | |
5996 something meaningless for pop_failure_jump to pop. */ | |
5997 case dummy_failure_jump: | |
5998 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n"); | |
5999 /* It doesn't matter what we push for the string here. What | |
6000 the code at `fail' tests is the value for the pattern. */ | |
446 | 6001 PUSH_FAILURE_POINT ((unsigned char *) 0, (unsigned char *) 0, -2); |
428 | 6002 goto unconditional_jump; |
6003 | |
6004 | |
6005 /* At the end of an alternative, we need to push a dummy failure | |
6006 point in case we are followed by a `pop_failure_jump', because | |
6007 we don't want the failure point for the alternative to be | |
6008 popped. For example, matching `(a|ab)*' against `aab' | |
6009 requires that we match the `ab' alternative. */ | |
6010 case push_dummy_failure: | |
6011 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n"); | |
6012 /* See comments just above at `dummy_failure_jump' about the | |
6013 two zeroes. */ | |
446 | 6014 PUSH_FAILURE_POINT ((unsigned char *) 0, (unsigned char *) 0, -2); |
428 | 6015 break; |
6016 | |
6017 /* Have to succeed matching what follows at least n times. | |
6018 After that, handle like `on_failure_jump'. */ | |
6019 case succeed_n: | |
6020 EXTRACT_NUMBER (mcnt, p + 2); | |
6021 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt); | |
6022 | |
6023 assert (mcnt >= 0); | |
6024 /* Originally, this is how many times we HAVE to succeed. */ | |
6025 if (mcnt > 0) | |
6026 { | |
6027 mcnt--; | |
6028 p += 2; | |
6029 STORE_NUMBER_AND_INCR (p, mcnt); | |
6030 DEBUG_PRINT3 (" Setting 0x%lx to %d.\n", (long) p, mcnt); | |
6031 } | |
6032 else if (mcnt == 0) | |
6033 { | |
6034 DEBUG_PRINT2 (" Setting two bytes from 0x%lx to no_op.\n", | |
6035 (long) (p+2)); | |
6036 p[2] = (unsigned char) no_op; | |
6037 p[3] = (unsigned char) no_op; | |
6038 goto on_failure; | |
6039 } | |
6040 break; | |
6041 | |
6042 case jump_n: | |
6043 EXTRACT_NUMBER (mcnt, p + 2); | |
6044 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt); | |
6045 | |
6046 /* Originally, this is how many times we CAN jump. */ | |
6047 if (mcnt) | |
6048 { | |
6049 mcnt--; | |
6050 STORE_NUMBER (p + 2, mcnt); | |
6051 goto unconditional_jump; | |
6052 } | |
6053 /* If don't have to jump any more, skip over the rest of command. */ | |
6054 else | |
6055 p += 4; | |
6056 break; | |
6057 | |
6058 case set_number_at: | |
6059 { | |
6060 DEBUG_PRINT1 ("EXECUTING set_number_at.\n"); | |
6061 | |
6062 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
6063 p1 = p + mcnt; | |
6064 EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
6065 DEBUG_PRINT3 (" Setting 0x%lx to %d.\n", (long) p1, mcnt); | |
6066 STORE_NUMBER (p1, mcnt); | |
6067 break; | |
6068 } | |
6069 | |
6070 case wordbound: | |
6071 DEBUG_PRINT1 ("EXECUTING wordbound.\n"); | |
6072 should_succeed = 1; | |
6073 matchwordbound: | |
6074 { | |
6075 /* XEmacs change */ | |
1377 | 6076 /* Straightforward and (I hope) correct implementation. |
6077 Probably should be optimized by arranging to compute | |
1497 | 6078 charpos only once. */ |
1377 | 6079 /* emch1 is the character before d, syn1 is the syntax of |
6080 emch1, emch2 is the character at d, and syn2 is the | |
6081 syntax of emch2. */ | |
6082 Ichar emch1, emch2; | |
1468 | 6083 int syn1 = 0, |
6084 syn2 = 0; | |
1377 | 6085 re_char *d_before, *d_after; |
6086 int result, | |
6087 at_beg = AT_STRINGS_BEG (d), | |
6088 at_end = AT_STRINGS_END (d); | |
6089 #ifdef emacs | |
1497 | 6090 Charxpos charpos; |
1377 | 6091 #endif |
6092 | |
6093 if (at_beg && at_end) | |
6094 { | |
6095 result = 0; | |
6096 } | |
428 | 6097 else |
6098 { | |
1377 | 6099 if (!at_beg) |
6100 { | |
6101 d_before = POS_BEFORE_GAP_UNSAFE (d); | |
6102 DEC_IBYTEPTR_FMT (d_before, fmt); | |
6103 emch1 = itext_ichar_fmt (d_before, fmt, lispobj); | |
460 | 6104 #ifdef emacs |
1497 | 6105 charpos = offset_to_charxpos (lispobj, |
6106 PTR_TO_OFFSET (d)) - 1; | |
1377 | 6107 BEGIN_REGEX_MALLOC_OK (); |
1497 | 6108 UPDATE_SYNTAX_CACHE (scache, charpos); |
460 | 6109 #endif |
1377 | 6110 syn1 = SYNTAX_FROM_CACHE (scache, emch1); |
6111 END_REGEX_MALLOC_OK (); | |
6112 } | |
6113 if (!at_end) | |
6114 { | |
6115 d_after = POS_AFTER_GAP_UNSAFE (d); | |
6116 emch2 = itext_ichar_fmt (d_after, fmt, lispobj); | |
460 | 6117 #ifdef emacs |
1497 | 6118 charpos = offset_to_charxpos (lispobj, PTR_TO_OFFSET (d)); |
1377 | 6119 BEGIN_REGEX_MALLOC_OK (); |
1497 | 6120 UPDATE_SYNTAX_CACHE_FORWARD (scache, charpos); |
460 | 6121 #endif |
1377 | 6122 syn2 = SYNTAX_FROM_CACHE (scache, emch2); |
6123 END_REGEX_MALLOC_OK (); | |
6124 } | |
1333 | 6125 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); |
1377 | 6126 |
6127 if (at_beg) | |
6128 result = (syn2 == Sword); | |
6129 else if (at_end) | |
6130 result = (syn1 == Sword); | |
6131 else | |
6132 result = ((syn1 == Sword) != (syn2 == Sword)); | |
428 | 6133 } |
1377 | 6134 |
428 | 6135 if (result == should_succeed) |
6136 break; | |
6137 goto fail; | |
6138 } | |
6139 | |
6140 case notwordbound: | |
6141 DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); | |
6142 should_succeed = 0; | |
6143 goto matchwordbound; | |
6144 | |
6145 case wordbeg: | |
6146 DEBUG_PRINT1 ("EXECUTING wordbeg.\n"); | |
460 | 6147 if (AT_STRINGS_END (d)) |
6148 goto fail; | |
428 | 6149 { |
6150 /* XEmacs: this originally read: | |
6151 | |
6152 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1))) | |
6153 break; | |
6154 | |
6155 */ | |
460 | 6156 re_char *dtmp = POS_AFTER_GAP_UNSAFE (d); |
867 | 6157 Ichar emch = itext_ichar_fmt (dtmp, fmt, lispobj); |
1333 | 6158 int tempres; |
1347 | 6159 #ifdef emacs |
6160 Charxpos charpos = offset_to_charxpos (lispobj, PTR_TO_OFFSET (d)); | |
6161 #endif | |
1333 | 6162 BEGIN_REGEX_MALLOC_OK (); |
460 | 6163 #ifdef emacs |
826 | 6164 UPDATE_SYNTAX_CACHE (scache, charpos); |
460 | 6165 #endif |
1333 | 6166 tempres = (SYNTAX_FROM_CACHE (scache, emch) != Sword); |
6167 END_REGEX_MALLOC_OK (); | |
6168 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
6169 if (tempres) | |
428 | 6170 goto fail; |
6171 if (AT_STRINGS_BEG (d)) | |
6172 break; | |
460 | 6173 dtmp = POS_BEFORE_GAP_UNSAFE (d); |
867 | 6174 DEC_IBYTEPTR_FMT (dtmp, fmt); |
6175 emch = itext_ichar_fmt (dtmp, fmt, lispobj); | |
1333 | 6176 BEGIN_REGEX_MALLOC_OK (); |
460 | 6177 #ifdef emacs |
826 | 6178 UPDATE_SYNTAX_CACHE_BACKWARD (scache, charpos - 1); |
460 | 6179 #endif |
1333 | 6180 tempres = (SYNTAX_FROM_CACHE (scache, emch) != Sword); |
6181 END_REGEX_MALLOC_OK (); | |
6182 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
6183 if (tempres) | |
428 | 6184 break; |
6185 goto fail; | |
6186 } | |
6187 | |
6188 case wordend: | |
6189 DEBUG_PRINT1 ("EXECUTING wordend.\n"); | |
460 | 6190 if (AT_STRINGS_BEG (d)) |
6191 goto fail; | |
428 | 6192 { |
6193 /* XEmacs: this originally read: | |
6194 | |
6195 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1) | |
6196 && (!WORDCHAR_P (d) || AT_STRINGS_END (d))) | |
6197 break; | |
6198 | |
6199 The or condition is incorrect (reversed). | |
6200 */ | |
460 | 6201 re_char *dtmp; |
867 | 6202 Ichar emch; |
1333 | 6203 int tempres; |
460 | 6204 #ifdef emacs |
826 | 6205 Charxpos charpos = offset_to_charxpos (lispobj, PTR_TO_OFFSET (d)); |
1347 | 6206 BEGIN_REGEX_MALLOC_OK (); |
826 | 6207 UPDATE_SYNTAX_CACHE (scache, charpos); |
1333 | 6208 END_REGEX_MALLOC_OK (); |
6209 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
1347 | 6210 #endif |
460 | 6211 dtmp = POS_BEFORE_GAP_UNSAFE (d); |
867 | 6212 DEC_IBYTEPTR_FMT (dtmp, fmt); |
6213 emch = itext_ichar_fmt (dtmp, fmt, lispobj); | |
1333 | 6214 BEGIN_REGEX_MALLOC_OK (); |
6215 tempres = (SYNTAX_FROM_CACHE (scache, emch) != Sword); | |
6216 END_REGEX_MALLOC_OK (); | |
6217 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
6218 if (tempres) | |
428 | 6219 goto fail; |
6220 if (AT_STRINGS_END (d)) | |
6221 break; | |
460 | 6222 dtmp = POS_AFTER_GAP_UNSAFE (d); |
867 | 6223 emch = itext_ichar_fmt (dtmp, fmt, lispobj); |
1333 | 6224 BEGIN_REGEX_MALLOC_OK (); |
460 | 6225 #ifdef emacs |
826 | 6226 UPDATE_SYNTAX_CACHE_FORWARD (scache, charpos + 1); |
460 | 6227 #endif |
1333 | 6228 tempres = (SYNTAX_FROM_CACHE (scache, emch) != Sword); |
6229 END_REGEX_MALLOC_OK (); | |
6230 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
6231 if (tempres) | |
428 | 6232 break; |
6233 goto fail; | |
6234 } | |
6235 | |
6236 #ifdef emacs | |
6237 case before_dot: | |
6238 DEBUG_PRINT1 ("EXECUTING before_dot.\n"); | |
826 | 6239 if (!BUFFERP (lispobj) |
6240 || (BUF_PTR_BYTE_POS (XBUFFER (lispobj), (unsigned char *) d) | |
6241 >= BUF_PT (XBUFFER (lispobj)))) | |
428 | 6242 goto fail; |
6243 break; | |
6244 | |
6245 case at_dot: | |
6246 DEBUG_PRINT1 ("EXECUTING at_dot.\n"); | |
826 | 6247 if (!BUFFERP (lispobj) |
6248 || (BUF_PTR_BYTE_POS (XBUFFER (lispobj), (unsigned char *) d) | |
6249 != BUF_PT (XBUFFER (lispobj)))) | |
428 | 6250 goto fail; |
6251 break; | |
6252 | |
6253 case after_dot: | |
6254 DEBUG_PRINT1 ("EXECUTING after_dot.\n"); | |
826 | 6255 if (!BUFFERP (lispobj) |
6256 || (BUF_PTR_BYTE_POS (XBUFFER (lispobj), (unsigned char *) d) | |
6257 <= BUF_PT (XBUFFER (lispobj)))) | |
428 | 6258 goto fail; |
6259 break; | |
6260 | |
6261 case syntaxspec: | |
6262 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt); | |
6263 mcnt = *p++; | |
6264 goto matchsyntax; | |
6265 | |
6266 case wordchar: | |
6267 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n"); | |
6268 mcnt = (int) Sword; | |
6269 matchsyntax: | |
6270 should_succeed = 1; | |
6271 matchornotsyntax: | |
6272 { | |
6273 int matches; | |
867 | 6274 Ichar emch; |
428 | 6275 |
450 | 6276 REGEX_PREFETCH (); |
1333 | 6277 BEGIN_REGEX_MALLOC_OK (); |
826 | 6278 UPDATE_SYNTAX_CACHE |
6279 (scache, offset_to_charxpos (lispobj, PTR_TO_OFFSET (d))); | |
1333 | 6280 END_REGEX_MALLOC_OK (); |
6281 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
826 | 6282 |
867 | 6283 emch = itext_ichar_fmt (d, fmt, lispobj); |
1333 | 6284 BEGIN_REGEX_MALLOC_OK (); |
826 | 6285 matches = (SYNTAX_FROM_CACHE (scache, emch) == |
6286 (enum syntaxcode) mcnt); | |
1333 | 6287 END_REGEX_MALLOC_OK (); |
6288 RE_MATCH_RELOCATE_MOVEABLE_DATA_POINTERS (); | |
867 | 6289 INC_IBYTEPTR_FMT (d, fmt); |
428 | 6290 if (matches != should_succeed) |
6291 goto fail; | |
6292 SET_REGS_MATCHED (); | |
6293 } | |
6294 break; | |
6295 | |
6296 case notsyntaxspec: | |
6297 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt); | |
6298 mcnt = *p++; | |
6299 goto matchnotsyntax; | |
6300 | |
6301 case notwordchar: | |
6302 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n"); | |
6303 mcnt = (int) Sword; | |
6304 matchnotsyntax: | |
6305 should_succeed = 0; | |
6306 goto matchornotsyntax; | |
6307 | |
6308 #ifdef MULE | |
6309 /* 97/2/17 jhod Mule category code patch */ | |
6310 case categoryspec: | |
6311 should_succeed = 1; | |
6312 matchornotcategory: | |
6313 { | |
867 | 6314 Ichar emch; |
428 | 6315 |
6316 mcnt = *p++; | |
450 | 6317 REGEX_PREFETCH (); |
867 | 6318 emch = itext_ichar_fmt (d, fmt, lispobj); |
6319 INC_IBYTEPTR_FMT (d, fmt); | |
826 | 6320 if (check_category_char (emch, BUFFER_CATEGORY_TABLE (lispbuf), |
6321 mcnt, should_succeed)) | |
428 | 6322 goto fail; |
6323 SET_REGS_MATCHED (); | |
6324 } | |
6325 break; | |
6326 | |
6327 case notcategoryspec: | |
6328 should_succeed = 0; | |
6329 goto matchornotcategory; | |
6330 /* end of category patch */ | |
6331 #endif /* MULE */ | |
6332 #else /* not emacs */ | |
6333 case wordchar: | |
6334 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n"); | |
450 | 6335 REGEX_PREFETCH (); |
826 | 6336 if (!WORDCHAR_P ((int) (*d))) |
428 | 6337 goto fail; |
6338 SET_REGS_MATCHED (); | |
6339 d++; | |
6340 break; | |
6341 | |
6342 case notwordchar: | |
6343 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n"); | |
450 | 6344 REGEX_PREFETCH (); |
826 | 6345 if (!WORDCHAR_P ((int) (*d))) |
428 | 6346 goto fail; |
6347 SET_REGS_MATCHED (); | |
6348 d++; | |
6349 break; | |
446 | 6350 #endif /* emacs */ |
428 | 6351 |
6352 default: | |
2500 | 6353 ABORT (); |
428 | 6354 } |
6355 continue; /* Successfully executed one pattern command; keep going. */ | |
6356 | |
6357 | |
6358 /* We goto here if a matching operation fails. */ | |
6359 fail: | |
6360 if (!FAIL_STACK_EMPTY ()) | |
6361 { /* A restart point is known. Restore to that state. */ | |
6362 DEBUG_PRINT1 ("\nFAIL:\n"); | |
6363 POP_FAILURE_POINT (d, p, | |
6364 lowest_active_reg, highest_active_reg, | |
6365 regstart, regend, reg_info); | |
6366 | |
6367 /* If this failure point is a dummy, try the next one. */ | |
6368 if (!p) | |
6369 goto fail; | |
6370 | |
6371 /* If we failed to the end of the pattern, don't examine *p. */ | |
6372 assert (p <= pend); | |
6373 if (p < pend) | |
6374 { | |
460 | 6375 re_bool is_a_jump_n = false; |
428 | 6376 |
6377 /* If failed to a backwards jump that's part of a repetition | |
6378 loop, need to pop this failure point and use the next one. */ | |
6379 switch ((re_opcode_t) *p) | |
6380 { | |
6381 case jump_n: | |
6382 is_a_jump_n = true; | |
6383 case maybe_pop_jump: | |
6384 case pop_failure_jump: | |
6385 case jump: | |
6386 p1 = p + 1; | |
6387 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
6388 p1 += mcnt; | |
6389 | |
6390 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n) | |
6391 || (!is_a_jump_n | |
6392 && (re_opcode_t) *p1 == on_failure_jump)) | |
6393 goto fail; | |
6394 break; | |
6395 default: | |
6396 /* do nothing */ ; | |
6397 } | |
6398 } | |
6399 | |
6400 if (d >= string1 && d <= end1) | |
6401 dend = end_match_1; | |
6402 } | |
6403 else | |
6404 break; /* Matching at this starting point really fails. */ | |
6405 } /* for (;;) */ | |
6406 | |
6407 if (best_regs_set) | |
6408 goto restore_best_regs; | |
6409 | |
6410 FREE_VARIABLES (); | |
6411 | |
6412 return -1; /* Failure to match. */ | |
1333 | 6413 } /* re_match_2_internal */ |
428 | 6414 |
6415 /* Subroutine definitions for re_match_2. */ | |
6416 | |
6417 | |
6418 /* We are passed P pointing to a register number after a start_memory. | |
6419 | |
6420 Return true if the pattern up to the corresponding stop_memory can | |
6421 match the empty string, and false otherwise. | |
6422 | |
6423 If we find the matching stop_memory, sets P to point to one past its number. | |
6424 Otherwise, sets P to an undefined byte less than or equal to END. | |
6425 | |
6426 We don't handle duplicates properly (yet). */ | |
6427 | |
460 | 6428 static re_bool |
428 | 6429 group_match_null_string_p (unsigned char **p, unsigned char *end, |
6430 register_info_type *reg_info) | |
6431 { | |
6432 int mcnt; | |
6433 /* Point to after the args to the start_memory. */ | |
6434 unsigned char *p1 = *p + 2; | |
6435 | |
6436 while (p1 < end) | |
6437 { | |
6438 /* Skip over opcodes that can match nothing, and return true or | |
6439 false, as appropriate, when we get to one that can't, or to the | |
6440 matching stop_memory. */ | |
6441 | |
6442 switch ((re_opcode_t) *p1) | |
6443 { | |
6444 /* Could be either a loop or a series of alternatives. */ | |
6445 case on_failure_jump: | |
6446 p1++; | |
6447 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
6448 | |
6449 /* If the next operation is not a jump backwards in the | |
6450 pattern. */ | |
6451 | |
6452 if (mcnt >= 0) | |
6453 { | |
6454 /* Go through the on_failure_jumps of the alternatives, | |
6455 seeing if any of the alternatives cannot match nothing. | |
6456 The last alternative starts with only a jump, | |
6457 whereas the rest start with on_failure_jump and end | |
6458 with a jump, e.g., here is the pattern for `a|b|c': | |
6459 | |
6460 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6 | |
6461 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3 | |
6462 /exactn/1/c | |
6463 | |
6464 So, we have to first go through the first (n-1) | |
6465 alternatives and then deal with the last one separately. */ | |
6466 | |
6467 | |
6468 /* Deal with the first (n-1) alternatives, which start | |
6469 with an on_failure_jump (see above) that jumps to right | |
6470 past a jump_past_alt. */ | |
6471 | |
6472 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt) | |
6473 { | |
6474 /* `mcnt' holds how many bytes long the alternative | |
6475 is, including the ending `jump_past_alt' and | |
6476 its number. */ | |
6477 | |
6478 if (!alt_match_null_string_p (p1, p1 + mcnt - 3, | |
6479 reg_info)) | |
6480 return false; | |
6481 | |
6482 /* Move to right after this alternative, including the | |
6483 jump_past_alt. */ | |
6484 p1 += mcnt; | |
6485 | |
6486 /* Break if it's the beginning of an n-th alternative | |
6487 that doesn't begin with an on_failure_jump. */ | |
6488 if ((re_opcode_t) *p1 != on_failure_jump) | |
6489 break; | |
6490 | |
6491 /* Still have to check that it's not an n-th | |
6492 alternative that starts with an on_failure_jump. */ | |
6493 p1++; | |
6494 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
6495 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt) | |
6496 { | |
6497 /* Get to the beginning of the n-th alternative. */ | |
6498 p1 -= 3; | |
6499 break; | |
6500 } | |
6501 } | |
6502 | |
6503 /* Deal with the last alternative: go back and get number | |
6504 of the `jump_past_alt' just before it. `mcnt' contains | |
6505 the length of the alternative. */ | |
6506 EXTRACT_NUMBER (mcnt, p1 - 2); | |
6507 | |
6508 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info)) | |
6509 return false; | |
6510 | |
6511 p1 += mcnt; /* Get past the n-th alternative. */ | |
6512 } /* if mcnt > 0 */ | |
6513 break; | |
6514 | |
6515 | |
6516 case stop_memory: | |
6517 assert (p1[1] == **p); | |
6518 *p = p1 + 2; | |
6519 return true; | |
6520 | |
6521 | |
6522 default: | |
6523 if (!common_op_match_null_string_p (&p1, end, reg_info)) | |
6524 return false; | |
6525 } | |
6526 } /* while p1 < end */ | |
6527 | |
6528 return false; | |
6529 } /* group_match_null_string_p */ | |
6530 | |
6531 | |
6532 /* Similar to group_match_null_string_p, but doesn't deal with alternatives: | |
6533 It expects P to be the first byte of a single alternative and END one | |
6534 byte past the last. The alternative can contain groups. */ | |
6535 | |
460 | 6536 static re_bool |
428 | 6537 alt_match_null_string_p (unsigned char *p, unsigned char *end, |
6538 register_info_type *reg_info) | |
6539 { | |
6540 int mcnt; | |
6541 unsigned char *p1 = p; | |
6542 | |
6543 while (p1 < end) | |
6544 { | |
6545 /* Skip over opcodes that can match nothing, and break when we get | |
6546 to one that can't. */ | |
6547 | |
6548 switch ((re_opcode_t) *p1) | |
6549 { | |
6550 /* It's a loop. */ | |
6551 case on_failure_jump: | |
6552 p1++; | |
6553 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
6554 p1 += mcnt; | |
6555 break; | |
6556 | |
6557 default: | |
6558 if (!common_op_match_null_string_p (&p1, end, reg_info)) | |
6559 return false; | |
6560 } | |
6561 } /* while p1 < end */ | |
6562 | |
6563 return true; | |
6564 } /* alt_match_null_string_p */ | |
6565 | |
6566 | |
6567 /* Deals with the ops common to group_match_null_string_p and | |
6568 alt_match_null_string_p. | |
6569 | |
6570 Sets P to one after the op and its arguments, if any. */ | |
6571 | |
460 | 6572 static re_bool |
428 | 6573 common_op_match_null_string_p (unsigned char **p, unsigned char *end, |
6574 register_info_type *reg_info) | |
6575 { | |
6576 int mcnt; | |
460 | 6577 re_bool ret; |
428 | 6578 int reg_no; |
6579 unsigned char *p1 = *p; | |
6580 | |
6581 switch ((re_opcode_t) *p1++) | |
6582 { | |
6583 case no_op: | |
6584 case begline: | |
6585 case endline: | |
6586 case begbuf: | |
6587 case endbuf: | |
6588 case wordbeg: | |
6589 case wordend: | |
6590 case wordbound: | |
6591 case notwordbound: | |
6592 #ifdef emacs | |
6593 case before_dot: | |
6594 case at_dot: | |
6595 case after_dot: | |
6596 #endif | |
6597 break; | |
6598 | |
6599 case start_memory: | |
6600 reg_no = *p1; | |
6601 assert (reg_no > 0 && reg_no <= MAX_REGNUM); | |
6602 ret = group_match_null_string_p (&p1, end, reg_info); | |
6603 | |
6604 /* Have to set this here in case we're checking a group which | |
6605 contains a group and a back reference to it. */ | |
6606 | |
6607 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE) | |
6608 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret; | |
6609 | |
6610 if (!ret) | |
6611 return false; | |
6612 break; | |
6613 | |
6614 /* If this is an optimized succeed_n for zero times, make the jump. */ | |
6615 case jump: | |
6616 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
6617 if (mcnt >= 0) | |
6618 p1 += mcnt; | |
6619 else | |
6620 return false; | |
6621 break; | |
6622 | |
6623 case succeed_n: | |
6624 /* Get to the number of times to succeed. */ | |
6625 p1 += 2; | |
6626 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
6627 | |
6628 if (mcnt == 0) | |
6629 { | |
6630 p1 -= 4; | |
6631 EXTRACT_NUMBER_AND_INCR (mcnt, p1); | |
6632 p1 += mcnt; | |
6633 } | |
6634 else | |
6635 return false; | |
6636 break; | |
6637 | |
6638 case duplicate: | |
6639 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1])) | |
6640 return false; | |
6641 break; | |
6642 | |
6643 case set_number_at: | |
6644 p1 += 4; | |
6645 | |
6646 default: | |
6647 /* All other opcodes mean we cannot match the empty string. */ | |
6648 return false; | |
6649 } | |
6650 | |
6651 *p = p1; | |
6652 return true; | |
6653 } /* common_op_match_null_string_p */ | |
6654 | |
6655 | |
6656 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN | |
6657 bytes; nonzero otherwise. */ | |
6658 | |
6659 static int | |
446 | 6660 bcmp_translate (re_char *s1, re_char *s2, |
826 | 6661 REGISTER int len, RE_TRANSLATE_TYPE translate |
6662 #ifdef emacs | |
2333 | 6663 , Internal_Format USED_IF_MULE (fmt), |
6664 Lisp_Object USED_IF_MULE (lispobj) | |
826 | 6665 #endif |
6666 ) | |
428 | 6667 { |
826 | 6668 REGISTER re_char *p1 = s1, *p2 = s2; |
446 | 6669 #ifdef MULE |
826 | 6670 re_char *p1_end = s1 + len; |
6671 re_char *p2_end = s2 + len; | |
446 | 6672 |
6673 while (p1 != p1_end && p2 != p2_end) | |
6674 { | |
867 | 6675 Ichar p1_ch, p2_ch; |
6676 | |
6677 p1_ch = itext_ichar_fmt (p1, fmt, lispobj); | |
6678 p2_ch = itext_ichar_fmt (p2, fmt, lispobj); | |
826 | 6679 |
6680 if (RE_TRANSLATE_1 (p1_ch) | |
6681 != RE_TRANSLATE_1 (p2_ch)) | |
446 | 6682 return 1; |
867 | 6683 INC_IBYTEPTR_FMT (p1, fmt); |
6684 INC_IBYTEPTR_FMT (p2, fmt); | |
446 | 6685 } |
6686 #else /* not MULE */ | |
428 | 6687 while (len) |
6688 { | |
826 | 6689 if (RE_TRANSLATE_1 (*p1++) != RE_TRANSLATE_1 (*p2++)) return 1; |
428 | 6690 len--; |
6691 } | |
446 | 6692 #endif /* MULE */ |
428 | 6693 return 0; |
6694 } | |
6695 | |
6696 /* Entry points for GNU code. */ | |
6697 | |
6698 /* re_compile_pattern is the GNU regular expression compiler: it | |
6699 compiles PATTERN (of length SIZE) and puts the result in BUFP. | |
6700 Returns 0 if the pattern was valid, otherwise an error string. | |
6701 | |
6702 Assumes the `allocated' (and perhaps `buffer') and `translate' fields | |
6703 are set in BUFP on entry. | |
6704 | |
6705 We call regex_compile to do the actual compilation. */ | |
6706 | |
442 | 6707 const char * |
6708 re_compile_pattern (const char *pattern, int length, | |
428 | 6709 struct re_pattern_buffer *bufp) |
6710 { | |
6711 reg_errcode_t ret; | |
6712 | |
6713 /* GNU code is written to assume at least RE_NREGS registers will be set | |
6714 (and at least one extra will be -1). */ | |
6715 bufp->regs_allocated = REGS_UNALLOCATED; | |
6716 | |
6717 /* And GNU code determines whether or not to get register information | |
6718 by passing null for the REGS argument to re_match, etc., not by | |
6719 setting no_sub. */ | |
6720 bufp->no_sub = 0; | |
6721 | |
6722 /* Match anchors at newline. */ | |
6723 bufp->newline_anchor = 1; | |
6724 | |
826 | 6725 ret = regex_compile ((unsigned char *) pattern, length, re_syntax_options, |
6726 bufp); | |
428 | 6727 |
6728 if (!ret) | |
6729 return NULL; | |
6730 return gettext (re_error_msgid[(int) ret]); | |
6731 } | |
6732 | |
6733 /* Entry points compatible with 4.2 BSD regex library. We don't define | |
6734 them unless specifically requested. */ | |
6735 | |
6736 #ifdef _REGEX_RE_COMP | |
6737 | |
6738 /* BSD has one and only one pattern buffer. */ | |
6739 static struct re_pattern_buffer re_comp_buf; | |
6740 | |
6741 char * | |
442 | 6742 re_comp (const char *s) |
428 | 6743 { |
6744 reg_errcode_t ret; | |
6745 | |
6746 if (!s) | |
6747 { | |
6748 if (!re_comp_buf.buffer) | |
6749 return gettext ("No previous regular expression"); | |
6750 return 0; | |
6751 } | |
6752 | |
6753 if (!re_comp_buf.buffer) | |
6754 { | |
1333 | 6755 re_comp_buf.buffer = (unsigned char *) xmalloc (200); |
428 | 6756 if (re_comp_buf.buffer == NULL) |
6757 return gettext (re_error_msgid[(int) REG_ESPACE]); | |
6758 re_comp_buf.allocated = 200; | |
6759 | |
1333 | 6760 re_comp_buf.fastmap = (char *) xmalloc (1 << BYTEWIDTH); |
428 | 6761 if (re_comp_buf.fastmap == NULL) |
6762 return gettext (re_error_msgid[(int) REG_ESPACE]); | |
6763 } | |
6764 | |
6765 /* Since `re_exec' always passes NULL for the `regs' argument, we | |
6766 don't need to initialize the pattern buffer fields which affect it. */ | |
6767 | |
6768 /* Match anchors at newlines. */ | |
6769 re_comp_buf.newline_anchor = 1; | |
6770 | |
826 | 6771 ret = regex_compile ((unsigned char *)s, strlen (s), re_syntax_options, |
6772 &re_comp_buf); | |
428 | 6773 |
6774 if (!ret) | |
6775 return NULL; | |
6776 | |
442 | 6777 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ |
428 | 6778 return (char *) gettext (re_error_msgid[(int) ret]); |
6779 } | |
6780 | |
6781 | |
6782 int | |
442 | 6783 re_exec (const char *s) |
428 | 6784 { |
442 | 6785 const int len = strlen (s); |
428 | 6786 return |
6787 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0); | |
6788 } | |
6789 #endif /* _REGEX_RE_COMP */ | |
6790 | |
6791 /* POSIX.2 functions. Don't define these for Emacs. */ | |
6792 | |
6793 #ifndef emacs | |
6794 | |
6795 /* regcomp takes a regular expression as a string and compiles it. | |
6796 | |
6797 PREG is a regex_t *. We do not expect any fields to be initialized, | |
6798 since POSIX says we shouldn't. Thus, we set | |
6799 | |
6800 `buffer' to the compiled pattern; | |
6801 `used' to the length of the compiled pattern; | |
6802 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the | |
6803 REG_EXTENDED bit in CFLAGS is set; otherwise, to | |
6804 RE_SYNTAX_POSIX_BASIC; | |
6805 `newline_anchor' to REG_NEWLINE being set in CFLAGS; | |
6806 `fastmap' and `fastmap_accurate' to zero; | |
6807 `re_nsub' to the number of subexpressions in PATTERN. | |
502 | 6808 (non-shy of course. POSIX probably doesn't know about |
6809 shy ones, and in any case they should be invisible.) | |
428 | 6810 |
6811 PATTERN is the address of the pattern string. | |
6812 | |
6813 CFLAGS is a series of bits which affect compilation. | |
6814 | |
6815 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we | |
6816 use POSIX basic syntax. | |
6817 | |
6818 If REG_NEWLINE is set, then . and [^...] don't match newline. | |
6819 Also, regexec will try a match beginning after every newline. | |
6820 | |
6821 If REG_ICASE is set, then we considers upper- and lowercase | |
6822 versions of letters to be equivalent when matching. | |
6823 | |
6824 If REG_NOSUB is set, then when PREG is passed to regexec, that | |
6825 routine will report only success or failure, and nothing about the | |
6826 registers. | |
6827 | |
6828 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for | |
6829 the return codes and their meanings.) */ | |
6830 | |
6831 int | |
442 | 6832 regcomp (regex_t *preg, const char *pattern, int cflags) |
428 | 6833 { |
6834 reg_errcode_t ret; | |
647 | 6835 unsigned int syntax |
428 | 6836 = (cflags & REG_EXTENDED) ? |
6837 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC; | |
6838 | |
6839 /* regex_compile will allocate the space for the compiled pattern. */ | |
6840 preg->buffer = 0; | |
6841 preg->allocated = 0; | |
6842 preg->used = 0; | |
6843 | |
6844 /* Don't bother to use a fastmap when searching. This simplifies the | |
6845 REG_NEWLINE case: if we used a fastmap, we'd have to put all the | |
6846 characters after newlines into the fastmap. This way, we just try | |
6847 every character. */ | |
6848 preg->fastmap = 0; | |
6849 | |
6850 if (cflags & REG_ICASE) | |
6851 { | |
647 | 6852 int i; |
428 | 6853 |
1333 | 6854 preg->translate = (char *) xmalloc (CHAR_SET_SIZE); |
428 | 6855 if (preg->translate == NULL) |
6856 return (int) REG_ESPACE; | |
6857 | |
6858 /* Map uppercase characters to corresponding lowercase ones. */ | |
6859 for (i = 0; i < CHAR_SET_SIZE; i++) | |
6860 preg->translate[i] = ISUPPER (i) ? tolower (i) : i; | |
6861 } | |
6862 else | |
6863 preg->translate = NULL; | |
6864 | |
6865 /* If REG_NEWLINE is set, newlines are treated differently. */ | |
6866 if (cflags & REG_NEWLINE) | |
6867 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ | |
6868 syntax &= ~RE_DOT_NEWLINE; | |
6869 syntax |= RE_HAT_LISTS_NOT_NEWLINE; | |
6870 /* It also changes the matching behavior. */ | |
6871 preg->newline_anchor = 1; | |
6872 } | |
6873 else | |
6874 preg->newline_anchor = 0; | |
6875 | |
6876 preg->no_sub = !!(cflags & REG_NOSUB); | |
6877 | |
6878 /* POSIX says a null character in the pattern terminates it, so we | |
6879 can use strlen here in compiling the pattern. */ | |
446 | 6880 ret = regex_compile ((unsigned char *) pattern, strlen (pattern), syntax, preg); |
428 | 6881 |
6882 /* POSIX doesn't distinguish between an unmatched open-group and an | |
6883 unmatched close-group: both are REG_EPAREN. */ | |
6884 if (ret == REG_ERPAREN) ret = REG_EPAREN; | |
6885 | |
6886 return (int) ret; | |
6887 } | |
6888 | |
6889 | |
6890 /* regexec searches for a given pattern, specified by PREG, in the | |
6891 string STRING. | |
6892 | |
6893 If NMATCH is zero or REG_NOSUB was set in the cflags argument to | |
6894 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at | |
6895 least NMATCH elements, and we set them to the offsets of the | |
6896 corresponding matched substrings. | |
6897 | |
6898 EFLAGS specifies `execution flags' which affect matching: if | |
6899 REG_NOTBOL is set, then ^ does not match at the beginning of the | |
6900 string; if REG_NOTEOL is set, then $ does not match at the end. | |
6901 | |
6902 We return 0 if we find a match and REG_NOMATCH if not. */ | |
6903 | |
6904 int | |
442 | 6905 regexec (const regex_t *preg, const char *string, size_t nmatch, |
428 | 6906 regmatch_t pmatch[], int eflags) |
6907 { | |
6908 int ret; | |
6909 struct re_registers regs; | |
6910 regex_t private_preg; | |
6911 int len = strlen (string); | |
460 | 6912 re_bool want_reg_info = !preg->no_sub && nmatch > 0; |
428 | 6913 |
6914 private_preg = *preg; | |
6915 | |
6916 private_preg.not_bol = !!(eflags & REG_NOTBOL); | |
6917 private_preg.not_eol = !!(eflags & REG_NOTEOL); | |
6918 | |
6919 /* The user has told us exactly how many registers to return | |
6920 information about, via `nmatch'. We have to pass that on to the | |
6921 matching routines. */ | |
6922 private_preg.regs_allocated = REGS_FIXED; | |
6923 | |
6924 if (want_reg_info) | |
6925 { | |
647 | 6926 regs.num_regs = (int) nmatch; |
6927 regs.start = TALLOC ((int) nmatch, regoff_t); | |
6928 regs.end = TALLOC ((int) nmatch, regoff_t); | |
428 | 6929 if (regs.start == NULL || regs.end == NULL) |
6930 return (int) REG_NOMATCH; | |
6931 } | |
6932 | |
6933 /* Perform the searching operation. */ | |
6934 ret = re_search (&private_preg, string, len, | |
6935 /* start: */ 0, /* range: */ len, | |
6936 want_reg_info ? ®s : (struct re_registers *) 0); | |
6937 | |
6938 /* Copy the register information to the POSIX structure. */ | |
6939 if (want_reg_info) | |
6940 { | |
6941 if (ret >= 0) | |
6942 { | |
647 | 6943 int r; |
6944 | |
6945 for (r = 0; r < (int) nmatch; r++) | |
428 | 6946 { |
6947 pmatch[r].rm_so = regs.start[r]; | |
6948 pmatch[r].rm_eo = regs.end[r]; | |
6949 } | |
6950 } | |
6951 | |
6952 /* If we needed the temporary register info, free the space now. */ | |
4976
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4832
diff
changeset
|
6953 xfree (regs.start); |
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4832
diff
changeset
|
6954 xfree (regs.end); |
428 | 6955 } |
6956 | |
6957 /* We want zero return to mean success, unlike `re_search'. */ | |
6958 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH; | |
6959 } | |
6960 | |
6961 | |
6962 /* Returns a message corresponding to an error code, ERRCODE, returned | |
6963 from either regcomp or regexec. We don't use PREG here. */ | |
6964 | |
6965 size_t | |
2286 | 6966 regerror (int errcode, const regex_t *UNUSED (preg), char *errbuf, |
647 | 6967 size_t errbuf_size) |
428 | 6968 { |
442 | 6969 const char *msg; |
665 | 6970 Bytecount msg_size; |
428 | 6971 |
6972 if (errcode < 0 | |
647 | 6973 || errcode >= (int) (sizeof (re_error_msgid) / |
6974 sizeof (re_error_msgid[0]))) | |
428 | 6975 /* Only error codes returned by the rest of the code should be passed |
6976 to this routine. If we are given anything else, or if other regex | |
6977 code generates an invalid error code, then the program has a bug. | |
6978 Dump core so we can fix it. */ | |
2500 | 6979 ABORT (); |
428 | 6980 |
6981 msg = gettext (re_error_msgid[errcode]); | |
6982 | |
6983 msg_size = strlen (msg) + 1; /* Includes the null. */ | |
6984 | |
6985 if (errbuf_size != 0) | |
6986 { | |
665 | 6987 if (msg_size > (Bytecount) errbuf_size) |
428 | 6988 { |
6989 strncpy (errbuf, msg, errbuf_size - 1); | |
6990 errbuf[errbuf_size - 1] = 0; | |
6991 } | |
6992 else | |
6993 strcpy (errbuf, msg); | |
6994 } | |
6995 | |
647 | 6996 return (size_t) msg_size; |
428 | 6997 } |
6998 | |
6999 | |
7000 /* Free dynamically allocated space used by PREG. */ | |
7001 | |
7002 void | |
7003 regfree (regex_t *preg) | |
7004 { | |
7005 if (preg->buffer != NULL) | |
4976
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4832
diff
changeset
|
7006 xfree (preg->buffer); |
428 | 7007 preg->buffer = NULL; |
7008 | |
7009 preg->allocated = 0; | |
7010 preg->used = 0; | |
7011 | |
7012 if (preg->fastmap != NULL) | |
4976
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4832
diff
changeset
|
7013 xfree (preg->fastmap); |
428 | 7014 preg->fastmap = NULL; |
7015 preg->fastmap_accurate = 0; | |
7016 | |
7017 if (preg->translate != NULL) | |
4976
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4832
diff
changeset
|
7018 xfree (preg->translate); |
428 | 7019 preg->translate = NULL; |
7020 } | |
7021 | |
7022 #endif /* not emacs */ | |
7023 |