Mercurial > hg > xemacs-beta
comparison src/regex.c @ 460:223736d75acb r21-2-45
Import from CVS: tag r21-2-45
author | cvs |
---|---|
date | Mon, 13 Aug 2007 11:43:24 +0200 |
parents | c33ae14dd6d0 |
children | 7039e6323819 |
comparison
equal
deleted
inserted
replaced
459:9d4fd877b885 | 460:223736d75acb |
---|---|
45 | 45 |
46 #ifndef _GNU_SOURCE | 46 #ifndef _GNU_SOURCE |
47 #define _GNU_SOURCE 1 | 47 #define _GNU_SOURCE 1 |
48 #endif | 48 #endif |
49 | 49 |
50 #ifdef emacs | |
51 /* Converts the pointer to the char to BEG-based offset from the start. */ | |
52 #define PTR_TO_OFFSET(d) (MATCHING_IN_FIRST_STRING \ | |
53 ? (d) - string1 : (d) - (string2 - size1)) | |
54 #else | |
55 #define PTR_TO_OFFSET(d) 0 | |
56 #endif | |
57 | |
50 /* We assume non-Mule if emacs isn't defined. */ | 58 /* We assume non-Mule if emacs isn't defined. */ |
51 #ifndef emacs | 59 #ifndef emacs |
52 #undef MULE | 60 #undef MULE |
53 #endif | 61 #endif |
54 | 62 |
177 } | 185 } |
178 | 186 |
179 #endif /* SYNTAX_TABLE */ | 187 #endif /* SYNTAX_TABLE */ |
180 | 188 |
181 #define SYNTAX_UNSAFE(ignored, c) re_syntax_table[c] | 189 #define SYNTAX_UNSAFE(ignored, c) re_syntax_table[c] |
190 #undef SYNTAX_FROM_CACHE | |
191 #define SYNTAX_FROM_CACHE SYNTAX_UNSAFE | |
182 | 192 |
183 #define RE_TRANSLATE(c) translate[(unsigned char) (c)] | 193 #define RE_TRANSLATE(c) translate[(unsigned char) (c)] |
184 #define TRANSLATE_P(tr) tr | 194 #define TRANSLATE_P(tr) tr |
185 | 195 |
186 #endif /* emacs */ | 196 #endif /* emacs */ |
366 #define MIN(a, b) ((a) < (b) ? (a) : (b)) | 376 #define MIN(a, b) ((a) < (b) ? (a) : (b)) |
367 | 377 |
368 /* Type of source-pattern and string chars. */ | 378 /* Type of source-pattern and string chars. */ |
369 typedef const unsigned char re_char; | 379 typedef const unsigned char re_char; |
370 | 380 |
371 typedef char boolean; | 381 typedef char re_bool; |
372 #define false 0 | 382 #define false 0 |
373 #define true 1 | 383 #define true 1 |
374 | 384 |
375 | 385 |
376 /* These are the command codes that appear in compiled regular | 386 /* These are the command codes that appear in compiled regular |
1778 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2); | 1788 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2); |
1779 static void insert_op1 (re_opcode_t op, unsigned char *loc, int arg, | 1789 static void insert_op1 (re_opcode_t op, unsigned char *loc, int arg, |
1780 unsigned char *end); | 1790 unsigned char *end); |
1781 static void insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, | 1791 static void insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, |
1782 unsigned char *end); | 1792 unsigned char *end); |
1783 static boolean at_begline_loc_p (re_char *pattern, re_char *p, | 1793 static re_bool at_begline_loc_p (re_char *pattern, re_char *p, |
1784 reg_syntax_t syntax); | 1794 reg_syntax_t syntax); |
1785 static boolean at_endline_loc_p (re_char *p, re_char *pend, int syntax); | 1795 static re_bool at_endline_loc_p (re_char *p, re_char *pend, int syntax); |
1786 static boolean group_in_compile_stack (compile_stack_type compile_stack, | 1796 static re_bool group_in_compile_stack (compile_stack_type compile_stack, |
1787 regnum_t regnum); | 1797 regnum_t regnum); |
1788 static reg_errcode_t compile_range (re_char **p_ptr, re_char *pend, | 1798 static reg_errcode_t compile_range (re_char **p_ptr, re_char *pend, |
1789 RE_TRANSLATE_TYPE translate, | 1799 RE_TRANSLATE_TYPE translate, |
1790 reg_syntax_t syntax, | 1800 reg_syntax_t syntax, |
1791 unsigned char *b); | 1801 unsigned char *b); |
1794 re_char *pend, | 1804 re_char *pend, |
1795 RE_TRANSLATE_TYPE translate, | 1805 RE_TRANSLATE_TYPE translate, |
1796 reg_syntax_t syntax, | 1806 reg_syntax_t syntax, |
1797 Lisp_Object rtab); | 1807 Lisp_Object rtab); |
1798 #endif /* MULE */ | 1808 #endif /* MULE */ |
1799 static boolean group_match_null_string_p (unsigned char **p, | 1809 static re_bool group_match_null_string_p (unsigned char **p, |
1800 unsigned char *end, | 1810 unsigned char *end, |
1801 register_info_type *reg_info); | 1811 register_info_type *reg_info); |
1802 static boolean alt_match_null_string_p (unsigned char *p, unsigned char *end, | 1812 static re_bool alt_match_null_string_p (unsigned char *p, unsigned char *end, |
1803 register_info_type *reg_info); | 1813 register_info_type *reg_info); |
1804 static boolean common_op_match_null_string_p (unsigned char **p, | 1814 static re_bool common_op_match_null_string_p (unsigned char **p, |
1805 unsigned char *end, | 1815 unsigned char *end, |
1806 register_info_type *reg_info); | 1816 register_info_type *reg_info); |
1807 static int bcmp_translate (const unsigned char *s1, const unsigned char *s2, | 1817 static int bcmp_translate (const unsigned char *s1, const unsigned char *s2, |
1808 REGISTER int len, RE_TRANSLATE_TYPE translate); | 1818 REGISTER int len, RE_TRANSLATE_TYPE translate); |
1809 static int re_match_2_internal (struct re_pattern_buffer *bufp, | 1819 static int re_match_2_internal (struct re_pattern_buffer *bufp, |
2046 goto normal_char; | 2056 goto normal_char; |
2047 } | 2057 } |
2048 | 2058 |
2049 { | 2059 { |
2050 /* true means zero/many matches are allowed. */ | 2060 /* true means zero/many matches are allowed. */ |
2051 boolean zero_times_ok = c != '+'; | 2061 re_bool zero_times_ok = c != '+'; |
2052 boolean many_times_ok = c != '?'; | 2062 re_bool many_times_ok = c != '?'; |
2053 | 2063 |
2054 /* true means match shortest string possible. */ | 2064 /* true means match shortest string possible. */ |
2055 boolean minimal = false; | 2065 re_bool minimal = false; |
2056 | 2066 |
2057 /* If there is a sequence of repetition chars, collapse it | 2067 /* If there is a sequence of repetition chars, collapse it |
2058 down to just one (the right one). We can't combine | 2068 down to just one (the right one). We can't combine |
2059 interval operators with these because of, e.g., `a{2}*', | 2069 interval operators with these because of, e.g., `a{2}*', |
2060 which should only match an even number of `a's. */ | 2070 which should only match an even number of `a's. */ |
2154 } | 2164 } |
2155 } | 2165 } |
2156 else | 2166 else |
2157 { | 2167 { |
2158 /* Are we optimizing this jump? */ | 2168 /* Are we optimizing this jump? */ |
2159 boolean keep_string_p = false; | 2169 re_bool keep_string_p = false; |
2160 | 2170 |
2161 if (many_times_ok) | 2171 if (many_times_ok) |
2162 { /* More than one repetition is allowed, so put in | 2172 { /* More than one repetition is allowed, so put in |
2163 at the end a backward relative jump from | 2173 at the end a backward relative jump from |
2164 `buf_end' to before the next jump we're going | 2174 `buf_end' to before the next jump we're going |
2230 | 2240 |
2231 | 2241 |
2232 case '[': | 2242 case '[': |
2233 { | 2243 { |
2234 /* XEmacs change: this whole section */ | 2244 /* XEmacs change: this whole section */ |
2235 boolean had_char_class = false; | 2245 re_bool had_char_class = false; |
2236 #ifdef MULE | 2246 #ifdef MULE |
2237 boolean has_extended_chars = false; | 2247 re_bool has_extended_chars = false; |
2238 REGISTER Lisp_Object rtab = Qnil; | 2248 REGISTER Lisp_Object rtab = Qnil; |
2239 #endif | 2249 #endif |
2240 | 2250 |
2241 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | 2251 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
2242 | 2252 |
2414 undo the ending character, the letters, and leave | 2424 undo the ending character, the letters, and leave |
2415 the leading `:' and `[' (but set bits for them). */ | 2425 the leading `:' and `[' (but set bits for them). */ |
2416 if (c == ':' && *p == ']') | 2426 if (c == ':' && *p == ']') |
2417 { | 2427 { |
2418 int ch; | 2428 int ch; |
2419 boolean is_alnum = STREQ (str, "alnum"); | 2429 re_bool is_alnum = STREQ (str, "alnum"); |
2420 boolean is_alpha = STREQ (str, "alpha"); | 2430 re_bool is_alpha = STREQ (str, "alpha"); |
2421 boolean is_blank = STREQ (str, "blank"); | 2431 re_bool is_blank = STREQ (str, "blank"); |
2422 boolean is_cntrl = STREQ (str, "cntrl"); | 2432 re_bool is_cntrl = STREQ (str, "cntrl"); |
2423 boolean is_digit = STREQ (str, "digit"); | 2433 re_bool is_digit = STREQ (str, "digit"); |
2424 boolean is_graph = STREQ (str, "graph"); | 2434 re_bool is_graph = STREQ (str, "graph"); |
2425 boolean is_lower = STREQ (str, "lower"); | 2435 re_bool is_lower = STREQ (str, "lower"); |
2426 boolean is_print = STREQ (str, "print"); | 2436 re_bool is_print = STREQ (str, "print"); |
2427 boolean is_punct = STREQ (str, "punct"); | 2437 re_bool is_punct = STREQ (str, "punct"); |
2428 boolean is_space = STREQ (str, "space"); | 2438 re_bool is_space = STREQ (str, "space"); |
2429 boolean is_upper = STREQ (str, "upper"); | 2439 re_bool is_upper = STREQ (str, "upper"); |
2430 boolean is_xdigit = STREQ (str, "xdigit"); | 2440 re_bool is_xdigit = STREQ (str, "xdigit"); |
2431 | 2441 |
2432 if (!IS_CHAR_CLASS (str)) | 2442 if (!IS_CHAR_CLASS (str)) |
2433 FREE_STACK_RETURN (REG_ECTYPE); | 2443 FREE_STACK_RETURN (REG_ECTYPE); |
2434 | 2444 |
2435 /* Throw away the ] at the end of the character | 2445 /* Throw away the ] at the end of the character |
3211 | 3221 |
3212 /* P points to just after a ^ in PATTERN. Return true if that ^ comes | 3222 /* P points to just after a ^ in PATTERN. Return true if that ^ comes |
3213 after an alternative or a begin-subexpression. We assume there is at | 3223 after an alternative or a begin-subexpression. We assume there is at |
3214 least one character before the ^. */ | 3224 least one character before the ^. */ |
3215 | 3225 |
3216 static boolean | 3226 static re_bool |
3217 at_begline_loc_p (re_char *pattern, re_char *p, reg_syntax_t syntax) | 3227 at_begline_loc_p (re_char *pattern, re_char *p, reg_syntax_t syntax) |
3218 { | 3228 { |
3219 re_char *prev = p - 2; | 3229 re_char *prev = p - 2; |
3220 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; | 3230 re_bool prev_prev_backslash = prev > pattern && prev[-1] == '\\'; |
3221 | 3231 |
3222 return | 3232 return |
3223 /* After a subexpression? */ | 3233 /* After a subexpression? */ |
3224 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) | 3234 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) |
3225 /* After an alternative? */ | 3235 /* After an alternative? */ |
3228 | 3238 |
3229 | 3239 |
3230 /* The dual of at_begline_loc_p. This one is for $. We assume there is | 3240 /* The dual of at_begline_loc_p. This one is for $. We assume there is |
3231 at least one character after the $, i.e., `P < PEND'. */ | 3241 at least one character after the $, i.e., `P < PEND'. */ |
3232 | 3242 |
3233 static boolean | 3243 static re_bool |
3234 at_endline_loc_p (re_char *p, re_char *pend, int syntax) | 3244 at_endline_loc_p (re_char *p, re_char *pend, int syntax) |
3235 { | 3245 { |
3236 re_char *next = p; | 3246 re_char *next = p; |
3237 boolean next_backslash = *next == '\\'; | 3247 re_bool next_backslash = *next == '\\'; |
3238 re_char *next_next = p + 1 < pend ? p + 1 : 0; | 3248 re_char *next_next = p + 1 < pend ? p + 1 : 0; |
3239 | 3249 |
3240 return | 3250 return |
3241 /* Before a subexpression? */ | 3251 /* Before a subexpression? */ |
3242 (syntax & RE_NO_BK_PARENS ? *next == ')' | 3252 (syntax & RE_NO_BK_PARENS ? *next == ')' |
3248 | 3258 |
3249 | 3259 |
3250 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and | 3260 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and |
3251 false if it's not. */ | 3261 false if it's not. */ |
3252 | 3262 |
3253 static boolean | 3263 static re_bool |
3254 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum) | 3264 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum) |
3255 { | 3265 { |
3256 int this_element; | 3266 int this_element; |
3257 | 3267 |
3258 for (this_element = compile_stack.avail - 1; | 3268 for (this_element = compile_stack.avail - 1; |
3419 | 3429 |
3420 /* Assume that each path through the pattern can be null until | 3430 /* Assume that each path through the pattern can be null until |
3421 proven otherwise. We set this false at the bottom of switch | 3431 proven otherwise. We set this false at the bottom of switch |
3422 statement, to which we get only if a particular path doesn't | 3432 statement, to which we get only if a particular path doesn't |
3423 match the empty string. */ | 3433 match the empty string. */ |
3424 boolean path_can_be_null = true; | 3434 re_bool path_can_be_null = true; |
3425 | 3435 |
3426 /* We aren't doing a `succeed_n' to begin with. */ | 3436 /* We aren't doing a `succeed_n' to begin with. */ |
3427 boolean succeed_n_p = false; | 3437 re_bool succeed_n_p = false; |
3428 | 3438 |
3429 assert (fastmap != NULL && p != NULL); | 3439 assert (fastmap != NULL && p != NULL); |
3430 | 3440 |
3431 INIT_FAIL_STACK (); | 3441 INIT_FAIL_STACK (); |
3432 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */ | 3442 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */ |
3622 /* Otherwise, have to check alternative paths. */ | 3632 /* Otherwise, have to check alternative paths. */ |
3623 break; | 3633 break; |
3624 } | 3634 } |
3625 | 3635 |
3626 #ifdef emacs | 3636 #ifdef emacs |
3637 case wordbound: | |
3638 case notwordbound: | |
3639 case wordbeg: | |
3640 case wordend: | |
3641 case notsyntaxspec: | |
3642 case syntaxspec: | |
3643 /* This match depends on text properties. These end with | |
3644 aborting optimizations. */ | |
3645 bufp->can_be_null = 1; | |
3646 goto done; | |
3647 | |
3648 #ifdef emacs | |
3649 #if 0 /* Removed during syntax-table properties patch -- 2000/12/07 mct */ | |
3627 case syntaxspec: | 3650 case syntaxspec: |
3628 k = *p++; | 3651 k = *p++; |
3652 #endif | |
3629 matchsyntax: | 3653 matchsyntax: |
3630 #ifdef MULE | 3654 #ifdef MULE |
3631 for (j = 0; j < 0x80; j++) | 3655 for (j = 0; j < 0x80; j++) |
3632 if (SYNTAX_UNSAFE | 3656 if (SYNTAX_UNSAFE |
3633 (XCHAR_TABLE | 3657 (XCHAR_TABLE |
3663 fastmap[j] = 1; | 3687 fastmap[j] = 1; |
3664 #endif /* MULE */ | 3688 #endif /* MULE */ |
3665 break; | 3689 break; |
3666 | 3690 |
3667 | 3691 |
3692 #if 0 /* Removed during syntax-table properties patch -- 2000/12/07 mct */ | |
3668 case notsyntaxspec: | 3693 case notsyntaxspec: |
3669 k = *p++; | 3694 k = *p++; |
3695 #endif | |
3670 matchnotsyntax: | 3696 matchnotsyntax: |
3671 #ifdef MULE | 3697 #ifdef MULE |
3672 for (j = 0; j < 0x80; j++) | 3698 for (j = 0; j < 0x80; j++) |
3673 if (SYNTAX_UNSAFE | 3699 if (SYNTAX_UNSAFE |
3674 (XCHAR_TABLE | 3700 (XCHAR_TABLE |
3702 (regex_emacs_buffer->mirror_syntax_table), j) != | 3728 (regex_emacs_buffer->mirror_syntax_table), j) != |
3703 (enum syntaxcode) k) | 3729 (enum syntaxcode) k) |
3704 fastmap[j] = 1; | 3730 fastmap[j] = 1; |
3705 #endif /* MULE */ | 3731 #endif /* MULE */ |
3706 break; | 3732 break; |
3733 #endif /* emacs */ | |
3707 | 3734 |
3708 #ifdef MULE | 3735 #ifdef MULE |
3709 /* 97/2/17 jhod category patch */ | 3736 /* 97/2/17 jhod category patch */ |
3710 case categoryspec: | 3737 case categoryspec: |
3711 case notcategoryspec: | 3738 case notcategoryspec: |
3728 case no_op: | 3755 case no_op: |
3729 case begline: | 3756 case begline: |
3730 case endline: | 3757 case endline: |
3731 case begbuf: | 3758 case begbuf: |
3732 case endbuf: | 3759 case endbuf: |
3760 #ifndef emacs | |
3733 case wordbound: | 3761 case wordbound: |
3734 case notwordbound: | 3762 case notwordbound: |
3735 case wordbeg: | 3763 case wordbeg: |
3736 case wordend: | 3764 case wordend: |
3765 #endif | |
3737 case push_dummy_failure: | 3766 case push_dummy_failure: |
3738 continue; | 3767 continue; |
3739 | 3768 |
3740 | 3769 |
3741 case jump_n: | 3770 case jump_n: |
3972 (startpos >= size1 ? string2 - size1 : string1) + startpos); | 4001 (startpos >= size1 ? string2 - size1 : string1) + startpos); |
3973 range = charcount_to_bytecount (d, 1); | 4002 range = charcount_to_bytecount (d, 1); |
3974 } | 4003 } |
3975 } | 4004 } |
3976 | 4005 |
4006 #ifdef emacs | |
4007 /* In a forward search for something that starts with \=. | |
4008 don't keep searching past point. */ | |
4009 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0) | |
4010 { | |
4011 range = BUF_PT (regex_emacs_buffer) - BUF_BEGV (regex_emacs_buffer) | |
4012 - startpos; | |
4013 if (range < 0) | |
4014 return -1; | |
4015 } | |
4016 #endif /* emacs */ | |
4017 | |
3977 /* Update the fastmap now if not correct already. */ | 4018 /* Update the fastmap now if not correct already. */ |
3978 if (fastmap && !bufp->fastmap_accurate) | 4019 if (fastmap && !bufp->fastmap_accurate) |
3979 if (re_compile_fastmap (bufp) == -2) | 4020 if (re_compile_fastmap (bufp) == -2) |
3980 return -2; | 4021 return -2; |
3981 | 4022 |
3991 else | 4032 else |
3992 break; | 4033 break; |
3993 } | 4034 } |
3994 anchored_at_begline = i < bufp->used && bufp->buffer[i] == begline; | 4035 anchored_at_begline = i < bufp->used && bufp->buffer[i] == begline; |
3995 } | 4036 } |
4037 #endif | |
4038 | |
4039 #ifdef emacs | |
4040 SETUP_SYNTAX_CACHE_FOR_OBJECT (regex_match_object, | |
4041 regex_emacs_buffer, | |
4042 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR (regex_match_object, | |
4043 regex_emacs_buffer, | |
4044 startpos), | |
4045 1); | |
3996 #endif | 4046 #endif |
3997 | 4047 |
3998 /* Loop through the string, looking for a place to start matching. */ | 4048 /* Loop through the string, looking for a place to start matching. */ |
3999 for (;;) | 4049 for (;;) |
4000 { | 4050 { |
4256 int | 4306 int |
4257 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, | 4307 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, |
4258 int size1, const char *string2, int size2, int pos, | 4308 int size1, const char *string2, int size2, int pos, |
4259 struct re_registers *regs, int stop) | 4309 struct re_registers *regs, int stop) |
4260 { | 4310 { |
4261 int result = re_match_2_internal (bufp, (re_char *) string1, size1, | 4311 int result; |
4262 (re_char *) string2, size2, | 4312 |
4263 pos, regs, stop); | 4313 #ifdef emacs |
4314 SETUP_SYNTAX_CACHE_FOR_OBJECT (regex_match_object, | |
4315 regex_emacs_buffer, | |
4316 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR (regex_match_object, | |
4317 regex_emacs_buffer, | |
4318 pos), | |
4319 1); | |
4320 #endif | |
4321 | |
4322 result = re_match_2_internal (bufp, (re_char *) string1, size1, | |
4323 (re_char *) string2, size2, | |
4324 pos, regs, stop); | |
4325 | |
4264 alloca (0); | 4326 alloca (0); |
4265 return result; | 4327 return result; |
4266 } | 4328 } |
4267 | 4329 |
4268 /* This is a separate function so that we can force an alloca cleanup | 4330 /* This is a separate function so that we can force an alloca cleanup |
4393 unsigned num_regs_pushed = 0; | 4455 unsigned num_regs_pushed = 0; |
4394 #endif | 4456 #endif |
4395 | 4457 |
4396 /* 1 if this match ends in the same string (string1 or string2) | 4458 /* 1 if this match ends in the same string (string1 or string2) |
4397 as the best previous match. */ | 4459 as the best previous match. */ |
4398 boolean same_str_p; | 4460 re_bool same_str_p; |
4399 | 4461 |
4400 /* 1 if this match is the best seen so far. */ | 4462 /* 1 if this match is the best seen so far. */ |
4401 boolean best_match_p; | 4463 re_bool best_match_p; |
4402 | 4464 |
4403 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n"); | 4465 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n"); |
4404 | 4466 |
4405 INIT_FAIL_STACK (); | 4467 INIT_FAIL_STACK (); |
4406 | 4468 |
4757 | 4819 |
4758 case charset: | 4820 case charset: |
4759 case charset_not: | 4821 case charset_not: |
4760 { | 4822 { |
4761 REGISTER unsigned char c; | 4823 REGISTER unsigned char c; |
4762 boolean not_p = (re_opcode_t) *(p - 1) == charset_not; | 4824 re_bool not_p = (re_opcode_t) *(p - 1) == charset_not; |
4763 | 4825 |
4764 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not_p ? "_not" : ""); | 4826 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not_p ? "_not" : ""); |
4765 | 4827 |
4766 REGEX_PREFETCH (); | 4828 REGEX_PREFETCH (); |
4767 c = TRANSLATE (*d); /* The character to match. */ | 4829 c = TRANSLATE (*d); /* The character to match. */ |
4784 #ifdef MULE | 4846 #ifdef MULE |
4785 case charset_mule: | 4847 case charset_mule: |
4786 case charset_mule_not: | 4848 case charset_mule_not: |
4787 { | 4849 { |
4788 REGISTER Emchar c; | 4850 REGISTER Emchar c; |
4789 boolean not_p = (re_opcode_t) *(p - 1) == charset_mule_not; | 4851 re_bool not_p = (re_opcode_t) *(p - 1) == charset_mule_not; |
4790 | 4852 |
4791 DEBUG_PRINT2 ("EXECUTING charset_mule%s.\n", not_p ? "_not" : ""); | 4853 DEBUG_PRINT2 ("EXECUTING charset_mule%s.\n", not_p ? "_not" : ""); |
4792 | 4854 |
4793 REGEX_PREFETCH (); | 4855 REGEX_PREFETCH (); |
4794 c = charptr_emchar ((const Bufbyte *) d); | 4856 c = charptr_emchar ((const Bufbyte *) d); |
4931 last match. */ | 4993 last match. */ |
4932 if ((!MATCHED_SOMETHING (reg_info[*p]) | 4994 if ((!MATCHED_SOMETHING (reg_info[*p]) |
4933 || just_past_start_mem == p - 1) | 4995 || just_past_start_mem == p - 1) |
4934 && (p + 2) < pend) | 4996 && (p + 2) < pend) |
4935 { | 4997 { |
4936 boolean is_a_jump_n = false; | 4998 re_bool is_a_jump_n = false; |
4937 | 4999 |
4938 p1 = p + 2; | 5000 p1 = p + 2; |
4939 mcnt = 0; | 5001 mcnt = 0; |
4940 switch ((re_opcode_t) *p1++) | 5002 switch ((re_opcode_t) *p1++) |
4941 { | 5003 { |
5476 int result; | 5538 int result; |
5477 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)) | 5539 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)) |
5478 result = 1; | 5540 result = 1; |
5479 else | 5541 else |
5480 { | 5542 { |
5481 const unsigned char *d_before = | 5543 re_char *d_before = POS_BEFORE_GAP_UNSAFE (d); |
5482 (const unsigned char *) POS_BEFORE_GAP_UNSAFE (d); | 5544 re_char *d_after = POS_AFTER_GAP_UNSAFE (d); |
5483 const unsigned char *d_after = | 5545 |
5484 (const unsigned char *) POS_AFTER_GAP_UNSAFE (d); | 5546 /* emch1 is the character before d, syn1 is the syntax of emch1, |
5547 emch2 is the character at d, and syn2 is the syntax of emch2. */ | |
5485 Emchar emch1, emch2; | 5548 Emchar emch1, emch2; |
5549 int syn1, syn2; | |
5550 #ifdef emacs | |
5551 int pos_before; | |
5552 #endif | |
5486 | 5553 |
5487 DEC_CHARPTR (d_before); | 5554 DEC_CHARPTR (d_before); |
5488 emch1 = charptr_emchar (d_before); | 5555 emch1 = charptr_emchar (d_before); |
5489 emch2 = charptr_emchar (d_after); | 5556 emch2 = charptr_emchar (d_after); |
5490 result = (WORDCHAR_P_UNSAFE (emch1) != | 5557 |
5491 WORDCHAR_P_UNSAFE (emch2)); | 5558 #ifdef emacs |
5559 pos_before = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)) - 1; | |
5560 UPDATE_SYNTAX_CACHE (pos_before); | |
5561 #endif | |
5562 syn1 = SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), | |
5563 emch1); | |
5564 #ifdef emacs | |
5565 UPDATE_SYNTAX_CACHE_FORWARD (pos_before + 1); | |
5566 #endif | |
5567 syn2 = SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), | |
5568 emch2); | |
5569 | |
5570 result = ((syn1 == Sword) != (syn2 == Sword)); | |
5492 } | 5571 } |
5493 if (result == should_succeed) | 5572 if (result == should_succeed) |
5494 break; | 5573 break; |
5495 goto fail; | 5574 goto fail; |
5496 } | 5575 } |
5500 should_succeed = 0; | 5579 should_succeed = 0; |
5501 goto matchwordbound; | 5580 goto matchwordbound; |
5502 | 5581 |
5503 case wordbeg: | 5582 case wordbeg: |
5504 DEBUG_PRINT1 ("EXECUTING wordbeg.\n"); | 5583 DEBUG_PRINT1 ("EXECUTING wordbeg.\n"); |
5584 if (AT_STRINGS_END (d)) | |
5585 goto fail; | |
5505 { | 5586 { |
5506 /* XEmacs: this originally read: | 5587 /* XEmacs: this originally read: |
5507 | 5588 |
5508 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1))) | 5589 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1))) |
5509 break; | 5590 break; |
5510 | 5591 |
5511 */ | 5592 */ |
5512 const unsigned char *dtmp = | 5593 re_char *dtmp = POS_AFTER_GAP_UNSAFE (d); |
5513 (const unsigned char *) POS_AFTER_GAP_UNSAFE (d); | |
5514 Emchar emch = charptr_emchar (dtmp); | 5594 Emchar emch = charptr_emchar (dtmp); |
5515 if (!WORDCHAR_P_UNSAFE (emch)) | 5595 #ifdef emacs |
5596 int charpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)); | |
5597 UPDATE_SYNTAX_CACHE (charpos); | |
5598 #endif | |
5599 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), | |
5600 emch) != Sword) | |
5516 goto fail; | 5601 goto fail; |
5517 if (AT_STRINGS_BEG (d)) | 5602 if (AT_STRINGS_BEG (d)) |
5518 break; | 5603 break; |
5519 dtmp = (const unsigned char *) POS_BEFORE_GAP_UNSAFE (d); | 5604 dtmp = POS_BEFORE_GAP_UNSAFE (d); |
5520 DEC_CHARPTR (dtmp); | 5605 DEC_CHARPTR (dtmp); |
5521 emch = charptr_emchar (dtmp); | 5606 emch = charptr_emchar (dtmp); |
5522 if (!WORDCHAR_P_UNSAFE (emch)) | 5607 #ifdef emacs |
5608 UPDATE_SYNTAX_CACHE_BACKWARD (charpos - 1); | |
5609 #endif | |
5610 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), | |
5611 emch) != Sword) | |
5523 break; | 5612 break; |
5524 goto fail; | 5613 goto fail; |
5525 } | 5614 } |
5526 | 5615 |
5527 case wordend: | 5616 case wordend: |
5528 DEBUG_PRINT1 ("EXECUTING wordend.\n"); | 5617 DEBUG_PRINT1 ("EXECUTING wordend.\n"); |
5618 if (AT_STRINGS_BEG (d)) | |
5619 goto fail; | |
5529 { | 5620 { |
5530 /* XEmacs: this originally read: | 5621 /* XEmacs: this originally read: |
5531 | 5622 |
5532 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1) | 5623 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1) |
5533 && (!WORDCHAR_P (d) || AT_STRINGS_END (d))) | 5624 && (!WORDCHAR_P (d) || AT_STRINGS_END (d))) |
5534 break; | 5625 break; |
5535 | 5626 |
5536 The or condition is incorrect (reversed). | 5627 The or condition is incorrect (reversed). |
5537 */ | 5628 */ |
5538 const unsigned char *dtmp; | 5629 re_char *dtmp; |
5539 Emchar emch; | 5630 Emchar emch; |
5540 if (AT_STRINGS_BEG (d)) | 5631 #ifdef emacs |
5541 goto fail; | 5632 int charpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)) - 1; |
5542 dtmp = (const unsigned char *) POS_BEFORE_GAP_UNSAFE (d); | 5633 UPDATE_SYNTAX_CACHE (charpos); |
5634 #endif | |
5635 dtmp = POS_BEFORE_GAP_UNSAFE (d); | |
5543 DEC_CHARPTR (dtmp); | 5636 DEC_CHARPTR (dtmp); |
5544 emch = charptr_emchar (dtmp); | 5637 emch = charptr_emchar (dtmp); |
5545 if (!WORDCHAR_P_UNSAFE (emch)) | 5638 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), |
5639 emch) != Sword) | |
5546 goto fail; | 5640 goto fail; |
5547 if (AT_STRINGS_END (d)) | 5641 if (AT_STRINGS_END (d)) |
5548 break; | 5642 break; |
5549 dtmp = (const unsigned char *) POS_AFTER_GAP_UNSAFE (d); | 5643 dtmp = POS_AFTER_GAP_UNSAFE (d); |
5550 emch = charptr_emchar (dtmp); | 5644 emch = charptr_emchar (dtmp); |
5551 if (!WORDCHAR_P_UNSAFE (emch)) | 5645 #ifdef emacs |
5646 UPDATE_SYNTAX_CACHE_FORWARD (charpos + 1); | |
5647 #endif | |
5648 if (SYNTAX_FROM_CACHE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), | |
5649 emch) != Sword) | |
5552 break; | 5650 break; |
5553 goto fail; | 5651 goto fail; |
5554 } | 5652 } |
5555 | 5653 |
5556 #ifdef emacs | 5654 #ifdef emacs |
5557 case before_dot: | 5655 case before_dot: |
5558 DEBUG_PRINT1 ("EXECUTING before_dot.\n"); | 5656 DEBUG_PRINT1 ("EXECUTING before_dot.\n"); |
5559 if (!regex_emacs_buffer_p | 5657 if (! (NILP (regex_match_object) || BUFFERP (regex_match_object)) |
5560 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d) | 5658 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d) |
5561 >= BUF_PT (regex_emacs_buffer))) | 5659 >= BUF_PT (regex_emacs_buffer))) |
5562 goto fail; | 5660 goto fail; |
5563 break; | 5661 break; |
5564 | 5662 |
5565 case at_dot: | 5663 case at_dot: |
5566 DEBUG_PRINT1 ("EXECUTING at_dot.\n"); | 5664 DEBUG_PRINT1 ("EXECUTING at_dot.\n"); |
5567 if (!regex_emacs_buffer_p | 5665 if (! (NILP (regex_match_object) || BUFFERP (regex_match_object)) |
5568 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d) | 5666 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d) |
5569 != BUF_PT (regex_emacs_buffer))) | 5667 != BUF_PT (regex_emacs_buffer))) |
5570 goto fail; | 5668 goto fail; |
5571 break; | 5669 break; |
5572 | 5670 |
5573 case after_dot: | 5671 case after_dot: |
5574 DEBUG_PRINT1 ("EXECUTING after_dot.\n"); | 5672 DEBUG_PRINT1 ("EXECUTING after_dot.\n"); |
5575 if (!regex_emacs_buffer_p | 5673 if (! (NILP (regex_match_object) || BUFFERP (regex_match_object)) |
5576 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d) | 5674 || (BUF_PTR_BYTE_POS (regex_emacs_buffer, (unsigned char *) d) |
5577 <= BUF_PT (regex_emacs_buffer))) | 5675 <= BUF_PT (regex_emacs_buffer))) |
5578 goto fail; | 5676 goto fail; |
5579 break; | 5677 break; |
5580 #if 0 /* not emacs19 */ | 5678 #if 0 /* not emacs19 */ |
5600 { | 5698 { |
5601 int matches; | 5699 int matches; |
5602 Emchar emch; | 5700 Emchar emch; |
5603 | 5701 |
5604 REGEX_PREFETCH (); | 5702 REGEX_PREFETCH (); |
5703 #ifdef emacs | |
5704 { | |
5705 int charpos = SYNTAX_CACHE_BYTE_TO_CHAR (PTR_TO_OFFSET (d)); | |
5706 UPDATE_SYNTAX_CACHE (charpos); | |
5707 } | |
5708 #endif | |
5709 | |
5605 emch = charptr_emchar ((const Bufbyte *) d); | 5710 emch = charptr_emchar ((const Bufbyte *) d); |
5606 matches = (SYNTAX_UNSAFE | 5711 matches = (SYNTAX_FROM_CACHE (regex_emacs_buffer->mirror_syntax_table, |
5607 (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), | |
5608 emch) == (enum syntaxcode) mcnt); | 5712 emch) == (enum syntaxcode) mcnt); |
5609 INC_CHARPTR (d); | 5713 INC_CHARPTR (d); |
5610 if (matches != should_succeed) | 5714 if (matches != should_succeed) |
5611 goto fail; | 5715 goto fail; |
5612 SET_REGS_MATCHED (); | 5716 SET_REGS_MATCHED (); |
5690 | 5794 |
5691 /* If we failed to the end of the pattern, don't examine *p. */ | 5795 /* If we failed to the end of the pattern, don't examine *p. */ |
5692 assert (p <= pend); | 5796 assert (p <= pend); |
5693 if (p < pend) | 5797 if (p < pend) |
5694 { | 5798 { |
5695 boolean is_a_jump_n = false; | 5799 re_bool is_a_jump_n = false; |
5696 | 5800 |
5697 /* If failed to a backwards jump that's part of a repetition | 5801 /* If failed to a backwards jump that's part of a repetition |
5698 loop, need to pop this failure point and use the next one. */ | 5802 loop, need to pop this failure point and use the next one. */ |
5699 switch ((re_opcode_t) *p) | 5803 switch ((re_opcode_t) *p) |
5700 { | 5804 { |
5743 If we find the matching stop_memory, sets P to point to one past its number. | 5847 If we find the matching stop_memory, sets P to point to one past its number. |
5744 Otherwise, sets P to an undefined byte less than or equal to END. | 5848 Otherwise, sets P to an undefined byte less than or equal to END. |
5745 | 5849 |
5746 We don't handle duplicates properly (yet). */ | 5850 We don't handle duplicates properly (yet). */ |
5747 | 5851 |
5748 static boolean | 5852 static re_bool |
5749 group_match_null_string_p (unsigned char **p, unsigned char *end, | 5853 group_match_null_string_p (unsigned char **p, unsigned char *end, |
5750 register_info_type *reg_info) | 5854 register_info_type *reg_info) |
5751 { | 5855 { |
5752 int mcnt; | 5856 int mcnt; |
5753 /* Point to after the args to the start_memory. */ | 5857 /* Point to after the args to the start_memory. */ |
5851 | 5955 |
5852 /* Similar to group_match_null_string_p, but doesn't deal with alternatives: | 5956 /* Similar to group_match_null_string_p, but doesn't deal with alternatives: |
5853 It expects P to be the first byte of a single alternative and END one | 5957 It expects P to be the first byte of a single alternative and END one |
5854 byte past the last. The alternative can contain groups. */ | 5958 byte past the last. The alternative can contain groups. */ |
5855 | 5959 |
5856 static boolean | 5960 static re_bool |
5857 alt_match_null_string_p (unsigned char *p, unsigned char *end, | 5961 alt_match_null_string_p (unsigned char *p, unsigned char *end, |
5858 register_info_type *reg_info) | 5962 register_info_type *reg_info) |
5859 { | 5963 { |
5860 int mcnt; | 5964 int mcnt; |
5861 unsigned char *p1 = p; | 5965 unsigned char *p1 = p; |
5887 /* Deals with the ops common to group_match_null_string_p and | 5991 /* Deals with the ops common to group_match_null_string_p and |
5888 alt_match_null_string_p. | 5992 alt_match_null_string_p. |
5889 | 5993 |
5890 Sets P to one after the op and its arguments, if any. */ | 5994 Sets P to one after the op and its arguments, if any. */ |
5891 | 5995 |
5892 static boolean | 5996 static re_bool |
5893 common_op_match_null_string_p (unsigned char **p, unsigned char *end, | 5997 common_op_match_null_string_p (unsigned char **p, unsigned char *end, |
5894 register_info_type *reg_info) | 5998 register_info_type *reg_info) |
5895 { | 5999 { |
5896 int mcnt; | 6000 int mcnt; |
5897 boolean ret; | 6001 re_bool ret; |
5898 int reg_no; | 6002 int reg_no; |
5899 unsigned char *p1 = *p; | 6003 unsigned char *p1 = *p; |
5900 | 6004 |
5901 switch ((re_opcode_t) *p1++) | 6005 switch ((re_opcode_t) *p1++) |
5902 { | 6006 { |
6218 { | 6322 { |
6219 int ret; | 6323 int ret; |
6220 struct re_registers regs; | 6324 struct re_registers regs; |
6221 regex_t private_preg; | 6325 regex_t private_preg; |
6222 int len = strlen (string); | 6326 int len = strlen (string); |
6223 boolean want_reg_info = !preg->no_sub && nmatch > 0; | 6327 re_bool want_reg_info = !preg->no_sub && nmatch > 0; |
6224 | 6328 |
6225 private_preg = *preg; | 6329 private_preg = *preg; |
6226 | 6330 |
6227 private_preg.not_bol = !!(eflags & REG_NOTBOL); | 6331 private_preg.not_bol = !!(eflags & REG_NOTBOL); |
6228 private_preg.not_eol = !!(eflags & REG_NOTEOL); | 6332 private_preg.not_eol = !!(eflags & REG_NOTEOL); |