Mercurial > hg > xemacs-beta
comparison src/regex.c @ 5648:3f4a234f4672
Support non-ASCII correctly in character classes, test this.
src/ChangeLog addition:
2012-04-21 Aidan Kehoe <kehoea@parhasard.net>
Support non-ASCII correctly in character classes ([:alnum:] and
friends).
* regex.c:
* regex.c (ISBLANK, ISUNIBYTE): New. Make these and friends
independent of the locale, since we want them to be consistent in
XEmacs.
* regex.c (print_partial_compiled_pattern): Print the flags for
charset_mule; don't print non-ASCII characters literally in
ranges, since this breaks with locales.
* regex.c (enum):
Define various flags the charset_mule and charset_mule_not opcodes
can now take.
* regex.c (CHAR_CLASS_MAX_LENGTH): Update this.
* regex.c (re_iswctype, re_wctype): New, from GNU.
* regex.c (re_wctype_can_match_non_ascii): New; used when deciding
whether to use charset_mule or the ASCII-only regex character
set opcode.
* regex.c (regex_compile):
Error correctly on long, non-existent character class names.
Break out the handling of charsets that can match non-ASCII into a
separate clause. Use compile_char_class when compiling character
classes.
* regex.c (compile_char_class): New. Used in regex_compile when
compiling character sets that may match non-ASCII.
* regex.c (re_compile_fastmap):
If there are flags set for charset_mule or charset_mule_not, we
can't use the fastmap (since we need to check syntax table values
that aren't available there).
* regex.c (re_match_2_internal):
Check the new flags passed to the charset_mule{,_not} opcode,
observe them if appropriate.
* regex.h:
* regex.h (enum):
Expose re_wctype_t here, imported from GNU.
tests/ChangeLog addition:
2012-04-21 Aidan Kehoe <kehoea@parhasard.net>
* automated/regexp-tests.el:
* automated/regexp-tests.el (Assert-char-class):
Check that #'string-match errors correctly with an over-long
character class name.
Add tests for character class functionality that supports
non-ASCII characters. These tests expose bugs in GNU Emacs
24.0.94.2, but pass under current XEmacs.
author | Aidan Kehoe <kehoea@parhasard.net> |
---|---|
date | Sat, 21 Apr 2012 18:58:28 +0100 |
parents | 308d34e9f07d |
children | 3df910176b6a |
comparison
equal
deleted
inserted
replaced
5647:1d9f603e9125 | 5648:3f4a234f4672 |
---|---|
176 #include "regex.h" | 176 #include "regex.h" |
177 | 177 |
178 /* isalpha etc. are used for the character classes. */ | 178 /* isalpha etc. are used for the character classes. */ |
179 #include <ctype.h> | 179 #include <ctype.h> |
180 | 180 |
181 /* Jim Meyering writes: | 181 #ifdef emacs |
182 | 182 |
183 "... Some ctype macros are valid only for character codes that | 183 /* 1 if C is an ASCII character. */ |
184 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when | 184 #define ISASCII(c) ((c) < 0x80) |
185 using /bin/cc or gcc but without giving an ansi option). So, all | 185 |
186 ctype uses should be through macros like ISPRINT... If | 186 /* 1 if C is a unibyte character. */ |
187 STDC_HEADERS is defined, then autoconf has verified that the ctype | 187 #define ISUNIBYTE(c) 0 |
188 macros don't need to be guarded with references to isascii. ... | 188 |
189 Defining isascii to 1 should let any compiler worth its salt | 189 /* The Emacs definitions should not be directly affected by locales. */ |
190 eliminate the && through constant folding." */ | 190 |
191 | 191 /* In Emacs, these are only used for single-byte characters. */ |
192 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) | 192 #define ISDIGIT(c) ((c) >= '0' && (c) <= '9') |
193 #define ISASCII_1(c) 1 | 193 #define ISCNTRL(c) ((c) < ' ') |
194 #define ISXDIGIT(c) (ISDIGIT (c) || ((c) >= 'a' && (c) <= 'f') \ | |
195 || ((c) >= 'A' && (c) <= 'F')) | |
196 | |
197 /* This is only used for single-byte characters. */ | |
198 #define ISBLANK(c) ((c) == ' ' || (c) == '\t') | |
199 | |
200 /* The rest must handle multibyte characters. */ | |
201 | |
202 #define ISGRAPH(c) ((c) > ' ' && (c) != 0x7f) | |
203 #define ISPRINT(c) ((c) == ' ' || ISGRAPH (c)) | |
204 #define ISALPHA(c) (ISASCII (c) ? (((c) >= 'a' && (c) <= 'z') \ | |
205 || ((c) >= 'A' && (c) <= 'Z')) \ | |
206 : ISWORD (c)) | |
207 #define ISALNUM(c) (ISALPHA (c) || ISDIGIT (c)) | |
208 | |
209 #define ISLOWER(c) LOWERCASEP (lispbuf, c) | |
210 | |
211 #define ISPUNCT(c) (ISASCII (c) \ | |
212 ? ((c) > ' ' && (c) < 0x7F \ | |
213 && !(((c) >= 'a' && (c) <= 'z') \ | |
214 || ((c) >= 'A' && (c) <= 'Z') \ | |
215 || ((c) >= '0' && (c) <= '9'))) \ | |
216 : !ISWORD (c)) | |
217 | |
218 #define ISSPACE(c) \ | |
219 (SYNTAX (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf), c) == Swhitespace) | |
220 | |
221 #define ISUPPER(c) UPPERCASEP (lispbuf, c) | |
222 | |
223 #define ISWORD(c) (SYNTAX (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf), c) == Sword) | |
224 | |
225 #else /* not emacs */ | |
226 | |
227 /* 1 if C is an ASCII character. */ | |
228 #define ISASCII(c) ((c) < 0200) | |
229 | |
230 /* 1 if C is a unibyte character. */ | |
231 #define ISUNIBYTE(c) 0 | |
232 | |
233 #ifdef isblank | |
234 # define ISBLANK(c) isblank (c) | |
194 #else | 235 #else |
195 #define ISASCII_1(c) isascii(c) | 236 # define ISBLANK(c) ((c) == ' ' || (c) == '\t') |
196 #endif | 237 #endif |
197 | 238 #ifdef isgraph |
198 #ifdef MULE | 239 # define ISGRAPH(c) isgraph (c) |
199 /* The IS*() macros can be passed any character, including an extended | |
200 one. We need to make sure there are no crashes, which would occur | |
201 otherwise due to out-of-bounds array references. */ | |
202 #define ISASCII(c) (((EMACS_UINT) (c)) < 0x100 && ISASCII_1 (c)) | |
203 #else | 240 #else |
204 #define ISASCII(c) ISASCII_1 (c) | 241 # define ISGRAPH(c) (isprint (c) && !isspace (c)) |
205 #endif /* MULE */ | 242 #endif |
206 | 243 |
207 #ifdef isblank | 244 /* Solaris defines ISPRINT so we must undefine it first. */ |
208 #define ISBLANK(c) (ISASCII (c) && isblank (c)) | 245 #undef ISPRINT |
246 #define ISPRINT(c) isprint (c) | |
247 #define ISDIGIT(c) isdigit (c) | |
248 #define ISALNUM(c) isalnum (c) | |
249 #define ISALPHA(c) isalpha (c) | |
250 #define ISCNTRL(c) iscntrl (c) | |
251 #define ISLOWER(c) islower (c) | |
252 #define ISPUNCT(c) ispunct (c) | |
253 #define ISSPACE(c) isspace (c) | |
254 #define ISUPPER(c) isupper (c) | |
255 #define ISXDIGIT(c) isxdigit (c) | |
256 | |
257 #define ISWORD(c) ISALPHA (c) | |
258 | |
259 #ifdef _tolower | |
260 # define TOLOWER(c) _tolower (c) | |
209 #else | 261 #else |
210 #define ISBLANK(c) ((c) == ' ' || (c) == '\t') | 262 # define TOLOWER(c) tolower (c) |
211 #endif | 263 #endif |
212 #ifdef isgraph | 264 |
213 #define ISGRAPH(c) (ISASCII (c) && isgraph (c)) | 265 #endif /* emacs */ |
214 #else | |
215 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c)) | |
216 #endif | |
217 | |
218 #define ISPRINT(c) (ISASCII (c) && isprint (c)) | |
219 #define ISDIGIT(c) (ISASCII (c) && isdigit (c)) | |
220 #define ISALNUM(c) (ISASCII (c) && isalnum (c)) | |
221 #define ISALPHA(c) (ISASCII (c) && isalpha (c)) | |
222 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c)) | |
223 #define ISLOWER(c) (ISASCII (c) && islower (c)) | |
224 #define ISPUNCT(c) (ISASCII (c) && ispunct (c)) | |
225 #define ISSPACE(c) (ISASCII (c) && isspace (c)) | |
226 #define ISUPPER(c) (ISASCII (c) && isupper (c)) | |
227 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c)) | |
228 | 266 |
229 #ifndef NULL | 267 #ifndef NULL |
230 #define NULL (void *)0 | 268 #define NULL (void *)0 |
231 #endif | 269 #endif |
232 | 270 |
911 { | 949 { |
912 int nentries, i; | 950 int nentries, i; |
913 | 951 |
914 printf ("/charset_mule [%s", | 952 printf ("/charset_mule [%s", |
915 (re_opcode_t) *(p - 1) == charset_mule_not ? "^" : ""); | 953 (re_opcode_t) *(p - 1) == charset_mule_not ? "^" : ""); |
954 printf (" flags: 0x%02x ", *p++); | |
916 nentries = unified_range_table_nentries (p); | 955 nentries = unified_range_table_nentries (p); |
917 for (i = 0; i < nentries; i++) | 956 for (i = 0; i < nentries; i++) |
918 { | 957 { |
919 EMACS_INT first, last; | 958 EMACS_INT first, last; |
920 Lisp_Object dummy_val; | 959 Lisp_Object dummy_val; |
921 | 960 |
922 unified_range_table_get_range (p, i, &first, &last, | 961 unified_range_table_get_range (p, i, &first, &last, |
923 &dummy_val); | 962 &dummy_val); |
924 if (first < 0x100) | 963 if (first < 0x80) |
925 putchar (first); | 964 putchar (first); |
926 else | 965 else |
927 printf ("(0x%lx)", (long)first); | 966 printf ("(0x%lx)", (long)first); |
928 if (first != last) | 967 if (first != last) |
929 { | 968 { |
930 putchar ('-'); | 969 putchar ('-'); |
931 if (last < 0x100) | 970 if (last < 0x80) |
932 putchar (last); | 971 putchar (last); |
933 else | 972 else |
934 printf ("(0x%lx)", (long)last); | 973 printf ("(0x%lx)", (long)last); |
935 } | 974 } |
936 } | 975 } |
1972 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) | 2011 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) |
1973 | 2012 |
1974 /* The next available element. */ | 2013 /* The next available element. */ |
1975 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) | 2014 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) |
1976 | 2015 |
2016 /* Bits used to implement the multibyte-part of the various character | |
2017 classes such as [:alnum:] in a charset's range table. XEmacs; use an | |
2018 enum, so they're visible in the debugger. */ | |
2019 enum | |
2020 { | |
2021 BIT_WORD = (1 << 0), | |
2022 BIT_LOWER = (1 << 1), | |
2023 BIT_PUNCT = (1 << 2), | |
2024 BIT_SPACE = (1 << 3), | |
2025 BIT_UPPER = (1 << 4), | |
2026 /* XEmacs; we need this, because we unify treatment of ASCII and non-ASCII | |
2027 (possible matches) in charset_mule. [:alpha:] matches all characters | |
2028 with word syntax, with the exception of [0-9]. We don't need | |
2029 BIT_MULTIBYTE. */ | |
2030 BIT_ALPHA = (1 << 5) | |
2031 }; | |
1977 | 2032 |
1978 /* Set the bit for character C in a bit vector. */ | 2033 /* Set the bit for character C in a bit vector. */ |
1979 #define SET_LIST_BIT(c) \ | 2034 #define SET_LIST_BIT(c) \ |
1980 (buf_end[((unsigned char) (c)) / BYTEWIDTH] \ | 2035 (buf_end[((unsigned char) (c)) / BYTEWIDTH] \ |
1981 |= 1 << (((unsigned char) c) % BYTEWIDTH)) | 2036 |= 1 << (((unsigned char) c) % BYTEWIDTH)) |
1983 #ifdef MULE | 2038 #ifdef MULE |
1984 | 2039 |
1985 /* Set the "bit" for character C in a range table. */ | 2040 /* Set the "bit" for character C in a range table. */ |
1986 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt) | 2041 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt) |
1987 | 2042 |
1988 /* Set the "bit" for character c in the appropriate table. */ | 2043 #endif |
1989 #define SET_EITHER_BIT(c) \ | |
1990 do { \ | |
1991 if (has_extended_chars) \ | |
1992 SET_RANGETAB_BIT (c); \ | |
1993 else \ | |
1994 SET_LIST_BIT (c); \ | |
1995 } while (0) | |
1996 | |
1997 #else /* not MULE */ | |
1998 | |
1999 #define SET_EITHER_BIT(c) SET_LIST_BIT (c) | |
2000 | |
2001 #endif | |
2002 | |
2003 | 2044 |
2004 /* Get the next unsigned number in the uncompiled pattern. */ | 2045 /* Get the next unsigned number in the uncompiled pattern. */ |
2005 #define GET_UNSIGNED_NUMBER(num) \ | 2046 #define GET_UNSIGNED_NUMBER(num) \ |
2006 { if (p != pend) \ | 2047 { if (p != pend) \ |
2007 { \ | 2048 { \ |
2016 PATFETCH (c); \ | 2057 PATFETCH (c); \ |
2017 } \ | 2058 } \ |
2018 } \ | 2059 } \ |
2019 } | 2060 } |
2020 | 2061 |
2021 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ | 2062 #define CHAR_CLASS_MAX_LENGTH 9 /* Namely, `multibyte'. */ |
2022 | 2063 |
2023 #define IS_CHAR_CLASS(string) \ | 2064 /* Map a string to the char class it names (if any). */ |
2024 (STREQ (string, "alpha") || STREQ (string, "upper") \ | 2065 static re_wctype_t |
2025 || STREQ (string, "lower") || STREQ (string, "digit") \ | 2066 re_wctype (const char *string) |
2026 || STREQ (string, "alnum") || STREQ (string, "xdigit") \ | 2067 { |
2027 || STREQ (string, "space") || STREQ (string, "print") \ | 2068 if (STREQ (string, "alnum")) return RECC_ALNUM; |
2028 || STREQ (string, "punct") || STREQ (string, "graph") \ | 2069 else if (STREQ (string, "alpha")) return RECC_ALPHA; |
2029 || STREQ (string, "cntrl") || STREQ (string, "blank")) | 2070 else if (STREQ (string, "word")) return RECC_WORD; |
2071 else if (STREQ (string, "ascii")) return RECC_ASCII; | |
2072 else if (STREQ (string, "nonascii")) return RECC_NONASCII; | |
2073 else if (STREQ (string, "graph")) return RECC_GRAPH; | |
2074 else if (STREQ (string, "lower")) return RECC_LOWER; | |
2075 else if (STREQ (string, "print")) return RECC_PRINT; | |
2076 else if (STREQ (string, "punct")) return RECC_PUNCT; | |
2077 else if (STREQ (string, "space")) return RECC_SPACE; | |
2078 else if (STREQ (string, "upper")) return RECC_UPPER; | |
2079 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE; | |
2080 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE; | |
2081 else if (STREQ (string, "digit")) return RECC_DIGIT; | |
2082 else if (STREQ (string, "xdigit")) return RECC_XDIGIT; | |
2083 else if (STREQ (string, "cntrl")) return RECC_CNTRL; | |
2084 else if (STREQ (string, "blank")) return RECC_BLANK; | |
2085 else return RECC_ERROR; | |
2086 } | |
2087 | |
2088 /* True if CH is in the char class CC. */ | |
2089 static re_bool | |
2090 re_iswctype (int ch, re_wctype_t cc) | |
2091 { | |
2092 #ifdef emacs | |
2093 /* This is cheesy, lispbuf isn't available to us when compiling the | |
2094 pattern. It's effectively only called (on Mule builds) when the current | |
2095 buffer doesn't matter (e.g. for RECC_ASCII, RECC_CNTRL), so it's not a | |
2096 big deal. */ | |
2097 struct buffer *lispbuf = current_buffer; | |
2098 #endif | |
2099 | |
2100 switch (cc) | |
2101 { | |
2102 case RECC_ALNUM: return ISALNUM (ch) != 0; | |
2103 case RECC_ALPHA: return ISALPHA (ch) != 0; | |
2104 case RECC_BLANK: return ISBLANK (ch) != 0; | |
2105 case RECC_CNTRL: return ISCNTRL (ch) != 0; | |
2106 case RECC_DIGIT: return ISDIGIT (ch) != 0; | |
2107 case RECC_GRAPH: return ISGRAPH (ch) != 0; | |
2108 case RECC_LOWER: return ISLOWER (ch) != 0; | |
2109 case RECC_PRINT: return ISPRINT (ch) != 0; | |
2110 case RECC_PUNCT: return ISPUNCT (ch) != 0; | |
2111 case RECC_SPACE: return ISSPACE (ch) != 0; | |
2112 case RECC_UPPER: return ISUPPER (ch) != 0; | |
2113 case RECC_XDIGIT: return ISXDIGIT (ch) != 0; | |
2114 case RECC_ASCII: return ISASCII (ch) != 0; | |
2115 case RECC_NONASCII: case RECC_MULTIBYTE: return !ISASCII (ch); | |
2116 case RECC_UNIBYTE: return ISUNIBYTE (ch) != 0; | |
2117 case RECC_WORD: return ISWORD (ch) != 0; | |
2118 case RECC_ERROR: return false; | |
2119 default: | |
2120 abort (); | |
2121 } | |
2122 } | |
2123 | |
2124 #ifdef MULE | |
2125 | |
2126 static re_bool | |
2127 re_wctype_can_match_non_ascii (re_wctype_t cc) | |
2128 { | |
2129 switch (cc) | |
2130 { | |
2131 case RECC_ASCII: | |
2132 case RECC_UNIBYTE: | |
2133 case RECC_CNTRL: | |
2134 case RECC_DIGIT: | |
2135 case RECC_XDIGIT: | |
2136 case RECC_BLANK: | |
2137 return false; | |
2138 default: | |
2139 return true; | |
2140 } | |
2141 } | |
2142 | |
2143 /* Return a bit-pattern to use in the range-table bits to match multibyte | |
2144 chars of class CC. */ | |
2145 static unsigned char | |
2146 re_wctype_to_bit (re_wctype_t cc) | |
2147 { | |
2148 switch (cc) | |
2149 { | |
2150 case RECC_PRINT: case RECC_GRAPH: | |
2151 case RECC_ALPHA: return BIT_ALPHA; | |
2152 case RECC_ALNUM: case RECC_WORD: return BIT_WORD; | |
2153 case RECC_LOWER: return BIT_LOWER; | |
2154 case RECC_UPPER: return BIT_UPPER; | |
2155 case RECC_PUNCT: return BIT_PUNCT; | |
2156 case RECC_SPACE: return BIT_SPACE; | |
2157 case RECC_MULTIBYTE: case RECC_NONASCII: | |
2158 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL: | |
2159 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0; | |
2160 default: | |
2161 abort (); | |
2162 } | |
2163 } | |
2164 | |
2165 #endif /* MULE */ | |
2030 | 2166 |
2031 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg); | 2167 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg); |
2032 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2); | 2168 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2); |
2033 static void insert_op1 (re_opcode_t op, unsigned char *loc, int arg, | 2169 static void insert_op1 (re_opcode_t op, unsigned char *loc, int arg, |
2034 unsigned char *end); | 2170 unsigned char *end); |
2047 static reg_errcode_t compile_extended_range (re_char **p_ptr, | 2183 static reg_errcode_t compile_extended_range (re_char **p_ptr, |
2048 re_char *pend, | 2184 re_char *pend, |
2049 RE_TRANSLATE_TYPE translate, | 2185 RE_TRANSLATE_TYPE translate, |
2050 reg_syntax_t syntax, | 2186 reg_syntax_t syntax, |
2051 Lisp_Object rtab); | 2187 Lisp_Object rtab); |
2188 static reg_errcode_t compile_char_class (re_wctype_t cc, Lisp_Object rtab, | |
2189 Bitbyte *flags_out); | |
2052 #endif /* MULE */ | 2190 #endif /* MULE */ |
2053 static re_bool group_match_null_string_p (unsigned char **p, | 2191 static re_bool group_match_null_string_p (unsigned char **p, |
2054 unsigned char *end, | 2192 unsigned char *end, |
2055 register_info_type *reg_info); | 2193 register_info_type *reg_info); |
2056 static re_bool alt_match_null_string_p (unsigned char *p, unsigned char *end, | 2194 static re_bool alt_match_null_string_p (unsigned char *p, unsigned char *end, |
2510 case '.': | 2648 case '.': |
2511 laststart = buf_end; | 2649 laststart = buf_end; |
2512 BUF_PUSH (anychar); | 2650 BUF_PUSH (anychar); |
2513 break; | 2651 break; |
2514 | 2652 |
2653 #ifdef MULE | |
2654 #define MAYBE_START_OVER_WITH_EXTENDED(ch) \ | |
2655 if (ch >= 0x80) \ | |
2656 { \ | |
2657 goto start_over_with_extended; \ | |
2658 } while (0) | |
2659 #else | |
2660 #define MAYBE_START_OVER_WITH_EXTENDED(ch) | |
2661 #endif | |
2515 | 2662 |
2516 case '[': | 2663 case '[': |
2517 { | 2664 { |
2518 /* XEmacs change: this whole section */ | 2665 /* XEmacs change: this whole section */ |
2519 re_bool had_char_class = false; | 2666 re_bool had_char_class = false; |
2520 #ifdef MULE | |
2521 re_bool has_extended_chars = false; | |
2522 REGISTER Lisp_Object rtab = Qnil; | |
2523 #endif | |
2524 | 2667 |
2525 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | 2668 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
2526 | 2669 |
2527 /* Ensure that we have enough space to push a charset: the | 2670 /* Ensure that we have enough space to push a charset: the |
2528 opcode, the length count, and the bitset; 34 bytes in all. */ | 2671 opcode, the length count, and the bitset; 34 bytes in all. */ |
2548 /* charset_not matches newline according to a syntax bit. */ | 2691 /* charset_not matches newline according to a syntax bit. */ |
2549 if ((re_opcode_t) buf_end[-2] == charset_not | 2692 if ((re_opcode_t) buf_end[-2] == charset_not |
2550 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) | 2693 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) |
2551 SET_LIST_BIT ('\n'); | 2694 SET_LIST_BIT ('\n'); |
2552 | 2695 |
2553 #ifdef MULE | |
2554 start_over_with_extended: | |
2555 if (has_extended_chars) | |
2556 { | |
2557 /* There are extended chars here, which means we need to start | |
2558 over and shift to unified range-table format. */ | |
2559 if (buf_end[-2] == charset) | |
2560 buf_end[-2] = charset_mule; | |
2561 else | |
2562 buf_end[-2] = charset_mule_not; | |
2563 buf_end--; | |
2564 p = p1; /* go back to the beginning of the charset, after | |
2565 a possible ^. */ | |
2566 rtab = Vthe_lisp_rangetab; | |
2567 Fclear_range_table (rtab); | |
2568 | |
2569 /* charset_not matches newline according to a syntax bit. */ | |
2570 if ((re_opcode_t) buf_end[-1] == charset_mule_not | |
2571 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) | |
2572 SET_EITHER_BIT ('\n'); | |
2573 } | |
2574 #endif /* MULE */ | |
2575 | |
2576 /* Read in characters and ranges, setting map bits. */ | 2696 /* Read in characters and ranges, setting map bits. */ |
2577 for (;;) | 2697 for (;;) |
2578 { | 2698 { |
2579 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | 2699 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
2580 | 2700 |
2581 PATFETCH (c); | 2701 PATFETCH (c); |
2582 | 2702 |
2583 #ifdef MULE | 2703 /* Frumble-bumble, we may have found some extended chars. |
2584 if (c >= 0x80 && !has_extended_chars) | 2704 Need to start over, process everything using the general |
2585 { | 2705 extended-char mechanism, and need to use charset_mule and |
2586 has_extended_chars = 1; | 2706 charset_mule_not instead of charset and charset_not. */ |
2587 /* Frumble-bumble, we've found some extended chars. | 2707 MAYBE_START_OVER_WITH_EXTENDED (c); |
2588 Need to start over, process everything using | 2708 |
2589 the general extended-char mechanism, and need | |
2590 to use charset_mule and charset_mule_not instead | |
2591 of charset and charset_not. */ | |
2592 goto start_over_with_extended; | |
2593 } | |
2594 #endif /* MULE */ | |
2595 /* \ might escape characters inside [...] and [^...]. */ | 2709 /* \ might escape characters inside [...] and [^...]. */ |
2596 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') | 2710 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') |
2597 { | 2711 { |
2598 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); | 2712 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); |
2599 | 2713 |
2600 PATFETCH (c1); | 2714 PATFETCH (c1); |
2601 #ifdef MULE | 2715 |
2602 if (c1 >= 0x80 && !has_extended_chars) | 2716 MAYBE_START_OVER_WITH_EXTENDED (c1); |
2603 { | 2717 |
2604 has_extended_chars = 1; | 2718 SET_LIST_BIT (c1); |
2605 goto start_over_with_extended; | |
2606 } | |
2607 #endif /* MULE */ | |
2608 SET_EITHER_BIT (c1); | |
2609 continue; | 2719 continue; |
2610 } | 2720 } |
2611 | 2721 |
2612 /* Could be the end of the bracket expression. If it's | 2722 /* Could be the end of the bracket expression. If it's |
2613 not (i.e., when the bracket expression is `[]' so | 2723 not (i.e., when the bracket expression is `[]' so |
2629 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') | 2739 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') |
2630 && *p != ']') | 2740 && *p != ']') |
2631 { | 2741 { |
2632 reg_errcode_t ret; | 2742 reg_errcode_t ret; |
2633 | 2743 |
2634 #ifdef MULE | 2744 MAYBE_START_OVER_WITH_EXTENDED (*(unsigned char *)p); |
2635 if (* (unsigned char *) p >= 0x80 && !has_extended_chars) | 2745 |
2636 { | 2746 ret = compile_range (&p, pend, translate, syntax, |
2637 has_extended_chars = 1; | 2747 buf_end); |
2638 goto start_over_with_extended; | 2748 |
2639 } | |
2640 if (has_extended_chars) | |
2641 ret = compile_extended_range (&p, pend, translate, | |
2642 syntax, rtab); | |
2643 else | |
2644 #endif /* MULE */ | |
2645 ret = compile_range (&p, pend, translate, syntax, buf_end); | |
2646 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); | 2749 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); |
2647 } | 2750 } |
2648 | 2751 |
2649 else if (p[0] == '-' && p[1] != ']') | 2752 else if (p[0] == '-' && p[1] != ']') |
2650 { /* This handles ranges made up of characters only. */ | 2753 { /* This handles ranges made up of characters only. */ |
2651 reg_errcode_t ret; | 2754 reg_errcode_t ret; |
2652 | 2755 |
2653 /* Move past the `-'. */ | 2756 /* Move past the `-'. */ |
2654 PATFETCH (c1); | 2757 PATFETCH (c1); |
2655 | 2758 |
2656 #ifdef MULE | 2759 MAYBE_START_OVER_WITH_EXTENDED (*(unsigned char *)p); |
2657 if (* (unsigned char *) p >= 0x80 && !has_extended_chars) | 2760 |
2658 { | 2761 ret = compile_range (&p, pend, translate, syntax, buf_end); |
2659 has_extended_chars = 1; | 2762 |
2660 goto start_over_with_extended; | |
2661 } | |
2662 if (has_extended_chars) | |
2663 ret = compile_extended_range (&p, pend, translate, | |
2664 syntax, rtab); | |
2665 else | |
2666 #endif /* MULE */ | |
2667 ret = compile_range (&p, pend, translate, syntax, buf_end); | |
2668 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); | 2763 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); |
2669 } | 2764 } |
2670 | 2765 |
2671 /* See if we're at the beginning of a possible character | 2766 /* See if we're at the beginning of a possible character |
2672 class. */ | 2767 class. */ |
2673 | 2768 |
2674 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') | 2769 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') |
2675 { /* Leave room for the null. */ | 2770 { /* Leave room for the null. */ |
2676 char str[CHAR_CLASS_MAX_LENGTH + 1]; | 2771 char str[CHAR_CLASS_MAX_LENGTH + 1]; |
2772 int ch = 0; | |
2677 | 2773 |
2678 PATFETCH (c); | 2774 PATFETCH (c); |
2679 c1 = 0; | 2775 c1 = 0; |
2680 | 2776 |
2681 /* If pattern is `[[:'. */ | 2777 /* If pattern is `[[:'. */ |
2682 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | 2778 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
2683 | 2779 |
2684 for (;;) | 2780 for (;;) |
2685 { | 2781 { |
2686 /* #### This code is unused. | 2782 PATFETCH (c); |
2687 Correctness is not checked after TRT | 2783 if ((c == ':' && *p == ']') || p == pend) |
2688 table change. */ | 2784 break; |
2689 PATFETCH (c); | 2785 if (c1 < CHAR_CLASS_MAX_LENGTH) |
2690 if (c == ':' || c == ']' || p == pend | 2786 str[c1++] = c; |
2691 || c1 == CHAR_CLASS_MAX_LENGTH) | 2787 else |
2692 break; | 2788 /* This is in any case an invalid class name. */ |
2693 str[c1++] = (char) c; | 2789 str[0] = '\0'; |
2694 } | 2790 } |
2695 str[c1] = '\0'; | 2791 str[c1] = '\0'; |
2696 | 2792 |
2697 /* If isn't a word bracketed by `[:' and `:]': | 2793 /* If isn't a word bracketed by `[:' and `:]': |
2698 undo the ending character, the letters, and leave | 2794 undo the ending character, the letters, and leave |
2699 the leading `:' and `[' (but set bits for them). */ | 2795 the leading `:' and `[' (but set bits for them). */ |
2700 if (c == ':' && *p == ']') | 2796 if (c == ':' && *p == ']') |
2701 { | 2797 { |
2702 int ch; | 2798 re_wctype_t cc = re_wctype (str); |
2703 re_bool is_alnum = STREQ (str, "alnum"); | 2799 |
2704 re_bool is_alpha = STREQ (str, "alpha"); | 2800 if (cc == RECC_ERROR) |
2705 re_bool is_blank = STREQ (str, "blank"); | |
2706 re_bool is_cntrl = STREQ (str, "cntrl"); | |
2707 re_bool is_digit = STREQ (str, "digit"); | |
2708 re_bool is_graph = STREQ (str, "graph"); | |
2709 re_bool is_lower = STREQ (str, "lower"); | |
2710 re_bool is_print = STREQ (str, "print"); | |
2711 re_bool is_punct = STREQ (str, "punct"); | |
2712 re_bool is_space = STREQ (str, "space"); | |
2713 re_bool is_upper = STREQ (str, "upper"); | |
2714 re_bool is_xdigit = STREQ (str, "xdigit"); | |
2715 | |
2716 if (!IS_CHAR_CLASS (str)) | |
2717 FREE_STACK_RETURN (REG_ECTYPE); | 2801 FREE_STACK_RETURN (REG_ECTYPE); |
2718 | 2802 |
2719 /* Throw away the ] at the end of the character | 2803 /* Throw away the ] at the end of the character |
2720 class. */ | 2804 class. */ |
2721 PATFETCH (c); | 2805 PATFETCH (c); |
2722 | 2806 |
2723 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | 2807 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
2724 | 2808 |
2725 for (ch = 0; ch < 1 << BYTEWIDTH; ch++) | 2809 #ifdef MULE |
2726 { | 2810 if (re_wctype_can_match_non_ascii (cc)) |
2727 /* This was split into 3 if's to | 2811 { |
2728 avoid an arbitrary limit in some compiler. */ | 2812 goto start_over_with_extended; |
2729 if ( (is_alnum && ISALNUM (ch)) | 2813 } |
2730 || (is_alpha && ISALPHA (ch)) | 2814 #endif /* MULE */ |
2731 || (is_blank && ISBLANK (ch)) | 2815 for (ch = 0; ch < (1 << BYTEWIDTH); ++ch) |
2732 || (is_cntrl && ISCNTRL (ch))) | 2816 { |
2733 SET_EITHER_BIT (ch); | 2817 if (re_iswctype (ch, cc)) |
2734 if ( (is_digit && ISDIGIT (ch)) | 2818 { |
2735 || (is_graph && ISGRAPH (ch)) | 2819 SET_LIST_BIT (ch); |
2736 || (is_lower && ISLOWER (ch)) | 2820 } |
2737 || (is_print && ISPRINT (ch))) | 2821 } |
2738 SET_EITHER_BIT (ch); | 2822 |
2739 if ( (is_punct && ISPUNCT (ch)) | |
2740 || (is_space && ISSPACE (ch)) | |
2741 || (is_upper && ISUPPER (ch)) | |
2742 || (is_xdigit && ISXDIGIT (ch))) | |
2743 SET_EITHER_BIT (ch); | |
2744 } | |
2745 had_char_class = true; | 2823 had_char_class = true; |
2746 } | 2824 } |
2747 else | 2825 else |
2748 { | 2826 { |
2749 c1++; | 2827 c1++; |
2750 while (c1--) | 2828 while (c1--) |
2751 PATUNFETCH; | 2829 PATUNFETCH; |
2752 SET_EITHER_BIT ('['); | 2830 SET_LIST_BIT ('['); |
2753 SET_EITHER_BIT (':'); | 2831 SET_LIST_BIT (':'); |
2754 had_char_class = false; | 2832 had_char_class = false; |
2755 } | 2833 } |
2756 } | 2834 } |
2757 else | 2835 else |
2758 { | 2836 { |
2759 had_char_class = false; | 2837 had_char_class = false; |
2760 SET_EITHER_BIT (c); | 2838 SET_LIST_BIT (c); |
2761 } | 2839 } |
2762 } | 2840 } |
2763 | 2841 |
2764 #ifdef MULE | |
2765 if (has_extended_chars) | |
2766 { | |
2767 /* We have a range table, not a bit vector. */ | |
2768 int bytes_needed = | |
2769 unified_range_table_bytes_needed (rtab); | |
2770 GET_BUFFER_SPACE (bytes_needed); | |
2771 unified_range_table_copy_data (rtab, buf_end); | |
2772 buf_end += unified_range_table_bytes_used (buf_end); | |
2773 break; | |
2774 } | |
2775 #endif /* MULE */ | |
2776 /* Discard any (non)matching list bytes that are all 0 at the | 2842 /* Discard any (non)matching list bytes that are all 0 at the |
2777 end of the map. Decrease the map-length byte too. */ | 2843 end of the map. Decrease the map-length byte too. */ |
2778 while ((int) buf_end[-1] > 0 && buf_end[buf_end[-1] - 1] == 0) | 2844 while ((int) buf_end[-1] > 0 && buf_end[buf_end[-1] - 1] == 0) |
2779 buf_end[-1]--; | 2845 buf_end[-1]--; |
2780 buf_end += buf_end[-1]; | 2846 buf_end += buf_end[-1]; |
2781 } | 2847 } |
2782 break; | 2848 break; |
2783 | 2849 |
2850 #ifdef MULE | |
2851 start_over_with_extended: | |
2852 { | |
2853 REGISTER Lisp_Object rtab = Qnil; | |
2854 Bitbyte flags = 0; | |
2855 int bytes_needed = sizeof (flags); | |
2856 re_bool had_char_class = false; | |
2857 | |
2858 /* There are extended chars here, which means we need to use the | |
2859 unified range-table format. */ | |
2860 if (buf_end[-2] == charset) | |
2861 buf_end[-2] = charset_mule; | |
2862 else | |
2863 buf_end[-2] = charset_mule_not; | |
2864 buf_end--; | |
2865 p = p1; /* go back to the beginning of the charset, after | |
2866 a possible ^. */ | |
2867 rtab = Vthe_lisp_rangetab; | |
2868 Fclear_range_table (rtab); | |
2869 | |
2870 /* charset_not matches newline according to a syntax bit. */ | |
2871 if ((re_opcode_t) buf_end[-1] == charset_mule_not | |
2872 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) | |
2873 SET_RANGETAB_BIT ('\n'); | |
2874 | |
2875 /* Read in characters and ranges, setting map bits. */ | |
2876 for (;;) | |
2877 { | |
2878 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | |
2879 | |
2880 PATFETCH (c); | |
2881 | |
2882 /* \ might escape characters inside [...] and [^...]. */ | |
2883 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') | |
2884 { | |
2885 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); | |
2886 | |
2887 PATFETCH (c1); | |
2888 | |
2889 SET_RANGETAB_BIT (c1); | |
2890 continue; | |
2891 } | |
2892 | |
2893 /* Could be the end of the bracket expression. If it's | |
2894 not (i.e., when the bracket expression is `[]' so | |
2895 far), the ']' character bit gets set way below. */ | |
2896 if (c == ']' && p != p1 + 1) | |
2897 break; | |
2898 | |
2899 /* Look ahead to see if it's a range when the last thing | |
2900 was a character class. */ | |
2901 if (had_char_class && c == '-' && *p != ']') | |
2902 FREE_STACK_RETURN (REG_ERANGE); | |
2903 | |
2904 /* Look ahead to see if it's a range when the last thing | |
2905 was a character: if this is a hyphen not at the | |
2906 beginning or the end of a list, then it's the range | |
2907 operator. */ | |
2908 if (c == '-' | |
2909 && !(p - 2 >= pattern && p[-2] == '[') | |
2910 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') | |
2911 && *p != ']') | |
2912 { | |
2913 reg_errcode_t ret; | |
2914 | |
2915 ret = compile_extended_range (&p, pend, translate, syntax, | |
2916 rtab); | |
2917 | |
2918 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); | |
2919 } | |
2920 | |
2921 else if (p[0] == '-' && p[1] != ']') | |
2922 { /* This handles ranges made up of characters only. */ | |
2923 reg_errcode_t ret; | |
2924 | |
2925 /* Move past the `-'. */ | |
2926 PATFETCH (c1); | |
2927 | |
2928 ret = compile_extended_range (&p, pend, translate, | |
2929 syntax, rtab); | |
2930 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); | |
2931 } | |
2932 | |
2933 /* See if we're at the beginning of a possible character | |
2934 class. */ | |
2935 | |
2936 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') | |
2937 { /* Leave room for the null. */ | |
2938 char str[CHAR_CLASS_MAX_LENGTH + 1]; | |
2939 | |
2940 PATFETCH (c); | |
2941 c1 = 0; | |
2942 | |
2943 /* If pattern is `[[:'. */ | |
2944 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | |
2945 | |
2946 for (;;) | |
2947 { | |
2948 PATFETCH (c); | |
2949 if ((c == ':' && *p == ']') || p == pend) | |
2950 break; | |
2951 if (c1 < CHAR_CLASS_MAX_LENGTH) | |
2952 str[c1++] = c; | |
2953 else | |
2954 /* This is in any case an invalid class name. */ | |
2955 str[0] = '\0'; | |
2956 } | |
2957 str[c1] = '\0'; | |
2958 | |
2959 /* If isn't a word bracketed by `[:' and `:]': | |
2960 undo the ending character, the letters, and leave | |
2961 the leading `:' and `[' (but set bits for them). */ | |
2962 if (c == ':' && *p == ']') | |
2963 { | |
2964 re_wctype_t cc = re_wctype (str); | |
2965 reg_errcode_t ret = REG_NOERROR; | |
2966 | |
2967 if (cc == RECC_ERROR) | |
2968 FREE_STACK_RETURN (REG_ECTYPE); | |
2969 | |
2970 /* Throw away the ] at the end of the character | |
2971 class. */ | |
2972 PATFETCH (c); | |
2973 | |
2974 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | |
2975 | |
2976 ret = compile_char_class (cc, rtab, &flags); | |
2977 | |
2978 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); | |
2979 | |
2980 had_char_class = true; | |
2981 } | |
2982 else | |
2983 { | |
2984 c1++; | |
2985 while (c1--) | |
2986 PATUNFETCH; | |
2987 SET_RANGETAB_BIT ('['); | |
2988 SET_RANGETAB_BIT (':'); | |
2989 had_char_class = false; | |
2990 } | |
2991 } | |
2992 else | |
2993 { | |
2994 had_char_class = false; | |
2995 SET_RANGETAB_BIT (c); | |
2996 } | |
2997 } | |
2998 | |
2999 bytes_needed += unified_range_table_bytes_needed (rtab); | |
3000 GET_BUFFER_SPACE (bytes_needed); | |
3001 *buf_end++ = flags; | |
3002 unified_range_table_copy_data (rtab, buf_end); | |
3003 buf_end += unified_range_table_bytes_used (buf_end); | |
3004 break; | |
3005 } | |
3006 #endif /* MULE */ | |
2784 | 3007 |
2785 case '(': | 3008 case '(': |
2786 if (syntax & RE_NO_BK_PARENS) | 3009 if (syntax & RE_NO_BK_PARENS) |
2787 goto handle_open; | 3010 goto handle_open; |
2788 else | 3011 else |
3713 put_range_table (rtab, this_char, range_end, Qt); | 3936 put_range_table (rtab, this_char, range_end, Qt); |
3714 | 3937 |
3715 return REG_NOERROR; | 3938 return REG_NOERROR; |
3716 } | 3939 } |
3717 | 3940 |
3941 static reg_errcode_t | |
3942 compile_char_class (re_wctype_t cc, Lisp_Object rtab, Bitbyte *flags_out) | |
3943 { | |
3944 *flags_out |= re_wctype_to_bit (cc); | |
3945 | |
3946 switch (cc) | |
3947 { | |
3948 case RECC_ASCII: | |
3949 put_range_table (rtab, 0, 0x7f, Qt); | |
3950 break; | |
3951 | |
3952 case RECC_XDIGIT: | |
3953 put_range_table (rtab, 'a', 'f', Qt); | |
3954 put_range_table (rtab, 'A', 'f', Qt); | |
3955 /* fallthrough */ | |
3956 case RECC_DIGIT: | |
3957 put_range_table (rtab, '0', '9', Qt); | |
3958 break; | |
3959 | |
3960 case RECC_BLANK: | |
3961 put_range_table (rtab, ' ', ' ', Qt); | |
3962 put_range_table (rtab, '\t', '\t', Qt); | |
3963 break; | |
3964 | |
3965 case RECC_PRINT: | |
3966 put_range_table (rtab, ' ', 0x7e, Qt); | |
3967 put_range_table (rtab, 0x80, MOST_POSITIVE_FIXNUM, Qt); | |
3968 break; | |
3969 | |
3970 case RECC_GRAPH: | |
3971 put_range_table (rtab, '!', 0x7e, Qt); | |
3972 put_range_table (rtab, 0x80, MOST_POSITIVE_FIXNUM, Qt); | |
3973 break; | |
3974 | |
3975 case RECC_NONASCII: | |
3976 case RECC_MULTIBYTE: | |
3977 put_range_table (rtab, 0x80, MOST_POSITIVE_FIXNUM, Qt); | |
3978 break; | |
3979 | |
3980 case RECC_CNTRL: | |
3981 put_range_table (rtab, 0x00, 0x1f, Qt); | |
3982 break; | |
3983 | |
3984 case RECC_UNIBYTE: | |
3985 /* Never true in XEmacs. */ | |
3986 break; | |
3987 | |
3988 /* The following all have their own bits in the class_bits argument to | |
3989 charset_mule and charset_mule_not, they don't use the range table | |
3990 information. */ | |
3991 case RECC_ALPHA: | |
3992 case RECC_WORD: | |
3993 case RECC_ALNUM: /* Equivalent to RECC_WORD */ | |
3994 case RECC_LOWER: | |
3995 case RECC_PUNCT: | |
3996 case RECC_SPACE: | |
3997 case RECC_UPPER: | |
3998 break; | |
3999 } | |
4000 | |
4001 return REG_NOERROR; | |
4002 } | |
4003 | |
3718 #endif /* MULE */ | 4004 #endif /* MULE */ |
3719 | 4005 |
3720 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in | 4006 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in |
3721 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible | 4007 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible |
3722 characters can start a string that matches the pattern. This fastmap | 4008 characters can start a string that matches the pattern. This fastmap |
3853 #ifdef MULE | 4139 #ifdef MULE |
3854 case charset_mule: | 4140 case charset_mule: |
3855 { | 4141 { |
3856 int nentries; | 4142 int nentries; |
3857 int i; | 4143 int i; |
4144 Bitbyte flags = *p++; | |
4145 | |
4146 if (flags) | |
4147 { | |
4148 /* We need to consult the syntax table, fastmap won't | |
4149 work. */ | |
4150 bufp->can_be_null = 1; | |
4151 goto done; | |
4152 } | |
3858 | 4153 |
3859 nentries = unified_range_table_nentries (p); | 4154 nentries = unified_range_table_nentries (p); |
3860 for (i = 0; i < nentries; i++) | 4155 for (i = 0; i < nentries; i++) |
3861 { | 4156 { |
3862 EMACS_INT first, last; | 4157 EMACS_INT first, last; |
3876 if (last < 0x100) | 4171 if (last < 0x100) |
3877 { | 4172 { |
3878 set_itext_ichar (strr, last); | 4173 set_itext_ichar (strr, last); |
3879 fastmap[*strr] = 1; | 4174 fastmap[*strr] = 1; |
3880 } | 4175 } |
4176 else if (MOST_POSITIVE_FIXNUM == last) | |
4177 { | |
4178 /* This is RECC_MULTIBYTE or RECC_NONASCII; true for all | |
4179 non-ASCII characters. */ | |
4180 jj = 0x80; | |
4181 while (jj < 0xA0) | |
4182 { | |
4183 fastmap[jj++] = 1; | |
4184 } | |
4185 } | |
3881 } | 4186 } |
3882 } | 4187 } |
3883 break; | 4188 break; |
3884 | 4189 |
3885 case charset_mule_not: | 4190 case charset_mule_not: |
3886 { | 4191 { |
3887 int nentries; | 4192 int nentries; |
3888 int i; | 4193 int i; |
3889 int smallest_prev = 0; | 4194 int smallest_prev = 0; |
4195 Bitbyte flags = *p++; | |
4196 | |
4197 if (flags) | |
4198 { | |
4199 /* We need to consult the syntax table, fastmap won't | |
4200 work. */ | |
4201 bufp->can_be_null = 1; | |
4202 goto done; | |
4203 } | |
3890 | 4204 |
3891 nentries = unified_range_table_nentries (p); | 4205 nentries = unified_range_table_nentries (p); |
3892 for (i = 0; i < nentries; i++) | 4206 for (i = 0; i < nentries; i++) |
3893 { | 4207 { |
3894 EMACS_INT first, last; | 4208 EMACS_INT first, last; |
5414 case charset_mule: | 5728 case charset_mule: |
5415 case charset_mule_not: | 5729 case charset_mule_not: |
5416 { | 5730 { |
5417 REGISTER Ichar c; | 5731 REGISTER Ichar c; |
5418 re_bool not_p = (re_opcode_t) *(p - 1) == charset_mule_not; | 5732 re_bool not_p = (re_opcode_t) *(p - 1) == charset_mule_not; |
5733 Bitbyte class_bits = *p++; | |
5419 | 5734 |
5420 DEBUG_MATCH_PRINT2 ("EXECUTING charset_mule%s.\n", not_p ? "_not" : ""); | 5735 DEBUG_MATCH_PRINT2 ("EXECUTING charset_mule%s.\n", not_p ? "_not" : ""); |
5421 | |
5422 REGEX_PREFETCH (); | 5736 REGEX_PREFETCH (); |
5423 c = itext_ichar_fmt (d, fmt, lispobj); | 5737 c = itext_ichar_fmt (d, fmt, lispobj); |
5424 c = RE_TRANSLATE (c); /* The character to match. */ | 5738 c = RE_TRANSLATE (c); /* The character to match. */ |
5425 | 5739 |
5426 if (EQ (Qt, unified_range_table_lookup (p, c, Qnil))) | 5740 if ((class_bits && |
5427 not_p = !not_p; | 5741 ((class_bits & BIT_ALPHA && ISALPHA (c)) |
5742 || (class_bits & BIT_SPACE && ISSPACE (c)) | |
5743 || (class_bits & BIT_PUNCT && ISPUNCT (c)) | |
5744 || (class_bits & BIT_WORD && ISWORD (c)) | |
5745 || (TRANSLATE_P (translate) ? | |
5746 (class_bits & (BIT_UPPER | BIT_LOWER) | |
5747 && !NOCASEP (lispbuf, c)) | |
5748 : ((class_bits & BIT_UPPER && ISUPPER (c)) | |
5749 || (class_bits & BIT_LOWER && ISLOWER (c)))))) | |
5750 || EQ (Qt, unified_range_table_lookup (p, c, Qnil))) | |
5751 { | |
5752 not_p = !not_p; | |
5753 } | |
5428 | 5754 |
5429 p += unified_range_table_bytes_used (p); | 5755 p += unified_range_table_bytes_used (p); |
5430 | 5756 |
5431 if (!not_p) goto fail; | 5757 if (!not_p) goto fail; |
5432 | 5758 |