comparison src/regex.c @ 5648:3f4a234f4672

Support non-ASCII correctly in character classes, test this. src/ChangeLog addition: 2012-04-21 Aidan Kehoe <kehoea@parhasard.net> Support non-ASCII correctly in character classes ([:alnum:] and friends). * regex.c: * regex.c (ISBLANK, ISUNIBYTE): New. Make these and friends independent of the locale, since we want them to be consistent in XEmacs. * regex.c (print_partial_compiled_pattern): Print the flags for charset_mule; don't print non-ASCII as the character values in ranges, this breaks with locales. * regex.c (enum): Define various flags the charset_mule and charset_mule_not opcodes can now take. * regex.c (CHAR_CLASS_MAX_LENGTH): Update this. * regex.c (re_iswctype, re_wctype): New, from GNU. * regex.c (re_wctype_can_match_non_ascii): New; used when deciding on whether to use charset_mule or the ASCII-only regex character set opcode. * regex.c (regex_compile): Error correctly on long, non-existent character class names. Break out the handling of charsets that can match non-ASCII into a separate clause. Use compile_char_class when compiling character classes. * regex.c (compile_char_class): New. Used in regex_compile when compiling character sets that may match non-ASCII. * regex.c (re_compile_fastmap): If there are flags set for charset_mule or charset_mule_not, we can't use the fastmap (since we need to check syntax table values that aren't available there). * regex.c (re_match_2_internal): Check the new flags passed to the charset_mule{,_not} opcode, observe them if appropriate. * regex.h: * regex.h (enum): Expose re_wctype_t here, imported from GNU. tests/ChangeLog addition: 2012-04-21 Aidan Kehoe <kehoea@parhasard.net> * automated/regexp-tests.el: * automated/regexp-tests.el (Assert-char-class): Check that #'string-match errors correctly with an over-long character class name. Add tests for character class functionality that supports non-ASCII characters. These tests expose bugs in GNU Emacs 24.0.94.2, but pass under current XEmacs.
author Aidan Kehoe <kehoea@parhasard.net>
date Sat, 21 Apr 2012 18:58:28 +0100
parents 308d34e9f07d
children 3df910176b6a
comparison
equal deleted inserted replaced
5647:1d9f603e9125 5648:3f4a234f4672
176 #include "regex.h" 176 #include "regex.h"
177 177
178 /* isalpha etc. are used for the character classes. */ 178 /* isalpha etc. are used for the character classes. */
179 #include <ctype.h> 179 #include <ctype.h>
180 180
181 /* Jim Meyering writes: 181 #ifdef emacs
182 182
183 "... Some ctype macros are valid only for character codes that 183 /* 1 if C is an ASCII character. */
184 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when 184 #define ISASCII(c) ((c) < 0x80)
185 using /bin/cc or gcc but without giving an ansi option). So, all 185
186 ctype uses should be through macros like ISPRINT... If 186 /* 1 if C is a unibyte character. */
187 STDC_HEADERS is defined, then autoconf has verified that the ctype 187 #define ISUNIBYTE(c) 0
188 macros don't need to be guarded with references to isascii. ... 188
189 Defining isascii to 1 should let any compiler worth its salt 189 /* The Emacs definitions should not be directly affected by locales. */
190 eliminate the && through constant folding." */ 190
191 191 /* In Emacs, these are only used for single-byte characters. */
192 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) 192 #define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
193 #define ISASCII_1(c) 1 193 #define ISCNTRL(c) ((c) < ' ')
/* Hexadecimal digit test; built on ISDIGIT plus the two ASCII letter
   ranges a-f / A-F.  Like every macro in this group, the argument is
   evaluated more than once, so pass a side-effect-free expression.  */
#define ISXDIGIT(c)                             \
  (ISDIGIT (c)                                  \
   || ((c) >= 'A' && (c) <= 'F')                \
   || ((c) >= 'a' && (c) <= 'f'))

/* Space or horizontal tab.  This is only used for single-byte
   characters.  */
#define ISBLANK(c) ((c) == '\t' || (c) == ' ')

/* The rest must handle multibyte characters.  */

/* Anything above space other than DEL (0x7f); note there is no upper
   bound, so every code above 0x7f qualifies.  */
#define ISGRAPH(c) ((c) != 0x7f && (c) > ' ')

/* ISGRAPH plus the space character itself.  */
#define ISPRINT(c) (ISGRAPH (c) || (c) == ' ')

/* ASCII: the letters A-Z and a-z.  Non-ASCII: whatever ISWORD says.  */
#define ISALPHA(c)                                              \
  (ISASCII (c)                                                  \
   ? (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z')) \
   : ISWORD (c))

#define ISALNUM(c) (ISDIGIT (c) || ISALPHA (c))

/* Case tests are delegated to LOWERCASEP / UPPERCASEP; both expansions
   reference a variable named `lispbuf', which must be in scope wherever
   the macro is used.  */
#define ISLOWER(c) LOWERCASEP (lispbuf, c)
#define ISUPPER(c) UPPERCASEP (lispbuf, c)

/* ASCII: strictly between space and DEL and neither a digit nor a
   letter.  Non-ASCII: anything ISWORD rejects.  */
#define ISPUNCT(c)                              \
  (ISASCII (c)                                  \
   ? ((c) > ' ' && (c) < 0x7F                   \
      && !((c) >= '0' && (c) <= '9')            \
      && !((c) >= 'A' && (c) <= 'Z')            \
      && !((c) >= 'a' && (c) <= 'z'))           \
   : !ISWORD (c))

/* Syntax-class tests: compare the SYNTAX entry for C in lispbuf's
   mirror syntax table against Swhitespace / Sword.  These also need
   `lispbuf' in scope at the point of use.  */
#define ISSPACE(c)                                                      \
  (SYNTAX (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf), c) == Swhitespace)

#define ISWORD(c)                                                       \
  (SYNTAX (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf), c) == Sword)
225 #else /* not emacs */
226
227 /* 1 if C is an ASCII character. */
228 #define ISASCII(c) ((c) < 0200)
229
230 /* 1 if C is a unibyte character. */
231 #define ISUNIBYTE(c) 0
232
233 #ifdef isblank
234 # define ISBLANK(c) isblank (c)
194 #else 235 #else
195 #define ISASCII_1(c) isascii(c) 236 # define ISBLANK(c) ((c) == ' ' || (c) == '\t')
196 #endif 237 #endif
197 238 #ifdef isgraph
198 #ifdef MULE 239 # define ISGRAPH(c) isgraph (c)
199 /* The IS*() macros can be passed any character, including an extended
200 one. We need to make sure there are no crashes, which would occur
201 otherwise due to out-of-bounds array references. */
202 #define ISASCII(c) (((EMACS_UINT) (c)) < 0x100 && ISASCII_1 (c))
203 #else 240 #else
204 #define ISASCII(c) ISASCII_1 (c) 241 # define ISGRAPH(c) (isprint (c) && !isspace (c))
205 #endif /* MULE */ 242 #endif
206 243
207 #ifdef isblank 244 /* Solaris defines ISPRINT so we must undefine it first. */
208 #define ISBLANK(c) (ISASCII (c) && isblank (c)) 245 #undef ISPRINT
246 #define ISPRINT(c) isprint (c)
247 #define ISDIGIT(c) isdigit (c)
248 #define ISALNUM(c) isalnum (c)
249 #define ISALPHA(c) isalpha (c)
250 #define ISCNTRL(c) iscntrl (c)
251 #define ISLOWER(c) islower (c)
252 #define ISPUNCT(c) ispunct (c)
253 #define ISSPACE(c) isspace (c)
254 #define ISUPPER(c) isupper (c)
255 #define ISXDIGIT(c) isxdigit (c)
256
257 #define ISWORD(c) ISALPHA (c)
258
259 #ifdef _tolower
260 # define TOLOWER(c) _tolower (c)
209 #else 261 #else
210 #define ISBLANK(c) ((c) == ' ' || (c) == '\t') 262 # define TOLOWER(c) tolower (c)
211 #endif 263 #endif
212 #ifdef isgraph 264
213 #define ISGRAPH(c) (ISASCII (c) && isgraph (c)) 265 #endif /* emacs */
214 #else
215 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
216 #endif
217
218 #define ISPRINT(c) (ISASCII (c) && isprint (c))
219 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
220 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
221 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
222 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
223 #define ISLOWER(c) (ISASCII (c) && islower (c))
224 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
225 #define ISSPACE(c) (ISASCII (c) && isspace (c))
226 #define ISUPPER(c) (ISASCII (c) && isupper (c))
227 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
228 266
229 #ifndef NULL 267 #ifndef NULL
230 #define NULL (void *)0 268 #define NULL (void *)0
231 #endif 269 #endif
232 270
911 { 949 {
912 int nentries, i; 950 int nentries, i;
913 951
914 printf ("/charset_mule [%s", 952 printf ("/charset_mule [%s",
915 (re_opcode_t) *(p - 1) == charset_mule_not ? "^" : ""); 953 (re_opcode_t) *(p - 1) == charset_mule_not ? "^" : "");
954 printf (" flags: 0x%02x ", *p++);
916 nentries = unified_range_table_nentries (p); 955 nentries = unified_range_table_nentries (p);
917 for (i = 0; i < nentries; i++) 956 for (i = 0; i < nentries; i++)
918 { 957 {
919 EMACS_INT first, last; 958 EMACS_INT first, last;
920 Lisp_Object dummy_val; 959 Lisp_Object dummy_val;
921 960
922 unified_range_table_get_range (p, i, &first, &last, 961 unified_range_table_get_range (p, i, &first, &last,
923 &dummy_val); 962 &dummy_val);
924 if (first < 0x100) 963 if (first < 0x80)
925 putchar (first); 964 putchar (first);
926 else 965 else
927 printf ("(0x%lx)", (long)first); 966 printf ("(0x%lx)", (long)first);
928 if (first != last) 967 if (first != last)
929 { 968 {
930 putchar ('-'); 969 putchar ('-');
931 if (last < 0x100) 970 if (last < 0x80)
932 putchar (last); 971 putchar (last);
933 else 972 else
934 printf ("(0x%lx)", (long)last); 973 printf ("(0x%lx)", (long)last);
935 } 974 }
936 } 975 }
1972 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) 2011 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1973 2012
1974 /* The next available element. */ 2013 /* The next available element. */
1975 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) 2014 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1976 2015
/* Flag bits that implement the multibyte part of the various character
   classes (such as [:alnum:]) for a charset that is stored as a range
   table.  Each class that cannot be expressed as plain ranges gets one
   bit.  XEmacs: these are enumerators rather than #defines so that they
   are visible in the debugger.  */
enum
{
  BIT_WORD  = 0x01,
  BIT_LOWER = 0x02,
  BIT_PUNCT = 0x04,
  BIT_SPACE = 0x08,
  BIT_UPPER = 0x10,
  /* XEmacs: needed because treatment of ASCII and non-ASCII (possible
     matches) is unified in charset_mule.  [:alpha:] matches all
     characters with word syntax, with the exception of [0-9].  There is
     no need for a BIT_MULTIBYTE.  */
  BIT_ALPHA = 0x20
};
1977 2032
1978 /* Set the bit for character C in a bit vector. */ 2033 /* Set the bit for character C in a bit vector. */
1979 #define SET_LIST_BIT(c) \ 2034 #define SET_LIST_BIT(c) \
1980 (buf_end[((unsigned char) (c)) / BYTEWIDTH] \ 2035 (buf_end[((unsigned char) (c)) / BYTEWIDTH] \
1981 |= 1 << (((unsigned char) c) % BYTEWIDTH)) 2036 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1983 #ifdef MULE 2038 #ifdef MULE
1984 2039
1985 /* Set the "bit" for character C in a range table. */ 2040 /* Set the "bit" for character C in a range table. */
1986 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt) 2041 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt)
1987 2042
1988 /* Set the "bit" for character c in the appropriate table. */ 2043 #endif
1989 #define SET_EITHER_BIT(c) \
1990 do { \
1991 if (has_extended_chars) \
1992 SET_RANGETAB_BIT (c); \
1993 else \
1994 SET_LIST_BIT (c); \
1995 } while (0)
1996
1997 #else /* not MULE */
1998
1999 #define SET_EITHER_BIT(c) SET_LIST_BIT (c)
2000
2001 #endif
2002
2003 2044
2004 /* Get the next unsigned number in the uncompiled pattern. */ 2045 /* Get the next unsigned number in the uncompiled pattern. */
2005 #define GET_UNSIGNED_NUMBER(num) \ 2046 #define GET_UNSIGNED_NUMBER(num) \
2006 { if (p != pend) \ 2047 { if (p != pend) \
2007 { \ 2048 { \
2016 PATFETCH (c); \ 2057 PATFETCH (c); \
2017 } \ 2058 } \
2018 } \ 2059 } \
2019 } 2060 }
2020 2061
2021 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ 2062 #define CHAR_CLASS_MAX_LENGTH 9 /* Namely, `multibyte'. */
2022 2063
2023 #define IS_CHAR_CLASS(string) \ 2064 /* Map a string to the char class it names (if any). */
2024 (STREQ (string, "alpha") || STREQ (string, "upper") \ 2065 static re_wctype_t
2025 || STREQ (string, "lower") || STREQ (string, "digit") \ 2066 re_wctype (const char *string)
2026 || STREQ (string, "alnum") || STREQ (string, "xdigit") \ 2067 {
2027 || STREQ (string, "space") || STREQ (string, "print") \ 2068 if (STREQ (string, "alnum")) return RECC_ALNUM;
2028 || STREQ (string, "punct") || STREQ (string, "graph") \ 2069 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2029 || STREQ (string, "cntrl") || STREQ (string, "blank")) 2070 else if (STREQ (string, "word")) return RECC_WORD;
2071 else if (STREQ (string, "ascii")) return RECC_ASCII;
2072 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2073 else if (STREQ (string, "graph")) return RECC_GRAPH;
2074 else if (STREQ (string, "lower")) return RECC_LOWER;
2075 else if (STREQ (string, "print")) return RECC_PRINT;
2076 else if (STREQ (string, "punct")) return RECC_PUNCT;
2077 else if (STREQ (string, "space")) return RECC_SPACE;
2078 else if (STREQ (string, "upper")) return RECC_UPPER;
2079 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2080 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2081 else if (STREQ (string, "digit")) return RECC_DIGIT;
2082 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2083 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2084 else if (STREQ (string, "blank")) return RECC_BLANK;
2085 else return RECC_ERROR;
2086 }
2087
2088 /* True if CH is in the char class CC. */
2089 static re_bool
2090 re_iswctype (int ch, re_wctype_t cc)
2091 {
2092 #ifdef emacs
2093 /* This is cheesy, lispbuf isn't available to us when compiling the
2094 pattern. It's effectively only called (on Mule builds) when the current
2095 buffer doesn't matter (e.g. for RECC_ASCII, RECC_CNTRL), so it's not a
2096 big deal. */
2097 struct buffer *lispbuf = current_buffer;
2098 #endif
2099
2100 switch (cc)
2101 {
2102 case RECC_ALNUM: return ISALNUM (ch) != 0;
2103 case RECC_ALPHA: return ISALPHA (ch) != 0;
2104 case RECC_BLANK: return ISBLANK (ch) != 0;
2105 case RECC_CNTRL: return ISCNTRL (ch) != 0;
2106 case RECC_DIGIT: return ISDIGIT (ch) != 0;
2107 case RECC_GRAPH: return ISGRAPH (ch) != 0;
2108 case RECC_LOWER: return ISLOWER (ch) != 0;
2109 case RECC_PRINT: return ISPRINT (ch) != 0;
2110 case RECC_PUNCT: return ISPUNCT (ch) != 0;
2111 case RECC_SPACE: return ISSPACE (ch) != 0;
2112 case RECC_UPPER: return ISUPPER (ch) != 0;
2113 case RECC_XDIGIT: return ISXDIGIT (ch) != 0;
2114 case RECC_ASCII: return ISASCII (ch) != 0;
2115 case RECC_NONASCII: case RECC_MULTIBYTE: return !ISASCII (ch);
2116 case RECC_UNIBYTE: return ISUNIBYTE (ch) != 0;
2117 case RECC_WORD: return ISWORD (ch) != 0;
2118 case RECC_ERROR: return false;
2119 default:
2120 abort ();
2121 }
2122 }
2123
2124 #ifdef MULE
2125
2126 static re_bool
2127 re_wctype_can_match_non_ascii (re_wctype_t cc)
2128 {
2129 switch (cc)
2130 {
2131 case RECC_ASCII:
2132 case RECC_UNIBYTE:
2133 case RECC_CNTRL:
2134 case RECC_DIGIT:
2135 case RECC_XDIGIT:
2136 case RECC_BLANK:
2137 return false;
2138 default:
2139 return true;
2140 }
2141 }
2142
2143 /* Return a bit-pattern to use in the range-table bits to match multibyte
2144 chars of class CC. */
2145 static unsigned char
2146 re_wctype_to_bit (re_wctype_t cc)
2147 {
2148 switch (cc)
2149 {
2150 case RECC_PRINT: case RECC_GRAPH:
2151 case RECC_ALPHA: return BIT_ALPHA;
2152 case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2153 case RECC_LOWER: return BIT_LOWER;
2154 case RECC_UPPER: return BIT_UPPER;
2155 case RECC_PUNCT: return BIT_PUNCT;
2156 case RECC_SPACE: return BIT_SPACE;
2157 case RECC_MULTIBYTE: case RECC_NONASCII:
2158 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
2159 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2160 default:
2161 abort ();
2162 }
2163 }
2164
2165 #endif /* MULE */
2030 2166
2031 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg); 2167 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
2032 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2); 2168 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
2033 static void insert_op1 (re_opcode_t op, unsigned char *loc, int arg, 2169 static void insert_op1 (re_opcode_t op, unsigned char *loc, int arg,
2034 unsigned char *end); 2170 unsigned char *end);
2047 static reg_errcode_t compile_extended_range (re_char **p_ptr, 2183 static reg_errcode_t compile_extended_range (re_char **p_ptr,
2048 re_char *pend, 2184 re_char *pend,
2049 RE_TRANSLATE_TYPE translate, 2185 RE_TRANSLATE_TYPE translate,
2050 reg_syntax_t syntax, 2186 reg_syntax_t syntax,
2051 Lisp_Object rtab); 2187 Lisp_Object rtab);
2188 static reg_errcode_t compile_char_class (re_wctype_t cc, Lisp_Object rtab,
2189 Bitbyte *flags_out);
2052 #endif /* MULE */ 2190 #endif /* MULE */
2053 static re_bool group_match_null_string_p (unsigned char **p, 2191 static re_bool group_match_null_string_p (unsigned char **p,
2054 unsigned char *end, 2192 unsigned char *end,
2055 register_info_type *reg_info); 2193 register_info_type *reg_info);
2056 static re_bool alt_match_null_string_p (unsigned char *p, unsigned char *end, 2194 static re_bool alt_match_null_string_p (unsigned char *p, unsigned char *end,
2510 case '.': 2648 case '.':
2511 laststart = buf_end; 2649 laststart = buf_end;
2512 BUF_PUSH (anychar); 2650 BUF_PUSH (anychar);
2513 break; 2651 break;
2514 2652
/* MAYBE_START_OVER_WITH_EXTENDED (ch): on MULE builds, if CH is 0x80 or
   above, jump to the `start_over_with_extended' label, which must exist
   in the function using the macro.  On non-MULE builds it is a no-op
   and CH is not evaluated.

   The body is wrapped in do { ... } while (0) so that the macro is
   exactly one statement and stays correct under an unbraced if/else;
   the argument is parenthesized so that any expression may be passed.  */
#ifdef MULE
#define MAYBE_START_OVER_WITH_EXTENDED(ch)      \
  do                                            \
    {                                           \
      if ((ch) >= 0x80)                         \
        goto start_over_with_extended;          \
    }                                           \
  while (0)
#else
#define MAYBE_START_OVER_WITH_EXTENDED(ch) ((void) 0)
#endif
2515 2662
2516 case '[': 2663 case '[':
2517 { 2664 {
2518 /* XEmacs change: this whole section */ 2665 /* XEmacs change: this whole section */
2519 re_bool had_char_class = false; 2666 re_bool had_char_class = false;
2520 #ifdef MULE
2521 re_bool has_extended_chars = false;
2522 REGISTER Lisp_Object rtab = Qnil;
2523 #endif
2524 2667
2525 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2668 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2526 2669
2527 /* Ensure that we have enough space to push a charset: the 2670 /* Ensure that we have enough space to push a charset: the
2528 opcode, the length count, and the bitset; 34 bytes in all. */ 2671 opcode, the length count, and the bitset; 34 bytes in all. */
2548 /* charset_not matches newline according to a syntax bit. */ 2691 /* charset_not matches newline according to a syntax bit. */
2549 if ((re_opcode_t) buf_end[-2] == charset_not 2692 if ((re_opcode_t) buf_end[-2] == charset_not
2550 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) 2693 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2551 SET_LIST_BIT ('\n'); 2694 SET_LIST_BIT ('\n');
2552 2695
2553 #ifdef MULE
2554 start_over_with_extended:
2555 if (has_extended_chars)
2556 {
2557 /* There are extended chars here, which means we need to start
2558 over and shift to unified range-table format. */
2559 if (buf_end[-2] == charset)
2560 buf_end[-2] = charset_mule;
2561 else
2562 buf_end[-2] = charset_mule_not;
2563 buf_end--;
2564 p = p1; /* go back to the beginning of the charset, after
2565 a possible ^. */
2566 rtab = Vthe_lisp_rangetab;
2567 Fclear_range_table (rtab);
2568
2569 /* charset_not matches newline according to a syntax bit. */
2570 if ((re_opcode_t) buf_end[-1] == charset_mule_not
2571 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2572 SET_EITHER_BIT ('\n');
2573 }
2574 #endif /* MULE */
2575
2576 /* Read in characters and ranges, setting map bits. */ 2696 /* Read in characters and ranges, setting map bits. */
2577 for (;;) 2697 for (;;)
2578 { 2698 {
2579 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2699 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2580 2700
2581 PATFETCH (c); 2701 PATFETCH (c);
2582 2702
2583 #ifdef MULE 2703 /* Frumble-bumble, we may have found some extended chars.
2584 if (c >= 0x80 && !has_extended_chars) 2704 Need to start over, process everything using the general
2585 { 2705 extended-char mechanism, and need to use charset_mule and
2586 has_extended_chars = 1; 2706 charset_mule_not instead of charset and charset_not. */
2587 /* Frumble-bumble, we've found some extended chars. 2707 MAYBE_START_OVER_WITH_EXTENDED (c);
2588 Need to start over, process everything using 2708
2589 the general extended-char mechanism, and need
2590 to use charset_mule and charset_mule_not instead
2591 of charset and charset_not. */
2592 goto start_over_with_extended;
2593 }
2594 #endif /* MULE */
2595 /* \ might escape characters inside [...] and [^...]. */ 2709 /* \ might escape characters inside [...] and [^...]. */
2596 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') 2710 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2597 { 2711 {
2598 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 2712 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2599 2713
2600 PATFETCH (c1); 2714 PATFETCH (c1);
2601 #ifdef MULE 2715
2602 if (c1 >= 0x80 && !has_extended_chars) 2716 MAYBE_START_OVER_WITH_EXTENDED (c1);
2603 { 2717
2604 has_extended_chars = 1; 2718 SET_LIST_BIT (c1);
2605 goto start_over_with_extended;
2606 }
2607 #endif /* MULE */
2608 SET_EITHER_BIT (c1);
2609 continue; 2719 continue;
2610 } 2720 }
2611 2721
2612 /* Could be the end of the bracket expression. If it's 2722 /* Could be the end of the bracket expression. If it's
2613 not (i.e., when the bracket expression is `[]' so 2723 not (i.e., when the bracket expression is `[]' so
2629 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') 2739 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2630 && *p != ']') 2740 && *p != ']')
2631 { 2741 {
2632 reg_errcode_t ret; 2742 reg_errcode_t ret;
2633 2743
2634 #ifdef MULE 2744 MAYBE_START_OVER_WITH_EXTENDED (*(unsigned char *)p);
2635 if (* (unsigned char *) p >= 0x80 && !has_extended_chars) 2745
2636 { 2746 ret = compile_range (&p, pend, translate, syntax,
2637 has_extended_chars = 1; 2747 buf_end);
2638 goto start_over_with_extended; 2748
2639 }
2640 if (has_extended_chars)
2641 ret = compile_extended_range (&p, pend, translate,
2642 syntax, rtab);
2643 else
2644 #endif /* MULE */
2645 ret = compile_range (&p, pend, translate, syntax, buf_end);
2646 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); 2749 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2647 } 2750 }
2648 2751
2649 else if (p[0] == '-' && p[1] != ']') 2752 else if (p[0] == '-' && p[1] != ']')
2650 { /* This handles ranges made up of characters only. */ 2753 { /* This handles ranges made up of characters only. */
2651 reg_errcode_t ret; 2754 reg_errcode_t ret;
2652 2755
2653 /* Move past the `-'. */ 2756 /* Move past the `-'. */
2654 PATFETCH (c1); 2757 PATFETCH (c1);
2655 2758
2656 #ifdef MULE 2759 MAYBE_START_OVER_WITH_EXTENDED (*(unsigned char *)p);
2657 if (* (unsigned char *) p >= 0x80 && !has_extended_chars) 2760
2658 { 2761 ret = compile_range (&p, pend, translate, syntax, buf_end);
2659 has_extended_chars = 1; 2762
2660 goto start_over_with_extended;
2661 }
2662 if (has_extended_chars)
2663 ret = compile_extended_range (&p, pend, translate,
2664 syntax, rtab);
2665 else
2666 #endif /* MULE */
2667 ret = compile_range (&p, pend, translate, syntax, buf_end);
2668 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); 2763 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2669 } 2764 }
2670 2765
2671 /* See if we're at the beginning of a possible character 2766 /* See if we're at the beginning of a possible character
2672 class. */ 2767 class. */
2673 2768
2674 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') 2769 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2675 { /* Leave room for the null. */ 2770 { /* Leave room for the null. */
2676 char str[CHAR_CLASS_MAX_LENGTH + 1]; 2771 char str[CHAR_CLASS_MAX_LENGTH + 1];
2772 int ch = 0;
2677 2773
2678 PATFETCH (c); 2774 PATFETCH (c);
2679 c1 = 0; 2775 c1 = 0;
2680 2776
2681 /* If pattern is `[[:'. */ 2777 /* If pattern is `[[:'. */
2682 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2778 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2683 2779
2684 for (;;) 2780 for (;;)
2685 { 2781 {
2686 /* #### This code is unused. 2782 PATFETCH (c);
2687 Correctness is not checked after TRT 2783 if ((c == ':' && *p == ']') || p == pend)
2688 table change. */ 2784 break;
2689 PATFETCH (c); 2785 if (c1 < CHAR_CLASS_MAX_LENGTH)
2690 if (c == ':' || c == ']' || p == pend 2786 str[c1++] = c;
2691 || c1 == CHAR_CLASS_MAX_LENGTH) 2787 else
2692 break; 2788 /* This is in any case an invalid class name. */
2693 str[c1++] = (char) c; 2789 str[0] = '\0';
2694 } 2790 }
2695 str[c1] = '\0'; 2791 str[c1] = '\0';
2696 2792
2697 /* If isn't a word bracketed by `[:' and `:]': 2793 /* If isn't a word bracketed by `[:' and `:]':
2698 undo the ending character, the letters, and leave 2794 undo the ending character, the letters, and leave
2699 the leading `:' and `[' (but set bits for them). */ 2795 the leading `:' and `[' (but set bits for them). */
2700 if (c == ':' && *p == ']') 2796 if (c == ':' && *p == ']')
2701 { 2797 {
2702 int ch; 2798 re_wctype_t cc = re_wctype (str);
2703 re_bool is_alnum = STREQ (str, "alnum"); 2799
2704 re_bool is_alpha = STREQ (str, "alpha"); 2800 if (cc == RECC_ERROR)
2705 re_bool is_blank = STREQ (str, "blank");
2706 re_bool is_cntrl = STREQ (str, "cntrl");
2707 re_bool is_digit = STREQ (str, "digit");
2708 re_bool is_graph = STREQ (str, "graph");
2709 re_bool is_lower = STREQ (str, "lower");
2710 re_bool is_print = STREQ (str, "print");
2711 re_bool is_punct = STREQ (str, "punct");
2712 re_bool is_space = STREQ (str, "space");
2713 re_bool is_upper = STREQ (str, "upper");
2714 re_bool is_xdigit = STREQ (str, "xdigit");
2715
2716 if (!IS_CHAR_CLASS (str))
2717 FREE_STACK_RETURN (REG_ECTYPE); 2801 FREE_STACK_RETURN (REG_ECTYPE);
2718 2802
2719 /* Throw away the ] at the end of the character 2803 /* Throw away the ] at the end of the character
2720 class. */ 2804 class. */
2721 PATFETCH (c); 2805 PATFETCH (c);
2722 2806
2723 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2807 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2724 2808
2725 for (ch = 0; ch < 1 << BYTEWIDTH; ch++) 2809 #ifdef MULE
2726 { 2810 if (re_wctype_can_match_non_ascii (cc))
2727 /* This was split into 3 if's to 2811 {
2728 avoid an arbitrary limit in some compiler. */ 2812 goto start_over_with_extended;
2729 if ( (is_alnum && ISALNUM (ch)) 2813 }
2730 || (is_alpha && ISALPHA (ch)) 2814 #endif /* MULE */
2731 || (is_blank && ISBLANK (ch)) 2815 for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
2732 || (is_cntrl && ISCNTRL (ch))) 2816 {
2733 SET_EITHER_BIT (ch); 2817 if (re_iswctype (ch, cc))
2734 if ( (is_digit && ISDIGIT (ch)) 2818 {
2735 || (is_graph && ISGRAPH (ch)) 2819 SET_LIST_BIT (ch);
2736 || (is_lower && ISLOWER (ch)) 2820 }
2737 || (is_print && ISPRINT (ch))) 2821 }
2738 SET_EITHER_BIT (ch); 2822
2739 if ( (is_punct && ISPUNCT (ch))
2740 || (is_space && ISSPACE (ch))
2741 || (is_upper && ISUPPER (ch))
2742 || (is_xdigit && ISXDIGIT (ch)))
2743 SET_EITHER_BIT (ch);
2744 }
2745 had_char_class = true; 2823 had_char_class = true;
2746 } 2824 }
2747 else 2825 else
2748 { 2826 {
2749 c1++; 2827 c1++;
2750 while (c1--) 2828 while (c1--)
2751 PATUNFETCH; 2829 PATUNFETCH;
2752 SET_EITHER_BIT ('['); 2830 SET_LIST_BIT ('[');
2753 SET_EITHER_BIT (':'); 2831 SET_LIST_BIT (':');
2754 had_char_class = false; 2832 had_char_class = false;
2755 } 2833 }
2756 } 2834 }
2757 else 2835 else
2758 { 2836 {
2759 had_char_class = false; 2837 had_char_class = false;
2760 SET_EITHER_BIT (c); 2838 SET_LIST_BIT (c);
2761 } 2839 }
2762 } 2840 }
2763 2841
2764 #ifdef MULE
2765 if (has_extended_chars)
2766 {
2767 /* We have a range table, not a bit vector. */
2768 int bytes_needed =
2769 unified_range_table_bytes_needed (rtab);
2770 GET_BUFFER_SPACE (bytes_needed);
2771 unified_range_table_copy_data (rtab, buf_end);
2772 buf_end += unified_range_table_bytes_used (buf_end);
2773 break;
2774 }
2775 #endif /* MULE */
2776 /* Discard any (non)matching list bytes that are all 0 at the 2842 /* Discard any (non)matching list bytes that are all 0 at the
2777 end of the map. Decrease the map-length byte too. */ 2843 end of the map. Decrease the map-length byte too. */
2778 while ((int) buf_end[-1] > 0 && buf_end[buf_end[-1] - 1] == 0) 2844 while ((int) buf_end[-1] > 0 && buf_end[buf_end[-1] - 1] == 0)
2779 buf_end[-1]--; 2845 buf_end[-1]--;
2780 buf_end += buf_end[-1]; 2846 buf_end += buf_end[-1];
2781 } 2847 }
2782 break; 2848 break;
2783 2849
2850 #ifdef MULE
2851 start_over_with_extended:
2852 {
2853 REGISTER Lisp_Object rtab = Qnil;
2854 Bitbyte flags = 0;
2855 int bytes_needed = sizeof (flags);
2856 re_bool had_char_class = false;
2857
2858 /* There are extended chars here, which means we need to use the
2859 unified range-table format. */
2860 if (buf_end[-2] == charset)
2861 buf_end[-2] = charset_mule;
2862 else
2863 buf_end[-2] = charset_mule_not;
2864 buf_end--;
2865 p = p1; /* go back to the beginning of the charset, after
2866 a possible ^. */
2867 rtab = Vthe_lisp_rangetab;
2868 Fclear_range_table (rtab);
2869
2870 /* charset_not matches newline according to a syntax bit. */
2871 if ((re_opcode_t) buf_end[-1] == charset_mule_not
2872 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2873 SET_RANGETAB_BIT ('\n');
2874
2875 /* Read in characters and ranges, setting map bits. */
2876 for (;;)
2877 {
2878 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2879
2880 PATFETCH (c);
2881
2882 /* \ might escape characters inside [...] and [^...]. */
2883 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2884 {
2885 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2886
2887 PATFETCH (c1);
2888
2889 SET_RANGETAB_BIT (c1);
2890 continue;
2891 }
2892
2893 /* Could be the end of the bracket expression. If it's
2894 not (i.e., when the bracket expression is `[]' so
2895 far), the ']' character bit gets set way below. */
2896 if (c == ']' && p != p1 + 1)
2897 break;
2898
2899 /* Look ahead to see if it's a range when the last thing
2900 was a character class. */
2901 if (had_char_class && c == '-' && *p != ']')
2902 FREE_STACK_RETURN (REG_ERANGE);
2903
2904 /* Look ahead to see if it's a range when the last thing
2905 was a character: if this is a hyphen not at the
2906 beginning or the end of a list, then it's the range
2907 operator. */
2908 if (c == '-'
2909 && !(p - 2 >= pattern && p[-2] == '[')
2910 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2911 && *p != ']')
2912 {
2913 reg_errcode_t ret;
2914
2915 ret = compile_extended_range (&p, pend, translate, syntax,
2916 rtab);
2917
2918 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2919 }
2920
2921 else if (p[0] == '-' && p[1] != ']')
2922 { /* This handles ranges made up of characters only. */
2923 reg_errcode_t ret;
2924
2925 /* Move past the `-'. */
2926 PATFETCH (c1);
2927
2928 ret = compile_extended_range (&p, pend, translate,
2929 syntax, rtab);
2930 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2931 }
2932
2933 /* See if we're at the beginning of a possible character
2934 class. */
2935
2936 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2937 { /* Leave room for the null. */
2938 char str[CHAR_CLASS_MAX_LENGTH + 1];
2939
2940 PATFETCH (c);
2941 c1 = 0;
2942
2943 /* If pattern is `[[:'. */
2944 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2945
2946 for (;;)
2947 {
2948 PATFETCH (c);
2949 if ((c == ':' && *p == ']') || p == pend)
2950 break;
2951 if (c1 < CHAR_CLASS_MAX_LENGTH)
2952 str[c1++] = c;
2953 else
2954 /* This is in any case an invalid class name. */
2955 str[0] = '\0';
2956 }
2957 str[c1] = '\0';
2958
2959 /* If isn't a word bracketed by `[:' and `:]':
2960 undo the ending character, the letters, and leave
2961 the leading `:' and `[' (but set bits for them). */
2962 if (c == ':' && *p == ']')
2963 {
2964 re_wctype_t cc = re_wctype (str);
2965 reg_errcode_t ret = REG_NOERROR;
2966
2967 if (cc == RECC_ERROR)
2968 FREE_STACK_RETURN (REG_ECTYPE);
2969
2970 /* Throw away the ] at the end of the character
2971 class. */
2972 PATFETCH (c);
2973
2974 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2975
2976 ret = compile_char_class (cc, rtab, &flags);
2977
2978 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2979
2980 had_char_class = true;
2981 }
2982 else
2983 {
2984 c1++;
2985 while (c1--)
2986 PATUNFETCH;
2987 SET_RANGETAB_BIT ('[');
2988 SET_RANGETAB_BIT (':');
2989 had_char_class = false;
2990 }
2991 }
2992 else
2993 {
2994 had_char_class = false;
2995 SET_RANGETAB_BIT (c);
2996 }
2997 }
2998
2999 bytes_needed += unified_range_table_bytes_needed (rtab);
3000 GET_BUFFER_SPACE (bytes_needed);
3001 *buf_end++ = flags;
3002 unified_range_table_copy_data (rtab, buf_end);
3003 buf_end += unified_range_table_bytes_used (buf_end);
3004 break;
3005 }
3006 #endif /* MULE */
2784 3007
2785 case '(': 3008 case '(':
2786 if (syntax & RE_NO_BK_PARENS) 3009 if (syntax & RE_NO_BK_PARENS)
2787 goto handle_open; 3010 goto handle_open;
2788 else 3011 else
3713 put_range_table (rtab, this_char, range_end, Qt); 3936 put_range_table (rtab, this_char, range_end, Qt);
3714 3937
3715 return REG_NOERROR; 3938 return REG_NOERROR;
3716 } 3939 }
3717 3940
3941 static reg_errcode_t
3942 compile_char_class (re_wctype_t cc, Lisp_Object rtab, Bitbyte *flags_out)
3943 {
3944 *flags_out |= re_wctype_to_bit (cc);
3945
3946 switch (cc)
3947 {
3948 case RECC_ASCII:
3949 put_range_table (rtab, 0, 0x7f, Qt);
3950 break;
3951
3952 case RECC_XDIGIT:
3953 put_range_table (rtab, 'a', 'f', Qt);
3954 put_range_table (rtab, 'A', 'f', Qt);
3955 /* fallthrough */
3956 case RECC_DIGIT:
3957 put_range_table (rtab, '0', '9', Qt);
3958 break;
3959
3960 case RECC_BLANK:
3961 put_range_table (rtab, ' ', ' ', Qt);
3962 put_range_table (rtab, '\t', '\t', Qt);
3963 break;
3964
3965 case RECC_PRINT:
3966 put_range_table (rtab, ' ', 0x7e, Qt);
3967 put_range_table (rtab, 0x80, MOST_POSITIVE_FIXNUM, Qt);
3968 break;
3969
3970 case RECC_GRAPH:
3971 put_range_table (rtab, '!', 0x7e, Qt);
3972 put_range_table (rtab, 0x80, MOST_POSITIVE_FIXNUM, Qt);
3973 break;
3974
3975 case RECC_NONASCII:
3976 case RECC_MULTIBYTE:
3977 put_range_table (rtab, 0x80, MOST_POSITIVE_FIXNUM, Qt);
3978 break;
3979
3980 case RECC_CNTRL:
3981 put_range_table (rtab, 0x00, 0x1f, Qt);
3982 break;
3983
3984 case RECC_UNIBYTE:
3985 /* Never true in XEmacs. */
3986 break;
3987
3988 /* The following all have their own bits in the class_bits argument to
3989 charset_mule and charset_mule_not, they don't use the range table
3990 information. */
3991 case RECC_ALPHA:
3992 case RECC_WORD:
3993 case RECC_ALNUM: /* Equivalent to RECC_WORD */
3994 case RECC_LOWER:
3995 case RECC_PUNCT:
3996 case RECC_SPACE:
3997 case RECC_UPPER:
3998 break;
3999 }
4000
4001 return REG_NOERROR;
4002 }
4003
3718 #endif /* MULE */ 4004 #endif /* MULE */
3719 4005
3720 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in 4006 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3721 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible 4007 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3722 characters can start a string that matches the pattern. This fastmap 4008 characters can start a string that matches the pattern. This fastmap
3853 #ifdef MULE 4139 #ifdef MULE
3854 case charset_mule: 4140 case charset_mule:
3855 { 4141 {
3856 int nentries; 4142 int nentries;
3857 int i; 4143 int i;
4144 Bitbyte flags = *p++;
4145
4146 if (flags)
4147 {
4148 /* We need to consult the syntax table, fastmap won't
4149 work. */
4150 bufp->can_be_null = 1;
4151 goto done;
4152 }
3858 4153
3859 nentries = unified_range_table_nentries (p); 4154 nentries = unified_range_table_nentries (p);
3860 for (i = 0; i < nentries; i++) 4155 for (i = 0; i < nentries; i++)
3861 { 4156 {
3862 EMACS_INT first, last; 4157 EMACS_INT first, last;
3876 if (last < 0x100) 4171 if (last < 0x100)
3877 { 4172 {
3878 set_itext_ichar (strr, last); 4173 set_itext_ichar (strr, last);
3879 fastmap[*strr] = 1; 4174 fastmap[*strr] = 1;
3880 } 4175 }
4176 else if (MOST_POSITIVE_FIXNUM == last)
4177 {
4178 /* This is RECC_MULTIBYTE or RECC_NONASCII; true for all
4179 non-ASCII characters. */
4180 jj = 0x80;
4181 while (jj < 0xA0)
4182 {
4183 fastmap[jj++] = 1;
4184 }
4185 }
3881 } 4186 }
3882 } 4187 }
3883 break; 4188 break;
3884 4189
3885 case charset_mule_not: 4190 case charset_mule_not:
3886 { 4191 {
3887 int nentries; 4192 int nentries;
3888 int i; 4193 int i;
3889 int smallest_prev = 0; 4194 int smallest_prev = 0;
4195 Bitbyte flags = *p++;
4196
4197 if (flags)
4198 {
4199 /* We need to consult the syntax table, fastmap won't
4200 work. */
4201 bufp->can_be_null = 1;
4202 goto done;
4203 }
3890 4204
3891 nentries = unified_range_table_nentries (p); 4205 nentries = unified_range_table_nentries (p);
3892 for (i = 0; i < nentries; i++) 4206 for (i = 0; i < nentries; i++)
3893 { 4207 {
3894 EMACS_INT first, last; 4208 EMACS_INT first, last;
5414 case charset_mule: 5728 case charset_mule:
5415 case charset_mule_not: 5729 case charset_mule_not:
5416 { 5730 {
5417 REGISTER Ichar c; 5731 REGISTER Ichar c;
5418 re_bool not_p = (re_opcode_t) *(p - 1) == charset_mule_not; 5732 re_bool not_p = (re_opcode_t) *(p - 1) == charset_mule_not;
5733 Bitbyte class_bits = *p++;
5419 5734
5420 DEBUG_MATCH_PRINT2 ("EXECUTING charset_mule%s.\n", not_p ? "_not" : ""); 5735 DEBUG_MATCH_PRINT2 ("EXECUTING charset_mule%s.\n", not_p ? "_not" : "");
5421
5422 REGEX_PREFETCH (); 5736 REGEX_PREFETCH ();
5423 c = itext_ichar_fmt (d, fmt, lispobj); 5737 c = itext_ichar_fmt (d, fmt, lispobj);
5424 c = RE_TRANSLATE (c); /* The character to match. */ 5738 c = RE_TRANSLATE (c); /* The character to match. */
5425 5739
5426 if (EQ (Qt, unified_range_table_lookup (p, c, Qnil))) 5740 if ((class_bits &&
5427 not_p = !not_p; 5741 ((class_bits & BIT_ALPHA && ISALPHA (c))
5742 || (class_bits & BIT_SPACE && ISSPACE (c))
5743 || (class_bits & BIT_PUNCT && ISPUNCT (c))
5744 || (class_bits & BIT_WORD && ISWORD (c))
5745 || (TRANSLATE_P (translate) ?
5746 (class_bits & (BIT_UPPER | BIT_LOWER)
5747 && !NOCASEP (lispbuf, c))
5748 : ((class_bits & BIT_UPPER && ISUPPER (c))
5749 || (class_bits & BIT_LOWER && ISLOWER (c))))))
5750 || EQ (Qt, unified_range_table_lookup (p, c, Qnil)))
5751 {
5752 not_p = !not_p;
5753 }
5428 5754
5429 p += unified_range_table_bytes_used (p); 5755 p += unified_range_table_bytes_used (p);
5430 5756
5431 if (!not_p) goto fail; 5757 if (!not_p) goto fail;
5432 5758