comparison src/regex.c @ 70:131b0175ea99 r20-0b30

Import from CVS: tag r20-0b30
author cvs
date Mon, 13 Aug 2007 09:02:59 +0200
parents 441bb1e64a06
children 6a378aca36af
comparison
equal deleted inserted replaced
69:804d1389bcd6 70:131b0175ea99
29 (1) the REGEX_BEGLINE_CHECK code from the XEmacs v18 regex routines 29 (1) the REGEX_BEGLINE_CHECK code from the XEmacs v18 regex routines
30 was added. This causes a huge speedup in font-locking. 30 was added. This causes a huge speedup in font-locking.
31 (2) Rel-alloc is disabled when the MMAP version of rel-alloc is 31 (2) Rel-alloc is disabled when the MMAP version of rel-alloc is
32 being used, because it's too slow -- all those calls to mmap() 32 being used, because it's too slow -- all those calls to mmap()
33 add humongous overhead. 33 add humongous overhead.
34 (3) Lots and lots of changes for Mule. They are bracketed by
35 `#ifdef MULE' or with comments that have `XEmacs' in them.
34 */ 36 */
35 37
36 /* AIX requires this to be the first thing in the file. */ 38 /* AIX requires this to be the first thing in the file. */
37 #if defined (_AIX) && !defined (REGEX_MALLOC) 39 #if defined (_AIX) && !defined (REGEX_MALLOC)
38 #pragma alloca 40 #pragma alloca
40 42
41 #define _GNU_SOURCE 43 #define _GNU_SOURCE
42 44
43 #ifdef HAVE_CONFIG_H 45 #ifdef HAVE_CONFIG_H
44 #include <config.h> 46 #include <config.h>
47 #endif
48
49 /* We assume non-Mule if emacs isn't defined. */
50 #ifndef emacs
51 #undef MULE
45 #endif 52 #endif
46 53
47 /* We need this for `regex.h', and perhaps for the Emacs include files. */ 54 /* We need this for `regex.h', and perhaps for the Emacs include files. */
48 #include <sys/types.h> 55 #include <sys/types.h>
49 56
76 83
77 #if (defined (DEBUG_XEMACS) && !defined (DEBUG)) 84 #if (defined (DEBUG_XEMACS) && !defined (DEBUG))
78 #define DEBUG 85 #define DEBUG
79 #endif 86 #endif
80 87
88 #ifdef MULE
89
90 Lisp_Object Vthe_lisp_rangetab;
91
92 void
93 complex_vars_of_regex (void)
94 {
95 Vthe_lisp_rangetab = Fmake_range_table ();
96 staticpro (&Vthe_lisp_rangetab);
97 }
98
99 #else /* not MULE */
100
81 void 101 void
82 complex_vars_of_regex (void) 102 complex_vars_of_regex (void)
83 { 103 {
84 } 104 }
105
106 #endif /* not MULE */
85 107
86 #else /* not emacs */ 108 #else /* not emacs */
87 109
88 /* If we are not linking with Emacs proper, 110 /* If we are not linking with Emacs proper,
89 we can't use the relocating allocator 111 we can't use the relocating allocator
220 #define ISASCII_1(c) 1 242 #define ISASCII_1(c) 1
221 #else 243 #else
222 #define ISASCII_1(c) isascii(c) 244 #define ISASCII_1(c) isascii(c)
223 #endif 245 #endif
224 246
247 #ifdef MULE
248 /* The IS*() macros can be passed any character, including an extended
249 one. We need to make sure there are no crashes, which would occur
250 otherwise due to out-of-bounds array references. */
251 #define ISASCII(c) (((unsigned EMACS_INT) (c)) < 0x100 && ISASCII_1 (c))
252 #else
225 #define ISASCII(c) ISASCII_1 (c) 253 #define ISASCII(c) ISASCII_1 (c)
254 #endif
226 255
227 #ifdef isblank 256 #ifdef isblank
228 #define ISBLANK(c) (ISASCII (c) && isblank (c)) 257 #define ISBLANK(c) (ISASCII (c) && isblank (c))
229 #else 258 #else
230 #define ISBLANK(c) ((c) == ' ' || (c) == '\t') 259 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
504 syntaxspec, 533 syntaxspec,
505 534
506 /* Matches any character whose syntax is not that specified. */ 535 /* Matches any character whose syntax is not that specified. */
507 notsyntaxspec 536 notsyntaxspec
508 #endif /* emacs */ 537 #endif /* emacs */
538
539 #ifdef MULE
540 /* need extra stuff to be able to properly work with XEmacs/Mule
541 characters (which may take up more than one byte) */
542
543 ,charset_mule, /* Matches any character belonging to specified set.
544 The set is stored in "unified range-table
545 format"; see rangetab.c. Unlike the `charset'
546 opcode, this can handle arbitrary characters. */
547
548 charset_mule_not /* Same parameters as charset_mule, but match any
549 character that is not one of those specified. */
550 #endif
551
509 } re_opcode_t; 552 } re_opcode_t;
510 553
511 /* Common operations on the compiled pattern. */ 554 /* Common operations on the compiled pattern. */
512 555
513 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */ 556 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
735 778
736 p += 1 + *p; 779 p += 1 + *p;
737 } 780 }
738 break; 781 break;
739 782
783 #ifdef MULE
784 case charset_mule:
785 case charset_mule_not:
786 {
787 int nentries, i;
788
789 printf ("/charset_mule [%s",
790 (re_opcode_t) *(p - 1) == charset_mule_not ? "^" : "");
791 nentries = unified_range_table_nentries (p);
792 for (i = 0; i < nentries; i++)
793 {
794 EMACS_INT first, last;
795 Lisp_Object dummy_val;
796
797 unified_range_table_get_range (p, i, &first, &last,
798 &dummy_val);
799 if (first < 0x100)
800 putchar (first);
801 else
802 printf ("(0x%x)", first);
803 if (first != last)
804 {
805 putchar ('-');
806 if (last < 0x100)
807 putchar (last);
808 else
809 printf ("(0x%x)", last);
810 }
811 }
812 putchar (']');
813 p += unified_range_table_bytes_used (p);
814 }
815 break;
816 #endif
817
740 case begline: 818 case begline:
741 printf ("/begline"); 819 printf ("/begline");
742 break; 820 break;
743 821
744 case endline: 822 case endline:
983 "Regular expression too big", /* REG_ESIZE */ 1061 "Regular expression too big", /* REG_ESIZE */
984 "Unmatched ) or \\)", /* REG_ERPAREN */ 1062 "Unmatched ) or \\)", /* REG_ERPAREN */
985 #ifdef emacs 1063 #ifdef emacs
986 "Invalid syntax designator", /* REG_ESYNTAX */ 1064 "Invalid syntax designator", /* REG_ESYNTAX */
987 #endif 1065 #endif
1066 #ifdef MULE
1067 "Ranges may not span charsets", /* REG_ERANGESPAN */
1068 #endif
988 }; 1069 };
989 1070
990 /* Avoiding alloca during matching, to placate r_alloc. */ 1071 /* Avoiding alloca during matching, to placate r_alloc. */
991 1072
992 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the 1073 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1042 This is a variable only so users of regex can assign to it; we never 1123 This is a variable only so users of regex can assign to it; we never
1043 change it ourselves. */ 1124 change it ourselves. */
1044 #if defined (MATCH_MAY_ALLOCATE) 1125 #if defined (MATCH_MAY_ALLOCATE)
1045 /* 4400 was enough to cause a crash on Alpha OSF/1, 1126 /* 4400 was enough to cause a crash on Alpha OSF/1,
1046 whose default stack limit is 2mb. */ 1127 whose default stack limit is 2mb. */
1047 int re_max_failures = 20000; 1128 int re_max_failures = 4000;
1048 #else 1129 #else
1049 int re_max_failures = 2000; 1130 int re_max_failures = 2000;
1050 #endif 1131 #endif
1051 1132
1052 union fail_stack_elt 1133 union fail_stack_elt
1265 #else 1346 #else
1266 #define NUM_NONREG_ITEMS 4 1347 #define NUM_NONREG_ITEMS 4
1267 #endif 1348 #endif
1268 1349
1269 /* We push at most this many items on the stack. */ 1350 /* We push at most this many items on the stack. */
1270 /* We used to use (num_regs - 1), which is the number of registers 1351 #define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
1271 this regexp will save; but that was changed to 5
1272 to avoid stack overflow for a regexp with lots of parens. */
1273 #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
1274 1352
1275 /* We actually push this many items. */ 1353 /* We actually push this many items. */
1276 #define NUM_FAILURE_ITEMS \ 1354 #define NUM_FAILURE_ITEMS \
1277 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \ 1355 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
1278 + NUM_NONREG_ITEMS) 1356 + NUM_NONREG_ITEMS)
1436 } while (0) 1514 } while (0)
1437 1515
1438 /* Go backwards one character in the pattern. */ 1516 /* Go backwards one character in the pattern. */
1439 #define PATUNFETCH p-- 1517 #define PATUNFETCH p--
1440 1518
1519 #ifdef MULE
1520
1521 #define PATFETCH_EXTENDED(emch) \
1522 do {if (p == pend) return REG_EEND; \
1523 assert (p < pend); \
1524 emch = charptr_emchar ((CONST Bufbyte *) p); \
1525 INC_CHARPTR (p); \
1526 if (translate && emch < 0x80) \
1527 emch = (Emchar) (unsigned char) translate[emch]; \
1528 } while (0)
1529
1530 #define PATFETCH_RAW_EXTENDED(emch) \
1531 do {if (p == pend) return REG_EEND; \
1532 assert (p < pend); \
1533 emch = charptr_emchar ((CONST Bufbyte *) p); \
1534 INC_CHARPTR (p); \
1535 } while (0)
1536
1537 #define PATUNFETCH_EXTENDED DEC_CHARPTR (p)
1538
1539 #define PATFETCH_EITHER(emch) \
1540 do { \
1541 if (has_extended_chars) \
1542 PATFETCH_EXTENDED (emch); \
1543 else \
1544 PATFETCH (emch); \
1545 } while (0)
1546
1547 #define PATFETCH_RAW_EITHER(emch) \
1548 do { \
1549 if (has_extended_chars) \
1550 PATFETCH_RAW_EXTENDED (emch); \
1551 else \
1552 PATFETCH_RAW (emch); \
1553 } while (0)
1554
1555 #define PATUNFETCH_EITHER \
1556 do { \
1557 if (has_extended_chars) \
1558 PATUNFETCH_EXTENDED (emch); \
1559 else \
1560 PATUNFETCH (emch); \
1561 } while (0)
1562
1563 #else /* not MULE */
1564
1441 #define PATFETCH_EITHER(emch) PATFETCH (emch) 1565 #define PATFETCH_EITHER(emch) PATFETCH (emch)
1442 #define PATFETCH_RAW_EITHER(emch) PATFETCH_RAW (emch) 1566 #define PATFETCH_RAW_EITHER(emch) PATFETCH_RAW (emch)
1443 #define PATUNFETCH_EITHER PATUNFETCH 1567 #define PATUNFETCH_EITHER PATUNFETCH
1444 1568
1569 #endif /* not MULE */
1445 1570
1446 /* If `translate' is non-null, return translate[D], else just D. We 1571 /* If `translate' is non-null, return translate[D], else just D. We
1447 cast the subscript to translate because some data is declared as 1572 cast the subscript to translate because some data is declared as
1448 `char *', to avoid warnings when a string constant is passed. But 1573 `char *', to avoid warnings when a string constant is passed. But
1449 when we use a character as a subscript we must make it unsigned. */ 1574 when we use a character as a subscript we must make it unsigned. */
1450 #define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d)) 1575 #define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d))
1451 1576
1577 #ifdef MULE
1578
1579 #define TRANSLATE_EXTENDED_UNSAFE(emch) \
1580 (translate && emch < 0x80 ? translate[emch] : (emch))
1581
1582 #endif
1452 1583
1453 /* Macros for outputting the compiled pattern into `buffer'. */ 1584 /* Macros for outputting the compiled pattern into `buffer'. */
1454 1585
1455 /* If the buffer isn't allocated when it comes in, use this. */ 1586 /* If the buffer isn't allocated when it comes in, use this. */
1456 #define INIT_BUF_SIZE 32 1587 #define INIT_BUF_SIZE 32
1587 /* Set the bit for character C in a bit vector. */ 1718 /* Set the bit for character C in a bit vector. */
1588 #define SET_LIST_BIT(c) \ 1719 #define SET_LIST_BIT(c) \
1589 (b[((unsigned char) (c)) / BYTEWIDTH] \ 1720 (b[((unsigned char) (c)) / BYTEWIDTH] \
1590 |= 1 << (((unsigned char) c) % BYTEWIDTH)) 1721 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1591 1722
1723 #ifdef MULE
1724
1725 /* Set the "bit" for character C in a range table. */
1726 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt)
1727
1728 /* Set the "bit" for character c in the appropriate table. */
1729 #define SET_EITHER_BIT(c) \
1730 do { \
1731 if (has_extended_chars) \
1732 SET_RANGETAB_BIT (c); \
1733 else \
1734 SET_LIST_BIT (c); \
1735 } while (0)
1736
1737 #else /* not MULE */
1738
1592 #define SET_EITHER_BIT(c) SET_LIST_BIT (c) 1739 #define SET_EITHER_BIT(c) SET_LIST_BIT (c)
1593 1740
1741 #endif
1594 1742
1595 1743
1596 /* Get the next unsigned number in the uncompiled pattern. */ 1744 /* Get the next unsigned number in the uncompiled pattern. */
1597 #define GET_UNSIGNED_NUMBER(num) \ 1745 #define GET_UNSIGNED_NUMBER(num) \
1598 { if (p != pend) \ 1746 { if (p != pend) \
1632 static boolean group_in_compile_stack (compile_stack_type compile_stack, 1780 static boolean group_in_compile_stack (compile_stack_type compile_stack,
1633 regnum_t regnum); 1781 regnum_t regnum);
1634 static reg_errcode_t compile_range (CONST char **p_ptr, CONST char *pend, 1782 static reg_errcode_t compile_range (CONST char **p_ptr, CONST char *pend,
1635 char *translate, reg_syntax_t syntax, 1783 char *translate, reg_syntax_t syntax,
1636 unsigned char *b); 1784 unsigned char *b);
1785 #ifdef MULE
1786 static reg_errcode_t compile_extended_range (CONST char **p_ptr,
1787 CONST char *pend,
1788 char *translate,
1789 reg_syntax_t syntax,
1790 Lisp_Object rtab);
1791 #endif
1637 static boolean group_match_null_string_p (unsigned char **p, 1792 static boolean group_match_null_string_p (unsigned char **p,
1638 unsigned char *end, 1793 unsigned char *end,
1639 register_info_type *reg_info); 1794 register_info_type *reg_info);
1640 static boolean alt_match_null_string_p (unsigned char *p, unsigned char *end, 1795 static boolean alt_match_null_string_p (unsigned char *p, unsigned char *end,
1641 register_info_type *reg_info); 1796 register_info_type *reg_info);
2009 2164
2010 case '[': 2165 case '[':
2011 { 2166 {
2012 /* XEmacs change: this whole section */ 2167 /* XEmacs change: this whole section */
2013 boolean had_char_class = false; 2168 boolean had_char_class = false;
2169 #ifdef MULE
2170 boolean has_extended_chars = false;
2171 REGISTER Lisp_Object rtab = Qnil;
2172 #endif
2014 2173
2015 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2174 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2016 2175
2017 /* Ensure that we have enough space to push a charset: the 2176 /* Ensure that we have enough space to push a charset: the
2018 opcode, the length count, and the bitset; 34 bytes in all. */ 2177 opcode, the length count, and the bitset; 34 bytes in all. */
2038 /* charset_not matches newline according to a syntax bit. */ 2197 /* charset_not matches newline according to a syntax bit. */
2039 if ((re_opcode_t) b[-2] == charset_not 2198 if ((re_opcode_t) b[-2] == charset_not
2040 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) 2199 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2041 SET_LIST_BIT ('\n'); 2200 SET_LIST_BIT ('\n');
2042 2201
2202 #ifdef MULE
2203 start_over_with_extended:
2204 if (has_extended_chars)
2205 {
2206 /* There are extended chars here, which means we need to start
2207 over and shift to unified range-table format. */
2208 if (b[-2] == charset)
2209 b[-2] = charset_mule;
2210 else
2211 b[-2] = charset_mule_not;
2212 b--;
2213 p = p1; /* go back to the beginning of the charset, after
2214 a possible ^. */
2215 rtab = Vthe_lisp_rangetab;
2216 Fclear_range_table (rtab);
2217
2218 /* charset_not matches newline according to a syntax bit. */
2219 if ((re_opcode_t) b[-1] == charset_mule_not
2220 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2221 SET_EITHER_BIT ('\n');
2222 }
2223 #endif /* MULE */
2224
2043 /* Read in characters and ranges, setting map bits. */ 2225 /* Read in characters and ranges, setting map bits. */
2044 for (;;) 2226 for (;;)
2045 { 2227 {
2046 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2228 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2047 2229
2048 PATFETCH_EITHER (c); 2230 PATFETCH_EITHER (c);
2049 2231
2232 #ifdef MULE
2233 if (c >= 0x80 && !has_extended_chars)
2234 {
2235 has_extended_chars = 1;
2236 /* Frumble-bumble, we've found some extended chars.
2237 Need to start over, process everything using
2238 the general extended-char mechanism, and need
2239 to use charset_mule and charset_mule_not instead
2240 of charset and charset_not. */
2241 goto start_over_with_extended;
2242 }
2243 #endif /* MULE */
2050 /* \ might escape characters inside [...] and [^...]. */ 2244 /* \ might escape characters inside [...] and [^...]. */
2051 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') 2245 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2052 { 2246 {
2053 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 2247 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2054 2248
2055 PATFETCH_EITHER (c1); 2249 PATFETCH_EITHER (c1);
2250 #ifdef MULE
2251 if (c1 >= 0x80 && !has_extended_chars)
2252 {
2253 has_extended_chars = 1;
2254 goto start_over_with_extended;
2255 }
2256 #endif /* MULE */
2056 SET_EITHER_BIT (c1); 2257 SET_EITHER_BIT (c1);
2057 continue; 2258 continue;
2058 } 2259 }
2059 2260
2060 /* Could be the end of the bracket expression. If it's 2261 /* Could be the end of the bracket expression. If it's
2077 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') 2278 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2078 && *p != ']') 2279 && *p != ']')
2079 { 2280 {
2080 reg_errcode_t ret; 2281 reg_errcode_t ret;
2081 2282
2082 ret = compile_range (&p, pend, translate, syntax, b); 2283 #ifdef MULE
2284 if (* (unsigned char *) p >= 0x80 && !has_extended_chars)
2285 {
2286 has_extended_chars = 1;
2287 goto start_over_with_extended;
2288 }
2289 if (has_extended_chars)
2290 ret = compile_extended_range (&p, pend, translate,
2291 syntax, rtab);
2292 else
2293 #endif /* MULE */
2294 ret = compile_range (&p, pend, translate, syntax, b);
2083 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); 2295 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2084 } 2296 }
2085 2297
2086 else if (p[0] == '-' && p[1] != ']') 2298 else if (p[0] == '-' && p[1] != ']')
2087 { /* This handles ranges made up of characters only. */ 2299 { /* This handles ranges made up of characters only. */
2088 reg_errcode_t ret; 2300 reg_errcode_t ret;
2089 2301
2090 /* Move past the `-'. */ 2302 /* Move past the `-'. */
2091 PATFETCH (c1); 2303 PATFETCH (c1);
2092 2304
2093 ret = compile_range (&p, pend, translate, syntax, b); 2305 #ifdef MULE
2306 if (* (unsigned char *) p >= 0x80 && !has_extended_chars)
2307 {
2308 has_extended_chars = 1;
2309 goto start_over_with_extended;
2310 }
2311 if (has_extended_chars)
2312 ret = compile_extended_range (&p, pend, translate,
2313 syntax, rtab);
2314 else
2315 #endif /* MULE */
2316 ret = compile_range (&p, pend, translate, syntax, b);
2094 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); 2317 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2095 } 2318 }
2096 2319
2097 /* See if we're at the beginning of a possible character 2320 /* See if we're at the beginning of a possible character
2098 class. */ 2321 class. */
2188 had_char_class = false; 2411 had_char_class = false;
2189 SET_EITHER_BIT (c); 2412 SET_EITHER_BIT (c);
2190 } 2413 }
2191 } 2414 }
2192 2415
2416 #ifdef MULE
2417 if (has_extended_chars)
2418 {
2419 /* We have a range table, not a bit vector. */
2420 int bytes_needed =
2421 unified_range_table_bytes_needed (rtab);
2422 GET_BUFFER_SPACE (bytes_needed);
2423 unified_range_table_copy_data (rtab, b);
2424 b += unified_range_table_bytes_used (b);
2425 break;
2426 }
2427 #endif /* MULE */
2193 /* Discard any (non)matching list bytes that are all 0 at the 2428 /* Discard any (non)matching list bytes that are all 0 at the
2194 end of the map. Decrease the map-length byte too. */ 2429 end of the map. Decrease the map-length byte too. */
2195 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) 2430 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2196 b[-1]--; 2431 b[-1]--;
2197 b += b[-1]; 2432 b += b[-1];
2958 SET_LIST_BIT (TRANSLATE (this_char)); 3193 SET_LIST_BIT (TRANSLATE (this_char));
2959 } 3194 }
2960 3195
2961 return REG_NOERROR; 3196 return REG_NOERROR;
2962 } 3197 }
3198
3199 #ifdef MULE
3200
3201 static reg_errcode_t
3202 compile_extended_range (CONST char **p_ptr, CONST char *pend, char *translate,
3203 reg_syntax_t syntax, Lisp_Object rtab)
3204 {
3205 Emchar this_char;
3206
3207 CONST char *p = *p_ptr;
3208 EMACS_INT range_start, range_end;
3209
3210 if (p == pend)
3211 return REG_ERANGE;
3212
3213 p--; /* back to '-' */
3214 DEC_CHARPTR (p); /* back to start of range */
3215 /* We also want to fetch the endpoints without translating them; the
3216 appropriate translation is done in the bit-setting loop below. */
3217 range_start = charptr_emchar ((CONST Bufbyte *) p);
3218 range_end = charptr_emchar ((CONST Bufbyte *) (*p_ptr));
3219 INC_CHARPTR (*p_ptr);
3220
3221 /* If the start is after the end, the range is empty. */
3222 if (range_start > range_end)
3223 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3224
3225 /* Can't have ranges spanning different charsets, except maybe for
3226 ranges entirely within the first 256 chars. */
3227
3228 if ((range_start >= 0x100 || range_end >= 0x100)
3229 && CHAR_LEADING_BYTE (range_start) !=
3230 CHAR_LEADING_BYTE (range_end))
3231 return REG_ERANGESPAN;
3232
3233 /* As advertised, translations only work over the 0 - 0x7F range.
3234 Making this kind of stuff work generally is much harder.
3235 Iterating over the whole range like this would be way inefficient
3236 if the range encompasses 10,000 chars or something. You'd have
3237 to do something like this:
3238
3239 range_table a;
3240 range_table b;
3241 map over translation table in [range_start, range_end] of
3242 (put the mapped range in a;
3243 put the translation in b)
3244 invert the range in a and truncate to [range_start, range_end]
3245 compute the union of a, b
3246 union the result into rtab
3247 */
3248 for (this_char = range_start;
3249 this_char <= range_end && this_char < 0x80; this_char++)
3250 {
3251 SET_RANGETAB_BIT (TRANSLATE (this_char));
3252 }
3253
3254 if (this_char <= range_end)
3255 put_range_table (rtab, this_char, range_end, Qt);
3256
3257 return REG_NOERROR;
3258 }
3259
3260 #endif /* MULE */
2963 3261
2964 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in 3262 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
2965 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible 3263 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
2966 characters can start a string that matches the pattern. This fastmap 3264 characters can start a string that matches the pattern. This fastmap
2967 is used by re_search to skip quickly over impossible starting points. 3265 is used by re_search to skip quickly over impossible starting points.
3056 fastmap[p[1]] = 1; 3354 fastmap[p[1]] = 1;
3057 break; 3355 break;
3058 3356
3059 3357
3060 case charset: 3358 case charset:
3359 /* XEmacs: Under Mule, these bit vectors will
3360 only contain values for characters below 0x80. */
3061 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 3361 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3062 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) 3362 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3063 fastmap[j] = 1; 3363 fastmap[j] = 1;
3064 break; 3364 break;
3065 3365
3066 3366
3067 case charset_not: 3367 case charset_not:
3068 /* Chars beyond end of map must be allowed. */ 3368 /* Chars beyond end of map must be allowed. */
3369 #ifdef MULE
3370 for (j = *p * BYTEWIDTH; j < 0x80; j++)
3371 fastmap[j] = 1;
3372 /* And all extended characters must be allowed, too. */
3373 for (j = 0x80; j < 0xA0; j++)
3374 fastmap[j] = 1;
3375 #else
3069 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) 3376 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3070 fastmap[j] = 1; 3377 fastmap[j] = 1;
3378 #endif
3071 3379
3072 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 3380 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3073 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) 3381 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3074 fastmap[j] = 1; 3382 fastmap[j] = 1;
3075 break; 3383 break;
3384
3385 #ifdef MULE
3386 case charset_mule:
3387 {
3388 int nentries;
3389 int i;
3390
3391 nentries = unified_range_table_nentries (p);
3392 for (i = 0; i < nentries; i++)
3393 {
3394 EMACS_INT first, last;
3395 Lisp_Object dummy_val;
3396 int jj;
3397 Bufbyte strr[MAX_EMCHAR_LEN];
3398
3399 unified_range_table_get_range (p, i, &first, &last,
3400 &dummy_val);
3401 for (jj = first; jj <= last && jj < 0x80; jj++)
3402 fastmap[jj] = 1;
3403 /* Ranges below 0x100 can span charsets, but there
3404 are only two (Control-1 and Latin-1), and
3405 either first or last has to be in them. */
3406 set_charptr_emchar (strr, first);
3407 fastmap[*strr] = 1;
3408 if (last < 0x100)
3409 {
3410 set_charptr_emchar (strr, last);
3411 fastmap[*strr] = 1;
3412 }
3413 }
3414 }
3415 break;
3416
3417 case charset_mule_not:
3418 {
3419 int nentries;
3420 int i;
3421
3422 nentries = unified_range_table_nentries (p);
3423 for (i = 0; i < nentries; i++)
3424 {
3425 EMACS_INT first, last;
3426 Lisp_Object dummy_val;
3427 int jj;
3428 int smallest_prev = 0;
3429
3430 unified_range_table_get_range (p, i, &first, &last,
3431 &dummy_val);
3432 for (jj = smallest_prev; jj < first && jj < 0x80; jj++)
3433 fastmap[jj] = 1;
3434 smallest_prev = last + 1;
3435 if (smallest_prev >= 0x80)
3436 break;
3437 }
3438 /* Calculating which leading bytes are actually allowed
3439 here is rather difficult, so we just punt and allow
3440 all of them. */
3441 for (i = 0x80; i < 0xA0; i++)
3442 fastmap[i] = 1;
3443 }
3444 break;
3445 #endif /* MULE */
3076 3446
3077 3447
3078 case wordchar: 3448 case wordchar:
3079 #ifdef emacs 3449 #ifdef emacs
3080 k = (int) Sword; 3450 k = (int) Sword;
3081 goto matchsyntax; 3451 goto matchsyntax;
3082 #else 3452 #else
3083 for (j = 0; j < (1 << BYTEWIDTH); j++) 3453 for (j = 0; j < (1 << BYTEWIDTH); j++)
3084 if (SYNTAX_UNSAFE (regex_emacs_buffer->syntax_table, j) == Sword) 3454 if (SYNTAX_UNSAFE
3455 (XCHAR_TABLE
3456 (regex_emacs_buffer->mirror_syntax_table), j) == Sword)
3085 fastmap[j] = 1; 3457 fastmap[j] = 1;
3086 break; 3458 break;
3087 #endif 3459 #endif
3088 3460
3089 3461
3091 #ifdef emacs 3463 #ifdef emacs
3092 k = (int) Sword; 3464 k = (int) Sword;
3093 goto matchnotsyntax; 3465 goto matchnotsyntax;
3094 #else 3466 #else
3095 for (j = 0; j < (1 << BYTEWIDTH); j++) 3467 for (j = 0; j < (1 << BYTEWIDTH); j++)
3096 if (SYNTAX_UNSAFE (regex_emacs_buffer->syntax_table, j) != Sword) 3468 if (SYNTAX_UNSAFE
3469 (XCHAR_TABLE
3470 (regex_emacs_buffer->mirror_syntax_table), j) != Sword)
3097 fastmap[j] = 1; 3471 fastmap[j] = 1;
3098 break; 3472 break;
3099 #endif 3473 #endif
3100 3474
3101 3475
3102 case anychar: 3476 case anychar:
3103 { 3477 {
3104 int fastmap_newline = fastmap['\n']; 3478 int fastmap_newline = fastmap['\n'];
3105 3479
3106 /* `.' matches anything ... */ 3480 /* `.' matches anything ... */
3481 #ifdef MULE
3482 /* "anything" only includes bytes that can be the
3483 first byte of a character. */
3484 for (j = 0; j < 0xA0; j++)
3485 fastmap[j] = 1;
3486 #else
3107 for (j = 0; j < (1 << BYTEWIDTH); j++) 3487 for (j = 0; j < (1 << BYTEWIDTH); j++)
3108 fastmap[j] = 1; 3488 fastmap[j] = 1;
3489 #endif
3109 3490
3110 /* ... except perhaps newline. */ 3491 /* ... except perhaps newline. */
3111 if (!(bufp->syntax & RE_DOT_NEWLINE)) 3492 if (!(bufp->syntax & RE_DOT_NEWLINE))
3112 fastmap['\n'] = fastmap_newline; 3493 fastmap['\n'] = fastmap_newline;
3113 3494
3122 3503
3123 #ifdef emacs 3504 #ifdef emacs
3124 case syntaxspec: 3505 case syntaxspec:
3125 k = *p++; 3506 k = *p++;
3126 matchsyntax: 3507 matchsyntax:
3127 for (j = 0; j < (1 << BYTEWIDTH); j++) 3508 #ifdef MULE
3128 if (SYNTAX_UNSAFE (regex_emacs_buffer->syntax_table, j) == 3509 for (j = 0; j < 0x80; j++)
3510 if (SYNTAX_UNSAFE
3511 (XCHAR_TABLE
3512 (regex_emacs_buffer->mirror_syntax_table), j) ==
3129 (enum syntaxcode) k) 3513 (enum syntaxcode) k)
3130 fastmap[j] = 1; 3514 fastmap[j] = 1;
3515 for (j = 0x80; j < 0xA0; j++)
3516 {
3517 if (j == PRE_LEADING_BYTE_PRIVATE_1
3518 || j == PRE_LEADING_BYTE_PRIVATE_2)
3519 /* too complicated to calculate this right */
3520 fastmap[j] = 1;
3521 else
3522 {
3523 int multi_p;
3524 Lisp_Object cset;
3525
3526 cset = CHARSET_BY_LEADING_BYTE (j);
3527 if (CHARSETP (cset))
3528 {
3529 if (charset_syntax (regex_emacs_buffer, cset,
3530 &multi_p)
3531 == Sword || multi_p)
3532 fastmap[j] = 1;
3533 }
3534 }
3535 }
3536 #else /* ! MULE */
3537 for (j = 0; j < (1 << BYTEWIDTH); j++)
3538 if (SYNTAX_UNSAFE
3539 (XCHAR_TABLE
3540 (regex_emacs_buffer->mirror_syntax_table), j) ==
3541 (enum syntaxcode) k)
3542 fastmap[j] = 1;
3543 #endif /* ! MULE */
3131 break; 3544 break;
3132 3545
3133 3546
3134 case notsyntaxspec: 3547 case notsyntaxspec:
3135 k = *p++; 3548 k = *p++;
3136 matchnotsyntax: 3549 matchnotsyntax:
3137 for (j = 0; j < (1 << BYTEWIDTH); j++) 3550 #ifdef MULE
3138 if (SYNTAX_UNSAFE (regex_emacs_buffer->syntax_table, j) != 3551 for (j = 0; j < 0x80; j++)
3552 if (SYNTAX_UNSAFE
3553 (XCHAR_TABLE
3554 (regex_emacs_buffer->mirror_syntax_table), j) !=
3139 (enum syntaxcode) k) 3555 (enum syntaxcode) k)
3140 fastmap[j] = 1; 3556 fastmap[j] = 1;
3557 for (j = 0x80; j < 0xA0; j++)
3558 {
3559 if (j == PRE_LEADING_BYTE_PRIVATE_1
3560 || j == PRE_LEADING_BYTE_PRIVATE_2)
3561 /* too complicated to calculate this right */
3562 fastmap[j] = 1;
3563 else
3564 {
3565 int multi_p;
3566 Lisp_Object cset;
3567
3568 cset = CHARSET_BY_LEADING_BYTE (j);
3569 if (CHARSETP (cset))
3570 {
3571 if (charset_syntax (regex_emacs_buffer, cset,
3572 &multi_p)
3573 != Sword || multi_p)
3574 fastmap[j] = 1;
3575 }
3576 }
3577 }
3578 #else /* ! MULE */
3579 for (j = 0; j < (1 << BYTEWIDTH); j++)
3580 if (SYNTAX_UNSAFE
3581 (XCHAR_TABLE
3582 (regex_emacs_buffer->mirror_syntax_table), j) !=
3583 (enum syntaxcode) k)
3584 fastmap[j] = 1;
3585 #endif /* ! MULE */
3141 break; 3586 break;
3142 3587
3143 3588
3144 /* All cases after this match the empty string. These end with 3589 /* All cases after this match the empty string. These end with
3145 `continue'. */ 3590 `continue'. */
3557 #define POS_BEFORE_GAP_UNSAFE(d) ((d) == string2 ? end1 : (d)) 4002 #define POS_BEFORE_GAP_UNSAFE(d) ((d) == string2 ? end1 : (d))
3558 #define POS_AFTER_GAP_UNSAFE(d) ((d) == end1 ? string2 : (d)) 4003 #define POS_AFTER_GAP_UNSAFE(d) ((d) == end1 ? string2 : (d))
3559 4004
3560 /* Test if CH is a word-constituent character. (XEmacs change) */ 4005 /* Test if CH is a word-constituent character. (XEmacs change) */
3561 #define WORDCHAR_P_UNSAFE(ch) \ 4006 #define WORDCHAR_P_UNSAFE(ch) \
3562 (SYNTAX_UNSAFE (regex_emacs_buffer->syntax_table, ch) == Sword) 4007 (SYNTAX_UNSAFE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), \
4008 ch) == Sword)
3563 4009
3564 /* Free everything we malloc. */ 4010 /* Free everything we malloc. */
3565 #ifdef MATCH_MAY_ALLOCATE 4011 #ifdef MATCH_MAY_ALLOCATE
3566 #define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL 4012 #define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
3567 #define FREE_VARIABLES() \ 4013 #define FREE_VARIABLES() \
4118 p += 1 + *p; 4564 p += 1 + *p;
4119 4565
4120 if (!not) goto fail; 4566 if (!not) goto fail;
4121 4567
4122 SET_REGS_MATCHED (); 4568 SET_REGS_MATCHED ();
4123 d++; 4569 INC_CHARPTR (d); /* XEmacs change */
4124 break; 4570 break;
4125 } 4571 }
4572
4573 #ifdef MULE
4574 case charset_mule:
4575 case charset_mule_not:
4576 {
4577 register Emchar c;
4578 boolean not = (re_opcode_t) *(p - 1) == charset_mule_not;
4579
4580 DEBUG_PRINT2 ("EXECUTING charset_mule%s.\n", not ? "_not" : "");
4581
4582 PREFETCH ();
4583 c = charptr_emchar ((CONST Bufbyte *) d);
4584 c = TRANSLATE_EXTENDED_UNSAFE (c); /* The character to match. */
4585
4586 if (EQ (Qt, unified_range_table_lookup (p, c, Qnil)))
4587 not = !not;
4588
4589 p += unified_range_table_bytes_used (p);
4590
4591 if (!not) goto fail;
4592
4593 SET_REGS_MATCHED ();
4594 INC_CHARPTR (d);
4595 break;
4596 }
4597 #endif
4126 4598
4127 4599
4128 /* The beginning of a group is represented by start_memory. 4600 /* The beginning of a group is represented by start_memory.
4129 The arguments are the register number in the next byte, and the 4601 The arguments are the register number in the next byte, and the
4130 number of groups inner to this one in the next. The text 4602 number of groups inner to this one in the next. The text
4598 #ifdef DEBUG 5070 #ifdef DEBUG
4599 register unsigned char c 5071 register unsigned char c
4600 = *p2 == (unsigned char) endline ? '\n' : p2[2]; 5072 = *p2 == (unsigned char) endline ? '\n' : p2[2];
4601 #endif 5073 #endif
4602 5074
4603 #if 1
4604 /* dmoore@ucsd.edu - emacs 19.34 uses this: */
4605
4606 if ((re_opcode_t) p1[3] == exactn 5075 if ((re_opcode_t) p1[3] == exactn
4607 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5] 5076 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[4]
4608 && (p2[2 + p1[5] / BYTEWIDTH] 5077 && (p2[1 + p1[4] / BYTEWIDTH]
4609 & (1 << (p1[5] % BYTEWIDTH))))) 5078 & (1 << (p1[4] % BYTEWIDTH)))))
4610 #else
4611 if ((re_opcode_t) p1[3] == exactn
4612 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[4]
4613 && (p2[1 + p1[4] / BYTEWIDTH]
4614 & (1 << (p1[4] % BYTEWIDTH)))))
4615 #endif
4616 { 5079 {
4617 p[-3] = (unsigned char) pop_failure_jump; 5080 p[-3] = (unsigned char) pop_failure_jump;
4618 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", 5081 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
4619 c, p1[5]); 5082 c, p1[5]);
4620 } 5083 }
4918 int matches; 5381 int matches;
4919 Emchar emch; 5382 Emchar emch;
4920 5383
4921 PREFETCH (); 5384 PREFETCH ();
4922 emch = charptr_emchar ((CONST Bufbyte *) d); 5385 emch = charptr_emchar ((CONST Bufbyte *) d);
4923 matches = (SYNTAX_UNSAFE (regex_emacs_buffer->syntax_table, 5386 matches = (SYNTAX_UNSAFE
4924 emch) == (enum syntaxcode) mcnt); 5387 (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table),
5388 emch) == (enum syntaxcode) mcnt);
4925 INC_CHARPTR (d); 5389 INC_CHARPTR (d);
4926 if (matches != should_succeed) 5390 if (matches != should_succeed)
4927 goto fail; 5391 goto fail;
4928 SET_REGS_MATCHED (); 5392 SET_REGS_MATCHED ();
4929 } 5393 }