Mercurial > hg > xemacs-beta
comparison src/regex.c @ 70:131b0175ea99 r20-0b30
Import from CVS: tag r20-0b30
author | cvs |
---|---|
date | Mon, 13 Aug 2007 09:02:59 +0200 |
parents | 441bb1e64a06 |
children | 6a378aca36af |
comparison
equal
deleted
inserted
replaced
69:804d1389bcd6 | 70:131b0175ea99 |
---|---|
29 (1) the REGEX_BEGLINE_CHECK code from the XEmacs v18 regex routines | 29 (1) the REGEX_BEGLINE_CHECK code from the XEmacs v18 regex routines |
30 was added. This causes a huge speedup in font-locking. | 30 was added. This causes a huge speedup in font-locking. |
31 (2) Rel-alloc is disabled when the MMAP version of rel-alloc is | 31 (2) Rel-alloc is disabled when the MMAP version of rel-alloc is |
32 being used, because it's too slow -- all those calls to mmap() | 32 being used, because it's too slow -- all those calls to mmap() |
33 add humongous overhead. | 33 add humongous overhead. |
34 (3) Lots and lots of changes for Mule. They are bracketed by | |
35 `#ifdef MULE' or with comments that have `XEmacs' in them. | |
34 */ | 36 */ |
35 | 37 |
36 /* AIX requires this to be the first thing in the file. */ | 38 /* AIX requires this to be the first thing in the file. */ |
37 #if defined (_AIX) && !defined (REGEX_MALLOC) | 39 #if defined (_AIX) && !defined (REGEX_MALLOC) |
38 #pragma alloca | 40 #pragma alloca |
40 | 42 |
41 #define _GNU_SOURCE | 43 #define _GNU_SOURCE |
42 | 44 |
43 #ifdef HAVE_CONFIG_H | 45 #ifdef HAVE_CONFIG_H |
44 #include <config.h> | 46 #include <config.h> |
47 #endif | |
48 | |
49 /* We assume non-Mule if emacs isn't defined. */ | |
50 #ifndef emacs | |
51 #undef MULE | |
45 #endif | 52 #endif |
46 | 53 |
47 /* We need this for `regex.h', and perhaps for the Emacs include files. */ | 54 /* We need this for `regex.h', and perhaps for the Emacs include files. */ |
48 #include <sys/types.h> | 55 #include <sys/types.h> |
49 | 56 |
76 | 83 |
77 #if (defined (DEBUG_XEMACS) && !defined (DEBUG)) | 84 #if (defined (DEBUG_XEMACS) && !defined (DEBUG)) |
78 #define DEBUG | 85 #define DEBUG |
79 #endif | 86 #endif |
80 | 87 |
88 #ifdef MULE | |
89 | |
90 Lisp_Object Vthe_lisp_rangetab; | |
91 | |
92 void | |
93 complex_vars_of_regex (void) | |
94 { | |
95 Vthe_lisp_rangetab = Fmake_range_table (); | |
96 staticpro (&Vthe_lisp_rangetab); | |
97 } | |
98 | |
99 #else /* not MULE */ | |
100 | |
81 void | 101 void |
82 complex_vars_of_regex (void) | 102 complex_vars_of_regex (void) |
83 { | 103 { |
84 } | 104 } |
105 | |
106 #endif /* not MULE */ | |
85 | 107 |
86 #else /* not emacs */ | 108 #else /* not emacs */ |
87 | 109 |
88 /* If we are not linking with Emacs proper, | 110 /* If we are not linking with Emacs proper, |
89 we can't use the relocating allocator | 111 we can't use the relocating allocator |
220 #define ISASCII_1(c) 1 | 242 #define ISASCII_1(c) 1 |
221 #else | 243 #else |
222 #define ISASCII_1(c) isascii(c) | 244 #define ISASCII_1(c) isascii(c) |
223 #endif | 245 #endif |
224 | 246 |
247 #ifdef MULE | |
248 /* The IS*() macros can be passed any character, including an extended | |
249 one. We need to make sure there are no crashes, which would occur | |
250 otherwise due to out-of-bounds array references. */ | |
251 #define ISASCII(c) (((unsigned EMACS_INT) (c)) < 0x100 && ISASCII_1 (c)) | |
252 #else | |
225 #define ISASCII(c) ISASCII_1 (c) | 253 #define ISASCII(c) ISASCII_1 (c) |
254 #endif | |
226 | 255 |
227 #ifdef isblank | 256 #ifdef isblank |
228 #define ISBLANK(c) (ISASCII (c) && isblank (c)) | 257 #define ISBLANK(c) (ISASCII (c) && isblank (c)) |
229 #else | 258 #else |
230 #define ISBLANK(c) ((c) == ' ' || (c) == '\t') | 259 #define ISBLANK(c) ((c) == ' ' || (c) == '\t') |
504 syntaxspec, | 533 syntaxspec, |
505 | 534 |
506 /* Matches any character whose syntax is not that specified. */ | 535 /* Matches any character whose syntax is not that specified. */ |
507 notsyntaxspec | 536 notsyntaxspec |
508 #endif /* emacs */ | 537 #endif /* emacs */ |
538 | |
539 #ifdef MULE | |
540 /* need extra stuff to be able to properly work with XEmacs/Mule | |
541 characters (which may take up more than one byte) */ | |
542 | |
543 ,charset_mule, /* Matches any character belonging to specified set. | |
544 The set is stored in "unified range-table | |
545 format"; see rangetab.c. Unlike the `charset' | |
546 opcode, this can handle arbitrary characters. */ | |
547 | |
548 charset_mule_not /* Same parameters as charset_mule, but match any | |
549 character that is not one of those specified. */ | |
550 #endif | |
551 | |
509 } re_opcode_t; | 552 } re_opcode_t; |
510 | 553 |
511 /* Common operations on the compiled pattern. */ | 554 /* Common operations on the compiled pattern. */ |
512 | 555 |
513 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */ | 556 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */ |
735 | 778 |
736 p += 1 + *p; | 779 p += 1 + *p; |
737 } | 780 } |
738 break; | 781 break; |
739 | 782 |
783 #ifdef MULE | |
784 case charset_mule: | |
785 case charset_mule_not: | |
786 { | |
787 int nentries, i; | |
788 | |
789 printf ("/charset_mule [%s", | |
790 (re_opcode_t) *(p - 1) == charset_mule_not ? "^" : ""); | |
791 nentries = unified_range_table_nentries (p); | |
792 for (i = 0; i < nentries; i++) | |
793 { | |
794 EMACS_INT first, last; | |
795 Lisp_Object dummy_val; | |
796 | |
797 unified_range_table_get_range (p, i, &first, &last, | |
798 &dummy_val); | |
799 if (first < 0x100) | |
800 putchar (first); | |
801 else | |
802 printf ("(0x%x)", first); | |
803 if (first != last) | |
804 { | |
805 putchar ('-'); | |
806 if (last < 0x100) | |
807 putchar (last); | |
808 else | |
809 printf ("(0x%x)", last); | |
810 } | |
811 } | |
812 putchar (']'); | |
813 p += unified_range_table_bytes_used (p); | |
814 } | |
815 break; | |
816 #endif | |
817 | |
740 case begline: | 818 case begline: |
741 printf ("/begline"); | 819 printf ("/begline"); |
742 break; | 820 break; |
743 | 821 |
744 case endline: | 822 case endline: |
983 "Regular expression too big", /* REG_ESIZE */ | 1061 "Regular expression too big", /* REG_ESIZE */ |
984 "Unmatched ) or \\)", /* REG_ERPAREN */ | 1062 "Unmatched ) or \\)", /* REG_ERPAREN */ |
985 #ifdef emacs | 1063 #ifdef emacs |
986 "Invalid syntax designator", /* REG_ESYNTAX */ | 1064 "Invalid syntax designator", /* REG_ESYNTAX */ |
987 #endif | 1065 #endif |
1066 #ifdef MULE | |
1067 "Ranges may not span charsets", /* REG_ERANGESPAN */ | |
1068 #endif | |
988 }; | 1069 }; |
989 | 1070 |
990 /* Avoiding alloca during matching, to placate r_alloc. */ | 1071 /* Avoiding alloca during matching, to placate r_alloc. */ |
991 | 1072 |
992 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the | 1073 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the |
1042 This is a variable only so users of regex can assign to it; we never | 1123 This is a variable only so users of regex can assign to it; we never |
1043 change it ourselves. */ | 1124 change it ourselves. */ |
1044 #if defined (MATCH_MAY_ALLOCATE) | 1125 #if defined (MATCH_MAY_ALLOCATE) |
1045 /* 4400 was enough to cause a crash on Alpha OSF/1, | 1126 /* 4400 was enough to cause a crash on Alpha OSF/1, |
1046 whose default stack limit is 2mb. */ | 1127 whose default stack limit is 2mb. */ |
1047 int re_max_failures = 20000; | 1128 int re_max_failures = 4000; |
1048 #else | 1129 #else |
1049 int re_max_failures = 2000; | 1130 int re_max_failures = 2000; |
1050 #endif | 1131 #endif |
1051 | 1132 |
1052 union fail_stack_elt | 1133 union fail_stack_elt |
1265 #else | 1346 #else |
1266 #define NUM_NONREG_ITEMS 4 | 1347 #define NUM_NONREG_ITEMS 4 |
1267 #endif | 1348 #endif |
1268 | 1349 |
1269 /* We push at most this many items on the stack. */ | 1350 /* We push at most this many items on the stack. */ |
1270 /* We used to use (num_regs - 1), which is the number of registers | 1351 #define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS) |
1271 this regexp will save; but that was changed to 5 | |
1272 to avoid stack overflow for a regexp with lots of parens. */ | |
1273 #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS) | |
1274 | 1352 |
1275 /* We actually push this many items. */ | 1353 /* We actually push this many items. */ |
1276 #define NUM_FAILURE_ITEMS \ | 1354 #define NUM_FAILURE_ITEMS \ |
1277 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \ | 1355 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \ |
1278 + NUM_NONREG_ITEMS) | 1356 + NUM_NONREG_ITEMS) |
1436 } while (0) | 1514 } while (0) |
1437 | 1515 |
1438 /* Go backwards one character in the pattern. */ | 1516 /* Go backwards one character in the pattern. */ |
1439 #define PATUNFETCH p-- | 1517 #define PATUNFETCH p-- |
1440 | 1518 |
1519 #ifdef MULE | |
1520 | |
1521 #define PATFETCH_EXTENDED(emch) \ | |
1522 do {if (p == pend) return REG_EEND; \ | |
1523 assert (p < pend); \ | |
1524 emch = charptr_emchar ((CONST Bufbyte *) p); \ | |
1525 INC_CHARPTR (p); \ | |
1526 if (translate && emch < 0x80) \ | |
1527 emch = (Emchar) (unsigned char) translate[emch]; \ | |
1528 } while (0) | |
1529 | |
1530 #define PATFETCH_RAW_EXTENDED(emch) \ | |
1531 do {if (p == pend) return REG_EEND; \ | |
1532 assert (p < pend); \ | |
1533 emch = charptr_emchar ((CONST Bufbyte *) p); \ | |
1534 INC_CHARPTR (p); \ | |
1535 } while (0) | |
1536 | |
1537 #define PATUNFETCH_EXTENDED DEC_CHARPTR (p) | |
1538 | |
1539 #define PATFETCH_EITHER(emch) \ | |
1540 do { \ | |
1541 if (has_extended_chars) \ | |
1542 PATFETCH_EXTENDED (emch); \ | |
1543 else \ | |
1544 PATFETCH (emch); \ | |
1545 } while (0) | |
1546 | |
1547 #define PATFETCH_RAW_EITHER(emch) \ | |
1548 do { \ | |
1549 if (has_extended_chars) \ | |
1550 PATFETCH_RAW_EXTENDED (emch); \ | |
1551 else \ | |
1552 PATFETCH_RAW (emch); \ | |
1553 } while (0) | |
1554 | |
1555 #define PATUNFETCH_EITHER \ | |
1556 do { \ | |
1557 if (has_extended_chars) \ | |
1558 PATUNFETCH_EXTENDED (emch); \ | |
1559 else \ | |
1560 PATUNFETCH (emch); \ | |
1561 } while (0) | |
1562 | |
1563 #else /* not MULE */ | |
1564 | |
1441 #define PATFETCH_EITHER(emch) PATFETCH (emch) | 1565 #define PATFETCH_EITHER(emch) PATFETCH (emch) |
1442 #define PATFETCH_RAW_EITHER(emch) PATFETCH_RAW (emch) | 1566 #define PATFETCH_RAW_EITHER(emch) PATFETCH_RAW (emch) |
1443 #define PATUNFETCH_EITHER PATUNFETCH | 1567 #define PATUNFETCH_EITHER PATUNFETCH |
1444 | 1568 |
1569 #endif /* not MULE */ | |
1445 | 1570 |
1446 /* If `translate' is non-null, return translate[D], else just D. We | 1571 /* If `translate' is non-null, return translate[D], else just D. We |
1447 cast the subscript to translate because some data is declared as | 1572 cast the subscript to translate because some data is declared as |
1448 `char *', to avoid warnings when a string constant is passed. But | 1573 `char *', to avoid warnings when a string constant is passed. But |
1449 when we use a character as a subscript we must make it unsigned. */ | 1574 when we use a character as a subscript we must make it unsigned. */ |
1450 #define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d)) | 1575 #define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d)) |
1451 | 1576 |
1577 #ifdef MULE | |
1578 | |
1579 #define TRANSLATE_EXTENDED_UNSAFE(emch) \ | |
1580 (translate && emch < 0x80 ? translate[emch] : (emch)) | |
1581 | |
1582 #endif | |
1452 | 1583 |
1453 /* Macros for outputting the compiled pattern into `buffer'. */ | 1584 /* Macros for outputting the compiled pattern into `buffer'. */ |
1454 | 1585 |
1455 /* If the buffer isn't allocated when it comes in, use this. */ | 1586 /* If the buffer isn't allocated when it comes in, use this. */ |
1456 #define INIT_BUF_SIZE 32 | 1587 #define INIT_BUF_SIZE 32 |
1587 /* Set the bit for character C in a bit vector. */ | 1718 /* Set the bit for character C in a bit vector. */ |
1588 #define SET_LIST_BIT(c) \ | 1719 #define SET_LIST_BIT(c) \ |
1589 (b[((unsigned char) (c)) / BYTEWIDTH] \ | 1720 (b[((unsigned char) (c)) / BYTEWIDTH] \ |
1590 |= 1 << (((unsigned char) c) % BYTEWIDTH)) | 1721 |= 1 << (((unsigned char) c) % BYTEWIDTH)) |
1591 | 1722 |
1723 #ifdef MULE | |
1724 | |
1725 /* Set the "bit" for character C in a range table. */ | |
1726 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt) | |
1727 | |
1728 /* Set the "bit" for character c in the appropriate table. */ | |
1729 #define SET_EITHER_BIT(c) \ | |
1730 do { \ | |
1731 if (has_extended_chars) \ | |
1732 SET_RANGETAB_BIT (c); \ | |
1733 else \ | |
1734 SET_LIST_BIT (c); \ | |
1735 } while (0) | |
1736 | |
1737 #else /* not MULE */ | |
1738 | |
1592 #define SET_EITHER_BIT(c) SET_LIST_BIT (c) | 1739 #define SET_EITHER_BIT(c) SET_LIST_BIT (c) |
1593 | 1740 |
1741 #endif | |
1594 | 1742 |
1595 | 1743 |
1596 /* Get the next unsigned number in the uncompiled pattern. */ | 1744 /* Get the next unsigned number in the uncompiled pattern. */ |
1597 #define GET_UNSIGNED_NUMBER(num) \ | 1745 #define GET_UNSIGNED_NUMBER(num) \ |
1598 { if (p != pend) \ | 1746 { if (p != pend) \ |
1632 static boolean group_in_compile_stack (compile_stack_type compile_stack, | 1780 static boolean group_in_compile_stack (compile_stack_type compile_stack, |
1633 regnum_t regnum); | 1781 regnum_t regnum); |
1634 static reg_errcode_t compile_range (CONST char **p_ptr, CONST char *pend, | 1782 static reg_errcode_t compile_range (CONST char **p_ptr, CONST char *pend, |
1635 char *translate, reg_syntax_t syntax, | 1783 char *translate, reg_syntax_t syntax, |
1636 unsigned char *b); | 1784 unsigned char *b); |
1785 #ifdef MULE | |
1786 static reg_errcode_t compile_extended_range (CONST char **p_ptr, | |
1787 CONST char *pend, | |
1788 char *translate, | |
1789 reg_syntax_t syntax, | |
1790 Lisp_Object rtab); | |
1791 #endif | |
1637 static boolean group_match_null_string_p (unsigned char **p, | 1792 static boolean group_match_null_string_p (unsigned char **p, |
1638 unsigned char *end, | 1793 unsigned char *end, |
1639 register_info_type *reg_info); | 1794 register_info_type *reg_info); |
1640 static boolean alt_match_null_string_p (unsigned char *p, unsigned char *end, | 1795 static boolean alt_match_null_string_p (unsigned char *p, unsigned char *end, |
1641 register_info_type *reg_info); | 1796 register_info_type *reg_info); |
2009 | 2164 |
2010 case '[': | 2165 case '[': |
2011 { | 2166 { |
2012 /* XEmacs change: this whole section */ | 2167 /* XEmacs change: this whole section */ |
2013 boolean had_char_class = false; | 2168 boolean had_char_class = false; |
2169 #ifdef MULE | |
2170 boolean has_extended_chars = false; | |
2171 REGISTER Lisp_Object rtab = Qnil; | |
2172 #endif | |
2014 | 2173 |
2015 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | 2174 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
2016 | 2175 |
2017 /* Ensure that we have enough space to push a charset: the | 2176 /* Ensure that we have enough space to push a charset: the |
2018 opcode, the length count, and the bitset; 34 bytes in all. */ | 2177 opcode, the length count, and the bitset; 34 bytes in all. */ |
2038 /* charset_not matches newline according to a syntax bit. */ | 2197 /* charset_not matches newline according to a syntax bit. */ |
2039 if ((re_opcode_t) b[-2] == charset_not | 2198 if ((re_opcode_t) b[-2] == charset_not |
2040 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) | 2199 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) |
2041 SET_LIST_BIT ('\n'); | 2200 SET_LIST_BIT ('\n'); |
2042 | 2201 |
2202 #ifdef MULE | |
2203 start_over_with_extended: | |
2204 if (has_extended_chars) | |
2205 { | |
2206 /* There are extended chars here, which means we need to start | |
2207 over and shift to unified range-table format. */ | |
2208 if (b[-2] == charset) | |
2209 b[-2] = charset_mule; | |
2210 else | |
2211 b[-2] = charset_mule_not; | |
2212 b--; | |
2213 p = p1; /* go back to the beginning of the charset, after | |
2214 a possible ^. */ | |
2215 rtab = Vthe_lisp_rangetab; | |
2216 Fclear_range_table (rtab); | |
2217 | |
2218 /* charset_not matches newline according to a syntax bit. */ | |
2219 if ((re_opcode_t) b[-1] == charset_mule_not | |
2220 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) | |
2221 SET_EITHER_BIT ('\n'); | |
2222 } | |
2223 #endif /* MULE */ | |
2224 | |
2043 /* Read in characters and ranges, setting map bits. */ | 2225 /* Read in characters and ranges, setting map bits. */ |
2044 for (;;) | 2226 for (;;) |
2045 { | 2227 { |
2046 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); | 2228 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
2047 | 2229 |
2048 PATFETCH_EITHER (c); | 2230 PATFETCH_EITHER (c); |
2049 | 2231 |
2232 #ifdef MULE | |
2233 if (c >= 0x80 && !has_extended_chars) | |
2234 { | |
2235 has_extended_chars = 1; | |
2236 /* Frumble-bumble, we've found some extended chars. | |
2237 Need to start over, process everything using | |
2238 the general extended-char mechanism, and need | |
2239 to use charset_mule and charset_mule_not instead | |
2240 of charset and charset_not. */ | |
2241 goto start_over_with_extended; | |
2242 } | |
2243 #endif /* MULE */ | |
2050 /* \ might escape characters inside [...] and [^...]. */ | 2244 /* \ might escape characters inside [...] and [^...]. */ |
2051 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') | 2245 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') |
2052 { | 2246 { |
2053 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); | 2247 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); |
2054 | 2248 |
2055 PATFETCH_EITHER (c1); | 2249 PATFETCH_EITHER (c1); |
2250 #ifdef MULE | |
2251 if (c1 >= 0x80 && !has_extended_chars) | |
2252 { | |
2253 has_extended_chars = 1; | |
2254 goto start_over_with_extended; | |
2255 } | |
2256 #endif /* MULE */ | |
2056 SET_EITHER_BIT (c1); | 2257 SET_EITHER_BIT (c1); |
2057 continue; | 2258 continue; |
2058 } | 2259 } |
2059 | 2260 |
2060 /* Could be the end of the bracket expression. If it's | 2261 /* Could be the end of the bracket expression. If it's |
2077 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') | 2278 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') |
2078 && *p != ']') | 2279 && *p != ']') |
2079 { | 2280 { |
2080 reg_errcode_t ret; | 2281 reg_errcode_t ret; |
2081 | 2282 |
2082 ret = compile_range (&p, pend, translate, syntax, b); | 2283 #ifdef MULE |
2284 if (* (unsigned char *) p >= 0x80 && !has_extended_chars) | |
2285 { | |
2286 has_extended_chars = 1; | |
2287 goto start_over_with_extended; | |
2288 } | |
2289 if (has_extended_chars) | |
2290 ret = compile_extended_range (&p, pend, translate, | |
2291 syntax, rtab); | |
2292 else | |
2293 #endif /* MULE */ | |
2294 ret = compile_range (&p, pend, translate, syntax, b); | |
2083 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); | 2295 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); |
2084 } | 2296 } |
2085 | 2297 |
2086 else if (p[0] == '-' && p[1] != ']') | 2298 else if (p[0] == '-' && p[1] != ']') |
2087 { /* This handles ranges made up of characters only. */ | 2299 { /* This handles ranges made up of characters only. */ |
2088 reg_errcode_t ret; | 2300 reg_errcode_t ret; |
2089 | 2301 |
2090 /* Move past the `-'. */ | 2302 /* Move past the `-'. */ |
2091 PATFETCH (c1); | 2303 PATFETCH (c1); |
2092 | 2304 |
2093 ret = compile_range (&p, pend, translate, syntax, b); | 2305 #ifdef MULE |
2306 if (* (unsigned char *) p >= 0x80 && !has_extended_chars) | |
2307 { | |
2308 has_extended_chars = 1; | |
2309 goto start_over_with_extended; | |
2310 } | |
2311 if (has_extended_chars) | |
2312 ret = compile_extended_range (&p, pend, translate, | |
2313 syntax, rtab); | |
2314 else | |
2315 #endif /* MULE */ | |
2316 ret = compile_range (&p, pend, translate, syntax, b); | |
2094 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); | 2317 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); |
2095 } | 2318 } |
2096 | 2319 |
2097 /* See if we're at the beginning of a possible character | 2320 /* See if we're at the beginning of a possible character |
2098 class. */ | 2321 class. */ |
2188 had_char_class = false; | 2411 had_char_class = false; |
2189 SET_EITHER_BIT (c); | 2412 SET_EITHER_BIT (c); |
2190 } | 2413 } |
2191 } | 2414 } |
2192 | 2415 |
2416 #ifdef MULE | |
2417 if (has_extended_chars) | |
2418 { | |
2419 /* We have a range table, not a bit vector. */ | |
2420 int bytes_needed = | |
2421 unified_range_table_bytes_needed (rtab); | |
2422 GET_BUFFER_SPACE (bytes_needed); | |
2423 unified_range_table_copy_data (rtab, b); | |
2424 b += unified_range_table_bytes_used (b); | |
2425 break; | |
2426 } | |
2427 #endif /* MULE */ | |
2193 /* Discard any (non)matching list bytes that are all 0 at the | 2428 /* Discard any (non)matching list bytes that are all 0 at the |
2194 end of the map. Decrease the map-length byte too. */ | 2429 end of the map. Decrease the map-length byte too. */ |
2195 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) | 2430 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) |
2196 b[-1]--; | 2431 b[-1]--; |
2197 b += b[-1]; | 2432 b += b[-1]; |
2958 SET_LIST_BIT (TRANSLATE (this_char)); | 3193 SET_LIST_BIT (TRANSLATE (this_char)); |
2959 } | 3194 } |
2960 | 3195 |
2961 return REG_NOERROR; | 3196 return REG_NOERROR; |
2962 } | 3197 } |
3198 | |
3199 #ifdef MULE | |
3200 | |
3201 static reg_errcode_t | |
3202 compile_extended_range (CONST char **p_ptr, CONST char *pend, char *translate, | |
3203 reg_syntax_t syntax, Lisp_Object rtab) | |
3204 { | |
3205 Emchar this_char; | |
3206 | |
3207 CONST char *p = *p_ptr; | |
3208 EMACS_INT range_start, range_end; | |
3209 | |
3210 if (p == pend) | |
3211 return REG_ERANGE; | |
3212 | |
3213 p--; /* back to '-' */ | |
3214 DEC_CHARPTR (p); /* back to start of range */ | |
3215 /* We also want to fetch the endpoints without translating them; the | |
3216 appropriate translation is done in the bit-setting loop below. */ | |
3217 range_start = charptr_emchar ((CONST Bufbyte *) p); | |
3218 range_end = charptr_emchar ((CONST Bufbyte *) (*p_ptr)); | |
3219 INC_CHARPTR (*p_ptr); | |
3220 | |
3221 /* If the start is after the end, the range is empty. */ | |
3222 if (range_start > range_end) | |
3223 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; | |
3224 | |
3225 /* Can't have ranges spanning different charsets, except maybe for | |
3226 ranges entirely within the first 256 chars. */ | |
3227 | |
3228 if ((range_start >= 0x100 || range_end >= 0x100) | |
3229 && CHAR_LEADING_BYTE (range_start) != | |
3230 CHAR_LEADING_BYTE (range_end)) | |
3231 return REG_ERANGESPAN; | |
3232 | |
3233 /* As advertised, translations only work over the 0 - 0x7F range. | |
3234 Making this kind of stuff work generally is much harder. | |
3235 Iterating over the whole range like this would be way inefficient | |
3236 if the range encompasses 10,000 chars or something. You'd have | |
3237 to do something like this: | |
3238 | |
3239 range_table a; | |
3240 range_table b; | |
3241 map over translation table in [range_start, range_end] of | |
3242 (put the mapped range in a; | |
3243 put the translation in b) | |
3244 invert the range in a and truncate to [range_start, range_end] | |
3245 compute the union of a, b | |
3246 union the result into rtab | |
3247 */ | |
3248 for (this_char = range_start; | |
3249 this_char <= range_end && this_char < 0x80; this_char++) | |
3250 { | |
3251 SET_RANGETAB_BIT (TRANSLATE (this_char)); | |
3252 } | |
3253 | |
3254 if (this_char <= range_end) | |
3255 put_range_table (rtab, this_char, range_end, Qt); | |
3256 | |
3257 return REG_NOERROR; | |
3258 } | |
3259 | |
3260 #endif /* MULE */ | |
2963 | 3261 |
2964 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in | 3262 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in |
2965 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible | 3263 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible |
2966 characters can start a string that matches the pattern. This fastmap | 3264 characters can start a string that matches the pattern. This fastmap |
2967 is used by re_search to skip quickly over impossible starting points. | 3265 is used by re_search to skip quickly over impossible starting points. |
3056 fastmap[p[1]] = 1; | 3354 fastmap[p[1]] = 1; |
3057 break; | 3355 break; |
3058 | 3356 |
3059 | 3357 |
3060 case charset: | 3358 case charset: |
3359 /* XEmacs: Under Mule, these bit vectors will | |
3360 only contain values for characters below 0x80. */ | |
3061 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) | 3361 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) |
3062 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) | 3362 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) |
3063 fastmap[j] = 1; | 3363 fastmap[j] = 1; |
3064 break; | 3364 break; |
3065 | 3365 |
3066 | 3366 |
3067 case charset_not: | 3367 case charset_not: |
3068 /* Chars beyond end of map must be allowed. */ | 3368 /* Chars beyond end of map must be allowed. */ |
3369 #ifdef MULE | |
3370 for (j = *p * BYTEWIDTH; j < 0x80; j++) | |
3371 fastmap[j] = 1; | |
3372 /* And all extended characters must be allowed, too. */ | |
3373 for (j = 0x80; j < 0xA0; j++) | |
3374 fastmap[j] = 1; | |
3375 #else | |
3069 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) | 3376 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) |
3070 fastmap[j] = 1; | 3377 fastmap[j] = 1; |
3378 #endif | |
3071 | 3379 |
3072 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) | 3380 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) |
3073 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) | 3381 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) |
3074 fastmap[j] = 1; | 3382 fastmap[j] = 1; |
3075 break; | 3383 break; |
3384 | |
3385 #ifdef MULE | |
3386 case charset_mule: | |
3387 { | |
3388 int nentries; | |
3389 int i; | |
3390 | |
3391 nentries = unified_range_table_nentries (p); | |
3392 for (i = 0; i < nentries; i++) | |
3393 { | |
3394 EMACS_INT first, last; | |
3395 Lisp_Object dummy_val; | |
3396 int jj; | |
3397 Bufbyte strr[MAX_EMCHAR_LEN]; | |
3398 | |
3399 unified_range_table_get_range (p, i, &first, &last, | |
3400 &dummy_val); | |
3401 for (jj = first; jj <= last && jj < 0x80; jj++) | |
3402 fastmap[jj] = 1; | |
3403 /* Ranges below 0x100 can span charsets, but there | |
3404 are only two (Control-1 and Latin-1), and | |
3405 either first or last has to be in them. */ | |
3406 set_charptr_emchar (strr, first); | |
3407 fastmap[*strr] = 1; | |
3408 if (last < 0x100) | |
3409 { | |
3410 set_charptr_emchar (strr, last); | |
3411 fastmap[*strr] = 1; | |
3412 } | |
3413 } | |
3414 } | |
3415 break; | |
3416 | |
3417 case charset_mule_not: | |
3418 { | |
3419 int nentries; | |
3420 int i; | |
3421 | |
3422 nentries = unified_range_table_nentries (p); | |
3423 for (i = 0; i < nentries; i++) | |
3424 { | |
3425 EMACS_INT first, last; | |
3426 Lisp_Object dummy_val; | |
3427 int jj; | |
3428 int smallest_prev = 0; | |
3429 | |
3430 unified_range_table_get_range (p, i, &first, &last, | |
3431 &dummy_val); | |
3432 for (jj = smallest_prev; jj < first && jj < 0x80; jj++) | |
3433 fastmap[jj] = 1; | |
3434 smallest_prev = last + 1; | |
3435 if (smallest_prev >= 0x80) | |
3436 break; | |
3437 } | |
3438 /* Calculating which leading bytes are actually allowed | |
3439 here is rather difficult, so we just punt and allow | |
3440 all of them. */ | |
3441 for (i = 0x80; i < 0xA0; i++) | |
3442 fastmap[i] = 1; | |
3443 } | |
3444 break; | |
3445 #endif /* MULE */ | |
3076 | 3446 |
3077 | 3447 |
3078 case wordchar: | 3448 case wordchar: |
3079 #ifdef emacs | 3449 #ifdef emacs |
3080 k = (int) Sword; | 3450 k = (int) Sword; |
3081 goto matchsyntax; | 3451 goto matchsyntax; |
3082 #else | 3452 #else |
3083 for (j = 0; j < (1 << BYTEWIDTH); j++) | 3453 for (j = 0; j < (1 << BYTEWIDTH); j++) |
3084 if (SYNTAX_UNSAFE (regex_emacs_buffer->syntax_table, j) == Sword) | 3454 if (SYNTAX_UNSAFE |
3455 (XCHAR_TABLE | |
3456 (regex_emacs_buffer->mirror_syntax_table), j) == Sword) | |
3085 fastmap[j] = 1; | 3457 fastmap[j] = 1; |
3086 break; | 3458 break; |
3087 #endif | 3459 #endif |
3088 | 3460 |
3089 | 3461 |
3091 #ifdef emacs | 3463 #ifdef emacs |
3092 k = (int) Sword; | 3464 k = (int) Sword; |
3093 goto matchnotsyntax; | 3465 goto matchnotsyntax; |
3094 #else | 3466 #else |
3095 for (j = 0; j < (1 << BYTEWIDTH); j++) | 3467 for (j = 0; j < (1 << BYTEWIDTH); j++) |
3096 if (SYNTAX_UNSAFE (regex_emacs_buffer->syntax_table, j) != Sword) | 3468 if (SYNTAX_UNSAFE |
3469 (XCHAR_TABLE | |
3470 (regex_emacs_buffer->mirror_syntax_table), j) != Sword) | |
3097 fastmap[j] = 1; | 3471 fastmap[j] = 1; |
3098 break; | 3472 break; |
3099 #endif | 3473 #endif |
3100 | 3474 |
3101 | 3475 |
3102 case anychar: | 3476 case anychar: |
3103 { | 3477 { |
3104 int fastmap_newline = fastmap['\n']; | 3478 int fastmap_newline = fastmap['\n']; |
3105 | 3479 |
3106 /* `.' matches anything ... */ | 3480 /* `.' matches anything ... */ |
3481 #ifdef MULE | |
3482 /* "anything" only includes bytes that can be the | |
3483 first byte of a character. */ | |
3484 for (j = 0; j < 0xA0; j++) | |
3485 fastmap[j] = 1; | |
3486 #else | |
3107 for (j = 0; j < (1 << BYTEWIDTH); j++) | 3487 for (j = 0; j < (1 << BYTEWIDTH); j++) |
3108 fastmap[j] = 1; | 3488 fastmap[j] = 1; |
3489 #endif | |
3109 | 3490 |
3110 /* ... except perhaps newline. */ | 3491 /* ... except perhaps newline. */ |
3111 if (!(bufp->syntax & RE_DOT_NEWLINE)) | 3492 if (!(bufp->syntax & RE_DOT_NEWLINE)) |
3112 fastmap['\n'] = fastmap_newline; | 3493 fastmap['\n'] = fastmap_newline; |
3113 | 3494 |
3122 | 3503 |
3123 #ifdef emacs | 3504 #ifdef emacs |
3124 case syntaxspec: | 3505 case syntaxspec: |
3125 k = *p++; | 3506 k = *p++; |
3126 matchsyntax: | 3507 matchsyntax: |
3127 for (j = 0; j < (1 << BYTEWIDTH); j++) | 3508 #ifdef MULE |
3128 if (SYNTAX_UNSAFE (regex_emacs_buffer->syntax_table, j) == | 3509 for (j = 0; j < 0x80; j++) |
3510 if (SYNTAX_UNSAFE | |
3511 (XCHAR_TABLE | |
3512 (regex_emacs_buffer->mirror_syntax_table), j) == | |
3129 (enum syntaxcode) k) | 3513 (enum syntaxcode) k) |
3130 fastmap[j] = 1; | 3514 fastmap[j] = 1; |
3515 for (j = 0x80; j < 0xA0; j++) | |
3516 { | |
3517 if (j == PRE_LEADING_BYTE_PRIVATE_1 | |
3518 || j == PRE_LEADING_BYTE_PRIVATE_2) | |
3519 /* too complicated to calculate this right */ | |
3520 fastmap[j] = 1; | |
3521 else | |
3522 { | |
3523 int multi_p; | |
3524 Lisp_Object cset; | |
3525 | |
3526 cset = CHARSET_BY_LEADING_BYTE (j); | |
3527 if (CHARSETP (cset)) | |
3528 { | |
3529 if (charset_syntax (regex_emacs_buffer, cset, | |
3530 &multi_p) | |
3531 == Sword || multi_p) | |
3532 fastmap[j] = 1; | |
3533 } | |
3534 } | |
3535 } | |
3536 #else /* ! MULE */ | |
3537 for (j = 0; j < (1 << BYTEWIDTH); j++) | |
3538 if (SYNTAX_UNSAFE | |
3539 (XCHAR_TABLE | |
3540 (regex_emacs_buffer->mirror_syntax_table), j) == | |
3541 (enum syntaxcode) k) | |
3542 fastmap[j] = 1; | |
3543 #endif /* ! MULE */ | |
3131 break; | 3544 break; |
3132 | 3545 |
3133 | 3546 |
3134 case notsyntaxspec: | 3547 case notsyntaxspec: |
3135 k = *p++; | 3548 k = *p++; |
3136 matchnotsyntax: | 3549 matchnotsyntax: |
3137 for (j = 0; j < (1 << BYTEWIDTH); j++) | 3550 #ifdef MULE |
3138 if (SYNTAX_UNSAFE (regex_emacs_buffer->syntax_table, j) != | 3551 for (j = 0; j < 0x80; j++) |
3552 if (SYNTAX_UNSAFE | |
3553 (XCHAR_TABLE | |
3554 (regex_emacs_buffer->mirror_syntax_table), j) != | |
3139 (enum syntaxcode) k) | 3555 (enum syntaxcode) k) |
3140 fastmap[j] = 1; | 3556 fastmap[j] = 1; |
3557 for (j = 0x80; j < 0xA0; j++) | |
3558 { | |
3559 if (j == PRE_LEADING_BYTE_PRIVATE_1 | |
3560 || j == PRE_LEADING_BYTE_PRIVATE_2) | |
3561 /* too complicated to calculate this right */ | |
3562 fastmap[j] = 1; | |
3563 else | |
3564 { | |
3565 int multi_p; | |
3566 Lisp_Object cset; | |
3567 | |
3568 cset = CHARSET_BY_LEADING_BYTE (j); | |
3569 if (CHARSETP (cset)) | |
3570 { | |
3571 if (charset_syntax (regex_emacs_buffer, cset, | |
3572 &multi_p) | |
3573 != Sword || multi_p) | |
3574 fastmap[j] = 1; | |
3575 } | |
3576 } | |
3577 } | |
3578 #else /* ! MULE */ | |
3579 for (j = 0; j < (1 << BYTEWIDTH); j++) | |
3580 if (SYNTAX_UNSAFE | |
3581 (XCHAR_TABLE | |
3582 (regex_emacs_buffer->mirror_syntax_table), j) != | |
3583 (enum syntaxcode) k) | |
3584 fastmap[j] = 1; | |
3585 #endif /* ! MULE */ | |
3141 break; | 3586 break; |
3142 | 3587 |
3143 | 3588 |
3144 /* All cases after this match the empty string. These end with | 3589 /* All cases after this match the empty string. These end with |
3145 `continue'. */ | 3590 `continue'. */ |
3557 #define POS_BEFORE_GAP_UNSAFE(d) ((d) == string2 ? end1 : (d)) | 4002 #define POS_BEFORE_GAP_UNSAFE(d) ((d) == string2 ? end1 : (d)) |
3558 #define POS_AFTER_GAP_UNSAFE(d) ((d) == end1 ? string2 : (d)) | 4003 #define POS_AFTER_GAP_UNSAFE(d) ((d) == end1 ? string2 : (d)) |
3559 | 4004 |
3560 /* Test if CH is a word-constituent character. (XEmacs change) */ | 4005 /* Test if CH is a word-constituent character. (XEmacs change) */ |
3561 #define WORDCHAR_P_UNSAFE(ch) \ | 4006 #define WORDCHAR_P_UNSAFE(ch) \ |
3562 (SYNTAX_UNSAFE (regex_emacs_buffer->syntax_table, ch) == Sword) | 4007 (SYNTAX_UNSAFE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), \ |
4008 ch) == Sword) | |
3563 | 4009 |
3564 /* Free everything we malloc. */ | 4010 /* Free everything we malloc. */ |
3565 #ifdef MATCH_MAY_ALLOCATE | 4011 #ifdef MATCH_MAY_ALLOCATE |
3566 #define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL | 4012 #define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL |
3567 #define FREE_VARIABLES() \ | 4013 #define FREE_VARIABLES() \ |
4118 p += 1 + *p; | 4564 p += 1 + *p; |
4119 | 4565 |
4120 if (!not) goto fail; | 4566 if (!not) goto fail; |
4121 | 4567 |
4122 SET_REGS_MATCHED (); | 4568 SET_REGS_MATCHED (); |
4123 d++; | 4569 INC_CHARPTR (d); /* XEmacs change */ |
4124 break; | 4570 break; |
4125 } | 4571 } |
4572 | |
4573 #ifdef MULE | |
4574 case charset_mule: | |
4575 case charset_mule_not: | |
4576 { | |
4577 register Emchar c; | |
4578 boolean not = (re_opcode_t) *(p - 1) == charset_mule_not; | |
4579 | |
4580 DEBUG_PRINT2 ("EXECUTING charset_mule%s.\n", not ? "_not" : ""); | |
4581 | |
4582 PREFETCH (); | |
4583 c = charptr_emchar ((CONST Bufbyte *) d); | |
4584 c = TRANSLATE_EXTENDED_UNSAFE (c); /* The character to match. */ | |
4585 | |
4586 if (EQ (Qt, unified_range_table_lookup (p, c, Qnil))) | |
4587 not = !not; | |
4588 | |
4589 p += unified_range_table_bytes_used (p); | |
4590 | |
4591 if (!not) goto fail; | |
4592 | |
4593 SET_REGS_MATCHED (); | |
4594 INC_CHARPTR (d); | |
4595 break; | |
4596 } | |
4597 #endif | |
4126 | 4598 |
4127 | 4599 |
4128 /* The beginning of a group is represented by start_memory. | 4600 /* The beginning of a group is represented by start_memory. |
4129 The arguments are the register number in the next byte, and the | 4601 The arguments are the register number in the next byte, and the |
4130 number of groups inner to this one in the next. The text | 4602 number of groups inner to this one in the next. The text |
4598 #ifdef DEBUG | 5070 #ifdef DEBUG |
4599 register unsigned char c | 5071 register unsigned char c |
4600 = *p2 == (unsigned char) endline ? '\n' : p2[2]; | 5072 = *p2 == (unsigned char) endline ? '\n' : p2[2]; |
4601 #endif | 5073 #endif |
4602 | 5074 |
4603 #if 1 | |
4604 /* dmoore@ucsd.edu - emacs 19.34 uses this: */ | |
4605 | |
4606 if ((re_opcode_t) p1[3] == exactn | 5075 if ((re_opcode_t) p1[3] == exactn |
4607 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5] | 5076 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[4] |
4608 && (p2[2 + p1[5] / BYTEWIDTH] | 5077 && (p2[1 + p1[4] / BYTEWIDTH] |
4609 & (1 << (p1[5] % BYTEWIDTH))))) | 5078 & (1 << (p1[4] % BYTEWIDTH))))) |
4610 #else | |
4611 if ((re_opcode_t) p1[3] == exactn | |
4612 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[4] | |
4613 && (p2[1 + p1[4] / BYTEWIDTH] | |
4614 & (1 << (p1[4] % BYTEWIDTH))))) | |
4615 #endif | |
4616 { | 5079 { |
4617 p[-3] = (unsigned char) pop_failure_jump; | 5080 p[-3] = (unsigned char) pop_failure_jump; |
4618 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", | 5081 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", |
4619 c, p1[5]); | 5082 c, p1[5]); |
4620 } | 5083 } |
4918 int matches; | 5381 int matches; |
4919 Emchar emch; | 5382 Emchar emch; |
4920 | 5383 |
4921 PREFETCH (); | 5384 PREFETCH (); |
4922 emch = charptr_emchar ((CONST Bufbyte *) d); | 5385 emch = charptr_emchar ((CONST Bufbyte *) d); |
4923 matches = (SYNTAX_UNSAFE (regex_emacs_buffer->syntax_table, | 5386 matches = (SYNTAX_UNSAFE |
4924 emch) == (enum syntaxcode) mcnt); | 5387 (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), |
5388 emch) == (enum syntaxcode) mcnt); | |
4925 INC_CHARPTR (d); | 5389 INC_CHARPTR (d); |
4926 if (matches != should_succeed) | 5390 if (matches != should_succeed) |
4927 goto fail; | 5391 goto fail; |
4928 SET_REGS_MATCHED (); | 5392 SET_REGS_MATCHED (); |
4929 } | 5393 } |