Mercurial > hg > xemacs-beta
diff src/regex.c @ 70:131b0175ea99 r20-0b30
Import from CVS: tag r20-0b30
author | cvs |
---|---|
date | Mon, 13 Aug 2007 09:02:59 +0200 |
parents | 441bb1e64a06 |
children | 6a378aca36af |
line wrap: on
line diff
--- a/src/regex.c Mon Aug 13 09:00:04 2007 +0200 +++ b/src/regex.c Mon Aug 13 09:02:59 2007 +0200 @@ -31,6 +31,8 @@ (2) Rel-alloc is disabled when the MMAP version of rel-alloc is being used, because it's too slow -- all those calls to mmap() add humongous overhead. + (3) Lots and lots of changes for Mule. They are bracketed by + `#ifdef MULE' or with comments that have `XEmacs' in them. */ /* AIX requires this to be the first thing in the file. */ @@ -44,6 +46,11 @@ #include <config.h> #endif +/* We assume non-Mule if emacs isn't defined. */ +#ifndef emacs +#undef MULE +#endif + /* We need this for `regex.h', and perhaps for the Emacs include files. */ #include <sys/types.h> @@ -78,11 +85,26 @@ #define DEBUG #endif +#ifdef MULE + +Lisp_Object Vthe_lisp_rangetab; + +void +complex_vars_of_regex (void) +{ + Vthe_lisp_rangetab = Fmake_range_table (); + staticpro (&Vthe_lisp_rangetab); +} + +#else /* not MULE */ + void complex_vars_of_regex (void) { } +#endif /* not MULE */ + #else /* not emacs */ /* If we are not linking with Emacs proper, @@ -222,7 +244,14 @@ #define ISASCII_1(c) isascii(c) #endif +#ifdef MULE +/* The IS*() macros can be passed any character, including an extended + one. We need to make sure there are no crashes, which would occur + otherwise due to out-of-bounds array references. */ +#define ISASCII(c) (((unsigned EMACS_INT) (c)) < 0x100 && ISASCII_1 (c)) +#else #define ISASCII(c) ISASCII_1 (c) +#endif #ifdef isblank #define ISBLANK(c) (ISASCII (c) && isblank (c)) @@ -506,6 +535,20 @@ /* Matches any character whose syntax is not that specified. */ notsyntaxspec #endif /* emacs */ + +#ifdef MULE + /* need extra stuff to be able to properly work with XEmacs/Mule + characters (which may take up more than one byte) */ + + ,charset_mule, /* Matches any character belonging to specified set. + The set is stored in "unified range-table + format"; see rangetab.c. Unlike the `charset' + opcode, this can handle arbitrary characters. */ + + charset_mule_not /* Same parameters as charset_mule, but match any + character that is not one of those specified. */ +#endif + } re_opcode_t; /* Common operations on the compiled pattern. */ @@ -737,6 +780,41 @@ } break; +#ifdef MULE + case charset_mule: + case charset_mule_not: + { + int nentries, i; + + printf ("/charset_mule [%s", + (re_opcode_t) *(p - 1) == charset_mule_not ? "^" : ""); + nentries = unified_range_table_nentries (p); + for (i = 0; i < nentries; i++) + { + EMACS_INT first, last; + Lisp_Object dummy_val; + + unified_range_table_get_range (p, i, &first, &last, + &dummy_val); + if (first < 0x100) + putchar (first); + else + printf ("(0x%x)", first); + if (first != last) + { + putchar ('-'); + if (last < 0x100) + putchar (last); + else + printf ("(0x%x)", last); + } + } + putchar (']'); + p += unified_range_table_bytes_used (p); + } + break; +#endif + case begline: printf ("/begline"); break; @@ -985,6 +1063,9 @@ #ifdef emacs "Invalid syntax designator", /* REG_ESYNTAX */ #endif +#ifdef MULE + "Ranges may not span charsets", /* REG_ERANGESPAN */ +#endif }; /* Avoiding alloca during matching, to placate r_alloc. */ @@ -1044,7 +1125,7 @@ #if defined (MATCH_MAY_ALLOCATE) /* 4400 was enough to cause a crash on Alpha OSF/1, whose default stack limit is 2mb. */ -int re_max_failures = 20000; +int re_max_failures = 4000; #else int re_max_failures = 2000; #endif @@ -1267,10 +1348,7 @@ #endif /* We push at most this many items on the stack. */ -/* We used to use (num_regs - 1), which is the number of registers - this regexp will save; but that was changed to 5 - to avoid stack overflow for a regexp with lots of parens. */ -#define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS) +#define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS) /* We actually push this many items. */ #define NUM_FAILURE_ITEMS \ @@ -1438,10 +1516,57 @@ /* Go backwards one character in the pattern. */ #define PATUNFETCH p-- +#ifdef MULE + +#define PATFETCH_EXTENDED(emch) \ + do {if (p == pend) return REG_EEND; \ + assert (p < pend); \ + emch = charptr_emchar ((CONST Bufbyte *) p); \ + INC_CHARPTR (p); \ + if (translate && emch < 0x80) \ + emch = (Emchar) (unsigned char) translate[emch]; \ + } while (0) + +#define PATFETCH_RAW_EXTENDED(emch) \ + do {if (p == pend) return REG_EEND; \ + assert (p < pend); \ + emch = charptr_emchar ((CONST Bufbyte *) p); \ + INC_CHARPTR (p); \ + } while (0) + +#define PATUNFETCH_EXTENDED DEC_CHARPTR (p) + +#define PATFETCH_EITHER(emch) \ + do { \ + if (has_extended_chars) \ + PATFETCH_EXTENDED (emch); \ + else \ + PATFETCH (emch); \ + } while (0) + +#define PATFETCH_RAW_EITHER(emch) \ + do { \ + if (has_extended_chars) \ + PATFETCH_RAW_EXTENDED (emch); \ + else \ + PATFETCH_RAW (emch); \ + } while (0) + +#define PATUNFETCH_EITHER \ + do { \ + if (has_extended_chars) \ + PATUNFETCH_EXTENDED (emch); \ + else \ + PATUNFETCH (emch); \ + } while (0) + +#else /* not MULE */ + #define PATFETCH_EITHER(emch) PATFETCH (emch) #define PATFETCH_RAW_EITHER(emch) PATFETCH_RAW (emch) #define PATUNFETCH_EITHER PATUNFETCH +#endif /* not MULE */ /* If `translate' is non-null, return translate[D], else just D. We cast the subscript to translate because some data is declared as @@ -1449,6 +1574,12 @@ when we use a character as a subscript we must make it unsigned. */ #define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d)) +#ifdef MULE + +#define TRANSLATE_EXTENDED_UNSAFE(emch) \ + (translate && emch < 0x80 ? translate[emch] : (emch)) + +#endif /* Macros for outputting the compiled pattern into `buffer'. */ @@ -1589,8 +1720,25 @@ (b[((unsigned char) (c)) / BYTEWIDTH] \ |= 1 << (((unsigned char) c) % BYTEWIDTH)) +#ifdef MULE + +/* Set the "bit" for character C in a range table. */ +#define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt) + +/* Set the "bit" for character c in the appropriate table. */ +#define SET_EITHER_BIT(c) \ + do { \ + if (has_extended_chars) \ + SET_RANGETAB_BIT (c); \ + else \ + SET_LIST_BIT (c); \ + } while (0) + +#else /* not MULE */ + #define SET_EITHER_BIT(c) SET_LIST_BIT (c) +#endif /* Get the next unsigned number in the uncompiled pattern. */ @@ -1634,6 +1782,13 @@ static reg_errcode_t compile_range (CONST char **p_ptr, CONST char *pend, char *translate, reg_syntax_t syntax, unsigned char *b); +#ifdef MULE +static reg_errcode_t compile_extended_range (CONST char **p_ptr, + CONST char *pend, + char *translate, + reg_syntax_t syntax, + Lisp_Object rtab); +#endif static boolean group_match_null_string_p (unsigned char **p, unsigned char *end, register_info_type *reg_info); @@ -2011,6 +2166,10 @@ { /* XEmacs change: this whole section */ boolean had_char_class = false; +#ifdef MULE + boolean has_extended_chars = false; + REGISTER Lisp_Object rtab = Qnil; +#endif if (p == pend) FREE_STACK_RETURN (REG_EBRACK); @@ -2040,6 +2199,29 @@ && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) SET_LIST_BIT ('\n'); +#ifdef MULE + start_over_with_extended: + if (has_extended_chars) + { + /* There are extended chars here, which means we need to start + over and shift to unified range-table format. */ + if (b[-2] == charset) + b[-2] = charset_mule; + else + b[-2] = charset_mule_not; + b--; + p = p1; /* go back to the beginning of the charset, after + a possible ^. */ + rtab = Vthe_lisp_rangetab; + Fclear_range_table (rtab); + + /* charset_not matches newline according to a syntax bit. */ + if ((re_opcode_t) b[-1] == charset_mule_not + && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) + SET_EITHER_BIT ('\n'); + } +#endif /* MULE */ + /* Read in characters and ranges, setting map bits. */ for (;;) { @@ -2047,12 +2229,31 @@ PATFETCH_EITHER (c); +#ifdef MULE + if (c >= 0x80 && !has_extended_chars) + { + has_extended_chars = 1; + /* Frumble-bumble, we've found some extended chars. + Need to start over, process everything using + the general extended-char mechanism, and need + to use charset_mule and charset_mule_not instead + of charset and charset_not. */ + goto start_over_with_extended; + } +#endif /* MULE */ /* \ might escape characters inside [...] and [^...]. */ if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') { if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); PATFETCH_EITHER (c1); +#ifdef MULE + if (c1 >= 0x80 && !has_extended_chars) + { + has_extended_chars = 1; + goto start_over_with_extended; + } +#endif /* MULE */ SET_EITHER_BIT (c1); continue; } @@ -2079,7 +2280,18 @@ { reg_errcode_t ret; - ret = compile_range (&p, pend, translate, syntax, b); +#ifdef MULE + if (* (unsigned char *) p >= 0x80 && !has_extended_chars) + { + has_extended_chars = 1; + goto start_over_with_extended; + } + if (has_extended_chars) + ret = compile_extended_range (&p, pend, translate, + syntax, rtab); + else +#endif /* MULE */ + ret = compile_range (&p, pend, translate, syntax, b); if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); } @@ -2090,7 +2302,18 @@ /* Move past the `-'. */ PATFETCH (c1); - ret = compile_range (&p, pend, translate, syntax, b); +#ifdef MULE + if (* (unsigned char *) p >= 0x80 && !has_extended_chars) + { + has_extended_chars = 1; + goto start_over_with_extended; + } + if (has_extended_chars) + ret = compile_extended_range (&p, pend, translate, + syntax, rtab); + else +#endif /* MULE */ + ret = compile_range (&p, pend, translate, syntax, b); if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); } @@ -2190,6 +2413,18 @@ } } +#ifdef MULE + if (has_extended_chars) + { + /* We have a range table, not a bit vector. */ + int bytes_needed = + unified_range_table_bytes_needed (rtab); + GET_BUFFER_SPACE (bytes_needed); + unified_range_table_copy_data (rtab, b); + b += unified_range_table_bytes_used (b); + break; + } +#endif /* MULE */ /* Discard any (non)matching list bytes that are all 0 at the end of the map. Decrease the map-length byte too. */ while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) @@ -2960,6 +3195,69 @@ return REG_NOERROR; } + +#ifdef MULE + +static reg_errcode_t +compile_extended_range (CONST char **p_ptr, CONST char *pend, char *translate, + reg_syntax_t syntax, Lisp_Object rtab) +{ + Emchar this_char; + + CONST char *p = *p_ptr; + EMACS_INT range_start, range_end; + + if (p == pend) + return REG_ERANGE; + + p--; /* back to '-' */ + DEC_CHARPTR (p); /* back to start of range */ + /* We also want to fetch the endpoints without translating them; the + appropriate translation is done in the bit-setting loop below. */ + range_start = charptr_emchar ((CONST Bufbyte *) p); + range_end = charptr_emchar ((CONST Bufbyte *) (*p_ptr)); + INC_CHARPTR (*p_ptr); + + /* If the start is after the end, the range is empty. */ + if (range_start > range_end) + return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; + + /* Can't have ranges spanning different charsets, except maybe for + ranges entirely witin the first 256 chars. */ + + if ((range_start >= 0x100 || range_end >= 0x100) + && CHAR_LEADING_BYTE (range_start) != + CHAR_LEADING_BYTE (range_end)) + return REG_ERANGESPAN; + + /* As advertised, translations only work over the 0 - 0x7F range. + Making this kind of stuff work generally is much harder. + Iterating over the whole range like this would be way efficient + if the range encompasses 10,000 chars or something. You'd have + to do something like this: + + range_table a; + range_table b; + map over translation table in [range_start, range_end] of + (put the mapped range in a; + put the translation in b) + invert the range in a and truncate to [range_start, range_end] + compute the union of a, b + union the result into rtab + */ + for (this_char = range_start; + this_char <= range_end && this_char < 0x80; this_char++) + { + SET_RANGETAB_BIT (TRANSLATE (this_char)); + } + + if (this_char <= range_end) + put_range_table (rtab, this_char, range_end, Qt); + + return REG_NOERROR; +} + +#endif /* MULE */ /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible @@ -3058,6 +3356,8 @@ case charset: + /* XEmacs: Under Mule, these bit vectors will + only contain values for characters below 0x80. */ for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) fastmap[j] = 1; @@ -3066,14 +3366,84 @@ case charset_not: /* Chars beyond end of map must be allowed. */ +#ifdef MULE + for (j = *p * BYTEWIDTH; j < 0x80; j++) + fastmap[j] = 1; + /* And all extended characters must be allowed, too. */ + for (j = 0x80; j < 0xA0; j++) + fastmap[j] = 1; +#else for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) fastmap[j] = 1; +#endif for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) fastmap[j] = 1; break; +#ifdef MULE + case charset_mule: + { + int nentries; + int i; + + nentries = unified_range_table_nentries (p); + for (i = 0; i < nentries; i++) + { + EMACS_INT first, last; + Lisp_Object dummy_val; + int jj; + Bufbyte strr[MAX_EMCHAR_LEN]; + + unified_range_table_get_range (p, i, &first, &last, + &dummy_val); + for (jj = first; jj <= last && jj < 0x80; jj++) + fastmap[jj] = 1; + /* Ranges below 0x100 can span charsets, but there + are only two (Control-1 and Latin-1), and + either first or last has to be in them. */ + set_charptr_emchar (strr, first); + fastmap[*strr] = 1; + if (last < 0x100) + { + set_charptr_emchar (strr, last); + fastmap[*strr] = 1; + } + } + } + break; + + case charset_mule_not: + { + int nentries; + int i; + + nentries = unified_range_table_nentries (p); + for (i = 0; i < nentries; i++) + { + EMACS_INT first, last; + Lisp_Object dummy_val; + int jj; + int smallest_prev = 0; + + unified_range_table_get_range (p, i, &first, &last, + &dummy_val); + for (jj = smallest_prev; jj < first && jj < 0x80; jj++) + fastmap[jj] = 1; + smallest_prev = last + 1; + if (smallest_prev >= 0x80) + break; + } + /* Calculating which leading bytes are actually allowed + here is rather difficult, so we just punt and allow + all of them. */ + for (i = 0x80; i < 0xA0; i++) + fastmap[i] = 1; + } + break; +#endif /* MULE */ + case wordchar: #ifdef emacs @@ -3081,7 +3451,9 @@ goto matchsyntax; #else for (j = 0; j < (1 << BYTEWIDTH); j++) - if (SYNTAX_UNSAFE (regex_emacs_buffer->syntax_table, j) == Sword) + if (SYNTAX_UNSAFE + (XCHAR_TABLE + (regex_emacs_buffer->mirror_syntax_table), j) == Sword) fastmap[j] = 1; break; #endif @@ -3093,7 +3465,9 @@ goto matchnotsyntax; #else for (j = 0; j < (1 << BYTEWIDTH); j++) - if (SYNTAX_UNSAFE (regex_emacs_buffer->syntax_table, j) != Sword) + if (SYNTAX_UNSAFE + (XCHAR_TABLE + (regex_emacs_buffer->mirror_syntax_table), j) != Sword) fastmap[j] = 1; break; #endif @@ -3104,8 +3478,15 @@ int fastmap_newline = fastmap['\n']; /* `.' matches anything ... */ +#ifdef MULE + /* "anything" only includes bytes that can be the + first byte of a character. */ + for (j = 0; j < 0xA0; j++) + fastmap[j] = 1; +#else for (j = 0; j < (1 << BYTEWIDTH); j++) fastmap[j] = 1; +#endif /* ... except perhaps newline. */ if (!(bufp->syntax & RE_DOT_NEWLINE)) @@ -3124,20 +3505,84 @@ case syntaxspec: k = *p++; matchsyntax: - for (j = 0; j < (1 << BYTEWIDTH); j++) - if (SYNTAX_UNSAFE (regex_emacs_buffer->syntax_table, j) == +#ifdef MULE + for (j = 0; j < 0x80; j++) + if (SYNTAX_UNSAFE + (XCHAR_TABLE + (regex_emacs_buffer->mirror_syntax_table), j) == (enum syntaxcode) k) fastmap[j] = 1; + for (j = 0x80; j < 0xA0; j++) + { + if (j == PRE_LEADING_BYTE_PRIVATE_1 + || j == PRE_LEADING_BYTE_PRIVATE_2) + /* too complicated to calculate this right */ + fastmap[j] = 1; + else + { + int multi_p; + Lisp_Object cset; + + cset = CHARSET_BY_LEADING_BYTE (j); + if (CHARSETP (cset)) + { + if (charset_syntax (regex_emacs_buffer, cset, + &multi_p) + == Sword || multi_p) + fastmap[j] = 1; + } + } + } +#else /* ! MULE */ + for (j = 0; j < (1 << BYTEWIDTH); j++) + if (SYNTAX_UNSAFE + (XCHAR_TABLE + (regex_emacs_buffer->mirror_syntax_table), j) == + (enum syntaxcode) k) + fastmap[j] = 1; +#endif /* ! MULE */ break; case notsyntaxspec: k = *p++; matchnotsyntax: - for (j = 0; j < (1 << BYTEWIDTH); j++) - if (SYNTAX_UNSAFE (regex_emacs_buffer->syntax_table, j) != +#ifdef MULE + for (j = 0; j < 0x80; j++) + if (SYNTAX_UNSAFE + (XCHAR_TABLE + (regex_emacs_buffer->mirror_syntax_table), j) != (enum syntaxcode) k) fastmap[j] = 1; + for (j = 0x80; j < 0xA0; j++) + { + if (j == PRE_LEADING_BYTE_PRIVATE_1 + || j == PRE_LEADING_BYTE_PRIVATE_2) + /* too complicated to calculate this right */ + fastmap[j] = 1; + else + { + int multi_p; + Lisp_Object cset; + + cset = CHARSET_BY_LEADING_BYTE (j); + if (CHARSETP (cset)) + { + if (charset_syntax (regex_emacs_buffer, cset, + &multi_p) + != Sword || multi_p) + fastmap[j] = 1; + } + } + } +#else /* ! MULE */ + for (j = 0; j < (1 << BYTEWIDTH); j++) + if (SYNTAX_UNSAFE + (XCHAR_TABLE + (regex_emacs_buffer->mirror_syntax_table), j) != + (enum syntaxcode) k) + fastmap[j] = 1; +#endif /* ! MULE */ break; @@ -3559,7 +4004,8 @@ /* Test if CH is a word-constituent character. (XEmacs change) */ #define WORDCHAR_P_UNSAFE(ch) \ - (SYNTAX_UNSAFE (regex_emacs_buffer->syntax_table, ch) == Sword) + (SYNTAX_UNSAFE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), \ + ch) == Sword) /* Free everything we malloc. */ #ifdef MATCH_MAY_ALLOCATE @@ -4120,10 +4566,36 @@ if (!not) goto fail; SET_REGS_MATCHED (); - d++; + INC_CHARPTR (d); /* XEmacs change */ break; } +#ifdef MULE + case charset_mule: + case charset_mule_not: + { + register Emchar c; + boolean not = (re_opcode_t) *(p - 1) == charset_mule_not; + + DEBUG_PRINT2 ("EXECUTING charset_mule%s.\n", not ? "_not" : ""); + + PREFETCH (); + c = charptr_emchar ((CONST Bufbyte *) d); + c = TRANSLATE_EXTENDED_UNSAFE (c); /* The character to match. */ + + if (EQ (Qt, unified_range_table_lookup (p, c, Qnil))) + not = !not; + + p += unified_range_table_bytes_used (p); + + if (!not) goto fail; + + SET_REGS_MATCHED (); + INC_CHARPTR (d); + break; + } +#endif + /* The beginning of a group is represented by start_memory. The arguments are the register number in the next byte, and the @@ -4600,19 +5072,10 @@ = *p2 == (unsigned char) endline ? '\n' : p2[2]; #endif -#if 1 - /* dmoore@ucsd.edu - emacs 19.34 uses this: */ - if ((re_opcode_t) p1[3] == exactn - && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5] - && (p2[2 + p1[5] / BYTEWIDTH] - & (1 << (p1[5] % BYTEWIDTH))))) -#else - if ((re_opcode_t) p1[3] == exactn - && ! ((int) p2[1] * BYTEWIDTH > (int) p1[4] - && (p2[1 + p1[4] / BYTEWIDTH] - & (1 << (p1[4] % BYTEWIDTH))))) -#endif + && ! ((int) p2[1] * BYTEWIDTH > (int) p1[4] + && (p2[1 + p1[4] / BYTEWIDTH] + & (1 << (p1[4] % BYTEWIDTH))))) { p[-3] = (unsigned char) pop_failure_jump; DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", @@ -4920,8 +5383,9 @@ PREFETCH (); emch = charptr_emchar ((CONST Bufbyte *) d); - matches = (SYNTAX_UNSAFE (regex_emacs_buffer->syntax_table, - emch) == (enum syntaxcode) mcnt); + matches = (SYNTAX_UNSAFE + (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), + emch) == (enum syntaxcode) mcnt); INC_CHARPTR (d); if (matches != should_succeed) goto fail;