Mercurial > hg > xemacs-beta
changeset 4408:8bbabcab2c42
Merge.
| author | Aidan Kehoe <kehoea@parhasard.net> |
|---|---|
| date | Sun, 20 Jan 2008 13:09:58 +0100 |
| parents | 4ee73bbe4f8e (diff) 5998e37dc35e (current diff) |
| children | 3ff01259c4a2 fd8a9a4d81d9 |
| files | lisp/gtk-iso8859-1.el lisp/x-iso8859-1.el src/ChangeLog |
| diffstat | 3 files changed, 190 insertions(+), 57 deletions(-) [+] |
line wrap: on
line diff
--- a/src/ChangeLog Sat Jan 19 14:34:19 2008 +0100 +++ b/src/ChangeLog Sun Jan 20 13:09:58 2008 +0100 @@ -1,3 +1,33 @@ +2007-12-26 Aidan Kehoe <kehoea@parhasard.net> + + * casetab.c: + Extend and correct some case table documentation. + * search.c (search_buffer): + Correct a bug where only the first entry for a character in the + case equivalence table was examined in determining if the + Boyer-Moore search algorithm is appropriate. + + If there are case mappings outside of the charset and row of the + characters specified in the search string, those case mappings can + be safely ignored (and Boyer-Moore search can be used) if we know + from the buffer statistics that the corresponding characters cannot + occur. + + * search.c (boyer_moore): + Assert that we haven't been passed a string with varying + characters sets or rows within character sets. That's what + simple_search is for. + + In the very rare event that a character in the search string has a + canonical case mapping that is not in the same character set and + row, don't try to search for the canonical character, search for + some other character that is in the the desired character set and + row. Assert that the case table isn't corrupt. + + Do not search for any character case mappings that cannot possibly + occur in the buffer, given the buffer metadata about its + contents. + 2008-01-19 Aidan Kehoe <kehoea@parhasard.net> * dired.c (Ffile_attributes): If bignums are available, use them
--- a/src/casetab.c Sat Jan 19 14:34:19 2008 +0100 +++ b/src/casetab.c Sun Jan 20 13:09:58 2008 +0100 @@ -48,13 +48,28 @@ or vice versa, both characters will have the same entry in the canon table. - (4) `equiv' lists the "equivalence classes" defined by `canon'. Imagine + (4) `eqv' lists the "equivalence classes" defined by `canon'. Imagine that all characters are divided into groups having the same `canon' - entry; these groups are called "equivalence classes" and `equiv' lists - them by linking the characters in each equivalence class together in a - circular list. + entry; these groups are called "equivalence classes" and `eqv' lists them + by linking the characters in each equivalence class together in a + circular list. That is, to find out all all the members of a given char's + equivalence classe, you need something like the following code: - `canon' is used when doing case-insensitive comparisons. `equiv' is + (let* ((char ?i) + (original-char char) + (standard-case-eqv (case-table-eqv (standard-case-table)))) + (loop + with res = (list char) + until (eq (setq char (get-char-table char standard-case-eqv)) + original-char) + do (push char res) + finally return res)) + + (Where #'case-table-eqv doesn't yet exist, and probably never will, given + that the C code needs to keep it in a consistent state so Lisp can't mess + around with it.) + + `canon' is used when doing case-insensitive comparisons. `eqv' is used in the Boyer-Moore search code. */
--- a/src/search.c Sat Jan 19 14:34:19 2008 +0100 +++ b/src/search.c Sun Jan 20 13:09:58 2008 +0100 @@ -1340,11 +1340,14 @@ { int charset_base = -1; int boyer_moore_ok = 1; - Ibyte *pat = 0; Ibyte *patbuf = alloca_ibytes (len * MAX_ICHAR_LEN); - pat = patbuf; + Ibyte *pat = patbuf; + #ifdef MULE - /* &&#### needs some 8-bit work here */ + int entirely_one_byte_p = buf->text->entirely_one_byte_p; + int nothing_greater_than_0xff = + buf->text->num_8_bit_fixed_chars == BUF_Z(buf) - BUF_BEG (buf); + while (len > 0) { Ibyte tmp_str[MAX_ICHAR_LEN]; @@ -1367,23 +1370,68 @@ inv_bytelen = set_itext_ichar (tmp_str, inverse); new_bytelen = set_itext_ichar (tmp_str, translated); - if (new_bytelen != orig_bytelen || inv_bytelen != orig_bytelen) - boyer_moore_ok = 0; - if (translated != c || inverse != c) - { - /* Keep track of which charset and character set row - contains the characters that need translation. - Zero out the bits corresponding to the last byte. - */ - int charset_base_code = c & ~ICHAR_FIELD3_MASK; - if (charset_base == -1) - charset_base = charset_base_code; - else if (charset_base != charset_base_code) - /* If two different rows appear, needing translation, then - we cannot use boyer_moore search. See the comment at the - head of boyer_moore(). */ - boyer_moore_ok = 0; - } + if (-1 == charset_base) + { + /* Keep track of which charset and character set row + contains the characters that need translation. + + Zero out the bits corresponding to the last byte. */ + charset_base = c & ~ICHAR_FIELD3_MASK; + } + + if (boyer_moore_ok && (translated != c || inverse != c)) + { + Ichar starting_c = c; + int charset_base_code; + + do + { + c = TRANSLATE (inverse_trt, c); + + /* If a character cannot occur in the buffer, ignore + it. */ + if (c > 0x7F && entirely_one_byte_p) + continue; + + if (c > 0xFF && nothing_greater_than_0xff) + continue; + + charset_base_code = c & ~ICHAR_FIELD3_MASK; + + if (charset_base_code != charset_base) + { + /* If two different rows, or two different charsets, + appear, needing translation, then we cannot use + boyer_moore search. See the comment at the head of + boyer_moore(). */ + boyer_moore_ok = 0; + break; + } + } while (c != starting_c); + + if (boyer_moore_ok && (charset_base != + (translated & ~ICHAR_FIELD3_MASK))) + { + /* In the rare event that the CANON entry for this + character is not in the desired set, choose one + that is, from the equivalence set. It doesn't much + matter which. */ + Ichar starting_ch = translated; + do + { + translated = TRANSLATE (inverse_trt, translated); + + if (charset_base == (translated & ~ICHAR_FIELD3_MASK)) + break; + + } while (starting_ch != translated); + + assert (starting_ch != translated); + + new_bytelen = set_itext_ichar (tmp_str, translated); + } + } + memcpy (pat, tmp_str, new_bytelen); pat += new_bytelen; base_pat += orig_bytelen; @@ -1551,14 +1599,14 @@ makes it possible to translate just the last byte of a character, and do so after just a simple test of the context. - If that criterion is not satisfied, do not call this function. */ + If that criterion is not satisfied, do not call this function. You will + get an assertion failure. */ static Charbpos boyer_moore (struct buffer *buf, Ibyte *base_pat, Bytecount len, Bytebpos pos, Bytebpos lim, EMACS_INT n, Lisp_Object trt, Lisp_Object inverse_trt, int USED_IF_MULE (charset_base)) { - /* &&#### needs some 8-bit work here */ /* #### Someone really really really needs to comment the workings of this junk somewhat better. @@ -1602,6 +1650,13 @@ #ifdef MULE Ibyte translate_prev_byte = 0; Ibyte translate_anteprev_byte = 0; + /* These need to be rethought in the event that the internal format + changes, or in the event that num_8_bit_fixed_chars disappears + (entirely_one_byte_p can be trivially worked out by checking is the + byte count equal to the char count.) */ + int buffer_entirely_one_byte_p = buf->text->entirely_one_byte_p; + int buffer_nothing_greater_than_0xff = + buf->text->num_8_bit_fixed_chars == BUF_Z(buf) - BUF_BEG (buf); #endif #ifdef C_ALLOCA EMACS_INT BM_tab_space[0400]; @@ -1684,20 +1739,47 @@ while (!ibyte_first_byte_p (*charstart)) charstart--; untranslated = itext_ichar (charstart); - if (charset_base == (untranslated & ~ICHAR_FIELD3_MASK)) - { - ch = TRANSLATE (trt, untranslated); - if (!ibyte_first_byte_p (*ptr)) - { - translate_prev_byte = ptr[-1]; - if (!ibyte_first_byte_p (translate_prev_byte)) - translate_anteprev_byte = ptr[-2]; - } - } - else - { - this_translated = 0; - ch = *ptr; + + /* We shouldn't have been passed a string with varying + character sets or rows. That's what simple_search is + for. */ + assert (charset_base == (untranslated & ~ICHAR_FIELD3_MASK)); + + ch = TRANSLATE (trt, untranslated); + if (!ibyte_first_byte_p (*ptr)) + { + translate_prev_byte = ptr[-1]; + if (!ibyte_first_byte_p (translate_prev_byte)) + translate_anteprev_byte = ptr[-2]; + } + + if (charset_base != (ch & ~ICHAR_FIELD3_MASK)) + { + /* In the very rare event that the CANON entry for this + character is not in the desired set, choose one that + is, from the equivalence set. It doesn't much matter + which, since we're building our own cheesy equivalence + table instead of using that belonging to the case + table directly. + + We can get here if search_buffer has worked out that + the buffer is entirely single width. */ + Ichar starting_ch = ch; + do + { + ch = TRANSLATE (inverse_trt, ch); + if (charset_base == (ch & ~ICHAR_FIELD3_MASK)) + break; + + } while (starting_ch != ch); + + /* If starting_ch is equal to ch, the case table is + corrupt. (Any mapping in the canon table should be + reflected in the equivalence table, and we know from + the canon table that untranslated maps to starting_ch + and that untranslated has the correct value for + charset_base.) */ + assert (starting_ch != ch); } } else @@ -1713,28 +1795,34 @@ if (i == infinity) stride_for_teases = BM_tab[j]; BM_tab[j] = dirlen - i; - /* A translation table is accompanied by its inverse -- - see comment in casetab.c. */ + /* A translation table is accompanied by its inverse -- see + comment in casetab.c. */ if (this_translated) { Ichar starting_ch = ch; EMACS_INT starting_j = j; - while (1) + do { ch = TRANSLATE (inverse_trt, ch); - if (ch > 0400) - j = ((unsigned char) ch | 0200); - else - j = (unsigned char) ch; - - /* For all the characters that map into CH, - set up simple_translate to map the last byte - into STARTING_J. */ - simple_translate[j] = (Ibyte) starting_j; - if (ch == starting_ch) - break; - BM_tab[j] = dirlen - i; - } + + if (ch > 0x7F && buffer_entirely_one_byte_p) + continue; + + if (ch > 0xFF && buffer_nothing_greater_than_0xff) + continue; + + if (ch > 0400) + j = ((unsigned char) ch | 0200); + else + j = (unsigned char) ch; + + /* For all the characters that map into CH, set up + simple_translate to map the last byte into + STARTING_J. */ + simple_translate[j] = (Ibyte) starting_j; + BM_tab[j] = dirlen - i; + + } while (ch != starting_ch); } #else EMACS_INT k;
