Mercurial > hg > xemacs-beta
changeset 4414:df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
2008-01-30 Aidan Kehoe <kehoea@parhasard.net>
* automated/case-tests.el:
Check for a bug Mike Sperber reported; check algorithms used, if
available.
2008-01-30 Aidan Kehoe <kehoea@parhasard.net>
* search.c (debug-xemacs-searches):
New variable, available on debug builds. Used in
tests/automated/case-tests.el.
(search_buffer): Only store the charset_base for characters with
translations. Correct some comments, correct some checks. If
debug_xemacs_searches is non-zero, record which search was used.
(boyer_moore): Remove an assertion that was incorrect. Remove its
documentation. Correct an assertion dealing with equivalence
tables; we may end up looking through the equivalence table if a
non-ASCII non-case character was searched for.
author | Aidan Kehoe <kehoea@parhasard.net> |
---|---|
date | Wed, 30 Jan 2008 09:26:59 +0100 |
parents | dc84ec90b463 |
children | bceb3e285ae7 |
files | src/ChangeLog src/search.c tests/ChangeLog tests/automated/case-tests.el |
diffstat | 4 files changed, 131 insertions(+), 38 deletions(-) [+] |
line wrap: on
line diff
--- a/src/ChangeLog Mon Jan 28 08:13:37 2008 +0100 +++ b/src/ChangeLog Wed Jan 30 09:26:59 2008 +0100 @@ -1,3 +1,16 @@ +2008-01-30 Aidan Kehoe <kehoea@parhasard.net> + + * search.c (debug-xemacs-searches): + New variable, available on debug builds. Used in + tests/automated/case-tests.el. + (search_buffer): Only store the charset_base for characters with + translations. Correct some comments, correct some checks. If + debug_xemacs_searches is non-zero, record which search was used. + (boyer_moore): Remove an assertion that was incorrect. Remove its + documentation. Correct an assertion dealing with equivalence + tables; we may end up looking through the equivalence table if a + non-ASCII non-case character was searched for. + 2008-01-24 Mike Sperber <mike@xemacs.org> * make-src-depend (PrintDeps): Fix Perl code that no longer works
--- a/src/search.c Mon Jan 28 08:13:37 2008 +0100 +++ b/src/search.c Wed Jan 30 09:26:59 2008 +0100 @@ -47,6 +47,15 @@ #define REGEXP_CACHE_SIZE 20 +#ifdef DEBUG_XEMACS + +/* Used in tests/automated/case-tests.el if available. */ +Fixnum debug_xemacs_searches; + +Lisp_Object Qsearch_algorithm_used, Qboyer_moore, Qsimple_search; + +#endif + /* If the regexp is non-nil, then the buffer contains the compiled form of that regexp, suitable for searching. */ struct regexp_cache @@ -1370,16 +1379,10 @@ inv_bytelen = set_itext_ichar (tmp_str, inverse); new_bytelen = set_itext_ichar (tmp_str, translated); - if (-1 == charset_base) - { - /* Keep track of which charset and character set row - contains the characters that need translation. - - Zero out the bits corresponding to the last byte. */ - charset_base = c & ~ICHAR_FIELD3_MASK; - } - - if (boyer_moore_ok && (translated != c || inverse != c)) + if (boyer_moore_ok + /* Only do the Boyer-Moore check for characters needing + translation. */ + && (translated != c || inverse != c)) { Ichar starting_c = c; int charset_base_code; @@ -1396,21 +1399,34 @@ if (c > 0xFF && nothing_greater_than_0xff) continue; - charset_base_code = c & ~ICHAR_FIELD3_MASK; - - if (charset_base_code != charset_base) + if (-1 == charset_base) /* No charset yet specified. */ + { + /* Keep track of which charset and character set row + contains the characters that need translation. + + Zero out the bits corresponding to the last + byte. */ + charset_base = c & ~ICHAR_FIELD3_MASK; + } + else { - /* If two different rows, or two different charsets, - appear, needing translation, then we cannot use - boyer_moore search. See the comment at the head of - boyer_moore(). */ - boyer_moore_ok = 0; - break; + charset_base_code = c & ~ICHAR_FIELD3_MASK; + + if (charset_base_code != charset_base) + { + /* If two different rows, or two different + charsets, appear, needing non-ASCII + translation, then we cannot use boyer_moore + search. See the comment at the head of + boyer_moore(). */ + boyer_moore_ok = 0; + break; + } } } while (c != starting_c); - if (boyer_moore_ok && (charset_base != - (translated & ~ICHAR_FIELD3_MASK))) + if (boyer_moore_ok && charset_base != -1 && + charset_base != (translated & ~ICHAR_FIELD3_MASK)) { /* In the rare event that the CANON entry for this character is not in the desired set, choose one @@ -1437,6 +1453,12 @@ base_pat += orig_bytelen; len -= orig_bytelen; } + + if (-1 == charset_base) + { + charset_base = 'a' & ~ICHAR_FIELD3_MASK; /* Default to ASCII. */ + } + #else /* not MULE */ while (--len >= 0) { @@ -1453,6 +1475,15 @@ #endif /* MULE */ len = pat - patbuf; pat = base_pat = patbuf; + +#ifdef DEBUG_XEMACS + if (debug_xemacs_searches) + { + Lisp_Symbol *sym = XSYMBOL (Qsearch_algorithm_used); + sym->value = boyer_moore_ok ? Qboyer_moore : Qsimple_search; + } +#endif + if (boyer_moore_ok) return boyer_moore (buf, base_pat, len, pos, lim, n, trt, inverse_trt, charset_base); @@ -1595,9 +1626,9 @@ TRT and INVERSE_TRT are translation tables. This kind of search works if all the characters in PAT that have - nontrivial translation are the same aside from the last byte. This - makes it possible to translate just the last byte of a character, - and do so after just a simple test of the context. + (non-ASCII) translation are the same aside from the last byte. This + makes it possible to translate just the last byte of a character, and do + so after just a simple test of the context. If that criterion is not satisfied, do not call this function. You will get an assertion failure. */ @@ -1740,11 +1771,6 @@ charstart--; untranslated = itext_ichar (charstart); - /* We shouldn't have been passed a string with varying - character sets or rows. That's what simple_search is - for. */ - assert (charset_base == (untranslated & ~ICHAR_FIELD3_MASK)); - ch = TRANSLATE (trt, untranslated); if (!ibyte_first_byte_p (*ptr)) { @@ -1753,7 +1779,8 @@ translate_anteprev_byte = ptr[-2]; } - if (charset_base != (ch & ~ICHAR_FIELD3_MASK)) + if (ch != untranslated && /* Was translation done? */ + charset_base != (ch & ~ICHAR_FIELD3_MASK)) { /* In the very rare event that the CANON entry for this character is not in the desired set, choose one that @@ -1765,21 +1792,23 @@ We can get here if search_buffer has worked out that the buffer is entirely single width. */ Ichar starting_ch = ch; + int count = 0; do { ch = TRANSLATE (inverse_trt, ch); if (charset_base == (ch & ~ICHAR_FIELD3_MASK)) break; - + ++count; } while (starting_ch != ch); - /* If starting_ch is equal to ch, the case table is - corrupt. (Any mapping in the canon table should be - reflected in the equivalence table, and we know from - the canon table that untranslated maps to starting_ch - and that untranslated has the correct value for - charset_base.) */ - assert (starting_ch != ch); + /* If starting_ch is equal to ch (and count is not one, + which means no translation is necessary), the case + table is corrupt. (Any mapping in the canon table + should be reflected in the equivalence table, and we + know from the canon table that untranslated maps to + starting_ch and that untranslated has the correct value + for charset_base.) */ + assert (1 == count || starting_ch != ch); } } else @@ -3320,4 +3349,15 @@ Vskip_chars_range_table = Fmake_range_table (Qstart_closed_end_closed); staticpro (&Vskip_chars_range_table); +#ifdef DEBUG_XEMACS + DEFSYMBOL (Qsearch_algorithm_used); + DEFSYMBOL (Qboyer_moore); + DEFSYMBOL (Qsimple_search); + + DEFVAR_INT ("debug-xemacs-searches", &debug_xemacs_searches /* +If non-zero, bind `search-algorithm-used' to `boyer-moore' or `simple-search', +depending on the algorithm used for each search. Used for testing. +*/ ); + debug_xemacs_searches = 0; +#endif }
--- a/tests/ChangeLog Mon Jan 28 08:13:37 2008 +0100 +++ b/tests/ChangeLog Wed Jan 30 09:26:59 2008 +0100 @@ -1,3 +1,9 @@ +2008-01-30 Aidan Kehoe <kehoea@parhasard.net> + + * automated/case-tests.el: + Check for a bug Mike Sperber reported; check algorithms used, if + available. + 2008-01-16 Aidan Kehoe <kehoea@parhasard.net> * automated/mule-tests.el (test-file-name):
--- a/tests/automated/case-tests.el Mon Jan 28 08:13:37 2008 +0100 +++ b/tests/automated/case-tests.el Wed Jan 30 09:26:59 2008 +0100 @@ -268,3 +268,37 @@ (goto-char (point-max)) (Assert (not (search-backward string nil t 6)))))) +;; Bug reported in http://mid.gmane.org/y9lk5lu5orq.fsf@deinprogramm.de from +;; Michael Sperber. Fixed 2008-01-29. +(with-string-as-buffer-contents "\n\nDer beruhmte deutsche Flei\xdf\n\n" + (goto-char (point-min)) + (Assert (search-forward "Flei\xdf"))) + +(Skip-Test-Unless + (boundp 'debug-xemacs-searches) ; normal when we have DEBUG_XEMACS + "not a DEBUG_XEMACS build" + (let ((debug-xemacs-searches 1)) + (with-temp-buffer + (insert "\n\nDer beruhmte deutsche Fleiss\n\n") + (goto-char (point-min)) + (search-forward "Fleiss") + (delete-region (point-min) (point-max)) + (insert "\n\nDer beruhmte deutsche Flei\xdf\n\n") + (goto-char (point-min)) + (search-forward "Flei\xdf") + (Assert (eq 'boyer-moore search-algorithm-used)) + (delete-region (point-min) (point-max)) + (when (featurep 'mule) + (insert "\n\nDer beruhmte deutsche Flei\xdf\n\n") + (goto-char (point-min)) + (Assert + (search-forward (format "Fle%c\xdf" + (make-char 'latin-iso8859-9 #xfd)))) + (Assert (eq 'boyer-moore search-algorithm-used)) + (insert (make-char 'latin-iso8859-9 #xfd)) + (goto-char (point-min)) + (Assert + (search-forward (format "Fle%c\xdf" + (make-char 'latin-iso8859-9 #xfd)))) + (Assert (eq 'simple-search search-algorithm-used)))))) +