changeset 4414:df576f30c1d8

Correct case-insensitive search for non-case, non-ASCII chars. Add tests. 2008-01-30 Aidan Kehoe <kehoea@parhasard.net> * automated/case-tests.el: Check for a bug Mike Sperber reported; check algorithms used, if available. 2008-01-30 Aidan Kehoe <kehoea@parhasard.net> * search.c (debug-xemacs-searches): New variable, available on debug builds. Used in tests/automated/case-tests.el. (search_buffer): Only store the charset_base for characters with translations. Correct some comments, correct some checks. If debug_xemacs_searches is non-zero, record which search was used. (boyer_moore): Remove an assertion that was incorrect. Remove its documentation. Correct an assertion dealing with equivalence tables; we may end up looking through the equivalence table if a non-ASCII non-case character was searched for.
author Aidan Kehoe <kehoea@parhasard.net>
date Wed, 30 Jan 2008 09:26:59 +0100
parents dc84ec90b463
children bceb3e285ae7
files src/ChangeLog src/search.c tests/ChangeLog tests/automated/case-tests.el
diffstat 4 files changed, 131 insertions(+), 38 deletions(-) [+]
line wrap: on
line diff
--- a/src/ChangeLog	Mon Jan 28 08:13:37 2008 +0100
+++ b/src/ChangeLog	Wed Jan 30 09:26:59 2008 +0100
@@ -1,3 +1,16 @@
+2008-01-30  Aidan Kehoe  <kehoea@parhasard.net>
+
+	* search.c (debug-xemacs-searches): 
+	New variable, available on debug builds. Used in
+	tests/automated/case-tests.el. 
+	(search_buffer): Only store the charset_base for characters with
+	translations. Correct some comments, correct some checks. If
+	debug_xemacs_searches is non-zero, record which search was used. 
+	(boyer_moore): Remove an assertion that was incorrect. Remove its
+	documentation. Correct an assertion dealing with equivalence
+	tables; we may end up looking through the equivalence table if a
+	non-ASCII non-case character was searched for. 
+
 2008-01-24 Mike Sperber   <mike@xemacs.org>
 
 	* make-src-depend (PrintDeps): Fix Perl code that no longer works
--- a/src/search.c	Mon Jan 28 08:13:37 2008 +0100
+++ b/src/search.c	Wed Jan 30 09:26:59 2008 +0100
@@ -47,6 +47,15 @@
 
 #define REGEXP_CACHE_SIZE 20
 
+#ifdef DEBUG_XEMACS
+
+/* Used in tests/automated/case-tests.el if available. */
+Fixnum debug_xemacs_searches;
+
+Lisp_Object Qsearch_algorithm_used, Qboyer_moore, Qsimple_search;
+
+#endif
+
 /* If the regexp is non-nil, then the buffer contains the compiled form
    of that regexp, suitable for searching.  */
 struct regexp_cache
@@ -1370,16 +1379,10 @@
 	  inv_bytelen = set_itext_ichar (tmp_str, inverse);
 	  new_bytelen = set_itext_ichar (tmp_str, translated);
 
-          if (-1 == charset_base)
-            {
-              /* Keep track of which charset and character set row
-                 contains the characters that need translation.
-
-                 Zero out the bits corresponding to the last byte. */
-              charset_base = c & ~ICHAR_FIELD3_MASK;
-            }
-
-          if (boyer_moore_ok && (translated != c || inverse != c))
+          if (boyer_moore_ok
+              /* Only do the Boyer-Moore check for characters needing
+                 translation. */
+              && (translated != c || inverse != c))
             {
 	      Ichar starting_c = c;
 	      int charset_base_code;
@@ -1396,21 +1399,34 @@
                   if (c > 0xFF && nothing_greater_than_0xff)
                     continue;
 
-                  charset_base_code = c & ~ICHAR_FIELD3_MASK;
-
-                  if (charset_base_code != charset_base)
+                  if (-1 == charset_base) /* No charset yet specified. */
+                    {
+                      /* Keep track of which charset and character set row
+                         contains the characters that need translation.
+
+                         Zero out the bits corresponding to the last
+                         byte. */
+                      charset_base = c & ~ICHAR_FIELD3_MASK;
+                    }
+                  else
                     {
-                      /* If two different rows, or two different charsets,
-                         appear, needing translation, then we cannot use
-                         boyer_moore search.  See the comment at the head of
-                         boyer_moore(). */
-                      boyer_moore_ok = 0;
-                      break;
+                      charset_base_code = c & ~ICHAR_FIELD3_MASK;
+
+                      if (charset_base_code != charset_base)
+                        {
+                          /* If two different rows, or two different
+                             charsets, appear, needing non-ASCII
+                             translation, then we cannot use boyer_moore
+                             search.  See the comment at the head of
+                             boyer_moore(). */
+                          boyer_moore_ok = 0;
+                          break;
+                        }
                     }
                 } while (c != starting_c);
 
-              if (boyer_moore_ok && (charset_base != 
-                                     (translated & ~ICHAR_FIELD3_MASK)))
+              if (boyer_moore_ok && charset_base != -1 && 
+                  charset_base != (translated & ~ICHAR_FIELD3_MASK))
                 {
                   /* In the rare event that the CANON entry for this
                      character is not in the desired set, choose one
@@ -1437,6 +1453,12 @@
 	  base_pat += orig_bytelen;
 	  len -= orig_bytelen;
 	}
+
+      if (-1 == charset_base)
+        {
+          charset_base = 'a' & ~ICHAR_FIELD3_MASK; /* Default to ASCII. */
+        }
+
 #else /* not MULE */
       while (--len >= 0)
 	{
@@ -1453,6 +1475,15 @@
 #endif /* MULE */
       len = pat - patbuf;
       pat = base_pat = patbuf;
+
+#ifdef DEBUG_XEMACS
+      if (debug_xemacs_searches)
+        {
+          Lisp_Symbol *sym = XSYMBOL (Qsearch_algorithm_used);
+          sym->value = boyer_moore_ok ? Qboyer_moore : Qsimple_search;
+        }
+#endif
+
       if (boyer_moore_ok)
 	return boyer_moore (buf, base_pat, len, pos, lim, n,
 			    trt, inverse_trt, charset_base);
@@ -1595,9 +1626,9 @@
    TRT and INVERSE_TRT are translation tables.
 
    This kind of search works if all the characters in PAT that have
-   nontrivial translation are the same aside from the last byte.  This
-   makes it possible to translate just the last byte of a character,
-   and do so after just a simple test of the context.
+   (non-ASCII) translation are the same aside from the last byte.  This
+   makes it possible to translate just the last byte of a character, and do
+   so after just a simple test of the context.
 
    If that criterion is not satisfied, do not call this function.  You will
    get an assertion failure. */
@@ -1740,11 +1771,6 @@
 		charstart--;
 	      untranslated = itext_ichar (charstart);
 
-              /* We shouldn't have been passed a string with varying
-                 character sets or rows. That's what simple_search is
-                 for.  */
-              assert (charset_base == (untranslated & ~ICHAR_FIELD3_MASK));
-
               ch = TRANSLATE (trt, untranslated);
               if (!ibyte_first_byte_p (*ptr))
                 {
@@ -1753,7 +1779,8 @@
                     translate_anteprev_byte = ptr[-2];
                 }
 
-              if (charset_base != (ch & ~ICHAR_FIELD3_MASK))
+              if (ch != untranslated && /* Was translation done? */
+                  charset_base != (ch & ~ICHAR_FIELD3_MASK))
                 {
                   /* In the very rare event that the CANON entry for this
                      character is not in the desired set, choose one that
@@ -1765,21 +1792,23 @@
                      We can get here if search_buffer has worked out that
                      the buffer is entirely single width. */
                   Ichar starting_ch = ch;
+                  int count = 0;
                   do
                     {
                       ch = TRANSLATE (inverse_trt, ch);
                       if (charset_base == (ch & ~ICHAR_FIELD3_MASK))
                         break;
-
+                      ++count;
                     } while (starting_ch != ch);
 
-                  /* If starting_ch is equal to ch, the case table is
-                     corrupt. (Any mapping in the canon table should be
-                     reflected in the equivalence table, and we know from
-                     the canon table that untranslated maps to starting_ch
-                     and that untranslated has the correct value for
-                     charset_base.) */
-                  assert (starting_ch != ch);
+                  /* If starting_ch is equal to ch (and count is not one,
+                     which means no translation is necessary), the case
+                     table is corrupt. (Any mapping in the canon table
+                     should be reflected in the equivalence table, and we
+                     know from the canon table that untranslated maps to
+                     starting_ch and that untranslated has the correct value
+                     for charset_base.) */
+                  assert (1 == count || starting_ch != ch);
 		}
 	    }
 	  else
@@ -3320,4 +3349,15 @@
 
   Vskip_chars_range_table = Fmake_range_table (Qstart_closed_end_closed);
   staticpro (&Vskip_chars_range_table);
+#ifdef DEBUG_XEMACS 
+  DEFSYMBOL (Qsearch_algorithm_used);
+  DEFSYMBOL (Qboyer_moore);
+  DEFSYMBOL (Qsimple_search);
+
+  DEFVAR_INT ("debug-xemacs-searches", &debug_xemacs_searches /*
+If non-zero, bind `search-algorithm-used' to `boyer-moore' or `simple-search',
+depending on the algorithm used for each search.  Used for testing.
+*/ );
+  debug_xemacs_searches = 0;
+#endif 
 }
--- a/tests/ChangeLog	Mon Jan 28 08:13:37 2008 +0100
+++ b/tests/ChangeLog	Wed Jan 30 09:26:59 2008 +0100
@@ -1,3 +1,9 @@
+2008-01-30  Aidan Kehoe  <kehoea@parhasard.net>
+
+	* automated/case-tests.el:
+	Check for a bug Mike Sperber reported; check algorithms used, if
+	available. 
+
 2008-01-16  Aidan Kehoe  <kehoea@parhasard.net>
 
 	* automated/mule-tests.el (test-file-name): 
--- a/tests/automated/case-tests.el	Mon Jan 28 08:13:37 2008 +0100
+++ b/tests/automated/case-tests.el	Wed Jan 30 09:26:59 2008 +0100
@@ -268,3 +268,37 @@
       (goto-char (point-max))
       (Assert (not (search-backward string nil t 6))))))
 
+;; Bug reported in http://mid.gmane.org/y9lk5lu5orq.fsf@deinprogramm.de from
+;; Michael Sperber. Fixed 2008-01-29.
+(with-string-as-buffer-contents "\n\nDer beruhmte deutsche Flei\xdf\n\n"
+  (goto-char (point-min))
+  (Assert (search-forward "Flei\xdf")))
+
+(Skip-Test-Unless
+ (boundp 'debug-xemacs-searches) ; normal when we have DEBUG_XEMACS
+ "not a DEBUG_XEMACS build"
+ (let ((debug-xemacs-searches 1))
+   (with-temp-buffer
+     (insert "\n\nDer beruhmte deutsche Fleiss\n\n")
+     (goto-char (point-min))
+     (search-forward "Fleiss")
+     (delete-region (point-min) (point-max))
+     (insert "\n\nDer beruhmte deutsche Flei\xdf\n\n")
+     (goto-char (point-min))
+     (search-forward "Flei\xdf")
+     (Assert (eq 'boyer-moore search-algorithm-used))
+     (delete-region (point-min) (point-max))
+     (when (featurep 'mule)
+       (insert "\n\nDer beruhmte deutsche Flei\xdf\n\n")
+       (goto-char (point-min))
+       (Assert 
+        (search-forward (format "Fle%c\xdf"
+                                (make-char 'latin-iso8859-9 #xfd))))
+       (Assert (eq 'boyer-moore search-algorithm-used))
+       (insert (make-char 'latin-iso8859-9 #xfd))
+       (goto-char (point-min))
+       (Assert 
+        (search-forward (format "Fle%c\xdf"
+                                (make-char 'latin-iso8859-9 #xfd))))
+       (Assert (eq 'simple-search search-algorithm-used))))))
+