xemacs-beta: src/search.c comparison

comparison src/search.c @ 4414:df576f30c1d8

Correct case-insensitive search for non-case, non-ASCII chars. Add tests.

2008-01-30 Aidan Kehoe <kehoea@parhasard.net>
* automated/case-tests.el: Check for a bug Mike Sperber reported; check algorithms used, if available.

2008-01-30 Aidan Kehoe <kehoea@parhasard.net>
* search.c (debug-xemacs-searches): New variable, available on debug builds. Used in tests/automated/case-tests.el.
(search_buffer): Only store the charset_base for characters with translations. Correct some comments, correct some checks. If debug_xemacs_searches is non-zero, record which search was used.
(boyer_moore): Remove an assertion that was incorrect. Remove its documentation. Correct an assertion dealing with equivalence tables; we may end up looking through the equivalence table if a non-ASCII non-case character was searched for.

author	Aidan Kehoe <kehoea@parhasard.net>
date	Wed, 30 Jan 2008 09:26:59 +0100
parents	4ee73bbe4f8e
children	69b803c646cd

comparison

equal deleted inserted replaced

-:dc84ec90b463
+:df576f30c1d8
 #define TRANSLATE(table, pos)	\
 (!NILP (table) ? TRT_TABLE_OF (table, (Ichar) pos) : pos)
 #define REGEXP_CACHE_SIZE 20
+#ifdef DEBUG_XEMACS
+/* Used in tests/automated/case-tests.el if available. */
+Fixnum debug_xemacs_searches;
+Lisp_Object Qsearch_algorithm_used, Qboyer_moore, Qsimple_search;
+#endif
 /* If the regexp is non-nil, then the buffer contains the compiled form
 of that regexp, suitable for searching.  */
 struct regexp_cache
 {
 	  orig_bytelen = itext_ichar_len (base_pat);
 	  inv_bytelen = set_itext_ichar (tmp_str, inverse);
 	  new_bytelen = set_itext_ichar (tmp_str, translated);
-if (-1 == charset_base)
+if (boyer_moore_ok
-{
+/* Only do the Boyer-Moore check for characters needing
-/* Keep track of which charset and character set row
+translation. */
-contains the characters that need translation.
+&& (translated != c || inverse != c))
-Zero out the bits corresponding to the last byte. */
-charset_base = c & ~ICHAR_FIELD3_MASK;
-}
-if (boyer_moore_ok && (translated != c || inverse != c))
 {
 	      Ichar starting_c = c;
 	      int charset_base_code;
 	      do
 continue;
 if (c > 0xFF && nothing_greater_than_0xff)
 continue;
-charset_base_code = c & ~ICHAR_FIELD3_MASK;
+if (-1 == charset_base) /* No charset yet specified. */
-if (charset_base_code != charset_base)
 {
-/* If two different rows, or two different charsets,
+/* Keep track of which charset and character set row
-appear, needing translation, then we cannot use
+contains the characters that need translation.
-boyer_moore search.  See the comment at the head of
-boyer_moore(). */
+Zero out the bits corresponding to the last
-boyer_moore_ok = 0;
+byte. */
-break;
+charset_base = c & ~ICHAR_FIELD3_MASK;
+}
+else
+{
+charset_base_code = c & ~ICHAR_FIELD3_MASK;
+if (charset_base_code != charset_base)
+{
+/* If two different rows, or two different
+charsets, appear, needing non-ASCII
+translation, then we cannot use boyer_moore
+search.  See the comment at the head of
+boyer_moore(). */
+boyer_moore_ok = 0;
+break;
+}
 }
 } while (c != starting_c);
-if (boyer_moore_ok && (charset_base !=
+if (boyer_moore_ok && charset_base != -1 &&
-(translated & ~ICHAR_FIELD3_MASK)))
+charset_base != (translated & ~ICHAR_FIELD3_MASK))
 {
 /* In the rare event that the CANON entry for this
 character is not in the desired set, choose one
 that is, from the equivalence set. It doesn't much
 matter which. */
 	  memcpy (pat, tmp_str, new_bytelen);
 	  pat += new_bytelen;
 	  base_pat += orig_bytelen;
 	  len -= orig_bytelen;
 	}
+if (-1 == charset_base)
+{
+charset_base = 'a' & ~ICHAR_FIELD3_MASK; /* Default to ASCII. */
+}
 #else /* not MULE */
 while (--len >= 0)
 	{
 	  /* If we got here and the RE flag is set, it's because
 	     we're dealing with a regexp known to be trivial, so the
 	  *pat++ = TRANSLATE (trt, *base_pat++);
 	}
 #endif /* MULE */
 len = pat - patbuf;
 pat = base_pat = patbuf;
+#ifdef DEBUG_XEMACS
+if (debug_xemacs_searches)
+{
+Lisp_Symbol *sym = XSYMBOL (Qsearch_algorithm_used);
+sym->value = boyer_moore_ok ? Qboyer_moore : Qsimple_search;
+}
+#endif
 if (boyer_moore_ok)
 	return boyer_moore (buf, base_pat, len, pos, lim, n,
 			    trt, inverse_trt, charset_base);
 else
 	return simple_search (buf, base_pat, len, pos, lim, n, trt);
 from buffer position POS/POS_BYTE until LIM/LIM_BYTE.
 DIRECTION says which direction we search in.
 TRT and INVERSE_TRT are translation tables.
 This kind of search works if all the characters in PAT that have
-nontrivial translation are the same aside from the last byte.  This
+(non-ASCII) translation are the same aside from the last byte.  This
-makes it possible to translate just the last byte of a character,
+makes it possible to translate just the last byte of a character, and do
-and do so after just a simple test of the context.
+so after just a simple test of the context.
 If that criterion is not satisfied, do not call this function.  You will
 get an assertion failure. */
 static Charbpos
 	      Ibyte *charstart = ptr;
 	      while (!ibyte_first_byte_p (*charstart))
 		charstart--;
 	      untranslated = itext_ichar (charstart);
-/* We shouldn't have been passed a string with varying
-character sets or rows. That's what simple_search is
-for.  */
-assert (charset_base == (untranslated & ~ICHAR_FIELD3_MASK));
 ch = TRANSLATE (trt, untranslated);
 if (!ibyte_first_byte_p (*ptr))
 {
 translate_prev_byte = ptr[-1];
 if (!ibyte_first_byte_p (translate_prev_byte))
 translate_anteprev_byte = ptr[-2];
 }
-if (charset_base != (ch & ~ICHAR_FIELD3_MASK))
+if (ch != untranslated && /* Was translation done? */
+charset_base != (ch & ~ICHAR_FIELD3_MASK))
 {
 /* In the very rare event that the CANON entry for this
 character is not in the desired set, choose one that
 is, from the equivalence set. It doesn't much matter
 which, since we're building our own cheesy equivalence
 table directly.
 We can get here if search_buffer has worked out that
 the buffer is entirely single width. */
 Ichar starting_ch = ch;
+int count = 0;
 do
 {
 ch = TRANSLATE (inverse_trt, ch);
 if (charset_base == (ch & ~ICHAR_FIELD3_MASK))
 break;
+++count;
 } while (starting_ch != ch);
-/* If starting_ch is equal to ch, the case table is
+/* If starting_ch is equal to ch (and count is not one,
-corrupt. (Any mapping in the canon table should be
+which means no translation is necessary), the case
-reflected in the equivalence table, and we know from
+table is corrupt. (Any mapping in the canon table
-the canon table that untranslated maps to starting_ch
+should be reflected in the equivalence table, and we
-and that untranslated has the correct value for
+know from the canon table that untranslated maps to
-charset_base.) */
+starting_ch and that untranslated has the correct value
-assert (starting_ch != ch);
+for charset_base.) */
+assert (1 == count || starting_ch != ch);
 		}
 	    }
 	  else
 	    {
 	      ch = *ptr;
 */ );
 warn_about_possibly_incompatible_back_references = 1;
 Vskip_chars_range_table = Fmake_range_table (Qstart_closed_end_closed);
 staticpro (&Vskip_chars_range_table);
-}
+#ifdef DEBUG_XEMACS
+DEFSYMBOL (Qsearch_algorithm_used);
+DEFSYMBOL (Qboyer_moore);
+DEFSYMBOL (Qsimple_search);
+DEFVAR_INT ("debug-xemacs-searches", &debug_xemacs_searches /*
+If non-zero, bind `search-algorithm-used' to `boyer-moore' or `simple-search',
+depending on the algorithm used for each search.  Used for testing.
+*/ );
+debug_xemacs_searches = 0;
+#endif
+}

Mercurial > hg > xemacs-beta

comparison src/search.c @ 4414:df576f30c1d8