Mercurial > hg > xemacs-beta

--- a/src/ChangeLog	Sat Jan 19 14:34:19 2008 +0100
+++ b/src/ChangeLog	Sun Jan 20 13:09:58 2008 +0100
@@ -1,3 +1,33 @@
+2007-12-26  Aidan Kehoe  <kehoea@parhasard.net>
+
+	* casetab.c:
+	Extend and correct some case table documentation.
+	* search.c (search_buffer):
+	Correct a bug where only the first entry for a character in the
+	case equivalence table was examined in determining if the
+	Boyer-Moore search algorithm is appropriate.
+
+	If there are case mappings outside of the charset and row of the
+	characters specified in the search string, those case mappings can
+	be safely ignored (and Boyer-Moore search can be used) if we know
+	from the buffer statistics that the corresponding characters cannot
+	occur.
+
+	* search.c (boyer_moore):
+	Assert that we haven't been passed a string with varying
+	characters sets or rows within character sets. That's what
+	simple_search is for.
+
+	In the very rare event that a character in the search string has a
+	canonical case mapping that is not in the same character set and
+	row, don't try to search for the canonical character, search for
+	some other character that is in the the desired character set and
+	row. Assert that the case table isn't corrupt.
+
+	Do not search for any character case mappings that cannot possibly
+	occur in the buffer, given the buffer metadata about its
+	contents.
+
 2008-01-19  Aidan Kehoe  <kehoea@parhasard.net>

 	* dired.c (Ffile_attributes): If bignums are available, use them
--- a/src/casetab.c	Sat Jan 19 14:34:19 2008 +0100
+++ b/src/casetab.c	Sun Jan 20 13:09:58 2008 +0100
@@ -48,13 +48,28 @@
    or vice versa, both characters will have the same entry in the canon
    table.

-   (4) `equiv' lists the "equivalence classes" defined by `canon'.  Imagine
+   (4) `eqv' lists the "equivalence classes" defined by `canon'.  Imagine
    that all characters are divided into groups having the same `canon'
-   entry; these groups are called "equivalence classes" and `equiv' lists
-   them by linking the characters in each equivalence class together in a
-   circular list.
+   entry; these groups are called "equivalence classes" and `eqv' lists them
+   by linking the characters in each equivalence class together in a
+   circular list. That is, to find out all all the members of a given char's
+   equivalence classe, you need something like the following code:

-   `canon' is used when doing case-insensitive comparisons.  `equiv' is
+    (let* ((char ?i)
+           (original-char char)
+           (standard-case-eqv (case-table-eqv (standard-case-table))))
+      (loop
+        with res = (list char)
+        until (eq (setq char (get-char-table char standard-case-eqv))
+                  original-char)
+        do (push char res)
+        finally return res))
+
+   (Where #'case-table-eqv doesn't yet exist, and probably never will, given
+   that the C code needs to keep it in a consistent state so Lisp can't mess
+   around with it.)
+
+   `canon' is used when doing case-insensitive comparisons.  `eqv' is
    used in the Boyer-Moore search code.
    */
--- a/src/search.c	Sat Jan 19 14:34:19 2008 +0100
+++ b/src/search.c	Sun Jan 20 13:09:58 2008 +0100
@@ -1340,11 +1340,14 @@
     {
       int charset_base = -1;
       int boyer_moore_ok = 1;
-      Ibyte *pat = 0;
       Ibyte *patbuf = alloca_ibytes (len * MAX_ICHAR_LEN);
-      pat = patbuf;
+      Ibyte *pat = patbuf;
+
 #ifdef MULE
-      /* &&#### needs some 8-bit work here */
+      int entirely_one_byte_p = buf->text->entirely_one_byte_p;
+      int nothing_greater_than_0xff =
+        buf->text->num_8_bit_fixed_chars == BUF_Z(buf) - BUF_BEG (buf);
+
       while (len > 0)
 	{
 	  Ibyte tmp_str[MAX_ICHAR_LEN];
@@ -1367,23 +1370,68 @@
 	  inv_bytelen = set_itext_ichar (tmp_str, inverse);
 	  new_bytelen = set_itext_ichar (tmp_str, translated);

-	  if (new_bytelen != orig_bytelen || inv_bytelen != orig_bytelen)
-	    boyer_moore_ok = 0;
-	  if (translated != c || inverse != c)
-	    {
-	      /* Keep track of which charset and character set row
-		 contains the characters that need translation.
-		 Zero out the bits corresponding to the last byte.
-	      */
-	      int charset_base_code = c & ~ICHAR_FIELD3_MASK;
-	      if (charset_base == -1)
-		charset_base = charset_base_code;
-	      else if (charset_base != charset_base_code)
-		/* If two different rows appear, needing translation, then
-		   we cannot use boyer_moore search.  See the comment at the
-		   head of boyer_moore(). */
-		boyer_moore_ok = 0;
-	    }
+          if (-1 == charset_base)
+            {
+              /* Keep track of which charset and character set row
+                 contains the characters that need translation.
+
+                 Zero out the bits corresponding to the last byte. */
+              charset_base = c & ~ICHAR_FIELD3_MASK;
+            }
+
+          if (boyer_moore_ok && (translated != c || inverse != c))
+            {
+	      Ichar starting_c = c;
+	      int charset_base_code;
+
+	      do
+		{
+		  c = TRANSLATE (inverse_trt, c);
+
+                  /* If a character cannot occur in the buffer, ignore
+                     it. */
+                  if (c > 0x7F && entirely_one_byte_p)
+                    continue;
+
+                  if (c > 0xFF && nothing_greater_than_0xff)
+                    continue;
+
+                  charset_base_code = c & ~ICHAR_FIELD3_MASK;
+
+                  if (charset_base_code != charset_base)
+                    {
+                      /* If two different rows, or two different charsets,
+                         appear, needing translation, then we cannot use
+                         boyer_moore search.  See the comment at the head of
+                         boyer_moore(). */
+                      boyer_moore_ok = 0;
+                      break;
+                    }
+                } while (c != starting_c);
+
+              if (boyer_moore_ok && (charset_base !=
+                                     (translated & ~ICHAR_FIELD3_MASK)))
+                {
+                  /* In the rare event that the CANON entry for this
+                     character is not in the desired set, choose one
+                     that is, from the equivalence set. It doesn't much
+                     matter which. */
+                  Ichar starting_ch = translated;
+                  do
+                    {
+                      translated = TRANSLATE (inverse_trt, translated);
+
+                      if (charset_base == (translated & ~ICHAR_FIELD3_MASK))
+                        break;
+
+                    } while (starting_ch != translated);
+
+                  assert (starting_ch != translated);
+
+                  new_bytelen = set_itext_ichar (tmp_str, translated);
+                }
+            }
+
 	  memcpy (pat, tmp_str, new_bytelen);
 	  pat += new_bytelen;
 	  base_pat += orig_bytelen;
@@ -1551,14 +1599,14 @@
    makes it possible to translate just the last byte of a character,
    and do so after just a simple test of the context.

-   If that criterion is not satisfied, do not call this function.  */
+   If that criterion is not satisfied, do not call this function.  You will
+   get an assertion failure. */

 static Charbpos
 boyer_moore (struct buffer *buf, Ibyte *base_pat, Bytecount len,
 	     Bytebpos pos, Bytebpos lim, EMACS_INT n, Lisp_Object trt,
 	     Lisp_Object inverse_trt, int USED_IF_MULE (charset_base))
 {
-  /* &&#### needs some 8-bit work here */
   /* #### Someone really really really needs to comment the workings
      of this junk somewhat better.

@@ -1602,6 +1650,13 @@
 #ifdef MULE
   Ibyte translate_prev_byte = 0;
   Ibyte translate_anteprev_byte = 0;
+  /* These need to be rethought in the event that the internal format
+     changes, or in the event that num_8_bit_fixed_chars disappears
+     (entirely_one_byte_p can be trivially worked out by checking is the
+     byte count equal to the char count.)  */
+  int buffer_entirely_one_byte_p = buf->text->entirely_one_byte_p;
+  int buffer_nothing_greater_than_0xff =
+    buf->text->num_8_bit_fixed_chars == BUF_Z(buf) - BUF_BEG (buf);
 #endif
 #ifdef C_ALLOCA
   EMACS_INT BM_tab_space[0400];
@@ -1684,20 +1739,47 @@
 	      while (!ibyte_first_byte_p (*charstart))
 		charstart--;
 	      untranslated = itext_ichar (charstart);
-	      if (charset_base == (untranslated & ~ICHAR_FIELD3_MASK))
-		{
-		  ch = TRANSLATE (trt, untranslated);
-		  if (!ibyte_first_byte_p (*ptr))
-		    {
-		      translate_prev_byte = ptr[-1];
-		      if (!ibyte_first_byte_p (translate_prev_byte))
-			translate_anteprev_byte = ptr[-2];
-		    }
-		}
-	      else
-		{
-		  this_translated = 0;
-		  ch = *ptr;
+
+              /* We shouldn't have been passed a string with varying
+                 character sets or rows. That's what simple_search is
+                 for.  */
+              assert (charset_base == (untranslated & ~ICHAR_FIELD3_MASK));
+
+              ch = TRANSLATE (trt, untranslated);
+              if (!ibyte_first_byte_p (*ptr))
+                {
+                  translate_prev_byte = ptr[-1];
+                  if (!ibyte_first_byte_p (translate_prev_byte))
+                    translate_anteprev_byte = ptr[-2];
+                }
+
+              if (charset_base != (ch & ~ICHAR_FIELD3_MASK))
+                {
+                  /* In the very rare event that the CANON entry for this
+                     character is not in the desired set, choose one that
+                     is, from the equivalence set. It doesn't much matter
+                     which, since we're building our own cheesy equivalence
+                     table instead of using that belonging to the case
+                     table directly.
+
+                     We can get here if search_buffer has worked out that
+                     the buffer is entirely single width. */
+                  Ichar starting_ch = ch;
+                  do
+                    {
+                      ch = TRANSLATE (inverse_trt, ch);
+                      if (charset_base == (ch & ~ICHAR_FIELD3_MASK))
+                        break;
+
+                    } while (starting_ch != ch);
+
+                  /* If starting_ch is equal to ch, the case table is
+                     corrupt. (Any mapping in the canon table should be
+                     reflected in the equivalence table, and we know from
+                     the canon table that untranslated maps to starting_ch
+                     and that untranslated has the correct value for
+                     charset_base.) */
+                  assert (starting_ch != ch);
 		}
 	    }
 	  else
@@ -1713,28 +1795,34 @@
 	  if (i == infinity)
 	    stride_for_teases = BM_tab[j];
 	  BM_tab[j] = dirlen - i;
-	  /* A translation table is accompanied by its inverse --
-	     see comment in casetab.c. */
+	  /* A translation table is accompanied by its inverse -- see
+	     comment in casetab.c. */
 	  if (this_translated)
 	    {
 	      Ichar starting_ch = ch;
 	      EMACS_INT starting_j = j;
-	      while (1)
+	      do
 		{
 		  ch = TRANSLATE (inverse_trt, ch);
-		  if (ch > 0400)
-		    j = ((unsigned char) ch | 0200);
-		  else
-		    j = (unsigned char) ch;
-
-		  /* For all the characters that map into CH,
-		     set up simple_translate to map the last byte
-		     into STARTING_J.  */
-		  simple_translate[j] = (Ibyte) starting_j;
-		  if (ch == starting_ch)
-		    break;
-		  BM_tab[j] = dirlen - i;
-		}
+
+                  if (ch > 0x7F && buffer_entirely_one_byte_p)
+                    continue;
+
+                  if (ch > 0xFF && buffer_nothing_greater_than_0xff)
+                    continue;
+
+                  if (ch > 0400)
+                    j = ((unsigned char) ch | 0200);
+                  else
+                    j = (unsigned char) ch;
+
+                  /* For all the characters that map into CH, set up
+                     simple_translate to map the last byte into
+                     STARTING_J.  */
+                  simple_translate[j] = (Ibyte) starting_j;
+                  BM_tab[j] = dirlen - i;
+
+		} while (ch != starting_ch);
 	    }
 #else
 	  EMACS_INT k;