changeset 877:e54d47b2d736

[xemacs-hg @ 2002-06-23 09:54:35 by stephent] warning fixes <87bsa2qymn.fsf@tleepslib.sk.tsukuba.ac.jp> unicode improvements <87znxmpc96.fsf@tleepslib.sk.tsukuba.ac.jp>
author stephent
date Sun, 23 Jun 2002 09:54:41 +0000
parents 890f3cafe600
children 64f38afaab2d
files lisp/ChangeLog lisp/unicode.el src/ChangeLog src/ExternalShell.c src/editfns.c src/eldap.c src/unicode.c
diffstat 7 files changed, 178 insertions(+), 55 deletions(-) [+]
line wrap: on
line diff
--- a/lisp/ChangeLog	Sun Jun 23 09:26:37 2002 +0000
+++ b/lisp/ChangeLog	Sun Jun 23 09:54:41 2002 +0000
@@ -1,3 +1,7 @@
+2002-06-23  Stephen J. Turnbull  <stephen@xemacs.org>
+
+	* unicode.el (load-unicode-tables): Comment loading 8859-1.TXT.
+
 2002-06-20  Ben Wing  <ben@xemacs.org>
 
 	* mule/mule-msw-init-late.el:
--- a/lisp/unicode.el	Sun Jun 23 09:26:37 2002 +0000
+++ b/lisp/unicode.el	Sun Jun 23 09:54:41 2002 +0000
@@ -82,10 +82,17 @@
   "Initialize the Unicode translation tables for all standard charsets."
   (let ((parse-args
 	 '(("unicode/unicode-consortium"
+	    ;; Due to the braindamaged way Mule treats the ASCII and Control-1
+	    ;; charsets' types, trying to load them results in out-of-range
+	    ;; warnings at unicode.c:1439.  They're no-ops anyway, they're
+	    ;; hardwired in unicode.c (unicode_to_ichar, ichar_to_unicode).
+	    ;; ("8859-1.TXT" ascii #x00 #x7F #x0)
+	    ;; ("8859-1.TXT" control-1 #x80 #x9F #x-80)
+            ;; The 8859-1.TXT G1 assignments are half no-ops, hardwired in
+	    ;; unicode.c ichar_to_unicode, but not in unicode_to_ichar.
 	    ("8859-1.TXT" latin-iso8859-1 #xA0 #xFF #x-80)
 	    ;; "8859-10.TXT"
 	    ;; "8859-13.TXT"
-	    ;; "8859-14.TXT"
 	    ("8859-14.TXT" latin-iso8859-14 #xA0 #xFF #x-80)
 	    ("8859-15.TXT" latin-iso8859-15 #xA0 #xFF #x-80)
 	    ("8859-2.TXT" latin-iso8859-2 #xA0 #xFF #x-80)
--- a/src/ChangeLog	Sun Jun 23 09:26:37 2002 +0000
+++ b/src/ChangeLog	Sun Jun 23 09:54:41 2002 +0000
@@ -1,3 +1,28 @@
+2002-06-23  Stephen J. Turnbull  <stephen@xemacs.org>
+
+	* unicode.c: Improve top-level comments and many docstrings.
+
+	(Fparse_unicode_translation_table): Use right function name in error.
+
+	(unicode_to_ichar): Renamed from unicode_to_char.
+	(Funicode_to_char): 
+	(decode_unicode_char):
+	Use new name.
+
+	(add_charsets_to_precedence_list): Change the dynarr argument
+	instead of the global precedence list.
+
+	(Funicode_precedence_list): New function.
+	(Fset_language_unicode_precedence_list,
+	 Flanguage_unicode_precedence_list,
+	 Fset_default_unicode_precedence_list,
+	 Fdefault_unicode_precedence_list):
+	Docstrings refer to docstring of Funicode_precedence_list.
+
+	(set_unicode_conversion): Assert attempts to change Basic Latin
+	(Unicode BMP, row 00).
+	(Fset_unicode_conversion): Signal error on changes to Basic Latin.
+
 2002-06-23  Malcolm Purvis  <malcolmpurvis@optushome.com.au>
 
 	* dialog-gtk.c:
--- a/src/ExternalShell.c	Sun Jun 23 09:26:37 2002 +0000
+++ b/src/ExternalShell.c	Sun Jun 23 09:54:41 2002 +0000
@@ -611,8 +611,7 @@
 hack_event_masks_1 (Display *display, Window w, int this_window_propagate)
 {
   Window root, parent, *children;
-  unsigned int nchildren;
-  int i;
+  unsigned int nchildren, i;
 
   if (!XQueryTree (display, w, &root, &parent, &children, &nchildren))
     return;
--- a/src/editfns.c	Sun Jun 23 09:26:37 2002 +0000
+++ b/src/editfns.c	Sun Jun 23 09:54:41 2002 +0000
@@ -32,6 +32,7 @@
 #include "buffer.h"
 #include "casetab.h"
 #include "chartab.h"
+#include "commands.h"		/* for zmacs_region functions */
 #include "device.h"
 #include "events.h"             /* for EVENTP */
 #include "frame.h"
--- a/src/eldap.c	Sun Jun 23 09:26:37 2002 +0000
+++ b/src/eldap.c	Sun Jun 23 09:54:41 2002 +0000
@@ -35,6 +35,7 @@
 #include "opaque.h"
 #include "sysdep.h"
 #include "buffer.h"
+#include "process.h"		/* for report_process_error */
 
 #include <errno.h>
 
--- a/src/unicode.c	Sun Jun 23 09:26:37 2002 +0000
+++ b/src/unicode.c	Sun Jun 23 09:54:41 2002 +0000
@@ -27,10 +27,10 @@
    Written by Ben Wing <ben@xemacs.org>, June, 2001.
    Separated out into this file, August, 2001.
    Includes Unicode coding systems, some parts of which have been written
-   by someone else.
+   by someone else.  #### Morioka and Hayashi, I think.
 
    As of September 2001, the detection code is here and abstraction of the
-   detection system is finished.  the unicode detectors have been rewritten
+   detection system is finished.  The unicode detectors have been rewritten
    to include multiple levels of likelihood.
    */
 
@@ -47,7 +47,15 @@
    problem in that they can't handle two characters mapping to a
    single Unicode codepoint or vice-versa in a single charset table.
    It's not clear there is any way to handle this and still make the
-   sledgehammer routines useful. */
+   sledgehammer routines useful.
+
+   Inquiring Minds Want To Know Dept: does the above WARNING mean that
+   _if_ it happens, then it will signal error, or then it will do
+   something evil and unpredictable?  Signaling an error is OK: for
+   all national standards, the national to Unicode map is an inclusion
+   (1-to-1).  Any character set that does not behave that way is
+   broken according to the Unicode standard. */
+
 /* #define SLEDGEHAMMER_CHECK_UNICODE */
 
   /* We currently use the following format for tables:
@@ -153,6 +161,8 @@
 
 #ifdef MULE 
 
+/* #### Using ints for to_unicode is OK (as long as they are >= 32 bits).
+   However, shouldn't the shorts below be unsigned? */
 static int *to_unicode_blank_1;
 static int **to_unicode_blank_2;
 
@@ -302,6 +312,8 @@
   from_unicode_blank_4 = xnew_array (short ***, 256);
   for (i = 0; i < 256; i++)
     {
+      /* #### IMWTK: Why does using -1 here work? Simply because there are
+         no existing 96x96 charsets? */
       from_unicode_blank_1[i] = (short) -1;
       from_unicode_blank_2[i] = from_unicode_blank_1;
       from_unicode_blank_3[i] = from_unicode_blank_2;
@@ -312,6 +324,7 @@
   to_unicode_blank_2 = xnew_array (int *, 96);
   for (i = 0; i < 96; i++)
     {
+      /* Here -1 is guaranteed OK. */
       to_unicode_blank_1[i] = -1;
       to_unicode_blank_2[i] = to_unicode_blank_1;
     }
@@ -354,6 +367,8 @@
     }
 }
 
+/* Allocate and blank the tables.
+   Loading them up is done by parse-unicode-translation-table. */
 void
 init_charset_unicode_tables (Lisp_Object charset)
 {
@@ -783,8 +798,14 @@
 
   BREAKUP_ICHAR (chr, charset, c1, c2);
 
-  assert (!EQ (charset, Vcharset_ascii));
-  assert (!EQ (charset, Vcharset_control_1));
+  /* I tried an assert on code > 255 || chr == code, but that fails because
+     Mule gives many Latin characters separate code points for different
+     ISO 8859 coded character sets.  Obvious in hindsight.... */
+  assert (!EQ (charset, Vcharset_ascii) || chr == code);
+  assert (!EQ (charset, Vcharset_latin_iso8859_1) || chr == code);
+  assert (!EQ (charset, Vcharset_control_1) || chr == code);
+
+  /* This assert is needed because it is simply unimplemented. */
   assert (!EQ (charset, Vcharset_composite));
 
 #ifdef SLEDGEHAMMER_CHECK_UNICODE
@@ -919,6 +940,7 @@
   int c1, c2;
 
   type_checking_assert (valid_ichar_p (chr));
+  /* This shortcut depends on the representation of an Ichar, see text.c. */
   if (chr < 256)
     return (int) chr;
 
@@ -932,7 +954,7 @@
 }
 
 static Ichar
-unicode_to_char (int code, Lisp_Object_dynarr *charsets)
+unicode_to_ichar (int code, Lisp_Object_dynarr *charsets)
 {
   int u1, u2, u3, u4;
   int code_levels;
@@ -940,7 +962,10 @@
   int n = Dynarr_length (charsets);
 
   type_checking_assert (code >= 0);
-  if (code < 256)
+  /* This shortcut depends on the representation of an Ichar, see text.c.
+     Note that it may _not_ be extended to U+00A0 to U+00FF (many ISO 8859
+     coded character sets have points that map into that region). */
+  if (code < 0xA0)
     return (Ichar) code;
 
   BREAKUP_UNICODE_CODE (code, u4, u3, u2, u1, code_levels);
@@ -971,6 +996,10 @@
   return (Ichar) -1;
 }
 
+/* Add charsets to precedence list.
+   LIST must be a list of charsets.  Charsets which are in the list more
+   than once are given the precedence implied by their earliest appearance.
+   Later appearances are ignored. */
 static void
 add_charsets_to_precedence_list (Lisp_Object list, int *lbs,
 				 Lisp_Object_dynarr *dynarr)
@@ -982,13 +1011,18 @@
 	int lb = XCHARSET_LEADING_BYTE (charset);
 	if (lbs[lb - MIN_LEADING_BYTE] == 0)
 	  {
-	    Dynarr_add (unicode_precedence_dynarr, charset);
+	    Dynarr_add (dynarr, charset);
 	    lbs[lb - MIN_LEADING_BYTE] = 1;
 	  }
       }
   }
 }
 
+/* Rebuild the charset precedence array.
+   The "charsets preferred for the current language" get highest precedence,
+   followed by the "charsets preferred by default", ordered as in
+   Vlanguage_unicode_precedence_list and Vdefault_unicode_precedence_list,
+   respectively.  All remaining charsets follow in an arbitrary order. */
 void
 recalculate_unicode_precedence (void)
 {
@@ -1016,19 +1050,50 @@
     }
 }
 
-DEFUN ("set-language-unicode-precedence-list",
-       Fset_language_unicode_precedence_list,
-       1, 1, 0, /*
-Set the language-specific precedence list used for Unicode decoding.
-This is a list of charsets, which are consulted in order for a translation
-matching a given Unicode character.  If no matches are found, the charsets
-in the default precedence list (see `set-default-unicode-precedence-list')
-are consulted, and then all remaining charsets, in some arbitrary order.
+DEFUN ("unicode-precedence-list", 
+       Funicode_precedence_list,
+       0, 0, 0, /*
+Return the precedence order among charsets used for Unicode decoding.
+
+Value is a list of charsets, which are searched in order for a translation
+matching a given Unicode character.
+
+The highest precedence is given to the language-specific precedence list of
+charsets, defined by `set-language-unicode-precedence-list'.  These are
+followed by charsets in the default precedence list, defined by
+`set-default-unicode-precedence-list'.  Charsets occurring multiple times are
+given precedence according to their first occurrence in either list.  These
+are followed by the remaining charsets, in some arbitrary order.
 
 The language-specific precedence list is meant to be set as part of the
 language environment initialization; the default precedence list is meant
 to be set by the user.
 */
+       ())
+{
+  int i;
+  Lisp_Object list = Qnil;
+
+  for (i = Dynarr_length (unicode_precedence_dynarr) - 1; i >= 0; i--)
+    list = Fcons (Dynarr_at (unicode_precedence_dynarr, i), list);
+  return list;
+}
+
+
+/* #### This interface is wrong.  Cyrillic users and Chinese users are going
+   to have varying opinions about whether ISO Cyrillic, KOI8-R, or Windows
+   1251 should take precedence, and whether Big Five or CNS should take
+   precedence, respectively.  This means that users are sometimes going to
+   want to set Vlanguage_unicode_precedence_list.
+   Furthermore, this should be language-local (buffer-local would be a
+   reasonable approximation). */
+DEFUN ("set-language-unicode-precedence-list",
+       Fset_language_unicode_precedence_list,
+       1, 1, 0, /*
+Set the language-specific precedence of charsets in Unicode decoding.
+LIST is a list of charsets.
+See `unicode-precedence-list' for more information.
+*/
        (list))
 {
   {
@@ -1045,7 +1110,7 @@
        Flanguage_unicode_precedence_list,
        0, 0, 0, /*
 Return the language-specific precedence list used for Unicode decoding.
-See `set-language-unicode-precedence-list' for more information.
+See `unicode-precedence-list' for more information.
 */
        ())
 {
@@ -1056,8 +1121,8 @@
        Fset_default_unicode_precedence_list,
        1, 1, 0, /*
 Set the default precedence list used for Unicode decoding.
-This is meant to be set by the user.  See
-`set-language-unicode-precedence-list' for more information.
+This is intended to be set by the user.  See
+`unicode-precedence-list' for more information.
 */
        (list))
 {
@@ -1075,7 +1140,7 @@
        Fdefault_unicode_precedence_list,
        0, 0, 0, /*
 Return the default precedence list used for Unicode decoding.
-See `set-language-unicode-precedence-list' for more information.
+See `unicode-precedence-list' for more information.
 */
        ())
 {
@@ -1085,29 +1150,48 @@
 DEFUN ("set-unicode-conversion", Fset_unicode_conversion,
        2, 2, 0, /*
 Add conversion information between Unicode codepoints and characters.
+Conversions for U+0000 to U+00FF are hardwired to ASCII, Control-1, and
+Latin-1.  Attempts to set these values will raise an error.
+
 CHARACTER is one of the following:
 
 -- A character (in which case CODE must be a non-negative integer; values
    above 2^20 - 1 are allowed for the purpose of specifying private
-   characters, but will cause errors when converted to utf-16)
+   characters, but are illegal in standard Unicode---they will cause errors
+   when converted to utf-16)
 -- A vector of characters (in which case CODE must be a vector of integers
    of the same length)
 */
        (character, code))
 {
   Lisp_Object charset;
+  int ichar, unicode;
 
   CHECK_CHAR (character);
   CHECK_NATNUM (code);
 
-  charset = ichar_charset (XCHAR (character));
-  if (EQ (charset, Vcharset_ascii) ||
-      EQ (charset, Vcharset_control_1) ||
-      EQ (charset, Vcharset_composite))
-    signal_error (Qinvalid_argument, "Cannot set Unicode translation for ASCII, Control-1 or Composite chars",
+  unicode = XINT (code);
+  ichar = XCHAR (character);
+  charset = ichar_charset (ichar);
+
+  /* The translations of ASCII, Control-1, and Latin-1 code points are
+     hard-coded in ichar_to_unicode and unicode_to_ichar.
+
+     Checking unicode < 256 && ichar != unicode is wrong because Mule gives
+     many Latin characters code points in a few different character sets. */
+  if ((EQ (charset, Vcharset_ascii) ||
+       EQ (charset, Vcharset_control_1) ||
+       EQ (charset, Vcharset_latin_iso8859_1))
+      && unicode != ichar)
+    signal_error (Qinvalid_argument, "Can't change Unicode translation for ASCII, Control-1 or Latin-1 char",
 		  character);
 
-  set_unicode_conversion (XCHAR (character), XINT (code));
+  /* #### Composite characters are not properly implemented yet. */
+  if (EQ (charset, Vcharset_composite))
+    signal_error (Qinvalid_argument, "Can't set Unicode translation for Composite char",
+		  character);
+
+  set_unicode_conversion (ichar, unicode);
   return Qnil;
 }
 
@@ -1115,8 +1199,8 @@
 
 DEFUN ("char-to-unicode", Fchar_to_unicode, 1, 1, 0, /*
 Convert character to Unicode codepoint.
-When there is no international support (i.e. MULE is not defined),
-this function simply does `char-to-int'.
+When there is no international support (i.e. the 'mule feature is not
+present), this function simply does `char-to-int'.
 */
        (character))
 {
@@ -1136,9 +1220,9 @@
 Otherwise, the default ordering of all charsets will be given (see
 `set-unicode-charset-precedence').
 
-When there is no international support (i.e. MULE is not defined),
-this function simply does `int-to-char' and ignores the CHARSETS
-argument..
+When there is no international support (i.e. the 'mule feature is not
+present), this function simply does `int-to-char' and ignores the CHARSETS
+argument.
 */
        (code, charsets))
 {
@@ -1156,7 +1240,7 @@
 
   if (NILP (charsets))
     {
-      Ichar ret = unicode_to_char (c, unicode_precedence_dynarr);
+      Ichar ret = unicode_to_ichar (c, unicode_precedence_dynarr);
       if (ret == -1)
 	return Qnil;
       return make_char (ret);
@@ -1166,7 +1250,7 @@
   memset (lbs, 0, NUM_LEADING_BYTES * sizeof (int));
   add_charsets_to_precedence_list (charsets, lbs, dyn);
   {
-    Ichar ret = unicode_to_char (c, unicode_precedence_dynarr);
+    Ichar ret = unicode_to_ichar (c, dyn);
     Dynarr_free (dyn);
     if (ret == -1)
       return Qnil;
@@ -1188,36 +1272,37 @@
   return Qnil;
 }
 
+/* #### shouldn't this interface be called load-unicode-mapping-table
+   for consistency with Unicode Consortium terminology? */
 DEFUN ("parse-unicode-translation-table", Fparse_unicode_translation_table,
        2, 6, 0, /*
-Parse Unicode translation data in FILENAME for CHARSET.
+Load Unicode tables with the Unicode mapping data in FILENAME for CHARSET.
 Data is text, in the form of one translation per line -- charset
 codepoint followed by Unicode codepoint.  Numbers are decimal or hex
 \(preceded by 0x).  Comments are marked with a #.  Charset codepoints
-for two-dimensional charsets should have the first octet stored in the
+for two-dimensional charsets have the first octet stored in the
 high 8 bits of the hex number and the second in the low 8 bits.
 
 If START and END are given, only charset codepoints within the given
-range will be processed.  If OFFSET is given, that value will be added
-to all charset codepoints in the file to obtain the internal charset
-codepoint.  START and END apply to the codepoints in the file, before
-OFFSET is applied.
+range will be processed.  (START and END apply to the codepoints in the
+file, before OFFSET is applied.)
 
-\(Note that, as usual, we assume that octets are in the range 32 to
-127 or 33 to 126.  If you have a table in kuten form, with octets in
-the range 1 to 94, you will have to use an offset of 5140,
-i.e. 0x2020.)
+If OFFSET is given, that value will be added to all charset codepoints
+in the file to obtain the internal charset codepoint.  \(We assume
+that octets in the table are in the range 33 to 126 or 32 to 127.  If
+you have a table in ku-ten form, with octets in the range 1 to 94, you
+will have to use an offset of 5140, i.e. 0x2020.)
 
 FLAGS, if specified, control further how the tables are interpreted
-and are used to special-case certain known table weirdnesses in the
-Unicode tables:
+and are used to special-case certain known format deviations in the
+Unicode tables or in the charset:
 
 `ignore-first-column'
-  Exactly as it sounds.  The JIS X 0208 tables have 3 columns of data instead
-  of 2; the first is the Shift-JIS codepoint.
+  The JIS X 0208 tables have 3 columns of data instead of 2.  The first
+  column contains the Shift-JIS codepoint, which we ignore.
 `big5'
-  The charset codepoint is a Big Five codepoint; convert it to the
-  proper hacked-up codepoint in `chinese-big5-1' or `chinese-big5-2'.
+  The charset codepoints are Big Five codepoints; convert each to the
+  hacked-up Mule codepoint in `chinese-big5-1' or `chinese-big5-2'.
 */
      (filename, charset, start, end, offset, flags))
 {
@@ -1259,7 +1344,7 @@
 	  big5 = 1;
 	else
 	  invalid_constant
-	    ("Unrecognized `parse-unicode-table' flag", elt);
+	    ("Unrecognized `parse-unicode-translation-table' flag", elt);
       }
   }
 
@@ -1434,7 +1519,7 @@
   else
     {
 #ifdef MULE
-      Ichar chr = unicode_to_char (ch, unicode_precedence_dynarr);
+      Ichar chr = unicode_to_ichar (ch, unicode_precedence_dynarr);
 
       if (chr != -1)
 	{
@@ -2143,6 +2228,7 @@
 syms_of_unicode (void)
 {
 #ifdef MULE
+  DEFSUBR (Funicode_precedence_list);
   DEFSUBR (Fset_language_unicode_precedence_list);
   DEFSUBR (Flanguage_unicode_precedence_list);
   DEFSUBR (Fset_default_unicode_precedence_list);