changeset 3439:d1754e7f0cea

[xemacs-hg @ 2006-06-03 17:50:39 by aidan] Just-in-time Unicode code point support.
author aidan
date Sat, 03 Jun 2006 17:51:06 +0000
parents 14fbcab7c67b
children 4d0cab3a989f
files lisp/ChangeLog lisp/mule/mule-ccl.el lisp/unicode.el lisp/x-init.el man/ChangeLog man/lispref/mule.texi src/ChangeLog src/charset.h src/event-xlike-inc.c src/general-slots.h src/lread.c src/mule-ccl.c src/mule-charset.c src/mule-coding.c src/redisplay-x.c src/unicode.c tests/ChangeLog tests/automated/mule-tests.el
diffstat 18 files changed, 819 insertions(+), 123 deletions(-) [+]
line wrap: on
line diff
--- a/lisp/ChangeLog	Fri Jun 02 22:18:08 2006 +0000
+++ b/lisp/ChangeLog	Sat Jun 03 17:51:06 2006 +0000
@@ -1,3 +1,21 @@
+2006-06-03  Aidan Kehoe  <kehoea@parhasard.net>
+
+	* mule/mule-ccl.el:
+	"X Emacs" -> "XEmacs"
+	* mule/mule-ccl.el (ccl-compile-mule-to-unicode): New.
+	* mule/mule-ccl.el (ccl-compile-unicode-to-mule): New.
+	* mule/mule-ccl.el (ccl-dump-mule-to-unicode): New.
+	* mule/mule-ccl.el (ccl-dump-unicode-to-mule): New.
+	* mule/mule-ccl.el (define-ccl-program):
+	Add two new CCL commands, and commands to describe them; document
+	them. 
+
+2006-06-03  Aidan Kehoe  <kehoea@parhasard.net>
+
+	* unicode.el:
+	* unicode.el (featurep):
+	* x-init.el (x-initialize-keyboard):
+
 2006-06-03  Adrian Aichner  <adrian@xemacs.org>
 
 	* package-get.el (package-get-download-sites): Welcome
--- a/lisp/mule/mule-ccl.el	Fri Jun 02 22:18:08 2006 +0000
+++ b/lisp/mule/mule-ccl.el	Sat Jun 03 17:51:06 2006 +0000
@@ -5,20 +5,20 @@
 
 ;; Keywords: CCL, mule, multilingual, character set, coding-system
 
-;; This file is part of X Emacs.
+;; This file is part of XEmacs.
 
-;; GNU Emacs is free software; you can redistribute it and/or modify
+;; XEmacs is free software; you can redistribute it and/or modify
 ;; it under the terms of the GNU General Public License as published by
 ;; the Free Software Foundation; either version 2, or (at your option)
 ;; any later version.
 
-;; GNU Emacs is distributed in the hope that it will be useful,
+;; XEmacs is distributed in the hope that it will be useful,
 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 ;; GNU General Public License for more details.
 
 ;; You should have received a copy of the GNU General Public License
-;; along with GNU Emacs; see the file COPYING.  If not, write to the
+;; along with XEmacs; see the file COPYING.  If not, write to the
 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 ;; Boston, MA 02111-1307, USA.
 
@@ -48,7 +48,7 @@
   [if branch loop break repeat write-repeat write-read-repeat
       read read-if read-branch write call end
       read-multibyte-character write-multibyte-character
-      translate-character
+      translate-character mule-to-unicode unicode-to-mule
       iterate-multiple-map map-multiple map-single]
   "Vector of CCL commands (symbols).")
 
@@ -100,7 +100,9 @@
    write-multibyte-character
    translate-character
    translate-character-const-tbl
-   nil nil nil nil nil nil nil nil nil nil nil nil ; 0x04-0x0f
+   mule-to-unicode
+   unicode-to-mule
+   nil nil nil nil nil nil nil nil nil nil ; 0x06-0x0f
    iterate-multiple-map
    map-multiple
    map-single
@@ -830,6 +832,29 @@
 	   (ccl-embed-extended-command 'translate-character rrr RRR Rrr))))
   nil)
 
+;; Compile mule-to-unicode
+(defun ccl-compile-mule-to-unicode (cmd)
+  (if (/= (length cmd) 3)
+      (error "CCL: Invalid number of arguments: %s" cmd))
+  (let ((RRR (nth 1 cmd))
+	(rrr (nth 2 cmd)))
+    (ccl-check-register RRR cmd)
+    (ccl-check-register rrr cmd)
+    (ccl-embed-extended-command 'mule-to-unicode RRR rrr 0))
+  nil)
+
+;; Given a Unicode code point in register rrr, write the charset ID of the
+;; corresponding character in RRR, and the Mule-CCL form of its code in rrr.
+(defun ccl-compile-unicode-to-mule (cmd)
+  (if (/= (length cmd) 3)
+      (error "CCL: Invalid number of arguments: %s" cmd))
+  (let ((rrr (nth 1 cmd))
+	(RRR (nth 2 cmd)))
+    (ccl-check-register rrr cmd)
+    (ccl-check-register RRR cmd)
+    (ccl-embed-extended-command 'unicode-to-mule rrr RRR 0))
+  nil)
+
 (defun ccl-compile-iterate-multiple-map (cmd)
   (ccl-compile-multiple-map-function 'iterate-multiple-map cmd)
   nil)
@@ -1188,6 +1213,12 @@
   (let ((tbl (ccl-get-next-code)))
     (insert (format "translation table(%S) r%d r%d\n" tbl RRR rrr))))
 
+(defun ccl-dump-mule-to-unicode (rrr RRR Rrr)
+  (insert (format "change chars in r%d and r%d to unicode\n" RRR rrr)))
+
+(defun ccl-dump-unicode-to-mule (rrr RRR Rrr)
+  (insert (format "converter UCS code %d to a Mule char\n" rrr)))
+
 (defun ccl-dump-iterate-multiple-map (rrr RRR Rrr)
   (let ((notbl (ccl-get-next-code))
 	(i 0) id)
@@ -1359,9 +1390,13 @@
 ;; Call CCL program whose name is ccl-program-name.
 CALL := (call ccl-program-name)
 
-TRANSLATE := ;; Not implemented under XEmacs.
-	(translate-character REG(table) REG(charset) REG(codepoint))
-	| (translate-character SYMBOL REG(charset) REG(codepoint))
+TRANSLATE := ;; Not implemented under XEmacs, except mule-to-unicode and
+	     ;; unicode-to-mule.
+	     (translate-character REG(table) REG(charset) REG(codepoint)) 
+	     | (translate-character SYMBOL REG(charset) REG(codepoint)) 
+	     | (mule-to-unicode REG(charset) REG(codepoint))
+	     | (unicode-to-mule REG(unicode,code) REG(CHARSET))
+
 MAP :=
      (iterate-multiple-map REG REG MAP-IDs)
      | (map-multiple REG REG (MAP-SET))
@@ -1373,8 +1408,8 @@
 ;; Terminate the CCL program.
 END := (end)
 
-;; CCL registers. These can contain any integer value.  As r7 is used by CCL
-;; interpreter itself, its value change unexpectedly.
+;; CCL registers. These can contain any integer value.  As r7 is used by the
+;; CCL interpreter itself, its value can change unexpectedly.
 REG := r0 | r1 | r2 | r3 | r4 | r5 | r6 | r7
 
 ARG := REG | INT-OR-CHAR
--- a/lisp/unicode.el	Fri Jun 02 22:18:08 2006 +0000
+++ b/lisp/unicode.el	Sat Jun 03 17:51:06 2006 +0000
@@ -318,6 +318,38 @@
 	  "Sorry, encode-char doesn't yet support anything but the UCS.  ")
   (char-to-unicode char))
 
+(when (featurep 'mule)
+  ;; This CCL program is used for displaying the fallback UCS character set,
+  ;; and can be repurposed to lao and the IPA, all going well.
+  ;;
+  ;; define-ccl-program is available after mule-ccl is loaded, much later
+  ;; than this file in the build process. The below is the result of 
+  ;;
+  ;;   (macroexpand 
+  ;;    '(define-ccl-program ccl-encode-to-ucs-2
+  ;;      `(1
+  ;;        ((r1 = (r1 << 8))
+  ;; 	     (r1 = (r1 | r2))
+  ;; 	     (mule-to-unicode r0 r1)
+  ;; 	     (r1 = (r0 >> 8))
+  ;; 	     (r2 = (r0 & 255))))
+  ;;      "CCL program to transform Mule characters to UCS-2."))
+  ;;
+  ;; and it should occasionally be confirmed that the correspondence still
+  ;; holds.
+
+  (let ((prog [1 10 131127 8 98872 65823 147513 8 82009 255 22]))
+    (defconst ccl-encode-to-ucs-2 prog 
+      "CCL program to transform Mule characters to UCS-2.")
+    (put (quote ccl-encode-to-ucs-2) (quote ccl-program-idx) 
+	 (register-ccl-program (quote ccl-encode-to-ucs-2) prog)) nil))
+
+;; Won't do this just yet, though. 
+;; (set-charset-registry 'lao "iso10646-1")
+;; (set-charset-ccl-program 'lao 'ccl-encode-to-ucs-2)
+;; (set-charset-registry 'ipa "iso10646-1")
+;; (set-charset-ccl-program 'ipa 'ccl-encode-to-ucs-2)
+
 ;; #### UTF-7 is not yet implemented, and it's tricky to do.  There's
 ;; an implementation in appendix A.1 of the Unicode Standard, Version
 ;; 2.0, but I don't know its licensing characteristics.
--- a/lisp/x-init.el	Fri Jun 02 22:18:08 2006 +0000
+++ b/lisp/x-init.el	Sat Jun 03 17:51:06 2006 +0000
@@ -247,7 +247,7 @@
       (when (and (not (get (intern sym-string) 'character-of-keysym))
 		 (string-match "^U[0-9A-F]+$" sym-string))
 	(pushnew (concat sym-string " ") unknown-code-points :test 'equal)))
-    (when unknown-code-points
+    (when (and (featurep 'mule) unknown-code-points)
       (lwarn 'key-mapping 'info
       "Undefined Unicode key mappings.
 Your keyboard has, among many others, the following keysyms defined:
--- a/man/ChangeLog	Fri Jun 02 22:18:08 2006 +0000
+++ b/man/ChangeLog	Sat Jun 03 17:51:06 2006 +0000
@@ -1,3 +1,10 @@
+2006-06-03  Aidan Kehoe  <kehoea@parhasard.net>
+
+	* lispref/mule.texi (CCL Syntax):
+	* lispref/mule.texi (CCL Statements):
+	Describe the mule-to-unicode and unicode-to-mule statements;
+	rename the section they are described in.
+	
 2006-05-17  Stephen J. Turnbull  <stephen@xemacs.org>
 
 	* xemacs-faq.texi (Q2.2.3): New node.
--- a/man/lispref/mule.texi	Fri Jun 02 22:18:08 2006 +0000
+++ b/man/lispref/mule.texi	Sat Jun 03 17:51:06 2006 +0000
@@ -1825,6 +1825,15 @@
         | (write INT-OR-CHAR) | (write string) | (write REG ARRAY)
         | string
 CALL := (call ccl-program-name)
+
+
+TRANSLATE := ;; Not implemented under XEmacs, except mule-to-unicode and
+	     ;; unicode-to-mule.
+	     (translate-character REG(table) REG(charset) REG(codepoint)) 
+	     | (translate-character SYMBOL REG(charset) REG(codepoint)) 
+	     | (mule-to-unicode REG(charset) REG(codepoint))
+	     | (unicode-to-mule REG(unicode,code) REG(CHARSET))
+
 END := (end)
 
 REG := r0 | r1 | r2 | r3 | r4 | r5 | r6 | r7
@@ -1845,7 +1854,8 @@
 
   The Emacs Code Conversion Language provides the following statement
 types: @dfn{set}, @dfn{if}, @dfn{branch}, @dfn{loop}, @dfn{repeat},
-@dfn{break}, @dfn{read}, @dfn{write}, @dfn{call}, and @dfn{end}.
+@dfn{break}, @dfn{read}, @dfn{write}, @dfn{call}, @dfn{translate} and
+@dfn{end}.
 
 @heading Set statement:
 
@@ -1933,12 +1943,32 @@
 @code{write} and @code{read} statements for the semantics of the I/O
 operations for each type of argument.
 
-@heading Other control statements:
+@heading Other statements:
 
   The @dfn{call} statement, written @samp{(call @var{ccl-program-name})},
 executes a CCL program as a subroutine.  It does not return a value to
 the caller, but can modify the register status.
 
+  The @dfn{mule-to-unicode} statement translates an XEmacs character into a
+UCS code point, using U+FFFD REPLACEMENT CHARACTER if the given XEmacs
+character has no known corresponding code point.  It takes two
+arguments; the first is a register in which is stored the character set
+ID of the character to be translated, and into which the UCS code is
+stored.  The second is a register which stores the XEmacs code of the
+character in question; if it is from a multidimensional character set,
+like most of the East Asian national sets, it's stored as @samp{((c1 <<
+8) & c2)}, where @samp{c1} is the first code, and @samp{c2} the second.
+(That is, as a single integer, the high-order eight bits of which encode
+the first position code, and the low order bits of which encode the
+second.)
+
+  The @dfn{unicode-to-mule} statement translates a Unicode code point
+(an integer) into an XEmacs character.  Its first argument is a register
+containing the UCS code point; the code for the correspond character
+will be written into this register, in the same format as for
+@samp{mule-to-unicode} The second argument is a register into which will
+be written the character set ID of the converted character. 
+
   The @dfn{end} statement, written @samp{(end)}, terminates the CCL
 program successfully, and returns to caller (which may be a CCL
 program).  It does not alter the status of the registers.
--- a/src/ChangeLog	Fri Jun 02 22:18:08 2006 +0000
+++ b/src/ChangeLog	Sat Jun 03 17:51:06 2006 +0000
@@ -1,3 +1,90 @@
+2006-06-03  Aidan Kehoe  <kehoea@parhasard.net>
+
+	* charset.h:
+	* charset.h (struct Lisp_Charset):
+	* charset.h (CHARSET_ENCODE_AS_UTF_8):
+	* charset.h (XCHARSET_ENCODE_AS_UTF_8):
+	Add a flag `encode-as-utf-8' to the Mule charset structure; if
+	set, it's an indication to ISO 2022-oriented coding systems that
+	the characters of that charset should be encoded using the ISO-IR
+	196 UTF-8 escape syntax, since they're not members of any other
+	well-known character set we're aware of.
+
+	Make enum unicode_type, encode_unicode_char and Funicode_to_char
+	available outside of unicode.c
+	
+	* lread.c:
+	* event-xlike-inc.c:
+	Use the charset.h declaration of Funicode_to_char, don't declare
+	it ourselves. 
+
+	My XFree86 installation has taken to passing me ASCII characters
+	using the Unicode keysyms; accept them too. 
+	
+	* general-slots.h:
+	Make `ccl-program' and `encode-as-utf-8' available as symbols
+	generally. 
+
+	* mule-ccl.c:
+	Add CCL_MuleToUnicode, CCL_UnicodeToMule, implement them, enable
+	and debug CCL_MAKE_CHAR, have CCL_WriteMultibyteChar2 segfault
+	less, fix some grammar. 
+
+	* mule-charset.c (make_charset):
+	* mule-charset.c (Fmake_charset):
+	* mule-charset.c (Fcharset_property):
+	* mule-charset.c (complex_vars_of_mule_charset):
+	Require the encode_as_utf_8 property when calling make_charset ();
+	accept it when creating a charset from Lisp in Fmake_charset. 
+
+	* mule-coding.c:
+	* mule-coding.c (dynarr_add_2022_one_dimension):
+	* mule-coding.c (dynarr_add_2022_two_dimensions):
+	Add two convenience functions for iso2022_decode, to abstract out
+	writing UTF-8 a little. 
+	
+	* mule-coding.c (enum iso_esc_flag):
+	Add one more state to reflect the existence of the UTF-8 escape. 
+
+	* mule-coding.c (struct iso2022_coding_stream):
+	Add a counter variable to the state to permit handling
+	variable-length UTF-8. 
+	
+	* mule-coding.c (parse_iso2022_esc):
+	Update the function to work with ISO_STATE_UTF_8; only the ESC % @
+	escape is processed in that state, everything else is ignored and
+	passed through by the error handler. 
+	
+	* mule-coding.c (iso2022_decode):
+	* mule-coding.c (iso2022_designate):
+	* mule-coding.c (iso2022_encode):
+	Handle the UTF-8 escape sequences in reading and in writing ISO
+	2022. 
+	
+	* redisplay-x.c (separate_textual_runs):
+	Add a comment to the effect that the dimension stuff breaks when
+	using CCL programs and registries to map to a bigger charset. 
+	
+	* unicode.c:
+	Add support for creating new characters on the fly as unknown
+	Unicode code points are encountered. 
+
+	* unicode.c (get_free_codepoint): New.
+	* unicode.c (unicode_to_ichar): Reworked to create new code points
+	on the fly. 
+	* unicode.c (Funicode_to_char): Update the docstring. 
+	* unicode.c (struct unicode_coding_system):
+	Move enum unicode_type into charset.h. 
+
+	* unicode.c (encode_unicode_char):
+	encode_unicode_char isn't static any longer, mule-coding.c uses
+	it. 
+	* unicode.c (syms_of_unicode):
+	Make a couple of symbols available to unicode.c
+	* unicode.c (vars_of_unicode):
+	Tell the garbage collector about current_jit_charset, initialise
+	it. 
+
 2006-05-25  Stephen J. Turnbull  <stephen@xemacs.org>
 
 	* objects-x.c (x_find_charset_font): Keep local names local; wrap
--- a/src/charset.h	Fri Jun 02 22:18:08 2006 +0000
+++ b/src/charset.h	Sat Jun 03 17:51:06 2006 +0000
@@ -229,6 +229,11 @@
   /* Which half of font to be used to display this character set */
   int graphic;
 
+  /* If set, this charset should be written out in ISO-2022-based coding
+     systems using the escape sequence for UTF-8, not using our internal
+     representation and the associated real ISO 2022 designation. */
+  unsigned int encode_as_utf_8 :1;
+
   /* If set, this is a "temporary" charset created when we encounter
      an unknown final.  This is so that we can successfully compile
      and load such files.  We allow a real charset to be created on top
@@ -261,6 +266,7 @@
 #define CHARSET_REP_BYTES(cs)	 ((cs)->rep_bytes)
 #define CHARSET_COLUMNS(cs)	 ((cs)->columns)
 #define CHARSET_GRAPHIC(cs)	 ((cs)->graphic)
+#define CHARSET_ENCODE_AS_UTF_8(cs)	 ((cs)->encode_as_utf_8)
 #define CHARSET_TYPE(cs)	 ((cs)->type)
 #define CHARSET_DIRECTION(cs)	 ((cs)->direction)
 #define CHARSET_FINAL(cs)	 ((cs)->final)
@@ -284,6 +290,7 @@
 #define XCHARSET_REP_BYTES(cs)	  CHARSET_REP_BYTES    (XCHARSET (cs))
 #define XCHARSET_COLUMNS(cs)	  CHARSET_COLUMNS      (XCHARSET (cs))
 #define XCHARSET_GRAPHIC(cs)      CHARSET_GRAPHIC      (XCHARSET (cs))
+#define XCHARSET_ENCODE_AS_UTF_8(cs) CHARSET_ENCODE_AS_UTF_8 (XCHARSET (cs))
 #define XCHARSET_TYPE(cs)	  CHARSET_TYPE         (XCHARSET (cs))
 #define XCHARSET_DIRECTION(cs)	  CHARSET_DIRECTION    (XCHARSET (cs))
 #define XCHARSET_FINAL(cs)	  CHARSET_FINAL        (XCHARSET (cs))
@@ -547,6 +554,25 @@
 void get_charset_limits (Lisp_Object charset, int *low, int *high);
 int ichar_to_unicode (Ichar chr);
 
+EXFUN (Fcharset_name, 1);
+
 #endif /* MULE */
 
+/* ISO 10646 UTF-16, UCS-4, UTF-8, UTF-7, etc. */
+
+enum unicode_type
+{
+  UNICODE_UTF_16,
+  UNICODE_UTF_8,
+  UNICODE_UTF_7,
+  UNICODE_UCS_4
+};
+
+void encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h,
+			  int USED_IF_MULE (l), unsigned_char_dynarr *dst,
+			  enum unicode_type type, unsigned int little_endian);
+
+EXFUN (Funicode_to_char, 2);
+EXFUN (Fchar_to_unicode, 1); 
+
 #endif /* INCLUDED_charset_h_ */
--- a/src/event-xlike-inc.c	Fri Jun 02 22:18:08 2006 +0000
+++ b/src/event-xlike-inc.c	Sat Jun 03 17:51:06 2006 +0000
@@ -27,8 +27,6 @@
    included here, not in event-xlike.c.  However, event-xlike.c is always
    X-specific, whereas the following code isn't, in the GTK case. */
 
-EXFUN (Funicode_to_char, 2);  /* In unicode.c.  */
-
 static int
 #ifdef THIS_IS_GTK
 emacs_gtk_event_pending_p (int how_many)
@@ -503,9 +501,9 @@
      and only those should correspond directly to Unicode code points, in
      the range #x100-#x10FFFF; actual implementations can have the Latin 1
      code points do the same thing with keysyms
-     #x010000A0-#x01000100. */
+     #x01000000-#x01000100. */
 
-  if (keysym >= 0x010000A0 && keysym <= 0x0110FFFF)
+  if (keysym >= 0x01000000 && keysym <= 0x0110FFFF)
     return Funicode_to_char (make_int(keysym & 0xffffff), Qnil);
 
   if ((keysym & 0xff) < 0xa0)
--- a/src/general-slots.h	Fri Jun 02 22:18:08 2006 +0000
+++ b/src/general-slots.h	Sat Jun 03 17:51:06 2006 +0000
@@ -73,6 +73,7 @@
 SYMBOL_KEYWORD (Q_callback_ex);
 SYMBOL (Qcancel);
 SYMBOL (Qcategory);
+SYMBOL (Qccl_program);
 SYMBOL (Qcenter);
 SYMBOL (Qchain);
 SYMBOL (Qchange);
@@ -115,6 +116,7 @@
 SYMBOL (Qdynarr_overhead);
 SYMBOL (Qemergency);
 SYMBOL (Qempty);
+SYMBOL (Qencode_as_utf_8);
 SYMBOL (Qeq);
 SYMBOL (Qeql);
 SYMBOL (Qequal);
--- a/src/lread.c	Fri Jun 02 22:18:08 2006 +0000
+++ b/src/lread.c	Sat Jun 03 17:51:06 2006 +0000
@@ -34,6 +34,7 @@
 #include "lstream.h"
 #include "opaque.h"
 #include "profile.h"
+#include "charset.h"	/* For Funicode_to_char. */
 
 #include "sysfile.h"
 #include "sysfloat.h"
@@ -208,8 +209,6 @@
 static int locate_file_open_or_access_file (Ibyte *fn, int access_mode);
 EXFUN (Fread_from_string, 3);
 
-EXFUN (Funicode_to_char, 2);  /* In unicode.c.  */
-
 /* When errors are signaled, the actual readcharfun should not be used
    as an argument if it is an lstream, so that lstreams don't escape
    to the Lisp level.  */
--- a/src/mule-ccl.c	Fri Jun 02 22:18:08 2006 +0000
+++ b/src/mule-ccl.c	Sat Jun 03 17:51:06 2006 +0000
@@ -461,6 +461,16 @@
 					       1:ExtendedCOMMNDRrrRRRrrrXXXXX
 					       2:ARGUMENT(Translation Table ID)
 					    */
+/* Translate a character whose code point is reg[rrr] and charset ID is
+   reg[RRR], into its Unicode code point, which will be written into
+   reg[rrr]. */
+
+#define CCL_MuleToUnicode	0x04 
+
+/* Translate a Unicode code point, in reg[rrr], into a Mule character,
+   writing the charset ID into reg[RRR] and the code point into reg[Rrr]. */
+
+#define CCL_UnicodeToMule	0x05 
 
 /* Iterate looking up MAPs for reg[rrr] starting from the Nth (N =
    reg[RRR]) MAP until some value is found.
@@ -577,7 +587,6 @@
 					 ...
 					 N:SEPARATOR_z (< 0)
 				      */
-
 #define MAX_MAP_SET_LEVEL 30
 
 typedef struct
@@ -837,26 +846,41 @@
    CODE to that invalid byte.  */
 
 /* On XEmacs, TranslateCharacter is not supported.  Thus, this
-   macro is not used.  */
-#if 0
+   macro is only used in the MuleToUnicode transformation.  */
 #define CCL_MAKE_CHAR(charset, code, c)				\
   do {								\
-    if ((charset) == CHARSET_ASCII)				\
-      (c) = (code) & 0xFF;						\
-    else if (CHARSET_DEFINED_P (charset)			\
-	     && ((code) & 0x7F) >= 32				\
-	     && ((code) < 256 || ((code >> 7) & 0x7F) >= 32))	\
+    if ((charset) == LEADING_BYTE_ASCII)			\
+      {								\
+	c = (code) & 0xFF;					\
+      }								\
+    else if ((charset) == LEADING_BYTE_CONTROL_1)		\
+      {								\
+	c = ((code) & 0xFF) - 0xA0;				\
+      }								\
+    else if (!NILP(charset_by_leading_byte(charset))		\
+	     && ((code) >= 32)					\
+	     && ((code) < 256 || ((code >> 8) & 0x7F) >= 32))	\
       {								\
-	int c1 = (code) & 0x7F, c2 = 0;				\
+	int c1, c2 = 0;						\
 								\
-	if ((code) >= 256)					\
-	  c2 = c1, c1 = ((code) >> 7) & 0x7F;			\
-	(c) = make_ichar (charset, c1, c2);			\
+	if ((code) < 256)					\
+	  {							\
+	    c1 = (code) & 0x7F;					\
+	    c2 = 0;						\
+	  }							\
+	else							\
+	  {							\
+	    c1 = ((code) >> 8) & 0x7F;				\
+	    c2 = (code) & 0x7F;					\
+	  }							\
+	c = make_ichar (charset_by_leading_byte(charset),	\
+			  c1, c2);				\
       }								\
     else							\
-      (c) = (code) & 0xFF;						\
-  } while (0)
-#endif
+      {								\
+	c = (code) & 0xFF;					\
+      }								\
+  } while (0) 
 
 
 /* Execute CCL code on SRC_BYTES length text at SOURCE.  The resulting
@@ -1392,9 +1416,9 @@
 
 	    case CCL_TranslateCharacter:
 #if 0
-	      /* XEmacs does not have translate_char, and its
-		 equivalent nor.  We do nothing on this operation. */
-	      CCL_MAKE_CHAR (reg[RRR], reg[rrr], i);
+	      /* XEmacs does not have translate_char, nor an
+		 equivalent.  We do nothing on this operation. */
+	      CCL_MAKE_CHAR(reg[RRR], reg[rrr], op);
 	      op = translate_char (GET_TRANSLATION_TABLE (reg[Rrr]),
 				   i, -1, 0, 0);
 	      SPLIT_CHAR (op, reg[RRR], i, j);
@@ -1421,6 +1445,56 @@
 #endif
 	      break;
 
+	    case CCL_MuleToUnicode:
+	      {
+		Lisp_Object ucs;
+
+		CCL_MAKE_CHAR(reg[rrr], reg[RRR], op);
+		ucs = Fchar_to_unicode(make_char(op));
+
+		if (NILP(ucs))
+		  {
+		    /* Uhh, char-to-unicode doesn't return nil at the
+		       moment, only ever -1. */
+		    reg[rrr] = 0xFFFD; /* REPLACEMENT CHARACTER */
+		  }
+		else
+		  {
+		    reg[rrr] = XINT(ucs);
+		    if (-1 == reg[rrr])
+		      {
+			reg[rrr] = 0xFFFD; /* REPLACEMENT CHARACTER */
+		      }
+		  }
+		break;
+	      }
+
+	    case CCL_UnicodeToMule:
+	      {
+		Lisp_Object scratch;
+
+		scratch = Funicode_to_char(make_int(reg[rrr]), Qnil);
+
+		if (!NILP(scratch))
+		  {
+		    op = XCHAR(scratch);
+		    BREAKUP_ICHAR (op, scratch, i, j);
+		    reg[RRR] = XCHARSET_ID(scratch);
+
+		    if (j != 0)
+		      {
+			i = (i << 8) | j;
+		      }
+
+		    reg[rrr] = i;
+		  }
+		else 
+		  {
+		    reg[rrr] = reg[RRR] = 0;
+		  }
+		break;
+	      }
+
 	    case CCL_IterateMultipleMap:
 	      {
 		Lisp_Object map, content, attrib, value;
--- a/src/mule-charset.c	Fri Jun 02 22:18:08 2006 +0000
+++ b/src/mule-charset.c	Sat Jun 03 17:51:06 2006 +0000
@@ -190,7 +190,7 @@
 	      int type, int columns, int graphic,
 	      Ibyte final, int direction,  Lisp_Object short_name,
 	      Lisp_Object long_name, Lisp_Object doc,
-	      Lisp_Object reg, int overwrite)
+	      Lisp_Object reg, int overwrite, int encode_as_utf_8)
 {
   Lisp_Object obj;
   Lisp_Charset *cs;
@@ -240,6 +240,7 @@
   CHARSET_FINAL		(cs) = final;
   CHARSET_DOC_STRING	(cs) = doc;
   CHARSET_REGISTRY	(cs) = reg;
+  CHARSET_ENCODE_AS_UTF_8 (cs) = encode_as_utf_8 ? 1 : 0;
   CHARSET_CCL_PROGRAM	(cs) = Qnil;
   CHARSET_REVERSE_DIRECTION_CHARSET (cs) = Qnil;
 
@@ -454,6 +455,12 @@
 		is passed the octets of the character, with the high
 		bit cleared and set depending upon whether the value
 		of the `graphic' property is 0 or 1.
+`encode-as-utf-8'
+		If non-nil, the charset will be written out using the UTF-8
+		escape syntax in ISO 2022-oriented coding systems.  Used for
+		supporting characters we know are part of Unicode but not of
+		any other known character set in escape-quoted and compound
+		text.
 */
        (name, doc_string, props))
 {
@@ -465,6 +472,7 @@
   Lisp_Object charset = Qnil;
   Lisp_Object ccl_program = Qnil;
   Lisp_Object short_name = Qnil, long_name = Qnil;
+  int encode_as_utf_8 = 0;
   Lisp_Object existing_charset;
   int temporary = UNBOUNDP (name);
 
@@ -546,6 +554,11 @@
 	      invalid_constant ("Invalid value for `direction'", value);
 	  }
 
+	else if (EQ (keyword, Qencode_as_utf_8))
+	  {
+	    encode_as_utf_8 = NILP (value) ? 0 : 1;
+	  }
+
 	else if (EQ (keyword, Qfinal))
 	  {
 	    CHECK_CHAR_COERCE_INT (value);
@@ -553,7 +566,6 @@
 	    if (final < '0' || final > '~')
 	      invalid_constant ("Invalid value for `final'", value);
 	  }
-
 	else if (EQ (keyword, Qccl_program))
 	  {
 	    struct ccl_program test_ccl;
@@ -612,7 +624,8 @@
 
   charset = make_charset (id, name, dimension + 2, type, columns, graphic,
 			  final, direction, short_name, long_name,
-			  doc_string, registry, !NILP (existing_charset));
+			  doc_string, registry, !NILP (existing_charset),
+			  encode_as_utf_8);
 
   XCHARSET (charset)->temporary = temporary;
   if (!NILP (ccl_program))
@@ -641,7 +654,7 @@
        (charset, new_name))
 {
   Lisp_Object new_charset = Qnil;
-  int id, dimension, columns, graphic;
+  int id, dimension, columns, graphic, encode_as_utf_8;
   Ibyte final;
   int direction, type;
   Lisp_Object registry, doc_string, short_name, long_name;
@@ -672,10 +685,11 @@
   short_name = CHARSET_SHORT_NAME (cs);
   long_name = CHARSET_LONG_NAME (cs);
   registry = CHARSET_REGISTRY (cs);
+  encode_as_utf_8 = CHARSET_ENCODE_AS_UTF_8 (cs);
 
   new_charset = make_charset (id, new_name, dimension + 2, type, columns,
 			      graphic, final, direction, short_name, long_name,
-			      doc_string, registry, 0);
+			      doc_string, registry, 0, encode_as_utf_8);
 
   CHARSET_REVERSE_DIRECTION_CHARSET (cs) = new_charset;
   XCHARSET_REVERSE_DIRECTION_CHARSET (new_charset) = charset;
@@ -807,6 +821,8 @@
   if (EQ (prop, Qfinal))       return make_char (CHARSET_FINAL (cs));
   if (EQ (prop, Qchars))       return make_int (CHARSET_CHARS (cs));
   if (EQ (prop, Qregistry))    return CHARSET_REGISTRY (cs);
+  if (EQ (prop, Qencode_as_utf_8))
+    return CHARSET_ENCODE_AS_UTF_8 (cs) ? Qt : Qnil;
   if (EQ (prop, Qccl_program)) return CHARSET_CCL_PROGRAM (cs);
   if (EQ (prop, Qdirection))
     return CHARSET_DIRECTION (cs) == CHARSET_LEFT_TO_RIGHT ? Ql2r : Qr2l;
@@ -1040,7 +1056,7 @@
 		  build_string ("ASCII"),
 		  build_msg_string ("ASCII"),
 		  build_msg_string ("ASCII (ISO646 IRV)"),
-		  build_string ("\\(iso8859-[0-9]*\\|-ascii\\)"), 0);
+		  build_string ("\\(iso8859-[0-9]*\\|-ascii\\)"), 0, 0);
   staticpro (&Vcharset_control_1);
   Vcharset_control_1 =
     make_charset (LEADING_BYTE_CONTROL_1, Qcontrol_1, 2,
@@ -1049,7 +1065,7 @@
 		  build_string ("C1"),
 		  build_msg_string ("Control characters"),
 		  build_msg_string ("Control characters 128-191"),
-		  build_string (""), 0);
+		  build_string (""), 0, 0);
   staticpro (&Vcharset_latin_iso8859_1);
   Vcharset_latin_iso8859_1 =
     make_charset (LEADING_BYTE_LATIN_ISO8859_1, Qlatin_iso8859_1, 2,
@@ -1058,7 +1074,7 @@
 		  build_string ("Latin-1"),
 		  build_msg_string ("ISO8859-1 (Latin-1)"),
 		  build_msg_string ("ISO8859-1 (Latin-1)"),
-		  build_string ("iso8859-1"), 0);
+		  build_string ("iso8859-1"), 0, 0);
   staticpro (&Vcharset_latin_iso8859_2);
   Vcharset_latin_iso8859_2 =
     make_charset (LEADING_BYTE_LATIN_ISO8859_2, Qlatin_iso8859_2, 2,
@@ -1067,7 +1083,7 @@
 		  build_string ("Latin-2"),
 		  build_msg_string ("ISO8859-2 (Latin-2)"),
 		  build_msg_string ("ISO8859-2 (Latin-2)"),
-		  build_string ("iso8859-2"), 0);
+		  build_string ("iso8859-2"), 0, 0);
   staticpro (&Vcharset_latin_iso8859_3);
   Vcharset_latin_iso8859_3 =
     make_charset (LEADING_BYTE_LATIN_ISO8859_3, Qlatin_iso8859_3, 2,
@@ -1076,7 +1092,7 @@
 		  build_string ("Latin-3"),
 		  build_msg_string ("ISO8859-3 (Latin-3)"),
 		  build_msg_string ("ISO8859-3 (Latin-3)"),
-		  build_string ("iso8859-3"), 0);
+		  build_string ("iso8859-3"), 0, 0);
   staticpro (&Vcharset_latin_iso8859_4);
   Vcharset_latin_iso8859_4 =
     make_charset (LEADING_BYTE_LATIN_ISO8859_4, Qlatin_iso8859_4, 2,
@@ -1085,7 +1101,7 @@
 		  build_string ("Latin-4"),
 		  build_msg_string ("ISO8859-4 (Latin-4)"),
 		  build_msg_string ("ISO8859-4 (Latin-4)"),
-		  build_string ("iso8859-4"), 0);
+		  build_string ("iso8859-4"), 0, 0);
   staticpro (&Vcharset_thai_tis620);
   Vcharset_thai_tis620 =
     make_charset (LEADING_BYTE_THAI_TIS620, Qthai_tis620, 2,
@@ -1094,7 +1110,7 @@
 		  build_string ("TIS620"),
 		  build_msg_string ("TIS620 (Thai)"),
 		  build_msg_string ("TIS620.2529 (Thai)"),
-		  build_string ("tis620"),0);
+		  build_string ("tis620"), 0, 0);
   staticpro (&Vcharset_greek_iso8859_7);
   Vcharset_greek_iso8859_7 =
     make_charset (LEADING_BYTE_GREEK_ISO8859_7, Qgreek_iso8859_7, 2,
@@ -1103,7 +1119,7 @@
 		  build_string ("ISO8859-7"),
 		  build_msg_string ("ISO8859-7 (Greek)"),
 		  build_msg_string ("ISO8859-7 (Greek)"),
-		  build_string ("iso8859-7"), 0);
+		  build_string ("iso8859-7"), 0, 0);
   staticpro (&Vcharset_arabic_iso8859_6);
   Vcharset_arabic_iso8859_6 =
     make_charset (LEADING_BYTE_ARABIC_ISO8859_6, Qarabic_iso8859_6, 2,
@@ -1112,7 +1128,7 @@
 		  build_string ("ISO8859-6"),
 		  build_msg_string ("ISO8859-6 (Arabic)"),
 		  build_msg_string ("ISO8859-6 (Arabic)"),
-		  build_string ("iso8859-6"), 0);
+		  build_string ("iso8859-6"), 0, 0);
   staticpro (&Vcharset_hebrew_iso8859_8);
   Vcharset_hebrew_iso8859_8 =
     make_charset (LEADING_BYTE_HEBREW_ISO8859_8, Qhebrew_iso8859_8, 2,
@@ -1121,7 +1137,7 @@
 		  build_string ("ISO8859-8"),
 		  build_msg_string ("ISO8859-8 (Hebrew)"),
 		  build_msg_string ("ISO8859-8 (Hebrew)"),
-		  build_string ("iso8859-8"), 0);
+		  build_string ("iso8859-8"), 0, 0);
   staticpro (&Vcharset_katakana_jisx0201);
   Vcharset_katakana_jisx0201 =
     make_charset (LEADING_BYTE_KATAKANA_JISX0201, Qkatakana_jisx0201, 2,
@@ -1130,7 +1146,7 @@
 		  build_string ("JISX0201 Kana"),
 		  build_msg_string ("JISX0201.1976 (Japanese Kana)"),
 		  build_msg_string ("JISX0201.1976 Japanese Kana"),
-		  build_string ("jisx0201.1976"), 0);
+		  build_string ("jisx0201.1976"), 0, 0);
   staticpro (&Vcharset_latin_jisx0201);
   Vcharset_latin_jisx0201 =
     make_charset (LEADING_BYTE_LATIN_JISX0201, Qlatin_jisx0201, 2,
@@ -1139,7 +1155,7 @@
 		  build_string ("JISX0201 Roman"),
 		  build_msg_string ("JISX0201.1976 (Japanese Roman)"),
 		  build_msg_string ("JISX0201.1976 Japanese Roman"),
-		  build_string ("jisx0201.1976"), 0);
+		  build_string ("jisx0201.1976"), 0, 0);
   staticpro (&Vcharset_cyrillic_iso8859_5);
   Vcharset_cyrillic_iso8859_5 =
     make_charset (LEADING_BYTE_CYRILLIC_ISO8859_5, Qcyrillic_iso8859_5, 2,
@@ -1148,7 +1164,7 @@
 		  build_string ("ISO8859-5"),
 		  build_msg_string ("ISO8859-5 (Cyrillic)"),
 		  build_msg_string ("ISO8859-5 (Cyrillic)"),
-		  build_string ("iso8859-5"), 0);
+		  build_string ("iso8859-5"), 0, 0);
   staticpro (&Vcharset_latin_iso8859_9);
   Vcharset_latin_iso8859_9 =
     make_charset (LEADING_BYTE_LATIN_ISO8859_9, Qlatin_iso8859_9, 2,
@@ -1157,7 +1173,7 @@
 		  build_string ("Latin-5"),
 		  build_msg_string ("ISO8859-9 (Latin-5)"),
 		  build_msg_string ("ISO8859-9 (Latin-5)"),
-		  build_string ("iso8859-9"), 0);
+		  build_string ("iso8859-9"), 0, 0);
   staticpro (&Vcharset_latin_iso8859_15);
   Vcharset_latin_iso8859_15 =
     make_charset (LEADING_BYTE_LATIN_ISO8859_15, Qlatin_iso8859_15, 2,
@@ -1166,7 +1182,7 @@
 		  build_string ("Latin-9"),
 		  build_msg_string ("ISO8859-15 (Latin-9)"),
 		  build_msg_string ("ISO8859-15 (Latin-9)"),
-		  build_string ("iso8859-15"), 0);
+		  build_string ("iso8859-15"), 0, 0);
   staticpro (&Vcharset_japanese_jisx0208_1978);
   Vcharset_japanese_jisx0208_1978 =
     make_charset (LEADING_BYTE_JAPANESE_JISX0208_1978, Qjapanese_jisx0208_1978, 3,
@@ -1176,7 +1192,7 @@
 		  build_msg_string ("JISX0208.1978 (Japanese)"),
 		  build_msg_string
 		  ("JISX0208.1978 Japanese Kanji (so called \"old JIS\")"),
-		  build_string ("\\(jisx0208\\|jisc6226\\)\\.1978"), 0);
+		  build_string ("\\(jisx0208\\|jisc6226\\)\\.1978"), 0, 0);
   staticpro (&Vcharset_chinese_gb2312);
   Vcharset_chinese_gb2312 =
     make_charset (LEADING_BYTE_CHINESE_GB2312, Qchinese_gb2312, 3,
@@ -1185,7 +1201,7 @@
 		  build_string ("GB2312"),
 		  build_msg_string ("GB2312)"),
 		  build_msg_string ("GB2312 Chinese simplified"),
-		  build_string ("gb2312"), 0);
+		  build_string ("gb2312"), 0, 0);
   staticpro (&Vcharset_japanese_jisx0208);
   Vcharset_japanese_jisx0208 =
     make_charset (LEADING_BYTE_JAPANESE_JISX0208, Qjapanese_jisx0208, 3,
@@ -1194,7 +1210,7 @@
 		  build_string ("JISX0208"),
 		  build_msg_string ("JISX0208.1983/1990 (Japanese)"),
 		  build_msg_string ("JISX0208.1983/1990 Japanese Kanji"),
-		  build_string ("jisx0208.19\\(83\\|90\\)"), 0);
+		  build_string ("jisx0208.19\\(83\\|90\\)"), 0, 0);
   staticpro (&Vcharset_korean_ksc5601);
   Vcharset_korean_ksc5601 =
     make_charset (LEADING_BYTE_KOREAN_KSC5601, Qkorean_ksc5601, 3,
@@ -1203,7 +1219,7 @@
 		  build_string ("KSC5601"),
 		  build_msg_string ("KSC5601 (Korean"),
 		  build_msg_string ("KSC5601 Korean Hangul and Hanja"),
-		  build_string ("ksc5601"), 0);
+		  build_string ("ksc5601"), 0, 0);
   staticpro (&Vcharset_japanese_jisx0212);
   Vcharset_japanese_jisx0212 =
     make_charset (LEADING_BYTE_JAPANESE_JISX0212, Qjapanese_jisx0212, 3,
@@ -1212,7 +1228,7 @@
 		  build_string ("JISX0212"),
 		  build_msg_string ("JISX0212 (Japanese)"),
 		  build_msg_string ("JISX0212 Japanese Supplement"),
-		  build_string ("jisx0212"), 0);
+		  build_string ("jisx0212"), 0, 0);
 
 #define CHINESE_CNS_PLANE_RE(n) "cns11643[.-]\\(.*[.-]\\)?" n "$"
   staticpro (&Vcharset_chinese_cns11643_1);
@@ -1224,7 +1240,7 @@
 		  build_msg_string ("CNS11643-1 (Chinese traditional)"),
 		  build_msg_string
 		  ("CNS 11643 Plane 1 Chinese traditional"),
-		  build_string (CHINESE_CNS_PLANE_RE("1")), 0);
+		  build_string (CHINESE_CNS_PLANE_RE("1")), 0, 0);
   staticpro (&Vcharset_chinese_cns11643_2);
   Vcharset_chinese_cns11643_2 =
     make_charset (LEADING_BYTE_CHINESE_CNS11643_2, Qchinese_cns11643_2, 3,
@@ -1234,7 +1250,7 @@
 		  build_msg_string ("CNS11643-2 (Chinese traditional)"),
 		  build_msg_string
 		  ("CNS 11643 Plane 2 Chinese traditional"),
-		  build_string (CHINESE_CNS_PLANE_RE("2")), 0);
+		  build_string (CHINESE_CNS_PLANE_RE("2")), 0, 0);
   staticpro (&Vcharset_chinese_big5_1);
   Vcharset_chinese_big5_1 =
     make_charset (LEADING_BYTE_CHINESE_BIG5_1, Qchinese_big5_1, 3,
@@ -1244,7 +1260,7 @@
 		  build_msg_string ("Big5 (Level-1)"),
 		  build_msg_string
 		  ("Big5 Level-1 Chinese traditional"),
-		  build_string ("big5"), 0);
+		  build_string ("big5"), 0, 0);
   staticpro (&Vcharset_chinese_big5_2);
   Vcharset_chinese_big5_2 =
     make_charset (LEADING_BYTE_CHINESE_BIG5_2, Qchinese_big5_2, 3,
@@ -1254,7 +1270,7 @@
 		  build_msg_string ("Big5 (Level-2)"),
 		  build_msg_string
 		  ("Big5 Level-2 Chinese traditional"),
-		  build_string ("big5"), 0);
+		  build_string ("big5"), 0, 0);
 
 
 #ifdef ENABLE_COMPOSITE_CHARS
@@ -1269,7 +1285,7 @@
 		  build_string ("Composite"),
 		  build_msg_string ("Composite characters"),
 		  build_msg_string ("Composite characters"),
-		  build_string (""), 0);
+		  build_string (""), 0, 0);
 #else
   /* We create a hack so that we have a way of storing ESC 0 and ESC 1
      sequences as "characters", so that they will be output correctly. */
@@ -1281,6 +1297,6 @@
 		  build_string ("Composite hack"),
 		  build_msg_string ("Composite characters hack"),
 		  build_msg_string ("Composite characters hack"),
-		  build_string (""), 0);
+		  build_string (""), 0, 0);
 #endif /* ENABLE_COMPOSITE_CHARS */
 }
--- a/src/mule-coding.c	Fri Jun 02 22:18:08 2006 +0000
+++ b/src/mule-coding.c	Sat Jun 03 17:51:06 2006 +0000
@@ -96,6 +96,42 @@
   return c >= 0xA1 && c <= 0xDF;
 }
 
+inline static void
+dynarr_add_2022_one_dimension (Lisp_Object charset, Ibyte c, 
+			       unsigned char charmask, 
+			       unsigned_char_dynarr *dst)
+{
+  if (XCHARSET_ENCODE_AS_UTF_8 (charset)) 
+    {
+      encode_unicode_char (charset, c & charmask, 0,	
+			   dst, UNICODE_UTF_8, 0);		
+    } 
+  else							
+    {							
+      Dynarr_add (dst, c & charmask);			
+    }							
+}
+
+inline static void 
+dynarr_add_2022_two_dimensions (Lisp_Object charset, Ibyte c, 
+				unsigned int ch, 
+				unsigned char charmask, 
+				unsigned_char_dynarr *dst)
+{
+  if (XCHARSET_ENCODE_AS_UTF_8 (charset))			
+    {							
+      encode_unicode_char (charset,				
+			   ch & charmask,			
+			   c & charmask, dst,		
+			   UNICODE_UTF_8, 0);		
+    }							
+  else							
+    {							
+      Dynarr_add (dst, ch & charmask);			
+      Dynarr_add (dst, c & charmask);			
+    }							
+}
+
 /* Convert Shift-JIS data to internal format. */
 
 static Bytecount
@@ -671,6 +707,10 @@
   ISO_ESC_2_4,		/* We've seen ESC $.  This indicates
 			   that we're designating a multi-byte, rather
 			   than a single-byte, character set. */
+  ISO_ESC_2_5,		/* We've seen ESC %. This indicates an escape to a
+			   Unicode coding system; the only one of these
+			   we're prepared to deal with is UTF-8, which has
+			   the next character as G. */
   ISO_ESC_2_8,		/* We've seen ESC 0x28, i.e. ESC (.
 			   This means designate a 94-character
 			   character set into G0. */
@@ -752,11 +792,15 @@
    character constructed by overstriking two or more characters). */
 #define ISO_STATE_COMPOSITE	(1 << 5)
 
+/* If set, we're processing UTF-8 encoded data within ISO-2022
+   processing. */
+#define ISO_STATE_UTF_8		(1 << 6)
+
 /* ISO_STATE_LOCK is the mask of flags that remain on until explicitly
    turned off when in the ISO2022 encoder/decoder.  Other flags are turned
    off at the end of processing each character or escape sequence. */
 # define ISO_STATE_LOCK \
-  (ISO_STATE_COMPOSITE | ISO_STATE_R2L)
+  (ISO_STATE_COMPOSITE | ISO_STATE_R2L | ISO_STATE_UTF_8)
 
 typedef struct charset_conversion_spec
 {
@@ -922,6 +966,9 @@
   Lisp_Object current_charset;
   int current_half;
   int current_char_boundary;
+
+  /* Used for handling UTF-8. */
+  unsigned char counter;  
 };
 
 static const struct memory_description ccs_description_1[] =
@@ -1344,6 +1391,15 @@
 	}
 
     case ISO_ESC:
+
+      /* The only available ISO 2022 sequence in UTF-8 mode is ESC % @, to
+	 exit from it. If we see any other escape sequence, pass it through
+	 in the error handler.  */
+      if (*flags & ISO_STATE_UTF_8 && '%' != c)
+	{
+	  return 0;
+	}
+
       switch (c)
 	{
 	  /**** single shift ****/
@@ -1411,6 +1467,10 @@
 	  iso->esc = ISO_ESC_2_4;
 	  goto not_done;
 
+	case '%':	/* Prefix to an escape to or from Unicode. */
+	  iso->esc = ISO_ESC_2_5;
+	  goto not_done; 
+
 	default:
 	  if (0x28 <= c && c <= 0x2F)
 	    {
@@ -1433,8 +1493,30 @@
 	  goto error;
 	}
 
-
-
+      /* ISO-IR 196 UTF-8 support. */
+    case ISO_ESC_2_5:
+      if ('G' == c)
+	{
+	  /* Activate UTF-8 mode. */
+	  *flags &= ISO_STATE_LOCK;
+	  *flags |= ISO_STATE_UTF_8;
+	  iso->esc = ISO_ESC_NOTHING;
+	  return 1;
+	}
+      else if ('@' == c)
+	{
+	  /* Deactive UTF-8 mode. */
+	  *flags &= ISO_STATE_LOCK;
+	  *flags &= ~(ISO_STATE_UTF_8);
+	  iso->esc = ISO_ESC_NOTHING;
+	  return 1;
+	}
+      else 
+	{
+	  /* Oops, we don't support the other UTF-? coding systems within
+	     ISO 2022, only in their own context. */
+	  goto error;
+	}
       /**** directionality ****/
 
     case ISO_ESC_5_11:		/* ISO6429 direction control */
@@ -1822,6 +1904,87 @@
 	    }
 	  ch = 0;
 	}
+      else if (flags & ISO_STATE_UTF_8)
+	{
+	  unsigned char counter = data->counter; 
+	  Ibyte work[MAX_ICHAR_LEN];
+	  int len;
+	  Lisp_Object chr;
+
+	  if (ISO_CODE_ESC == c)
+	    {
+	      /* Allow the escape sequence parser to end the UTF-8 state. */
+	      flags |= ISO_STATE_ESCAPE;
+	      data->esc = ISO_ESC;
+	      data->esc_bytes_index = 1;
+	      continue;
+	    }
+
+	  switch (counter)
+	    {
+	    case 0:
+	      if (c >= 0xfc)
+		{
+		  ch = c & 0x01;
+		  counter = 5;
+		}
+	      else if (c >= 0xf8)
+		{
+		  ch = c & 0x03;
+		  counter = 4;
+		}
+	      else if (c >= 0xf0)
+		{
+		  ch = c & 0x07;
+		  counter = 3;
+		}
+	      else if (c >= 0xe0)
+		{
+		  ch = c & 0x0f;
+		  counter = 2;
+		}
+	      else if (c >= 0xc0)
+		{
+		  ch = c & 0x1f;
+		  counter = 1;
+		}
+	      else
+		/* ASCII, or the lower control characters. */
+		Dynarr_add (dst, c);
+
+	      break;
+	    case 1:
+	      ch = (ch << 6) | (c & 0x3f);
+	      chr = Funicode_to_char(make_int(ch), Qnil);			
+
+	      if (!NILP (chr))						
+		{								
+		  assert(CHARP(chr));					
+		  len = set_itext_ichar (work, XCHAR(chr));		
+		  Dynarr_add_many (dst, work, len);			
+		}								
+	      else							
+		{								
+		  /* Shouldn't happen, this code should only be enabled in
+		     XEmacsen with support for all of Unicode. */
+		  Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);	
+		  Dynarr_add (dst, 34 + 128);				
+		  Dynarr_add (dst, 46 + 128);				
+		}								
+
+	      ch = 0;
+	      counter = 0;
+	      break;
+	    default:
+	      ch = (ch << 6) | (c & 0x3f);
+	      counter--;
+	    }
+
+	  if (str->eof)
+	    DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
+
+	  data->counter = counter;
+	}
       else if (byte_c0_p (c) || byte_c1_p (c))
 	{ /* Control characters */
 
@@ -2010,6 +2173,7 @@
   }
 
   Dynarr_add (dst, ISO_CODE_ESC);
+
   switch (type)
     {
     case CHARSET_TYPE_94:
@@ -2102,6 +2266,14 @@
 	{		/* Processing ASCII character */
 	  ch = 0;
 
+	  if (flags & ISO_STATE_UTF_8)
+	    {
+	      Dynarr_add (dst, ISO_CODE_ESC);
+	      Dynarr_add (dst, '%');
+	      Dynarr_add (dst, '@');
+	      flags &= ~(ISO_STATE_UTF_8);
+	    }
+
 	  restore_left_to_right_direction (codesys, dst, &flags, 0);
 
 	  /* Make sure G0 contains ASCII */
@@ -2145,18 +2317,43 @@
 	  Dynarr_add (dst, c);
 	  char_boundary = 1;
 	}
-
       else if (ibyte_leading_byte_p (c) || ibyte_leading_byte_p (ch))
 	{ /* Processing Leading Byte */
 	  ch = 0;
 	  charset = charset_by_leading_byte (c);
 	  if (leading_byte_prefix_p (c))
-	    ch = c;
+	    {
+	      ch = c;
+	    }
+	  else if (XCHARSET_ENCODE_AS_UTF_8 (charset))
+	    {
+	      assert (!EQ (charset, Vcharset_control_1)
+		      && !EQ (charset, Vcharset_composite));
+
+	      /* If the character set is to be encoded as UTF-8, the escape
+		 is always the same. */
+	      if (!(flags & ISO_STATE_UTF_8)) 
+		{
+		  Dynarr_add (dst, ISO_CODE_ESC);
+		  Dynarr_add (dst, '%');
+		  Dynarr_add (dst, 'G');
+		  flags |= ISO_STATE_UTF_8;
+		}
+	    }
 	  else if (!EQ (charset, Vcharset_control_1)
 		   && !EQ (charset, Vcharset_composite))
 	    {
 	      int reg;
 
+	      /* End the UTF-8 state. */
+	      if (flags & ISO_STATE_UTF_8)
+		{
+		  Dynarr_add (dst, ISO_CODE_ESC);
+		  Dynarr_add (dst, '%');
+		  Dynarr_add (dst, '@');
+		  flags &= ~(ISO_STATE_UTF_8);
+		}
+
 	      ensure_correct_direction (XCHARSET_DIRECTION (charset),
 					codesys, dst, &flags, 0);
 
@@ -2274,12 +2471,14 @@
 	      switch (XCHARSET_REP_BYTES (charset))
 		{
 		case 2:
-		  Dynarr_add (dst, c & charmask);
+		  dynarr_add_2022_one_dimension (charset, c,
+						 charmask, dst);
 		  break;
 		case 3:
 		  if (XCHARSET_PRIVATE_P (charset))
 		    {
-		      Dynarr_add (dst, c & charmask);
+		      dynarr_add_2022_one_dimension (charset, c,
+						     charmask, dst);
 		      ch = 0;
 		    }
 		  else if (ch)
@@ -2287,6 +2486,9 @@
 #ifdef ENABLE_COMPOSITE_CHARS
 		      if (EQ (charset, Vcharset_composite))
 			{
+			  /* #### Hasn't been written to handle composite
+			     characters yet. */
+			  assert(!XCHARSET_ENCODE_AS_UTF_8 (charset))
 			  if (in_composite)
 			    {
 			      /* #### Bother! We don't know how to
@@ -2310,8 +2512,8 @@
 		      else
 #endif /* ENABLE_COMPOSITE_CHARS */
 			{
-			  Dynarr_add (dst, ch & charmask);
-			  Dynarr_add (dst, c & charmask);
+			  dynarr_add_2022_two_dimensions (charset, c, ch,
+							  charmask, dst);
 			}
 		      ch = 0;
 		    }
@@ -2324,8 +2526,8 @@
 		case 4:
 		  if (ch)
 		    {
-		      Dynarr_add (dst, ch & charmask);
-		      Dynarr_add (dst, c & charmask);
+		      dynarr_add_2022_two_dimensions (charset, c, ch,
+						      charmask, dst);
 		      ch = 0;
 		    }
 		  else
--- a/src/redisplay-x.c	Fri Jun 02 22:18:08 2006 +0000
+++ b/src/redisplay-x.c	Sat Jun 03 17:51:06 2006 +0000
@@ -230,6 +230,10 @@
 	}
 #endif /* MULE */
       *text_storage++ = (unsigned char) byte1;
+      /* This dimension stuff is broken if you want to use a two-dimensional
+	 X11 font to display a single-dimensional character set, as is
+	 appropriate for the IPA (use one of the -iso10646-1 fonts) or some
+	 of the other non-standard character sets. */
       if (dimension == 2)
 	*text_storage++ = (unsigned char) byte2;
 #else /* USE_XFT */
--- a/src/unicode.c	Fri Jun 02 22:18:08 2006 +0000
+++ b/src/unicode.c	Sat Jun 03 17:51:06 2006 +0000
@@ -321,6 +321,10 @@
 
 Lisp_Object Qignore_first_column;
 
+Lisp_Object Vcurrent_jit_charset;
+Lisp_Object Qlast_allocated_character;
+Lisp_Object Qccl_encode_to_ucs_2;
+
 
 /************************************************************************/
 /*                        Unicode implementation                        */
@@ -1001,12 +1005,72 @@
 }
 
 static Ichar
+get_free_codepoint(Lisp_Object charset)
+{
+  Lisp_Object name = Fcharset_name(charset);
+  Lisp_Object zeichen = Fget(name, Qlast_allocated_character, Qnil);
+  Ichar res;
+
+  /* Only allow this with the 96x96 character sets we are using for
+     temporary Unicode support. */
+  assert(2 == XCHARSET_DIMENSION(charset) && 96 == XCHARSET_CHARS(charset));
+
+  if (!NILP(zeichen))
+    {
+      int c1, c2;
+
+      BREAKUP_ICHAR(XCHAR(zeichen), charset, c1, c2);
+
+      if (127 == c1 && 127 == c2)
+	{
+	  /* We've already used the hightest-numbered character in this
+	     set--tell our caller to create another. */
+	  return -1;
+	}
+
+      if (127 == c2)
+	{
+	  ++c1;
+	  c2 = 0x20;
+	}
+      else
+	{
+	  ++c2;
+	}
+
+      res = make_ichar(charset, c1, c2);
+      Fput(name, Qlast_allocated_character, make_char(res));
+    }
+  else
+    {
+      res = make_ichar(charset, 32, 32);
+      Fput(name, Qlast_allocated_character, make_char(res));
+    }
+  return res;
+}
+
+/* The just-in-time creation of XEmacs characters that correspond to unknown
+   Unicode code points happens when: 
+
+   1. The lookup would otherwise fail. 
+
+   2. The charsets array is the nil or the default. 
+
+   If there are no free code points in the just-in-time Unicode character
+   set, and the charsets array is the default unicode precedence list,
+   create a new just-in-time Unicode character set, add it at the end of the
+   unicode precedence list, create the XEmacs character in that character
+   set, and return it. */
+
+static Ichar
 unicode_to_ichar (int code, Lisp_Object_dynarr *charsets)
 {
   int u1, u2, u3, u4;
   int code_levels;
   int i;
   int n = Dynarr_length (charsets);
+  static int number_of_jit_charsets;
+  static Ascbyte last_jit_charset_final;
 
   type_checking_assert (code >= 0);
   /* This shortcut depends on the representation of an Ichar, see text.c.
@@ -1040,8 +1104,64 @@
 	    return make_ichar (charset, retval >> 8, retval & 0xFF);
 	}
     }
+  
+  /* Only do the magic just-in-time assignment if we're using the default
+     list. */ 
+  if (unicode_precedence_dynarr == charsets) 
+    {
+      if (NILP (Vcurrent_jit_charset) || 
+	  (-1 == (i = get_free_codepoint(Vcurrent_jit_charset))))
+	{
+	  Ascbyte setname[32]; 
+	  Lisp_Object charset_descr = build_string
+	    ("Mule charset for otherwise unknown Unicode code points.");
+	  Lisp_Object charset_regr = build_string("iso10646-1");
 
-  return (Ichar) -1;
+	  struct gcpro gcpro1, gcpro2;
+
+	  if ('\0' == last_jit_charset_final)
+	    {
+	      /* This final byte shit is, umm, not that cool. */
+	      last_jit_charset_final = 0x30;
+	    }
+
+	  snprintf(setname, sizeof(setname), 
+		   "jit-ucs-charset-%d", number_of_jit_charsets++);
+
+	  /* Aside: GCPROing here would be overkill according to the FSF's
+	     philosophy. make-charset cannot currently GC, but is intended
+	     to be called from Lisp, with its arguments protected by the
+	     Lisp reader. We GCPRO in case it GCs in the future and no-one
+	     checks all the C callers.  */
+
+	  GCPRO2 (charset_descr, charset_regr);
+	  Vcurrent_jit_charset = Fmake_charset 
+	    (intern(setname), charset_descr, 
+	     /* Set encode-as-utf-8 to t, to have this character set written
+		using UTF-8 escapes in escape-quoted and ctext. This
+		sidesteps the fact that our internal character -> Unicode
+		mapping is not stable from one invocation to the next.  */
+	     nconc2 (list2(Qencode_as_utf_8, Qt),
+		     nconc2 (list6(Qcolumns, make_int(1), Qchars, make_int(96),
+				   Qdimension, make_int(2)),
+			     list6(Qregistry, charset_regr,
+				   Qfinal, make_char(last_jit_charset_final++),
+				   /* This CCL program is initialised in
+				      unicode.el. */
+				   Qccl_program, Qccl_encode_to_ucs_2))));
+	  UNGCPRO;
+
+	  i = get_free_codepoint(Vcurrent_jit_charset);
+	} 
+
+      if (-1 != i)
+	{
+	  set_unicode_conversion((Ichar)i, code);
+	  /* No need to add the charset to the end of the list; it's done
+	     automatically. */
+	}
+    }
+  return (Ichar) i;
 }
 
 /* Add charsets to precedence list.
@@ -1284,37 +1404,13 @@
 present), this function simply does `int-to-char' and ignores the CHARSETS
 argument.
 
-Note that the current XEmacs internal encoding has no mapping for many
-Unicode code points, and if you use characters that are vaguely obscure with
-XEmacs' Unicode coding systems, you will lose data.
-
-To add support for some desired code point in the short term--note that our
-intention is to move to a Unicode-compatible internal encoding soon, for
-some value of soon--if you are a distributor, add something like the
-following to `site-start.el.'
-
-(make-charset 'distro-name-private 
-	      "Private character set for DISTRO"
-	      '(dimension 1
-		chars 96
-		columns 1
-		final ?5 ;; Change this--see docs for make-charset
-		long-name "Private charset for some Unicode char support."
-		short-name "Distro-Private"))
-
-(set-unicode-conversion 
- (make-char 'distro-name-private #x20) #x263A) ;; WHITE SMILING FACE
-
-(set-unicode-conversion 
- (make-char 'distro-name-private #x21) #x3030) ;; WAVY DASH
-
-;; ... 
-;;; Repeat as necessary. 
-
-Redisplay will work on the sjt-xft branch, but not with server-side X11
-fonts as is the default.  However, data read in will be preserved when they
-are written out again.
-
+If the CODE would not otherwise be converted to an XEmacs character, and the
+list of character sets to be consulted is nil or the default, a new XEmacs
+character will be created for it in one of the `jit-ucs-charset' Mule
+character sets, and that character will be returned.  There is scope for
+tens of thousands of separate Unicode code points in every session using
+this technique, so despite XEmacs' internal encoding not being based on
+Unicode, your data won't be trashed.
 */
        (code, USED_IF_MULE (charsets)))
 {
@@ -1558,16 +1654,6 @@
 /*                         Unicode coding system                        */
 /************************************************************************/
 
-/* ISO 10646 UTF-16, UCS-4, UTF-8, UTF-7, etc. */
-
-enum unicode_type
-{
-  UNICODE_UTF_16,
-  UNICODE_UTF_8,
-  UNICODE_UTF_7,
-  UNICODE_UCS_4
-};
-
 struct unicode_coding_system
 {
   enum unicode_type type;
@@ -1728,7 +1814,9 @@
     }
 }
 
-static void
+/* Also used in mule-coding.c for UTF-8 handling in ISO 2022-oriented
+   encodings. */
+void
 encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h,
 		     int USED_IF_MULE (l), unsigned_char_dynarr *dst,
 		     enum unicode_type type, unsigned int little_endian)
@@ -2444,6 +2532,8 @@
 
   DEFSUBR (Fload_unicode_mapping_table);
 
+  DEFSYMBOL (Qccl_encode_to_ucs_2);
+  DEFSYMBOL (Qlast_allocated_character);
   DEFSYMBOL (Qignore_first_column);
 #endif /* MULE */
 
@@ -2519,6 +2609,9 @@
 
   init_blank_unicode_tables ();
 
+  staticpro (&Vcurrent_jit_charset);
+  Vcurrent_jit_charset = Qnil;
+
   /* Note that the "block" we are describing is a single pointer, and hence
      we could potentially use dump_add_root_block_ptr().  However, given
      the way the descriptions are written, we couldn't use them, and would
--- a/tests/ChangeLog	Fri Jun 02 22:18:08 2006 +0000
+++ b/tests/ChangeLog	Sat Jun 03 17:51:06 2006 +0000
@@ -1,3 +1,13 @@
+2006-06-03  Aidan Kehoe  <kehoea@parhasard.net>
+
+	* automated/mule-tests.el:
+	Add checks that several Unicode characters, expressed as
+	UTF-8-encoded strings, are handled correctly by the UTF-8 support
+	of the escape-quoted character set. 
+	* automated/mule-tests.el (unicode-code-point-to-utf-8-string): New.
+	Convert a Unicode code point to the equivalent UTF-8 string. 
+	This is a naive implementation in Lisp. 
+
 2006-05-16  Stephen J. Turnbull  <stephen@xemacs.org>
 
 	* XEmacs 21.5.27 "fiddleheads" is released.
--- a/tests/automated/mule-tests.el	Fri Jun 02 22:18:08 2006 +0000
+++ b/tests/automated/mule-tests.el	Sat Jun 03 17:51:06 2006 +0000
@@ -78,6 +78,38 @@
 ;; once compiled, for no good reason.
 (test-chars t)
 
+(defun unicode-code-point-to-utf-8-string (code-point)
+  "Convert a Unicode code point to the equivalent UTF-8 string. 
+This is a naive implementation in Lisp.  "
+  (check-argument-type 'natnump code-point)
+  (check-argument-range code-point 0 #x1fffff)
+  (if (< code-point #x80)
+      (format "%c" code-point)
+    (if (< code-point #x800)
+	(format "%c%c" 
+		;; ochars[0] = 0xC0 | (input & ~(0xFFFFF83F)) >> 6;
+		(logior #xc0 (lsh (logand code-point #x7c0) -6))
+		;; ochars[1] = 0x80 | input & ~(0xFFFFFFC0);
+		(logior #x80 (logand code-point #x3f)))
+      (if (< code-point #x00010000)
+	  (format "%c%c%c" 
+		  ;; ochars[0] = 0xE0 | (input >> 12) & ~(0xFFFFFFF0); 
+		  (logior #xe0 (logand (lsh code-point -12) #x0f))
+		  ;; ochars[1] = 0x80 | (input >> 6) & ~(0xFFFFFFC0); 
+		  (logior #x80 (logand (lsh code-point -6) #x3f))
+		  ;; ochars[2] = 0x80 | input & ~(0xFFFFFFC0); 
+		  (logior #x80 (logand code-point #x3f)))
+	(if (< code-point #x200000)
+	    (format "%c%c%c%c" 
+		    ;; ochars[0] = 0xF0 | (input >> 18) & ~(0xFFFFFFF8)
+		    (logior #xF0 (logand (lsh code-point -18) #x7))
+		    ;; ochars[1] = 0x80 | (input >> 12) & ~(0xFFFFFFC0);
+		    (logior #x80 (logand (lsh code-point -12) #x3f))
+		    ;; ochars[2] = 0x80 | (input >> 6) & ~(0xFFFFFFC0); 
+		    (logior #x80 (logand (lsh code-point -6) #x3f))
+		    ;; ochars[3] = 0x80 | input & ~(0xFFFFFFC0); 
+		    (logior #x80 (logand code-point #x3f))))))))
+
 ;;-----------------------------------------------------------------
 ;; Test string modification functions that modify the length of a char.
 ;;-----------------------------------------------------------------
@@ -336,6 +368,37 @@
   
     (Check-Error wrong-type-argument (set-unicode-conversion scaron -10000)))
 
+  (dolist (utf-8-char 
+	   '("\xc6\x92"		  ;; U+0192 LATIN SMALL LETTER F WITH HOOK
+	     "\xe2\x81\x8a"	  ;; U+204A TIRONIAN SIGN ET
+	     "\xe2\x82\xae"	  ;; U+20AE TUGRIK SIGN
+	     "\xf0\x9d\x92\xbd"	  ;; U+1D4BD MATHEMATICAL SCRIPT SMALL H
+	     "\xf0\x9d\x96\x93"   ;; U+1D593 MATHEMATICAL BOLD FRAKTUR SMALL N
+	     "\xf0\xaf\xa8\x88"   ;; U+2FA08 CJK COMPATIBILITY FOR U+4BCE
+	     "\xf4\x8f\xbf\xbd")) ;; U+10FFFD <Plane 16 Private Use, Last>
+    (let* ((xemacs-character (car (append 
+				  (decode-coding-string utf-8-char 'utf-8) 
+				  nil)))
+	   (xemacs-charset (car (split-char xemacs-character))))
+
+      ;; Trivial test of the UTF-8 support of the escape-quoted character set. 
+      (Assert (equal (decode-coding-string utf-8-char 'utf-8)
+		     (decode-coding-string (concat "\033%G" utf-8-char)
+					   'escape-quoted)))
+
+      ;; Check that the reverse mapping holds. 
+      (Assert (equal (unicode-code-point-to-utf-8-string 
+		      (encode-char xemacs-character 'ucs))
+		     utf-8-char))
+
+      ;; Check that, if this character has been JIT-allocated, it is encoded
+      ;; in escape-quoted using the corresponding UTF-8 escape. 
+      (when (charset-property xemacs-charset 'encode-as-utf-8)
+	(Assert (equal (concat "\033%G" utf-8-char)
+		       (encode-coding-string xemacs-character 'escape-quoted)))
+	(Assert (equal (concat "\033%G" utf-8-char)
+		       (encode-coding-string xemacs-character 'ctext))))))
+
   ;;---------------------------------------------------------------
   ;; Test charset-in-* functions
   ;;---------------------------------------------------------------