Mercurial > hg > xemacs-beta
changeset 3439:d1754e7f0cea
[xemacs-hg @ 2006-06-03 17:50:39 by aidan]
Just-in-time Unicode code point support.
author | aidan |
---|---|
date | Sat, 03 Jun 2006 17:51:06 +0000 |
parents | 14fbcab7c67b |
children | 4d0cab3a989f |
files | lisp/ChangeLog lisp/mule/mule-ccl.el lisp/unicode.el lisp/x-init.el man/ChangeLog man/lispref/mule.texi src/ChangeLog src/charset.h src/event-xlike-inc.c src/general-slots.h src/lread.c src/mule-ccl.c src/mule-charset.c src/mule-coding.c src/redisplay-x.c src/unicode.c tests/ChangeLog tests/automated/mule-tests.el |
diffstat | 18 files changed, 819 insertions(+), 123 deletions(-) [+] |
line wrap: on
line diff
--- a/lisp/ChangeLog Fri Jun 02 22:18:08 2006 +0000 +++ b/lisp/ChangeLog Sat Jun 03 17:51:06 2006 +0000 @@ -1,3 +1,21 @@ +2006-06-03 Aidan Kehoe <kehoea@parhasard.net> + + * mule/mule-ccl.el: + "X Emacs" -> "XEmacs" + * mule/mule-ccl.el (ccl-compile-mule-to-unicode): New. + * mule/mule-ccl.el (ccl-compile-unicode-to-mule): New. + * mule/mule-ccl.el (ccl-dump-mule-to-unicode): New. + * mule/mule-ccl.el (ccl-dump-unicode-to-mule): New. + * mule/mule-ccl.el (define-ccl-program): + Add two new CCL commands, and commands to describe them; document + them. + +2006-06-03 Aidan Kehoe <kehoea@parhasard.net> + + * unicode.el: + * unicode.el (featurep): + * x-init.el (x-initialize-keyboard): + 2006-06-03 Adrian Aichner <adrian@xemacs.org> * package-get.el (package-get-download-sites): Welcome
--- a/lisp/mule/mule-ccl.el Fri Jun 02 22:18:08 2006 +0000 +++ b/lisp/mule/mule-ccl.el Sat Jun 03 17:51:06 2006 +0000 @@ -5,20 +5,20 @@ ;; Keywords: CCL, mule, multilingual, character set, coding-system -;; This file is part of X Emacs. +;; This file is part of XEmacs. -;; GNU Emacs is free software; you can redistribute it and/or modify +;; XEmacs is free software; you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by ;; the Free Software Foundation; either version 2, or (at your option) ;; any later version. -;; GNU Emacs is distributed in the hope that it will be useful, +;; XEmacs is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;; GNU General Public License for more details. ;; You should have received a copy of the GNU General Public License -;; along with GNU Emacs; see the file COPYING. If not, write to the +;; along with XEmacs; see the file COPYING. If not, write to the ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, ;; Boston, MA 02111-1307, USA. @@ -48,7 +48,7 @@ [if branch loop break repeat write-repeat write-read-repeat read read-if read-branch write call end read-multibyte-character write-multibyte-character - translate-character + translate-character mule-to-unicode unicode-to-mule iterate-multiple-map map-multiple map-single] "Vector of CCL commands (symbols).") @@ -100,7 +100,9 @@ write-multibyte-character translate-character translate-character-const-tbl - nil nil nil nil nil nil nil nil nil nil nil nil ; 0x04-0x0f + mule-to-unicode + unicode-to-mule + nil nil nil nil nil nil nil nil nil nil ; 0x06-0x0f iterate-multiple-map map-multiple map-single @@ -830,6 +832,29 @@ (ccl-embed-extended-command 'translate-character rrr RRR Rrr)))) nil) +;; Compile mule-to-unicode +(defun ccl-compile-mule-to-unicode (cmd) + (if (/= (length cmd) 3) + (error "CCL: Invalid number of arguments: %s" cmd)) + (let ((RRR (nth 1 cmd)) + (rrr (nth 2 cmd))) + (ccl-check-register RRR cmd) + (ccl-check-register rrr cmd) + (ccl-embed-extended-command 'mule-to-unicode RRR rrr 0)) + nil) + +;; Given a Unicode code point in register rrr, write the charset ID of the +;; corresponding character in RRR, and the Mule-CCL form of its code in rrr. +(defun ccl-compile-unicode-to-mule (cmd) + (if (/= (length cmd) 3) + (error "CCL: Invalid number of arguments: %s" cmd)) + (let ((rrr (nth 1 cmd)) + (RRR (nth 2 cmd))) + (ccl-check-register rrr cmd) + (ccl-check-register RRR cmd) + (ccl-embed-extended-command 'unicode-to-mule rrr RRR 0)) + nil) + (defun ccl-compile-iterate-multiple-map (cmd) (ccl-compile-multiple-map-function 'iterate-multiple-map cmd) nil) @@ -1188,6 +1213,12 @@ (let ((tbl (ccl-get-next-code))) (insert (format "translation table(%S) r%d r%d\n" tbl RRR rrr)))) +(defun ccl-dump-mule-to-unicode (rrr RRR Rrr) + (insert (format "change chars in r%d and r%d to unicode\n" RRR rrr))) + +(defun ccl-dump-unicode-to-mule (rrr RRR Rrr) + (insert (format "converter UCS code %d to a Mule char\n" rrr))) + (defun ccl-dump-iterate-multiple-map (rrr RRR Rrr) (let ((notbl (ccl-get-next-code)) (i 0) id) @@ -1359,9 +1390,13 @@ ;; Call CCL program whose name is ccl-program-name. CALL := (call ccl-program-name) -TRANSLATE := ;; Not implemented under XEmacs. - (translate-character REG(table) REG(charset) REG(codepoint)) - | (translate-character SYMBOL REG(charset) REG(codepoint)) +TRANSLATE := ;; Not implemented under XEmacs, except mule-to-unicode and + ;; unicode-to-mule. + (translate-character REG(table) REG(charset) REG(codepoint)) + | (translate-character SYMBOL REG(charset) REG(codepoint)) + | (mule-to-unicode REG(charset) REG(codepoint)) + | (unicode-to-mule REG(unicode,code) REG(CHARSET)) + MAP := (iterate-multiple-map REG REG MAP-IDs) | (map-multiple REG REG (MAP-SET)) @@ -1373,8 +1408,8 @@ ;; Terminate the CCL program. END := (end) -;; CCL registers. These can contain any integer value. As r7 is used by CCL -;; interpreter itself, its value change unexpectedly. +;; CCL registers. These can contain any integer value. As r7 is used by the +;; CCL interpreter itself, its value can change unexpectedly. REG := r0 | r1 | r2 | r3 | r4 | r5 | r6 | r7 ARG := REG | INT-OR-CHAR
--- a/lisp/unicode.el Fri Jun 02 22:18:08 2006 +0000 +++ b/lisp/unicode.el Sat Jun 03 17:51:06 2006 +0000 @@ -318,6 +318,38 @@ "Sorry, encode-char doesn't yet support anything but the UCS. ") (char-to-unicode char)) +(when (featurep 'mule) + ;; This CCL program is used for displaying the fallback UCS character set, + ;; and can be repurposed to lao and the IPA, all going well. + ;; + ;; define-ccl-program is available after mule-ccl is loaded, much later + ;; than this file in the build process. The below is the result of + ;; + ;; (macroexpand + ;; '(define-ccl-program ccl-encode-to-ucs-2 + ;; `(1 + ;; ((r1 = (r1 << 8)) + ;; (r1 = (r1 | r2)) + ;; (mule-to-unicode r0 r1) + ;; (r1 = (r0 >> 8)) + ;; (r2 = (r0 & 255)))) + ;; "CCL program to transform Mule characters to UCS-2.")) + ;; + ;; and it should occasionally be confirmed that the correspondence still + ;; holds. + + (let ((prog [1 10 131127 8 98872 65823 147513 8 82009 255 22])) + (defconst ccl-encode-to-ucs-2 prog + "CCL program to transform Mule characters to UCS-2.") + (put (quote ccl-encode-to-ucs-2) (quote ccl-program-idx) + (register-ccl-program (quote ccl-encode-to-ucs-2) prog)) nil)) + +;; Won't do this just yet, though. +;; (set-charset-registry 'lao "iso10646-1") +;; (set-charset-ccl-program 'lao 'ccl-encode-to-ucs-2) +;; (set-charset-registry 'ipa "iso10646-1") +;; (set-charset-ccl-program 'ipa 'ccl-encode-to-ucs-2) + ;; #### UTF-7 is not yet implemented, and it's tricky to do. There's ;; an implementation in appendix A.1 of the Unicode Standard, Version ;; 2.0, but I don't know its licensing characteristics.
--- a/lisp/x-init.el Fri Jun 02 22:18:08 2006 +0000 +++ b/lisp/x-init.el Sat Jun 03 17:51:06 2006 +0000 @@ -247,7 +247,7 @@ (when (and (not (get (intern sym-string) 'character-of-keysym)) (string-match "^U[0-9A-F]+$" sym-string)) (pushnew (concat sym-string " ") unknown-code-points :test 'equal))) - (when unknown-code-points + (when (and (featurep 'mule) unknown-code-points) (lwarn 'key-mapping 'info "Undefined Unicode key mappings. Your keyboard has, among many others, the following keysyms defined:
--- a/man/ChangeLog Fri Jun 02 22:18:08 2006 +0000 +++ b/man/ChangeLog Sat Jun 03 17:51:06 2006 +0000 @@ -1,3 +1,10 @@ +2006-06-03 Aidan Kehoe <kehoea@parhasard.net> + + * lispref/mule.texi (CCL Syntax): + * lispref/mule.texi (CCL Statements): + Describe the mule-to-unicode and unicode-to-mule statements; + rename the section they are described in. + 2006-05-17 Stephen J. Turnbull <stephen@xemacs.org> * xemacs-faq.texi (Q2.2.3): New node.
--- a/man/lispref/mule.texi Fri Jun 02 22:18:08 2006 +0000 +++ b/man/lispref/mule.texi Sat Jun 03 17:51:06 2006 +0000 @@ -1825,6 +1825,15 @@ | (write INT-OR-CHAR) | (write string) | (write REG ARRAY) | string CALL := (call ccl-program-name) + + +TRANSLATE := ;; Not implemented under XEmacs, except mule-to-unicode and + ;; unicode-to-mule. + (translate-character REG(table) REG(charset) REG(codepoint)) + | (translate-character SYMBOL REG(charset) REG(codepoint)) + | (mule-to-unicode REG(charset) REG(codepoint)) + | (unicode-to-mule REG(unicode,code) REG(CHARSET)) + END := (end) REG := r0 | r1 | r2 | r3 | r4 | r5 | r6 | r7 @@ -1845,7 +1854,8 @@ The Emacs Code Conversion Language provides the following statement types: @dfn{set}, @dfn{if}, @dfn{branch}, @dfn{loop}, @dfn{repeat}, -@dfn{break}, @dfn{read}, @dfn{write}, @dfn{call}, and @dfn{end}. +@dfn{break}, @dfn{read}, @dfn{write}, @dfn{call}, @dfn{translate} and +@dfn{end}. @heading Set statement: @@ -1933,12 +1943,32 @@ @code{write} and @code{read} statements for the semantics of the I/O operations for each type of argument. -@heading Other control statements: +@heading Other statements: The @dfn{call} statement, written @samp{(call @var{ccl-program-name})}, executes a CCL program as a subroutine. It does not return a value to the caller, but can modify the register status. + The @dfn{mule-to-unicode} statement translates an XEmacs character into a +UCS code point, using U+FFFD REPLACEMENT CHARACTER if the given XEmacs +character has no known corresponding code point. It takes two +arguments; the first is a register in which is stored the character set +ID of the character to be translated, and into which the UCS code is +stored. The second is a register which stores the XEmacs code of the +character in question; if it is from a multidimensional character set, +like most of the East Asian national sets, it's stored as @samp{((c1 << +8) & c2)}, where @samp{c1} is the first code, and @samp{c2} the second. +(That is, as a single integer, the high-order eight bits of which encode +the first position code, and the low order bits of which encode the +second.) + + The @dfn{unicode-to-mule} statement translates a Unicode code point +(an integer) into an XEmacs character. Its first argument is a register +containing the UCS code point; the code for the correspond character +will be written into this register, in the same format as for +@samp{mule-to-unicode} The second argument is a register into which will +be written the character set ID of the converted character. + The @dfn{end} statement, written @samp{(end)}, terminates the CCL program successfully, and returns to caller (which may be a CCL program). It does not alter the status of the registers.
--- a/src/ChangeLog Fri Jun 02 22:18:08 2006 +0000 +++ b/src/ChangeLog Sat Jun 03 17:51:06 2006 +0000 @@ -1,3 +1,90 @@ +2006-06-03 Aidan Kehoe <kehoea@parhasard.net> + + * charset.h: + * charset.h (struct Lisp_Charset): + * charset.h (CHARSET_ENCODE_AS_UTF_8): + * charset.h (XCHARSET_ENCODE_AS_UTF_8): + Add a flag `encode-as-utf-8' to the Mule charset structure; if + set, it's an indication to ISO 2022-oriented coding systems that + the characters of that charset should be encoded using the ISO-IR + 196 UTF-8 escape syntax, since they're not members of any other + well-known character set we're aware of. + + Make enum unicode_type, encode_unicode_char and Funicode_to_char + available outside of unicode.c + + * lread.c: + * event-xlike-inc.c: + Use the charset.h declaration of Funicode_to_char, don't declare + it ourselves. + + My XFree86 installation has taken to passing me ASCII characters + using the Unicode keysyms; accept them too. + + * general-slots.h: + Make `ccl-program' and `encode-as-utf-8' available as symbols + generally. + + * mule-ccl.c: + Add CCL_MuleToUnicode, CCL_UnicodeToMule, implement them, enable + and debug CCL_MAKE_CHAR, have CCL_WriteMultibyteChar2 segfault + less, fix some grammar. + + * mule-charset.c (make_charset): + * mule-charset.c (Fmake_charset): + * mule-charset.c (Fcharset_property): + * mule-charset.c (complex_vars_of_mule_charset): + Require the encode_as_utf_8 property when calling make_charset (); + accept it when creating a charset from Lisp in Fmake_charset. + + * mule-coding.c: + * mule-coding.c (dynarr_add_2022_one_dimension): + * mule-coding.c (dynarr_add_2022_two_dimensions): + Add two convenience functions for iso2022_decode, to abstract out + writing UTF-8 a little. + + * mule-coding.c (enum iso_esc_flag): + Add one more state to reflect the existence of the UTF-8 escape. + + * mule-coding.c (struct iso2022_coding_stream): + Add a counter variable to the state to permit handling + variable-length UTF-8. + + * mule-coding.c (parse_iso2022_esc): + Update the function to work with ISO_STATE_UTF_8; only the ESC % @ + escape is processed in that state, everything else is ignored and + passed through by the error handler. + + * mule-coding.c (iso2022_decode): + * mule-coding.c (iso2022_designate): + * mule-coding.c (iso2022_encode): + Handle the UTF-8 escape sequences in reading and in writing ISO + 2022. + + * redisplay-x.c (separate_textual_runs): + Add a comment to the effect that the dimension stuff breaks when + using CCL programs and registries to map to a bigger charset. + + * unicode.c: + Add support for creating new characters on the fly as unknown + Unicode code points are encountered. + + * unicode.c (get_free_codepoint): New. + * unicode.c (unicode_to_ichar): Reworked to create new code points + on the fly. + * unicode.c (Funicode_to_char): Update the docstring. + * unicode.c (struct unicode_coding_system): + Move enum unicode_type into charset.h. + + * unicode.c (encode_unicode_char): + encode_unicode_char isn't static any longer, mule-coding.c uses + it. + * unicode.c (syms_of_unicode): + Make a couple of symbols available to unicode.c + * unicode.c (vars_of_unicode): + Tell the garbage collector about current_jit_charset, initialise + it. + 2006-05-25 Stephen J. Turnbull <stephen@xemacs.org> * objects-x.c (x_find_charset_font): Keep local names local; wrap
--- a/src/charset.h Fri Jun 02 22:18:08 2006 +0000 +++ b/src/charset.h Sat Jun 03 17:51:06 2006 +0000 @@ -229,6 +229,11 @@ /* Which half of font to be used to display this character set */ int graphic; + /* If set, this charset should be written out in ISO-2022-based coding + systems using the escape sequence for UTF-8, not using our internal + representation and the associated real ISO 2022 designation. */ + unsigned int encode_as_utf_8 :1; + /* If set, this is a "temporary" charset created when we encounter an unknown final. This is so that we can successfully compile and load such files. We allow a real charset to be created on top @@ -261,6 +266,7 @@ #define CHARSET_REP_BYTES(cs) ((cs)->rep_bytes) #define CHARSET_COLUMNS(cs) ((cs)->columns) #define CHARSET_GRAPHIC(cs) ((cs)->graphic) +#define CHARSET_ENCODE_AS_UTF_8(cs) ((cs)->encode_as_utf_8) #define CHARSET_TYPE(cs) ((cs)->type) #define CHARSET_DIRECTION(cs) ((cs)->direction) #define CHARSET_FINAL(cs) ((cs)->final) @@ -284,6 +290,7 @@ #define XCHARSET_REP_BYTES(cs) CHARSET_REP_BYTES (XCHARSET (cs)) #define XCHARSET_COLUMNS(cs) CHARSET_COLUMNS (XCHARSET (cs)) #define XCHARSET_GRAPHIC(cs) CHARSET_GRAPHIC (XCHARSET (cs)) +#define XCHARSET_ENCODE_AS_UTF_8(cs) CHARSET_ENCODE_AS_UTF_8 (XCHARSET (cs)) #define XCHARSET_TYPE(cs) CHARSET_TYPE (XCHARSET (cs)) #define XCHARSET_DIRECTION(cs) CHARSET_DIRECTION (XCHARSET (cs)) #define XCHARSET_FINAL(cs) CHARSET_FINAL (XCHARSET (cs)) @@ -547,6 +554,25 @@ void get_charset_limits (Lisp_Object charset, int *low, int *high); int ichar_to_unicode (Ichar chr); +EXFUN (Fcharset_name, 1); + #endif /* MULE */ +/* ISO 10646 UTF-16, UCS-4, UTF-8, UTF-7, etc. */ + +enum unicode_type +{ + UNICODE_UTF_16, + UNICODE_UTF_8, + UNICODE_UTF_7, + UNICODE_UCS_4 +}; + +void encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h, + int USED_IF_MULE (l), unsigned_char_dynarr *dst, + enum unicode_type type, unsigned int little_endian); + +EXFUN (Funicode_to_char, 2); +EXFUN (Fchar_to_unicode, 1); + #endif /* INCLUDED_charset_h_ */
--- a/src/event-xlike-inc.c Fri Jun 02 22:18:08 2006 +0000 +++ b/src/event-xlike-inc.c Sat Jun 03 17:51:06 2006 +0000 @@ -27,8 +27,6 @@ included here, not in event-xlike.c. However, event-xlike.c is always X-specific, whereas the following code isn't, in the GTK case. */ -EXFUN (Funicode_to_char, 2); /* In unicode.c. */ - static int #ifdef THIS_IS_GTK emacs_gtk_event_pending_p (int how_many) @@ -503,9 +501,9 @@ and only those should correspond directly to Unicode code points, in the range #x100-#x10FFFF; actual implementations can have the Latin 1 code points do the same thing with keysyms - #x010000A0-#x01000100. */ + #x01000000-#x01000100. */ - if (keysym >= 0x010000A0 && keysym <= 0x0110FFFF) + if (keysym >= 0x01000000 && keysym <= 0x0110FFFF) return Funicode_to_char (make_int(keysym & 0xffffff), Qnil); if ((keysym & 0xff) < 0xa0)
--- a/src/general-slots.h Fri Jun 02 22:18:08 2006 +0000 +++ b/src/general-slots.h Sat Jun 03 17:51:06 2006 +0000 @@ -73,6 +73,7 @@ SYMBOL_KEYWORD (Q_callback_ex); SYMBOL (Qcancel); SYMBOL (Qcategory); +SYMBOL (Qccl_program); SYMBOL (Qcenter); SYMBOL (Qchain); SYMBOL (Qchange); @@ -115,6 +116,7 @@ SYMBOL (Qdynarr_overhead); SYMBOL (Qemergency); SYMBOL (Qempty); +SYMBOL (Qencode_as_utf_8); SYMBOL (Qeq); SYMBOL (Qeql); SYMBOL (Qequal);
--- a/src/lread.c Fri Jun 02 22:18:08 2006 +0000 +++ b/src/lread.c Sat Jun 03 17:51:06 2006 +0000 @@ -34,6 +34,7 @@ #include "lstream.h" #include "opaque.h" #include "profile.h" +#include "charset.h" /* For Funicode_to_char. */ #include "sysfile.h" #include "sysfloat.h" @@ -208,8 +209,6 @@ static int locate_file_open_or_access_file (Ibyte *fn, int access_mode); EXFUN (Fread_from_string, 3); -EXFUN (Funicode_to_char, 2); /* In unicode.c. */ - /* When errors are signaled, the actual readcharfun should not be used as an argument if it is an lstream, so that lstreams don't escape to the Lisp level. */
--- a/src/mule-ccl.c Fri Jun 02 22:18:08 2006 +0000 +++ b/src/mule-ccl.c Sat Jun 03 17:51:06 2006 +0000 @@ -461,6 +461,16 @@ 1:ExtendedCOMMNDRrrRRRrrrXXXXX 2:ARGUMENT(Translation Table ID) */ +/* Translate a character whose code point is reg[rrr] and charset ID is + reg[RRR], into its Unicode code point, which will be written into + reg[rrr]. */ + +#define CCL_MuleToUnicode 0x04 + +/* Translate a Unicode code point, in reg[rrr], into a Mule character, + writing the charset ID into reg[RRR] and the code point into reg[Rrr]. */ + +#define CCL_UnicodeToMule 0x05 /* Iterate looking up MAPs for reg[rrr] starting from the Nth (N = reg[RRR]) MAP until some value is found. @@ -577,7 +587,6 @@ ... N:SEPARATOR_z (< 0) */ - #define MAX_MAP_SET_LEVEL 30 typedef struct @@ -837,26 +846,41 @@ CODE to that invalid byte. */ /* On XEmacs, TranslateCharacter is not supported. Thus, this - macro is not used. */ -#if 0 + macro is only used in the MuleToUnicode transformation. */ #define CCL_MAKE_CHAR(charset, code, c) \ do { \ - if ((charset) == CHARSET_ASCII) \ - (c) = (code) & 0xFF; \ - else if (CHARSET_DEFINED_P (charset) \ - && ((code) & 0x7F) >= 32 \ - && ((code) < 256 || ((code >> 7) & 0x7F) >= 32)) \ + if ((charset) == LEADING_BYTE_ASCII) \ + { \ + c = (code) & 0xFF; \ + } \ + else if ((charset) == LEADING_BYTE_CONTROL_1) \ + { \ + c = ((code) & 0xFF) - 0xA0; \ + } \ + else if (!NILP(charset_by_leading_byte(charset)) \ + && ((code) >= 32) \ + && ((code) < 256 || ((code >> 8) & 0x7F) >= 32)) \ { \ - int c1 = (code) & 0x7F, c2 = 0; \ + int c1, c2 = 0; \ \ - if ((code) >= 256) \ - c2 = c1, c1 = ((code) >> 7) & 0x7F; \ - (c) = make_ichar (charset, c1, c2); \ + if ((code) < 256) \ + { \ + c1 = (code) & 0x7F; \ + c2 = 0; \ + } \ + else \ + { \ + c1 = ((code) >> 8) & 0x7F; \ + c2 = (code) & 0x7F; \ + } \ + c = make_ichar (charset_by_leading_byte(charset), \ + c1, c2); \ } \ else \ - (c) = (code) & 0xFF; \ - } while (0) -#endif + { \ + c = (code) & 0xFF; \ + } \ + } while (0) /* Execute CCL code on SRC_BYTES length text at SOURCE. The resulting @@ -1392,9 +1416,9 @@ case CCL_TranslateCharacter: #if 0 - /* XEmacs does not have translate_char, and its - equivalent nor. We do nothing on this operation. */ - CCL_MAKE_CHAR (reg[RRR], reg[rrr], i); + /* XEmacs does not have translate_char, nor an + equivalent. We do nothing on this operation. */ + CCL_MAKE_CHAR(reg[RRR], reg[rrr], op); op = translate_char (GET_TRANSLATION_TABLE (reg[Rrr]), i, -1, 0, 0); SPLIT_CHAR (op, reg[RRR], i, j); @@ -1421,6 +1445,56 @@ #endif break; + case CCL_MuleToUnicode: + { + Lisp_Object ucs; + + CCL_MAKE_CHAR(reg[rrr], reg[RRR], op); + ucs = Fchar_to_unicode(make_char(op)); + + if (NILP(ucs)) + { + /* Uhh, char-to-unicode doesn't return nil at the + moment, only ever -1. */ + reg[rrr] = 0xFFFD; /* REPLACEMENT CHARACTER */ + } + else + { + reg[rrr] = XINT(ucs); + if (-1 == reg[rrr]) + { + reg[rrr] = 0xFFFD; /* REPLACEMENT CHARACTER */ + } + } + break; + } + + case CCL_UnicodeToMule: + { + Lisp_Object scratch; + + scratch = Funicode_to_char(make_int(reg[rrr]), Qnil); + + if (!NILP(scratch)) + { + op = XCHAR(scratch); + BREAKUP_ICHAR (op, scratch, i, j); + reg[RRR] = XCHARSET_ID(scratch); + + if (j != 0) + { + i = (i << 8) | j; + } + + reg[rrr] = i; + } + else + { + reg[rrr] = reg[RRR] = 0; + } + break; + } + case CCL_IterateMultipleMap: { Lisp_Object map, content, attrib, value;
--- a/src/mule-charset.c Fri Jun 02 22:18:08 2006 +0000 +++ b/src/mule-charset.c Sat Jun 03 17:51:06 2006 +0000 @@ -190,7 +190,7 @@ int type, int columns, int graphic, Ibyte final, int direction, Lisp_Object short_name, Lisp_Object long_name, Lisp_Object doc, - Lisp_Object reg, int overwrite) + Lisp_Object reg, int overwrite, int encode_as_utf_8) { Lisp_Object obj; Lisp_Charset *cs; @@ -240,6 +240,7 @@ CHARSET_FINAL (cs) = final; CHARSET_DOC_STRING (cs) = doc; CHARSET_REGISTRY (cs) = reg; + CHARSET_ENCODE_AS_UTF_8 (cs) = encode_as_utf_8 ? 1 : 0; CHARSET_CCL_PROGRAM (cs) = Qnil; CHARSET_REVERSE_DIRECTION_CHARSET (cs) = Qnil; @@ -454,6 +455,12 @@ is passed the octets of the character, with the high bit cleared and set depending upon whether the value of the `graphic' property is 0 or 1. +`encode-as-utf-8' + If non-nil, the charset will be written out using the UTF-8 + escape syntax in ISO 2022-oriented coding systems. Used for + supporting characters we know are part of Unicode but not of + any other known character set in escape-quoted and compound + text. */ (name, doc_string, props)) { @@ -465,6 +472,7 @@ Lisp_Object charset = Qnil; Lisp_Object ccl_program = Qnil; Lisp_Object short_name = Qnil, long_name = Qnil; + int encode_as_utf_8 = 0; Lisp_Object existing_charset; int temporary = UNBOUNDP (name); @@ -546,6 +554,11 @@ invalid_constant ("Invalid value for `direction'", value); } + else if (EQ (keyword, Qencode_as_utf_8)) + { + encode_as_utf_8 = NILP (value) ? 0 : 1; + } + else if (EQ (keyword, Qfinal)) { CHECK_CHAR_COERCE_INT (value); @@ -553,7 +566,6 @@ if (final < '0' || final > '~') invalid_constant ("Invalid value for `final'", value); } - else if (EQ (keyword, Qccl_program)) { struct ccl_program test_ccl; @@ -612,7 +624,8 @@ charset = make_charset (id, name, dimension + 2, type, columns, graphic, final, direction, short_name, long_name, - doc_string, registry, !NILP (existing_charset)); + doc_string, registry, !NILP (existing_charset), + encode_as_utf_8); XCHARSET (charset)->temporary = temporary; if (!NILP (ccl_program)) @@ -641,7 +654,7 @@ (charset, new_name)) { Lisp_Object new_charset = Qnil; - int id, dimension, columns, graphic; + int id, dimension, columns, graphic, encode_as_utf_8; Ibyte final; int direction, type; Lisp_Object registry, doc_string, short_name, long_name; @@ -672,10 +685,11 @@ short_name = CHARSET_SHORT_NAME (cs); long_name = CHARSET_LONG_NAME (cs); registry = CHARSET_REGISTRY (cs); + encode_as_utf_8 = CHARSET_ENCODE_AS_UTF_8 (cs); new_charset = make_charset (id, new_name, dimension + 2, type, columns, graphic, final, direction, short_name, long_name, - doc_string, registry, 0); + doc_string, registry, 0, encode_as_utf_8); CHARSET_REVERSE_DIRECTION_CHARSET (cs) = new_charset; XCHARSET_REVERSE_DIRECTION_CHARSET (new_charset) = charset; @@ -807,6 +821,8 @@ if (EQ (prop, Qfinal)) return make_char (CHARSET_FINAL (cs)); if (EQ (prop, Qchars)) return make_int (CHARSET_CHARS (cs)); if (EQ (prop, Qregistry)) return CHARSET_REGISTRY (cs); + if (EQ (prop, Qencode_as_utf_8)) + return CHARSET_ENCODE_AS_UTF_8 (cs) ? Qt : Qnil; if (EQ (prop, Qccl_program)) return CHARSET_CCL_PROGRAM (cs); if (EQ (prop, Qdirection)) return CHARSET_DIRECTION (cs) == CHARSET_LEFT_TO_RIGHT ? Ql2r : Qr2l; @@ -1040,7 +1056,7 @@ build_string ("ASCII"), build_msg_string ("ASCII"), build_msg_string ("ASCII (ISO646 IRV)"), - build_string ("\\(iso8859-[0-9]*\\|-ascii\\)"), 0); + build_string ("\\(iso8859-[0-9]*\\|-ascii\\)"), 0, 0); staticpro (&Vcharset_control_1); Vcharset_control_1 = make_charset (LEADING_BYTE_CONTROL_1, Qcontrol_1, 2, @@ -1049,7 +1065,7 @@ build_string ("C1"), build_msg_string ("Control characters"), build_msg_string ("Control characters 128-191"), - build_string (""), 0); + build_string (""), 0, 0); staticpro (&Vcharset_latin_iso8859_1); Vcharset_latin_iso8859_1 = make_charset (LEADING_BYTE_LATIN_ISO8859_1, Qlatin_iso8859_1, 2, @@ -1058,7 +1074,7 @@ build_string ("Latin-1"), build_msg_string ("ISO8859-1 (Latin-1)"), build_msg_string ("ISO8859-1 (Latin-1)"), - build_string ("iso8859-1"), 0); + build_string ("iso8859-1"), 0, 0); staticpro (&Vcharset_latin_iso8859_2); Vcharset_latin_iso8859_2 = make_charset (LEADING_BYTE_LATIN_ISO8859_2, Qlatin_iso8859_2, 2, @@ -1067,7 +1083,7 @@ build_string ("Latin-2"), build_msg_string ("ISO8859-2 (Latin-2)"), build_msg_string ("ISO8859-2 (Latin-2)"), - build_string ("iso8859-2"), 0); + build_string ("iso8859-2"), 0, 0); staticpro (&Vcharset_latin_iso8859_3); Vcharset_latin_iso8859_3 = make_charset (LEADING_BYTE_LATIN_ISO8859_3, Qlatin_iso8859_3, 2, @@ -1076,7 +1092,7 @@ build_string ("Latin-3"), build_msg_string ("ISO8859-3 (Latin-3)"), build_msg_string ("ISO8859-3 (Latin-3)"), - build_string ("iso8859-3"), 0); + build_string ("iso8859-3"), 0, 0); staticpro (&Vcharset_latin_iso8859_4); Vcharset_latin_iso8859_4 = make_charset (LEADING_BYTE_LATIN_ISO8859_4, Qlatin_iso8859_4, 2, @@ -1085,7 +1101,7 @@ build_string ("Latin-4"), build_msg_string ("ISO8859-4 (Latin-4)"), build_msg_string ("ISO8859-4 (Latin-4)"), - build_string ("iso8859-4"), 0); + build_string ("iso8859-4"), 0, 0); staticpro (&Vcharset_thai_tis620); Vcharset_thai_tis620 = make_charset (LEADING_BYTE_THAI_TIS620, Qthai_tis620, 2, @@ -1094,7 +1110,7 @@ build_string ("TIS620"), build_msg_string ("TIS620 (Thai)"), build_msg_string ("TIS620.2529 (Thai)"), - build_string ("tis620"),0); + build_string ("tis620"), 0, 0); staticpro (&Vcharset_greek_iso8859_7); Vcharset_greek_iso8859_7 = make_charset (LEADING_BYTE_GREEK_ISO8859_7, Qgreek_iso8859_7, 2, @@ -1103,7 +1119,7 @@ build_string ("ISO8859-7"), build_msg_string ("ISO8859-7 (Greek)"), build_msg_string ("ISO8859-7 (Greek)"), - build_string ("iso8859-7"), 0); + build_string ("iso8859-7"), 0, 0); staticpro (&Vcharset_arabic_iso8859_6); Vcharset_arabic_iso8859_6 = make_charset (LEADING_BYTE_ARABIC_ISO8859_6, Qarabic_iso8859_6, 2, @@ -1112,7 +1128,7 @@ build_string ("ISO8859-6"), build_msg_string ("ISO8859-6 (Arabic)"), build_msg_string ("ISO8859-6 (Arabic)"), - build_string ("iso8859-6"), 0); + build_string ("iso8859-6"), 0, 0); staticpro (&Vcharset_hebrew_iso8859_8); Vcharset_hebrew_iso8859_8 = make_charset (LEADING_BYTE_HEBREW_ISO8859_8, Qhebrew_iso8859_8, 2, @@ -1121,7 +1137,7 @@ build_string ("ISO8859-8"), build_msg_string ("ISO8859-8 (Hebrew)"), build_msg_string ("ISO8859-8 (Hebrew)"), - build_string ("iso8859-8"), 0); + build_string ("iso8859-8"), 0, 0); staticpro (&Vcharset_katakana_jisx0201); Vcharset_katakana_jisx0201 = make_charset (LEADING_BYTE_KATAKANA_JISX0201, Qkatakana_jisx0201, 2, @@ -1130,7 +1146,7 @@ build_string ("JISX0201 Kana"), build_msg_string ("JISX0201.1976 (Japanese Kana)"), build_msg_string ("JISX0201.1976 Japanese Kana"), - build_string ("jisx0201.1976"), 0); + build_string ("jisx0201.1976"), 0, 0); staticpro (&Vcharset_latin_jisx0201); Vcharset_latin_jisx0201 = make_charset (LEADING_BYTE_LATIN_JISX0201, Qlatin_jisx0201, 2, @@ -1139,7 +1155,7 @@ build_string ("JISX0201 Roman"), build_msg_string ("JISX0201.1976 (Japanese Roman)"), build_msg_string ("JISX0201.1976 Japanese Roman"), - build_string ("jisx0201.1976"), 0); + build_string ("jisx0201.1976"), 0, 0); staticpro (&Vcharset_cyrillic_iso8859_5); Vcharset_cyrillic_iso8859_5 = make_charset (LEADING_BYTE_CYRILLIC_ISO8859_5, Qcyrillic_iso8859_5, 2, @@ -1148,7 +1164,7 @@ build_string ("ISO8859-5"), build_msg_string ("ISO8859-5 (Cyrillic)"), build_msg_string ("ISO8859-5 (Cyrillic)"), - build_string ("iso8859-5"), 0); + build_string ("iso8859-5"), 0, 0); staticpro (&Vcharset_latin_iso8859_9); Vcharset_latin_iso8859_9 = make_charset (LEADING_BYTE_LATIN_ISO8859_9, Qlatin_iso8859_9, 2, @@ -1157,7 +1173,7 @@ build_string ("Latin-5"), build_msg_string ("ISO8859-9 (Latin-5)"), build_msg_string ("ISO8859-9 (Latin-5)"), - build_string ("iso8859-9"), 0); + build_string ("iso8859-9"), 0, 0); staticpro (&Vcharset_latin_iso8859_15); Vcharset_latin_iso8859_15 = make_charset (LEADING_BYTE_LATIN_ISO8859_15, Qlatin_iso8859_15, 2, @@ -1166,7 +1182,7 @@ build_string ("Latin-9"), build_msg_string ("ISO8859-15 (Latin-9)"), build_msg_string ("ISO8859-15 (Latin-9)"), - build_string ("iso8859-15"), 0); + build_string ("iso8859-15"), 0, 0); staticpro (&Vcharset_japanese_jisx0208_1978); Vcharset_japanese_jisx0208_1978 = make_charset (LEADING_BYTE_JAPANESE_JISX0208_1978, Qjapanese_jisx0208_1978, 3, @@ -1176,7 +1192,7 @@ build_msg_string ("JISX0208.1978 (Japanese)"), build_msg_string ("JISX0208.1978 Japanese Kanji (so called \"old JIS\")"), - build_string ("\\(jisx0208\\|jisc6226\\)\\.1978"), 0); + build_string ("\\(jisx0208\\|jisc6226\\)\\.1978"), 0, 0); staticpro (&Vcharset_chinese_gb2312); Vcharset_chinese_gb2312 = make_charset (LEADING_BYTE_CHINESE_GB2312, Qchinese_gb2312, 3, @@ -1185,7 +1201,7 @@ build_string ("GB2312"), build_msg_string ("GB2312)"), build_msg_string ("GB2312 Chinese simplified"), - build_string ("gb2312"), 0); + build_string ("gb2312"), 0, 0); staticpro (&Vcharset_japanese_jisx0208); Vcharset_japanese_jisx0208 = make_charset (LEADING_BYTE_JAPANESE_JISX0208, Qjapanese_jisx0208, 3, @@ -1194,7 +1210,7 @@ build_string ("JISX0208"), build_msg_string ("JISX0208.1983/1990 (Japanese)"), build_msg_string ("JISX0208.1983/1990 Japanese Kanji"), - build_string ("jisx0208.19\\(83\\|90\\)"), 0); + build_string ("jisx0208.19\\(83\\|90\\)"), 0, 0); staticpro (&Vcharset_korean_ksc5601); Vcharset_korean_ksc5601 = make_charset (LEADING_BYTE_KOREAN_KSC5601, Qkorean_ksc5601, 3, @@ -1203,7 +1219,7 @@ build_string ("KSC5601"), build_msg_string ("KSC5601 (Korean"), build_msg_string ("KSC5601 Korean Hangul and Hanja"), - build_string ("ksc5601"), 0); + build_string ("ksc5601"), 0, 0); staticpro (&Vcharset_japanese_jisx0212); Vcharset_japanese_jisx0212 = make_charset (LEADING_BYTE_JAPANESE_JISX0212, Qjapanese_jisx0212, 3, @@ -1212,7 +1228,7 @@ build_string ("JISX0212"), build_msg_string ("JISX0212 (Japanese)"), build_msg_string ("JISX0212 Japanese Supplement"), - build_string ("jisx0212"), 0); + build_string ("jisx0212"), 0, 0); #define CHINESE_CNS_PLANE_RE(n) "cns11643[.-]\\(.*[.-]\\)?" n "$" staticpro (&Vcharset_chinese_cns11643_1); @@ -1224,7 +1240,7 @@ build_msg_string ("CNS11643-1 (Chinese traditional)"), build_msg_string ("CNS 11643 Plane 1 Chinese traditional"), - build_string (CHINESE_CNS_PLANE_RE("1")), 0); + build_string (CHINESE_CNS_PLANE_RE("1")), 0, 0); staticpro (&Vcharset_chinese_cns11643_2); Vcharset_chinese_cns11643_2 = make_charset (LEADING_BYTE_CHINESE_CNS11643_2, Qchinese_cns11643_2, 3, @@ -1234,7 +1250,7 @@ build_msg_string ("CNS11643-2 (Chinese traditional)"), build_msg_string ("CNS 11643 Plane 2 Chinese traditional"), - build_string (CHINESE_CNS_PLANE_RE("2")), 0); + build_string (CHINESE_CNS_PLANE_RE("2")), 0, 0); staticpro (&Vcharset_chinese_big5_1); Vcharset_chinese_big5_1 = make_charset (LEADING_BYTE_CHINESE_BIG5_1, Qchinese_big5_1, 3, @@ -1244,7 +1260,7 @@ build_msg_string ("Big5 (Level-1)"), build_msg_string ("Big5 Level-1 Chinese traditional"), - build_string ("big5"), 0); + build_string ("big5"), 0, 0); staticpro (&Vcharset_chinese_big5_2); Vcharset_chinese_big5_2 = make_charset (LEADING_BYTE_CHINESE_BIG5_2, Qchinese_big5_2, 3, @@ -1254,7 +1270,7 @@ build_msg_string ("Big5 (Level-2)"), build_msg_string ("Big5 Level-2 Chinese traditional"), - build_string ("big5"), 0); + build_string ("big5"), 0, 0); #ifdef ENABLE_COMPOSITE_CHARS @@ -1269,7 +1285,7 @@ build_string ("Composite"), build_msg_string ("Composite characters"), build_msg_string ("Composite characters"), - build_string (""), 0); + build_string (""), 0, 0); #else /* We create a hack so that we have a way of storing ESC 0 and ESC 1 sequences as "characters", so that they will be output correctly. */ @@ -1281,6 +1297,6 @@ build_string ("Composite hack"), build_msg_string ("Composite characters hack"), build_msg_string ("Composite characters hack"), - build_string (""), 0); + build_string (""), 0, 0); #endif /* ENABLE_COMPOSITE_CHARS */ }
--- a/src/mule-coding.c Fri Jun 02 22:18:08 2006 +0000 +++ b/src/mule-coding.c Sat Jun 03 17:51:06 2006 +0000 @@ -96,6 +96,42 @@ return c >= 0xA1 && c <= 0xDF; } +inline static void +dynarr_add_2022_one_dimension (Lisp_Object charset, Ibyte c, + unsigned char charmask, + unsigned_char_dynarr *dst) +{ + if (XCHARSET_ENCODE_AS_UTF_8 (charset)) + { + encode_unicode_char (charset, c & charmask, 0, + dst, UNICODE_UTF_8, 0); + } + else + { + Dynarr_add (dst, c & charmask); + } +} + +inline static void +dynarr_add_2022_two_dimensions (Lisp_Object charset, Ibyte c, + unsigned int ch, + unsigned char charmask, + unsigned_char_dynarr *dst) +{ + if (XCHARSET_ENCODE_AS_UTF_8 (charset)) + { + encode_unicode_char (charset, + ch & charmask, + c & charmask, dst, + UNICODE_UTF_8, 0); + } + else + { + Dynarr_add (dst, ch & charmask); + Dynarr_add (dst, c & charmask); + } +} + /* Convert Shift-JIS data to internal format. */ static Bytecount @@ -671,6 +707,10 @@ ISO_ESC_2_4, /* We've seen ESC $. This indicates that we're designating a multi-byte, rather than a single-byte, character set. */ + ISO_ESC_2_5, /* We've seen ESC %. This indicates an escape to a + Unicode coding system; the only one of these + we're prepared to deal with is UTF-8, which has + the next character as G. */ ISO_ESC_2_8, /* We've seen ESC 0x28, i.e. ESC (. This means designate a 94-character character set into G0. */ @@ -752,11 +792,15 @@ character constructed by overstriking two or more characters). */ #define ISO_STATE_COMPOSITE (1 << 5) +/* If set, we're processing UTF-8 encoded data within ISO-2022 + processing. */ +#define ISO_STATE_UTF_8 (1 << 6) + /* ISO_STATE_LOCK is the mask of flags that remain on until explicitly turned off when in the ISO2022 encoder/decoder. Other flags are turned off at the end of processing each character or escape sequence. */ # define ISO_STATE_LOCK \ - (ISO_STATE_COMPOSITE | ISO_STATE_R2L) + (ISO_STATE_COMPOSITE | ISO_STATE_R2L | ISO_STATE_UTF_8) typedef struct charset_conversion_spec { @@ -922,6 +966,9 @@ Lisp_Object current_charset; int current_half; int current_char_boundary; + + /* Used for handling UTF-8. */ + unsigned char counter; }; static const struct memory_description ccs_description_1[] = @@ -1344,6 +1391,15 @@ } case ISO_ESC: + + /* The only available ISO 2022 sequence in UTF-8 mode is ESC % @, to + exit from it. If we see any other escape sequence, pass it through + in the error handler. */ + if (*flags & ISO_STATE_UTF_8 && '%' != c) + { + return 0; + } + switch (c) { /**** single shift ****/ @@ -1411,6 +1467,10 @@ iso->esc = ISO_ESC_2_4; goto not_done; + case '%': /* Prefix to an escape to or from Unicode. */ + iso->esc = ISO_ESC_2_5; + goto not_done; + default: if (0x28 <= c && c <= 0x2F) { @@ -1433,8 +1493,30 @@ goto error; } - - + /* ISO-IR 196 UTF-8 support. */ + case ISO_ESC_2_5: + if ('G' == c) + { + /* Activate UTF-8 mode. */ + *flags &= ISO_STATE_LOCK; + *flags |= ISO_STATE_UTF_8; + iso->esc = ISO_ESC_NOTHING; + return 1; + } + else if ('@' == c) + { + /* Deactive UTF-8 mode. */ + *flags &= ISO_STATE_LOCK; + *flags &= ~(ISO_STATE_UTF_8); + iso->esc = ISO_ESC_NOTHING; + return 1; + } + else + { + /* Oops, we don't support the other UTF-? coding systems within + ISO 2022, only in their own context. */ + goto error; + } /**** directionality ****/ case ISO_ESC_5_11: /* ISO6429 direction control */ @@ -1822,6 +1904,87 @@ } ch = 0; } + else if (flags & ISO_STATE_UTF_8) + { + unsigned char counter = data->counter; + Ibyte work[MAX_ICHAR_LEN]; + int len; + Lisp_Object chr; + + if (ISO_CODE_ESC == c) + { + /* Allow the escape sequence parser to end the UTF-8 state. */ + flags |= ISO_STATE_ESCAPE; + data->esc = ISO_ESC; + data->esc_bytes_index = 1; + continue; + } + + switch (counter) + { + case 0: + if (c >= 0xfc) + { + ch = c & 0x01; + counter = 5; + } + else if (c >= 0xf8) + { + ch = c & 0x03; + counter = 4; + } + else if (c >= 0xf0) + { + ch = c & 0x07; + counter = 3; + } + else if (c >= 0xe0) + { + ch = c & 0x0f; + counter = 2; + } + else if (c >= 0xc0) + { + ch = c & 0x1f; + counter = 1; + } + else + /* ASCII, or the lower control characters. */ + Dynarr_add (dst, c); + + break; + case 1: + ch = (ch << 6) | (c & 0x3f); + chr = Funicode_to_char(make_int(ch), Qnil); + + if (!NILP (chr)) + { + assert(CHARP(chr)); + len = set_itext_ichar (work, XCHAR(chr)); + Dynarr_add_many (dst, work, len); + } + else + { + /* Shouldn't happen, this code should only be enabled in + XEmacsen with support for all of Unicode. */ + Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208); + Dynarr_add (dst, 34 + 128); + Dynarr_add (dst, 46 + 128); + } + + ch = 0; + counter = 0; + break; + default: + ch = (ch << 6) | (c & 0x3f); + counter--; + } + + if (str->eof) + DECODE_OUTPUT_PARTIAL_CHAR (ch, dst); + + data->counter = counter; + } else if (byte_c0_p (c) || byte_c1_p (c)) { /* Control characters */ @@ -2010,6 +2173,7 @@ } Dynarr_add (dst, ISO_CODE_ESC); + switch (type) { case CHARSET_TYPE_94: @@ -2102,6 +2266,14 @@ { /* Processing ASCII character */ ch = 0; + if (flags & ISO_STATE_UTF_8) + { + Dynarr_add (dst, ISO_CODE_ESC); + Dynarr_add (dst, '%'); + Dynarr_add (dst, '@'); + flags &= ~(ISO_STATE_UTF_8); + } + restore_left_to_right_direction (codesys, dst, &flags, 0); /* Make sure G0 contains ASCII */ @@ -2145,18 +2317,43 @@ Dynarr_add (dst, c); char_boundary = 1; } - else if (ibyte_leading_byte_p (c) || ibyte_leading_byte_p (ch)) { /* Processing Leading Byte */ ch = 0; charset = charset_by_leading_byte (c); if (leading_byte_prefix_p (c)) - ch = c; + { + ch = c; + } + else if (XCHARSET_ENCODE_AS_UTF_8 (charset)) + { + assert (!EQ (charset, Vcharset_control_1) + && !EQ (charset, Vcharset_composite)); + + /* If the character set is to be encoded as UTF-8, the escape + is always the same. */ + if (!(flags & ISO_STATE_UTF_8)) + { + Dynarr_add (dst, ISO_CODE_ESC); + Dynarr_add (dst, '%'); + Dynarr_add (dst, 'G'); + flags |= ISO_STATE_UTF_8; + } + } else if (!EQ (charset, Vcharset_control_1) && !EQ (charset, Vcharset_composite)) { int reg; + /* End the UTF-8 state. */ + if (flags & ISO_STATE_UTF_8) + { + Dynarr_add (dst, ISO_CODE_ESC); + Dynarr_add (dst, '%'); + Dynarr_add (dst, '@'); + flags &= ~(ISO_STATE_UTF_8); + } + ensure_correct_direction (XCHARSET_DIRECTION (charset), codesys, dst, &flags, 0); @@ -2274,12 +2471,14 @@ switch (XCHARSET_REP_BYTES (charset)) { case 2: - Dynarr_add (dst, c & charmask); + dynarr_add_2022_one_dimension (charset, c, + charmask, dst); break; case 3: if (XCHARSET_PRIVATE_P (charset)) { - Dynarr_add (dst, c & charmask); + dynarr_add_2022_one_dimension (charset, c, + charmask, dst); ch = 0; } else if (ch) @@ -2287,6 +2486,9 @@ #ifdef ENABLE_COMPOSITE_CHARS if (EQ (charset, Vcharset_composite)) { + /* #### Hasn't been written to handle composite + characters yet. */ + assert(!XCHARSET_ENCODE_AS_UTF_8 (charset)) if (in_composite) { /* #### Bother! We don't know how to @@ -2310,8 +2512,8 @@ else #endif /* ENABLE_COMPOSITE_CHARS */ { - Dynarr_add (dst, ch & charmask); - Dynarr_add (dst, c & charmask); + dynarr_add_2022_two_dimensions (charset, c, ch, + charmask, dst); } ch = 0; } @@ -2324,8 +2526,8 @@ case 4: if (ch) { - Dynarr_add (dst, ch & charmask); - Dynarr_add (dst, c & charmask); + dynarr_add_2022_two_dimensions (charset, c, ch, + charmask, dst); ch = 0; } else
--- a/src/redisplay-x.c Fri Jun 02 22:18:08 2006 +0000 +++ b/src/redisplay-x.c Sat Jun 03 17:51:06 2006 +0000 @@ -230,6 +230,10 @@ } #endif /* MULE */ *text_storage++ = (unsigned char) byte1; + /* This dimension stuff is broken if you want to use a two-dimensional + X11 font to display a single-dimensional character set, as is + appropriate for the IPA (use one of the -iso10646-1 fonts) or some + of the other non-standard character sets. */ if (dimension == 2) *text_storage++ = (unsigned char) byte2; #else /* USE_XFT */
--- a/src/unicode.c Fri Jun 02 22:18:08 2006 +0000 +++ b/src/unicode.c Sat Jun 03 17:51:06 2006 +0000 @@ -321,6 +321,10 @@ Lisp_Object Qignore_first_column; +Lisp_Object Vcurrent_jit_charset; +Lisp_Object Qlast_allocated_character; +Lisp_Object Qccl_encode_to_ucs_2; + /************************************************************************/ /* Unicode implementation */ @@ -1001,12 +1005,72 @@ } static Ichar +get_free_codepoint(Lisp_Object charset) +{ + Lisp_Object name = Fcharset_name(charset); + Lisp_Object zeichen = Fget(name, Qlast_allocated_character, Qnil); + Ichar res; + + /* Only allow this with the 96x96 character sets we are using for + temporary Unicode support. */ + assert(2 == XCHARSET_DIMENSION(charset) && 96 == XCHARSET_CHARS(charset)); + + if (!NILP(zeichen)) + { + int c1, c2; + + BREAKUP_ICHAR(XCHAR(zeichen), charset, c1, c2); + + if (127 == c1 && 127 == c2) + { + /* We've already used the hightest-numbered character in this + set--tell our caller to create another. */ + return -1; + } + + if (127 == c2) + { + ++c1; + c2 = 0x20; + } + else + { + ++c2; + } + + res = make_ichar(charset, c1, c2); + Fput(name, Qlast_allocated_character, make_char(res)); + } + else + { + res = make_ichar(charset, 32, 32); + Fput(name, Qlast_allocated_character, make_char(res)); + } + return res; +} + +/* The just-in-time creation of XEmacs characters that correspond to unknown + Unicode code points happens when: + + 1. The lookup would otherwise fail. + + 2. The charsets array is the nil or the default. + + If there are no free code points in the just-in-time Unicode character + set, and the charsets array is the default unicode precedence list, + create a new just-in-time Unicode character set, add it at the end of the + unicode precedence list, create the XEmacs character in that character + set, and return it. */ + +static Ichar unicode_to_ichar (int code, Lisp_Object_dynarr *charsets) { int u1, u2, u3, u4; int code_levels; int i; int n = Dynarr_length (charsets); + static int number_of_jit_charsets; + static Ascbyte last_jit_charset_final; type_checking_assert (code >= 0); /* This shortcut depends on the representation of an Ichar, see text.c. @@ -1040,8 +1104,64 @@ return make_ichar (charset, retval >> 8, retval & 0xFF); } } + + /* Only do the magic just-in-time assignment if we're using the default + list. */ + if (unicode_precedence_dynarr == charsets) + { + if (NILP (Vcurrent_jit_charset) || + (-1 == (i = get_free_codepoint(Vcurrent_jit_charset)))) + { + Ascbyte setname[32]; + Lisp_Object charset_descr = build_string + ("Mule charset for otherwise unknown Unicode code points."); + Lisp_Object charset_regr = build_string("iso10646-1"); - return (Ichar) -1; + struct gcpro gcpro1, gcpro2; + + if ('\0' == last_jit_charset_final) + { + /* This final byte shit is, umm, not that cool. */ + last_jit_charset_final = 0x30; + } + + snprintf(setname, sizeof(setname), + "jit-ucs-charset-%d", number_of_jit_charsets++); + + /* Aside: GCPROing here would be overkill according to the FSF's + philosophy. make-charset cannot currently GC, but is intended + to be called from Lisp, with its arguments protected by the + Lisp reader. We GCPRO in case it GCs in the future and no-one + checks all the C callers. */ + + GCPRO2 (charset_descr, charset_regr); + Vcurrent_jit_charset = Fmake_charset + (intern(setname), charset_descr, + /* Set encode-as-utf-8 to t, to have this character set written + using UTF-8 escapes in escape-quoted and ctext. This + sidesteps the fact that our internal character -> Unicode + mapping is not stable from one invocation to the next. */ + nconc2 (list2(Qencode_as_utf_8, Qt), + nconc2 (list6(Qcolumns, make_int(1), Qchars, make_int(96), + Qdimension, make_int(2)), + list6(Qregistry, charset_regr, + Qfinal, make_char(last_jit_charset_final++), + /* This CCL program is initialised in + unicode.el. */ + Qccl_program, Qccl_encode_to_ucs_2)))); + UNGCPRO; + + i = get_free_codepoint(Vcurrent_jit_charset); + } + + if (-1 != i) + { + set_unicode_conversion((Ichar)i, code); + /* No need to add the charset to the end of the list; it's done + automatically. */ + } + } + return (Ichar) i; } /* Add charsets to precedence list. @@ -1284,37 +1404,13 @@ present), this function simply does `int-to-char' and ignores the CHARSETS argument. -Note that the current XEmacs internal encoding has no mapping for many -Unicode code points, and if you use characters that are vaguely obscure with -XEmacs' Unicode coding systems, you will lose data. - -To add support for some desired code point in the short term--note that our -intention is to move to a Unicode-compatible internal encoding soon, for -some value of soon--if you are a distributor, add something like the -following to `site-start.el.' - -(make-charset 'distro-name-private - "Private character set for DISTRO" - '(dimension 1 - chars 96 - columns 1 - final ?5 ;; Change this--see docs for make-charset - long-name "Private charset for some Unicode char support." - short-name "Distro-Private")) - -(set-unicode-conversion - (make-char 'distro-name-private #x20) #x263A) ;; WHITE SMILING FACE - -(set-unicode-conversion - (make-char 'distro-name-private #x21) #x3030) ;; WAVY DASH - -;; ... -;;; Repeat as necessary. - -Redisplay will work on the sjt-xft branch, but not with server-side X11 -fonts as is the default. However, data read in will be preserved when they -are written out again. - +If the CODE would not otherwise be converted to an XEmacs character, and the +list of character sets to be consulted is nil or the default, a new XEmacs +character will be created for it in one of the `jit-ucs-charset' Mule +character sets, and that character will be returned. There is scope for +tens of thousands of separate Unicode code points in every session using +this technique, so despite XEmacs' internal encoding not being based on +Unicode, your data won't be trashed. */ (code, USED_IF_MULE (charsets))) { @@ -1558,16 +1654,6 @@ /* Unicode coding system */ /************************************************************************/ -/* ISO 10646 UTF-16, UCS-4, UTF-8, UTF-7, etc. */ - -enum unicode_type -{ - UNICODE_UTF_16, - UNICODE_UTF_8, - UNICODE_UTF_7, - UNICODE_UCS_4 -}; - struct unicode_coding_system { enum unicode_type type; @@ -1728,7 +1814,9 @@ } } -static void +/* Also used in mule-coding.c for UTF-8 handling in ISO 2022-oriented + encodings. */ +void encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h, int USED_IF_MULE (l), unsigned_char_dynarr *dst, enum unicode_type type, unsigned int little_endian) @@ -2444,6 +2532,8 @@ DEFSUBR (Fload_unicode_mapping_table); + DEFSYMBOL (Qccl_encode_to_ucs_2); + DEFSYMBOL (Qlast_allocated_character); DEFSYMBOL (Qignore_first_column); #endif /* MULE */ @@ -2519,6 +2609,9 @@ init_blank_unicode_tables (); + staticpro (&Vcurrent_jit_charset); + Vcurrent_jit_charset = Qnil; + /* Note that the "block" we are describing is a single pointer, and hence we could potentially use dump_add_root_block_ptr(). However, given the way the descriptions are written, we couldn't use them, and would
--- a/tests/ChangeLog Fri Jun 02 22:18:08 2006 +0000 +++ b/tests/ChangeLog Sat Jun 03 17:51:06 2006 +0000 @@ -1,3 +1,13 @@ +2006-06-03 Aidan Kehoe <kehoea@parhasard.net> + + * automated/mule-tests.el: + Add checks that several Unicode characters, expressed as + UTF-8-encoded strings, are handled correctly by the UTF-8 support + of the escape-quoted character set. + * automated/mule-tests.el (unicode-code-point-to-utf-8-string): New. + Convert a Unicode code point to the equivalent UTF-8 string. + This is a naive implementation in Lisp. + 2006-05-16 Stephen J. Turnbull <stephen@xemacs.org> * XEmacs 21.5.27 "fiddleheads" is released.
--- a/tests/automated/mule-tests.el Fri Jun 02 22:18:08 2006 +0000 +++ b/tests/automated/mule-tests.el Sat Jun 03 17:51:06 2006 +0000 @@ -78,6 +78,38 @@ ;; once compiled, for no good reason. (test-chars t) +(defun unicode-code-point-to-utf-8-string (code-point) + "Convert a Unicode code point to the equivalent UTF-8 string. +This is a naive implementation in Lisp. " + (check-argument-type 'natnump code-point) + (check-argument-range code-point 0 #x1fffff) + (if (< code-point #x80) + (format "%c" code-point) + (if (< code-point #x800) + (format "%c%c" + ;; ochars[0] = 0xC0 | (input & ~(0xFFFFF83F)) >> 6; + (logior #xc0 (lsh (logand code-point #x7c0) -6)) + ;; ochars[1] = 0x80 | input & ~(0xFFFFFFC0); + (logior #x80 (logand code-point #x3f))) + (if (< code-point #x00010000) + (format "%c%c%c" + ;; ochars[0] = 0xE0 | (input >> 12) & ~(0xFFFFFFF0); + (logior #xe0 (logand (lsh code-point -12) #x0f)) + ;; ochars[1] = 0x80 | (input >> 6) & ~(0xFFFFFFC0); + (logior #x80 (logand (lsh code-point -6) #x3f)) + ;; ochars[2] = 0x80 | input & ~(0xFFFFFFC0); + (logior #x80 (logand code-point #x3f))) + (if (< code-point #x200000) + (format "%c%c%c%c" + ;; ochars[0] = 0xF0 | (input >> 18) & ~(0xFFFFFFF8) + (logior #xF0 (logand (lsh code-point -18) #x7)) + ;; ochars[1] = 0x80 | (input >> 12) & ~(0xFFFFFFC0); + (logior #x80 (logand (lsh code-point -12) #x3f)) + ;; ochars[2] = 0x80 | (input >> 6) & ~(0xFFFFFFC0); + (logior #x80 (logand (lsh code-point -6) #x3f)) + ;; ochars[3] = 0x80 | input & ~(0xFFFFFFC0); + (logior #x80 (logand code-point #x3f)))))))) + ;;----------------------------------------------------------------- ;; Test string modification functions that modify the length of a char. ;;----------------------------------------------------------------- @@ -336,6 +368,37 @@ (Check-Error wrong-type-argument (set-unicode-conversion scaron -10000))) + (dolist (utf-8-char + '("\xc6\x92" ;; U+0192 LATIN SMALL LETTER F WITH HOOK + "\xe2\x81\x8a" ;; U+204A TIRONIAN SIGN ET + "\xe2\x82\xae" ;; U+20AE TUGRIK SIGN + "\xf0\x9d\x92\xbd" ;; U+1D4BD MATHEMATICAL SCRIPT SMALL H + "\xf0\x9d\x96\x93" ;; U+1D593 MATHEMATICAL BOLD FRAKTUR SMALL N + "\xf0\xaf\xa8\x88" ;; U+2FA08 CJK COMPATIBILITY FOR U+4BCE + "\xf4\x8f\xbf\xbd")) ;; U+10FFFD <Plane 16 Private Use, Last> + (let* ((xemacs-character (car (append + (decode-coding-string utf-8-char 'utf-8) + nil))) + (xemacs-charset (car (split-char xemacs-character)))) + + ;; Trivial test of the UTF-8 support of the escape-quoted character set. + (Assert (equal (decode-coding-string utf-8-char 'utf-8) + (decode-coding-string (concat "\033%G" utf-8-char) + 'escape-quoted))) + + ;; Check that the reverse mapping holds. + (Assert (equal (unicode-code-point-to-utf-8-string + (encode-char xemacs-character 'ucs)) + utf-8-char)) + + ;; Check that, if this character has been JIT-allocated, it is encoded + ;; in escape-quoted using the corresponding UTF-8 escape. + (when (charset-property xemacs-charset 'encode-as-utf-8) + (Assert (equal (concat "\033%G" utf-8-char) + (encode-coding-string xemacs-character 'escape-quoted))) + (Assert (equal (concat "\033%G" utf-8-char) + (encode-coding-string xemacs-character 'ctext)))))) + ;;--------------------------------------------------------------- ;; Test charset-in-* functions ;;---------------------------------------------------------------