Mercurial > hg > xemacs-beta
diff src/unicode.c @ 3439:d1754e7f0cea
[xemacs-hg @ 2006-06-03 17:50:39 by aidan]
Just-in-time Unicode code point support.
author | aidan |
---|---|
date | Sat, 03 Jun 2006 17:51:06 +0000 |
parents | 8dbdcd070418 |
children | 551c008d3777 |
line wrap: on
line diff
--- a/src/unicode.c Fri Jun 02 22:18:08 2006 +0000 +++ b/src/unicode.c Sat Jun 03 17:51:06 2006 +0000 @@ -321,6 +321,10 @@ Lisp_Object Qignore_first_column; +Lisp_Object Vcurrent_jit_charset; +Lisp_Object Qlast_allocated_character; +Lisp_Object Qccl_encode_to_ucs_2; + /************************************************************************/ /* Unicode implementation */ @@ -1001,12 +1005,72 @@ } static Ichar +get_free_codepoint(Lisp_Object charset) +{ + Lisp_Object name = Fcharset_name(charset); + Lisp_Object zeichen = Fget(name, Qlast_allocated_character, Qnil); + Ichar res; + + /* Only allow this with the 96x96 character sets we are using for + temporary Unicode support. */ + assert(2 == XCHARSET_DIMENSION(charset) && 96 == XCHARSET_CHARS(charset)); + + if (!NILP(zeichen)) + { + int c1, c2; + + BREAKUP_ICHAR(XCHAR(zeichen), charset, c1, c2); + + if (127 == c1 && 127 == c2) + { + /* We've already used the hightest-numbered character in this + set--tell our caller to create another. */ + return -1; + } + + if (127 == c2) + { + ++c1; + c2 = 0x20; + } + else + { + ++c2; + } + + res = make_ichar(charset, c1, c2); + Fput(name, Qlast_allocated_character, make_char(res)); + } + else + { + res = make_ichar(charset, 32, 32); + Fput(name, Qlast_allocated_character, make_char(res)); + } + return res; +} + +/* The just-in-time creation of XEmacs characters that correspond to unknown + Unicode code points happens when: + + 1. The lookup would otherwise fail. + + 2. The charsets array is the nil or the default. + + If there are no free code points in the just-in-time Unicode character + set, and the charsets array is the default unicode precedence list, + create a new just-in-time Unicode character set, add it at the end of the + unicode precedence list, create the XEmacs character in that character + set, and return it. */ + +static Ichar unicode_to_ichar (int code, Lisp_Object_dynarr *charsets) { int u1, u2, u3, u4; int code_levels; int i; int n = Dynarr_length (charsets); + static int number_of_jit_charsets; + static Ascbyte last_jit_charset_final; type_checking_assert (code >= 0); /* This shortcut depends on the representation of an Ichar, see text.c. @@ -1040,8 +1104,64 @@ return make_ichar (charset, retval >> 8, retval & 0xFF); } } + + /* Only do the magic just-in-time assignment if we're using the default + list. */ + if (unicode_precedence_dynarr == charsets) + { + if (NILP (Vcurrent_jit_charset) || + (-1 == (i = get_free_codepoint(Vcurrent_jit_charset)))) + { + Ascbyte setname[32]; + Lisp_Object charset_descr = build_string + ("Mule charset for otherwise unknown Unicode code points."); + Lisp_Object charset_regr = build_string("iso10646-1"); - return (Ichar) -1; + struct gcpro gcpro1, gcpro2; + + if ('\0' == last_jit_charset_final) + { + /* This final byte shit is, umm, not that cool. */ + last_jit_charset_final = 0x30; + } + + snprintf(setname, sizeof(setname), + "jit-ucs-charset-%d", number_of_jit_charsets++); + + /* Aside: GCPROing here would be overkill according to the FSF's + philosophy. make-charset cannot currently GC, but is intended + to be called from Lisp, with its arguments protected by the + Lisp reader. We GCPRO in case it GCs in the future and no-one + checks all the C callers. */ + + GCPRO2 (charset_descr, charset_regr); + Vcurrent_jit_charset = Fmake_charset + (intern(setname), charset_descr, + /* Set encode-as-utf-8 to t, to have this character set written + using UTF-8 escapes in escape-quoted and ctext. This + sidesteps the fact that our internal character -> Unicode + mapping is not stable from one invocation to the next. */ + nconc2 (list2(Qencode_as_utf_8, Qt), + nconc2 (list6(Qcolumns, make_int(1), Qchars, make_int(96), + Qdimension, make_int(2)), + list6(Qregistry, charset_regr, + Qfinal, make_char(last_jit_charset_final++), + /* This CCL program is initialised in + unicode.el. */ + Qccl_program, Qccl_encode_to_ucs_2)))); + UNGCPRO; + + i = get_free_codepoint(Vcurrent_jit_charset); + } + + if (-1 != i) + { + set_unicode_conversion((Ichar)i, code); + /* No need to add the charset to the end of the list; it's done + automatically. */ + } + } + return (Ichar) i; } /* Add charsets to precedence list. @@ -1284,37 +1404,13 @@ present), this function simply does `int-to-char' and ignores the CHARSETS argument. -Note that the current XEmacs internal encoding has no mapping for many -Unicode code points, and if you use characters that are vaguely obscure with -XEmacs' Unicode coding systems, you will lose data. - -To add support for some desired code point in the short term--note that our -intention is to move to a Unicode-compatible internal encoding soon, for -some value of soon--if you are a distributor, add something like the -following to `site-start.el.' - -(make-charset 'distro-name-private - "Private character set for DISTRO" - '(dimension 1 - chars 96 - columns 1 - final ?5 ;; Change this--see docs for make-charset - long-name "Private charset for some Unicode char support." - short-name "Distro-Private")) - -(set-unicode-conversion - (make-char 'distro-name-private #x20) #x263A) ;; WHITE SMILING FACE - -(set-unicode-conversion - (make-char 'distro-name-private #x21) #x3030) ;; WAVY DASH - -;; ... -;;; Repeat as necessary. - -Redisplay will work on the sjt-xft branch, but not with server-side X11 -fonts as is the default. However, data read in will be preserved when they -are written out again. - +If the CODE would not otherwise be converted to an XEmacs character, and the +list of character sets to be consulted is nil or the default, a new XEmacs +character will be created for it in one of the `jit-ucs-charset' Mule +character sets, and that character will be returned. There is scope for +tens of thousands of separate Unicode code points in every session using +this technique, so despite XEmacs' internal encoding not being based on +Unicode, your data won't be trashed. */ (code, USED_IF_MULE (charsets))) { @@ -1558,16 +1654,6 @@ /* Unicode coding system */ /************************************************************************/ -/* ISO 10646 UTF-16, UCS-4, UTF-8, UTF-7, etc. */ - -enum unicode_type -{ - UNICODE_UTF_16, - UNICODE_UTF_8, - UNICODE_UTF_7, - UNICODE_UCS_4 -}; - struct unicode_coding_system { enum unicode_type type; @@ -1728,7 +1814,9 @@ } } -static void +/* Also used in mule-coding.c for UTF-8 handling in ISO 2022-oriented + encodings. */ +void encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h, int USED_IF_MULE (l), unsigned_char_dynarr *dst, enum unicode_type type, unsigned int little_endian) @@ -2444,6 +2532,8 @@ DEFSUBR (Fload_unicode_mapping_table); + DEFSYMBOL (Qccl_encode_to_ucs_2); + DEFSYMBOL (Qlast_allocated_character); DEFSYMBOL (Qignore_first_column); #endif /* MULE */ @@ -2519,6 +2609,9 @@ init_blank_unicode_tables (); + staticpro (&Vcurrent_jit_charset); + Vcurrent_jit_charset = Qnil; + /* Note that the "block" we are describing is a single pointer, and hence we could potentially use dump_add_root_block_ptr(). However, given the way the descriptions are written, we couldn't use them, and would