comparison src/unicode.c @ 3439:d1754e7f0cea

[xemacs-hg @ 2006-06-03 17:50:39 by aidan] Just-in-time Unicode code point support.
author aidan
date Sat, 03 Jun 2006 17:51:06 +0000
parents 8dbdcd070418
children 551c008d3777
comparison
equal deleted inserted replaced
3438:14fbcab7c67b 3439:d1754e7f0cea
318 318
319 Lisp_Object Vlanguage_unicode_precedence_list; 319 Lisp_Object Vlanguage_unicode_precedence_list;
320 Lisp_Object Vdefault_unicode_precedence_list; 320 Lisp_Object Vdefault_unicode_precedence_list;
321 321
322 Lisp_Object Qignore_first_column; 322 Lisp_Object Qignore_first_column;
323
324 Lisp_Object Vcurrent_jit_charset;
325 Lisp_Object Qlast_allocated_character;
326 Lisp_Object Qccl_encode_to_ucs_2;
323 327
324 328
325 /************************************************************************/ 329 /************************************************************************/
326 /* Unicode implementation */ 330 /* Unicode implementation */
327 /************************************************************************/ 331 /************************************************************************/
999 else 1003 else
1000 return ((int **) XCHARSET_TO_UNICODE_TABLE (charset))[c1 - 32][c2 - 32]; 1004 return ((int **) XCHARSET_TO_UNICODE_TABLE (charset))[c1 - 32][c2 - 32];
1001 } 1005 }
1002 1006
1003 static Ichar 1007 static Ichar
1008 get_free_codepoint(Lisp_Object charset)
1009 {
1010 Lisp_Object name = Fcharset_name(charset);
1011 Lisp_Object zeichen = Fget(name, Qlast_allocated_character, Qnil);
1012 Ichar res;
1013
1014 /* Only allow this with the 96x96 character sets we are using for
1015 temporary Unicode support. */
1016 assert(2 == XCHARSET_DIMENSION(charset) && 96 == XCHARSET_CHARS(charset));
1017
1018 if (!NILP(zeichen))
1019 {
1020 int c1, c2;
1021
1022 BREAKUP_ICHAR(XCHAR(zeichen), charset, c1, c2);
1023
1024 if (127 == c1 && 127 == c2)
1025 {
1026 /* We've already used the hightest-numbered character in this
1027 set--tell our caller to create another. */
1028 return -1;
1029 }
1030
1031 if (127 == c2)
1032 {
1033 ++c1;
1034 c2 = 0x20;
1035 }
1036 else
1037 {
1038 ++c2;
1039 }
1040
1041 res = make_ichar(charset, c1, c2);
1042 Fput(name, Qlast_allocated_character, make_char(res));
1043 }
1044 else
1045 {
1046 res = make_ichar(charset, 32, 32);
1047 Fput(name, Qlast_allocated_character, make_char(res));
1048 }
1049 return res;
1050 }
1051
1052 /* The just-in-time creation of XEmacs characters that correspond to unknown
1053 Unicode code points happens when:
1054
1055 1. The lookup would otherwise fail.
1056
1057 2. The charsets array is the nil or the default.
1058
1059 If there are no free code points in the just-in-time Unicode character
1060 set, and the charsets array is the default unicode precedence list,
1061 create a new just-in-time Unicode character set, add it at the end of the
1062 unicode precedence list, create the XEmacs character in that character
1063 set, and return it. */
1064
1065 static Ichar
1004 unicode_to_ichar (int code, Lisp_Object_dynarr *charsets) 1066 unicode_to_ichar (int code, Lisp_Object_dynarr *charsets)
1005 { 1067 {
1006 int u1, u2, u3, u4; 1068 int u1, u2, u3, u4;
1007 int code_levels; 1069 int code_levels;
1008 int i; 1070 int i;
1009 int n = Dynarr_length (charsets); 1071 int n = Dynarr_length (charsets);
1072 static int number_of_jit_charsets;
1073 static Ascbyte last_jit_charset_final;
1010 1074
1011 type_checking_assert (code >= 0); 1075 type_checking_assert (code >= 0);
1012 /* This shortcut depends on the representation of an Ichar, see text.c. 1076 /* This shortcut depends on the representation of an Ichar, see text.c.
1013 Note that it may _not_ be extended to U+00A0 to U+00FF (many ISO 8859 1077 Note that it may _not_ be extended to U+00A0 to U+00FF (many ISO 8859
1014 coded character sets have points that map into that region, so this 1078 coded character sets have points that map into that region, so this
1038 1102
1039 if (retval != -1) 1103 if (retval != -1)
1040 return make_ichar (charset, retval >> 8, retval & 0xFF); 1104 return make_ichar (charset, retval >> 8, retval & 0xFF);
1041 } 1105 }
1042 } 1106 }
1043 1107
1044 return (Ichar) -1; 1108 /* Only do the magic just-in-time assignment if we're using the default
1109 list. */
1110 if (unicode_precedence_dynarr == charsets)
1111 {
1112 if (NILP (Vcurrent_jit_charset) ||
1113 (-1 == (i = get_free_codepoint(Vcurrent_jit_charset))))
1114 {
1115 Ascbyte setname[32];
1116 Lisp_Object charset_descr = build_string
1117 ("Mule charset for otherwise unknown Unicode code points.");
1118 Lisp_Object charset_regr = build_string("iso10646-1");
1119
1120 struct gcpro gcpro1, gcpro2;
1121
1122 if ('\0' == last_jit_charset_final)
1123 {
1124 /* This final byte shit is, umm, not that cool. */
1125 last_jit_charset_final = 0x30;
1126 }
1127
1128 snprintf(setname, sizeof(setname),
1129 "jit-ucs-charset-%d", number_of_jit_charsets++);
1130
1131 /* Aside: GCPROing here would be overkill according to the FSF's
1132 philosophy. make-charset cannot currently GC, but is intended
1133 to be called from Lisp, with its arguments protected by the
1134 Lisp reader. We GCPRO in case it GCs in the future and no-one
1135 checks all the C callers. */
1136
1137 GCPRO2 (charset_descr, charset_regr);
1138 Vcurrent_jit_charset = Fmake_charset
1139 (intern(setname), charset_descr,
1140 /* Set encode-as-utf-8 to t, to have this character set written
1141 using UTF-8 escapes in escape-quoted and ctext. This
1142 sidesteps the fact that our internal character -> Unicode
1143 mapping is not stable from one invocation to the next. */
1144 nconc2 (list2(Qencode_as_utf_8, Qt),
1145 nconc2 (list6(Qcolumns, make_int(1), Qchars, make_int(96),
1146 Qdimension, make_int(2)),
1147 list6(Qregistry, charset_regr,
1148 Qfinal, make_char(last_jit_charset_final++),
1149 /* This CCL program is initialised in
1150 unicode.el. */
1151 Qccl_program, Qccl_encode_to_ucs_2))));
1152 UNGCPRO;
1153
1154 i = get_free_codepoint(Vcurrent_jit_charset);
1155 }
1156
1157 if (-1 != i)
1158 {
1159 set_unicode_conversion((Ichar)i, code);
1160 /* No need to add the charset to the end of the list; it's done
1161 automatically. */
1162 }
1163 }
1164 return (Ichar) i;
1045 } 1165 }
1046 1166
1047 /* Add charsets to precedence list. 1167 /* Add charsets to precedence list.
1048 LIST must be a list of charsets. Charsets which are in the list more 1168 LIST must be a list of charsets. Charsets which are in the list more
1049 than once are given the precedence implied by their earliest appearance. 1169 than once are given the precedence implied by their earliest appearance.
1282 1402
1283 When there is no international support (i.e. the `mule' feature is not 1403 When there is no international support (i.e. the `mule' feature is not
1284 present), this function simply does `int-to-char' and ignores the CHARSETS 1404 present), this function simply does `int-to-char' and ignores the CHARSETS
1285 argument. 1405 argument.
1286 1406
1287 Note that the current XEmacs internal encoding has no mapping for many 1407 If the CODE would not otherwise be converted to an XEmacs character, and the
1288 Unicode code points, and if you use characters that are vaguely obscure with 1408 list of character sets to be consulted is nil or the default, a new XEmacs
1289 XEmacs' Unicode coding systems, you will lose data. 1409 character will be created for it in one of the `jit-ucs-charset' Mule
1290 1410 character sets, and that character will be returned. There is scope for
1291 To add support for some desired code point in the short term--note that our 1411 tens of thousands of separate Unicode code points in every session using
1292 intention is to move to a Unicode-compatible internal encoding soon, for 1412 this technique, so despite XEmacs' internal encoding not being based on
1293 some value of soon--if you are a distributor, add something like the 1413 Unicode, your data won't be trashed.
1294 following to `site-start.el.'
1295
1296 (make-charset 'distro-name-private
1297 "Private character set for DISTRO"
1298 '(dimension 1
1299 chars 96
1300 columns 1
1301 final ?5 ;; Change this--see docs for make-charset
1302 long-name "Private charset for some Unicode char support."
1303 short-name "Distro-Private"))
1304
1305 (set-unicode-conversion
1306 (make-char 'distro-name-private #x20) #x263A) ;; WHITE SMILING FACE
1307
1308 (set-unicode-conversion
1309 (make-char 'distro-name-private #x21) #x3030) ;; WAVY DASH
1310
1311 ;; ...
1312 ;;; Repeat as necessary.
1313
1314 Redisplay will work on the sjt-xft branch, but not with server-side X11
1315 fonts as is the default. However, data read in will be preserved when they
1316 are written out again.
1317
1318 */ 1414 */
1319 (code, USED_IF_MULE (charsets))) 1415 (code, USED_IF_MULE (charsets)))
1320 { 1416 {
1321 #ifdef MULE 1417 #ifdef MULE
1322 Lisp_Object_dynarr *dyn; 1418 Lisp_Object_dynarr *dyn;
1556 1652
1557 /************************************************************************/ 1653 /************************************************************************/
1558 /* Unicode coding system */ 1654 /* Unicode coding system */
1559 /************************************************************************/ 1655 /************************************************************************/
1560 1656
1561 /* ISO 10646 UTF-16, UCS-4, UTF-8, UTF-7, etc. */
1562
1563 enum unicode_type
1564 {
1565 UNICODE_UTF_16,
1566 UNICODE_UTF_8,
1567 UNICODE_UTF_7,
1568 UNICODE_UCS_4
1569 };
1570
1571 struct unicode_coding_system 1657 struct unicode_coding_system
1572 { 1658 {
1573 enum unicode_type type; 1659 enum unicode_type type;
1574 unsigned int little_endian :1; 1660 unsigned int little_endian :1;
1575 unsigned int need_bom :1; 1661 unsigned int need_bom :1;
1726 1812
1727 default: ABORT (); 1813 default: ABORT ();
1728 } 1814 }
1729 } 1815 }
1730 1816
1731 static void 1817 /* Also used in mule-coding.c for UTF-8 handling in ISO 2022-oriented
1818 encodings. */
1819 void
1732 encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h, 1820 encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h,
1733 int USED_IF_MULE (l), unsigned_char_dynarr *dst, 1821 int USED_IF_MULE (l), unsigned_char_dynarr *dst,
1734 enum unicode_type type, unsigned int little_endian) 1822 enum unicode_type type, unsigned int little_endian)
1735 { 1823 {
1736 #ifdef MULE 1824 #ifdef MULE
2442 DEFSUBR (Fdefault_unicode_precedence_list); 2530 DEFSUBR (Fdefault_unicode_precedence_list);
2443 DEFSUBR (Fset_unicode_conversion); 2531 DEFSUBR (Fset_unicode_conversion);
2444 2532
2445 DEFSUBR (Fload_unicode_mapping_table); 2533 DEFSUBR (Fload_unicode_mapping_table);
2446 2534
2535 DEFSYMBOL (Qccl_encode_to_ucs_2);
2536 DEFSYMBOL (Qlast_allocated_character);
2447 DEFSYMBOL (Qignore_first_column); 2537 DEFSYMBOL (Qignore_first_column);
2448 #endif /* MULE */ 2538 #endif /* MULE */
2449 2539
2450 DEFSUBR (Fchar_to_unicode); 2540 DEFSUBR (Fchar_to_unicode);
2451 DEFSUBR (Funicode_to_char); 2541 DEFSUBR (Funicode_to_char);
2516 unicode_precedence_dynarr = Dynarr_new (Lisp_Object); 2606 unicode_precedence_dynarr = Dynarr_new (Lisp_Object);
2517 dump_add_root_block_ptr (&unicode_precedence_dynarr, 2607 dump_add_root_block_ptr (&unicode_precedence_dynarr,
2518 &lisp_object_dynarr_description); 2608 &lisp_object_dynarr_description);
2519 2609
2520 init_blank_unicode_tables (); 2610 init_blank_unicode_tables ();
2611
2612 staticpro (&Vcurrent_jit_charset);
2613 Vcurrent_jit_charset = Qnil;
2521 2614
2522 /* Note that the "block" we are describing is a single pointer, and hence 2615 /* Note that the "block" we are describing is a single pointer, and hence
2523 we could potentially use dump_add_root_block_ptr(). However, given 2616 we could potentially use dump_add_root_block_ptr(). However, given
2524 the way the descriptions are written, we couldn't use them, and would 2617 the way the descriptions are written, we couldn't use them, and would
2525 have to write new descriptions for each of the pointers below, since 2618 have to write new descriptions for each of the pointers below, since