Mercurial > hg > xemacs-beta
comparison src/unicode.c @ 3439:d1754e7f0cea
[xemacs-hg @ 2006-06-03 17:50:39 by aidan]
Just-in-time Unicode code point support.
author | aidan |
---|---|
date | Sat, 03 Jun 2006 17:51:06 +0000 |
parents | 8dbdcd070418 |
children | 551c008d3777 |
comparison
equal
deleted
inserted
replaced
3438:14fbcab7c67b | 3439:d1754e7f0cea |
---|---|
318 | 318 |
319 Lisp_Object Vlanguage_unicode_precedence_list; | 319 Lisp_Object Vlanguage_unicode_precedence_list; |
320 Lisp_Object Vdefault_unicode_precedence_list; | 320 Lisp_Object Vdefault_unicode_precedence_list; |
321 | 321 |
322 Lisp_Object Qignore_first_column; | 322 Lisp_Object Qignore_first_column; |
323 | |
324 Lisp_Object Vcurrent_jit_charset; | |
325 Lisp_Object Qlast_allocated_character; | |
326 Lisp_Object Qccl_encode_to_ucs_2; | |
323 | 327 |
324 | 328 |
325 /************************************************************************/ | 329 /************************************************************************/ |
326 /* Unicode implementation */ | 330 /* Unicode implementation */ |
327 /************************************************************************/ | 331 /************************************************************************/ |
999 else | 1003 else |
1000 return ((int **) XCHARSET_TO_UNICODE_TABLE (charset))[c1 - 32][c2 - 32]; | 1004 return ((int **) XCHARSET_TO_UNICODE_TABLE (charset))[c1 - 32][c2 - 32]; |
1001 } | 1005 } |
1002 | 1006 |
1003 static Ichar | 1007 static Ichar |
1008 get_free_codepoint(Lisp_Object charset) | |
1009 { | |
1010 Lisp_Object name = Fcharset_name(charset); | |
1011 Lisp_Object zeichen = Fget(name, Qlast_allocated_character, Qnil); | |
1012 Ichar res; | |
1013 | |
1014 /* Only allow this with the 96x96 character sets we are using for | |
1015 temporary Unicode support. */ | |
1016 assert(2 == XCHARSET_DIMENSION(charset) && 96 == XCHARSET_CHARS(charset)); | |
1017 | |
1018 if (!NILP(zeichen)) | |
1019 { | |
1020 int c1, c2; | |
1021 | |
1022 BREAKUP_ICHAR(XCHAR(zeichen), charset, c1, c2); | |
1023 | |
1024 if (127 == c1 && 127 == c2) | |
1025 { | |
1026 /* We've already used the hightest-numbered character in this | |
1027 set--tell our caller to create another. */ | |
1028 return -1; | |
1029 } | |
1030 | |
1031 if (127 == c2) | |
1032 { | |
1033 ++c1; | |
1034 c2 = 0x20; | |
1035 } | |
1036 else | |
1037 { | |
1038 ++c2; | |
1039 } | |
1040 | |
1041 res = make_ichar(charset, c1, c2); | |
1042 Fput(name, Qlast_allocated_character, make_char(res)); | |
1043 } | |
1044 else | |
1045 { | |
1046 res = make_ichar(charset, 32, 32); | |
1047 Fput(name, Qlast_allocated_character, make_char(res)); | |
1048 } | |
1049 return res; | |
1050 } | |
1051 | |
1052 /* The just-in-time creation of XEmacs characters that correspond to unknown | |
1053 Unicode code points happens when: | |
1054 | |
1055 1. The lookup would otherwise fail. | |
1056 | |
1057 2. The charsets array is the nil or the default. | |
1058 | |
1059 If there are no free code points in the just-in-time Unicode character | |
1060 set, and the charsets array is the default unicode precedence list, | |
1061 create a new just-in-time Unicode character set, add it at the end of the | |
1062 unicode precedence list, create the XEmacs character in that character | |
1063 set, and return it. */ | |
1064 | |
1065 static Ichar | |
1004 unicode_to_ichar (int code, Lisp_Object_dynarr *charsets) | 1066 unicode_to_ichar (int code, Lisp_Object_dynarr *charsets) |
1005 { | 1067 { |
1006 int u1, u2, u3, u4; | 1068 int u1, u2, u3, u4; |
1007 int code_levels; | 1069 int code_levels; |
1008 int i; | 1070 int i; |
1009 int n = Dynarr_length (charsets); | 1071 int n = Dynarr_length (charsets); |
1072 static int number_of_jit_charsets; | |
1073 static Ascbyte last_jit_charset_final; | |
1010 | 1074 |
1011 type_checking_assert (code >= 0); | 1075 type_checking_assert (code >= 0); |
1012 /* This shortcut depends on the representation of an Ichar, see text.c. | 1076 /* This shortcut depends on the representation of an Ichar, see text.c. |
1013 Note that it may _not_ be extended to U+00A0 to U+00FF (many ISO 8859 | 1077 Note that it may _not_ be extended to U+00A0 to U+00FF (many ISO 8859 |
1014 coded character sets have points that map into that region, so this | 1078 coded character sets have points that map into that region, so this |
1038 | 1102 |
1039 if (retval != -1) | 1103 if (retval != -1) |
1040 return make_ichar (charset, retval >> 8, retval & 0xFF); | 1104 return make_ichar (charset, retval >> 8, retval & 0xFF); |
1041 } | 1105 } |
1042 } | 1106 } |
1043 | 1107 |
1044 return (Ichar) -1; | 1108 /* Only do the magic just-in-time assignment if we're using the default |
1109 list. */ | |
1110 if (unicode_precedence_dynarr == charsets) | |
1111 { | |
1112 if (NILP (Vcurrent_jit_charset) || | |
1113 (-1 == (i = get_free_codepoint(Vcurrent_jit_charset)))) | |
1114 { | |
1115 Ascbyte setname[32]; | |
1116 Lisp_Object charset_descr = build_string | |
1117 ("Mule charset for otherwise unknown Unicode code points."); | |
1118 Lisp_Object charset_regr = build_string("iso10646-1"); | |
1119 | |
1120 struct gcpro gcpro1, gcpro2; | |
1121 | |
1122 if ('\0' == last_jit_charset_final) | |
1123 { | |
1124 /* This final byte shit is, umm, not that cool. */ | |
1125 last_jit_charset_final = 0x30; | |
1126 } | |
1127 | |
1128 snprintf(setname, sizeof(setname), | |
1129 "jit-ucs-charset-%d", number_of_jit_charsets++); | |
1130 | |
1131 /* Aside: GCPROing here would be overkill according to the FSF's | |
1132 philosophy. make-charset cannot currently GC, but is intended | |
1133 to be called from Lisp, with its arguments protected by the | |
1134 Lisp reader. We GCPRO in case it GCs in the future and no-one | |
1135 checks all the C callers. */ | |
1136 | |
1137 GCPRO2 (charset_descr, charset_regr); | |
1138 Vcurrent_jit_charset = Fmake_charset | |
1139 (intern(setname), charset_descr, | |
1140 /* Set encode-as-utf-8 to t, to have this character set written | |
1141 using UTF-8 escapes in escape-quoted and ctext. This | |
1142 sidesteps the fact that our internal character -> Unicode | |
1143 mapping is not stable from one invocation to the next. */ | |
1144 nconc2 (list2(Qencode_as_utf_8, Qt), | |
1145 nconc2 (list6(Qcolumns, make_int(1), Qchars, make_int(96), | |
1146 Qdimension, make_int(2)), | |
1147 list6(Qregistry, charset_regr, | |
1148 Qfinal, make_char(last_jit_charset_final++), | |
1149 /* This CCL program is initialised in | |
1150 unicode.el. */ | |
1151 Qccl_program, Qccl_encode_to_ucs_2)))); | |
1152 UNGCPRO; | |
1153 | |
1154 i = get_free_codepoint(Vcurrent_jit_charset); | |
1155 } | |
1156 | |
1157 if (-1 != i) | |
1158 { | |
1159 set_unicode_conversion((Ichar)i, code); | |
1160 /* No need to add the charset to the end of the list; it's done | |
1161 automatically. */ | |
1162 } | |
1163 } | |
1164 return (Ichar) i; | |
1045 } | 1165 } |
1046 | 1166 |
1047 /* Add charsets to precedence list. | 1167 /* Add charsets to precedence list. |
1048 LIST must be a list of charsets. Charsets which are in the list more | 1168 LIST must be a list of charsets. Charsets which are in the list more |
1049 than once are given the precedence implied by their earliest appearance. | 1169 than once are given the precedence implied by their earliest appearance. |
1282 | 1402 |
1283 When there is no international support (i.e. the `mule' feature is not | 1403 When there is no international support (i.e. the `mule' feature is not |
1284 present), this function simply does `int-to-char' and ignores the CHARSETS | 1404 present), this function simply does `int-to-char' and ignores the CHARSETS |
1285 argument. | 1405 argument. |
1286 | 1406 |
1287 Note that the current XEmacs internal encoding has no mapping for many | 1407 If the CODE would not otherwise be converted to an XEmacs character, and the |
1288 Unicode code points, and if you use characters that are vaguely obscure with | 1408 list of character sets to be consulted is nil or the default, a new XEmacs |
1289 XEmacs' Unicode coding systems, you will lose data. | 1409 character will be created for it in one of the `jit-ucs-charset' Mule |
1290 | 1410 character sets, and that character will be returned. There is scope for |
1291 To add support for some desired code point in the short term--note that our | 1411 tens of thousands of separate Unicode code points in every session using |
1292 intention is to move to a Unicode-compatible internal encoding soon, for | 1412 this technique, so despite XEmacs' internal encoding not being based on |
1293 some value of soon--if you are a distributor, add something like the | 1413 Unicode, your data won't be trashed. |
1294 following to `site-start.el.' | |
1295 | |
1296 (make-charset 'distro-name-private | |
1297 "Private character set for DISTRO" | |
1298 '(dimension 1 | |
1299 chars 96 | |
1300 columns 1 | |
1301 final ?5 ;; Change this--see docs for make-charset | |
1302 long-name "Private charset for some Unicode char support." | |
1303 short-name "Distro-Private")) | |
1304 | |
1305 (set-unicode-conversion | |
1306 (make-char 'distro-name-private #x20) #x263A) ;; WHITE SMILING FACE | |
1307 | |
1308 (set-unicode-conversion | |
1309 (make-char 'distro-name-private #x21) #x3030) ;; WAVY DASH | |
1310 | |
1311 ;; ... | |
1312 ;;; Repeat as necessary. | |
1313 | |
1314 Redisplay will work on the sjt-xft branch, but not with server-side X11 | |
1315 fonts as is the default. However, data read in will be preserved when they | |
1316 are written out again. | |
1317 | |
1318 */ | 1414 */ |
1319 (code, USED_IF_MULE (charsets))) | 1415 (code, USED_IF_MULE (charsets))) |
1320 { | 1416 { |
1321 #ifdef MULE | 1417 #ifdef MULE |
1322 Lisp_Object_dynarr *dyn; | 1418 Lisp_Object_dynarr *dyn; |
1556 | 1652 |
1557 /************************************************************************/ | 1653 /************************************************************************/ |
1558 /* Unicode coding system */ | 1654 /* Unicode coding system */ |
1559 /************************************************************************/ | 1655 /************************************************************************/ |
1560 | 1656 |
1561 /* ISO 10646 UTF-16, UCS-4, UTF-8, UTF-7, etc. */ | |
1562 | |
1563 enum unicode_type | |
1564 { | |
1565 UNICODE_UTF_16, | |
1566 UNICODE_UTF_8, | |
1567 UNICODE_UTF_7, | |
1568 UNICODE_UCS_4 | |
1569 }; | |
1570 | |
1571 struct unicode_coding_system | 1657 struct unicode_coding_system |
1572 { | 1658 { |
1573 enum unicode_type type; | 1659 enum unicode_type type; |
1574 unsigned int little_endian :1; | 1660 unsigned int little_endian :1; |
1575 unsigned int need_bom :1; | 1661 unsigned int need_bom :1; |
1726 | 1812 |
1727 default: ABORT (); | 1813 default: ABORT (); |
1728 } | 1814 } |
1729 } | 1815 } |
1730 | 1816 |
1731 static void | 1817 /* Also used in mule-coding.c for UTF-8 handling in ISO 2022-oriented |
1818 encodings. */ | |
1819 void | |
1732 encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h, | 1820 encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h, |
1733 int USED_IF_MULE (l), unsigned_char_dynarr *dst, | 1821 int USED_IF_MULE (l), unsigned_char_dynarr *dst, |
1734 enum unicode_type type, unsigned int little_endian) | 1822 enum unicode_type type, unsigned int little_endian) |
1735 { | 1823 { |
1736 #ifdef MULE | 1824 #ifdef MULE |
2442 DEFSUBR (Fdefault_unicode_precedence_list); | 2530 DEFSUBR (Fdefault_unicode_precedence_list); |
2443 DEFSUBR (Fset_unicode_conversion); | 2531 DEFSUBR (Fset_unicode_conversion); |
2444 | 2532 |
2445 DEFSUBR (Fload_unicode_mapping_table); | 2533 DEFSUBR (Fload_unicode_mapping_table); |
2446 | 2534 |
2535 DEFSYMBOL (Qccl_encode_to_ucs_2); | |
2536 DEFSYMBOL (Qlast_allocated_character); | |
2447 DEFSYMBOL (Qignore_first_column); | 2537 DEFSYMBOL (Qignore_first_column); |
2448 #endif /* MULE */ | 2538 #endif /* MULE */ |
2449 | 2539 |
2450 DEFSUBR (Fchar_to_unicode); | 2540 DEFSUBR (Fchar_to_unicode); |
2451 DEFSUBR (Funicode_to_char); | 2541 DEFSUBR (Funicode_to_char); |
2516 unicode_precedence_dynarr = Dynarr_new (Lisp_Object); | 2606 unicode_precedence_dynarr = Dynarr_new (Lisp_Object); |
2517 dump_add_root_block_ptr (&unicode_precedence_dynarr, | 2607 dump_add_root_block_ptr (&unicode_precedence_dynarr, |
2518 &lisp_object_dynarr_description); | 2608 &lisp_object_dynarr_description); |
2519 | 2609 |
2520 init_blank_unicode_tables (); | 2610 init_blank_unicode_tables (); |
2611 | |
2612 staticpro (&Vcurrent_jit_charset); | |
2613 Vcurrent_jit_charset = Qnil; | |
2521 | 2614 |
2522 /* Note that the "block" we are describing is a single pointer, and hence | 2615 /* Note that the "block" we are describing is a single pointer, and hence |
2523 we could potentially use dump_add_root_block_ptr(). However, given | 2616 we could potentially use dump_add_root_block_ptr(). However, given |
2524 the way the descriptions are written, we couldn't use them, and would | 2617 the way the descriptions are written, we couldn't use them, and would |
2525 have to write new descriptions for each of the pointers below, since | 2618 have to write new descriptions for each of the pointers below, since |