Mercurial > hg > xemacs-beta
comparison src/unicode.c @ 4268:75d0292c1bff
[xemacs-hg @ 2007-11-14 19:41:04 by aidan]
Correct the dumped information for the Unicode JIT infrastructure.
author | aidan |
---|---|
date | Wed, 14 Nov 2007 19:41:09 +0000 |
parents | 1abf84db2c7f |
children | bd9b678f4db7 |
comparison
equal
deleted
inserted
replaced
4267:66e2714696bd | 4268:75d0292c1bff |
---|---|
333 Lisp_Object Qignore_first_column; | 333 Lisp_Object Qignore_first_column; |
334 | 334 |
335 Lisp_Object Vcurrent_jit_charset; | 335 Lisp_Object Vcurrent_jit_charset; |
336 Lisp_Object Qlast_allocated_character; | 336 Lisp_Object Qlast_allocated_character; |
337 Lisp_Object Qccl_encode_to_ucs_2; | 337 Lisp_Object Qccl_encode_to_ucs_2; |
338 | |
339 Lisp_Object Vnumber_of_jit_charsets; | |
340 Lisp_Object Vlast_jit_charset_final; | |
341 Lisp_Object Vcharset_descr; | |
342 | |
338 | 343 |
339 | 344 |
340 /************************************************************************/ | 345 /************************************************************************/ |
341 /* Unicode implementation */ | 346 /* Unicode implementation */ |
342 /************************************************************************/ | 347 /************************************************************************/ |
1078 { | 1083 { |
1079 int u1, u2, u3, u4; | 1084 int u1, u2, u3, u4; |
1080 int code_levels; | 1085 int code_levels; |
1081 int i; | 1086 int i; |
1082 int n = Dynarr_length (charsets); | 1087 int n = Dynarr_length (charsets); |
1083 static int number_of_jit_charsets; | |
1084 static Ascbyte last_jit_charset_final; | |
1085 | 1088 |
1086 type_checking_assert (code >= 0); | 1089 type_checking_assert (code >= 0); |
1087 /* This shortcut depends on the representation of an Ichar, see text.c. | 1090 /* This shortcut depends on the representation of an Ichar, see text.c. |
1088 Note that it may _not_ be extended to U+00A0 to U+00FF (many ISO 8859 | 1091 Note that it may _not_ be extended to U+00A0 to U+00FF (many ISO 8859 |
1089 coded character sets have points that map into that region, so this | 1092 coded character sets have points that map into that region, so this |
1122 { | 1125 { |
1123 if (NILP (Vcurrent_jit_charset) || | 1126 if (NILP (Vcurrent_jit_charset) || |
1124 (-1 == (i = get_free_codepoint(Vcurrent_jit_charset)))) | 1127 (-1 == (i = get_free_codepoint(Vcurrent_jit_charset)))) |
1125 { | 1128 { |
1126 Ibyte setname[32]; | 1129 Ibyte setname[32]; |
1127 Lisp_Object charset_descr = build_string | 1130 int number_of_jit_charsets = XINT (Vnumber_of_jit_charsets); |
1128 ("Mule charset for otherwise unknown Unicode code points."); | 1131 Ascbyte last_jit_charset_final = XCHAR (Vlast_jit_charset_final); |
1129 | 1132 |
1130 struct gcpro gcpro1; | 1133 /* This final byte shit is, umm, not that cool. */ |
1131 | 1134 assert (last_jit_charset_final >= 0x30); |
1132 if ('\0' == last_jit_charset_final) | |
1133 { | |
1134 /* This final byte shit is, umm, not that cool. */ | |
1135 last_jit_charset_final = 0x30; | |
1136 } | |
1137 | 1135 |
1138 /* Assertion added partly because our Win32 layer doesn't | 1136 /* Assertion added partly because our Win32 layer doesn't |
1139 support snprintf; with this, we're sure it won't overflow | 1137 support snprintf; with this, we're sure it won't overflow |
1140 the buffer. */ | 1138 the buffer. */ |
1141 assert(100 > number_of_jit_charsets); | 1139 assert(100 > number_of_jit_charsets); |
1142 | 1140 |
1143 qxesprintf(setname, "jit-ucs-charset-%d", number_of_jit_charsets++); | 1141 qxesprintf(setname, "jit-ucs-charset-%d", number_of_jit_charsets); |
1144 | 1142 |
1145 /* Aside: GCPROing here would be overkill according to the FSF's | |
1146 philosophy. make-charset cannot currently GC, but is intended | |
1147 to be called from Lisp, with its arguments protected by the | |
1148 Lisp reader. We GCPRO in case it GCs in the future and no-one | |
1149 checks all the C callers. */ | |
1150 | |
1151 GCPRO1 (charset_descr); | |
1152 Vcurrent_jit_charset = Fmake_charset | 1143 Vcurrent_jit_charset = Fmake_charset |
1153 (intern((const CIbyte *)setname), charset_descr, | 1144 (intern((const CIbyte *)setname), Vcharset_descr, |
1154 /* Set encode-as-utf-8 to t, to have this character set written | 1145 /* Set encode-as-utf-8 to t, to have this character set written |
1155 using UTF-8 escapes in escape-quoted and ctext. This | 1146 using UTF-8 escapes in escape-quoted and ctext. This |
1156 sidesteps the fact that our internal character -> Unicode | 1147 sidesteps the fact that our internal character -> Unicode |
1157 mapping is not stable from one invocation to the next. */ | 1148 mapping is not stable from one invocation to the next. */ |
1158 nconc2 (list2(Qencode_as_utf_8, Qt), | 1149 nconc2 (list2(Qencode_as_utf_8, Qt), |
1159 nconc2 (list6(Qcolumns, make_int(1), Qchars, make_int(96), | 1150 nconc2 (list6(Qcolumns, make_int(1), Qchars, make_int(96), |
1160 Qdimension, make_int(2)), | 1151 Qdimension, make_int(2)), |
1161 list6(Qregistries, Qunicode_registries, | 1152 list6(Qregistries, Qunicode_registries, |
1162 Qfinal, make_char(last_jit_charset_final++), | 1153 Qfinal, make_char(last_jit_charset_final), |
1163 /* This CCL program is initialised in | 1154 /* This CCL program is initialised in |
1164 unicode.el. */ | 1155 unicode.el. */ |
1165 Qccl_program, Qccl_encode_to_ucs_2)))); | 1156 Qccl_program, Qccl_encode_to_ucs_2)))); |
1166 UNGCPRO; | 1157 |
1158 /* Record for the Unicode infrastructure that we've created | |
1159 this character set. */ | |
1160 Vnumber_of_jit_charsets = make_int (number_of_jit_charsets + 1); | |
1161 Vlast_jit_charset_final = make_char (last_jit_charset_final + 1); | |
1167 | 1162 |
1168 i = get_free_codepoint(Vcurrent_jit_charset); | 1163 i = get_free_codepoint(Vcurrent_jit_charset); |
1169 } | 1164 } |
1170 | 1165 |
1171 if (-1 != i) | 1166 if (-1 != i) |
1419 argument. | 1414 argument. |
1420 | 1415 |
1421 If the CODE would not otherwise be converted to an XEmacs character, and the | 1416 If the CODE would not otherwise be converted to an XEmacs character, and the |
1422 list of character sets to be consulted is nil or the default, a new XEmacs | 1417 list of character sets to be consulted is nil or the default, a new XEmacs |
1423 character will be created for it in one of the `jit-ucs-charset' Mule | 1418 character will be created for it in one of the `jit-ucs-charset' Mule |
1424 character sets, and that character will be returned. There is scope for | 1419 character sets, and that character will be returned. |
1425 tens of thousands of separate Unicode code points in every session using | 1420 |
1426 this technique, so despite XEmacs' internal encoding not being based on | 1421 This is limited to around 400,000 characters per XEmacs session, though, so |
1427 Unicode, your data won't be trashed. | 1422 while normal usage will not be problematic, things like: |
1423 | |
1424 \(dotimes (i #x110000) (decode-char 'ucs i)) | |
1425 | |
1426 will eventually error. The long-term solution to this is Unicode as an | |
1427 internal encoding. | |
1428 */ | 1428 */ |
1429 (code, USED_IF_MULE (charsets))) | 1429 (code, USED_IF_MULE (charsets))) |
1430 { | 1430 { |
1431 #ifdef MULE | 1431 #ifdef MULE |
1432 Lisp_Object_dynarr *dyn; | 1432 Lisp_Object_dynarr *dyn; |
2860 } | 2860 } |
2861 | 2861 |
2862 void | 2862 void |
2863 coding_system_type_create_unicode (void) | 2863 coding_system_type_create_unicode (void) |
2864 { | 2864 { |
2865 staticpro (&Vnumber_of_jit_charsets); | |
2866 Vnumber_of_jit_charsets = make_int (0); | |
2867 staticpro (&Vlast_jit_charset_final); | |
2868 Vlast_jit_charset_final = make_char (0x30); | |
2869 staticpro (&Vcharset_descr); | |
2870 Vcharset_descr | |
2871 = build_string ("Mule charset for otherwise unknown Unicode code points."); | |
2872 | |
2865 INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA (unicode, "unicode-coding-system-p"); | 2873 INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA (unicode, "unicode-coding-system-p"); |
2866 CODING_SYSTEM_HAS_METHOD (unicode, print); | 2874 CODING_SYSTEM_HAS_METHOD (unicode, print); |
2867 CODING_SYSTEM_HAS_METHOD (unicode, convert); | 2875 CODING_SYSTEM_HAS_METHOD (unicode, convert); |
2868 CODING_SYSTEM_HAS_METHOD (unicode, init_coding_stream); | 2876 CODING_SYSTEM_HAS_METHOD (unicode, init_coding_stream); |
2869 CODING_SYSTEM_HAS_METHOD (unicode, rewind_coding_stream); | 2877 CODING_SYSTEM_HAS_METHOD (unicode, rewind_coding_stream); |