comparison src/unicode.c @ 4268:75d0292c1bff

[xemacs-hg @ 2007-11-14 19:41:04 by aidan] Correct the dumped information for the Unicode JIT infrastructure.
author aidan
date Wed, 14 Nov 2007 19:41:09 +0000
parents 1abf84db2c7f
children bd9b678f4db7
comparison
equal deleted inserted replaced
4267:66e2714696bd 4268:75d0292c1bff
333 Lisp_Object Qignore_first_column; 333 Lisp_Object Qignore_first_column;
334 334
335 Lisp_Object Vcurrent_jit_charset; 335 Lisp_Object Vcurrent_jit_charset;
336 Lisp_Object Qlast_allocated_character; 336 Lisp_Object Qlast_allocated_character;
337 Lisp_Object Qccl_encode_to_ucs_2; 337 Lisp_Object Qccl_encode_to_ucs_2;
338
339 Lisp_Object Vnumber_of_jit_charsets;
340 Lisp_Object Vlast_jit_charset_final;
341 Lisp_Object Vcharset_descr;
342
338 343
339 344
340 /************************************************************************/ 345 /************************************************************************/
341 /* Unicode implementation */ 346 /* Unicode implementation */
342 /************************************************************************/ 347 /************************************************************************/
1078 { 1083 {
1079 int u1, u2, u3, u4; 1084 int u1, u2, u3, u4;
1080 int code_levels; 1085 int code_levels;
1081 int i; 1086 int i;
1082 int n = Dynarr_length (charsets); 1087 int n = Dynarr_length (charsets);
1083 static int number_of_jit_charsets;
1084 static Ascbyte last_jit_charset_final;
1085 1088
1086 type_checking_assert (code >= 0); 1089 type_checking_assert (code >= 0);
1087 /* This shortcut depends on the representation of an Ichar, see text.c. 1090 /* This shortcut depends on the representation of an Ichar, see text.c.
1088 Note that it may _not_ be extended to U+00A0 to U+00FF (many ISO 8859 1091 Note that it may _not_ be extended to U+00A0 to U+00FF (many ISO 8859
1089 coded character sets have points that map into that region, so this 1092 coded character sets have points that map into that region, so this
1122 { 1125 {
1123 if (NILP (Vcurrent_jit_charset) || 1126 if (NILP (Vcurrent_jit_charset) ||
1124 (-1 == (i = get_free_codepoint(Vcurrent_jit_charset)))) 1127 (-1 == (i = get_free_codepoint(Vcurrent_jit_charset))))
1125 { 1128 {
1126 Ibyte setname[32]; 1129 Ibyte setname[32];
1127 Lisp_Object charset_descr = build_string 1130 int number_of_jit_charsets = XINT (Vnumber_of_jit_charsets);
1128 ("Mule charset for otherwise unknown Unicode code points."); 1131 Ascbyte last_jit_charset_final = XCHAR (Vlast_jit_charset_final);
1129 1132
1130 struct gcpro gcpro1; 1133 /* This final byte shit is, umm, not that cool. */
1131 1134 assert (last_jit_charset_final >= 0x30);
1132 if ('\0' == last_jit_charset_final)
1133 {
1134 /* This final byte shit is, umm, not that cool. */
1135 last_jit_charset_final = 0x30;
1136 }
1137 1135
1138 /* Assertion added partly because our Win32 layer doesn't 1136 /* Assertion added partly because our Win32 layer doesn't
1139 support snprintf; with this, we're sure it won't overflow 1137 support snprintf; with this, we're sure it won't overflow
1140 the buffer. */ 1138 the buffer. */
1141 assert(100 > number_of_jit_charsets); 1139 assert(100 > number_of_jit_charsets);
1142 1140
1143 qxesprintf(setname, "jit-ucs-charset-%d", number_of_jit_charsets++); 1141 qxesprintf(setname, "jit-ucs-charset-%d", number_of_jit_charsets);
1144 1142
1145 /* Aside: GCPROing here would be overkill according to the FSF's
1146 philosophy. make-charset cannot currently GC, but is intended
1147 to be called from Lisp, with its arguments protected by the
1148 Lisp reader. We GCPRO in case it GCs in the future and no-one
1149 checks all the C callers. */
1150
1151 GCPRO1 (charset_descr);
1152 Vcurrent_jit_charset = Fmake_charset 1143 Vcurrent_jit_charset = Fmake_charset
1153 (intern((const CIbyte *)setname), charset_descr, 1144 (intern((const CIbyte *)setname), Vcharset_descr,
1154 /* Set encode-as-utf-8 to t, to have this character set written 1145 /* Set encode-as-utf-8 to t, to have this character set written
1155 using UTF-8 escapes in escape-quoted and ctext. This 1146 using UTF-8 escapes in escape-quoted and ctext. This
1156 sidesteps the fact that our internal character -> Unicode 1147 sidesteps the fact that our internal character -> Unicode
1157 mapping is not stable from one invocation to the next. */ 1148 mapping is not stable from one invocation to the next. */
1158 nconc2 (list2(Qencode_as_utf_8, Qt), 1149 nconc2 (list2(Qencode_as_utf_8, Qt),
1159 nconc2 (list6(Qcolumns, make_int(1), Qchars, make_int(96), 1150 nconc2 (list6(Qcolumns, make_int(1), Qchars, make_int(96),
1160 Qdimension, make_int(2)), 1151 Qdimension, make_int(2)),
1161 list6(Qregistries, Qunicode_registries, 1152 list6(Qregistries, Qunicode_registries,
1162 Qfinal, make_char(last_jit_charset_final++), 1153 Qfinal, make_char(last_jit_charset_final),
1163 /* This CCL program is initialised in 1154 /* This CCL program is initialised in
1164 unicode.el. */ 1155 unicode.el. */
1165 Qccl_program, Qccl_encode_to_ucs_2)))); 1156 Qccl_program, Qccl_encode_to_ucs_2))));
1166 UNGCPRO; 1157
1158 /* Record for the Unicode infrastructure that we've created
1159 this character set. */
1160 Vnumber_of_jit_charsets = make_int (number_of_jit_charsets + 1);
1161 Vlast_jit_charset_final = make_char (last_jit_charset_final + 1);
1167 1162
1168 i = get_free_codepoint(Vcurrent_jit_charset); 1163 i = get_free_codepoint(Vcurrent_jit_charset);
1169 } 1164 }
1170 1165
1171 if (-1 != i) 1166 if (-1 != i)
1419 argument. 1414 argument.
1420 1415
1421 If the CODE would not otherwise be converted to an XEmacs character, and the 1416 If the CODE would not otherwise be converted to an XEmacs character, and the
1422 list of character sets to be consulted is nil or the default, a new XEmacs 1417 list of character sets to be consulted is nil or the default, a new XEmacs
1423 character will be created for it in one of the `jit-ucs-charset' Mule 1418 character will be created for it in one of the `jit-ucs-charset' Mule
1424 character sets, and that character will be returned. There is scope for 1419 character sets, and that character will be returned.
1425 tens of thousands of separate Unicode code points in every session using 1420
1426 this technique, so despite XEmacs' internal encoding not being based on 1421 This is limited to around 400,000 characters per XEmacs session, though, so
1427 Unicode, your data won't be trashed. 1422 while normal usage will not be problematic, things like:
1423
1424 \(dotimes (i #x110000) (decode-char 'ucs i))
1425
1426 will eventually error. The long-term solution to this is Unicode as an
1427 internal encoding.
1428 */ 1428 */
1429 (code, USED_IF_MULE (charsets))) 1429 (code, USED_IF_MULE (charsets)))
1430 { 1430 {
1431 #ifdef MULE 1431 #ifdef MULE
1432 Lisp_Object_dynarr *dyn; 1432 Lisp_Object_dynarr *dyn;
2860 } 2860 }
2861 2861
2862 void 2862 void
2863 coding_system_type_create_unicode (void) 2863 coding_system_type_create_unicode (void)
2864 { 2864 {
2865 staticpro (&Vnumber_of_jit_charsets);
2866 Vnumber_of_jit_charsets = make_int (0);
2867 staticpro (&Vlast_jit_charset_final);
2868 Vlast_jit_charset_final = make_char (0x30);
2869 staticpro (&Vcharset_descr);
2870 Vcharset_descr
2871 = build_string ("Mule charset for otherwise unknown Unicode code points.");
2872
2865 INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA (unicode, "unicode-coding-system-p"); 2873 INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA (unicode, "unicode-coding-system-p");
2866 CODING_SYSTEM_HAS_METHOD (unicode, print); 2874 CODING_SYSTEM_HAS_METHOD (unicode, print);
2867 CODING_SYSTEM_HAS_METHOD (unicode, convert); 2875 CODING_SYSTEM_HAS_METHOD (unicode, convert);
2868 CODING_SYSTEM_HAS_METHOD (unicode, init_coding_stream); 2876 CODING_SYSTEM_HAS_METHOD (unicode, init_coding_stream);
2869 CODING_SYSTEM_HAS_METHOD (unicode, rewind_coding_stream); 2877 CODING_SYSTEM_HAS_METHOD (unicode, rewind_coding_stream);