Mercurial > hg > xemacs-beta
comparison src/unicode.c @ 877:e54d47b2d736
[xemacs-hg @ 2002-06-23 09:54:35 by stephent]
warning fixes <87bsa2qymn.fsf@tleepslib.sk.tsukuba.ac.jp>
unicode improvements <87znxmpc96.fsf@tleepslib.sk.tsukuba.ac.jp>
author | stephent |
---|---|
date | Sun, 23 Jun 2002 09:54:41 +0000 |
parents | 79c6ff3eef26 |
children | c9f067fd71a3 |
comparison
equal
deleted
inserted
replaced
876:890f3cafe600 | 877:e54d47b2d736 |
---|---|
25 Current primary author: Ben Wing <ben@xemacs.org> | 25 Current primary author: Ben Wing <ben@xemacs.org> |
26 | 26 |
27 Written by Ben Wing <ben@xemacs.org>, June, 2001. | 27 Written by Ben Wing <ben@xemacs.org>, June, 2001. |
28 Separated out into this file, August, 2001. | 28 Separated out into this file, August, 2001. |
29 Includes Unicode coding systems, some parts of which have been written | 29 Includes Unicode coding systems, some parts of which have been written |
30 by someone else. | 30 by someone else. #### Morioka and Hayashi, I think. |
31 | 31 |
32 As of September 2001, the detection code is here and abstraction of the | 32 As of September 2001, the detection code is here and abstraction of the |
33 detection system is finished. the unicode detectors have been rewritten | 33 detection system is finished. The unicode detectors have been rewritten |
34 to include multiple levels of likelihood. | 34 to include multiple levels of likelihood. |
35 */ | 35 */ |
36 | 36 |
37 #include <config.h> | 37 #include <config.h> |
38 #include "lisp.h" | 38 #include "lisp.h" |
45 | 45 |
46 /* #### WARNING! The current sledgehammer routines have a fundamental | 46 /* #### WARNING! The current sledgehammer routines have a fundamental |
47 problem in that they can't handle two characters mapping to a | 47 problem in that they can't handle two characters mapping to a |
48 single Unicode codepoint or vice-versa in a single charset table. | 48 single Unicode codepoint or vice-versa in a single charset table. |
49 It's not clear there is any way to handle this and still make the | 49 It's not clear there is any way to handle this and still make the |
50 sledgehammer routines useful. */ | 50 sledgehammer routines useful. |
51 | |
52 Inquiring Minds Want To Know Dept: does the above WARNING mean that | |
53 _if_ it happens, it will signal an error, or that it will do | |
54 something evil and unpredictable? Signaling an error is OK: for | |
55 all national standards, the national to Unicode map is an inclusion | |
56 (1-to-1). Any character set that does not behave that way is | |
57 broken according to the Unicode standard. */ | |
58 | |
51 /* #define SLEDGEHAMMER_CHECK_UNICODE */ | 59 /* #define SLEDGEHAMMER_CHECK_UNICODE */ |
52 | 60 |
53 /* We currently use the following format for tables: | 61 /* We currently use the following format for tables: |
54 | 62 |
55 If dimension == 1, to_unicode_table is a 96-element array of ints | 63 If dimension == 1, to_unicode_table is a 96-element array of ints |
151 Lisp_Object Qutf_16_little_endian, Qutf_16_bom; | 159 Lisp_Object Qutf_16_little_endian, Qutf_16_bom; |
152 Lisp_Object Qutf_16_little_endian_bom; | 160 Lisp_Object Qutf_16_little_endian_bom; |
153 | 161 |
154 #ifdef MULE | 162 #ifdef MULE |
155 | 163 |
164 /* #### Using ints for to_unicode is OK (as long as they are >= 32 bits). | |
165 However, shouldn't the shorts below be unsigned? */ | |
156 static int *to_unicode_blank_1; | 166 static int *to_unicode_blank_1; |
157 static int **to_unicode_blank_2; | 167 static int **to_unicode_blank_2; |
158 | 168 |
159 static short *from_unicode_blank_1; | 169 static short *from_unicode_blank_1; |
160 static short **from_unicode_blank_2; | 170 static short **from_unicode_blank_2; |
300 from_unicode_blank_2 = xnew_array (short *, 256); | 310 from_unicode_blank_2 = xnew_array (short *, 256); |
301 from_unicode_blank_3 = xnew_array (short **, 256); | 311 from_unicode_blank_3 = xnew_array (short **, 256); |
302 from_unicode_blank_4 = xnew_array (short ***, 256); | 312 from_unicode_blank_4 = xnew_array (short ***, 256); |
303 for (i = 0; i < 256; i++) | 313 for (i = 0; i < 256; i++) |
304 { | 314 { |
315 /* #### IMWTK: Why does using -1 here work? Simply because there are | |
316 no existing 96x96 charsets? */ | |
305 from_unicode_blank_1[i] = (short) -1; | 317 from_unicode_blank_1[i] = (short) -1; |
306 from_unicode_blank_2[i] = from_unicode_blank_1; | 318 from_unicode_blank_2[i] = from_unicode_blank_1; |
307 from_unicode_blank_3[i] = from_unicode_blank_2; | 319 from_unicode_blank_3[i] = from_unicode_blank_2; |
308 from_unicode_blank_4[i] = from_unicode_blank_3; | 320 from_unicode_blank_4[i] = from_unicode_blank_3; |
309 } | 321 } |
310 | 322 |
311 to_unicode_blank_1 = xnew_array (int, 96); | 323 to_unicode_blank_1 = xnew_array (int, 96); |
312 to_unicode_blank_2 = xnew_array (int *, 96); | 324 to_unicode_blank_2 = xnew_array (int *, 96); |
313 for (i = 0; i < 96; i++) | 325 for (i = 0; i < 96; i++) |
314 { | 326 { |
327 /* Here -1 is guaranteed OK. */ | |
315 to_unicode_blank_1[i] = -1; | 328 to_unicode_blank_1[i] = -1; |
316 to_unicode_blank_2[i] = to_unicode_blank_1; | 329 to_unicode_blank_2[i] = to_unicode_blank_1; |
317 } | 330 } |
318 } | 331 } |
319 | 332 |
352 abort (); | 365 abort (); |
353 return 0; | 366 return 0; |
354 } | 367 } |
355 } | 368 } |
356 | 369 |
370 /* Allocate and blank the tables. | |
371 Loading them up is done by parse-unicode-translation-table. */ | |
357 void | 372 void |
358 init_charset_unicode_tables (Lisp_Object charset) | 373 init_charset_unicode_tables (Lisp_Object charset) |
359 { | 374 { |
360 if (XCHARSET_DIMENSION (charset) == 1) | 375 if (XCHARSET_DIMENSION (charset) == 1) |
361 { | 376 { |
781 Lisp_Object charset; | 796 Lisp_Object charset; |
782 int c1, c2; | 797 int c1, c2; |
783 | 798 |
784 BREAKUP_ICHAR (chr, charset, c1, c2); | 799 BREAKUP_ICHAR (chr, charset, c1, c2); |
785 | 800 |
786 assert (!EQ (charset, Vcharset_ascii)); | 801 /* I tried an assert on code > 255 || chr == code, but that fails because |
787 assert (!EQ (charset, Vcharset_control_1)); | 802 Mule gives many Latin characters separate code points for different |
803 ISO 8859 coded character sets. Obvious in hindsight.... */ | |
804 assert (!EQ (charset, Vcharset_ascii) || chr == code); | |
805 assert (!EQ (charset, Vcharset_latin_iso8859_1) || chr == code); | |
806 assert (!EQ (charset, Vcharset_control_1) || chr == code); | |
807 | |
808 /* This assert is needed because it is simply unimplemented. */ | |
788 assert (!EQ (charset, Vcharset_composite)); | 809 assert (!EQ (charset, Vcharset_composite)); |
789 | 810 |
790 #ifdef SLEDGEHAMMER_CHECK_UNICODE | 811 #ifdef SLEDGEHAMMER_CHECK_UNICODE |
791 sledgehammer_check_unicode_tables (charset); | 812 sledgehammer_check_unicode_tables (charset); |
792 #endif | 813 #endif |
917 { | 938 { |
918 Lisp_Object charset; | 939 Lisp_Object charset; |
919 int c1, c2; | 940 int c1, c2; |
920 | 941 |
921 type_checking_assert (valid_ichar_p (chr)); | 942 type_checking_assert (valid_ichar_p (chr)); |
943 /* This shortcut depends on the representation of an Ichar, see text.c. */ | |
922 if (chr < 256) | 944 if (chr < 256) |
923 return (int) chr; | 945 return (int) chr; |
924 | 946 |
925 BREAKUP_ICHAR (chr, charset, c1, c2); | 947 BREAKUP_ICHAR (chr, charset, c1, c2); |
926 if (EQ (charset, Vcharset_composite)) | 948 if (EQ (charset, Vcharset_composite)) |
930 else | 952 else |
931 return ((int **) XCHARSET_TO_UNICODE_TABLE (charset))[c1 - 32][c2 - 32]; | 953 return ((int **) XCHARSET_TO_UNICODE_TABLE (charset))[c1 - 32][c2 - 32]; |
932 } | 954 } |
933 | 955 |
934 static Ichar | 956 static Ichar |
935 unicode_to_char (int code, Lisp_Object_dynarr *charsets) | 957 unicode_to_ichar (int code, Lisp_Object_dynarr *charsets) |
936 { | 958 { |
937 int u1, u2, u3, u4; | 959 int u1, u2, u3, u4; |
938 int code_levels; | 960 int code_levels; |
939 int i; | 961 int i; |
940 int n = Dynarr_length (charsets); | 962 int n = Dynarr_length (charsets); |
941 | 963 |
942 type_checking_assert (code >= 0); | 964 type_checking_assert (code >= 0); |
943 if (code < 256) | 965 /* This shortcut depends on the representation of an Ichar, see text.c. |
966 Note that it may _not_ be extended to U+00A0 to U+00FF (many ISO 8859 | |
967 coded character sets have points that map into that region). */ | |
968 if (code < 0xA0) | |
944 return (Ichar) code; | 969 return (Ichar) code; |
945 | 970 |
946 BREAKUP_UNICODE_CODE (code, u4, u3, u2, u1, code_levels); | 971 BREAKUP_UNICODE_CODE (code, u4, u3, u2, u1, code_levels); |
947 | 972 |
948 for (i = 0; i < n; i++) | 973 for (i = 0; i < n; i++) |
969 } | 994 } |
970 | 995 |
971 return (Ichar) -1; | 996 return (Ichar) -1; |
972 } | 997 } |
973 | 998 |
999 /* Add charsets to precedence list. | |
1000 LIST must be a list of charsets. Charsets which are in the list more | |
1001 than once are given the precedence implied by their earliest appearance. | |
1002 Later appearances are ignored. */ | |
974 static void | 1003 static void |
975 add_charsets_to_precedence_list (Lisp_Object list, int *lbs, | 1004 add_charsets_to_precedence_list (Lisp_Object list, int *lbs, |
976 Lisp_Object_dynarr *dynarr) | 1005 Lisp_Object_dynarr *dynarr) |
977 { | 1006 { |
978 { | 1007 { |
980 { | 1009 { |
981 Lisp_Object charset = Fget_charset (elt); | 1010 Lisp_Object charset = Fget_charset (elt); |
982 int lb = XCHARSET_LEADING_BYTE (charset); | 1011 int lb = XCHARSET_LEADING_BYTE (charset); |
983 if (lbs[lb - MIN_LEADING_BYTE] == 0) | 1012 if (lbs[lb - MIN_LEADING_BYTE] == 0) |
984 { | 1013 { |
985 Dynarr_add (unicode_precedence_dynarr, charset); | 1014 Dynarr_add (dynarr, charset); |
986 lbs[lb - MIN_LEADING_BYTE] = 1; | 1015 lbs[lb - MIN_LEADING_BYTE] = 1; |
987 } | 1016 } |
988 } | 1017 } |
989 } | 1018 } |
990 } | 1019 } |
991 | 1020 |
1021 /* Rebuild the charset precedence array. | |
1022 The "charsets preferred for the current language" get highest precedence, | |
1023 followed by the "charsets preferred by default", ordered as in | |
1024 Vlanguage_unicode_precedence_list and Vdefault_unicode_precedence_list, | |
1025 respectively. All remaining charsets follow in an arbitrary order. */ | |
992 void | 1026 void |
993 recalculate_unicode_precedence (void) | 1027 recalculate_unicode_precedence (void) |
994 { | 1028 { |
995 int lbs[NUM_LEADING_BYTES]; | 1029 int lbs[NUM_LEADING_BYTES]; |
996 int i; | 1030 int i; |
1014 Dynarr_add (unicode_precedence_dynarr, charset); | 1048 Dynarr_add (unicode_precedence_dynarr, charset); |
1015 } | 1049 } |
1016 } | 1050 } |
1017 } | 1051 } |
1018 | 1052 |
1019 DEFUN ("set-language-unicode-precedence-list", | 1053 DEFUN ("unicode-precedence-list", |
1020 Fset_language_unicode_precedence_list, | 1054 Funicode_precedence_list, |
1021 1, 1, 0, /* | 1055 0, 0, 0, /* |
1022 Set the language-specific precedence list used for Unicode decoding. | 1056 Return the precedence order among charsets used for Unicode decoding. |
1023 This is a list of charsets, which are consulted in order for a translation | 1057 |
1024 matching a given Unicode character. If no matches are found, the charsets | 1058 Value is a list of charsets, which are searched in order for a translation |
1025 in the default precedence list (see `set-default-unicode-precedence-list') | 1059 matching a given Unicode character. |
1026 are consulted, and then all remaining charsets, in some arbitrary order. | 1060 |
1061 The highest precedence is given to the language-specific precedence list of | |
1062 charsets, defined by `set-language-unicode-precedence-list'. These are | |
1063 followed by charsets in the default precedence list, defined by | |
1064 `set-default-unicode-precedence-list'. Charsets occurring multiple times are | |
1065 given precedence according to their first occurrence in either list. These | |
1066 are followed by the remaining charsets, in some arbitrary order. | |
1027 | 1067 |
1028 The language-specific precedence list is meant to be set as part of the | 1068 The language-specific precedence list is meant to be set as part of the |
1029 language environment initialization; the default precedence list is meant | 1069 language environment initialization; the default precedence list is meant |
1030 to be set by the user. | 1070 to be set by the user. |
1031 */ | 1071 */ |
1072 ()) | |
1073 { | |
1074 int i; | |
1075 Lisp_Object list = Qnil; | |
1076 | |
1077 for (i = Dynarr_length (unicode_precedence_dynarr) - 1; i >= 0; i--) | |
1078 list = Fcons (Dynarr_at (unicode_precedence_dynarr, i), list); | |
1079 return list; | |
1080 } | |
1081 | |
1082 | |
1083 /* #### This interface is wrong. Cyrillic users and Chinese users are going | |
1084 to have varying opinions about whether ISO Cyrillic, KOI8-R, or Windows | |
1085 1251 should take precedence, and whether Big Five or CNS should take | |
1086 precedence, respectively. This means that users are sometimes going to | |
1087 want to set Vlanguage_unicode_precedence_list. | |
1088 Furthermore, this should be language-local (buffer-local would be a | |
1089 reasonable approximation). */ | |
1090 DEFUN ("set-language-unicode-precedence-list", | |
1091 Fset_language_unicode_precedence_list, | |
1092 1, 1, 0, /* | |
1093 Set the language-specific precedence of charsets in Unicode decoding. | |
1094 LIST is a list of charsets. | |
1095 See `unicode-precedence-list' for more information. | |
1096 */ | |
1032 (list)) | 1097 (list)) |
1033 { | 1098 { |
1034 { | 1099 { |
1035 EXTERNAL_LIST_LOOP_2 (elt, list) | 1100 EXTERNAL_LIST_LOOP_2 (elt, list) |
1036 Fget_charset (elt); | 1101 Fget_charset (elt); |
1043 | 1108 |
1044 DEFUN ("language-unicode-precedence-list", | 1109 DEFUN ("language-unicode-precedence-list", |
1045 Flanguage_unicode_precedence_list, | 1110 Flanguage_unicode_precedence_list, |
1046 0, 0, 0, /* | 1111 0, 0, 0, /* |
1047 Return the language-specific precedence list used for Unicode decoding. | 1112 Return the language-specific precedence list used for Unicode decoding. |
1048 See `set-language-unicode-precedence-list' for more information. | 1113 See `unicode-precedence-list' for more information. |
1049 */ | 1114 */ |
1050 ()) | 1115 ()) |
1051 { | 1116 { |
1052 return Vlanguage_unicode_precedence_list; | 1117 return Vlanguage_unicode_precedence_list; |
1053 } | 1118 } |
1054 | 1119 |
1055 DEFUN ("set-default-unicode-precedence-list", | 1120 DEFUN ("set-default-unicode-precedence-list", |
1056 Fset_default_unicode_precedence_list, | 1121 Fset_default_unicode_precedence_list, |
1057 1, 1, 0, /* | 1122 1, 1, 0, /* |
1058 Set the default precedence list used for Unicode decoding. | 1123 Set the default precedence list used for Unicode decoding. |
1059 This is meant to be set by the user. See | 1124 This is intended to be set by the user. See |
1060 `set-language-unicode-precedence-list' for more information. | 1125 `unicode-precedence-list' for more information. |
1061 */ | 1126 */ |
1062 (list)) | 1127 (list)) |
1063 { | 1128 { |
1064 { | 1129 { |
1065 EXTERNAL_LIST_LOOP_2 (elt, list) | 1130 EXTERNAL_LIST_LOOP_2 (elt, list) |
1073 | 1138 |
1074 DEFUN ("default-unicode-precedence-list", | 1139 DEFUN ("default-unicode-precedence-list", |
1075 Fdefault_unicode_precedence_list, | 1140 Fdefault_unicode_precedence_list, |
1076 0, 0, 0, /* | 1141 0, 0, 0, /* |
1077 Return the default precedence list used for Unicode decoding. | 1142 Return the default precedence list used for Unicode decoding. |
1078 See `set-language-unicode-precedence-list' for more information. | 1143 See `unicode-precedence-list' for more information. |
1079 */ | 1144 */ |
1080 ()) | 1145 ()) |
1081 { | 1146 { |
1082 return Vdefault_unicode_precedence_list; | 1147 return Vdefault_unicode_precedence_list; |
1083 } | 1148 } |
1084 | 1149 |
1085 DEFUN ("set-unicode-conversion", Fset_unicode_conversion, | 1150 DEFUN ("set-unicode-conversion", Fset_unicode_conversion, |
1086 2, 2, 0, /* | 1151 2, 2, 0, /* |
1087 Add conversion information between Unicode codepoints and characters. | 1152 Add conversion information between Unicode codepoints and characters. |
1153 Conversions for U+0000 to U+00FF are hardwired to ASCII, Control-1, and | |
1154 Latin-1. Attempts to set these values will raise an error. | |
1155 | |
1088 CHARACTER is one of the following: | 1156 CHARACTER is one of the following: |
1089 | 1157 |
1090 -- A character (in which case CODE must be a non-negative integer; values | 1158 -- A character (in which case CODE must be a non-negative integer; values |
1091 above 2^20 - 1 are allowed for the purpose of specifying private | 1159 above 2^20 - 1 are allowed for the purpose of specifying private |
1092 characters, but will cause errors when converted to utf-16) | 1160 characters, but are illegal in standard Unicode---they will cause errors |
1161 when converted to utf-16) | |
1093 -- A vector of characters (in which case CODE must be a vector of integers | 1162 -- A vector of characters (in which case CODE must be a vector of integers |
1094 of the same length) | 1163 of the same length) |
1095 */ | 1164 */ |
1096 (character, code)) | 1165 (character, code)) |
1097 { | 1166 { |
1098 Lisp_Object charset; | 1167 Lisp_Object charset; |
1168 int ichar, unicode; | |
1099 | 1169 |
1100 CHECK_CHAR (character); | 1170 CHECK_CHAR (character); |
1101 CHECK_NATNUM (code); | 1171 CHECK_NATNUM (code); |
1102 | 1172 |
1103 charset = ichar_charset (XCHAR (character)); | 1173 unicode = XINT (code); |
1104 if (EQ (charset, Vcharset_ascii) || | 1174 ichar = XCHAR (character); |
1105 EQ (charset, Vcharset_control_1) || | 1175 charset = ichar_charset (ichar); |
1106 EQ (charset, Vcharset_composite)) | 1176 |
1107 signal_error (Qinvalid_argument, "Cannot set Unicode translation for ASCII, Control-1 or Composite chars", | 1177 /* The translations of ASCII, Control-1, and Latin-1 code points are |
1178 hard-coded in ichar_to_unicode and unicode_to_ichar. | |
1179 | |
1180 Checking unicode < 256 && ichar != unicode is wrong because Mule gives | |
1181 many Latin characters code points in a few different character sets. */ | |
1182 if ((EQ (charset, Vcharset_ascii) || | |
1183 EQ (charset, Vcharset_control_1) || | |
1184 EQ (charset, Vcharset_latin_iso8859_1)) | |
1185 && unicode != ichar) | |
1186 signal_error (Qinvalid_argument, "Can't change Unicode translation for ASCII, Control-1 or Latin-1 char", | |
1108 character); | 1187 character); |
1109 | 1188 |
1110 set_unicode_conversion (XCHAR (character), XINT (code)); | 1189 /* #### Composite characters are not properly implemented yet. */ |
1190 if (EQ (charset, Vcharset_composite)) | |
1191 signal_error (Qinvalid_argument, "Can't set Unicode translation for Composite char", | |
1192 character); | |
1193 | |
1194 set_unicode_conversion (ichar, unicode); | |
1111 return Qnil; | 1195 return Qnil; |
1112 } | 1196 } |
1113 | 1197 |
1114 #endif /* MULE */ | 1198 #endif /* MULE */ |
1115 | 1199 |
1116 DEFUN ("char-to-unicode", Fchar_to_unicode, 1, 1, 0, /* | 1200 DEFUN ("char-to-unicode", Fchar_to_unicode, 1, 1, 0, /* |
1117 Convert character to Unicode codepoint. | 1201 Convert character to Unicode codepoint. |
1118 When there is no international support (i.e. MULE is not defined), | 1202 When there is no international support (i.e. the 'mule feature is not |
1119 this function simply does `char-to-int'. | 1203 present), this function simply does `char-to-int'. |
1120 */ | 1204 */ |
1121 (character)) | 1205 (character)) |
1122 { | 1206 { |
1123 CHECK_CHAR (character); | 1207 CHECK_CHAR (character); |
1124 #ifdef MULE | 1208 #ifdef MULE |
1134 If CHARSETS is given, it should be a list of charsets, and only those | 1218 If CHARSETS is given, it should be a list of charsets, and only those |
1135 charsets will be consulted, in the given order, for a translation. | 1219 charsets will be consulted, in the given order, for a translation. |
1136 Otherwise, the default ordering of all charsets will be given (see | 1220 Otherwise, the default ordering of all charsets will be given (see |
1137 `set-unicode-charset-precedence'). | 1221 `set-unicode-charset-precedence'). |
1138 | 1222 |
1139 When there is no international support (i.e. MULE is not defined), | 1223 When there is no international support (i.e. the 'mule feature is not |
1140 this function simply does `int-to-char' and ignores the CHARSETS | 1224 present), this function simply does `int-to-char' and ignores the CHARSETS |
1141 argument.. | 1225 argument. |
1142 */ | 1226 */ |
1143 (code, charsets)) | 1227 (code, charsets)) |
1144 { | 1228 { |
1145 #ifdef MULE | 1229 #ifdef MULE |
1146 Lisp_Object_dynarr *dyn; | 1230 Lisp_Object_dynarr *dyn; |
1154 Fget_charset (elt); | 1238 Fget_charset (elt); |
1155 } | 1239 } |
1156 | 1240 |
1157 if (NILP (charsets)) | 1241 if (NILP (charsets)) |
1158 { | 1242 { |
1159 Ichar ret = unicode_to_char (c, unicode_precedence_dynarr); | 1243 Ichar ret = unicode_to_ichar (c, unicode_precedence_dynarr); |
1160 if (ret == -1) | 1244 if (ret == -1) |
1161 return Qnil; | 1245 return Qnil; |
1162 return make_char (ret); | 1246 return make_char (ret); |
1163 } | 1247 } |
1164 | 1248 |
1165 dyn = Dynarr_new (Lisp_Object); | 1249 dyn = Dynarr_new (Lisp_Object); |
1166 memset (lbs, 0, NUM_LEADING_BYTES * sizeof (int)); | 1250 memset (lbs, 0, NUM_LEADING_BYTES * sizeof (int)); |
1167 add_charsets_to_precedence_list (charsets, lbs, dyn); | 1251 add_charsets_to_precedence_list (charsets, lbs, dyn); |
1168 { | 1252 { |
1169 Ichar ret = unicode_to_char (c, unicode_precedence_dynarr); | 1253 Ichar ret = unicode_to_ichar (c, dyn); |
1170 Dynarr_free (dyn); | 1254 Dynarr_free (dyn); |
1171 if (ret == -1) | 1255 if (ret == -1) |
1172 return Qnil; | 1256 return Qnil; |
1173 return make_char (ret); | 1257 return make_char (ret); |
1174 } | 1258 } |
1186 FILE *file = (FILE *) get_opaque_ptr (fulano); | 1270 FILE *file = (FILE *) get_opaque_ptr (fulano); |
1187 retry_fclose (file); | 1271 retry_fclose (file); |
1188 return Qnil; | 1272 return Qnil; |
1189 } | 1273 } |
1190 | 1274 |
1275 /* #### shouldn't this interface be called load-unicode-mapping-table | |
1276 for consistency with Unicode Consortium terminology? */ | |
1191 DEFUN ("parse-unicode-translation-table", Fparse_unicode_translation_table, | 1277 DEFUN ("parse-unicode-translation-table", Fparse_unicode_translation_table, |
1192 2, 6, 0, /* | 1278 2, 6, 0, /* |
1193 Parse Unicode translation data in FILENAME for CHARSET. | 1279 Load Unicode tables with the Unicode mapping data in FILENAME for CHARSET. |
1194 Data is text, in the form of one translation per line -- charset | 1280 Data is text, in the form of one translation per line -- charset |
1195 codepoint followed by Unicode codepoint. Numbers are decimal or hex | 1281 codepoint followed by Unicode codepoint. Numbers are decimal or hex |
1196 \(preceded by 0x). Comments are marked with a #. Charset codepoints | 1282 \(preceded by 0x). Comments are marked with a #. Charset codepoints |
1197 for two-dimensional charsets should have the first octet stored in the | 1283 for two-dimensional charsets have the first octet stored in the |
1198 high 8 bits of the hex number and the second in the low 8 bits. | 1284 high 8 bits of the hex number and the second in the low 8 bits. |
1199 | 1285 |
1200 If START and END are given, only charset codepoints within the given | 1286 If START and END are given, only charset codepoints within the given |
1201 range will be processed. If OFFSET is given, that value will be added | 1287 range will be processed. (START and END apply to the codepoints in the |
1202 to all charset codepoints in the file to obtain the internal charset | 1288 file, before OFFSET is applied.) |
1203 codepoint. START and END apply to the codepoints in the file, before | 1289 |
1204 OFFSET is applied. | 1290 If OFFSET is given, that value will be added to all charset codepoints |
1205 | 1291 in the file to obtain the internal charset codepoint. \(We assume |
1206 \(Note that, as usual, we assume that octets are in the range 32 to | 1292 that octets in the table are in the range 33 to 126 or 32 to 127. If |
1207 127 or 33 to 126. If you have a table in kuten form, with octets in | 1293 you have a table in ku-ten form, with octets in the range 1 to 94, you |
1208 the range 1 to 94, you will have to use an offset of 8224, | 1294 will have to use an offset of 8224, i.e. 0x2020.) |
1209 i.e. 0x2020.) | |
1210 | 1295 |
1211 FLAGS, if specified, control further how the tables are interpreted | 1296 FLAGS, if specified, control further how the tables are interpreted |
1212 and are used to special-case certain known table weirdnesses in the | 1297 and are used to special-case certain known format deviations in the |
1213 Unicode tables: | 1298 Unicode tables or in the charset: |
1214 | 1299 |
1215 `ignore-first-column' | 1300 `ignore-first-column' |
1216 Exactly as it sounds. The JIS X 0208 tables have 3 columns of data instead | 1301 The JIS X 0208 tables have 3 columns of data instead of 2. The first |
1217 of 2; the first is the Shift-JIS codepoint. | 1302 column contains the Shift-JIS codepoint, which we ignore. |
1218 `big5' | 1303 `big5' |
1219 The charset codepoint is a Big Five codepoint; convert it to the | 1304 The charset codepoints are Big Five codepoints; convert them to the |
1220 proper hacked-up codepoint in `chinese-big5-1' or `chinese-big5-2'. | 1305 hacked-up Mule codepoints in `chinese-big5-1' or `chinese-big5-2'. |
1221 */ | 1306 */ |
1222 (filename, charset, start, end, offset, flags)) | 1307 (filename, charset, start, end, offset, flags)) |
1223 { | 1308 { |
1224 int st = 0, en = INT_MAX, of = 0; | 1309 int st = 0, en = INT_MAX, of = 0; |
1225 FILE *file; | 1310 FILE *file; |
1257 ignore_first_column = 1; | 1342 ignore_first_column = 1; |
1258 else if (EQ (elt, Qbig5)) | 1343 else if (EQ (elt, Qbig5)) |
1259 big5 = 1; | 1344 big5 = 1; |
1260 else | 1345 else |
1261 invalid_constant | 1346 invalid_constant |
1262 ("Unrecognized `parse-unicode-table' flag", elt); | 1347 ("Unrecognized `parse-unicode-translation-table' flag", elt); |
1263 } | 1348 } |
1264 } | 1349 } |
1265 | 1350 |
1266 GCPRO1 (filename); | 1351 GCPRO1 (filename); |
1267 filename = Fexpand_file_name (filename, Qnil); | 1352 filename = Fexpand_file_name (filename, Qnil); |
1432 if (ch == 0xFEFF && !data->seen_char && ignore_bom) | 1517 if (ch == 0xFEFF && !data->seen_char && ignore_bom) |
1433 ; | 1518 ; |
1434 else | 1519 else |
1435 { | 1520 { |
1436 #ifdef MULE | 1521 #ifdef MULE |
1437 Ichar chr = unicode_to_char (ch, unicode_precedence_dynarr); | 1522 Ichar chr = unicode_to_ichar (ch, unicode_precedence_dynarr); |
1438 | 1523 |
1439 if (chr != -1) | 1524 if (chr != -1) |
1440 { | 1525 { |
1441 Ibyte work[MAX_ICHAR_LEN]; | 1526 Ibyte work[MAX_ICHAR_LEN]; |
1442 int len; | 1527 int len; |
2141 | 2226 |
2142 void | 2227 void |
2143 syms_of_unicode (void) | 2228 syms_of_unicode (void) |
2144 { | 2229 { |
2145 #ifdef MULE | 2230 #ifdef MULE |
2231 DEFSUBR (Funicode_precedence_list); | |
2146 DEFSUBR (Fset_language_unicode_precedence_list); | 2232 DEFSUBR (Fset_language_unicode_precedence_list); |
2147 DEFSUBR (Flanguage_unicode_precedence_list); | 2233 DEFSUBR (Flanguage_unicode_precedence_list); |
2148 DEFSUBR (Fset_default_unicode_precedence_list); | 2234 DEFSUBR (Fset_default_unicode_precedence_list); |
2149 DEFSUBR (Fdefault_unicode_precedence_list); | 2235 DEFSUBR (Fdefault_unicode_precedence_list); |
2150 DEFSUBR (Fset_unicode_conversion); | 2236 DEFSUBR (Fset_unicode_conversion); |