comparison src/unicode.c @ 877:e54d47b2d736

[xemacs-hg @ 2002-06-23 09:54:35 by stephent] warning fixes <87bsa2qymn.fsf@tleepslib.sk.tsukuba.ac.jp> unicode improvements <87znxmpc96.fsf@tleepslib.sk.tsukuba.ac.jp>
author stephent
date Sun, 23 Jun 2002 09:54:41 +0000
parents 79c6ff3eef26
children c9f067fd71a3
comparison
equal deleted inserted replaced
876:890f3cafe600 877:e54d47b2d736
25 Current primary author: Ben Wing <ben@xemacs.org> 25 Current primary author: Ben Wing <ben@xemacs.org>
26 26
27 Written by Ben Wing <ben@xemacs.org>, June, 2001. 27 Written by Ben Wing <ben@xemacs.org>, June, 2001.
28 Separated out into this file, August, 2001. 28 Separated out into this file, August, 2001.
29 Includes Unicode coding systems, some parts of which have been written 29 Includes Unicode coding systems, some parts of which have been written
30 by someone else. 30 by someone else. #### Morioka and Hayashi, I think.
31 31
32 As of September 2001, the detection code is here and abstraction of the 32 As of September 2001, the detection code is here and abstraction of the
33 detection system is finished. the unicode detectors have been rewritten 33 detection system is finished. The unicode detectors have been rewritten
34 to include multiple levels of likelihood. 34 to include multiple levels of likelihood.
35 */ 35 */
36 36
37 #include <config.h> 37 #include <config.h>
38 #include "lisp.h" 38 #include "lisp.h"
45 45
46 /* #### WARNING! The current sledgehammer routines have a fundamental 46 /* #### WARNING! The current sledgehammer routines have a fundamental
47 problem in that they can't handle two characters mapping to a 47 problem in that they can't handle two characters mapping to a
48 single Unicode codepoint or vice-versa in a single charset table. 48 single Unicode codepoint or vice-versa in a single charset table.
49 It's not clear there is any way to handle this and still make the 49 It's not clear there is any way to handle this and still make the
50 sledgehammer routines useful. */ 50 sledgehammer routines useful.
51
52 Inquiring Minds Want To Know Dept: does the above WARNING mean that
53 _if_ it happens, then it will signal error, or then it will do
54 something evil and unpredictable? Signaling an error is OK: for
55 all national standards, the national to Unicode map is an inclusion
56 (1-to-1). Any character set that does not behave that way is
57 broken according to the Unicode standard. */
58
51 /* #define SLEDGEHAMMER_CHECK_UNICODE */ 59 /* #define SLEDGEHAMMER_CHECK_UNICODE */
52 60
53 /* We currently use the following format for tables: 61 /* We currently use the following format for tables:
54 62
55 If dimension == 1, to_unicode_table is a 96-element array of ints 63 If dimension == 1, to_unicode_table is a 96-element array of ints
151 Lisp_Object Qutf_16_little_endian, Qutf_16_bom; 159 Lisp_Object Qutf_16_little_endian, Qutf_16_bom;
152 Lisp_Object Qutf_16_little_endian_bom; 160 Lisp_Object Qutf_16_little_endian_bom;
153 161
154 #ifdef MULE 162 #ifdef MULE
155 163
164 /* #### Using ints for to_unicode is OK (as long as they are >= 32 bits).
165 However, shouldn't the shorts below be unsigned? */
156 static int *to_unicode_blank_1; 166 static int *to_unicode_blank_1;
157 static int **to_unicode_blank_2; 167 static int **to_unicode_blank_2;
158 168
159 static short *from_unicode_blank_1; 169 static short *from_unicode_blank_1;
160 static short **from_unicode_blank_2; 170 static short **from_unicode_blank_2;
300 from_unicode_blank_2 = xnew_array (short *, 256); 310 from_unicode_blank_2 = xnew_array (short *, 256);
301 from_unicode_blank_3 = xnew_array (short **, 256); 311 from_unicode_blank_3 = xnew_array (short **, 256);
302 from_unicode_blank_4 = xnew_array (short ***, 256); 312 from_unicode_blank_4 = xnew_array (short ***, 256);
303 for (i = 0; i < 256; i++) 313 for (i = 0; i < 256; i++)
304 { 314 {
315 /* #### IMWTK: Why does using -1 here work? Simply because there are
316 no existing 96x96 charsets? */
305 from_unicode_blank_1[i] = (short) -1; 317 from_unicode_blank_1[i] = (short) -1;
306 from_unicode_blank_2[i] = from_unicode_blank_1; 318 from_unicode_blank_2[i] = from_unicode_blank_1;
307 from_unicode_blank_3[i] = from_unicode_blank_2; 319 from_unicode_blank_3[i] = from_unicode_blank_2;
308 from_unicode_blank_4[i] = from_unicode_blank_3; 320 from_unicode_blank_4[i] = from_unicode_blank_3;
309 } 321 }
310 322
311 to_unicode_blank_1 = xnew_array (int, 96); 323 to_unicode_blank_1 = xnew_array (int, 96);
312 to_unicode_blank_2 = xnew_array (int *, 96); 324 to_unicode_blank_2 = xnew_array (int *, 96);
313 for (i = 0; i < 96; i++) 325 for (i = 0; i < 96; i++)
314 { 326 {
327 /* Here -1 is guaranteed OK. */
315 to_unicode_blank_1[i] = -1; 328 to_unicode_blank_1[i] = -1;
316 to_unicode_blank_2[i] = to_unicode_blank_1; 329 to_unicode_blank_2[i] = to_unicode_blank_1;
317 } 330 }
318 } 331 }
319 332
352 abort (); 365 abort ();
353 return 0; 366 return 0;
354 } 367 }
355 } 368 }
356 369
370 /* Allocate and blank the tables.
371 Loading them up is done by parse-unicode-translation-table. */
357 void 372 void
358 init_charset_unicode_tables (Lisp_Object charset) 373 init_charset_unicode_tables (Lisp_Object charset)
359 { 374 {
360 if (XCHARSET_DIMENSION (charset) == 1) 375 if (XCHARSET_DIMENSION (charset) == 1)
361 { 376 {
781 Lisp_Object charset; 796 Lisp_Object charset;
782 int c1, c2; 797 int c1, c2;
783 798
784 BREAKUP_ICHAR (chr, charset, c1, c2); 799 BREAKUP_ICHAR (chr, charset, c1, c2);
785 800
786 assert (!EQ (charset, Vcharset_ascii)); 801 /* I tried an assert on code > 255 || chr == code, but that fails because
787 assert (!EQ (charset, Vcharset_control_1)); 802 Mule gives many Latin characters separate code points for different
803 ISO 8859 coded character sets. Obvious in hindsight.... */
804 assert (!EQ (charset, Vcharset_ascii) || chr == code);
805 assert (!EQ (charset, Vcharset_latin_iso8859_1) || chr == code);
806 assert (!EQ (charset, Vcharset_control_1) || chr == code);
807
808 /* This assert is needed because it is simply unimplemented. */
788 assert (!EQ (charset, Vcharset_composite)); 809 assert (!EQ (charset, Vcharset_composite));
789 810
790 #ifdef SLEDGEHAMMER_CHECK_UNICODE 811 #ifdef SLEDGEHAMMER_CHECK_UNICODE
791 sledgehammer_check_unicode_tables (charset); 812 sledgehammer_check_unicode_tables (charset);
792 #endif 813 #endif
917 { 938 {
918 Lisp_Object charset; 939 Lisp_Object charset;
919 int c1, c2; 940 int c1, c2;
920 941
921 type_checking_assert (valid_ichar_p (chr)); 942 type_checking_assert (valid_ichar_p (chr));
943 /* This shortcut depends on the representation of an Ichar, see text.c. */
922 if (chr < 256) 944 if (chr < 256)
923 return (int) chr; 945 return (int) chr;
924 946
925 BREAKUP_ICHAR (chr, charset, c1, c2); 947 BREAKUP_ICHAR (chr, charset, c1, c2);
926 if (EQ (charset, Vcharset_composite)) 948 if (EQ (charset, Vcharset_composite))
930 else 952 else
931 return ((int **) XCHARSET_TO_UNICODE_TABLE (charset))[c1 - 32][c2 - 32]; 953 return ((int **) XCHARSET_TO_UNICODE_TABLE (charset))[c1 - 32][c2 - 32];
932 } 954 }
933 955
934 static Ichar 956 static Ichar
935 unicode_to_char (int code, Lisp_Object_dynarr *charsets) 957 unicode_to_ichar (int code, Lisp_Object_dynarr *charsets)
936 { 958 {
937 int u1, u2, u3, u4; 959 int u1, u2, u3, u4;
938 int code_levels; 960 int code_levels;
939 int i; 961 int i;
940 int n = Dynarr_length (charsets); 962 int n = Dynarr_length (charsets);
941 963
942 type_checking_assert (code >= 0); 964 type_checking_assert (code >= 0);
943 if (code < 256) 965 /* This shortcut depends on the representation of an Ichar, see text.c.
966 Note that it may _not_ be extended to U+00A0 to U+00FF (many ISO 8859
967 coded character sets have points that map into that region). */
968 if (code < 0xA0)
944 return (Ichar) code; 969 return (Ichar) code;
945 970
946 BREAKUP_UNICODE_CODE (code, u4, u3, u2, u1, code_levels); 971 BREAKUP_UNICODE_CODE (code, u4, u3, u2, u1, code_levels);
947 972
948 for (i = 0; i < n; i++) 973 for (i = 0; i < n; i++)
969 } 994 }
970 995
971 return (Ichar) -1; 996 return (Ichar) -1;
972 } 997 }
973 998
999 /* Add charsets to precedence list.
1000 LIST must be a list of charsets. Charsets which are in the list more
1001 than once are given the precedence implied by their earliest appearance.
1002 Later appearances are ignored. */
974 static void 1003 static void
975 add_charsets_to_precedence_list (Lisp_Object list, int *lbs, 1004 add_charsets_to_precedence_list (Lisp_Object list, int *lbs,
976 Lisp_Object_dynarr *dynarr) 1005 Lisp_Object_dynarr *dynarr)
977 { 1006 {
978 { 1007 {
980 { 1009 {
981 Lisp_Object charset = Fget_charset (elt); 1010 Lisp_Object charset = Fget_charset (elt);
982 int lb = XCHARSET_LEADING_BYTE (charset); 1011 int lb = XCHARSET_LEADING_BYTE (charset);
983 if (lbs[lb - MIN_LEADING_BYTE] == 0) 1012 if (lbs[lb - MIN_LEADING_BYTE] == 0)
984 { 1013 {
985 Dynarr_add (unicode_precedence_dynarr, charset); 1014 Dynarr_add (dynarr, charset);
986 lbs[lb - MIN_LEADING_BYTE] = 1; 1015 lbs[lb - MIN_LEADING_BYTE] = 1;
987 } 1016 }
988 } 1017 }
989 } 1018 }
990 } 1019 }
991 1020
1021 /* Rebuild the charset precedence array.
1022 The "charsets preferred for the current language" get highest precedence,
1023 followed by the "charsets preferred by default", ordered as in
1024 Vlanguage_unicode_precedence_list and Vdefault_unicode_precedence_list,
1025 respectively. All remaining charsets follow in an arbitrary order. */
992 void 1026 void
993 recalculate_unicode_precedence (void) 1027 recalculate_unicode_precedence (void)
994 { 1028 {
995 int lbs[NUM_LEADING_BYTES]; 1029 int lbs[NUM_LEADING_BYTES];
996 int i; 1030 int i;
1014 Dynarr_add (unicode_precedence_dynarr, charset); 1048 Dynarr_add (unicode_precedence_dynarr, charset);
1015 } 1049 }
1016 } 1050 }
1017 } 1051 }
1018 1052
1019 DEFUN ("set-language-unicode-precedence-list", 1053 DEFUN ("unicode-precedence-list",
1020 Fset_language_unicode_precedence_list, 1054 Funicode_precedence_list,
1021 1, 1, 0, /* 1055 0, 0, 0, /*
1022 Set the language-specific precedence list used for Unicode decoding. 1056 Return the precedence order among charsets used for Unicode decoding.
1023 This is a list of charsets, which are consulted in order for a translation 1057
1024 matching a given Unicode character. If no matches are found, the charsets 1058 Value is a list of charsets, which are searched in order for a translation
1025 in the default precedence list (see `set-default-unicode-precedence-list') 1059 matching a given Unicode character.
1026 are consulted, and then all remaining charsets, in some arbitrary order. 1060
1061 The highest precedence is given to the language-specific precedence list of
1062 charsets, defined by `set-language-unicode-precedence-list'. These are
1063 followed by charsets in the default precedence list, defined by
1064 `set-default-unicode-precedence-list'. Charsets occurring multiple times are
1065 given precedence according to their first occurrence in either list. These
1066 are followed by the remaining charsets, in some arbitrary order.
1027 1067
1028 The language-specific precedence list is meant to be set as part of the 1068 The language-specific precedence list is meant to be set as part of the
1029 language environment initialization; the default precedence list is meant 1069 language environment initialization; the default precedence list is meant
1030 to be set by the user. 1070 to be set by the user.
1031 */ 1071 */
1072 ())
1073 {
1074 int i;
1075 Lisp_Object list = Qnil;
1076
1077 for (i = Dynarr_length (unicode_precedence_dynarr) - 1; i >= 0; i--)
1078 list = Fcons (Dynarr_at (unicode_precedence_dynarr, i), list);
1079 return list;
1080 }
1081
1082
1083 /* #### This interface is wrong. Cyrillic users and Chinese users are going
1084 to have varying opinions about whether ISO Cyrillic, KOI8-R, or Windows
1085 1251 should take precedence, and whether Big Five or CNS should take
1086 precedence, respectively. This means that users are sometimes going to
1087 want to set Vlanguage_unicode_precedence_list.
1088 Furthermore, this should be language-local (buffer-local would be a
1089 reasonable approximation). */
1090 DEFUN ("set-language-unicode-precedence-list",
1091 Fset_language_unicode_precedence_list,
1092 1, 1, 0, /*
1093 Set the language-specific precedence of charsets in Unicode decoding.
1094 LIST is a list of charsets.
1095 See `unicode-precedence-list' for more information.
1096 */
1032 (list)) 1097 (list))
1033 { 1098 {
1034 { 1099 {
1035 EXTERNAL_LIST_LOOP_2 (elt, list) 1100 EXTERNAL_LIST_LOOP_2 (elt, list)
1036 Fget_charset (elt); 1101 Fget_charset (elt);
1043 1108
1044 DEFUN ("language-unicode-precedence-list", 1109 DEFUN ("language-unicode-precedence-list",
1045 Flanguage_unicode_precedence_list, 1110 Flanguage_unicode_precedence_list,
1046 0, 0, 0, /* 1111 0, 0, 0, /*
1047 Return the language-specific precedence list used for Unicode decoding. 1112 Return the language-specific precedence list used for Unicode decoding.
1048 See `set-language-unicode-precedence-list' for more information. 1113 See `unicode-precedence-list' for more information.
1049 */ 1114 */
1050 ()) 1115 ())
1051 { 1116 {
1052 return Vlanguage_unicode_precedence_list; 1117 return Vlanguage_unicode_precedence_list;
1053 } 1118 }
1054 1119
1055 DEFUN ("set-default-unicode-precedence-list", 1120 DEFUN ("set-default-unicode-precedence-list",
1056 Fset_default_unicode_precedence_list, 1121 Fset_default_unicode_precedence_list,
1057 1, 1, 0, /* 1122 1, 1, 0, /*
1058 Set the default precedence list used for Unicode decoding. 1123 Set the default precedence list used for Unicode decoding.
1059 This is meant to be set by the user. See 1124 This is intended to be set by the user. See
1060 `set-language-unicode-precedence-list' for more information. 1125 `unicode-precedence-list' for more information.
1061 */ 1126 */
1062 (list)) 1127 (list))
1063 { 1128 {
1064 { 1129 {
1065 EXTERNAL_LIST_LOOP_2 (elt, list) 1130 EXTERNAL_LIST_LOOP_2 (elt, list)
1073 1138
1074 DEFUN ("default-unicode-precedence-list", 1139 DEFUN ("default-unicode-precedence-list",
1075 Fdefault_unicode_precedence_list, 1140 Fdefault_unicode_precedence_list,
1076 0, 0, 0, /* 1141 0, 0, 0, /*
1077 Return the default precedence list used for Unicode decoding. 1142 Return the default precedence list used for Unicode decoding.
1078 See `set-language-unicode-precedence-list' for more information. 1143 See `unicode-precedence-list' for more information.
1079 */ 1144 */
1080 ()) 1145 ())
1081 { 1146 {
1082 return Vdefault_unicode_precedence_list; 1147 return Vdefault_unicode_precedence_list;
1083 } 1148 }
1084 1149
1085 DEFUN ("set-unicode-conversion", Fset_unicode_conversion, 1150 DEFUN ("set-unicode-conversion", Fset_unicode_conversion,
1086 2, 2, 0, /* 1151 2, 2, 0, /*
1087 Add conversion information between Unicode codepoints and characters. 1152 Add conversion information between Unicode codepoints and characters.
1153 Conversions for U+0000 to U+00FF are hardwired to ASCII, Control-1, and
1154 Latin-1. Attempts to set these values will raise an error.
1155
1088 CHARACTER is one of the following: 1156 CHARACTER is one of the following:
1089 1157
1090 -- A character (in which case CODE must be a non-negative integer; values 1158 -- A character (in which case CODE must be a non-negative integer; values
1091 above 2^20 - 1 are allowed for the purpose of specifying private 1159 above 2^20 - 1 are allowed for the purpose of specifying private
1092 characters, but will cause errors when converted to utf-16) 1160 characters, but are illegal in standard Unicode---they will cause errors
1161 when converted to utf-16)
1093 -- A vector of characters (in which case CODE must be a vector of integers 1162 -- A vector of characters (in which case CODE must be a vector of integers
1094 of the same length) 1163 of the same length)
1095 */ 1164 */
1096 (character, code)) 1165 (character, code))
1097 { 1166 {
1098 Lisp_Object charset; 1167 Lisp_Object charset;
1168 int ichar, unicode;
1099 1169
1100 CHECK_CHAR (character); 1170 CHECK_CHAR (character);
1101 CHECK_NATNUM (code); 1171 CHECK_NATNUM (code);
1102 1172
1103 charset = ichar_charset (XCHAR (character)); 1173 unicode = XINT (code);
1104 if (EQ (charset, Vcharset_ascii) || 1174 ichar = XCHAR (character);
1105 EQ (charset, Vcharset_control_1) || 1175 charset = ichar_charset (ichar);
1106 EQ (charset, Vcharset_composite)) 1176
1107 signal_error (Qinvalid_argument, "Cannot set Unicode translation for ASCII, Control-1 or Composite chars", 1177 /* The translations of ASCII, Control-1, and Latin-1 code points are
1178 hard-coded in ichar_to_unicode and unicode_to_ichar.
1179
1180 Checking unicode < 256 && ichar != unicode is wrong because Mule gives
1181 many Latin characters code points in a few different character sets. */
1182 if ((EQ (charset, Vcharset_ascii) ||
1183 EQ (charset, Vcharset_control_1) ||
1184 EQ (charset, Vcharset_latin_iso8859_1))
1185 && unicode != ichar)
1186 signal_error (Qinvalid_argument, "Can't change Unicode translation for ASCII, Control-1 or Latin-1 char",
1108 character); 1187 character);
1109 1188
1110 set_unicode_conversion (XCHAR (character), XINT (code)); 1189 /* #### Composite characters are not properly implemented yet. */
1190 if (EQ (charset, Vcharset_composite))
1191 signal_error (Qinvalid_argument, "Can't set Unicode translation for Composite char",
1192 character);
1193
1194 set_unicode_conversion (ichar, unicode);
1111 return Qnil; 1195 return Qnil;
1112 } 1196 }
1113 1197
1114 #endif /* MULE */ 1198 #endif /* MULE */
1115 1199
1116 DEFUN ("char-to-unicode", Fchar_to_unicode, 1, 1, 0, /* 1200 DEFUN ("char-to-unicode", Fchar_to_unicode, 1, 1, 0, /*
1117 Convert character to Unicode codepoint. 1201 Convert character to Unicode codepoint.
1118 When there is no international support (i.e. MULE is not defined), 1202 When there is no international support (i.e. the 'mule feature is not
1119 this function simply does `char-to-int'. 1203 present), this function simply does `char-to-int'.
1120 */ 1204 */
1121 (character)) 1205 (character))
1122 { 1206 {
1123 CHECK_CHAR (character); 1207 CHECK_CHAR (character);
1124 #ifdef MULE 1208 #ifdef MULE
1134 If CHARSETS is given, it should be a list of charsets, and only those 1218 If CHARSETS is given, it should be a list of charsets, and only those
1135 charsets will be consulted, in the given order, for a translation. 1219 charsets will be consulted, in the given order, for a translation.
1136 Otherwise, the default ordering of all charsets will be given (see 1220 Otherwise, the default ordering of all charsets will be given (see
1137 `set-unicode-charset-precedence'). 1221 `set-unicode-charset-precedence').
1138 1222
1139 When there is no international support (i.e. MULE is not defined), 1223 When there is no international support (i.e. the 'mule feature is not
1140 this function simply does `int-to-char' and ignores the CHARSETS 1224 present), this function simply does `int-to-char' and ignores the CHARSETS
1141 argument.. 1225 argument.
1142 */ 1226 */
1143 (code, charsets)) 1227 (code, charsets))
1144 { 1228 {
1145 #ifdef MULE 1229 #ifdef MULE
1146 Lisp_Object_dynarr *dyn; 1230 Lisp_Object_dynarr *dyn;
1154 Fget_charset (elt); 1238 Fget_charset (elt);
1155 } 1239 }
1156 1240
1157 if (NILP (charsets)) 1241 if (NILP (charsets))
1158 { 1242 {
1159 Ichar ret = unicode_to_char (c, unicode_precedence_dynarr); 1243 Ichar ret = unicode_to_ichar (c, unicode_precedence_dynarr);
1160 if (ret == -1) 1244 if (ret == -1)
1161 return Qnil; 1245 return Qnil;
1162 return make_char (ret); 1246 return make_char (ret);
1163 } 1247 }
1164 1248
1165 dyn = Dynarr_new (Lisp_Object); 1249 dyn = Dynarr_new (Lisp_Object);
1166 memset (lbs, 0, NUM_LEADING_BYTES * sizeof (int)); 1250 memset (lbs, 0, NUM_LEADING_BYTES * sizeof (int));
1167 add_charsets_to_precedence_list (charsets, lbs, dyn); 1251 add_charsets_to_precedence_list (charsets, lbs, dyn);
1168 { 1252 {
1169 Ichar ret = unicode_to_char (c, unicode_precedence_dynarr); 1253 Ichar ret = unicode_to_ichar (c, dyn);
1170 Dynarr_free (dyn); 1254 Dynarr_free (dyn);
1171 if (ret == -1) 1255 if (ret == -1)
1172 return Qnil; 1256 return Qnil;
1173 return make_char (ret); 1257 return make_char (ret);
1174 } 1258 }
1186 FILE *file = (FILE *) get_opaque_ptr (fulano); 1270 FILE *file = (FILE *) get_opaque_ptr (fulano);
1187 retry_fclose (file); 1271 retry_fclose (file);
1188 return Qnil; 1272 return Qnil;
1189 } 1273 }
1190 1274
1275 /* #### shouldn't this interface be called load-unicode-mapping-table
1276 for consistency with Unicode Consortium terminology? */
1191 DEFUN ("parse-unicode-translation-table", Fparse_unicode_translation_table, 1277 DEFUN ("parse-unicode-translation-table", Fparse_unicode_translation_table,
1192 2, 6, 0, /* 1278 2, 6, 0, /*
1193 Parse Unicode translation data in FILENAME for CHARSET. 1279 Load Unicode tables with the Unicode mapping data in FILENAME for CHARSET.
1194 Data is text, in the form of one translation per line -- charset 1280 Data is text, in the form of one translation per line -- charset
1195 codepoint followed by Unicode codepoint. Numbers are decimal or hex 1281 codepoint followed by Unicode codepoint. Numbers are decimal or hex
1196 \(preceded by 0x). Comments are marked with a #. Charset codepoints 1282 \(preceded by 0x). Comments are marked with a #. Charset codepoints
1197 for two-dimensional charsets should have the first octet stored in the 1283 for two-dimensional charsets have the first octet stored in the
1198 high 8 bits of the hex number and the second in the low 8 bits. 1284 high 8 bits of the hex number and the second in the low 8 bits.
1199 1285
1200 If START and END are given, only charset codepoints within the given 1286 If START and END are given, only charset codepoints within the given
1201 range will be processed. If OFFSET is given, that value will be added 1287 range will be processed. (START and END apply to the codepoints in the
1202 to all charset codepoints in the file to obtain the internal charset 1288 file, before OFFSET is applied.)
1203 codepoint. START and END apply to the codepoints in the file, before 1289
1204 OFFSET is applied. 1290 If OFFSET is given, that value will be added to all charset codepoints
1205 1291 in the file to obtain the internal charset codepoint. \(We assume
1206 \(Note that, as usual, we assume that octets are in the range 32 to 1292 that octets in the table are in the range 33 to 126 or 32 to 127. If
1207 127 or 33 to 126. If you have a table in kuten form, with octets in 1293 you have a table in ku-ten form, with octets in the range 1 to 94, you
1208 the range 1 to 94, you will have to use an offset of 5140, 1294 will have to use an offset of 5140, i.e. 0x2020.)
1209 i.e. 0x2020.)
1210 1295
1211 FLAGS, if specified, control further how the tables are interpreted 1296 FLAGS, if specified, control further how the tables are interpreted
1212 and are used to special-case certain known table weirdnesses in the 1297 and are used to special-case certain known format deviations in the
1213 Unicode tables: 1298 Unicode tables or in the charset:
1214 1299
1215 `ignore-first-column' 1300 `ignore-first-column'
1216 Exactly as it sounds. The JIS X 0208 tables have 3 columns of data instead 1301 The JIS X 0208 tables have 3 columns of data instead of 2. The first
1217 of 2; the first is the Shift-JIS codepoint. 1302 column contains the Shift-JIS codepoint, which we ignore.
1218 `big5' 1303 `big5'
1219 The charset codepoint is a Big Five codepoint; convert it to the 1304 The charset codepoints are Big Five codepoints; convert them to the
1220 proper hacked-up codepoint in `chinese-big5-1' or `chinese-big5-2'. 1305 hacked-up Mule codepoint in `chinese-big5-1' or `chinese-big5-2'.
1221 */ 1306 */
1222 (filename, charset, start, end, offset, flags)) 1307 (filename, charset, start, end, offset, flags))
1223 { 1308 {
1224 int st = 0, en = INT_MAX, of = 0; 1309 int st = 0, en = INT_MAX, of = 0;
1225 FILE *file; 1310 FILE *file;
1257 ignore_first_column = 1; 1342 ignore_first_column = 1;
1258 else if (EQ (elt, Qbig5)) 1343 else if (EQ (elt, Qbig5))
1259 big5 = 1; 1344 big5 = 1;
1260 else 1345 else
1261 invalid_constant 1346 invalid_constant
1262 ("Unrecognized `parse-unicode-table' flag", elt); 1347 ("Unrecognized `parse-unicode-translation-table' flag", elt);
1263 } 1348 }
1264 } 1349 }
1265 1350
1266 GCPRO1 (filename); 1351 GCPRO1 (filename);
1267 filename = Fexpand_file_name (filename, Qnil); 1352 filename = Fexpand_file_name (filename, Qnil);
1432 if (ch == 0xFEFF && !data->seen_char && ignore_bom) 1517 if (ch == 0xFEFF && !data->seen_char && ignore_bom)
1433 ; 1518 ;
1434 else 1519 else
1435 { 1520 {
1436 #ifdef MULE 1521 #ifdef MULE
1437 Ichar chr = unicode_to_char (ch, unicode_precedence_dynarr); 1522 Ichar chr = unicode_to_ichar (ch, unicode_precedence_dynarr);
1438 1523
1439 if (chr != -1) 1524 if (chr != -1)
1440 { 1525 {
1441 Ibyte work[MAX_ICHAR_LEN]; 1526 Ibyte work[MAX_ICHAR_LEN];
1442 int len; 1527 int len;
2141 2226
2142 void 2227 void
2143 syms_of_unicode (void) 2228 syms_of_unicode (void)
2144 { 2229 {
2145 #ifdef MULE 2230 #ifdef MULE
2231 DEFSUBR (Funicode_precedence_list);
2146 DEFSUBR (Fset_language_unicode_precedence_list); 2232 DEFSUBR (Fset_language_unicode_precedence_list);
2147 DEFSUBR (Flanguage_unicode_precedence_list); 2233 DEFSUBR (Flanguage_unicode_precedence_list);
2148 DEFSUBR (Fset_default_unicode_precedence_list); 2234 DEFSUBR (Fset_default_unicode_precedence_list);
2149 DEFSUBR (Fdefault_unicode_precedence_list); 2235 DEFSUBR (Fdefault_unicode_precedence_list);
2150 DEFSUBR (Fset_unicode_conversion); 2236 DEFSUBR (Fset_unicode_conversion);