/* Retrieved from the xemacs-beta Mercurial repository (annotate view):
   src/charset.h @ 5287:cd167465bf69 -- "More permission consistency."
   Author:   Stephen J. Turnbull <stephen@xemacs.org>
   Date:     Mon, 14 Jun 2010 15:03:08 +0900
   Parents:  70ed8a0d8da8
   Children: 308d34e9f07d */
/* Header for charsets.
   Copyright (C) 1992, 1995 Free Software Foundation, Inc.
   Copyright (C) 1995 Sun Microsystems, Inc.
   Copyright (C) 2001, 2002, 2010 Ben Wing.

This file is part of XEmacs.

XEmacs is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2, or (at your option) any
later version.

XEmacs is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with XEmacs; see the file COPYING.  If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.  */

/* Synched up with: Mule 2.3.  Not in FSF. */

/* Rewritten by Ben Wing <ben@xemacs.org>. */

27 #ifndef INCLUDED_charset_h_ | |
28 #define INCLUDED_charset_h_ | |
29 | |
30 | |
31 | |
32 #ifndef MULE | |
33 | |
/************************************************************************/
/*                            fake charset defs                         */
/************************************************************************/

/* used when MULE is not defined, so that Charset-type stuff can still
   be done */

/* Every stub below ignores its argument(s) and expands to a constant:
   there is a single pseudo-charset (Vcharset_ascii, which is Qnil) and
   every character is one byte long. */

#define Vcharset_ascii Qnil

#define ichar_charset(ch) Vcharset_ascii
#define ichar_leading_byte(ch) LEADING_BYTE_ASCII
#define ichar_len(ch) 1
#define ichar_len_fmt(ch, fmt) 1
#define LEADING_BYTE_ASCII 0x80
#define NUM_LEADING_BYTES 1
#define MIN_LEADING_BYTE 0x80
#define CHARSETP(cs) 1
#define charset_by_leading_byte(lb) Vcharset_ascii
#define XCHARSET_LEADING_BYTE(cs) LEADING_BYTE_ASCII
/* Parenthesized so the negative constant is safe in any expression
   context. */
#define XCHARSET_GRAPHIC(cs) (-1)
#define XCHARSET_COLUMNS(cs) 1
#define XCHARSET_DIMENSION(cs) 1
/* Stores Vcharset_ascii in CHARSET, CH itself in BYTE1 and 0 in BYTE2. */
#define BREAKUP_ICHAR(ch, charset, byte1, byte2) do {	\
  (charset) = Vcharset_ascii;				\
  (byte1) = (ch);					\
  (byte2) = 0;						\
} while (0)
#define XCHARSET_CCL_PROGRAM(cs) Qnil
#define XCHARSET_NAME(cs) Qascii
/* Identity: the argument is returned unchanged. */
#define Fget_charset(cs) (cs)
#define Fcharset_list() list1 (Vcharset_ascii)

66 #else /* MULE */ | |
67 | |
68 | |
/************************************************************************/
/*                    Definition of leading bytes                       */
/************************************************************************/

/* Smallest leading-byte value; equal to LEADING_BYTE_ASCII. */
#define MIN_LEADING_BYTE		0x7F

/** The following are for 1-byte characters in an official charset. **/
enum LEADING_BYTE_OFFICIAL_1
{
  MIN_LEADING_BYTE_OFFICIAL_1 = 0x80,
  /* LEADING_BYTE_LATIN_ISO8859_1 *MUST* be equal to
     MIN_LEADING_BYTE_OFFICIAL_1. */
  LEADING_BYTE_LATIN_ISO8859_1 =  /* 0x80 Right half of ISO 8859-1 */
    MIN_LEADING_BYTE_OFFICIAL_1,
  LEADING_BYTE_LATIN_ISO8859_2,   /* 0x81 Right half of ISO 8859-2 */
  LEADING_BYTE_LATIN_ISO8859_3,   /* 0x82 Right half of ISO 8859-3 */
  LEADING_BYTE_LATIN_ISO8859_4,   /* 0x83 Right half of ISO 8859-4 */
  LEADING_BYTE_THAI_TIS620,       /* 0x84 TIS620-2533 */
  LEADING_BYTE_GREEK_ISO8859_7,   /* 0x85 Right half of ISO 8859-7 */
  LEADING_BYTE_ARABIC_ISO8859_6,  /* 0x86 Right half of ISO 8859-6 */
  LEADING_BYTE_HEBREW_ISO8859_8,  /* 0x87 Right half of ISO 8859-8 */
  LEADING_BYTE_KATAKANA_JISX0201, /* 0x88 Right half of JIS X0201-1976 */
  LEADING_BYTE_LATIN_JISX0201,    /* 0x89 Left  half of JIS X0201-1976 */
  LEADING_BYTE_CYRILLIC_ISO8859_5,/* 0x8A Right half of ISO 8859-5 */
  LEADING_BYTE_LATIN_ISO8859_9,   /* 0x8B Right half of ISO 8859-9 */
  LEADING_BYTE_LATIN_ISO8859_15,  /* 0x8C Right half of ISO 8859-15 */
#ifdef ENABLE_COMPOSITE_CHARS
  LEADING_BYTE_COMPOSITE,         /* 0x8D For a composite character */
  /* The composite leading byte is excluded from the official range. */
  MAX_LEADING_BYTE_OFFICIAL_1 =
    LEADING_BYTE_COMPOSITE - 1,
#else
  /* Does not need to be the last entry, but simplifies things */
  LEADING_BYTE_COMPOSITE_REPLACEMENT, /* 0x8D Replaces ESC 0 - ESC 4 in a
                                         buffer */
  MAX_LEADING_BYTE_OFFICIAL_1 =
    LEADING_BYTE_COMPOSITE_REPLACEMENT,
#endif
                                  /* 0x8E Unused */
};

/* These next 2 + LEADING_BYTE_COMPOSITE need special treatment in a string
   and/or character */

#define LEADING_BYTE_ASCII		0x7F /* Not used except in arrays
					      indexed by leading byte */
#define LEADING_BYTE_CONTROL_1		0x8F /* represent normal 80-9F */

/** The following are for 2-byte characters in an official charset. **/
enum LEADING_BYTE_OFFICIAL_2
{
  MIN_LEADING_BYTE_OFFICIAL_2 = 0x90,
  LEADING_BYTE_JAPANESE_JISX0208_1978 =
    MIN_LEADING_BYTE_OFFICIAL_2,     /* 0x90 Japanese JIS X0208-1978 */
  LEADING_BYTE_CHINESE_GB2312,       /* 0x91 Chinese Hanzi GB2312-1980 */
  LEADING_BYTE_JAPANESE_JISX0208,    /* 0x92 Japanese JIS X0208-1983 */
  LEADING_BYTE_KOREAN_KSC5601,       /* 0x93 Hangul KS C5601-1987 */
  LEADING_BYTE_JAPANESE_JISX0212,    /* 0x94 Japanese JIS X0212-1990 */
  LEADING_BYTE_CHINESE_CNS11643_1,   /* 0x95 Chinese CNS11643 Set 1 */
  LEADING_BYTE_CHINESE_CNS11643_2,   /* 0x96 Chinese CNS11643 Set 2 */
  LEADING_BYTE_CHINESE_BIG5_1,       /* 0x97 Big5 Level 1 */
  LEADING_BYTE_CHINESE_BIG5_2,       /* 0x98 Big5 Level 2 */
  MAX_LEADING_BYTE_OFFICIAL_2 =
    LEADING_BYTE_CHINESE_BIG5_2,

                                     /* 0x99 unused */
                                     /* 0x9A unused */
                                     /* 0x9B unused */
                                     /* 0x9C unused */
                                     /* 0x9D unused */
};


/** The following are for 1- and 2-byte characters in a private charset. **/

/* Prefix bytes; see leading_byte_prefix_p() and
   private_leading_byte_prefix(). */
#define PRE_LEADING_BYTE_PRIVATE_1	0x9E	/* 1-byte char-set */
#define PRE_LEADING_BYTE_PRIVATE_2	0x9F	/* 2-byte char-set */

#define MIN_LEADING_BYTE_PRIVATE_1	0xA0
#define MAX_LEADING_BYTE_PRIVATE_1	0xC0
#define MIN_LEADING_BYTE_PRIVATE_2	0xC1
#define MAX_LEADING_BYTE_PRIVATE_2	0xFF

/* Number of values in 0x7F (MIN_LEADING_BYTE) .. 0xFF inclusive; used as
   the size of tables indexed by (leading byte - MIN_LEADING_BYTE). */
#define NUM_LEADING_BYTES 129


/************************************************************************/
/*                    Operations on leading bytes                       */
/************************************************************************/

/* Is this leading byte for a private charset? */
/* True for any LB at or above MIN_LEADING_BYTE_PRIVATE_1; there is no
   upper-bound check. */

#define leading_byte_private_p(lb) ((lb) >= MIN_LEADING_BYTE_PRIVATE_1)

162 /* Is this a prefix for a private leading byte? */ | |
163 | |
826 | 164 DECLARE_INLINE_HEADER ( |
165 int | |
867 | 166 leading_byte_prefix_p (Ibyte lb) |
826 | 167 ) |
771 | 168 { |
169 return (lb == PRE_LEADING_BYTE_PRIVATE_1 || | |
170 lb == PRE_LEADING_BYTE_PRIVATE_2); | |
171 } | |
172 | |
/* Given a private leading byte, return the leading byte prefix stored
   in a string. */
/* LB is compared as unsigned: values below MIN_LEADING_BYTE_PRIVATE_2
   select PRE_LEADING_BYTE_PRIVATE_1, all others
   PRE_LEADING_BYTE_PRIVATE_2.  LB is not checked for actually being a
   private leading byte. */

#define private_leading_byte_prefix(lb)			\
  ((unsigned int) (lb) < MIN_LEADING_BYTE_PRIVATE_2 ?	\
   PRE_LEADING_BYTE_PRIVATE_1 :				\
   PRE_LEADING_BYTE_PRIVATE_2)


182 /************************************************************************/ | |
183 /* Information about a particular character set */ | |
184 /************************************************************************/ | |
185 | |
186 struct Lisp_Charset | |
187 { | |
5127
a9c41067dd88
more cleanups, terminology clarification, lots of doc work
Ben Wing <ben@xemacs.org>
parents:
5125
diff
changeset
|
188 NORMAL_LISP_OBJECT_HEADER header; |
771 | 189 |
190 int id; | |
191 Lisp_Object name; | |
192 Lisp_Object doc_string; | |
3659 | 193 Lisp_Object registries; |
771 | 194 Lisp_Object short_name; |
195 Lisp_Object long_name; | |
196 | |
197 Lisp_Object reverse_direction_charset; | |
198 | |
199 Lisp_Object ccl_program; | |
200 | |
2367 | 201 /* Unicode translation tables. See unicode.c for the format of these |
202 tables, and discussion of how they are initialized. | |
203 */ | |
771 | 204 void *to_unicode_table; |
205 void *from_unicode_table; | |
206 int from_unicode_levels; | |
207 | |
2367 | 208 /* Final byte of this character set in ISO2022 designating escape |
209 sequence */ | |
867 | 210 Ibyte final; |
771 | 211 |
212 /* Number of bytes (1 - 4) required in the internal representation | |
213 for characters in this character set. This is *not* the | |
214 same as the dimension of the character set). */ | |
215 int rep_bytes; | |
216 | |
217 /* Number of columns a character in this charset takes up, on TTY | |
218 devices. Not used for X devices. */ | |
219 int columns; | |
220 | |
221 /* Direction of this character set */ | |
222 int direction; | |
223 | |
224 /* Type of this character set (94, 96, 94x94, 96x96) */ | |
225 int type; | |
226 | |
227 /* Number of bytes used in encoding of this character set (1 or 2) */ | |
228 int dimension; | |
229 | |
230 /* Number of chars in each dimension (usually 94 or 96) */ | |
231 int chars; | |
232 | |
233 /* Which half of font to be used to display this character set */ | |
234 int graphic; | |
235 | |
3439 | 236 /* If set, this charset should be written out in ISO-2022-based coding |
237 systems using the escape sequence for UTF-8, not using our internal | |
238 representation and the associated real ISO 2022 designation. */ | |
239 unsigned int encode_as_utf_8 :1; | |
240 | |
771 | 241 /* If set, this is a "temporary" charset created when we encounter |
242 an unknown final. This is so that we can successfully compile | |
243 and load such files. We allow a real charset to be created on top | |
244 of this temporary charset. */ | |
245 unsigned int temporary :1; | |
246 }; | |
247 typedef struct Lisp_Charset Lisp_Charset; | |
248 | |
/* Lisp-object plumbing for charsets: declaration of the object type plus
   the usual unwrap / wrap / predicate / check macros, all expressed via
   the generic record macros. */
DECLARE_LISP_OBJECT (charset, Lisp_Charset);
#define XCHARSET(x) XRECORD (x, charset, Lisp_Charset)
#define wrap_charset(p) wrap_record (p, charset)
#define CHARSETP(x) RECORDP (x, charset)
#define CHECK_CHARSET(x) CHECK_RECORD (x, charset)
#define CONCHECK_CHARSET(x) CONCHECK_RECORD (x, charset)

/* Charset type codes. */
#define CHARSET_TYPE_94    0 /* This charset includes 94    characters. */
#define CHARSET_TYPE_96    1 /* This charset includes 96    characters. */
#define CHARSET_TYPE_94X94 2 /* This charset includes 94x94 characters. */
#define CHARSET_TYPE_96X96 3 /* This charset includes 96x96 characters. */

/* Charset direction codes. */
#define CHARSET_LEFT_TO_RIGHT	0
#define CHARSET_RIGHT_TO_LEFT	1

/* Field accessors for a `Lisp_Charset *' (CS is a pointer, not a
   Lisp_Object; see the XCHARSET_* variants for Lisp_Objects). */

/* Leading byte and id have been regrouped. -- OG */
#define CHARSET_ID(cs)		 ((cs)->id)
#define CHARSET_LEADING_BYTE(cs) ((Ibyte) CHARSET_ID (cs))
#define CHARSET_NAME(cs)	 ((cs)->name)
#define CHARSET_SHORT_NAME(cs)	 ((cs)->short_name)
#define CHARSET_LONG_NAME(cs)	 ((cs)->long_name)
#define CHARSET_REP_BYTES(cs)	 ((cs)->rep_bytes)
#define CHARSET_COLUMNS(cs)	 ((cs)->columns)
#define CHARSET_GRAPHIC(cs)	 ((cs)->graphic)
#define CHARSET_ENCODE_AS_UTF_8(cs) ((cs)->encode_as_utf_8)
#define CHARSET_TYPE(cs)	 ((cs)->type)
#define CHARSET_DIRECTION(cs)	 ((cs)->direction)
#define CHARSET_FINAL(cs)	 ((cs)->final)
#define CHARSET_DOC_STRING(cs)	 ((cs)->doc_string)
#define CHARSET_REGISTRIES(cs)	 ((cs)->registries)
#define CHARSET_CCL_PROGRAM(cs)  ((cs)->ccl_program)
#define CHARSET_DIMENSION(cs)	 ((cs)->dimension)
#define CHARSET_CHARS(cs)	 ((cs)->chars)
#define CHARSET_REVERSE_DIRECTION_CHARSET(cs) ((cs)->reverse_direction_charset)
#define CHARSET_TO_UNICODE_TABLE(cs) ((cs)->to_unicode_table)
#define CHARSET_FROM_UNICODE_TABLE(cs) ((cs)->from_unicode_table)
#define CHARSET_FROM_UNICODE_LEVELS(cs) ((cs)->from_unicode_levels)

/* Nonzero if the charset's leading byte is in the private range. */
#define CHARSET_PRIVATE_P(cs) leading_byte_private_p (CHARSET_LEADING_BYTE (cs))

/* Same accessors as CHARSET_*, but taking a charset Lisp_Object: each one
   unwraps its argument with XCHARSET() first. */
#define XCHARSET_ID(cs)		  CHARSET_ID           (XCHARSET (cs))
#define XCHARSET_NAME(cs)	  CHARSET_NAME         (XCHARSET (cs))
#define XCHARSET_SHORT_NAME(cs)	  CHARSET_SHORT_NAME   (XCHARSET (cs))
#define XCHARSET_LONG_NAME(cs)	  CHARSET_LONG_NAME    (XCHARSET (cs))
#define XCHARSET_REP_BYTES(cs)	  CHARSET_REP_BYTES    (XCHARSET (cs))
#define XCHARSET_COLUMNS(cs)	  CHARSET_COLUMNS      (XCHARSET (cs))
#define XCHARSET_GRAPHIC(cs)      CHARSET_GRAPHIC      (XCHARSET (cs))
#define XCHARSET_ENCODE_AS_UTF_8(cs) CHARSET_ENCODE_AS_UTF_8 (XCHARSET (cs))
#define XCHARSET_TYPE(cs)	  CHARSET_TYPE         (XCHARSET (cs))
#define XCHARSET_DIRECTION(cs)	  CHARSET_DIRECTION    (XCHARSET (cs))
#define XCHARSET_FINAL(cs)	  CHARSET_FINAL        (XCHARSET (cs))
#define XCHARSET_DOC_STRING(cs)	  CHARSET_DOC_STRING   (XCHARSET (cs))
#define XCHARSET_REGISTRIES(cs)	  CHARSET_REGISTRIES   (XCHARSET (cs))
#define XCHARSET_LEADING_BYTE(cs) CHARSET_LEADING_BYTE (XCHARSET (cs))
#define XCHARSET_CCL_PROGRAM(cs)  CHARSET_CCL_PROGRAM  (XCHARSET (cs))
#define XCHARSET_DIMENSION(cs)	  CHARSET_DIMENSION    (XCHARSET (cs))
#define XCHARSET_CHARS(cs)	  CHARSET_CHARS        (XCHARSET (cs))

#define XCHARSET_PRIVATE_P(cs)	  CHARSET_PRIVATE_P    (XCHARSET (cs))
#define XCHARSET_REVERSE_DIRECTION_CHARSET(cs) \
  CHARSET_REVERSE_DIRECTION_CHARSET (XCHARSET (cs))
#define XCHARSET_TO_UNICODE_TABLE(cs) \
  CHARSET_TO_UNICODE_TABLE (XCHARSET (cs))
#define XCHARSET_FROM_UNICODE_TABLE(cs) \
  CHARSET_FROM_UNICODE_TABLE (XCHARSET (cs))
#define XCHARSET_FROM_UNICODE_LEVELS(cs) \
  CHARSET_FROM_UNICODE_LEVELS (XCHARSET (cs))

317 struct charset_lookup | |
318 { | |
319 /* Table of charsets indexed by leading byte. */ | |
320 Lisp_Object charset_by_leading_byte[NUM_LEADING_BYTES]; | |
321 | |
322 /* Table of charsets indexed by type/final-byte/direction. */ | |
323 Lisp_Object charset_by_attributes[4][128][2]; | |
867 | 324 Ibyte next_allocated_1_byte_leading_byte; |
325 Ibyte next_allocated_2_byte_leading_byte; | |
771 | 326 }; |
327 | |
1111 | 328 extern struct charset_lookup *chlook; |
329 | |
826 | 330 DECLARE_INLINE_HEADER ( |
331 Lisp_Object | |
332 charset_by_leading_byte (int lb) | |
333 ) | |
771 | 334 { |
800 | 335 #ifdef ERROR_CHECK_TEXT |
771 | 336 /* When error-checking is on, x86 GCC 2.95.2 -O3 miscompiles the |
337 following unless we introduce `tem'. */ | |
338 int tem = lb; | |
800 | 339 text_checking_assert (tem >= MIN_LEADING_BYTE && tem <= 0xFF); |
771 | 340 #endif |
341 return chlook->charset_by_leading_byte[lb - MIN_LEADING_BYTE]; | |
342 } | |
343 | |
826 | 344 DECLARE_INLINE_HEADER ( |
345 Lisp_Object | |
346 charset_by_attributes (int type, int final, int dir) | |
347 ) | |
771 | 348 { |
349 type_checking_assert (type < countof (chlook->charset_by_attributes) && | |
350 final < countof (chlook->charset_by_attributes[0]) && | |
351 dir < countof (chlook->charset_by_attributes[0][0])); | |
352 return chlook->charset_by_attributes[type][final][dir]; | |
353 } | |
354 | |
355 | |
/************************************************************************/
/*                        Dealing with characters                       */
/************************************************************************/

/* The bit fields of character are divided into 3 parts:
   FIELD1(7bits):FIELD2(7bits):FIELD3(7bits) */

#define ICHAR_FIELD1_MASK (0x7F << 14)
#define ICHAR_FIELD2_MASK (0x7F << 7)
#define ICHAR_FIELD3_MASK 0x7F

/* Macros to access each field of a character code of C.  */

#define ichar_field1(c) (((c) & ICHAR_FIELD1_MASK) >> 14)
#define ichar_field2(c) (((c) & ICHAR_FIELD2_MASK) >> 7)
#define ichar_field3(c)  ((c) & ICHAR_FIELD3_MASK)

/* Field 1, if non-zero, usually holds a leading byte for a
   dimension-2 charset.  Field 2, if non-zero, usually holds a leading
   byte for a dimension-1 charset. */

/* Converting between field values and leading bytes.  Each constant is
   the offset added to a field value to obtain the leading byte (and
   subtracted from a leading byte to obtain the field value). */

#define FIELD2_TO_OFFICIAL_LEADING_BYTE (MIN_LEADING_BYTE_OFFICIAL_1 - 1)
#define FIELD2_TO_PRIVATE_LEADING_BYTE  0x80

#define FIELD1_TO_OFFICIAL_LEADING_BYTE (MIN_LEADING_BYTE_OFFICIAL_2 - 1)
#define FIELD1_TO_PRIVATE_LEADING_BYTE  0x80

/* Minimum and maximum allowed values for the fields.  Each is the
   corresponding leading-byte bound minus the field-to-leading-byte
   offset. */

#define MIN_ICHAR_FIELD2_OFFICIAL \
  (MIN_LEADING_BYTE_OFFICIAL_1 - FIELD2_TO_OFFICIAL_LEADING_BYTE)
#define MAX_ICHAR_FIELD2_OFFICIAL \
  (MAX_LEADING_BYTE_OFFICIAL_1 - FIELD2_TO_OFFICIAL_LEADING_BYTE)

#define MIN_ICHAR_FIELD1_OFFICIAL \
  (MIN_LEADING_BYTE_OFFICIAL_2 - FIELD1_TO_OFFICIAL_LEADING_BYTE)
#define MAX_ICHAR_FIELD1_OFFICIAL \
  (MAX_LEADING_BYTE_OFFICIAL_2 - FIELD1_TO_OFFICIAL_LEADING_BYTE)

#define MIN_ICHAR_FIELD2_PRIVATE \
  (MIN_LEADING_BYTE_PRIVATE_1 - FIELD2_TO_PRIVATE_LEADING_BYTE)
#define MAX_ICHAR_FIELD2_PRIVATE \
  (MAX_LEADING_BYTE_PRIVATE_1 - FIELD2_TO_PRIVATE_LEADING_BYTE)

#define MIN_ICHAR_FIELD1_PRIVATE \
  (MIN_LEADING_BYTE_PRIVATE_2 - FIELD1_TO_PRIVATE_LEADING_BYTE)
#define MAX_ICHAR_FIELD1_PRIVATE \
  (MAX_LEADING_BYTE_PRIVATE_2 - FIELD1_TO_PRIVATE_LEADING_BYTE)

/* Minimum character code of each <type> character.  Obtained by shifting
   the minimum field value into field 2 (<< 7) or field 1 (<< 14). */

#define MIN_CHAR_OFFICIAL_TYPE9N    (MIN_ICHAR_FIELD2_OFFICIAL <<  7)
#define MIN_CHAR_PRIVATE_TYPE9N     (MIN_ICHAR_FIELD2_PRIVATE  <<  7)
#define MIN_CHAR_OFFICIAL_TYPE9NX9N (MIN_ICHAR_FIELD1_OFFICIAL << 14)
#define MIN_CHAR_PRIVATE_TYPE9NX9N  (MIN_ICHAR_FIELD1_PRIVATE  << 14)
/* Field 1 with all seven bits set. */
#define MIN_CHAR_COMPOSITION        (0x7F << 14)

415 /* Leading byte of a character. | |
416 | |
417 NOTE: This takes advantage of the fact that | |
418 FIELD2_TO_OFFICIAL_LEADING_BYTE and | |
419 FIELD2_TO_PRIVATE_LEADING_BYTE are the same. | |
420 */ | |
421 | |
826 | 422 DECLARE_INLINE_HEADER ( |
867 | 423 Ibyte |
424 ichar_leading_byte (Ichar c) | |
826 | 425 ) |
771 | 426 { |
867 | 427 if (ichar_ascii_p (c)) |
771 | 428 return LEADING_BYTE_ASCII; |
429 else if (c < 0xA0) | |
430 return LEADING_BYTE_CONTROL_1; | |
431 else if (c < MIN_CHAR_OFFICIAL_TYPE9NX9N) | |
867 | 432 return ichar_field2 (c) + FIELD2_TO_OFFICIAL_LEADING_BYTE; |
771 | 433 else if (c < MIN_CHAR_PRIVATE_TYPE9NX9N) |
867 | 434 return ichar_field1 (c) + FIELD1_TO_OFFICIAL_LEADING_BYTE; |
771 | 435 else if (c < MIN_CHAR_COMPOSITION) |
867 | 436 return ichar_field1 (c) + FIELD1_TO_PRIVATE_LEADING_BYTE; |
771 | 437 else |
438 { | |
439 #ifdef ENABLE_COMPOSITE_CHARS | |
440 return LEADING_BYTE_COMPOSITE; | |
441 #else | |
2500 | 442 ABORT(); |
771 | 443 return 0; |
444 #endif /* ENABLE_COMPOSITE_CHARS */ | |
445 } | |
446 } | |
447 | |
826 | 448 DECLARE_INLINE_HEADER ( |
449 Bytecount | |
867 | 450 ichar_len (Ichar c) |
826 | 451 ) |
452 { | |
867 | 453 if (ichar_ascii_p (c)) |
826 | 454 return 1; |
455 else if (c < MIN_CHAR_OFFICIAL_TYPE9NX9N) | |
456 return 2; | |
457 else if (c < MIN_CHAR_PRIVATE_TYPE9NX9N) | |
458 return 3; /* dimension-2 official or dimension-1 private */ | |
459 else if (c < MIN_CHAR_COMPOSITION) | |
460 return 4; | |
461 else | |
462 { | |
463 #ifdef ENABLE_COMPOSITE_CHARS | |
464 #error Not yet implemented | |
465 #else | |
2500 | 466 ABORT(); |
826 | 467 return 0; |
468 #endif /* ENABLE_COMPOSITE_CHARS */ | |
469 } | |
470 } | |
471 | |
472 DECLARE_INLINE_HEADER ( | |
473 Bytecount | |
867 | 474 ichar_len_fmt (Ichar c, Internal_Format fmt) |
826 | 475 ) |
476 { | |
477 switch (fmt) | |
478 { | |
479 case FORMAT_DEFAULT: | |
867 | 480 return ichar_len (c); |
826 | 481 case FORMAT_16_BIT_FIXED: |
482 return 2; | |
483 case FORMAT_32_BIT_FIXED: | |
484 return 4; | |
485 default: | |
486 text_checking_assert (fmt == FORMAT_8_BIT_FIXED); | |
487 return 1; | |
488 } | |
489 } | |
490 | |
/* Charset of character C, looked up through its leading byte. */
#define ichar_charset(c) charset_by_leading_byte (ichar_leading_byte (c))

493 /* Return a character whose charset is CHARSET and position-codes are C1 | |
494 and C2. TYPE9N character ignores C2. (For typical charsets, i.e. not | |
495 ASCII, Control-1 or Composite, C1 and C2 will be in the range of 32 to | |
496 127 or 33 to 126. See `make-char'.) | |
497 | |
498 NOTE: This takes advantage of the fact that | |
499 FIELD2_TO_OFFICIAL_LEADING_BYTE and | |
500 FIELD2_TO_PRIVATE_LEADING_BYTE are the same. | |
501 */ | |
502 | |
826 | 503 DECLARE_INLINE_HEADER ( |
867 | 504 Ichar |
505 make_ichar (Lisp_Object charset, int c1, int c2) | |
826 | 506 ) |
771 | 507 { |
867 | 508 Ichar retval; |
771 | 509 if (EQ (charset, Vcharset_ascii)) |
826 | 510 retval = c1; |
771 | 511 else if (EQ (charset, Vcharset_control_1)) |
826 | 512 retval = c1 | 0x80; |
771 | 513 #ifdef ENABLE_COMPOSITE_CHARS |
514 else if (EQ (charset, Vcharset_composite)) | |
826 | 515 retval = (0x1F << 14) | ((c1) << 7) | (c2); |
771 | 516 #endif |
517 else if (XCHARSET_DIMENSION (charset) == 1) | |
826 | 518 retval = ((XCHARSET_LEADING_BYTE (charset) - |
519 FIELD2_TO_OFFICIAL_LEADING_BYTE) << 7) | (c1); | |
771 | 520 else if (!XCHARSET_PRIVATE_P (charset)) |
826 | 521 retval = ((XCHARSET_LEADING_BYTE (charset) - |
522 FIELD1_TO_OFFICIAL_LEADING_BYTE) << 14) | ((c1) << 7) | (c2); | |
771 | 523 else |
826 | 524 retval = ((XCHARSET_LEADING_BYTE (charset) - |
525 FIELD1_TO_PRIVATE_LEADING_BYTE) << 14) | ((c1) << 7) | (c2); | |
867 | 526 text_checking_assert (valid_ichar_p (retval)); |
826 | 527 return retval; |
771 | 528 } |
529 | |
/* BREAKUP_ICHAR_1_UNSAFE assumes that the charset has already been
   calculated, and just computes c1 and c2.

   BREAKUP_ICHAR also computes and stores the charset.

   For a dimension-1 CHARSET, C1 receives field 3 of C and C2 receives 0;
   otherwise C1 receives field 2 and C2 receives field 3.  C is evaluated
   more than once.  The expansion is fully parenthesized so that it
   behaves as a single expression wherever it is used. */

#define BREAKUP_ICHAR_1_UNSAFE(c, charset, c1, c2)	\
  (XCHARSET_DIMENSION (charset) == 1			\
   ? ((c1) = ichar_field3 (c), (c2) = 0)		\
   : ((c1) = ichar_field2 (c),				\
      (c2) = ichar_field3 (c)))

826 | 541 DECLARE_INLINE_HEADER ( |
542 void | |
867 | 543 breakup_ichar_1 (Ichar c, Lisp_Object *charset, int *c1, int *c2) |
826 | 544 ) |
771 | 545 { |
867 | 546 text_checking_assert (valid_ichar_p (c)); |
547 *charset = ichar_charset (c); | |
548 BREAKUP_ICHAR_1_UNSAFE (c, *charset, *c1, *c2); | |
771 | 549 } |
550 | |
/* BREAKUP_ICHAR separates an Ichar into its components.  The charset of
   character C is set to CHARSET, and the position-codes of C are set to C1
   and C2.  C2 of TYPE9N character is 0.  CHARSET, C1 and C2 must be
   lvalues, since their addresses are taken. */

#define BREAKUP_ICHAR(c, charset, c1, c2) \
  breakup_ichar_1 (c, &(charset), &(c1), &(c2))

5200
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
558 /* Forward compatibility from ben-unicode-internal: Convert a charset |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
559 codepoint into a character in the internal string representation. |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
560 Return number of bytes written out. FAIL controls failure mode when |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
561 charset conversion to Unicode is not possible (unused as of yet). */ |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
562 DECLARE_INLINE_HEADER ( |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
563 Bytecount |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
564 charset_codepoint_to_itext (Lisp_Object charset, int c1, int c2, Ibyte *ptr, |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
565 enum converr UNUSED (fail)) |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
566 ) |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
567 { |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
568 Ichar ch; |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
569 |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
570 if (EQ (charset, Vcharset_ascii)) |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
571 { |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
572 ptr[0] = (Ibyte) c2; |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
573 return 1; |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
574 } |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
575 |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
576 ch = make_ichar (charset, c1, c2); |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
577 |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
578 /* We can't rely on the converted character being non-ASCII. For |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
579 example, JISX0208 codepoint (33, 64) == Unicode 0x5C (ASCII |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
580 backslash). */ |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
581 return set_itext_ichar (ptr, ch); |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
582 } |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
583 |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
584 /* Forward compatibility from ben-unicode-internal */ |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
585 |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
586 DECLARE_INLINE_HEADER ( |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
587 void |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
588 buffer_itext_to_charset_codepoint (const Ibyte *ptr, |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
589 struct buffer *UNUSED (buf), |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
590 Lisp_Object *charset, int *c1, int *c2, |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
591 enum converr UNUSED (fail)) |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
592 ) |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
593 { |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
594 Ichar ch = itext_ichar (ptr); |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
595 breakup_ichar_1 (ch, charset, c1, c2); |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
596 } |
70ed8a0d8da8
port Mule-ization of mule-wnnfns.c from ben-unicode-internal
Ben Wing <ben@xemacs.org>
parents:
5127
diff
changeset
|
597 |
788 | 598 void get_charset_limits (Lisp_Object charset, int *low, int *high); |
867 | 599 int ichar_to_unicode (Ichar chr); |
788 | 600 |
3439 | 601 EXFUN (Fcharset_name, 1); |
602 | |
771 | 603 #endif /* MULE */ |
604 | |
/* ISO 10646 UTF-16, UCS-4, UTF-8, UTF-7, etc. */

/* Selects the Unicode encoding; see the TYPE argument of
   encode_unicode_char(). */
enum unicode_type
{
  UNICODE_UTF_16,
  UNICODE_UTF_8,
  UNICODE_UTF_7,
  UNICODE_UCS_4,
  UNICODE_UTF_32
};

616 void encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h, | |
617 int USED_IF_MULE (l), unsigned_char_dynarr *dst, | |
4096 | 618 enum unicode_type type, unsigned int little_endian, |
619 int write_error_characters_as_such); | |
620 | |
#define UNICODE_ERROR_OCTET_RANGE_START 0x200000

/* UTF-16 surrogate tests on a 16-bit code unit CH:
   first surrogate  -- 0xD800..0xDBFF
   last surrogate   -- 0xDC00..0xDFFF
   either surrogate -- 0xD800..0xDFFF */
#define valid_utf_16_first_surrogate(ch) (((ch) & 0xFC00) == 0xD800)
#define valid_utf_16_last_surrogate(ch) (((ch) & 0xFC00) == 0xDC00)
#define valid_utf_16_surrogate(ch) (((ch) & 0xF800) == 0xD800)

3676 | 627 void set_charset_registries(Lisp_Object charset, Lisp_Object registries); |
628 | |
3439 | 629 EXFUN (Funicode_to_char, 2); |
630 EXFUN (Fchar_to_unicode, 1); | |
631 | |
771 | 632 #endif /* INCLUDED_charset_h_ */ |