Mercurial > hg > xemacs-beta
comparison src/mule-charset.h @ 428:3ecd8885ac67 r21-2-22
Import from CVS: tag r21-2-22
author | cvs |
---|---|
date | Mon, 13 Aug 2007 11:28:15 +0200 |
parents | |
children | 84b14dcb0985 |
comparison
equal
deleted
inserted
replaced
427:0a0253eac470 | 428:3ecd8885ac67 |
---|---|
1 /* Header for multilingual functions. | |
2 Copyright (C) 1992, 1995 Free Software Foundation, Inc. | |
3 Copyright (C) 1995 Sun Microsystems, Inc. | |
4 | |
5 This file is part of XEmacs. | |
6 | |
7 XEmacs is free software; you can redistribute it and/or modify it | |
8 under the terms of the GNU General Public License as published by the | |
9 Free Software Foundation; either version 2, or (at your option) any | |
10 later version. | |
11 | |
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT | |
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
15 for more details. | |
16 | |
17 You should have received a copy of the GNU General Public License | |
18 along with XEmacs; see the file COPYING. If not, write to | |
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
20 Boston, MA 02111-1307, USA. */ | |
21 | |
22 /* Synched up with: Mule 2.3. Not in FSF. */ | |
23 | |
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */ | |
25 | |
26 #ifndef _XEMACS_MULE_CHARSET_H | |
27 #define _XEMACS_MULE_CHARSET_H | |
28 | |
29 /* | |
30 1. Character Sets | |
31 ================= | |
32 | |
33 A character set (or "charset") is an ordered set of characters. | |
34 A particular character in a charset is indexed using one or | |
35 more "position codes", which are non-negative integers. | |
36 The number of position codes needed to identify a particular | |
37 character in a charset is called the "dimension" of the | |
38 charset. In XEmacs/Mule, all charsets have 1 or 2 dimensions, | |
39 and the size of all charsets (except for a few special cases) | |
40 is either 94, 96, 94 by 94, or 96 by 96. The range of | |
41 position codes used to index characters from any of these | |
42 types of character sets is as follows: | |
43 | |
44 Charset type Position code 1 Position code 2 | |
45 ------------------------------------------------------------ | |
46 94 33 - 126 N/A | |
47 96 32 - 127 N/A | |
48 94x94 33 - 126 33 - 126 | |
49 96x96 32 - 127 32 - 127 | |
50 | |
51 Note that in the above cases position codes do not start at | |
52 an expected value such as 0 or 1. The reason for this will | |
53 become clear later. | |
54 | |
55 For example, Latin-1 is a 96-character charset, and JISX0208 | |
56 (the Japanese national character set) is a 94x94-character | |
57 charset. | |
58 | |
59 [Note that, although the ranges above define the *valid* | |
60 position codes for a charset, some of the slots in a particular | |
61 charset may in fact be empty. This is the case for JISX0208, | |
62 for example, where (e.g.) all the slots whose first | |
63 position code is in the range 118 - 127 are empty.] | |
64 | |
65 There are three charsets that do not follow the above rules. | |
66 All of them have one dimension, and have ranges of position | |
67 codes as follows: | |
68 | |
69 Charset name Position code 1 | |
70 ------------------------------------ | |
71 ASCII 0 - 127 | |
72 Control-1 0 - 31 | |
73 Composite 0 - some large number | |
74 | |
75 (The upper bound of the position code for composite characters | |
76 has not yet been determined, but it will probably be at | |
77 least 16,383). | |
78 | |
79 ASCII is the union of two subsidiary character sets: | |
80 Printing-ASCII (the printing ASCII character set, | |
81 consisting of position codes 33 - 126, like for a standard | |
82 94-character charset) and Control-ASCII (the non-printing | |
83 characters that would appear in a binary file with codes 0 | |
84 - 32 and 127). | |
85 | |
86 Control-1 contains the non-printing characters that would | |
87 appear in a binary file with codes 128 - 159. | |
88 | |
89 Composite contains characters that are generated by | |
90 overstriking one or more characters from other charsets. | |
91 | |
92 Note that some characters in ASCII, and all characters | |
93 in Control-1, are "control" (non-printing) characters. | |
94 These have no printed representation but instead control | |
95 some other function of the printing (e.g. TAB or 9 moves | |
96 the current character position to the next tab stop). | |
97 All other characters in all charsets are "graphic" | |
98 (printing) characters. | |
99 | |
100 When a binary file is read in, the bytes in the file are | |
101 assigned to character sets as follows: | |
102 | |
103 Bytes Character set Range | |
104 -------------------------------------------------- | |
105 0 - 127 ASCII 0 - 127 | |
106 128 - 159 Control-1 0 - 31 | |
107 160 - 255 Latin-1 32 - 127 | |
108 | |
109 This is a bit ad-hoc but gets the job done. | |
110 | |
111 2. Encodings | |
112 ============ | |
113 | |
114 An "encoding" is a way of numerically representing | |
115 characters from one or more character sets. If an encoding | |
116 only encompasses one character set, then the position codes | |
117 for the characters in that character set could be used | |
118 directly. This is not possible, however, if more than one | |
119 character set is to be used in the encoding. | |
120 | |
121 For example, the conversion detailed above between bytes in | |
122 a binary file and characters is effectively an encoding | |
123 that encompasses the three character sets ASCII, Control-1, | |
124 and Latin-1 in a stream of 8-bit bytes. | |
125 | |
126 Thus, an encoding can be viewed as a way of encoding | |
127 characters from a specified group of character sets using a | |
128 stream of bytes, each of which contains a fixed number of | |
129 bits (but not necessarily 8, as in the common usage of | |
130 "byte"). | |
131 | |
132 Here are descriptions of a couple of common | |
133 encodings: | |
134 | |
135 | |
136 A. Japanese EUC (Extended Unix Code) | |
137 | |
138 This encompasses the character sets: | |
139 - Printing-ASCII, | |
140 - Katakana-JISX0201 (half-width katakana, the right half of JISX0201). | |
141 - Japanese-JISX0208 | |
142 - Japanese-JISX0212 | |
143 It uses 8-bit bytes. | |
144 | |
145 Note that Printing-ASCII and Katakana-JISX0201 are 94-character | |
146 charsets, while Japanese-JISX0208 is a 94x94-character charset. | |
147 | |
148 The encoding is as follows: | |
149 | |
150 Character set Representation (PC == position-code) | |
151 ------------- -------------- | |
152 Printing-ASCII PC1 | |
153 Japanese-JISX0208 PC1 + 0x80 | PC2 + 0x80 | |
154 Katakana-JISX0201 0x8E | PC1 + 0x80 | |
155 | |
156 | |
157 B. JIS7 | |
158 | |
159 This encompasses the character sets: | |
160 - Printing-ASCII | |
161 - Latin-JISX0201 (the left half of JISX0201; this character set is | |
162 very similar to Printing-ASCII and is a 94-character charset) | |
163 - Japanese-JISX0208 | |
164 - Katakana-JISX0201 | |
165 It uses 7-bit bytes. | |
166 | |
167 Unlike Japanese EUC, this is a "modal" encoding, which | |
168 means that there are multiple states that the encoding can | |
169 be in, which affect how the bytes are to be interpreted. | |
170 Special sequences of bytes (called "escape sequences") | |
171 are used to change states. | |
172 | |
173 The encoding is as follows: | |
174 | |
175 Character set Representation | |
176 ------------- -------------- | |
177 Printing-ASCII PC1 | |
178 Latin-JISX0201 PC1 | |
179 Katakana-JISX0201 PC1 | |
180 Japanese-JISX0208 PC1 | PC2 | |
181 | |
182 Escape sequence ASCII equivalent Meaning | |
183 --------------- ---------------- ------- | |
184 0x1B 0x28 0x42 ESC ( B invoke Printing-ASCII | |
185 0x1B 0x28 0x4A ESC ( J invoke Latin-JISX0201 | |
186 0x1B 0x28 0x49 ESC ( I invoke Katakana-JISX0201 | |
187 0x1B 0x24 0x42 ESC $ B invoke Japanese-JISX0208 | |
188 | |
189 Initially, Printing-ASCII is invoked. | |
190 | |
191 3. Internal Mule Encodings | |
192 ========================== | |
193 | |
194 In XEmacs/Mule, each character set is assigned a unique number, | |
195 called a "leading byte". This is used in the encodings of a | |
196 character. Leading bytes are in the range 0x80 - 0xFF | |
197 (ASCII is assigned 0x8E, but it is omitted in a buffer), although | |
198 some leading bytes are reserved. | |
199 | |
200 Charsets whose leading byte is in the range 0x80 - 0x9F are | |
201 called "official" and are used for built-in charsets. | |
202 Other charsets are called "private" and have leading bytes | |
203 in the range 0xA0 - 0xFF; these are user-defined charsets. | |
204 | |
205 More specifically: | |
206 | |
207 Character set Leading byte | |
208 ------------- ------------ | |
209 ASCII 0x8E (omitted in a buffer) | |
210 Composite 0x80 | |
211 Dimension-1 Official 0x81 - 0x8D | |
212 (0x8D is currently unused) | |
213 Control 0x8F | |
214 Dimension-2 Official 0x90 - 0x99 | |
215 (0x9A - 0x9D are free; | |
216 0x9E and 0x9F are reserved) | |
217 Dimension-1 Private 0xA0 - 0xEF | |
218 Dimension-2 Private 0xF0 - 0xFF | |
219 | |
220 There are two internal encodings for characters in XEmacs/Mule. | |
221 One is called "string encoding" and is an 8-bit encoding that | |
222 is used for representing characters in a buffer or string. | |
223 It uses 1 to 4 bytes per character. The other is called | |
224 "character encoding" and is a 19-bit encoding that is used | |
225 for representing characters individually in a variable. | |
226 | |
227 (In the following descriptions, we'll ignore composite | |
228 characters for the moment. We also give a general (structural) | |
229 overview first, followed later by the exact details.) | |
230 | |
231 A. Internal String Encoding | |
232 | |
233 ASCII characters are encoded using their position code directly. | |
234 Other characters are encoded using their leading byte followed | |
235 by their position code(s) with the high bit set. Characters | |
236 in private character sets have their leading byte prefixed with | |
237 a "leading byte prefix", which is either 0x9E or 0x9F. (No | |
238 character sets are ever assigned these leading bytes.) Specifically: | |
239 | |
240 Character set Encoding (PC == position-code) | |
241 ------------- -------- (LB == leading-byte) | |
242 ASCII PC1 | | |
243 Control-1 LB | PC1 + 0xA0 | |
244 Dimension-1 official LB | PC1 + 0x80 | |
245 Dimension-1 private 0x9E | LB | PC1 + 0x80 | |
246 Dimension-2 official LB | PC1 + 0x80 | PC2 + 0x80 | |
247 Dimension-2 private 0x9F | LB | PC1 + 0x80 | PC2 + 0x80 | |
248 | |
249 The basic characteristic of this encoding is that the first byte | |
250 of all characters is in the range 0x00 - 0x9F, and the second and | |
251 following bytes of all characters are in the range 0xA0 - 0xFF. | |
252 This means that it is impossible to get out of sync, or more | |
253 specifically: | |
254 | |
255 1. Given any byte position, the beginning of the character it is | |
256 within can be determined in constant time. | |
257 2. Given any byte position at the beginning of a character, the | |
258 beginning of the next character can be determined in constant | |
259 time. | |
260 3. Given any byte position at the beginning of a character, the | |
261 beginning of the previous character can be determined in constant | |
262 time. | |
263 4. Textual searches can simply treat encoded strings as if they | |
264 were encoded in a one-byte-per-character fashion rather than | |
265 the actual multi-byte encoding. | |
266 | |
267 None of the standard non-modal encodings meet all of these | |
268 conditions. For example, EUC satisfies only (2) and (3), while | |
269 Shift-JIS and Big5 (not yet described) satisfy only (2). (All | |
270 non-modal encodings must satisfy (2), in order to be unambiguous.) | |
271 | |
272 B. Internal Character Encoding | |
273 | |
274 One 19-bit word represents a single character. The word is | |
275 separated into three fields: | |
276 | |
277 Bit number: 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00 | |
278 <------------> <------------------> <------------------> | |
279 Field: 1 2 3 | |
280 | |
281 Note that fields 2 and 3 hold 7 bits each, while field 1 holds 5 bits. | |
282 | |
283 Character set Field 1 Field 2 Field 3 | |
284 ------------- ------- ------- ------- | |
285 ASCII 0 0 PC1 | |
286 range: (00 - 7F) | |
287 Control-1 0 1 PC1 | |
288 range: (00 - 1F) | |
289 Dimension-1 official 0 LB - 0x80 PC1 | |
290 range: (01 - 0D) (20 - 7F) | |
291 Dimension-1 private 0 LB - 0x80 PC1 | |
292 range: (20 - 6F) (20 - 7F) | |
293 Dimension-2 official LB - 0x8F PC1 PC2 | |
294 range: (01 - 0A) (20 - 7F) (20 - 7F) | |
295 Dimension-2 private LB - 0xE1 PC1 PC2 | |
296 range: (0F - 1E) (20 - 7F) (20 - 7F) | |
297 Composite 0x1F ? ? | |
298 | |
299 Note that character codes 0 - 255 are the same as the "binary encoding" | |
300 described above. | |
301 */ | |
302 | |
303 /* | |
304 About Unicode support: | |
305 | |
306 Adding Unicode support is very desirable. Unicode will likely be a | |
307 very common representation in the future, and thus we should | |
308 represent Unicode characters using three bytes instead of four. | |
309 This means we need to find leading bytes for Unicode. Given that | |
310 there are 65,536 characters in Unicode and we can attach 96x96 = | |
311 9,216 characters per leading byte, we need eight leading bytes for | |
312 Unicode. We currently have four free (0x9A - 0x9D), and with a | |
313 little bit of rearranging we can get five: ASCII doesn't really | |
314 need to take up a leading byte. (We could just as well use 0x7F, | |
315 with a little change to the functions that assume that 0x80 is the | |
316 lowest leading byte.) This means we still need to dump three | |
317 leading bytes and move them into private space. The CNS charsets | |
318 are good candidates since they are rarely used, and | |
319 JAPANESE_JISX0208_1978 is becoming less and less used and could | |
320 also be dumped. */ | |
321 | |
322 | |
323 /************************************************************************/ | |
324 /* Definition of leading bytes */ | |
325 /************************************************************************/ | |
326 | |
327 #define MIN_LEADING_BYTE 0x80 /* lowest value used as a leading byte */ | |
328 /* These leading bytes need special treatment in a string and/or character. */ | |
329 #define LEADING_BYTE_ASCII 0x8E /* Omitted in a buffer: ASCII is stored without a leading byte */ | |
330 #ifdef ENABLE_COMPOSITE_CHARS /* NOTE(review): this guard encloses nothing; */ | |
331 #endif /* LEADING_BYTE_COMPOSITE below is defined unconditionally. */ | |
332 #define LEADING_BYTE_COMPOSITE 0x80 /* for a composite character */ | |
333 #define LEADING_BYTE_CONTROL_1 0x8F /* Control-1: represents the normal codes 0x80 - 0x9F */ | |
334 | |
335 /* Note: a gap (an unassigned value) inside either official range can | |
336 cause a core dump, because non_ascii_valid_char_p uses only the first | |
337 and last values of a range to decide whether a charset is defined. */ | |
338 | |
339 /** The following are the leading bytes of the official dimension-1 charsets (1-byte characters). **/ | |
340 enum LEADING_BYTE_OFFICIAL_1 | |
341 { | |
342 LEADING_BYTE_LATIN_ISO8859_1 = 0x81, /* Right half of ISO 8859-1 */ | |
343 LEADING_BYTE_LATIN_ISO8859_2, /* 0x82 Right half of ISO 8859-2 */ | |
344 LEADING_BYTE_LATIN_ISO8859_3, /* 0x83 Right half of ISO 8859-3 */ | |
345 LEADING_BYTE_LATIN_ISO8859_4, /* 0x84 Right half of ISO 8859-4 */ | |
346 LEADING_BYTE_THAI_TIS620, /* 0x85 TIS620-2533 */ | |
347 LEADING_BYTE_GREEK_ISO8859_7, /* 0x86 Right half of ISO 8859-7 */ | |
348 LEADING_BYTE_ARABIC_ISO8859_6, /* 0x87 Right half of ISO 8859-6 */ | |
349 LEADING_BYTE_HEBREW_ISO8859_8, /* 0x88 Right half of ISO 8859-8 */ | |
350 LEADING_BYTE_KATAKANA_JISX0201, /* 0x89 Right half of JIS X0201-1976 */ | |
351 LEADING_BYTE_LATIN_JISX0201, /* 0x8A Left half of JIS X0201-1976 */ | |
352 LEADING_BYTE_CYRILLIC_ISO8859_5,/* 0x8B Right half of ISO 8859-5 */ | |
353 LEADING_BYTE_LATIN_ISO8859_9 /* 0x8C Right half of ISO 8859-9 */ | |
354 /* 0x8D unused */ | |
355 }; | |
356 /* First and last assigned leading bytes of the enumeration above. */ | |
357 #define MIN_LEADING_BYTE_OFFICIAL_1 LEADING_BYTE_LATIN_ISO8859_1 | |
358 #define MAX_LEADING_BYTE_OFFICIAL_1 LEADING_BYTE_LATIN_ISO8859_9 | |
359 | |
360 /** The following are the leading bytes of the official dimension-2 charsets (2-byte characters). **/ | |
361 enum LEADING_BYTE_OFFICIAL_2 | |
362 { | |
363 LEADING_BYTE_JAPANESE_JISX0208_1978 = 0x90, /* Japanese JIS X0208-1978 */ | |
364 LEADING_BYTE_CHINESE_GB2312, /* 0x91 Chinese Hanzi GB2312-1980 */ | |
365 LEADING_BYTE_JAPANESE_JISX0208, /* 0x92 Japanese JIS X0208-1983 */ | |
366 LEADING_BYTE_KOREAN_KSC5601, /* 0x93 Hangul KS C5601-1987 */ | |
367 LEADING_BYTE_JAPANESE_JISX0212, /* 0x94 Japanese JIS X0212-1990 */ | |
368 LEADING_BYTE_CHINESE_CNS11643_1, /* 0x95 Chinese CNS11643 Set 1 */ | |
369 LEADING_BYTE_CHINESE_CNS11643_2, /* 0x96 Chinese CNS11643 Set 2 */ | |
370 LEADING_BYTE_CHINESE_BIG5_1, /* 0x97 Big5 Level 1 */ | |
371 LEADING_BYTE_CHINESE_BIG5_2 /* 0x98 Big5 Level 2 */ | |
372 /* 0x99 unused */ | |
373 /* 0x9A unused */ | |
374 /* 0x9B unused */ | |
375 /* 0x9C unused */ | |
376 }; | |
377 /* First and last assigned leading bytes of the enumeration above. */ | |
378 #define MIN_LEADING_BYTE_OFFICIAL_2 LEADING_BYTE_JAPANESE_JISX0208_1978 | |
379 #define MAX_LEADING_BYTE_OFFICIAL_2 LEADING_BYTE_CHINESE_BIG5_2 | |
380 | |
381 /** The following are for 1- and 2-byte characters in a private charset. **/ | |
382 /* Prefix bytes stored in a string in front of a private leading byte. */ | |
383 #define PRE_LEADING_BYTE_PRIVATE_1 0x9E /* prefix for a 1-byte (dimension-1) char-set */ | |
384 #define PRE_LEADING_BYTE_PRIVATE_2 0x9F /* prefix for a 2-byte (dimension-2) char-set */ | |
385 /* Ranges of the private leading bytes themselves. */ | |
386 #define MIN_LEADING_BYTE_PRIVATE_1 0xA0 | |
387 #define MAX_LEADING_BYTE_PRIVATE_1 0xEF | |
388 #define MIN_LEADING_BYTE_PRIVATE_2 0xF0 | |
389 #define MAX_LEADING_BYTE_PRIVATE_2 0xFF | |
390 /* NOTE(review): presumably the count of values 0x80 - 0xFF -- confirm. */ | |
391 #define NUM_LEADING_BYTES 128 | |
392 | |
393 | |
394 /************************************************************************/ | |
395 /* Operations on leading bytes */ | |
396 /************************************************************************/ | |
397 | |
398 /* Is this leading byte for a private charset? True for any value at or | |
399 above MIN_LEADING_BYTE_PRIVATE_1, so both private ranges are covered. */ | |
400 #define LEADING_BYTE_PRIVATE_P(lb) ((lb) >= MIN_LEADING_BYTE_PRIVATE_1) | |
401 | |
402 /* Is LB one of the two prefix bytes that precede a private leading byte? | |
403 Returns 1 if so, 0 otherwise. */ | |
404 INLINE int LEADING_BYTE_PREFIX_P (unsigned char lb); | |
405 INLINE int | |
406 LEADING_BYTE_PREFIX_P (unsigned char lb) | |
407 { | |
408 if (lb == PRE_LEADING_BYTE_PRIVATE_1) return 1; | |
409 return lb == PRE_LEADING_BYTE_PRIVATE_2; | |
410 } | |
411 | |
412 /* Given a private leading byte, return the leading byte prefix stored | |
413 in a string: PRE_LEADING_BYTE_PRIVATE_1 when LB is below | |
414 MIN_LEADING_BYTE_PRIVATE_2, else PRE_LEADING_BYTE_PRIVATE_2. */ | |
415 #define PRIVATE_LEADING_BYTE_PREFIX(lb) \ | |
416 ((lb) < MIN_LEADING_BYTE_PRIVATE_2 ? \ | |
417 PRE_LEADING_BYTE_PRIVATE_1 : \ | |
418 PRE_LEADING_BYTE_PRIVATE_2) | |
419 | |
420 | |
421 /************************************************************************/ | |
422 /* Operations on individual bytes */ | |
423 /* of any format */ | |
424 /************************************************************************/ | |
425 | |
426 /* Argument `c' should be (unsigned int) or (unsigned char). */ | |
427 /* Note that BYTE_C0_P does not include SP (0x20) or DEL (0x7F). */ | |
428 | |
429 #define BYTE_ASCII_P(c) ((c) < 0x80) | |
430 #define BYTE_C0_P(c) ((c) < 0x20) | |
431 /* True for 0x80 - 0x9F. The forced casts make the subtraction unsigned, so a value below 0x80 wraps to a large number and fails the test. */ | |
432 #define BYTE_C1_P(c) ((unsigned int) ((unsigned int) (c) - 0x80) < 0x20) | |
433 | |
434 | |
435 /************************************************************************/ | |
436 /* Operations on individual bytes */ | |
437 /* in a Mule-formatted string */ | |
438 /************************************************************************/ | |
439 | |
440 /* Does this byte represent the first byte of a character? (In the | |
441 string encoding, bytes from 0xA0 up are never first bytes.) */ | |
442 #define BUFBYTE_FIRST_BYTE_P(c) ((c) < 0xA0) | |
443 | |
444 /* Does this byte represent the first byte of a multi-byte character, | |
445 i.e. does it lie in the range 0x80 - 0x9F? */ | |
446 #define BUFBYTE_LEADING_BYTE_P(c) BYTE_C1_P (c) | |
447 | |
448 | |
449 /************************************************************************/ | |
450 /* Information about a particular character set */ | |
451 /************************************************************************/ | |
452 | |
453 struct Lisp_Charset /* Information recorded about one character set */ | |
454 { | |
455 struct lcrecord_header header; | |
456 /* Charset id; CHARSET_LEADING_BYTE returns this value cast to Bufbyte. */ | |
457 int id; | |
458 Lisp_Object name; | |
459 Lisp_Object doc_string, registry, short_name, long_name; | |
460 | |
461 Lisp_Object reverse_direction_charset; | |
462 | |
463 Lisp_Object ccl_program; | |
464 | |
465 /* Final byte of this character set in ISO2022 designating escape sequence */ | |
466 Bufbyte final; | |
467 | |
468 /* Number of bytes (1 - 4) required in the internal representation | |
469 for characters in this character set. (This is *not* the | |
470 same as the dimension of the character set.) */ | |
471 unsigned int rep_bytes; | |
472 | |
473 /* Number of columns a character in this charset takes up, on TTY | |
474 devices. Not used for X devices. */ | |
475 unsigned int columns; | |
476 | |
477 /* Direction of this character set (presumably CHARSET_LEFT_TO_RIGHT or CHARSET_RIGHT_TO_LEFT) */ | |
478 unsigned int direction; | |
479 | |
480 /* Type of this character set (94, 96, 94x94, 96x96); presumably a CHARSET_TYPE_* value */ | |
481 unsigned int type; | |
482 | |
483 /* Dimension: number of position codes that identify a character (1 or 2) */ | |
484 unsigned int dimension; | |
485 | |
486 /* Number of chars in each dimension (usually 94 or 96) */ | |
487 unsigned int chars; | |
488 | |
489 /* Which half of font to be used to display this character set */ | |
490 unsigned int graphic; | |
491 }; | |
492 /* Lisp record declaration and the usual type macros for charset objects. */ | |
493 DECLARE_LRECORD (charset, struct Lisp_Charset); | |
494 #define XCHARSET(x) XRECORD (x, charset, struct Lisp_Charset) | |
495 #define XSETCHARSET(x, p) XSETRECORD (x, p, charset) | |
496 #define CHARSETP(x) RECORDP (x, charset) | |
497 #define CHECK_CHARSET(x) CHECK_RECORD (x, charset) | |
498 #define CONCHECK_CHARSET(x) CONCHECK_RECORD (x, charset) | |
499 /* Charset types. NOTE(review): presumably the values stored in the `type' field -- confirm. */ | |
500 #define CHARSET_TYPE_94 0 /* This charset includes 94 characters. */ | |
501 #define CHARSET_TYPE_96 1 /* This charset includes 96 characters. */ | |
502 #define CHARSET_TYPE_94X94 2 /* This charset includes 94x94 characters. */ | |
503 #define CHARSET_TYPE_96X96 3 /* This charset includes 96x96 characters. */ | |
504 /* Charset directions. NOTE(review): presumably the values stored in the `direction' field -- confirm. */ | |
505 #define CHARSET_LEFT_TO_RIGHT 0 | |
506 #define CHARSET_RIGHT_TO_LEFT 1 | |
507 | |
508 /* Field accessors; CS is a pointer to a struct Lisp_Charset. Leading byte and id have been regrouped: the leading byte is the `id' field cast to Bufbyte. -- OG */ | |
509 #define CHARSET_ID(cs) ((cs)->id) | |
510 #define CHARSET_LEADING_BYTE(cs) ((Bufbyte) CHARSET_ID(cs)) | |
511 #define CHARSET_NAME(cs) ((cs)->name) | |
512 #define CHARSET_SHORT_NAME(cs) ((cs)->short_name) | |
513 #define CHARSET_LONG_NAME(cs) ((cs)->long_name) | |
514 #define CHARSET_REP_BYTES(cs) ((cs)->rep_bytes) | |
515 #define CHARSET_COLUMNS(cs) ((cs)->columns) | |
516 #define CHARSET_GRAPHIC(cs) ((cs)->graphic) | |
517 #define CHARSET_TYPE(cs) ((cs)->type) | |
518 #define CHARSET_DIRECTION(cs) ((cs)->direction) | |
519 #define CHARSET_FINAL(cs) ((cs)->final) | |
520 #define CHARSET_DOC_STRING(cs) ((cs)->doc_string) | |
521 #define CHARSET_REGISTRY(cs) ((cs)->registry) | |
522 #define CHARSET_CCL_PROGRAM(cs) ((cs)->ccl_program) | |
523 #define CHARSET_DIMENSION(cs) ((cs)->dimension) | |
524 #define CHARSET_CHARS(cs) ((cs)->chars) | |
525 #define CHARSET_REVERSE_DIRECTION_CHARSET(cs) ((cs)->reverse_direction_charset) | |
526 | |
527 /* Is CS a private charset? Decided from its leading byte alone. */ | |
528 #define CHARSET_PRIVATE_P(cs) LEADING_BYTE_PRIVATE_P (CHARSET_LEADING_BYTE (cs)) | |
529 /* The same accessors for a Lisp_Object: each applies XCHARSET to CS first. */ | |
530 #define XCHARSET_ID(cs) CHARSET_ID (XCHARSET (cs)) | |
531 #define XCHARSET_NAME(cs) CHARSET_NAME (XCHARSET (cs)) | |
532 #define XCHARSET_SHORT_NAME(cs) CHARSET_SHORT_NAME (XCHARSET (cs)) | |
533 #define XCHARSET_LONG_NAME(cs) CHARSET_LONG_NAME (XCHARSET (cs)) | |
534 #define XCHARSET_REP_BYTES(cs) CHARSET_REP_BYTES (XCHARSET (cs)) | |
535 #define XCHARSET_COLUMNS(cs) CHARSET_COLUMNS (XCHARSET (cs)) | |
536 #define XCHARSET_GRAPHIC(cs) CHARSET_GRAPHIC (XCHARSET (cs)) | |
537 #define XCHARSET_TYPE(cs) CHARSET_TYPE (XCHARSET (cs)) | |
538 #define XCHARSET_DIRECTION(cs) CHARSET_DIRECTION (XCHARSET (cs)) | |
539 #define XCHARSET_FINAL(cs) CHARSET_FINAL (XCHARSET (cs)) | |
540 #define XCHARSET_DOC_STRING(cs) CHARSET_DOC_STRING (XCHARSET (cs)) | |
541 #define XCHARSET_REGISTRY(cs) CHARSET_REGISTRY (XCHARSET (cs)) | |
542 #define XCHARSET_LEADING_BYTE(cs) CHARSET_LEADING_BYTE (XCHARSET (cs)) | |
543 #define XCHARSET_CCL_PROGRAM(cs) CHARSET_CCL_PROGRAM (XCHARSET (cs)) | |
544 #define XCHARSET_DIMENSION(cs) CHARSET_DIMENSION (XCHARSET (cs)) | |
545 #define XCHARSET_CHARS(cs) CHARSET_CHARS (XCHARSET (cs)) | |
546 #define XCHARSET_PRIVATE_P(cs) CHARSET_PRIVATE_P (XCHARSET (cs)) | |
547 #define XCHARSET_REVERSE_DIRECTION_CHARSET(cs) \ | |
548 CHARSET_REVERSE_DIRECTION_CHARSET (XCHARSET (cs)) | |
549 | |
550 struct charset_lookup { | |
551 /* Table of charsets indexed by leading byte (minus 128; see CHARSET_BY_LEADING_BYTE). */ | |
552 Lisp_Object charset_by_leading_byte[128]; | |
553 | |
554 /* Table of charsets indexed by type/final-byte/direction (see CHARSET_BY_ATTRIBUTES). */ | |
555 Lisp_Object charset_by_attributes[4][128][2]; | |
556 }; | |
557 /* The lookup tables consulted by the CHARSET_BY_* macros; defined elsewhere. */ | |
558 extern struct charset_lookup *chlook; | |
559 | |
560 /* Table giving the number of bytes in the string representation of a | |
561 character, indexed by the first byte of that representation. | |
562 | |
563 This value can be derived other ways -- e.g. something like | |
564 | |
565 (BYTE_ASCII_P (first_byte) ? 1 : | |
566 XCHARSET_REP_BYTES (CHARSET_BY_LEADING_BYTE (first_byte))) | |
567 | |
568 but it's faster this way. The table has 0xA0 entries. */ | |
569 extern Bytecount rep_bytes_by_first_byte[0xA0]; | |
570 | |
571 #ifdef ERROR_CHECK_TYPECHECK | |
572 /* Return the charset for leading byte LB. The parameter is int, not | |
573 Bufbyte, even though that is the actual type of a leading byte; this | |
574 way, out-of-range values get caught by the assert rather than truncated. */ | |
575 INLINE Lisp_Object CHARSET_BY_LEADING_BYTE (int lb); | |
576 INLINE Lisp_Object | |
577 CHARSET_BY_LEADING_BYTE (int lb) | |
578 { | |
579 assert (lb >= 0x80 && lb <= 0xFF); | |
580 return chlook->charset_by_leading_byte[lb - 128]; | |
581 } | |
582 | |
583 #else | |
584 /* Unchecked variant: same table lookup, no range assertion. */ | |
585 #define CHARSET_BY_LEADING_BYTE(lb) (chlook->charset_by_leading_byte[(lb) - 128]) | |
586 | |
587 #endif | |
588 /* Look up a charset in chlook by type, final byte, and direction. The arguments are used as array indices without any range check. */ | |
589 #define CHARSET_BY_ATTRIBUTES(type, final, dir) \ | |
590 (chlook->charset_by_attributes[type][final][dir]) | |
591 | |
592 #ifdef ERROR_CHECK_TYPECHECK | |
593 | |
594 /* Number of bytes in the string representation of a character whose first byte is FB; asserts that FB is below 0xA0. */ | |
595 INLINE int REP_BYTES_BY_FIRST_BYTE (int fb); | |
596 INLINE int | |
597 REP_BYTES_BY_FIRST_BYTE (int fb) | |
598 { | |
599 assert (fb >= 0 && fb < 0xA0); | |
600 return rep_bytes_by_first_byte[fb]; | |
601 } | |
602 | |
603 #else /* unchecked variant: plain table lookup */ | |
604 #define REP_BYTES_BY_FIRST_BYTE(fb) (rep_bytes_by_first_byte[fb]) | |
605 #endif | |
606 | |
607 | |
608 /************************************************************************/ | |
609 /* Dealing with characters */ | |
610 /************************************************************************/ | |
611 | |
612 /* Is this character represented by more than one byte in a string? True for every character code from 0x80 up. */ | |
613 | |
614 #define CHAR_MULTIBYTE_P(c) ((c) >= 0x80) | |
615 /* Is this an ASCII character? Simply the negation of CHAR_MULTIBYTE_P. */ | |
616 #define CHAR_ASCII_P(c) (!CHAR_MULTIBYTE_P (c)) | |
617 | |
618 /* The bit fields of a character are divided into 3 parts: | |
619 FIELD1(5bits):FIELD2(7bits):FIELD3(7bits), FIELD3 being the lowest bits. */ | |
620 | |
621 #define CHAR_FIELD1_MASK (0x1F << 14) | |
622 #define CHAR_FIELD2_MASK (0x7F << 7) | |
623 #define CHAR_FIELD3_MASK 0x7F | |
624 | |
625 /* Macros to access each field of a character code C; each result is shifted down to start at bit 0. */ | |
626 | |
627 #define CHAR_FIELD1(c) (((c) & CHAR_FIELD1_MASK) >> 14) | |
628 #define CHAR_FIELD2(c) (((c) & CHAR_FIELD2_MASK) >> 7) | |
629 #define CHAR_FIELD3(c) ((c) & CHAR_FIELD3_MASK) | |
630 | |
631 /* Field 1, if non-zero, usually holds a leading byte for a | |
632 dimension-2 charset. Field 2, if non-zero, usually holds a leading | |
633 byte for a dimension-1 charset. (Each is stored less an offset.) */ | |
634 | |
635 /* Converting between field values and leading bytes: the offset added to a field value to get the leading byte. */ | |
636 | |
637 #define FIELD2_TO_OFFICIAL_LEADING_BYTE 0x80 | |
638 #define FIELD2_TO_PRIVATE_LEADING_BYTE 0x80 | |
639 | |
640 #define FIELD1_TO_OFFICIAL_LEADING_BYTE 0x8F | |
641 #define FIELD1_TO_PRIVATE_LEADING_BYTE 0xE1 | |
642 | |
643 /* Minimum and maximum allowed values for the fields: each is a leading-byte bound less the matching FIELDn_TO_*_LEADING_BYTE offset. */ | |
644 | |
645 #define MIN_CHAR_FIELD2_OFFICIAL \ | |
646 (MIN_LEADING_BYTE_OFFICIAL_1 - FIELD2_TO_OFFICIAL_LEADING_BYTE) | |
647 #define MAX_CHAR_FIELD2_OFFICIAL \ | |
648 (MAX_LEADING_BYTE_OFFICIAL_1 - FIELD2_TO_OFFICIAL_LEADING_BYTE) | |
649 | |
650 #define MIN_CHAR_FIELD1_OFFICIAL \ | |
651 (MIN_LEADING_BYTE_OFFICIAL_2 - FIELD1_TO_OFFICIAL_LEADING_BYTE) | |
652 #define MAX_CHAR_FIELD1_OFFICIAL \ | |
653 (MAX_LEADING_BYTE_OFFICIAL_2 - FIELD1_TO_OFFICIAL_LEADING_BYTE) | |
654 | |
655 #define MIN_CHAR_FIELD2_PRIVATE \ | |
656 (MIN_LEADING_BYTE_PRIVATE_1 - FIELD2_TO_PRIVATE_LEADING_BYTE) | |
657 #define MAX_CHAR_FIELD2_PRIVATE \ | |
658 (MAX_LEADING_BYTE_PRIVATE_1 - FIELD2_TO_PRIVATE_LEADING_BYTE) | |
659 | |
660 #define MIN_CHAR_FIELD1_PRIVATE \ | |
661 (MIN_LEADING_BYTE_PRIVATE_2 - FIELD1_TO_PRIVATE_LEADING_BYTE) | |
662 #define MAX_CHAR_FIELD1_PRIVATE \ | |
663 (MAX_LEADING_BYTE_PRIVATE_2 - FIELD1_TO_PRIVATE_LEADING_BYTE) | |
664 | |
665 /* Minimum character code of each <type> character: the minimum field value shifted into the position of field 2 (<< 7) or field 1 (<< 14). */ | |
666 | |
667 #define MIN_CHAR_OFFICIAL_TYPE9N (MIN_CHAR_FIELD2_OFFICIAL << 7) | |
668 #define MIN_CHAR_PRIVATE_TYPE9N (MIN_CHAR_FIELD2_PRIVATE << 7) | |
669 #define MIN_CHAR_OFFICIAL_TYPE9NX9N (MIN_CHAR_FIELD1_OFFICIAL << 14) | |
670 #define MIN_CHAR_PRIVATE_TYPE9NX9N (MIN_CHAR_FIELD1_PRIVATE << 14) | |
671 #define MIN_CHAR_COMPOSITION (0x1F << 14) | |
672 | |
673 /* Return the leading byte of character C. The tests run in ascending | |
674 order of character code, so each one only needs an upper bound. | |
675 | |
676 NOTE: The dimension-1 case relies on | |
677 FIELD2_TO_OFFICIAL_LEADING_BYTE and FIELD2_TO_PRIVATE_LEADING_BYTE | |
678 being the same, so official and private need no separate test. */ | |
679 | |
680 INLINE Bufbyte CHAR_LEADING_BYTE (Emchar c); | |
681 INLINE Bufbyte | |
682 CHAR_LEADING_BYTE (Emchar c) | |
683 { | |
684 if (CHAR_ASCII_P (c)) /* below 0x80 */ | |
685 return LEADING_BYTE_ASCII; | |
686 if (c < 0xA0) /* 0x80 - 0x9F */ | |
687 return LEADING_BYTE_CONTROL_1; | |
688 if (c < MIN_CHAR_OFFICIAL_TYPE9NX9N) /* dimension 1 */ | |
689 return FIELD2_TO_OFFICIAL_LEADING_BYTE + CHAR_FIELD2 (c); | |
690 if (c < MIN_CHAR_PRIVATE_TYPE9NX9N) /* dimension 2, official */ | |
691 return FIELD1_TO_OFFICIAL_LEADING_BYTE + CHAR_FIELD1 (c); | |
692 if (c < MIN_CHAR_COMPOSITION) /* dimension 2, private */ | |
693 return FIELD1_TO_PRIVATE_LEADING_BYTE + CHAR_FIELD1 (c); | |
694 | |
695 /* Whatever is left is at or above MIN_CHAR_COMPOSITION. */ | |
696 #ifdef ENABLE_COMPOSITE_CHARS | |
697 return LEADING_BYTE_COMPOSITE; | |
698 #else | |
699 abort(); | |
700 return 0; | |
701 #endif /* ENABLE_COMPOSITE_CHARS */ | |
702 | |
703 } | |
704 /* Charset of character C, found through its leading byte. */ | |
705 #define CHAR_CHARSET(c) CHARSET_BY_LEADING_BYTE (CHAR_LEADING_BYTE (c)) | |
706 | |
707 /* Build the character of CHARSET whose position-codes are C1 and C2. | |
708 C2 is ignored for ASCII, Control-1 and every TYPE9N (dimension-1) charset. | |
709 | |
710 NOTE: The dimension-1 case relies on | |
711 FIELD2_TO_OFFICIAL_LEADING_BYTE and FIELD2_TO_PRIVATE_LEADING_BYTE | |
712 being the same, so official and private share one branch. | |
713 */ | |
714 | |
715 INLINE Emchar MAKE_CHAR (Lisp_Object charset, int c1, int c2); | |
716 INLINE Emchar | |
717 MAKE_CHAR (Lisp_Object charset, int c1, int c2) | |
718 { | |
719 if (EQ (charset, Vcharset_ascii)) | |
720 return c1; | |
721 if (EQ (charset, Vcharset_control_1)) | |
722 return 0x80 | c1; | |
723 #ifdef ENABLE_COMPOSITE_CHARS | |
724 if (EQ (charset, Vcharset_composite)) | |
725 return (0x1F << 14) | (c1 << 7) | c2; | |
726 #endif | |
727 if (XCHARSET_DIMENSION (charset) == 1) | |
728 return ((XCHARSET_LEADING_BYTE (charset) - | |
729 FIELD2_TO_OFFICIAL_LEADING_BYTE) << 7) | c1; | |
730 if (XCHARSET_PRIVATE_P (charset)) | |
731 return ((XCHARSET_LEADING_BYTE (charset) - | |
732 FIELD1_TO_PRIVATE_LEADING_BYTE) << 14) | (c1 << 7) | c2; | |
733 /* Only the official dimension-2 charsets remain. */ | |
734 return ((XCHARSET_LEADING_BYTE (charset) - | |
735 FIELD1_TO_OFFICIAL_LEADING_BYTE) << 14) | (c1 << 7) | c2; | |
736 } | |
737 | |
738 /* Break character C up: CHARSET is set to the charset of C, and C1 | |
739 and C2 are set to its position-codes. C2 of a TYPE9N (dimension-1) | |
740 character is set to 0. */ | |
741 | |
742 /* BREAKUP_CHAR_1_UNSAFE assumes that the charset has already been | |
743 calculated, and just computes c1 and c2. | |
744 | |
745 BREAKUP_CHAR also computes and stores the charset. */ | |
746 /* The whole expansion is parenthesized so that it stays one expression wherever the macro is used. */ | |
747 #define BREAKUP_CHAR_1_UNSAFE(c, charset, c1, c2) \ | |
748 (XCHARSET_DIMENSION (charset) == 1 \ | |
749 ? ((c1) = CHAR_FIELD3 (c), (c2) = 0) \ | |
750 : ((c1) = CHAR_FIELD2 (c), \ | |
751 (c2) = CHAR_FIELD3 (c))) | |
752 /* Store the charset of C in *CHARSET, then its position-codes in *C1 and *C2. The charset must be stored first: the macro below reads it. */ | |
753 INLINE void breakup_char_1 (Emchar c, Lisp_Object *charset, int *c1, int *c2); | |
754 INLINE void | |
755 breakup_char_1 (Emchar c, Lisp_Object *charset, int *c1, int *c2) | |
756 { | |
757 *charset = CHAR_CHARSET (c); | |
758 BREAKUP_CHAR_1_UNSAFE (c, *charset, *c1, *c2); | |
759 } | |
760 /* Macro front end to breakup_char_1: CHARSET, C1 and C2 must be lvalues, since their addresses are taken. */ | |
761 #define BREAKUP_CHAR(c, charset, c1, c2) \ | |
762 breakup_char_1 (c, &(charset), &(c1), &(c2)) | |
763 | |
764 | |
765 | |
766 #ifdef ENABLE_COMPOSITE_CHARS | |
767 /************************************************************************/ | |
768 /* Composite characters */ | |
769 /************************************************************************/ | |
770 /* Declared only when composite characters are enabled. NOTE(review): judging by the names, these convert between a composite character and its string -- confirm. */ | |
771 Emchar lookup_composite_char (Bufbyte *str, int len); | |
772 Lisp_Object composite_char_string (Emchar ch); | |
773 #endif /* ENABLE_COMPOSITE_CHARS */ | |
774 | |
775 | |
776 /************************************************************************/ | |
777 /* Exported functions */ | |
778 /************************************************************************/ | |
779 /* Lisp primitives declared with EXFUN; the second argument is 1 for both. */ | |
780 EXFUN (Ffind_charset, 1); | |
781 EXFUN (Fget_charset, 1); | |
782 /* Charset objects defined elsewhere. */ | |
783 extern Lisp_Object Vcharset_chinese_big5_1; | |
784 extern Lisp_Object Vcharset_chinese_big5_2; | |
785 extern Lisp_Object Vcharset_japanese_jisx0208; | |
786 /* Emchar-level operations on an Lstream; implemented elsewhere. */ | |
787 Emchar Lstream_get_emchar_1 (Lstream *stream, int first_char); | |
788 int Lstream_fput_emchar (Lstream *stream, Emchar ch); | |
789 void Lstream_funget_emchar (Lstream *stream, Emchar ch); | |
790 /* NOTE(review): judging by the names, these copy text between the internal and an external representation -- confirm. */ | |
791 int copy_internal_to_external (CONST Bufbyte *internal, Bytecount len, | |
792 unsigned char *external); | |
793 Bytecount copy_external_to_internal (CONST unsigned char *external, | |
794 int len, Bufbyte *internal); | |
795 | |
796 #endif /* _XEMACS_MULE_CHARSET_H */ |