Mercurial > hg > xemacs-beta
comparison src/mule-charset.h @ 70:131b0175ea99 r20-0b30
Import from CVS: tag r20-0b30
author | cvs |
---|---|
date | Mon, 13 Aug 2007 09:02:59 +0200 |
parents | |
children | 54cc21c15cbb |
comparison
equal
deleted
inserted
replaced
69:804d1389bcd6 | 70:131b0175ea99 |
---|---|
1 /* Header for multilingual functions. | |
2 Copyright (C) 1992, 1995 Free Software Foundation, Inc. | |
3 Copyright (C) 1995 Sun Microsystems, Inc. | |
4 | |
5 This file is part of XEmacs. | |
6 | |
7 XEmacs is free software; you can redistribute it and/or modify it | |
8 under the terms of the GNU General Public License as published by the | |
9 Free Software Foundation; either version 2, or (at your option) any | |
10 later version. | |
11 | |
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT | |
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
15 for more details. | |
16 | |
17 You should have received a copy of the GNU General Public License | |
18 along with XEmacs; see the file COPYING. If not, write to | |
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
20 Boston, MA 02111-1307, USA. */ | |
21 | |
22 /* Synched up with: Mule 2.3. Not in FSF. */ | |
23 | |
24 /* Rewritten by Ben Wing <wing@666.com>. */ | |
25 | |
26 #ifndef _XEMACS_MULE_CHARSET_H | |
27 #define _XEMACS_MULE_CHARSET_H | |
28 | |
29 /* | |
30 1. Character Sets | |
31 ================= | |
32 | |
33 A character set (or "charset") is an ordered set of characters. | |
34 A particular character in a charset is indexed using one or | |
35 more "position codes", which are non-negative integers. | |
36 The number of position codes needed to identify a particular | |
37 character in a charset is called the "dimension" of the | |
38 charset. In XEmacs/Mule, all charsets have 1 or 2 dimensions, | |
39 and the size of all charsets (except for a few special cases) | |
40 is either 94, 96, 94 by 94, or 96 by 96. The range of | |
41 position codes used to index characters from any of these | |
42 types of character sets is as follows: | |
43 | |
44 Charset type Position code 1 Position code 2 | |
45 ------------------------------------------------------------ | |
46 94 33 - 126 N/A | |
47 96 32 - 127 N/A | |
48 94x94 33 - 126 33 - 126 | |
49 96x96 32 - 127 32 - 127 | |
50 | |
51 Note that in the above cases position codes do not start at | |
52 an expected value such as 0 or 1. The reason for this will | |
53 become clear later. | |
54 | |
55 For example, Latin-1 is a 96-character charset, and JISX0208 | |
56 (the Japanese national character set) is a 94x94-character | |
57 charset. | |
58 | |
59 [Note that, although the ranges above define the *valid* | |
60 position codes for a charset, some of the slots in a particular | |
61 charset may in fact be empty. This is the case for JISX0208, | |
62 for example, where (e.g.) all the slots whose first | |
63 position code is in the range 117 - 126 are empty.] | |
64 | |
65 There are three charsets that do not follow the above rules. | |
66 All of them have one dimension, and have ranges of position | |
67 codes as follows: | |
68 | |
69 Charset name Position code 1 | |
70 ------------------------------------ | |
71 ASCII 0 - 127 | |
72 Control-1 0 - 31 | |
73 Composite 0 - some large number | |
74 | |
75 (The upper bound of the position code for composite characters | |
76 has not yet been determined, but it will probably be at | |
77 least 16,383). | |
78 | |
79 ASCII is the union of two subsidiary character sets: | |
80 Printing-ASCII (the printing ASCII character set, | |
81 consisting of position codes 33 - 126, like for a standard | |
82 94-character charset) and Control-ASCII (the non-printing | |
83 characters that would appear in a binary file with codes 0 | |
84 - 32 and 127). | |
85 | |
86 Control-1 contains the non-printing characters that would | |
87 appear in a binary file with codes 128 - 159. | |
88 | |
89 Composite contains characters that are generated by | |
90 overstriking one or more characters from other charsets. | |
91 | |
92 Note that some characters in ASCII, and all characters | |
93 in Control-1, are "control" (non-printing) characters. | |
94 These have no printed representation but instead control | |
95 some other function of the printing (e.g. TAB or 9 moves | |
96 the current character position to the next tab stop). | |
97 All other characters in all charsets are "graphic" | |
98 (printing) characters. | |
99 | |
100 When a binary file is read in, the bytes in the file are | |
101 assigned to character sets as follows: | |
102 | |
103 Bytes Character set Range | |
104 -------------------------------------------------- | |
105 0 - 127 ASCII 0 - 127 | |
106 128 - 159 Control-1 0 - 31 | |
107 160 - 255 Latin-1 32 - 127 | |
108 | |
109 This is a bit ad-hoc but gets the job done. | |
110 | |
111 2. Encodings | |
112 ============ | |
113 | |
114 An "encoding" is a way of numerically representing | |
115 characters from one or more character sets. If an encoding | |
116 only encompasses one character set, then the position codes | |
117 for the characters in that character set could be used | |
118 directly. This is not possible, however, if more than one | |
119 character set is to be used in the encoding. | |
120 | |
121 For example, the conversion detailed above between bytes in | |
122 a binary file and characters is effectively an encoding | |
123 that encompasses the three character sets ASCII, Control-1, | |
124 and Latin-1 in a stream of 8-bit bytes. | |
125 | |
126 Thus, an encoding can be viewed as a way of encoding | |
127 characters from a specified group of character sets using a | |
128 stream of bytes, each of which contains a fixed number of | |
129 bits (but not necessarily 8, as in the common usage of | |
130 "byte"). | |
131 | |
132 Here are descriptions of a couple of common | |
133 encodings: | |
134 | |
135 | |
136 A. Japanese EUC (Extended Unix Code) | |
137 | |
138 This encompasses the character sets: | |
139 - Printing-ASCII, | |
140 - Japanese-JISX0201-Kana (half-width katakana, the right half of JISX0201). | |
141 - Japanese-JISX0208 | |
142 - Japanese-JISX0212 | |
143 It uses 8-bit bytes. | |
144 | |
145 Note that Printing-ASCII and Japanese-JISX0201-Kana are 94-character | |
146 charsets, while Japanese-JISX0208 is a 94x94-character charset. | |
147 | |
148 The encoding is as follows: | |
149 | |
150 Character set Representation (PC == position-code) | |
151 ------------- -------------- | |
152 Printing-ASCII PC-1 | |
153 Japanese-JISX0208 PC-1 + 0x80 / PC-2 + 0x80 | |
154 Japanese-JISX0201-Kana 0x8E / PC-1 + 0x80 | |
155 | |
156 | |
157 B. JIS7 | |
158 | |
159 This encompasses the character sets: | |
160 - Printing-ASCII | |
161 - Japanese-JISX0201-Roman (the left half of JISX0201; this | |
162 character set is very similar to Printing-ASCII and is a | |
163 94-character charset) | |
164 - Japanese-JISX0208 | |
165 - Japanese-JISX0201-Kana. | |
166 It uses 7-bit bytes. | |
167 | |
168 Unlike Japanese EUC, this is a "modal" encoding, which | |
169 means that there are multiple states that the encoding can | |
170 be in, which affect how the bytes are to be interpreted. | |
171 Special sequences of bytes (called "escape sequences") | |
172 are used to change states. | |
173 | |
174 The encoding is as follows: | |
175 | |
176 Character set Representation | |
177 ------------- -------------- | |
178 Printing-ASCII PC-1 | |
179 Japanese-JISX0201-Roman PC-1 | |
180 Japanese-JISX0201-Kana PC-1 | |
181 Japanese-JISX0208 PC-1 / PC-2 | |
182 | |
183 Escape sequence ASCII equivalent Meaning | |
184 --------------- ---------------- ------- | |
185 0x1B 0x28 0x42 ESC ( B invoke Printing-ASCII | |
186 0x1B 0x28 0x4A ESC ( J invoke Japanese-JISX0201-Roman | |
187 0x1B 0x28 0x49 ESC ( I invoke Japanese-JISX0201-Kana | |
188 0x1B 0x24 0x42 ESC $ B invoke Japanese-JISX0208 | |
189 | |
190 Initially, Printing-ASCII is invoked. | |
191 | |
192 3. Internal Mule Encodings | |
193 ========================== | |
194 | |
195 In XEmacs/Mule, each character set is assigned a unique number, | |
196 called a "leading byte". This is used in the encodings of a | |
197 character. Leading bytes are in the range 0x80 - 0xFF | |
198 (except for ASCII, which has a leading byte of 0), although | |
199 some leading bytes are reserved. | |
200 | |
201 Charsets whose leading byte is in the range 0x80 - 0x9F are | |
202 called "official" and are used for built-in charsets. | |
203 Other charsets are called "private" and have leading bytes | |
204 in the range 0xA0 - 0xFF; these are user-defined charsets. | |
205 | |
206 More specifically: | |
207 | |
208 Character set Leading byte | |
209 ------------- ------------ | |
210 ASCII 0 | |
211 Composite 0x80 | |
212 Dimension-1 Official 0x81 - 0x8D | |
213 (0x8E is not assigned to a charset; the code uses it as LEADING_BYTE_ASCII) | |
214 Control 0x8F | |
215 Dimension-2 Official 0x90 - 0x99 | |
216 (0x9A - 0x9D are free; | |
217 0x9E and 0x9F are reserved) | |
218 Dimension-1 Private 0xA0 - 0xEF | |
219 Dimension-2 Private 0xF0 - 0xFF | |
220 | |
221 There are two internal encodings for characters in XEmacs/Mule. | |
222 One is called "string encoding" and is an 8-bit encoding that | |
223 is used for representing characters in a buffer or string. | |
224 It uses 1 to 4 bytes per character. The other is called | |
225 "character encoding" and is a 19-bit encoding that is used | |
226 for representing characters individually in a variable. | |
227 | |
228 (In the following descriptions, we'll ignore composite | |
229 characters for the moment. We also give a general (structural) | |
230 overview first, followed later by the exact details.) | |
231 | |
232 A. Internal String Encoding | |
233 | |
234 ASCII characters are encoded using their position code directly. | |
235 Other characters are encoded using their leading byte followed | |
236 by their position code(s) with the high bit set. Characters | |
237 in private character sets have their leading byte prefixed with | |
238 a "leading byte prefix", which is either 0x9E or 0x9F. (No | |
239 character sets are ever assigned these leading bytes.) Specifically: | |
240 | |
241 Character set Encoding (PC == position-code) | |
242 ------------- -------- (LB == leading-byte) | |
243 ASCII PC1 | | |
244 Control-1 LB | PC1 + 0xA0 | |
245 Dimension-1 official LB | PC1 + 0x80 | |
246 Dimension-1 private 0x9E | LB | PC1 + 0x80 | |
247 Dimension-2 official LB | PC1 + 0x80 | PC2 + 0x80 |
248 Dimension-2 private 0x9F | LB | PC1 + 0x80 | PC2 + 0x80 | |
249 | |
250 The basic characteristic of this encoding is that the first byte | |
251 of all characters is in the range 0x00 - 0x9F, and the second and | |
252 following bytes of all characters are in the range 0xA0 - 0xFF. | |
253 This means that it is impossible to get out of sync, or more | |
254 specifically: | |
255 | |
256 1. Given any byte position, the beginning of the character it is | |
257 within can be determined in constant time. | |
258 2. Given any byte position at the beginning of a character, the | |
259 beginning of the next character can be determined in constant | |
260 time. | |
261 3. Given any byte position at the beginning of a character, the | |
262 beginning of the previous character can be determined in constant | |
263 time. | |
264 4. Textual searches can simply treat encoded strings as if they | |
265 were encoded in a one-byte-per-character fashion rather than | |
266 the actual multi-byte encoding. | |
267 | |
268 None of the standard non-modal encodings meet all of these | |
269 conditions. For example, EUC satisfies only (2) and (3), while | |
270 Shift-JIS and Big5 (not yet described) satisfy only (2). (All | |
271 non-modal encodings must satisfy (2), in order to be unambiguous.) | |
272 | |
273 B. Internal Character Encoding | |
274 | |
275 One 19-bit word represents a single character. The word is | |
276 separated into three fields: | |
277 | |
278 Bit number: 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00 | |
279 <------------> <------------------> <------------------> | |
280 Field: 1 2 3 | |
281 | |
282 Note that fields 2 and 3 hold 7 bits each, while field 1 holds 5 bits. | |
283 | |
284 Character set Field 1 Field 2 Field 3 | |
285 ------------- ------- ------- ------- | |
286 ASCII 0 0 PC1 | |
287 range: (00 - 7F) | |
288 Control-1 0 1 PC1 | |
289 range: (00 - 1F) | |
290 Dimension-1 official 0 LB - 0x80 PC1 | |
291 range: (01 - 0D) (20 - 7F) | |
292 Dimension-1 private 0 LB - 0x80 PC1 | |
293 range: (20 - 6F) (20 - 7F) | |
294 Dimension-2 official LB - 0x8F PC1 PC2 | |
295 range: (01 - 0A) (20 - 7F) (20 - 7F) | |
296 Dimension-2 private LB - 0xE1 PC1 PC2 | |
297 range: (0F - 1E) (20 - 7F) (20 - 7F) | |
298 Composite 0x1F ? ? | |
299 | |
300 Note that character codes 0 - 255 are the same as the "binary encoding" | |
301 described above. | |
302 */ | |
303 | |
304 /* | |
305 About Unicode support: | |
306 | |
307 Adding Unicode support is very desirable. Unicode will likely be a | |
308 very common representation in the future, and thus we should | |
309 represent Unicode characters using three bytes instead of four. | |
310 This means we need to find leading bytes for Unicode. Given that | |
311 there are 65,536 characters in Unicode and we can attach 96x96 = | |
312 9,216 characters per leading byte, we need eight leading bytes for | |
313 Unicode. We currently have four free (0x9A - 0x9D), and with a | |
314 little bit of rearranging we can get five: ASCII doesn't really | |
315 need to take up a leading byte. (We could just as well use 0x7F, | |
316 with a little change to the functions that assume that 0x80 is the | |
317 lowest leading byte.) This means we still need to dump three | |
318 leading bytes and move them into private space. The CNS charsets | |
319 are good candidates since they are rarely used, and | |
320 JAPANESE_JISX0208_1978 is becoming less and less used and could | |
321 also be dumped. */ | |
322 | |
323 | |
324 /************************************************************************/ | |
325 /* Definition of leading bytes */ | |
326 /************************************************************************/ | |
327 | |
/* Smallest value any leading byte can take. */
#define MIN_LEADING_BYTE		0x80

/* Leading bytes that need special treatment in a string and/or a
   character. */

/* ASCII: omitted in a buffer. */
#define LEADING_BYTE_ASCII		0x8E
/* Composite characters. */
#define LEADING_BYTE_COMPOSITE		0x80
/* Control-1: represents the normal bytes 80-9F. */
#define LEADING_BYTE_CONTROL_1		0x8F
333 | |
/** Leading bytes of official charsets whose characters take one
    position code. **/

/* Right halves of the ISO 8859 family. */
#define LEADING_BYTE_LATIN_1	0x81	/* ISO 8859-1 */
#define LEADING_BYTE_LATIN_2	0x82	/* ISO 8859-2 */
#define LEADING_BYTE_LATIN_3	0x83	/* ISO 8859-3 */
#define LEADING_BYTE_LATIN_4	0x84	/* ISO 8859-4 */
#define LEADING_BYTE_THAI	0x85	/* TIS620-2533 */
#define LEADING_BYTE_GREEK	0x86	/* ISO 8859-7 */
#define LEADING_BYTE_ARABIC	0x87	/* ISO 8859-6 */
#define LEADING_BYTE_HEBREW	0x88	/* ISO 8859-8 */

/* The two halves of JIS X0201-1976. */
#define LEADING_BYTE_JAPANESE_JISX0201_KANA	0x89	/* right half */
#define LEADING_BYTE_JAPANESE_JISX0201_ROMAN	0x8A	/* left half */

/* More right halves of the ISO 8859 family. */
#define LEADING_BYTE_CYRILLIC	0x8C	/* ISO 8859-5 */
#define LEADING_BYTE_LATIN_5	0x8D	/* ISO 8859-9 */

/* Bounds of the range above. */
#define MIN_LEADING_BYTE_OFFICIAL_1	LEADING_BYTE_LATIN_1
#define MAX_LEADING_BYTE_OFFICIAL_1	LEADING_BYTE_LATIN_5
351 | |
/** Leading bytes of official charsets whose characters take two
    position codes. **/

#define LEADING_BYTE_JAPANESE_JISX0208_1978	0x90	/* Japanese JIS X0208-1978 */
#define LEADING_BYTE_CHINESE_GB			0x91	/* Chinese Hanzi GB2312-1980 */
#define LEADING_BYTE_JAPANESE_JISX0208		0x92	/* Japanese JIS X0208-1983 */
#define LEADING_BYTE_KOREAN_KSC5601		0x93	/* Hangul KS C5601-1987 */
#define LEADING_BYTE_JAPANESE_JISX0212		0x94	/* Japanese JIS X0212-1990 */
#define LEADING_BYTE_CHINESE_CNS11643_1		0x95	/* Chinese CNS11643 Set 1 */
#define LEADING_BYTE_CHINESE_CNS11643_2		0x96	/* Chinese CNS11643 Set 2 */
#define LEADING_BYTE_CHINESE_BIG5_1		0x97	/* Big5 Level 1 */
#define LEADING_BYTE_CHINESE_BIG5_2		0x98	/* Big5 Level 2 */
/* 0x99 through 0x9D are unused. */

/* Bounds of the range above. */
#define MIN_LEADING_BYTE_OFFICIAL_2	LEADING_BYTE_JAPANESE_JISX0208_1978
#define MAX_LEADING_BYTE_OFFICIAL_2	LEADING_BYTE_CHINESE_BIG5_2
371 | |
/** Private charsets, with characters of one or two position codes. **/

/* Prefix bytes placed before a private leading byte. */
#define PRE_LEADING_BYTE_PRIVATE_1	0x9E	/* 1-byte char-set */
#define PRE_LEADING_BYTE_PRIVATE_2	0x9F	/* 2-byte char-set */

/* Leading-byte range for private 1-byte char-sets. */
#define MIN_LEADING_BYTE_PRIVATE_1	0xA0
#define MAX_LEADING_BYTE_PRIVATE_1	0xEF

/* Leading-byte range for private 2-byte char-sets. */
#define MIN_LEADING_BYTE_PRIVATE_2	0xF0
#define MAX_LEADING_BYTE_PRIVATE_2	0xFF

#define NUM_LEADING_BYTES		128
383 | |
384 | |
385 /************************************************************************/ | |
386 /* Operations on leading bytes */ | |
387 /************************************************************************/ | |
388 | |
/* Nonzero if LB lies at or above the first private leading byte,
   i.e. it belongs to a private charset. */

#define LEADING_BYTE_PRIVATE_P(lb) (!((lb) < MIN_LEADING_BYTE_PRIVATE_1))
392 | |
393 /* Is this a prefix for a private leading byte? */ | |
394 | |
395 INLINE int LEADING_BYTE_PREFIX_P (unsigned char lb); | |
396 INLINE int | |
397 LEADING_BYTE_PREFIX_P (unsigned char lb) | |
398 { | |
399 return (lb == PRE_LEADING_BYTE_PRIVATE_1 || | |
400 lb == PRE_LEADING_BYTE_PRIVATE_2); | |
401 } | |
402 | |
/* Map a private leading byte LB to the prefix byte that is stored in
   front of it in a string: the dimension-2 prefix when LB is in the
   private dimension-2 range, the dimension-1 prefix otherwise. */

#define PRIVATE_LEADING_BYTE_PREFIX(lb)				\
  ((lb) >= MIN_LEADING_BYTE_PRIVATE_2				\
   ? PRE_LEADING_BYTE_PRIVATE_2					\
   : PRE_LEADING_BYTE_PRIVATE_1)
409 | |
410 | |
411 | |
412 | |
413 /************************************************************************/ | |
414 /* Operations on individual bytes */ | |
415 /* of any format */ | |
416 /************************************************************************/ | |
417 | |
/* Byte classification predicates.  The argument `c' should be
   (unsigned int) or (unsigned char).  SP and DEL are not counted as
   control characters by BYTE_C0_P. */

/* Is C below 0x80? */
#define BYTE_ASCII_P(c) (0x80 > (c))
/* Is C below 0x20? */
#define BYTE_C0_P(c) (0x20 > (c))
/* Is C in the range 0x80 - 0x9F?  The value is forced to unsigned so
   that anything below 0x80 wraps around and fails the comparison. */
#define BYTE_C1_P(c) (((unsigned int) (c) - 0x80u) < 0x20u)
425 | |
426 | |
427 /************************************************************************/ | |
428 /* Operations on individual bytes */ | |
429 /* in a Mule-formatted string */ | |
430 /************************************************************************/ | |
431 | |
/* Nonzero if byte C starts a character, i.e. it is below 0xA0. */

#define BUFBYTE_FIRST_BYTE_P(c) (0xA0 > (c))

/* Nonzero if byte C starts a multi-byte character; this is the same
   test as BYTE_C1_P. */

#define BUFBYTE_LEADING_BYTE_P(c) BYTE_C1_P (c)
439 | |
440 | |
441 /************************************************************************/ | |
442 /* Information about a particular character set */ | |
443 /************************************************************************/ | |
444 | |
445 struct Lisp_Charset | |
446 { | |
447 struct lcrecord_header header; | |
448 | |
449 Lisp_Object name; | |
450 Lisp_Object doc_string, registry; | |
451 | |
452 Lisp_Object reverse_direction_charset; | |
453 | |
454 Lisp_Object ccl_program; | |
455 | |
456 unsigned int leading_byte :8; | |
457 | |
458 /* Number of bytes (1 - 4) required in the internal representation | |
459 for characters in this character set. This is *not* the | |
460 same as the number of bytes used in the encoding (i.e. | |
461 the "dimension" of the character set). That value can | |
462 be derived from the TYPE. */ | |
463 unsigned int rep_bytes :3; | |
464 | |
465 /* Number of columns a character in this charset takes up, on TTY | |
466 devices. Not used for X devices. */ | |
467 unsigned int columns :2; | |
468 /* Direction of this character set */ | |
469 unsigned int direction :1; | |
470 | |
471 /* Type of this character set (94, 96, 94x94, 96x96) */ | |
472 unsigned int type :2; | |
473 | |
474 /* Which half of font to be used to display this character set */ | |
475 unsigned int graphic :2; | |
476 | |
477 /* Final byte of this character set in ISO2022 designating escape sequence */ | |
478 Bufbyte final; | |
479 }; | |
480 | |
481 DECLARE_LRECORD (charset, struct Lisp_Charset); | |
482 #define XCHARSET(x) XRECORD (x, charset, struct Lisp_Charset) | |
483 #define XSETCHARSET(x, p) XSETRECORD (x, p, charset) | |
484 #define CHARSETP(x) RECORDP (x, charset) | |
485 #define GC_CHARSETP(x) GC_RECORDP (x, charset) | |
486 #define CHECK_CHARSET(x) CHECK_RECORD (x, charset) | |
487 #define CONCHECK_CHARSET(x) CONCHECK_RECORD (x, charset) | |
488 | |
/* Values of the `type' member: how many characters the charset
   includes. */
#define CHARSET_TYPE_94		0	/* 94 characters */
#define CHARSET_TYPE_96		1	/* 96 characters */
#define CHARSET_TYPE_94X94	2	/* 94x94 characters */
#define CHARSET_TYPE_96X96	3	/* 96x96 characters */

/* Values of the `direction' member. */
#define CHARSET_LEFT_TO_RIGHT	0
#define CHARSET_RIGHT_TO_LEFT	1
496 | |
/* Member accessors for a `struct Lisp_Charset *', listed in the order
   the members appear in the structure.  Each expands to the member
   itself. */
#define CHARSET_NAME(cs)			((cs)->name)
#define CHARSET_DOC_STRING(cs)			((cs)->doc_string)
#define CHARSET_REGISTRY(cs)			((cs)->registry)
#define CHARSET_REVERSE_DIRECTION_CHARSET(cs)	((cs)->reverse_direction_charset)
#define CHARSET_CCL_PROGRAM(cs)			((cs)->ccl_program)
#define CHARSET_LEADING_BYTE(cs)		((cs)->leading_byte)
#define CHARSET_REP_BYTES(cs)			((cs)->rep_bytes)
#define CHARSET_COLUMNS(cs)			((cs)->columns)
#define CHARSET_DIRECTION(cs)			((cs)->direction)
#define CHARSET_TYPE(cs)			((cs)->type)
#define CHARSET_GRAPHIC(cs)			((cs)->graphic)
#define CHARSET_FINAL(cs)			((cs)->final)
509 | |
510 INLINE int CHARSET_DIMENSION (struct Lisp_Charset *cs); | |
511 INLINE int | |
512 CHARSET_DIMENSION (struct Lisp_Charset *cs) | |
513 { | |
514 return (CHARSET_TYPE (cs) == CHARSET_TYPE_94 || | |
515 CHARSET_TYPE (cs) == CHARSET_TYPE_96) ? 1 : 2; | |
516 } | |
517 | |
518 INLINE int CHARSET_CHARS (struct Lisp_Charset *cs); | |
519 INLINE int | |
520 CHARSET_CHARS (struct Lisp_Charset *cs) | |
521 { | |
522 return (CHARSET_TYPE (cs) == CHARSET_TYPE_94 || | |
523 CHARSET_TYPE (cs) == CHARSET_TYPE_94X94) ? 94 : 96; | |
524 } | |
525 | |
/* Nonzero if CS (a `struct Lisp_Charset *') has a private leading
   byte. */
#define CHARSET_PRIVATE_P(cs) \
  LEADING_BYTE_PRIVATE_P (CHARSET_LEADING_BYTE (cs))

/* The same accessors and queries, applied to a Lisp_Object that is
   first unwrapped with XCHARSET. */
#define XCHARSET_NAME(cs)		CHARSET_NAME (XCHARSET (cs))
#define XCHARSET_DOC_STRING(cs)		CHARSET_DOC_STRING (XCHARSET (cs))
#define XCHARSET_REGISTRY(cs)		CHARSET_REGISTRY (XCHARSET (cs))
#define XCHARSET_REVERSE_DIRECTION_CHARSET(cs) \
  CHARSET_REVERSE_DIRECTION_CHARSET (XCHARSET (cs))
#define XCHARSET_CCL_PROGRAM(cs)	CHARSET_CCL_PROGRAM (XCHARSET (cs))
#define XCHARSET_LEADING_BYTE(cs)	CHARSET_LEADING_BYTE (XCHARSET (cs))
#define XCHARSET_REP_BYTES(cs)		CHARSET_REP_BYTES (XCHARSET (cs))
#define XCHARSET_COLUMNS(cs)		CHARSET_COLUMNS (XCHARSET (cs))
#define XCHARSET_DIRECTION(cs)		CHARSET_DIRECTION (XCHARSET (cs))
#define XCHARSET_TYPE(cs)		CHARSET_TYPE (XCHARSET (cs))
#define XCHARSET_GRAPHIC(cs)		CHARSET_GRAPHIC (XCHARSET (cs))
#define XCHARSET_FINAL(cs)		CHARSET_FINAL (XCHARSET (cs))
#define XCHARSET_DIMENSION(cs)		CHARSET_DIMENSION (XCHARSET (cs))
#define XCHARSET_CHARS(cs)		CHARSET_CHARS (XCHARSET (cs))
#define XCHARSET_PRIVATE_P(cs)		CHARSET_PRIVATE_P (XCHARSET (cs))
545 | |
546 /* Table of charsets indexed by (leading byte - 128). */ | |
547 extern Lisp_Object charset_by_leading_byte[128]; | |
548 | |
549 /* Table of charsets indexed by type/final-byte/direction. */ | |
550 extern Lisp_Object charset_by_attributes[4][128][2]; | |
551 | |
552 /* Table of number of bytes in the string representation of a character | |
553 indexed by the first byte of that representation. | |
554 | |
555 This value can be derived other ways -- e.g. something like | |
556 | |
557 (BYTE_ASCII_P (first_byte) ? 1 : | |
558 XCHARSET_REP_BYTES (CHARSET_BY_LEADING_BYTE (first_byte))) | |
559 | |
560 but it's faster this way. */ | |
561 extern Bytecount rep_bytes_by_first_byte[0xA0]; | |
562 | |
/* Look up the charset registered for leading byte LB. */
#ifdef ERROR_CHECK_TYPECHECK

/* The parameter is an int rather than a Bufbyte (the real type of a
   leading byte) so that out-of-range values are caught by the assert
   instead of being silently truncated. */
INLINE Lisp_Object CHARSET_BY_LEADING_BYTE (int lb);
INLINE Lisp_Object
CHARSET_BY_LEADING_BYTE (int lb)
{
  assert (0x80 <= lb && lb <= 0xFF);
  return charset_by_leading_byte[lb - 128];
}

#else /* not ERROR_CHECK_TYPECHECK */

/* Unchecked variant: a plain table index. */
#define CHARSET_BY_LEADING_BYTE(lb) (charset_by_leading_byte[(lb) - 128])

#endif /* not ERROR_CHECK_TYPECHECK */
580 | |
/* Look up a charset by its type, final byte and direction. */
#define CHARSET_BY_ATTRIBUTES(type, final, dir)	\
  (charset_by_attributes[type][final][dir])
583 | |
/* Number of bytes in the string representation of a character whose
   first byte is FB. */
#ifdef ERROR_CHECK_TYPECHECK

/* Checked variant: FB must be a valid index into the table. */
INLINE int REP_BYTES_BY_FIRST_BYTE (int fb);
INLINE int
REP_BYTES_BY_FIRST_BYTE (int fb)
{
  assert (0 <= fb && fb < 0xA0);
  return rep_bytes_by_first_byte[fb];
}

#else /* not ERROR_CHECK_TYPECHECK */

/* Unchecked variant: a plain table index. */
#define REP_BYTES_BY_FIRST_BYTE(fb) (rep_bytes_by_first_byte[fb])

#endif /* not ERROR_CHECK_TYPECHECK */
598 | |
599 extern Lisp_Object Vcharset_ascii; | |
600 extern Lisp_Object Vcharset_control_1; | |
601 extern Lisp_Object Vcharset_latin_1; | |
602 extern Lisp_Object Vcharset_latin_2; | |
603 extern Lisp_Object Vcharset_latin_3; | |
604 extern Lisp_Object Vcharset_latin_4; | |
605 extern Lisp_Object Vcharset_latin_5; | |
606 extern Lisp_Object Vcharset_greek; | |
607 extern Lisp_Object Vcharset_thai; | |
608 extern Lisp_Object Vcharset_arabic; | |
609 extern Lisp_Object Vcharset_hebrew; | |
610 extern Lisp_Object Vcharset_cyrillic; | |
611 extern Lisp_Object Vcharset_japanese_jisx0201_kana; | |
612 extern Lisp_Object Vcharset_japanese_jisx0201_roman; | |
613 extern Lisp_Object Vcharset_japanese_jisx0208_1978; | |
614 extern Lisp_Object Vcharset_japanese_jisx0208; | |
615 extern Lisp_Object Vcharset_japanese_jisx0212; | |
616 extern Lisp_Object Vcharset_korean_ksc5601; | |
617 extern Lisp_Object Vcharset_chinese_gb; | |
618 extern Lisp_Object Vcharset_chinese_big5_1; | |
619 extern Lisp_Object Vcharset_chinese_big5_2; | |
620 extern Lisp_Object Vcharset_chinese_cns11643_1; | |
621 extern Lisp_Object Vcharset_chinese_cns11643_2; | |
622 extern Lisp_Object Vcharset_composite; | |
623 | |
624 | |
625 /************************************************************************/ | |
626 /* Dealing with characters */ | |
627 /************************************************************************/ | |
628 | |
/* Nonzero if character C needs more than one byte in a string, i.e.
   its code is 0x80 or above. */

#define CHAR_MULTIBYTE_P(c) ((c) >= 0x80)

/* Nonzero if character C is not multibyte. */

#define CHAR_ASCII_P(c) (!CHAR_MULTIBYTE_P (c))
634 | |
/* A character code is split into three bit fields, from the top down:
   FIELD1 (5 bits) : FIELD2 (7 bits) : FIELD3 (7 bits). */

#define CHAR_FIELD1_MASK	(0x1F << 14)
#define CHAR_FIELD2_MASK	(0x7F << 7)
#define CHAR_FIELD3_MASK	0x7F

/* Extract one field from character code C: mask it, then shift it
   down to bit 0. */

#define CHAR_FIELD1(c)	(((c) & CHAR_FIELD1_MASK) >> 14)
#define CHAR_FIELD2(c)	(((c) & CHAR_FIELD2_MASK) >> 7)
#define CHAR_FIELD3(c)	((c) & CHAR_FIELD3_MASK)
647 | |
/* A non-zero field 1 usually holds the leading byte of a dimension-2
   charset; a non-zero field 2 usually holds the leading byte of a
   dimension-1 charset. */

/* Offsets that convert a field value into a leading byte: add the
   offset to the field to get the leading byte, subtract it from the
   leading byte to get the field. */

/* Field 2 (dimension-1 charsets). */
#define FIELD2_TO_OFFICIAL_LEADING_BYTE	0x80
#define FIELD2_TO_PRIVATE_LEADING_BYTE	0x80

/* Field 1 (dimension-2 charsets). */
#define FIELD1_TO_OFFICIAL_LEADING_BYTE	0x8F
#define FIELD1_TO_PRIVATE_LEADING_BYTE	0xE1
659 | |
/* Smallest and largest values each field may hold, derived from the
   corresponding leading-byte range and field offset. */

/* Field 2, official dimension-1 charsets. */
#define MIN_CHAR_FIELD2_OFFICIAL \
  (MIN_LEADING_BYTE_OFFICIAL_1 - FIELD2_TO_OFFICIAL_LEADING_BYTE)
#define MAX_CHAR_FIELD2_OFFICIAL \
  (MAX_LEADING_BYTE_OFFICIAL_1 - FIELD2_TO_OFFICIAL_LEADING_BYTE)

/* Field 1, official dimension-2 charsets. */
#define MIN_CHAR_FIELD1_OFFICIAL \
  (MIN_LEADING_BYTE_OFFICIAL_2 - FIELD1_TO_OFFICIAL_LEADING_BYTE)
#define MAX_CHAR_FIELD1_OFFICIAL \
  (MAX_LEADING_BYTE_OFFICIAL_2 - FIELD1_TO_OFFICIAL_LEADING_BYTE)

/* Field 2, private dimension-1 charsets. */
#define MIN_CHAR_FIELD2_PRIVATE \
  (MIN_LEADING_BYTE_PRIVATE_1 - FIELD2_TO_PRIVATE_LEADING_BYTE)
#define MAX_CHAR_FIELD2_PRIVATE \
  (MAX_LEADING_BYTE_PRIVATE_1 - FIELD2_TO_PRIVATE_LEADING_BYTE)

/* Field 1, private dimension-2 charsets. */
#define MIN_CHAR_FIELD1_PRIVATE \
  (MIN_LEADING_BYTE_PRIVATE_2 - FIELD1_TO_PRIVATE_LEADING_BYTE)
#define MAX_CHAR_FIELD1_PRIVATE \
  (MAX_LEADING_BYTE_PRIVATE_2 - FIELD1_TO_PRIVATE_LEADING_BYTE)
681 | |
/* Lowest character code of each kind of character: the minimum field
   value shifted into the position of its field. */

#define MIN_CHAR_OFFICIAL_TYPE9N	(MIN_CHAR_FIELD2_OFFICIAL << 7)
#define MIN_CHAR_PRIVATE_TYPE9N		(MIN_CHAR_FIELD2_PRIVATE << 7)
#define MIN_CHAR_OFFICIAL_TYPE9NX9N	(MIN_CHAR_FIELD1_OFFICIAL << 14)
#define MIN_CHAR_PRIVATE_TYPE9NX9N	(MIN_CHAR_FIELD1_PRIVATE << 14)
#define MIN_CHAR_COMPOSITION		(0x1F << 14)
689 | |
690 /* Leading byte of a character. | |
691 | |
692 NOTE: This takes advantage of the fact that | |
693 FIELD2_TO_OFFICIAL_LEADING_BYTE and | |
694 FIELD2_TO_PRIVATE_LEADING_BYTE are the same. | |
695 */ | |
696 | |
697 INLINE Bufbyte CHAR_LEADING_BYTE (Emchar c); | |
698 INLINE Bufbyte | |
699 CHAR_LEADING_BYTE (Emchar c) | |
700 { | |
701 if (CHAR_ASCII_P (c)) | |
702 return LEADING_BYTE_ASCII; | |
703 else if (c < 0xA0) | |
704 return LEADING_BYTE_CONTROL_1; | |
705 else if (c < MIN_CHAR_OFFICIAL_TYPE9NX9N) | |
706 return CHAR_FIELD2 (c) + FIELD2_TO_OFFICIAL_LEADING_BYTE; | |
707 else if (c < MIN_CHAR_PRIVATE_TYPE9NX9N) | |
708 return CHAR_FIELD1 (c) + FIELD1_TO_OFFICIAL_LEADING_BYTE; | |
709 else if (c < MIN_CHAR_COMPOSITION) | |
710 return CHAR_FIELD1 (c) + FIELD1_TO_PRIVATE_LEADING_BYTE; | |
711 else | |
712 return LEADING_BYTE_COMPOSITE; | |
713 } | |
714 | |
/* The charset object that character C belongs to, found through its
   leading byte. */
#define CHAR_CHARSET(c) \
  CHARSET_BY_LEADING_BYTE (CHAR_LEADING_BYTE (c))
716 | |
717 /* Return a character whose charset is CHARSET and position-codes | |
718 are C1 and C2. TYPE9N character ignores C2. | |
719 | |
720 NOTE: This takes advantage of the fact that | |
721 FIELD2_TO_OFFICIAL_LEADING_BYTE and | |
722 FIELD2_TO_PRIVATE_LEADING_BYTE are the same. | |
723 */ | |
724 | |
725 INLINE Emchar MAKE_CHAR (Lisp_Object charset, int c1, int c2); | |
726 INLINE Emchar | |
727 MAKE_CHAR (Lisp_Object charset, int c1, int c2) | |
728 { | |
729 if (EQ (charset, Vcharset_ascii)) | |
730 return c1; | |
731 else if (EQ (charset, Vcharset_control_1)) | |
732 return c1 | 0x80; | |
733 else if (EQ (charset, Vcharset_composite)) | |
734 return (0x1F << 14) | ((c1) << 7) | (c2); | |
735 else if (XCHARSET_DIMENSION (charset) == 1) | |
736 return ((XCHARSET_LEADING_BYTE (charset) - | |
737 FIELD2_TO_OFFICIAL_LEADING_BYTE) << 7) | (c1); | |
738 else if (!XCHARSET_PRIVATE_P (charset)) | |
739 return ((XCHARSET_LEADING_BYTE (charset) - | |
740 FIELD1_TO_OFFICIAL_LEADING_BYTE) << 14) | ((c1) << 7) | (c2); | |
741 else | |
742 return ((XCHARSET_LEADING_BYTE (charset) - | |
743 FIELD1_TO_PRIVATE_LEADING_BYTE) << 14) | ((c1) << 7) | (c2); | |
744 } | |
745 | |
/* Splitting a character into its charset and position codes.

   BREAKUP_CHAR_1_UNSAFE (c, charset, c1, c2) assumes that CHARSET has
   already been calculated for character C and only stores the
   position codes into the lvalues C1 and C2:

     - dimension-1 charset: C1 = field 3 of C, C2 = 0;
     - otherwise:           C1 = field 2 of C, C2 = field 3 of C.

   BREAKUP_CHAR also computes and stores the charset.

   "Unsafe" because C is expanded in more than one branch and CHARSET
   is not validated here.  The whole expansion is parenthesized so the
   macro behaves as a single expression wherever it is used; without
   the outer parentheses an adjacent operator would bind to the
   `== 1' test and silently select the wrong branch. */

#define BREAKUP_CHAR_1_UNSAFE(c, charset, c1, c2)	\
  (XCHARSET_DIMENSION (charset) == 1			\
   ? ((c1) = CHAR_FIELD3 (c), (c2) = 0)			\
   : ((c1) = CHAR_FIELD2 (c),				\
      (c2) = CHAR_FIELD3 (c)))
760 | |
761 INLINE void breakup_char_1 (Emchar c, Lisp_Object *charset, int *c1, int *c2); | |
762 INLINE void | |
763 breakup_char_1 (Emchar c, Lisp_Object *charset, int *c1, int *c2) | |
764 { | |
765 *charset = CHAR_CHARSET (c); | |
766 BREAKUP_CHAR_1_UNSAFE (c, *charset, *c1, *c2); | |
767 } | |
768 | |
769 #define BREAKUP_CHAR(c, charset, c1, c2) \ | |
770 breakup_char_1 (c, &(charset), &(c1), &(c2)) | |
771 | |
772 | |
773 | |
774 /************************************************************************/ | |
775 /* Composite characters */ | |
776 /************************************************************************/ | |
777 | |
778 extern Lisp_Object Vcomposite_char_int2string_hashtable; | |
779 extern Lisp_Object Vcomposite_char_string2int_hashtable; | |
780 | |
781 Emchar lookup_composite_char (Bufbyte *str, int len); | |
782 Lisp_Object composite_char_string (Emchar ch); | |
783 | |
784 | |
785 | |
786 /************************************************************************/ | |
787 /* Exported functions */ | |
788 /************************************************************************/ | |
789 | |
790 Lisp_Object Fget_charset (Lisp_Object); | |
791 Lisp_Object Ffind_charset (Lisp_Object); | |
792 | |
793 int copy_internal_to_external (CONST Bufbyte *internal, Bytecount len, | |
794 unsigned char *external); | |
795 Bytecount copy_external_to_internal (CONST unsigned char *external, | |
796 int len, Bufbyte *internal); | |
797 | |
798 #endif /* _XEMACS_MULE_CHARSET_H */ |