70
|
1 /* Header for multilingual functions.
|
|
2 Copyright (C) 1992, 1995 Free Software Foundation, Inc.
|
|
3 Copyright (C) 1995 Sun Microsystems, Inc.
|
|
4
|
|
5 This file is part of XEmacs.
|
|
6
|
|
7 XEmacs is free software; you can redistribute it and/or modify it
|
|
8 under the terms of the GNU General Public License as published by the
|
|
9 Free Software Foundation; either version 2, or (at your option) any
|
|
10 later version.
|
|
11
|
|
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
|
|
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
15 for more details.
|
|
16
|
|
17 You should have received a copy of the GNU General Public License
|
|
18 along with XEmacs; see the file COPYING. If not, write to
|
|
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
20 Boston, MA 02111-1307, USA. */
|
|
21
|
|
22 /* Synched up with: Mule 2.3. Not in FSF. */
|
|
23
|
|
24 /* Rewritten by Ben Wing <wing@666.com>. */
|
|
25
|
|
26 #ifndef _XEMACS_MULE_CHARSET_H
|
|
27 #define _XEMACS_MULE_CHARSET_H
|
|
28
|
|
29 /*
|
|
30 1. Character Sets
|
|
31 =================
|
|
32
|
|
33 A character set (or "charset") is an ordered set of characters.
|
|
34 A particular character in a charset is indexed using one or
|
|
35 more "position codes", which are non-negative integers.
|
|
36 The number of position codes needed to identify a particular
|
|
37 character in a charset is called the "dimension" of the
|
|
38 charset. In XEmacs/Mule, all charsets have 1 or 2 dimensions,
|
|
39 and the size of all charsets (except for a few special cases)
|
|
40 is either 94, 96, 94 by 94, or 96 by 96. The range of
|
|
41 position codes used to index characters from any of these
|
|
42 types of character sets is as follows:
|
|
43
|
|
44 Charset type Position code 1 Position code 2
|
|
45 ------------------------------------------------------------
|
|
46 94 33 - 126 N/A
|
|
47 96 32 - 127 N/A
|
|
48 94x94 33 - 126 33 - 126
|
|
49 96x96 32 - 127 32 - 127
|
|
50
|
|
51 Note that in the above cases position codes do not start at
|
|
52 an expected value such as 0 or 1. The reason for this will
|
|
53 become clear later.
|
|
54
|
|
55 For example, Latin-1 is a 96-character charset, and JISX0208
|
|
56 (the Japanese national character set) is a 94x94-character
|
|
57 charset.
|
|
58
|
|
59 [Note that, although the ranges above define the *valid*
|
|
60 position codes for a charset, some of the slots in a particular
|
|
61 charset may in fact be empty. This is the case for JISX0208,
|
|
62 for example, where (e.g.) all the slots whose first
|
|
63 position code is in the range 118 - 127 are empty.]
|
|
64
|
|
65 There are three charsets that do not follow the above rules.
|
|
66 All of them have one dimension, and have ranges of position
|
|
67 codes as follows:
|
|
68
|
|
69 Charset name Position code 1
|
|
70 ------------------------------------
|
|
71 ASCII 0 - 127
|
|
72 Control-1 0 - 31
|
|
73 Composite 0 - some large number
|
|
74
|
|
75 (The upper bound of the position code for composite characters
|
|
76 has not yet been determined, but it will probably be at
|
|
77 least 16,383).
|
|
78
|
|
79 ASCII is the union of two subsidiary character sets:
|
|
80 Printing-ASCII (the printing ASCII character set,
|
|
81 consisting of position codes 33 - 126, like for a standard
|
|
82 94-character charset) and Control-ASCII (the non-printing
|
|
83 characters that would appear in a binary file with codes 0
|
|
84 - 32 and 127).
|
|
85
|
|
86 Control-1 contains the non-printing characters that would
|
|
87 appear in a binary file with codes 128 - 159.
|
|
88
|
|
89 Composite contains characters that are generated by
|
|
90 overstriking one or more characters from other charsets.
|
|
91
|
|
92 Note that some characters in ASCII, and all characters
|
|
93 in Control-1, are "control" (non-printing) characters.
|
|
94 These have no printed representation but instead control
|
|
95 some other function of the printing (e.g. TAB or 9 moves
|
|
96 the current character position to the next tab stop).
|
|
97 All other characters in all charsets are "graphic"
|
|
98 (printing) characters.
|
|
99
|
|
100 When a binary file is read in, the bytes in the file are
|
|
101 assigned to character sets as follows:
|
|
102
|
|
103 Bytes Character set Range
|
|
104 --------------------------------------------------
|
|
105 0 - 127 ASCII 0 - 127
|
|
106 128 - 159 Control-1 0 - 31
|
|
107 160 - 255 Latin-1 32 - 127
|
|
108
|
|
109 This is a bit ad-hoc but gets the job done.
|
|
110
|
|
111 2. Encodings
|
|
112 ============
|
|
113
|
|
114 An "encoding" is a way of numerically representing
|
|
115 characters from one or more character sets. If an encoding
|
|
116 only encompasses one character set, then the position codes
|
|
117 for the characters in that character set could be used
|
|
118 directly. This is not possible, however, if more than one
|
|
119 character set is to be used in the encoding.
|
|
120
|
|
121 For example, the conversion detailed above between bytes in
|
|
122 a binary file and characters is effectively an encoding
|
|
123 that encompasses the three character sets ASCII, Control-1,
|
|
124 and Latin-1 in a stream of 8-bit bytes.
|
|
125
|
|
126 Thus, an encoding can be viewed as a way of encoding
|
|
127 characters from a specified group of character sets using a
|
|
128 stream of bytes, each of which contains a fixed number of
|
|
129 bits (but not necessarily 8, as in the common usage of
|
|
130 "byte").
|
|
131
|
|
132 Here are descriptions of a couple of common
|
|
133 encodings:
|
|
134
|
|
135
|
|
136 A. Japanese EUC (Extended Unix Code)
|
|
137
|
|
138 This encompasses the character sets:
|
|
139 - Printing-ASCII,
|
74
|
140 - Katakana-JISX0201 (half-width katakana, the right half of JISX0201).
|
70
|
141 - Japanese-JISX0208
|
|
142 - Japanese-JISX0212
|
|
143 It uses 8-bit bytes.
|
|
144
|
74
|
145 Note that Printing-ASCII and Katakana-JISX0201 are 94-character
|
70
|
146 charsets, while Japanese-JISX0208 is a 94x94-character charset.
|
|
147
|
|
148 The encoding is as follows:
|
|
149
|
74
|
150 Character set Representation (PC == position-code)
|
|
151 ------------- --------------
|
|
152 Printing-ASCII PC1
|
|
153 Japanese-JISX0208 PC1 + 0x80 | PC2 + 0x80
|
|
154 Katakana-JISX0201 0x8E | PC1 + 0x80
|
70
|
155
|
|
156
|
|
157 B. JIS7
|
|
158
|
|
159 This encompasses the character sets:
|
|
160 - Printing-ASCII
|
74
|
161 - Latin-JISX0201 (the left half of JISX0201; this character set is
|
|
162 very similar to Printing-ASCII and is a 94-character charset)
|
70
|
163 - Japanese-JISX0208
|
74
|
164 - Katakana-JISX0201
|
70
|
165 It uses 7-bit bytes.
|
|
166
|
|
167 Unlike Japanese EUC, this is a "modal" encoding, which
|
|
168 means that there are multiple states that the encoding can
|
|
169 be in, which affect how the bytes are to be interpreted.
|
|
170 Special sequences of bytes (called "escape sequences")
|
|
171 are used to change states.
|
|
172
|
|
173 The encoding is as follows:
|
|
174
|
74
|
175 Character set Representation
|
|
176 ------------- --------------
|
|
177 Printing-ASCII PC1
|
|
178 Latin-JISX0201 PC1
|
|
179 Katakana-JISX0201 PC1
|
|
180 Japanese-JISX0208 PC1 | PC2
|
70
|
181
|
|
182 Escape sequence ASCII equivalent Meaning
|
|
183 --------------- ---------------- -------
|
|
184 0x1B 0x28 0x42 ESC ( B invoke Printing-ASCII
|
74
|
185 0x1B 0x28 0x4A ESC ( J invoke Latin-JISX0201
|
|
186 0x1B 0x28 0x49 ESC ( I invoke Katakana-JISX0201
|
70
|
187 0x1B 0x24 0x42 ESC $ B invoke Japanese-JISX0208
|
|
188
|
|
189 Initially, Printing-ASCII is invoked.
|
|
190
|
|
191 3. Internal Mule Encodings
|
|
192 ==========================
|
|
193
|
|
194 In XEmacs/Mule, each character set is assigned a unique number,
|
|
195 called a "leading byte". This is used in the encodings of a
|
|
196 character. Leading bytes are in the range 0x80 - 0xFF
|
|
197 (except for ASCII, which has a leading byte of 0), although
|
|
198 some leading bytes are reserved.
|
|
199
|
|
200 Charsets whose leading byte is in the range 0x80 - 0x9F are
|
|
201 called "official" and are used for built-in charsets.
|
|
202 Other charsets are called "private" and have leading bytes
|
|
203 in the range 0xA0 - 0xFF; these are user-defined charsets.
|
|
204
|
|
205 More specifically:
|
|
206
|
|
207 Character set Leading byte
|
|
208 ------------- ------------
|
|
209 ASCII 0
|
|
210 Composite 0x80
|
|
211 Dimension-1 Official 0x81 - 0x8D
|
|
212 (0x8E is used as LEADING_BYTE_ASCII; see the definitions below)
|
|
213 Control 0x8F
|
|
214 Dimension-2 Official 0x90 - 0x99
|
|
215 (0x9A - 0x9D are free;
|
|
216 0x9E and 0x9F are reserved)
|
|
217 Dimension-1 Private 0xA0 - 0xEF
|
|
218 Dimension-2 Private 0xF0 - 0xFF
|
|
219
|
|
220 There are two internal encodings for characters in XEmacs/Mule.
|
|
221 One is called "string encoding" and is an 8-bit encoding that
|
|
222 is used for representing characters in a buffer or string.
|
|
223 It uses 1 to 4 bytes per character. The other is called
|
|
224 "character encoding" and is a 19-bit encoding that is used
|
|
225 for representing characters individually in a variable.
|
|
226
|
|
227 (In the following descriptions, we'll ignore composite
|
|
228 characters for the moment. We also give a general (structural)
|
|
229 overview first, followed later by the exact details.)
|
|
230
|
|
231 A. Internal String Encoding
|
|
232
|
|
233 ASCII characters are encoded using their position code directly.
|
|
234 Other characters are encoded using their leading byte followed
|
|
235 by their position code(s) with the high bit set. Characters
|
|
236 in private character sets have their leading byte prefixed with
|
|
237 a "leading byte prefix", which is either 0x9E or 0x9F. (No
|
|
238 character sets are ever assigned these leading bytes.) Specifically:
|
|
239
|
|
240 Character set Encoding (PC == position-code)
|
|
241 ------------- -------- (LB == leading-byte)
|
|
242 ASCII PC1 |
|
|
243 Control-1 LB | PC1 + 0xA0
|
|
244 Dimension-1 official LB | PC1 + 0x80
|
|
245 Dimension-1 private 0x9E | LB | PC1 + 0x80
|
|
246 Dimension-2 official LB | PC1 + 0x80 | PC2 + 0x80
|
|
247 Dimension-2 private 0x9F | LB | PC1 + 0x80 | PC2 + 0x80
|
|
248
|
|
249 The basic characteristic of this encoding is that the first byte
|
|
250 of all characters is in the range 0x00 - 0x9F, and the second and
|
|
251 following bytes of all characters are in the range 0xA0 - 0xFF.
|
|
252 This means that it is impossible to get out of sync, or more
|
|
253 specifically:
|
185
|
254
|
70
|
255 1. Given any byte position, the beginning of the character it is
|
|
256 within can be determined in constant time.
|
|
257 2. Given any byte position at the beginning of a character, the
|
|
258 beginning of the next character can be determined in constant
|
|
259 time.
|
|
260 3. Given any byte position at the beginning of a character, the
|
|
261 beginning of the previous character can be determined in constant
|
|
262 time.
|
|
263 4. Textual searches can simply treat encoded strings as if they
|
|
264 were encoded in a one-byte-per-character fashion rather than
|
|
265 the actual multi-byte encoding.
|
|
266
|
|
267 None of the standard non-modal encodings meet all of these
|
|
268 conditions. For example, EUC satisfies only (2) and (3), while
|
|
269 Shift-JIS and Big5 (not yet described) satisfy only (2). (All
|
|
270 non-modal encodings must satisfy (2), in order to be unambiguous.)
|
|
271
|
|
272 B. Internal Character Encoding
|
|
273
|
|
274 One 19-bit word represents a single character. The word is
|
|
275 separated into three fields:
|
|
276
|
|
277 Bit number: 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
|
|
278 <------------> <------------------> <------------------>
|
|
279 Field: 1 2 3
|
|
280
|
|
281 Note that fields 2 and 3 hold 7 bits each, while field 1 holds 5 bits.
|
|
282
|
|
283 Character set Field 1 Field 2 Field 3
|
|
284 ------------- ------- ------- -------
|
|
285 ASCII 0 0 PC1
|
|
286 range: (00 - 7F)
|
|
287 Control-1 0 1 PC1
|
|
288 range: (00 - 1F)
|
|
289 Dimension-1 official 0 LB - 0x80 PC1
|
|
290 range: (01 - 0D) (20 - 7F)
|
|
291 Dimension-1 private 0 LB - 0x80 PC1
|
|
292 range: (20 - 6F) (20 - 7F)
|
|
293 Dimension-2 official LB - 0x8F PC1 PC2
|
|
294 range: (01 - 0A) (20 - 7F) (20 - 7F)
|
|
295 Dimension-2 private LB - 0xE1 PC1 PC2
|
|
296 range: (0F - 1E) (20 - 7F) (20 - 7F)
|
|
297 Composite 0x1F ? ?
|
|
298
|
|
299 Note that character codes 0 - 255 are the same as the "binary encoding"
|
|
300 described above.
|
|
301 */
|
|
302
|
|
303 /*
|
|
304 About Unicode support:
|
|
305
|
|
306 Adding Unicode support is very desirable. Unicode will likely be a
|
|
307 very common representation in the future, and thus we should
|
|
308 represent Unicode characters using three bytes instead of four.
|
|
309 This means we need to find leading bytes for Unicode. Given that
|
|
310 there are 65,536 characters in Unicode and we can attach 96x96 =
|
|
311 9,216 characters per leading byte, we need eight leading bytes for
|
|
312 Unicode. We currently have four free (0x9A - 0x9D), and with a
|
|
313 little bit of rearranging we can get five: ASCII doesn't really
|
|
314 need to take up a leading byte. (We could just as well use 0x7F,
|
|
315 with a little change to the functions that assume that 0x80 is the
|
|
316 lowest leading byte.) This means we still need to dump three
|
|
317 leading bytes and move them into private space. The CNS charsets
|
|
318 are good candidates since they are rarely used, and
|
|
319 JAPANESE_JISX0208_1978 is becoming less and less used and could
|
|
320 also be dumped. */
|
|
321
|
|
322
|
|
323 /************************************************************************/
|
|
324 /* Definition of leading bytes */
|
|
325 /************************************************************************/
|
|
326
|
|
/* Lowest value used as a leading byte. */
#define MIN_LEADING_BYTE                0x80

/* These need special treatment in a string and/or character. */
#define LEADING_BYTE_ASCII              0x8E /* Omitted in a buffer */
#define LEADING_BYTE_COMPOSITE          0x80 /* For a composite character */
#define LEADING_BYTE_CONTROL_1          0x8F /* Represents normal 80-9F */

/** The following are for 1-byte characters in an official charset. **/

#define LEADING_BYTE_LATIN_ISO8859_1    0x81 /* Right half of ISO 8859-1 */
#define LEADING_BYTE_LATIN_ISO8859_2    0x82 /* Right half of ISO 8859-2 */
#define LEADING_BYTE_LATIN_ISO8859_3    0x83 /* Right half of ISO 8859-3 */
#define LEADING_BYTE_LATIN_ISO8859_4    0x84 /* Right half of ISO 8859-4 */
#define LEADING_BYTE_THAI_TIS620        0x85 /* TIS620-2533 */
#define LEADING_BYTE_GREEK_ISO8859_7    0x86 /* Right half of ISO 8859-7 */
#define LEADING_BYTE_ARABIC_ISO8859_6   0x87 /* Right half of ISO 8859-6 */
#define LEADING_BYTE_HEBREW_ISO8859_8   0x88 /* Right half of ISO 8859-8 */
#define LEADING_BYTE_KATAKANA_JISX0201  0x89 /* Right half of JIS X0201-1976 */
#define LEADING_BYTE_LATIN_JISX0201     0x8A /* Left half of JIS X0201-1976 */
/* No charset is assigned 0x8B in this list. */
#define LEADING_BYTE_CYRILLIC_ISO8859_5 0x8C /* Right half of ISO 8859-5 */
#define LEADING_BYTE_LATIN_ISO8859_9    0x8D /* Right half of ISO 8859-9 */

/* Bounds of the official dimension-1 leading bytes listed above. */
#define MIN_LEADING_BYTE_OFFICIAL_1     LEADING_BYTE_LATIN_ISO8859_1
#define MAX_LEADING_BYTE_OFFICIAL_1     LEADING_BYTE_LATIN_ISO8859_9

/** The following are for 2-byte characters in an official charset. **/

#define LEADING_BYTE_JAPANESE_JISX0208_1978 0x90 /* Japanese JIS X0208-1978 */
#define LEADING_BYTE_CHINESE_GB2312     0x91 /* Chinese Hanzi GB2312-1980 */
#define LEADING_BYTE_JAPANESE_JISX0208  0x92 /* Japanese JIS X0208-1983 */
#define LEADING_BYTE_KOREAN_KSC5601     0x93 /* Hangul KS C5601-1987 */
#define LEADING_BYTE_JAPANESE_JISX0212  0x94 /* Japanese JIS X0212-1990 */
#define LEADING_BYTE_CHINESE_CNS11643_1 0x95 /* Chinese CNS11643 Set 1 */
#define LEADING_BYTE_CHINESE_CNS11643_2 0x96 /* Chinese CNS11643 Set 2 */
#define LEADING_BYTE_CHINESE_BIG5_1     0x97 /* Big5 Level 1 */
#define LEADING_BYTE_CHINESE_BIG5_2     0x98 /* Big5 Level 2 */
                                     /* 0x99 unused */
                                     /* 0x9A unused */
                                     /* 0x9B unused */
                                     /* 0x9C unused */
                                     /* 0x9D unused */

/* Bounds of the official dimension-2 leading bytes listed above. */
#define MIN_LEADING_BYTE_OFFICIAL_2     LEADING_BYTE_JAPANESE_JISX0208_1978
#define MAX_LEADING_BYTE_OFFICIAL_2     LEADING_BYTE_CHINESE_BIG5_2

/** The following are for 1- and 2-byte characters in a private charset. **/

/* Prefix bytes stored in front of a private leading byte in a string. */
#define PRE_LEADING_BYTE_PRIVATE_1      0x9E /* 1-byte char-set */
#define PRE_LEADING_BYTE_PRIVATE_2      0x9F /* 2-byte char-set */

#define MIN_LEADING_BYTE_PRIVATE_1      0xA0
#define MAX_LEADING_BYTE_PRIVATE_1      0xEF
#define MIN_LEADING_BYTE_PRIVATE_2      0xF0
#define MAX_LEADING_BYTE_PRIVATE_2      0xFF

/* Number of values in the leading-byte range 0x80 - 0xFF. */
#define NUM_LEADING_BYTES               128
|
|
382
|
|
383
|
|
384 /************************************************************************/
|
|
385 /* Operations on leading bytes */
|
|
386 /************************************************************************/
|
|
387
|
|
/* Is this leading byte for a private charset?  True for any value at
   or above MIN_LEADING_BYTE_PRIVATE_1.  LB is evaluated exactly once. */

#define LEADING_BYTE_PRIVATE_P(lb) ((lb) >= MIN_LEADING_BYTE_PRIVATE_1)
|
|
391
|
|
392 /* Is this a prefix for a private leading byte? */
|
|
393
|
|
394 INLINE int LEADING_BYTE_PREFIX_P (unsigned char lb);
|
|
395 INLINE int
|
|
396 LEADING_BYTE_PREFIX_P (unsigned char lb)
|
|
397 {
|
|
398 return (lb == PRE_LEADING_BYTE_PRIVATE_1 ||
|
|
399 lb == PRE_LEADING_BYTE_PRIVATE_2);
|
|
400 }
|
|
401
|
|
/* Given a private leading byte, return the leading byte prefix stored
   in a string: PRE_LEADING_BYTE_PRIVATE_1 when LB is below
   MIN_LEADING_BYTE_PRIVATE_2, PRE_LEADING_BYTE_PRIVATE_2 otherwise.
   LB is evaluated exactly once; it is not checked for being private. */

#define PRIVATE_LEADING_BYTE_PREFIX(lb) \
  ((lb) < MIN_LEADING_BYTE_PRIVATE_2 ?  \
   PRE_LEADING_BYTE_PRIVATE_1 :         \
   PRE_LEADING_BYTE_PRIVATE_2)
|
70
|
409
|
|
410
|
|
411 /************************************************************************/
|
|
412 /* Operations on individual bytes */
|
|
413 /* of any format */
|
|
414 /************************************************************************/
|
|
415
|
|
/* Argument `c' should be (unsigned int) or (unsigned char). */
/* Note that SP and DEL are not included. */

/* Byte value is below 0x80. */
#define BYTE_ASCII_P(c) ((c) < 0x80)
/* Byte value is below 0x20. */
#define BYTE_C0_P(c) ((c) < 0x20)
/* Do some forced casting just to make *sure* things are gotten right.
   The subtraction is done in unsigned arithmetic, so a value below
   0x80 wraps around to a large number and fails the `< 0x20' test;
   the macro is therefore true exactly for 0x80 - 0x9F. */
#define BYTE_C1_P(c) ((unsigned int) ((unsigned int) (c) - 0x80) < 0x20)


/************************************************************************/
/*                  Operations on individual bytes                      */
/*                    in a Mule-formatted string                        */
/************************************************************************/

/* Does this byte represent the first byte of a character? */

#define BUFBYTE_FIRST_BYTE_P(c) ((c) < 0xA0)

/* Does this byte represent the first byte of a multi-byte character? */

#define BUFBYTE_LEADING_BYTE_P(c) BYTE_C1_P (c)
|
|
437
|
|
438
|
|
439 /************************************************************************/
|
|
440 /* Information about a particular character set */
|
|
441 /************************************************************************/
|
|
442
|
|
443 struct Lisp_Charset
|
|
444 {
|
|
445 struct lcrecord_header header;
|
|
446
|
213
|
447 int id;
|
70
|
448 Lisp_Object name;
|
|
449 Lisp_Object doc_string, registry;
|
|
450
|
|
451 Lisp_Object reverse_direction_charset;
|
|
452
|
|
453 Lisp_Object ccl_program;
|
185
|
454
|
70
|
455 unsigned int leading_byte :8;
|
|
456
|
|
457 /* Number of bytes (1 - 4) required in the internal representation
|
|
458 for characters in this character set. This is *not* the
|
|
459 same as the number of bytes used in the encoding (i.e.
|
|
460 the "dimension" of the character set). That value can
|
|
461 be derived from the TYPE. */
|
|
462 unsigned int rep_bytes :3;
|
|
463
|
|
464 /* Number of columns a character in this charset takes up, on TTY
|
|
465 devices. Not used for X devices. */
|
|
466 unsigned int columns :2;
|
|
467 /* Direction of this character set */
|
|
468 unsigned int direction :1;
|
|
469
|
|
470 /* Type of this character set (94, 96, 94x94, 96x96) */
|
|
471 unsigned int type :2;
|
|
472
|
|
473 /* Which half of font to be used to display this character set */
|
|
474 unsigned int graphic :2;
|
|
475
|
|
476 /* Final byte of this character set in ISO2022 designating escape sequence */
|
|
477 Bufbyte final;
|
|
478 };
|
|
479
|
|
480 DECLARE_LRECORD (charset, struct Lisp_Charset);
|
|
481 #define XCHARSET(x) XRECORD (x, charset, struct Lisp_Charset)
|
|
482 #define XSETCHARSET(x, p) XSETRECORD (x, p, charset)
|
|
483 #define CHARSETP(x) RECORDP (x, charset)
|
|
484 #define GC_CHARSETP(x) GC_RECORDP (x, charset)
|
|
485 #define CHECK_CHARSET(x) CHECK_RECORD (x, charset)
|
|
486 #define CONCHECK_CHARSET(x) CONCHECK_RECORD (x, charset)
|
|
487
|
74
|
/* Values for the `type' field of struct Lisp_Charset.  The numbering
   matters: CHARSET_DIMENSION and CHARSET_CHARS below rely on it. */
#define CHARSET_TYPE_94    0 /* This charset includes 94    characters. */
#define CHARSET_TYPE_96    1 /* This charset includes 96    characters. */
#define CHARSET_TYPE_94X94 2 /* This charset includes 94x94 characters. */
#define CHARSET_TYPE_96X96 3 /* This charset includes 96x96 characters. */

/* Values for the `direction' field of struct Lisp_Charset. */
#define CHARSET_LEFT_TO_RIGHT 0
#define CHARSET_RIGHT_TO_LEFT 1

/* Field accessors.  CS is a pointer to a struct Lisp_Charset; each
   macro expands to the field itself, so it can also be assigned to. */
#define CHARSET_ID(cs)           ((cs)->id)
#define CHARSET_NAME(cs)         ((cs)->name)
#define CHARSET_LEADING_BYTE(cs) ((cs)->leading_byte)
#define CHARSET_REP_BYTES(cs)    ((cs)->rep_bytes)
#define CHARSET_COLUMNS(cs)      ((cs)->columns)
#define CHARSET_GRAPHIC(cs)      ((cs)->graphic)
#define CHARSET_TYPE(cs)         ((cs)->type)
#define CHARSET_DIRECTION(cs)    ((cs)->direction)
#define CHARSET_FINAL(cs)        ((cs)->final)
#define CHARSET_DOC_STRING(cs)   ((cs)->doc_string)
#define CHARSET_REGISTRY(cs)     ((cs)->registry)
#define CHARSET_CCL_PROGRAM(cs)  ((cs)->ccl_program)
#define CHARSET_REVERSE_DIRECTION_CHARSET(cs) ((cs)->reverse_direction_charset)

/* Optimized using inside knowledge of CHARSET_TYPE values: the two
   dimension-1 types are the two smallest values, and the 96-character
   types are exactly the odd values. */
#define CHARSET_DIMENSION(cs) ((CHARSET_TYPE (cs) <= CHARSET_TYPE_96) ? 1 : 2)
#define CHARSET_CHARS(cs)     ((CHARSET_TYPE (cs) & 0x1) ? 96 : 94)

/* Nonzero if the charset's leading byte lies in the private range. */
#define CHARSET_PRIVATE_P(cs) LEADING_BYTE_PRIVATE_P (CHARSET_LEADING_BYTE (cs))
|
70
|
515
|
213
|
/* Same accessors as the CHARSET_* family, but taking a Lisp_Object CS
   that is first converted with XCHARSET. */
#define XCHARSET_ID(cs)           CHARSET_ID           (XCHARSET (cs))
#define XCHARSET_NAME(cs)         CHARSET_NAME         (XCHARSET (cs))
#define XCHARSET_REP_BYTES(cs)    CHARSET_REP_BYTES    (XCHARSET (cs))
#define XCHARSET_COLUMNS(cs)      CHARSET_COLUMNS      (XCHARSET (cs))
#define XCHARSET_GRAPHIC(cs)      CHARSET_GRAPHIC      (XCHARSET (cs))
#define XCHARSET_TYPE(cs)         CHARSET_TYPE         (XCHARSET (cs))
#define XCHARSET_DIRECTION(cs)    CHARSET_DIRECTION    (XCHARSET (cs))
#define XCHARSET_FINAL(cs)        CHARSET_FINAL        (XCHARSET (cs))
#define XCHARSET_DOC_STRING(cs)   CHARSET_DOC_STRING   (XCHARSET (cs))
#define XCHARSET_REGISTRY(cs)     CHARSET_REGISTRY     (XCHARSET (cs))
#define XCHARSET_LEADING_BYTE(cs) CHARSET_LEADING_BYTE (XCHARSET (cs))
#define XCHARSET_CCL_PROGRAM(cs)  CHARSET_CCL_PROGRAM  (XCHARSET (cs))
#define XCHARSET_DIMENSION(cs)    CHARSET_DIMENSION    (XCHARSET (cs))
#define XCHARSET_CHARS(cs)        CHARSET_CHARS        (XCHARSET (cs))
#define XCHARSET_PRIVATE_P(cs)    CHARSET_PRIVATE_P    (XCHARSET (cs))
#define XCHARSET_REVERSE_DIRECTION_CHARSET(cs) \
  CHARSET_REVERSE_DIRECTION_CHARSET (XCHARSET (cs))
|
|
533
|
|
534 /* Table of charsets indexed by (leading byte - 128). */
|
|
535 extern Lisp_Object charset_by_leading_byte[128];
|
|
536
|
|
537 /* Table of charsets indexed by type/final-byte/direction. */
|
|
538 extern Lisp_Object charset_by_attributes[4][128][2];
|
|
539
|
|
540 /* Table of number of bytes in the string representation of a character
|
|
541 indexed by the first byte of that representation.
|
|
542
|
|
543 This value can be derived other ways -- e.g. something like
|
|
544
|
|
545 (BYTE_ASCII_P (first_byte) ? 1 :
|
|
546 XCHARSET_REP_BYTES (CHARSET_BY_LEADING_BYTE (first_byte)))
|
|
547
|
|
548 but it's faster this way. */
|
|
549 extern Bytecount rep_bytes_by_first_byte[0xA0];
|
|
550
|
|
/* Look up the charset registered for leading byte LB in
   charset_by_leading_byte.  With ERROR_CHECK_TYPECHECK the lookup is a
   function that asserts LB is within 0x80 - 0xFF; otherwise it is a
   plain, unchecked macro. */
#ifdef ERROR_CHECK_TYPECHECK
/* int not Bufbyte even though that is the actual type of a leading byte.
   This way, out-of-range values will get caught rather than automatically
   truncated. */
INLINE Lisp_Object CHARSET_BY_LEADING_BYTE (int lb);
INLINE Lisp_Object
CHARSET_BY_LEADING_BYTE (int lb)
{
  assert (lb >= 0x80 && lb <= 0xFF);
  return charset_by_leading_byte[lb - 128];
}

#else

#define CHARSET_BY_LEADING_BYTE(lb) (charset_by_leading_byte[(lb) - 128])

#endif
|
|
568
|
|
/* Look up a charset in charset_by_attributes by its type, final byte
   and direction.  No range checking is done on the indices. */
#define CHARSET_BY_ATTRIBUTES(type, final, dir) \
  (charset_by_attributes[type][final][dir])
|
|
571
|
|
/* Number of bytes in the string representation of a character, looked
   up in rep_bytes_by_first_byte by its first byte FB.  With
   ERROR_CHECK_TYPECHECK the lookup asserts that FB is a valid index
   (0 <= FB < 0xA0); otherwise it is an unchecked macro. */
#ifdef ERROR_CHECK_TYPECHECK

INLINE int REP_BYTES_BY_FIRST_BYTE (int fb);
INLINE int
REP_BYTES_BY_FIRST_BYTE (int fb)
{
  assert (fb >= 0 && fb < 0xA0);
  return rep_bytes_by_first_byte[fb];
}

#else
#define REP_BYTES_BY_FIRST_BYTE(fb) (rep_bytes_by_first_byte[fb])
#endif
|
|
586
|
|
587 extern Lisp_Object Vcharset_ascii;
|
|
588 extern Lisp_Object Vcharset_control_1;
|
74
|
589 extern Lisp_Object Vcharset_latin_iso8859_1;
|
|
590 extern Lisp_Object Vcharset_latin_iso8859_2;
|
|
591 extern Lisp_Object Vcharset_latin_iso8859_3;
|
|
592 extern Lisp_Object Vcharset_latin_iso8859_4;
|
|
593 extern Lisp_Object Vcharset_cyrillic_iso8859_5;
|
|
594 extern Lisp_Object Vcharset_arabic_iso8859_6;
|
|
595 extern Lisp_Object Vcharset_greek_iso8859_7;
|
|
596 extern Lisp_Object Vcharset_hebrew_iso8859_8;
|
|
597 extern Lisp_Object Vcharset_latin_iso8859_9;
|
|
598 extern Lisp_Object Vcharset_thai_tis620;
|
|
599 extern Lisp_Object Vcharset_katakana_jisx0201;
|
|
600 extern Lisp_Object Vcharset_latin_jisx0201;
|
70
|
601 extern Lisp_Object Vcharset_japanese_jisx0208_1978;
|
|
602 extern Lisp_Object Vcharset_japanese_jisx0208;
|
|
603 extern Lisp_Object Vcharset_japanese_jisx0212;
|
|
604 extern Lisp_Object Vcharset_korean_ksc5601;
|
74
|
605 extern Lisp_Object Vcharset_chinese_gb2312;
|
70
|
606 extern Lisp_Object Vcharset_chinese_big5_1;
|
|
607 extern Lisp_Object Vcharset_chinese_big5_2;
|
|
608 extern Lisp_Object Vcharset_chinese_cns11643_1;
|
|
609 extern Lisp_Object Vcharset_chinese_cns11643_2;
|
|
610 extern Lisp_Object Vcharset_composite;
|
|
611
|
|
612
|
|
613 /************************************************************************/
|
|
614 /* Dealing with characters */
|
|
615 /************************************************************************/
|
|
616
|
|
/* Is this character represented by more than one byte in a string?
   True for every character code of 0x80 or above. */

#define CHAR_MULTIBYTE_P(c) ((c) >= 0x80)

/* Complement of CHAR_MULTIBYTE_P: the character code is below 0x80. */
#define CHAR_ASCII_P(c) (!CHAR_MULTIBYTE_P (c))
|
|
622
|
|
/* The bit fields of character are divided into 3 parts:
   FIELD1(5bits):FIELD2(7bits):FIELD3(7bits) */

#define CHAR_FIELD1_MASK (0x1F << 14)   /* bits 14 - 18 */
#define CHAR_FIELD2_MASK (0x7F << 7)    /* bits  7 - 13 */
#define CHAR_FIELD3_MASK 0x7F           /* bits  0 -  6 */

/* Macros to access each field of a character code of C.  Each one
   masks out its field and shifts it down to bit 0. */

#define CHAR_FIELD1(c) (((c) & CHAR_FIELD1_MASK) >> 14)
#define CHAR_FIELD2(c) (((c) & CHAR_FIELD2_MASK) >> 7)
#define CHAR_FIELD3(c)  ((c) & CHAR_FIELD3_MASK)
|
|
635
|
|
/* Field 1, if non-zero, usually holds a leading byte for a
   dimension-2 charset.  Field 2, if non-zero, usually holds a leading
   byte for a dimension-1 charset. */

/* Converting between field values and leading bytes.  Adding the
   offset to a field value yields the leading byte; subtracting it from
   a leading byte yields the field value. */

#define FIELD2_TO_OFFICIAL_LEADING_BYTE 0x80
#define FIELD2_TO_PRIVATE_LEADING_BYTE  0x80

#define FIELD1_TO_OFFICIAL_LEADING_BYTE 0x8F
#define FIELD1_TO_PRIVATE_LEADING_BYTE  0xE1

/* Minimum and maximum allowed values for the fields. */

#define MIN_CHAR_FIELD2_OFFICIAL \
  (MIN_LEADING_BYTE_OFFICIAL_1 - FIELD2_TO_OFFICIAL_LEADING_BYTE)
#define MAX_CHAR_FIELD2_OFFICIAL \
  (MAX_LEADING_BYTE_OFFICIAL_1 - FIELD2_TO_OFFICIAL_LEADING_BYTE)

#define MIN_CHAR_FIELD1_OFFICIAL \
  (MIN_LEADING_BYTE_OFFICIAL_2 - FIELD1_TO_OFFICIAL_LEADING_BYTE)
#define MAX_CHAR_FIELD1_OFFICIAL \
  (MAX_LEADING_BYTE_OFFICIAL_2 - FIELD1_TO_OFFICIAL_LEADING_BYTE)

#define MIN_CHAR_FIELD2_PRIVATE \
  (MIN_LEADING_BYTE_PRIVATE_1 - FIELD2_TO_PRIVATE_LEADING_BYTE)
#define MAX_CHAR_FIELD2_PRIVATE \
  (MAX_LEADING_BYTE_PRIVATE_1 - FIELD2_TO_PRIVATE_LEADING_BYTE)

#define MIN_CHAR_FIELD1_PRIVATE \
  (MIN_LEADING_BYTE_PRIVATE_2 - FIELD1_TO_PRIVATE_LEADING_BYTE)
#define MAX_CHAR_FIELD1_PRIVATE \
  (MAX_LEADING_BYTE_PRIVATE_2 - FIELD1_TO_PRIVATE_LEADING_BYTE)

/* Minimum character code of each <type> character.  A field-2 minimum
   is shifted into bits 7 - 13, a field-1 minimum into bits 14 - 18. */

#define MIN_CHAR_OFFICIAL_TYPE9N    (MIN_CHAR_FIELD2_OFFICIAL << 7)
#define MIN_CHAR_PRIVATE_TYPE9N     (MIN_CHAR_FIELD2_PRIVATE << 7)
#define MIN_CHAR_OFFICIAL_TYPE9NX9N (MIN_CHAR_FIELD1_OFFICIAL << 14)
#define MIN_CHAR_PRIVATE_TYPE9NX9N  (MIN_CHAR_FIELD1_PRIVATE << 14)
#define MIN_CHAR_COMPOSITION        (0x1F << 14)
|
|
677
|
|
678 /* Leading byte of a character.
|
|
679
|
|
680 NOTE: This takes advantage of the fact that
|
|
681 FIELD2_TO_OFFICIAL_LEADING_BYTE and
|
|
682 FIELD2_TO_PRIVATE_LEADING_BYTE are the same.
|
|
683 */
|
|
684
|
|
685 INLINE Bufbyte CHAR_LEADING_BYTE (Emchar c);
|
|
686 INLINE Bufbyte
|
|
687 CHAR_LEADING_BYTE (Emchar c)
|
|
688 {
|
|
689 if (CHAR_ASCII_P (c))
|
|
690 return LEADING_BYTE_ASCII;
|
|
691 else if (c < 0xA0)
|
|
692 return LEADING_BYTE_CONTROL_1;
|
|
693 else if (c < MIN_CHAR_OFFICIAL_TYPE9NX9N)
|
|
694 return CHAR_FIELD2 (c) + FIELD2_TO_OFFICIAL_LEADING_BYTE;
|
|
695 else if (c < MIN_CHAR_PRIVATE_TYPE9NX9N)
|
|
696 return CHAR_FIELD1 (c) + FIELD1_TO_OFFICIAL_LEADING_BYTE;
|
|
697 else if (c < MIN_CHAR_COMPOSITION)
|
|
698 return CHAR_FIELD1 (c) + FIELD1_TO_PRIVATE_LEADING_BYTE;
|
|
699 else
|
|
700 return LEADING_BYTE_COMPOSITE;
|
|
701 }
|
|
702
|
|
703 #define CHAR_CHARSET(c) CHARSET_BY_LEADING_BYTE (CHAR_LEADING_BYTE (c))
|
|
704
|
|
705 /* Return a character whose charset is CHARSET and position-codes
|
|
706 are C1 and C2. TYPE9N character ignores C2.
|
|
707
|
|
708 NOTE: This takes advantage of the fact that
|
|
709 FIELD2_TO_OFFICIAL_LEADING_BYTE and
|
|
710 FIELD2_TO_PRIVATE_LEADING_BYTE are the same.
|
|
711 */
|
|
712
|
|
713 INLINE Emchar MAKE_CHAR (Lisp_Object charset, int c1, int c2);
|
|
714 INLINE Emchar
|
|
715 MAKE_CHAR (Lisp_Object charset, int c1, int c2)
|
|
716 {
|
|
717 if (EQ (charset, Vcharset_ascii))
|
|
718 return c1;
|
|
719 else if (EQ (charset, Vcharset_control_1))
|
|
720 return c1 | 0x80;
|
|
721 else if (EQ (charset, Vcharset_composite))
|
|
722 return (0x1F << 14) | ((c1) << 7) | (c2);
|
|
723 else if (XCHARSET_DIMENSION (charset) == 1)
|
|
724 return ((XCHARSET_LEADING_BYTE (charset) -
|
|
725 FIELD2_TO_OFFICIAL_LEADING_BYTE) << 7) | (c1);
|
|
726 else if (!XCHARSET_PRIVATE_P (charset))
|
|
727 return ((XCHARSET_LEADING_BYTE (charset) -
|
|
728 FIELD1_TO_OFFICIAL_LEADING_BYTE) << 14) | ((c1) << 7) | (c2);
|
|
729 else
|
|
730 return ((XCHARSET_LEADING_BYTE (charset) -
|
|
731 FIELD1_TO_PRIVATE_LEADING_BYTE) << 14) | ((c1) << 7) | (c2);
|
|
732 }
|
|
733
|
|
/* The charset of character C is set to CHARSET, and the
   position-codes of C are set to C1 and C2.  C2 of TYPE9N character
   is 0. */

/* BREAKUP_CHAR_1_UNSAFE assumes that the charset has already been
   calculated, and just computes c1 and c2.

   BREAKUP_CHAR also computes and stores the charset.

   The whole expansion is parenthesized so that the conditional cannot
   be re-associated with operators surrounding the macro call.  C1 and
   C2 must be lvalues.  C is expanded in both arms but only one arm is
   evaluated. */

#define BREAKUP_CHAR_1_UNSAFE(c, charset, c1, c2)       \
  (XCHARSET_DIMENSION (charset) == 1                    \
   ? ((c1) = CHAR_FIELD3 (c), (c2) = 0)                 \
   : ((c1) = CHAR_FIELD2 (c),                           \
      (c2) = CHAR_FIELD3 (c)))
|
|
748
|
|
749 INLINE void breakup_char_1 (Emchar c, Lisp_Object *charset, int *c1, int *c2);
|
|
750 INLINE void
|
|
751 breakup_char_1 (Emchar c, Lisp_Object *charset, int *c1, int *c2)
|
|
752 {
|
|
753 *charset = CHAR_CHARSET (c);
|
|
754 BREAKUP_CHAR_1_UNSAFE (c, *charset, *c1, *c2);
|
|
755 }
|
|
756
|
|
757 #define BREAKUP_CHAR(c, charset, c1, c2) \
|
|
758 breakup_char_1 (c, &(charset), &(c1), &(c2))
|
|
759
|
|
760
|
|
761
|
|
762 /************************************************************************/
|
|
763 /* Composite characters */
|
|
764 /************************************************************************/
|
|
765
|
|
766 extern Lisp_Object Vcomposite_char_int2string_hashtable;
|
|
767 extern Lisp_Object Vcomposite_char_string2int_hashtable;
|
|
768
|
|
769 Emchar lookup_composite_char (Bufbyte *str, int len);
|
|
770 Lisp_Object composite_char_string (Emchar ch);
|
|
771
|
|
772
|
|
773
|
|
774 /************************************************************************/
|
|
775 /* Exported functions */
|
|
776 /************************************************************************/
|
|
777
|
|
778 Lisp_Object Fget_charset (Lisp_Object);
|
|
779 Lisp_Object Ffind_charset (Lisp_Object);
|
|
780
|
|
781 int copy_internal_to_external (CONST Bufbyte *internal, Bytecount len,
|
|
782 unsigned char *external);
|
|
783 Bytecount copy_external_to_internal (CONST unsigned char *external,
|
|
784 int len, Bufbyte *internal);
|
|
785
|
|
786 #endif /* _XEMACS_MULE_CHARSET_H */
|