xemacs-beta: src/mule-charset.h comparison

comparison src/mule-charset.h @ 70:131b0175ea99 r20-0b30

Import from CVS: tag r20-0b30

author	cvs
date	Mon, 13 Aug 2007 09:02:59 +0200
parents
children	54cc21c15cbb

comparison

equal deleted inserted replaced

-:804d1389bcd6
+:131b0175ea99
+/* Header for multilingual functions.
+Copyright (C) 1992, 1995 Free Software Foundation, Inc.
+Copyright (C) 1995 Sun Microsystems, Inc.
+This file is part of XEmacs.
+XEmacs is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+XEmacs is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+You should have received a copy of the GNU General Public License
+along with XEmacs; see the file COPYING.  If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+/* Synched up with: Mule 2.3.  Not in FSF. */
+/* Rewritten by Ben Wing <wing@666.com>. */
+#ifndef _XEMACS_MULE_CHARSET_H
+#define _XEMACS_MULE_CHARSET_H
+/*
+1. Character Sets
+=================
+A character set (or "charset") is an ordered set of characters.
+A particular character in a charset is indexed using one or
+more "position codes", which are non-negative integers.
+The number of position codes needed to identify a particular
+character in a charset is called the "dimension" of the
+charset.  In XEmacs/Mule, all charsets have 1 or 2 dimensions,
+and the size of all charsets (except for a few special cases)
+is either 94, 96, 94 by 94, or 96 by 96.  The range of
+position codes used to index characters from any of these
+types of character sets is as follows:
+Charset type		Position code 1		Position code 2
+------------------------------------------------------------
+94			33 - 126		N/A
+96			32 - 127		N/A
+94x94		33 - 126		33 - 126
+96x96		32 - 127		32 - 127
+Note that in the above cases position codes do not start at
+an expected value such as 0 or 1.  The reason for this will
+become clear later.
+For example, Latin-1 is a 96-character charset, and JISX0208
+(the Japanese national character set) is a 94x94-character
+charset.
+[Note that, although the ranges above define the *valid*
+position codes for a charset, some of the slots in a particular
+charset may in fact be empty.  This is the case for JISX0208,
+for example, where (e.g.) all the slots whose first
+position code is in the range 118 - 127 are empty.]
+There are three charsets that do not follow the above rules.
+All of them have one dimension, and have ranges of position
+codes as follows:
+Charset name		Position code 1
+------------------------------------
+ASCII		0 - 127
+Control-1		0 - 31
+Composite		0 - some large number
+(The upper bound of the position code for composite characters
+has not yet been determined, but it will probably be at
+least 16,383).
+ASCII is the union of two subsidiary character sets:
+Printing-ASCII (the printing ASCII character set,
+consisting of position codes 33 - 126, like for a standard
+94-character charset) and Control-ASCII (the non-printing
+characters that would appear in a binary file with codes 0
+- 32 and 127).
+Control-1 contains the non-printing characters that would
+appear in a binary file with codes 128 - 159.
+Composite contains characters that are generated by
+overstriking one or more characters from other charsets.
+Note that some characters in ASCII, and all characters
+in Control-1, are "control" (non-printing) characters.
+These have no printed representation but instead control
+some other function of the printing (e.g. TAB, code 9, moves
+the current character position to the next tab stop).
+All other characters in all charsets are "graphic"
+(printing) characters.
+When a binary file is read in, the bytes in the file are
+assigned to character sets as follows:
+Bytes		Character set		Range
+--------------------------------------------------
+0 - 127		ASCII			0 - 127
+128 - 159		Control-1		0 - 31
+160 - 255		Latin-1			32 - 127
+This is a bit ad-hoc but gets the job done.
+2. Encodings
+============
+An "encoding" is a way of numerically representing
+characters from one or more character sets.  If an encoding
+only encompasses one character set, then the position codes
+for the characters in that character set could be used
+directly.  This is not possible, however, if more than one
+character set is to be used in the encoding.
+For example, the conversion detailed above between bytes in
+a binary file and characters is effectively an encoding
+that encompasses the three character sets ASCII, Control-1,
+and Latin-1 in a stream of 8-bit bytes.
+Thus, an encoding can be viewed as a way of encoding
+characters from a specified group of character sets using a
+stream of bytes, each of which contains a fixed number of
+bits (but not necessarily 8, as in the common usage of
+"byte").
+Here are descriptions of a couple of common
+encodings:
+A. Japanese EUC (Extended Unix Code)
+This encompasses the character sets:
+- Printing-ASCII,
+- Japanese-JISX0201-Kana (half-width katakana, the right half of JISX0201).
+- Japanese-JISX0208
+- Japanese-JISX0212
+It uses 8-bit bytes.
+Note that Printing-ASCII and Japanese-JISX0201-Kana are 94-character
+charsets, while Japanese-JISX0208 and Japanese-JISX0212 are
+94x94-character charsets.
+The encoding is as follows:
+Character set		Representation  (PC == position-code)
+-------------		--------------
+Printing-ASCII		PC-1
+Japanese-JISX0208		PC-1 + 0x80 / PC-2 + 0x80
+Japanese-JISX0201-Kana	0x8E / PC-1 + 0x80
+Japanese-JISX0212		0x8F / PC-1 + 0x80 / PC-2 + 0x80
+B. JIS7
+This encompasses the character sets:
+- Printing-ASCII
+- Japanese-JISX0201-Roman (the left half of JISX0201; this
+character set is very similar to Printing-ASCII and is a
+94-character charset)
+- Japanese-JISX0208
+- Japanese-JISX0201-Kana.
+It uses 7-bit bytes.
+Unlike Japanese EUC, this is a "modal" encoding, which
+means that there are multiple states that the encoding can
+be in, which affect how the bytes are to be interpreted.
+Special sequences of bytes (called "escape sequences")
+are used to change states.
+The encoding is as follows:
+Character set		Representation
+-------------		--------------
+Printing-ASCII		PC-1
+Japanese-JISX0201-Roman	PC-1
+Japanese-JISX0201-Kana	PC-1
+Japanese-JISX0208		PC-1 / PC-2
+Escape sequence	ASCII equivalent  Meaning
+---------------	----------------  -------
+0x1B 0x28 0x42	ESC ( B		  invoke Printing-ASCII
+0x1B 0x28 0x4A	ESC ( J		  invoke Japanese-JISX0201-Roman
+0x1B 0x28 0x49	ESC ( I		  invoke Japanese-JISX0201-Kana
+0x1B 0x24 0x42	ESC $ B		  invoke Japanese-JISX0208
+Initially, Printing-ASCII is invoked.
+3. Internal Mule Encodings
+==========================
+In XEmacs/Mule, each character set is assigned a unique number,
+called a "leading byte".  This is used in the encodings of a
+character.  Leading bytes are in the range 0x80 - 0xFF
+(except for ASCII, which has a leading byte of 0), although
+some leading bytes are reserved.
+Charsets whose leading byte is in the range 0x80 - 0x9F are
+called "official" and are used for built-in charsets.
+Other charsets are called "private" and have leading bytes
+in the range 0xA0 - 0xFF; these are user-defined charsets.
+More specifically:
+Character set		Leading byte
+-------------		------------
+ASCII			0
+Composite			0x80
+Dimension-1 Official		0x81 - 0x8D
+				  (0x8E is free)
+Control			0x8F
+Dimension-2 Official		0x90 - 0x99
+				  (0x9A - 0x9D are free;
+				  0x9E and 0x9F are reserved)
+Dimension-1 Private		0xA0 - 0xEF
+Dimension-2 Private		0xF0 - 0xFF
+There are two internal encodings for characters in XEmacs/Mule.
+One is called "string encoding" and is an 8-bit encoding that
+is used for representing characters in a buffer or string.
+It uses 1 to 4 bytes per character.  The other is called
+"character encoding" and is a 19-bit encoding that is used
+for representing characters individually in a variable.
+(In the following descriptions, we'll ignore composite
+characters for the moment.  We also give a general (structural)
+overview first, followed later by the exact details.)
+A. Internal String Encoding
+ASCII characters are encoded using their position code directly.
+Other characters are encoded using their leading byte followed
+by their position code(s) with the high bit set.  Characters
+in private character sets have their leading byte prefixed with
+a "leading byte prefix", which is either 0x9E or 0x9F. (No
+character sets are ever assigned these leading bytes.) Specifically:
+Character set		Encoding (PC == position-code)
+-------------		-------- (LB == leading-byte)
+ASCII			PC1 |
+Control-1			LB   | PC1 + 0xA0
+Dimension-1 official		LB   | PC1 + 0x80
+Dimension-1 private		0x9E | LB         | PC1 + 0x80
+Dimension-2 official		LB   | PC1 + 0x80 | PC2 + 0x80
+Dimension-2 private		0x9F | LB         | PC1 + 0x80 | PC2 + 0x80
+The basic characteristic of this encoding is that the first byte
+of every character is in the range 0x00 - 0x9F, and the second and
+following bytes of every character are in the range 0xA0 - 0xFF.
+This means that it is impossible to get out of sync, or more
+specifically:
+1. Given any byte position, the beginning of the character it is
+within can be determined in constant time.
+2. Given any byte position at the beginning of a character, the
+beginning of the next character can be determined in constant
+time.
+3. Given any byte position at the beginning of a character, the
+beginning of the previous character can be determined in constant
+time.
+4. Textual searches can simply treat encoded strings as if they
+were encoded in a one-byte-per-character fashion rather than
+the actual multi-byte encoding.
+None of the standard non-modal encodings meet all of these
+conditions.  For example, EUC satisfies only (2) and (3), while
+Shift-JIS and Big5 (not yet described) satisfy only (2). (All
+non-modal encodings must satisfy (2), in order to be unambiguous.)
+B. Internal Character Encoding
+One 19-bit word represents a single character.  The word is
+separated into three fields:
+Bit number:	18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
+		<------------> <------------------> <------------------>
+Field:	      1		         2		      3
+Note that fields 2 and 3 hold 7 bits each, while field 1 holds 5 bits.
+Character set		Field 1		Field 2		Field 3
+-------------		-------		-------		-------
+ASCII			   0		   0              PC1
+range:                                                   (00 - 7F)
+Control-1			   0		   1              PC1
+range:                                                   (00 - 1F)
+Dimension-1 official            0            LB - 0x80         PC1
+range:                                    (01 - 0D)      (20 - 7F)
+Dimension-1 private             0            LB - 0x80         PC1
+range:                                    (20 - 6F)      (20 - 7F)
+Dimension-2 official		LB - 0x8F          PC1            PC2
+range:                    (01 - 0A)       (20 - 7F)      (20 - 7F)
+Dimension-2 private          LB - 0xE1          PC1            PC2
+range:                    (0F - 1E)       (20 - 7F)      (20 - 7F)
+Composite			  0x1F              ?              ?
+Note that character codes 0 - 255 are the same as the "binary encoding"
+described above.
+*/
+/*
+About Unicode support:
+Adding Unicode support is very desirable.  Unicode will likely be a
+very common representation in the future, and thus we should
+represent Unicode characters using three bytes instead of four.
+This means we need to find leading bytes for Unicode.  Given that
+there are 65,536 characters in Unicode and we can attach 96x96 =
+9,216 characters per leading byte, we need eight leading bytes for
+Unicode.  We currently have four free (0x9A - 0x9D), and with a
+little bit of rearranging we can get five: ASCII doesn't really
+need to take up a leading byte. (We could just as well use 0x7F,
+with a little change to the functions that assume that 0x80 is the
+lowest leading byte.) This means we still need to dump three
+leading bytes and move them into private space.  The CNS charsets
+are good candidates since they are rarely used, and
+JAPANESE_JISX0208_1978 is becoming less and less used and could
+also be dumped. */
+/************************************************************************/
+/*                    Definition of leading bytes                       */
+/************************************************************************/
+/* Lowest value a leading byte can take. */
+#define MIN_LEADING_BYTE		0x80
+/* These need special treatment in a string and/or character */
+/* NOTE(review): the overview comment earlier in this file lists the
+   leading byte of ASCII as 0 and calls 0x8E free, but the value the
+   code actually uses (see CHAR_LEADING_BYTE) is 0x8E.  ASCII
+   characters are stored in a buffer without any leading byte. */
+#define LEADING_BYTE_ASCII		0x8E /* Omitted in a buffer */
+#define LEADING_BYTE_COMPOSITE		0x80 /* for a composite character */
+#define LEADING_BYTE_CONTROL_1		0x8F /* represent normal 80-9F */
+/** The following are for 1-byte characters in an official charset. **/
+/* Values run from 0x81 to 0x8D; 0x8B is not assigned in this list. */
+#define LEADING_BYTE_LATIN_1		0x81 /* Right half of ISO 8859-1 */
+#define LEADING_BYTE_LATIN_2		0x82 /* Right half of ISO 8859-2 */
+#define LEADING_BYTE_LATIN_3		0x83 /* Right half of ISO 8859-3 */
+#define LEADING_BYTE_LATIN_4		0x84 /* Right half of ISO 8859-4 */
+#define LEADING_BYTE_THAI		0x85 /* TIS620-2533 */
+#define LEADING_BYTE_GREEK		0x86 /* Right half of ISO 8859-7 */
+#define LEADING_BYTE_ARABIC		0x87 /* Right half of ISO 8859-6 */
+#define LEADING_BYTE_HEBREW		0x88 /* Right half of ISO 8859-8 */
+#define LEADING_BYTE_JAPANESE_JISX0201_KANA  0x89 /* Right half of JIS X0201-1976 */
+#define LEADING_BYTE_JAPANESE_JISX0201_ROMAN 0x8A /* Left  half of JIS X0201-1976 */
+#define LEADING_BYTE_CYRILLIC		0x8C /* Right half of ISO 8859-5 */
+#define LEADING_BYTE_LATIN_5		0x8D /* Right half of ISO 8859-9 */
+/* Inclusive bounds of the official dimension-1 leading bytes. */
+#define MIN_LEADING_BYTE_OFFICIAL_1	LEADING_BYTE_LATIN_1
+#define MAX_LEADING_BYTE_OFFICIAL_1	LEADING_BYTE_LATIN_5
+/** The following are for 2-byte characters in an official charset. **/
+#define LEADING_BYTE_JAPANESE_JISX0208_1978 0x90/* Japanese JIS X0208-1978 */
+#define LEADING_BYTE_CHINESE_GB		0x91	/* Chinese Hanzi GB2312-1980 */
+#define LEADING_BYTE_JAPANESE_JISX0208	0x92	/* Japanese JIS X0208-1983 */
+#define LEADING_BYTE_KOREAN_KSC5601	0x93	/* Hangul KS C5601-1987 */
+#define LEADING_BYTE_JAPANESE_JISX0212	0x94	/* Japanese JIS X0212-1990 */
+#define LEADING_BYTE_CHINESE_CNS11643_1	0x95	/* Chinese CNS11643 Set 1 */
+#define LEADING_BYTE_CHINESE_CNS11643_2	0x96	/* Chinese CNS11643 Set 2 */
+#define LEADING_BYTE_CHINESE_BIG5_1	0x97	/* Big5 Level 1 */
+#define LEADING_BYTE_CHINESE_BIG5_2	0x98	/* Big5 Level 2 */
+				     /* 0x99	   unused */
+				     /* 0x9A       unused */
+				     /* 0x9B       unused */
+				     /* 0x9C       unused */
+				     /* 0x9D       unused */
+/* Inclusive bounds of the official dimension-2 leading bytes that are
+   actually assigned above (0x90 - 0x98). */
+#define MIN_LEADING_BYTE_OFFICIAL_2	LEADING_BYTE_JAPANESE_JISX0208_1978
+#define MAX_LEADING_BYTE_OFFICIAL_2	LEADING_BYTE_CHINESE_BIG5_2
+/** The following are for 1- and 2-byte characters in a private charset. **/
+/* The PRE_ values are the prefix bytes that precede a private leading
+   byte in a string; they are never used as leading bytes themselves. */
+#define PRE_LEADING_BYTE_PRIVATE_1	0x9E	/* 1-byte char-set */
+#define PRE_LEADING_BYTE_PRIVATE_2	0x9F	/* 2-byte char-set */
+/* Inclusive leading-byte ranges for private dimension-1 and
+   dimension-2 charsets. */
+#define MIN_LEADING_BYTE_PRIVATE_1	0xA0
+#define MAX_LEADING_BYTE_PRIVATE_1	0xEF
+#define MIN_LEADING_BYTE_PRIVATE_2	0xF0
+#define MAX_LEADING_BYTE_PRIVATE_2	0xFF
+/* Count of values in the range 0x80 - 0xFF. */
+#define NUM_LEADING_BYTES 128
+/************************************************************************/
+/*                    Operations on leading bytes                       */
+/************************************************************************/
+/* Is this leading byte for a private charset?  True for any value at
+   or above MIN_LEADING_BYTE_PRIVATE_1, so it covers both the
+   dimension-1 and the dimension-2 private ranges.  No upper bound is
+   checked. */
+#define LEADING_BYTE_PRIVATE_P(lb) ((lb) >= MIN_LEADING_BYTE_PRIVATE_1)
+/* Return 1 if LB is one of the two prefix bytes that introduce a
+   private leading byte in a string (PRE_LEADING_BYTE_PRIVATE_1 or
+   PRE_LEADING_BYTE_PRIVATE_2), and 0 otherwise. */
+INLINE int LEADING_BYTE_PREFIX_P (unsigned char lb);
+INLINE int
+LEADING_BYTE_PREFIX_P (unsigned char lb)
+{
+  switch (lb)
+    {
+    case PRE_LEADING_BYTE_PRIVATE_1:
+    case PRE_LEADING_BYTE_PRIVATE_2:
+      return 1;
+    default:
+      return 0;
+    }
+}
+/* Map a private leading byte to the prefix byte that precedes it in a
+   string: leading bytes in the dimension-2 private range get
+   PRE_LEADING_BYTE_PRIVATE_2, everything below that range gets
+   PRE_LEADING_BYTE_PRIVATE_1.  LB is evaluated once. */
+#define PRIVATE_LEADING_BYTE_PREFIX(lb)				\
+  ((lb) >= MIN_LEADING_BYTE_PRIVATE_2				\
+   ? PRE_LEADING_BYTE_PRIVATE_2					\
+   : PRE_LEADING_BYTE_PRIVATE_1)
+/************************************************************************/
+/*                     Operations on individual bytes                   */
+/*                             of any format                            */
+/************************************************************************/
+/* Argument `c' should be (unsigned int) or (unsigned char); the first
+   two macros compare `c' directly, so a negative (signed) value would
+   satisfy them. */
+/* Is this byte in the ASCII range 0x00 - 0x7F? */
+#define BYTE_ASCII_P(c) ((c) < 0x80)
+/* Is this byte in the range 0x00 - 0x1F?  Note that SP and DEL are
+   not included. */
+#define BYTE_C0_P(c) ((c) < 0x20)
+/* Is this byte in the range 0x80 - 0x9F?
+   Do some forced casting just to make *sure* things are gotten right:
+   after the unsigned subtraction, any value below 0x80 wraps around to
+   a large number and so fails the `< 0x20' test. */
+#define BYTE_C1_P(c) ((unsigned int) ((unsigned int) (c) - 0x80) < 0x20)
+/************************************************************************/
+/*                     Operations on individual bytes                   */
+/*                       in a Mule-formatted string                     */
+/************************************************************************/
+/* Does this byte represent the first byte of a character?  True for
+   0x00 - 0x9F; bytes 0xA0 and above only occur as non-first bytes. */
+#define BUFBYTE_FIRST_BYTE_P(c) ((c) < 0xA0)
+/* Does this byte represent the first byte of a multi-byte character?
+   True for 0x80 - 0x9F (same test as BYTE_C1_P). */
+#define BUFBYTE_LEADING_BYTE_P(c) BYTE_C1_P (c)
+/************************************************************************/
+/*            Information about a particular character set              */
+/************************************************************************/
+/* Record describing one character set.  Fields are normally read
+   through the CHARSET_* / XCHARSET_* accessor macros defined below
+   rather than directly. */
+struct Lisp_Charset
+{
+struct lcrecord_header header;
+/* Lisp-level attributes of the charset.
+   NOTE(review): their exact contents are not visible in this header;
+   see the code that creates charsets. */
+Lisp_Object name;
+Lisp_Object doc_string, registry;
+Lisp_Object reverse_direction_charset;
+Lisp_Object ccl_program;
+/* Leading byte assigned to this charset (8 bits). */
+unsigned int leading_byte :8;
+/* Number of bytes (1 - 4) required in the internal representation
+for characters in this character set.  This is *not* the
+same as the number of bytes used in the encoding (i.e.
+the "dimension" of the character set).  That value can
+be derived from the TYPE (see CHARSET_DIMENSION). */
+unsigned int rep_bytes :3;
+/* Number of columns a character in this charset takes up, on TTY
+devices.  Not used for X devices. */
+unsigned int columns :2;
+/* Direction of this character set: CHARSET_LEFT_TO_RIGHT or
+CHARSET_RIGHT_TO_LEFT */
+unsigned int direction :1;
+/* Type of this character set (94, 96, 94x94, 96x96), stored as one of
+the CHARSET_TYPE_* constants */
+unsigned int type :2;
+/* Which half of font to be used to display this character set */
+unsigned int graphic :2;
+/* Final byte of this character set in ISO2022 designating escape sequence */
+Bufbyte final;
+};
+/* Lrecord declaration for charset objects, followed by the wrappers
+   that specialize the generic record macros (XRECORD, XSETRECORD,
+   RECORDP, GC_RECORDP, CHECK_RECORD, CONCHECK_RECORD) for the
+   `charset' type. */
+DECLARE_LRECORD (charset, struct Lisp_Charset);
+/* Convert a Lisp_Object to a struct Lisp_Charset pointer. */
+#define XCHARSET(x) XRECORD (x, charset, struct Lisp_Charset)
+/* Store the struct Lisp_Charset pointer P into the Lisp_Object X. */
+#define XSETCHARSET(x, p) XSETRECORD (x, p, charset)
+/* Predicates: is X a charset object? */
+#define CHARSETP(x) RECORDP (x, charset)
+#define GC_CHARSETP(x) GC_RECORDP (x, charset)
+/* Type checks on X, delegated to CHECK_RECORD / CONCHECK_RECORD. */
+#define CHECK_CHARSET(x) CHECK_RECORD (x, charset)
+#define CONCHECK_CHARSET(x) CONCHECK_RECORD (x, charset)
+/* Values for the `type' field of struct Lisp_Charset.  All four fit in
+   the field's two bits. */
+#define CHARSET_TYPE_94  0	/* This charset includes 94    characters. */
+#define CHARSET_TYPE_96  1	/* This charset includes 96    characters. */
+#define CHARSET_TYPE_94X94 2	/* This charset includes 94x94 characters. */
+#define CHARSET_TYPE_96X96 3	/* This charset includes 96x96 characters. */
+/* Values for the one-bit `direction' field of struct Lisp_Charset. */
+#define CHARSET_LEFT_TO_RIGHT		0
+#define CHARSET_RIGHT_TO_LEFT		1
+/* Field accessors.  CS is a `struct Lisp_Charset *'; each macro
+   expands to the corresponding member and so can also be used as an
+   lvalue. */
+#define CHARSET_NAME(cs)	 ((cs)->name)
+#define CHARSET_LEADING_BYTE(cs) ((cs)->leading_byte)
+#define CHARSET_REP_BYTES(cs)	 ((cs)->rep_bytes)
+#define CHARSET_COLUMNS(cs)	 ((cs)->columns)
+#define CHARSET_GRAPHIC(cs)	 ((cs)->graphic)
+#define CHARSET_TYPE(cs)	 ((cs)->type)
+#define CHARSET_DIRECTION(cs)	 ((cs)->direction)
+#define CHARSET_FINAL(cs)	 ((cs)->final)
+#define CHARSET_DOC_STRING(cs)	 ((cs)->doc_string)
+#define CHARSET_REGISTRY(cs)	 ((cs)->registry)
+#define CHARSET_CCL_PROGRAM(cs)  ((cs)->ccl_program)
+#define CHARSET_REVERSE_DIRECTION_CHARSET(cs) ((cs)->reverse_direction_charset)
+/* Dimension of charset CS, i.e. how many position codes identify one
+   of its characters: 1 for the 94 and 96 types, 2 for every other
+   type value. */
+INLINE int CHARSET_DIMENSION (struct Lisp_Charset *cs);
+INLINE int
+CHARSET_DIMENSION (struct Lisp_Charset *cs)
+{
+  switch (CHARSET_TYPE (cs))
+    {
+    case CHARSET_TYPE_94:
+    case CHARSET_TYPE_96:
+      return 1;
+    default:
+      return 2;
+    }
+}
+/* Number of characters along one dimension of charset CS: 94 for the
+   94 and 94x94 types, 96 for every other type value. */
+INLINE int CHARSET_CHARS (struct Lisp_Charset *cs);
+INLINE int
+CHARSET_CHARS (struct Lisp_Charset *cs)
+{
+  switch (CHARSET_TYPE (cs))
+    {
+    case CHARSET_TYPE_94:
+    case CHARSET_TYPE_94X94:
+      return 94;
+    default:
+      return 96;
+    }
+}
+/* Is CS (a struct Lisp_Charset *) a private charset?  Decided purely
+   from its leading byte. */
+#define CHARSET_PRIVATE_P(cs) \
+LEADING_BYTE_PRIVATE_P (CHARSET_LEADING_BYTE (cs))
+/* Lisp_Object variants of the accessors: each converts CS with
+   XCHARSET and then applies the CHARSET_* macro or function of the
+   same name. */
+#define XCHARSET_NAME(cs)	  CHARSET_NAME         (XCHARSET (cs))
+#define XCHARSET_REP_BYTES(cs)	  CHARSET_REP_BYTES    (XCHARSET (cs))
+#define XCHARSET_COLUMNS(cs)	  CHARSET_COLUMNS      (XCHARSET (cs))
+#define XCHARSET_GRAPHIC(cs)      CHARSET_GRAPHIC      (XCHARSET (cs))
+#define XCHARSET_TYPE(cs)	  CHARSET_TYPE         (XCHARSET (cs))
+#define XCHARSET_DIRECTION(cs)	  CHARSET_DIRECTION    (XCHARSET (cs))
+#define XCHARSET_FINAL(cs)	  CHARSET_FINAL        (XCHARSET (cs))
+#define XCHARSET_DOC_STRING(cs)	  CHARSET_DOC_STRING   (XCHARSET (cs))
+#define XCHARSET_REGISTRY(cs)	  CHARSET_REGISTRY     (XCHARSET (cs))
+#define XCHARSET_LEADING_BYTE(cs) CHARSET_LEADING_BYTE (XCHARSET (cs))
+#define XCHARSET_CCL_PROGRAM(cs)  CHARSET_CCL_PROGRAM  (XCHARSET (cs))
+#define XCHARSET_DIMENSION(cs)	  CHARSET_DIMENSION    (XCHARSET (cs))
+#define XCHARSET_CHARS(cs)	  CHARSET_CHARS        (XCHARSET (cs))
+#define XCHARSET_PRIVATE_P(cs)	  CHARSET_PRIVATE_P    (XCHARSET (cs))
+#define XCHARSET_REVERSE_DIRECTION_CHARSET(cs) \
+CHARSET_REVERSE_DIRECTION_CHARSET (XCHARSET (cs))
+/* Charset lookup table, indexed by (leading byte - 128); one slot per
+   possible leading byte. */
+extern Lisp_Object charset_by_leading_byte[NUM_LEADING_BYTES];
+/* Charset lookup table, indexed by type, then final byte, then
+   direction. */
+extern Lisp_Object charset_by_attributes[4][128][2];
+/* Length in bytes of the string representation of a character, indexed
+   by the first byte of that representation (always below 0xA0).
+   The same value could be computed on the fly, e.g. with something like
+     (BYTE_ASCII_P (first_byte) ? 1 :
+      XCHARSET_REP_BYTES (CHARSET_BY_LEADING_BYTE (first_byte)))
+   but a table lookup is faster. */
+extern Bytecount rep_bytes_by_first_byte[0xA0];
+/* Look up the charset registered for leading byte LB. */
+#ifdef ERROR_CHECK_TYPECHECK
+/* The parameter is an int rather than a Bufbyte, even though Bufbyte is
+   the real type of a leading byte, so that out-of-range values reach
+   the assertion instead of being silently truncated. */
+INLINE Lisp_Object CHARSET_BY_LEADING_BYTE (int lb);
+INLINE Lisp_Object
+CHARSET_BY_LEADING_BYTE (int lb)
+{
+  int slot;
+
+  assert (lb >= 0x80 && lb <= 0xFF);
+  slot = lb - 128;
+  return charset_by_leading_byte[slot];
+}
+#else
+#define CHARSET_BY_LEADING_BYTE(lb) (charset_by_leading_byte[(lb) - 128])
+#endif
+/* Look up the charset with the given type, final byte and direction. */
+#define CHARSET_BY_ATTRIBUTES(type, final, dir) \
+  (charset_by_attributes[type][final][dir])
+/* Number of bytes in the string representation of a character whose
+   first byte is FB.  The checking build asserts that FB is a valid
+   first byte (0 <= FB < 0xA0) before indexing the table. */
+#ifdef ERROR_CHECK_TYPECHECK
+INLINE int REP_BYTES_BY_FIRST_BYTE (int fb);
+INLINE int
+REP_BYTES_BY_FIRST_BYTE (int fb)
+{
+  Bytecount nbytes;
+
+  assert (fb >= 0 && fb < 0xA0);
+  nbytes = rep_bytes_by_first_byte[fb];
+  return nbytes;
+}
+#else
+#define REP_BYTES_BY_FIRST_BYTE(fb) (rep_bytes_by_first_byte[fb])
+#endif
+/* Lisp objects for individual charsets; the names parallel the
+   LEADING_BYTE_* constants above.  Vcharset_ascii, Vcharset_control_1
+   and Vcharset_composite are the ones MAKE_CHAR treats specially.
+   NOTE(review): defined and initialized outside this header. */
+extern Lisp_Object Vcharset_ascii;
+extern Lisp_Object Vcharset_control_1;
+extern Lisp_Object Vcharset_latin_1;
+extern Lisp_Object Vcharset_latin_2;
+extern Lisp_Object Vcharset_latin_3;
+extern Lisp_Object Vcharset_latin_4;
+extern Lisp_Object Vcharset_latin_5;
+extern Lisp_Object Vcharset_greek;
+extern Lisp_Object Vcharset_thai;
+extern Lisp_Object Vcharset_arabic;
+extern Lisp_Object Vcharset_hebrew;
+extern Lisp_Object Vcharset_cyrillic;
+extern Lisp_Object Vcharset_japanese_jisx0201_kana;
+extern Lisp_Object Vcharset_japanese_jisx0201_roman;
+extern Lisp_Object Vcharset_japanese_jisx0208_1978;
+extern Lisp_Object Vcharset_japanese_jisx0208;
+extern Lisp_Object Vcharset_japanese_jisx0212;
+extern Lisp_Object Vcharset_korean_ksc5601;
+extern Lisp_Object Vcharset_chinese_gb;
+extern Lisp_Object Vcharset_chinese_big5_1;
+extern Lisp_Object Vcharset_chinese_big5_2;
+extern Lisp_Object Vcharset_chinese_cns11643_1;
+extern Lisp_Object Vcharset_chinese_cns11643_2;
+extern Lisp_Object Vcharset_composite;
+/************************************************************************/
+/*                        Dealing with characters                       */
+/************************************************************************/
+/* Is this character represented by more than one byte in a string?
+   True for every character code of 0x80 or above. */
+#define CHAR_MULTIBYTE_P(c) ((c) >= 0x80)
+/* Is this an ASCII character?  Simply the negation of the above. */
+#define CHAR_ASCII_P(c) (!CHAR_MULTIBYTE_P (c))
+/* The bit fields of character are divided into 3 parts:
+FIELD1(5bits):FIELD2(7bits):FIELD3(7bits)
+i.e. field 1 is bits 14-18, field 2 is bits 7-13 and field 3 is
+bits 0-6. */
+#define CHAR_FIELD1_MASK (0x1F << 14)
+#define CHAR_FIELD2_MASK (0x7F << 7)
+#define CHAR_FIELD3_MASK 0x7F
+/* Macros to access each field of a character code of C.  Each masks
+out the field and shifts it down to bit 0. */
+#define CHAR_FIELD1(c) (((c) & CHAR_FIELD1_MASK) >> 14)
+#define CHAR_FIELD2(c) (((c) & CHAR_FIELD2_MASK) >> 7)
+#define CHAR_FIELD3(c)  ((c) & CHAR_FIELD3_MASK)
+/* Field 1, if non-zero, usually holds a leading byte for a
+dimension-2 charset.  Field 2, if non-zero, usually holds a leading
+byte for a dimension-1 charset. */
+/* Converting between field values and leading bytes.  Each constant is
+the amount to add to a field value to get the leading byte (and to
+subtract from a leading byte to get the field value). */
+#define FIELD2_TO_OFFICIAL_LEADING_BYTE 0x80
+#define FIELD2_TO_PRIVATE_LEADING_BYTE  0x80
+#define FIELD1_TO_OFFICIAL_LEADING_BYTE 0x8F
+#define FIELD1_TO_PRIVATE_LEADING_BYTE  0xE1
+/* Minimum and maximum allowed values for the fields.  With the leading
+byte definitions in this file these work out to:
+field 2 official 0x01 - 0x0D, field 1 official 0x01 - 0x09,
+field 2 private  0x20 - 0x6F, field 1 private  0x0F - 0x1E. */
+#define MIN_CHAR_FIELD2_OFFICIAL \
+(MIN_LEADING_BYTE_OFFICIAL_1 - FIELD2_TO_OFFICIAL_LEADING_BYTE)
+#define MAX_CHAR_FIELD2_OFFICIAL \
+(MAX_LEADING_BYTE_OFFICIAL_1 - FIELD2_TO_OFFICIAL_LEADING_BYTE)
+#define MIN_CHAR_FIELD1_OFFICIAL \
+(MIN_LEADING_BYTE_OFFICIAL_2 - FIELD1_TO_OFFICIAL_LEADING_BYTE)
+#define MAX_CHAR_FIELD1_OFFICIAL \
+(MAX_LEADING_BYTE_OFFICIAL_2 - FIELD1_TO_OFFICIAL_LEADING_BYTE)
+#define MIN_CHAR_FIELD2_PRIVATE \
+(MIN_LEADING_BYTE_PRIVATE_1 - FIELD2_TO_PRIVATE_LEADING_BYTE)
+#define MAX_CHAR_FIELD2_PRIVATE \
+(MAX_LEADING_BYTE_PRIVATE_1 - FIELD2_TO_PRIVATE_LEADING_BYTE)
+#define MIN_CHAR_FIELD1_PRIVATE \
+(MIN_LEADING_BYTE_PRIVATE_2 - FIELD1_TO_PRIVATE_LEADING_BYTE)
+#define MAX_CHAR_FIELD1_PRIVATE \
+(MAX_LEADING_BYTE_PRIVATE_2 - FIELD1_TO_PRIVATE_LEADING_BYTE)
+/* Minimum character code of each <type> character.  With the values
+above: 0x80, 0x1000, 0x4000, 0x3C000 and 0x7C000 respectively, so the
+five ranges are in increasing order. */
+#define MIN_CHAR_OFFICIAL_TYPE9N    (MIN_CHAR_FIELD2_OFFICIAL <<  7)
+#define MIN_CHAR_PRIVATE_TYPE9N     (MIN_CHAR_FIELD2_PRIVATE  <<  7)
+#define MIN_CHAR_OFFICIAL_TYPE9NX9N (MIN_CHAR_FIELD1_OFFICIAL << 14)
+#define MIN_CHAR_PRIVATE_TYPE9NX9N  (MIN_CHAR_FIELD1_PRIVATE  << 14)
+#define MIN_CHAR_COMPOSITION        (0x1F << 14)
+/* Return the leading byte of the charset that character C belongs to.
+   The code space is tested from the bottom up: ASCII, Control-1,
+   dimension-1 charsets, official dimension-2 charsets, private
+   dimension-2 charsets, and finally composite characters.
+   NOTE: official and private dimension-1 charsets share one branch;
+   this takes advantage of the fact that
+   FIELD2_TO_OFFICIAL_LEADING_BYTE and
+   FIELD2_TO_PRIVATE_LEADING_BYTE are the same.
+*/
+INLINE Bufbyte CHAR_LEADING_BYTE (Emchar c);
+INLINE Bufbyte
+CHAR_LEADING_BYTE (Emchar c)
+{
+  if (!CHAR_MULTIBYTE_P (c))
+    return LEADING_BYTE_ASCII;
+
+  if (c < 0xA0)
+    return LEADING_BYTE_CONTROL_1;
+
+  /* Dimension-1 charsets: the leading byte is derived from field 2. */
+  if (c < MIN_CHAR_OFFICIAL_TYPE9NX9N)
+    return CHAR_FIELD2 (c) + FIELD2_TO_OFFICIAL_LEADING_BYTE;
+
+  /* Dimension-2 charsets: the leading byte is derived from field 1,
+     with a different offset for official and private charsets. */
+  if (c < MIN_CHAR_PRIVATE_TYPE9NX9N)
+    return CHAR_FIELD1 (c) + FIELD1_TO_OFFICIAL_LEADING_BYTE;
+
+  if (c < MIN_CHAR_COMPOSITION)
+    return CHAR_FIELD1 (c) + FIELD1_TO_PRIVATE_LEADING_BYTE;
+
+  return LEADING_BYTE_COMPOSITE;
+}
+/* Charset object that character C belongs to. */
+#define CHAR_CHARSET(c) CHARSET_BY_LEADING_BYTE (CHAR_LEADING_BYTE (c))
+/* Build the character code for the character of CHARSET whose
+   position codes are C1 and C2.  C2 is ignored for ASCII, Control-1
+   and all dimension-1 (TYPE9N) charsets.
+   NOTE: official and private dimension-1 charsets are handled by the
+   same branch; this takes advantage of the fact that
+   FIELD2_TO_OFFICIAL_LEADING_BYTE and
+   FIELD2_TO_PRIVATE_LEADING_BYTE are the same.
+*/
+INLINE Emchar MAKE_CHAR (Lisp_Object charset, int c1, int c2);
+INLINE Emchar
+MAKE_CHAR (Lisp_Object charset, int c1, int c2)
+{
+  struct Lisp_Charset *cs;
+  int lb;
+  int field1_offset;
+
+  /* The three special charsets are recognized by identity. */
+  if (EQ (charset, Vcharset_ascii))
+    return c1;
+  if (EQ (charset, Vcharset_control_1))
+    return c1 | 0x80;
+  if (EQ (charset, Vcharset_composite))
+    return (0x1F << 14) | ((c1) << 7) | (c2);
+
+  cs = XCHARSET (charset);
+  lb = CHARSET_LEADING_BYTE (cs);
+
+  /* Dimension 1: the leading byte goes into field 2, C1 into field 3. */
+  if (CHARSET_DIMENSION (cs) == 1)
+    return ((lb - FIELD2_TO_OFFICIAL_LEADING_BYTE) << 7) | (c1);
+
+  /* Dimension 2: the leading byte goes into field 1, using the offset
+     that matches the charset's official/private status. */
+  if (CHARSET_PRIVATE_P (cs))
+    field1_offset = FIELD1_TO_PRIVATE_LEADING_BYTE;
+  else
+    field1_offset = FIELD1_TO_OFFICIAL_LEADING_BYTE;
+
+  return ((lb - field1_offset) << 14) | ((c1) << 7) | (c2);
+}
+/* Splitting a character C into its charset and position codes:
+   CHARSET receives the charset of C, and C1 and C2 receive its
+   position codes.  C2 of a TYPE9N (dimension-1) character is 0.  */
+/* BREAKUP_CHAR_1_UNSAFE assumes that the charset has already been
+   calculated, and just computes c1 and c2: for a dimension-1 charset
+   C1 is field 3 of C and C2 is 0; otherwise C1 is field 2 and C2 is
+   field 3.  C1 and C2 must be lvalues.  C is evaluated more than once,
+   so it must not have side effects.
+   BREAKUP_CHAR also computes and stores the charset.
+   The whole expansion is parenthesized so that the conditional
+   expression stays intact when the macro is used inside a larger
+   expression. */
+#define BREAKUP_CHAR_1_UNSAFE(c, charset, c1, c2)	\
+  (XCHARSET_DIMENSION (charset) == 1			\
+   ? ((c1) = CHAR_FIELD3 (c), (c2) = 0)			\
+   : ((c1) = CHAR_FIELD2 (c),				\
+      (c2) = CHAR_FIELD3 (c)))
+/* Split character C: store its charset in *CHARSET and its position
+   codes in *C1 and *C2 (see BREAKUP_CHAR_1_UNSAFE for how the position
+   codes are chosen). */
+INLINE void breakup_char_1 (Emchar c, Lisp_Object *charset, int *c1, int *c2);
+INLINE void
+breakup_char_1 (Emchar c, Lisp_Object *charset, int *c1, int *c2)
+{
+  Lisp_Object cs = CHAR_CHARSET (c);
+
+  *charset = cs;
+  BREAKUP_CHAR_1_UNSAFE (c, cs, *c1, *c2);
+}
+/* Macro front end for breakup_char_1: CHARSET, C1 and C2 are lvalues
+   whose addresses are passed on. */
+#define BREAKUP_CHAR(c, charset, c1, c2) \
+  breakup_char_1 (c, &(charset), &(c1), &(c2))
+/************************************************************************/
+/*                           Composite characters                       */
+/************************************************************************/
+/* Hash tables for composite characters.
+   NOTE(review): judging by the names, one maps a composite character
+   code to its string and the other maps the string back to the code;
+   confirm against the definitions. */
+extern Lisp_Object Vcomposite_char_int2string_hashtable;
+extern Lisp_Object Vcomposite_char_string2int_hashtable;
+/* NOTE(review): presumably returns the composite character for the
+   LEN bytes at STR -- confirm against the implementation. */
+Emchar lookup_composite_char (Bufbyte *str, int len);
+/* NOTE(review): presumably the inverse of lookup_composite_char --
+   confirm against the implementation. */
+Lisp_Object composite_char_string (Emchar ch);
+/************************************************************************/
+/*                            Exported functions                        */
+/************************************************************************/
+/* Lisp-callable charset lookup primitives, implemented outside this
+   header. */
+Lisp_Object Fget_charset (Lisp_Object);
+Lisp_Object Ffind_charset (Lisp_Object);
+/* Conversions between internally formatted text (Bufbyte) and external
+   bytes (unsigned char).
+   NOTE(review): the meaning of the return values and the required size
+   of the destination buffers are not visible here -- check the
+   definitions before calling. */
+int copy_internal_to_external (CONST Bufbyte *internal, Bytecount len,
+			       unsigned char *external);
+Bytecount copy_external_to_internal (CONST unsigned char *external,
+				     int len, Bufbyte *internal);
+#endif /* _XEMACS_MULE_CHARSET_H */

Mercurial > hg > xemacs-beta

comparison src/mule-charset.h @ 70:131b0175ea99 r20-0b30