Mercurial > hg > xemacs-beta
diff src/mule-coding.c @ 3439:d1754e7f0cea
[xemacs-hg @ 2006-06-03 17:50:39 by aidan]
Just-in-time Unicode code point support.
author | aidan |
---|---|
date | Sat, 03 Jun 2006 17:51:06 +0000 |
parents | 96ec8f16af45 |
children | 42e4605ef1de |
line wrap: on
line diff
--- a/src/mule-coding.c Fri Jun 02 22:18:08 2006 +0000 +++ b/src/mule-coding.c Sat Jun 03 17:51:06 2006 +0000 @@ -96,6 +96,42 @@ return c >= 0xA1 && c <= 0xDF; } +inline static void +dynarr_add_2022_one_dimension (Lisp_Object charset, Ibyte c, + unsigned char charmask, + unsigned_char_dynarr *dst) +{ + if (XCHARSET_ENCODE_AS_UTF_8 (charset)) + { + encode_unicode_char (charset, c & charmask, 0, + dst, UNICODE_UTF_8, 0); + } + else + { + Dynarr_add (dst, c & charmask); + } +} + +inline static void +dynarr_add_2022_two_dimensions (Lisp_Object charset, Ibyte c, + unsigned int ch, + unsigned char charmask, + unsigned_char_dynarr *dst) +{ + if (XCHARSET_ENCODE_AS_UTF_8 (charset)) + { + encode_unicode_char (charset, + ch & charmask, + c & charmask, dst, + UNICODE_UTF_8, 0); + } + else + { + Dynarr_add (dst, ch & charmask); + Dynarr_add (dst, c & charmask); + } +} + /* Convert Shift-JIS data to internal format. */ static Bytecount @@ -671,6 +707,10 @@ ISO_ESC_2_4, /* We've seen ESC $. This indicates that we're designating a multi-byte, rather than a single-byte, character set. */ + ISO_ESC_2_5, /* We've seen ESC %. This indicates an escape to a + Unicode coding system; the only one of these + we're prepared to deal with is UTF-8, which has + the next character as G. */ ISO_ESC_2_8, /* We've seen ESC 0x28, i.e. ESC (. This means designate a 94-character character set into G0. */ @@ -752,11 +792,15 @@ character constructed by overstriking two or more characters). */ #define ISO_STATE_COMPOSITE (1 << 5) +/* If set, we're processing UTF-8 encoded data within ISO-2022 + processing. */ +#define ISO_STATE_UTF_8 (1 << 6) + /* ISO_STATE_LOCK is the mask of flags that remain on until explicitly turned off when in the ISO2022 encoder/decoder. Other flags are turned off at the end of processing each character or escape sequence. */ # define ISO_STATE_LOCK \ - (ISO_STATE_COMPOSITE | ISO_STATE_R2L) + (ISO_STATE_COMPOSITE | ISO_STATE_R2L | ISO_STATE_UTF_8) typedef struct charset_conversion_spec { @@ -922,6 +966,9 @@ Lisp_Object current_charset; int current_half; int current_char_boundary; + + /* Used for handling UTF-8. */ + unsigned char counter; }; static const struct memory_description ccs_description_1[] = @@ -1344,6 +1391,15 @@ } case ISO_ESC: + + /* The only available ISO 2022 sequence in UTF-8 mode is ESC % @, to + exit from it. If we see any other escape sequence, pass it through + in the error handler. */ + if (*flags & ISO_STATE_UTF_8 && '%' != c) + { + return 0; + } + switch (c) { /**** single shift ****/ @@ -1411,6 +1467,10 @@ iso->esc = ISO_ESC_2_4; goto not_done; + case '%': /* Prefix to an escape to or from Unicode. */ + iso->esc = ISO_ESC_2_5; + goto not_done; + default: if (0x28 <= c && c <= 0x2F) { @@ -1433,8 +1493,30 @@ goto error; } - - + /* ISO-IR 196 UTF-8 support. */ + case ISO_ESC_2_5: + if ('G' == c) + { + /* Activate UTF-8 mode. */ + *flags &= ISO_STATE_LOCK; + *flags |= ISO_STATE_UTF_8; + iso->esc = ISO_ESC_NOTHING; + return 1; + } + else if ('@' == c) + { + /* Deactive UTF-8 mode. */ + *flags &= ISO_STATE_LOCK; + *flags &= ~(ISO_STATE_UTF_8); + iso->esc = ISO_ESC_NOTHING; + return 1; + } + else + { + /* Oops, we don't support the other UTF-? coding systems within + ISO 2022, only in their own context. */ + goto error; + } /**** directionality ****/ case ISO_ESC_5_11: /* ISO6429 direction control */ @@ -1822,6 +1904,87 @@ } ch = 0; } + else if (flags & ISO_STATE_UTF_8) + { + unsigned char counter = data->counter; + Ibyte work[MAX_ICHAR_LEN]; + int len; + Lisp_Object chr; + + if (ISO_CODE_ESC == c) + { + /* Allow the escape sequence parser to end the UTF-8 state. */ + flags |= ISO_STATE_ESCAPE; + data->esc = ISO_ESC; + data->esc_bytes_index = 1; + continue; + } + + switch (counter) + { + case 0: + if (c >= 0xfc) + { + ch = c & 0x01; + counter = 5; + } + else if (c >= 0xf8) + { + ch = c & 0x03; + counter = 4; + } + else if (c >= 0xf0) + { + ch = c & 0x07; + counter = 3; + } + else if (c >= 0xe0) + { + ch = c & 0x0f; + counter = 2; + } + else if (c >= 0xc0) + { + ch = c & 0x1f; + counter = 1; + } + else + /* ASCII, or the lower control characters. */ + Dynarr_add (dst, c); + + break; + case 1: + ch = (ch << 6) | (c & 0x3f); + chr = Funicode_to_char(make_int(ch), Qnil); + + if (!NILP (chr)) + { + assert(CHARP(chr)); + len = set_itext_ichar (work, XCHAR(chr)); + Dynarr_add_many (dst, work, len); + } + else + { + /* Shouldn't happen, this code should only be enabled in + XEmacsen with support for all of Unicode. */ + Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208); + Dynarr_add (dst, 34 + 128); + Dynarr_add (dst, 46 + 128); + } + + ch = 0; + counter = 0; + break; + default: + ch = (ch << 6) | (c & 0x3f); + counter--; + } + + if (str->eof) + DECODE_OUTPUT_PARTIAL_CHAR (ch, dst); + + data->counter = counter; + } else if (byte_c0_p (c) || byte_c1_p (c)) { /* Control characters */ @@ -2010,6 +2173,7 @@ } Dynarr_add (dst, ISO_CODE_ESC); + switch (type) { case CHARSET_TYPE_94: @@ -2102,6 +2266,14 @@ { /* Processing ASCII character */ ch = 0; + if (flags & ISO_STATE_UTF_8) + { + Dynarr_add (dst, ISO_CODE_ESC); + Dynarr_add (dst, '%'); + Dynarr_add (dst, '@'); + flags &= ~(ISO_STATE_UTF_8); + } + restore_left_to_right_direction (codesys, dst, &flags, 0); /* Make sure G0 contains ASCII */ @@ -2145,18 +2317,43 @@ Dynarr_add (dst, c); char_boundary = 1; } - else if (ibyte_leading_byte_p (c) || ibyte_leading_byte_p (ch)) { /* Processing Leading Byte */ ch = 0; charset = charset_by_leading_byte (c); if (leading_byte_prefix_p (c)) - ch = c; + { + ch = c; + } + else if (XCHARSET_ENCODE_AS_UTF_8 (charset)) + { + assert (!EQ (charset, Vcharset_control_1) + && !EQ (charset, Vcharset_composite)); + + /* If the character set is to be encoded as UTF-8, the escape + is always the same. */ + if (!(flags & ISO_STATE_UTF_8)) + { + Dynarr_add (dst, ISO_CODE_ESC); + Dynarr_add (dst, '%'); + Dynarr_add (dst, 'G'); + flags |= ISO_STATE_UTF_8; + } + } else if (!EQ (charset, Vcharset_control_1) && !EQ (charset, Vcharset_composite)) { int reg; + /* End the UTF-8 state. */ + if (flags & ISO_STATE_UTF_8) + { + Dynarr_add (dst, ISO_CODE_ESC); + Dynarr_add (dst, '%'); + Dynarr_add (dst, '@'); + flags &= ~(ISO_STATE_UTF_8); + } + ensure_correct_direction (XCHARSET_DIRECTION (charset), codesys, dst, &flags, 0); @@ -2274,12 +2471,14 @@ switch (XCHARSET_REP_BYTES (charset)) { case 2: - Dynarr_add (dst, c & charmask); + dynarr_add_2022_one_dimension (charset, c, + charmask, dst); break; case 3: if (XCHARSET_PRIVATE_P (charset)) { - Dynarr_add (dst, c & charmask); + dynarr_add_2022_one_dimension (charset, c, + charmask, dst); ch = 0; } else if (ch) @@ -2287,6 +2486,9 @@ #ifdef ENABLE_COMPOSITE_CHARS if (EQ (charset, Vcharset_composite)) { + /* #### Hasn't been written to handle composite + characters yet. */ + assert(!XCHARSET_ENCODE_AS_UTF_8 (charset)) if (in_composite) { /* #### Bother! We don't know how to @@ -2310,8 +2512,8 @@ else #endif /* ENABLE_COMPOSITE_CHARS */ { - Dynarr_add (dst, ch & charmask); - Dynarr_add (dst, c & charmask); + dynarr_add_2022_two_dimensions (charset, c, ch, + charmask, dst); } ch = 0; } @@ -2324,8 +2526,8 @@ case 4: if (ch) { - Dynarr_add (dst, ch & charmask); - Dynarr_add (dst, c & charmask); + dynarr_add_2022_two_dimensions (charset, c, ch, + charmask, dst); ch = 0; } else