Mercurial > hg > xemacs-beta
diff src/mule-coding.c @ 4096:1abf84db2c7f
[xemacs-hg @ 2007-08-04 20:00:10 by aidan]
Preserve invalid UTF-8, UTF-16 sequences on encoding, decoding.
author | aidan |
---|---|
date | Sat, 04 Aug 2007 20:00:24 +0000 |
parents | 42e4605ef1de |
children | 383ab474a241 |
line wrap: on
line diff
--- a/src/mule-coding.c Fri Aug 03 21:51:12 2007 +0000 +++ b/src/mule-coding.c Sat Aug 04 20:00:24 2007 +0000 @@ -104,7 +104,7 @@ if (XCHARSET_ENCODE_AS_UTF_8 (charset)) { encode_unicode_char (charset, c & charmask, 0, - dst, UNICODE_UTF_8, 0); + dst, UNICODE_UTF_8, 0, 0); } else { @@ -123,7 +123,7 @@ encode_unicode_char (charset, ch & charmask, c & charmask, dst, - UNICODE_UTF_8, 0); + UNICODE_UTF_8, 0, 0); } else { @@ -969,6 +969,7 @@ /* Used for handling UTF-8. */ unsigned char counter; + unsigned char indicated_length; }; static const struct memory_description ccs_description_1[] = @@ -1804,6 +1805,39 @@ } } +/* Note that this name conflicts with a function in unicode.c. */ +static void +decode_unicode_char (int ucs, unsigned_char_dynarr *dst) +{ + Ibyte work[MAX_ICHAR_LEN]; + int len; + Lisp_Object chr; + + chr = Funicode_to_char(make_int(ucs), Qnil); + assert (!NILP(chr)); + len = set_itext_ichar (work, XCHAR(chr)); + Dynarr_add_many (dst, work, len); +} + +#define DECODE_ERROR_OCTET(octet, dst) \ + decode_unicode_char ((octet) + UNICODE_ERROR_OCTET_RANGE_START, dst) + +static inline void +indicate_invalid_utf_8 (unsigned char indicated_length, + unsigned char counter, + int ch, unsigned_char_dynarr *dst) +{ + Binbyte stored = indicated_length - counter; + Binbyte mask = "\x00\x00\xC0\xE0\xF0\xF8\xFC"[indicated_length]; + + while (stored > 0) + { + DECODE_ERROR_OCTET (((ch >> (6 * (stored - 1))) & 0x3f) | mask, + dst); + mask = 0x80, stored--; + } +} + /* Convert ISO2022-format data to internal format. */ static Bytecount @@ -1907,9 +1941,7 @@ else if (flags & ISO_STATE_UTF_8) { unsigned char counter = data->counter; - Ibyte work[MAX_ICHAR_LEN]; - int len; - Lisp_Object chr; + unsigned char indicated_length = data->indicated_length; if (ISO_CODE_ESC == c) { @@ -1920,73 +1952,126 @@ continue; } - switch (counter) - { - case 0: - if (c >= 0xfc) - { - ch = c & 0x01; - counter = 5; - } - else if (c >= 0xf8) - { - ch = c & 0x03; - counter = 4; - } - else if (c >= 0xf0) - { - ch = c & 0x07; - counter = 3; - } - else if (c >= 0xe0) - { - ch = c & 0x0f; - counter = 2; - } - else if (c >= 0xc0) - { - ch = c & 0x1f; - counter = 1; - } - else - /* ASCII, or the lower control characters. - - Perhaps we should signal an error if the character is in - the range 0x80-0xc0; this is illegal UTF-8. */ - Dynarr_add (dst, (c & 0x7f)); - - break; - case 1: - ch = (ch << 6) | (c & 0x3f); - chr = Funicode_to_char(make_int(ch), Qnil); - - if (!NILP (chr)) - { - assert(CHARP(chr)); - len = set_itext_ichar (work, XCHAR(chr)); - Dynarr_add_many (dst, work, len); - } - else - { - /* Shouldn't happen, this code should only be enabled in - XEmacsen with support for all of Unicode. */ - Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208); - Dynarr_add (dst, 34 + 128); - Dynarr_add (dst, 46 + 128); - } - - ch = 0; - counter = 0; - break; - default: - ch = (ch << 6) | (c & 0x3f); - counter--; - } - - if (str->eof) - DECODE_OUTPUT_PARTIAL_CHAR (ch, dst); + if (0 == counter) + { + if (0 == (c & 0x80)) + { + /* ASCII. */ + decode_unicode_char (c, dst); + } + else if (0 == (c & 0x40)) + { + /* Highest bit set, second highest not--there's + something wrong. */ + DECODE_ERROR_OCTET (c, dst); + } + else if (0 == (c & 0x20)) + { + ch = c & 0x1f; + counter = 1; + indicated_length = 2; + } + else if (0 == (c & 0x10)) + { + ch = c & 0x0f; + counter = 2; + indicated_length = 3; + } + else if (0 == (c & 0x08)) + { + ch = c & 0x0f; + counter = 3; + indicated_length = 4; + } + /* We support lengths longer than 4 here, since we want to + represent UTF-8 error chars as distinct from the + corresponding ISO 8859-1 characters in escape-quoted. + + However, we can't differentiate UTF-8 error chars as + written to disk, and UTF-8 errors in escape-quoted. This + is not a big problem; + non-Unicode-chars-encoded-as-UTF-8-in-ISO-2022 is not + deployed, in practice, so if such a sequence of octets + occurs, XEmacs generated it. */ + else if (0 == (c & 0x04)) + { + ch = c & 0x03; + counter = 4; + indicated_length = 5; + } + else if (0 == (c & 0x02)) + { + ch = c & 0x01; + counter = 5; + indicated_length = 6; + } + else + { + /* #xFF is not a valid leading byte in any form of + UTF-8. */ + DECODE_ERROR_OCTET (c, dst); + + } + } + else + { + /* counter != 0 */ + if ((0 == (c & 0x80)) || (0 != (c & 0x40))) + { + indicate_invalid_utf_8(indicated_length, + counter, + ch, dst); + if (c & 0x80) + { + DECODE_ERROR_OCTET (c, dst); + } + else + { + /* The character just read is ASCII. Treat it as + such. */ + decode_unicode_char (c, dst); + } + ch = 0; + counter = 0; + } + else + { + ch = (ch << 6) | (c & 0x3f); + counter--; + + /* Just processed the final byte. Emit the character. */ + if (!counter) + { + /* Don't accept over-long sequences, or surrogates. */ + if ((ch < 0x80) || + ((ch < 0x800) && indicated_length > 2) || + ((ch < 0x10000) && indicated_length > 3) || + /* We accept values above #x110000 in + escape-quoted, though not in UTF-8. */ + /* (ch > 0x110000) || */ + valid_utf_16_surrogate(ch)) + { + indicate_invalid_utf_8(indicated_length, + counter, + ch, dst); + } + else + { + decode_unicode_char (ch, dst); + } + ch = 0; + } + } + } + + if (str->eof && ch) + { + DECODE_ERROR_OCTET (ch, dst); + ch = 0; + } data->counter = counter; + data->indicated_length = indicated_length; } else if (byte_c0_p (c) || byte_c1_p (c)) { /* Control characters */