Mercurial > hg > xemacs-beta
diff src/unicode.c @ 4096:1abf84db2c7f
[xemacs-hg @ 2007-08-04 20:00:10 by aidan]
Preserve invalid UTF-8, UTF-16 sequences on encoding, decoding.
author | aidan |
---|---|
date | Sat, 04 Aug 2007 20:00:24 +0000 |
parents | 3584cb2c07db |
children | 75d0292c1bff |
line wrap: on
line diff
--- a/src/unicode.c Fri Aug 03 21:51:12 2007 +0000 +++ b/src/unicode.c Sat Aug 04 20:00:24 2007 +0000 @@ -146,13 +146,6 @@ (1) User-defined charsets: It would be inconvenient to require all dumped user-defined charsets to be reloaded at init time. - (2) Starting up in a non-ISO-8859-1 directory. If we load at run-time, - we don't load the tables until after we've parsed the current - directories, and we run into a real bootstrapping problem, if the - directories themselves are non-ISO-8859-1. This is potentially fixable - once we switch to using Unicode internally, so we don't have to do any - conversion (other than the automatic kind, e.g. UTF-16 to UTF-8). - NB With run-time loading, we load in init-mule-at-startup, in mule-cmds.el. This is called from startup.el, which is quite late in the initialization process -- but data-directory isn't set until then. @@ -192,7 +185,7 @@ convert them back.) */ Lisp_Object Qunicode; -Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7; +Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7, Qutf_32; Lisp_Object Qneed_bom; Lisp_Object Qutf_16_little_endian, Qutf_16_bom; @@ -218,10 +211,6 @@ trail = 0xDC00 + (__ctu16s_code & 0x3FF); \ } while (0) -#define valid_utf_16_first_surrogate(ch) (((ch) & 0xFC00) == 0xD800) -#define valid_utf_16_last_surrogate(ch) (((ch) & 0xFC00) == 0xDC00) -#define valid_utf_16_surrogate(ch) (((ch) & 0xF800) == 0xD800) - #ifdef MULE /* Using ints for to_unicode is OK (as long as they are >= 32 bits). @@ -1703,6 +1692,7 @@ { /* decode */ unsigned char counter; + unsigned char indicated_length; int seen_char; /* encode */ Lisp_Object current_charset; @@ -1716,11 +1706,6 @@ DEFINE_CODING_SYSTEM_TYPE_WITH_DATA (unicode); -/* Decode a UCS-2 or UCS-4 character into a buffer. If the lookup fails, use - <GETA MARK> (U+3013) of JIS X 0208, which means correct character - is not found, instead. - #### do something more appropriate (use blob?) - Danger, Will Robinson! Data loss. Should we signal user? */ static void decode_unicode_char (int ch, unsigned_char_dynarr *dst, struct unicode_coding_stream *data, @@ -1755,9 +1740,32 @@ data->seen_char = 1; } +#define DECODE_ERROR_OCTET(octet, dst, data, ignore_bom) \ + decode_unicode_char ((octet) + UNICODE_ERROR_OCTET_RANGE_START, \ + dst, data, ignore_bom) + +static inline void +indicate_invalid_utf_8 (unsigned char indicated_length, + unsigned char counter, + int ch, unsigned_char_dynarr *dst, + struct unicode_coding_stream *data, + unsigned int ignore_bom) +{ + Binbyte stored = indicated_length - counter; + Binbyte mask = "\x00\x00\xC0\xE0\xF0\xF8\xFC"[indicated_length]; + + while (stored > 0) + { + DECODE_ERROR_OCTET (((ch >> (6 * (stored - 1))) & 0x3f) | mask, + dst, data, ignore_bom); + mask = 0x80, stored--; + } +} + static void encode_unicode_char_1 (int code, unsigned_char_dynarr *dst, - enum unicode_type type, unsigned int little_endian) + enum unicode_type type, unsigned int little_endian, + int write_error_characters_as_such) { switch (type) { @@ -1767,53 +1775,105 @@ if (code < 0x10000) { Dynarr_add (dst, (unsigned char) (code & 255)); Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); - } else { - /* Little endian; least significant byte first. */ - int first, second; - - CODE_TO_UTF_16_SURROGATES(code, first, second); - - Dynarr_add (dst, (unsigned char) (first & 255)); - Dynarr_add (dst, (unsigned char) ((first >> 8) & 255)); - - Dynarr_add (dst, (unsigned char) (second & 255)); - Dynarr_add (dst, (unsigned char) ((second >> 8) & 255)); - } + } else if (write_error_characters_as_such && + code >= UNICODE_ERROR_OCTET_RANGE_START && + code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) + { + Dynarr_add (dst, (unsigned char) ((code & 0xFF))); + } + else if (code < 0x110000) + { + /* Little endian; least significant byte first. */ + int first, second; + + CODE_TO_UTF_16_SURROGATES(code, first, second); + + Dynarr_add (dst, (unsigned char) (first & 255)); + Dynarr_add (dst, (unsigned char) ((first >> 8) & 255)); + + Dynarr_add (dst, (unsigned char) (second & 255)); + Dynarr_add (dst, (unsigned char) ((second >> 8) & 255)); + } + else + { + /* Not valid Unicode. Pass U+FFFD, least significant byte + first. */ + Dynarr_add (dst, (unsigned char) 0xFD); + Dynarr_add (dst, (unsigned char) 0xFF); + } } else { if (code < 0x10000) { Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); Dynarr_add (dst, (unsigned char) (code & 255)); - } else { - /* Big endian; most significant byte first. */ - int first, second; - - CODE_TO_UTF_16_SURROGATES(code, first, second); - - Dynarr_add (dst, (unsigned char) ((first >> 8) & 255)); - Dynarr_add (dst, (unsigned char) (first & 255)); - - Dynarr_add (dst, (unsigned char) ((second >> 8) & 255)); - Dynarr_add (dst, (unsigned char) (second & 255)); - } + } else if (write_error_characters_as_such && + code >= UNICODE_ERROR_OCTET_RANGE_START && + code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) + { + Dynarr_add (dst, (unsigned char) ((code & 0xFF))); + } + else if (code < 0x110000) + { + /* Big endian; most significant byte first. */ + int first, second; + + CODE_TO_UTF_16_SURROGATES(code, first, second); + + Dynarr_add (dst, (unsigned char) ((first >> 8) & 255)); + Dynarr_add (dst, (unsigned char) (first & 255)); + + Dynarr_add (dst, (unsigned char) ((second >> 8) & 255)); + Dynarr_add (dst, (unsigned char) (second & 255)); + } + else + { + /* Not valid Unicode. Pass U+FFFD, most significant byte + first. */ + Dynarr_add (dst, (unsigned char) 0xFF); + Dynarr_add (dst, (unsigned char) 0xFD); + } } break; case UNICODE_UCS_4: + case UNICODE_UTF_32: if (little_endian) { - Dynarr_add (dst, (unsigned char) (code & 255)); - Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); - Dynarr_add (dst, (unsigned char) ((code >> 16) & 255)); - Dynarr_add (dst, (unsigned char) (code >> 24)); + if (write_error_characters_as_such && + code >= UNICODE_ERROR_OCTET_RANGE_START && + code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) + { + Dynarr_add (dst, (unsigned char) ((code & 0xFF))); + } + else + { + /* We generate and accept incorrect sequences here, which is + okay, in the interest of preservation of the user's + data. */ + Dynarr_add (dst, (unsigned char) (code & 255)); + Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); + Dynarr_add (dst, (unsigned char) ((code >> 16) & 255)); + Dynarr_add (dst, (unsigned char) (code >> 24)); + } } else { - Dynarr_add (dst, (unsigned char) (code >> 24)); - Dynarr_add (dst, (unsigned char) ((code >> 16) & 255)); - Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); - Dynarr_add (dst, (unsigned char) (code & 255)); + if (write_error_characters_as_such && + code >= UNICODE_ERROR_OCTET_RANGE_START && + code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) + { + Dynarr_add (dst, (unsigned char) ((code & 0xFF))); + } + else + { + /* We generate and accept incorrect sequences here, which is okay, + in the interest of preservation of the user's data. */ + Dynarr_add (dst, (unsigned char) (code >> 24)); + Dynarr_add (dst, (unsigned char) ((code >> 16) & 255)); + Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); + Dynarr_add (dst, (unsigned char) (code & 255)); + } } break; @@ -1842,11 +1902,25 @@ } else if (code <= 0x3ffffff) { - Dynarr_add (dst, (unsigned char) ((code >> 24) | 0xf8)); - Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80)); - Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80)); - Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80)); - Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80)); + +#if !(UNICODE_ERROR_OCTET_RANGE_START > 0x1fffff \ + && UNICODE_ERROR_OCTET_RANGE_START < 0x3ffffff) +#error "This code needs to be rewritten. " +#endif + if (write_error_characters_as_such && + code >= UNICODE_ERROR_OCTET_RANGE_START && + code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) + { + Dynarr_add (dst, (unsigned char) ((code & 0xFF))); + } + else + { + Dynarr_add (dst, (unsigned char) ((code >> 24) | 0xf8)); + Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80)); + Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80)); + Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80)); + Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80)); + } } else { @@ -1870,7 +1944,8 @@ void encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h, int USED_IF_MULE (l), unsigned_char_dynarr *dst, - enum unicode_type type, unsigned int little_endian) + enum unicode_type type, unsigned int little_endian, + int write_error_characters_as_such) { #ifdef MULE int code = ichar_to_unicode (make_ichar (charset, h & 127, l & 127)); @@ -1896,7 +1971,8 @@ int code = h; #endif /* MULE */ - encode_unicode_char_1 (code, dst, type, little_endian); + encode_unicode_char_1 (code, dst, type, little_endian, + write_error_characters_as_such); } static Bytecount @@ -1915,6 +1991,8 @@ if (str->direction == CODING_DECODE) { unsigned char counter = data->counter; + unsigned char indicated_length + = data->indicated_length; while (n--) { @@ -1923,46 +2001,92 @@ switch (type) { case UNICODE_UTF_8: - switch (counter) - { - case 0: - if (c >= 0xfc) - { - ch = c & 0x01; - counter = 5; - } - else if (c >= 0xf8) - { - ch = c & 0x03; - counter = 4; - } - else if (c >= 0xf0) - { - ch = c & 0x07; - counter = 3; - } - else if (c >= 0xe0) - { - ch = c & 0x0f; - counter = 2; - } - else if (c >= 0xc0) - { - ch = c & 0x1f; - counter = 1; - } - else - decode_unicode_char (c, dst, data, ignore_bom); - break; - case 1: - ch = (ch << 6) | (c & 0x3f); - decode_unicode_char (ch, dst, data, ignore_bom); - ch = 0; - counter = 0; - break; - default: - ch = (ch << 6) | (c & 0x3f); - counter--; + if (0 == counter) + { + if (0 == (c & 0x80)) + { + /* ASCII. */ + decode_unicode_char (c, dst, data, ignore_bom); + } + else if (0 == (c & 0x40)) + { + /* Highest bit set, second highest not--there's + something wrong. */ + DECODE_ERROR_OCTET (c, dst, data, ignore_bom); + } + else if (0 == (c & 0x20)) + { + ch = c & 0x1f; + counter = 1; + indicated_length = 2; + } + else if (0 == (c & 0x10)) + { + ch = c & 0x0f; + counter = 2; + indicated_length = 3; + } + else if (0 == (c & 0x08)) + { + ch = c & 0x0f; + counter = 3; + indicated_length = 4; + } + else + { + /* We don't supports lengths longer than 4 in + external-format data. */ + DECODE_ERROR_OCTET (c, dst, data, ignore_bom); + + } + } + else + { + /* counter != 0 */ + if ((0 == (c & 0x80)) || (0 != (c & 0x40))) + { + indicate_invalid_utf_8(indicated_length, + counter, + ch, dst, data, ignore_bom); + if (c & 0x80) + { + DECODE_ERROR_OCTET (c, dst, data, ignore_bom); + } + else + { + /* The character just read is ASCII. Treat it as + such. */ + decode_unicode_char (c, dst, data, ignore_bom); + } + ch = 0; + counter = 0; + } + else + { + ch = (ch << 6) | (c & 0x3f); + counter--; + /* Just processed the final byte. Emit the character. */ + if (!counter) + { + /* Don't accept over-long sequences, surrogates, + or codes above #x10FFFF. */ + if ((ch < 0x80) || + ((ch < 0x800) && indicated_length > 2) || + ((ch < 0x10000) && indicated_length > 3) || + valid_utf_16_surrogate(ch) || (ch > 0x110000)) + { + indicate_invalid_utf_8(indicated_length, + counter, + ch, dst, data, + ignore_bom); + } + else + { + decode_unicode_char (ch, dst, data, ignore_bom); + } + ch = 0; + } + } } break; @@ -1972,39 +2096,51 @@ ch = (c << counter) | ch; else ch = (ch << 8) | c; + counter += 8; - if (counter == 16 && valid_utf_16_first_surrogate(ch)) - break; - - if (counter == 16) - { + if (16 == counter) + { int tempch = ch; + + if (valid_utf_16_first_surrogate(ch)) + { + break; + } ch = 0; counter = 0; decode_unicode_char (tempch, dst, data, ignore_bom); } - if (counter == 32) + else if (32 == counter) { int tempch; - /* #### Signalling an error may be a bit extreme. Should - we try and read it in anyway? */ - if (!valid_utf_16_first_surrogate(ch >> 16) - || !valid_utf_16_last_surrogate(ch & 0xFFFF)) + + if (!valid_utf_16_last_surrogate(ch & 0xFFFF)) { - signal_error(Qtext_conversion_error, - "Invalid UTF-16 surrogate sequence", - Qunbound); + DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET (ch & 0xFF, dst, data, + ignore_bom); } - tempch = utf_16_surrogates_to_code((ch >> 16), - (ch & 0xffff)); + else + { + tempch = utf_16_surrogates_to_code((ch >> 16), + (ch & 0xffff)); + decode_unicode_char(tempch, dst, data, ignore_bom); + } ch = 0; counter = 0; - decode_unicode_char(tempch, dst, data, ignore_bom); - } + } + else + assert(8 == counter || 24 == counter); break; case UNICODE_UCS_4: + case UNICODE_UTF_32: if (little_endian) ch = (c << counter) | ch; else @@ -2012,15 +2148,43 @@ counter += 8; if (counter == 32) { - int tempch = ch; + if (ch > 0x10ffff) + { + /* ch is not a legal Unicode character. We're fine + with that in UCS-4, though not in UTF-32. */ + if (UNICODE_UCS_4 == type && ch < 0x80000000) + { + decode_unicode_char (ch, dst, data, ignore_bom); + } + else if (little_endian) + { + DECODE_ERROR_OCTET (ch & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data, + ignore_bom); + } + else + { + DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET (ch & 0xFF, dst, data, + ignore_bom); + } + } + else + { + decode_unicode_char (ch, dst, data, ignore_bom); + } ch = 0; counter = 0; - if (tempch < 0) - { - /* !!#### indicate an error */ - tempch = '~'; - } - decode_unicode_char (tempch, dst, data, ignore_bom); } break; @@ -2032,10 +2196,67 @@ } } - if (str->eof) - DECODE_OUTPUT_PARTIAL_CHAR (ch, dst); + + if (str->eof && ch) + { + switch (type) + { + case UNICODE_UTF_8: + indicate_invalid_utf_8(indicated_length, + counter, ch, dst, data, + ignore_bom); + break; + + case UNICODE_UTF_16: + case UNICODE_UCS_4: + case UNICODE_UTF_32: + if (8 == counter) + { + DECODE_ERROR_OCTET (ch, dst, data, ignore_bom); + } + else if (16 == counter) + { + if (little_endian) + { + DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom); + DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, + ignore_bom); + } + else + { + DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom); + } + } + else if (24 == counter) + { + if (little_endian) + { + DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom); + DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, + ignore_bom); + } + else + { + DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET (ch & 0xFF, dst, data, + ignore_bom); + } + } + else assert(0); + break; + } + ch = 0; + } data->counter = counter; + data->indicated_length = indicated_length; } else { @@ -2054,7 +2275,7 @@ if (XCODING_SYSTEM_UNICODE_NEED_BOM (str->codesys) && !data->wrote_bom) { - encode_unicode_char_1 (0xFEFF, dst, type, little_endian); + encode_unicode_char_1 (0xFEFF, dst, type, little_endian, 1); data->wrote_bom = 1; } @@ -2068,7 +2289,7 @@ { /* Processing ASCII character */ ch = 0; encode_unicode_char (Vcharset_ascii, c, 0, dst, type, - little_endian); + little_endian, 1); char_boundary = 1; } @@ -2092,20 +2313,20 @@ for the rationale behind subtracting #xa0 from the character's code. */ encode_unicode_char (Vcharset_control_1, c - 0xa0, 0, dst, - type, little_endian); + type, little_endian, 1); else { switch (XCHARSET_REP_BYTES (charset)) { case 2: encode_unicode_char (charset, c, 0, dst, type, - little_endian); + little_endian, 1); break; case 3: if (XCHARSET_PRIVATE_P (charset)) { encode_unicode_char (charset, c, 0, dst, type, - little_endian); + little_endian, 1); ch = 0; } else if (ch) @@ -2119,7 +2340,7 @@ handle this yet. */ encode_unicode_char (Vcharset_ascii, '~', 0, dst, type, - little_endian); + little_endian, 1); } else { @@ -2138,7 +2359,7 @@ else #endif /* ENABLE_COMPOSITE_CHARS */ encode_unicode_char (charset, ch, c, dst, type, - little_endian); + little_endian, 1); ch = 0; } else @@ -2151,7 +2372,7 @@ if (ch) { encode_unicode_char (charset, ch, c, dst, type, - little_endian); + little_endian, 1); ch = 0; } else @@ -2521,6 +2742,8 @@ type = UNICODE_UTF_7; else if (EQ (value, Qucs_4)) type = UNICODE_UCS_4; + else if (EQ (value, Qutf_32)) + type = UNICODE_UTF_32; else invalid_constant ("Invalid Unicode type", key); @@ -2546,6 +2769,7 @@ case UNICODE_UTF_8: return Qutf_8; case UNICODE_UTF_7: return Qutf_7; case UNICODE_UCS_4: return Qucs_4; + case UNICODE_UTF_32: return Qutf_32; default: ABORT (); } } @@ -2620,6 +2844,7 @@ DEFSYMBOL (Qunicode); DEFSYMBOL (Qucs_4); DEFSYMBOL (Qutf_16); + DEFSYMBOL (Qutf_32); DEFSYMBOL (Qutf_8); DEFSYMBOL (Qutf_7);