Mercurial > hg > xemacs-beta
changeset 4096:1abf84db2c7f
[xemacs-hg @ 2007-08-04 20:00:10 by aidan]
Preserve invalid UTF-8, UTF-16 sequences on encoding, decoding.
author | aidan |
---|---|
date | Sat, 04 Aug 2007 20:00:24 +0000 |
parents | bff7e065cfdc |
children | 50932d98a7f9 |
files | lisp/ChangeLog lisp/unicode.el src/ChangeLog src/charset.h src/lisp.h src/lread.c src/mule-coding.c src/unicode.c |
diffstat | 8 files changed, 618 insertions(+), 217 deletions(-) [+] |
line wrap: on
line diff
--- a/lisp/ChangeLog Fri Aug 03 21:51:12 2007 +0000 +++ b/lisp/ChangeLog Sat Aug 04 20:00:24 2007 +0000 @@ -1,3 +1,13 @@ +2007-08-04 Aidan Kehoe <kehoea@parhasard.net> + + * unicode.el: + * unicode.el (utf-32): + * unicode.el (utf-32-little-endian): + Add UTF-32 coding systems. + + * unicode.el (decode-char): + Only accept valid Unicode in this function. + 2007-08-02 Mike Sperber <mike@xemacs.org> * startup.el (startup-setup-paths): Fix typo in init expression
--- a/lisp/unicode.el Fri Aug 03 21:51:12 2007 +0000 +++ b/lisp/unicode.el Sat Aug 04 20:00:24 2007 +0000 @@ -233,6 +233,26 @@ little-endian t)) (make-coding-system + 'utf-32 'unicode + "UTF-32" + '(mnemonic "UTF32" + documentation + "UTF-32 Unicode encoding -- fixed-width four-byte encoding, +characters less than #x10FFFF are not supported. " + unicode-type utf-32)) + +(make-coding-system + 'utf-32-little-endian 'unicode + "UTF-32 Little Endian" + '(mnemonic "UTF32-LE" + documentation + "Little-endian version of UTF-32 Unicode encoding. + +A fixed-width four-byte encoding, characters less than #x10FFFF are not +supported. " + unicode-type ucs-4 little-endian t)) + +(make-coding-system 'utf-8 'unicode "UTF-8" '(mnemonic "UTF8" @@ -274,6 +294,10 @@ (defun decode-char (quote-ucs code &optional restriction) "FSF compatibility--return Mule character with Unicode codepoint CODE. The second argument must be 'ucs, the third argument is ignored. " + ;; We're prepared to accept invalid Unicode in unicode-to-char, but not in + ;; this function, which is the API that should actually be used, since + ;; it's available in GNU and in Mule-UCS. + (check-argument-range code #x0 #x10FFFF) (assert (eq quote-ucs 'ucs) t "Sorry, decode-char doesn't yet support anything but the UCS. ") (unicode-to-char code))
--- a/src/ChangeLog Fri Aug 03 21:51:12 2007 +0000 +++ b/src/ChangeLog Sat Aug 04 20:00:24 2007 +0000 @@ -1,3 +1,50 @@ +2007-08-04 Aidan Kehoe <kehoea@parhasard.net> + + * charset.h: + * charset.h (enum unicode_type): + Add UNICODE_UTF_32. + * lisp.h: + Add Qutf_32. + * lread.c (read_unicode_escape): + Error on an invalid Unicode escape; error on no mapping, as GNU does. + + * mule-coding.c: + * mule-coding.c (dynarr_add_2022_one_dimension): + * mule-coding.c (dynarr_add_2022_two_dimensions): + * mule-coding.c (struct iso2022_coding_stream): + * mule-coding.c (decode_unicode_char): + * mule-coding.c (indicate_invalid_utf_8): + * mule-coding.c (iso2022_decode): + * unicode.c: + * unicode.c (struct unicode_coding_stream): + * unicode.c (decode_unicode_char): + * unicode.c (DECODE_ERROR_OCTET): + * unicode.c (indicate_invalid_utf_8): + * unicode.c (encode_unicode_char_1): + * unicode.c (encode_unicode_char): + * unicode.c (unicode_convert): + * unicode.c (unicode_putprop): + * unicode.c (unicode_getprop): + * unicode.c (syms_of_unicode): + Make UTF-8 and UTF-16 handling more robust; indicate error + sequences when decoding, passing the octets as distinct from the + corresponding ISO8859-1 characters, and (by default) writing them + to disk on encoding. Don't accept over-long UTF-8 sequences, codes + >= #x110000, or UTF-16 surrogates on reading in the utf-8 coding + system; represent them as error sequences. + + Do accept code points above #x110000 in the ISO IR 196 handling, + since we decode Unicode error sequences to "Unicode" code points + starting at 0x200000, and will need to save them as such in + escape-quoted. Do not accept over-long UTF-8 sequences or UTF-16 + surrogates in escape-quoted. + + This change means that when a non-UTF-8 file is opened as UTF-8, + a change is made, and the file is immediately saved, the non-ASCII + characters are not corrupted. In Europe, this is a distinct win. + + Add UCS-4, UTF-32 as coding systems. 
+ 2007-07-26 Aidan Kehoe <kehoea@parhasard.net> * mule-ccl.c (ccl_driver):
--- a/src/charset.h Fri Aug 03 21:51:12 2007 +0000 +++ b/src/charset.h Sat Aug 04 20:00:24 2007 +0000 @@ -567,12 +567,20 @@ UNICODE_UTF_16, UNICODE_UTF_8, UNICODE_UTF_7, - UNICODE_UCS_4 + UNICODE_UCS_4, + UNICODE_UTF_32 }; void encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h, int USED_IF_MULE (l), unsigned_char_dynarr *dst, - enum unicode_type type, unsigned int little_endian); + enum unicode_type type, unsigned int little_endian, + int write_error_characters_as_such); + +#define UNICODE_ERROR_OCTET_RANGE_START 0x200000 + +#define valid_utf_16_first_surrogate(ch) (((ch) & 0xFC00) == 0xD800) +#define valid_utf_16_last_surrogate(ch) (((ch) & 0xFC00) == 0xDC00) +#define valid_utf_16_surrogate(ch) (((ch) & 0xF800) == 0xD800) void set_charset_registries(Lisp_Object charset, Lisp_Object registries);
--- a/src/lisp.h Fri Aug 03 21:51:12 2007 +0000 +++ b/src/lisp.h Sat Aug 04 20:00:24 2007 +0000 @@ -5488,7 +5488,7 @@ void free_charset_unicode_tables (Lisp_Object charset); void recalculate_unicode_precedence (void); extern Lisp_Object Qunicode; -extern Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7; +extern Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7, Qutf_32; #ifdef MEMORY_USAGE_STATS Bytecount compute_from_unicode_table_size (Lisp_Object charset, struct overhead_stats *stats);
--- a/src/lread.c Fri Aug 03 21:51:12 2007 +0000 +++ b/src/lread.c Sat Aug 04 20:00:24 2007 +0000 @@ -1694,24 +1694,26 @@ } } + if (i > 0x110000 || i < 0) + { + syntax_error ("Not a Unicode code point", make_int(i)); + } + lisp_char = Funicode_to_char(make_int(i), Qnil); if (EQ(Qnil, lisp_char)) { - /* This is ugly and horrible and trashes the user's data, but - it's what unicode.c does. In the future, unicode-to-char - should not return nil. */ -#ifdef MULE - i = make_ichar (Vcharset_japanese_jisx0208, 34 + 128, 46 + 128); -#else - i = '~'; -#endif - return i; + /* Will happen on non-Mule. Silent corruption is what happens + elsewhere, and we used to do that to be consistent, but GNU signals an error, + so people writing portable code need to be able to handle that, and + given a choice I prefer that behaviour. + + An undesirable aspect to this error is that the code point is shown + as a decimal integer, which is mostly unreadable. */ + syntax_error ("Unsupported Unicode code point", make_int(i)); } - else - { - return XCHAR(lisp_char); - } + + return XCHAR(lisp_char); }
--- a/src/mule-coding.c Fri Aug 03 21:51:12 2007 +0000 +++ b/src/mule-coding.c Sat Aug 04 20:00:24 2007 +0000 @@ -104,7 +104,7 @@ if (XCHARSET_ENCODE_AS_UTF_8 (charset)) { encode_unicode_char (charset, c & charmask, 0, - dst, UNICODE_UTF_8, 0); + dst, UNICODE_UTF_8, 0, 0); } else { @@ -123,7 +123,7 @@ encode_unicode_char (charset, ch & charmask, c & charmask, dst, - UNICODE_UTF_8, 0); + UNICODE_UTF_8, 0, 0); } else { @@ -969,6 +969,7 @@ /* Used for handling UTF-8. */ unsigned char counter; + unsigned char indicated_length; }; static const struct memory_description ccs_description_1[] = @@ -1804,6 +1805,39 @@ } } +/* Note that this name conflicts with a function in unicode.c. */ +static void +decode_unicode_char (int ucs, unsigned_char_dynarr *dst) +{ + Ibyte work[MAX_ICHAR_LEN]; + int len; + Lisp_Object chr; + + chr = Funicode_to_char(make_int(ucs), Qnil); + assert (!NILP(chr)); + len = set_itext_ichar (work, XCHAR(chr)); + Dynarr_add_many (dst, work, len); +} + +#define DECODE_ERROR_OCTET(octet, dst) \ + decode_unicode_char ((octet) + UNICODE_ERROR_OCTET_RANGE_START, dst) + +static inline void +indicate_invalid_utf_8 (unsigned char indicated_length, + unsigned char counter, + int ch, unsigned_char_dynarr *dst) +{ + Binbyte stored = indicated_length - counter; + Binbyte mask = "\x00\x00\xC0\xE0\xF0\xF8\xFC"[indicated_length]; + + while (stored > 0) + { + DECODE_ERROR_OCTET (((ch >> (6 * (stored - 1))) & 0x3f) | mask, + dst); + mask = 0x80, stored--; + } +} + /* Convert ISO2022-format data to internal format. 
*/ static Bytecount @@ -1907,9 +1941,7 @@ else if (flags & ISO_STATE_UTF_8) { unsigned char counter = data->counter; - Ibyte work[MAX_ICHAR_LEN]; - int len; - Lisp_Object chr; + unsigned char indicated_length = data->indicated_length; if (ISO_CODE_ESC == c) { @@ -1920,73 +1952,126 @@ continue; } - switch (counter) - { - case 0: - if (c >= 0xfc) - { - ch = c & 0x01; - counter = 5; - } - else if (c >= 0xf8) - { - ch = c & 0x03; - counter = 4; - } - else if (c >= 0xf0) - { - ch = c & 0x07; - counter = 3; - } - else if (c >= 0xe0) - { - ch = c & 0x0f; - counter = 2; - } - else if (c >= 0xc0) - { - ch = c & 0x1f; - counter = 1; - } - else - /* ASCII, or the lower control characters. - - Perhaps we should signal an error if the character is in - the range 0x80-0xc0; this is illegal UTF-8. */ - Dynarr_add (dst, (c & 0x7f)); - - break; - case 1: - ch = (ch << 6) | (c & 0x3f); - chr = Funicode_to_char(make_int(ch), Qnil); - - if (!NILP (chr)) - { - assert(CHARP(chr)); - len = set_itext_ichar (work, XCHAR(chr)); - Dynarr_add_many (dst, work, len); - } - else - { - /* Shouldn't happen, this code should only be enabled in - XEmacsen with support for all of Unicode. */ - Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208); - Dynarr_add (dst, 34 + 128); - Dynarr_add (dst, 46 + 128); - } - - ch = 0; - counter = 0; - break; - default: - ch = (ch << 6) | (c & 0x3f); - counter--; - } - - if (str->eof) - DECODE_OUTPUT_PARTIAL_CHAR (ch, dst); + if (0 == counter) + { + if (0 == (c & 0x80)) + { + /* ASCII. */ + decode_unicode_char (c, dst); + } + else if (0 == (c & 0x40)) + { + /* Highest bit set, second highest not--there's + something wrong. 
*/ + DECODE_ERROR_OCTET (c, dst); + } + else if (0 == (c & 0x20)) + { + ch = c & 0x1f; + counter = 1; + indicated_length = 2; + } + else if (0 == (c & 0x10)) + { + ch = c & 0x0f; + counter = 2; + indicated_length = 3; + } + else if (0 == (c & 0x08)) + { + ch = c & 0x0f; + counter = 3; + indicated_length = 4; + } + /* We support lengths longer than 4 here, since we want to + represent UTF-8 error chars as distinct from the + corresponding ISO 8859-1 characters in escape-quoted. + + However, we can't differentiate UTF-8 error chars as + written to disk, and UTF-8 errors in escape-quoted. This + is not a big problem; + non-Unicode-chars-encoded-as-UTF-8-in-ISO-2022 is not + deployed, in practice, so if such a sequence of octets + occurs, XEmacs generated it. */ + else if (0 == (c & 0x04)) + { + ch = c & 0x03; + counter = 4; + indicated_length = 5; + } + else if (0 == (c & 0x02)) + { + ch = c & 0x01; + counter = 5; + indicated_length = 6; + } + else + { + /* #xFF is not a valid leading byte in any form of + UTF-8. */ + DECODE_ERROR_OCTET (c, dst); + + } + } + else + { + /* counter != 0 */ + if ((0 == (c & 0x80)) || (0 != (c & 0x40))) + { + indicate_invalid_utf_8(indicated_length, + counter, + ch, dst); + if (c & 0x80) + { + DECODE_ERROR_OCTET (c, dst); + } + else + { + /* The character just read is ASCII. Treat it as + such. */ + decode_unicode_char (c, dst); + } + ch = 0; + counter = 0; + } + else + { + ch = (ch << 6) | (c & 0x3f); + counter--; + + /* Just processed the final byte. Emit the character. */ + if (!counter) + { + /* Don't accept over-long sequences, or surrogates. */ + if ((ch < 0x80) || + ((ch < 0x800) && indicated_length > 2) || + ((ch < 0x10000) && indicated_length > 3) || + /* We accept values above #x110000 in + escape-quoted, though not in UTF-8. 
*/ + /* (ch > 0x110000) || */ + valid_utf_16_surrogate(ch)) + { + indicate_invalid_utf_8(indicated_length, + counter, + ch, dst); + } + else + { + decode_unicode_char (ch, dst); + } + ch = 0; + } + } + } + + if (str->eof && ch) + { + DECODE_ERROR_OCTET (ch, dst); + ch = 0; + } data->counter = counter; + data->indicated_length = indicated_length; } else if (byte_c0_p (c) || byte_c1_p (c)) { /* Control characters */
--- a/src/unicode.c Fri Aug 03 21:51:12 2007 +0000 +++ b/src/unicode.c Sat Aug 04 20:00:24 2007 +0000 @@ -146,13 +146,6 @@ (1) User-defined charsets: It would be inconvenient to require all dumped user-defined charsets to be reloaded at init time. - (2) Starting up in a non-ISO-8859-1 directory. If we load at run-time, - we don't load the tables until after we've parsed the current - directories, and we run into a real bootstrapping problem, if the - directories themselves are non-ISO-8859-1. This is potentially fixable - once we switch to using Unicode internally, so we don't have to do any - conversion (other than the automatic kind, e.g. UTF-16 to UTF-8). - NB With run-time loading, we load in init-mule-at-startup, in mule-cmds.el. This is called from startup.el, which is quite late in the initialization process -- but data-directory isn't set until then. @@ -192,7 +185,7 @@ convert them back.) */ Lisp_Object Qunicode; -Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7; +Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7, Qutf_32; Lisp_Object Qneed_bom; Lisp_Object Qutf_16_little_endian, Qutf_16_bom; @@ -218,10 +211,6 @@ trail = 0xDC00 + (__ctu16s_code & 0x3FF); \ } while (0) -#define valid_utf_16_first_surrogate(ch) (((ch) & 0xFC00) == 0xD800) -#define valid_utf_16_last_surrogate(ch) (((ch) & 0xFC00) == 0xDC00) -#define valid_utf_16_surrogate(ch) (((ch) & 0xF800) == 0xD800) - #ifdef MULE /* Using ints for to_unicode is OK (as long as they are >= 32 bits). @@ -1703,6 +1692,7 @@ { /* decode */ unsigned char counter; + unsigned char indicated_length; int seen_char; /* encode */ Lisp_Object current_charset; @@ -1716,11 +1706,6 @@ DEFINE_CODING_SYSTEM_TYPE_WITH_DATA (unicode); -/* Decode a UCS-2 or UCS-4 character into a buffer. If the lookup fails, use - <GETA MARK> (U+3013) of JIS X 0208, which means correct character - is not found, instead. - #### do something more appropriate (use blob?) - Danger, Will Robinson! Data loss. Should we signal user? 
*/ static void decode_unicode_char (int ch, unsigned_char_dynarr *dst, struct unicode_coding_stream *data, @@ -1755,9 +1740,32 @@ data->seen_char = 1; } +#define DECODE_ERROR_OCTET(octet, dst, data, ignore_bom) \ + decode_unicode_char ((octet) + UNICODE_ERROR_OCTET_RANGE_START, \ + dst, data, ignore_bom) + +static inline void +indicate_invalid_utf_8 (unsigned char indicated_length, + unsigned char counter, + int ch, unsigned_char_dynarr *dst, + struct unicode_coding_stream *data, + unsigned int ignore_bom) +{ + Binbyte stored = indicated_length - counter; + Binbyte mask = "\x00\x00\xC0\xE0\xF0\xF8\xFC"[indicated_length]; + + while (stored > 0) + { + DECODE_ERROR_OCTET (((ch >> (6 * (stored - 1))) & 0x3f) | mask, + dst, data, ignore_bom); + mask = 0x80, stored--; + } +} + static void encode_unicode_char_1 (int code, unsigned_char_dynarr *dst, - enum unicode_type type, unsigned int little_endian) + enum unicode_type type, unsigned int little_endian, + int write_error_characters_as_such) { switch (type) { @@ -1767,53 +1775,105 @@ if (code < 0x10000) { Dynarr_add (dst, (unsigned char) (code & 255)); Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); - } else { - /* Little endian; least significant byte first. */ - int first, second; - - CODE_TO_UTF_16_SURROGATES(code, first, second); - - Dynarr_add (dst, (unsigned char) (first & 255)); - Dynarr_add (dst, (unsigned char) ((first >> 8) & 255)); - - Dynarr_add (dst, (unsigned char) (second & 255)); - Dynarr_add (dst, (unsigned char) ((second >> 8) & 255)); - } + } else if (write_error_characters_as_such && + code >= UNICODE_ERROR_OCTET_RANGE_START && + code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) + { + Dynarr_add (dst, (unsigned char) ((code & 0xFF))); + } + else if (code < 0x110000) + { + /* Little endian; least significant byte first. 
*/ + int first, second; + + CODE_TO_UTF_16_SURROGATES(code, first, second); + + Dynarr_add (dst, (unsigned char) (first & 255)); + Dynarr_add (dst, (unsigned char) ((first >> 8) & 255)); + + Dynarr_add (dst, (unsigned char) (second & 255)); + Dynarr_add (dst, (unsigned char) ((second >> 8) & 255)); + } + else + { + /* Not valid Unicode. Pass U+FFFD, least significant byte + first. */ + Dynarr_add (dst, (unsigned char) 0xFD); + Dynarr_add (dst, (unsigned char) 0xFF); + } } else { if (code < 0x10000) { Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); Dynarr_add (dst, (unsigned char) (code & 255)); - } else { - /* Big endian; most significant byte first. */ - int first, second; - - CODE_TO_UTF_16_SURROGATES(code, first, second); - - Dynarr_add (dst, (unsigned char) ((first >> 8) & 255)); - Dynarr_add (dst, (unsigned char) (first & 255)); - - Dynarr_add (dst, (unsigned char) ((second >> 8) & 255)); - Dynarr_add (dst, (unsigned char) (second & 255)); - } + } else if (write_error_characters_as_such && + code >= UNICODE_ERROR_OCTET_RANGE_START && + code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) + { + Dynarr_add (dst, (unsigned char) ((code & 0xFF))); + } + else if (code < 0x110000) + { + /* Big endian; most significant byte first. */ + int first, second; + + CODE_TO_UTF_16_SURROGATES(code, first, second); + + Dynarr_add (dst, (unsigned char) ((first >> 8) & 255)); + Dynarr_add (dst, (unsigned char) (first & 255)); + + Dynarr_add (dst, (unsigned char) ((second >> 8) & 255)); + Dynarr_add (dst, (unsigned char) (second & 255)); + } + else + { + /* Not valid Unicode. Pass U+FFFD, most significant byte + first. 
*/ + Dynarr_add (dst, (unsigned char) 0xFF); + Dynarr_add (dst, (unsigned char) 0xFD); + } } break; case UNICODE_UCS_4: + case UNICODE_UTF_32: if (little_endian) { - Dynarr_add (dst, (unsigned char) (code & 255)); - Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); - Dynarr_add (dst, (unsigned char) ((code >> 16) & 255)); - Dynarr_add (dst, (unsigned char) (code >> 24)); + if (write_error_characters_as_such && + code >= UNICODE_ERROR_OCTET_RANGE_START && + code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) + { + Dynarr_add (dst, (unsigned char) ((code & 0xFF))); + } + else + { + /* We generate and accept incorrect sequences here, which is + okay, in the interest of preservation of the user's + data. */ + Dynarr_add (dst, (unsigned char) (code & 255)); + Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); + Dynarr_add (dst, (unsigned char) ((code >> 16) & 255)); + Dynarr_add (dst, (unsigned char) (code >> 24)); + } } else { - Dynarr_add (dst, (unsigned char) (code >> 24)); - Dynarr_add (dst, (unsigned char) ((code >> 16) & 255)); - Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); - Dynarr_add (dst, (unsigned char) (code & 255)); + if (write_error_characters_as_such && + code >= UNICODE_ERROR_OCTET_RANGE_START && + code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) + { + Dynarr_add (dst, (unsigned char) ((code & 0xFF))); + } + else + { + /* We generate and accept incorrect sequences here, which is okay, + in the interest of preservation of the user's data. 
*/ + Dynarr_add (dst, (unsigned char) (code >> 24)); + Dynarr_add (dst, (unsigned char) ((code >> 16) & 255)); + Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); + Dynarr_add (dst, (unsigned char) (code & 255)); + } } break; @@ -1842,11 +1902,25 @@ } else if (code <= 0x3ffffff) { - Dynarr_add (dst, (unsigned char) ((code >> 24) | 0xf8)); - Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80)); - Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80)); - Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80)); - Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80)); + +#if !(UNICODE_ERROR_OCTET_RANGE_START > 0x1fffff \ + && UNICODE_ERROR_OCTET_RANGE_START < 0x3ffffff) +#error "This code needs to be rewritten. " +#endif + if (write_error_characters_as_such && + code >= UNICODE_ERROR_OCTET_RANGE_START && + code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) + { + Dynarr_add (dst, (unsigned char) ((code & 0xFF))); + } + else + { + Dynarr_add (dst, (unsigned char) ((code >> 24) | 0xf8)); + Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80)); + Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80)); + Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80)); + Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80)); + } } else { @@ -1870,7 +1944,8 @@ void encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h, int USED_IF_MULE (l), unsigned_char_dynarr *dst, - enum unicode_type type, unsigned int little_endian) + enum unicode_type type, unsigned int little_endian, + int write_error_characters_as_such) { #ifdef MULE int code = ichar_to_unicode (make_ichar (charset, h & 127, l & 127)); @@ -1896,7 +1971,8 @@ int code = h; #endif /* MULE */ - encode_unicode_char_1 (code, dst, type, little_endian); + encode_unicode_char_1 (code, dst, type, little_endian, + write_error_characters_as_such); } static Bytecount @@ -1915,6 +1991,8 @@ if (str->direction == CODING_DECODE) { unsigned char counter = 
data->counter; + unsigned char indicated_length + = data->indicated_length; while (n--) { @@ -1923,46 +2001,92 @@ switch (type) { case UNICODE_UTF_8: - switch (counter) - { - case 0: - if (c >= 0xfc) - { - ch = c & 0x01; - counter = 5; - } - else if (c >= 0xf8) - { - ch = c & 0x03; - counter = 4; - } - else if (c >= 0xf0) - { - ch = c & 0x07; - counter = 3; - } - else if (c >= 0xe0) - { - ch = c & 0x0f; - counter = 2; - } - else if (c >= 0xc0) - { - ch = c & 0x1f; - counter = 1; - } - else - decode_unicode_char (c, dst, data, ignore_bom); - break; - case 1: - ch = (ch << 6) | (c & 0x3f); - decode_unicode_char (ch, dst, data, ignore_bom); - ch = 0; - counter = 0; - break; - default: - ch = (ch << 6) | (c & 0x3f); - counter--; + if (0 == counter) + { + if (0 == (c & 0x80)) + { + /* ASCII. */ + decode_unicode_char (c, dst, data, ignore_bom); + } + else if (0 == (c & 0x40)) + { + /* Highest bit set, second highest not--there's + something wrong. */ + DECODE_ERROR_OCTET (c, dst, data, ignore_bom); + } + else if (0 == (c & 0x20)) + { + ch = c & 0x1f; + counter = 1; + indicated_length = 2; + } + else if (0 == (c & 0x10)) + { + ch = c & 0x0f; + counter = 2; + indicated_length = 3; + } + else if (0 == (c & 0x08)) + { + ch = c & 0x0f; + counter = 3; + indicated_length = 4; + } + else + { + /* We don't support lengths longer than 4 in + external-format data. */ + DECODE_ERROR_OCTET (c, dst, data, ignore_bom); + + } + } + else + { + /* counter != 0 */ + if ((0 == (c & 0x80)) || (0 != (c & 0x40))) + { + indicate_invalid_utf_8(indicated_length, + counter, + ch, dst, data, ignore_bom); + if (c & 0x80) + { + DECODE_ERROR_OCTET (c, dst, data, ignore_bom); + } + else + { + /* The character just read is ASCII. Treat it as + such. */ + decode_unicode_char (c, dst, data, ignore_bom); + } + ch = 0; + counter = 0; + } + else + { + ch = (ch << 6) | (c & 0x3f); + counter--; + /* Just processed the final byte. Emit the character. 
*/ + if (!counter) + { + /* Don't accept over-long sequences, surrogates, + or codes above #x10FFFF. */ + if ((ch < 0x80) || + ((ch < 0x800) && indicated_length > 2) || + ((ch < 0x10000) && indicated_length > 3) || + valid_utf_16_surrogate(ch) || (ch > 0x110000)) + { + indicate_invalid_utf_8(indicated_length, + counter, + ch, dst, data, + ignore_bom); + } + else + { + decode_unicode_char (ch, dst, data, ignore_bom); + } + ch = 0; + } + } } break; @@ -1972,39 +2096,51 @@ ch = (c << counter) | ch; else ch = (ch << 8) | c; + counter += 8; - if (counter == 16 && valid_utf_16_first_surrogate(ch)) - break; - - if (counter == 16) - { + if (16 == counter) + { int tempch = ch; + + if (valid_utf_16_first_surrogate(ch)) + { + break; + } ch = 0; counter = 0; decode_unicode_char (tempch, dst, data, ignore_bom); } - if (counter == 32) + else if (32 == counter) { int tempch; - /* #### Signalling an error may be a bit extreme. Should - we try and read it in anyway? */ - if (!valid_utf_16_first_surrogate(ch >> 16) - || !valid_utf_16_last_surrogate(ch & 0xFFFF)) + + if (!valid_utf_16_last_surrogate(ch & 0xFFFF)) { - signal_error(Qtext_conversion_error, - "Invalid UTF-16 surrogate sequence", - Qunbound); + DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET (ch & 0xFF, dst, data, + ignore_bom); } - tempch = utf_16_surrogates_to_code((ch >> 16), - (ch & 0xffff)); + else + { + tempch = utf_16_surrogates_to_code((ch >> 16), + (ch & 0xffff)); + decode_unicode_char(tempch, dst, data, ignore_bom); + } ch = 0; counter = 0; - decode_unicode_char(tempch, dst, data, ignore_bom); - } + } + else + assert(8 == counter || 24 == counter); break; case UNICODE_UCS_4: + case UNICODE_UTF_32: if (little_endian) ch = (c << counter) | ch; else @@ -2012,15 +2148,43 @@ counter += 8; if (counter == 32) { - int tempch = ch; + if (ch > 0x10ffff) 
+ { + /* ch is not a legal Unicode character. We're fine + with that in UCS-4, though not in UTF-32. */ + if (UNICODE_UCS_4 == type && ch < 0x80000000) + { + decode_unicode_char (ch, dst, data, ignore_bom); + } + else if (little_endian) + { + DECODE_ERROR_OCTET (ch & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data, + ignore_bom); + } + else + { + DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET (ch & 0xFF, dst, data, + ignore_bom); + } + } + else + { + decode_unicode_char (ch, dst, data, ignore_bom); + } ch = 0; counter = 0; - if (tempch < 0) - { - /* !!#### indicate an error */ - tempch = '~'; - } - decode_unicode_char (tempch, dst, data, ignore_bom); } break; @@ -2032,10 +2196,67 @@ } } - if (str->eof) - DECODE_OUTPUT_PARTIAL_CHAR (ch, dst); + + if (str->eof && ch) + { + switch (type) + { + case UNICODE_UTF_8: + indicate_invalid_utf_8(indicated_length, + counter, ch, dst, data, + ignore_bom); + break; + + case UNICODE_UTF_16: + case UNICODE_UCS_4: + case UNICODE_UTF_32: + if (8 == counter) + { + DECODE_ERROR_OCTET (ch, dst, data, ignore_bom); + } + else if (16 == counter) + { + if (little_endian) + { + DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom); + DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, + ignore_bom); + } + else + { + DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom); + } + } + else if (24 == counter) + { + if (little_endian) + { + DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom); + DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, + ignore_bom); + } + else + { + 
DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET (ch & 0xFF, dst, data, + ignore_bom); + } + } + else assert(0); + break; + } + ch = 0; + } data->counter = counter; + data->indicated_length = indicated_length; } else { @@ -2054,7 +2275,7 @@ if (XCODING_SYSTEM_UNICODE_NEED_BOM (str->codesys) && !data->wrote_bom) { - encode_unicode_char_1 (0xFEFF, dst, type, little_endian); + encode_unicode_char_1 (0xFEFF, dst, type, little_endian, 1); data->wrote_bom = 1; } @@ -2068,7 +2289,7 @@ { /* Processing ASCII character */ ch = 0; encode_unicode_char (Vcharset_ascii, c, 0, dst, type, - little_endian); + little_endian, 1); char_boundary = 1; } @@ -2092,20 +2313,20 @@ for the rationale behind subtracting #xa0 from the character's code. */ encode_unicode_char (Vcharset_control_1, c - 0xa0, 0, dst, - type, little_endian); + type, little_endian, 1); else { switch (XCHARSET_REP_BYTES (charset)) { case 2: encode_unicode_char (charset, c, 0, dst, type, - little_endian); + little_endian, 1); break; case 3: if (XCHARSET_PRIVATE_P (charset)) { encode_unicode_char (charset, c, 0, dst, type, - little_endian); + little_endian, 1); ch = 0; } else if (ch) @@ -2119,7 +2340,7 @@ handle this yet. 
*/ encode_unicode_char (Vcharset_ascii, '~', 0, dst, type, - little_endian); + little_endian, 1); } else { @@ -2138,7 +2359,7 @@ else #endif /* ENABLE_COMPOSITE_CHARS */ encode_unicode_char (charset, ch, c, dst, type, - little_endian); + little_endian, 1); ch = 0; } else @@ -2151,7 +2372,7 @@ if (ch) { encode_unicode_char (charset, ch, c, dst, type, - little_endian); + little_endian, 1); ch = 0; } else @@ -2521,6 +2742,8 @@ type = UNICODE_UTF_7; else if (EQ (value, Qucs_4)) type = UNICODE_UCS_4; + else if (EQ (value, Qutf_32)) + type = UNICODE_UTF_32; else invalid_constant ("Invalid Unicode type", key); @@ -2546,6 +2769,7 @@ case UNICODE_UTF_8: return Qutf_8; case UNICODE_UTF_7: return Qutf_7; case UNICODE_UCS_4: return Qucs_4; + case UNICODE_UTF_32: return Qutf_32; default: ABORT (); } } @@ -2620,6 +2844,7 @@ DEFSYMBOL (Qunicode); DEFSYMBOL (Qucs_4); DEFSYMBOL (Qutf_16); + DEFSYMBOL (Qutf_32); DEFSYMBOL (Qutf_8); DEFSYMBOL (Qutf_7);