Mercurial > hg > xemacs-beta
changeset 4583:2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
src/ChangeLog addition:
2009-01-31 Aidan Kehoe <kehoea@parhasard.net>
* unicode.c (unicode_convert):
Correct little-endian UTF-16 surrogate handling.
tests/ChangeLog addition:
2009-01-31 Aidan Kehoe <kehoea@parhasard.net>
* automated/mule-tests.el:
Test little-endian Unicode surrogates too.
author | Aidan Kehoe <kehoea@parhasard.net> |
---|---|
date | Sat, 31 Jan 2009 13:06:37 +0000 |
parents | 00ed9903a988 |
children | 56e67d42eb04 b25f081370e0 |
files | src/ChangeLog src/unicode.c tests/ChangeLog tests/automated/mule-tests.el |
diffstat | 4 files changed, 60 insertions(+), 21 deletions(-) [+] |
line wrap: on
line diff
--- a/src/ChangeLog Sun Jan 18 12:56:51 2009 +0000
+++ b/src/ChangeLog Sat Jan 31 13:06:37 2009 +0000
@@ -1,3 +1,8 @@
+2009-01-31 Aidan Kehoe <kehoea@parhasard.net>
+
+ * unicode.c (unicode_convert):
+ Correct little-endian UTF-16 surrogate handling.
+
 2009-01-16 Aidan Kehoe <kehoea@parhasard.net>
 
  * chartab.c (print_table_entry):
--- a/src/unicode.c Sun Jan 18 12:56:51 2009 +0000
+++ b/src/unicode.c Sat Jan 31 13:06:37 2009 +0000
@@ -2115,23 +2115,47 @@
             {
               int tempch;
 
-              if (!valid_utf_16_last_surrogate(ch & 0xFFFF))
-                {
-                  DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
-                                      ignore_bom);
-                  DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
-                                      ignore_bom);
-                  DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
-                                      ignore_bom);
-                  DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
-                                      ignore_bom);
-                }
-              else
+              if (little_endian)
                 {
-                  tempch = utf_16_surrogates_to_code((ch >> 16),
-                                                     (ch & 0xffff));
-                  decode_unicode_char(tempch, dst, data, ignore_bom);
+                  if (!valid_utf_16_last_surrogate(ch >> 16))
+                    {
+                      DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
+                                          ignore_bom);
+                      DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+                                          ignore_bom);
+                      DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+                                          ignore_bom);
+                      DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
+                                          ignore_bom);
+                    }
+                  else
+                    {
+                      tempch = utf_16_surrogates_to_code((ch & 0xffff),
+                                                         (ch >> 16));
+                      decode_unicode_char(tempch, dst, data, ignore_bom);
+                    }
                 }
+              else
+                {
+                  if (!valid_utf_16_last_surrogate(ch & 0xFFFF))
+                    {
+                      DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
+                                          ignore_bom);
+                      DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+                                          ignore_bom);
+                      DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+                                          ignore_bom);
+                      DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
+                                          ignore_bom);
+                    }
+                  else
+                    {
+                      tempch = utf_16_surrogates_to_code((ch >> 16),
+                                                         (ch & 0xffff));
+                      decode_unicode_char(tempch, dst, data, ignore_bom);
+                    }
+                }
+
               ch = 0;
               counter = 0;
             }
--- a/tests/ChangeLog Sun Jan 18 12:56:51 2009 +0000
+++ b/tests/ChangeLog Sat Jan 31 13:06:37 2009 +0000
@@ -1,3 +1,8 @@
+2009-01-31 Aidan Kehoe <kehoea@parhasard.net>
+
+ * automated/mule-tests.el:
+ Test little-endian Unicode surrogates too.
+
 2009-01-18 Aidan Kehoe <kehoea@parhasard.net>
 
  * automated/lisp-tests.el: (char-table-with-string):
--- a/tests/automated/mule-tests.el Sun Jan 18 12:56:51 2009 +0000
+++ b/tests/automated/mule-tests.el Sat Jan 31 13:06:37 2009 +0000
@@ -446,12 +446,17 @@
                      (encode-coding-string xemacs-character 'ctext))))))
 
   (loop
-    for (code-point encoded)
-    in '((#x10000 "\xd8\x00\xdc\x00")
-         (#x10FFFD "\xdb\xff\xdf\xfd"))
-    do (Assert (equal (encode-coding-string
-                       (decode-char 'ucs code-point) 'utf-16)
-                      encoded)))
+    for (code-point utf-16-big-endian utf-16-little-endian)
+    in '((#x10000 "\xd8\x00\xdc\x00" "\x00\xd8\x00\xdc")
+         (#x10FFFD "\xdb\xff\xdf\xfd" "\xff\xdb\xfd\xdf"))
+    do
+    (Assert (equal (encode-coding-string
+                    (decode-char 'ucs code-point) 'utf-16)
+                   utf-16-big-endian))
+    (Assert (equal (encode-coding-string
+                    (decode-char 'ucs code-point) 'utf-16-le)
+                   utf-16-little-endian))
+
 
   ;;---------------------------------------------------------------
   ;; Regression test for a couple of CCL-related bugs.