Mercurial > hg > xemacs-beta
changeset 3952:3584cb2c07db
[xemacs-hg @ 2007-05-13 11:11:28 by aidan]
Support non-BMP UTF-16.
author | aidan |
---|---|
date | Sun, 13 May 2007 11:11:38 +0000 |
parents | 20ac78313587 |
children | c1ae1b2e5b5b |
files | src/ChangeLog src/unicode.c tests/ChangeLog tests/automated/mule-tests.el |
diffstat | 4 files changed, 96 insertions(+), 5 deletions(-) [+] |
line wrap: on
line diff
--- a/src/ChangeLog Sat May 12 21:51:25 2007 +0000 +++ b/src/ChangeLog Sun May 13 11:11:38 2007 +0000 @@ -1,3 +1,10 @@ +2007-04-30 Aidan Kehoe <kehoea@parhasard.net> + + * unicode.c: + * unicode.c (encode_unicode_char_1): + * unicode.c (unicode_convert): + Support non-BMP characters in UTF-16. + 2007-05-12 Aidan Kehoe <kehoea@parhasard.net> * event-Xt.c (x_reset_modifier_mapping):
--- a/src/unicode.c Sat May 12 21:51:25 2007 +0000 +++ b/src/unicode.c Sun May 13 11:11:38 2007 +0000 @@ -200,6 +200,28 @@ Lisp_Object Qutf_8_bom; +/* See the Unicode FAQ, http://www.unicode.org/faq/utf_bom.html#35 for this + algorithm. + + (They also give another, really verbose one, as part of their explanation + of the various planes of the encoding, but we won't use that.) */ + +#define UTF_16_LEAD_OFFSET (0xD800 - (0x10000 >> 10)) +#define UTF_16_SURROGATE_OFFSET (0x10000 - (0xD800 << 10) - 0xDC00) + +#define utf_16_surrogates_to_code(lead, trail) \ + (((lead) << 10) + (trail) + UTF_16_SURROGATE_OFFSET) + +#define CODE_TO_UTF_16_SURROGATES(codepoint, lead, trail) do { \ + int __ctu16s_code = (codepoint); \ + lead = UTF_16_LEAD_OFFSET + (__ctu16s_code >> 10); \ + trail = 0xDC00 + (__ctu16s_code & 0x3FF); \ +} while (0) + +#define valid_utf_16_first_surrogate(ch) (((ch) & 0xFC00) == 0xD800) +#define valid_utf_16_last_surrogate(ch) (((ch) & 0xFC00) == 0xDC00) +#define valid_utf_16_surrogate(ch) (((ch) & 0xF800) == 0xD800) + #ifdef MULE /* Using ints for to_unicode is OK (as long as they are >= 32 bits). @@ -1742,13 +1764,39 @@ case UNICODE_UTF_16: if (little_endian) { - Dynarr_add (dst, (unsigned char) (code & 255)); - Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); + if (code < 0x10000) { + Dynarr_add (dst, (unsigned char) (code & 255)); + Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); + } else { + /* Little endian; least significant byte first. */ + int first, second; + + CODE_TO_UTF_16_SURROGATES(code, first, second); + + Dynarr_add (dst, (unsigned char) (first & 255)); + Dynarr_add (dst, (unsigned char) ((first >> 8) & 255)); + + Dynarr_add (dst, (unsigned char) (second & 255)); + Dynarr_add (dst, (unsigned char) ((second >> 8) & 255)); + } } else { - Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); - Dynarr_add (dst, (unsigned char) (code & 255)); + if (code < 0x10000) { + Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); + Dynarr_add (dst, (unsigned char) (code & 255)); + } else { + /* Big endian; most significant byte first. */ + int first, second; + + CODE_TO_UTF_16_SURROGATES(code, first, second); + + Dynarr_add (dst, (unsigned char) ((first >> 8) & 255)); + Dynarr_add (dst, (unsigned char) (first & 255)); + + Dynarr_add (dst, (unsigned char) ((second >> 8) & 255)); + Dynarr_add (dst, (unsigned char) (second & 255)); + } } break; @@ -1919,11 +1967,16 @@ break; case UNICODE_UTF_16: + if (little_endian) ch = (c << counter) | ch; else ch = (ch << 8) | c; counter += 8; + + if (counter == 16 && valid_utf_16_first_surrogate(ch)) + break; + if (counter == 16) { int tempch = ch; @@ -1931,6 +1984,24 @@ counter = 0; decode_unicode_char (tempch, dst, data, ignore_bom); } + if (counter == 32) + { + int tempch; + /* #### Signalling an error may be a bit extreme. Should + we try and read it in anyway? */ + if (!valid_utf_16_first_surrogate(ch >> 16) + || !valid_utf_16_last_surrogate(ch & 0xFFFF)) + { + signal_error(Qtext_conversion_error, + "Invalid UTF-16 surrogate sequence", + Qunbound); + } + tempch = utf_16_surrogates_to_code((ch >> 16), + (ch & 0xffff)); + ch = 0; + counter = 0; + decode_unicode_char(tempch, dst, data, ignore_bom); + } break; case UNICODE_UCS_4:
--- a/tests/ChangeLog Sat May 12 21:51:25 2007 +0000 +++ b/tests/ChangeLog Sun May 13 11:11:38 2007 +0000 @@ -1,3 +1,8 @@ +2007-04-30 Aidan Kehoe <kehoea@parhasard.net> + + * automated/mule-tests.el (featurep): + Minimal tests of the non-BMP UTF-16 support. + 2007-05-12 Aidan Kehoe <kehoea@parhasard.net> * automated/mule-tests.el:
--- a/tests/automated/mule-tests.el Sat May 12 21:51:25 2007 +0000 +++ b/tests/automated/mule-tests.el Sun May 13 11:11:38 2007 +0000 @@ -341,9 +341,9 @@ 'utf-8 'iso-8859-2)) ) - ;; This is how you suppress output from `message', called by `write-region' (Assert (not (equal name1 name2))) (Assert (not (file-exists-p name1))) + ;; This is how you suppress output from `message', called by `write-region' (Silence-Message (write-region (point-min) (point-max) name1)) (Assert (file-exists-p name1)) @@ -401,6 +401,14 @@ (Assert (equal (concat "\033%G" utf-8-char) (encode-coding-string xemacs-character 'ctext)))))) + (loop + for (code-point encoded) + in '((#x10000 "\xd8\x00\xdc\x00") + (#x10FFFD "\xdb\xff\xdf\xfd")) + do (Assert (equal (encode-coding-string + (decode-char 'ucs code-point) 'utf-16) + encoded))) + ;;--------------------------------------------------------------- ;; Regression test for a couple of CCL-related bugs. ;;---------------------------------------------------------------