Mercurial > hg > xemacs-beta
comparison src/unicode.c @ 3952:3584cb2c07db
[xemacs-hg @ 2007-05-13 11:11:28 by aidan]
Support non-BMP UTF-16.
author | aidan |
---|---|
date | Sun, 13 May 2007 11:11:38 +0000 |
parents | 6b2ef948e140 |
children | 1abf84db2c7f |
comparison
equal
deleted
inserted
replaced
3951:20ac78313587 | 3952:3584cb2c07db |
---|---|
197 | 197 |
198 Lisp_Object Qutf_16_little_endian, Qutf_16_bom; | 198 Lisp_Object Qutf_16_little_endian, Qutf_16_bom; |
199 Lisp_Object Qutf_16_little_endian_bom; | 199 Lisp_Object Qutf_16_little_endian_bom; |
200 | 200 |
201 Lisp_Object Qutf_8_bom; | 201 Lisp_Object Qutf_8_bom; |
202 | |
/* See the Unicode FAQ, http://www.unicode.org/faq/utf_bom.html#35 for this
   algorithm.

   (They also give another, really verbose one, as part of their explanation
   of the various planes of the encoding, but we won't use that.) */

/* Added to (code >> 10) to produce the lead (first) surrogate. */
#define UTF_16_LEAD_OFFSET (0xD800 - (0x10000 >> 10))

/* Added to (lead << 10) + trail to recover the code point; it cancels the
   0xD800 and 0xDC00 surrogate bases and restores the 0x10000 bias. */
#define UTF_16_SURROGATE_OFFSET (0x10000 - (0xD800 << 10) - 0xDC00)

/* Combine a lead and a trail surrogate into the code point they encode.
   Neither argument is validated here; use the valid_utf_16_* predicates
   below first.  Each argument is evaluated exactly once. */
#define utf_16_surrogates_to_code(lead, trail) \
  (((lead) << 10) + (trail) + UTF_16_SURROGATE_OFFSET)

/* Split CODEPOINT into its UTF-16 surrogate pair, storing the result in the
   lvalues LEAD and TRAIL.  CODEPOINT is evaluated once.  No range check is
   done; the visible callers only use this for values >= 0x10000.

   The temporary deliberately avoids a double-underscore name, since such
   identifiers are reserved for the implementation. */
#define CODE_TO_UTF_16_SURROGATES(codepoint, lead, trail) do {	\
    int ctu16s_code_ = (codepoint);				\
    (lead) = UTF_16_LEAD_OFFSET + (ctu16s_code_ >> 10);		\
    (trail) = 0xDC00 + (ctu16s_code_ & 0x3FF);			\
  } while (0)

/* True if CH is a lead surrogate, i.e. in 0xD800..0xDBFF. */
#define valid_utf_16_first_surrogate(ch) (((ch) & 0xFC00) == 0xD800)
/* True if CH is a trail surrogate, i.e. in 0xDC00..0xDFFF. */
#define valid_utf_16_last_surrogate(ch) (((ch) & 0xFC00) == 0xDC00)
/* True if CH is any surrogate, i.e. in 0xD800..0xDFFF. */
#define valid_utf_16_surrogate(ch) (((ch) & 0xF800) == 0xD800)
202 | 224 |
203 #ifdef MULE | 225 #ifdef MULE |
204 | 226 |
205 /* Using ints for to_unicode is OK (as long as they are >= 32 bits). | 227 /* Using ints for to_unicode is OK (as long as they are >= 32 bits). |
206 In from_unicode, we're converting from Mule characters, which means | 228 In from_unicode, we're converting from Mule characters, which means |
1740 switch (type) | 1762 switch (type) |
1741 { | 1763 { |
1742 case UNICODE_UTF_16: | 1764 case UNICODE_UTF_16: |
1743 if (little_endian) | 1765 if (little_endian) |
1744 { | 1766 { |
1745 Dynarr_add (dst, (unsigned char) (code & 255)); | 1767 if (code < 0x10000) { |
1746 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); | 1768 Dynarr_add (dst, (unsigned char) (code & 255)); |
1769 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); | |
1770 } else { | |
1771 /* Little endian; least significant byte first. */ | |
1772 int first, second; | |
1773 | |
1774 CODE_TO_UTF_16_SURROGATES(code, first, second); | |
1775 | |
1776 Dynarr_add (dst, (unsigned char) (first & 255)); | |
1777 Dynarr_add (dst, (unsigned char) ((first >> 8) & 255)); | |
1778 | |
1779 Dynarr_add (dst, (unsigned char) (second & 255)); | |
1780 Dynarr_add (dst, (unsigned char) ((second >> 8) & 255)); | |
1781 } | |
1747 } | 1782 } |
1748 else | 1783 else |
1749 { | 1784 { |
1750 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); | 1785 if (code < 0x10000) { |
1751 Dynarr_add (dst, (unsigned char) (code & 255)); | 1786 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); |
1787 Dynarr_add (dst, (unsigned char) (code & 255)); | |
1788 } else { | |
1789 /* Big endian; most significant byte first. */ | |
1790 int first, second; | |
1791 | |
1792 CODE_TO_UTF_16_SURROGATES(code, first, second); | |
1793 | |
1794 Dynarr_add (dst, (unsigned char) ((first >> 8) & 255)); | |
1795 Dynarr_add (dst, (unsigned char) (first & 255)); | |
1796 | |
1797 Dynarr_add (dst, (unsigned char) ((second >> 8) & 255)); | |
1798 Dynarr_add (dst, (unsigned char) (second & 255)); | |
1799 } | |
1752 } | 1800 } |
1753 break; | 1801 break; |
1754 | 1802 |
1755 case UNICODE_UCS_4: | 1803 case UNICODE_UCS_4: |
1756 if (little_endian) | 1804 if (little_endian) |
1917 counter--; | 1965 counter--; |
1918 } | 1966 } |
1919 break; | 1967 break; |
1920 | 1968 |
1921 case UNICODE_UTF_16: | 1969 case UNICODE_UTF_16: |
1970 | |
1922 if (little_endian) | 1971 if (little_endian) |
1923 ch = (c << counter) | ch; | 1972 ch = (c << counter) | ch; |
1924 else | 1973 else |
1925 ch = (ch << 8) | c; | 1974 ch = (ch << 8) | c; |
1926 counter += 8; | 1975 counter += 8; |
1976 | |
1977 if (counter == 16 && valid_utf_16_first_surrogate(ch)) | |
1978 break; | |
1979 | |
1927 if (counter == 16) | 1980 if (counter == 16) |
1928 { | 1981 { |
1929 int tempch = ch; | 1982 int tempch = ch; |
1930 ch = 0; | 1983 ch = 0; |
1931 counter = 0; | 1984 counter = 0; |
1932 decode_unicode_char (tempch, dst, data, ignore_bom); | 1985 decode_unicode_char (tempch, dst, data, ignore_bom); |
1986 } | |
1987 if (counter == 32) | |
1988 { | |
1989 int tempch; | |
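1990 /* #### Signalling an error may be a bit extreme. Should | |
1991 we try to read it in anyway? NOTE(review): on the little-endian path (ch = (c << counter) | ch) the first 16-bit unit ends up in the low half of ch, so the lead/trail halves tested below look swapped for little-endian input -- confirm. */ | |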
1992 if (!valid_utf_16_first_surrogate(ch >> 16) | |
1993 || !valid_utf_16_last_surrogate(ch & 0xFFFF)) | |
1994 { | |
1995 signal_error(Qtext_conversion_error, | |
1996 "Invalid UTF-16 surrogate sequence", | |
1997 Qunbound); | |
1998 } | |
1999 tempch = utf_16_surrogates_to_code((ch >> 16), | |
2000 (ch & 0xffff)); | |
2001 ch = 0; | |
2002 counter = 0; | |
2003 decode_unicode_char(tempch, dst, data, ignore_bom); | |
1933 } | 2004 } |
1934 break; | 2005 break; |
1935 | 2006 |
1936 case UNICODE_UCS_4: | 2007 case UNICODE_UCS_4: |
1937 if (little_endian) | 2008 if (little_endian) |