comparison src/unicode.c @ 3952:3584cb2c07db

[xemacs-hg @ 2007-05-13 11:11:28 by aidan] Support non-BMP UTF-16.
author aidan
date Sun, 13 May 2007 11:11:38 +0000
parents 6b2ef948e140
children 1abf84db2c7f
comparison
equal deleted inserted replaced
3951:20ac78313587 3952:3584cb2c07db
197 197
198 Lisp_Object Qutf_16_little_endian, Qutf_16_bom; 198 Lisp_Object Qutf_16_little_endian, Qutf_16_bom;
199 Lisp_Object Qutf_16_little_endian_bom; 199 Lisp_Object Qutf_16_little_endian_bom;
200 200
201 Lisp_Object Qutf_8_bom; 201 Lisp_Object Qutf_8_bom;
202
203 /* See the Unicode FAQ, http://www.unicode.org/faq/utf_bom.html#35 for this
204 algorithm.
205
206 (They also give another, really verbose one, as part of their explanation
207 of the various planes of the encoding, but we won't use that.) */
208 /* The two offsets fold the constant terms together, so each conversion below is one shift and one or two adds. */
209 #define UTF_16_LEAD_OFFSET (0xD800 - (0x10000 >> 10))
210 #define UTF_16_SURROGATE_OFFSET (0x10000 - (0xD800 << 10) - 0xDC00)
211 /* Combine a lead and a trail surrogate into a single code point; (0xD800, 0xDC00) yields 0x10000. No validation is done here. */
212 #define utf_16_surrogates_to_code(lead, trail) \
213 (((lead) << 10) + (trail) + UTF_16_SURROGATE_OFFSET)
214 /* Split CODEPOINT (evaluated once) into the lvalues LEAD and TRAIL; visible callers only use it for codes >= 0x10000. NOTE(review): names containing `__' are reserved for the implementation in C -- consider renaming the temporary. */
215 #define CODE_TO_UTF_16_SURROGATES(codepoint, lead, trail) do { \
216 int __ctu16s_code = (codepoint); \
217 lead = UTF_16_LEAD_OFFSET + (__ctu16s_code >> 10); \
218 trail = 0xDC00 + (__ctu16s_code & 0x3FF); \
219 } while (0)
220 /* Range tests: first (lead) is 0xD800-0xDBFF, last (trail) is 0xDC00-0xDFFF, the third accepts either. Only the masked bits are examined, so bits above bit 15 of CH are ignored. */
221 #define valid_utf_16_first_surrogate(ch) (((ch) & 0xFC00) == 0xD800)
222 #define valid_utf_16_last_surrogate(ch) (((ch) & 0xFC00) == 0xDC00)
223 #define valid_utf_16_surrogate(ch) (((ch) & 0xF800) == 0xD800)
202 224
203 #ifdef MULE 225 #ifdef MULE
204 226
205 /* Using ints for to_unicode is OK (as long as they are >= 32 bits). 227 /* Using ints for to_unicode is OK (as long as they are >= 32 bits).
206 In from_unicode, we're converting from Mule characters, which means 228 In from_unicode, we're converting from Mule characters, which means
1740 switch (type) 1762 switch (type)
1741 { 1763 {
1742 case UNICODE_UTF_16: 1764 case UNICODE_UTF_16:
1743 if (little_endian) 1765 if (little_endian)
1744 { 1766 {
1745 Dynarr_add (dst, (unsigned char) (code & 255)); 1767 if (code < 0x10000) {
1746 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); 1768 Dynarr_add (dst, (unsigned char) (code & 255));
1769 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
1770 } else {
1771 /* Little endian; least significant byte first. */
1772 int first, second;
1773
1774 CODE_TO_UTF_16_SURROGATES(code, first, second);
1775
1776 Dynarr_add (dst, (unsigned char) (first & 255));
1777 Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
1778
1779 Dynarr_add (dst, (unsigned char) (second & 255));
1780 Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
1781 }
1747 } 1782 }
1748 else 1783 else
1749 { 1784 {
1750 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); 1785 if (code < 0x10000) {
1751 Dynarr_add (dst, (unsigned char) (code & 255)); 1786 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
1787 Dynarr_add (dst, (unsigned char) (code & 255));
1788 } else {
1789 /* Big endian; most significant byte first. */
1790 int first, second;
1791
1792 CODE_TO_UTF_16_SURROGATES(code, first, second);
1793
1794 Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
1795 Dynarr_add (dst, (unsigned char) (first & 255));
1796
1797 Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
1798 Dynarr_add (dst, (unsigned char) (second & 255));
1799 }
1752 } 1800 }
1753 break; 1801 break;
1754 1802
1755 case UNICODE_UCS_4: 1803 case UNICODE_UCS_4:
1756 if (little_endian) 1804 if (little_endian)
1917 counter--; 1965 counter--;
1918 } 1966 }
1919 break; 1967 break;
1920 1968
1921 case UNICODE_UTF_16: 1969 case UNICODE_UTF_16:
1970
1922 if (little_endian) 1971 if (little_endian)
1923 ch = (c << counter) | ch; 1972 ch = (c << counter) | ch;
1924 else 1973 else
1925 ch = (ch << 8) | c; 1974 ch = (ch << 8) | c;
1926 counter += 8; 1975 counter += 8;
1976
1977 if (counter == 16 && valid_utf_16_first_surrogate(ch))
1978 break;
1979
1927 if (counter == 16) 1980 if (counter == 16)
1928 { 1981 {
1929 int tempch = ch; 1982 int tempch = ch;
1930 ch = 0; 1983 ch = 0;
1931 counter = 0; 1984 counter = 0;
1932 decode_unicode_char (tempch, dst, data, ignore_bom); 1985 decode_unicode_char (tempch, dst, data, ignore_bom);
1986 }
1987 if (counter == 32)
1988 {
1989 int tempch;
1990 /* #### Signalling an error may be a bit extreme. Should
1991 we try and read it in anyway? */
1992 if (!valid_utf_16_first_surrogate(ch >> 16)
1993 || !valid_utf_16_last_surrogate(ch & 0xFFFF))
1994 {
1995 signal_error(Qtext_conversion_error,
1996 "Invalid UTF-16 surrogate sequence",
1997 Qunbound);
1998 }
1999 tempch = utf_16_surrogates_to_code((ch >> 16),
2000 (ch & 0xffff));
2001 ch = 0;
2002 counter = 0;
2003 decode_unicode_char(tempch, dst, data, ignore_bom);
1933 } 2004 }
1934 break; 2005 break;
1935 2006
1936 case UNICODE_UCS_4: 2007 case UNICODE_UCS_4:
1937 if (little_endian) 2008 if (little_endian)