Mercurial > hg > xemacs-beta
comparison src/unicode.c @ 4096:1abf84db2c7f
[xemacs-hg @ 2007-08-04 20:00:10 by aidan]
Preserve invalid UTF-8, UTF-16 sequences on encoding, decoding.
author | aidan |
---|---|
date | Sat, 04 Aug 2007 20:00:24 +0000 |
parents | 3584cb2c07db |
children | 75d0292c1bff |
comparison
equal
deleted
inserted
replaced
4095:bff7e065cfdc | 4096:1abf84db2c7f |
---|---|
144 Disadvantages: | 144 Disadvantages: |
145 | 145 |
146 (1) User-defined charsets: It would be inconvenient to require all | 146 (1) User-defined charsets: It would be inconvenient to require all |
147 dumped user-defined charsets to be reloaded at init time. | 147 dumped user-defined charsets to be reloaded at init time. |
148 | 148 |
149 (2) Starting up in a non-ISO-8859-1 directory. If we load at run-time, | |
150 we don't load the tables until after we've parsed the current | |
151 directories, and we run into a real bootstrapping problem, if the | |
152 directories themselves are non-ISO-8859-1. This is potentially fixable | |
153 once we switch to using Unicode internally, so we don't have to do any | |
154 conversion (other than the automatic kind, e.g. UTF-16 to UTF-8). | |
155 | |
156 NB With run-time loading, we load in init-mule-at-startup, in | 149 NB With run-time loading, we load in init-mule-at-startup, in |
157 mule-cmds.el. This is called from startup.el, which is quite late in | 150 mule-cmds.el. This is called from startup.el, which is quite late in |
158 the initialization process -- but data-directory isn't set until then. | 151 the initialization process -- but data-directory isn't set until then. |
159 With dump-time loading, you still can't dump in a Japanese directory | 152 With dump-time loading, you still can't dump in a Japanese directory |
160 (again, until we move to Unicode internally), but this is not such an | 153 (again, until we move to Unicode internally), but this is not such an |
190 call the Unicode ones anyway, so in the case of structures, we'd be | 183 call the Unicode ones anyway, so in the case of structures, we'd be |
191 converting from Unicode to ANSI structures, only to have the OS | 184 converting from Unicode to ANSI structures, only to have the OS |
192 convert them back.) */ | 185 convert them back.) */ |
193 | 186 |
194 Lisp_Object Qunicode; | 187 Lisp_Object Qunicode; |
195 Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7; | 188 Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7, Qutf_32; |
196 Lisp_Object Qneed_bom; | 189 Lisp_Object Qneed_bom; |
197 | 190 |
198 Lisp_Object Qutf_16_little_endian, Qutf_16_bom; | 191 Lisp_Object Qutf_16_little_endian, Qutf_16_bom; |
199 Lisp_Object Qutf_16_little_endian_bom; | 192 Lisp_Object Qutf_16_little_endian_bom; |
200 | 193 |
215 #define CODE_TO_UTF_16_SURROGATES(codepoint, lead, trail) do { \ | 208 #define CODE_TO_UTF_16_SURROGATES(codepoint, lead, trail) do { \ |
216 int __ctu16s_code = (codepoint); \ | 209 int __ctu16s_code = (codepoint); \ |
217 lead = UTF_16_LEAD_OFFSET + (__ctu16s_code >> 10); \ | 210 lead = UTF_16_LEAD_OFFSET + (__ctu16s_code >> 10); \ |
218 trail = 0xDC00 + (__ctu16s_code & 0x3FF); \ | 211 trail = 0xDC00 + (__ctu16s_code & 0x3FF); \ |
219 } while (0) | 212 } while (0) |
220 | |
221 #define valid_utf_16_first_surrogate(ch) (((ch) & 0xFC00) == 0xD800) | |
222 #define valid_utf_16_last_surrogate(ch) (((ch) & 0xFC00) == 0xDC00) | |
223 #define valid_utf_16_surrogate(ch) (((ch) & 0xF800) == 0xD800) | |
224 | 213 |
225 #ifdef MULE | 214 #ifdef MULE |
226 | 215 |
227 /* Using ints for to_unicode is OK (as long as they are >= 32 bits). | 216 /* Using ints for to_unicode is OK (as long as they are >= 32 bits). |
228 In from_unicode, we're converting from Mule characters, which means | 217 In from_unicode, we're converting from Mule characters, which means |
1701 | 1690 |
1702 struct unicode_coding_stream | 1691 struct unicode_coding_stream |
1703 { | 1692 { |
1704 /* decode */ | 1693 /* decode */ |
1705 unsigned char counter; | 1694 unsigned char counter; |
1695 unsigned char indicated_length; | |
1706 int seen_char; | 1696 int seen_char; |
1707 /* encode */ | 1697 /* encode */ |
1708 Lisp_Object current_charset; | 1698 Lisp_Object current_charset; |
1709 int current_char_boundary; | 1699 int current_char_boundary; |
1710 int wrote_bom; | 1700 int wrote_bom; |
1714 { XD_END } | 1704 { XD_END } |
1715 }; | 1705 }; |
1716 | 1706 |
1717 DEFINE_CODING_SYSTEM_TYPE_WITH_DATA (unicode); | 1707 DEFINE_CODING_SYSTEM_TYPE_WITH_DATA (unicode); |
1718 | 1708 |
1719 /* Decode a UCS-2 or UCS-4 character into a buffer. If the lookup fails, use | |
1720 <GETA MARK> (U+3013) of JIS X 0208, which means correct character | |
1721 is not found, instead. | |
1722 #### do something more appropriate (use blob?) | |
1723 Danger, Will Robinson! Data loss. Should we signal user? */ | |
1724 static void | 1709 static void |
1725 decode_unicode_char (int ch, unsigned_char_dynarr *dst, | 1710 decode_unicode_char (int ch, unsigned_char_dynarr *dst, |
1726 struct unicode_coding_stream *data, | 1711 struct unicode_coding_stream *data, |
1727 unsigned int ignore_bom) | 1712 unsigned int ignore_bom) |
1728 { | 1713 { |
1753 } | 1738 } |
1754 | 1739 |
1755 data->seen_char = 1; | 1740 data->seen_char = 1; |
1756 } | 1741 } |
1757 | 1742 |
1743 #define DECODE_ERROR_OCTET(octet, dst, data, ignore_bom) \ | |
1744 decode_unicode_char ((octet) + UNICODE_ERROR_OCTET_RANGE_START, \ | |
1745 dst, data, ignore_bom) | |
1746 | |
1747 static inline void | |
1748 indicate_invalid_utf_8 (unsigned char indicated_length, | |
1749 unsigned char counter, | |
1750 int ch, unsigned_char_dynarr *dst, | |
1751 struct unicode_coding_stream *data, | |
1752 unsigned int ignore_bom) | |
1753 { | |
1754 Binbyte stored = indicated_length - counter; | |
1755 Binbyte mask = "\x00\x00\xC0\xE0\xF0\xF8\xFC"[indicated_length]; | |
1756 | |
1757 while (stored > 0) | |
1758 { | |
1759 DECODE_ERROR_OCTET (((ch >> (6 * (stored - 1))) & 0x3f) | mask, | |
1760 dst, data, ignore_bom); | |
1761 mask = 0x80, stored--; | |
1762 } | |
1763 } | |
1764 | |
1758 static void | 1765 static void |
1759 encode_unicode_char_1 (int code, unsigned_char_dynarr *dst, | 1766 encode_unicode_char_1 (int code, unsigned_char_dynarr *dst, |
1760 enum unicode_type type, unsigned int little_endian) | 1767 enum unicode_type type, unsigned int little_endian, |
1768 int write_error_characters_as_such) | |
1761 { | 1769 { |
1762 switch (type) | 1770 switch (type) |
1763 { | 1771 { |
1764 case UNICODE_UTF_16: | 1772 case UNICODE_UTF_16: |
1765 if (little_endian) | 1773 if (little_endian) |
1766 { | 1774 { |
1767 if (code < 0x10000) { | 1775 if (code < 0x10000) { |
1768 Dynarr_add (dst, (unsigned char) (code & 255)); | 1776 Dynarr_add (dst, (unsigned char) (code & 255)); |
1769 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); | 1777 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); |
1770 } else { | 1778 } else if (write_error_characters_as_such && |
1771 /* Little endian; least significant byte first. */ | 1779 code >= UNICODE_ERROR_OCTET_RANGE_START && |
1772 int first, second; | 1780 code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) |
1773 | 1781 { |
1774 CODE_TO_UTF_16_SURROGATES(code, first, second); | 1782 Dynarr_add (dst, (unsigned char) ((code & 0xFF))); |
1775 | 1783 } |
1776 Dynarr_add (dst, (unsigned char) (first & 255)); | 1784 else if (code < 0x110000) |
1777 Dynarr_add (dst, (unsigned char) ((first >> 8) & 255)); | 1785 { |
1778 | 1786 /* Little endian; least significant byte first. */ |
1779 Dynarr_add (dst, (unsigned char) (second & 255)); | 1787 int first, second; |
1780 Dynarr_add (dst, (unsigned char) ((second >> 8) & 255)); | 1788 |
1781 } | 1789 CODE_TO_UTF_16_SURROGATES(code, first, second); |
1790 | |
1791 Dynarr_add (dst, (unsigned char) (first & 255)); | |
1792 Dynarr_add (dst, (unsigned char) ((first >> 8) & 255)); | |
1793 | |
1794 Dynarr_add (dst, (unsigned char) (second & 255)); | |
1795 Dynarr_add (dst, (unsigned char) ((second >> 8) & 255)); | |
1796 } | |
1797 else | |
1798 { | |
1799 /* Not valid Unicode. Pass U+FFFD, least significant byte | |
1800 first. */ | |
1801 Dynarr_add (dst, (unsigned char) 0xFD); | |
1802 Dynarr_add (dst, (unsigned char) 0xFF); | |
1803 } | |
1782 } | 1804 } |
1783 else | 1805 else |
1784 { | 1806 { |
1785 if (code < 0x10000) { | 1807 if (code < 0x10000) { |
1786 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); | 1808 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); |
1787 Dynarr_add (dst, (unsigned char) (code & 255)); | 1809 Dynarr_add (dst, (unsigned char) (code & 255)); |
1788 } else { | 1810 } else if (write_error_characters_as_such && |
1789 /* Big endian; most significant byte first. */ | 1811 code >= UNICODE_ERROR_OCTET_RANGE_START && |
1790 int first, second; | 1812 code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) |
1791 | 1813 { |
1792 CODE_TO_UTF_16_SURROGATES(code, first, second); | 1814 Dynarr_add (dst, (unsigned char) ((code & 0xFF))); |
1793 | 1815 } |
1794 Dynarr_add (dst, (unsigned char) ((first >> 8) & 255)); | 1816 else if (code < 0x110000) |
1795 Dynarr_add (dst, (unsigned char) (first & 255)); | 1817 { |
1796 | 1818 /* Big endian; most significant byte first. */ |
1797 Dynarr_add (dst, (unsigned char) ((second >> 8) & 255)); | 1819 int first, second; |
1798 Dynarr_add (dst, (unsigned char) (second & 255)); | 1820 |
1799 } | 1821 CODE_TO_UTF_16_SURROGATES(code, first, second); |
1822 | |
1823 Dynarr_add (dst, (unsigned char) ((first >> 8) & 255)); | |
1824 Dynarr_add (dst, (unsigned char) (first & 255)); | |
1825 | |
1826 Dynarr_add (dst, (unsigned char) ((second >> 8) & 255)); | |
1827 Dynarr_add (dst, (unsigned char) (second & 255)); | |
1828 } | |
1829 else | |
1830 { | |
1831 /* Not valid Unicode. Pass U+FFFD, most significant byte | |
1832 first. */ | |
1833 Dynarr_add (dst, (unsigned char) 0xFF); | |
1834 Dynarr_add (dst, (unsigned char) 0xFD); | |
1835 } | |
1800 } | 1836 } |
1801 break; | 1837 break; |
1802 | 1838 |
1803 case UNICODE_UCS_4: | 1839 case UNICODE_UCS_4: |
1840 case UNICODE_UTF_32: | |
1804 if (little_endian) | 1841 if (little_endian) |
1805 { | 1842 { |
1806 Dynarr_add (dst, (unsigned char) (code & 255)); | 1843 if (write_error_characters_as_such && |
1807 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); | 1844 code >= UNICODE_ERROR_OCTET_RANGE_START && |
1808 Dynarr_add (dst, (unsigned char) ((code >> 16) & 255)); | 1845 code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) |
1809 Dynarr_add (dst, (unsigned char) (code >> 24)); | 1846 { |
1847 Dynarr_add (dst, (unsigned char) ((code & 0xFF))); | |
1848 } | |
1849 else | |
1850 { | |
1851 /* We generate and accept incorrect sequences here, which is | |
1852 okay, in the interest of preservation of the user's | |
1853 data. */ | |
1854 Dynarr_add (dst, (unsigned char) (code & 255)); | |
1855 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); | |
1856 Dynarr_add (dst, (unsigned char) ((code >> 16) & 255)); | |
1857 Dynarr_add (dst, (unsigned char) (code >> 24)); | |
1858 } | |
1810 } | 1859 } |
1811 else | 1860 else |
1812 { | 1861 { |
1813 Dynarr_add (dst, (unsigned char) (code >> 24)); | 1862 if (write_error_characters_as_such && |
1814 Dynarr_add (dst, (unsigned char) ((code >> 16) & 255)); | 1863 code >= UNICODE_ERROR_OCTET_RANGE_START && |
1815 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); | 1864 code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) |
1816 Dynarr_add (dst, (unsigned char) (code & 255)); | 1865 { |
1866 Dynarr_add (dst, (unsigned char) ((code & 0xFF))); | |
1867 } | |
1868 else | |
1869 { | |
1870 /* We generate and accept incorrect sequences here, which is okay, | |
1871 in the interest of preservation of the user's data. */ | |
1872 Dynarr_add (dst, (unsigned char) (code >> 24)); | |
1873 Dynarr_add (dst, (unsigned char) ((code >> 16) & 255)); | |
1874 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); | |
1875 Dynarr_add (dst, (unsigned char) (code & 255)); | |
1876 } | |
1817 } | 1877 } |
1818 break; | 1878 break; |
1819 | 1879 |
1820 case UNICODE_UTF_8: | 1880 case UNICODE_UTF_8: |
1821 if (code <= 0x7f) | 1881 if (code <= 0x7f) |
1840 Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80)); | 1900 Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80)); |
1841 Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80)); | 1901 Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80)); |
1842 } | 1902 } |
1843 else if (code <= 0x3ffffff) | 1903 else if (code <= 0x3ffffff) |
1844 { | 1904 { |
1845 Dynarr_add (dst, (unsigned char) ((code >> 24) | 0xf8)); | 1905 |
1846 Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80)); | 1906 #if !(UNICODE_ERROR_OCTET_RANGE_START > 0x1fffff \ |
1847 Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80)); | 1907 && UNICODE_ERROR_OCTET_RANGE_START < 0x3ffffff) |
1848 Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80)); | 1908 #error "This code needs to be rewritten. " |
1849 Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80)); | 1909 #endif |
1910 if (write_error_characters_as_such && | |
1911 code >= UNICODE_ERROR_OCTET_RANGE_START && | |
1912 code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) | |
1913 { | |
1914 Dynarr_add (dst, (unsigned char) ((code & 0xFF))); | |
1915 } | |
1916 else | |
1917 { | |
1918 Dynarr_add (dst, (unsigned char) ((code >> 24) | 0xf8)); | |
1919 Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80)); | |
1920 Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80)); | |
1921 Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80)); | |
1922 Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80)); | |
1923 } | |
1850 } | 1924 } |
1851 else | 1925 else |
1852 { | 1926 { |
1853 Dynarr_add (dst, (unsigned char) ((code >> 30) | 0xfc)); | 1927 Dynarr_add (dst, (unsigned char) ((code >> 30) | 0xfc)); |
1854 Dynarr_add (dst, (unsigned char) (((code >> 24) & 0x3f) | 0x80)); | 1928 Dynarr_add (dst, (unsigned char) (((code >> 24) & 0x3f) | 0x80)); |
1868 /* Also used in mule-coding.c for UTF-8 handling in ISO 2022-oriented | 1942 /* Also used in mule-coding.c for UTF-8 handling in ISO 2022-oriented |
1869 encodings. */ | 1943 encodings. */ |
1870 void | 1944 void |
1871 encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h, | 1945 encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h, |
1872 int USED_IF_MULE (l), unsigned_char_dynarr *dst, | 1946 int USED_IF_MULE (l), unsigned_char_dynarr *dst, |
1873 enum unicode_type type, unsigned int little_endian) | 1947 enum unicode_type type, unsigned int little_endian, |
1948 int write_error_characters_as_such) | |
1874 { | 1949 { |
1875 #ifdef MULE | 1950 #ifdef MULE |
1876 int code = ichar_to_unicode (make_ichar (charset, h & 127, l & 127)); | 1951 int code = ichar_to_unicode (make_ichar (charset, h & 127, l & 127)); |
1877 | 1952 |
1878 if (code == -1) | 1953 if (code == -1) |
1894 } | 1969 } |
1895 #else | 1970 #else |
1896 int code = h; | 1971 int code = h; |
1897 #endif /* MULE */ | 1972 #endif /* MULE */ |
1898 | 1973 |
1899 encode_unicode_char_1 (code, dst, type, little_endian); | 1974 encode_unicode_char_1 (code, dst, type, little_endian, |
1975 write_error_characters_as_such); | |
1900 } | 1976 } |
1901 | 1977 |
1902 static Bytecount | 1978 static Bytecount |
1903 unicode_convert (struct coding_stream *str, const UExtbyte *src, | 1979 unicode_convert (struct coding_stream *str, const UExtbyte *src, |
1904 unsigned_char_dynarr *dst, Bytecount n) | 1980 unsigned_char_dynarr *dst, Bytecount n) |
1913 Bytecount orign = n; | 1989 Bytecount orign = n; |
1914 | 1990 |
1915 if (str->direction == CODING_DECODE) | 1991 if (str->direction == CODING_DECODE) |
1916 { | 1992 { |
1917 unsigned char counter = data->counter; | 1993 unsigned char counter = data->counter; |
1994 unsigned char indicated_length | |
1995 = data->indicated_length; | |
1918 | 1996 |
1919 while (n--) | 1997 while (n--) |
1920 { | 1998 { |
1921 UExtbyte c = *src++; | 1999 UExtbyte c = *src++; |
1922 | 2000 |
1923 switch (type) | 2001 switch (type) |
1924 { | 2002 { |
1925 case UNICODE_UTF_8: | 2003 case UNICODE_UTF_8: |
1926 switch (counter) | 2004 if (0 == counter) |
1927 { | 2005 { |
1928 case 0: | 2006 if (0 == (c & 0x80)) |
1929 if (c >= 0xfc) | 2007 { |
1930 { | 2008 /* ASCII. */ |
1931 ch = c & 0x01; | 2009 decode_unicode_char (c, dst, data, ignore_bom); |
1932 counter = 5; | 2010 } |
1933 } | 2011 else if (0 == (c & 0x40)) |
1934 else if (c >= 0xf8) | 2012 { |
1935 { | 2013 /* Highest bit set, second highest not--there's |
1936 ch = c & 0x03; | 2014 something wrong. */ |
1937 counter = 4; | 2015 DECODE_ERROR_OCTET (c, dst, data, ignore_bom); |
1938 } | 2016 } |
1939 else if (c >= 0xf0) | 2017 else if (0 == (c & 0x20)) |
1940 { | 2018 { |
1941 ch = c & 0x07; | 2019 ch = c & 0x1f; |
1942 counter = 3; | 2020 counter = 1; |
1943 } | 2021 indicated_length = 2; |
1944 else if (c >= 0xe0) | 2022 } |
1945 { | 2023 else if (0 == (c & 0x10)) |
1946 ch = c & 0x0f; | 2024 { |
1947 counter = 2; | 2025 ch = c & 0x0f; |
1948 } | 2026 counter = 2; |
1949 else if (c >= 0xc0) | 2027 indicated_length = 3; |
1950 { | 2028 } |
1951 ch = c & 0x1f; | 2029 else if (0 == (c & 0x08)) |
1952 counter = 1; | 2030 { |
1953 } | 2031 ch = c & 0x0f; |
1954 else | 2032 counter = 3; |
1955 decode_unicode_char (c, dst, data, ignore_bom); | 2033 indicated_length = 4; |
1956 break; | 2034 } |
1957 case 1: | 2035 else |
1958 ch = (ch << 6) | (c & 0x3f); | 2036 { |
1959 decode_unicode_char (ch, dst, data, ignore_bom); | 2037 /* We don't support lengths longer than 4 in |
1960 ch = 0; | 2038 external-format data. */ |
1961 counter = 0; | 2039 DECODE_ERROR_OCTET (c, dst, data, ignore_bom); |
1962 break; | 2040 |
1963 default: | 2041 } |
1964 ch = (ch << 6) | (c & 0x3f); | 2042 } |
1965 counter--; | 2043 else |
2044 { | |
2045 /* counter != 0 */ | |
2046 if ((0 == (c & 0x80)) || (0 != (c & 0x40))) | |
2047 { | |
2048 indicate_invalid_utf_8(indicated_length, | |
2049 counter, | |
2050 ch, dst, data, ignore_bom); | |
2051 if (c & 0x80) | |
2052 { | |
2053 DECODE_ERROR_OCTET (c, dst, data, ignore_bom); | |
2054 } | |
2055 else | |
2056 { | |
2057 /* The character just read is ASCII. Treat it as | |
2058 such. */ | |
2059 decode_unicode_char (c, dst, data, ignore_bom); | |
2060 } | |
2061 ch = 0; | |
2062 counter = 0; | |
2063 } | |
2064 else | |
2065 { | |
2066 ch = (ch << 6) | (c & 0x3f); | |
2067 counter--; | |
2068 /* Just processed the final byte. Emit the character. */ | |
2069 if (!counter) | |
2070 { | |
2071 /* Don't accept over-long sequences, surrogates, | |
2072 or codes above #x10FFFF. */ | |
2073 if ((ch < 0x80) || | |
2074 ((ch < 0x800) && indicated_length > 2) || | |
2075 ((ch < 0x10000) && indicated_length > 3) || | |
2076 valid_utf_16_surrogate(ch) || (ch > 0x110000)) | |
2077 { | |
2078 indicate_invalid_utf_8(indicated_length, | |
2079 counter, | |
2080 ch, dst, data, | |
2081 ignore_bom); | |
2082 } | |
2083 else | |
2084 { | |
2085 decode_unicode_char (ch, dst, data, ignore_bom); | |
2086 } | |
2087 ch = 0; | |
2088 } | |
2089 } | |
1966 } | 2090 } |
1967 break; | 2091 break; |
1968 | 2092 |
1969 case UNICODE_UTF_16: | 2093 case UNICODE_UTF_16: |
1970 | 2094 |
1971 if (little_endian) | 2095 if (little_endian) |
1972 ch = (c << counter) | ch; | 2096 ch = (c << counter) | ch; |
1973 else | 2097 else |
1974 ch = (ch << 8) | c; | 2098 ch = (ch << 8) | c; |
2099 | |
1975 counter += 8; | 2100 counter += 8; |
1976 | 2101 |
1977 if (counter == 16 && valid_utf_16_first_surrogate(ch)) | 2102 if (16 == counter) |
1978 break; | 2103 { |
1979 | |
1980 if (counter == 16) | |
1981 { | |
1982 int tempch = ch; | 2104 int tempch = ch; |
2105 | |
2106 if (valid_utf_16_first_surrogate(ch)) | |
2107 { | |
2108 break; | |
2109 } | |
1983 ch = 0; | 2110 ch = 0; |
1984 counter = 0; | 2111 counter = 0; |
1985 decode_unicode_char (tempch, dst, data, ignore_bom); | 2112 decode_unicode_char (tempch, dst, data, ignore_bom); |
1986 } | 2113 } |
1987 if (counter == 32) | 2114 else if (32 == counter) |
1988 { | 2115 { |
1989 int tempch; | 2116 int tempch; |
1990 /* #### Signalling an error may be a bit extreme. Should | 2117 |
1991 we try and read it in anyway? */ | 2118 if (!valid_utf_16_last_surrogate(ch & 0xFFFF)) |
1992 if (!valid_utf_16_first_surrogate(ch >> 16) | |
1993 || !valid_utf_16_last_surrogate(ch & 0xFFFF)) | |
1994 { | 2119 { |
1995 signal_error(Qtext_conversion_error, | 2120 DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data, |
1996 "Invalid UTF-16 surrogate sequence", | 2121 ignore_bom); |
1997 Qunbound); | 2122 DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, |
2123 ignore_bom); | |
2124 DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, | |
2125 ignore_bom); | |
2126 DECODE_ERROR_OCTET (ch & 0xFF, dst, data, | |
2127 ignore_bom); | |
1998 } | 2128 } |
1999 tempch = utf_16_surrogates_to_code((ch >> 16), | 2129 else |
2000 (ch & 0xffff)); | 2130 { |
2131 tempch = utf_16_surrogates_to_code((ch >> 16), | |
2132 (ch & 0xffff)); | |
2133 decode_unicode_char(tempch, dst, data, ignore_bom); | |
2134 } | |
2001 ch = 0; | 2135 ch = 0; |
2002 counter = 0; | 2136 counter = 0; |
2003 decode_unicode_char(tempch, dst, data, ignore_bom); | 2137 } |
2004 } | 2138 else |
2139 assert(8 == counter || 24 == counter); | |
2005 break; | 2140 break; |
2006 | 2141 |
2007 case UNICODE_UCS_4: | 2142 case UNICODE_UCS_4: |
2143 case UNICODE_UTF_32: | |
2008 if (little_endian) | 2144 if (little_endian) |
2009 ch = (c << counter) | ch; | 2145 ch = (c << counter) | ch; |
2010 else | 2146 else |
2011 ch = (ch << 8) | c; | 2147 ch = (ch << 8) | c; |
2012 counter += 8; | 2148 counter += 8; |
2013 if (counter == 32) | 2149 if (counter == 32) |
2014 { | 2150 { |
2015 int tempch = ch; | 2151 if (ch > 0x10ffff) |
2152 { | |
2153 /* ch is not a legal Unicode character. We're fine | |
2154 with that in UCS-4, though not in UTF-32. */ | |
2155 if (UNICODE_UCS_4 == type && ch < 0x80000000) | |
2156 { | |
2157 decode_unicode_char (ch, dst, data, ignore_bom); | |
2158 } | |
2159 else if (little_endian) | |
2160 { | |
2161 DECODE_ERROR_OCTET (ch & 0xFF, dst, data, | |
2162 ignore_bom); | |
2163 DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, | |
2164 ignore_bom); | |
2165 DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, | |
2166 ignore_bom); | |
2167 DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data, | |
2168 ignore_bom); | |
2169 } | |
2170 else | |
2171 { | |
2172 DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data, | |
2173 ignore_bom); | |
2174 DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, | |
2175 ignore_bom); | |
2176 DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, | |
2177 ignore_bom); | |
2178 DECODE_ERROR_OCTET (ch & 0xFF, dst, data, | |
2179 ignore_bom); | |
2180 } | |
2181 } | |
2182 else | |
2183 { | |
2184 decode_unicode_char (ch, dst, data, ignore_bom); | |
2185 } | |
2016 ch = 0; | 2186 ch = 0; |
2017 counter = 0; | 2187 counter = 0; |
2018 if (tempch < 0) | |
2019 { | |
2020 /* !!#### indicate an error */ | |
2021 tempch = '~'; | |
2022 } | |
2023 decode_unicode_char (tempch, dst, data, ignore_bom); | |
2024 } | 2188 } |
2025 break; | 2189 break; |
2026 | 2190 |
2027 case UNICODE_UTF_7: | 2191 case UNICODE_UTF_7: |
2028 ABORT (); | 2192 ABORT (); |
2030 | 2194 |
2031 default: ABORT (); | 2195 default: ABORT (); |
2032 } | 2196 } |
2033 | 2197 |
2034 } | 2198 } |
2035 if (str->eof) | 2199 |
2036 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst); | 2200 if (str->eof && ch) |
2201 { | |
2202 switch (type) | |
2203 { | |
2204 case UNICODE_UTF_8: | |
2205 indicate_invalid_utf_8(indicated_length, | |
2206 counter, ch, dst, data, | |
2207 ignore_bom); | |
2208 break; | |
2209 | |
2210 case UNICODE_UTF_16: | |
2211 case UNICODE_UCS_4: | |
2212 case UNICODE_UTF_32: | |
2213 if (8 == counter) | |
2214 { | |
2215 DECODE_ERROR_OCTET (ch, dst, data, ignore_bom); | |
2216 } | |
2217 else if (16 == counter) | |
2218 { | |
2219 if (little_endian) | |
2220 { | |
2221 DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom); | |
2222 DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, | |
2223 ignore_bom); | |
2224 } | |
2225 else | |
2226 { | |
2227 DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, | |
2228 ignore_bom); | |
2229 DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom); | |
2230 } | |
2231 } | |
2232 else if (24 == counter) | |
2233 { | |
2234 if (little_endian) | |
2235 { | |
2236 DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, | |
2237 ignore_bom); | |
2238 DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom); | |
2239 DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, | |
2240 ignore_bom); | |
2241 } | |
2242 else | |
2243 { | |
2244 DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, | |
2245 ignore_bom); | |
2246 DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, | |
2247 ignore_bom); | |
2248 DECODE_ERROR_OCTET (ch & 0xFF, dst, data, | |
2249 ignore_bom); | |
2250 } | |
2251 } | |
2252 else assert(0); | |
2253 break; | |
2254 } | |
2255 ch = 0; | |
2256 } | |
2037 | 2257 |
2038 data->counter = counter; | 2258 data->counter = counter; |
2259 data->indicated_length = indicated_length; | |
2039 } | 2260 } |
2040 else | 2261 else |
2041 { | 2262 { |
2042 unsigned char char_boundary = data->current_char_boundary; | 2263 unsigned char char_boundary = data->current_char_boundary; |
2043 Lisp_Object charset = data->current_charset; | 2264 Lisp_Object charset = data->current_charset; |
2052 back_to_square_n: | 2273 back_to_square_n: |
2053 #endif /* ENABLE_COMPOSITE_CHARS */ | 2274 #endif /* ENABLE_COMPOSITE_CHARS */ |
2054 | 2275 |
2055 if (XCODING_SYSTEM_UNICODE_NEED_BOM (str->codesys) && !data->wrote_bom) | 2276 if (XCODING_SYSTEM_UNICODE_NEED_BOM (str->codesys) && !data->wrote_bom) |
2056 { | 2277 { |
2057 encode_unicode_char_1 (0xFEFF, dst, type, little_endian); | 2278 encode_unicode_char_1 (0xFEFF, dst, type, little_endian, 1); |
2058 data->wrote_bom = 1; | 2279 data->wrote_bom = 1; |
2059 } | 2280 } |
2060 | 2281 |
2061 while (n--) | 2282 while (n--) |
2062 { | 2283 { |
2066 if (byte_ascii_p (c)) | 2287 if (byte_ascii_p (c)) |
2067 #endif /* MULE */ | 2288 #endif /* MULE */ |
2068 { /* Processing ASCII character */ | 2289 { /* Processing ASCII character */ |
2069 ch = 0; | 2290 ch = 0; |
2070 encode_unicode_char (Vcharset_ascii, c, 0, dst, type, | 2291 encode_unicode_char (Vcharset_ascii, c, 0, dst, type, |
2071 little_endian); | 2292 little_endian, 1); |
2072 | 2293 |
2073 char_boundary = 1; | 2294 char_boundary = 1; |
2074 } | 2295 } |
2075 #ifdef MULE | 2296 #ifdef MULE |
2076 else if (ibyte_leading_byte_p (c) || ibyte_leading_byte_p (ch)) | 2297 else if (ibyte_leading_byte_p (c) || ibyte_leading_byte_p (ch)) |
2090 (Info-goto-node "(internals)Internal String Encoding") | 2311 (Info-goto-node "(internals)Internal String Encoding") |
2091 | 2312 |
2092 for the rationale behind subtracting #xa0 from the | 2313 for the rationale behind subtracting #xa0 from the |
2093 character's code. */ | 2314 character's code. */ |
2094 encode_unicode_char (Vcharset_control_1, c - 0xa0, 0, dst, | 2315 encode_unicode_char (Vcharset_control_1, c - 0xa0, 0, dst, |
2095 type, little_endian); | 2316 type, little_endian, 1); |
2096 else | 2317 else |
2097 { | 2318 { |
2098 switch (XCHARSET_REP_BYTES (charset)) | 2319 switch (XCHARSET_REP_BYTES (charset)) |
2099 { | 2320 { |
2100 case 2: | 2321 case 2: |
2101 encode_unicode_char (charset, c, 0, dst, type, | 2322 encode_unicode_char (charset, c, 0, dst, type, |
2102 little_endian); | 2323 little_endian, 1); |
2103 break; | 2324 break; |
2104 case 3: | 2325 case 3: |
2105 if (XCHARSET_PRIVATE_P (charset)) | 2326 if (XCHARSET_PRIVATE_P (charset)) |
2106 { | 2327 { |
2107 encode_unicode_char (charset, c, 0, dst, type, | 2328 encode_unicode_char (charset, c, 0, dst, type, |
2108 little_endian); | 2329 little_endian, 1); |
2109 ch = 0; | 2330 ch = 0; |
2110 } | 2331 } |
2111 else if (ch) | 2332 else if (ch) |
2112 { | 2333 { |
2113 #ifdef ENABLE_COMPOSITE_CHARS | 2334 #ifdef ENABLE_COMPOSITE_CHARS |
2117 { | 2338 { |
2118 /* #### Bother! We don't know how to | 2339 /* #### Bother! We don't know how to |
2119 handle this yet. */ | 2340 handle this yet. */ |
2120 encode_unicode_char (Vcharset_ascii, '~', 0, | 2341 encode_unicode_char (Vcharset_ascii, '~', 0, |
2121 dst, type, | 2342 dst, type, |
2122 little_endian); | 2343 little_endian, 1); |
2123 } | 2344 } |
2124 else | 2345 else |
2125 { | 2346 { |
2126 Ichar emch = make_ichar (Vcharset_composite, | 2347 Ichar emch = make_ichar (Vcharset_composite, |
2127 ch & 0x7F, | 2348 ch & 0x7F, |
2136 } | 2357 } |
2137 } | 2358 } |
2138 else | 2359 else |
2139 #endif /* ENABLE_COMPOSITE_CHARS */ | 2360 #endif /* ENABLE_COMPOSITE_CHARS */ |
2140 encode_unicode_char (charset, ch, c, dst, type, | 2361 encode_unicode_char (charset, ch, c, dst, type, |
2141 little_endian); | 2362 little_endian, 1); |
2142 ch = 0; | 2363 ch = 0; |
2143 } | 2364 } |
2144 else | 2365 else |
2145 { | 2366 { |
2146 ch = c; | 2367 ch = c; |
2149 break; | 2370 break; |
2150 case 4: | 2371 case 4: |
2151 if (ch) | 2372 if (ch) |
2152 { | 2373 { |
2153 encode_unicode_char (charset, ch, c, dst, type, | 2374 encode_unicode_char (charset, ch, c, dst, type, |
2154 little_endian); | 2375 little_endian, 1); |
2155 ch = 0; | 2376 ch = 0; |
2156 } | 2377 } |
2157 else | 2378 else |
2158 { | 2379 { |
2159 ch = c; | 2380 ch = c; |
2519 type = UNICODE_UTF_16; | 2740 type = UNICODE_UTF_16; |
2520 else if (EQ (value, Qutf_7)) | 2741 else if (EQ (value, Qutf_7)) |
2521 type = UNICODE_UTF_7; | 2742 type = UNICODE_UTF_7; |
2522 else if (EQ (value, Qucs_4)) | 2743 else if (EQ (value, Qucs_4)) |
2523 type = UNICODE_UCS_4; | 2744 type = UNICODE_UCS_4; |
2745 else if (EQ (value, Qutf_32)) | |
2746 type = UNICODE_UTF_32; | |
2524 else | 2747 else |
2525 invalid_constant ("Invalid Unicode type", key); | 2748 invalid_constant ("Invalid Unicode type", key); |
2526 | 2749 |
2527 XCODING_SYSTEM_UNICODE_TYPE (codesys) = type; | 2750 XCODING_SYSTEM_UNICODE_TYPE (codesys) = type; |
2528 } | 2751 } |
2544 { | 2767 { |
2545 case UNICODE_UTF_16: return Qutf_16; | 2768 case UNICODE_UTF_16: return Qutf_16; |
2546 case UNICODE_UTF_8: return Qutf_8; | 2769 case UNICODE_UTF_8: return Qutf_8; |
2547 case UNICODE_UTF_7: return Qutf_7; | 2770 case UNICODE_UTF_7: return Qutf_7; |
2548 case UNICODE_UCS_4: return Qucs_4; | 2771 case UNICODE_UCS_4: return Qucs_4; |
2772 case UNICODE_UTF_32: return Qutf_32; | |
2549 default: ABORT (); | 2773 default: ABORT (); |
2550 } | 2774 } |
2551 } | 2775 } |
2552 else if (EQ (prop, Qlittle_endian)) | 2776 else if (EQ (prop, Qlittle_endian)) |
2553 return XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN (coding_system) ? Qt : Qnil; | 2777 return XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN (coding_system) ? Qt : Qnil; |
2618 DEFSUBR (Funicode_to_char); | 2842 DEFSUBR (Funicode_to_char); |
2619 | 2843 |
2620 DEFSYMBOL (Qunicode); | 2844 DEFSYMBOL (Qunicode); |
2621 DEFSYMBOL (Qucs_4); | 2845 DEFSYMBOL (Qucs_4); |
2622 DEFSYMBOL (Qutf_16); | 2846 DEFSYMBOL (Qutf_16); |
2847 DEFSYMBOL (Qutf_32); | |
2623 DEFSYMBOL (Qutf_8); | 2848 DEFSYMBOL (Qutf_8); |
2624 DEFSYMBOL (Qutf_7); | 2849 DEFSYMBOL (Qutf_7); |
2625 | 2850 |
2626 DEFSYMBOL (Qneed_bom); | 2851 DEFSYMBOL (Qneed_bom); |
2627 | 2852 |