comparison src/unicode.c @ 4096:1abf84db2c7f

[xemacs-hg @ 2007-08-04 20:00:10 by aidan] Preserve invalid UTF-8, UTF-16 sequences on encoding, decoding.
author aidan
date Sat, 04 Aug 2007 20:00:24 +0000
parents 3584cb2c07db
children 75d0292c1bff
comparison
equal deleted inserted replaced
4095:bff7e065cfdc 4096:1abf84db2c7f
144 Disadvantages: 144 Disadvantages:
145 145
146 (1) User-defined charsets: It would be inconvenient to require all 146 (1) User-defined charsets: It would be inconvenient to require all
147 dumped user-defined charsets to be reloaded at init time. 147 dumped user-defined charsets to be reloaded at init time.
148 148
149 (2) Starting up in a non-ISO-8859-1 directory. If we load at run-time,
150 we don't load the tables until after we've parsed the current
151 directories, and we run into a real bootstrapping problem, if the
152 directories themselves are non-ISO-8859-1. This is potentially fixable
153 once we switch to using Unicode internally, so we don't have to do any
154 conversion (other than the automatic kind, e.g. UTF-16 to UTF-8).
155
156 NB With run-time loading, we load in init-mule-at-startup, in 149 NB With run-time loading, we load in init-mule-at-startup, in
157 mule-cmds.el. This is called from startup.el, which is quite late in 150 mule-cmds.el. This is called from startup.el, which is quite late in
158 the initialization process -- but data-directory isn't set until then. 151 the initialization process -- but data-directory isn't set until then.
159 With dump-time loading, you still can't dump in a Japanese directory 152 With dump-time loading, you still can't dump in a Japanese directory
160 (again, until we move to Unicode internally), but this is not such an 153 (again, until we move to Unicode internally), but this is not such an
190 call the Unicode ones anyway, so in the case of structures, we'd be 183 call the Unicode ones anyway, so in the case of structures, we'd be
191 converting from Unicode to ANSI structures, only to have the OS 184 converting from Unicode to ANSI structures, only to have the OS
192 convert them back.) */ 185 convert them back.) */
193 186
194 Lisp_Object Qunicode; 187 Lisp_Object Qunicode;
195 Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7; 188 Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7, Qutf_32;
196 Lisp_Object Qneed_bom; 189 Lisp_Object Qneed_bom;
197 190
198 Lisp_Object Qutf_16_little_endian, Qutf_16_bom; 191 Lisp_Object Qutf_16_little_endian, Qutf_16_bom;
199 Lisp_Object Qutf_16_little_endian_bom; 192 Lisp_Object Qutf_16_little_endian_bom;
200 193
215 #define CODE_TO_UTF_16_SURROGATES(codepoint, lead, trail) do { \ 208 #define CODE_TO_UTF_16_SURROGATES(codepoint, lead, trail) do { \
216 int __ctu16s_code = (codepoint); \ 209 int __ctu16s_code = (codepoint); \
217 lead = UTF_16_LEAD_OFFSET + (__ctu16s_code >> 10); \ 210 lead = UTF_16_LEAD_OFFSET + (__ctu16s_code >> 10); \
218 trail = 0xDC00 + (__ctu16s_code & 0x3FF); \ 211 trail = 0xDC00 + (__ctu16s_code & 0x3FF); \
219 } while (0) 212 } while (0)
220
221 #define valid_utf_16_first_surrogate(ch) (((ch) & 0xFC00) == 0xD800)
222 #define valid_utf_16_last_surrogate(ch) (((ch) & 0xFC00) == 0xDC00)
223 #define valid_utf_16_surrogate(ch) (((ch) & 0xF800) == 0xD800)
224 213
225 #ifdef MULE 214 #ifdef MULE
226 215
227 /* Using ints for to_unicode is OK (as long as they are >= 32 bits). 216 /* Using ints for to_unicode is OK (as long as they are >= 32 bits).
228 In from_unicode, we're converting from Mule characters, which means 217 In from_unicode, we're converting from Mule characters, which means
1701 1690
1702 struct unicode_coding_stream 1691 struct unicode_coding_stream
1703 { 1692 {
1704 /* decode */ 1693 /* decode */
1705 unsigned char counter; 1694 unsigned char counter;
1695 unsigned char indicated_length;
1706 int seen_char; 1696 int seen_char;
1707 /* encode */ 1697 /* encode */
1708 Lisp_Object current_charset; 1698 Lisp_Object current_charset;
1709 int current_char_boundary; 1699 int current_char_boundary;
1710 int wrote_bom; 1700 int wrote_bom;
1714 { XD_END } 1704 { XD_END }
1715 }; 1705 };
1716 1706
1717 DEFINE_CODING_SYSTEM_TYPE_WITH_DATA (unicode); 1707 DEFINE_CODING_SYSTEM_TYPE_WITH_DATA (unicode);
1718 1708
1719 /* Decode a UCS-2 or UCS-4 character into a buffer. If the lookup fails, use
1720 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
1721 is not found, instead.
1722 #### do something more appropriate (use blob?)
1723 Danger, Will Robinson! Data loss. Should we signal user? */
1724 static void 1709 static void
1725 decode_unicode_char (int ch, unsigned_char_dynarr *dst, 1710 decode_unicode_char (int ch, unsigned_char_dynarr *dst,
1726 struct unicode_coding_stream *data, 1711 struct unicode_coding_stream *data,
1727 unsigned int ignore_bom) 1712 unsigned int ignore_bom)
1728 { 1713 {
1753 } 1738 }
1754 1739
1755 data->seen_char = 1; 1740 data->seen_char = 1;
1756 } 1741 }
1757 1742
1743 #define DECODE_ERROR_OCTET(octet, dst, data, ignore_bom) \
1744 decode_unicode_char ((octet) + UNICODE_ERROR_OCTET_RANGE_START, \
1745 dst, data, ignore_bom)
1746
1747 static inline void
1748 indicate_invalid_utf_8 (unsigned char indicated_length,
1749 unsigned char counter,
1750 int ch, unsigned_char_dynarr *dst,
1751 struct unicode_coding_stream *data,
1752 unsigned int ignore_bom)
1753 {
1754 Binbyte stored = indicated_length - counter;
1755 Binbyte mask = "\x00\x00\xC0\xE0\xF0\xF8\xFC"[indicated_length];
1756
1757 while (stored > 0)
1758 {
1759 DECODE_ERROR_OCTET (((ch >> (6 * (stored - 1))) & 0x3f) | mask,
1760 dst, data, ignore_bom);
1761 mask = 0x80, stored--;
1762 }
1763 }
1764
1758 static void 1765 static void
1759 encode_unicode_char_1 (int code, unsigned_char_dynarr *dst, 1766 encode_unicode_char_1 (int code, unsigned_char_dynarr *dst,
1760 enum unicode_type type, unsigned int little_endian) 1767 enum unicode_type type, unsigned int little_endian,
1768 int write_error_characters_as_such)
1761 { 1769 {
1762 switch (type) 1770 switch (type)
1763 { 1771 {
1764 case UNICODE_UTF_16: 1772 case UNICODE_UTF_16:
1765 if (little_endian) 1773 if (little_endian)
1766 { 1774 {
1767 if (code < 0x10000) { 1775 if (code < 0x10000) {
1768 Dynarr_add (dst, (unsigned char) (code & 255)); 1776 Dynarr_add (dst, (unsigned char) (code & 255));
1769 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); 1777 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
1770 } else { 1778 } else if (write_error_characters_as_such &&
1771 /* Little endian; least significant byte first. */ 1779 code >= UNICODE_ERROR_OCTET_RANGE_START &&
1772 int first, second; 1780 code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
1773 1781 {
1774 CODE_TO_UTF_16_SURROGATES(code, first, second); 1782 Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
1775 1783 }
1776 Dynarr_add (dst, (unsigned char) (first & 255)); 1784 else if (code < 0x110000)
1777 Dynarr_add (dst, (unsigned char) ((first >> 8) & 255)); 1785 {
1778 1786 /* Little endian; least significant byte first. */
1779 Dynarr_add (dst, (unsigned char) (second & 255)); 1787 int first, second;
1780 Dynarr_add (dst, (unsigned char) ((second >> 8) & 255)); 1788
1781 } 1789 CODE_TO_UTF_16_SURROGATES(code, first, second);
1790
1791 Dynarr_add (dst, (unsigned char) (first & 255));
1792 Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
1793
1794 Dynarr_add (dst, (unsigned char) (second & 255));
1795 Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
1796 }
1797 else
1798 {
1799 /* Not valid Unicode. Pass U+FFFD, least significant byte
1800 first. */
1801 Dynarr_add (dst, (unsigned char) 0xFD);
1802 Dynarr_add (dst, (unsigned char) 0xFF);
1803 }
1782 } 1804 }
1783 else 1805 else
1784 { 1806 {
1785 if (code < 0x10000) { 1807 if (code < 0x10000) {
1786 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); 1808 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
1787 Dynarr_add (dst, (unsigned char) (code & 255)); 1809 Dynarr_add (dst, (unsigned char) (code & 255));
1788 } else { 1810 } else if (write_error_characters_as_such &&
1789 /* Big endian; most significant byte first. */ 1811 code >= UNICODE_ERROR_OCTET_RANGE_START &&
1790 int first, second; 1812 code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
1791 1813 {
1792 CODE_TO_UTF_16_SURROGATES(code, first, second); 1814 Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
1793 1815 }
1794 Dynarr_add (dst, (unsigned char) ((first >> 8) & 255)); 1816 else if (code < 0x110000)
1795 Dynarr_add (dst, (unsigned char) (first & 255)); 1817 {
1796 1818 /* Big endian; most significant byte first. */
1797 Dynarr_add (dst, (unsigned char) ((second >> 8) & 255)); 1819 int first, second;
1798 Dynarr_add (dst, (unsigned char) (second & 255)); 1820
1799 } 1821 CODE_TO_UTF_16_SURROGATES(code, first, second);
1822
1823 Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
1824 Dynarr_add (dst, (unsigned char) (first & 255));
1825
1826 Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
1827 Dynarr_add (dst, (unsigned char) (second & 255));
1828 }
1829 else
1830 {
1831 /* Not valid Unicode. Pass U+FFFD, most significant byte
1832 first. */
1833 Dynarr_add (dst, (unsigned char) 0xFF);
1834 Dynarr_add (dst, (unsigned char) 0xFD);
1835 }
1800 } 1836 }
1801 break; 1837 break;
1802 1838
1803 case UNICODE_UCS_4: 1839 case UNICODE_UCS_4:
1840 case UNICODE_UTF_32:
1804 if (little_endian) 1841 if (little_endian)
1805 { 1842 {
1806 Dynarr_add (dst, (unsigned char) (code & 255)); 1843 if (write_error_characters_as_such &&
1807 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); 1844 code >= UNICODE_ERROR_OCTET_RANGE_START &&
1808 Dynarr_add (dst, (unsigned char) ((code >> 16) & 255)); 1845 code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
1809 Dynarr_add (dst, (unsigned char) (code >> 24)); 1846 {
1847 Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
1848 }
1849 else
1850 {
1851 /* We generate and accept incorrect sequences here, which is
1852 okay, in the interest of preservation of the user's
1853 data. */
1854 Dynarr_add (dst, (unsigned char) (code & 255));
1855 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
1856 Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
1857 Dynarr_add (dst, (unsigned char) (code >> 24));
1858 }
1810 } 1859 }
1811 else 1860 else
1812 { 1861 {
1813 Dynarr_add (dst, (unsigned char) (code >> 24)); 1862 if (write_error_characters_as_such &&
1814 Dynarr_add (dst, (unsigned char) ((code >> 16) & 255)); 1863 code >= UNICODE_ERROR_OCTET_RANGE_START &&
1815 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); 1864 code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
1816 Dynarr_add (dst, (unsigned char) (code & 255)); 1865 {
1866 Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
1867 }
1868 else
1869 {
1870 /* We generate and accept incorrect sequences here, which is okay,
1871 in the interest of preservation of the user's data. */
1872 Dynarr_add (dst, (unsigned char) (code >> 24));
1873 Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
1874 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
1875 Dynarr_add (dst, (unsigned char) (code & 255));
1876 }
1817 } 1877 }
1818 break; 1878 break;
1819 1879
1820 case UNICODE_UTF_8: 1880 case UNICODE_UTF_8:
1821 if (code <= 0x7f) 1881 if (code <= 0x7f)
1840 Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80)); 1900 Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80));
1841 Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80)); 1901 Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80));
1842 } 1902 }
1843 else if (code <= 0x3ffffff) 1903 else if (code <= 0x3ffffff)
1844 { 1904 {
1845 Dynarr_add (dst, (unsigned char) ((code >> 24) | 0xf8)); 1905
1846 Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80)); 1906 #if !(UNICODE_ERROR_OCTET_RANGE_START > 0x1fffff \
1847 Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80)); 1907 && UNICODE_ERROR_OCTET_RANGE_START < 0x3ffffff)
1848 Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80)); 1908 #error "This code needs to be rewritten. "
1849 Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80)); 1909 #endif
1910 if (write_error_characters_as_such &&
1911 code >= UNICODE_ERROR_OCTET_RANGE_START &&
1912 code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
1913 {
1914 Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
1915 }
1916 else
1917 {
1918 Dynarr_add (dst, (unsigned char) ((code >> 24) | 0xf8));
1919 Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80));
1920 Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80));
1921 Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80));
1922 Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80));
1923 }
1850 } 1924 }
1851 else 1925 else
1852 { 1926 {
1853 Dynarr_add (dst, (unsigned char) ((code >> 30) | 0xfc)); 1927 Dynarr_add (dst, (unsigned char) ((code >> 30) | 0xfc));
1854 Dynarr_add (dst, (unsigned char) (((code >> 24) & 0x3f) | 0x80)); 1928 Dynarr_add (dst, (unsigned char) (((code >> 24) & 0x3f) | 0x80));
1868 /* Also used in mule-coding.c for UTF-8 handling in ISO 2022-oriented 1942 /* Also used in mule-coding.c for UTF-8 handling in ISO 2022-oriented
1869 encodings. */ 1943 encodings. */
1870 void 1944 void
1871 encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h, 1945 encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h,
1872 int USED_IF_MULE (l), unsigned_char_dynarr *dst, 1946 int USED_IF_MULE (l), unsigned_char_dynarr *dst,
1873 enum unicode_type type, unsigned int little_endian) 1947 enum unicode_type type, unsigned int little_endian,
1948 int write_error_characters_as_such)
1874 { 1949 {
1875 #ifdef MULE 1950 #ifdef MULE
1876 int code = ichar_to_unicode (make_ichar (charset, h & 127, l & 127)); 1951 int code = ichar_to_unicode (make_ichar (charset, h & 127, l & 127));
1877 1952
1878 if (code == -1) 1953 if (code == -1)
1894 } 1969 }
1895 #else 1970 #else
1896 int code = h; 1971 int code = h;
1897 #endif /* MULE */ 1972 #endif /* MULE */
1898 1973
1899 encode_unicode_char_1 (code, dst, type, little_endian); 1974 encode_unicode_char_1 (code, dst, type, little_endian,
1975 write_error_characters_as_such);
1900 } 1976 }
1901 1977
1902 static Bytecount 1978 static Bytecount
1903 unicode_convert (struct coding_stream *str, const UExtbyte *src, 1979 unicode_convert (struct coding_stream *str, const UExtbyte *src,
1904 unsigned_char_dynarr *dst, Bytecount n) 1980 unsigned_char_dynarr *dst, Bytecount n)
1913 Bytecount orign = n; 1989 Bytecount orign = n;
1914 1990
1915 if (str->direction == CODING_DECODE) 1991 if (str->direction == CODING_DECODE)
1916 { 1992 {
1917 unsigned char counter = data->counter; 1993 unsigned char counter = data->counter;
1994 unsigned char indicated_length
1995 = data->indicated_length;
1918 1996
1919 while (n--) 1997 while (n--)
1920 { 1998 {
1921 UExtbyte c = *src++; 1999 UExtbyte c = *src++;
1922 2000
1923 switch (type) 2001 switch (type)
1924 { 2002 {
1925 case UNICODE_UTF_8: 2003 case UNICODE_UTF_8:
1926 switch (counter) 2004 if (0 == counter)
1927 { 2005 {
1928 case 0: 2006 if (0 == (c & 0x80))
1929 if (c >= 0xfc) 2007 {
1930 { 2008 /* ASCII. */
1931 ch = c & 0x01; 2009 decode_unicode_char (c, dst, data, ignore_bom);
1932 counter = 5; 2010 }
1933 } 2011 else if (0 == (c & 0x40))
1934 else if (c >= 0xf8) 2012 {
1935 { 2013 /* Highest bit set, second highest not--there's
1936 ch = c & 0x03; 2014 something wrong. */
1937 counter = 4; 2015 DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
1938 } 2016 }
1939 else if (c >= 0xf0) 2017 else if (0 == (c & 0x20))
1940 { 2018 {
1941 ch = c & 0x07; 2019 ch = c & 0x1f;
1942 counter = 3; 2020 counter = 1;
1943 } 2021 indicated_length = 2;
1944 else if (c >= 0xe0) 2022 }
1945 { 2023 else if (0 == (c & 0x10))
1946 ch = c & 0x0f; 2024 {
1947 counter = 2; 2025 ch = c & 0x0f;
1948 } 2026 counter = 2;
1949 else if (c >= 0xc0) 2027 indicated_length = 3;
1950 { 2028 }
1951 ch = c & 0x1f; 2029 else if (0 == (c & 0x08))
1952 counter = 1; 2030 {
1953 } 2031 ch = c & 0x0f;
1954 else 2032 counter = 3;
1955 decode_unicode_char (c, dst, data, ignore_bom); 2033 indicated_length = 4;
1956 break; 2034 }
1957 case 1: 2035 else
1958 ch = (ch << 6) | (c & 0x3f); 2036 {
1959 decode_unicode_char (ch, dst, data, ignore_bom); 2037 /* We don't support lengths longer than 4 in
1960 ch = 0; 2038 external-format data. */
1961 counter = 0; 2039 DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
1962 break; 2040
1963 default: 2041 }
1964 ch = (ch << 6) | (c & 0x3f); 2042 }
1965 counter--; 2043 else
2044 {
2045 /* counter != 0 */
2046 if ((0 == (c & 0x80)) || (0 != (c & 0x40)))
2047 {
2048 indicate_invalid_utf_8(indicated_length,
2049 counter,
2050 ch, dst, data, ignore_bom);
2051 if (c & 0x80)
2052 {
2053 DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
2054 }
2055 else
2056 {
2057 /* The character just read is ASCII. Treat it as
2058 such. */
2059 decode_unicode_char (c, dst, data, ignore_bom);
2060 }
2061 ch = 0;
2062 counter = 0;
2063 }
2064 else
2065 {
2066 ch = (ch << 6) | (c & 0x3f);
2067 counter--;
2068 /* Just processed the final byte. Emit the character. */
2069 if (!counter)
2070 {
2071 /* Don't accept over-long sequences, surrogates,
2072 or codes above #x10FFFF. */
2073 if ((ch < 0x80) ||
2074 ((ch < 0x800) && indicated_length > 2) ||
2075 ((ch < 0x10000) && indicated_length > 3) ||
2076 valid_utf_16_surrogate(ch) || (ch > 0x110000))
2077 {
2078 indicate_invalid_utf_8(indicated_length,
2079 counter,
2080 ch, dst, data,
2081 ignore_bom);
2082 }
2083 else
2084 {
2085 decode_unicode_char (ch, dst, data, ignore_bom);
2086 }
2087 ch = 0;
2088 }
2089 }
1966 } 2090 }
1967 break; 2091 break;
1968 2092
1969 case UNICODE_UTF_16: 2093 case UNICODE_UTF_16:
1970 2094
1971 if (little_endian) 2095 if (little_endian)
1972 ch = (c << counter) | ch; 2096 ch = (c << counter) | ch;
1973 else 2097 else
1974 ch = (ch << 8) | c; 2098 ch = (ch << 8) | c;
2099
1975 counter += 8; 2100 counter += 8;
1976 2101
1977 if (counter == 16 && valid_utf_16_first_surrogate(ch)) 2102 if (16 == counter)
1978 break; 2103 {
1979
1980 if (counter == 16)
1981 {
1982 int tempch = ch; 2104 int tempch = ch;
2105
2106 if (valid_utf_16_first_surrogate(ch))
2107 {
2108 break;
2109 }
1983 ch = 0; 2110 ch = 0;
1984 counter = 0; 2111 counter = 0;
1985 decode_unicode_char (tempch, dst, data, ignore_bom); 2112 decode_unicode_char (tempch, dst, data, ignore_bom);
1986 } 2113 }
1987 if (counter == 32) 2114 else if (32 == counter)
1988 { 2115 {
1989 int tempch; 2116 int tempch;
1990 /* #### Signalling an error may be a bit extreme. Should 2117
1991 we try and read it in anyway? */ 2118 if (!valid_utf_16_last_surrogate(ch & 0xFFFF))
1992 if (!valid_utf_16_first_surrogate(ch >> 16)
1993 || !valid_utf_16_last_surrogate(ch & 0xFFFF))
1994 { 2119 {
1995 signal_error(Qtext_conversion_error, 2120 DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
1996 "Invalid UTF-16 surrogate sequence", 2121 ignore_bom);
1997 Qunbound); 2122 DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
2123 ignore_bom);
2124 DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
2125 ignore_bom);
2126 DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
2127 ignore_bom);
1998 } 2128 }
1999 tempch = utf_16_surrogates_to_code((ch >> 16), 2129 else
2000 (ch & 0xffff)); 2130 {
2131 tempch = utf_16_surrogates_to_code((ch >> 16),
2132 (ch & 0xffff));
2133 decode_unicode_char(tempch, dst, data, ignore_bom);
2134 }
2001 ch = 0; 2135 ch = 0;
2002 counter = 0; 2136 counter = 0;
2003 decode_unicode_char(tempch, dst, data, ignore_bom); 2137 }
2004 } 2138 else
2139 assert(8 == counter || 24 == counter);
2005 break; 2140 break;
2006 2141
2007 case UNICODE_UCS_4: 2142 case UNICODE_UCS_4:
2143 case UNICODE_UTF_32:
2008 if (little_endian) 2144 if (little_endian)
2009 ch = (c << counter) | ch; 2145 ch = (c << counter) | ch;
2010 else 2146 else
2011 ch = (ch << 8) | c; 2147 ch = (ch << 8) | c;
2012 counter += 8; 2148 counter += 8;
2013 if (counter == 32) 2149 if (counter == 32)
2014 { 2150 {
2015 int tempch = ch; 2151 if (ch > 0x10ffff)
2152 {
2153 /* ch is not a legal Unicode character. We're fine
2154 with that in UCS-4, though not in UTF-32. */
2155 if (UNICODE_UCS_4 == type && ch < 0x80000000)
2156 {
2157 decode_unicode_char (ch, dst, data, ignore_bom);
2158 }
2159 else if (little_endian)
2160 {
2161 DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
2162 ignore_bom);
2163 DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
2164 ignore_bom);
2165 DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
2166 ignore_bom);
2167 DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
2168 ignore_bom);
2169 }
2170 else
2171 {
2172 DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
2173 ignore_bom);
2174 DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
2175 ignore_bom);
2176 DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
2177 ignore_bom);
2178 DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
2179 ignore_bom);
2180 }
2181 }
2182 else
2183 {
2184 decode_unicode_char (ch, dst, data, ignore_bom);
2185 }
2016 ch = 0; 2186 ch = 0;
2017 counter = 0; 2187 counter = 0;
2018 if (tempch < 0)
2019 {
2020 /* !!#### indicate an error */
2021 tempch = '~';
2022 }
2023 decode_unicode_char (tempch, dst, data, ignore_bom);
2024 } 2188 }
2025 break; 2189 break;
2026 2190
2027 case UNICODE_UTF_7: 2191 case UNICODE_UTF_7:
2028 ABORT (); 2192 ABORT ();
2030 2194
2031 default: ABORT (); 2195 default: ABORT ();
2032 } 2196 }
2033 2197
2034 } 2198 }
2035 if (str->eof) 2199
2036 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst); 2200 if (str->eof && ch)
2201 {
2202 switch (type)
2203 {
2204 case UNICODE_UTF_8:
2205 indicate_invalid_utf_8(indicated_length,
2206 counter, ch, dst, data,
2207 ignore_bom);
2208 break;
2209
2210 case UNICODE_UTF_16:
2211 case UNICODE_UCS_4:
2212 case UNICODE_UTF_32:
2213 if (8 == counter)
2214 {
2215 DECODE_ERROR_OCTET (ch, dst, data, ignore_bom);
2216 }
2217 else if (16 == counter)
2218 {
2219 if (little_endian)
2220 {
2221 DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom);
2222 DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
2223 ignore_bom);
2224 }
2225 else
2226 {
2227 DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
2228 ignore_bom);
2229 DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom);
2230 }
2231 }
2232 else if (24 == counter)
2233 {
2234 if (little_endian)
2235 {
2236 DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
2237 ignore_bom);
2238 DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom);
2239 DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
2240 ignore_bom);
2241 }
2242 else
2243 {
2244 DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
2245 ignore_bom);
2246 DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
2247 ignore_bom);
2248 DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
2249 ignore_bom);
2250 }
2251 }
2252 else assert(0);
2253 break;
2254 }
2255 ch = 0;
2256 }
2037 2257
2038 data->counter = counter; 2258 data->counter = counter;
2259 data->indicated_length = indicated_length;
2039 } 2260 }
2040 else 2261 else
2041 { 2262 {
2042 unsigned char char_boundary = data->current_char_boundary; 2263 unsigned char char_boundary = data->current_char_boundary;
2043 Lisp_Object charset = data->current_charset; 2264 Lisp_Object charset = data->current_charset;
2052 back_to_square_n: 2273 back_to_square_n:
2053 #endif /* ENABLE_COMPOSITE_CHARS */ 2274 #endif /* ENABLE_COMPOSITE_CHARS */
2054 2275
2055 if (XCODING_SYSTEM_UNICODE_NEED_BOM (str->codesys) && !data->wrote_bom) 2276 if (XCODING_SYSTEM_UNICODE_NEED_BOM (str->codesys) && !data->wrote_bom)
2056 { 2277 {
2057 encode_unicode_char_1 (0xFEFF, dst, type, little_endian); 2278 encode_unicode_char_1 (0xFEFF, dst, type, little_endian, 1);
2058 data->wrote_bom = 1; 2279 data->wrote_bom = 1;
2059 } 2280 }
2060 2281
2061 while (n--) 2282 while (n--)
2062 { 2283 {
2066 if (byte_ascii_p (c)) 2287 if (byte_ascii_p (c))
2067 #endif /* MULE */ 2288 #endif /* MULE */
2068 { /* Processing ASCII character */ 2289 { /* Processing ASCII character */
2069 ch = 0; 2290 ch = 0;
2070 encode_unicode_char (Vcharset_ascii, c, 0, dst, type, 2291 encode_unicode_char (Vcharset_ascii, c, 0, dst, type,
2071 little_endian); 2292 little_endian, 1);
2072 2293
2073 char_boundary = 1; 2294 char_boundary = 1;
2074 } 2295 }
2075 #ifdef MULE 2296 #ifdef MULE
2076 else if (ibyte_leading_byte_p (c) || ibyte_leading_byte_p (ch)) 2297 else if (ibyte_leading_byte_p (c) || ibyte_leading_byte_p (ch))
2090 (Info-goto-node "(internals)Internal String Encoding") 2311 (Info-goto-node "(internals)Internal String Encoding")
2091 2312
2092 for the rationale behind subtracting #xa0 from the 2313 for the rationale behind subtracting #xa0 from the
2093 character's code. */ 2314 character's code. */
2094 encode_unicode_char (Vcharset_control_1, c - 0xa0, 0, dst, 2315 encode_unicode_char (Vcharset_control_1, c - 0xa0, 0, dst,
2095 type, little_endian); 2316 type, little_endian, 1);
2096 else 2317 else
2097 { 2318 {
2098 switch (XCHARSET_REP_BYTES (charset)) 2319 switch (XCHARSET_REP_BYTES (charset))
2099 { 2320 {
2100 case 2: 2321 case 2:
2101 encode_unicode_char (charset, c, 0, dst, type, 2322 encode_unicode_char (charset, c, 0, dst, type,
2102 little_endian); 2323 little_endian, 1);
2103 break; 2324 break;
2104 case 3: 2325 case 3:
2105 if (XCHARSET_PRIVATE_P (charset)) 2326 if (XCHARSET_PRIVATE_P (charset))
2106 { 2327 {
2107 encode_unicode_char (charset, c, 0, dst, type, 2328 encode_unicode_char (charset, c, 0, dst, type,
2108 little_endian); 2329 little_endian, 1);
2109 ch = 0; 2330 ch = 0;
2110 } 2331 }
2111 else if (ch) 2332 else if (ch)
2112 { 2333 {
2113 #ifdef ENABLE_COMPOSITE_CHARS 2334 #ifdef ENABLE_COMPOSITE_CHARS
2117 { 2338 {
2118 /* #### Bother! We don't know how to 2339 /* #### Bother! We don't know how to
2119 handle this yet. */ 2340 handle this yet. */
2120 encode_unicode_char (Vcharset_ascii, '~', 0, 2341 encode_unicode_char (Vcharset_ascii, '~', 0,
2121 dst, type, 2342 dst, type,
2122 little_endian); 2343 little_endian, 1);
2123 } 2344 }
2124 else 2345 else
2125 { 2346 {
2126 Ichar emch = make_ichar (Vcharset_composite, 2347 Ichar emch = make_ichar (Vcharset_composite,
2127 ch & 0x7F, 2348 ch & 0x7F,
2136 } 2357 }
2137 } 2358 }
2138 else 2359 else
2139 #endif /* ENABLE_COMPOSITE_CHARS */ 2360 #endif /* ENABLE_COMPOSITE_CHARS */
2140 encode_unicode_char (charset, ch, c, dst, type, 2361 encode_unicode_char (charset, ch, c, dst, type,
2141 little_endian); 2362 little_endian, 1);
2142 ch = 0; 2363 ch = 0;
2143 } 2364 }
2144 else 2365 else
2145 { 2366 {
2146 ch = c; 2367 ch = c;
2149 break; 2370 break;
2150 case 4: 2371 case 4:
2151 if (ch) 2372 if (ch)
2152 { 2373 {
2153 encode_unicode_char (charset, ch, c, dst, type, 2374 encode_unicode_char (charset, ch, c, dst, type,
2154 little_endian); 2375 little_endian, 1);
2155 ch = 0; 2376 ch = 0;
2156 } 2377 }
2157 else 2378 else
2158 { 2379 {
2159 ch = c; 2380 ch = c;
2519 type = UNICODE_UTF_16; 2740 type = UNICODE_UTF_16;
2520 else if (EQ (value, Qutf_7)) 2741 else if (EQ (value, Qutf_7))
2521 type = UNICODE_UTF_7; 2742 type = UNICODE_UTF_7;
2522 else if (EQ (value, Qucs_4)) 2743 else if (EQ (value, Qucs_4))
2523 type = UNICODE_UCS_4; 2744 type = UNICODE_UCS_4;
2745 else if (EQ (value, Qutf_32))
2746 type = UNICODE_UTF_32;
2524 else 2747 else
2525 invalid_constant ("Invalid Unicode type", key); 2748 invalid_constant ("Invalid Unicode type", key);
2526 2749
2527 XCODING_SYSTEM_UNICODE_TYPE (codesys) = type; 2750 XCODING_SYSTEM_UNICODE_TYPE (codesys) = type;
2528 } 2751 }
2544 { 2767 {
2545 case UNICODE_UTF_16: return Qutf_16; 2768 case UNICODE_UTF_16: return Qutf_16;
2546 case UNICODE_UTF_8: return Qutf_8; 2769 case UNICODE_UTF_8: return Qutf_8;
2547 case UNICODE_UTF_7: return Qutf_7; 2770 case UNICODE_UTF_7: return Qutf_7;
2548 case UNICODE_UCS_4: return Qucs_4; 2771 case UNICODE_UCS_4: return Qucs_4;
2772 case UNICODE_UTF_32: return Qutf_32;
2549 default: ABORT (); 2773 default: ABORT ();
2550 } 2774 }
2551 } 2775 }
2552 else if (EQ (prop, Qlittle_endian)) 2776 else if (EQ (prop, Qlittle_endian))
2553 return XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN (coding_system) ? Qt : Qnil; 2777 return XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN (coding_system) ? Qt : Qnil;
2618 DEFSUBR (Funicode_to_char); 2842 DEFSUBR (Funicode_to_char);
2619 2843
2620 DEFSYMBOL (Qunicode); 2844 DEFSYMBOL (Qunicode);
2621 DEFSYMBOL (Qucs_4); 2845 DEFSYMBOL (Qucs_4);
2622 DEFSYMBOL (Qutf_16); 2846 DEFSYMBOL (Qutf_16);
2847 DEFSYMBOL (Qutf_32);
2623 DEFSYMBOL (Qutf_8); 2848 DEFSYMBOL (Qutf_8);
2624 DEFSYMBOL (Qutf_7); 2849 DEFSYMBOL (Qutf_7);
2625 2850
2626 DEFSYMBOL (Qneed_bom); 2851 DEFSYMBOL (Qneed_bom);
2627 2852