Mercurial > hg > xemacs-beta
comparison src/mule-coding.c @ 4096:1abf84db2c7f
[xemacs-hg @ 2007-08-04 20:00:10 by aidan]
Preserve invalid UTF-8, UTF-16 sequences on encoding, decoding.
author | aidan |
---|---|
date | Sat, 04 Aug 2007 20:00:24 +0000 |
parents | 42e4605ef1de |
children | 383ab474a241 |
comparison
equal
deleted
inserted
replaced
4095:bff7e065cfdc | 4096:1abf84db2c7f |
---|---|
102 unsigned_char_dynarr *dst) | 102 unsigned_char_dynarr *dst) |
103 { | 103 { |
104 if (XCHARSET_ENCODE_AS_UTF_8 (charset)) | 104 if (XCHARSET_ENCODE_AS_UTF_8 (charset)) |
105 { | 105 { |
106 encode_unicode_char (charset, c & charmask, 0, | 106 encode_unicode_char (charset, c & charmask, 0, |
107 dst, UNICODE_UTF_8, 0); | 107 dst, UNICODE_UTF_8, 0, 0); |
108 } | 108 } |
109 else | 109 else |
110 { | 110 { |
111 Dynarr_add (dst, c & charmask); | 111 Dynarr_add (dst, c & charmask); |
112 } | 112 } |
121 if (XCHARSET_ENCODE_AS_UTF_8 (charset)) | 121 if (XCHARSET_ENCODE_AS_UTF_8 (charset)) |
122 { | 122 { |
123 encode_unicode_char (charset, | 123 encode_unicode_char (charset, |
124 ch & charmask, | 124 ch & charmask, |
125 c & charmask, dst, | 125 c & charmask, dst, |
126 UNICODE_UTF_8, 0); | 126 UNICODE_UTF_8, 0, 0); |
127 } | 127 } |
128 else | 128 else |
129 { | 129 { |
130 Dynarr_add (dst, ch & charmask); | 130 Dynarr_add (dst, ch & charmask); |
131 Dynarr_add (dst, c & charmask); | 131 Dynarr_add (dst, c & charmask); |
967 int current_half; | 967 int current_half; |
968 int current_char_boundary; | 968 int current_char_boundary; |
969 | 969 |
970 /* Used for handling UTF-8. */ | 970 /* Used for handling UTF-8. */ |
971 unsigned char counter; | 971 unsigned char counter; |
972 unsigned char indicated_length; | |
972 }; | 973 }; |
973 | 974 |
974 static const struct memory_description ccs_description_1[] = | 975 static const struct memory_description ccs_description_1[] = |
975 { | 976 { |
976 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) }, | 977 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) }, |
1799 Dynarr_add (dst, ISO_CODE_CSI); | 1800 Dynarr_add (dst, ISO_CODE_CSI); |
1800 Dynarr_add (dst, '2'); | 1801 Dynarr_add (dst, '2'); |
1801 Dynarr_add (dst, ']'); | 1802 Dynarr_add (dst, ']'); |
1802 if (flags) | 1803 if (flags) |
1803 *flags |= ISO_STATE_R2L; | 1804 *flags |= ISO_STATE_R2L; |
1805 } | |
1806 } | |
1807 | |
1808 /* Convert the Unicode code point UCS to a character with Funicode_to_char and append that character's internal-format bytes to DST. Note that this name conflicts with a function in unicode.c. */ | |
1809 static void | |
1810 decode_unicode_char (int ucs, unsigned_char_dynarr *dst) | |
1811 { | |
1812 Ibyte work[MAX_ICHAR_LEN]; | |
1813 int len; | |
1814 Lisp_Object chr; | |
1815 | |
1816 chr = Funicode_to_char(make_int(ucs), Qnil); | |
1817 assert (!NILP(chr)); /* The conversion is required to succeed; a nil result aborts here. */ | |
1818 len = set_itext_ichar (work, XCHAR(chr)); /* len is the value returned by set_itext_ichar, used below as the byte count to append. */ | |
1819 Dynarr_add_many (dst, work, len); | |
1820 } | |
1821 /* Append to DST, via decode_unicode_char, the character for code point OCTET + UNICODE_ERROR_OCTET_RANGE_START. NOTE(review): UNICODE_ERROR_OCTET_RANGE_START is defined elsewhere; presumably it is the start of a code point range reserved for representing undecodable octets -- confirm. */ | |
1822 #define DECODE_ERROR_OCTET(octet, dst) \ | |
1823 decode_unicode_char ((octet) + UNICODE_ERROR_OCTET_RANGE_START, dst) | |
1824 /* Re-emit, as individual error octets, the octets of a UTF-8 sequence that turned out to be invalid. INDICATED_LENGTH is the sequence length announced by the lead octet, COUNTER is the number of octets still expected, CH holds the payload bits accumulated so far (6 bits per continuation octet), and DST receives the output. */ | |
1825 static inline void | |
1826 indicate_invalid_utf_8 (unsigned char indicated_length, | |
1827 unsigned char counter, | |
1828 int ch, unsigned_char_dynarr *dst) | |
1829 { | |
1830 Binbyte stored = indicated_length - counter; /* Number of octets already folded into CH. */ | |
1831 Binbyte mask = "\x00\x00\xC0\xE0\xF0\xF8\xFC"[indicated_length]; /* Marker bits of the lead octet for this length: 2 -> 0xC0, 3 -> 0xE0, 4 -> 0xF0, 5 -> 0xF8, 6 -> 0xFC. */ | |
1832 | |
1833 while (stored > 0) | |
1834 { | |
1835 DECODE_ERROR_OCTET (((ch >> (6 * (stored - 1))) & 0x3f) | mask, /* Take the most significant remaining 6-bit group and OR the marker bits back in to rebuild the octet. */ | |
1836 dst); | |
1837 mask = 0x80, stored--; /* Every octet after the first is rebuilt with the continuation marker 0x80. */ | |
1804 } | 1838 } |
1805 } | 1839 } |
1806 | 1840 |
1807 /* Convert ISO2022-format data to internal format. */ | 1841 /* Convert ISO2022-format data to internal format. */ |
1808 | 1842 |
1905 ch = 0; | 1939 ch = 0; |
1906 } | 1940 } |
1907 else if (flags & ISO_STATE_UTF_8) | 1941 else if (flags & ISO_STATE_UTF_8) |
1908 { | 1942 { |
1909 unsigned char counter = data->counter; | 1943 unsigned char counter = data->counter; |
1910 Ibyte work[MAX_ICHAR_LEN]; | 1944 unsigned char indicated_length = data->indicated_length; |
1911 int len; | |
1912 Lisp_Object chr; | |
1913 | 1945 |
1914 if (ISO_CODE_ESC == c) | 1946 if (ISO_CODE_ESC == c) |
1915 { | 1947 { |
1916 /* Allow the escape sequence parser to end the UTF-8 state. */ | 1948 /* Allow the escape sequence parser to end the UTF-8 state. */ |
1917 flags |= ISO_STATE_ESCAPE; | 1949 flags |= ISO_STATE_ESCAPE; |
1918 data->esc = ISO_ESC; | 1950 data->esc = ISO_ESC; |
1919 data->esc_bytes_index = 1; | 1951 data->esc_bytes_index = 1; |
1920 continue; | 1952 continue; |
1921 } | 1953 } |
1922 | 1954 |
1923 switch (counter) | 1955 if (0 == counter) |
1924 { | 1956 { |
1925 case 0: | 1957 if (0 == (c & 0x80)) |
1926 if (c >= 0xfc) | 1958 { |
1927 { | 1959 /* ASCII. */ |
1928 ch = c & 0x01; | 1960 decode_unicode_char (c, dst); |
1929 counter = 5; | 1961 } |
1930 } | 1962 else if (0 == (c & 0x40)) |
1931 else if (c >= 0xf8) | 1963 { |
1932 { | 1964 /* Highest bit set, second highest not--there's |
1933 ch = c & 0x03; | 1965 something wrong. */ |
1934 counter = 4; | 1966 DECODE_ERROR_OCTET (c, dst); |
1935 } | 1967 } |
1936 else if (c >= 0xf0) | 1968 else if (0 == (c & 0x20)) |
1937 { | 1969 { |
1938 ch = c & 0x07; | 1970 ch = c & 0x1f; |
1939 counter = 3; | 1971 counter = 1; |
1940 } | 1972 indicated_length = 2; |
1941 else if (c >= 0xe0) | 1973 } |
1942 { | 1974 else if (0 == (c & 0x10)) |
1943 ch = c & 0x0f; | 1975 { |
1944 counter = 2; | 1976 ch = c & 0x0f; |
1945 } | 1977 counter = 2; |
1946 else if (c >= 0xc0) | 1978 indicated_length = 3; |
1947 { | 1979 } |
1948 ch = c & 0x1f; | 1980 else if (0 == (c & 0x08)) |
1949 counter = 1; | 1981 { |
1950 } | 1982 ch = c & 0x0f; |
1951 else | 1983 counter = 3; |
1952 /* ASCII, or the lower control characters. | 1984 indicated_length = 4; |
1953 | 1985 } |
1954 Perhaps we should signal an error if the character is in | 1986 /* We support lengths longer than 4 here, since we want to |
1955 the range 0x80-0xc0; this is illegal UTF-8. */ | 1987 represent UTF-8 error chars as distinct from the |
1956 Dynarr_add (dst, (c & 0x7f)); | 1988 corresponding ISO 8859-1 characters in escape-quoted. |
1957 | 1989 |
1958 break; | 1990 However, we can't differentiate UTF-8 error chars as |
1959 case 1: | 1991 written to disk, and UTF-8 errors in escape-quoted. This |
1960 ch = (ch << 6) | (c & 0x3f); | 1992 is not a big problem; |
1961 chr = Funicode_to_char(make_int(ch), Qnil); | 1993 non-Unicode-chars-encoded-as-UTF-8-in-ISO-2022 is not |
1962 | 1994 deployed, in practice, so if such a sequence of octets |
1963 if (!NILP (chr)) | 1995 occurs, XEmacs generated it. */ |
1964 { | 1996 else if (0 == (c & 0x04)) |
1965 assert(CHARP(chr)); | 1997 { |
1966 len = set_itext_ichar (work, XCHAR(chr)); | 1998 ch = c & 0x03; |
1967 Dynarr_add_many (dst, work, len); | 1999 counter = 4; |
1968 } | 2000 indicated_length = 5; |
1969 else | 2001 } |
1970 { | 2002 else if (0 == (c & 0x02)) |
1971 /* Shouldn't happen, this code should only be enabled in | 2003 { |
1972 XEmacsen with support for all of Unicode. */ | 2004 ch = c & 0x01; |
1973 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208); | 2005 counter = 5; |
1974 Dynarr_add (dst, 34 + 128); | 2006 indicated_length = 6; |
1975 Dynarr_add (dst, 46 + 128); | 2007 } |
1976 } | 2008 else |
1977 | 2009 { |
1978 ch = 0; | 2010 /* #xFF is not a valid leading byte in any form of |
1979 counter = 0; | 2011 UTF-8. */ |
1980 break; | 2012 DECODE_ERROR_OCTET (c, dst); |
1981 default: | 2013 |
1982 ch = (ch << 6) | (c & 0x3f); | 2014 } |
1983 counter--; | 2015 } |
1984 } | 2016 else |
1985 | 2017 { |
1986 if (str->eof) | 2018 /* counter != 0 */ |
1987 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst); | 2019 if ((0 == (c & 0x80)) || (0 != (c & 0x40))) |
2020 { | |
2021 indicate_invalid_utf_8(indicated_length, | |
2022 counter, | |
2023 ch, dst); | |
2024 if (c & 0x80) | |
2025 { | |
2026 DECODE_ERROR_OCTET (c, dst); | |
2027 } | |
2028 else | |
2029 { | |
2030 /* The character just read is ASCII. Treat it as | |
2031 such. */ | |
2032 decode_unicode_char (c, dst); | |
2033 } | |
2034 ch = 0; | |
2035 counter = 0; | |
2036 } | |
2037 else | |
2038 { | |
2039 ch = (ch << 6) | (c & 0x3f); | |
2040 counter--; | |
2041 | |
2042 /* Just processed the final byte. Emit the character. */ | |
2043 if (!counter) | |
2044 { | |
2045 /* Don't accept over-long sequences, or surrogates. */ | |
2046 if ((ch < 0x80) || | |
2047 ((ch < 0x800) && indicated_length > 2) || | |
2048 ((ch < 0x10000) && indicated_length > 3) || | |
2049 /* We accept values above #x110000 in | |
2050 escape-quoted, though not in UTF-8. */ | |
2051 /* (ch > 0x110000) || */ | |
2052 valid_utf_16_surrogate(ch)) | |
2053 { | |
2054 indicate_invalid_utf_8(indicated_length, | |
2055 counter, | |
2056 ch, dst); | |
2057 } | |
2058 else | |
2059 { | |
2060 decode_unicode_char (ch, dst); | |
2061 } | |
2062 ch = 0; | |
2063 } | |
2064 } | |
2065 } | |
2066 | |
2067 if (str->eof && ch) | |
2068 { | |
2069 DECODE_ERROR_OCTET (ch, dst); | |
2070 ch = 0; | |
2071 } | |
1988 | 2072 |
1989 data->counter = counter; | 2073 data->counter = counter; |
2074 data->indicated_length = indicated_length; | |
1990 } | 2075 } |
1991 else if (byte_c0_p (c) || byte_c1_p (c)) | 2076 else if (byte_c0_p (c) || byte_c1_p (c)) |
1992 { /* Control characters */ | 2077 { /* Control characters */ |
1993 | 2078 |
1994 /***** Error-handling *****/ | 2079 /***** Error-handling *****/ |