comparison src/mule-coding.c @ 4096:1abf84db2c7f

[xemacs-hg @ 2007-08-04 20:00:10 by aidan] Preserve invalid UTF-8, UTF-16 sequences on encoding, decoding.
author aidan
date Sat, 04 Aug 2007 20:00:24 +0000
parents 42e4605ef1de
children 383ab474a241
comparison
equal deleted inserted replaced
4095:bff7e065cfdc 4096:1abf84db2c7f
102 unsigned_char_dynarr *dst) 102 unsigned_char_dynarr *dst)
103 { 103 {
104 if (XCHARSET_ENCODE_AS_UTF_8 (charset)) 104 if (XCHARSET_ENCODE_AS_UTF_8 (charset))
105 { 105 {
106 encode_unicode_char (charset, c & charmask, 0, 106 encode_unicode_char (charset, c & charmask, 0,
107 dst, UNICODE_UTF_8, 0); 107 dst, UNICODE_UTF_8, 0, 0);
108 } 108 }
109 else 109 else
110 { 110 {
111 Dynarr_add (dst, c & charmask); 111 Dynarr_add (dst, c & charmask);
112 } 112 }
121 if (XCHARSET_ENCODE_AS_UTF_8 (charset)) 121 if (XCHARSET_ENCODE_AS_UTF_8 (charset))
122 { 122 {
123 encode_unicode_char (charset, 123 encode_unicode_char (charset,
124 ch & charmask, 124 ch & charmask,
125 c & charmask, dst, 125 c & charmask, dst,
126 UNICODE_UTF_8, 0); 126 UNICODE_UTF_8, 0, 0);
127 } 127 }
128 else 128 else
129 { 129 {
130 Dynarr_add (dst, ch & charmask); 130 Dynarr_add (dst, ch & charmask);
131 Dynarr_add (dst, c & charmask); 131 Dynarr_add (dst, c & charmask);
967 int current_half; 967 int current_half;
968 int current_char_boundary; 968 int current_char_boundary;
969 969
970 /* Used for handling UTF-8. */ 970 /* Used for handling UTF-8. */
971 unsigned char counter; 971 unsigned char counter;
972 unsigned char indicated_length;
972 }; 973 };
973 974
974 static const struct memory_description ccs_description_1[] = 975 static const struct memory_description ccs_description_1[] =
975 { 976 {
976 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) }, 977 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
1799 Dynarr_add (dst, ISO_CODE_CSI); 1800 Dynarr_add (dst, ISO_CODE_CSI);
1800 Dynarr_add (dst, '2'); 1801 Dynarr_add (dst, '2');
1801 Dynarr_add (dst, ']'); 1802 Dynarr_add (dst, ']');
1802 if (flags) 1803 if (flags)
1803 *flags |= ISO_STATE_R2L; 1804 *flags |= ISO_STATE_R2L;
1805 }
1806 }
1807
1808 /* Note that this name conflicts with a function in unicode.c.

   Append the character for the Unicode code point UCS to DST.

   UCS is turned into a Lisp character with Funicode_to_char (second
   argument Qnil); the result is asserted to be non-nil, i.e. this
   function does not handle code points that fail to map.  The character
   is then written into a local buffer with set_itext_ichar and the LEN
   bytes it reports are appended to DST. */
1809 static void
1810 decode_unicode_char (int ucs, unsigned_char_dynarr *dst)
1811 {
1812 Ibyte work[MAX_ICHAR_LEN]; /* scratch buffer filled by set_itext_ichar */
1813 int len; /* number of bytes set_itext_ichar reports for WORK */
1814 Lisp_Object chr; /* result of Funicode_to_char for UCS */
1815
1816 chr = Funicode_to_char(make_int(ucs), Qnil);
1817 assert (!NILP(chr)); /* a nil result is treated as a programming error */
1818 len = set_itext_ichar (work, XCHAR(chr));
1819 Dynarr_add_many (dst, work, len);
1820 }
1821
/* Append to DST the stand-in character for the input octet OCTET: the
   code point OCTET + UNICODE_ERROR_OCTET_RANGE_START is handed to
   decode_unicode_char.  OCTET is parenthesized in the expansion and
   evaluated once; callers in this file pass it a single byte value. */
1822 #define DECODE_ERROR_OCTET(octet, dst) \
1823 decode_unicode_char ((octet) + UNICODE_ERROR_OCTET_RANGE_START, dst)
1824
/* Emit, as error octets, the bytes of a UTF-8 sequence that has been
   accumulated into CH but turned out to be invalid.

   INDICATED_LENGTH is the total sequence length announced by the lead
   byte and COUNTER the number of bytes still outstanding, so
   INDICATED_LENGTH - COUNTER octets have been folded into CH, six bits
   per continuation byte.  Each of those octets is rebuilt from CH, most
   significant first, and passed to DECODE_ERROR_OCTET with DST.

   The mask table is indexed directly by INDICATED_LENGTH and only has
   meaningful entries for 2..6; entries 0 and 1 are zero. */
1825 static inline void
1826 indicate_invalid_utf_8 (unsigned char indicated_length,
1827 unsigned char counter,
1828 int ch, unsigned_char_dynarr *dst)
1829 {
1830 Binbyte stored = indicated_length - counter; /* octets already consumed */
/* High-bit prefix of the lead byte for this length:
   2 -> 0xC0, 3 -> 0xE0, 4 -> 0xF0, 5 -> 0xF8, 6 -> 0xFC. */
1831 Binbyte mask = "\x00\x00\xC0\xE0\xF0\xF8\xFC"[indicated_length];
1832
1833 while (stored > 0)
1834 {
/* Take the 6-bit group for this octet and OR in its prefix bits. */
1835 DECODE_ERROR_OCTET (((ch >> (6 * (stored - 1))) & 0x3f) | mask,
1836 dst);
/* Every octet after the first gets the 0x80 continuation prefix. */
1837 mask = 0x80, stored--;
1804 } 1838 }
1805 } 1839 }
1806 1840
1807 /* Convert ISO2022-format data to internal format. */ 1841 /* Convert ISO2022-format data to internal format. */
1808 1842
1905 ch = 0; 1939 ch = 0;
1906 } 1940 }
1907 else if (flags & ISO_STATE_UTF_8) 1941 else if (flags & ISO_STATE_UTF_8)
1908 { 1942 {
1909 unsigned char counter = data->counter; 1943 unsigned char counter = data->counter;
1910 Ibyte work[MAX_ICHAR_LEN]; 1944 unsigned char indicated_length = data->indicated_length;
1911 int len;
1912 Lisp_Object chr;
1913 1945
1914 if (ISO_CODE_ESC == c) 1946 if (ISO_CODE_ESC == c)
1915 { 1947 {
1916 /* Allow the escape sequence parser to end the UTF-8 state. */ 1948 /* Allow the escape sequence parser to end the UTF-8 state. */
1917 flags |= ISO_STATE_ESCAPE; 1949 flags |= ISO_STATE_ESCAPE;
1918 data->esc = ISO_ESC; 1950 data->esc = ISO_ESC;
1919 data->esc_bytes_index = 1; 1951 data->esc_bytes_index = 1;
1920 continue; 1952 continue;
1921 } 1953 }
1922 1954
1923 switch (counter) 1955 if (0 == counter)
1924 { 1956 {
1925 case 0: 1957 if (0 == (c & 0x80))
1926 if (c >= 0xfc) 1958 {
1927 { 1959 /* ASCII. */
1928 ch = c & 0x01; 1960 decode_unicode_char (c, dst);
1929 counter = 5; 1961 }
1930 } 1962 else if (0 == (c & 0x40))
1931 else if (c >= 0xf8) 1963 {
1932 { 1964 /* Highest bit set, second highest not--there's
1933 ch = c & 0x03; 1965 something wrong. */
1934 counter = 4; 1966 DECODE_ERROR_OCTET (c, dst);
1935 } 1967 }
1936 else if (c >= 0xf0) 1968 else if (0 == (c & 0x20))
1937 { 1969 {
1938 ch = c & 0x07; 1970 ch = c & 0x1f;
1939 counter = 3; 1971 counter = 1;
1940 } 1972 indicated_length = 2;
1941 else if (c >= 0xe0) 1973 }
1942 { 1974 else if (0 == (c & 0x10))
1943 ch = c & 0x0f; 1975 {
1944 counter = 2; 1976 ch = c & 0x0f;
1945 } 1977 counter = 2;
1946 else if (c >= 0xc0) 1978 indicated_length = 3;
1947 { 1979 }
1948 ch = c & 0x1f; 1980 else if (0 == (c & 0x08))
1949 counter = 1; 1981 {
1950 } 1982 ch = c & 0x0f;
1951 else 1983 counter = 3;
1952 /* ASCII, or the lower control characters. 1984 indicated_length = 4;
1953 1985 }
1954 Perhaps we should signal an error if the character is in 1986 /* We support lengths longer than 4 here, since we want to
1955 the range 0x80-0xc0; this is illegal UTF-8. */ 1987 represent UTF-8 error chars as distinct from the
1956 Dynarr_add (dst, (c & 0x7f)); 1988 corresponding ISO 8859-1 characters in escape-quoted.
1957 1989
1958 break; 1990 However, we can't differentiate UTF-8 error chars as
1959 case 1: 1991 written to disk, and UTF-8 errors in escape-quoted. This
1960 ch = (ch << 6) | (c & 0x3f); 1992 is not a big problem;
1961 chr = Funicode_to_char(make_int(ch), Qnil); 1993 non-Unicode-chars-encoded-as-UTF-8-in-ISO-2022 is not
1962 1994 deployed, in practice, so if such a sequence of octets
1963 if (!NILP (chr)) 1995 occurs, XEmacs generated it. */
1964 { 1996 else if (0 == (c & 0x04))
1965 assert(CHARP(chr)); 1997 {
1966 len = set_itext_ichar (work, XCHAR(chr)); 1998 ch = c & 0x03;
1967 Dynarr_add_many (dst, work, len); 1999 counter = 4;
1968 } 2000 indicated_length = 5;
1969 else 2001 }
1970 { 2002 else if (0 == (c & 0x02))
1971 /* Shouldn't happen, this code should only be enabled in 2003 {
1972 XEmacsen with support for all of Unicode. */ 2004 ch = c & 0x01;
1973 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208); 2005 counter = 5;
1974 Dynarr_add (dst, 34 + 128); 2006 indicated_length = 6;
1975 Dynarr_add (dst, 46 + 128); 2007 }
1976 } 2008 else
1977 2009 {
1978 ch = 0; 2010 /* #xFF is not a valid leading byte in any form of
1979 counter = 0; 2011 UTF-8. */
1980 break; 2012 DECODE_ERROR_OCTET (c, dst);
1981 default: 2013
1982 ch = (ch << 6) | (c & 0x3f); 2014 }
1983 counter--; 2015 }
1984 } 2016 else
1985 2017 {
1986 if (str->eof) 2018 /* counter != 0 */
1987 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst); 2019 if ((0 == (c & 0x80)) || (0 != (c & 0x40)))
2020 {
2021 indicate_invalid_utf_8(indicated_length,
2022 counter,
2023 ch, dst);
2024 if (c & 0x80)
2025 {
2026 DECODE_ERROR_OCTET (c, dst);
2027 }
2028 else
2029 {
2030 /* The character just read is ASCII. Treat it as
2031 such. */
2032 decode_unicode_char (c, dst);
2033 }
2034 ch = 0;
2035 counter = 0;
2036 }
2037 else
2038 {
2039 ch = (ch << 6) | (c & 0x3f);
2040 counter--;
2041
2042 /* Just processed the final byte. Emit the character. */
2043 if (!counter)
2044 {
2045 /* Don't accept over-long sequences, or surrogates. */
2046 if ((ch < 0x80) ||
2047 ((ch < 0x800) && indicated_length > 2) ||
2048 ((ch < 0x10000) && indicated_length > 3) ||
2049 /* We accept values above #x110000 in
2050 escape-quoted, though not in UTF-8. */
2051 /* (ch > 0x110000) || */
2052 valid_utf_16_surrogate(ch))
2053 {
2054 indicate_invalid_utf_8(indicated_length,
2055 counter,
2056 ch, dst);
2057 }
2058 else
2059 {
2060 decode_unicode_char (ch, dst);
2061 }
2062 ch = 0;
2063 }
2064 }
2065 }
2066
2067 if (str->eof && ch)
2068 {
2069 DECODE_ERROR_OCTET (ch, dst);
2070 ch = 0;
2071 }
1988 2072
1989 data->counter = counter; 2073 data->counter = counter;
2074 data->indicated_length = indicated_length;
1990 } 2075 }
1991 else if (byte_c0_p (c) || byte_c1_p (c)) 2076 else if (byte_c0_p (c) || byte_c1_p (c))
1992 { /* Control characters */ 2077 { /* Control characters */
1993 2078
1994 /***** Error-handling *****/ 2079 /***** Error-handling *****/