Mercurial > hg > xemacs-beta
comparison src/unicode.c @ 1267:c57f32e44416
[xemacs-hg @ 2003-02-07 01:43:05 by ben]
fix unicode/utf misdetection
unicode.c: The UTF-8 detector was totally bogus and detected almost anything as UTF-8.
The UTF-16 detector got overly happy with 0x2028's (LINE SEPARATOR, whose two
bytes are also space + left-paren in ASCII, which is extremely common).
author | ben |
---|---|
date | Fri, 07 Feb 2003 01:43:07 +0000 |
parents | e22b0213b713 |
children | f3437b56874d |
comparison
equal
deleted
inserted
replaced
1266:b5a5863da615 | 1267:c57f32e44416 |
---|---|
1 /* Code to handle Unicode conversion. | 1 /* Code to handle Unicode conversion. |
2 Copyright (C) 2000, 2001, 2002 Ben Wing. | 2 Copyright (C) 2000, 2001, 2002, 2003 Ben Wing. |
3 | 3 |
4 This file is part of XEmacs. | 4 This file is part of XEmacs. |
5 | 5 |
6 XEmacs is free software; you can redistribute it and/or modify it | 6 XEmacs is free software; you can redistribute it and/or modify it |
7 under the terms of the GNU General Public License as published by the | 7 under the terms of the GNU General Public License as published by the |
1963 unsigned int seen_forward_bom:1; | 1963 unsigned int seen_forward_bom:1; |
1964 unsigned int seen_rev_bom:1; | 1964 unsigned int seen_rev_bom:1; |
1965 int byteno; | 1965 int byteno; |
1966 int prev_char; | 1966 int prev_char; |
1967 int text, rev_text; | 1967 int text, rev_text; |
1968 int sep, rev_sep; | |
1969 int num_ascii; | |
1968 }; | 1970 }; |
1969 | 1971 |
1970 static void | 1972 static void |
1971 utf_16_detect (struct detection_state *st, const UExtbyte *src, | 1973 utf_16_detect (struct detection_state *st, const UExtbyte *src, |
1972 Bytecount n) | 1974 Bytecount n) |
1992 data->text++; | 1994 data->text++; |
1993 if (c == 0 | 1995 if (c == 0 |
1994 && (prevc == '\r' || prevc == '\n' | 1996 && (prevc == '\r' || prevc == '\n' |
1995 || (prevc >= 0x20 && prevc <= 0x7E))) | 1997 || (prevc >= 0x20 && prevc <= 0x7E))) |
1996 data->rev_text++; | 1998 data->rev_text++; |
1999 /* #### 0x2028 is LINE SEPARATOR and 0x2029 is PARAGRAPH SEPARATOR. | |
2000 I used to count these in text and rev_text but that is very bad, | |
2001 as 0x2028 is also space + left-paren in ASCII, which is extremely | |
2002 common. So, what do we do with these? */ | |
1997 if (prevc == 0x20 && (c == 0x28 || c == 0x29)) | 2003 if (prevc == 0x20 && (c == 0x28 || c == 0x29)) |
1998 data->text++; | 2004 data->sep++; |
1999 if (c == 0x20 && (prevc == 0x28 || prevc == 0x29)) | 2005 if (c == 0x20 && (prevc == 0x28 || prevc == 0x29)) |
2000 data->rev_text++; | 2006 data->rev_sep++; |
2001 } | 2007 } |
2002 | 2008 |
2009 if ((c >= ' ' && c <= '~') || c == '\n' || c == '\r' || c == '\t' || | |
2010 c == '\f' || c == '\v') | |
2011 data->num_ascii++; | |
2003 data->byteno++; | 2012 data->byteno++; |
2004 data->prev_char = c; | 2013 data->prev_char = c; |
2005 } | 2014 } |
2006 | 2015 |
2007 { | 2016 { |
2061 SET_DET_RESULTS (st, utf_16, DET_NEARLY_IMPOSSIBLE); | 2070 SET_DET_RESULTS (st, utf_16, DET_NEARLY_IMPOSSIBLE); |
2062 DET_RESULT (st, utf_16) = DET_SOMEWHAT_UNLIKELY; | 2071 DET_RESULT (st, utf_16) = DET_SOMEWHAT_UNLIKELY; |
2063 DET_RESULT (st, utf_16_little_endian) = DET_SOMEWHAT_LIKELY; | 2072 DET_RESULT (st, utf_16_little_endian) = DET_SOMEWHAT_LIKELY; |
2064 } | 2073 } |
2065 else | 2074 else |
2066 SET_DET_RESULTS (st, utf_16, DET_AS_LIKELY_AS_UNLIKELY); | 2075 { |
2076 /* #### FUCKME! There should really be an ASCII detector. This | |
2077 would rule out the need to have this built-in here as | |
2078 well. --ben */ | |
2079 int pct_ascii = ((100 * data->num_ascii) / data->byteno); | |
2080 | |
2081 if (pct_ascii > 90) | |
2082 SET_DET_RESULTS (st, utf_16, DET_QUITE_IMPROBABLE); | |
2083 else if (pct_ascii > 75) | |
2084 SET_DET_RESULTS (st, utf_16, DET_SOMEWHAT_UNLIKELY); | |
2085 else | |
2086 SET_DET_RESULTS (st, utf_16, DET_AS_LIKELY_AS_UNLIKELY); | |
2087 } | |
2067 } | 2088 } |
2068 } | 2089 } |
2069 | 2090 |
2070 struct utf_8_detector | 2091 struct utf_8_detector |
2071 { | 2092 { |
2072 int byteno; | 2093 int byteno; |
2073 int first_byte; | 2094 int first_byte; |
2074 int second_byte; | 2095 int second_byte; |
2096 int prev_byte; | |
2075 int in_utf_8_byte; | 2097 int in_utf_8_byte; |
2098 int recent_utf_8_sequence; | |
2099 int seen_bogus_utf8; | |
2100 int seen_really_bogus_utf8; | |
2101 int seen_2byte_sequence; | |
2102 int seen_longer_sequence; | |
2103 int seen_iso2022_esc; | |
2104 int seen_iso_shift; | |
2105 int seen_utf_bom:1; | |
2076 }; | 2106 }; |
2077 | 2107 |
2078 static void | 2108 static void |
2079 utf_8_detect (struct detection_state *st, const UExtbyte *src, | 2109 utf_8_detect (struct detection_state *st, const UExtbyte *src, |
2080 Bytecount n) | 2110 Bytecount n) |
2094 break; | 2124 break; |
2095 case 2: | 2125 case 2: |
2096 if (data->first_byte == 0xef && | 2126 if (data->first_byte == 0xef && |
2097 data->second_byte == 0xbb && | 2127 data->second_byte == 0xbb && |
2098 c == 0xbf) | 2128 c == 0xbf) |
2099 { | 2129 data->seen_utf_bom = 1; |
2100 SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE); | |
2101 DET_RESULT (st, utf_8_bom) = DET_NEAR_CERTAINTY; | |
2102 return; | |
2103 } | |
2104 break; | 2130 break; |
2105 } | 2131 } |
2106 | 2132 |
2107 switch (data->in_utf_8_byte) | 2133 switch (data->in_utf_8_byte) |
2108 { | 2134 { |
2109 case 0: | 2135 case 0: |
2110 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) | 2136 if (data->prev_byte == ISO_CODE_ESC && c >= 0x28 && c <= 0x2F) |
2111 { | 2137 data->seen_iso2022_esc++; |
2112 SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE); | 2138 else if (c == ISO_CODE_SI || c == ISO_CODE_SO) |
2113 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; | 2139 data->seen_iso_shift++; |
2114 return; | |
2115 } | |
2116 else if (c >= 0xfc) | 2140 else if (c >= 0xfc) |
2117 data->in_utf_8_byte = 5; | 2141 data->in_utf_8_byte = 5; |
2118 else if (c >= 0xf8) | 2142 else if (c >= 0xf8) |
2119 data->in_utf_8_byte = 4; | 2143 data->in_utf_8_byte = 4; |
2120 else if (c >= 0xf0) | 2144 else if (c >= 0xf0) |
2122 else if (c >= 0xe0) | 2146 else if (c >= 0xe0) |
2123 data->in_utf_8_byte = 2; | 2147 data->in_utf_8_byte = 2; |
2124 else if (c >= 0xc0) | 2148 else if (c >= 0xc0) |
2125 data->in_utf_8_byte = 1; | 2149 data->in_utf_8_byte = 1; |
2126 else if (c >= 0x80) | 2150 else if (c >= 0x80) |
2127 { | 2151 data->seen_bogus_utf8++; |
2128 SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE); | 2152 if (data->in_utf_8_byte > 0) |
2129 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; | 2153 data->recent_utf_8_sequence = data->in_utf_8_byte; |
2130 return; | |
2131 } | |
2132 break; | 2154 break; |
2133 default: | 2155 default: |
2134 if ((c & 0xc0) != 0x80) | 2156 if ((c & 0xc0) != 0x80) |
2157 data->seen_really_bogus_utf8++; | |
2158 else | |
2135 { | 2159 { |
2136 SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE); | 2160 data->in_utf_8_byte--; |
2137 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; | 2161 if (data->in_utf_8_byte == 0) |
2138 return; | 2162 { |
2163 if (data->recent_utf_8_sequence == 1) | |
2164 data->seen_2byte_sequence++; | |
2165 else | |
2166 { | |
2167 assert (data->recent_utf_8_sequence >= 2); | |
2168 data->seen_longer_sequence++; | |
2169 } | |
2170 } | |
2139 } | 2171 } |
2140 else | |
2141 data->in_utf_8_byte--; | |
2142 } | 2172 } |
2143 | 2173 |
2144 data->byteno++; | 2174 data->byteno++; |
2145 } | 2175 data->prev_byte = c; |
2146 SET_DET_RESULTS (st, utf_8, DET_SOMEWHAT_LIKELY); | 2176 } |
2177 | |
2178 /* either BOM or no BOM, but not both */ | |
2179 SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE); | |
2180 | |
2181 | |
2182 if (data->seen_utf_bom) | |
2183 DET_RESULT (st, utf_8_bom) = DET_NEAR_CERTAINTY; | |
2184 else | |
2185 { | |
2186 if (data->seen_really_bogus_utf8 || | |
2187 data->seen_bogus_utf8 >= 2) | |
2188 ; /* bogus */ | |
2189 else if (data->seen_bogus_utf8) | |
2190 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; | |
2191 else if ((data->seen_longer_sequence >= 5 || | |
2192 data->seen_2byte_sequence >= 10) && | |
2193 (!(data->seen_iso2022_esc + data->seen_iso_shift) || | |
2194 (data->seen_longer_sequence * 2 + data->seen_2byte_sequence) / | |
2195 (data->seen_iso2022_esc + data->seen_iso_shift) >= 10)) | |
2196 /* heuristics, heuristics, we love heuristics */ | |
2197 DET_RESULT (st, utf_8) = DET_QUITE_PROBABLE; | |
2198 else if (data->seen_iso2022_esc || | |
2199 data->seen_iso_shift >= 3) | |
2200 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; | |
2201 else if (data->seen_longer_sequence || | |
2202 data->seen_2byte_sequence) | |
2203 DET_RESULT (st, utf_8) = DET_SOMEWHAT_LIKELY; | |
2204 else if (data->seen_iso_shift) | |
2205 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; | |
2206 else | |
2207 DET_RESULT (st, utf_8) = DET_AS_LIKELY_AS_UNLIKELY; | |
2208 } | |
2147 } | 2209 } |
2148 | 2210 |
2149 static void | 2211 static void |
2150 unicode_init_coding_stream (struct coding_stream *str) | 2212 unicode_init_coding_stream (struct coding_stream *str) |
2151 { | 2213 { |