comparison src/unicode.c @ 1267:c57f32e44416

[xemacs-hg @ 2003-02-07 01:43:05 by ben] Fix Unicode/UTF misdetection. unicode.c: The UTF-8 detector was totally bogus and detected almost anything as UTF-8. The UTF-16 detector was overly eager to treat 0x2028 as evidence of text (it is LINE SEPARATOR, but the same two bytes are also space + left-paren in ASCII, which is extremely common).
author ben
date Fri, 07 Feb 2003 01:43:07 +0000
parents e22b0213b713
children f3437b56874d
comparison
equal deleted inserted replaced
1266:b5a5863da615 1267:c57f32e44416
1 /* Code to handle Unicode conversion. 1 /* Code to handle Unicode conversion.
2 Copyright (C) 2000, 2001, 2002 Ben Wing. 2 Copyright (C) 2000, 2001, 2002, 2003 Ben Wing.
3 3
4 This file is part of XEmacs. 4 This file is part of XEmacs.
5 5
6 XEmacs is free software; you can redistribute it and/or modify it 6 XEmacs is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the 7 under the terms of the GNU General Public License as published by the
1963 unsigned int seen_forward_bom:1; 1963 unsigned int seen_forward_bom:1;
1964 unsigned int seen_rev_bom:1; 1964 unsigned int seen_rev_bom:1;
1965 int byteno; 1965 int byteno;
1966 int prev_char; 1966 int prev_char;
1967 int text, rev_text; 1967 int text, rev_text;
1968 int sep, rev_sep;
1969 int num_ascii;
1968 }; 1970 };
1969 1971
1970 static void 1972 static void
1971 utf_16_detect (struct detection_state *st, const UExtbyte *src, 1973 utf_16_detect (struct detection_state *st, const UExtbyte *src,
1972 Bytecount n) 1974 Bytecount n)
1992 data->text++; 1994 data->text++;
1993 if (c == 0 1995 if (c == 0
1994 && (prevc == '\r' || prevc == '\n' 1996 && (prevc == '\r' || prevc == '\n'
1995 || (prevc >= 0x20 && prevc <= 0x7E))) 1997 || (prevc >= 0x20 && prevc <= 0x7E)))
1996 data->rev_text++; 1998 data->rev_text++;
1999 /* #### 0x2028 is LINE SEPARATOR and 0x2029 is PARAGRAPH SEPARATOR.
2000 I used to count these in text and rev_text but that is very bad,
2001 as 0x2028 is also space + left-paren in ASCII, which is extremely
2002 common. So, what do we do with these? */
1997 if (prevc == 0x20 && (c == 0x28 || c == 0x29)) 2003 if (prevc == 0x20 && (c == 0x28 || c == 0x29))
1998 data->text++; 2004 data->sep++;
1999 if (c == 0x20 && (prevc == 0x28 || prevc == 0x29)) 2005 if (c == 0x20 && (prevc == 0x28 || prevc == 0x29))
2000 data->rev_text++; 2006 data->rev_sep++;
2001 } 2007 }
2002 2008
2009 if ((c >= ' ' && c <= '~') || c == '\n' || c == '\r' || c == '\t' ||
2010 c == '\f' || c == '\v')
2011 data->num_ascii++;
2003 data->byteno++; 2012 data->byteno++;
2004 data->prev_char = c; 2013 data->prev_char = c;
2005 } 2014 }
2006 2015
2007 { 2016 {
2061 SET_DET_RESULTS (st, utf_16, DET_NEARLY_IMPOSSIBLE); 2070 SET_DET_RESULTS (st, utf_16, DET_NEARLY_IMPOSSIBLE);
2062 DET_RESULT (st, utf_16) = DET_SOMEWHAT_UNLIKELY; 2071 DET_RESULT (st, utf_16) = DET_SOMEWHAT_UNLIKELY;
2063 DET_RESULT (st, utf_16_little_endian) = DET_SOMEWHAT_LIKELY; 2072 DET_RESULT (st, utf_16_little_endian) = DET_SOMEWHAT_LIKELY;
2064 } 2073 }
2065 else 2074 else
2066 SET_DET_RESULTS (st, utf_16, DET_AS_LIKELY_AS_UNLIKELY); 2075 {
2076 /* #### FUCKME! There should really be an ASCII detector. This
2077 would rule out the need to have this built-in here as
2078 well. --ben */
2079 int pct_ascii = ((100 * data->num_ascii) / data->byteno);
2080
2081 if (pct_ascii > 90)
2082 SET_DET_RESULTS (st, utf_16, DET_QUITE_IMPROBABLE);
2083 else if (pct_ascii > 75)
2084 SET_DET_RESULTS (st, utf_16, DET_SOMEWHAT_UNLIKELY);
2085 else
2086 SET_DET_RESULTS (st, utf_16, DET_AS_LIKELY_AS_UNLIKELY);
2087 }
2067 } 2088 }
2068 } 2089 }
2069 2090
2070 struct utf_8_detector 2091 struct utf_8_detector
2071 { 2092 {
2072 int byteno; 2093 int byteno;
2073 int first_byte; 2094 int first_byte;
2074 int second_byte; 2095 int second_byte;
2096 int prev_byte;
2075 int in_utf_8_byte; 2097 int in_utf_8_byte;
2098 int recent_utf_8_sequence;
2099 int seen_bogus_utf8;
2100 int seen_really_bogus_utf8;
2101 int seen_2byte_sequence;
2102 int seen_longer_sequence;
2103 int seen_iso2022_esc;
2104 int seen_iso_shift;
2105 int seen_utf_bom:1;
2076 }; 2106 };
2077 2107
2078 static void 2108 static void
2079 utf_8_detect (struct detection_state *st, const UExtbyte *src, 2109 utf_8_detect (struct detection_state *st, const UExtbyte *src,
2080 Bytecount n) 2110 Bytecount n)
2094 break; 2124 break;
2095 case 2: 2125 case 2:
2096 if (data->first_byte == 0xef && 2126 if (data->first_byte == 0xef &&
2097 data->second_byte == 0xbb && 2127 data->second_byte == 0xbb &&
2098 c == 0xbf) 2128 c == 0xbf)
2099 { 2129 data->seen_utf_bom = 1;
2100 SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE);
2101 DET_RESULT (st, utf_8_bom) = DET_NEAR_CERTAINTY;
2102 return;
2103 }
2104 break; 2130 break;
2105 } 2131 }
2106 2132
2107 switch (data->in_utf_8_byte) 2133 switch (data->in_utf_8_byte)
2108 { 2134 {
2109 case 0: 2135 case 0:
2110 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) 2136 if (data->prev_byte == ISO_CODE_ESC && c >= 0x28 && c <= 0x2F)
2111 { 2137 data->seen_iso2022_esc++;
2112 SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE); 2138 else if (c == ISO_CODE_SI || c == ISO_CODE_SO)
2113 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; 2139 data->seen_iso_shift++;
2114 return;
2115 }
2116 else if (c >= 0xfc) 2140 else if (c >= 0xfc)
2117 data->in_utf_8_byte = 5; 2141 data->in_utf_8_byte = 5;
2118 else if (c >= 0xf8) 2142 else if (c >= 0xf8)
2119 data->in_utf_8_byte = 4; 2143 data->in_utf_8_byte = 4;
2120 else if (c >= 0xf0) 2144 else if (c >= 0xf0)
2122 else if (c >= 0xe0) 2146 else if (c >= 0xe0)
2123 data->in_utf_8_byte = 2; 2147 data->in_utf_8_byte = 2;
2124 else if (c >= 0xc0) 2148 else if (c >= 0xc0)
2125 data->in_utf_8_byte = 1; 2149 data->in_utf_8_byte = 1;
2126 else if (c >= 0x80) 2150 else if (c >= 0x80)
2127 { 2151 data->seen_bogus_utf8++;
2128 SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE); 2152 if (data->in_utf_8_byte > 0)
2129 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; 2153 data->recent_utf_8_sequence = data->in_utf_8_byte;
2130 return;
2131 }
2132 break; 2154 break;
2133 default: 2155 default:
2134 if ((c & 0xc0) != 0x80) 2156 if ((c & 0xc0) != 0x80)
2157 data->seen_really_bogus_utf8++;
2158 else
2135 { 2159 {
2136 SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE); 2160 data->in_utf_8_byte--;
2137 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; 2161 if (data->in_utf_8_byte == 0)
2138 return; 2162 {
2163 if (data->recent_utf_8_sequence == 1)
2164 data->seen_2byte_sequence++;
2165 else
2166 {
2167 assert (data->recent_utf_8_sequence >= 2);
2168 data->seen_longer_sequence++;
2169 }
2170 }
2139 } 2171 }
2140 else
2141 data->in_utf_8_byte--;
2142 } 2172 }
2143 2173
2144 data->byteno++; 2174 data->byteno++;
2145 } 2175 data->prev_byte = c;
2146 SET_DET_RESULTS (st, utf_8, DET_SOMEWHAT_LIKELY); 2176 }
2177
2178 /* either BOM or no BOM, but not both */
2179 SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE);
2180
2181
2182 if (data->seen_utf_bom)
2183 DET_RESULT (st, utf_8_bom) = DET_NEAR_CERTAINTY;
2184 else
2185 {
2186 if (data->seen_really_bogus_utf8 ||
2187 data->seen_bogus_utf8 >= 2)
2188 ; /* bogus */
2189 else if (data->seen_bogus_utf8)
2190 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY;
2191 else if ((data->seen_longer_sequence >= 5 ||
2192 data->seen_2byte_sequence >= 10) &&
2193 (!(data->seen_iso2022_esc + data->seen_iso_shift) ||
2194 (data->seen_longer_sequence * 2 + data->seen_2byte_sequence) /
2195 (data->seen_iso2022_esc + data->seen_iso_shift) >= 10))
2196 /* heuristics, heuristics, we love heuristics */
2197 DET_RESULT (st, utf_8) = DET_QUITE_PROBABLE;
2198 else if (data->seen_iso2022_esc ||
2199 data->seen_iso_shift >= 3)
2200 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY;
2201 else if (data->seen_longer_sequence ||
2202 data->seen_2byte_sequence)
2203 DET_RESULT (st, utf_8) = DET_SOMEWHAT_LIKELY;
2204 else if (data->seen_iso_shift)
2205 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY;
2206 else
2207 DET_RESULT (st, utf_8) = DET_AS_LIKELY_AS_UNLIKELY;
2208 }
2147 } 2209 }
2148 2210
2149 static void 2211 static void
2150 unicode_init_coding_stream (struct coding_stream *str) 2212 unicode_init_coding_stream (struct coding_stream *str)
2151 { 2213 {