comparison src/unicode.c @ 985:7f62a956b825

[xemacs-hg @ 2002-09-01 06:41:40 by youngs] 2002-08-31 Seiichi Ikiuo <ikiuo@polyphony.co.jp> * code-init.el (reset-coding-categories-to-default): Add new coding category 'utf-8-bom'. * coding.el (coding-system-category): Add check for 'utf-8-bom'. * unicode.el: Add new coding system 'utf-8-bom'. 2002-08-31 Seiichi Ikiuo <ikiuo@polyphony.co.jp> * file-coding.h (detection_result): Fix DET_LOWEST. * mule-coding.c (big5_detector): Add member 'seen_euc_char'. (big5_detect): Changed so that it no longer causes trouble for EUC detection. (iso2022_detector): Add member 'longest_even_high_byte'. (iso2022_detect): Fix checking for even/odd_high_byte_group. Change how EUC is judged. * unicode.c: Add DETECTOR_CATEGORY utf_8_bom. (utf_8_detector): Add members byteno, first_byte and second_byte. (utf_8_detect): Add detection of UTF-8-BOM.
author youngs
date Sun, 01 Sep 2002 06:41:45 +0000
parents c9f067fd71a3
children e22b0213b713
comparison
equal deleted inserted replaced
984:c55a519aa13f 985:7f62a956b825
156 Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7; 156 Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7;
157 Lisp_Object Qneed_bom; 157 Lisp_Object Qneed_bom;
158 158
159 Lisp_Object Qutf_16_little_endian, Qutf_16_bom; 159 Lisp_Object Qutf_16_little_endian, Qutf_16_bom;
160 Lisp_Object Qutf_16_little_endian_bom; 160 Lisp_Object Qutf_16_little_endian_bom;
161
162 Lisp_Object Qutf_8_bom;
161 163
162 #ifdef MULE 164 #ifdef MULE
163 165
164 /* #### Using ints for to_unicode is OK (as long as they are >= 32 bits). 166 /* #### Using ints for to_unicode is OK (as long as they are >= 32 bits).
165 However, shouldn't the shorts below be unsigned? */ 167 However, shouldn't the shorts below be unsigned? */
1922 } 1924 }
1923 1925
1924 /* DEFINE_DETECTOR (utf_7); */ 1926 /* DEFINE_DETECTOR (utf_7); */
1925 DEFINE_DETECTOR (utf_8); 1927 DEFINE_DETECTOR (utf_8);
1926 DEFINE_DETECTOR_CATEGORY (utf_8, utf_8); 1928 DEFINE_DETECTOR_CATEGORY (utf_8, utf_8);
1929 DEFINE_DETECTOR_CATEGORY (utf_8, utf_8_bom);
1927 DEFINE_DETECTOR (ucs_4); 1930 DEFINE_DETECTOR (ucs_4);
1928 DEFINE_DETECTOR_CATEGORY (ucs_4, ucs_4); 1931 DEFINE_DETECTOR_CATEGORY (ucs_4, ucs_4);
1929 DEFINE_DETECTOR (utf_16); 1932 DEFINE_DETECTOR (utf_16);
1930 DEFINE_DETECTOR_CATEGORY (utf_16, utf_16); 1933 DEFINE_DETECTOR_CATEGORY (utf_16, utf_16);
1931 DEFINE_DETECTOR_CATEGORY (utf_16, utf_16_little_endian); 1934 DEFINE_DETECTOR_CATEGORY (utf_16, utf_16_little_endian);
2079 } 2082 }
2080 } 2083 }
2081 2084
2082 struct utf_8_detector 2085 struct utf_8_detector
2083 { 2086 {
2087 int byteno;
2088 int first_byte;
2089 int second_byte;
2084 int in_utf_8_byte; 2090 int in_utf_8_byte;
2085 }; 2091 };
2086 2092
2087 static void 2093 static void
2088 utf_8_detect (struct detection_state *st, const UExtbyte *src, 2094 utf_8_detect (struct detection_state *st, const UExtbyte *src,
2091 struct utf_8_detector *data = DETECTION_STATE_DATA (st, utf_8); 2097 struct utf_8_detector *data = DETECTION_STATE_DATA (st, utf_8);
2092 2098
2093 while (n--) 2099 while (n--)
2094 { 2100 {
2095 UExtbyte c = *src++; 2101 UExtbyte c = *src++;
2102 switch (data->byteno)
2103 {
2104 case 0:
2105 data->first_byte = c;
2106 break;
2107 case 1:
2108 data->second_byte = c;
2109 break;
2110 case 2:
2111 if (data->first_byte == 0xef &&
2112 data->second_byte == 0xbb &&
2113 c == 0xbf)
2114 {
2115 SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE);
2116 DET_RESULT (st, utf_8_bom) = DET_NEAR_CERTAINTY;
2117 return;
2118 }
2119 break;
2120 }
2121
2096 switch (data->in_utf_8_byte) 2122 switch (data->in_utf_8_byte)
2097 { 2123 {
2098 case 0: 2124 case 0:
2099 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) 2125 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2100 { 2126 {
2127 SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE);
2101 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; 2128 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY;
2102 return; 2129 return;
2103 } 2130 }
2104 else if (c >= 0xfc) 2131 else if (c >= 0xfc)
2105 data->in_utf_8_byte = 5; 2132 data->in_utf_8_byte = 5;
2111 data->in_utf_8_byte = 2; 2138 data->in_utf_8_byte = 2;
2112 else if (c >= 0xc0) 2139 else if (c >= 0xc0)
2113 data->in_utf_8_byte = 1; 2140 data->in_utf_8_byte = 1;
2114 else if (c >= 0x80) 2141 else if (c >= 0x80)
2115 { 2142 {
2143 SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE);
2116 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; 2144 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY;
2117 return; 2145 return;
2118 } 2146 }
2119 break; 2147 break;
2120 default: 2148 default:
2121 if ((c & 0xc0) != 0x80) 2149 if ((c & 0xc0) != 0x80)
2122 { 2150 {
2151 SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE);
2123 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; 2152 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY;
2124 return; 2153 return;
2125 } 2154 }
2126 else 2155 else
2127 data->in_utf_8_byte--; 2156 data->in_utf_8_byte--;
2128 } 2157 }
2129 } 2158
2130 DET_RESULT (st, utf_8) = DET_SOMEWHAT_LIKELY; 2159 data->byteno++;
2160 }
2161 SET_DET_RESULTS (st, utf_8, DET_SOMEWHAT_LIKELY);
2131 } 2162 }
2132 2163
2133 static void 2164 static void
2134 unicode_init_coding_stream (struct coding_stream *str) 2165 unicode_init_coding_stream (struct coding_stream *str)
2135 { 2166 {
2254 2285
2255 DEFSYMBOL (Qutf_16); 2286 DEFSYMBOL (Qutf_16);
2256 DEFSYMBOL (Qutf_16_little_endian); 2287 DEFSYMBOL (Qutf_16_little_endian);
2257 DEFSYMBOL (Qutf_16_bom); 2288 DEFSYMBOL (Qutf_16_bom);
2258 DEFSYMBOL (Qutf_16_little_endian_bom); 2289 DEFSYMBOL (Qutf_16_little_endian_bom);
2290
2291 DEFSYMBOL (Qutf_8);
2292 DEFSYMBOL (Qutf_8_bom);
2259 } 2293 }
2260 2294
2261 void 2295 void
2262 coding_system_type_create_unicode (void) 2296 coding_system_type_create_unicode (void)
2263 { 2297 {
2270 CODING_SYSTEM_HAS_METHOD (unicode, getprop); 2304 CODING_SYSTEM_HAS_METHOD (unicode, getprop);
2271 2305
2272 INITIALIZE_DETECTOR (utf_8); 2306 INITIALIZE_DETECTOR (utf_8);
2273 DETECTOR_HAS_METHOD (utf_8, detect); 2307 DETECTOR_HAS_METHOD (utf_8, detect);
2274 INITIALIZE_DETECTOR_CATEGORY (utf_8, utf_8); 2308 INITIALIZE_DETECTOR_CATEGORY (utf_8, utf_8);
2309 INITIALIZE_DETECTOR_CATEGORY (utf_8, utf_8_bom);
2275 2310
2276 INITIALIZE_DETECTOR (ucs_4); 2311 INITIALIZE_DETECTOR (ucs_4);
2277 DETECTOR_HAS_METHOD (ucs_4, detect); 2312 DETECTOR_HAS_METHOD (ucs_4, detect);
2278 INITIALIZE_DETECTOR_CATEGORY (ucs_4, ucs_4); 2313 INITIALIZE_DETECTOR_CATEGORY (ucs_4, ucs_4);
2279 2314