Mercurial > hg > xemacs-beta
comparison src/unicode.c @ 985:7f62a956b825
[xemacs-hg @ 2002-09-01 06:41:40 by youngs]
2002-08-31 Seiichi Ikiuo <ikiuo@polyphony.co.jp>
* code-init.el (reset-coding-categories-to-default): Add new
coding category 'utf-8-bom'.
* coding.el (coding-system-category): Add check for 'utf-8-bom'.
* unicode.el: Add new coding system 'utf-8-bom'.
2002-08-31 Seiichi Ikiuo <ikiuo@polyphony.co.jp>
* file-coding.h (detection_result): fix DET_LOWEST.
* mule-coding.c
(big5_detector): Add member 'seen_euc_char'.
(big5_detect): Changed so that it no longer interferes with EUC detection.
(iso2022_detector): Add member 'longest_even_high_byte'.
(iso2022_detect): Fix checking for even/odd_high_byte_group.
The criteria for detecting EUC are changed.
* unicode.c: Add DETECTOR_CATEGORY utf_8_bom.
(utf_8_detector): Add members byteno, first_byte and second_byte.
(utf_8_detect): Detection of UTF-8-BOM is added.
author | youngs |
---|---|
date | Sun, 01 Sep 2002 06:41:45 +0000 |
parents | c9f067fd71a3 |
children | e22b0213b713 |
comparison
equal
deleted
inserted
replaced
984:c55a519aa13f | 985:7f62a956b825 |
---|---|
156 Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7; | 156 Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7; |
157 Lisp_Object Qneed_bom; | 157 Lisp_Object Qneed_bom; |
158 | 158 |
159 Lisp_Object Qutf_16_little_endian, Qutf_16_bom; | 159 Lisp_Object Qutf_16_little_endian, Qutf_16_bom; |
160 Lisp_Object Qutf_16_little_endian_bom; | 160 Lisp_Object Qutf_16_little_endian_bom; |
161 | |
162 Lisp_Object Qutf_8_bom; | |
161 | 163 |
162 #ifdef MULE | 164 #ifdef MULE |
163 | 165 |
164 /* #### Using ints for to_unicode is OK (as long as they are >= 32 bits). | 166 /* #### Using ints for to_unicode is OK (as long as they are >= 32 bits). |
165 However, shouldn't the shorts below be unsigned? */ | 167 However, shouldn't the shorts below be unsigned? */ |
1922 } | 1924 } |
1923 | 1925 |
1924 /* DEFINE_DETECTOR (utf_7); */ | 1926 /* DEFINE_DETECTOR (utf_7); */ |
1925 DEFINE_DETECTOR (utf_8); | 1927 DEFINE_DETECTOR (utf_8); |
1926 DEFINE_DETECTOR_CATEGORY (utf_8, utf_8); | 1928 DEFINE_DETECTOR_CATEGORY (utf_8, utf_8); |
1929 DEFINE_DETECTOR_CATEGORY (utf_8, utf_8_bom); | |
1927 DEFINE_DETECTOR (ucs_4); | 1930 DEFINE_DETECTOR (ucs_4); |
1928 DEFINE_DETECTOR_CATEGORY (ucs_4, ucs_4); | 1931 DEFINE_DETECTOR_CATEGORY (ucs_4, ucs_4); |
1929 DEFINE_DETECTOR (utf_16); | 1932 DEFINE_DETECTOR (utf_16); |
1930 DEFINE_DETECTOR_CATEGORY (utf_16, utf_16); | 1933 DEFINE_DETECTOR_CATEGORY (utf_16, utf_16); |
1931 DEFINE_DETECTOR_CATEGORY (utf_16, utf_16_little_endian); | 1934 DEFINE_DETECTOR_CATEGORY (utf_16, utf_16_little_endian); |
2079 } | 2082 } |
2080 } | 2083 } |
2081 | 2084 |
2082 struct utf_8_detector | 2085 struct utf_8_detector |
2083 { | 2086 { |
2087 int byteno; | |
2088 int first_byte; | |
2089 int second_byte; | |
2084 int in_utf_8_byte; | 2090 int in_utf_8_byte; |
2085 }; | 2091 }; |
2086 | 2092 |
2087 static void | 2093 static void |
2088 utf_8_detect (struct detection_state *st, const UExtbyte *src, | 2094 utf_8_detect (struct detection_state *st, const UExtbyte *src, |
2091 struct utf_8_detector *data = DETECTION_STATE_DATA (st, utf_8); | 2097 struct utf_8_detector *data = DETECTION_STATE_DATA (st, utf_8); |
2092 | 2098 |
2093 while (n--) | 2099 while (n--) |
2094 { | 2100 { |
2095 UExtbyte c = *src++; | 2101 UExtbyte c = *src++; |
2102 switch (data->byteno) | |
2103 { | |
2104 case 0: | |
2105 data->first_byte = c; | |
2106 break; | |
2107 case 1: | |
2108 data->second_byte = c; | |
2109 break; | |
2110 case 2: | |
2111 if (data->first_byte == 0xef && | |
2112 data->second_byte == 0xbb && | |
2113 c == 0xbf) | |
2114 { | |
2115 SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE); | |
2116 DET_RESULT (st, utf_8_bom) = DET_NEAR_CERTAINTY; | |
2117 return; | |
2118 } | |
2119 break; | |
2120 } | |
2121 | |
2096 switch (data->in_utf_8_byte) | 2122 switch (data->in_utf_8_byte) |
2097 { | 2123 { |
2098 case 0: | 2124 case 0: |
2099 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) | 2125 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) |
2100 { | 2126 { |
2127 SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE); | |
2101 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; | 2128 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; |
2102 return; | 2129 return; |
2103 } | 2130 } |
2104 else if (c >= 0xfc) | 2131 else if (c >= 0xfc) |
2105 data->in_utf_8_byte = 5; | 2132 data->in_utf_8_byte = 5; |
2111 data->in_utf_8_byte = 2; | 2138 data->in_utf_8_byte = 2; |
2112 else if (c >= 0xc0) | 2139 else if (c >= 0xc0) |
2113 data->in_utf_8_byte = 1; | 2140 data->in_utf_8_byte = 1; |
2114 else if (c >= 0x80) | 2141 else if (c >= 0x80) |
2115 { | 2142 { |
2143 SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE); | |
2116 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; | 2144 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; |
2117 return; | 2145 return; |
2118 } | 2146 } |
2119 break; | 2147 break; |
2120 default: | 2148 default: |
2121 if ((c & 0xc0) != 0x80) | 2149 if ((c & 0xc0) != 0x80) |
2122 { | 2150 { |
2151 SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE); | |
2123 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; | 2152 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; |
2124 return; | 2153 return; |
2125 } | 2154 } |
2126 else | 2155 else |
2127 data->in_utf_8_byte--; | 2156 data->in_utf_8_byte--; |
2128 } | 2157 } |
2129 } | 2158 |
2130 DET_RESULT (st, utf_8) = DET_SOMEWHAT_LIKELY; | 2159 data->byteno++; |
2160 } | |
2161 SET_DET_RESULTS (st, utf_8, DET_SOMEWHAT_LIKELY); | |
2131 } | 2162 } |
2132 | 2163 |
2133 static void | 2164 static void |
2134 unicode_init_coding_stream (struct coding_stream *str) | 2165 unicode_init_coding_stream (struct coding_stream *str) |
2135 { | 2166 { |
2254 | 2285 |
2255 DEFSYMBOL (Qutf_16); | 2286 DEFSYMBOL (Qutf_16); |
2256 DEFSYMBOL (Qutf_16_little_endian); | 2287 DEFSYMBOL (Qutf_16_little_endian); |
2257 DEFSYMBOL (Qutf_16_bom); | 2288 DEFSYMBOL (Qutf_16_bom); |
2258 DEFSYMBOL (Qutf_16_little_endian_bom); | 2289 DEFSYMBOL (Qutf_16_little_endian_bom); |
2290 | |
2291 DEFSYMBOL (Qutf_8); | |
2292 DEFSYMBOL (Qutf_8_bom); | |
2259 } | 2293 } |
2260 | 2294 |
2261 void | 2295 void |
2262 coding_system_type_create_unicode (void) | 2296 coding_system_type_create_unicode (void) |
2263 { | 2297 { |
2270 CODING_SYSTEM_HAS_METHOD (unicode, getprop); | 2304 CODING_SYSTEM_HAS_METHOD (unicode, getprop); |
2271 | 2305 |
2272 INITIALIZE_DETECTOR (utf_8); | 2306 INITIALIZE_DETECTOR (utf_8); |
2273 DETECTOR_HAS_METHOD (utf_8, detect); | 2307 DETECTOR_HAS_METHOD (utf_8, detect); |
2274 INITIALIZE_DETECTOR_CATEGORY (utf_8, utf_8); | 2308 INITIALIZE_DETECTOR_CATEGORY (utf_8, utf_8); |
2309 INITIALIZE_DETECTOR_CATEGORY (utf_8, utf_8_bom); | |
2275 | 2310 |
2276 INITIALIZE_DETECTOR (ucs_4); | 2311 INITIALIZE_DETECTOR (ucs_4); |
2277 DETECTOR_HAS_METHOD (ucs_4, detect); | 2312 DETECTOR_HAS_METHOD (ucs_4, detect); |
2278 INITIALIZE_DETECTOR_CATEGORY (ucs_4, ucs_4); | 2313 INITIALIZE_DETECTOR_CATEGORY (ucs_4, ucs_4); |
2279 | 2314 |