Mercurial > hg > xemacs-beta
diff src/unicode.c @ 985:7f62a956b825
[xemacs-hg @ 2002-09-01 06:41:40 by youngs]
2002-08-31 Seiichi Ikiuo <ikiuo@polyphony.co.jp>
* code-init.el (reset-coding-categories-to-default): Add new
coding category 'utf-8-bom'.
* coding.el (coding-system-category): Add check for 'utf-8-bom'.
* unicode.el: Add new coding system 'utf-8-bom'.
2002-08-31 Seiichi Ikiuo <ikiuo@polyphony.co.jp>
* file-coding.h (detection_result): fix DET_LOWEST.
* mule-coding.c
(big5_detector): Add member 'seen_euc_char'.
(big5_detect): Changed so that it no longer interferes with EUC detection.
(iso2022_detector): Add member 'longest_even_high_byte'.
(iso2022_detect): Fix checking for even/odd_high_byte_group.
The criteria for judging EUC have been changed.
* unicode.c: Add DETECTOR_CATEGORY utf_8_bom.
(utf_8_detector): Add member byteno, first_byte and second_byte.
(utf_8_detect): Add detection of UTF-8-BOM.
author | youngs |
---|---|
date | Sun, 01 Sep 2002 06:41:45 +0000 |
parents | c9f067fd71a3 |
children | e22b0213b713 |
line wrap: on
line diff
--- a/src/unicode.c Sat Aug 31 11:04:01 2002 +0000 +++ b/src/unicode.c Sun Sep 01 06:41:45 2002 +0000 @@ -159,6 +159,8 @@ Lisp_Object Qutf_16_little_endian, Qutf_16_bom; Lisp_Object Qutf_16_little_endian_bom; +Lisp_Object Qutf_8_bom; + #ifdef MULE /* #### Using ints for to_unicode is OK (as long as they are >= 32 bits). @@ -1924,6 +1926,7 @@ /* DEFINE_DETECTOR (utf_7); */ DEFINE_DETECTOR (utf_8); DEFINE_DETECTOR_CATEGORY (utf_8, utf_8); +DEFINE_DETECTOR_CATEGORY (utf_8, utf_8_bom); DEFINE_DETECTOR (ucs_4); DEFINE_DETECTOR_CATEGORY (ucs_4, ucs_4); DEFINE_DETECTOR (utf_16); @@ -2081,6 +2084,9 @@ struct utf_8_detector { + int byteno; + int first_byte; + int second_byte; int in_utf_8_byte; }; @@ -2093,11 +2099,32 @@ while (n--) { UExtbyte c = *src++; + switch (data->byteno) + { + case 0: + data->first_byte = c; + break; + case 1: + data->second_byte = c; + break; + case 2: + if (data->first_byte == 0xef && + data->second_byte == 0xbb && + c == 0xbf) + { + SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE); + DET_RESULT (st, utf_8_bom) = DET_NEAR_CERTAINTY; + return; + } + break; + } + switch (data->in_utf_8_byte) { case 0: if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) { + SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE); DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; return; } @@ -2113,6 +2140,7 @@ data->in_utf_8_byte = 1; else if (c >= 0x80) { + SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE); DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; return; } @@ -2120,14 +2148,17 @@ default: if ((c & 0xc0) != 0x80) { + SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE); DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; return; } else data->in_utf_8_byte--; } + + data->byteno++; } - DET_RESULT (st, utf_8) = DET_SOMEWHAT_LIKELY; + SET_DET_RESULTS (st, utf_8, DET_SOMEWHAT_LIKELY); } static void @@ -2256,6 +2287,9 @@ DEFSYMBOL (Qutf_16_little_endian); DEFSYMBOL (Qutf_16_bom); DEFSYMBOL (Qutf_16_little_endian_bom); + + DEFSYMBOL (Qutf_8); + DEFSYMBOL 
(Qutf_8_bom); } void @@ -2272,6 +2306,7 @@ INITIALIZE_DETECTOR (utf_8); DETECTOR_HAS_METHOD (utf_8, detect); INITIALIZE_DETECTOR_CATEGORY (utf_8, utf_8); + INITIALIZE_DETECTOR_CATEGORY (utf_8, utf_8_bom); INITIALIZE_DETECTOR (ucs_4); DETECTOR_HAS_METHOD (ucs_4, detect);