Mercurial > hg > xemacs-beta
changeset 985:7f62a956b825
[xemacs-hg @ 2002-09-01 06:41:40 by youngs]
2002-08-31 Seiichi Ikiuo <ikiuo@polyphony.co.jp>
* code-init.el (reset-coding-categories-to-default): Add new
coding category 'utf-8-bom'.
* coding.el (coding-system-category): Add check for 'utf-8-bom'.
* unicode.el: Add new coding system 'utf-8-bom'.
2002-08-31 Seiichi Ikiuo <ikiuo@polyphony.co.jp>
* file-coding.h (detection_result): fix DET_LOWEST.
* mule-coding.c
(big5_detector): Add member 'seen_euc_char'.
(big5_detect): Changed so that Big5 detection no longer interferes with the detection of EUC.
(iso2022_detector): Add member 'longest_even_high_byte'.
(iso2022_detect): Fix checking for even/odd_high_byte_group.
The criteria for judging text to be EUC are changed.
* unicode.c: Add DETECTOR_CATEGORY utf_8_bom.
(utf_8_detector): Add member byteno, first_byte and second_byte.
(utf_8_detect): Add detection of UTF-8-BOM.
author | youngs |
---|---|
date | Sun, 01 Sep 2002 06:41:45 +0000 |
parents | c55a519aa13f |
children | 9b80efded6a5 |
files | lisp/ChangeLog lisp/code-init.el lisp/coding.el lisp/unicode.el src/ChangeLog src/file-coding.h src/mule-coding.c src/unicode.c |
diffstat | 8 files changed, 115 insertions(+), 5 deletions(-) [+] |
line wrap: on
line diff
--- a/lisp/ChangeLog Sat Aug 31 11:04:01 2002 +0000 +++ b/lisp/ChangeLog Sun Sep 01 06:41:45 2002 +0000 @@ -1,3 +1,12 @@ +2002-08-31 Seiichi Ikiuo <ikiuo@polyphony.co.jp> + + * code-init.el (reset-coding-categories-to-default): Add new + coding category 'utf-8-bom'. + + * coding.el (coding-system-category): Add check for 'utf-8-bom'. + + * unicode.el: Add new coding system 'utf-8-bom'. + 2002-08-30 Steve Youngs <youngs@xemacs.org> * XEmacs 21.5.9 "brussels sprouts" is released.
--- a/lisp/code-init.el Sat Aug 31 11:04:01 2002 +0000 +++ b/lisp/code-init.el Sun Sep 01 06:41:45 2002 +0000 @@ -276,6 +276,7 @@ -------------------------------------------------- utf-16-little-endian-bom utf-16-little-endian utf-16-bom utf-16-bom + utf-8-bom utf-8-bom iso-7 iso-2022-7bit no-conversion raw-text utf-8 utf-8 @@ -324,6 +325,7 @@ (coding-system-variable-default-value 'no-conversion-coding-system-mapping)) (set-coding-category-system 'ucs-4 'ucs-4) (set-coding-category-system 'utf-8 'utf-8) + (set-coding-category-system 'utf-8-bom 'utf-8-bom) (set-coding-category-system 'utf-16-little-endian 'utf-16-little-endian) (set-coding-category-system 'utf-16 'utf-16) (set-coding-category-system 'utf-16-little-endian-bom @@ -333,6 +335,7 @@ (if (featurep 'mule) '(utf-16-little-endian-bom utf-16-bom + utf-8-bom iso-7 no-conversion utf-8 @@ -347,6 +350,7 @@ ucs-4) '(utf-16-little-endian-bom utf-16-bom + utf-8-bom no-conversion utf-8 utf-16-little-endian
--- a/lisp/coding.el Sat Aug 31 11:04:01 2002 +0000 +++ b/lisp/coding.el Sun Sep 01 06:41:45 2002 +0000 @@ -201,7 +201,10 @@ (no-conversion 'no-conversion) (shift-jis 'shift-jis) (unicode (case (coding-system-property coding-system 'type) - (utf-8 'utf-8) + (utf-8 (let ((bom (coding-system-property coding-system + 'need-bom))) + (cond (bom 'utf-8-bom) + ((not bom) 'utf-8)))) (ucs-4 'ucs-4) (utf-16 (let ((bom (coding-system-property coding-system 'need-bom))
--- a/lisp/unicode.el Sat Aug 31 11:04:01 2002 +0000 +++ b/lisp/unicode.el Sun Sep 01 06:41:45 2002 +0000 @@ -281,6 +281,17 @@ " type utf-8)) +(make-coding-system + 'utf-8-bom 'unicode + "UTF-8 w/BOM" + '(mnemonic "MSW-UTF8" + documentation + "UTF-8 Unicode encoding, with byte order mark. +Standard encoding for representing UTF-8 under MS Windows." + type utf-8 + little-endian t + need-bom t)) + ;; #### UTF-7 is not yet implemented, and it's tricky to do. There's ;; an implementation in appendix A.1 of the Unicode Standard, Version ;; 2.0, but I don't know its licensing characteristics.
--- a/src/ChangeLog Sat Aug 31 11:04:01 2002 +0000 +++ b/src/ChangeLog Sun Sep 01 06:41:45 2002 +0000 @@ -1,3 +1,18 @@ +2002-08-31 Seiichi Ikiuo <ikiuo@polyphony.co.jp> + + * file-coding.h (detection_result): fix DET_LOWEST. + + * mule-coding.c + (big5_detector): Add member 'seen_euc_char'. + (big5_detect): It was made not to become the trouble of EUC. + (iso2022_detector): Add member 'longest_even_high_byte'. + (iso2022_detect): Fix checking for even/odd_high_byte_group. + The judgment of EUC is changed. + + * unicode.c: Add DETECTOR_CATEGORY utf_8_bom. + (utf_8_detector): Add member byteno, first_byte and second_byte. + (utf_8_detect): The judgment of UTF-8-BOM is added. + 2002-08-30 Steve Youngs <youngs@xemacs.org> * XEmacs 21.5.9 "brussels sprouts" is released.
--- a/src/file-coding.h Sat Aug 31 11:04:01 2002 +0000 +++ b/src/file-coding.h Sun Sep 01 06:41:45 2002 +0000 @@ -756,7 +756,7 @@ DET_QUITE_IMPROBABLE = -2, /* An erroneous sequence was seen. */ DET_NEARLY_IMPOSSIBLE = -3, - DET_LOWEST = 3, + DET_LOWEST = -3, }; extern int coding_detector_count;
--- a/src/mule-coding.c Sat Aug 31 11:04:01 2002 +0000 +++ b/src/mule-coding.c Sun Sep 01 06:41:45 2002 +0000 @@ -599,6 +599,7 @@ struct big5_detector { int seen_big5_char; + int seen_euc_char; unsigned int seen_iso2022_esc:1; unsigned int seen_bad_first_byte:1; unsigned int seen_bad_second_byte:1; @@ -628,7 +629,9 @@ else { data->in_second_byte = 0; - if ((c >= 0x40 && c <= 0x7E) || (c >= 0xA1 && c <= 0xFE)) + if (c >= 0xA1 && c <= 0xFE) + data->seen_euc_char++; + else if (c >= 0x40 && c <= 0x7E) data->seen_big5_char++; else data->seen_bad_second_byte = 1; @@ -643,6 +646,8 @@ DET_RESULT (st, big5) = DET_SOMEWHAT_UNLIKELY; else if (data->seen_big5_char >= 4) DET_RESULT (st, big5) = DET_SOMEWHAT_LIKELY; + else if (data->seen_euc_char) + DET_RESULT (st, big5) = DET_SLIGHTLY_LIKELY; else DET_RESULT (st, big5) = DET_AS_LIKELY_AS_UNLIKELY; } @@ -2749,6 +2754,7 @@ unsigned int bad_multibyte_escape_sequences; unsigned int good_multibyte_escape_sequences; int even_high_byte_groups; + int longest_even_high_byte; int odd_high_byte_groups; }; @@ -2794,7 +2800,11 @@ if (data->high_byte_count & 1) data->odd_high_byte_groups++; else - data->even_high_byte_groups++; + { + data->even_high_byte_groups++; + if (data->longest_even_high_byte < data->high_byte_count) + data->longest_even_high_byte = data->high_byte_count; + } } data->high_byte_count = 0; data->saw_single_shift_just_now = 0; @@ -2861,6 +2871,19 @@ label_continue_loop:; } + if (data->high_byte_count && + !data->saw_single_shift_just_now) + { + if (data->high_byte_count & 1) + data->odd_high_byte_groups++; + else + { + data->even_high_byte_groups++; + if (data->longest_even_high_byte < data->high_byte_count) + data->longest_even_high_byte = data->high_byte_count; + } + } + if (data->bad_multibyte_escape_sequences > 2 || (data->bad_multibyte_escape_sequences > 0 && data->good_multibyte_escape_sequences / @@ -2919,6 +2942,7 @@ else if (data->odd_high_byte_groups == 0 && data->even_high_byte_groups > 0) { +#if 0 
SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY); if (data->even_high_byte_groups > 10) { @@ -2930,6 +2954,15 @@ DET_RESULT (st, iso_8_1) = DET_SOMEWHAT_UNLIKELY; /* else it stays at quite improbable */ } +#else + SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY); + if (data->seen_single_shift) + DET_RESULT (st, iso_8_2) = DET_QUITE_PROBABLE; + else if (data->even_high_byte_groups > 10) + DET_RESULT (st, iso_8_2) = DET_SOMEWHAT_LIKELY; + else if (data->longest_even_high_byte > 6) + DET_RESULT (st, iso_8_2) = DET_SLIGHTLY_LIKELY; +#endif } else if (data->odd_high_byte_groups > 0 && data->even_high_byte_groups > 0)
--- a/src/unicode.c Sat Aug 31 11:04:01 2002 +0000 +++ b/src/unicode.c Sun Sep 01 06:41:45 2002 +0000 @@ -159,6 +159,8 @@ Lisp_Object Qutf_16_little_endian, Qutf_16_bom; Lisp_Object Qutf_16_little_endian_bom; +Lisp_Object Qutf_8_bom; + #ifdef MULE /* #### Using ints for to_unicode is OK (as long as they are >= 32 bits). @@ -1924,6 +1926,7 @@ /* DEFINE_DETECTOR (utf_7); */ DEFINE_DETECTOR (utf_8); DEFINE_DETECTOR_CATEGORY (utf_8, utf_8); +DEFINE_DETECTOR_CATEGORY (utf_8, utf_8_bom); DEFINE_DETECTOR (ucs_4); DEFINE_DETECTOR_CATEGORY (ucs_4, ucs_4); DEFINE_DETECTOR (utf_16); @@ -2081,6 +2084,9 @@ struct utf_8_detector { + int byteno; + int first_byte; + int second_byte; int in_utf_8_byte; }; @@ -2093,11 +2099,32 @@ while (n--) { UExtbyte c = *src++; + switch (data->byteno) + { + case 0: + data->first_byte = c; + break; + case 1: + data->second_byte = c; + break; + case 2: + if (data->first_byte == 0xef && + data->second_byte == 0xbb && + c == 0xbf) + { + SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE); + DET_RESULT (st, utf_8_bom) = DET_NEAR_CERTAINTY; + return; + } + break; + } + switch (data->in_utf_8_byte) { case 0: if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) { + SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE); DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; return; } @@ -2113,6 +2140,7 @@ data->in_utf_8_byte = 1; else if (c >= 0x80) { + SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE); DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; return; } @@ -2120,14 +2148,17 @@ default: if ((c & 0xc0) != 0x80) { + SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE); DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; return; } else data->in_utf_8_byte--; } + + data->byteno++; } - DET_RESULT (st, utf_8) = DET_SOMEWHAT_LIKELY; + SET_DET_RESULTS (st, utf_8, DET_SOMEWHAT_LIKELY); } static void @@ -2256,6 +2287,9 @@ DEFSYMBOL (Qutf_16_little_endian); DEFSYMBOL (Qutf_16_bom); DEFSYMBOL (Qutf_16_little_endian_bom); + + DEFSYMBOL (Qutf_8); + DEFSYMBOL 
(Qutf_8_bom); } void @@ -2272,6 +2306,7 @@ INITIALIZE_DETECTOR (utf_8); DETECTOR_HAS_METHOD (utf_8, detect); INITIALIZE_DETECTOR_CATEGORY (utf_8, utf_8); + INITIALIZE_DETECTOR_CATEGORY (utf_8, utf_8_bom); INITIALIZE_DETECTOR (ucs_4); DETECTOR_HAS_METHOD (ucs_4, detect);