Mercurial > hg > xemacs-beta
comparison src/mule-coding.c @ 985:7f62a956b825
[xemacs-hg @ 2002-09-01 06:41:40 by youngs]
2002-08-31 Seiichi Ikiuo <ikiuo@polyphony.co.jp>
* code-init.el (reset-coding-categories-to-default): Add new
coding category 'utf-8-bom'.
* coding.el (coding-system-category): Add check for 'utf-8-bom'.
* unicode.el: Add new coding system 'utf-8-bom'.
2002-08-31 Seiichi Ikiuo <ikiuo@polyphony.co.jp>
* file-coding.h (detection_result): Fix DET_LOWEST.
* mule-coding.c
(big5_detector): Add member 'seen_euc_char'.
(big5_detect): Count second bytes in the range 0xA1-0xFE as EUC rather than Big5, so that Big5 detection no longer interferes with EUC detection.
(iso2022_detector): Add member 'longest_even_high_byte'.
(iso2022_detect): Fix checking for even/odd_high_byte_group.
Change the heuristic used to judge whether the text is EUC.
* unicode.c: Add DETECTOR_CATEGORY utf_8_bom.
(utf_8_detector): Add members byteno, first_byte and second_byte.
(utf_8_detect): Add detection of UTF-8-BOM.
author | youngs |
---|---|
date | Sun, 01 Sep 2002 06:41:45 +0000 |
parents | 804517e16990 |
children | e22b0213b713 |
comparison
equal
deleted
inserted
replaced
984:c55a519aa13f | 985:7f62a956b825 |
---|---|
597 DEFINE_DETECTOR_CATEGORY (big5, big5); | 597 DEFINE_DETECTOR_CATEGORY (big5, big5); |
598 | 598 |
599 struct big5_detector | 599 struct big5_detector |
600 { | 600 { |
601 int seen_big5_char; | 601 int seen_big5_char; |
602 int seen_euc_char; | |
602 unsigned int seen_iso2022_esc:1; | 603 unsigned int seen_iso2022_esc:1; |
603 unsigned int seen_bad_first_byte:1; | 604 unsigned int seen_bad_first_byte:1; |
604 unsigned int seen_bad_second_byte:1; | 605 unsigned int seen_bad_second_byte:1; |
605 | 606 |
606 /* temporary */ | 607 /* temporary */ |
626 data->seen_bad_first_byte = 1; | 627 data->seen_bad_first_byte = 1; |
627 } | 628 } |
628 else | 629 else |
629 { | 630 { |
630 data->in_second_byte = 0; | 631 data->in_second_byte = 0; |
631 if ((c >= 0x40 && c <= 0x7E) || (c >= 0xA1 && c <= 0xFE)) | 632 if (c >= 0xA1 && c <= 0xFE) |
633 data->seen_euc_char++; | |
634 else if (c >= 0x40 && c <= 0x7E) | |
632 data->seen_big5_char++; | 635 data->seen_big5_char++; |
633 else | 636 else |
634 data->seen_bad_second_byte = 1; | 637 data->seen_bad_second_byte = 1; |
635 } | 638 } |
636 } | 639 } |
641 DET_RESULT (st, big5) = DET_QUITE_IMPROBABLE; | 644 DET_RESULT (st, big5) = DET_QUITE_IMPROBABLE; |
642 else if (data->seen_iso2022_esc) | 645 else if (data->seen_iso2022_esc) |
643 DET_RESULT (st, big5) = DET_SOMEWHAT_UNLIKELY; | 646 DET_RESULT (st, big5) = DET_SOMEWHAT_UNLIKELY; |
644 else if (data->seen_big5_char >= 4) | 647 else if (data->seen_big5_char >= 4) |
645 DET_RESULT (st, big5) = DET_SOMEWHAT_LIKELY; | 648 DET_RESULT (st, big5) = DET_SOMEWHAT_LIKELY; |
649 else if (data->seen_euc_char) | |
650 DET_RESULT (st, big5) = DET_SLIGHTLY_LIKELY; | |
646 else | 651 else |
647 DET_RESULT (st, big5) = DET_AS_LIKELY_AS_UNLIKELY; | 652 DET_RESULT (st, big5) = DET_AS_LIKELY_AS_UNLIKELY; |
648 } | 653 } |
649 | 654 |
650 | 655 |
2747 unsigned int seen_designate:1; | 2752 unsigned int seen_designate:1; |
2748 unsigned int bad_single_byte_sequences; | 2753 unsigned int bad_single_byte_sequences; |
2749 unsigned int bad_multibyte_escape_sequences; | 2754 unsigned int bad_multibyte_escape_sequences; |
2750 unsigned int good_multibyte_escape_sequences; | 2755 unsigned int good_multibyte_escape_sequences; |
2751 int even_high_byte_groups; | 2756 int even_high_byte_groups; |
2757 int longest_even_high_byte; | |
2752 int odd_high_byte_groups; | 2758 int odd_high_byte_groups; |
2753 }; | 2759 }; |
2754 | 2760 |
2755 static void | 2761 static void |
2756 iso2022_detect (struct detection_state *st, const UExtbyte *src, | 2762 iso2022_detect (struct detection_state *st, const UExtbyte *src, |
2792 !data->saw_single_shift_just_now) | 2798 !data->saw_single_shift_just_now) |
2793 { | 2799 { |
2794 if (data->high_byte_count & 1) | 2800 if (data->high_byte_count & 1) |
2795 data->odd_high_byte_groups++; | 2801 data->odd_high_byte_groups++; |
2796 else | 2802 else |
2797 data->even_high_byte_groups++; | 2803 { |
2804 data->even_high_byte_groups++; | |
2805 if (data->longest_even_high_byte < data->high_byte_count) | |
2806 data->longest_even_high_byte = data->high_byte_count; | |
2807 } | |
2798 } | 2808 } |
2799 data->high_byte_count = 0; | 2809 data->high_byte_count = 0; |
2800 data->saw_single_shift_just_now = 0; | 2810 data->saw_single_shift_just_now = 0; |
2801 } | 2811 } |
2802 if (!(data->flags & ISO_STATE_ESCAPE) | 2812 if (!(data->flags & ISO_STATE_ESCAPE) |
2859 } | 2869 } |
2860 } | 2870 } |
2861 label_continue_loop:; | 2871 label_continue_loop:; |
2862 } | 2872 } |
2863 | 2873 |
2874 if (data->high_byte_count && | |
2875 !data->saw_single_shift_just_now) | |
2876 { | |
2877 if (data->high_byte_count & 1) | |
2878 data->odd_high_byte_groups++; | |
2879 else | |
2880 { | |
2881 data->even_high_byte_groups++; | |
2882 if (data->longest_even_high_byte < data->high_byte_count) | |
2883 data->longest_even_high_byte = data->high_byte_count; | |
2884 } | |
2885 } | |
2886 | |
2864 if (data->bad_multibyte_escape_sequences > 2 || | 2887 if (data->bad_multibyte_escape_sequences > 2 || |
2865 (data->bad_multibyte_escape_sequences > 0 && | 2888 (data->bad_multibyte_escape_sequences > 0 && |
2866 data->good_multibyte_escape_sequences / | 2889 data->good_multibyte_escape_sequences / |
2867 data->bad_multibyte_escape_sequences < 10)) | 2890 data->bad_multibyte_escape_sequences < 10)) |
2868 /* Just making it up ... */ | 2891 /* Just making it up ... */ |
2917 DET_RESULT (st, iso_8_1) = DET_SOMEWHAT_LIKELY; | 2940 DET_RESULT (st, iso_8_1) = DET_SOMEWHAT_LIKELY; |
2918 } | 2941 } |
2919 else if (data->odd_high_byte_groups == 0 && | 2942 else if (data->odd_high_byte_groups == 0 && |
2920 data->even_high_byte_groups > 0) | 2943 data->even_high_byte_groups > 0) |
2921 { | 2944 { |
2945 #if 0 | |
2922 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY); | 2946 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY); |
2923 if (data->even_high_byte_groups > 10) | 2947 if (data->even_high_byte_groups > 10) |
2924 { | 2948 { |
2925 if (data->seen_single_shift) | 2949 if (data->seen_single_shift) |
2926 DET_RESULT (st, iso_8_2) = DET_QUITE_PROBABLE; | 2950 DET_RESULT (st, iso_8_2) = DET_QUITE_PROBABLE; |
2928 DET_RESULT (st, iso_8_2) = DET_SOMEWHAT_LIKELY; | 2952 DET_RESULT (st, iso_8_2) = DET_SOMEWHAT_LIKELY; |
2929 if (data->even_high_byte_groups < 50) | 2953 if (data->even_high_byte_groups < 50) |
2930 DET_RESULT (st, iso_8_1) = DET_SOMEWHAT_UNLIKELY; | 2954 DET_RESULT (st, iso_8_1) = DET_SOMEWHAT_UNLIKELY; |
2931 /* else it stays at quite improbable */ | 2955 /* else it stays at quite improbable */ |
2932 } | 2956 } |
2957 #else | |
2958 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY); | |
2959 if (data->seen_single_shift) | |
2960 DET_RESULT (st, iso_8_2) = DET_QUITE_PROBABLE; | |
2961 else if (data->even_high_byte_groups > 10) | |
2962 DET_RESULT (st, iso_8_2) = DET_SOMEWHAT_LIKELY; | |
2963 else if (data->longest_even_high_byte > 6) | |
2964 DET_RESULT (st, iso_8_2) = DET_SLIGHTLY_LIKELY; | |
2965 #endif | |
2933 } | 2966 } |
2934 else if (data->odd_high_byte_groups > 0 && | 2967 else if (data->odd_high_byte_groups > 0 && |
2935 data->even_high_byte_groups > 0) | 2968 data->even_high_byte_groups > 0) |
2936 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY); | 2969 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY); |
2937 else | 2970 else |