comparison src/mule-coding.c @ 985:7f62a956b825

[xemacs-hg @ 2002-09-01 06:41:40 by youngs] 2002-08-31 Seiichi Ikiuo <ikiuo@polyphony.co.jp> * code-init.el (reset-coding-categories-to-default): Add new coding category 'utf-8-bom'. * coding.el (coding-system-category): Add check for 'utf-8-bom'. * unicode.el: Add new coding system 'utf-8-bom'. 2002-08-31 Seiichi Ikiuo <ikiuo@polyphony.co.jp> * file-coding.h (detection_result): Fix DET_LOWEST. * mule-coding.c (big5_detector): Add member 'seen_euc_char'. (big5_detect): Changed so that Big5 detection no longer interferes with EUC detection. (iso2022_detector): Add member 'longest_even_high_byte'. (iso2022_detect): Fix the checking of even/odd high-byte groups. Change the heuristic used to judge EUC. * unicode.c: Add DETECTOR_CATEGORY utf_8_bom. (utf_8_detector): Add members byteno, first_byte and second_byte. (utf_8_detect): Add detection of UTF-8-BOM.
author youngs
date Sun, 01 Sep 2002 06:41:45 +0000
parents 804517e16990
children e22b0213b713
comparison
equal deleted inserted replaced
984:c55a519aa13f 985:7f62a956b825
597 DEFINE_DETECTOR_CATEGORY (big5, big5); 597 DEFINE_DETECTOR_CATEGORY (big5, big5);
598 598
599 struct big5_detector 599 struct big5_detector
600 { 600 {
601 int seen_big5_char; 601 int seen_big5_char;
602 int seen_euc_char;
602 unsigned int seen_iso2022_esc:1; 603 unsigned int seen_iso2022_esc:1;
603 unsigned int seen_bad_first_byte:1; 604 unsigned int seen_bad_first_byte:1;
604 unsigned int seen_bad_second_byte:1; 605 unsigned int seen_bad_second_byte:1;
605 606
606 /* temporary */ 607 /* temporary */
626 data->seen_bad_first_byte = 1; 627 data->seen_bad_first_byte = 1;
627 } 628 }
628 else 629 else
629 { 630 {
630 data->in_second_byte = 0; 631 data->in_second_byte = 0;
631 if ((c >= 0x40 && c <= 0x7E) || (c >= 0xA1 && c <= 0xFE)) 632 if (c >= 0xA1 && c <= 0xFE)
633 data->seen_euc_char++;
634 else if (c >= 0x40 && c <= 0x7E)
632 data->seen_big5_char++; 635 data->seen_big5_char++;
633 else 636 else
634 data->seen_bad_second_byte = 1; 637 data->seen_bad_second_byte = 1;
635 } 638 }
636 } 639 }
641 DET_RESULT (st, big5) = DET_QUITE_IMPROBABLE; 644 DET_RESULT (st, big5) = DET_QUITE_IMPROBABLE;
642 else if (data->seen_iso2022_esc) 645 else if (data->seen_iso2022_esc)
643 DET_RESULT (st, big5) = DET_SOMEWHAT_UNLIKELY; 646 DET_RESULT (st, big5) = DET_SOMEWHAT_UNLIKELY;
644 else if (data->seen_big5_char >= 4) 647 else if (data->seen_big5_char >= 4)
645 DET_RESULT (st, big5) = DET_SOMEWHAT_LIKELY; 648 DET_RESULT (st, big5) = DET_SOMEWHAT_LIKELY;
649 else if (data->seen_euc_char)
650 DET_RESULT (st, big5) = DET_SLIGHTLY_LIKELY;
646 else 651 else
647 DET_RESULT (st, big5) = DET_AS_LIKELY_AS_UNLIKELY; 652 DET_RESULT (st, big5) = DET_AS_LIKELY_AS_UNLIKELY;
648 } 653 }
649 654
650 655
2747 unsigned int seen_designate:1; 2752 unsigned int seen_designate:1;
2748 unsigned int bad_single_byte_sequences; 2753 unsigned int bad_single_byte_sequences;
2749 unsigned int bad_multibyte_escape_sequences; 2754 unsigned int bad_multibyte_escape_sequences;
2750 unsigned int good_multibyte_escape_sequences; 2755 unsigned int good_multibyte_escape_sequences;
2751 int even_high_byte_groups; 2756 int even_high_byte_groups;
2757 int longest_even_high_byte;
2752 int odd_high_byte_groups; 2758 int odd_high_byte_groups;
2753 }; 2759 };
2754 2760
2755 static void 2761 static void
2756 iso2022_detect (struct detection_state *st, const UExtbyte *src, 2762 iso2022_detect (struct detection_state *st, const UExtbyte *src,
2792 !data->saw_single_shift_just_now) 2798 !data->saw_single_shift_just_now)
2793 { 2799 {
2794 if (data->high_byte_count & 1) 2800 if (data->high_byte_count & 1)
2795 data->odd_high_byte_groups++; 2801 data->odd_high_byte_groups++;
2796 else 2802 else
2797 data->even_high_byte_groups++; 2803 {
2804 data->even_high_byte_groups++;
2805 if (data->longest_even_high_byte < data->high_byte_count)
2806 data->longest_even_high_byte = data->high_byte_count;
2807 }
2798 } 2808 }
2799 data->high_byte_count = 0; 2809 data->high_byte_count = 0;
2800 data->saw_single_shift_just_now = 0; 2810 data->saw_single_shift_just_now = 0;
2801 } 2811 }
2802 if (!(data->flags & ISO_STATE_ESCAPE) 2812 if (!(data->flags & ISO_STATE_ESCAPE)
2859 } 2869 }
2860 } 2870 }
2861 label_continue_loop:; 2871 label_continue_loop:;
2862 } 2872 }
2863 2873
2874 if (data->high_byte_count &&
2875 !data->saw_single_shift_just_now)
2876 {
2877 if (data->high_byte_count & 1)
2878 data->odd_high_byte_groups++;
2879 else
2880 {
2881 data->even_high_byte_groups++;
2882 if (data->longest_even_high_byte < data->high_byte_count)
2883 data->longest_even_high_byte = data->high_byte_count;
2884 }
2885 }
2886
2864 if (data->bad_multibyte_escape_sequences > 2 || 2887 if (data->bad_multibyte_escape_sequences > 2 ||
2865 (data->bad_multibyte_escape_sequences > 0 && 2888 (data->bad_multibyte_escape_sequences > 0 &&
2866 data->good_multibyte_escape_sequences / 2889 data->good_multibyte_escape_sequences /
2867 data->bad_multibyte_escape_sequences < 10)) 2890 data->bad_multibyte_escape_sequences < 10))
2868 /* Just making it up ... */ 2891 /* Just making it up ... */
2917 DET_RESULT (st, iso_8_1) = DET_SOMEWHAT_LIKELY; 2940 DET_RESULT (st, iso_8_1) = DET_SOMEWHAT_LIKELY;
2918 } 2941 }
2919 else if (data->odd_high_byte_groups == 0 && 2942 else if (data->odd_high_byte_groups == 0 &&
2920 data->even_high_byte_groups > 0) 2943 data->even_high_byte_groups > 0)
2921 { 2944 {
2945 #if 0
2922 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY); 2946 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY);
2923 if (data->even_high_byte_groups > 10) 2947 if (data->even_high_byte_groups > 10)
2924 { 2948 {
2925 if (data->seen_single_shift) 2949 if (data->seen_single_shift)
2926 DET_RESULT (st, iso_8_2) = DET_QUITE_PROBABLE; 2950 DET_RESULT (st, iso_8_2) = DET_QUITE_PROBABLE;
2928 DET_RESULT (st, iso_8_2) = DET_SOMEWHAT_LIKELY; 2952 DET_RESULT (st, iso_8_2) = DET_SOMEWHAT_LIKELY;
2929 if (data->even_high_byte_groups < 50) 2953 if (data->even_high_byte_groups < 50)
2930 DET_RESULT (st, iso_8_1) = DET_SOMEWHAT_UNLIKELY; 2954 DET_RESULT (st, iso_8_1) = DET_SOMEWHAT_UNLIKELY;
2931 /* else it stays at quite improbable */ 2955 /* else it stays at quite improbable */
2932 } 2956 }
2957 #else
2958 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY);
2959 if (data->seen_single_shift)
2960 DET_RESULT (st, iso_8_2) = DET_QUITE_PROBABLE;
2961 else if (data->even_high_byte_groups > 10)
2962 DET_RESULT (st, iso_8_2) = DET_SOMEWHAT_LIKELY;
2963 else if (data->longest_even_high_byte > 6)
2964 DET_RESULT (st, iso_8_2) = DET_SLIGHTLY_LIKELY;
2965 #endif
2933 } 2966 }
2934 else if (data->odd_high_byte_groups > 0 && 2967 else if (data->odd_high_byte_groups > 0 &&
2935 data->even_high_byte_groups > 0) 2968 data->even_high_byte_groups > 0)
2936 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY); 2969 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY);
2937 else 2970 else