changeset 985:7f62a956b825

[xemacs-hg @ 2002-09-01 06:41:40 by youngs] 2002-08-31 Seiichi Ikiuo <ikiuo@polyphony.co.jp> * code-init.el (reset-coding-categories-to-default): Add new coding category 'utf-8-bom'. * coding.el (coding-system-category): Add check for 'utf-8-bom'. * unicode.el: Add new coding system 'utf-8-bom'. 2002-08-31 Seiichi Ikiuo <ikiuo@polyphony.co.jp> * file-coding.h (detection_result): Fix DET_LOWEST. * mule-coding.c (big5_detector): Add member 'seen_euc_char'. (big5_detect): Avoid interfering with EUC detection. (iso2022_detector): Add member 'longest_even_high_byte'. (iso2022_detect): Fix checking for even/odd_high_byte_group. Change how EUC is judged. * unicode.c: Add DETECTOR_CATEGORY utf_8_bom. (utf_8_detector): Add members byteno, first_byte and second_byte. (utf_8_detect): Add detection of UTF-8-BOM.
author youngs
date Sun, 01 Sep 2002 06:41:45 +0000
parents c55a519aa13f
children 9b80efded6a5
files lisp/ChangeLog lisp/code-init.el lisp/coding.el lisp/unicode.el src/ChangeLog src/file-coding.h src/mule-coding.c src/unicode.c
diffstat 8 files changed, 115 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/lisp/ChangeLog	Sat Aug 31 11:04:01 2002 +0000
+++ b/lisp/ChangeLog	Sun Sep 01 06:41:45 2002 +0000
@@ -1,3 +1,12 @@
+2002-08-31  Seiichi Ikiuo  <ikiuo@polyphony.co.jp>
+
+	* code-init.el (reset-coding-categories-to-default): Add new
+	coding category 'utf-8-bom'.
+
+	* coding.el (coding-system-category): Add check for 'utf-8-bom'.
+
+	* unicode.el: Add new coding system 'utf-8-bom'.
+
 2002-08-30  Steve Youngs  <youngs@xemacs.org>
 
 	* XEmacs 21.5.9 "brussels sprouts" is released.
--- a/lisp/code-init.el	Sat Aug 31 11:04:01 2002 +0000
+++ b/lisp/code-init.el	Sun Sep 01 06:41:45 2002 +0000
@@ -276,6 +276,7 @@
 	--------------------------------------------------
         utf-16-little-endian-bom  utf-16-little-endian
 	utf-16-bom		  utf-16-bom
+	utf-8-bom		  utf-8-bom
 	iso-7			  iso-2022-7bit
 	no-conversion		  raw-text
 	utf-8			  utf-8
@@ -324,6 +325,7 @@
    (coding-system-variable-default-value 'no-conversion-coding-system-mapping))
   (set-coding-category-system 'ucs-4 'ucs-4)
   (set-coding-category-system 'utf-8 'utf-8)
+  (set-coding-category-system 'utf-8-bom 'utf-8-bom)
   (set-coding-category-system 'utf-16-little-endian 'utf-16-little-endian)
   (set-coding-category-system 'utf-16 'utf-16)
   (set-coding-category-system 'utf-16-little-endian-bom
@@ -333,6 +335,7 @@
    (if (featurep 'mule)
        '(utf-16-little-endian-bom
 	 utf-16-bom
+	 utf-8-bom
 	 iso-7
 	 no-conversion
 	 utf-8
@@ -347,6 +350,7 @@
 	 ucs-4)
      '(utf-16-little-endian-bom
        utf-16-bom
+       utf-8-bom
        no-conversion
        utf-8
        utf-16-little-endian
--- a/lisp/coding.el	Sat Aug 31 11:04:01 2002 +0000
+++ b/lisp/coding.el	Sun Sep 01 06:41:45 2002 +0000
@@ -201,7 +201,10 @@
 	(no-conversion 'no-conversion)
 	(shift-jis 'shift-jis)
 	(unicode (case (coding-system-property coding-system 'type)
-		   (utf-8 'utf-8)
+		   (utf-8 (let ((bom (coding-system-property coding-system
+							     'need-bom)))
+			    (cond (bom 'utf-8-bom)
+				  ((not bom) 'utf-8))))
 		   (ucs-4 'ucs-4)
 		   (utf-16 (let ((bom (coding-system-property coding-system
 							      'need-bom))
--- a/lisp/unicode.el	Sat Aug 31 11:04:01 2002 +0000
+++ b/lisp/unicode.el	Sun Sep 01 06:41:45 2002 +0000
@@ -281,6 +281,17 @@
 "
    type utf-8))
 
+(make-coding-system
+ 'utf-8-bom 'unicode
+ "UTF-8 w/BOM"
+ '(mnemonic "MSW-UTF8"
+   documentation
+   "UTF-8 Unicode encoding, with byte order mark.
+Standard encoding for representing UTF-8 under MS Windows."
+   type utf-8
+   little-endian t
+   need-bom t))
+
 ;; #### UTF-7 is not yet implemented, and it's tricky to do.  There's
 ;; an implementation in appendix A.1 of the Unicode Standard, Version
 ;; 2.0, but I don't know its licensing characteristics.
--- a/src/ChangeLog	Sat Aug 31 11:04:01 2002 +0000
+++ b/src/ChangeLog	Sun Sep 01 06:41:45 2002 +0000
@@ -1,3 +1,18 @@
+2002-08-31  Seiichi Ikiuo  <ikiuo@polyphony.co.jp>
+
+	* file-coding.h (detection_result): fix DET_LOWEST.
+
+	* mule-coding.c
+	(big5_detector): Add member 'seen_euc_char'.
+	(big5_detect): Avoid interfering with EUC detection.
+	(iso2022_detector): Add member 'longest_even_high_byte'.
+	(iso2022_detect): Fix checking for even/odd_high_byte_group.
+	Change how EUC is judged.
+
+	* unicode.c: Add DETECTOR_CATEGORY utf_8_bom.
+	(utf_8_detector): Add members byteno, first_byte and second_byte.
+	(utf_8_detect): Add detection of UTF-8-BOM.
+
 2002-08-30  Steve Youngs  <youngs@xemacs.org>
 
 	* XEmacs 21.5.9 "brussels sprouts" is released.
--- a/src/file-coding.h	Sat Aug 31 11:04:01 2002 +0000
+++ b/src/file-coding.h	Sun Sep 01 06:41:45 2002 +0000
@@ -756,7 +756,7 @@
   DET_QUITE_IMPROBABLE = -2,
   /* An erroneous sequence was seen. */
   DET_NEARLY_IMPOSSIBLE = -3,
-  DET_LOWEST = 3,
+  DET_LOWEST = -3,
  };
 
 extern int coding_detector_count;
--- a/src/mule-coding.c	Sat Aug 31 11:04:01 2002 +0000
+++ b/src/mule-coding.c	Sun Sep 01 06:41:45 2002 +0000
@@ -599,6 +599,7 @@
 struct big5_detector
 {
   int seen_big5_char;
+  int seen_euc_char;
   unsigned int seen_iso2022_esc:1;
   unsigned int seen_bad_first_byte:1;
   unsigned int seen_bad_second_byte:1;
@@ -628,7 +629,9 @@
       else
 	{
 	  data->in_second_byte = 0;
-	  if ((c >= 0x40 && c <= 0x7E) || (c >= 0xA1 && c <= 0xFE))
+	  if (c >= 0xA1 && c <= 0xFE)
+	    data->seen_euc_char++;
+	  else if (c >= 0x40 && c <= 0x7E)
 	    data->seen_big5_char++;
 	  else
 	    data->seen_bad_second_byte = 1;
@@ -643,6 +646,8 @@
     DET_RESULT (st, big5) = DET_SOMEWHAT_UNLIKELY;
   else if (data->seen_big5_char >= 4)
     DET_RESULT (st, big5) = DET_SOMEWHAT_LIKELY;
+  else if (data->seen_euc_char)
+    DET_RESULT (st, big5) = DET_SLIGHTLY_LIKELY;
   else
     DET_RESULT (st, big5) = DET_AS_LIKELY_AS_UNLIKELY;
 }
@@ -2749,6 +2754,7 @@
   unsigned int bad_multibyte_escape_sequences;
   unsigned int good_multibyte_escape_sequences;
   int even_high_byte_groups;
+  int longest_even_high_byte;
   int odd_high_byte_groups;
 };
 
@@ -2794,7 +2800,11 @@
 	      if (data->high_byte_count & 1)
 		data->odd_high_byte_groups++;
 	      else
-		data->even_high_byte_groups++;
+		{
+		  data->even_high_byte_groups++;
+		  if (data->longest_even_high_byte < data->high_byte_count)
+		    data->longest_even_high_byte = data->high_byte_count;
+		}
 	    }
 	  data->high_byte_count = 0;
 	  data->saw_single_shift_just_now = 0;
@@ -2861,6 +2871,19 @@
     label_continue_loop:;
     }
 
+  if (data->high_byte_count &&
+      !data->saw_single_shift_just_now)
+    {
+      if (data->high_byte_count & 1)
+	data->odd_high_byte_groups++;
+      else
+	{
+	  data->even_high_byte_groups++;
+	  if (data->longest_even_high_byte < data->high_byte_count)
+	    data->longest_even_high_byte = data->high_byte_count;
+	}
+    }
+
   if (data->bad_multibyte_escape_sequences > 2 ||
       (data->bad_multibyte_escape_sequences > 0 &&
        data->good_multibyte_escape_sequences /
@@ -2919,6 +2942,7 @@
   else if (data->odd_high_byte_groups == 0 &&
 	   data->even_high_byte_groups > 0)
     {
+#if 0
       SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY);
       if (data->even_high_byte_groups > 10)
 	{
@@ -2930,6 +2954,15 @@
 	    DET_RESULT (st, iso_8_1) = DET_SOMEWHAT_UNLIKELY;
 	  /* else it stays at quite improbable */
 	}
+#else
+      SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY);
+      if (data->seen_single_shift)
+	DET_RESULT (st, iso_8_2) = DET_QUITE_PROBABLE;
+      else if (data->even_high_byte_groups > 10)
+	DET_RESULT (st, iso_8_2) = DET_SOMEWHAT_LIKELY;
+      else if (data->longest_even_high_byte > 6)
+	DET_RESULT (st, iso_8_2) = DET_SLIGHTLY_LIKELY;
+#endif
     }
   else if (data->odd_high_byte_groups > 0 &&
 	   data->even_high_byte_groups > 0)
--- a/src/unicode.c	Sat Aug 31 11:04:01 2002 +0000
+++ b/src/unicode.c	Sun Sep 01 06:41:45 2002 +0000
@@ -159,6 +159,8 @@
 Lisp_Object Qutf_16_little_endian, Qutf_16_bom;
 Lisp_Object Qutf_16_little_endian_bom;
 
+Lisp_Object Qutf_8_bom;
+
 #ifdef MULE 
 
 /* #### Using ints for to_unicode is OK (as long as they are >= 32 bits).
@@ -1924,6 +1926,7 @@
 /* DEFINE_DETECTOR (utf_7); */
 DEFINE_DETECTOR (utf_8);
 DEFINE_DETECTOR_CATEGORY (utf_8, utf_8);
+DEFINE_DETECTOR_CATEGORY (utf_8, utf_8_bom);
 DEFINE_DETECTOR (ucs_4);
 DEFINE_DETECTOR_CATEGORY (ucs_4, ucs_4);
 DEFINE_DETECTOR (utf_16);
@@ -2081,6 +2084,9 @@
 
 struct utf_8_detector
 {
+  int byteno;
+  int first_byte;
+  int second_byte;
   int in_utf_8_byte;
 };
 
@@ -2093,11 +2099,32 @@
   while (n--)
     {
       UExtbyte c = *src++;
+      switch (data->byteno)
+	{
+	case 0:
+	  data->first_byte = c;
+	  break;
+	case 1:
+	  data->second_byte = c;
+	  break;
+	case 2:
+	  if (data->first_byte == 0xef &&
+	      data->second_byte == 0xbb &&
+	      c == 0xbf)
+	    {
+	      SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE);
+	      DET_RESULT (st, utf_8_bom) = DET_NEAR_CERTAINTY;
+	      return;
+	    }
+	  break;
+	}
+
       switch (data->in_utf_8_byte)
 	{
 	case 0:
 	  if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 	    {
+	      SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE);
 	      DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY;
 	      return;
 	    }
@@ -2113,6 +2140,7 @@
 	    data->in_utf_8_byte = 1;
 	  else if (c >= 0x80)
 	    {
+	      SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE);
 	      DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY;
 	      return;
 	    }
@@ -2120,14 +2148,17 @@
 	default:
 	  if ((c & 0xc0) != 0x80)
 	    {
+	      SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE);
 	      DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY;
 	      return;
 	    }
 	  else
 	    data->in_utf_8_byte--;
 	}
+
+      data->byteno++;
     }
-  DET_RESULT (st, utf_8) = DET_SOMEWHAT_LIKELY;
+  SET_DET_RESULTS (st, utf_8, DET_SOMEWHAT_LIKELY);
 }
 
 static void
@@ -2256,6 +2287,9 @@
   DEFSYMBOL (Qutf_16_little_endian);
   DEFSYMBOL (Qutf_16_bom);
   DEFSYMBOL (Qutf_16_little_endian_bom);
+
+  DEFSYMBOL (Qutf_8);
+  DEFSYMBOL (Qutf_8_bom);
 }
 
 void
@@ -2272,6 +2306,7 @@
   INITIALIZE_DETECTOR (utf_8);
   DETECTOR_HAS_METHOD (utf_8, detect);
   INITIALIZE_DETECTOR_CATEGORY (utf_8, utf_8);
+  INITIALIZE_DETECTOR_CATEGORY (utf_8, utf_8_bom);
 
   INITIALIZE_DETECTOR (ucs_4);
   DETECTOR_HAS_METHOD (ucs_4, detect);