diff src/unicode.c @ 985:7f62a956b825

[xemacs-hg @ 2002-09-01 06:41:40 by youngs] 2002-08-31 Seiichi Ikiuo <ikiuo@polyphony.co.jp> * code-init.el (reset-coding-categories-to-default): Add new coding category 'utf-8-bom'. * coding.el (coding-system-category): Add check for 'utf-8-bom'. * unicode.el: Add new coding system 'utf-8-bom'. 2002-08-31 Seiichi Ikiuo <ikiuo@polyphony.co.jp> * file-coding.h (detection_result): fix DET_LOWEST. * mule-coding.c (big5_detector): Add member 'seen_euc_char'. (big5_detect): It was made not to become the trouble of EUC. (iso2022_detector): Add member 'longest_even_high_byte'. (iso2022_detect): Fix checking for even/odd_high_byte_group. The judgment of EUC is changed. * unicode.c: Add DETECTOR_CATEGORY utf_8_bom. (utf_8_detector): Add member byteno, first_byte and second_byte. (utf_8_detect): The judgment of UTF-8-BOM is added.
author youngs
date Sun, 01 Sep 2002 06:41:45 +0000
parents c9f067fd71a3
children e22b0213b713
line wrap: on
line diff
--- a/src/unicode.c	Sat Aug 31 11:04:01 2002 +0000
+++ b/src/unicode.c	Sun Sep 01 06:41:45 2002 +0000
@@ -159,6 +159,8 @@
 Lisp_Object Qutf_16_little_endian, Qutf_16_bom;
 Lisp_Object Qutf_16_little_endian_bom;
 
+Lisp_Object Qutf_8_bom;
+
 #ifdef MULE 
 
 /* #### Using ints for to_unicode is OK (as long as they are >= 32 bits).
@@ -1924,6 +1926,7 @@
 /* DEFINE_DETECTOR (utf_7); */
 DEFINE_DETECTOR (utf_8);
 DEFINE_DETECTOR_CATEGORY (utf_8, utf_8);
+DEFINE_DETECTOR_CATEGORY (utf_8, utf_8_bom);
 DEFINE_DETECTOR (ucs_4);
 DEFINE_DETECTOR_CATEGORY (ucs_4, ucs_4);
 DEFINE_DETECTOR (utf_16);
@@ -2081,6 +2084,9 @@
 
 struct utf_8_detector
 {
+  int byteno;      /* bytes fully examined so far; values 0, 1 and 2 select the leading-bytes check */
+  int first_byte;  /* byte seen when byteno was 0; compared against 0xef */
+  int second_byte; /* byte seen when byteno was 1; compared against 0xbb (a following 0xbf marks utf_8_bom) */
   int in_utf_8_byte;
 };
 
@@ -2093,11 +2099,32 @@
   while (n--)
     {
       UExtbyte c = *src++;
+      switch (data->byteno)
+	{
+	case 0:
+	  data->first_byte = c;
+	  break;
+	case 1:
+	  data->second_byte = c;
+	  break;
+	case 2:
+	  if (data->first_byte == 0xef &&
+	      data->second_byte == 0xbb &&
+	      c == 0xbf)
+	    {
+	      SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE);
+	      DET_RESULT (st, utf_8_bom) = DET_NEAR_CERTAINTY;
+	      return;
+	    }
+	  break;
+	}
+
       switch (data->in_utf_8_byte)
 	{
 	case 0:
 	  if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 	    {
+	      SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE);
 	      DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY;
 	      return;
 	    }
@@ -2113,6 +2140,7 @@
 	    data->in_utf_8_byte = 1;
 	  else if (c >= 0x80)
 	    {
+	      SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE);
 	      DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY;
 	      return;
 	    }
@@ -2120,14 +2148,17 @@
 	default:
 	  if ((c & 0xc0) != 0x80)
 	    {
+	      SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE);
 	      DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY;
 	      return;
 	    }
 	  else
 	    data->in_utf_8_byte--;
 	}
+
+      data->byteno++;
     }
-  DET_RESULT (st, utf_8) = DET_SOMEWHAT_LIKELY;
+  SET_DET_RESULTS (st, utf_8, DET_SOMEWHAT_LIKELY);
 }
 
 static void
@@ -2256,6 +2287,9 @@
   DEFSYMBOL (Qutf_16_little_endian);
   DEFSYMBOL (Qutf_16_bom);
   DEFSYMBOL (Qutf_16_little_endian_bom);
+
+  DEFSYMBOL (Qutf_8);
+  DEFSYMBOL (Qutf_8_bom);
 }
 
 void
@@ -2272,6 +2306,7 @@
   INITIALIZE_DETECTOR (utf_8);
   DETECTOR_HAS_METHOD (utf_8, detect);
   INITIALIZE_DETECTOR_CATEGORY (utf_8, utf_8);
+  INITIALIZE_DETECTOR_CATEGORY (utf_8, utf_8_bom);
 
   INITIALIZE_DETECTOR (ucs_4);
   DETECTOR_HAS_METHOD (ucs_4, detect);