changeset 1267:c57f32e44416

[xemacs-hg @ 2003-02-07 01:43:05 by ben] fix unicode/utf misdetection -- unicode.c: UTF-8 detector was totally bogus and detected almost anything as UTF-8. UTF-16 detector got overly happy with 0x2028's (line separator, also space+paren).
author ben
date Fri, 07 Feb 2003 01:43:07 +0000
parents b5a5863da615
children fffe735e63ee
files src/ChangeLog src/unicode.c
diffstat 2 files changed, 99 insertions(+), 26 deletions(-) [+]
line wrap: on
line diff
--- a/src/ChangeLog	Fri Feb 07 00:54:20 2003 +0000
+++ b/src/ChangeLog	Fri Feb 07 01:43:07 2003 +0000
@@ -1,3 +1,14 @@
+2003-02-06  Ben Wing  <ben@xemacs.org>
+
+	* unicode.c:
+	* unicode.c (struct utf_16_detector):
+	* unicode.c (utf_16_detect):
+	* unicode.c (struct utf_8_detector):
+	* unicode.c (utf_8_detect):
+	UTF-8 detector was badly broken and classified almost any input as
+	UTF-8.  UTF-16 detector over-counted 0x2028 (LINE SEPARATOR), which
+	is also the very common ASCII byte pair space + left-paren.
+
 2003-02-06  Ben Wing  <ben@xemacs.org>
 
 	* lread.c (Fload_internal):
--- a/src/unicode.c	Fri Feb 07 00:54:20 2003 +0000
+++ b/src/unicode.c	Fri Feb 07 01:43:07 2003 +0000
@@ -1,5 +1,5 @@
 /* Code to handle Unicode conversion.
-   Copyright (C) 2000, 2001, 2002 Ben Wing.
+   Copyright (C) 2000, 2001, 2002, 2003 Ben Wing.
 
 This file is part of XEmacs.
 
@@ -1965,6 +1965,8 @@
   int byteno;
   int prev_char;
   int text, rev_text;
+  int sep, rev_sep;
+  int num_ascii;
 };
 
 static void
@@ -1994,12 +1996,19 @@
 	      && (prevc == '\r' || prevc == '\n'
 		  || (prevc >= 0x20 && prevc <= 0x7E)))
 	    data->rev_text++;
+	  /* #### 0x2028 is LINE SEPARATOR and 0x2029 is PARAGRAPH SEPARATOR.
+	     These used to be counted in text and rev_text, but that is very
+	     bad: as ASCII byte pairs they are space + left/right paren, which
+	     is extremely common.  So, what do we do with these? */
 	  if (prevc == 0x20 && (c == 0x28 || c == 0x29))
-	    data->text++;
+	    data->sep++;
 	  if (c == 0x20 && (prevc == 0x28 || prevc == 0x29))
-	    data->rev_text++;
+	    data->rev_sep++;
 	}
 
+      if ((c >= ' ' && c <= '~') || c == '\n' || c == '\r' || c == '\t' ||
+	  c == '\f' || c == '\v')
+	data->num_ascii++;
       data->byteno++;
       data->prev_char = c;
     }
@@ -2063,7 +2072,19 @@
 	DET_RESULT (st, utf_16_little_endian) = DET_SOMEWHAT_LIKELY;
       }
     else
-      SET_DET_RESULTS (st, utf_16, DET_AS_LIKELY_AS_UNLIKELY);
+      {
+	/* #### There should really be a separate ASCII detector, which
+	   would remove the need to build this ASCII-percentage check in
+	   here as well. --ben */
+	int pct_ascii = ((100 * data->num_ascii) / data->byteno);
+
+	if (pct_ascii > 90)
+	  SET_DET_RESULTS (st, utf_16, DET_QUITE_IMPROBABLE);
+	else if (pct_ascii > 75)
+	  SET_DET_RESULTS (st, utf_16, DET_SOMEWHAT_UNLIKELY);
+	else
+	  SET_DET_RESULTS (st, utf_16, DET_AS_LIKELY_AS_UNLIKELY);
+      }
   }
 }
 
@@ -2072,7 +2093,16 @@
   int byteno;
   int first_byte;
   int second_byte;
+  int prev_byte;
   int in_utf_8_byte;
+  int recent_utf_8_sequence;
+  int seen_bogus_utf8;
+  int seen_really_bogus_utf8;
+  int seen_2byte_sequence;
+  int seen_longer_sequence;
+  int seen_iso2022_esc;
+  int seen_iso_shift;
+  int seen_utf_bom:1;
 };
 
 static void
@@ -2096,23 +2126,17 @@
 	  if (data->first_byte == 0xef &&
 	      data->second_byte == 0xbb &&
 	      c == 0xbf)
-	    {
-	      SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE);
-	      DET_RESULT (st, utf_8_bom) = DET_NEAR_CERTAINTY;
-	      return;
-	    }
+	    data->seen_utf_bom = 1;
 	  break;
 	}
 
       switch (data->in_utf_8_byte)
 	{
 	case 0:
-	  if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
-	    {
-	      SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE);
-	      DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY;
-	      return;
-	    }
+	  if (data->prev_byte == ISO_CODE_ESC && c >= 0x28 && c <= 0x2F)
+	    data->seen_iso2022_esc++;
+	  else if (c == ISO_CODE_SI || c == ISO_CODE_SO)
+	    data->seen_iso_shift++;
 	  else if (c >= 0xfc)
 	    data->in_utf_8_byte = 5;
 	  else if (c >= 0xf8)
@@ -2124,26 +2148,64 @@
 	  else if (c >= 0xc0)
 	    data->in_utf_8_byte = 1;
 	  else if (c >= 0x80)
-	    {
-	      SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE);
-	      DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY;
-	      return;
-	    }
+	    data->seen_bogus_utf8++;
+	  if (data->in_utf_8_byte > 0)
+	    data->recent_utf_8_sequence = data->in_utf_8_byte;
 	  break;
 	default:
 	  if ((c & 0xc0) != 0x80)
+	    data->seen_really_bogus_utf8++;
+	  else
 	    {
-	      SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE);
-	      DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY;
-	      return;
+	      data->in_utf_8_byte--;
+	      if (data->in_utf_8_byte == 0)
+		{
+		  if (data->recent_utf_8_sequence == 1)
+		    data->seen_2byte_sequence++;
+		  else
+		    {
+		      assert (data->recent_utf_8_sequence >= 2);
+		      data->seen_longer_sequence++;
+		    }
+		}
 	    }
-	  else
-	    data->in_utf_8_byte--;
 	}
 
       data->byteno++;
+      data->prev_byte = c;
     }
-  SET_DET_RESULTS (st, utf_8, DET_SOMEWHAT_LIKELY);
+
+  /* BOM and no-BOM are mutually exclusive; at most one is raised below */
+  SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE);
+
+
+  if (data->seen_utf_bom)
+    DET_RESULT (st, utf_8_bom) = DET_NEAR_CERTAINTY;
+  else
+    {
+      if (data->seen_really_bogus_utf8 ||
+	  data->seen_bogus_utf8 >= 2)
+	; /* bogus */
+      else if (data->seen_bogus_utf8)
+	DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY;
+      else if ((data->seen_longer_sequence >= 5 ||
+		data->seen_2byte_sequence >= 10) &&
+	       (!(data->seen_iso2022_esc + data->seen_iso_shift) ||
+		(data->seen_longer_sequence * 2 + data->seen_2byte_sequence) /
+		(data->seen_iso2022_esc + data->seen_iso_shift) >= 10))
+	/* heuristics, heuristics, we love heuristics */
+	DET_RESULT (st, utf_8) = DET_QUITE_PROBABLE;
+      else if (data->seen_iso2022_esc ||
+	       data->seen_iso_shift >= 3)
+	DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY;
+      else if (data->seen_longer_sequence ||
+	       data->seen_2byte_sequence)
+	DET_RESULT (st, utf_8) = DET_SOMEWHAT_LIKELY;
+      else if (data->seen_iso_shift)
+	DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY;
+      else
+	DET_RESULT (st, utf_8) = DET_AS_LIKELY_AS_UNLIKELY;
+    }
 }
 
 static void