# HG changeset patch
# User stephent
# Date 1147337881 0
# Node ID 96ec8f16af4535cf58b018ab77dd6a9f3aa1b95e
# Parent  824c3c18a12996b7c23c45e4465bfeb23a7a913d
[xemacs-hg @ 2006-05-11 08:57:59 by stephent]
Improve detection of ISO-8-1 coding systems. <874pzx2bn2.fsf@tleepslib.sk.tsukuba.ac.jp>

diff -r 824c3c18a129 -r 96ec8f16af45 src/ChangeLog
--- a/src/ChangeLog	Wed May 10 21:51:33 2006 +0000
+++ b/src/ChangeLog	Thu May 11 08:58:01 2006 +0000
@@ -1,3 +1,8 @@
+2006-02-27  Joachim Schrod  <jschrod@acm.org>
+
+	* mule-coding.c (iso2022_detect): Handle Latin-1 encoded files
+	that have several high-byte chars in a row.
+
 2006-05-08  Jerry James  <james@xemacs.org>
 
 	* number.c (Fcanonicalize_number): Use EMACS_INT instead of int,
diff -r 824c3c18a129 -r 96ec8f16af45 src/mule-coding.c
--- a/src/mule-coding.c	Wed May 10 21:51:33 2006 +0000
+++ b/src/mule-coding.c	Thu May 11 08:58:01 2006 +0000
@@ -2927,7 +2927,20 @@
     }
   else if (data->odd_high_byte_groups > 0 &&
 	   data->even_high_byte_groups > 0)
-    SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY);
+    {
+      /* This could still be Latin-1 text: most high-byte characters
+	 stand alone, but occasionally two occur together, though
+	 less often.  This is common for Western European languages
+	 such as German, French, Danish, Swedish, etc.
+	 In that case either the file is rather small, so
+	 even_high_byte_groups is low, or the file is larger and
+	 the ratio of odd to even groups is very high (the check
+	 below uses <= 3 even groups, or a ratio of at least 10:1).  */
+      SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY);
+      if (data->even_high_byte_groups <= 3 ||
+	  data->odd_high_byte_groups >= 10 * data->even_high_byte_groups)
+	DET_RESULT (st, iso_8_1) = DET_SOMEWHAT_LIKELY;
+    }
   else
     SET_DET_RESULTS (st, iso2022, DET_AS_LIKELY_AS_UNLIKELY);
 }