changeset 4096:1abf84db2c7f

[xemacs-hg @ 2007-08-04 20:00:10 by aidan] Preserve invalid UTF-8, UTF-16 sequences on encoding, decoding.
author aidan
date Sat, 04 Aug 2007 20:00:24 +0000
parents bff7e065cfdc
children 50932d98a7f9
files lisp/ChangeLog lisp/unicode.el src/ChangeLog src/charset.h src/lisp.h src/lread.c src/mule-coding.c src/unicode.c
diffstat 8 files changed, 618 insertions(+), 217 deletions(-) [+]
line wrap: on
line diff
--- a/lisp/ChangeLog	Fri Aug 03 21:51:12 2007 +0000
+++ b/lisp/ChangeLog	Sat Aug 04 20:00:24 2007 +0000
@@ -1,3 +1,13 @@
+2007-08-04  Aidan Kehoe  <kehoea@parhasard.net>
+
+	* unicode.el:
+	* unicode.el (utf-32):
+	* unicode.el (utf-32-little-endian):
+	Add UTF-32 coding systems. 
+	
+	* unicode.el (decode-char):
+	Only accept valid Unicode in this function. 
+
 2007-08-02  Mike Sperber  <mike@xemacs.org>
 
 	* startup.el (startup-setup-paths): Fix typo in init expression
--- a/lisp/unicode.el	Fri Aug 03 21:51:12 2007 +0000
+++ b/lisp/unicode.el	Sat Aug 04 20:00:24 2007 +0000
@@ -233,6 +233,26 @@
    little-endian t))
 
 (make-coding-system
+ 'utf-32 'unicode
+ "UTF-32"
+ '(mnemonic "UTF32"
+   documentation
+   "UTF-32 Unicode encoding -- fixed-width four-byte encoding,
+characters greater than #x10FFFF are not supported.  "
+   unicode-type utf-32))
+
+(make-coding-system
+ 'utf-32-little-endian 'unicode
+ "UTF-32 Little Endian"
+ '(mnemonic "UTF32-LE"
+   documentation
+   "Little-endian version of UTF-32 Unicode encoding.
+
+A fixed-width four-byte encoding, characters greater than #x10FFFF are not
+supported.  "
+   unicode-type utf-32 little-endian t))
+
+(make-coding-system
  'utf-8 'unicode
  "UTF-8"
  '(mnemonic "UTF8"
@@ -274,6 +294,10 @@
 (defun decode-char (quote-ucs code &optional restriction) 
   "FSF compatibility--return Mule character with Unicode codepoint CODE.
 The second argument must be 'ucs, the third argument is ignored.  "
+  ;; We're prepared to accept invalid Unicode in unicode-to-char, but not in
+  ;; this function, which is the API that should actually be used, since
+  ;; it's available in GNU and in Mule-UCS.
+  (check-argument-range code #x0 #x10FFFF)
   (assert (eq quote-ucs 'ucs) t
 	  "Sorry, decode-char doesn't yet support anything but the UCS.  ")
   (unicode-to-char code))
--- a/src/ChangeLog	Fri Aug 03 21:51:12 2007 +0000
+++ b/src/ChangeLog	Sat Aug 04 20:00:24 2007 +0000
@@ -1,3 +1,50 @@
+2007-08-04  Aidan Kehoe  <kehoea@parhasard.net>
+
+	* charset.h:
+	* charset.h (enum unicode_type):
+	Add UNICODE_UTF_32. 
+	* lisp.h:
+	Add Qutf_32.
+	* lread.c (read_unicode_escape):
+	Error on an invalid Unicode escape; error on no mapping, as GNU does. 
+	
+	* mule-coding.c:
+	* mule-coding.c (dynarr_add_2022_one_dimension):
+	* mule-coding.c (dynarr_add_2022_two_dimensions):
+	* mule-coding.c (struct iso2022_coding_stream):
+	* mule-coding.c (decode_unicode_char):
+	* mule-coding.c (indicate_invalid_utf_8):
+	* mule-coding.c (iso2022_decode):
+	* unicode.c:
+	* unicode.c (struct unicode_coding_stream):
+	* unicode.c (decode_unicode_char):
+	* unicode.c (DECODE_ERROR_OCTET):
+	* unicode.c (indicate_invalid_utf_8):
+	* unicode.c (encode_unicode_char_1):
+	* unicode.c (encode_unicode_char):
+	* unicode.c (unicode_convert):
+	* unicode.c (unicode_putprop):
+	* unicode.c (unicode_getprop):
+	* unicode.c (syms_of_unicode):
+	Make UTF-8 and UTF-16 handling more robust; indicate error
+	sequences when decoding, passing the octets as distinct from the
+	corresponding ISO8859-1 characters, and (by default) writing them
+	to disk on encoding. Don't accept over-long UTF-8 sequences, codes
+	>= #x110000, or UTF-16 surrogates on reading in the utf-8 coding
+	system; represent them as error sequences.
+
+	Do accept code points above #x10FFFF in the ISO IR 196 handling,
+	since we decode Unicode error sequences to "Unicode" code points
+	starting at 0x200000, and will need to save them as such in
+	escape-quoted. Do not accept over-long UTF-8 sequences or UTF-16
+	surrogates in escape-quoted. 
+
+	This change means that when a non-UTF-8 file is opened as UTF-8,
+	changed once, and immediately saved, its non-ASCII characters
+	are not corrupted. In Europe, this is a distinct win. 
+
+	Add UCS-4, UTF-32 as coding systems. 
+
 2007-07-26  Aidan Kehoe  <kehoea@parhasard.net>
 
 	* mule-ccl.c (ccl_driver):
--- a/src/charset.h	Fri Aug 03 21:51:12 2007 +0000
+++ b/src/charset.h	Sat Aug 04 20:00:24 2007 +0000
@@ -567,12 +567,20 @@
   UNICODE_UTF_16,
   UNICODE_UTF_8,
   UNICODE_UTF_7,
-  UNICODE_UCS_4
+  UNICODE_UCS_4,
+  UNICODE_UTF_32
 };
 
 void encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h,
 			  int USED_IF_MULE (l), unsigned_char_dynarr *dst,
-			  enum unicode_type type, unsigned int little_endian);
+			  enum unicode_type type, unsigned int little_endian,
+                          int write_error_characters_as_such);
+/* An undecodable octet N is held as "code point" (this value + N). */
+#define UNICODE_ERROR_OCTET_RANGE_START 0x200000
+/* Surrogate tests.  The last is given whole code points, so it checks all high bits. */
+#define valid_utf_16_first_surrogate(ch) (((ch) & 0xFC00) == 0xD800)
+#define valid_utf_16_last_surrogate(ch) (((ch) & 0xFC00) == 0xDC00)
+#define valid_utf_16_surrogate(ch) (((ch) & ~0x7FF) == 0xD800)
 
 void set_charset_registries(Lisp_Object charset, Lisp_Object registries);
 
--- a/src/lisp.h	Fri Aug 03 21:51:12 2007 +0000
+++ b/src/lisp.h	Sat Aug 04 20:00:24 2007 +0000
@@ -5488,7 +5488,7 @@
 void free_charset_unicode_tables (Lisp_Object charset);
 void recalculate_unicode_precedence (void);
 extern Lisp_Object Qunicode;
-extern Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7;
+extern Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7, Qutf_32;
 #ifdef MEMORY_USAGE_STATS
 Bytecount compute_from_unicode_table_size (Lisp_Object charset,
 					      struct overhead_stats *stats);
--- a/src/lread.c	Fri Aug 03 21:51:12 2007 +0000
+++ b/src/lread.c	Sat Aug 04 20:00:24 2007 +0000
@@ -1694,24 +1694,26 @@
 	}
     }
 
+  if (i > 0x10FFFF || i < 0) /* U+10FFFF is the last Unicode code point. */
+    {
+      syntax_error ("Not a Unicode code point", make_int(i));
+    }
+
   lisp_char = Funicode_to_char(make_int(i), Qnil);
 
   if (EQ(Qnil, lisp_char))
     {
-      /* This is ugly and horrible and trashes the user's data, but
-	 it's what unicode.c does. In the future, unicode-to-char
-	 should not return nil.  */
-#ifdef MULE
-      i = make_ichar (Vcharset_japanese_jisx0208, 34 + 128, 46 + 128);
-#else
-      i = '~';
-#endif
-      return i;
+      /* Will happen on non-Mule. Silent corruption is what happens
+         elsewhere, and we used to do that to be consistent, but GNU errors,
+         so people writing portable code need to be able to handle that, and
+         given a choice I prefer that behaviour.
+
+         An undesirable aspect to this error is that the code point is shown
+         as a decimal integer, which is mostly unreadable. */
+      syntax_error ("Unsupported Unicode code point", make_int(i));
     }
-  else
-    {
-      return XCHAR(lisp_char);
-    }
+
+  return XCHAR(lisp_char);
 }
 
 
--- a/src/mule-coding.c	Fri Aug 03 21:51:12 2007 +0000
+++ b/src/mule-coding.c	Sat Aug 04 20:00:24 2007 +0000
@@ -104,7 +104,7 @@
   if (XCHARSET_ENCODE_AS_UTF_8 (charset)) 
     {
       encode_unicode_char (charset, c & charmask, 0,	
-			   dst, UNICODE_UTF_8, 0);		
+			   dst, UNICODE_UTF_8, 0, 0); 
     } 
   else							
     {							
@@ -123,7 +123,7 @@
       encode_unicode_char (charset,				
 			   ch & charmask,			
 			   c & charmask, dst,		
-			   UNICODE_UTF_8, 0);		
+			   UNICODE_UTF_8, 0, 0); 
     }							
   else							
     {							
@@ -969,6 +969,7 @@
 
   /* Used for handling UTF-8. */
   unsigned char counter;  
+  unsigned char indicated_length;
 };
 
 static const struct memory_description ccs_description_1[] =
@@ -1804,6 +1805,39 @@
     }
 }
 
+/* Append the character for UCS to DST.  Name conflicts with a unicode.c function. */
+static void
+decode_unicode_char (int ucs, unsigned_char_dynarr *dst)
+{
+  Ibyte work[MAX_ICHAR_LEN];
+  int len;
+  Lisp_Object chr;
+
+  chr = Funicode_to_char(make_int(ucs), Qnil);
+  assert (!NILP(chr)); /* nil (no character for UCS) is treated as a bug here. */
+  len = set_itext_ichar (work, XCHAR(chr));
+  Dynarr_add_many (dst, work, len);
+}
+/* Add OCTET to DST as a character offset into the error-octet range. */
+#define DECODE_ERROR_OCTET(octet, dst) \
+  decode_unicode_char ((octet) + UNICODE_ERROR_OCTET_RANGE_START, dst)
+/* Emit the octets read so far of an invalid UTF-8 sequence as error octets. */
+static inline void
+indicate_invalid_utf_8 (unsigned char indicated_length,
+                        unsigned char counter,
+                        int ch, unsigned_char_dynarr *dst)
+{
+  Binbyte stored = indicated_length - counter; /* octets consumed so far */
+  Binbyte mask = "\x00\x00\xC0\xE0\xF0\xF8\xFC"[indicated_length]; /* lead tag */
+
+  while (stored > 0)
+    {
+      DECODE_ERROR_OCTET (((ch >> (6 * (stored - 1))) & 0x3f) | mask,
+                          dst);
+      mask = 0x80, stored--; /* octets after the first get the 0x80 tag */
+    }
+}
+
 /* Convert ISO2022-format data to internal format. */
 
 static Bytecount
@@ -1907,9 +1941,7 @@
       else if (flags & ISO_STATE_UTF_8)
 	{
 	  unsigned char counter = data->counter; 
-	  Ibyte work[MAX_ICHAR_LEN];
-	  int len;
-	  Lisp_Object chr;
+          unsigned char indicated_length = data->indicated_length;
 
 	  if (ISO_CODE_ESC == c)
 	    {
@@ -1920,73 +1952,126 @@
 	      continue;
 	    }
 
-	  switch (counter)
-	    {
-	    case 0:
-	      if (c >= 0xfc)
-		{
-		  ch = c & 0x01;
-		  counter = 5;
-		}
-	      else if (c >= 0xf8)
-		{
-		  ch = c & 0x03;
-		  counter = 4;
-		}
-	      else if (c >= 0xf0)
-		{
-		  ch = c & 0x07;
-		  counter = 3;
-		}
-	      else if (c >= 0xe0)
-		{
-		  ch = c & 0x0f;
-		  counter = 2;
-		}
-	      else if (c >= 0xc0)
-		{
-		  ch = c & 0x1f;
-		  counter = 1;
-		}
-	      else
-		/* ASCII, or the lower control characters.
-                   
-                   Perhaps we should signal an error if the character is in
-                   the range 0x80-0xc0; this is illegal UTF-8. */
-                Dynarr_add (dst, (c & 0x7f));
-
-	      break;
-	    case 1:
-	      ch = (ch << 6) | (c & 0x3f);
-	      chr = Funicode_to_char(make_int(ch), Qnil);			
-
-	      if (!NILP (chr))						
-		{								
-		  assert(CHARP(chr));					
-		  len = set_itext_ichar (work, XCHAR(chr));		
-		  Dynarr_add_many (dst, work, len);			
-		}								
-	      else							
-		{								
-		  /* Shouldn't happen, this code should only be enabled in
-		     XEmacsen with support for all of Unicode. */
-		  Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);	
-		  Dynarr_add (dst, 34 + 128);				
-		  Dynarr_add (dst, 46 + 128);				
-		}								
-
-	      ch = 0;
-	      counter = 0;
-	      break;
-	    default:
-	      ch = (ch << 6) | (c & 0x3f);
-	      counter--;
-	    }
-
-	  if (str->eof)
-	    DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
+          if (0 == counter)
+            {
+              if (0 == (c & 0x80))
+                {
+                  /* ASCII. */
+                  decode_unicode_char (c, dst);
+                }
+              else if (0 == (c & 0x40))
+                {
+                  /* Highest bit set, second highest not--there's
+                     something wrong. */
+                  DECODE_ERROR_OCTET (c, dst);
+                }
+              else if (0 == (c & 0x20))
+                {
+                  ch = c & 0x1f; 
+                  counter = 1;
+                  indicated_length = 2;
+                }
+              else if (0 == (c & 0x10))
+                {
+                  ch = c & 0x0f;
+                  counter = 2;
+                  indicated_length = 3;
+                }
+              else if (0 == (c & 0x08))
+                {
+                  ch = c & 0x0f;
+                  counter = 3;
+                  indicated_length = 4;
+                }
+              /* We support lengths longer than 4 here, since we want to
+                 represent UTF-8 error chars as distinct from the
+                 corresponding ISO 8859-1 characters in escape-quoted.
+
+                 However, we can't differentiate UTF-8 error chars as
+                 written to disk, and UTF-8 errors in escape-quoted.  This
+                 is not a big problem;
+                 non-Unicode-chars-encoded-as-UTF-8-in-ISO-2022 is not
+                 deployed, in practice, so if such a sequence of octets
+                 occurs, XEmacs generated it.  */
+              else if (0 == (c & 0x04))
+                {
+                  ch = c & 0x03;
+                  counter = 4;
+                  indicated_length = 5;
+                }
+              else if (0 == (c & 0x02))
+                {
+                  ch = c & 0x01;
+                  counter = 5;
+                  indicated_length = 6;
+                }
+              else
+                {
+                  /* #xFF is not a valid leading byte in any form of
+                     UTF-8. */
+                  DECODE_ERROR_OCTET (c, dst);
+
+                }
+            }
+          else
+            {
+              /* counter != 0 */
+              if ((0 == (c & 0x80)) || (0 != (c & 0x40)))
+                {
+                  indicate_invalid_utf_8(indicated_length, 
+                                         counter, 
+                                         ch, dst);
+                  if (c & 0x80)
+                    {
+                      DECODE_ERROR_OCTET (c, dst);
+                    }
+                  else
+                    {
+                      /* The character just read is ASCII. Treat it as
+                         such.  */
+                      decode_unicode_char (c, dst);
+                    }
+                  ch = 0;
+                  counter = 0;
+                }
+              else 
+                {
+                  ch = (ch << 6) | (c & 0x3f);
+                  counter--;
+
+                  /* Just processed the final byte. Emit the character. */
+                  if (!counter)
+                    {
+                      /* Don't accept over-long sequences, or surrogates. */
+                      if ((ch < 0x80) ||
+                          ((ch < 0x800) && indicated_length > 2) || 
+                          ((ch < 0x10000) && indicated_length > 3) || 
+                          /* We accept values above #x10FFFF in
+                             escape-quoted, though not in UTF-8. */
+                          /* (ch > 0x110000) || */
+                          valid_utf_16_surrogate(ch))
+                        {
+                          indicate_invalid_utf_8(indicated_length, 
+                                                 counter, 
+                                                 ch, dst);
+                        }
+                      else
+                        {
+                          decode_unicode_char (ch, dst);
+                        }
+                      ch = 0;
+                    }
+                }
+            }
+
+          if (str->eof && ch)
+            {
+              DECODE_ERROR_OCTET (ch, dst);
+              ch  = 0;
+            }
 
 	  data->counter = counter;
+	  data->indicated_length = indicated_length;
 	}
       else if (byte_c0_p (c) || byte_c1_p (c))
 	{ /* Control characters */
--- a/src/unicode.c	Fri Aug 03 21:51:12 2007 +0000
+++ b/src/unicode.c	Sat Aug 04 20:00:24 2007 +0000
@@ -146,13 +146,6 @@
    (1) User-defined charsets: It would be inconvenient to require all
    dumped user-defined charsets to be reloaded at init time.
 
-   (2) Starting up in a non-ISO-8859-1 directory.  If we load at run-time,
-   we don't load the tables until after we've parsed the current
-   directories, and we run into a real bootstrapping problem, if the
-   directories themselves are non-ISO-8859-1.  This is potentially fixable
-   once we switch to using Unicode internally, so we don't have to do any
-   conversion (other than the automatic kind, e.g. UTF-16 to UTF-8).
-
    NB With run-time loading, we load in init-mule-at-startup, in
    mule-cmds.el.  This is called from startup.el, which is quite late in
    the initialization process -- but data-directory isn't set until then.
@@ -192,7 +185,7 @@
    convert them back.) */
 
 Lisp_Object Qunicode;
-Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7;
+Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7, Qutf_32;
 Lisp_Object Qneed_bom;
 
 Lisp_Object Qutf_16_little_endian, Qutf_16_bom;
@@ -218,10 +211,6 @@
     trail = 0xDC00 + (__ctu16s_code & 0x3FF);			\
 } while (0)
 
-#define valid_utf_16_first_surrogate(ch) (((ch) & 0xFC00) == 0xD800)
-#define valid_utf_16_last_surrogate(ch) (((ch) & 0xFC00) == 0xDC00)
-#define valid_utf_16_surrogate(ch) (((ch) & 0xF800) == 0xD800)
-
 #ifdef MULE 
 
 /* Using ints for to_unicode is OK (as long as they are >= 32 bits).
@@ -1703,6 +1692,7 @@
 {
   /* decode */
   unsigned char counter;
+  unsigned char indicated_length;
   int seen_char;
   /* encode */
   Lisp_Object current_charset;
@@ -1716,11 +1706,6 @@
 
 DEFINE_CODING_SYSTEM_TYPE_WITH_DATA (unicode);
 
-/* Decode a UCS-2 or UCS-4 character into a buffer.  If the lookup fails, use
-   <GETA MARK> (U+3013) of JIS X 0208, which means correct character
-   is not found, instead.
-   #### do something more appropriate (use blob?)
-        Danger, Will Robinson!  Data loss.  Should we signal user? */
 static void
 decode_unicode_char (int ch, unsigned_char_dynarr *dst,
 		     struct unicode_coding_stream *data,
@@ -1755,9 +1740,32 @@
   data->seen_char = 1;
 }
 
+#define DECODE_ERROR_OCTET(octet, dst, data, ignore_bom) \
+  decode_unicode_char ((octet) + UNICODE_ERROR_OCTET_RANGE_START, \
+                       dst, data, ignore_bom) /* OCTET offset into the error range */
+/* Emit the octets read so far of an invalid UTF-8 sequence as error octets. */
+static inline void
+indicate_invalid_utf_8 (unsigned char indicated_length,
+                        unsigned char counter,
+                        int ch, unsigned_char_dynarr *dst,
+                        struct unicode_coding_stream *data,
+                        unsigned int ignore_bom)
+{
+  Binbyte stored = indicated_length - counter; /* octets consumed so far */
+  Binbyte mask = "\x00\x00\xC0\xE0\xF0\xF8\xFC"[indicated_length]; /* lead tag */
+
+  while (stored > 0)
+    {
+      DECODE_ERROR_OCTET (((ch >> (6 * (stored - 1))) & 0x3f) | mask,
+                        dst, data, ignore_bom);
+      mask = 0x80, stored--; /* octets after the first get the 0x80 tag */
+    }
+}
+
 static void
 encode_unicode_char_1 (int code, unsigned_char_dynarr *dst,
-		       enum unicode_type type, unsigned int little_endian)
+		       enum unicode_type type, unsigned int little_endian,
+                       int write_error_characters_as_such)
 {
   switch (type)
     {
@@ -1767,53 +1775,105 @@
 	  if (code < 0x10000) {
 	    Dynarr_add (dst, (unsigned char) (code & 255));
 	    Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
-	  } else {
-	    /* Little endian; least significant byte first. */
-	    int first, second;
-
-	    CODE_TO_UTF_16_SURROGATES(code, first, second);
-
-	    Dynarr_add (dst, (unsigned char) (first & 255));
-	    Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
-
-	    Dynarr_add (dst, (unsigned char) (second & 255));
-	    Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
-	  }
+	  } else if (write_error_characters_as_such && 
+                     code >= UNICODE_ERROR_OCTET_RANGE_START &&
+                     code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
+            {
+              Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+            }
+          else if (code < 0x110000)
+            {
+              /* Little endian; least significant byte first. */
+              int first, second;
+
+              CODE_TO_UTF_16_SURROGATES(code, first, second);
+
+              Dynarr_add (dst, (unsigned char) (first & 255));
+              Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
+
+              Dynarr_add (dst, (unsigned char) (second & 255));
+              Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
+            }
+          else
+            {
+              /* Not valid Unicode. Pass U+FFFD, least significant byte
+                 first. */
+              Dynarr_add (dst, (unsigned char) 0xFD);
+              Dynarr_add (dst, (unsigned char) 0xFF);
+            }
 	}
       else
 	{
 	  if (code < 0x10000) {
 	    Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
 	    Dynarr_add (dst, (unsigned char) (code & 255));
-	  } else {
-	    /* Big endian; most significant byte first. */
-	    int first, second;
-
-	    CODE_TO_UTF_16_SURROGATES(code, first, second);
-
-	    Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
-	    Dynarr_add (dst, (unsigned char) (first & 255));
-
-	    Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
-	    Dynarr_add (dst, (unsigned char) (second & 255));
-	  }
+	  } else if (write_error_characters_as_such && 
+                     code >= UNICODE_ERROR_OCTET_RANGE_START &&
+                     code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
+            {
+              Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+            }
+          else if (code < 0x110000)
+            {
+              /* Big endian; most significant byte first. */
+              int first, second;
+
+              CODE_TO_UTF_16_SURROGATES(code, first, second);
+
+              Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
+              Dynarr_add (dst, (unsigned char) (first & 255));
+
+              Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
+              Dynarr_add (dst, (unsigned char) (second & 255));
+            }
+          else
+            {
+              /* Not valid Unicode. Pass U+FFFD, most significant byte
+                 first. */
+              Dynarr_add (dst, (unsigned char) 0xFF);
+              Dynarr_add (dst, (unsigned char) 0xFD);
+            }
 	}
       break;
 
     case UNICODE_UCS_4:
+    case UNICODE_UTF_32:
       if (little_endian)
 	{
-	  Dynarr_add (dst, (unsigned char) (code & 255));
-	  Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
-	  Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
-	  Dynarr_add (dst, (unsigned char) (code >> 24));
+          if (write_error_characters_as_such && 
+              code >= UNICODE_ERROR_OCTET_RANGE_START &&
+              code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
+            {
+              Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+            }
+          else
+            {
+              /* We generate and accept incorrect sequences here, which is
+                 okay, in the interest of preservation of the user's
+                 data.  */
+              Dynarr_add (dst, (unsigned char) (code & 255));
+              Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
+              Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
+              Dynarr_add (dst, (unsigned char) (code >> 24));
+            }
 	}
       else
 	{
-	  Dynarr_add (dst, (unsigned char) (code >> 24));
-	  Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
-	  Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
-	  Dynarr_add (dst, (unsigned char) (code & 255));
+          if (write_error_characters_as_such && 
+              code >= UNICODE_ERROR_OCTET_RANGE_START &&
+              code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
+            {
+              Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+            }
+          else
+            {
+              /* We generate and accept incorrect sequences here, which is okay,
+                 in the interest of preservation of the user's data.  */
+              Dynarr_add (dst, (unsigned char) (code >> 24));
+              Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
+              Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
+              Dynarr_add (dst, (unsigned char) (code & 255));
+            }
 	}
       break;
 
@@ -1842,11 +1902,25 @@
 	}
       else if (code <= 0x3ffffff)
 	{
-	  Dynarr_add (dst, (unsigned char) ((code >> 24) | 0xf8));
-	  Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80));
-	  Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80));
-	  Dynarr_add (dst, (unsigned char) (((code >>  6) & 0x3f) | 0x80));
-	  Dynarr_add (dst, (unsigned char) ((code        & 0x3f) | 0x80));
+
+#if !(UNICODE_ERROR_OCTET_RANGE_START > 0x1fffff \
+          && UNICODE_ERROR_OCTET_RANGE_START < 0x3ffffff)
+#error "This code needs to be rewritten. " 
+#endif
+          if (write_error_characters_as_such && 
+              code >= UNICODE_ERROR_OCTET_RANGE_START &&
+              code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
+            {
+              Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+            }
+          else 
+            {
+              Dynarr_add (dst, (unsigned char) ((code >> 24) | 0xf8));
+              Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80));
+              Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80));
+              Dynarr_add (dst, (unsigned char) (((code >>  6) & 0x3f) | 0x80));
+              Dynarr_add (dst, (unsigned char) ((code        & 0x3f) | 0x80));
+            }
 	}
       else
 	{
@@ -1870,7 +1944,8 @@
 void
 encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h,
 		     int USED_IF_MULE (l), unsigned_char_dynarr *dst,
-		     enum unicode_type type, unsigned int little_endian)
+		     enum unicode_type type, unsigned int little_endian,
+                     int write_error_characters_as_such)
 {
 #ifdef MULE
   int code = ichar_to_unicode (make_ichar (charset, h & 127, l & 127));
@@ -1896,7 +1971,8 @@
   int code = h;
 #endif /* MULE */
 
-  encode_unicode_char_1 (code, dst, type, little_endian);
+  encode_unicode_char_1 (code, dst, type, little_endian, 
+                         write_error_characters_as_such);
 }
 
 static Bytecount
@@ -1915,6 +1991,8 @@
   if (str->direction == CODING_DECODE)
     {
       unsigned char counter = data->counter;
+      unsigned char indicated_length
+        = data->indicated_length;
 
       while (n--)
 	{
@@ -1923,46 +2001,92 @@
 	  switch (type)
 	    {
 	    case UNICODE_UTF_8:
-	      switch (counter)
-		{
-		case 0:
-		  if (c >= 0xfc)
-		    {
-		      ch = c & 0x01;
-		      counter = 5;
-		    }
-		  else if (c >= 0xf8)
-		    {
-		      ch = c & 0x03;
-		      counter = 4;
-		    }
-		  else if (c >= 0xf0)
-		    {
-		      ch = c & 0x07;
-		      counter = 3;
-		    }
-		  else if (c >= 0xe0)
-		    {
-		      ch = c & 0x0f;
-		      counter = 2;
-		    }
-		  else if (c >= 0xc0)
-		    {
-		      ch = c & 0x1f;
-		      counter = 1;
-		    }
-		  else
-		    decode_unicode_char (c, dst, data, ignore_bom);
-		  break;
-		case 1:
-		  ch = (ch << 6) | (c & 0x3f);
-		  decode_unicode_char (ch, dst, data, ignore_bom);
-		  ch = 0;
-		  counter = 0;
-		  break;
-		default:
-		  ch = (ch << 6) | (c & 0x3f);
-		  counter--;
+              if (0 == counter)
+                {
+                  if (0 == (c & 0x80))
+                    {
+                      /* ASCII. */
+                      decode_unicode_char (c, dst, data, ignore_bom);
+                    }
+                  else if (0 == (c & 0x40))
+                    {
+                      /* Highest bit set, second highest not--there's
+                         something wrong. */
+                      DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
+                    }
+                  else if (0 == (c & 0x20))
+                    {
+                      ch = c & 0x1f; 
+                      counter = 1;
+                      indicated_length = 2;
+                    }
+                  else if (0 == (c & 0x10))
+                    {
+                      ch = c & 0x0f;
+                      counter = 2;
+                      indicated_length = 3;
+                    }
+                  else if (0 == (c & 0x08))
+                    {
+                      ch = c & 0x0f;
+                      counter = 3;
+                      indicated_length = 4;
+                    }
+                  else
+                    {
+                      /* We don't support lengths longer than 4 in
+                         external-format data. */
+                      DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
+
+                    }
+                }
+              else
+                {
+                  /* counter != 0 */
+                  if ((0 == (c & 0x80)) || (0 != (c & 0x40)))
+                    {
+                      indicate_invalid_utf_8(indicated_length, 
+                                             counter, 
+                                             ch, dst, data, ignore_bom);
+                      if (c & 0x80)
+                        {
+                          DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
+                        }
+                      else
+                        {
+                          /* The character just read is ASCII. Treat it as
+                             such.  */
+                          decode_unicode_char (c, dst, data, ignore_bom);
+                        }
+                      ch = 0;
+                      counter = 0;
+                    }
+                  else 
+                    {
+                      ch = (ch << 6) | (c & 0x3f);
+                      counter--;
+                      /* Just processed the final byte. Emit the character. */
+                      if (!counter)
+                        {
+			  /* Don't accept over-long sequences, surrogates,
+                             or codes above #x10FFFF. */
+                          if ((ch < 0x80) ||
+                              ((ch < 0x800) && indicated_length > 2) ||
+                              ((ch < 0x10000) && indicated_length > 3) ||
+                              valid_utf_16_surrogate(ch) || (ch > 0x10FFFF))
+                            {
+                              indicate_invalid_utf_8(indicated_length,
+                                                     counter,
+                                                     ch, dst, data,
+                                                     ignore_bom);
+                            }
+                          else
+                            {
+                              decode_unicode_char (ch, dst, data, ignore_bom);
+                            }
+                          ch = 0;
+                        }
+                    }
 		}
 	      break;
 
@@ -1972,39 +2096,51 @@
 		ch = (c << counter) | ch;
 	      else
 		ch = (ch << 8) | c;
+
 	      counter += 8;
 
-	      if (counter == 16 && valid_utf_16_first_surrogate(ch))
-		break;
-
-	      if (counter == 16)
-		{
+	      if (16 == counter)
+                {
 		  int tempch = ch;
+
+                  if (valid_utf_16_first_surrogate(ch))
+                    {
+                      break;
+                    }
 		  ch = 0;
 		  counter = 0;
 		  decode_unicode_char (tempch, dst, data, ignore_bom);
 		}
-	      if (counter == 32)
+	      else if (32 == counter)
 		{
 		  int tempch;
-		  /* #### Signalling an error may be a bit extreme. Should
-		     we try and read it in anyway? */
-		  if (!valid_utf_16_first_surrogate(ch >> 16) 
-		      || !valid_utf_16_last_surrogate(ch & 0xFFFF))
+
+		  if (!valid_utf_16_last_surrogate(ch & 0xFFFF))
 		    {
-		      signal_error(Qtext_conversion_error, 
-				   "Invalid UTF-16 surrogate sequence", 
-				   Qunbound);
+                      DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
+                                        ignore_bom);
+                      DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+                                        ignore_bom);
+                      DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+                                        ignore_bom);
+                      DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
+                                        ignore_bom);
 		    }
-		  tempch = utf_16_surrogates_to_code((ch >> 16), 
-						     (ch & 0xffff));
+                  else 
+                    {
+                      tempch = utf_16_surrogates_to_code((ch >> 16), 
+                                                         (ch & 0xffff));
+                      decode_unicode_char(tempch, dst, data, ignore_bom);
+                    }
 		  ch = 0;
 		  counter = 0;
-		  decode_unicode_char(tempch, dst, data, ignore_bom);
-		}
+                }
+              else
+                assert(8 == counter || 24 == counter);
 	      break;
 
 	    case UNICODE_UCS_4:
+            case UNICODE_UTF_32:
 	      if (little_endian)
 		ch = (c << counter) | ch;
 	      else
@@ -2012,15 +2148,43 @@
 	      counter += 8;
 	      if (counter == 32)
 		{
-		  int tempch = ch;
+		  if (ch > 0x10ffff)
+		    {
+                      /* ch is not a legal Unicode character. We're fine
+                         with that in UCS-4, though not in UTF-32. */
+                      if (UNICODE_UCS_4 == type && ch < 0x80000000)
+                        {
+                          decode_unicode_char (ch, dst, data, ignore_bom);
+                        }
+                      else if (little_endian)
+                        {
+                          DECODE_ERROR_OCTET (ch & 0xFF, dst, data, 
+                                            ignore_bom);
+                          DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+                                            ignore_bom);
+                          DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+                                            ignore_bom);
+                          DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
+                                            ignore_bom);
+                        }
+                      else
+                        {
+                          DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
+                                            ignore_bom);
+                          DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+                                            ignore_bom);
+                          DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+                                            ignore_bom);
+                          DECODE_ERROR_OCTET (ch & 0xFF, dst, data, 
+                                            ignore_bom);
+                        }
+		    }
+                  else
+                    {
+                      decode_unicode_char (ch, dst, data, ignore_bom);
+                    }
 		  ch = 0;
 		  counter = 0;
-		  if (tempch < 0)
-		    {
-		      /* !!#### indicate an error */
-		      tempch = '~';
-		    }
-		  decode_unicode_char (tempch, dst, data, ignore_bom);
 		}
 	      break;
 
@@ -2032,10 +2196,67 @@
 	    }
 
 	}
-      if (str->eof)
-	DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
+
+      if (str->eof && ch)
+        {
+          switch (type)
+            {
+	    case UNICODE_UTF_8:
+              indicate_invalid_utf_8(indicated_length, 
+                                     counter, ch, dst, data, 
+                                     ignore_bom);
+              break;
+
+            case UNICODE_UTF_16:
+            case UNICODE_UCS_4:
+            case UNICODE_UTF_32:
+              if (8 == counter)
+                {
+                  DECODE_ERROR_OCTET (ch, dst, data, ignore_bom);
+                }
+              else if (16 == counter)
+                {
+                  if (little_endian)
+                    {
+                      DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom); 
+                      DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+                                          ignore_bom); 
+                    }
+                  else
+                    {
+                      DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+                                          ignore_bom); 
+                      DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom); 
+                    }
+                }
+              else if (24 == counter)
+                {
+                  if (little_endian)
+                    {
+                      DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+                                          ignore_bom);
+                      DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom); 
+                      DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+                                          ignore_bom); 
+                    }
+                  else
+                    {
+                      DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+                                          ignore_bom);
+                      DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+                                          ignore_bom); 
+                      DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
+                                          ignore_bom); 
+                    }
+                }
+              else assert(0);
+              break;
+            }
+          ch = 0;
+        }
 
       data->counter = counter;
+      data->indicated_length = indicated_length;
     }
   else
     {
@@ -2054,7 +2275,7 @@
 
       if (XCODING_SYSTEM_UNICODE_NEED_BOM (str->codesys) && !data->wrote_bom)
 	{
-	  encode_unicode_char_1 (0xFEFF, dst, type, little_endian);
+	  encode_unicode_char_1 (0xFEFF, dst, type, little_endian, 1);
 	  data->wrote_bom = 1;
 	}
 
@@ -2068,7 +2289,7 @@
 	    {			/* Processing ASCII character */
 	      ch = 0;
 	      encode_unicode_char (Vcharset_ascii, c, 0, dst, type,
-				   little_endian);
+				   little_endian, 1);
 
 	      char_boundary = 1;
 	    }
@@ -2092,20 +2313,20 @@
 		   for the rationale behind subtracting #xa0 from the
 		   character's code. */
 		encode_unicode_char (Vcharset_control_1, c - 0xa0, 0, dst,
-				     type, little_endian);
+				     type, little_endian, 1);
 	      else
 		{
 		  switch (XCHARSET_REP_BYTES (charset))
 		    {
 		    case 2:
 		      encode_unicode_char (charset, c, 0, dst, type,
-					   little_endian);
+					   little_endian, 1);
 		      break;
 		    case 3:
 		      if (XCHARSET_PRIVATE_P (charset))
 			{
 			  encode_unicode_char (charset, c, 0, dst, type,
-					       little_endian);
+					       little_endian, 1);
 			  ch = 0;
 			}
 		      else if (ch)
@@ -2119,7 +2340,7 @@
 				     handle this yet. */
 				  encode_unicode_char (Vcharset_ascii, '~', 0,
 						       dst, type,
-						       little_endian);
+						       little_endian, 1);
 				}
 			      else
 				{
@@ -2138,7 +2359,7 @@
 			  else
 #endif /* ENABLE_COMPOSITE_CHARS */
 			    encode_unicode_char (charset, ch, c, dst, type,
-						 little_endian);
+						 little_endian, 1);
 			  ch = 0;
 			}
 		      else
@@ -2151,7 +2372,7 @@
 		      if (ch)
 			{
 			  encode_unicode_char (charset, ch, c, dst, type,
-					       little_endian);
+					       little_endian, 1);
 			  ch = 0;
 			}
 		      else
@@ -2521,6 +2742,8 @@
 	type = UNICODE_UTF_7;
       else if (EQ (value, Qucs_4))
 	type = UNICODE_UCS_4;
+      else if (EQ (value, Qutf_32))
+	type = UNICODE_UTF_32;
       else
 	invalid_constant ("Invalid Unicode type", key);
       
@@ -2546,6 +2769,7 @@
 	case UNICODE_UTF_8: return Qutf_8;
 	case UNICODE_UTF_7: return Qutf_7;
 	case UNICODE_UCS_4: return Qucs_4;
+	case UNICODE_UTF_32: return Qutf_32;
 	default: ABORT ();
 	}
     }
@@ -2620,6 +2844,7 @@
   DEFSYMBOL (Qunicode);
   DEFSYMBOL (Qucs_4);
   DEFSYMBOL (Qutf_16);
+  DEFSYMBOL (Qutf_32);
   DEFSYMBOL (Qutf_8);
   DEFSYMBOL (Qutf_7);