xemacs-beta: src/unicode.c comparison

comparison src/unicode.c @ 4096:1abf84db2c7f

[xemacs-hg @ 2007-08-04 20:00:10 by aidan] Preserve invalid UTF-8, UTF-16 sequences on encoding, decoding.

author	aidan
date	Sat, 04 Aug 2007 20:00:24 +0000
parents	3584cb2c07db
children	75d0292c1bff

comparison

equal deleted inserted replaced

-:bff7e065cfdc
+:1abf84db2c7f
 Disadvantages:
 (1) User-defined charsets: It would be inconvenient to require all
 dumped user-defined charsets to be reloaded at init time.
-(2) Starting up in a non-ISO-8859-1 directory.  If we load at run-time,
-we don't load the tables until after we've parsed the current
-directories, and we run into a real bootstrapping problem, if the
-directories themselves are non-ISO-8859-1.  This is potentially fixable
-once we switch to using Unicode internally, so we don't have to do any
-conversion (other than the automatic kind, e.g. UTF-16 to UTF-8).
 NB With run-time loading, we load in init-mule-at-startup, in
 mule-cmds.el.  This is called from startup.el, which is quite late in
 the initialization process -- but data-directory isn't set until then.
 With dump-time loading, you still can't dump in a Japanese directory
 (again, until we move to Unicode internally), but this is not such an
 call the Unicode ones anyway, so in the case of structures, we'd be
 converting from Unicode to ANSI structures, only to have the OS
 convert them back.) */
 Lisp_Object Qunicode;
-Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7;
+Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7, Qutf_32;
 Lisp_Object Qneed_bom;
 Lisp_Object Qutf_16_little_endian, Qutf_16_bom;
 Lisp_Object Qutf_16_little_endian_bom;
 /* Split CODEPOINT into two UTF-16 surrogate code units.  CODEPOINT is
    evaluated exactly once, into a local int.  LEAD is assigned
    UTF_16_LEAD_OFFSET plus everything above the low ten bits; TRAIL is
    assigned 0xDC00 plus the low ten bits.  LEAD and TRAIL are assigned
    to directly, so they must be lvalues.  UTF_16_LEAD_OFFSET is defined
    outside this view. */
 #define CODE_TO_UTF_16_SURROGATES(codepoint, lead, trail) do {	\
 int __ctu16s_code = (codepoint);				\
 lead = UTF_16_LEAD_OFFSET + (__ctu16s_code >> 10);		\
 trail = 0xDC00 + (__ctu16s_code & 0x3FF);			\
 } while (0)
-#define valid_utf_16_first_surrogate(ch) (((ch) & 0xFC00) == 0xD800)
-#define valid_utf_16_last_surrogate(ch) (((ch) & 0xFC00) == 0xDC00)
-#define valid_utf_16_surrogate(ch) (((ch) & 0xF800) == 0xD800)
 #ifdef MULE
 /* Using ints for to_unicode is OK (as long as they are >= 32 bits).
 In from_unicode, we're converting from Mule characters, which means
 struct unicode_coding_stream
 {
 /* decode */
 unsigned char counter;
+unsigned char indicated_length;
 int seen_char;
 /* encode */
 Lisp_Object current_charset;
 int current_char_boundary;
 int wrote_bom;
 { XD_END }
 };
 DEFINE_CODING_SYSTEM_TYPE_WITH_DATA (unicode);
-/* Decode a UCS-2 or UCS-4 character into a buffer.  If the lookup fails, use
-<GETA MARK> (U+3013) of JIS X 0208, which means correct character
-is not found, instead.
-#### do something more appropriate (use blob?)
-Danger, Will Robinson!  Data loss.  Should we signal user? */
 static void
 decode_unicode_char (int ch, unsigned_char_dynarr *dst,
 		     struct unicode_coding_stream *data,
 		     unsigned int ignore_bom)
 {
 }
 data->seen_char = 1;
 }
+#define DECODE_ERROR_OCTET(octet, dst, data, ignore_bom) \
+decode_unicode_char ((octet) + UNICODE_ERROR_OCTET_RANGE_START, \
+dst, data, ignore_bom) /* Hands OCTET, offset by
+   UNICODE_ERROR_OCTET_RANGE_START, to decode_unicode_char along with DST,
+   DATA and IGNORE_BOM.  OCTET is parenthesized in the expansion; the
+   other three arguments are pasted in as written. */
+static inline void /* Re-emit, as error octets, the octets of a UTF-8
+   sequence that the decoder has accumulated into CH but will not accept.
+   INDICATED_LENGTH is the sequence length announced by the lead octet and
+   COUNTER the number of continuation octets still outstanding, so their
+   difference is the number of octets emitted.  Each octet goes through
+   DECODE_ERROR_OCTET, with DST, DATA and IGNORE_BOM passed along
+   unchanged.
+   NOTE(review): INDICATED_LENGTH indexes the mask table without a bounds
+   check; the decoder code visible alongside this function only sets it
+   to 2, 3 or 4 -- confirm no other caller passes more than 6. */
+indicate_invalid_utf_8 (unsigned char indicated_length,
+unsigned char counter,
+int ch, unsigned_char_dynarr *dst,
+struct unicode_coding_stream *data,
+unsigned int ignore_bom)
+{
+Binbyte stored = indicated_length - counter; /* octets consumed so far */
+Binbyte mask = "\x00\x00\xC0\xE0\xF0\xF8\xFC"[indicated_length]; /* marker bits for the first octet, selected by sequence length */
+while (stored > 0)
+{
+DECODE_ERROR_OCTET (((ch >> (6 * (stored - 1))) & 0x3f) | mask, /* next group of six bits from CH, most significant group first */
+dst, data, ignore_bom);
+mask = 0x80, stored--; /* every octet after the first gets the 0x80 marker instead */
+}
+}
 static void
 encode_unicode_char_1 (int code, unsigned_char_dynarr *dst,
-		       enum unicode_type type, unsigned int little_endian)
+		       enum unicode_type type, unsigned int little_endian,
+int write_error_characters_as_such)
 {
 switch (type)
 {
 case UNICODE_UTF_16:
 if (little_endian)
 	{
 	  if (code < 0x10000) {
 	    Dynarr_add (dst, (unsigned char) (code & 255));
 	    Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
-	  } else {
+	  } else if (write_error_characters_as_such &&
-	    /* Little endian; least significant byte first. */
+code >= UNICODE_ERROR_OCTET_RANGE_START &&
-	    int first, second;
+code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
+{
-	    CODE_TO_UTF_16_SURROGATES(code, first, second);
+Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+}
-	    Dynarr_add (dst, (unsigned char) (first & 255));
+else if (code < 0x110000)
-	    Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
+{
+/* Little endian; least significant byte first. */
-	    Dynarr_add (dst, (unsigned char) (second & 255));
+int first, second;
-	    Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
-	  }
+CODE_TO_UTF_16_SURROGATES(code, first, second);
+Dynarr_add (dst, (unsigned char) (first & 255));
+Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
+Dynarr_add (dst, (unsigned char) (second & 255));
+Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
+}
+else
+{
+/* Not valid Unicode. Pass U+FFFD, least significant byte
+first. */
+Dynarr_add (dst, (unsigned char) 0xFD);
+Dynarr_add (dst, (unsigned char) 0xFF);
+}
 	}
 else
 	{
 	  if (code < 0x10000) {
 	    Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
 	    Dynarr_add (dst, (unsigned char) (code & 255));
-	  } else {
+	  } else if (write_error_characters_as_such &&
-	    /* Big endian; most significant byte first. */
+code >= UNICODE_ERROR_OCTET_RANGE_START &&
-	    int first, second;
+code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
+{
-	    CODE_TO_UTF_16_SURROGATES(code, first, second);
+Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+}
-	    Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
+else if (code < 0x110000)
-	    Dynarr_add (dst, (unsigned char) (first & 255));
+{
+/* Big endian; most significant byte first. */
-	    Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
+int first, second;
-	    Dynarr_add (dst, (unsigned char) (second & 255));
-	  }
+CODE_TO_UTF_16_SURROGATES(code, first, second);
+Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
+Dynarr_add (dst, (unsigned char) (first & 255));
+Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
+Dynarr_add (dst, (unsigned char) (second & 255));
+}
+else
+{
+/* Not valid Unicode. Pass U+FFFD, most significant byte
+first. */
+Dynarr_add (dst, (unsigned char) 0xFF);
+Dynarr_add (dst, (unsigned char) 0xFD);
+}
 	}
 break;
 case UNICODE_UCS_4:
+case UNICODE_UTF_32:
 if (little_endian)
 	{
-	  Dynarr_add (dst, (unsigned char) (code & 255));
+if (write_error_characters_as_such &&
-	  Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
+code >= UNICODE_ERROR_OCTET_RANGE_START &&
-	  Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
+code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
-	  Dynarr_add (dst, (unsigned char) (code >> 24));
+{
+Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+}
+else
+{
+/* We generate and accept incorrect sequences here, which is
+okay, in the interest of preservation of the user's
+data.  */
+Dynarr_add (dst, (unsigned char) (code & 255));
+Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
+Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
+Dynarr_add (dst, (unsigned char) (code >> 24));
+}
 	}
 else
 	{
-	  Dynarr_add (dst, (unsigned char) (code >> 24));
+if (write_error_characters_as_such &&
-	  Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
+code >= UNICODE_ERROR_OCTET_RANGE_START &&
-	  Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
+code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
-	  Dynarr_add (dst, (unsigned char) (code & 255));
+{
+Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+}
+else
+{
+/* We generate and accept incorrect sequences here, which is okay,
+in the interest of preservation of the user's data.  */
+Dynarr_add (dst, (unsigned char) (code >> 24));
+Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
+Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
+Dynarr_add (dst, (unsigned char) (code & 255));
+}
 	}
 break;
 case UNICODE_UTF_8:
 if (code <= 0x7f)
 	  Dynarr_add (dst, (unsigned char) (((code >>  6) & 0x3f) | 0x80));
 	  Dynarr_add (dst, (unsigned char) ((code        & 0x3f) | 0x80));
 	}
 else if (code <= 0x3ffffff)
 	{
-	  Dynarr_add (dst, (unsigned char) ((code >> 24) | 0xf8));
-	  Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80));
+#if !(UNICODE_ERROR_OCTET_RANGE_START > 0x1fffff \
-	  Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80));
+&& UNICODE_ERROR_OCTET_RANGE_START < 0x3ffffff)
-	  Dynarr_add (dst, (unsigned char) (((code >>  6) & 0x3f) | 0x80));
+#error "This code needs to be rewritten. "
-	  Dynarr_add (dst, (unsigned char) ((code        & 0x3f) | 0x80));
+#endif
+if (write_error_characters_as_such &&
+code >= UNICODE_ERROR_OCTET_RANGE_START &&
+code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
+{
+Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+}
+else
+{
+Dynarr_add (dst, (unsigned char) ((code >> 24) | 0xf8));
+Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80));
+Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80));
+Dynarr_add (dst, (unsigned char) (((code >>  6) & 0x3f) | 0x80));
+Dynarr_add (dst, (unsigned char) ((code        & 0x3f) | 0x80));
+}
 	}
 else
 	{
 	  Dynarr_add (dst, (unsigned char) ((code >> 30) | 0xfc));
 	  Dynarr_add (dst, (unsigned char) (((code >> 24) & 0x3f) | 0x80));
 /* Also used in mule-coding.c for UTF-8 handling in ISO 2022-oriented
 encodings. */
 void
 encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h,
 		     int USED_IF_MULE (l), unsigned_char_dynarr *dst,
-		     enum unicode_type type, unsigned int little_endian)
+		     enum unicode_type type, unsigned int little_endian,
+int write_error_characters_as_such)
 {
 #ifdef MULE
 int code = ichar_to_unicode (make_ichar (charset, h & 127, l & 127));
 if (code == -1)
 }
 #else
 int code = h;
 #endif /* MULE */
-encode_unicode_char_1 (code, dst, type, little_endian);
+encode_unicode_char_1 (code, dst, type, little_endian,
+write_error_characters_as_such);
 }
 static Bytecount
 unicode_convert (struct coding_stream *str, const UExtbyte *src,
 		 unsigned_char_dynarr *dst, Bytecount n)
 Bytecount orign = n;
 if (str->direction == CODING_DECODE)
 {
 unsigned char counter = data->counter;
+unsigned char indicated_length
+= data->indicated_length;
 while (n--)
 	{
 	  UExtbyte c = *src++;
 	  switch (type)
 	    {
 	    case UNICODE_UTF_8:
-	      switch (counter)
+if (0 == counter)
-		{
+{
-		case 0:
+if (0 == (c & 0x80))
-		  if (c >= 0xfc)
+{
-		    {
+/* ASCII. */
-		      ch = c & 0x01;
+decode_unicode_char (c, dst, data, ignore_bom);
-		      counter = 5;
+}
-		    }
+else if (0 == (c & 0x40))
-		  else if (c >= 0xf8)
+{
-		    {
+/* Highest bit set, second highest not--there's
-		      ch = c & 0x03;
+something wrong. */
-		      counter = 4;
+DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
-		    }
+}
-		  else if (c >= 0xf0)
+else if (0 == (c & 0x20))
-		    {
+{
-		      ch = c & 0x07;
+ch = c & 0x1f;
-		      counter = 3;
+counter = 1;
-		    }
+indicated_length = 2;
-		  else if (c >= 0xe0)
+}
-		    {
+else if (0 == (c & 0x10))
-		      ch = c & 0x0f;
+{
-		      counter = 2;
+ch = c & 0x0f;
-		    }
+counter = 2;
-		  else if (c >= 0xc0)
+indicated_length = 3;
-		    {
+}
-		      ch = c & 0x1f;
+else if (0 == (c & 0x08))
-		      counter = 1;
+{
-		    }
+ch = c & 0x0f;
-		  else
+counter = 3;
-		    decode_unicode_char (c, dst, data, ignore_bom);
+indicated_length = 4;
-		  break;
+}
-		case 1:
+else
-		  ch = (ch << 6) | (c & 0x3f);
+{
-		  decode_unicode_char (ch, dst, data, ignore_bom);
+/* We don't support lengths longer than 4 in
-		  ch = 0;
+external-format data. */
-		  counter = 0;
+DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
-		  break;
-		default:
+}
-		  ch = (ch << 6) | (c & 0x3f);
+}
-		  counter--;
+else
+{
+/* counter != 0 */
+if ((0 == (c & 0x80)) || (0 != (c & 0x40)))
+{
+indicate_invalid_utf_8(indicated_length,
+counter,
+ch, dst, data, ignore_bom);
+if (c & 0x80)
+{
+DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
+}
+else
+{
+/* The character just read is ASCII. Treat it as
+such.  */
+decode_unicode_char (c, dst, data, ignore_bom);
+}
+ch = 0;
+counter = 0;
+}
+else
+{
+ch = (ch << 6) | (c & 0x3f);
+counter--;
+/* Just processed the final byte. Emit the character. */
+if (!counter)
+{
+			  /* Don't accept over-long sequences, surrogates,
+or codes above #x10FFFF. */
+if ((ch < 0x80) ||
+((ch < 0x800) && indicated_length > 2) ||
+((ch < 0x10000) && indicated_length > 3) ||
+valid_utf_16_surrogate(ch) || (ch > 0x110000))
+{
+indicate_invalid_utf_8(indicated_length,
+counter,
+ch, dst, data,
+ignore_bom);
+}
+else
+{
+decode_unicode_char (ch, dst, data, ignore_bom);
+}
+ch = 0;
+}
+}
 		}
 	      break;
 	    case UNICODE_UTF_16:
 	      if (little_endian)
 		ch = (c << counter) | ch;
 	      else
 		ch = (ch << 8) | c;
 	      counter += 8;
-	      if (counter == 16 && valid_utf_16_first_surrogate(ch))
+	      if (16 == counter)
-		break;
+{
-	      if (counter == 16)
-		{
 		  int tempch = ch;
+if (valid_utf_16_first_surrogate(ch))
+{
+break;
+}
 		  ch = 0;
 		  counter = 0;
 		  decode_unicode_char (tempch, dst, data, ignore_bom);
 		}
-	      if (counter == 32)
+	      else if (32 == counter)
 		{
 		  int tempch;
-		  /* #### Signalling an error may be a bit extreme. Should
-		     we try and read it in anyway? */
+		  if (!valid_utf_16_last_surrogate(ch & 0xFFFF))
-		  if (!valid_utf_16_first_surrogate(ch >> 16)
-		      || !valid_utf_16_last_surrogate(ch & 0xFFFF))
 		    {
-		      signal_error(Qtext_conversion_error,
+DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
-				   "Invalid UTF-16 surrogate sequence",
+ignore_bom);
-				   Qunbound);
+DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+ignore_bom);
+DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+ignore_bom);
+DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
+ignore_bom);
 		    }
-		  tempch = utf_16_surrogates_to_code((ch >> 16),
+else
-						     (ch & 0xffff));
+{
+tempch = utf_16_surrogates_to_code((ch >> 16),
+(ch & 0xffff));
+decode_unicode_char(tempch, dst, data, ignore_bom);
+}
 		  ch = 0;
 		  counter = 0;
-		  decode_unicode_char(tempch, dst, data, ignore_bom);
+}
-		}
+else
+assert(8 == counter || 24 == counter);
 	      break;
 	    case UNICODE_UCS_4:
+case UNICODE_UTF_32:
 	      if (little_endian)
 		ch = (c << counter) | ch;
 	      else
 		ch = (ch << 8) | c;
 	      counter += 8;
 	      if (counter == 32)
 		{
-		  int tempch = ch;
+		  if (ch > 0x10ffff)
+		    {
+/* ch is not a legal Unicode character. We're fine
+with that in UCS-4, though not in UTF-32. */
+if (UNICODE_UCS_4 == type && ch < 0x80000000)
+{
+decode_unicode_char (ch, dst, data, ignore_bom);
+}
+else if (little_endian)
+{
+DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
+ignore_bom);
+DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+ignore_bom);
+DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+ignore_bom);
+DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
+ignore_bom);
+}
+else
+{
+DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
+ignore_bom);
+DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+ignore_bom);
+DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+ignore_bom);
+DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
+ignore_bom);
+}
+		    }
+else
+{
+decode_unicode_char (ch, dst, data, ignore_bom);
+}
 		  ch = 0;
 		  counter = 0;
-		  if (tempch < 0)
-		    {
-		      /* !!#### indicate an error */
-		      tempch = '~';
-		    }
-		  decode_unicode_char (tempch, dst, data, ignore_bom);
 		}
 	      break;
 	    case UNICODE_UTF_7:
 	      ABORT ();
 	    default: ABORT ();
 	    }
 	}
-if (str->eof)
-	DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
+if (str->eof && ch)
+{
+switch (type)
+{
+	    case UNICODE_UTF_8:
+indicate_invalid_utf_8(indicated_length,
+counter, ch, dst, data,
+ignore_bom);
+break;
+case UNICODE_UTF_16:
+case UNICODE_UCS_4:
+case UNICODE_UTF_32:
+if (8 == counter)
+{
+DECODE_ERROR_OCTET (ch, dst, data, ignore_bom);
+}
+else if (16 == counter)
+{
+if (little_endian)
+{
+DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom);
+DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+ignore_bom);
+}
+else
+{
+DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+ignore_bom);
+DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom);
+}
+}
+else if (24 == counter)
+{
+if (little_endian)
+{
+DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+ignore_bom);
+DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom);
+DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+ignore_bom);
+}
+else
+{
+DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+ignore_bom);
+DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+ignore_bom);
+DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
+ignore_bom);
+}
+}
+else assert(0);
+break;
+}
+ch = 0;
+}
 data->counter = counter;
+data->indicated_length = indicated_length;
 }
 else
 {
 unsigned char char_boundary = data->current_char_boundary;
 Lisp_Object charset = data->current_charset;
 back_to_square_n:
 #endif /* ENABLE_COMPOSITE_CHARS */
 if (XCODING_SYSTEM_UNICODE_NEED_BOM (str->codesys) && !data->wrote_bom)
 	{
-	  encode_unicode_char_1 (0xFEFF, dst, type, little_endian);
+	  encode_unicode_char_1 (0xFEFF, dst, type, little_endian, 1);
 	  data->wrote_bom = 1;
 	}
 while (n--)
 	{
 	  if (byte_ascii_p (c))
 #endif /* MULE */
 	    {			/* Processing ASCII character */
 	      ch = 0;
 	      encode_unicode_char (Vcharset_ascii, c, 0, dst, type,
-				   little_endian);
+				   little_endian, 1);
 	      char_boundary = 1;
 	    }
 #ifdef MULE
 	  else if (ibyte_leading_byte_p (c) || ibyte_leading_byte_p (ch))
 		   (Info-goto-node "(internals)Internal String Encoding")
 		   for the rationale behind subtracting #xa0 from the
 		   character's code. */
 		encode_unicode_char (Vcharset_control_1, c - 0xa0, 0, dst,
-				     type, little_endian);
+				     type, little_endian, 1);
 	      else
 		{
 		  switch (XCHARSET_REP_BYTES (charset))
 		    {
 		    case 2:
 		      encode_unicode_char (charset, c, 0, dst, type,
-					   little_endian);
+					   little_endian, 1);
 		      break;
 		    case 3:
 		      if (XCHARSET_PRIVATE_P (charset))
 			{
 			  encode_unicode_char (charset, c, 0, dst, type,
-					       little_endian);
+					       little_endian, 1);
 			  ch = 0;
 			}
 		      else if (ch)
 			{
 #ifdef ENABLE_COMPOSITE_CHARS
 				{
 				  /* #### Bother! We don't know how to
 				     handle this yet. */
 				  encode_unicode_char (Vcharset_ascii, '~', 0,
 						       dst, type,
-						       little_endian);
+						       little_endian, 1);
 				}
 			      else
 				{
 				  Ichar emch = make_ichar (Vcharset_composite,
 							   ch & 0x7F,
 				}
 			    }
 			  else
 #endif /* ENABLE_COMPOSITE_CHARS */
 			    encode_unicode_char (charset, ch, c, dst, type,
-						 little_endian);
+						 little_endian, 1);
 			  ch = 0;
 			}
 		      else
 			{
 			  ch = c;
 		      break;
 		    case 4:
 		      if (ch)
 			{
 			  encode_unicode_char (charset, ch, c, dst, type,
-					       little_endian);
+					       little_endian, 1);
 			  ch = 0;
 			}
 		      else
 			{
 			  ch = c;
 	type = UNICODE_UTF_16;
 else if (EQ (value, Qutf_7))
 	type = UNICODE_UTF_7;
 else if (EQ (value, Qucs_4))
 	type = UNICODE_UCS_4;
+else if (EQ (value, Qutf_32))
+	type = UNICODE_UTF_32;
 else
 	invalid_constant ("Invalid Unicode type", key);
 XCODING_SYSTEM_UNICODE_TYPE (codesys) = type;
 }
 	{
 	case UNICODE_UTF_16: return Qutf_16;
 	case UNICODE_UTF_8: return Qutf_8;
 	case UNICODE_UTF_7: return Qutf_7;
 	case UNICODE_UCS_4: return Qucs_4;
+	case UNICODE_UTF_32: return Qutf_32;
 	default: ABORT ();
 	}
 }
 else if (EQ (prop, Qlittle_endian))
 return XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN (coding_system) ? Qt : Qnil;
 DEFSUBR (Funicode_to_char);
 DEFSYMBOL (Qunicode);
 DEFSYMBOL (Qucs_4);
 DEFSYMBOL (Qutf_16);
+DEFSYMBOL (Qutf_32);
 DEFSYMBOL (Qutf_8);
 DEFSYMBOL (Qutf_7);
 DEFSYMBOL (Qneed_bom);

Mercurial > hg > xemacs-beta

comparison src/unicode.c @ 4096:1abf84db2c7f