xemacs-beta: src/mule-coding.c comparison

comparison src/mule-coding.c @ 4096:1abf84db2c7f

[xemacs-hg @ 2007-08-04 20:00:10 by aidan] Preserve invalid UTF-8, UTF-16 sequences on encoding, decoding.

author	aidan
date	Sat, 04 Aug 2007 20:00:24 +0000
parents	42e4605ef1de
children	383ab474a241

comparison

equal deleted inserted replaced

-:bff7e065cfdc
+:1abf84db2c7f
 			       unsigned_char_dynarr *dst)
 {
 if (XCHARSET_ENCODE_AS_UTF_8 (charset))
 {
 encode_unicode_char (charset, c & charmask, 0,
-			   dst, UNICODE_UTF_8, 0);
+			   dst, UNICODE_UTF_8, 0, 0);
 }
 else
 {
 Dynarr_add (dst, c & charmask);
 }
 if (XCHARSET_ENCODE_AS_UTF_8 (charset))
 {
 encode_unicode_char (charset,
 			   ch & charmask,
 			   c & charmask, dst,
-			   UNICODE_UTF_8, 0);
+			   UNICODE_UTF_8, 0, 0);
 }
 else
 {
 Dynarr_add (dst, ch & charmask);
 Dynarr_add (dst, c & charmask);
 int current_half;
 int current_char_boundary;
 /* Used for handling UTF-8. */
 unsigned char counter;
+unsigned char indicated_length;
 };
 static const struct memory_description ccs_description_1[] =
 {
 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
 	Dynarr_add (dst, ISO_CODE_CSI);
 Dynarr_add (dst, '2');
 Dynarr_add (dst, ']');
 if (flags)
 	*flags |= ISO_STATE_R2L;
+}
+}
+/* Append to DST the internal-format text of the character that
+   Funicode_to_char returns for the Unicode code point UCS.
+   Note that this name conflicts with a function in unicode.c. */
+static void
+decode_unicode_char (int ucs, unsigned_char_dynarr *dst)
+{
+Ibyte work[MAX_ICHAR_LEN];
+int len;
+Lisp_Object chr;
+chr = Funicode_to_char(make_int(ucs), Qnil);
+assert (!NILP(chr)); /* The conversion must yield a character; nil trips the assert. */
+len = set_itext_ichar (work, XCHAR(chr)); /* len = number of bytes written to work */
+Dynarr_add_many (dst, work, len);
+}
+#define DECODE_ERROR_OCTET(octet, dst) \
+decode_unicode_char ((octet) + UNICODE_ERROR_OCTET_RANGE_START, dst) /* emit OCTET as the code point OCTET + UNICODE_ERROR_OCTET_RANGE_START */
+static inline void
+indicate_invalid_utf_8 (unsigned char indicated_length,
+unsigned char counter,
+int ch, unsigned_char_dynarr *dst)
+{ /* Re-emit, via DECODE_ERROR_OCTET, the octets of a UTF-8 sequence that
+     turned out to be invalid.  INDICATED_LENGTH is the sequence length
+     announced by the lead octet, COUNTER the number of continuation
+     octets still outstanding, and CH the bits accumulated so far.
+     NOTE(review): presumably INDICATED_LENGTH is in 2..6, as set by the
+     decoder; larger values would index past the mask table -- confirm. */
+Binbyte stored = indicated_length - counter; /* octets already folded into ch */
+Binbyte mask = "\x00\x00\xC0\xE0\xF0\xF8\xFC"[indicated_length]; /* lead-octet prefix bits for this length */
+while (stored > 0)
+{
+DECODE_ERROR_OCTET (((ch >> (6 * (stored - 1))) & 0x3f) | mask,
+dst); /* highest remaining 6-bit group of ch, with its prefix bits restored */
+mask = 0x80, stored--; /* every octet after the first gets the 10xxxxxx continuation prefix */
 }
 }
 /* Convert ISO2022-format data to internal format. */
 	  ch = 0;
 	}
 else if (flags & ISO_STATE_UTF_8)
 	{
 	  unsigned char counter = data->counter;
-	  Ibyte work[MAX_ICHAR_LEN];
+unsigned char indicated_length = data->indicated_length;
-	  int len;
-	  Lisp_Object chr;
 	  if (ISO_CODE_ESC == c)
 	    {
 	      /* Allow the escape sequence parser to end the UTF-8 state. */
 	      flags |= ISO_STATE_ESCAPE;
 	      data->esc = ISO_ESC;
 	      data->esc_bytes_index = 1;
 	      continue;
 	    }
-	  switch (counter)
+if (0 == counter)
-	    {
+{
-	    case 0:
+if (0 == (c & 0x80))
-	      if (c >= 0xfc)
+{
-		{
+/* ASCII. */
-		  ch = c & 0x01;
+decode_unicode_char (c, dst);
-		  counter = 5;
+}
-		}
+else if (0 == (c & 0x40))
-	      else if (c >= 0xf8)
+{
-		{
+/* Highest bit set, second highest not--there's
-		  ch = c & 0x03;
+something wrong. */
-		  counter = 4;
+DECODE_ERROR_OCTET (c, dst);
-		}
+}
-	      else if (c >= 0xf0)
+else if (0 == (c & 0x20))
-		{
+{
-		  ch = c & 0x07;
+ch = c & 0x1f;
-		  counter = 3;
+counter = 1;
-		}
+indicated_length = 2;
-	      else if (c >= 0xe0)
+}
-		{
+else if (0 == (c & 0x10))
-		  ch = c & 0x0f;
+{
-		  counter = 2;
+ch = c & 0x0f;
-		}
+counter = 2;
-	      else if (c >= 0xc0)
+indicated_length = 3;
-		{
+}
-		  ch = c & 0x1f;
+else if (0 == (c & 0x08))
-		  counter = 1;
+{
-		}
+ch = c & 0x0f;
-	      else
+counter = 3;
-		/* ASCII, or the lower control characters.
+indicated_length = 4;
+}
-Perhaps we should signal an error if the character is in
+/* We support lengths longer than 4 here, since we want to
-the range 0x80-0xc0; this is illegal UTF-8. */
+represent UTF-8 error chars as distinct from the
-Dynarr_add (dst, (c & 0x7f));
+corresponding ISO 8859-1 characters in escape-quoted.
-	      break;
+However, we can't differentiate UTF-8 error chars as
-	    case 1:
+written to disk, and UTF-8 errors in escape-quoted.  This
-	      ch = (ch << 6) | (c & 0x3f);
+is not a big problem;
-	      chr = Funicode_to_char(make_int(ch), Qnil);
+non-Unicode-chars-encoded-as-UTF-8-in-ISO-2022 is not
+deployed, in practice, so if such a sequence of octets
-	      if (!NILP (chr))
+occurs, XEmacs generated it.  */
-		{
+else if (0 == (c & 0x04))
-		  assert(CHARP(chr));
+{
-		  len = set_itext_ichar (work, XCHAR(chr));
+ch = c & 0x03;
-		  Dynarr_add_many (dst, work, len);
+counter = 4;
-		}
+indicated_length = 5;
-	      else
+}
-		{
+else if (0 == (c & 0x02))
-		  /* Shouldn't happen, this code should only be enabled in
+{
-		     XEmacsen with support for all of Unicode. */
+ch = c & 0x01;
-		  Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
+counter = 5;
-		  Dynarr_add (dst, 34 + 128);
+indicated_length = 6;
-		  Dynarr_add (dst, 46 + 128);
+}
-		}
+else
+{
-	      ch = 0;
+/* #xFF is not a valid leading byte in any form of
-	      counter = 0;
+UTF-8. */
-	      break;
+DECODE_ERROR_OCTET (c, dst);
-	    default:
-	      ch = (ch << 6) | (c & 0x3f);
+}
-	      counter--;
+}
-	    }
+else
+{
-	  if (str->eof)
+/* counter != 0 */
-	    DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
+if ((0 == (c & 0x80)) || (0 != (c & 0x40)))
+{
+indicate_invalid_utf_8(indicated_length,
+counter,
+ch, dst);
+if (c & 0x80)
+{
+DECODE_ERROR_OCTET (c, dst);
+}
+else
+{
+/* The character just read is ASCII. Treat it as
+such.  */
+decode_unicode_char (c, dst);
+}
+ch = 0;
+counter = 0;
+}
+else
+{
+ch = (ch << 6) | (c & 0x3f);
+counter--;
+/* Just processed the final byte. Emit the character. */
+if (!counter)
+{
+/* Don't accept over-long sequences, or surrogates. */
+if ((ch < 0x80) ||
+((ch < 0x800) && indicated_length > 2) ||
+((ch < 0x10000) && indicated_length > 3) ||
+/* We accept values above #x110000 in
+escape-quoted, though not in UTF-8. */
+/* (ch > 0x110000) || */
+valid_utf_16_surrogate(ch))
+{
+indicate_invalid_utf_8(indicated_length,
+counter,
+ch, dst);
+}
+else
+{
+decode_unicode_char (ch, dst);
+}
+ch = 0;
+}
+}
+}
+if (str->eof && ch)
+{
+DECODE_ERROR_OCTET (ch, dst);
+ch  = 0;
+}
 	  data->counter = counter;
+	  data->indicated_length = indicated_length;
 	}
 else if (byte_c0_p (c) || byte_c1_p (c))
 	{ /* Control characters */
 	  /***** Error-handling *****/

Mercurial > hg > xemacs-beta

comparison src/mule-coding.c @ 4096:1abf84db2c7f