changeset 3367:84ee3ca77e7f

[xemacs-hg @ 2006-04-29 14:36:49 by aidan] Support Unicode escapes in the Lisp reader, taking the syntax from C#.
author aidan
date Sat, 29 Apr 2006 14:36:57 +0000
parents db585a1b4d86
children 959746c534f6
files man/ChangeLog man/lispref/objects.texi src/ChangeLog src/lread.c src/xft-fonts.h
diffstat 4 files changed, 78 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/man/ChangeLog	Fri Apr 28 21:51:06 2006 +0000
+++ b/man/ChangeLog	Sat Apr 29 14:36:57 2006 +0000
@@ -1,3 +1,9 @@
+2006-04-29  Aidan Kehoe  <kehoea@parhasard.net>
+
+	* lispref/objects.texi (Character Type):
+	Document the Unicode escape syntax for characters in character
+	literals and strings.
+
 2006-04-23  Stephen J. Turnbull  <stephen@xemacs.org>
 
 	* internals/internals.texi:  Run texinfo-master-menu.
--- a/man/lispref/objects.texi	Fri Apr 28 21:51:06 2006 +0000
+++ b/man/lispref/objects.texi	Sat Apr 29 14:36:57 2006 +0000
@@ -510,6 +510,21 @@
 For example, character code 193 is a lowercase @samp{a} with an acute
 accent, in @sc{iso}-8859-1.)
 
+@cindex unicode character escape
+   From version 21.5.25 onwards, XEmacs provides a syntax for specifying
+characters by their Unicode code points.  @samp{?\uABCD} will give you
+an XEmacs character that maps to the code point @samp{U+ABCD} in
+Unicode-based representations (UTF-8 text files, Unicode-oriented fonts,
+etc.).  Just as in the C# language, there is a slightly different syntax
+for specifying characters with code points above @samp{#xFFFF};
+@samp{?\U00ABCDEF} will give you an XEmacs character that maps to the
+code point @samp{U+ABCDEF} in Unicode-based representations, if such an
+XEmacs character exists.
+
+  Unlike in C#, this syntax is available only in character literals
+and (see later) in strings; it is not available elsewhere in your Lisp
+source code.
+
 @ignore @c None of this crap applies to XEmacs.
   For use in strings and buffers, you are limited to the control
 characters that exist in @sc{ascii}, but for keyboard input purposes,
@@ -614,6 +629,7 @@
 @cindex backslash in character constant
 @cindex octal character code
 @cindex hexadecimal character code
+
   Finally, there are two read syntaxes involving character codes.
 It is not possible to represent multibyte or wide characters in this
 way; the permissible range of codes is from 0 to 255 (@emph{i.e.},
--- a/src/ChangeLog	Fri Apr 28 21:51:06 2006 +0000
+++ b/src/ChangeLog	Sat Apr 29 14:36:57 2006 +0000
@@ -1,3 +1,10 @@
+2006-04-29  Aidan Kehoe  <kehoea@parhasard.net>
+
+	* lread.c:
+	* lread.c (read_escape):
+	Support \uABCD and \U00ABCDEF for specifying characters by their
+	Unicode code point. 
+	
 2006-04-25  Stephen J. Turnbull  <stephen@xemacs.org>
 
 	Repair busted commit, plus some gratuitous doc improvements.
--- a/src/lread.c	Fri Apr 28 21:51:06 2006 +0000
+++ b/src/lread.c	Sat Apr 29 14:36:57 2006 +0000
@@ -208,6 +208,8 @@
 static int locate_file_open_or_access_file (Ibyte *fn, int access_mode);
 EXFUN (Fread_from_string, 3);
 
+EXFUN (Funicode_to_char, 2);  /* In unicode.c.  */
+
 /* When errors are signaled, the actual readcharfun should not be used
    as an argument if it is an lstream, so that lstreams don't escape
    to the Lisp level.  */
@@ -1675,6 +1677,9 @@
 {
   /* This function can GC */
   Ichar c = readchar (readcharfun);
+  /* \u allows up to four hex digits, \U up to eight. Default to the
+     behaviour for \u, and change this value in the case that \U is seen. */
+  int unicode_hex_count = 4;
 
   if (c < 0)
     signal_error (Qend_of_file, 0, READCHARFUN_MAYBE (readcharfun));
@@ -1763,7 +1768,7 @@
 	      }
 	  }
 	if (i >= 0400)
-	  syntax_error ("Attempt to create non-ASCII/ISO-8859-1 character",
+	  syntax_error ("Non-ISO-8859-1 character specified with octal escape",
 			make_int (i));
 	return i;
       }
@@ -1791,11 +1796,51 @@
 	  }
 	return i;
       }
-
+    case 'U':
+      /* Post-Unicode-2.0: Up to eight hex chars */
+      unicode_hex_count = 8;
+    case 'u':
+
+      /* A Unicode escape, as in C# (though we only permit them in strings
+	 and characters, not arbitrarily in the source code.) */
+      {
+	REGISTER Ichar i = 0;
+	REGISTER int count = 0;
+	Lisp_Object lisp_char;
+	while (++count <= unicode_hex_count)
+	  {
+	    c = readchar (readcharfun);
+	    /* Remember, can't use isdigit(), isalpha() etc. on Ichars */
+	    if      (c >= '0' && c <= '9')  i = (i << 4) + (c - '0');
+	    else if (c >= 'a' && c <= 'f')  i = (i << 4) + (c - 'a') + 10;
+            else if (c >= 'A' && c <= 'F')  i = (i << 4) + (c - 'A') + 10;
+	    else
+	      {
+		syntax_error ("Non-hex digit used for Unicode escape",
+			      make_char (c));
+		break;
+	      }
+	  }
+
+	lisp_char = Funicode_to_char(make_int(i), Qnil);
+
+	if (EQ(Qnil, lisp_char))
+	  {
+	    /* This is ugly and horrible and trashes the user's data, but
+	       it's what unicode.c does. In the future, unicode-to-char
+	       should not return nil.  */
 #ifdef MULE
-      /* #### need some way of reading an extended character with
-	 an escape sequence. */
+	    i = make_ichar (Vcharset_japanese_jisx0208, 34 + 128, 46 + 128);
+#else
+	    i = '~';
 #endif
+            return i;
+	  }
+	else
+	  {
+	    return XCHAR(lisp_char);
+	  }
+      }
 
     default:
 	return c;