Mercurial > hg > xemacs-beta
changeset 3367:84ee3ca77e7f
[xemacs-hg @ 2006-04-29 14:36:49 by aidan]
Support Unicode escapes in the Lisp reader, taking the syntax from C#.
author | aidan |
---|---|
date | Sat, 29 Apr 2006 14:36:57 +0000 |
parents | db585a1b4d86 |
children | 959746c534f6 |
files | man/ChangeLog man/lispref/objects.texi src/ChangeLog src/lread.c src/xft-fonts.h |
diffstat | 4 files changed, 78 insertions(+), 4 deletions(-) [+] |
line wrap: on
line diff
--- a/man/ChangeLog Fri Apr 28 21:51:06 2006 +0000 +++ b/man/ChangeLog Sat Apr 29 14:36:57 2006 +0000 @@ -1,3 +1,9 @@ +2006-04-29 Aidan Kehoe <kehoea@parhasard.net> + + * lispref/objects.texi (Character Type): + Document the Unicode escape syntax for character literals and + strings. + 2006-04-23 Stephen J. Turnbull <stephen@xemacs.org> * internals/internals.texi: Run texinfo-master-menu.
--- a/man/lispref/objects.texi Fri Apr 28 21:51:06 2006 +0000 +++ b/man/lispref/objects.texi Sat Apr 29 14:36:57 2006 +0000 @@ -510,6 +510,21 @@ For example, character code 193 is a lowercase @samp{a} with an acute accent, in @sc{iso}-8859-1.) +@cindex unicode character escape + From version 21.5.25 onwards, XEmacs provides a syntax for specifying +characters by their Unicode code points. @samp{?\uABCD} will give you +an XEmacs character that maps to the code point @samp{U+ABCD} in +Unicode-based representations (UTF-8 text files, Unicode-oriented fonts, +etc.). Just as in the C# language, there is a slightly different syntax +for specifying characters with code points above @samp{#xFFFF}; +@samp{\U00ABCDEF} will give you an XEmacs character that maps to the +code point @samp{U+ABCDEF} in Unicode-based representations, if such an +XEmacs character exists. + + Unlike in C#, while this syntax is available for character literals, +and (see later) in strings, it is not available elsewhere in your Lisp +source code. + @ignore @c None of this crap applies to XEmacs. For use in strings and buffers, you are limited to the control characters that exist in @sc{ascii}, but for keyboard input purposes, @@ -614,6 +629,7 @@ @cindex backslash in character constant @cindex octal character code @cindex hexadecimal character code + Finally, there are two read syntaxes involving character codes. It is not possible to represent multibyte or wide characters in this way; the permissible range of codes is from 0 to 255 (@emph{i.e.},
--- a/src/ChangeLog Fri Apr 28 21:51:06 2006 +0000 +++ b/src/ChangeLog Sat Apr 29 14:36:57 2006 +0000 @@ -1,3 +1,10 @@ +2006-04-29 Aidan Kehoe <kehoea@parhasard.net> + + * lread.c: + * lread.c (read_escape): + Support \uABCD and \U00ABCDEF for specifying characters by their + Unicode code point. + 2006-04-25 Stephen J. Turnbull <stephen@xemacs.org> Repair busted commit, plus some gratuitous doc improvements.
--- a/src/lread.c Fri Apr 28 21:51:06 2006 +0000 +++ b/src/lread.c Sat Apr 29 14:36:57 2006 +0000 @@ -208,6 +208,8 @@ static int locate_file_open_or_access_file (Ibyte *fn, int access_mode); EXFUN (Fread_from_string, 3); +EXFUN (Funicode_to_char, 2); /* In unicode.c. */ + /* When errors are signaled, the actual readcharfun should not be used as an argument if it is an lstream, so that lstreams don't escape to the Lisp level. */ @@ -1675,6 +1677,9 @@ { /* This function can GC */ Ichar c = readchar (readcharfun); + /* \u allows up to four hex digits, \U up to eight. Default to the + behaviour for \u, and change this value in the case that \U is seen. */ + int unicode_hex_count = 4; if (c < 0) signal_error (Qend_of_file, 0, READCHARFUN_MAYBE (readcharfun)); @@ -1763,7 +1768,7 @@ } } if (i >= 0400) - syntax_error ("Attempt to create non-ASCII/ISO-8859-1 character", + syntax_error ("Non-ISO-8859-1 character specified with octal escape", make_int (i)); return i; } @@ -1791,11 +1796,51 @@ } return i; } - + case 'U': + /* Post-Unicode-2.0: Up to eight hex chars */ + unicode_hex_count = 8; + case 'u': + + /* A Unicode escape, as in C# (though we only permit them in strings + and characters, not arbitrarily in the source code.) */ + { + REGISTER Ichar i = 0; + REGISTER int count = 0; + Lisp_Object lisp_char; + while (++count <= unicode_hex_count) + { + c = readchar (readcharfun); + /* Remember, can't use isdigit(), isalpha() etc. on Ichars */ + if (c >= '0' && c <= '9') i = (i << 4) + (c - '0'); + else if (c >= 'a' && c <= 'f') i = (i << 4) + (c - 'a') + 10; + else if (c >= 'A' && c <= 'F') i = (i << 4) + (c - 'A') + 10; + else + { + syntax_error ("Non-hex digit used for Unicode escape", + make_char (c)); + break; + } + } + + lisp_char = Funicode_to_char(make_int(i), Qnil); + + if (EQ(Qnil, lisp_char)) + { + /* This is ugly and horrible and trashes the user's data, but + it's what unicode.c does. In the future, unicode-to-char + should not return nil. 
*/ #ifdef MULE - /* #### need some way of reading an extended character with - an escape sequence. */ + i = make_ichar (Vcharset_japanese_jisx0208, 34 + 128, 46 + 128); +#else + i = '~'; #endif + return i; + } + else + { + return XCHAR(lisp_char); + } + } default: return c;