Mercurial > hg > xemacs-beta
changeset 3543:c136144fe765
[xemacs-hg @ 2006-08-04 22:55:04 by aidan]
Raw strings, from Python via SXEmacs
author | aidan |
---|---|
date | Fri, 04 Aug 2006 22:55:19 +0000 |
parents | 1ce31579a443 |
children | 981a144e71fa |
files | lisp/ChangeLog lisp/lisp.el man/ChangeLog man/lispref/objects.texi src/ChangeLog src/lread.c tests/ChangeLog tests/automated/lisp-reader-tests.el |
diffstat | 8 files changed, 295 insertions(+), 99 deletions(-) [+] |
line wrap: on
line diff
--- a/lisp/ChangeLog Fri Aug 04 21:50:46 2006 +0000 +++ b/lisp/ChangeLog Fri Aug 04 22:55:19 2006 +0000 @@ -1,3 +1,9 @@ +2006-08-05 Aidan Kehoe <kehoea@parhasard.net> + + * lisp.el (forward-sexp): + Handle raw strings specially just as we do structures. Fixes + problems evaluating them in *scratch*. + 2006-08-04 Aidan Kehoe <kehoea@parhasard.net> * iso8859-1.el:
--- a/lisp/lisp.el Fri Aug 04 21:50:46 2006 +0000 +++ b/lisp/lisp.el Fri Aug 04 22:55:19 2006 +0000 @@ -60,19 +60,20 @@ (interactive "_p") (or arg (setq arg 1)) ;; XEmacs: evil hack! The other half of the evil hack below. - (if (and (> arg 0) (looking-at "#s(")) - (goto-char (+ (point) 2))) + (if (and (> arg 0) (looking-at "#s(\\|#r[uU]\\{0,1\\}\"")) + (goto-char (1+ (- (point) (- (match-end 0) (match-beginning 0)))))) (goto-char (or (scan-sexps (point) arg) (buffer-end arg))) - (if (< arg 0) (backward-prefix-chars)) - ;; XEmacs: evil hack! Skip back over #s so that structures are read - ;; properly. the current cheesified syntax tables just aren't up to - ;; this. - (if (and (< arg 0) - (eq (char-after (point)) ?\() - (>= (- (point) (point-min)) 2) - (eq (char-after (- (point) 1)) ?s) - (eq (char-after (- (point) 2)) ?#)) - (goto-char (- (point) 2)))) + (when (< arg 0) + (backward-prefix-chars) + ;; XEmacs: evil hack! Skip back over #[sr] so that structures and raw + ;; strings are read properly. the current cheesified syntax tables just + ;; aren't up to this. + (let* ((diff (- (point) (point-min))) + (subject (buffer-substring (- (point) (min diff 3)) + (1+ (point)))) + (matched (string-match "#s(\\|#r[uU]\\{0,1\\}\"" subject))) + (if matched + (goto-char (1+ (- (point) (- (length subject) matched)))))))) (defun backward-sexp (&optional arg) "Move backward across one balanced expression (sexp).
--- a/man/ChangeLog Fri Aug 04 21:50:46 2006 +0000 +++ b/man/ChangeLog Fri Aug 04 22:55:19 2006 +0000 @@ -1,3 +1,9 @@ +2006-08-05 Aidan Kehoe <kehoea@parhasard.net> + + * lispref/objects.texi (String Type): + Give details of the raw string syntax, taken from SXEmacs and + Python. + 2006-07-19 Stephen J. Turnbull <stephen@xemacs.org> * new-users-guide/edit.texi (Insert): Document bogosity in
--- a/man/lispref/objects.texi Fri Aug 04 21:50:46 2006 +0000 +++ b/man/lispref/objects.texi Fri Aug 04 22:55:19 2006 +0000 @@ -1079,6 +1079,16 @@ escape any backslash or double-quote characters in the string with a backslash, like this: @code{"this \" is an embedded quote"}. + An alternative syntax allows insertion of raw backslashes into a +string, like this: @code{#r"this \ is an embedded backslash"}. In such +a string, each character following a backslash is included literally in +the string, and all backslashes are left in the string. This means that +@code{#r"\""} is a valid string literal with two characters, a backslash and a +double-quote. It also means that a string with this syntax @emph{cannot end +in a single backslash}. As with Python, from where this syntax was +taken, you can specify @code{u} or @code{U} after the @code{#r} to +specify that interpretation of Unicode escapes should be done. + The newline character is not special in the read syntax for strings; if you write a new line between the double-quotes, it becomes a character in the string. But an escaped newline---one that is preceded
--- a/src/ChangeLog Fri Aug 04 21:50:46 2006 +0000 +++ b/src/ChangeLog Fri Aug 04 22:55:19 2006 +0000 @@ -1,3 +1,18 @@ +2006-08-05 Aidan Kehoe <kehoea@parhasard.net> + + * lread.c (read_unicode_escape): + Refactor this code out from read_escape, since it's now called + from read_string as well. + * lread.c (read_escape): + Call read_unicode_escape instead of using inline code, + * lread.c (read_string): + Refactor out from read1, provide raw and honor_unicode options. + * lread.c (read_raw_string): + Added, a function that calls read_string with the correct + arguments for a raw string. + * lread.c (read1): + Pass raw strings to read_raw_string; pass strings to read_string. + 2006-08-04 Aidan Kehoe <kehoea@parhasard.net> * event-tty.c (emacs_tty_next_event):
--- a/src/lread.c Fri Aug 04 21:50:46 2006 +0000 +++ b/src/lread.c Fri Aug 04 22:55:19 2006 +0000 @@ -1670,15 +1670,56 @@ return val; } + +/* A Unicode escape, as in C# (though we only permit them in strings + and characters, not arbitrarily in the source code.) */ +static Ichar +read_unicode_escape (Lisp_Object readcharfun, int unicode_hex_count) +{ + REGISTER Ichar i = 0, c; + REGISTER int count = 0; + Lisp_Object lisp_char; + while (++count <= unicode_hex_count) + { + c = readchar (readcharfun); + /* Remember, can't use isdigit(), isalpha() etc. on Ichars */ + if (c >= '0' && c <= '9') i = (i << 4) + (c - '0'); + else if (c >= 'a' && c <= 'f') i = (i << 4) + (c - 'a') + 10; + else if (c >= 'A' && c <= 'F') i = (i << 4) + (c - 'A') + 10; + else + { + syntax_error ("Non-hex digit used for Unicode escape", + make_char (c)); + break; + } + } + + lisp_char = Funicode_to_char(make_int(i), Qnil); + + if (EQ(Qnil, lisp_char)) + { + /* This is ugly and horrible and trashes the user's data, but + it's what unicode.c does. In the future, unicode-to-char + should not return nil. */ +#ifdef MULE + i = make_ichar (Vcharset_japanese_jisx0208, 34 + 128, 46 + 128); +#else + i = '~'; +#endif + return i; + } + else + { + return XCHAR(lisp_char); + } +} + static Ichar read_escape (Lisp_Object readcharfun) { /* This function can GC */ Ichar c = readchar (readcharfun); - /* \u allows up to four hex digits, \U up to eight. Default to the - behaviour for \u, and change this value in the case that \U is seen. */ - int unicode_hex_count = 4; if (c < 0) signal_error (Qend_of_file, 0, READCHARFUN_MAYBE (readcharfun)); @@ -1797,49 +1838,10 @@ } case 'U': /* Post-Unicode-2.0: Up to eight hex chars */ - unicode_hex_count = 8; + return read_unicode_escape(readcharfun, 8); case 'u': - - /* A Unicode escape, as in C# (though we only permit them in strings - and characters, not arbitrarily in the source code.) */ - { - REGISTER Ichar i = 0; - REGISTER int count = 0; - Lisp_Object lisp_char; - while (++count <= unicode_hex_count) - { - c = readchar (readcharfun); - /* Remember, can't use isdigit(), isalpha() etc. on Ichars */ - if (c >= '0' && c <= '9') i = (i << 4) + (c - '0'); - else if (c >= 'a' && c <= 'f') i = (i << 4) + (c - 'a') + 10; - else if (c >= 'A' && c <= 'F') i = (i << 4) + (c - 'A') + 10; - else - { - syntax_error ("Non-hex digit used for Unicode escape", - make_char (c)); - break; - } - } - - lisp_char = Funicode_to_char(make_int(i), Qnil); - - if (EQ(Qnil, lisp_char)) - { - /* This is ugly and horrible and trashes the user's data, but - it's what unicode.c does. In the future, unicode-to-char - should not return nil. */ -#ifdef MULE - i = make_ichar (Vcharset_japanese_jisx0208, 34 + 128, 46 + 128); -#else - i = '~'; -#endif - return i; - } - else - { - return XCHAR(lisp_char); - } - } + /* Unicode-2.0 and before; four hex chars. */ + return read_unicode_escape(readcharfun, 4); default: return c; @@ -2270,6 +2272,113 @@ } #endif +static Lisp_Object +read_string (Lisp_Object readcharfun, Ichar delim, int raw, + int honor_unicode) +{ +#ifdef I18N3 + /* #### If the input stream is translating, then the string + should be marked as translatable by setting its + `string-translatable' property to t. .el and .elc files + normally are translating input streams. See Fgettext() + and print_internal(). */ +#endif + Ichar c; + int cancel = 0; + + Lstream_rewind(XLSTREAM(Vread_buffer_stream)); + while ((c = readchar(readcharfun)) >= 0 && c != delim) + { + if (c == '\\') + { + if (raw) + { + c = readchar(readcharfun); + if (honor_unicode && ('u' == c || 'U' == c)) + { + c = read_unicode_escape(readcharfun, + 'U' == c ? 8 : 4); + } + else + { + /* For raw strings, insert the + backslash and the next char, */ + Lstream_put_ichar(XLSTREAM + (Vread_buffer_stream), + '\\'); + } + } + else + /* otherwise, backslash escapes the next char. */ + c = read_escape(readcharfun); + } + /* c is -1 if \ newline has just been seen */ + if (c == -1) + { + if (Lstream_byte_count + (XLSTREAM(Vread_buffer_stream)) == + 0) + cancel = 1; + } + else + Lstream_put_ichar(XLSTREAM + (Vread_buffer_stream), + c); + QUIT; + } + if (c < 0) + return Fsignal(Qend_of_file, + list1(READCHARFUN_MAYBE(readcharfun))); + + /* If purifying, and string starts with \ newline, + return zero instead. This is for doc strings + that we are really going to find in lib-src/DOC.nn.nn */ + if (purify_flag && NILP(Vinternal_doc_file_name) + && cancel) + return Qzero; + + Lstream_flush(XLSTREAM(Vread_buffer_stream)); + return make_string(resizing_buffer_stream_ptr + (XLSTREAM(Vread_buffer_stream)), + Lstream_byte_count(XLSTREAM(Vread_buffer_stream))); +} + +static Lisp_Object +read_raw_string (Lisp_Object readcharfun) +{ + Ichar c; + Ichar permit_unicode = 0; + + do { + c = reader_nextchar(readcharfun); + switch (c) { + /* #r:engine"my sexy raw string" -- raw string w/ flags*/ + /* case ':': */ + /* #ru"Hi there\u20AC \U000020AC" -- raw string, honouring Unicode. */ + case 'u': + case 'U': + permit_unicode = c; + continue; + + /* #r"my raw string" -- raw string */ + case '\"': + return read_string(readcharfun, '\"', 1, permit_unicode); + /* invalid syntax */ + default: + { + if (permit_unicode) + { + unreadchar(readcharfun, permit_unicode); + } + unreadchar(readcharfun, c); + return Fsignal(Qinvalid_read_syntax, + list1(build_string + ("unrecognized raw string syntax"))); + } + } + } while (1); +} + /* Read the next Lisp object from the stream READCHARFUN and return it. If the return value is a cons whose car is Qunbound, then read1() encountered a misplaced token (e.g. a right bracket, right paren, @@ -2509,6 +2618,8 @@ case 'x': return read_integer (readcharfun, 16); /* #b010 => 2 -- binary constant syntax */ case 'b': return read_integer (readcharfun, 2); + /* #r"raw\stringt" -- raw string syntax */ + case 'r': return read_raw_string(readcharfun); /* #s(foobar key1 val1 key2 val2) -- structure syntax */ case 's': return read_structure (readcharfun); case '<': @@ -2654,48 +2765,8 @@ } case '\"': - { - /* String */ -#ifdef I18N3 - /* #### If the input stream is translating, then the string - should be marked as translatable by setting its - `string-translatable' property to t. .el and .elc files - normally are translating input streams. See Fgettext() - and print_internal(). */ -#endif - int cancel = 0; - - Lstream_rewind (XLSTREAM (Vread_buffer_stream)); - while ((c = readchar (readcharfun)) >= 0 - && c != '\"') - { - if (c == '\\') - c = read_escape (readcharfun); - /* c is -1 if \ newline has just been seen */ - if (c == -1) - { - if (Lstream_byte_count (XLSTREAM (Vread_buffer_stream)) == 0) - cancel = 1; - } - else - Lstream_put_ichar (XLSTREAM (Vread_buffer_stream), c); - QUIT; - } - if (c < 0) - return Fsignal (Qend_of_file, list1 (READCHARFUN_MAYBE (readcharfun))); - - /* If purifying, and string starts with \ newline, - return zero instead. This is for doc strings - that we are really going to find in lib-src/DOC.nn.nn */ - if (purify_flag && NILP (Vinternal_doc_file_name) && cancel) - return Qzero; - - Lstream_flush (XLSTREAM (Vread_buffer_stream)); - return - make_string - (resizing_buffer_stream_ptr (XLSTREAM (Vread_buffer_stream)), - Lstream_byte_count (XLSTREAM (Vread_buffer_stream))); - } + /* String */ + return read_string(readcharfun, '\"', 0, 1); default: {
--- a/tests/ChangeLog Fri Aug 04 21:50:46 2006 +0000 +++ b/tests/ChangeLog Fri Aug 04 22:55:19 2006 +0000 @@ -1,3 +1,10 @@ +2006-08-05 Aidan Kehoe <kehoea@parhasard.net> + + * automated/lisp-reader-tests.el: + New file, imported from Martin Kuehl's SXEmacs commit; test the + new raw string syntax, including the Unicode escapes, which + SXEmacs doesn't have. + 2006-06-24 Stephen J. Turnbull <stephen@xemacs.org> * automated/test-harness.el (Silence-Message): New macro.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/automated/lisp-reader-tests.el Fri Aug 04 22:55:19 2006 +0000 @@ -0,0 +1,80 @@ +;; Copyright (C) 2005 Martin Kuehl. + +;; Author: Martin Kuehl <martin.kuehl@gmail.com> +;; Maintainer: Martin Kuehl <martin.kuehl@gmail.com> +;; Created: 2005 +;; Keywords: tests + +;; This file is NOT part of SXEmacs. + +;; SXEmacs is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 2, or (at your option) +;; any later version. + +;; SXEmacs is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with SXEmacs; see the file COPYING. If not, write to the Free +;; Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA +;; 02111-1307, USA. + +;;; Synched up with: Not in FSF. + +;;; Commentary: + +;; Test the lisp reader. +;; See test-harness.el for instructions on how to run these tests. + +;;; Raw Strings +;;; =========== + +;; Equality to "traditional" strings +;; --------------------------------- +(dolist (strings '((#r"xyz" "xyz") ; no backslashes + (#r"\xyz" "\\xyz") ; backslash at start + (#r"\\xyz" "\\\\xyz") ; backslashes at start + (#r"\nxyz" "\\nxyz") ; escape seq. at start + (#r"\"xyz" "\\\"xyz") ; quote at start + (#r"xy\z" "xy\\z") ; backslash in middle + (#r"xy\\z" "xy\\\\z") ; backslashes in middle + (#r"xy\nz" "xy\\nz") ; escape seq. in middle + (#r"xy\"z" "xy\\\"z") ; quote in middle + ;;(#r"xyz\" "xyz\\") ; backslash at end: error + (#r"xyz\\" "xyz\\\\") ; backslashes at end + (#r"xyz\n" "xyz\\n") ; escape seq. at end + (#r"xyz\"" "xyz\\\"") ; quote at end + (#ru"\u00ABxyz" "\u00ABxyz") ; one Unicode escape + (#rU"\U000000ABxyz" "\U000000ABxyz") ; another Unicode escape + (#rU"xyz\u00AB" "xyz\u00AB") ; one Unicode escape + )) + (Assert (apply #'string= strings))) + +;; Odd number of backslashes at the end +;; ------------------------------------ +(dolist (string '("#r\"xyz\\\"" ; `#r"abc\"': escaped delimiter + "#r\"xyz\\\\\\\"" ; `#r"abc\\\"': escaped delimiter + )) + (with-temp-buffer + (insert string) + (Check-Error end-of-file (eval-buffer)))) + +;; Alternate string/regex delimiters +;; --------------------------------- +(dolist (string '("#r/xyz/" ; Perl syntax + "#r:ix/xyz/" ; Extended Perl syntax + "#r|xyz|" ; TeX syntax + "#r[xyz]" ; (uncommon) Perl syntax + "#r<xyz>" ; Perl6 syntax? + "#r(xyz)" ; arbitrary santax + "#r{xyz}" ; arbitrary santax + "#r,xyz," ; arbitrary santax + "#r!xyz!" ; arbitrary santax + )) + (with-temp-buffer + (insert string) + (Check-Error-Message invalid-read-syntax "unrecognized raw string" + (eval-buffer))))