changeset 3543:c136144fe765

[xemacs-hg @ 2006-08-04 22:55:04 by aidan] Raw strings, from Python via SXEmacs
author aidan
date Fri, 04 Aug 2006 22:55:19 +0000
parents 1ce31579a443
children 981a144e71fa
files lisp/ChangeLog lisp/lisp.el man/ChangeLog man/lispref/objects.texi src/ChangeLog src/lread.c tests/ChangeLog tests/automated/lisp-reader-tests.el
diffstat 8 files changed, 295 insertions(+), 99 deletions(-) [+]
line wrap: on
line diff
--- a/lisp/ChangeLog	Fri Aug 04 21:50:46 2006 +0000
+++ b/lisp/ChangeLog	Fri Aug 04 22:55:19 2006 +0000
@@ -1,3 +1,9 @@
+2006-08-05  Aidan Kehoe  <kehoea@parhasard.net>
+
+	* lisp.el (forward-sexp):
+	Handle raw strings specially just as we do structures. Fixes
+	problems evaluating them in *scratch*. 
+
 2006-08-04  Aidan Kehoe  <kehoea@parhasard.net>
 
 	* iso8859-1.el:
--- a/lisp/lisp.el	Fri Aug 04 21:50:46 2006 +0000
+++ b/lisp/lisp.el	Fri Aug 04 22:55:19 2006 +0000
@@ -60,19 +60,20 @@
   (interactive "_p")
   (or arg (setq arg 1))
   ;; XEmacs: evil hack! The other half of the evil hack below.
-  (if (and (> arg 0) (looking-at "#s("))
-      (goto-char (+ (point) 2)))
+  (if (and (> arg 0) (looking-at "#s(\\|#r[uU]\\{0,1\\}\""))
+    (goto-char (1+ (- (point) (- (match-end 0) (match-beginning 0))))))
   (goto-char (or (scan-sexps (point) arg) (buffer-end arg)))
-  (if (< arg 0) (backward-prefix-chars))
-  ;; XEmacs: evil hack! Skip back over #s so that structures are read
-  ;; properly.  the current cheesified syntax tables just aren't up to
-  ;; this.
-  (if (and (< arg 0)
-	   (eq (char-after (point)) ?\()
-	   (>= (- (point) (point-min)) 2)
-	   (eq (char-after (- (point) 1)) ?s)
-	   (eq (char-after (- (point) 2)) ?#))
-      (goto-char (- (point) 2))))
+  (when (< arg 0) 
+    (backward-prefix-chars)
+    ;; XEmacs: evil hack! Skip back over #[sr] so that structures and raw
+    ;; strings are read properly.  the current cheesified syntax tables just
+    ;; aren't up to this.
+    (let* ((diff (- (point) (point-min)))
+	   (subject (buffer-substring (- (point) (min diff 3))
+				      (1+ (point))))
+	   (matched (string-match "#s(\\|#r[uU]\\{0,1\\}\"" subject)))
+      (if matched
+	(goto-char (1+ (- (point) (- (length subject) matched))))))))
 
 (defun backward-sexp (&optional arg)
   "Move backward across one balanced expression (sexp).
--- a/man/ChangeLog	Fri Aug 04 21:50:46 2006 +0000
+++ b/man/ChangeLog	Fri Aug 04 22:55:19 2006 +0000
@@ -1,3 +1,9 @@
+2006-08-05  Aidan Kehoe  <kehoea@parhasard.net>
+
+	* lispref/objects.texi (String Type):
+	Give details of the raw string syntax, taken from SXEmacs and
+	Python. 
+
 2006-07-19  Stephen J. Turnbull  <stephen@xemacs.org>
 
 	* new-users-guide/edit.texi (Insert): Document bogosity in
--- a/man/lispref/objects.texi	Fri Aug 04 21:50:46 2006 +0000
+++ b/man/lispref/objects.texi	Fri Aug 04 22:55:19 2006 +0000
@@ -1079,6 +1079,16 @@
 escape any backslash or double-quote characters in the string with a
 backslash, like this: @code{"this \" is an embedded quote"}.
 
+ An alternative syntax allows insertion of raw backslashes into a
+string, like this: @code{#r"this \ is an embedded backslash"}.  In  such
+a string, each character following a backslash is included literally in
+the string, and all backslashes are left in the string.  This means that
+@code{#r"\""} is a valid string literal with two characters, a backslash and a
+double-quote.  It also means that a string  with this syntax @emph{cannot end
+in a single backslash}.  As with Python, from where this syntax was
+taken, you can specify @code{u} or @code{U} after the @code{#r} to
+specify that interpretation of Unicode escapes should be done. 
+
   The newline character is not special in the read syntax for strings;
 if you write a new line between the double-quotes, it becomes a
 character in the string.  But an escaped newline---one that is preceded
--- a/src/ChangeLog	Fri Aug 04 21:50:46 2006 +0000
+++ b/src/ChangeLog	Fri Aug 04 22:55:19 2006 +0000
@@ -1,3 +1,18 @@
+2006-08-05  Aidan Kehoe  <kehoea@parhasard.net>
+
+	* lread.c (read_unicode_escape):
+	Refactor this code out from read_escape, since it's now called
+	from read_string as well. 
+	* lread.c (read_escape):
+	Call read_unicode_escape instead of using inline code, 
+	* lread.c (read_string):
+	Refactor out from read1, provide raw and honor_unicode options. 
+	* lread.c (read_raw_string):
+	Added, a function that calls read_string with the correct
+	arguments for a raw string. 
+	* lread.c (read1):
+	Pass raw strings to read_raw_string; pass strings to read_string.
+
 2006-08-04  Aidan Kehoe  <kehoea@parhasard.net>
 
 	* event-tty.c (emacs_tty_next_event):
--- a/src/lread.c	Fri Aug 04 21:50:46 2006 +0000
+++ b/src/lread.c	Fri Aug 04 22:55:19 2006 +0000
@@ -1670,15 +1670,56 @@
 
   return val;
 }
+
+/* A Unicode escape, as in C# (though we only permit them in strings
+   and characters, not arbitrarily in the source code.) */
+static Ichar
+read_unicode_escape (Lisp_Object readcharfun, int unicode_hex_count)
+{
+  REGISTER Ichar i = 0, c;
+  REGISTER int count = 0;
+  Lisp_Object lisp_char;
+  while (++count <= unicode_hex_count)
+    {
+      c = readchar (readcharfun);
+      /* Remember, can't use isdigit(), isalpha() etc. on Ichars */
+      if      (c >= '0' && c <= '9')  i = (i << 4) + (c - '0');
+      else if (c >= 'a' && c <= 'f')  i = (i << 4) + (c - 'a') + 10;
+      else if (c >= 'A' && c <= 'F')  i = (i << 4) + (c - 'A') + 10;
+      else
+	{
+	  syntax_error ("Non-hex digit used for Unicode escape",
+			make_char (c));
+	  break;
+	}
+    }
+
+  lisp_char = Funicode_to_char(make_int(i), Qnil);
+
+  if (EQ(Qnil, lisp_char))
+    {
+      /* This is ugly and horrible and trashes the user's data, but
+	 it's what unicode.c does. In the future, unicode-to-char
+	 should not return nil.  */
+#ifdef MULE
+      i = make_ichar (Vcharset_japanese_jisx0208, 34 + 128, 46 + 128);
+#else
+      i = '~';
+#endif
+      return i;
+    }
+  else
+    {
+      return XCHAR(lisp_char);
+    }
+}
+
 
 static Ichar
 read_escape (Lisp_Object readcharfun)
 {
   /* This function can GC */
   Ichar c = readchar (readcharfun);
-  /* \u allows up to four hex digits, \U up to eight. Default to the
-     behaviour for \u, and change this value in the case that \U is seen. */
-  int unicode_hex_count = 4;
 
   if (c < 0)
     signal_error (Qend_of_file, 0, READCHARFUN_MAYBE (readcharfun));
@@ -1797,49 +1838,10 @@
       }
     case 'U':
       /* Post-Unicode-2.0: Up to eight hex chars */
-      unicode_hex_count = 8;
+      return read_unicode_escape(readcharfun, 8);
     case 'u':
-
-      /* A Unicode escape, as in C# (though we only permit them in strings
-	 and characters, not arbitrarily in the source code.) */
-      {
-	REGISTER Ichar i = 0;
-	REGISTER int count = 0;
-	Lisp_Object lisp_char;
-	while (++count <= unicode_hex_count)
-	  {
-	    c = readchar (readcharfun);
-	    /* Remember, can't use isdigit(), isalpha() etc. on Ichars */
-	    if      (c >= '0' && c <= '9')  i = (i << 4) + (c - '0');
-	    else if (c >= 'a' && c <= 'f')  i = (i << 4) + (c - 'a') + 10;
-            else if (c >= 'A' && c <= 'F')  i = (i << 4) + (c - 'A') + 10;
-	    else
-	      {
-		syntax_error ("Non-hex digit used for Unicode escape",
-			      make_char (c));
-		break;
-	      }
-	  }
-
-	lisp_char = Funicode_to_char(make_int(i), Qnil);
-
-	if (EQ(Qnil, lisp_char))
-	  {
-	    /* This is ugly and horrible and trashes the user's data, but
-	       it's what unicode.c does. In the future, unicode-to-char
-	       should not return nil.  */
-#ifdef MULE
-	    i = make_ichar (Vcharset_japanese_jisx0208, 34 + 128, 46 + 128);
-#else
-	    i = '~';
-#endif
-            return i;
-	  }
-	else
-	  {
-	    return XCHAR(lisp_char);
-	  }
-      }
+      /* Unicode-2.0 and before; four hex chars. */
+      return read_unicode_escape(readcharfun, 4);
 
     default:
 	return c;
@@ -2270,6 +2272,113 @@
 }
 #endif
 
+static Lisp_Object
+read_string (Lisp_Object readcharfun, Ichar delim, int raw, 
+	     int honor_unicode)
+{
+#ifdef I18N3
+  /* #### If the input stream is translating, then the string
+     should be marked as translatable by setting its
+     `string-translatable' property to t.  .el and .elc files
+     normally are translating input streams.  See Fgettext()
+     and print_internal(). */
+#endif
+  Ichar c;
+  int cancel = 0;
+
+  Lstream_rewind(XLSTREAM(Vread_buffer_stream));
+  while ((c = readchar(readcharfun)) >= 0 && c != delim)
+    {
+    if (c == '\\') 
+      {
+	if (raw) 
+	  {
+	    c = readchar(readcharfun);
+	    if (honor_unicode && ('u' == c || 'U' == c))
+	      {
+		c = read_unicode_escape(readcharfun,
+					'U' == c ? 8 : 4);
+	      }
+	    else
+	      {
+		/* For raw strings, insert the
+		   backslash and the next char, */
+		Lstream_put_ichar(XLSTREAM
+				  (Vread_buffer_stream),
+				  '\\');
+	      }
+	  } 
+	else
+	  /* otherwise, backslash escapes the next char. */
+	  c = read_escape(readcharfun);
+      }
+    /* c is -1 if \ newline has just been seen */
+    if (c == -1) 
+      {
+	if (Lstream_byte_count
+	  (XLSTREAM(Vread_buffer_stream)) ==
+	  0)
+	  cancel = 1;
+      } 
+    else
+      Lstream_put_ichar(XLSTREAM
+			 (Vread_buffer_stream),
+			 c);
+    QUIT;
+    }
+  if (c < 0)
+    return Fsignal(Qend_of_file,
+		   list1(READCHARFUN_MAYBE(readcharfun)));
+
+  /* If purifying, and string starts with \ newline,
+     return zero instead.  This is for doc strings
+     that we are really going to find in lib-src/DOC.nn.nn  */
+  if (purify_flag && NILP(Vinternal_doc_file_name)
+      && cancel)
+    return Qzero;
+
+  Lstream_flush(XLSTREAM(Vread_buffer_stream));
+  return make_string(resizing_buffer_stream_ptr
+		     (XLSTREAM(Vread_buffer_stream)),
+		     Lstream_byte_count(XLSTREAM(Vread_buffer_stream)));
+}
+
+static Lisp_Object
+read_raw_string (Lisp_Object readcharfun)
+{
+  Ichar c;
+  Ichar permit_unicode = 0; 
+
+  do {
+    c = reader_nextchar(readcharfun);
+    switch (c) {
+      /* #r:engine"my sexy raw string" -- raw string w/ flags*/
+      /* case ':': */
+      /* #ru"Hi there\u20AC \U000020AC" -- raw string, honouring Unicode. */
+    case 'u':
+    case 'U':
+      permit_unicode = c; 
+      continue;
+
+      /* #r"my raw string" -- raw string */
+    case '\"':
+      return read_string(readcharfun, '\"', 1, permit_unicode);
+      /* invalid syntax */
+    default:
+      {
+	if (permit_unicode)
+	  {
+	    unreadchar(readcharfun, permit_unicode);
+	  }
+	unreadchar(readcharfun, c);
+	return Fsignal(Qinvalid_read_syntax,
+		       list1(build_string
+			     ("unrecognized raw string syntax")));
+      }
+    }
+  } while (1);
+}
+
 /* Read the next Lisp object from the stream READCHARFUN and return it.
    If the return value is a cons whose car is Qunbound, then read1()
    encountered a misplaced token (e.g. a right bracket, right paren,
@@ -2509,6 +2618,8 @@
 	  case 'x': return read_integer (readcharfun, 16);
             /* #b010 => 2 -- binary constant syntax */
 	  case 'b': return read_integer (readcharfun, 2);
+	    /* #r"raw\stringt" -- raw string syntax */
+	  case 'r': return read_raw_string(readcharfun);
             /* #s(foobar key1 val1 key2 val2) -- structure syntax */
 	  case 's': return read_structure (readcharfun);
 	  case '<':
@@ -2654,48 +2765,8 @@
       }
 
     case '\"':
-      {
-	/* String */
-#ifdef I18N3
-	/* #### If the input stream is translating, then the string
-	   should be marked as translatable by setting its
-	   `string-translatable' property to t.  .el and .elc files
-	   normally are translating input streams.  See Fgettext()
-	   and print_internal(). */
-#endif
-	int cancel = 0;
-
-	Lstream_rewind (XLSTREAM (Vread_buffer_stream));
-	while ((c = readchar (readcharfun)) >= 0
-	       && c != '\"')
-	  {
-	    if (c == '\\')
-	      c = read_escape (readcharfun);
-	    /* c is -1 if \ newline has just been seen */
-	    if (c == -1)
-	      {
-		if (Lstream_byte_count (XLSTREAM (Vread_buffer_stream)) == 0)
-		  cancel = 1;
-	      }
-	    else
-	      Lstream_put_ichar (XLSTREAM (Vread_buffer_stream), c);
-	    QUIT;
-	  }
-	if (c < 0)
-	  return Fsignal (Qend_of_file, list1 (READCHARFUN_MAYBE (readcharfun)));
-
-	/* If purifying, and string starts with \ newline,
-	   return zero instead.  This is for doc strings
-	   that we are really going to find in lib-src/DOC.nn.nn  */
-	if (purify_flag && NILP (Vinternal_doc_file_name) && cancel)
-	  return Qzero;
-
-	Lstream_flush (XLSTREAM (Vread_buffer_stream));
-	return
-	  make_string
-	  (resizing_buffer_stream_ptr (XLSTREAM (Vread_buffer_stream)),
-	   Lstream_byte_count (XLSTREAM (Vread_buffer_stream)));
-      }
+      /* String */
+      return read_string(readcharfun, '\"', 0, 1);
 
     default:
       {
--- a/tests/ChangeLog	Fri Aug 04 21:50:46 2006 +0000
+++ b/tests/ChangeLog	Fri Aug 04 22:55:19 2006 +0000
@@ -1,3 +1,10 @@
+2006-08-05  Aidan Kehoe  <kehoea@parhasard.net>
+
+	* automated/lisp-reader-tests.el:
+	New file, imported from Martin Kuehl's SXEmacs commit; test the
+	new raw string syntax, including the Unicode escapes, which
+	SXEmacs doesn't have. 
+
 2006-06-24  Stephen J. Turnbull  <stephen@xemacs.org>
 
 	* automated/test-harness.el (Silence-Message): New macro.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/automated/lisp-reader-tests.el	Fri Aug 04 22:55:19 2006 +0000
@@ -0,0 +1,80 @@
+;; Copyright (C) 2005 Martin Kuehl.
+
+;; Author: Martin Kuehl <martin.kuehl@gmail.com>
+;; Maintainer: Martin Kuehl <martin.kuehl@gmail.com>
+;; Created: 2005
+;; Keywords: tests
+
+;; This file is NOT part of SXEmacs.
+
+;; SXEmacs is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+
+;; SXEmacs is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;; General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with SXEmacs; see the file COPYING.  If not, write to the Free
+;; Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+;; 02111-1307, USA.
+
+;;; Synched up with: Not in FSF.
+
+;;; Commentary:
+
+;; Test the lisp reader.
+;; See test-harness.el for instructions on how to run these tests.
+
+;;; Raw Strings
+;;; ===========
+
+;; Equality to "traditional" strings
+;; ---------------------------------
+(dolist (strings '((#r"xyz"   "xyz")	 ; no backslashes
+		   (#r"\xyz"  "\\xyz")   ; backslash at start
+                   (#r"\\xyz" "\\\\xyz") ; backslashes at start
+                   (#r"\nxyz" "\\nxyz")  ; escape seq. at start
+                   (#r"\"xyz" "\\\"xyz") ; quote at start
+                   (#r"xy\z"  "xy\\z")   ; backslash in middle
+                   (#r"xy\\z" "xy\\\\z") ; backslashes in middle
+                   (#r"xy\nz" "xy\\nz")  ; escape seq. in middle
+                   (#r"xy\"z" "xy\\\"z") ; quote in middle
+                   ;;(#r"xyz\"  "xyz\\")   ; backslash at end: error
+                   (#r"xyz\\" "xyz\\\\") ; backslashes at end
+                   (#r"xyz\n" "xyz\\n")  ; escape seq. at end
+                   (#r"xyz\"" "xyz\\\"") ; quote at end
+		   (#ru"\u00ABxyz" "\u00ABxyz") ; one Unicode escape
+		   (#rU"\U000000ABxyz" "\U000000ABxyz") ; another Unicode escape
+		   (#rU"xyz\u00AB" "xyz\u00AB") ; one Unicode escape
+                   ))
+  (Assert (apply #'string= strings)))
+
+;; Odd number of backslashes at the end
+;; ------------------------------------
+(dolist (string '("#r\"xyz\\\""         ; `#r"abc\"': escaped delimiter
+                  "#r\"xyz\\\\\\\""     ; `#r"abc\\\"': escaped delimiter
+                  ))
+  (with-temp-buffer
+    (insert string)
+    (Check-Error end-of-file (eval-buffer))))
+
+;; Alternate string/regex delimiters
+;; ---------------------------------
+(dolist (string '("#r/xyz/"             ; Perl syntax
+                  "#r:ix/xyz/"          ; Extended Perl syntax
+                  "#r|xyz|"             ; TeX syntax
+                  "#r[xyz]"             ; (uncommon) Perl syntax
+                  "#r<xyz>"             ; Perl6 syntax?
+                  "#r(xyz)"             ; arbitrary santax
+                  "#r{xyz}"             ; arbitrary santax
+                  "#r,xyz,"             ; arbitrary santax
+                  "#r!xyz!"             ; arbitrary santax
+                  ))
+  (with-temp-buffer
+    (insert string)
+    (Check-Error-Message invalid-read-syntax "unrecognized raw string"
+                         (eval-buffer))))