changeset 4268:75d0292c1bff

[xemacs-hg @ 2007-11-14 19:41:04 by aidan] Correct the dumped information for the Unicode JIT infrastructure.
author aidan
date Wed, 14 Nov 2007 19:41:09 +0000
parents 66e2714696bd
children 609a5762d915
files lisp/ChangeLog lisp/unicode.el src/ChangeLog src/lread.c src/unicode.c
diffstat 5 files changed, 93 insertions(+), 60 deletions(-) [+]
line wrap: on
line diff
--- a/lisp/ChangeLog	Wed Nov 14 19:25:40 2007 +0000
+++ b/lisp/ChangeLog	Wed Nov 14 19:41:09 2007 +0000
@@ -1,3 +1,12 @@
+2007-11-14  Aidan Kehoe  <kehoea@parhasard.net>
+
+	* unicode.el (unicode-error-default-translation-table): 
+	* unicode.el (unicode-error-sequence-regexp-range):
+	* unicode.el (frob-unicode-errors-region):
+	Make these variables and the single function available to
+	make-docfile, by moving them to the start of the line. This
+	conflicts with normal indentation of Lisp, unfortunately. 
+
 2007-11-14  Aidan Kehoe  <kehoea@parhasard.net>
 
 	* subr.el (string-to-sequence):
--- a/lisp/unicode.el	Wed Nov 14 19:25:40 2007 +0000
+++ b/lisp/unicode.el	Wed Nov 14 19:41:09 2007 +0000
@@ -494,36 +494,40 @@
                           (char-syntax ascii-or-latin-1))
                          syntax-table))
 
-  ;; Create all the Unicode error sequences, normally as jit-ucs-charset-0
-  ;; characters starting at U+200000 (which isn't a valid Unicode code
-  ;; point). Make them available to user code. 
-  (defvar unicode-error-default-translation-table
-    (loop 
-      with char-table = (make-char-table 'char)
-      for i from ?\x00 to ?\xFF
-      do
-      (put-char-table (aref
-		       ;; #xd800 is the first leading surrogate;
-		       ;; trailing surrogates must be in the range
-		       ;; #xdc00-#xdfff. These examples are not, so we
-		       ;; intentionally provoke an error sequence.
-		       (decode-coding-string (format "\xd8\x00\x00%c" i)
-					     'utf-16-be)
-		       3)
-		      i
-                      char-table)
-      finally return char-table)
-    "Translation table mapping Unicode error sequences to Latin-1 chars.
+;; *Sigh*, declarations need to be at the start of the line to be picked up
+;; by make-docfile. Not so much an issue with ccl-encode-to-ucs-2, which we
+;; don't necessarily want to advertise, but the following are important.
+
+;; Create all the Unicode error sequences, normally as jit-ucs-charset-0
+;; characters starting at U+200000 (which isn't a valid Unicode code
+;; point). Make them available to user code. 
+(defvar unicode-error-default-translation-table
+  (loop 
+    with char-table = (make-char-table 'char)
+    for i from ?\x00 to ?\xFF
+    do
+    (put-char-table (aref
+                     ;; #xd800 is the first leading surrogate;
+                     ;; trailing surrogates must be in the range
+                     ;; #xdc00-#xdfff. These examples are not, so we
+                     ;; intentionally provoke an error sequence.
+                     (decode-coding-string (format "\xd8\x00\x00%c" i)
+                                           'utf-16-be)
+                     3)
+                    i
+                    char-table)
+    finally return char-table)
+  "Translation table mapping Unicode error sequences to Latin-1 chars.
 
 To transform XEmacs Unicode error sequences to the Latin-1 characters that
 correspond to the octets on disk, you can use this variable.  ")
 
-  (defvar unicode-error-sequence-regexp-range
-    (format "%c%c-%c"
-            (aref (decode-coding-string "\xd8\x00\x00\x00" 'utf-16-be) 0)
-            (aref (decode-coding-string "\xd8\x00\x00\x00" 'utf-16-be) 3)
-            (aref (decode-coding-string "\xd8\x00\x00\xFF" 'utf-16-be) 3))
-    "Regular expression range to match Unicode error sequences in XEmacs.
+(defvar unicode-error-sequence-regexp-range
+  (format "%c%c-%c"
+          (aref (decode-coding-string "\xd8\x00\x00\x00" 'utf-16-be) 0)
+          (aref (decode-coding-string "\xd8\x00\x00\x00" 'utf-16-be) 3)
+          (aref (decode-coding-string "\xd8\x00\x00\xFF" 'utf-16-be) 3))
+  "Regular expression range to match Unicode error sequences in XEmacs.
 
 Invalid Unicode sequences on input are represented as XEmacs
 characters with values stored as the keys in
@@ -559,14 +563,14 @@
 	      nil
 	      (format "Could not find char ?\\x%x in buffer" i))))
 
-  (defun frob-unicode-errors-region (frob-function begin end &optional buffer)
-    "Call FROB-FUNCTION on the Unicode error sequences between BEGIN and END.
+(defun frob-unicode-errors-region (frob-function begin end &optional buffer)
+  "Call FROB-FUNCTION on the Unicode error sequences between BEGIN and END.
 
 Optional argument BUFFER specifies the buffer that should be examined for
 such sequences.  "
-    (check-argument-type #'functionp frob-function)
-    (check-argument-range begin (point-min buffer) (point-max buffer))
-    (check-argument-range end (point-min buffer) (point-max buffer))
+  (check-argument-type #'functionp frob-function)
+  (check-argument-range begin (point-min buffer) (point-max buffer))
+  (check-argument-range end (point-min buffer) (point-max buffer))
     (save-excursion
       (save-restriction
 	(if buffer (set-buffer buffer))
--- a/src/ChangeLog	Wed Nov 14 19:25:40 2007 +0000
+++ b/src/ChangeLog	Wed Nov 14 19:41:09 2007 +0000
@@ -1,3 +1,15 @@
+2007-11-14  Aidan Kehoe  <kehoea@parhasard.net>
+
+	* lread.c (read_unicode_escape):
+	Correct the range check for Unicode characters specified with
+	source-level escapes. 
+	* unicode.c:
+	* unicode.c (unicode_to_ichar):
+	* unicode.c (coding_system_type_create_unicode):
+	Correct the dump behaviour for just-in-time Unicode code
+	points. Update the docstring for #'unicode-to-char to indicate
+	that code points will run out after around 400,000 in a session. 
+
 2007-11-14  Aidan Kehoe  <kehoea@parhasard.net>
 
 	* editfns.c (vars_of_editfns):
--- a/src/lread.c	Wed Nov 14 19:25:40 2007 +0000
+++ b/src/lread.c	Wed Nov 14 19:41:09 2007 +0000
@@ -1694,7 +1694,7 @@
 	}
     }
 
-  if (i > 0x110000 || i < 0)
+  if (i >= 0x110000 || i < 0)
     {
       syntax_error ("Not a Unicode code point", make_int(i));
     }
--- a/src/unicode.c	Wed Nov 14 19:25:40 2007 +0000
+++ b/src/unicode.c	Wed Nov 14 19:41:09 2007 +0000
@@ -336,6 +336,11 @@
 Lisp_Object Qlast_allocated_character;
 Lisp_Object Qccl_encode_to_ucs_2;
 
+Lisp_Object Vnumber_of_jit_charsets;
+Lisp_Object Vlast_jit_charset_final;
+Lisp_Object Vcharset_descr;
+
+
 
 /************************************************************************/
 /*                        Unicode implementation                        */
@@ -1080,8 +1085,6 @@
   int code_levels;
   int i;
   int n = Dynarr_length (charsets);
-  static int number_of_jit_charsets;
-  static Ascbyte last_jit_charset_final;
 
   type_checking_assert (code >= 0);
   /* This shortcut depends on the representation of an Ichar, see text.c.
@@ -1124,33 +1127,21 @@
 	  (-1 == (i = get_free_codepoint(Vcurrent_jit_charset))))
 	{
 	  Ibyte setname[32]; 
-	  Lisp_Object charset_descr = build_string
-	    ("Mule charset for otherwise unknown Unicode code points.");
-
-	  struct gcpro gcpro1;
-
-	  if ('\0' == last_jit_charset_final)
-	    {
-	      /* This final byte shit is, umm, not that cool. */
-	      last_jit_charset_final = 0x30;
-	    }
+	  int number_of_jit_charsets = XINT (Vnumber_of_jit_charsets);
+	  Ascbyte last_jit_charset_final = XCHAR (Vlast_jit_charset_final);
+
+	  /* This final byte shit is, umm, not that cool. */
+	  assert (last_jit_charset_final >= 0x30);
 
 	  /* Assertion added partly because our Win32 layer doesn't
 	     support snprintf; with this, we're sure it won't overflow
 	     the buffer.  */
 	  assert(100 > number_of_jit_charsets);
 
-	  qxesprintf(setname, "jit-ucs-charset-%d", number_of_jit_charsets++);
-
-	  /* Aside: GCPROing here would be overkill according to the FSF's
-	     philosophy. make-charset cannot currently GC, but is intended
-	     to be called from Lisp, with its arguments protected by the
-	     Lisp reader. We GCPRO in case it GCs in the future and no-one
-	     checks all the C callers.  */
-
-	  GCPRO1 (charset_descr);
+	  qxesprintf(setname, "jit-ucs-charset-%d", number_of_jit_charsets);
+
 	  Vcurrent_jit_charset = Fmake_charset 
-	    (intern((const CIbyte *)setname), charset_descr, 
+	    (intern((const CIbyte *)setname), Vcharset_descr, 
 	     /* Set encode-as-utf-8 to t, to have this character set written
 		using UTF-8 escapes in escape-quoted and ctext. This
 		sidesteps the fact that our internal character -> Unicode
@@ -1159,11 +1150,15 @@
 		     nconc2 (list6(Qcolumns, make_int(1), Qchars, make_int(96),
 				   Qdimension, make_int(2)),
 			     list6(Qregistries, Qunicode_registries,
-				   Qfinal, make_char(last_jit_charset_final++),
+				   Qfinal, make_char(last_jit_charset_final),
 				   /* This CCL program is initialised in
 				      unicode.el. */
 				   Qccl_program, Qccl_encode_to_ucs_2))));
-	  UNGCPRO;
+
+	  /* Record for the Unicode infrastructure that we've created
+	     this character set.  */
+	  Vnumber_of_jit_charsets = make_int (number_of_jit_charsets + 1);
+	  Vlast_jit_charset_final = make_char (last_jit_charset_final + 1);
 
 	  i = get_free_codepoint(Vcurrent_jit_charset);
 	} 
@@ -1421,10 +1416,15 @@
 If the CODE would not otherwise be converted to an XEmacs character, and the
 list of character sets to be consulted is nil or the default, a new XEmacs
 character will be created for it in one of the `jit-ucs-charset' Mule
-character sets, and that character will be returned.  There is scope for
-tens of thousands of separate Unicode code points in every session using
-this technique, so despite XEmacs' internal encoding not being based on
-Unicode, your data won't be trashed.
+character sets, and that character will be returned.  
+
+This is limited to around 400,000 characters per XEmacs session, though, so
+while normal usage will not be problematic, things like:
+
+\(dotimes (i #x110000) (decode-char 'ucs i))
+
+will eventually error.  The long-term solution to this is Unicode as an
+internal encoding. 
 */
        (code, USED_IF_MULE (charsets)))
 {
@@ -2862,6 +2862,14 @@
 void
 coding_system_type_create_unicode (void)
 {
+  staticpro (&Vnumber_of_jit_charsets);
+  Vnumber_of_jit_charsets = make_int (0);
+  staticpro (&Vlast_jit_charset_final);
+  Vlast_jit_charset_final = make_char (0x30);
+  staticpro (&Vcharset_descr);
+  Vcharset_descr
+    = build_string ("Mule charset for otherwise unknown Unicode code points.");
+
   INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA (unicode, "unicode-coding-system-p");
   CODING_SYSTEM_HAS_METHOD (unicode, print);
   CODING_SYSTEM_HAS_METHOD (unicode, convert);