Mercurial > hg > xemacs-beta
changeset 4268:75d0292c1bff
[xemacs-hg @ 2007-11-14 19:41:04 by aidan]
Correct the dumped information for the Unicode JIT infrastructure.
author | aidan |
---|---|
date | Wed, 14 Nov 2007 19:41:09 +0000 |
parents | 66e2714696bd |
children | 609a5762d915 |
files | lisp/ChangeLog lisp/unicode.el src/ChangeLog src/lread.c src/unicode.c |
diffstat | 5 files changed, 93 insertions(+), 60 deletions(-) [+] |
line wrap: on
line diff
--- a/lisp/ChangeLog Wed Nov 14 19:25:40 2007 +0000 +++ b/lisp/ChangeLog Wed Nov 14 19:41:09 2007 +0000 @@ -1,3 +1,12 @@ +2007-11-14 Aidan Kehoe <kehoea@parhasard.net> + + * unicode.el (unicode-error-default-translation-table): + * unicode.el (unicode-error-sequence-regexp-range): + * unicode.el (frob-unicode-errors-region): + Make these variables and the single function available to + make-docfile, by moving them to the start of the line. This + conflicts with normal indentation of Lisp, unfortunately. + 2007-11-14 Aidan Kehoe <kehoea@parhasard.net> * subr.el (string-to-sequence):
--- a/lisp/unicode.el Wed Nov 14 19:25:40 2007 +0000 +++ b/lisp/unicode.el Wed Nov 14 19:41:09 2007 +0000 @@ -494,36 +494,40 @@ (char-syntax ascii-or-latin-1)) syntax-table)) - ;; Create all the Unicode error sequences, normally as jit-ucs-charset-0 - ;; characters starting at U+200000 (which isn't a valid Unicode code - ;; point). Make them available to user code. - (defvar unicode-error-default-translation-table - (loop - with char-table = (make-char-table 'char) - for i from ?\x00 to ?\xFF - do - (put-char-table (aref - ;; #xd800 is the first leading surrogate; - ;; trailing surrogates must be in the range - ;; #xdc00-#xdfff. These examples are not, so we - ;; intentionally provoke an error sequence. - (decode-coding-string (format "\xd8\x00\x00%c" i) - 'utf-16-be) - 3) - i - char-table) - finally return char-table) - "Translation table mapping Unicode error sequences to Latin-1 chars. +;; *Sigh*, declarations need to be at the start of the line to be picked up +;; by make-docfile. Not so much an issue with ccl-encode-to-ucs-2, which we +;; don't necessarily want to advertise, but the following are important. + +;; Create all the Unicode error sequences, normally as jit-ucs-charset-0 +;; characters starting at U+200000 (which isn't a valid Unicode code +;; point). Make them available to user code. +(defvar unicode-error-default-translation-table + (loop + with char-table = (make-char-table 'char) + for i from ?\x00 to ?\xFF + do + (put-char-table (aref + ;; #xd800 is the first leading surrogate; + ;; trailing surrogates must be in the range + ;; #xdc00-#xdfff. These examples are not, so we + ;; intentionally provoke an error sequence. + (decode-coding-string (format "\xd8\x00\x00%c" i) + 'utf-16-be) + 3) + i + char-table) + finally return char-table) + "Translation table mapping Unicode error sequences to Latin-1 chars. To transform XEmacs Unicode error sequences to the Latin-1 characters that correspond to the octets on disk, you can use this variable. 
") - (defvar unicode-error-sequence-regexp-range - (format "%c%c-%c" - (aref (decode-coding-string "\xd8\x00\x00\x00" 'utf-16-be) 0) - (aref (decode-coding-string "\xd8\x00\x00\x00" 'utf-16-be) 3) - (aref (decode-coding-string "\xd8\x00\x00\xFF" 'utf-16-be) 3)) - "Regular expression range to match Unicode error sequences in XEmacs. +(defvar unicode-error-sequence-regexp-range + (format "%c%c-%c" + (aref (decode-coding-string "\xd8\x00\x00\x00" 'utf-16-be) 0) + (aref (decode-coding-string "\xd8\x00\x00\x00" 'utf-16-be) 3) + (aref (decode-coding-string "\xd8\x00\x00\xFF" 'utf-16-be) 3)) + "Regular expression range to match Unicode error sequences in XEmacs. Invalid Unicode sequences on input are represented as XEmacs characters with values stored as the keys in @@ -559,14 +563,14 @@ nil (format "Could not find char ?\\x%x in buffer" i)))) - (defun frob-unicode-errors-region (frob-function begin end &optional buffer) - "Call FROB-FUNCTION on the Unicode error sequences between BEGIN and END. +(defun frob-unicode-errors-region (frob-function begin end &optional buffer) + "Call FROB-FUNCTION on the Unicode error sequences between BEGIN and END. Optional argument BUFFER specifies the buffer that should be examined for such sequences. " - (check-argument-type #'functionp frob-function) - (check-argument-range begin (point-min buffer) (point-max buffer)) - (check-argument-range end (point-min buffer) (point-max buffer)) + (check-argument-type #'functionp frob-function) + (check-argument-range begin (point-min buffer) (point-max buffer)) + (check-argument-range end (point-min buffer) (point-max buffer)) (save-excursion (save-restriction (if buffer (set-buffer buffer))
--- a/src/ChangeLog Wed Nov 14 19:25:40 2007 +0000 +++ b/src/ChangeLog Wed Nov 14 19:41:09 2007 +0000 @@ -1,3 +1,15 @@ +2007-11-14 Aidan Kehoe <kehoea@parhasard.net> + + * lread.c (read_unicode_escape): + Correct the range check for Unicode characters specified with + source-level escapes. + * unicode.c: + * unicode.c (unicode_to_ichar): + * unicode.c (coding_system_type_create_unicode): + Correct the dump behaviour for just-in-time Unicode code + points. Update the docstring for #'unicode-to-char to indicate + that code points will run out at around 400,000 per session. + 2007-11-14 Aidan Kehoe <kehoea@parhasard.net> * editfns.c (vars_of_editfns):
--- a/src/lread.c Wed Nov 14 19:25:40 2007 +0000 +++ b/src/lread.c Wed Nov 14 19:41:09 2007 +0000 @@ -1694,7 +1694,7 @@ } } - if (i > 0x110000 || i < 0) + if (i >= 0x110000 || i < 0) { syntax_error ("Not a Unicode code point", make_int(i)); }
--- a/src/unicode.c Wed Nov 14 19:25:40 2007 +0000 +++ b/src/unicode.c Wed Nov 14 19:41:09 2007 +0000 @@ -336,6 +336,11 @@ Lisp_Object Qlast_allocated_character; Lisp_Object Qccl_encode_to_ucs_2; +Lisp_Object Vnumber_of_jit_charsets; +Lisp_Object Vlast_jit_charset_final; +Lisp_Object Vcharset_descr; + + /************************************************************************/ /* Unicode implementation */ @@ -1080,8 +1085,6 @@ int code_levels; int i; int n = Dynarr_length (charsets); - static int number_of_jit_charsets; - static Ascbyte last_jit_charset_final; type_checking_assert (code >= 0); /* This shortcut depends on the representation of an Ichar, see text.c. @@ -1124,33 +1127,21 @@ (-1 == (i = get_free_codepoint(Vcurrent_jit_charset)))) { Ibyte setname[32]; - Lisp_Object charset_descr = build_string - ("Mule charset for otherwise unknown Unicode code points."); - - struct gcpro gcpro1; - - if ('\0' == last_jit_charset_final) - { - /* This final byte shit is, umm, not that cool. */ - last_jit_charset_final = 0x30; - } + int number_of_jit_charsets = XINT (Vnumber_of_jit_charsets); + Ascbyte last_jit_charset_final = XCHAR (Vlast_jit_charset_final); + + /* This final byte shit is, umm, not that cool. */ + assert (last_jit_charset_final >= 0x30); /* Assertion added partly because our Win32 layer doesn't support snprintf; with this, we're sure it won't overflow the buffer. */ assert(100 > number_of_jit_charsets); - qxesprintf(setname, "jit-ucs-charset-%d", number_of_jit_charsets++); - - /* Aside: GCPROing here would be overkill according to the FSF's - philosophy. make-charset cannot currently GC, but is intended - to be called from Lisp, with its arguments protected by the - Lisp reader. We GCPRO in case it GCs in the future and no-one - checks all the C callers. 
*/ - - GCPRO1 (charset_descr); + qxesprintf(setname, "jit-ucs-charset-%d", number_of_jit_charsets); + Vcurrent_jit_charset = Fmake_charset - (intern((const CIbyte *)setname), charset_descr, + (intern((const CIbyte *)setname), Vcharset_descr, /* Set encode-as-utf-8 to t, to have this character set written using UTF-8 escapes in escape-quoted and ctext. This sidesteps the fact that our internal character -> Unicode @@ -1159,11 +1150,15 @@ nconc2 (list6(Qcolumns, make_int(1), Qchars, make_int(96), Qdimension, make_int(2)), list6(Qregistries, Qunicode_registries, - Qfinal, make_char(last_jit_charset_final++), + Qfinal, make_char(last_jit_charset_final), /* This CCL program is initialised in unicode.el. */ Qccl_program, Qccl_encode_to_ucs_2)))); - UNGCPRO; + + /* Record for the Unicode infrastructure that we've created + this character set. */ + Vnumber_of_jit_charsets = make_int (number_of_jit_charsets + 1); + Vlast_jit_charset_final = make_char (last_jit_charset_final + 1); i = get_free_codepoint(Vcurrent_jit_charset); } @@ -1421,10 +1416,15 @@ If the CODE would not otherwise be converted to an XEmacs character, and the list of character sets to be consulted is nil or the default, a new XEmacs character will be created for it in one of the `jit-ucs-charset' Mule -character sets, and that character will be returned. There is scope for -tens of thousands of separate Unicode code points in every session using -this technique, so despite XEmacs' internal encoding not being based on -Unicode, your data won't be trashed. +character sets, and that character will be returned. + +This is limited to around 400,000 characters per XEmacs session, though, so +while normal usage will not be problematic, things like: + +\(dotimes (i #x110000) (decode-char 'ucs i)) + +will eventually error. The long-term solution to this is Unicode as an +internal encoding. 
*/ (code, USED_IF_MULE (charsets))) { @@ -2862,6 +2862,14 @@ void coding_system_type_create_unicode (void) { + staticpro (&Vnumber_of_jit_charsets); + Vnumber_of_jit_charsets = make_int (0); + staticpro (&Vlast_jit_charset_final); + Vlast_jit_charset_final = make_char (0x30); + staticpro (&Vcharset_descr); + Vcharset_descr + = build_string ("Mule charset for otherwise unknown Unicode code points."); + INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA (unicode, "unicode-coding-system-p"); CODING_SYSTEM_HAS_METHOD (unicode, print); CODING_SYSTEM_HAS_METHOD (unicode, convert);