Mercurial > hg > xemacs-beta
diff src/unicode.c @ 5118:e0db3c197671 ben-lisp-object
merge up to latest default branch, doesn't compile yet
author | Ben Wing <ben@xemacs.org> |
---|---|
date | Sat, 26 Dec 2009 21:18:49 -0600 |
parents | b9aaf2a18957 |
children | c12b646d84ee |
line wrap: on
line diff
--- a/src/unicode.c Sat Dec 26 00:20:27 2009 -0600 +++ b/src/unicode.c Sat Dec 26 21:18:49 2009 -0600 @@ -41,6 +41,10 @@ #include "file-coding.h" #include "opaque.h" +#include "buffer.h" +#include "rangetab.h" +#include "extents.h" + #include "sysfile.h" /* For more info about how Unicode works under Windows, see intl-win32.c. */ @@ -146,13 +150,6 @@ (1) User-defined charsets: It would be inconvenient to require all dumped user-defined charsets to be reloaded at init time. - (2) Starting up in a non-ISO-8859-1 directory. If we load at run-time, - we don't load the tables until after we've parsed the current - directories, and we run into a real bootstrapping problem, if the - directories themselves are non-ISO-8859-1. This is potentially fixable - once we switch to using Unicode internally, so we don't have to do any - conversion (other than the automatic kind, e.g. UTF-16 to UTF-8). - NB With run-time loading, we load in init-mule-at-startup, in mule-cmds.el. This is called from startup.el, which is quite late in the initialization process -- but data-directory isn't set until then. @@ -192,7 +189,7 @@ convert them back.) */ Lisp_Object Qunicode; -Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7; +Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7, Qutf_32; Lisp_Object Qneed_bom; Lisp_Object Qutf_16_little_endian, Qutf_16_bom; @@ -200,13 +197,40 @@ Lisp_Object Qutf_8_bom; +#ifdef MULE +/* These range tables are not directly accessible from Lisp: */ +static Lisp_Object Vunicode_invalid_and_query_skip_chars; +static Lisp_Object Vutf_8_invalid_and_query_skip_chars; +static Lisp_Object Vunicode_query_skip_chars; + +static Lisp_Object Vunicode_query_string, Vunicode_invalid_string, + Vutf_8_invalid_string; +#endif /* MULE */ + +/* See the Unicode FAQ, http://www.unicode.org/faq/utf_bom.html#35 for this + algorithm. + + (They also give another, really verbose one, as part of their explanation + of the various planes of the encoding, but we won't use that.) */ + +#define UTF_16_LEAD_OFFSET (0xD800 - (0x10000 >> 10)) +#define UTF_16_SURROGATE_OFFSET (0x10000 - (0xD800 << 10) - 0xDC00) + +#define utf_16_surrogates_to_code(lead, trail) \ + (((lead) << 10) + (trail) + UTF_16_SURROGATE_OFFSET) + +#define CODE_TO_UTF_16_SURROGATES(codepoint, lead, trail) do { \ + int __ctu16s_code = (codepoint); \ + lead = UTF_16_LEAD_OFFSET + (__ctu16s_code >> 10); \ + trail = 0xDC00 + (__ctu16s_code & 0x3FF); \ +} while (0) + #ifdef MULE -/* #### Using ints for to_unicode is OK (as long as they are >= 32 bits). - However, shouldn't the shorts below be unsigned? - - Answer: Doesn't matter because the values being converted to are only - 96x96. */ +/* Using ints for to_unicode is OK (as long as they are >= 32 bits). + In from_unicode, we're converting from Mule characters, which means + that the values being converted to are only 96x96, and we can save + space by using shorts (signedness doesn't matter). */ static int *to_unicode_blank_1; static int **to_unicode_blank_2; @@ -322,6 +346,15 @@ Lisp_Object Qignore_first_column; +Lisp_Object Vcurrent_jit_charset; +Lisp_Object Qlast_allocated_character; +Lisp_Object Qccl_encode_to_ucs_2; + +Lisp_Object Vnumber_of_jit_charsets; +Lisp_Object Vlast_jit_charset_final; +Lisp_Object Vcharset_descr; + + /************************************************************************/ /* Unicode implementation */ @@ -1002,6 +1035,64 @@ } static Ichar +get_free_codepoint(Lisp_Object charset) +{ + Lisp_Object name = Fcharset_name(charset); + Lisp_Object zeichen = Fget(name, Qlast_allocated_character, Qnil); + Ichar res; + + /* Only allow this with the 96x96 character sets we are using for + temporary Unicode support. */ + assert(2 == XCHARSET_DIMENSION(charset) && 96 == XCHARSET_CHARS(charset)); + + if (!NILP(zeichen)) + { + int c1, c2; + + BREAKUP_ICHAR(XCHAR(zeichen), charset, c1, c2); + + if (127 == c1 && 127 == c2) + { + /* We've already used the hightest-numbered character in this + set--tell our caller to create another. */ + return -1; + } + + if (127 == c2) + { + ++c1; + c2 = 0x20; + } + else + { + ++c2; + } + + res = make_ichar(charset, c1, c2); + Fput(name, Qlast_allocated_character, make_char(res)); + } + else + { + res = make_ichar(charset, 32, 32); + Fput(name, Qlast_allocated_character, make_char(res)); + } + return res; +} + +/* The just-in-time creation of XEmacs characters that correspond to unknown + Unicode code points happens when: + + 1. The lookup would otherwise fail. + + 2. The charsets array is the nil or the default. + + If there are no free code points in the just-in-time Unicode character + set, and the charsets array is the default unicode precedence list, + create a new just-in-time Unicode character set, add it at the end of the + unicode precedence list, create the XEmacs character in that character + set, and return it. */ + +static Ichar unicode_to_ichar (int code, Lisp_Object_dynarr *charsets) { int u1, u2, u3, u4; @@ -1041,8 +1132,59 @@ return make_ichar (charset, retval >> 8, retval & 0xFF); } } - - return (Ichar) -1; + + /* Only do the magic just-in-time assignment if we're using the default + list. */ + if (unicode_precedence_dynarr == charsets) + { + if (NILP (Vcurrent_jit_charset) || + (-1 == (i = get_free_codepoint(Vcurrent_jit_charset)))) + { + Ibyte setname[32]; + int number_of_jit_charsets = XINT (Vnumber_of_jit_charsets); + Ascbyte last_jit_charset_final = XCHAR (Vlast_jit_charset_final); + + /* This final byte shit is, umm, not that cool. */ + assert (last_jit_charset_final >= 0x30); + + /* Assertion added partly because our Win32 layer doesn't + support snprintf; with this, we're sure it won't overflow + the buffer. */ + assert(100 > number_of_jit_charsets); + + qxesprintf(setname, "jit-ucs-charset-%d", number_of_jit_charsets); + + Vcurrent_jit_charset = Fmake_charset + (intern((const CIbyte *)setname), Vcharset_descr, + /* Set encode-as-utf-8 to t, to have this character set written + using UTF-8 escapes in escape-quoted and ctext. This + sidesteps the fact that our internal character -> Unicode + mapping is not stable from one invocation to the next. */ + nconc2 (list2(Qencode_as_utf_8, Qt), + nconc2 (list6(Qcolumns, make_int(1), Qchars, make_int(96), + Qdimension, make_int(2)), + list6(Qregistries, Qunicode_registries, + Qfinal, make_char(last_jit_charset_final), + /* This CCL program is initialised in + unicode.el. */ + Qccl_program, Qccl_encode_to_ucs_2)))); + + /* Record for the Unicode infrastructure that we've created + this character set. */ + Vnumber_of_jit_charsets = make_int (number_of_jit_charsets + 1); + Vlast_jit_charset_final = make_char (last_jit_charset_final + 1); + + i = get_free_codepoint(Vcurrent_jit_charset); + } + + if (-1 != i) + { + set_unicode_conversion((Ichar)i, code); + /* No need to add the charset to the end of the list; it's done + automatically. */ + } + } + return (Ichar) i; } /* Add charsets to precedence list. @@ -1285,37 +1427,18 @@ present), this function simply does `int-to-char' and ignores the CHARSETS argument. -Note that the current XEmacs internal encoding has no mapping for many -Unicode code points, and if you use characters that are vaguely obscure with -XEmacs' Unicode coding systems, you will lose data. - -To add support for some desired code point in the short term--note that our -intention is to move to a Unicode-compatible internal encoding soon, for -some value of soon--if you are a distributor, add something like the -following to `site-start.el.' - -(make-charset 'distro-name-private - "Private character set for DISTRO" - '(dimension 1 - chars 96 - columns 1 - final ?5 ;; Change this--see docs for make-charset - long-name "Private charset for some Unicode char support." - short-name "Distro-Private")) - -(set-unicode-conversion - (make-char 'distro-name-private #x20) #x263A) ;; WHITE SMILING FACE - -(set-unicode-conversion - (make-char 'distro-name-private #x21) #x3030) ;; WAVY DASH - -;; ... -;;; Repeat as necessary. - -Redisplay will work on the sjt-xft branch, but not with server-side X11 -fonts as is the default. However, data read in will be preserved when they -are written out again. - +If the CODE would not otherwise be converted to an XEmacs character, and the +list of character sets to be consulted is nil or the default, a new XEmacs +character will be created for it in one of the `jit-ucs-charset' Mule +character sets, and that character will be returned. + +This is limited to around 400,000 characters per XEmacs session, though, so +while normal usage will not be problematic, things like: + +\(dotimes (i #x110000) (decode-char 'ucs i)) + +will eventually error. The long-term solution to this is Unicode as an +internal encoding. */ (code, USED_IF_MULE (charsets))) { @@ -1559,16 +1682,6 @@ /* Unicode coding system */ /************************************************************************/ -/* ISO 10646 UTF-16, UCS-4, UTF-8, UTF-7, etc. */ - -enum unicode_type -{ - UNICODE_UTF_16, - UNICODE_UTF_8, - UNICODE_UTF_7, - UNICODE_UCS_4 -}; - struct unicode_coding_system { enum unicode_type type; @@ -1593,6 +1706,7 @@ { /* decode */ unsigned char counter; + unsigned char indicated_length; int seen_char; /* encode */ Lisp_Object current_charset; @@ -1606,11 +1720,6 @@ DEFINE_CODING_SYSTEM_TYPE_WITH_DATA (unicode); -/* Decode a UCS-2 or UCS-4 character into a buffer. If the lookup fails, use - <GETA MARK> (U+3013) of JIS X 0208, which means correct character - is not found, instead. - #### do something more appropriate (use blob?) - Danger, Will Robinson! Data loss. Should we signal user? */ static void decode_unicode_char (int ch, unsigned_char_dynarr *dst, struct unicode_coding_stream *data, @@ -1645,39 +1754,140 @@ data->seen_char = 1; } +#define DECODE_ERROR_OCTET(octet, dst, data, ignore_bom) \ + decode_unicode_char ((octet) + UNICODE_ERROR_OCTET_RANGE_START, \ + dst, data, ignore_bom) + +static inline void +indicate_invalid_utf_8 (unsigned char indicated_length, + unsigned char counter, + int ch, unsigned_char_dynarr *dst, + struct unicode_coding_stream *data, + unsigned int ignore_bom) +{ + Binbyte stored = indicated_length - counter; + Binbyte mask = "\x00\x00\xC0\xE0\xF0\xF8\xFC"[indicated_length]; + + while (stored > 0) + { + DECODE_ERROR_OCTET (((ch >> (6 * (stored - 1))) & 0x3f) | mask, + dst, data, ignore_bom); + mask = 0x80, stored--; + } +} + static void encode_unicode_char_1 (int code, unsigned_char_dynarr *dst, - enum unicode_type type, unsigned int little_endian) + enum unicode_type type, unsigned int little_endian, + int write_error_characters_as_such) { switch (type) { case UNICODE_UTF_16: if (little_endian) { - Dynarr_add (dst, (unsigned char) (code & 255)); - Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); + if (code < 0x10000) { + Dynarr_add (dst, (unsigned char) (code & 255)); + Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); + } else if (write_error_characters_as_such && + code >= UNICODE_ERROR_OCTET_RANGE_START && + code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) + { + Dynarr_add (dst, (unsigned char) ((code & 0xFF))); + } + else if (code < 0x110000) + { + /* Little endian; least significant byte first. */ + int first, second; + + CODE_TO_UTF_16_SURROGATES(code, first, second); + + Dynarr_add (dst, (unsigned char) (first & 255)); + Dynarr_add (dst, (unsigned char) ((first >> 8) & 255)); + + Dynarr_add (dst, (unsigned char) (second & 255)); + Dynarr_add (dst, (unsigned char) ((second >> 8) & 255)); + } + else + { + /* Not valid Unicode. Pass U+FFFD, least significant byte + first. */ + Dynarr_add (dst, (unsigned char) 0xFD); + Dynarr_add (dst, (unsigned char) 0xFF); + } } else { - Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); - Dynarr_add (dst, (unsigned char) (code & 255)); + if (code < 0x10000) { + Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); + Dynarr_add (dst, (unsigned char) (code & 255)); + } else if (write_error_characters_as_such && + code >= UNICODE_ERROR_OCTET_RANGE_START && + code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) + { + Dynarr_add (dst, (unsigned char) ((code & 0xFF))); + } + else if (code < 0x110000) + { + /* Big endian; most significant byte first. */ + int first, second; + + CODE_TO_UTF_16_SURROGATES(code, first, second); + + Dynarr_add (dst, (unsigned char) ((first >> 8) & 255)); + Dynarr_add (dst, (unsigned char) (first & 255)); + + Dynarr_add (dst, (unsigned char) ((second >> 8) & 255)); + Dynarr_add (dst, (unsigned char) (second & 255)); + } + else + { + /* Not valid Unicode. Pass U+FFFD, most significant byte + first. */ + Dynarr_add (dst, (unsigned char) 0xFF); + Dynarr_add (dst, (unsigned char) 0xFD); + } } break; case UNICODE_UCS_4: + case UNICODE_UTF_32: if (little_endian) { - Dynarr_add (dst, (unsigned char) (code & 255)); - Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); - Dynarr_add (dst, (unsigned char) ((code >> 16) & 255)); - Dynarr_add (dst, (unsigned char) (code >> 24)); + if (write_error_characters_as_such && + code >= UNICODE_ERROR_OCTET_RANGE_START && + code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) + { + Dynarr_add (dst, (unsigned char) ((code & 0xFF))); + } + else + { + /* We generate and accept incorrect sequences here, which is + okay, in the interest of preservation of the user's + data. */ + Dynarr_add (dst, (unsigned char) (code & 255)); + Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); + Dynarr_add (dst, (unsigned char) ((code >> 16) & 255)); + Dynarr_add (dst, (unsigned char) (code >> 24)); + } } else { - Dynarr_add (dst, (unsigned char) (code >> 24)); - Dynarr_add (dst, (unsigned char) ((code >> 16) & 255)); - Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); - Dynarr_add (dst, (unsigned char) (code & 255)); + if (write_error_characters_as_such && + code >= UNICODE_ERROR_OCTET_RANGE_START && + code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) + { + Dynarr_add (dst, (unsigned char) ((code & 0xFF))); + } + else + { + /* We generate and accept incorrect sequences here, which is okay, + in the interest of preservation of the user's data. */ + Dynarr_add (dst, (unsigned char) (code >> 24)); + Dynarr_add (dst, (unsigned char) ((code >> 16) & 255)); + Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); + Dynarr_add (dst, (unsigned char) (code & 255)); + } } break; @@ -1706,11 +1916,25 @@ } else if (code <= 0x3ffffff) { - Dynarr_add (dst, (unsigned char) ((code >> 24) | 0xf8)); - Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80)); - Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80)); - Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80)); - Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80)); + +#if !(UNICODE_ERROR_OCTET_RANGE_START > 0x1fffff \ + && UNICODE_ERROR_OCTET_RANGE_START < 0x3ffffff) +#error "This code needs to be rewritten. " +#endif + if (write_error_characters_as_such && + code >= UNICODE_ERROR_OCTET_RANGE_START && + code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) + { + Dynarr_add (dst, (unsigned char) ((code & 0xFF))); + } + else + { + Dynarr_add (dst, (unsigned char) ((code >> 24) | 0xf8)); + Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80)); + Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80)); + Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80)); + Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80)); + } } else { @@ -1729,10 +1953,13 @@ } } -static void +/* Also used in mule-coding.c for UTF-8 handling in ISO 2022-oriented + encodings. */ +void encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h, int USED_IF_MULE (l), unsigned_char_dynarr *dst, - enum unicode_type type, unsigned int little_endian) + enum unicode_type type, unsigned int little_endian, + int write_error_characters_as_such) { #ifdef MULE int code = ichar_to_unicode (make_ichar (charset, h & 127, l & 127)); @@ -1758,7 +1985,8 @@ int code = h; #endif /* MULE */ - encode_unicode_char_1 (code, dst, type, little_endian); + encode_unicode_char_1 (code, dst, type, little_endian, + write_error_characters_as_such); } static Bytecount @@ -1777,6 +2005,8 @@ if (str->direction == CODING_DECODE) { unsigned char counter = data->counter; + unsigned char indicated_length + = data->indicated_length; while (n--) { @@ -1785,65 +2015,170 @@ switch (type) { case UNICODE_UTF_8: - switch (counter) - { - case 0: - if (c >= 0xfc) - { - ch = c & 0x01; - counter = 5; - } - else if (c >= 0xf8) - { - ch = c & 0x03; - counter = 4; - } - else if (c >= 0xf0) - { - ch = c & 0x07; - counter = 3; - } - else if (c >= 0xe0) - { - ch = c & 0x0f; - counter = 2; - } - else if (c >= 0xc0) - { - ch = c & 0x1f; - counter = 1; - } - else - decode_unicode_char (c, dst, data, ignore_bom); - break; - case 1: - ch = (ch << 6) | (c & 0x3f); - decode_unicode_char (ch, dst, data, ignore_bom); - ch = 0; - counter = 0; - break; - default: - ch = (ch << 6) | (c & 0x3f); - counter--; + if (0 == counter) + { + if (0 == (c & 0x80)) + { + /* ASCII. */ + decode_unicode_char (c, dst, data, ignore_bom); + } + else if (0 == (c & 0x40)) + { + /* Highest bit set, second highest not--there's + something wrong. */ + DECODE_ERROR_OCTET (c, dst, data, ignore_bom); + } + else if (0 == (c & 0x20)) + { + ch = c & 0x1f; + counter = 1; + indicated_length = 2; + } + else if (0 == (c & 0x10)) + { + ch = c & 0x0f; + counter = 2; + indicated_length = 3; + } + else if (0 == (c & 0x08)) + { + ch = c & 0x0f; + counter = 3; + indicated_length = 4; + } + else + { + /* We don't supports lengths longer than 4 in + external-format data. */ + DECODE_ERROR_OCTET (c, dst, data, ignore_bom); + + } + } + else + { + /* counter != 0 */ + if ((0 == (c & 0x80)) || (0 != (c & 0x40))) + { + indicate_invalid_utf_8(indicated_length, + counter, + ch, dst, data, ignore_bom); + if (c & 0x80) + { + DECODE_ERROR_OCTET (c, dst, data, ignore_bom); + } + else + { + /* The character just read is ASCII. Treat it as + such. */ + decode_unicode_char (c, dst, data, ignore_bom); + } + ch = 0; + counter = 0; + } + else + { + ch = (ch << 6) | (c & 0x3f); + counter--; + /* Just processed the final byte. Emit the character. */ + if (!counter) + { + /* Don't accept over-long sequences, surrogates, + or codes above #x10FFFF. */ + if ((ch < 0x80) || + ((ch < 0x800) && indicated_length > 2) || + ((ch < 0x10000) && indicated_length > 3) || + valid_utf_16_surrogate(ch) || (ch > 0x110000)) + { + indicate_invalid_utf_8(indicated_length, + counter, + ch, dst, data, + ignore_bom); + } + else + { + decode_unicode_char (ch, dst, data, ignore_bom); + } + ch = 0; + } + } } break; case UNICODE_UTF_16: + if (little_endian) ch = (c << counter) | ch; else ch = (ch << 8) | c; + counter += 8; - if (counter == 16) - { + + if (16 == counter) + { int tempch = ch; + + if (valid_utf_16_first_surrogate(ch)) + { + break; + } ch = 0; counter = 0; decode_unicode_char (tempch, dst, data, ignore_bom); } + else if (32 == counter) + { + int tempch; + + if (little_endian) + { + if (!valid_utf_16_last_surrogate(ch >> 16)) + { + DECODE_ERROR_OCTET (ch & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data, + ignore_bom); + } + else + { + tempch = utf_16_surrogates_to_code((ch & 0xffff), + (ch >> 16)); + decode_unicode_char(tempch, dst, data, ignore_bom); + } + } + else + { + if (!valid_utf_16_last_surrogate(ch & 0xFFFF)) + { + DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET (ch & 0xFF, dst, data, + ignore_bom); + } + else + { + tempch = utf_16_surrogates_to_code((ch >> 16), + (ch & 0xffff)); + decode_unicode_char(tempch, dst, data, ignore_bom); + } + } + + ch = 0; + counter = 0; + } + else + assert(8 == counter || 24 == counter); break; case UNICODE_UCS_4: + case UNICODE_UTF_32: if (little_endian) ch = (c << counter) | ch; else @@ -1851,15 +2186,43 @@ counter += 8; if (counter == 32) { - int tempch = ch; + if (ch > 0x10ffff) + { + /* ch is not a legal Unicode character. We're fine + with that in UCS-4, though not in UTF-32. */ + if (UNICODE_UCS_4 == type && ch < 0x80000000) + { + decode_unicode_char (ch, dst, data, ignore_bom); + } + else if (little_endian) + { + DECODE_ERROR_OCTET (ch & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data, + ignore_bom); + } + else + { + DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET (ch & 0xFF, dst, data, + ignore_bom); + } + } + else + { + decode_unicode_char (ch, dst, data, ignore_bom); + } ch = 0; counter = 0; - if (tempch < 0) - { - /* !!#### indicate an error */ - tempch = '~'; - } - decode_unicode_char (tempch, dst, data, ignore_bom); } break; @@ -1871,10 +2234,68 @@ } } - if (str->eof) - DECODE_OUTPUT_PARTIAL_CHAR (ch, dst); + + if (str->eof && counter) + { + switch (type) + { + case UNICODE_UTF_8: + indicate_invalid_utf_8(indicated_length, + counter, ch, dst, data, + ignore_bom); + break; + + case UNICODE_UTF_16: + case UNICODE_UCS_4: + case UNICODE_UTF_32: + if (8 == counter) + { + DECODE_ERROR_OCTET (ch, dst, data, ignore_bom); + } + else if (16 == counter) + { + if (little_endian) + { + DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom); + DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, + ignore_bom); + } + else + { + DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom); + } + } + else if (24 == counter) + { + if (little_endian) + { + DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom); + DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, + ignore_bom); + } + else + { + DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, + ignore_bom); + DECODE_ERROR_OCTET (ch & 0xFF, dst, data, + ignore_bom); + } + } + else assert(0); + break; + } + ch = 0; + counter = 0; + } data->counter = counter; + data->indicated_length = indicated_length; } else { @@ -1893,7 +2314,7 @@ if (XCODING_SYSTEM_UNICODE_NEED_BOM (str->codesys) && !data->wrote_bom) { - encode_unicode_char_1 (0xFEFF, dst, type, little_endian); + encode_unicode_char_1 (0xFEFF, dst, type, little_endian, 1); data->wrote_bom = 1; } @@ -1907,7 +2328,7 @@ { /* Processing ASCII character */ ch = 0; encode_unicode_char (Vcharset_ascii, c, 0, dst, type, - little_endian); + little_endian, 1); char_boundary = 1; } @@ -1931,20 +2352,20 @@ for the rationale behind subtracting #xa0 from the character's code. */ encode_unicode_char (Vcharset_control_1, c - 0xa0, 0, dst, - type, little_endian); + type, little_endian, 1); else { switch (XCHARSET_REP_BYTES (charset)) { case 2: encode_unicode_char (charset, c, 0, dst, type, - little_endian); + little_endian, 1); break; case 3: if (XCHARSET_PRIVATE_P (charset)) { encode_unicode_char (charset, c, 0, dst, type, - little_endian); + little_endian, 1); ch = 0; } else if (ch) @@ -1958,7 +2379,7 @@ handle this yet. */ encode_unicode_char (Vcharset_ascii, '~', 0, dst, type, - little_endian); + little_endian, 1); } else { @@ -1977,7 +2398,7 @@ else #endif /* ENABLE_COMPOSITE_CHARS */ encode_unicode_char (charset, ch, c, dst, type, - little_endian); + little_endian, 1); ch = 0; } else @@ -1990,7 +2411,7 @@ if (ch) { encode_unicode_char (charset, ch, c, dst, type, - little_endian); + little_endian, 1); ch = 0; } else @@ -2348,7 +2769,7 @@ static int unicode_putprop (Lisp_Object codesys, Lisp_Object key, Lisp_Object value) { - if (EQ (key, Qtype)) + if (EQ (key, Qunicode_type)) { enum unicode_type type; @@ -2360,6 +2781,8 @@ type = UNICODE_UTF_7; else if (EQ (value, Qucs_4)) type = UNICODE_UCS_4; + else if (EQ (value, Qutf_32)) + type = UNICODE_UTF_32; else invalid_constant ("Invalid Unicode type", key); @@ -2377,7 +2800,7 @@ static Lisp_Object unicode_getprop (Lisp_Object coding_system, Lisp_Object prop) { - if (EQ (prop, Qtype)) + if (EQ (prop, Qunicode_type)) { switch (XCODING_SYSTEM_UNICODE_TYPE (coding_system)) { @@ -2385,6 +2808,7 @@ case UNICODE_UTF_8: return Qutf_8; case UNICODE_UTF_7: return Qutf_7; case UNICODE_UCS_4: return Qucs_4; + case UNICODE_UTF_32: return Qutf_32; default: ABORT (); } } @@ -2399,7 +2823,8 @@ unicode_print (Lisp_Object cs, Lisp_Object printcharfun, int UNUSED (escapeflag)) { - write_fmt_string_lisp (printcharfun, "(%s", 1, unicode_getprop (cs, Qtype)); + write_fmt_string_lisp (printcharfun, "(%s", 1, + unicode_getprop (cs, Qunicode_type)); if (XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN (cs)) write_c_string (printcharfun, ", little-endian"); if (XCODING_SYSTEM_UNICODE_NEED_BOM (cs)) @@ -2407,6 +2832,276 @@ write_c_string (printcharfun, ")"); } +#ifdef MULE +DEFUN ("set-unicode-query-skip-chars-args", Fset_unicode_query_skip_chars_args, + 3, 3, 0, /* +Specify strings as matching characters known to Unicode coding systems. + +QUERY-STRING is a string matching characters that can unequivocally be +encoded by the Unicode coding systems. + +INVALID-STRING is a string to match XEmacs characters that represent known +octets on disk, but that are invalid sequences according to Unicode. + +UTF-8-INVALID-STRING is a more restrictive string to match XEmacs characters +that are invalid UTF-8 octets. + +All three strings are in the format accepted by `skip-chars-forward'. +*/ + (query_string, invalid_string, utf_8_invalid_string)) +{ + CHECK_STRING (query_string); + CHECK_STRING (invalid_string); + CHECK_STRING (utf_8_invalid_string); + + Vunicode_query_string = query_string; + Vunicode_invalid_string = invalid_string; + Vutf_8_invalid_string = utf_8_invalid_string; + + return Qnil; +} + +static void +add_lisp_string_to_skip_chars_range (Lisp_Object string, Lisp_Object rtab, + Lisp_Object value) +{ + Ibyte *p, *pend; + Ichar c; + + p = XSTRING_DATA (string); + pend = p + XSTRING_LENGTH (string); + + while (p != pend) + { + c = itext_ichar (p); + + INC_IBYTEPTR (p); + + if (c == '\\') + { + if (p == pend) break; + c = itext_ichar (p); + INC_IBYTEPTR (p); + } + + if (p != pend && *p == '-') + { + Ichar cend; + + /* Skip over the dash. */ + p++; + if (p == pend) break; + cend = itext_ichar (p); + + Fput_range_table (make_int (c), make_int (cend), value, + rtab); + + INC_IBYTEPTR (p); + } + else + { + Fput_range_table (make_int (c), make_int (c), value, rtab); + } + } +} + +/* This function wouldn't be necessary if initialised range tables were + dumped properly; see + http://mid.gmane.org/18179.49815.622843.336527@parhasard.net . */ +static void +initialize_unicode_query_range_tables_from_strings (void) +{ + CHECK_STRING (Vunicode_query_string); + CHECK_STRING (Vunicode_invalid_string); + CHECK_STRING (Vutf_8_invalid_string); + + Vunicode_query_skip_chars = Fmake_range_table (Qstart_closed_end_closed); + + add_lisp_string_to_skip_chars_range (Vunicode_query_string, + Vunicode_query_skip_chars, + Qsucceeded); + + Vunicode_invalid_and_query_skip_chars + = Fcopy_range_table (Vunicode_query_skip_chars); + + add_lisp_string_to_skip_chars_range (Vunicode_invalid_string, + Vunicode_invalid_and_query_skip_chars, + Qinvalid_sequence); + + Vutf_8_invalid_and_query_skip_chars + = Fcopy_range_table (Vunicode_query_skip_chars); + + add_lisp_string_to_skip_chars_range (Vutf_8_invalid_string, + Vutf_8_invalid_and_query_skip_chars, + Qinvalid_sequence); +} + +static Lisp_Object +unicode_query (Lisp_Object codesys, struct buffer *buf, Charbpos end, + int flags) +{ + Charbpos pos = BUF_PT (buf), fail_range_start, fail_range_end; + Charbpos pos_byte = BYTE_BUF_PT (buf); + Lisp_Object skip_chars_range_table, result = Qnil; + enum query_coding_failure_reasons failed_reason, + previous_failed_reason = query_coding_succeeded; + int checked_unicode, invalid_lower_limit = UNICODE_ERROR_OCTET_RANGE_START, + invalid_upper_limit, unicode_type = XCODING_SYSTEM_UNICODE_TYPE (codesys); + + if (flags & QUERY_METHOD_HIGHLIGHT && + /* If we're being called really early, live without highlights getting + cleared properly: */ + !(UNBOUNDP (XSYMBOL (Qquery_coding_clear_highlights)->function))) + { + /* It's okay to call Lisp here, the only non-stack object we may have + allocated up to this point is skip_chars_range_table, and that's + reachable from its entry in Vfixed_width_query_ranges_cache. */ + call3 (Qquery_coding_clear_highlights, make_int (pos), make_int (end), + wrap_buffer (buf)); + } + + if (NILP (Vunicode_query_skip_chars)) + { + initialize_unicode_query_range_tables_from_strings(); + } + + if (flags & QUERY_METHOD_IGNORE_INVALID_SEQUENCES) + { + switch (unicode_type) + { + case UNICODE_UTF_8: + skip_chars_range_table = Vutf_8_invalid_and_query_skip_chars; + break; + case UNICODE_UTF_7: + /* #### See above. */ + return Qunbound; + break; + default: + skip_chars_range_table = Vunicode_invalid_and_query_skip_chars; + break; + } + } + else + { + switch (unicode_type) + { + case UNICODE_UTF_8: + invalid_lower_limit = UNICODE_ERROR_OCTET_RANGE_START + 0x80; + invalid_upper_limit = UNICODE_ERROR_OCTET_RANGE_START + 0xFF; + break; + case UNICODE_UTF_7: + /* #### Work out what to do here in reality, read the spec and decide + which octets are invalid. */ + return Qunbound; + break; + default: + invalid_lower_limit = UNICODE_ERROR_OCTET_RANGE_START; + invalid_upper_limit = UNICODE_ERROR_OCTET_RANGE_START + 0xFF; + break; + } + + skip_chars_range_table = Vunicode_query_skip_chars; + } + + while (pos < end) + { + Ichar ch = BYTE_BUF_FETCH_CHAR (buf, pos_byte); + if ((ch < 0x100 ? 1 : + (!EQ (Qnil, Fget_range_table (make_int (ch), skip_chars_range_table, + Qnil))))) + { + pos++; + INC_BYTEBPOS (buf, pos_byte); + } + else + { + fail_range_start = pos; + while ((pos < end) && + ((checked_unicode = ichar_to_unicode (ch), + -1 == checked_unicode + && (failed_reason = query_coding_unencodable)) + || (!(flags & QUERY_METHOD_IGNORE_INVALID_SEQUENCES) && + (invalid_lower_limit <= checked_unicode) && + (checked_unicode <= invalid_upper_limit) + && (failed_reason = query_coding_invalid_sequence))) + && (previous_failed_reason == query_coding_succeeded + || previous_failed_reason == failed_reason)) + { + pos++; + INC_BYTEBPOS (buf, pos_byte); + ch = BYTE_BUF_FETCH_CHAR (buf, pos_byte); + previous_failed_reason = failed_reason; + } + + if (fail_range_start == pos) + { + /* The character can actually be encoded; move on. */ + pos++; + INC_BYTEBPOS (buf, pos_byte); + } + else + { + assert (previous_failed_reason == query_coding_invalid_sequence + || previous_failed_reason == query_coding_unencodable); + + if (flags & QUERY_METHOD_ERRORP) + { + DECLARE_EISTRING (error_details); + + eicpy_ascii (error_details, "Cannot encode "); + eicat_lstr (error_details, + make_string_from_buffer (buf, fail_range_start, + pos - + fail_range_start)); + eicat_ascii (error_details, " using coding system"); + + signal_error (Qtext_conversion_error, + (const CIbyte *)(eidata (error_details)), + XCODING_SYSTEM_NAME (codesys)); + } + + if (NILP (result)) + { + result = Fmake_range_table (Qstart_closed_end_open); + } + + fail_range_end = pos; + + Fput_range_table (make_int (fail_range_start), + make_int (fail_range_end), + (previous_failed_reason + == query_coding_unencodable ? + Qunencodable : Qinvalid_sequence), + result); + previous_failed_reason = query_coding_succeeded; + + if (flags & QUERY_METHOD_HIGHLIGHT) + { + Lisp_Object extent + = Fmake_extent (make_int (fail_range_start), + make_int (fail_range_end), + wrap_buffer (buf)); + + Fset_extent_priority + (extent, make_int (2 + mouse_highlight_priority)); + Fset_extent_face (extent, Qquery_coding_warning_face); + } + } + } + } + + return result; +} +#else /* !MULE */ +static Lisp_Object +unicode_query (Lisp_Object UNUSED (codesys), + struct buffer * UNUSED (buf), + Charbpos UNUSED (end), int UNUSED (flags)) +{ + return Qnil; +} +#endif + int dfc_coding_system_is_unicode ( #ifdef WIN32_ANY @@ -2445,7 +3140,13 @@ DEFSUBR (Fload_unicode_mapping_table); + DEFSUBR (Fset_unicode_query_skip_chars_args); + + DEFSYMBOL (Qccl_encode_to_ucs_2); + DEFSYMBOL (Qlast_allocated_character); DEFSYMBOL (Qignore_first_column); + + DEFSYMBOL (Qunicode_registries); #endif /* MULE */ DEFSUBR (Fchar_to_unicode); @@ -2454,6 +3155,7 @@ DEFSYMBOL (Qunicode); DEFSYMBOL (Qucs_4); DEFSYMBOL (Qutf_16); + DEFSYMBOL (Qutf_32); DEFSYMBOL (Qutf_8); DEFSYMBOL (Qutf_7); @@ -2474,6 +3176,7 @@ INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA (unicode, "unicode-coding-system-p"); CODING_SYSTEM_HAS_METHOD (unicode, print); CODING_SYSTEM_HAS_METHOD (unicode, convert); + CODING_SYSTEM_HAS_METHOD (unicode, query); CODING_SYSTEM_HAS_METHOD (unicode, init_coding_stream); CODING_SYSTEM_HAS_METHOD (unicode, rewind_coding_stream); CODING_SYSTEM_HAS_METHOD (unicode, putprop); @@ -2508,6 +3211,14 @@ Fprovide (intern ("unicode")); #ifdef MULE + staticpro (&Vnumber_of_jit_charsets); + Vnumber_of_jit_charsets = make_int (0); + staticpro (&Vlast_jit_charset_final); + Vlast_jit_charset_final = make_char (0x30); + staticpro (&Vcharset_descr); + Vcharset_descr + = build_string ("Mule charset for otherwise unknown Unicode code points."); + staticpro (&Vlanguage_unicode_precedence_list); Vlanguage_unicode_precedence_list = Qnil; @@ -2518,8 +3229,13 @@ dump_add_root_block_ptr (&unicode_precedence_dynarr, &lisp_object_dynarr_description); + + init_blank_unicode_tables (); + staticpro (&Vcurrent_jit_charset); + Vcurrent_jit_charset = Qnil; + /* Note that the "block" we are describing is a single pointer, and hence we could potentially use dump_add_root_block_ptr(). However, given the way the descriptions are written, we couldn't use them, and would @@ -2540,5 +3256,35 @@ from_unicode_level_3_desc_1); dump_add_root_block (&from_unicode_blank_4, sizeof (void *), from_unicode_level_4_desc_1); + + DEFVAR_LISP ("unicode-registries", &Qunicode_registries /* +Vector describing the X11 registries searched when using fallback fonts. + +"Fallback fonts" here includes by default those fonts used by redisplay when +displaying charsets for which the `encode-as-utf-8' property is true, and +those used when no font matching the charset's registries property has been +found (that is, they're probably Mule-specific charsets like Ethiopic or +IPA.) +*/ ); + Qunicode_registries = vector1(build_string("iso10646-1")); + + /* Initialised in lisp/mule/general-late.el, by a call to + #'set-unicode-query-skip-chars-args. Or at least they would be, but we + can't do this at dump time right now, initialised range tables aren't + dumped properly. */ + staticpro (&Vunicode_invalid_and_query_skip_chars); + Vunicode_invalid_and_query_skip_chars = Qnil; + staticpro (&Vutf_8_invalid_and_query_skip_chars); + Vutf_8_invalid_and_query_skip_chars = Qnil; + staticpro (&Vunicode_query_skip_chars); + Vunicode_query_skip_chars = Qnil; + + /* If we could dump the range table above these wouldn't be necessary: */ + staticpro (&Vunicode_query_string); + Vunicode_query_string = Qnil; + staticpro (&Vunicode_invalid_string); + Vunicode_invalid_string = Qnil; + staticpro (&Vutf_8_invalid_string); + Vutf_8_invalid_string = Qnil; #endif /* MULE */ }