Mercurial > hg > xemacs-beta
diff src/file-coding.c @ 5118:e0db3c197671 ben-lisp-object
merge up to latest default branch, doesn't compile yet
author | Ben Wing <ben@xemacs.org> |
---|---|
date | Sat, 26 Dec 2009 21:18:49 -0600 |
parents | 3742ea8250b5 257b468bf2ca |
children | d1247f3cc363 |
line wrap: on
line diff
--- a/src/file-coding.c Sat Dec 26 00:20:27 2009 -0600 +++ b/src/file-coding.c Sat Dec 26 21:18:49 2009 -0600 @@ -78,6 +78,9 @@ #include "lstream.h" #include "opaque.h" #include "file-coding.h" +#include "extents.h" +#include "rangetab.h" +#include "chartab.h" #ifdef HAVE_ZLIB #include "zlib.h" @@ -89,10 +92,17 @@ Lisp_Object Vcoding_system_for_write; Lisp_Object Vfile_name_coding_system; +Lisp_Object Qaliases, Qcharset_skip_chars_string; + #ifdef DEBUG_XEMACS Lisp_Object Vdebug_coding_detection; #endif +#ifdef MULE +extern Lisp_Object Vcharset_ascii, Vcharset_control_1, + Vcharset_latin_iso8859_1; +#endif + typedef struct coding_system_type_entry { struct coding_system_methods *meths; @@ -229,6 +239,8 @@ Lisp_Object QScoding_system_cookie; +Lisp_Object Qposix_charset_to_coding_system_hash; + /* This is used to convert autodetected coding systems into existing systems. For example, the chain undecided->convert-eol-autodetect may have its separate parts detected as mswindows-multibyte and @@ -305,6 +317,7 @@ write_c_string (printcharfun, "]"); } +#ifndef NEW_GC static void finalize_coding_system (void *header, int for_disksave) { @@ -315,6 +328,7 @@ if (!for_disksave) /* see comment in lstream.c */ MAYBE_XCODESYSMETH (cs, finalize, (cs)); } +#endif /* not NEW_GC */ static Bytecount sizeof_coding_system (const void *header) @@ -366,13 +380,22 @@ 0, coding_system_empty_extra_description_1 }; -DEFINE_SIZABLE_LISP_OBJECT ("coding-system", coding_system, - mark_coding_system, - print_coding_system, - finalize_coding_system, - 0, 0, coding_system_description, - sizeof_coding_system, - Lisp_Coding_System); +#ifdef NEW_GC +DEFINE_DUMPABLE_SIZABLE_LISP_OBJECT ("coding-system", coding_system, + mark_coding_system, + print_coding_system, + 0, 0, 0, coding_system_description, + sizeof_coding_system, + Lisp_Coding_System); +#else /* not NEW_GC */ +DEFINE_DUMPABLE_SIZABLE_LISP_OBJECT ("coding-system", coding_system, + mark_coding_system, + print_coding_system, + finalize_coding_system, + 0, 0, coding_system_description, + sizeof_coding_system, + Lisp_Coding_System); +#endif /* not NEW_GC */ /************************************************************************/ /* Creating coding systems */ @@ -402,6 +425,155 @@ return decode_coding_system_type (type, ERROR_ME_NOT) != 0; } +#ifdef MULE +static Lisp_Object Vdefault_query_coding_region_chartab_cache; + +/* Non-static because it's used in INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA. */ +Lisp_Object +default_query_method (Lisp_Object codesys, struct buffer *buf, + Charbpos end, int flags) +{ + Charbpos pos = BUF_PT (buf), fail_range_start, fail_range_end; + Charbpos pos_byte = BYTE_BUF_PT (buf); + Lisp_Object safe_charsets = XCODING_SYSTEM_SAFE_CHARSETS (codesys); + Lisp_Object safe_chars = XCODING_SYSTEM_SAFE_CHARS (codesys), + result = Qnil; + enum query_coding_failure_reasons failed_reason, + previous_failed_reason = query_coding_succeeded; + + /* safe-charsets of t means the coding system can encode everything. */ + if (EQ (Qnil, safe_chars)) + { + if (EQ (Qt, safe_charsets)) + { + return Qnil; + } + + /* If we've no information on what characters the coding system can + encode, give up. */ + if (EQ (Qnil, safe_charsets) && EQ (Qnil, safe_chars)) + { + return Qunbound; + } + + safe_chars = Fgethash (safe_charsets, + Vdefault_query_coding_region_chartab_cache, + Qnil); + if (NILP (safe_chars)) + { + safe_chars = Fmake_char_table (Qgeneric); + { + EXTERNAL_LIST_LOOP_2 (safe_charset, safe_charsets) + Fput_char_table (safe_charset, Qt, safe_chars); + } + + Fputhash (safe_charsets, safe_chars, + Vdefault_query_coding_region_chartab_cache); + } + } + + if (flags & QUERY_METHOD_HIGHLIGHT && + /* If we're being called really early, live without highlights getting + cleared properly: */ + !(UNBOUNDP (XSYMBOL (Qquery_coding_clear_highlights)->function))) + { + /* It's okay to call Lisp here, the only non-stack object we may have + allocated up to this point is safe_chars, and that's + reachable from its entry in + Vdefault_query_coding_region_chartab_cache */ + call3 (Qquery_coding_clear_highlights, make_int (pos), make_int (end), + wrap_buffer (buf)); + } + + while (pos < end) + { + Ichar ch = BYTE_BUF_FETCH_CHAR (buf, pos_byte); + if (!EQ (Qnil, get_char_table (ch, safe_chars))) + { + pos++; + INC_BYTEBPOS (buf, pos_byte); + } + else + { + fail_range_start = pos; + while ((pos < end) && + (EQ (Qnil, get_char_table (ch, safe_chars)) + && (failed_reason = query_coding_unencodable)) + && (previous_failed_reason == query_coding_succeeded + || previous_failed_reason == failed_reason)) + { + pos++; + INC_BYTEBPOS (buf, pos_byte); + ch = BYTE_BUF_FETCH_CHAR (buf, pos_byte); + previous_failed_reason = failed_reason; + } + + if (fail_range_start == pos) + { + /* The character can actually be encoded; move on. */ + pos++; + INC_BYTEBPOS (buf, pos_byte); + } + else + { + assert (previous_failed_reason == query_coding_unencodable); + + if (flags & QUERY_METHOD_ERRORP) + { + DECLARE_EISTRING (error_details); + + eicpy_ascii (error_details, "Cannot encode "); + eicat_lstr (error_details, + make_string_from_buffer (buf, fail_range_start, + pos - + fail_range_start)); + eicat_ascii (error_details, " using coding system"); + + signal_error (Qtext_conversion_error, + (const CIbyte *)(eidata (error_details)), + XCODING_SYSTEM_NAME (codesys)); + } + + if (NILP (result)) + { + result = Fmake_range_table (Qstart_closed_end_open); + } + + fail_range_end = pos; + + Fput_range_table (make_int (fail_range_start), + make_int (fail_range_end), + Qunencodable, + result); + previous_failed_reason = query_coding_succeeded; + + if (flags & QUERY_METHOD_HIGHLIGHT) + { + Lisp_Object extent + = Fmake_extent (make_int (fail_range_start), + make_int (fail_range_end), + wrap_buffer (buf)); + + Fset_extent_priority + (extent, make_int (2 + mouse_highlight_priority)); + Fset_extent_face (extent, Qquery_coding_warning_face); + } + } + } + } + + return result; +} +#else +Lisp_Object +default_query_method (Lisp_Object UNUSED (codesys), + struct buffer * UNUSED (buf), + Charbpos UNUSED (end), int UNUSED (flags)) +{ + return Qnil; +} +#endif /* defined MULE */ + DEFUN ("valid-coding-system-type-p", Fvalid_coding_system_type_p, 1, 1, 0, /* Given a CODING-SYSTEM-TYPE, return non-nil if it is valid. Valid types depend on how XEmacs was compiled but may include @@ -456,6 +628,89 @@ return CODING_SYSTEMP (object) ? Qt : Qnil; } +static Lisp_Object +find_coding_system (Lisp_Object coding_system_or_name, + int do_autoloads) +{ + Lisp_Object lookup; + + if (NILP (coding_system_or_name)) + coding_system_or_name = Qbinary; + else if (CODING_SYSTEMP (coding_system_or_name)) + return coding_system_or_name; + else + CHECK_SYMBOL (coding_system_or_name); + + while (1) + { + lookup = + Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil); + + if (CONSP (lookup) && do_autoloads) + { + struct gcpro gcpro1; + int length; + DECLARE_EISTRING (desired_base); + DECLARE_EISTRING (warning_info); + + eicpy_lstr (desired_base, XSYMBOL_NAME (coding_system_or_name)); + + /* Work out the name of the base coding system. */ + length = eilen (desired_base); + if (length > (int)(sizeof ("-unix") - 1)) + { + if (0 == qxestrcmp ((UAscbyte *)"-unix", (eidata (desired_base)) + + (length - (sizeof ("-unix") - 1)))) + { + eidel (desired_base, length - (sizeof ("-unix") - 1), + -1, 5, 5); + } + } + else if (length > (int)(sizeof ("-dos") - 1)) + { + if ((0 == qxestrcmp ((UAscbyte *)"-dos", (eidata (desired_base)) + + (length - (sizeof ("-dos") - 1)))) || + (0 == qxestrcmp ((UAscbyte *)"-mac", (eidata (desired_base)) + + (length - (sizeof ("-mac") - 1))))) + { + eidel (desired_base, length - (sizeof ("-dos") - 1), -1, + 4, 4); + } + } + + coding_system_or_name = intern_int (eidata (desired_base)); + + /* Remove this coding system and its subsidiary coding + systems from the hash, to avoid calling this code recursively. */ + Fremhash (coding_system_or_name, Vcoding_system_hash_table); + Fremhash (add_suffix_to_symbol(coding_system_or_name, "-unix"), + Vcoding_system_hash_table); + Fremhash (add_suffix_to_symbol(coding_system_or_name, "-dos"), + Vcoding_system_hash_table); + Fremhash (add_suffix_to_symbol(coding_system_or_name, "-mac"), + Vcoding_system_hash_table); + + eicpy_ascii (warning_info, "Error autoloading coding system "); + eicat_lstr (warning_info, XSYMBOL_NAME (coding_system_or_name)); + + /* Keep around the form so it doesn't disappear from under + #'eval's feet. */ + GCPRO1 (lookup); + call1_trapping_problems ((const CIbyte *)eidata (warning_info), + Qeval, lookup, 0); + UNGCPRO; + + lookup = + Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil); + } + + if (CODING_SYSTEMP (lookup) || NILP (lookup)) + return lookup; + + coding_system_or_name = lookup; + } +} + DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /* Retrieve the coding system of the given name. @@ -466,22 +721,77 @@ */ (coding_system_or_name)) { - if (NILP (coding_system_or_name)) - coding_system_or_name = Qbinary; - else if (CODING_SYSTEMP (coding_system_or_name)) - return coding_system_or_name; - else - CHECK_SYMBOL (coding_system_or_name); - - while (1) + return find_coding_system(coding_system_or_name, 1); +} + +DEFUN ("autoload-coding-system", Fautoload_coding_system, 2, 2, 0, /* +Define SYMBOL as a coding-system that is loaded on demand. + +FORM is a form to evaluate to define the coding-system. +*/ + (symbol, form)) +{ + Lisp_Object lookup; + + CHECK_SYMBOL (symbol); + CHECK_CONS (form); + + lookup = find_coding_system (symbol, 0); + + if (!NILP (lookup) && + /* Allow autoloads to be redefined. */ + !CONSP (lookup)) { - coding_system_or_name = - Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil); - - if (CODING_SYSTEMP (coding_system_or_name) - || NILP (coding_system_or_name)) - return coding_system_or_name; + invalid_operation ("Cannot redefine existing coding system", + symbol); } + + Fputhash (symbol, form, Vcoding_system_hash_table); + Fputhash (add_suffix_to_symbol(symbol, "-unix"), form, + Vcoding_system_hash_table); + Fputhash (add_suffix_to_symbol(symbol, "-dos"), form, + Vcoding_system_hash_table); + Fputhash (add_suffix_to_symbol(symbol, "-mac"), form, + Vcoding_system_hash_table); + + /* Tell the POSIX locale infrastructure about this coding system (though + unfortunately it'll be too late for the startup locale sniffing. */ + if (!UNBOUNDP (Qposix_charset_to_coding_system_hash)) + { + Lisp_Object val = Fsymbol_value (Qposix_charset_to_coding_system_hash); + DECLARE_EISTRING (minimal_name); + Ibyte *full_name; + int len = XSTRING_LENGTH (XSYMBOL_NAME (symbol)), i; + + if (!NILP (val)) + { + full_name = XSTRING_DATA (XSYMBOL_NAME (symbol)); + for (i = 0; i < len; ++i) + { + if (full_name[i] >= '0' && full_name[i] <= '9') + { + eicat_ch (minimal_name, full_name[i]); + } + else if (full_name[i] >= 'a' && full_name[i] <= 'z') + { + eicat_ch (minimal_name, full_name[i]); + } + else if (full_name[i] >= 'A' && full_name[i] <= 'Z') + { + eicat_ch (minimal_name, full_name[i] + + ('a' - 'A')); + } + } + + if (eilen (minimal_name)) + { + CHECK_HASH_TABLE (val); + Fputhash (eimake_string(minimal_name), symbol, val); + } + } + } + + return Qt; } DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /* @@ -638,7 +948,7 @@ }; static int -add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object UNUSED (value), +add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value, void *coding_system_list_closure) { /* This function can GC */ @@ -647,9 +957,13 @@ Lisp_Object *coding_system_list = cscl->coding_system_list; /* We can't just use VALUE because KEY might be an alias, and we need - the real coding system object. */ - if (XCODING_SYSTEM (Ffind_coding_system (key))->internal_p ? - cscl->internal : cscl->normal) + the real coding system object. + + Autoloaded coding systems have conses for their values, and can't be + internal coding systems, or coding system aliases. */ + if (CONSP (value) || + (XCODING_SYSTEM (Ffind_coding_system (key))->internal_p ? + cscl->internal : cscl->normal)) *coding_system_list = Fcons (key, *coding_system_list); return 0; } @@ -739,8 +1053,8 @@ struct subsidiary_type { - Ascbyte *extension; - Ascbyte *mnemonic_ext; + const Ascbyte *extension; + const Ascbyte *mnemonic_ext; enum eol_type eol; }; @@ -786,8 +1100,8 @@ for (i = 0; i < countof (coding_subsidiary_list); i++) { - Ascbyte *extension = coding_subsidiary_list[i].extension; - Ascbyte *mnemonic_ext = coding_subsidiary_list[i].mnemonic_ext; + const Ascbyte *extension = coding_subsidiary_list[i].extension; + const Ascbyte *mnemonic_ext = coding_subsidiary_list[i].mnemonic_ext; enum eol_type eol = coding_subsidiary_list[i].eol; qxestrcpy_ascii (codesys_name + len, extension); @@ -824,6 +1138,16 @@ } } +DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, + 1, 1, 0, /* +Return t if OBJECT names a coding system, and is not a coding system alias. +*/ + (object)) +{ + return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil)) + ? Qt : Qnil; +} + /* Basic function to create new coding systems. For `make-coding-system', NAME-OR-EXISTING is the NAME argument, PREFIX is null, and TYPE, DESCRIPTION, and PROPS are the same. All created coding systems are put @@ -863,7 +1187,7 @@ */ static Lisp_Object -make_coding_system_1 (Lisp_Object name_or_existing, Ascbyte *prefix, +make_coding_system_1 (Lisp_Object name_or_existing, const Ascbyte *prefix, Lisp_Object type, Lisp_Object description, Lisp_Object props) { @@ -872,7 +1196,7 @@ enum eol_type eol_wrapper = EOL_AUTODETECT; struct coding_system_methods *meths; Lisp_Object csobj; - Lisp_Object defmnem = Qnil; + Lisp_Object defmnem = Qnil, aliases = Qnil; if (NILP (type)) type = Qundecided; @@ -907,9 +1231,13 @@ else CHECK_SYMBOL (name_or_existing); - if (!NILP (Ffind_coding_system (name_or_existing))) + /* See is there an entry for name_or_existing in the defined coding system + hash table. */ + csobj = find_coding_system (name_or_existing, 0); + /* Error if it's there and not an autoload form. */ + if (!NILP (csobj) && !CONSP (csobj)) invalid_operation ("Cannot redefine existing coding system", - name_or_existing); + name_or_existing); cs = allocate_coding_system (meths, meths->extra_data_size, name_or_existing); @@ -957,15 +1285,55 @@ CODING_SYSTEM_POST_READ_CONVERSION (cs) = value; else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (cs) = value; + else if (EQ (key, Qaliases)) + { + EXTERNAL_LIST_LOOP_2 (alias, value) + { + CHECK_SYMBOL (alias); + + if (!NILP (Fcoding_system_canonical_name_p (alias))) + { + invalid_change ("Symbol is the canonical name of a " + "coding system and cannot be redefined", + alias); + } + } + aliases = value; + } /* FSF compatibility */ else if (EQ (key, Qtranslation_table_for_decode)) ; else if (EQ (key, Qtranslation_table_for_encode)) ; else if (EQ (key, Qsafe_chars)) - ; + { + CHECK_CHAR_TABLE (value); + CODING_SYSTEM_SAFE_CHARS (cs) = value; + } else if (EQ (key, Qsafe_charsets)) - ; + { + if (!EQ (Qt, value) + /* Would be nice to actually do this check, but there are + some order conflicts with japanese.el and + mule-coding.el */ + && 0) + { +#ifdef MULE + EXTERNAL_LIST_LOOP_2 (safe_charset, value) + CHECK_CHARSET (Ffind_charset (safe_charset)); +#endif + } + + CODING_SYSTEM_SAFE_CHARSETS (cs) = value; + } + else if (EQ (key, Qcategory)) + { + Fput (name_or_existing, intern ("coding-system-property"), + Fplist_put (Fget (name_or_existing, + intern ("coding-system-property"), + Qnil), + Qcategory, value)); + } else if (EQ (key, Qmime_charset)) ; else if (EQ (key, Qvalid_codes)) @@ -985,6 +1353,8 @@ XCODING_SYSTEM_EOL_TYPE (csobj) = EOL_AUTODETECT; /* for copy-coding-system below */ + Fputhash (name_or_existing, csobj, Vcoding_system_hash_table); + if (need_to_setup_eol_systems && !cs->internal_p) setup_eol_coding_systems (csobj); else if (eol_wrapper == EOL_CR || eol_wrapper == EOL_CRLF) @@ -1022,347 +1392,34 @@ csobj)); } XCODING_SYSTEM_EOL_TYPE (csobj) = eol_wrapper; + + { + EXTERNAL_LIST_LOOP_2 (alias, aliases) + Fdefine_coding_system_alias (alias, csobj); + } } - - Fputhash (name_or_existing, csobj, Vcoding_system_hash_table); return csobj; } Lisp_Object -make_internal_coding_system (Lisp_Object existing, Ascbyte *prefix, +make_internal_coding_system (Lisp_Object existing, const Ascbyte *prefix, Lisp_Object type, Lisp_Object description, Lisp_Object props) { return make_coding_system_1 (existing, prefix, type, description, props); } -DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /* -Register symbol NAME as a coding system. - -TYPE describes the conversion method used and should be one of - -nil or `undecided' - Automatic conversion. XEmacs attempts to detect the coding system - used in the file. -`chain' - Chain two or more coding systems together to make a combination coding - system. -`no-conversion' - No conversion. Use this for binary files and such. On output, - graphic characters that are not in ASCII or Latin-1 will be - replaced by a ?. (For a no-conversion-encoded buffer, these - characters will only be present if you explicitly insert them.) -`convert-eol' - Convert CRLF sequences or CR to LF. -`shift-jis' - Shift-JIS (a Japanese encoding commonly used in PC operating systems). -`unicode' - Any Unicode encoding (UCS-4, UTF-8, UTF-16, etc.). -`mswindows-unicode-to-multibyte' - (MS Windows only) Converts from Windows Unicode to Windows Multibyte - (any code page encoding) upon encoding, and the other way upon decoding. -`mswindows-multibyte' - Converts to or from Windows Multibyte (any code page encoding). - This is resolved into a chain of `mswindows-unicode' and - `mswindows-unicode-to-multibyte'. -`iso2022' - Any ISO2022-compliant encoding. Among other things, this includes - JIS (the Japanese encoding commonly used for e-mail), EUC (the - standard Unix encoding for Japanese and other languages), and - Compound Text (the encoding used in X11). You can specify more - specific information about the conversion with the PROPS argument. -`big5' - Big5 (the encoding commonly used for Mandarin Chinese in Taiwan). -`ccl' - The conversion is performed using a user-written pseudo-code - program. CCL (Code Conversion Language) is the name of this - pseudo-code. -`gzip' - GZIP compression format. -`internal' - Write out or read in the raw contents of the memory representing - the buffer's text. This is primarily useful for debugging - purposes, and is only enabled when XEmacs has been compiled with - DEBUG_XEMACS defined (via the --debug configure option). - WARNING: Reading in a file using `internal' conversion can result - in an internal inconsistency in the memory representing a - buffer's text, which will produce unpredictable results and may - cause XEmacs to crash. Under normal circumstances you should - never use `internal' conversion. - -DESCRIPTION is a short English phrase describing the coding system, -suitable for use as a menu item. (See also the `documentation' property -below.) - -PROPS is a property list, describing the specific nature of the -character set. Recognized properties are: - -`mnemonic' - String to be displayed in the modeline when this coding system is - active. - -`documentation' - Detailed documentation on the coding system. - -`eol-type' - End-of-line conversion to be used. It should be one of - - nil - Automatically detect the end-of-line type (LF, CRLF, - or CR). Also generate subsidiary coding systems named - `NAME-unix', `NAME-dos', and `NAME-mac', that are - identical to this coding system but have an EOL-TYPE - value of `lf', `crlf', and `cr', respectively. - `lf' - The end of a line is marked externally using ASCII LF. - Since this is also the way that XEmacs represents an - end-of-line internally, specifying this option results - in no end-of-line conversion. This is the standard - format for Unix text files. - `crlf' - The end of a line is marked externally using ASCII - CRLF. This is the standard format for MS-DOS text - files. - `cr' - The end of a line is marked externally using ASCII CR. - This is the standard format for Macintosh text files. - t - Automatically detect the end-of-line type but do not - generate subsidiary coding systems. (This value is - converted to nil when stored internally, and - `coding-system-property' will return nil.) - -`post-read-conversion' - The value is a function to call after some text is inserted and - decoded by the coding system itself and before any functions in - `after-change-functions' are called. (#### Not actually true in - XEmacs. `after-change-functions' will be called twice if - `post-read-conversion' changes something.) The argument of this - function is the same as for a function in - `after-insert-file-functions', i.e. LENGTH of the text inserted, - with point at the head of the text to be decoded. - -`pre-write-conversion' - The value is a function to call after all functions in - `write-region-annotate-functions' and `buffer-file-format' are - called, and before the text is encoded by the coding system itself. - The arguments to this function are the same as those of a function - in `write-region-annotate-functions', i.e. FROM and TO, specifying - a region of text. - - - -The following properties are allowed for FSF compatibility but currently -ignored: - -`translation-table-for-decode' - The value is a translation table to be applied on decoding. See - the function `make-translation-table' for the format of translation - table. This is not applicable to CCL-based coding systems. - -`translation-table-for-encode' - The value is a translation table to be applied on encoding. This is - not applicable to CCL-based coding systems. - -`safe-chars' - The value is a char table. If a character has non-nil value in it, - the character is safely supported by the coding system. This - overrides the specification of safe-charsets. - -`safe-charsets' - The value is a list of charsets safely supported by the coding - system. The value t means that all charsets Emacs handles are - supported. Even if some charset is not in this list, it doesn't - mean that the charset can't be encoded in the coding system; - it just means that some other receiver of text encoded - in the coding system won't be able to handle that charset. - -`mime-charset' - The value is a symbol of which name is `MIME-charset' parameter of - the coding system. - -`valid-codes' (meaningful only for a coding system based on CCL) - The value is a list to indicate valid byte ranges of the encoded - file. Each element of the list is an integer or a cons of integer. - In the former case, the integer value is a valid byte code. In the - latter case, the integers specifies the range of valid byte codes. - - - -The following additional property is recognized if TYPE is `convert-eol': - -`subtype' - One of `lf', `crlf', `cr' or nil (for autodetection). When decoding, - the corresponding sequence will be converted to LF. When encoding, - the opposite happens. This coding system converts characters to - characters. - - - -The following additional properties are recognized if TYPE is `iso2022': - -`charset-g0' -`charset-g1' -`charset-g2' -`charset-g3' - The character set initially designated to the G0 - G3 registers. - The value should be one of - - -- A charset object (designate that character set) - -- nil (do not ever use this register) - -- t (no character set is initially designated to - the register, but may be later on; this automatically - sets the corresponding `force-g*-on-output' property) - -`force-g0-on-output' -`force-g1-on-output' -`force-g2-on-output' -`force-g2-on-output' - If non-nil, send an explicit designation sequence on output before - using the specified register. - -`short' - If non-nil, use the short forms "ESC $ @", "ESC $ A", and - "ESC $ B" on output in place of the full designation sequences - "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B". - -`no-ascii-eol' - If non-nil, don't designate ASCII to G0 at each end of line on output. - Setting this to non-nil also suppresses other state-resetting that - normally happens at the end of a line. - -`no-ascii-cntl' - If non-nil, don't designate ASCII to G0 before control chars on output. - -`seven' - If non-nil, use 7-bit environment on output. Otherwise, use 8-bit - environment. - -`lock-shift' - If non-nil, use locking-shift (SO/SI) instead of single-shift - or designation by escape sequence. - -`no-iso6429' - If non-nil, don't use ISO6429's direction specification. - -`escape-quoted' - If non-nil, literal control characters that are the same as - the beginning of a recognized ISO2022 or ISO6429 escape sequence - (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E), - SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character - so that they can be properly distinguished from an escape sequence. - (Note that doing this results in a non-portable encoding.) This - encoding flag is used for byte-compiled files. Note that ESC - is a good choice for a quoting character because there are no - escape sequences whose second byte is a character from the Control-0 - or Control-1 character sets; this is explicitly disallowed by the - ISO2022 standard. - -`input-charset-conversion' - A list of conversion specifications, specifying conversion of - characters in one charset to another when decoding is performed. - Each specification is a list of two elements: the source charset, - and the destination charset. - -`output-charset-conversion' - A list of conversion specifications, specifying conversion of - characters in one charset to another when encoding is performed. - The form of each specification is the same as for - `input-charset-conversion'. - - - -The following additional properties are recognized (and required) -if TYPE is `ccl': - -`decode' - CCL program used for decoding (converting to internal format). - -`encode' - CCL program used for encoding (converting to external format). - - -The following additional properties are recognized if TYPE is `chain': - -`chain' - List of coding systems to be chained together, in decoding order. - -`canonicalize-after-coding' - Coding system to be returned by the detector routines in place of - this coding system. - - - -The following additional properties are recognized if TYPE is `unicode': - -`type' - One of `utf-16', `utf-8', `ucs-4', or `utf-7' (the latter is not - yet implemented). `utf-16' is the basic two-byte encoding; - `ucs-4' is the four-byte encoding; `utf-8' is an ASCII-compatible - variable-width 8-bit encoding; `utf-7' is a 7-bit encoding using - only characters that will safely pass through all mail gateways. - [[ This should be \"transformation format\". There should also be - `ucs-2' (or `bmp' -- no surrogates) and `utf-32' (range checked). ]] - -`little-endian' - If non-nil, `utf-16' and `ucs-4' will write out the groups of two - or four bytes little-endian instead of big-endian. This is required, - for example, under Windows. - -`need-bom' - If non-nil, a byte order mark (BOM, or Unicode FFFE) should be - written out at the beginning of the data. This serves both to - identify the endianness of the following data and to mark the - data as Unicode (at least, this is how Windows uses it). - [[ The correct term is \"signature\", since this technique may also - be used with UTF-8. That is the term used in the standard. ]] - - -The following additional properties are recognized if TYPE is -`mswindows-multibyte': - -`code-page' - Either a number (specifying a particular code page) or one of the - symbols `ansi', `oem', `mac', or `ebcdic', specifying the ANSI, - OEM, Macintosh, or EBCDIC code page associated with a particular - locale (given by the `locale' property). NOTE: EBCDIC code pages - only exist in Windows 2000 and later. - -`locale' - If `code-page' is a symbol, this specifies the locale whose code - page of the corresponding type should be used. This should be - one of the following: A cons of two strings, (LANGUAGE - . SUBLANGUAGE) (see `mswindows-set-current-locale'); a string (a - language; SUBLANG_DEFAULT, i.e. the default sublanguage, is - used); or one of the symbols `current', `user-default', or - `system-default', corresponding to the values of - `mswindows-current-locale', `mswindows-user-default-locale', or - `mswindows-system-default-locale', respectively. - - - -The following additional properties are recognized if TYPE is `undecided': -[[ Doesn't GNU use \"detect-*\" for the following two? ]] - -`do-eol' - Do EOL detection. - -`do-coding' - Do encoding detection. - -`coding-system' - If encoding detection is not done, use the specified coding system - to do decoding. This is used internally when implementing coding - systems with an EOL type that specifies autodetection (the default), - so that the detector routines return the proper subsidiary. - - - -The following additional property is recognized if TYPE is `gzip': - -`level' - Compression level: 0 through 9, or `default' (currently 6). +DEFUN ("make-coding-system-internal", Fmake_coding_system_internal, 2, 4, 0, /* +See `make-coding-system'. This does much of the work of that function. + +Without Mule support, it does all the work of that function, and an alias +exists, mapping `make-coding-system' to +`make-coding-system-internal'. You'll need a non-Mule XEmacs to read the +complete docstring. Or you can just read it in make-coding-system.el; +something like the following should work: + + \\[find-function-other-window] find-file RET \\[find-file] mule/make-coding-system.el RET */ (name, type, description, props)) @@ -1382,7 +1439,7 @@ Lisp_Object new_coding_system; old_coding_system = Fget_coding_system (old_coding_system); new_coding_system = - UNBOUNDP (new_name) ? Qnil : Ffind_coding_system (new_name); + UNBOUNDP (new_name) ? Qnil : find_coding_system (new_name, 0); if (NILP (new_coding_system)) { new_coding_system = @@ -1408,16 +1465,6 @@ return new_coding_system; } -DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, - 1, 1, 0, /* -Return t if OBJECT names a coding system, and is not a coding system alias. -*/ - (object)) -{ - return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil)) - ? Qt : Qnil; -} - /* #### Shouldn't this really be a find/get pair? */ DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /* @@ -1700,6 +1747,10 @@ return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system); else if (EQ (prop, Qpre_write_conversion)) return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system); + else if (EQ (prop, Qsafe_charsets)) + return XCODING_SYSTEM_SAFE_CHARSETS (coding_system); + else if (EQ (prop, Qsafe_chars)) + return XCODING_SYSTEM_SAFE_CHARS (coding_system); else { Lisp_Object value = CODESYSMETH_OR_GIVEN (XCODING_SYSTEM (coding_system), @@ -2274,13 +2325,14 @@ } } -DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /* +DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, + "*r\nzDecode from coding system: \ni", /* Decode the text between START and END which is encoded in CODING-SYSTEM. This is useful if you've read in encoded text from a file without decoding it (e.g. you read in a JIS-formatted file but used the `binary' or `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B"). Return length of decoded text. -BUFFER defaults to the current buffer if unspecified. +BUFFER defaults to the current buffer if unspecified, and when interactive. */ (start, end, coding_system, buffer)) { @@ -2288,11 +2340,12 @@ CODING_DECODE); } -DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /* +DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, + "*r\nzEncode to coding system: \ni", /* Encode the text between START and END using CODING-SYSTEM. This will, for example, convert Japanese characters into stuff such as -"^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded -text. BUFFER defaults to the current buffer if unspecified. +"^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded text. +BUFFER defaults to the current buffer if unspecified, and when interactive. */ (start, end, coding_system, buffer)) { @@ -2300,6 +2353,100 @@ CODING_ENCODE); } +DEFUN ("query-coding-region", Fquery_coding_region, 3, 7, 0, /* +Work out whether CODING-SYSTEM can losslessly encode a region. + +START and END are the beginning and end of the region to check. +CODING-SYSTEM is the coding system to try. + +Optional argument BUFFER is the buffer to check, and defaults to the current +buffer. + +IGNORE-INVALID-SEQUENCESP, also an optional argument, says to treat XEmacs +characters which have an unambiguous encoded representation, despite being +undefined in what they represent, as encodable. These chiefly arise with +variable-length encodings like UTF-8 and UTF-16, where an invalid sequence +is passed through to XEmacs as a sequence of characters with a defined +correspondence to the octets on disk, but no non-error semantics; see the +`invalid-sequence-coding-system' argument to `set-language-info'. + +They can also arise with fixed-length encodings like ISO 8859-7, where +certain octets on disk have undefined values, and treating them as +corresponding to the ISO 8859-1 characters with the same numerical values +may lead to data that is not understood by other applications. + +Optional argument ERRORP says to signal a `text-conversion-error' if some +character in the region cannot be encoded, and defaults to nil. + +Optional argument HIGHLIGHT says to display unencodable characters in the +region using `query-coding-warning-face'. It defaults to nil. + +This function can return multiple values; the intention is that callers use +`multiple-value-bind' or the related CL multiple value functions to deal +with it. The first result is `t' if the region can be encoded using +CODING-SYSTEM, or `nil' if not. If the region cannot be encoded using +CODING-SYSTEM, the second result is a range table describing the positions +of the unencodable characters. + +Ranges that describe characters that would be ignored were +IGNORE-INVALID-SEQUENCESP non-nil map to the symbol `invalid-sequence'; +other ranges map to the symbol `unencodable'. If IGNORE-INVALID-SEQUENCESP +is non-nil, all ranges will map to the symbol `unencodable'. See +`make-range-table' for more details of range tables. +*/ + (start, end, coding_system, buffer, ignore_invalid_sequencesp, + errorp, highlight)) +{ + Charbpos b, e; + struct buffer *buf = decode_buffer (buffer, 1); + Lisp_Object result; + int flags = 0, speccount = specpdl_depth (); + + coding_system = Fget_coding_system (coding_system); + + get_buffer_range_char (buf, start, end, &b, &e, 0); + + if (buf != current_buffer) + { + record_unwind_protect (save_current_buffer_restore, Fcurrent_buffer ()); + set_buffer_internal (buf); + } + + record_unwind_protect (save_excursion_restore, save_excursion_save ()); + + BUF_SET_PT (buf, b); + + if (!NILP (ignore_invalid_sequencesp)) + { + flags |= QUERY_METHOD_IGNORE_INVALID_SEQUENCES; + } + + if (!NILP (errorp)) + { + flags |= QUERY_METHOD_ERRORP; + } + + if (!NILP (highlight)) + { + flags |= QUERY_METHOD_HIGHLIGHT; + } + + result = XCODESYSMETH_OR_GIVEN (coding_system, query, + (coding_system, buf, e, flags), Qunbound); + + if (UNBOUNDP (result)) + { + signal_error (Qtext_conversion_error, + "Coding system doesn't say what it can encode", + XCODING_SYSTEM_NAME (coding_system)); + } + + result = (NILP (result)) ? Qt : values2 (Qnil, result); + + return unbind_to_1 (speccount, result); +} + + /************************************************************************/ /* Chain methods */ @@ -3351,13 +3498,15 @@ static void output_bytes_in_ascii_and_hex (const UExtbyte *src, Bytecount n) { - UExtbyte *ascii = alloca_array (UExtbyte, n + 1); - UExtbyte *hex = alloca_array (UExtbyte, 3 * n + 1); + Extbyte *ascii = alloca_array (Extbyte, n + 1); + Extbyte *hex = alloca_array (Extbyte, 3 * n + 1); int i; + DECLARE_EISTRING (eistr_ascii); + DECLARE_EISTRING (eistr_hex); for (i = 0; i < n; i++) { - UExtbyte c = src[i]; + Extbyte c = src[i]; if (c < 0x20) ascii[i] = '.'; else @@ -3368,7 +3517,11 @@ } ascii[i] = '\0'; hex[3 * i - 1] = '\0'; - stderr_out ("%s %s", ascii, hex); + + eicpy_ext(eistr_hex, hex, Qbinary); + eicpy_ext(eistr_ascii, ascii, Qbinary); + + stderr_out ("%s %s", eidata(eistr_ascii), eidata(eistr_hex)); } #endif /* DEBUG_XEMACS */ @@ -3498,7 +3651,7 @@ } } if (NILP (retval)) - retval = Fget_coding_system (Qraw_text); + retval = Fget_coding_system (Qbinary); return retval; } else @@ -3847,6 +4000,9 @@ random result when doing subprocess detection. */ detect_coding_type (data->st, src, n); data->actual = detected_coding_system (data->st); + /* kludge to prevent infinite recursion */ + if (XCODING_SYSTEM(data->actual)->methods->enumtype == undecided_coding_system) + data->actual = Fget_coding_system (Qbinary); } } /* We need to set the detected coding system if we actually have @@ -3894,12 +4050,12 @@ return str->codesys; if (!data->c.initted) - return Fget_coding_system (Qundecided); + return str->codesys; ret = coding_stream_canonicalize_after_coding (XLSTREAM (data->c.lstreams[0])); if (NILP (ret)) - ret = Fget_coding_system (Qundecided); + ret = str->codesys; if (XCODING_SYSTEM_EOL_TYPE (ret) != EOL_AUTODETECT) return ret; eolret = coding_stream_canonicalize_after_coding @@ -4361,11 +4517,12 @@ DEFSUBR (Fvalid_coding_system_type_p); DEFSUBR (Fcoding_system_type_list); DEFSUBR (Fcoding_system_p); + DEFSUBR (Fautoload_coding_system); DEFSUBR (Ffind_coding_system); DEFSUBR (Fget_coding_system); DEFSUBR (Fcoding_system_list); DEFSUBR (Fcoding_system_name); - DEFSUBR (Fmake_coding_system); + DEFSUBR (Fmake_coding_system_internal); DEFSUBR (Fcopy_coding_system); DEFSUBR (Fcoding_system_canonical_name_p); DEFSUBR (Fcoding_system_alias_p); @@ -4388,6 +4545,7 @@ DEFSUBR (Fdetect_coding_region); DEFSUBR (Fdecode_coding_region); DEFSUBR (Fencode_coding_region); + DEFSUBR (Fquery_coding_region); DEFSYMBOL_MULTIWORD_PREDICATE (Qcoding_systemp); DEFSYMBOL (Qno_conversion); DEFSYMBOL (Qconvert_eol); @@ -4432,8 +4590,14 @@ DEFSYMBOL (Qcanonicalize_after_coding); + DEFSYMBOL (Qposix_charset_to_coding_system_hash); + DEFSYMBOL (Qescape_quoted); + DEFSYMBOL (Qquery_coding_warning_face); + DEFSYMBOL (Qaliases); + DEFSYMBOL (Qcharset_skip_chars_string); + #ifdef HAVE_ZLIB DEFSYMBOL (Qgzip); #endif @@ -4598,8 +4762,14 @@ #endif DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /* -Coding system used for TTY keyboard input. -Not used under a windowing system. +Default coding system used for TTY and X11 keyboard input. +Under X11, used only to interpet the character for a key event when that +event has a KeySym of NoSymbol but does have an associated string keysym, +something that's seen with input methods. + +If you need to set these things to different coding systems, call the +function `set-console-tty-coding-system' for the TTY and use this variable +for X11. */ ); Vkeyboard_coding_system = Qnil; @@ -4651,6 +4821,12 @@ */ ); Vdebug_coding_detection = Qnil; #endif + +#ifdef MULE + Vdefault_query_coding_region_chartab_cache + = make_lisp_hash_table (25, HASH_TABLE_NON_WEAK, HASH_TABLE_EQUAL); + staticpro (&Vdefault_query_coding_region_chartab_cache); +#endif } /* #### reformat this for consistent appearance? */ @@ -4658,7 +4834,7 @@ void complex_vars_of_file_coding (void) { - Fmake_coding_system + Fmake_coding_system_internal (Qconvert_eol_cr, Qconvert_eol, build_msg_string ("Convert CR to LF"), nconc2 (list6 (Qdocumentation, @@ -4670,9 +4846,10 @@ /* VERY IMPORTANT! Tell make-coding-system not to generate subsidiaries -- it needs the coding systems we're creating to do so! */ - list2 (Qeol_type, Qlf))); - - Fmake_coding_system + list4 (Qeol_type, Qlf, + Qsafe_charsets, Qt))); + + Fmake_coding_system_internal (Qconvert_eol_lf, Qconvert_eol, build_msg_string ("Convert LF to LF (do nothing)"), nconc2 (list6 (Qdocumentation, @@ -4683,9 +4860,10 @@ /* VERY IMPORTANT! Tell make-coding-system not to generate subsidiaries -- it needs the coding systems we're creating to do so! */ - list2 (Qeol_type, Qlf))); - - Fmake_coding_system + list4 (Qeol_type, Qlf, + Qsafe_charsets, Qt))); + + Fmake_coding_system_internal (Qconvert_eol_crlf, Qconvert_eol, build_msg_string ("Convert CRLF to LF"), nconc2 (list6 (Qdocumentation, @@ -4694,12 +4872,14 @@ "(used internally and under Unix to mark the end of a line)."), Qmnemonic, build_string ("CRLF->LF"), Qsubtype, Qcrlf), + /* VERY IMPORTANT! Tell make-coding-system not to generate subsidiaries -- it needs the coding systems we're creating to do so! */ - list2 (Qeol_type, Qlf))); - - Fmake_coding_system + list4 (Qeol_type, Qlf, + Qsafe_charsets, Qt))); + + Fmake_coding_system_internal (Qconvert_eol_autodetect, Qconvert_eol, build_msg_string ("Autodetect EOL type"), nconc2 (list6 (Qdocumentation, @@ -4710,9 +4890,10 @@ /* VERY IMPORTANT! Tell make-coding-system not to generate subsidiaries -- it needs the coding systems we're creating to do so! */ - list2 (Qeol_type, Qlf))); - - Fmake_coding_system + list4 (Qeol_type, Qlf, + Qsafe_charsets, Qt))); + + Fmake_coding_system_internal (Qundecided, Qundecided, build_msg_string ("Undecided (auto-detect)"), nconc2 (list4 (Qdocumentation, @@ -4725,7 +4906,7 @@ though, I don't think.) */ Qeol_type, Qlf))); - Fmake_coding_system + Fmake_coding_system_internal (intern ("undecided-dos"), Qundecided, build_msg_string ("Undecided (auto-detect) (CRLF)"), nconc2 (list4 (Qdocumentation, @@ -4735,7 +4916,7 @@ list4 (Qdo_coding, Qt, Qeol_type, Qcrlf))); - Fmake_coding_system + Fmake_coding_system_internal (intern ("undecided-unix"), Qundecided, build_msg_string ("Undecided (auto-detect) (LF)"), nconc2 (list4 (Qdocumentation, @@ -4745,7 +4926,7 @@ list4 (Qdo_coding, Qt, Qeol_type, Qlf))); - Fmake_coding_system + Fmake_coding_system_internal (intern ("undecided-mac"), Qundecided, build_msg_string ("Undecided (auto-detect) (CR)"), nconc2 (list4 (Qdocumentation, @@ -4756,26 +4937,42 @@ Qeol_type, Qcr))); /* Need to create this here or we're really screwed. */ - Fmake_coding_system + Fmake_coding_system_internal (Qraw_text, Qno_conversion, build_msg_string ("Raw Text"), - list4 (Qdocumentation, - build_msg_string ("Raw text converts only line-break codes, and acts otherwise like `binary'."), - Qmnemonic, build_string ("Raw"))); - - Fmake_coding_system + nconc2 (list4 (Qdocumentation, + build_msg_string ("Raw text converts only line-break " + "codes, and acts otherwise like " + "`binary'."), + Qmnemonic, build_string ("Raw")), +#ifdef MULE + list2 (Qsafe_charsets, list3 (Vcharset_ascii, Vcharset_control_1, + Vcharset_latin_iso8859_1)))); + +#else + Qnil)); +#endif + + Fmake_coding_system_internal (Qbinary, Qno_conversion, build_msg_string ("Binary"), - list6 (Qdocumentation, - build_msg_string ( + nconc2 (list6 (Qdocumentation, + build_msg_string ( "This coding system is as close as it comes to doing no conversion.\n" "On input, each byte is converted directly into the character\n" "with the corresponding code -- i.e. from the `ascii', `control-1',\n" "or `latin-1' character sets. On output, these characters are\n" "converted back to the corresponding bytes, and other characters\n" "are converted to the default character, i.e. `~'."), - Qeol_type, Qlf, - Qmnemonic, build_string ("Binary"))); + Qeol_type, Qlf, + Qmnemonic, build_string ("Binary")), +#ifdef MULE + list2 (Qsafe_charsets, list3 (Vcharset_ascii, Vcharset_control_1, + Vcharset_latin_iso8859_1)))); + +#else + Qnil)); +#endif /* Formerly aliased to raw-text! Completely bogus and not even the same as FSF Emacs. */