Mercurial > hg > xemacs-beta
view src/file-coding.c @ 678:8e8a7b205142
[xemacs-hg @ 2001-11-29 17:00:38 by youngs]
2001-11-28 Steve Youngs <youngs@xemacs.org>
* package-get.el (package-get-locate-index-file): Also search the
core etc/ directory for the package index file if it can't be
found in ~/.xemacs/.
author | youngs |
---|---|
date | Thu, 29 Nov 2001 17:00:39 +0000 |
parents | fdefd0186b75 |
children | 943eaba38521 |
line wrap: on
line source
/* Code conversion functions. Copyright (C) 1991, 1995 Free Software Foundation, Inc. Copyright (C) 1995 Sun Microsystems, Inc. This file is part of XEmacs. XEmacs is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. XEmacs is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with XEmacs; see the file COPYING. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ /* Synched up with: Mule 2.3. Not in FSF. */ /* Rewritten by Ben Wing <ben@xemacs.org>. */ #include <config.h> #include "lisp.h" #include "buffer.h" #include "elhash.h" #include "insdel.h" #include "lstream.h" #include "opaque.h" #ifdef MULE #include "mule-ccl.h" #include "chartab.h" #endif #include "file-coding.h" Lisp_Object Vkeyboard_coding_system; Lisp_Object Vterminal_coding_system; Lisp_Object Vcoding_system_for_read; Lisp_Object Vcoding_system_for_write; Lisp_Object Vfile_name_coding_system; /* Table of symbols identifying each coding category. */ Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST]; struct file_coding_dump { /* Coding system currently associated with each coding category. */ Lisp_Object coding_category_system[CODING_CATEGORY_LAST]; /* Table of all coding categories in decreasing order of priority. This describes a permutation of the possible coding categories. */ int coding_category_by_priority[CODING_CATEGORY_LAST]; #ifdef MULE Lisp_Object ucs_to_mule_table[65536]; #endif } *fcd; static const struct lrecord_description fcd_description_1[] = { { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST }, #ifdef MULE { XD_LISP_OBJECT_ARRAY, offsetof (struct file_coding_dump, ucs_to_mule_table), countof (fcd->ucs_to_mule_table) }, #endif { XD_END } }; static const struct struct_description fcd_description = { sizeof (struct file_coding_dump), fcd_description_1 }; Lisp_Object mule_to_ucs_table; Lisp_Object Qcoding_systemp; Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022; /* Qinternal in general.c */ Lisp_Object Qmnemonic, Qeol_type; Lisp_Object Qcr, Qcrlf, Qlf; Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf; Lisp_Object Qpost_read_conversion; Lisp_Object Qpre_write_conversion; #ifdef MULE Lisp_Object Qucs4, Qutf8; Lisp_Object Qbig5, Qshift_jis; Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3; Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output; Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output; Lisp_Object Qno_iso6429; Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion; Lisp_Object Qescape_quoted; Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift; #endif Lisp_Object Qencode, Qdecode; Lisp_Object Vcoding_system_hash_table; int enable_multibyte_characters; #ifdef MULE /* Additional information used by the ISO2022 decoder and detector. */ struct iso2022_decoder { /* CHARSET holds the character sets currently assigned to the G0 through G3 variables. It is initialized from the array INITIAL_CHARSET in CODESYS. */ Lisp_Object charset[4]; /* Which registers are currently invoked into the left (GL) and right (GR) halves of the 8-bit encoding space? */ int register_left, register_right; /* ISO_ESC holds a value indicating part of an escape sequence that has already been seen. */ enum iso_esc_flag esc; /* This records the bytes we've seen so far in an escape sequence, in case the sequence is invalid (we spit out the bytes unchanged). */ unsigned char esc_bytes[8]; /* Index for next byte to store in ISO escape sequence. */ int esc_bytes_index; #ifdef ENABLE_COMPOSITE_CHARS /* Stuff seen so far when composing a string. */ unsigned_char_dynarr *composite_chars; #endif /* If we saw an invalid designation sequence for a particular register, we flag it here and switch to ASCII. The next time we see a valid designation for this register, we turn off the flag and do the designation normally, but pretend the sequence was invalid. The effect of all this is that (most of the time) the escape sequences for both the switch to the unknown charset, and the switch back to the known charset, get inserted literally into the buffer and saved out as such. The hope is that we can preserve the escape sequences so that the resulting written out file makes sense. If we don't do any of this, the designation to the invalid charset will be preserved but that switch back to the known charset will probably get eaten because it was the same charset that was already present in the register. */ unsigned char invalid_designated[4]; /* We try to do similar things as above for direction-switching sequences. If we encountered a direction switch while an invalid designation was present, or an invalid designation just after a direction switch (i.e. no valid designation encountered yet), we insert the direction-switch escape sequence literally into the output stream, and later on insert the corresponding direction-restoring escape sequence literally also. */ unsigned int switched_dir_and_no_valid_charset_yet :1; unsigned int invalid_switch_dir :1; /* Tells the decoder to output the escape sequence literally even though it was valid. Used in the games we play to avoid lossage when we encounter invalid designations. */ unsigned int output_literally :1; /* We encountered a direction switch followed by an invalid designation. We didn't output the direction switch literally because we didn't know about the invalid designation; but we have to do so now. */ unsigned int output_direction_sequence :1; }; #endif /* MULE */ EXFUN (Fcopy_coding_system, 2); #ifdef MULE struct detection_state; static int detect_coding_sjis (struct detection_state *st, const Extbyte *src, Bytecount n); static void decode_coding_sjis (Lstream *decoding, const Extbyte *src, unsigned_char_dynarr *dst, Bytecount n); static void encode_coding_sjis (Lstream *encoding, const Intbyte *src, unsigned_char_dynarr *dst, Bytecount n); static int detect_coding_big5 (struct detection_state *st, const Extbyte *src, Bytecount n); static void decode_coding_big5 (Lstream *decoding, const Extbyte *src, unsigned_char_dynarr *dst, Bytecount n); static void encode_coding_big5 (Lstream *encoding, const Intbyte *src, unsigned_char_dynarr *dst, Bytecount n); static int detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, Bytecount n); static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src, unsigned_char_dynarr *dst, Bytecount n); static void encode_coding_ucs4 (Lstream *encoding, const Intbyte *src, unsigned_char_dynarr *dst, Bytecount n); static int detect_coding_utf8 (struct detection_state *st, const Extbyte *src, Bytecount n); static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src, unsigned_char_dynarr *dst, Bytecount n); static void encode_coding_utf8 (Lstream *encoding, const Intbyte *src, unsigned_char_dynarr *dst, Bytecount n); static int postprocess_iso2022_mask (int mask); static void reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso); static int detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, Bytecount n); static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src, unsigned_char_dynarr *dst, Bytecount n); static void encode_coding_iso2022 (Lstream *encoding, const Intbyte *src, unsigned_char_dynarr *dst, Bytecount n); #endif /* MULE */ static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src, unsigned_char_dynarr *dst, Bytecount n); static void encode_coding_no_conversion (Lstream *encoding, const Intbyte *src, unsigned_char_dynarr *dst, Bytecount n); static void mule_decode (Lstream *decoding, const Extbyte *src, unsigned_char_dynarr *dst, Bytecount n); static void mule_encode (Lstream *encoding, const Intbyte *src, unsigned_char_dynarr *dst, Bytecount n); typedef struct codesys_prop codesys_prop; struct codesys_prop { Lisp_Object sym; int prop_type; }; typedef struct { Dynarr_declare (codesys_prop); } codesys_prop_dynarr; static const struct lrecord_description codesys_prop_description_1[] = { { XD_LISP_OBJECT, offsetof (codesys_prop, sym) }, { XD_END } }; static const struct struct_description codesys_prop_description = { sizeof (codesys_prop), codesys_prop_description_1 }; static const struct lrecord_description codesys_prop_dynarr_description_1[] = { XD_DYNARR_DESC (codesys_prop_dynarr, &codesys_prop_description), { XD_END } }; static const struct struct_description codesys_prop_dynarr_description = { sizeof (codesys_prop_dynarr), codesys_prop_dynarr_description_1 }; codesys_prop_dynarr *the_codesys_prop_dynarr; enum codesys_prop_enum { CODESYS_PROP_ALL_OK, CODESYS_PROP_ISO2022, CODESYS_PROP_CCL }; /************************************************************************/ /* Coding system functions */ /************************************************************************/ static Lisp_Object mark_coding_system (Lisp_Object); static void print_coding_system (Lisp_Object, Lisp_Object, int); static void finalize_coding_system (void *header, int for_disksave); #ifdef MULE static const struct lrecord_description ccs_description_1[] = { { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) }, { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) }, { XD_END } }; static const struct struct_description ccs_description = { sizeof (charset_conversion_spec), ccs_description_1 }; static const struct lrecord_description ccsd_description_1[] = { XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description), { XD_END } }; static const struct struct_description ccsd_description = { sizeof (charset_conversion_spec_dynarr), ccsd_description_1 }; #endif static const struct lrecord_description coding_system_description[] = { { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, name) }, { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, doc_string) }, { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, mnemonic) }, { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, post_read_conversion) }, { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, pre_write_conversion) }, { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_lf) }, { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_crlf) }, { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, eol_cr) }, #ifdef MULE { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, iso2022.initial_charset), 4 }, { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description }, { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description }, { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.decode) }, { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, ccl.encode) }, #endif { XD_END } }; DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system, mark_coding_system, print_coding_system, finalize_coding_system, 0, 0, coding_system_description, Lisp_Coding_System); static Lisp_Object mark_coding_system (Lisp_Object obj) { Lisp_Coding_System *codesys = XCODING_SYSTEM (obj); mark_object (CODING_SYSTEM_NAME (codesys)); mark_object (CODING_SYSTEM_DOC_STRING (codesys)); mark_object (CODING_SYSTEM_MNEMONIC (codesys)); mark_object (CODING_SYSTEM_EOL_LF (codesys)); mark_object (CODING_SYSTEM_EOL_CRLF (codesys)); mark_object (CODING_SYSTEM_EOL_CR (codesys)); switch (CODING_SYSTEM_TYPE (codesys)) { #ifdef MULE int i; case CODESYS_ISO2022: for (i = 0; i < 4; i++) mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)); if (codesys->iso2022.input_conv) { for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++) { struct charset_conversion_spec *ccs = Dynarr_atp (codesys->iso2022.input_conv, i); mark_object (ccs->from_charset); mark_object (ccs->to_charset); } } if (codesys->iso2022.output_conv) { for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++) { struct charset_conversion_spec *ccs = Dynarr_atp (codesys->iso2022.output_conv, i); mark_object (ccs->from_charset); mark_object (ccs->to_charset); } } break; case CODESYS_CCL: mark_object (CODING_SYSTEM_CCL_DECODE (codesys)); mark_object (CODING_SYSTEM_CCL_ENCODE (codesys)); break; #endif /* MULE */ default: break; } mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys)); return CODING_SYSTEM_POST_READ_CONVERSION (codesys); } static void print_coding_system (Lisp_Object obj, Lisp_Object printcharfun, int escapeflag) { Lisp_Coding_System *c = XCODING_SYSTEM (obj); if (print_readably) printing_unreadable_object ("#<coding_system 0x%x>", c->header.uid); write_c_string ("#<coding_system ", printcharfun); print_internal (c->name, printcharfun, 1); write_c_string (">", printcharfun); } static void finalize_coding_system (void *header, int for_disksave) { Lisp_Coding_System *c = (Lisp_Coding_System *) header; /* Since coding systems never go away, this function is not necessary. But it would be necessary if we changed things so that coding systems could go away. */ if (!for_disksave) /* see comment in lstream.c */ { switch (CODING_SYSTEM_TYPE (c)) { #ifdef MULE case CODESYS_ISO2022: if (c->iso2022.input_conv) { Dynarr_free (c->iso2022.input_conv); c->iso2022.input_conv = 0; } if (c->iso2022.output_conv) { Dynarr_free (c->iso2022.output_conv); c->iso2022.output_conv = 0; } break; #endif /* MULE */ default: break; } } } static eol_type_t symbol_to_eol_type (Lisp_Object symbol) { CHECK_SYMBOL (symbol); if (NILP (symbol)) return EOL_AUTODETECT; if (EQ (symbol, Qlf)) return EOL_LF; if (EQ (symbol, Qcrlf)) return EOL_CRLF; if (EQ (symbol, Qcr)) return EOL_CR; invalid_constant ("Unrecognized eol type", symbol); return EOL_AUTODETECT; /* not reached */ } static Lisp_Object eol_type_to_symbol (eol_type_t type) { switch (type) { default: abort (); case EOL_LF: return Qlf; case EOL_CRLF: return Qcrlf; case EOL_CR: return Qcr; case EOL_AUTODETECT: return Qnil; } } static void setup_eol_coding_systems (Lisp_Coding_System *codesys) { Lisp_Object codesys_obj; int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name); char *codesys_name = (char *) alloca (len + 7); int mlen = -1; char *codesys_mnemonic=0; Lisp_Object codesys_name_sym, sub_codesys_obj; /* kludge */ XSETCODING_SYSTEM (codesys_obj, codesys); memcpy (codesys_name, string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len); if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys))) { mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys)); codesys_mnemonic = (char *) alloca (mlen + 7); memcpy (codesys_mnemonic, XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen); } #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \ strcpy (codesys_name + len, "-" op_sys); \ if (mlen != -1) \ strcpy (codesys_mnemonic + mlen, op_sys_abbr); \ codesys_name_sym = intern (codesys_name); \ sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \ XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \ if (mlen != -1) \ XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \ build_string (codesys_mnemonic); \ CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \ } while (0) DEFINE_SUB_CODESYS("unix", "", EOL_LF); DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF); DEFINE_SUB_CODESYS("mac", ":t", EOL_CR); } DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /* Return t if OBJECT is a coding system. A coding system is an object that defines how text containing multiple character sets is encoded into a stream of (typically 8-bit) bytes. The coding system is used to decode the stream into a series of characters (which may be from multiple charsets) when the text is read from a file or process, and is used to encode the text back into the same format when it is written out to a file or process. For example, many ISO2022-compliant coding systems (such as Compound Text, which is used for inter-client data under the X Window System) use escape sequences to switch between different charsets -- Japanese Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See `make-coding-system' for more information. Coding systems are normally identified using a symbol, and the symbol is accepted in place of the actual coding system object whenever a coding system is called for. (This is similar to how faces work.) */ (object)) { return CODING_SYSTEMP (object) ? Qt : Qnil; } DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /* Retrieve the coding system of the given name. If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol. If there is no such coding system, nil is returned. Otherwise the associated coding system object is returned. */ (coding_system_or_name)) { if (NILP (coding_system_or_name)) coding_system_or_name = Qbinary; else if (CODING_SYSTEMP (coding_system_or_name)) return coding_system_or_name; else CHECK_SYMBOL (coding_system_or_name); while (1) { coding_system_or_name = Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil); if (CODING_SYSTEMP (coding_system_or_name) || NILP (coding_system_or_name)) return coding_system_or_name; } } DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /* Retrieve the coding system of the given name. Same as `find-coding-system' except that if there is no such coding system, an error is signaled instead of returning nil. */ (name)) { Lisp_Object coding_system = Ffind_coding_system (name); if (NILP (coding_system)) invalid_argument ("No such coding system", name); return coding_system; } /* We store the coding systems in hash tables with the names as the key and the actual coding system object as the value. Occasionally we need to use them in a list format. These routines provide us with that. */ struct coding_system_list_closure { Lisp_Object *coding_system_list; }; static int add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value, void *coding_system_list_closure) { /* This function can GC */ struct coding_system_list_closure *cscl = (struct coding_system_list_closure *) coding_system_list_closure; Lisp_Object *coding_system_list = cscl->coding_system_list; *coding_system_list = Fcons (key, *coding_system_list); return 0; } DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /* Return a list of the names of all defined coding systems. */ ()) { Lisp_Object coding_system_list = Qnil; struct gcpro gcpro1; struct coding_system_list_closure coding_system_list_closure; GCPRO1 (coding_system_list); coding_system_list_closure.coding_system_list = &coding_system_list; elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table, &coding_system_list_closure); UNGCPRO; return coding_system_list; } DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /* Return the name of the given coding system. */ (coding_system)) { coding_system = Fget_coding_system (coding_system); return XCODING_SYSTEM_NAME (coding_system); } static Lisp_Coding_System * allocate_coding_system (enum coding_system_type type, Lisp_Object name) { Lisp_Coding_System *codesys = alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system); zero_lcrecord (codesys); CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil; CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil; CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT; CODING_SYSTEM_EOL_CRLF (codesys) = Qnil; CODING_SYSTEM_EOL_CR (codesys) = Qnil; CODING_SYSTEM_EOL_LF (codesys) = Qnil; CODING_SYSTEM_TYPE (codesys) = type; CODING_SYSTEM_MNEMONIC (codesys) = Qnil; #ifdef MULE if (type == CODESYS_ISO2022) { int i; for (i = 0; i < 4; i++) CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil; } else if (type == CODESYS_CCL) { CODING_SYSTEM_CCL_DECODE (codesys) = Qnil; CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil; } #endif /* MULE */ CODING_SYSTEM_NAME (codesys) = name; return codesys; } #ifdef MULE /* Given a list of charset conversion specs as specified in a Lisp program, parse it into STORE_HERE. */ static void parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here, Lisp_Object spec_list) { Lisp_Object rest; EXTERNAL_LIST_LOOP (rest, spec_list) { Lisp_Object car = XCAR (rest); Lisp_Object from, to; struct charset_conversion_spec spec; if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car)))) invalid_argument ("Invalid charset conversion spec", car); from = Fget_charset (XCAR (car)); to = Fget_charset (XCAR (XCDR (car))); if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to)) invalid_operation_2 ("Attempted conversion between different charset types", from, to); spec.from_charset = from; spec.to_charset = to; Dynarr_add (store_here, spec); } } /* Given a dynarr LOAD_HERE of internally-stored charset conversion specs, return the equivalent as the Lisp programmer would see it. If LOAD_HERE is 0, return Qnil. */ static Lisp_Object unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here) { int i; Lisp_Object result; if (!load_here) return Qnil; for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++) { struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i); result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result); } return Fnreverse (result); } #endif /* MULE */ DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /* Register symbol NAME as a coding system. TYPE describes the conversion method used and should be one of nil or 'undecided Automatic conversion. XEmacs attempts to detect the coding system used in the file. 'no-conversion No conversion. Use this for binary files and such. On output, graphic characters that are not in ASCII or Latin-1 will be replaced by a ?. (For a no-conversion-encoded buffer, these characters will only be present if you explicitly insert them.) 'shift-jis Shift-JIS (a Japanese encoding commonly used in PC operating systems). 'ucs-4 ISO 10646 UCS-4 encoding. 'utf-8 ISO 10646 UTF-8 encoding. 'iso2022 Any ISO2022-compliant encoding. Among other things, this includes JIS (the Japanese encoding commonly used for e-mail), EUC (the standard Unix encoding for Japanese and other languages), and Compound Text (the encoding used in X11). You can specify more specific information about the conversion with the PROPS argument. 'big5 Big5 (the encoding commonly used for Taiwanese). 'ccl The conversion is performed using a user-written pseudo-code program. CCL (Code Conversion Language) is the name of this pseudo-code. 'internal Write out or read in the raw contents of the memory representing the buffer's text. This is primarily useful for debugging purposes, and is only enabled when XEmacs has been compiled with DEBUG_XEMACS defined (via the --debug configure option). WARNING: Reading in a file using 'internal conversion can result in an internal inconsistency in the memory representing a buffer's text, which will produce unpredictable results and may cause XEmacs to crash. Under normal circumstances you should never use 'internal conversion. DOC-STRING is a string describing the coding system. PROPS is a property list, describing the specific nature of the character set. Recognized properties are: 'mnemonic String to be displayed in the modeline when this coding system is active. 'eol-type End-of-line conversion to be used. It should be one of nil Automatically detect the end-of-line type (LF, CRLF, or CR). Also generate subsidiary coding systems named `NAME-unix', `NAME-dos', and `NAME-mac', that are identical to this coding system but have an EOL-TYPE value of 'lf, 'crlf, and 'cr, respectively. 'lf The end of a line is marked externally using ASCII LF. Since this is also the way that XEmacs represents an end-of-line internally, specifying this option results in no end-of-line conversion. This is the standard format for Unix text files. 'crlf The end of a line is marked externally using ASCII CRLF. This is the standard format for MS-DOS text files. 'cr The end of a line is marked externally using ASCII CR. This is the standard format for Macintosh text files. t Automatically detect the end-of-line type but do not generate subsidiary coding systems. (This value is converted to nil when stored internally, and `coding-system-property' will return nil.) 'post-read-conversion Function called after a file has been read in, to perform the decoding. Called with two arguments, START and END, denoting a region of the current buffer to be decoded. 'pre-write-conversion Function called before a file is written out, to perform the encoding. Called with two arguments, START and END, denoting a region of the current buffer to be encoded. The following additional properties are recognized if TYPE is 'iso2022: 'charset-g0 'charset-g1 'charset-g2 'charset-g3 The character set initially designated to the G0 - G3 registers. The value should be one of -- A charset object (designate that character set) -- nil (do not ever use this register) -- t (no character set is initially designated to the register, but may be later on; this automatically sets the corresponding `force-g*-on-output' property) 'force-g0-on-output 'force-g1-on-output 'force-g2-on-output 'force-g2-on-output If non-nil, send an explicit designation sequence on output before using the specified register. 'short If non-nil, use the short forms "ESC $ @", "ESC $ A", and "ESC $ B" on output in place of the full designation sequences "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B". 'no-ascii-eol If non-nil, don't designate ASCII to G0 at each end of line on output. Setting this to non-nil also suppresses other state-resetting that normally happens at the end of a line. 'no-ascii-cntl If non-nil, don't designate ASCII to G0 before control chars on output. 'seven If non-nil, use 7-bit environment on output. Otherwise, use 8-bit environment. 'lock-shift If non-nil, use locking-shift (SO/SI) instead of single-shift or designation by escape sequence. 'no-iso6429 If non-nil, don't use ISO6429's direction specification. 'escape-quoted If non-nil, literal control characters that are the same as the beginning of a recognized ISO2022 or ISO6429 escape sequence (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E), SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character so that they can be properly distinguished from an escape sequence. (Note that doing this results in a non-portable encoding.) This encoding flag is used for byte-compiled files. Note that ESC is a good choice for a quoting character because there are no escape sequences whose second byte is a character from the Control-0 or Control-1 character sets; this is explicitly disallowed by the ISO2022 standard. 'input-charset-conversion A list of conversion specifications, specifying conversion of characters in one charset to another when decoding is performed. Each specification is a list of two elements: the source charset, and the destination charset. 'output-charset-conversion A list of conversion specifications, specifying conversion of characters in one charset to another when encoding is performed. The form of each specification is the same as for 'input-charset-conversion. The following additional properties are recognized (and required) if TYPE is 'ccl: 'decode CCL program used for decoding (converting to internal format). 'encode CCL program used for encoding (converting to external format). */ (name, type, doc_string, props)) { Lisp_Coding_System *codesys; enum coding_system_type ty; int need_to_setup_eol_systems = 1; /* Convert type to constant */ if (NILP (type) || EQ (type, Qundecided)) { ty = CODESYS_AUTODETECT; } #ifdef MULE else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; } else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; } else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; } else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; } else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; } else if (EQ (type, Qccl)) { ty = CODESYS_CCL; } #endif else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; } #ifdef DEBUG_XEMACS else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; } #endif else invalid_constant ("Invalid coding system type", type); CHECK_SYMBOL (name); codesys = allocate_coding_system (ty, name); if (NILP (doc_string)) doc_string = build_string (""); else CHECK_STRING (doc_string); CODING_SYSTEM_DOC_STRING (codesys) = doc_string; { EXTERNAL_PROPERTY_LIST_LOOP_3 (key, value, props) { if (EQ (key, Qmnemonic)) { if (!NILP (value)) CHECK_STRING (value); CODING_SYSTEM_MNEMONIC (codesys) = value; } else if (EQ (key, Qeol_type)) { need_to_setup_eol_systems = NILP (value); if (EQ (value, Qt)) value = Qnil; CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value); } else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value; else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value; #ifdef MULE else if (ty == CODESYS_ISO2022) { #define FROB_INITIAL_CHARSET(charset_num) \ CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \ ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value)) if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0); else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1); else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2); else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3); #define FROB_FORCE_CHARSET(charset_num) \ CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value) else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0); else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1); else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2); else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3); #define FROB_BOOLEAN_PROPERTY(prop) \ CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value) else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT); else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL); else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL); else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN); else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT); else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429); else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED); else if (EQ (key, Qinput_charset_conversion)) { codesys->iso2022.input_conv = Dynarr_new (charset_conversion_spec); parse_charset_conversion_specs (codesys->iso2022.input_conv, value); } else if (EQ (key, Qoutput_charset_conversion)) { codesys->iso2022.output_conv = Dynarr_new (charset_conversion_spec); parse_charset_conversion_specs (codesys->iso2022.output_conv, value); } else invalid_constant ("Unrecognized property", key); } else if (EQ (type, Qccl)) { Lisp_Object sym; struct ccl_program test_ccl; Extbyte *suffix; /* Check key first. */ if (EQ (key, Qdecode)) suffix = "-ccl-decode"; else if (EQ (key, Qencode)) suffix = "-ccl-encode"; else invalid_constant ("Unrecognized property", key); /* If value is vector, register it as a ccl program associated with an newly created symbol for backward compatibility. */ if (VECTORP (value)) { sym = Fintern (concat2 (Fsymbol_name (name), build_string (suffix)), Qnil); Fregister_ccl_program (sym, value); } else { CHECK_SYMBOL (value); sym = value; } /* check if the given ccl programs are valid. */ if (setup_ccl_program (&test_ccl, sym) < 0) invalid_argument ("Invalid CCL program", value); if (EQ (key, Qdecode)) CODING_SYSTEM_CCL_DECODE (codesys) = sym; else if (EQ (key, Qencode)) CODING_SYSTEM_CCL_ENCODE (codesys) = sym; } #endif /* MULE */ else invalid_constant ("Unrecognized property", key); } } if (need_to_setup_eol_systems) setup_eol_coding_systems (codesys); { Lisp_Object codesys_obj; XSETCODING_SYSTEM (codesys_obj, codesys); Fputhash (name, codesys_obj, Vcoding_system_hash_table); return codesys_obj; } } DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /* Copy OLD-CODING-SYSTEM to NEW-NAME. If NEW-NAME does not name an existing coding system, a new one will be created. */ (old_coding_system, new_name)) { Lisp_Object new_coding_system; old_coding_system = Fget_coding_system (old_coding_system); new_coding_system = Ffind_coding_system (new_name); if (NILP (new_coding_system)) { XSETCODING_SYSTEM (new_coding_system, allocate_coding_system (XCODING_SYSTEM_TYPE (old_coding_system), new_name)); Fputhash (new_name, new_coding_system, Vcoding_system_hash_table); } { Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system); Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system); memcpy (((char *) to ) + sizeof (to->header), ((char *) from) + sizeof (from->header), sizeof (*from) - sizeof (from->header)); to->name = new_name; } return new_coding_system; } DEFUN ("coding-system-canonical-name-p", Fcoding_system_canonical_name_p, 1, 1, 0, /* Return t if OBJECT names a coding system, and is not a coding system alias. */ (object)) { return CODING_SYSTEMP (Fgethash (object, Vcoding_system_hash_table, Qnil)) ? Qt : Qnil; } DEFUN ("coding-system-alias-p", Fcoding_system_alias_p, 1, 1, 0, /* Return t if OBJECT is a coding system alias. All coding system aliases are created by `define-coding-system-alias'. */ (object)) { return SYMBOLP (Fgethash (object, Vcoding_system_hash_table, Qzero)) ? Qt : Qnil; } DEFUN ("coding-system-aliasee", Fcoding_system_aliasee, 1, 1, 0, /* Return the coding-system symbol for which symbol ALIAS is an alias. */ (alias)) { Lisp_Object aliasee = Fgethash (alias, Vcoding_system_hash_table, Qnil); if (SYMBOLP (aliasee)) return aliasee; else invalid_argument ("Symbol is not a coding system alias", alias); return Qnil; /* To keep the compiler happy */ } static Lisp_Object append_suffix_to_symbol (Lisp_Object symbol, const char *ascii_string) { return Fintern (concat2 (Fsymbol_name (symbol), build_string (ascii_string)), Qnil); } /* A maphash function, for removing dangling coding system aliases. */ static int dangling_coding_system_alias_p (Lisp_Object alias, Lisp_Object aliasee, void *dangling_aliases) { if (SYMBOLP (aliasee) && NILP (Fgethash (aliasee, Vcoding_system_hash_table, Qnil))) { (*(int *) dangling_aliases)++; return 1; } else return 0; } DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /* Define symbol ALIAS as an alias for coding system ALIASEE. You can use this function to redefine an alias that has already been defined, but you cannot redefine a name which is the canonical name for a coding system. \(a canonical name of a coding system is what is returned when you call `coding-system-name' on a coding system). ALIASEE itself can be an alias, which allows you to define nested aliases. You are forbidden, however, from creating alias loops or `dangling' aliases. These will be detected, and an error will be signaled if you attempt to do so. If ALIASEE is nil, then ALIAS will simply be undefined. See also `coding-system-alias-p', `coding-system-aliasee', and `coding-system-canonical-name-p'. */ (alias, aliasee)) { Lisp_Object real_coding_system, probe; CHECK_SYMBOL (alias); if (!NILP (Fcoding_system_canonical_name_p (alias))) invalid_change ("Symbol is the canonical name of a coding system and cannot be redefined", alias); if (NILP (aliasee)) { Lisp_Object subsidiary_unix = append_suffix_to_symbol (alias, "-unix"); Lisp_Object subsidiary_dos = append_suffix_to_symbol (alias, "-dos"); Lisp_Object subsidiary_mac = append_suffix_to_symbol (alias, "-mac"); Fremhash (alias, Vcoding_system_hash_table); /* Undefine subsidiary aliases, presumably created by a previous call to this function */ if (! NILP (Fcoding_system_alias_p (subsidiary_unix)) && ! NILP (Fcoding_system_alias_p (subsidiary_dos)) && ! NILP (Fcoding_system_alias_p (subsidiary_mac))) { Fdefine_coding_system_alias (subsidiary_unix, Qnil); Fdefine_coding_system_alias (subsidiary_dos, Qnil); Fdefine_coding_system_alias (subsidiary_mac, Qnil); } /* Undefine dangling coding system aliases. */ { int dangling_aliases; do { dangling_aliases = 0; elisp_map_remhash (dangling_coding_system_alias_p, Vcoding_system_hash_table, &dangling_aliases); } while (dangling_aliases > 0); } return Qnil; } if (CODING_SYSTEMP (aliasee)) aliasee = XCODING_SYSTEM_NAME (aliasee); /* Checks that aliasee names a coding-system */ real_coding_system = Fget_coding_system (aliasee); /* Check for coding system alias loops */ if (EQ (alias, aliasee)) alias_loop: invalid_operation_2 ("Attempt to create a coding system alias loop", alias, aliasee); for (probe = aliasee; SYMBOLP (probe); probe = Fgethash (probe, Vcoding_system_hash_table, Qzero)) { if (EQ (probe, alias)) goto alias_loop; } Fputhash (alias, aliasee, Vcoding_system_hash_table); /* Set up aliases for subsidiaries. #### There must be a better way to handle subsidiary coding systems. */ { static const char *suffixes[] = { "-unix", "-dos", "-mac" }; int i; for (i = 0; i < countof (suffixes); i++) { Lisp_Object alias_subsidiary = append_suffix_to_symbol (alias, suffixes[i]); Lisp_Object aliasee_subsidiary = append_suffix_to_symbol (aliasee, suffixes[i]); if (! NILP (Ffind_coding_system (aliasee_subsidiary))) Fdefine_coding_system_alias (alias_subsidiary, aliasee_subsidiary); } } /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac], but it doesn't look intentional, so I'd rather return something meaningful or nothing at all. */ return Qnil; } static Lisp_Object subsidiary_coding_system (Lisp_Object coding_system, eol_type_t type) { Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system); Lisp_Object new_coding_system; if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT) return coding_system; switch (type) { case EOL_AUTODETECT: return coding_system; case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break; case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break; case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break; default: abort (); return Qnil; } return NILP (new_coding_system) ? coding_system : new_coding_system; } DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /* Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE. */ (coding_system, eol_type)) { coding_system = Fget_coding_system (coding_system); return subsidiary_coding_system (coding_system, symbol_to_eol_type (eol_type)); } /************************************************************************/ /* Coding system accessors */ /************************************************************************/ DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /* Return the doc string for CODING-SYSTEM. */ (coding_system)) { coding_system = Fget_coding_system (coding_system); return XCODING_SYSTEM_DOC_STRING (coding_system); } DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /* Return the type of CODING-SYSTEM. */ (coding_system)) { switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system))) { default: abort (); case CODESYS_AUTODETECT: return Qundecided; #ifdef MULE case CODESYS_SHIFT_JIS: return Qshift_jis; case CODESYS_ISO2022: return Qiso2022; case CODESYS_BIG5: return Qbig5; case CODESYS_UCS4: return Qucs4; case CODESYS_UTF8: return Qutf8; case CODESYS_CCL: return Qccl; #endif case CODESYS_NO_CONVERSION: return Qno_conversion; #ifdef DEBUG_XEMACS case CODESYS_INTERNAL: return Qinternal; #endif } } #ifdef MULE static Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum) { Lisp_Object cs = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum); return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil; } #endif /* MULE */ DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /* Return the PROP property of CODING-SYSTEM. */ (coding_system, prop)) { int i, ok = 0; enum coding_system_type type; coding_system = Fget_coding_system (coding_system); CHECK_SYMBOL (prop); type = XCODING_SYSTEM_TYPE (coding_system); for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++) if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop)) { ok = 1; switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type) { case CODESYS_PROP_ALL_OK: break; #ifdef MULE case CODESYS_PROP_ISO2022: if (type != CODESYS_ISO2022) invalid_argument ("Property only valid in ISO2022 coding systems", prop); break; case CODESYS_PROP_CCL: if (type != CODESYS_CCL) invalid_argument ("Property only valid in CCL coding systems", prop); break; #endif /* MULE */ default: abort (); } } if (!ok) invalid_constant ("Unrecognized property", prop); if (EQ (prop, Qname)) return XCODING_SYSTEM_NAME (coding_system); else if (EQ (prop, Qtype)) return Fcoding_system_type (coding_system); else if (EQ (prop, Qdoc_string)) return XCODING_SYSTEM_DOC_STRING (coding_system); else if (EQ (prop, Qmnemonic)) return XCODING_SYSTEM_MNEMONIC (coding_system); else if (EQ (prop, Qeol_type)) return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system)); else if (EQ (prop, Qeol_lf)) return XCODING_SYSTEM_EOL_LF (coding_system); else if (EQ (prop, Qeol_crlf)) return XCODING_SYSTEM_EOL_CRLF (coding_system); else if (EQ (prop, Qeol_cr)) return XCODING_SYSTEM_EOL_CR (coding_system); else if (EQ (prop, Qpost_read_conversion)) return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system); else if (EQ (prop, Qpre_write_conversion)) return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system); #ifdef MULE else if (type == CODESYS_ISO2022) { if (EQ (prop, Qcharset_g0)) return coding_system_charset (coding_system, 0); else if (EQ (prop, Qcharset_g1)) return coding_system_charset (coding_system, 1); else if (EQ (prop, Qcharset_g2)) return coding_system_charset (coding_system, 2); else if (EQ (prop, Qcharset_g3)) return coding_system_charset (coding_system, 3); #define FORCE_CHARSET(charset_num) \ (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \ (coding_system, charset_num) ? Qt : Qnil) else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0); else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1); else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2); else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3); #define LISP_BOOLEAN(prop) \ (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil) else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT); else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL); else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL); else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN); else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT); else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429); else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED); else if (EQ (prop, Qinput_charset_conversion)) return unparse_charset_conversion_specs (XCODING_SYSTEM (coding_system)->iso2022.input_conv); else if (EQ (prop, Qoutput_charset_conversion)) return unparse_charset_conversion_specs (XCODING_SYSTEM (coding_system)->iso2022.output_conv); else abort (); } else if (type == CODESYS_CCL) { if (EQ (prop, Qdecode)) return XCODING_SYSTEM_CCL_DECODE (coding_system); else if (EQ (prop, Qencode)) return XCODING_SYSTEM_CCL_ENCODE (coding_system); else abort (); } #endif /* MULE */ else abort (); return Qnil; /* not reached */ } /************************************************************************/ /* Coding category functions */ /************************************************************************/ static int decode_coding_category (Lisp_Object symbol) { int i; CHECK_SYMBOL (symbol); for (i = 0; i < CODING_CATEGORY_LAST; i++) if (EQ (coding_category_symbol[i], symbol)) return i; invalid_constant ("Unrecognized coding category", symbol); return 0; /* not reached */ } DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /* Return a list of all recognized coding categories. */ ()) { int i; Lisp_Object list = Qnil; for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--) list = Fcons (coding_category_symbol[i], list); return list; } DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /* Change the priority order of the coding categories. LIST should be list of coding categories, in descending order of priority. Unspecified coding categories will be lower in priority than all specified ones, in the same relative order they were in previously. */ (list)) { int category_to_priority[CODING_CATEGORY_LAST]; int i, j; Lisp_Object rest; /* First generate a list that maps coding categories to priorities. */ for (i = 0; i < CODING_CATEGORY_LAST; i++) category_to_priority[i] = -1; /* Highest priority comes from the specified list. */ i = 0; EXTERNAL_LIST_LOOP (rest, list) { int cat = decode_coding_category (XCAR (rest)); if (category_to_priority[cat] >= 0) sferror ("Duplicate coding category in list", XCAR (rest)); category_to_priority[cat] = i++; } /* Now go through the existing categories by priority to retrieve the categories not yet specified and preserve their priority order. */ for (j = 0; j < CODING_CATEGORY_LAST; j++) { int cat = fcd->coding_category_by_priority[j]; if (category_to_priority[cat] < 0) category_to_priority[cat] = i++; } /* Now we need to construct the inverse of the mapping we just constructed. */ for (i = 0; i < CODING_CATEGORY_LAST; i++) fcd->coding_category_by_priority[category_to_priority[i]] = i; /* Phew! That was confusing. */ return Qnil; } DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /* Return a list of coding categories in descending order of priority. */ ()) { int i; Lisp_Object list = Qnil; for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--) list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]], list); return list; } DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /* Change the coding system associated with a coding category. */ (coding_category, coding_system)) { int cat = decode_coding_category (coding_category); coding_system = Fget_coding_system (coding_system); fcd->coding_category_system[cat] = coding_system; return Qnil; } DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /* Return the coding system associated with a coding category. */ (coding_category)) { int cat = decode_coding_category (coding_category); Lisp_Object sys = fcd->coding_category_system[cat]; if (!NILP (sys)) return XCODING_SYSTEM_NAME (sys); return Qnil; } /************************************************************************/ /* Detecting the encoding of data */ /************************************************************************/ struct detection_state { eol_type_t eol_type; int seen_non_ascii; int mask; #ifdef MULE struct { int mask; int in_second_byte; } big5; struct { int mask; int in_second_byte; } shift_jis; struct { int mask; int in_byte; } ucs4; struct { int mask; int in_byte; } utf8; struct { int mask; int initted; struct iso2022_decoder iso; unsigned int flags; int high_byte_count; unsigned int saw_single_shift:1; } iso2022; #endif struct { int seen_anything; int just_saw_cr; } eol; }; static int acceptable_control_char_p (int c) { switch (c) { /* Allow and ignore control characters that you might reasonably see in a text file */ case '\r': case '\n': case '\t': case 7: /* bell */ case 8: /* backspace */ case 11: /* vertical tab */ case 12: /* form feed */ case 26: /* MS-DOS C-z junk */ case 31: /* '^_' -- for info */ return 1; default: return 0; } } static int mask_has_at_most_one_bit_p (int mask) { /* Perhaps the only thing useful you learn from intensive Microsoft technical interviews */ return (mask & (mask - 1)) == 0; } static eol_type_t detect_eol_type (struct detection_state *st, const Extbyte *src, Bytecount n) { while (n--) { unsigned char c = *(unsigned char *)src++; if (c == '\n') { if (st->eol.just_saw_cr) return EOL_CRLF; else if (st->eol.seen_anything) return EOL_LF; } else if (st->eol.just_saw_cr) return EOL_CR; else if (c == '\r') st->eol.just_saw_cr = 1; else st->eol.just_saw_cr = 0; st->eol.seen_anything = 1; } return EOL_AUTODETECT; } /* Attempt to determine the encoding and EOL type of the given text. Before calling this function for the first type, you must initialize st->eol_type as appropriate and initialize st->mask to ~0. st->eol_type holds the determined EOL type, or EOL_AUTODETECT if not yet known. st->mask holds the determined coding category mask, or ~0 if only ASCII has been seen so far. Returns: 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category is present in st->mask 1 == definitive answers are here for both st->eol_type and st->mask */ static int detect_coding_type (struct detection_state *st, const Extbyte *src, Bytecount n, int just_do_eol) { if (st->eol_type == EOL_AUTODETECT) st->eol_type = detect_eol_type (st, src, n); if (just_do_eol) return st->eol_type != EOL_AUTODETECT; if (!st->seen_non_ascii) { for (; n; n--, src++) { unsigned char c = *(unsigned char *) src; if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80) { st->seen_non_ascii = 1; #ifdef MULE st->shift_jis.mask = ~0; st->big5.mask = ~0; st->ucs4.mask = ~0; st->utf8.mask = ~0; st->iso2022.mask = ~0; #endif break; } } } if (!n) return 0; #ifdef MULE if (!mask_has_at_most_one_bit_p (st->iso2022.mask)) st->iso2022.mask = detect_coding_iso2022 (st, src, n); if (!mask_has_at_most_one_bit_p (st->shift_jis.mask)) st->shift_jis.mask = detect_coding_sjis (st, src, n); if (!mask_has_at_most_one_bit_p (st->big5.mask)) st->big5.mask = detect_coding_big5 (st, src, n); if (!mask_has_at_most_one_bit_p (st->utf8.mask)) st->utf8.mask = detect_coding_utf8 (st, src, n); if (!mask_has_at_most_one_bit_p (st->ucs4.mask)) st->ucs4.mask = detect_coding_ucs4 (st, src, n); st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask | st->utf8.mask | st->ucs4.mask; #endif { int retval = mask_has_at_most_one_bit_p (st->mask); st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK; return retval && st->eol_type != EOL_AUTODETECT; } } static Lisp_Object coding_system_from_mask (int mask) { if (mask == ~0) { /* If the file was entirely or basically ASCII, use the default value of `buffer-file-coding-system'. */ Lisp_Object retval = XBUFFER (Vbuffer_defaults)->buffer_file_coding_system; if (!NILP (retval)) { retval = Ffind_coding_system (retval); if (NILP (retval)) { warn_when_safe (Qbad_variable, Qwarning, "Invalid `default-buffer-file-coding-system', set to nil"); XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil; } } if (NILP (retval)) retval = Fget_coding_system (Qraw_text); return retval; } else { int i; int cat = -1; #ifdef MULE mask = postprocess_iso2022_mask (mask); #endif /* Look through the coding categories by priority and find the first one that is allowed. */ for (i = 0; i < CODING_CATEGORY_LAST; i++) { cat = fcd->coding_category_by_priority[i]; if ((mask & (1 << cat)) && !NILP (fcd->coding_category_system[cat])) break; } if (cat >= 0) return fcd->coding_category_system[cat]; else return Fget_coding_system (Qraw_text); } } /* Given a seekable read stream and potential coding system and EOL type as specified, do any autodetection that is called for. If the coding system and/or EOL type are not `autodetect', they will be left alone; but this function will never return an autodetect coding system or EOL type. This function does not automatically fetch subsidiary coding systems; that should be unnecessary with the explicit eol-type argument. */ #define LENGTH(string_constant) (sizeof (string_constant) - 1) void determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out, eol_type_t *eol_type_in_out) { struct detection_state decst; if (*eol_type_in_out == EOL_AUTODETECT) *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out); xzero (decst); decst.eol_type = *eol_type_in_out; decst.mask = ~0; /* If autodetection is called for, do it now. */ if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT || *eol_type_in_out == EOL_AUTODETECT) { Extbyte buf[4096]; Lisp_Object coding_system = Qnil; Extbyte *p; Bytecount nread = Lstream_read (stream, buf, sizeof (buf)); Extbyte *scan_end; /* Look for initial "-*-"; mode line prefix */ for (p = buf, scan_end = buf + nread - LENGTH ("-*-coding:?-*-"); p <= scan_end && *p != '\n' && *p != '\r'; p++) if (*p == '-' && *(p+1) == '*' && *(p+2) == '-') { Extbyte *local_vars_beg = p + 3; /* Look for final "-*-"; mode line suffix */ for (p = local_vars_beg, scan_end = buf + nread - LENGTH ("-*-"); p <= scan_end && *p != '\n' && *p != '\r'; p++) if (*p == '-' && *(p+1) == '*' && *(p+2) == '-') { Extbyte *suffix = p; /* Look for "coding:" */ for (p = local_vars_beg, scan_end = suffix - LENGTH ("coding:?"); p <= scan_end; p++) if (memcmp ("coding:", p, LENGTH ("coding:")) == 0 && (p == local_vars_beg || (*(p-1) == ' ' || *(p-1) == '\t' || *(p-1) == ';'))) { Extbyte save; int n; p += LENGTH ("coding:"); while (*p == ' ' || *p == '\t') p++; /* Get coding system name */ save = *suffix; *suffix = '\0'; /* Characters valid in a MIME charset name (rfc 1521), and in a Lisp symbol name. */ n = strspn ( (char *) p, "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" "0123456789" "!$%&*+-.^_{|}~"); *suffix = save; if (n > 0) { save = p[n]; p[n] = '\0'; coding_system = Ffind_coding_system (intern ((char *) p)); p[n] = save; } break; } break; } break; } if (NILP (coding_system)) do { if (detect_coding_type (&decst, buf, nread, XCODING_SYSTEM_TYPE (*codesys_in_out) != CODESYS_AUTODETECT)) break; nread = Lstream_read (stream, buf, sizeof (buf)); if (nread == 0) break; } while (1); else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT) do { if (detect_coding_type (&decst, buf, nread, 1)) break; nread = Lstream_read (stream, buf, sizeof (buf)); if (!nread) break; } while (1); *eol_type_in_out = decst.eol_type; if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT) { if (NILP (coding_system)) *codesys_in_out = coding_system_from_mask (decst.mask); else *codesys_in_out = coding_system; } } /* If we absolutely can't determine the EOL type, just assume LF. */ if (*eol_type_in_out == EOL_AUTODETECT) *eol_type_in_out = EOL_LF; Lstream_rewind (stream); } DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /* Detect coding system of the text in the region between START and END. Return a list of possible coding systems ordered by priority. If only ASCII characters are found, return 'undecided or one of its subsidiary coding systems according to a detected end-of-line type. Optional arg BUFFER defaults to the current buffer. */ (start, end, buffer)) { Lisp_Object val = Qnil; struct buffer *buf = decode_buffer (buffer, 0); Charbpos b, e; Lisp_Object instream, lb_instream; Lstream *istr, *lb_istr; struct detection_state decst; struct gcpro gcpro1, gcpro2; get_buffer_range_char (buf, start, end, &b, &e, 0); lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0); lb_istr = XLSTREAM (lb_instream); instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary)); istr = XLSTREAM (instream); GCPRO2 (instream, lb_instream); xzero (decst); decst.eol_type = EOL_AUTODETECT; decst.mask = ~0; while (1) { Extbyte random_buffer[4096]; Bytecount nread = Lstream_read (istr, random_buffer, sizeof (random_buffer)); if (!nread) break; if (detect_coding_type (&decst, random_buffer, nread, 0)) break; } if (decst.mask == ~0) val = subsidiary_coding_system (Fget_coding_system (Qundecided), decst.eol_type); else { int i; val = Qnil; #ifdef MULE decst.mask = postprocess_iso2022_mask (decst.mask); #endif for (i = CODING_CATEGORY_LAST - 1; i >= 0; i--) { int sys = fcd->coding_category_by_priority[i]; if (decst.mask & (1 << sys)) { Lisp_Object codesys = fcd->coding_category_system[sys]; if (!NILP (codesys)) codesys = subsidiary_coding_system (codesys, decst.eol_type); val = Fcons (codesys, val); } } } Lstream_close (istr); UNGCPRO; Lstream_delete (istr); Lstream_delete (lb_istr); return val; } /************************************************************************/ /* Converting to internal Mule format ("decoding") */ /************************************************************************/ /* A decoding stream is a stream used for decoding text (i.e. converting from some external format to internal format). The decoding-stream object keeps track of the actual coding stream, the stream that is at the other end, and data that needs to be persistent across the lifetime of the stream. */ /* Handle the EOL stuff related to just-read-in character C. EOL_TYPE is the EOL type of the coding stream. FLAGS is the current value of FLAGS in the coding stream, and may be modified by this macro. (The macro only looks at the CODING_STATE_CR flag.) DST is the Dynarr to which the decoded bytes are to be written. You need to also define a local goto label "label_continue_loop" that is at the end of the main character-reading loop. If C is a CR character, then this macro handles it entirely and jumps to label_continue_loop. Otherwise, this macro does not add anything to DST, and continues normally. You should continue processing C normally after this macro. */ #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \ do { \ if (c == '\r') \ { \ if (eol_type == EOL_CR) \ Dynarr_add (dst, '\n'); \ else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \ Dynarr_add (dst, c); \ else \ flags |= CODING_STATE_CR; \ goto label_continue_loop; \ } \ else if (flags & CODING_STATE_CR) \ { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \ if (c != '\n') \ Dynarr_add (dst, '\r'); \ flags &= ~CODING_STATE_CR; \ } \ } while (0) /* C should be a binary character in the range 0 - 255; convert to internal format and add to Dynarr DST. */ #define DECODE_ADD_BINARY_CHAR(c, dst) \ do { \ if (BYTE_ASCII_P (c)) \ Dynarr_add (dst, c); \ else if (BYTE_C1_P (c)) \ { \ Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \ Dynarr_add (dst, c + 0x20); \ } \ else \ { \ Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \ Dynarr_add (dst, c); \ } \ } while (0) #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \ do { \ if (ch) \ { \ DECODE_ADD_BINARY_CHAR (ch, dst); \ ch = 0; \ } \ } while (0) #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \ do { \ if (flags & CODING_STATE_END) \ { \ DECODE_OUTPUT_PARTIAL_CHAR (ch); \ if (flags & CODING_STATE_CR) \ Dynarr_add (dst, '\r'); \ } \ } while (0) #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding) struct decoding_stream { /* Coding system that governs the conversion. */ Lisp_Coding_System *codesys; /* Stream that we read the encoded data from or write the decoded data to. */ Lstream *other_end; /* If we are reading, then we can return only a fixed amount of data, so if the conversion resulted in too much data, we store it here for retrieval the next time around. */ unsigned_char_dynarr *runoff; /* FLAGS holds flags indicating the current state of the decoding. Some of these flags are dependent on the coding system. */ unsigned int flags; /* CH holds a partially built-up character. Since we only deal with one- and two-byte characters at the moment, we only use this to store the first byte of a two-byte character. */ unsigned int ch; /* EOL_TYPE specifies the type of end-of-line conversion that currently applies. We need to keep this separate from the EOL type stored in CODESYS because the latter might indicate automatic EOL-type detection while the former will always indicate a particular EOL type. */ eol_type_t eol_type; #ifdef MULE /* Additional ISO2022 information. We define the structure above because it's also needed by the detection routines. */ struct iso2022_decoder iso2022; /* Additional information (the state of the running CCL program) used by the CCL decoder. */ struct ccl_program ccl; /* counter for UTF-8 or UCS-4 */ unsigned char counter; #endif struct detection_state decst; }; static Bytecount decoding_reader (Lstream *stream, unsigned char *data, Bytecount size); static Bytecount decoding_writer (Lstream *stream, const unsigned char *data, Bytecount size); static int decoding_rewinder (Lstream *stream); static int decoding_seekable_p (Lstream *stream); static int decoding_flusher (Lstream *stream); static int decoding_closer (Lstream *stream); static Lisp_Object decoding_marker (Lisp_Object stream); DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding, sizeof (struct decoding_stream)); static Lisp_Object decoding_marker (Lisp_Object stream) { Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end; Lisp_Object str_obj; /* We do not need to mark the coding systems or charsets stored within the stream because they are stored in a global list and automatically marked. */ XSETLSTREAM (str_obj, str); mark_object (str_obj); if (str->imp->marker) return (str->imp->marker) (str_obj); else return Qnil; } /* Read SIZE bytes of data and store it into DATA. We are a decoding stream so we read data from the other end, decode it, and store it into DATA. */ static Bytecount decoding_reader (Lstream *stream, unsigned char *data, Bytecount size) { struct decoding_stream *str = DECODING_STREAM_DATA (stream); unsigned char *orig_data = data; Bytecount read_size; int error_occurred = 0; /* We need to interface to mule_decode(), which expects to take some amount of data and store the result into a Dynarr. We have mule_decode() store into str->runoff, and take data from there as necessary. */ /* We loop until we have enough data, reading chunks from the other end and decoding it. */ while (1) { /* Take data from the runoff if we can. Make sure to take at most SIZE bytes, and delete the data from the runoff. */ if (Dynarr_length (str->runoff) > 0) { Bytecount chunk = min (size, (Bytecount) Dynarr_length (str->runoff)); memcpy (data, Dynarr_atp (str->runoff, 0), chunk); Dynarr_delete_many (str->runoff, 0, chunk); data += chunk; size -= chunk; } if (size == 0) break; /* No more room for data */ if (str->flags & CODING_STATE_END) /* This means that on the previous iteration, we hit the EOF on the other end. We loop once more so that mule_decode() can output any final stuff it may be holding, or any "go back to a sane state" escape sequences. (This latter makes sense during encoding.) */ break; /* Exhausted the runoff, so get some more. DATA has at least SIZE bytes left of storage in it, so it's OK to read directly into it. (We'll be overwriting above, after we've decoded it into the runoff.) */ read_size = Lstream_read (str->other_end, data, size); if (read_size < 0) { error_occurred = 1; break; } if (read_size == 0) /* There might be some more end data produced in the translation. See the comment above. */ str->flags |= CODING_STATE_END; mule_decode (stream, (Extbyte *) data, str->runoff, read_size); } if (data - orig_data == 0) return error_occurred ? -1 : 0; else return data - orig_data; } static Bytecount decoding_writer (Lstream *stream, const unsigned char *data, Bytecount size) { struct decoding_stream *str = DECODING_STREAM_DATA (stream); Bytecount retval; /* Decode all our data into the runoff, and then attempt to write it all out to the other end. Remove whatever chunk we succeeded in writing. */ mule_decode (stream, (Extbyte *) data, str->runoff, size); retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0), Dynarr_length (str->runoff)); if (retval > 0) Dynarr_delete_many (str->runoff, 0, retval); /* Do NOT return retval. The return value indicates how much of the incoming data was written, not how many bytes were written. */ return size; } static void reset_decoding_stream (struct decoding_stream *str) { #ifdef MULE if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022) { Lisp_Object coding_system; XSETCODING_SYSTEM (coding_system, str->codesys); reset_iso2022 (coding_system, &str->iso2022); } else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL) { setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys)); } str->counter = 0; #endif /* MULE */ if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT || CODING_SYSTEM_EOL_TYPE (str->codesys) == EOL_AUTODETECT) { xzero (str->decst); str->decst.eol_type = EOL_AUTODETECT; str->decst.mask = ~0; } str->flags = str->ch = 0; } static int decoding_rewinder (Lstream *stream) { struct decoding_stream *str = DECODING_STREAM_DATA (stream); reset_decoding_stream (str); Dynarr_reset (str->runoff); return Lstream_rewind (str->other_end); } static int decoding_seekable_p (Lstream *stream) { struct decoding_stream *str = DECODING_STREAM_DATA (stream); return Lstream_seekable_p (str->other_end); } static int decoding_flusher (Lstream *stream) { struct decoding_stream *str = DECODING_STREAM_DATA (stream); return Lstream_flush (str->other_end); } static int decoding_closer (Lstream *stream) { struct decoding_stream *str = DECODING_STREAM_DATA (stream); if (stream->flags & LSTREAM_FL_WRITE) { str->flags |= CODING_STATE_END; decoding_writer (stream, 0, 0); } Dynarr_free (str->runoff); #ifdef MULE #ifdef ENABLE_COMPOSITE_CHARS if (str->iso2022.composite_chars) Dynarr_free (str->iso2022.composite_chars); #endif #endif return Lstream_close (str->other_end); } Lisp_Object decoding_stream_coding_system (Lstream *stream) { Lisp_Object coding_system; struct decoding_stream *str = DECODING_STREAM_DATA (stream); XSETCODING_SYSTEM (coding_system, str->codesys); return subsidiary_coding_system (coding_system, str->eol_type); } void set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys) { Lisp_Coding_System *cs = XCODING_SYSTEM (codesys); struct decoding_stream *str = DECODING_STREAM_DATA (lstr); str->codesys = cs; if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT) str->eol_type = CODING_SYSTEM_EOL_TYPE (cs); reset_decoding_stream (str); } /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding stream for writing, no automatic code detection will be performed. The reason for this is that automatic code detection requires a seekable input. Things will also fail if you open a decoding stream for reading using a non-fully-specified coding system and a non-seekable input stream. */ static Lisp_Object make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys, const char *mode) { Lstream *lstr = Lstream_new (lstream_decoding, mode); struct decoding_stream *str = DECODING_STREAM_DATA (lstr); Lisp_Object obj; xzero (*str); str->other_end = stream; str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char); str->eol_type = EOL_AUTODETECT; if (!strcmp (mode, "r") && Lstream_seekable_p (stream)) /* We can determine the coding system now. */ determine_real_coding_system (stream, &codesys, &str->eol_type); set_decoding_stream_coding_system (lstr, codesys); str->decst.eol_type = str->eol_type; str->decst.mask = ~0; XSETLSTREAM (obj, lstr); return obj; } Lisp_Object make_decoding_input_stream (Lstream *stream, Lisp_Object codesys) { return make_decoding_stream_1 (stream, codesys, "r"); } Lisp_Object make_decoding_output_stream (Lstream *stream, Lisp_Object codesys) { return make_decoding_stream_1 (stream, codesys, "w"); } /* Note: the decode_coding_* functions all take the same arguments as mule_decode(), which is to say some SRC data of size N, which is to be stored into dynamic array DST. DECODING is the stream within which the decoding is taking place, but no data is actually read from or written to that stream; that is handled in decoding_reader() or decoding_writer(). This allows the same functions to be used for both reading and writing. */ static void mule_decode (Lstream *decoding, const Extbyte *src, unsigned_char_dynarr *dst, Bytecount n) { struct decoding_stream *str = DECODING_STREAM_DATA (decoding); /* If necessary, do encoding-detection now. We do this when we're a writing stream or a non-seekable reading stream, meaning that we can't just process the whole input, rewind, and start over. */ if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT || str->eol_type == EOL_AUTODETECT) { Lisp_Object codesys; XSETCODING_SYSTEM (codesys, str->codesys); detect_coding_type (&str->decst, src, n, CODING_SYSTEM_TYPE (str->codesys) != CODESYS_AUTODETECT); if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT && str->decst.mask != ~0) /* #### This is cheesy. What we really ought to do is buffer up a certain amount of data so as to get a less random result. */ codesys = coding_system_from_mask (str->decst.mask); str->eol_type = str->decst.eol_type; if (XCODING_SYSTEM (codesys) != str->codesys) { /* Preserve the CODING_STATE_END flag in case it was set. If we erase it, bad things might happen. */ int was_end = str->flags & CODING_STATE_END; set_decoding_stream_coding_system (decoding, codesys); if (was_end) str->flags |= CODING_STATE_END; } } switch (CODING_SYSTEM_TYPE (str->codesys)) { #ifdef DEBUG_XEMACS case CODESYS_INTERNAL: Dynarr_add_many (dst, src, n); break; #endif case CODESYS_AUTODETECT: /* If we got this far and still haven't decided on the coding system, then do no conversion. */ case CODESYS_NO_CONVERSION: decode_coding_no_conversion (decoding, src, dst, n); break; #ifdef MULE case CODESYS_SHIFT_JIS: decode_coding_sjis (decoding, src, dst, n); break; case CODESYS_BIG5: decode_coding_big5 (decoding, src, dst, n); break; case CODESYS_UCS4: decode_coding_ucs4 (decoding, src, dst, n); break; case CODESYS_UTF8: decode_coding_utf8 (decoding, src, dst, n); break; case CODESYS_CCL: str->ccl.last_block = str->flags & CODING_STATE_END; /* When applying ccl program to stream, MUST NOT set NULL pointer to src. */ ccl_driver (&str->ccl, (src ? (unsigned char *)src : (unsigned char*)""), dst, n, 0, CCL_MODE_DECODING); break; case CODESYS_ISO2022: decode_coding_iso2022 (decoding, src, dst, n); break; #endif /* MULE */ default: abort (); } } DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /* Decode the text between START and END which is encoded in CODING-SYSTEM. This is useful if you've read in encoded text from a file without decoding it (e.g. you read in a JIS-formatted file but used the `binary' or `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B"). Return length of decoded text. BUFFER defaults to the current buffer if unspecified. */ (start, end, coding_system, buffer)) { Charbpos b, e; struct buffer *buf = decode_buffer (buffer, 0); Lisp_Object instream, lb_outstream, de_outstream, outstream; Lstream *istr, *ostr; struct gcpro gcpro1, gcpro2, gcpro3, gcpro4; get_buffer_range_char (buf, start, end, &b, &e, 0); barf_if_buffer_read_only (buf, b, e); coding_system = Fget_coding_system (coding_system); instream = make_lisp_buffer_input_stream (buf, b, e, 0); lb_outstream = make_lisp_buffer_output_stream (buf, b, 0); de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream), coding_system); outstream = make_encoding_output_stream (XLSTREAM (de_outstream), Fget_coding_system (Qbinary)); istr = XLSTREAM (instream); ostr = XLSTREAM (outstream); GCPRO4 (instream, lb_outstream, de_outstream, outstream); /* The chain of streams looks like this: [BUFFER] <----- send through ------> [ENCODE AS BINARY] ------> [DECODE AS SPECIFIED] ------> [BUFFER] */ while (1) { char tempbuf[1024]; /* some random amount */ Charbpos newpos, even_newer_pos; Charbpos oldpos = lisp_buffer_stream_startpos (istr); Bytecount size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf)); if (!size_in_bytes) break; newpos = lisp_buffer_stream_startpos (istr); Lstream_write (ostr, tempbuf, size_in_bytes); even_newer_pos = lisp_buffer_stream_startpos (istr); buffer_delete_range (buf, even_newer_pos - (newpos - oldpos), even_newer_pos, 0); } Lstream_close (istr); Lstream_close (ostr); UNGCPRO; Lstream_delete (istr); Lstream_delete (ostr); Lstream_delete (XLSTREAM (de_outstream)); Lstream_delete (XLSTREAM (lb_outstream)); return Qnil; } /************************************************************************/ /* Converting to an external encoding ("encoding") */ /************************************************************************/ /* An encoding stream is an output stream. When you create the stream, you specify the coding system that governs the encoding and another stream that the resulting encoded data is to be sent to, and then start sending data to it. */ #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding) struct encoding_stream { /* Coding system that governs the conversion. */ Lisp_Coding_System *codesys; /* Stream that we read the encoded data from or write the decoded data to. */ Lstream *other_end; /* If we are reading, then we can return only a fixed amount of data, so if the conversion resulted in too much data, we store it here for retrieval the next time around. */ unsigned_char_dynarr *runoff; /* FLAGS holds flags indicating the current state of the encoding. Some of these flags are dependent on the coding system. */ unsigned int flags; /* CH holds a partially built-up character. Since we only deal with one- and two-byte characters at the moment, we only use this to store the first byte of a two-byte character. */ unsigned int ch; #ifdef MULE /* Additional information used by the ISO2022 encoder. */ struct { /* CHARSET holds the character sets currently assigned to the G0 through G3 registers. It is initialized from the array INITIAL_CHARSET in CODESYS. */ Lisp_Object charset[4]; /* Which registers are currently invoked into the left (GL) and right (GR) halves of the 8-bit encoding space? */ int register_left, register_right; /* Whether we need to explicitly designate the charset in the G? register before using it. It is initialized from the array FORCE_CHARSET_ON_OUTPUT in CODESYS. */ unsigned char force_charset_on_output[4]; /* Other state variables that need to be preserved across invocations. */ Lisp_Object current_charset; int current_half; int current_char_boundary; } iso2022; /* Additional information (the state of the running CCL program) used by the CCL encoder. */ struct ccl_program ccl; #endif /* MULE */ }; static Bytecount encoding_reader (Lstream *stream, unsigned char *data, Bytecount size); static Bytecount encoding_writer (Lstream *stream, const unsigned char *data, Bytecount size); static int encoding_rewinder (Lstream *stream); static int encoding_seekable_p (Lstream *stream); static int encoding_flusher (Lstream *stream); static int encoding_closer (Lstream *stream); static Lisp_Object encoding_marker (Lisp_Object stream); DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding, sizeof (struct encoding_stream)); static Lisp_Object encoding_marker (Lisp_Object stream) { Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end; Lisp_Object str_obj; /* We do not need to mark the coding systems or charsets stored within the stream because they are stored in a global list and automatically marked. */ XSETLSTREAM (str_obj, str); mark_object (str_obj); if (str->imp->marker) return (str->imp->marker) (str_obj); else return Qnil; } /* Read SIZE bytes of data and store it into DATA. We are a encoding stream so we read data from the other end, encode it, and store it into DATA. */ static Bytecount encoding_reader (Lstream *stream, unsigned char *data, Bytecount size) { struct encoding_stream *str = ENCODING_STREAM_DATA (stream); unsigned char *orig_data = data; Bytecount read_size; int error_occurred = 0; /* We need to interface to mule_encode(), which expects to take some amount of data and store the result into a Dynarr. We have mule_encode() store into str->runoff, and take data from there as necessary. */ /* We loop until we have enough data, reading chunks from the other end and encoding it. */ while (1) { /* Take data from the runoff if we can. Make sure to take at most SIZE bytes, and delete the data from the runoff. */ if (Dynarr_length (str->runoff) > 0) { int chunk = min ((int) size, Dynarr_length (str->runoff)); memcpy (data, Dynarr_atp (str->runoff, 0), chunk); Dynarr_delete_many (str->runoff, 0, chunk); data += chunk; size -= chunk; } if (size == 0) break; /* No more room for data */ if (str->flags & CODING_STATE_END) /* This means that on the previous iteration, we hit the EOF on the other end. We loop once more so that mule_encode() can output any final stuff it may be holding, or any "go back to a sane state" escape sequences. (This latter makes sense during encoding.) */ break; /* Exhausted the runoff, so get some more. DATA at least SIZE bytes left of storage in it, so it's OK to read directly into it. (We'll be overwriting above, after we've encoded it into the runoff.) */ read_size = Lstream_read (str->other_end, data, size); if (read_size < 0) { error_occurred = 1; break; } if (read_size == 0) /* There might be some more end data produced in the translation. See the comment above. */ str->flags |= CODING_STATE_END; mule_encode (stream, data, str->runoff, read_size); } if (data == orig_data) return error_occurred ? -1 : 0; else return data - orig_data; } static Bytecount encoding_writer (Lstream *stream, const unsigned char *data, Bytecount size) { struct encoding_stream *str = ENCODING_STREAM_DATA (stream); Bytecount retval; /* Encode all our data into the runoff, and then attempt to write it all out to the other end. Remove whatever chunk we succeeded in writing. */ mule_encode (stream, data, str->runoff, size); retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0), Dynarr_length (str->runoff)); if (retval > 0) Dynarr_delete_many (str->runoff, 0, retval); /* Do NOT return retval. The return value indicates how much of the incoming data was written, not how many bytes were written. */ return size; } static void reset_encoding_stream (struct encoding_stream *str) { #ifdef MULE switch (CODING_SYSTEM_TYPE (str->codesys)) { case CODESYS_ISO2022: { int i; for (i = 0; i < 4; i++) { str->iso2022.charset[i] = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i); str->iso2022.force_charset_on_output[i] = CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i); } str->iso2022.register_left = 0; str->iso2022.register_right = 1; str->iso2022.current_charset = Qnil; str->iso2022.current_half = 0; str->iso2022.current_char_boundary = 1; break; } case CODESYS_CCL: setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys)); break; default: break; } #endif /* MULE */ str->flags = str->ch = 0; } static int encoding_rewinder (Lstream *stream) { struct encoding_stream *str = ENCODING_STREAM_DATA (stream); reset_encoding_stream (str); Dynarr_reset (str->runoff); return Lstream_rewind (str->other_end); } static int encoding_seekable_p (Lstream *stream) { struct encoding_stream *str = ENCODING_STREAM_DATA (stream); return Lstream_seekable_p (str->other_end); } static int encoding_flusher (Lstream *stream) { struct encoding_stream *str = ENCODING_STREAM_DATA (stream); return Lstream_flush (str->other_end); } static int encoding_closer (Lstream *stream) { struct encoding_stream *str = ENCODING_STREAM_DATA (stream); if (stream->flags & LSTREAM_FL_WRITE) { str->flags |= CODING_STATE_END; encoding_writer (stream, 0, 0); } Dynarr_free (str->runoff); return Lstream_close (str->other_end); } Lisp_Object encoding_stream_coding_system (Lstream *stream) { Lisp_Object coding_system; struct encoding_stream *str = ENCODING_STREAM_DATA (stream); XSETCODING_SYSTEM (coding_system, str->codesys); return coding_system; } void set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys) { Lisp_Coding_System *cs = XCODING_SYSTEM (codesys); struct encoding_stream *str = ENCODING_STREAM_DATA (lstr); str->codesys = cs; reset_encoding_stream (str); } static Lisp_Object make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys, const char *mode) { Lstream *lstr = Lstream_new (lstream_encoding, mode); struct encoding_stream *str = ENCODING_STREAM_DATA (lstr); Lisp_Object obj; xzero (*str); str->runoff = Dynarr_new (unsigned_char); str->other_end = stream; set_encoding_stream_coding_system (lstr, codesys); XSETLSTREAM (obj, lstr); return obj; } Lisp_Object make_encoding_input_stream (Lstream *stream, Lisp_Object codesys) { return make_encoding_stream_1 (stream, codesys, "r"); } Lisp_Object make_encoding_output_stream (Lstream *stream, Lisp_Object codesys) { return make_encoding_stream_1 (stream, codesys, "w"); } /* Convert N bytes of internally-formatted data stored in SRC to an external format, according to the encoding stream ENCODING. Store the encoded data into DST. */ static void mule_encode (Lstream *encoding, const Intbyte *src, unsigned_char_dynarr *dst, Bytecount n) { struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); switch (CODING_SYSTEM_TYPE (str->codesys)) { #ifdef DEBUG_XEMACS case CODESYS_INTERNAL: Dynarr_add_many (dst, src, n); break; #endif case CODESYS_AUTODETECT: /* If we got this far and still haven't decided on the coding system, then do no conversion. */ case CODESYS_NO_CONVERSION: encode_coding_no_conversion (encoding, src, dst, n); break; #ifdef MULE case CODESYS_SHIFT_JIS: encode_coding_sjis (encoding, src, dst, n); break; case CODESYS_BIG5: encode_coding_big5 (encoding, src, dst, n); break; case CODESYS_UCS4: encode_coding_ucs4 (encoding, src, dst, n); break; case CODESYS_UTF8: encode_coding_utf8 (encoding, src, dst, n); break; case CODESYS_CCL: str->ccl.last_block = str->flags & CODING_STATE_END; /* When applying ccl program to stream, MUST NOT set NULL pointer to src. */ ccl_driver (&str->ccl, ((src) ? src : (unsigned char*)""), dst, n, 0, CCL_MODE_ENCODING); break; case CODESYS_ISO2022: encode_coding_iso2022 (encoding, src, dst, n); break; #endif /* MULE */ default: abort (); } } DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /* Encode the text between START and END using CODING-SYSTEM. This will, for example, convert Japanese characters into stuff such as "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded text. BUFFER defaults to the current buffer if unspecified. */ (start, end, coding_system, buffer)) { Charbpos b, e; struct buffer *buf = decode_buffer (buffer, 0); Lisp_Object instream, lb_outstream, de_outstream, outstream; Lstream *istr, *ostr; struct gcpro gcpro1, gcpro2, gcpro3, gcpro4; get_buffer_range_char (buf, start, end, &b, &e, 0); barf_if_buffer_read_only (buf, b, e); coding_system = Fget_coding_system (coding_system); instream = make_lisp_buffer_input_stream (buf, b, e, 0); lb_outstream = make_lisp_buffer_output_stream (buf, b, 0); de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream), Fget_coding_system (Qbinary)); outstream = make_encoding_output_stream (XLSTREAM (de_outstream), coding_system); istr = XLSTREAM (instream); ostr = XLSTREAM (outstream); GCPRO4 (instream, outstream, de_outstream, lb_outstream); /* The chain of streams looks like this: [BUFFER] <----- send through ------> [ENCODE AS SPECIFIED] ------> [DECODE AS BINARY] ------> [BUFFER] */ while (1) { char tempbuf[1024]; /* some random amount */ Charbpos newpos, even_newer_pos; Charbpos oldpos = lisp_buffer_stream_startpos (istr); Bytecount size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf)); if (!size_in_bytes) break; newpos = lisp_buffer_stream_startpos (istr); Lstream_write (ostr, tempbuf, size_in_bytes); even_newer_pos = lisp_buffer_stream_startpos (istr); buffer_delete_range (buf, even_newer_pos - (newpos - oldpos), even_newer_pos, 0); } { Charcount retlen = lisp_buffer_stream_startpos (XLSTREAM (instream)) - b; Lstream_close (istr); Lstream_close (ostr); UNGCPRO; Lstream_delete (istr); Lstream_delete (ostr); Lstream_delete (XLSTREAM (de_outstream)); Lstream_delete (XLSTREAM (lb_outstream)); return make_int (retlen); } } #ifdef MULE /************************************************************************/ /* Shift-JIS methods */ /************************************************************************/ /* Shift-JIS is a coding system encoding three character sets: ASCII, right half of JISX0201-Kana, and JISX0208. An ASCII character is encoded as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is encoded by "position-code + 0x80". A character of JISX0208 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two position-codes are divided and shifted so that it fit in the range below. --- CODE RANGE of Shift-JIS --- (character set) (range) ASCII 0x00 .. 0x7F JISX0201-Kana 0xA0 .. 0xDF JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC ------------------------------- */ /* Is this the first byte of a Shift-JIS two-byte char? */ #define BYTE_SJIS_TWO_BYTE_1_P(c) \ (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF)) /* Is this the second byte of a Shift-JIS two-byte char? */ #define BYTE_SJIS_TWO_BYTE_2_P(c) \ (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC)) #define BYTE_SJIS_KATAKANA_P(c) \ ((c) >= 0xA1 && (c) <= 0xDF) static int detect_coding_sjis (struct detection_state *st, const Extbyte *src, Bytecount n) { while (n--) { unsigned char c = *(unsigned char *)src++; if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) return 0; if (st->shift_jis.in_second_byte) { st->shift_jis.in_second_byte = 0; if (c < 0x40) return 0; } else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0) st->shift_jis.in_second_byte = 1; } return CODING_CATEGORY_SHIFT_JIS_MASK; } /* Convert Shift-JIS data to internal format. */ static void decode_coding_sjis (Lstream *decoding, const Extbyte *src, unsigned_char_dynarr *dst, Bytecount n) { struct decoding_stream *str = DECODING_STREAM_DATA (decoding); unsigned int flags = str->flags; unsigned int ch = str->ch; eol_type_t eol_type = str->eol_type; while (n--) { unsigned char c = *(unsigned char *)src++; if (ch) { /* Previous character was first byte of Shift-JIS Kanji char. */ if (BYTE_SJIS_TWO_BYTE_2_P (c)) { unsigned char e1, e2; Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208); DECODE_SJIS (ch, c, e1, e2); Dynarr_add (dst, e1); Dynarr_add (dst, e2); } else { DECODE_ADD_BINARY_CHAR (ch, dst); DECODE_ADD_BINARY_CHAR (c, dst); } ch = 0; } else { DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst); if (BYTE_SJIS_TWO_BYTE_1_P (c)) ch = c; else if (BYTE_SJIS_KATAKANA_P (c)) { Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201); Dynarr_add (dst, c); } else DECODE_ADD_BINARY_CHAR (c, dst); } label_continue_loop:; } DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst); str->flags = flags; str->ch = ch; } /* Convert internally-formatted data to Shift-JIS. */ static void encode_coding_sjis (Lstream *encoding, const Intbyte *src, unsigned_char_dynarr *dst, Bytecount n) { struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); unsigned int flags = str->flags; unsigned int ch = str->ch; eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); while (n--) { Intbyte c = *src++; if (c == '\n') { if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT) Dynarr_add (dst, '\r'); if (eol_type != EOL_CR) Dynarr_add (dst, '\n'); ch = 0; } else if (BYTE_ASCII_P (c)) { Dynarr_add (dst, c); ch = 0; } else if (INTBYTE_LEADING_BYTE_P (c)) ch = (c == LEADING_BYTE_KATAKANA_JISX0201 || c == LEADING_BYTE_JAPANESE_JISX0208_1978 || c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0; else if (ch) { if (ch == LEADING_BYTE_KATAKANA_JISX0201) { Dynarr_add (dst, c); ch = 0; } else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 || ch == LEADING_BYTE_JAPANESE_JISX0208) ch = c; else { unsigned char j1, j2; ENCODE_SJIS (ch, c, j1, j2); Dynarr_add (dst, j1); Dynarr_add (dst, j2); ch = 0; } } } str->flags = flags; str->ch = ch; } DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /* Decode a JISX0208 character of Shift-JIS coding-system. CODE is the character code in Shift-JIS as a cons of type bytes. Return the corresponding character. */ (code)) { unsigned char c1, c2, s1, s2; CHECK_CONS (code); CHECK_INT (XCAR (code)); CHECK_INT (XCDR (code)); s1 = XINT (XCAR (code)); s2 = XINT (XCDR (code)); if (BYTE_SJIS_TWO_BYTE_1_P (s1) && BYTE_SJIS_TWO_BYTE_2_P (s2)) { DECODE_SJIS (s1, s2, c1, c2); return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208, c1 & 0x7F, c2 & 0x7F)); } else return Qnil; } DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /* Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system. Return the corresponding character code in SHIFT-JIS as a cons of two bytes. */ (character)) { Lisp_Object charset; int c1, c2, s1, s2; CHECK_CHAR_COERCE_INT (character); BREAKUP_CHAR (XCHAR (character), charset, c1, c2); if (EQ (charset, Vcharset_japanese_jisx0208)) { ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2); return Fcons (make_int (s1), make_int (s2)); } else return Qnil; } /************************************************************************/ /* Big5 methods */ /************************************************************************/ /* BIG5 is a coding system encoding two character sets: ASCII and Big5. An ASCII character is encoded as is. Big5 is a two-byte character set and is encoded in two-byte. --- CODE RANGE of BIG5 --- (character set) (range) ASCII 0x00 .. 0x7F Big5 (1st byte) 0xA1 .. 0xFE (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE -------------------------- Since the number of characters in Big5 is larger than maximum characters in Emacs' charset (96x96), it can't be handled as one charset. So, in Emacs, Big5 is divided into two: `charset-big5-1' and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former contains frequently used characters and the latter contains less frequently used characters. */ #define BYTE_BIG5_TWO_BYTE_1_P(c) \ ((c) >= 0xA1 && (c) <= 0xFE) /* Is this the second byte of a Shift-JIS two-byte char? */ #define BYTE_BIG5_TWO_BYTE_2_P(c) \ (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE)) /* Number of Big5 characters which have the same code in 1st byte. */ #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40) /* Code conversion macros. These are macros because they are used in inner loops during code conversion. Note that temporary variables in macros introduce the classic dynamic-scoping problems with variable names. We use capital- lettered variables in the assumption that XEmacs does not use capital letters in variables except in a very formalized way (e.g. Qstring). */ /* Convert Big5 code (b1, b2) into its internal string representation (lb, c1, c2). */ /* There is a much simpler way to split the Big5 charset into two. For the moment I'm going to leave the algorithm as-is because it claims to separate out the most-used characters into a single charset, which perhaps will lead to optimizations in various places. The way the algorithm works is something like this: Big5 can be viewed as a 94x157 charset, where the row is encoded into the bytes 0xA1 .. 0xFE and the column is encoded into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency, the split between low and high column numbers is apparently meaningless; ascending rows produce less and less frequent chars. Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to the first charset, and the upper half (0xC9 .. 0xFE) to the second. To do the conversion, we convert the character into a single number where 0 .. 156 is the first row, 157 .. 313 is the second, etc. That way, the characters are ordered by decreasing frequency. Then we just chop the space in two and coerce the result into a 94x94 space. */ #define DECODE_BIG5(b1, b2, lb, c1, c2) do \ { \ int B1 = b1, B2 = b2; \ int I \ = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \ \ if (B1 < 0xC9) \ { \ lb = LEADING_BYTE_CHINESE_BIG5_1; \ } \ else \ { \ lb = LEADING_BYTE_CHINESE_BIG5_2; \ I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \ } \ c1 = I / (0xFF - 0xA1) + 0xA1; \ c2 = I % (0xFF - 0xA1) + 0xA1; \ } while (0) /* Convert the internal string representation of a Big5 character (lb, c1, c2) into Big5 code (b1, b2). */ #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \ { \ int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \ \ if (lb == LEADING_BYTE_CHINESE_BIG5_2) \ { \ I += BIG5_SAME_ROW * (0xC9 - 0xA1); \ } \ b1 = I / BIG5_SAME_ROW + 0xA1; \ b2 = I % BIG5_SAME_ROW; \ b2 += b2 < 0x3F ? 0x40 : 0x62; \ } while (0) static int detect_coding_big5 (struct detection_state *st, const Extbyte *src, Bytecount n) { while (n--) { unsigned char c = *(unsigned char *)src++; if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO || (c >= 0x80 && c <= 0xA0)) return 0; if (st->big5.in_second_byte) { st->big5.in_second_byte = 0; if (c < 0x40 || (c >= 0x80 && c <= 0xA0)) return 0; } else if (c >= 0xA1) st->big5.in_second_byte = 1; } return CODING_CATEGORY_BIG5_MASK; } /* Convert Big5 data to internal format. */ static void decode_coding_big5 (Lstream *decoding, const Extbyte *src, unsigned_char_dynarr *dst, Bytecount n) { struct decoding_stream *str = DECODING_STREAM_DATA (decoding); unsigned int flags = str->flags; unsigned int ch = str->ch; eol_type_t eol_type = str->eol_type; while (n--) { unsigned char c = *(unsigned char *)src++; if (ch) { /* Previous character was first byte of Big5 char. */ if (BYTE_BIG5_TWO_BYTE_2_P (c)) { unsigned char b1, b2, b3; DECODE_BIG5 (ch, c, b1, b2, b3); Dynarr_add (dst, b1); Dynarr_add (dst, b2); Dynarr_add (dst, b3); } else { DECODE_ADD_BINARY_CHAR (ch, dst); DECODE_ADD_BINARY_CHAR (c, dst); } ch = 0; } else { DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst); if (BYTE_BIG5_TWO_BYTE_1_P (c)) ch = c; else DECODE_ADD_BINARY_CHAR (c, dst); } label_continue_loop:; } DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst); str->flags = flags; str->ch = ch; } /* Convert internally-formatted data to Big5. */ static void encode_coding_big5 (Lstream *encoding, const Intbyte *src, unsigned_char_dynarr *dst, Bytecount n) { unsigned char c; struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); unsigned int flags = str->flags; unsigned int ch = str->ch; eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); while (n--) { c = *src++; if (c == '\n') { if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT) Dynarr_add (dst, '\r'); if (eol_type != EOL_CR) Dynarr_add (dst, '\n'); } else if (BYTE_ASCII_P (c)) { /* ASCII. */ Dynarr_add (dst, c); } else if (INTBYTE_LEADING_BYTE_P (c)) { if (c == LEADING_BYTE_CHINESE_BIG5_1 || c == LEADING_BYTE_CHINESE_BIG5_2) { /* A recognized leading byte. */ ch = c; continue; /* not done with this character. */ } /* otherwise just ignore this character. */ } else if (ch == LEADING_BYTE_CHINESE_BIG5_1 || ch == LEADING_BYTE_CHINESE_BIG5_2) { /* Previous char was a recognized leading byte. */ ch = (ch << 8) | c; continue; /* not done with this character. */ } else if (ch) { /* Encountering second byte of a Big5 character. */ unsigned char b1, b2; ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2); Dynarr_add (dst, b1); Dynarr_add (dst, b2); } ch = 0; } str->flags = flags; str->ch = ch; } DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /* Decode a Big5 character CODE of BIG5 coding-system. CODE is the character code in BIG5, a cons of two integers. Return the corresponding character. */ (code)) { unsigned char c1, c2, b1, b2; CHECK_CONS (code); CHECK_INT (XCAR (code)); CHECK_INT (XCDR (code)); b1 = XINT (XCAR (code)); b2 = XINT (XCDR (code)); if (BYTE_BIG5_TWO_BYTE_1_P (b1) && BYTE_BIG5_TWO_BYTE_2_P (b2)) { int leading_byte; Lisp_Object charset; DECODE_BIG5 (b1, b2, leading_byte, c1, c2); charset = CHARSET_BY_LEADING_BYTE (leading_byte); return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F)); } else return Qnil; } DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /* Encode the Big5 character CHARACTER in the BIG5 coding-system. Return the corresponding character code in Big5. */ (character)) { Lisp_Object charset; int c1, c2, b1, b2; CHECK_CHAR_COERCE_INT (character); BREAKUP_CHAR (XCHAR (character), charset, c1, c2); if (EQ (charset, Vcharset_chinese_big5_1) || EQ (charset, Vcharset_chinese_big5_2)) { ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80, b1, b2); return Fcons (make_int (b1), make_int (b2)); } else return Qnil; } /************************************************************************/ /* UCS-4 methods */ /* */ /* UCS-4 character codes are implemented as nonnegative integers. */ /* */ /************************************************************************/ DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /* Map UCS-4 code CODE to Mule character CHARACTER. Return T on success, NIL on failure. */ (code, character)) { EMACS_INT c; CHECK_CHAR (character); CHECK_NATNUM (code); c = XINT (code); if (c < countof (fcd->ucs_to_mule_table)) { fcd->ucs_to_mule_table[c] = character; return Qt; } else return Qnil; } static Lisp_Object ucs_to_char (unsigned long code) { if (code < countof (fcd->ucs_to_mule_table)) { return fcd->ucs_to_mule_table[code]; } else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14)) { unsigned int c; code -= 0xe00000; c = code % (94 * 94); return make_char (MAKE_CHAR (CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, code / (94 * 94) + '@', CHARSET_LEFT_TO_RIGHT), c / 94 + 33, c % 94 + 33)); } else return Qnil; } DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /* Return Mule character corresponding to UCS code CODE (a positive integer). */ (code)) { CHECK_NATNUM (code); return ucs_to_char (XINT (code)); } DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /* Map Mule character CHARACTER to UCS code CODE (a positive integer). */ (character, code)) { /* #### Isn't this gilding the lily? Fput_char_table checks its args. Fset_char_ucs is more restrictive on index arg, but should check code arg in a char_table method. */ CHECK_CHAR (character); CHECK_NATNUM (code); return Fput_char_table (character, code, mule_to_ucs_table); } DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /* Return the UCS code (a positive integer) corresponding to CHARACTER. */ (character)) { return Fget_char_table (character, mule_to_ucs_table); } /* Decode a UCS-4 character into a buffer. If the lookup fails, use <GETA MARK> (U+3013) of JIS X 0208, which means correct character is not found, instead. #### do something more appropriate (use blob?) Danger, Will Robinson! Data loss. Should we signal user? */ static void decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst) { Lisp_Object chr = ucs_to_char (ch); if (! NILP (chr)) { Intbyte work[MAX_EMCHAR_LEN]; int len; ch = XCHAR (chr); len = (ch < 128) ? simple_set_charptr_emchar (work, ch) : non_ascii_set_charptr_emchar (work, ch); Dynarr_add_many (dst, work, len); } else { Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208); Dynarr_add (dst, 34 + 128); Dynarr_add (dst, 46 + 128); } } static unsigned long mule_char_to_ucs4 (Lisp_Object charset, unsigned char h, unsigned char l) { Lisp_Object code = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)), mule_to_ucs_table); if (INTP (code)) { return XINT (code); } else if ( (XCHARSET_DIMENSION (charset) == 2) && (XCHARSET_CHARS (charset) == 94) ) { unsigned char final = XCHARSET_FINAL (charset); if ( ('@' <= final) && (final < 0x7f) ) { return 0xe00000 + (final - '@') * 94 * 94 + ((h & 127) - 33) * 94 + (l & 127) - 33; } else { return '?'; } } else { return '?'; } } static void encode_ucs4 (Lisp_Object charset, unsigned char h, unsigned char l, unsigned_char_dynarr *dst) { unsigned long code = mule_char_to_ucs4 (charset, h, l); Dynarr_add (dst, code >> 24); Dynarr_add (dst, (code >> 16) & 255); Dynarr_add (dst, (code >> 8) & 255); Dynarr_add (dst, code & 255); } static int detect_coding_ucs4 (struct detection_state *st, const Extbyte *src, Bytecount n) { while (n--) { unsigned char c = *(unsigned char *)src++; switch (st->ucs4.in_byte) { case 0: if (c >= 128) return 0; else st->ucs4.in_byte++; break; case 3: st->ucs4.in_byte = 0; break; default: st->ucs4.in_byte++; } } return CODING_CATEGORY_UCS4_MASK; } static void decode_coding_ucs4 (Lstream *decoding, const Extbyte *src, unsigned_char_dynarr *dst, Bytecount n) { struct decoding_stream *str = DECODING_STREAM_DATA (decoding); unsigned int flags = str->flags; unsigned int ch = str->ch; unsigned char counter = str->counter; while (n--) { unsigned char c = *(unsigned char *)src++; switch (counter) { case 0: ch = c; counter = 3; break; case 1: decode_ucs4 ( ( ch << 8 ) | c, dst); ch = 0; counter = 0; break; default: ch = ( ch << 8 ) | c; counter--; } } if (counter & CODING_STATE_END) DECODE_OUTPUT_PARTIAL_CHAR (ch); str->flags = flags; str->ch = ch; str->counter = counter; } static void encode_coding_ucs4 (Lstream *encoding, const Intbyte *src, unsigned_char_dynarr *dst, Bytecount n) { struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); unsigned int flags = str->flags; unsigned int ch = str->ch; unsigned char char_boundary = str->iso2022.current_char_boundary; Lisp_Object charset = str->iso2022.current_charset; #ifdef ENABLE_COMPOSITE_CHARS /* flags for handling composite chars. We do a little switcharoo on the source while we're outputting the composite char. */ Bytecount saved_n = 0; const unsigned char *saved_src = NULL; int in_composite = 0; back_to_square_n: #endif while (n--) { unsigned char c = *src++; if (BYTE_ASCII_P (c)) { /* Processing ASCII character */ ch = 0; encode_ucs4 (Vcharset_ascii, c, 0, dst); char_boundary = 1; } else if (INTBYTE_LEADING_BYTE_P (c) || INTBYTE_LEADING_BYTE_P (ch)) { /* Processing Leading Byte */ ch = 0; charset = CHARSET_BY_LEADING_BYTE (c); if (LEADING_BYTE_PREFIX_P(c)) ch = c; char_boundary = 0; } else { /* Processing Non-ASCII character */ char_boundary = 1; if (EQ (charset, Vcharset_control_1)) { encode_ucs4 (Vcharset_control_1, c, 0, dst); } else { switch (XCHARSET_REP_BYTES (charset)) { case 2: encode_ucs4 (charset, c, 0, dst); break; case 3: if (XCHARSET_PRIVATE_P (charset)) { encode_ucs4 (charset, c, 0, dst); ch = 0; } else if (ch) { #ifdef ENABLE_COMPOSITE_CHARS if (EQ (charset, Vcharset_composite)) { if (in_composite) { /* #### Bother! We don't know how to handle this yet. */ Dynarr_add (dst, '\0'); Dynarr_add (dst, '\0'); Dynarr_add (dst, '\0'); Dynarr_add (dst, '~'); } else { Emchar emch = MAKE_CHAR (Vcharset_composite, ch & 0x7F, c & 0x7F); Lisp_Object lstr = composite_char_string (emch); saved_n = n; saved_src = src; in_composite = 1; src = XSTRING_DATA (lstr); n = XSTRING_LENGTH (lstr); } } else #endif /* ENABLE_COMPOSITE_CHARS */ { encode_ucs4(charset, ch, c, dst); } ch = 0; } else { ch = c; char_boundary = 0; } break; case 4: if (ch) { encode_ucs4 (charset, ch, c, dst); ch = 0; } else { ch = c; char_boundary = 0; } break; default: abort (); } } } } #ifdef ENABLE_COMPOSITE_CHARS if (in_composite) { n = saved_n; src = saved_src; in_composite = 0; goto back_to_square_n; /* Wheeeeeeeee ..... */ } #endif /* ENABLE_COMPOSITE_CHARS */ str->flags = flags; str->ch = ch; str->iso2022.current_char_boundary = char_boundary; str->iso2022.current_charset = charset; /* Verbum caro factum est! */ } /************************************************************************/ /* UTF-8 methods */ /************************************************************************/ static int detect_coding_utf8 (struct detection_state *st, const Extbyte *src, Bytecount n) { while (n--) { unsigned char c = *(unsigned char *)src++; switch (st->utf8.in_byte) { case 0: if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) return 0; else if (c >= 0xfc) st->utf8.in_byte = 5; else if (c >= 0xf8) st->utf8.in_byte = 4; else if (c >= 0xf0) st->utf8.in_byte = 3; else if (c >= 0xe0) st->utf8.in_byte = 2; else if (c >= 0xc0) st->utf8.in_byte = 1; else if (c >= 0x80) return 0; break; default: if ((c & 0xc0) != 0x80) return 0; else st->utf8.in_byte--; } } return CODING_CATEGORY_UTF8_MASK; } static void decode_coding_utf8 (Lstream *decoding, const Extbyte *src, unsigned_char_dynarr *dst, Bytecount n) { struct decoding_stream *str = DECODING_STREAM_DATA (decoding); unsigned int flags = str->flags; unsigned int ch = str->ch; eol_type_t eol_type = str->eol_type; unsigned char counter = str->counter; while (n--) { unsigned char c = *(unsigned char *)src++; switch (counter) { case 0: if ( c >= 0xfc ) { ch = c & 0x01; counter = 5; } else if ( c >= 0xf8 ) { ch = c & 0x03; counter = 4; } else if ( c >= 0xf0 ) { ch = c & 0x07; counter = 3; } else if ( c >= 0xe0 ) { ch = c & 0x0f; counter = 2; } else if ( c >= 0xc0 ) { ch = c & 0x1f; counter = 1; } else { DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst); decode_ucs4 (c, dst); } break; case 1: ch = ( ch << 6 ) | ( c & 0x3f ); decode_ucs4 (ch, dst); ch = 0; counter = 0; break; default: ch = ( ch << 6 ) | ( c & 0x3f ); counter--; } label_continue_loop:; } if (flags & CODING_STATE_END) DECODE_OUTPUT_PARTIAL_CHAR (ch); str->flags = flags; str->ch = ch; str->counter = counter; } static void encode_utf8 (Lisp_Object charset, unsigned char h, unsigned char l, unsigned_char_dynarr *dst) { unsigned long code = mule_char_to_ucs4 (charset, h, l); if ( code <= 0x7f ) { Dynarr_add (dst, code); } else if ( code <= 0x7ff ) { Dynarr_add (dst, (code >> 6) | 0xc0); Dynarr_add (dst, (code & 0x3f) | 0x80); } else if ( code <= 0xffff ) { Dynarr_add (dst, (code >> 12) | 0xe0); Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80); Dynarr_add (dst, (code & 0x3f) | 0x80); } else if ( code <= 0x1fffff ) { Dynarr_add (dst, (code >> 18) | 0xf0); Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80); Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80); Dynarr_add (dst, (code & 0x3f) | 0x80); } else if ( code <= 0x3ffffff ) { Dynarr_add (dst, (code >> 24) | 0xf8); Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80); Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80); Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80); Dynarr_add (dst, (code & 0x3f) | 0x80); } else { Dynarr_add (dst, (code >> 30) | 0xfc); Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80); Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80); Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80); Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80); Dynarr_add (dst, (code & 0x3f) | 0x80); } } static void encode_coding_utf8 (Lstream *encoding, const Intbyte *src, unsigned_char_dynarr *dst, Bytecount n) { struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); unsigned int flags = str->flags; unsigned int ch = str->ch; eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); unsigned char char_boundary = str->iso2022.current_char_boundary; Lisp_Object charset = str->iso2022.current_charset; #ifdef ENABLE_COMPOSITE_CHARS /* flags for handling composite chars. We do a little switcharoo on the source while we're outputting the composite char. */ Bytecount saved_n = 0; const unsigned char *saved_src = NULL; int in_composite = 0; back_to_square_n: #endif /* ENABLE_COMPOSITE_CHARS */ while (n--) { unsigned char c = *src++; if (BYTE_ASCII_P (c)) { /* Processing ASCII character */ ch = 0; if (c == '\n') { if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT) Dynarr_add (dst, '\r'); if (eol_type != EOL_CR) Dynarr_add (dst, c); } else encode_utf8 (Vcharset_ascii, c, 0, dst); char_boundary = 1; } else if (INTBYTE_LEADING_BYTE_P (c) || INTBYTE_LEADING_BYTE_P (ch)) { /* Processing Leading Byte */ ch = 0; charset = CHARSET_BY_LEADING_BYTE (c); if (LEADING_BYTE_PREFIX_P(c)) ch = c; char_boundary = 0; } else { /* Processing Non-ASCII character */ char_boundary = 1; if (EQ (charset, Vcharset_control_1)) { encode_utf8 (Vcharset_control_1, c, 0, dst); } else { switch (XCHARSET_REP_BYTES (charset)) { case 2: encode_utf8 (charset, c, 0, dst); break; case 3: if (XCHARSET_PRIVATE_P (charset)) { encode_utf8 (charset, c, 0, dst); ch = 0; } else if (ch) { #ifdef ENABLE_COMPOSITE_CHARS if (EQ (charset, Vcharset_composite)) { if (in_composite) { /* #### Bother! We don't know how to handle this yet. */ encode_utf8 (Vcharset_ascii, '~', 0, dst); } else { Emchar emch = MAKE_CHAR (Vcharset_composite, ch & 0x7F, c & 0x7F); Lisp_Object lstr = composite_char_string (emch); saved_n = n; saved_src = src; in_composite = 1; src = XSTRING_DATA (lstr); n = XSTRING_LENGTH (lstr); } } else #endif /* ENABLE_COMPOSITE_CHARS */ { encode_utf8 (charset, ch, c, dst); } ch = 0; } else { ch = c; char_boundary = 0; } break; case 4: if (ch) { encode_utf8 (charset, ch, c, dst); ch = 0; } else { ch = c; char_boundary = 0; } break; default: abort (); } } } } #ifdef ENABLE_COMPOSITE_CHARS if (in_composite) { n = saved_n; src = saved_src; in_composite = 0; goto back_to_square_n; /* Wheeeeeeeee ..... */ } #endif str->flags = flags; str->ch = ch; str->iso2022.current_char_boundary = char_boundary; str->iso2022.current_charset = charset; /* Verbum caro factum est! */ } /************************************************************************/ /* ISO2022 methods */ /************************************************************************/ /* The following note describes the coding system ISO2022 briefly. Since the intention of this note is to help understand the functions in this file, some parts are NOT ACCURATE or OVERLY SIMPLIFIED. For thorough understanding, please refer to the original document of ISO2022. ISO2022 provides many mechanisms to encode several character sets in 7-bit and 8-bit environments. For 7-bit environments, all text is encoded using bytes less than 128. This may make the encoded text a little bit longer, but the text passes more easily through several gateways, some of which strip off MSB (Most Signigant Bit). There are two kinds of character sets: control character set and graphic character set. The former contains control characters such as `newline' and `escape' to provide control functions (control functions are also provided by escape sequences). The latter contains graphic characters such as 'A' and '-'. Emacs recognizes two control character sets and many graphic character sets. Graphic character sets are classified into one of the following four classes, according to the number of bytes (DIMENSION) and number of characters in one dimension (CHARS) of the set: - DIMENSION1_CHARS94 - DIMENSION1_CHARS96 - DIMENSION2_CHARS94 - DIMENSION2_CHARS96 In addition, each character set is assigned an identification tag, unique for each set, called "final character" (denoted as <F> hereafter). The <F> of each character set is decided by ECMA(*) when it is registered in ISO. The code range of <F> is 0x30..0x7F (0x30..0x3F are for private use only). Note (*): ECMA = European Computer Manufacturers Association Here are examples of graphic character set [NAME(<F>)]: o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ... o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ... o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ... o DIMENSION2_CHARS96 -- none for the moment A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR. C0 [0x00..0x1F] -- control character plane 0 GL [0x20..0x7F] -- graphic character plane 0 C1 [0x80..0x9F] -- control character plane 1 GR [0xA0..0xFF] -- graphic character plane 1 A control character set is directly designated and invoked to C0 or C1 by an escape sequence. The most common case is that: - ISO646's control character set is designated/invoked to C0, and - ISO6429's control character set is designated/invoked to C1, and usually these designations/invocations are omitted in encoded text. In a 7-bit environment, only C0 can be used, and a control character for C1 is encoded by an appropriate escape sequence to fit into the environment. All control characters for C1 are defined to have corresponding escape sequences. A graphic character set is at first designated to one of four graphic registers (G0 through G3), then these graphic registers are invoked to GL or GR. These designations and invocations can be done independently. The most common case is that G0 is invoked to GL, G1 is invoked to GR, and ASCII is designated to G0. Usually these invocations and designations are omitted in encoded text. In a 7-bit environment, only GL can be used. When a graphic character set of CHARS94 is invoked to GL, codes 0x20 and 0x7F of the GL area work as control characters SPACE and DEL respectively, and codes 0xA0 and 0xFF of the GR area should not be used. There are two ways of invocation: locking-shift and single-shift. With locking-shift, the invocation lasts until the next different invocation, whereas with single-shift, the invocation affects the following character only and doesn't affect the locking-shift state. Invocations are done by the following control characters or escape sequences: ---------------------------------------------------------------------- abbrev function cntrl escape seq description ---------------------------------------------------------------------- SI/LS0 (shift-in) 0x0F none invoke G0 into GL SO/LS1 (shift-out) 0x0E none invoke G1 into GL LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*) LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*) LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*) SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char ---------------------------------------------------------------------- (*) These are not used by any known coding system. Control characters for these functions are defined by macros ISO_CODE_XXX in `coding.h'. Designations are done by the following escape sequences: ---------------------------------------------------------------------- escape sequence description ---------------------------------------------------------------------- ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*) ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**) ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*) ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3 ---------------------------------------------------------------------- In this list, "DIMENSION1_CHARS94<F>" means a graphic character set of dimension 1, chars 94, and final character <F>, etc... Note (*): Although these designations are not allowed in ISO2022, Emacs accepts them on decoding, and produces them on encoding CHARS96 character sets in a coding system which is characterized as 7-bit environment, non-locking-shift, and non-single-shift. Note (**): If <F> is '@', 'A', or 'B', the intermediate character '(' can be omitted. We refer to this as "short-form" hereafter. Now you may notice that there are a lot of ways for encoding the same multilingual text in ISO2022. Actually, there exist many coding systems such as Compound Text (used in X11's inter client communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR (used in Korean internet), EUC (Extended UNIX Code, used in Asian localized platforms), and all of these are variants of ISO2022. In addition to the above, Emacs handles two more kinds of escape sequences: ISO6429's direction specification and Emacs' private sequence for specifying character composition. ISO6429's direction specification takes the following form: o CSI ']' -- end of the current direction o CSI '0' ']' -- end of the current direction o CSI '1' ']' -- start of left-to-right text o CSI '2' ']' -- start of right-to-left text The control character CSI (0x9B: control sequence introducer) is abbreviated to the escape sequence ESC '[' in a 7-bit environment. Character composition specification takes the following form: o ESC '0' -- start character composition o ESC '1' -- end character composition Since these are not standard escape sequences of any ISO standard, their use with these meanings is restricted to Emacs only. */ static void reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso) { int i; for (i = 0; i < 4; i++) { if (!NILP (coding_system)) iso->charset[i] = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i); else iso->charset[i] = Qt; iso->invalid_designated[i] = 0; } iso->esc = ISO_ESC_NOTHING; iso->esc_bytes_index = 0; iso->register_left = 0; iso->register_right = 1; iso->switched_dir_and_no_valid_charset_yet = 0; iso->invalid_switch_dir = 0; iso->output_direction_sequence = 0; iso->output_literally = 0; #ifdef ENABLE_COMPOSITE_CHARS if (iso->composite_chars) Dynarr_reset (iso->composite_chars); #endif } static int fit_to_be_escape_quoted (unsigned char c) { switch (c) { case ISO_CODE_ESC: case ISO_CODE_CSI: case ISO_CODE_SS2: case ISO_CODE_SS3: case ISO_CODE_SO: case ISO_CODE_SI: return 1; default: return 0; } } /* Parse one byte of an ISO2022 escape sequence. If the result is an invalid escape sequence, return 0 and do not change anything in STR. Otherwise, if the result is an incomplete escape sequence, update ISO2022.ESC and ISO2022.ESC_BYTES and return -1. Otherwise, update all the state variables (but not ISO2022.ESC_BYTES) and return 1. If CHECK_INVALID_CHARSETS is non-zero, check for designation or invocation of an invalid character set and treat that as an unrecognized escape sequence. ******************************************************************** #### Strategies for error annotation and coding orthogonalization We really want to separate out a number of things. Conceptually, there is a nested syntax. At the top level is the ISO 2022 extension syntax, including charset designation and invocation, and certain auxiliary controls such as the ISO 6429 direction specification. These are octet-oriented, with the single exception (AFAIK) of the "exit Unicode" sequence which uses the UTF's natural width (1 byte for UTF-7 and UTF-8, 2 bytes for UCS-2 and UTF-16, and 4 bytes for UCS-4 and UTF-32). This will be treated as a (deprecated) special case in Unicode processing. The middle layer is ISO 2022 character interpretation. This will depend on the current state of the ISO 2022 registers, and assembles octets into the character's internal representation. The lowest level is translating system control conventions. At present this is restricted to newline translation, but one could imagine doing tab conversion or line wrapping here. "Escape from Unicode" processing would be done at this level. At each level the parser will verify the syntax. In the case of a syntax error or warning (such as a redundant escape sequence that affects no characters), the parser will take some action, typically inserting the erroneous octets directly into the output and creating an annotation which can be used by higher level I/O to mark the affected region. This should make it possible to do something sensible about separating newline convention processing from character construction, and about preventing ISO 2022 escape sequences from being recognized inappropriately. The basic strategy will be to have octet classification tables, and switch processing according to the table entry. It's possible that, by doing the processing with tables of functions or the like, the parser can be used for both detection and translation. */ static int parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso, unsigned char c, unsigned int *flags, int check_invalid_charsets) { /* (1) If we're at the end of a designation sequence, CS is the charset being designated and REG is the register to designate it to. (2) If we're at the end of a locking-shift sequence, REG is the register to invoke and HALF (0 == left, 1 == right) is the half to invoke it into. (3) If we're at the end of a single-shift sequence, REG is the register to invoke. */ Lisp_Object cs = Qnil; int reg, half; /* NOTE: This code does goto's all over the fucking place. The reason for this is that we're basically implementing a state machine here, and hierarchical languages like C don't really provide a clean way of doing this. */ if (! (*flags & CODING_STATE_ESCAPE)) /* At beginning of escape sequence; we need to reset our escape-state variables. */ iso->esc = ISO_ESC_NOTHING; iso->output_literally = 0; iso->output_direction_sequence = 0; switch (iso->esc) { case ISO_ESC_NOTHING: iso->esc_bytes_index = 0; switch (c) { case ISO_CODE_ESC: /* Start escape sequence */ *flags |= CODING_STATE_ESCAPE; iso->esc = ISO_ESC; goto not_done; case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */ *flags |= CODING_STATE_ESCAPE; iso->esc = ISO_ESC_5_11; goto not_done; case ISO_CODE_SO: /* locking shift 1 */ reg = 1; half = 0; goto locking_shift; case ISO_CODE_SI: /* locking shift 0 */ reg = 0; half = 0; goto locking_shift; case ISO_CODE_SS2: /* single shift */ reg = 2; goto single_shift; case ISO_CODE_SS3: /* single shift */ reg = 3; goto single_shift; default: /* Other control characters */ return 0; } case ISO_ESC: switch (c) { /**** single shift ****/ case 'N': /* single shift 2 */ reg = 2; goto single_shift; case 'O': /* single shift 3 */ reg = 3; goto single_shift; /**** locking shift ****/ case '~': /* locking shift 1 right */ reg = 1; half = 1; goto locking_shift; case 'n': /* locking shift 2 */ reg = 2; half = 0; goto locking_shift; case '}': /* locking shift 2 right */ reg = 2; half = 1; goto locking_shift; case 'o': /* locking shift 3 */ reg = 3; half = 0; goto locking_shift; case '|': /* locking shift 3 right */ reg = 3; half = 1; goto locking_shift; #ifdef ENABLE_COMPOSITE_CHARS /**** composite ****/ case '0': iso->esc = ISO_ESC_START_COMPOSITE; *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_COMPOSITE; return 1; case '1': iso->esc = ISO_ESC_END_COMPOSITE; *flags = (*flags & CODING_STATE_ISO2022_LOCK) & ~CODING_STATE_COMPOSITE; return 1; #endif /* ENABLE_COMPOSITE_CHARS */ /**** directionality ****/ case '[': iso->esc = ISO_ESC_5_11; goto not_done; /**** designation ****/ case '$': /* multibyte charset prefix */ iso->esc = ISO_ESC_2_4; goto not_done; default: if (0x28 <= c && c <= 0x2F) { iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8); goto not_done; } /* This function is called with CODESYS equal to nil when doing coding-system detection. */ if (!NILP (codesys) && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys) && fit_to_be_escape_quoted (c)) { iso->esc = ISO_ESC_LITERAL; *flags &= CODING_STATE_ISO2022_LOCK; return 1; } /* bzzzt! */ return 0; } /**** directionality ****/ case ISO_ESC_5_11: /* ISO6429 direction control */ if (c == ']') { *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L); goto directionality; } if (c == '0') iso->esc = ISO_ESC_5_11_0; else if (c == '1') iso->esc = ISO_ESC_5_11_1; else if (c == '2') iso->esc = ISO_ESC_5_11_2; else return 0; goto not_done; case ISO_ESC_5_11_0: if (c == ']') { *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L); goto directionality; } return 0; case ISO_ESC_5_11_1: if (c == ']') { *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L); goto directionality; } return 0; case ISO_ESC_5_11_2: if (c == ']') { *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L; goto directionality; } return 0; directionality: iso->esc = ISO_ESC_DIRECTIONALITY; /* Various junk here to attempt to preserve the direction sequences literally in the text if they would otherwise be swallowed due to invalid designations that don't show up as actual charset changes in the text. */ if (iso->invalid_switch_dir) { /* We already inserted a direction switch literally into the text. We assume (#### this may not be right) that the next direction switch is the one going the other way, and we need to output that literally as well. */ iso->output_literally = 1; iso->invalid_switch_dir = 0; } else { int jj; /* If we are in the thrall of an invalid designation, then stick the directionality sequence literally into the output stream so it ends up in the original text again. */ for (jj = 0; jj < 4; jj++) if (iso->invalid_designated[jj]) break; if (jj < 4) { iso->output_literally = 1; iso->invalid_switch_dir = 1; } else /* Indicate that we haven't yet seen a valid designation, so that if a switch-dir is directly followed by an invalid designation, both get inserted literally. */ iso->switched_dir_and_no_valid_charset_yet = 1; } return 1; /**** designation ****/ case ISO_ESC_2_4: if (0x28 <= c && c <= 0x2F) { iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8); goto not_done; } if (0x40 <= c && c <= 0x42) { cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c, *flags & CODING_STATE_R2L ? CHARSET_RIGHT_TO_LEFT : CHARSET_LEFT_TO_RIGHT); reg = 0; goto designated; } return 0; default: { int type =-1; if (c < '0' || c > '~') return 0; /* bad final byte */ if (iso->esc >= ISO_ESC_2_8 && iso->esc <= ISO_ESC_2_15) { type = ((iso->esc >= ISO_ESC_2_12) ? CHARSET_TYPE_96 : CHARSET_TYPE_94); reg = (iso->esc - ISO_ESC_2_8) & 3; } else if (iso->esc >= ISO_ESC_2_4_8 && iso->esc <= ISO_ESC_2_4_15) { type = ((iso->esc >= ISO_ESC_2_4_12) ? CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94); reg = (iso->esc - ISO_ESC_2_4_8) & 3; } else { /* Can this ever be reached? -slb */ abort(); return 0; } cs = CHARSET_BY_ATTRIBUTES (type, c, *flags & CODING_STATE_R2L ? CHARSET_RIGHT_TO_LEFT : CHARSET_LEFT_TO_RIGHT); goto designated; } } not_done: iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c; return -1; single_shift: if (check_invalid_charsets && !CHARSETP (iso->charset[reg])) /* can't invoke something that ain't there. */ return 0; iso->esc = ISO_ESC_SINGLE_SHIFT; *flags &= CODING_STATE_ISO2022_LOCK; if (reg == 2) *flags |= CODING_STATE_SS2; else *flags |= CODING_STATE_SS3; return 1; locking_shift: if (check_invalid_charsets && !CHARSETP (iso->charset[reg])) /* can't invoke something that ain't there. */ return 0; if (half) iso->register_right = reg; else iso->register_left = reg; *flags &= CODING_STATE_ISO2022_LOCK; iso->esc = ISO_ESC_LOCKING_SHIFT; return 1; designated: if (NILP (cs) && check_invalid_charsets) { iso->invalid_designated[reg] = 1; iso->charset[reg] = Vcharset_ascii; iso->esc = ISO_ESC_DESIGNATE; *flags &= CODING_STATE_ISO2022_LOCK; iso->output_literally = 1; if (iso->switched_dir_and_no_valid_charset_yet) { /* We encountered a switch-direction followed by an invalid designation. Ensure that the switch-direction gets outputted; otherwise it will probably get eaten when the text is written out again. */ iso->switched_dir_and_no_valid_charset_yet = 0; iso->output_direction_sequence = 1; /* And make sure that the switch-dir going the other way gets outputted, as well. */ iso->invalid_switch_dir = 1; } return 1; } /* This function is called with CODESYS equal to nil when doing coding-system detection. */ if (!NILP (codesys)) { charset_conversion_spec_dynarr *dyn = XCODING_SYSTEM (codesys)->iso2022.input_conv; if (dyn) { int i; for (i = 0; i < Dynarr_length (dyn); i++) { struct charset_conversion_spec *spec = Dynarr_atp (dyn, i); if (EQ (cs, spec->from_charset)) cs = spec->to_charset; } } } iso->charset[reg] = cs; iso->esc = ISO_ESC_DESIGNATE; *flags &= CODING_STATE_ISO2022_LOCK; if (iso->invalid_designated[reg]) { iso->invalid_designated[reg] = 0; iso->output_literally = 1; } if (iso->switched_dir_and_no_valid_charset_yet) iso->switched_dir_and_no_valid_charset_yet = 0; return 1; } static int detect_coding_iso2022 (struct detection_state *st, const Extbyte *src, Bytecount n) { int mask; /* #### There are serious deficiencies in the recognition mechanism here. This needs to be much smarter if it's going to cut it. The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while it should be detected as Latin-1. All the ISO2022 stuff in this file should be synced up with the code from FSF Emacs-20.4, in which Mule should be more or less stable. Perhaps we should wait till R2L works in FSF Emacs? */ if (!st->iso2022.initted) { reset_iso2022 (Qnil, &st->iso2022.iso); st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK | CODING_CATEGORY_ISO_8_DESIGNATE_MASK | CODING_CATEGORY_ISO_8_1_MASK | CODING_CATEGORY_ISO_8_2_MASK | CODING_CATEGORY_ISO_LOCK_SHIFT_MASK); st->iso2022.flags = 0; st->iso2022.high_byte_count = 0; st->iso2022.saw_single_shift = 0; st->iso2022.initted = 1; } mask = st->iso2022.mask; while (n--) { unsigned char c = *(unsigned char *)src++; if (c >= 0xA0) { mask &= ~CODING_CATEGORY_ISO_7_MASK; st->iso2022.high_byte_count++; } else { if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift) { if (st->iso2022.high_byte_count & 1) /* odd number of high bytes; assume not iso-8-2 */ mask &= ~CODING_CATEGORY_ISO_8_2_MASK; } st->iso2022.high_byte_count = 0; st->iso2022.saw_single_shift = 0; if (c > 0x80) mask &= ~CODING_CATEGORY_ISO_7_MASK; } if (!(st->iso2022.flags & CODING_STATE_ESCAPE) && (BYTE_C0_P (c) || BYTE_C1_P (c))) { /* control chars */ switch (c) { /* Allow and ignore control characters that you might reasonably see in a text file */ case '\r': case '\n': case '\t': case 7: /* bell */ case 8: /* backspace */ case 11: /* vertical tab */ case 12: /* form feed */ case 26: /* MS-DOS C-z junk */ case 31: /* '^_' -- for info */ goto label_continue_loop; default: break; } } if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c) || BYTE_C1_P (c)) { if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c, &st->iso2022.flags, 0)) { switch (st->iso2022.iso.esc) { case ISO_ESC_DESIGNATE: mask &= ~CODING_CATEGORY_ISO_8_1_MASK; mask &= ~CODING_CATEGORY_ISO_8_2_MASK; break; case ISO_ESC_LOCKING_SHIFT: mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK; goto ran_out_of_chars; case ISO_ESC_SINGLE_SHIFT: mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK; st->iso2022.saw_single_shift = 1; break; default: break; } } else { mask = 0; goto ran_out_of_chars; } } label_continue_loop:; } ran_out_of_chars: return mask; } static int postprocess_iso2022_mask (int mask) { /* #### kind of cheesy */ /* If seven-bit ISO is allowed, then assume that the encoding is entirely seven-bit and turn off the eight-bit ones. */ if (mask & CODING_CATEGORY_ISO_7_MASK) mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK | CODING_CATEGORY_ISO_8_1_MASK | CODING_CATEGORY_ISO_8_2_MASK); return mask; } /* If FLAGS is a null pointer or specifies right-to-left motion, output a switch-dir-to-left-to-right sequence to DST. Also update FLAGS if it is not a null pointer. If INTERNAL_P is set, we are outputting in internal format and need to handle the CSI differently. */ static void restore_left_to_right_direction (Lisp_Coding_System *codesys, unsigned_char_dynarr *dst, unsigned int *flags, int internal_p) { if (!flags || (*flags & CODING_STATE_R2L)) { if (CODING_SYSTEM_ISO2022_SEVEN (codesys)) { Dynarr_add (dst, ISO_CODE_ESC); Dynarr_add (dst, '['); } else if (internal_p) DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst); else Dynarr_add (dst, ISO_CODE_CSI); Dynarr_add (dst, '0'); Dynarr_add (dst, ']'); if (flags) *flags &= ~CODING_STATE_R2L; } } /* If FLAGS is a null pointer or specifies a direction different from DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape sequence to DST. Also update FLAGS if it is not a null pointer. If INTERNAL_P is set, we are outputting in internal format and need to handle the CSI differently. */ static void ensure_correct_direction (int direction, Lisp_Coding_System *codesys, unsigned_char_dynarr *dst, unsigned int *flags, int internal_p) { if ((!flags || (*flags & CODING_STATE_R2L)) && direction == CHARSET_LEFT_TO_RIGHT) restore_left_to_right_direction (codesys, dst, flags, internal_p); else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys) && (!flags || !(*flags & CODING_STATE_R2L)) && direction == CHARSET_RIGHT_TO_LEFT) { if (CODING_SYSTEM_ISO2022_SEVEN (codesys)) { Dynarr_add (dst, ISO_CODE_ESC); Dynarr_add (dst, '['); } else if (internal_p) DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst); else Dynarr_add (dst, ISO_CODE_CSI); Dynarr_add (dst, '2'); Dynarr_add (dst, ']'); if (flags) *flags |= CODING_STATE_R2L; } } /* Convert ISO2022-format data to internal format. */ static void decode_coding_iso2022 (Lstream *decoding, const Extbyte *src, unsigned_char_dynarr *dst, Bytecount n) { struct decoding_stream *str = DECODING_STREAM_DATA (decoding); unsigned int flags = str->flags; unsigned int ch = str->ch; eol_type_t eol_type = str->eol_type; #ifdef ENABLE_COMPOSITE_CHARS unsigned_char_dynarr *real_dst = dst; #endif Lisp_Object coding_system; XSETCODING_SYSTEM (coding_system, str->codesys); #ifdef ENABLE_COMPOSITE_CHARS if (flags & CODING_STATE_COMPOSITE) dst = str->iso2022.composite_chars; #endif /* ENABLE_COMPOSITE_CHARS */ while (n--) { unsigned char c = *(unsigned char *)src++; if (flags & CODING_STATE_ESCAPE) { /* Within ESC sequence */ int retval = parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1); if (retval) { switch (str->iso2022.esc) { #ifdef ENABLE_COMPOSITE_CHARS case ISO_ESC_START_COMPOSITE: if (str->iso2022.composite_chars) Dynarr_reset (str->iso2022.composite_chars); else str->iso2022.composite_chars = Dynarr_new (unsigned_char); dst = str->iso2022.composite_chars; break; case ISO_ESC_END_COMPOSITE: { Intbyte comstr[MAX_EMCHAR_LEN]; Bytecount len; Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0), Dynarr_length (dst)); dst = real_dst; len = set_charptr_emchar (comstr, emch); Dynarr_add_many (dst, comstr, len); break; } #endif /* ENABLE_COMPOSITE_CHARS */ case ISO_ESC_LITERAL: DECODE_ADD_BINARY_CHAR (c, dst); break; default: /* Everything else handled already */ break; } } /* Attempted error recovery. */ if (str->iso2022.output_direction_sequence) ensure_correct_direction (flags & CODING_STATE_R2L ? CHARSET_RIGHT_TO_LEFT : CHARSET_LEFT_TO_RIGHT, str->codesys, dst, 0, 1); /* More error recovery. */ if (!retval || str->iso2022.output_literally) { /* Output the (possibly invalid) sequence */ int i; for (i = 0; i < str->iso2022.esc_bytes_index; i++) DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst); flags &= CODING_STATE_ISO2022_LOCK; if (!retval) n++, src--;/* Repeat the loop with the same character. */ else { /* No sense in reprocessing the final byte of the escape sequence; it could mess things up anyway. Just add it now. */ DECODE_ADD_BINARY_CHAR (c, dst); } } ch = 0; } else if (BYTE_C0_P (c) || BYTE_C1_P (c)) { /* Control characters */ /***** Error-handling *****/ /* If we were in the middle of a character, dump out the partial character. */ DECODE_OUTPUT_PARTIAL_CHAR (ch); /* If we just saw a single-shift character, dump it out. This may dump out the wrong sort of single-shift character, but least it will give an indication that something went wrong. */ if (flags & CODING_STATE_SS2) { DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst); flags &= ~CODING_STATE_SS2; } if (flags & CODING_STATE_SS3) { DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst); flags &= ~CODING_STATE_SS3; } /***** Now handle the control characters. *****/ /* Handle CR/LF */ DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst); flags &= CODING_STATE_ISO2022_LOCK; if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1)) DECODE_ADD_BINARY_CHAR (c, dst); } else { /* Graphic characters */ Lisp_Object charset; int lb; int reg; DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst); /* Now determine the charset. */ reg = ((flags & CODING_STATE_SS2) ? 2 : (flags & CODING_STATE_SS3) ? 3 : !BYTE_ASCII_P (c) ? str->iso2022.register_right : str->iso2022.register_left); charset = str->iso2022.charset[reg]; /* Error checking: */ if (! CHARSETP (charset) || str->iso2022.invalid_designated[reg] || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL) && XCHARSET_CHARS (charset) == 94)) /* Mrmph. We are trying to invoke a register that has no or an invalid charset in it, or trying to add a character outside the range of the charset. Insert that char literally to preserve it for the output. */ { DECODE_OUTPUT_PARTIAL_CHAR (ch); DECODE_ADD_BINARY_CHAR (c, dst); } else { /* Things are probably hunky-dorey. */ /* Fetch reverse charset, maybe. */ if (((flags & CODING_STATE_R2L) && XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT) || (!(flags & CODING_STATE_R2L) && XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT)) { Lisp_Object new_charset = XCHARSET_REVERSE_DIRECTION_CHARSET (charset); if (!NILP (new_charset)) charset = new_charset; } lb = XCHARSET_LEADING_BYTE (charset); switch (XCHARSET_REP_BYTES (charset)) { case 1: /* ASCII */ DECODE_OUTPUT_PARTIAL_CHAR (ch); Dynarr_add (dst, c & 0x7F); break; case 2: /* one-byte official */ DECODE_OUTPUT_PARTIAL_CHAR (ch); Dynarr_add (dst, lb); Dynarr_add (dst, c | 0x80); break; case 3: /* one-byte private or two-byte official */ if (XCHARSET_PRIVATE_P (charset)) { DECODE_OUTPUT_PARTIAL_CHAR (ch); Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1); Dynarr_add (dst, lb); Dynarr_add (dst, c | 0x80); } else { if (ch) { Dynarr_add (dst, lb); Dynarr_add (dst, ch | 0x80); Dynarr_add (dst, c | 0x80); ch = 0; } else ch = c; } break; default: /* two-byte private */ if (ch) { Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2); Dynarr_add (dst, lb); Dynarr_add (dst, ch | 0x80); Dynarr_add (dst, c | 0x80); ch = 0; } else ch = c; } } if (!ch) flags &= CODING_STATE_ISO2022_LOCK; } label_continue_loop:; } if (flags & CODING_STATE_END) DECODE_OUTPUT_PARTIAL_CHAR (ch); str->flags = flags; str->ch = ch; } /***** ISO2022 encoder *****/ /* Designate CHARSET into register REG. */ static void iso2022_designate (Lisp_Object charset, unsigned char reg, struct encoding_stream *str, unsigned_char_dynarr *dst) { static const char inter94[] = "()*+"; static const char inter96[] = ",-./"; int type; unsigned char final; Lisp_Object old_charset = str->iso2022.charset[reg]; str->iso2022.charset[reg] = charset; if (!CHARSETP (charset)) /* charset might be an initial nil or t. */ return; type = XCHARSET_TYPE (charset); final = XCHARSET_FINAL (charset); if (!str->iso2022.force_charset_on_output[reg] && CHARSETP (old_charset) && XCHARSET_TYPE (old_charset) == type && XCHARSET_FINAL (old_charset) == final) return; str->iso2022.force_charset_on_output[reg] = 0; { charset_conversion_spec_dynarr *dyn = str->codesys->iso2022.output_conv; if (dyn) { int i; for (i = 0; i < Dynarr_length (dyn); i++) { struct charset_conversion_spec *spec = Dynarr_atp (dyn, i); if (EQ (charset, spec->from_charset)) charset = spec->to_charset; } } } Dynarr_add (dst, ISO_CODE_ESC); switch (type) { case CHARSET_TYPE_94: Dynarr_add (dst, inter94[reg]); break; case CHARSET_TYPE_96: Dynarr_add (dst, inter96[reg]); break; case CHARSET_TYPE_94X94: Dynarr_add (dst, '$'); if (reg != 0 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys)) || final < '@' || final > 'B') Dynarr_add (dst, inter94[reg]); break; case CHARSET_TYPE_96X96: Dynarr_add (dst, '$'); Dynarr_add (dst, inter96[reg]); break; } Dynarr_add (dst, final); } static void ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst) { if (str->iso2022.register_left != 0) { Dynarr_add (dst, ISO_CODE_SI); str->iso2022.register_left = 0; } } static void ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst) { if (str->iso2022.register_left != 1) { Dynarr_add (dst, ISO_CODE_SO); str->iso2022.register_left = 1; } } /* Convert internally-formatted data to ISO2022 format. */ static void encode_coding_iso2022 (Lstream *encoding, const Intbyte *src, unsigned_char_dynarr *dst, Bytecount n) { unsigned char charmask, c; unsigned char char_boundary; struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); unsigned int flags = str->flags; unsigned int ch = str->ch; Lisp_Coding_System *codesys = str->codesys; eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); int i; Lisp_Object charset; int half; #ifdef ENABLE_COMPOSITE_CHARS /* flags for handling composite chars. We do a little switcharoo on the source while we're outputting the composite char. */ Bytecount saved_n = 0; const unsigned char *saved_src = NULL; int in_composite = 0; #endif /* ENABLE_COMPOSITE_CHARS */ char_boundary = str->iso2022.current_char_boundary; charset = str->iso2022.current_charset; half = str->iso2022.current_half; #ifdef ENABLE_COMPOSITE_CHARS back_to_square_n: #endif while (n--) { c = *src++; if (BYTE_ASCII_P (c)) { /* Processing ASCII character */ ch = 0; restore_left_to_right_direction (codesys, dst, &flags, 0); /* Make sure G0 contains ASCII */ if ((c > ' ' && c < ISO_CODE_DEL) || !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys)) { ensure_normal_shift (str, dst); iso2022_designate (Vcharset_ascii, 0, str, dst); } /* If necessary, restore everything to the default state at end-of-line */ if (c == '\n' && !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys))) { restore_left_to_right_direction (codesys, dst, &flags, 0); ensure_normal_shift (str, dst); for (i = 0; i < 4; i++) { Lisp_Object initial_charset = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i); iso2022_designate (initial_charset, i, str, dst); } } if (c == '\n') { if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT) Dynarr_add (dst, '\r'); if (eol_type != EOL_CR) Dynarr_add (dst, c); } else { if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys) && fit_to_be_escape_quoted (c)) Dynarr_add (dst, ISO_CODE_ESC); Dynarr_add (dst, c); } char_boundary = 1; } else if (INTBYTE_LEADING_BYTE_P (c) || INTBYTE_LEADING_BYTE_P (ch)) { /* Processing Leading Byte */ ch = 0; charset = CHARSET_BY_LEADING_BYTE (c); if (LEADING_BYTE_PREFIX_P(c)) ch = c; else if (!EQ (charset, Vcharset_control_1) #ifdef ENABLE_COMPOSITE_CHARS && !EQ (charset, Vcharset_composite) #endif ) { int reg; ensure_correct_direction (XCHARSET_DIRECTION (charset), codesys, dst, &flags, 0); /* Now determine which register to use. */ reg = -1; for (i = 0; i < 4; i++) { if (EQ (charset, str->iso2022.charset[i]) || EQ (charset, CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))) { reg = i; break; } } if (reg == -1) { if (XCHARSET_GRAPHIC (charset) != 0) { if (!NILP (str->iso2022.charset[1]) && (!CODING_SYSTEM_ISO2022_SEVEN (codesys) || CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys))) reg = 1; else if (!NILP (str->iso2022.charset[2])) reg = 2; else if (!NILP (str->iso2022.charset[3])) reg = 3; else reg = 0; } else reg = 0; } iso2022_designate (charset, reg, str, dst); /* Now invoke that register. */ switch (reg) { case 0: ensure_normal_shift (str, dst); half = 0; break; case 1: if (CODING_SYSTEM_ISO2022_SEVEN (codesys)) { ensure_shift_out (str, dst); half = 0; } else half = 1; break; case 2: if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys)) { Dynarr_add (dst, ISO_CODE_ESC); Dynarr_add (dst, 'N'); half = 0; } else { Dynarr_add (dst, ISO_CODE_SS2); half = 1; } break; case 3: if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys)) { Dynarr_add (dst, ISO_CODE_ESC); Dynarr_add (dst, 'O'); half = 0; } else { Dynarr_add (dst, ISO_CODE_SS3); half = 1; } break; default: abort (); } } char_boundary = 0; } else { /* Processing Non-ASCII character */ charmask = (half == 0 ? 0x7F : 0xFF); char_boundary = 1; if (EQ (charset, Vcharset_control_1)) { if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys) && fit_to_be_escape_quoted (c)) Dynarr_add (dst, ISO_CODE_ESC); /* you asked for it ... */ Dynarr_add (dst, c - 0x20); } else { switch (XCHARSET_REP_BYTES (charset)) { case 2: Dynarr_add (dst, c & charmask); break; case 3: if (XCHARSET_PRIVATE_P (charset)) { Dynarr_add (dst, c & charmask); ch = 0; } else if (ch) { #ifdef ENABLE_COMPOSITE_CHARS if (EQ (charset, Vcharset_composite)) { if (in_composite) { /* #### Bother! We don't know how to handle this yet. */ Dynarr_add (dst, '~'); } else { Emchar emch = MAKE_CHAR (Vcharset_composite, ch & 0x7F, c & 0x7F); Lisp_Object lstr = composite_char_string (emch); saved_n = n; saved_src = src; in_composite = 1; src = XSTRING_DATA (lstr); n = XSTRING_LENGTH (lstr); Dynarr_add (dst, ISO_CODE_ESC); Dynarr_add (dst, '0'); /* start composing */ } } else #endif /* ENABLE_COMPOSITE_CHARS */ { Dynarr_add (dst, ch & charmask); Dynarr_add (dst, c & charmask); } ch = 0; } else { ch = c; char_boundary = 0; } break; case 4: if (ch) { Dynarr_add (dst, ch & charmask); Dynarr_add (dst, c & charmask); ch = 0; } else { ch = c; char_boundary = 0; } break; default: abort (); } } } } #ifdef ENABLE_COMPOSITE_CHARS if (in_composite) { n = saved_n; src = saved_src; in_composite = 0; Dynarr_add (dst, ISO_CODE_ESC); Dynarr_add (dst, '1'); /* end composing */ goto back_to_square_n; /* Wheeeeeeeee ..... */ } #endif /* ENABLE_COMPOSITE_CHARS */ if (char_boundary && flags & CODING_STATE_END) { restore_left_to_right_direction (codesys, dst, &flags, 0); ensure_normal_shift (str, dst); for (i = 0; i < 4; i++) { Lisp_Object initial_charset = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i); iso2022_designate (initial_charset, i, str, dst); } } str->flags = flags; str->ch = ch; str->iso2022.current_char_boundary = char_boundary; str->iso2022.current_charset = charset; str->iso2022.current_half = half; /* Verbum caro factum est! */ } #endif /* MULE */ /************************************************************************/ /* No-conversion methods */ /************************************************************************/ /* This is used when reading in "binary" files -- i.e. files that may contain all 256 possible byte values and that are not to be interpreted as being in any particular decoding. */ static void decode_coding_no_conversion (Lstream *decoding, const Extbyte *src, unsigned_char_dynarr *dst, Bytecount n) { struct decoding_stream *str = DECODING_STREAM_DATA (decoding); unsigned int flags = str->flags; unsigned int ch = str->ch; eol_type_t eol_type = str->eol_type; while (n--) { unsigned char c = *(unsigned char *)src++; DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst); DECODE_ADD_BINARY_CHAR (c, dst); label_continue_loop:; } DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst); str->flags = flags; str->ch = ch; } static void encode_coding_no_conversion (Lstream *encoding, const Intbyte *src, unsigned_char_dynarr *dst, Bytecount n) { unsigned char c; struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); unsigned int flags = str->flags; unsigned int ch = str->ch; eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); while (n--) { c = *src++; if (c == '\n') { if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT) Dynarr_add (dst, '\r'); if (eol_type != EOL_CR) Dynarr_add (dst, '\n'); ch = 0; } else if (BYTE_ASCII_P (c)) { assert (ch == 0); Dynarr_add (dst, c); } else if (INTBYTE_LEADING_BYTE_P (c)) { assert (ch == 0); if (c == LEADING_BYTE_LATIN_ISO8859_1 || c == LEADING_BYTE_CONTROL_1) ch = c; else Dynarr_add (dst, '~'); /* untranslatable character */ } else { if (ch == LEADING_BYTE_LATIN_ISO8859_1) Dynarr_add (dst, c); else if (ch == LEADING_BYTE_CONTROL_1) { assert (c < 0xC0); Dynarr_add (dst, c - 0x20); } /* else it should be the second or third byte of an untranslatable character, so ignore it */ ch = 0; } } str->flags = flags; str->ch = ch; } /************************************************************************/ /* Initialization */ /************************************************************************/ void syms_of_file_coding (void) { INIT_LRECORD_IMPLEMENTATION (coding_system); DEFSUBR (Fcoding_system_p); DEFSUBR (Ffind_coding_system); DEFSUBR (Fget_coding_system); DEFSUBR (Fcoding_system_list); DEFSUBR (Fcoding_system_name); DEFSUBR (Fmake_coding_system); DEFSUBR (Fcopy_coding_system); DEFSUBR (Fcoding_system_canonical_name_p); DEFSUBR (Fcoding_system_alias_p); DEFSUBR (Fcoding_system_aliasee); DEFSUBR (Fdefine_coding_system_alias); DEFSUBR (Fsubsidiary_coding_system); DEFSUBR (Fcoding_system_type); DEFSUBR (Fcoding_system_doc_string); DEFSUBR (Fcoding_system_property); DEFSUBR (Fcoding_category_list); DEFSUBR (Fset_coding_priority_list); DEFSUBR (Fcoding_priority_list); DEFSUBR (Fset_coding_category_system); DEFSUBR (Fcoding_category_system); DEFSUBR (Fdetect_coding_region); DEFSUBR (Fdecode_coding_region); DEFSUBR (Fencode_coding_region); #ifdef MULE DEFSUBR (Fdecode_shift_jis_char); DEFSUBR (Fencode_shift_jis_char); DEFSUBR (Fdecode_big5_char); DEFSUBR (Fencode_big5_char); DEFSUBR (Fset_ucs_char); DEFSUBR (Fucs_char); DEFSUBR (Fset_char_ucs); DEFSUBR (Fchar_ucs); #endif /* MULE */ DEFSYMBOL_MULTIWORD_PREDICATE (Qcoding_systemp); DEFSYMBOL (Qno_conversion); DEFSYMBOL (Qraw_text); #ifdef MULE DEFSYMBOL (Qbig5); DEFSYMBOL (Qshift_jis); defsymbol (&Qucs4, "ucs-4"); defsymbol (&Qutf8, "utf-8"); DEFSYMBOL (Qccl); DEFSYMBOL (Qiso2022); #endif /* MULE */ DEFSYMBOL (Qmnemonic); DEFSYMBOL (Qeol_type); DEFSYMBOL (Qpost_read_conversion); DEFSYMBOL (Qpre_write_conversion); DEFSYMBOL (Qcr); DEFSYMBOL (Qlf); DEFSYMBOL (Qcrlf); DEFSYMBOL (Qeol_cr); DEFSYMBOL (Qeol_lf); DEFSYMBOL (Qeol_crlf); #ifdef MULE DEFSYMBOL (Qcharset_g0); DEFSYMBOL (Qcharset_g1); DEFSYMBOL (Qcharset_g2); DEFSYMBOL (Qcharset_g3); DEFSYMBOL (Qforce_g0_on_output); DEFSYMBOL (Qforce_g1_on_output); DEFSYMBOL (Qforce_g2_on_output); DEFSYMBOL (Qforce_g3_on_output); DEFSYMBOL (Qno_iso6429); DEFSYMBOL (Qinput_charset_conversion); DEFSYMBOL (Qoutput_charset_conversion); DEFSYMBOL (Qshort); DEFSYMBOL (Qno_ascii_eol); DEFSYMBOL (Qno_ascii_cntl); DEFSYMBOL (Qseven); DEFSYMBOL (Qlock_shift); DEFSYMBOL (Qescape_quoted); #endif /* MULE */ DEFSYMBOL (Qencode); DEFSYMBOL (Qdecode); #ifdef MULE defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS], "shift-jis"); defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5], "big5"); defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4], "ucs-4"); defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8], "utf-8"); defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7], "iso-7"); defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE], "iso-8-designate"); defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1], "iso-8-1"); defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2], "iso-8-2"); defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT], "iso-lock-shift"); #endif /* MULE */ defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION], "no-conversion"); } void lstream_type_create_file_coding (void) { LSTREAM_HAS_METHOD (decoding, reader); LSTREAM_HAS_METHOD (decoding, writer); LSTREAM_HAS_METHOD (decoding, rewinder); LSTREAM_HAS_METHOD (decoding, seekable_p); LSTREAM_HAS_METHOD (decoding, flusher); LSTREAM_HAS_METHOD (decoding, closer); LSTREAM_HAS_METHOD (decoding, marker); LSTREAM_HAS_METHOD (encoding, reader); LSTREAM_HAS_METHOD (encoding, writer); LSTREAM_HAS_METHOD (encoding, rewinder); LSTREAM_HAS_METHOD (encoding, seekable_p); LSTREAM_HAS_METHOD (encoding, flusher); LSTREAM_HAS_METHOD (encoding, closer); LSTREAM_HAS_METHOD (encoding, marker); } void vars_of_file_coding (void) { int i; fcd = xnew (struct file_coding_dump); dump_add_root_struct_ptr (&fcd, &fcd_description); /* Initialize to something reasonable ... */ for (i = 0; i < CODING_CATEGORY_LAST; i++) { fcd->coding_category_system[i] = Qnil; fcd->coding_category_by_priority[i] = i; } Fprovide (intern ("file-coding")); DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /* Coding system used for TTY keyboard input. Not used under a windowing system. */ ); Vkeyboard_coding_system = Qnil; DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /* Coding system used for TTY display output. Not used under a windowing system. */ ); Vterminal_coding_system = Qnil; DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /* Overriding coding system used when reading from a file or process. You should bind this variable with `let', but do not set it globally. If this is non-nil, it specifies the coding system that will be used to decode input on read operations, such as from a file or process. It overrides `buffer-file-coding-system-for-read', `insert-file-contents-pre-hook', etc. Use those variables instead of this one for permanent changes to the environment. */ ); Vcoding_system_for_read = Qnil; DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write /* Overriding coding system used when writing to a file or process. You should bind this variable with `let', but do not set it globally. If this is non-nil, it specifies the coding system that will be used to encode output for write operations, such as to a file or process. It overrides `buffer-file-coding-system', `write-region-pre-hook', etc. Use those variables instead of this one for permanent changes to the environment. */ ); Vcoding_system_for_write = Qnil; DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /* Coding system used to convert pathnames when accessing files. */ ); Vfile_name_coding_system = Qnil; DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /* Non-nil means the buffer contents are regarded as multi-byte form of characters, not a binary code. This affects the display, file I/O, and behaviors of various editing commands. Setting this to nil does not do anything. */ ); enable_multibyte_characters = 1; } void complex_vars_of_file_coding (void) { staticpro (&Vcoding_system_hash_table); Vcoding_system_hash_table = make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ); the_codesys_prop_dynarr = Dynarr_new (codesys_prop); dump_add_root_struct_ptr (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description); #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \ { \ struct codesys_prop csp; \ csp.sym = (Sym); \ csp.prop_type = (Prop_Type); \ Dynarr_add (the_codesys_prop_dynarr, csp); \ } while (0) DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic); DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type); DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr); DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf); DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf); DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion); DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion); #ifdef MULE DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion); DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode); DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode); #endif /* MULE */ /* Need to create this here or we're really screwed. */ Fmake_coding_system (Qraw_text, Qno_conversion, build_string ("Raw text, which means it converts only line-break-codes."), list2 (Qmnemonic, build_string ("Raw"))); Fmake_coding_system (Qbinary, Qno_conversion, build_string ("Binary, which means it does not convert anything."), list4 (Qeol_type, Qlf, Qmnemonic, build_string ("Binary"))); Fdefine_coding_system_alias (Qno_conversion, Qraw_text); Fdefine_coding_system_alias (Qfile_name, Qbinary); Fdefine_coding_system_alias (Qterminal, Qbinary); Fdefine_coding_system_alias (Qkeyboard, Qbinary); /* Need this for bootstrapping */ fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] = Fget_coding_system (Qraw_text); #ifdef MULE { int i; for (i = 0; i < countof (fcd->ucs_to_mule_table); i++) fcd->ucs_to_mule_table[i] = Qnil; } staticpro (&mule_to_ucs_table); mule_to_ucs_table = Fmake_char_table (Qgeneric); #endif /* MULE */ }