Mercurial > hg > xemacs-beta
view src/mule-coding.c @ 241:f955c73f5258 r20-5b19
Import from CVS: tag r20-5b19
author | cvs |
---|---|
date | Mon, 13 Aug 2007 10:16:16 +0200 |
parents | 78f53ef88e17 |
children | f220cc83d72e |
line wrap: on
line source
/* Code conversion functions.
   Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   Copyright (C) 1995 Sun Microsystems, Inc.

This file is part of XEmacs.

XEmacs is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2, or (at your option) any
later version.

XEmacs is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with XEmacs; see the file COPYING. If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */

/* Synched up with: Mule 2.3. Not in FSF. */

/* Rewritten by Ben Wing <wing@666.com>. */

#include <config.h>
#include "lisp.h"

#include "buffer.h"
#include "elhash.h"
#include "insdel.h"
#include "lstream.h"
#include "mule-ccl.h"
#include "mule-coding.h"

Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;

Lisp_Object Vkeyboard_coding_system;
Lisp_Object Vterminal_coding_system;
Lisp_Object Vcoding_system_for_read;
Lisp_Object Vcoding_system_for_write;
Lisp_Object Vfile_name_coding_system;

/* Table of symbols identifying each coding category. */
Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];

/* Coding system currently associated with each coding category. */
Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];

/* Table of all coding categories in decreasing order of priority.
   This describes a permutation of the possible coding categories. */
int coding_category_by_priority[CODING_CATEGORY_LAST + 1];

Lisp_Object Qcoding_system_p;

/* Symbols accepted as the TYPE argument of `make-coding-system'. */
Lisp_Object Qbig5, Qshift_jis, Qno_conversion, Qccl, Qiso2022;
/* Qinternal in general.c */

/* Property keys understood by every coding system type. */
Lisp_Object Qmnemonic, Qeol_type;
Lisp_Object Qpost_read_conversion;
Lisp_Object Qpre_write_conversion;

/* Values of the `eol-type' property. */
Lisp_Object Qcr, Qcrlf, Qlf;

/* Property names used to retrieve the end-of-line subsidiaries. */
Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;

/* Property keys that are specific to ISO2022 coding systems. */
Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
Lisp_Object Qno_iso6429, Qescape_quoted;
Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;

/* Property keys that are specific to CCL coding systems. */
Lisp_Object Qencode, Qdecode;

Lisp_Object Qctext;

/* Hash table with coding system names as keys and the coding system
   objects as values. */
Lisp_Object Vcoding_system_hashtable;

int enable_multibyte_characters;

/* Additional information used by the ISO2022 decoder and detector. */
struct iso2022_decoder
{
  /* CHARSET holds the character sets currently assigned to the G0
     through G3 variables. It is initialized from the array
     INITIAL_CHARSET in CODESYS. */
  Lisp_Object charset[4];

  /* Which registers are currently invoked into the left (GL) and
     right (GR) halves of the 8-bit encoding space? */
  int register_left, register_right;

  /* ISO_ESC holds a value indicating part of an escape sequence
     that has already been seen. */
  enum iso_esc_flag esc;

  /* This records the bytes we've seen so far in an escape sequence,
     in case the sequence is invalid (we spit out the bytes unchanged). */
  unsigned char esc_bytes[8];

  /* Index for next byte to store in ISO escape sequence. */
  int esc_bytes_index;

  /* Stuff seen so far when composing a string. */
  unsigned_char_dynarr *composite_chars;

  /* If we saw an invalid designation sequence for a particular
     register, we flag it here and switch to ASCII. The next time we
     see a valid designation for this register, we turn off the flag
     and do the designation normally, but pretend the sequence was
     invalid. The effect of all this is that (most of the time) the
     escape sequences for both the switch to the unknown charset, and
     the switch back to the known charset, get inserted literally into
     the buffer and saved out as such. The hope is that we can
     preserve the escape sequences so that the resulting written out
     file makes sense. If we don't do any of this, the designation
     to the invalid charset will be preserved but the switch back to
     the known charset will probably get eaten because it was the same
     charset that was already present in the register. */
  unsigned char invalid_designated[4];

  /* We try to do similar things as above for direction-switching
     sequences. If we encountered a direction switch while an
     invalid designation was present, or an invalid designation
     just after a direction switch (i.e. no valid designation
     encountered yet), we insert the direction-switch escape
     sequence literally into the output stream, and later on
     insert the corresponding direction-restoring escape sequence
     literally also. */
  unsigned int switched_dir_and_no_valid_charset_yet :1;
  unsigned int invalid_switch_dir :1;

  /* Tells the decoder to output the escape sequence literally
     even though it was valid. Used in the games we play to
     avoid lossage when we encounter invalid designations. */
  unsigned int output_literally :1;

  /* We encountered a direction switch followed by an invalid
     designation. We didn't output the direction switch
     literally because we didn't know about the invalid designation;
     but we have to do so now. */
  unsigned int output_direction_sequence :1;
};

Lisp_Object Fcopy_coding_system (Lisp_Object old_coding_system,
                                 Lisp_Object new_name);

struct detection_state;

/* Shift-JIS */
static int detect_coding_sjis (struct detection_state *st,
                               CONST unsigned char *src,
                               unsigned int n);
static void decode_coding_sjis (Lstream *decoding,
                                CONST unsigned char *src,
                                unsigned_char_dynarr *dst,
                                unsigned int n);
static void encode_coding_sjis (Lstream *encoding,
                                CONST unsigned char *src,
                                unsigned_char_dynarr *dst,
                                unsigned int n);

/* Big5 */
static int detect_coding_big5 (struct detection_state *st,
                               CONST unsigned char *src,
                               unsigned int n);
static void decode_coding_big5 (Lstream *decoding,
                                CONST unsigned char *src,
                                unsigned_char_dynarr *dst,
                                unsigned int n);
static void encode_coding_big5 (Lstream *encoding,
                                CONST unsigned char *src,
                                unsigned_char_dynarr *dst,
                                unsigned int n);

/* ISO2022 */
static int postprocess_iso2022_mask (int mask);
static void reset_iso2022 (Lisp_Object coding_system,
                           struct iso2022_decoder *iso);
static int detect_coding_iso2022 (struct detection_state *st,
                                  CONST unsigned char *src,
                                  unsigned int n);
static void decode_coding_iso2022 (Lstream *decoding,
                                   CONST unsigned char *src,
                                   unsigned_char_dynarr *dst,
                                   unsigned int n);
static void encode_coding_iso2022 (Lstream *encoding,
                                   CONST unsigned char *src,
                                   unsigned_char_dynarr *dst,
                                   unsigned int n);

/* No conversion */
static void decode_coding_no_conversion (Lstream *decoding,
                                         CONST unsigned char *src,
                                         unsigned_char_dynarr *dst,
                                         unsigned int n);
static void encode_coding_no_conversion (Lstream *encoding,
                                         CONST unsigned char *src,
                                         unsigned_char_dynarr *dst,
                                         unsigned int n);

/* Generic entry points */
static void mule_decode (Lstream *decoding,
                         CONST unsigned char *src,
                         unsigned_char_dynarr *dst,
                         unsigned int n);
static void mule_encode (Lstream *encoding,
                         CONST unsigned char *src,
                         unsigned_char_dynarr *dst,
                         unsigned int n);

/* One entry per known coding system property: the property symbol and
   a value recording which coding system types the property applies to. */
typedef struct codesys_prop codesys_prop;
struct codesys_prop
{
  Lisp_Object sym;
  int prop_type;
};

typedef struct
{
  Dynarr_declare (codesys_prop);
} codesys_prop_dynarr;
codesys_prop_dynarr *the_codesys_prop_dynarr; enum codesys_prop_enum { CODESYS_PROP_ALL_OK, CODESYS_PROP_ISO2022, CODESYS_PROP_CCL }; /************************************************************************/ /* Coding system functions */ /************************************************************************/ static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object)); static void print_coding_system (Lisp_Object, Lisp_Object, int); static void finalize_coding_system (void *header, int for_disksave); DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system, mark_coding_system, print_coding_system, finalize_coding_system, 0, 0, struct Lisp_Coding_System); static Lisp_Object mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object)) { struct Lisp_Coding_System *codesys = XCODING_SYSTEM (obj); (markobj) (CODING_SYSTEM_NAME (codesys)); (markobj) (CODING_SYSTEM_DOC_STRING (codesys)); (markobj) (CODING_SYSTEM_MNEMONIC (codesys)); (markobj) (CODING_SYSTEM_EOL_LF (codesys)); (markobj) (CODING_SYSTEM_EOL_CRLF (codesys)); (markobj) (CODING_SYSTEM_EOL_CR (codesys)); switch (CODING_SYSTEM_TYPE (codesys)) { int i; case CODESYS_ISO2022: for (i = 0; i < 4; i++) (markobj) (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)); if (codesys->iso2022.input_conv) { for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++) { struct charset_conversion_spec *ccs = Dynarr_atp (codesys->iso2022.input_conv, i); (markobj) (ccs->from_charset); (markobj) (ccs->to_charset); } } if (codesys->iso2022.output_conv) { for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++) { struct charset_conversion_spec *ccs = Dynarr_atp (codesys->iso2022.output_conv, i); (markobj) (ccs->from_charset); (markobj) (ccs->to_charset); } } break; case CODESYS_CCL: (markobj) (CODING_SYSTEM_CCL_DECODE (codesys)); (markobj) (CODING_SYSTEM_CCL_ENCODE (codesys)); break; default: break; } (markobj) (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys)); return 
CODING_SYSTEM_POST_READ_CONVERSION (codesys); } static void print_coding_system (Lisp_Object obj, Lisp_Object printcharfun, int escapeflag) { struct Lisp_Coding_System *c = XCODING_SYSTEM (obj); if (print_readably) error ("printing unreadable object #<coding_system 0x%x>", c->header.uid); write_c_string ("#<coding_system ", printcharfun); print_internal (c->name, printcharfun, 1); write_c_string (">", printcharfun); } static void finalize_coding_system (void *header, int for_disksave) { struct Lisp_Coding_System *c = (struct Lisp_Coding_System *) header; /* Since coding systems never go away, this function is not necessary. But it would be necessary if we changed things so that coding systems could go away. */ if (!for_disksave) /* see comment in lstream.c */ { switch (CODING_SYSTEM_TYPE (c)) { case CODESYS_ISO2022: if (c->iso2022.input_conv) { Dynarr_free (c->iso2022.input_conv); c->iso2022.input_conv = 0; } if (c->iso2022.output_conv) { Dynarr_free (c->iso2022.output_conv); c->iso2022.output_conv = 0; } break; default: break; } } } static enum eol_type symbol_to_eol_type (Lisp_Object symbol) { CHECK_SYMBOL (symbol); if (NILP (symbol)) return EOL_AUTODETECT; if (EQ (symbol, Qlf)) return EOL_LF; if (EQ (symbol, Qcrlf)) return EOL_CRLF; if (EQ (symbol, Qcr)) return EOL_CR; signal_simple_error ("Unrecognized eol type", symbol); return EOL_AUTODETECT; /* not reached */ } static Lisp_Object eol_type_to_symbol (enum eol_type type) { switch (type) { case EOL_LF: return Qlf; case EOL_CRLF: return Qcrlf; case EOL_CR: return Qcr; case EOL_AUTODETECT: return Qnil; default: abort (); return Qnil; /* not reached */ } } static void setup_eol_coding_systems (struct Lisp_Coding_System *codesys) { Lisp_Object codesys_obj = Qnil; int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name); char *codesys_name = (char *) alloca (len + 7); Lisp_Object codesys_name_sym, sub_codesys_obj; /* kludge */ XSETCODING_SYSTEM (codesys_obj, codesys); memcpy (codesys_name, string_data 
(XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len); #define DEFINE_SUB_CODESYS(op_sys, Type) do { \ strcpy (codesys_name + len, "-" op_sys); \ codesys_name_sym = intern (codesys_name); \ sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \ XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \ CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \ } while (0) DEFINE_SUB_CODESYS("unix", EOL_LF); DEFINE_SUB_CODESYS("dos", EOL_CRLF); DEFINE_SUB_CODESYS("mac", EOL_CR); } DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /* T if OBJECT is a coding system. A coding system is an object that defines how text containing multiple character sets is encoded into a stream of (typically 8-bit) bytes. The coding system is used to decode the stream into a series of characters (which may be from multiple charsets) when the text is read from a file or process, and is used to encode the text back into the same format when it is written out to a file or process. For example, many ISO2022-compliant coding systems (such as Compound Text, which is used for inter-client data under the X Window System) use escape sequences to switch between different charsets -- Japanese Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See `make-coding-system' for more information. Coding systems are normally identified using a symbol, and the symbol is accepted in place of the actual coding system object whenever a coding system is called for. (This is similar to how faces work.) */ (object)) { return CODING_SYSTEMP (object) ? Qt : Qnil; } DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /* Retrieve the coding system of the given name. If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol. If there is no such coding system, nil is returned. Otherwise the associated coding system object is returned. 
*/ (coding_system_or_name)) { if (NILP (coding_system_or_name)) coding_system_or_name = Qbinary; if (CODING_SYSTEMP (coding_system_or_name)) return coding_system_or_name; CHECK_SYMBOL (coding_system_or_name); return Fgethash (coding_system_or_name, Vcoding_system_hashtable, Qnil); } DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /* Retrieve the coding system of the given name. Same as `find-coding-system' except that if there is no such coding system, an error is signaled instead of returning nil. */ (name)) { Lisp_Object coding_system = Ffind_coding_system (name); if (NILP (coding_system)) signal_simple_error ("No such coding system", name); return coding_system; } /* We store the coding systems in hash tables with the names as the key and the actual coding system object as the value. Occasionally we need to use them in a list format. These routines provide us with that. */ struct coding_system_list_closure { Lisp_Object *coding_system_list; }; static int add_coding_system_to_list_mapper (CONST void *hash_key, void *hash_contents, void *coding_system_list_closure) { /* This function can GC */ Lisp_Object key, contents; Lisp_Object *coding_system_list; struct coding_system_list_closure *cscl = (struct coding_system_list_closure *) coding_system_list_closure; CVOID_TO_LISP (key, hash_key); VOID_TO_LISP (contents, hash_contents); coding_system_list = cscl->coding_system_list; *coding_system_list = Fcons (XCODING_SYSTEM (contents)->name, *coding_system_list); return 0; } DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /* Return a list of the names of all defined coding systems. 
*/ ()) { Lisp_Object coding_system_list = Qnil; struct gcpro gcpro1; struct coding_system_list_closure coding_system_list_closure; GCPRO1 (coding_system_list); coding_system_list_closure.coding_system_list = &coding_system_list; elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hashtable, &coding_system_list_closure); UNGCPRO; return coding_system_list; } DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /* Return the name of the given coding system. */ (coding_system)) { coding_system = Fget_coding_system (coding_system); return XCODING_SYSTEM_NAME (coding_system); } static struct Lisp_Coding_System * allocate_coding_system (enum coding_system_type type, Lisp_Object name) { struct Lisp_Coding_System *codesys = alloc_lcrecord_type (struct Lisp_Coding_System, lrecord_coding_system); zero_lcrecord (codesys); CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil; CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil; CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT; CODING_SYSTEM_EOL_CRLF (codesys) = Qnil; CODING_SYSTEM_EOL_CR (codesys) = Qnil; CODING_SYSTEM_EOL_LF (codesys) = Qnil; CODING_SYSTEM_TYPE (codesys) = type; if (type == CODESYS_ISO2022) { int i; for (i = 0; i < 4; i++) CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil; } else if (type == CODESYS_CCL) { CODING_SYSTEM_CCL_DECODE (codesys) = Qnil; CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil; } CODING_SYSTEM_NAME (codesys) = name; return codesys; } /* Given a list of charset conversion specs as specified in a Lisp program, parse it into STORE_HERE. 
*/ static void parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here, Lisp_Object spec_list) { Lisp_Object rest; EXTERNAL_LIST_LOOP (rest, spec_list) { Lisp_Object car = XCAR (rest); Lisp_Object from, to; struct charset_conversion_spec spec; if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car)))) signal_simple_error ("Invalid charset conversion spec", car); from = Fget_charset (XCAR (car)); to = Fget_charset (XCAR (XCDR (car))); if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to)) signal_simple_error_2 ("Attempted conversion between different charset types", from, to); spec.from_charset = from; spec.to_charset = to; Dynarr_add (store_here, spec); } } /* Given a dynarr LOAD_HERE of internally-stored charset conversion specs, return the equivalent as the Lisp programmer would see it. If LOAD_HERE is 0, return Qnil. */ static Lisp_Object unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here) { int i; Lisp_Object result = Qnil; if (!load_here) return Qnil; for (i = 0; i < Dynarr_length (load_here); i++) { struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i); result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result); } return Fnreverse (result); } DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /* Register symbol NAME as a coding system. TYPE describes the conversion method used and should be one of nil or 'undecided Automatic conversion. XEmacs attempts to detect the coding system used in the file. 'no-conversion No conversion. Use this for binary files and such. On output, graphic characters that are not in ASCII or Latin-1 will be replaced by a ?. (For a no-conversion-encoded buffer, these characters will only be present if you explicitly insert them.) 'shift-jis Shift-JIS (a Japanese encoding commonly used in PC operating systems). 'iso2022 Any ISO2022-compliant encoding. 
Among other things, this includes JIS (the Japanese encoding commonly used for e-mail), EUC (the standard Unix encoding for Japanese and other languages), and Compound Text (the encoding used in X11). You can specify more specific information about the conversion with the FLAGS argument. 'big5 Big5 (the encoding commonly used for Taiwanese). 'ccl The conversion is performed using a user-written pseudo-code program. CCL (Code Conversion Language) is the name of this pseudo-code. 'internal Write out or read in the raw contents of the memory representing the buffer's text. This is primarily useful for debugging purposes, and is only enabled when XEmacs has been compiled with DEBUG_XEMACS defined (via the --debug configure option). WARNING: Reading in a file using 'internal conversion can result in an internal inconsistency in the memory representing a buffer's text, which will produce unpredictable results and may cause XEmacs to crash. Under normal circumstances you should never use 'internal conversion. DOC-STRING is a string describing the coding system. PROPS is a property list, describing the specific nature of the character set. Recognized properties are: 'mnemonic String to be displayed in the modeline when this coding system is active. 'eol-type End-of-line conversion to be used. It should be one of nil Automatically detect the end-of-line type (LF, CRLF, or CR). Also generate subsidiary coding systems named `NAME-unix', `NAME-dos', and `NAME-mac', that are identical to this coding system but have an EOL-TYPE value of 'lf, 'crlf, and 'cr, respectively. 'lf The end of a line is marked externally using ASCII LF. Since this is also the way that XEmacs represents an end-of-line internally, specifying this option results in no end-of-line conversion. This is the standard format for Unix text files. 'crlf The end of a line is marked externally using ASCII CRLF. This is the standard format for MS-DOS text files. 
'cr The end of a line is marked externally using ASCII CR. This is the standard format for Macintosh text files. t Automatically detect the end-of-line type but do not generate subsidiary coding systems. (This value is converted to nil when stored internally, and `coding-system-property' will return nil.) 'post-read-conversion Function called after a file has been read in, to perform the decoding. Called with two arguments, BEG and END, denoting a region of the current buffer to be decoded. 'pre-write-conversion Function called before a file is written out, to perform the encoding. Called with two arguments, BEG and END, denoting a region of the current buffer to be encoded. The following additional properties are recognized if TYPE is 'iso2022: 'charset-g0 'charset-g1 'charset-g2 'charset-g3 The character set initially designated to the G0 - G3 registers. The value should be one of -- A charset object (designate that character set) -- nil (do not ever use this register) -- t (no character set is initially designated to the register, but may be later on; this automatically sets the corresponding `force-g*-on-output' property) 'force-g0-on-output 'force-g1-on-output 'force-g2-on-output 'force-g2-on-output If non-nil, send an explicit designation sequence on output before using the specified register. 'short If non-nil, use the short forms "ESC $ @", "ESC $ A", and "ESC $ B" on output in place of the full designation sequences "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B". 'no-ascii-eol If non-nil, don't designate ASCII to G0 at each end of line on output. Setting this to non-nil also suppresses other state-resetting that normally happens at the end of a line. 'no-ascii-cntl If non-nil, don't designate ASCII to G0 before control chars on output. 'seven If non-nil, use 7-bit environment on output. Otherwise, use 8-bit environment. 'lock-shift If non-nil, use locking-shift (SO/SI) instead of single-shift or designation by escape sequence. 
'no-iso6429 If non-nil, don't use ISO6429's direction specification. 'escape-quoted If non-nil, literal control characters that are the same as the beginning of a recognized ISO2022 or ISO6429 escape sequence (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E), SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character so that they can be properly distinguished from an escape sequence. (Note that doing this results in a non-portable encoding.) This encoding flag is used for byte-compiled files. Note that ESC is a good choice for a quoting character because there are no escape sequences whose second byte is a character from the Control-0 or Control-1 character sets; this is explicitly disallowed by the ISO2022 standard. 'input-charset-conversion A list of conversion specifications, specifying conversion of characters in one charset to another when decoding is performed. Each specification is a list of two elements: the source charset, and the destination charset. 'output-charset-conversion A list of conversion specifications, specifying conversion of characters in one charset to another when encoding is performed. The form of each specification is the same as for 'input-charset-conversion. The following additional properties are recognized (and required) if TYPE is 'ccl: 'decode CCL program used for decoding (converting to internal format). 'encode CCL program used for encoding (converting to external format). 
*/ (name, type, doc_string, props)) { struct Lisp_Coding_System *codesys; Lisp_Object rest, key, value; enum coding_system_type ty; int need_to_setup_eol_systems = 1; /* Convert type to constant */ if (NILP (type) || EQ (type, Qundecided)) { ty = CODESYS_AUTODETECT; } else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; } else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; } else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; } else if (EQ (type, Qccl)) { ty = CODESYS_CCL; } else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; } #ifdef DEBUG_XEMACS else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; } #endif else signal_simple_error ("Invalid coding system type", type); CHECK_SYMBOL (name); codesys = allocate_coding_system (ty, name); if (NILP (doc_string)) doc_string = build_string (""); else CHECK_STRING (doc_string); CODING_SYSTEM_DOC_STRING (codesys) = doc_string; EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props) { if (EQ (key, Qmnemonic)) { if (!NILP (value)) CHECK_STRING (value); CODING_SYSTEM_MNEMONIC (codesys) = value; } else if (EQ (key, Qeol_type)) { need_to_setup_eol_systems = NILP (value); if (EQ (value, Qt)) value = Qnil; CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value); } else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value; else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value; else if (ty == CODESYS_ISO2022) { #define FROB_INITIAL_CHARSET(charset_num) \ CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \ ((EQ (value, Qt) || EQ (value, Qnil)) ? 
value : Fget_charset (value)) if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0); else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1); else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2); else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3); #define FROB_FORCE_CHARSET(charset_num) \ CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value) else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0); else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1); else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2); else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3); #define FROB_BOOLEAN_PROPERTY(prop) \ CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value) else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT); else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL); else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL); else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN); else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT); else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429); else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED); else if (EQ (key, Qinput_charset_conversion)) { codesys->iso2022.input_conv = Dynarr_new (charset_conversion_spec); parse_charset_conversion_specs (codesys->iso2022.input_conv, value); } else if (EQ (key, Qoutput_charset_conversion)) { codesys->iso2022.output_conv = Dynarr_new (charset_conversion_spec); parse_charset_conversion_specs (codesys->iso2022.output_conv, value); } else signal_simple_error ("Unrecognized property", key); } else if (EQ (type, Qccl)) { if (EQ (key, Qdecode)) { CHECK_VECTOR (value); CODING_SYSTEM_CCL_DECODE (codesys) = value; } else if (EQ (key, Qencode)) { CHECK_VECTOR (value); CODING_SYSTEM_CCL_ENCODE (codesys) = value; } else signal_simple_error ("Unrecognized property", key); } else signal_simple_error ("Unrecognized property", key); } if 
(need_to_setup_eol_systems) setup_eol_coding_systems (codesys); { Lisp_Object codesys_obj; XSETCODING_SYSTEM (codesys_obj, codesys); Fputhash (name, codesys_obj, Vcoding_system_hashtable); return codesys_obj; } } DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /* Copy OLD-CODING-SYSTEM to NEW-NAME. If NEW-NAME does not name an existing coding system, a new one will be created. */ (old_coding_system, new_name)) { Lisp_Object new_coding_system; old_coding_system = Fget_coding_system (old_coding_system); new_coding_system = Ffind_coding_system (new_name); if (NILP (new_coding_system)) { XSETCODING_SYSTEM (new_coding_system, allocate_coding_system (XCODING_SYSTEM_TYPE (old_coding_system), new_name)); Fputhash (new_name, new_coding_system, Vcoding_system_hashtable); } { struct Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system); struct Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system); memcpy (((char *) to ) + sizeof (to->header), ((char *) from) + sizeof (from->header), sizeof (*from) - sizeof (from->header)); to->name = new_name; } return new_coding_system; } static Lisp_Object subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type) { struct Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system); Lisp_Object new_coding_system; if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT) return coding_system; switch (type) { case EOL_AUTODETECT: return coding_system; case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break; case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break; case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break; default: abort (); } return NILP (new_coding_system) ? coding_system : new_coding_system; } DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /* Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE. 
*/ (coding_system, eol_type)) { coding_system = Fget_coding_system (coding_system); return subsidiary_coding_system (coding_system, symbol_to_eol_type (eol_type)); } /************************************************************************/ /* Coding system accessors */ /************************************************************************/ DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /* Return the doc string for CODING-SYSTEM. */ (coding_system)) { coding_system = Fget_coding_system (coding_system); return XCODING_SYSTEM_DOC_STRING (coding_system); } DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /* Return the type of CODING-SYSTEM. */ (coding_system)) { switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system))) { case CODESYS_AUTODETECT: return Qundecided; case CODESYS_SHIFT_JIS: return Qshift_jis; case CODESYS_ISO2022: return Qiso2022; case CODESYS_BIG5: return Qbig5; case CODESYS_CCL: return Qccl; case CODESYS_NO_CONVERSION: return Qno_conversion; #ifdef DEBUG_XEMACS case CODESYS_INTERNAL: return Qinternal; #endif default: abort (); } return Qnil; /* not reached */ } Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum) { Lisp_Object cs = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum); if (CHARSETP(cs)){ return XCHARSET_NAME(cs); } else { return Qnil; } } DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /* Return initial charset of CODING-SYSTEM designated to GNUM. GNUM allows 0 .. 3. */ (coding_system, gnum)) { coding_system = Fget_coding_system (coding_system); CHECK_INT (gnum); return coding_system_charset(coding_system, XINT (gnum)); } DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /* Return the PROP property of CODING-SYSTEM. 
*/
       (coding_system, prop))
{
  int i, ok = 0;
  enum coding_system_type type;

  coding_system = Fget_coding_system (coding_system);
  CHECK_SYMBOL (prop);
  type = XCODING_SYSTEM_TYPE (coding_system);

  /* Phase 1: validation.  Look PROP up in the table of known
     properties.  An entry may restrict the property to ISO2022 or CCL
     coding systems; asking for it on another type signals an error.
     The loop stops at the first matching entry. */
  for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
    if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
      {
        ok = 1;
        switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
          {
          case CODESYS_PROP_ALL_OK:
            break;
          case CODESYS_PROP_ISO2022:
            if (type != CODESYS_ISO2022)
              signal_simple_error
                ("Property only valid in ISO2022 coding systems",
                 prop);
            break;
          case CODESYS_PROP_CCL:
            if (type != CODESYS_CCL)
              signal_simple_error
                ("Property only valid in CCL coding systems",
                 prop);
            break;
          default:
            abort ();
          }
      }

  if (!ok)
    signal_simple_error ("Unrecognized property", prop);

  /* Phase 2: dispatch.  Properties common to every coding system come
     first, then the type-specific ones.  A property that passed
     validation but is not handled by any branch below reaches one of
     the abort() calls. */
  if (EQ (prop, Qname))
    return XCODING_SYSTEM_NAME (coding_system);
  else if (EQ (prop, Qtype))
    return Fcoding_system_type (coding_system);
  else if (EQ (prop, Qdoc_string))
    return XCODING_SYSTEM_DOC_STRING (coding_system);
  else if (EQ (prop, Qmnemonic))
    return XCODING_SYSTEM_MNEMONIC (coding_system);
  else if (EQ (prop, Qeol_type))
    return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
  else if (EQ (prop, Qeol_lf))
    return XCODING_SYSTEM_EOL_LF (coding_system);
  else if (EQ (prop, Qeol_crlf))
    return XCODING_SYSTEM_EOL_CRLF (coding_system);
  else if (EQ (prop, Qeol_cr))
    return XCODING_SYSTEM_EOL_CR (coding_system);
  else if (EQ (prop, Qpost_read_conversion))
    return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
  else if (EQ (prop, Qpre_write_conversion))
    return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
  else if (type == CODESYS_ISO2022)
    {
      if (EQ (prop, Qcharset_g0))
        return coding_system_charset (coding_system, 0);
      else if (EQ (prop, Qcharset_g1))
        return coding_system_charset (coding_system, 1);
      else if (EQ (prop, Qcharset_g2))
        return coding_system_charset (coding_system, 2);
      else if (EQ (prop, Qcharset_g3))
        return coding_system_charset (coding_system, 3);

/* Turn the force-on-output setting of register CHARSET_NUM into Qt or
   Qnil.  Note that this macro (and LISP_BOOLEAN below) is defined in
   the middle of the function and is not #undef'ed afterwards. */
#define FORCE_CHARSET(charset_num) \
  (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
   (coding_system, charset_num) ? Qt : Qnil)

      else if (EQ (prop, Qforce_g0_on_output))
        return FORCE_CHARSET (0);
      else if (EQ (prop, Qforce_g1_on_output))
        return FORCE_CHARSET (1);
      else if (EQ (prop, Qforce_g2_on_output))
        return FORCE_CHARSET (2);
      else if (EQ (prop, Qforce_g3_on_output))
        return FORCE_CHARSET (3);

/* Turn the ISO2022 setting whose accessor is
   XCODING_SYSTEM_ISO2022_<prop> into Qt or Qnil. */
#define LISP_BOOLEAN(prop) \
  (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)

      else if (EQ (prop, Qshort))
        return LISP_BOOLEAN (SHORT);
      else if (EQ (prop, Qno_ascii_eol))
        return LISP_BOOLEAN (NO_ASCII_EOL);
      else if (EQ (prop, Qno_ascii_cntl))
        return LISP_BOOLEAN (NO_ASCII_CNTL);
      else if (EQ (prop, Qseven))
        return LISP_BOOLEAN (SEVEN);
      else if (EQ (prop, Qlock_shift))
        return LISP_BOOLEAN (LOCK_SHIFT);
      else if (EQ (prop, Qno_iso6429))
        return LISP_BOOLEAN (NO_ISO6429);
      else if (EQ (prop, Qescape_quoted))
        return LISP_BOOLEAN (ESCAPE_QUOTED);
      else if (EQ (prop, Qinput_charset_conversion))
        return
          unparse_charset_conversion_specs
            (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
      else if (EQ (prop, Qoutput_charset_conversion))
        return
          unparse_charset_conversion_specs
            (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
      else
        abort ();
    }
  else if (type == CODESYS_CCL)
    {
      if (EQ (prop, Qdecode))
        return XCODING_SYSTEM_CCL_DECODE (coding_system);
      else if (EQ (prop, Qencode))
        return XCODING_SYSTEM_CCL_ENCODE (coding_system);
      else
        abort ();
    }
  else
    abort ();

  return Qnil; /* not reached */
}


/************************************************************************/
/*                       Coding category functions                      */
/************************************************************************/

/* Translate the coding-category symbol SYMBOL into its index in
   coding_category_symbol[].  Signals an error if SYMBOL is not a
   symbol or does not name any category. */
static int
decode_coding_category (Lisp_Object symbol)
{
  int i;

  CHECK_SYMBOL (symbol);
  for (i = 0; i <= CODING_CATEGORY_LAST; i++)
    if (EQ (coding_category_symbol[i], symbol))
      return i;

  signal_simple_error ("Unrecognized coding category", symbol);
  return 0; /* not reached */
}

DEFUN ("coding-category-list",
Fcoding_category_list, 0, 0, 0, /* Return a list of all recognized coding categories. */ ()) { int i; Lisp_Object list = Qnil; for (i = CODING_CATEGORY_LAST; i >= 0; i--) list = Fcons (coding_category_symbol[i], list); return list; } DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /* Change the priority order of the coding categories. LIST should be list of coding categories, in descending order of priority. Unspecified coding categories will be lower in priority than all specified ones, in the same relative order they were in previously. */ (list)) { int category_to_priority[CODING_CATEGORY_LAST + 1]; int i, j; Lisp_Object rest; /* First generate a list that maps coding categories to priorities. */ for (i = 0; i <= CODING_CATEGORY_LAST; i++) category_to_priority[i] = -1; /* Highest priority comes from the specified list. */ i = 0; EXTERNAL_LIST_LOOP (rest, list) { int cat = decode_coding_category (XCAR (rest)); if (category_to_priority[cat] >= 0) signal_simple_error ("Duplicate coding category in list", XCAR (rest)); category_to_priority[cat] = i++; } /* Now go through the existing categories by priority to retrieve the categories not yet specified and preserve their priority order. */ for (j = 0; j <= CODING_CATEGORY_LAST; j++) { int cat = coding_category_by_priority[j]; if (category_to_priority[cat] < 0) category_to_priority[cat] = i++; } /* Now we need to construct the inverse of the mapping we just constructed. */ for (i = 0; i <= CODING_CATEGORY_LAST; i++) coding_category_by_priority[category_to_priority[i]] = i; /* Phew! That was confusing. */ return Qnil; } DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /* Return a list of coding categories in descending order of priority. 
*/ ()) { int i; Lisp_Object list = Qnil; for (i = CODING_CATEGORY_LAST; i >= 0; i--) list = Fcons (coding_category_symbol[coding_category_by_priority[i]], list); return list; } DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /* Change the coding system associated with a coding category. */ (coding_category, coding_system)) { int cat = decode_coding_category (coding_category); coding_system = Fget_coding_system (coding_system); coding_category_system[cat] = coding_system; return Qnil; } DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /* Return the coding system associated with a coding category. */ (coding_category)) { int cat = decode_coding_category (coding_category); Lisp_Object sys = coding_category_system[cat]; if (!NILP (sys)) return XCODING_SYSTEM_NAME (sys); return Qnil; } /************************************************************************/ /* Detecting the encoding of data */ /************************************************************************/ struct detection_state { enum eol_type eol_type; int seen_non_ascii; int mask; struct { int mask; int in_second_byte; } big5; struct { int mask; int in_second_byte; } shift_jis; struct { int mask; int initted; struct iso2022_decoder iso; unsigned int flags; int high_byte_count; unsigned int saw_single_shift:1; } iso2022; struct { int seen_anything; int just_saw_cr; } eol; }; static int acceptable_control_char_p (int c) { switch (c) { /* Allow and ignore control characters that you might reasonably see in a text file */ case '\r': case '\n': case '\t': case 7: /* bell */ case 8: /* backspace */ case 11: /* vertical tab */ case 12: /* form feed */ case 26: /* MS-DOS C-z junk */ case 31: /* '^_' -- for info */ return 1; default: return 0; } } static int mask_has_at_most_one_bit_p (int mask) { /* Perhaps the only thing useful you learn from intensive Microsoft technical interviews */ return (mask & (mask - 1)) == 0; } static enum eol_type detect_eol_type (struct 
detection_state *st, CONST unsigned char *src,
                 unsigned int n)
{
  /* Scan the N bytes at SRC for the first conclusive line ending,
     using ST->eol to remember state across calls.  Returns EOL_CRLF,
     EOL_LF or EOL_CR as soon as one is recognized, or EOL_AUTODETECT
     if this chunk was inconclusive. */
  int c;

  while (n--)
    {
      c = *src++;
      if (c == '\r')
        /* Remember the CR; what follows decides between CR and CRLF. */
        st->eol.just_saw_cr = 1;
      else
        {
          if (c == '\n')
            {
              if (st->eol.just_saw_cr)
                return EOL_CRLF;
              /* A lone LF only counts once some earlier byte has been
                 seen; an LF that is the very first byte falls through
                 without deciding anything. */
              else if (st->eol.seen_anything)
                return EOL_LF;
            }
          else if (st->eol.just_saw_cr)
            /* CR followed by something other than LF. */
            return EOL_CR;
          st->eol.just_saw_cr = 0;
        }
      st->eol.seen_anything = 1;
    }

  return EOL_AUTODETECT;
}

/* Attempt to determine the encoding and EOL type of the given text.
   Before calling this function for the first time, you must initialize
   st->eol_type as appropriate and initialize st->mask to ~0.

   st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
   not yet known.

   st->mask holds the determined coding category mask, or ~0 if only
   ASCII has been seen so far.

   If JUST_DO_EOL is nonzero, only EOL detection is performed and the
   return value says whether the EOL type is now known.

   Returns:

   0 == st->eol_type is EOL_AUTODETECT and/or more than one coding
        category is present in st->mask
   1 == definitive answers are here for both st->eol_type and st->mask
*/

static int
detect_coding_type (struct detection_state *st, CONST unsigned char *src,
                    unsigned int n, int just_do_eol)
{
  int c;

  if (st->eol_type == EOL_AUTODETECT)
    st->eol_type = detect_eol_type (st, src, n);

  if (just_do_eol)
    return st->eol_type != EOL_AUTODETECT;

  if (!st->seen_non_ascii)
    {
      /* Skip the leading run of plain ASCII.  The first byte that is
         either 8-bit or an unexpected control character switches the
         per-encoding detectors on (all categories possible) and leaves
         SRC/N pointing at that byte. */
      for (; n; n--, src++)
        {
          c = *src;
          if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
            {
              st->seen_non_ascii = 1;
              st->shift_jis.mask = ~0;
              st->big5.mask = ~0;
              st->iso2022.mask = ~0;
              break;
            }
        }
    }

  /* Nothing left to examine: the chunk was empty or entirely ASCII. */
  if (!n)
    return 0;

  /* Run each detector that has not yet narrowed its own mask down to
     at most one category. */
  if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
    st->iso2022.mask = detect_coding_iso2022 (st, src, n);
  if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
    st->shift_jis.mask = detect_coding_sjis (st, src, n);
  if (!mask_has_at_most_one_bit_p (st->big5.mask))
    st->big5.mask = detect_coding_big5 (st, src, n);

  st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask;

  {
    /* Decide "definitive" before adding the no-conversion category,
       which is always added to the final mask. */
    int retval = mask_has_at_most_one_bit_p (st->mask);
    st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
    return retval && st->eol_type != EOL_AUTODETECT;
  }
}

static Lisp_Object
coding_system_from_mask (int mask) { if (mask == ~0) { /* If the file was entirely or basically ASCII, use the default value of `buffer-file-coding-system'. */ Lisp_Object retval = XBUFFER (Vbuffer_defaults)->buffer_file_coding_system; if (!NILP (retval)) { retval = Ffind_coding_system (retval); if (NILP (retval)) { warn_when_safe (Qbad_variable, Qwarning, "Invalid `default-buffer-file-coding-system', set to nil"); XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil; } } if (NILP (retval)) retval = Fget_coding_system (Qno_conversion); return retval; } else { int i; int cat = -1; mask = postprocess_iso2022_mask (mask); /* Look through the coding categories by priority and find the first one that is allowed. */ for (i = 0; i <= CODING_CATEGORY_LAST; i++) { cat = coding_category_by_priority[i]; if ((mask & (1 << cat)) && !NILP (coding_category_system[cat])) break; } if (cat >= 0) return coding_category_system[cat]; else return Fget_coding_system (Qno_conversion); } } /* Given a seekable read stream and potential coding system and EOL type as specified, do any autodetection that is called for. If the coding system and/or EOL type are not autodetect, they will be left alone; but this function will never return an autodetect coding system or EOL type. This function does not automatically fetch subsidiary coding systems; that should be unnecessary with the explicit eol-type argument. */ static void determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out, enum eol_type *eol_type_in_out) { struct detection_state decst; if (*eol_type_in_out == EOL_AUTODETECT) *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out); memset (&decst, 0, sizeof (decst)); decst.eol_type = *eol_type_in_out; decst.mask = ~0; /* If autodetection is called for, do it now. 
*/ if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT || *eol_type_in_out == EOL_AUTODETECT) { while (1) { unsigned char random_buffer[4096]; int nread; nread = Lstream_read (stream, random_buffer, sizeof (random_buffer)); if (!nread) break; if (detect_coding_type (&decst, random_buffer, nread, XCODING_SYSTEM_TYPE (*codesys_in_out) != CODESYS_AUTODETECT)) break; } *eol_type_in_out = decst.eol_type; if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT) *codesys_in_out = coding_system_from_mask (decst.mask); } /* If we absolutely can't determine the EOL type, just assume LF. */ if (*eol_type_in_out == EOL_AUTODETECT) *eol_type_in_out = EOL_LF; Lstream_rewind (stream); } DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /* Detect coding system of the text in the region between START and END. Returned a list of possible coding systems ordered by priority. If only ASCII characters are found, it returns 'undecided or one of its subsidiary coding systems according to a detected end-of-line type. Optional arg BUFFER defaults to the current buffer. 
*/
       (start, end, buffer))
{
  Lisp_Object val = Qnil;
  struct buffer *buf = decode_buffer (buffer, 0);
  Bufpos b, e;
  Lisp_Object instream, lb_instream;
  Lstream *istr, *lb_istr;
  struct detection_state decst;
  struct gcpro gcpro1, gcpro2;

  get_buffer_range_char (buf, start, end, &b, &e, 0);

  /* Read the region through an encoding stream using the `binary'
     coding system and feed the resulting bytes to the detector. */
  lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
  lb_istr = XLSTREAM (lb_instream);
  instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
  istr = XLSTREAM (instream);
  GCPRO2 (instream, lb_instream);

  memset (&decst, 0, sizeof (decst));
  decst.eol_type = EOL_AUTODETECT;
  decst.mask = ~0;

  while (1)
    {
      unsigned char random_buffer[4096];
      int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));

      /* Stop on end of data and also on a read error.  A negative
         count must never reach detect_coding_type(), whose length
         parameter is unsigned. */
      if (nread <= 0)
        break;
      if (detect_coding_type (&decst, random_buffer, nread, 0))
        break;
    }

  if (decst.mask == ~0)
    /* Only ASCII seen: answer with `undecided', narrowed by whatever
       EOL type was detected. */
    val = subsidiary_coding_system (Fget_coding_system (Qundecided),
                                    decst.eol_type);
  else
    {
      int i;

      val = Qnil;
      decst.mask = postprocess_iso2022_mask (decst.mask);

      /* Walk from the lowest priority upwards and push, so the head of
         the finished list is the highest-priority candidate.  A
         category in the mask that has no coding system assigned
         contributes nil to the list. */
      for (i = CODING_CATEGORY_LAST; i >= 0; i--)
        {
          int sys = coding_category_by_priority[i];

          if (decst.mask & (1 << sys))
            {
              Lisp_Object codesys = coding_category_system[sys];

              if (!NILP (codesys))
                codesys = subsidiary_coding_system (codesys, decst.eol_type);
              val = Fcons (codesys, val);
            }
        }
    }

  Lstream_close (istr);
  UNGCPRO;
  Lstream_delete (istr);
  Lstream_delete (lb_istr);
  return val;
}


/************************************************************************/
/*           Converting to internal Mule format ("decoding")            */
/************************************************************************/

/* A decoding stream is a stream used for decoding text (i.e.
   converting from some external format to internal format).
   The decoding-stream object keeps track of the actual coding
   stream, the stream that is at the other end, and data that
   needs to be persistent across the lifetime of the stream. */

/* Handle the EOL stuff related to just-read-in character C.
   EOL_TYPE is the EOL type of the coding stream.
FLAGS is the current value of FLAGS in the coding stream, and may be
   modified by this macro.  (The macro only looks at the
   CODING_STATE_CR flag.)  DST is the Dynarr to which the decoded bytes
   are to be written.  You need to also define a local goto label
   "label_continue_loop" that is at the end of the main
   character-reading loop.

   If C is a CR character, then this macro handles it entirely and
   jumps to label_continue_loop.  Otherwise, this macro does not add
   anything to DST, and continues normally.  You should continue
   processing C normally after this macro.

   One exception to "does not add anything": when CODING_STATE_CR is
   set (a CR was held back) and C is not LF, the held-back CR is
   written to DST before the flag is cleared.

   The arguments are expanded unparenthesized and more than once, so
   pass plain variables rather than expressions. */

#define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
do { \
  if (c == '\r') \
    { \
      if (eol_type == EOL_CR) \
        Dynarr_add (dst, '\n'); \
      else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
        Dynarr_add (dst, c); \
      else \
        flags |= CODING_STATE_CR; \
      goto label_continue_loop; \
    } \
  else if (flags & CODING_STATE_CR) \
    { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
      if (c != '\n') \
        Dynarr_add (dst, '\r'); \
      flags &= ~CODING_STATE_CR; \
    } \
} while (0)

/* C should be a binary character in the range 0 - 255; convert
   to internal format and add to Dynarr DST.  ASCII bytes are added
   as-is; C1 bytes and all remaining bytes are each added as a
   two-byte sequence introduced by a leading byte. */

#define DECODE_ADD_BINARY_CHAR(c, dst) \
do { \
  if (BYTE_ASCII_P (c)) \
    Dynarr_add (dst, c); \
  else if (BYTE_C1_P (c)) \
    { \
      Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
      Dynarr_add (dst, c + 0x20); \
    } \
  else \
    { \
      Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
      Dynarr_add (dst, c); \
    } \
} while (0)

/* If CH holds a pending byte (is nonzero), emit it as a binary
   character and reset CH to 0.  Note that the destination is not a
   parameter: the macro writes to a variable named `dst' that must be
   in scope at the point of use. */

#define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
do { \
  if (ch) \
    { \
      DECODE_ADD_BINARY_CHAR (ch, dst); \
      ch = 0; \
    } \
} while (0)

/* Flush whatever is still pending: a partial character in CH, and,
   when both CODING_STATE_END and CODING_STATE_CR are set in FLAGS, the
   CR that was being held back. */

#define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
do { \
  DECODE_OUTPUT_PARTIAL_CHAR (ch); \
  if ((flags & CODING_STATE_END) && \
      (flags & CODING_STATE_CR)) \
    Dynarr_add (dst, '\r'); \
} while (0)

/* Fetch the struct decoding_stream stored inside the lstream STREAM. */
#define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)

struct decoding_stream
{
  /* Coding system that governs the conversion. */
  struct Lisp_Coding_System *codesys;

  /* Stream that we read the encoded data from or
     write the decoded data to. */
  Lstream *other_end;

  /* If we are reading, then we can return only a fixed amount of
     data, so if the conversion resulted in too much data, we store it
     here for retrieval the next time around. */
  unsigned_char_dynarr *runoff;

  /* FLAGS holds flags indicating the current state of the decoding.
     Some of these flags are dependent on the coding system. */
  unsigned int flags;

  /* CH holds a partially built-up character.  Since we only deal
     with one- and two-byte characters at the moment, we only use
     this to store the first byte of a two-byte character. */
  unsigned int ch;

  /* EOL_TYPE specifies the type of end-of-line conversion that
     currently applies.  We need to keep this separate from the
     EOL type stored in CODESYS because the latter might indicate
     automatic EOL-type detection while the former will always
     indicate a particular EOL type. */
  enum eol_type eol_type;

  /* Additional ISO2022 information.  We define the structure above
     because it's also needed by the detection routines. */
  struct iso2022_decoder iso2022;

  /* Additional information (the state of the running CCL program)
     used by the CCL decoder.
*/
  struct ccl_program ccl;

  /* Detection state used by mule_decode() when the coding system or
     EOL type still has to be detected from the data passing through
     the stream. */
  struct detection_state decst;
};

static int decoding_reader     (Lstream *stream, unsigned char *data, int size);
static int decoding_writer     (Lstream *stream, CONST unsigned char *data, int size);
static int decoding_rewinder   (Lstream *stream);
static int decoding_seekable_p (Lstream *stream);
static int decoding_flusher    (Lstream *stream);
static int decoding_closer     (Lstream *stream);
static Lisp_Object decoding_marker (Lisp_Object stream,
                                    void (*markobj) (Lisp_Object));

DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
                               sizeof (struct decoding_stream));

/* Mark method: mark the stream at the other end, then hand on to that
   stream's own marker if it has one (returning its result), else
   return Qnil. */
static Lisp_Object
decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
{
  Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
  Lisp_Object str_obj;

  /* We do not need to mark the coding systems or charsets stored
     within the stream because they are stored in a global list
     and automatically marked. */

  XSETLSTREAM (str_obj, str);
  (markobj) (str_obj);
  if (str->imp->marker)
    return (str->imp->marker) (str_obj, markobj);
  else
    return Qnil;
}

/* Read SIZE bytes of data and store it into DATA.  We are a decoding stream
   so we read data from the other end, decode it, and store it into DATA.

   Returns the number of bytes stored.  When nothing was stored, returns
   -1 if reading from the other end failed and 0 otherwise. */

static int
decoding_reader (Lstream *stream, unsigned char *data, int size)
{
  struct decoding_stream *str = DECODING_STREAM_DATA (stream);
  unsigned char *orig_data = data;
  int read_size;
  int error_occurred = 0;

  /* We need to interface to mule_decode(), which expects to take some
     amount of data and store the result into a Dynarr.  We have
     mule_decode() store into str->runoff, and take data from there
     as necessary. */

  /* We loop until we have enough data, reading chunks from the other
     end and decoding it. */
  while (1)
    {
      /* Take data from the runoff if we can.  Make sure to take at
         most SIZE bytes, and delete the data from the runoff. */
      if (Dynarr_length (str->runoff) > 0)
        {
          int chunk = min (size, Dynarr_length (str->runoff));
          memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
          Dynarr_delete_many (str->runoff, 0, chunk);
          data += chunk;
          size -= chunk;
        }

      if (size == 0)
        break; /* No more room for data */

      if (str->flags & CODING_STATE_END)
        /* This means that on the previous iteration, we hit the EOF on
           the other end.  We loop once more so that mule_decode() can
           output any final stuff it may be holding, or any "go back
           to a sane state" escape sequences. (This latter makes sense
           during encoding.) */
        break;

      /* Exhausted the runoff, so get some more.  DATA has at least
         SIZE bytes left of storage in it, so it's OK to read directly
         into it.  (We'll be overwriting above, after we've decoded it
         into the runoff.) */
      read_size = Lstream_read (str->other_end, data, size);
      if (read_size < 0)
        {
          error_occurred = 1;
          break;
        }
      if (read_size == 0)
        /* There might be some more end data produced in the translation.
           See the comment above. */
        str->flags |= CODING_STATE_END;
      /* Called even when READ_SIZE is 0, so that the decoder sees
         CODING_STATE_END and can flush. */
      mule_decode (stream, data, str->runoff, read_size);
    }

  /* An error is reported only if no data at all could be delivered. */
  if (data - orig_data == 0)
    return error_occurred ? -1 : 0;
  else
    return data - orig_data;
}

/* Write method: decode the SIZE bytes at DATA and pass the result on
   to the other end.  Always reports SIZE bytes as consumed; decoded
   output that the other end did not accept stays in the runoff. */
static int
decoding_writer (Lstream *stream, CONST unsigned char *data, int size)
{
  struct decoding_stream *str = DECODING_STREAM_DATA (stream);
  int retval;

  /* Decode all our data into the runoff, and then attempt to write
     it all out to the other end.  Remove whatever chunk we succeeded
     in writing. */
  mule_decode (stream, data, str->runoff, size);
  retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
                          Dynarr_length (str->runoff));
  if (retval > 0)
    Dynarr_delete_many (str->runoff, 0, retval);
  /* Do NOT return retval.  The return value indicates how much
     of the incoming data was written, not how many bytes were
     written.
*/ return size; } static void reset_decoding_stream (struct decoding_stream *str) { if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022) { Lisp_Object coding_system = Qnil; XSETCODING_SYSTEM (coding_system, str->codesys); reset_iso2022 (coding_system, &str->iso2022); } else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL) { setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys)); } str->flags = str->ch = 0; } static int decoding_rewinder (Lstream *stream) { struct decoding_stream *str = DECODING_STREAM_DATA (stream); reset_decoding_stream (str); Dynarr_reset (str->runoff); return Lstream_rewind (str->other_end); } static int decoding_seekable_p (Lstream *stream) { struct decoding_stream *str = DECODING_STREAM_DATA (stream); return Lstream_seekable_p (str->other_end); } static int decoding_flusher (Lstream *stream) { struct decoding_stream *str = DECODING_STREAM_DATA (stream); return Lstream_flush (str->other_end); } static int decoding_closer (Lstream *stream) { struct decoding_stream *str = DECODING_STREAM_DATA (stream); if (stream->flags & LSTREAM_FL_WRITE) { str->flags |= CODING_STATE_END; decoding_writer (stream, 0, 0); } Dynarr_free (str->runoff); if (str->iso2022.composite_chars) Dynarr_free (str->iso2022.composite_chars); return Lstream_close (str->other_end); } Lisp_Object decoding_stream_coding_system (Lstream *stream) { Lisp_Object coding_system = Qnil; struct decoding_stream *str = DECODING_STREAM_DATA (stream); XSETCODING_SYSTEM (coding_system, str->codesys); return subsidiary_coding_system (coding_system, str->eol_type); } void set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys) { struct Lisp_Coding_System *cs = XCODING_SYSTEM (codesys); struct decoding_stream *str = DECODING_STREAM_DATA (lstr); str->codesys = cs; if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT) str->eol_type = CODING_SYSTEM_EOL_TYPE (cs); reset_decoding_stream (str); } /* WARNING WARNING WARNING WARNING!!!!! 
If you open up a decoding stream for writing,
   no automatic code detection will be performed.  The reason for this
   is that automatic code detection requires a seekable input.  Things
   will also fail if you open a decoding stream for reading using
   a non-fully-specified coding system and a non-seekable input stream.

   NOTE(review): mule_decode() below does run detect_coding_type() on
   the data passing through such streams, so "no automatic code
   detection" here appears to refer only to the up-front, rewind-based
   detection done in make_decoding_stream_1() -- confirm. */

/* Create a decoding stream in mode MODE ("r" or "w") on top of STREAM,
   governed by CODESYS, and return it as a Lisp object.  For a read
   stream over seekable input, the real coding system and EOL type are
   determined up front. */
static Lisp_Object
make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
                        CONST char *mode)
{
  Lstream *lstr = Lstream_new (lstream_decoding, mode);
  struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
  Lisp_Object obj;

  memset (str, 0, sizeof (*str));
  str->other_end = stream;
  str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
  str->eol_type = EOL_AUTODETECT;
  if (!strcmp (mode, "r")
      && Lstream_seekable_p (stream))
    /* We can determine the coding system now. */
    determine_real_coding_system (stream, &codesys, &str->eol_type);
  set_decoding_stream_coding_system (lstr, codesys);
  /* Seed the on-the-fly detection state used by mule_decode(): known
     EOL type (if any), and "only ASCII seen so far" for the mask. */
  str->decst.eol_type = str->eol_type;
  str->decst.mask = ~0;
  XSETLSTREAM (obj, lstr);
  return obj;
}

/* Create a decoding stream that reads encoded data from STREAM. */
Lisp_Object
make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
{
  return make_decoding_stream_1 (stream, codesys, "r");
}

/* Create a decoding stream that writes decoded data to STREAM. */
Lisp_Object
make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
{
  return make_decoding_stream_1 (stream, codesys, "w");
}

/* Note: the decode_coding_* functions all take the same
   arguments as mule_decode(), which is to say some SRC data of
   size N, which is to be stored into dynamic array DST.
   DECODING is the stream within which the decoding is
   taking place, but no data is actually read from or
   written to that stream; that is handled in decoding_reader()
   or decoding_writer().  This allows the same functions to
   be used for both reading and writing. */

static void
mule_decode (Lstream *decoding, CONST unsigned char *src,
             unsigned_char_dynarr *dst, unsigned int n)
{
  struct decoding_stream *str = DECODING_STREAM_DATA (decoding);

  /* If necessary, do encoding-detection now.  We do this when
     we're a writing stream or a non-seekable reading stream,
     meaning that we can't just process the whole input,
     rewind, and start over. */

  if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
      str->eol_type == EOL_AUTODETECT)
    {
      /* CODESYS starts out as the stream's current coding system and
         is replaced only if detection produces an answer. */
      Lisp_Object codesys = Qnil;

      XSETCODING_SYSTEM (codesys, str->codesys);
      /* Last argument: restrict to EOL detection when the coding
         system itself is already known. */
      detect_coding_type (&str->decst, src, n,
                          CODING_SYSTEM_TYPE (str->codesys) !=
                          CODESYS_AUTODETECT);
      if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
          str->decst.mask != ~0)
        /* #### This is cheesy.  What we really ought to do is
           buffer up a certain amount of data so as to get a
           less random result. */
        codesys = coding_system_from_mask (str->decst.mask);
      str->eol_type = str->decst.eol_type;
      if (XCODING_SYSTEM (codesys) != str->codesys)
        {
          /* Preserve the CODING_STATE_END flag in case it was set.
             If we erase it, bad things might happen. */
          int was_end = str->flags & CODING_STATE_END;
          set_decoding_stream_coding_system (decoding, codesys);
          if (was_end)
            str->flags |= CODING_STATE_END;
        }
    }

  /* Dispatch to the decoder for the coding system now in effect. */
  switch (CODING_SYSTEM_TYPE (str->codesys))
    {
#ifdef DEBUG_XEMACS
    case CODESYS_INTERNAL:
      Dynarr_add_many (dst, src, n);
      break;
#endif
    case CODESYS_AUTODETECT:
      /* If we got this far and still haven't decided on the coding
         system, then do no conversion. */
    case CODESYS_NO_CONVERSION:
      decode_coding_no_conversion (decoding, src, dst, n);
      break;
    case CODESYS_SHIFT_JIS:
      decode_coding_sjis (decoding, src, dst, n);
      break;
    case CODESYS_BIG5:
      decode_coding_big5 (decoding, src, dst, n);
      break;
    case CODESYS_CCL:
      ccl_driver (&str->ccl, src, dst, n, 0);
      break;
    case CODESYS_ISO2022:
      decode_coding_iso2022 (decoding, src, dst, n);
      break;
    default:
      abort ();
    }
}

DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
Decode the text between START and END which is encoded in CODING-SYSTEM.
This is useful if you've read in encoded text from a file without decoding
it (e.g.
you read in a JIS-formatted file but used the `binary' or `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B"). Return length of decoded text. BUFFER defaults to the current buffer if unspecified. */ (start, end, coding_system, buffer)) { Bufpos b, e; struct buffer *buf = decode_buffer (buffer, 0); Lisp_Object instream, lb_outstream, de_outstream, outstream; Lstream *istr, *ostr; struct gcpro gcpro1, gcpro2, gcpro3, gcpro4; get_buffer_range_char (buf, start, end, &b, &e, 0); barf_if_buffer_read_only (buf, b, e); coding_system = Fget_coding_system (coding_system); instream = make_lisp_buffer_input_stream (buf, b, e, 0); lb_outstream = make_lisp_buffer_output_stream (buf, b, 0); de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream), coding_system); outstream = make_encoding_output_stream (XLSTREAM (de_outstream), Fget_coding_system (Qbinary)); istr = XLSTREAM (instream); ostr = XLSTREAM (outstream); GCPRO4 (instream, lb_outstream, de_outstream, outstream); /* The chain of streams looks like this: [BUFFER] <----- send through ------> [ENCODE AS BINARY] ------> [DECODE AS SPECIFIED] ------> [BUFFER] */ while (1) { char tempbuf[1024]; /* some random amount */ Bufpos newpos, even_newer_pos; Bufpos oldpos = lisp_buffer_stream_startpos (istr); int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf)); if (!size_in_bytes) break; newpos = lisp_buffer_stream_startpos (istr); Lstream_write (ostr, tempbuf, size_in_bytes); even_newer_pos = lisp_buffer_stream_startpos (istr); buffer_delete_range (buf, even_newer_pos - (newpos - oldpos), even_newer_pos, 0); } Lstream_close (istr); Lstream_close (ostr); UNGCPRO; Lstream_delete (istr); Lstream_delete (ostr); Lstream_delete (XLSTREAM (de_outstream)); Lstream_delete (XLSTREAM (lb_outstream)); return Qnil; } /************************************************************************/ /* Converting to an external encoding ("encoding") */ 
/************************************************************************/

/* An encoding stream is an output stream.  When you create the
   stream, you specify the coding system that governs the encoding
   and another stream that the resulting encoded data is to be
   sent to, and then start sending data to it.

   (An encoding stream can also be opened for reading with
   make_encoding_input_stream(); it then reads from the other stream
   and returns the encoded form of that data.) */

/* Fetch the struct encoding_stream stored inside the lstream STREAM. */
#define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)

struct encoding_stream
{
  /* Coding system that governs the conversion. */
  struct Lisp_Coding_System *codesys;

  /* Stream that we read the not-yet-encoded data from (when reading)
     or write the encoded data to (when writing). */
  Lstream *other_end;

  /* If we are reading, then we can return only a fixed amount of
     data, so if the conversion resulted in too much data, we store it
     here for retrieval the next time around. */
  unsigned_char_dynarr *runoff;

  /* FLAGS holds flags indicating the current state of the encoding.
     Some of these flags are dependent on the coding system. */
  unsigned int flags;

  /* CH holds a partially built-up character.  Since we only deal
     with one- and two-byte characters at the moment, we only use
     this to store the first byte of a two-byte character. */
  unsigned int ch;

  /* Additional information used by the ISO2022 encoder. */
  struct
    {
      /* CHARSET holds the character sets currently assigned to the G0
         through G3 registers.  It is initialized from the array
         INITIAL_CHARSET in CODESYS. */
      Lisp_Object charset[4];

      /* Which registers are currently invoked into the left (GL) and
         right (GR) halves of the 8-bit encoding space? */
      int register_left, register_right;

      /* Whether we need to explicitly designate the charset in the
         G? register before using it.  It is initialized from the
         array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
      unsigned char force_charset_on_output[4];

      /* Other state variables that need to be preserved across
         invocations. */
      Lisp_Object current_charset;
      int current_half;
      int current_char_boundary;
    } iso2022;

  /* Additional information (the state of the running CCL program)
     used by the CCL encoder.
*/
  struct ccl_program ccl;
};

static int encoding_reader     (Lstream *stream, unsigned char *data, int size);
static int encoding_writer     (Lstream *stream, CONST unsigned char *data, int size);
static int encoding_rewinder   (Lstream *stream);
static int encoding_seekable_p (Lstream *stream);
static int encoding_flusher    (Lstream *stream);
static int encoding_closer     (Lstream *stream);
static Lisp_Object encoding_marker (Lisp_Object stream,
                                    void (*markobj) (Lisp_Object));

DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
                               sizeof (struct encoding_stream));

/* Mark method: mark the stream at the other end, then hand on to that
   stream's own marker if it has one (returning its result), else
   return Qnil. */
static Lisp_Object
encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
{
  Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
  Lisp_Object str_obj;

  /* We do not need to mark the coding systems or charsets stored
     within the stream because they are stored in a global list
     and automatically marked. */

  XSETLSTREAM (str_obj, str);
  (markobj) (str_obj);
  if (str->imp->marker)
    return (str->imp->marker) (str_obj, markobj);
  else
    return Qnil;
}

/* Read SIZE bytes of data and store it into DATA.  We are an encoding stream
   so we read data from the other end, encode it, and store it into DATA.

   Returns the number of bytes stored.  When nothing was stored, returns
   -1 if reading from the other end failed and 0 otherwise. */

static int
encoding_reader (Lstream *stream, unsigned char *data, int size)
{
  struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
  unsigned char *orig_data = data;
  int read_size;
  int error_occurred = 0;

  /* We need to interface to mule_encode(), which expects to take some
     amount of data and store the result into a Dynarr.  We have
     mule_encode() store into str->runoff, and take data from there
     as necessary. */

  /* We loop until we have enough data, reading chunks from the other
     end and encoding it. */
  while (1)
    {
      /* Take data from the runoff if we can.  Make sure to take at
         most SIZE bytes, and delete the data from the runoff. */
      if (Dynarr_length (str->runoff) > 0)
        {
          int chunk = min (size, Dynarr_length (str->runoff));
          memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
          Dynarr_delete_many (str->runoff, 0, chunk);
          data += chunk;
          size -= chunk;
        }

      if (size == 0)
        break; /* No more room for data */

      if (str->flags & CODING_STATE_END)
        /* This means that on the previous iteration, we hit the EOF on
           the other end.  We loop once more so that mule_encode() can
           output any final stuff it may be holding, or any "go back
           to a sane state" escape sequences. (This latter makes sense
           during encoding.) */
        break;

      /* Exhausted the runoff, so get some more.  DATA has at least
         SIZE bytes left of storage in it, so it's OK to read directly
         into it.  (We'll be overwriting above, after we've encoded it
         into the runoff.) */
      read_size = Lstream_read (str->other_end, data, size);
      if (read_size < 0)
        {
          error_occurred = 1;
          break;
        }
      if (read_size == 0)
        /* There might be some more end data produced in the translation.
           See the comment above. */
        str->flags |= CODING_STATE_END;
      /* Called even when READ_SIZE is 0, so that the encoder sees
         CODING_STATE_END and can flush. */
      mule_encode (stream, data, str->runoff, read_size);
    }

  /* An error is reported only if no data at all could be delivered. */
  if (data == orig_data)
    return error_occurred ? -1 : 0;
  else
    return data - orig_data;
}

/* Write method: encode the SIZE bytes at DATA and pass the result on
   to the other end.  Always reports SIZE bytes as consumed; encoded
   output that the other end did not accept stays in the runoff. */
static int
encoding_writer (Lstream *stream, CONST unsigned char *data, int size)
{
  struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
  int retval;

  /* Encode all our data into the runoff, and then attempt to write
     it all out to the other end.  Remove whatever chunk we succeeded
     in writing. */
  mule_encode (stream, data, str->runoff, size);
  retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
                          Dynarr_length (str->runoff));
  if (retval > 0)
    Dynarr_delete_many (str->runoff, 0, retval);
  /* Do NOT return retval.  The return value indicates how much
     of the incoming data was written, not how many bytes were
     written.
*/ return size; } static void reset_encoding_stream (struct encoding_stream *str) { switch (CODING_SYSTEM_TYPE (str->codesys)) { case CODESYS_ISO2022: { int i; for (i = 0; i < 4; i++) { str->iso2022.charset[i] = CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i); str->iso2022.force_charset_on_output[i] = CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i); } str->iso2022.register_left = 0; str->iso2022.register_right = 1; str->iso2022.current_charset = Qnil; str->iso2022.current_half = 0; str->iso2022.current_char_boundary = 1; break; } case CODESYS_CCL: setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys)); break; default: break; } str->flags = str->ch = 0; } static int encoding_rewinder (Lstream *stream) { struct encoding_stream *str = ENCODING_STREAM_DATA (stream); reset_encoding_stream (str); Dynarr_reset (str->runoff); return Lstream_rewind (str->other_end); } static int encoding_seekable_p (Lstream *stream) { struct encoding_stream *str = ENCODING_STREAM_DATA (stream); return Lstream_seekable_p (str->other_end); } static int encoding_flusher (Lstream *stream) { struct encoding_stream *str = ENCODING_STREAM_DATA (stream); return Lstream_flush (str->other_end); } static int encoding_closer (Lstream *stream) { struct encoding_stream *str = ENCODING_STREAM_DATA (stream); if (stream->flags & LSTREAM_FL_WRITE) { str->flags |= CODING_STATE_END; encoding_writer (stream, 0, 0); } Dynarr_free (str->runoff); return Lstream_close (str->other_end); } Lisp_Object encoding_stream_coding_system (Lstream *stream) { Lisp_Object coding_system = Qnil; struct encoding_stream *str = ENCODING_STREAM_DATA (stream); XSETCODING_SYSTEM (coding_system, str->codesys); return coding_system; } void set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys) { struct Lisp_Coding_System *cs = XCODING_SYSTEM (codesys); struct encoding_stream *str = ENCODING_STREAM_DATA (lstr); str->codesys = cs; reset_encoding_stream (str); } static Lisp_Object 
make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys, CONST char *mode) { Lstream *lstr = Lstream_new (lstream_encoding, mode); struct encoding_stream *str = ENCODING_STREAM_DATA (lstr); Lisp_Object obj; memset (str, 0, sizeof (*str)); str->runoff = Dynarr_new (unsigned_char); str->other_end = stream; set_encoding_stream_coding_system (lstr, codesys); XSETLSTREAM (obj, lstr); return obj; } Lisp_Object make_encoding_input_stream (Lstream *stream, Lisp_Object codesys) { return make_encoding_stream_1 (stream, codesys, "r"); } Lisp_Object make_encoding_output_stream (Lstream *stream, Lisp_Object codesys) { return make_encoding_stream_1 (stream, codesys, "w"); } /* Convert N bytes of internally-formatted data stored in SRC to an external format, according to the encoding stream ENCODING. Store the encoded data into DST. */ static void mule_encode (Lstream *encoding, CONST unsigned char *src, unsigned_char_dynarr *dst, unsigned int n) { struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); switch (CODING_SYSTEM_TYPE (str->codesys)) { #ifdef DEBUG_XEMACS case CODESYS_INTERNAL: Dynarr_add_many (dst, src, n); break; #endif case CODESYS_AUTODETECT: /* If we got this far and still haven't decided on the coding system, then do no conversion. */ case CODESYS_NO_CONVERSION: encode_coding_no_conversion (encoding, src, dst, n); break; case CODESYS_SHIFT_JIS: encode_coding_sjis (encoding, src, dst, n); break; case CODESYS_BIG5: encode_coding_big5 (encoding, src, dst, n); break; case CODESYS_CCL: ccl_driver (&str->ccl, src, dst, n, 0); break; case CODESYS_ISO2022: encode_coding_iso2022 (encoding, src, dst, n); break; default: abort (); } } DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /* Encode the text between START and END using CODING-SYSTEM. This will, for example, convert Japanese characters into stuff such as "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded text. BUFFER defaults to the current buffer if unspecified. 
*/ (start, end, coding_system, buffer)) { Bufpos b, e; struct buffer *buf = decode_buffer (buffer, 0); Lisp_Object instream, lb_outstream, de_outstream, outstream; Lstream *istr, *ostr; struct gcpro gcpro1, gcpro2, gcpro3, gcpro4; get_buffer_range_char (buf, start, end, &b, &e, 0); barf_if_buffer_read_only (buf, b, e); coding_system = Fget_coding_system (coding_system); instream = make_lisp_buffer_input_stream (buf, b, e, 0); lb_outstream = make_lisp_buffer_output_stream (buf, b, 0); de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream), Fget_coding_system (Qbinary)); outstream = make_encoding_output_stream (XLSTREAM (de_outstream), coding_system); istr = XLSTREAM (instream); ostr = XLSTREAM (outstream); GCPRO4 (instream, outstream, de_outstream, lb_outstream); /* The chain of streams looks like this: [BUFFER] <----- send through ------> [ENCODE AS SPECIFIED] ------> [DECODE AS BINARY] ------> [BUFFER] */ while (1) { char tempbuf[1024]; /* some random amount */ Bufpos newpos, even_newer_pos; Bufpos oldpos = lisp_buffer_stream_startpos (istr); int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf)); if (!size_in_bytes) break; newpos = lisp_buffer_stream_startpos (istr); Lstream_write (ostr, tempbuf, size_in_bytes); even_newer_pos = lisp_buffer_stream_startpos (istr); buffer_delete_range (buf, even_newer_pos - (newpos - oldpos), even_newer_pos, 0); } { Charcount retlen = lisp_buffer_stream_startpos (XLSTREAM (instream)) - b; Lstream_close (istr); Lstream_close (ostr); UNGCPRO; Lstream_delete (istr); Lstream_delete (ostr); Lstream_delete (XLSTREAM (de_outstream)); Lstream_delete (XLSTREAM (lb_outstream)); return make_int (retlen); } } /************************************************************************/ /* Shift-JIS methods */ /************************************************************************/ /* Shift-JIS is a coding system encoding three character sets: ASCII, right half of JISX0201-Kana, and JISX0208. 
An ASCII character is encoded as is.  A character of JISX0201-Kana
   (TYPE94 character set) is encoded by "position-code + 0x80".  A
   character of JISX0208 (TYPE94x94 character set) is encoded in
   2-byte but two position-codes are divided and shifted so that it
   fits in the range below.

   --- CODE RANGE of Shift-JIS ---
   (character set)      (range)
   ASCII                0x00 .. 0x7F
   JISX0201-Kana        0xA1 .. 0xDF
   JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
            (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
   -------------------------------

   (The ranges above are the ones tested by the BYTE_SJIS_* macros
   below.)
*/

/* Is this the first byte of a Shift-JIS two-byte char? */

#define BYTE_SJIS_TWO_BYTE_1_P(c) \
  (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))

/* Is this the second byte of a Shift-JIS two-byte char? */

#define BYTE_SJIS_TWO_BYTE_2_P(c) \
  (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))

/* Is this a single-byte JISX0201-Kana char in Shift-JIS? */

#define BYTE_SJIS_KATAKANA_P(c) \
  ((c) >= 0xA1 && (c) <= 0xDF)

/* Detection pass for Shift-JIS over N bytes at SRC.  Returns 0 as soon
   as the data cannot be Shift-JIS (an ESC, SI or SO byte, or a second
   byte below 0x40); otherwise returns CODING_CATEGORY_SHIFT_JIS_MASK.
   Whether we are in the middle of a two-byte char is kept in ST. */
static int
detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
                    unsigned int n)
{
  int c;

  while (n--)
    {
      c = *src++;
      if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
        return 0;
      if (st->shift_jis.in_second_byte)
        {
          st->shift_jis.in_second_byte = 0;
          if (c < 0x40)
            return 0;
        }
      /* NOTE(review): this lead-byte test is wider than
         BYTE_SJIS_TWO_BYTE_1_P (it also takes 0x80 and 0xF0..0xFF);
         presumably deliberate leniency for detection -- confirm. */
      else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
        st->shift_jis.in_second_byte = 1;
    }
  return CODING_CATEGORY_SHIFT_JIS_MASK;
}

/* Convert Shift-JIS data to internal format.  Reads N bytes from SRC
   and appends the result to DST.  A pending first byte of a two-byte
   char is carried between calls in the stream's CH state. */

static void
decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
                    unsigned_char_dynarr *dst, unsigned int n)
{
  unsigned char c;
  unsigned int flags, ch;
  enum eol_type eol_type;
  struct decoding_stream *str = DECODING_STREAM_DATA (decoding);

  CODING_STREAM_DECOMPOSE (str, flags, ch);
  eol_type = str->eol_type;

  while (n--)
    {
      c = *src++;

      if (ch)
        {
          /* Previous character was first byte of Shift-JIS Kanji char.
          */
          if (BYTE_SJIS_TWO_BYTE_2_P (c))
            {
              unsigned char e1, e2;

              Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
              DECODE_SJIS (ch, c, e1, e2);
              Dynarr_add (dst, e1);
              Dynarr_add (dst, e2);
            }
          else
            {
              /* Not a valid second byte: emit both bytes as binary. */
              DECODE_ADD_BINARY_CHAR (ch, dst);
              DECODE_ADD_BINARY_CHAR (c, dst);
            }
          ch = 0;
        }
      else
        {
          DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
          if (BYTE_SJIS_TWO_BYTE_1_P (c))
            ch = c;
          else if (BYTE_SJIS_KATAKANA_P (c))
            {
              Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
              Dynarr_add (dst, c);
            }
          else
            DECODE_ADD_BINARY_CHAR (c, dst);
        }
    /* Presumably the jump target used by DECODE_HANDLE_EOL_TYPE
       (defined elsewhere) -- nothing in this function jumps here
       explicitly. */
    label_continue_loop:;
    }

  DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);

  CODING_STREAM_COMPOSE (str, flags, ch);
}

/* Convert internally-formatted data to Shift-JIS.  Reads N bytes from
   SRC and appends the result to DST.  CH carries, between bytes and
   between calls, either the leading byte just seen or the first
   position byte of a JISX0208 char. */

static void
encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
                    unsigned_char_dynarr *dst, unsigned int n)
{
  unsigned char c;
  struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
  unsigned int flags, ch;
  enum eol_type eol_type;

  CODING_STREAM_DECOMPOSE (str, flags, ch);
  eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);

  while (n--)
    {
      c = *src++;
      if (c == '\n')
        {
          /* Emit CR, LF or CRLF according to the EOL type. */
          if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
            Dynarr_add (dst, '\r');
          if (eol_type != EOL_CR)
            Dynarr_add (dst, '\n');
          ch = 0;
        }
      else if (BYTE_ASCII_P (c))
        {
          Dynarr_add (dst, c);
          ch = 0;
        }
      else if (BUFBYTE_LEADING_BYTE_P (c))
        /* Remember only the leading bytes we can encode; any other
           charset clears CH, so its position bytes are dropped. */
        ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
              c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
              c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
      else if (ch)
        {
          if (ch == LEADING_BYTE_KATAKANA_JISX0201)
            {
              /* Katakana position byte is written out unchanged. */
              Dynarr_add (dst, c);
              ch = 0;
            }
          else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
                   ch == LEADING_BYTE_JAPANESE_JISX0208)
            /* First position byte: hold it until the second arrives. */
            ch = c;
          else
            {
              /* CH is the first position byte, C the second. */
              unsigned char j1, j2;

              ENCODE_SJIS (ch, c, j1, j2);
              Dynarr_add (dst, j1);
              Dynarr_add (dst, j2);
              ch = 0;
            }
        }
    }

  CODING_STREAM_COMPOSE (str, flags, ch);
}

DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
Decode a JISX0208 character of Shift-JIS coding-system.
CODE is the character code in Shift-JIS as a cons of two bytes.
Return the corresponding character.
*/
       (code))
{
  unsigned char c1, c2, s1, s2;

  CHECK_CONS (code);
  CHECK_INT (XCAR (code));
  CHECK_INT (XCDR (code));
  /* The integers are narrowed to unsigned char here. */
  s1 = XINT (XCAR (code));
  s2 = XINT (XCDR (code));
  if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
      BYTE_SJIS_TWO_BYTE_2_P (s2))
    {
      DECODE_SJIS (s1, s2, c1, c2);
      return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
                                   c1 & 0x7F, c2 & 0x7F));
    }
  else
    /* Not a valid Shift-JIS two-byte code. */
    return Qnil;
}

DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
*/
       (ch))
{
  Lisp_Object charset;
  int c1, c2, s1, s2;

  CHECK_CHAR_COERCE_INT (ch);
  BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
  if (EQ (charset, Vcharset_japanese_jisx0208))
    {
      ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
      return Fcons (make_int (s1), make_int (s2));
    }
  else
    /* Only characters of the JISX0208 charset are handled. */
    return Qnil;
}


/************************************************************************/
/*                            Big5 methods                              */
/************************************************************************/

/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and is encoded in two-byte.

   --- CODE RANGE of BIG5 ---
   (character set)      (range)
   ASCII                0x00 .. 0x7F
   Big5 (1st byte)      0xA1 .. 0xFE
        (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
   --------------------------

   Since the number of characters in Big5 is larger than maximum
   characters in Emacs' charset (96x96), it can't be handled as one
   charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
   and `charset-big5-2'.  Both <type>s are TYPE94x94.  The former
   contains frequently used characters and the latter contains less
   frequently used characters.  */

/* Is this the first byte of a Big5 two-byte char? */

#define BYTE_BIG5_TWO_BYTE_1_P(c) \
  ((c) >= 0xA1 && (c) <= 0xFE)

/* Is this the second byte of a Big5 two-byte char?
*/ #define BYTE_BIG5_TWO_BYTE_2_P(c) \ (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE)) /* Number of Big5 characters which have the same code in 1st byte. */ #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40) /* Code conversion macros. These are macros because they are used in inner loops during code conversion. Note that temporary variables in macros introduce the classic dynamic-scoping problems with variable names. We use capital- lettered variables in the assumption that XEmacs does not use capital letters in variables except in a very formalized way (e.g. Qstring). */ /* Convert Big5 code (b1, b2) into its internal string representation (lb, c1, c2). */ /* There is a much simpler way to split the Big5 charset into two. For the moment I'm going to leave the algorithm as-is because it claims to separate out the most-used characters into a single charset, which perhaps will lead to optimizations in various places. The way the algorithm works is something like this: Big5 can be viewed as a 94x157 charset, where the row is encoded into the bytes 0xA1 .. 0xFE and the column is encoded into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency, the split between low and high column numbers is apparently meaningless; ascending rows produce less and less frequent chars. Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to the first charset, and the upper half (0xC9 .. 0xFE) to the second. To do the conversion, we convert the character into a single number where 0 .. 156 is the first row, 157 .. 313 is the second, etc. That way, the characters are ordered by decreasing frequency. Then we just chop the space in two and coerce the result into a 94x94 space. */ #define DECODE_BIG5(b1, b2, lb, c1, c2) do \ { \ int B1 = b1, B2 = b2; \ unsigned int I \ = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 
0x40 : 0x62); \ \ if (B1 < 0xC9) \ { \ lb = LEADING_BYTE_CHINESE_BIG5_1; \ } \ else \ { \ lb = LEADING_BYTE_CHINESE_BIG5_2; \ I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \ } \ c1 = I / (0xFF - 0xA1) + 0xA1; \ c2 = I % (0xFF - 0xA1) + 0xA1; \ } while (0) /* Convert the internal string representation of a Big5 character (lb, c1, c2) into Big5 code (b1, b2). */ #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \ { \ unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \ \ if (lb == LEADING_BYTE_CHINESE_BIG5_2) \ { \ I += BIG5_SAME_ROW * (0xC9 - 0xA1); \ } \ b1 = I / BIG5_SAME_ROW + 0xA1; \ b2 = I % BIG5_SAME_ROW; \ b2 += b2 < 0x3F ? 0x40 : 0x62; \ } while (0) static int detect_coding_big5 (struct detection_state *st, CONST unsigned char *src, unsigned int n) { int c; while (n--) { c = *src++; if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO || (c >= 0x80 && c <= 0xA0)) return 0; if (st->big5.in_second_byte) { st->big5.in_second_byte = 0; if (c < 0x40 || (c >= 0x80 && c <= 0xA0)) return 0; } else if (c >= 0xA1) st->big5.in_second_byte = 1; } return CODING_CATEGORY_BIG5_MASK; } /* Convert Big5 data to internal format. */ static void decode_coding_big5 (Lstream *decoding, CONST unsigned char *src, unsigned_char_dynarr *dst, unsigned int n) { unsigned char c; unsigned int flags, ch; enum eol_type eol_type; struct decoding_stream *str = DECODING_STREAM_DATA (decoding); CODING_STREAM_DECOMPOSE (str, flags, ch); eol_type = str->eol_type; while (n--) { c = *src++; if (ch) { /* Previous character was first byte of Big5 char. 
*/ if (BYTE_BIG5_TWO_BYTE_2_P (c)) { unsigned char b1, b2, b3; DECODE_BIG5 (ch, c, b1, b2, b3); Dynarr_add (dst, b1); Dynarr_add (dst, b2); Dynarr_add (dst, b3); } else { DECODE_ADD_BINARY_CHAR (ch, dst); DECODE_ADD_BINARY_CHAR (c, dst); } ch = 0; } else { DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst); if (BYTE_BIG5_TWO_BYTE_1_P (c)) ch = c; else DECODE_ADD_BINARY_CHAR (c, dst); } label_continue_loop:; } DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst); CODING_STREAM_COMPOSE (str, flags, ch); } /* Convert internally-formatted data to Big5. */ static void encode_coding_big5 (Lstream *encoding, CONST unsigned char *src, unsigned_char_dynarr *dst, unsigned int n) { unsigned char c; struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); unsigned int flags, ch; enum eol_type eol_type; CODING_STREAM_DECOMPOSE (str, flags, ch); eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); while (n--) { c = *src++; if (c == '\n') { if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT) Dynarr_add (dst, '\r'); if (eol_type != EOL_CR) Dynarr_add (dst, '\n'); } else if (BYTE_ASCII_P (c)) { /* ASCII. */ Dynarr_add (dst, c); } else if (BUFBYTE_LEADING_BYTE_P (c)) { if (c == LEADING_BYTE_CHINESE_BIG5_1 || c == LEADING_BYTE_CHINESE_BIG5_2) { /* A recognized leading byte. */ ch = c; continue; /* not done with this character. */ } /* otherwise just ignore this character. */ } else if (ch == LEADING_BYTE_CHINESE_BIG5_1 || ch == LEADING_BYTE_CHINESE_BIG5_2) { /* Previous char was a recognized leading byte. */ ch = (ch << 8) | c; continue; /* not done with this character. */ } else if (ch) { /* Encountering second byte of a Big5 character. */ unsigned char b1, b2; ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2); Dynarr_add (dst, b1); Dynarr_add (dst, b2); } ch = 0; } CODING_STREAM_COMPOSE (str, flags, ch); } DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /* Decode a Big5 character CODE of BIG5 coding-system. CODE is the character code in BIG5, a cons of two integers. 
Return the corresponding character. */ (code)) { unsigned char c1, c2, b1, b2; CHECK_CONS (code); CHECK_INT (XCAR (code)); CHECK_INT (XCDR (code)); b1 = XINT (XCAR (code)); b2 = XINT (XCDR (code)); if (BYTE_BIG5_TWO_BYTE_1_P (b1) && BYTE_BIG5_TWO_BYTE_2_P (b2)) { int leading_byte; Lisp_Object charset; DECODE_BIG5 (b1, b2, leading_byte, c1, c2); charset = CHARSET_BY_LEADING_BYTE (leading_byte); return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F)); } else return Qnil; } DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /* Encode the Big5 character CH to BIG5 coding-system. Return the corresponding character code in Big5. */ (ch)) { Lisp_Object charset; int c1, c2, b1, b2; CHECK_CHAR_COERCE_INT (ch); BREAKUP_CHAR (XCHAR (ch), charset, c1, c2); if (EQ (charset, Vcharset_chinese_big5_1) || EQ (charset, Vcharset_chinese_big5_2)) { ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80, b1, b2); return Fcons (make_int (b1), make_int (b2)); } else return Qnil; } /************************************************************************/ /* ISO2022 methods */ /************************************************************************/ /* The following note describes the coding system ISO2022 briefly. Since the intention of this note is to help understanding of the programs in this file, some parts are NOT ACCURATE or OVERLY SIMPLIFIED. For thorough understanding, please refer to the original document of ISO2022. ISO2022 provides many mechanisms to encode several character sets in 7-bit and 8-bit environments. If one chooses 7-bit environment, all text is encoded by codes of less than 128. This may make the encoded text a little bit longer, but the text get more stability to pass through several gateways (some of them strip off MSB). There are two kind of character sets: control character set and graphic character set. 
The former contains control characters such as `newline' and `escape' to provide control functions (control functions are provided also by escape sequence). The latter contains graphic characters such as 'A' and '-'. Emacs recognizes two control character sets and many graphic character sets. Graphic character sets are classified into one of four types, according to the dimension and number of characters in the set: TYPE94, TYPE96, TYPE94x94, and TYPE96x96. In addition, each character set is assigned an identification byte, unique for each type, called "final character" (denoted as <F> hereafter). The <F> of each character set is decided by ECMA(*) when it is registered in ISO. Code range of <F> is 0x30..0x7F (0x30..0x3F are for private use only). Note (*): ECMA = European Computer Manufacturers Association Here are examples of graphic character set [NAME(<F>)]: o TYPE94 -- ASCII('B'), right-half-of-JISX0201('I'), ... o TYPE96 -- right-half-of-ISO8859-1('A'), ... o TYPE94x94 -- GB2312('A'), JISX0208('B'), ... o TYPE96x96 -- none for the moment A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR. C0 [0x00..0x1F] -- control character plane 0 GL [0x20..0x7F] -- graphic character plane 0 C1 [0x80..0x9F] -- control character plane 1 GR [0xA0..0xFF] -- graphic character plane 1 A control character set is directly designated and invoked to C0 or C1 by an escape sequence. The most common case is that: - ISO646's control character set is designated/invoked to C0, and - ISO6429's control character set is designated/invoked to C1, and usually these designations/invocations are omitted in encoded text. In a 7-bit environment, only C0 can be used, and a control character for C1 is encoded by an appropriate escape sequence to fit into the environment. All control characters for C1 are defined to have corresponding escape sequences. 
A graphic character set is at first designated to one of four graphic registers (G0 through G3), then these graphic registers are invoked to GL or GR. These designations and invocations can be done independently. The most common case is that G0 is invoked to GL, G1 is invoked to GR, and ASCII is designated to G0. Usually these invocations and designations are omitted in encoded text. In a 7-bit environment, only GL can be used. When a graphic character set of TYPE94 or TYPE94x94 is invoked to GL, codes 0x20 and 0x7F of the GL area work as control characters SPACE and DEL respectively, and code 0xA0 and 0xFF of GR area should not be used. There are two ways of invocation: locking-shift and single-shift. With locking-shift, the invocation lasts until the next different invocation, whereas with single-shift, the invocation works only for the following character and doesn't affect locking-shift. Invocations are done by the following control characters or escape sequences. ---------------------------------------------------------------------- abbrev function cntrl escape seq description ---------------------------------------------------------------------- SI/LS0 (shift-in) 0x0F none invoke G0 into GL SO/LS1 (shift-out) 0x0E none invoke G1 into GL LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char ---------------------------------------------------------------------- The first four are for locking-shift. Control characters for these functions are defined by macros ISO_CODE_XXX in `coding.h'. Designations are done by the following escape sequences. 
---------------------------------------------------------------------- escape sequence description ---------------------------------------------------------------------- ESC '(' <F> designate TYPE94<F> to G0 ESC ')' <F> designate TYPE94<F> to G1 ESC '*' <F> designate TYPE94<F> to G2 ESC '+' <F> designate TYPE94<F> to G3 ESC ',' <F> designate TYPE96<F> to G0 (*) ESC '-' <F> designate TYPE96<F> to G1 ESC '.' <F> designate TYPE96<F> to G2 ESC '/' <F> designate TYPE96<F> to G3 ESC '$' '(' <F> designate TYPE94x94<F> to G0 (**) ESC '$' ')' <F> designate TYPE94x94<F> to G1 ESC '$' '*' <F> designate TYPE94x94<F> to G2 ESC '$' '+' <F> designate TYPE94x94<F> to G3 ESC '$' ',' <F> designate TYPE96x96<F> to G0 (*) ESC '$' '-' <F> designate TYPE96x96<F> to G1 ESC '$' '.' <F> designate TYPE96x96<F> to G2 ESC '$' '/' <F> designate TYPE96x96<F> to G3 ---------------------------------------------------------------------- In this list, "TYPE94<F>" means a graphic character set of type TYPE94 and final character <F>, and etc. Note (*): Although these designations are not allowed in ISO2022, Emacs accepts them on decoding, and produces them on encoding TYPE96 or TYPE96x96 character set in a coding system which is characterized as 7-bit environment, non-locking-shift, and non-single-shift. Note (**): If <F> is '@', 'A', or 'B', the intermediate character '(' can be omitted. We call this as "short-form" here after. Now you may notice that there are a lot of ways for encoding the same multilingual text in ISO2022. Actually, there exist many coding systems such as Compound Text (used in X's inter client communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR (used in Korean internet), EUC (Extended UNIX Code, used in Asian localized platforms), and all of these are variants of ISO2022. In addition to the above, Emacs handles two more kinds of escape sequences: ISO6429's direction specification and Emacs' private sequence for specifying character composition. 
ISO6429's direction specification takes the following format: o CSI ']' -- end of the current direction o CSI '0' ']' -- end of the current direction o CSI '1' ']' -- start of left-to-right text o CSI '2' ']' -- start of right-to-left text The control character CSI (0x9B: control sequence introducer) is abbreviated to the escape sequence ESC '[' in 7-bit environment. Character composition specification takes the following format: o ESC '0' -- start character composition o ESC '1' -- end character composition Since these are not standard escape sequences of any ISO, the use of them for these meanings is restricted to Emacs only. */ static void reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso) { int i; for (i = 0; i < 4; i++) { if (!NILP (coding_system)) iso->charset[i] = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i); else iso->charset[i] = Qt; iso->invalid_designated[i] = 0; } iso->esc = ISO_ESC_NOTHING; iso->esc_bytes_index = 0; iso->register_left = 0; iso->register_right = 1; iso->switched_dir_and_no_valid_charset_yet = 0; iso->invalid_switch_dir = 0; iso->output_direction_sequence = 0; iso->output_literally = 0; if (iso->composite_chars) Dynarr_reset (iso->composite_chars); } static int fit_to_be_escape_quoted (unsigned char c) { switch (c) { case ISO_CODE_ESC: case ISO_CODE_CSI: case ISO_CODE_SS2: case ISO_CODE_SS3: case ISO_CODE_SO: case ISO_CODE_SI: return 1; default: return 0; } } /* Parse one byte of an ISO2022 escape sequence. If the result is an invalid escape sequence, return 0 and do not change anything in STR. Otherwise, if the result is an incomplete escape sequence, update ISO2022.ESC and ISO2022.ESC_BYTES and return -1. Otherwise, update all the state variables (but not ISO2022.ESC_BYTES) and return 1. If CHECK_INVALID_CHARSETS is non-zero, check for designation or invocation of an invalid character set and treat that as an unrecognized escape sequence. 
*/ static int parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso, unsigned char c, unsigned int *flags, int check_invalid_charsets) { /* (1) If we're at the end of a designation sequence, CS is the charset being designated and REG is the register to designate it to. (2) If we're at the end of a locking-shift sequence, REG is the register to invoke and HALF (0 == left, 1 == right) is the half to invoke it into. (3) If we're at the end of a single-shift sequence, REG is the register to invoke. */ Lisp_Object cs = Qnil; int reg, half; /* NOTE: This code does goto's all over the fucking place. The reason for this is that we're basically implementing a state machine here, and hierarchical languages like C don't really provide a clean way of doing this. */ if (! (*flags & CODING_STATE_ESCAPE)) /* At beginning of escape sequence; we need to reset our escape-state variables. */ iso->esc = ISO_ESC_NOTHING; iso->output_literally = 0; iso->output_direction_sequence = 0; switch (iso->esc) { case ISO_ESC_NOTHING: iso->esc_bytes_index = 0; switch (c) { case ISO_CODE_ESC: /* Start escape sequence */ *flags |= CODING_STATE_ESCAPE; iso->esc = ISO_ESC; goto not_done; case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */ *flags |= CODING_STATE_ESCAPE; iso->esc = ISO_ESC_5_11; goto not_done; case ISO_CODE_SO: /* locking shift 1 */ reg = 1; half = 0; goto locking_shift; case ISO_CODE_SI: /* locking shift 0 */ reg = 0; half = 0; goto locking_shift; case ISO_CODE_SS2: /* single shift */ reg = 2; goto single_shift; case ISO_CODE_SS3: /* single shift */ reg = 3; goto single_shift; default: /* Other control characters */ return 0; } case ISO_ESC: switch (c) { /**** single shift ****/ case 'N': /* single shift 2 */ reg = 2; goto single_shift; case 'O': /* single shift 3 */ reg = 3; goto single_shift; /**** locking shift ****/ case '~': /* locking shift 1 right */ reg = 1; half = 1; goto locking_shift; case 'n': /* locking shift 2 */ reg = 2; half = 0; goto 
locking_shift; case '}': /* locking shift 2 right */ reg = 2; half = 1; goto locking_shift; case 'o': /* locking shift 3 */ reg = 3; half = 0; goto locking_shift; case '|': /* locking shift 3 right */ reg = 3; half = 1; goto locking_shift; /**** composite ****/ case '0': iso->esc = ISO_ESC_START_COMPOSITE; *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_COMPOSITE; return 1; case '1': iso->esc = ISO_ESC_END_COMPOSITE; *flags = (*flags & CODING_STATE_ISO2022_LOCK) & ~CODING_STATE_COMPOSITE; return 1; /**** directionality ****/ case '[': iso->esc = ISO_ESC_5_11; goto not_done; /**** designation ****/ case '$': /* multibyte charset prefix */ iso->esc = ISO_ESC_2_4; goto not_done; default: if (0x28 <= c && c <= 0x2F) { iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8); goto not_done; } /* This function is called with CODESYS equal to nil when doing coding-system detection. */ if (!NILP (codesys) && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys) && fit_to_be_escape_quoted (c)) { iso->esc = ISO_ESC_LITERAL; *flags &= CODING_STATE_ISO2022_LOCK; return 1; } /* bzzzt! 
*/ return 0; } /**** directionality ****/ case ISO_ESC_5_11: /* ISO6429 direction control */ if (c == ']') { *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L); goto directionality; } if (c == '0') iso->esc = ISO_ESC_5_11_0; else if (c == '1') iso->esc = ISO_ESC_5_11_1; else if (c == '2') iso->esc = ISO_ESC_5_11_2; else return 0; goto not_done; case ISO_ESC_5_11_0: if (c == ']') { *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L); goto directionality; } return 0; case ISO_ESC_5_11_1: if (c == ']') { *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L); goto directionality; } return 0; case ISO_ESC_5_11_2: if (c == ']') { *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L; goto directionality; } return 0; directionality: iso->esc = ISO_ESC_DIRECTIONALITY; /* Various junk here to attempt to preserve the direction sequences literally in the text if they would otherwise be swallowed due to invalid designations that don't show up as actual charset changes in the text. */ if (iso->invalid_switch_dir) { /* We already inserted a direction switch literally into the text. We assume (#### this may not be right) that the next direction switch is the one going the other way, and we need to output that literally as well. */ iso->output_literally = 1; iso->invalid_switch_dir = 0; } else { int jj; /* If we are in the thrall of an invalid designation, then stick the directionality sequence literally into the output stream so it ends up in the original text again. */ for (jj = 0; jj < 4; jj++) if (iso->invalid_designated[jj]) break; if (jj < 4) { iso->output_literally = 1; iso->invalid_switch_dir = 1; } else /* Indicate that we haven't yet seen a valid designation, so that if a switch-dir is directly followed by an invalid designation, both get inserted literally. 
*/ iso->switched_dir_and_no_valid_charset_yet = 1; } return 1; /**** designation ****/ case ISO_ESC_2_4: if (0x28 <= c && c <= 0x2F) { iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8); goto not_done; } if (0x40 <= c && c <= 0x42) { cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c, *flags & CODING_STATE_R2L ? CHARSET_RIGHT_TO_LEFT : CHARSET_LEFT_TO_RIGHT); reg = 0; goto designated; } return 0; default: { int type =-1; if (c < '0' || c > '~') return 0; /* bad final byte */ if (iso->esc >= ISO_ESC_2_8 && iso->esc <= ISO_ESC_2_15) { type = ((iso->esc >= ISO_ESC_2_12) ? CHARSET_TYPE_96 : CHARSET_TYPE_94); reg = (iso->esc - ISO_ESC_2_8) & 3; } else if (iso->esc >= ISO_ESC_2_4_8 && iso->esc <= ISO_ESC_2_4_15) { type = ((iso->esc >= ISO_ESC_2_4_12) ? CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94); reg = (iso->esc - ISO_ESC_2_4_8) & 3; } else { /* Can this ever be reached? -slb */ abort(); } cs = CHARSET_BY_ATTRIBUTES (type, c, *flags & CODING_STATE_R2L ? CHARSET_RIGHT_TO_LEFT : CHARSET_LEFT_TO_RIGHT); goto designated; } } not_done: iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c; return -1; single_shift: if (check_invalid_charsets && !CHARSETP (iso->charset[reg])) /* can't invoke something that ain't there. */ return 0; iso->esc = ISO_ESC_SINGLE_SHIFT; *flags &= CODING_STATE_ISO2022_LOCK; if (reg == 2) *flags |= CODING_STATE_SS2; else *flags |= CODING_STATE_SS3; return 1; locking_shift: if (check_invalid_charsets && !CHARSETP (iso->charset[reg])) /* can't invoke something that ain't there. 
*/ return 0; if (half) iso->register_right = reg; else iso->register_left = reg; *flags &= CODING_STATE_ISO2022_LOCK; iso->esc = ISO_ESC_LOCKING_SHIFT; return 1; designated: if (NILP (cs) && check_invalid_charsets) { iso->invalid_designated[reg] = 1; iso->charset[reg] = Vcharset_ascii; iso->esc = ISO_ESC_DESIGNATE; *flags &= CODING_STATE_ISO2022_LOCK; iso->output_literally = 1; if (iso->switched_dir_and_no_valid_charset_yet) { /* We encountered a switch-direction followed by an invalid designation. Ensure that the switch-direction gets outputted; otherwise it will probably get eaten when the text is written out again. */ iso->switched_dir_and_no_valid_charset_yet = 0; iso->output_direction_sequence = 1; /* And make sure that the switch-dir going the other way gets outputted, as well. */ iso->invalid_switch_dir = 1; } return 1; } /* This function is called with CODESYS equal to nil when doing coding-system detection. */ if (!NILP (codesys)) { charset_conversion_spec_dynarr *dyn = XCODING_SYSTEM (codesys)->iso2022.input_conv; if (dyn) { int i; for (i = 0; i < Dynarr_length (dyn); i++) { struct charset_conversion_spec *spec = Dynarr_atp (dyn, i); if (EQ (cs, spec->from_charset)) cs = spec->to_charset; } } } iso->charset[reg] = cs; iso->esc = ISO_ESC_DESIGNATE; *flags &= CODING_STATE_ISO2022_LOCK; if (iso->invalid_designated[reg]) { iso->invalid_designated[reg] = 0; iso->output_literally = 1; } if (iso->switched_dir_and_no_valid_charset_yet) iso->switched_dir_and_no_valid_charset_yet = 0; return 1; }

/* Detector for the ISO2022 family of coding categories.

   Scans the N bytes at SRC and narrows the set of ISO2022 categories
   that are still plausible.  ST carries the detector state between
   calls; the first call (st->iso2022.initted == 0) resets the embedded
   ISO2022 parser state and seeds the mask with all five ISO categories.

   Returns the narrowed category mask:
     - 0 as soon as parse_iso2022_esc rejects a control/escape byte;
     - exactly CODING_CATEGORY_ISO_LOCK_SHIFT_MASK as soon as a locking
       shift has been parsed;
     - otherwise the starting mask minus whatever categories the data
       ruled out.

   The working mask starts from st->iso2022.mask, but this function
   does not store the result back there.  */
static int
detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
                       unsigned int n)
{
  int c;
  int mask;

  /* #### There are serious deficiencies in the recognition mechanism
     here.  This needs to be much smarter if it's going to cut it. */

  /* One-time initialization of the per-detection state. */
  if (!st->iso2022.initted)
    {
      reset_iso2022 (Qnil, &st->iso2022.iso);
      st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
                          CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
                          CODING_CATEGORY_ISO_8_1_MASK |
                          CODING_CATEGORY_ISO_8_2_MASK |
                          CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
      st->iso2022.flags = 0;
      st->iso2022.high_byte_count = 0;
      st->iso2022.saw_single_shift = 0;
      st->iso2022.initted = 1;
    }

  mask = st->iso2022.mask;

  while (n--)
    {
      c = *src++;
      if (c >= 0xA0)
        {
          /* A byte in the right half rules out seven-bit ISO; count
             the run length for the iso-8-2 parity check below. */
          mask &= ~CODING_CATEGORY_ISO_7_MASK;
          st->iso2022.high_byte_count++;
        }
      else
        {
          /* A run of high bytes has just ended (or there was none). */
          if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
            {
              if (st->iso2022.high_byte_count & 1)
                /* odd number of high bytes; assume not iso-8-2 */
                mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
            }
          st->iso2022.high_byte_count = 0;
          st->iso2022.saw_single_shift = 0;
          /* NOTE(review): this catches 0x81..0x9F only; the byte 0x80
             itself does not clear the seven-bit category here. */
          if (c > 0x80)
            mask &= ~CODING_CATEGORY_ISO_7_MASK;
        }
      if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
          && (BYTE_C0_P (c) || BYTE_C1_P (c)))
        { /* control chars */
          switch (c)
            {
              /* Allow and ignore control characters that you might
                 reasonably see in a text file */
            case '\r':
            case '\n':
            case '\t':
            case  7: /* bell */
            case  8: /* backspace */
            case 11: /* vertical tab */
            case 12: /* form feed */
            case 26: /* MS-DOS C-z junk */
            case 31: /* '^_' -- for info */
              goto label_continue_loop;

            default:
              break;
            }
        }

      /* Anything inside an escape sequence, and every remaining control
         byte, is handed to the escape-sequence parser (with CODESYS nil
         and invalid-charset checking turned off). */
      if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
          || BYTE_C1_P (c))
        {
          if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
                                 &st->iso2022.flags, 0))
            {
              switch (st->iso2022.iso.esc)
                {
                case ISO_ESC_DESIGNATE:
                  /* An explicit designation rules out the two
                     categories iso-8-1 and iso-8-2. */
                  mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
                  mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
                  break;
                case ISO_ESC_LOCKING_SHIFT:
                  /* Decisive: stop scanning immediately. */
                  mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
                  goto ran_out_of_chars;
                case ISO_ESC_SINGLE_SHIFT:
                  mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
                  st->iso2022.saw_single_shift = 1;
                  break;
                default:
                  break;
                }
            }
          else
            {
              /* The parser rejected the byte: not ISO2022 at all. */
              mask = 0;
              goto ran_out_of_chars;
            }
        }
    label_continue_loop:;
    }

 ran_out_of_chars:

  return mask;
}

/* Return MASK with the three eight-bit ISO categories cleared when the
   seven-bit ISO category bit is set; otherwise return MASK unchanged. */
static int
postprocess_iso2022_mask (int mask)
{
  /* #### kind of cheesy */
  /* If seven-bit ISO is allowed, then assume that the encoding is
     entirely seven-bit and turn off the eight-bit ones. */
  if (mask & CODING_CATEGORY_ISO_7_MASK)
    mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
               CODING_CATEGORY_ISO_8_1_MASK |
               CODING_CATEGORY_ISO_8_2_MASK);
  return mask;
}

/* If FLAGS is a null pointer or specifies right-to-left motion,
   output a switch-dir-to-left-to-right sequence to DST.
   Also update FLAGS if it is not a null pointer.
   If INTERNAL_P is set, we are outputting in internal format and
   need to handle the CSI differently.

   The sequence written is the introducer followed by '0' and ']'.
   For a seven-bit coding system the introducer is the two bytes
   ESC '[' instead of the single CSI byte. */

static void
restore_left_to_right_direction (struct Lisp_Coding_System *codesys,
                                 unsigned_char_dynarr *dst,
                                 unsigned int *flags,
                                 int internal_p)
{
  if (!flags || (*flags & CODING_STATE_R2L))
    {
      if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
        {
          Dynarr_add (dst, ISO_CODE_ESC);
          Dynarr_add (dst, '[');
        }
      else if (internal_p)
        DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
      else
        Dynarr_add (dst, ISO_CODE_CSI);
      Dynarr_add (dst, '0');
      Dynarr_add (dst, ']');
      if (flags)
        *flags &= ~CODING_STATE_R2L;
    }
}

/* If FLAGS is a null pointer or specifies a direction different from
   DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
   CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
   sequence to DST.  Also update FLAGS if it is not a null pointer.
   If INTERNAL_P is set, we are outputting in internal format and
   need to handle the CSI differently.
*/

/* Note on the two branches below: switching to left-to-right is
   delegated to restore_left_to_right_direction, while switching to
   right-to-left (introducer followed by '2' and ']') is suppressed
   when CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys) is set.  When FLAGS
   is a null pointer the sequence for DIRECTION is output
   unconditionally (subject to that same suppression). */

static void
ensure_correct_direction (int direction, struct Lisp_Coding_System *codesys,
                          unsigned_char_dynarr *dst, unsigned int *flags,
                          int internal_p)
{
  if ((!flags || (*flags & CODING_STATE_R2L)) &&
      direction == CHARSET_LEFT_TO_RIGHT)
    restore_left_to_right_direction (codesys, dst, flags, internal_p);
  else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
           && (!flags || !(*flags & CODING_STATE_R2L)) &&
           direction == CHARSET_RIGHT_TO_LEFT)
    {
      if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
        {
          Dynarr_add (dst, ISO_CODE_ESC);
          Dynarr_add (dst, '[');
        }
      else if (internal_p)
        DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
      else
        Dynarr_add (dst, ISO_CODE_CSI);
      Dynarr_add (dst, '2');
      Dynarr_add (dst, ']');
      if (flags)
        *flags |= CODING_STATE_R2L;
    }
}

/* Convert ISO2022-format data to internal format.

   Decodes the N bytes at SRC and appends the result to DST.  DECODING
   is the decoding stream whose per-stream data supplies the coding
   system, the EOL type and the ISO2022 parser state (str->iso2022).
   FLAGS and the pending partial character CH are obtained with
   CODING_STREAM_DECOMPOSE on entry and handed back with
   CODING_STREAM_COMPOSE on exit (presumably so that a character or
   escape sequence split across calls can be resumed -- confirm against
   the macro definitions).

   While a composite character is being collected, output is diverted
   to str->iso2022.composite_chars; REAL_DST remembers the caller's
   destination so that it can be restored when composition ends. */

static void
decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
                       unsigned_char_dynarr *dst, unsigned int n)
{
  unsigned char c;
  unsigned int flags, ch;
  enum eol_type eol_type;
  struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
  Lisp_Object coding_system = Qnil;
  unsigned_char_dynarr *real_dst = dst;

  CODING_STREAM_DECOMPOSE (str, flags, ch);
  eol_type = str->eol_type;
  XSETCODING_SYSTEM (coding_system, str->codesys);

  /* Resume diverting output if a previous call ended mid-composition. */
  if (flags & CODING_STATE_COMPOSITE)
    dst = str->iso2022.composite_chars;

  while (n--)
    {
      c = *src++;
      if (flags & CODING_STATE_ESCAPE)
        { /* Within ESC sequence */
          int retval = parse_iso2022_esc (coding_system, &str->iso2022,
                                          c, &flags, 1);

          if (retval)
            {
              switch (str->iso2022.esc)
                {
                case ISO_ESC_START_COMPOSITE:
                  /* Start collecting into a fresh (or freshly emptied)
                     scratch dynarr. */
                  if (str->iso2022.composite_chars)
                    Dynarr_reset (str->iso2022.composite_chars);
                  else
                    str->iso2022.composite_chars = Dynarr_new (unsigned_char);
                  dst = str->iso2022.composite_chars;
                  break;
                case ISO_ESC_END_COMPOSITE:
                  {
                    /* Look up the composite character for the bytes
                       currently in DST, then emit it to the real
                       destination. */
                    Bufbyte comstr[MAX_EMCHAR_LEN];
                    Bytecount len;
                    Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
                                                         Dynarr_length (dst));
                    dst = real_dst;
                    len = set_charptr_emchar (comstr, emch);
                    Dynarr_add_many (dst, comstr, len);
                    break;
                  }

                case ISO_ESC_LITERAL:
                  DECODE_ADD_BINARY_CHAR (c, dst);
                  break;

                default:
                  /* Everything else handled already */
                  break;
                }
            }

          /* Attempted error recovery. */
          if (str->iso2022.output_direction_sequence)
            ensure_correct_direction (flags & CODING_STATE_R2L ?
                                      CHARSET_RIGHT_TO_LEFT :
                                      CHARSET_LEFT_TO_RIGHT,
                                      str->codesys, dst, 0, 1);
          /* More error recovery. */
          if (!retval || str->iso2022.output_literally)
            {
              /* Output the (possibly invalid) sequence */
              int i;
              for (i = 0; i < str->iso2022.esc_bytes_index; i++)
                DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
              flags &= CODING_STATE_ISO2022_LOCK;
              if (!retval)
                n++, src--;/* Repeat the loop with the same character. */
              else
                {
                  /* No sense in reprocessing the final byte of the
                     escape sequence; it could mess things up anyway.
                     Just add it now. */
                  DECODE_ADD_BINARY_CHAR (c, dst);
                }
            }
          ch = 0;
        }
      else if (BYTE_C0_P (c) || BYTE_C1_P (c))
        { /* Control characters */

          /***** Error-handling *****/

          /* If we were in the middle of a character, dump out the
             partial character. */
          DECODE_OUTPUT_PARTIAL_CHAR (ch);

          /* If we just saw a single-shift character, dump it out.
             This may dump out the wrong sort of single-shift character,
             but at least it will give an indication that something went
             wrong. */
          if (flags & CODING_STATE_SS2)
            {
              DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
              flags &= ~CODING_STATE_SS2;
            }
          if (flags & CODING_STATE_SS3)
            {
              DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
              flags &= ~CODING_STATE_SS3;
            }

          /***** Now handle the control characters. *****/

          /* Handle CR/LF */
          DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);

          flags &= CODING_STATE_ISO2022_LOCK;

          /* A control byte the parser does not accept is passed through
             literally. */
          if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
            DECODE_ADD_BINARY_CHAR (c, dst);
        }
      else
        { /* Graphic characters */
          Lisp_Object charset;
          int lb;
          int reg;

          DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);

          /* Now determine the charset. */
          /* A pending single shift wins; otherwise the byte's half
             (high bit set or not) selects the right or left register. */
          reg = ((flags & CODING_STATE_SS2) ? 2
                 : (flags & CODING_STATE_SS3) ? 3
                 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
                 : str->iso2022.register_left);
          charset = str->iso2022.charset[reg];

          /* Error checking: */
          if (NILP (charset) || str->iso2022.invalid_designated[reg]
              || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
                  && XCHARSET_CHARS (charset) == 94))
            /* Mrmph.  We are trying to invoke a register that has no
               or an invalid charset in it, or trying to add a character
               outside the range of the charset.  Insert that char
               literally to preserve it for the output. */
            {
              DECODE_OUTPUT_PARTIAL_CHAR (ch);
              DECODE_ADD_BINARY_CHAR (c, dst);
            }

          else
            {
              /* Things are probably hunky-dorey. */

              /* Fetch reverse charset, maybe. */
              if (((flags & CODING_STATE_R2L) &&
                   XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
                  ||
                  (!(flags & CODING_STATE_R2L) &&
                   XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
                {
                  Lisp_Object new_charset =
                    XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
                  if (!NILP (new_charset))
                    charset = new_charset;
                }

              lb = XCHARSET_LEADING_BYTE (charset);
              switch (XCHARSET_REP_BYTES (charset))
                {
                case 1: /* ASCII */
                  DECODE_OUTPUT_PARTIAL_CHAR (ch);
                  Dynarr_add (dst, c & 0x7F);
                  break;

                case 2: /* one-byte official */
                  DECODE_OUTPUT_PARTIAL_CHAR (ch);
                  Dynarr_add (dst, lb);
                  Dynarr_add (dst, c | 0x80);
                  break;

                case 3: /* one-byte private or two-byte official */
                  if (XCHARSET_PRIVATE_P (charset))
                    {
                      DECODE_OUTPUT_PARTIAL_CHAR (ch);
                      Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
                      Dynarr_add (dst, lb);
                      Dynarr_add (dst, c | 0x80);
                    }
                  else
                    {
                      /* Two-byte charset: hold the first byte in CH
                         until the second one arrives. */
                      if (ch)
                        {
                          Dynarr_add (dst, lb);
                          Dynarr_add (dst, ch | 0x80);
                          Dynarr_add (dst, c | 0x80);
                          ch = 0;
                        }
                      else
                        ch = c;
                    }
                  break;

                default: /* two-byte private */
                  if (ch)
                    {
                      Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
                      Dynarr_add (dst, lb);
                      Dynarr_add (dst, ch | 0x80);
                      Dynarr_add (dst, c | 0x80);
                      ch = 0;
                    }
                  else
                    ch = c;
                }
            }

          /* Once a whole character has been emitted, keep only the
             flag bits covered by CODING_STATE_ISO2022_LOCK. */
          if (!ch)
            flags &= CODING_STATE_ISO2022_LOCK;
        }

    label_continue_loop:;
    }

  /* At end of data, flush any half-finished character. */
  if (flags & CODING_STATE_END)
    DECODE_OUTPUT_PARTIAL_CHAR (ch);

  CODING_STREAM_COMPOSE (str, flags, ch);
}


/***** ISO2022 encoder *****/

/* Designate CHARSET into
register REG. */ static void iso2022_designate (Lisp_Object charset, unsigned char reg, struct encoding_stream *str, unsigned_char_dynarr *dst) { CONST char *inter94 = "()*+", *inter96= ",-./"; int type; unsigned char final; Lisp_Object old_charset = str->iso2022.charset[reg]; str->iso2022.charset[reg] = charset; if (!CHARSETP (charset)) /* charset might be an initial nil or t. */ return; type = XCHARSET_TYPE (charset); final = XCHARSET_FINAL (charset); if (!str->iso2022.force_charset_on_output[reg] && CHARSETP (old_charset) && XCHARSET_TYPE (old_charset) == type && XCHARSET_FINAL (old_charset) == final) return; str->iso2022.force_charset_on_output[reg] = 0; { charset_conversion_spec_dynarr *dyn = str->codesys->iso2022.output_conv; if (dyn) { int i; for (i = 0; i < Dynarr_length (dyn); i++) { struct charset_conversion_spec *spec = Dynarr_atp (dyn, i); if (EQ (charset, spec->from_charset)) charset = spec->to_charset; } } } Dynarr_add (dst, ISO_CODE_ESC); switch (type) { case CHARSET_TYPE_94: Dynarr_add (dst, inter94[reg]); break; case CHARSET_TYPE_96: Dynarr_add (dst, inter96[reg]); break; case CHARSET_TYPE_94X94: Dynarr_add (dst, '$'); if (reg != 0 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys)) || final < '@' || final > 'B') Dynarr_add (dst, inter94[reg]); break; case CHARSET_TYPE_96X96: Dynarr_add (dst, '$'); Dynarr_add (dst, inter96[reg]); break; } Dynarr_add (dst, final); } static void ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst) { if (str->iso2022.register_left != 0) { Dynarr_add (dst, ISO_CODE_SI); str->iso2022.register_left = 0; } } static void ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst) { if (str->iso2022.register_left != 1) { Dynarr_add (dst, ISO_CODE_SO); str->iso2022.register_left = 1; } } /* Convert internally-formatted data to ISO2022 format. 
*/

/* Encodes the N bytes of internal-format text at SRC and appends the
   resulting ISO2022 byte stream to DST.  ENCODING is the encoding
   stream; its per-stream data supplies the coding system and the
   encoder state.

   State that must survive between calls is read from STR on entry and
   written back on exit: FLAGS and the pending byte CH (through
   CODING_STREAM_DECOMPOSE / CODING_STREAM_COMPOSE), plus
   current_char_boundary, current_charset and current_half.

   Each input byte falls into one of three cases: an ASCII byte, a
   leading byte (which selects the charset, designates it into a
   register and invokes that register), or a non-leading byte of a
   non-ASCII character (which is emitted, masked to seven bits when the
   charset was invoked into the left half). */

static void
encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
                       unsigned_char_dynarr *dst, unsigned int n)
{
  unsigned char charmask, c;
  unsigned int flags, ch;
  enum eol_type eol_type;
  unsigned char char_boundary;
  struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
  struct Lisp_Coding_System *codesys = str->codesys;
  int i;
  Lisp_Object charset;
  int half;

  /* flags for handling composite chars.  We do a little switcharoo
     on the source while we're outputting the composite char. */
  unsigned int saved_n = 0;
  CONST unsigned char *saved_src = NULL;
  int in_composite = 0;

  CODING_STREAM_DECOMPOSE (str, flags, ch);
  eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
  char_boundary = str->iso2022.current_char_boundary;
  charset = str->iso2022.current_charset;
  half = str->iso2022.current_half;

 back_to_square_n:
  while (n--)
    {
      c = *src++;

      if (BYTE_ASCII_P (c))
        { /* Processing ASCII character */
          ch = 0;

          restore_left_to_right_direction (codesys, dst, &flags, 0);

          /* Make sure G0 contains ASCII */
          if ((c > ' ' && c < ISO_CODE_DEL) ||
              !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
            {
              ensure_normal_shift (str, dst);
              iso2022_designate (Vcharset_ascii, 0, str, dst);
            }

          /* If necessary, restore everything to the default state
             at end-of-line */
          if (c == '\n' &&
              !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
            {
              restore_left_to_right_direction (codesys, dst, &flags, 0);

              ensure_normal_shift (str, dst);

              for (i = 0; i < 4; i++)
                {
                  Lisp_Object initial_charset =
                    CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
                  iso2022_designate (initial_charset, i, str, dst);
                }
            }
          if (c == '\n')
            {
              /* Emit the line ending selected by EOL_TYPE: CR is added
                 for everything except LF and autodetect, LF is kept
                 for everything except CR. */
              if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
                Dynarr_add (dst, '\r');
              if (eol_type != EOL_CR)
                Dynarr_add (dst, c);
            }
          else
            {
              if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
                  && fit_to_be_escape_quoted (c))
                Dynarr_add (dst, ISO_CODE_ESC);
              Dynarr_add (dst, c);
            }
          char_boundary = 1;
        }

      else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
        { /* Processing Leading Byte */
          ch = 0;
          charset = CHARSET_BY_LEADING_BYTE (c);
          /* A prefix leading byte is remembered in CH; the check on CH
             in the condition above then routes the next byte here too. */
          if (LEADING_BYTE_PREFIX_P(c))
            ch = c;
          else if (!EQ (charset, Vcharset_control_1)
                   && !EQ (charset, Vcharset_composite))
            {
              int reg;

              ensure_correct_direction (XCHARSET_DIRECTION (charset),
                                        codesys, dst, &flags, 0);

              /* Now determine which register to use. */
              /* Prefer a register that already holds this charset or
                 has it as its initial charset. */
              reg = -1;
              for (i = 0; i < 4; i++)
                {
                  if (EQ (charset, str->iso2022.charset[i]) ||
                      EQ (charset,
                          CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
                    {
                      reg = i;
                      break;
                    }
                }

              if (reg == -1)
                {
                  if (XCHARSET_GRAPHIC (charset) != 0)
                    {
                      if (!NILP (str->iso2022.charset[1]) &&
                          (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
                           CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
                        reg = 1;
                      else if (!NILP (str->iso2022.charset[2]))
                        reg = 2;
                      else if (!NILP (str->iso2022.charset[3]))
                        reg = 3;
                      else
                        reg = 0;
                    }
                  else
                    reg = 0;
                }

              iso2022_designate (charset, reg, str, dst);

              /* Now invoke that register. */
              /* HALF records whether the following bytes go out with
                 the high bit stripped (0) or kept (1). */
              switch (reg)
                {
                case 0:
                  ensure_normal_shift (str, dst);
                  half = 0;
                  break;

                case 1:
                  if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
                    {
                      ensure_shift_out (str, dst);
                      half = 0;
                    }
                  else
                    half = 1;
                  break;

                case 2:
                  /* Single shift 2: ESC 'N' in seven-bit systems,
                     the SS2 byte otherwise. */
                  if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
                    {
                      Dynarr_add (dst, ISO_CODE_ESC);
                      Dynarr_add (dst, 'N');
                      half = 0;
                    }
                  else
                    {
                      Dynarr_add (dst, ISO_CODE_SS2);
                      half = 1;
                    }
                  break;

                case 3:
                  /* Single shift 3: ESC 'O' in seven-bit systems,
                     the SS3 byte otherwise. */
                  if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
                    {
                      Dynarr_add (dst, ISO_CODE_ESC);
                      Dynarr_add (dst, 'O');
                      half = 0;
                    }
                  else
                    {
                      Dynarr_add (dst, ISO_CODE_SS3);
                      half = 1;
                    }
                  break;

                default:
                  abort ();
                }
            }
          char_boundary = 0;
        }
      else
        { /* Processing Non-ASCII character */
          charmask = (half == 0 ? 0x7F : 0xFF);
          char_boundary = 1;
          if (EQ (charset, Vcharset_control_1))
            {
              if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
                  && fit_to_be_escape_quoted (c))
                Dynarr_add (dst, ISO_CODE_ESC);
              /* you asked for it ... */
              Dynarr_add (dst, c - 0x20);
            }
          else
            {
              switch (XCHARSET_REP_BYTES (charset))
                {
                case 2:
                  Dynarr_add (dst, c & charmask);
                  break;
                case 3:
                  if (XCHARSET_PRIVATE_P (charset))
                    {
                      Dynarr_add (dst, c & charmask);
                      ch = 0;
                    }
                  else if (ch)
                    {
                      if (EQ (charset, Vcharset_composite))
                        {
                          if (in_composite)
                            {
                              /* #### Bother! We don't know how to
                                 handle this yet. */
                              Dynarr_add (dst, '~');
                            }
                          else
                            {
                              /* Redirect SRC/N to the string of the
                                 composite's components; the original
                                 position is restored after the loop. */
                              Emchar emch = MAKE_CHAR (Vcharset_composite,
                                                       ch & 0x7F, c & 0x7F);
                              Lisp_Object lstr = composite_char_string (emch);
                              saved_n = n;
                              saved_src = src;
                              in_composite = 1;
                              src = XSTRING_DATA   (lstr);
                              n   = XSTRING_LENGTH (lstr);
                              Dynarr_add (dst, ISO_CODE_ESC);
                              Dynarr_add (dst, '0'); /* start composing */
                            }
                        }
                      else
                        {
                          Dynarr_add (dst, ch & charmask);
                          Dynarr_add (dst, c & charmask);
                        }
                      ch = 0;
                    }
                  else
                    {
                      /* First of two bytes: hold it until the second
                         one arrives. */
                      ch = c;
                      char_boundary = 0;
                    }
                  break;
                case 4:
                  if (ch)
                    {
                      Dynarr_add (dst, ch & charmask);
                      Dynarr_add (dst, c & charmask);
                      ch = 0;
                    }
                  else
                    {
                      ch = c;
                      char_boundary = 0;
                    }
                  break;
                default:
                  abort ();
                }
            }
        }
    }

  /* The loop above ran out of the composite's component string:
     close the composition and resume the caller's data. */
  if (in_composite)
    {
      n = saved_n;
      src = saved_src;
      in_composite = 0;
      Dynarr_add (dst, ISO_CODE_ESC);
      Dynarr_add (dst, '1'); /* end composing */
      goto back_to_square_n; /* Wheeeeeeeee ..... */
    }

  /* At the end of the data, on a character boundary, return the
     output stream to its initial state. */
  if (char_boundary && flags & CODING_STATE_END)
    {
      restore_left_to_right_direction (codesys, dst, &flags, 0);
      ensure_normal_shift (str, dst);
      for (i = 0; i < 4; i++)
        {
          Lisp_Object initial_charset =
            CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
          iso2022_designate (initial_charset, i, str, dst);
        }
    }

  CODING_STREAM_COMPOSE (str, flags, ch);
  str->iso2022.current_char_boundary = char_boundary;
  str->iso2022.current_charset = charset;
  str->iso2022.current_half = half;

  /* Verbum caro factum est! */
}


/************************************************************************/
/*                     No-conversion methods                            */
/************************************************************************/

/* This is used when reading in "binary" files -- i.e.
files that may contain all 256 possible byte values and that are not to be interpreted as being in any particular decoding. */ static void decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src, unsigned_char_dynarr *dst, unsigned int n) { unsigned char c; unsigned int flags, ch; enum eol_type eol_type; struct decoding_stream *str = DECODING_STREAM_DATA (decoding); CODING_STREAM_DECOMPOSE (str, flags, ch); eol_type = str->eol_type; while (n--) { c = *src++; DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst); DECODE_ADD_BINARY_CHAR (c, dst); label_continue_loop:; } DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst); CODING_STREAM_COMPOSE (str, flags, ch); } static void encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src, unsigned_char_dynarr *dst, unsigned int n) { unsigned char c; struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); unsigned int flags, ch; enum eol_type eol_type; CODING_STREAM_DECOMPOSE (str, flags, ch); eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); while (n--) { c = *src++; if (c == '\n') { if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT) Dynarr_add (dst, '\r'); if (eol_type != EOL_CR) Dynarr_add (dst, '\n'); ch = 0; } else if (BYTE_ASCII_P (c)) { assert (ch == 0); Dynarr_add (dst, c); } else if (BUFBYTE_LEADING_BYTE_P (c)) { assert (ch == 0); if (c == LEADING_BYTE_LATIN_ISO8859_1 || c == LEADING_BYTE_CONTROL_1) ch = c; else Dynarr_add (dst, '~'); /* untranslatable character */ } else { if (ch == LEADING_BYTE_LATIN_ISO8859_1) Dynarr_add (dst, c); else if (ch == LEADING_BYTE_CONTROL_1) { assert (c < 0xC0); Dynarr_add (dst, c - 0x20); } /* else it should be the second or third byte of an untranslatable character, so ignore it */ ch = 0; } } CODING_STREAM_COMPOSE (str, flags, ch); } /************************************************************************/ /* Simple internal/external functions */ /************************************************************************/ static Extbyte_dynarr 
*conversion_out_dynarr; static Bufbyte_dynarr *conversion_in_dynarr; /* Determine coding system from coding format */ #define FILE_NAME_CODING_SYSTEM \ ((NILP (Vfile_name_coding_system) || \ (EQ ((Vfile_name_coding_system), Qbinary))) ? \ Qnil : Fget_coding_system (Vfile_name_coding_system)) /* #### not correct for all values of `fmt'! */ #define FMT_CODING_SYSTEM(fmt) \ (((fmt) == FORMAT_FILENAME) ? FILE_NAME_CODING_SYSTEM : \ ((fmt) == FORMAT_CTEXT ) ? Fget_coding_system (Qctext) : \ ((fmt) == FORMAT_TERMINAL) ? FILE_NAME_CODING_SYSTEM : \ Qnil) extern CONST Extbyte * convert_to_external_format (CONST Bufbyte *ptr, Bytecount len, Extcount *len_out, enum external_data_format fmt) { Lisp_Object coding_system = FMT_CODING_SYSTEM (fmt); if (!conversion_out_dynarr) conversion_out_dynarr = Dynarr_new (Extbyte); else Dynarr_reset (conversion_out_dynarr); if (NILP (coding_system)) { CONST Bufbyte *end = ptr + len; for (; ptr < end;) { Bufbyte c = (BYTE_ASCII_P (*ptr)) ? *ptr : (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) : (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? 
(*(ptr+1)) : '~'; Dynarr_add (conversion_out_dynarr, (Extbyte) c); INC_CHARPTR (ptr); } #ifdef ERROR_CHECK_BUFPOS assert (ptr == end); #endif } else { Lisp_Object instream, outstream, da_outstream; Lstream *istr, *ostr; struct gcpro gcpro1, gcpro2, gcpro3; char tempbuf[1024]; /* some random amount */ instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len); da_outstream = make_dynarr_output_stream ((unsigned_char_dynarr *) conversion_out_dynarr); outstream = make_encoding_output_stream (XLSTREAM (da_outstream), coding_system); istr = XLSTREAM (instream); ostr = XLSTREAM (outstream); GCPRO3 (instream, outstream, da_outstream); while (1) { int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf)); if (!size_in_bytes) break; Lstream_write (ostr, tempbuf, size_in_bytes); } Lstream_close (istr); Lstream_close (ostr); UNGCPRO; Lstream_delete (istr); Lstream_delete (ostr); Lstream_delete (XLSTREAM (da_outstream)); } *len_out = Dynarr_length (conversion_out_dynarr); Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! 
*/ return Dynarr_atp (conversion_out_dynarr, 0); } extern CONST Bufbyte * convert_from_external_format (CONST Extbyte *ptr, Extcount len, Bytecount *len_out, enum external_data_format fmt) { Lisp_Object coding_system = FMT_CODING_SYSTEM (fmt); if (!conversion_in_dynarr) conversion_in_dynarr = Dynarr_new (Bufbyte); else Dynarr_reset (conversion_in_dynarr); if (NILP (coding_system)) { CONST Extbyte *end = ptr + len; for (; ptr < end; ptr++) { Extbyte c = *ptr; DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr); } } else { Lisp_Object instream, outstream, da_outstream; Lstream *istr, *ostr; struct gcpro gcpro1, gcpro2, gcpro3; char tempbuf[1024]; /* some random amount */ instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len); da_outstream = make_dynarr_output_stream ((unsigned_char_dynarr *) conversion_in_dynarr); outstream = make_decoding_output_stream (XLSTREAM (da_outstream), coding_system); istr = XLSTREAM (instream); ostr = XLSTREAM (outstream); GCPRO3 (instream, outstream, da_outstream); while (1) { int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf)); if (!size_in_bytes) break; Lstream_write (ostr, tempbuf, size_in_bytes); } Lstream_close (istr); Lstream_close (ostr); UNGCPRO; Lstream_delete (istr); Lstream_delete (ostr); Lstream_delete (XLSTREAM (da_outstream)); } *len_out = Dynarr_length (conversion_in_dynarr); Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! 
*/ return Dynarr_atp (conversion_in_dynarr, 0); } /************************************************************************/ /* Initialization */ /************************************************************************/ void syms_of_mule_coding (void) { defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system"); deferror (&Qcoding_system_error, "coding-system-error", "Coding-system error", Qio_error); DEFSUBR (Fcoding_system_p); DEFSUBR (Ffind_coding_system); DEFSUBR (Fget_coding_system); DEFSUBR (Fcoding_system_list); DEFSUBR (Fcoding_system_name); DEFSUBR (Fmake_coding_system); DEFSUBR (Fcopy_coding_system); DEFSUBR (Fsubsidiary_coding_system); DEFSUBR (Fcoding_system_type); DEFSUBR (Fcoding_system_doc_string); DEFSUBR (Fcoding_system_charset); DEFSUBR (Fcoding_system_property); DEFSUBR (Fcoding_category_list); DEFSUBR (Fset_coding_priority_list); DEFSUBR (Fcoding_priority_list); DEFSUBR (Fset_coding_category_system); DEFSUBR (Fcoding_category_system); DEFSUBR (Fdetect_coding_region); DEFSUBR (Fdecode_coding_region); DEFSUBR (Fencode_coding_region); DEFSUBR (Fdecode_shift_jis_char); DEFSUBR (Fencode_shift_jis_char); DEFSUBR (Fdecode_big5_char); DEFSUBR (Fencode_big5_char); defsymbol (&Qcoding_system_p, "coding-system-p"); defsymbol (&Qbig5, "big5"); defsymbol (&Qshift_jis, "shift-jis"); defsymbol (&Qno_conversion, "no-conversion"); defsymbol (&Qccl, "ccl"); defsymbol (&Qiso2022, "iso2022"); defsymbol (&Qmnemonic, "mnemonic"); defsymbol (&Qeol_type, "eol-type"); defsymbol (&Qpost_read_conversion, "post-read-conversion"); defsymbol (&Qpre_write_conversion, "pre-write-conversion"); defsymbol (&Qcr, "cr"); defsymbol (&Qlf, "lf"); defsymbol (&Qcrlf, "crlf"); defsymbol (&Qeol_cr, "eol-cr"); defsymbol (&Qeol_lf, "eol-lf"); defsymbol (&Qeol_crlf, "eol-crlf"); defsymbol (&Qcharset_g0, "charset-g0"); defsymbol (&Qcharset_g1, "charset-g1"); defsymbol (&Qcharset_g2, "charset-g2"); defsymbol (&Qcharset_g3, "charset-g3"); defsymbol (&Qforce_g0_on_output, 
"force-g0-on-output"); defsymbol (&Qforce_g1_on_output, "force-g1-on-output"); defsymbol (&Qforce_g2_on_output, "force-g2-on-output"); defsymbol (&Qforce_g3_on_output, "force-g3-on-output"); defsymbol (&Qshort, "short"); defsymbol (&Qno_ascii_eol, "no-ascii-eol"); defsymbol (&Qno_ascii_cntl, "no-ascii-cntl"); defsymbol (&Qseven, "seven"); defsymbol (&Qlock_shift, "lock-shift"); defsymbol (&Qno_iso6429, "no-iso6429"); defsymbol (&Qescape_quoted, "escape-quoted"); defsymbol (&Qinput_charset_conversion, "input-charset-conversion"); defsymbol (&Qoutput_charset_conversion, "output-charset-conversion"); defsymbol (&Qencode, "encode"); defsymbol (&Qdecode, "decode"); defsymbol (&Qctext, "ctext"); defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS], "shift-jis"); defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7], "iso-7"); defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE], "iso-8-designate"); defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1], "iso-8-1"); defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2], "iso-8-2"); defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT], "iso-lock-shift"); defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5], "big5"); defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION], "no-conversion"); } void lstream_type_create_mule_coding (void) { LSTREAM_HAS_METHOD (decoding, reader); LSTREAM_HAS_METHOD (decoding, writer); LSTREAM_HAS_METHOD (decoding, rewinder); LSTREAM_HAS_METHOD (decoding, seekable_p); LSTREAM_HAS_METHOD (decoding, flusher); LSTREAM_HAS_METHOD (decoding, closer); LSTREAM_HAS_METHOD (decoding, marker); LSTREAM_HAS_METHOD (encoding, reader); LSTREAM_HAS_METHOD (encoding, writer); LSTREAM_HAS_METHOD (encoding, rewinder); LSTREAM_HAS_METHOD (encoding, seekable_p); LSTREAM_HAS_METHOD (encoding, flusher); LSTREAM_HAS_METHOD (encoding, closer); LSTREAM_HAS_METHOD (encoding, marker); } void vars_of_mule_coding (void) { int i; /* Initialize to something 
reasonable ... */ for (i = 0; i <= CODING_CATEGORY_LAST; i++) { coding_category_system[i] = Qnil; coding_category_by_priority[i] = i; } DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /* Coding system used for TTY keyboard input. Not used under a windowing system. */ ); Vkeyboard_coding_system = Qnil; DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /* Coding system used for TTY display output. Not used under a windowing system. */ ); Vterminal_coding_system = Qnil; DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /* Overriding coding system used when writing a file or process. You should *bind* this, not set it. If this is non-nil, it specifies the coding system that will be used when a file or process is read in, and overrides `buffer-file-coding-system-for-read', `insert-file-contents-pre-hook', etc. Use those variables instead of this one for permanent changes to the environment. */ ); Vcoding_system_for_read = Qnil; DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write /* Overriding coding system used when writing a file or process. You should *bind* this, not set it. If this is non-nil, it specifies the coding system that will be used when a file or process is wrote in, and overrides `buffer-file-coding-system', `write-region-pre-hook', etc. Use those variables instead of this one for permanent changes to the environment. */ ); Vcoding_system_for_write = Qnil; DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /* Coding system used to convert pathnames when accessing files. */ ); Vfile_name_coding_system = Qnil; DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /* Non-nil means the buffer contents are regarded as multi-byte form of characters, not a binary code. This affects the display, file I/O, and behaviors of various editing commands. Setting this to nil does not do anything. 
*/ ); enable_multibyte_characters = 1; } void complex_vars_of_mule_coding (void) { staticpro (&Vcoding_system_hashtable); Vcoding_system_hashtable = make_lisp_hashtable (50, HASHTABLE_NONWEAK, HASHTABLE_EQ); the_codesys_prop_dynarr = Dynarr_new (codesys_prop); #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \ { \ struct codesys_prop csp; \ csp.sym = (Sym); \ csp.prop_type = (Prop_Type); \ Dynarr_add (the_codesys_prop_dynarr, csp); \ } while (0) DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic); DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type); DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr); DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf); DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf); DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion); DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion); DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion); DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode); DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode); /* Need to create this here 
or we're really screwed. */ Fmake_coding_system (Qno_conversion, Qno_conversion, build_string ("No conversion"), list2 (Qmnemonic, build_string ("Noconv"))); Fcopy_coding_system (Fcoding_system_property (Qno_conversion, Qeol_lf), Qbinary); /* Need this for bootstrapping */ coding_category_system[CODING_CATEGORY_NO_CONVERSION] = Fget_coding_system (Qno_conversion); }