comparison src/file-coding.c @ 259:11cf20601dec r20-5b28

Import from CVS: tag r20-5b28
author cvs
date Mon, 13 Aug 2007 10:23:02 +0200
parents
children 405dd6d1825b
comparison
equal deleted inserted replaced
258:58424f6abf56 259:11cf20601dec
1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4
5 This file is part of XEmacs.
6
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
10 later version.
11
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /* Synched up with: Mule 2.3. Not in FSF. */
23
24 /* Rewritten by Ben Wing <wing@666.com>. */
25
26 #include <config.h>
27 #include "lisp.h"
28 #include "buffer.h"
29 #include "elhash.h"
30 #include "insdel.h"
31 #include "lstream.h"
32 #ifdef MULE
33 #include "mule-ccl.h"
34 #endif
35 #include "file-coding.h"
36
37 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
38
39 Lisp_Object Vkeyboard_coding_system;
40 Lisp_Object Vterminal_coding_system;
41 Lisp_Object Vcoding_system_for_read;
42 Lisp_Object Vcoding_system_for_write;
43 Lisp_Object Vfile_name_coding_system;
44
45 /* Table of symbols identifying each coding category. */
46 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
47
48 /* Coding system currently associated with each coding category. */
49 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
50
51 /* Table of all coding categories in decreasing order of priority.
52 This describes a permutation of the possible coding categories. */
53 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
54
55 Lisp_Object Qcoding_system_p;
56
57 Lisp_Object Qno_conversion, Qccl, Qiso2022;
58 /* Qinternal in general.c */
59
60 Lisp_Object Qmnemonic, Qeol_type;
61 Lisp_Object Qcr, Qcrlf, Qlf;
62 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
63 Lisp_Object Qpost_read_conversion;
64 Lisp_Object Qpre_write_conversion;
65
66 #ifdef MULE
67 Lisp_Object Qbig5, Qshift_jis;
68 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
69 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
70 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
71 Lisp_Object Qno_iso6429;
72 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
73 Lisp_Object Qctext;
74 #endif
75 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
76
77 Lisp_Object Qencode, Qdecode, Qescape_quoted;
78
79 Lisp_Object Vcoding_system_hashtable;
80
81 int enable_multibyte_characters;
82
83 #ifdef MULE
84 /* Additional information used by the ISO2022 decoder and detector. */
85 struct iso2022_decoder
86 {
87 /* CHARSET holds the character sets currently assigned to the G0
88 through G3 variables. It is initialized from the array
89 INITIAL_CHARSET in CODESYS. */
90 Lisp_Object charset[4];
91
92 /* Which registers are currently invoked into the left (GL) and
93 right (GR) halves of the 8-bit encoding space? */
94 int register_left, register_right;
95
96 /* ISO_ESC holds a value indicating part of an escape sequence
97 that has already been seen. */
98 enum iso_esc_flag esc;
99
100 /* This records the bytes we've seen so far in an escape sequence,
101 in case the sequence is invalid (we spit out the bytes unchanged). */
102 unsigned char esc_bytes[8];
103
104 /* Index for next byte to store in ISO escape sequence. */
105 int esc_bytes_index;
106
107 /* Stuff seen so far when composing a string. */
108 unsigned_char_dynarr *composite_chars;
109
110 /* If we saw an invalid designation sequence for a particular
111 register, we flag it here and switch to ASCII. The next time we
112 see a valid designation for this register, we turn off the flag
113 and do the designation normally, but pretend the sequence was
114 invalid. The effect of all this is that (most of the time) the
115 escape sequences for both the switch to the unknown charset, and
116 the switch back to the known charset, get inserted literally into
117 the buffer and saved out as such. The hope is that we can
118 preserve the escape sequences so that the resulting written out
119 file makes sense. If we don't do any of this, the designation
120 to the invalid charset will be preserved but that switch back
121 to the known charset will probably get eaten because it was
122 the same charset that was already present in the register. */
123 unsigned char invalid_designated[4];
124
125 /* We try to do similar things as above for direction-switching
126 sequences. If we encountered a direction switch while an
127 invalid designation was present, or an invalid designation
128 just after a direction switch (i.e. no valid designation
129 encountered yet), we insert the direction-switch escape
130 sequence literally into the output stream, and later on
131 insert the corresponding direction-restoring escape sequence
132 literally also. */
133 unsigned int switched_dir_and_no_valid_charset_yet :1;
134 unsigned int invalid_switch_dir :1;
135
136 /* Tells the decoder to output the escape sequence literally
137 even though it was valid. Used in the games we play to
138 avoid lossage when we encounter invalid designations. */
139 unsigned int output_literally :1;
140 /* We encountered a direction switch followed by an invalid
141 designation. We didn't output the direction switch
142 literally because we didn't know about the invalid designation;
143 but we have to do so now. */
144 unsigned int output_direction_sequence :1;
145 };
146 #endif
147 Lisp_Object Fcopy_coding_system (Lisp_Object old_coding_system,
148 Lisp_Object new_name);
149 #ifdef MULE
150 struct detection_state;
151 static int detect_coding_sjis (struct detection_state *st,
152 CONST unsigned char *src,
153 unsigned int n);
154 static void decode_coding_sjis (Lstream *decoding,
155 CONST unsigned char *src,
156 unsigned_char_dynarr *dst,
157 unsigned int n);
158 static void encode_coding_sjis (Lstream *encoding,
159 CONST unsigned char *src,
160 unsigned_char_dynarr *dst,
161 unsigned int n);
162 static int detect_coding_big5 (struct detection_state *st,
163 CONST unsigned char *src,
164 unsigned int n);
165 static void decode_coding_big5 (Lstream *decoding,
166 CONST unsigned char *src,
167 unsigned_char_dynarr *dst, unsigned int n);
168 static void encode_coding_big5 (Lstream *encoding,
169 CONST unsigned char *src,
170 unsigned_char_dynarr *dst, unsigned int n);
171 static int postprocess_iso2022_mask (int mask);
172 static void reset_iso2022 (Lisp_Object coding_system,
173 struct iso2022_decoder *iso);
174 static int detect_coding_iso2022 (struct detection_state *st,
175 CONST unsigned char *src,
176 unsigned int n);
177 static void decode_coding_iso2022 (Lstream *decoding,
178 CONST unsigned char *src,
179 unsigned_char_dynarr *dst, unsigned int n);
180 static void encode_coding_iso2022 (Lstream *encoding,
181 CONST unsigned char *src,
182 unsigned_char_dynarr *dst, unsigned int n);
183 #endif /* MULE */
184 static void decode_coding_no_conversion (Lstream *decoding,
185 CONST unsigned char *src,
186 unsigned_char_dynarr *dst,
187 unsigned int n);
188 static void encode_coding_no_conversion (Lstream *encoding,
189 CONST unsigned char *src,
190 unsigned_char_dynarr *dst,
191 unsigned int n);
192 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
193 unsigned_char_dynarr *dst, unsigned int n);
194 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
195 unsigned_char_dynarr *dst, unsigned int n);
196
197 typedef struct codesys_prop codesys_prop;
198 struct codesys_prop
199 {
200 Lisp_Object sym;
201 int prop_type;
202 };
203
204 typedef struct
205 {
206 Dynarr_declare (codesys_prop);
207 } codesys_prop_dynarr;
208
209 codesys_prop_dynarr *the_codesys_prop_dynarr;
210
211 enum codesys_prop_enum
212 {
213 CODESYS_PROP_ALL_OK,
214 CODESYS_PROP_ISO2022,
215 CODESYS_PROP_CCL
216 };
217
218
219 /************************************************************************/
220 /* Coding system functions */
221 /************************************************************************/
222
223 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
224 static void print_coding_system (Lisp_Object, Lisp_Object, int);
225 static void finalize_coding_system (void *header, int for_disksave);
226
227 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
228 mark_coding_system, print_coding_system,
229 finalize_coding_system,
230 0, 0, struct Lisp_Coding_System);
231
232 static Lisp_Object
233 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
234 {
235 struct Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
236
237 (markobj) (CODING_SYSTEM_NAME (codesys));
238 (markobj) (CODING_SYSTEM_DOC_STRING (codesys));
239 (markobj) (CODING_SYSTEM_MNEMONIC (codesys));
240 (markobj) (CODING_SYSTEM_EOL_LF (codesys));
241 (markobj) (CODING_SYSTEM_EOL_CRLF (codesys));
242 (markobj) (CODING_SYSTEM_EOL_CR (codesys));
243
244 switch (CODING_SYSTEM_TYPE (codesys))
245 {
246 #ifdef MULE
247 int i;
248 case CODESYS_ISO2022:
249 for (i = 0; i < 4; i++)
250 (markobj) (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
251 if (codesys->iso2022.input_conv)
252 {
253 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
254 {
255 struct charset_conversion_spec *ccs =
256 Dynarr_atp (codesys->iso2022.input_conv, i);
257 (markobj) (ccs->from_charset);
258 (markobj) (ccs->to_charset);
259 }
260 }
261 if (codesys->iso2022.output_conv)
262 {
263 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
264 {
265 struct charset_conversion_spec *ccs =
266 Dynarr_atp (codesys->iso2022.output_conv, i);
267 (markobj) (ccs->from_charset);
268 (markobj) (ccs->to_charset);
269 }
270 }
271 break;
272
273 case CODESYS_CCL:
274 (markobj) (CODING_SYSTEM_CCL_DECODE (codesys));
275 (markobj) (CODING_SYSTEM_CCL_ENCODE (codesys));
276 break;
277 #endif
278 default:
279 break;
280 }
281
282 (markobj) (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
283 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
284 }
285
286 static void
287 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
288 int escapeflag)
289 {
290 struct Lisp_Coding_System *c = XCODING_SYSTEM (obj);
291 if (print_readably)
292 error ("printing unreadable object #<coding_system 0x%x>",
293 c->header.uid);
294
295 write_c_string ("#<coding_system ", printcharfun);
296 print_internal (c->name, printcharfun, 1);
297 write_c_string (">", printcharfun);
298 }
299
300 static void
301 finalize_coding_system (void *header, int for_disksave)
302 {
303 struct Lisp_Coding_System *c = (struct Lisp_Coding_System *) header;
304 /* Since coding systems never go away, this function is not
305 necessary. But it would be necessary if we changed things
306 so that coding systems could go away. */
307 if (!for_disksave) /* see comment in lstream.c */
308 {
309 switch (CODING_SYSTEM_TYPE (c))
310 {
311 #ifdef MULE
312 case CODESYS_ISO2022:
313 if (c->iso2022.input_conv)
314 {
315 Dynarr_free (c->iso2022.input_conv);
316 c->iso2022.input_conv = 0;
317 }
318 if (c->iso2022.output_conv)
319 {
320 Dynarr_free (c->iso2022.output_conv);
321 c->iso2022.output_conv = 0;
322 }
323 break;
324 #endif
325 default:
326 break;
327 }
328 }
329 }
330
331 static enum eol_type
332 symbol_to_eol_type (Lisp_Object symbol)
333 {
334 CHECK_SYMBOL (symbol);
335 if (NILP (symbol)) return EOL_AUTODETECT;
336 if (EQ (symbol, Qlf)) return EOL_LF;
337 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
338 if (EQ (symbol, Qcr)) return EOL_CR;
339
340 signal_simple_error ("Unrecognized eol type", symbol);
341 return EOL_AUTODETECT; /* not reached */
342 }
343
344 static Lisp_Object
345 eol_type_to_symbol (enum eol_type type)
346 {
347 switch (type)
348 {
349 case EOL_LF: return Qlf;
350 case EOL_CRLF: return Qcrlf;
351 case EOL_CR: return Qcr;
352 case EOL_AUTODETECT: return Qnil;
353 default: abort (); return Qnil; /* not reached */
354 }
355 }
356
357 static void
358 setup_eol_coding_systems (struct Lisp_Coding_System *codesys)
359 {
360 Lisp_Object codesys_obj = Qnil;
361 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
362 char *codesys_name = (char *) alloca (len + 7);
363
364 int mlen = XSTRING_LENGTH(CODING_SYSTEM_MNEMONIC (codesys));
365 char *codesys_mnemonic = (char *) alloca (mlen + 7);
366
367 Lisp_Object codesys_name_sym, sub_codesys_obj;
368
369 /* kludge */
370
371 XSETCODING_SYSTEM (codesys_obj, codesys);
372
373 memcpy (codesys_name,
374 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
375
376 memcpy (codesys_mnemonic,
377 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
378
379 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
380 strcpy (codesys_name + len, "-" op_sys); \
381 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
382 codesys_name_sym = intern (codesys_name); \
383 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
384 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
385 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = build_string(codesys_mnemonic); \
386 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
387 } while (0)
388
389 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
390 DEFINE_SUB_CODESYS("dos", "(T)", EOL_CRLF);
391 DEFINE_SUB_CODESYS("mac", "(t)", EOL_CR);
392 }
393
394 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
395 T if OBJECT is a coding system.
396 A coding system is an object that defines how text containing multiple
397 character sets is encoded into a stream of (typically 8-bit) bytes.
398 The coding system is used to decode the stream into a series of
399 characters (which may be from multiple charsets) when the text is read
400 from a file or process, and is used to encode the text back into the
401 same format when it is written out to a file or process.
402
403 For example, many ISO2022-compliant coding systems (such as Compound
404 Text, which is used for inter-client data under the X Window System)
405 use escape sequences to switch between different charsets -- Japanese
406 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
407 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
408 `make-coding-system' for more information.
409
410 Coding systems are normally identified using a symbol, and the
411 symbol is accepted in place of the actual coding system object whenever
412 a coding system is called for. (This is similar to how faces work.)
413 */
414 (object))
415 {
416 return CODING_SYSTEMP (object) ? Qt : Qnil;
417 }
418
419 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
420 Retrieve the coding system of the given name.
421
422 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
423 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
424 If there is no such coding system, nil is returned. Otherwise the
425 associated coding system object is returned.
426 */
427 (coding_system_or_name))
428 {
429 if (NILP (coding_system_or_name))
430 coding_system_or_name = Qbinary;
431 if (CODING_SYSTEMP (coding_system_or_name))
432 return coding_system_or_name;
433 CHECK_SYMBOL (coding_system_or_name);
434
435 return Fgethash (coding_system_or_name, Vcoding_system_hashtable, Qnil);
436 }
437
438 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
439 Retrieve the coding system of the given name.
440 Same as `find-coding-system' except that if there is no such
441 coding system, an error is signaled instead of returning nil.
442 */
443 (name))
444 {
445 Lisp_Object coding_system = Ffind_coding_system (name);
446
447 if (NILP (coding_system))
448 signal_simple_error ("No such coding system", name);
449 return coding_system;
450 }
451
452 /* We store the coding systems in hash tables with the names as the key and the
453 actual coding system object as the value. Occasionally we need to use them
454 in a list format. These routines provide us with that. */
455 struct coding_system_list_closure
456 {
457 Lisp_Object *coding_system_list;
458 };
459
460 static int
461 add_coding_system_to_list_mapper (CONST void *hash_key, void *hash_contents,
462 void *coding_system_list_closure)
463 {
464 /* This function can GC */
465 Lisp_Object key, contents;
466 Lisp_Object *coding_system_list;
467 struct coding_system_list_closure *cscl =
468 (struct coding_system_list_closure *) coding_system_list_closure;
469 CVOID_TO_LISP (key, hash_key);
470 VOID_TO_LISP (contents, hash_contents);
471 coding_system_list = cscl->coding_system_list;
472
473 *coding_system_list = Fcons (XCODING_SYSTEM (contents)->name,
474 *coding_system_list);
475 return 0;
476 }
477
478 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
479 Return a list of the names of all defined coding systems.
480 */
481 ())
482 {
483 Lisp_Object coding_system_list = Qnil;
484 struct gcpro gcpro1;
485 struct coding_system_list_closure coding_system_list_closure;
486
487 GCPRO1 (coding_system_list);
488 coding_system_list_closure.coding_system_list = &coding_system_list;
489 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hashtable,
490 &coding_system_list_closure);
491 UNGCPRO;
492
493 return coding_system_list;
494 }
495
496 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
497 Return the name of the given coding system.
498 */
499 (coding_system))
500 {
501 coding_system = Fget_coding_system (coding_system);
502 return XCODING_SYSTEM_NAME (coding_system);
503 }
504
505 static struct Lisp_Coding_System *
506 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
507 {
508 struct Lisp_Coding_System *codesys =
509 alloc_lcrecord_type (struct Lisp_Coding_System, lrecord_coding_system);
510
511 zero_lcrecord (codesys);
512 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
513 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
514 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
515 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
516 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
517 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
518 CODING_SYSTEM_TYPE (codesys) = type;
519 #ifdef MULE
520 if (type == CODESYS_ISO2022)
521 {
522 int i;
523 for (i = 0; i < 4; i++)
524 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
525 }
526 else if (type == CODESYS_CCL)
527 {
528 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
529 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
530 }
531 #endif
532 CODING_SYSTEM_NAME (codesys) = name;
533
534 return codesys;
535 }
536
537 #ifdef MULE
538 /* Given a list of charset conversion specs as specified in a Lisp
539 program, parse it into STORE_HERE. */
540
541 static void
542 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
543 Lisp_Object spec_list)
544 {
545 Lisp_Object rest;
546
547 EXTERNAL_LIST_LOOP (rest, spec_list)
548 {
549 Lisp_Object car = XCAR (rest);
550 Lisp_Object from, to;
551 struct charset_conversion_spec spec;
552
553 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
554 signal_simple_error ("Invalid charset conversion spec", car);
555 from = Fget_charset (XCAR (car));
556 to = Fget_charset (XCAR (XCDR (car)));
557 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
558 signal_simple_error_2
559 ("Attempted conversion between different charset types",
560 from, to);
561 spec.from_charset = from;
562 spec.to_charset = to;
563
564 Dynarr_add (store_here, spec);
565 }
566 }
567
568 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
569 specs, return the equivalent as the Lisp programmer would see it.
570
571 If LOAD_HERE is 0, return Qnil. */
572
573 static Lisp_Object
574 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
575 {
576 int i;
577 Lisp_Object result = Qnil;
578
579 if (!load_here)
580 return Qnil;
581 for (i = 0; i < Dynarr_length (load_here); i++)
582 {
583 struct charset_conversion_spec *ccs =
584 Dynarr_atp (load_here, i);
585 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
586 }
587
588 return Fnreverse (result);
589 }
590
591 #endif
592
593 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
594 Register symbol NAME as a coding system.
595
596 TYPE describes the conversion method used and should be one of
597
598 nil or 'undecided
599 Automatic conversion. XEmacs attempts to detect the coding system
600 used in the file.
601 'no-conversion
602 No conversion. Use this for binary files and such. On output,
603 graphic characters that are not in ASCII or Latin-1 will be
604 replaced by a ?. (For a no-conversion-encoded buffer, these
605 characters will only be present if you explicitly insert them.)
606 'shift-jis
607 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
608 'iso2022
609 Any ISO2022-compliant encoding. Among other things, this includes
610 JIS (the Japanese encoding commonly used for e-mail), EUC (the
611 standard Unix encoding for Japanese and other languages), and
612 Compound Text (the encoding used in X11). You can specify more
613 specific information about the conversion with the FLAGS argument.
614 'big5
615 Big5 (the encoding commonly used for Taiwanese).
616 'ccl
617 The conversion is performed using a user-written pseudo-code
618 program. CCL (Code Conversion Language) is the name of this
619 pseudo-code.
620 'internal
621 Write out or read in the raw contents of the memory representing
622 the buffer's text. This is primarily useful for debugging
623 purposes, and is only enabled when XEmacs has been compiled with
624 DEBUG_XEMACS defined (via the --debug configure option).
625 WARNING: Reading in a file using 'internal conversion can result
626 in an internal inconsistency in the memory representing a
627 buffer's text, which will produce unpredictable results and may
628 cause XEmacs to crash. Under normal circumstances you should
629 never use 'internal conversion.
630
631 DOC-STRING is a string describing the coding system.
632
633 PROPS is a property list, describing the specific nature of the
634 character set. Recognized properties are:
635
636 'mnemonic
637 String to be displayed in the modeline when this coding system is
638 active.
639
640 'eol-type
641 End-of-line conversion to be used. It should be one of
642
643 nil
644 Automatically detect the end-of-line type (LF, CRLF,
645 or CR). Also generate subsidiary coding systems named
646 `NAME-unix', `NAME-dos', and `NAME-mac', that are
647 identical to this coding system but have an EOL-TYPE
648 value of 'lf, 'crlf, and 'cr, respectively.
649 'lf
650 The end of a line is marked externally using ASCII LF.
651 Since this is also the way that XEmacs represents an
652 end-of-line internally, specifying this option results
653 in no end-of-line conversion. This is the standard
654 format for Unix text files.
655 'crlf
656 The end of a line is marked externally using ASCII
657 CRLF. This is the standard format for MS-DOS text
658 files.
659 'cr
660 The end of a line is marked externally using ASCII CR.
661 This is the standard format for Macintosh text files.
662 t
663 Automatically detect the end-of-line type but do not
664 generate subsidiary coding systems. (This value is
665 converted to nil when stored internally, and
666 `coding-system-property' will return nil.)
667
668 'post-read-conversion
669 Function called after a file has been read in, to perform the
670 decoding. Called with two arguments, BEG and END, denoting
671 a region of the current buffer to be decoded.
672
673 'pre-write-conversion
674 Function called before a file is written out, to perform the
675 encoding. Called with two arguments, BEG and END, denoting
676 a region of the current buffer to be encoded.
677
678
679 The following additional properties are recognized if TYPE is 'iso2022:
680
681 'charset-g0
682 'charset-g1
683 'charset-g2
684 'charset-g3
685 The character set initially designated to the G0 - G3 registers.
686 The value should be one of
687
688 -- A charset object (designate that character set)
689 -- nil (do not ever use this register)
690 -- t (no character set is initially designated to
691 the register, but may be later on; this automatically
692 sets the corresponding `force-g*-on-output' property)
693
694 'force-g0-on-output
695 'force-g1-on-output
696 'force-g2-on-output
697 'force-g2-on-output
698 If non-nil, send an explicit designation sequence on output before
699 using the specified register.
700
701 'short
702 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
703 "ESC $ B" on output in place of the full designation sequences
704 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
705
706 'no-ascii-eol
707 If non-nil, don't designate ASCII to G0 at each end of line on output.
708 Setting this to non-nil also suppresses other state-resetting that
709 normally happens at the end of a line.
710
711 'no-ascii-cntl
712 If non-nil, don't designate ASCII to G0 before control chars on output.
713
714 'seven
715 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
716 environment.
717
718 'lock-shift
719 If non-nil, use locking-shift (SO/SI) instead of single-shift
720 or designation by escape sequence.
721
722 'no-iso6429
723 If non-nil, don't use ISO6429's direction specification.
724
725 'escape-quoted
726 If non-nil, literal control characters that are the same as
727 the beginning of a recognized ISO2022 or ISO6429 escape sequence
728 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
729 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
730 so that they can be properly distinguished from an escape sequence.
731 (Note that doing this results in a non-portable encoding.) This
732 encoding flag is used for byte-compiled files. Note that ESC
733 is a good choice for a quoting character because there are no
734 escape sequences whose second byte is a character from the Control-0
735 or Control-1 character sets; this is explicitly disallowed by the
736 ISO2022 standard.
737
738 'input-charset-conversion
739 A list of conversion specifications, specifying conversion of
740 characters in one charset to another when decoding is performed.
741 Each specification is a list of two elements: the source charset,
742 and the destination charset.
743
744 'output-charset-conversion
745 A list of conversion specifications, specifying conversion of
746 characters in one charset to another when encoding is performed.
747 The form of each specification is the same as for
748 'input-charset-conversion.
749
750
751 The following additional properties are recognized (and required)
752 if TYPE is 'ccl:
753
754 'decode
755 CCL program used for decoding (converting to internal format).
756
757 'encode
758 CCL program used for encoding (converting to external format).
759 */
760 (name, type, doc_string, props))
761 {
762 struct Lisp_Coding_System *codesys;
763 Lisp_Object rest, key, value;
764 enum coding_system_type ty;
765 int need_to_setup_eol_systems = 1;
766
767 /* Convert type to constant */
768 if (NILP (type) || EQ (type, Qundecided))
769 { ty = CODESYS_AUTODETECT; }
770 #ifdef MULE
771 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
772 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
773 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
774 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
775 #endif
776 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
777 #ifdef DEBUG_XEMACS
778 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
779 #endif
780 else
781 signal_simple_error ("Invalid coding system type", type);
782
783 CHECK_SYMBOL (name);
784
785 codesys = allocate_coding_system (ty, name);
786
787 if (NILP (doc_string))
788 doc_string = build_string ("");
789 else
790 CHECK_STRING (doc_string);
791 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
792
793 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
794 {
795 if (EQ (key, Qmnemonic))
796 {
797 if (!NILP (value))
798 CHECK_STRING (value);
799 CODING_SYSTEM_MNEMONIC (codesys) = value;
800 }
801
802 else if (EQ (key, Qeol_type))
803 {
804 need_to_setup_eol_systems = NILP (value);
805 if (EQ (value, Qt))
806 value = Qnil;
807 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
808 }
809
810 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
811 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
812 #ifdef MULE
813 else if (ty == CODESYS_ISO2022)
814 {
815 #define FROB_INITIAL_CHARSET(charset_num) \
816 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
817 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
818
819 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
820 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
821 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
822 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
823
824 #define FROB_FORCE_CHARSET(charset_num) \
825 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
826
827 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
828 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
829 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
830 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
831
832 #define FROB_BOOLEAN_PROPERTY(prop) \
833 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
834
835 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
836 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
837 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
838 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
839 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
840 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
841 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
842
843 else if (EQ (key, Qinput_charset_conversion))
844 {
845 codesys->iso2022.input_conv =
846 Dynarr_new (charset_conversion_spec);
847 parse_charset_conversion_specs (codesys->iso2022.input_conv,
848 value);
849 }
850 else if (EQ (key, Qoutput_charset_conversion))
851 {
852 codesys->iso2022.output_conv =
853 Dynarr_new (charset_conversion_spec);
854 parse_charset_conversion_specs (codesys->iso2022.output_conv,
855 value);
856 }
857 else
858 signal_simple_error ("Unrecognized property", key);
859 }
860 else if (EQ (type, Qccl))
861 {
862 if (EQ (key, Qdecode))
863 {
864 CHECK_VECTOR (value);
865 CODING_SYSTEM_CCL_DECODE (codesys) = value;
866 }
867 else if (EQ (key, Qencode))
868 {
869 CHECK_VECTOR (value);
870 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
871 }
872 else
873 signal_simple_error ("Unrecognized property", key);
874 }
875 #endif /* MULE */
876 else
877 signal_simple_error ("Unrecognized property", key);
878 }
879
880 if (need_to_setup_eol_systems)
881 setup_eol_coding_systems (codesys);
882
883 {
884 Lisp_Object codesys_obj;
885 XSETCODING_SYSTEM (codesys_obj, codesys);
886 Fputhash (name, codesys_obj, Vcoding_system_hashtable);
887 return codesys_obj;
888 }
889 }
890
891 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
892 Copy OLD-CODING-SYSTEM to NEW-NAME.
893 If NEW-NAME does not name an existing coding system, a new one will
894 be created.
895 */
896 (old_coding_system, new_name))
897 {
898 Lisp_Object new_coding_system;
899 old_coding_system = Fget_coding_system (old_coding_system);
900 new_coding_system = Ffind_coding_system (new_name);
901 if (NILP (new_coding_system))
902 {
903 XSETCODING_SYSTEM (new_coding_system,
904 allocate_coding_system
905 (XCODING_SYSTEM_TYPE (old_coding_system),
906 new_name));
907 Fputhash (new_name, new_coding_system, Vcoding_system_hashtable);
908 }
909
910 {
911 struct Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
912 struct Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
913 memcpy (((char *) to ) + sizeof (to->header),
914 ((char *) from) + sizeof (from->header),
915 sizeof (*from) - sizeof (from->header));
916 to->name = new_name;
917 }
918 return new_coding_system;
919 }
920
921 static Lisp_Object
922 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
923 {
924 struct Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
925 Lisp_Object new_coding_system;
926
927 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
928 return coding_system;
929
930 switch (type)
931 {
932 case EOL_AUTODETECT: return coding_system;
933 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
934 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
935 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
936 default: abort ();
937 }
938
939 return NILP (new_coding_system) ? coding_system : new_coding_system;
940 }
941
942 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
943 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
944 */
945 (coding_system, eol_type))
946 {
947 coding_system = Fget_coding_system (coding_system);
948
949 return subsidiary_coding_system (coding_system,
950 symbol_to_eol_type (eol_type));
951 }
952
953
954 /************************************************************************/
955 /* Coding system accessors */
956 /************************************************************************/
957
958 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
959 Return the doc string for CODING-SYSTEM.
960 */
961 (coding_system))
962 {
963 coding_system = Fget_coding_system (coding_system);
964 return XCODING_SYSTEM_DOC_STRING (coding_system);
965 }
966
967 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
968 Return the type of CODING-SYSTEM.
969 */
970 (coding_system))
971 {
972 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
973 {
974 case CODESYS_AUTODETECT: return Qundecided;
975 #ifdef MULE
976 case CODESYS_SHIFT_JIS: return Qshift_jis;
977 case CODESYS_ISO2022: return Qiso2022;
978 case CODESYS_BIG5: return Qbig5;
979 case CODESYS_CCL: return Qccl;
980 #endif
981 case CODESYS_NO_CONVERSION: return Qno_conversion;
982 #ifdef DEBUG_XEMACS
983 case CODESYS_INTERNAL: return Qinternal;
984 #endif
985 default:
986 abort ();
987 }
988
989 return Qnil; /* not reached */
990 }
991
992 #ifdef MULE
993 static
994 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
995 {
996 Lisp_Object cs
997 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
998
999 if (CHARSETP(cs)){
1000 return XCHARSET_NAME(cs);
1001 }
1002 else {
1003 return Qnil;
1004 }
1005 }
1006
1007 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1008 Return initial charset of CODING-SYSTEM designated to GNUM.
1009 GNUM allows 0 .. 3.
1010 */
1011 (coding_system, gnum))
1012 {
1013 coding_system = Fget_coding_system (coding_system);
1014 CHECK_INT (gnum);
1015
1016 return coding_system_charset(coding_system, XINT (gnum));
1017 }
1018 #endif
1019
1020 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1021 Return the PROP property of CODING-SYSTEM.
1022 */
1023 (coding_system, prop))
1024 {
1025 int i, ok = 0;
1026 enum coding_system_type type;
1027
1028 coding_system = Fget_coding_system (coding_system);
1029 CHECK_SYMBOL (prop);
1030 type = XCODING_SYSTEM_TYPE (coding_system);
1031
1032 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1033 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1034 {
1035 ok = 1;
1036 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1037 {
1038 case CODESYS_PROP_ALL_OK:
1039 break;
1040 #ifdef MULE
1041 case CODESYS_PROP_ISO2022:
1042 if (type != CODESYS_ISO2022)
1043 signal_simple_error
1044 ("Property only valid in ISO2022 coding systems",
1045 prop);
1046 break;
1047
1048 case CODESYS_PROP_CCL:
1049 if (type != CODESYS_CCL)
1050 signal_simple_error
1051 ("Property only valid in CCL coding systems",
1052 prop);
1053 break;
1054 #endif /* MULE */
1055 default:
1056 abort ();
1057 }
1058 }
1059
1060 if (!ok)
1061 signal_simple_error ("Unrecognized property", prop);
1062
1063 if (EQ (prop, Qname))
1064 return XCODING_SYSTEM_NAME (coding_system);
1065 else if (EQ (prop, Qtype))
1066 return Fcoding_system_type (coding_system);
1067 else if (EQ (prop, Qdoc_string))
1068 return XCODING_SYSTEM_DOC_STRING (coding_system);
1069 else if (EQ (prop, Qmnemonic))
1070 return XCODING_SYSTEM_MNEMONIC (coding_system);
1071 else if (EQ (prop, Qeol_type))
1072 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1073 else if (EQ (prop, Qeol_lf))
1074 return XCODING_SYSTEM_EOL_LF (coding_system);
1075 else if (EQ (prop, Qeol_crlf))
1076 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1077 else if (EQ (prop, Qeol_cr))
1078 return XCODING_SYSTEM_EOL_CR (coding_system);
1079 else if (EQ (prop, Qpost_read_conversion))
1080 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1081 else if (EQ (prop, Qpre_write_conversion))
1082 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1083 #ifdef MULE
1084 else if (type == CODESYS_ISO2022)
1085 {
1086 if (EQ (prop, Qcharset_g0))
1087 return coding_system_charset (coding_system, 0);
1088 else if (EQ (prop, Qcharset_g1))
1089 return coding_system_charset (coding_system, 1);
1090 else if (EQ (prop, Qcharset_g2))
1091 return coding_system_charset (coding_system, 2);
1092 else if (EQ (prop, Qcharset_g3))
1093 return coding_system_charset (coding_system, 3);
1094
1095 #define FORCE_CHARSET(charset_num) \
1096 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1097 (coding_system, charset_num) ? Qt : Qnil)
1098
1099 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1100 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1101 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1102 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1103
1104 #define LISP_BOOLEAN(prop) \
1105 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1106
1107 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1108 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1109 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1110 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1111 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1112 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1113 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1114
1115 else if (EQ (prop, Qinput_charset_conversion))
1116 return
1117 unparse_charset_conversion_specs
1118 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1119 else if (EQ (prop, Qoutput_charset_conversion))
1120 return
1121 unparse_charset_conversion_specs
1122 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1123 else
1124 abort ();
1125 }
1126 else if (type == CODESYS_CCL)
1127 {
1128 if (EQ (prop, Qdecode))
1129 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1130 else if (EQ (prop, Qencode))
1131 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1132 else
1133 abort ();
1134 }
1135 #endif /* MULE */
1136 else
1137 abort ();
1138
1139 return Qnil; /* not reached */
1140 }
1141
1142
1143 /************************************************************************/
1144 /* Coding category functions */
1145 /************************************************************************/
1146
1147 static int
1148 decode_coding_category (Lisp_Object symbol)
1149 {
1150 int i;
1151
1152 CHECK_SYMBOL (symbol);
1153 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1154 if (EQ (coding_category_symbol[i], symbol))
1155 return i;
1156
1157 signal_simple_error ("Unrecognized coding category", symbol);
1158 return 0; /* not reached */
1159 }
1160
1161 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1162 Return a list of all recognized coding categories.
1163 */
1164 ())
1165 {
1166 int i;
1167 Lisp_Object list = Qnil;
1168
1169 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1170 list = Fcons (coding_category_symbol[i], list);
1171 return list;
1172 }
1173
1174 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1175 Change the priority order of the coding categories.
1176 LIST should be list of coding categories, in descending order of
1177 priority. Unspecified coding categories will be lower in priority
1178 than all specified ones, in the same relative order they were in
1179 previously.
1180 */
1181 (list))
1182 {
1183 int category_to_priority[CODING_CATEGORY_LAST + 1];
1184 int i, j;
1185 Lisp_Object rest;
1186
1187 /* First generate a list that maps coding categories to priorities. */
1188
1189 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1190 category_to_priority[i] = -1;
1191
1192 /* Highest priority comes from the specified list. */
1193 i = 0;
1194 EXTERNAL_LIST_LOOP (rest, list)
1195 {
1196 int cat = decode_coding_category (XCAR (rest));
1197
1198 if (category_to_priority[cat] >= 0)
1199 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1200 category_to_priority[cat] = i++;
1201 }
1202
1203 /* Now go through the existing categories by priority to retrieve
1204 the categories not yet specified and preserve their priority
1205 order. */
1206 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1207 {
1208 int cat = coding_category_by_priority[j];
1209 if (category_to_priority[cat] < 0)
1210 category_to_priority[cat] = i++;
1211 }
1212
1213 /* Now we need to construct the inverse of the mapping we just
1214 constructed. */
1215
1216 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1217 coding_category_by_priority[category_to_priority[i]] = i;
1218
1219 /* Phew! That was confusing. */
1220 return Qnil;
1221 }
1222
1223 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1224 Return a list of coding categories in descending order of priority.
1225 */
1226 ())
1227 {
1228 int i;
1229 Lisp_Object list = Qnil;
1230
1231 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1232 list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1233 list);
1234 return list;
1235 }
1236
1237 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1238 Change the coding system associated with a coding category.
1239 */
1240 (coding_category, coding_system))
1241 {
1242 int cat = decode_coding_category (coding_category);
1243
1244 coding_system = Fget_coding_system (coding_system);
1245 coding_category_system[cat] = coding_system;
1246 return Qnil;
1247 }
1248
1249 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1250 Return the coding system associated with a coding category.
1251 */
1252 (coding_category))
1253 {
1254 int cat = decode_coding_category (coding_category);
1255 Lisp_Object sys = coding_category_system[cat];
1256
1257 if (!NILP (sys))
1258 return XCODING_SYSTEM_NAME (sys);
1259 return Qnil;
1260 }
1261
1262
1263 /************************************************************************/
1264 /* Detecting the encoding of data */
1265 /************************************************************************/
1266
1267 struct detection_state
1268 {
1269 enum eol_type eol_type;
1270 int seen_non_ascii;
1271 int mask;
1272 #ifdef MULE
1273 struct
1274 {
1275 int mask;
1276 int in_second_byte;
1277 }
1278 big5;
1279
1280 struct
1281 {
1282 int mask;
1283 int in_second_byte;
1284 }
1285 shift_jis;
1286
1287 struct
1288 {
1289 int mask;
1290 int initted;
1291 struct iso2022_decoder iso;
1292 unsigned int flags;
1293 int high_byte_count;
1294 unsigned int saw_single_shift:1;
1295 }
1296 iso2022;
1297 #endif
1298 struct
1299 {
1300 int seen_anything;
1301 int just_saw_cr;
1302 }
1303 eol;
1304 };
1305
1306 static int
1307 acceptable_control_char_p (int c)
1308 {
1309 switch (c)
1310 {
1311 /* Allow and ignore control characters that you might
1312 reasonably see in a text file */
1313 case '\r':
1314 case '\n':
1315 case '\t':
1316 case 7: /* bell */
1317 case 8: /* backspace */
1318 case 11: /* vertical tab */
1319 case 12: /* form feed */
1320 case 26: /* MS-DOS C-z junk */
1321 case 31: /* '^_' -- for info */
1322 return 1;
1323 default:
1324 return 0;
1325 }
1326 }
1327
1328 static int
1329 mask_has_at_most_one_bit_p (int mask)
1330 {
1331 /* Perhaps the only thing useful you learn from intensive Microsoft
1332 technical interviews */
1333 return (mask & (mask - 1)) == 0;
1334 }
1335
1336 static enum eol_type
1337 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1338 unsigned int n)
1339 {
1340 int c;
1341
1342 while (n--)
1343 {
1344 c = *src++;
1345 if (c == '\r')
1346 st->eol.just_saw_cr = 1;
1347 else
1348 {
1349 if (c == '\n')
1350 {
1351 if (st->eol.just_saw_cr)
1352 return EOL_CRLF;
1353 else if (st->eol.seen_anything)
1354 return EOL_LF;
1355 }
1356 else if (st->eol.just_saw_cr)
1357 return EOL_CR;
1358 st->eol.just_saw_cr = 0;
1359 }
1360 st->eol.seen_anything = 1;
1361 }
1362
1363 return EOL_AUTODETECT;
1364 }
1365
1366 /* Attempt to determine the encoding and EOL type of the given text.
1367 Before calling this function for the first type, you must initialize
1368 st->eol_type as appropriate and initialize st->mask to ~0.
1369
1370 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1371 not yet known.
1372
1373 st->mask holds the determined coding category mask, or ~0 if only
1374 ASCII has been seen so far.
1375
1376 Returns:
1377
1378 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1379 is present in st->mask
1380 1 == definitive answers are here for both st->eol_type and st->mask
1381 */
1382
1383 static int
1384 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1385 unsigned int n, int just_do_eol)
1386 {
1387 int c;
1388
1389 if (st->eol_type == EOL_AUTODETECT)
1390 st->eol_type = detect_eol_type (st, src, n);
1391
1392 if (just_do_eol)
1393 return st->eol_type != EOL_AUTODETECT;
1394
1395 if (!st->seen_non_ascii)
1396 {
1397 for (; n; n--, src++)
1398 {
1399 c = *src;
1400 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1401 {
1402 st->seen_non_ascii = 1;
1403 #ifdef MULE
1404 st->shift_jis.mask = ~0;
1405 st->big5.mask = ~0;
1406 st->iso2022.mask = ~0;
1407 #endif
1408 break;
1409 }
1410 }
1411 }
1412
1413 if (!n)
1414 return 0;
1415 #ifdef MULE
1416 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1417 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1418 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1419 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1420 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1421 st->big5.mask = detect_coding_big5 (st, src, n);
1422
1423 st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask;
1424 #endif
1425 {
1426 int retval = mask_has_at_most_one_bit_p (st->mask);
1427 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1428 return retval && st->eol_type != EOL_AUTODETECT;
1429 }
1430 }
1431
1432 static Lisp_Object
1433 coding_system_from_mask (int mask)
1434 {
1435 if (mask == ~0)
1436 {
1437 /* If the file was entirely or basically ASCII, use the
1438 default value of `buffer-file-coding-system'. */
1439 Lisp_Object retval =
1440 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1441 if (!NILP (retval))
1442 {
1443 retval = Ffind_coding_system (retval);
1444 if (NILP (retval))
1445 {
1446 warn_when_safe
1447 (Qbad_variable, Qwarning,
1448 "Invalid `default-buffer-file-coding-system', set to nil");
1449 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1450 }
1451 }
1452 if (NILP (retval))
1453 retval = Fget_coding_system (Qno_conversion);
1454 return retval;
1455 }
1456 else
1457 {
1458 int i;
1459 int cat = -1;
1460 #ifdef MULE
1461 mask = postprocess_iso2022_mask (mask);
1462 #endif
1463 /* Look through the coding categories by priority and find
1464 the first one that is allowed. */
1465 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1466 {
1467 cat = coding_category_by_priority[i];
1468 if ((mask & (1 << cat)) &&
1469 !NILP (coding_category_system[cat]))
1470 break;
1471 }
1472 if (cat >= 0)
1473 return coding_category_system[cat];
1474 else
1475 return Fget_coding_system (Qno_conversion);
1476 }
1477 }
1478
1479 /* Given a seekable read stream and potential coding system and EOL type
1480 as specified, do any autodetection that is called for. If the
1481 coding system and/or EOL type are not autodetect, they will be left
1482 alone; but this function will never return an autodetect coding system
1483 or EOL type.
1484
1485 This function does not automatically fetch subsidiary coding systems;
1486 that should be unnecessary with the explicit eol-type argument. */
1487
1488 void
1489 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1490 enum eol_type *eol_type_in_out)
1491 {
1492 struct detection_state decst;
1493
1494 if (*eol_type_in_out == EOL_AUTODETECT)
1495 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1496
1497 memset (&decst, 0, sizeof (decst));
1498 decst.eol_type = *eol_type_in_out;
1499 decst.mask = ~0;
1500
1501 /* If autodetection is called for, do it now. */
1502 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1503 *eol_type_in_out == EOL_AUTODETECT)
1504 {
1505
1506 while (1)
1507 {
1508 unsigned char random_buffer[4096];
1509 int nread;
1510
1511 nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1512 if (!nread)
1513 break;
1514 if (detect_coding_type (&decst, random_buffer, nread,
1515 XCODING_SYSTEM_TYPE (*codesys_in_out) !=
1516 CODESYS_AUTODETECT))
1517 break;
1518 }
1519
1520 *eol_type_in_out = decst.eol_type;
1521 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1522 *codesys_in_out = coding_system_from_mask (decst.mask);
1523 }
1524
1525 /* If we absolutely can't determine the EOL type, just assume LF. */
1526 if (*eol_type_in_out == EOL_AUTODETECT)
1527 *eol_type_in_out = EOL_LF;
1528
1529 Lstream_rewind (stream);
1530 }
1531
1532 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1533 Detect coding system of the text in the region between START and END.
1534 Returned a list of possible coding systems ordered by priority.
1535 If only ASCII characters are found, it returns 'undecided or one of
1536 its subsidiary coding systems according to a detected end-of-line
1537 type. Optional arg BUFFER defaults to the current buffer.
1538 */
1539 (start, end, buffer))
1540 {
1541 Lisp_Object val = Qnil;
1542 struct buffer *buf = decode_buffer (buffer, 0);
1543 Bufpos b, e;
1544 Lisp_Object instream, lb_instream;
1545 Lstream *istr, *lb_istr;
1546 struct detection_state decst;
1547 struct gcpro gcpro1, gcpro2;
1548
1549 get_buffer_range_char (buf, start, end, &b, &e, 0);
1550 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1551 lb_istr = XLSTREAM (lb_instream);
1552 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1553 istr = XLSTREAM (instream);
1554 GCPRO2 (instream, lb_instream);
1555 memset (&decst, 0, sizeof (decst));
1556 decst.eol_type = EOL_AUTODETECT;
1557 decst.mask = ~0;
1558 while (1)
1559 {
1560 unsigned char random_buffer[4096];
1561 int nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1562
1563 if (!nread)
1564 break;
1565 if (detect_coding_type (&decst, random_buffer, nread, 0))
1566 break;
1567 }
1568
1569 if (decst.mask == ~0)
1570 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1571 decst.eol_type);
1572 else
1573 {
1574 int i;
1575
1576 val = Qnil;
1577 #ifdef MULE
1578 decst.mask = postprocess_iso2022_mask (decst.mask);
1579 #endif
1580 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1581 {
1582 int sys = coding_category_by_priority[i];
1583 if (decst.mask & (1 << sys))
1584 {
1585 Lisp_Object codesys = coding_category_system[sys];
1586 if (!NILP (codesys))
1587 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1588 val = Fcons (codesys, val);
1589 }
1590 }
1591 }
1592 Lstream_close (istr);
1593 UNGCPRO;
1594 Lstream_delete (istr);
1595 Lstream_delete (lb_istr);
1596 return val;
1597 }
1598
1599
1600 /************************************************************************/
1601 /* Converting to internal Mule format ("decoding") */
1602 /************************************************************************/
1603
1604 /* A decoding stream is a stream used for decoding text (i.e.
1605 converting from some external format to internal format).
1606 The decoding-stream object keeps track of the actual coding
1607 stream, the stream that is at the other end, and data that
1608 needs to be persistent across the lifetime of the stream. */
1609
1610 /* Handle the EOL stuff related to just-read-in character C.
1611 EOL_TYPE is the EOL type of the coding stream.
1612 FLAGS is the current value of FLAGS in the coding stream, and may
1613 be modified by this macro. (The macro only looks at the
1614 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1615 bytes are to be written. You need to also define a local goto
1616 label "label_continue_loop" that is at the end of the main
1617 character-reading loop.
1618
1619 If C is a CR character, then this macro handles it entirely and
1620 jumps to label_continue_loop. Otherwise, this macro does not add
1621 anything to DST, and continues normally. You should continue
1622 processing C normally after this macro. */
1623
1624 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1625 do { \
1626 if (c == '\r') \
1627 { \
1628 if (eol_type == EOL_CR) \
1629 Dynarr_add (dst, '\n'); \
1630 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1631 Dynarr_add (dst, c); \
1632 else \
1633 flags |= CODING_STATE_CR; \
1634 goto label_continue_loop; \
1635 } \
1636 else if (flags & CODING_STATE_CR) \
1637 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1638 if (c != '\n') \
1639 Dynarr_add (dst, '\r'); \
1640 flags &= ~CODING_STATE_CR; \
1641 } \
1642 } while (0)
1643
1644 /* C should be a binary character in the range 0 - 255; convert
1645 to internal format and add to Dynarr DST. */
1646
1647 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1648 do { \
1649 if (BYTE_ASCII_P (c)) \
1650 Dynarr_add (dst, c); \
1651 else if (BYTE_C1_P (c)) \
1652 { \
1653 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1654 Dynarr_add (dst, c + 0x20); \
1655 } \
1656 else \
1657 { \
1658 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1659 Dynarr_add (dst, c); \
1660 } \
1661 } while (0)
1662
1663 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1664 do { \
1665 if (ch) \
1666 { \
1667 DECODE_ADD_BINARY_CHAR (ch, dst); \
1668 ch = 0; \
1669 } \
1670 } while (0)
1671
1672 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1673 do { \
1674 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1675 if ((flags & CODING_STATE_END) && \
1676 (flags & CODING_STATE_CR)) \
1677 Dynarr_add (dst, '\r'); \
1678 } while (0)
1679
1680 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1681
1682 struct decoding_stream
1683 {
1684 /* Coding system that governs the conversion. */
1685 struct Lisp_Coding_System *codesys;
1686
1687 /* Stream that we read the encoded data from or
1688 write the decoded data to. */
1689 Lstream *other_end;
1690
1691 /* If we are reading, then we can return only a fixed amount of
1692 data, so if the conversion resulted in too much data, we store it
1693 here for retrieval the next time around. */
1694 unsigned_char_dynarr *runoff;
1695
1696 /* FLAGS holds flags indicating the current state of the decoding.
1697 Some of these flags are dependent on the coding system. */
1698 unsigned int flags;
1699
1700 /* CH holds a partially built-up character. Since we only deal
1701 with one- and two-byte characters at the moment, we only use
1702 this to store the first byte of a two-byte character. */
1703 unsigned int ch;
1704
1705 /* EOL_TYPE specifies the type of end-of-line conversion that
1706 currently applies. We need to keep this separate from the
1707 EOL type stored in CODESYS because the latter might indicate
1708 automatic EOL-type detection while the former will always
1709 indicate a particular EOL type. */
1710 enum eol_type eol_type;
1711 #ifdef MULE
1712 /* Additional ISO2022 information. We define the structure above
1713 because it's also needed by the detection routines. */
1714 struct iso2022_decoder iso2022;
1715
1716 /* Additional information (the state of the running CCL program)
1717 used by the CCL decoder. */
1718 struct ccl_program ccl;
1719 #endif
1720 struct detection_state decst;
1721 };
1722
1723 static int decoding_reader (Lstream *stream, unsigned char *data, int size);
1724 static int decoding_writer (Lstream *stream, CONST unsigned char *data, int size);
1725 static int decoding_rewinder (Lstream *stream);
1726 static int decoding_seekable_p (Lstream *stream);
1727 static int decoding_flusher (Lstream *stream);
1728 static int decoding_closer (Lstream *stream);
1729
1730 static Lisp_Object decoding_marker (Lisp_Object stream,
1731 void (*markobj) (Lisp_Object));
1732
1733 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1734 sizeof (struct decoding_stream));
1735
1736 static Lisp_Object
1737 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1738 {
1739 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1740 Lisp_Object str_obj;
1741
1742 /* We do not need to mark the coding systems or charsets stored
1743 within the stream because they are stored in a global list
1744 and automatically marked. */
1745
1746 XSETLSTREAM (str_obj, str);
1747 (markobj) (str_obj);
1748 if (str->imp->marker)
1749 return (str->imp->marker) (str_obj, markobj);
1750 else
1751 return Qnil;
1752 }
1753
1754 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1755 so we read data from the other end, decode it, and store it into DATA. */
1756
1757 static int
1758 decoding_reader (Lstream *stream, unsigned char *data, int size)
1759 {
1760 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1761 unsigned char *orig_data = data;
1762 int read_size;
1763 int error_occurred = 0;
1764
1765 /* We need to interface to mule_decode(), which expects to take some
1766 amount of data and store the result into a Dynarr. We have
1767 mule_decode() store into str->runoff, and take data from there
1768 as necessary. */
1769
1770 /* We loop until we have enough data, reading chunks from the other
1771 end and decoding it. */
1772 while (1)
1773 {
1774 /* Take data from the runoff if we can. Make sure to take at
1775 most SIZE bytes, and delete the data from the runoff. */
1776 if (Dynarr_length (str->runoff) > 0)
1777 {
1778 int chunk = min (size, Dynarr_length (str->runoff));
1779 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1780 Dynarr_delete_many (str->runoff, 0, chunk);
1781 data += chunk;
1782 size -= chunk;
1783 }
1784
1785 if (size == 0)
1786 break; /* No more room for data */
1787
1788 if (str->flags & CODING_STATE_END)
1789 /* This means that on the previous iteration, we hit the EOF on
1790 the other end. We loop once more so that mule_decode() can
1791 output any final stuff it may be holding, or any "go back
1792 to a sane state" escape sequences. (This latter makes sense
1793 during encoding.) */
1794 break;
1795
1796 /* Exhausted the runoff, so get some more. DATA has at least
1797 SIZE bytes left of storage in it, so it's OK to read directly
1798 into it. (We'll be overwriting above, after we've decoded it
1799 into the runoff.) */
1800 read_size = Lstream_read (str->other_end, data, size);
1801 if (read_size < 0)
1802 {
1803 error_occurred = 1;
1804 break;
1805 }
1806 if (read_size == 0)
1807 /* There might be some more end data produced in the translation.
1808 See the comment above. */
1809 str->flags |= CODING_STATE_END;
1810 mule_decode (stream, data, str->runoff, read_size);
1811 }
1812
1813 if (data - orig_data == 0)
1814 return error_occurred ? -1 : 0;
1815 else
1816 return data - orig_data;
1817 }
1818
1819 static int
1820 decoding_writer (Lstream *stream, CONST unsigned char *data, int size)
1821 {
1822 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1823 int retval;
1824
1825 /* Decode all our data into the runoff, and then attempt to write
1826 it all out to the other end. Remove whatever chunk we succeeded
1827 in writing. */
1828 mule_decode (stream, data, str->runoff, size);
1829 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
1830 Dynarr_length (str->runoff));
1831 if (retval > 0)
1832 Dynarr_delete_many (str->runoff, 0, retval);
1833 /* Do NOT return retval. The return value indicates how much
1834 of the incoming data was written, not how many bytes were
1835 written. */
1836 return size;
1837 }
1838
1839 static void
1840 reset_decoding_stream (struct decoding_stream *str)
1841 {
1842 #ifdef MULE
1843 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
1844 {
1845 Lisp_Object coding_system = Qnil;
1846 XSETCODING_SYSTEM (coding_system, str->codesys);
1847 reset_iso2022 (coding_system, &str->iso2022);
1848 }
1849 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
1850 {
1851 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
1852 }
1853 #endif
1854 str->flags = str->ch = 0;
1855 }
1856
1857 static int
1858 decoding_rewinder (Lstream *stream)
1859 {
1860 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1861 reset_decoding_stream (str);
1862 Dynarr_reset (str->runoff);
1863 return Lstream_rewind (str->other_end);
1864 }
1865
1866 static int
1867 decoding_seekable_p (Lstream *stream)
1868 {
1869 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1870 return Lstream_seekable_p (str->other_end);
1871 }
1872
1873 static int
1874 decoding_flusher (Lstream *stream)
1875 {
1876 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1877 return Lstream_flush (str->other_end);
1878 }
1879
1880 static int
1881 decoding_closer (Lstream *stream)
1882 {
1883 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1884 if (stream->flags & LSTREAM_FL_WRITE)
1885 {
1886 str->flags |= CODING_STATE_END;
1887 decoding_writer (stream, 0, 0);
1888 }
1889 Dynarr_free (str->runoff);
1890 #ifdef MULE
1891 if (str->iso2022.composite_chars)
1892 Dynarr_free (str->iso2022.composite_chars);
1893 #endif
1894 return Lstream_close (str->other_end);
1895 }
1896
1897 Lisp_Object
1898 decoding_stream_coding_system (Lstream *stream)
1899 {
1900 Lisp_Object coding_system = Qnil;
1901 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1902
1903 XSETCODING_SYSTEM (coding_system, str->codesys);
1904 return subsidiary_coding_system (coding_system, str->eol_type);
1905 }
1906
1907 void
1908 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
1909 {
1910 struct Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
1911 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1912 str->codesys = cs;
1913 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1914 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
1915 reset_decoding_stream (str);
1916 }
1917
1918 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
1919 stream for writing, no automatic code detection will be performed.
1920 The reason for this is that automatic code detection requires a
1921 seekable input. Things will also fail if you open a decoding
1922 stream for reading using a non-fully-specified coding system and
1923 a non-seekable input stream. */
1924
1925 static Lisp_Object
1926 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
1927 CONST char *mode)
1928 {
1929 Lstream *lstr = Lstream_new (lstream_decoding, mode);
1930 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1931 Lisp_Object obj;
1932
1933 memset (str, 0, sizeof (*str));
1934 str->other_end = stream;
1935 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
1936 str->eol_type = EOL_AUTODETECT;
1937 if (!strcmp (mode, "r")
1938 && Lstream_seekable_p (stream))
1939 /* We can determine the coding system now. */
1940 determine_real_coding_system (stream, &codesys, &str->eol_type);
1941 set_decoding_stream_coding_system (lstr, codesys);
1942 str->decst.eol_type = str->eol_type;
1943 str->decst.mask = ~0;
1944 XSETLSTREAM (obj, lstr);
1945 return obj;
1946 }
1947
1948 Lisp_Object
1949 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
1950 {
1951 return make_decoding_stream_1 (stream, codesys, "r");
1952 }
1953
1954 Lisp_Object
1955 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
1956 {
1957 return make_decoding_stream_1 (stream, codesys, "w");
1958 }
1959
1960 /* Note: the decode_coding_* functions all take the same
1961 arguments as mule_decode(), which is to say some SRC data of
1962 size N, which is to be stored into dynamic array DST.
1963 DECODING is the stream within which the decoding is
1964 taking place, but no data is actually read from or
1965 written to that stream; that is handled in decoding_reader()
1966 or decoding_writer(). This allows the same functions to
1967 be used for both reading and writing. */
1968
1969 static void
1970 mule_decode (Lstream *decoding, CONST unsigned char *src,
1971 unsigned_char_dynarr *dst, unsigned int n)
1972 {
1973 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
1974
1975 /* If necessary, do encoding-detection now. We do this when
1976 we're a writing stream or a non-seekable reading stream,
1977 meaning that we can't just process the whole input,
1978 rewind, and start over. */
1979
1980 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
1981 str->eol_type == EOL_AUTODETECT)
1982 {
1983 Lisp_Object codesys = Qnil;
1984
1985 XSETCODING_SYSTEM (codesys, str->codesys);
1986 detect_coding_type (&str->decst, src, n,
1987 CODING_SYSTEM_TYPE (str->codesys) !=
1988 CODESYS_AUTODETECT);
1989 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
1990 str->decst.mask != ~0)
1991 /* #### This is cheesy. What we really ought to do is
1992 buffer up a certain amount of data so as to get a
1993 less random result. */
1994 codesys = coding_system_from_mask (str->decst.mask);
1995 str->eol_type = str->decst.eol_type;
1996 if (XCODING_SYSTEM (codesys) != str->codesys)
1997 {
1998 /* Preserve the CODING_STATE_END flag in case it was set.
1999 If we erase it, bad things might happen. */
2000 int was_end = str->flags & CODING_STATE_END;
2001 set_decoding_stream_coding_system (decoding, codesys);
2002 if (was_end)
2003 str->flags |= CODING_STATE_END;
2004 }
2005 }
2006
2007 switch (CODING_SYSTEM_TYPE (str->codesys))
2008 {
2009 #ifdef DEBUG_XEMACS
2010 case CODESYS_INTERNAL:
2011 Dynarr_add_many (dst, src, n);
2012 break;
2013 #endif
2014 case CODESYS_AUTODETECT:
2015 /* If we got this far and still haven't decided on the coding
2016 system, then do no conversion. */
2017 case CODESYS_NO_CONVERSION:
2018 decode_coding_no_conversion (decoding, src, dst, n);
2019 break;
2020 #ifdef MULE
2021 case CODESYS_SHIFT_JIS:
2022 decode_coding_sjis (decoding, src, dst, n);
2023 break;
2024 case CODESYS_BIG5:
2025 decode_coding_big5 (decoding, src, dst, n);
2026 break;
2027 case CODESYS_CCL:
2028 ccl_driver (&str->ccl, src, dst, n, 0);
2029 break;
2030 case CODESYS_ISO2022:
2031 decode_coding_iso2022 (decoding, src, dst, n);
2032 break;
2033 #endif
2034 default:
2035 abort ();
2036 }
2037 }
2038
2039 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2040 Decode the text between START and END which is encoded in CODING-SYSTEM.
2041 This is useful if you've read in encoded text from a file without decoding
2042 it (e.g. you read in a JIS-formatted file but used the `binary' or
2043 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2044 Return length of decoded text.
2045 BUFFER defaults to the current buffer if unspecified.
2046 */
2047 (start, end, coding_system, buffer))
2048 {
2049 Bufpos b, e;
2050 struct buffer *buf = decode_buffer (buffer, 0);
2051 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2052 Lstream *istr, *ostr;
2053 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2054
2055 get_buffer_range_char (buf, start, end, &b, &e, 0);
2056
2057 barf_if_buffer_read_only (buf, b, e);
2058
2059 coding_system = Fget_coding_system (coding_system);
2060 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2061 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2062 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2063 coding_system);
2064 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2065 Fget_coding_system (Qbinary));
2066 istr = XLSTREAM (instream);
2067 ostr = XLSTREAM (outstream);
2068 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2069
2070 /* The chain of streams looks like this:
2071
2072 [BUFFER] <----- send through
2073 ------> [ENCODE AS BINARY]
2074 ------> [DECODE AS SPECIFIED]
2075 ------> [BUFFER]
2076 */
2077
2078 while (1)
2079 {
2080 char tempbuf[1024]; /* some random amount */
2081 Bufpos newpos, even_newer_pos;
2082 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2083 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2084
2085 if (!size_in_bytes)
2086 break;
2087 newpos = lisp_buffer_stream_startpos (istr);
2088 Lstream_write (ostr, tempbuf, size_in_bytes);
2089 even_newer_pos = lisp_buffer_stream_startpos (istr);
2090 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2091 even_newer_pos, 0);
2092 }
2093 Lstream_close (istr);
2094 Lstream_close (ostr);
2095 UNGCPRO;
2096 Lstream_delete (istr);
2097 Lstream_delete (ostr);
2098 Lstream_delete (XLSTREAM (de_outstream));
2099 Lstream_delete (XLSTREAM (lb_outstream));
2100 return Qnil;
2101 }
2102
2103
2104 /************************************************************************/
2105 /* Converting to an external encoding ("encoding") */
2106 /************************************************************************/
2107
2108 /* An encoding stream is an output stream. When you create the
2109 stream, you specify the coding system that governs the encoding
2110 and another stream that the resulting encoded data is to be
2111 sent to, and then start sending data to it. */
2112
2113 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2114
2115 struct encoding_stream
2116 {
2117 /* Coding system that governs the conversion. */
2118 struct Lisp_Coding_System *codesys;
2119
2120 /* Stream that we read the encoded data from or
2121 write the decoded data to. */
2122 Lstream *other_end;
2123
2124 /* If we are reading, then we can return only a fixed amount of
2125 data, so if the conversion resulted in too much data, we store it
2126 here for retrieval the next time around. */
2127 unsigned_char_dynarr *runoff;
2128
2129 /* FLAGS holds flags indicating the current state of the encoding.
2130 Some of these flags are dependent on the coding system. */
2131 unsigned int flags;
2132
2133 /* CH holds a partially built-up character. Since we only deal
2134 with one- and two-byte characters at the moment, we only use
2135 this to store the first byte of a two-byte character. */
2136 unsigned int ch;
2137 #ifdef MULE
2138 /* Additional information used by the ISO2022 encoder. */
2139 struct
2140 {
2141 /* CHARSET holds the character sets currently assigned to the G0
2142 through G3 registers. It is initialized from the array
2143 INITIAL_CHARSET in CODESYS. */
2144 Lisp_Object charset[4];
2145
2146 /* Which registers are currently invoked into the left (GL) and
2147 right (GR) halves of the 8-bit encoding space? */
2148 int register_left, register_right;
2149
2150 /* Whether we need to explicitly designate the charset in the
2151 G? register before using it. It is initialized from the
2152 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2153 unsigned char force_charset_on_output[4];
2154
2155 /* Other state variables that need to be preserved across
2156 invocations. */
2157 Lisp_Object current_charset;
2158 int current_half;
2159 int current_char_boundary;
2160 } iso2022;
2161
2162 /* Additional information (the state of the running CCL program)
2163 used by the CCL encoder. */
2164 struct ccl_program ccl;
2165 #endif
2166 };
2167
2168 static int encoding_reader (Lstream *stream, unsigned char *data, int size);
2169 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2170 int size);
2171 static int encoding_rewinder (Lstream *stream);
2172 static int encoding_seekable_p (Lstream *stream);
2173 static int encoding_flusher (Lstream *stream);
2174 static int encoding_closer (Lstream *stream);
2175
2176 static Lisp_Object encoding_marker (Lisp_Object stream,
2177 void (*markobj) (Lisp_Object));
2178
2179 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2180 sizeof (struct encoding_stream));
2181
2182 static Lisp_Object
2183 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2184 {
2185 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2186 Lisp_Object str_obj;
2187
2188 /* We do not need to mark the coding systems or charsets stored
2189 within the stream because they are stored in a global list
2190 and automatically marked. */
2191
2192 XSETLSTREAM (str_obj, str);
2193 (markobj) (str_obj);
2194 if (str->imp->marker)
2195 return (str->imp->marker) (str_obj, markobj);
2196 else
2197 return Qnil;
2198 }
2199
2200 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2201 so we read data from the other end, encode it, and store it into DATA. */
2202
2203 static int
2204 encoding_reader (Lstream *stream, unsigned char *data, int size)
2205 {
2206 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2207 unsigned char *orig_data = data;
2208 int read_size;
2209 int error_occurred = 0;
2210
2211 /* We need to interface to mule_encode(), which expects to take some
2212 amount of data and store the result into a Dynarr. We have
2213 mule_encode() store into str->runoff, and take data from there
2214 as necessary. */
2215
2216 /* We loop until we have enough data, reading chunks from the other
2217 end and encoding it. */
2218 while (1)
2219 {
2220 /* Take data from the runoff if we can. Make sure to take at
2221 most SIZE bytes, and delete the data from the runoff. */
2222 if (Dynarr_length (str->runoff) > 0)
2223 {
2224 int chunk = min (size, Dynarr_length (str->runoff));
2225 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2226 Dynarr_delete_many (str->runoff, 0, chunk);
2227 data += chunk;
2228 size -= chunk;
2229 }
2230
2231 if (size == 0)
2232 break; /* No more room for data */
2233
2234 if (str->flags & CODING_STATE_END)
2235 /* This means that on the previous iteration, we hit the EOF on
2236 the other end. We loop once more so that mule_encode() can
2237 output any final stuff it may be holding, or any "go back
2238 to a sane state" escape sequences. (This latter makes sense
2239 during encoding.) */
2240 break;
2241
2242 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2243 left of storage in it, so it's OK to read directly into it.
2244 (We'll be overwriting above, after we've encoded it into the
2245 runoff.) */
2246 read_size = Lstream_read (str->other_end, data, size);
2247 if (read_size < 0)
2248 {
2249 error_occurred = 1;
2250 break;
2251 }
2252 if (read_size == 0)
2253 /* There might be some more end data produced in the translation.
2254 See the comment above. */
2255 str->flags |= CODING_STATE_END;
2256 mule_encode (stream, data, str->runoff, read_size);
2257 }
2258
2259 if (data == orig_data)
2260 return error_occurred ? -1 : 0;
2261 else
2262 return data - orig_data;
2263 }
2264
2265 static int
2266 encoding_writer (Lstream *stream, CONST unsigned char *data, int size)
2267 {
2268 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2269 int retval;
2270
2271 /* Encode all our data into the runoff, and then attempt to write
2272 it all out to the other end. Remove whatever chunk we succeeded
2273 in writing. */
2274 mule_encode (stream, data, str->runoff, size);
2275 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2276 Dynarr_length (str->runoff));
2277 if (retval > 0)
2278 Dynarr_delete_many (str->runoff, 0, retval);
2279 /* Do NOT return retval. The return value indicates how much
2280 of the incoming data was written, not how many bytes were
2281 written. */
2282 return size;
2283 }
2284
2285 static void
2286 reset_encoding_stream (struct encoding_stream *str)
2287 {
2288 switch (CODING_SYSTEM_TYPE (str->codesys))
2289 {
2290 #ifdef MULE
2291 case CODESYS_ISO2022:
2292 {
2293 int i;
2294
2295 for (i = 0; i < 4; i++)
2296 {
2297 str->iso2022.charset[i] =
2298 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2299 str->iso2022.force_charset_on_output[i] =
2300 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2301 }
2302 str->iso2022.register_left = 0;
2303 str->iso2022.register_right = 1;
2304 str->iso2022.current_charset = Qnil;
2305 str->iso2022.current_half = 0;
2306 str->iso2022.current_char_boundary = 1;
2307 break;
2308 }
2309 case CODESYS_CCL:
2310 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2311 break;
2312 #endif
2313 default:
2314 break;
2315 }
2316
2317 str->flags = str->ch = 0;
2318 }
2319
2320 static int
2321 encoding_rewinder (Lstream *stream)
2322 {
2323 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2324 reset_encoding_stream (str);
2325 Dynarr_reset (str->runoff);
2326 return Lstream_rewind (str->other_end);
2327 }
2328
2329 static int
2330 encoding_seekable_p (Lstream *stream)
2331 {
2332 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2333 return Lstream_seekable_p (str->other_end);
2334 }
2335
2336 static int
2337 encoding_flusher (Lstream *stream)
2338 {
2339 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2340 return Lstream_flush (str->other_end);
2341 }
2342
2343 static int
2344 encoding_closer (Lstream *stream)
2345 {
2346 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2347 if (stream->flags & LSTREAM_FL_WRITE)
2348 {
2349 str->flags |= CODING_STATE_END;
2350 encoding_writer (stream, 0, 0);
2351 }
2352 Dynarr_free (str->runoff);
2353 return Lstream_close (str->other_end);
2354 }
2355
2356 Lisp_Object
2357 encoding_stream_coding_system (Lstream *stream)
2358 {
2359 Lisp_Object coding_system = Qnil;
2360 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2361
2362 XSETCODING_SYSTEM (coding_system, str->codesys);
2363 return coding_system;
2364 }
2365
2366 void
2367 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2368 {
2369 struct Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2370 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2371 str->codesys = cs;
2372 reset_encoding_stream (str);
2373 }
2374
2375 static Lisp_Object
2376 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2377 CONST char *mode)
2378 {
2379 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2380 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2381 Lisp_Object obj;
2382
2383 memset (str, 0, sizeof (*str));
2384 str->runoff = Dynarr_new (unsigned_char);
2385 str->other_end = stream;
2386 set_encoding_stream_coding_system (lstr, codesys);
2387 XSETLSTREAM (obj, lstr);
2388 return obj;
2389 }
2390
2391 Lisp_Object
2392 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2393 {
2394 return make_encoding_stream_1 (stream, codesys, "r");
2395 }
2396
2397 Lisp_Object
2398 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2399 {
2400 return make_encoding_stream_1 (stream, codesys, "w");
2401 }
2402
2403 /* Convert N bytes of internally-formatted data stored in SRC to an
2404 external format, according to the encoding stream ENCODING.
2405 Store the encoded data into DST. */
2406
2407 static void
2408 mule_encode (Lstream *encoding, CONST unsigned char *src,
2409 unsigned_char_dynarr *dst, unsigned int n)
2410 {
2411 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2412
2413 switch (CODING_SYSTEM_TYPE (str->codesys))
2414 {
2415 #ifdef DEBUG_XEMACS
2416 case CODESYS_INTERNAL:
2417 Dynarr_add_many (dst, src, n);
2418 break;
2419 #endif
2420 case CODESYS_AUTODETECT:
2421 /* If we got this far and still haven't decided on the coding
2422 system, then do no conversion. */
2423 case CODESYS_NO_CONVERSION:
2424 encode_coding_no_conversion (encoding, src, dst, n);
2425 break;
2426 #ifdef MULE
2427 case CODESYS_SHIFT_JIS:
2428 encode_coding_sjis (encoding, src, dst, n);
2429 break;
2430 case CODESYS_BIG5:
2431 encode_coding_big5 (encoding, src, dst, n);
2432 break;
2433 case CODESYS_CCL:
2434 ccl_driver (&str->ccl, src, dst, n, 0);
2435 break;
2436 case CODESYS_ISO2022:
2437 encode_coding_iso2022 (encoding, src, dst, n);
2438 break;
2439 #endif /* MULE */
2440 default:
2441 abort ();
2442 }
2443 }
2444
2445 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2446 Encode the text between START and END using CODING-SYSTEM.
2447 This will, for example, convert Japanese characters into stuff such as
2448 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2449 text. BUFFER defaults to the current buffer if unspecified.
2450 */
2451 (start, end, coding_system, buffer))
2452 {
2453 Bufpos b, e;
2454 struct buffer *buf = decode_buffer (buffer, 0);
2455 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2456 Lstream *istr, *ostr;
2457 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2458
2459 get_buffer_range_char (buf, start, end, &b, &e, 0);
2460
2461 barf_if_buffer_read_only (buf, b, e);
2462
2463 coding_system = Fget_coding_system (coding_system);
2464 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2465 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2466 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2467 Fget_coding_system (Qbinary));
2468 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2469 coding_system);
2470 istr = XLSTREAM (instream);
2471 ostr = XLSTREAM (outstream);
2472 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2473 /* The chain of streams looks like this:
2474
2475 [BUFFER] <----- send through
2476 ------> [ENCODE AS SPECIFIED]
2477 ------> [DECODE AS BINARY]
2478 ------> [BUFFER]
2479 */
2480 while (1)
2481 {
2482 char tempbuf[1024]; /* some random amount */
2483 Bufpos newpos, even_newer_pos;
2484 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2485 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2486
2487 if (!size_in_bytes)
2488 break;
2489 newpos = lisp_buffer_stream_startpos (istr);
2490 Lstream_write (ostr, tempbuf, size_in_bytes);
2491 even_newer_pos = lisp_buffer_stream_startpos (istr);
2492 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2493 even_newer_pos, 0);
2494 }
2495
2496 {
2497 Charcount retlen =
2498 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2499 Lstream_close (istr);
2500 Lstream_close (ostr);
2501 UNGCPRO;
2502 Lstream_delete (istr);
2503 Lstream_delete (ostr);
2504 Lstream_delete (XLSTREAM (de_outstream));
2505 Lstream_delete (XLSTREAM (lb_outstream));
2506 return make_int (retlen);
2507 }
2508 }
2509
2510 #ifdef MULE
2511
2512 /************************************************************************/
2513 /* Shift-JIS methods */
2514 /************************************************************************/
2515
2516 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2517 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2518 as is. A character of JISX0201-Kana (TYPE94 character set) is
2519 encoded by "position-code + 0x80". A character of JISX0208
2520 (TYPE94x94 character set) is encoded in 2-byte but two
2521 position-codes are divided and shifted so that it fit in the range
2522 below.
2523
2524 --- CODE RANGE of Shift-JIS ---
2525 (character set) (range)
2526 ASCII 0x00 .. 0x7F
2527 JISX0201-Kana 0xA0 .. 0xDF
2528 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2529 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2530 -------------------------------
2531
2532 */
2533
2534 /* Is this the first byte of a Shift-JIS two-byte char? */
2535
2536 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2537 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2538
2539 /* Is this the second byte of a Shift-JIS two-byte char? */
2540
2541 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2542 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2543
2544 #define BYTE_SJIS_KATAKANA_P(c) \
2545 ((c) >= 0xA1 && (c) <= 0xDF)
2546
2547 static int
2548 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2549 unsigned int n)
2550 {
2551 int c;
2552
2553 while (n--)
2554 {
2555 c = *src++;
2556 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2557 return 0;
2558 if (st->shift_jis.in_second_byte)
2559 {
2560 st->shift_jis.in_second_byte = 0;
2561 if (c < 0x40)
2562 return 0;
2563 }
2564 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2565 st->shift_jis.in_second_byte = 1;
2566 }
2567 return CODING_CATEGORY_SHIFT_JIS_MASK;
2568 }
2569
2570 /* Convert Shift-JIS data to internal format. */
2571
2572 static void
2573 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2574 unsigned_char_dynarr *dst, unsigned int n)
2575 {
2576 unsigned char c;
2577 unsigned int flags, ch;
2578 enum eol_type eol_type;
2579 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2580
2581 CODING_STREAM_DECOMPOSE (str, flags, ch);
2582 eol_type = str->eol_type;
2583
2584 while (n--)
2585 {
2586 c = *src++;
2587
2588 if (ch)
2589 {
2590 /* Previous character was first byte of Shift-JIS Kanji char. */
2591 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2592 {
2593 unsigned char e1, e2;
2594
2595 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2596 DECODE_SJIS (ch, c, e1, e2);
2597 Dynarr_add (dst, e1);
2598 Dynarr_add (dst, e2);
2599 }
2600 else
2601 {
2602 DECODE_ADD_BINARY_CHAR (ch, dst);
2603 DECODE_ADD_BINARY_CHAR (c, dst);
2604 }
2605 ch = 0;
2606 }
2607 else
2608 {
2609 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2610 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2611 ch = c;
2612 else if (BYTE_SJIS_KATAKANA_P (c))
2613 {
2614 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2615 Dynarr_add (dst, c);
2616 }
2617 else
2618 DECODE_ADD_BINARY_CHAR (c, dst);
2619 }
2620 label_continue_loop:;
2621 }
2622
2623 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2624
2625 CODING_STREAM_COMPOSE (str, flags, ch);
2626 }
2627
2628 /* Convert internally-formatted data to Shift-JIS. */
2629
2630 static void
2631 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2632 unsigned_char_dynarr *dst, unsigned int n)
2633 {
2634 unsigned char c;
2635 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2636 unsigned int flags, ch;
2637 enum eol_type eol_type;
2638
2639 CODING_STREAM_DECOMPOSE (str, flags, ch);
2640 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2641
2642 while (n--)
2643 {
2644 c = *src++;
2645 if (c == '\n')
2646 {
2647 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2648 Dynarr_add (dst, '\r');
2649 if (eol_type != EOL_CR)
2650 Dynarr_add (dst, '\n');
2651 ch = 0;
2652 }
2653 else if (BYTE_ASCII_P (c))
2654 {
2655 Dynarr_add (dst, c);
2656 ch = 0;
2657 }
2658 else if (BUFBYTE_LEADING_BYTE_P (c))
2659 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2660 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2661 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2662 else if (ch)
2663 {
2664 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2665 {
2666 Dynarr_add (dst, c);
2667 ch = 0;
2668 }
2669 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2670 ch == LEADING_BYTE_JAPANESE_JISX0208)
2671 ch = c;
2672 else
2673 {
2674 unsigned char j1, j2;
2675 ENCODE_SJIS (ch, c, j1, j2);
2676 Dynarr_add (dst, j1);
2677 Dynarr_add (dst, j2);
2678 ch = 0;
2679 }
2680 }
2681 }
2682
2683 CODING_STREAM_COMPOSE (str, flags, ch);
2684 }
2685
2686 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2687 Decode a JISX0208 character of Shift-JIS coding-system.
2688 CODE is the character code in Shift-JIS as a cons of type bytes.
2689 Return the corresponding character.
2690 */
2691 (code))
2692 {
2693 unsigned char c1, c2, s1, s2;
2694
2695 CHECK_CONS (code);
2696 CHECK_INT (XCAR (code));
2697 CHECK_INT (XCDR (code));
2698 s1 = XINT (XCAR (code));
2699 s2 = XINT (XCDR (code));
2700 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2701 BYTE_SJIS_TWO_BYTE_2_P (s2))
2702 {
2703 DECODE_SJIS (s1, s2, c1, c2);
2704 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2705 c1 & 0x7F, c2 & 0x7F));
2706 }
2707 else
2708 return Qnil;
2709 }
2710
2711 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2712 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2713 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2714 */
2715 (ch))
2716 {
2717 Lisp_Object charset;
2718 int c1, c2, s1, s2;
2719
2720 CHECK_CHAR_COERCE_INT (ch);
2721 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2722 if (EQ (charset, Vcharset_japanese_jisx0208))
2723 {
2724 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2725 return Fcons (make_int (s1), make_int (s2));
2726 }
2727 else
2728 return Qnil;
2729 }
2730
2731
2732 /************************************************************************/
2733 /* Big5 methods */
2734 /************************************************************************/
2735
2736 /* BIG5 is a coding system encoding two character sets: ASCII and
2737 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2738 character set and is encoded in two-byte.
2739
2740 --- CODE RANGE of BIG5 ---
2741 (character set) (range)
2742 ASCII 0x00 .. 0x7F
2743 Big5 (1st byte) 0xA1 .. 0xFE
2744 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2745 --------------------------
2746
2747 Since the number of characters in Big5 is larger than maximum
2748 characters in Emacs' charset (96x96), it can't be handled as one
2749 charset. So, in Emacs, Big5 is devided into two: `charset-big5-1'
2750 and `charset-big5-2'. Both <type>s are TYPE94x94. The former
2751 contains frequently used characters and the latter contains less
2752 frequently used characters. */
2753
2754 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2755 ((c) >= 0xA1 && (c) <= 0xFE)
2756
2757 /* Is this the second byte of a Shift-JIS two-byte char? */
2758
2759 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
2760 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
2761
2762 /* Number of Big5 characters which have the same code in 1st byte. */
2763
2764 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2765
2766 /* Code conversion macros. These are macros because they are used in
2767 inner loops during code conversion.
2768
2769 Note that temporary variables in macros introduce the classic
2770 dynamic-scoping problems with variable names. We use capital-
2771 lettered variables in the assumption that XEmacs does not use
2772 capital letters in variables except in a very formalized way
2773 (e.g. Qstring). */
2774
2775 /* Convert Big5 code (b1, b2) into its internal string representation
2776 (lb, c1, c2). */
2777
2778 /* There is a much simpler way to split the Big5 charset into two.
2779 For the moment I'm going to leave the algorithm as-is because it
2780 claims to separate out the most-used characters into a single
2781 charset, which perhaps will lead to optimizations in various
2782 places.
2783
2784 The way the algorithm works is something like this:
2785
2786 Big5 can be viewed as a 94x157 charset, where the row is
2787 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
2788 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
2789 the split between low and high column numbers is apparently
2790 meaningless; ascending rows produce less and less frequent chars.
2791 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
2792 the first charset, and the upper half (0xC9 .. 0xFE) to the
2793 second. To do the conversion, we convert the character into
2794 a single number where 0 .. 156 is the first row, 157 .. 313
2795 is the second, etc. That way, the characters are ordered by
2796 decreasing frequency. Then we just chop the space in two
2797 and coerce the result into a 94x94 space.
2798 */
2799
2800 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
2801 { \
2802 int B1 = b1, B2 = b2; \
2803 unsigned int I \
2804 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
2805 \
2806 if (B1 < 0xC9) \
2807 { \
2808 lb = LEADING_BYTE_CHINESE_BIG5_1; \
2809 } \
2810 else \
2811 { \
2812 lb = LEADING_BYTE_CHINESE_BIG5_2; \
2813 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
2814 } \
2815 c1 = I / (0xFF - 0xA1) + 0xA1; \
2816 c2 = I % (0xFF - 0xA1) + 0xA1; \
2817 } while (0)
2818
2819 /* Convert the internal string representation of a Big5 character
2820 (lb, c1, c2) into Big5 code (b1, b2). */
2821
2822 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
2823 { \
2824 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
2825 \
2826 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
2827 { \
2828 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
2829 } \
2830 b1 = I / BIG5_SAME_ROW + 0xA1; \
2831 b2 = I % BIG5_SAME_ROW; \
2832 b2 += b2 < 0x3F ? 0x40 : 0x62; \
2833 } while (0)
2834
2835 static int
2836 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
2837 unsigned int n)
2838 {
2839 int c;
2840
2841 while (n--)
2842 {
2843 c = *src++;
2844 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
2845 (c >= 0x80 && c <= 0xA0))
2846 return 0;
2847 if (st->big5.in_second_byte)
2848 {
2849 st->big5.in_second_byte = 0;
2850 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
2851 return 0;
2852 }
2853 else if (c >= 0xA1)
2854 st->big5.in_second_byte = 1;
2855 }
2856 return CODING_CATEGORY_BIG5_MASK;
2857 }
2858
2859 /* Convert Big5 data to internal format. */
2860
2861 static void
2862 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
2863 unsigned_char_dynarr *dst, unsigned int n)
2864 {
2865 unsigned char c;
2866 unsigned int flags, ch;
2867 enum eol_type eol_type;
2868 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2869
2870 CODING_STREAM_DECOMPOSE (str, flags, ch);
2871 eol_type = str->eol_type;
2872
2873 while (n--)
2874 {
2875 c = *src++;
2876 if (ch)
2877 {
2878 /* Previous character was first byte of Big5 char. */
2879 if (BYTE_BIG5_TWO_BYTE_2_P (c))
2880 {
2881 unsigned char b1, b2, b3;
2882 DECODE_BIG5 (ch, c, b1, b2, b3);
2883 Dynarr_add (dst, b1);
2884 Dynarr_add (dst, b2);
2885 Dynarr_add (dst, b3);
2886 }
2887 else
2888 {
2889 DECODE_ADD_BINARY_CHAR (ch, dst);
2890 DECODE_ADD_BINARY_CHAR (c, dst);
2891 }
2892 ch = 0;
2893 }
2894 else
2895 {
2896 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2897 if (BYTE_BIG5_TWO_BYTE_1_P (c))
2898 ch = c;
2899 else
2900 DECODE_ADD_BINARY_CHAR (c, dst);
2901 }
2902 label_continue_loop:;
2903 }
2904
2905 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2906
2907 CODING_STREAM_COMPOSE (str, flags, ch);
2908 }
2909
2910 /* Convert internally-formatted data to Big5. */
2911
2912 static void
2913 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
2914 unsigned_char_dynarr *dst, unsigned int n)
2915 {
2916 unsigned char c;
2917 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2918 unsigned int flags, ch;
2919 enum eol_type eol_type;
2920
2921 CODING_STREAM_DECOMPOSE (str, flags, ch);
2922 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2923
2924 while (n--)
2925 {
2926 c = *src++;
2927 if (c == '\n')
2928 {
2929 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2930 Dynarr_add (dst, '\r');
2931 if (eol_type != EOL_CR)
2932 Dynarr_add (dst, '\n');
2933 }
2934 else if (BYTE_ASCII_P (c))
2935 {
2936 /* ASCII. */
2937 Dynarr_add (dst, c);
2938 }
2939 else if (BUFBYTE_LEADING_BYTE_P (c))
2940 {
2941 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
2942 c == LEADING_BYTE_CHINESE_BIG5_2)
2943 {
2944 /* A recognized leading byte. */
2945 ch = c;
2946 continue; /* not done with this character. */
2947 }
2948 /* otherwise just ignore this character. */
2949 }
2950 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
2951 ch == LEADING_BYTE_CHINESE_BIG5_2)
2952 {
2953 /* Previous char was a recognized leading byte. */
2954 ch = (ch << 8) | c;
2955 continue; /* not done with this character. */
2956 }
2957 else if (ch)
2958 {
2959 /* Encountering second byte of a Big5 character. */
2960 unsigned char b1, b2;
2961
2962 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
2963 Dynarr_add (dst, b1);
2964 Dynarr_add (dst, b2);
2965 }
2966
2967 ch = 0;
2968 }
2969
2970 CODING_STREAM_COMPOSE (str, flags, ch);
2971 }
2972
2973
2974 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
2975 Decode a Big5 character CODE of BIG5 coding-system.
2976 CODE is the character code in BIG5, a cons of two integers.
2977 Return the corresponding character.
2978 */
2979 (code))
2980 {
2981 unsigned char c1, c2, b1, b2;
2982
2983 CHECK_CONS (code);
2984 CHECK_INT (XCAR (code));
2985 CHECK_INT (XCDR (code));
2986 b1 = XINT (XCAR (code));
2987 b2 = XINT (XCDR (code));
2988 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
2989 BYTE_BIG5_TWO_BYTE_2_P (b2))
2990 {
2991 int leading_byte;
2992 Lisp_Object charset;
2993 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
2994 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
2995 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
2996 }
2997 else
2998 return Qnil;
2999 }
3000
3001 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3002 Encode the Big5 character CH to BIG5 coding-system.
3003 Return the corresponding character code in Big5.
3004 */
3005 (ch))
3006 {
3007 Lisp_Object charset;
3008 int c1, c2, b1, b2;
3009
3010 CHECK_CHAR_COERCE_INT (ch);
3011 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3012 if (EQ (charset, Vcharset_chinese_big5_1) ||
3013 EQ (charset, Vcharset_chinese_big5_2))
3014 {
3015 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3016 b1, b2);
3017 return Fcons (make_int (b1), make_int (b2));
3018 }
3019 else
3020 return Qnil;
3021 }
3022
3023
3024 /************************************************************************/
3025 /* ISO2022 methods */
3026 /************************************************************************/
3027
3028 /* The following note describes the coding system ISO2022 briefly.
3029 Since the intention of this note is to help understanding of the
3030 programs in this file, some parts are NOT ACCURATE or OVERLY
3031 SIMPLIFIED. For thorough understanding, please refer to the
3032 original document of ISO2022.
3033
3034 ISO2022 provides many mechanisms to encode several character sets
3035 in 7-bit and 8-bit environments. If one chooses 7-bit environment,
3036 all text is encoded by codes of less than 128. This may make the
3037 encoded text a little bit longer, but the text get more stability
3038 to pass through several gateways (some of them strip off MSB).
3039
3040 There are two kind of character sets: control character set and
3041 graphic character set. The former contains control characters such
3042 as `newline' and `escape' to provide control functions (control
3043 functions are provided also by escape sequence). The latter
3044 contains graphic characters such as 'A' and '-'. Emacs recognizes
3045 two control character sets and many graphic character sets.
3046
3047 Graphic character sets are classified into one of four types,
3048 according to the dimension and number of characters in the set:
3049 TYPE94, TYPE96, TYPE94x94, and TYPE96x96. In addition, each
3050 character set is assigned an identification byte, unique for each
3051 type, called "final character" (denoted as <F> hereafter). The <F>
3052 of each character set is decided by ECMA(*) when it is registered
3053 in ISO. Code range of <F> is 0x30..0x7F (0x30..0x3F are for
3054 private use only).
3055
3056 Note (*): ECMA = European Computer Manufacturers Association
3057
3058 Here are examples of graphic character set [NAME(<F>)]:
3059 o TYPE94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3060 o TYPE96 -- right-half-of-ISO8859-1('A'), ...
3061 o TYPE94x94 -- GB2312('A'), JISX0208('B'), ...
3062 o TYPE96x96 -- none for the moment
3063
3064 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
3065 C0 [0x00..0x1F] -- control character plane 0
3066 GL [0x20..0x7F] -- graphic character plane 0
3067 C1 [0x80..0x9F] -- control character plane 1
3068 GR [0xA0..0xFF] -- graphic character plane 1
3069
3070 A control character set is directly designated and invoked to C0 or
3071 C1 by an escape sequence. The most common case is that:
3072 - ISO646's control character set is designated/invoked to C0, and
3073 - ISO6429's control character set is designated/invoked to C1,
3074 and usually these designations/invocations are omitted in encoded
3075 text. In a 7-bit environment, only C0 can be used, and a control
3076 character for C1 is encoded by an appropriate escape sequence to
3077 fit into the environment. All control characters for C1 are
3078 defined to have corresponding escape sequences.
3079
3080 A graphic character set is at first designated to one of four
3081 graphic registers (G0 through G3), then these graphic registers are
3082 invoked to GL or GR. These designations and invocations can be
3083 done independently. The most common case is that G0 is invoked to
3084 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3085 these invocations and designations are omitted in encoded text.
3086 In a 7-bit environment, only GL can be used.
3087
3088 When a graphic character set of TYPE94 or TYPE94x94 is invoked to
3089 GL, codes 0x20 and 0x7F of the GL area work as control characters
3090 SPACE and DEL respectively, and code 0xA0 and 0xFF of GR area
3091 should not be used.
3092
3093 There are two ways of invocation: locking-shift and single-shift.
3094 With locking-shift, the invocation lasts until the next different
3095 invocation, whereas with single-shift, the invocation works only
3096 for the following character and doesn't affect locking-shift.
3097 Invocations are done by the following control characters or escape
3098 sequences.
3099
3100 ----------------------------------------------------------------------
3101 abbrev function cntrl escape seq description
3102 ----------------------------------------------------------------------
3103 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3104 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3105 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR
3106 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3107 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR
3108 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3109 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR
3110 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3111 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3112 ----------------------------------------------------------------------
3113 The first four are for locking-shift. Control characters for these
3114 functions are defined by macros ISO_CODE_XXX in `coding.h'.
3115
3116 Designations are done by the following escape sequences.
3117 ----------------------------------------------------------------------
3118 escape sequence description
3119 ----------------------------------------------------------------------
3120 ESC '(' <F> designate TYPE94<F> to G0
3121 ESC ')' <F> designate TYPE94<F> to G1
3122 ESC '*' <F> designate TYPE94<F> to G2
3123 ESC '+' <F> designate TYPE94<F> to G3
3124 ESC ',' <F> designate TYPE96<F> to G0 (*)
3125 ESC '-' <F> designate TYPE96<F> to G1
3126 ESC '.' <F> designate TYPE96<F> to G2
3127 ESC '/' <F> designate TYPE96<F> to G3
3128 ESC '$' '(' <F> designate TYPE94x94<F> to G0 (**)
3129 ESC '$' ')' <F> designate TYPE94x94<F> to G1
3130 ESC '$' '*' <F> designate TYPE94x94<F> to G2
3131 ESC '$' '+' <F> designate TYPE94x94<F> to G3
3132 ESC '$' ',' <F> designate TYPE96x96<F> to G0 (*)
3133 ESC '$' '-' <F> designate TYPE96x96<F> to G1
3134 ESC '$' '.' <F> designate TYPE96x96<F> to G2
3135 ESC '$' '/' <F> designate TYPE96x96<F> to G3
3136 ----------------------------------------------------------------------
3137 In this list, "TYPE94<F>" means a graphic character set of type TYPE94
3138 and final character <F>, and etc.
3139
3140 Note (*): Although these designations are not allowed in ISO2022,
3141 Emacs accepts them on decoding, and produces them on encoding
3142 TYPE96 or TYPE96x96 character set in a coding system which is
3143 characterized as 7-bit environment, non-locking-shift, and
3144 non-single-shift.
3145
3146 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3147 '(' can be omitted. We call this as "short-form" here after.
3148
3149 Now you may notice that there are a lot of ways for encoding the
3150 same multilingual text in ISO2022. Actually, there exist many
3151 coding systems such as Compound Text (used in X's inter client
3152 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3153 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3154 localized platforms), and all of these are variants of ISO2022.
3155
3156 In addition to the above, Emacs handles two more kinds of escape
3157 sequences: ISO6429's direction specification and Emacs' private
3158 sequence for specifying character composition.
3159
3160 ISO6429's direction specification takes the following format:
3161 o CSI ']' -- end of the current direction
3162 o CSI '0' ']' -- end of the current direction
3163 o CSI '1' ']' -- start of left-to-right text
3164 o CSI '2' ']' -- start of right-to-left text
3165 The control character CSI (0x9B: control sequence introducer) is
3166 abbreviated to the escape sequence ESC '[' in 7-bit environment.
3167
3168 Character composition specification takes the following format:
3169 o ESC '0' -- start character composition
3170 o ESC '1' -- end character composition
3171 Since these are not standard escape sequences of any ISO, the use
3172 of them for these meanings is restricted to Emacs only. */
3173
3174 static void
3175 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3176 {
3177 int i;
3178
3179 for (i = 0; i < 4; i++)
3180 {
3181 if (!NILP (coding_system))
3182 iso->charset[i] =
3183 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3184 else
3185 iso->charset[i] = Qt;
3186 iso->invalid_designated[i] = 0;
3187 }
3188 iso->esc = ISO_ESC_NOTHING;
3189 iso->esc_bytes_index = 0;
3190 iso->register_left = 0;
3191 iso->register_right = 1;
3192 iso->switched_dir_and_no_valid_charset_yet = 0;
3193 iso->invalid_switch_dir = 0;
3194 iso->output_direction_sequence = 0;
3195 iso->output_literally = 0;
3196 if (iso->composite_chars)
3197 Dynarr_reset (iso->composite_chars);
3198 }
3199
3200 static int
3201 fit_to_be_escape_quoted (unsigned char c)
3202 {
3203 switch (c)
3204 {
3205 case ISO_CODE_ESC:
3206 case ISO_CODE_CSI:
3207 case ISO_CODE_SS2:
3208 case ISO_CODE_SS3:
3209 case ISO_CODE_SO:
3210 case ISO_CODE_SI:
3211 return 1;
3212
3213 default:
3214 return 0;
3215 }
3216 }
3217
3218 /* Parse one byte of an ISO2022 escape sequence.
3219 If the result is an invalid escape sequence, return 0 and
3220 do not change anything in STR. Otherwise, if the result is
3221 an incomplete escape sequence, update ISO2022.ESC and
3222 ISO2022.ESC_BYTES and return -1. Otherwise, update
3223 all the state variables (but not ISO2022.ESC_BYTES) and
3224 return 1.
3225
3226 If CHECK_INVALID_CHARSETS is non-zero, check for designation
3227 or invocation of an invalid character set and treat that as
3228 an unrecognized escape sequence. */
3229
3230 static int
3231 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
3232 unsigned char c, unsigned int *flags,
3233 int check_invalid_charsets)
3234 {
3235 /* (1) If we're at the end of a designation sequence, CS is the
3236 charset being designated and REG is the register to designate
3237 it to.
3238
3239 (2) If we're at the end of a locking-shift sequence, REG is
3240 the register to invoke and HALF (0 == left, 1 == right) is
3241 the half to invoke it into.
3242
3243 (3) If we're at the end of a single-shift sequence, REG is
3244 the register to invoke. */
3245 Lisp_Object cs = Qnil;
3246 int reg, half;
3247
3248 /* NOTE: This code does goto's all over the fucking place.
3249 The reason for this is that we're basically implementing
3250 a state machine here, and hierarchical languages like C
3251 don't really provide a clean way of doing this. */
3252
3253 if (! (*flags & CODING_STATE_ESCAPE))
3254 /* At beginning of escape sequence; we need to reset our
3255 escape-state variables. */
3256 iso->esc = ISO_ESC_NOTHING;
3257
3258 iso->output_literally = 0;
3259 iso->output_direction_sequence = 0;
3260
3261 switch (iso->esc)
3262 {
3263 case ISO_ESC_NOTHING:
3264 iso->esc_bytes_index = 0;
3265 switch (c)
3266 {
3267 case ISO_CODE_ESC: /* Start escape sequence */
3268 *flags |= CODING_STATE_ESCAPE;
3269 iso->esc = ISO_ESC;
3270 goto not_done;
3271
3272 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
3273 *flags |= CODING_STATE_ESCAPE;
3274 iso->esc = ISO_ESC_5_11;
3275 goto not_done;
3276
3277 case ISO_CODE_SO: /* locking shift 1 */
3278 reg = 1; half = 0;
3279 goto locking_shift;
3280 case ISO_CODE_SI: /* locking shift 0 */
3281 reg = 0; half = 0;
3282 goto locking_shift;
3283
3284 case ISO_CODE_SS2: /* single shift */
3285 reg = 2;
3286 goto single_shift;
3287 case ISO_CODE_SS3: /* single shift */
3288 reg = 3;
3289 goto single_shift;
3290
3291 default: /* Other control characters */
3292 return 0;
3293 }
3294
3295 case ISO_ESC:
3296 switch (c)
3297 {
3298 /**** single shift ****/
3299
3300 case 'N': /* single shift 2 */
3301 reg = 2;
3302 goto single_shift;
3303 case 'O': /* single shift 3 */
3304 reg = 3;
3305 goto single_shift;
3306
3307 /**** locking shift ****/
3308
3309 case '~': /* locking shift 1 right */
3310 reg = 1; half = 1;
3311 goto locking_shift;
3312 case 'n': /* locking shift 2 */
3313 reg = 2; half = 0;
3314 goto locking_shift;
3315 case '}': /* locking shift 2 right */
3316 reg = 2; half = 1;
3317 goto locking_shift;
3318 case 'o': /* locking shift 3 */
3319 reg = 3; half = 0;
3320 goto locking_shift;
3321 case '|': /* locking shift 3 right */
3322 reg = 3; half = 1;
3323 goto locking_shift;
3324
3325 /**** composite ****/
3326
3327 case '0':
3328 iso->esc = ISO_ESC_START_COMPOSITE;
3329 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
3330 CODING_STATE_COMPOSITE;
3331 return 1;
3332
3333 case '1':
3334 iso->esc = ISO_ESC_END_COMPOSITE;
3335 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
3336 ~CODING_STATE_COMPOSITE;
3337 return 1;
3338
3339 /**** directionality ****/
3340
3341 case '[':
3342 iso->esc = ISO_ESC_5_11;
3343 goto not_done;
3344
3345 /**** designation ****/
3346
3347 case '$': /* multibyte charset prefix */
3348 iso->esc = ISO_ESC_2_4;
3349 goto not_done;
3350
3351 default:
3352 if (0x28 <= c && c <= 0x2F)
3353 {
3354 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
3355 goto not_done;
3356 }
3357
3358 /* This function is called with CODESYS equal to nil when
3359 doing coding-system detection. */
3360 if (!NILP (codesys)
3361 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
3362 && fit_to_be_escape_quoted (c))
3363 {
3364 iso->esc = ISO_ESC_LITERAL;
3365 *flags &= CODING_STATE_ISO2022_LOCK;
3366 return 1;
3367 }
3368
3369 /* bzzzt! */
3370 return 0;
3371 }
3372
3373
3374
3375 /**** directionality ****/
3376
3377 case ISO_ESC_5_11: /* ISO6429 direction control */
3378 if (c == ']')
3379 {
3380 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3381 goto directionality;
3382 }
3383 if (c == '0') iso->esc = ISO_ESC_5_11_0;
3384 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
3385 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
3386 else return 0;
3387 goto not_done;
3388
3389 case ISO_ESC_5_11_0:
3390 if (c == ']')
3391 {
3392 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3393 goto directionality;
3394 }
3395 return 0;
3396
3397 case ISO_ESC_5_11_1:
3398 if (c == ']')
3399 {
3400 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3401 goto directionality;
3402 }
3403 return 0;
3404
3405 case ISO_ESC_5_11_2:
3406 if (c == ']')
3407 {
3408 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
3409 goto directionality;
3410 }
3411 return 0;
3412
3413 directionality:
3414 iso->esc = ISO_ESC_DIRECTIONALITY;
3415 /* Various junk here to attempt to preserve the direction sequences
3416 literally in the text if they would otherwise be swallowed due
3417 to invalid designations that don't show up as actual charset
3418 changes in the text. */
3419 if (iso->invalid_switch_dir)
3420 {
3421 /* We already inserted a direction switch literally into the
3422 text. We assume (#### this may not be right) that the
3423 next direction switch is the one going the other way,
3424 and we need to output that literally as well. */
3425 iso->output_literally = 1;
3426 iso->invalid_switch_dir = 0;
3427 }
3428 else
3429 {
3430 int jj;
3431
3432 /* If we are in the thrall of an invalid designation,
3433 then stick the directionality sequence literally into the
3434 output stream so it ends up in the original text again. */
3435 for (jj = 0; jj < 4; jj++)
3436 if (iso->invalid_designated[jj])
3437 break;
3438 if (jj < 4)
3439 {
3440 iso->output_literally = 1;
3441 iso->invalid_switch_dir = 1;
3442 }
3443 else
3444 /* Indicate that we haven't yet seen a valid designation,
3445 so that if a switch-dir is directly followed by an
3446 invalid designation, both get inserted literally. */
3447 iso->switched_dir_and_no_valid_charset_yet = 1;
3448 }
3449 return 1;
3450
3451
3452 /**** designation ****/
3453
3454 case ISO_ESC_2_4:
3455 if (0x28 <= c && c <= 0x2F)
3456 {
3457 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
3458 goto not_done;
3459 }
3460 if (0x40 <= c && c <= 0x42)
3461 {
3462 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
3463 *flags & CODING_STATE_R2L ?
3464 CHARSET_RIGHT_TO_LEFT :
3465 CHARSET_LEFT_TO_RIGHT);
3466 reg = 0;
3467 goto designated;
3468 }
3469 return 0;
3470
3471 default:
3472 {
3473 int type =-1;
3474
3475 if (c < '0' || c > '~')
3476 return 0; /* bad final byte */
3477
3478 if (iso->esc >= ISO_ESC_2_8 &&
3479 iso->esc <= ISO_ESC_2_15)
3480 {
3481 type = ((iso->esc >= ISO_ESC_2_12) ?
3482 CHARSET_TYPE_96 : CHARSET_TYPE_94);
3483 reg = (iso->esc - ISO_ESC_2_8) & 3;
3484 }
3485 else if (iso->esc >= ISO_ESC_2_4_8 &&
3486 iso->esc <= ISO_ESC_2_4_15)
3487 {
3488 type = ((iso->esc >= ISO_ESC_2_4_12) ?
3489 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
3490 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
3491 }
3492 else
3493 {
3494 /* Can this ever be reached? -slb */
3495 abort();
3496 }
3497
3498 cs = CHARSET_BY_ATTRIBUTES (type, c,
3499 *flags & CODING_STATE_R2L ?
3500 CHARSET_RIGHT_TO_LEFT :
3501 CHARSET_LEFT_TO_RIGHT);
3502 goto designated;
3503 }
3504 }
3505
3506 not_done:
3507 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
3508 return -1;
3509
3510 single_shift:
3511 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
3512 /* can't invoke something that ain't there. */
3513 return 0;
3514 iso->esc = ISO_ESC_SINGLE_SHIFT;
3515 *flags &= CODING_STATE_ISO2022_LOCK;
3516 if (reg == 2)
3517 *flags |= CODING_STATE_SS2;
3518 else
3519 *flags |= CODING_STATE_SS3;
3520 return 1;
3521
3522 locking_shift:
3523 if (check_invalid_charsets &&
3524 !CHARSETP (iso->charset[reg]))
3525 /* can't invoke something that ain't there. */
3526 return 0;
3527 if (half)
3528 iso->register_right = reg;
3529 else
3530 iso->register_left = reg;
3531 *flags &= CODING_STATE_ISO2022_LOCK;
3532 iso->esc = ISO_ESC_LOCKING_SHIFT;
3533 return 1;
3534
3535 designated:
3536 if (NILP (cs) && check_invalid_charsets)
3537 {
3538 iso->invalid_designated[reg] = 1;
3539 iso->charset[reg] = Vcharset_ascii;
3540 iso->esc = ISO_ESC_DESIGNATE;
3541 *flags &= CODING_STATE_ISO2022_LOCK;
3542 iso->output_literally = 1;
3543 if (iso->switched_dir_and_no_valid_charset_yet)
3544 {
3545 /* We encountered a switch-direction followed by an
3546 invalid designation. Ensure that the switch-direction
3547 gets outputted; otherwise it will probably get eaten
3548 when the text is written out again. */
3549 iso->switched_dir_and_no_valid_charset_yet = 0;
3550 iso->output_direction_sequence = 1;
3551 /* And make sure that the switch-dir going the other
3552 way gets outputted, as well. */
3553 iso->invalid_switch_dir = 1;
3554 }
3555 return 1;
3556 }
3557 /* This function is called with CODESYS equal to nil when
3558 doing coding-system detection. */
3559 if (!NILP (codesys))
3560 {
3561 charset_conversion_spec_dynarr *dyn =
3562 XCODING_SYSTEM (codesys)->iso2022.input_conv;
3563
3564 if (dyn)
3565 {
3566 int i;
3567
3568 for (i = 0; i < Dynarr_length (dyn); i++)
3569 {
3570 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
3571 if (EQ (cs, spec->from_charset))
3572 cs = spec->to_charset;
3573 }
3574 }
3575 }
3576
3577 iso->charset[reg] = cs;
3578 iso->esc = ISO_ESC_DESIGNATE;
3579 *flags &= CODING_STATE_ISO2022_LOCK;
3580 if (iso->invalid_designated[reg])
3581 {
3582 iso->invalid_designated[reg] = 0;
3583 iso->output_literally = 1;
3584 }
3585 if (iso->switched_dir_and_no_valid_charset_yet)
3586 iso->switched_dir_and_no_valid_charset_yet = 0;
3587 return 1;
3588 }
3589
3590 static int
3591 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
3592 unsigned int n)
3593 {
3594 int c;
3595 int mask;
3596
3597 /* #### There are serious deficiencies in the recognition mechanism
3598 here. This needs to be much smarter if it's going to cut it. */
3599
3600 if (!st->iso2022.initted)
3601 {
3602 reset_iso2022 (Qnil, &st->iso2022.iso);
3603 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
3604 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
3605 CODING_CATEGORY_ISO_8_1_MASK |
3606 CODING_CATEGORY_ISO_8_2_MASK |
3607 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
3608 st->iso2022.flags = 0;
3609 st->iso2022.high_byte_count = 0;
3610 st->iso2022.saw_single_shift = 0;
3611 st->iso2022.initted = 1;
3612 }
3613
3614 mask = st->iso2022.mask;
3615
3616 while (n--)
3617 {
3618 c = *src++;
3619 if (c >= 0xA0)
3620 {
3621 mask &= ~CODING_CATEGORY_ISO_7_MASK;
3622 st->iso2022.high_byte_count++;
3623 }
3624 else
3625 {
3626 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
3627 {
3628 if (st->iso2022.high_byte_count & 1)
3629 /* odd number of high bytes; assume not iso-8-2 */
3630 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
3631 }
3632 st->iso2022.high_byte_count = 0;
3633 st->iso2022.saw_single_shift = 0;
3634 if (c > 0x80)
3635 mask &= ~CODING_CATEGORY_ISO_7_MASK;
3636 }
3637 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
3638 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
3639 { /* control chars */
3640 switch (c)
3641 {
3642 /* Allow and ignore control characters that you might
3643 reasonably see in a text file */
3644 case '\r':
3645 case '\n':
3646 case '\t':
3647 case 7: /* bell */
3648 case 8: /* backspace */
3649 case 11: /* vertical tab */
3650 case 12: /* form feed */
3651 case 26: /* MS-DOS C-z junk */
3652 case 31: /* '^_' -- for info */
3653 goto label_continue_loop;
3654
3655 default:
3656 break;
3657 }
3658 }
3659
3660 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
3661 || BYTE_C1_P (c))
3662 {
3663 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
3664 &st->iso2022.flags, 0))
3665 {
3666 switch (st->iso2022.iso.esc)
3667 {
3668 case ISO_ESC_DESIGNATE:
3669 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
3670 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
3671 break;
3672 case ISO_ESC_LOCKING_SHIFT:
3673 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
3674 goto ran_out_of_chars;
3675 case ISO_ESC_SINGLE_SHIFT:
3676 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
3677 st->iso2022.saw_single_shift = 1;
3678 break;
3679 default:
3680 break;
3681 }
3682 }
3683 else
3684 {
3685 mask = 0;
3686 goto ran_out_of_chars;
3687 }
3688 }
3689 label_continue_loop:;
3690 }
3691
3692 ran_out_of_chars:
3693
3694 return mask;
3695 }
3696
3697 static int
3698 postprocess_iso2022_mask (int mask)
3699 {
3700 /* #### kind of cheesy */
3701 /* If seven-bit ISO is allowed, then assume that the encoding is
3702 entirely seven-bit and turn off the eight-bit ones. */
3703 if (mask & CODING_CATEGORY_ISO_7_MASK)
3704 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
3705 CODING_CATEGORY_ISO_8_1_MASK |
3706 CODING_CATEGORY_ISO_8_2_MASK);
3707 return mask;
3708 }
3709
3710 /* If FLAGS is a null pointer or specifies right-to-left motion,
3711 output a switch-dir-to-left-to-right sequence to DST.
3712 Also update FLAGS if it is not a null pointer.
3713 If INTERNAL_P is set, we are outputting in internal format and
3714 need to handle the CSI differently. */
3715
3716 static void
3717 restore_left_to_right_direction (struct Lisp_Coding_System *codesys,
3718 unsigned_char_dynarr *dst,
3719 unsigned int *flags,
3720 int internal_p)
3721 {
3722 if (!flags || (*flags & CODING_STATE_R2L))
3723 {
3724 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
3725 {
3726 Dynarr_add (dst, ISO_CODE_ESC);
3727 Dynarr_add (dst, '[');
3728 }
3729 else if (internal_p)
3730 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
3731 else
3732 Dynarr_add (dst, ISO_CODE_CSI);
3733 Dynarr_add (dst, '0');
3734 Dynarr_add (dst, ']');
3735 if (flags)
3736 *flags &= ~CODING_STATE_R2L;
3737 }
3738 }
3739
3740 /* If FLAGS is a null pointer or specifies a direction different from
3741 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
3742 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
3743 sequence to DST. Also update FLAGS if it is not a null pointer.
3744 If INTERNAL_P is set, we are outputting in internal format and
3745 need to handle the CSI differently. */
3746
3747 static void
3748 ensure_correct_direction (int direction, struct Lisp_Coding_System *codesys,
3749 unsigned_char_dynarr *dst, unsigned int *flags,
3750 int internal_p)
3751 {
3752 if ((!flags || (*flags & CODING_STATE_R2L)) &&
3753 direction == CHARSET_LEFT_TO_RIGHT)
3754 restore_left_to_right_direction (codesys, dst, flags, internal_p);
3755 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
3756 && (!flags || !(*flags & CODING_STATE_R2L)) &&
3757 direction == CHARSET_RIGHT_TO_LEFT)
3758 {
3759 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
3760 {
3761 Dynarr_add (dst, ISO_CODE_ESC);
3762 Dynarr_add (dst, '[');
3763 }
3764 else if (internal_p)
3765 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
3766 else
3767 Dynarr_add (dst, ISO_CODE_CSI);
3768 Dynarr_add (dst, '2');
3769 Dynarr_add (dst, ']');
3770 if (flags)
3771 *flags |= CODING_STATE_R2L;
3772 }
3773 }
3774
3775 /* Convert ISO2022-format data to internal format. */
3776
3777 static void
3778 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
3779 unsigned_char_dynarr *dst, unsigned int n)
3780 {
3781 unsigned char c;
3782 unsigned int flags, ch;
3783 enum eol_type eol_type;
3784 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3785 Lisp_Object coding_system = Qnil;
3786 unsigned_char_dynarr *real_dst = dst;
3787
3788 CODING_STREAM_DECOMPOSE (str, flags, ch);
3789 eol_type = str->eol_type;
3790 XSETCODING_SYSTEM (coding_system, str->codesys);
3791
3792 if (flags & CODING_STATE_COMPOSITE)
3793 dst = str->iso2022.composite_chars;
3794
3795 while (n--)
3796 {
3797 c = *src++;
3798 if (flags & CODING_STATE_ESCAPE)
3799 { /* Within ESC sequence */
3800 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
3801 c, &flags, 1);
3802
3803 if (retval)
3804 {
3805 switch (str->iso2022.esc)
3806 {
3807 case ISO_ESC_START_COMPOSITE:
3808 if (str->iso2022.composite_chars)
3809 Dynarr_reset (str->iso2022.composite_chars);
3810 else
3811 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
3812 dst = str->iso2022.composite_chars;
3813 break;
3814 case ISO_ESC_END_COMPOSITE:
3815 {
3816 Bufbyte comstr[MAX_EMCHAR_LEN];
3817 Bytecount len;
3818 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
3819 Dynarr_length (dst));
3820 dst = real_dst;
3821 len = set_charptr_emchar (comstr, emch);
3822 Dynarr_add_many (dst, comstr, len);
3823 break;
3824 }
3825
3826 case ISO_ESC_LITERAL:
3827 DECODE_ADD_BINARY_CHAR (c, dst);
3828 break;
3829
3830 default:
3831 /* Everything else handled already */
3832 break;
3833 }
3834 }
3835
3836 /* Attempted error recovery. */
3837 if (str->iso2022.output_direction_sequence)
3838 ensure_correct_direction (flags & CODING_STATE_R2L ?
3839 CHARSET_RIGHT_TO_LEFT :
3840 CHARSET_LEFT_TO_RIGHT,
3841 str->codesys, dst, 0, 1);
3842 /* More error recovery. */
3843 if (!retval || str->iso2022.output_literally)
3844 {
3845 /* Output the (possibly invalid) sequence */
3846 int i;
3847 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
3848 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
3849 flags &= CODING_STATE_ISO2022_LOCK;
3850 if (!retval)
3851 n++, src--;/* Repeat the loop with the same character. */
3852 else
3853 {
3854 /* No sense in reprocessing the final byte of the
3855 escape sequence; it could mess things up anyway.
3856 Just add it now. */
3857 DECODE_ADD_BINARY_CHAR (c, dst);
3858 }
3859 }
3860 ch = 0;
3861 }
3862 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
3863 { /* Control characters */
3864
3865 /***** Error-handling *****/
3866
3867 /* If we were in the middle of a character, dump out the
3868 partial character. */
3869 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3870
3871 /* If we just saw a single-shift character, dump it out.
3872 This may dump out the wrong sort of single-shift character,
3873 but least it will give an indication that something went
3874 wrong. */
3875 if (flags & CODING_STATE_SS2)
3876 {
3877 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
3878 flags &= ~CODING_STATE_SS2;
3879 }
3880 if (flags & CODING_STATE_SS3)
3881 {
3882 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
3883 flags &= ~CODING_STATE_SS3;
3884 }
3885
3886 /***** Now handle the control characters. *****/
3887
3888 /* Handle CR/LF */
3889 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3890
3891 flags &= CODING_STATE_ISO2022_LOCK;
3892
3893 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
3894 DECODE_ADD_BINARY_CHAR (c, dst);
3895 }
3896 else
3897 { /* Graphic characters */
3898 Lisp_Object charset;
3899 int lb;
3900 int reg;
3901
3902 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3903
3904 /* Now determine the charset. */
3905 reg = ((flags & CODING_STATE_SS2) ? 2
3906 : (flags & CODING_STATE_SS3) ? 3
3907 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
3908 : str->iso2022.register_left);
3909 charset = str->iso2022.charset[reg];
3910
3911 /* Error checking: */
3912 if (NILP (charset) || str->iso2022.invalid_designated[reg]
3913 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
3914 && XCHARSET_CHARS (charset) == 94))
3915 /* Mrmph. We are trying to invoke a register that has no
3916 or an invalid charset in it, or trying to add a character
3917 outside the range of the charset. Insert that char literally
3918 to preserve it for the output. */
3919 {
3920 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3921 DECODE_ADD_BINARY_CHAR (c, dst);
3922 }
3923
3924 else
3925 {
3926 /* Things are probably hunky-dorey. */
3927
3928 /* Fetch reverse charset, maybe. */
3929 if (((flags & CODING_STATE_R2L) &&
3930 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
3931 ||
3932 (!(flags & CODING_STATE_R2L) &&
3933 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
3934 {
3935 Lisp_Object new_charset =
3936 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
3937 if (!NILP (new_charset))
3938 charset = new_charset;
3939 }
3940
3941 lb = XCHARSET_LEADING_BYTE (charset);
3942 switch (XCHARSET_REP_BYTES (charset))
3943 {
3944 case 1: /* ASCII */
3945 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3946 Dynarr_add (dst, c & 0x7F);
3947 break;
3948
3949 case 2: /* one-byte official */
3950 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3951 Dynarr_add (dst, lb);
3952 Dynarr_add (dst, c | 0x80);
3953 break;
3954
3955 case 3: /* one-byte private or two-byte official */
3956 if (XCHARSET_PRIVATE_P (charset))
3957 {
3958 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3959 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
3960 Dynarr_add (dst, lb);
3961 Dynarr_add (dst, c | 0x80);
3962 }
3963 else
3964 {
3965 if (ch)
3966 {
3967 Dynarr_add (dst, lb);
3968 Dynarr_add (dst, ch | 0x80);
3969 Dynarr_add (dst, c | 0x80);
3970 ch = 0;
3971 }
3972 else
3973 ch = c;
3974 }
3975 break;
3976
3977 default: /* two-byte private */
3978 if (ch)
3979 {
3980 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
3981 Dynarr_add (dst, lb);
3982 Dynarr_add (dst, ch | 0x80);
3983 Dynarr_add (dst, c | 0x80);
3984 ch = 0;
3985 }
3986 else
3987 ch = c;
3988 }
3989 }
3990
3991 if (!ch)
3992 flags &= CODING_STATE_ISO2022_LOCK;
3993 }
3994
3995 label_continue_loop:;
3996 }
3997
3998 if (flags & CODING_STATE_END)
3999 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4000
4001 CODING_STREAM_COMPOSE (str, flags, ch);
4002 }
4003
4004
4005 /***** ISO2022 encoder *****/
4006
4007 /* Designate CHARSET into register REG. */
4008
4009 static void
4010 iso2022_designate (Lisp_Object charset, unsigned char reg,
4011 struct encoding_stream *str, unsigned_char_dynarr *dst)
4012 {
4013 CONST char *inter94 = "()*+", *inter96= ",-./";
4014 int type;
4015 unsigned char final;
4016 Lisp_Object old_charset = str->iso2022.charset[reg];
4017
4018 str->iso2022.charset[reg] = charset;
4019 if (!CHARSETP (charset))
4020 /* charset might be an initial nil or t. */
4021 return;
4022 type = XCHARSET_TYPE (charset);
4023 final = XCHARSET_FINAL (charset);
4024 if (!str->iso2022.force_charset_on_output[reg] &&
4025 CHARSETP (old_charset) &&
4026 XCHARSET_TYPE (old_charset) == type &&
4027 XCHARSET_FINAL (old_charset) == final)
4028 return;
4029
4030 str->iso2022.force_charset_on_output[reg] = 0;
4031
4032 {
4033 charset_conversion_spec_dynarr *dyn =
4034 str->codesys->iso2022.output_conv;
4035
4036 if (dyn)
4037 {
4038 int i;
4039
4040 for (i = 0; i < Dynarr_length (dyn); i++)
4041 {
4042 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4043 if (EQ (charset, spec->from_charset))
4044 charset = spec->to_charset;
4045 }
4046 }
4047 }
4048
4049 Dynarr_add (dst, ISO_CODE_ESC);
4050 switch (type)
4051 {
4052 case CHARSET_TYPE_94:
4053 Dynarr_add (dst, inter94[reg]);
4054 break;
4055 case CHARSET_TYPE_96:
4056 Dynarr_add (dst, inter96[reg]);
4057 break;
4058 case CHARSET_TYPE_94X94:
4059 Dynarr_add (dst, '$');
4060 if (reg != 0
4061 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4062 || final < '@'
4063 || final > 'B')
4064 Dynarr_add (dst, inter94[reg]);
4065 break;
4066 case CHARSET_TYPE_96X96:
4067 Dynarr_add (dst, '$');
4068 Dynarr_add (dst, inter96[reg]);
4069 break;
4070 }
4071 Dynarr_add (dst, final);
4072 }
4073
4074 static void
4075 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4076 {
4077 if (str->iso2022.register_left != 0)
4078 {
4079 Dynarr_add (dst, ISO_CODE_SI);
4080 str->iso2022.register_left = 0;
4081 }
4082 }
4083
4084 static void
4085 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4086 {
4087 if (str->iso2022.register_left != 1)
4088 {
4089 Dynarr_add (dst, ISO_CODE_SO);
4090 str->iso2022.register_left = 1;
4091 }
4092 }
4093
4094 /* Convert internally-formatted data to ISO2022 format. */
4095
4096 static void
4097 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
4098 unsigned_char_dynarr *dst, unsigned int n)
4099 {
4100 unsigned char charmask, c;
4101 unsigned int flags, ch;
4102 enum eol_type eol_type;
4103 unsigned char char_boundary;
4104 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4105 struct Lisp_Coding_System *codesys = str->codesys;
4106 int i;
4107 Lisp_Object charset;
4108 int half;
4109
4110 /* flags for handling composite chars. We do a little switcharoo
4111 on the source while we're outputting the composite char. */
4112 unsigned int saved_n = 0;
4113 CONST unsigned char *saved_src = NULL;
4114 int in_composite = 0;
4115
4116 CODING_STREAM_DECOMPOSE (str, flags, ch);
4117 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4118 char_boundary = str->iso2022.current_char_boundary;
4119 charset = str->iso2022.current_charset;
4120 half = str->iso2022.current_half;
4121
4122 back_to_square_n:
4123 while (n--)
4124 {
4125 c = *src++;
4126
4127 if (BYTE_ASCII_P (c))
4128 { /* Processing ASCII character */
4129 ch = 0;
4130
4131 restore_left_to_right_direction (codesys, dst, &flags, 0);
4132
4133 /* Make sure G0 contains ASCII */
4134 if ((c > ' ' && c < ISO_CODE_DEL) ||
4135 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4136 {
4137 ensure_normal_shift (str, dst);
4138 iso2022_designate (Vcharset_ascii, 0, str, dst);
4139 }
4140
4141 /* If necessary, restore everything to the default state
4142 at end-of-line */
4143 if (c == '\n' &&
4144 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4145 {
4146 restore_left_to_right_direction (codesys, dst, &flags, 0);
4147
4148 ensure_normal_shift (str, dst);
4149
4150 for (i = 0; i < 4; i++)
4151 {
4152 Lisp_Object initial_charset =
4153 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4154 iso2022_designate (initial_charset, i, str, dst);
4155 }
4156 }
4157 if (c == '\n')
4158 {
4159 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4160 Dynarr_add (dst, '\r');
4161 if (eol_type != EOL_CR)
4162 Dynarr_add (dst, c);
4163 }
4164 else
4165 {
4166 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4167 && fit_to_be_escape_quoted (c))
4168 Dynarr_add (dst, ISO_CODE_ESC);
4169 Dynarr_add (dst, c);
4170 }
4171 char_boundary = 1;
4172 }
4173
4174 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4175 { /* Processing Leading Byte */
4176 ch = 0;
4177 charset = CHARSET_BY_LEADING_BYTE (c);
4178 if (LEADING_BYTE_PREFIX_P(c))
4179 ch = c;
4180 else if (!EQ (charset, Vcharset_control_1)
4181 && !EQ (charset, Vcharset_composite))
4182 {
4183 int reg;
4184
4185 ensure_correct_direction (XCHARSET_DIRECTION (charset),
4186 codesys, dst, &flags, 0);
4187
4188 /* Now determine which register to use. */
4189 reg = -1;
4190 for (i = 0; i < 4; i++)
4191 {
4192 if (EQ (charset, str->iso2022.charset[i]) ||
4193 EQ (charset,
4194 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
4195 {
4196 reg = i;
4197 break;
4198 }
4199 }
4200
4201 if (reg == -1)
4202 {
4203 if (XCHARSET_GRAPHIC (charset) != 0)
4204 {
4205 if (!NILP (str->iso2022.charset[1]) &&
4206 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
4207 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
4208 reg = 1;
4209 else if (!NILP (str->iso2022.charset[2]))
4210 reg = 2;
4211 else if (!NILP (str->iso2022.charset[3]))
4212 reg = 3;
4213 else
4214 reg = 0;
4215 }
4216 else
4217 reg = 0;
4218 }
4219
4220 iso2022_designate (charset, reg, str, dst);
4221
4222 /* Now invoke that register. */
4223 switch (reg)
4224 {
4225 case 0:
4226 ensure_normal_shift (str, dst);
4227 half = 0;
4228 break;
4229
4230 case 1:
4231 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4232 {
4233 ensure_shift_out (str, dst);
4234 half = 0;
4235 }
4236 else
4237 half = 1;
4238 break;
4239
4240 case 2:
4241 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4242 {
4243 Dynarr_add (dst, ISO_CODE_ESC);
4244 Dynarr_add (dst, 'N');
4245 half = 0;
4246 }
4247 else
4248 {
4249 Dynarr_add (dst, ISO_CODE_SS2);
4250 half = 1;
4251 }
4252 break;
4253
4254 case 3:
4255 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4256 {
4257 Dynarr_add (dst, ISO_CODE_ESC);
4258 Dynarr_add (dst, 'O');
4259 half = 0;
4260 }
4261 else
4262 {
4263 Dynarr_add (dst, ISO_CODE_SS3);
4264 half = 1;
4265 }
4266 break;
4267
4268 default:
4269 abort ();
4270 }
4271 }
4272 char_boundary = 0;
4273 }
4274 else
4275 { /* Processing Non-ASCII character */
4276 charmask = (half == 0 ? 0x7F : 0xFF);
4277 char_boundary = 1;
4278 if (EQ (charset, Vcharset_control_1))
4279 {
4280 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4281 && fit_to_be_escape_quoted (c))
4282 Dynarr_add (dst, ISO_CODE_ESC);
4283 /* you asked for it ... */
4284 Dynarr_add (dst, c - 0x20);
4285 }
4286 else
4287 {
4288 switch (XCHARSET_REP_BYTES (charset))
4289 {
4290 case 2:
4291 Dynarr_add (dst, c & charmask);
4292 break;
4293 case 3:
4294 if (XCHARSET_PRIVATE_P (charset))
4295 {
4296 Dynarr_add (dst, c & charmask);
4297 ch = 0;
4298 }
4299 else if (ch)
4300 {
4301 if (EQ (charset, Vcharset_composite))
4302 {
4303 if (in_composite)
4304 {
4305 /* #### Bother! We don't know how to
4306 handle this yet. */
4307 Dynarr_add (dst, '~');
4308 }
4309 else
4310 {
4311 Emchar emch = MAKE_CHAR (Vcharset_composite,
4312 ch & 0x7F, c & 0x7F);
4313 Lisp_Object lstr = composite_char_string (emch);
4314 saved_n = n;
4315 saved_src = src;
4316 in_composite = 1;
4317 src = XSTRING_DATA (lstr);
4318 n = XSTRING_LENGTH (lstr);
4319 Dynarr_add (dst, ISO_CODE_ESC);
4320 Dynarr_add (dst, '0'); /* start composing */
4321 }
4322 }
4323 else
4324 {
4325 Dynarr_add (dst, ch & charmask);
4326 Dynarr_add (dst, c & charmask);
4327 }
4328 ch = 0;
4329 }
4330 else
4331 {
4332 ch = c;
4333 char_boundary = 0;
4334 }
4335 break;
4336 case 4:
4337 if (ch)
4338 {
4339 Dynarr_add (dst, ch & charmask);
4340 Dynarr_add (dst, c & charmask);
4341 ch = 0;
4342 }
4343 else
4344 {
4345 ch = c;
4346 char_boundary = 0;
4347 }
4348 break;
4349 default:
4350 abort ();
4351 }
4352 }
4353 }
4354 }
4355
4356 if (in_composite)
4357 {
4358 n = saved_n;
4359 src = saved_src;
4360 in_composite = 0;
4361 Dynarr_add (dst, ISO_CODE_ESC);
4362 Dynarr_add (dst, '1'); /* end composing */
4363 goto back_to_square_n; /* Wheeeeeeeee ..... */
4364 }
4365
4366 if (char_boundary && flags & CODING_STATE_END)
4367 {
4368 restore_left_to_right_direction (codesys, dst, &flags, 0);
4369 ensure_normal_shift (str, dst);
4370 for (i = 0; i < 4; i++)
4371 {
4372 Lisp_Object initial_charset =
4373 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4374 iso2022_designate (initial_charset, i, str, dst);
4375 }
4376 }
4377
4378 CODING_STREAM_COMPOSE (str, flags, ch);
4379 str->iso2022.current_char_boundary = char_boundary;
4380 str->iso2022.current_charset = charset;
4381 str->iso2022.current_half = half;
4382
4383 /* Verbum caro factum est! */
4384 }
4385 #endif /* MULE */
4386
4387 /************************************************************************/
4388 /* No-conversion methods */
4389 /************************************************************************/
4390
4391 /* This is used when reading in "binary" files -- i.e. files that may
4392 contain all 256 possible byte values and that are not to be
4393 interpreted as being in any particular decoding. */
4394 static void
4395 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
4396 unsigned_char_dynarr *dst, unsigned int n)
4397 {
4398 unsigned char c;
4399 unsigned int flags, ch;
4400 enum eol_type eol_type;
4401 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4402
4403 CODING_STREAM_DECOMPOSE (str, flags, ch);
4404 eol_type = str->eol_type;
4405
4406 while (n--)
4407 {
4408 c = *src++;
4409
4410 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4411 DECODE_ADD_BINARY_CHAR (c, dst);
4412 label_continue_loop:;
4413 }
4414
4415 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
4416
4417 CODING_STREAM_COMPOSE (str, flags, ch);
4418 }
4419
4420 static void
4421 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
4422 unsigned_char_dynarr *dst, unsigned int n)
4423 {
4424 unsigned char c;
4425 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4426 unsigned int flags, ch;
4427 enum eol_type eol_type;
4428
4429 CODING_STREAM_DECOMPOSE (str, flags, ch);
4430 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4431
4432 while (n--)
4433 {
4434 c = *src++;
4435 if (c == '\n')
4436 {
4437 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
4438 Dynarr_add (dst, '\r');
4439 if (eol_type != EOL_CR)
4440 Dynarr_add (dst, '\n');
4441 ch = 0;
4442 }
4443 else if (BYTE_ASCII_P (c))
4444 {
4445 assert (ch == 0);
4446 Dynarr_add (dst, c);
4447 }
4448 else if (BUFBYTE_LEADING_BYTE_P (c))
4449 {
4450 assert (ch == 0);
4451 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
4452 c == LEADING_BYTE_CONTROL_1)
4453 ch = c;
4454 else
4455 Dynarr_add (dst, '~'); /* untranslatable character */
4456 }
4457 else
4458 {
4459 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
4460 Dynarr_add (dst, c);
4461 else if (ch == LEADING_BYTE_CONTROL_1)
4462 {
4463 assert (c < 0xC0);
4464 Dynarr_add (dst, c - 0x20);
4465 }
4466 /* else it should be the second or third byte of an
4467 untranslatable character, so ignore it */
4468 ch = 0;
4469 }
4470 }
4471
4472 CODING_STREAM_COMPOSE (str, flags, ch);
4473 }
4474
4475
4476 /************************************************************************/
4477 /* Simple internal/external functions */
4478 /************************************************************************/
4479
4480 static Extbyte_dynarr *conversion_out_dynarr;
4481 static Bufbyte_dynarr *conversion_in_dynarr;
4482
4483 /* Determine coding system from coding format */
4484
4485 #define FILE_NAME_CODING_SYSTEM \
4486 ((NILP (Vfile_name_coding_system) || \
4487 (EQ ((Vfile_name_coding_system), Qbinary))) ? \
4488 Qnil : Fget_coding_system (Vfile_name_coding_system))
4489
4490 /* #### not correct for all values of `fmt'! */
4491 #define FMT_CODING_SYSTEM(fmt) \
4492 (((fmt) == FORMAT_FILENAME) ? FILE_NAME_CODING_SYSTEM : \
4493 ((fmt) == FORMAT_CTEXT ) ? Fget_coding_system (Qctext) : \
4494 ((fmt) == FORMAT_TERMINAL) ? FILE_NAME_CODING_SYSTEM : \
4495 Qnil)
4496
4497 extern CONST Extbyte *
4498 convert_to_external_format (CONST Bufbyte *ptr,
4499 Bytecount len,
4500 Extcount *len_out,
4501 enum external_data_format fmt)
4502 {
4503 #ifdef MULE
4504 Lisp_Object coding_system = FMT_CODING_SYSTEM (fmt);
4505 #else
4506 Lisp_Object coding_system = Qnil;
4507 #endif
4508
4509 if (!conversion_out_dynarr)
4510 conversion_out_dynarr = Dynarr_new (Extbyte);
4511 else
4512 Dynarr_reset (conversion_out_dynarr);
4513
4514 if (NILP (coding_system))
4515 {
4516 CONST Bufbyte *end = ptr + len;
4517
4518 for (; ptr < end;)
4519 {
4520 Bufbyte c =
4521 (BYTE_ASCII_P (*ptr)) ? *ptr :
4522 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
4523 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
4524 '~';
4525
4526 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
4527 INC_CHARPTR (ptr);
4528 }
4529
4530 #ifdef ERROR_CHECK_BUFPOS
4531 assert (ptr == end);
4532 #endif
4533 }
4534 else
4535 {
4536 Lisp_Object instream, outstream, da_outstream;
4537 Lstream *istr, *ostr;
4538 struct gcpro gcpro1, gcpro2, gcpro3;
4539 char tempbuf[1024]; /* some random amount */
4540
4541 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
4542 da_outstream = make_dynarr_output_stream
4543 ((unsigned_char_dynarr *) conversion_out_dynarr);
4544 outstream =
4545 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
4546 istr = XLSTREAM (instream);
4547 ostr = XLSTREAM (outstream);
4548 GCPRO3 (instream, outstream, da_outstream);
4549 while (1)
4550 {
4551 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
4552 if (!size_in_bytes)
4553 break;
4554 Lstream_write (ostr, tempbuf, size_in_bytes);
4555 }
4556 Lstream_close (istr);
4557 Lstream_close (ostr);
4558 UNGCPRO;
4559 Lstream_delete (istr);
4560 Lstream_delete (ostr);
4561 Lstream_delete (XLSTREAM (da_outstream));
4562 }
4563
4564 *len_out = Dynarr_length (conversion_out_dynarr);
4565 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
4566 return Dynarr_atp (conversion_out_dynarr, 0);
4567 }
4568
4569 extern CONST Bufbyte *
4570 convert_from_external_format (CONST Extbyte *ptr,
4571 Extcount len,
4572 Bytecount *len_out,
4573 enum external_data_format fmt)
4574 {
4575 #ifdef MULE
4576 Lisp_Object coding_system = FMT_CODING_SYSTEM (fmt);
4577 #else
4578 Lisp_Object coding_system = Qnil;
4579 #endif
4580
4581 if (!conversion_in_dynarr)
4582 conversion_in_dynarr = Dynarr_new (Bufbyte);
4583 else
4584 Dynarr_reset (conversion_in_dynarr);
4585
4586 if (NILP (coding_system))
4587 {
4588 CONST Extbyte *end = ptr + len;
4589 for (; ptr < end; ptr++)
4590 {
4591 Extbyte c = *ptr;
4592 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
4593 }
4594 }
4595 else
4596 {
4597 Lisp_Object instream, outstream, da_outstream;
4598 Lstream *istr, *ostr;
4599 struct gcpro gcpro1, gcpro2, gcpro3;
4600 char tempbuf[1024]; /* some random amount */
4601
4602 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
4603 da_outstream = make_dynarr_output_stream
4604 ((unsigned_char_dynarr *) conversion_in_dynarr);
4605 outstream =
4606 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
4607 istr = XLSTREAM (instream);
4608 ostr = XLSTREAM (outstream);
4609 GCPRO3 (instream, outstream, da_outstream);
4610 while (1)
4611 {
4612 int size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
4613 if (!size_in_bytes)
4614 break;
4615 Lstream_write (ostr, tempbuf, size_in_bytes);
4616 }
4617 Lstream_close (istr);
4618 Lstream_close (ostr);
4619 UNGCPRO;
4620 Lstream_delete (istr);
4621 Lstream_delete (ostr);
4622 Lstream_delete (XLSTREAM (da_outstream));
4623 }
4624
4625 *len_out = Dynarr_length (conversion_in_dynarr);
4626 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
4627 return Dynarr_atp (conversion_in_dynarr, 0);
4628 }
4629
4630
4631 /************************************************************************/
4632 /* Initialization */
4633 /************************************************************************/
4634
4635 void
4636 syms_of_mule_coding (void)
4637 {
4638 defsymbol (&Qbuffer_file_coding_system, "buffer-file-coding-system");
4639 deferror (&Qcoding_system_error, "coding-system-error",
4640 "Coding-system error", Qio_error);
4641
4642 DEFSUBR (Fcoding_system_p);
4643 DEFSUBR (Ffind_coding_system);
4644 DEFSUBR (Fget_coding_system);
4645 DEFSUBR (Fcoding_system_list);
4646 DEFSUBR (Fcoding_system_name);
4647 DEFSUBR (Fmake_coding_system);
4648 DEFSUBR (Fcopy_coding_system);
4649 DEFSUBR (Fsubsidiary_coding_system);
4650
4651 DEFSUBR (Fcoding_system_type);
4652 DEFSUBR (Fcoding_system_doc_string);
4653 #ifdef MULE
4654 DEFSUBR (Fcoding_system_charset);
4655 #endif
4656 DEFSUBR (Fcoding_system_property);
4657
4658 DEFSUBR (Fcoding_category_list);
4659 DEFSUBR (Fset_coding_priority_list);
4660 DEFSUBR (Fcoding_priority_list);
4661 DEFSUBR (Fset_coding_category_system);
4662 DEFSUBR (Fcoding_category_system);
4663
4664 DEFSUBR (Fdetect_coding_region);
4665 DEFSUBR (Fdecode_coding_region);
4666 DEFSUBR (Fencode_coding_region);
4667 #ifdef MULE
4668 DEFSUBR (Fdecode_shift_jis_char);
4669 DEFSUBR (Fencode_shift_jis_char);
4670 DEFSUBR (Fdecode_big5_char);
4671 DEFSUBR (Fencode_big5_char);
4672 #endif /* MULE */
4673 defsymbol (&Qcoding_system_p, "coding-system-p");
4674 defsymbol (&Qno_conversion, "no-conversion");
4675 #ifdef MULE
4676 defsymbol (&Qbig5, "big5");
4677 defsymbol (&Qshift_jis, "shift-jis");
4678 defsymbol (&Qccl, "ccl");
4679 defsymbol (&Qiso2022, "iso2022");
4680 #endif /* MULE */
4681 defsymbol (&Qmnemonic, "mnemonic");
4682 defsymbol (&Qeol_type, "eol-type");
4683 defsymbol (&Qpost_read_conversion, "post-read-conversion");
4684 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
4685
4686 defsymbol (&Qcr, "cr");
4687 defsymbol (&Qlf, "lf");
4688 defsymbol (&Qcrlf, "crlf");
4689 defsymbol (&Qeol_cr, "eol-cr");
4690 defsymbol (&Qeol_lf, "eol-lf");
4691 defsymbol (&Qeol_crlf, "eol-crlf");
4692 #ifdef MULE
4693 defsymbol (&Qcharset_g0, "charset-g0");
4694 defsymbol (&Qcharset_g1, "charset-g1");
4695 defsymbol (&Qcharset_g2, "charset-g2");
4696 defsymbol (&Qcharset_g3, "charset-g3");
4697 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
4698 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
4699 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
4700 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
4701 defsymbol (&Qno_iso6429, "no-iso6429");
4702 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
4703 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
4704 #endif
4705 defsymbol (&Qshort, "short");
4706 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
4707 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
4708 defsymbol (&Qseven, "seven");
4709 defsymbol (&Qlock_shift, "lock-shift");
4710 defsymbol (&Qescape_quoted, "escape-quoted");
4711
4712 defsymbol (&Qencode, "encode");
4713 defsymbol (&Qdecode, "decode");
4714
4715 #ifdef MULE
4716 defsymbol (&Qctext, "ctext");
4717 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
4718 "shift-jis");
4719 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
4720 "big5");
4721 #endif /* MULE */
4722 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
4723 "iso-7");
4724 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
4725 "iso-8-designate");
4726 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
4727 "iso-8-1");
4728 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
4729 "iso-8-2");
4730 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
4731 "iso-lock-shift");
4732 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
4733 "no-conversion");
4734 }
4735
4736 void
4737 lstream_type_create_mule_coding (void)
4738 {
4739 LSTREAM_HAS_METHOD (decoding, reader);
4740 LSTREAM_HAS_METHOD (decoding, writer);
4741 LSTREAM_HAS_METHOD (decoding, rewinder);
4742 LSTREAM_HAS_METHOD (decoding, seekable_p);
4743 LSTREAM_HAS_METHOD (decoding, flusher);
4744 LSTREAM_HAS_METHOD (decoding, closer);
4745 LSTREAM_HAS_METHOD (decoding, marker);
4746
4747 LSTREAM_HAS_METHOD (encoding, reader);
4748 LSTREAM_HAS_METHOD (encoding, writer);
4749 LSTREAM_HAS_METHOD (encoding, rewinder);
4750 LSTREAM_HAS_METHOD (encoding, seekable_p);
4751 LSTREAM_HAS_METHOD (encoding, flusher);
4752 LSTREAM_HAS_METHOD (encoding, closer);
4753 LSTREAM_HAS_METHOD (encoding, marker);
4754 }
4755
4756 void
4757 vars_of_mule_coding (void)
4758 {
4759 int i;
4760
4761 /* Initialize to something reasonable ... */
4762 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
4763 {
4764 coding_category_system[i] = Qnil;
4765 coding_category_by_priority[i] = i;
4766 }
4767
4768 Fprovide (intern ("file-coding"));
4769
4770 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
4771 Coding system used for TTY keyboard input.
4772 Not used under a windowing system.
4773 */ );
4774 Vkeyboard_coding_system = Qnil;
4775
4776 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
4777 Coding system used for TTY display output.
4778 Not used under a windowing system.
4779 */ );
4780 Vterminal_coding_system = Qnil;
4781
4782 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
4783 Overriding coding system used when writing a file or process.
4784 You should *bind* this, not set it. If this is non-nil, it specifies
4785 the coding system that will be used when a file or process is read
4786 in, and overrides `buffer-file-coding-system-for-read',
4787 `insert-file-contents-pre-hook', etc. Use those variables instead of
4788 this one for permanent changes to the environment.
4789 */ );
4790 Vcoding_system_for_read = Qnil;
4791
4792 DEFVAR_LISP ("coding-system-for-write",
4793 &Vcoding_system_for_write /*
4794 Overriding coding system used when writing a file or process.
4795 You should *bind* this, not set it. If this is non-nil, it specifies
4796 the coding system that will be used when a file or process is wrote
4797 in, and overrides `buffer-file-coding-system',
4798 `write-region-pre-hook', etc. Use those variables instead of this one
4799 for permanent changes to the environment.
4800 */ );
4801 Vcoding_system_for_write = Qnil;
4802
4803 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
4804 Coding system used to convert pathnames when accessing files.
4805 */ );
4806 Vfile_name_coding_system = Qnil;
4807
4808 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
4809 Non-nil means the buffer contents are regarded as multi-byte form
4810 of characters, not a binary code. This affects the display, file I/O,
4811 and behaviors of various editing commands.
4812
4813 Setting this to nil does not do anything.
4814 */ );
4815 enable_multibyte_characters = 1;
4816 }
4817
4818 void
4819 complex_vars_of_mule_coding (void)
4820 {
4821 staticpro (&Vcoding_system_hashtable);
4822 Vcoding_system_hashtable = make_lisp_hashtable (50, HASHTABLE_NONWEAK,
4823 HASHTABLE_EQ);
4824
4825 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
4826
4827 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
4828 { \
4829 struct codesys_prop csp; \
4830 csp.sym = (Sym); \
4831 csp.prop_type = (Prop_Type); \
4832 Dynarr_add (the_codesys_prop_dynarr, csp); \
4833 } while (0)
4834
4835 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
4836 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
4837 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
4838 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
4839 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
4840 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
4841 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
4842 #ifdef MULE
4843 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
4844 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
4845 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
4846 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
4847 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
4848 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
4849 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
4850 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
4851 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
4852 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
4853 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
4854 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
4855 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
4856 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
4857 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
4858 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
4859 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
4860
4861 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
4862 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
4863 #endif /* MULE */
4864 /* Need to create this here or we're really screwed. */
4865 Fmake_coding_system (Qno_conversion, Qno_conversion, build_string ("No conversion"),
4866 list2 (Qmnemonic, build_string ("Noconv")));
4867
4868 Fcopy_coding_system (Fcoding_system_property (Qno_conversion, Qeol_lf),
4869 Qbinary);
4870
4871 /* Need this for bootstrapping */
4872 coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
4873 Fget_coding_system (Qno_conversion);
4874 }