comparison src/file-coding.c @ 428:3ecd8885ac67 r21-2-22

Import from CVS: tag r21-2-22
author cvs
date Mon, 13 Aug 2007 11:28:15 +0200
parents
children 3a7e78e1142d
comparison
equal deleted inserted replaced
427:0a0253eac470 428:3ecd8885ac67
1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4
5 This file is part of XEmacs.
6
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
10 later version.
11
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /* Synched up with: Mule 2.3. Not in FSF. */
23
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */
25
26 #include <config.h>
27 #include "lisp.h"
28
29 #include "buffer.h"
30 #include "elhash.h"
31 #include "insdel.h"
32 #include "lstream.h"
33 #ifdef MULE
34 #include "mule-ccl.h"
35 #include "chartab.h"
36 #endif
37 #include "file-coding.h"
38
39 Lisp_Object Qcoding_system_error;
40
41 Lisp_Object Vkeyboard_coding_system;
42 Lisp_Object Vterminal_coding_system;
43 Lisp_Object Vcoding_system_for_read;
44 Lisp_Object Vcoding_system_for_write;
45 Lisp_Object Vfile_name_coding_system;
46
47 /* Table of symbols identifying each coding category. */
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
49
50
51
52 struct file_coding_dump {
53 /* Coding system currently associated with each coding category. */
54 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
55
56 /* Table of all coding categories in decreasing order of priority.
57 This describes a permutation of the possible coding categories. */
58 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
59
60 Lisp_Object ucs_to_mule_table[65536];
61 } *fcd;
62
63 static const struct lrecord_description fcd_description_1[] = {
64 { XD_LISP_OBJECT, offsetof(struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 },
65 { XD_LISP_OBJECT, offsetof(struct file_coding_dump, ucs_to_mule_table), 65536 },
66 { XD_END }
67 };
68
69 static const struct struct_description fcd_description = {
70 sizeof(struct file_coding_dump),
71 fcd_description_1
72 };
73
74 Lisp_Object mule_to_ucs_table;
75
76 Lisp_Object Qcoding_systemp;
77
78 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022;
79 /* Qinternal in general.c */
80
81 Lisp_Object Qmnemonic, Qeol_type;
82 Lisp_Object Qcr, Qcrlf, Qlf;
83 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
84 Lisp_Object Qpost_read_conversion;
85 Lisp_Object Qpre_write_conversion;
86
87 #ifdef MULE
88 Lisp_Object Qucs4, Qutf8;
89 Lisp_Object Qbig5, Qshift_jis;
90 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
91 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
92 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
93 Lisp_Object Qno_iso6429;
94 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
95 Lisp_Object Qctext, Qescape_quoted;
96 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
97 #endif
98 Lisp_Object Qencode, Qdecode;
99
100 Lisp_Object Vcoding_system_hash_table;
101
102 int enable_multibyte_characters;
103
104 #ifdef MULE
105 /* Additional information used by the ISO2022 decoder and detector. */
106 struct iso2022_decoder
107 {
108 /* CHARSET holds the character sets currently assigned to the G0
109 through G3 variables. It is initialized from the array
110 INITIAL_CHARSET in CODESYS. */
111 Lisp_Object charset[4];
112
113 /* Which registers are currently invoked into the left (GL) and
114 right (GR) halves of the 8-bit encoding space? */
115 int register_left, register_right;
116
117 /* ISO_ESC holds a value indicating part of an escape sequence
118 that has already been seen. */
119 enum iso_esc_flag esc;
120
121 /* This records the bytes we've seen so far in an escape sequence,
122 in case the sequence is invalid (we spit out the bytes unchanged). */
123 unsigned char esc_bytes[8];
124
125 /* Index for next byte to store in ISO escape sequence. */
126 int esc_bytes_index;
127
128 #ifdef ENABLE_COMPOSITE_CHARS
129 /* Stuff seen so far when composing a string. */
130 unsigned_char_dynarr *composite_chars;
131 #endif
132
133 /* If we saw an invalid designation sequence for a particular
134 register, we flag it here and switch to ASCII. The next time we
135 see a valid designation for this register, we turn off the flag
136 and do the designation normally, but pretend the sequence was
137 invalid. The effect of all this is that (most of the time) the
138 escape sequences for both the switch to the unknown charset, and
139 the switch back to the known charset, get inserted literally into
140 the buffer and saved out as such. The hope is that we can
141 preserve the escape sequences so that the resulting written out
142 file makes sense. If we don't do any of this, the designation
143 to the invalid charset will be preserved but that switch back
144 to the known charset will probably get eaten because it was
145 the same charset that was already present in the register. */
146 unsigned char invalid_designated[4];
147
148 /* We try to do similar things as above for direction-switching
149 sequences. If we encountered a direction switch while an
150 invalid designation was present, or an invalid designation
151 just after a direction switch (i.e. no valid designation
152 encountered yet), we insert the direction-switch escape
153 sequence literally into the output stream, and later on
154 insert the corresponding direction-restoring escape sequence
155 literally also. */
156 unsigned int switched_dir_and_no_valid_charset_yet :1;
157 unsigned int invalid_switch_dir :1;
158
159 /* Tells the decoder to output the escape sequence literally
160 even though it was valid. Used in the games we play to
161 avoid lossage when we encounter invalid designations. */
162 unsigned int output_literally :1;
163 /* We encountered a direction switch followed by an invalid
164 designation. We didn't output the direction switch
165 literally because we didn't know about the invalid designation;
166 but we have to do so now. */
167 unsigned int output_direction_sequence :1;
168 };
169 #endif /* MULE */
170 EXFUN (Fcopy_coding_system, 2);
171 #ifdef MULE
172 struct detection_state;
173 static int detect_coding_sjis (struct detection_state *st,
174 CONST unsigned char *src,
175 unsigned int n);
176 static void decode_coding_sjis (Lstream *decoding,
177 CONST unsigned char *src,
178 unsigned_char_dynarr *dst,
179 unsigned int n);
180 static void encode_coding_sjis (Lstream *encoding,
181 CONST unsigned char *src,
182 unsigned_char_dynarr *dst,
183 unsigned int n);
184 static int detect_coding_big5 (struct detection_state *st,
185 CONST unsigned char *src,
186 unsigned int n);
187 static void decode_coding_big5 (Lstream *decoding,
188 CONST unsigned char *src,
189 unsigned_char_dynarr *dst, unsigned int n);
190 static void encode_coding_big5 (Lstream *encoding,
191 CONST unsigned char *src,
192 unsigned_char_dynarr *dst, unsigned int n);
193 static int detect_coding_ucs4 (struct detection_state *st,
194 CONST unsigned char *src,
195 unsigned int n);
196 static void decode_coding_ucs4 (Lstream *decoding,
197 CONST unsigned char *src,
198 unsigned_char_dynarr *dst, unsigned int n);
199 static void encode_coding_ucs4 (Lstream *encoding,
200 CONST unsigned char *src,
201 unsigned_char_dynarr *dst, unsigned int n);
202 static int detect_coding_utf8 (struct detection_state *st,
203 CONST unsigned char *src,
204 unsigned int n);
205 static void decode_coding_utf8 (Lstream *decoding,
206 CONST unsigned char *src,
207 unsigned_char_dynarr *dst, unsigned int n);
208 static void encode_coding_utf8 (Lstream *encoding,
209 CONST unsigned char *src,
210 unsigned_char_dynarr *dst, unsigned int n);
211 static int postprocess_iso2022_mask (int mask);
212 static void reset_iso2022 (Lisp_Object coding_system,
213 struct iso2022_decoder *iso);
214 static int detect_coding_iso2022 (struct detection_state *st,
215 CONST unsigned char *src,
216 unsigned int n);
217 static void decode_coding_iso2022 (Lstream *decoding,
218 CONST unsigned char *src,
219 unsigned_char_dynarr *dst, unsigned int n);
220 static void encode_coding_iso2022 (Lstream *encoding,
221 CONST unsigned char *src,
222 unsigned_char_dynarr *dst, unsigned int n);
223 #endif /* MULE */
224 static void decode_coding_no_conversion (Lstream *decoding,
225 CONST unsigned char *src,
226 unsigned_char_dynarr *dst,
227 unsigned int n);
228 static void encode_coding_no_conversion (Lstream *encoding,
229 CONST unsigned char *src,
230 unsigned_char_dynarr *dst,
231 unsigned int n);
232 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
233 unsigned_char_dynarr *dst, unsigned int n);
234 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
235 unsigned_char_dynarr *dst, unsigned int n);
236
237 typedef struct codesys_prop codesys_prop;
238 struct codesys_prop
239 {
240 Lisp_Object sym;
241 int prop_type;
242 };
243
244 typedef struct
245 {
246 Dynarr_declare (codesys_prop);
247 } codesys_prop_dynarr;
248
249 static const struct lrecord_description codesys_prop_description_1[] = {
250 { XD_LISP_OBJECT, offsetof(codesys_prop, sym), 1 },
251 { XD_END }
252 };
253
254 static const struct struct_description codesys_prop_description = {
255 sizeof(codesys_prop),
256 codesys_prop_description_1
257 };
258
259 static const struct lrecord_description codesys_prop_dynarr_description_1[] = {
260 XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description),
261 { XD_END }
262 };
263
264 static const struct struct_description codesys_prop_dynarr_description = {
265 sizeof(codesys_prop_dynarr),
266 codesys_prop_dynarr_description_1
267 };
268
269 codesys_prop_dynarr *the_codesys_prop_dynarr;
270
271 enum codesys_prop_enum
272 {
273 CODESYS_PROP_ALL_OK,
274 CODESYS_PROP_ISO2022,
275 CODESYS_PROP_CCL
276 };
277
278
279 /************************************************************************/
280 /* Coding system functions */
281 /************************************************************************/
282
283 static Lisp_Object mark_coding_system (Lisp_Object);
284 static void print_coding_system (Lisp_Object, Lisp_Object, int);
285 static void finalize_coding_system (void *header, int for_disksave);
286
287 #ifdef MULE
288 static const struct lrecord_description ccs_description_1[] = {
289 { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 },
290 { XD_END }
291 };
292
293 static const struct struct_description ccs_description = {
294 sizeof(charset_conversion_spec),
295 ccs_description_1
296 };
297
298 static const struct lrecord_description ccsd_description_1[] = {
299 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description),
300 { XD_END }
301 };
302
303 static const struct struct_description ccsd_description = {
304 sizeof(charset_conversion_spec_dynarr),
305 ccsd_description_1
306 };
307 #endif
308
309 static const struct lrecord_description coding_system_description[] = {
310 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 },
311 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 },
312 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 },
313 #ifdef MULE
314 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 },
315 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description },
316 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description },
317 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 },
318 #endif
319 { XD_END }
320 };
321
322 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
323 mark_coding_system, print_coding_system,
324 finalize_coding_system,
325 0, 0, coding_system_description,
326 struct Lisp_Coding_System);
327
328 static Lisp_Object
329 mark_coding_system (Lisp_Object obj)
330 {
331 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
332
333 mark_object (CODING_SYSTEM_NAME (codesys));
334 mark_object (CODING_SYSTEM_DOC_STRING (codesys));
335 mark_object (CODING_SYSTEM_MNEMONIC (codesys));
336 mark_object (CODING_SYSTEM_EOL_LF (codesys));
337 mark_object (CODING_SYSTEM_EOL_CRLF (codesys));
338 mark_object (CODING_SYSTEM_EOL_CR (codesys));
339
340 switch (CODING_SYSTEM_TYPE (codesys))
341 {
342 #ifdef MULE
343 int i;
344 case CODESYS_ISO2022:
345 for (i = 0; i < 4; i++)
346 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
347 if (codesys->iso2022.input_conv)
348 {
349 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
350 {
351 struct charset_conversion_spec *ccs =
352 Dynarr_atp (codesys->iso2022.input_conv, i);
353 mark_object (ccs->from_charset);
354 mark_object (ccs->to_charset);
355 }
356 }
357 if (codesys->iso2022.output_conv)
358 {
359 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
360 {
361 struct charset_conversion_spec *ccs =
362 Dynarr_atp (codesys->iso2022.output_conv, i);
363 mark_object (ccs->from_charset);
364 mark_object (ccs->to_charset);
365 }
366 }
367 break;
368
369 case CODESYS_CCL:
370 mark_object (CODING_SYSTEM_CCL_DECODE (codesys));
371 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys));
372 break;
373 #endif /* MULE */
374 default:
375 break;
376 }
377
378 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
379 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
380 }
381
382 static void
383 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
384 int escapeflag)
385 {
386 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
387 if (print_readably)
388 error ("printing unreadable object #<coding_system 0x%x>",
389 c->header.uid);
390
391 write_c_string ("#<coding_system ", printcharfun);
392 print_internal (c->name, printcharfun, 1);
393 write_c_string (">", printcharfun);
394 }
395
396 static void
397 finalize_coding_system (void *header, int for_disksave)
398 {
399 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
400 /* Since coding systems never go away, this function is not
401 necessary. But it would be necessary if we changed things
402 so that coding systems could go away. */
403 if (!for_disksave) /* see comment in lstream.c */
404 {
405 switch (CODING_SYSTEM_TYPE (c))
406 {
407 #ifdef MULE
408 case CODESYS_ISO2022:
409 if (c->iso2022.input_conv)
410 {
411 Dynarr_free (c->iso2022.input_conv);
412 c->iso2022.input_conv = 0;
413 }
414 if (c->iso2022.output_conv)
415 {
416 Dynarr_free (c->iso2022.output_conv);
417 c->iso2022.output_conv = 0;
418 }
419 break;
420 #endif /* MULE */
421 default:
422 break;
423 }
424 }
425 }
426
427 static enum eol_type
428 symbol_to_eol_type (Lisp_Object symbol)
429 {
430 CHECK_SYMBOL (symbol);
431 if (NILP (symbol)) return EOL_AUTODETECT;
432 if (EQ (symbol, Qlf)) return EOL_LF;
433 if (EQ (symbol, Qcrlf)) return EOL_CRLF;
434 if (EQ (symbol, Qcr)) return EOL_CR;
435
436 signal_simple_error ("Unrecognized eol type", symbol);
437 return EOL_AUTODETECT; /* not reached */
438 }
439
440 static Lisp_Object
441 eol_type_to_symbol (enum eol_type type)
442 {
443 switch (type)
444 {
445 default: abort ();
446 case EOL_LF: return Qlf;
447 case EOL_CRLF: return Qcrlf;
448 case EOL_CR: return Qcr;
449 case EOL_AUTODETECT: return Qnil;
450 }
451 }
452
453 static void
454 setup_eol_coding_systems (Lisp_Coding_System *codesys)
455 {
456 Lisp_Object codesys_obj;
457 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
458 char *codesys_name = (char *) alloca (len + 7);
459 int mlen = -1;
460 char *codesys_mnemonic=0;
461
462 Lisp_Object codesys_name_sym, sub_codesys_obj;
463
464 /* kludge */
465
466 XSETCODING_SYSTEM (codesys_obj, codesys);
467
468 memcpy (codesys_name,
469 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
470
471 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys)))
472 {
473 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys));
474 codesys_mnemonic = (char *) alloca (mlen + 7);
475 memcpy (codesys_mnemonic,
476 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen);
477 }
478
479 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \
480 strcpy (codesys_name + len, "-" op_sys); \
481 if (mlen != -1) \
482 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \
483 codesys_name_sym = intern (codesys_name); \
484 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
485 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
486 if (mlen != -1) \
487 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \
488 build_string (codesys_mnemonic); \
489 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
490 } while (0)
491
492 DEFINE_SUB_CODESYS("unix", "", EOL_LF);
493 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF);
494 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR);
495 }
496
497 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
498 Return t if OBJECT is a coding system.
499 A coding system is an object that defines how text containing multiple
500 character sets is encoded into a stream of (typically 8-bit) bytes.
501 The coding system is used to decode the stream into a series of
502 characters (which may be from multiple charsets) when the text is read
503 from a file or process, and is used to encode the text back into the
504 same format when it is written out to a file or process.
505
506 For example, many ISO2022-compliant coding systems (such as Compound
507 Text, which is used for inter-client data under the X Window System)
508 use escape sequences to switch between different charsets -- Japanese
509 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked
510 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See
511 `make-coding-system' for more information.
512
513 Coding systems are normally identified using a symbol, and the
514 symbol is accepted in place of the actual coding system object whenever
515 a coding system is called for. (This is similar to how faces work.)
516 */
517 (object))
518 {
519 return CODING_SYSTEMP (object) ? Qt : Qnil;
520 }
521
522 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
523 Retrieve the coding system of the given name.
524
525 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
526 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
527 If there is no such coding system, nil is returned. Otherwise the
528 associated coding system object is returned.
529 */
530 (coding_system_or_name))
531 {
532 if (CODING_SYSTEMP (coding_system_or_name))
533 return coding_system_or_name;
534
535 if (NILP (coding_system_or_name))
536 coding_system_or_name = Qbinary;
537 else
538 CHECK_SYMBOL (coding_system_or_name);
539
540 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil);
541 }
542
543 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
544 Retrieve the coding system of the given name.
545 Same as `find-coding-system' except that if there is no such
546 coding system, an error is signaled instead of returning nil.
547 */
548 (name))
549 {
550 Lisp_Object coding_system = Ffind_coding_system (name);
551
552 if (NILP (coding_system))
553 signal_simple_error ("No such coding system", name);
554 return coding_system;
555 }
556
557 /* We store the coding systems in hash tables with the names as the key and the
558 actual coding system object as the value. Occasionally we need to use them
559 in a list format. These routines provide us with that. */
560 struct coding_system_list_closure
561 {
562 Lisp_Object *coding_system_list;
563 };
564
565 static int
566 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value,
567 void *coding_system_list_closure)
568 {
569 /* This function can GC */
570 struct coding_system_list_closure *cscl =
571 (struct coding_system_list_closure *) coding_system_list_closure;
572 Lisp_Object *coding_system_list = cscl->coding_system_list;
573
574 *coding_system_list = Fcons (XCODING_SYSTEM (value)->name,
575 *coding_system_list);
576 return 0;
577 }
578
579 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
580 Return a list of the names of all defined coding systems.
581 */
582 ())
583 {
584 Lisp_Object coding_system_list = Qnil;
585 struct gcpro gcpro1;
586 struct coding_system_list_closure coding_system_list_closure;
587
588 GCPRO1 (coding_system_list);
589 coding_system_list_closure.coding_system_list = &coding_system_list;
590 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table,
591 &coding_system_list_closure);
592 UNGCPRO;
593
594 return coding_system_list;
595 }
596
597 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
598 Return the name of the given coding system.
599 */
600 (coding_system))
601 {
602 coding_system = Fget_coding_system (coding_system);
603 return XCODING_SYSTEM_NAME (coding_system);
604 }
605
606 static Lisp_Coding_System *
607 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
608 {
609 Lisp_Coding_System *codesys =
610 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system);
611
612 zero_lcrecord (codesys);
613 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
614 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
615 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
616 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
617 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
618 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
619 CODING_SYSTEM_TYPE (codesys) = type;
620 CODING_SYSTEM_MNEMONIC (codesys) = Qnil;
621 #ifdef MULE
622 if (type == CODESYS_ISO2022)
623 {
624 int i;
625 for (i = 0; i < 4; i++)
626 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
627 }
628 else if (type == CODESYS_CCL)
629 {
630 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
631 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
632 }
633 #endif /* MULE */
634 CODING_SYSTEM_NAME (codesys) = name;
635
636 return codesys;
637 }
638
639 #ifdef MULE
640 /* Given a list of charset conversion specs as specified in a Lisp
641 program, parse it into STORE_HERE. */
642
643 static void
644 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
645 Lisp_Object spec_list)
646 {
647 Lisp_Object rest;
648
649 EXTERNAL_LIST_LOOP (rest, spec_list)
650 {
651 Lisp_Object car = XCAR (rest);
652 Lisp_Object from, to;
653 struct charset_conversion_spec spec;
654
655 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
656 signal_simple_error ("Invalid charset conversion spec", car);
657 from = Fget_charset (XCAR (car));
658 to = Fget_charset (XCAR (XCDR (car)));
659 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
660 signal_simple_error_2
661 ("Attempted conversion between different charset types",
662 from, to);
663 spec.from_charset = from;
664 spec.to_charset = to;
665
666 Dynarr_add (store_here, spec);
667 }
668 }
669
670 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
671 specs, return the equivalent as the Lisp programmer would see it.
672
673 If LOAD_HERE is 0, return Qnil. */
674
675 static Lisp_Object
676 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
677 {
678 int i;
679 Lisp_Object result;
680
681 if (!load_here)
682 return Qnil;
683 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
684 {
685 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
686 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
687 }
688
689 return Fnreverse (result);
690 }
691
692 #endif /* MULE */
693
694 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
695 Register symbol NAME as a coding system.
696
697 TYPE describes the conversion method used and should be one of
698
699 nil or 'undecided
700 Automatic conversion. XEmacs attempts to detect the coding system
701 used in the file.
702 'no-conversion
703 No conversion. Use this for binary files and such. On output,
704 graphic characters that are not in ASCII or Latin-1 will be
705 replaced by a ?. (For a no-conversion-encoded buffer, these
706 characters will only be present if you explicitly insert them.)
707 'shift-jis
708 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
709 'ucs-4
710 ISO 10646 UCS-4 encoding.
711 'utf-8
712 ISO 10646 UTF-8 encoding.
713 'iso2022
714 Any ISO2022-compliant encoding. Among other things, this includes
715 JIS (the Japanese encoding commonly used for e-mail), EUC (the
716 standard Unix encoding for Japanese and other languages), and
717 Compound Text (the encoding used in X11). You can specify more
718 specific information about the conversion with the FLAGS argument.
719 'big5
720 Big5 (the encoding commonly used for Taiwanese).
721 'ccl
722 The conversion is performed using a user-written pseudo-code
723 program. CCL (Code Conversion Language) is the name of this
724 pseudo-code.
725 'internal
726 Write out or read in the raw contents of the memory representing
727 the buffer's text. This is primarily useful for debugging
728 purposes, and is only enabled when XEmacs has been compiled with
729 DEBUG_XEMACS defined (via the --debug configure option).
730 WARNING: Reading in a file using 'internal conversion can result
731 in an internal inconsistency in the memory representing a
732 buffer's text, which will produce unpredictable results and may
733 cause XEmacs to crash. Under normal circumstances you should
734 never use 'internal conversion.
735
736 DOC-STRING is a string describing the coding system.
737
738 PROPS is a property list, describing the specific nature of the
739 character set. Recognized properties are:
740
741 'mnemonic
742 String to be displayed in the modeline when this coding system is
743 active.
744
745 'eol-type
746 End-of-line conversion to be used. It should be one of
747
748 nil
749 Automatically detect the end-of-line type (LF, CRLF,
750 or CR). Also generate subsidiary coding systems named
751 `NAME-unix', `NAME-dos', and `NAME-mac', that are
752 identical to this coding system but have an EOL-TYPE
753 value of 'lf, 'crlf, and 'cr, respectively.
754 'lf
755 The end of a line is marked externally using ASCII LF.
756 Since this is also the way that XEmacs represents an
757 end-of-line internally, specifying this option results
758 in no end-of-line conversion. This is the standard
759 format for Unix text files.
760 'crlf
761 The end of a line is marked externally using ASCII
762 CRLF. This is the standard format for MS-DOS text
763 files.
764 'cr
765 The end of a line is marked externally using ASCII CR.
766 This is the standard format for Macintosh text files.
767 t
768 Automatically detect the end-of-line type but do not
769 generate subsidiary coding systems. (This value is
770 converted to nil when stored internally, and
771 `coding-system-property' will return nil.)
772
773 'post-read-conversion
774 Function called after a file has been read in, to perform the
775 decoding. Called with two arguments, BEG and END, denoting
776 a region of the current buffer to be decoded.
777
778 'pre-write-conversion
779 Function called before a file is written out, to perform the
780 encoding. Called with two arguments, BEG and END, denoting
781 a region of the current buffer to be encoded.
782
783
784 The following additional properties are recognized if TYPE is 'iso2022:
785
786 'charset-g0
787 'charset-g1
788 'charset-g2
789 'charset-g3
790 The character set initially designated to the G0 - G3 registers.
791 The value should be one of
792
793 -- A charset object (designate that character set)
794 -- nil (do not ever use this register)
795 -- t (no character set is initially designated to
796 the register, but may be later on; this automatically
797 sets the corresponding `force-g*-on-output' property)
798
799 'force-g0-on-output
800 'force-g1-on-output
801 'force-g2-on-output
802 'force-g2-on-output
803 If non-nil, send an explicit designation sequence on output before
804 using the specified register.
805
806 'short
807 If non-nil, use the short forms "ESC $ @", "ESC $ A", and
808 "ESC $ B" on output in place of the full designation sequences
809 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B".
810
811 'no-ascii-eol
812 If non-nil, don't designate ASCII to G0 at each end of line on output.
813 Setting this to non-nil also suppresses other state-resetting that
814 normally happens at the end of a line.
815
816 'no-ascii-cntl
817 If non-nil, don't designate ASCII to G0 before control chars on output.
818
819 'seven
820 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
821 environment.
822
823 'lock-shift
824 If non-nil, use locking-shift (SO/SI) instead of single-shift
825 or designation by escape sequence.
826
827 'no-iso6429
828 If non-nil, don't use ISO6429's direction specification.
829
830 'escape-quoted
831 If non-nil, literal control characters that are the same as
832 the beginning of a recognized ISO2022 or ISO6429 escape sequence
833 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
834 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character
835 so that they can be properly distinguished from an escape sequence.
836 (Note that doing this results in a non-portable encoding.) This
837 encoding flag is used for byte-compiled files. Note that ESC
838 is a good choice for a quoting character because there are no
839 escape sequences whose second byte is a character from the Control-0
840 or Control-1 character sets; this is explicitly disallowed by the
841 ISO2022 standard.
842
843 'input-charset-conversion
844 A list of conversion specifications, specifying conversion of
845 characters in one charset to another when decoding is performed.
846 Each specification is a list of two elements: the source charset,
847 and the destination charset.
848
849 'output-charset-conversion
850 A list of conversion specifications, specifying conversion of
851 characters in one charset to another when encoding is performed.
852 The form of each specification is the same as for
853 'input-charset-conversion.
854
855
856 The following additional properties are recognized (and required)
857 if TYPE is 'ccl:
858
859 'decode
860 CCL program used for decoding (converting to internal format).
861
862 'encode
863 CCL program used for encoding (converting to external format).
864 */
865 (name, type, doc_string, props))
866 {
867 Lisp_Coding_System *codesys;
868 Lisp_Object rest, key, value;
869 enum coding_system_type ty;
870 int need_to_setup_eol_systems = 1;
871
872 /* Convert type to constant */
873 if (NILP (type) || EQ (type, Qundecided))
874 { ty = CODESYS_AUTODETECT; }
875 #ifdef MULE
876 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
877 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
878 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
879 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
880 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
881 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
882 #endif
883 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
884 #ifdef DEBUG_XEMACS
885 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
886 #endif
887 else
888 signal_simple_error ("Invalid coding system type", type);
889
890 CHECK_SYMBOL (name);
891
892 codesys = allocate_coding_system (ty, name);
893
894 if (NILP (doc_string))
895 doc_string = build_string ("");
896 else
897 CHECK_STRING (doc_string);
898 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
899
900 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
901 {
902 if (EQ (key, Qmnemonic))
903 {
904 if (!NILP (value))
905 CHECK_STRING (value);
906 CODING_SYSTEM_MNEMONIC (codesys) = value;
907 }
908
909 else if (EQ (key, Qeol_type))
910 {
911 need_to_setup_eol_systems = NILP (value);
912 if (EQ (value, Qt))
913 value = Qnil;
914 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
915 }
916
917 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
918 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
919 #ifdef MULE
920 else if (ty == CODESYS_ISO2022)
921 {
922 #define FROB_INITIAL_CHARSET(charset_num) \
923 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
924 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
925
926 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
927 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
928 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
929 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
930
931 #define FROB_FORCE_CHARSET(charset_num) \
932 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
933
934 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
935 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
936 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
937 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
938
939 #define FROB_BOOLEAN_PROPERTY(prop) \
940 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
941
942 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
943 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
944 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
945 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
946 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
947 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
948 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
949
950 else if (EQ (key, Qinput_charset_conversion))
951 {
952 codesys->iso2022.input_conv =
953 Dynarr_new (charset_conversion_spec);
954 parse_charset_conversion_specs (codesys->iso2022.input_conv,
955 value);
956 }
957 else if (EQ (key, Qoutput_charset_conversion))
958 {
959 codesys->iso2022.output_conv =
960 Dynarr_new (charset_conversion_spec);
961 parse_charset_conversion_specs (codesys->iso2022.output_conv,
962 value);
963 }
964 else
965 signal_simple_error ("Unrecognized property", key);
966 }
967 else if (EQ (type, Qccl))
968 {
969 if (EQ (key, Qdecode))
970 {
971 CHECK_VECTOR (value);
972 CODING_SYSTEM_CCL_DECODE (codesys) = value;
973 }
974 else if (EQ (key, Qencode))
975 {
976 CHECK_VECTOR (value);
977 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
978 }
979 else
980 signal_simple_error ("Unrecognized property", key);
981 }
982 #endif /* MULE */
983 else
984 signal_simple_error ("Unrecognized property", key);
985 }
986
987 if (need_to_setup_eol_systems)
988 setup_eol_coding_systems (codesys);
989
990 {
991 Lisp_Object codesys_obj;
992 XSETCODING_SYSTEM (codesys_obj, codesys);
993 Fputhash (name, codesys_obj, Vcoding_system_hash_table);
994 return codesys_obj;
995 }
996 }
997
998 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
999 Copy OLD-CODING-SYSTEM to NEW-NAME.
1000 If NEW-NAME does not name an existing coding system, a new one will
1001 be created.
1002 */
1003 (old_coding_system, new_name))
1004 {
1005 Lisp_Object new_coding_system;
1006 old_coding_system = Fget_coding_system (old_coding_system);
1007 new_coding_system = Ffind_coding_system (new_name);
1008 if (NILP (new_coding_system))
1009 {
1010 XSETCODING_SYSTEM (new_coding_system,
1011 allocate_coding_system
1012 (XCODING_SYSTEM_TYPE (old_coding_system),
1013 new_name));
1014 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
1015 }
1016
1017 {
1018 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
1019 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
1020 memcpy (((char *) to ) + sizeof (to->header),
1021 ((char *) from) + sizeof (from->header),
1022 sizeof (*from) - sizeof (from->header));
1023 to->name = new_name;
1024 }
1025 return new_coding_system;
1026 }
1027
1028 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /*
1029 Define symbol ALIAS as an alias for coding system CODING-SYSTEM.
1030 */
1031 (alias, coding_system))
1032 {
1033 CHECK_SYMBOL (alias);
1034 if (!NILP (Ffind_coding_system (alias)))
1035 signal_simple_error ("Symbol already names a coding system", alias);
1036 coding_system = Fget_coding_system (coding_system);
1037 Fputhash (alias, coding_system, Vcoding_system_hash_table);
1038
1039 /* Set up aliases for subsidiaries. */
1040 if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1041 {
1042 Lisp_Object str;
1043 XSETSTRING (str, symbol_name (XSYMBOL (alias)));
1044 #define FROB(type, name) \
1045 do { \
1046 Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \
1047 if (!NILP (subsidiary)) \
1048 Fdefine_coding_system_alias \
1049 (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \
1050 } while (0)
1051 FROB (LF, "-unix");
1052 FROB (CRLF, "-dos");
1053 FROB (CR, "-mac");
1054 #undef FROB
1055 }
1056 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac],
1057 but it doesn't look intentional, so I'd rather return something
1058 meaningful or nothing at all. */
1059 return Qnil;
1060 }
1061
1062 static Lisp_Object
1063 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
1064 {
1065 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
1066 Lisp_Object new_coding_system;
1067
1068 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1069 return coding_system;
1070
1071 switch (type)
1072 {
1073 case EOL_AUTODETECT: return coding_system;
1074 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
1075 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
1076 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
1077 default: abort ();
1078 }
1079
1080 return NILP (new_coding_system) ? coding_system : new_coding_system;
1081 }
1082
1083 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
1084 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
1085 */
1086 (coding_system, eol_type))
1087 {
1088 coding_system = Fget_coding_system (coding_system);
1089
1090 return subsidiary_coding_system (coding_system,
1091 symbol_to_eol_type (eol_type));
1092 }
1093
1094
1095 /************************************************************************/
1096 /* Coding system accessors */
1097 /************************************************************************/
1098
1099 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
1100 Return the doc string for CODING-SYSTEM.
1101 */
1102 (coding_system))
1103 {
1104 coding_system = Fget_coding_system (coding_system);
1105 return XCODING_SYSTEM_DOC_STRING (coding_system);
1106 }
1107
1108 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
1109 Return the type of CODING-SYSTEM.
1110 */
1111 (coding_system))
1112 {
1113 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
1114 {
1115 default: abort ();
1116 case CODESYS_AUTODETECT: return Qundecided;
1117 #ifdef MULE
1118 case CODESYS_SHIFT_JIS: return Qshift_jis;
1119 case CODESYS_ISO2022: return Qiso2022;
1120 case CODESYS_BIG5: return Qbig5;
1121 case CODESYS_UCS4: return Qucs4;
1122 case CODESYS_UTF8: return Qutf8;
1123 case CODESYS_CCL: return Qccl;
1124 #endif
1125 case CODESYS_NO_CONVERSION: return Qno_conversion;
1126 #ifdef DEBUG_XEMACS
1127 case CODESYS_INTERNAL: return Qinternal;
1128 #endif
1129 }
1130 }
1131
1132 #ifdef MULE
1133 static
1134 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum)
1135 {
1136 Lisp_Object cs
1137 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
1138
1139 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
1140 }
1141
1142 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /*
1143 Return initial charset of CODING-SYSTEM designated to GNUM.
1144 GNUM allows 0 .. 3.
1145 */
1146 (coding_system, gnum))
1147 {
1148 coding_system = Fget_coding_system (coding_system);
1149 CHECK_INT (gnum);
1150
1151 return coding_system_charset (coding_system, XINT (gnum));
1152 }
1153 #endif /* MULE */
1154
1155 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
1156 Return the PROP property of CODING-SYSTEM.
1157 */
1158 (coding_system, prop))
1159 {
1160 int i, ok = 0;
1161 enum coding_system_type type;
1162
1163 coding_system = Fget_coding_system (coding_system);
1164 CHECK_SYMBOL (prop);
1165 type = XCODING_SYSTEM_TYPE (coding_system);
1166
1167 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
1168 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
1169 {
1170 ok = 1;
1171 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
1172 {
1173 case CODESYS_PROP_ALL_OK:
1174 break;
1175 #ifdef MULE
1176 case CODESYS_PROP_ISO2022:
1177 if (type != CODESYS_ISO2022)
1178 signal_simple_error
1179 ("Property only valid in ISO2022 coding systems",
1180 prop);
1181 break;
1182
1183 case CODESYS_PROP_CCL:
1184 if (type != CODESYS_CCL)
1185 signal_simple_error
1186 ("Property only valid in CCL coding systems",
1187 prop);
1188 break;
1189 #endif /* MULE */
1190 default:
1191 abort ();
1192 }
1193 }
1194
1195 if (!ok)
1196 signal_simple_error ("Unrecognized property", prop);
1197
1198 if (EQ (prop, Qname))
1199 return XCODING_SYSTEM_NAME (coding_system);
1200 else if (EQ (prop, Qtype))
1201 return Fcoding_system_type (coding_system);
1202 else if (EQ (prop, Qdoc_string))
1203 return XCODING_SYSTEM_DOC_STRING (coding_system);
1204 else if (EQ (prop, Qmnemonic))
1205 return XCODING_SYSTEM_MNEMONIC (coding_system);
1206 else if (EQ (prop, Qeol_type))
1207 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1208 else if (EQ (prop, Qeol_lf))
1209 return XCODING_SYSTEM_EOL_LF (coding_system);
1210 else if (EQ (prop, Qeol_crlf))
1211 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1212 else if (EQ (prop, Qeol_cr))
1213 return XCODING_SYSTEM_EOL_CR (coding_system);
1214 else if (EQ (prop, Qpost_read_conversion))
1215 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1216 else if (EQ (prop, Qpre_write_conversion))
1217 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1218 #ifdef MULE
1219 else if (type == CODESYS_ISO2022)
1220 {
1221 if (EQ (prop, Qcharset_g0))
1222 return coding_system_charset (coding_system, 0);
1223 else if (EQ (prop, Qcharset_g1))
1224 return coding_system_charset (coding_system, 1);
1225 else if (EQ (prop, Qcharset_g2))
1226 return coding_system_charset (coding_system, 2);
1227 else if (EQ (prop, Qcharset_g3))
1228 return coding_system_charset (coding_system, 3);
1229
1230 #define FORCE_CHARSET(charset_num) \
1231 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1232 (coding_system, charset_num) ? Qt : Qnil)
1233
1234 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1235 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1236 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1237 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1238
1239 #define LISP_BOOLEAN(prop) \
1240 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1241
1242 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1243 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1244 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1245 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1246 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1247 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1248 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1249
1250 else if (EQ (prop, Qinput_charset_conversion))
1251 return
1252 unparse_charset_conversion_specs
1253 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1254 else if (EQ (prop, Qoutput_charset_conversion))
1255 return
1256 unparse_charset_conversion_specs
1257 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1258 else
1259 abort ();
1260 }
1261 else if (type == CODESYS_CCL)
1262 {
1263 if (EQ (prop, Qdecode))
1264 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1265 else if (EQ (prop, Qencode))
1266 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1267 else
1268 abort ();
1269 }
1270 #endif /* MULE */
1271 else
1272 abort ();
1273
1274 return Qnil; /* not reached */
1275 }
1276
1277
1278 /************************************************************************/
1279 /* Coding category functions */
1280 /************************************************************************/
1281
1282 static int
1283 decode_coding_category (Lisp_Object symbol)
1284 {
1285 int i;
1286
1287 CHECK_SYMBOL (symbol);
1288 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1289 if (EQ (coding_category_symbol[i], symbol))
1290 return i;
1291
1292 signal_simple_error ("Unrecognized coding category", symbol);
1293 return 0; /* not reached */
1294 }
1295
1296 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1297 Return a list of all recognized coding categories.
1298 */
1299 ())
1300 {
1301 int i;
1302 Lisp_Object list = Qnil;
1303
1304 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1305 list = Fcons (coding_category_symbol[i], list);
1306 return list;
1307 }
1308
1309 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1310 Change the priority order of the coding categories.
1311 LIST should be list of coding categories, in descending order of
1312 priority. Unspecified coding categories will be lower in priority
1313 than all specified ones, in the same relative order they were in
1314 previously.
1315 */
1316 (list))
1317 {
1318 int category_to_priority[CODING_CATEGORY_LAST + 1];
1319 int i, j;
1320 Lisp_Object rest;
1321
1322 /* First generate a list that maps coding categories to priorities. */
1323
1324 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1325 category_to_priority[i] = -1;
1326
1327 /* Highest priority comes from the specified list. */
1328 i = 0;
1329 EXTERNAL_LIST_LOOP (rest, list)
1330 {
1331 int cat = decode_coding_category (XCAR (rest));
1332
1333 if (category_to_priority[cat] >= 0)
1334 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1335 category_to_priority[cat] = i++;
1336 }
1337
1338 /* Now go through the existing categories by priority to retrieve
1339 the categories not yet specified and preserve their priority
1340 order. */
1341 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1342 {
1343 int cat = fcd->coding_category_by_priority[j];
1344 if (category_to_priority[cat] < 0)
1345 category_to_priority[cat] = i++;
1346 }
1347
1348 /* Now we need to construct the inverse of the mapping we just
1349 constructed. */
1350
1351 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1352 fcd->coding_category_by_priority[category_to_priority[i]] = i;
1353
1354 /* Phew! That was confusing. */
1355 return Qnil;
1356 }
1357
1358 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1359 Return a list of coding categories in descending order of priority.
1360 */
1361 ())
1362 {
1363 int i;
1364 Lisp_Object list = Qnil;
1365
1366 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1367 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]],
1368 list);
1369 return list;
1370 }
1371
1372 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1373 Change the coding system associated with a coding category.
1374 */
1375 (coding_category, coding_system))
1376 {
1377 int cat = decode_coding_category (coding_category);
1378
1379 coding_system = Fget_coding_system (coding_system);
1380 fcd->coding_category_system[cat] = coding_system;
1381 return Qnil;
1382 }
1383
1384 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1385 Return the coding system associated with a coding category.
1386 */
1387 (coding_category))
1388 {
1389 int cat = decode_coding_category (coding_category);
1390 Lisp_Object sys = fcd->coding_category_system[cat];
1391
1392 if (!NILP (sys))
1393 return XCODING_SYSTEM_NAME (sys);
1394 return Qnil;
1395 }
1396
1397
1398 /************************************************************************/
1399 /* Detecting the encoding of data */
1400 /************************************************************************/
1401
1402 struct detection_state
1403 {
1404 enum eol_type eol_type;
1405 int seen_non_ascii;
1406 int mask;
1407 #ifdef MULE
1408 struct
1409 {
1410 int mask;
1411 int in_second_byte;
1412 }
1413 big5;
1414
1415 struct
1416 {
1417 int mask;
1418 int in_second_byte;
1419 }
1420 shift_jis;
1421
1422 struct
1423 {
1424 int mask;
1425 int in_byte;
1426 }
1427 ucs4;
1428
1429 struct
1430 {
1431 int mask;
1432 int in_byte;
1433 }
1434 utf8;
1435
1436 struct
1437 {
1438 int mask;
1439 int initted;
1440 struct iso2022_decoder iso;
1441 unsigned int flags;
1442 int high_byte_count;
1443 unsigned int saw_single_shift:1;
1444 }
1445 iso2022;
1446 #endif
1447 struct
1448 {
1449 int seen_anything;
1450 int just_saw_cr;
1451 }
1452 eol;
1453 };
1454
1455 static int
1456 acceptable_control_char_p (int c)
1457 {
1458 switch (c)
1459 {
1460 /* Allow and ignore control characters that you might
1461 reasonably see in a text file */
1462 case '\r':
1463 case '\n':
1464 case '\t':
1465 case 7: /* bell */
1466 case 8: /* backspace */
1467 case 11: /* vertical tab */
1468 case 12: /* form feed */
1469 case 26: /* MS-DOS C-z junk */
1470 case 31: /* '^_' -- for info */
1471 return 1;
1472 default:
1473 return 0;
1474 }
1475 }
1476
1477 static int
1478 mask_has_at_most_one_bit_p (int mask)
1479 {
1480 /* Perhaps the only thing useful you learn from intensive Microsoft
1481 technical interviews */
1482 return (mask & (mask - 1)) == 0;
1483 }
1484
1485 static enum eol_type
1486 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1487 unsigned int n)
1488 {
1489 int c;
1490
1491 while (n--)
1492 {
1493 c = *src++;
1494 if (c == '\n')
1495 {
1496 if (st->eol.just_saw_cr)
1497 return EOL_CRLF;
1498 else if (st->eol.seen_anything)
1499 return EOL_LF;
1500 }
1501 else if (st->eol.just_saw_cr)
1502 return EOL_CR;
1503 else if (c == '\r')
1504 st->eol.just_saw_cr = 1;
1505 else
1506 st->eol.just_saw_cr = 0;
1507 st->eol.seen_anything = 1;
1508 }
1509
1510 return EOL_AUTODETECT;
1511 }
1512
1513 /* Attempt to determine the encoding and EOL type of the given text.
1514 Before calling this function for the first type, you must initialize
1515 st->eol_type as appropriate and initialize st->mask to ~0.
1516
1517 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1518 not yet known.
1519
1520 st->mask holds the determined coding category mask, or ~0 if only
1521 ASCII has been seen so far.
1522
1523 Returns:
1524
1525 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1526 is present in st->mask
1527 1 == definitive answers are here for both st->eol_type and st->mask
1528 */
1529
1530 static int
1531 detect_coding_type (struct detection_state *st, CONST Extbyte *src,
1532 unsigned int n, int just_do_eol)
1533 {
1534 int c;
1535
1536 if (st->eol_type == EOL_AUTODETECT)
1537 st->eol_type = detect_eol_type (st, src, n);
1538
1539 if (just_do_eol)
1540 return st->eol_type != EOL_AUTODETECT;
1541
1542 if (!st->seen_non_ascii)
1543 {
1544 for (; n; n--, src++)
1545 {
1546 c = *src;
1547 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1548 {
1549 st->seen_non_ascii = 1;
1550 #ifdef MULE
1551 st->shift_jis.mask = ~0;
1552 st->big5.mask = ~0;
1553 st->ucs4.mask = ~0;
1554 st->utf8.mask = ~0;
1555 st->iso2022.mask = ~0;
1556 #endif
1557 break;
1558 }
1559 }
1560 }
1561
1562 if (!n)
1563 return 0;
1564 #ifdef MULE
1565 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1566 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1567 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1568 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1569 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1570 st->big5.mask = detect_coding_big5 (st, src, n);
1571 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1572 st->utf8.mask = detect_coding_utf8 (st, src, n);
1573 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1574 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1575
1576 st->mask
1577 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1578 | st->utf8.mask | st->ucs4.mask;
1579 #endif
1580 {
1581 int retval = mask_has_at_most_one_bit_p (st->mask);
1582 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1583 return retval && st->eol_type != EOL_AUTODETECT;
1584 }
1585 }
1586
1587 static Lisp_Object
1588 coding_system_from_mask (int mask)
1589 {
1590 if (mask == ~0)
1591 {
1592 /* If the file was entirely or basically ASCII, use the
1593 default value of `buffer-file-coding-system'. */
1594 Lisp_Object retval =
1595 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system;
1596 if (!NILP (retval))
1597 {
1598 retval = Ffind_coding_system (retval);
1599 if (NILP (retval))
1600 {
1601 warn_when_safe
1602 (Qbad_variable, Qwarning,
1603 "Invalid `default-buffer-file-coding-system', set to nil");
1604 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil;
1605 }
1606 }
1607 if (NILP (retval))
1608 retval = Fget_coding_system (Qraw_text);
1609 return retval;
1610 }
1611 else
1612 {
1613 int i;
1614 int cat = -1;
1615 #ifdef MULE
1616 mask = postprocess_iso2022_mask (mask);
1617 #endif
1618 /* Look through the coding categories by priority and find
1619 the first one that is allowed. */
1620 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1621 {
1622 cat = fcd->coding_category_by_priority[i];
1623 if ((mask & (1 << cat)) &&
1624 !NILP (fcd->coding_category_system[cat]))
1625 break;
1626 }
1627 if (cat >= 0)
1628 return fcd->coding_category_system[cat];
1629 else
1630 return Fget_coding_system (Qraw_text);
1631 }
1632 }
1633
1634 /* Given a seekable read stream and potential coding system and EOL type
1635 as specified, do any autodetection that is called for. If the
1636 coding system and/or EOL type are not `autodetect', they will be left
1637 alone; but this function will never return an autodetect coding system
1638 or EOL type.
1639
1640 This function does not automatically fetch subsidiary coding systems;
1641 that should be unnecessary with the explicit eol-type argument. */
1642
1643 #define LENGTH(string_constant) (sizeof (string_constant) - 1)
1644
1645 void
1646 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1647 enum eol_type *eol_type_in_out)
1648 {
1649 struct detection_state decst;
1650
1651 if (*eol_type_in_out == EOL_AUTODETECT)
1652 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1653
1654 xzero (decst);
1655 decst.eol_type = *eol_type_in_out;
1656 decst.mask = ~0;
1657
1658 /* If autodetection is called for, do it now. */
1659 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1660 || *eol_type_in_out == EOL_AUTODETECT)
1661 {
1662 Extbyte buf[4096];
1663 Lisp_Object coding_system = Qnil;
1664 Extbyte *p;
1665 ssize_t nread = Lstream_read (stream, buf, sizeof (buf));
1666 Extbyte *scan_end;
1667
1668 /* Look for initial "-*-"; mode line prefix */
1669 for (p = buf,
1670 scan_end = buf + nread - LENGTH ("-*-coding:?-*-");
1671 p <= scan_end
1672 && *p != '\n'
1673 && *p != '\r';
1674 p++)
1675 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1676 {
1677 Extbyte *local_vars_beg = p + 3;
1678 /* Look for final "-*-"; mode line suffix */
1679 for (p = local_vars_beg,
1680 scan_end = buf + nread - LENGTH ("-*-");
1681 p <= scan_end
1682 && *p != '\n'
1683 && *p != '\r';
1684 p++)
1685 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-')
1686 {
1687 Extbyte *suffix = p;
1688 /* Look for "coding:" */
1689 for (p = local_vars_beg,
1690 scan_end = suffix - LENGTH ("coding:?");
1691 p <= scan_end;
1692 p++)
1693 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0
1694 && (p == local_vars_beg
1695 || (*(p-1) == ' ' ||
1696 *(p-1) == '\t' ||
1697 *(p-1) == ';')))
1698 {
1699 Extbyte save;
1700 int n;
1701 p += LENGTH ("coding:");
1702 while (*p == ' ' || *p == '\t') p++;
1703
1704 /* Get coding system name */
1705 save = *suffix; *suffix = '\0';
1706 /* Characters valid in a MIME charset name (rfc 1521),
1707 and in a Lisp symbol name. */
1708 n = strspn ( (char *) p,
1709 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
1710 "abcdefghijklmnopqrstuvwxyz"
1711 "0123456789"
1712 "!$%&*+-.^_{|}~");
1713 *suffix = save;
1714 if (n > 0)
1715 {
1716 save = p[n]; p[n] = '\0';
1717 coding_system =
1718 Ffind_coding_system (intern ((char *) p));
1719 p[n] = save;
1720 }
1721 break;
1722 }
1723 break;
1724 }
1725 break;
1726 }
1727
1728 if (NILP (coding_system))
1729 do
1730 {
1731 if (detect_coding_type (&decst, buf, nread,
1732 XCODING_SYSTEM_TYPE (*codesys_in_out)
1733 != CODESYS_AUTODETECT))
1734 break;
1735 nread = Lstream_read (stream, buf, sizeof (buf));
1736 if (nread == 0)
1737 break;
1738 }
1739 while (1);
1740
1741 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT
1742 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT)
1743 do
1744 {
1745 if (detect_coding_type (&decst, buf, nread, 1))
1746 break;
1747 nread = Lstream_read (stream, buf, sizeof (buf));
1748 if (!nread)
1749 break;
1750 }
1751 while (1);
1752
1753 *eol_type_in_out = decst.eol_type;
1754 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1755 {
1756 if (NILP (coding_system))
1757 *codesys_in_out = coding_system_from_mask (decst.mask);
1758 else
1759 *codesys_in_out = coding_system;
1760 }
1761 }
1762
1763 /* If we absolutely can't determine the EOL type, just assume LF. */
1764 if (*eol_type_in_out == EOL_AUTODETECT)
1765 *eol_type_in_out = EOL_LF;
1766
1767 Lstream_rewind (stream);
1768 }
1769
1770 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1771 Detect coding system of the text in the region between START and END.
1772 Returned a list of possible coding systems ordered by priority.
1773 If only ASCII characters are found, it returns 'undecided or one of
1774 its subsidiary coding systems according to a detected end-of-line
1775 type. Optional arg BUFFER defaults to the current buffer.
1776 */
1777 (start, end, buffer))
1778 {
1779 Lisp_Object val = Qnil;
1780 struct buffer *buf = decode_buffer (buffer, 0);
1781 Bufpos b, e;
1782 Lisp_Object instream, lb_instream;
1783 Lstream *istr, *lb_istr;
1784 struct detection_state decst;
1785 struct gcpro gcpro1, gcpro2;
1786
1787 get_buffer_range_char (buf, start, end, &b, &e, 0);
1788 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1789 lb_istr = XLSTREAM (lb_instream);
1790 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary));
1791 istr = XLSTREAM (instream);
1792 GCPRO2 (instream, lb_instream);
1793 xzero (decst);
1794 decst.eol_type = EOL_AUTODETECT;
1795 decst.mask = ~0;
1796 while (1)
1797 {
1798 unsigned char random_buffer[4096];
1799 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer));
1800
1801 if (!nread)
1802 break;
1803 if (detect_coding_type (&decst, random_buffer, nread, 0))
1804 break;
1805 }
1806
1807 if (decst.mask == ~0)
1808 val = subsidiary_coding_system (Fget_coding_system (Qundecided),
1809 decst.eol_type);
1810 else
1811 {
1812 int i;
1813
1814 val = Qnil;
1815 #ifdef MULE
1816 decst.mask = postprocess_iso2022_mask (decst.mask);
1817 #endif
1818 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1819 {
1820 int sys = fcd->coding_category_by_priority[i];
1821 if (decst.mask & (1 << sys))
1822 {
1823 Lisp_Object codesys = fcd->coding_category_system[sys];
1824 if (!NILP (codesys))
1825 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1826 val = Fcons (codesys, val);
1827 }
1828 }
1829 }
1830 Lstream_close (istr);
1831 UNGCPRO;
1832 Lstream_delete (istr);
1833 Lstream_delete (lb_istr);
1834 return val;
1835 }
1836
1837
1838 /************************************************************************/
1839 /* Converting to internal Mule format ("decoding") */
1840 /************************************************************************/
1841
1842 /* A decoding stream is a stream used for decoding text (i.e.
1843 converting from some external format to internal format).
1844 The decoding-stream object keeps track of the actual coding
1845 stream, the stream that is at the other end, and data that
1846 needs to be persistent across the lifetime of the stream. */
1847
1848 /* Handle the EOL stuff related to just-read-in character C.
1849 EOL_TYPE is the EOL type of the coding stream.
1850 FLAGS is the current value of FLAGS in the coding stream, and may
1851 be modified by this macro. (The macro only looks at the
1852 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded
1853 bytes are to be written. You need to also define a local goto
1854 label "label_continue_loop" that is at the end of the main
1855 character-reading loop.
1856
1857 If C is a CR character, then this macro handles it entirely and
1858 jumps to label_continue_loop. Otherwise, this macro does not add
1859 anything to DST, and continues normally. You should continue
1860 processing C normally after this macro. */
1861
1862 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \
1863 do { \
1864 if (c == '\r') \
1865 { \
1866 if (eol_type == EOL_CR) \
1867 Dynarr_add (dst, '\n'); \
1868 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \
1869 Dynarr_add (dst, c); \
1870 else \
1871 flags |= CODING_STATE_CR; \
1872 goto label_continue_loop; \
1873 } \
1874 else if (flags & CODING_STATE_CR) \
1875 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \
1876 if (c != '\n') \
1877 Dynarr_add (dst, '\r'); \
1878 flags &= ~CODING_STATE_CR; \
1879 } \
1880 } while (0)
1881
1882 /* C should be a binary character in the range 0 - 255; convert
1883 to internal format and add to Dynarr DST. */
1884
1885 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1886 do { \
1887 if (BYTE_ASCII_P (c)) \
1888 Dynarr_add (dst, c); \
1889 else if (BYTE_C1_P (c)) \
1890 { \
1891 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1892 Dynarr_add (dst, c + 0x20); \
1893 } \
1894 else \
1895 { \
1896 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \
1897 Dynarr_add (dst, c); \
1898 } \
1899 } while (0)
1900
1901 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1902 do { \
1903 if (ch) \
1904 { \
1905 DECODE_ADD_BINARY_CHAR (ch, dst); \
1906 ch = 0; \
1907 } \
1908 } while (0)
1909
1910 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \
1911 do { \
1912 if (flags & CODING_STATE_END) \
1913 { \
1914 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1915 if (flags & CODING_STATE_CR) \
1916 Dynarr_add (dst, '\r'); \
1917 } \
1918 } while (0)
1919
1920 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1921
1922 struct decoding_stream
1923 {
1924 /* Coding system that governs the conversion. */
1925 Lisp_Coding_System *codesys;
1926
1927 /* Stream that we read the encoded data from or
1928 write the decoded data to. */
1929 Lstream *other_end;
1930
1931 /* If we are reading, then we can return only a fixed amount of
1932 data, so if the conversion resulted in too much data, we store it
1933 here for retrieval the next time around. */
1934 unsigned_char_dynarr *runoff;
1935
1936 /* FLAGS holds flags indicating the current state of the decoding.
1937 Some of these flags are dependent on the coding system. */
1938 unsigned int flags;
1939
1940 /* CH holds a partially built-up character. Since we only deal
1941 with one- and two-byte characters at the moment, we only use
1942 this to store the first byte of a two-byte character. */
1943 unsigned int ch;
1944
1945 /* EOL_TYPE specifies the type of end-of-line conversion that
1946 currently applies. We need to keep this separate from the
1947 EOL type stored in CODESYS because the latter might indicate
1948 automatic EOL-type detection while the former will always
1949 indicate a particular EOL type. */
1950 enum eol_type eol_type;
1951 #ifdef MULE
1952 /* Additional ISO2022 information. We define the structure above
1953 because it's also needed by the detection routines. */
1954 struct iso2022_decoder iso2022;
1955
1956 /* Additional information (the state of the running CCL program)
1957 used by the CCL decoder. */
1958 struct ccl_program ccl;
1959
1960 /* counter for UTF-8 or UCS-4 */
1961 unsigned char counter;
1962 #endif
1963 struct detection_state decst;
1964 };
1965
1966 static ssize_t decoding_reader (Lstream *stream,
1967 unsigned char *data, size_t size);
1968 static ssize_t decoding_writer (Lstream *stream,
1969 CONST unsigned char *data, size_t size);
1970 static int decoding_rewinder (Lstream *stream);
1971 static int decoding_seekable_p (Lstream *stream);
1972 static int decoding_flusher (Lstream *stream);
1973 static int decoding_closer (Lstream *stream);
1974
1975 static Lisp_Object decoding_marker (Lisp_Object stream);
1976
1977 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1978 sizeof (struct decoding_stream));
1979
1980 static Lisp_Object
1981 decoding_marker (Lisp_Object stream)
1982 {
1983 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1984 Lisp_Object str_obj;
1985
1986 /* We do not need to mark the coding systems or charsets stored
1987 within the stream because they are stored in a global list
1988 and automatically marked. */
1989
1990 XSETLSTREAM (str_obj, str);
1991 mark_object (str_obj);
1992 if (str->imp->marker)
1993 return (str->imp->marker) (str_obj);
1994 else
1995 return Qnil;
1996 }
1997
1998 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1999 so we read data from the other end, decode it, and store it into DATA. */
2000
2001 static ssize_t
2002 decoding_reader (Lstream *stream, unsigned char *data, size_t size)
2003 {
2004 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2005 unsigned char *orig_data = data;
2006 ssize_t read_size;
2007 int error_occurred = 0;
2008
2009 /* We need to interface to mule_decode(), which expects to take some
2010 amount of data and store the result into a Dynarr. We have
2011 mule_decode() store into str->runoff, and take data from there
2012 as necessary. */
2013
2014 /* We loop until we have enough data, reading chunks from the other
2015 end and decoding it. */
2016 while (1)
2017 {
2018 /* Take data from the runoff if we can. Make sure to take at
2019 most SIZE bytes, and delete the data from the runoff. */
2020 if (Dynarr_length (str->runoff) > 0)
2021 {
2022 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff));
2023 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2024 Dynarr_delete_many (str->runoff, 0, chunk);
2025 data += chunk;
2026 size -= chunk;
2027 }
2028
2029 if (size == 0)
2030 break; /* No more room for data */
2031
2032 if (str->flags & CODING_STATE_END)
2033 /* This means that on the previous iteration, we hit the EOF on
2034 the other end. We loop once more so that mule_decode() can
2035 output any final stuff it may be holding, or any "go back
2036 to a sane state" escape sequences. (This latter makes sense
2037 during encoding.) */
2038 break;
2039
2040 /* Exhausted the runoff, so get some more. DATA has at least
2041 SIZE bytes left of storage in it, so it's OK to read directly
2042 into it. (We'll be overwriting above, after we've decoded it
2043 into the runoff.) */
2044 read_size = Lstream_read (str->other_end, data, size);
2045 if (read_size < 0)
2046 {
2047 error_occurred = 1;
2048 break;
2049 }
2050 if (read_size == 0)
2051 /* There might be some more end data produced in the translation.
2052 See the comment above. */
2053 str->flags |= CODING_STATE_END;
2054 mule_decode (stream, data, str->runoff, read_size);
2055 }
2056
2057 if (data - orig_data == 0)
2058 return error_occurred ? -1 : 0;
2059 else
2060 return data - orig_data;
2061 }
2062
2063 static ssize_t
2064 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2065 {
2066 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2067 ssize_t retval;
2068
2069 /* Decode all our data into the runoff, and then attempt to write
2070 it all out to the other end. Remove whatever chunk we succeeded
2071 in writing. */
2072 mule_decode (stream, data, str->runoff, size);
2073 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2074 Dynarr_length (str->runoff));
2075 if (retval > 0)
2076 Dynarr_delete_many (str->runoff, 0, retval);
2077 /* Do NOT return retval. The return value indicates how much
2078 of the incoming data was written, not how many bytes were
2079 written. */
2080 return size;
2081 }
2082
2083 static void
2084 reset_decoding_stream (struct decoding_stream *str)
2085 {
2086 #ifdef MULE
2087 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2088 {
2089 Lisp_Object coding_system;
2090 XSETCODING_SYSTEM (coding_system, str->codesys);
2091 reset_iso2022 (coding_system, &str->iso2022);
2092 }
2093 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2094 {
2095 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys));
2096 }
2097 str->counter = 0;
2098 #endif /* MULE */
2099 str->flags = str->ch = 0;
2100 }
2101
2102 static int
2103 decoding_rewinder (Lstream *stream)
2104 {
2105 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2106 reset_decoding_stream (str);
2107 Dynarr_reset (str->runoff);
2108 return Lstream_rewind (str->other_end);
2109 }
2110
2111 static int
2112 decoding_seekable_p (Lstream *stream)
2113 {
2114 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2115 return Lstream_seekable_p (str->other_end);
2116 }
2117
2118 static int
2119 decoding_flusher (Lstream *stream)
2120 {
2121 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2122 return Lstream_flush (str->other_end);
2123 }
2124
2125 static int
2126 decoding_closer (Lstream *stream)
2127 {
2128 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2129 if (stream->flags & LSTREAM_FL_WRITE)
2130 {
2131 str->flags |= CODING_STATE_END;
2132 decoding_writer (stream, 0, 0);
2133 }
2134 Dynarr_free (str->runoff);
2135 #ifdef MULE
2136 #ifdef ENABLE_COMPOSITE_CHARS
2137 if (str->iso2022.composite_chars)
2138 Dynarr_free (str->iso2022.composite_chars);
2139 #endif
2140 #endif
2141 return Lstream_close (str->other_end);
2142 }
2143
2144 Lisp_Object
2145 decoding_stream_coding_system (Lstream *stream)
2146 {
2147 Lisp_Object coding_system;
2148 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
2149
2150 XSETCODING_SYSTEM (coding_system, str->codesys);
2151 return subsidiary_coding_system (coding_system, str->eol_type);
2152 }
2153
2154 void
2155 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2156 {
2157 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2158 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2159 str->codesys = cs;
2160 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
2161 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
2162 reset_decoding_stream (str);
2163 }
2164
2165 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
2166 stream for writing, no automatic code detection will be performed.
2167 The reason for this is that automatic code detection requires a
2168 seekable input. Things will also fail if you open a decoding
2169 stream for reading using a non-fully-specified coding system and
2170 a non-seekable input stream. */
2171
2172 static Lisp_Object
2173 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2174 CONST char *mode)
2175 {
2176 Lstream *lstr = Lstream_new (lstream_decoding, mode);
2177 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
2178 Lisp_Object obj;
2179
2180 xzero (*str);
2181 str->other_end = stream;
2182 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char);
2183 str->eol_type = EOL_AUTODETECT;
2184 if (!strcmp (mode, "r")
2185 && Lstream_seekable_p (stream))
2186 /* We can determine the coding system now. */
2187 determine_real_coding_system (stream, &codesys, &str->eol_type);
2188 set_decoding_stream_coding_system (lstr, codesys);
2189 str->decst.eol_type = str->eol_type;
2190 str->decst.mask = ~0;
2191 XSETLSTREAM (obj, lstr);
2192 return obj;
2193 }
2194
2195 Lisp_Object
2196 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
2197 {
2198 return make_decoding_stream_1 (stream, codesys, "r");
2199 }
2200
2201 Lisp_Object
2202 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
2203 {
2204 return make_decoding_stream_1 (stream, codesys, "w");
2205 }
2206
2207 /* Note: the decode_coding_* functions all take the same
2208 arguments as mule_decode(), which is to say some SRC data of
2209 size N, which is to be stored into dynamic array DST.
2210 DECODING is the stream within which the decoding is
2211 taking place, but no data is actually read from or
2212 written to that stream; that is handled in decoding_reader()
2213 or decoding_writer(). This allows the same functions to
2214 be used for both reading and writing. */
2215
2216 static void
2217 mule_decode (Lstream *decoding, CONST unsigned char *src,
2218 unsigned_char_dynarr *dst, unsigned int n)
2219 {
2220 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2221
2222 /* If necessary, do encoding-detection now. We do this when
2223 we're a writing stream or a non-seekable reading stream,
2224 meaning that we can't just process the whole input,
2225 rewind, and start over. */
2226
2227 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
2228 str->eol_type == EOL_AUTODETECT)
2229 {
2230 Lisp_Object codesys;
2231
2232 XSETCODING_SYSTEM (codesys, str->codesys);
2233 detect_coding_type (&str->decst, src, n,
2234 CODING_SYSTEM_TYPE (str->codesys) !=
2235 CODESYS_AUTODETECT);
2236 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
2237 str->decst.mask != ~0)
2238 /* #### This is cheesy. What we really ought to do is
2239 buffer up a certain amount of data so as to get a
2240 less random result. */
2241 codesys = coding_system_from_mask (str->decst.mask);
2242 str->eol_type = str->decst.eol_type;
2243 if (XCODING_SYSTEM (codesys) != str->codesys)
2244 {
2245 /* Preserve the CODING_STATE_END flag in case it was set.
2246 If we erase it, bad things might happen. */
2247 int was_end = str->flags & CODING_STATE_END;
2248 set_decoding_stream_coding_system (decoding, codesys);
2249 if (was_end)
2250 str->flags |= CODING_STATE_END;
2251 }
2252 }
2253
2254 switch (CODING_SYSTEM_TYPE (str->codesys))
2255 {
2256 #ifdef DEBUG_XEMACS
2257 case CODESYS_INTERNAL:
2258 Dynarr_add_many (dst, src, n);
2259 break;
2260 #endif
2261 case CODESYS_AUTODETECT:
2262 /* If we got this far and still haven't decided on the coding
2263 system, then do no conversion. */
2264 case CODESYS_NO_CONVERSION:
2265 decode_coding_no_conversion (decoding, src, dst, n);
2266 break;
2267 #ifdef MULE
2268 case CODESYS_SHIFT_JIS:
2269 decode_coding_sjis (decoding, src, dst, n);
2270 break;
2271 case CODESYS_BIG5:
2272 decode_coding_big5 (decoding, src, dst, n);
2273 break;
2274 case CODESYS_UCS4:
2275 decode_coding_ucs4 (decoding, src, dst, n);
2276 break;
2277 case CODESYS_UTF8:
2278 decode_coding_utf8 (decoding, src, dst, n);
2279 break;
2280 case CODESYS_CCL:
2281 str->ccl.last_block = str->flags & CODING_STATE_END;
2282 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING);
2283 break;
2284 case CODESYS_ISO2022:
2285 decode_coding_iso2022 (decoding, src, dst, n);
2286 break;
2287 #endif /* MULE */
2288 default:
2289 abort ();
2290 }
2291 }
2292
2293 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
2294 Decode the text between START and END which is encoded in CODING-SYSTEM.
2295 This is useful if you've read in encoded text from a file without decoding
2296 it (e.g. you read in a JIS-formatted file but used the `binary' or
2297 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B").
2298 Return length of decoded text.
2299 BUFFER defaults to the current buffer if unspecified.
2300 */
2301 (start, end, coding_system, buffer))
2302 {
2303 Bufpos b, e;
2304 struct buffer *buf = decode_buffer (buffer, 0);
2305 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2306 Lstream *istr, *ostr;
2307 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2308
2309 get_buffer_range_char (buf, start, end, &b, &e, 0);
2310
2311 barf_if_buffer_read_only (buf, b, e);
2312
2313 coding_system = Fget_coding_system (coding_system);
2314 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2315 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2316 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2317 coding_system);
2318 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2319 Fget_coding_system (Qbinary));
2320 istr = XLSTREAM (instream);
2321 ostr = XLSTREAM (outstream);
2322 GCPRO4 (instream, lb_outstream, de_outstream, outstream);
2323
2324 /* The chain of streams looks like this:
2325
2326 [BUFFER] <----- send through
2327 ------> [ENCODE AS BINARY]
2328 ------> [DECODE AS SPECIFIED]
2329 ------> [BUFFER]
2330 */
2331
2332 while (1)
2333 {
2334 char tempbuf[1024]; /* some random amount */
2335 Bufpos newpos, even_newer_pos;
2336 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2337 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2338
2339 if (!size_in_bytes)
2340 break;
2341 newpos = lisp_buffer_stream_startpos (istr);
2342 Lstream_write (ostr, tempbuf, size_in_bytes);
2343 even_newer_pos = lisp_buffer_stream_startpos (istr);
2344 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2345 even_newer_pos, 0);
2346 }
2347 Lstream_close (istr);
2348 Lstream_close (ostr);
2349 UNGCPRO;
2350 Lstream_delete (istr);
2351 Lstream_delete (ostr);
2352 Lstream_delete (XLSTREAM (de_outstream));
2353 Lstream_delete (XLSTREAM (lb_outstream));
2354 return Qnil;
2355 }
2356
2357
2358 /************************************************************************/
2359 /* Converting to an external encoding ("encoding") */
2360 /************************************************************************/
2361
2362 /* An encoding stream is an output stream. When you create the
2363 stream, you specify the coding system that governs the encoding
2364 and another stream that the resulting encoded data is to be
2365 sent to, and then start sending data to it. */
2366
2367 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2368
2369 struct encoding_stream
2370 {
2371 /* Coding system that governs the conversion. */
2372 Lisp_Coding_System *codesys;
2373
2374 /* Stream that we read the encoded data from or
2375 write the decoded data to. */
2376 Lstream *other_end;
2377
2378 /* If we are reading, then we can return only a fixed amount of
2379 data, so if the conversion resulted in too much data, we store it
2380 here for retrieval the next time around. */
2381 unsigned_char_dynarr *runoff;
2382
2383 /* FLAGS holds flags indicating the current state of the encoding.
2384 Some of these flags are dependent on the coding system. */
2385 unsigned int flags;
2386
2387 /* CH holds a partially built-up character. Since we only deal
2388 with one- and two-byte characters at the moment, we only use
2389 this to store the first byte of a two-byte character. */
2390 unsigned int ch;
2391 #ifdef MULE
2392 /* Additional information used by the ISO2022 encoder. */
2393 struct
2394 {
2395 /* CHARSET holds the character sets currently assigned to the G0
2396 through G3 registers. It is initialized from the array
2397 INITIAL_CHARSET in CODESYS. */
2398 Lisp_Object charset[4];
2399
2400 /* Which registers are currently invoked into the left (GL) and
2401 right (GR) halves of the 8-bit encoding space? */
2402 int register_left, register_right;
2403
2404 /* Whether we need to explicitly designate the charset in the
2405 G? register before using it. It is initialized from the
2406 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2407 unsigned char force_charset_on_output[4];
2408
2409 /* Other state variables that need to be preserved across
2410 invocations. */
2411 Lisp_Object current_charset;
2412 int current_half;
2413 int current_char_boundary;
2414 } iso2022;
2415
2416 /* Additional information (the state of the running CCL program)
2417 used by the CCL encoder. */
2418 struct ccl_program ccl;
2419 #endif /* MULE */
2420 };
2421
2422 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size);
2423 static ssize_t encoding_writer (Lstream *stream, CONST unsigned char *data,
2424 size_t size);
2425 static int encoding_rewinder (Lstream *stream);
2426 static int encoding_seekable_p (Lstream *stream);
2427 static int encoding_flusher (Lstream *stream);
2428 static int encoding_closer (Lstream *stream);
2429
2430 static Lisp_Object encoding_marker (Lisp_Object stream);
2431
2432 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2433 sizeof (struct encoding_stream));
2434
2435 static Lisp_Object
2436 encoding_marker (Lisp_Object stream)
2437 {
2438 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2439 Lisp_Object str_obj;
2440
2441 /* We do not need to mark the coding systems or charsets stored
2442 within the stream because they are stored in a global list
2443 and automatically marked. */
2444
2445 XSETLSTREAM (str_obj, str);
2446 mark_object (str_obj);
2447 if (str->imp->marker)
2448 return (str->imp->marker) (str_obj);
2449 else
2450 return Qnil;
2451 }
2452
2453 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2454 so we read data from the other end, encode it, and store it into DATA. */
2455
2456 static ssize_t
2457 encoding_reader (Lstream *stream, unsigned char *data, size_t size)
2458 {
2459 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2460 unsigned char *orig_data = data;
2461 ssize_t read_size;
2462 int error_occurred = 0;
2463
2464 /* We need to interface to mule_encode(), which expects to take some
2465 amount of data and store the result into a Dynarr. We have
2466 mule_encode() store into str->runoff, and take data from there
2467 as necessary. */
2468
2469 /* We loop until we have enough data, reading chunks from the other
2470 end and encoding it. */
2471 while (1)
2472 {
2473 /* Take data from the runoff if we can. Make sure to take at
2474 most SIZE bytes, and delete the data from the runoff. */
2475 if (Dynarr_length (str->runoff) > 0)
2476 {
2477 int chunk = min ((int) size, Dynarr_length (str->runoff));
2478 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2479 Dynarr_delete_many (str->runoff, 0, chunk);
2480 data += chunk;
2481 size -= chunk;
2482 }
2483
2484 if (size == 0)
2485 break; /* No more room for data */
2486
2487 if (str->flags & CODING_STATE_END)
2488 /* This means that on the previous iteration, we hit the EOF on
2489 the other end. We loop once more so that mule_encode() can
2490 output any final stuff it may be holding, or any "go back
2491 to a sane state" escape sequences. (This latter makes sense
2492 during encoding.) */
2493 break;
2494
2495 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2496 left of storage in it, so it's OK to read directly into it.
2497 (We'll be overwriting above, after we've encoded it into the
2498 runoff.) */
2499 read_size = Lstream_read (str->other_end, data, size);
2500 if (read_size < 0)
2501 {
2502 error_occurred = 1;
2503 break;
2504 }
2505 if (read_size == 0)
2506 /* There might be some more end data produced in the translation.
2507 See the comment above. */
2508 str->flags |= CODING_STATE_END;
2509 mule_encode (stream, data, str->runoff, read_size);
2510 }
2511
2512 if (data == orig_data)
2513 return error_occurred ? -1 : 0;
2514 else
2515 return data - orig_data;
2516 }
2517
2518 static ssize_t
2519 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size)
2520 {
2521 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2522 ssize_t retval;
2523
2524 /* Encode all our data into the runoff, and then attempt to write
2525 it all out to the other end. Remove whatever chunk we succeeded
2526 in writing. */
2527 mule_encode (stream, data, str->runoff, size);
2528 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2529 Dynarr_length (str->runoff));
2530 if (retval > 0)
2531 Dynarr_delete_many (str->runoff, 0, retval);
2532 /* Do NOT return retval. The return value indicates how much
2533 of the incoming data was written, not how many bytes were
2534 written. */
2535 return size;
2536 }
2537
2538 static void
2539 reset_encoding_stream (struct encoding_stream *str)
2540 {
2541 #ifdef MULE
2542 switch (CODING_SYSTEM_TYPE (str->codesys))
2543 {
2544 case CODESYS_ISO2022:
2545 {
2546 int i;
2547
2548 for (i = 0; i < 4; i++)
2549 {
2550 str->iso2022.charset[i] =
2551 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2552 str->iso2022.force_charset_on_output[i] =
2553 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2554 }
2555 str->iso2022.register_left = 0;
2556 str->iso2022.register_right = 1;
2557 str->iso2022.current_charset = Qnil;
2558 str->iso2022.current_half = 0;
2559 str->iso2022.current_char_boundary = 1;
2560 break;
2561 }
2562 case CODESYS_CCL:
2563 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys));
2564 break;
2565 default:
2566 break;
2567 }
2568 #endif /* MULE */
2569
2570 str->flags = str->ch = 0;
2571 }
2572
2573 static int
2574 encoding_rewinder (Lstream *stream)
2575 {
2576 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2577 reset_encoding_stream (str);
2578 Dynarr_reset (str->runoff);
2579 return Lstream_rewind (str->other_end);
2580 }
2581
2582 static int
2583 encoding_seekable_p (Lstream *stream)
2584 {
2585 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2586 return Lstream_seekable_p (str->other_end);
2587 }
2588
2589 static int
2590 encoding_flusher (Lstream *stream)
2591 {
2592 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2593 return Lstream_flush (str->other_end);
2594 }
2595
2596 static int
2597 encoding_closer (Lstream *stream)
2598 {
2599 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2600 if (stream->flags & LSTREAM_FL_WRITE)
2601 {
2602 str->flags |= CODING_STATE_END;
2603 encoding_writer (stream, 0, 0);
2604 }
2605 Dynarr_free (str->runoff);
2606 return Lstream_close (str->other_end);
2607 }
2608
2609 Lisp_Object
2610 encoding_stream_coding_system (Lstream *stream)
2611 {
2612 Lisp_Object coding_system;
2613 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2614
2615 XSETCODING_SYSTEM (coding_system, str->codesys);
2616 return coding_system;
2617 }
2618
2619 void
2620 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2621 {
2622 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2623 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2624 str->codesys = cs;
2625 reset_encoding_stream (str);
2626 }
2627
2628 static Lisp_Object
2629 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2630 CONST char *mode)
2631 {
2632 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2633 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2634 Lisp_Object obj;
2635
2636 xzero (*str);
2637 str->runoff = Dynarr_new (unsigned_char);
2638 str->other_end = stream;
2639 set_encoding_stream_coding_system (lstr, codesys);
2640 XSETLSTREAM (obj, lstr);
2641 return obj;
2642 }
2643
2644 Lisp_Object
2645 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2646 {
2647 return make_encoding_stream_1 (stream, codesys, "r");
2648 }
2649
2650 Lisp_Object
2651 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2652 {
2653 return make_encoding_stream_1 (stream, codesys, "w");
2654 }
2655
2656 /* Convert N bytes of internally-formatted data stored in SRC to an
2657 external format, according to the encoding stream ENCODING.
2658 Store the encoded data into DST. */
2659
2660 static void
2661 mule_encode (Lstream *encoding, CONST unsigned char *src,
2662 unsigned_char_dynarr *dst, unsigned int n)
2663 {
2664 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2665
2666 switch (CODING_SYSTEM_TYPE (str->codesys))
2667 {
2668 #ifdef DEBUG_XEMACS
2669 case CODESYS_INTERNAL:
2670 Dynarr_add_many (dst, src, n);
2671 break;
2672 #endif
2673 case CODESYS_AUTODETECT:
2674 /* If we got this far and still haven't decided on the coding
2675 system, then do no conversion. */
2676 case CODESYS_NO_CONVERSION:
2677 encode_coding_no_conversion (encoding, src, dst, n);
2678 break;
2679 #ifdef MULE
2680 case CODESYS_SHIFT_JIS:
2681 encode_coding_sjis (encoding, src, dst, n);
2682 break;
2683 case CODESYS_BIG5:
2684 encode_coding_big5 (encoding, src, dst, n);
2685 break;
2686 case CODESYS_UCS4:
2687 encode_coding_ucs4 (encoding, src, dst, n);
2688 break;
2689 case CODESYS_UTF8:
2690 encode_coding_utf8 (encoding, src, dst, n);
2691 break;
2692 case CODESYS_CCL:
2693 str->ccl.last_block = str->flags & CODING_STATE_END;
2694 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING);
2695 break;
2696 case CODESYS_ISO2022:
2697 encode_coding_iso2022 (encoding, src, dst, n);
2698 break;
2699 #endif /* MULE */
2700 default:
2701 abort ();
2702 }
2703 }
2704
2705 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2706 Encode the text between START and END using CODING-SYSTEM.
2707 This will, for example, convert Japanese characters into stuff such as
2708 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded
2709 text. BUFFER defaults to the current buffer if unspecified.
2710 */
2711 (start, end, coding_system, buffer))
2712 {
2713 Bufpos b, e;
2714 struct buffer *buf = decode_buffer (buffer, 0);
2715 Lisp_Object instream, lb_outstream, de_outstream, outstream;
2716 Lstream *istr, *ostr;
2717 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4;
2718
2719 get_buffer_range_char (buf, start, end, &b, &e, 0);
2720
2721 barf_if_buffer_read_only (buf, b, e);
2722
2723 coding_system = Fget_coding_system (coding_system);
2724 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2725 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0);
2726 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream),
2727 Fget_coding_system (Qbinary));
2728 outstream = make_encoding_output_stream (XLSTREAM (de_outstream),
2729 coding_system);
2730 istr = XLSTREAM (instream);
2731 ostr = XLSTREAM (outstream);
2732 GCPRO4 (instream, outstream, de_outstream, lb_outstream);
2733 /* The chain of streams looks like this:
2734
2735 [BUFFER] <----- send through
2736 ------> [ENCODE AS SPECIFIED]
2737 ------> [DECODE AS BINARY]
2738 ------> [BUFFER]
2739 */
2740 while (1)
2741 {
2742 char tempbuf[1024]; /* some random amount */
2743 Bufpos newpos, even_newer_pos;
2744 Bufpos oldpos = lisp_buffer_stream_startpos (istr);
2745 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
2746
2747 if (!size_in_bytes)
2748 break;
2749 newpos = lisp_buffer_stream_startpos (istr);
2750 Lstream_write (ostr, tempbuf, size_in_bytes);
2751 even_newer_pos = lisp_buffer_stream_startpos (istr);
2752 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2753 even_newer_pos, 0);
2754 }
2755
2756 {
2757 Charcount retlen =
2758 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2759 Lstream_close (istr);
2760 Lstream_close (ostr);
2761 UNGCPRO;
2762 Lstream_delete (istr);
2763 Lstream_delete (ostr);
2764 Lstream_delete (XLSTREAM (de_outstream));
2765 Lstream_delete (XLSTREAM (lb_outstream));
2766 return make_int (retlen);
2767 }
2768 }
2769
2770 #ifdef MULE
2771
2772 /************************************************************************/
2773 /* Shift-JIS methods */
2774 /************************************************************************/
2775
2776 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2777 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2778 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2779 encoded by "position-code + 0x80". A character of JISX0208
2780 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2781 position-codes are divided and shifted so that it fit in the range
2782 below.
2783
2784 --- CODE RANGE of Shift-JIS ---
2785 (character set) (range)
2786 ASCII 0x00 .. 0x7F
2787 JISX0201-Kana 0xA0 .. 0xDF
2788 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2789 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2790 -------------------------------
2791
2792 */
2793
2794 /* Is this the first byte of a Shift-JIS two-byte char? */
2795
2796 #define BYTE_SJIS_TWO_BYTE_1_P(c) \
2797 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2798
2799 /* Is this the second byte of a Shift-JIS two-byte char? */
2800
2801 #define BYTE_SJIS_TWO_BYTE_2_P(c) \
2802 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2803
2804 #define BYTE_SJIS_KATAKANA_P(c) \
2805 ((c) >= 0xA1 && (c) <= 0xDF)
2806
2807 static int
2808 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src,
2809 unsigned int n)
2810 {
2811 int c;
2812
2813 while (n--)
2814 {
2815 c = *src++;
2816 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2817 return 0;
2818 if (st->shift_jis.in_second_byte)
2819 {
2820 st->shift_jis.in_second_byte = 0;
2821 if (c < 0x40)
2822 return 0;
2823 }
2824 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2825 st->shift_jis.in_second_byte = 1;
2826 }
2827 return CODING_CATEGORY_SHIFT_JIS_MASK;
2828 }
2829
2830 /* Convert Shift-JIS data to internal format. */
2831
2832 static void
2833 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2834 unsigned_char_dynarr *dst, unsigned int n)
2835 {
2836 unsigned char c;
2837 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2838 unsigned int flags = str->flags;
2839 unsigned int ch = str->ch;
2840 eol_type_t eol_type = str->eol_type;
2841
2842 while (n--)
2843 {
2844 c = *src++;
2845
2846 if (ch)
2847 {
2848 /* Previous character was first byte of Shift-JIS Kanji char. */
2849 if (BYTE_SJIS_TWO_BYTE_2_P (c))
2850 {
2851 unsigned char e1, e2;
2852
2853 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2854 DECODE_SJIS (ch, c, e1, e2);
2855 Dynarr_add (dst, e1);
2856 Dynarr_add (dst, e2);
2857 }
2858 else
2859 {
2860 DECODE_ADD_BINARY_CHAR (ch, dst);
2861 DECODE_ADD_BINARY_CHAR (c, dst);
2862 }
2863 ch = 0;
2864 }
2865 else
2866 {
2867 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
2868 if (BYTE_SJIS_TWO_BYTE_1_P (c))
2869 ch = c;
2870 else if (BYTE_SJIS_KATAKANA_P (c))
2871 {
2872 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
2873 Dynarr_add (dst, c);
2874 }
2875 else
2876 DECODE_ADD_BINARY_CHAR (c, dst);
2877 }
2878 label_continue_loop:;
2879 }
2880
2881 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2882
2883 str->flags = flags;
2884 str->ch = ch;
2885 }
2886
2887 /* Convert internally-formatted data to Shift-JIS. */
2888
2889 static void
2890 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2891 unsigned_char_dynarr *dst, unsigned int n)
2892 {
2893 unsigned char c;
2894 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2895 unsigned int flags = str->flags;
2896 unsigned int ch = str->ch;
2897 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2898
2899 while (n--)
2900 {
2901 c = *src++;
2902 if (c == '\n')
2903 {
2904 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
2905 Dynarr_add (dst, '\r');
2906 if (eol_type != EOL_CR)
2907 Dynarr_add (dst, '\n');
2908 ch = 0;
2909 }
2910 else if (BYTE_ASCII_P (c))
2911 {
2912 Dynarr_add (dst, c);
2913 ch = 0;
2914 }
2915 else if (BUFBYTE_LEADING_BYTE_P (c))
2916 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
2917 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2918 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2919 else if (ch)
2920 {
2921 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
2922 {
2923 Dynarr_add (dst, c);
2924 ch = 0;
2925 }
2926 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2927 ch == LEADING_BYTE_JAPANESE_JISX0208)
2928 ch = c;
2929 else
2930 {
2931 unsigned char j1, j2;
2932 ENCODE_SJIS (ch, c, j1, j2);
2933 Dynarr_add (dst, j1);
2934 Dynarr_add (dst, j2);
2935 ch = 0;
2936 }
2937 }
2938 }
2939
2940 str->flags = flags;
2941 str->ch = ch;
2942 }
2943
2944 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2945 Decode a JISX0208 character of Shift-JIS coding-system.
2946 CODE is the character code in Shift-JIS as a cons of type bytes.
2947 Return the corresponding character.
2948 */
2949 (code))
2950 {
2951 unsigned char c1, c2, s1, s2;
2952
2953 CHECK_CONS (code);
2954 CHECK_INT (XCAR (code));
2955 CHECK_INT (XCDR (code));
2956 s1 = XINT (XCAR (code));
2957 s2 = XINT (XCDR (code));
2958 if (BYTE_SJIS_TWO_BYTE_1_P (s1) &&
2959 BYTE_SJIS_TWO_BYTE_2_P (s2))
2960 {
2961 DECODE_SJIS (s1, s2, c1, c2);
2962 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2963 c1 & 0x7F, c2 & 0x7F));
2964 }
2965 else
2966 return Qnil;
2967 }
2968
2969 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2970 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2971 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2972 */
2973 (ch))
2974 {
2975 Lisp_Object charset;
2976 int c1, c2, s1, s2;
2977
2978 CHECK_CHAR_COERCE_INT (ch);
2979 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2980 if (EQ (charset, Vcharset_japanese_jisx0208))
2981 {
2982 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2);
2983 return Fcons (make_int (s1), make_int (s2));
2984 }
2985 else
2986 return Qnil;
2987 }
2988
2989
2990 /************************************************************************/
2991 /* Big5 methods */
2992 /************************************************************************/
2993
2994 /* BIG5 is a coding system encoding two character sets: ASCII and
2995 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2996 character set and is encoded in two-byte.
2997
2998 --- CODE RANGE of BIG5 ---
2999 (character set) (range)
3000 ASCII 0x00 .. 0x7F
3001 Big5 (1st byte) 0xA1 .. 0xFE
3002 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3003 --------------------------
3004
3005 Since the number of characters in Big5 is larger than maximum
3006 characters in Emacs' charset (96x96), it can't be handled as one
3007 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
3008 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
3009 contains frequently used characters and the latter contains less
3010 frequently used characters. */
3011
3012 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
3013 ((c) >= 0xA1 && (c) <= 0xFE)
3014
3015 /* Is this the second byte of a Shift-JIS two-byte char? */
3016
3017 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
3018 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
3019
3020 /* Number of Big5 characters which have the same code in 1st byte. */
3021
3022 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
3023
3024 /* Code conversion macros. These are macros because they are used in
3025 inner loops during code conversion.
3026
3027 Note that temporary variables in macros introduce the classic
3028 dynamic-scoping problems with variable names. We use capital-
3029 lettered variables in the assumption that XEmacs does not use
3030 capital letters in variables except in a very formalized way
3031 (e.g. Qstring). */
3032
3033 /* Convert Big5 code (b1, b2) into its internal string representation
3034 (lb, c1, c2). */
3035
3036 /* There is a much simpler way to split the Big5 charset into two.
3037 For the moment I'm going to leave the algorithm as-is because it
3038 claims to separate out the most-used characters into a single
3039 charset, which perhaps will lead to optimizations in various
3040 places.
3041
3042 The way the algorithm works is something like this:
3043
3044 Big5 can be viewed as a 94x157 charset, where the row is
3045 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
3046 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
3047 the split between low and high column numbers is apparently
3048 meaningless; ascending rows produce less and less frequent chars.
3049 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
3050 the first charset, and the upper half (0xC9 .. 0xFE) to the
3051 second. To do the conversion, we convert the character into
3052 a single number where 0 .. 156 is the first row, 157 .. 313
3053 is the second, etc. That way, the characters are ordered by
3054 decreasing frequency. Then we just chop the space in two
3055 and coerce the result into a 94x94 space.
3056 */
3057
3058 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
3059 { \
3060 int B1 = b1, B2 = b2; \
3061 unsigned int I \
3062 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
3063 \
3064 if (B1 < 0xC9) \
3065 { \
3066 lb = LEADING_BYTE_CHINESE_BIG5_1; \
3067 } \
3068 else \
3069 { \
3070 lb = LEADING_BYTE_CHINESE_BIG5_2; \
3071 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
3072 } \
3073 c1 = I / (0xFF - 0xA1) + 0xA1; \
3074 c2 = I % (0xFF - 0xA1) + 0xA1; \
3075 } while (0)
3076
3077 /* Convert the internal string representation of a Big5 character
3078 (lb, c1, c2) into Big5 code (b1, b2). */
3079
3080 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
3081 { \
3082 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
3083 \
3084 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
3085 { \
3086 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
3087 } \
3088 b1 = I / BIG5_SAME_ROW + 0xA1; \
3089 b2 = I % BIG5_SAME_ROW; \
3090 b2 += b2 < 0x3F ? 0x40 : 0x62; \
3091 } while (0)
3092
3093 static int
3094 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
3095 unsigned int n)
3096 {
3097 int c;
3098
3099 while (n--)
3100 {
3101 c = *src++;
3102 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
3103 (c >= 0x80 && c <= 0xA0))
3104 return 0;
3105 if (st->big5.in_second_byte)
3106 {
3107 st->big5.in_second_byte = 0;
3108 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
3109 return 0;
3110 }
3111 else if (c >= 0xA1)
3112 st->big5.in_second_byte = 1;
3113 }
3114 return CODING_CATEGORY_BIG5_MASK;
3115 }
3116
3117 /* Convert Big5 data to internal format. */
3118
3119 static void
3120 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
3121 unsigned_char_dynarr *dst, unsigned int n)
3122 {
3123 unsigned char c;
3124 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3125 unsigned int flags = str->flags;
3126 unsigned int ch = str->ch;
3127 eol_type_t eol_type = str->eol_type;
3128
3129 while (n--)
3130 {
3131 c = *src++;
3132 if (ch)
3133 {
3134 /* Previous character was first byte of Big5 char. */
3135 if (BYTE_BIG5_TWO_BYTE_2_P (c))
3136 {
3137 unsigned char b1, b2, b3;
3138 DECODE_BIG5 (ch, c, b1, b2, b3);
3139 Dynarr_add (dst, b1);
3140 Dynarr_add (dst, b2);
3141 Dynarr_add (dst, b3);
3142 }
3143 else
3144 {
3145 DECODE_ADD_BINARY_CHAR (ch, dst);
3146 DECODE_ADD_BINARY_CHAR (c, dst);
3147 }
3148 ch = 0;
3149 }
3150 else
3151 {
3152 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3153 if (BYTE_BIG5_TWO_BYTE_1_P (c))
3154 ch = c;
3155 else
3156 DECODE_ADD_BINARY_CHAR (c, dst);
3157 }
3158 label_continue_loop:;
3159 }
3160
3161 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
3162
3163 str->flags = flags;
3164 str->ch = ch;
3165 }
3166
3167 /* Convert internally-formatted data to Big5. */
3168
3169 static void
3170 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
3171 unsigned_char_dynarr *dst, unsigned int n)
3172 {
3173 unsigned char c;
3174 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3175 unsigned int flags = str->flags;
3176 unsigned int ch = str->ch;
3177 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3178
3179 while (n--)
3180 {
3181 c = *src++;
3182 if (c == '\n')
3183 {
3184 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3185 Dynarr_add (dst, '\r');
3186 if (eol_type != EOL_CR)
3187 Dynarr_add (dst, '\n');
3188 }
3189 else if (BYTE_ASCII_P (c))
3190 {
3191 /* ASCII. */
3192 Dynarr_add (dst, c);
3193 }
3194 else if (BUFBYTE_LEADING_BYTE_P (c))
3195 {
3196 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
3197 c == LEADING_BYTE_CHINESE_BIG5_2)
3198 {
3199 /* A recognized leading byte. */
3200 ch = c;
3201 continue; /* not done with this character. */
3202 }
3203 /* otherwise just ignore this character. */
3204 }
3205 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
3206 ch == LEADING_BYTE_CHINESE_BIG5_2)
3207 {
3208 /* Previous char was a recognized leading byte. */
3209 ch = (ch << 8) | c;
3210 continue; /* not done with this character. */
3211 }
3212 else if (ch)
3213 {
3214 /* Encountering second byte of a Big5 character. */
3215 unsigned char b1, b2;
3216
3217 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
3218 Dynarr_add (dst, b1);
3219 Dynarr_add (dst, b2);
3220 }
3221
3222 ch = 0;
3223 }
3224
3225 str->flags = flags;
3226 str->ch = ch;
3227 }
3228
3229
3230 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
3231 Decode a Big5 character CODE of BIG5 coding-system.
3232 CODE is the character code in BIG5, a cons of two integers.
3233 Return the corresponding character.
3234 */
3235 (code))
3236 {
3237 unsigned char c1, c2, b1, b2;
3238
3239 CHECK_CONS (code);
3240 CHECK_INT (XCAR (code));
3241 CHECK_INT (XCDR (code));
3242 b1 = XINT (XCAR (code));
3243 b2 = XINT (XCDR (code));
3244 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
3245 BYTE_BIG5_TWO_BYTE_2_P (b2))
3246 {
3247 int leading_byte;
3248 Lisp_Object charset;
3249 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
3250 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
3251 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
3252 }
3253 else
3254 return Qnil;
3255 }
3256
3257 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
3258 Encode the Big5 character CH to BIG5 coding-system.
3259 Return the corresponding character code in Big5.
3260 */
3261 (ch))
3262 {
3263 Lisp_Object charset;
3264 int c1, c2, b1, b2;
3265
3266 CHECK_CHAR_COERCE_INT (ch);
3267 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
3268 if (EQ (charset, Vcharset_chinese_big5_1) ||
3269 EQ (charset, Vcharset_chinese_big5_2))
3270 {
3271 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
3272 b1, b2);
3273 return Fcons (make_int (b1), make_int (b2));
3274 }
3275 else
3276 return Qnil;
3277 }
3278
3279
3280 /************************************************************************/
3281 /* UCS-4 methods */
3282 /* */
3283 /* UCS-4 character codes are implemented as nonnegative integers. */
3284 /* */
3285 /************************************************************************/
3286
3287
3288 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3289 Map UCS-4 code CODE to Mule character CHARACTER.
3290
3291 Return T on success, NIL on failure.
3292 */
3293 (code, character))
3294 {
3295 unsigned int c;
3296
3297 CHECK_CHAR (character);
3298 CHECK_INT (code);
3299 c = XINT (code);
3300
3301 if (c < sizeof (fcd->ucs_to_mule_table))
3302 {
3303 fcd->ucs_to_mule_table[c] = character;
3304 return Qt;
3305 }
3306 else
3307 return Qnil;
3308 }
3309
3310 static Lisp_Object
3311 ucs_to_char (unsigned long code)
3312 {
3313 if (code < sizeof (fcd->ucs_to_mule_table))
3314 {
3315 return fcd->ucs_to_mule_table[code];
3316 }
3317 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3318 {
3319 unsigned int c;
3320
3321 code -= 0xe00000;
3322 c = code % (94 * 94);
3323 return make_char
3324 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3325 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3326 CHARSET_LEFT_TO_RIGHT),
3327 c / 94 + 33, c % 94 + 33));
3328 }
3329 else
3330 return Qnil;
3331 }
3332
3333 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3334 Return Mule character corresponding to UCS code CODE (a positive integer).
3335 */
3336 (code))
3337 {
3338 CHECK_NATNUM (code);
3339 return ucs_to_char (XINT (code));
3340 }
3341
3342 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3343 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3344 */
3345 (character, code))
3346 {
3347 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3348 Fset_char_ucs is more restrictive on index arg, but should
3349 check code arg in a char_table method. */
3350 CHECK_CHAR (character);
3351 CHECK_NATNUM (code);
3352 return Fput_char_table (character, code, mule_to_ucs_table);
3353 }
3354
3355 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3356 Return the UCS code (a positive integer) corresponding to CHARACTER.
3357 */
3358 (character))
3359 {
3360 return Fget_char_table (character, mule_to_ucs_table);
3361 }
3362
3363 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3364 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
3365 is not found, instead.
3366 #### do something more appropriate (use blob?)
3367 Danger, Will Robinson! Data loss. Should we signal user? */
3368 static void
3369 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3370 {
3371 Lisp_Object chr = ucs_to_char (ch);
3372
3373 if (! NILP (chr))
3374 {
3375 Bufbyte work[MAX_EMCHAR_LEN];
3376 int len;
3377
3378 ch = XCHAR (chr);
3379 len = (ch < 128) ?
3380 simple_set_charptr_emchar (work, ch) :
3381 non_ascii_set_charptr_emchar (work, ch);
3382 Dynarr_add_many (dst, work, len);
3383 }
3384 else
3385 {
3386 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3387 Dynarr_add (dst, 34 + 128);
3388 Dynarr_add (dst, 46 + 128);
3389 }
3390 }
3391
3392 static unsigned long
3393 mule_char_to_ucs4 (Lisp_Object charset,
3394 unsigned char h, unsigned char l)
3395 {
3396 Lisp_Object code
3397 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3398 mule_to_ucs_table);
3399
3400 if (INTP (code))
3401 {
3402 return XINT (code);
3403 }
3404 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3405 (XCHARSET_CHARS (charset) == 94) )
3406 {
3407 unsigned char final = XCHARSET_FINAL (charset);
3408
3409 if ( ('@' <= final) && (final < 0x7f) )
3410 {
3411 return 0xe00000 + (final - '@') * 94 * 94
3412 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3413 }
3414 else
3415 {
3416 return '?';
3417 }
3418 }
3419 else
3420 {
3421 return '?';
3422 }
3423 }
3424
3425 static void
3426 encode_ucs4 (Lisp_Object charset,
3427 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3428 {
3429 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3430 Dynarr_add (dst, code >> 24);
3431 Dynarr_add (dst, (code >> 16) & 255);
3432 Dynarr_add (dst, (code >> 8) & 255);
3433 Dynarr_add (dst, code & 255);
3434 }
3435
3436 static int
3437 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3438 unsigned int n)
3439 {
3440 while (n--)
3441 {
3442 int c = *src++;
3443 switch (st->ucs4.in_byte)
3444 {
3445 case 0:
3446 if (c >= 128)
3447 return 0;
3448 else
3449 st->ucs4.in_byte++;
3450 break;
3451 case 3:
3452 st->ucs4.in_byte = 0;
3453 break;
3454 default:
3455 st->ucs4.in_byte++;
3456 }
3457 }
3458 return CODING_CATEGORY_UCS4_MASK;
3459 }
3460
3461 static void
3462 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3463 unsigned_char_dynarr *dst, unsigned int n)
3464 {
3465 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3466 unsigned int flags = str->flags;
3467 unsigned int ch = str->ch;
3468 unsigned char counter = str->counter;
3469
3470 while (n--)
3471 {
3472 unsigned char c = *src++;
3473 switch (counter)
3474 {
3475 case 0:
3476 ch = c;
3477 counter = 3;
3478 break;
3479 case 1:
3480 decode_ucs4 ( ( ch << 8 ) | c, dst);
3481 ch = 0;
3482 counter = 0;
3483 break;
3484 default:
3485 ch = ( ch << 8 ) | c;
3486 counter--;
3487 }
3488 }
3489 if (counter & CODING_STATE_END)
3490 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3491
3492 str->flags = flags;
3493 str->ch = ch;
3494 str->counter = counter;
3495 }
3496
3497 static void
3498 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3499 unsigned_char_dynarr *dst, unsigned int n)
3500 {
3501 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3502 unsigned int flags = str->flags;
3503 unsigned int ch = str->ch;
3504 unsigned char char_boundary = str->iso2022.current_char_boundary;
3505 Lisp_Object charset = str->iso2022.current_charset;
3506
3507 #ifdef ENABLE_COMPOSITE_CHARS
3508 /* flags for handling composite chars. We do a little switcharoo
3509 on the source while we're outputting the composite char. */
3510 unsigned int saved_n = 0;
3511 CONST unsigned char *saved_src = NULL;
3512 int in_composite = 0;
3513
3514 back_to_square_n:
3515 #endif
3516
3517 while (n--)
3518 {
3519 unsigned char c = *src++;
3520
3521 if (BYTE_ASCII_P (c))
3522 { /* Processing ASCII character */
3523 ch = 0;
3524 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3525 char_boundary = 1;
3526 }
3527 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3528 { /* Processing Leading Byte */
3529 ch = 0;
3530 charset = CHARSET_BY_LEADING_BYTE (c);
3531 if (LEADING_BYTE_PREFIX_P(c))
3532 ch = c;
3533 char_boundary = 0;
3534 }
3535 else
3536 { /* Processing Non-ASCII character */
3537 char_boundary = 1;
3538 if (EQ (charset, Vcharset_control_1))
3539 {
3540 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3541 }
3542 else
3543 {
3544 switch (XCHARSET_REP_BYTES (charset))
3545 {
3546 case 2:
3547 encode_ucs4 (charset, c, 0, dst);
3548 break;
3549 case 3:
3550 if (XCHARSET_PRIVATE_P (charset))
3551 {
3552 encode_ucs4 (charset, c, 0, dst);
3553 ch = 0;
3554 }
3555 else if (ch)
3556 {
3557 #ifdef ENABLE_COMPOSITE_CHARS
3558 if (EQ (charset, Vcharset_composite))
3559 {
3560 if (in_composite)
3561 {
3562 /* #### Bother! We don't know how to
3563 handle this yet. */
3564 Dynarr_add (dst, 0);
3565 Dynarr_add (dst, 0);
3566 Dynarr_add (dst, 0);
3567 Dynarr_add (dst, '~');
3568 }
3569 else
3570 {
3571 Emchar emch = MAKE_CHAR (Vcharset_composite,
3572 ch & 0x7F, c & 0x7F);
3573 Lisp_Object lstr = composite_char_string (emch);
3574 saved_n = n;
3575 saved_src = src;
3576 in_composite = 1;
3577 src = XSTRING_DATA (lstr);
3578 n = XSTRING_LENGTH (lstr);
3579 }
3580 }
3581 else
3582 #endif /* ENABLE_COMPOSITE_CHARS */
3583 {
3584 encode_ucs4(charset, ch, c, dst);
3585 }
3586 ch = 0;
3587 }
3588 else
3589 {
3590 ch = c;
3591 char_boundary = 0;
3592 }
3593 break;
3594 case 4:
3595 if (ch)
3596 {
3597 encode_ucs4 (charset, ch, c, dst);
3598 ch = 0;
3599 }
3600 else
3601 {
3602 ch = c;
3603 char_boundary = 0;
3604 }
3605 break;
3606 default:
3607 abort ();
3608 }
3609 }
3610 }
3611 }
3612
3613 #ifdef ENABLE_COMPOSITE_CHARS
3614 if (in_composite)
3615 {
3616 n = saved_n;
3617 src = saved_src;
3618 in_composite = 0;
3619 goto back_to_square_n; /* Wheeeeeeeee ..... */
3620 }
3621 #endif /* ENABLE_COMPOSITE_CHARS */
3622
3623 str->flags = flags;
3624 str->ch = ch;
3625 str->iso2022.current_char_boundary = char_boundary;
3626 str->iso2022.current_charset = charset;
3627
3628 /* Verbum caro factum est! */
3629 }
3630
3631
3632 /************************************************************************/
3633 /* UTF-8 methods */
3634 /************************************************************************/
3635
3636 static int
3637 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3638 unsigned int n)
3639 {
3640 while (n--)
3641 {
3642 unsigned char c = *src++;
3643 switch (st->utf8.in_byte)
3644 {
3645 case 0:
3646 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3647 return 0;
3648 else if (c >= 0xfc)
3649 st->utf8.in_byte = 5;
3650 else if (c >= 0xf8)
3651 st->utf8.in_byte = 4;
3652 else if (c >= 0xf0)
3653 st->utf8.in_byte = 3;
3654 else if (c >= 0xe0)
3655 st->utf8.in_byte = 2;
3656 else if (c >= 0xc0)
3657 st->utf8.in_byte = 1;
3658 else if (c >= 0x80)
3659 return 0;
3660 break;
3661 default:
3662 if ((c & 0xc0) != 0x80)
3663 return 0;
3664 else
3665 st->utf8.in_byte--;
3666 }
3667 }
3668 return CODING_CATEGORY_UTF8_MASK;
3669 }
3670
3671 static void
3672 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3673 unsigned_char_dynarr *dst, unsigned int n)
3674 {
3675 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3676 unsigned int flags = str->flags;
3677 unsigned int ch = str->ch;
3678 eol_type_t eol_type = str->eol_type;
3679 unsigned char counter = str->counter;
3680
3681 while (n--)
3682 {
3683 unsigned char c = *src++;
3684 switch (counter)
3685 {
3686 case 0:
3687 if ( c >= 0xfc )
3688 {
3689 ch = c & 0x01;
3690 counter = 5;
3691 }
3692 else if ( c >= 0xf8 )
3693 {
3694 ch = c & 0x03;
3695 counter = 4;
3696 }
3697 else if ( c >= 0xf0 )
3698 {
3699 ch = c & 0x07;
3700 counter = 3;
3701 }
3702 else if ( c >= 0xe0 )
3703 {
3704 ch = c & 0x0f;
3705 counter = 2;
3706 }
3707 else if ( c >= 0xc0 )
3708 {
3709 ch = c & 0x1f;
3710 counter = 1;
3711 }
3712 else
3713 {
3714 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3715 decode_ucs4 (c, dst);
3716 }
3717 break;
3718 case 1:
3719 ch = ( ch << 6 ) | ( c & 0x3f );
3720 decode_ucs4 (ch, dst);
3721 ch = 0;
3722 counter = 0;
3723 break;
3724 default:
3725 ch = ( ch << 6 ) | ( c & 0x3f );
3726 counter--;
3727 }
3728 label_continue_loop:;
3729 }
3730
3731 if (flags & CODING_STATE_END)
3732 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3733
3734 str->flags = flags;
3735 str->ch = ch;
3736 str->counter = counter;
3737 }
3738
3739 static void
3740 encode_utf8 (Lisp_Object charset,
3741 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3742 {
3743 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3744 if ( code <= 0x7f )
3745 {
3746 Dynarr_add (dst, code);
3747 }
3748 else if ( code <= 0x7ff )
3749 {
3750 Dynarr_add (dst, (code >> 6) | 0xc0);
3751 Dynarr_add (dst, (code & 0x3f) | 0x80);
3752 }
3753 else if ( code <= 0xffff )
3754 {
3755 Dynarr_add (dst, (code >> 12) | 0xe0);
3756 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3757 Dynarr_add (dst, (code & 0x3f) | 0x80);
3758 }
3759 else if ( code <= 0x1fffff )
3760 {
3761 Dynarr_add (dst, (code >> 18) | 0xf0);
3762 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3763 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3764 Dynarr_add (dst, (code & 0x3f) | 0x80);
3765 }
3766 else if ( code <= 0x3ffffff )
3767 {
3768 Dynarr_add (dst, (code >> 24) | 0xf8);
3769 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3770 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3771 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3772 Dynarr_add (dst, (code & 0x3f) | 0x80);
3773 }
3774 else
3775 {
3776 Dynarr_add (dst, (code >> 30) | 0xfc);
3777 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3778 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3779 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3780 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3781 Dynarr_add (dst, (code & 0x3f) | 0x80);
3782 }
3783 }
3784
3785 static void
3786 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3787 unsigned_char_dynarr *dst, unsigned int n)
3788 {
3789 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3790 unsigned int flags = str->flags;
3791 unsigned int ch = str->ch;
3792 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3793 unsigned char char_boundary = str->iso2022.current_char_boundary;
3794 Lisp_Object charset = str->iso2022.current_charset;
3795
3796 #ifdef ENABLE_COMPOSITE_CHARS
3797 /* flags for handling composite chars. We do a little switcharoo
3798 on the source while we're outputting the composite char. */
3799 unsigned int saved_n = 0;
3800 CONST unsigned char *saved_src = NULL;
3801 int in_composite = 0;
3802
3803 back_to_square_n:
3804 #endif /* ENABLE_COMPOSITE_CHARS */
3805
3806 while (n--)
3807 {
3808 unsigned char c = *src++;
3809
3810 if (BYTE_ASCII_P (c))
3811 { /* Processing ASCII character */
3812 ch = 0;
3813 if (c == '\n')
3814 {
3815 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3816 Dynarr_add (dst, '\r');
3817 if (eol_type != EOL_CR)
3818 Dynarr_add (dst, c);
3819 }
3820 else
3821 encode_utf8 (Vcharset_ascii, c, 0, dst);
3822 char_boundary = 1;
3823 }
3824 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3825 { /* Processing Leading Byte */
3826 ch = 0;
3827 charset = CHARSET_BY_LEADING_BYTE (c);
3828 if (LEADING_BYTE_PREFIX_P(c))
3829 ch = c;
3830 char_boundary = 0;
3831 }
3832 else
3833 { /* Processing Non-ASCII character */
3834 char_boundary = 1;
3835 if (EQ (charset, Vcharset_control_1))
3836 {
3837 encode_utf8 (Vcharset_control_1, c, 0, dst);
3838 }
3839 else
3840 {
3841 switch (XCHARSET_REP_BYTES (charset))
3842 {
3843 case 2:
3844 encode_utf8 (charset, c, 0, dst);
3845 break;
3846 case 3:
3847 if (XCHARSET_PRIVATE_P (charset))
3848 {
3849 encode_utf8 (charset, c, 0, dst);
3850 ch = 0;
3851 }
3852 else if (ch)
3853 {
3854 #ifdef ENABLE_COMPOSITE_CHARS
3855 if (EQ (charset, Vcharset_composite))
3856 {
3857 if (in_composite)
3858 {
3859 /* #### Bother! We don't know how to
3860 handle this yet. */
3861 encode_utf8 (Vcharset_ascii, '~', 0, dst);
3862 }
3863 else
3864 {
3865 Emchar emch = MAKE_CHAR (Vcharset_composite,
3866 ch & 0x7F, c & 0x7F);
3867 Lisp_Object lstr = composite_char_string (emch);
3868 saved_n = n;
3869 saved_src = src;
3870 in_composite = 1;
3871 src = XSTRING_DATA (lstr);
3872 n = XSTRING_LENGTH (lstr);
3873 }
3874 }
3875 else
3876 #endif /* ENABLE_COMPOSITE_CHARS */
3877 {
3878 encode_utf8 (charset, ch, c, dst);
3879 }
3880 ch = 0;
3881 }
3882 else
3883 {
3884 ch = c;
3885 char_boundary = 0;
3886 }
3887 break;
3888 case 4:
3889 if (ch)
3890 {
3891 encode_utf8 (charset, ch, c, dst);
3892 ch = 0;
3893 }
3894 else
3895 {
3896 ch = c;
3897 char_boundary = 0;
3898 }
3899 break;
3900 default:
3901 abort ();
3902 }
3903 }
3904 }
3905 }
3906
3907 #ifdef ENABLE_COMPOSITE_CHARS
3908 if (in_composite)
3909 {
3910 n = saved_n;
3911 src = saved_src;
3912 in_composite = 0;
3913 goto back_to_square_n; /* Wheeeeeeeee ..... */
3914 }
3915 #endif
3916
3917 str->flags = flags;
3918 str->ch = ch;
3919 str->iso2022.current_char_boundary = char_boundary;
3920 str->iso2022.current_charset = charset;
3921
3922 /* Verbum caro factum est! */
3923 }
3924
3925
3926 /************************************************************************/
3927 /* ISO2022 methods */
3928 /************************************************************************/
3929
3930 /* The following note describes the coding system ISO2022 briefly.
3931 Since the intention of this note is to help understand the
3932 functions in this file, some parts are NOT ACCURATE or OVERLY
3933 SIMPLIFIED. For thorough understanding, please refer to the
3934 original document of ISO2022.
3935
3936 ISO2022 provides many mechanisms to encode several character sets
3937 in 7-bit and 8-bit environments. For 7-bit environments, all text
3938 is encoded using bytes less than 128. This may make the encoded
3939 text a little bit longer, but the text passes more easily through
3940 several gateways, some of which strip off MSB (Most Signigant Bit).
3941
3942 There are two kinds of character sets: control character set and
3943 graphic character set. The former contains control characters such
3944 as `newline' and `escape' to provide control functions (control
3945 functions are also provided by escape sequences). The latter
3946 contains graphic characters such as 'A' and '-'. Emacs recognizes
3947 two control character sets and many graphic character sets.
3948
3949 Graphic character sets are classified into one of the following
3950 four classes, according to the number of bytes (DIMENSION) and
3951 number of characters in one dimension (CHARS) of the set:
3952 - DIMENSION1_CHARS94
3953 - DIMENSION1_CHARS96
3954 - DIMENSION2_CHARS94
3955 - DIMENSION2_CHARS96
3956
3957 In addition, each character set is assigned an identification tag,
3958 unique for each set, called "final character" (denoted as <F>
3959 hereafter). The <F> of each character set is decided by ECMA(*)
3960 when it is registered in ISO. The code range of <F> is 0x30..0x7F
3961 (0x30..0x3F are for private use only).
3962
3963 Note (*): ECMA = European Computer Manufacturers Association
3964
3965 Here are examples of graphic character set [NAME(<F>)]:
3966 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3967 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3968 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3969 o DIMENSION2_CHARS96 -- none for the moment
3970
3971 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3972 C0 [0x00..0x1F] -- control character plane 0
3973 GL [0x20..0x7F] -- graphic character plane 0
3974 C1 [0x80..0x9F] -- control character plane 1
3975 GR [0xA0..0xFF] -- graphic character plane 1
3976
3977 A control character set is directly designated and invoked to C0 or
3978 C1 by an escape sequence. The most common case is that:
3979 - ISO646's control character set is designated/invoked to C0, and
3980 - ISO6429's control character set is designated/invoked to C1,
3981 and usually these designations/invocations are omitted in encoded
3982 text. In a 7-bit environment, only C0 can be used, and a control
3983 character for C1 is encoded by an appropriate escape sequence to
3984 fit into the environment. All control characters for C1 are
3985 defined to have corresponding escape sequences.
3986
3987 A graphic character set is at first designated to one of four
3988 graphic registers (G0 through G3), then these graphic registers are
3989 invoked to GL or GR. These designations and invocations can be
3990 done independently. The most common case is that G0 is invoked to
3991 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3992 these invocations and designations are omitted in encoded text.
3993 In a 7-bit environment, only GL can be used.
3994
3995 When a graphic character set of CHARS94 is invoked to GL, codes
3996 0x20 and 0x7F of the GL area work as control characters SPACE and
3997 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3998 be used.
3999
4000 There are two ways of invocation: locking-shift and single-shift.
4001 With locking-shift, the invocation lasts until the next different
4002 invocation, whereas with single-shift, the invocation affects the
4003 following character only and doesn't affect the locking-shift
4004 state. Invocations are done by the following control characters or
4005 escape sequences:
4006
4007 ----------------------------------------------------------------------
4008 abbrev function cntrl escape seq description
4009 ----------------------------------------------------------------------
4010 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
4011 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
4012 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
4013 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
4014 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
4015 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
4016 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
4017 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
4018 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4019 ----------------------------------------------------------------------
4020 (*) These are not used by any known coding system.
4021
4022 Control characters for these functions are defined by macros
4023 ISO_CODE_XXX in `coding.h'.
4024
4025 Designations are done by the following escape sequences:
4026 ----------------------------------------------------------------------
4027 escape sequence description
4028 ----------------------------------------------------------------------
4029 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
4030 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
4031 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
4032 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
4033 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
4034 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
4035 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
4036 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
4037 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
4038 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
4039 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
4040 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
4041 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
4042 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
4043 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
4044 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
4045 ----------------------------------------------------------------------
4046
4047 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
4048 of dimension 1, chars 94, and final character <F>, etc...
4049
4050 Note (*): Although these designations are not allowed in ISO2022,
4051 Emacs accepts them on decoding, and produces them on encoding
4052 CHARS96 character sets in a coding system which is characterized as
4053 7-bit environment, non-locking-shift, and non-single-shift.
4054
4055 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
4056 '(' can be omitted. We refer to this as "short-form" hereafter.
4057
4058 Now you may notice that there are a lot of ways for encoding the
4059 same multilingual text in ISO2022. Actually, there exist many
4060 coding systems such as Compound Text (used in X11's inter client
4061 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
4062 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4063 localized platforms), and all of these are variants of ISO2022.
4064
4065 In addition to the above, Emacs handles two more kinds of escape
4066 sequences: ISO6429's direction specification and Emacs' private
4067 sequence for specifying character composition.
4068
4069 ISO6429's direction specification takes the following form:
4070 o CSI ']' -- end of the current direction
4071 o CSI '0' ']' -- end of the current direction
4072 o CSI '1' ']' -- start of left-to-right text
4073 o CSI '2' ']' -- start of right-to-left text
4074 The control character CSI (0x9B: control sequence introducer) is
4075 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
4076
4077 Character composition specification takes the following form:
4078 o ESC '0' -- start character composition
4079 o ESC '1' -- end character composition
4080 Since these are not standard escape sequences of any ISO standard,
4081 their use with these meanings is restricted to Emacs only. */
4082
4083 static void
4084 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
4085 {
4086 int i;
4087
4088 for (i = 0; i < 4; i++)
4089 {
4090 if (!NILP (coding_system))
4091 iso->charset[i] =
4092 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
4093 else
4094 iso->charset[i] = Qt;
4095 iso->invalid_designated[i] = 0;
4096 }
4097 iso->esc = ISO_ESC_NOTHING;
4098 iso->esc_bytes_index = 0;
4099 iso->register_left = 0;
4100 iso->register_right = 1;
4101 iso->switched_dir_and_no_valid_charset_yet = 0;
4102 iso->invalid_switch_dir = 0;
4103 iso->output_direction_sequence = 0;
4104 iso->output_literally = 0;
4105 #ifdef ENABLE_COMPOSITE_CHARS
4106 if (iso->composite_chars)
4107 Dynarr_reset (iso->composite_chars);
4108 #endif
4109 }
4110
4111 static int
4112 fit_to_be_escape_quoted (unsigned char c)
4113 {
4114 switch (c)
4115 {
4116 case ISO_CODE_ESC:
4117 case ISO_CODE_CSI:
4118 case ISO_CODE_SS2:
4119 case ISO_CODE_SS3:
4120 case ISO_CODE_SO:
4121 case ISO_CODE_SI:
4122 return 1;
4123
4124 default:
4125 return 0;
4126 }
4127 }
4128
4129 /* Parse one byte of an ISO2022 escape sequence.
4130 If the result is an invalid escape sequence, return 0 and
4131 do not change anything in STR. Otherwise, if the result is
4132 an incomplete escape sequence, update ISO2022.ESC and
4133 ISO2022.ESC_BYTES and return -1. Otherwise, update
4134 all the state variables (but not ISO2022.ESC_BYTES) and
4135 return 1.
4136
4137 If CHECK_INVALID_CHARSETS is non-zero, check for designation
4138 or invocation of an invalid character set and treat that as
4139 an unrecognized escape sequence. */
4140
4141 static int
4142 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
4143 unsigned char c, unsigned int *flags,
4144 int check_invalid_charsets)
4145 {
4146 /* (1) If we're at the end of a designation sequence, CS is the
4147 charset being designated and REG is the register to designate
4148 it to.
4149
4150 (2) If we're at the end of a locking-shift sequence, REG is
4151 the register to invoke and HALF (0 == left, 1 == right) is
4152 the half to invoke it into.
4153
4154 (3) If we're at the end of a single-shift sequence, REG is
4155 the register to invoke. */
4156 Lisp_Object cs = Qnil;
4157 int reg, half;
4158
4159 /* NOTE: This code does goto's all over the fucking place.
4160 The reason for this is that we're basically implementing
4161 a state machine here, and hierarchical languages like C
4162 don't really provide a clean way of doing this. */
4163
4164 if (! (*flags & CODING_STATE_ESCAPE))
4165 /* At beginning of escape sequence; we need to reset our
4166 escape-state variables. */
4167 iso->esc = ISO_ESC_NOTHING;
4168
4169 iso->output_literally = 0;
4170 iso->output_direction_sequence = 0;
4171
4172 switch (iso->esc)
4173 {
4174 case ISO_ESC_NOTHING:
4175 iso->esc_bytes_index = 0;
4176 switch (c)
4177 {
4178 case ISO_CODE_ESC: /* Start escape sequence */
4179 *flags |= CODING_STATE_ESCAPE;
4180 iso->esc = ISO_ESC;
4181 goto not_done;
4182
4183 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
4184 *flags |= CODING_STATE_ESCAPE;
4185 iso->esc = ISO_ESC_5_11;
4186 goto not_done;
4187
4188 case ISO_CODE_SO: /* locking shift 1 */
4189 reg = 1; half = 0;
4190 goto locking_shift;
4191 case ISO_CODE_SI: /* locking shift 0 */
4192 reg = 0; half = 0;
4193 goto locking_shift;
4194
4195 case ISO_CODE_SS2: /* single shift */
4196 reg = 2;
4197 goto single_shift;
4198 case ISO_CODE_SS3: /* single shift */
4199 reg = 3;
4200 goto single_shift;
4201
4202 default: /* Other control characters */
4203 return 0;
4204 }
4205
4206 case ISO_ESC:
4207 switch (c)
4208 {
4209 /**** single shift ****/
4210
4211 case 'N': /* single shift 2 */
4212 reg = 2;
4213 goto single_shift;
4214 case 'O': /* single shift 3 */
4215 reg = 3;
4216 goto single_shift;
4217
4218 /**** locking shift ****/
4219
4220 case '~': /* locking shift 1 right */
4221 reg = 1; half = 1;
4222 goto locking_shift;
4223 case 'n': /* locking shift 2 */
4224 reg = 2; half = 0;
4225 goto locking_shift;
4226 case '}': /* locking shift 2 right */
4227 reg = 2; half = 1;
4228 goto locking_shift;
4229 case 'o': /* locking shift 3 */
4230 reg = 3; half = 0;
4231 goto locking_shift;
4232 case '|': /* locking shift 3 right */
4233 reg = 3; half = 1;
4234 goto locking_shift;
4235
4236 #ifdef ENABLE_COMPOSITE_CHARS
4237 /**** composite ****/
4238
4239 case '0':
4240 iso->esc = ISO_ESC_START_COMPOSITE;
4241 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
4242 CODING_STATE_COMPOSITE;
4243 return 1;
4244
4245 case '1':
4246 iso->esc = ISO_ESC_END_COMPOSITE;
4247 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
4248 ~CODING_STATE_COMPOSITE;
4249 return 1;
4250 #endif /* ENABLE_COMPOSITE_CHARS */
4251
4252 /**** directionality ****/
4253
4254 case '[':
4255 iso->esc = ISO_ESC_5_11;
4256 goto not_done;
4257
4258 /**** designation ****/
4259
4260 case '$': /* multibyte charset prefix */
4261 iso->esc = ISO_ESC_2_4;
4262 goto not_done;
4263
4264 default:
4265 if (0x28 <= c && c <= 0x2F)
4266 {
4267 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
4268 goto not_done;
4269 }
4270
4271 /* This function is called with CODESYS equal to nil when
4272 doing coding-system detection. */
4273 if (!NILP (codesys)
4274 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4275 && fit_to_be_escape_quoted (c))
4276 {
4277 iso->esc = ISO_ESC_LITERAL;
4278 *flags &= CODING_STATE_ISO2022_LOCK;
4279 return 1;
4280 }
4281
4282 /* bzzzt! */
4283 return 0;
4284 }
4285
4286
4287
4288 /**** directionality ****/
4289
4290 case ISO_ESC_5_11: /* ISO6429 direction control */
4291 if (c == ']')
4292 {
4293 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4294 goto directionality;
4295 }
4296 if (c == '0') iso->esc = ISO_ESC_5_11_0;
4297 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
4298 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
4299 else return 0;
4300 goto not_done;
4301
4302 case ISO_ESC_5_11_0:
4303 if (c == ']')
4304 {
4305 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4306 goto directionality;
4307 }
4308 return 0;
4309
4310 case ISO_ESC_5_11_1:
4311 if (c == ']')
4312 {
4313 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
4314 goto directionality;
4315 }
4316 return 0;
4317
4318 case ISO_ESC_5_11_2:
4319 if (c == ']')
4320 {
4321 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
4322 goto directionality;
4323 }
4324 return 0;
4325
4326 directionality:
4327 iso->esc = ISO_ESC_DIRECTIONALITY;
4328 /* Various junk here to attempt to preserve the direction sequences
4329 literally in the text if they would otherwise be swallowed due
4330 to invalid designations that don't show up as actual charset
4331 changes in the text. */
4332 if (iso->invalid_switch_dir)
4333 {
4334 /* We already inserted a direction switch literally into the
4335 text. We assume (#### this may not be right) that the
4336 next direction switch is the one going the other way,
4337 and we need to output that literally as well. */
4338 iso->output_literally = 1;
4339 iso->invalid_switch_dir = 0;
4340 }
4341 else
4342 {
4343 int jj;
4344
4345 /* If we are in the thrall of an invalid designation,
4346 then stick the directionality sequence literally into the
4347 output stream so it ends up in the original text again. */
4348 for (jj = 0; jj < 4; jj++)
4349 if (iso->invalid_designated[jj])
4350 break;
4351 if (jj < 4)
4352 {
4353 iso->output_literally = 1;
4354 iso->invalid_switch_dir = 1;
4355 }
4356 else
4357 /* Indicate that we haven't yet seen a valid designation,
4358 so that if a switch-dir is directly followed by an
4359 invalid designation, both get inserted literally. */
4360 iso->switched_dir_and_no_valid_charset_yet = 1;
4361 }
4362 return 1;
4363
4364
4365 /**** designation ****/
4366
4367 case ISO_ESC_2_4:
4368 if (0x28 <= c && c <= 0x2F)
4369 {
4370 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
4371 goto not_done;
4372 }
4373 if (0x40 <= c && c <= 0x42)
4374 {
4375 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
4376 *flags & CODING_STATE_R2L ?
4377 CHARSET_RIGHT_TO_LEFT :
4378 CHARSET_LEFT_TO_RIGHT);
4379 reg = 0;
4380 goto designated;
4381 }
4382 return 0;
4383
4384 default:
4385 {
4386 int type =-1;
4387
4388 if (c < '0' || c > '~')
4389 return 0; /* bad final byte */
4390
4391 if (iso->esc >= ISO_ESC_2_8 &&
4392 iso->esc <= ISO_ESC_2_15)
4393 {
4394 type = ((iso->esc >= ISO_ESC_2_12) ?
4395 CHARSET_TYPE_96 : CHARSET_TYPE_94);
4396 reg = (iso->esc - ISO_ESC_2_8) & 3;
4397 }
4398 else if (iso->esc >= ISO_ESC_2_4_8 &&
4399 iso->esc <= ISO_ESC_2_4_15)
4400 {
4401 type = ((iso->esc >= ISO_ESC_2_4_12) ?
4402 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
4403 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
4404 }
4405 else
4406 {
4407 /* Can this ever be reached? -slb */
4408 abort();
4409 }
4410
4411 cs = CHARSET_BY_ATTRIBUTES (type, c,
4412 *flags & CODING_STATE_R2L ?
4413 CHARSET_RIGHT_TO_LEFT :
4414 CHARSET_LEFT_TO_RIGHT);
4415 goto designated;
4416 }
4417 }
4418
4419 not_done:
4420 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
4421 return -1;
4422
4423 single_shift:
4424 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
4425 /* can't invoke something that ain't there. */
4426 return 0;
4427 iso->esc = ISO_ESC_SINGLE_SHIFT;
4428 *flags &= CODING_STATE_ISO2022_LOCK;
4429 if (reg == 2)
4430 *flags |= CODING_STATE_SS2;
4431 else
4432 *flags |= CODING_STATE_SS3;
4433 return 1;
4434
4435 locking_shift:
4436 if (check_invalid_charsets &&
4437 !CHARSETP (iso->charset[reg]))
4438 /* can't invoke something that ain't there. */
4439 return 0;
4440 if (half)
4441 iso->register_right = reg;
4442 else
4443 iso->register_left = reg;
4444 *flags &= CODING_STATE_ISO2022_LOCK;
4445 iso->esc = ISO_ESC_LOCKING_SHIFT;
4446 return 1;
4447
4448 designated:
4449 if (NILP (cs) && check_invalid_charsets)
4450 {
4451 iso->invalid_designated[reg] = 1;
4452 iso->charset[reg] = Vcharset_ascii;
4453 iso->esc = ISO_ESC_DESIGNATE;
4454 *flags &= CODING_STATE_ISO2022_LOCK;
4455 iso->output_literally = 1;
4456 if (iso->switched_dir_and_no_valid_charset_yet)
4457 {
4458 /* We encountered a switch-direction followed by an
4459 invalid designation. Ensure that the switch-direction
4460 gets outputted; otherwise it will probably get eaten
4461 when the text is written out again. */
4462 iso->switched_dir_and_no_valid_charset_yet = 0;
4463 iso->output_direction_sequence = 1;
4464 /* And make sure that the switch-dir going the other
4465 way gets outputted, as well. */
4466 iso->invalid_switch_dir = 1;
4467 }
4468 return 1;
4469 }
4470 /* This function is called with CODESYS equal to nil when
4471 doing coding-system detection. */
4472 if (!NILP (codesys))
4473 {
4474 charset_conversion_spec_dynarr *dyn =
4475 XCODING_SYSTEM (codesys)->iso2022.input_conv;
4476
4477 if (dyn)
4478 {
4479 int i;
4480
4481 for (i = 0; i < Dynarr_length (dyn); i++)
4482 {
4483 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4484 if (EQ (cs, spec->from_charset))
4485 cs = spec->to_charset;
4486 }
4487 }
4488 }
4489
4490 iso->charset[reg] = cs;
4491 iso->esc = ISO_ESC_DESIGNATE;
4492 *flags &= CODING_STATE_ISO2022_LOCK;
4493 if (iso->invalid_designated[reg])
4494 {
4495 iso->invalid_designated[reg] = 0;
4496 iso->output_literally = 1;
4497 }
4498 if (iso->switched_dir_and_no_valid_charset_yet)
4499 iso->switched_dir_and_no_valid_charset_yet = 0;
4500 return 1;
4501 }
4502
4503 static int
4504 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
4505 unsigned int n)
4506 {
4507 int mask;
4508
4509 /* #### There are serious deficiencies in the recognition mechanism
4510 here. This needs to be much smarter if it's going to cut it.
4511 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
4512 it should be detected as Latin-1.
4513 All the ISO2022 stuff in this file should be synced up with the
4514 code from FSF Emacs-20.4, in which Mule should be more or less stable.
4515 Perhaps we should wait till R2L works in FSF Emacs? */
4516
4517 if (!st->iso2022.initted)
4518 {
4519 reset_iso2022 (Qnil, &st->iso2022.iso);
4520 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
4521 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4522 CODING_CATEGORY_ISO_8_1_MASK |
4523 CODING_CATEGORY_ISO_8_2_MASK |
4524 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
4525 st->iso2022.flags = 0;
4526 st->iso2022.high_byte_count = 0;
4527 st->iso2022.saw_single_shift = 0;
4528 st->iso2022.initted = 1;
4529 }
4530
4531 mask = st->iso2022.mask;
4532
4533 while (n--)
4534 {
4535 int c = *src++;
4536 if (c >= 0xA0)
4537 {
4538 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4539 st->iso2022.high_byte_count++;
4540 }
4541 else
4542 {
4543 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
4544 {
4545 if (st->iso2022.high_byte_count & 1)
4546 /* odd number of high bytes; assume not iso-8-2 */
4547 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4548 }
4549 st->iso2022.high_byte_count = 0;
4550 st->iso2022.saw_single_shift = 0;
4551 if (c > 0x80)
4552 mask &= ~CODING_CATEGORY_ISO_7_MASK;
4553 }
4554 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
4555 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
4556 { /* control chars */
4557 switch (c)
4558 {
4559 /* Allow and ignore control characters that you might
4560 reasonably see in a text file */
4561 case '\r':
4562 case '\n':
4563 case '\t':
4564 case 7: /* bell */
4565 case 8: /* backspace */
4566 case 11: /* vertical tab */
4567 case 12: /* form feed */
4568 case 26: /* MS-DOS C-z junk */
4569 case 31: /* '^_' -- for info */
4570 goto label_continue_loop;
4571
4572 default:
4573 break;
4574 }
4575 }
4576
4577 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
4578 || BYTE_C1_P (c))
4579 {
4580 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
4581 &st->iso2022.flags, 0))
4582 {
4583 switch (st->iso2022.iso.esc)
4584 {
4585 case ISO_ESC_DESIGNATE:
4586 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
4587 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
4588 break;
4589 case ISO_ESC_LOCKING_SHIFT:
4590 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
4591 goto ran_out_of_chars;
4592 case ISO_ESC_SINGLE_SHIFT:
4593 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
4594 st->iso2022.saw_single_shift = 1;
4595 break;
4596 default:
4597 break;
4598 }
4599 }
4600 else
4601 {
4602 mask = 0;
4603 goto ran_out_of_chars;
4604 }
4605 }
4606 label_continue_loop:;
4607 }
4608
4609 ran_out_of_chars:
4610
4611 return mask;
4612 }
4613
4614 static int
4615 postprocess_iso2022_mask (int mask)
4616 {
4617 /* #### kind of cheesy */
4618 /* If seven-bit ISO is allowed, then assume that the encoding is
4619 entirely seven-bit and turn off the eight-bit ones. */
4620 if (mask & CODING_CATEGORY_ISO_7_MASK)
4621 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
4622 CODING_CATEGORY_ISO_8_1_MASK |
4623 CODING_CATEGORY_ISO_8_2_MASK);
4624 return mask;
4625 }
4626
4627 /* If FLAGS is a null pointer or specifies right-to-left motion,
4628 output a switch-dir-to-left-to-right sequence to DST.
4629 Also update FLAGS if it is not a null pointer.
4630 If INTERNAL_P is set, we are outputting in internal format and
4631 need to handle the CSI differently. */
4632
4633 static void
4634 restore_left_to_right_direction (Lisp_Coding_System *codesys,
4635 unsigned_char_dynarr *dst,
4636 unsigned int *flags,
4637 int internal_p)
4638 {
4639 if (!flags || (*flags & CODING_STATE_R2L))
4640 {
4641 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4642 {
4643 Dynarr_add (dst, ISO_CODE_ESC);
4644 Dynarr_add (dst, '[');
4645 }
4646 else if (internal_p)
4647 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4648 else
4649 Dynarr_add (dst, ISO_CODE_CSI);
4650 Dynarr_add (dst, '0');
4651 Dynarr_add (dst, ']');
4652 if (flags)
4653 *flags &= ~CODING_STATE_R2L;
4654 }
4655 }
4656
4657 /* If FLAGS is a null pointer or specifies a direction different from
4658 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
4659 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
4660 sequence to DST. Also update FLAGS if it is not a null pointer.
4661 If INTERNAL_P is set, we are outputting in internal format and
4662 need to handle the CSI differently. */
4663
4664 static void
4665 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
4666 unsigned_char_dynarr *dst, unsigned int *flags,
4667 int internal_p)
4668 {
4669 if ((!flags || (*flags & CODING_STATE_R2L)) &&
4670 direction == CHARSET_LEFT_TO_RIGHT)
4671 restore_left_to_right_direction (codesys, dst, flags, internal_p);
4672 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
4673 && (!flags || !(*flags & CODING_STATE_R2L)) &&
4674 direction == CHARSET_RIGHT_TO_LEFT)
4675 {
4676 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4677 {
4678 Dynarr_add (dst, ISO_CODE_ESC);
4679 Dynarr_add (dst, '[');
4680 }
4681 else if (internal_p)
4682 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
4683 else
4684 Dynarr_add (dst, ISO_CODE_CSI);
4685 Dynarr_add (dst, '2');
4686 Dynarr_add (dst, ']');
4687 if (flags)
4688 *flags |= CODING_STATE_R2L;
4689 }
4690 }
4691
4692 /* Convert ISO2022-format data to internal format. */
4693
4694 static void
4695 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
4696 unsigned_char_dynarr *dst, unsigned int n)
4697 {
4698 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4699 unsigned int flags = str->flags;
4700 unsigned int ch = str->ch;
4701 eol_type_t eol_type = str->eol_type;
4702 #ifdef ENABLE_COMPOSITE_CHARS
4703 unsigned_char_dynarr *real_dst = dst;
4704 #endif
4705 Lisp_Object coding_system;
4706
4707 XSETCODING_SYSTEM (coding_system, str->codesys);
4708
4709 #ifdef ENABLE_COMPOSITE_CHARS
4710 if (flags & CODING_STATE_COMPOSITE)
4711 dst = str->iso2022.composite_chars;
4712 #endif /* ENABLE_COMPOSITE_CHARS */
4713
4714 while (n--)
4715 {
4716 unsigned char c = *src++;
4717 if (flags & CODING_STATE_ESCAPE)
4718 { /* Within ESC sequence */
4719 int retval = parse_iso2022_esc (coding_system, &str->iso2022,
4720 c, &flags, 1);
4721
4722 if (retval)
4723 {
4724 switch (str->iso2022.esc)
4725 {
4726 #ifdef ENABLE_COMPOSITE_CHARS
4727 case ISO_ESC_START_COMPOSITE:
4728 if (str->iso2022.composite_chars)
4729 Dynarr_reset (str->iso2022.composite_chars);
4730 else
4731 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
4732 dst = str->iso2022.composite_chars;
4733 break;
4734 case ISO_ESC_END_COMPOSITE:
4735 {
4736 Bufbyte comstr[MAX_EMCHAR_LEN];
4737 Bytecount len;
4738 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
4739 Dynarr_length (dst));
4740 dst = real_dst;
4741 len = set_charptr_emchar (comstr, emch);
4742 Dynarr_add_many (dst, comstr, len);
4743 break;
4744 }
4745 #endif /* ENABLE_COMPOSITE_CHARS */
4746
4747 case ISO_ESC_LITERAL:
4748 DECODE_ADD_BINARY_CHAR (c, dst);
4749 break;
4750
4751 default:
4752 /* Everything else handled already */
4753 break;
4754 }
4755 }
4756
4757 /* Attempted error recovery. */
4758 if (str->iso2022.output_direction_sequence)
4759 ensure_correct_direction (flags & CODING_STATE_R2L ?
4760 CHARSET_RIGHT_TO_LEFT :
4761 CHARSET_LEFT_TO_RIGHT,
4762 str->codesys, dst, 0, 1);
4763 /* More error recovery. */
4764 if (!retval || str->iso2022.output_literally)
4765 {
4766 /* Output the (possibly invalid) sequence */
4767 int i;
4768 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
4769 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
4770 flags &= CODING_STATE_ISO2022_LOCK;
4771 if (!retval)
4772 n++, src--;/* Repeat the loop with the same character. */
4773 else
4774 {
4775 /* No sense in reprocessing the final byte of the
4776 escape sequence; it could mess things up anyway.
4777 Just add it now. */
4778 DECODE_ADD_BINARY_CHAR (c, dst);
4779 }
4780 }
4781 ch = 0;
4782 }
4783 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
4784 { /* Control characters */
4785
4786 /***** Error-handling *****/
4787
4788 /* If we were in the middle of a character, dump out the
4789 partial character. */
4790 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4791
4792 /* If we just saw a single-shift character, dump it out.
4793 This may dump out the wrong sort of single-shift character,
4794 but least it will give an indication that something went
4795 wrong. */
4796 if (flags & CODING_STATE_SS2)
4797 {
4798 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
4799 flags &= ~CODING_STATE_SS2;
4800 }
4801 if (flags & CODING_STATE_SS3)
4802 {
4803 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
4804 flags &= ~CODING_STATE_SS3;
4805 }
4806
4807 /***** Now handle the control characters. *****/
4808
4809 /* Handle CR/LF */
4810 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4811
4812 flags &= CODING_STATE_ISO2022_LOCK;
4813
4814 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
4815 DECODE_ADD_BINARY_CHAR (c, dst);
4816 }
4817 else
4818 { /* Graphic characters */
4819 Lisp_Object charset;
4820 int lb;
4821 int reg;
4822
4823 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
4824
4825 /* Now determine the charset. */
4826 reg = ((flags & CODING_STATE_SS2) ? 2
4827 : (flags & CODING_STATE_SS3) ? 3
4828 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
4829 : str->iso2022.register_left);
4830 charset = str->iso2022.charset[reg];
4831
4832 /* Error checking: */
4833 if (! CHARSETP (charset)
4834 || str->iso2022.invalid_designated[reg]
4835 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
4836 && XCHARSET_CHARS (charset) == 94))
4837 /* Mrmph. We are trying to invoke a register that has no
4838 or an invalid charset in it, or trying to add a character
4839 outside the range of the charset. Insert that char literally
4840 to preserve it for the output. */
4841 {
4842 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4843 DECODE_ADD_BINARY_CHAR (c, dst);
4844 }
4845
4846 else
4847 {
4848 /* Things are probably hunky-dorey. */
4849
4850 /* Fetch reverse charset, maybe. */
4851 if (((flags & CODING_STATE_R2L) &&
4852 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
4853 ||
4854 (!(flags & CODING_STATE_R2L) &&
4855 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
4856 {
4857 Lisp_Object new_charset =
4858 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
4859 if (!NILP (new_charset))
4860 charset = new_charset;
4861 }
4862
4863 lb = XCHARSET_LEADING_BYTE (charset);
4864 switch (XCHARSET_REP_BYTES (charset))
4865 {
4866 case 1: /* ASCII */
4867 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4868 Dynarr_add (dst, c & 0x7F);
4869 break;
4870
4871 case 2: /* one-byte official */
4872 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4873 Dynarr_add (dst, lb);
4874 Dynarr_add (dst, c | 0x80);
4875 break;
4876
4877 case 3: /* one-byte private or two-byte official */
4878 if (XCHARSET_PRIVATE_P (charset))
4879 {
4880 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4881 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
4882 Dynarr_add (dst, lb);
4883 Dynarr_add (dst, c | 0x80);
4884 }
4885 else
4886 {
4887 if (ch)
4888 {
4889 Dynarr_add (dst, lb);
4890 Dynarr_add (dst, ch | 0x80);
4891 Dynarr_add (dst, c | 0x80);
4892 ch = 0;
4893 }
4894 else
4895 ch = c;
4896 }
4897 break;
4898
4899 default: /* two-byte private */
4900 if (ch)
4901 {
4902 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
4903 Dynarr_add (dst, lb);
4904 Dynarr_add (dst, ch | 0x80);
4905 Dynarr_add (dst, c | 0x80);
4906 ch = 0;
4907 }
4908 else
4909 ch = c;
4910 }
4911 }
4912
4913 if (!ch)
4914 flags &= CODING_STATE_ISO2022_LOCK;
4915 }
4916
4917 label_continue_loop:;
4918 }
4919
4920 if (flags & CODING_STATE_END)
4921 DECODE_OUTPUT_PARTIAL_CHAR (ch);
4922
4923 str->flags = flags;
4924 str->ch = ch;
4925 }
4926
4927
4928 /***** ISO2022 encoder *****/
4929
4930 /* Designate CHARSET into register REG. */
4931
4932 static void
4933 iso2022_designate (Lisp_Object charset, unsigned char reg,
4934 struct encoding_stream *str, unsigned_char_dynarr *dst)
4935 {
4936 static CONST char inter94[] = "()*+";
4937 static CONST char inter96[] = ",-./";
4938 unsigned int type;
4939 unsigned char final;
4940 Lisp_Object old_charset = str->iso2022.charset[reg];
4941
4942 str->iso2022.charset[reg] = charset;
4943 if (!CHARSETP (charset))
4944 /* charset might be an initial nil or t. */
4945 return;
4946 type = XCHARSET_TYPE (charset);
4947 final = XCHARSET_FINAL (charset);
4948 if (!str->iso2022.force_charset_on_output[reg] &&
4949 CHARSETP (old_charset) &&
4950 XCHARSET_TYPE (old_charset) == type &&
4951 XCHARSET_FINAL (old_charset) == final)
4952 return;
4953
4954 str->iso2022.force_charset_on_output[reg] = 0;
4955
4956 {
4957 charset_conversion_spec_dynarr *dyn =
4958 str->codesys->iso2022.output_conv;
4959
4960 if (dyn)
4961 {
4962 int i;
4963
4964 for (i = 0; i < Dynarr_length (dyn); i++)
4965 {
4966 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4967 if (EQ (charset, spec->from_charset))
4968 charset = spec->to_charset;
4969 }
4970 }
4971 }
4972
4973 Dynarr_add (dst, ISO_CODE_ESC);
4974 switch (type)
4975 {
4976 case CHARSET_TYPE_94:
4977 Dynarr_add (dst, inter94[reg]);
4978 break;
4979 case CHARSET_TYPE_96:
4980 Dynarr_add (dst, inter96[reg]);
4981 break;
4982 case CHARSET_TYPE_94X94:
4983 Dynarr_add (dst, '$');
4984 if (reg != 0
4985 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4986 || final < '@'
4987 || final > 'B')
4988 Dynarr_add (dst, inter94[reg]);
4989 break;
4990 case CHARSET_TYPE_96X96:
4991 Dynarr_add (dst, '$');
4992 Dynarr_add (dst, inter96[reg]);
4993 break;
4994 }
4995 Dynarr_add (dst, final);
4996 }
4997
4998 static void
4999 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
5000 {
5001 if (str->iso2022.register_left != 0)
5002 {
5003 Dynarr_add (dst, ISO_CODE_SI);
5004 str->iso2022.register_left = 0;
5005 }
5006 }
5007
5008 static void
5009 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
5010 {
5011 if (str->iso2022.register_left != 1)
5012 {
5013 Dynarr_add (dst, ISO_CODE_SO);
5014 str->iso2022.register_left = 1;
5015 }
5016 }
5017
5018 /* Convert internally-formatted data to ISO2022 format. */
5019
5020 static void
5021 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
5022 unsigned_char_dynarr *dst, unsigned int n)
5023 {
5024 unsigned char charmask, c;
5025 unsigned char char_boundary;
5026 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5027 unsigned int flags = str->flags;
5028 unsigned int ch = str->ch;
5029 Lisp_Coding_System *codesys = str->codesys;
5030 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5031 int i;
5032 Lisp_Object charset;
5033 int half;
5034
5035 #ifdef ENABLE_COMPOSITE_CHARS
5036 /* flags for handling composite chars. We do a little switcharoo
5037 on the source while we're outputting the composite char. */
5038 unsigned int saved_n = 0;
5039 CONST unsigned char *saved_src = NULL;
5040 int in_composite = 0;
5041 #endif /* ENABLE_COMPOSITE_CHARS */
5042
5043 char_boundary = str->iso2022.current_char_boundary;
5044 charset = str->iso2022.current_charset;
5045 half = str->iso2022.current_half;
5046
5047 #ifdef ENABLE_COMPOSITE_CHARS
5048 back_to_square_n:
5049 #endif
5050 while (n--)
5051 {
5052 c = *src++;
5053
5054 if (BYTE_ASCII_P (c))
5055 { /* Processing ASCII character */
5056 ch = 0;
5057
5058 restore_left_to_right_direction (codesys, dst, &flags, 0);
5059
5060 /* Make sure G0 contains ASCII */
5061 if ((c > ' ' && c < ISO_CODE_DEL) ||
5062 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
5063 {
5064 ensure_normal_shift (str, dst);
5065 iso2022_designate (Vcharset_ascii, 0, str, dst);
5066 }
5067
5068 /* If necessary, restore everything to the default state
5069 at end-of-line */
5070 if (c == '\n' &&
5071 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
5072 {
5073 restore_left_to_right_direction (codesys, dst, &flags, 0);
5074
5075 ensure_normal_shift (str, dst);
5076
5077 for (i = 0; i < 4; i++)
5078 {
5079 Lisp_Object initial_charset =
5080 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5081 iso2022_designate (initial_charset, i, str, dst);
5082 }
5083 }
5084 if (c == '\n')
5085 {
5086 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5087 Dynarr_add (dst, '\r');
5088 if (eol_type != EOL_CR)
5089 Dynarr_add (dst, c);
5090 }
5091 else
5092 {
5093 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5094 && fit_to_be_escape_quoted (c))
5095 Dynarr_add (dst, ISO_CODE_ESC);
5096 Dynarr_add (dst, c);
5097 }
5098 char_boundary = 1;
5099 }
5100
5101 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
5102 { /* Processing Leading Byte */
5103 ch = 0;
5104 charset = CHARSET_BY_LEADING_BYTE (c);
5105 if (LEADING_BYTE_PREFIX_P(c))
5106 ch = c;
5107 else if (!EQ (charset, Vcharset_control_1)
5108 #ifdef ENABLE_COMPOSITE_CHARS
5109 && !EQ (charset, Vcharset_composite)
5110 #endif
5111 )
5112 {
5113 int reg;
5114
5115 ensure_correct_direction (XCHARSET_DIRECTION (charset),
5116 codesys, dst, &flags, 0);
5117
5118 /* Now determine which register to use. */
5119 reg = -1;
5120 for (i = 0; i < 4; i++)
5121 {
5122 if (EQ (charset, str->iso2022.charset[i]) ||
5123 EQ (charset,
5124 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
5125 {
5126 reg = i;
5127 break;
5128 }
5129 }
5130
5131 if (reg == -1)
5132 {
5133 if (XCHARSET_GRAPHIC (charset) != 0)
5134 {
5135 if (!NILP (str->iso2022.charset[1]) &&
5136 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
5137 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
5138 reg = 1;
5139 else if (!NILP (str->iso2022.charset[2]))
5140 reg = 2;
5141 else if (!NILP (str->iso2022.charset[3]))
5142 reg = 3;
5143 else
5144 reg = 0;
5145 }
5146 else
5147 reg = 0;
5148 }
5149
5150 iso2022_designate (charset, reg, str, dst);
5151
5152 /* Now invoke that register. */
5153 switch (reg)
5154 {
5155 case 0:
5156 ensure_normal_shift (str, dst);
5157 half = 0;
5158 break;
5159
5160 case 1:
5161 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
5162 {
5163 ensure_shift_out (str, dst);
5164 half = 0;
5165 }
5166 else
5167 half = 1;
5168 break;
5169
5170 case 2:
5171 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5172 {
5173 Dynarr_add (dst, ISO_CODE_ESC);
5174 Dynarr_add (dst, 'N');
5175 half = 0;
5176 }
5177 else
5178 {
5179 Dynarr_add (dst, ISO_CODE_SS2);
5180 half = 1;
5181 }
5182 break;
5183
5184 case 3:
5185 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
5186 {
5187 Dynarr_add (dst, ISO_CODE_ESC);
5188 Dynarr_add (dst, 'O');
5189 half = 0;
5190 }
5191 else
5192 {
5193 Dynarr_add (dst, ISO_CODE_SS3);
5194 half = 1;
5195 }
5196 break;
5197
5198 default:
5199 abort ();
5200 }
5201 }
5202 char_boundary = 0;
5203 }
5204 else
5205 { /* Processing Non-ASCII character */
5206 charmask = (half == 0 ? 0x7F : 0xFF);
5207 char_boundary = 1;
5208 if (EQ (charset, Vcharset_control_1))
5209 {
5210 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
5211 && fit_to_be_escape_quoted (c))
5212 Dynarr_add (dst, ISO_CODE_ESC);
5213 /* you asked for it ... */
5214 Dynarr_add (dst, c - 0x20);
5215 }
5216 else
5217 {
5218 switch (XCHARSET_REP_BYTES (charset))
5219 {
5220 case 2:
5221 Dynarr_add (dst, c & charmask);
5222 break;
5223 case 3:
5224 if (XCHARSET_PRIVATE_P (charset))
5225 {
5226 Dynarr_add (dst, c & charmask);
5227 ch = 0;
5228 }
5229 else if (ch)
5230 {
5231 #ifdef ENABLE_COMPOSITE_CHARS
5232 if (EQ (charset, Vcharset_composite))
5233 {
5234 if (in_composite)
5235 {
5236 /* #### Bother! We don't know how to
5237 handle this yet. */
5238 Dynarr_add (dst, '~');
5239 }
5240 else
5241 {
5242 Emchar emch = MAKE_CHAR (Vcharset_composite,
5243 ch & 0x7F, c & 0x7F);
5244 Lisp_Object lstr = composite_char_string (emch);
5245 saved_n = n;
5246 saved_src = src;
5247 in_composite = 1;
5248 src = XSTRING_DATA (lstr);
5249 n = XSTRING_LENGTH (lstr);
5250 Dynarr_add (dst, ISO_CODE_ESC);
5251 Dynarr_add (dst, '0'); /* start composing */
5252 }
5253 }
5254 else
5255 #endif /* ENABLE_COMPOSITE_CHARS */
5256 {
5257 Dynarr_add (dst, ch & charmask);
5258 Dynarr_add (dst, c & charmask);
5259 }
5260 ch = 0;
5261 }
5262 else
5263 {
5264 ch = c;
5265 char_boundary = 0;
5266 }
5267 break;
5268 case 4:
5269 if (ch)
5270 {
5271 Dynarr_add (dst, ch & charmask);
5272 Dynarr_add (dst, c & charmask);
5273 ch = 0;
5274 }
5275 else
5276 {
5277 ch = c;
5278 char_boundary = 0;
5279 }
5280 break;
5281 default:
5282 abort ();
5283 }
5284 }
5285 }
5286 }
5287
5288 #ifdef ENABLE_COMPOSITE_CHARS
5289 if (in_composite)
5290 {
5291 n = saved_n;
5292 src = saved_src;
5293 in_composite = 0;
5294 Dynarr_add (dst, ISO_CODE_ESC);
5295 Dynarr_add (dst, '1'); /* end composing */
5296 goto back_to_square_n; /* Wheeeeeeeee ..... */
5297 }
5298 #endif /* ENABLE_COMPOSITE_CHARS */
5299
5300 if (char_boundary && flags & CODING_STATE_END)
5301 {
5302 restore_left_to_right_direction (codesys, dst, &flags, 0);
5303 ensure_normal_shift (str, dst);
5304 for (i = 0; i < 4; i++)
5305 {
5306 Lisp_Object initial_charset =
5307 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
5308 iso2022_designate (initial_charset, i, str, dst);
5309 }
5310 }
5311
5312 str->flags = flags;
5313 str->ch = ch;
5314 str->iso2022.current_char_boundary = char_boundary;
5315 str->iso2022.current_charset = charset;
5316 str->iso2022.current_half = half;
5317
5318 /* Verbum caro factum est! */
5319 }
5320 #endif /* MULE */
5321
5322 /************************************************************************/
5323 /* No-conversion methods */
5324 /************************************************************************/
5325
5326 /* This is used when reading in "binary" files -- i.e. files that may
5327 contain all 256 possible byte values and that are not to be
5328 interpreted as being in any particular decoding. */
5329 static void
5330 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
5331 unsigned_char_dynarr *dst, unsigned int n)
5332 {
5333 unsigned char c;
5334 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
5335 unsigned int flags = str->flags;
5336 unsigned int ch = str->ch;
5337 eol_type_t eol_type = str->eol_type;
5338
5339 while (n--)
5340 {
5341 c = *src++;
5342
5343 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
5344 DECODE_ADD_BINARY_CHAR (c, dst);
5345 label_continue_loop:;
5346 }
5347
5348 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
5349
5350 str->flags = flags;
5351 str->ch = ch;
5352 }
5353
5354 static void
5355 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
5356 unsigned_char_dynarr *dst, unsigned int n)
5357 {
5358 unsigned char c;
5359 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
5360 unsigned int flags = str->flags;
5361 unsigned int ch = str->ch;
5362 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
5363
5364 while (n--)
5365 {
5366 c = *src++;
5367 if (c == '\n')
5368 {
5369 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
5370 Dynarr_add (dst, '\r');
5371 if (eol_type != EOL_CR)
5372 Dynarr_add (dst, '\n');
5373 ch = 0;
5374 }
5375 else if (BYTE_ASCII_P (c))
5376 {
5377 assert (ch == 0);
5378 Dynarr_add (dst, c);
5379 }
5380 else if (BUFBYTE_LEADING_BYTE_P (c))
5381 {
5382 assert (ch == 0);
5383 if (c == LEADING_BYTE_LATIN_ISO8859_1 ||
5384 c == LEADING_BYTE_CONTROL_1)
5385 ch = c;
5386 else
5387 Dynarr_add (dst, '~'); /* untranslatable character */
5388 }
5389 else
5390 {
5391 if (ch == LEADING_BYTE_LATIN_ISO8859_1)
5392 Dynarr_add (dst, c);
5393 else if (ch == LEADING_BYTE_CONTROL_1)
5394 {
5395 assert (c < 0xC0);
5396 Dynarr_add (dst, c - 0x20);
5397 }
5398 /* else it should be the second or third byte of an
5399 untranslatable character, so ignore it */
5400 ch = 0;
5401 }
5402 }
5403
5404 str->flags = flags;
5405 str->ch = ch;
5406 }
5407
5408
5409 /************************************************************************/
5410 /* Simple internal/external functions */
5411 /************************************************************************/
5412
5413 static Extbyte_dynarr *conversion_out_dynarr;
5414 static Bufbyte_dynarr *conversion_in_dynarr;
5415
5416 /* Determine coding system from coding format */
5417
5418 /* #### not correct for all values of `fmt'! */
5419 static Lisp_Object
5420 external_data_format_to_coding_system (enum external_data_format fmt)
5421 {
5422 switch (fmt)
5423 {
5424 case FORMAT_FILENAME:
5425 case FORMAT_TERMINAL:
5426 if (EQ (Vfile_name_coding_system, Qnil) ||
5427 EQ (Vfile_name_coding_system, Qbinary))
5428 return Qnil;
5429 else
5430 return Fget_coding_system (Vfile_name_coding_system);
5431 #ifdef MULE
5432 case FORMAT_CTEXT:
5433 return Fget_coding_system (Qctext);
5434 #endif
5435 default:
5436 return Qnil;
5437 }
5438 }
5439
5440 Extbyte *
5441 convert_to_external_format (CONST Bufbyte *ptr,
5442 Bytecount len,
5443 Extcount *len_out,
5444 enum external_data_format fmt)
5445 {
5446 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5447
5448 if (!conversion_out_dynarr)
5449 conversion_out_dynarr = Dynarr_new (Extbyte);
5450 else
5451 Dynarr_reset (conversion_out_dynarr);
5452
5453 if (NILP (coding_system))
5454 {
5455 CONST Bufbyte *end = ptr + len;
5456
5457 for (; ptr < end;)
5458 {
5459 Bufbyte c =
5460 (BYTE_ASCII_P (*ptr)) ? *ptr :
5461 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
5462 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
5463 '~';
5464
5465 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
5466 INC_CHARPTR (ptr);
5467 }
5468
5469 #ifdef ERROR_CHECK_BUFPOS
5470 assert (ptr == end);
5471 #endif
5472 }
5473 else
5474 {
5475 Lisp_Object instream, outstream, da_outstream;
5476 Lstream *istr, *ostr;
5477 struct gcpro gcpro1, gcpro2, gcpro3;
5478 char tempbuf[1024]; /* some random amount */
5479
5480 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5481 da_outstream = make_dynarr_output_stream
5482 ((unsigned_char_dynarr *) conversion_out_dynarr);
5483 outstream =
5484 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system);
5485 istr = XLSTREAM (instream);
5486 ostr = XLSTREAM (outstream);
5487 GCPRO3 (instream, outstream, da_outstream);
5488 while (1)
5489 {
5490 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5491 if (!size_in_bytes)
5492 break;
5493 Lstream_write (ostr, tempbuf, size_in_bytes);
5494 }
5495 Lstream_close (istr);
5496 Lstream_close (ostr);
5497 UNGCPRO;
5498 Lstream_delete (istr);
5499 Lstream_delete (ostr);
5500 Lstream_delete (XLSTREAM (da_outstream));
5501 }
5502
5503 *len_out = Dynarr_length (conversion_out_dynarr);
5504 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
5505 return Dynarr_atp (conversion_out_dynarr, 0);
5506 }
5507
5508 Bufbyte *
5509 convert_from_external_format (CONST Extbyte *ptr,
5510 Extcount len,
5511 Bytecount *len_out,
5512 enum external_data_format fmt)
5513 {
5514 Lisp_Object coding_system = external_data_format_to_coding_system (fmt);
5515
5516 if (!conversion_in_dynarr)
5517 conversion_in_dynarr = Dynarr_new (Bufbyte);
5518 else
5519 Dynarr_reset (conversion_in_dynarr);
5520
5521 if (NILP (coding_system))
5522 {
5523 CONST Extbyte *end = ptr + len;
5524 for (; ptr < end; ptr++)
5525 {
5526 Extbyte c = *ptr;
5527 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
5528 }
5529 }
5530 else
5531 {
5532 Lisp_Object instream, outstream, da_outstream;
5533 Lstream *istr, *ostr;
5534 struct gcpro gcpro1, gcpro2, gcpro3;
5535 char tempbuf[1024]; /* some random amount */
5536
5537 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
5538 da_outstream = make_dynarr_output_stream
5539 ((unsigned_char_dynarr *) conversion_in_dynarr);
5540 outstream =
5541 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system);
5542 istr = XLSTREAM (instream);
5543 ostr = XLSTREAM (outstream);
5544 GCPRO3 (instream, outstream, da_outstream);
5545 while (1)
5546 {
5547 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf));
5548 if (!size_in_bytes)
5549 break;
5550 Lstream_write (ostr, tempbuf, size_in_bytes);
5551 }
5552 Lstream_close (istr);
5553 Lstream_close (ostr);
5554 UNGCPRO;
5555 Lstream_delete (istr);
5556 Lstream_delete (ostr);
5557 Lstream_delete (XLSTREAM (da_outstream));
5558 }
5559
5560 *len_out = Dynarr_length (conversion_in_dynarr);
5561 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
5562 return Dynarr_atp (conversion_in_dynarr, 0);
5563 }
5564
5565
5566 /************************************************************************/
5567 /* Initialization */
5568 /************************************************************************/
5569
5570 void
5571 syms_of_file_coding (void)
5572 {
5573 deferror (&Qcoding_system_error, "coding-system-error",
5574 "Coding-system error", Qio_error);
5575
5576 DEFSUBR (Fcoding_system_p);
5577 DEFSUBR (Ffind_coding_system);
5578 DEFSUBR (Fget_coding_system);
5579 DEFSUBR (Fcoding_system_list);
5580 DEFSUBR (Fcoding_system_name);
5581 DEFSUBR (Fmake_coding_system);
5582 DEFSUBR (Fcopy_coding_system);
5583 DEFSUBR (Fdefine_coding_system_alias);
5584 DEFSUBR (Fsubsidiary_coding_system);
5585
5586 DEFSUBR (Fcoding_system_type);
5587 DEFSUBR (Fcoding_system_doc_string);
5588 #ifdef MULE
5589 DEFSUBR (Fcoding_system_charset);
5590 #endif
5591 DEFSUBR (Fcoding_system_property);
5592
5593 DEFSUBR (Fcoding_category_list);
5594 DEFSUBR (Fset_coding_priority_list);
5595 DEFSUBR (Fcoding_priority_list);
5596 DEFSUBR (Fset_coding_category_system);
5597 DEFSUBR (Fcoding_category_system);
5598
5599 DEFSUBR (Fdetect_coding_region);
5600 DEFSUBR (Fdecode_coding_region);
5601 DEFSUBR (Fencode_coding_region);
5602 #ifdef MULE
5603 DEFSUBR (Fdecode_shift_jis_char);
5604 DEFSUBR (Fencode_shift_jis_char);
5605 DEFSUBR (Fdecode_big5_char);
5606 DEFSUBR (Fencode_big5_char);
5607 DEFSUBR (Fset_ucs_char);
5608 DEFSUBR (Fucs_char);
5609 DEFSUBR (Fset_char_ucs);
5610 DEFSUBR (Fchar_ucs);
5611 #endif /* MULE */
5612 defsymbol (&Qcoding_systemp, "coding-system-p");
5613 defsymbol (&Qno_conversion, "no-conversion");
5614 defsymbol (&Qraw_text, "raw-text");
5615 #ifdef MULE
5616 defsymbol (&Qbig5, "big5");
5617 defsymbol (&Qshift_jis, "shift-jis");
5618 defsymbol (&Qucs4, "ucs-4");
5619 defsymbol (&Qutf8, "utf-8");
5620 defsymbol (&Qccl, "ccl");
5621 defsymbol (&Qiso2022, "iso2022");
5622 #endif /* MULE */
5623 defsymbol (&Qmnemonic, "mnemonic");
5624 defsymbol (&Qeol_type, "eol-type");
5625 defsymbol (&Qpost_read_conversion, "post-read-conversion");
5626 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
5627
5628 defsymbol (&Qcr, "cr");
5629 defsymbol (&Qlf, "lf");
5630 defsymbol (&Qcrlf, "crlf");
5631 defsymbol (&Qeol_cr, "eol-cr");
5632 defsymbol (&Qeol_lf, "eol-lf");
5633 defsymbol (&Qeol_crlf, "eol-crlf");
5634 #ifdef MULE
5635 defsymbol (&Qcharset_g0, "charset-g0");
5636 defsymbol (&Qcharset_g1, "charset-g1");
5637 defsymbol (&Qcharset_g2, "charset-g2");
5638 defsymbol (&Qcharset_g3, "charset-g3");
5639 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
5640 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
5641 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
5642 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
5643 defsymbol (&Qno_iso6429, "no-iso6429");
5644 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
5645 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
5646
5647 defsymbol (&Qshort, "short");
5648 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
5649 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
5650 defsymbol (&Qseven, "seven");
5651 defsymbol (&Qlock_shift, "lock-shift");
5652 defsymbol (&Qescape_quoted, "escape-quoted");
5653 #endif /* MULE */
5654 defsymbol (&Qencode, "encode");
5655 defsymbol (&Qdecode, "decode");
5656
5657 #ifdef MULE
5658 defsymbol (&Qctext, "ctext");
5659 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
5660 "shift-jis");
5661 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
5662 "big5");
5663 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5664 "ucs-4");
5665 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5666 "utf-8");
5667 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
5668 "iso-7");
5669 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
5670 "iso-8-designate");
5671 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
5672 "iso-8-1");
5673 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
5674 "iso-8-2");
5675 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
5676 "iso-lock-shift");
5677 #endif /* MULE */
5678 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
5679 "no-conversion");
5680 }
5681
5682 void
5683 lstream_type_create_file_coding (void)
5684 {
5685 LSTREAM_HAS_METHOD (decoding, reader);
5686 LSTREAM_HAS_METHOD (decoding, writer);
5687 LSTREAM_HAS_METHOD (decoding, rewinder);
5688 LSTREAM_HAS_METHOD (decoding, seekable_p);
5689 LSTREAM_HAS_METHOD (decoding, flusher);
5690 LSTREAM_HAS_METHOD (decoding, closer);
5691 LSTREAM_HAS_METHOD (decoding, marker);
5692
5693 LSTREAM_HAS_METHOD (encoding, reader);
5694 LSTREAM_HAS_METHOD (encoding, writer);
5695 LSTREAM_HAS_METHOD (encoding, rewinder);
5696 LSTREAM_HAS_METHOD (encoding, seekable_p);
5697 LSTREAM_HAS_METHOD (encoding, flusher);
5698 LSTREAM_HAS_METHOD (encoding, closer);
5699 LSTREAM_HAS_METHOD (encoding, marker);
5700 }
5701
5702 void
5703 vars_of_file_coding (void)
5704 {
5705 int i;
5706
5707 fcd = xnew (struct file_coding_dump);
5708 dumpstruct (&fcd, &fcd_description);
5709
5710 /* Initialize to something reasonable ... */
5711 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
5712 {
5713 fcd->coding_category_system[i] = Qnil;
5714 fcd->coding_category_by_priority[i] = i;
5715 }
5716
5717 Fprovide (intern ("file-coding"));
5718
5719 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
5720 Coding system used for TTY keyboard input.
5721 Not used under a windowing system.
5722 */ );
5723 Vkeyboard_coding_system = Qnil;
5724
5725 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
5726 Coding system used for TTY display output.
5727 Not used under a windowing system.
5728 */ );
5729 Vterminal_coding_system = Qnil;
5730
5731 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /*
5732 Overriding coding system used when writing a file or process.
5733 You should *bind* this, not set it. If this is non-nil, it specifies
5734 the coding system that will be used when a file or process is read
5735 in, and overrides `buffer-file-coding-system-for-read',
5736 `insert-file-contents-pre-hook', etc. Use those variables instead of
5737 this one for permanent changes to the environment.
5738 */ );
5739 Vcoding_system_for_read = Qnil;
5740
5741 DEFVAR_LISP ("coding-system-for-write",
5742 &Vcoding_system_for_write /*
5743 Overriding coding system used when writing a file or process.
5744 You should *bind* this, not set it. If this is non-nil, it specifies
5745 the coding system that will be used when a file or process is wrote
5746 in, and overrides `buffer-file-coding-system',
5747 `write-region-pre-hook', etc. Use those variables instead of this one
5748 for permanent changes to the environment.
5749 */ );
5750 Vcoding_system_for_write = Qnil;
5751
5752 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /*
5753 Coding system used to convert pathnames when accessing files.
5754 */ );
5755 Vfile_name_coding_system = Qnil;
5756
5757 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /*
5758 Non-nil means the buffer contents are regarded as multi-byte form
5759 of characters, not a binary code. This affects the display, file I/O,
5760 and behaviors of various editing commands.
5761
5762 Setting this to nil does not do anything.
5763 */ );
5764 enable_multibyte_characters = 1;
5765 }
5766
5767 void
5768 complex_vars_of_file_coding (void)
5769 {
5770 staticpro (&Vcoding_system_hash_table);
5771 Vcoding_system_hash_table =
5772 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
5773
5774 the_codesys_prop_dynarr = Dynarr_new (codesys_prop);
5775 dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description);
5776
5777 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
5778 { \
5779 struct codesys_prop csp; \
5780 csp.sym = (Sym); \
5781 csp.prop_type = (Prop_Type); \
5782 Dynarr_add (the_codesys_prop_dynarr, csp); \
5783 } while (0)
5784
5785 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
5786 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
5787 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
5788 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
5789 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
5790 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
5791 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
5792 #ifdef MULE
5793 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
5794 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
5795 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
5796 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
5797 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
5798 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
5799 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
5800 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
5801 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
5802 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
5803 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
5804 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
5805 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
5806 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
5807 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
5808 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
5809 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
5810
5811 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
5812 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
5813 #endif /* MULE */
5814 /* Need to create this here or we're really screwed. */
5815 Fmake_coding_system
5816 (Qraw_text, Qno_conversion,
5817 build_string ("Raw text, which means it converts only line-break-codes."),
5818 list2 (Qmnemonic, build_string ("Raw")));
5819
5820 Fmake_coding_system
5821 (Qbinary, Qno_conversion,
5822 build_string ("Binary, which means it does not convert anything."),
5823 list4 (Qeol_type, Qlf,
5824 Qmnemonic, build_string ("Binary")));
5825
5826 Fdefine_coding_system_alias (Qno_conversion, Qraw_text);
5827
5828 /* Need this for bootstrapping */
5829 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
5830 Fget_coding_system (Qraw_text);
5831
5832 #ifdef MULE
5833 {
5834 unsigned int i;
5835
5836 for (i = 0; i < 65536; i++)
5837 fcd->ucs_to_mule_table[i] = Qnil;
5838 }
5839 staticpro (&mule_to_ucs_table);
5840 mule_to_ucs_table = Fmake_char_table(Qgeneric);
5841 #endif /* MULE */
5842 }