comparison src/mule-coding.c @ 771:943eaba38521

[xemacs-hg @ 2002-03-13 08:51:24 by ben] The big ben-mule-21-5 check-in! Various files were added and deleted. See CHANGES-ben-mule. There are still some test suite failures. No crashes, though. Many of the failures have to do with problems in the test suite itself rather than in the actual code. I'll be addressing these in the next day or so -- none of the test suite failures are at all critical. Meanwhile I'll be trying to address the biggest issues -- i.e. build or run failures, which will almost certainly happen on various platforms. All comments should be sent to ben@xemacs.org -- use a Cc: if necessary when sending to mailing lists. There will be pre- and post- tags, something like pre-ben-mule-21-5-merge-in, and post-ben-mule-21-5-merge-in.
author ben
date Wed, 13 Mar 2002 08:54:06 +0000
parents
children a5954632b187
comparison
equal deleted inserted replaced
770:336a418893b5 771:943eaba38521
1 /* Conversion functions for I18N encodings, but not Unicode (in separate file).
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4 Copyright (C) 2000, 2001, 2002 Ben Wing.
5
6 This file is part of XEmacs.
7
8 XEmacs is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
11 later version.
12
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with XEmacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
22
23 /* Synched up with: Mule 2.3. Not in FSF. */
24
25 /* For previous history, see file-coding.c.
26
27 September 10, 2001: Extracted from file-coding.c by Ben Wing.
28
29 Later in September: Finished abstraction of detection system, rewrote
30 all the detectors to include multiple levels of likelihood.
31 */
32
33 #include <config.h>
34 #include "lisp.h"
35
36 #include "charset.h"
37 #include "mule-ccl.h"
38 #include "file-coding.h"
39
40 Lisp_Object Qshift_jis, Qiso2022, Qbig5, Qccl;
41
42 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
43 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
44 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
45 Lisp_Object Qno_iso6429;
46 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
47 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
48
49 Lisp_Object Qiso_7, Qiso_8_designate, Qiso_8_1, Qiso_8_2, Qiso_lock_shift;
50
51
52 /************************************************************************/
53 /* Shift-JIS methods */
54 /************************************************************************/
55
56 /* Shift-JIS; Hankaku (half-width) KANA is also supported. */
57 DEFINE_CODING_SYSTEM_TYPE (shift_jis);
58
59 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
60 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
61 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
62 encoded by "position-code + 0x80". A character of JISX0208
63 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
64 position-codes are divided and shifted so that it fit in the range
65 below.
66
67 --- CODE RANGE of Shift-JIS ---
68 (character set) (range)
69 ASCII 0x00 .. 0x7F
70 JISX0201-Kana 0xA0 .. 0xDF
71 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
72 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
73 -------------------------------
74
75 */
76
77 /* Is this the first byte of a Shift-JIS two-byte char? */
78
79 #define BYTE_SHIFT_JIS_TWO_BYTE_1_P(c) \
80 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
81
82 /* Is this the second byte of a Shift-JIS two-byte char? */
83
84 #define BYTE_SHIFT_JIS_TWO_BYTE_2_P(c) \
85 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
86
87 #define BYTE_SHIFT_JIS_KATAKANA_P(c) \
88 ((c) >= 0xA1 && (c) <= 0xDF)
89
90 /* Convert Shift-JIS data to internal format. */
91
92 static Bytecount
93 shift_jis_convert (struct coding_stream *str, const UExtbyte *src,
94 unsigned_char_dynarr *dst, Bytecount n)
95 {
96 unsigned int ch = str->ch;
97 Bytecount orign = n;
98
99 if (str->direction == CODING_DECODE)
100 {
101 while (n--)
102 {
103 UExtbyte c = *src++;
104
105 if (ch)
106 {
107 /* Previous character was first byte of Shift-JIS Kanji char. */
108 if (BYTE_SHIFT_JIS_TWO_BYTE_2_P (c))
109 {
110 Intbyte e1, e2;
111
112 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
113 DECODE_SHIFT_JIS (ch, c, e1, e2);
114 Dynarr_add (dst, e1);
115 Dynarr_add (dst, e2);
116 }
117 else
118 {
119 DECODE_ADD_BINARY_CHAR (ch, dst);
120 DECODE_ADD_BINARY_CHAR (c, dst);
121 }
122 ch = 0;
123 }
124 else
125 {
126 if (BYTE_SHIFT_JIS_TWO_BYTE_1_P (c))
127 ch = c;
128 else if (BYTE_SHIFT_JIS_KATAKANA_P (c))
129 {
130 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
131 Dynarr_add (dst, c);
132 }
133 else
134 DECODE_ADD_BINARY_CHAR (c, dst);
135 }
136 }
137
138 if (str->eof)
139 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
140 }
141 else
142 {
143 while (n--)
144 {
145 Intbyte c = *src++;
146 if (BYTE_ASCII_P (c))
147 {
148 Dynarr_add (dst, c);
149 ch = 0;
150 }
151 else if (INTBYTE_LEADING_BYTE_P (c))
152 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
153 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
154 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
155 else if (ch)
156 {
157 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
158 {
159 Dynarr_add (dst, c);
160 ch = 0;
161 }
162 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
163 ch == LEADING_BYTE_JAPANESE_JISX0208)
164 ch = c;
165 else
166 {
167 UExtbyte j1, j2;
168 ENCODE_SHIFT_JIS (ch, c, j1, j2);
169 Dynarr_add (dst, j1);
170 Dynarr_add (dst, j2);
171 ch = 0;
172 }
173 }
174 }
175 }
176
177 str->ch = ch;
178
179 return orign;
180 }
181
182 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
183 Decode a JISX0208 character of Shift-JIS coding-system.
184 CODE is the character code in Shift-JIS as a cons of type bytes.
185 Return the corresponding character.
186 */
187 (code))
188 {
189 int c1, c2, s1, s2;
190
191 CHECK_CONS (code);
192 CHECK_INT (XCAR (code));
193 CHECK_INT (XCDR (code));
194 s1 = XINT (XCAR (code));
195 s2 = XINT (XCDR (code));
196 if (BYTE_SHIFT_JIS_TWO_BYTE_1_P (s1) &&
197 BYTE_SHIFT_JIS_TWO_BYTE_2_P (s2))
198 {
199 DECODE_SHIFT_JIS (s1, s2, c1, c2);
200 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
201 c1 & 0x7F, c2 & 0x7F));
202 }
203 else
204 return Qnil;
205 }
206
207 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
208 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
209 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
210 */
211 (character))
212 {
213 Lisp_Object charset;
214 int c1, c2, s1, s2;
215
216 CHECK_CHAR_COERCE_INT (character);
217 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
218 if (EQ (charset, Vcharset_japanese_jisx0208))
219 {
220 ENCODE_SHIFT_JIS (c1 | 0x80, c2 | 0x80, s1, s2);
221 return Fcons (make_int (s1), make_int (s2));
222 }
223 else
224 return Qnil;
225 }
226
227
228 /************************************************************************/
229 /* Shift-JIS detector */
230 /************************************************************************/
231
232 DEFINE_DETECTOR (shift_jis);
233 DEFINE_DETECTOR_CATEGORY (shift_jis, shift_jis);
234
235 struct shift_jis_detector
236 {
237 int seen_jisx0208_char_in_c1;
238 int seen_jisx0208_char_in_upper;
239 int seen_jisx0201_char;
240 unsigned int seen_iso2022_esc:1;
241 unsigned int seen_bad_first_byte:1;
242 unsigned int seen_bad_second_byte:1;
243 /* temporary */
244 unsigned int in_second_byte:1;
245 unsigned int first_byte_was_c1:1;
246 };
247
248 static void
249 shift_jis_detect (struct detection_state *st, const UExtbyte *src,
250 Bytecount n)
251 {
252 struct shift_jis_detector *data = DETECTION_STATE_DATA (st, shift_jis);
253
254 while (n--)
255 {
256 UExtbyte c = *src++;
257 if (!data->in_second_byte)
258 {
259 if (c >= 0x80 && c <= 0x9F)
260 data->first_byte_was_c1 = 1;
261 if (c >= 0xA0 && c <= 0xDF)
262 data->seen_jisx0201_char++;
263 else if ((c >= 0x80 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
264 data->in_second_byte = 1;
265 else if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
266 data->seen_iso2022_esc = 1;
267 else if (c >= 0x80)
268 data->seen_bad_first_byte = 1;
269 }
270 else
271 {
272 if ((c >= 0x40 && c <= 0x7E) || (c >= 0x80 && c <= 0xFC))
273 {
274 if (data->first_byte_was_c1 || (c >= 0x80 && c <= 0x9F))
275 data->seen_jisx0208_char_in_c1++;
276 else
277 data->seen_jisx0208_char_in_upper++;
278 }
279 else
280 data->seen_bad_second_byte = 1;
281 data->in_second_byte = 0;
282 data->first_byte_was_c1 = 0;
283 }
284 }
285
286 if (data->seen_bad_second_byte)
287 DET_RESULT (st, shift_jis) = DET_NEARLY_IMPOSSIBLE;
288 else if (data->seen_bad_first_byte)
289 DET_RESULT (st, shift_jis) = DET_QUITE_IMPROBABLE;
290 else if (data->seen_iso2022_esc)
291 DET_RESULT (st, shift_jis) = DET_SOMEWHAT_UNLIKELY;
292 else if (data->seen_jisx0208_char_in_c1 >= 20 ||
293 (data->seen_jisx0208_char_in_c1 >= 10 &&
294 data->seen_jisx0208_char_in_upper >= 10))
295 DET_RESULT (st, shift_jis) = DET_QUITE_PROBABLE;
296 else if (data->seen_jisx0208_char_in_c1 > 3 ||
297 data->seen_jisx0208_char_in_upper >= 10 ||
298 /* Since the range is limited compared to what is often seen
299 is typical Latin-X charsets, the fact that we've seen a
300 bunch of them and none that are invalid is reasonably
301 strong statistical evidence of this encoding, or at least
302 not of the common Latin-X ones. */
303 data->seen_jisx0201_char >= 100)
304 DET_RESULT (st, shift_jis) = DET_SOMEWHAT_LIKELY;
305 else if (data->seen_jisx0208_char_in_c1 > 0 ||
306 data->seen_jisx0208_char_in_upper > 0 ||
307 data->seen_jisx0201_char > 0)
308 DET_RESULT (st, shift_jis) = DET_SLIGHTLY_LIKELY;
309 else
310 DET_RESULT (st, shift_jis) = DET_AS_LIKELY_AS_UNLIKELY;
311 }
312
313
314 /************************************************************************/
315 /* Big5 methods */
316 /************************************************************************/
317
318 /* BIG5 (used for Taiwanese). */
319 DEFINE_CODING_SYSTEM_TYPE (big5);
320
321 /* BIG5 is a coding system encoding two character sets: ASCII and
322 Big5. An ASCII character is encoded as is. Big5 is a two-byte
323 character set and is encoded in two-byte.
324
325 --- CODE RANGE of BIG5 ---
326 (character set) (range)
327 ASCII 0x00 .. 0x7F
328 Big5 (1st byte) 0xA1 .. 0xFE
329 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
330 --------------------------
331
332 Since the number of characters in Big5 is larger than maximum
333 characters in Emacs' charset (96x96), it can't be handled as one
334 charset. So, in XEmacs, Big5 is divided into two: `charset-big5-1'
335 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
336 contains frequently used characters and the latter contains less
337 frequently used characters. */
338
339 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
340 ((c) >= 0xA1 && (c) <= 0xFE)
341
342 /* Is this the second byte of a Shift-JIS two-byte char? */
343
344 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
345 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
346
347 /* Number of Big5 characters which have the same code in 1st byte. */
348
349 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
350
351 /* Code conversion macros. These are macros because they are used in
352 inner loops during code conversion.
353
354 Note that temporary variables in macros introduce the classic
355 dynamic-scoping problems with variable names. We use capital-
356 lettered variables in the assumption that XEmacs does not use
357 capital letters in variables except in a very formalized way
358 (e.g. Qstring). */
359
360 /* Convert Big5 code (b1, b2) into its internal string representation
361 (lb, c1, c2). */
362
363 /* There is a much simpler way to split the Big5 charset into two.
364 For the moment I'm going to leave the algorithm as-is because it
365 claims to separate out the most-used characters into a single
366 charset, which perhaps will lead to optimizations in various
367 places.
368
369 The way the algorithm works is something like this:
370
371 Big5 can be viewed as a 94x157 charset, where the row is
372 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
373 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
374 the split between low and high column numbers is apparently
375 meaningless; ascending rows produce less and less frequent chars.
376 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
377 the first charset, and the upper half (0xC9 .. 0xFE) to the
378 second. To do the conversion, we convert the character into
379 a single number where 0 .. 156 is the first row, 157 .. 313
380 is the second, etc. That way, the characters are ordered by
381 decreasing frequency. Then we just chop the space in two
382 and coerce the result into a 94x94 space.
383 */
384
385 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
386 { \
387 int B1 = b1, B2 = b2; \
388 int I \
389 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
390 \
391 if (B1 < 0xC9) \
392 { \
393 lb = LEADING_BYTE_CHINESE_BIG5_1; \
394 } \
395 else \
396 { \
397 lb = LEADING_BYTE_CHINESE_BIG5_2; \
398 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
399 } \
400 c1 = I / (0xFF - 0xA1) + 0xA1; \
401 c2 = I % (0xFF - 0xA1) + 0xA1; \
402 } while (0)
403
404 /* Convert the internal string representation of a Big5 character
405 (lb, c1, c2) into Big5 code (b1, b2). */
406
407 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
408 { \
409 int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
410 \
411 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
412 { \
413 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
414 } \
415 b1 = I / BIG5_SAME_ROW + 0xA1; \
416 b2 = I % BIG5_SAME_ROW; \
417 b2 += b2 < 0x3F ? 0x40 : 0x62; \
418 } while (0)
419
420 /* Convert Big5 data to internal format. */
421
422 static Bytecount
423 big5_convert (struct coding_stream *str, const UExtbyte *src,
424 unsigned_char_dynarr *dst, Bytecount n)
425 {
426 unsigned int ch = str->ch;
427 Bytecount orign = n;
428
429 if (str->direction == CODING_DECODE)
430 {
431 while (n--)
432 {
433 UExtbyte c = *src++;
434 if (ch)
435 {
436 /* Previous character was first byte of Big5 char. */
437 if (BYTE_BIG5_TWO_BYTE_2_P (c))
438 {
439 Intbyte b1, b2, b3;
440 DECODE_BIG5 (ch, c, b1, b2, b3);
441 Dynarr_add (dst, b1);
442 Dynarr_add (dst, b2);
443 Dynarr_add (dst, b3);
444 }
445 else
446 {
447 DECODE_ADD_BINARY_CHAR (ch, dst);
448 DECODE_ADD_BINARY_CHAR (c, dst);
449 }
450 ch = 0;
451 }
452 else
453 {
454 if (BYTE_BIG5_TWO_BYTE_1_P (c))
455 ch = c;
456 else
457 DECODE_ADD_BINARY_CHAR (c, dst);
458 }
459 }
460
461 if (str->eof)
462 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
463 }
464 else
465 {
466 while (n--)
467 {
468 Intbyte c = *src++;
469 if (BYTE_ASCII_P (c))
470 {
471 /* ASCII. */
472 Dynarr_add (dst, c);
473 }
474 else if (INTBYTE_LEADING_BYTE_P (c))
475 {
476 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
477 c == LEADING_BYTE_CHINESE_BIG5_2)
478 {
479 /* A recognized leading byte. */
480 ch = c;
481 continue; /* not done with this character. */
482 }
483 /* otherwise just ignore this character. */
484 }
485 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
486 ch == LEADING_BYTE_CHINESE_BIG5_2)
487 {
488 /* Previous char was a recognized leading byte. */
489 ch = (ch << 8) | c;
490 continue; /* not done with this character. */
491 }
492 else if (ch)
493 {
494 /* Encountering second byte of a Big5 character. */
495 UExtbyte b1, b2;
496
497 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
498 Dynarr_add (dst, b1);
499 Dynarr_add (dst, b2);
500 }
501
502 ch = 0;
503 }
504 }
505
506 str->ch = ch;
507 return orign;
508 }
509
510 Emchar
511 decode_big5_char (int b1, int b2)
512 {
513 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
514 BYTE_BIG5_TWO_BYTE_2_P (b2))
515 {
516 int leading_byte;
517 Lisp_Object charset;
518 int c1, c2;
519
520 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
521 charset = CHARSET_BY_LEADING_BYTE (leading_byte);
522 return MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F);
523 }
524 else
525 return -1;
526 }
527
528 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
529 Convert Big Five character codes in CODE into a character.
530 CODE is a cons of two integers specifying the codepoints in Big Five.
531 Return the corresponding character, or nil if the codepoints are out of range.
532
533 The term `decode' is used because the codepoints can be viewed as the
534 representation of the character in the external Big Five encoding, and thus
535 converting them to a character is analogous to any other operation that
536 decodes an external representation.
537 */
538 (code))
539 {
540 Emchar ch;
541
542 CHECK_CONS (code);
543 CHECK_INT (XCAR (code));
544 CHECK_INT (XCDR (code));
545 ch = decode_big5_char (XINT (XCAR (code)), XINT (XCDR (code)));
546 if (ch == -1)
547 return Qnil;
548 else
549 return make_char (ch);
550 }
551
552 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
553 Convert the specified Big Five character into its codepoints.
554 The codepoints are returned as a cons of two integers, specifying the
555 Big Five codepoints. See `decode-big5-char' for the reason why the
556 term `encode' is used for this operation.
557 */
558 (character))
559 {
560 Lisp_Object charset;
561 int c1, c2, b1, b2;
562
563 CHECK_CHAR_COERCE_INT (character);
564 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
565 if (EQ (charset, Vcharset_chinese_big5_1) ||
566 EQ (charset, Vcharset_chinese_big5_2))
567 {
568 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
569 b1, b2);
570 return Fcons (make_int (b1), make_int (b2));
571 }
572 else
573 return Qnil;
574 }
575
576
577 /************************************************************************/
578 /* Big5 detector */
579 /************************************************************************/
580
581 DEFINE_DETECTOR (big5);
582 DEFINE_DETECTOR_CATEGORY (big5, big5);
583
584 struct big5_detector
585 {
586 int seen_big5_char;
587 unsigned int seen_iso2022_esc:1;
588 unsigned int seen_bad_first_byte:1;
589 unsigned int seen_bad_second_byte:1;
590
591 /* temporary */
592 unsigned int in_second_byte:1;
593 };
594
595 static void
596 big5_detect (struct detection_state *st, const UExtbyte *src,
597 Bytecount n)
598 {
599 struct big5_detector *data = DETECTION_STATE_DATA (st, big5);
600
601 while (n--)
602 {
603 UExtbyte c = *src++;
604 if (!data->in_second_byte)
605 {
606 if (c >= 0xA1 && c <= 0xFE)
607 data->in_second_byte = 1;
608 else if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
609 data->seen_iso2022_esc = 1;
610 else if (c >= 0x80)
611 data->seen_bad_first_byte = 1;
612 }
613 else
614 {
615 data->in_second_byte = 0;
616 if ((c >= 0x40 && c <= 0x7E) || (c >= 0xA1 && c <= 0xFE))
617 data->seen_big5_char++;
618 else
619 data->seen_bad_second_byte = 1;
620 }
621 }
622
623 if (data->seen_bad_second_byte)
624 DET_RESULT (st, big5) = DET_NEARLY_IMPOSSIBLE;
625 else if (data->seen_bad_first_byte)
626 DET_RESULT (st, big5) = DET_QUITE_IMPROBABLE;
627 else if (data->seen_iso2022_esc)
628 DET_RESULT (st, big5) = DET_SOMEWHAT_UNLIKELY;
629 else if (data->seen_big5_char >= 4)
630 DET_RESULT (st, big5) = DET_SOMEWHAT_LIKELY;
631 else
632 DET_RESULT (st, big5) = DET_AS_LIKELY_AS_UNLIKELY;
633 }
634
635
636 /************************************************************************/
637 /* ISO2022 methods */
638 /************************************************************************/
639
640 /* Any ISO-2022-compliant coding system. Includes JIS, EUC, CTEXT
641 (Compound Text, the encoding of selections in X Windows). See below for
642 a complete description of ISO-2022. */
643 DEFINE_CODING_SYSTEM_TYPE (iso2022);
644
645 /* Flags indicating what we've seen so far when parsing an
646 ISO2022 escape sequence. */
647 enum iso_esc_flag
648 {
649 /* Partial sequences */
650 ISO_ESC_NOTHING, /* Nothing has been seen. */
651 ISO_ESC, /* We've seen ESC. */
652 ISO_ESC_2_4, /* We've seen ESC $. This indicates
653 that we're designating a multi-byte, rather
654 than a single-byte, character set. */
655 ISO_ESC_2_8, /* We've seen ESC 0x28, i.e. ESC (.
656 This means designate a 94-character
657 character set into G0. */
658 ISO_ESC_2_9, /* We've seen ESC 0x29 -- designate a
659 94-character character set into G1. */
660 ISO_ESC_2_10, /* We've seen ESC 0x2A. */
661 ISO_ESC_2_11, /* We've seen ESC 0x2B. */
662 ISO_ESC_2_12, /* We've seen ESC 0x2C -- designate a
663 96-character character set into G0.
664 (This is not ISO2022-standard.
665 The following 96-character
666 control sequences are standard,
667 though.) */
668 ISO_ESC_2_13, /* We've seen ESC 0x2D -- designate a
669 96-character character set into G1.
670 */
671 ISO_ESC_2_14, /* We've seen ESC 0x2E. */
672 ISO_ESC_2_15, /* We've seen ESC 0x2F. */
673 ISO_ESC_2_4_8, /* We've seen ESC $ 0x28 -- designate
674 a 94^N character set into G0. */
675 ISO_ESC_2_4_9, /* We've seen ESC $ 0x29. */
676 ISO_ESC_2_4_10, /* We've seen ESC $ 0x2A. */
677 ISO_ESC_2_4_11, /* We've seen ESC $ 0x2B. */
678 ISO_ESC_2_4_12, /* We've seen ESC $ 0x2C. */
679 ISO_ESC_2_4_13, /* We've seen ESC $ 0x2D. */
680 ISO_ESC_2_4_14, /* We've seen ESC $ 0x2E. */
681 ISO_ESC_2_4_15, /* We've seen ESC $ 0x2F. */
682 ISO_ESC_5_11, /* We've seen ESC [ or 0x9B. This
683 starts a directionality-control
684 sequence. The next character
685 must be 0, 1, 2, or ]. */
686 ISO_ESC_5_11_0, /* We've seen 0x9B 0. The next character must be ]. */
687 ISO_ESC_5_11_1, /* We've seen 0x9B 1. The next character must be ]. */
688 ISO_ESC_5_11_2, /* We've seen 0x9B 2. The next character must be ]. */
689
690 /* Full sequences. */
691 ISO_ESC_START_COMPOSITE, /* Private usage for START COMPOSING */
692 ISO_ESC_END_COMPOSITE, /* Private usage for END COMPOSING */
693 ISO_ESC_SINGLE_SHIFT, /* We've seen a complete single-shift sequence. */
694 ISO_ESC_LOCKING_SHIFT,/* We've seen a complete locking-shift sequence. */
695 ISO_ESC_DESIGNATE, /* We've seen a complete designation sequence. */
696 ISO_ESC_DIRECTIONALITY,/* We've seen a complete ISO6429 directionality
697 sequence. */
698 ISO_ESC_LITERAL /* We've seen a literal character ala
699 escape-quoting. */
700 };
701
702 enum iso_error
703 {
704 ISO_ERROR_BAD_FINAL,
705 ISO_ERROR_UNKWOWN_ESC_SEQUENCE,
706 ISO_ERROR_INVALID_CODE_POINT_CHARACTER,
707 };
708
709
710 /* Flags indicating current state while converting code. */
711
712 /************ Used during encoding and decoding: ************/
713 /* If set, the current directionality is right-to-left. Otherwise, it's
714 left-to-right. */
715 #define ISO_STATE_R2L (1 << 0)
716
717 /************ Used during encoding: ************/
718 /* If set, we just saw a CR. */
719 #define ISO_STATE_CR (1 << 1)
720
721 /************ Used during decoding: ************/
722 /* If set, we're currently parsing an escape sequence and the upper 16 bits
723 should be looked at to indicate what partial escape sequence we've seen
724 so far. Otherwise, we're running through actual text. */
725 #define ISO_STATE_ESCAPE (1 << 2)
726 /* If set, G2 is invoked into GL, but only for the next character. */
727 #define ISO_STATE_SS2 (1 << 3)
728 /* If set, G3 is invoked into GL, but only for the next character. If both
729 ISO_STATE_SS2 and ISO_STATE_SS3 are set, ISO_STATE_SS2 overrides; but
730 this probably indicates an error in the text encoding. */
731 #define ISO_STATE_SS3 (1 << 4)
732 /* If set, we're currently processing a composite character (i.e. a
733 character constructed by overstriking two or more characters). */
734 #define ISO_STATE_COMPOSITE (1 << 5)
735
736 /* ISO_STATE_LOCK is the mask of flags that remain on until explicitly
737 turned off when in the ISO2022 encoder/decoder. Other flags are turned
738 off at the end of processing each character or escape sequence. */
739 # define ISO_STATE_LOCK \
740 (ISO_STATE_COMPOSITE | ISO_STATE_R2L)
741
742 typedef struct charset_conversion_spec
743 {
744 Lisp_Object from_charset;
745 Lisp_Object to_charset;
746 } charset_conversion_spec;
747
748 typedef struct
749 {
750 Dynarr_declare (charset_conversion_spec);
751 } charset_conversion_spec_dynarr;
752
753 struct iso2022_coding_system
754 {
755 /* What are the charsets to be initially designated to G0, G1,
756 G2, G3? If t, no charset is initially designated. If nil,
757 no charset is initially designated and no charset is allowed
758 to be designated. */
759 Lisp_Object initial_charset[4];
760
761 /* If true, a designation escape sequence needs to be sent on output
762 for the charset in G[0-3] before that charset is used. */
763 unsigned char force_charset_on_output[4];
764
765 charset_conversion_spec_dynarr *input_conv;
766 charset_conversion_spec_dynarr *output_conv;
767
768 unsigned int shoort :1; /* C makes you speak Dutch */
769 unsigned int no_ascii_eol :1;
770 unsigned int no_ascii_cntl :1;
771 unsigned int seven :1;
772 unsigned int lock_shift :1;
773 unsigned int no_iso6429 :1;
774 unsigned int escape_quoted :1;
775 };
776
777 #define CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, g) \
778 (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->initial_charset[g])
779 #define CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT(codesys, g) \
780 (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->force_charset_on_output[g])
781 #define CODING_SYSTEM_ISO2022_SHORT(codesys) \
782 (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->shoort)
783 #define CODING_SYSTEM_ISO2022_NO_ASCII_EOL(codesys) \
784 (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->no_ascii_eol)
785 #define CODING_SYSTEM_ISO2022_NO_ASCII_CNTL(codesys) \
786 (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->no_ascii_cntl)
787 #define CODING_SYSTEM_ISO2022_SEVEN(codesys) \
788 (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->seven)
789 #define CODING_SYSTEM_ISO2022_LOCK_SHIFT(codesys) \
790 (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->lock_shift)
791 #define CODING_SYSTEM_ISO2022_NO_ISO6429(codesys) \
792 (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->no_iso6429)
793 #define CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys) \
794 (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->escape_quoted)
795 #define CODING_SYSTEM_ISO2022_INPUT_CONV(codesys) \
796 (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->input_conv)
797 #define CODING_SYSTEM_ISO2022_OUTPUT_CONV(codesys) \
798 (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->output_conv)
799
800 #define XCODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, g) \
801 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (XCODING_SYSTEM (codesys), g)
802 #define XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT(codesys, g) \
803 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (XCODING_SYSTEM (codesys), g)
804 #define XCODING_SYSTEM_ISO2022_SHORT(codesys) \
805 CODING_SYSTEM_ISO2022_SHORT (XCODING_SYSTEM (codesys))
806 #define XCODING_SYSTEM_ISO2022_NO_ASCII_EOL(codesys) \
807 CODING_SYSTEM_ISO2022_NO_ASCII_EOL (XCODING_SYSTEM (codesys))
808 #define XCODING_SYSTEM_ISO2022_NO_ASCII_CNTL(codesys) \
809 CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (XCODING_SYSTEM (codesys))
810 #define XCODING_SYSTEM_ISO2022_SEVEN(codesys) \
811 CODING_SYSTEM_ISO2022_SEVEN (XCODING_SYSTEM (codesys))
812 #define XCODING_SYSTEM_ISO2022_LOCK_SHIFT(codesys) \
813 CODING_SYSTEM_ISO2022_LOCK_SHIFT (XCODING_SYSTEM (codesys))
814 #define XCODING_SYSTEM_ISO2022_NO_ISO6429(codesys) \
815 CODING_SYSTEM_ISO2022_NO_ISO6429 (XCODING_SYSTEM (codesys))
816 #define XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys) \
817 CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (XCODING_SYSTEM (codesys))
818 #define XCODING_SYSTEM_ISO2022_INPUT_CONV(codesys) \
819 CODING_SYSTEM_ISO2022_INPUT_CONV (XCODING_SYSTEM (codesys))
820 #define XCODING_SYSTEM_ISO2022_OUTPUT_CONV(codesys) \
821 CODING_SYSTEM_ISO2022_OUTPUT_CONV (XCODING_SYSTEM (codesys))
822
823 /* Additional information used by the ISO2022 decoder and detector. */
824 struct iso2022_coding_stream
825 {
826 /* CHARSET holds the character sets currently assigned to the G0
827 through G3 variables. It is initialized from the array
828 INITIAL_CHARSET in CODESYS. */
829 Lisp_Object charset[4];
830
831 /* Which registers are currently invoked into the left (GL) and
832 right (GR) halves of the 8-bit encoding space? */
833 int register_left, register_right;
834
835 /* FLAGS holds flags indicating the current state of the encoding. Some of
836 these flags are actually part of the state-dependent data and should be
837 moved there. */
838 unsigned int flags;
839
840 /**************** for decoding ****************/
841
842 /* ISO_ESC holds a value indicating part of an escape sequence
843 that has already been seen. */
844 enum iso_esc_flag esc;
845
846 /* This records the bytes we've seen so far in an escape sequence,
847 in case the sequence is invalid (we spit out the bytes unchanged). */
848 unsigned char esc_bytes[8];
849
850 /* Index for next byte to store in ISO escape sequence. */
851 int esc_bytes_index;
852
853 #ifdef ENABLE_COMPOSITE_CHARS
854 /* Stuff seen so far when composing a string. */
855 unsigned_char_dynarr *composite_chars;
856 #endif
857
858 /* If we saw an invalid designation sequence for a particular
859 register, we flag it here and switch to ASCII. The next time we
860 see a valid designation for this register, we turn off the flag
861 and do the designation normally, but pretend the sequence was
862 invalid. The effect of all this is that (most of the time) the
863 escape sequences for both the switch to the unknown charset, and
864 the switch back to the known charset, get inserted literally into
865 the buffer and saved out as such. The hope is that we can
866 preserve the escape sequences so that the resulting written out
867 file makes sense. If we don't do any of this, the designation
868 to the invalid charset will be preserved but that switch back
869 to the known charset will probably get eaten because it was
870 the same charset that was already present in the register. */
871 unsigned char invalid_designated[4];
872
873 /* We try to do similar things as above for direction-switching
874 sequences. If we encountered a direction switch while an
875 invalid designation was present, or an invalid designation
876 just after a direction switch (i.e. no valid designation
877 encountered yet), we insert the direction-switch escape
878 sequence literally into the output stream, and later on
879 insert the corresponding direction-restoring escape sequence
880 literally also. */
881 unsigned int switched_dir_and_no_valid_charset_yet :1;
882 unsigned int invalid_switch_dir :1;
883
884 /* Tells the decoder to output the escape sequence literally
885 even though it was valid. Used in the games we play to
886 avoid lossage when we encounter invalid designations. */
887 unsigned int output_literally :1;
888 /* We encountered a direction switch followed by an invalid
889 designation. We didn't output the direction switch
890 literally because we didn't know about the invalid designation;
891 but we have to do so now. */
892 unsigned int output_direction_sequence :1;
893
894 /**************** for encoding ****************/
895
896 /* Whether we need to explicitly designate the charset in the
897 G? register before using it. It is initialized from the
898 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
899 unsigned char force_charset_on_output[4];
900
901 /* Other state variables that need to be preserved across
902 invocations. */
903 Lisp_Object current_charset;
904 int current_half;
905 int current_char_boundary;
906 };
907
908 static const struct lrecord_description ccs_description_1[] =
909 {
910 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
911 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
912 { XD_END }
913 };
914
915 static const struct struct_description ccs_description =
916 {
917 sizeof (charset_conversion_spec),
918 ccs_description_1
919 };
920
921 static const struct lrecord_description ccsd_description_1[] =
922 {
923 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
924 { XD_END }
925 };
926
927 static const struct struct_description ccsd_description =
928 {
929 sizeof (charset_conversion_spec_dynarr),
930 ccsd_description_1
931 };
932
933 static const struct lrecord_description iso2022_coding_system_description[] = {
934 { XD_LISP_OBJECT_ARRAY,
935 coding_system_data_offset + offsetof (struct iso2022_coding_system,
936 initial_charset), 4 },
937 { XD_STRUCT_PTR,
938 coding_system_data_offset + offsetof (struct iso2022_coding_system,
939 input_conv),
940 1, &ccsd_description },
941 { XD_STRUCT_PTR,
942 coding_system_data_offset + offsetof (struct iso2022_coding_system,
943 output_conv),
944 1, &ccsd_description },
945 { XD_END }
946 };
947
948 /* The following note taken directly from FSF 21.0.103. */
949
950 /* The following note describes the coding system ISO2022 briefly.
951 Since the intention of this note is to help understand the
952 functions in this file, some parts are NOT ACCURATE or are OVERLY
953 SIMPLIFIED. For thorough understanding, please refer to the
954 original document of ISO2022. This is equivalent to the standard
955 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
956
957 ISO2022 provides many mechanisms to encode several character sets
958 in 7-bit and 8-bit environments. For 7-bit environments, all text
959 is encoded using bytes less than 128. This may make the encoded
960 text a little bit longer, but the text passes more easily through
961 several types of gateway, some of which strip off the MSB (Most
962 Significant Bit).
963
964 There are two kinds of character sets: control character sets and
965 graphic character sets. The former contain control characters such
966 as `newline' and `escape' to provide control functions (control
967 functions are also provided by escape sequences). The latter
968 contain graphic characters such as 'A' and '-'. Emacs recognizes
969 two control character sets and many graphic character sets.
970
971 Graphic character sets are classified into one of the following
972 four classes, according to the number of bytes (DIMENSION) and
973 number of characters in one dimension (CHARS) of the set:
974 - DIMENSION1_CHARS94
975 - DIMENSION1_CHARS96
976 - DIMENSION2_CHARS94
977 - DIMENSION2_CHARS96
978
979 In addition, each character set is assigned an identification tag,
980 unique for each set, called the "final character" (denoted as <F>
981 hereafter). The <F> of each character set is decided by ECMA(*)
982 when it is registered in ISO. The code range of <F> is 0x30..0x7F
983 (0x30..0x3F are for private use only).
984
985 Note (*): ECMA = European Computer Manufacturers Association
986
987 Here are examples of graphic character sets [NAME(<F>)]:
988 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
989 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
990 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
991 o DIMENSION2_CHARS96 -- none for the moment
992
993 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
994 C0 [0x00..0x1F] -- control character plane 0
995 GL [0x20..0x7F] -- graphic character plane 0
996 C1 [0x80..0x9F] -- control character plane 1
997 GR [0xA0..0xFF] -- graphic character plane 1
998
999 A control character set is directly designated and invoked to C0 or
1000 C1 by an escape sequence. The most common case is that:
1001 - ISO646's control character set is designated/invoked to C0, and
1002 - ISO6429's control character set is designated/invoked to C1,
1003 and usually these designations/invocations are omitted in encoded
1004 text. In a 7-bit environment, only C0 can be used, and a control
1005 character for C1 is encoded by an appropriate escape sequence to
1006 fit into the environment. All control characters for C1 are
1007 defined to have corresponding escape sequences.
1008
1009 A graphic character set is at first designated to one of four
1010 graphic registers (G0 through G3), then these graphic registers are
1011 invoked to GL or GR. These designations and invocations can be
1012 done independently. The most common case is that G0 is invoked to
1013 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
1014 these invocations and designations are omitted in encoded text.
1015 In a 7-bit environment, only GL can be used.
1016
1017 When a graphic character set of CHARS94 is invoked to GL, codes
1018 0x20 and 0x7F of the GL area work as control characters SPACE and
1019 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1020 be used.
1021
1022 There are two ways of invocation: locking-shift and single-shift.
1023 With locking-shift, the invocation lasts until the next different
1024 invocation, whereas with single-shift, the invocation affects the
1025 following character only and doesn't affect the locking-shift
1026 state. Invocations are done by the following control characters or
1027 escape sequences:
1028
1029 ----------------------------------------------------------------------
1030 abbrev function cntrl escape seq description
1031 ----------------------------------------------------------------------
1032 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
1033 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
1034 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
1035 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
1036 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
1037 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
1038 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
1039 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
1040 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
1041 ----------------------------------------------------------------------
1042 (*) These are not used by any known coding system.
1043
1044 Control characters for these functions are defined by macros
1045 ISO_CODE_XXX in `coding.h'.
1046
1047 Designations are done by the following escape sequences:
1048 ----------------------------------------------------------------------
1049 escape sequence description
1050 ----------------------------------------------------------------------
1051 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
1052 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
1053 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
1054 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
1055 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
1056 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
1057 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
1058 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
1059 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
1060 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
1061 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
1062 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
1063 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
1064 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
1065 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
1066 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
1067 ----------------------------------------------------------------------
1068
1069 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1070 of dimension 1, chars 94, and final character <F>, etc...
1071
1072 Note (*): Although these designations are not allowed in ISO2022,
1073 Emacs accepts them on decoding, and produces them on encoding
1074 CHARS96 character sets in a coding system which is characterized as
1075 7-bit environment, non-locking-shift, and non-single-shift.
1076
1077 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1078 '(' can be omitted. We refer to this as "short-form" hereafter.
1079
1080 Now you may notice that there are a lot of ways of encoding the
1081 same multilingual text in ISO2022. Actually, there exist many
1082 coding systems such as Compound Text (used in X11's inter client
1083 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1084 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1085 localized platforms), and all of these are variants of ISO2022.
1086
1087 In addition to the above, Emacs handles two more kinds of escape
1088 sequences: ISO6429's direction specification and Emacs' private
1089 sequence for specifying character composition.
1090
1091 ISO6429's direction specification takes the following form:
1092 o CSI ']' -- end of the current direction
1093 o CSI '0' ']' -- end of the current direction
1094 o CSI '1' ']' -- start of left-to-right text
1095 o CSI '2' ']' -- start of right-to-left text
1096 The control character CSI (0x9B: control sequence introducer) is
1097 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1098
1099 Character composition specification takes the following form:
1100 o ESC '0' -- start relative composition
1101 o ESC '1' -- end composition
1102 o ESC '2' -- start rule-base composition (*)
1103 o ESC '3' -- start relative composition with alternate chars (**)
1104 o ESC '4' -- start rule-base composition with alternate chars (**)
1105 Since these are not standard escape sequences of any ISO standard,
1106 the use of them with these meanings is restricted to Emacs only.
1107
1108 (*) This form is used only in Emacs 20.5 and older versions,
1109 but the newer versions can safely decode it.
1110 (**) This form is used only in Emacs 21.1 and newer versions,
1111 and the older versions can't decode it.
1112
1113 Here's a list of example usages of these composition escape
1114 sequences (categorized by `enum composition_method').
1115
1116 COMPOSITION_RELATIVE:
1117 ESC 0 CHAR [ CHAR ] ESC 1
1118 COMPOSITION_WITH_RULE:
1119 ESC 2 CHAR [ RULE CHAR ] ESC 1
1120 COMPOSITION_WITH_ALTCHARS:
1121 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1122 COMPOSITION_WITH_RULE_ALTCHARS:
1123 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1124
1125 static void
1126 reset_iso2022_decode (Lisp_Object coding_system,
1127 struct iso2022_coding_stream *data)
1128 {
1129 int i;
1130 #ifdef ENABLE_COMPOSITE_CHARS
1131 unsigned_char_dynarr *old_composite_chars = data->composite_chars;
1132 #endif
1133
1134 xzero (*data);
1135
1136 for (i = 0; i < 4; i++)
1137 {
1138 if (!NILP (coding_system))
1139 data->charset[i] =
1140 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
1141 else
1142 data->charset[i] = Qt;
1143 }
1144 data->esc = ISO_ESC_NOTHING;
1145 data->register_right = 1;
1146 #ifdef ENABLE_COMPOSITE_CHARS
1147 if (old_composite_chars)
1148 {
1149 data->composite_chars = old_composite_chars;
1150 Dynarr_reset (data->composite_chars);
1151 }
1152 #endif
1153 }
1154
1155 static void
1156 reset_iso2022_encode (Lisp_Object coding_system,
1157 struct iso2022_coding_stream *data)
1158 {
1159 int i;
1160
1161 xzero (*data);
1162
1163 for (i = 0; i < 4; i++)
1164 {
1165 data->charset[i] =
1166 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
1167 data->force_charset_on_output[i] =
1168 XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (coding_system, i);
1169 }
1170 data->register_right = 1;
1171 data->current_charset = Qnil;
1172 data->current_char_boundary = 1;
1173 }
1174
1175 static void
1176 iso2022_init_coding_stream (struct coding_stream *str)
1177 {
1178 if (str->direction == CODING_DECODE)
1179 reset_iso2022_decode (str->codesys,
1180 CODING_STREAM_TYPE_DATA (str, iso2022));
1181 else
1182 reset_iso2022_encode (str->codesys,
1183 CODING_STREAM_TYPE_DATA (str, iso2022));
1184 }
1185
1186 static void
1187 iso2022_rewind_coding_stream (struct coding_stream *str)
1188 {
1189 iso2022_init_coding_stream (str);
1190 }
1191
1192 static int
1193 fit_to_be_escape_quoted (unsigned char c)
1194 {
1195 switch (c)
1196 {
1197 case ISO_CODE_ESC:
1198 case ISO_CODE_CSI:
1199 case ISO_CODE_SS2:
1200 case ISO_CODE_SS3:
1201 case ISO_CODE_SO:
1202 case ISO_CODE_SI:
1203 return 1;
1204
1205 default:
1206 return 0;
1207 }
1208 }
1209
1210 static Lisp_Object
1211 charset_by_attributes_or_create_one (int type, Intbyte final, int dir)
1212 {
1213 Lisp_Object charset = CHARSET_BY_ATTRIBUTES (type, final, dir);
1214
1215 if (NILP (charset))
1216 {
1217 int chars, dim;
1218
1219 switch (type)
1220 {
1221 case CHARSET_TYPE_94:
1222 chars = 94; dim = 1;
1223 break;
1224 case CHARSET_TYPE_96:
1225 chars = 96; dim = 1;
1226 break;
1227 case CHARSET_TYPE_94X94:
1228 chars = 94; dim = 2;
1229 break;
1230 case CHARSET_TYPE_96X96:
1231 chars = 96; dim = 2;
1232 break;
1233 default:
1234 abort (); chars = 0; dim = 0;
1235 }
1236
1237 charset = Fmake_charset (Qunbound, Qnil,
1238 nconc2 (list6 (Qfinal, make_char (final),
1239 Qchars, make_int (chars),
1240 Qdimension, make_int (dim)),
1241 list2 (Qdirection,
1242 dir == CHARSET_LEFT_TO_RIGHT ?
1243 Ql2r : Qr2l)));
1244 }
1245
1246 return charset;
1247 }
1248
1249 /* Parse one byte of an ISO2022 escape sequence.
1250 If the result is an invalid escape sequence, return 0 and
1251 do not change anything in STR. Otherwise, if the result is
1252 an incomplete escape sequence, update ISO2022.ESC and
1253 ISO2022.ESC_BYTES and return -1. Otherwise, update
1254 all the state variables (but not ISO2022.ESC_BYTES) and
1255 return 1.
1256
1257 If CHECK_INVALID_CHARSETS is non-zero, check for designation
1258 or invocation of an invalid character set and treat that as
1259 an unrecognized escape sequence.
1260
1261 ********************************************************************
1262
1263 #### Strategies for error annotation and coding orthogonalization
1264
1265 We really want to separate out a number of things. Conceptually,
1266 there is a nested syntax.
1267
1268 At the top level is the ISO 2022 extension syntax, including charset
1269 designation and invocation, and certain auxiliary controls such as the
1270 ISO 6429 direction specification. These are octet-oriented, with the
1271 single exception (AFAIK) of the "exit Unicode" sequence which uses the
1272 UTF's natural width (1 byte for UTF-7 and UTF-8, 2 bytes for UCS-2 and
1273 UTF-16, and 4 bytes for UCS-4 and UTF-32). This will be treated as a
1274 (deprecated) special case in Unicode processing.
1275
1276 The middle layer is ISO 2022 character interpretation. This will depend
1277 on the current state of the ISO 2022 registers, and assembles octets
1278 into the character's internal representation.
1279
1280 The lowest level is translating system control conventions. At present
1281 this is restricted to newline translation, but one could imagine doing
1282 tab conversion or line wrapping here. "Escape from Unicode" processing
1283 would be done at this level.
1284
1285 At each level the parser will verify the syntax. In the case of a
1286 syntax error or warning (such as a redundant escape sequence that affects
1287 no characters), the parser will take some action, typically inserting the
1288 erroneous octets directly into the output and creating an annotation
1289 which can be used by higher level I/O to mark the affected region.
1290
1291 This should make it possible to do something sensible about separating
1292 newline convention processing from character construction, and about
1293 preventing ISO 2022 escape sequences from being recognized
1294 inappropriately.
1295
1296 The basic strategy will be to have octet classification tables, and
1297 switch processing according to the table entry.
1298
1299 It's possible that, by doing the processing with tables of functions or
1300 the like, the parser can be used for both detection and translation. */
1301
1302 static int
1303 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_coding_stream *iso,
1304 unsigned char c, unsigned int *flags,
1305 int check_invalid_charsets)
1306 {
1307 /* (1) If we're at the end of a designation sequence, CS is the
1308 charset being designated and REG is the register to designate
1309 it to.
1310
1311 (2) If we're at the end of a locking-shift sequence, REG is
1312 the register to invoke and HALF (0 == left, 1 == right) is
1313 the half to invoke it into.
1314
1315 (3) If we're at the end of a single-shift sequence, REG is
1316 the register to invoke. */
1317 Lisp_Object cs = Qnil;
1318 int reg, half;
1319
1320 /* NOTE: This code does goto's all over the fucking place.
1321 The reason for this is that we're basically implementing
1322 a state machine here, and hierarchical languages like C
1323 don't really provide a clean way of doing this. */
1324
1325 if (! (*flags & ISO_STATE_ESCAPE))
1326 /* At beginning of escape sequence; we need to reset our
1327 escape-state variables. */
1328 iso->esc = ISO_ESC_NOTHING;
1329
1330 iso->output_literally = 0;
1331 iso->output_direction_sequence = 0;
1332
1333 switch (iso->esc)
1334 {
1335 case ISO_ESC_NOTHING:
1336 iso->esc_bytes_index = 0;
1337 switch (c)
1338 {
1339 case ISO_CODE_ESC: /* Start escape sequence */
1340 *flags |= ISO_STATE_ESCAPE;
1341 iso->esc = ISO_ESC;
1342 goto not_done;
1343
1344 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
1345 *flags |= ISO_STATE_ESCAPE;
1346 iso->esc = ISO_ESC_5_11;
1347 goto not_done;
1348
1349 case ISO_CODE_SO: /* locking shift 1 */
1350 reg = 1; half = 0;
1351 goto locking_shift;
1352 case ISO_CODE_SI: /* locking shift 0 */
1353 reg = 0; half = 0;
1354 goto locking_shift;
1355
1356 case ISO_CODE_SS2: /* single shift */
1357 reg = 2;
1358 goto single_shift;
1359 case ISO_CODE_SS3: /* single shift */
1360 reg = 3;
1361 goto single_shift;
1362
1363 default: /* Other control characters */
1364 error:
1365 *flags &= ISO_STATE_LOCK;
1366 return 0;
1367 }
1368
1369 case ISO_ESC:
1370 switch (c)
1371 {
1372 /**** single shift ****/
1373
1374 case 'N': /* single shift 2 */
1375 reg = 2;
1376 goto single_shift;
1377 case 'O': /* single shift 3 */
1378 reg = 3;
1379 goto single_shift;
1380
1381 /**** locking shift ****/
1382
1383 case '~': /* locking shift 1 right */
1384 reg = 1; half = 1;
1385 goto locking_shift;
1386 case 'n': /* locking shift 2 */
1387 reg = 2; half = 0;
1388 goto locking_shift;
1389 case '}': /* locking shift 2 right */
1390 reg = 2; half = 1;
1391 goto locking_shift;
1392 case 'o': /* locking shift 3 */
1393 reg = 3; half = 0;
1394 goto locking_shift;
1395 case '|': /* locking shift 3 right */
1396 reg = 3; half = 1;
1397 goto locking_shift;
1398
1399 /**** composite ****/
1400
1401 #ifdef ENABLE_COMPOSITE_CHARS
1402 case '0':
1403 iso->esc = ISO_ESC_START_COMPOSITE;
1404 *flags = (*flags & ISO_STATE_LOCK) |
1405 ISO_STATE_COMPOSITE;
1406 return 1;
1407
1408 case '1':
1409 iso->esc = ISO_ESC_END_COMPOSITE;
1410 *flags = (*flags & ISO_STATE_LOCK) &
1411 ~ISO_STATE_COMPOSITE;
1412 return 1;
1413 #else
1414 case '0': case '1': case '2': case '3': case '4':
1415 /* We simply return a flag indicating that some composite
1416 escape was seen. The caller will use the particular
1417 character to encode the appropriate "composite hack"
1418 character out of Vcharset_composite, so that we will
1419 preserve these values on output. */
1420 iso->esc = ISO_ESC_START_COMPOSITE;
1421 *flags &= ISO_STATE_LOCK;
1422 return 1;
1423 #endif /* ENABLE_COMPOSITE_CHARS */
1424
1425 /**** directionality ****/
1426
1427 case '[':
1428 iso->esc = ISO_ESC_5_11;
1429 goto not_done;
1430
1431 /**** designation ****/
1432
1433 case '$': /* multibyte charset prefix */
1434 iso->esc = ISO_ESC_2_4;
1435 goto not_done;
1436
1437 default:
1438 if (0x28 <= c && c <= 0x2F)
1439 {
1440 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
1441 goto not_done;
1442 }
1443
1444 /* This function is called with CODESYS equal to nil when
1445 doing coding-system detection. */
1446 if (!NILP (codesys)
1447 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
1448 && fit_to_be_escape_quoted (c))
1449 {
1450 iso->esc = ISO_ESC_LITERAL;
1451 *flags &= ISO_STATE_LOCK;
1452 return 1;
1453 }
1454
1455 /* bzzzt! */
1456 goto error;
1457 }
1458
1459
1460
1461 /**** directionality ****/
1462
1463 case ISO_ESC_5_11: /* ISO6429 direction control */
1464 if (c == ']')
1465 {
1466 *flags &= (ISO_STATE_LOCK & ~ISO_STATE_R2L);
1467 goto directionality;
1468 }
1469 if (c == '0') iso->esc = ISO_ESC_5_11_0;
1470 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
1471 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
1472 else goto error;
1473 goto not_done;
1474
1475 case ISO_ESC_5_11_0:
1476 if (c == ']')
1477 {
1478 *flags &= (ISO_STATE_LOCK & ~ISO_STATE_R2L);
1479 goto directionality;
1480 }
1481 goto error;
1482
1483 case ISO_ESC_5_11_1:
1484 if (c == ']')
1485 {
1486 *flags = (ISO_STATE_LOCK & ~ISO_STATE_R2L);
1487 goto directionality;
1488 }
1489 goto error;
1490
1491 case ISO_ESC_5_11_2:
1492 if (c == ']')
1493 {
1494 *flags = (*flags & ISO_STATE_LOCK) | ISO_STATE_R2L;
1495 goto directionality;
1496 }
1497 goto error;
1498
1499 directionality:
1500 iso->esc = ISO_ESC_DIRECTIONALITY;
1501 /* Various junk here to attempt to preserve the direction sequences
1502 literally in the text if they would otherwise be swallowed due
1503 to invalid designations that don't show up as actual charset
1504 changes in the text. */
1505 if (iso->invalid_switch_dir)
1506 {
1507 /* We already inserted a direction switch literally into the
1508 text. We assume (#### this may not be right) that the
1509 next direction switch is the one going the other way,
1510 and we need to output that literally as well. */
1511 iso->output_literally = 1;
1512 iso->invalid_switch_dir = 0;
1513 }
1514 else
1515 {
1516 int jj;
1517
1518 /* If we are in the thrall of an invalid designation,
1519 then stick the directionality sequence literally into the
1520 output stream so it ends up in the original text again. */
1521 for (jj = 0; jj < 4; jj++)
1522 if (iso->invalid_designated[jj])
1523 break;
1524 if (jj < 4)
1525 {
1526 iso->output_literally = 1;
1527 iso->invalid_switch_dir = 1;
1528 }
1529 else
1530 /* Indicate that we haven't yet seen a valid designation,
1531 so that if a switch-dir is directly followed by an
1532 invalid designation, both get inserted literally. */
1533 iso->switched_dir_and_no_valid_charset_yet = 1;
1534 }
1535 return 1;
1536
1537
1538 /**** designation ****/
1539
1540 case ISO_ESC_2_4:
1541 if (0x28 <= c && c <= 0x2F)
1542 {
1543 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
1544 goto not_done;
1545 }
1546 if (0x40 <= c && c <= 0x42)
1547 {
1548 cs = charset_by_attributes_or_create_one (CHARSET_TYPE_94X94, c,
1549 *flags & ISO_STATE_R2L ?
1550 CHARSET_RIGHT_TO_LEFT :
1551 CHARSET_LEFT_TO_RIGHT);
1552 reg = 0;
1553 goto designated;
1554 }
1555 goto error;
1556
1557 default:
1558 {
1559 int type = -1;
1560
1561 if (iso->esc >= ISO_ESC_2_8 &&
1562 iso->esc <= ISO_ESC_2_15)
1563 {
1564 type = ((iso->esc >= ISO_ESC_2_12) ?
1565 CHARSET_TYPE_96 : CHARSET_TYPE_94);
1566 reg = (iso->esc - ISO_ESC_2_8) & 3;
1567 }
1568 else if (iso->esc >= ISO_ESC_2_4_8 &&
1569 iso->esc <= ISO_ESC_2_4_15)
1570 {
1571 type = ((iso->esc >= ISO_ESC_2_4_12) ?
1572 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
1573 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
1574 }
1575 else
1576 {
1577 /* Can this ever be reached? -slb */
1578 abort ();
1579 goto error;
1580 }
1581
1582 if (c < '0' || c > '~' ||
1583 (c > 0x5F && (type == CHARSET_TYPE_94X94 ||
1584 type == CHARSET_TYPE_96X96)))
1585 goto error; /* bad final byte */
1586
1587 cs = charset_by_attributes_or_create_one (type, c,
1588 *flags & ISO_STATE_R2L ?
1589 CHARSET_RIGHT_TO_LEFT :
1590 CHARSET_LEFT_TO_RIGHT);
1591 goto designated;
1592 }
1593 }
1594
1595 not_done:
1596 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
1597 return -1;
1598
1599 single_shift:
1600 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
1601 /* can't invoke something that ain't there. */
1602 goto error;
1603 iso->esc = ISO_ESC_SINGLE_SHIFT;
1604 *flags &= ISO_STATE_LOCK;
1605 if (reg == 2)
1606 *flags |= ISO_STATE_SS2;
1607 else
1608 *flags |= ISO_STATE_SS3;
1609 return 1;
1610
1611 locking_shift:
1612 if (check_invalid_charsets &&
1613 !CHARSETP (iso->charset[reg]))
1614 /* can't invoke something that ain't there. */
1615 goto error;
1616 if (half)
1617 iso->register_right = reg;
1618 else
1619 iso->register_left = reg;
1620 *flags &= ISO_STATE_LOCK;
1621 iso->esc = ISO_ESC_LOCKING_SHIFT;
1622 return 1;
1623
1624 designated:
1625 if (NILP (cs) && check_invalid_charsets)
1626 {
1627 abort ();
1628 /* #### This should never happen now that we automatically create
1629 temporary charsets as necessary. We should probably remove
1630 this code. --ben */
1631 iso->invalid_designated[reg] = 1;
1632 iso->charset[reg] = Vcharset_ascii;
1633 iso->esc = ISO_ESC_DESIGNATE;
1634 *flags &= ISO_STATE_LOCK;
1635 iso->output_literally = 1;
1636 if (iso->switched_dir_and_no_valid_charset_yet)
1637 {
1638 /* We encountered a switch-direction followed by an
1639 invalid designation. Ensure that the switch-direction
1640 gets outputted; otherwise it will probably get eaten
1641 when the text is written out again. */
1642 iso->switched_dir_and_no_valid_charset_yet = 0;
1643 iso->output_direction_sequence = 1;
1644 /* And make sure that the switch-dir going the other
1645 way gets outputted, as well. */
1646 iso->invalid_switch_dir = 1;
1647 }
1648 return 1;
1649 }
1650 /* This function is called with CODESYS equal to nil when
1651 doing coding-system detection. */
1652 if (!NILP (codesys))
1653 {
1654 charset_conversion_spec_dynarr *dyn =
1655 XCODING_SYSTEM_ISO2022_INPUT_CONV (codesys);
1656
1657 if (dyn)
1658 {
1659 int i;
1660
1661 for (i = 0; i < Dynarr_length (dyn); i++)
1662 {
1663 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
1664 if (EQ (cs, spec->from_charset))
1665 cs = spec->to_charset;
1666 }
1667 }
1668 }
1669
1670 iso->charset[reg] = cs;
1671 iso->esc = ISO_ESC_DESIGNATE;
1672 *flags &= ISO_STATE_LOCK;
1673 if (iso->invalid_designated[reg])
1674 {
1675 iso->invalid_designated[reg] = 0;
1676 iso->output_literally = 1;
1677 }
1678 if (iso->switched_dir_and_no_valid_charset_yet)
1679 iso->switched_dir_and_no_valid_charset_yet = 0;
1680 return 1;
1681 }
1682
1683 /* If FLAGS is a null pointer or specifies right-to-left motion,
1684 output a switch-dir-to-left-to-right sequence to DST.
1685 Also update FLAGS if it is not a null pointer.
1686 If INTERNAL_P is set, we are outputting in internal format and
1687 need to handle the CSI differently. */
1688
1689 static void
1690 restore_left_to_right_direction (Lisp_Object codesys,
1691 unsigned_char_dynarr *dst,
1692 unsigned int *flags,
1693 int internal_p)
1694 {
1695 if (!flags || (*flags & ISO_STATE_R2L))
1696 {
1697 if (XCODING_SYSTEM_ISO2022_SEVEN (codesys))
1698 {
1699 Dynarr_add (dst, ISO_CODE_ESC);
1700 Dynarr_add (dst, '[');
1701 }
1702 else if (internal_p)
1703 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
1704 else
1705 Dynarr_add (dst, ISO_CODE_CSI);
1706 Dynarr_add (dst, '0');
1707 Dynarr_add (dst, ']');
1708 if (flags)
1709 *flags &= ~ISO_STATE_R2L;
1710 }
1711 }
1712
1713 /* If FLAGS is a null pointer or specifies a direction different from
1714 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
1715 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
1716 sequence to DST. Also update FLAGS if it is not a null pointer.
1717 If INTERNAL_P is set, we are outputting in internal format and
1718 need to handle the CSI differently. */
1719
1720 static void
1721 ensure_correct_direction (int direction, Lisp_Object codesys,
1722 unsigned_char_dynarr *dst, unsigned int *flags,
1723 int internal_p)
1724 {
1725 if ((!flags || (*flags & ISO_STATE_R2L)) &&
1726 direction == CHARSET_LEFT_TO_RIGHT)
1727 restore_left_to_right_direction (codesys, dst, flags, internal_p);
1728 else if (!XCODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
1729 && (!flags || !(*flags & ISO_STATE_R2L)) &&
1730 direction == CHARSET_RIGHT_TO_LEFT)
1731 {
1732 if (XCODING_SYSTEM_ISO2022_SEVEN (codesys))
1733 {
1734 Dynarr_add (dst, ISO_CODE_ESC);
1735 Dynarr_add (dst, '[');
1736 }
1737 else if (internal_p)
1738 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
1739 else
1740 Dynarr_add (dst, ISO_CODE_CSI);
1741 Dynarr_add (dst, '2');
1742 Dynarr_add (dst, ']');
1743 if (flags)
1744 *flags |= ISO_STATE_R2L;
1745 }
1746 }
1747
1748 /* Convert ISO2022-format data to internal format. */
1749
1750 static Bytecount
1751 iso2022_decode (struct coding_stream *str, const UExtbyte *src,
1752 unsigned_char_dynarr *dst, Bytecount n)
1753 {
1754 unsigned int ch = str->ch;
1755 #ifdef ENABLE_COMPOSITE_CHARS
1756 unsigned_char_dynarr *real_dst = dst;
1757 #endif
1758 struct iso2022_coding_stream *data =
1759 CODING_STREAM_TYPE_DATA (str, iso2022);
1760 unsigned int flags = data->flags;
1761 Bytecount orign = n;
1762
1763 #ifdef ENABLE_COMPOSITE_CHARS
1764 if (flags & ISO_STATE_COMPOSITE)
1765 dst = data->composite_chars;
1766 #endif /* ENABLE_COMPOSITE_CHARS */
1767
1768 while (n--)
1769 {
1770 UExtbyte c = *src++;
1771 if (flags & ISO_STATE_ESCAPE)
1772 { /* Within ESC sequence */
1773 int retval = parse_iso2022_esc (str->codesys, data,
1774 c, &flags, 1);
1775
1776 if (retval)
1777 {
1778 switch (data->esc)
1779 {
1780 #ifdef ENABLE_COMPOSITE_CHARS
1781 case ISO_ESC_START_COMPOSITE:
1782 if (data->composite_chars)
1783 Dynarr_reset (data->composite_chars);
1784 else
1785 data->composite_chars = Dynarr_new (unsigned_char);
1786 dst = data->composite_chars;
1787 break;
1788 case ISO_ESC_END_COMPOSITE:
1789 {
1790 Intbyte comstr[MAX_EMCHAR_LEN];
1791 Bytecount len;
1792 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
1793 Dynarr_length (dst));
1794 dst = real_dst;
1795 len = set_charptr_emchar (comstr, emch);
1796 Dynarr_add_many (dst, comstr, len);
1797 break;
1798 }
1799 #else
1800 case ISO_ESC_START_COMPOSITE:
1801 {
1802 Intbyte comstr[MAX_EMCHAR_LEN];
1803 Bytecount len;
1804 Emchar emch = MAKE_CHAR (Vcharset_composite, c - '0' + ' ',
1805 0);
1806 len = set_charptr_emchar (comstr, emch);
1807 Dynarr_add_many (dst, comstr, len);
1808 break;
1809 }
1810 #endif /* ENABLE_COMPOSITE_CHARS */
1811
1812 case ISO_ESC_LITERAL:
1813 DECODE_ADD_BINARY_CHAR (c, dst);
1814 break;
1815
1816 default:
1817 /* Everything else handled already */
1818 break;
1819 }
1820 }
1821
1822 /* Attempted error recovery. */
1823 if (data->output_direction_sequence)
1824 ensure_correct_direction (flags & ISO_STATE_R2L ?
1825 CHARSET_RIGHT_TO_LEFT :
1826 CHARSET_LEFT_TO_RIGHT,
1827 str->codesys, dst, 0, 1);
1828 /* More error recovery. */
1829 if (!retval || data->output_literally)
1830 {
1831 /* Output the (possibly invalid) sequence */
1832 int i;
1833 for (i = 0; i < data->esc_bytes_index; i++)
1834 DECODE_ADD_BINARY_CHAR (data->esc_bytes[i], dst);
1835 flags &= ISO_STATE_LOCK;
1836 if (!retval)
1837 n++, src--;/* Repeat the loop with the same character. */
1838 else
1839 {
1840 /* No sense in reprocessing the final byte of the
1841 escape sequence; it could mess things up anyway.
1842 Just add it now. */
1843 DECODE_ADD_BINARY_CHAR (c, dst);
1844 }
1845 }
1846 ch = 0;
1847 }
1848 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
1849 { /* Control characters */
1850
1851 /***** Error-handling *****/
1852
1853 /* If we were in the middle of a character, dump out the
1854 partial character. */
1855 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
1856
1857 /* If we just saw a single-shift character, dump it out.
1858 This may dump out the wrong sort of single-shift character,
1859 but least it will give an indication that something went
1860 wrong. */
1861 if (flags & ISO_STATE_SS2)
1862 {
1863 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
1864 flags &= ~ISO_STATE_SS2;
1865 }
1866 if (flags & ISO_STATE_SS3)
1867 {
1868 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
1869 flags &= ~ISO_STATE_SS3;
1870 }
1871
1872 /***** Now handle the control characters. *****/
1873
1874 flags &= ISO_STATE_LOCK;
1875
1876 if (!parse_iso2022_esc (str->codesys, data, c, &flags, 1))
1877 DECODE_ADD_BINARY_CHAR (c, dst);
1878 }
1879 else
1880 { /* Graphic characters */
1881 Lisp_Object charset;
1882 int lb;
1883 int reg;
1884
1885 /* Now determine the charset. */
1886 reg = ((flags & ISO_STATE_SS2) ? 2
1887 : (flags & ISO_STATE_SS3) ? 3
1888 : !BYTE_ASCII_P (c) ? data->register_right
1889 : data->register_left);
1890 charset = data->charset[reg];
1891
1892 /* Error checking: */
1893 if (! CHARSETP (charset)
1894 || data->invalid_designated[reg]
1895 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
1896 && XCHARSET_CHARS (charset) == 94))
1897 /* Mrmph. We are trying to invoke a register that has no
1898 or an invalid charset in it, or trying to add a character
1899 outside the range of the charset. Insert that char literally
1900 to preserve it for the output. */
1901 {
1902 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
1903 DECODE_ADD_BINARY_CHAR (c, dst);
1904 }
1905
1906 else
1907 {
1908 /* Things are probably hunky-dorey. */
1909
1910 /* Fetch reverse charset, maybe. */
1911 if (((flags & ISO_STATE_R2L) &&
1912 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
1913 ||
1914 (!(flags & ISO_STATE_R2L) &&
1915 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
1916 {
1917 Lisp_Object new_charset =
1918 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
1919 if (!NILP (new_charset))
1920 charset = new_charset;
1921 }
1922
1923 lb = XCHARSET_LEADING_BYTE (charset);
1924 switch (XCHARSET_REP_BYTES (charset))
1925 {
1926 case 1: /* ASCII */
1927 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
1928 Dynarr_add (dst, c & 0x7F);
1929 break;
1930
1931 case 2: /* one-byte official */
1932 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
1933 Dynarr_add (dst, lb);
1934 Dynarr_add (dst, c | 0x80);
1935 break;
1936
1937 case 3: /* one-byte private or two-byte official */
1938 if (XCHARSET_PRIVATE_P (charset))
1939 {
1940 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
1941 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
1942 Dynarr_add (dst, lb);
1943 Dynarr_add (dst, c | 0x80);
1944 }
1945 else
1946 {
1947 if (ch)
1948 {
1949 Dynarr_add (dst, lb);
1950 Dynarr_add (dst, ch | 0x80);
1951 Dynarr_add (dst, c | 0x80);
1952 ch = 0;
1953 }
1954 else
1955 ch = c;
1956 }
1957 break;
1958
1959 default: /* two-byte private */
1960 if (ch)
1961 {
1962 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
1963 Dynarr_add (dst, lb);
1964 Dynarr_add (dst, ch | 0x80);
1965 Dynarr_add (dst, c | 0x80);
1966 ch = 0;
1967 }
1968 else
1969 ch = c;
1970 }
1971 }
1972
1973 if (!ch)
1974 flags &= ISO_STATE_LOCK;
1975 }
1976
1977 }
1978
1979 if (str->eof)
1980 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
1981
1982 data->flags = flags;
1983 str->ch = ch;
1984 return orign;
1985 }
1986
1987
1988 /***** ISO2022 encoder *****/
1989
1990 /* Designate CHARSET into register REG. */
1991
1992 static void
1993 iso2022_designate (Lisp_Object charset, int reg,
1994 struct coding_stream *str, unsigned_char_dynarr *dst)
1995 {
1996 static const char inter94[] = "()*+";
1997 static const char inter96[] = ",-./";
1998 int type;
1999 unsigned char final;
2000 struct iso2022_coding_stream *data =
2001 CODING_STREAM_TYPE_DATA (str, iso2022);
2002 Lisp_Object old_charset = data->charset[reg];
2003
2004 data->charset[reg] = charset;
2005 if (!CHARSETP (charset))
2006 /* charset might be an initial nil or t. */
2007 return;
2008 type = XCHARSET_TYPE (charset);
2009 final = XCHARSET_FINAL (charset);
2010 if (!data->force_charset_on_output[reg] &&
2011 CHARSETP (old_charset) &&
2012 XCHARSET_TYPE (old_charset) == type &&
2013 XCHARSET_FINAL (old_charset) == final)
2014 return;
2015
2016 data->force_charset_on_output[reg] = 0;
2017
2018 {
2019 charset_conversion_spec_dynarr *dyn =
2020 XCODING_SYSTEM_ISO2022_OUTPUT_CONV (str->codesys);
2021
2022 if (dyn)
2023 {
2024 int i;
2025
2026 for (i = 0; i < Dynarr_length (dyn); i++)
2027 {
2028 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
2029 if (EQ (charset, spec->from_charset))
2030 charset = spec->to_charset;
2031 }
2032 }
2033 }
2034
2035 Dynarr_add (dst, ISO_CODE_ESC);
2036 switch (type)
2037 {
2038 case CHARSET_TYPE_94:
2039 Dynarr_add (dst, inter94[reg]);
2040 break;
2041 case CHARSET_TYPE_96:
2042 Dynarr_add (dst, inter96[reg]);
2043 break;
2044 case CHARSET_TYPE_94X94:
2045 Dynarr_add (dst, '$');
2046 if (reg != 0
2047 || !(XCODING_SYSTEM_ISO2022_SHORT (str->codesys))
2048 || final < '@'
2049 || final > 'B')
2050 Dynarr_add (dst, inter94[reg]);
2051 break;
2052 case CHARSET_TYPE_96X96:
2053 Dynarr_add (dst, '$');
2054 Dynarr_add (dst, inter96[reg]);
2055 break;
2056 }
2057 Dynarr_add (dst, final);
2058 }
2059
2060 static void
2061 ensure_normal_shift (struct coding_stream *str, unsigned_char_dynarr *dst)
2062 {
2063 struct iso2022_coding_stream *data =
2064 CODING_STREAM_TYPE_DATA (str, iso2022);
2065
2066 if (data->register_left != 0)
2067 {
2068 Dynarr_add (dst, ISO_CODE_SI);
2069 data->register_left = 0;
2070 }
2071 }
2072
2073 static void
2074 ensure_shift_out (struct coding_stream *str, unsigned_char_dynarr *dst)
2075 {
2076 struct iso2022_coding_stream *data =
2077 CODING_STREAM_TYPE_DATA (str, iso2022);
2078
2079 if (data->register_left != 1)
2080 {
2081 Dynarr_add (dst, ISO_CODE_SO);
2082 data->register_left = 1;
2083 }
2084 }
2085
2086 /* Convert internally-formatted data to ISO2022 format. */
2087
2088 static Bytecount
2089 iso2022_encode (struct coding_stream *str, const Intbyte *src,
2090 unsigned_char_dynarr *dst, Bytecount n)
2091 {
2092 unsigned char charmask;
2093 Intbyte c;
2094 unsigned char char_boundary;
2095 unsigned int ch = str->ch;
2096 Lisp_Object codesys = str->codesys;
2097 int i;
2098 Lisp_Object charset;
2099 int half;
2100 struct iso2022_coding_stream *data =
2101 CODING_STREAM_TYPE_DATA (str, iso2022);
2102 unsigned int flags = data->flags;
2103 Bytecount orign = n;
2104
2105 #ifdef ENABLE_COMPOSITE_CHARS
2106 /* flags for handling composite chars. We do a little switcheroo
2107 on the source while we're outputting the composite char. */
2108 Bytecount saved_n = 0;
2109 const Intbyte *saved_src = NULL;
2110 int in_composite = 0;
2111 #endif /* ENABLE_COMPOSITE_CHARS */
2112
2113 char_boundary = data->current_char_boundary;
2114 charset = data->current_charset;
2115 half = data->current_half;
2116
2117 #ifdef ENABLE_COMPOSITE_CHARS
2118 back_to_square_n:
2119 #endif
2120 while (n--)
2121 {
2122 c = *src++;
2123
2124 if (BYTE_ASCII_P (c))
2125 { /* Processing ASCII character */
2126 ch = 0;
2127
2128 restore_left_to_right_direction (codesys, dst, &flags, 0);
2129
2130 /* Make sure G0 contains ASCII */
2131 if ((c > ' ' && c < ISO_CODE_DEL) ||
2132 !XCODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
2133 {
2134 ensure_normal_shift (str, dst);
2135 iso2022_designate (Vcharset_ascii, 0, str, dst);
2136 }
2137
2138 /* If necessary, restore everything to the default state
2139 at end-of-line */
2140 if (!(XCODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
2141 {
2142 /* NOTE: CRLF encoding happens *BEFORE* other encoding.
2143 Thus, even though we're working with internal-format
2144 data, there may be CR's or CRLF sequences representing
2145 newlines. */
2146 if (c == '\r' || (c == '\n' && !(flags & ISO_STATE_CR)))
2147 {
2148 restore_left_to_right_direction (codesys, dst, &flags, 0);
2149
2150 ensure_normal_shift (str, dst);
2151
2152 for (i = 0; i < 4; i++)
2153 {
2154 Lisp_Object initial_charset =
2155 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
2156 iso2022_designate (initial_charset, i, str, dst);
2157 }
2158 }
2159 if (c == '\r')
2160 flags |= ISO_STATE_CR;
2161 else
2162 flags &= ~ISO_STATE_CR;
2163 }
2164
2165 if (XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
2166 && fit_to_be_escape_quoted (c))
2167 Dynarr_add (dst, ISO_CODE_ESC);
2168 Dynarr_add (dst, c);
2169 char_boundary = 1;
2170 }
2171
2172 else if (INTBYTE_LEADING_BYTE_P (c) || INTBYTE_LEADING_BYTE_P (ch))
2173 { /* Processing Leading Byte */
2174 ch = 0;
2175 charset = CHARSET_BY_LEADING_BYTE (c);
2176 if (LEADING_BYTE_PREFIX_P (c))
2177 ch = c;
2178 else if (!EQ (charset, Vcharset_control_1)
2179 && !EQ (charset, Vcharset_composite))
2180 {
2181 int reg;
2182
2183 ensure_correct_direction (XCHARSET_DIRECTION (charset),
2184 codesys, dst, &flags, 0);
2185
2186 /* Now determine which register to use. */
2187 reg = -1;
2188 for (i = 0; i < 4; i++)
2189 {
2190 if (EQ (charset, data->charset[i]) ||
2191 EQ (charset,
2192 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
2193 {
2194 reg = i;
2195 break;
2196 }
2197 }
2198
2199 if (reg == -1)
2200 {
2201 if (XCHARSET_GRAPHIC (charset) != 0)
2202 {
2203 if (!NILP (data->charset[1]) &&
2204 (!XCODING_SYSTEM_ISO2022_SEVEN (codesys) ||
2205 XCODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
2206 reg = 1;
2207 else if (!NILP (data->charset[2]))
2208 reg = 2;
2209 else if (!NILP (data->charset[3]))
2210 reg = 3;
2211 else
2212 reg = 0;
2213 }
2214 else
2215 reg = 0;
2216 }
2217
2218 iso2022_designate (charset, reg, str, dst);
2219
2220 /* Now invoke that register. */
2221 switch (reg)
2222 {
2223 case 0:
2224 ensure_normal_shift (str, dst);
2225 half = 0;
2226 break;
2227
2228 case 1:
2229 if (XCODING_SYSTEM_ISO2022_SEVEN (codesys))
2230 {
2231 ensure_shift_out (str, dst);
2232 half = 0;
2233 }
2234 else
2235 half = 1;
2236 break;
2237
2238 case 2:
2239 if (XCODING_SYSTEM_ISO2022_SEVEN (str->codesys))
2240 {
2241 Dynarr_add (dst, ISO_CODE_ESC);
2242 Dynarr_add (dst, 'N');
2243 half = 0;
2244 }
2245 else
2246 {
2247 Dynarr_add (dst, ISO_CODE_SS2);
2248 half = 1;
2249 }
2250 break;
2251
2252 case 3:
2253 if (XCODING_SYSTEM_ISO2022_SEVEN (str->codesys))
2254 {
2255 Dynarr_add (dst, ISO_CODE_ESC);
2256 Dynarr_add (dst, 'O');
2257 half = 0;
2258 }
2259 else
2260 {
2261 Dynarr_add (dst, ISO_CODE_SS3);
2262 half = 1;
2263 }
2264 break;
2265
2266 default:
2267 abort ();
2268 }
2269 }
2270 char_boundary = 0;
2271 }
2272 else
2273 { /* Processing Non-ASCII character */
2274 charmask = (half == 0 ? 0x7F : 0xFF);
2275 char_boundary = 1;
2276 if (EQ (charset, Vcharset_control_1))
2277 {
2278 if (XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
2279 && fit_to_be_escape_quoted (c))
2280 Dynarr_add (dst, ISO_CODE_ESC);
2281 /* you asked for it ... */
2282 Dynarr_add (dst, c - 0x20);
2283 }
2284 #ifndef ENABLE_COMPOSITE_CHARS
2285 else if (EQ (charset, Vcharset_composite))
2286 {
2287 if (c >= 160 || c <= 164) /* Someone might have stuck in
2288 something else */
2289 {
2290 Dynarr_add (dst, ISO_CODE_ESC);
2291 Dynarr_add (dst, c - 160 + '0');
2292 }
2293 }
2294 #endif
2295 else
2296 {
2297 switch (XCHARSET_REP_BYTES (charset))
2298 {
2299 case 2:
2300 Dynarr_add (dst, c & charmask);
2301 break;
2302 case 3:
2303 if (XCHARSET_PRIVATE_P (charset))
2304 {
2305 Dynarr_add (dst, c & charmask);
2306 ch = 0;
2307 }
2308 else if (ch)
2309 {
2310 #ifdef ENABLE_COMPOSITE_CHARS
2311 if (EQ (charset, Vcharset_composite))
2312 {
2313 if (in_composite)
2314 {
2315 /* #### Bother! We don't know how to
2316 handle this yet. */
2317 Dynarr_add (dst, '~');
2318 }
2319 else
2320 {
2321 Emchar emch = MAKE_CHAR (Vcharset_composite,
2322 ch & 0x7F, c & 0x7F);
2323 Lisp_Object lstr = composite_char_string (emch);
2324 saved_n = n;
2325 saved_src = src;
2326 in_composite = 1;
2327 src = XSTRING_DATA (lstr);
2328 n = XSTRING_LENGTH (lstr);
2329 Dynarr_add (dst, ISO_CODE_ESC);
2330 Dynarr_add (dst, '0'); /* start composing */
2331 }
2332 }
2333 else
2334 #endif /* ENABLE_COMPOSITE_CHARS */
2335 {
2336 Dynarr_add (dst, ch & charmask);
2337 Dynarr_add (dst, c & charmask);
2338 }
2339 ch = 0;
2340 }
2341 else
2342 {
2343 ch = c;
2344 char_boundary = 0;
2345 }
2346 break;
2347 case 4:
2348 if (ch)
2349 {
2350 Dynarr_add (dst, ch & charmask);
2351 Dynarr_add (dst, c & charmask);
2352 ch = 0;
2353 }
2354 else
2355 {
2356 ch = c;
2357 char_boundary = 0;
2358 }
2359 break;
2360 default:
2361 abort ();
2362 }
2363 }
2364 }
2365 }
2366
2367 #ifdef ENABLE_COMPOSITE_CHARS
2368 if (in_composite)
2369 {
2370 n = saved_n;
2371 src = saved_src;
2372 in_composite = 0;
2373 Dynarr_add (dst, ISO_CODE_ESC);
2374 Dynarr_add (dst, '1'); /* end composing */
2375 goto back_to_square_n; /* Wheeeeeeeee ..... */
2376 }
2377 #endif /* ENABLE_COMPOSITE_CHARS */
2378
2379 if (char_boundary && str->eof)
2380 {
2381 restore_left_to_right_direction (codesys, dst, &flags, 0);
2382 ensure_normal_shift (str, dst);
2383 for (i = 0; i < 4; i++)
2384 {
2385 Lisp_Object initial_charset =
2386 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
2387 iso2022_designate (initial_charset, i, str, dst);
2388 }
2389 }
2390
2391 data->flags = flags;
2392 str->ch = ch;
2393 data->current_char_boundary = char_boundary;
2394 data->current_charset = charset;
2395 data->current_half = half;
2396
2397 /* Verbum caro factum est! */
2398 return orign;
2399 }
2400
2401 static Bytecount
2402 iso2022_convert (struct coding_stream *str,
2403 const UExtbyte *src,
2404 unsigned_char_dynarr *dst, Bytecount n)
2405 {
2406 if (str->direction == CODING_DECODE)
2407 return iso2022_decode (str, src, dst, n);
2408 else
2409 return iso2022_encode (str, src, dst, n);
2410 }
2411
2412 static void
2413 iso2022_mark (Lisp_Object codesys)
2414 {
2415 int i;
2416
2417 for (i = 0; i < 4; i++)
2418 mark_object (XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
2419 if (XCODING_SYSTEM_ISO2022_INPUT_CONV (codesys))
2420 {
2421 for (i = 0;
2422 i < Dynarr_length (XCODING_SYSTEM_ISO2022_INPUT_CONV (codesys));
2423 i++)
2424 {
2425 struct charset_conversion_spec *ccs =
2426 Dynarr_atp (XCODING_SYSTEM_ISO2022_INPUT_CONV (codesys), i);
2427 mark_object (ccs->from_charset);
2428 mark_object (ccs->to_charset);
2429 }
2430 }
2431 if (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (codesys))
2432 {
2433 for (i = 0;
2434 i < Dynarr_length (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (codesys));
2435 i++)
2436 {
2437 struct charset_conversion_spec *ccs =
2438 Dynarr_atp (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (codesys), i);
2439 mark_object (ccs->from_charset);
2440 mark_object (ccs->to_charset);
2441 }
2442 }
2443 }
2444
2445 static void
2446 iso2022_finalize (Lisp_Object cs)
2447 {
2448 if (XCODING_SYSTEM_ISO2022_INPUT_CONV (cs))
2449 {
2450 Dynarr_free (XCODING_SYSTEM_ISO2022_INPUT_CONV (cs));
2451 XCODING_SYSTEM_ISO2022_INPUT_CONV (cs) = 0;
2452 }
2453 if (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (cs))
2454 {
2455 Dynarr_free (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (cs));
2456 XCODING_SYSTEM_ISO2022_OUTPUT_CONV (cs) = 0;
2457 }
2458 }
2459
2460 /* Given a list of charset conversion specs as specified in a Lisp
2461 program, parse it into STORE_HERE. */
2462
2463 static void
2464 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
2465 Lisp_Object spec_list)
2466 {
2467 Lisp_Object rest;
2468
2469 EXTERNAL_LIST_LOOP (rest, spec_list)
2470 {
2471 Lisp_Object car = XCAR (rest);
2472 Lisp_Object from, to;
2473 struct charset_conversion_spec spec;
2474
2475 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
2476 invalid_argument ("Invalid charset conversion spec", car);
2477 from = Fget_charset (XCAR (car));
2478 to = Fget_charset (XCAR (XCDR (car)));
2479 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
2480 invalid_operation_2
2481 ("Attempted conversion between different charset types",
2482 from, to);
2483 spec.from_charset = from;
2484 spec.to_charset = to;
2485
2486 Dynarr_add (store_here, spec);
2487 }
2488 }
2489
2490 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
2491 specs, return the equivalent as the Lisp programmer would see it.
2492
2493 If LOAD_HERE is 0, return Qnil. */
2494
2495 static Lisp_Object
2496 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here,
2497 int names)
2498 {
2499 int i;
2500 Lisp_Object result;
2501
2502 if (!load_here)
2503 return Qnil;
2504 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
2505 {
2506 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
2507 if (names)
2508 result = Fcons (list2 (XCHARSET_NAME (ccs->from_charset),
2509 XCHARSET_NAME (ccs->to_charset)), result);
2510 else
2511 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
2512 }
2513
2514 return Fnreverse (result);
2515 }
2516
2517 static int
2518 iso2022_putprop (Lisp_Object codesys,
2519 Lisp_Object key,
2520 Lisp_Object value)
2521 {
2522 #define FROB_INITIAL_CHARSET(charset_num) \
2523 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
2524 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
2525
2526 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
2527 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
2528 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
2529 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
2530
2531 #define FROB_FORCE_CHARSET(charset_num) \
2532 XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = \
2533 !NILP (value)
2534
2535 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
2536 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
2537 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
2538 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
2539
2540 #define FROB_BOOLEAN_PROPERTY(prop) \
2541 XCODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
2542
2543 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
2544 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
2545 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
2546 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
2547 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
2548 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
2549 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
2550
2551 else if (EQ (key, Qinput_charset_conversion))
2552 {
2553 XCODING_SYSTEM_ISO2022_INPUT_CONV (codesys) =
2554 Dynarr_new (charset_conversion_spec);
2555 parse_charset_conversion_specs
2556 (XCODING_SYSTEM_ISO2022_INPUT_CONV (codesys), value);
2557 }
2558 else if (EQ (key, Qoutput_charset_conversion))
2559 {
2560 XCODING_SYSTEM_ISO2022_OUTPUT_CONV (codesys) =
2561 Dynarr_new (charset_conversion_spec);
2562 parse_charset_conversion_specs
2563 (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (codesys), value);
2564 }
2565 else
2566 return 0;
2567
2568 return 1;
2569 }
2570
2571 static void
2572 iso2022_finalize_coding_stream (struct coding_stream *str)
2573 {
2574 #ifdef ENABLE_COMPOSITE_CHARS
2575 struct iso2022_coding_stream *data =
2576 CODING_STREAM_TYPE_DATA (str, iso2022);
2577
2578 if (data->composite_chars)
2579 Dynarr_free (data->composite_chars);
2580 #endif
2581 }
2582
2583 static void
2584 iso2022_init (Lisp_Object codesys)
2585 {
2586 int i;
2587 for (i = 0; i < 4; i++)
2588 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
2589 }
2590
2591 static Lisp_Object
2592 coding_system_charset (Lisp_Object coding_system, int gnum)
2593 {
2594 Lisp_Object cs
2595 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
2596
2597 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
2598 }
2599
2600 static Lisp_Object
2601 iso2022_getprop (Lisp_Object coding_system, Lisp_Object prop)
2602 {
2603 if (EQ (prop, Qcharset_g0))
2604 return coding_system_charset (coding_system, 0);
2605 else if (EQ (prop, Qcharset_g1))
2606 return coding_system_charset (coding_system, 1);
2607 else if (EQ (prop, Qcharset_g2))
2608 return coding_system_charset (coding_system, 2);
2609 else if (EQ (prop, Qcharset_g3))
2610 return coding_system_charset (coding_system, 3);
2611
2612 #define FORCE_CHARSET(charset_num) \
2613 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
2614 (coding_system, charset_num) ? Qt : Qnil)
2615
2616 else if (EQ (prop, Qforce_g0_on_output))
2617 return FORCE_CHARSET (0);
2618 else if (EQ (prop, Qforce_g1_on_output))
2619 return FORCE_CHARSET (1);
2620 else if (EQ (prop, Qforce_g2_on_output))
2621 return FORCE_CHARSET (2);
2622 else if (EQ (prop, Qforce_g3_on_output))
2623 return FORCE_CHARSET (3);
2624
2625 #define LISP_BOOLEAN(prop) \
2626 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
2627
2628 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
2629 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
2630 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
2631 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
2632 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
2633 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
2634 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
2635
2636 else if (EQ (prop, Qinput_charset_conversion))
2637 return
2638 unparse_charset_conversion_specs
2639 (XCODING_SYSTEM_ISO2022_INPUT_CONV (coding_system), 0);
2640 else if (EQ (prop, Qoutput_charset_conversion))
2641 return
2642 unparse_charset_conversion_specs
2643 (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (coding_system), 0);
2644 else
2645 return Qunbound;
2646 }
2647
2648 static void
2649 iso2022_print (Lisp_Object cs, Lisp_Object printcharfun, int escapeflag)
2650 {
2651 int i;
2652
2653 write_c_string ("(", printcharfun);
2654 for (i = 0; i < 4; i++)
2655 {
2656 Lisp_Object charset = coding_system_charset (cs, i);
2657 if (i > 0)
2658 write_c_string (", ", printcharfun);
2659 write_fmt_string (printcharfun, "g%d=", i);
2660 print_internal (CHARSETP (charset) ? XCHARSET_NAME (charset) : charset,
2661 printcharfun, 0);
2662 if (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (cs, i))
2663 write_c_string ("(force)", printcharfun);
2664 }
2665
2666 #define FROB(prop) \
2667 if (!NILP (iso2022_getprop (cs, prop))) \
2668 { \
2669 write_c_string (", ", printcharfun); \
2670 print_internal (prop, printcharfun, 0); \
2671 }
2672
2673 FROB (Qshort);
2674 FROB (Qno_ascii_eol);
2675 FROB (Qno_ascii_cntl);
2676 FROB (Qseven);
2677 FROB (Qlock_shift);
2678 FROB (Qno_iso6429);
2679 FROB (Qescape_quoted);
2680
2681 {
2682 Lisp_Object val =
2683 unparse_charset_conversion_specs
2684 (XCODING_SYSTEM_ISO2022_INPUT_CONV (cs), 1);
2685 if (!NILP (val))
2686 {
2687 write_c_string (", input-charset-conversion=", printcharfun);
2688 print_internal (val, printcharfun, 0);
2689 }
2690 val =
2691 unparse_charset_conversion_specs
2692 (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (cs), 1);
2693 if (!NILP (val))
2694 {
2695 write_c_string (", output-charset-conversion=", printcharfun);
2696 print_internal (val, printcharfun, 0);
2697 }
2698 write_c_string (")", printcharfun);
2699 }
2700 }
2701
2702
2703 /************************************************************************/
2704 /* ISO2022 detector */
2705 /************************************************************************/
2706
2707 DEFINE_DETECTOR (iso2022);
2708 /* ISO2022 system using only seven-bit bytes, no locking shift */
2709 DEFINE_DETECTOR_CATEGORY (iso2022, iso_7);
2710 /* ISO2022 system using eight-bit bytes, no locking shift, no single shift,
2711 using designation to switch charsets */
2712 DEFINE_DETECTOR_CATEGORY (iso2022, iso_8_designate);
2713 /* ISO2022 system using eight-bit bytes, no locking shift, no designation
2714 sequences, one-dimension characters in the upper half. */
2715 DEFINE_DETECTOR_CATEGORY (iso2022, iso_8_1);
2716 /* ISO2022 system using eight-bit bytes, no locking shift, no designation
2717 sequences, two-dimension characters in the upper half. */
2718 DEFINE_DETECTOR_CATEGORY (iso2022, iso_8_2);
2719 /* ISO2022 system using locking shift */
2720 DEFINE_DETECTOR_CATEGORY (iso2022, iso_lock_shift);
2721
2722 struct iso2022_detector
2723 {
2724 int initted;
2725 struct iso2022_coding_stream *iso;
2726 unsigned int flags;
2727
2728 /* for keeping temporary track of high-byte groups */
2729 int high_byte_count;
2730 unsigned int saw_single_shift_just_now:1;
2731
2732 /* running state; we set the likelihoods at the end */
2733 unsigned int seen_high_byte:1;
2734 unsigned int seen_single_shift:1;
2735 unsigned int seen_locking_shift:1;
2736 unsigned int seen_designate:1;
2737 unsigned int bad_single_byte_sequences;
2738 unsigned int bad_multibyte_escape_sequences;
2739 unsigned int good_multibyte_escape_sequences;
2740 int even_high_byte_groups;
2741 int odd_high_byte_groups;
2742 };
2743
2744 static void
2745 iso2022_detect (struct detection_state *st, const UExtbyte *src,
2746 Bytecount n)
2747 {
2748 Bytecount orign = n;
2749 struct iso2022_detector *data = DETECTION_STATE_DATA (st, iso2022);
2750
2751 /* #### There are serious deficiencies in the recognition mechanism
2752 here. This needs to be much smarter if it's going to cut it.
2753 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
2754 it should be detected as Latin-1.
2755 All the ISO2022 stuff in this file should be synced up with the
2756 code from FSF Emacs-21.0, in which Mule should be more or less stable.
2757 Perhaps we should wait till R2L works in FSF Emacs? */
2758
2759 /* We keep track of running state on our own, and set the categories at the
2760 end; that way we can reflect the correct state each time we finish, but
2761 not get confused by those results the next time around. */
2762
2763 if (!data->initted)
2764 {
2765 xzero (*data);
2766 data->iso = xnew_and_zero (struct iso2022_coding_stream);
2767 reset_iso2022_decode (Qnil, data->iso);
2768 data->initted = 1;
2769 }
2770
2771 while (n--)
2772 {
2773 UExtbyte c = *src++;
2774 if (c >= 0x80)
2775 data->seen_high_byte = 1;
2776 if (c >= 0xA0)
2777 data->high_byte_count++;
2778 else
2779 {
2780 if (data->high_byte_count &&
2781 !data->saw_single_shift_just_now)
2782 {
2783 if (data->high_byte_count & 1)
2784 data->odd_high_byte_groups++;
2785 else
2786 data->even_high_byte_groups++;
2787 }
2788 data->high_byte_count = 0;
2789 data->saw_single_shift_just_now = 0;
2790 }
2791 if (!(data->flags & ISO_STATE_ESCAPE)
2792 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
2793 { /* control chars */
2794 switch (c)
2795 {
2796 /* Allow and ignore control characters that you might
2797 reasonably see in a text file */
2798 case '\r':
2799 case '\n':
2800 case '\t':
2801 case 7: /* bell */
2802 case 8: /* backspace */
2803 case 11: /* vertical tab */
2804 case 12: /* form feed */
2805 case 26: /* MS-DOS C-z junk */
2806 case 31: /* '^_' -- for info */
2807 goto label_continue_loop;
2808
2809 default:
2810 break;
2811 }
2812 }
2813
2814 if ((data->flags & ISO_STATE_ESCAPE) || BYTE_C0_P (c)
2815 || BYTE_C1_P (c))
2816 {
2817 switch (parse_iso2022_esc (Qnil, data->iso, c,
2818 &data->flags, 0))
2819 {
2820 case 1: /* done */
2821 if (data->iso->esc_bytes_index > 0)
2822 data->good_multibyte_escape_sequences++;
2823 switch (data->iso->esc)
2824 {
2825 case ISO_ESC_DESIGNATE:
2826 data->seen_designate = 1;
2827 break;
2828 case ISO_ESC_LOCKING_SHIFT:
2829 data->seen_locking_shift = 1;
2830 break;
2831 case ISO_ESC_SINGLE_SHIFT:
2832 data->saw_single_shift_just_now = 1;
2833 data->seen_single_shift = 1;
2834 break;
2835 default:
2836 break;
2837 }
2838 break;
2839
2840 case -1: /* not done */
2841 break;
2842
2843 case 0: /* error */
2844 if (data->iso->esc == ISO_ESC_NOTHING)
2845 data->bad_single_byte_sequences++;
2846 else
2847 data->bad_multibyte_escape_sequences++;
2848 }
2849 }
2850 label_continue_loop:;
2851 }
2852
2853 if (data->bad_multibyte_escape_sequences > 2 ||
2854 (data->bad_multibyte_escape_sequences > 0 &&
2855 data->good_multibyte_escape_sequences /
2856 data->bad_multibyte_escape_sequences < 10))
2857 /* Just making it up ... */
2858 SET_DET_RESULTS (st, iso2022, DET_NEARLY_IMPOSSIBLE);
2859 else if (data->bad_single_byte_sequences > 5 ||
2860 (data->bad_single_byte_sequences > 0 &&
2861 (data->good_multibyte_escape_sequences +
2862 data->even_high_byte_groups +
2863 data->odd_high_byte_groups) /
2864 data->bad_single_byte_sequences < 10))
2865 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY);
2866 else if (data->seen_locking_shift)
2867 {
2868 SET_DET_RESULTS (st, iso2022, DET_QUITE_IMPROBABLE);
2869 DET_RESULT (st, iso_lock_shift) = DET_QUITE_PROBABLE;
2870 }
2871 else if (!data->seen_high_byte)
2872 {
2873 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY);
2874 if (data->good_multibyte_escape_sequences)
2875 DET_RESULT (st, iso_7) = DET_QUITE_PROBABLE;
2876 else if (data->seen_single_shift)
2877 DET_RESULT (st, iso_7) = DET_SOMEWHAT_LIKELY;
2878 else
2879 {
2880 /* If we've just seen pure 7-bit data, no escape sequences,
2881 then we can't give much likelihood; but if we've seen enough
2882 of this data, we can assume some unlikelihood of any 8-bit
2883 encoding */
2884 if (orign + st->bytes_seen >= 1000)
2885 DET_RESULT (st, iso_7) = DET_AS_LIKELY_AS_UNLIKELY;
2886 else
2887 SET_DET_RESULTS (st, iso2022, DET_AS_LIKELY_AS_UNLIKELY);
2888 }
2889 }
2890 else if (data->seen_designate)
2891 {
2892 SET_DET_RESULTS (st, iso2022, DET_QUITE_IMPROBABLE);
2893 if (data->seen_single_shift)
2894 /* #### Does this really make sense? */
2895 DET_RESULT (st, iso_8_designate) = DET_SOMEWHAT_UNLIKELY;
2896 else
2897 DET_RESULT (st, iso_8_designate) = DET_QUITE_PROBABLE;
2898 }
2899 else if (data->odd_high_byte_groups > 0 &&
2900 data->even_high_byte_groups == 0)
2901 {
2902 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY);
2903 if (data->seen_single_shift)
2904 DET_RESULT (st, iso_8_1) = DET_QUITE_PROBABLE;
2905 else
2906 DET_RESULT (st, iso_8_1) = DET_SOMEWHAT_LIKELY;
2907 }
2908 else if (data->odd_high_byte_groups == 0 &&
2909 data->even_high_byte_groups > 0)
2910 {
2911 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY);
2912 if (data->even_high_byte_groups > 10)
2913 {
2914 if (data->seen_single_shift)
2915 DET_RESULT (st, iso_8_2) = DET_QUITE_PROBABLE;
2916 else
2917 DET_RESULT (st, iso_8_2) = DET_SOMEWHAT_LIKELY;
2918 if (data->even_high_byte_groups < 50)
2919 DET_RESULT (st, iso_8_1) = DET_SOMEWHAT_UNLIKELY;
2920 /* else it stays at quite improbable */
2921 }
2922 }
2923 else if (data->odd_high_byte_groups > 0 &&
2924 data->even_high_byte_groups > 0)
2925 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY);
2926 else
2927 SET_DET_RESULTS (st, iso2022, DET_AS_LIKELY_AS_UNLIKELY);
2928 }
2929
2930 static void
2931 iso2022_finalize_detection_state (struct detection_state *st)
2932 {
2933 struct iso2022_detector *data = DETECTION_STATE_DATA (st, iso2022);
2934 if (data->iso)
2935 xfree (data->iso);
2936 }
2937
2938
2939 /************************************************************************/
2940 /* CCL methods */
2941 /************************************************************************/
2942
2943 /* Converter written in CCL. */
2944 DEFINE_CODING_SYSTEM_TYPE (ccl);
2945
2946 struct ccl_coding_system
2947 {
2948 /* For a CCL coding system, these specify the CCL programs used for
2949 decoding (input) and encoding (output). */
2950 Lisp_Object decode;
2951 Lisp_Object encode;
2952 };
2953
2954 #define CODING_SYSTEM_CCL_DECODE(codesys) \
2955 (CODING_SYSTEM_TYPE_DATA (codesys, ccl)->decode)
2956 #define CODING_SYSTEM_CCL_ENCODE(codesys) \
2957 (CODING_SYSTEM_TYPE_DATA (codesys, ccl)->encode)
2958 #define XCODING_SYSTEM_CCL_DECODE(codesys) \
2959 CODING_SYSTEM_CCL_DECODE (XCODING_SYSTEM (codesys))
2960 #define XCODING_SYSTEM_CCL_ENCODE(codesys) \
2961 CODING_SYSTEM_CCL_ENCODE (XCODING_SYSTEM (codesys))
2962
2963 struct ccl_coding_stream
2964 {
2965 /* state of the running CCL program */
2966 struct ccl_program ccl;
2967 };
2968
2969 static const struct lrecord_description ccl_coding_system_description[] = {
2970 { XD_LISP_OBJECT,
2971 coding_system_data_offset + offsetof (struct ccl_coding_system,
2972 decode) },
2973 { XD_LISP_OBJECT,
2974 coding_system_data_offset + offsetof (struct ccl_coding_system,
2975 encode) },
2976 { XD_END }
2977 };
2978
2979 static void
2980 ccl_mark (Lisp_Object codesys)
2981 {
2982 mark_object (XCODING_SYSTEM_CCL_DECODE (codesys));
2983 mark_object (XCODING_SYSTEM_CCL_ENCODE (codesys));
2984 }
2985
2986 static Bytecount
2987 ccl_convert (struct coding_stream *str, const UExtbyte *src,
2988 unsigned_char_dynarr *dst, Bytecount n)
2989 {
2990 struct ccl_coding_stream *data =
2991 CODING_STREAM_TYPE_DATA (str, ccl);
2992 Bytecount orign = n;
2993
2994 data->ccl.last_block = str->eof;
2995 /* When applying a CCL program to a stream, SRC must not be NULL -- this
2996 is a special signal to the driver that read and write operations are
2997 not allowed. The code does not actually look at what SRC points to if
2998 N == 0.
2999 */
3000 ccl_driver (&data->ccl, src ? src : (const unsigned char *) "",
3001 dst, n, 0,
3002 str->direction == CODING_DECODE ? CCL_MODE_DECODING :
3003 CCL_MODE_ENCODING);
3004 return orign;
3005 }
3006
3007 static void
3008 ccl_init_coding_stream (struct coding_stream *str)
3009 {
3010 struct ccl_coding_stream *data =
3011 CODING_STREAM_TYPE_DATA (str, ccl);
3012
3013 setup_ccl_program (&data->ccl,
3014 str->direction == CODING_DECODE ?
3015 XCODING_SYSTEM_CCL_DECODE (str->codesys) :
3016 XCODING_SYSTEM_CCL_ENCODE (str->codesys));
3017 }
3018
3019 static void
3020 ccl_rewind_coding_stream (struct coding_stream *str)
3021 {
3022 ccl_init_coding_stream (str);
3023 }
3024
3025 static void
3026 ccl_init (Lisp_Object codesys)
3027 {
3028 XCODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
3029 XCODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
3030 }
3031
3032 static int
3033 ccl_putprop (Lisp_Object codesys, Lisp_Object key, Lisp_Object value)
3034 {
3035 Lisp_Object sym;
3036 struct ccl_program test_ccl;
3037 Char_ASCII *suffix;
3038
3039 /* Check key first. */
3040 if (EQ (key, Qdecode))
3041 suffix = "-ccl-decode";
3042 else if (EQ (key, Qencode))
3043 suffix = "-ccl-encode";
3044 else
3045 return 0;
3046
3047 /* If value is vector, register it as a ccl program
3048 associated with a newly created symbol for
3049 backward compatibility.
3050
3051 #### Bogosity alert! Do we really have to do this crap???? --ben */
3052 if (VECTORP (value))
3053 {
3054 sym = Fintern (concat2 (Fsymbol_name (XCODING_SYSTEM_NAME (codesys)),
3055 build_string (suffix)),
3056 Qnil);
3057 Fregister_ccl_program (sym, value);
3058 }
3059 else
3060 {
3061 CHECK_SYMBOL (value);
3062 sym = value;
3063 }
3064 /* check if the given ccl programs are valid. */
3065 if (setup_ccl_program (&test_ccl, sym) < 0)
3066 invalid_argument ("Invalid CCL program", value);
3067
3068 if (EQ (key, Qdecode))
3069 XCODING_SYSTEM_CCL_DECODE (codesys) = sym;
3070 else if (EQ (key, Qencode))
3071 XCODING_SYSTEM_CCL_ENCODE (codesys) = sym;
3072
3073 return 1;
3074 }
3075
3076 static Lisp_Object
3077 ccl_getprop (Lisp_Object coding_system, Lisp_Object prop)
3078 {
3079 if (EQ (prop, Qdecode))
3080 return XCODING_SYSTEM_CCL_DECODE (coding_system);
3081 else if (EQ (prop, Qencode))
3082 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
3083 else
3084 return Qunbound;
3085 }
3086
3087
3088 /************************************************************************/
3089 /* Initialization */
3090 /************************************************************************/
3091
3092 void
3093 syms_of_mule_coding (void)
3094 {
3095 DEFSUBR (Fdecode_shift_jis_char);
3096 DEFSUBR (Fencode_shift_jis_char);
3097 DEFSUBR (Fdecode_big5_char);
3098 DEFSUBR (Fencode_big5_char);
3099
3100 DEFSYMBOL (Qbig5);
3101 DEFSYMBOL (Qshift_jis);
3102 DEFSYMBOL (Qccl);
3103 DEFSYMBOL (Qiso2022);
3104
3105 DEFSYMBOL (Qcharset_g0);
3106 DEFSYMBOL (Qcharset_g1);
3107 DEFSYMBOL (Qcharset_g2);
3108 DEFSYMBOL (Qcharset_g3);
3109 DEFSYMBOL (Qforce_g0_on_output);
3110 DEFSYMBOL (Qforce_g1_on_output);
3111 DEFSYMBOL (Qforce_g2_on_output);
3112 DEFSYMBOL (Qforce_g3_on_output);
3113 DEFSYMBOL (Qno_iso6429);
3114 DEFSYMBOL (Qinput_charset_conversion);
3115 DEFSYMBOL (Qoutput_charset_conversion);
3116
3117 DEFSYMBOL (Qshort);
3118 DEFSYMBOL (Qno_ascii_eol);
3119 DEFSYMBOL (Qno_ascii_cntl);
3120 DEFSYMBOL (Qseven);
3121 DEFSYMBOL (Qlock_shift);
3122
3123 DEFSYMBOL (Qiso_7);
3124 DEFSYMBOL (Qiso_8_designate);
3125 DEFSYMBOL (Qiso_8_1);
3126 DEFSYMBOL (Qiso_8_2);
3127 DEFSYMBOL (Qiso_lock_shift);
3128 }
3129
3130 void
3131 coding_system_type_create_mule_coding (void)
3132 {
3133 INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA (iso2022, "iso2022-coding-system-p");
3134 CODING_SYSTEM_HAS_METHOD (iso2022, mark);
3135 CODING_SYSTEM_HAS_METHOD (iso2022, convert);
3136 CODING_SYSTEM_HAS_METHOD (iso2022, finalize_coding_stream);
3137 CODING_SYSTEM_HAS_METHOD (iso2022, init_coding_stream);
3138 CODING_SYSTEM_HAS_METHOD (iso2022, rewind_coding_stream);
3139 CODING_SYSTEM_HAS_METHOD (iso2022, init);
3140 CODING_SYSTEM_HAS_METHOD (iso2022, print);
3141 CODING_SYSTEM_HAS_METHOD (iso2022, finalize);
3142 CODING_SYSTEM_HAS_METHOD (iso2022, putprop);
3143 CODING_SYSTEM_HAS_METHOD (iso2022, getprop);
3144
3145 INITIALIZE_DETECTOR (iso2022);
3146 DETECTOR_HAS_METHOD (iso2022, detect);
3147 DETECTOR_HAS_METHOD (iso2022, finalize_detection_state);
3148 INITIALIZE_DETECTOR_CATEGORY (iso2022, iso_7);
3149 INITIALIZE_DETECTOR_CATEGORY (iso2022, iso_8_designate);
3150 INITIALIZE_DETECTOR_CATEGORY (iso2022, iso_8_1);
3151 INITIALIZE_DETECTOR_CATEGORY (iso2022, iso_8_2);
3152 INITIALIZE_DETECTOR_CATEGORY (iso2022, iso_lock_shift);
3153
3154 INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA (ccl, "ccl-coding-system-p");
3155 CODING_SYSTEM_HAS_METHOD (ccl, mark);
3156 CODING_SYSTEM_HAS_METHOD (ccl, convert);
3157 CODING_SYSTEM_HAS_METHOD (ccl, init);
3158 CODING_SYSTEM_HAS_METHOD (ccl, init_coding_stream);
3159 CODING_SYSTEM_HAS_METHOD (ccl, rewind_coding_stream);
3160 CODING_SYSTEM_HAS_METHOD (ccl, putprop);
3161 CODING_SYSTEM_HAS_METHOD (ccl, getprop);
3162
3163 INITIALIZE_CODING_SYSTEM_TYPE (shift_jis, "shift-jis-coding-system-p");
3164 CODING_SYSTEM_HAS_METHOD (shift_jis, convert);
3165
3166 INITIALIZE_DETECTOR (shift_jis);
3167 DETECTOR_HAS_METHOD (shift_jis, detect);
3168 INITIALIZE_DETECTOR_CATEGORY (shift_jis, shift_jis);
3169
3170 INITIALIZE_CODING_SYSTEM_TYPE (big5, "big5-coding-system-p");
3171 CODING_SYSTEM_HAS_METHOD (big5, convert);
3172
3173 INITIALIZE_DETECTOR (big5);
3174 DETECTOR_HAS_METHOD (big5, detect);
3175 INITIALIZE_DETECTOR_CATEGORY (big5, big5);
3176 }
3177
3178 void
3179 reinit_coding_system_type_create_mule_coding (void)
3180 {
3181 REINITIALIZE_CODING_SYSTEM_TYPE (iso2022);
3182 REINITIALIZE_CODING_SYSTEM_TYPE (ccl);
3183 REINITIALIZE_CODING_SYSTEM_TYPE (shift_jis);
3184 REINITIALIZE_CODING_SYSTEM_TYPE (big5);
3185 }
3186
3187 void
3188 reinit_vars_of_mule_coding (void)
3189 {
3190 }
3191
3192 void
3193 vars_of_mule_coding (void)
3194 {
3195 }