771
|
1 /* Conversion functions for I18N encodings, but not Unicode (in separate file).
|
|
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
|
|
3 Copyright (C) 1995 Sun Microsystems, Inc.
|
|
4 Copyright (C) 2000, 2001, 2002 Ben Wing.
|
|
5
|
|
6 This file is part of XEmacs.
|
|
7
|
|
8 XEmacs is free software; you can redistribute it and/or modify it
|
|
9 under the terms of the GNU General Public License as published by the
|
|
10 Free Software Foundation; either version 2, or (at your option) any
|
|
11 later version.
|
|
12
|
|
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
|
|
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
16 for more details.
|
|
17
|
|
18 You should have received a copy of the GNU General Public License
|
|
19 along with XEmacs; see the file COPYING. If not, write to
|
|
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
21 Boston, MA 02111-1307, USA. */
|
|
22
|
|
23 /* Synched up with: Mule 2.3. Not in FSF. */
|
|
24
|
|
25 /* For previous history, see file-coding.c.
|
|
26
|
|
27 September 10, 2001: Extracted from file-coding.c by Ben Wing.
|
|
28
|
|
29 Later in September: Finished abstraction of detection system, rewrote
|
|
30 all the detectors to include multiple levels of likelihood.
|
|
31 */
|
|
32
|
|
33 #include <config.h>
|
|
34 #include "lisp.h"
|
|
35
|
|
36 #include "charset.h"
|
|
37 #include "mule-ccl.h"
|
|
38 #include "file-coding.h"
|
|
39
|
|
40 Lisp_Object Qshift_jis, Qiso2022, Qbig5, Qccl;
|
|
41
|
|
42 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
|
|
43 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
|
|
44 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
|
|
45 Lisp_Object Qno_iso6429;
|
|
46 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
|
|
47 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
|
|
48
|
|
49 Lisp_Object Qiso_7, Qiso_8_designate, Qiso_8_1, Qiso_8_2, Qiso_lock_shift;
|
|
50
|
|
51
|
|
52 /************************************************************************/
|
|
53 /* Shift-JIS methods */
|
|
54 /************************************************************************/
|
|
55
|
|
56 /* Shift-JIS; Hankaku (half-width) KANA is also supported. */
|
|
57 DEFINE_CODING_SYSTEM_TYPE (shift_jis);
|
|
58
|
|
59 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
|
|
60 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
|
|
61 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
|
|
62 encoded by "position-code + 0x80". A character of JISX0208
|
|
63 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
|
|
64 position-codes are divided and shifted so that they fit in the range
|
|
65 below.
|
|
66
|
|
67 --- CODE RANGE of Shift-JIS ---
|
|
68 (character set) (range)
|
|
69 ASCII 0x00 .. 0x7F
|
|
70 JISX0201-Kana 0xA1 .. 0xDF
|
|
71 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
|
|
72 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
|
|
73 -------------------------------
|
|
74
|
|
75 */
|
|
76
|
|
77 /* Is this the first byte of a Shift-JIS two-byte char? */
|
|
78
|
826
|
/* Return 1 if C can be the lead byte of a two-byte Shift-JIS character,
   i.e. lies in 0x81..0x9F or 0xE0..0xEF; otherwise return 0. */
inline static int
byte_shift_jis_two_byte_1_p (int c)
{
  if (c >= 0x81 && c <= 0x9F)
    return 1;
  if (c >= 0xE0 && c <= 0xEF)
    return 1;
  return 0;
}
|
771
|
84
|
|
85 /* Is this the second byte of a Shift-JIS two-byte char? */
|
|
86
|
826
|
/* Return 1 if C can be the trail byte of a two-byte Shift-JIS character,
   i.e. lies in 0x40..0xFC excluding 0x7F; otherwise return 0. */
inline static int
byte_shift_jis_two_byte_2_p (int c)
{
  if (c < 0x40 || c > 0xFC)
    return 0;
  /* 0x7F is the single hole inside the otherwise contiguous range. */
  return c != 0x7F;
}
|
|
92
|
|
/* Return 1 if C is a single-byte JISX0201 katakana code in Shift-JIS
   (0xA1..0xDF); otherwise return 0. */
inline static int
byte_shift_jis_katakana_p (int c)
{
  return !(c < 0xA1 || c > 0xDF);
}
|
771
|
98
|
3439
|
99 inline static void
|
|
100 dynarr_add_2022_one_dimension (Lisp_Object charset, Ibyte c,
|
|
101 unsigned char charmask,
|
|
102 unsigned_char_dynarr *dst)
|
|
103 {
|
|
104 if (XCHARSET_ENCODE_AS_UTF_8 (charset))
|
|
105 {
|
|
106 encode_unicode_char (charset, c & charmask, 0,
|
4096
|
107 dst, UNICODE_UTF_8, 0, 0);
|
3439
|
108 }
|
|
109 else
|
|
110 {
|
|
111 Dynarr_add (dst, c & charmask);
|
|
112 }
|
|
113 }
|
|
114
|
|
115 inline static void
|
|
116 dynarr_add_2022_two_dimensions (Lisp_Object charset, Ibyte c,
|
|
117 unsigned int ch,
|
|
118 unsigned char charmask,
|
|
119 unsigned_char_dynarr *dst)
|
|
120 {
|
|
121 if (XCHARSET_ENCODE_AS_UTF_8 (charset))
|
|
122 {
|
|
123 encode_unicode_char (charset,
|
|
124 ch & charmask,
|
|
125 c & charmask, dst,
|
4096
|
126 UNICODE_UTF_8, 0, 0);
|
3439
|
127 }
|
|
128 else
|
|
129 {
|
|
130 Dynarr_add (dst, ch & charmask);
|
|
131 Dynarr_add (dst, c & charmask);
|
|
132 }
|
|
133 }
|
|
134
|
771
|
135 /* Convert Shift-JIS data to internal format. */
|
|
136
|
|
137 static Bytecount
|
|
138 shift_jis_convert (struct coding_stream *str, const UExtbyte *src,
|
|
139 unsigned_char_dynarr *dst, Bytecount n)
|
|
140 {
|
|
141 unsigned int ch = str->ch;
|
|
142 Bytecount orign = n;
|
|
143
|
|
144 if (str->direction == CODING_DECODE)
|
|
145 {
|
|
146 while (n--)
|
|
147 {
|
|
148 UExtbyte c = *src++;
|
|
149
|
|
150 if (ch)
|
|
151 {
|
|
152 /* Previous character was first byte of Shift-JIS Kanji char. */
|
826
|
153 if (byte_shift_jis_two_byte_2_p (c))
|
771
|
154 {
|
867
|
155 Ibyte e1, e2;
|
771
|
156
|
|
157 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
|
|
158 DECODE_SHIFT_JIS (ch, c, e1, e2);
|
|
159 Dynarr_add (dst, e1);
|
|
160 Dynarr_add (dst, e2);
|
|
161 }
|
|
162 else
|
|
163 {
|
|
164 DECODE_ADD_BINARY_CHAR (ch, dst);
|
|
165 DECODE_ADD_BINARY_CHAR (c, dst);
|
|
166 }
|
|
167 ch = 0;
|
|
168 }
|
|
169 else
|
|
170 {
|
826
|
171 if (byte_shift_jis_two_byte_1_p (c))
|
771
|
172 ch = c;
|
826
|
173 else if (byte_shift_jis_katakana_p (c))
|
771
|
174 {
|
|
175 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
|
|
176 Dynarr_add (dst, c);
|
|
177 }
|
|
178 else
|
|
179 DECODE_ADD_BINARY_CHAR (c, dst);
|
|
180 }
|
|
181 }
|
|
182
|
|
183 if (str->eof)
|
|
184 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
|
|
185 }
|
|
186 else
|
|
187 {
|
|
188 while (n--)
|
|
189 {
|
867
|
190 Ibyte c = *src++;
|
826
|
191 if (byte_ascii_p (c))
|
771
|
192 {
|
|
193 Dynarr_add (dst, c);
|
|
194 ch = 0;
|
|
195 }
|
867
|
196 else if (ibyte_leading_byte_p (c))
|
771
|
197 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
|
|
198 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
|
|
199 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
|
|
200 else if (ch)
|
|
201 {
|
|
202 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
|
|
203 {
|
|
204 Dynarr_add (dst, c);
|
|
205 ch = 0;
|
|
206 }
|
|
207 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
|
|
208 ch == LEADING_BYTE_JAPANESE_JISX0208)
|
|
209 ch = c;
|
|
210 else
|
|
211 {
|
|
212 UExtbyte j1, j2;
|
|
213 ENCODE_SHIFT_JIS (ch, c, j1, j2);
|
|
214 Dynarr_add (dst, j1);
|
|
215 Dynarr_add (dst, j2);
|
|
216 ch = 0;
|
|
217 }
|
|
218 }
|
|
219 }
|
|
220 }
|
|
221
|
|
222 str->ch = ch;
|
|
223
|
|
224 return orign;
|
|
225 }
|
|
226
|
|
227 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
|
|
228 Decode a JISX0208 character of Shift-JIS coding-system.
|
|
229 CODE is the character code in Shift-JIS as a cons of type bytes.
|
|
230 Return the corresponding character.
|
|
231 */
|
|
232 (code))
|
|
233 {
|
|
234 int c1, c2, s1, s2;
|
|
235
|
|
236 CHECK_CONS (code);
|
|
237 CHECK_INT (XCAR (code));
|
|
238 CHECK_INT (XCDR (code));
|
|
239 s1 = XINT (XCAR (code));
|
|
240 s2 = XINT (XCDR (code));
|
826
|
241 if (byte_shift_jis_two_byte_1_p (s1) &&
|
|
242 byte_shift_jis_two_byte_2_p (s2))
|
771
|
243 {
|
|
244 DECODE_SHIFT_JIS (s1, s2, c1, c2);
|
867
|
245 return make_char (make_ichar (Vcharset_japanese_jisx0208,
|
831
|
246 c1 & 0x7F, c2 & 0x7F));
|
771
|
247 }
|
|
248 else
|
|
249 return Qnil;
|
|
250 }
|
|
251
|
|
252 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
|
|
253 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
|
|
254 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
|
|
255 */
|
|
256 (character))
|
|
257 {
|
|
258 Lisp_Object charset;
|
|
259 int c1, c2, s1, s2;
|
|
260
|
|
261 CHECK_CHAR_COERCE_INT (character);
|
867
|
262 BREAKUP_ICHAR (XCHAR (character), charset, c1, c2);
|
771
|
263 if (EQ (charset, Vcharset_japanese_jisx0208))
|
|
264 {
|
|
265 ENCODE_SHIFT_JIS (c1 | 0x80, c2 | 0x80, s1, s2);
|
|
266 return Fcons (make_int (s1), make_int (s2));
|
|
267 }
|
|
268 else
|
|
269 return Qnil;
|
|
270 }
|
|
271
|
|
272
|
|
273 /************************************************************************/
|
|
274 /* Shift-JIS detector */
|
|
275 /************************************************************************/
|
|
276
|
|
277 DEFINE_DETECTOR (shift_jis);
|
|
278 DEFINE_DETECTOR_CATEGORY (shift_jis, shift_jis);
|
|
279
|
|
280 struct shift_jis_detector
|
|
281 {
|
|
282 int seen_jisx0208_char_in_c1;
|
|
283 int seen_jisx0208_char_in_upper;
|
|
284 int seen_jisx0201_char;
|
|
285 unsigned int seen_iso2022_esc:1;
|
|
286 unsigned int seen_bad_first_byte:1;
|
|
287 unsigned int seen_bad_second_byte:1;
|
|
288 /* temporary */
|
|
289 unsigned int in_second_byte:1;
|
|
290 unsigned int first_byte_was_c1:1;
|
|
291 };
|
|
292
|
|
293 static void
|
|
294 shift_jis_detect (struct detection_state *st, const UExtbyte *src,
|
|
295 Bytecount n)
|
|
296 {
|
|
297 struct shift_jis_detector *data = DETECTION_STATE_DATA (st, shift_jis);
|
|
298
|
|
299 while (n--)
|
|
300 {
|
|
301 UExtbyte c = *src++;
|
|
302 if (!data->in_second_byte)
|
|
303 {
|
|
304 if (c >= 0x80 && c <= 0x9F)
|
|
305 data->first_byte_was_c1 = 1;
|
|
306 if (c >= 0xA0 && c <= 0xDF)
|
|
307 data->seen_jisx0201_char++;
|
|
308 else if ((c >= 0x80 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
|
|
309 data->in_second_byte = 1;
|
|
310 else if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
|
|
311 data->seen_iso2022_esc = 1;
|
|
312 else if (c >= 0x80)
|
|
313 data->seen_bad_first_byte = 1;
|
|
314 }
|
|
315 else
|
|
316 {
|
|
317 if ((c >= 0x40 && c <= 0x7E) || (c >= 0x80 && c <= 0xFC))
|
|
318 {
|
|
319 if (data->first_byte_was_c1 || (c >= 0x80 && c <= 0x9F))
|
|
320 data->seen_jisx0208_char_in_c1++;
|
|
321 else
|
|
322 data->seen_jisx0208_char_in_upper++;
|
|
323 }
|
|
324 else
|
|
325 data->seen_bad_second_byte = 1;
|
|
326 data->in_second_byte = 0;
|
|
327 data->first_byte_was_c1 = 0;
|
|
328 }
|
|
329 }
|
|
330
|
|
331 if (data->seen_bad_second_byte)
|
|
332 DET_RESULT (st, shift_jis) = DET_NEARLY_IMPOSSIBLE;
|
|
333 else if (data->seen_bad_first_byte)
|
|
334 DET_RESULT (st, shift_jis) = DET_QUITE_IMPROBABLE;
|
|
335 else if (data->seen_iso2022_esc)
|
|
336 DET_RESULT (st, shift_jis) = DET_SOMEWHAT_UNLIKELY;
|
|
337 else if (data->seen_jisx0208_char_in_c1 >= 20 ||
|
|
338 (data->seen_jisx0208_char_in_c1 >= 10 &&
|
|
339 data->seen_jisx0208_char_in_upper >= 10))
|
|
340 DET_RESULT (st, shift_jis) = DET_QUITE_PROBABLE;
|
|
341 else if (data->seen_jisx0208_char_in_c1 > 3 ||
|
|
342 data->seen_jisx0208_char_in_upper >= 10 ||
|
|
343 /* Since the range is limited compared to what is often seen
|
|
344 is typical Latin-X charsets, the fact that we've seen a
|
|
345 bunch of them and none that are invalid is reasonably
|
|
346 strong statistical evidence of this encoding, or at least
|
|
347 not of the common Latin-X ones. */
|
|
348 data->seen_jisx0201_char >= 100)
|
|
349 DET_RESULT (st, shift_jis) = DET_SOMEWHAT_LIKELY;
|
|
350 else if (data->seen_jisx0208_char_in_c1 > 0 ||
|
|
351 data->seen_jisx0208_char_in_upper > 0 ||
|
|
352 data->seen_jisx0201_char > 0)
|
|
353 DET_RESULT (st, shift_jis) = DET_SLIGHTLY_LIKELY;
|
|
354 else
|
|
355 DET_RESULT (st, shift_jis) = DET_AS_LIKELY_AS_UNLIKELY;
|
|
356 }
|
|
357
|
|
358
|
|
359 /************************************************************************/
|
|
360 /* Big5 methods */
|
|
361 /************************************************************************/
|
|
362
|
2819
|
363 /* BIG5 (used for Mandarin in Taiwan). */
|
771
|
364 DEFINE_CODING_SYSTEM_TYPE (big5);
|
|
365
|
|
366 /* BIG5 is a coding system encoding two character sets: ASCII and
|
|
367 Big5. An ASCII character is encoded as is. Big5 is a two-byte
|
|
368 character set and is encoded in two-byte.
|
|
369
|
|
370 --- CODE RANGE of BIG5 ---
|
|
371 (character set) (range)
|
|
372 ASCII 0x00 .. 0x7F
|
|
373 Big5 (1st byte) 0xA1 .. 0xFE
|
|
374 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
|
|
375 --------------------------
|
|
376
|
|
377 Since the number of characters in Big5 is larger than maximum
|
|
378 characters in Emacs' charset (96x96), it can't be handled as one
|
|
379 charset. So, in XEmacs, Big5 is divided into two: `charset-big5-1'
|
|
380 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
|
|
381 contains frequently used characters and the latter contains less
|
|
382 frequently used characters. */
|
|
383
|
826
|
/* Return 1 if C can be the first byte of a two-byte Big5 character
   (0xA1..0xFE); otherwise return 0. */
inline static int
byte_big5_two_byte_1_p (int c)
{
  return !(c < 0xA1 || c > 0xFE);
}
|
771
|
389
|
|
390 /* Is this the second byte of a Big5 two-byte char? */
|
|
391
|
826
|
/* Return 1 if C can be the second byte of a two-byte Big5 character
   (0x40..0x7E or 0xA1..0xFE); otherwise return 0. */
inline static int
byte_big5_two_byte_2_p (int c)
{
  if (c >= 0x40 && c <= 0x7E)
    return 1;
  return c >= 0xA1 && c <= 0xFE;
}
|
771
|
397
|
|
398 /* Number of Big5 characters which have the same code in 1st byte. */
|
|
399
|
|
/* 94 second bytes in 0xA1..0xFE plus 63 in 0x40..0x7E: 157 per row. */
#define BIG5_SAME_ROW ((0xFF - 0xA1) + (0x7F - 0x40))
|
|
401
|
|
402 /* Code conversion macros. These are macros because they are used in
|
|
403 inner loops during code conversion.
|
|
404
|
|
405 Note that temporary variables in macros introduce the classic
|
|
406 dynamic-scoping problems with variable names. We use capital-
|
|
407 lettered variables in the assumption that XEmacs does not use
|
|
408 capital letters in variables except in a very formalized way
|
|
409 (e.g. Qstring). */
|
|
410
|
|
411 /* Convert Big5 code (b1, b2) into its internal string representation
|
|
412 (lb, c1, c2). */
|
|
413
|
|
414 /* There is a much simpler way to split the Big5 charset into two.
|
|
415 For the moment I'm going to leave the algorithm as-is because it
|
|
416 claims to separate out the most-used characters into a single
|
|
417 charset, which perhaps will lead to optimizations in various
|
|
418 places.
|
|
419
|
|
420 The way the algorithm works is something like this:
|
|
421
|
|
422 Big5 can be viewed as a 94x157 charset, where the row is
|
|
423 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
|
|
424 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
|
|
425 the split between low and high column numbers is apparently
|
|
426 meaningless; ascending rows produce less and less frequent chars.
|
|
427 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
|
|
428 the first charset, and the upper half (0xC9 .. 0xFE) to the
|
|
429 second. To do the conversion, we convert the character into
|
|
430 a single number where 0 .. 156 is the first row, 157 .. 313
|
|
431 is the second, etc. That way, the characters are ordered by
|
|
432 decreasing frequency. Then we just chop the space in two
|
|
433 and coerce the result into a 94x94 space.
|
|
434 */
|
|
435
|
|
/* Decode the Big5 byte pair (b1, b2).  Stores the leading byte of the
   selected charset in LB and the two position codes (each offset by
   0xA1) in C1 and C2.  The temporaries B1, B2 and I are capitalized to
   avoid clashing with caller variables. */
#define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
{                                                                       \
  int B1 = (b1), B2 = (b2);                                             \
  /* Linear index: row * 157 + column. */                               \
  int I = (B1 - 0xA1) * BIG5_SAME_ROW                                   \
          + (B2 - (B2 < 0x7F ? 0x40 : 0x62));                           \
                                                                        \
  if (B1 >= 0xC9)                                                       \
    {                                                                   \
      /* Upper rows: second charset, index rebased to zero. */          \
      lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
      I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
    }                                                                   \
  else                                                                  \
    {                                                                   \
      lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
    }                                                                   \
  /* Re-split the index over rows of 94 columns. */                     \
  c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
  c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
} while (0)
|
|
454
|
|
455 /* Convert the internal string representation of a Big5 character
|
|
456 (lb, c1, c2) into Big5 code (b1, b2). */
|
|
457
|
|
/* Encode the internal Big5 character (lb, c1, c2) -- a leading byte and
   two position codes each offset by 0xA1 -- into the Big5 byte pair
   (b1, b2).  This is the inverse of DECODE_BIG5.

   Every argument is parenthesized where it is used.  In particular LB is
   compared with ==, which binds tighter than |, & and ^, so an
   unparenthesized argument such as `x | y' would otherwise be
   mis-parsed.  B2 is read back after being assigned, so B1 and B2 must
   be plain lvalues.  The temporary I is capitalized to avoid clashing
   with caller variables. */
#define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
{                                                                       \
  /* Linear index within the 94x94 internal charset. */                 \
  int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);                \
                                                                        \
  if ((lb) == LEADING_BYTE_CHINESE_BIG5_2)                              \
    {                                                                   \
      /* The second charset starts at Big5 row 0xC9. */                 \
      I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
    }                                                                   \
  (b1) = I / BIG5_SAME_ROW + 0xA1;                                      \
  (b2) = I % BIG5_SAME_ROW;                                             \
  /* Columns 0..62 map to 0x40..0x7E, the rest to 0xA1..0xFE. */        \
  (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                    \
} while (0)
|
|
470
|
|
471 /* Convert Big5 data to internal format. */
|
|
472
|
|
473 static Bytecount
|
|
474 big5_convert (struct coding_stream *str, const UExtbyte *src,
|
|
475 unsigned_char_dynarr *dst, Bytecount n)
|
|
476 {
|
|
477 unsigned int ch = str->ch;
|
|
478 Bytecount orign = n;
|
|
479
|
|
480 if (str->direction == CODING_DECODE)
|
|
481 {
|
|
482 while (n--)
|
|
483 {
|
|
484 UExtbyte c = *src++;
|
|
485 if (ch)
|
|
486 {
|
|
487 /* Previous character was first byte of Big5 char. */
|
826
|
488 if (byte_big5_two_byte_2_p (c))
|
771
|
489 {
|
867
|
490 Ibyte b1, b2, b3;
|
771
|
491 DECODE_BIG5 (ch, c, b1, b2, b3);
|
|
492 Dynarr_add (dst, b1);
|
|
493 Dynarr_add (dst, b2);
|
|
494 Dynarr_add (dst, b3);
|
|
495 }
|
|
496 else
|
|
497 {
|
|
498 DECODE_ADD_BINARY_CHAR (ch, dst);
|
|
499 DECODE_ADD_BINARY_CHAR (c, dst);
|
|
500 }
|
|
501 ch = 0;
|
|
502 }
|
|
503 else
|
|
504 {
|
826
|
505 if (byte_big5_two_byte_1_p (c))
|
771
|
506 ch = c;
|
|
507 else
|
|
508 DECODE_ADD_BINARY_CHAR (c, dst);
|
|
509 }
|
|
510 }
|
|
511
|
|
512 if (str->eof)
|
|
513 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
|
|
514 }
|
|
515 else
|
|
516 {
|
|
517 while (n--)
|
|
518 {
|
867
|
519 Ibyte c = *src++;
|
826
|
520 if (byte_ascii_p (c))
|
771
|
521 {
|
|
522 /* ASCII. */
|
|
523 Dynarr_add (dst, c);
|
|
524 }
|
867
|
525 else if (ibyte_leading_byte_p (c))
|
771
|
526 {
|
|
527 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
|
|
528 c == LEADING_BYTE_CHINESE_BIG5_2)
|
|
529 {
|
|
530 /* A recognized leading byte. */
|
|
531 ch = c;
|
|
532 continue; /* not done with this character. */
|
|
533 }
|
|
534 /* otherwise just ignore this character. */
|
|
535 }
|
|
536 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
|
|
537 ch == LEADING_BYTE_CHINESE_BIG5_2)
|
|
538 {
|
|
539 /* Previous char was a recognized leading byte. */
|
|
540 ch = (ch << 8) | c;
|
|
541 continue; /* not done with this character. */
|
|
542 }
|
|
543 else if (ch)
|
|
544 {
|
|
545 /* Encountering second byte of a Big5 character. */
|
|
546 UExtbyte b1, b2;
|
|
547
|
|
548 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
|
|
549 Dynarr_add (dst, b1);
|
|
550 Dynarr_add (dst, b2);
|
|
551 }
|
|
552
|
|
553 ch = 0;
|
|
554 }
|
|
555 }
|
|
556
|
|
557 str->ch = ch;
|
|
558 return orign;
|
|
559 }
|
|
560
|
867
|
561 Ichar
|
771
|
562 decode_big5_char (int b1, int b2)
|
|
563 {
|
826
|
564 if (byte_big5_two_byte_1_p (b1) &&
|
|
565 byte_big5_two_byte_2_p (b2))
|
771
|
566 {
|
|
567 int leading_byte;
|
|
568 Lisp_Object charset;
|
|
569 int c1, c2;
|
|
570
|
|
571 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
|
826
|
572 charset = charset_by_leading_byte (leading_byte);
|
867
|
573 return make_ichar (charset, c1 & 0x7F, c2 & 0x7F);
|
771
|
574 }
|
|
575 else
|
|
576 return -1;
|
|
577 }
|
|
578
|
|
579 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
|
|
580 Convert Big Five character codes in CODE into a character.
|
|
581 CODE is a cons of two integers specifying the codepoints in Big Five.
|
|
582 Return the corresponding character, or nil if the codepoints are out of range.
|
|
583
|
|
584 The term `decode' is used because the codepoints can be viewed as the
|
|
585 representation of the character in the external Big Five encoding, and thus
|
|
586 converting them to a character is analogous to any other operation that
|
|
587 decodes an external representation.
|
|
588 */
|
|
589 (code))
|
|
590 {
|
867
|
591 Ichar ch;
|
771
|
592
|
|
593 CHECK_CONS (code);
|
|
594 CHECK_INT (XCAR (code));
|
|
595 CHECK_INT (XCDR (code));
|
|
596 ch = decode_big5_char (XINT (XCAR (code)), XINT (XCDR (code)));
|
|
597 if (ch == -1)
|
|
598 return Qnil;
|
|
599 else
|
|
600 return make_char (ch);
|
|
601 }
|
|
602
|
|
603 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
|
|
604 Convert the specified Big Five character into its codepoints.
|
|
605 The codepoints are returned as a cons of two integers, specifying the
|
|
606 Big Five codepoints. See `decode-big5-char' for the reason why the
|
|
607 term `encode' is used for this operation.
|
|
608 */
|
|
609 (character))
|
|
610 {
|
|
611 Lisp_Object charset;
|
|
612 int c1, c2, b1, b2;
|
|
613
|
|
614 CHECK_CHAR_COERCE_INT (character);
|
867
|
615 BREAKUP_ICHAR (XCHAR (character), charset, c1, c2);
|
771
|
616 if (EQ (charset, Vcharset_chinese_big5_1) ||
|
|
617 EQ (charset, Vcharset_chinese_big5_2))
|
|
618 {
|
|
619 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
|
|
620 b1, b2);
|
|
621 return Fcons (make_int (b1), make_int (b2));
|
|
622 }
|
|
623 else
|
|
624 return Qnil;
|
|
625 }
|
|
626
|
|
627
|
|
628 /************************************************************************/
|
|
629 /* Big5 detector */
|
|
630 /************************************************************************/
|
|
631
|
|
632 DEFINE_DETECTOR (big5);
|
|
633 DEFINE_DETECTOR_CATEGORY (big5, big5);
|
|
634
|
|
635 struct big5_detector
|
|
636 {
|
|
637 int seen_big5_char;
|
985
|
638 int seen_euc_char;
|
771
|
639 unsigned int seen_iso2022_esc:1;
|
|
640 unsigned int seen_bad_first_byte:1;
|
|
641 unsigned int seen_bad_second_byte:1;
|
|
642
|
|
643 /* temporary */
|
|
644 unsigned int in_second_byte:1;
|
|
645 };
|
|
646
|
|
647 static void
|
|
648 big5_detect (struct detection_state *st, const UExtbyte *src,
|
|
649 Bytecount n)
|
|
650 {
|
|
651 struct big5_detector *data = DETECTION_STATE_DATA (st, big5);
|
|
652
|
|
653 while (n--)
|
|
654 {
|
|
655 UExtbyte c = *src++;
|
|
656 if (!data->in_second_byte)
|
|
657 {
|
|
658 if (c >= 0xA1 && c <= 0xFE)
|
|
659 data->in_second_byte = 1;
|
|
660 else if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
|
|
661 data->seen_iso2022_esc = 1;
|
|
662 else if (c >= 0x80)
|
|
663 data->seen_bad_first_byte = 1;
|
|
664 }
|
|
665 else
|
|
666 {
|
|
667 data->in_second_byte = 0;
|
985
|
668 if (c >= 0xA1 && c <= 0xFE)
|
|
669 data->seen_euc_char++;
|
|
670 else if (c >= 0x40 && c <= 0x7E)
|
771
|
671 data->seen_big5_char++;
|
|
672 else
|
|
673 data->seen_bad_second_byte = 1;
|
|
674 }
|
|
675 }
|
|
676
|
|
677 if (data->seen_bad_second_byte)
|
|
678 DET_RESULT (st, big5) = DET_NEARLY_IMPOSSIBLE;
|
|
679 else if (data->seen_bad_first_byte)
|
|
680 DET_RESULT (st, big5) = DET_QUITE_IMPROBABLE;
|
|
681 else if (data->seen_iso2022_esc)
|
|
682 DET_RESULT (st, big5) = DET_SOMEWHAT_UNLIKELY;
|
|
683 else if (data->seen_big5_char >= 4)
|
|
684 DET_RESULT (st, big5) = DET_SOMEWHAT_LIKELY;
|
985
|
685 else if (data->seen_euc_char)
|
|
686 DET_RESULT (st, big5) = DET_SLIGHTLY_LIKELY;
|
771
|
687 else
|
|
688 DET_RESULT (st, big5) = DET_AS_LIKELY_AS_UNLIKELY;
|
|
689 }
|
|
690
|
|
691
|
|
692 /************************************************************************/
|
|
693 /* ISO2022 methods */
|
|
694 /************************************************************************/
|
|
695
|
|
696 /* Any ISO-2022-compliant coding system. Includes JIS, EUC, CTEXT
|
|
697 (Compound Text, the encoding of selections in X Windows). See below for
|
|
698 a complete description of ISO-2022. */
|
|
699
|
|
700 /* Flags indicating what we've seen so far when parsing an
|
|
701 ISO2022 escape sequence. */
|
|
enum iso_esc_flag
{
  /* ---- Partial sequences ---- */

  /* Nothing has been seen. */
  ISO_ESC_NOTHING,
  /* ESC has been seen. */
  ISO_ESC,
  /* ESC $ has been seen: a multi-byte, rather than single-byte,
     character set is being designated. */
  ISO_ESC_2_4,
  /* ESC % has been seen: an escape to a Unicode coding system.  The only
     one we are prepared to deal with is UTF-8, which has the next
     character as G. */
  ISO_ESC_2_5,
  /* ESC 0x28, i.e. ESC (, has been seen: designate a 94-character
     character set into G0. */
  ISO_ESC_2_8,
  /* ESC 0x29 has been seen: designate a 94-character character set
     into G1. */
  ISO_ESC_2_9,
  /* ESC 0x2A has been seen. */
  ISO_ESC_2_10,
  /* ESC 0x2B has been seen. */
  ISO_ESC_2_11,
  /* ESC 0x2C has been seen: designate a 96-character character set
     into G0.  (This is not ISO2022-standard.  The following
     96-character control sequences are standard, though.) */
  ISO_ESC_2_12,
  /* ESC 0x2D has been seen: designate a 96-character character set
     into G1. */
  ISO_ESC_2_13,
  /* ESC 0x2E has been seen. */
  ISO_ESC_2_14,
  /* ESC 0x2F has been seen. */
  ISO_ESC_2_15,
  /* ESC $ 0x28 has been seen: designate a 94^N character set into G0. */
  ISO_ESC_2_4_8,
  /* ESC $ 0x29 has been seen. */
  ISO_ESC_2_4_9,
  /* ESC $ 0x2A has been seen. */
  ISO_ESC_2_4_10,
  /* ESC $ 0x2B has been seen. */
  ISO_ESC_2_4_11,
  /* ESC $ 0x2C has been seen. */
  ISO_ESC_2_4_12,
  /* ESC $ 0x2D has been seen. */
  ISO_ESC_2_4_13,
  /* ESC $ 0x2E has been seen. */
  ISO_ESC_2_4_14,
  /* ESC $ 0x2F has been seen. */
  ISO_ESC_2_4_15,
  /* ESC [ or 0x9B has been seen.  This starts a directionality-control
     sequence.  The next character must be 0, 1, 2, or ]. */
  ISO_ESC_5_11,
  /* 0x9B 0 has been seen.  The next character must be ]. */
  ISO_ESC_5_11_0,
  /* 0x9B 1 has been seen.  The next character must be ]. */
  ISO_ESC_5_11_1,
  /* 0x9B 2 has been seen.  The next character must be ]. */
  ISO_ESC_5_11_2,

  /* ---- Full sequences ---- */

  /* Private usage for START COMPOSING. */
  ISO_ESC_START_COMPOSITE,
  /* Private usage for END COMPOSING. */
  ISO_ESC_END_COMPOSITE,
  /* A complete single-shift sequence has been seen. */
  ISO_ESC_SINGLE_SHIFT,
  /* A complete locking-shift sequence has been seen. */
  ISO_ESC_LOCKING_SHIFT,
  /* A complete designation sequence has been seen. */
  ISO_ESC_DESIGNATE,
  /* A complete ISO6429 directionality sequence has been seen. */
  ISO_ESC_DIRECTIONALITY,
  /* A literal character ala escape-quoting has been seen. */
  ISO_ESC_LITERAL
};
|
|
760
|
|
/* Kinds of error distinguished for ISO2022 data.
   NOTE(review): the code that raises these is not visible here; the
   descriptions below are read off the enumerator names -- confirm. */
enum iso_error
{
  ISO_ERROR_BAD_FINAL,                    /* bad final byte */
  ISO_ERROR_UNKWOWN_ESC_SEQUENCE,         /* unknown escape sequence */
  ISO_ERROR_INVALID_CODE_POINT_CHARACTER  /* invalid code point */
};
|
|
767
|
|
768
|
|
/* Bit flags describing the current state while converting code. */

/* ---------- Used during encoding and decoding ---------- */

/* Set: current directionality is right-to-left.  Clear: left-to-right. */
#define ISO_STATE_R2L           (1 << 0)

/* ---------- Used during encoding ---------- */

/* Set: we just saw a CR. */
#define ISO_STATE_CR            (1 << 1)

/* ---------- Used during decoding ---------- */

/* Set: we are parsing an escape sequence, and the upper 16 bits should
   be looked at to indicate what partial escape sequence we've seen so
   far.  Clear: we are running through actual text. */
#define ISO_STATE_ESCAPE        (1 << 2)

/* Set: G2 is invoked into GL, but only for the next character. */
#define ISO_STATE_SS2           (1 << 3)

/* Set: G3 is invoked into GL, but only for the next character.  If both
   ISO_STATE_SS2 and ISO_STATE_SS3 are set, ISO_STATE_SS2 overrides; but
   this probably indicates an error in the text encoding. */
#define ISO_STATE_SS3           (1 << 4)

/* Set: we are processing a composite character (i.e. a character
   constructed by overstriking two or more characters). */
#define ISO_STATE_COMPOSITE     (1 << 5)

/* Set: we are processing UTF-8 encoded data within ISO-2022
   processing. */
#define ISO_STATE_UTF_8         (1 << 6)

/* Mask of the flags that stay on until explicitly turned off in the
   ISO2022 encoder/decoder.  All other flags are turned off at the end of
   processing each character or escape sequence. */
#define ISO_STATE_LOCK \
  (ISO_STATE_COMPOSITE | ISO_STATE_R2L | ISO_STATE_UTF_8)
|
771
|
804
|
|
805 typedef struct charset_conversion_spec
|
|
806 {
|
|
807 Lisp_Object from_charset;
|
|
808 Lisp_Object to_charset;
|
|
809 } charset_conversion_spec;
|
|
810
|
|
811 typedef struct
|
|
812 {
|
|
813 Dynarr_declare (charset_conversion_spec);
|
|
814 } charset_conversion_spec_dynarr;
|
|
815
|
|
816 struct iso2022_coding_system
|
|
817 {
|
|
818 /* What are the charsets to be initially designated to G0, G1,
|
|
819 G2, G3? If t, no charset is initially designated. If nil,
|
|
820 no charset is initially designated and no charset is allowed
|
|
821 to be designated. */
|
|
822 Lisp_Object initial_charset[4];
|
|
823
|
|
824 /* If true, a designation escape sequence needs to be sent on output
|
|
825 for the charset in G[0-3] before that charset is used. */
|
|
826 unsigned char force_charset_on_output[4];
|
|
827
|
|
828 charset_conversion_spec_dynarr *input_conv;
|
|
829 charset_conversion_spec_dynarr *output_conv;
|
|
830
|
|
831 unsigned int shoort :1; /* C makes you speak Dutch */
|
|
832 unsigned int no_ascii_eol :1;
|
|
833 unsigned int no_ascii_cntl :1;
|
|
834 unsigned int seven :1;
|
|
835 unsigned int lock_shift :1;
|
|
836 unsigned int no_iso6429 :1;
|
|
837 unsigned int escape_quoted :1;
|
|
838 };
|
|
839
|
|
/* Accessors for the fields of struct iso2022_coding_system.  CODESYS is
   whatever CODING_SYSTEM_TYPE_DATA accepts; G is the register index into
   the four-element arrays. */
#define CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, g)               \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->initial_charset[g])
#define CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT(codesys, g)       \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->force_charset_on_output[g])
#define CODING_SYSTEM_ISO2022_SHORT(codesys)                            \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->shoort)
#define CODING_SYSTEM_ISO2022_NO_ASCII_EOL(codesys)                     \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->no_ascii_eol)
#define CODING_SYSTEM_ISO2022_NO_ASCII_CNTL(codesys)                    \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->no_ascii_cntl)
#define CODING_SYSTEM_ISO2022_SEVEN(codesys)                            \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->seven)
#define CODING_SYSTEM_ISO2022_LOCK_SHIFT(codesys)                       \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->lock_shift)
#define CODING_SYSTEM_ISO2022_NO_ISO6429(codesys)                       \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->no_iso6429)
#define CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)                    \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->escape_quoted)
#define CODING_SYSTEM_ISO2022_INPUT_CONV(codesys)                       \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->input_conv)
#define CODING_SYSTEM_ISO2022_OUTPUT_CONV(codesys)                      \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->output_conv)

/* The same accessors, applied to CODESYS after passing it through
   XCODING_SYSTEM. */
#define XCODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, g)              \
  CODING_SYSTEM_ISO2022_INITIAL_CHARSET (XCODING_SYSTEM (codesys), g)
#define XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT(codesys, g)      \
  CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (XCODING_SYSTEM (codesys), g)
#define XCODING_SYSTEM_ISO2022_SHORT(codesys)                           \
  CODING_SYSTEM_ISO2022_SHORT (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_ISO2022_NO_ASCII_EOL(codesys)                    \
  CODING_SYSTEM_ISO2022_NO_ASCII_EOL (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_ISO2022_NO_ASCII_CNTL(codesys)                   \
  CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_ISO2022_SEVEN(codesys)                           \
  CODING_SYSTEM_ISO2022_SEVEN (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_ISO2022_LOCK_SHIFT(codesys)                      \
  CODING_SYSTEM_ISO2022_LOCK_SHIFT (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_ISO2022_NO_ISO6429(codesys)                      \
  CODING_SYSTEM_ISO2022_NO_ISO6429 (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys)                   \
  CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_ISO2022_INPUT_CONV(codesys)                      \
  CODING_SYSTEM_ISO2022_INPUT_CONV (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_ISO2022_OUTPUT_CONV(codesys)                     \
  CODING_SYSTEM_ISO2022_OUTPUT_CONV (XCODING_SYSTEM (codesys))
|
|
885
|
|
886 /* Additional information used by the ISO2022 decoder and detector. */
|
|
887 struct iso2022_coding_stream
|
|
888 {
|
|
889 /* CHARSET holds the character sets currently assigned to the G0
|
|
890 through G3 variables. It is initialized from the array
|
|
891 INITIAL_CHARSET in CODESYS. */
|
|
892 Lisp_Object charset[4];
|
|
893
|
|
894 /* Which registers are currently invoked into the left (GL) and
|
|
895 right (GR) halves of the 8-bit encoding space? */
|
|
896 int register_left, register_right;
|
|
897
|
|
898 /* FLAGS holds flags indicating the current state of the encoding. Some of
|
|
899 these flags are actually part of the state-dependent data and should be
|
|
900 moved there. */
|
|
901 unsigned int flags;
|
|
902
|
|
903 /**************** for decoding ****************/
|
|
904
|
|
905 /* ISO_ESC holds a value indicating part of an escape sequence
|
|
906 that has already been seen. */
|
|
907 enum iso_esc_flag esc;
|
|
908
|
|
909 /* This records the bytes we've seen so far in an escape sequence,
|
|
910 in case the sequence is invalid (we spit out the bytes unchanged). */
|
|
911 unsigned char esc_bytes[8];
|
|
912
|
|
913 /* Index for next byte to store in ISO escape sequence. */
|
|
914 int esc_bytes_index;
|
|
915
|
|
916 #ifdef ENABLE_COMPOSITE_CHARS
|
|
917 /* Stuff seen so far when composing a string. */
|
|
918 unsigned_char_dynarr *composite_chars;
|
|
919 #endif
|
|
920
|
|
921 /* If we saw an invalid designation sequence for a particular
|
|
922 register, we flag it here and switch to ASCII. The next time we
|
|
923 see a valid designation for this register, we turn off the flag
|
|
924 and do the designation normally, but pretend the sequence was
|
|
925 invalid. The effect of all this is that (most of the time) the
|
|
926 escape sequences for both the switch to the unknown charset, and
|
|
927 the switch back to the known charset, get inserted literally into
|
|
928 the buffer and saved out as such. The hope is that we can
|
|
929 preserve the escape sequences so that the resulting written out
|
|
930 file makes sense. If we don't do any of this, the designation
|
|
931 to the invalid charset will be preserved but that switch back
|
|
932 to the known charset will probably get eaten because it was
|
|
933 the same charset that was already present in the register. */
|
|
934 unsigned char invalid_designated[4];
|
|
935
|
|
936 /* We try to do similar things as above for direction-switching
|
|
937 sequences. If we encountered a direction switch while an
|
|
938 invalid designation was present, or an invalid designation
|
|
939 just after a direction switch (i.e. no valid designation
|
|
940 encountered yet), we insert the direction-switch escape
|
|
941 sequence literally into the output stream, and later on
|
|
942 insert the corresponding direction-restoring escape sequence
|
|
943 literally also. */
|
|
944 unsigned int switched_dir_and_no_valid_charset_yet :1;
|
|
945 unsigned int invalid_switch_dir :1;
|
|
946
|
|
947 /* Tells the decoder to output the escape sequence literally
|
|
948 even though it was valid. Used in the games we play to
|
|
949 avoid lossage when we encounter invalid designations. */
|
|
950 unsigned int output_literally :1;
|
|
951 /* We encountered a direction switch followed by an invalid
|
|
952 designation. We didn't output the direction switch
|
|
953 literally because we didn't know about the invalid designation;
|
|
954 but we have to do so now. */
|
|
955 unsigned int output_direction_sequence :1;
|
|
956
|
|
957 /**************** for encoding ****************/
|
|
958
|
|
959 /* Whether we need to explicitly designate the charset in the
|
|
960 G? register before using it. It is initialized from the
|
|
961 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
|
|
962 unsigned char force_charset_on_output[4];
|
|
963
|
|
964 /* Other state variables that need to be preserved across
|
|
965 invocations. */
|
|
966 Lisp_Object current_charset;
|
|
967 int current_half;
|
|
968 int current_char_boundary;
|
3439
|
969
|
|
970 /* Used for handling UTF-8. */
|
|
971 unsigned char counter;
|
4096
|
972 unsigned char indicated_length;
|
771
|
973 };
|
|
974
|
1204
|
975 static const struct memory_description ccs_description_1[] =
|
771
|
976 {
|
|
977 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
|
|
978 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
|
|
979 { XD_END }
|
|
980 };
|
|
981
|
1204
|
982 static const struct sized_memory_description ccs_description =
|
771
|
983 {
|
|
984 sizeof (charset_conversion_spec),
|
|
985 ccs_description_1
|
|
986 };
|
|
987
|
1204
|
988 static const struct memory_description ccsd_description_1[] =
|
771
|
989 {
|
|
990 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
|
|
991 { XD_END }
|
|
992 };
|
|
993
|
1204
|
994 static const struct sized_memory_description ccsd_description =
|
771
|
995 {
|
|
996 sizeof (charset_conversion_spec_dynarr),
|
|
997 ccsd_description_1
|
|
998 };
|
|
999
|
1204
|
1000 static const struct memory_description iso2022_coding_system_description[] = {
|
|
1001 { XD_LISP_OBJECT_ARRAY, offsetof (struct iso2022_coding_system,
|
|
1002 initial_charset), 4 },
|
2367
|
1003 { XD_BLOCK_PTR, offsetof (struct iso2022_coding_system, input_conv),
|
2551
|
1004 1, { &ccsd_description } },
|
2367
|
1005 { XD_BLOCK_PTR, offsetof (struct iso2022_coding_system, output_conv),
|
2551
|
1006 1, { &ccsd_description } },
|
771
|
1007 { XD_END }
|
|
1008 };
|
|
1009
|
1204
|
1010 DEFINE_CODING_SYSTEM_TYPE_WITH_DATA (iso2022);
|
|
1011
|
771
|
1012 /* The following note taken directly from FSF 21.0.103. */
|
|
1013
|
|
1014 /* The following note describes the coding system ISO2022 briefly.
|
|
1015 Since the intention of this note is to help understand the
|
|
1016 functions in this file, some parts are NOT ACCURATE or are OVERLY
|
|
1017 SIMPLIFIED. For thorough understanding, please refer to the
|
|
1018 original document of ISO2022. This is equivalent to the standard
|
|
1019 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
|
|
1020
|
|
1021 ISO2022 provides many mechanisms to encode several character sets
|
|
1022 in 7-bit and 8-bit environments. For 7-bit environments, all text
|
|
1023 is encoded using bytes less than 128. This may make the encoded
|
|
1024 text a little bit longer, but the text passes more easily through
|
|
1025 several types of gateway, some of which strip off the MSB (Most
|
|
1026 Significant Bit).
|
|
1027
|
|
1028 There are two kinds of character sets: control character sets and
|
|
1029 graphic character sets. The former contain control characters such
|
|
1030 as `newline' and `escape' to provide control functions (control
|
|
1031 functions are also provided by escape sequences). The latter
|
|
1032 contain graphic characters such as 'A' and '-'. Emacs recognizes
|
|
1033 two control character sets and many graphic character sets.
|
|
1034
|
|
1035 Graphic character sets are classified into one of the following
|
|
1036 four classes, according to the number of bytes (DIMENSION) and
|
|
1037 number of characters in one dimension (CHARS) of the set:
|
|
1038 - DIMENSION1_CHARS94
|
|
1039 - DIMENSION1_CHARS96
|
|
1040 - DIMENSION2_CHARS94
|
|
1041 - DIMENSION2_CHARS96
|
|
1042
|
|
1043 In addition, each character set is assigned an identification tag,
|
|
1044 unique for each set, called the "final character" (denoted as <F>
|
|
1045 hereafter). The <F> of each character set is decided by ECMA(*)
|
|
1046 when it is registered in ISO. The code range of <F> is 0x30..0x7F
|
|
1047 (0x30..0x3F are for private use only).
|
|
1048
|
|
1049 Note (*): ECMA = European Computer Manufacturers Association
|
|
1050
|
|
1051 Here are examples of graphic character sets [NAME(<F>)]:
|
|
1052 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
|
|
1053 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
|
|
1054 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
|
|
1055 o DIMENSION2_CHARS96 -- none for the moment
|
|
1056
|
|
1057 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
|
|
1058 C0 [0x00..0x1F] -- control character plane 0
|
|
1059 GL [0x20..0x7F] -- graphic character plane 0
|
|
1060 C1 [0x80..0x9F] -- control character plane 1
|
|
1061 GR [0xA0..0xFF] -- graphic character plane 1
|
|
1062
|
|
1063 A control character set is directly designated and invoked to C0 or
|
|
1064 C1 by an escape sequence. The most common case is that:
|
|
1065 - ISO646's control character set is designated/invoked to C0, and
|
|
1066 - ISO6429's control character set is designated/invoked to C1,
|
|
1067 and usually these designations/invocations are omitted in encoded
|
|
1068 text. In a 7-bit environment, only C0 can be used, and a control
|
|
1069 character for C1 is encoded by an appropriate escape sequence to
|
|
1070 fit into the environment. All control characters for C1 are
|
|
1071 defined to have corresponding escape sequences.
|
|
1072
|
|
1073 A graphic character set is at first designated to one of four
|
|
1074 graphic registers (G0 through G3), then these graphic registers are
|
|
1075 invoked to GL or GR. These designations and invocations can be
|
|
1076 done independently. The most common case is that G0 is invoked to
|
|
1077 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
|
|
1078 these invocations and designations are omitted in encoded text.
|
|
1079 In a 7-bit environment, only GL can be used.
|
|
1080
|
|
1081 When a graphic character set of CHARS94 is invoked to GL, codes
|
|
1082 0x20 and 0x7F of the GL area work as control characters SPACE and
|
|
1083 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
|
|
1084 be used.
|
|
1085
|
|
1086 There are two ways of invocation: locking-shift and single-shift.
|
|
1087 With locking-shift, the invocation lasts until the next different
|
|
1088 invocation, whereas with single-shift, the invocation affects the
|
|
1089 following character only and doesn't affect the locking-shift
|
|
1090 state. Invocations are done by the following control characters or
|
|
1091 escape sequences:
|
|
1092
|
|
1093 ----------------------------------------------------------------------
|
|
1094 abbrev function cntrl escape seq description
|
|
1095 ----------------------------------------------------------------------
|
|
1096 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
|
|
1097 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
|
|
1098 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
|
|
1099 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
|
|
1100 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
|
|
1101 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
|
|
1102 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
|
|
1103 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
|
|
1104 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
|
|
1105 ----------------------------------------------------------------------
|
|
1106 (*) These are not used by any known coding system.
|
|
1107
|
|
1108 Control characters for these functions are defined by macros
|
|
1109 ISO_CODE_XXX in `coding.h'.
|
|
1110
|
|
1111 Designations are done by the following escape sequences:
|
|
1112 ----------------------------------------------------------------------
|
|
1113 escape sequence description
|
|
1114 ----------------------------------------------------------------------
|
|
1115 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
|
|
1116 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
|
|
1117 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
|
|
1118 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
|
|
1119 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
|
|
1120 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
|
|
1121 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
|
|
1122 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
|
|
1123 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
|
|
1124 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
|
|
1125 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
|
|
1126 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
|
|
1127 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
|
|
1128 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
|
|
1129 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
|
|
1130 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
|
|
1131 ----------------------------------------------------------------------
|
|
1132
|
|
1133 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
|
|
1134 of dimension 1, chars 94, and final character <F>, etc...
|
|
1135
|
|
1136 Note (*): Although these designations are not allowed in ISO2022,
|
|
1137 Emacs accepts them on decoding, and produces them on encoding
|
|
1138 CHARS96 character sets in a coding system which is characterized as
|
|
1139 7-bit environment, non-locking-shift, and non-single-shift.
|
|
1140
|
|
1141 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
|
|
1142 '(' can be omitted. We refer to this as "short-form" hereafter.
|
|
1143
|
|
1144 Now you may notice that there are a lot of ways of encoding the
|
|
1145 same multilingual text in ISO2022. Actually, there exist many
|
|
1146 coding systems such as Compound Text (used in X11's inter client
|
|
1147 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
|
|
1148 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
|
|
1149 localized platforms), and all of these are variants of ISO2022.
|
|
1150
|
|
1151 In addition to the above, Emacs handles two more kinds of escape
|
|
1152 sequences: ISO6429's direction specification and Emacs' private
|
|
1153 sequence for specifying character composition.
|
|
1154
|
|
1155 ISO6429's direction specification takes the following form:
|
|
1156 o CSI ']' -- end of the current direction
|
|
1157 o CSI '0' ']' -- end of the current direction
|
|
1158 o CSI '1' ']' -- start of left-to-right text
|
|
1159 o CSI '2' ']' -- start of right-to-left text
|
|
1160 The control character CSI (0x9B: control sequence introducer) is
|
|
1161 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
|
|
1162
|
|
1163 Character composition specification takes the following form:
|
|
1164 o ESC '0' -- start relative composition
|
|
1165 o ESC '1' -- end composition
|
|
1166 o ESC '2' -- start rule-base composition (*)
|
|
1167 o ESC '3' -- start relative composition with alternate chars (**)
|
|
1168 o ESC '4' -- start rule-base composition with alternate chars (**)
|
|
1169 Since these are not standard escape sequences of any ISO standard,
|
|
1170 the use of them with these meanings is restricted to Emacs only.
|
|
1171
|
|
1172 (*) This form is used only in Emacs 20.5 and older versions,
|
|
1173 but the newer versions can safely decode it.
|
|
1174 (**) This form is used only in Emacs 21.1 and newer versions,
|
|
1175 and the older versions can't decode it.
|
|
1176
|
|
1177 Here's a list of example usages of these composition escape
|
|
1178 sequences (categorized by `enum composition_method').
|
|
1179
|
|
1180 COMPOSITION_RELATIVE:
|
|
1181 ESC 0 CHAR [ CHAR ] ESC 1
|
|
1182 COMPOSITION_WITH_RULE:
|
|
1183 ESC 2 CHAR [ RULE CHAR ] ESC 1
|
|
1184 COMPOSITION_WITH_ALTCHARS:
|
|
1185 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
|
|
1186 COMPOSITION_WITH_RULE_ALTCHARS:
|
|
1187 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
|
|
1188
|
|
1189 static void
|
|
1190 reset_iso2022_decode (Lisp_Object coding_system,
|
|
1191 struct iso2022_coding_stream *data)
|
|
1192 {
|
|
1193 int i;
|
|
1194 #ifdef ENABLE_COMPOSITE_CHARS
|
|
1195 unsigned_char_dynarr *old_composite_chars = data->composite_chars;
|
|
1196 #endif
|
|
1197
|
|
1198 xzero (*data);
|
|
1199
|
|
1200 for (i = 0; i < 4; i++)
|
|
1201 {
|
|
1202 if (!NILP (coding_system))
|
|
1203 data->charset[i] =
|
|
1204 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
|
|
1205 else
|
|
1206 data->charset[i] = Qt;
|
|
1207 }
|
|
1208 data->esc = ISO_ESC_NOTHING;
|
|
1209 data->register_right = 1;
|
|
1210 #ifdef ENABLE_COMPOSITE_CHARS
|
|
1211 if (old_composite_chars)
|
|
1212 {
|
|
1213 data->composite_chars = old_composite_chars;
|
|
1214 Dynarr_reset (data->composite_chars);
|
|
1215 }
|
|
1216 #endif
|
|
1217 }
|
|
1218
|
|
1219 static void
|
|
1220 reset_iso2022_encode (Lisp_Object coding_system,
|
|
1221 struct iso2022_coding_stream *data)
|
|
1222 {
|
|
1223 int i;
|
|
1224
|
|
1225 xzero (*data);
|
|
1226
|
|
1227 for (i = 0; i < 4; i++)
|
|
1228 {
|
|
1229 data->charset[i] =
|
|
1230 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
|
|
1231 data->force_charset_on_output[i] =
|
|
1232 XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (coding_system, i);
|
|
1233 }
|
|
1234 data->register_right = 1;
|
|
1235 data->current_charset = Qnil;
|
|
1236 data->current_char_boundary = 1;
|
|
1237 }
|
|
1238
|
|
1239 static void
|
|
1240 iso2022_init_coding_stream (struct coding_stream *str)
|
|
1241 {
|
|
1242 if (str->direction == CODING_DECODE)
|
|
1243 reset_iso2022_decode (str->codesys,
|
|
1244 CODING_STREAM_TYPE_DATA (str, iso2022));
|
|
1245 else
|
|
1246 reset_iso2022_encode (str->codesys,
|
|
1247 CODING_STREAM_TYPE_DATA (str, iso2022));
|
|
1248 }
|
|
1249
|
|
/* Rewinding an ISO2022 stream is the same as initializing it afresh. */
static void
iso2022_rewind_coding_stream (struct coding_stream *str)
{
  iso2022_init_coding_stream (str);
}
|
|
1255
|
|
1256 static int
|
|
1257 fit_to_be_escape_quoted (unsigned char c)
|
|
1258 {
|
|
1259 switch (c)
|
|
1260 {
|
|
1261 case ISO_CODE_ESC:
|
|
1262 case ISO_CODE_CSI:
|
|
1263 case ISO_CODE_SS2:
|
|
1264 case ISO_CODE_SS3:
|
|
1265 case ISO_CODE_SO:
|
|
1266 case ISO_CODE_SI:
|
|
1267 return 1;
|
|
1268
|
|
1269 default:
|
|
1270 return 0;
|
|
1271 }
|
|
1272 }
|
|
1273
|
|
1274 static Lisp_Object
|
867
|
1275 charset_by_attributes_or_create_one (int type, Ibyte final, int dir)
|
771
|
1276 {
|
826
|
1277 Lisp_Object charset = charset_by_attributes (type, final, dir);
|
771
|
1278
|
|
1279 if (NILP (charset))
|
|
1280 {
|
|
1281 int chars, dim;
|
|
1282
|
|
1283 switch (type)
|
|
1284 {
|
|
1285 case CHARSET_TYPE_94:
|
|
1286 chars = 94; dim = 1;
|
|
1287 break;
|
|
1288 case CHARSET_TYPE_96:
|
|
1289 chars = 96; dim = 1;
|
|
1290 break;
|
|
1291 case CHARSET_TYPE_94X94:
|
|
1292 chars = 94; dim = 2;
|
|
1293 break;
|
|
1294 case CHARSET_TYPE_96X96:
|
|
1295 chars = 96; dim = 2;
|
|
1296 break;
|
|
1297 default:
|
2500
|
1298 ABORT (); chars = 0; dim = 0;
|
771
|
1299 }
|
|
1300
|
|
1301 charset = Fmake_charset (Qunbound, Qnil,
|
|
1302 nconc2 (list6 (Qfinal, make_char (final),
|
|
1303 Qchars, make_int (chars),
|
|
1304 Qdimension, make_int (dim)),
|
|
1305 list2 (Qdirection,
|
|
1306 dir == CHARSET_LEFT_TO_RIGHT ?
|
|
1307 Ql2r : Qr2l)));
|
|
1308 }
|
|
1309
|
|
1310 return charset;
|
|
1311 }
|
|
1312
|
|
1313 /* Parse one byte of an ISO2022 escape sequence.
|
|
1314 If the result is an invalid escape sequence, return 0 and
|
|
1315 do not change anything in STR. Otherwise, if the result is
|
|
1316 an incomplete escape sequence, update ISO2022.ESC and
|
|
1317 ISO2022.ESC_BYTES and return -1. Otherwise, update
|
|
1318 all the state variables (but not ISO2022.ESC_BYTES) and
|
|
1319 return 1.
|
|
1320
|
|
1321 If CHECK_INVALID_CHARSETS is non-zero, check for designation
|
|
1322 or invocation of an invalid character set and treat that as
|
|
1323 an unrecognized escape sequence.
|
|
1324
|
2367
|
1325 */
|
771
|
1326
|
|
1327 static int
|
|
1328 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_coding_stream *iso,
|
|
1329 unsigned char c, unsigned int *flags,
|
|
1330 int check_invalid_charsets)
|
|
1331 {
|
|
1332 /* (1) If we're at the end of a designation sequence, CS is the
|
|
1333 charset being designated and REG is the register to designate
|
|
1334 it to.
|
|
1335
|
|
1336 (2) If we're at the end of a locking-shift sequence, REG is
|
|
1337 the register to invoke and HALF (0 == left, 1 == right) is
|
|
1338 the half to invoke it into.
|
|
1339
|
|
1340 (3) If we're at the end of a single-shift sequence, REG is
|
|
1341 the register to invoke. */
|
|
1342 Lisp_Object cs = Qnil;
|
|
1343 int reg, half;
|
|
1344
|
|
1345 /* NOTE: This code does goto's all over the fucking place.
|
|
1346 The reason for this is that we're basically implementing
|
|
1347 a state machine here, and hierarchical languages like C
|
|
1348 don't really provide a clean way of doing this. */
|
|
1349
|
|
1350 if (! (*flags & ISO_STATE_ESCAPE))
|
|
1351 /* At beginning of escape sequence; we need to reset our
|
|
1352 escape-state variables. */
|
|
1353 iso->esc = ISO_ESC_NOTHING;
|
|
1354
|
|
1355 iso->output_literally = 0;
|
|
1356 iso->output_direction_sequence = 0;
|
|
1357
|
|
1358 switch (iso->esc)
|
|
1359 {
|
|
1360 case ISO_ESC_NOTHING:
|
|
1361 iso->esc_bytes_index = 0;
|
|
1362 switch (c)
|
|
1363 {
|
|
1364 case ISO_CODE_ESC: /* Start escape sequence */
|
|
1365 *flags |= ISO_STATE_ESCAPE;
|
|
1366 iso->esc = ISO_ESC;
|
|
1367 goto not_done;
|
|
1368
|
|
1369 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
|
|
1370 *flags |= ISO_STATE_ESCAPE;
|
|
1371 iso->esc = ISO_ESC_5_11;
|
|
1372 goto not_done;
|
|
1373
|
|
1374 case ISO_CODE_SO: /* locking shift 1 */
|
|
1375 reg = 1; half = 0;
|
|
1376 goto locking_shift;
|
|
1377 case ISO_CODE_SI: /* locking shift 0 */
|
|
1378 reg = 0; half = 0;
|
|
1379 goto locking_shift;
|
|
1380
|
|
1381 case ISO_CODE_SS2: /* single shift */
|
|
1382 reg = 2;
|
|
1383 goto single_shift;
|
|
1384 case ISO_CODE_SS3: /* single shift */
|
|
1385 reg = 3;
|
|
1386 goto single_shift;
|
|
1387
|
|
1388 default: /* Other control characters */
|
|
1389 error:
|
|
1390 *flags &= ISO_STATE_LOCK;
|
|
1391 return 0;
|
|
1392 }
|
|
1393
|
|
1394 case ISO_ESC:
|
3439
|
1395
|
|
1396 /* The only available ISO 2022 sequence in UTF-8 mode is ESC % @, to
|
|
1397 exit from it. If we see any other escape sequence, pass it through
|
|
1398 in the error handler. */
|
|
1399 if (*flags & ISO_STATE_UTF_8 && '%' != c)
|
|
1400 {
|
|
1401 return 0;
|
|
1402 }
|
|
1403
|
771
|
1404 switch (c)
|
|
1405 {
|
|
1406 /**** single shift ****/
|
|
1407
|
|
1408 case 'N': /* single shift 2 */
|
|
1409 reg = 2;
|
|
1410 goto single_shift;
|
|
1411 case 'O': /* single shift 3 */
|
|
1412 reg = 3;
|
|
1413 goto single_shift;
|
|
1414
|
|
1415 /**** locking shift ****/
|
|
1416
|
|
1417 case '~': /* locking shift 1 right */
|
|
1418 reg = 1; half = 1;
|
|
1419 goto locking_shift;
|
|
1420 case 'n': /* locking shift 2 */
|
|
1421 reg = 2; half = 0;
|
|
1422 goto locking_shift;
|
|
1423 case '}': /* locking shift 2 right */
|
|
1424 reg = 2; half = 1;
|
|
1425 goto locking_shift;
|
|
1426 case 'o': /* locking shift 3 */
|
|
1427 reg = 3; half = 0;
|
|
1428 goto locking_shift;
|
|
1429 case '|': /* locking shift 3 right */
|
|
1430 reg = 3; half = 1;
|
|
1431 goto locking_shift;
|
|
1432
|
|
1433 /**** composite ****/
|
|
1434
|
|
1435 #ifdef ENABLE_COMPOSITE_CHARS
|
|
1436 case '0':
|
|
1437 iso->esc = ISO_ESC_START_COMPOSITE;
|
|
1438 *flags = (*flags & ISO_STATE_LOCK) |
|
|
1439 ISO_STATE_COMPOSITE;
|
|
1440 return 1;
|
|
1441
|
|
1442 case '1':
|
|
1443 iso->esc = ISO_ESC_END_COMPOSITE;
|
|
1444 *flags = (*flags & ISO_STATE_LOCK) &
|
|
1445 ~ISO_STATE_COMPOSITE;
|
|
1446 return 1;
|
|
1447 #else
|
|
1448 case '0': case '1': case '2': case '3': case '4':
|
|
1449 /* We simply return a flag indicating that some composite
|
|
1450 escape was seen. The caller will use the particular
|
|
1451 character to encode the appropriate "composite hack"
|
|
1452 character out of Vcharset_composite, so that we will
|
|
1453 preserve these values on output. */
|
|
1454 iso->esc = ISO_ESC_START_COMPOSITE;
|
|
1455 *flags &= ISO_STATE_LOCK;
|
|
1456 return 1;
|
|
1457 #endif /* ENABLE_COMPOSITE_CHARS */
|
|
1458
|
|
1459 /**** directionality ****/
|
|
1460
|
|
1461 case '[':
|
|
1462 iso->esc = ISO_ESC_5_11;
|
|
1463 goto not_done;
|
|
1464
|
|
1465 /**** designation ****/
|
|
1466
|
|
1467 case '$': /* multibyte charset prefix */
|
|
1468 iso->esc = ISO_ESC_2_4;
|
|
1469 goto not_done;
|
|
1470
|
3439
|
1471 case '%': /* Prefix to an escape to or from Unicode. */
|
|
1472 iso->esc = ISO_ESC_2_5;
|
|
1473 goto not_done;
|
|
1474
|
771
|
1475 default:
|
|
1476 if (0x28 <= c && c <= 0x2F)
|
|
1477 {
|
|
1478 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
|
|
1479 goto not_done;
|
|
1480 }
|
|
1481
|
|
1482 /* This function is called with CODESYS equal to nil when
|
|
1483 doing coding-system detection. */
|
|
1484 if (!NILP (codesys)
|
|
1485 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
|
|
1486 && fit_to_be_escape_quoted (c))
|
|
1487 {
|
|
1488 iso->esc = ISO_ESC_LITERAL;
|
|
1489 *flags &= ISO_STATE_LOCK;
|
|
1490 return 1;
|
|
1491 }
|
|
1492
|
|
1493 /* bzzzt! */
|
|
1494 goto error;
|
|
1495 }
|
|
1496
|
3439
|
1497 /* ISO-IR 196 UTF-8 support. */
|
|
1498 case ISO_ESC_2_5:
|
|
1499 if ('G' == c)
|
|
1500 {
|
|
1501 /* Activate UTF-8 mode. */
|
|
1502 *flags &= ISO_STATE_LOCK;
|
|
1503 *flags |= ISO_STATE_UTF_8;
|
|
1504 iso->esc = ISO_ESC_NOTHING;
|
|
1505 return 1;
|
|
1506 }
|
|
1507 else if ('@' == c)
|
|
1508 {
|
|
1509 /* Deactive UTF-8 mode. */
|
|
1510 *flags &= ISO_STATE_LOCK;
|
|
1511 *flags &= ~(ISO_STATE_UTF_8);
|
|
1512 iso->esc = ISO_ESC_NOTHING;
|
|
1513 return 1;
|
|
1514 }
|
|
1515 else
|
|
1516 {
|
|
1517 /* Oops, we don't support the other UTF-? coding systems within
|
|
1518 ISO 2022, only in their own context. */
|
|
1519 goto error;
|
|
1520 }
|
771
|
1521 /**** directionality ****/
|
|
1522
|
|
1523 case ISO_ESC_5_11: /* ISO6429 direction control */
|
|
1524 if (c == ']')
|
|
1525 {
|
|
1526 *flags &= (ISO_STATE_LOCK & ~ISO_STATE_R2L);
|
|
1527 goto directionality;
|
|
1528 }
|
|
1529 if (c == '0') iso->esc = ISO_ESC_5_11_0;
|
|
1530 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
|
|
1531 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
|
|
1532 else goto error;
|
|
1533 goto not_done;
|
|
1534
|
|
1535 case ISO_ESC_5_11_0:
|
|
1536 if (c == ']')
|
|
1537 {
|
|
1538 *flags &= (ISO_STATE_LOCK & ~ISO_STATE_R2L);
|
|
1539 goto directionality;
|
|
1540 }
|
|
1541 goto error;
|
|
1542
|
|
1543 case ISO_ESC_5_11_1:
|
|
1544 if (c == ']')
|
|
1545 {
|
|
1546 *flags = (ISO_STATE_LOCK & ~ISO_STATE_R2L);
|
|
1547 goto directionality;
|
|
1548 }
|
|
1549 goto error;
|
|
1550
|
|
1551 case ISO_ESC_5_11_2:
|
|
1552 if (c == ']')
|
|
1553 {
|
|
1554 *flags = (*flags & ISO_STATE_LOCK) | ISO_STATE_R2L;
|
|
1555 goto directionality;
|
|
1556 }
|
|
1557 goto error;
|
|
1558
|
|
1559 directionality:
|
|
1560 iso->esc = ISO_ESC_DIRECTIONALITY;
|
|
1561 /* Various junk here to attempt to preserve the direction sequences
|
|
1562 literally in the text if they would otherwise be swallowed due
|
|
1563 to invalid designations that don't show up as actual charset
|
|
1564 changes in the text. */
|
|
1565 if (iso->invalid_switch_dir)
|
|
1566 {
|
|
1567 /* We already inserted a direction switch literally into the
|
|
1568 text. We assume (#### this may not be right) that the
|
|
1569 next direction switch is the one going the other way,
|
|
1570 and we need to output that literally as well. */
|
|
1571 iso->output_literally = 1;
|
|
1572 iso->invalid_switch_dir = 0;
|
|
1573 }
|
|
1574 else
|
|
1575 {
|
|
1576 int jj;
|
|
1577
|
|
1578 /* If we are in the thrall of an invalid designation,
|
|
1579 then stick the directionality sequence literally into the
|
|
1580 output stream so it ends up in the original text again. */
|
|
1581 for (jj = 0; jj < 4; jj++)
|
|
1582 if (iso->invalid_designated[jj])
|
|
1583 break;
|
|
1584 if (jj < 4)
|
|
1585 {
|
|
1586 iso->output_literally = 1;
|
|
1587 iso->invalid_switch_dir = 1;
|
|
1588 }
|
|
1589 else
|
|
1590 /* Indicate that we haven't yet seen a valid designation,
|
|
1591 so that if a switch-dir is directly followed by an
|
|
1592 invalid designation, both get inserted literally. */
|
|
1593 iso->switched_dir_and_no_valid_charset_yet = 1;
|
|
1594 }
|
|
1595 return 1;
|
|
1596
|
|
1597
|
|
1598 /**** designation ****/
|
|
1599
|
|
1600 case ISO_ESC_2_4:
|
|
1601 if (0x28 <= c && c <= 0x2F)
|
|
1602 {
|
|
1603 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
|
|
1604 goto not_done;
|
|
1605 }
|
|
1606 if (0x40 <= c && c <= 0x42)
|
|
1607 {
|
|
1608 cs = charset_by_attributes_or_create_one (CHARSET_TYPE_94X94, c,
|
|
1609 *flags & ISO_STATE_R2L ?
|
|
1610 CHARSET_RIGHT_TO_LEFT :
|
|
1611 CHARSET_LEFT_TO_RIGHT);
|
|
1612 reg = 0;
|
|
1613 goto designated;
|
|
1614 }
|
|
1615 goto error;
|
|
1616
|
|
1617 default:
|
|
1618 {
|
|
1619 int type = -1;
|
|
1620
|
|
1621 if (iso->esc >= ISO_ESC_2_8 &&
|
|
1622 iso->esc <= ISO_ESC_2_15)
|
|
1623 {
|
|
1624 type = ((iso->esc >= ISO_ESC_2_12) ?
|
|
1625 CHARSET_TYPE_96 : CHARSET_TYPE_94);
|
|
1626 reg = (iso->esc - ISO_ESC_2_8) & 3;
|
|
1627 }
|
|
1628 else if (iso->esc >= ISO_ESC_2_4_8 &&
|
|
1629 iso->esc <= ISO_ESC_2_4_15)
|
|
1630 {
|
|
1631 type = ((iso->esc >= ISO_ESC_2_4_12) ?
|
|
1632 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
|
|
1633 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
|
|
1634 }
|
|
1635 else
|
|
1636 {
|
|
1637 /* Can this ever be reached? -slb */
|
2500
|
1638 ABORT ();
|
771
|
1639 goto error;
|
|
1640 }
|
|
1641
|
|
1642 if (c < '0' || c > '~' ||
|
|
1643 (c > 0x5F && (type == CHARSET_TYPE_94X94 ||
|
|
1644 type == CHARSET_TYPE_96X96)))
|
|
1645 goto error; /* bad final byte */
|
|
1646
|
|
1647 cs = charset_by_attributes_or_create_one (type, c,
|
|
1648 *flags & ISO_STATE_R2L ?
|
|
1649 CHARSET_RIGHT_TO_LEFT :
|
|
1650 CHARSET_LEFT_TO_RIGHT);
|
|
1651 goto designated;
|
|
1652 }
|
|
1653 }
|
|
1654
|
|
1655 not_done:
|
|
1656 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
|
|
1657 return -1;
|
|
1658
|
|
1659 single_shift:
|
|
1660 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
|
|
1661 /* can't invoke something that ain't there. */
|
|
1662 goto error;
|
|
1663 iso->esc = ISO_ESC_SINGLE_SHIFT;
|
|
1664 *flags &= ISO_STATE_LOCK;
|
|
1665 if (reg == 2)
|
|
1666 *flags |= ISO_STATE_SS2;
|
|
1667 else
|
|
1668 *flags |= ISO_STATE_SS3;
|
|
1669 return 1;
|
|
1670
|
|
1671 locking_shift:
|
|
1672 if (check_invalid_charsets &&
|
|
1673 !CHARSETP (iso->charset[reg]))
|
|
1674 /* can't invoke something that ain't there. */
|
|
1675 goto error;
|
|
1676 if (half)
|
|
1677 iso->register_right = reg;
|
|
1678 else
|
|
1679 iso->register_left = reg;
|
|
1680 *flags &= ISO_STATE_LOCK;
|
|
1681 iso->esc = ISO_ESC_LOCKING_SHIFT;
|
|
1682 return 1;
|
|
1683
|
|
1684 designated:
|
|
1685 if (NILP (cs) && check_invalid_charsets)
|
|
1686 {
|
2500
|
1687 ABORT ();
|
771
|
1688 /* #### This should never happen now that we automatically create
|
|
1689 temporary charsets as necessary. We should probably remove
|
|
1690 this code. --ben */
|
|
1691 iso->invalid_designated[reg] = 1;
|
|
1692 iso->charset[reg] = Vcharset_ascii;
|
|
1693 iso->esc = ISO_ESC_DESIGNATE;
|
|
1694 *flags &= ISO_STATE_LOCK;
|
|
1695 iso->output_literally = 1;
|
|
1696 if (iso->switched_dir_and_no_valid_charset_yet)
|
|
1697 {
|
|
1698 /* We encountered a switch-direction followed by an
|
|
1699 invalid designation. Ensure that the switch-direction
|
|
1700 gets outputted; otherwise it will probably get eaten
|
|
1701 when the text is written out again. */
|
|
1702 iso->switched_dir_and_no_valid_charset_yet = 0;
|
|
1703 iso->output_direction_sequence = 1;
|
|
1704 /* And make sure that the switch-dir going the other
|
|
1705 way gets outputted, as well. */
|
|
1706 iso->invalid_switch_dir = 1;
|
|
1707 }
|
|
1708 return 1;
|
|
1709 }
|
|
1710 /* This function is called with CODESYS equal to nil when
|
|
1711 doing coding-system detection. */
|
|
1712 if (!NILP (codesys))
|
|
1713 {
|
|
1714 charset_conversion_spec_dynarr *dyn =
|
|
1715 XCODING_SYSTEM_ISO2022_INPUT_CONV (codesys);
|
|
1716
|
|
1717 if (dyn)
|
|
1718 {
|
|
1719 int i;
|
|
1720
|
|
1721 for (i = 0; i < Dynarr_length (dyn); i++)
|
|
1722 {
|
|
1723 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
|
|
1724 if (EQ (cs, spec->from_charset))
|
|
1725 cs = spec->to_charset;
|
|
1726 }
|
|
1727 }
|
|
1728 }
|
|
1729
|
|
1730 iso->charset[reg] = cs;
|
|
1731 iso->esc = ISO_ESC_DESIGNATE;
|
|
1732 *flags &= ISO_STATE_LOCK;
|
|
1733 if (iso->invalid_designated[reg])
|
|
1734 {
|
|
1735 iso->invalid_designated[reg] = 0;
|
|
1736 iso->output_literally = 1;
|
|
1737 }
|
|
1738 if (iso->switched_dir_and_no_valid_charset_yet)
|
|
1739 iso->switched_dir_and_no_valid_charset_yet = 0;
|
|
1740 return 1;
|
|
1741 }
|
|
1742
|
|
1743 /* If FLAGS is a null pointer or specifies right-to-left motion,
|
|
1744 output a switch-dir-to-left-to-right sequence to DST.
|
|
1745 Also update FLAGS if it is not a null pointer.
|
|
1746 If INTERNAL_P is set, we are outputting in internal format and
|
|
1747 need to handle the CSI differently. */
|
|
1748
|
|
1749 static void
|
|
1750 restore_left_to_right_direction (Lisp_Object codesys,
|
|
1751 unsigned_char_dynarr *dst,
|
|
1752 unsigned int *flags,
|
|
1753 int internal_p)
|
|
1754 {
|
|
1755 if (!flags || (*flags & ISO_STATE_R2L))
|
|
1756 {
|
|
1757 if (XCODING_SYSTEM_ISO2022_SEVEN (codesys))
|
|
1758 {
|
|
1759 Dynarr_add (dst, ISO_CODE_ESC);
|
|
1760 Dynarr_add (dst, '[');
|
|
1761 }
|
|
1762 else if (internal_p)
|
|
1763 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
|
|
1764 else
|
|
1765 Dynarr_add (dst, ISO_CODE_CSI);
|
|
1766 Dynarr_add (dst, '0');
|
|
1767 Dynarr_add (dst, ']');
|
|
1768 if (flags)
|
|
1769 *flags &= ~ISO_STATE_R2L;
|
|
1770 }
|
|
1771 }
|
|
1772
|
|
1773 /* If FLAGS is a null pointer or specifies a direction different from
|
|
1774 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
|
|
1775 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
|
|
1776 sequence to DST. Also update FLAGS if it is not a null pointer.
|
|
1777 If INTERNAL_P is set, we are outputting in internal format and
|
|
1778 need to handle the CSI differently. */
|
|
1779
|
|
1780 static void
|
|
1781 ensure_correct_direction (int direction, Lisp_Object codesys,
|
|
1782 unsigned_char_dynarr *dst, unsigned int *flags,
|
|
1783 int internal_p)
|
|
1784 {
|
|
1785 if ((!flags || (*flags & ISO_STATE_R2L)) &&
|
|
1786 direction == CHARSET_LEFT_TO_RIGHT)
|
|
1787 restore_left_to_right_direction (codesys, dst, flags, internal_p);
|
|
1788 else if (!XCODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
|
|
1789 && (!flags || !(*flags & ISO_STATE_R2L)) &&
|
|
1790 direction == CHARSET_RIGHT_TO_LEFT)
|
|
1791 {
|
|
1792 if (XCODING_SYSTEM_ISO2022_SEVEN (codesys))
|
|
1793 {
|
|
1794 Dynarr_add (dst, ISO_CODE_ESC);
|
|
1795 Dynarr_add (dst, '[');
|
|
1796 }
|
|
1797 else if (internal_p)
|
|
1798 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
|
|
1799 else
|
|
1800 Dynarr_add (dst, ISO_CODE_CSI);
|
|
1801 Dynarr_add (dst, '2');
|
|
1802 Dynarr_add (dst, ']');
|
|
1803 if (flags)
|
|
1804 *flags |= ISO_STATE_R2L;
|
|
1805 }
|
|
1806 }
|
|
1807
|
4096
|
1808 /* Note that this name conflicts with a function in unicode.c. */
|
|
1809 static void
|
|
1810 decode_unicode_char (int ucs, unsigned_char_dynarr *dst)
|
|
1811 {
|
|
1812 Ibyte work[MAX_ICHAR_LEN];
|
|
1813 int len;
|
|
1814 Lisp_Object chr;
|
|
1815
|
|
1816 chr = Funicode_to_char(make_int(ucs), Qnil);
|
|
1817 assert (!NILP(chr));
|
|
1818 len = set_itext_ichar (work, XCHAR(chr));
|
|
1819 Dynarr_add_many (dst, work, len);
|
|
1820 }
|
|
1821
|
|
1822 #define DECODE_ERROR_OCTET(octet, dst) \
|
|
1823 decode_unicode_char ((octet) + UNICODE_ERROR_OCTET_RANGE_START, dst)
|
|
1824
|
|
1825 static inline void
|
|
1826 indicate_invalid_utf_8 (unsigned char indicated_length,
|
|
1827 unsigned char counter,
|
|
1828 int ch, unsigned_char_dynarr *dst)
|
|
1829 {
|
|
1830 Binbyte stored = indicated_length - counter;
|
|
1831 Binbyte mask = "\x00\x00\xC0\xE0\xF0\xF8\xFC"[indicated_length];
|
|
1832
|
|
1833 while (stored > 0)
|
|
1834 {
|
|
1835 DECODE_ERROR_OCTET (((ch >> (6 * (stored - 1))) & 0x3f) | mask,
|
|
1836 dst);
|
|
1837 mask = 0x80, stored--;
|
|
1838 }
|
|
1839 }
|
|
1840
|
771
|
1841 /* Convert ISO2022-format data to internal format. */
|
|
1842
|
|
1843 static Bytecount
|
|
1844 iso2022_decode (struct coding_stream *str, const UExtbyte *src,
|
|
1845 unsigned_char_dynarr *dst, Bytecount n)
|
|
1846 {
|
|
1847 unsigned int ch = str->ch;
|
|
1848 #ifdef ENABLE_COMPOSITE_CHARS
|
|
1849 unsigned_char_dynarr *real_dst = dst;
|
|
1850 #endif
|
|
1851 struct iso2022_coding_stream *data =
|
|
1852 CODING_STREAM_TYPE_DATA (str, iso2022);
|
|
1853 unsigned int flags = data->flags;
|
|
1854 Bytecount orign = n;
|
|
1855
|
|
1856 #ifdef ENABLE_COMPOSITE_CHARS
|
|
1857 if (flags & ISO_STATE_COMPOSITE)
|
|
1858 dst = data->composite_chars;
|
|
1859 #endif /* ENABLE_COMPOSITE_CHARS */
|
|
1860
|
|
1861 while (n--)
|
|
1862 {
|
|
1863 UExtbyte c = *src++;
|
|
1864 if (flags & ISO_STATE_ESCAPE)
|
|
1865 { /* Within ESC sequence */
|
|
1866 int retval = parse_iso2022_esc (str->codesys, data,
|
|
1867 c, &flags, 1);
|
|
1868
|
|
1869 if (retval)
|
|
1870 {
|
|
1871 switch (data->esc)
|
|
1872 {
|
|
1873 #ifdef ENABLE_COMPOSITE_CHARS
|
|
1874 case ISO_ESC_START_COMPOSITE:
|
|
1875 if (data->composite_chars)
|
|
1876 Dynarr_reset (data->composite_chars);
|
|
1877 else
|
|
1878 data->composite_chars = Dynarr_new (unsigned_char);
|
|
1879 dst = data->composite_chars;
|
|
1880 break;
|
|
1881 case ISO_ESC_END_COMPOSITE:
|
|
1882 {
|
867
|
1883 Ibyte comstr[MAX_ICHAR_LEN];
|
771
|
1884 Bytecount len;
|
867
|
1885 Ichar emch = lookup_composite_char (Dynarr_atp (dst, 0),
|
771
|
1886 Dynarr_length (dst));
|
|
1887 dst = real_dst;
|
867
|
1888 len = set_itext_ichar (comstr, emch);
|
771
|
1889 Dynarr_add_many (dst, comstr, len);
|
|
1890 break;
|
|
1891 }
|
|
1892 #else
|
|
1893 case ISO_ESC_START_COMPOSITE:
|
|
1894 {
|
867
|
1895 Ibyte comstr[MAX_ICHAR_LEN];
|
771
|
1896 Bytecount len;
|
867
|
1897 Ichar emch = make_ichar (Vcharset_composite, c - '0' + ' ',
|
771
|
1898 0);
|
867
|
1899 len = set_itext_ichar (comstr, emch);
|
771
|
1900 Dynarr_add_many (dst, comstr, len);
|
|
1901 break;
|
|
1902 }
|
|
1903 #endif /* ENABLE_COMPOSITE_CHARS */
|
|
1904
|
|
1905 case ISO_ESC_LITERAL:
|
|
1906 DECODE_ADD_BINARY_CHAR (c, dst);
|
|
1907 break;
|
|
1908
|
|
1909 default:
|
|
1910 /* Everything else handled already */
|
|
1911 break;
|
|
1912 }
|
|
1913 }
|
|
1914
|
|
1915 /* Attempted error recovery. */
|
|
1916 if (data->output_direction_sequence)
|
|
1917 ensure_correct_direction (flags & ISO_STATE_R2L ?
|
|
1918 CHARSET_RIGHT_TO_LEFT :
|
|
1919 CHARSET_LEFT_TO_RIGHT,
|
|
1920 str->codesys, dst, 0, 1);
|
|
1921 /* More error recovery. */
|
|
1922 if (!retval || data->output_literally)
|
|
1923 {
|
|
1924 /* Output the (possibly invalid) sequence */
|
|
1925 int i;
|
|
1926 for (i = 0; i < data->esc_bytes_index; i++)
|
|
1927 DECODE_ADD_BINARY_CHAR (data->esc_bytes[i], dst);
|
|
1928 flags &= ISO_STATE_LOCK;
|
|
1929 if (!retval)
|
|
1930 n++, src--;/* Repeat the loop with the same character. */
|
|
1931 else
|
|
1932 {
|
|
1933 /* No sense in reprocessing the final byte of the
|
|
1934 escape sequence; it could mess things up anyway.
|
|
1935 Just add it now. */
|
|
1936 DECODE_ADD_BINARY_CHAR (c, dst);
|
|
1937 }
|
|
1938 }
|
|
1939 ch = 0;
|
|
1940 }
|
3439
|
1941 else if (flags & ISO_STATE_UTF_8)
|
|
1942 {
|
|
1943 unsigned char counter = data->counter;
|
4096
|
1944 unsigned char indicated_length = data->indicated_length;
|
3439
|
1945
|
|
1946 if (ISO_CODE_ESC == c)
|
|
1947 {
|
|
1948 /* Allow the escape sequence parser to end the UTF-8 state. */
|
|
1949 flags |= ISO_STATE_ESCAPE;
|
|
1950 data->esc = ISO_ESC;
|
|
1951 data->esc_bytes_index = 1;
|
|
1952 continue;
|
|
1953 }
|
|
1954
|
4096
|
1955 if (0 == counter)
|
|
1956 {
|
|
1957 if (0 == (c & 0x80))
|
|
1958 {
|
|
1959 /* ASCII. */
|
|
1960 decode_unicode_char (c, dst);
|
|
1961 }
|
|
1962 else if (0 == (c & 0x40))
|
|
1963 {
|
|
1964 /* Highest bit set, second highest not--there's
|
|
1965 something wrong. */
|
|
1966 DECODE_ERROR_OCTET (c, dst);
|
|
1967 }
|
|
1968 else if (0 == (c & 0x20))
|
|
1969 {
|
|
1970 ch = c & 0x1f;
|
|
1971 counter = 1;
|
|
1972 indicated_length = 2;
|
|
1973 }
|
|
1974 else if (0 == (c & 0x10))
|
|
1975 {
|
|
1976 ch = c & 0x0f;
|
|
1977 counter = 2;
|
|
1978 indicated_length = 3;
|
|
1979 }
|
|
1980 else if (0 == (c & 0x08))
|
|
1981 {
|
|
1982 ch = c & 0x0f;
|
|
1983 counter = 3;
|
|
1984 indicated_length = 4;
|
|
1985 }
|
|
1986 /* We support lengths longer than 4 here, since we want to
|
|
1987 represent UTF-8 error chars as distinct from the
|
|
1988 corresponding ISO 8859-1 characters in escape-quoted.
|
|
1989
|
|
1990 However, we can't differentiate UTF-8 error chars as
|
|
1991 written to disk, and UTF-8 errors in escape-quoted. This
|
|
1992 is not a big problem;
|
|
1993 non-Unicode-chars-encoded-as-UTF-8-in-ISO-2022 is not
|
|
1994 deployed, in practice, so if such a sequence of octets
|
|
1995 occurs, XEmacs generated it. */
|
|
1996 else if (0 == (c & 0x04))
|
|
1997 {
|
|
1998 ch = c & 0x03;
|
|
1999 counter = 4;
|
|
2000 indicated_length = 5;
|
|
2001 }
|
|
2002 else if (0 == (c & 0x02))
|
|
2003 {
|
|
2004 ch = c & 0x01;
|
|
2005 counter = 5;
|
|
2006 indicated_length = 6;
|
|
2007 }
|
|
2008 else
|
|
2009 {
|
|
2010 /* #xFF is not a valid leading byte in any form of
|
|
2011 UTF-8. */
|
|
2012 DECODE_ERROR_OCTET (c, dst);
|
|
2013
|
|
2014 }
|
|
2015 }
|
|
2016 else
|
|
2017 {
|
|
2018 /* counter != 0 */
|
|
2019 if ((0 == (c & 0x80)) || (0 != (c & 0x40)))
|
|
2020 {
|
|
2021 indicate_invalid_utf_8(indicated_length,
|
|
2022 counter,
|
|
2023 ch, dst);
|
|
2024 if (c & 0x80)
|
|
2025 {
|
|
2026 DECODE_ERROR_OCTET (c, dst);
|
|
2027 }
|
|
2028 else
|
|
2029 {
|
|
2030 /* The character just read is ASCII. Treat it as
|
|
2031 such. */
|
|
2032 decode_unicode_char (c, dst);
|
|
2033 }
|
|
2034 ch = 0;
|
|
2035 counter = 0;
|
|
2036 }
|
|
2037 else
|
|
2038 {
|
|
2039 ch = (ch << 6) | (c & 0x3f);
|
|
2040 counter--;
|
|
2041
|
|
2042 /* Just processed the final byte. Emit the character. */
|
|
2043 if (!counter)
|
|
2044 {
|
|
2045 /* Don't accept over-long sequences, or surrogates. */
|
|
2046 if ((ch < 0x80) ||
|
|
2047 ((ch < 0x800) && indicated_length > 2) ||
|
|
2048 ((ch < 0x10000) && indicated_length > 3) ||
|
|
2049 /* We accept values above #x110000 in
|
|
2050 escape-quoted, though not in UTF-8. */
|
|
2051 /* (ch > 0x110000) || */
|
|
2052 valid_utf_16_surrogate(ch))
|
|
2053 {
|
|
2054 indicate_invalid_utf_8(indicated_length,
|
|
2055 counter,
|
|
2056 ch, dst);
|
|
2057 }
|
|
2058 else
|
|
2059 {
|
|
2060 decode_unicode_char (ch, dst);
|
|
2061 }
|
|
2062 ch = 0;
|
|
2063 }
|
|
2064 }
|
|
2065 }
|
|
2066
|
|
2067 if (str->eof && ch)
|
|
2068 {
|
|
2069 DECODE_ERROR_OCTET (ch, dst);
|
|
2070 ch = 0;
|
|
2071 }
|
3439
|
2072
|
|
2073 data->counter = counter;
|
4096
|
2074 data->indicated_length = indicated_length;
|
3439
|
2075 }
|
826
|
2076 else if (byte_c0_p (c) || byte_c1_p (c))
|
771
|
2077 { /* Control characters */
|
|
2078
|
|
2079 /***** Error-handling *****/
|
|
2080
|
|
2081 /* If we were in the middle of a character, dump out the
|
|
2082 partial character. */
|
|
2083 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
|
|
2084
|
|
2085 /* If we just saw a single-shift character, dump it out.
|
|
2086 This may dump out the wrong sort of single-shift character,
|
|
2087 but least it will give an indication that something went
|
|
2088 wrong. */
|
|
2089 if (flags & ISO_STATE_SS2)
|
|
2090 {
|
|
2091 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
|
|
2092 flags &= ~ISO_STATE_SS2;
|
|
2093 }
|
|
2094 if (flags & ISO_STATE_SS3)
|
|
2095 {
|
|
2096 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
|
|
2097 flags &= ~ISO_STATE_SS3;
|
|
2098 }
|
|
2099
|
|
2100 /***** Now handle the control characters. *****/
|
|
2101
|
|
2102 flags &= ISO_STATE_LOCK;
|
|
2103
|
|
2104 if (!parse_iso2022_esc (str->codesys, data, c, &flags, 1))
|
|
2105 DECODE_ADD_BINARY_CHAR (c, dst);
|
|
2106 }
|
|
2107 else
|
|
2108 { /* Graphic characters */
|
|
2109 Lisp_Object charset;
|
|
2110 int lb;
|
|
2111 int reg;
|
|
2112
|
|
2113 /* Now determine the charset. */
|
|
2114 reg = ((flags & ISO_STATE_SS2) ? 2
|
|
2115 : (flags & ISO_STATE_SS3) ? 3
|
826
|
2116 : !byte_ascii_p (c) ? data->register_right
|
771
|
2117 : data->register_left);
|
|
2118 charset = data->charset[reg];
|
|
2119
|
|
2120 /* Error checking: */
|
|
2121 if (! CHARSETP (charset)
|
|
2122 || data->invalid_designated[reg]
|
|
2123 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
|
|
2124 && XCHARSET_CHARS (charset) == 94))
|
|
2125 /* Mrmph. We are trying to invoke a register that has no
|
|
2126 or an invalid charset in it, or trying to add a character
|
|
2127 outside the range of the charset. Insert that char literally
|
|
2128 to preserve it for the output. */
|
|
2129 {
|
|
2130 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
|
|
2131 DECODE_ADD_BINARY_CHAR (c, dst);
|
|
2132 }
|
|
2133
|
|
2134 else
|
|
2135 {
|
|
2136 /* Things are probably hunky-dorey. */
|
|
2137
|
|
2138 /* Fetch reverse charset, maybe. */
|
|
2139 if (((flags & ISO_STATE_R2L) &&
|
|
2140 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
|
|
2141 ||
|
|
2142 (!(flags & ISO_STATE_R2L) &&
|
|
2143 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
|
|
2144 {
|
|
2145 Lisp_Object new_charset =
|
|
2146 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
|
|
2147 if (!NILP (new_charset))
|
|
2148 charset = new_charset;
|
|
2149 }
|
|
2150
|
|
2151 lb = XCHARSET_LEADING_BYTE (charset);
|
|
2152 switch (XCHARSET_REP_BYTES (charset))
|
|
2153 {
|
|
2154 case 1: /* ASCII */
|
|
2155 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
|
|
2156 Dynarr_add (dst, c & 0x7F);
|
|
2157 break;
|
|
2158
|
|
2159 case 2: /* one-byte official */
|
|
2160 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
|
|
2161 Dynarr_add (dst, lb);
|
|
2162 Dynarr_add (dst, c | 0x80);
|
|
2163 break;
|
|
2164
|
|
2165 case 3: /* one-byte private or two-byte official */
|
|
2166 if (XCHARSET_PRIVATE_P (charset))
|
|
2167 {
|
|
2168 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
|
|
2169 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
|
|
2170 Dynarr_add (dst, lb);
|
|
2171 Dynarr_add (dst, c | 0x80);
|
|
2172 }
|
|
2173 else
|
|
2174 {
|
|
2175 if (ch)
|
|
2176 {
|
|
2177 Dynarr_add (dst, lb);
|
|
2178 Dynarr_add (dst, ch | 0x80);
|
|
2179 Dynarr_add (dst, c | 0x80);
|
|
2180 ch = 0;
|
|
2181 }
|
|
2182 else
|
|
2183 ch = c;
|
|
2184 }
|
|
2185 break;
|
|
2186
|
|
2187 default: /* two-byte private */
|
|
2188 if (ch)
|
|
2189 {
|
|
2190 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
|
|
2191 Dynarr_add (dst, lb);
|
|
2192 Dynarr_add (dst, ch | 0x80);
|
|
2193 Dynarr_add (dst, c | 0x80);
|
|
2194 ch = 0;
|
|
2195 }
|
|
2196 else
|
|
2197 ch = c;
|
|
2198 }
|
|
2199 }
|
|
2200
|
|
2201 if (!ch)
|
|
2202 flags &= ISO_STATE_LOCK;
|
|
2203 }
|
|
2204
|
|
2205 }
|
|
2206
|
|
2207 if (str->eof)
|
|
2208 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
|
|
2209
|
|
2210 data->flags = flags;
|
|
2211 str->ch = ch;
|
|
2212 return orign;
|
|
2213 }
|
|
2214
|
|
2215
|
|
2216 /***** ISO2022 encoder *****/
|
|
2217
|
|
2218 /* Designate CHARSET into register REG. */
|
|
2219
|
|
2220 static void
|
|
2221 iso2022_designate (Lisp_Object charset, int reg,
|
|
2222 struct coding_stream *str, unsigned_char_dynarr *dst)
|
|
2223 {
|
|
2224 static const char inter94[] = "()*+";
|
|
2225 static const char inter96[] = ",-./";
|
|
2226 int type;
|
|
2227 unsigned char final;
|
|
2228 struct iso2022_coding_stream *data =
|
|
2229 CODING_STREAM_TYPE_DATA (str, iso2022);
|
|
2230 Lisp_Object old_charset = data->charset[reg];
|
|
2231
|
|
2232 data->charset[reg] = charset;
|
|
2233 if (!CHARSETP (charset))
|
|
2234 /* charset might be an initial nil or t. */
|
|
2235 return;
|
|
2236 type = XCHARSET_TYPE (charset);
|
|
2237 final = XCHARSET_FINAL (charset);
|
|
2238 if (!data->force_charset_on_output[reg] &&
|
|
2239 CHARSETP (old_charset) &&
|
|
2240 XCHARSET_TYPE (old_charset) == type &&
|
|
2241 XCHARSET_FINAL (old_charset) == final)
|
|
2242 return;
|
|
2243
|
|
2244 data->force_charset_on_output[reg] = 0;
|
|
2245
|
|
2246 {
|
|
2247 charset_conversion_spec_dynarr *dyn =
|
|
2248 XCODING_SYSTEM_ISO2022_OUTPUT_CONV (str->codesys);
|
|
2249
|
|
2250 if (dyn)
|
|
2251 {
|
|
2252 int i;
|
|
2253
|
|
2254 for (i = 0; i < Dynarr_length (dyn); i++)
|
|
2255 {
|
|
2256 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
|
|
2257 if (EQ (charset, spec->from_charset))
|
|
2258 charset = spec->to_charset;
|
|
2259 }
|
|
2260 }
|
|
2261 }
|
|
2262
|
|
2263 Dynarr_add (dst, ISO_CODE_ESC);
|
3439
|
2264
|
771
|
2265 switch (type)
|
|
2266 {
|
|
2267 case CHARSET_TYPE_94:
|
|
2268 Dynarr_add (dst, inter94[reg]);
|
|
2269 break;
|
|
2270 case CHARSET_TYPE_96:
|
|
2271 Dynarr_add (dst, inter96[reg]);
|
|
2272 break;
|
|
2273 case CHARSET_TYPE_94X94:
|
|
2274 Dynarr_add (dst, '$');
|
|
2275 if (reg != 0
|
|
2276 || !(XCODING_SYSTEM_ISO2022_SHORT (str->codesys))
|
|
2277 || final < '@'
|
|
2278 || final > 'B')
|
|
2279 Dynarr_add (dst, inter94[reg]);
|
|
2280 break;
|
|
2281 case CHARSET_TYPE_96X96:
|
|
2282 Dynarr_add (dst, '$');
|
|
2283 Dynarr_add (dst, inter96[reg]);
|
|
2284 break;
|
|
2285 }
|
|
2286 Dynarr_add (dst, final);
|
|
2287 }
|
|
2288
|
|
2289 static void
|
|
2290 ensure_normal_shift (struct coding_stream *str, unsigned_char_dynarr *dst)
|
|
2291 {
|
|
2292 struct iso2022_coding_stream *data =
|
|
2293 CODING_STREAM_TYPE_DATA (str, iso2022);
|
|
2294
|
|
2295 if (data->register_left != 0)
|
|
2296 {
|
|
2297 Dynarr_add (dst, ISO_CODE_SI);
|
|
2298 data->register_left = 0;
|
|
2299 }
|
|
2300 }
|
|
2301
|
|
2302 static void
|
|
2303 ensure_shift_out (struct coding_stream *str, unsigned_char_dynarr *dst)
|
|
2304 {
|
|
2305 struct iso2022_coding_stream *data =
|
|
2306 CODING_STREAM_TYPE_DATA (str, iso2022);
|
|
2307
|
|
2308 if (data->register_left != 1)
|
|
2309 {
|
|
2310 Dynarr_add (dst, ISO_CODE_SO);
|
|
2311 data->register_left = 1;
|
|
2312 }
|
|
2313 }
|
|
2314
|
|
2315 /* Convert internally-formatted data to ISO2022 format. */
|
|
2316
|
|
2317 static Bytecount
|
867
|
2318 iso2022_encode (struct coding_stream *str, const Ibyte *src,
|
771
|
2319 unsigned_char_dynarr *dst, Bytecount n)
|
|
2320 {
|
|
2321 unsigned char charmask;
|
867
|
2322 Ibyte c;
|
771
|
2323 unsigned char char_boundary;
|
|
2324 unsigned int ch = str->ch;
|
|
2325 Lisp_Object codesys = str->codesys;
|
|
2326 int i;
|
|
2327 Lisp_Object charset;
|
|
2328 int half;
|
|
2329 struct iso2022_coding_stream *data =
|
|
2330 CODING_STREAM_TYPE_DATA (str, iso2022);
|
|
2331 unsigned int flags = data->flags;
|
|
2332 Bytecount orign = n;
|
|
2333
|
|
2334 #ifdef ENABLE_COMPOSITE_CHARS
|
|
2335 /* flags for handling composite chars. We do a little switcheroo
|
|
2336 on the source while we're outputting the composite char. */
|
|
2337 Bytecount saved_n = 0;
|
867
|
2338 const Ibyte *saved_src = NULL;
|
771
|
2339 int in_composite = 0;
|
|
2340 #endif /* ENABLE_COMPOSITE_CHARS */
|
|
2341
|
|
2342 char_boundary = data->current_char_boundary;
|
|
2343 charset = data->current_charset;
|
|
2344 half = data->current_half;
|
|
2345
|
|
2346 #ifdef ENABLE_COMPOSITE_CHARS
|
|
2347 back_to_square_n:
|
|
2348 #endif
|
|
2349 while (n--)
|
|
2350 {
|
|
2351 c = *src++;
|
|
2352
|
826
|
2353 if (byte_ascii_p (c))
|
771
|
2354 { /* Processing ASCII character */
|
|
2355 ch = 0;
|
|
2356
|
3439
|
2357 if (flags & ISO_STATE_UTF_8)
|
|
2358 {
|
|
2359 Dynarr_add (dst, ISO_CODE_ESC);
|
|
2360 Dynarr_add (dst, '%');
|
|
2361 Dynarr_add (dst, '@');
|
|
2362 flags &= ~(ISO_STATE_UTF_8);
|
|
2363 }
|
|
2364
|
771
|
2365 restore_left_to_right_direction (codesys, dst, &flags, 0);
|
|
2366
|
|
2367 /* Make sure G0 contains ASCII */
|
|
2368 if ((c > ' ' && c < ISO_CODE_DEL) ||
|
|
2369 !XCODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
|
|
2370 {
|
|
2371 ensure_normal_shift (str, dst);
|
|
2372 iso2022_designate (Vcharset_ascii, 0, str, dst);
|
|
2373 }
|
|
2374
|
|
2375 /* If necessary, restore everything to the default state
|
|
2376 at end-of-line */
|
|
2377 if (!(XCODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
|
|
2378 {
|
|
2379 /* NOTE: CRLF encoding happens *BEFORE* other encoding.
|
|
2380 Thus, even though we're working with internal-format
|
|
2381 data, there may be CR's or CRLF sequences representing
|
|
2382 newlines. */
|
|
2383 if (c == '\r' || (c == '\n' && !(flags & ISO_STATE_CR)))
|
|
2384 {
|
|
2385 restore_left_to_right_direction (codesys, dst, &flags, 0);
|
|
2386
|
|
2387 ensure_normal_shift (str, dst);
|
|
2388
|
|
2389 for (i = 0; i < 4; i++)
|
|
2390 {
|
|
2391 Lisp_Object initial_charset =
|
|
2392 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
|
|
2393 iso2022_designate (initial_charset, i, str, dst);
|
|
2394 }
|
|
2395 }
|
|
2396 if (c == '\r')
|
|
2397 flags |= ISO_STATE_CR;
|
|
2398 else
|
|
2399 flags &= ~ISO_STATE_CR;
|
|
2400 }
|
|
2401
|
|
2402 if (XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
|
|
2403 && fit_to_be_escape_quoted (c))
|
|
2404 Dynarr_add (dst, ISO_CODE_ESC);
|
|
2405 Dynarr_add (dst, c);
|
|
2406 char_boundary = 1;
|
|
2407 }
|
867
|
2408 else if (ibyte_leading_byte_p (c) || ibyte_leading_byte_p (ch))
|
771
|
2409 { /* Processing Leading Byte */
|
|
2410 ch = 0;
|
826
|
2411 charset = charset_by_leading_byte (c);
|
|
2412 if (leading_byte_prefix_p (c))
|
3439
|
2413 {
|
|
2414 ch = c;
|
|
2415 }
|
|
2416 else if (XCHARSET_ENCODE_AS_UTF_8 (charset))
|
|
2417 {
|
|
2418 assert (!EQ (charset, Vcharset_control_1)
|
|
2419 && !EQ (charset, Vcharset_composite));
|
|
2420
|
|
2421 /* If the character set is to be encoded as UTF-8, the escape
|
|
2422 is always the same. */
|
|
2423 if (!(flags & ISO_STATE_UTF_8))
|
|
2424 {
|
|
2425 Dynarr_add (dst, ISO_CODE_ESC);
|
|
2426 Dynarr_add (dst, '%');
|
|
2427 Dynarr_add (dst, 'G');
|
|
2428 flags |= ISO_STATE_UTF_8;
|
|
2429 }
|
|
2430 }
|
771
|
2431 else if (!EQ (charset, Vcharset_control_1)
|
|
2432 && !EQ (charset, Vcharset_composite))
|
|
2433 {
|
|
2434 int reg;
|
|
2435
|
3439
|
2436 /* End the UTF-8 state. */
|
|
2437 if (flags & ISO_STATE_UTF_8)
|
|
2438 {
|
|
2439 Dynarr_add (dst, ISO_CODE_ESC);
|
|
2440 Dynarr_add (dst, '%');
|
|
2441 Dynarr_add (dst, '@');
|
|
2442 flags &= ~(ISO_STATE_UTF_8);
|
|
2443 }
|
|
2444
|
771
|
2445 ensure_correct_direction (XCHARSET_DIRECTION (charset),
|
|
2446 codesys, dst, &flags, 0);
|
|
2447
|
|
2448 /* Now determine which register to use. */
|
|
2449 reg = -1;
|
|
2450 for (i = 0; i < 4; i++)
|
|
2451 {
|
|
2452 if (EQ (charset, data->charset[i]) ||
|
|
2453 EQ (charset,
|
|
2454 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
|
|
2455 {
|
|
2456 reg = i;
|
|
2457 break;
|
|
2458 }
|
|
2459 }
|
|
2460
|
|
2461 if (reg == -1)
|
|
2462 {
|
|
2463 if (XCHARSET_GRAPHIC (charset) != 0)
|
|
2464 {
|
|
2465 if (!NILP (data->charset[1]) &&
|
|
2466 (!XCODING_SYSTEM_ISO2022_SEVEN (codesys) ||
|
|
2467 XCODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
|
|
2468 reg = 1;
|
|
2469 else if (!NILP (data->charset[2]))
|
|
2470 reg = 2;
|
|
2471 else if (!NILP (data->charset[3]))
|
|
2472 reg = 3;
|
|
2473 else
|
|
2474 reg = 0;
|
|
2475 }
|
|
2476 else
|
|
2477 reg = 0;
|
|
2478 }
|
|
2479
|
|
2480 iso2022_designate (charset, reg, str, dst);
|
|
2481
|
|
2482 /* Now invoke that register. */
|
|
2483 switch (reg)
|
|
2484 {
|
|
2485 case 0:
|
|
2486 ensure_normal_shift (str, dst);
|
|
2487 half = 0;
|
|
2488 break;
|
|
2489
|
|
2490 case 1:
|
|
2491 if (XCODING_SYSTEM_ISO2022_SEVEN (codesys))
|
|
2492 {
|
|
2493 ensure_shift_out (str, dst);
|
|
2494 half = 0;
|
|
2495 }
|
|
2496 else
|
|
2497 half = 1;
|
|
2498 break;
|
|
2499
|
|
2500 case 2:
|
|
2501 if (XCODING_SYSTEM_ISO2022_SEVEN (str->codesys))
|
|
2502 {
|
|
2503 Dynarr_add (dst, ISO_CODE_ESC);
|
|
2504 Dynarr_add (dst, 'N');
|
|
2505 half = 0;
|
|
2506 }
|
|
2507 else
|
|
2508 {
|
|
2509 Dynarr_add (dst, ISO_CODE_SS2);
|
|
2510 half = 1;
|
|
2511 }
|
|
2512 break;
|
|
2513
|
|
2514 case 3:
|
|
2515 if (XCODING_SYSTEM_ISO2022_SEVEN (str->codesys))
|
|
2516 {
|
|
2517 Dynarr_add (dst, ISO_CODE_ESC);
|
|
2518 Dynarr_add (dst, 'O');
|
|
2519 half = 0;
|
|
2520 }
|
|
2521 else
|
|
2522 {
|
|
2523 Dynarr_add (dst, ISO_CODE_SS3);
|
|
2524 half = 1;
|
|
2525 }
|
|
2526 break;
|
|
2527
|
|
2528 default:
|
2500
|
2529 ABORT ();
|
771
|
2530 }
|
|
2531 }
|
|
2532 char_boundary = 0;
|
|
2533 }
|
|
2534 else
|
|
2535 { /* Processing Non-ASCII character */
|
|
2536 charmask = (half == 0 ? 0x7F : 0xFF);
|
|
2537 char_boundary = 1;
|
|
2538 if (EQ (charset, Vcharset_control_1))
|
|
2539 {
|
|
2540 if (XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
|
|
2541 && fit_to_be_escape_quoted (c))
|
|
2542 Dynarr_add (dst, ISO_CODE_ESC);
|
|
2543 /* you asked for it ... */
|
|
2544 Dynarr_add (dst, c - 0x20);
|
|
2545 }
|
|
2546 #ifndef ENABLE_COMPOSITE_CHARS
|
|
2547 else if (EQ (charset, Vcharset_composite))
|
|
2548 {
|
|
2549 if (c >= 160 || c <= 164) /* Someone might have stuck in
|
|
2550 something else */
|
|
2551 {
|
|
2552 Dynarr_add (dst, ISO_CODE_ESC);
|
|
2553 Dynarr_add (dst, c - 160 + '0');
|
|
2554 }
|
|
2555 }
|
|
2556 #endif
|
|
2557 else
|
|
2558 {
|
|
2559 switch (XCHARSET_REP_BYTES (charset))
|
|
2560 {
|
|
2561 case 2:
|
3439
|
2562 dynarr_add_2022_one_dimension (charset, c,
|
|
2563 charmask, dst);
|
771
|
2564 break;
|
|
2565 case 3:
|
|
2566 if (XCHARSET_PRIVATE_P (charset))
|
|
2567 {
|
3439
|
2568 dynarr_add_2022_one_dimension (charset, c,
|
|
2569 charmask, dst);
|
771
|
2570 ch = 0;
|
|
2571 }
|
|
2572 else if (ch)
|
|
2573 {
|
|
2574 #ifdef ENABLE_COMPOSITE_CHARS
|
|
2575 if (EQ (charset, Vcharset_composite))
|
|
2576 {
|
3439
|
2577 /* #### Hasn't been written to handle composite
|
|
2578 characters yet. */
|
|
2579 assert(!XCHARSET_ENCODE_AS_UTF_8 (charset))
|
771
|
2580 if (in_composite)
|
|
2581 {
|
|
2582 /* #### Bother! We don't know how to
|
|
2583 handle this yet. */
|
|
2584 Dynarr_add (dst, '~');
|
|
2585 }
|
|
2586 else
|
|
2587 {
|
867
|
2588 Ichar emch = make_ichar (Vcharset_composite,
|
771
|
2589 ch & 0x7F, c & 0x7F);
|
|
2590 Lisp_Object lstr = composite_char_string (emch);
|
|
2591 saved_n = n;
|
|
2592 saved_src = src;
|
|
2593 in_composite = 1;
|
|
2594 src = XSTRING_DATA (lstr);
|
|
2595 n = XSTRING_LENGTH (lstr);
|
|
2596 Dynarr_add (dst, ISO_CODE_ESC);
|
|
2597 Dynarr_add (dst, '0'); /* start composing */
|
|
2598 }
|
|
2599 }
|
|
2600 else
|
|
2601 #endif /* ENABLE_COMPOSITE_CHARS */
|
|
2602 {
|
3439
|
2603 dynarr_add_2022_two_dimensions (charset, c, ch,
|
|
2604 charmask, dst);
|
771
|
2605 }
|
|
2606 ch = 0;
|
|
2607 }
|
|
2608 else
|
|
2609 {
|
|
2610 ch = c;
|
|
2611 char_boundary = 0;
|
|
2612 }
|
|
2613 break;
|
|
2614 case 4:
|
|
2615 if (ch)
|
|
2616 {
|
3439
|
2617 dynarr_add_2022_two_dimensions (charset, c, ch,
|
|
2618 charmask, dst);
|
771
|
2619 ch = 0;
|
|
2620 }
|
|
2621 else
|
|
2622 {
|
|
2623 ch = c;
|
|
2624 char_boundary = 0;
|
|
2625 }
|
|
2626 break;
|
|
2627 default:
|
2500
|
2628 ABORT ();
|
771
|
2629 }
|
|
2630 }
|
|
2631 }
|
|
2632 }
|
|
2633
|
|
2634 #ifdef ENABLE_COMPOSITE_CHARS
|
|
2635 if (in_composite)
|
|
2636 {
|
|
2637 n = saved_n;
|
|
2638 src = saved_src;
|
|
2639 in_composite = 0;
|
|
2640 Dynarr_add (dst, ISO_CODE_ESC);
|
|
2641 Dynarr_add (dst, '1'); /* end composing */
|
|
2642 goto back_to_square_n; /* Wheeeeeeeee ..... */
|
|
2643 }
|
|
2644 #endif /* ENABLE_COMPOSITE_CHARS */
|
|
2645
|
|
2646 if (char_boundary && str->eof)
|
|
2647 {
|
|
2648 restore_left_to_right_direction (codesys, dst, &flags, 0);
|
|
2649 ensure_normal_shift (str, dst);
|
|
2650 for (i = 0; i < 4; i++)
|
|
2651 {
|
|
2652 Lisp_Object initial_charset =
|
|
2653 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
|
|
2654 iso2022_designate (initial_charset, i, str, dst);
|
|
2655 }
|
|
2656 }
|
|
2657
|
|
2658 data->flags = flags;
|
|
2659 str->ch = ch;
|
|
2660 data->current_char_boundary = char_boundary;
|
|
2661 data->current_charset = charset;
|
|
2662 data->current_half = half;
|
|
2663
|
|
2664 /* Verbum caro factum est! */
|
|
2665 return orign;
|
|
2666 }
|
|
2667
|
|
2668 static Bytecount
|
|
2669 iso2022_convert (struct coding_stream *str,
|
|
2670 const UExtbyte *src,
|
|
2671 unsigned_char_dynarr *dst, Bytecount n)
|
|
2672 {
|
|
2673 if (str->direction == CODING_DECODE)
|
|
2674 return iso2022_decode (str, src, dst, n);
|
|
2675 else
|
|
2676 return iso2022_encode (str, src, dst, n);
|
|
2677 }
|
|
2678
|
|
2679 static void
|
|
2680 iso2022_mark (Lisp_Object codesys)
|
|
2681 {
|
|
2682 int i;
|
|
2683
|
|
2684 for (i = 0; i < 4; i++)
|
|
2685 mark_object (XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
|
|
2686 if (XCODING_SYSTEM_ISO2022_INPUT_CONV (codesys))
|
|
2687 {
|
|
2688 for (i = 0;
|
|
2689 i < Dynarr_length (XCODING_SYSTEM_ISO2022_INPUT_CONV (codesys));
|
|
2690 i++)
|
|
2691 {
|
|
2692 struct charset_conversion_spec *ccs =
|
|
2693 Dynarr_atp (XCODING_SYSTEM_ISO2022_INPUT_CONV (codesys), i);
|
|
2694 mark_object (ccs->from_charset);
|
|
2695 mark_object (ccs->to_charset);
|
|
2696 }
|
|
2697 }
|
|
2698 if (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (codesys))
|
|
2699 {
|
|
2700 for (i = 0;
|
|
2701 i < Dynarr_length (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (codesys));
|
|
2702 i++)
|
|
2703 {
|
|
2704 struct charset_conversion_spec *ccs =
|
|
2705 Dynarr_atp (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (codesys), i);
|
|
2706 mark_object (ccs->from_charset);
|
|
2707 mark_object (ccs->to_charset);
|
|
2708 }
|
|
2709 }
|
|
2710 }
|
|
2711
|
|
2712 static void
|
|
2713 iso2022_finalize (Lisp_Object cs)
|
|
2714 {
|
|
2715 if (XCODING_SYSTEM_ISO2022_INPUT_CONV (cs))
|
|
2716 {
|
|
2717 Dynarr_free (XCODING_SYSTEM_ISO2022_INPUT_CONV (cs));
|
|
2718 XCODING_SYSTEM_ISO2022_INPUT_CONV (cs) = 0;
|
|
2719 }
|
|
2720 if (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (cs))
|
|
2721 {
|
|
2722 Dynarr_free (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (cs));
|
|
2723 XCODING_SYSTEM_ISO2022_OUTPUT_CONV (cs) = 0;
|
|
2724 }
|
|
2725 }
|
|
2726
|
|
2727 /* Given a list of charset conversion specs as specified in a Lisp
|
|
2728 program, parse it into STORE_HERE. */
|
|
2729
|
|
2730 static void
|
|
2731 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
|
|
2732 Lisp_Object spec_list)
|
|
2733 {
|
2367
|
2734 EXTERNAL_LIST_LOOP_2 (car, spec_list)
|
771
|
2735 {
|
|
2736 Lisp_Object from, to;
|
|
2737 struct charset_conversion_spec spec;
|
|
2738
|
|
2739 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
|
|
2740 invalid_argument ("Invalid charset conversion spec", car);
|
|
2741 from = Fget_charset (XCAR (car));
|
|
2742 to = Fget_charset (XCAR (XCDR (car)));
|
|
2743 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
|
|
2744 invalid_operation_2
|
|
2745 ("Attempted conversion between different charset types",
|
|
2746 from, to);
|
|
2747 spec.from_charset = from;
|
|
2748 spec.to_charset = to;
|
|
2749
|
|
2750 Dynarr_add (store_here, spec);
|
|
2751 }
|
|
2752 }
|
|
2753
|
|
2754 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
|
|
2755 specs, return the equivalent as the Lisp programmer would see it.
|
|
2756
|
|
2757 If LOAD_HERE is 0, return Qnil. */
|
|
2758
|
|
2759 static Lisp_Object
|
|
2760 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here,
|
|
2761 int names)
|
|
2762 {
|
|
2763 int i;
|
|
2764 Lisp_Object result;
|
|
2765
|
|
2766 if (!load_here)
|
|
2767 return Qnil;
|
|
2768 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
|
|
2769 {
|
|
2770 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
|
|
2771 if (names)
|
|
2772 result = Fcons (list2 (XCHARSET_NAME (ccs->from_charset),
|
|
2773 XCHARSET_NAME (ccs->to_charset)), result);
|
|
2774 else
|
|
2775 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
|
|
2776 }
|
|
2777
|
|
2778 return Fnreverse (result);
|
|
2779 }
|
|
2780
|
|
2781 static int
|
|
2782 iso2022_putprop (Lisp_Object codesys,
|
|
2783 Lisp_Object key,
|
|
2784 Lisp_Object value)
|
|
2785 {
|
|
2786 #define FROB_INITIAL_CHARSET(charset_num) \
|
|
2787 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
|
|
2788 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
|
|
2789
|
|
2790 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
|
|
2791 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
|
|
2792 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
|
|
2793 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
|
|
2794
|
|
2795 #define FROB_FORCE_CHARSET(charset_num) \
|
|
2796 XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = \
|
|
2797 !NILP (value)
|
|
2798
|
|
2799 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
|
|
2800 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
|
|
2801 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
|
|
2802 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
|
|
2803
|
|
2804 #define FROB_BOOLEAN_PROPERTY(prop) \
|
|
2805 XCODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
|
|
2806
|
|
2807 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
|
|
2808 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
|
|
2809 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
|
|
2810 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
|
|
2811 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
|
|
2812 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
|
|
2813 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
|
|
2814
|
|
2815 else if (EQ (key, Qinput_charset_conversion))
|
|
2816 {
|
|
2817 XCODING_SYSTEM_ISO2022_INPUT_CONV (codesys) =
|
|
2818 Dynarr_new (charset_conversion_spec);
|
|
2819 parse_charset_conversion_specs
|
|
2820 (XCODING_SYSTEM_ISO2022_INPUT_CONV (codesys), value);
|
|
2821 }
|
|
2822 else if (EQ (key, Qoutput_charset_conversion))
|
|
2823 {
|
|
2824 XCODING_SYSTEM_ISO2022_OUTPUT_CONV (codesys) =
|
|
2825 Dynarr_new (charset_conversion_spec);
|
|
2826 parse_charset_conversion_specs
|
|
2827 (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (codesys), value);
|
|
2828 }
|
|
2829 else
|
|
2830 return 0;
|
|
2831
|
|
2832 return 1;
|
|
2833 }
|
|
2834
|
|
/* finalize_coding_stream method for ISO2022 coding systems.

   When ENABLE_COMPOSITE_CHARS is defined, free the stream's
   composite_chars dynarr (if any) and clear the pointer so that it is
   not left dangling.  Otherwise there is nothing to release and STR is
   unused. */

static void
iso2022_finalize_coding_stream (
#ifdef ENABLE_COMPOSITE_CHARS
                                struct coding_stream *str
#else
                                struct coding_stream *UNUSED (str)
#endif
                                )
{
#ifdef ENABLE_COMPOSITE_CHARS
  struct iso2022_coding_stream *data =
    CODING_STREAM_TYPE_DATA (str, iso2022);

  if (data->composite_chars)
    {
      Dynarr_free (data->composite_chars);
      /* Clear the slot, as iso2022_finalize does for its dynarrs, so a
         repeated finalization cannot free it twice. */
      data->composite_chars = 0;
    }
#endif
}
|
|
2852
|
|
2853 static void
|
|
2854 iso2022_init (Lisp_Object codesys)
|
|
2855 {
|
|
2856 int i;
|
|
2857 for (i = 0; i < 4; i++)
|
|
2858 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
|
|
2859 }
|
|
2860
|
|
2861 static Lisp_Object
|
|
2862 coding_system_charset (Lisp_Object coding_system, int gnum)
|
|
2863 {
|
|
2864 Lisp_Object cs
|
|
2865 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
|
|
2866
|
|
2867 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
|
|
2868 }
|
|
2869
|
|
2870 static Lisp_Object
|
|
2871 iso2022_getprop (Lisp_Object coding_system, Lisp_Object prop)
|
|
2872 {
|
|
2873 if (EQ (prop, Qcharset_g0))
|
|
2874 return coding_system_charset (coding_system, 0);
|
|
2875 else if (EQ (prop, Qcharset_g1))
|
|
2876 return coding_system_charset (coding_system, 1);
|
|
2877 else if (EQ (prop, Qcharset_g2))
|
|
2878 return coding_system_charset (coding_system, 2);
|
|
2879 else if (EQ (prop, Qcharset_g3))
|
|
2880 return coding_system_charset (coding_system, 3);
|
|
2881
|
|
2882 #define FORCE_CHARSET(charset_num) \
|
|
2883 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
|
|
2884 (coding_system, charset_num) ? Qt : Qnil)
|
|
2885
|
|
2886 else if (EQ (prop, Qforce_g0_on_output))
|
|
2887 return FORCE_CHARSET (0);
|
|
2888 else if (EQ (prop, Qforce_g1_on_output))
|
|
2889 return FORCE_CHARSET (1);
|
|
2890 else if (EQ (prop, Qforce_g2_on_output))
|
|
2891 return FORCE_CHARSET (2);
|
|
2892 else if (EQ (prop, Qforce_g3_on_output))
|
|
2893 return FORCE_CHARSET (3);
|
|
2894
|
|
2895 #define LISP_BOOLEAN(prop) \
|
|
2896 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
|
|
2897
|
|
2898 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
|
|
2899 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
|
|
2900 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
|
|
2901 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
|
|
2902 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
|
|
2903 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
|
|
2904 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
|
|
2905
|
|
2906 else if (EQ (prop, Qinput_charset_conversion))
|
|
2907 return
|
|
2908 unparse_charset_conversion_specs
|
|
2909 (XCODING_SYSTEM_ISO2022_INPUT_CONV (coding_system), 0);
|
|
2910 else if (EQ (prop, Qoutput_charset_conversion))
|
|
2911 return
|
|
2912 unparse_charset_conversion_specs
|
|
2913 (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (coding_system), 0);
|
|
2914 else
|
|
2915 return Qunbound;
|
|
2916 }
|
|
2917
|
|
2918 static void
|
2286
|
2919 iso2022_print (Lisp_Object cs, Lisp_Object printcharfun,
|
|
2920 int UNUSED (escapeflag))
|
771
|
2921 {
|
|
2922 int i;
|
|
2923
|
826
|
2924 write_c_string (printcharfun, "(");
|
771
|
2925 for (i = 0; i < 4; i++)
|
|
2926 {
|
|
2927 Lisp_Object charset = coding_system_charset (cs, i);
|
|
2928 if (i > 0)
|
826
|
2929 write_c_string (printcharfun, ", ");
|
771
|
2930 write_fmt_string (printcharfun, "g%d=", i);
|
800
|
2931 print_internal (CHARSETP (charset) ? XCHARSET_NAME (charset) : charset, printcharfun, 0);
|
771
|
2932 if (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (cs, i))
|
826
|
2933 write_c_string (printcharfun, "(force)");
|
771
|
2934 }
|
|
2935
|
3084
|
2936 #define FROB(prop) \
|
|
2937 if (!NILP (iso2022_getprop (cs, prop))) \
|
|
2938 { \
|
|
2939 write_fmt_string_lisp (printcharfun, ", %s", 1, prop); \
|
771
|
2940 }
|
|
2941
|
|
2942 FROB (Qshort);
|
|
2943 FROB (Qno_ascii_eol);
|
|
2944 FROB (Qno_ascii_cntl);
|
|
2945 FROB (Qseven);
|
|
2946 FROB (Qlock_shift);
|
|
2947 FROB (Qno_iso6429);
|
|
2948 FROB (Qescape_quoted);
|
|
2949
|
|
2950 {
|
|
2951 Lisp_Object val =
|
|
2952 unparse_charset_conversion_specs
|
|
2953 (XCODING_SYSTEM_ISO2022_INPUT_CONV (cs), 1);
|
|
2954 if (!NILP (val))
|
|
2955 {
|
800
|
2956 write_fmt_string_lisp (printcharfun, ", input-charset-conversion=%s", 1, val);
|
771
|
2957 }
|
|
2958 val =
|
|
2959 unparse_charset_conversion_specs
|
|
2960 (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (cs), 1);
|
|
2961 if (!NILP (val))
|
|
2962 {
|
800
|
2963 write_fmt_string_lisp (printcharfun, ", output-charset-conversion=%s", 1, val);
|
771
|
2964 }
|
826
|
2965 write_c_string (printcharfun, ")");
|
771
|
2966 }
|
|
2967 }
|
|
2968
|
|
2969
|
|
2970 /************************************************************************/
|
|
2971 /* ISO2022 detector */
|
|
2972 /************************************************************************/
|
|
2973
|
|
/* The iso2022 detector and its five categories.  iso2022_detect()
   assigns a likelihood to each category; the detector and categories
   are initialized in coding_system_type_create_mule_coding(). */
DEFINE_DETECTOR (iso2022);
/* ISO2022 system using only seven-bit bytes, no locking shift */
DEFINE_DETECTOR_CATEGORY (iso2022, iso_7);
/* ISO2022 system using eight-bit bytes, no locking shift, no single shift,
   using designation to switch charsets */
DEFINE_DETECTOR_CATEGORY (iso2022, iso_8_designate);
/* ISO2022 system using eight-bit bytes, no locking shift, no designation
   sequences, one-dimension characters in the upper half. */
DEFINE_DETECTOR_CATEGORY (iso2022, iso_8_1);
/* ISO2022 system using eight-bit bytes, no locking shift, no designation
   sequences, two-dimension characters in the upper half. */
DEFINE_DETECTOR_CATEGORY (iso2022, iso_8_2);
/* ISO2022 system using locking shift */
DEFINE_DETECTOR_CATEGORY (iso2022, iso_lock_shift);
|
|
2988
|
|
/* Per-detection state of the iso2022 detector.  It is updated by every
   call to iso2022_detect() and released by
   iso2022_finalize_detection_state(). */
struct iso2022_detector
{
  /* Nonzero once iso2022_detect() has zeroed this structure and
     allocated ISO. */
  int initted;
  /* Decoder state handed to parse_iso2022_esc() to recognize escape
     sequences; allocated on the first call to iso2022_detect(). */
  struct iso2022_coding_stream *iso;
  /* State flags passed by address to parse_iso2022_esc(); the
     ISO_STATE_ESCAPE bit is tested to see whether an escape sequence
     is in progress. */
  unsigned int flags;

  /* for keeping temporary track of high-byte groups */
  /* Length so far of the current run of bytes >= 0xA0. */
  int high_byte_count;
  /* Set when a single shift has just been parsed; the high-byte run
     that follows is then not counted as a group. */
  unsigned int saw_single_shift_just_now:1;

  /* running state; we set the likelihoods at the end */
  /* A byte >= 0x80 has been seen. */
  unsigned int seen_high_byte:1;
  /* Which kinds of escape sequence have been parsed successfully. */
  unsigned int seen_single_shift:1;
  unsigned int seen_locking_shift:1;
  unsigned int seen_designate:1;
  /* Parse errors reported while no escape sequence type was recorded. */
  unsigned int bad_single_byte_sequences;
  /* Parse errors reported inside a recognized escape sequence. */
  unsigned int bad_multibyte_escape_sequences;
  /* Successfully parsed sequences with a nonzero esc_bytes_index. */
  unsigned int good_multibyte_escape_sequences;
  /* Number of finished high-byte runs of even length. */
  int even_high_byte_groups;
  /* Length of the longest even-length high-byte run. */
  int longest_even_high_byte;
  /* Number of finished high-byte runs of odd length. */
  int odd_high_byte_groups;
};
|
|
3011
|
|
3012 static void
|
|
3013 iso2022_detect (struct detection_state *st, const UExtbyte *src,
|
|
3014 Bytecount n)
|
|
3015 {
|
|
3016 Bytecount orign = n;
|
|
3017 struct iso2022_detector *data = DETECTION_STATE_DATA (st, iso2022);
|
|
3018
|
|
3019 /* #### There are serious deficiencies in the recognition mechanism
|
|
3020 here. This needs to be much smarter if it's going to cut it.
|
|
3021 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
|
|
3022 it should be detected as Latin-1.
|
|
3023 All the ISO2022 stuff in this file should be synced up with the
|
|
3024 code from FSF Emacs-21.0, in which Mule should be more or less stable.
|
|
3025 Perhaps we should wait till R2L works in FSF Emacs? */
|
|
3026
|
|
3027 /* We keep track of running state on our own, and set the categories at the
|
|
3028 end; that way we can reflect the correct state each time we finish, but
|
|
3029 not get confused by those results the next time around. */
|
|
3030
|
|
3031 if (!data->initted)
|
|
3032 {
|
|
3033 xzero (*data);
|
|
3034 data->iso = xnew_and_zero (struct iso2022_coding_stream);
|
|
3035 reset_iso2022_decode (Qnil, data->iso);
|
|
3036 data->initted = 1;
|
|
3037 }
|
|
3038
|
|
3039 while (n--)
|
|
3040 {
|
|
3041 UExtbyte c = *src++;
|
|
3042 if (c >= 0x80)
|
|
3043 data->seen_high_byte = 1;
|
|
3044 if (c >= 0xA0)
|
|
3045 data->high_byte_count++;
|
|
3046 else
|
|
3047 {
|
|
3048 if (data->high_byte_count &&
|
|
3049 !data->saw_single_shift_just_now)
|
|
3050 {
|
|
3051 if (data->high_byte_count & 1)
|
|
3052 data->odd_high_byte_groups++;
|
|
3053 else
|
985
|
3054 {
|
|
3055 data->even_high_byte_groups++;
|
|
3056 if (data->longest_even_high_byte < data->high_byte_count)
|
|
3057 data->longest_even_high_byte = data->high_byte_count;
|
|
3058 }
|
771
|
3059 }
|
|
3060 data->high_byte_count = 0;
|
|
3061 data->saw_single_shift_just_now = 0;
|
|
3062 }
|
|
3063 if (!(data->flags & ISO_STATE_ESCAPE)
|
826
|
3064 && (byte_c0_p (c) || byte_c1_p (c)))
|
771
|
3065 { /* control chars */
|
|
3066 switch (c)
|
|
3067 {
|
|
3068 /* Allow and ignore control characters that you might
|
|
3069 reasonably see in a text file */
|
|
3070 case '\r':
|
|
3071 case '\n':
|
|
3072 case '\t':
|
|
3073 case 7: /* bell */
|
|
3074 case 8: /* backspace */
|
|
3075 case 11: /* vertical tab */
|
|
3076 case 12: /* form feed */
|
|
3077 case 26: /* MS-DOS C-z junk */
|
|
3078 case 31: /* '^_' -- for info */
|
|
3079 goto label_continue_loop;
|
|
3080
|
|
3081 default:
|
|
3082 break;
|
|
3083 }
|
|
3084 }
|
|
3085
|
826
|
3086 if ((data->flags & ISO_STATE_ESCAPE) || byte_c0_p (c)
|
|
3087 || byte_c1_p (c))
|
771
|
3088 {
|
|
3089 switch (parse_iso2022_esc (Qnil, data->iso, c,
|
|
3090 &data->flags, 0))
|
|
3091 {
|
|
3092 case 1: /* done */
|
|
3093 if (data->iso->esc_bytes_index > 0)
|
|
3094 data->good_multibyte_escape_sequences++;
|
|
3095 switch (data->iso->esc)
|
|
3096 {
|
|
3097 case ISO_ESC_DESIGNATE:
|
|
3098 data->seen_designate = 1;
|
|
3099 break;
|
|
3100 case ISO_ESC_LOCKING_SHIFT:
|
|
3101 data->seen_locking_shift = 1;
|
|
3102 break;
|
|
3103 case ISO_ESC_SINGLE_SHIFT:
|
|
3104 data->saw_single_shift_just_now = 1;
|
|
3105 data->seen_single_shift = 1;
|
|
3106 break;
|
|
3107 default:
|
|
3108 break;
|
|
3109 }
|
|
3110 break;
|
|
3111
|
|
3112 case -1: /* not done */
|
|
3113 break;
|
|
3114
|
|
3115 case 0: /* error */
|
|
3116 if (data->iso->esc == ISO_ESC_NOTHING)
|
|
3117 data->bad_single_byte_sequences++;
|
|
3118 else
|
|
3119 data->bad_multibyte_escape_sequences++;
|
|
3120 }
|
|
3121 }
|
|
3122 label_continue_loop:;
|
|
3123 }
|
|
3124
|
985
|
3125 if (data->high_byte_count &&
|
|
3126 !data->saw_single_shift_just_now)
|
|
3127 {
|
|
3128 if (data->high_byte_count & 1)
|
|
3129 data->odd_high_byte_groups++;
|
|
3130 else
|
|
3131 {
|
|
3132 data->even_high_byte_groups++;
|
|
3133 if (data->longest_even_high_byte < data->high_byte_count)
|
|
3134 data->longest_even_high_byte = data->high_byte_count;
|
|
3135 }
|
|
3136 }
|
|
3137
|
771
|
3138 if (data->bad_multibyte_escape_sequences > 2 ||
|
|
3139 (data->bad_multibyte_escape_sequences > 0 &&
|
|
3140 data->good_multibyte_escape_sequences /
|
|
3141 data->bad_multibyte_escape_sequences < 10))
|
|
3142 /* Just making it up ... */
|
|
3143 SET_DET_RESULTS (st, iso2022, DET_NEARLY_IMPOSSIBLE);
|
|
3144 else if (data->bad_single_byte_sequences > 5 ||
|
|
3145 (data->bad_single_byte_sequences > 0 &&
|
|
3146 (data->good_multibyte_escape_sequences +
|
|
3147 data->even_high_byte_groups +
|
|
3148 data->odd_high_byte_groups) /
|
|
3149 data->bad_single_byte_sequences < 10))
|
|
3150 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY);
|
|
3151 else if (data->seen_locking_shift)
|
|
3152 {
|
|
3153 SET_DET_RESULTS (st, iso2022, DET_QUITE_IMPROBABLE);
|
|
3154 DET_RESULT (st, iso_lock_shift) = DET_QUITE_PROBABLE;
|
|
3155 }
|
|
3156 else if (!data->seen_high_byte)
|
|
3157 {
|
|
3158 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY);
|
|
3159 if (data->good_multibyte_escape_sequences)
|
|
3160 DET_RESULT (st, iso_7) = DET_QUITE_PROBABLE;
|
|
3161 else if (data->seen_single_shift)
|
|
3162 DET_RESULT (st, iso_7) = DET_SOMEWHAT_LIKELY;
|
|
3163 else
|
|
3164 {
|
|
3165 /* If we've just seen pure 7-bit data, no escape sequences,
|
|
3166 then we can't give much likelihood; but if we've seen enough
|
|
3167 of this data, we can assume some unlikelihood of any 8-bit
|
|
3168 encoding */
|
|
3169 if (orign + st->bytes_seen >= 1000)
|
|
3170 DET_RESULT (st, iso_7) = DET_AS_LIKELY_AS_UNLIKELY;
|
|
3171 else
|
|
3172 SET_DET_RESULTS (st, iso2022, DET_AS_LIKELY_AS_UNLIKELY);
|
|
3173 }
|
|
3174 }
|
|
3175 else if (data->seen_designate)
|
|
3176 {
|
|
3177 SET_DET_RESULTS (st, iso2022, DET_QUITE_IMPROBABLE);
|
|
3178 if (data->seen_single_shift)
|
|
3179 /* #### Does this really make sense? */
|
|
3180 DET_RESULT (st, iso_8_designate) = DET_SOMEWHAT_UNLIKELY;
|
|
3181 else
|
|
3182 DET_RESULT (st, iso_8_designate) = DET_QUITE_PROBABLE;
|
|
3183 }
|
|
3184 else if (data->odd_high_byte_groups > 0 &&
|
|
3185 data->even_high_byte_groups == 0)
|
|
3186 {
|
|
3187 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY);
|
|
3188 if (data->seen_single_shift)
|
|
3189 DET_RESULT (st, iso_8_1) = DET_QUITE_PROBABLE;
|
|
3190 else
|
|
3191 DET_RESULT (st, iso_8_1) = DET_SOMEWHAT_LIKELY;
|
|
3192 }
|
|
3193 else if (data->odd_high_byte_groups == 0 &&
|
|
3194 data->even_high_byte_groups > 0)
|
|
3195 {
|
985
|
3196 #if 0
|
771
|
3197 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY);
|
|
3198 if (data->even_high_byte_groups > 10)
|
|
3199 {
|
|
3200 if (data->seen_single_shift)
|
|
3201 DET_RESULT (st, iso_8_2) = DET_QUITE_PROBABLE;
|
|
3202 else
|
|
3203 DET_RESULT (st, iso_8_2) = DET_SOMEWHAT_LIKELY;
|
|
3204 if (data->even_high_byte_groups < 50)
|
|
3205 DET_RESULT (st, iso_8_1) = DET_SOMEWHAT_UNLIKELY;
|
|
3206 /* else it stays at quite improbable */
|
|
3207 }
|
985
|
3208 #else
|
|
3209 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY);
|
|
3210 if (data->seen_single_shift)
|
|
3211 DET_RESULT (st, iso_8_2) = DET_QUITE_PROBABLE;
|
|
3212 else if (data->even_high_byte_groups > 10)
|
|
3213 DET_RESULT (st, iso_8_2) = DET_SOMEWHAT_LIKELY;
|
|
3214 else if (data->longest_even_high_byte > 6)
|
|
3215 DET_RESULT (st, iso_8_2) = DET_SLIGHTLY_LIKELY;
|
|
3216 #endif
|
771
|
3217 }
|
|
3218 else if (data->odd_high_byte_groups > 0 &&
|
|
3219 data->even_high_byte_groups > 0)
|
3393
|
3220 {
|
|
3221 /* Well, this could be a Latin-1 text, with most high-byte
|
|
3222 characters single, but sometimes two are together, though
|
|
3223 this happens not as often. This is common for Western
|
|
3224 European languages like German, French, Danish, Swedish, etc.
|
|
3225 Then we would either have a rather small file and
|
|
3226 even_high_byte_groups would be low.
|
|
3227 Or we would have a larger file and the ratio of odd to even
|
|
3228 groups would be very high. */
|
|
3229 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY);
|
|
3230 if (data->even_high_byte_groups <= 3 ||
|
|
3231 data->odd_high_byte_groups >= 10 * data->even_high_byte_groups)
|
|
3232 DET_RESULT (st, iso_8_1) = DET_SOMEWHAT_LIKELY;
|
|
3233 }
|
771
|
3234 else
|
|
3235 SET_DET_RESULTS (st, iso2022, DET_AS_LIKELY_AS_UNLIKELY);
|
|
3236 }
|
|
3237
|
|
3238 static void
|
|
3239 iso2022_finalize_detection_state (struct detection_state *st)
|
|
3240 {
|
|
3241 struct iso2022_detector *data = DETECTION_STATE_DATA (st, iso2022);
|
|
3242 if (data->iso)
|
1726
|
3243 xfree (data->iso, struct iso2022_coding_stream *);
|
771
|
3244 }
|
|
3245
|
|
3246
|
|
3247 /************************************************************************/
|
|
3248 /* CCL methods */
|
|
3249 /************************************************************************/
|
|
3250
|
|
/* Converter written in CCL. */

/* Type-specific data of a CCL coding system.  ccl_putprop() stores
   symbols in both slots; ccl_init() sets them to Qnil. */
struct ccl_coding_system
{
  /* For a CCL coding system, these specify the CCL programs used for
     decoding (input) and encoding (output). */
  Lisp_Object decode;
  Lisp_Object encode;
};
|
|
3260
|
|
/* Accessors for the decode and encode slots of a CCL coding system.
   The CODING_SYSTEM_ forms take whatever CODING_SYSTEM_TYPE_DATA
   accepts; the XCODING_SYSTEM_ forms take a Lisp_Object and pass it
   through XCODING_SYSTEM first.  All four are used as lvalues in this
   file. */
#define CODING_SYSTEM_CCL_DECODE(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, ccl)->decode)
#define CODING_SYSTEM_CCL_ENCODE(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, ccl)->encode)
#define XCODING_SYSTEM_CCL_DECODE(codesys) \
  CODING_SYSTEM_CCL_DECODE (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_CCL_ENCODE(codesys) \
  CODING_SYSTEM_CCL_ENCODE (XCODING_SYSTEM (codesys))
|
|
3269
|
|
/* Per-stream data of a CCL coding stream.  It is set up by
   ccl_init_coding_stream() and driven by ccl_convert(). */
struct ccl_coding_stream
{
  /* state of the running CCL program */
  struct ccl_program ccl;
};
|
|
3275
|
1204
|
/* Memory description of struct ccl_coding_system: both members are
   Lisp objects.  NOTE(review): presumably picked up by
   DEFINE_CODING_SYSTEM_TYPE_WITH_DATA (ccl) below -- confirm against
   that macro's definition. */
static const struct memory_description ccl_coding_system_description[] = {
  { XD_LISP_OBJECT, offsetof (struct ccl_coding_system, decode) },
  { XD_LISP_OBJECT, offsetof (struct ccl_coding_system, encode) },
  { XD_END }
};

DEFINE_CODING_SYSTEM_TYPE_WITH_DATA (ccl);
|
|
3283
|
771
|
3284 static void
|
|
3285 ccl_mark (Lisp_Object codesys)
|
|
3286 {
|
|
3287 mark_object (XCODING_SYSTEM_CCL_DECODE (codesys));
|
|
3288 mark_object (XCODING_SYSTEM_CCL_ENCODE (codesys));
|
|
3289 }
|
|
3290
|
|
3291 static Bytecount
|
|
3292 ccl_convert (struct coding_stream *str, const UExtbyte *src,
|
|
3293 unsigned_char_dynarr *dst, Bytecount n)
|
|
3294 {
|
|
3295 struct ccl_coding_stream *data =
|
|
3296 CODING_STREAM_TYPE_DATA (str, ccl);
|
|
3297 Bytecount orign = n;
|
|
3298
|
|
3299 data->ccl.last_block = str->eof;
|
|
3300 /* When applying a CCL program to a stream, SRC must not be NULL -- this
|
|
3301 is a special signal to the driver that read and write operations are
|
|
3302 not allowed. The code does not actually look at what SRC points to if
|
|
3303 N == 0.
|
|
3304 */
|
|
3305 ccl_driver (&data->ccl, src ? src : (const unsigned char *) "",
|
|
3306 dst, n, 0,
|
|
3307 str->direction == CODING_DECODE ? CCL_MODE_DECODING :
|
|
3308 CCL_MODE_ENCODING);
|
|
3309 return orign;
|
|
3310 }
|
|
3311
|
|
3312 static void
|
|
3313 ccl_init_coding_stream (struct coding_stream *str)
|
|
3314 {
|
|
3315 struct ccl_coding_stream *data =
|
|
3316 CODING_STREAM_TYPE_DATA (str, ccl);
|
|
3317
|
|
3318 setup_ccl_program (&data->ccl,
|
|
3319 str->direction == CODING_DECODE ?
|
|
3320 XCODING_SYSTEM_CCL_DECODE (str->codesys) :
|
|
3321 XCODING_SYSTEM_CCL_ENCODE (str->codesys));
|
|
3322 }
|
|
3323
|
|
/* rewind_coding_stream method for CCL coding systems: rewinding simply
   sets the CCL program state up again, exactly as at stream creation. */

static void
ccl_rewind_coding_stream (struct coding_stream *str)
{
  ccl_init_coding_stream (str);
}
|
|
3329
|
|
3330 static void
|
|
3331 ccl_init (Lisp_Object codesys)
|
|
3332 {
|
|
3333 XCODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
|
|
3334 XCODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
|
|
3335 }
|
|
3336
|
|
3337 static int
|
|
3338 ccl_putprop (Lisp_Object codesys, Lisp_Object key, Lisp_Object value)
|
|
3339 {
|
|
3340 Lisp_Object sym;
|
|
3341 struct ccl_program test_ccl;
|
2367
|
3342 Ascbyte *suffix;
|
771
|
3343
|
|
3344 /* Check key first. */
|
|
3345 if (EQ (key, Qdecode))
|
|
3346 suffix = "-ccl-decode";
|
|
3347 else if (EQ (key, Qencode))
|
|
3348 suffix = "-ccl-encode";
|
|
3349 else
|
|
3350 return 0;
|
|
3351
|
|
3352 /* If value is vector, register it as a ccl program
|
|
3353 associated with a newly created symbol for
|
|
3354 backward compatibility.
|
|
3355
|
|
3356 #### Bogosity alert! Do we really have to do this crap???? --ben */
|
|
3357 if (VECTORP (value))
|
|
3358 {
|
|
3359 sym = Fintern (concat2 (Fsymbol_name (XCODING_SYSTEM_NAME (codesys)),
|
|
3360 build_string (suffix)),
|
|
3361 Qnil);
|
|
3362 Fregister_ccl_program (sym, value);
|
|
3363 }
|
|
3364 else
|
|
3365 {
|
|
3366 CHECK_SYMBOL (value);
|
|
3367 sym = value;
|
|
3368 }
|
|
3369 /* check if the given ccl programs are valid. */
|
|
3370 if (setup_ccl_program (&test_ccl, sym) < 0)
|
|
3371 invalid_argument ("Invalid CCL program", value);
|
|
3372
|
|
3373 if (EQ (key, Qdecode))
|
|
3374 XCODING_SYSTEM_CCL_DECODE (codesys) = sym;
|
|
3375 else if (EQ (key, Qencode))
|
|
3376 XCODING_SYSTEM_CCL_ENCODE (codesys) = sym;
|
|
3377
|
|
3378 return 1;
|
|
3379 }
|
|
3380
|
|
3381 static Lisp_Object
|
|
3382 ccl_getprop (Lisp_Object coding_system, Lisp_Object prop)
|
|
3383 {
|
|
3384 if (EQ (prop, Qdecode))
|
|
3385 return XCODING_SYSTEM_CCL_DECODE (coding_system);
|
|
3386 else if (EQ (prop, Qencode))
|
|
3387 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
|
|
3388 else
|
|
3389 return Qunbound;
|
|
3390 }
|
|
3391
|
|
3392
|
|
3393 /************************************************************************/
|
|
3394 /* Initialization */
|
|
3395 /************************************************************************/
|
|
3396
|
|
/* Define the Lisp subrs and symbols used by this file. */

void
syms_of_mule_coding (void)
{
  /* Shift-JIS and Big5 character conversion primitives. */
  DEFSUBR (Fdecode_shift_jis_char);
  DEFSUBR (Fencode_shift_jis_char);
  DEFSUBR (Fdecode_big5_char);
  DEFSUBR (Fencode_big5_char);

  /* Coding system type names. */
  DEFSYMBOL (Qbig5);
  DEFSYMBOL (Qshift_jis);
  DEFSYMBOL (Qccl);
  DEFSYMBOL (Qiso2022);

  /* ISO2022 property keys handled by iso2022_putprop/iso2022_getprop. */
  DEFSYMBOL (Qcharset_g0);
  DEFSYMBOL (Qcharset_g1);
  DEFSYMBOL (Qcharset_g2);
  DEFSYMBOL (Qcharset_g3);
  DEFSYMBOL (Qforce_g0_on_output);
  DEFSYMBOL (Qforce_g1_on_output);
  DEFSYMBOL (Qforce_g2_on_output);
  DEFSYMBOL (Qforce_g3_on_output);
  DEFSYMBOL (Qno_iso6429);
  DEFSYMBOL (Qinput_charset_conversion);
  DEFSYMBOL (Qoutput_charset_conversion);

  DEFSYMBOL (Qshort);
  DEFSYMBOL (Qno_ascii_eol);
  DEFSYMBOL (Qno_ascii_cntl);
  DEFSYMBOL (Qseven);
  DEFSYMBOL (Qlock_shift);

  /* Names of the iso2022 detector categories. */
  DEFSYMBOL (Qiso_7);
  DEFSYMBOL (Qiso_8_designate);
  DEFSYMBOL (Qiso_8_1);
  DEFSYMBOL (Qiso_8_2);
  DEFSYMBOL (Qiso_lock_shift);
}
|
|
3434
|
|
/* Initialize the coding system types (iso2022, ccl, shift_jis, big5)
   and detectors (iso2022, shift_jis, big5) implemented in this file,
   and declare which methods each of them provides. */

void
coding_system_type_create_mule_coding (void)
{
  /* ISO2022: coding system type with type-specific data. */
  INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA (iso2022, "iso2022-coding-system-p");
  CODING_SYSTEM_HAS_METHOD (iso2022, mark);
  CODING_SYSTEM_HAS_METHOD (iso2022, convert);
  CODING_SYSTEM_HAS_METHOD (iso2022, finalize_coding_stream);
  CODING_SYSTEM_HAS_METHOD (iso2022, init_coding_stream);
  CODING_SYSTEM_HAS_METHOD (iso2022, rewind_coding_stream);
  CODING_SYSTEM_HAS_METHOD (iso2022, init);
  CODING_SYSTEM_HAS_METHOD (iso2022, print);
  CODING_SYSTEM_HAS_METHOD (iso2022, finalize);
  CODING_SYSTEM_HAS_METHOD (iso2022, putprop);
  CODING_SYSTEM_HAS_METHOD (iso2022, getprop);

  /* ISO2022 detector and its five categories. */
  INITIALIZE_DETECTOR (iso2022);
  DETECTOR_HAS_METHOD (iso2022, detect);
  DETECTOR_HAS_METHOD (iso2022, finalize_detection_state);
  INITIALIZE_DETECTOR_CATEGORY (iso2022, iso_7);
  INITIALIZE_DETECTOR_CATEGORY (iso2022, iso_8_designate);
  INITIALIZE_DETECTOR_CATEGORY (iso2022, iso_8_1);
  INITIALIZE_DETECTOR_CATEGORY (iso2022, iso_8_2);
  INITIALIZE_DETECTOR_CATEGORY (iso2022, iso_lock_shift);

  /* CCL: coding system type with type-specific data; no detector. */
  INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA (ccl, "ccl-coding-system-p");
  CODING_SYSTEM_HAS_METHOD (ccl, mark);
  CODING_SYSTEM_HAS_METHOD (ccl, convert);
  CODING_SYSTEM_HAS_METHOD (ccl, init);
  CODING_SYSTEM_HAS_METHOD (ccl, init_coding_stream);
  CODING_SYSTEM_HAS_METHOD (ccl, rewind_coding_stream);
  CODING_SYSTEM_HAS_METHOD (ccl, putprop);
  CODING_SYSTEM_HAS_METHOD (ccl, getprop);

  /* Shift-JIS: only a convert method, plus a one-category detector. */
  INITIALIZE_CODING_SYSTEM_TYPE (shift_jis, "shift-jis-coding-system-p");
  CODING_SYSTEM_HAS_METHOD (shift_jis, convert);

  INITIALIZE_DETECTOR (shift_jis);
  DETECTOR_HAS_METHOD (shift_jis, detect);
  INITIALIZE_DETECTOR_CATEGORY (shift_jis, shift_jis);

  /* Big5: only a convert method, plus a one-category detector. */
  INITIALIZE_CODING_SYSTEM_TYPE (big5, "big5-coding-system-p");
  CODING_SYSTEM_HAS_METHOD (big5, convert);

  INITIALIZE_DETECTOR (big5);
  DETECTOR_HAS_METHOD (big5, detect);
  INITIALIZE_DETECTOR_CATEGORY (big5, big5);
}
|
|
3482
|
|
/* Reinitialize the four coding system types created by
   coding_system_type_create_mule_coding().
   NOTE(review): presumably run when starting from a dumped image --
   confirm against the callers. */

void
reinit_coding_system_type_create_mule_coding (void)
{
  REINITIALIZE_CODING_SYSTEM_TYPE (iso2022);
  REINITIALIZE_CODING_SYSTEM_TYPE (ccl);
  REINITIALIZE_CODING_SYSTEM_TYPE (shift_jis);
  REINITIALIZE_CODING_SYSTEM_TYPE (big5);
}
|
|
3491
|
|
/* Intentionally empty: this file has no variables that need
   reinitializing. */

void
reinit_vars_of_mule_coding (void)
{
}
|
|
3496
|
|
/* Intentionally empty: this file defines no Lisp variables. */

void
vars_of_mule_coding (void)
{
}
|