Mercurial > hg > xemacs-beta
annotate src/mule-coding.c @ 4614:afbfad080ddd
The URLs in our current config.guess and config.sub files are obsolete.
Update to the latest upstream release to get correct URLs, as well as fixes
and enhancements to those scripts.
| author | Jerry James <james@xemacs.org> |
|---|---|
| date | Wed, 11 Feb 2009 11:09:35 -0700 |
| parents | 726060ee587c |
| children | 257b468bf2ca |
| rev | line source |
|---|---|
| 771 | 1 /* Conversion functions for I18N encodings, but not Unicode (in separate file). |
| 2 Copyright (C) 1991, 1995 Free Software Foundation, Inc. | |
| 3 Copyright (C) 1995 Sun Microsystems, Inc. | |
| 4 Copyright (C) 2000, 2001, 2002 Ben Wing. | |
| 5 | |
| 6 This file is part of XEmacs. | |
| 7 | |
| 8 XEmacs is free software; you can redistribute it and/or modify it | |
| 9 under the terms of the GNU General Public License as published by the | |
| 10 Free Software Foundation; either version 2, or (at your option) any | |
| 11 later version. | |
| 12 | |
| 13 XEmacs is distributed in the hope that it will be useful, but WITHOUT | |
| 14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
| 15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
| 16 for more details. | |
| 17 | |
| 18 You should have received a copy of the GNU General Public License | |
| 19 along with XEmacs; see the file COPYING. If not, write to | |
| 20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
| 21 Boston, MA 02111-1307, USA. */ | |
| 22 | |
| 23 /* Synched up with: Mule 2.3. Not in FSF. */ | |
| 24 | |
| 25 /* For previous history, see file-coding.c. | |
| 26 | |
| 27 September 10, 2001: Extracted from file-coding.c by Ben Wing. | |
| 28 | |
| 29 Later in September: Finished abstraction of detection system, rewrote | |
| 30 all the detectors to include multiple levels of likelihood. | |
| 31 */ | |
| 32 | |
| 33 #include <config.h> | |
| 34 #include "lisp.h" | |
| 35 | |
| 36 #include "charset.h" | |
| 37 #include "mule-ccl.h" | |
| 38 #include "file-coding.h" | |
| 39 | |
| 40 Lisp_Object Qshift_jis, Qiso2022, Qbig5, Qccl; | |
| 41 | |
| 42 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3; | |
| 43 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output; | |
| 44 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output; | |
| 45 Lisp_Object Qno_iso6429; | |
| 46 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion; | |
| 47 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift; | |
| 48 | |
| 49 Lisp_Object Qiso_7, Qiso_8_designate, Qiso_8_1, Qiso_8_2, Qiso_lock_shift; | |
| 50 | |
| 51 | |
| 52 /************************************************************************/ | |
| 53 /* Shift-JIS methods */ | |
| 54 /************************************************************************/ | |
| 55 | |
/* Shift-JIS; Hankaku (half-width) KANA is also supported. */
DEFINE_CODING_SYSTEM_TYPE (shift_jis);
| 58 | |
| 59 /* Shift-JIS is a coding system encoding three character sets: ASCII, right | |
| 60 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded | |
| 61 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is | |
| 62 encoded by "position-code + 0x80". A character of JISX0208 | |
| 63 (DIMENSION2_CHARS94 character set) is encoded in two bytes, but the two | |
| 64 position-codes are divided and shifted so that they fit in the range | |
| 65 below. | |
| 66 | |
| 67 --- CODE RANGE of Shift-JIS --- | |
| 68 (character set) (range) | |
| 69 ASCII 0x00 .. 0x7F | |
| 70 JISX0201-Kana 0xA1 .. 0xDF | |
| 71 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF | |
| 72 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC | |
| 73 ------------------------------- | |
| 74 | |
| 75 */ | |
| 76 | |
/* Is C the first byte of a Shift-JIS two-byte char?
   Returns nonzero when C lies in 0x81 .. 0x9F or 0xE0 .. 0xEF,
   zero otherwise.  */

inline static int
byte_shift_jis_two_byte_1_p (int c)
{
  return (c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF);
}
| 771 | 84 |
/* Is C the second byte of a Shift-JIS two-byte char?
   Returns nonzero when C lies in 0x40 .. 0x7E or 0x80 .. 0xFC
   (0x7F is excluded), zero otherwise.  */

inline static int
byte_shift_jis_two_byte_2_p (int c)
{
  return (c >= 0x40 && c <= 0x7E) || (c >= 0x80 && c <= 0xFC);
}
| 92 | |
/* Is C a single-byte (half-width) katakana byte in Shift-JIS?
   Returns nonzero when C lies in 0xA1 .. 0xDF, zero otherwise.  */

inline static int
byte_shift_jis_katakana_p (int c)
{
  return c >= 0xA1 && c <= 0xDF;
}
| 771 | 98 |
| 3439 | 99 inline static void |
| 100 dynarr_add_2022_one_dimension (Lisp_Object charset, Ibyte c, | |
| 101 unsigned char charmask, | |
| 102 unsigned_char_dynarr *dst) | |
| 103 { | |
| 104 if (XCHARSET_ENCODE_AS_UTF_8 (charset)) | |
| 105 { | |
| 106 encode_unicode_char (charset, c & charmask, 0, | |
| 4096 | 107 dst, UNICODE_UTF_8, 0, 0); |
| 3439 | 108 } |
| 109 else | |
| 110 { | |
| 111 Dynarr_add (dst, c & charmask); | |
| 112 } | |
| 113 } | |
| 114 | |
| 115 inline static void | |
| 116 dynarr_add_2022_two_dimensions (Lisp_Object charset, Ibyte c, | |
| 117 unsigned int ch, | |
| 118 unsigned char charmask, | |
| 119 unsigned_char_dynarr *dst) | |
| 120 { | |
| 121 if (XCHARSET_ENCODE_AS_UTF_8 (charset)) | |
| 122 { | |
| 123 encode_unicode_char (charset, | |
| 124 ch & charmask, | |
| 125 c & charmask, dst, | |
| 4096 | 126 UNICODE_UTF_8, 0, 0); |
| 3439 | 127 } |
| 128 else | |
| 129 { | |
| 130 Dynarr_add (dst, ch & charmask); | |
| 131 Dynarr_add (dst, c & charmask); | |
| 132 } | |
| 133 } | |
| 134 | |
| 771 | 135 /* Convert Shift-JIS data to internal format. */ |
| 136 | |
| 137 static Bytecount | |
| 138 shift_jis_convert (struct coding_stream *str, const UExtbyte *src, | |
| 139 unsigned_char_dynarr *dst, Bytecount n) | |
| 140 { | |
| 141 unsigned int ch = str->ch; | |
| 142 Bytecount orign = n; | |
| 143 | |
| 144 if (str->direction == CODING_DECODE) | |
| 145 { | |
| 146 while (n--) | |
| 147 { | |
| 148 UExtbyte c = *src++; | |
| 149 | |
| 150 if (ch) | |
| 151 { | |
| 152 /* Previous character was first byte of Shift-JIS Kanji char. */ | |
| 826 | 153 if (byte_shift_jis_two_byte_2_p (c)) |
| 771 | 154 { |
| 867 | 155 Ibyte e1, e2; |
| 771 | 156 |
| 157 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208); | |
| 158 DECODE_SHIFT_JIS (ch, c, e1, e2); | |
| 159 Dynarr_add (dst, e1); | |
| 160 Dynarr_add (dst, e2); | |
| 161 } | |
| 162 else | |
| 163 { | |
| 164 DECODE_ADD_BINARY_CHAR (ch, dst); | |
| 165 DECODE_ADD_BINARY_CHAR (c, dst); | |
| 166 } | |
| 167 ch = 0; | |
| 168 } | |
| 169 else | |
| 170 { | |
| 826 | 171 if (byte_shift_jis_two_byte_1_p (c)) |
| 771 | 172 ch = c; |
| 826 | 173 else if (byte_shift_jis_katakana_p (c)) |
| 771 | 174 { |
| 175 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201); | |
| 176 Dynarr_add (dst, c); | |
| 177 } | |
| 178 else | |
| 179 DECODE_ADD_BINARY_CHAR (c, dst); | |
| 180 } | |
| 181 } | |
| 182 | |
| 183 if (str->eof) | |
| 184 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst); | |
| 185 } | |
| 186 else | |
| 187 { | |
| 188 while (n--) | |
| 189 { | |
| 867 | 190 Ibyte c = *src++; |
| 826 | 191 if (byte_ascii_p (c)) |
| 771 | 192 { |
| 193 Dynarr_add (dst, c); | |
| 194 ch = 0; | |
| 195 } | |
| 867 | 196 else if (ibyte_leading_byte_p (c)) |
| 771 | 197 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 || |
| 198 c == LEADING_BYTE_JAPANESE_JISX0208_1978 || | |
| 199 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0; | |
| 200 else if (ch) | |
| 201 { | |
| 202 if (ch == LEADING_BYTE_KATAKANA_JISX0201) | |
| 203 { | |
| 204 Dynarr_add (dst, c); | |
| 205 ch = 0; | |
| 206 } | |
| 207 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 || | |
| 208 ch == LEADING_BYTE_JAPANESE_JISX0208) | |
| 209 ch = c; | |
| 210 else | |
| 211 { | |
| 212 UExtbyte j1, j2; | |
| 213 ENCODE_SHIFT_JIS (ch, c, j1, j2); | |
| 214 Dynarr_add (dst, j1); | |
| 215 Dynarr_add (dst, j2); | |
| 216 ch = 0; | |
| 217 } | |
| 218 } | |
| 219 } | |
| 220 } | |
| 221 | |
| 222 str->ch = ch; | |
| 223 | |
| 224 return orign; | |
| 225 } | |
| 226 | |
| 227 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /* | |
| 228 Decode a JISX0208 character of Shift-JIS coding-system. | |
| 229 CODE is the character code in Shift-JIS as a cons of type bytes. | |
| 230 Return the corresponding character. | |
| 231 */ | |
| 232 (code)) | |
| 233 { | |
| 234 int c1, c2, s1, s2; | |
| 235 | |
| 236 CHECK_CONS (code); | |
| 237 CHECK_INT (XCAR (code)); | |
| 238 CHECK_INT (XCDR (code)); | |
| 239 s1 = XINT (XCAR (code)); | |
| 240 s2 = XINT (XCDR (code)); | |
| 826 | 241 if (byte_shift_jis_two_byte_1_p (s1) && |
| 242 byte_shift_jis_two_byte_2_p (s2)) | |
| 771 | 243 { |
| 244 DECODE_SHIFT_JIS (s1, s2, c1, c2); | |
| 867 | 245 return make_char (make_ichar (Vcharset_japanese_jisx0208, |
| 831 | 246 c1 & 0x7F, c2 & 0x7F)); |
| 771 | 247 } |
| 248 else | |
| 249 return Qnil; | |
| 250 } | |
| 251 | |
| 252 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /* | |
| 253 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system. | |
| 254 Return the corresponding character code in SHIFT-JIS as a cons of two bytes. | |
| 255 */ | |
| 256 (character)) | |
| 257 { | |
| 258 Lisp_Object charset; | |
| 259 int c1, c2, s1, s2; | |
| 260 | |
| 261 CHECK_CHAR_COERCE_INT (character); | |
| 867 | 262 BREAKUP_ICHAR (XCHAR (character), charset, c1, c2); |
| 771 | 263 if (EQ (charset, Vcharset_japanese_jisx0208)) |
| 264 { | |
| 265 ENCODE_SHIFT_JIS (c1 | 0x80, c2 | 0x80, s1, s2); | |
| 266 return Fcons (make_int (s1), make_int (s2)); | |
| 267 } | |
| 268 else | |
| 269 return Qnil; | |
| 270 } | |
| 271 | |
| 272 | |
| 273 /************************************************************************/ | |
| 274 /* Shift-JIS detector */ | |
| 275 /************************************************************************/ | |
| 276 | |
/* Declare the Shift-JIS detector and its single detection category.  */
DEFINE_DETECTOR (shift_jis);
DEFINE_DETECTOR_CATEGORY (shift_jis, shift_jis);
| 279 | |
/* Running statistics kept by shift_jis_detect () across calls.  */
struct shift_jis_detector
{
  /* Two-byte chars whose first or second byte was in 0x80 .. 0x9F.  */
  int seen_jisx0208_char_in_c1;
  /* All other two-byte chars with a valid second byte.  */
  int seen_jisx0208_char_in_upper;
  /* Single bytes in 0xA0 .. 0xDF seen in first-byte position.  */
  int seen_jisx0201_char;
  /* Saw ESC, SI or SO.  */
  unsigned int seen_iso2022_esc:1;
  /* Saw a byte >= 0x80 that cannot start any character.  */
  unsigned int seen_bad_first_byte:1;
  /* Saw an invalid byte in second-byte position.  */
  unsigned int seen_bad_second_byte:1;
  /* temporary */
  /* The next byte is the second byte of a two-byte char.  */
  unsigned int in_second_byte:1;
  /* The pending first byte was in 0x80 .. 0x9F.  */
  unsigned int first_byte_was_c1:1;
};
| 292 | |
| 293 static void | |
| 294 shift_jis_detect (struct detection_state *st, const UExtbyte *src, | |
| 295 Bytecount n) | |
| 296 { | |
| 297 struct shift_jis_detector *data = DETECTION_STATE_DATA (st, shift_jis); | |
| 298 | |
| 299 while (n--) | |
| 300 { | |
| 301 UExtbyte c = *src++; | |
| 302 if (!data->in_second_byte) | |
| 303 { | |
| 304 if (c >= 0x80 && c <= 0x9F) | |
| 305 data->first_byte_was_c1 = 1; | |
| 306 if (c >= 0xA0 && c <= 0xDF) | |
| 307 data->seen_jisx0201_char++; | |
| 308 else if ((c >= 0x80 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF)) | |
| 309 data->in_second_byte = 1; | |
| 310 else if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) | |
| 311 data->seen_iso2022_esc = 1; | |
| 312 else if (c >= 0x80) | |
| 313 data->seen_bad_first_byte = 1; | |
| 314 } | |
| 315 else | |
| 316 { | |
| 317 if ((c >= 0x40 && c <= 0x7E) || (c >= 0x80 && c <= 0xFC)) | |
| 318 { | |
| 319 if (data->first_byte_was_c1 || (c >= 0x80 && c <= 0x9F)) | |
| 320 data->seen_jisx0208_char_in_c1++; | |
| 321 else | |
| 322 data->seen_jisx0208_char_in_upper++; | |
| 323 } | |
| 324 else | |
| 325 data->seen_bad_second_byte = 1; | |
| 326 data->in_second_byte = 0; | |
| 327 data->first_byte_was_c1 = 0; | |
| 328 } | |
| 329 } | |
| 330 | |
| 331 if (data->seen_bad_second_byte) | |
| 332 DET_RESULT (st, shift_jis) = DET_NEARLY_IMPOSSIBLE; | |
| 333 else if (data->seen_bad_first_byte) | |
| 334 DET_RESULT (st, shift_jis) = DET_QUITE_IMPROBABLE; | |
| 335 else if (data->seen_iso2022_esc) | |
| 336 DET_RESULT (st, shift_jis) = DET_SOMEWHAT_UNLIKELY; | |
| 337 else if (data->seen_jisx0208_char_in_c1 >= 20 || | |
| 338 (data->seen_jisx0208_char_in_c1 >= 10 && | |
| 339 data->seen_jisx0208_char_in_upper >= 10)) | |
| 340 DET_RESULT (st, shift_jis) = DET_QUITE_PROBABLE; | |
| 341 else if (data->seen_jisx0208_char_in_c1 > 3 || | |
| 342 data->seen_jisx0208_char_in_upper >= 10 || | |
| 343 /* Since the range is limited compared to what is often seen | |
| 344 is typical Latin-X charsets, the fact that we've seen a | |
| 345 bunch of them and none that are invalid is reasonably | |
| 346 strong statistical evidence of this encoding, or at least | |
| 347 not of the common Latin-X ones. */ | |
| 348 data->seen_jisx0201_char >= 100) | |
| 349 DET_RESULT (st, shift_jis) = DET_SOMEWHAT_LIKELY; | |
| 350 else if (data->seen_jisx0208_char_in_c1 > 0 || | |
| 351 data->seen_jisx0208_char_in_upper > 0 || | |
| 352 data->seen_jisx0201_char > 0) | |
| 353 DET_RESULT (st, shift_jis) = DET_SLIGHTLY_LIKELY; | |
| 354 else | |
| 355 DET_RESULT (st, shift_jis) = DET_AS_LIKELY_AS_UNLIKELY; | |
| 356 } | |
| 357 | |
| 358 | |
| 359 /************************************************************************/ | |
| 360 /* Big5 methods */ | |
| 361 /************************************************************************/ | |
| 362 | |
/* BIG5 (used for Mandarin in Taiwan). */
DEFINE_CODING_SYSTEM_TYPE (big5);
| 365 | |
| 366 /* BIG5 is a coding system encoding two character sets: ASCII and | |
| 367 Big5. An ASCII character is encoded as is. Big5 is a two-byte | |
| 368 character set and is encoded in two bytes. | |
| 369 | |
| 370 --- CODE RANGE of BIG5 --- | |
| 371 (character set) (range) | |
| 372 ASCII 0x00 .. 0x7F | |
| 373 Big5 (1st byte) 0xA1 .. 0xFE | |
| 374 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE | |
| 375 -------------------------- | |
| 376 | |
| 377 Since the number of characters in Big5 is larger than maximum | |
| 378 characters in Emacs' charset (96x96), it can't be handled as one | |
| 379 charset. So, in XEmacs, Big5 is divided into two: `charset-big5-1' | |
| 380 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former | |
| 381 contains frequently used characters and the latter contains less | |
| 382 frequently used characters. */ | |
| 383 | |
/* Is C the first byte of a Big5 two-byte char?
   Returns nonzero when C lies in 0xA1 .. 0xFE, zero otherwise.  */

inline static int
byte_big5_two_byte_1_p (int c)
{
  return c >= 0xA1 && c <= 0xFE;
}
| 771 | 389 |
/* Is C the second byte of a Big5 two-byte char?
   Returns nonzero when C lies in 0x40 .. 0x7E or 0xA1 .. 0xFE,
   zero otherwise.  */

inline static int
byte_big5_two_byte_2_p (int c)
{
  return (c >= 0x40 && c <= 0x7E) || (c >= 0xA1 && c <= 0xFE);
}
| 771 | 397 |
/* Number of Big5 characters which have the same code in 1st byte:
   (0xFF - 0xA1) second bytes in 0xA1 .. 0xFE plus (0x7F - 0x40)
   second bytes in 0x40 .. 0x7E, i.e. 94 + 63 = 157.  */

#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
| 401 | |
| 402 /* Code conversion macros. These are macros because they are used in | |
| 403 inner loops during code conversion. | |
| 404 | |
| 405 Note that temporary variables in macros introduce the classic | |
| 406 dynamic-scoping problems with variable names. We use capital- | |
| 407 lettered variables in the assumption that XEmacs does not use | |
| 408 capital letters in variables except in a very formalized way | |
| 409 (e.g. Qstring). */ | |
| 410 | |
| 411 /* Convert Big5 code (b1, b2) into its internal string representation | |
| 412 (lb, c1, c2). */ | |
| 413 | |
| 414 /* There is a much simpler way to split the Big5 charset into two. | |
| 415 For the moment I'm going to leave the algorithm as-is because it | |
| 416 claims to separate out the most-used characters into a single | |
| 417 charset, which perhaps will lead to optimizations in various | |
| 418 places. | |
| 419 | |
| 420 The way the algorithm works is something like this: | |
| 421 | |
| 422 Big5 can be viewed as a 94x157 charset, where the row is | |
| 423 encoded into the bytes 0xA1 .. 0xFE and the column is encoded | |
| 424 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency, | |
| 425 the split between low and high column numbers is apparently | |
| 426 meaningless; ascending rows produce less and less frequent chars. | |
| 427 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to | |
| 428 the first charset, and the upper half (0xC9 .. 0xFE) to the | |
| 429 second. To do the conversion, we convert the character into | |
| 430 a single number where 0 .. 156 is the first row, 157 .. 313 | |
| 431 is the second, etc. That way, the characters are ordered by | |
| 432 decreasing frequency. Then we just chop the space in two | |
| 433 and coerce the result into a 94x94 space. | |
| 434 */ | |
| 435 | |
/* Convert Big5 code (B1, B2) into its internal representation: leading
   byte LB plus two position bytes C1, C2, each offset by 0xA1.

   B1 and B2 are evaluated once; LB, C1 and C2 must be lvalues.  First
   bytes below 0xC9 map to LEADING_BYTE_CHINESE_BIG5_1, the rest to
   LEADING_BYTE_CHINESE_BIG5_2.  The temporaries B1, B2 and I are
   macro-local, so callers must not pass variables with those names.  */

#define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
{                                                                       \
  int B1 = (b1), B2 = (b2);                                             \
  int I                                                                 \
    = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62);     \
                                                                        \
  if (B1 < 0xC9)                                                        \
    {                                                                   \
      lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
    }                                                                   \
  else                                                                  \
    {                                                                   \
      lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
      I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
    }                                                                   \
  c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
  c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
} while (0)
| 454 | |
/* Convert the internal string representation of a Big5 character
   (lb, c1, c2) into Big5 code (b1, b2).

   LB, C1 and C2 are each evaluated once and are parenthesized, so
   arbitrary expressions may be passed.  B1 and B2 must be lvalues; B2
   is read back while it is being computed.  The temporary I is
   macro-local, so callers must not pass a variable named I.  */

#define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
{                                                                       \
  int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);                \
                                                                        \
  if ((lb) == LEADING_BYTE_CHINESE_BIG5_2)                              \
    {                                                                   \
      I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
    }                                                                   \
  b1 = I / BIG5_SAME_ROW + 0xA1;                                        \
  b2 = I % BIG5_SAME_ROW;                                               \
  b2 += b2 < 0x3F ? 0x40 : 0x62;                                        \
} while (0)
| 470 | |
| 471 /* Convert Big5 data to internal format. */ | |
| 472 | |
| 473 static Bytecount | |
| 474 big5_convert (struct coding_stream *str, const UExtbyte *src, | |
| 475 unsigned_char_dynarr *dst, Bytecount n) | |
| 476 { | |
| 477 unsigned int ch = str->ch; | |
| 478 Bytecount orign = n; | |
| 479 | |
| 480 if (str->direction == CODING_DECODE) | |
| 481 { | |
| 482 while (n--) | |
| 483 { | |
| 484 UExtbyte c = *src++; | |
| 485 if (ch) | |
| 486 { | |
| 487 /* Previous character was first byte of Big5 char. */ | |
| 826 | 488 if (byte_big5_two_byte_2_p (c)) |
| 771 | 489 { |
| 867 | 490 Ibyte b1, b2, b3; |
| 771 | 491 DECODE_BIG5 (ch, c, b1, b2, b3); |
| 492 Dynarr_add (dst, b1); | |
| 493 Dynarr_add (dst, b2); | |
| 494 Dynarr_add (dst, b3); | |
| 495 } | |
| 496 else | |
| 497 { | |
| 498 DECODE_ADD_BINARY_CHAR (ch, dst); | |
| 499 DECODE_ADD_BINARY_CHAR (c, dst); | |
| 500 } | |
| 501 ch = 0; | |
| 502 } | |
| 503 else | |
| 504 { | |
| 826 | 505 if (byte_big5_two_byte_1_p (c)) |
| 771 | 506 ch = c; |
| 507 else | |
| 508 DECODE_ADD_BINARY_CHAR (c, dst); | |
| 509 } | |
| 510 } | |
| 511 | |
| 512 if (str->eof) | |
| 513 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst); | |
| 514 } | |
| 515 else | |
| 516 { | |
| 517 while (n--) | |
| 518 { | |
| 867 | 519 Ibyte c = *src++; |
| 826 | 520 if (byte_ascii_p (c)) |
| 771 | 521 { |
| 522 /* ASCII. */ | |
| 523 Dynarr_add (dst, c); | |
| 524 } | |
| 867 | 525 else if (ibyte_leading_byte_p (c)) |
| 771 | 526 { |
| 527 if (c == LEADING_BYTE_CHINESE_BIG5_1 || | |
| 528 c == LEADING_BYTE_CHINESE_BIG5_2) | |
| 529 { | |
| 530 /* A recognized leading byte. */ | |
| 531 ch = c; | |
| 532 continue; /* not done with this character. */ | |
| 533 } | |
| 534 /* otherwise just ignore this character. */ | |
| 535 } | |
| 536 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 || | |
| 537 ch == LEADING_BYTE_CHINESE_BIG5_2) | |
| 538 { | |
| 539 /* Previous char was a recognized leading byte. */ | |
| 540 ch = (ch << 8) | c; | |
| 541 continue; /* not done with this character. */ | |
| 542 } | |
| 543 else if (ch) | |
| 544 { | |
| 545 /* Encountering second byte of a Big5 character. */ | |
| 546 UExtbyte b1, b2; | |
| 547 | |
| 548 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2); | |
| 549 Dynarr_add (dst, b1); | |
| 550 Dynarr_add (dst, b2); | |
| 551 } | |
| 552 | |
| 553 ch = 0; | |
| 554 } | |
| 555 } | |
| 556 | |
| 557 str->ch = ch; | |
| 558 return orign; | |
| 559 } | |
| 560 | |
| 867 | 561 Ichar |
| 771 | 562 decode_big5_char (int b1, int b2) |
| 563 { | |
| 826 | 564 if (byte_big5_two_byte_1_p (b1) && |
| 565 byte_big5_two_byte_2_p (b2)) | |
| 771 | 566 { |
| 567 int leading_byte; | |
| 568 Lisp_Object charset; | |
| 569 int c1, c2; | |
| 570 | |
| 571 DECODE_BIG5 (b1, b2, leading_byte, c1, c2); | |
| 826 | 572 charset = charset_by_leading_byte (leading_byte); |
| 867 | 573 return make_ichar (charset, c1 & 0x7F, c2 & 0x7F); |
| 771 | 574 } |
| 575 else | |
| 576 return -1; | |
| 577 } | |
| 578 | |
| 579 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /* | |
| 580 Convert Big Five character codes in CODE into a character. | |
| 581 CODE is a cons of two integers specifying the codepoints in Big Five. | |
| 582 Return the corresponding character, or nil if the codepoints are out of range. | |
| 583 | |
| 584 The term `decode' is used because the codepoints can be viewed as the | |
| 585 representation of the character in the external Big Five encoding, and thus | |
| 586 converting them to a character is analogous to any other operation that | |
| 587 decodes an external representation. | |
| 588 */ | |
| 589 (code)) | |
| 590 { | |
| 867 | 591 Ichar ch; |
| 771 | 592 |
| 593 CHECK_CONS (code); | |
| 594 CHECK_INT (XCAR (code)); | |
| 595 CHECK_INT (XCDR (code)); | |
| 596 ch = decode_big5_char (XINT (XCAR (code)), XINT (XCDR (code))); | |
| 597 if (ch == -1) | |
| 598 return Qnil; | |
| 599 else | |
| 600 return make_char (ch); | |
| 601 } | |
| 602 | |
| 603 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /* | |
| 604 Convert the specified Big Five character into its codepoints. | |
| 605 The codepoints are returned as a cons of two integers, specifying the | |
| 606 Big Five codepoints. See `decode-big5-char' for the reason why the | |
| 607 term `encode' is used for this operation. | |
| 608 */ | |
| 609 (character)) | |
| 610 { | |
| 611 Lisp_Object charset; | |
| 612 int c1, c2, b1, b2; | |
| 613 | |
| 614 CHECK_CHAR_COERCE_INT (character); | |
| 867 | 615 BREAKUP_ICHAR (XCHAR (character), charset, c1, c2); |
| 771 | 616 if (EQ (charset, Vcharset_chinese_big5_1) || |
| 617 EQ (charset, Vcharset_chinese_big5_2)) | |
| 618 { | |
| 619 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80, | |
| 620 b1, b2); | |
| 621 return Fcons (make_int (b1), make_int (b2)); | |
| 622 } | |
| 623 else | |
| 624 return Qnil; | |
| 625 } | |
| 626 | |
| 627 | |
| 628 /************************************************************************/ | |
| 629 /* Big5 detector */ | |
| 630 /************************************************************************/ | |
| 631 | |
/* Declare the Big5 detector and its single detection category.  */
DEFINE_DETECTOR (big5);
DEFINE_DETECTOR_CATEGORY (big5, big5);
| 634 | |
/* Running statistics kept by big5_detect () across calls.  */
struct big5_detector
{
  /* Two-byte chars whose second byte was in 0x40 .. 0x7E.  */
  int seen_big5_char;
  /* Two-byte chars whose second byte was in 0xA1 .. 0xFE.  */
  int seen_euc_char;
  /* Saw ESC, SI or SO.  */
  unsigned int seen_iso2022_esc:1;
  /* Saw a byte >= 0x80 that cannot start a Big5 character.  */
  unsigned int seen_bad_first_byte:1;
  /* Saw an invalid byte in second-byte position.  */
  unsigned int seen_bad_second_byte:1;

  /* temporary */
  /* The next byte is the second byte of a two-byte char.  */
  unsigned int in_second_byte:1;
};
| 646 | |
| 647 static void | |
| 648 big5_detect (struct detection_state *st, const UExtbyte *src, | |
| 649 Bytecount n) | |
| 650 { | |
| 651 struct big5_detector *data = DETECTION_STATE_DATA (st, big5); | |
| 652 | |
| 653 while (n--) | |
| 654 { | |
| 655 UExtbyte c = *src++; | |
| 656 if (!data->in_second_byte) | |
| 657 { | |
| 658 if (c >= 0xA1 && c <= 0xFE) | |
| 659 data->in_second_byte = 1; | |
| 660 else if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) | |
| 661 data->seen_iso2022_esc = 1; | |
| 662 else if (c >= 0x80) | |
| 663 data->seen_bad_first_byte = 1; | |
| 664 } | |
| 665 else | |
| 666 { | |
| 667 data->in_second_byte = 0; | |
| 985 | 668 if (c >= 0xA1 && c <= 0xFE) |
| 669 data->seen_euc_char++; | |
| 670 else if (c >= 0x40 && c <= 0x7E) | |
| 771 | 671 data->seen_big5_char++; |
| 672 else | |
| 673 data->seen_bad_second_byte = 1; | |
| 674 } | |
| 675 } | |
| 676 | |
| 677 if (data->seen_bad_second_byte) | |
| 678 DET_RESULT (st, big5) = DET_NEARLY_IMPOSSIBLE; | |
| 679 else if (data->seen_bad_first_byte) | |
| 680 DET_RESULT (st, big5) = DET_QUITE_IMPROBABLE; | |
| 681 else if (data->seen_iso2022_esc) | |
| 682 DET_RESULT (st, big5) = DET_SOMEWHAT_UNLIKELY; | |
| 683 else if (data->seen_big5_char >= 4) | |
| 684 DET_RESULT (st, big5) = DET_SOMEWHAT_LIKELY; | |
| 985 | 685 else if (data->seen_euc_char) |
| 686 DET_RESULT (st, big5) = DET_SLIGHTLY_LIKELY; | |
| 771 | 687 else |
| 688 DET_RESULT (st, big5) = DET_AS_LIKELY_AS_UNLIKELY; | |
| 689 } | |
| 690 | |
| 691 | |
| 692 /************************************************************************/ | |
| 693 /* ISO2022 methods */ | |
| 694 /************************************************************************/ | |
| 695 | |
| 696 /* Any ISO-2022-compliant coding system. Includes JIS, EUC, CTEXT | |
| 697 (Compound Text, the encoding of selections in X Windows). See below for | |
| 698 a complete description of ISO-2022. */ | |
| 699 | |
/* Flags indicating what we've seen so far when parsing an
   ISO2022 escape sequence. */
enum iso_esc_flag
{
  /* Partial sequences */
  ISO_ESC_NOTHING,      /* Nothing has been seen. */
  ISO_ESC,              /* We've seen ESC. */
  ISO_ESC_2_4,          /* We've seen ESC $.  This indicates
                           that we're designating a multi-byte, rather
                           than a single-byte, character set. */
  ISO_ESC_2_5,          /* We've seen ESC %.  This indicates an escape to a
                           Unicode coding system; the only one of these
                           we're prepared to deal with is UTF-8, which has
                           the next character as G. */
  ISO_ESC_2_8,          /* We've seen ESC 0x28, i.e. ESC (.
                           This means designate a 94-character
                           character set into G0. */
  ISO_ESC_2_9,          /* We've seen ESC 0x29 -- designate a
                           94-character character set into G1. */
  ISO_ESC_2_10,         /* We've seen ESC 0x2A. */
  ISO_ESC_2_11,         /* We've seen ESC 0x2B. */
  ISO_ESC_2_12,         /* We've seen ESC 0x2C -- designate a
                           96-character character set into G0.
                           (This is not ISO2022-standard.
                           The following 96-character
                           control sequences are standard,
                           though.) */
  ISO_ESC_2_13,         /* We've seen ESC 0x2D -- designate a
                           96-character character set into G1.
                           */
  ISO_ESC_2_14,         /* We've seen ESC 0x2E. */
  ISO_ESC_2_15,         /* We've seen ESC 0x2F. */
  ISO_ESC_2_4_8,        /* We've seen ESC $ 0x28 -- designate
                           a 94^N character set into G0. */
  ISO_ESC_2_4_9,        /* We've seen ESC $ 0x29. */
  ISO_ESC_2_4_10,       /* We've seen ESC $ 0x2A. */
  ISO_ESC_2_4_11,       /* We've seen ESC $ 0x2B. */
  ISO_ESC_2_4_12,       /* We've seen ESC $ 0x2C. */
  ISO_ESC_2_4_13,       /* We've seen ESC $ 0x2D. */
  ISO_ESC_2_4_14,       /* We've seen ESC $ 0x2E. */
  ISO_ESC_2_4_15,       /* We've seen ESC $ 0x2F. */
  ISO_ESC_5_11,         /* We've seen ESC [ or 0x9B.  This
                           starts a directionality-control
                           sequence.  The next character
                           must be 0, 1, 2, or ]. */
  ISO_ESC_5_11_0,       /* We've seen 0x9B 0.  The next character must be ]. */
  ISO_ESC_5_11_1,       /* We've seen 0x9B 1.  The next character must be ]. */
  ISO_ESC_5_11_2,       /* We've seen 0x9B 2.  The next character must be ]. */

  /* Full sequences. */
  ISO_ESC_START_COMPOSITE, /* Private usage for START COMPOSING */
  ISO_ESC_END_COMPOSITE, /* Private usage for END COMPOSING */
  ISO_ESC_SINGLE_SHIFT, /* We've seen a complete single-shift sequence. */
  ISO_ESC_LOCKING_SHIFT,/* We've seen a complete locking-shift sequence. */
  ISO_ESC_DESIGNATE,    /* We've seen a complete designation sequence. */
  ISO_ESC_DIRECTIONALITY,/* We've seen a complete ISO6429 directionality
                           sequence. */
  ISO_ESC_LITERAL       /* We've seen a literal character ala
                           escape-quoting. */
};
| 760 | |
/* Kinds of error that can be noted while handling ISO2022 data.
   NOTE(review): "UNKWOWN" is misspelled, but the identifier is kept
   as-is because code outside this block may refer to it.  */
enum iso_error
{
  ISO_ERROR_BAD_FINAL,
  ISO_ERROR_UNKWOWN_ESC_SEQUENCE,
  ISO_ERROR_INVALID_CODE_POINT_CHARACTER
};
| 767 | |
| 768 | |
/* Flags indicating current state while converting code. */

/************ Used during encoding and decoding: ************/
/* If set, the current directionality is right-to-left.  Otherwise, it's
   left-to-right. */
#define ISO_STATE_R2L           (1 << 0)

/************ Used during encoding: ************/
/* If set, we just saw a CR. */
#define ISO_STATE_CR            (1 << 1)

/************ Used during decoding: ************/
/* If set, we're currently parsing an escape sequence and the upper 16 bits
   should be looked at to indicate what partial escape sequence we've seen
   so far.  Otherwise, we're running through actual text. */
#define ISO_STATE_ESCAPE        (1 << 2)
/* If set, G2 is invoked into GL, but only for the next character. */
#define ISO_STATE_SS2           (1 << 3)
/* If set, G3 is invoked into GL, but only for the next character.  If both
   ISO_STATE_SS2 and ISO_STATE_SS3 are set, ISO_STATE_SS2 overrides; but
   this probably indicates an error in the text encoding. */
#define ISO_STATE_SS3           (1 << 4)
/* If set, we're currently processing a composite character (i.e. a
   character constructed by overstriking two or more characters). */
#define ISO_STATE_COMPOSITE     (1 << 5)

/* If set, we're processing UTF-8 encoded data within ISO-2022
   processing. */
#define ISO_STATE_UTF_8         (1 << 6)

/* ISO_STATE_LOCK is the mask of flags that remain on until explicitly
   turned off when in the ISO2022 encoder/decoder.  Other flags are turned
   off at the end of processing each character or escape sequence. */
# define ISO_STATE_LOCK \
  (ISO_STATE_COMPOSITE | ISO_STATE_R2L | ISO_STATE_UTF_8)
| 771 | 804 |
| 805 typedef struct charset_conversion_spec | |
| 806 { | |
| 807 Lisp_Object from_charset; | |
| 808 Lisp_Object to_charset; | |
| 809 } charset_conversion_spec; | |
| 810 | |
| 811 typedef struct | |
| 812 { | |
| 813 Dynarr_declare (charset_conversion_spec); | |
| 814 } charset_conversion_spec_dynarr; | |
| 815 | |
| 816 struct iso2022_coding_system | |
| 817 { | |
| 818 /* What are the charsets to be initially designated to G0, G1, | |
| 819 G2, G3? If t, no charset is initially designated. If nil, | |
| 820 no charset is initially designated and no charset is allowed | |
| 821 to be designated. */ | |
| 822 Lisp_Object initial_charset[4]; | |
| 823 | |
| 824 /* If true, a designation escape sequence needs to be sent on output | |
| 825 for the charset in G[0-3] before that charset is used. */ | |
| 826 unsigned char force_charset_on_output[4]; | |
| 827 | |
| 828 charset_conversion_spec_dynarr *input_conv; | |
| 829 charset_conversion_spec_dynarr *output_conv; | |
| 830 | |
| 831 unsigned int shoort :1; /* C makes you speak Dutch */ | |
| 832 unsigned int no_ascii_eol :1; | |
| 833 unsigned int no_ascii_cntl :1; | |
| 834 unsigned int seven :1; | |
| 835 unsigned int lock_shift :1; | |
| 836 unsigned int no_iso6429 :1; | |
| 837 unsigned int escape_quoted :1; | |
| 838 }; | |
| 839 | |
/* Field accessors for the struct iso2022_coding_system attached to
   CODESYS.  Each expands to the field itself.  G selects one of the
   four array elements (G0 .. G3).  */
#define CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, g) \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->initial_charset[g])
#define CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT(codesys, g) \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->force_charset_on_output[g])
#define CODING_SYSTEM_ISO2022_SHORT(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->shoort)
#define CODING_SYSTEM_ISO2022_NO_ASCII_EOL(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->no_ascii_eol)
#define CODING_SYSTEM_ISO2022_NO_ASCII_CNTL(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->no_ascii_cntl)
#define CODING_SYSTEM_ISO2022_SEVEN(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->seven)
#define CODING_SYSTEM_ISO2022_LOCK_SHIFT(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->lock_shift)
#define CODING_SYSTEM_ISO2022_NO_ISO6429(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->no_iso6429)
#define CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->escape_quoted)
#define CODING_SYSTEM_ISO2022_INPUT_CONV(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->input_conv)
#define CODING_SYSTEM_ISO2022_OUTPUT_CONV(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->output_conv)
| 862 | |
/* The same accessors for a coding system held in a Lisp_Object: each one
   unwraps CODESYS with XCODING_SYSTEM and forwards to the matching
   CODING_SYSTEM_ISO2022_* macro. */
#define XCODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, g) \
  CODING_SYSTEM_ISO2022_INITIAL_CHARSET (XCODING_SYSTEM (codesys), g)
#define XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT(codesys, g) \
  CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (XCODING_SYSTEM (codesys), g)
#define XCODING_SYSTEM_ISO2022_SHORT(codesys) \
  CODING_SYSTEM_ISO2022_SHORT (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_ISO2022_NO_ASCII_EOL(codesys) \
  CODING_SYSTEM_ISO2022_NO_ASCII_EOL (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_ISO2022_NO_ASCII_CNTL(codesys) \
  CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_ISO2022_SEVEN(codesys) \
  CODING_SYSTEM_ISO2022_SEVEN (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_ISO2022_LOCK_SHIFT(codesys) \
  CODING_SYSTEM_ISO2022_LOCK_SHIFT (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_ISO2022_NO_ISO6429(codesys) \
  CODING_SYSTEM_ISO2022_NO_ISO6429 (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys) \
  CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_ISO2022_INPUT_CONV(codesys) \
  CODING_SYSTEM_ISO2022_INPUT_CONV (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_ISO2022_OUTPUT_CONV(codesys) \
  CODING_SYSTEM_ISO2022_OUTPUT_CONV (XCODING_SYSTEM (codesys))
| 885 | |
/* Additional information used by the ISO2022 decoder and detector.
   The same structure also carries the encoder's state (see the "for
   encoding" section); it is (re)initialized by reset_iso2022_decode()
   or reset_iso2022_encode(), both of which zero it first. */
struct iso2022_coding_stream
{
  /* CHARSET holds the character sets currently assigned to the G0
     through G3 variables.  It is initialized from the array
     INITIAL_CHARSET in CODESYS. */
  Lisp_Object charset[4];

  /* Which registers are currently invoked into the left (GL) and
     right (GR) halves of the 8-bit encoding space?  After a reset
     these are 0 (G0 in GL) and 1 (G1 in GR). */
  int register_left, register_right;

  /* FLAGS holds flags indicating the current state of the encoding.  Some of
     these flags are actually part of the state-dependent data and should be
     moved there. */
  unsigned int flags;

  /**************** for decoding ****************/

  /* ISO_ESC holds a value indicating part of an escape sequence
     that has already been seen. */
  enum iso_esc_flag esc;

  /* This records the bytes we've seen so far in an escape sequence,
     in case the sequence is invalid (we spit out the bytes unchanged). */
  unsigned char esc_bytes[8];

  /* Index for next byte to store in ISO escape sequence. */
  int esc_bytes_index;

#ifdef ENABLE_COMPOSITE_CHARS
  /* Stuff seen so far when composing a string.  reset_iso2022_decode()
     keeps an already-existing dynarr across a reset and merely empties
     it. */
  unsigned_char_dynarr *composite_chars;
#endif

  /* If we saw an invalid designation sequence for a particular
     register, we flag it here and switch to ASCII.  The next time we
     see a valid designation for this register, we turn off the flag
     and do the designation normally, but pretend the sequence was
     invalid.  The effect of all this is that (most of the time) the
     escape sequences for both the switch to the unknown charset, and
     the switch back to the known charset, get inserted literally into
     the buffer and saved out as such.  The hope is that we can
     preserve the escape sequences so that the resulting written out
     file makes sense.  If we don't do any of this, the designation
     to the invalid charset will be preserved but that switch back
     to the known charset will probably get eaten because it was
     the same charset that was already present in the register. */
  unsigned char invalid_designated[4];

  /* We try to do similar things as above for direction-switching
     sequences.  If we encountered a direction switch while an
     invalid designation was present, or an invalid designation
     just after a direction switch (i.e. no valid designation
     encountered yet), we insert the direction-switch escape
     sequence literally into the output stream, and later on
     insert the corresponding direction-restoring escape sequence
     literally also. */
  unsigned int switched_dir_and_no_valid_charset_yet :1;
  unsigned int invalid_switch_dir :1;

  /* Tells the decoder to output the escape sequence literally
     even though it was valid.  Used in the games we play to
     avoid lossage when we encounter invalid designations. */
  unsigned int output_literally :1;
  /* We encountered a direction switch followed by an invalid
     designation.  We didn't output the direction switch
     literally because we didn't know about the invalid designation;
     but we have to do so now. */
  unsigned int output_direction_sequence :1;

  /**************** for encoding ****************/

  /* Whether we need to explicitly designate the charset in the
     G? register before using it.  It is initialized from the
     array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
  unsigned char force_charset_on_output[4];

  /* Other state variables that need to be preserved across
     invocations.  reset_iso2022_encode() starts them out as
     current_charset = Qnil and current_char_boundary = 1
     (current_half is left at zero). */
  Lisp_Object current_charset;
  int current_half;
  int current_char_boundary;

  /* Used for handling UTF-8.
     NOTE(review): the code that uses these two members is outside this
     part of the file; their exact roles should be confirmed there. */
  unsigned char counter;
  unsigned char indicated_length;
};
| 974 | |
/* Memory description of one charset_conversion_spec: both of its
   members are Lisp objects. */
static const struct memory_description ccs_description_1[] =
{
  { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
  { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
  { XD_END }
};

/* The description above together with the size of one element. */
static const struct sized_memory_description ccs_description =
{
  sizeof (charset_conversion_spec),
  ccs_description_1
};
| 987 | |
/* Memory description of a charset_conversion_spec_dynarr: a Dynarr
   whose elements are described by ccs_description. */
static const struct memory_description ccsd_description_1[] =
{
  XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
  { XD_END }
};

/* The description above together with the size of the dynarr header
   structure. */
static const struct sized_memory_description ccsd_description =
{
  sizeof (charset_conversion_spec_dynarr),
  ccsd_description_1
};
| 999 | |
/* Memory description of struct iso2022_coding_system: the four
   initial_charset Lisp objects, and the input_conv / output_conv
   pointers, each pointing to one block described by ccsd_description.
   The remaining members (force_charset_on_output and the bit flags)
   are not listed here. */
static const struct memory_description iso2022_coding_system_description[] = {
  { XD_LISP_OBJECT_ARRAY, offsetof (struct iso2022_coding_system,
                                    initial_charset), 4 },
  { XD_BLOCK_PTR, offsetof (struct iso2022_coding_system, input_conv),
    1, { &ccsd_description } },
  { XD_BLOCK_PTR, offsetof (struct iso2022_coding_system, output_conv),
    1, { &ccsd_description } },
  { XD_END }
};

/* NOTE(review): this macro is defined elsewhere; presumably it sets up
   the coding-system-type boilerplate for `iso2022' and picks up the
   description above by name -- confirm at its definition. */
DEFINE_CODING_SYSTEM_TYPE_WITH_DATA (iso2022);
| 1011 | |
| 771 | 1012 /* The following note taken directly from FSF 21.0.103. */ |
| 1013 | |
| 1014 /* The following note describes the coding system ISO2022 briefly. | |
| 1015 Since the intention of this note is to help understand the | |
| 1016 functions in this file, some parts are NOT ACCURATE or are OVERLY | |
| 1017 SIMPLIFIED. For thorough understanding, please refer to the | |
| 1018 original document of ISO2022. This is equivalent to the standard | |
| 1019 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*). | |
| 1020 | |
| 1021 ISO2022 provides many mechanisms to encode several character sets | |
| 1022 in 7-bit and 8-bit environments. For 7-bit environments, all text | |
| 1023 is encoded using bytes less than 128. This may make the encoded | |
| 1024 text a little bit longer, but the text passes more easily through | |
| 1025 several types of gateway, some of which strip off the MSB (Most | |
| 1026 Significant Bit). | |
| 1027 | |
| 1028 There are two kinds of character sets: control character sets and | |
| 1029 graphic character sets. The former contain control characters such | |
| 1030 as `newline' and `escape' to provide control functions (control | |
| 1031 functions are also provided by escape sequences). The latter | |
| 1032 contain graphic characters such as 'A' and '-'. Emacs recognizes | |
| 1033 two control character sets and many graphic character sets. | |
| 1034 | |
| 1035 Graphic character sets are classified into one of the following | |
| 1036 four classes, according to the number of bytes (DIMENSION) and | |
| 1037 number of characters in one dimension (CHARS) of the set: | |
| 1038 - DIMENSION1_CHARS94 | |
| 1039 - DIMENSION1_CHARS96 | |
| 1040 - DIMENSION2_CHARS94 | |
| 1041 - DIMENSION2_CHARS96 | |
| 1042 | |
| 1043 In addition, each character set is assigned an identification tag, | |
| 1044 unique for each set, called the "final character" (denoted as <F> | |
| 1045 hereafter). The <F> of each character set is decided by ECMA(*) | |
| 1046 when it is registered in ISO. The code range of <F> is 0x30..0x7F | |
| 1047 (0x30..0x3F are for private use only). | |
| 1048 | |
| 1049 Note (*): ECMA = European Computer Manufacturers Association | |
| 1050 | |
| 1051 Here are examples of graphic character sets [NAME(<F>)]: | |
| 1052 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ... | |
| 1053 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ... | |
| 1054 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ... | |
| 1055 o DIMENSION2_CHARS96 -- none for the moment | |
| 1056 | |
| 1057 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR. | |
| 1058 C0 [0x00..0x1F] -- control character plane 0 | |
| 1059 GL [0x20..0x7F] -- graphic character plane 0 | |
| 1060 C1 [0x80..0x9F] -- control character plane 1 | |
| 1061 GR [0xA0..0xFF] -- graphic character plane 1 | |
| 1062 | |
| 1063 A control character set is directly designated and invoked to C0 or | |
| 1064 C1 by an escape sequence. The most common case is that: | |
| 1065 - ISO646's control character set is designated/invoked to C0, and | |
| 1066 - ISO6429's control character set is designated/invoked to C1, | |
| 1067 and usually these designations/invocations are omitted in encoded | |
| 1068 text. In a 7-bit environment, only C0 can be used, and a control | |
| 1069 character for C1 is encoded by an appropriate escape sequence to | |
| 1070 fit into the environment. All control characters for C1 are | |
| 1071 defined to have corresponding escape sequences. | |
| 1072 | |
| 1073 A graphic character set is at first designated to one of four | |
| 1074 graphic registers (G0 through G3), then these graphic registers are | |
| 1075 invoked to GL or GR. These designations and invocations can be | |
| 1076 done independently. The most common case is that G0 is invoked to | |
| 1077 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually | |
| 1078 these invocations and designations are omitted in encoded text. | |
| 1079 In a 7-bit environment, only GL can be used. | |
| 1080 | |
| 1081 When a graphic character set of CHARS94 is invoked to GL, codes | |
| 1082 0x20 and 0x7F of the GL area work as control characters SPACE and | |
| 1083 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not | |
| 1084 be used. | |
| 1085 | |
| 1086 There are two ways of invocation: locking-shift and single-shift. | |
| 1087 With locking-shift, the invocation lasts until the next different | |
| 1088 invocation, whereas with single-shift, the invocation affects the | |
| 1089 following character only and doesn't affect the locking-shift | |
| 1090 state. Invocations are done by the following control characters or | |
| 1091 escape sequences: | |
| 1092 | |
| 1093 ---------------------------------------------------------------------- | |
| 1094 abbrev function cntrl escape seq description | |
| 1095 ---------------------------------------------------------------------- | |
| 1096 SI/LS0 (shift-in) 0x0F none invoke G0 into GL | |
| 1097 SO/LS1 (shift-out) 0x0E none invoke G1 into GL | |
| 1098 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL | |
| 1099 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL | |
| 1100 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*) | |
| 1101 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*) | |
| 1102 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*) | |
| 1103 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char | |
| 1104 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char | |
| 1105 ---------------------------------------------------------------------- | |
| 1106 (*) These are not used by any known coding system. | |
| 1107 | |
| 1108 Control characters for these functions are defined by macros | |
| 1109 ISO_CODE_XXX in `coding.h'. | |
| 1110 | |
| 1111 Designations are done by the following escape sequences: | |
| 1112 ---------------------------------------------------------------------- | |
| 1113 escape sequence description | |
| 1114 ---------------------------------------------------------------------- | |
| 1115 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0 | |
| 1116 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1 | |
| 1117 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2 | |
| 1118 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3 | |
| 1119 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*) | |
| 1120 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1 | |
| 1121 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2 | |
| 1122 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3 | |
| 1123 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**) | |
| 1124 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1 | |
| 1125 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2 | |
| 1126 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3 | |
| 1127 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*) | |
| 1128 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1 | |
| 1129 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2 | |
| 1130 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3 | |
| 1131 ---------------------------------------------------------------------- | |
| 1132 | |
| 1133 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set | |
| 1134 of dimension 1, chars 94, and final character <F>, etc... | |
| 1135 | |
| 1136 Note (*): Although these designations are not allowed in ISO2022, | |
| 1137 Emacs accepts them on decoding, and produces them on encoding | |
| 1138 CHARS96 character sets in a coding system which is characterized as | |
| 1139 7-bit environment, non-locking-shift, and non-single-shift. | |
| 1140 | |
| 1141 Note (**): If <F> is '@', 'A', or 'B', the intermediate character | |
| 1142 '(' can be omitted. We refer to this as "short-form" hereafter. | |
| 1143 | |
| 1144 Now you may notice that there are a lot of ways of encoding the | |
| 1145 same multilingual text in ISO2022. Actually, there exist many | |
| 1146 coding systems such as Compound Text (used in X11's inter-client | |
| 1147 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR | |
| 1148 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian | |
| 1149 localized platforms), and all of these are variants of ISO2022. | |
| 1150 | |
| 1151 In addition to the above, Emacs handles two more kinds of escape | |
| 1152 sequences: ISO6429's direction specification and Emacs' private | |
| 1153 sequence for specifying character composition. | |
| 1154 | |
| 1155 ISO6429's direction specification takes the following form: | |
| 1156 o CSI ']' -- end of the current direction | |
| 1157 o CSI '0' ']' -- end of the current direction | |
| 1158 o CSI '1' ']' -- start of left-to-right text | |
| 1159 o CSI '2' ']' -- start of right-to-left text | |
| 1160 The control character CSI (0x9B: control sequence introducer) is | |
| 1161 abbreviated to the escape sequence ESC '[' in a 7-bit environment. | |
| 1162 | |
| 1163 Character composition specification takes the following form: | |
| 1164 o ESC '0' -- start relative composition | |
| 1165 o ESC '1' -- end composition | |
| 1166 o ESC '2' -- start rule-base composition (*) | |
| 1167 o ESC '3' -- start relative composition with alternate chars (**) | |
| 1168 o ESC '4' -- start rule-base composition with alternate chars (**) | |
| 1169 Since these are not standard escape sequences of any ISO standard, | |
| 1170 the use of them with these meanings is restricted to Emacs only. | |
| 1171 | |
| 1172 (*) This form is used only in Emacs 20.5 and older versions, | |
| 1173 but the newer versions can safely decode it. | |
| 1174 (**) This form is used only in Emacs 21.1 and newer versions, | |
| 1175 and the older versions can't decode it. | |
| 1176 | |
| 1177 Here's a list of example usages of these composition escape | |
| 1178 sequences (categorized by `enum composition_method'). | |
| 1179 | |
| 1180 COMPOSITION_RELATIVE: | |
| 1181 ESC 0 CHAR [ CHAR ] ESC 1 | |
| 1182 COMPOSITION_WITH_RULE: | |
| 1183 ESC 2 CHAR [ RULE CHAR ] ESC 1 | |
| 1184 COMPOSITION_WITH_ALTCHARS: | |
| 1185 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 | |
| 1186 COMPOSITION_WITH_RULE_ALTCHARS: | |
| 1187 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */ | |
| 1188 | |
| 1189 static void | |
| 1190 reset_iso2022_decode (Lisp_Object coding_system, | |
| 1191 struct iso2022_coding_stream *data) | |
| 1192 { | |
| 1193 int i; | |
| 1194 #ifdef ENABLE_COMPOSITE_CHARS | |
| 1195 unsigned_char_dynarr *old_composite_chars = data->composite_chars; | |
| 1196 #endif | |
| 1197 | |
| 1198 xzero (*data); | |
| 1199 | |
| 1200 for (i = 0; i < 4; i++) | |
| 1201 { | |
| 1202 if (!NILP (coding_system)) | |
| 1203 data->charset[i] = | |
| 1204 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i); | |
| 1205 else | |
| 1206 data->charset[i] = Qt; | |
| 1207 } | |
| 1208 data->esc = ISO_ESC_NOTHING; | |
| 1209 data->register_right = 1; | |
| 1210 #ifdef ENABLE_COMPOSITE_CHARS | |
| 1211 if (old_composite_chars) | |
| 1212 { | |
| 1213 data->composite_chars = old_composite_chars; | |
| 1214 Dynarr_reset (data->composite_chars); | |
| 1215 } | |
| 1216 #endif | |
| 1217 } | |
| 1218 | |
| 1219 static void | |
| 1220 reset_iso2022_encode (Lisp_Object coding_system, | |
| 1221 struct iso2022_coding_stream *data) | |
| 1222 { | |
| 1223 int i; | |
| 1224 | |
| 1225 xzero (*data); | |
| 1226 | |
| 1227 for (i = 0; i < 4; i++) | |
| 1228 { | |
| 1229 data->charset[i] = | |
| 1230 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i); | |
| 1231 data->force_charset_on_output[i] = | |
| 1232 XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (coding_system, i); | |
| 1233 } | |
| 1234 data->register_right = 1; | |
| 1235 data->current_charset = Qnil; | |
| 1236 data->current_char_boundary = 1; | |
| 1237 } | |
| 1238 | |
| 1239 static void | |
| 1240 iso2022_init_coding_stream (struct coding_stream *str) | |
| 1241 { | |
| 1242 if (str->direction == CODING_DECODE) | |
| 1243 reset_iso2022_decode (str->codesys, | |
| 1244 CODING_STREAM_TYPE_DATA (str, iso2022)); | |
| 1245 else | |
| 1246 reset_iso2022_encode (str->codesys, | |
| 1247 CODING_STREAM_TYPE_DATA (str, iso2022)); | |
| 1248 } | |
| 1249 | |
/* Rewind coding stream STR.  Rewinding is the same as initializing from
   scratch, so this simply calls iso2022_init_coding_stream(). */
static void
iso2022_rewind_coding_stream (struct coding_stream *str)
{
  iso2022_init_coding_stream (str);
}
| 1255 | |
| 1256 static int | |
| 1257 fit_to_be_escape_quoted (unsigned char c) | |
| 1258 { | |
| 1259 switch (c) | |
| 1260 { | |
| 1261 case ISO_CODE_ESC: | |
| 1262 case ISO_CODE_CSI: | |
| 1263 case ISO_CODE_SS2: | |
| 1264 case ISO_CODE_SS3: | |
| 1265 case ISO_CODE_SO: | |
| 1266 case ISO_CODE_SI: | |
| 1267 return 1; | |
| 1268 | |
| 1269 default: | |
| 1270 return 0; | |
| 1271 } | |
| 1272 } | |
| 1273 | |
| 1274 static Lisp_Object | |
| 867 | 1275 charset_by_attributes_or_create_one (int type, Ibyte final, int dir) |
| 771 | 1276 { |
| 826 | 1277 Lisp_Object charset = charset_by_attributes (type, final, dir); |
| 771 | 1278 |
| 1279 if (NILP (charset)) | |
| 1280 { | |
| 1281 int chars, dim; | |
| 1282 | |
| 1283 switch (type) | |
| 1284 { | |
| 1285 case CHARSET_TYPE_94: | |
| 1286 chars = 94; dim = 1; | |
| 1287 break; | |
| 1288 case CHARSET_TYPE_96: | |
| 1289 chars = 96; dim = 1; | |
| 1290 break; | |
| 1291 case CHARSET_TYPE_94X94: | |
| 1292 chars = 94; dim = 2; | |
| 1293 break; | |
| 1294 case CHARSET_TYPE_96X96: | |
| 1295 chars = 96; dim = 2; | |
| 1296 break; | |
| 1297 default: | |
| 2500 | 1298 ABORT (); chars = 0; dim = 0; |
| 771 | 1299 } |
| 1300 | |
| 1301 charset = Fmake_charset (Qunbound, Qnil, | |
| 1302 nconc2 (list6 (Qfinal, make_char (final), | |
| 1303 Qchars, make_int (chars), | |
| 1304 Qdimension, make_int (dim)), | |
| 1305 list2 (Qdirection, | |
| 1306 dir == CHARSET_LEFT_TO_RIGHT ? | |
| 1307 Ql2r : Qr2l))); | |
| 1308 } | |
| 1309 | |
| 1310 return charset; | |
| 1311 } | |
| 1312 | |
| 1313 /* Parse one byte of an ISO2022 escape sequence. | |
| 1314 If the result is an invalid escape sequence, return 0 and | |
| 1315 do not change anything in STR. Otherwise, if the result is | |
| 1316 an incomplete escape sequence, update ISO2022.ESC and | |
| 1317 ISO2022.ESC_BYTES and return -1. Otherwise, update | |
| 1318 all the state variables (but not ISO2022.ESC_BYTES) and | |
| 1319 return 1. | |
| 1320 | |
| 1321 If CHECK_INVALID_CHARSETS is non-zero, check for designation | |
| 1322 or invocation of an invalid character set and treat that as | |
| 1323 an unrecognized escape sequence. | |
| 1324 | |
| 2367 | 1325 */ |
| 771 | 1326 |
| 1327 static int | |
| 1328 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_coding_stream *iso, | |
| 1329 unsigned char c, unsigned int *flags, | |
| 1330 int check_invalid_charsets) | |
| 1331 { | |
| 1332 /* (1) If we're at the end of a designation sequence, CS is the | |
| 1333 charset being designated and REG is the register to designate | |
| 1334 it to. | |
| 1335 | |
| 1336 (2) If we're at the end of a locking-shift sequence, REG is | |
| 1337 the register to invoke and HALF (0 == left, 1 == right) is | |
| 1338 the half to invoke it into. | |
| 1339 | |
| 1340 (3) If we're at the end of a single-shift sequence, REG is | |
| 1341 the register to invoke. */ | |
| 1342 Lisp_Object cs = Qnil; | |
| 1343 int reg, half; | |
| 1344 | |
| 1345 /* NOTE: This code does goto's all over the fucking place. | |
| 1346 The reason for this is that we're basically implementing | |
| 1347 a state machine here, and hierarchical languages like C | |
| 1348 don't really provide a clean way of doing this. */ | |
| 1349 | |
| 1350 if (! (*flags & ISO_STATE_ESCAPE)) | |
| 1351 /* At beginning of escape sequence; we need to reset our | |
| 1352 escape-state variables. */ | |
| 1353 iso->esc = ISO_ESC_NOTHING; | |
| 1354 | |
| 1355 iso->output_literally = 0; | |
| 1356 iso->output_direction_sequence = 0; | |
| 1357 | |
| 1358 switch (iso->esc) | |
| 1359 { | |
| 1360 case ISO_ESC_NOTHING: | |
| 1361 iso->esc_bytes_index = 0; | |
| 1362 switch (c) | |
| 1363 { | |
| 1364 case ISO_CODE_ESC: /* Start escape sequence */ | |
| 1365 *flags |= ISO_STATE_ESCAPE; | |
| 1366 iso->esc = ISO_ESC; | |
| 1367 goto not_done; | |
| 1368 | |
| 1369 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */ | |
| 1370 *flags |= ISO_STATE_ESCAPE; | |
| 1371 iso->esc = ISO_ESC_5_11; | |
| 1372 goto not_done; | |
| 1373 | |
| 1374 case ISO_CODE_SO: /* locking shift 1 */ | |
| 1375 reg = 1; half = 0; | |
| 1376 goto locking_shift; | |
| 1377 case ISO_CODE_SI: /* locking shift 0 */ | |
| 1378 reg = 0; half = 0; | |
| 1379 goto locking_shift; | |
| 1380 | |
| 1381 case ISO_CODE_SS2: /* single shift */ | |
| 1382 reg = 2; | |
| 1383 goto single_shift; | |
| 1384 case ISO_CODE_SS3: /* single shift */ | |
| 1385 reg = 3; | |
| 1386 goto single_shift; | |
| 1387 | |
| 1388 default: /* Other control characters */ | |
| 1389 error: | |
| 1390 *flags &= ISO_STATE_LOCK; | |
| 1391 return 0; | |
| 1392 } | |
| 1393 | |
| 1394 case ISO_ESC: | |
| 3439 | 1395 |
| 1396 /* The only available ISO 2022 sequence in UTF-8 mode is ESC % @, to | |
| 1397 exit from it. If we see any other escape sequence, pass it through | |
| 1398 in the error handler. */ | |
| 1399 if (*flags & ISO_STATE_UTF_8 && '%' != c) | |
| 1400 { | |
| 1401 return 0; | |
| 1402 } | |
| 1403 | |
| 771 | 1404 switch (c) |
| 1405 { | |
| 1406 /**** single shift ****/ | |
| 1407 | |
| 1408 case 'N': /* single shift 2 */ | |
| 1409 reg = 2; | |
| 1410 goto single_shift; | |
| 1411 case 'O': /* single shift 3 */ | |
| 1412 reg = 3; | |
| 1413 goto single_shift; | |
| 1414 | |
| 1415 /**** locking shift ****/ | |
| 1416 | |
| 1417 case '~': /* locking shift 1 right */ | |
| 1418 reg = 1; half = 1; | |
| 1419 goto locking_shift; | |
| 1420 case 'n': /* locking shift 2 */ | |
| 1421 reg = 2; half = 0; | |
| 1422 goto locking_shift; | |
| 1423 case '}': /* locking shift 2 right */ | |
| 1424 reg = 2; half = 1; | |
| 1425 goto locking_shift; | |
| 1426 case 'o': /* locking shift 3 */ | |
| 1427 reg = 3; half = 0; | |
| 1428 goto locking_shift; | |
| 1429 case '|': /* locking shift 3 right */ | |
| 1430 reg = 3; half = 1; | |
| 1431 goto locking_shift; | |
| 1432 | |
| 1433 /**** composite ****/ | |
| 1434 | |
| 1435 #ifdef ENABLE_COMPOSITE_CHARS | |
| 1436 case '0': | |
| 1437 iso->esc = ISO_ESC_START_COMPOSITE; | |
| 1438 *flags = (*flags & ISO_STATE_LOCK) | | |
| 1439 ISO_STATE_COMPOSITE; | |
| 1440 return 1; | |
| 1441 | |
| 1442 case '1': | |
| 1443 iso->esc = ISO_ESC_END_COMPOSITE; | |
| 1444 *flags = (*flags & ISO_STATE_LOCK) & | |
| 1445 ~ISO_STATE_COMPOSITE; | |
| 1446 return 1; | |
| 1447 #else | |
| 1448 case '0': case '1': case '2': case '3': case '4': | |
| 1449 /* We simply return a flag indicating that some composite | |
| 1450 escape was seen. The caller will use the particular | |
| 1451 character to encode the appropriate "composite hack" | |
| 1452 character out of Vcharset_composite, so that we will | |
| 1453 preserve these values on output. */ | |
| 1454 iso->esc = ISO_ESC_START_COMPOSITE; | |
| 1455 *flags &= ISO_STATE_LOCK; | |
| 1456 return 1; | |
| 1457 #endif /* ENABLE_COMPOSITE_CHARS */ | |
| 1458 | |
| 1459 /**** directionality ****/ | |
| 1460 | |
| 1461 case '[': | |
| 1462 iso->esc = ISO_ESC_5_11; | |
| 1463 goto not_done; | |
| 1464 | |
| 1465 /**** designation ****/ | |
| 1466 | |
| 1467 case '$': /* multibyte charset prefix */ | |
| 1468 iso->esc = ISO_ESC_2_4; | |
| 1469 goto not_done; | |
| 1470 | |
| 3439 | 1471 case '%': /* Prefix to an escape to or from Unicode. */ |
| 1472 iso->esc = ISO_ESC_2_5; | |
| 1473 goto not_done; | |
| 1474 | |
| 771 | 1475 default: |
| 1476 if (0x28 <= c && c <= 0x2F) | |
| 1477 { | |
| 1478 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8); | |
| 1479 goto not_done; | |
| 1480 } | |
| 1481 | |
| 1482 /* This function is called with CODESYS equal to nil when | |
| 1483 doing coding-system detection. */ | |
| 1484 if (!NILP (codesys) | |
| 1485 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys) | |
| 1486 && fit_to_be_escape_quoted (c)) | |
| 1487 { | |
| 1488 iso->esc = ISO_ESC_LITERAL; | |
| 1489 *flags &= ISO_STATE_LOCK; | |
| 1490 return 1; | |
| 1491 } | |
| 1492 | |
| 1493 /* bzzzt! */ | |
| 1494 goto error; | |
| 1495 } | |
| 1496 | |
| 3439 | 1497 /* ISO-IR 196 UTF-8 support. */ |
| 1498 case ISO_ESC_2_5: | |
| 1499 if ('G' == c) | |
| 1500 { | |
| 1501 /* Activate UTF-8 mode. */ | |
| 1502 *flags &= ISO_STATE_LOCK; | |
| 1503 *flags |= ISO_STATE_UTF_8; | |
| 1504 iso->esc = ISO_ESC_NOTHING; | |
| 1505 return 1; | |
| 1506 } | |
| 1507 else if ('@' == c) | |
| 1508 { | |
| 1509 /* Deactive UTF-8 mode. */ | |
| 1510 *flags &= ISO_STATE_LOCK; | |
| 1511 *flags &= ~(ISO_STATE_UTF_8); | |
| 1512 iso->esc = ISO_ESC_NOTHING; | |
| 1513 return 1; | |
| 1514 } | |
| 1515 else | |
| 1516 { | |
| 1517 /* Oops, we don't support the other UTF-? coding systems within | |
| 1518 ISO 2022, only in their own context. */ | |
| 1519 goto error; | |
| 1520 } | |
| 771 | 1521 /**** directionality ****/ |
| 1522 | |
| 1523 case ISO_ESC_5_11: /* ISO6429 direction control */ | |
| 1524 if (c == ']') | |
| 1525 { | |
| 1526 *flags &= (ISO_STATE_LOCK & ~ISO_STATE_R2L); | |
| 1527 goto directionality; | |
| 1528 } | |
| 1529 if (c == '0') iso->esc = ISO_ESC_5_11_0; | |
| 1530 else if (c == '1') iso->esc = ISO_ESC_5_11_1; | |
| 1531 else if (c == '2') iso->esc = ISO_ESC_5_11_2; | |
| 1532 else goto error; | |
| 1533 goto not_done; | |
| 1534 | |
| 1535 case ISO_ESC_5_11_0: | |
| 1536 if (c == ']') | |
| 1537 { | |
| 1538 *flags &= (ISO_STATE_LOCK & ~ISO_STATE_R2L); | |
| 1539 goto directionality; | |
| 1540 } | |
| 1541 goto error; | |
| 1542 | |
| 1543 case ISO_ESC_5_11_1: | |
| 1544 if (c == ']') | |
| 1545 { | |
| 1546 *flags = (ISO_STATE_LOCK & ~ISO_STATE_R2L); | |
| 1547 goto directionality; | |
| 1548 } | |
| 1549 goto error; | |
| 1550 | |
| 1551 case ISO_ESC_5_11_2: | |
| 1552 if (c == ']') | |
| 1553 { | |
| 1554 *flags = (*flags & ISO_STATE_LOCK) | ISO_STATE_R2L; | |
| 1555 goto directionality; | |
| 1556 } | |
| 1557 goto error; | |
| 1558 | |
| 1559 directionality: | |
| 1560 iso->esc = ISO_ESC_DIRECTIONALITY; | |
| 1561 /* Various junk here to attempt to preserve the direction sequences | |
| 1562 literally in the text if they would otherwise be swallowed due | |
| 1563 to invalid designations that don't show up as actual charset | |
| 1564 changes in the text. */ | |
| 1565 if (iso->invalid_switch_dir) | |
| 1566 { | |
| 1567 /* We already inserted a direction switch literally into the | |
| 1568 text. We assume (#### this may not be right) that the | |
| 1569 next direction switch is the one going the other way, | |
| 1570 and we need to output that literally as well. */ | |
| 1571 iso->output_literally = 1; | |
| 1572 iso->invalid_switch_dir = 0; | |
| 1573 } | |
| 1574 else | |
| 1575 { | |
| 1576 int jj; | |
| 1577 | |
| 1578 /* If we are in the thrall of an invalid designation, | |
| 1579 then stick the directionality sequence literally into the | |
| 1580 output stream so it ends up in the original text again. */ | |
| 1581 for (jj = 0; jj < 4; jj++) | |
| 1582 if (iso->invalid_designated[jj]) | |
| 1583 break; | |
| 1584 if (jj < 4) | |
| 1585 { | |
| 1586 iso->output_literally = 1; | |
| 1587 iso->invalid_switch_dir = 1; | |
| 1588 } | |
| 1589 else | |
| 1590 /* Indicate that we haven't yet seen a valid designation, | |
| 1591 so that if a switch-dir is directly followed by an | |
| 1592 invalid designation, both get inserted literally. */ | |
| 1593 iso->switched_dir_and_no_valid_charset_yet = 1; | |
| 1594 } | |
| 1595 return 1; | |
| 1596 | |
| 1597 | |
| 1598 /**** designation ****/ | |
| 1599 | |
| 1600 case ISO_ESC_2_4: | |
| 1601 if (0x28 <= c && c <= 0x2F) | |
| 1602 { | |
| 1603 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8); | |
| 1604 goto not_done; | |
| 1605 } | |
| 1606 if (0x40 <= c && c <= 0x42) | |
| 1607 { | |
| 1608 cs = charset_by_attributes_or_create_one (CHARSET_TYPE_94X94, c, | |
| 1609 *flags & ISO_STATE_R2L ? | |
| 1610 CHARSET_RIGHT_TO_LEFT : | |
| 1611 CHARSET_LEFT_TO_RIGHT); | |
| 1612 reg = 0; | |
| 1613 goto designated; | |
| 1614 } | |
| 1615 goto error; | |
| 1616 | |
| 1617 default: | |
| 1618 { | |
| 1619 int type = -1; | |
| 1620 | |
| 1621 if (iso->esc >= ISO_ESC_2_8 && | |
| 1622 iso->esc <= ISO_ESC_2_15) | |
| 1623 { | |
| 1624 type = ((iso->esc >= ISO_ESC_2_12) ? | |
| 1625 CHARSET_TYPE_96 : CHARSET_TYPE_94); | |
| 1626 reg = (iso->esc - ISO_ESC_2_8) & 3; | |
| 1627 } | |
| 1628 else if (iso->esc >= ISO_ESC_2_4_8 && | |
| 1629 iso->esc <= ISO_ESC_2_4_15) | |
| 1630 { | |
| 1631 type = ((iso->esc >= ISO_ESC_2_4_12) ? | |
| 1632 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94); | |
| 1633 reg = (iso->esc - ISO_ESC_2_4_8) & 3; | |
| 1634 } | |
| 1635 else | |
| 1636 { | |
| 1637 /* Can this ever be reached? -slb */ | |
| 2500 | 1638 ABORT (); |
| 771 | 1639 goto error; |
| 1640 } | |
| 1641 | |
| 1642 if (c < '0' || c > '~' || | |
| 1643 (c > 0x5F && (type == CHARSET_TYPE_94X94 || | |
| 1644 type == CHARSET_TYPE_96X96))) | |
| 1645 goto error; /* bad final byte */ | |
| 1646 | |
| 1647 cs = charset_by_attributes_or_create_one (type, c, | |
| 1648 *flags & ISO_STATE_R2L ? | |
| 1649 CHARSET_RIGHT_TO_LEFT : | |
| 1650 CHARSET_LEFT_TO_RIGHT); | |
| 1651 goto designated; | |
| 1652 } | |
| 1653 } | |
| 1654 | |
| 1655 not_done: | |
| 1656 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c; | |
| 1657 return -1; | |
| 1658 | |
| 1659 single_shift: | |
| 1660 if (check_invalid_charsets && !CHARSETP (iso->charset[reg])) | |
| 1661 /* can't invoke something that ain't there. */ | |
| 1662 goto error; | |
| 1663 iso->esc = ISO_ESC_SINGLE_SHIFT; | |
| 1664 *flags &= ISO_STATE_LOCK; | |
| 1665 if (reg == 2) | |
| 1666 *flags |= ISO_STATE_SS2; | |
| 1667 else | |
| 1668 *flags |= ISO_STATE_SS3; | |
| 1669 return 1; | |
| 1670 | |
| 1671 locking_shift: | |
| 1672 if (check_invalid_charsets && | |
| 1673 !CHARSETP (iso->charset[reg])) | |
| 1674 /* can't invoke something that ain't there. */ | |
| 1675 goto error; | |
| 1676 if (half) | |
| 1677 iso->register_right = reg; | |
| 1678 else | |
| 1679 iso->register_left = reg; | |
| 1680 *flags &= ISO_STATE_LOCK; | |
| 1681 iso->esc = ISO_ESC_LOCKING_SHIFT; | |
| 1682 return 1; | |
| 1683 | |
| 1684 designated: | |
| 1685 if (NILP (cs) && check_invalid_charsets) | |
| 1686 { | |
| 2500 | 1687 ABORT (); |
| 771 | 1688 /* #### This should never happen now that we automatically create |
| 1689 temporary charsets as necessary. We should probably remove | |
| 1690 this code. --ben */ | |
| 1691 iso->invalid_designated[reg] = 1; | |
| 1692 iso->charset[reg] = Vcharset_ascii; | |
| 1693 iso->esc = ISO_ESC_DESIGNATE; | |
| 1694 *flags &= ISO_STATE_LOCK; | |
| 1695 iso->output_literally = 1; | |
| 1696 if (iso->switched_dir_and_no_valid_charset_yet) | |
| 1697 { | |
| 1698 /* We encountered a switch-direction followed by an | |
| 1699 invalid designation. Ensure that the switch-direction | |
| 1700 gets outputted; otherwise it will probably get eaten | |
| 1701 when the text is written out again. */ | |
| 1702 iso->switched_dir_and_no_valid_charset_yet = 0; | |
| 1703 iso->output_direction_sequence = 1; | |
| 1704 /* And make sure that the switch-dir going the other | |
| 1705 way gets outputted, as well. */ | |
| 1706 iso->invalid_switch_dir = 1; | |
| 1707 } | |
| 1708 return 1; | |
| 1709 } | |
| 1710 /* This function is called with CODESYS equal to nil when | |
| 1711 doing coding-system detection. */ | |
| 1712 if (!NILP (codesys)) | |
| 1713 { | |
| 1714 charset_conversion_spec_dynarr *dyn = | |
| 1715 XCODING_SYSTEM_ISO2022_INPUT_CONV (codesys); | |
| 1716 | |
| 1717 if (dyn) | |
| 1718 { | |
| 1719 int i; | |
| 1720 | |
| 1721 for (i = 0; i < Dynarr_length (dyn); i++) | |
| 1722 { | |
| 1723 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i); | |
| 1724 if (EQ (cs, spec->from_charset)) | |
| 1725 cs = spec->to_charset; | |
| 1726 } | |
| 1727 } | |
| 1728 } | |
| 1729 | |
| 1730 iso->charset[reg] = cs; | |
| 1731 iso->esc = ISO_ESC_DESIGNATE; | |
| 1732 *flags &= ISO_STATE_LOCK; | |
| 1733 if (iso->invalid_designated[reg]) | |
| 1734 { | |
| 1735 iso->invalid_designated[reg] = 0; | |
| 1736 iso->output_literally = 1; | |
| 1737 } | |
| 1738 if (iso->switched_dir_and_no_valid_charset_yet) | |
| 1739 iso->switched_dir_and_no_valid_charset_yet = 0; | |
| 1740 return 1; | |
| 1741 } | |
| 1742 | |
| 1743 /* If FLAGS is a null pointer or specifies right-to-left motion, | |
| 1744 output a switch-dir-to-left-to-right sequence to DST. | |
| 1745 Also update FLAGS if it is not a null pointer. | |
| 1746 If INTERNAL_P is set, we are outputting in internal format and | |
| 1747 need to handle the CSI differently. */ | |
| 1748 | |
| 1749 static void | |
| 1750 restore_left_to_right_direction (Lisp_Object codesys, | |
| 1751 unsigned_char_dynarr *dst, | |
| 1752 unsigned int *flags, | |
| 1753 int internal_p) | |
| 1754 { | |
| 1755 if (!flags || (*flags & ISO_STATE_R2L)) | |
| 1756 { | |
| 1757 if (XCODING_SYSTEM_ISO2022_SEVEN (codesys)) | |
| 1758 { | |
| 1759 Dynarr_add (dst, ISO_CODE_ESC); | |
| 1760 Dynarr_add (dst, '['); | |
| 1761 } | |
| 1762 else if (internal_p) | |
| 1763 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst); | |
| 1764 else | |
| 1765 Dynarr_add (dst, ISO_CODE_CSI); | |
| 1766 Dynarr_add (dst, '0'); | |
| 1767 Dynarr_add (dst, ']'); | |
| 1768 if (flags) | |
| 1769 *flags &= ~ISO_STATE_R2L; | |
| 1770 } | |
| 1771 } | |
| 1772 | |
| 1773 /* If FLAGS is a null pointer or specifies a direction different from | |
| 1774 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or | |
| 1775 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape | |
| 1776 sequence to DST. Also update FLAGS if it is not a null pointer. | |
| 1777 If INTERNAL_P is set, we are outputting in internal format and | |
| 1778 need to handle the CSI differently. */ | |
| 1779 | |
| 1780 static void | |
| 1781 ensure_correct_direction (int direction, Lisp_Object codesys, | |
| 1782 unsigned_char_dynarr *dst, unsigned int *flags, | |
| 1783 int internal_p) | |
| 1784 { | |
| 1785 if ((!flags || (*flags & ISO_STATE_R2L)) && | |
| 1786 direction == CHARSET_LEFT_TO_RIGHT) | |
| 1787 restore_left_to_right_direction (codesys, dst, flags, internal_p); | |
| 1788 else if (!XCODING_SYSTEM_ISO2022_NO_ISO6429 (codesys) | |
| 1789 && (!flags || !(*flags & ISO_STATE_R2L)) && | |
| 1790 direction == CHARSET_RIGHT_TO_LEFT) | |
| 1791 { | |
| 1792 if (XCODING_SYSTEM_ISO2022_SEVEN (codesys)) | |
| 1793 { | |
| 1794 Dynarr_add (dst, ISO_CODE_ESC); | |
| 1795 Dynarr_add (dst, '['); | |
| 1796 } | |
| 1797 else if (internal_p) | |
| 1798 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst); | |
| 1799 else | |
| 1800 Dynarr_add (dst, ISO_CODE_CSI); | |
| 1801 Dynarr_add (dst, '2'); | |
| 1802 Dynarr_add (dst, ']'); | |
| 1803 if (flags) | |
| 1804 *flags |= ISO_STATE_R2L; | |
| 1805 } | |
| 1806 } | |
| 1807 | |
| 4096 | 1808 /* Note that this name conflicts with a function in unicode.c. */ |
| 1809 static void | |
| 1810 decode_unicode_char (int ucs, unsigned_char_dynarr *dst) | |
| 1811 { | |
| 1812 Ibyte work[MAX_ICHAR_LEN]; | |
| 1813 int len; | |
| 1814 Lisp_Object chr; | |
| 1815 | |
| 1816 chr = Funicode_to_char(make_int(ucs), Qnil); | |
| 1817 assert (!NILP(chr)); | |
| 1818 len = set_itext_ichar (work, XCHAR(chr)); | |
| 1819 Dynarr_add_many (dst, work, len); | |
| 1820 } | |
| 1821 | |
| 1822 #define DECODE_ERROR_OCTET(octet, dst) \ | |
| 1823 decode_unicode_char ((octet) + UNICODE_ERROR_OCTET_RANGE_START, dst) | |
| 1824 | |
| 1825 static inline void | |
| 1826 indicate_invalid_utf_8 (unsigned char indicated_length, | |
| 1827 unsigned char counter, | |
| 1828 int ch, unsigned_char_dynarr *dst) | |
| 1829 { | |
| 1830 Binbyte stored = indicated_length - counter; | |
| 1831 Binbyte mask = "\x00\x00\xC0\xE0\xF0\xF8\xFC"[indicated_length]; | |
| 1832 | |
| 1833 while (stored > 0) | |
| 1834 { | |
| 1835 DECODE_ERROR_OCTET (((ch >> (6 * (stored - 1))) & 0x3f) | mask, | |
| 1836 dst); | |
| 1837 mask = 0x80, stored--; | |
| 1838 } | |
| 1839 } | |
| 1840 | |
| 771 | 1841 /* Convert ISO2022-format data to internal format. Reads N bytes of external data from SRC, appends the decoded internal-format text to DST, and returns N. */ |
| 1842 /* The partially-read character (str->ch) and the ISO 2022 state flags are loaded from STR on entry and stored back on exit, so a character or an escape sequence may be split across calls. */ | |
| 1843 static Bytecount | |
| 1844 iso2022_decode (struct coding_stream *str, const UExtbyte *src, | |
| 1845 unsigned_char_dynarr *dst, Bytecount n) | |
| 1846 { | |
| 1847 unsigned int ch = str->ch; | |
| 1848 #ifdef ENABLE_COMPOSITE_CHARS | |
| 1849 unsigned_char_dynarr *real_dst = dst; | |
| 1850 #endif | |
| 1851 struct iso2022_coding_stream *data = | |
| 1852 CODING_STREAM_TYPE_DATA (str, iso2022); | |
| 1853 unsigned int flags = data->flags; | |
| 1854 Bytecount orign = n; | |
| 1855 | |
| 1856 #ifdef ENABLE_COMPOSITE_CHARS | |
| 1857 if (flags & ISO_STATE_COMPOSITE) | |
| 1858 dst = data->composite_chars; | |
| 1859 #endif /* ENABLE_COMPOSITE_CHARS */ | |
| 1860 /* Each input byte is handled by exactly one of four branches: inside an escape sequence, UTF-8 mode, control character, or graphic character. */ | |
| 1861 while (n--) | |
| 1862 { | |
| 1863 UExtbyte c = *src++; | |
| 1864 if (flags & ISO_STATE_ESCAPE) | |
| 1865 { /* Within ESC sequence */ | |
| 1866 int retval = parse_iso2022_esc (str->codesys, data, | |
| 1867 c, &flags, 1); | |
| 1868 | |
| 1869 if (retval) | |
| 1870 { | |
| 1871 switch (data->esc) | |
| 1872 { | |
| 1873 #ifdef ENABLE_COMPOSITE_CHARS | |
| 1874 case ISO_ESC_START_COMPOSITE: | |
| 1875 if (data->composite_chars) | |
| 1876 Dynarr_reset (data->composite_chars); | |
| 1877 else | |
| 1878 data->composite_chars = Dynarr_new (unsigned_char); | |
| 1879 dst = data->composite_chars; | |
| 1880 break; | |
| 1881 case ISO_ESC_END_COMPOSITE: | |
| 1882 { | |
| 867 | 1883 Ibyte comstr[MAX_ICHAR_LEN]; |
| 771 | 1884 Bytecount len; |
| 867 | 1885 Ichar emch = lookup_composite_char (Dynarr_atp (dst, 0), |
| 771 | 1886 Dynarr_length (dst)); |
| 1887 dst = real_dst; | |
| 867 | 1888 len = set_itext_ichar (comstr, emch); |
| 771 | 1889 Dynarr_add_many (dst, comstr, len); |
| 1890 break; | |
| 1891 } | |
| 1892 #else | |
| 1893 case ISO_ESC_START_COMPOSITE: | |
| 1894 { | |
| 867 | 1895 Ibyte comstr[MAX_ICHAR_LEN]; |
| 771 | 1896 Bytecount len; |
| 867 | 1897 Ichar emch = make_ichar (Vcharset_composite, c - '0' + ' ', |
| 771 | 1898 0); |
| 867 | 1899 len = set_itext_ichar (comstr, emch); |
| 771 | 1900 Dynarr_add_many (dst, comstr, len); |
| 1901 break; | |
| 1902 } | |
| 1903 #endif /* ENABLE_COMPOSITE_CHARS */ | |
| 1904 | |
| 1905 case ISO_ESC_LITERAL: | |
| 1906 DECODE_ADD_BINARY_CHAR (c, dst); | |
| 1907 break; | |
| 1908 | |
| 1909 default: | |
| 1910 /* Everything else handled already */ | |
| 1911 break; | |
| 1912 } | |
| 1913 } | |
| 1914 | |
| 1915 /* Attempted error recovery. */ | |
| 1916 if (data->output_direction_sequence) | |
| 1917 ensure_correct_direction (flags & ISO_STATE_R2L ? | |
| 1918 CHARSET_RIGHT_TO_LEFT : | |
| 1919 CHARSET_LEFT_TO_RIGHT, | |
| 1920 str->codesys, dst, 0, 1); | |
| 1921 /* More error recovery. */ | |
| 1922 if (!retval || data->output_literally) | |
| 1923 { | |
| 1924 /* Output the (possibly invalid) sequence */ | |
| 1925 int i; | |
| 1926 for (i = 0; i < data->esc_bytes_index; i++) | |
| 1927 DECODE_ADD_BINARY_CHAR (data->esc_bytes[i], dst); | |
| 1928 flags &= ISO_STATE_LOCK; | |
| 1929 if (!retval) | |
| 1930 n++, src--;/* Repeat the loop with the same character. */ | |
| 1931 else | |
| 1932 { | |
| 1933 /* No sense in reprocessing the final byte of the | |
| 1934 escape sequence; it could mess things up anyway. | |
| 1935 Just add it now. */ | |
| 1936 DECODE_ADD_BINARY_CHAR (c, dst); | |
| 1937 } | |
| 1938 } | |
| 1939 ch = 0; | |
| 1940 } | |
| 3439 | 1941 else if (flags & ISO_STATE_UTF_8) |
| 1942 { | |
| 1943 unsigned char counter = data->counter; | |
| 4096 | 1944 unsigned char indicated_length = data->indicated_length; |
| 3439 | 1945 |
| 1946 if (ISO_CODE_ESC == c) | |
| 1947 { | |
| 1948 /* Allow the escape sequence parser to end the UTF-8 state. */ | |
| 1949 flags |= ISO_STATE_ESCAPE; | |
| 1950 data->esc = ISO_ESC; | |
| 1951 data->esc_bytes_index = 1; | |
| 1952 continue; /* skips the write-back of counter and indicated_length below; neither has been modified yet */ | |
| 1953 } | |
| 1954 | |
| 4096 | 1955 if (0 == counter) |
| 1956 { | |
| 1957 if (0 == (c & 0x80)) | |
| 1958 { | |
| 1959 /* ASCII. */ | |
| 1960 decode_unicode_char (c, dst); | |
| 1961 } | |
| 1962 else if (0 == (c & 0x40)) | |
| 1963 { | |
| 1964 /* Highest bit set, second highest not--there's | |
| 1965 something wrong. */ | |
| 1966 DECODE_ERROR_OCTET (c, dst); | |
| 1967 } | |
| 1968 else if (0 == (c & 0x20)) | |
| 1969 { | |
| 1970 ch = c & 0x1f; | |
| 1971 counter = 1; | |
| 1972 indicated_length = 2; | |
| 1973 } | |
| 1974 else if (0 == (c & 0x10)) | |
| 1975 { | |
| 1976 ch = c & 0x0f; | |
| 1977 counter = 2; | |
| 1978 indicated_length = 3; | |
| 1979 } | |
| 1980 else if (0 == (c & 0x08)) | |
| 1981 { | |
| 1982 ch = c & 0x0f; | |
| 1983 counter = 3; | |
| 1984 indicated_length = 4; | |
| 1985 } | |
| 1986 /* We support lengths longer than 4 here, since we want to | |
| 1987 represent UTF-8 error chars as distinct from the | |
| 1988 corresponding ISO 8859-1 characters in escape-quoted. | |
| 1989 | |
| 1990 However, we can't differentiate UTF-8 error chars as | |
| 1991 written to disk, and UTF-8 errors in escape-quoted. This | |
| 1992 is not a big problem; | |
| 1993 non-Unicode-chars-encoded-as-UTF-8-in-ISO-2022 is not | |
| 1994 deployed, in practice, so if such a sequence of octets | |
| 1995 occurs, XEmacs generated it. */ | |
| 1996 else if (0 == (c & 0x04)) | |
| 1997 { | |
| 1998 ch = c & 0x03; | |
| 1999 counter = 4; | |
| 2000 indicated_length = 5; | |
| 2001 } | |
| 2002 else if (0 == (c & 0x02)) | |
| 2003 { | |
| 2004 ch = c & 0x01; | |
| 2005 counter = 5; | |
| 2006 indicated_length = 6; | |
| 2007 } | |
| 2008 else | |
| 2009 { | |
| 2010 /* #xFE and #xFF are not valid leading bytes in any form of | |
| 2011 UTF-8. */ | |
| 2012 DECODE_ERROR_OCTET (c, dst); | |
| 2013 | |
| 2014 } | |
| 2015 } | |
| 2016 else | |
| 2017 { | |
| 2018 /* counter != 0 */ | |
| 2019 if ((0 == (c & 0x80)) || (0 != (c & 0x40))) | |
| 2020 { | |
| 2021 indicate_invalid_utf_8(indicated_length, | |
| 2022 counter, | |
| 2023 ch, dst); | |
| 2024 if (c & 0x80) | |
| 2025 { | |
| 2026 DECODE_ERROR_OCTET (c, dst); | |
| 2027 } | |
| 2028 else | |
| 2029 { | |
| 2030 /* The character just read is ASCII. Treat it as | |
| 2031 such. */ | |
| 2032 decode_unicode_char (c, dst); | |
| 2033 } | |
| 2034 ch = 0; | |
| 2035 counter = 0; | |
| 2036 } | |
| 2037 else | |
| 2038 { | |
| 2039 ch = (ch << 6) | (c & 0x3f); | |
| 2040 counter--; | |
| 2041 | |
| 2042 /* Just processed the final byte. Emit the character. */ | |
| 2043 if (!counter) | |
| 2044 { | |
| 2045 /* Don't accept over-long sequences, or surrogates. */ | |
| 2046 if ((ch < 0x80) || | |
| 2047 ((ch < 0x800) && indicated_length > 2) || | |
| 2048 ((ch < 0x10000) && indicated_length > 3) || | |
| 2049 /* We accept values above #x110000 in | |
| 2050 escape-quoted, though not in UTF-8. */ | |
| 2051 /* (ch > 0x110000) || */ | |
| 2052 valid_utf_16_surrogate(ch)) | |
| 2053 { | |
| 2054 indicate_invalid_utf_8(indicated_length, | |
| 2055 counter, | |
| 2056 ch, dst); | |
| 2057 } | |
| 2058 else | |
| 2059 { | |
| 2060 decode_unicode_char (ch, dst); | |
| 2061 } | |
| 2062 ch = 0; | |
| 2063 } | |
| 2064 } | |
| 2065 } | |
| 2066 /* NOTE(review): this EOF check runs once per input byte rather than once after the loop, passes the accumulated payload bits in CH instead of an input octet, and leaves COUNTER untouched -- confirm this is intended. */ | |
| 2067 if (str->eof && ch) | |
| 2068 { | |
| 2069 DECODE_ERROR_OCTET (ch, dst); | |
| 2070 ch = 0; | |
| 2071 } | |
| 3439 | 2072 |
| 2073 data->counter = counter; | |
| 4096 | 2074 data->indicated_length = indicated_length; |
| 3439 | 2075 } |
| 826 | 2076 else if (byte_c0_p (c) || byte_c1_p (c)) |
| 771 | 2077 { /* Control characters */ |
| 2078 | |
| 2079 /***** Error-handling *****/ | |
| 2080 | |
| 2081 /* If we were in the middle of a character, dump out the | |
| 2082 partial character. */ | |
| 2083 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst); | |
| 2084 | |
| 2085 /* If we just saw a single-shift character, dump it out. | |
| 2086 This may dump out the wrong sort of single-shift character, | |
| 2087 but at least it will give an indication that something went | |
| 2088 wrong. */ | |
| 2089 if (flags & ISO_STATE_SS2) | |
| 2090 { | |
| 2091 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst); | |
| 2092 flags &= ~ISO_STATE_SS2; | |
| 2093 } | |
| 2094 if (flags & ISO_STATE_SS3) | |
| 2095 { | |
| 2096 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst); | |
| 2097 flags &= ~ISO_STATE_SS3; | |
| 2098 } | |
| 2099 | |
| 2100 /***** Now handle the control characters. *****/ | |
| 2101 | |
| 2102 flags &= ISO_STATE_LOCK; | |
| 2103 /* A zero return from the escape parser means the control character was not consumed by it, so it is copied to the output. */ | |
| 2104 if (!parse_iso2022_esc (str->codesys, data, c, &flags, 1)) | |
| 2105 DECODE_ADD_BINARY_CHAR (c, dst); | |
| 2106 } | |
| 2107 else | |
| 2108 { /* Graphic characters */ | |
| 2109 Lisp_Object charset; | |
| 2110 int lb; | |
| 2111 int reg; | |
| 2112 | |
| 2113 /* Now determine the charset. */ | |
| 2114 reg = ((flags & ISO_STATE_SS2) ? 2 | |
| 2115 : (flags & ISO_STATE_SS3) ? 3 | |
| 826 | 2116 : !byte_ascii_p (c) ? data->register_right |
| 771 | 2117 : data->register_left); |
| 2118 charset = data->charset[reg]; | |
| 2119 | |
| 2120 /* Error checking: */ | |
| 2121 if (! CHARSETP (charset) | |
| 2122 || data->invalid_designated[reg] | |
| 2123 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL) | |
| 2124 && XCHARSET_CHARS (charset) == 94)) | |
| 2125 /* Mrmph. We are trying to invoke a register that has no | |
| 2126 or an invalid charset in it, or trying to add a character | |
| 2127 outside the range of the charset. Insert that char literally | |
| 2128 to preserve it for the output. */ | |
| 2129 { | |
| 2130 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst); | |
| 2131 DECODE_ADD_BINARY_CHAR (c, dst); | |
| 2132 } | |
| 2133 | |
| 2134 else | |
| 2135 { | |
| 2136 /* Things are probably hunky-dory. */ | |
| 2137 | |
| 2138 /* Fetch reverse charset, maybe. */ | |
| 2139 if (((flags & ISO_STATE_R2L) && | |
| 2140 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT) | |
| 2141 || | |
| 2142 (!(flags & ISO_STATE_R2L) && | |
| 2143 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT)) | |
| 2144 { | |
| 2145 Lisp_Object new_charset = | |
| 2146 XCHARSET_REVERSE_DIRECTION_CHARSET (charset); | |
| 2147 if (!NILP (new_charset)) | |
| 2148 charset = new_charset; | |
| 2149 } | |
| 2150 | |
| 2151 lb = XCHARSET_LEADING_BYTE (charset); | |
| 2152 switch (XCHARSET_REP_BYTES (charset)) | |
| 2153 { | |
| 2154 case 1: /* ASCII */ | |
| 2155 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst); | |
| 2156 Dynarr_add (dst, c & 0x7F); | |
| 2157 break; | |
| 2158 | |
| 2159 case 2: /* one-byte official */ | |
| 2160 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst); | |
| 2161 Dynarr_add (dst, lb); | |
| 2162 Dynarr_add (dst, c | 0x80); | |
| 2163 break; | |
| 2164 | |
| 2165 case 3: /* one-byte private or two-byte official */ | |
| 2166 if (XCHARSET_PRIVATE_P (charset)) | |
| 2167 { | |
| 2168 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst); | |
| 2169 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1); | |
| 2170 Dynarr_add (dst, lb); | |
| 2171 Dynarr_add (dst, c | 0x80); | |
| 2172 } | |
| 2173 else | |
| 2174 { | |
| 2175 if (ch) | |
| 2176 { | |
| 2177 Dynarr_add (dst, lb); | |
| 2178 Dynarr_add (dst, ch | 0x80); | |
| 2179 Dynarr_add (dst, c | 0x80); | |
| 2180 ch = 0; | |
| 2181 } | |
| 2182 else | |
| 2183 ch = c; /* first byte of a two-byte character; hold it until the second byte arrives */ | |
| 2184 } | |
| 2185 break; | |
| 2186 | |
| 2187 default: /* two-byte private */ | |
| 2188 if (ch) | |
| 2189 { | |
| 2190 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2); | |
| 2191 Dynarr_add (dst, lb); | |
| 2192 Dynarr_add (dst, ch | 0x80); | |
| 2193 Dynarr_add (dst, c | 0x80); | |
| 2194 ch = 0; | |
| 2195 } | |
| 2196 else | |
| 2197 ch = c; | |
| 2198 } | |
| 2199 } | |
| 2200 /* With no partial character pending, keep only the ISO_STATE_LOCK bits of FLAGS. */ | |
| 2201 if (!ch) | |
| 2202 flags &= ISO_STATE_LOCK; | |
| 2203 } | |
| 2204 | |
| 2205 } | |
| 2206 | |
| 2207 if (str->eof) | |
| 2208 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst); | |
| 2209 /* Save the state needed to resume on the next call. */ | |
| 2210 data->flags = flags; | |
| 2211 str->ch = ch; | |
| 2212 return orign; | |
| 2213 } | |
| 2214 | |
| 2215 | |
| 2216 /***** ISO2022 encoder *****/ | |
| 2217 | |
| 2218 /* Designate CHARSET into register REG. */ | |
| 2219 | |
| 2220 static void | |
| 2221 iso2022_designate (Lisp_Object charset, int reg, | |
| 2222 struct coding_stream *str, unsigned_char_dynarr *dst) | |
| 2223 { | |
| 2224 static const char inter94[] = "()*+"; | |
| 2225 static const char inter96[] = ",-./"; | |
| 2226 int type; | |
| 2227 unsigned char final; | |
| 2228 struct iso2022_coding_stream *data = | |
| 2229 CODING_STREAM_TYPE_DATA (str, iso2022); | |
| 2230 Lisp_Object old_charset = data->charset[reg]; | |
| 2231 | |
| 2232 data->charset[reg] = charset; | |
| 2233 if (!CHARSETP (charset)) | |
| 2234 /* charset might be an initial nil or t. */ | |
| 2235 return; | |
| 2236 type = XCHARSET_TYPE (charset); | |
| 2237 final = XCHARSET_FINAL (charset); | |
| 2238 if (!data->force_charset_on_output[reg] && | |
| 2239 CHARSETP (old_charset) && | |
| 2240 XCHARSET_TYPE (old_charset) == type && | |
| 2241 XCHARSET_FINAL (old_charset) == final) | |
| 2242 return; | |
| 2243 | |
| 2244 data->force_charset_on_output[reg] = 0; | |
| 2245 | |
| 2246 { | |
| 2247 charset_conversion_spec_dynarr *dyn = | |
| 2248 XCODING_SYSTEM_ISO2022_OUTPUT_CONV (str->codesys); | |
| 2249 | |
| 2250 if (dyn) | |
| 2251 { | |
| 2252 int i; | |
| 2253 | |
| 2254 for (i = 0; i < Dynarr_length (dyn); i++) | |
| 2255 { | |
| 2256 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i); | |
| 2257 if (EQ (charset, spec->from_charset)) | |
| 2258 charset = spec->to_charset; | |
| 2259 } | |
| 2260 } | |
| 2261 } | |
| 2262 | |
| 2263 Dynarr_add (dst, ISO_CODE_ESC); | |
| 3439 | 2264 |
| 771 | 2265 switch (type) |
| 2266 { | |
| 2267 case CHARSET_TYPE_94: | |
| 2268 Dynarr_add (dst, inter94[reg]); | |
| 2269 break; | |
| 2270 case CHARSET_TYPE_96: | |
| 2271 Dynarr_add (dst, inter96[reg]); | |
| 2272 break; | |
| 2273 case CHARSET_TYPE_94X94: | |
| 2274 Dynarr_add (dst, '$'); | |
| 2275 if (reg != 0 | |
| 2276 || !(XCODING_SYSTEM_ISO2022_SHORT (str->codesys)) | |
| 2277 || final < '@' | |
| 2278 || final > 'B') | |
| 2279 Dynarr_add (dst, inter94[reg]); | |
| 2280 break; | |
| 2281 case CHARSET_TYPE_96X96: | |
| 2282 Dynarr_add (dst, '$'); | |
| 2283 Dynarr_add (dst, inter96[reg]); | |
| 2284 break; | |
| 2285 } | |
| 2286 Dynarr_add (dst, final); | |
| 2287 } | |
| 2288 | |
| 2289 static void | |
| 2290 ensure_normal_shift (struct coding_stream *str, unsigned_char_dynarr *dst) | |
| 2291 { | |
| 2292 struct iso2022_coding_stream *data = | |
| 2293 CODING_STREAM_TYPE_DATA (str, iso2022); | |
| 2294 | |
| 2295 if (data->register_left != 0) | |
| 2296 { | |
| 2297 Dynarr_add (dst, ISO_CODE_SI); | |
| 2298 data->register_left = 0; | |
| 2299 } | |
| 2300 } | |
| 2301 | |
| 2302 static void | |
| 2303 ensure_shift_out (struct coding_stream *str, unsigned_char_dynarr *dst) | |
| 2304 { | |
| 2305 struct iso2022_coding_stream *data = | |
| 2306 CODING_STREAM_TYPE_DATA (str, iso2022); | |
| 2307 | |
| 2308 if (data->register_left != 1) | |
| 2309 { | |
| 2310 Dynarr_add (dst, ISO_CODE_SO); | |
| 2311 data->register_left = 1; | |
| 2312 } | |
| 2313 } | |
| 2314 | |
| 2315 /* Convert internally-formatted data to ISO2022 format. */ | |
| 2316 | |
| 2317 static Bytecount | |
| 867 | 2318 iso2022_encode (struct coding_stream *str, const Ibyte *src, |
| 771 | 2319 unsigned_char_dynarr *dst, Bytecount n) |
| 2320 { | |
| 2321 unsigned char charmask; | |
| 867 | 2322 Ibyte c; |
| 771 | 2323 unsigned char char_boundary; |
| 2324 unsigned int ch = str->ch; | |
| 2325 Lisp_Object codesys = str->codesys; | |
| 2326 int i; | |
| 2327 Lisp_Object charset; | |
| 2328 int half; | |
| 2329 struct iso2022_coding_stream *data = | |
| 2330 CODING_STREAM_TYPE_DATA (str, iso2022); | |
| 2331 unsigned int flags = data->flags; | |
| 2332 Bytecount orign = n; | |
| 2333 | |
| 2334 #ifdef ENABLE_COMPOSITE_CHARS | |
| 2335 /* flags for handling composite chars. We do a little switcheroo | |
| 2336 on the source while we're outputting the composite char. */ | |
| 2337 Bytecount saved_n = 0; | |
| 867 | 2338 const Ibyte *saved_src = NULL; |
| 771 | 2339 int in_composite = 0; |
| 2340 #endif /* ENABLE_COMPOSITE_CHARS */ | |
| 2341 | |
| 2342 char_boundary = data->current_char_boundary; | |
| 2343 charset = data->current_charset; | |
| 2344 half = data->current_half; | |
| 2345 | |
| 2346 #ifdef ENABLE_COMPOSITE_CHARS | |
| 2347 back_to_square_n: | |
| 2348 #endif | |
| 2349 while (n--) | |
| 2350 { | |
| 2351 c = *src++; | |
| 2352 | |
| 826 | 2353 if (byte_ascii_p (c)) |
| 771 | 2354 { /* Processing ASCII character */ |
| 2355 ch = 0; | |
| 2356 | |
| 3439 | 2357 if (flags & ISO_STATE_UTF_8) |
| 2358 { | |
| 2359 Dynarr_add (dst, ISO_CODE_ESC); | |
| 2360 Dynarr_add (dst, '%'); | |
| 2361 Dynarr_add (dst, '@'); | |
| 2362 flags &= ~(ISO_STATE_UTF_8); | |
| 2363 } | |
| 2364 | |
| 771 | 2365 restore_left_to_right_direction (codesys, dst, &flags, 0); |
| 2366 | |
| 2367 /* Make sure G0 contains ASCII */ | |
| 2368 if ((c > ' ' && c < ISO_CODE_DEL) || | |
| 2369 !XCODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys)) | |
| 2370 { | |
| 2371 ensure_normal_shift (str, dst); | |
| 2372 iso2022_designate (Vcharset_ascii, 0, str, dst); | |
| 2373 } | |
| 2374 | |
| 2375 /* If necessary, restore everything to the default state | |
| 2376 at end-of-line */ | |
| 2377 if (!(XCODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys))) | |
| 2378 { | |
| 2379 /* NOTE: CRLF encoding happens *BEFORE* other encoding. | |
| 2380 Thus, even though we're working with internal-format | |
| 2381 data, there may be CR's or CRLF sequences representing | |
| 2382 newlines. */ | |
| 2383 if (c == '\r' || (c == '\n' && !(flags & ISO_STATE_CR))) | |
| 2384 { | |
| 2385 restore_left_to_right_direction (codesys, dst, &flags, 0); | |
| 2386 | |
| 2387 ensure_normal_shift (str, dst); | |
| 2388 | |
| 2389 for (i = 0; i < 4; i++) | |
| 2390 { | |
| 2391 Lisp_Object initial_charset = | |
| 2392 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i); | |
| 2393 iso2022_designate (initial_charset, i, str, dst); | |
| 2394 } | |
| 2395 } | |
| 2396 if (c == '\r') | |
| 2397 flags |= ISO_STATE_CR; | |
| 2398 else | |
| 2399 flags &= ~ISO_STATE_CR; | |
| 2400 } | |
| 2401 | |
| 2402 if (XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys) | |
| 2403 && fit_to_be_escape_quoted (c)) | |
| 2404 Dynarr_add (dst, ISO_CODE_ESC); | |
| 2405 Dynarr_add (dst, c); | |
| 2406 char_boundary = 1; | |
| 2407 } | |
| 867 | 2408 else if (ibyte_leading_byte_p (c) || ibyte_leading_byte_p (ch)) |
| 771 | 2409 { /* Processing Leading Byte */ |
| 2410 ch = 0; | |
| 826 | 2411 charset = charset_by_leading_byte (c); |
| 2412 if (leading_byte_prefix_p (c)) | |
| 3439 | 2413 { |
| 2414 ch = c; | |
| 2415 } | |
| 2416 else if (XCHARSET_ENCODE_AS_UTF_8 (charset)) | |
| 2417 { | |
| 2418 assert (!EQ (charset, Vcharset_control_1) | |
| 2419 && !EQ (charset, Vcharset_composite)); | |
| 2420 | |
| 2421 /* If the character set is to be encoded as UTF-8, the escape | |
| 2422 is always the same. */ | |
| 2423 if (!(flags & ISO_STATE_UTF_8)) | |
| 2424 { | |
| 2425 Dynarr_add (dst, ISO_CODE_ESC); | |
| 2426 Dynarr_add (dst, '%'); | |
| 2427 Dynarr_add (dst, 'G'); | |
| 2428 flags |= ISO_STATE_UTF_8; | |
| 2429 } | |
| 2430 } | |
| 771 | 2431 else if (!EQ (charset, Vcharset_control_1) |
| 2432 && !EQ (charset, Vcharset_composite)) | |
| 2433 { | |
| 2434 int reg; | |
| 2435 | |
| 3439 | 2436 /* End the UTF-8 state. */ |
| 2437 if (flags & ISO_STATE_UTF_8) | |
| 2438 { | |
| 2439 Dynarr_add (dst, ISO_CODE_ESC); | |
| 2440 Dynarr_add (dst, '%'); | |
| 2441 Dynarr_add (dst, '@'); | |
| 2442 flags &= ~(ISO_STATE_UTF_8); | |
| 2443 } | |
| 2444 | |
| 771 | 2445 ensure_correct_direction (XCHARSET_DIRECTION (charset), |
| 2446 codesys, dst, &flags, 0); | |
| 2447 | |
| 2448 /* Now determine which register to use. */ | |
| 2449 reg = -1; | |
| 2450 for (i = 0; i < 4; i++) | |
| 2451 { | |
| 2452 if (EQ (charset, data->charset[i]) || | |
| 2453 EQ (charset, | |
| 2454 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))) | |
| 2455 { | |
| 2456 reg = i; | |
| 2457 break; | |
| 2458 } | |
| 2459 } | |
| 2460 | |
| 2461 if (reg == -1) | |
| 2462 { | |
| 2463 if (XCHARSET_GRAPHIC (charset) != 0) | |
| 2464 { | |
| 2465 if (!NILP (data->charset[1]) && | |
| 2466 (!XCODING_SYSTEM_ISO2022_SEVEN (codesys) || | |
| 2467 XCODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys))) | |
| 2468 reg = 1; | |
| 2469 else if (!NILP (data->charset[2])) | |
| 2470 reg = 2; | |
| 2471 else if (!NILP (data->charset[3])) | |
| 2472 reg = 3; | |
| 2473 else | |
| 2474 reg = 0; | |
| 2475 } | |
| 2476 else | |
| 2477 reg = 0; | |
| 2478 } | |
| 2479 | |
| 2480 iso2022_designate (charset, reg, str, dst); | |
| 2481 | |
| 2482 /* Now invoke that register. */ | |
| 2483 switch (reg) | |
| 2484 { | |
| 2485 case 0: | |
| 2486 ensure_normal_shift (str, dst); | |
| 2487 half = 0; | |
| 2488 break; | |
| 2489 | |
| 2490 case 1: | |
| 2491 if (XCODING_SYSTEM_ISO2022_SEVEN (codesys)) | |
| 2492 { | |
| 2493 ensure_shift_out (str, dst); | |
| 2494 half = 0; | |
| 2495 } | |
| 2496 else | |
| 2497 half = 1; | |
| 2498 break; | |
| 2499 | |
| 2500 case 2: | |
| 2501 if (XCODING_SYSTEM_ISO2022_SEVEN (str->codesys)) | |
| 2502 { | |
| 2503 Dynarr_add (dst, ISO_CODE_ESC); | |
| 2504 Dynarr_add (dst, 'N'); | |
| 2505 half = 0; | |
| 2506 } | |
| 2507 else | |
| 2508 { | |
| 2509 Dynarr_add (dst, ISO_CODE_SS2); | |
| 2510 half = 1; | |
| 2511 } | |
| 2512 break; | |
| 2513 | |
| 2514 case 3: | |
| 2515 if (XCODING_SYSTEM_ISO2022_SEVEN (str->codesys)) | |
| 2516 { | |
| 2517 Dynarr_add (dst, ISO_CODE_ESC); | |
| 2518 Dynarr_add (dst, 'O'); | |
| 2519 half = 0; | |
| 2520 } | |
| 2521 else | |
| 2522 { | |
| 2523 Dynarr_add (dst, ISO_CODE_SS3); | |
| 2524 half = 1; | |
| 2525 } | |
| 2526 break; | |
| 2527 | |
| 2528 default: | |
| 2500 | 2529 ABORT (); |
| 771 | 2530 } |
| 2531 } | |
| 2532 char_boundary = 0; | |
| 2533 } | |
| 2534 else | |
| 2535 { /* Processing Non-ASCII character */ | |
| 2536 charmask = (half == 0 ? 0x7F : 0xFF); | |
| 2537 char_boundary = 1; | |
| 2538 if (EQ (charset, Vcharset_control_1)) | |
| 2539 { | |
| 2540 if (XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys) | |
| 2541 && fit_to_be_escape_quoted (c)) | |
| 2542 Dynarr_add (dst, ISO_CODE_ESC); | |
| 2543 /* you asked for it ... */ | |
| 2544 Dynarr_add (dst, c - 0x20); | |
| 2545 } | |
| 2546 #ifndef ENABLE_COMPOSITE_CHARS | |
| 2547 else if (EQ (charset, Vcharset_composite)) | |
| 2548 { | |
| 2549 if (c >= 160 || c <= 164) /* Someone might have stuck in | |
| 2550 something else */ | |
| 2551 { | |
| 2552 Dynarr_add (dst, ISO_CODE_ESC); | |
| 2553 Dynarr_add (dst, c - 160 + '0'); | |
| 2554 } | |
| 2555 } | |
| 2556 #endif | |
| 2557 else | |
| 2558 { | |
| 2559 switch (XCHARSET_REP_BYTES (charset)) | |
| 2560 { | |
| 2561 case 2: | |
| 3439 | 2562 dynarr_add_2022_one_dimension (charset, c, |
| 2563 charmask, dst); | |
| 771 | 2564 break; |
| 2565 case 3: | |
| 2566 if (XCHARSET_PRIVATE_P (charset)) | |
| 2567 { | |
| 3439 | 2568 dynarr_add_2022_one_dimension (charset, c, |
| 2569 charmask, dst); | |
| 771 | 2570 ch = 0; |
| 2571 } | |
| 2572 else if (ch) | |
| 2573 { | |
| 2574 #ifdef ENABLE_COMPOSITE_CHARS | |
| 2575 if (EQ (charset, Vcharset_composite)) | |
| 2576 { | |
| 3439 | 2577 /* #### Hasn't been written to handle composite |
| 2578 characters yet. */ | |
| 2579 assert(!XCHARSET_ENCODE_AS_UTF_8 (charset)) | |
| 771 | 2580 if (in_composite) |
| 2581 { | |
| 2582 /* #### Bother! We don't know how to | |
| 2583 handle this yet. */ | |
| 2584 Dynarr_add (dst, '~'); | |
| 2585 } | |
| 2586 else | |
| 2587 { | |
| 867 | 2588 Ichar emch = make_ichar (Vcharset_composite, |
| 771 | 2589 ch & 0x7F, c & 0x7F); |
| 2590 Lisp_Object lstr = composite_char_string (emch); | |
| 2591 saved_n = n; | |
| 2592 saved_src = src; | |
| 2593 in_composite = 1; | |
| 2594 src = XSTRING_DATA (lstr); | |
| 2595 n = XSTRING_LENGTH (lstr); | |
| 2596 Dynarr_add (dst, ISO_CODE_ESC); | |
| 2597 Dynarr_add (dst, '0'); /* start composing */ | |
| 2598 } | |
| 2599 } | |
| 2600 else | |
| 2601 #endif /* ENABLE_COMPOSITE_CHARS */ | |
| 2602 { | |
| 3439 | 2603 dynarr_add_2022_two_dimensions (charset, c, ch, |
| 2604 charmask, dst); | |
| 771 | 2605 } |
| 2606 ch = 0; | |
| 2607 } | |
| 2608 else | |
| 2609 { | |
| 2610 ch = c; | |
| 2611 char_boundary = 0; | |
| 2612 } | |
| 2613 break; | |
| 2614 case 4: | |
| 2615 if (ch) | |
| 2616 { | |
| 3439 | 2617 dynarr_add_2022_two_dimensions (charset, c, ch, |
| 2618 charmask, dst); | |
| 771 | 2619 ch = 0; |
| 2620 } | |
| 2621 else | |
| 2622 { | |
| 2623 ch = c; | |
| 2624 char_boundary = 0; | |
| 2625 } | |
| 2626 break; | |
| 2627 default: | |
| 2500 | 2628 ABORT (); |
| 771 | 2629 } |
| 2630 } | |
| 2631 } | |
| 2632 } | |
| 2633 | |
| 2634 #ifdef ENABLE_COMPOSITE_CHARS | |
| 2635 if (in_composite) | |
| 2636 { | |
| 2637 n = saved_n; | |
| 2638 src = saved_src; | |
| 2639 in_composite = 0; | |
| 2640 Dynarr_add (dst, ISO_CODE_ESC); | |
| 2641 Dynarr_add (dst, '1'); /* end composing */ | |
| 2642 goto back_to_square_n; /* Wheeeeeeeee ..... */ | |
| 2643 } | |
| 2644 #endif /* ENABLE_COMPOSITE_CHARS */ | |
| 2645 | |
| 2646 if (char_boundary && str->eof) | |
| 2647 { | |
| 2648 restore_left_to_right_direction (codesys, dst, &flags, 0); | |
| 2649 ensure_normal_shift (str, dst); | |
| 2650 for (i = 0; i < 4; i++) | |
| 2651 { | |
| 2652 Lisp_Object initial_charset = | |
| 2653 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i); | |
| 2654 iso2022_designate (initial_charset, i, str, dst); | |
| 2655 } | |
| 2656 } | |
| 2657 | |
| 2658 data->flags = flags; | |
| 2659 str->ch = ch; | |
| 2660 data->current_char_boundary = char_boundary; | |
| 2661 data->current_charset = charset; | |
| 2662 data->current_half = half; | |
| 2663 | |
| 2664 /* Verbum caro factum est! */ | |
| 2665 return orign; | |
| 2666 } | |
| 2667 | |
| 2668 static Bytecount | |
| 2669 iso2022_convert (struct coding_stream *str, | |
| 2670 const UExtbyte *src, | |
| 2671 unsigned_char_dynarr *dst, Bytecount n) | |
| 2672 { | |
| 2673 if (str->direction == CODING_DECODE) | |
| 2674 return iso2022_decode (str, src, dst, n); | |
| 2675 else | |
| 2676 return iso2022_encode (str, src, dst, n); | |
| 2677 } | |
| 2678 | |
| 2679 static void | |
| 2680 iso2022_mark (Lisp_Object codesys) | |
| 2681 { | |
| 2682 int i; | |
| 2683 | |
| 2684 for (i = 0; i < 4; i++) | |
| 2685 mark_object (XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)); | |
| 2686 if (XCODING_SYSTEM_ISO2022_INPUT_CONV (codesys)) | |
| 2687 { | |
| 2688 for (i = 0; | |
| 2689 i < Dynarr_length (XCODING_SYSTEM_ISO2022_INPUT_CONV (codesys)); | |
| 2690 i++) | |
| 2691 { | |
| 2692 struct charset_conversion_spec *ccs = | |
| 2693 Dynarr_atp (XCODING_SYSTEM_ISO2022_INPUT_CONV (codesys), i); | |
| 2694 mark_object (ccs->from_charset); | |
| 2695 mark_object (ccs->to_charset); | |
| 2696 } | |
| 2697 } | |
| 2698 if (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (codesys)) | |
| 2699 { | |
| 2700 for (i = 0; | |
| 2701 i < Dynarr_length (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (codesys)); | |
| 2702 i++) | |
| 2703 { | |
| 2704 struct charset_conversion_spec *ccs = | |
| 2705 Dynarr_atp (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (codesys), i); | |
| 2706 mark_object (ccs->from_charset); | |
| 2707 mark_object (ccs->to_charset); | |
| 2708 } | |
| 2709 } | |
| 2710 } | |
| 2711 | |
| 2712 static void | |
| 2713 iso2022_finalize (Lisp_Object cs) | |
| 2714 { | |
| 2715 if (XCODING_SYSTEM_ISO2022_INPUT_CONV (cs)) | |
| 2716 { | |
| 2717 Dynarr_free (XCODING_SYSTEM_ISO2022_INPUT_CONV (cs)); | |
| 2718 XCODING_SYSTEM_ISO2022_INPUT_CONV (cs) = 0; | |
| 2719 } | |
| 2720 if (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (cs)) | |
| 2721 { | |
| 2722 Dynarr_free (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (cs)); | |
| 2723 XCODING_SYSTEM_ISO2022_OUTPUT_CONV (cs) = 0; | |
| 2724 } | |
| 2725 } | |
| 2726 | |
| 2727 /* Given a list of charset conversion specs as specified in a Lisp | |
| 2728 program, parse it into STORE_HERE. */ | |
| 2729 | |
| 2730 static void | |
| 2731 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here, | |
| 2732 Lisp_Object spec_list) | |
| 2733 { | |
| 2367 | 2734 EXTERNAL_LIST_LOOP_2 (car, spec_list) |
| 771 | 2735 { |
| 2736 Lisp_Object from, to; | |
| 2737 struct charset_conversion_spec spec; | |
| 2738 | |
| 2739 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car)))) | |
| 2740 invalid_argument ("Invalid charset conversion spec", car); | |
| 2741 from = Fget_charset (XCAR (car)); | |
| 2742 to = Fget_charset (XCAR (XCDR (car))); | |
| 2743 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to)) | |
| 2744 invalid_operation_2 | |
| 2745 ("Attempted conversion between different charset types", | |
| 2746 from, to); | |
| 2747 spec.from_charset = from; | |
| 2748 spec.to_charset = to; | |
| 2749 | |
| 2750 Dynarr_add (store_here, spec); | |
| 2751 } | |
| 2752 } | |
| 2753 | |
| 2754 /* Given a dynarr LOAD_HERE of internally-stored charset conversion | |
| 2755 specs, return the equivalent as the Lisp programmer would see it. | |
| 2756 | |
| 2757 If LOAD_HERE is 0, return Qnil. */ | |
| 2758 | |
| 2759 static Lisp_Object | |
| 2760 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here, | |
| 2761 int names) | |
| 2762 { | |
| 2763 int i; | |
| 2764 Lisp_Object result; | |
| 2765 | |
| 2766 if (!load_here) | |
| 2767 return Qnil; | |
| 2768 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++) | |
| 2769 { | |
| 2770 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i); | |
| 2771 if (names) | |
| 2772 result = Fcons (list2 (XCHARSET_NAME (ccs->from_charset), | |
| 2773 XCHARSET_NAME (ccs->to_charset)), result); | |
| 2774 else | |
| 2775 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result); | |
| 2776 } | |
| 2777 | |
| 2778 return Fnreverse (result); | |
| 2779 } | |
| 2780 | |
| 2781 static int | |
| 2782 iso2022_putprop (Lisp_Object codesys, | |
| 2783 Lisp_Object key, | |
| 2784 Lisp_Object value) | |
| 2785 { | |
| 2786 #define FROB_INITIAL_CHARSET(charset_num) \ | |
| 2787 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \ | |
| 2788 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value)) | |
| 2789 | |
| 2790 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0); | |
| 2791 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1); | |
| 2792 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2); | |
| 2793 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3); | |
| 2794 | |
| 2795 #define FROB_FORCE_CHARSET(charset_num) \ | |
| 2796 XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = \ | |
| 2797 !NILP (value) | |
| 2798 | |
| 2799 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0); | |
| 2800 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1); | |
| 2801 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2); | |
| 2802 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3); | |
| 2803 | |
| 2804 #define FROB_BOOLEAN_PROPERTY(prop) \ | |
| 2805 XCODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value) | |
| 2806 | |
| 2807 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT); | |
| 2808 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL); | |
| 2809 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL); | |
| 2810 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN); | |
| 2811 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT); | |
| 2812 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429); | |
| 2813 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED); | |
| 2814 | |
| 2815 else if (EQ (key, Qinput_charset_conversion)) | |
| 2816 { | |
| 2817 XCODING_SYSTEM_ISO2022_INPUT_CONV (codesys) = | |
| 2818 Dynarr_new (charset_conversion_spec); | |
| 2819 parse_charset_conversion_specs | |
| 2820 (XCODING_SYSTEM_ISO2022_INPUT_CONV (codesys), value); | |
| 2821 } | |
| 2822 else if (EQ (key, Qoutput_charset_conversion)) | |
| 2823 { | |
| 2824 XCODING_SYSTEM_ISO2022_OUTPUT_CONV (codesys) = | |
| 2825 Dynarr_new (charset_conversion_spec); | |
| 2826 parse_charset_conversion_specs | |
| 2827 (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (codesys), value); | |
| 2828 } | |
| 2829 else | |
| 2830 return 0; | |
| 2831 | |
| 2832 return 1; | |
| 2833 } | |
| 2834 | |
/* Finalize-coding-stream method for ISO2022.  When composite characters
   are compiled in, frees the stream's composite_chars dynarr (if any)
   and clears the pointer so no dangling reference remains.  Without
   ENABLE_COMPOSITE_CHARS there is nothing to release and STR is
   unused.  */
static void
iso2022_finalize_coding_stream (
#ifdef ENABLE_COMPOSITE_CHARS
                                struct coding_stream *str
#else
                                struct coding_stream *UNUSED (str)
#endif
                                )
{
#ifdef ENABLE_COMPOSITE_CHARS
  struct iso2022_coding_stream *data =
    CODING_STREAM_TYPE_DATA (str, iso2022);

  if (data->composite_chars)
    {
      Dynarr_free (data->composite_chars);
      /* Mirror iso2022_finalize: never leave a freed pointer behind.  */
      data->composite_chars = 0;
    }
#endif
}
| 2852 | |
| 2853 static void | |
| 2854 iso2022_init (Lisp_Object codesys) | |
| 2855 { | |
| 2856 int i; | |
| 2857 for (i = 0; i < 4; i++) | |
| 2858 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil; | |
| 2859 } | |
| 2860 | |
| 2861 static Lisp_Object | |
| 2862 coding_system_charset (Lisp_Object coding_system, int gnum) | |
| 2863 { | |
| 2864 Lisp_Object cs | |
| 2865 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum); | |
| 2866 | |
| 2867 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil; | |
| 2868 } | |
| 2869 | |
| 2870 static Lisp_Object | |
| 2871 iso2022_getprop (Lisp_Object coding_system, Lisp_Object prop) | |
| 2872 { | |
| 2873 if (EQ (prop, Qcharset_g0)) | |
| 2874 return coding_system_charset (coding_system, 0); | |
| 2875 else if (EQ (prop, Qcharset_g1)) | |
| 2876 return coding_system_charset (coding_system, 1); | |
| 2877 else if (EQ (prop, Qcharset_g2)) | |
| 2878 return coding_system_charset (coding_system, 2); | |
| 2879 else if (EQ (prop, Qcharset_g3)) | |
| 2880 return coding_system_charset (coding_system, 3); | |
| 2881 | |
| 2882 #define FORCE_CHARSET(charset_num) \ | |
| 2883 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \ | |
| 2884 (coding_system, charset_num) ? Qt : Qnil) | |
| 2885 | |
| 2886 else if (EQ (prop, Qforce_g0_on_output)) | |
| 2887 return FORCE_CHARSET (0); | |
| 2888 else if (EQ (prop, Qforce_g1_on_output)) | |
| 2889 return FORCE_CHARSET (1); | |
| 2890 else if (EQ (prop, Qforce_g2_on_output)) | |
| 2891 return FORCE_CHARSET (2); | |
| 2892 else if (EQ (prop, Qforce_g3_on_output)) | |
| 2893 return FORCE_CHARSET (3); | |
| 2894 | |
| 2895 #define LISP_BOOLEAN(prop) \ | |
| 2896 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil) | |
| 2897 | |
| 2898 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT); | |
| 2899 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL); | |
| 2900 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL); | |
| 2901 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN); | |
| 2902 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT); | |
| 2903 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429); | |
| 2904 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED); | |
| 2905 | |
| 2906 else if (EQ (prop, Qinput_charset_conversion)) | |
| 2907 return | |
| 2908 unparse_charset_conversion_specs | |
| 2909 (XCODING_SYSTEM_ISO2022_INPUT_CONV (coding_system), 0); | |
| 2910 else if (EQ (prop, Qoutput_charset_conversion)) | |
| 2911 return | |
| 2912 unparse_charset_conversion_specs | |
| 2913 (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (coding_system), 0); | |
| 2914 else | |
| 2915 return Qunbound; | |
| 2916 } | |
| 2917 | |
| 2918 static void | |
| 2286 | 2919 iso2022_print (Lisp_Object cs, Lisp_Object printcharfun, |
| 2920 int UNUSED (escapeflag)) | |
| 771 | 2921 { |
| 2922 int i; | |
| 2923 | |
| 826 | 2924 write_c_string (printcharfun, "("); |
| 771 | 2925 for (i = 0; i < 4; i++) |
| 2926 { | |
| 2927 Lisp_Object charset = coding_system_charset (cs, i); | |
| 2928 if (i > 0) | |
| 826 | 2929 write_c_string (printcharfun, ", "); |
| 771 | 2930 write_fmt_string (printcharfun, "g%d=", i); |
| 800 | 2931 print_internal (CHARSETP (charset) ? XCHARSET_NAME (charset) : charset, printcharfun, 0); |
| 771 | 2932 if (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (cs, i)) |
| 826 | 2933 write_c_string (printcharfun, "(force)"); |
| 771 | 2934 } |
| 2935 | |
| 3084 | 2936 #define FROB(prop) \ |
| 2937 if (!NILP (iso2022_getprop (cs, prop))) \ | |
| 2938 { \ | |
| 2939 write_fmt_string_lisp (printcharfun, ", %s", 1, prop); \ | |
| 771 | 2940 } |
| 2941 | |
| 2942 FROB (Qshort); | |
| 2943 FROB (Qno_ascii_eol); | |
| 2944 FROB (Qno_ascii_cntl); | |
| 2945 FROB (Qseven); | |
| 2946 FROB (Qlock_shift); | |
| 2947 FROB (Qno_iso6429); | |
| 2948 FROB (Qescape_quoted); | |
| 2949 | |
| 2950 { | |
| 2951 Lisp_Object val = | |
| 2952 unparse_charset_conversion_specs | |
| 2953 (XCODING_SYSTEM_ISO2022_INPUT_CONV (cs), 1); | |
| 2954 if (!NILP (val)) | |
| 2955 { | |
| 800 | 2956 write_fmt_string_lisp (printcharfun, ", input-charset-conversion=%s", 1, val); |
| 771 | 2957 } |
| 2958 val = | |
| 2959 unparse_charset_conversion_specs | |
| 2960 (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (cs), 1); | |
| 2961 if (!NILP (val)) | |
| 2962 { | |
| 800 | 2963 write_fmt_string_lisp (printcharfun, ", output-charset-conversion=%s", 1, val); |
| 771 | 2964 } |
| 826 | 2965 write_c_string (printcharfun, ")"); |
| 771 | 2966 } |
| 2967 } | |
| 2968 | |
| 2969 | |
| 2970 /************************************************************************/ | |
| 2971 /* ISO2022 detector */ | |
| 2972 /************************************************************************/ | |
| 2973 | |
/* The iso2022 detector and the five categories whose likelihoods
   iso2022_detect() sets. */
DEFINE_DETECTOR (iso2022);
/* ISO2022 system using only seven-bit bytes, no locking shift */
DEFINE_DETECTOR_CATEGORY (iso2022, iso_7);
/* ISO2022 system using eight-bit bytes, no locking shift, no single shift,
   using designation to switch charsets */
DEFINE_DETECTOR_CATEGORY (iso2022, iso_8_designate);
/* ISO2022 system using eight-bit bytes, no locking shift, no designation
   sequences, one-dimension characters in the upper half. */
DEFINE_DETECTOR_CATEGORY (iso2022, iso_8_1);
/* ISO2022 system using eight-bit bytes, no locking shift, no designation
   sequences, two-dimension characters in the upper half. */
DEFINE_DETECTOR_CATEGORY (iso2022, iso_8_2);
/* ISO2022 system using locking shift */
DEFINE_DETECTOR_CATEGORY (iso2022, iso_lock_shift);
| 2988 | |
/* Per-detection running state for the ISO2022 detector.  It is zeroed
   and set up on the first call to iso2022_detect() and then accumulates
   statistics across calls. */
struct iso2022_detector
{
  /* Nonzero once iso2022_detect() has initialized this structure. */
  int initted;
  /* Scratch decoding state handed to parse_iso2022_esc(); allocated by
     iso2022_detect() and freed by iso2022_finalize_detection_state(). */
  struct iso2022_coding_stream *iso;
  /* Parser flags updated by parse_iso2022_esc(); ISO_STATE_ESCAPE is
     tested to tell whether an escape sequence is in progress. */
  unsigned int flags;

  /* for keeping temporary track of high-byte groups */
  /* Length so far of the current run of bytes >= 0xA0. */
  int high_byte_count;
  /* Set when a single shift has just been parsed; the high-byte run
     that follows is then left out of the odd/even group counts. */
  unsigned int saw_single_shift_just_now:1;

  /* running state; we set the likelihoods at the end */
  /* A byte >= 0x80 has been seen. */
  unsigned int seen_high_byte:1;
  /* At least one single-shift / locking-shift / designation sequence
     has been parsed, respectively. */
  unsigned int seen_single_shift:1;
  unsigned int seen_locking_shift:1;
  unsigned int seen_designate:1;
  /* Parse errors reported while no escape sequence was being read
     (iso->esc == ISO_ESC_NOTHING). */
  unsigned int bad_single_byte_sequences;
  /* Parse errors reported inside an escape sequence. */
  unsigned int bad_multibyte_escape_sequences;
  /* Sequences parsed successfully that had stored escape bytes
     (iso->esc_bytes_index > 0). */
  unsigned int good_multibyte_escape_sequences;
  /* Number of completed high-byte runs of even length. */
  int even_high_byte_groups;
  /* Length of the longest even-length high-byte run. */
  int longest_even_high_byte;
  /* Number of completed high-byte runs of odd length. */
  int odd_high_byte_groups;
};
| 3011 | |
| 3012 static void | |
| 3013 iso2022_detect (struct detection_state *st, const UExtbyte *src, | |
| 3014 Bytecount n) | |
| 3015 { | |
| 3016 Bytecount orign = n; | |
| 3017 struct iso2022_detector *data = DETECTION_STATE_DATA (st, iso2022); | |
| 3018 | |
| 3019 /* #### There are serious deficiencies in the recognition mechanism | |
| 3020 here. This needs to be much smarter if it's going to cut it. | |
| 3021 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while | |
| 3022 it should be detected as Latin-1. | |
| 3023 All the ISO2022 stuff in this file should be synced up with the | |
| 3024 code from FSF Emacs-21.0, in which Mule should be more or less stable. | |
| 3025 Perhaps we should wait till R2L works in FSF Emacs? */ | |
| 3026 | |
| 3027 /* We keep track of running state on our own, and set the categories at the | |
| 3028 end; that way we can reflect the correct state each time we finish, but | |
| 3029 not get confused by those results the next time around. */ | |
| 3030 | |
| 3031 if (!data->initted) | |
| 3032 { | |
| 3033 xzero (*data); | |
| 3034 data->iso = xnew_and_zero (struct iso2022_coding_stream); | |
| 3035 reset_iso2022_decode (Qnil, data->iso); | |
| 3036 data->initted = 1; | |
| 3037 } | |
| 3038 | |
| 3039 while (n--) | |
| 3040 { | |
| 3041 UExtbyte c = *src++; | |
| 3042 if (c >= 0x80) | |
| 3043 data->seen_high_byte = 1; | |
| 3044 if (c >= 0xA0) | |
| 3045 data->high_byte_count++; | |
| 3046 else | |
| 3047 { | |
| 3048 if (data->high_byte_count && | |
| 3049 !data->saw_single_shift_just_now) | |
| 3050 { | |
| 3051 if (data->high_byte_count & 1) | |
| 3052 data->odd_high_byte_groups++; | |
| 3053 else | |
| 985 | 3054 { |
| 3055 data->even_high_byte_groups++; | |
| 3056 if (data->longest_even_high_byte < data->high_byte_count) | |
| 3057 data->longest_even_high_byte = data->high_byte_count; | |
| 3058 } | |
| 771 | 3059 } |
| 3060 data->high_byte_count = 0; | |
| 3061 data->saw_single_shift_just_now = 0; | |
| 3062 } | |
| 3063 if (!(data->flags & ISO_STATE_ESCAPE) | |
| 826 | 3064 && (byte_c0_p (c) || byte_c1_p (c))) |
| 771 | 3065 { /* control chars */ |
| 3066 switch (c) | |
| 3067 { | |
| 3068 /* Allow and ignore control characters that you might | |
| 3069 reasonably see in a text file */ | |
| 3070 case '\r': | |
| 3071 case '\n': | |
| 3072 case '\t': | |
| 3073 case 7: /* bell */ | |
| 3074 case 8: /* backspace */ | |
| 3075 case 11: /* vertical tab */ | |
| 3076 case 12: /* form feed */ | |
| 3077 case 26: /* MS-DOS C-z junk */ | |
| 3078 case 31: /* '^_' -- for info */ | |
| 3079 goto label_continue_loop; | |
| 3080 | |
| 3081 default: | |
| 3082 break; | |
| 3083 } | |
| 3084 } | |
| 3085 | |
| 826 | 3086 if ((data->flags & ISO_STATE_ESCAPE) || byte_c0_p (c) |
| 3087 || byte_c1_p (c)) | |
| 771 | 3088 { |
| 3089 switch (parse_iso2022_esc (Qnil, data->iso, c, | |
| 3090 &data->flags, 0)) | |
| 3091 { | |
| 3092 case 1: /* done */ | |
| 3093 if (data->iso->esc_bytes_index > 0) | |
| 3094 data->good_multibyte_escape_sequences++; | |
| 3095 switch (data->iso->esc) | |
| 3096 { | |
| 3097 case ISO_ESC_DESIGNATE: | |
| 3098 data->seen_designate = 1; | |
| 3099 break; | |
| 3100 case ISO_ESC_LOCKING_SHIFT: | |
| 3101 data->seen_locking_shift = 1; | |
| 3102 break; | |
| 3103 case ISO_ESC_SINGLE_SHIFT: | |
| 3104 data->saw_single_shift_just_now = 1; | |
| 3105 data->seen_single_shift = 1; | |
| 3106 break; | |
| 3107 default: | |
| 3108 break; | |
| 3109 } | |
| 3110 break; | |
| 3111 | |
| 3112 case -1: /* not done */ | |
| 3113 break; | |
| 3114 | |
| 3115 case 0: /* error */ | |
| 3116 if (data->iso->esc == ISO_ESC_NOTHING) | |
| 3117 data->bad_single_byte_sequences++; | |
| 3118 else | |
| 3119 data->bad_multibyte_escape_sequences++; | |
| 3120 } | |
| 3121 } | |
| 3122 label_continue_loop:; | |
| 3123 } | |
| 3124 | |
| 985 | 3125 if (data->high_byte_count && |
| 3126 !data->saw_single_shift_just_now) | |
| 3127 { | |
| 3128 if (data->high_byte_count & 1) | |
| 3129 data->odd_high_byte_groups++; | |
| 3130 else | |
| 3131 { | |
| 3132 data->even_high_byte_groups++; | |
| 3133 if (data->longest_even_high_byte < data->high_byte_count) | |
| 3134 data->longest_even_high_byte = data->high_byte_count; | |
| 3135 } | |
| 3136 } | |
| 3137 | |
| 771 | 3138 if (data->bad_multibyte_escape_sequences > 2 || |
| 3139 (data->bad_multibyte_escape_sequences > 0 && | |
| 3140 data->good_multibyte_escape_sequences / | |
| 3141 data->bad_multibyte_escape_sequences < 10)) | |
| 3142 /* Just making it up ... */ | |
| 3143 SET_DET_RESULTS (st, iso2022, DET_NEARLY_IMPOSSIBLE); | |
| 3144 else if (data->bad_single_byte_sequences > 5 || | |
| 3145 (data->bad_single_byte_sequences > 0 && | |
| 3146 (data->good_multibyte_escape_sequences + | |
| 3147 data->even_high_byte_groups + | |
| 3148 data->odd_high_byte_groups) / | |
| 3149 data->bad_single_byte_sequences < 10)) | |
| 3150 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY); | |
| 3151 else if (data->seen_locking_shift) | |
| 3152 { | |
| 3153 SET_DET_RESULTS (st, iso2022, DET_QUITE_IMPROBABLE); | |
| 3154 DET_RESULT (st, iso_lock_shift) = DET_QUITE_PROBABLE; | |
| 3155 } | |
| 3156 else if (!data->seen_high_byte) | |
| 3157 { | |
| 3158 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY); | |
| 3159 if (data->good_multibyte_escape_sequences) | |
| 3160 DET_RESULT (st, iso_7) = DET_QUITE_PROBABLE; | |
| 3161 else if (data->seen_single_shift) | |
| 3162 DET_RESULT (st, iso_7) = DET_SOMEWHAT_LIKELY; | |
| 3163 else | |
| 3164 { | |
| 3165 /* If we've just seen pure 7-bit data, no escape sequences, | |
| 3166 then we can't give much likelihood; but if we've seen enough | |
| 3167 of this data, we can assume some unlikelihood of any 8-bit | |
| 3168 encoding */ | |
| 3169 if (orign + st->bytes_seen >= 1000) | |
| 3170 DET_RESULT (st, iso_7) = DET_AS_LIKELY_AS_UNLIKELY; | |
| 3171 else | |
| 3172 SET_DET_RESULTS (st, iso2022, DET_AS_LIKELY_AS_UNLIKELY); | |
| 3173 } | |
| 3174 } | |
| 3175 else if (data->seen_designate) | |
| 3176 { | |
| 3177 SET_DET_RESULTS (st, iso2022, DET_QUITE_IMPROBABLE); | |
| 3178 if (data->seen_single_shift) | |
| 3179 /* #### Does this really make sense? */ | |
| 3180 DET_RESULT (st, iso_8_designate) = DET_SOMEWHAT_UNLIKELY; | |
| 3181 else | |
| 3182 DET_RESULT (st, iso_8_designate) = DET_QUITE_PROBABLE; | |
| 3183 } | |
| 3184 else if (data->odd_high_byte_groups > 0 && | |
| 3185 data->even_high_byte_groups == 0) | |
| 3186 { | |
| 3187 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY); | |
| 3188 if (data->seen_single_shift) | |
| 3189 DET_RESULT (st, iso_8_1) = DET_QUITE_PROBABLE; | |
| 3190 else | |
| 3191 DET_RESULT (st, iso_8_1) = DET_SOMEWHAT_LIKELY; | |
| 3192 } | |
| 3193 else if (data->odd_high_byte_groups == 0 && | |
| 3194 data->even_high_byte_groups > 0) | |
| 3195 { | |
| 985 | 3196 #if 0 |
| 771 | 3197 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY); |
| 3198 if (data->even_high_byte_groups > 10) | |
| 3199 { | |
| 3200 if (data->seen_single_shift) | |
| 3201 DET_RESULT (st, iso_8_2) = DET_QUITE_PROBABLE; | |
| 3202 else | |
| 3203 DET_RESULT (st, iso_8_2) = DET_SOMEWHAT_LIKELY; | |
| 3204 if (data->even_high_byte_groups < 50) | |
| 3205 DET_RESULT (st, iso_8_1) = DET_SOMEWHAT_UNLIKELY; | |
| 3206 /* else it stays at quite improbable */ | |
| 3207 } | |
| 985 | 3208 #else |
| 3209 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY); | |
| 3210 if (data->seen_single_shift) | |
| 3211 DET_RESULT (st, iso_8_2) = DET_QUITE_PROBABLE; | |
| 3212 else if (data->even_high_byte_groups > 10) | |
| 3213 DET_RESULT (st, iso_8_2) = DET_SOMEWHAT_LIKELY; | |
| 3214 else if (data->longest_even_high_byte > 6) | |
| 3215 DET_RESULT (st, iso_8_2) = DET_SLIGHTLY_LIKELY; | |
| 3216 #endif | |
| 771 | 3217 } |
| 3218 else if (data->odd_high_byte_groups > 0 && | |
| 3219 data->even_high_byte_groups > 0) | |
| 3393 | 3220 { |
| 3221 /* Well, this could be a Latin-1 text, with most high-byte | |
| 3222 characters single, but sometimes two are together, though | |
| 3223 this happens not as often. This is common for Western | |
| 3224 European languages like German, French, Danish, Swedish, etc. | |
| 3225 Then we would either have a rather small file and | |
| 3226 even_high_byte_groups would be low. | |
| 3227 Or we would have a larger file and the ratio of odd to even | |
| 3228 groups would be very high. */ | |
| 3229 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY); | |
| 3230 if (data->even_high_byte_groups <= 3 || | |
| 3231 data->odd_high_byte_groups >= 10 * data->even_high_byte_groups) | |
| 3232 DET_RESULT (st, iso_8_1) = DET_SOMEWHAT_LIKELY; | |
| 3233 } | |
| 771 | 3234 else |
| 3235 SET_DET_RESULTS (st, iso2022, DET_AS_LIKELY_AS_UNLIKELY); | |
| 3236 } | |
| 3237 | |
| 3238 static void | |
| 3239 iso2022_finalize_detection_state (struct detection_state *st) | |
| 3240 { | |
| 3241 struct iso2022_detector *data = DETECTION_STATE_DATA (st, iso2022); | |
| 3242 if (data->iso) | |
| 1726 | 3243 xfree (data->iso, struct iso2022_coding_stream *); |
| 771 | 3244 } |
| 3245 | |
| 3246 | |
| 3247 /************************************************************************/ | |
| 3248 /* CCL methods */ | |
| 3249 /************************************************************************/ | |
| 3250 | |
| 3251 /* Converter written in CCL. */ | |
| 3252 | |
/* Type-specific data of a CCL coding system. */
struct ccl_coding_system
{
  /* For a CCL coding system, these specify the CCL programs used for
     decoding (input) and encoding (output). */
  Lisp_Object decode;
  Lisp_Object encode;
};

/* Accessors for the two program slots.  The CODING_SYSTEM_ forms go
   through CODING_SYSTEM_TYPE_DATA directly; the XCODING_SYSTEM_ forms
   first apply XCODING_SYSTEM to their argument. */
#define CODING_SYSTEM_CCL_DECODE(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, ccl)->decode)
#define CODING_SYSTEM_CCL_ENCODE(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, ccl)->encode)
#define XCODING_SYSTEM_CCL_DECODE(codesys) \
  CODING_SYSTEM_CCL_DECODE (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_CCL_ENCODE(codesys) \
  CODING_SYSTEM_CCL_ENCODE (XCODING_SYSTEM (codesys))

/* Per-stream data of a CCL coding stream. */
struct ccl_coding_stream
{
  /* state of the running CCL program */
  struct ccl_program ccl;
};

/* Memory description of struct ccl_coding_system: lists its two
   Lisp_Object members by offset. */
static const struct memory_description ccl_coding_system_description[] = {
  { XD_LISP_OBJECT, offsetof (struct ccl_coding_system, decode) },
  { XD_LISP_OBJECT, offsetof (struct ccl_coding_system, encode) },
  { XD_END }
};

DEFINE_CODING_SYSTEM_TYPE_WITH_DATA (ccl);
| 3283 | |
| 771 | 3284 static void |
| 3285 ccl_mark (Lisp_Object codesys) | |
| 3286 { | |
| 3287 mark_object (XCODING_SYSTEM_CCL_DECODE (codesys)); | |
| 3288 mark_object (XCODING_SYSTEM_CCL_ENCODE (codesys)); | |
| 3289 } | |
| 3290 | |
| 3291 static Bytecount | |
| 3292 ccl_convert (struct coding_stream *str, const UExtbyte *src, | |
| 3293 unsigned_char_dynarr *dst, Bytecount n) | |
| 3294 { | |
| 3295 struct ccl_coding_stream *data = | |
| 3296 CODING_STREAM_TYPE_DATA (str, ccl); | |
| 3297 Bytecount orign = n; | |
| 3298 | |
| 3299 data->ccl.last_block = str->eof; | |
| 3300 /* When applying a CCL program to a stream, SRC must not be NULL -- this | |
| 3301 is a special signal to the driver that read and write operations are | |
| 3302 not allowed. The code does not actually look at what SRC points to if | |
| 3303 N == 0. | |
| 3304 */ | |
| 3305 ccl_driver (&data->ccl, src ? src : (const unsigned char *) "", | |
| 3306 dst, n, 0, | |
| 3307 str->direction == CODING_DECODE ? CCL_MODE_DECODING : | |
| 3308 CCL_MODE_ENCODING); | |
| 3309 return orign; | |
| 3310 } | |
| 3311 | |
| 3312 static void | |
| 3313 ccl_init_coding_stream (struct coding_stream *str) | |
| 3314 { | |
| 3315 struct ccl_coding_stream *data = | |
| 3316 CODING_STREAM_TYPE_DATA (str, ccl); | |
| 3317 | |
| 3318 setup_ccl_program (&data->ccl, | |
| 3319 str->direction == CODING_DECODE ? | |
| 3320 XCODING_SYSTEM_CCL_DECODE (str->codesys) : | |
| 3321 XCODING_SYSTEM_CCL_ENCODE (str->codesys)); | |
| 3322 } | |
| 3323 | |
/* Rewind-coding-stream method for CCL: rewinding is done by simply
   running the stream initialization again on STR. */
static void
ccl_rewind_coding_stream (struct coding_stream *str)
{
  ccl_init_coding_stream (str);
}
| 3329 | |
| 3330 static void | |
| 3331 ccl_init (Lisp_Object codesys) | |
| 3332 { | |
| 3333 XCODING_SYSTEM_CCL_DECODE (codesys) = Qnil; | |
| 3334 XCODING_SYSTEM_CCL_ENCODE (codesys) = Qnil; | |
| 3335 } | |
| 3336 | |
| 3337 static int | |
| 3338 ccl_putprop (Lisp_Object codesys, Lisp_Object key, Lisp_Object value) | |
| 3339 { | |
| 3340 Lisp_Object sym; | |
| 3341 struct ccl_program test_ccl; | |
|
4528
726060ee587c
First draft of g++ 4.3 warning removal patch. Builds. *Needs ChangeLogs.*
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4522
diff
changeset
|
3342 const Ascbyte *suffix; |
| 771 | 3343 |
| 3344 /* Check key first. */ | |
| 3345 if (EQ (key, Qdecode)) | |
| 3346 suffix = "-ccl-decode"; | |
| 3347 else if (EQ (key, Qencode)) | |
| 3348 suffix = "-ccl-encode"; | |
| 3349 else | |
| 3350 return 0; | |
| 3351 | |
| 3352 /* If value is vector, register it as a ccl program | |
| 3353 associated with a newly created symbol for | |
| 3354 backward compatibility. | |
| 3355 | |
| 3356 #### Bogosity alert! Do we really have to do this crap???? --ben */ | |
| 3357 if (VECTORP (value)) | |
| 3358 { | |
| 3359 sym = Fintern (concat2 (Fsymbol_name (XCODING_SYSTEM_NAME (codesys)), | |
| 3360 build_string (suffix)), | |
| 3361 Qnil); | |
| 3362 Fregister_ccl_program (sym, value); | |
| 3363 } | |
| 3364 else | |
| 3365 { | |
| 3366 CHECK_SYMBOL (value); | |
| 3367 sym = value; | |
| 3368 } | |
| 3369 /* check if the given ccl programs are valid. */ | |
| 3370 if (setup_ccl_program (&test_ccl, sym) < 0) | |
| 3371 invalid_argument ("Invalid CCL program", value); | |
| 3372 | |
| 3373 if (EQ (key, Qdecode)) | |
| 3374 XCODING_SYSTEM_CCL_DECODE (codesys) = sym; | |
| 3375 else if (EQ (key, Qencode)) | |
| 3376 XCODING_SYSTEM_CCL_ENCODE (codesys) = sym; | |
| 3377 | |
| 3378 return 1; | |
| 3379 } | |
| 3380 | |
| 3381 static Lisp_Object | |
| 3382 ccl_getprop (Lisp_Object coding_system, Lisp_Object prop) | |
| 3383 { | |
| 3384 if (EQ (prop, Qdecode)) | |
| 3385 return XCODING_SYSTEM_CCL_DECODE (coding_system); | |
| 3386 else if (EQ (prop, Qencode)) | |
| 3387 return XCODING_SYSTEM_CCL_ENCODE (coding_system); | |
| 3388 else | |
| 3389 return Qunbound; | |
| 3390 } | |
| 3391 | |
| 3392 | |
| 3393 /************************************************************************/ | |
| 3394 /* Initialization */ | |
| 3395 /************************************************************************/ | |
| 3396 | |
/* Register the Lisp-visible subrs and the symbols used by the coding
   systems and detectors in this file. */
void
syms_of_mule_coding (void)
{
  /* Shift-JIS and Big5 character conversion primitives. */
  DEFSUBR (Fdecode_shift_jis_char);
  DEFSUBR (Fencode_shift_jis_char);
  DEFSUBR (Fdecode_big5_char);
  DEFSUBR (Fencode_big5_char);

  /* Coding system type names. */
  DEFSYMBOL (Qbig5);
  DEFSYMBOL (Qshift_jis);
  DEFSYMBOL (Qccl);
  DEFSYMBOL (Qiso2022);

  /* ISO2022 property keys handled by iso2022_putprop/iso2022_getprop. */
  DEFSYMBOL (Qcharset_g0);
  DEFSYMBOL (Qcharset_g1);
  DEFSYMBOL (Qcharset_g2);
  DEFSYMBOL (Qcharset_g3);
  DEFSYMBOL (Qforce_g0_on_output);
  DEFSYMBOL (Qforce_g1_on_output);
  DEFSYMBOL (Qforce_g2_on_output);
  DEFSYMBOL (Qforce_g3_on_output);
  DEFSYMBOL (Qno_iso6429);
  DEFSYMBOL (Qinput_charset_conversion);
  DEFSYMBOL (Qoutput_charset_conversion);

  DEFSYMBOL (Qshort);
  DEFSYMBOL (Qno_ascii_eol);
  DEFSYMBOL (Qno_ascii_cntl);
  DEFSYMBOL (Qseven);
  DEFSYMBOL (Qlock_shift);

  /* ISO2022 detector category names. */
  DEFSYMBOL (Qiso_7);
  DEFSYMBOL (Qiso_8_designate);
  DEFSYMBOL (Qiso_8_1);
  DEFSYMBOL (Qiso_8_2);
  DEFSYMBOL (Qiso_lock_shift);
}
| 3434 | |
| 3435 void | |
| 3436 coding_system_type_create_mule_coding (void) | |
| 3437 { | |
| 3438 INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA (iso2022, "iso2022-coding-system-p"); | |
| 3439 CODING_SYSTEM_HAS_METHOD (iso2022, mark); | |
| 3440 CODING_SYSTEM_HAS_METHOD (iso2022, convert); | |
| 3441 CODING_SYSTEM_HAS_METHOD (iso2022, finalize_coding_stream); | |
| 3442 CODING_SYSTEM_HAS_METHOD (iso2022, init_coding_stream); | |
| 3443 CODING_SYSTEM_HAS_METHOD (iso2022, rewind_coding_stream); | |
| 3444 CODING_SYSTEM_HAS_METHOD (iso2022, init); | |
| 3445 CODING_SYSTEM_HAS_METHOD (iso2022, print); | |
| 3446 CODING_SYSTEM_HAS_METHOD (iso2022, finalize); | |
| 3447 CODING_SYSTEM_HAS_METHOD (iso2022, putprop); | |
| 3448 CODING_SYSTEM_HAS_METHOD (iso2022, getprop); | |
| 3449 | |
| 3450 INITIALIZE_DETECTOR (iso2022); | |
| 3451 DETECTOR_HAS_METHOD (iso2022, detect); | |
| 3452 DETECTOR_HAS_METHOD (iso2022, finalize_detection_state); | |
| 3453 INITIALIZE_DETECTOR_CATEGORY (iso2022, iso_7); | |
| 3454 INITIALIZE_DETECTOR_CATEGORY (iso2022, iso_8_designate); | |
| 3455 INITIALIZE_DETECTOR_CATEGORY (iso2022, iso_8_1); | |
| 3456 INITIALIZE_DETECTOR_CATEGORY (iso2022, iso_8_2); | |
| 3457 INITIALIZE_DETECTOR_CATEGORY (iso2022, iso_lock_shift); | |
| 3458 | |
| 3459 INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA (ccl, "ccl-coding-system-p"); | |
| 3460 CODING_SYSTEM_HAS_METHOD (ccl, mark); | |
| 3461 CODING_SYSTEM_HAS_METHOD (ccl, convert); | |
| 3462 CODING_SYSTEM_HAS_METHOD (ccl, init); | |
| 3463 CODING_SYSTEM_HAS_METHOD (ccl, init_coding_stream); | |
| 3464 CODING_SYSTEM_HAS_METHOD (ccl, rewind_coding_stream); | |
| 3465 CODING_SYSTEM_HAS_METHOD (ccl, putprop); | |
| 3466 CODING_SYSTEM_HAS_METHOD (ccl, getprop); | |
| 3467 | |
| 3468 INITIALIZE_CODING_SYSTEM_TYPE (shift_jis, "shift-jis-coding-system-p"); | |
| 3469 CODING_SYSTEM_HAS_METHOD (shift_jis, convert); | |
| 3470 | |
| 3471 INITIALIZE_DETECTOR (shift_jis); | |
| 3472 DETECTOR_HAS_METHOD (shift_jis, detect); | |
| 3473 INITIALIZE_DETECTOR_CATEGORY (shift_jis, shift_jis); | |
| 3474 | |
| 3475 INITIALIZE_CODING_SYSTEM_TYPE (big5, "big5-coding-system-p"); | |
| 3476 CODING_SYSTEM_HAS_METHOD (big5, convert); | |
| 3477 | |
| 3478 INITIALIZE_DETECTOR (big5); | |
| 3479 DETECTOR_HAS_METHOD (big5, detect); | |
| 3480 INITIALIZE_DETECTOR_CATEGORY (big5, big5); | |
| 3481 } | |
| 3482 | |
| 3483 void | |
| 3484 reinit_coding_system_type_create_mule_coding (void) | |
| 3485 { | |
| 3486 REINITIALIZE_CODING_SYSTEM_TYPE (iso2022); | |
| 3487 REINITIALIZE_CODING_SYSTEM_TYPE (ccl); | |
| 3488 REINITIALIZE_CODING_SYSTEM_TYPE (shift_jis); | |
| 3489 REINITIALIZE_CODING_SYSTEM_TYPE (big5); | |
| 3490 } | |
| 3491 | |
/* Intentionally empty: this function currently performs no work. */
void
reinit_vars_of_mule_coding (void)
{
  /* Nothing to do. */
}
| 3496 | |
/* Intentionally empty: this function currently performs no work. */
void
vars_of_mule_coding (void)
{
  /* Nothing to do. */
}
