771
+ − 1 /* Conversion functions for I18N encodings, but not Unicode (in separate file).
+ − 2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
+ − 3 Copyright (C) 1995 Sun Microsystems, Inc.
+ − 4 Copyright (C) 2000, 2001, 2002 Ben Wing.
+ − 5
+ − 6 This file is part of XEmacs.
+ − 7
+ − 8 XEmacs is free software; you can redistribute it and/or modify it
+ − 9 under the terms of the GNU General Public License as published by the
+ − 10 Free Software Foundation; either version 2, or (at your option) any
+ − 11 later version.
+ − 12
+ − 13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
+ − 14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ − 15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ − 16 for more details.
+ − 17
+ − 18 You should have received a copy of the GNU General Public License
+ − 19 along with XEmacs; see the file COPYING. If not, write to
+ − 20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ − 21 Boston, MA 02111-1307, USA. */
+ − 22
+ − 23 /* Synched up with: Mule 2.3. Not in FSF. */
+ − 24
+ − 25 /* For previous history, see file-coding.c.
+ − 26
+ − 27 September 10, 2001: Extracted from file-coding.c by Ben Wing.
+ − 28
+ − 29 Later in September: Finished abstraction of detection system, rewrote
+ − 30 all the detectors to include multiple levels of likelihood.
+ − 31 */
+ − 32
+ − 33 #include <config.h>
+ − 34 #include "lisp.h"
+ − 35
+ − 36 #include "charset.h"
+ − 37 #include "mule-ccl.h"
+ − 38 #include "file-coding.h"
+ − 39
+ − 40 Lisp_Object Qshift_jis, Qiso2022, Qbig5, Qccl;
+ − 41
+ − 42 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
+ − 43 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
+ − 44 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
+ − 45 Lisp_Object Qno_iso6429;
+ − 46 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
+ − 47 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
+ − 48
+ − 49 Lisp_Object Qiso_7, Qiso_8_designate, Qiso_8_1, Qiso_8_2, Qiso_lock_shift;
+ − 50
+ − 51
+ − 52 /************************************************************************/
+ − 53 /* Shift-JIS methods */
+ − 54 /************************************************************************/
+ − 55
/* Shift-JIS; Hankaku (half-width) KANA is also supported.
   This declares the `shift_jis' coding system type; its conversion
   method is shift_jis_convert() below.  */
DEFINE_CODING_SYSTEM_TYPE (shift_jis);
+ − 58
+ − 59 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
+ − 60 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
+ − 61 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
+ − 62 encoded by "position-code + 0x80". A character of JISX0208
+ − 63 (DIMENSION2_CHARS94 character set) is encoded in two bytes, but its two
+ − 64 position-codes are divided and shifted so that they fit in the ranges
+ − 65 below.
+ − 66
+ − 67 --- CODE RANGE of Shift-JIS ---
+ − 68 (character set) (range)
+ − 69 ASCII 0x00 .. 0x7F
+ − 70 JISX0201-Kana 0xA0 .. 0xDF
+ − 71 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
+ − 72 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
+ − 73 -------------------------------
+ − 74
+ − 75 */
+ − 76
+ − 77 /* Is this the first byte of a Shift-JIS two-byte char? */
+ − 78
826
/* Return 1 if C lies in 0x81 .. 0x9F or 0xE0 .. 0xEF, the two ranges
   accepted as the first byte of a two-byte character; otherwise 0.  */
inline static int
byte_shift_jis_two_byte_1_p (int c)
{
  /* Reject everything outside the overall span first ...  */
  if (c < 0x81 || c > 0xEF)
    return 0;
  /* ... then exclude the gap 0xA0 .. 0xDF between the two ranges.  */
  return c <= 0x9F || c >= 0xE0;
}
771
+ − 84
+ − 85 /* Is this the second byte of a Shift-JIS two-byte char? */
+ − 86
826
/* Return 1 if C lies in 0x40 .. 0x7E or 0x80 .. 0xFC, the two ranges
   accepted as the second byte of a two-byte character; otherwise 0.  */
inline static int
byte_shift_jis_two_byte_2_p (int c)
{
  if (c < 0x40 || c > 0xFC)
    return 0;
  /* The two ranges are contiguous except for the single value 0x7F.  */
  return c != 0x7F;
}
+ − 92
/* Return 1 if C lies in 0xA1 .. 0xDF, the range the decoder treats as a
   single-byte JISX0201 katakana character; otherwise 0.  */
inline static int
byte_shift_jis_katakana_p (int c)
{
  return !(c < 0xA1 || c > 0xDF);
}
771
+ − 98
+ − 99 /* Convert Shift-JIS data to internal format. */
+ − 100
+ − 101 static Bytecount
+ − 102 shift_jis_convert (struct coding_stream *str, const UExtbyte *src,
+ − 103 unsigned_char_dynarr *dst, Bytecount n)
+ − 104 {
+ − 105 unsigned int ch = str->ch;
+ − 106 Bytecount orign = n;
+ − 107
+ − 108 if (str->direction == CODING_DECODE)
+ − 109 {
+ − 110 while (n--)
+ − 111 {
+ − 112 UExtbyte c = *src++;
+ − 113
+ − 114 if (ch)
+ − 115 {
+ − 116 /* Previous character was first byte of Shift-JIS Kanji char. */
826
+ − 117 if (byte_shift_jis_two_byte_2_p (c))
771
+ − 118 {
867
+ − 119 Ibyte e1, e2;
771
+ − 120
+ − 121 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
+ − 122 DECODE_SHIFT_JIS (ch, c, e1, e2);
+ − 123 Dynarr_add (dst, e1);
+ − 124 Dynarr_add (dst, e2);
+ − 125 }
+ − 126 else
+ − 127 {
+ − 128 DECODE_ADD_BINARY_CHAR (ch, dst);
+ − 129 DECODE_ADD_BINARY_CHAR (c, dst);
+ − 130 }
+ − 131 ch = 0;
+ − 132 }
+ − 133 else
+ − 134 {
826
+ − 135 if (byte_shift_jis_two_byte_1_p (c))
771
+ − 136 ch = c;
826
+ − 137 else if (byte_shift_jis_katakana_p (c))
771
+ − 138 {
+ − 139 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201);
+ − 140 Dynarr_add (dst, c);
+ − 141 }
+ − 142 else
+ − 143 DECODE_ADD_BINARY_CHAR (c, dst);
+ − 144 }
+ − 145 }
+ − 146
+ − 147 if (str->eof)
+ − 148 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
+ − 149 }
+ − 150 else
+ − 151 {
+ − 152 while (n--)
+ − 153 {
867
+ − 154 Ibyte c = *src++;
826
+ − 155 if (byte_ascii_p (c))
771
+ − 156 {
+ − 157 Dynarr_add (dst, c);
+ − 158 ch = 0;
+ − 159 }
867
+ − 160 else if (ibyte_leading_byte_p (c))
771
+ − 161 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 ||
+ − 162 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
+ − 163 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
+ − 164 else if (ch)
+ − 165 {
+ − 166 if (ch == LEADING_BYTE_KATAKANA_JISX0201)
+ − 167 {
+ − 168 Dynarr_add (dst, c);
+ − 169 ch = 0;
+ − 170 }
+ − 171 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
+ − 172 ch == LEADING_BYTE_JAPANESE_JISX0208)
+ − 173 ch = c;
+ − 174 else
+ − 175 {
+ − 176 UExtbyte j1, j2;
+ − 177 ENCODE_SHIFT_JIS (ch, c, j1, j2);
+ − 178 Dynarr_add (dst, j1);
+ − 179 Dynarr_add (dst, j2);
+ − 180 ch = 0;
+ − 181 }
+ − 182 }
+ − 183 }
+ − 184 }
+ − 185
+ − 186 str->ch = ch;
+ − 187
+ − 188 return orign;
+ − 189 }
+ − 190
+ − 191 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
+ − 192 Decode a JISX0208 character of Shift-JIS coding-system.
+ − 193 CODE is the character code in Shift-JIS as a cons of type bytes.
+ − 194 Return the corresponding character.
+ − 195 */
+ − 196 (code))
+ − 197 {
+ − 198 int c1, c2, s1, s2;
+ − 199
+ − 200 CHECK_CONS (code);
+ − 201 CHECK_INT (XCAR (code));
+ − 202 CHECK_INT (XCDR (code));
+ − 203 s1 = XINT (XCAR (code));
+ − 204 s2 = XINT (XCDR (code));
826
+ − 205 if (byte_shift_jis_two_byte_1_p (s1) &&
+ − 206 byte_shift_jis_two_byte_2_p (s2))
771
+ − 207 {
+ − 208 DECODE_SHIFT_JIS (s1, s2, c1, c2);
867
+ − 209 return make_char (make_ichar (Vcharset_japanese_jisx0208,
831
+ − 210 c1 & 0x7F, c2 & 0x7F));
771
+ − 211 }
+ − 212 else
+ − 213 return Qnil;
+ − 214 }
+ − 215
+ − 216 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
+ − 217 Encode a JISX0208 character CHARACTER to SHIFT-JIS coding-system.
+ − 218 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
+ − 219 */
+ − 220 (character))
+ − 221 {
+ − 222 Lisp_Object charset;
+ − 223 int c1, c2, s1, s2;
+ − 224
+ − 225 CHECK_CHAR_COERCE_INT (character);
867
+ − 226 BREAKUP_ICHAR (XCHAR (character), charset, c1, c2);
771
+ − 227 if (EQ (charset, Vcharset_japanese_jisx0208))
+ − 228 {
+ − 229 ENCODE_SHIFT_JIS (c1 | 0x80, c2 | 0x80, s1, s2);
+ − 230 return Fcons (make_int (s1), make_int (s2));
+ − 231 }
+ − 232 else
+ − 233 return Qnil;
+ − 234 }
+ − 235
+ − 236
+ − 237 /************************************************************************/
+ − 238 /* Shift-JIS detector */
+ − 239 /************************************************************************/
+ − 240
DEFINE_DETECTOR (shift_jis);
DEFINE_DETECTOR_CATEGORY (shift_jis, shift_jis);

/* Running state of the Shift-JIS detector.  It is updated byte by byte
   in shift_jis_detect() and then turned into a likelihood there.  */
struct shift_jis_detector
{
  /* Number of well-formed two-byte sequences in which the first byte
     was flagged via first_byte_was_c1, or the second byte fell in
     0x80 .. 0x9F.  */
  int seen_jisx0208_char_in_c1;
  /* Number of all other well-formed two-byte sequences.  */
  int seen_jisx0208_char_in_upper;
  /* Number of bytes counted as single-byte JISX0201 characters.  */
  int seen_jisx0201_char;
  /* Set once ISO_CODE_ESC, ISO_CODE_SI or ISO_CODE_SO has been seen in
     first-byte position.  */
  unsigned int seen_iso2022_esc:1;
  /* Set once a byte >= 0x80 that fits none of the accepted first-byte
     ranges has been seen.  */
  unsigned int seen_bad_first_byte:1;
  /* Set once a first byte was followed by a byte outside the accepted
     second-byte ranges.  */
  unsigned int seen_bad_second_byte:1;
  /* temporary */
  /* The previous byte was the first byte of a two-byte sequence, so the
     next byte is examined as a second byte.  */
  unsigned int in_second_byte:1;
  /* The pending first byte was in the C1 range; cleared again once the
     second byte has been examined.  */
  unsigned int first_byte_was_c1:1;
};
+ − 256
+ − 257 static void
+ − 258 shift_jis_detect (struct detection_state *st, const UExtbyte *src,
+ − 259 Bytecount n)
+ − 260 {
+ − 261 struct shift_jis_detector *data = DETECTION_STATE_DATA (st, shift_jis);
+ − 262
+ − 263 while (n--)
+ − 264 {
+ − 265 UExtbyte c = *src++;
+ − 266 if (!data->in_second_byte)
+ − 267 {
+ − 268 if (c >= 0x80 && c <= 0x9F)
+ − 269 data->first_byte_was_c1 = 1;
+ − 270 if (c >= 0xA0 && c <= 0xDF)
+ − 271 data->seen_jisx0201_char++;
+ − 272 else if ((c >= 0x80 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
+ − 273 data->in_second_byte = 1;
+ − 274 else if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
+ − 275 data->seen_iso2022_esc = 1;
+ − 276 else if (c >= 0x80)
+ − 277 data->seen_bad_first_byte = 1;
+ − 278 }
+ − 279 else
+ − 280 {
+ − 281 if ((c >= 0x40 && c <= 0x7E) || (c >= 0x80 && c <= 0xFC))
+ − 282 {
+ − 283 if (data->first_byte_was_c1 || (c >= 0x80 && c <= 0x9F))
+ − 284 data->seen_jisx0208_char_in_c1++;
+ − 285 else
+ − 286 data->seen_jisx0208_char_in_upper++;
+ − 287 }
+ − 288 else
+ − 289 data->seen_bad_second_byte = 1;
+ − 290 data->in_second_byte = 0;
+ − 291 data->first_byte_was_c1 = 0;
+ − 292 }
+ − 293 }
+ − 294
+ − 295 if (data->seen_bad_second_byte)
+ − 296 DET_RESULT (st, shift_jis) = DET_NEARLY_IMPOSSIBLE;
+ − 297 else if (data->seen_bad_first_byte)
+ − 298 DET_RESULT (st, shift_jis) = DET_QUITE_IMPROBABLE;
+ − 299 else if (data->seen_iso2022_esc)
+ − 300 DET_RESULT (st, shift_jis) = DET_SOMEWHAT_UNLIKELY;
+ − 301 else if (data->seen_jisx0208_char_in_c1 >= 20 ||
+ − 302 (data->seen_jisx0208_char_in_c1 >= 10 &&
+ − 303 data->seen_jisx0208_char_in_upper >= 10))
+ − 304 DET_RESULT (st, shift_jis) = DET_QUITE_PROBABLE;
+ − 305 else if (data->seen_jisx0208_char_in_c1 > 3 ||
+ − 306 data->seen_jisx0208_char_in_upper >= 10 ||
+ − 307 /* Since the range is limited compared to what is often seen
+ − 308 is typical Latin-X charsets, the fact that we've seen a
+ − 309 bunch of them and none that are invalid is reasonably
+ − 310 strong statistical evidence of this encoding, or at least
+ − 311 not of the common Latin-X ones. */
+ − 312 data->seen_jisx0201_char >= 100)
+ − 313 DET_RESULT (st, shift_jis) = DET_SOMEWHAT_LIKELY;
+ − 314 else if (data->seen_jisx0208_char_in_c1 > 0 ||
+ − 315 data->seen_jisx0208_char_in_upper > 0 ||
+ − 316 data->seen_jisx0201_char > 0)
+ − 317 DET_RESULT (st, shift_jis) = DET_SLIGHTLY_LIKELY;
+ − 318 else
+ − 319 DET_RESULT (st, shift_jis) = DET_AS_LIKELY_AS_UNLIKELY;
+ − 320 }
+ − 321
+ − 322
+ − 323 /************************************************************************/
+ − 324 /* Big5 methods */
+ − 325 /************************************************************************/
+ − 326
/* BIG5 (used for Taiwanese).
   This declares the `big5' coding system type; its conversion method
   is big5_convert() below.  */
DEFINE_CODING_SYSTEM_TYPE (big5);
+ − 329
+ − 330 /* BIG5 is a coding system encoding two character sets: ASCII and
+ − 331 Big5. An ASCII character is encoded as is. Big5 is a two-byte
+ − 332 character set and is encoded in two-byte.
+ − 333
+ − 334 --- CODE RANGE of BIG5 ---
+ − 335 (character set) (range)
+ − 336 ASCII 0x00 .. 0x7F
+ − 337 Big5 (1st byte) 0xA1 .. 0xFE
+ − 338 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
+ − 339 --------------------------
+ − 340
+ − 341 Since the number of characters in Big5 is larger than maximum
+ − 342 characters in Emacs' charset (96x96), it can't be handled as one
+ − 343 charset. So, in XEmacs, Big5 is divided into two: `charset-big5-1'
+ − 344 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
+ − 345 contains frequently used characters and the latter contains less
+ − 346 frequently used characters. */
+ − 347
826
/* Return 1 if C lies in 0xA1 .. 0xFE, the range accepted as the first
   byte of a Big5 two-byte char; otherwise 0.  */
inline static int
byte_big5_two_byte_1_p (int c)
{
  return !(c < 0xA1 || c > 0xFE);
}
771
+ − 353
/* Is this the second byte of a Big5 two-byte char?  Returns 1 if C
   lies in 0x40 .. 0x7E or 0xA1 .. 0xFE, otherwise 0.  */

inline static int
byte_big5_two_byte_2_p (int c)
{
  /* Upper range.  */
  if (c >= 0xA1)
    return c <= 0xFE;
  /* Lower range.  */
  return c >= 0x40 && c <= 0x7E;
}
771
+ − 361
/* Number of Big5 characters which have the same code in 1st byte:
   the 94 second bytes 0xA1 .. 0xFE plus the 63 second bytes
   0x40 .. 0x7E, i.e. 157.  */

#define BIG5_SAME_ROW ((0xFF - 0xA1) + (0x7F - 0x40))
+ − 365
+ − 366 /* Code conversion macros. These are macros because they are used in
+ − 367 inner loops during code conversion.
+ − 368
+ − 369 Note that temporary variables in macros introduce the classic
+ − 370 dynamic-scoping problems with variable names. We use capital-
+ − 371 lettered variables in the assumption that XEmacs does not use
+ − 372 capital letters in variables except in a very formalized way
+ − 373 (e.g. Qstring). */
+ − 374
+ − 375 /* Convert Big5 code (b1, b2) into its internal string representation
+ − 376 (lb, c1, c2). */
+ − 377
+ − 378 /* There is a much simpler way to split the Big5 charset into two.
+ − 379 For the moment I'm going to leave the algorithm as-is because it
+ − 380 claims to separate out the most-used characters into a single
+ − 381 charset, which perhaps will lead to optimizations in various
+ − 382 places.
+ − 383
+ − 384 The way the algorithm works is something like this:
+ − 385
+ − 386 Big5 can be viewed as a 94x157 charset, where the row is
+ − 387 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
+ − 388 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
+ − 389 the split between low and high column numbers is apparently
+ − 390 meaningless; ascending rows produce less and less frequent chars.
+ − 391 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
+ − 392 the first charset, and the upper half (0xC9 .. 0xFE) to the
+ − 393 second. To do the conversion, we convert the character into
+ − 394 a single number where 0 .. 156 is the first row, 157 .. 313
+ − 395 is the second, etc. That way, the characters are ordered by
+ − 396 decreasing frequency. Then we just chop the space in two
+ − 397 and coerce the result into a 94x94 space.
+ − 398 */
+ − 399
/* DECODE_BIG5 (b1, b2, lb, c1, c2): B1 and B2 are the two Big5 bytes.
   Stores the leading byte in LB and the two position bytes (each
   offset by 0xA1) in C1 and C2.  The character is first numbered
   linearly, row by row; rows from 0xC9 upwards are renumbered from
   zero and assigned to the second charset.  */
#define DECODE_BIG5(b1, b2, lb, c1, c2) do                              \
{                                                                       \
  int B1 = (b1), B2 = (b2);                                             \
  int I = (B1 - 0xA1) * BIG5_SAME_ROW                                   \
    + (B2 - (B2 < 0x7F ? 0x40 : 0x62));                                 \
                                                                        \
  if (B1 >= 0xC9)                                                       \
    {                                                                   \
      lb = LEADING_BYTE_CHINESE_BIG5_2;                                 \
      I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1);                             \
    }                                                                   \
  else                                                                  \
    {                                                                   \
      lb = LEADING_BYTE_CHINESE_BIG5_1;                                 \
    }                                                                   \
  c1 = I / (0xFF - 0xA1) + 0xA1;                                        \
  c2 = I % (0xFF - 0xA1) + 0xA1;                                        \
} while (0)
+ − 418
+ − 419 /* Convert the internal string representation of a Big5 character
+ − 420 (lb, c1, c2) into Big5 code (b1, b2). */
+ − 421
/* ENCODE_BIG5 (lb, c1, c2, b1, b2): LB is the leading byte and C1, C2
   the two position bytes (each offset by 0xA1) of the internal
   representation.  Stores the two Big5 bytes in B1 and B2.

   Every input argument is parenthesized where it is used, so that an
   expression argument such as `x | y' for LB is compared as a whole.  */
#define ENCODE_BIG5(lb, c1, c2, b1, b2) do                              \
{                                                                       \
  int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1);                \
                                                                        \
  if ((lb) == LEADING_BYTE_CHINESE_BIG5_2)                              \
    {                                                                   \
      /* The second charset continues where the first one ends.  */     \
      I += BIG5_SAME_ROW * (0xC9 - 0xA1);                               \
    }                                                                   \
  (b1) = I / BIG5_SAME_ROW + 0xA1;                                      \
  (b2) = I % BIG5_SAME_ROW;                                             \
  /* Columns 0 .. 62 map to 0x40 .. 0x7E, the rest to 0xA1 .. 0xFE.  */ \
  (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                    \
} while (0)
+ − 434
+ − 435 /* Convert Big5 data to internal format. */
+ − 436
+ − 437 static Bytecount
+ − 438 big5_convert (struct coding_stream *str, const UExtbyte *src,
+ − 439 unsigned_char_dynarr *dst, Bytecount n)
+ − 440 {
+ − 441 unsigned int ch = str->ch;
+ − 442 Bytecount orign = n;
+ − 443
+ − 444 if (str->direction == CODING_DECODE)
+ − 445 {
+ − 446 while (n--)
+ − 447 {
+ − 448 UExtbyte c = *src++;
+ − 449 if (ch)
+ − 450 {
+ − 451 /* Previous character was first byte of Big5 char. */
826
+ − 452 if (byte_big5_two_byte_2_p (c))
771
+ − 453 {
867
+ − 454 Ibyte b1, b2, b3;
771
+ − 455 DECODE_BIG5 (ch, c, b1, b2, b3);
+ − 456 Dynarr_add (dst, b1);
+ − 457 Dynarr_add (dst, b2);
+ − 458 Dynarr_add (dst, b3);
+ − 459 }
+ − 460 else
+ − 461 {
+ − 462 DECODE_ADD_BINARY_CHAR (ch, dst);
+ − 463 DECODE_ADD_BINARY_CHAR (c, dst);
+ − 464 }
+ − 465 ch = 0;
+ − 466 }
+ − 467 else
+ − 468 {
826
+ − 469 if (byte_big5_two_byte_1_p (c))
771
+ − 470 ch = c;
+ − 471 else
+ − 472 DECODE_ADD_BINARY_CHAR (c, dst);
+ − 473 }
+ − 474 }
+ − 475
+ − 476 if (str->eof)
+ − 477 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
+ − 478 }
+ − 479 else
+ − 480 {
+ − 481 while (n--)
+ − 482 {
867
+ − 483 Ibyte c = *src++;
826
+ − 484 if (byte_ascii_p (c))
771
+ − 485 {
+ − 486 /* ASCII. */
+ − 487 Dynarr_add (dst, c);
+ − 488 }
867
+ − 489 else if (ibyte_leading_byte_p (c))
771
+ − 490 {
+ − 491 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
+ − 492 c == LEADING_BYTE_CHINESE_BIG5_2)
+ − 493 {
+ − 494 /* A recognized leading byte. */
+ − 495 ch = c;
+ − 496 continue; /* not done with this character. */
+ − 497 }
+ − 498 /* otherwise just ignore this character. */
+ − 499 }
+ − 500 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
+ − 501 ch == LEADING_BYTE_CHINESE_BIG5_2)
+ − 502 {
+ − 503 /* Previous char was a recognized leading byte. */
+ − 504 ch = (ch << 8) | c;
+ − 505 continue; /* not done with this character. */
+ − 506 }
+ − 507 else if (ch)
+ − 508 {
+ − 509 /* Encountering second byte of a Big5 character. */
+ − 510 UExtbyte b1, b2;
+ − 511
+ − 512 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
+ − 513 Dynarr_add (dst, b1);
+ − 514 Dynarr_add (dst, b2);
+ − 515 }
+ − 516
+ − 517 ch = 0;
+ − 518 }
+ − 519 }
+ − 520
+ − 521 str->ch = ch;
+ − 522 return orign;
+ − 523 }
+ − 524
867
+ − 525 Ichar
771
+ − 526 decode_big5_char (int b1, int b2)
+ − 527 {
826
+ − 528 if (byte_big5_two_byte_1_p (b1) &&
+ − 529 byte_big5_two_byte_2_p (b2))
771
+ − 530 {
+ − 531 int leading_byte;
+ − 532 Lisp_Object charset;
+ − 533 int c1, c2;
+ − 534
+ − 535 DECODE_BIG5 (b1, b2, leading_byte, c1, c2);
826
+ − 536 charset = charset_by_leading_byte (leading_byte);
867
+ − 537 return make_ichar (charset, c1 & 0x7F, c2 & 0x7F);
771
+ − 538 }
+ − 539 else
+ − 540 return -1;
+ − 541 }
+ − 542
+ − 543 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
+ − 544 Convert Big Five character codes in CODE into a character.
+ − 545 CODE is a cons of two integers specifying the codepoints in Big Five.
+ − 546 Return the corresponding character, or nil if the codepoints are out of range.
+ − 547
+ − 548 The term `decode' is used because the codepoints can be viewed as the
+ − 549 representation of the character in the external Big Five encoding, and thus
+ − 550 converting them to a character is analogous to any other operation that
+ − 551 decodes an external representation.
+ − 552 */
+ − 553 (code))
+ − 554 {
867
+ − 555 Ichar ch;
771
+ − 556
+ − 557 CHECK_CONS (code);
+ − 558 CHECK_INT (XCAR (code));
+ − 559 CHECK_INT (XCDR (code));
+ − 560 ch = decode_big5_char (XINT (XCAR (code)), XINT (XCDR (code)));
+ − 561 if (ch == -1)
+ − 562 return Qnil;
+ − 563 else
+ − 564 return make_char (ch);
+ − 565 }
+ − 566
+ − 567 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
+ − 568 Convert the specified Big Five character into its codepoints.
+ − 569 The codepoints are returned as a cons of two integers, specifying the
+ − 570 Big Five codepoints. See `decode-big5-char' for the reason why the
+ − 571 term `encode' is used for this operation.
+ − 572 */
+ − 573 (character))
+ − 574 {
+ − 575 Lisp_Object charset;
+ − 576 int c1, c2, b1, b2;
+ − 577
+ − 578 CHECK_CHAR_COERCE_INT (character);
867
+ − 579 BREAKUP_ICHAR (XCHAR (character), charset, c1, c2);
771
+ − 580 if (EQ (charset, Vcharset_chinese_big5_1) ||
+ − 581 EQ (charset, Vcharset_chinese_big5_2))
+ − 582 {
+ − 583 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
+ − 584 b1, b2);
+ − 585 return Fcons (make_int (b1), make_int (b2));
+ − 586 }
+ − 587 else
+ − 588 return Qnil;
+ − 589 }
+ − 590
+ − 591
+ − 592 /************************************************************************/
+ − 593 /* Big5 detector */
+ − 594 /************************************************************************/
+ − 595
DEFINE_DETECTOR (big5);
DEFINE_DETECTOR_CATEGORY (big5, big5);

/* Running state of the Big5 detector.  It is updated byte by byte in
   big5_detect() and then turned into a likelihood there.  */
struct big5_detector
{
  /* Number of two-byte sequences whose second byte was in
     0x40 .. 0x7E.  */
  int seen_big5_char;
  /* Number of two-byte sequences whose second byte was in
     0xA1 .. 0xFE.  NOTE(review): presumably named this way because
     such pairs are also valid in EUC encodings -- confirm.  */
  int seen_euc_char;
  /* Set once ISO_CODE_ESC, ISO_CODE_SI or ISO_CODE_SO has been seen in
     first-byte position.  */
  unsigned int seen_iso2022_esc:1;
  /* Set once a byte >= 0x80 outside 0xA1 .. 0xFE has been seen in
     first-byte position.  */
  unsigned int seen_bad_first_byte:1;
  /* Set once a first byte was followed by a byte outside both
     second-byte ranges.  */
  unsigned int seen_bad_second_byte:1;

  /* temporary */
  /* The previous byte was the first byte of a two-byte sequence, so the
     next byte is examined as a second byte.  */
  unsigned int in_second_byte:1;
};
+ − 610
+ − 611 static void
+ − 612 big5_detect (struct detection_state *st, const UExtbyte *src,
+ − 613 Bytecount n)
+ − 614 {
+ − 615 struct big5_detector *data = DETECTION_STATE_DATA (st, big5);
+ − 616
+ − 617 while (n--)
+ − 618 {
+ − 619 UExtbyte c = *src++;
+ − 620 if (!data->in_second_byte)
+ − 621 {
+ − 622 if (c >= 0xA1 && c <= 0xFE)
+ − 623 data->in_second_byte = 1;
+ − 624 else if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
+ − 625 data->seen_iso2022_esc = 1;
+ − 626 else if (c >= 0x80)
+ − 627 data->seen_bad_first_byte = 1;
+ − 628 }
+ − 629 else
+ − 630 {
+ − 631 data->in_second_byte = 0;
985
+ − 632 if (c >= 0xA1 && c <= 0xFE)
+ − 633 data->seen_euc_char++;
+ − 634 else if (c >= 0x40 && c <= 0x7E)
771
+ − 635 data->seen_big5_char++;
+ − 636 else
+ − 637 data->seen_bad_second_byte = 1;
+ − 638 }
+ − 639 }
+ − 640
+ − 641 if (data->seen_bad_second_byte)
+ − 642 DET_RESULT (st, big5) = DET_NEARLY_IMPOSSIBLE;
+ − 643 else if (data->seen_bad_first_byte)
+ − 644 DET_RESULT (st, big5) = DET_QUITE_IMPROBABLE;
+ − 645 else if (data->seen_iso2022_esc)
+ − 646 DET_RESULT (st, big5) = DET_SOMEWHAT_UNLIKELY;
+ − 647 else if (data->seen_big5_char >= 4)
+ − 648 DET_RESULT (st, big5) = DET_SOMEWHAT_LIKELY;
985
+ − 649 else if (data->seen_euc_char)
+ − 650 DET_RESULT (st, big5) = DET_SLIGHTLY_LIKELY;
771
+ − 651 else
+ − 652 DET_RESULT (st, big5) = DET_AS_LIKELY_AS_UNLIKELY;
+ − 653 }
+ − 654
+ − 655
+ − 656 /************************************************************************/
+ − 657 /* ISO2022 methods */
+ − 658 /************************************************************************/
+ − 659
/* Any ISO-2022-compliant coding system.  Includes JIS, EUC, CTEXT
   (Compound Text, the encoding of selections in X Windows).  See below for
   a complete description of ISO-2022.
   This declares the `iso2022' coding system type.  */
DEFINE_CODING_SYSTEM_TYPE (iso2022);
+ − 664
/* Flags indicating what we've seen so far when parsing an
   ISO2022 escape sequence.  The first group of values describes
   sequences that are still incomplete; the second group describes
   sequences that have been recognized in full.  The numbers in the
   names are the column/row form of the bytes seen: 2_8 is 0x28,
   2_4 is 0x24 ($), 5_11 is 0x5B ([).  */
enum iso_esc_flag
{
  /* Partial sequences */
  ISO_ESC_NOTHING,	/* Nothing has been seen. */
  ISO_ESC,		/* We've seen ESC. */
  ISO_ESC_2_4,		/* We've seen ESC $.  This indicates
			   that we're designating a multi-byte, rather
			   than a single-byte, character set. */
  ISO_ESC_2_8,		/* We've seen ESC 0x28, i.e. ESC (.
			   This means designate a 94-character
			   character set into G0. */
  ISO_ESC_2_9,		/* We've seen ESC 0x29 -- designate a
			   94-character character set into G1. */
  ISO_ESC_2_10,		/* We've seen ESC 0x2A.  (Presumably G2,
			   following the pattern above -- confirm.) */
  ISO_ESC_2_11,		/* We've seen ESC 0x2B.  (Presumably G3 --
			   confirm.) */
  ISO_ESC_2_12,		/* We've seen ESC 0x2C -- designate a
			   96-character character set into G0.
			   (This is not ISO2022-standard.
			   The following 96-character
			   control sequences are standard,
			   though.) */
  ISO_ESC_2_13,		/* We've seen ESC 0x2D -- designate a
			   96-character character set into G1.
			   */
  ISO_ESC_2_14,		/* We've seen ESC 0x2E. */
  ISO_ESC_2_15,		/* We've seen ESC 0x2F. */
  ISO_ESC_2_4_8,	/* We've seen ESC $ 0x28 -- designate
			   a 94^N character set into G0. */
  ISO_ESC_2_4_9,	/* We've seen ESC $ 0x29. */
  ISO_ESC_2_4_10,	/* We've seen ESC $ 0x2A. */
  ISO_ESC_2_4_11,	/* We've seen ESC $ 0x2B. */
  ISO_ESC_2_4_12,	/* We've seen ESC $ 0x2C. */
  ISO_ESC_2_4_13,	/* We've seen ESC $ 0x2D. */
  ISO_ESC_2_4_14,	/* We've seen ESC $ 0x2E. */
  ISO_ESC_2_4_15,	/* We've seen ESC $ 0x2F. */
  ISO_ESC_5_11,		/* We've seen ESC [ or 0x9B.  This
			   starts a directionality-control
			   sequence.  The next character
			   must be 0, 1, 2, or ]. */
  ISO_ESC_5_11_0,	/* We've seen 0x9B 0.  The next character must be ]. */
  ISO_ESC_5_11_1,	/* We've seen 0x9B 1.  The next character must be ]. */
  ISO_ESC_5_11_2,	/* We've seen 0x9B 2.  The next character must be ]. */

  /* Full sequences. */
  ISO_ESC_START_COMPOSITE, /* Private usage for START COMPOSING */
  ISO_ESC_END_COMPOSITE, /* Private usage for END COMPOSING */
  ISO_ESC_SINGLE_SHIFT, /* We've seen a complete single-shift sequence. */
  ISO_ESC_LOCKING_SHIFT,/* We've seen a complete locking-shift sequence. */
  ISO_ESC_DESIGNATE,	/* We've seen a complete designation sequence. */
  ISO_ESC_DIRECTIONALITY,/* We've seen a complete ISO6429 directionality
			   sequence. */
  ISO_ESC_LITERAL	/* We've seen a literal character, a la
			   escape-quoting. */
};
+ − 721
/* Kinds of error distinguished for ISO2022 data.  The spelling
   "UNKWOWN" is kept as is because the identifier may be referenced
   outside this block.  */
enum iso_error
{
  ISO_ERROR_BAD_FINAL = 0,
  ISO_ERROR_UNKWOWN_ESC_SEQUENCE = 1,
  ISO_ERROR_INVALID_CODE_POINT_CHARACTER = 2
};
+ − 728
+ − 729
/* Flags indicating current state while converting code.  Each flag is
   a distinct bit so that several can be combined in one word.  */

/* ---------- Used during encoding and decoding ---------- */

/* Set: the current directionality is right-to-left.
   Clear: it is left-to-right. */
#define ISO_STATE_R2L		(1 << 0)

/* ---------- Used during encoding ---------- */

/* Set: we just saw a CR. */
#define ISO_STATE_CR		(1 << 1)

/* ---------- Used during decoding ---------- */

/* Set: we're currently parsing an escape sequence and the upper 16 bits
   should be looked at to indicate what partial escape sequence we've seen
   so far.  Clear: we're running through actual text. */
#define ISO_STATE_ESCAPE	(1 << 2)

/* Set: G2 is invoked into GL, but only for the next character. */
#define ISO_STATE_SS2		(1 << 3)

/* Set: G3 is invoked into GL, but only for the next character.  If both
   ISO_STATE_SS2 and ISO_STATE_SS3 are set, ISO_STATE_SS2 overrides; but
   this probably indicates an error in the text encoding. */
#define ISO_STATE_SS3		(1 << 4)

/* Set: we're currently processing a composite character (i.e. a
   character constructed by overstriking two or more characters). */
#define ISO_STATE_COMPOSITE	(1 << 5)

/* ISO_STATE_LOCK is the mask of flags that remain on until explicitly
   turned off when in the ISO2022 encoder/decoder.  Other flags are turned
   off at the end of processing each character or escape sequence. */
#define ISO_STATE_LOCK (ISO_STATE_COMPOSITE | ISO_STATE_R2L)
+ − 761
/* One charset-conversion rule: a pair of charsets.
   NOTE(review): presumably characters of FROM_CHARSET are to be
   converted into TO_CHARSET -- confirm against the code that consumes
   these specs.  */
typedef struct charset_conversion_spec
{
  Lisp_Object from_charset;
  Lisp_Object to_charset;
} charset_conversion_spec;

/* Dynamic array of charset_conversion_spec, declared with
   Dynarr_declare.  */
typedef struct
{
  Dynarr_declare (charset_conversion_spec);
} charset_conversion_spec_dynarr;
+ − 772
/* Type-specific data of an `iso2022' coding system.  The fields are
   reached through the CODING_SYSTEM_ISO2022_* accessor macros defined
   after this struct.  */
struct iso2022_coding_system
{
  /* What are the charsets to be initially designated to G0, G1,
     G2, G3?  If t, no charset is initially designated.  If nil,
     no charset is initially designated and no charset is allowed
     to be designated. */
  Lisp_Object initial_charset[4];

  /* If true, a designation escape sequence needs to be sent on output
     for the charset in G[0-3] before that charset is used. */
  unsigned char force_charset_on_output[4];

  /* Charset-conversion specs for input and for output.  */
  charset_conversion_spec_dynarr *input_conv;
  charset_conversion_spec_dynarr *output_conv;

  /* One-bit option flags.  `shoort' is misspelled on purpose because
     `short' is a C keyword; it is read through
     CODING_SYSTEM_ISO2022_SHORT.  */
  unsigned int shoort		:1; /* C makes you speak Dutch */
  unsigned int no_ascii_eol	:1;
  unsigned int no_ascii_cntl	:1;
  unsigned int seven		:1;
  unsigned int lock_shift	:1;
  unsigned int no_iso6429	:1;
  unsigned int escape_quoted	:1;
};
+ − 796
/* Accessors for the fields of struct iso2022_coding_system.  CODESYS is
   passed to CODING_SYSTEM_TYPE_DATA; G is the index (0 .. 3) into the
   four-element per-register arrays.  */
#define CODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, g) \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->initial_charset[g])
#define CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT(codesys, g) \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->force_charset_on_output[g])
#define CODING_SYSTEM_ISO2022_SHORT(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->shoort)
#define CODING_SYSTEM_ISO2022_NO_ASCII_EOL(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->no_ascii_eol)
#define CODING_SYSTEM_ISO2022_NO_ASCII_CNTL(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->no_ascii_cntl)
#define CODING_SYSTEM_ISO2022_SEVEN(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->seven)
#define CODING_SYSTEM_ISO2022_LOCK_SHIFT(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->lock_shift)
#define CODING_SYSTEM_ISO2022_NO_ISO6429(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->no_iso6429)
#define CODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->escape_quoted)
#define CODING_SYSTEM_ISO2022_INPUT_CONV(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->input_conv)
#define CODING_SYSTEM_ISO2022_OUTPUT_CONV(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, iso2022)->output_conv)

/* The same accessors, but applying XCODING_SYSTEM to CODESYS first.  */
#define XCODING_SYSTEM_ISO2022_INITIAL_CHARSET(codesys, g) \
  CODING_SYSTEM_ISO2022_INITIAL_CHARSET (XCODING_SYSTEM (codesys), g)
#define XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT(codesys, g) \
  CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (XCODING_SYSTEM (codesys), g)
#define XCODING_SYSTEM_ISO2022_SHORT(codesys) \
  CODING_SYSTEM_ISO2022_SHORT (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_ISO2022_NO_ASCII_EOL(codesys) \
  CODING_SYSTEM_ISO2022_NO_ASCII_EOL (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_ISO2022_NO_ASCII_CNTL(codesys) \
  CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_ISO2022_SEVEN(codesys) \
  CODING_SYSTEM_ISO2022_SEVEN (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_ISO2022_LOCK_SHIFT(codesys) \
  CODING_SYSTEM_ISO2022_LOCK_SHIFT (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_ISO2022_NO_ISO6429(codesys) \
  CODING_SYSTEM_ISO2022_NO_ISO6429 (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED(codesys) \
  CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_ISO2022_INPUT_CONV(codesys) \
  CODING_SYSTEM_ISO2022_INPUT_CONV (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_ISO2022_OUTPUT_CONV(codesys) \
  CODING_SYSTEM_ISO2022_OUTPUT_CONV (XCODING_SYSTEM (codesys))
+ − 842
+ − 843 /* Additional information used by the ISO2022 decoder and detector. */
+ − 844 struct iso2022_coding_stream
+ − 845 {
+ − 846 /* CHARSET holds the character sets currently assigned to the G0
+ − 847 through G3 variables. It is initialized from the array
+ − 848 INITIAL_CHARSET in CODESYS. */
+ − 849 Lisp_Object charset[4];
+ − 850
+ − 851 /* Which registers are currently invoked into the left (GL) and
+ − 852 right (GR) halves of the 8-bit encoding space? */
+ − 853 int register_left, register_right;
+ − 854
+ − 855 /* FLAGS holds flags indicating the current state of the encoding. Some of
+ − 856 these flags are actually part of the state-dependent data and should be
+ − 857 moved there. */
+ − 858 unsigned int flags;
+ − 859
+ − 860 /**************** for decoding ****************/
+ − 861
+ − 862 /* ISO_ESC holds a value indicating part of an escape sequence
+ − 863 that has already been seen. */
+ − 864 enum iso_esc_flag esc;
+ − 865
+ − 866 /* This records the bytes we've seen so far in an escape sequence,
+ − 867 in case the sequence is invalid (we spit out the bytes unchanged). */
+ − 868 unsigned char esc_bytes[8];
+ − 869
+ − 870 /* Index for next byte to store in ISO escape sequence. */
+ − 871 int esc_bytes_index;
+ − 872
+ − 873 #ifdef ENABLE_COMPOSITE_CHARS
+ − 874 /* Stuff seen so far when composing a string. */
+ − 875 unsigned_char_dynarr *composite_chars;
+ − 876 #endif
+ − 877
+ − 878 /* If we saw an invalid designation sequence for a particular
+ − 879 register, we flag it here and switch to ASCII. The next time we
+ − 880 see a valid designation for this register, we turn off the flag
+ − 881 and do the designation normally, but pretend the sequence was
+ − 882 invalid. The effect of all this is that (most of the time) the
+ − 883 escape sequences for both the switch to the unknown charset, and
+ − 884 the switch back to the known charset, get inserted literally into
+ − 885 the buffer and saved out as such. The hope is that we can
+ − 886 preserve the escape sequences so that the resulting written out
+ − 887 file makes sense. If we don't do any of this, the designation
+ − 888 to the invalid charset will be preserved but that switch back
+ − 889 to the known charset will probably get eaten because it was
+ − 890 the same charset that was already present in the register. */
+ − 891 unsigned char invalid_designated[4];
+ − 892
+ − 893 /* We try to do similar things as above for direction-switching
+ − 894 sequences. If we encountered a direction switch while an
+ − 895 invalid designation was present, or an invalid designation
+ − 896 just after a direction switch (i.e. no valid designation
+ − 897 encountered yet), we insert the direction-switch escape
+ − 898 sequence literally into the output stream, and later on
+ − 899 insert the corresponding direction-restoring escape sequence
+ − 900 literally also. */
+ − 901 unsigned int switched_dir_and_no_valid_charset_yet :1;
+ − 902 unsigned int invalid_switch_dir :1;
+ − 903
+ − 904 /* Tells the decoder to output the escape sequence literally
+ − 905 even though it was valid. Used in the games we play to
+ − 906 avoid lossage when we encounter invalid designations. */
+ − 907 unsigned int output_literally :1;
+ − 908 /* We encountered a direction switch followed by an invalid
+ − 909 designation. We didn't output the direction switch
+ − 910 literally because we didn't know about the invalid designation;
+ − 911 but we have to do so now. */
+ − 912 unsigned int output_direction_sequence :1;
+ − 913
+ − 914 /**************** for encoding ****************/
+ − 915
+ − 916 /* Whether we need to explicitly designate the charset in the
+ − 917 G? register before using it. It is initialized from the
+ − 918 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
+ − 919 unsigned char force_charset_on_output[4];
+ − 920
+ − 921 /* Other state variables that need to be preserved across
+ − 922 invocations. */
+ − 923 Lisp_Object current_charset;
+ − 924 int current_half;
+ − 925 int current_char_boundary;
+ − 926 };
+ − 927
+ − 928 static const struct lrecord_description ccs_description_1[] =
+ − 929 {
+ − 930 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
+ − 931 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, to_charset) },
+ − 932 { XD_END }
+ − 933 };
+ − 934
+ − 935 static const struct struct_description ccs_description =
+ − 936 {
+ − 937 sizeof (charset_conversion_spec),
+ − 938 ccs_description_1
+ − 939 };
+ − 940
+ − 941 static const struct lrecord_description ccsd_description_1[] =
+ − 942 {
+ − 943 XD_DYNARR_DESC (charset_conversion_spec_dynarr, &ccs_description),
+ − 944 { XD_END }
+ − 945 };
+ − 946
+ − 947 static const struct struct_description ccsd_description =
+ − 948 {
+ − 949 sizeof (charset_conversion_spec_dynarr),
+ − 950 ccsd_description_1
+ − 951 };
+ − 952
+ − 953 static const struct lrecord_description iso2022_coding_system_description[] = {
+ − 954 { XD_LISP_OBJECT_ARRAY,
+ − 955 coding_system_data_offset + offsetof (struct iso2022_coding_system,
+ − 956 initial_charset), 4 },
+ − 957 { XD_STRUCT_PTR,
+ − 958 coding_system_data_offset + offsetof (struct iso2022_coding_system,
+ − 959 input_conv),
+ − 960 1, &ccsd_description },
+ − 961 { XD_STRUCT_PTR,
+ − 962 coding_system_data_offset + offsetof (struct iso2022_coding_system,
+ − 963 output_conv),
+ − 964 1, &ccsd_description },
+ − 965 { XD_END }
+ − 966 };
+ − 967
+ − 968 /* The following note taken directly from FSF 21.0.103. */
+ − 969
+ − 970 /* The following note describes the coding system ISO2022 briefly.
+ − 971 Since the intention of this note is to help understand the
+ − 972 functions in this file, some parts are NOT ACCURATE or are OVERLY
+ − 973 SIMPLIFIED. For thorough understanding, please refer to the
+ − 974 original document of ISO2022. This is equivalent to the standard
+ − 975 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
+ − 976
+ − 977 ISO2022 provides many mechanisms to encode several character sets
+ − 978 in 7-bit and 8-bit environments. For 7-bit environments, all text
+ − 979 is encoded using bytes less than 128. This may make the encoded
+ − 980 text a little bit longer, but the text passes more easily through
+ − 981 several types of gateway, some of which strip off the MSB (Most
+ − 982 Significant Bit).
+ − 983
+ − 984 There are two kinds of character sets: control character sets and
+ − 985 graphic character sets. The former contain control characters such
+ − 986 as `newline' and `escape' to provide control functions (control
+ − 987 functions are also provided by escape sequences). The latter
+ − 988 contain graphic characters such as 'A' and '-'. Emacs recognizes
+ − 989 two control character sets and many graphic character sets.
+ − 990
+ − 991 Graphic character sets are classified into one of the following
+ − 992 four classes, according to the number of bytes (DIMENSION) and
+ − 993 number of characters in one dimension (CHARS) of the set:
+ − 994 - DIMENSION1_CHARS94
+ − 995 - DIMENSION1_CHARS96
+ − 996 - DIMENSION2_CHARS94
+ − 997 - DIMENSION2_CHARS96
+ − 998
+ − 999 In addition, each character set is assigned an identification tag,
+ − 1000 unique for each set, called the "final character" (denoted as <F>
+ − 1001 hereafter). The <F> of each character set is decided by ECMA(*)
+ − 1002 when it is registered in ISO. The code range of <F> is 0x30..0x7F
+ − 1003 (0x30..0x3F are for private use only).
+ − 1004
+ − 1005 Note (*): ECMA = European Computer Manufacturers Association
+ − 1006
+ − 1007 Here are examples of graphic character sets [NAME(<F>)]:
+ − 1008 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
+ − 1009 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
+ − 1010 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
+ − 1011 o DIMENSION2_CHARS96 -- none for the moment
+ − 1012
+ − 1013 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
+ − 1014 C0 [0x00..0x1F] -- control character plane 0
+ − 1015 GL [0x20..0x7F] -- graphic character plane 0
+ − 1016 C1 [0x80..0x9F] -- control character plane 1
+ − 1017 GR [0xA0..0xFF] -- graphic character plane 1
+ − 1018
+ − 1019 A control character set is directly designated and invoked to C0 or
+ − 1020 C1 by an escape sequence. The most common case is that:
+ − 1021 - ISO646's control character set is designated/invoked to C0, and
+ − 1022 - ISO6429's control character set is designated/invoked to C1,
+ − 1023 and usually these designations/invocations are omitted in encoded
+ − 1024 text. In a 7-bit environment, only C0 can be used, and a control
+ − 1025 character for C1 is encoded by an appropriate escape sequence to
+ − 1026 fit into the environment. All control characters for C1 are
+ − 1027 defined to have corresponding escape sequences.
+ − 1028
+ − 1029 A graphic character set is at first designated to one of four
+ − 1030 graphic registers (G0 through G3), then these graphic registers are
+ − 1031 invoked to GL or GR. These designations and invocations can be
+ − 1032 done independently. The most common case is that G0 is invoked to
+ − 1033 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
+ − 1034 these invocations and designations are omitted in encoded text.
+ − 1035 In a 7-bit environment, only GL can be used.
+ − 1036
+ − 1037 When a graphic character set of CHARS94 is invoked to GL, codes
+ − 1038 0x20 and 0x7F of the GL area work as control characters SPACE and
+ − 1039 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
+ − 1040 be used.
+ − 1041
+ − 1042 There are two ways of invocation: locking-shift and single-shift.
+ − 1043 With locking-shift, the invocation lasts until the next different
+ − 1044 invocation, whereas with single-shift, the invocation affects the
+ − 1045 following character only and doesn't affect the locking-shift
+ − 1046 state. Invocations are done by the following control characters or
+ − 1047 escape sequences:
+ − 1048
+ − 1049 ----------------------------------------------------------------------
+ − 1050 abbrev function cntrl escape seq description
+ − 1051 ----------------------------------------------------------------------
+ − 1052 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
+ − 1053 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
+ − 1054 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
+ − 1055 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
+ − 1056 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
+ − 1057 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
+ − 1058 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
+ − 1059 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
+ − 1060 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
+ − 1061 ----------------------------------------------------------------------
+ − 1062 (*) These are not used by any known coding system.
+ − 1063
+ − 1064 Control characters for these functions are defined by macros
+ − 1065 ISO_CODE_XXX in `coding.h'.
+ − 1066
+ − 1067 Designations are done by the following escape sequences:
+ − 1068 ----------------------------------------------------------------------
+ − 1069 escape sequence description
+ − 1070 ----------------------------------------------------------------------
+ − 1071 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
+ − 1072 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
+ − 1073 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
+ − 1074 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
+ − 1075 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
+ − 1076 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
+ − 1077 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
+ − 1078 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
+ − 1079 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
+ − 1080 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
+ − 1081 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
+ − 1082 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
+ − 1083 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
+ − 1084 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
+ − 1085 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
+ − 1086 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
+ − 1087 ----------------------------------------------------------------------
+ − 1088
+ − 1089 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
+ − 1090 of dimension 1, chars 94, and final character <F>, etc...
+ − 1091
+ − 1092 Note (*): Although these designations are not allowed in ISO2022,
+ − 1093 Emacs accepts them on decoding, and produces them on encoding
+ − 1094 CHARS96 character sets in a coding system which is characterized as
+ − 1095 7-bit environment, non-locking-shift, and non-single-shift.
+ − 1096
+ − 1097 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
+ − 1098 '(' can be omitted. We refer to this as "short-form" hereafter.
+ − 1099
+ − 1100 Now you may notice that there are a lot of ways of encoding the
+ − 1101 same multilingual text in ISO2022. Actually, there exist many
+ − 1102 coding systems such as Compound Text (used in X11's inter client
+ − 1103 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
+ − 1104 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
+ − 1105 localized platforms), and all of these are variants of ISO2022.
+ − 1106
+ − 1107 In addition to the above, Emacs handles two more kinds of escape
+ − 1108 sequences: ISO6429's direction specification and Emacs' private
+ − 1109 sequence for specifying character composition.
+ − 1110
+ − 1111 ISO6429's direction specification takes the following form:
+ − 1112 o CSI ']' -- end of the current direction
+ − 1113 o CSI '0' ']' -- end of the current direction
+ − 1114 o CSI '1' ']' -- start of left-to-right text
+ − 1115 o CSI '2' ']' -- start of right-to-left text
+ − 1116 The control character CSI (0x9B: control sequence introducer) is
+ − 1117 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
+ − 1118
+ − 1119 Character composition specification takes the following form:
+ − 1120 o ESC '0' -- start relative composition
+ − 1121 o ESC '1' -- end composition
+ − 1122 o ESC '2' -- start rule-base composition (*)
+ − 1123 o ESC '3' -- start relative composition with alternate chars (**)
+ − 1124 o ESC '4' -- start rule-base composition with alternate chars (**)
+ − 1125 Since these are not standard escape sequences of any ISO standard,
+ − 1126 the use of them with these meanings is restricted to Emacs only.
+ − 1127
+ − 1128 (*) This form is used only in Emacs 20.5 and older versions,
+ − 1129 but the newer versions can safely decode it.
+ − 1130 (**) This form is used only in Emacs 21.1 and newer versions,
+ − 1131 and the older versions can't decode it.
+ − 1132
+ − 1133 Here's a list of example usages of these composition escape
+ − 1134 sequences (categorized by `enum composition_method').
+ − 1135
+ − 1136 COMPOSITION_RELATIVE:
+ − 1137 ESC 0 CHAR [ CHAR ] ESC 1
+ − 1138 COMPOSITION_WITH_RULE:
+ − 1139 ESC 2 CHAR [ RULE CHAR ] ESC 1
+ − 1140 COMPOSITION_WITH_ALTCHARS:
+ − 1141 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
+ − 1142 COMPOSITION_WITH_RULE_ALTCHARS:
+ − 1143 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
+ − 1144
+ − 1145 static void
+ − 1146 reset_iso2022_decode (Lisp_Object coding_system,
+ − 1147 struct iso2022_coding_stream *data)
+ − 1148 {
+ − 1149 int i;
+ − 1150 #ifdef ENABLE_COMPOSITE_CHARS
+ − 1151 unsigned_char_dynarr *old_composite_chars = data->composite_chars;
+ − 1152 #endif
+ − 1153
+ − 1154 xzero (*data);
+ − 1155
+ − 1156 for (i = 0; i < 4; i++)
+ − 1157 {
+ − 1158 if (!NILP (coding_system))
+ − 1159 data->charset[i] =
+ − 1160 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
+ − 1161 else
+ − 1162 data->charset[i] = Qt;
+ − 1163 }
+ − 1164 data->esc = ISO_ESC_NOTHING;
+ − 1165 data->register_right = 1;
+ − 1166 #ifdef ENABLE_COMPOSITE_CHARS
+ − 1167 if (old_composite_chars)
+ − 1168 {
+ − 1169 data->composite_chars = old_composite_chars;
+ − 1170 Dynarr_reset (data->composite_chars);
+ − 1171 }
+ − 1172 #endif
+ − 1173 }
+ − 1174
+ − 1175 static void
+ − 1176 reset_iso2022_encode (Lisp_Object coding_system,
+ − 1177 struct iso2022_coding_stream *data)
+ − 1178 {
+ − 1179 int i;
+ − 1180
+ − 1181 xzero (*data);
+ − 1182
+ − 1183 for (i = 0; i < 4; i++)
+ − 1184 {
+ − 1185 data->charset[i] =
+ − 1186 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
+ − 1187 data->force_charset_on_output[i] =
+ − 1188 XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (coding_system, i);
+ − 1189 }
+ − 1190 data->register_right = 1;
+ − 1191 data->current_charset = Qnil;
+ − 1192 data->current_char_boundary = 1;
+ − 1193 }
+ − 1194
+ − 1195 static void
+ − 1196 iso2022_init_coding_stream (struct coding_stream *str)
+ − 1197 {
+ − 1198 if (str->direction == CODING_DECODE)
+ − 1199 reset_iso2022_decode (str->codesys,
+ − 1200 CODING_STREAM_TYPE_DATA (str, iso2022));
+ − 1201 else
+ − 1202 reset_iso2022_encode (str->codesys,
+ − 1203 CODING_STREAM_TYPE_DATA (str, iso2022));
+ − 1204 }
+ − 1205
/* Rewinding an ISO2022 stream is the same as initializing it afresh. */
static void
iso2022_rewind_coding_stream (struct coding_stream *str)
{
  iso2022_init_coding_stream (str);
}
+ − 1211
+ − 1212 static int
+ − 1213 fit_to_be_escape_quoted (unsigned char c)
+ − 1214 {
+ − 1215 switch (c)
+ − 1216 {
+ − 1217 case ISO_CODE_ESC:
+ − 1218 case ISO_CODE_CSI:
+ − 1219 case ISO_CODE_SS2:
+ − 1220 case ISO_CODE_SS3:
+ − 1221 case ISO_CODE_SO:
+ − 1222 case ISO_CODE_SI:
+ − 1223 return 1;
+ − 1224
+ − 1225 default:
+ − 1226 return 0;
+ − 1227 }
+ − 1228 }
+ − 1229
+ − 1230 static Lisp_Object
867
+ − 1231 charset_by_attributes_or_create_one (int type, Ibyte final, int dir)
771
+ − 1232 {
826
+ − 1233 Lisp_Object charset = charset_by_attributes (type, final, dir);
771
+ − 1234
+ − 1235 if (NILP (charset))
+ − 1236 {
+ − 1237 int chars, dim;
+ − 1238
+ − 1239 switch (type)
+ − 1240 {
+ − 1241 case CHARSET_TYPE_94:
+ − 1242 chars = 94; dim = 1;
+ − 1243 break;
+ − 1244 case CHARSET_TYPE_96:
+ − 1245 chars = 96; dim = 1;
+ − 1246 break;
+ − 1247 case CHARSET_TYPE_94X94:
+ − 1248 chars = 94; dim = 2;
+ − 1249 break;
+ − 1250 case CHARSET_TYPE_96X96:
+ − 1251 chars = 96; dim = 2;
+ − 1252 break;
+ − 1253 default:
+ − 1254 abort (); chars = 0; dim = 0;
+ − 1255 }
+ − 1256
+ − 1257 charset = Fmake_charset (Qunbound, Qnil,
+ − 1258 nconc2 (list6 (Qfinal, make_char (final),
+ − 1259 Qchars, make_int (chars),
+ − 1260 Qdimension, make_int (dim)),
+ − 1261 list2 (Qdirection,
+ − 1262 dir == CHARSET_LEFT_TO_RIGHT ?
+ − 1263 Ql2r : Qr2l)));
+ − 1264 }
+ − 1265
+ − 1266 return charset;
+ − 1267 }
+ − 1268
+ − 1269 /* Parse one byte of an ISO2022 escape sequence.
+ − 1270 If the result is an invalid escape sequence, return 0 and
+ − 1271 do not change anything in STR. Otherwise, if the result is
+ − 1272 an incomplete escape sequence, update ISO2022.ESC and
+ − 1273 ISO2022.ESC_BYTES and return -1. Otherwise, update
+ − 1274 all the state variables (but not ISO2022.ESC_BYTES) and
+ − 1275 return 1.
+ − 1276
+ − 1277 If CHECK_INVALID_CHARSETS is non-zero, check for designation
+ − 1278 or invocation of an invalid character set and treat that as
+ − 1279 an unrecognized escape sequence.
+ − 1280
+ − 1281 ********************************************************************
+ − 1282
+ − 1283 #### Strategies for error annotation and coding orthogonalization
+ − 1284
+ − 1285 We really want to separate out a number of things. Conceptually,
+ − 1286 there is a nested syntax.
+ − 1287
+ − 1288 At the top level is the ISO 2022 extension syntax, including charset
+ − 1289 designation and invocation, and certain auxiliary controls such as the
+ − 1290 ISO 6429 direction specification. These are octet-oriented, with the
+ − 1291 single exception (AFAIK) of the "exit Unicode" sequence which uses the
+ − 1292 UTF's natural width (1 byte for UTF-7 and UTF-8, 2 bytes for UCS-2 and
+ − 1293 UTF-16, and 4 bytes for UCS-4 and UTF-32). This will be treated as a
+ − 1294 (deprecated) special case in Unicode processing.
+ − 1295
+ − 1296 The middle layer is ISO 2022 character interpretation. This will depend
+ − 1297 on the current state of the ISO 2022 registers, and assembles octets
+ − 1298 into the character's internal representation.
+ − 1299
+ − 1300 The lowest level is translating system control conventions. At present
+ − 1301 this is restricted to newline translation, but one could imagine doing
+ − 1302 tab conversion or line wrapping here. "Escape from Unicode" processing
+ − 1303 would be done at this level.
+ − 1304
+ − 1305 At each level the parser will verify the syntax. In the case of a
+ − 1306 syntax error or warning (such as a redundant escape sequence that affects
+ − 1307 no characters), the parser will take some action, typically inserting the
+ − 1308 erroneous octets directly into the output and creating an annotation
+ − 1309 which can be used by higher level I/O to mark the affected region.
+ − 1310
+ − 1311 This should make it possible to do something sensible about separating
+ − 1312 newline convention processing from character construction, and about
+ − 1313 preventing ISO 2022 escape sequences from being recognized
+ − 1314 inappropriately.
+ − 1315
+ − 1316 The basic strategy will be to have octet classification tables, and
+ − 1317 switch processing according to the table entry.
+ − 1318
+ − 1319 It's possible that, by doing the processing with tables of functions or
+ − 1320 the like, the parser can be used for both detection and translation. */
+ − 1321
+ − 1322 static int
+ − 1323 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_coding_stream *iso,
+ − 1324 unsigned char c, unsigned int *flags,
+ − 1325 int check_invalid_charsets)
+ − 1326 {
+ − 1327 /* (1) If we're at the end of a designation sequence, CS is the
+ − 1328 charset being designated and REG is the register to designate
+ − 1329 it to.
+ − 1330
+ − 1331 (2) If we're at the end of a locking-shift sequence, REG is
+ − 1332 the register to invoke and HALF (0 == left, 1 == right) is
+ − 1333 the half to invoke it into.
+ − 1334
+ − 1335 (3) If we're at the end of a single-shift sequence, REG is
+ − 1336 the register to invoke. */
+ − 1337 Lisp_Object cs = Qnil;
+ − 1338 int reg, half;
+ − 1339
+ − 1340 /* NOTE: This code does goto's all over the fucking place.
+ − 1341 The reason for this is that we're basically implementing
+ − 1342 a state machine here, and hierarchical languages like C
+ − 1343 don't really provide a clean way of doing this. */
+ − 1344
+ − 1345 if (! (*flags & ISO_STATE_ESCAPE))
+ − 1346 /* At beginning of escape sequence; we need to reset our
+ − 1347 escape-state variables. */
+ − 1348 iso->esc = ISO_ESC_NOTHING;
+ − 1349
+ − 1350 iso->output_literally = 0;
+ − 1351 iso->output_direction_sequence = 0;
+ − 1352
+ − 1353 switch (iso->esc)
+ − 1354 {
+ − 1355 case ISO_ESC_NOTHING:
+ − 1356 iso->esc_bytes_index = 0;
+ − 1357 switch (c)
+ − 1358 {
+ − 1359 case ISO_CODE_ESC: /* Start escape sequence */
+ − 1360 *flags |= ISO_STATE_ESCAPE;
+ − 1361 iso->esc = ISO_ESC;
+ − 1362 goto not_done;
+ − 1363
+ − 1364 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
+ − 1365 *flags |= ISO_STATE_ESCAPE;
+ − 1366 iso->esc = ISO_ESC_5_11;
+ − 1367 goto not_done;
+ − 1368
+ − 1369 case ISO_CODE_SO: /* locking shift 1 */
+ − 1370 reg = 1; half = 0;
+ − 1371 goto locking_shift;
+ − 1372 case ISO_CODE_SI: /* locking shift 0 */
+ − 1373 reg = 0; half = 0;
+ − 1374 goto locking_shift;
+ − 1375
+ − 1376 case ISO_CODE_SS2: /* single shift */
+ − 1377 reg = 2;
+ − 1378 goto single_shift;
+ − 1379 case ISO_CODE_SS3: /* single shift */
+ − 1380 reg = 3;
+ − 1381 goto single_shift;
+ − 1382
+ − 1383 default: /* Other control characters */
+ − 1384 error:
+ − 1385 *flags &= ISO_STATE_LOCK;
+ − 1386 return 0;
+ − 1387 }
+ − 1388
+ − 1389 case ISO_ESC:
+ − 1390 switch (c)
+ − 1391 {
+ − 1392 /**** single shift ****/
+ − 1393
+ − 1394 case 'N': /* single shift 2 */
+ − 1395 reg = 2;
+ − 1396 goto single_shift;
+ − 1397 case 'O': /* single shift 3 */
+ − 1398 reg = 3;
+ − 1399 goto single_shift;
+ − 1400
+ − 1401 /**** locking shift ****/
+ − 1402
+ − 1403 case '~': /* locking shift 1 right */
+ − 1404 reg = 1; half = 1;
+ − 1405 goto locking_shift;
+ − 1406 case 'n': /* locking shift 2 */
+ − 1407 reg = 2; half = 0;
+ − 1408 goto locking_shift;
+ − 1409 case '}': /* locking shift 2 right */
+ − 1410 reg = 2; half = 1;
+ − 1411 goto locking_shift;
+ − 1412 case 'o': /* locking shift 3 */
+ − 1413 reg = 3; half = 0;
+ − 1414 goto locking_shift;
+ − 1415 case '|': /* locking shift 3 right */
+ − 1416 reg = 3; half = 1;
+ − 1417 goto locking_shift;
+ − 1418
+ − 1419 /**** composite ****/
+ − 1420
+ − 1421 #ifdef ENABLE_COMPOSITE_CHARS
+ − 1422 case '0':
+ − 1423 iso->esc = ISO_ESC_START_COMPOSITE;
+ − 1424 *flags = (*flags & ISO_STATE_LOCK) |
+ − 1425 ISO_STATE_COMPOSITE;
+ − 1426 return 1;
+ − 1427
+ − 1428 case '1':
+ − 1429 iso->esc = ISO_ESC_END_COMPOSITE;
+ − 1430 *flags = (*flags & ISO_STATE_LOCK) &
+ − 1431 ~ISO_STATE_COMPOSITE;
+ − 1432 return 1;
+ − 1433 #else
+ − 1434 case '0': case '1': case '2': case '3': case '4':
+ − 1435 /* We simply return a flag indicating that some composite
+ − 1436 escape was seen. The caller will use the particular
+ − 1437 character to encode the appropriate "composite hack"
+ − 1438 character out of Vcharset_composite, so that we will
+ − 1439 preserve these values on output. */
+ − 1440 iso->esc = ISO_ESC_START_COMPOSITE;
+ − 1441 *flags &= ISO_STATE_LOCK;
+ − 1442 return 1;
+ − 1443 #endif /* ENABLE_COMPOSITE_CHARS */
+ − 1444
+ − 1445 /**** directionality ****/
+ − 1446
+ − 1447 case '[':
+ − 1448 iso->esc = ISO_ESC_5_11;
+ − 1449 goto not_done;
+ − 1450
+ − 1451 /**** designation ****/
+ − 1452
+ − 1453 case '$': /* multibyte charset prefix */
+ − 1454 iso->esc = ISO_ESC_2_4;
+ − 1455 goto not_done;
+ − 1456
+ − 1457 default:
+ − 1458 if (0x28 <= c && c <= 0x2F)
+ − 1459 {
+ − 1460 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
+ − 1461 goto not_done;
+ − 1462 }
+ − 1463
+ − 1464 /* This function is called with CODESYS equal to nil when
+ − 1465 doing coding-system detection. */
+ − 1466 if (!NILP (codesys)
+ − 1467 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
+ − 1468 && fit_to_be_escape_quoted (c))
+ − 1469 {
+ − 1470 iso->esc = ISO_ESC_LITERAL;
+ − 1471 *flags &= ISO_STATE_LOCK;
+ − 1472 return 1;
+ − 1473 }
+ − 1474
+ − 1475 /* bzzzt! */
+ − 1476 goto error;
+ − 1477 }
+ − 1478
+ − 1479
+ − 1480
+ − 1481 /**** directionality ****/
+ − 1482
+ − 1483 case ISO_ESC_5_11: /* ISO6429 direction control */
+ − 1484 if (c == ']')
+ − 1485 {
+ − 1486 *flags &= (ISO_STATE_LOCK & ~ISO_STATE_R2L);
+ − 1487 goto directionality;
+ − 1488 }
+ − 1489 if (c == '0') iso->esc = ISO_ESC_5_11_0;
+ − 1490 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
+ − 1491 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
+ − 1492 else goto error;
+ − 1493 goto not_done;
+ − 1494
+ − 1495 case ISO_ESC_5_11_0:
+ − 1496 if (c == ']')
+ − 1497 {
+ − 1498 *flags &= (ISO_STATE_LOCK & ~ISO_STATE_R2L);
+ − 1499 goto directionality;
+ − 1500 }
+ − 1501 goto error;
+ − 1502
+ − 1503 case ISO_ESC_5_11_1:
+ − 1504 if (c == ']')
+ − 1505 {
+ − 1506 *flags = (ISO_STATE_LOCK & ~ISO_STATE_R2L);
+ − 1507 goto directionality;
+ − 1508 }
+ − 1509 goto error;
+ − 1510
+ − 1511 case ISO_ESC_5_11_2:
+ − 1512 if (c == ']')
+ − 1513 {
+ − 1514 *flags = (*flags & ISO_STATE_LOCK) | ISO_STATE_R2L;
+ − 1515 goto directionality;
+ − 1516 }
+ − 1517 goto error;
+ − 1518
+ − 1519 directionality:
+ − 1520 iso->esc = ISO_ESC_DIRECTIONALITY;
+ − 1521 /* Various junk here to attempt to preserve the direction sequences
+ − 1522 literally in the text if they would otherwise be swallowed due
+ − 1523 to invalid designations that don't show up as actual charset
+ − 1524 changes in the text. */
+ − 1525 if (iso->invalid_switch_dir)
+ − 1526 {
+ − 1527 /* We already inserted a direction switch literally into the
+ − 1528 text. We assume (#### this may not be right) that the
+ − 1529 next direction switch is the one going the other way,
+ − 1530 and we need to output that literally as well. */
+ − 1531 iso->output_literally = 1;
+ − 1532 iso->invalid_switch_dir = 0;
+ − 1533 }
+ − 1534 else
+ − 1535 {
+ − 1536 int jj;
+ − 1537
+ − 1538 /* If we are in the thrall of an invalid designation,
+ − 1539 then stick the directionality sequence literally into the
+ − 1540 output stream so it ends up in the original text again. */
+ − 1541 for (jj = 0; jj < 4; jj++)
+ − 1542 if (iso->invalid_designated[jj])
+ − 1543 break;
+ − 1544 if (jj < 4)
+ − 1545 {
+ − 1546 iso->output_literally = 1;
+ − 1547 iso->invalid_switch_dir = 1;
+ − 1548 }
+ − 1549 else
+ − 1550 /* Indicate that we haven't yet seen a valid designation,
+ − 1551 so that if a switch-dir is directly followed by an
+ − 1552 invalid designation, both get inserted literally. */
+ − 1553 iso->switched_dir_and_no_valid_charset_yet = 1;
+ − 1554 }
+ − 1555 return 1;
+ − 1556
+ − 1557
+ − 1558 /**** designation ****/
+ − 1559
+ − 1560 case ISO_ESC_2_4:
+ − 1561 if (0x28 <= c && c <= 0x2F)
+ − 1562 {
+ − 1563 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8);
+ − 1564 goto not_done;
+ − 1565 }
+ − 1566 if (0x40 <= c && c <= 0x42)
+ − 1567 {
+ − 1568 cs = charset_by_attributes_or_create_one (CHARSET_TYPE_94X94, c,
+ − 1569 *flags & ISO_STATE_R2L ?
+ − 1570 CHARSET_RIGHT_TO_LEFT :
+ − 1571 CHARSET_LEFT_TO_RIGHT);
+ − 1572 reg = 0;
+ − 1573 goto designated;
+ − 1574 }
+ − 1575 goto error;
+ − 1576
+ − 1577 default:
+ − 1578 {
+ − 1579 int type = -1;
+ − 1580
+ − 1581 if (iso->esc >= ISO_ESC_2_8 &&
+ − 1582 iso->esc <= ISO_ESC_2_15)
+ − 1583 {
+ − 1584 type = ((iso->esc >= ISO_ESC_2_12) ?
+ − 1585 CHARSET_TYPE_96 : CHARSET_TYPE_94);
+ − 1586 reg = (iso->esc - ISO_ESC_2_8) & 3;
+ − 1587 }
+ − 1588 else if (iso->esc >= ISO_ESC_2_4_8 &&
+ − 1589 iso->esc <= ISO_ESC_2_4_15)
+ − 1590 {
+ − 1591 type = ((iso->esc >= ISO_ESC_2_4_12) ?
+ − 1592 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94);
+ − 1593 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
+ − 1594 }
+ − 1595 else
+ − 1596 {
+ − 1597 /* Can this ever be reached? -slb */
+ − 1598 abort ();
+ − 1599 goto error;
+ − 1600 }
+ − 1601
+ − 1602 if (c < '0' || c > '~' ||
+ − 1603 (c > 0x5F && (type == CHARSET_TYPE_94X94 ||
+ − 1604 type == CHARSET_TYPE_96X96)))
+ − 1605 goto error; /* bad final byte */
+ − 1606
+ − 1607 cs = charset_by_attributes_or_create_one (type, c,
+ − 1608 *flags & ISO_STATE_R2L ?
+ − 1609 CHARSET_RIGHT_TO_LEFT :
+ − 1610 CHARSET_LEFT_TO_RIGHT);
+ − 1611 goto designated;
+ − 1612 }
+ − 1613 }
+ − 1614
+ − 1615 not_done:
+ − 1616 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
+ − 1617 return -1;
+ − 1618
+ − 1619 single_shift:
+ − 1620 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
+ − 1621 /* can't invoke something that ain't there. */
+ − 1622 goto error;
+ − 1623 iso->esc = ISO_ESC_SINGLE_SHIFT;
+ − 1624 *flags &= ISO_STATE_LOCK;
+ − 1625 if (reg == 2)
+ − 1626 *flags |= ISO_STATE_SS2;
+ − 1627 else
+ − 1628 *flags |= ISO_STATE_SS3;
+ − 1629 return 1;
+ − 1630
+ − 1631 locking_shift:
+ − 1632 if (check_invalid_charsets &&
+ − 1633 !CHARSETP (iso->charset[reg]))
+ − 1634 /* can't invoke something that ain't there. */
+ − 1635 goto error;
+ − 1636 if (half)
+ − 1637 iso->register_right = reg;
+ − 1638 else
+ − 1639 iso->register_left = reg;
+ − 1640 *flags &= ISO_STATE_LOCK;
+ − 1641 iso->esc = ISO_ESC_LOCKING_SHIFT;
+ − 1642 return 1;
+ − 1643
+ − 1644 designated:
+ − 1645 if (NILP (cs) && check_invalid_charsets)
+ − 1646 {
+ − 1647 abort ();
+ − 1648 /* #### This should never happen now that we automatically create
+ − 1649 temporary charsets as necessary. We should probably remove
+ − 1650 this code. --ben */
+ − 1651 iso->invalid_designated[reg] = 1;
+ − 1652 iso->charset[reg] = Vcharset_ascii;
+ − 1653 iso->esc = ISO_ESC_DESIGNATE;
+ − 1654 *flags &= ISO_STATE_LOCK;
+ − 1655 iso->output_literally = 1;
+ − 1656 if (iso->switched_dir_and_no_valid_charset_yet)
+ − 1657 {
+ − 1658 /* We encountered a switch-direction followed by an
+ − 1659 invalid designation. Ensure that the switch-direction
+ − 1660 gets outputted; otherwise it will probably get eaten
+ − 1661 when the text is written out again. */
+ − 1662 iso->switched_dir_and_no_valid_charset_yet = 0;
+ − 1663 iso->output_direction_sequence = 1;
+ − 1664 /* And make sure that the switch-dir going the other
+ − 1665 way gets outputted, as well. */
+ − 1666 iso->invalid_switch_dir = 1;
+ − 1667 }
+ − 1668 return 1;
+ − 1669 }
+ − 1670 /* This function is called with CODESYS equal to nil when
+ − 1671 doing coding-system detection. */
+ − 1672 if (!NILP (codesys))
+ − 1673 {
+ − 1674 charset_conversion_spec_dynarr *dyn =
+ − 1675 XCODING_SYSTEM_ISO2022_INPUT_CONV (codesys);
+ − 1676
+ − 1677 if (dyn)
+ − 1678 {
+ − 1679 int i;
+ − 1680
+ − 1681 for (i = 0; i < Dynarr_length (dyn); i++)
+ − 1682 {
+ − 1683 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
+ − 1684 if (EQ (cs, spec->from_charset))
+ − 1685 cs = spec->to_charset;
+ − 1686 }
+ − 1687 }
+ − 1688 }
+ − 1689
+ − 1690 iso->charset[reg] = cs;
+ − 1691 iso->esc = ISO_ESC_DESIGNATE;
+ − 1692 *flags &= ISO_STATE_LOCK;
+ − 1693 if (iso->invalid_designated[reg])
+ − 1694 {
+ − 1695 iso->invalid_designated[reg] = 0;
+ − 1696 iso->output_literally = 1;
+ − 1697 }
+ − 1698 if (iso->switched_dir_and_no_valid_charset_yet)
+ − 1699 iso->switched_dir_and_no_valid_charset_yet = 0;
+ − 1700 return 1;
+ − 1701 }
+ − 1702
+ − 1703 /* If FLAGS is a null pointer or specifies right-to-left motion,
+ − 1704 output a switch-dir-to-left-to-right sequence to DST.
+ − 1705 Also update FLAGS if it is not a null pointer.
+ − 1706 If INTERNAL_P is set, we are outputting in internal format and
+ − 1707 need to handle the CSI differently. */
+ − 1708
+ − 1709 static void
+ − 1710 restore_left_to_right_direction (Lisp_Object codesys,
+ − 1711 unsigned_char_dynarr *dst,
+ − 1712 unsigned int *flags,
+ − 1713 int internal_p)
+ − 1714 {
+ − 1715 if (!flags || (*flags & ISO_STATE_R2L))
+ − 1716 {
+ − 1717 if (XCODING_SYSTEM_ISO2022_SEVEN (codesys))
+ − 1718 {
+ − 1719 Dynarr_add (dst, ISO_CODE_ESC);
+ − 1720 Dynarr_add (dst, '[');
+ − 1721 }
+ − 1722 else if (internal_p)
+ − 1723 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
+ − 1724 else
+ − 1725 Dynarr_add (dst, ISO_CODE_CSI);
+ − 1726 Dynarr_add (dst, '0');
+ − 1727 Dynarr_add (dst, ']');
+ − 1728 if (flags)
+ − 1729 *flags &= ~ISO_STATE_R2L;
+ − 1730 }
+ − 1731 }
+ − 1732
+ − 1733 /* If FLAGS is a null pointer or specifies a direction different from
+ − 1734 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
+ − 1735 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
+ − 1736 sequence to DST. Also update FLAGS if it is not a null pointer.
+ − 1737 If INTERNAL_P is set, we are outputting in internal format and
+ − 1738 need to handle the CSI differently. */
+ − 1739
+ − 1740 static void
+ − 1741 ensure_correct_direction (int direction, Lisp_Object codesys,
+ − 1742 unsigned_char_dynarr *dst, unsigned int *flags,
+ − 1743 int internal_p)
+ − 1744 {
+ − 1745 if ((!flags || (*flags & ISO_STATE_R2L)) &&
+ − 1746 direction == CHARSET_LEFT_TO_RIGHT)
+ − 1747 restore_left_to_right_direction (codesys, dst, flags, internal_p);
+ − 1748 else if (!XCODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
+ − 1749 && (!flags || !(*flags & ISO_STATE_R2L)) &&
+ − 1750 direction == CHARSET_RIGHT_TO_LEFT)
+ − 1751 {
+ − 1752 if (XCODING_SYSTEM_ISO2022_SEVEN (codesys))
+ − 1753 {
+ − 1754 Dynarr_add (dst, ISO_CODE_ESC);
+ − 1755 Dynarr_add (dst, '[');
+ − 1756 }
+ − 1757 else if (internal_p)
+ − 1758 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
+ − 1759 else
+ − 1760 Dynarr_add (dst, ISO_CODE_CSI);
+ − 1761 Dynarr_add (dst, '2');
+ − 1762 Dynarr_add (dst, ']');
+ − 1763 if (flags)
+ − 1764 *flags |= ISO_STATE_R2L;
+ − 1765 }
+ − 1766 }
+ − 1767
+ − 1768 /* Convert ISO2022-format data to internal format. */
+ − 1769
+ − 1770 static Bytecount
+ − 1771 iso2022_decode (struct coding_stream *str, const UExtbyte *src,
+ − 1772 unsigned_char_dynarr *dst, Bytecount n)
+ − 1773 {
+ − 1774 unsigned int ch = str->ch;
+ − 1775 #ifdef ENABLE_COMPOSITE_CHARS
+ − 1776 unsigned_char_dynarr *real_dst = dst;
+ − 1777 #endif
+ − 1778 struct iso2022_coding_stream *data =
+ − 1779 CODING_STREAM_TYPE_DATA (str, iso2022);
+ − 1780 unsigned int flags = data->flags;
+ − 1781 Bytecount orign = n;
+ − 1782
+ − 1783 #ifdef ENABLE_COMPOSITE_CHARS
+ − 1784 if (flags & ISO_STATE_COMPOSITE)
+ − 1785 dst = data->composite_chars;
+ − 1786 #endif /* ENABLE_COMPOSITE_CHARS */
+ − 1787
+ − 1788 while (n--)
+ − 1789 {
+ − 1790 UExtbyte c = *src++;
+ − 1791 if (flags & ISO_STATE_ESCAPE)
+ − 1792 { /* Within ESC sequence */
+ − 1793 int retval = parse_iso2022_esc (str->codesys, data,
+ − 1794 c, &flags, 1);
+ − 1795
+ − 1796 if (retval)
+ − 1797 {
+ − 1798 switch (data->esc)
+ − 1799 {
+ − 1800 #ifdef ENABLE_COMPOSITE_CHARS
+ − 1801 case ISO_ESC_START_COMPOSITE:
+ − 1802 if (data->composite_chars)
+ − 1803 Dynarr_reset (data->composite_chars);
+ − 1804 else
+ − 1805 data->composite_chars = Dynarr_new (unsigned_char);
+ − 1806 dst = data->composite_chars;
+ − 1807 break;
+ − 1808 case ISO_ESC_END_COMPOSITE:
+ − 1809 {
867
+ − 1810 Ibyte comstr[MAX_ICHAR_LEN];
771
+ − 1811 Bytecount len;
867
+ − 1812 Ichar emch = lookup_composite_char (Dynarr_atp (dst, 0),
771
+ − 1813 Dynarr_length (dst));
+ − 1814 dst = real_dst;
867
+ − 1815 len = set_itext_ichar (comstr, emch);
771
+ − 1816 Dynarr_add_many (dst, comstr, len);
+ − 1817 break;
+ − 1818 }
+ − 1819 #else
+ − 1820 case ISO_ESC_START_COMPOSITE:
+ − 1821 {
867
+ − 1822 Ibyte comstr[MAX_ICHAR_LEN];
771
+ − 1823 Bytecount len;
867
+ − 1824 Ichar emch = make_ichar (Vcharset_composite, c - '0' + ' ',
771
+ − 1825 0);
867
+ − 1826 len = set_itext_ichar (comstr, emch);
771
+ − 1827 Dynarr_add_many (dst, comstr, len);
+ − 1828 break;
+ − 1829 }
+ − 1830 #endif /* ENABLE_COMPOSITE_CHARS */
+ − 1831
+ − 1832 case ISO_ESC_LITERAL:
+ − 1833 DECODE_ADD_BINARY_CHAR (c, dst);
+ − 1834 break;
+ − 1835
+ − 1836 default:
+ − 1837 /* Everything else handled already */
+ − 1838 break;
+ − 1839 }
+ − 1840 }
+ − 1841
+ − 1842 /* Attempted error recovery. */
+ − 1843 if (data->output_direction_sequence)
+ − 1844 ensure_correct_direction (flags & ISO_STATE_R2L ?
+ − 1845 CHARSET_RIGHT_TO_LEFT :
+ − 1846 CHARSET_LEFT_TO_RIGHT,
+ − 1847 str->codesys, dst, 0, 1);
+ − 1848 /* More error recovery. */
+ − 1849 if (!retval || data->output_literally)
+ − 1850 {
+ − 1851 /* Output the (possibly invalid) sequence */
+ − 1852 int i;
+ − 1853 for (i = 0; i < data->esc_bytes_index; i++)
+ − 1854 DECODE_ADD_BINARY_CHAR (data->esc_bytes[i], dst);
+ − 1855 flags &= ISO_STATE_LOCK;
+ − 1856 if (!retval)
+ − 1857 n++, src--;/* Repeat the loop with the same character. */
+ − 1858 else
+ − 1859 {
+ − 1860 /* No sense in reprocessing the final byte of the
+ − 1861 escape sequence; it could mess things up anyway.
+ − 1862 Just add it now. */
+ − 1863 DECODE_ADD_BINARY_CHAR (c, dst);
+ − 1864 }
+ − 1865 }
+ − 1866 ch = 0;
+ − 1867 }
826
+ − 1868 else if (byte_c0_p (c) || byte_c1_p (c))
771
+ − 1869 { /* Control characters */
+ − 1870
+ − 1871 /***** Error-handling *****/
+ − 1872
+ − 1873 /* If we were in the middle of a character, dump out the
+ − 1874 partial character. */
+ − 1875 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
+ − 1876
+ − 1877 /* If we just saw a single-shift character, dump it out.
+ − 1878 This may dump out the wrong sort of single-shift character,
+ − 1879 but least it will give an indication that something went
+ − 1880 wrong. */
+ − 1881 if (flags & ISO_STATE_SS2)
+ − 1882 {
+ − 1883 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
+ − 1884 flags &= ~ISO_STATE_SS2;
+ − 1885 }
+ − 1886 if (flags & ISO_STATE_SS3)
+ − 1887 {
+ − 1888 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
+ − 1889 flags &= ~ISO_STATE_SS3;
+ − 1890 }
+ − 1891
+ − 1892 /***** Now handle the control characters. *****/
+ − 1893
+ − 1894 flags &= ISO_STATE_LOCK;
+ − 1895
+ − 1896 if (!parse_iso2022_esc (str->codesys, data, c, &flags, 1))
+ − 1897 DECODE_ADD_BINARY_CHAR (c, dst);
+ − 1898 }
+ − 1899 else
+ − 1900 { /* Graphic characters */
+ − 1901 Lisp_Object charset;
+ − 1902 int lb;
+ − 1903 int reg;
+ − 1904
+ − 1905 /* Now determine the charset. */
+ − 1906 reg = ((flags & ISO_STATE_SS2) ? 2
+ − 1907 : (flags & ISO_STATE_SS3) ? 3
826
+ − 1908 : !byte_ascii_p (c) ? data->register_right
771
+ − 1909 : data->register_left);
+ − 1910 charset = data->charset[reg];
+ − 1911
+ − 1912 /* Error checking: */
+ − 1913 if (! CHARSETP (charset)
+ − 1914 || data->invalid_designated[reg]
+ − 1915 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
+ − 1916 && XCHARSET_CHARS (charset) == 94))
+ − 1917 /* Mrmph. We are trying to invoke a register that has no
+ − 1918 or an invalid charset in it, or trying to add a character
+ − 1919 outside the range of the charset. Insert that char literally
+ − 1920 to preserve it for the output. */
+ − 1921 {
+ − 1922 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
+ − 1923 DECODE_ADD_BINARY_CHAR (c, dst);
+ − 1924 }
+ − 1925
+ − 1926 else
+ − 1927 {
+ − 1928 /* Things are probably hunky-dorey. */
+ − 1929
+ − 1930 /* Fetch reverse charset, maybe. */
+ − 1931 if (((flags & ISO_STATE_R2L) &&
+ − 1932 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
+ − 1933 ||
+ − 1934 (!(flags & ISO_STATE_R2L) &&
+ − 1935 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
+ − 1936 {
+ − 1937 Lisp_Object new_charset =
+ − 1938 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
+ − 1939 if (!NILP (new_charset))
+ − 1940 charset = new_charset;
+ − 1941 }
+ − 1942
+ − 1943 lb = XCHARSET_LEADING_BYTE (charset);
+ − 1944 switch (XCHARSET_REP_BYTES (charset))
+ − 1945 {
+ − 1946 case 1: /* ASCII */
+ − 1947 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
+ − 1948 Dynarr_add (dst, c & 0x7F);
+ − 1949 break;
+ − 1950
+ − 1951 case 2: /* one-byte official */
+ − 1952 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
+ − 1953 Dynarr_add (dst, lb);
+ − 1954 Dynarr_add (dst, c | 0x80);
+ − 1955 break;
+ − 1956
+ − 1957 case 3: /* one-byte private or two-byte official */
+ − 1958 if (XCHARSET_PRIVATE_P (charset))
+ − 1959 {
+ − 1960 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
+ − 1961 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
+ − 1962 Dynarr_add (dst, lb);
+ − 1963 Dynarr_add (dst, c | 0x80);
+ − 1964 }
+ − 1965 else
+ − 1966 {
+ − 1967 if (ch)
+ − 1968 {
+ − 1969 Dynarr_add (dst, lb);
+ − 1970 Dynarr_add (dst, ch | 0x80);
+ − 1971 Dynarr_add (dst, c | 0x80);
+ − 1972 ch = 0;
+ − 1973 }
+ − 1974 else
+ − 1975 ch = c;
+ − 1976 }
+ − 1977 break;
+ − 1978
+ − 1979 default: /* two-byte private */
+ − 1980 if (ch)
+ − 1981 {
+ − 1982 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
+ − 1983 Dynarr_add (dst, lb);
+ − 1984 Dynarr_add (dst, ch | 0x80);
+ − 1985 Dynarr_add (dst, c | 0x80);
+ − 1986 ch = 0;
+ − 1987 }
+ − 1988 else
+ − 1989 ch = c;
+ − 1990 }
+ − 1991 }
+ − 1992
+ − 1993 if (!ch)
+ − 1994 flags &= ISO_STATE_LOCK;
+ − 1995 }
+ − 1996
+ − 1997 }
+ − 1998
+ − 1999 if (str->eof)
+ − 2000 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
+ − 2001
+ − 2002 data->flags = flags;
+ − 2003 str->ch = ch;
+ − 2004 return orign;
+ − 2005 }
+ − 2006
+ − 2007
+ − 2008 /***** ISO2022 encoder *****/
+ − 2009
+ − 2010 /* Designate CHARSET into register REG. */
+ − 2011
+ − 2012 static void
+ − 2013 iso2022_designate (Lisp_Object charset, int reg,
+ − 2014 struct coding_stream *str, unsigned_char_dynarr *dst)
+ − 2015 {
+ − 2016 static const char inter94[] = "()*+";
+ − 2017 static const char inter96[] = ",-./";
+ − 2018 int type;
+ − 2019 unsigned char final;
+ − 2020 struct iso2022_coding_stream *data =
+ − 2021 CODING_STREAM_TYPE_DATA (str, iso2022);
+ − 2022 Lisp_Object old_charset = data->charset[reg];
+ − 2023
+ − 2024 data->charset[reg] = charset;
+ − 2025 if (!CHARSETP (charset))
+ − 2026 /* charset might be an initial nil or t. */
+ − 2027 return;
+ − 2028 type = XCHARSET_TYPE (charset);
+ − 2029 final = XCHARSET_FINAL (charset);
+ − 2030 if (!data->force_charset_on_output[reg] &&
+ − 2031 CHARSETP (old_charset) &&
+ − 2032 XCHARSET_TYPE (old_charset) == type &&
+ − 2033 XCHARSET_FINAL (old_charset) == final)
+ − 2034 return;
+ − 2035
+ − 2036 data->force_charset_on_output[reg] = 0;
+ − 2037
+ − 2038 {
+ − 2039 charset_conversion_spec_dynarr *dyn =
+ − 2040 XCODING_SYSTEM_ISO2022_OUTPUT_CONV (str->codesys);
+ − 2041
+ − 2042 if (dyn)
+ − 2043 {
+ − 2044 int i;
+ − 2045
+ − 2046 for (i = 0; i < Dynarr_length (dyn); i++)
+ − 2047 {
+ − 2048 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
+ − 2049 if (EQ (charset, spec->from_charset))
+ − 2050 charset = spec->to_charset;
+ − 2051 }
+ − 2052 }
+ − 2053 }
+ − 2054
+ − 2055 Dynarr_add (dst, ISO_CODE_ESC);
+ − 2056 switch (type)
+ − 2057 {
+ − 2058 case CHARSET_TYPE_94:
+ − 2059 Dynarr_add (dst, inter94[reg]);
+ − 2060 break;
+ − 2061 case CHARSET_TYPE_96:
+ − 2062 Dynarr_add (dst, inter96[reg]);
+ − 2063 break;
+ − 2064 case CHARSET_TYPE_94X94:
+ − 2065 Dynarr_add (dst, '$');
+ − 2066 if (reg != 0
+ − 2067 || !(XCODING_SYSTEM_ISO2022_SHORT (str->codesys))
+ − 2068 || final < '@'
+ − 2069 || final > 'B')
+ − 2070 Dynarr_add (dst, inter94[reg]);
+ − 2071 break;
+ − 2072 case CHARSET_TYPE_96X96:
+ − 2073 Dynarr_add (dst, '$');
+ − 2074 Dynarr_add (dst, inter96[reg]);
+ − 2075 break;
+ − 2076 }
+ − 2077 Dynarr_add (dst, final);
+ − 2078 }
+ − 2079
+ − 2080 static void
+ − 2081 ensure_normal_shift (struct coding_stream *str, unsigned_char_dynarr *dst)
+ − 2082 {
+ − 2083 struct iso2022_coding_stream *data =
+ − 2084 CODING_STREAM_TYPE_DATA (str, iso2022);
+ − 2085
+ − 2086 if (data->register_left != 0)
+ − 2087 {
+ − 2088 Dynarr_add (dst, ISO_CODE_SI);
+ − 2089 data->register_left = 0;
+ − 2090 }
+ − 2091 }
+ − 2092
+ − 2093 static void
+ − 2094 ensure_shift_out (struct coding_stream *str, unsigned_char_dynarr *dst)
+ − 2095 {
+ − 2096 struct iso2022_coding_stream *data =
+ − 2097 CODING_STREAM_TYPE_DATA (str, iso2022);
+ − 2098
+ − 2099 if (data->register_left != 1)
+ − 2100 {
+ − 2101 Dynarr_add (dst, ISO_CODE_SO);
+ − 2102 data->register_left = 1;
+ − 2103 }
+ − 2104 }
+ − 2105
+ − 2106 /* Convert internally-formatted data to ISO2022 format. */
+ − 2107
+ − 2108 static Bytecount
867
+ − 2109 iso2022_encode (struct coding_stream *str, const Ibyte *src,
771
+ − 2110 unsigned_char_dynarr *dst, Bytecount n)
+ − 2111 {
+ − 2112 unsigned char charmask;
867
+ − 2113 Ibyte c;
771
+ − 2114 unsigned char char_boundary;
+ − 2115 unsigned int ch = str->ch;
+ − 2116 Lisp_Object codesys = str->codesys;
+ − 2117 int i;
+ − 2118 Lisp_Object charset;
+ − 2119 int half;
+ − 2120 struct iso2022_coding_stream *data =
+ − 2121 CODING_STREAM_TYPE_DATA (str, iso2022);
+ − 2122 unsigned int flags = data->flags;
+ − 2123 Bytecount orign = n;
+ − 2124
+ − 2125 #ifdef ENABLE_COMPOSITE_CHARS
+ − 2126 /* flags for handling composite chars. We do a little switcheroo
+ − 2127 on the source while we're outputting the composite char. */
+ − 2128 Bytecount saved_n = 0;
867
+ − 2129 const Ibyte *saved_src = NULL;
771
+ − 2130 int in_composite = 0;
+ − 2131 #endif /* ENABLE_COMPOSITE_CHARS */
+ − 2132
+ − 2133 char_boundary = data->current_char_boundary;
+ − 2134 charset = data->current_charset;
+ − 2135 half = data->current_half;
+ − 2136
+ − 2137 #ifdef ENABLE_COMPOSITE_CHARS
+ − 2138 back_to_square_n:
+ − 2139 #endif
+ − 2140 while (n--)
+ − 2141 {
+ − 2142 c = *src++;
+ − 2143
826
+ − 2144 if (byte_ascii_p (c))
771
+ − 2145 { /* Processing ASCII character */
+ − 2146 ch = 0;
+ − 2147
+ − 2148 restore_left_to_right_direction (codesys, dst, &flags, 0);
+ − 2149
+ − 2150 /* Make sure G0 contains ASCII */
+ − 2151 if ((c > ' ' && c < ISO_CODE_DEL) ||
+ − 2152 !XCODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
+ − 2153 {
+ − 2154 ensure_normal_shift (str, dst);
+ − 2155 iso2022_designate (Vcharset_ascii, 0, str, dst);
+ − 2156 }
+ − 2157
+ − 2158 /* If necessary, restore everything to the default state
+ − 2159 at end-of-line */
+ − 2160 if (!(XCODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
+ − 2161 {
+ − 2162 /* NOTE: CRLF encoding happens *BEFORE* other encoding.
+ − 2163 Thus, even though we're working with internal-format
+ − 2164 data, there may be CR's or CRLF sequences representing
+ − 2165 newlines. */
+ − 2166 if (c == '\r' || (c == '\n' && !(flags & ISO_STATE_CR)))
+ − 2167 {
+ − 2168 restore_left_to_right_direction (codesys, dst, &flags, 0);
+ − 2169
+ − 2170 ensure_normal_shift (str, dst);
+ − 2171
+ − 2172 for (i = 0; i < 4; i++)
+ − 2173 {
+ − 2174 Lisp_Object initial_charset =
+ − 2175 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
+ − 2176 iso2022_designate (initial_charset, i, str, dst);
+ − 2177 }
+ − 2178 }
+ − 2179 if (c == '\r')
+ − 2180 flags |= ISO_STATE_CR;
+ − 2181 else
+ − 2182 flags &= ~ISO_STATE_CR;
+ − 2183 }
+ − 2184
+ − 2185 if (XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
+ − 2186 && fit_to_be_escape_quoted (c))
+ − 2187 Dynarr_add (dst, ISO_CODE_ESC);
+ − 2188 Dynarr_add (dst, c);
+ − 2189 char_boundary = 1;
+ − 2190 }
+ − 2191
867
+ − 2192 else if (ibyte_leading_byte_p (c) || ibyte_leading_byte_p (ch))
771
+ − 2193 { /* Processing Leading Byte */
+ − 2194 ch = 0;
826
+ − 2195 charset = charset_by_leading_byte (c);
+ − 2196 if (leading_byte_prefix_p (c))
771
+ − 2197 ch = c;
+ − 2198 else if (!EQ (charset, Vcharset_control_1)
+ − 2199 && !EQ (charset, Vcharset_composite))
+ − 2200 {
+ − 2201 int reg;
+ − 2202
+ − 2203 ensure_correct_direction (XCHARSET_DIRECTION (charset),
+ − 2204 codesys, dst, &flags, 0);
+ − 2205
+ − 2206 /* Now determine which register to use. */
+ − 2207 reg = -1;
+ − 2208 for (i = 0; i < 4; i++)
+ − 2209 {
+ − 2210 if (EQ (charset, data->charset[i]) ||
+ − 2211 EQ (charset,
+ − 2212 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
+ − 2213 {
+ − 2214 reg = i;
+ − 2215 break;
+ − 2216 }
+ − 2217 }
+ − 2218
+ − 2219 if (reg == -1)
+ − 2220 {
+ − 2221 if (XCHARSET_GRAPHIC (charset) != 0)
+ − 2222 {
+ − 2223 if (!NILP (data->charset[1]) &&
+ − 2224 (!XCODING_SYSTEM_ISO2022_SEVEN (codesys) ||
+ − 2225 XCODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
+ − 2226 reg = 1;
+ − 2227 else if (!NILP (data->charset[2]))
+ − 2228 reg = 2;
+ − 2229 else if (!NILP (data->charset[3]))
+ − 2230 reg = 3;
+ − 2231 else
+ − 2232 reg = 0;
+ − 2233 }
+ − 2234 else
+ − 2235 reg = 0;
+ − 2236 }
+ − 2237
+ − 2238 iso2022_designate (charset, reg, str, dst);
+ − 2239
+ − 2240 /* Now invoke that register. */
+ − 2241 switch (reg)
+ − 2242 {
+ − 2243 case 0:
+ − 2244 ensure_normal_shift (str, dst);
+ − 2245 half = 0;
+ − 2246 break;
+ − 2247
+ − 2248 case 1:
+ − 2249 if (XCODING_SYSTEM_ISO2022_SEVEN (codesys))
+ − 2250 {
+ − 2251 ensure_shift_out (str, dst);
+ − 2252 half = 0;
+ − 2253 }
+ − 2254 else
+ − 2255 half = 1;
+ − 2256 break;
+ − 2257
+ − 2258 case 2:
+ − 2259 if (XCODING_SYSTEM_ISO2022_SEVEN (str->codesys))
+ − 2260 {
+ − 2261 Dynarr_add (dst, ISO_CODE_ESC);
+ − 2262 Dynarr_add (dst, 'N');
+ − 2263 half = 0;
+ − 2264 }
+ − 2265 else
+ − 2266 {
+ − 2267 Dynarr_add (dst, ISO_CODE_SS2);
+ − 2268 half = 1;
+ − 2269 }
+ − 2270 break;
+ − 2271
+ − 2272 case 3:
+ − 2273 if (XCODING_SYSTEM_ISO2022_SEVEN (str->codesys))
+ − 2274 {
+ − 2275 Dynarr_add (dst, ISO_CODE_ESC);
+ − 2276 Dynarr_add (dst, 'O');
+ − 2277 half = 0;
+ − 2278 }
+ − 2279 else
+ − 2280 {
+ − 2281 Dynarr_add (dst, ISO_CODE_SS3);
+ − 2282 half = 1;
+ − 2283 }
+ − 2284 break;
+ − 2285
+ − 2286 default:
+ − 2287 abort ();
+ − 2288 }
+ − 2289 }
+ − 2290 char_boundary = 0;
+ − 2291 }
+ − 2292 else
+ − 2293 { /* Processing Non-ASCII character */
+ − 2294 charmask = (half == 0 ? 0x7F : 0xFF);
+ − 2295 char_boundary = 1;
+ − 2296 if (EQ (charset, Vcharset_control_1))
+ − 2297 {
+ − 2298 if (XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
+ − 2299 && fit_to_be_escape_quoted (c))
+ − 2300 Dynarr_add (dst, ISO_CODE_ESC);
+ − 2301 /* you asked for it ... */
+ − 2302 Dynarr_add (dst, c - 0x20);
+ − 2303 }
+ − 2304 #ifndef ENABLE_COMPOSITE_CHARS
+ − 2305 else if (EQ (charset, Vcharset_composite))
+ − 2306 {
+ − 2307 if (c >= 160 || c <= 164) /* Someone might have stuck in
+ − 2308 something else */
+ − 2309 {
+ − 2310 Dynarr_add (dst, ISO_CODE_ESC);
+ − 2311 Dynarr_add (dst, c - 160 + '0');
+ − 2312 }
+ − 2313 }
+ − 2314 #endif
+ − 2315 else
+ − 2316 {
+ − 2317 switch (XCHARSET_REP_BYTES (charset))
+ − 2318 {
+ − 2319 case 2:
+ − 2320 Dynarr_add (dst, c & charmask);
+ − 2321 break;
+ − 2322 case 3:
+ − 2323 if (XCHARSET_PRIVATE_P (charset))
+ − 2324 {
+ − 2325 Dynarr_add (dst, c & charmask);
+ − 2326 ch = 0;
+ − 2327 }
+ − 2328 else if (ch)
+ − 2329 {
+ − 2330 #ifdef ENABLE_COMPOSITE_CHARS
+ − 2331 if (EQ (charset, Vcharset_composite))
+ − 2332 {
+ − 2333 if (in_composite)
+ − 2334 {
+ − 2335 /* #### Bother! We don't know how to
+ − 2336 handle this yet. */
+ − 2337 Dynarr_add (dst, '~');
+ − 2338 }
+ − 2339 else
+ − 2340 {
867
+ − 2341 Ichar emch = make_ichar (Vcharset_composite,
771
+ − 2342 ch & 0x7F, c & 0x7F);
+ − 2343 Lisp_Object lstr = composite_char_string (emch);
+ − 2344 saved_n = n;
+ − 2345 saved_src = src;
+ − 2346 in_composite = 1;
+ − 2347 src = XSTRING_DATA (lstr);
+ − 2348 n = XSTRING_LENGTH (lstr);
+ − 2349 Dynarr_add (dst, ISO_CODE_ESC);
+ − 2350 Dynarr_add (dst, '0'); /* start composing */
+ − 2351 }
+ − 2352 }
+ − 2353 else
+ − 2354 #endif /* ENABLE_COMPOSITE_CHARS */
+ − 2355 {
+ − 2356 Dynarr_add (dst, ch & charmask);
+ − 2357 Dynarr_add (dst, c & charmask);
+ − 2358 }
+ − 2359 ch = 0;
+ − 2360 }
+ − 2361 else
+ − 2362 {
+ − 2363 ch = c;
+ − 2364 char_boundary = 0;
+ − 2365 }
+ − 2366 break;
+ − 2367 case 4:
+ − 2368 if (ch)
+ − 2369 {
+ − 2370 Dynarr_add (dst, ch & charmask);
+ − 2371 Dynarr_add (dst, c & charmask);
+ − 2372 ch = 0;
+ − 2373 }
+ − 2374 else
+ − 2375 {
+ − 2376 ch = c;
+ − 2377 char_boundary = 0;
+ − 2378 }
+ − 2379 break;
+ − 2380 default:
+ − 2381 abort ();
+ − 2382 }
+ − 2383 }
+ − 2384 }
+ − 2385 }
+ − 2386
+ − 2387 #ifdef ENABLE_COMPOSITE_CHARS
+ − 2388 if (in_composite)
+ − 2389 {
+ − 2390 n = saved_n;
+ − 2391 src = saved_src;
+ − 2392 in_composite = 0;
+ − 2393 Dynarr_add (dst, ISO_CODE_ESC);
+ − 2394 Dynarr_add (dst, '1'); /* end composing */
+ − 2395 goto back_to_square_n; /* Wheeeeeeeee ..... */
+ − 2396 }
+ − 2397 #endif /* ENABLE_COMPOSITE_CHARS */
+ − 2398
+ − 2399 if (char_boundary && str->eof)
+ − 2400 {
+ − 2401 restore_left_to_right_direction (codesys, dst, &flags, 0);
+ − 2402 ensure_normal_shift (str, dst);
+ − 2403 for (i = 0; i < 4; i++)
+ − 2404 {
+ − 2405 Lisp_Object initial_charset =
+ − 2406 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
+ − 2407 iso2022_designate (initial_charset, i, str, dst);
+ − 2408 }
+ − 2409 }
+ − 2410
+ − 2411 data->flags = flags;
+ − 2412 str->ch = ch;
+ − 2413 data->current_char_boundary = char_boundary;
+ − 2414 data->current_charset = charset;
+ − 2415 data->current_half = half;
+ − 2416
+ − 2417 /* Verbum caro factum est! */
+ − 2418 return orign;
+ − 2419 }
+ − 2420
+ − 2421 static Bytecount
+ − 2422 iso2022_convert (struct coding_stream *str,
+ − 2423 const UExtbyte *src,
+ − 2424 unsigned_char_dynarr *dst, Bytecount n)
+ − 2425 {
+ − 2426 if (str->direction == CODING_DECODE)
+ − 2427 return iso2022_decode (str, src, dst, n);
+ − 2428 else
+ − 2429 return iso2022_encode (str, src, dst, n);
+ − 2430 }
+ − 2431
+ − 2432 static void
+ − 2433 iso2022_mark (Lisp_Object codesys)
+ − 2434 {
+ − 2435 int i;
+ − 2436
+ − 2437 for (i = 0; i < 4; i++)
+ − 2438 mark_object (XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
+ − 2439 if (XCODING_SYSTEM_ISO2022_INPUT_CONV (codesys))
+ − 2440 {
+ − 2441 for (i = 0;
+ − 2442 i < Dynarr_length (XCODING_SYSTEM_ISO2022_INPUT_CONV (codesys));
+ − 2443 i++)
+ − 2444 {
+ − 2445 struct charset_conversion_spec *ccs =
+ − 2446 Dynarr_atp (XCODING_SYSTEM_ISO2022_INPUT_CONV (codesys), i);
+ − 2447 mark_object (ccs->from_charset);
+ − 2448 mark_object (ccs->to_charset);
+ − 2449 }
+ − 2450 }
+ − 2451 if (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (codesys))
+ − 2452 {
+ − 2453 for (i = 0;
+ − 2454 i < Dynarr_length (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (codesys));
+ − 2455 i++)
+ − 2456 {
+ − 2457 struct charset_conversion_spec *ccs =
+ − 2458 Dynarr_atp (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (codesys), i);
+ − 2459 mark_object (ccs->from_charset);
+ − 2460 mark_object (ccs->to_charset);
+ − 2461 }
+ − 2462 }
+ − 2463 }
+ − 2464
+ − 2465 static void
+ − 2466 iso2022_finalize (Lisp_Object cs)
+ − 2467 {
+ − 2468 if (XCODING_SYSTEM_ISO2022_INPUT_CONV (cs))
+ − 2469 {
+ − 2470 Dynarr_free (XCODING_SYSTEM_ISO2022_INPUT_CONV (cs));
+ − 2471 XCODING_SYSTEM_ISO2022_INPUT_CONV (cs) = 0;
+ − 2472 }
+ − 2473 if (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (cs))
+ − 2474 {
+ − 2475 Dynarr_free (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (cs));
+ − 2476 XCODING_SYSTEM_ISO2022_OUTPUT_CONV (cs) = 0;
+ − 2477 }
+ − 2478 }
+ − 2479
+ − 2480 /* Given a list of charset conversion specs as specified in a Lisp
+ − 2481 program, parse it into STORE_HERE. */
+ − 2482
+ − 2483 static void
+ − 2484 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
+ − 2485 Lisp_Object spec_list)
+ − 2486 {
+ − 2487 Lisp_Object rest;
+ − 2488
+ − 2489 EXTERNAL_LIST_LOOP (rest, spec_list)
+ − 2490 {
+ − 2491 Lisp_Object car = XCAR (rest);
+ − 2492 Lisp_Object from, to;
+ − 2493 struct charset_conversion_spec spec;
+ − 2494
+ − 2495 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
+ − 2496 invalid_argument ("Invalid charset conversion spec", car);
+ − 2497 from = Fget_charset (XCAR (car));
+ − 2498 to = Fget_charset (XCAR (XCDR (car)));
+ − 2499 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
+ − 2500 invalid_operation_2
+ − 2501 ("Attempted conversion between different charset types",
+ − 2502 from, to);
+ − 2503 spec.from_charset = from;
+ − 2504 spec.to_charset = to;
+ − 2505
+ − 2506 Dynarr_add (store_here, spec);
+ − 2507 }
+ − 2508 }
+ − 2509
+ − 2510 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
+ − 2511 specs, return the equivalent as the Lisp programmer would see it.
+ − 2512
+ − 2513 If LOAD_HERE is 0, return Qnil. */
+ − 2514
+ − 2515 static Lisp_Object
+ − 2516 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here,
+ − 2517 int names)
+ − 2518 {
+ − 2519 int i;
+ − 2520 Lisp_Object result;
+ − 2521
+ − 2522 if (!load_here)
+ − 2523 return Qnil;
+ − 2524 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++)
+ − 2525 {
+ − 2526 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i);
+ − 2527 if (names)
+ − 2528 result = Fcons (list2 (XCHARSET_NAME (ccs->from_charset),
+ − 2529 XCHARSET_NAME (ccs->to_charset)), result);
+ − 2530 else
+ − 2531 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
+ − 2532 }
+ − 2533
+ − 2534 return Fnreverse (result);
+ − 2535 }
+ − 2536
+ − 2537 static int
+ − 2538 iso2022_putprop (Lisp_Object codesys,
+ − 2539 Lisp_Object key,
+ − 2540 Lisp_Object value)
+ − 2541 {
+ − 2542 #define FROB_INITIAL_CHARSET(charset_num) \
+ − 2543 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
+ − 2544 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
+ − 2545
+ − 2546 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
+ − 2547 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
+ − 2548 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
+ − 2549 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
+ − 2550
+ − 2551 #define FROB_FORCE_CHARSET(charset_num) \
+ − 2552 XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = \
+ − 2553 !NILP (value)
+ − 2554
+ − 2555 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
+ − 2556 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
+ − 2557 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
+ − 2558 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
+ − 2559
+ − 2560 #define FROB_BOOLEAN_PROPERTY(prop) \
+ − 2561 XCODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
+ − 2562
+ − 2563 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
+ − 2564 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
+ − 2565 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
+ − 2566 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
+ − 2567 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
+ − 2568 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
+ − 2569 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
+ − 2570
+ − 2571 else if (EQ (key, Qinput_charset_conversion))
+ − 2572 {
+ − 2573 XCODING_SYSTEM_ISO2022_INPUT_CONV (codesys) =
+ − 2574 Dynarr_new (charset_conversion_spec);
+ − 2575 parse_charset_conversion_specs
+ − 2576 (XCODING_SYSTEM_ISO2022_INPUT_CONV (codesys), value);
+ − 2577 }
+ − 2578 else if (EQ (key, Qoutput_charset_conversion))
+ − 2579 {
+ − 2580 XCODING_SYSTEM_ISO2022_OUTPUT_CONV (codesys) =
+ − 2581 Dynarr_new (charset_conversion_spec);
+ − 2582 parse_charset_conversion_specs
+ − 2583 (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (codesys), value);
+ − 2584 }
+ − 2585 else
+ − 2586 return 0;
+ − 2587
+ − 2588 return 1;
+ − 2589 }
+ − 2590
/* Release the per-stream resources of an ISO2022 coding stream STR.

   With ENABLE_COMPOSITE_CHARS, this frees the composition buffer if one
   was allocated.  The slot is cleared afterwards: the decoder decides
   between Dynarr_reset and Dynarr_new by testing this pointer, so a
   stale value must not be left behind.  Without ENABLE_COMPOSITE_CHARS
   there is nothing to release. */

static void
iso2022_finalize_coding_stream (struct coding_stream *str)
{
#ifdef ENABLE_COMPOSITE_CHARS
  struct iso2022_coding_stream *data =
    CODING_STREAM_TYPE_DATA (str, iso2022);

  if (data->composite_chars)
    {
      Dynarr_free (data->composite_chars);
      data->composite_chars = 0;
    }
#endif
}
+ − 2602
+ − 2603 static void
+ − 2604 iso2022_init (Lisp_Object codesys)
+ − 2605 {
+ − 2606 int i;
+ − 2607 for (i = 0; i < 4; i++)
+ − 2608 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
+ − 2609 }
+ − 2610
+ − 2611 static Lisp_Object
+ − 2612 coding_system_charset (Lisp_Object coding_system, int gnum)
+ − 2613 {
+ − 2614 Lisp_Object cs
+ − 2615 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum);
+ − 2616
+ − 2617 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil;
+ − 2618 }
+ − 2619
+ − 2620 static Lisp_Object
+ − 2621 iso2022_getprop (Lisp_Object coding_system, Lisp_Object prop)
+ − 2622 {
+ − 2623 if (EQ (prop, Qcharset_g0))
+ − 2624 return coding_system_charset (coding_system, 0);
+ − 2625 else if (EQ (prop, Qcharset_g1))
+ − 2626 return coding_system_charset (coding_system, 1);
+ − 2627 else if (EQ (prop, Qcharset_g2))
+ − 2628 return coding_system_charset (coding_system, 2);
+ − 2629 else if (EQ (prop, Qcharset_g3))
+ − 2630 return coding_system_charset (coding_system, 3);
+ − 2631
+ − 2632 #define FORCE_CHARSET(charset_num) \
+ − 2633 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
+ − 2634 (coding_system, charset_num) ? Qt : Qnil)
+ − 2635
+ − 2636 else if (EQ (prop, Qforce_g0_on_output))
+ − 2637 return FORCE_CHARSET (0);
+ − 2638 else if (EQ (prop, Qforce_g1_on_output))
+ − 2639 return FORCE_CHARSET (1);
+ − 2640 else if (EQ (prop, Qforce_g2_on_output))
+ − 2641 return FORCE_CHARSET (2);
+ − 2642 else if (EQ (prop, Qforce_g3_on_output))
+ − 2643 return FORCE_CHARSET (3);
+ − 2644
+ − 2645 #define LISP_BOOLEAN(prop) \
+ − 2646 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
+ − 2647
+ − 2648 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
+ − 2649 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
+ − 2650 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
+ − 2651 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
+ − 2652 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
+ − 2653 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
+ − 2654 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
+ − 2655
+ − 2656 else if (EQ (prop, Qinput_charset_conversion))
+ − 2657 return
+ − 2658 unparse_charset_conversion_specs
+ − 2659 (XCODING_SYSTEM_ISO2022_INPUT_CONV (coding_system), 0);
+ − 2660 else if (EQ (prop, Qoutput_charset_conversion))
+ − 2661 return
+ − 2662 unparse_charset_conversion_specs
+ − 2663 (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (coding_system), 0);
+ − 2664 else
+ − 2665 return Qunbound;
+ − 2666 }
+ − 2667
+ − 2668 static void
+ − 2669 iso2022_print (Lisp_Object cs, Lisp_Object printcharfun, int escapeflag)
+ − 2670 {
+ − 2671 int i;
+ − 2672
826
+ − 2673 write_c_string (printcharfun, "(");
771
+ − 2674 for (i = 0; i < 4; i++)
+ − 2675 {
+ − 2676 Lisp_Object charset = coding_system_charset (cs, i);
+ − 2677 if (i > 0)
826
+ − 2678 write_c_string (printcharfun, ", ");
771
+ − 2679 write_fmt_string (printcharfun, "g%d=", i);
800
+ − 2680 print_internal (CHARSETP (charset) ? XCHARSET_NAME (charset) : charset, printcharfun, 0);
771
+ − 2681 if (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (cs, i))
826
+ − 2682 write_c_string (printcharfun, "(force)");
771
+ − 2683 }
+ − 2684
800
+ − 2685 #define FROB(prop) \
+ − 2686 if (!NILP (iso2022_getprop (cs, prop))) \
+ − 2687 { \
+ − 2688 write_fmt_string (printcharfun, ", %s", prop); \
771
+ − 2689 }
+ − 2690
+ − 2691 FROB (Qshort);
+ − 2692 FROB (Qno_ascii_eol);
+ − 2693 FROB (Qno_ascii_cntl);
+ − 2694 FROB (Qseven);
+ − 2695 FROB (Qlock_shift);
+ − 2696 FROB (Qno_iso6429);
+ − 2697 FROB (Qescape_quoted);
+ − 2698
+ − 2699 {
+ − 2700 Lisp_Object val =
+ − 2701 unparse_charset_conversion_specs
+ − 2702 (XCODING_SYSTEM_ISO2022_INPUT_CONV (cs), 1);
+ − 2703 if (!NILP (val))
+ − 2704 {
800
+ − 2705 write_fmt_string_lisp (printcharfun, ", input-charset-conversion=%s", 1, val);
771
+ − 2706 }
+ − 2707 val =
+ − 2708 unparse_charset_conversion_specs
+ − 2709 (XCODING_SYSTEM_ISO2022_OUTPUT_CONV (cs), 1);
+ − 2710 if (!NILP (val))
+ − 2711 {
800
+ − 2712 write_fmt_string_lisp (printcharfun, ", output-charset-conversion=%s", 1, val);
771
+ − 2713 }
826
+ − 2714 write_c_string (printcharfun, ")");
771
+ − 2715 }
+ − 2716 }
+ − 2717
+ − 2718
+ − 2719 /************************************************************************/
+ − 2720 /* ISO2022 detector */
+ − 2721 /************************************************************************/
+ − 2722
/* The ISO2022 detector and the categories it can report. */
DEFINE_DETECTOR (iso2022);

/* Seven-bit bytes only; no locking shift. */
DEFINE_DETECTOR_CATEGORY (iso2022, iso_7);

/* Eight-bit bytes; no locking shift and no single shift; charsets are
   switched with designation sequences. */
DEFINE_DETECTOR_CATEGORY (iso2022, iso_8_designate);

/* Eight-bit bytes; no locking shift and no designation sequences;
   one-dimension characters in the upper half. */
DEFINE_DETECTOR_CATEGORY (iso2022, iso_8_1);

/* Eight-bit bytes; no locking shift and no designation sequences;
   two-dimension characters in the upper half. */
DEFINE_DETECTOR_CATEGORY (iso2022, iso_8_2);

/* Any ISO2022 system that uses locking shift. */
DEFINE_DETECTOR_CATEGORY (iso2022, iso_lock_shift);
+ − 2737
/* Per-detection scratch state for the ISO2022 detector.  It is zeroed
   and set up lazily on the first call to iso2022_detect(). */
struct iso2022_detector
{
  /* Nonzero once the structure has been zeroed and ISO allocated. */
  int initted;
  /* Decoder state handed to parse_iso2022_esc(). */
  struct iso2022_coding_stream *iso;
  /* Flag word handed to parse_iso2022_esc(); ISO_STATE_ESCAPE is tested
     on it to tell whether an escape sequence is in progress. */
  unsigned int flags;

  /* Bookkeeping for the run of bytes >= 0xA0 currently being scanned. */
  int high_byte_count;
  unsigned int saw_single_shift_just_now : 1;

  /* Accumulated observations; the likelihoods are derived from these
     at the end of each detection call. */
  unsigned int seen_high_byte : 1;
  unsigned int seen_single_shift : 1;
  unsigned int seen_locking_shift : 1;
  unsigned int seen_designate : 1;
  unsigned int bad_single_byte_sequences;
  unsigned int bad_multibyte_escape_sequences;
  unsigned int good_multibyte_escape_sequences;
  /* Number of completed high-byte runs of even length ... */
  int even_high_byte_groups;
  /* ... the length of the longest such run ... */
  int longest_even_high_byte;
  /* ... and the number of completed runs of odd length. */
  int odd_high_byte_groups;
};
+ − 2760
+ − 2761 static void
+ − 2762 iso2022_detect (struct detection_state *st, const UExtbyte *src,
+ − 2763 Bytecount n)
+ − 2764 {
+ − 2765 Bytecount orign = n;
+ − 2766 struct iso2022_detector *data = DETECTION_STATE_DATA (st, iso2022);
+ − 2767
+ − 2768 /* #### There are serious deficiencies in the recognition mechanism
+ − 2769 here. This needs to be much smarter if it's going to cut it.
+ − 2770 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while
+ − 2771 it should be detected as Latin-1.
+ − 2772 All the ISO2022 stuff in this file should be synced up with the
+ − 2773 code from FSF Emacs-21.0, in which Mule should be more or less stable.
+ − 2774 Perhaps we should wait till R2L works in FSF Emacs? */
+ − 2775
+ − 2776 /* We keep track of running state on our own, and set the categories at the
+ − 2777 end; that way we can reflect the correct state each time we finish, but
+ − 2778 not get confused by those results the next time around. */
+ − 2779
+ − 2780 if (!data->initted)
+ − 2781 {
+ − 2782 xzero (*data);
+ − 2783 data->iso = xnew_and_zero (struct iso2022_coding_stream);
+ − 2784 reset_iso2022_decode (Qnil, data->iso);
+ − 2785 data->initted = 1;
+ − 2786 }
+ − 2787
+ − 2788 while (n--)
+ − 2789 {
+ − 2790 UExtbyte c = *src++;
+ − 2791 if (c >= 0x80)
+ − 2792 data->seen_high_byte = 1;
+ − 2793 if (c >= 0xA0)
+ − 2794 data->high_byte_count++;
+ − 2795 else
+ − 2796 {
+ − 2797 if (data->high_byte_count &&
+ − 2798 !data->saw_single_shift_just_now)
+ − 2799 {
+ − 2800 if (data->high_byte_count & 1)
+ − 2801 data->odd_high_byte_groups++;
+ − 2802 else
985
+ − 2803 {
+ − 2804 data->even_high_byte_groups++;
+ − 2805 if (data->longest_even_high_byte < data->high_byte_count)
+ − 2806 data->longest_even_high_byte = data->high_byte_count;
+ − 2807 }
771
+ − 2808 }
+ − 2809 data->high_byte_count = 0;
+ − 2810 data->saw_single_shift_just_now = 0;
+ − 2811 }
+ − 2812 if (!(data->flags & ISO_STATE_ESCAPE)
826
+ − 2813 && (byte_c0_p (c) || byte_c1_p (c)))
771
+ − 2814 { /* control chars */
+ − 2815 switch (c)
+ − 2816 {
+ − 2817 /* Allow and ignore control characters that you might
+ − 2818 reasonably see in a text file */
+ − 2819 case '\r':
+ − 2820 case '\n':
+ − 2821 case '\t':
+ − 2822 case 7: /* bell */
+ − 2823 case 8: /* backspace */
+ − 2824 case 11: /* vertical tab */
+ − 2825 case 12: /* form feed */
+ − 2826 case 26: /* MS-DOS C-z junk */
+ − 2827 case 31: /* '^_' -- for info */
+ − 2828 goto label_continue_loop;
+ − 2829
+ − 2830 default:
+ − 2831 break;
+ − 2832 }
+ − 2833 }
+ − 2834
826
+ − 2835 if ((data->flags & ISO_STATE_ESCAPE) || byte_c0_p (c)
+ − 2836 || byte_c1_p (c))
771
+ − 2837 {
+ − 2838 switch (parse_iso2022_esc (Qnil, data->iso, c,
+ − 2839 &data->flags, 0))
+ − 2840 {
+ − 2841 case 1: /* done */
+ − 2842 if (data->iso->esc_bytes_index > 0)
+ − 2843 data->good_multibyte_escape_sequences++;
+ − 2844 switch (data->iso->esc)
+ − 2845 {
+ − 2846 case ISO_ESC_DESIGNATE:
+ − 2847 data->seen_designate = 1;
+ − 2848 break;
+ − 2849 case ISO_ESC_LOCKING_SHIFT:
+ − 2850 data->seen_locking_shift = 1;
+ − 2851 break;
+ − 2852 case ISO_ESC_SINGLE_SHIFT:
+ − 2853 data->saw_single_shift_just_now = 1;
+ − 2854 data->seen_single_shift = 1;
+ − 2855 break;
+ − 2856 default:
+ − 2857 break;
+ − 2858 }
+ − 2859 break;
+ − 2860
+ − 2861 case -1: /* not done */
+ − 2862 break;
+ − 2863
+ − 2864 case 0: /* error */
+ − 2865 if (data->iso->esc == ISO_ESC_NOTHING)
+ − 2866 data->bad_single_byte_sequences++;
+ − 2867 else
+ − 2868 data->bad_multibyte_escape_sequences++;
+ − 2869 }
+ − 2870 }
+ − 2871 label_continue_loop:;
+ − 2872 }
+ − 2873
985
+ − 2874 if (data->high_byte_count &&
+ − 2875 !data->saw_single_shift_just_now)
+ − 2876 {
+ − 2877 if (data->high_byte_count & 1)
+ − 2878 data->odd_high_byte_groups++;
+ − 2879 else
+ − 2880 {
+ − 2881 data->even_high_byte_groups++;
+ − 2882 if (data->longest_even_high_byte < data->high_byte_count)
+ − 2883 data->longest_even_high_byte = data->high_byte_count;
+ − 2884 }
+ − 2885 }
+ − 2886
771
+ − 2887 if (data->bad_multibyte_escape_sequences > 2 ||
+ − 2888 (data->bad_multibyte_escape_sequences > 0 &&
+ − 2889 data->good_multibyte_escape_sequences /
+ − 2890 data->bad_multibyte_escape_sequences < 10))
+ − 2891 /* Just making it up ... */
+ − 2892 SET_DET_RESULTS (st, iso2022, DET_NEARLY_IMPOSSIBLE);
+ − 2893 else if (data->bad_single_byte_sequences > 5 ||
+ − 2894 (data->bad_single_byte_sequences > 0 &&
+ − 2895 (data->good_multibyte_escape_sequences +
+ − 2896 data->even_high_byte_groups +
+ − 2897 data->odd_high_byte_groups) /
+ − 2898 data->bad_single_byte_sequences < 10))
+ − 2899 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY);
+ − 2900 else if (data->seen_locking_shift)
+ − 2901 {
+ − 2902 SET_DET_RESULTS (st, iso2022, DET_QUITE_IMPROBABLE);
+ − 2903 DET_RESULT (st, iso_lock_shift) = DET_QUITE_PROBABLE;
+ − 2904 }
+ − 2905 else if (!data->seen_high_byte)
+ − 2906 {
+ − 2907 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY);
+ − 2908 if (data->good_multibyte_escape_sequences)
+ − 2909 DET_RESULT (st, iso_7) = DET_QUITE_PROBABLE;
+ − 2910 else if (data->seen_single_shift)
+ − 2911 DET_RESULT (st, iso_7) = DET_SOMEWHAT_LIKELY;
+ − 2912 else
+ − 2913 {
+ − 2914 /* If we've just seen pure 7-bit data, no escape sequences,
+ − 2915 then we can't give much likelihood; but if we've seen enough
+ − 2916 of this data, we can assume some unlikelihood of any 8-bit
+ − 2917 encoding */
+ − 2918 if (orign + st->bytes_seen >= 1000)
+ − 2919 DET_RESULT (st, iso_7) = DET_AS_LIKELY_AS_UNLIKELY;
+ − 2920 else
+ − 2921 SET_DET_RESULTS (st, iso2022, DET_AS_LIKELY_AS_UNLIKELY);
+ − 2922 }
+ − 2923 }
+ − 2924 else if (data->seen_designate)
+ − 2925 {
+ − 2926 SET_DET_RESULTS (st, iso2022, DET_QUITE_IMPROBABLE);
+ − 2927 if (data->seen_single_shift)
+ − 2928 /* #### Does this really make sense? */
+ − 2929 DET_RESULT (st, iso_8_designate) = DET_SOMEWHAT_UNLIKELY;
+ − 2930 else
+ − 2931 DET_RESULT (st, iso_8_designate) = DET_QUITE_PROBABLE;
+ − 2932 }
+ − 2933 else if (data->odd_high_byte_groups > 0 &&
+ − 2934 data->even_high_byte_groups == 0)
+ − 2935 {
+ − 2936 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY);
+ − 2937 if (data->seen_single_shift)
+ − 2938 DET_RESULT (st, iso_8_1) = DET_QUITE_PROBABLE;
+ − 2939 else
+ − 2940 DET_RESULT (st, iso_8_1) = DET_SOMEWHAT_LIKELY;
+ − 2941 }
+ − 2942 else if (data->odd_high_byte_groups == 0 &&
+ − 2943 data->even_high_byte_groups > 0)
+ − 2944 {
985
+ − 2945 #if 0
771
+ − 2946 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY);
+ − 2947 if (data->even_high_byte_groups > 10)
+ − 2948 {
+ − 2949 if (data->seen_single_shift)
+ − 2950 DET_RESULT (st, iso_8_2) = DET_QUITE_PROBABLE;
+ − 2951 else
+ − 2952 DET_RESULT (st, iso_8_2) = DET_SOMEWHAT_LIKELY;
+ − 2953 if (data->even_high_byte_groups < 50)
+ − 2954 DET_RESULT (st, iso_8_1) = DET_SOMEWHAT_UNLIKELY;
+ − 2955 /* else it stays at quite improbable */
+ − 2956 }
985
+ − 2957 #else
+ − 2958 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY);
+ − 2959 if (data->seen_single_shift)
+ − 2960 DET_RESULT (st, iso_8_2) = DET_QUITE_PROBABLE;
+ − 2961 else if (data->even_high_byte_groups > 10)
+ − 2962 DET_RESULT (st, iso_8_2) = DET_SOMEWHAT_LIKELY;
+ − 2963 else if (data->longest_even_high_byte > 6)
+ − 2964 DET_RESULT (st, iso_8_2) = DET_SLIGHTLY_LIKELY;
+ − 2965 #endif
771
+ − 2966 }
+ − 2967 else if (data->odd_high_byte_groups > 0 &&
+ − 2968 data->even_high_byte_groups > 0)
+ − 2969 SET_DET_RESULTS (st, iso2022, DET_SOMEWHAT_UNLIKELY);
+ − 2970 else
+ − 2971 SET_DET_RESULTS (st, iso2022, DET_AS_LIKELY_AS_UNLIKELY);
+ − 2972 }
+ − 2973
+ − 2974 static void
+ − 2975 iso2022_finalize_detection_state (struct detection_state *st)
+ − 2976 {
+ − 2977 struct iso2022_detector *data = DETECTION_STATE_DATA (st, iso2022);
+ − 2978 if (data->iso)
+ − 2979 xfree (data->iso);
+ − 2980 }
+ − 2981
+ − 2982
+ − 2983 /************************************************************************/
+ − 2984 /* CCL methods */
+ − 2985 /************************************************************************/
+ − 2986
+ − 2987 /* Converter written in CCL. */
+ − 2988 DEFINE_CODING_SYSTEM_TYPE (ccl);
+ − 2989
+ − 2990 struct ccl_coding_system
+ − 2991 {
+ − 2992 /* For a CCL coding system, these specify the CCL programs used for
+ − 2993 decoding (input) and encoding (output). */
+ − 2994 Lisp_Object decode;
+ − 2995 Lisp_Object encode;
+ − 2996 };
+ − 2997
/* Accessors for the decode/encode programs of a CCL coding system.
   The plain forms take a coding-system structure pointer; the X forms
   take the Lisp object and unwrap it with XCODING_SYSTEM first. */
#define CODING_SYSTEM_CCL_DECODE(codesys)                       \
  (CODING_SYSTEM_TYPE_DATA (codesys, ccl)->decode)
#define CODING_SYSTEM_CCL_ENCODE(codesys)                       \
  (CODING_SYSTEM_TYPE_DATA (codesys, ccl)->encode)
#define XCODING_SYSTEM_CCL_DECODE(codesys)                      \
  CODING_SYSTEM_CCL_DECODE (XCODING_SYSTEM (codesys))
#define XCODING_SYSTEM_CCL_ENCODE(codesys)                      \
  CODING_SYSTEM_CCL_ENCODE (XCODING_SYSTEM (codesys))
+ − 3006
+ − 3007 struct ccl_coding_stream
+ − 3008 {
+ − 3009 /* state of the running CCL program */
+ − 3010 struct ccl_program ccl;
+ − 3011 };
+ − 3012
+ − 3013 static const struct lrecord_description ccl_coding_system_description[] = {
+ − 3014 { XD_LISP_OBJECT,
+ − 3015 coding_system_data_offset + offsetof (struct ccl_coding_system,
+ − 3016 decode) },
+ − 3017 { XD_LISP_OBJECT,
+ − 3018 coding_system_data_offset + offsetof (struct ccl_coding_system,
+ − 3019 encode) },
+ − 3020 { XD_END }
+ − 3021 };
+ − 3022
+ − 3023 static void
+ − 3024 ccl_mark (Lisp_Object codesys)
+ − 3025 {
+ − 3026 mark_object (XCODING_SYSTEM_CCL_DECODE (codesys));
+ − 3027 mark_object (XCODING_SYSTEM_CCL_ENCODE (codesys));
+ − 3028 }
+ − 3029
+ − 3030 static Bytecount
+ − 3031 ccl_convert (struct coding_stream *str, const UExtbyte *src,
+ − 3032 unsigned_char_dynarr *dst, Bytecount n)
+ − 3033 {
+ − 3034 struct ccl_coding_stream *data =
+ − 3035 CODING_STREAM_TYPE_DATA (str, ccl);
+ − 3036 Bytecount orign = n;
+ − 3037
+ − 3038 data->ccl.last_block = str->eof;
+ − 3039 /* When applying a CCL program to a stream, SRC must not be NULL -- this
+ − 3040 is a special signal to the driver that read and write operations are
+ − 3041 not allowed. The code does not actually look at what SRC points to if
+ − 3042 N == 0.
+ − 3043 */
+ − 3044 ccl_driver (&data->ccl, src ? src : (const unsigned char *) "",
+ − 3045 dst, n, 0,
+ − 3046 str->direction == CODING_DECODE ? CCL_MODE_DECODING :
+ − 3047 CCL_MODE_ENCODING);
+ − 3048 return orign;
+ − 3049 }
+ − 3050
+ − 3051 static void
+ − 3052 ccl_init_coding_stream (struct coding_stream *str)
+ − 3053 {
+ − 3054 struct ccl_coding_stream *data =
+ − 3055 CODING_STREAM_TYPE_DATA (str, ccl);
+ − 3056
+ − 3057 setup_ccl_program (&data->ccl,
+ − 3058 str->direction == CODING_DECODE ?
+ − 3059 XCODING_SYSTEM_CCL_DECODE (str->codesys) :
+ − 3060 XCODING_SYSTEM_CCL_ENCODE (str->codesys));
+ − 3061 }
+ − 3062
/* Rewinding a CCL stream is the same as initializing it again. */
static void
ccl_rewind_coding_stream (struct coding_stream *str)
{
  ccl_init_coding_stream (str);
}
+ − 3068
+ − 3069 static void
+ − 3070 ccl_init (Lisp_Object codesys)
+ − 3071 {
+ − 3072 XCODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
+ − 3073 XCODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
+ − 3074 }
+ − 3075
+ − 3076 static int
+ − 3077 ccl_putprop (Lisp_Object codesys, Lisp_Object key, Lisp_Object value)
+ − 3078 {
+ − 3079 Lisp_Object sym;
+ − 3080 struct ccl_program test_ccl;
+ − 3081 Char_ASCII *suffix;
+ − 3082
+ − 3083 /* Check key first. */
+ − 3084 if (EQ (key, Qdecode))
+ − 3085 suffix = "-ccl-decode";
+ − 3086 else if (EQ (key, Qencode))
+ − 3087 suffix = "-ccl-encode";
+ − 3088 else
+ − 3089 return 0;
+ − 3090
+ − 3091 /* If value is vector, register it as a ccl program
+ − 3092 associated with a newly created symbol for
+ − 3093 backward compatibility.
+ − 3094
+ − 3095 #### Bogosity alert! Do we really have to do this crap???? --ben */
+ − 3096 if (VECTORP (value))
+ − 3097 {
+ − 3098 sym = Fintern (concat2 (Fsymbol_name (XCODING_SYSTEM_NAME (codesys)),
+ − 3099 build_string (suffix)),
+ − 3100 Qnil);
+ − 3101 Fregister_ccl_program (sym, value);
+ − 3102 }
+ − 3103 else
+ − 3104 {
+ − 3105 CHECK_SYMBOL (value);
+ − 3106 sym = value;
+ − 3107 }
+ − 3108 /* check if the given ccl programs are valid. */
+ − 3109 if (setup_ccl_program (&test_ccl, sym) < 0)
+ − 3110 invalid_argument ("Invalid CCL program", value);
+ − 3111
+ − 3112 if (EQ (key, Qdecode))
+ − 3113 XCODING_SYSTEM_CCL_DECODE (codesys) = sym;
+ − 3114 else if (EQ (key, Qencode))
+ − 3115 XCODING_SYSTEM_CCL_ENCODE (codesys) = sym;
+ − 3116
+ − 3117 return 1;
+ − 3118 }
+ − 3119
+ − 3120 static Lisp_Object
+ − 3121 ccl_getprop (Lisp_Object coding_system, Lisp_Object prop)
+ − 3122 {
+ − 3123 if (EQ (prop, Qdecode))
+ − 3124 return XCODING_SYSTEM_CCL_DECODE (coding_system);
+ − 3125 else if (EQ (prop, Qencode))
+ − 3126 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
+ − 3127 else
+ − 3128 return Qunbound;
+ − 3129 }
+ − 3130
+ − 3131
+ − 3132 /************************************************************************/
+ − 3133 /* Initialization */
+ − 3134 /************************************************************************/
+ − 3135
+ − 3136 void
+ − 3137 syms_of_mule_coding (void)
+ − 3138 {
+ − 3139 DEFSUBR (Fdecode_shift_jis_char);
+ − 3140 DEFSUBR (Fencode_shift_jis_char);
+ − 3141 DEFSUBR (Fdecode_big5_char);
+ − 3142 DEFSUBR (Fencode_big5_char);
+ − 3143
+ − 3144 DEFSYMBOL (Qbig5);
+ − 3145 DEFSYMBOL (Qshift_jis);
+ − 3146 DEFSYMBOL (Qccl);
+ − 3147 DEFSYMBOL (Qiso2022);
+ − 3148
+ − 3149 DEFSYMBOL (Qcharset_g0);
+ − 3150 DEFSYMBOL (Qcharset_g1);
+ − 3151 DEFSYMBOL (Qcharset_g2);
+ − 3152 DEFSYMBOL (Qcharset_g3);
+ − 3153 DEFSYMBOL (Qforce_g0_on_output);
+ − 3154 DEFSYMBOL (Qforce_g1_on_output);
+ − 3155 DEFSYMBOL (Qforce_g2_on_output);
+ − 3156 DEFSYMBOL (Qforce_g3_on_output);
+ − 3157 DEFSYMBOL (Qno_iso6429);
+ − 3158 DEFSYMBOL (Qinput_charset_conversion);
+ − 3159 DEFSYMBOL (Qoutput_charset_conversion);
+ − 3160
+ − 3161 DEFSYMBOL (Qshort);
+ − 3162 DEFSYMBOL (Qno_ascii_eol);
+ − 3163 DEFSYMBOL (Qno_ascii_cntl);
+ − 3164 DEFSYMBOL (Qseven);
+ − 3165 DEFSYMBOL (Qlock_shift);
+ − 3166
+ − 3167 DEFSYMBOL (Qiso_7);
+ − 3168 DEFSYMBOL (Qiso_8_designate);
+ − 3169 DEFSYMBOL (Qiso_8_1);
+ − 3170 DEFSYMBOL (Qiso_8_2);
+ − 3171 DEFSYMBOL (Qiso_lock_shift);
+ − 3172 }
+ − 3173
+ − 3174 void
+ − 3175 coding_system_type_create_mule_coding (void)
+ − 3176 {
+ − 3177 INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA (iso2022, "iso2022-coding-system-p");
+ − 3178 CODING_SYSTEM_HAS_METHOD (iso2022, mark);
+ − 3179 CODING_SYSTEM_HAS_METHOD (iso2022, convert);
+ − 3180 CODING_SYSTEM_HAS_METHOD (iso2022, finalize_coding_stream);
+ − 3181 CODING_SYSTEM_HAS_METHOD (iso2022, init_coding_stream);
+ − 3182 CODING_SYSTEM_HAS_METHOD (iso2022, rewind_coding_stream);
+ − 3183 CODING_SYSTEM_HAS_METHOD (iso2022, init);
+ − 3184 CODING_SYSTEM_HAS_METHOD (iso2022, print);
+ − 3185 CODING_SYSTEM_HAS_METHOD (iso2022, finalize);
+ − 3186 CODING_SYSTEM_HAS_METHOD (iso2022, putprop);
+ − 3187 CODING_SYSTEM_HAS_METHOD (iso2022, getprop);
+ − 3188
+ − 3189 INITIALIZE_DETECTOR (iso2022);
+ − 3190 DETECTOR_HAS_METHOD (iso2022, detect);
+ − 3191 DETECTOR_HAS_METHOD (iso2022, finalize_detection_state);
+ − 3192 INITIALIZE_DETECTOR_CATEGORY (iso2022, iso_7);
+ − 3193 INITIALIZE_DETECTOR_CATEGORY (iso2022, iso_8_designate);
+ − 3194 INITIALIZE_DETECTOR_CATEGORY (iso2022, iso_8_1);
+ − 3195 INITIALIZE_DETECTOR_CATEGORY (iso2022, iso_8_2);
+ − 3196 INITIALIZE_DETECTOR_CATEGORY (iso2022, iso_lock_shift);
+ − 3197
+ − 3198 INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA (ccl, "ccl-coding-system-p");
+ − 3199 CODING_SYSTEM_HAS_METHOD (ccl, mark);
+ − 3200 CODING_SYSTEM_HAS_METHOD (ccl, convert);
+ − 3201 CODING_SYSTEM_HAS_METHOD (ccl, init);
+ − 3202 CODING_SYSTEM_HAS_METHOD (ccl, init_coding_stream);
+ − 3203 CODING_SYSTEM_HAS_METHOD (ccl, rewind_coding_stream);
+ − 3204 CODING_SYSTEM_HAS_METHOD (ccl, putprop);
+ − 3205 CODING_SYSTEM_HAS_METHOD (ccl, getprop);
+ − 3206
+ − 3207 INITIALIZE_CODING_SYSTEM_TYPE (shift_jis, "shift-jis-coding-system-p");
+ − 3208 CODING_SYSTEM_HAS_METHOD (shift_jis, convert);
+ − 3209
+ − 3210 INITIALIZE_DETECTOR (shift_jis);
+ − 3211 DETECTOR_HAS_METHOD (shift_jis, detect);
+ − 3212 INITIALIZE_DETECTOR_CATEGORY (shift_jis, shift_jis);
+ − 3213
+ − 3214 INITIALIZE_CODING_SYSTEM_TYPE (big5, "big5-coding-system-p");
+ − 3215 CODING_SYSTEM_HAS_METHOD (big5, convert);
+ − 3216
+ − 3217 INITIALIZE_DETECTOR (big5);
+ − 3218 DETECTOR_HAS_METHOD (big5, detect);
+ − 3219 INITIALIZE_DETECTOR_CATEGORY (big5, big5);
+ − 3220 }
+ − 3221
+ − 3222 void
+ − 3223 reinit_coding_system_type_create_mule_coding (void)
+ − 3224 {
+ − 3225 REINITIALIZE_CODING_SYSTEM_TYPE (iso2022);
+ − 3226 REINITIALIZE_CODING_SYSTEM_TYPE (ccl);
+ − 3227 REINITIALIZE_CODING_SYSTEM_TYPE (shift_jis);
+ − 3228 REINITIALIZE_CODING_SYSTEM_TYPE (big5);
+ − 3229 }
+ − 3230
/* Nothing in this file currently needs variable reinitialization. */
void
reinit_vars_of_mule_coding (void)
{
  /* Intentionally empty. */
}
+ − 3235
/* This file currently defines no Lisp variables. */
void
vars_of_mule_coding (void)
{
  /* Intentionally empty. */
}