comparison src/file-coding.c @ 396:6719134a07c2 r21-2-13

Import from CVS: tag r21-2-13
author cvs
date Mon, 13 Aug 2007 11:12:05 +0200
parents aabb7f5b1c81
children 74fd4e045ea6
comparison
equal deleted inserted replaced
395:de2c2a7459d2 396:6719134a07c2
29 #include "elhash.h" 29 #include "elhash.h"
30 #include "insdel.h" 30 #include "insdel.h"
31 #include "lstream.h" 31 #include "lstream.h"
32 #ifdef MULE 32 #ifdef MULE
33 #include "mule-ccl.h" 33 #include "mule-ccl.h"
34 #include "chartab.h"
34 #endif 35 #endif
35 #include "file-coding.h" 36 #include "file-coding.h"
36 37
37 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error; 38 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error;
38 39
62 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf; 63 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
63 Lisp_Object Qpost_read_conversion; 64 Lisp_Object Qpost_read_conversion;
64 Lisp_Object Qpre_write_conversion; 65 Lisp_Object Qpre_write_conversion;
65 66
66 #ifdef MULE 67 #ifdef MULE
68 Lisp_Object Qucs4, Qutf8;
67 Lisp_Object Qbig5, Qshift_jis; 69 Lisp_Object Qbig5, Qshift_jis;
68 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3; 70 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
69 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output; 71 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
70 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output; 72 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
71 Lisp_Object Qno_iso6429; 73 Lisp_Object Qno_iso6429;
101 unsigned char esc_bytes[8]; 103 unsigned char esc_bytes[8];
102 104
103 /* Index for next byte to store in ISO escape sequence. */ 105 /* Index for next byte to store in ISO escape sequence. */
104 int esc_bytes_index; 106 int esc_bytes_index;
105 107
108 #ifdef ENABLE_COMPOSITE_CHARS
106 /* Stuff seen so far when composing a string. */ 109 /* Stuff seen so far when composing a string. */
107 unsigned_char_dynarr *composite_chars; 110 unsigned_char_dynarr *composite_chars;
111 #endif
108 112
109 /* If we saw an invalid designation sequence for a particular 113 /* If we saw an invalid designation sequence for a particular
110 register, we flag it here and switch to ASCII. The next time we 114 register, we flag it here and switch to ASCII. The next time we
111 see a valid designation for this register, we turn off the flag 115 see a valid designation for this register, we turn off the flag
112 and do the designation normally, but pretend the sequence was 116 and do the designation normally, but pretend the sequence was
164 CONST unsigned char *src, 168 CONST unsigned char *src,
165 unsigned_char_dynarr *dst, unsigned int n); 169 unsigned_char_dynarr *dst, unsigned int n);
166 static void encode_coding_big5 (Lstream *encoding, 170 static void encode_coding_big5 (Lstream *encoding,
167 CONST unsigned char *src, 171 CONST unsigned char *src,
168 unsigned_char_dynarr *dst, unsigned int n); 172 unsigned_char_dynarr *dst, unsigned int n);
173 static int detect_coding_ucs4 (struct detection_state *st,
174 CONST unsigned char *src,
175 unsigned int n);
176 static void decode_coding_ucs4 (Lstream *decoding,
177 CONST unsigned char *src,
178 unsigned_char_dynarr *dst, unsigned int n);
179 static void encode_coding_ucs4 (Lstream *encoding,
180 CONST unsigned char *src,
181 unsigned_char_dynarr *dst, unsigned int n);
182 static int detect_coding_utf8 (struct detection_state *st,
183 CONST unsigned char *src,
184 unsigned int n);
185 static void decode_coding_utf8 (Lstream *decoding,
186 CONST unsigned char *src,
187 unsigned_char_dynarr *dst, unsigned int n);
188 static void encode_coding_utf8 (Lstream *encoding,
189 CONST unsigned char *src,
190 unsigned_char_dynarr *dst, unsigned int n);
169 static int postprocess_iso2022_mask (int mask); 191 static int postprocess_iso2022_mask (int mask);
170 static void reset_iso2022 (Lisp_Object coding_system, 192 static void reset_iso2022 (Lisp_Object coding_system,
171 struct iso2022_decoder *iso); 193 struct iso2022_decoder *iso);
172 static int detect_coding_iso2022 (struct detection_state *st, 194 static int detect_coding_iso2022 (struct detection_state *st,
173 CONST unsigned char *src, 195 CONST unsigned char *src,
228 0, 0, struct Lisp_Coding_System); 250 0, 0, struct Lisp_Coding_System);
229 251
230 static Lisp_Object 252 static Lisp_Object
231 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object)) 253 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
232 { 254 {
233 struct Lisp_Coding_System *codesys = XCODING_SYSTEM (obj); 255 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
234 256
235 markobj (CODING_SYSTEM_NAME (codesys)); 257 markobj (CODING_SYSTEM_NAME (codesys));
236 markobj (CODING_SYSTEM_DOC_STRING (codesys)); 258 markobj (CODING_SYSTEM_DOC_STRING (codesys));
237 markobj (CODING_SYSTEM_MNEMONIC (codesys)); 259 markobj (CODING_SYSTEM_MNEMONIC (codesys));
238 markobj (CODING_SYSTEM_EOL_LF (codesys)); 260 markobj (CODING_SYSTEM_EOL_LF (codesys));
283 305
284 static void 306 static void
285 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun, 307 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
286 int escapeflag) 308 int escapeflag)
287 { 309 {
288 struct Lisp_Coding_System *c = XCODING_SYSTEM (obj); 310 Lisp_Coding_System *c = XCODING_SYSTEM (obj);
289 if (print_readably) 311 if (print_readably)
290 error ("printing unreadable object #<coding_system 0x%x>", 312 error ("printing unreadable object #<coding_system 0x%x>",
291 c->header.uid); 313 c->header.uid);
292 314
293 write_c_string ("#<coding_system ", printcharfun); 315 write_c_string ("#<coding_system ", printcharfun);
296 } 318 }
297 319
298 static void 320 static void
299 finalize_coding_system (void *header, int for_disksave) 321 finalize_coding_system (void *header, int for_disksave)
300 { 322 {
301 struct Lisp_Coding_System *c = (struct Lisp_Coding_System *) header; 323 Lisp_Coding_System *c = (Lisp_Coding_System *) header;
302 /* Since coding systems never go away, this function is not 324 /* Since coding systems never go away, this function is not
303 necessary. But it would be necessary if we changed things 325 necessary. But it would be necessary if we changed things
304 so that coding systems could go away. */ 326 so that coding systems could go away. */
305 if (!for_disksave) /* see comment in lstream.c */ 327 if (!for_disksave) /* see comment in lstream.c */
306 { 328 {
351 case EOL_AUTODETECT: return Qnil; 373 case EOL_AUTODETECT: return Qnil;
352 } 374 }
353 } 375 }
354 376
355 static void 377 static void
356 setup_eol_coding_systems (struct Lisp_Coding_System *codesys) 378 setup_eol_coding_systems (Lisp_Coding_System *codesys)
357 { 379 {
358 Lisp_Object codesys_obj; 380 Lisp_Object codesys_obj;
359 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name); 381 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
360 char *codesys_name = (char *) alloca (len + 7); 382 char *codesys_name = (char *) alloca (len + 7);
361 int mlen = -1; 383 int mlen = -1;
503 { 525 {
504 coding_system = Fget_coding_system (coding_system); 526 coding_system = Fget_coding_system (coding_system);
505 return XCODING_SYSTEM_NAME (coding_system); 527 return XCODING_SYSTEM_NAME (coding_system);
506 } 528 }
507 529
508 static struct Lisp_Coding_System * 530 static Lisp_Coding_System *
509 allocate_coding_system (enum coding_system_type type, Lisp_Object name) 531 allocate_coding_system (enum coding_system_type type, Lisp_Object name)
510 { 532 {
511 struct Lisp_Coding_System *codesys = 533 Lisp_Coding_System *codesys =
512 alloc_lcrecord_type (struct Lisp_Coding_System, lrecord_coding_system); 534 alloc_lcrecord_type (Lisp_Coding_System, lrecord_coding_system);
513 535
514 zero_lcrecord (codesys); 536 zero_lcrecord (codesys);
515 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil; 537 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
516 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil; 538 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
517 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT; 539 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
606 graphic characters that are not in ASCII or Latin-1 will be 628 graphic characters that are not in ASCII or Latin-1 will be
607 replaced by a ?. (For a no-conversion-encoded buffer, these 629 replaced by a ?. (For a no-conversion-encoded buffer, these
608 characters will only be present if you explicitly insert them.) 630 characters will only be present if you explicitly insert them.)
609 'shift-jis 631 'shift-jis
610 Shift-JIS (a Japanese encoding commonly used in PC operating systems). 632 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
633 'ucs-4
634 ISO 10646 UCS-4 encoding.
635 'utf-8
636 ISO 10646 UTF-8 encoding.
611 'iso2022 637 'iso2022
612 Any ISO2022-compliant encoding. Among other things, this includes 638 Any ISO2022-compliant encoding. Among other things, this includes
613 JIS (the Japanese encoding commonly used for e-mail), EUC (the 639 JIS (the Japanese encoding commonly used for e-mail), EUC (the
614 standard Unix encoding for Japanese and other languages), and 640 standard Unix encoding for Japanese and other languages), and
615 Compound Text (the encoding used in X11). You can specify more 641 Compound Text (the encoding used in X11). You can specify more
760 'encode 786 'encode
761 CCL program used for encoding (converting to external format). 787 CCL program used for encoding (converting to external format).
762 */ 788 */
763 (name, type, doc_string, props)) 789 (name, type, doc_string, props))
764 { 790 {
765 struct Lisp_Coding_System *codesys; 791 Lisp_Coding_System *codesys;
766 Lisp_Object rest, key, value; 792 Lisp_Object rest, key, value;
767 enum coding_system_type ty; 793 enum coding_system_type ty;
768 int need_to_setup_eol_systems = 1; 794 int need_to_setup_eol_systems = 1;
769 795
770 /* Convert type to constant */ 796 /* Convert type to constant */
772 { ty = CODESYS_AUTODETECT; } 798 { ty = CODESYS_AUTODETECT; }
773 #ifdef MULE 799 #ifdef MULE
774 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; } 800 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
775 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; } 801 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
776 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; } 802 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
803 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; }
804 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; }
777 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; } 805 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
778 #endif 806 #endif
779 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; } 807 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
780 #ifdef DEBUG_XEMACS 808 #ifdef DEBUG_XEMACS
781 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; } 809 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
909 new_name)); 937 new_name));
910 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table); 938 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table);
911 } 939 }
912 940
913 { 941 {
914 struct Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system); 942 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
915 struct Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system); 943 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
916 memcpy (((char *) to ) + sizeof (to->header), 944 memcpy (((char *) to ) + sizeof (to->header),
917 ((char *) from) + sizeof (from->header), 945 ((char *) from) + sizeof (from->header),
918 sizeof (*from) - sizeof (from->header)); 946 sizeof (*from) - sizeof (from->header));
919 to->name = new_name; 947 to->name = new_name;
920 } 948 }
922 } 950 }
923 951
924 static Lisp_Object 952 static Lisp_Object
925 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type) 953 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type)
926 { 954 {
927 struct Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system); 955 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
928 Lisp_Object new_coding_system; 956 Lisp_Object new_coding_system;
929 957
930 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT) 958 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
931 return coding_system; 959 return coding_system;
932 960
978 case CODESYS_AUTODETECT: return Qundecided; 1006 case CODESYS_AUTODETECT: return Qundecided;
979 #ifdef MULE 1007 #ifdef MULE
980 case CODESYS_SHIFT_JIS: return Qshift_jis; 1008 case CODESYS_SHIFT_JIS: return Qshift_jis;
981 case CODESYS_ISO2022: return Qiso2022; 1009 case CODESYS_ISO2022: return Qiso2022;
982 case CODESYS_BIG5: return Qbig5; 1010 case CODESYS_BIG5: return Qbig5;
1011 case CODESYS_UCS4: return Qucs4;
1012 case CODESYS_UTF8: return Qutf8;
983 case CODESYS_CCL: return Qccl; 1013 case CODESYS_CCL: return Qccl;
984 #endif 1014 #endif
985 case CODESYS_NO_CONVERSION: return Qno_conversion; 1015 case CODESYS_NO_CONVERSION: return Qno_conversion;
986 #ifdef DEBUG_XEMACS 1016 #ifdef DEBUG_XEMACS
987 case CODESYS_INTERNAL: return Qinternal; 1017 case CODESYS_INTERNAL: return Qinternal;
1280 shift_jis; 1310 shift_jis;
1281 1311
1282 struct 1312 struct
1283 { 1313 {
1284 int mask; 1314 int mask;
1315 int in_byte;
1316 }
1317 ucs4;
1318
1319 struct
1320 {
1321 int mask;
1322 int in_byte;
1323 }
1324 utf8;
1325
1326 struct
1327 {
1328 int mask;
1285 int initted; 1329 int initted;
1286 struct iso2022_decoder iso; 1330 struct iso2022_decoder iso;
1287 unsigned int flags; 1331 unsigned int flags;
1288 int high_byte_count; 1332 int high_byte_count;
1289 unsigned int saw_single_shift:1; 1333 unsigned int saw_single_shift:1;
1396 { 1440 {
1397 st->seen_non_ascii = 1; 1441 st->seen_non_ascii = 1;
1398 #ifdef MULE 1442 #ifdef MULE
1399 st->shift_jis.mask = ~0; 1443 st->shift_jis.mask = ~0;
1400 st->big5.mask = ~0; 1444 st->big5.mask = ~0;
1445 st->ucs4.mask = ~0;
1446 st->utf8.mask = ~0;
1401 st->iso2022.mask = ~0; 1447 st->iso2022.mask = ~0;
1402 #endif 1448 #endif
1403 break; 1449 break;
1404 } 1450 }
1405 } 1451 }
1412 st->iso2022.mask = detect_coding_iso2022 (st, src, n); 1458 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1413 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask)) 1459 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1414 st->shift_jis.mask = detect_coding_sjis (st, src, n); 1460 st->shift_jis.mask = detect_coding_sjis (st, src, n);
1415 if (!mask_has_at_most_one_bit_p (st->big5.mask)) 1461 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1416 st->big5.mask = detect_coding_big5 (st, src, n); 1462 st->big5.mask = detect_coding_big5 (st, src, n);
1417 1463 if (!mask_has_at_most_one_bit_p (st->utf8.mask))
1418 st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask; 1464 st->utf8.mask = detect_coding_utf8 (st, src, n);
1465 if (!mask_has_at_most_one_bit_p (st->ucs4.mask))
1466 st->ucs4.mask = detect_coding_ucs4 (st, src, n);
1467
1468 st->mask
1469 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask
1470 | st->utf8.mask | st->ucs4.mask;
1419 #endif 1471 #endif
1420 { 1472 {
1421 int retval = mask_has_at_most_one_bit_p (st->mask); 1473 int retval = mask_has_at_most_one_bit_p (st->mask);
1422 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK; 1474 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1423 return retval && st->eol_type != EOL_AUTODETECT; 1475 return retval && st->eol_type != EOL_AUTODETECT;
1675 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding) 1727 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1676 1728
1677 struct decoding_stream 1729 struct decoding_stream
1678 { 1730 {
1679 /* Coding system that governs the conversion. */ 1731 /* Coding system that governs the conversion. */
1680 struct Lisp_Coding_System *codesys; 1732 Lisp_Coding_System *codesys;
1681 1733
1682 /* Stream that we read the encoded data from or 1734 /* Stream that we read the encoded data from or
1683 write the decoded data to. */ 1735 write the decoded data to. */
1684 Lstream *other_end; 1736 Lstream *other_end;
1685 1737
1881 str->flags |= CODING_STATE_END; 1933 str->flags |= CODING_STATE_END;
1882 decoding_writer (stream, 0, 0); 1934 decoding_writer (stream, 0, 0);
1883 } 1935 }
1884 Dynarr_free (str->runoff); 1936 Dynarr_free (str->runoff);
1885 #ifdef MULE 1937 #ifdef MULE
1938 #ifdef ENABLE_COMPOSITE_CHARS
1886 if (str->iso2022.composite_chars) 1939 if (str->iso2022.composite_chars)
1887 Dynarr_free (str->iso2022.composite_chars); 1940 Dynarr_free (str->iso2022.composite_chars);
1888 #endif 1941 #endif
1942 #endif
1889 return Lstream_close (str->other_end); 1943 return Lstream_close (str->other_end);
1890 } 1944 }
1891 1945
1892 Lisp_Object 1946 Lisp_Object
1893 decoding_stream_coding_system (Lstream *stream) 1947 decoding_stream_coding_system (Lstream *stream)
1900 } 1954 }
1901 1955
1902 void 1956 void
1903 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys) 1957 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
1904 { 1958 {
1905 struct Lisp_Coding_System *cs = XCODING_SYSTEM (codesys); 1959 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
1906 struct decoding_stream *str = DECODING_STREAM_DATA (lstr); 1960 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1907 str->codesys = cs; 1961 str->codesys = cs;
1908 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT) 1962 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1909 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs); 1963 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
1910 reset_decoding_stream (str); 1964 reset_decoding_stream (str);
2017 decode_coding_sjis (decoding, src, dst, n); 2071 decode_coding_sjis (decoding, src, dst, n);
2018 break; 2072 break;
2019 case CODESYS_BIG5: 2073 case CODESYS_BIG5:
2020 decode_coding_big5 (decoding, src, dst, n); 2074 decode_coding_big5 (decoding, src, dst, n);
2021 break; 2075 break;
2076 case CODESYS_UCS4:
2077 decode_coding_ucs4 (decoding, src, dst, n);
2078 break;
2079 case CODESYS_UTF8:
2080 decode_coding_utf8 (decoding, src, dst, n);
2081 break;
2022 case CODESYS_CCL: 2082 case CODESYS_CCL:
2023 ccl_driver (&str->ccl, src, dst, n, 0); 2083 ccl_driver (&str->ccl, src, dst, n, 0);
2024 break; 2084 break;
2025 case CODESYS_ISO2022: 2085 case CODESYS_ISO2022:
2026 decode_coding_iso2022 (decoding, src, dst, n); 2086 decode_coding_iso2022 (decoding, src, dst, n);
2108 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding) 2168 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2109 2169
2110 struct encoding_stream 2170 struct encoding_stream
2111 { 2171 {
2112 /* Coding system that governs the conversion. */ 2172 /* Coding system that governs the conversion. */
2113 struct Lisp_Coding_System *codesys; 2173 Lisp_Coding_System *codesys;
2114 2174
2115 /* Stream that we read the encoded data from or 2175 /* Stream that we read the encoded data from or
2116 write the decoded data to. */ 2176 write the decoded data to. */
2117 Lstream *other_end; 2177 Lstream *other_end;
2118 2178
2359 } 2419 }
2360 2420
2361 void 2421 void
2362 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys) 2422 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2363 { 2423 {
2364 struct Lisp_Coding_System *cs = XCODING_SYSTEM (codesys); 2424 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2365 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr); 2425 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2366 str->codesys = cs; 2426 str->codesys = cs;
2367 reset_encoding_stream (str); 2427 reset_encoding_stream (str);
2368 } 2428 }
2369 2429
2423 encode_coding_sjis (encoding, src, dst, n); 2483 encode_coding_sjis (encoding, src, dst, n);
2424 break; 2484 break;
2425 case CODESYS_BIG5: 2485 case CODESYS_BIG5:
2426 encode_coding_big5 (encoding, src, dst, n); 2486 encode_coding_big5 (encoding, src, dst, n);
2427 break; 2487 break;
2488 case CODESYS_UCS4:
2489 encode_coding_ucs4 (encoding, src, dst, n);
2490 break;
2491 case CODESYS_UTF8:
2492 encode_coding_utf8 (encoding, src, dst, n);
2493 break;
2428 case CODESYS_CCL: 2494 case CODESYS_CCL:
2429 ccl_driver (&str->ccl, src, dst, n, 0); 2495 ccl_driver (&str->ccl, src, dst, n, 0);
2430 break; 2496 break;
2431 case CODESYS_ISO2022: 2497 case CODESYS_ISO2022:
2432 encode_coding_iso2022 (encoding, src, dst, n); 2498 encode_coding_iso2022 (encoding, src, dst, n);
2508 /* Shift-JIS methods */ 2574 /* Shift-JIS methods */
2509 /************************************************************************/ 2575 /************************************************************************/
2510 2576
2511 /* Shift-JIS is a coding system encoding three character sets: ASCII, right 2577 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2512 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded 2578 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2513 as is. A character of JISX0201-Kana (TYPE94 character set) is 2579 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is
2514 encoded by "position-code + 0x80". A character of JISX0208 2580 encoded by "position-code + 0x80". A character of JISX0208
2515 (TYPE94x94 character set) is encoded in 2-byte but two 2581 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two
2516 position-codes are divided and shifted so that it fit in the range 2582 position-codes are divided and shifted so that it fit in the range
2517 below. 2583 below.
2518 2584
2519 --- CODE RANGE of Shift-JIS --- 2585 --- CODE RANGE of Shift-JIS ---
2520 (character set) (range) 2586 (character set) (range)
2567 static void 2633 static void
2568 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src, 2634 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src,
2569 unsigned_char_dynarr *dst, unsigned int n) 2635 unsigned_char_dynarr *dst, unsigned int n)
2570 { 2636 {
2571 unsigned char c; 2637 unsigned char c;
2572 unsigned int flags, ch;
2573 enum eol_type eol_type;
2574 struct decoding_stream *str = DECODING_STREAM_DATA (decoding); 2638 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2575 2639 unsigned int flags = str->flags;
2576 CODING_STREAM_DECOMPOSE (str, flags, ch); 2640 unsigned int ch = str->ch;
2577 eol_type = str->eol_type; 2641 eol_type_t eol_type = str->eol_type;
2578 2642
2579 while (n--) 2643 while (n--)
2580 { 2644 {
2581 c = *src++; 2645 c = *src++;
2582 2646
2615 label_continue_loop:; 2679 label_continue_loop:;
2616 } 2680 }
2617 2681
2618 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst); 2682 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2619 2683
2620 CODING_STREAM_COMPOSE (str, flags, ch); 2684 str->flags = flags;
2685 str->ch = ch;
2621 } 2686 }
2622 2687
2623 /* Convert internally-formatted data to Shift-JIS. */ 2688 /* Convert internally-formatted data to Shift-JIS. */
2624 2689
2625 static void 2690 static void
2626 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src, 2691 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src,
2627 unsigned_char_dynarr *dst, unsigned int n) 2692 unsigned_char_dynarr *dst, unsigned int n)
2628 { 2693 {
2629 unsigned char c; 2694 unsigned char c;
2630 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); 2695 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2631 unsigned int flags, ch; 2696 unsigned int flags = str->flags;
2632 enum eol_type eol_type; 2697 unsigned int ch = str->ch;
2633 2698 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2634 CODING_STREAM_DECOMPOSE (str, flags, ch);
2635 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2636 2699
2637 while (n--) 2700 while (n--)
2638 { 2701 {
2639 c = *src++; 2702 c = *src++;
2640 if (c == '\n') 2703 if (c == '\n')
2673 ch = 0; 2736 ch = 0;
2674 } 2737 }
2675 } 2738 }
2676 } 2739 }
2677 2740
2678 CODING_STREAM_COMPOSE (str, flags, ch); 2741 str->flags = flags;
2742 str->ch = ch;
2679 } 2743 }
2680 2744
2681 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /* 2745 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2682 Decode a JISX0208 character of Shift-JIS coding-system. 2746 Decode a JISX0208 character of Shift-JIS coding-system.
2683 CODE is the character code in Shift-JIS as a cons of type bytes. 2747 CODE is the character code in Shift-JIS as a cons of type bytes.
2740 -------------------------- 2804 --------------------------
2741 2805
2742 Since the number of characters in Big5 is larger than maximum 2806 Since the number of characters in Big5 is larger than maximum
2743 characters in Emacs' charset (96x96), it can't be handled as one 2807 characters in Emacs' charset (96x96), it can't be handled as one
2744 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1' 2808 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2745 and `charset-big5-2'. Both <type>s are TYPE94x94. The former 2809 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former
2746 contains frequently used characters and the latter contains less 2810 contains frequently used characters and the latter contains less
2747 frequently used characters. */ 2811 frequently used characters. */
2748 2812
2749 #define BYTE_BIG5_TWO_BYTE_1_P(c) \ 2813 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2750 ((c) >= 0xA1 && (c) <= 0xFE) 2814 ((c) >= 0xA1 && (c) <= 0xFE)
2856 static void 2920 static void
2857 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src, 2921 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
2858 unsigned_char_dynarr *dst, unsigned int n) 2922 unsigned_char_dynarr *dst, unsigned int n)
2859 { 2923 {
2860 unsigned char c; 2924 unsigned char c;
2861 unsigned int flags, ch;
2862 enum eol_type eol_type;
2863 struct decoding_stream *str = DECODING_STREAM_DATA (decoding); 2925 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2864 2926 unsigned int flags = str->flags;
2865 CODING_STREAM_DECOMPOSE (str, flags, ch); 2927 unsigned int ch = str->ch;
2866 eol_type = str->eol_type; 2928 eol_type_t eol_type = str->eol_type;
2867 2929
2868 while (n--) 2930 while (n--)
2869 { 2931 {
2870 c = *src++; 2932 c = *src++;
2871 if (ch) 2933 if (ch)
2897 label_continue_loop:; 2959 label_continue_loop:;
2898 } 2960 }
2899 2961
2900 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst); 2962 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2901 2963
2902 CODING_STREAM_COMPOSE (str, flags, ch); 2964 str->flags = flags;
2965 str->ch = ch;
2903 } 2966 }
2904 2967
2905 /* Convert internally-formatted data to Big5. */ 2968 /* Convert internally-formatted data to Big5. */
2906 2969
2907 static void 2970 static void
2908 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src, 2971 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
2909 unsigned_char_dynarr *dst, unsigned int n) 2972 unsigned_char_dynarr *dst, unsigned int n)
2910 { 2973 {
2911 unsigned char c; 2974 unsigned char c;
2912 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); 2975 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2913 unsigned int flags, ch; 2976 unsigned int flags = str->flags;
2914 enum eol_type eol_type; 2977 unsigned int ch = str->ch;
2915 2978 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2916 CODING_STREAM_DECOMPOSE (str, flags, ch);
2917 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
2918 2979
2919 while (n--) 2980 while (n--)
2920 { 2981 {
2921 c = *src++; 2982 c = *src++;
2922 if (c == '\n') 2983 if (c == '\n')
2960 } 3021 }
2961 3022
2962 ch = 0; 3023 ch = 0;
2963 } 3024 }
2964 3025
2965 CODING_STREAM_COMPOSE (str, flags, ch); 3026 str->flags = flags;
3027 str->ch = ch;
2966 } 3028 }
2967 3029
2968 3030
2969 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /* 3031 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
2970 Decode a Big5 character CODE of BIG5 coding-system. 3032 Decode a Big5 character CODE of BIG5 coding-system.
3015 return Qnil; 3077 return Qnil;
3016 } 3078 }
3017 3079
3018 3080
3019 /************************************************************************/ 3081 /************************************************************************/
3082 /* UCS-4 methods */
3083 /* */
3084 /* UCS-4 character codes are implemented as nonnegative integers. */
3085 /* */
3086 /************************************************************************/
3087
3088 Lisp_Object ucs_to_mule_table[65536];
3089 Lisp_Object mule_to_ucs_table;
3090
3091 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /*
3092 Map UCS-4 code CODE to Mule character CHARACTER.
3093
3094 Return T on success, NIL on failure.
3095 */
3096 (code, character))
3097 {
3098 unsigned int c;
3099
3100 CHECK_CHAR (character);
3101 CHECK_INT (code);
3102 c = XINT (code);
3103
3104 if (c < sizeof (ucs_to_mule_table))
3105 {
3106 ucs_to_mule_table[c] = character;
3107 return Qt;
3108 }
3109 else
3110 return Qnil;
3111 }
3112
3113 static Lisp_Object
3114 ucs_to_char (unsigned long code)
3115 {
3116 if (code < sizeof (ucs_to_mule_table))
3117 {
3118 return ucs_to_mule_table[code];
3119 }
3120 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14))
3121 {
3122 unsigned int c;
3123
3124 code -= 0xe00000;
3125 c = code % (94 * 94);
3126 return make_char
3127 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES
3128 (CHARSET_TYPE_94X94, code / (94 * 94) + '@',
3129 CHARSET_LEFT_TO_RIGHT),
3130 c / 94 + 33, c % 94 + 33));
3131 }
3132 else
3133 return Qnil;
3134 }
3135
3136 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /*
3137 Return Mule character corresponding to UCS code CODE (a positive integer).
3138 */
3139 (code))
3140 {
3141 CHECK_NATNUM (code);
3142 return ucs_to_char (XINT (code));
3143 }
3144
3145 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /*
3146 Map Mule character CHARACTER to UCS code CODE (a positive integer).
3147 */
3148 (character, code))
3149 {
3150 /* #### Isn't this gilding the lily? Fput_char_table checks its args.
3151 Fset_char_ucs is more restrictive on index arg, but should
3152 check code arg in a char_table method. */
3153 CHECK_CHAR (character);
3154 CHECK_NATNUM (code);
3155 return Fput_char_table (character, code, mule_to_ucs_table);
3156 }
3157
3158 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /*
3159 Return the UCS code (a positive integer) corresponding to CHARACTER.
3160 */
3161 (character))
3162 {
3163 return Fget_char_table (character, mule_to_ucs_table);
3164 }
3165
3166 /* Decode a UCS-4 character into a buffer. If the lookup fails, use
3167 JIS X 0208 double-width `=' instead.
3168 #### do something more appropriate (use blob?)
3169 Danger, Will Robinson! Data loss. Should we signal user? */
3170 static void
3171 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst)
3172 {
3173 Lisp_Object chr = ucs_to_char (ch);
3174
3175 if (! NILP (chr))
3176 {
3177 Bufbyte work[MAX_EMCHAR_LEN];
3178 int len;
3179
3180 ch = XCHAR (chr);
3181 len = (ch < 128) ?
3182 simple_set_charptr_emchar (work, ch) :
3183 non_ascii_set_charptr_emchar (work, ch);
3184 Dynarr_add_many (dst, work, len);
3185 }
3186 else
3187 {
3188 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
3189 Dynarr_add (dst, 34 + 128);
3190 Dynarr_add (dst, 46 + 128);
3191 }
3192 }
3193
3194 static unsigned long
3195 mule_char_to_ucs4 (Lisp_Object charset,
3196 unsigned char h, unsigned char l)
3197 {
3198 Lisp_Object code
3199 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)),
3200 mule_to_ucs_table);
3201
3202 if (INTP (code))
3203 {
3204 return XINT (code);
3205 }
3206 else if ( (XCHARSET_DIMENSION (charset) == 2) &&
3207 (XCHARSET_CHARS (charset) == 94) )
3208 {
3209 unsigned char final = XCHARSET_FINAL (charset);
3210
3211 if ( ('@' <= final) && (final < 0x7f) )
3212 {
3213 return 0xe00000 + (final - '@') * 94 * 94
3214 + ((h & 127) - 33) * 94 + (l & 127) - 33;
3215 }
3216 else
3217 {
3218 return '?';
3219 }
3220 }
3221 else
3222 {
3223 return '?';
3224 }
3225 }
3226
3227 static void
3228 encode_ucs4 (Lisp_Object charset,
3229 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3230 {
3231 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3232 Dynarr_add (dst, code >> 24);
3233 Dynarr_add (dst, (code >> 16) & 255);
3234 Dynarr_add (dst, (code >> 8) & 255);
3235 Dynarr_add (dst, code & 255);
3236 }
3237
3238 static int
3239 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src,
3240 unsigned int n)
3241 {
3242 while (n--)
3243 {
3244 int c = *src++;
3245 switch (st->ucs4.in_byte)
3246 {
3247 case 0:
3248 if (c >= 128)
3249 return 0;
3250 else
3251 st->ucs4.in_byte++;
3252 break;
3253 case 3:
3254 st->ucs4.in_byte = 0;
3255 break;
3256 default:
3257 st->ucs4.in_byte++;
3258 }
3259 }
3260 return CODING_CATEGORY_UCS4_MASK;
3261 }
3262
3263 static void
3264 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src,
3265 unsigned_char_dynarr *dst, unsigned int n)
3266 {
3267 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3268 unsigned int flags = str->flags;
3269 unsigned int ch = str->ch;
3270
3271 while (n--)
3272 {
3273 unsigned char c = *src++;
3274 switch (flags)
3275 {
3276 case 0:
3277 ch = c;
3278 flags = 3;
3279 break;
3280 case 1:
3281 decode_ucs4 ( ( ch << 8 ) | c, dst);
3282 ch = 0;
3283 flags = 0;
3284 break;
3285 default:
3286 ch = ( ch << 8 ) | c;
3287 flags--;
3288 }
3289 }
3290 if (flags & CODING_STATE_END)
3291 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3292
3293 str->flags = flags;
3294 str->ch = ch;
3295 }
3296
3297 static void
3298 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src,
3299 unsigned_char_dynarr *dst, unsigned int n)
3300 {
3301 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3302 unsigned int flags = str->flags;
3303 unsigned int ch = str->ch;
3304 unsigned char char_boundary = str->iso2022.current_char_boundary;
3305 Lisp_Object charset = str->iso2022.current_charset;
3306
3307 #ifdef ENABLE_COMPOSITE_CHARS
3308 /* flags for handling composite chars. We do a little switcharoo
3309 on the source while we're outputting the composite char. */
3310 unsigned int saved_n = 0;
3311 CONST unsigned char *saved_src = NULL;
3312 int in_composite = 0;
3313
3314 back_to_square_n:
3315 #endif
3316
3317 while (n--)
3318 {
3319 unsigned char c = *src++;
3320
3321 if (BYTE_ASCII_P (c))
3322 { /* Processing ASCII character */
3323 ch = 0;
3324 encode_ucs4 (Vcharset_ascii, c, 0, dst);
3325 char_boundary = 1;
3326 }
3327 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3328 { /* Processing Leading Byte */
3329 ch = 0;
3330 charset = CHARSET_BY_LEADING_BYTE (c);
3331 if (LEADING_BYTE_PREFIX_P(c))
3332 ch = c;
3333 char_boundary = 0;
3334 }
3335 else
3336 { /* Processing Non-ASCII character */
3337 char_boundary = 1;
3338 if (EQ (charset, Vcharset_control_1))
3339 {
3340 encode_ucs4 (Vcharset_control_1, c, 0, dst);
3341 }
3342 else
3343 {
3344 switch (XCHARSET_REP_BYTES (charset))
3345 {
3346 case 2:
3347 encode_ucs4 (charset, c, 0, dst);
3348 break;
3349 case 3:
3350 if (XCHARSET_PRIVATE_P (charset))
3351 {
3352 encode_ucs4 (charset, c, 0, dst);
3353 ch = 0;
3354 }
3355 else if (ch)
3356 {
3357 #ifdef ENABLE_COMPOSITE_CHARS
3358 if (EQ (charset, Vcharset_composite))
3359 {
3360 if (in_composite)
3361 {
3362 /* #### Bother! We don't know how to
3363 handle this yet. */
3364 Dynarr_add (dst, 0);
3365 Dynarr_add (dst, 0);
3366 Dynarr_add (dst, 0);
3367 Dynarr_add (dst, '~');
3368 }
3369 else
3370 {
3371 Emchar emch = MAKE_CHAR (Vcharset_composite,
3372 ch & 0x7F, c & 0x7F);
3373 Lisp_Object lstr = composite_char_string (emch);
3374 saved_n = n;
3375 saved_src = src;
3376 in_composite = 1;
3377 src = XSTRING_DATA (lstr);
3378 n = XSTRING_LENGTH (lstr);
3379 }
3380 }
3381 else
3382 #endif /* ENABLE_COMPOSITE_CHARS */
3383 {
3384 encode_ucs4(charset, ch, c, dst);
3385 }
3386 ch = 0;
3387 }
3388 else
3389 {
3390 ch = c;
3391 char_boundary = 0;
3392 }
3393 break;
3394 case 4:
3395 if (ch)
3396 {
3397 encode_ucs4 (charset, ch, c, dst);
3398 ch = 0;
3399 }
3400 else
3401 {
3402 ch = c;
3403 char_boundary = 0;
3404 }
3405 break;
3406 default:
3407 abort ();
3408 }
3409 }
3410 }
3411 }
3412
3413 #ifdef ENABLE_COMPOSITE_CHARS
3414 if (in_composite)
3415 {
3416 n = saved_n;
3417 src = saved_src;
3418 in_composite = 0;
3419 goto back_to_square_n; /* Wheeeeeeeee ..... */
3420 }
3421 #endif /* ENABLE_COMPOSITE_CHARS */
3422
3423 str->flags = flags;
3424 str->ch = ch;
3425 str->iso2022.current_char_boundary = char_boundary;
3426 str->iso2022.current_charset = charset;
3427
3428 /* Verbum caro factum est! */
3429 }
3430
3431
3432 /************************************************************************/
3433 /* UTF-8 methods */
3434 /************************************************************************/
3435
3436 static int
3437 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src,
3438 unsigned int n)
3439 {
3440 while (n--)
3441 {
3442 unsigned char c = *src++;
3443 switch (st->utf8.in_byte)
3444 {
3445 case 0:
3446 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
3447 return 0;
3448 else if (c >= 0xfc)
3449 st->utf8.in_byte = 5;
3450 else if (c >= 0xf8)
3451 st->utf8.in_byte = 4;
3452 else if (c >= 0xf0)
3453 st->utf8.in_byte = 3;
3454 else if (c >= 0xe0)
3455 st->utf8.in_byte = 2;
3456 else if (c >= 0xc0)
3457 st->utf8.in_byte = 1;
3458 else if (c >= 0x80)
3459 return 0;
3460 break;
3461 default:
3462 if ((c & 0xc0) != 0x80)
3463 return 0;
3464 else
3465 st->utf8.in_byte--;
3466 }
3467 }
3468 return CODING_CATEGORY_UTF8_MASK;
3469 }
3470
3471 static void
3472 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src,
3473 unsigned_char_dynarr *dst, unsigned int n)
3474 {
3475 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3476 unsigned int flags = str->flags;
3477 unsigned int ch = str->ch;
3478 eol_type_t eol_type = str->eol_type;
3479
3480 while (n--)
3481 {
3482 unsigned char c = *src++;
3483 switch (flags)
3484 {
3485 case 0:
3486 if ( c >= 0xfc )
3487 {
3488 ch = c & 0x01;
3489 flags = 5;
3490 }
3491 else if ( c >= 0xf8 )
3492 {
3493 ch = c & 0x03;
3494 flags = 4;
3495 }
3496 else if ( c >= 0xf0 )
3497 {
3498 ch = c & 0x07;
3499 flags = 3;
3500 }
3501 else if ( c >= 0xe0 )
3502 {
3503 ch = c & 0x0f;
3504 flags = 2;
3505 }
3506 else if ( c >= 0xc0 )
3507 {
3508 ch = c & 0x1f;
3509 flags = 1;
3510 }
3511 else
3512 {
3513 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst);
3514 decode_ucs4 (c, dst);
3515 }
3516 break;
3517 case 1:
3518 ch = ( ch << 6 ) | ( c & 0x3f );
3519 decode_ucs4 (ch, dst);
3520 ch = 0;
3521 flags = 0;
3522 break;
3523 default:
3524 ch = ( ch << 6 ) | ( c & 0x3f );
3525 flags--;
3526 }
3527 label_continue_loop:;
3528 }
3529
3530 if (flags & CODING_STATE_END)
3531 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3532
3533 str->flags = flags;
3534 str->ch = ch;
3535 }
3536
3537 static void
3538 encode_utf8 (Lisp_Object charset,
3539 unsigned char h, unsigned char l, unsigned_char_dynarr *dst)
3540 {
3541 unsigned long code = mule_char_to_ucs4 (charset, h, l);
3542 if ( code <= 0x7f )
3543 {
3544 Dynarr_add (dst, code);
3545 }
3546 else if ( code <= 0x7ff )
3547 {
3548 Dynarr_add (dst, (code >> 6) | 0xc0);
3549 Dynarr_add (dst, (code & 0x3f) | 0x80);
3550 }
3551 else if ( code <= 0xffff )
3552 {
3553 Dynarr_add (dst, (code >> 12) | 0xe0);
3554 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3555 Dynarr_add (dst, (code & 0x3f) | 0x80);
3556 }
3557 else if ( code <= 0x1fffff )
3558 {
3559 Dynarr_add (dst, (code >> 18) | 0xf0);
3560 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3561 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3562 Dynarr_add (dst, (code & 0x3f) | 0x80);
3563 }
3564 else if ( code <= 0x3ffffff )
3565 {
3566 Dynarr_add (dst, (code >> 24) | 0xf8);
3567 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3568 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3569 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3570 Dynarr_add (dst, (code & 0x3f) | 0x80);
3571 }
3572 else
3573 {
3574 Dynarr_add (dst, (code >> 30) | 0xfc);
3575 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80);
3576 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80);
3577 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80);
3578 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80);
3579 Dynarr_add (dst, (code & 0x3f) | 0x80);
3580 }
3581 }
3582
3583 static void
3584 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src,
3585 unsigned_char_dynarr *dst, unsigned int n)
3586 {
3587 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
3588 unsigned int flags = str->flags;
3589 unsigned int ch = str->ch;
3590 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
3591 unsigned char char_boundary = str->iso2022.current_char_boundary;
3592 Lisp_Object charset = str->iso2022.current_charset;
3593
3594 #ifdef ENABLE_COMPOSITE_CHARS
3595 /* flags for handling composite chars. We do a little switcharoo
3596 on the source while we're outputting the composite char. */
3597 unsigned int saved_n = 0;
3598 CONST unsigned char *saved_src = NULL;
3599 int in_composite = 0;
3600
3601 back_to_square_n:
3602 #endif /* ENABLE_COMPOSITE_CHARS */
3603
3604 while (n--)
3605 {
3606 unsigned char c = *src++;
3607
3608 if (BYTE_ASCII_P (c))
3609 { /* Processing ASCII character */
3610 ch = 0;
3611 if (c == '\n')
3612 {
3613 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT)
3614 Dynarr_add (dst, '\r');
3615 if (eol_type != EOL_CR)
3616 Dynarr_add (dst, c);
3617 }
3618 else
3619 encode_utf8 (Vcharset_ascii, c, 0, dst);
3620 char_boundary = 1;
3621 }
3622 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
3623 { /* Processing Leading Byte */
3624 ch = 0;
3625 charset = CHARSET_BY_LEADING_BYTE (c);
3626 if (LEADING_BYTE_PREFIX_P(c))
3627 ch = c;
3628 char_boundary = 0;
3629 }
3630 else
3631 { /* Processing Non-ASCII character */
3632 char_boundary = 1;
3633 if (EQ (charset, Vcharset_control_1))
3634 {
3635 encode_utf8 (Vcharset_control_1, c, 0, dst);
3636 }
3637 else
3638 {
3639 switch (XCHARSET_REP_BYTES (charset))
3640 {
3641 case 2:
3642 encode_utf8 (charset, c, 0, dst);
3643 break;
3644 case 3:
3645 if (XCHARSET_PRIVATE_P (charset))
3646 {
3647 encode_utf8 (charset, c, 0, dst);
3648 ch = 0;
3649 }
3650 else if (ch)
3651 {
3652 #ifdef ENABLE_COMPOSITE_CHARS
3653 if (EQ (charset, Vcharset_composite))
3654 {
3655 if (in_composite)
3656 {
3657 /* #### Bother! We don't know how to
3658 handle this yet. */
3659 encode_utf8 (Vcharset_ascii, '~', 0, dst);
3660 }
3661 else
3662 {
3663 Emchar emch = MAKE_CHAR (Vcharset_composite,
3664 ch & 0x7F, c & 0x7F);
3665 Lisp_Object lstr = composite_char_string (emch);
3666 saved_n = n;
3667 saved_src = src;
3668 in_composite = 1;
3669 src = XSTRING_DATA (lstr);
3670 n = XSTRING_LENGTH (lstr);
3671 }
3672 }
3673 else
3674 #endif /* ENABLE_COMPOSITE_CHARS */
3675 {
3676 encode_utf8 (charset, ch, c, dst);
3677 }
3678 ch = 0;
3679 }
3680 else
3681 {
3682 ch = c;
3683 char_boundary = 0;
3684 }
3685 break;
3686 case 4:
3687 if (ch)
3688 {
3689 encode_utf8 (charset, ch, c, dst);
3690 ch = 0;
3691 }
3692 else
3693 {
3694 ch = c;
3695 char_boundary = 0;
3696 }
3697 break;
3698 default:
3699 abort ();
3700 }
3701 }
3702 }
3703 }
3704
3705 #ifdef ENABLE_COMPOSITE_CHARS
3706 if (in_composite)
3707 {
3708 n = saved_n;
3709 src = saved_src;
3710 in_composite = 0;
3711 goto back_to_square_n; /* Wheeeeeeeee ..... */
3712 }
3713 #endif
3714
3715 str->flags = flags;
3716 str->ch = ch;
3717 str->iso2022.current_char_boundary = char_boundary;
3718 str->iso2022.current_charset = charset;
3719
3720 /* Verbum caro factum est! */
3721 }
3722
3723
3724 /************************************************************************/
3020 /* ISO2022 methods */ 3725 /* ISO2022 methods */
3021 /************************************************************************/ 3726 /************************************************************************/
3022 3727
3023 /* The following note describes the coding system ISO2022 briefly. 3728 /* The following note describes the coding system ISO2022 briefly.
3024 Since the intention of this note is to help understanding of the 3729 Since the intention of this note is to help understand the
3025 programs in this file, some parts are NOT ACCURATE or OVERLY 3730 functions in this file, some parts are NOT ACCURATE or OVERLY
3026 SIMPLIFIED. For thorough understanding, please refer to the 3731 SIMPLIFIED. For thorough understanding, please refer to the
3027 original document of ISO2022. 3732 original document of ISO2022.
3028 3733
3029 ISO2022 provides many mechanisms to encode several character sets 3734 ISO2022 provides many mechanisms to encode several character sets
3030 in 7-bit and 8-bit environments. If one chooses 7-bit environment, 3735 in 7-bit and 8-bit environments. For 7-bit environments, all text
3031 all text is encoded by codes of less than 128. This may make the 3736 is encoded using bytes less than 128. This may make the encoded
3032 encoded text a little bit longer, but the text get more stability 3737 text a little bit longer, but the text passes more easily through
3033 to pass through several gateways (some of them strip off MSB). 3738 several gateways, some of which strip off MSB (Most Significant Bit).
3034 3739
3035 There are two kind of character sets: control character set and 3740 There are two kinds of character sets: control character set and
3036 graphic character set. The former contains control characters such 3741 graphic character set. The former contains control characters such
3037 as `newline' and `escape' to provide control functions (control 3742 as `newline' and `escape' to provide control functions (control
3038 functions are provided also by escape sequence). The latter 3743 functions are also provided by escape sequences). The latter
3039 contains graphic characters such as 'A' and '-'. Emacs recognizes 3744 contains graphic characters such as 'A' and '-'. Emacs recognizes
3040 two control character sets and many graphic character sets. 3745 two control character sets and many graphic character sets.
3041 3746
3042 Graphic character sets are classified into one of four types, 3747 Graphic character sets are classified into one of the following
3043 according to the dimension and number of characters in the set: 3748 four classes, according to the number of bytes (DIMENSION) and
3044 TYPE94, TYPE96, TYPE94x94, and TYPE96x96. In addition, each 3749 number of characters in one dimension (CHARS) of the set:
3045 character set is assigned an identification byte, unique for each 3750 - DIMENSION1_CHARS94
3046 type, called "final character" (denoted as <F> hereafter). The <F> 3751 - DIMENSION1_CHARS96
3047 of each character set is decided by ECMA(*) when it is registered 3752 - DIMENSION2_CHARS94
3048 in ISO. Code range of <F> is 0x30..0x7F (0x30..0x3F are for 3753 - DIMENSION2_CHARS96
3049 private use only). 3754
3755 In addition, each character set is assigned an identification tag,
3756 unique for each set, called "final character" (denoted as <F>
3757 hereafter). The <F> of each character set is decided by ECMA(*)
3758 when it is registered in ISO. The code range of <F> is 0x30..0x7F
3759 (0x30..0x3F are for private use only).
3050 3760
3051 Note (*): ECMA = European Computer Manufacturers Association 3761 Note (*): ECMA = European Computer Manufacturers Association
3052 3762
3053 Here are examples of graphic character set [NAME(<F>)]: 3763 Here are examples of graphic character set [NAME(<F>)]:
3054 o TYPE94 -- ASCII('B'), right-half-of-JISX0201('I'), ... 3764 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3055 o TYPE96 -- right-half-of-ISO8859-1('A'), ... 3765 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
3056 o TYPE94x94 -- GB2312('A'), JISX0208('B'), ... 3766 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
3057 o TYPE96x96 -- none for the moment 3767 o DIMENSION2_CHARS96 -- none for the moment
3058 3768
3059 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR. 3769 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR.
3060 C0 [0x00..0x1F] -- control character plane 0 3770 C0 [0x00..0x1F] -- control character plane 0
3061 GL [0x20..0x7F] -- graphic character plane 0 3771 GL [0x20..0x7F] -- graphic character plane 0
3062 C1 [0x80..0x9F] -- control character plane 1 3772 C1 [0x80..0x9F] -- control character plane 1
3063 GR [0xA0..0xFF] -- graphic character plane 1 3773 GR [0xA0..0xFF] -- graphic character plane 1
3064 3774
3078 done independently. The most common case is that G0 is invoked to 3788 done independently. The most common case is that G0 is invoked to
3079 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually 3789 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3080 these invocations and designations are omitted in encoded text. 3790 these invocations and designations are omitted in encoded text.
3081 In a 7-bit environment, only GL can be used. 3791 In a 7-bit environment, only GL can be used.
3082 3792
3083 When a graphic character set of TYPE94 or TYPE94x94 is invoked to 3793 When a graphic character set of CHARS94 is invoked to GL, codes
3084 GL, codes 0x20 and 0x7F of the GL area work as control characters 3794 0x20 and 0x7F of the GL area work as control characters SPACE and
3085 SPACE and DEL respectively, and code 0xA0 and 0xFF of GR area 3795 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
3086 should not be used. 3796 be used.
3087 3797
3088 There are two ways of invocation: locking-shift and single-shift. 3798 There are two ways of invocation: locking-shift and single-shift.
3089 With locking-shift, the invocation lasts until the next different 3799 With locking-shift, the invocation lasts until the next different
3090 invocation, whereas with single-shift, the invocation works only 3800 invocation, whereas with single-shift, the invocation affects the
3091 for the following character and doesn't affect locking-shift. 3801 following character only and doesn't affect the locking-shift
3092 Invocations are done by the following control characters or escape 3802 state. Invocations are done by the following control characters or
3093 sequences. 3803 escape sequences:
3094 3804
3095 ---------------------------------------------------------------------- 3805 ----------------------------------------------------------------------
3096 abbrev function cntrl escape seq description 3806 abbrev function cntrl escape seq description
3097 ---------------------------------------------------------------------- 3807 ----------------------------------------------------------------------
3098 SI/LS0 (shift-in) 0x0F none invoke G0 into GL 3808 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3099 SO/LS1 (shift-out) 0x0E none invoke G1 into GL 3809 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3100 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR
3101 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL 3810 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3102 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR
3103 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL 3811 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3104 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR 3812 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
3813 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
3814 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
3105 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char 3815 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3106 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char 3816 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3107 ---------------------------------------------------------------------- 3817 ----------------------------------------------------------------------
3108 The first four are for locking-shift. Control characters for these 3818 (*) These are not used by any known coding system.
3109 functions are defined by macros ISO_CODE_XXX in `coding.h'. 3819
3110 3820 Control characters for these functions are defined by macros
3111 Designations are done by the following escape sequences. 3821 ISO_CODE_XXX in `coding.h'.
3822
3823 Designations are done by the following escape sequences:
3112 ---------------------------------------------------------------------- 3824 ----------------------------------------------------------------------
3113 escape sequence description 3825 escape sequence description
3114 ---------------------------------------------------------------------- 3826 ----------------------------------------------------------------------
3115 ESC '(' <F> designate TYPE94<F> to G0 3827 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
3116 ESC ')' <F> designate TYPE94<F> to G1 3828 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
3117 ESC '*' <F> designate TYPE94<F> to G2 3829 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
3118 ESC '+' <F> designate TYPE94<F> to G3 3830 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
3119 ESC ',' <F> designate TYPE96<F> to G0 (*) 3831 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
3120 ESC '-' <F> designate TYPE96<F> to G1 3832 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
3121 ESC '.' <F> designate TYPE96<F> to G2 3833 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
3122 ESC '/' <F> designate TYPE96<F> to G3 3834 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
3123 ESC '$' '(' <F> designate TYPE94x94<F> to G0 (**) 3835 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
3124 ESC '$' ')' <F> designate TYPE94x94<F> to G1 3836 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
3125 ESC '$' '*' <F> designate TYPE94x94<F> to G2 3837 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
3126 ESC '$' '+' <F> designate TYPE94x94<F> to G3 3838 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
3127 ESC '$' ',' <F> designate TYPE96x96<F> to G0 (*) 3839 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
3128 ESC '$' '-' <F> designate TYPE96x96<F> to G1 3840 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
3129 ESC '$' '.' <F> designate TYPE96x96<F> to G2 3841 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
3130 ESC '$' '/' <F> designate TYPE96x96<F> to G3 3842 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
3131 ---------------------------------------------------------------------- 3843 ----------------------------------------------------------------------
3132 In this list, "TYPE94<F>" means a graphic character set of type TYPE94 3844
3133 and final character <F>, and etc. 3845 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
3846 of dimension 1, chars 94, and final character <F>, etc...
3134 3847
3135 Note (*): Although these designations are not allowed in ISO2022, 3848 Note (*): Although these designations are not allowed in ISO2022,
3136 Emacs accepts them on decoding, and produces them on encoding 3849 Emacs accepts them on decoding, and produces them on encoding
3137 TYPE96 or TYPE96x96 character set in a coding system which is 3850 CHARS96 character sets in a coding system which is characterized as
3138 characterized as 7-bit environment, non-locking-shift, and 3851 7-bit environment, non-locking-shift, and non-single-shift.
3139 non-single-shift.
3140 3852
3141 Note (**): If <F> is '@', 'A', or 'B', the intermediate character 3853 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3142 '(' can be omitted. We call this as "short-form" here after. 3854 '(' can be omitted. We refer to this as "short-form" hereafter.
3143 3855
3144 Now you may notice that there are a lot of ways for encoding the 3856 Now you may notice that there are a lot of ways for encoding the
3145 same multilingual text in ISO2022. Actually, there exist many 3857 same multilingual text in ISO2022. Actually, there exist many
3146 coding systems such as Compound Text (used in X's inter client 3858 coding systems such as Compound Text (used in X11's inter client
3147 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR 3859 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3148 (used in Korean internet), EUC (Extended UNIX Code, used in Asian 3860 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3149 localized platforms), and all of these are variants of ISO2022. 3861 localized platforms), and all of these are variants of ISO2022.
3150 3862
3151 In addition to the above, Emacs handles two more kinds of escape 3863 In addition to the above, Emacs handles two more kinds of escape
3152 sequences: ISO6429's direction specification and Emacs' private 3864 sequences: ISO6429's direction specification and Emacs' private
3153 sequence for specifying character composition. 3865 sequence for specifying character composition.
3154 3866
3155 ISO6429's direction specification takes the following format: 3867 ISO6429's direction specification takes the following form:
3156 o CSI ']' -- end of the current direction 3868 o CSI ']' -- end of the current direction
3157 o CSI '0' ']' -- end of the current direction 3869 o CSI '0' ']' -- end of the current direction
3158 o CSI '1' ']' -- start of left-to-right text 3870 o CSI '1' ']' -- start of left-to-right text
3159 o CSI '2' ']' -- start of right-to-left text 3871 o CSI '2' ']' -- start of right-to-left text
3160 The control character CSI (0x9B: control sequence introducer) is 3872 The control character CSI (0x9B: control sequence introducer) is
3161 abbreviated to the escape sequence ESC '[' in 7-bit environment. 3873 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
3162 3874
3163 Character composition specification takes the following format: 3875 Character composition specification takes the following form:
3164 o ESC '0' -- start character composition 3876 o ESC '0' -- start character composition
3165 o ESC '1' -- end character composition 3877 o ESC '1' -- end character composition
3166 Since these are not standard escape sequences of any ISO, the use 3878 Since these are not standard escape sequences of any ISO standard,
3167 of them for these meanings is restricted to Emacs only. */ 3879 their use with these meanings is restricted to Emacs only. */
3168 3880
3169 static void 3881 static void
3170 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso) 3882 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3171 { 3883 {
3172 int i; 3884 int i;
3186 iso->register_right = 1; 3898 iso->register_right = 1;
3187 iso->switched_dir_and_no_valid_charset_yet = 0; 3899 iso->switched_dir_and_no_valid_charset_yet = 0;
3188 iso->invalid_switch_dir = 0; 3900 iso->invalid_switch_dir = 0;
3189 iso->output_direction_sequence = 0; 3901 iso->output_direction_sequence = 0;
3190 iso->output_literally = 0; 3902 iso->output_literally = 0;
3903 #ifdef ENABLE_COMPOSITE_CHARS
3191 if (iso->composite_chars) 3904 if (iso->composite_chars)
3192 Dynarr_reset (iso->composite_chars); 3905 Dynarr_reset (iso->composite_chars);
3906 #endif
3193 } 3907 }
3194 3908
3195 static int 3909 static int
3196 fit_to_be_escape_quoted (unsigned char c) 3910 fit_to_be_escape_quoted (unsigned char c)
3197 { 3911 {
3315 goto locking_shift; 4029 goto locking_shift;
3316 case '|': /* locking shift 3 right */ 4030 case '|': /* locking shift 3 right */
3317 reg = 3; half = 1; 4031 reg = 3; half = 1;
3318 goto locking_shift; 4032 goto locking_shift;
3319 4033
4034 #ifdef ENABLE_COMPOSITE_CHARS
3320 /**** composite ****/ 4035 /**** composite ****/
3321 4036
3322 case '0': 4037 case '0':
3323 iso->esc = ISO_ESC_START_COMPOSITE; 4038 iso->esc = ISO_ESC_START_COMPOSITE;
3324 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | 4039 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
3328 case '1': 4043 case '1':
3329 iso->esc = ISO_ESC_END_COMPOSITE; 4044 iso->esc = ISO_ESC_END_COMPOSITE;
3330 *flags = (*flags & CODING_STATE_ISO2022_LOCK) & 4045 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
3331 ~CODING_STATE_COMPOSITE; 4046 ~CODING_STATE_COMPOSITE;
3332 return 1; 4047 return 1;
4048 #endif /* ENABLE_COMPOSITE_CHARS */
3333 4049
3334 /**** directionality ****/ 4050 /**** directionality ****/
3335 4051
3336 case '[': 4052 case '[':
3337 iso->esc = ISO_ESC_5_11; 4053 iso->esc = ISO_ESC_5_11;
3711 Also update FLAGS if it is not a null pointer. 4427 Also update FLAGS if it is not a null pointer.
3712 If INTERNAL_P is set, we are outputting in internal format and 4428 If INTERNAL_P is set, we are outputting in internal format and
3713 need to handle the CSI differently. */ 4429 need to handle the CSI differently. */
3714 4430
3715 static void 4431 static void
3716 restore_left_to_right_direction (struct Lisp_Coding_System *codesys, 4432 restore_left_to_right_direction (Lisp_Coding_System *codesys,
3717 unsigned_char_dynarr *dst, 4433 unsigned_char_dynarr *dst,
3718 unsigned int *flags, 4434 unsigned int *flags,
3719 int internal_p) 4435 int internal_p)
3720 { 4436 {
3721 if (!flags || (*flags & CODING_STATE_R2L)) 4437 if (!flags || (*flags & CODING_STATE_R2L))
3742 sequence to DST. Also update FLAGS if it is not a null pointer. 4458 sequence to DST. Also update FLAGS if it is not a null pointer.
3743 If INTERNAL_P is set, we are outputting in internal format and 4459 If INTERNAL_P is set, we are outputting in internal format and
3744 need to handle the CSI differently. */ 4460 need to handle the CSI differently. */
3745 4461
3746 static void 4462 static void
3747 ensure_correct_direction (int direction, struct Lisp_Coding_System *codesys, 4463 ensure_correct_direction (int direction, Lisp_Coding_System *codesys,
3748 unsigned_char_dynarr *dst, unsigned int *flags, 4464 unsigned_char_dynarr *dst, unsigned int *flags,
3749 int internal_p) 4465 int internal_p)
3750 { 4466 {
3751 if ((!flags || (*flags & CODING_STATE_R2L)) && 4467 if ((!flags || (*flags & CODING_STATE_R2L)) &&
3752 direction == CHARSET_LEFT_TO_RIGHT) 4468 direction == CHARSET_LEFT_TO_RIGHT)
3775 4491
3776 static void 4492 static void
3777 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src, 4493 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
3778 unsigned_char_dynarr *dst, unsigned int n) 4494 unsigned_char_dynarr *dst, unsigned int n)
3779 { 4495 {
3780 unsigned int flags, ch;
3781 enum eol_type eol_type;
3782 struct decoding_stream *str = DECODING_STREAM_DATA (decoding); 4496 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4497 unsigned int flags = str->flags;
4498 unsigned int ch = str->ch;
4499 eol_type_t eol_type = str->eol_type;
4500 #ifdef ENABLE_COMPOSITE_CHARS
4501 unsigned_char_dynarr *real_dst = dst;
4502 #endif
3783 Lisp_Object coding_system; 4503 Lisp_Object coding_system;
3784 unsigned_char_dynarr *real_dst = dst; 4504
3785
3786 CODING_STREAM_DECOMPOSE (str, flags, ch);
3787 eol_type = str->eol_type;
3788 XSETCODING_SYSTEM (coding_system, str->codesys); 4505 XSETCODING_SYSTEM (coding_system, str->codesys);
3789 4506
4507 #ifdef ENABLE_COMPOSITE_CHARS
3790 if (flags & CODING_STATE_COMPOSITE) 4508 if (flags & CODING_STATE_COMPOSITE)
3791 dst = str->iso2022.composite_chars; 4509 dst = str->iso2022.composite_chars;
4510 #endif /* ENABLE_COMPOSITE_CHARS */
3792 4511
3793 while (n--) 4512 while (n--)
3794 { 4513 {
3795 unsigned char c = *src++; 4514 unsigned char c = *src++;
3796 if (flags & CODING_STATE_ESCAPE) 4515 if (flags & CODING_STATE_ESCAPE)
3800 4519
3801 if (retval) 4520 if (retval)
3802 { 4521 {
3803 switch (str->iso2022.esc) 4522 switch (str->iso2022.esc)
3804 { 4523 {
4524 #ifdef ENABLE_COMPOSITE_CHARS
3805 case ISO_ESC_START_COMPOSITE: 4525 case ISO_ESC_START_COMPOSITE:
3806 if (str->iso2022.composite_chars) 4526 if (str->iso2022.composite_chars)
3807 Dynarr_reset (str->iso2022.composite_chars); 4527 Dynarr_reset (str->iso2022.composite_chars);
3808 else 4528 else
3809 str->iso2022.composite_chars = Dynarr_new (unsigned_char); 4529 str->iso2022.composite_chars = Dynarr_new (unsigned_char);
3818 dst = real_dst; 4538 dst = real_dst;
3819 len = set_charptr_emchar (comstr, emch); 4539 len = set_charptr_emchar (comstr, emch);
3820 Dynarr_add_many (dst, comstr, len); 4540 Dynarr_add_many (dst, comstr, len);
3821 break; 4541 break;
3822 } 4542 }
4543 #endif /* ENABLE_COMPOSITE_CHARS */
3823 4544
3824 case ISO_ESC_LITERAL: 4545 case ISO_ESC_LITERAL:
3825 DECODE_ADD_BINARY_CHAR (c, dst); 4546 DECODE_ADD_BINARY_CHAR (c, dst);
3826 break; 4547 break;
3827 4548
3995 } 4716 }
3996 4717
3997 if (flags & CODING_STATE_END) 4718 if (flags & CODING_STATE_END)
3998 DECODE_OUTPUT_PARTIAL_CHAR (ch); 4719 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3999 4720
4000 CODING_STREAM_COMPOSE (str, flags, ch); 4721 str->flags = flags;
4722 str->ch = ch;
4001 } 4723 }
4002 4724
4003 4725
4004 /***** ISO2022 encoder *****/ 4726 /***** ISO2022 encoder *****/
4005 4727
4007 4729
4008 static void 4730 static void
4009 iso2022_designate (Lisp_Object charset, unsigned char reg, 4731 iso2022_designate (Lisp_Object charset, unsigned char reg,
4010 struct encoding_stream *str, unsigned_char_dynarr *dst) 4732 struct encoding_stream *str, unsigned_char_dynarr *dst)
4011 { 4733 {
4012 CONST char *inter94 = "()*+", *inter96= ",-./"; 4734 static CONST char inter94[] = "()*+";
4735 static CONST char inter96[] = ",-./";
4013 unsigned int type; 4736 unsigned int type;
4014 unsigned char final; 4737 unsigned char final;
4015 Lisp_Object old_charset = str->iso2022.charset[reg]; 4738 Lisp_Object old_charset = str->iso2022.charset[reg];
4016 4739
4017 str->iso2022.charset[reg] = charset; 4740 str->iso2022.charset[reg] = charset;
4095 static void 4818 static void
4096 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src, 4819 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
4097 unsigned_char_dynarr *dst, unsigned int n) 4820 unsigned_char_dynarr *dst, unsigned int n)
4098 { 4821 {
4099 unsigned char charmask, c; 4822 unsigned char charmask, c;
4100 unsigned int flags, ch;
4101 enum eol_type eol_type;
4102 unsigned char char_boundary; 4823 unsigned char char_boundary;
4103 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); 4824 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4104 struct Lisp_Coding_System *codesys = str->codesys; 4825 unsigned int flags = str->flags;
4826 unsigned int ch = str->ch;
4827 Lisp_Coding_System *codesys = str->codesys;
4828 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4105 int i; 4829 int i;
4106 Lisp_Object charset; 4830 Lisp_Object charset;
4107 int half; 4831 int half;
4108 4832
4833 #ifdef ENABLE_COMPOSITE_CHARS
4109 /* flags for handling composite chars. We do a little switcharoo 4834 /* flags for handling composite chars. We do a little switcharoo
4110 on the source while we're outputting the composite char. */ 4835 on the source while we're outputting the composite char. */
4111 unsigned int saved_n = 0; 4836 unsigned int saved_n = 0;
4112 CONST unsigned char *saved_src = NULL; 4837 CONST unsigned char *saved_src = NULL;
4113 int in_composite = 0; 4838 int in_composite = 0;
4114 4839 #endif /* ENABLE_COMPOSITE_CHARS */
4115 CODING_STREAM_DECOMPOSE (str, flags, ch); 4840
4116 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4117 char_boundary = str->iso2022.current_char_boundary; 4841 char_boundary = str->iso2022.current_char_boundary;
4118 charset = str->iso2022.current_charset; 4842 charset = str->iso2022.current_charset;
4119 half = str->iso2022.current_half; 4843 half = str->iso2022.current_half;
4120 4844
4845 #ifdef ENABLE_COMPOSITE_CHARS
4121 back_to_square_n: 4846 back_to_square_n:
4847 #endif
4122 while (n--) 4848 while (n--)
4123 { 4849 {
4124 c = *src++; 4850 c = *src++;
4125 4851
4126 if (BYTE_ASCII_P (c)) 4852 if (BYTE_ASCII_P (c))
4175 ch = 0; 4901 ch = 0;
4176 charset = CHARSET_BY_LEADING_BYTE (c); 4902 charset = CHARSET_BY_LEADING_BYTE (c);
4177 if (LEADING_BYTE_PREFIX_P(c)) 4903 if (LEADING_BYTE_PREFIX_P(c))
4178 ch = c; 4904 ch = c;
4179 else if (!EQ (charset, Vcharset_control_1) 4905 else if (!EQ (charset, Vcharset_control_1)
4180 && !EQ (charset, Vcharset_composite)) 4906 #ifdef ENABLE_COMPOSITE_CHARS
4907 && !EQ (charset, Vcharset_composite)
4908 #endif
4909 )
4181 { 4910 {
4182 int reg; 4911 int reg;
4183 4912
4184 ensure_correct_direction (XCHARSET_DIRECTION (charset), 4913 ensure_correct_direction (XCHARSET_DIRECTION (charset),
4185 codesys, dst, &flags, 0); 4914 codesys, dst, &flags, 0);
4295 Dynarr_add (dst, c & charmask); 5024 Dynarr_add (dst, c & charmask);
4296 ch = 0; 5025 ch = 0;
4297 } 5026 }
4298 else if (ch) 5027 else if (ch)
4299 { 5028 {
5029 #ifdef ENABLE_COMPOSITE_CHARS
4300 if (EQ (charset, Vcharset_composite)) 5030 if (EQ (charset, Vcharset_composite))
4301 { 5031 {
4302 if (in_composite) 5032 if (in_composite)
4303 { 5033 {
4304 /* #### Bother! We don't know how to 5034 /* #### Bother! We don't know how to
4318 Dynarr_add (dst, ISO_CODE_ESC); 5048 Dynarr_add (dst, ISO_CODE_ESC);
4319 Dynarr_add (dst, '0'); /* start composing */ 5049 Dynarr_add (dst, '0'); /* start composing */
4320 } 5050 }
4321 } 5051 }
4322 else 5052 else
5053 #endif /* ENABLE_COMPOSITE_CHARS */
4323 { 5054 {
4324 Dynarr_add (dst, ch & charmask); 5055 Dynarr_add (dst, ch & charmask);
4325 Dynarr_add (dst, c & charmask); 5056 Dynarr_add (dst, c & charmask);
4326 } 5057 }
4327 ch = 0; 5058 ch = 0;
4350 } 5081 }
4351 } 5082 }
4352 } 5083 }
4353 } 5084 }
4354 5085
5086 #ifdef ENABLE_COMPOSITE_CHARS
4355 if (in_composite) 5087 if (in_composite)
4356 { 5088 {
4357 n = saved_n; 5089 n = saved_n;
4358 src = saved_src; 5090 src = saved_src;
4359 in_composite = 0; 5091 in_composite = 0;
4360 Dynarr_add (dst, ISO_CODE_ESC); 5092 Dynarr_add (dst, ISO_CODE_ESC);
4361 Dynarr_add (dst, '1'); /* end composing */ 5093 Dynarr_add (dst, '1'); /* end composing */
4362 goto back_to_square_n; /* Wheeeeeeeee ..... */ 5094 goto back_to_square_n; /* Wheeeeeeeee ..... */
4363 } 5095 }
5096 #endif /* ENABLE_COMPOSITE_CHARS */
4364 5097
4365 if (char_boundary && flags & CODING_STATE_END) 5098 if (char_boundary && flags & CODING_STATE_END)
4366 { 5099 {
4367 restore_left_to_right_direction (codesys, dst, &flags, 0); 5100 restore_left_to_right_direction (codesys, dst, &flags, 0);
4368 ensure_normal_shift (str, dst); 5101 ensure_normal_shift (str, dst);
4372 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i); 5105 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4373 iso2022_designate (initial_charset, i, str, dst); 5106 iso2022_designate (initial_charset, i, str, dst);
4374 } 5107 }
4375 } 5108 }
4376 5109
4377 CODING_STREAM_COMPOSE (str, flags, ch); 5110 str->flags = flags;
5111 str->ch = ch;
4378 str->iso2022.current_char_boundary = char_boundary; 5112 str->iso2022.current_char_boundary = char_boundary;
4379 str->iso2022.current_charset = charset; 5113 str->iso2022.current_charset = charset;
4380 str->iso2022.current_half = half; 5114 str->iso2022.current_half = half;
4381 5115
4382 /* Verbum caro factum est! */ 5116 /* Verbum caro factum est! */
4393 static void 5127 static void
4394 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src, 5128 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
4395 unsigned_char_dynarr *dst, unsigned int n) 5129 unsigned_char_dynarr *dst, unsigned int n)
4396 { 5130 {
4397 unsigned char c; 5131 unsigned char c;
4398 unsigned int flags, ch;
4399 enum eol_type eol_type;
4400 struct decoding_stream *str = DECODING_STREAM_DATA (decoding); 5132 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4401 5133 unsigned int flags = str->flags;
4402 CODING_STREAM_DECOMPOSE (str, flags, ch); 5134 unsigned int ch = str->ch;
4403 eol_type = str->eol_type; 5135 eol_type_t eol_type = str->eol_type;
4404 5136
4405 while (n--) 5137 while (n--)
4406 { 5138 {
4407 c = *src++; 5139 c = *src++;
4408 5140
4411 label_continue_loop:; 5143 label_continue_loop:;
4412 } 5144 }
4413 5145
4414 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst); 5146 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
4415 5147
4416 CODING_STREAM_COMPOSE (str, flags, ch); 5148 str->flags = flags;
5149 str->ch = ch;
4417 } 5150 }
4418 5151
4419 static void 5152 static void
4420 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src, 5153 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
4421 unsigned_char_dynarr *dst, unsigned int n) 5154 unsigned_char_dynarr *dst, unsigned int n)
4422 { 5155 {
4423 unsigned char c; 5156 unsigned char c;
4424 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); 5157 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4425 unsigned int flags, ch; 5158 unsigned int flags = str->flags;
4426 enum eol_type eol_type; 5159 unsigned int ch = str->ch;
4427 5160 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4428 CODING_STREAM_DECOMPOSE (str, flags, ch);
4429 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys);
4430 5161
4431 while (n--) 5162 while (n--)
4432 { 5163 {
4433 c = *src++; 5164 c = *src++;
4434 if (c == '\n') 5165 if (c == '\n')
4466 untranslatable character, so ignore it */ 5197 untranslatable character, so ignore it */
4467 ch = 0; 5198 ch = 0;
4468 } 5199 }
4469 } 5200 }
4470 5201
4471 CODING_STREAM_COMPOSE (str, flags, ch); 5202 str->flags = flags;
5203 str->ch = ch;
4472 } 5204 }
4473 5205
4474 5206
4475 /************************************************************************/ 5207 /************************************************************************/
4476 /* Simple internal/external functions */ 5208 /* Simple internal/external functions */
4668 #ifdef MULE 5400 #ifdef MULE
4669 DEFSUBR (Fdecode_shift_jis_char); 5401 DEFSUBR (Fdecode_shift_jis_char);
4670 DEFSUBR (Fencode_shift_jis_char); 5402 DEFSUBR (Fencode_shift_jis_char);
4671 DEFSUBR (Fdecode_big5_char); 5403 DEFSUBR (Fdecode_big5_char);
4672 DEFSUBR (Fencode_big5_char); 5404 DEFSUBR (Fencode_big5_char);
5405 DEFSUBR (Fset_ucs_char);
5406 DEFSUBR (Fucs_char);
5407 DEFSUBR (Fset_char_ucs);
5408 DEFSUBR (Fchar_ucs);
4673 #endif /* MULE */ 5409 #endif /* MULE */
4674 defsymbol (&Qcoding_system_p, "coding-system-p"); 5410 defsymbol (&Qcoding_system_p, "coding-system-p");
4675 defsymbol (&Qno_conversion, "no-conversion"); 5411 defsymbol (&Qno_conversion, "no-conversion");
4676 #ifdef MULE 5412 #ifdef MULE
4677 defsymbol (&Qbig5, "big5"); 5413 defsymbol (&Qbig5, "big5");
4678 defsymbol (&Qshift_jis, "shift-jis"); 5414 defsymbol (&Qshift_jis, "shift-jis");
5415 defsymbol (&Qucs4, "ucs-4");
5416 defsymbol (&Qutf8, "utf-8");
4679 defsymbol (&Qccl, "ccl"); 5417 defsymbol (&Qccl, "ccl");
4680 defsymbol (&Qiso2022, "iso2022"); 5418 defsymbol (&Qiso2022, "iso2022");
4681 #endif /* MULE */ 5419 #endif /* MULE */
4682 defsymbol (&Qmnemonic, "mnemonic"); 5420 defsymbol (&Qmnemonic, "mnemonic");
4683 defsymbol (&Qeol_type, "eol-type"); 5421 defsymbol (&Qeol_type, "eol-type");
4717 defsymbol (&Qctext, "ctext"); 5455 defsymbol (&Qctext, "ctext");
4718 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS], 5456 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
4719 "shift-jis"); 5457 "shift-jis");
4720 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5], 5458 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
4721 "big5"); 5459 "big5");
5460 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4],
5461 "ucs-4");
5462 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8],
5463 "utf-8");
4722 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7], 5464 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
4723 "iso-7"); 5465 "iso-7");
4724 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE], 5466 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
4725 "iso-8-designate"); 5467 "iso-8-designate");
4726 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1], 5468 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
4870 Qbinary); 5612 Qbinary);
4871 5613
4872 /* Need this for bootstrapping */ 5614 /* Need this for bootstrapping */
4873 coding_category_system[CODING_CATEGORY_NO_CONVERSION] = 5615 coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
4874 Fget_coding_system (Qno_conversion); 5616 Fget_coding_system (Qno_conversion);
4875 } 5617
5618 #ifdef MULE
5619 {
5620 unsigned int i;
5621
5622 for (i = 0; i < 65536; i++)
5623 ucs_to_mule_table[i] = Qnil;
5624 }
5625 staticpro (&mule_to_ucs_table);
5626 mule_to_ucs_table = Fmake_char_table(Qgeneric);
5627 #endif /* MULE */
5628 }