Mercurial > hg > xemacs-beta
comparison src/file-coding.c @ 396:6719134a07c2 r21-2-13
Import from CVS: tag r21-2-13
author | cvs |
---|---|
date | Mon, 13 Aug 2007 11:12:05 +0200 |
parents | aabb7f5b1c81 |
children | 74fd4e045ea6 |
comparison
equal
deleted
inserted
replaced
395:de2c2a7459d2 | 396:6719134a07c2 |
---|---|
29 #include "elhash.h" | 29 #include "elhash.h" |
30 #include "insdel.h" | 30 #include "insdel.h" |
31 #include "lstream.h" | 31 #include "lstream.h" |
32 #ifdef MULE | 32 #ifdef MULE |
33 #include "mule-ccl.h" | 33 #include "mule-ccl.h" |
34 #include "chartab.h" | |
34 #endif | 35 #endif |
35 #include "file-coding.h" | 36 #include "file-coding.h" |
36 | 37 |
37 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error; | 38 Lisp_Object Qbuffer_file_coding_system, Qcoding_system_error; |
38 | 39 |
62 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf; | 63 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf; |
63 Lisp_Object Qpost_read_conversion; | 64 Lisp_Object Qpost_read_conversion; |
64 Lisp_Object Qpre_write_conversion; | 65 Lisp_Object Qpre_write_conversion; |
65 | 66 |
66 #ifdef MULE | 67 #ifdef MULE |
68 Lisp_Object Qucs4, Qutf8; | |
67 Lisp_Object Qbig5, Qshift_jis; | 69 Lisp_Object Qbig5, Qshift_jis; |
68 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3; | 70 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3; |
69 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output; | 71 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output; |
70 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output; | 72 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output; |
71 Lisp_Object Qno_iso6429; | 73 Lisp_Object Qno_iso6429; |
101 unsigned char esc_bytes[8]; | 103 unsigned char esc_bytes[8]; |
102 | 104 |
103 /* Index for next byte to store in ISO escape sequence. */ | 105 /* Index for next byte to store in ISO escape sequence. */ |
104 int esc_bytes_index; | 106 int esc_bytes_index; |
105 | 107 |
108 #ifdef ENABLE_COMPOSITE_CHARS | |
106 /* Stuff seen so far when composing a string. */ | 109 /* Stuff seen so far when composing a string. */ |
107 unsigned_char_dynarr *composite_chars; | 110 unsigned_char_dynarr *composite_chars; |
111 #endif | |
108 | 112 |
109 /* If we saw an invalid designation sequence for a particular | 113 /* If we saw an invalid designation sequence for a particular |
110 register, we flag it here and switch to ASCII. The next time we | 114 register, we flag it here and switch to ASCII. The next time we |
111 see a valid designation for this register, we turn off the flag | 115 see a valid designation for this register, we turn off the flag |
112 and do the designation normally, but pretend the sequence was | 116 and do the designation normally, but pretend the sequence was |
164 CONST unsigned char *src, | 168 CONST unsigned char *src, |
165 unsigned_char_dynarr *dst, unsigned int n); | 169 unsigned_char_dynarr *dst, unsigned int n); |
166 static void encode_coding_big5 (Lstream *encoding, | 170 static void encode_coding_big5 (Lstream *encoding, |
167 CONST unsigned char *src, | 171 CONST unsigned char *src, |
168 unsigned_char_dynarr *dst, unsigned int n); | 172 unsigned_char_dynarr *dst, unsigned int n); |
173 static int detect_coding_ucs4 (struct detection_state *st, | |
174 CONST unsigned char *src, | |
175 unsigned int n); | |
176 static void decode_coding_ucs4 (Lstream *decoding, | |
177 CONST unsigned char *src, | |
178 unsigned_char_dynarr *dst, unsigned int n); | |
179 static void encode_coding_ucs4 (Lstream *encoding, | |
180 CONST unsigned char *src, | |
181 unsigned_char_dynarr *dst, unsigned int n); | |
182 static int detect_coding_utf8 (struct detection_state *st, | |
183 CONST unsigned char *src, | |
184 unsigned int n); | |
185 static void decode_coding_utf8 (Lstream *decoding, | |
186 CONST unsigned char *src, | |
187 unsigned_char_dynarr *dst, unsigned int n); | |
188 static void encode_coding_utf8 (Lstream *encoding, | |
189 CONST unsigned char *src, | |
190 unsigned_char_dynarr *dst, unsigned int n); | |
169 static int postprocess_iso2022_mask (int mask); | 191 static int postprocess_iso2022_mask (int mask); |
170 static void reset_iso2022 (Lisp_Object coding_system, | 192 static void reset_iso2022 (Lisp_Object coding_system, |
171 struct iso2022_decoder *iso); | 193 struct iso2022_decoder *iso); |
172 static int detect_coding_iso2022 (struct detection_state *st, | 194 static int detect_coding_iso2022 (struct detection_state *st, |
173 CONST unsigned char *src, | 195 CONST unsigned char *src, |
228 0, 0, struct Lisp_Coding_System); | 250 0, 0, struct Lisp_Coding_System); |
229 | 251 |
230 static Lisp_Object | 252 static Lisp_Object |
231 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object)) | 253 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object)) |
232 { | 254 { |
233 struct Lisp_Coding_System *codesys = XCODING_SYSTEM (obj); | 255 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj); |
234 | 256 |
235 markobj (CODING_SYSTEM_NAME (codesys)); | 257 markobj (CODING_SYSTEM_NAME (codesys)); |
236 markobj (CODING_SYSTEM_DOC_STRING (codesys)); | 258 markobj (CODING_SYSTEM_DOC_STRING (codesys)); |
237 markobj (CODING_SYSTEM_MNEMONIC (codesys)); | 259 markobj (CODING_SYSTEM_MNEMONIC (codesys)); |
238 markobj (CODING_SYSTEM_EOL_LF (codesys)); | 260 markobj (CODING_SYSTEM_EOL_LF (codesys)); |
283 | 305 |
284 static void | 306 static void |
285 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun, | 307 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun, |
286 int escapeflag) | 308 int escapeflag) |
287 { | 309 { |
288 struct Lisp_Coding_System *c = XCODING_SYSTEM (obj); | 310 Lisp_Coding_System *c = XCODING_SYSTEM (obj); |
289 if (print_readably) | 311 if (print_readably) |
290 error ("printing unreadable object #<coding_system 0x%x>", | 312 error ("printing unreadable object #<coding_system 0x%x>", |
291 c->header.uid); | 313 c->header.uid); |
292 | 314 |
293 write_c_string ("#<coding_system ", printcharfun); | 315 write_c_string ("#<coding_system ", printcharfun); |
296 } | 318 } |
297 | 319 |
298 static void | 320 static void |
299 finalize_coding_system (void *header, int for_disksave) | 321 finalize_coding_system (void *header, int for_disksave) |
300 { | 322 { |
301 struct Lisp_Coding_System *c = (struct Lisp_Coding_System *) header; | 323 Lisp_Coding_System *c = (Lisp_Coding_System *) header; |
302 /* Since coding systems never go away, this function is not | 324 /* Since coding systems never go away, this function is not |
303 necessary. But it would be necessary if we changed things | 325 necessary. But it would be necessary if we changed things |
304 so that coding systems could go away. */ | 326 so that coding systems could go away. */ |
305 if (!for_disksave) /* see comment in lstream.c */ | 327 if (!for_disksave) /* see comment in lstream.c */ |
306 { | 328 { |
351 case EOL_AUTODETECT: return Qnil; | 373 case EOL_AUTODETECT: return Qnil; |
352 } | 374 } |
353 } | 375 } |
354 | 376 |
355 static void | 377 static void |
356 setup_eol_coding_systems (struct Lisp_Coding_System *codesys) | 378 setup_eol_coding_systems (Lisp_Coding_System *codesys) |
357 { | 379 { |
358 Lisp_Object codesys_obj; | 380 Lisp_Object codesys_obj; |
359 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name); | 381 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name); |
360 char *codesys_name = (char *) alloca (len + 7); | 382 char *codesys_name = (char *) alloca (len + 7); |
361 int mlen = -1; | 383 int mlen = -1; |
503 { | 525 { |
504 coding_system = Fget_coding_system (coding_system); | 526 coding_system = Fget_coding_system (coding_system); |
505 return XCODING_SYSTEM_NAME (coding_system); | 527 return XCODING_SYSTEM_NAME (coding_system); |
506 } | 528 } |
507 | 529 |
508 static struct Lisp_Coding_System * | 530 static Lisp_Coding_System * |
509 allocate_coding_system (enum coding_system_type type, Lisp_Object name) | 531 allocate_coding_system (enum coding_system_type type, Lisp_Object name) |
510 { | 532 { |
511 struct Lisp_Coding_System *codesys = | 533 Lisp_Coding_System *codesys = |
512 alloc_lcrecord_type (struct Lisp_Coding_System, lrecord_coding_system); | 534 alloc_lcrecord_type (Lisp_Coding_System, lrecord_coding_system); |
513 | 535 |
514 zero_lcrecord (codesys); | 536 zero_lcrecord (codesys); |
515 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil; | 537 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil; |
516 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil; | 538 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil; |
517 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT; | 539 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT; |
606 graphic characters that are not in ASCII or Latin-1 will be | 628 graphic characters that are not in ASCII or Latin-1 will be |
607 replaced by a ?. (For a no-conversion-encoded buffer, these | 629 replaced by a ?. (For a no-conversion-encoded buffer, these |
608 characters will only be present if you explicitly insert them.) | 630 characters will only be present if you explicitly insert them.) |
609 'shift-jis | 631 'shift-jis |
610 Shift-JIS (a Japanese encoding commonly used in PC operating systems). | 632 Shift-JIS (a Japanese encoding commonly used in PC operating systems). |
633 'ucs-4 | |
634 ISO 10646 UCS-4 encoding. | |
635 'utf-8 | |
636 ISO 10646 UTF-8 encoding. | |
611 'iso2022 | 637 'iso2022 |
612 Any ISO2022-compliant encoding. Among other things, this includes | 638 Any ISO2022-compliant encoding. Among other things, this includes |
613 JIS (the Japanese encoding commonly used for e-mail), EUC (the | 639 JIS (the Japanese encoding commonly used for e-mail), EUC (the |
614 standard Unix encoding for Japanese and other languages), and | 640 standard Unix encoding for Japanese and other languages), and |
615 Compound Text (the encoding used in X11). You can specify more | 641 Compound Text (the encoding used in X11). You can specify more |
760 'encode | 786 'encode |
761 CCL program used for encoding (converting to external format). | 787 CCL program used for encoding (converting to external format). |
762 */ | 788 */ |
763 (name, type, doc_string, props)) | 789 (name, type, doc_string, props)) |
764 { | 790 { |
765 struct Lisp_Coding_System *codesys; | 791 Lisp_Coding_System *codesys; |
766 Lisp_Object rest, key, value; | 792 Lisp_Object rest, key, value; |
767 enum coding_system_type ty; | 793 enum coding_system_type ty; |
768 int need_to_setup_eol_systems = 1; | 794 int need_to_setup_eol_systems = 1; |
769 | 795 |
770 /* Convert type to constant */ | 796 /* Convert type to constant */ |
772 { ty = CODESYS_AUTODETECT; } | 798 { ty = CODESYS_AUTODETECT; } |
773 #ifdef MULE | 799 #ifdef MULE |
774 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; } | 800 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; } |
775 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; } | 801 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; } |
776 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; } | 802 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; } |
803 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; } | |
804 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; } | |
777 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; } | 805 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; } |
778 #endif | 806 #endif |
779 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; } | 807 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; } |
780 #ifdef DEBUG_XEMACS | 808 #ifdef DEBUG_XEMACS |
781 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; } | 809 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; } |
909 new_name)); | 937 new_name)); |
910 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table); | 938 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table); |
911 } | 939 } |
912 | 940 |
913 { | 941 { |
914 struct Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system); | 942 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system); |
915 struct Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system); | 943 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system); |
916 memcpy (((char *) to ) + sizeof (to->header), | 944 memcpy (((char *) to ) + sizeof (to->header), |
917 ((char *) from) + sizeof (from->header), | 945 ((char *) from) + sizeof (from->header), |
918 sizeof (*from) - sizeof (from->header)); | 946 sizeof (*from) - sizeof (from->header)); |
919 to->name = new_name; | 947 to->name = new_name; |
920 } | 948 } |
922 } | 950 } |
923 | 951 |
924 static Lisp_Object | 952 static Lisp_Object |
925 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type) | 953 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type) |
926 { | 954 { |
927 struct Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system); | 955 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system); |
928 Lisp_Object new_coding_system; | 956 Lisp_Object new_coding_system; |
929 | 957 |
930 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT) | 958 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT) |
931 return coding_system; | 959 return coding_system; |
932 | 960 |
978 case CODESYS_AUTODETECT: return Qundecided; | 1006 case CODESYS_AUTODETECT: return Qundecided; |
979 #ifdef MULE | 1007 #ifdef MULE |
980 case CODESYS_SHIFT_JIS: return Qshift_jis; | 1008 case CODESYS_SHIFT_JIS: return Qshift_jis; |
981 case CODESYS_ISO2022: return Qiso2022; | 1009 case CODESYS_ISO2022: return Qiso2022; |
982 case CODESYS_BIG5: return Qbig5; | 1010 case CODESYS_BIG5: return Qbig5; |
1011 case CODESYS_UCS4: return Qucs4; | |
1012 case CODESYS_UTF8: return Qutf8; | |
983 case CODESYS_CCL: return Qccl; | 1013 case CODESYS_CCL: return Qccl; |
984 #endif | 1014 #endif |
985 case CODESYS_NO_CONVERSION: return Qno_conversion; | 1015 case CODESYS_NO_CONVERSION: return Qno_conversion; |
986 #ifdef DEBUG_XEMACS | 1016 #ifdef DEBUG_XEMACS |
987 case CODESYS_INTERNAL: return Qinternal; | 1017 case CODESYS_INTERNAL: return Qinternal; |
1280 shift_jis; | 1310 shift_jis; |
1281 | 1311 |
1282 struct | 1312 struct |
1283 { | 1313 { |
1284 int mask; | 1314 int mask; |
1315 int in_byte; | |
1316 } | |
1317 ucs4; | |
1318 | |
1319 struct | |
1320 { | |
1321 int mask; | |
1322 int in_byte; | |
1323 } | |
1324 utf8; | |
1325 | |
1326 struct | |
1327 { | |
1328 int mask; | |
1285 int initted; | 1329 int initted; |
1286 struct iso2022_decoder iso; | 1330 struct iso2022_decoder iso; |
1287 unsigned int flags; | 1331 unsigned int flags; |
1288 int high_byte_count; | 1332 int high_byte_count; |
1289 unsigned int saw_single_shift:1; | 1333 unsigned int saw_single_shift:1; |
1396 { | 1440 { |
1397 st->seen_non_ascii = 1; | 1441 st->seen_non_ascii = 1; |
1398 #ifdef MULE | 1442 #ifdef MULE |
1399 st->shift_jis.mask = ~0; | 1443 st->shift_jis.mask = ~0; |
1400 st->big5.mask = ~0; | 1444 st->big5.mask = ~0; |
1445 st->ucs4.mask = ~0; | |
1446 st->utf8.mask = ~0; | |
1401 st->iso2022.mask = ~0; | 1447 st->iso2022.mask = ~0; |
1402 #endif | 1448 #endif |
1403 break; | 1449 break; |
1404 } | 1450 } |
1405 } | 1451 } |
1412 st->iso2022.mask = detect_coding_iso2022 (st, src, n); | 1458 st->iso2022.mask = detect_coding_iso2022 (st, src, n); |
1413 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask)) | 1459 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask)) |
1414 st->shift_jis.mask = detect_coding_sjis (st, src, n); | 1460 st->shift_jis.mask = detect_coding_sjis (st, src, n); |
1415 if (!mask_has_at_most_one_bit_p (st->big5.mask)) | 1461 if (!mask_has_at_most_one_bit_p (st->big5.mask)) |
1416 st->big5.mask = detect_coding_big5 (st, src, n); | 1462 st->big5.mask = detect_coding_big5 (st, src, n); |
1417 | 1463 if (!mask_has_at_most_one_bit_p (st->utf8.mask)) |
1418 st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask; | 1464 st->utf8.mask = detect_coding_utf8 (st, src, n); |
1465 if (!mask_has_at_most_one_bit_p (st->ucs4.mask)) | |
1466 st->ucs4.mask = detect_coding_ucs4 (st, src, n); | |
1467 | |
1468 st->mask | |
1469 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask | |
1470 | st->utf8.mask | st->ucs4.mask; | |
1419 #endif | 1471 #endif |
1420 { | 1472 { |
1421 int retval = mask_has_at_most_one_bit_p (st->mask); | 1473 int retval = mask_has_at_most_one_bit_p (st->mask); |
1422 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK; | 1474 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK; |
1423 return retval && st->eol_type != EOL_AUTODETECT; | 1475 return retval && st->eol_type != EOL_AUTODETECT; |
1675 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding) | 1727 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding) |
1676 | 1728 |
1677 struct decoding_stream | 1729 struct decoding_stream |
1678 { | 1730 { |
1679 /* Coding system that governs the conversion. */ | 1731 /* Coding system that governs the conversion. */ |
1680 struct Lisp_Coding_System *codesys; | 1732 Lisp_Coding_System *codesys; |
1681 | 1733 |
1682 /* Stream that we read the encoded data from or | 1734 /* Stream that we read the encoded data from or |
1683 write the decoded data to. */ | 1735 write the decoded data to. */ |
1684 Lstream *other_end; | 1736 Lstream *other_end; |
1685 | 1737 |
1881 str->flags |= CODING_STATE_END; | 1933 str->flags |= CODING_STATE_END; |
1882 decoding_writer (stream, 0, 0); | 1934 decoding_writer (stream, 0, 0); |
1883 } | 1935 } |
1884 Dynarr_free (str->runoff); | 1936 Dynarr_free (str->runoff); |
1885 #ifdef MULE | 1937 #ifdef MULE |
1938 #ifdef ENABLE_COMPOSITE_CHARS | |
1886 if (str->iso2022.composite_chars) | 1939 if (str->iso2022.composite_chars) |
1887 Dynarr_free (str->iso2022.composite_chars); | 1940 Dynarr_free (str->iso2022.composite_chars); |
1888 #endif | 1941 #endif |
1942 #endif | |
1889 return Lstream_close (str->other_end); | 1943 return Lstream_close (str->other_end); |
1890 } | 1944 } |
1891 | 1945 |
1892 Lisp_Object | 1946 Lisp_Object |
1893 decoding_stream_coding_system (Lstream *stream) | 1947 decoding_stream_coding_system (Lstream *stream) |
1900 } | 1954 } |
1901 | 1955 |
1902 void | 1956 void |
1903 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys) | 1957 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys) |
1904 { | 1958 { |
1905 struct Lisp_Coding_System *cs = XCODING_SYSTEM (codesys); | 1959 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys); |
1906 struct decoding_stream *str = DECODING_STREAM_DATA (lstr); | 1960 struct decoding_stream *str = DECODING_STREAM_DATA (lstr); |
1907 str->codesys = cs; | 1961 str->codesys = cs; |
1908 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT) | 1962 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT) |
1909 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs); | 1963 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs); |
1910 reset_decoding_stream (str); | 1964 reset_decoding_stream (str); |
2017 decode_coding_sjis (decoding, src, dst, n); | 2071 decode_coding_sjis (decoding, src, dst, n); |
2018 break; | 2072 break; |
2019 case CODESYS_BIG5: | 2073 case CODESYS_BIG5: |
2020 decode_coding_big5 (decoding, src, dst, n); | 2074 decode_coding_big5 (decoding, src, dst, n); |
2021 break; | 2075 break; |
2076 case CODESYS_UCS4: | |
2077 decode_coding_ucs4 (decoding, src, dst, n); | |
2078 break; | |
2079 case CODESYS_UTF8: | |
2080 decode_coding_utf8 (decoding, src, dst, n); | |
2081 break; | |
2022 case CODESYS_CCL: | 2082 case CODESYS_CCL: |
2023 ccl_driver (&str->ccl, src, dst, n, 0); | 2083 ccl_driver (&str->ccl, src, dst, n, 0); |
2024 break; | 2084 break; |
2025 case CODESYS_ISO2022: | 2085 case CODESYS_ISO2022: |
2026 decode_coding_iso2022 (decoding, src, dst, n); | 2086 decode_coding_iso2022 (decoding, src, dst, n); |
2108 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding) | 2168 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding) |
2109 | 2169 |
2110 struct encoding_stream | 2170 struct encoding_stream |
2111 { | 2171 { |
2112 /* Coding system that governs the conversion. */ | 2172 /* Coding system that governs the conversion. */ |
2113 struct Lisp_Coding_System *codesys; | 2173 Lisp_Coding_System *codesys; |
2114 | 2174 |
2115 /* Stream that we read the encoded data from or | 2175 /* Stream that we read the encoded data from or |
2116 write the decoded data to. */ | 2176 write the decoded data to. */ |
2117 Lstream *other_end; | 2177 Lstream *other_end; |
2118 | 2178 |
2359 } | 2419 } |
2360 | 2420 |
2361 void | 2421 void |
2362 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys) | 2422 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys) |
2363 { | 2423 { |
2364 struct Lisp_Coding_System *cs = XCODING_SYSTEM (codesys); | 2424 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys); |
2365 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr); | 2425 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr); |
2366 str->codesys = cs; | 2426 str->codesys = cs; |
2367 reset_encoding_stream (str); | 2427 reset_encoding_stream (str); |
2368 } | 2428 } |
2369 | 2429 |
2423 encode_coding_sjis (encoding, src, dst, n); | 2483 encode_coding_sjis (encoding, src, dst, n); |
2424 break; | 2484 break; |
2425 case CODESYS_BIG5: | 2485 case CODESYS_BIG5: |
2426 encode_coding_big5 (encoding, src, dst, n); | 2486 encode_coding_big5 (encoding, src, dst, n); |
2427 break; | 2487 break; |
2488 case CODESYS_UCS4: | |
2489 encode_coding_ucs4 (encoding, src, dst, n); | |
2490 break; | |
2491 case CODESYS_UTF8: | |
2492 encode_coding_utf8 (encoding, src, dst, n); | |
2493 break; | |
2428 case CODESYS_CCL: | 2494 case CODESYS_CCL: |
2429 ccl_driver (&str->ccl, src, dst, n, 0); | 2495 ccl_driver (&str->ccl, src, dst, n, 0); |
2430 break; | 2496 break; |
2431 case CODESYS_ISO2022: | 2497 case CODESYS_ISO2022: |
2432 encode_coding_iso2022 (encoding, src, dst, n); | 2498 encode_coding_iso2022 (encoding, src, dst, n); |
2508 /* Shift-JIS methods */ | 2574 /* Shift-JIS methods */ |
2509 /************************************************************************/ | 2575 /************************************************************************/ |
2510 | 2576 |
2511 /* Shift-JIS is a coding system encoding three character sets: ASCII, right | 2577 /* Shift-JIS is a coding system encoding three character sets: ASCII, right |
2512 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded | 2578 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded |
2513 as is. A character of JISX0201-Kana (TYPE94 character set) is | 2579 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is |
2514 encoded by "position-code + 0x80". A character of JISX0208 | 2580 encoded by "position-code + 0x80". A character of JISX0208 |
2515 (TYPE94x94 character set) is encoded in 2-byte but two | 2581 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two |
2516 position-codes are divided and shifted so that it fit in the range | 2582 position-codes are divided and shifted so that it fit in the range |
2517 below. | 2583 below. |
2518 | 2584 |
2519 --- CODE RANGE of Shift-JIS --- | 2585 --- CODE RANGE of Shift-JIS --- |
2520 (character set) (range) | 2586 (character set) (range) |
2567 static void | 2633 static void |
2568 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src, | 2634 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src, |
2569 unsigned_char_dynarr *dst, unsigned int n) | 2635 unsigned_char_dynarr *dst, unsigned int n) |
2570 { | 2636 { |
2571 unsigned char c; | 2637 unsigned char c; |
2572 unsigned int flags, ch; | |
2573 enum eol_type eol_type; | |
2574 struct decoding_stream *str = DECODING_STREAM_DATA (decoding); | 2638 struct decoding_stream *str = DECODING_STREAM_DATA (decoding); |
2575 | 2639 unsigned int flags = str->flags; |
2576 CODING_STREAM_DECOMPOSE (str, flags, ch); | 2640 unsigned int ch = str->ch; |
2577 eol_type = str->eol_type; | 2641 eol_type_t eol_type = str->eol_type; |
2578 | 2642 |
2579 while (n--) | 2643 while (n--) |
2580 { | 2644 { |
2581 c = *src++; | 2645 c = *src++; |
2582 | 2646 |
2615 label_continue_loop:; | 2679 label_continue_loop:; |
2616 } | 2680 } |
2617 | 2681 |
2618 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst); | 2682 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst); |
2619 | 2683 |
2620 CODING_STREAM_COMPOSE (str, flags, ch); | 2684 str->flags = flags; |
2685 str->ch = ch; | |
2621 } | 2686 } |
2622 | 2687 |
2623 /* Convert internally-formatted data to Shift-JIS. */ | 2688 /* Convert internally-formatted data to Shift-JIS. */ |
2624 | 2689 |
2625 static void | 2690 static void |
2626 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src, | 2691 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src, |
2627 unsigned_char_dynarr *dst, unsigned int n) | 2692 unsigned_char_dynarr *dst, unsigned int n) |
2628 { | 2693 { |
2629 unsigned char c; | 2694 unsigned char c; |
2630 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); | 2695 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); |
2631 unsigned int flags, ch; | 2696 unsigned int flags = str->flags; |
2632 enum eol_type eol_type; | 2697 unsigned int ch = str->ch; |
2633 | 2698 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); |
2634 CODING_STREAM_DECOMPOSE (str, flags, ch); | |
2635 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); | |
2636 | 2699 |
2637 while (n--) | 2700 while (n--) |
2638 { | 2701 { |
2639 c = *src++; | 2702 c = *src++; |
2640 if (c == '\n') | 2703 if (c == '\n') |
2673 ch = 0; | 2736 ch = 0; |
2674 } | 2737 } |
2675 } | 2738 } |
2676 } | 2739 } |
2677 | 2740 |
2678 CODING_STREAM_COMPOSE (str, flags, ch); | 2741 str->flags = flags; |
2742 str->ch = ch; | |
2679 } | 2743 } |
2680 | 2744 |
2681 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /* | 2745 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /* |
2682 Decode a JISX0208 character of Shift-JIS coding-system. | 2746 Decode a JISX0208 character of Shift-JIS coding-system. |
2683 CODE is the character code in Shift-JIS as a cons of type bytes. | 2747 CODE is the character code in Shift-JIS as a cons of type bytes. |
2740 -------------------------- | 2804 -------------------------- |
2741 | 2805 |
2742 Since the number of characters in Big5 is larger than maximum | 2806 Since the number of characters in Big5 is larger than maximum |
2743 characters in Emacs' charset (96x96), it can't be handled as one | 2807 characters in Emacs' charset (96x96), it can't be handled as one |
2744 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1' | 2808 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1' |
2745 and `charset-big5-2'. Both <type>s are TYPE94x94. The former | 2809 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former |
2746 contains frequently used characters and the latter contains less | 2810 contains frequently used characters and the latter contains less |
2747 frequently used characters. */ | 2811 frequently used characters. */ |
2748 | 2812 |
2749 #define BYTE_BIG5_TWO_BYTE_1_P(c) \ | 2813 #define BYTE_BIG5_TWO_BYTE_1_P(c) \ |
2750 ((c) >= 0xA1 && (c) <= 0xFE) | 2814 ((c) >= 0xA1 && (c) <= 0xFE) |
2856 static void | 2920 static void |
2857 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src, | 2921 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src, |
2858 unsigned_char_dynarr *dst, unsigned int n) | 2922 unsigned_char_dynarr *dst, unsigned int n) |
2859 { | 2923 { |
2860 unsigned char c; | 2924 unsigned char c; |
2861 unsigned int flags, ch; | |
2862 enum eol_type eol_type; | |
2863 struct decoding_stream *str = DECODING_STREAM_DATA (decoding); | 2925 struct decoding_stream *str = DECODING_STREAM_DATA (decoding); |
2864 | 2926 unsigned int flags = str->flags; |
2865 CODING_STREAM_DECOMPOSE (str, flags, ch); | 2927 unsigned int ch = str->ch; |
2866 eol_type = str->eol_type; | 2928 eol_type_t eol_type = str->eol_type; |
2867 | 2929 |
2868 while (n--) | 2930 while (n--) |
2869 { | 2931 { |
2870 c = *src++; | 2932 c = *src++; |
2871 if (ch) | 2933 if (ch) |
2897 label_continue_loop:; | 2959 label_continue_loop:; |
2898 } | 2960 } |
2899 | 2961 |
2900 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst); | 2962 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst); |
2901 | 2963 |
2902 CODING_STREAM_COMPOSE (str, flags, ch); | 2964 str->flags = flags; |
2965 str->ch = ch; | |
2903 } | 2966 } |
2904 | 2967 |
2905 /* Convert internally-formatted data to Big5. */ | 2968 /* Convert internally-formatted data to Big5. */ |
2906 | 2969 |
2907 static void | 2970 static void |
2908 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src, | 2971 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src, |
2909 unsigned_char_dynarr *dst, unsigned int n) | 2972 unsigned_char_dynarr *dst, unsigned int n) |
2910 { | 2973 { |
2911 unsigned char c; | 2974 unsigned char c; |
2912 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); | 2975 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); |
2913 unsigned int flags, ch; | 2976 unsigned int flags = str->flags; |
2914 enum eol_type eol_type; | 2977 unsigned int ch = str->ch; |
2915 | 2978 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); |
2916 CODING_STREAM_DECOMPOSE (str, flags, ch); | |
2917 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); | |
2918 | 2979 |
2919 while (n--) | 2980 while (n--) |
2920 { | 2981 { |
2921 c = *src++; | 2982 c = *src++; |
2922 if (c == '\n') | 2983 if (c == '\n') |
2960 } | 3021 } |
2961 | 3022 |
2962 ch = 0; | 3023 ch = 0; |
2963 } | 3024 } |
2964 | 3025 |
2965 CODING_STREAM_COMPOSE (str, flags, ch); | 3026 str->flags = flags; |
3027 str->ch = ch; | |
2966 } | 3028 } |
2967 | 3029 |
2968 | 3030 |
2969 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /* | 3031 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /* |
2970 Decode a Big5 character CODE of BIG5 coding-system. | 3032 Decode a Big5 character CODE of BIG5 coding-system. |
3015 return Qnil; | 3077 return Qnil; |
3016 } | 3078 } |
3017 | 3079 |
3018 | 3080 |
3019 /************************************************************************/ | 3081 /************************************************************************/ |
3082 /* UCS-4 methods */ | |
3083 /* */ | |
3084 /* UCS-4 character codes are implemented as nonnegative integers. */ | |
3085 /* */ | |
3086 /************************************************************************/ | |
3087 | |
3088 Lisp_Object ucs_to_mule_table[65536]; | |
3089 Lisp_Object mule_to_ucs_table; | |
3090 | |
3091 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /* | |
3092 Map UCS-4 code CODE to Mule character CHARACTER. | |
3093 | |
3094 Return T on success, NIL on failure. | |
3095 */ | |
3096 (code, character)) | |
3097 { | |
3098 unsigned int c; | |
3099 | |
3100 CHECK_CHAR (character); | |
3101 CHECK_INT (code); | |
3102 c = XINT (code); | |
3103 | |
3104 if (c < sizeof (ucs_to_mule_table)) | |
3105 { | |
3106 ucs_to_mule_table[c] = character; | |
3107 return Qt; | |
3108 } | |
3109 else | |
3110 return Qnil; | |
3111 } | |
3112 | |
3113 static Lisp_Object | |
3114 ucs_to_char (unsigned long code) | |
3115 { | |
3116 if (code < sizeof (ucs_to_mule_table)) | |
3117 { | |
3118 return ucs_to_mule_table[code]; | |
3119 } | |
3120 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14)) | |
3121 { | |
3122 unsigned int c; | |
3123 | |
3124 code -= 0xe00000; | |
3125 c = code % (94 * 94); | |
3126 return make_char | |
3127 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES | |
3128 (CHARSET_TYPE_94X94, code / (94 * 94) + '@', | |
3129 CHARSET_LEFT_TO_RIGHT), | |
3130 c / 94 + 33, c % 94 + 33)); | |
3131 } | |
3132 else | |
3133 return Qnil; | |
3134 } | |
3135 | |
3136 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /* | |
3137 Return Mule character corresponding to UCS code CODE (a positive integer). | |
3138 */ | |
3139 (code)) | |
3140 { | |
3141 CHECK_NATNUM (code); | |
3142 return ucs_to_char (XINT (code)); | |
3143 } | |
3144 | |
3145 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /* | |
3146 Map Mule character CHARACTER to UCS code CODE (a positive integer). | |
3147 */ | |
3148 (character, code)) | |
3149 { | |
3150 /* #### Isn't this gilding the lily? Fput_char_table checks its args. | |
3151 Fset_char_ucs is more restrictive on index arg, but should | |
3152 check code arg in a char_table method. */ | |
3153 CHECK_CHAR (character); | |
3154 CHECK_NATNUM (code); | |
3155 return Fput_char_table (character, code, mule_to_ucs_table); | |
3156 } | |
3157 | |
3158 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /* | |
3159 Return the UCS code (a positive integer) corresponding to CHARACTER. | |
3160 */ | |
3161 (character)) | |
3162 { | |
3163 return Fget_char_table (character, mule_to_ucs_table); | |
3164 } | |
3165 | |
3166 /* Decode a UCS-4 character into a buffer. If the lookup fails, use | |
3167 JIS X 0208 double-width `=' instead. | |
3168 #### do something more appropriate (use blob?) | |
3169 Danger, Will Robinson! Data loss. Should we signal user? */ | |
3170 static void | |
3171 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst) | |
3172 { | |
3173 Lisp_Object chr = ucs_to_char (ch); | |
3174 | |
3175 if (! NILP (chr)) | |
3176 { | |
3177 Bufbyte work[MAX_EMCHAR_LEN]; | |
3178 int len; | |
3179 | |
3180 ch = XCHAR (chr); | |
3181 len = (ch < 128) ? | |
3182 simple_set_charptr_emchar (work, ch) : | |
3183 non_ascii_set_charptr_emchar (work, ch); | |
3184 Dynarr_add_many (dst, work, len); | |
3185 } | |
3186 else | |
3187 { | |
3188 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208); | |
3189 Dynarr_add (dst, 34 + 128); | |
3190 Dynarr_add (dst, 46 + 128); | |
3191 } | |
3192 } | |
3193 | |
3194 static unsigned long | |
3195 mule_char_to_ucs4 (Lisp_Object charset, | |
3196 unsigned char h, unsigned char l) | |
3197 { | |
3198 Lisp_Object code | |
3199 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)), | |
3200 mule_to_ucs_table); | |
3201 | |
3202 if (INTP (code)) | |
3203 { | |
3204 return XINT (code); | |
3205 } | |
3206 else if ( (XCHARSET_DIMENSION (charset) == 2) && | |
3207 (XCHARSET_CHARS (charset) == 94) ) | |
3208 { | |
3209 unsigned char final = XCHARSET_FINAL (charset); | |
3210 | |
3211 if ( ('@' <= final) && (final < 0x7f) ) | |
3212 { | |
3213 return 0xe00000 + (final - '@') * 94 * 94 | |
3214 + ((h & 127) - 33) * 94 + (l & 127) - 33; | |
3215 } | |
3216 else | |
3217 { | |
3218 return '?'; | |
3219 } | |
3220 } | |
3221 else | |
3222 { | |
3223 return '?'; | |
3224 } | |
3225 } | |
3226 | |
3227 static void | |
3228 encode_ucs4 (Lisp_Object charset, | |
3229 unsigned char h, unsigned char l, unsigned_char_dynarr *dst) | |
3230 { | |
3231 unsigned long code = mule_char_to_ucs4 (charset, h, l); | |
3232 Dynarr_add (dst, code >> 24); | |
3233 Dynarr_add (dst, (code >> 16) & 255); | |
3234 Dynarr_add (dst, (code >> 8) & 255); | |
3235 Dynarr_add (dst, code & 255); | |
3236 } | |
3237 | |
3238 static int | |
3239 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src, | |
3240 unsigned int n) | |
3241 { | |
3242 while (n--) | |
3243 { | |
3244 int c = *src++; | |
3245 switch (st->ucs4.in_byte) | |
3246 { | |
3247 case 0: | |
3248 if (c >= 128) | |
3249 return 0; | |
3250 else | |
3251 st->ucs4.in_byte++; | |
3252 break; | |
3253 case 3: | |
3254 st->ucs4.in_byte = 0; | |
3255 break; | |
3256 default: | |
3257 st->ucs4.in_byte++; | |
3258 } | |
3259 } | |
3260 return CODING_CATEGORY_UCS4_MASK; | |
3261 } | |
3262 | |
3263 static void | |
3264 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src, | |
3265 unsigned_char_dynarr *dst, unsigned int n) | |
3266 { | |
3267 struct decoding_stream *str = DECODING_STREAM_DATA (decoding); | |
3268 unsigned int flags = str->flags; | |
3269 unsigned int ch = str->ch; | |
3270 | |
3271 while (n--) | |
3272 { | |
3273 unsigned char c = *src++; | |
3274 switch (flags) | |
3275 { | |
3276 case 0: | |
3277 ch = c; | |
3278 flags = 3; | |
3279 break; | |
3280 case 1: | |
3281 decode_ucs4 ( ( ch << 8 ) | c, dst); | |
3282 ch = 0; | |
3283 flags = 0; | |
3284 break; | |
3285 default: | |
3286 ch = ( ch << 8 ) | c; | |
3287 flags--; | |
3288 } | |
3289 } | |
3290 if (flags & CODING_STATE_END) | |
3291 DECODE_OUTPUT_PARTIAL_CHAR (ch); | |
3292 | |
3293 str->flags = flags; | |
3294 str->ch = ch; | |
3295 } | |
3296 | |
3297 static void | |
3298 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src, | |
3299 unsigned_char_dynarr *dst, unsigned int n) | |
3300 { | |
3301 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); | |
3302 unsigned int flags = str->flags; | |
3303 unsigned int ch = str->ch; | |
3304 unsigned char char_boundary = str->iso2022.current_char_boundary; | |
3305 Lisp_Object charset = str->iso2022.current_charset; | |
3306 | |
3307 #ifdef ENABLE_COMPOSITE_CHARS | |
3308 /* flags for handling composite chars. We do a little switcharoo | |
3309 on the source while we're outputting the composite char. */ | |
3310 unsigned int saved_n = 0; | |
3311 CONST unsigned char *saved_src = NULL; | |
3312 int in_composite = 0; | |
3313 | |
3314 back_to_square_n: | |
3315 #endif | |
3316 | |
3317 while (n--) | |
3318 { | |
3319 unsigned char c = *src++; | |
3320 | |
3321 if (BYTE_ASCII_P (c)) | |
3322 { /* Processing ASCII character */ | |
3323 ch = 0; | |
3324 encode_ucs4 (Vcharset_ascii, c, 0, dst); | |
3325 char_boundary = 1; | |
3326 } | |
3327 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch)) | |
3328 { /* Processing Leading Byte */ | |
3329 ch = 0; | |
3330 charset = CHARSET_BY_LEADING_BYTE (c); | |
3331 if (LEADING_BYTE_PREFIX_P(c)) | |
3332 ch = c; | |
3333 char_boundary = 0; | |
3334 } | |
3335 else | |
3336 { /* Processing Non-ASCII character */ | |
3337 char_boundary = 1; | |
3338 if (EQ (charset, Vcharset_control_1)) | |
3339 { | |
3340 encode_ucs4 (Vcharset_control_1, c, 0, dst); | |
3341 } | |
3342 else | |
3343 { | |
3344 switch (XCHARSET_REP_BYTES (charset)) | |
3345 { | |
3346 case 2: | |
3347 encode_ucs4 (charset, c, 0, dst); | |
3348 break; | |
3349 case 3: | |
3350 if (XCHARSET_PRIVATE_P (charset)) | |
3351 { | |
3352 encode_ucs4 (charset, c, 0, dst); | |
3353 ch = 0; | |
3354 } | |
3355 else if (ch) | |
3356 { | |
3357 #ifdef ENABLE_COMPOSITE_CHARS | |
3358 if (EQ (charset, Vcharset_composite)) | |
3359 { | |
3360 if (in_composite) | |
3361 { | |
3362 /* #### Bother! We don't know how to | |
3363 handle this yet. */ | |
3364 Dynarr_add (dst, 0); | |
3365 Dynarr_add (dst, 0); | |
3366 Dynarr_add (dst, 0); | |
3367 Dynarr_add (dst, '~'); | |
3368 } | |
3369 else | |
3370 { | |
3371 Emchar emch = MAKE_CHAR (Vcharset_composite, | |
3372 ch & 0x7F, c & 0x7F); | |
3373 Lisp_Object lstr = composite_char_string (emch); | |
3374 saved_n = n; | |
3375 saved_src = src; | |
3376 in_composite = 1; | |
3377 src = XSTRING_DATA (lstr); | |
3378 n = XSTRING_LENGTH (lstr); | |
3379 } | |
3380 } | |
3381 else | |
3382 #endif /* ENABLE_COMPOSITE_CHARS */ | |
3383 { | |
3384 encode_ucs4(charset, ch, c, dst); | |
3385 } | |
3386 ch = 0; | |
3387 } | |
3388 else | |
3389 { | |
3390 ch = c; | |
3391 char_boundary = 0; | |
3392 } | |
3393 break; | |
3394 case 4: | |
3395 if (ch) | |
3396 { | |
3397 encode_ucs4 (charset, ch, c, dst); | |
3398 ch = 0; | |
3399 } | |
3400 else | |
3401 { | |
3402 ch = c; | |
3403 char_boundary = 0; | |
3404 } | |
3405 break; | |
3406 default: | |
3407 abort (); | |
3408 } | |
3409 } | |
3410 } | |
3411 } | |
3412 | |
3413 #ifdef ENABLE_COMPOSITE_CHARS | |
3414 if (in_composite) | |
3415 { | |
3416 n = saved_n; | |
3417 src = saved_src; | |
3418 in_composite = 0; | |
3419 goto back_to_square_n; /* Wheeeeeeeee ..... */ | |
3420 } | |
3421 #endif /* ENABLE_COMPOSITE_CHARS */ | |
3422 | |
3423 str->flags = flags; | |
3424 str->ch = ch; | |
3425 str->iso2022.current_char_boundary = char_boundary; | |
3426 str->iso2022.current_charset = charset; | |
3427 | |
3428 /* Verbum caro factum est! */ | |
3429 } | |
3430 | |
3431 | |
3432 /************************************************************************/ | |
3433 /* UTF-8 methods */ | |
3434 /************************************************************************/ | |
3435 | |
3436 static int | |
3437 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src, | |
3438 unsigned int n) | |
3439 { | |
3440 while (n--) | |
3441 { | |
3442 unsigned char c = *src++; | |
3443 switch (st->utf8.in_byte) | |
3444 { | |
3445 case 0: | |
3446 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) | |
3447 return 0; | |
3448 else if (c >= 0xfc) | |
3449 st->utf8.in_byte = 5; | |
3450 else if (c >= 0xf8) | |
3451 st->utf8.in_byte = 4; | |
3452 else if (c >= 0xf0) | |
3453 st->utf8.in_byte = 3; | |
3454 else if (c >= 0xe0) | |
3455 st->utf8.in_byte = 2; | |
3456 else if (c >= 0xc0) | |
3457 st->utf8.in_byte = 1; | |
3458 else if (c >= 0x80) | |
3459 return 0; | |
3460 break; | |
3461 default: | |
3462 if ((c & 0xc0) != 0x80) | |
3463 return 0; | |
3464 else | |
3465 st->utf8.in_byte--; | |
3466 } | |
3467 } | |
3468 return CODING_CATEGORY_UTF8_MASK; | |
3469 } | |
3470 | |
3471 static void | |
3472 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src, | |
3473 unsigned_char_dynarr *dst, unsigned int n) | |
3474 { | |
3475 struct decoding_stream *str = DECODING_STREAM_DATA (decoding); | |
3476 unsigned int flags = str->flags; | |
3477 unsigned int ch = str->ch; | |
3478 eol_type_t eol_type = str->eol_type; | |
3479 | |
3480 while (n--) | |
3481 { | |
3482 unsigned char c = *src++; | |
3483 switch (flags) | |
3484 { | |
3485 case 0: | |
3486 if ( c >= 0xfc ) | |
3487 { | |
3488 ch = c & 0x01; | |
3489 flags = 5; | |
3490 } | |
3491 else if ( c >= 0xf8 ) | |
3492 { | |
3493 ch = c & 0x03; | |
3494 flags = 4; | |
3495 } | |
3496 else if ( c >= 0xf0 ) | |
3497 { | |
3498 ch = c & 0x07; | |
3499 flags = 3; | |
3500 } | |
3501 else if ( c >= 0xe0 ) | |
3502 { | |
3503 ch = c & 0x0f; | |
3504 flags = 2; | |
3505 } | |
3506 else if ( c >= 0xc0 ) | |
3507 { | |
3508 ch = c & 0x1f; | |
3509 flags = 1; | |
3510 } | |
3511 else | |
3512 { | |
3513 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst); | |
3514 decode_ucs4 (c, dst); | |
3515 } | |
3516 break; | |
3517 case 1: | |
3518 ch = ( ch << 6 ) | ( c & 0x3f ); | |
3519 decode_ucs4 (ch, dst); | |
3520 ch = 0; | |
3521 flags = 0; | |
3522 break; | |
3523 default: | |
3524 ch = ( ch << 6 ) | ( c & 0x3f ); | |
3525 flags--; | |
3526 } | |
3527 label_continue_loop:; | |
3528 } | |
3529 | |
3530 if (flags & CODING_STATE_END) | |
3531 DECODE_OUTPUT_PARTIAL_CHAR (ch); | |
3532 | |
3533 str->flags = flags; | |
3534 str->ch = ch; | |
3535 } | |
3536 | |
3537 static void | |
3538 encode_utf8 (Lisp_Object charset, | |
3539 unsigned char h, unsigned char l, unsigned_char_dynarr *dst) | |
3540 { | |
3541 unsigned long code = mule_char_to_ucs4 (charset, h, l); | |
3542 if ( code <= 0x7f ) | |
3543 { | |
3544 Dynarr_add (dst, code); | |
3545 } | |
3546 else if ( code <= 0x7ff ) | |
3547 { | |
3548 Dynarr_add (dst, (code >> 6) | 0xc0); | |
3549 Dynarr_add (dst, (code & 0x3f) | 0x80); | |
3550 } | |
3551 else if ( code <= 0xffff ) | |
3552 { | |
3553 Dynarr_add (dst, (code >> 12) | 0xe0); | |
3554 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80); | |
3555 Dynarr_add (dst, (code & 0x3f) | 0x80); | |
3556 } | |
3557 else if ( code <= 0x1fffff ) | |
3558 { | |
3559 Dynarr_add (dst, (code >> 18) | 0xf0); | |
3560 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80); | |
3561 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80); | |
3562 Dynarr_add (dst, (code & 0x3f) | 0x80); | |
3563 } | |
3564 else if ( code <= 0x3ffffff ) | |
3565 { | |
3566 Dynarr_add (dst, (code >> 24) | 0xf8); | |
3567 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80); | |
3568 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80); | |
3569 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80); | |
3570 Dynarr_add (dst, (code & 0x3f) | 0x80); | |
3571 } | |
3572 else | |
3573 { | |
3574 Dynarr_add (dst, (code >> 30) | 0xfc); | |
3575 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80); | |
3576 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80); | |
3577 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80); | |
3578 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80); | |
3579 Dynarr_add (dst, (code & 0x3f) | 0x80); | |
3580 } | |
3581 } | |
3582 | |
3583 static void | |
3584 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src, | |
3585 unsigned_char_dynarr *dst, unsigned int n) | |
3586 { | |
3587 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); | |
3588 unsigned int flags = str->flags; | |
3589 unsigned int ch = str->ch; | |
3590 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); | |
3591 unsigned char char_boundary = str->iso2022.current_char_boundary; | |
3592 Lisp_Object charset = str->iso2022.current_charset; | |
3593 | |
3594 #ifdef ENABLE_COMPOSITE_CHARS | |
3595 /* flags for handling composite chars. We do a little switcharoo | |
3596 on the source while we're outputting the composite char. */ | |
3597 unsigned int saved_n = 0; | |
3598 CONST unsigned char *saved_src = NULL; | |
3599 int in_composite = 0; | |
3600 | |
3601 back_to_square_n: | |
3602 #endif /* ENABLE_COMPOSITE_CHARS */ | |
3603 | |
3604 while (n--) | |
3605 { | |
3606 unsigned char c = *src++; | |
3607 | |
3608 if (BYTE_ASCII_P (c)) | |
3609 { /* Processing ASCII character */ | |
3610 ch = 0; | |
3611 if (c == '\n') | |
3612 { | |
3613 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT) | |
3614 Dynarr_add (dst, '\r'); | |
3615 if (eol_type != EOL_CR) | |
3616 Dynarr_add (dst, c); | |
3617 } | |
3618 else | |
3619 encode_utf8 (Vcharset_ascii, c, 0, dst); | |
3620 char_boundary = 1; | |
3621 } | |
3622 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch)) | |
3623 { /* Processing Leading Byte */ | |
3624 ch = 0; | |
3625 charset = CHARSET_BY_LEADING_BYTE (c); | |
3626 if (LEADING_BYTE_PREFIX_P(c)) | |
3627 ch = c; | |
3628 char_boundary = 0; | |
3629 } | |
3630 else | |
3631 { /* Processing Non-ASCII character */ | |
3632 char_boundary = 1; | |
3633 if (EQ (charset, Vcharset_control_1)) | |
3634 { | |
3635 encode_utf8 (Vcharset_control_1, c, 0, dst); | |
3636 } | |
3637 else | |
3638 { | |
3639 switch (XCHARSET_REP_BYTES (charset)) | |
3640 { | |
3641 case 2: | |
3642 encode_utf8 (charset, c, 0, dst); | |
3643 break; | |
3644 case 3: | |
3645 if (XCHARSET_PRIVATE_P (charset)) | |
3646 { | |
3647 encode_utf8 (charset, c, 0, dst); | |
3648 ch = 0; | |
3649 } | |
3650 else if (ch) | |
3651 { | |
3652 #ifdef ENABLE_COMPOSITE_CHARS | |
3653 if (EQ (charset, Vcharset_composite)) | |
3654 { | |
3655 if (in_composite) | |
3656 { | |
3657 /* #### Bother! We don't know how to | |
3658 handle this yet. */ | |
3659 encode_utf8 (Vcharset_ascii, '~', 0, dst); | |
3660 } | |
3661 else | |
3662 { | |
3663 Emchar emch = MAKE_CHAR (Vcharset_composite, | |
3664 ch & 0x7F, c & 0x7F); | |
3665 Lisp_Object lstr = composite_char_string (emch); | |
3666 saved_n = n; | |
3667 saved_src = src; | |
3668 in_composite = 1; | |
3669 src = XSTRING_DATA (lstr); | |
3670 n = XSTRING_LENGTH (lstr); | |
3671 } | |
3672 } | |
3673 else | |
3674 #endif /* ENABLE_COMPOSITE_CHARS */ | |
3675 { | |
3676 encode_utf8 (charset, ch, c, dst); | |
3677 } | |
3678 ch = 0; | |
3679 } | |
3680 else | |
3681 { | |
3682 ch = c; | |
3683 char_boundary = 0; | |
3684 } | |
3685 break; | |
3686 case 4: | |
3687 if (ch) | |
3688 { | |
3689 encode_utf8 (charset, ch, c, dst); | |
3690 ch = 0; | |
3691 } | |
3692 else | |
3693 { | |
3694 ch = c; | |
3695 char_boundary = 0; | |
3696 } | |
3697 break; | |
3698 default: | |
3699 abort (); | |
3700 } | |
3701 } | |
3702 } | |
3703 } | |
3704 | |
3705 #ifdef ENABLE_COMPOSITE_CHARS | |
3706 if (in_composite) | |
3707 { | |
3708 n = saved_n; | |
3709 src = saved_src; | |
3710 in_composite = 0; | |
3711 goto back_to_square_n; /* Wheeeeeeeee ..... */ | |
3712 } | |
3713 #endif | |
3714 | |
3715 str->flags = flags; | |
3716 str->ch = ch; | |
3717 str->iso2022.current_char_boundary = char_boundary; | |
3718 str->iso2022.current_charset = charset; | |
3719 | |
3720 /* Verbum caro factum est! */ | |
3721 } | |
3722 | |
3723 | |
3724 /************************************************************************/ | |
3020 /* ISO2022 methods */ | 3725 /* ISO2022 methods */ |
3021 /************************************************************************/ | 3726 /************************************************************************/ |
3022 | 3727 |
3023 /* The following note describes the coding system ISO2022 briefly. | 3728 /* The following note describes the coding system ISO2022 briefly. |
3024 Since the intention of this note is to help understanding of the | 3729 Since the intention of this note is to help understand the |
3025 programs in this file, some parts are NOT ACCURATE or OVERLY | 3730 functions in this file, some parts are NOT ACCURATE or OVERLY |
3026 SIMPLIFIED. For thorough understanding, please refer to the | 3731 SIMPLIFIED. For thorough understanding, please refer to the |
3027 original document of ISO2022. | 3732 original document of ISO2022. |
3028 | 3733 |
3029 ISO2022 provides many mechanisms to encode several character sets | 3734 ISO2022 provides many mechanisms to encode several character sets |
3030 in 7-bit and 8-bit environments. If one chooses 7-bit environment, | 3735 in 7-bit and 8-bit environments. For 7-bit environments, all text |
3031 all text is encoded by codes of less than 128. This may make the | 3736 is encoded using bytes less than 128. This may make the encoded |
3032 encoded text a little bit longer, but the text get more stability | 3737 text a little bit longer, but the text passes more easily through |
3033 to pass through several gateways (some of them strip off MSB). | 3738 several gateways, some of which strip off MSB (Most Significant Bit). |
3034 | 3739 |
3035 There are two kind of character sets: control character set and | 3740 There are two kinds of character sets: control character set and |
3036 graphic character set. The former contains control characters such | 3741 graphic character set. The former contains control characters such |
3037 as `newline' and `escape' to provide control functions (control | 3742 as `newline' and `escape' to provide control functions (control |
3038 functions are provided also by escape sequence). The latter | 3743 functions are also provided by escape sequences). The latter |
3039 contains graphic characters such as 'A' and '-'. Emacs recognizes | 3744 contains graphic characters such as 'A' and '-'. Emacs recognizes |
3040 two control character sets and many graphic character sets. | 3745 two control character sets and many graphic character sets. |
3041 | 3746 |
3042 Graphic character sets are classified into one of four types, | 3747 Graphic character sets are classified into one of the following |
3043 according to the dimension and number of characters in the set: | 3748 four classes, according to the number of bytes (DIMENSION) and |
3044 TYPE94, TYPE96, TYPE94x94, and TYPE96x96. In addition, each | 3749 number of characters in one dimension (CHARS) of the set: |
3045 character set is assigned an identification byte, unique for each | 3750 - DIMENSION1_CHARS94 |
3046 type, called "final character" (denoted as <F> hereafter). The <F> | 3751 - DIMENSION1_CHARS96 |
3047 of each character set is decided by ECMA(*) when it is registered | 3752 - DIMENSION2_CHARS94 |
3048 in ISO. Code range of <F> is 0x30..0x7F (0x30..0x3F are for | 3753 - DIMENSION2_CHARS96 |
3049 private use only). | 3754 |
3755 In addition, each character set is assigned an identification tag, | |
3756 unique for each set, called "final character" (denoted as <F> | |
3757 hereafter). The <F> of each character set is decided by ECMA(*) | |
3758 when it is registered in ISO. The code range of <F> is 0x30..0x7F | |
3759 (0x30..0x3F are for private use only). | |
3050 | 3760 |
3051 Note (*): ECMA = European Computer Manufacturers Association | 3761 Note (*): ECMA = European Computer Manufacturers Association |
3052 | 3762 |
3053 Here are examples of graphic character set [NAME(<F>)]: | 3763 Here are examples of graphic character set [NAME(<F>)]: |
3054 o TYPE94 -- ASCII('B'), right-half-of-JISX0201('I'), ... | 3764 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ... |
3055 o TYPE96 -- right-half-of-ISO8859-1('A'), ... | 3765 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ... |
3056 o TYPE94x94 -- GB2312('A'), JISX0208('B'), ... | 3766 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ... |
3057 o TYPE96x96 -- none for the moment | 3767 o DIMENSION2_CHARS96 -- none for the moment |
3058 | 3768 |
3059 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR. | 3769 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR. |
3060 C0 [0x00..0x1F] -- control character plane 0 | 3770 C0 [0x00..0x1F] -- control character plane 0 |
3061 GL [0x20..0x7F] -- graphic character plane 0 | 3771 GL [0x20..0x7F] -- graphic character plane 0 |
3062 C1 [0x80..0x9F] -- control character plane 1 | 3772 C1 [0x80..0x9F] -- control character plane 1 |
3063 GR [0xA0..0xFF] -- graphic character plane 1 | 3773 GR [0xA0..0xFF] -- graphic character plane 1 |
3064 | 3774 |
3078 done independently. The most common case is that G0 is invoked to | 3788 done independently. The most common case is that G0 is invoked to |
3079 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually | 3789 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually |
3080 these invocations and designations are omitted in encoded text. | 3790 these invocations and designations are omitted in encoded text. |
3081 In a 7-bit environment, only GL can be used. | 3791 In a 7-bit environment, only GL can be used. |
3082 | 3792 |
3083 When a graphic character set of TYPE94 or TYPE94x94 is invoked to | 3793 When a graphic character set of CHARS94 is invoked to GL, codes |
3084 GL, codes 0x20 and 0x7F of the GL area work as control characters | 3794 0x20 and 0x7F of the GL area work as control characters SPACE and |
3085 SPACE and DEL respectively, and code 0xA0 and 0xFF of GR area | 3795 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not |
3086 should not be used. | 3796 be used. |
3087 | 3797 |
3088 There are two ways of invocation: locking-shift and single-shift. | 3798 There are two ways of invocation: locking-shift and single-shift. |
3089 With locking-shift, the invocation lasts until the next different | 3799 With locking-shift, the invocation lasts until the next different |
3090 invocation, whereas with single-shift, the invocation works only | 3800 invocation, whereas with single-shift, the invocation affects the |
3091 for the following character and doesn't affect locking-shift. | 3801 following character only and doesn't affect the locking-shift |
3092 Invocations are done by the following control characters or escape | 3802 state. Invocations are done by the following control characters or |
3093 sequences. | 3803 escape sequences: |
3094 | 3804 |
3095 ---------------------------------------------------------------------- | 3805 ---------------------------------------------------------------------- |
3096 abbrev function cntrl escape seq description | 3806 abbrev function cntrl escape seq description |
3097 ---------------------------------------------------------------------- | 3807 ---------------------------------------------------------------------- |
3098 SI/LS0 (shift-in) 0x0F none invoke G0 into GL | 3808 SI/LS0 (shift-in) 0x0F none invoke G0 into GL |
3099 SO/LS1 (shift-out) 0x0E none invoke G1 into GL | 3809 SO/LS1 (shift-out) 0x0E none invoke G1 into GL |
3100 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR | |
3101 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL | 3810 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL |
3102 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR | |
3103 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL | 3811 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL |
3104 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR | 3812 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*) |
3813 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*) | |
3814 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*) | |
3105 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char | 3815 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char |
3106 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char | 3816 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char |
3107 ---------------------------------------------------------------------- | 3817 ---------------------------------------------------------------------- |
3108 The first four are for locking-shift. Control characters for these | 3818 (*) These are not used by any known coding system. |
3109 functions are defined by macros ISO_CODE_XXX in `coding.h'. | 3819 |
3110 | 3820 Control characters for these functions are defined by macros |
3111 Designations are done by the following escape sequences. | 3821 ISO_CODE_XXX in `coding.h'. |
3822 | |
3823 Designations are done by the following escape sequences: | |
3112 ---------------------------------------------------------------------- | 3824 ---------------------------------------------------------------------- |
3113 escape sequence description | 3825 escape sequence description |
3114 ---------------------------------------------------------------------- | 3826 ---------------------------------------------------------------------- |
3115 ESC '(' <F> designate TYPE94<F> to G0 | 3827 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0 |
3116 ESC ')' <F> designate TYPE94<F> to G1 | 3828 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1 |
3117 ESC '*' <F> designate TYPE94<F> to G2 | 3829 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2 |
3118 ESC '+' <F> designate TYPE94<F> to G3 | 3830 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3 |
3119 ESC ',' <F> designate TYPE96<F> to G0 (*) | 3831 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*) |
3120 ESC '-' <F> designate TYPE96<F> to G1 | 3832 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1 |
3121 ESC '.' <F> designate TYPE96<F> to G2 | 3833 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2 |
3122 ESC '/' <F> designate TYPE96<F> to G3 | 3834 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3 |
3123 ESC '$' '(' <F> designate TYPE94x94<F> to G0 (**) | 3835 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**) |
3124 ESC '$' ')' <F> designate TYPE94x94<F> to G1 | 3836 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1 |
3125 ESC '$' '*' <F> designate TYPE94x94<F> to G2 | 3837 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2 |
3126 ESC '$' '+' <F> designate TYPE94x94<F> to G3 | 3838 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3 |
3127 ESC '$' ',' <F> designate TYPE96x96<F> to G0 (*) | 3839 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*) |
3128 ESC '$' '-' <F> designate TYPE96x96<F> to G1 | 3840 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1 |
3129 ESC '$' '.' <F> designate TYPE96x96<F> to G2 | 3841 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2 |
3130 ESC '$' '/' <F> designate TYPE96x96<F> to G3 | 3842 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3 |
3131 ---------------------------------------------------------------------- | 3843 ---------------------------------------------------------------------- |
3132 In this list, "TYPE94<F>" means a graphic character set of type TYPE94 | 3844 |
3133 and final character <F>, and etc. | 3845 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set |
3846 of dimension 1, chars 94, and final character <F>, etc... | |
3134 | 3847 |
3135 Note (*): Although these designations are not allowed in ISO2022, | 3848 Note (*): Although these designations are not allowed in ISO2022, |
3136 Emacs accepts them on decoding, and produces them on encoding | 3849 Emacs accepts them on decoding, and produces them on encoding |
3137 TYPE96 or TYPE96x96 character set in a coding system which is | 3850 CHARS96 character sets in a coding system which is characterized as |
3138 characterized as 7-bit environment, non-locking-shift, and | 3851 7-bit environment, non-locking-shift, and non-single-shift. |
3139 non-single-shift. | |
3140 | 3852 |
3141 Note (**): If <F> is '@', 'A', or 'B', the intermediate character | 3853 Note (**): If <F> is '@', 'A', or 'B', the intermediate character |
3142 '(' can be omitted. We call this as "short-form" here after. | 3854 '(' can be omitted. We refer to this as "short-form" hereafter. |
3143 | 3855 |
3144 Now you may notice that there are a lot of ways for encoding the | 3856 Now you may notice that there are a lot of ways for encoding the |
3145 same multilingual text in ISO2022. Actually, there exist many | 3857 same multilingual text in ISO2022. Actually, there exist many |
3146 coding systems such as Compound Text (used in X's inter client | 3858 coding systems such as Compound Text (used in X11's inter client |
3147 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR | 3859 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR |
3148 (used in Korean internet), EUC (Extended UNIX Code, used in Asian | 3860 (used in Korean internet), EUC (Extended UNIX Code, used in Asian |
3149 localized platforms), and all of these are variants of ISO2022. | 3861 localized platforms), and all of these are variants of ISO2022. |
3150 | 3862 |
3151 In addition to the above, Emacs handles two more kinds of escape | 3863 In addition to the above, Emacs handles two more kinds of escape |
3152 sequences: ISO6429's direction specification and Emacs' private | 3864 sequences: ISO6429's direction specification and Emacs' private |
3153 sequence for specifying character composition. | 3865 sequence for specifying character composition. |
3154 | 3866 |
3155 ISO6429's direction specification takes the following format: | 3867 ISO6429's direction specification takes the following form: |
3156 o CSI ']' -- end of the current direction | 3868 o CSI ']' -- end of the current direction |
3157 o CSI '0' ']' -- end of the current direction | 3869 o CSI '0' ']' -- end of the current direction |
3158 o CSI '1' ']' -- start of left-to-right text | 3870 o CSI '1' ']' -- start of left-to-right text |
3159 o CSI '2' ']' -- start of right-to-left text | 3871 o CSI '2' ']' -- start of right-to-left text |
3160 The control character CSI (0x9B: control sequence introducer) is | 3872 The control character CSI (0x9B: control sequence introducer) is |
3161 abbreviated to the escape sequence ESC '[' in 7-bit environment. | 3873 abbreviated to the escape sequence ESC '[' in a 7-bit environment. |
3162 | 3874 |
3163 Character composition specification takes the following format: | 3875 Character composition specification takes the following form: |
3164 o ESC '0' -- start character composition | 3876 o ESC '0' -- start character composition |
3165 o ESC '1' -- end character composition | 3877 o ESC '1' -- end character composition |
3166 Since these are not standard escape sequences of any ISO, the use | 3878 Since these are not standard escape sequences of any ISO standard, |
3167 of them for these meanings is restricted to Emacs only. */ | 3879 their use with these meanings is restricted to Emacs only. */ |
3168 | 3880 |
3169 static void | 3881 static void |
3170 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso) | 3882 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso) |
3171 { | 3883 { |
3172 int i; | 3884 int i; |
3186 iso->register_right = 1; | 3898 iso->register_right = 1; |
3187 iso->switched_dir_and_no_valid_charset_yet = 0; | 3899 iso->switched_dir_and_no_valid_charset_yet = 0; |
3188 iso->invalid_switch_dir = 0; | 3900 iso->invalid_switch_dir = 0; |
3189 iso->output_direction_sequence = 0; | 3901 iso->output_direction_sequence = 0; |
3190 iso->output_literally = 0; | 3902 iso->output_literally = 0; |
3903 #ifdef ENABLE_COMPOSITE_CHARS | |
3191 if (iso->composite_chars) | 3904 if (iso->composite_chars) |
3192 Dynarr_reset (iso->composite_chars); | 3905 Dynarr_reset (iso->composite_chars); |
3906 #endif | |
3193 } | 3907 } |
3194 | 3908 |
3195 static int | 3909 static int |
3196 fit_to_be_escape_quoted (unsigned char c) | 3910 fit_to_be_escape_quoted (unsigned char c) |
3197 { | 3911 { |
3315 goto locking_shift; | 4029 goto locking_shift; |
3316 case '|': /* locking shift 3 right */ | 4030 case '|': /* locking shift 3 right */ |
3317 reg = 3; half = 1; | 4031 reg = 3; half = 1; |
3318 goto locking_shift; | 4032 goto locking_shift; |
3319 | 4033 |
4034 #ifdef ENABLE_COMPOSITE_CHARS | |
3320 /**** composite ****/ | 4035 /**** composite ****/ |
3321 | 4036 |
3322 case '0': | 4037 case '0': |
3323 iso->esc = ISO_ESC_START_COMPOSITE; | 4038 iso->esc = ISO_ESC_START_COMPOSITE; |
3324 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | | 4039 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | |
3328 case '1': | 4043 case '1': |
3329 iso->esc = ISO_ESC_END_COMPOSITE; | 4044 iso->esc = ISO_ESC_END_COMPOSITE; |
3330 *flags = (*flags & CODING_STATE_ISO2022_LOCK) & | 4045 *flags = (*flags & CODING_STATE_ISO2022_LOCK) & |
3331 ~CODING_STATE_COMPOSITE; | 4046 ~CODING_STATE_COMPOSITE; |
3332 return 1; | 4047 return 1; |
4048 #endif /* ENABLE_COMPOSITE_CHARS */ | |
3333 | 4049 |
3334 /**** directionality ****/ | 4050 /**** directionality ****/ |
3335 | 4051 |
3336 case '[': | 4052 case '[': |
3337 iso->esc = ISO_ESC_5_11; | 4053 iso->esc = ISO_ESC_5_11; |
3711 Also update FLAGS if it is not a null pointer. | 4427 Also update FLAGS if it is not a null pointer. |
3712 If INTERNAL_P is set, we are outputting in internal format and | 4428 If INTERNAL_P is set, we are outputting in internal format and |
3713 need to handle the CSI differently. */ | 4429 need to handle the CSI differently. */ |
3714 | 4430 |
3715 static void | 4431 static void |
3716 restore_left_to_right_direction (struct Lisp_Coding_System *codesys, | 4432 restore_left_to_right_direction (Lisp_Coding_System *codesys, |
3717 unsigned_char_dynarr *dst, | 4433 unsigned_char_dynarr *dst, |
3718 unsigned int *flags, | 4434 unsigned int *flags, |
3719 int internal_p) | 4435 int internal_p) |
3720 { | 4436 { |
3721 if (!flags || (*flags & CODING_STATE_R2L)) | 4437 if (!flags || (*flags & CODING_STATE_R2L)) |
3742 sequence to DST. Also update FLAGS if it is not a null pointer. | 4458 sequence to DST. Also update FLAGS if it is not a null pointer. |
3743 If INTERNAL_P is set, we are outputting in internal format and | 4459 If INTERNAL_P is set, we are outputting in internal format and |
3744 need to handle the CSI differently. */ | 4460 need to handle the CSI differently. */ |
3745 | 4461 |
3746 static void | 4462 static void |
3747 ensure_correct_direction (int direction, struct Lisp_Coding_System *codesys, | 4463 ensure_correct_direction (int direction, Lisp_Coding_System *codesys, |
3748 unsigned_char_dynarr *dst, unsigned int *flags, | 4464 unsigned_char_dynarr *dst, unsigned int *flags, |
3749 int internal_p) | 4465 int internal_p) |
3750 { | 4466 { |
3751 if ((!flags || (*flags & CODING_STATE_R2L)) && | 4467 if ((!flags || (*flags & CODING_STATE_R2L)) && |
3752 direction == CHARSET_LEFT_TO_RIGHT) | 4468 direction == CHARSET_LEFT_TO_RIGHT) |
3775 | 4491 |
3776 static void | 4492 static void |
3777 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src, | 4493 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src, |
3778 unsigned_char_dynarr *dst, unsigned int n) | 4494 unsigned_char_dynarr *dst, unsigned int n) |
3779 { | 4495 { |
3780 unsigned int flags, ch; | |
3781 enum eol_type eol_type; | |
3782 struct decoding_stream *str = DECODING_STREAM_DATA (decoding); | 4496 struct decoding_stream *str = DECODING_STREAM_DATA (decoding); |
4497 unsigned int flags = str->flags; | |
4498 unsigned int ch = str->ch; | |
4499 eol_type_t eol_type = str->eol_type; | |
4500 #ifdef ENABLE_COMPOSITE_CHARS | |
4501 unsigned_char_dynarr *real_dst = dst; | |
4502 #endif | |
3783 Lisp_Object coding_system; | 4503 Lisp_Object coding_system; |
3784 unsigned_char_dynarr *real_dst = dst; | 4504 |
3785 | |
3786 CODING_STREAM_DECOMPOSE (str, flags, ch); | |
3787 eol_type = str->eol_type; | |
3788 XSETCODING_SYSTEM (coding_system, str->codesys); | 4505 XSETCODING_SYSTEM (coding_system, str->codesys); |
3789 | 4506 |
4507 #ifdef ENABLE_COMPOSITE_CHARS | |
3790 if (flags & CODING_STATE_COMPOSITE) | 4508 if (flags & CODING_STATE_COMPOSITE) |
3791 dst = str->iso2022.composite_chars; | 4509 dst = str->iso2022.composite_chars; |
4510 #endif /* ENABLE_COMPOSITE_CHARS */ | |
3792 | 4511 |
3793 while (n--) | 4512 while (n--) |
3794 { | 4513 { |
3795 unsigned char c = *src++; | 4514 unsigned char c = *src++; |
3796 if (flags & CODING_STATE_ESCAPE) | 4515 if (flags & CODING_STATE_ESCAPE) |
3800 | 4519 |
3801 if (retval) | 4520 if (retval) |
3802 { | 4521 { |
3803 switch (str->iso2022.esc) | 4522 switch (str->iso2022.esc) |
3804 { | 4523 { |
4524 #ifdef ENABLE_COMPOSITE_CHARS | |
3805 case ISO_ESC_START_COMPOSITE: | 4525 case ISO_ESC_START_COMPOSITE: |
3806 if (str->iso2022.composite_chars) | 4526 if (str->iso2022.composite_chars) |
3807 Dynarr_reset (str->iso2022.composite_chars); | 4527 Dynarr_reset (str->iso2022.composite_chars); |
3808 else | 4528 else |
3809 str->iso2022.composite_chars = Dynarr_new (unsigned_char); | 4529 str->iso2022.composite_chars = Dynarr_new (unsigned_char); |
3818 dst = real_dst; | 4538 dst = real_dst; |
3819 len = set_charptr_emchar (comstr, emch); | 4539 len = set_charptr_emchar (comstr, emch); |
3820 Dynarr_add_many (dst, comstr, len); | 4540 Dynarr_add_many (dst, comstr, len); |
3821 break; | 4541 break; |
3822 } | 4542 } |
4543 #endif /* ENABLE_COMPOSITE_CHARS */ | |
3823 | 4544 |
3824 case ISO_ESC_LITERAL: | 4545 case ISO_ESC_LITERAL: |
3825 DECODE_ADD_BINARY_CHAR (c, dst); | 4546 DECODE_ADD_BINARY_CHAR (c, dst); |
3826 break; | 4547 break; |
3827 | 4548 |
3995 } | 4716 } |
3996 | 4717 |
3997 if (flags & CODING_STATE_END) | 4718 if (flags & CODING_STATE_END) |
3998 DECODE_OUTPUT_PARTIAL_CHAR (ch); | 4719 DECODE_OUTPUT_PARTIAL_CHAR (ch); |
3999 | 4720 |
4000 CODING_STREAM_COMPOSE (str, flags, ch); | 4721 str->flags = flags; |
4722 str->ch = ch; | |
4001 } | 4723 } |
4002 | 4724 |
4003 | 4725 |
4004 /***** ISO2022 encoder *****/ | 4726 /***** ISO2022 encoder *****/ |
4005 | 4727 |
4007 | 4729 |
4008 static void | 4730 static void |
4009 iso2022_designate (Lisp_Object charset, unsigned char reg, | 4731 iso2022_designate (Lisp_Object charset, unsigned char reg, |
4010 struct encoding_stream *str, unsigned_char_dynarr *dst) | 4732 struct encoding_stream *str, unsigned_char_dynarr *dst) |
4011 { | 4733 { |
4012 CONST char *inter94 = "()*+", *inter96= ",-./"; | 4734 static CONST char inter94[] = "()*+"; |
4735 static CONST char inter96[] = ",-./"; | |
4013 unsigned int type; | 4736 unsigned int type; |
4014 unsigned char final; | 4737 unsigned char final; |
4015 Lisp_Object old_charset = str->iso2022.charset[reg]; | 4738 Lisp_Object old_charset = str->iso2022.charset[reg]; |
4016 | 4739 |
4017 str->iso2022.charset[reg] = charset; | 4740 str->iso2022.charset[reg] = charset; |
4095 static void | 4818 static void |
4096 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src, | 4819 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src, |
4097 unsigned_char_dynarr *dst, unsigned int n) | 4820 unsigned_char_dynarr *dst, unsigned int n) |
4098 { | 4821 { |
4099 unsigned char charmask, c; | 4822 unsigned char charmask, c; |
4100 unsigned int flags, ch; | |
4101 enum eol_type eol_type; | |
4102 unsigned char char_boundary; | 4823 unsigned char char_boundary; |
4103 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); | 4824 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); |
4104 struct Lisp_Coding_System *codesys = str->codesys; | 4825 unsigned int flags = str->flags; |
4826 unsigned int ch = str->ch; | |
4827 Lisp_Coding_System *codesys = str->codesys; | |
4828 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); | |
4105 int i; | 4829 int i; |
4106 Lisp_Object charset; | 4830 Lisp_Object charset; |
4107 int half; | 4831 int half; |
4108 | 4832 |
4833 #ifdef ENABLE_COMPOSITE_CHARS | |
4109 /* flags for handling composite chars. We do a little switcharoo | 4834 /* flags for handling composite chars. We do a little switcharoo |
4110 on the source while we're outputting the composite char. */ | 4835 on the source while we're outputting the composite char. */ |
4111 unsigned int saved_n = 0; | 4836 unsigned int saved_n = 0; |
4112 CONST unsigned char *saved_src = NULL; | 4837 CONST unsigned char *saved_src = NULL; |
4113 int in_composite = 0; | 4838 int in_composite = 0; |
4114 | 4839 #endif /* ENABLE_COMPOSITE_CHARS */ |
4115 CODING_STREAM_DECOMPOSE (str, flags, ch); | 4840 |
4116 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); | |
4117 char_boundary = str->iso2022.current_char_boundary; | 4841 char_boundary = str->iso2022.current_char_boundary; |
4118 charset = str->iso2022.current_charset; | 4842 charset = str->iso2022.current_charset; |
4119 half = str->iso2022.current_half; | 4843 half = str->iso2022.current_half; |
4120 | 4844 |
4845 #ifdef ENABLE_COMPOSITE_CHARS | |
4121 back_to_square_n: | 4846 back_to_square_n: |
4847 #endif | |
4122 while (n--) | 4848 while (n--) |
4123 { | 4849 { |
4124 c = *src++; | 4850 c = *src++; |
4125 | 4851 |
4126 if (BYTE_ASCII_P (c)) | 4852 if (BYTE_ASCII_P (c)) |
4175 ch = 0; | 4901 ch = 0; |
4176 charset = CHARSET_BY_LEADING_BYTE (c); | 4902 charset = CHARSET_BY_LEADING_BYTE (c); |
4177 if (LEADING_BYTE_PREFIX_P(c)) | 4903 if (LEADING_BYTE_PREFIX_P(c)) |
4178 ch = c; | 4904 ch = c; |
4179 else if (!EQ (charset, Vcharset_control_1) | 4905 else if (!EQ (charset, Vcharset_control_1) |
4180 && !EQ (charset, Vcharset_composite)) | 4906 #ifdef ENABLE_COMPOSITE_CHARS |
4907 && !EQ (charset, Vcharset_composite) | |
4908 #endif | |
4909 ) | |
4181 { | 4910 { |
4182 int reg; | 4911 int reg; |
4183 | 4912 |
4184 ensure_correct_direction (XCHARSET_DIRECTION (charset), | 4913 ensure_correct_direction (XCHARSET_DIRECTION (charset), |
4185 codesys, dst, &flags, 0); | 4914 codesys, dst, &flags, 0); |
4295 Dynarr_add (dst, c & charmask); | 5024 Dynarr_add (dst, c & charmask); |
4296 ch = 0; | 5025 ch = 0; |
4297 } | 5026 } |
4298 else if (ch) | 5027 else if (ch) |
4299 { | 5028 { |
5029 #ifdef ENABLE_COMPOSITE_CHARS | |
4300 if (EQ (charset, Vcharset_composite)) | 5030 if (EQ (charset, Vcharset_composite)) |
4301 { | 5031 { |
4302 if (in_composite) | 5032 if (in_composite) |
4303 { | 5033 { |
4304 /* #### Bother! We don't know how to | 5034 /* #### Bother! We don't know how to |
4318 Dynarr_add (dst, ISO_CODE_ESC); | 5048 Dynarr_add (dst, ISO_CODE_ESC); |
4319 Dynarr_add (dst, '0'); /* start composing */ | 5049 Dynarr_add (dst, '0'); /* start composing */ |
4320 } | 5050 } |
4321 } | 5051 } |
4322 else | 5052 else |
5053 #endif /* ENABLE_COMPOSITE_CHARS */ | |
4323 { | 5054 { |
4324 Dynarr_add (dst, ch & charmask); | 5055 Dynarr_add (dst, ch & charmask); |
4325 Dynarr_add (dst, c & charmask); | 5056 Dynarr_add (dst, c & charmask); |
4326 } | 5057 } |
4327 ch = 0; | 5058 ch = 0; |
4350 } | 5081 } |
4351 } | 5082 } |
4352 } | 5083 } |
4353 } | 5084 } |
4354 | 5085 |
5086 #ifdef ENABLE_COMPOSITE_CHARS | |
4355 if (in_composite) | 5087 if (in_composite) |
4356 { | 5088 { |
4357 n = saved_n; | 5089 n = saved_n; |
4358 src = saved_src; | 5090 src = saved_src; |
4359 in_composite = 0; | 5091 in_composite = 0; |
4360 Dynarr_add (dst, ISO_CODE_ESC); | 5092 Dynarr_add (dst, ISO_CODE_ESC); |
4361 Dynarr_add (dst, '1'); /* end composing */ | 5093 Dynarr_add (dst, '1'); /* end composing */ |
4362 goto back_to_square_n; /* Wheeeeeeeee ..... */ | 5094 goto back_to_square_n; /* Wheeeeeeeee ..... */ |
4363 } | 5095 } |
5096 #endif /* ENABLE_COMPOSITE_CHARS */ | |
4364 | 5097 |
4365 if (char_boundary && flags & CODING_STATE_END) | 5098 if (char_boundary && flags & CODING_STATE_END) |
4366 { | 5099 { |
4367 restore_left_to_right_direction (codesys, dst, &flags, 0); | 5100 restore_left_to_right_direction (codesys, dst, &flags, 0); |
4368 ensure_normal_shift (str, dst); | 5101 ensure_normal_shift (str, dst); |
4372 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i); | 5105 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i); |
4373 iso2022_designate (initial_charset, i, str, dst); | 5106 iso2022_designate (initial_charset, i, str, dst); |
4374 } | 5107 } |
4375 } | 5108 } |
4376 | 5109 |
4377 CODING_STREAM_COMPOSE (str, flags, ch); | 5110 str->flags = flags; |
5111 str->ch = ch; | |
4378 str->iso2022.current_char_boundary = char_boundary; | 5112 str->iso2022.current_char_boundary = char_boundary; |
4379 str->iso2022.current_charset = charset; | 5113 str->iso2022.current_charset = charset; |
4380 str->iso2022.current_half = half; | 5114 str->iso2022.current_half = half; |
4381 | 5115 |
4382 /* Verbum caro factum est! */ | 5116 /* Verbum caro factum est! */ |
4393 static void | 5127 static void |
4394 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src, | 5128 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src, |
4395 unsigned_char_dynarr *dst, unsigned int n) | 5129 unsigned_char_dynarr *dst, unsigned int n) |
4396 { | 5130 { |
4397 unsigned char c; | 5131 unsigned char c; |
4398 unsigned int flags, ch; | |
4399 enum eol_type eol_type; | |
4400 struct decoding_stream *str = DECODING_STREAM_DATA (decoding); | 5132 struct decoding_stream *str = DECODING_STREAM_DATA (decoding); |
4401 | 5133 unsigned int flags = str->flags; |
4402 CODING_STREAM_DECOMPOSE (str, flags, ch); | 5134 unsigned int ch = str->ch; |
4403 eol_type = str->eol_type; | 5135 eol_type_t eol_type = str->eol_type; |
4404 | 5136 |
4405 while (n--) | 5137 while (n--) |
4406 { | 5138 { |
4407 c = *src++; | 5139 c = *src++; |
4408 | 5140 |
4411 label_continue_loop:; | 5143 label_continue_loop:; |
4412 } | 5144 } |
4413 | 5145 |
4414 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst); | 5146 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst); |
4415 | 5147 |
4416 CODING_STREAM_COMPOSE (str, flags, ch); | 5148 str->flags = flags; |
5149 str->ch = ch; | |
4417 } | 5150 } |
4418 | 5151 |
4419 static void | 5152 static void |
4420 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src, | 5153 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src, |
4421 unsigned_char_dynarr *dst, unsigned int n) | 5154 unsigned_char_dynarr *dst, unsigned int n) |
4422 { | 5155 { |
4423 unsigned char c; | 5156 unsigned char c; |
4424 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); | 5157 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); |
4425 unsigned int flags, ch; | 5158 unsigned int flags = str->flags; |
4426 enum eol_type eol_type; | 5159 unsigned int ch = str->ch; |
4427 | 5160 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); |
4428 CODING_STREAM_DECOMPOSE (str, flags, ch); | |
4429 eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); | |
4430 | 5161 |
4431 while (n--) | 5162 while (n--) |
4432 { | 5163 { |
4433 c = *src++; | 5164 c = *src++; |
4434 if (c == '\n') | 5165 if (c == '\n') |
4466 untranslatable character, so ignore it */ | 5197 untranslatable character, so ignore it */ |
4467 ch = 0; | 5198 ch = 0; |
4468 } | 5199 } |
4469 } | 5200 } |
4470 | 5201 |
4471 CODING_STREAM_COMPOSE (str, flags, ch); | 5202 str->flags = flags; |
5203 str->ch = ch; | |
4472 } | 5204 } |
4473 | 5205 |
4474 | 5206 |
4475 /************************************************************************/ | 5207 /************************************************************************/ |
4476 /* Simple internal/external functions */ | 5208 /* Simple internal/external functions */ |
4668 #ifdef MULE | 5400 #ifdef MULE |
4669 DEFSUBR (Fdecode_shift_jis_char); | 5401 DEFSUBR (Fdecode_shift_jis_char); |
4670 DEFSUBR (Fencode_shift_jis_char); | 5402 DEFSUBR (Fencode_shift_jis_char); |
4671 DEFSUBR (Fdecode_big5_char); | 5403 DEFSUBR (Fdecode_big5_char); |
4672 DEFSUBR (Fencode_big5_char); | 5404 DEFSUBR (Fencode_big5_char); |
5405 DEFSUBR (Fset_ucs_char); | |
5406 DEFSUBR (Fucs_char); | |
5407 DEFSUBR (Fset_char_ucs); | |
5408 DEFSUBR (Fchar_ucs); | |
4673 #endif /* MULE */ | 5409 #endif /* MULE */ |
4674 defsymbol (&Qcoding_system_p, "coding-system-p"); | 5410 defsymbol (&Qcoding_system_p, "coding-system-p"); |
4675 defsymbol (&Qno_conversion, "no-conversion"); | 5411 defsymbol (&Qno_conversion, "no-conversion"); |
4676 #ifdef MULE | 5412 #ifdef MULE |
4677 defsymbol (&Qbig5, "big5"); | 5413 defsymbol (&Qbig5, "big5"); |
4678 defsymbol (&Qshift_jis, "shift-jis"); | 5414 defsymbol (&Qshift_jis, "shift-jis"); |
5415 defsymbol (&Qucs4, "ucs-4"); | |
5416 defsymbol (&Qutf8, "utf-8"); | |
4679 defsymbol (&Qccl, "ccl"); | 5417 defsymbol (&Qccl, "ccl"); |
4680 defsymbol (&Qiso2022, "iso2022"); | 5418 defsymbol (&Qiso2022, "iso2022"); |
4681 #endif /* MULE */ | 5419 #endif /* MULE */ |
4682 defsymbol (&Qmnemonic, "mnemonic"); | 5420 defsymbol (&Qmnemonic, "mnemonic"); |
4683 defsymbol (&Qeol_type, "eol-type"); | 5421 defsymbol (&Qeol_type, "eol-type"); |
4717 defsymbol (&Qctext, "ctext"); | 5455 defsymbol (&Qctext, "ctext"); |
4718 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS], | 5456 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS], |
4719 "shift-jis"); | 5457 "shift-jis"); |
4720 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5], | 5458 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5], |
4721 "big5"); | 5459 "big5"); |
5460 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4], | |
5461 "ucs-4"); | |
5462 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8], | |
5463 "utf-8"); | |
4722 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7], | 5464 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7], |
4723 "iso-7"); | 5465 "iso-7"); |
4724 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE], | 5466 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE], |
4725 "iso-8-designate"); | 5467 "iso-8-designate"); |
4726 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1], | 5468 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1], |
4870 Qbinary); | 5612 Qbinary); |
4871 | 5613 |
4872 /* Need this for bootstrapping */ | 5614 /* Need this for bootstrapping */ |
4873 coding_category_system[CODING_CATEGORY_NO_CONVERSION] = | 5615 coding_category_system[CODING_CATEGORY_NO_CONVERSION] = |
4874 Fget_coding_system (Qno_conversion); | 5616 Fget_coding_system (Qno_conversion); |
4875 } | 5617 |
5618 #ifdef MULE | |
5619 { | |
5620 unsigned int i; | |
5621 | |
5622 for (i = 0; i < 65536; i++) | |
5623 ucs_to_mule_table[i] = Qnil; | |
5624 } | |
5625 staticpro (&mule_to_ucs_table); | |
5626 mule_to_ucs_table = Fmake_char_table(Qgeneric); | |
5627 #endif /* MULE */ | |
5628 } |