Mercurial > hg > xemacs-beta
comparison src/mule-coding.c @ 3439:d1754e7f0cea
[xemacs-hg @ 2006-06-03 17:50:39 by aidan]
Just-in-time Unicode code point support.
author | aidan |
---|---|
date | Sat, 03 Jun 2006 17:51:06 +0000 |
parents | 96ec8f16af45 |
children | 42e4605ef1de |
comparison
equal
deleted
inserted
replaced
3438:14fbcab7c67b | 3439:d1754e7f0cea |
---|---|
92 | 92 |
93 inline static int | 93 inline static int |
94 byte_shift_jis_katakana_p (int c) | 94 byte_shift_jis_katakana_p (int c) |
95 { | 95 { |
96 return c >= 0xA1 && c <= 0xDF; | 96 return c >= 0xA1 && c <= 0xDF; |
97 } | |
98 | |
99 inline static void | |
100 dynarr_add_2022_one_dimension (Lisp_Object charset, Ibyte c, | |
101 unsigned char charmask, | |
102 unsigned_char_dynarr *dst) | |
103 { | |
104 if (XCHARSET_ENCODE_AS_UTF_8 (charset)) | |
105 { | |
106 encode_unicode_char (charset, c & charmask, 0, | |
107 dst, UNICODE_UTF_8, 0); | |
108 } | |
109 else | |
110 { | |
111 Dynarr_add (dst, c & charmask); | |
112 } | |
113 } | |
114 | |
115 inline static void | |
116 dynarr_add_2022_two_dimensions (Lisp_Object charset, Ibyte c, | |
117 unsigned int ch, | |
118 unsigned char charmask, | |
119 unsigned_char_dynarr *dst) | |
120 { | |
121 if (XCHARSET_ENCODE_AS_UTF_8 (charset)) | |
122 { | |
123 encode_unicode_char (charset, | |
124 ch & charmask, | |
125 c & charmask, dst, | |
126 UNICODE_UTF_8, 0); | |
127 } | |
128 else | |
129 { | |
130 Dynarr_add (dst, ch & charmask); | |
131 Dynarr_add (dst, c & charmask); | |
132 } | |
97 } | 133 } |
98 | 134 |
99 /* Convert Shift-JIS data to internal format. */ | 135 /* Convert Shift-JIS data to internal format. */ |
100 | 136 |
101 static Bytecount | 137 static Bytecount |
669 ISO_ESC_NOTHING, /* Nothing has been seen. */ | 705 ISO_ESC_NOTHING, /* Nothing has been seen. */ |
670 ISO_ESC, /* We've seen ESC. */ | 706 ISO_ESC, /* We've seen ESC. */ |
671 ISO_ESC_2_4, /* We've seen ESC $. This indicates | 707 ISO_ESC_2_4, /* We've seen ESC $. This indicates |
672 that we're designating a multi-byte, rather | 708 that we're designating a multi-byte, rather |
673 than a single-byte, character set. */ | 709 than a single-byte, character set. */ |
710 ISO_ESC_2_5, /* We've seen ESC %. This indicates an escape to a | |
711 Unicode coding system; the only one of these | |
712 we're prepared to deal with is UTF-8, which is | |
713 selected when the next character is G. */ | |
674 ISO_ESC_2_8, /* We've seen ESC 0x28, i.e. ESC (. | 714 ISO_ESC_2_8, /* We've seen ESC 0x28, i.e. ESC (. |
675 This means designate a 94-character | 715 This means designate a 94-character |
676 character set into G0. */ | 716 character set into G0. */ |
677 ISO_ESC_2_9, /* We've seen ESC 0x29 -- designate a | 717 ISO_ESC_2_9, /* We've seen ESC 0x29 -- designate a |
678 94-character character set into G1. */ | 718 94-character character set into G1. */ |
750 #define ISO_STATE_SS3 (1 << 4) | 790 #define ISO_STATE_SS3 (1 << 4) |
751 /* If set, we're currently processing a composite character (i.e. a | 791 /* If set, we're currently processing a composite character (i.e. a |
752 character constructed by overstriking two or more characters). */ | 792 character constructed by overstriking two or more characters). */ |
753 #define ISO_STATE_COMPOSITE (1 << 5) | 793 #define ISO_STATE_COMPOSITE (1 << 5) |
754 | 794 |
795 /* If set, we're processing UTF-8 encoded data within ISO-2022 | |
796 processing. */ | |
797 #define ISO_STATE_UTF_8 (1 << 6) | |
798 | |
755 /* ISO_STATE_LOCK is the mask of flags that remain on until explicitly | 799 /* ISO_STATE_LOCK is the mask of flags that remain on until explicitly |
756 turned off when in the ISO2022 encoder/decoder. Other flags are turned | 800 turned off when in the ISO2022 encoder/decoder. Other flags are turned |
757 off at the end of processing each character or escape sequence. */ | 801 off at the end of processing each character or escape sequence. */ |
758 # define ISO_STATE_LOCK \ | 802 # define ISO_STATE_LOCK \ |
759 (ISO_STATE_COMPOSITE | ISO_STATE_R2L) | 803 (ISO_STATE_COMPOSITE | ISO_STATE_R2L | ISO_STATE_UTF_8) |
760 | 804 |
761 typedef struct charset_conversion_spec | 805 typedef struct charset_conversion_spec |
762 { | 806 { |
763 Lisp_Object from_charset; | 807 Lisp_Object from_charset; |
764 Lisp_Object to_charset; | 808 Lisp_Object to_charset; |
920 /* Other state variables that need to be preserved across | 964 /* Other state variables that need to be preserved across |
921 invocations. */ | 965 invocations. */ |
922 Lisp_Object current_charset; | 966 Lisp_Object current_charset; |
923 int current_half; | 967 int current_half; |
924 int current_char_boundary; | 968 int current_char_boundary; |
969 | |
970 /* Used for handling UTF-8. */ | |
971 unsigned char counter; | |
925 }; | 972 }; |
926 | 973 |
927 static const struct memory_description ccs_description_1[] = | 974 static const struct memory_description ccs_description_1[] = |
928 { | 975 { |
929 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) }, | 976 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) }, |
1342 *flags &= ISO_STATE_LOCK; | 1389 *flags &= ISO_STATE_LOCK; |
1343 return 0; | 1390 return 0; |
1344 } | 1391 } |
1345 | 1392 |
1346 case ISO_ESC: | 1393 case ISO_ESC: |
1394 | |
1395 /* The only available ISO 2022 sequence in UTF-8 mode is ESC % @, to | |
1396 exit from it. If we see any other escape sequence, pass it through | |
1397 in the error handler. */ | |
1398 if (*flags & ISO_STATE_UTF_8 && '%' != c) | |
1399 { | |
1400 return 0; | |
1401 } | |
1402 | |
1347 switch (c) | 1403 switch (c) |
1348 { | 1404 { |
1349 /**** single shift ****/ | 1405 /**** single shift ****/ |
1350 | 1406 |
1351 case 'N': /* single shift 2 */ | 1407 case 'N': /* single shift 2 */ |
1409 | 1465 |
1410 case '$': /* multibyte charset prefix */ | 1466 case '$': /* multibyte charset prefix */ |
1411 iso->esc = ISO_ESC_2_4; | 1467 iso->esc = ISO_ESC_2_4; |
1412 goto not_done; | 1468 goto not_done; |
1413 | 1469 |
1470 case '%': /* Prefix to an escape to or from Unicode. */ | |
1471 iso->esc = ISO_ESC_2_5; | |
1472 goto not_done; | |
1473 | |
1414 default: | 1474 default: |
1415 if (0x28 <= c && c <= 0x2F) | 1475 if (0x28 <= c && c <= 0x2F) |
1416 { | 1476 { |
1417 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8); | 1477 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8); |
1418 goto not_done; | 1478 goto not_done; |
1431 | 1491 |
1432 /* bzzzt! */ | 1492 /* bzzzt! */ |
1433 goto error; | 1493 goto error; |
1434 } | 1494 } |
1435 | 1495 |
1436 | 1496 /* ISO-IR 196 UTF-8 support. */ |
1437 | 1497 case ISO_ESC_2_5: |
1498 if ('G' == c) | |
1499 { | |
1500 /* Activate UTF-8 mode. */ | |
1501 *flags &= ISO_STATE_LOCK; | |
1502 *flags |= ISO_STATE_UTF_8; | |
1503 iso->esc = ISO_ESC_NOTHING; | |
1504 return 1; | |
1505 } | |
1506 else if ('@' == c) | |
1507 { | |
1508 /* Deactivate UTF-8 mode. */ | |
1509 *flags &= ISO_STATE_LOCK; | |
1510 *flags &= ~(ISO_STATE_UTF_8); | |
1511 iso->esc = ISO_ESC_NOTHING; | |
1512 return 1; | |
1513 } | |
1514 else | |
1515 { | |
1516 /* Oops, we don't support the other UTF-? coding systems within | |
1517 ISO 2022, only in their own context. */ | |
1518 goto error; | |
1519 } | |
1438 /**** directionality ****/ | 1520 /**** directionality ****/ |
1439 | 1521 |
1440 case ISO_ESC_5_11: /* ISO6429 direction control */ | 1522 case ISO_ESC_5_11: /* ISO6429 direction control */ |
1441 if (c == ']') | 1523 if (c == ']') |
1442 { | 1524 { |
1820 DECODE_ADD_BINARY_CHAR (c, dst); | 1902 DECODE_ADD_BINARY_CHAR (c, dst); |
1821 } | 1903 } |
1822 } | 1904 } |
1823 ch = 0; | 1905 ch = 0; |
1824 } | 1906 } |
1907 else if (flags & ISO_STATE_UTF_8) | |
1908 { | |
1909 unsigned char counter = data->counter; | |
1910 Ibyte work[MAX_ICHAR_LEN]; | |
1911 int len; | |
1912 Lisp_Object chr; | |
1913 | |
1914 if (ISO_CODE_ESC == c) | |
1915 { | |
1916 /* Allow the escape sequence parser to end the UTF-8 state. */ | |
1917 flags |= ISO_STATE_ESCAPE; | |
1918 data->esc = ISO_ESC; | |
1919 data->esc_bytes_index = 1; | |
1920 continue; | |
1921 } | |
1922 | |
1923 switch (counter) | |
1924 { | |
1925 case 0: | |
1926 if (c >= 0xfc) | |
1927 { | |
1928 ch = c & 0x01; | |
1929 counter = 5; | |
1930 } | |
1931 else if (c >= 0xf8) | |
1932 { | |
1933 ch = c & 0x03; | |
1934 counter = 4; | |
1935 } | |
1936 else if (c >= 0xf0) | |
1937 { | |
1938 ch = c & 0x07; | |
1939 counter = 3; | |
1940 } | |
1941 else if (c >= 0xe0) | |
1942 { | |
1943 ch = c & 0x0f; | |
1944 counter = 2; | |
1945 } | |
1946 else if (c >= 0xc0) | |
1947 { | |
1948 ch = c & 0x1f; | |
1949 counter = 1; | |
1950 } | |
1951 else | |
1952 /* ASCII, or the lower control characters. */ | |
1953 Dynarr_add (dst, c); | |
1954 | |
1955 break; | |
1956 case 1: | |
1957 ch = (ch << 6) | (c & 0x3f); | |
1958 chr = Funicode_to_char(make_int(ch), Qnil); | |
1959 | |
1960 if (!NILP (chr)) | |
1961 { | |
1962 assert(CHARP(chr)); | |
1963 len = set_itext_ichar (work, XCHAR(chr)); | |
1964 Dynarr_add_many (dst, work, len); | |
1965 } | |
1966 else | |
1967 { | |
1968 /* Shouldn't happen, this code should only be enabled in | |
1969 XEmacsen with support for all of Unicode. */ | |
1970 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208); | |
1971 Dynarr_add (dst, 34 + 128); | |
1972 Dynarr_add (dst, 46 + 128); | |
1973 } | |
1974 | |
1975 ch = 0; | |
1976 counter = 0; | |
1977 break; | |
1978 default: | |
1979 ch = (ch << 6) | (c & 0x3f); | |
1980 counter--; | |
1981 } | |
1982 | |
1983 if (str->eof) | |
1984 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst); | |
1985 | |
1986 data->counter = counter; | |
1987 } | |
1825 else if (byte_c0_p (c) || byte_c1_p (c)) | 1988 else if (byte_c0_p (c) || byte_c1_p (c)) |
1826 { /* Control characters */ | 1989 { /* Control characters */ |
1827 | 1990 |
1828 /***** Error-handling *****/ | 1991 /***** Error-handling *****/ |
1829 | 1992 |
2008 } | 2171 } |
2009 } | 2172 } |
2010 } | 2173 } |
2011 | 2174 |
2012 Dynarr_add (dst, ISO_CODE_ESC); | 2175 Dynarr_add (dst, ISO_CODE_ESC); |
2176 | |
2013 switch (type) | 2177 switch (type) |
2014 { | 2178 { |
2015 case CHARSET_TYPE_94: | 2179 case CHARSET_TYPE_94: |
2016 Dynarr_add (dst, inter94[reg]); | 2180 Dynarr_add (dst, inter94[reg]); |
2017 break; | 2181 break; |
2100 | 2264 |
2101 if (byte_ascii_p (c)) | 2265 if (byte_ascii_p (c)) |
2102 { /* Processing ASCII character */ | 2266 { /* Processing ASCII character */ |
2103 ch = 0; | 2267 ch = 0; |
2104 | 2268 |
2269 if (flags & ISO_STATE_UTF_8) | |
2270 { | |
2271 Dynarr_add (dst, ISO_CODE_ESC); | |
2272 Dynarr_add (dst, '%'); | |
2273 Dynarr_add (dst, '@'); | |
2274 flags &= ~(ISO_STATE_UTF_8); | |
2275 } | |
2276 | |
2105 restore_left_to_right_direction (codesys, dst, &flags, 0); | 2277 restore_left_to_right_direction (codesys, dst, &flags, 0); |
2106 | 2278 |
2107 /* Make sure G0 contains ASCII */ | 2279 /* Make sure G0 contains ASCII */ |
2108 if ((c > ' ' && c < ISO_CODE_DEL) || | 2280 if ((c > ' ' && c < ISO_CODE_DEL) || |
2109 !XCODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys)) | 2281 !XCODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys)) |
2143 && fit_to_be_escape_quoted (c)) | 2315 && fit_to_be_escape_quoted (c)) |
2144 Dynarr_add (dst, ISO_CODE_ESC); | 2316 Dynarr_add (dst, ISO_CODE_ESC); |
2145 Dynarr_add (dst, c); | 2317 Dynarr_add (dst, c); |
2146 char_boundary = 1; | 2318 char_boundary = 1; |
2147 } | 2319 } |
2148 | |
2149 else if (ibyte_leading_byte_p (c) || ibyte_leading_byte_p (ch)) | 2320 else if (ibyte_leading_byte_p (c) || ibyte_leading_byte_p (ch)) |
2150 { /* Processing Leading Byte */ | 2321 { /* Processing Leading Byte */ |
2151 ch = 0; | 2322 ch = 0; |
2152 charset = charset_by_leading_byte (c); | 2323 charset = charset_by_leading_byte (c); |
2153 if (leading_byte_prefix_p (c)) | 2324 if (leading_byte_prefix_p (c)) |
2154 ch = c; | 2325 { |
2326 ch = c; | |
2327 } | |
2328 else if (XCHARSET_ENCODE_AS_UTF_8 (charset)) | |
2329 { | |
2330 assert (!EQ (charset, Vcharset_control_1) | |
2331 && !EQ (charset, Vcharset_composite)); | |
2332 | |
2333 /* If the character set is to be encoded as UTF-8, the escape | |
2334 is always the same. */ | |
2335 if (!(flags & ISO_STATE_UTF_8)) | |
2336 { | |
2337 Dynarr_add (dst, ISO_CODE_ESC); | |
2338 Dynarr_add (dst, '%'); | |
2339 Dynarr_add (dst, 'G'); | |
2340 flags |= ISO_STATE_UTF_8; | |
2341 } | |
2342 } | |
2155 else if (!EQ (charset, Vcharset_control_1) | 2343 else if (!EQ (charset, Vcharset_control_1) |
2156 && !EQ (charset, Vcharset_composite)) | 2344 && !EQ (charset, Vcharset_composite)) |
2157 { | 2345 { |
2158 int reg; | 2346 int reg; |
2347 | |
2348 /* End the UTF-8 state. */ | |
2349 if (flags & ISO_STATE_UTF_8) | |
2350 { | |
2351 Dynarr_add (dst, ISO_CODE_ESC); | |
2352 Dynarr_add (dst, '%'); | |
2353 Dynarr_add (dst, '@'); | |
2354 flags &= ~(ISO_STATE_UTF_8); | |
2355 } | |
2159 | 2356 |
2160 ensure_correct_direction (XCHARSET_DIRECTION (charset), | 2357 ensure_correct_direction (XCHARSET_DIRECTION (charset), |
2161 codesys, dst, &flags, 0); | 2358 codesys, dst, &flags, 0); |
2162 | 2359 |
2163 /* Now determine which register to use. */ | 2360 /* Now determine which register to use. */ |
2272 else | 2469 else |
2273 { | 2470 { |
2274 switch (XCHARSET_REP_BYTES (charset)) | 2471 switch (XCHARSET_REP_BYTES (charset)) |
2275 { | 2472 { |
2276 case 2: | 2473 case 2: |
2277 Dynarr_add (dst, c & charmask); | 2474 dynarr_add_2022_one_dimension (charset, c, |
2475 charmask, dst); | |
2278 break; | 2476 break; |
2279 case 3: | 2477 case 3: |
2280 if (XCHARSET_PRIVATE_P (charset)) | 2478 if (XCHARSET_PRIVATE_P (charset)) |
2281 { | 2479 { |
2282 Dynarr_add (dst, c & charmask); | 2480 dynarr_add_2022_one_dimension (charset, c, |
2481 charmask, dst); | |
2283 ch = 0; | 2482 ch = 0; |
2284 } | 2483 } |
2285 else if (ch) | 2484 else if (ch) |
2286 { | 2485 { |
2287 #ifdef ENABLE_COMPOSITE_CHARS | 2486 #ifdef ENABLE_COMPOSITE_CHARS |
2288 if (EQ (charset, Vcharset_composite)) | 2487 if (EQ (charset, Vcharset_composite)) |
2289 { | 2488 { |
2489 /* #### Hasn't been written to handle composite | |
2490 characters yet. */ | |
2491 assert(!XCHARSET_ENCODE_AS_UTF_8 (charset)) | |
2290 if (in_composite) | 2492 if (in_composite) |
2291 { | 2493 { |
2292 /* #### Bother! We don't know how to | 2494 /* #### Bother! We don't know how to |
2293 handle this yet. */ | 2495 handle this yet. */ |
2294 Dynarr_add (dst, '~'); | 2496 Dynarr_add (dst, '~'); |
2308 } | 2510 } |
2309 } | 2511 } |
2310 else | 2512 else |
2311 #endif /* ENABLE_COMPOSITE_CHARS */ | 2513 #endif /* ENABLE_COMPOSITE_CHARS */ |
2312 { | 2514 { |
2313 Dynarr_add (dst, ch & charmask); | 2515 dynarr_add_2022_two_dimensions (charset, c, ch, |
2314 Dynarr_add (dst, c & charmask); | 2516 charmask, dst); |
2315 } | 2517 } |
2316 ch = 0; | 2518 ch = 0; |
2317 } | 2519 } |
2318 else | 2520 else |
2319 { | 2521 { |
2322 } | 2524 } |
2323 break; | 2525 break; |
2324 case 4: | 2526 case 4: |
2325 if (ch) | 2527 if (ch) |
2326 { | 2528 { |
2327 Dynarr_add (dst, ch & charmask); | 2529 dynarr_add_2022_two_dimensions (charset, c, ch, |
2328 Dynarr_add (dst, c & charmask); | 2530 charmask, dst); |
2329 ch = 0; | 2531 ch = 0; |
2330 } | 2532 } |
2331 else | 2533 else |
2332 { | 2534 { |
2333 ch = c; | 2535 ch = c; |