comparison src/mule-coding.c @ 3439:d1754e7f0cea

[xemacs-hg @ 2006-06-03 17:50:39 by aidan] Just-in-time Unicode code point support.
author aidan
date Sat, 03 Jun 2006 17:51:06 +0000
parents 96ec8f16af45
children 42e4605ef1de
comparison
equal deleted inserted replaced
3438:14fbcab7c67b 3439:d1754e7f0cea
92 92
/* Return nonzero if C lies in the inclusive range 0xA1..0xDF, the byte
   range this predicate treats as a Shift-JIS katakana byte; return 0
   otherwise. */
93 inline static int 93 inline static int
94 byte_shift_jis_katakana_p (int c) 94 byte_shift_jis_katakana_p (int c)
95 { 95 {
96 return c >= 0xA1 && c <= 0xDF; 96 return c >= 0xA1 && c <= 0xDF;
97 }
98
/* Emit the single position byte C of a character in CHARSET to DST.

   C is first masked with CHARMASK.  If CHARSET is flagged
   XCHARSET_ENCODE_AS_UTF_8, the masked byte is passed to
   encode_unicode_char with UNICODE_UTF_8 (and 0 as the second position
   argument), which writes to DST.  Otherwise the masked byte itself is
   appended to DST. */
99 inline static void
100 dynarr_add_2022_one_dimension (Lisp_Object charset, Ibyte c,
101 unsigned char charmask,
102 unsigned_char_dynarr *dst)
103 {
104 if (XCHARSET_ENCODE_AS_UTF_8 (charset))
105 {
106 encode_unicode_char (charset, c & charmask, 0,
107 dst, UNICODE_UTF_8, 0);
108 }
109 else
110 {
/* Plain ISO 2022 output: just the masked byte. */
111 Dynarr_add (dst, c & charmask);
112 }
113 }
114
/* Emit the two position bytes CH (first) and C (second) of a character
   in CHARSET to DST.

   Both bytes are masked with CHARMASK.  If CHARSET is flagged
   XCHARSET_ENCODE_AS_UTF_8, the masked pair is passed to
   encode_unicode_char with UNICODE_UTF_8, which writes to DST.
   Otherwise the two masked bytes are appended to DST, CH before C. */
115 inline static void
116 dynarr_add_2022_two_dimensions (Lisp_Object charset, Ibyte c,
117 unsigned int ch,
118 unsigned char charmask,
119 unsigned_char_dynarr *dst)
120 {
121 if (XCHARSET_ENCODE_AS_UTF_8 (charset))
122 {
123 encode_unicode_char (charset,
124 ch & charmask,
125 c & charmask, dst,
126 UNICODE_UTF_8, 0);
127 }
128 else
129 {
/* Plain ISO 2022 output: first byte CH, then second byte C. */
130 Dynarr_add (dst, ch & charmask);
131 Dynarr_add (dst, c & charmask);
132 }
97 } 133 }
98 134
99 /* Convert Shift-JIS data to internal format. */ 135 /* Convert Shift-JIS data to internal format. */
100 136
101 static Bytecount 137 static Bytecount
669 ISO_ESC_NOTHING, /* Nothing has been seen. */ 705 ISO_ESC_NOTHING, /* Nothing has been seen. */
670 ISO_ESC, /* We've seen ESC. */ 706 ISO_ESC, /* We've seen ESC. */
671 ISO_ESC_2_4, /* We've seen ESC $. This indicates 707 ISO_ESC_2_4, /* We've seen ESC $. This indicates
672 that we're designating a multi-byte, rather 708 that we're designating a multi-byte, rather
673 than a single-byte, character set. */ 709 than a single-byte, character set. */
710 ISO_ESC_2_5, /* We've seen ESC %. This indicates an escape to a
711 Unicode coding system; the only one of these
712 we're prepared to deal with is UTF-8, which has
713 the next character as G. */
674 ISO_ESC_2_8, /* We've seen ESC 0x28, i.e. ESC (. 714 ISO_ESC_2_8, /* We've seen ESC 0x28, i.e. ESC (.
675 This means designate a 94-character 715 This means designate a 94-character
676 character set into G0. */ 716 character set into G0. */
677 ISO_ESC_2_9, /* We've seen ESC 0x29 -- designate a 717 ISO_ESC_2_9, /* We've seen ESC 0x29 -- designate a
678 94-character character set into G1. */ 718 94-character character set into G1. */
750 #define ISO_STATE_SS3 (1 << 4) 790 #define ISO_STATE_SS3 (1 << 4)
751 /* If set, we're currently processing a composite character (i.e. a 791 /* If set, we're currently processing a composite character (i.e. a
752 character constructed by overstriking two or more characters). */ 792 character constructed by overstriking two or more characters). */
753 #define ISO_STATE_COMPOSITE (1 << 5) 793 #define ISO_STATE_COMPOSITE (1 << 5)
754 794
795 /* If set, we're processing UTF-8 encoded data within ISO-2022
796 processing. */
797 #define ISO_STATE_UTF_8 (1 << 6)
798
755 /* ISO_STATE_LOCK is the mask of flags that remain on until explicitly 799 /* ISO_STATE_LOCK is the mask of flags that remain on until explicitly
756 turned off when in the ISO2022 encoder/decoder. Other flags are turned 800 turned off when in the ISO2022 encoder/decoder. Other flags are turned
757 off at the end of processing each character or escape sequence. */ 801 off at the end of processing each character or escape sequence. */
758 # define ISO_STATE_LOCK \ 802 # define ISO_STATE_LOCK \
759 (ISO_STATE_COMPOSITE | ISO_STATE_R2L) 803 (ISO_STATE_COMPOSITE | ISO_STATE_R2L | ISO_STATE_UTF_8)
760 804
761 typedef struct charset_conversion_spec 805 typedef struct charset_conversion_spec
762 { 806 {
763 Lisp_Object from_charset; 807 Lisp_Object from_charset;
764 Lisp_Object to_charset; 808 Lisp_Object to_charset;
920 /* Other state variables that need to be preserved across 964 /* Other state variables that need to be preserved across
921 invocations. */ 965 invocations. */
922 Lisp_Object current_charset; 966 Lisp_Object current_charset;
923 int current_half; 967 int current_half;
924 int current_char_boundary; 968 int current_char_boundary;
969
970 /* Used for handling UTF-8. */
971 unsigned char counter;
925 }; 972 };
926 973
927 static const struct memory_description ccs_description_1[] = 974 static const struct memory_description ccs_description_1[] =
928 { 975 {
929 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) }, 976 { XD_LISP_OBJECT, offsetof (charset_conversion_spec, from_charset) },
1342 *flags &= ISO_STATE_LOCK; 1389 *flags &= ISO_STATE_LOCK;
1343 return 0; 1390 return 0;
1344 } 1391 }
1345 1392
1346 case ISO_ESC: 1393 case ISO_ESC:
1394
1395 /* The only available ISO 2022 sequence in UTF-8 mode is ESC % @, to
1396 exit from it. If we see any other escape sequence, pass it through
1397 in the error handler. */
1398 if (*flags & ISO_STATE_UTF_8 && '%' != c)
1399 {
1400 return 0;
1401 }
1402
1347 switch (c) 1403 switch (c)
1348 { 1404 {
1349 /**** single shift ****/ 1405 /**** single shift ****/
1350 1406
1351 case 'N': /* single shift 2 */ 1407 case 'N': /* single shift 2 */
1409 1465
1410 case '$': /* multibyte charset prefix */ 1466 case '$': /* multibyte charset prefix */
1411 iso->esc = ISO_ESC_2_4; 1467 iso->esc = ISO_ESC_2_4;
1412 goto not_done; 1468 goto not_done;
1413 1469
1470 case '%': /* Prefix to an escape to or from Unicode. */
1471 iso->esc = ISO_ESC_2_5;
1472 goto not_done;
1473
1414 default: 1474 default:
1415 if (0x28 <= c && c <= 0x2F) 1475 if (0x28 <= c && c <= 0x2F)
1416 { 1476 {
1417 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8); 1477 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8);
1418 goto not_done; 1478 goto not_done;
1431 1491
1432 /* bzzzt! */ 1492 /* bzzzt! */
1433 goto error; 1493 goto error;
1434 } 1494 }
1435 1495
1436 1496 /* ISO-IR 196 UTF-8 support. */
1437 1497 case ISO_ESC_2_5:
1498 if ('G' == c)
1499 {
1500 /* Activate UTF-8 mode. */
1501 *flags &= ISO_STATE_LOCK;
1502 *flags |= ISO_STATE_UTF_8;
1503 iso->esc = ISO_ESC_NOTHING;
1504 return 1;
1505 }
1506 else if ('@' == c)
1507 {
1507 /* Deactivate UTF-8 mode. */
1509 *flags &= ISO_STATE_LOCK;
1510 *flags &= ~(ISO_STATE_UTF_8);
1511 iso->esc = ISO_ESC_NOTHING;
1512 return 1;
1513 }
1514 else
1515 {
1516 /* Oops, we don't support the other UTF-? coding systems within
1517 ISO 2022, only in their own context. */
1518 goto error;
1519 }
1438 /**** directionality ****/ 1520 /**** directionality ****/
1439 1521
1440 case ISO_ESC_5_11: /* ISO6429 direction control */ 1522 case ISO_ESC_5_11: /* ISO6429 direction control */
1441 if (c == ']') 1523 if (c == ']')
1442 { 1524 {
1820 DECODE_ADD_BINARY_CHAR (c, dst); 1902 DECODE_ADD_BINARY_CHAR (c, dst);
1821 } 1903 }
1822 } 1904 }
1823 ch = 0; 1905 ch = 0;
1824 } 1906 }
1907 else if (flags & ISO_STATE_UTF_8)
1908 {
1909 unsigned char counter = data->counter;
1910 Ibyte work[MAX_ICHAR_LEN];
1911 int len;
1912 Lisp_Object chr;
1913
1914 if (ISO_CODE_ESC == c)
1915 {
1916 /* Allow the escape sequence parser to end the UTF-8 state. */
1917 flags |= ISO_STATE_ESCAPE;
1918 data->esc = ISO_ESC;
1919 data->esc_bytes_index = 1;
1920 continue;
1921 }
1922
1923 switch (counter)
1924 {
1925 case 0:
1926 if (c >= 0xfc)
1927 {
1928 ch = c & 0x01;
1929 counter = 5;
1930 }
1931 else if (c >= 0xf8)
1932 {
1933 ch = c & 0x03;
1934 counter = 4;
1935 }
1936 else if (c >= 0xf0)
1937 {
1938 ch = c & 0x07;
1939 counter = 3;
1940 }
1941 else if (c >= 0xe0)
1942 {
1943 ch = c & 0x0f;
1944 counter = 2;
1945 }
1946 else if (c >= 0xc0)
1947 {
1948 ch = c & 0x1f;
1949 counter = 1;
1950 }
1951 else
1952 /* ASCII, or the lower control characters. */
1953 Dynarr_add (dst, c);
1954
1955 break;
1956 case 1:
1957 ch = (ch << 6) | (c & 0x3f);
1958 chr = Funicode_to_char(make_int(ch), Qnil);
1959
1960 if (!NILP (chr))
1961 {
1962 assert(CHARP(chr));
1963 len = set_itext_ichar (work, XCHAR(chr));
1964 Dynarr_add_many (dst, work, len);
1965 }
1966 else
1967 {
1968 /* Shouldn't happen, this code should only be enabled in
1969 XEmacsen with support for all of Unicode. */
1970 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
1971 Dynarr_add (dst, 34 + 128);
1972 Dynarr_add (dst, 46 + 128);
1973 }
1974
1975 ch = 0;
1976 counter = 0;
1977 break;
1978 default:
1979 ch = (ch << 6) | (c & 0x3f);
1980 counter--;
1981 }
1982
1983 if (str->eof)
1984 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
1985
1986 data->counter = counter;
1987 }
1825 else if (byte_c0_p (c) || byte_c1_p (c)) 1988 else if (byte_c0_p (c) || byte_c1_p (c))
1826 { /* Control characters */ 1989 { /* Control characters */
1827 1990
1828 /***** Error-handling *****/ 1991 /***** Error-handling *****/
1829 1992
2008 } 2171 }
2009 } 2172 }
2010 } 2173 }
2011 2174
2012 Dynarr_add (dst, ISO_CODE_ESC); 2175 Dynarr_add (dst, ISO_CODE_ESC);
2176
2013 switch (type) 2177 switch (type)
2014 { 2178 {
2015 case CHARSET_TYPE_94: 2179 case CHARSET_TYPE_94:
2016 Dynarr_add (dst, inter94[reg]); 2180 Dynarr_add (dst, inter94[reg]);
2017 break; 2181 break;
2100 2264
2101 if (byte_ascii_p (c)) 2265 if (byte_ascii_p (c))
2102 { /* Processing ASCII character */ 2266 { /* Processing ASCII character */
2103 ch = 0; 2267 ch = 0;
2104 2268
2269 if (flags & ISO_STATE_UTF_8)
2270 {
2271 Dynarr_add (dst, ISO_CODE_ESC);
2272 Dynarr_add (dst, '%');
2273 Dynarr_add (dst, '@');
2274 flags &= ~(ISO_STATE_UTF_8);
2275 }
2276
2105 restore_left_to_right_direction (codesys, dst, &flags, 0); 2277 restore_left_to_right_direction (codesys, dst, &flags, 0);
2106 2278
2107 /* Make sure G0 contains ASCII */ 2279 /* Make sure G0 contains ASCII */
2108 if ((c > ' ' && c < ISO_CODE_DEL) || 2280 if ((c > ' ' && c < ISO_CODE_DEL) ||
2109 !XCODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys)) 2281 !XCODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
2143 && fit_to_be_escape_quoted (c)) 2315 && fit_to_be_escape_quoted (c))
2144 Dynarr_add (dst, ISO_CODE_ESC); 2316 Dynarr_add (dst, ISO_CODE_ESC);
2145 Dynarr_add (dst, c); 2317 Dynarr_add (dst, c);
2146 char_boundary = 1; 2318 char_boundary = 1;
2147 } 2319 }
2148
2149 else if (ibyte_leading_byte_p (c) || ibyte_leading_byte_p (ch)) 2320 else if (ibyte_leading_byte_p (c) || ibyte_leading_byte_p (ch))
2150 { /* Processing Leading Byte */ 2321 { /* Processing Leading Byte */
2151 ch = 0; 2322 ch = 0;
2152 charset = charset_by_leading_byte (c); 2323 charset = charset_by_leading_byte (c);
2153 if (leading_byte_prefix_p (c)) 2324 if (leading_byte_prefix_p (c))
2154 ch = c; 2325 {
2326 ch = c;
2327 }
2328 else if (XCHARSET_ENCODE_AS_UTF_8 (charset))
2329 {
2330 assert (!EQ (charset, Vcharset_control_1)
2331 && !EQ (charset, Vcharset_composite));
2332
2333 /* If the character set is to be encoded as UTF-8, the escape
2334 is always the same. */
2335 if (!(flags & ISO_STATE_UTF_8))
2336 {
2337 Dynarr_add (dst, ISO_CODE_ESC);
2338 Dynarr_add (dst, '%');
2339 Dynarr_add (dst, 'G');
2340 flags |= ISO_STATE_UTF_8;
2341 }
2342 }
2155 else if (!EQ (charset, Vcharset_control_1) 2343 else if (!EQ (charset, Vcharset_control_1)
2156 && !EQ (charset, Vcharset_composite)) 2344 && !EQ (charset, Vcharset_composite))
2157 { 2345 {
2158 int reg; 2346 int reg;
2347
2348 /* End the UTF-8 state. */
2349 if (flags & ISO_STATE_UTF_8)
2350 {
2351 Dynarr_add (dst, ISO_CODE_ESC);
2352 Dynarr_add (dst, '%');
2353 Dynarr_add (dst, '@');
2354 flags &= ~(ISO_STATE_UTF_8);
2355 }
2159 2356
2160 ensure_correct_direction (XCHARSET_DIRECTION (charset), 2357 ensure_correct_direction (XCHARSET_DIRECTION (charset),
2161 codesys, dst, &flags, 0); 2358 codesys, dst, &flags, 0);
2162 2359
2163 /* Now determine which register to use. */ 2360 /* Now determine which register to use. */
2272 else 2469 else
2273 { 2470 {
2274 switch (XCHARSET_REP_BYTES (charset)) 2471 switch (XCHARSET_REP_BYTES (charset))
2275 { 2472 {
2276 case 2: 2473 case 2:
2277 Dynarr_add (dst, c & charmask); 2474 dynarr_add_2022_one_dimension (charset, c,
2475 charmask, dst);
2278 break; 2476 break;
2279 case 3: 2477 case 3:
2280 if (XCHARSET_PRIVATE_P (charset)) 2478 if (XCHARSET_PRIVATE_P (charset))
2281 { 2479 {
2282 Dynarr_add (dst, c & charmask); 2480 dynarr_add_2022_one_dimension (charset, c,
2481 charmask, dst);
2283 ch = 0; 2482 ch = 0;
2284 } 2483 }
2285 else if (ch) 2484 else if (ch)
2286 { 2485 {
2287 #ifdef ENABLE_COMPOSITE_CHARS 2486 #ifdef ENABLE_COMPOSITE_CHARS
2288 if (EQ (charset, Vcharset_composite)) 2487 if (EQ (charset, Vcharset_composite))
2289 { 2488 {
2489 /* #### Hasn't been written to handle composite
2490 characters yet. */
2491 assert(!XCHARSET_ENCODE_AS_UTF_8 (charset))
2290 if (in_composite) 2492 if (in_composite)
2291 { 2493 {
2292 /* #### Bother! We don't know how to 2494 /* #### Bother! We don't know how to
2293 handle this yet. */ 2495 handle this yet. */
2294 Dynarr_add (dst, '~'); 2496 Dynarr_add (dst, '~');
2308 } 2510 }
2309 } 2511 }
2310 else 2512 else
2311 #endif /* ENABLE_COMPOSITE_CHARS */ 2513 #endif /* ENABLE_COMPOSITE_CHARS */
2312 { 2514 {
2313 Dynarr_add (dst, ch & charmask); 2515 dynarr_add_2022_two_dimensions (charset, c, ch,
2314 Dynarr_add (dst, c & charmask); 2516 charmask, dst);
2315 } 2517 }
2316 ch = 0; 2518 ch = 0;
2317 } 2519 }
2318 else 2520 else
2319 { 2521 {
2322 } 2524 }
2323 break; 2525 break;
2324 case 4: 2526 case 4:
2325 if (ch) 2527 if (ch)
2326 { 2528 {
2327 Dynarr_add (dst, ch & charmask); 2529 dynarr_add_2022_two_dimensions (charset, c, ch,
2328 Dynarr_add (dst, c & charmask); 2530 charmask, dst);
2329 ch = 0; 2531 ch = 0;
2330 } 2532 }
2331 else 2533 else
2332 { 2534 {
2333 ch = c; 2535 ch = c;