Mercurial > hg > xemacs-beta
annotate src/unicode.c @ 4614:afbfad080ddd
The URLs in our current config.guess and config.sub files are obsolete.
Update to the latest upstream release to get correct URLs, as well as fixes
and enhancements to those scripts.
| author | Jerry James <james@xemacs.org> |
|---|---|
| date | Wed, 11 Feb 2009 11:09:35 -0700 |
| parents | 2669b1b7e33b |
| children | 7e54adf407a1 |
| rev | line source |
|---|---|
| 771 | 1 /* Code to handle Unicode conversion. |
| 3025 | 2 Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 Ben Wing. |
| 771 | 3 |
| 4 This file is part of XEmacs. | |
| 5 | |
| 6 XEmacs is free software; you can redistribute it and/or modify it | |
| 7 under the terms of the GNU General Public License as published by the | |
| 8 Free Software Foundation; either version 2, or (at your option) any | |
| 9 later version. | |
| 10 | |
| 11 XEmacs is distributed in the hope that it will be useful, but WITHOUT | |
| 12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
| 13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
| 14 for more details. | |
| 15 | |
| 16 You should have received a copy of the GNU General Public License | |
| 17 along with XEmacs; see the file COPYING. If not, write to | |
| 18 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
| 19 Boston, MA 02111-1307, USA. */ | |
| 20 | |
| 21 /* Synched up with: FSF 20.3. Not in FSF. */ | |
| 22 | |
| 23 /* Authorship: | |
| 24 | |
| 25 Current primary author: Ben Wing <ben@xemacs.org> | |
| 26 | |
| 27 Written by Ben Wing <ben@xemacs.org>, June, 2001. | |
| 28 Separated out into this file, August, 2001. | |
| 29 Includes Unicode coding systems, some parts of which have been written | |
| 877 | 30 by someone else. #### Morioka and Hayashi, I think. |
| 771 | 31 |
| 32 As of September 2001, the detection code is here and abstraction of the | |
| 877 | 33 detection system is finished. The unicode detectors have been rewritten |
| 771 | 34 to include multiple levels of likelihood. |
| 35 */ | |
| 36 | |
| 37 #include <config.h> | |
| 38 #include "lisp.h" | |
| 39 | |
| 40 #include "charset.h" | |
| 41 #include "file-coding.h" | |
| 42 #include "opaque.h" | |
| 43 | |
| 44 #include "sysfile.h" | |
| 45 | |
| 2367 | 46 /* For more info about how Unicode works under Windows, see intl-win32.c. */ |
| 47 | |
| 48 /* Info about Unicode translation tables [ben]: | |
| 49 | |
| 50 FORMAT: | |
| 51 ------- | |
| 52 | |
| 53 We currently use the following format for tables: | |
| 54 | |
| 55 If dimension == 1, to_unicode_table is a 96-element array of ints | |
| 56 (Unicode code points); else, it's a 96-element array of int * pointers, | |
| 57 each of which points to a 96-element array of ints. If no elements in a | |
| 58 row have been filled in, the pointer will point to a default empty | |
| 59 table; that way, memory usage is more reasonable but lookup still fast. | |
| 60 | |
| 61 -- If from_unicode_levels == 1, from_unicode_table is a 256-element | |
| 62 array of shorts (octet 1 in high byte, octet 2 in low byte; we don't | |
| 63 store Ichars directly to save space). | |
| 64 | |
| 65 -- If from_unicode_levels == 2, from_unicode_table is a 256-element | |
| 66 array of short * pointers, each of which points to a 256-element array | |
| 67 of shorts. | |
| 68 | |
| 69 -- If from_unicode_levels == 3, from_unicode_table is a 256-element | |
| 70 array of short ** pointers, each of which points to a 256-element array | |
| 71 of short * pointers, each of which points to a 256-element array of | |
| 72 shorts. | |
| 73 | |
| 74 -- If from_unicode_levels == 4, same thing but one level deeper. | |
| 75 | |
| 76 Just as for to_unicode_table, we use default tables to fill in all | |
| 77 entries with no values in them. | |
| 78 | |
| 79 #### An obvious space-saving optimization is to use variable-sized | |
| 80 tables, where each table instead of just being a 256-element array, is a | |
| 81 structure with a start value, an end value, and a variable number of | |
| 82 entries (END - START + 1). Only 8 bits are needed for END and START, | |
| 83 and could be stored at the end to avoid alignment problems. However, | |
| 84 before charging off and implementing this, we need to consider whether | |
| 85 it's worth it: | |
| 86 | |
| 87 (1) Most tables will be highly localized in which code points are | |
| 88 defined, heavily reducing the possible memory waste. Before doing any | |
| 89 rewriting, write some code to see how much memory is actually being | |
| 90 wasted (i.e. ratio of empty entries to total # of entries) and only | |
| 91 start rewriting if it's unacceptably high. You have to check over all | |
| 92 charsets. | |
| 93 | |
| 94 (2) Since entries are usually added one at a time, you have to be very | |
| 95 careful when creating the tables to avoid realloc()/free() thrashing in | |
| 96 the common case when you are in an area of high localization and are | |
| 97 going to end up using most entries in the table. You'd certainly want | |
| 98 to allow only certain sizes, not arbitrary ones (probably powers of 2, | |
| 99 where you want the entire block including the START/END values to fit | |
| 100 into a power of 2, minus any malloc overhead if there is any -- there's | |
| 101 none under gmalloc.c, and probably most system malloc() functions are | |
| 102 quite smart nowadays and also have no overhead). You could optimize | |
| 103 somewhat during the in-C initializations, because you can compute the | |
| 104 actual usage of various tables by scanning the entries you're going to | |
| 105 add in a separate pass before adding them. (You could actually do the | |
| 106 same thing when entries are added on the Lisp level by making the | |
| 107 assumption that all the entries will come in one after another before | |
| 108 any use is made of the data. So as they're coming in, you just store | |
| 109 them in a big long list, and the first time you need to retrieve an | |
| 110 entry, you compute the whole table at once.) You'd still have to deal | |
| 111 with the possibility of later entries coming in, though. | |
| 112 | |
| 113 (3) You do lose some speed using START/END values, since you need a | |
| 114 couple of comparisons at each level. This could easily make each single | |
| 115 lookup become 3-4 times slower. The Unicode book considers this a big | |
| 116 issue, and recommends against variable-sized tables for this reason; | |
| 117 however, they almost certainly have in mind applications that primarily | |
| 118 involve conversion of large amounts of data. Most Unicode strings that | |
| 119 are translated in XEmacs are fairly small. The only place where this | |
| 120 might matter is in loading large files -- e.g. a 3-megabyte | |
| 121 Unicode-encoded file. So think about this, and maybe do a trial | |
| 122 implementation where you don't worry too much about the intricacies of | |
| 123 (2) and just implement some basic "multiply by 1.5" trick or something | |
| 124 to do the resizing. There is a very good FAQ on Unicode called | |
| 125 something like the Linux-Unicode How-To (it should be part of the Linux | |
| 126 How-To's, I think), that lists the url of a guy with a whole bunch of | |
| 127 unicode files you can use to stress-test your implementations, and he's | |
| 128 highly likely to have a good multi-megabyte Unicode-encoded file (with | |
| 129 normal text in it -- if you created your own just by creating repeated | |
| 130 strings of letters and numbers, you probably wouldn't get accurate | |
| 131 results). | |
| 132 | |
| 133 INITIALIZATION: | |
| 134 --------------- | |
| 135 | |
| 136 There are advantages and disadvantages to loading the tables at | |
| 137 run-time. | |
| 138 | |
| 139 Advantages: | |
| 140 | |
| 141 They're big, and it's very fast to recreate them (a fraction of a second | |
| 142 on modern processors). | |
| 143 | |
| 144 Disadvantages: | |
| 145 | |
| 146 (1) User-defined charsets: It would be inconvenient to require all | |
| 147 dumped user-defined charsets to be reloaded at init time. | |
| 148 | |
| 149 NB With run-time loading, we load in init-mule-at-startup, in | |
| 150 mule-cmds.el. This is called from startup.el, which is quite late in | |
| 151 the initialization process -- but data-directory isn't set until then. | |
| 152 With dump-time loading, you still can't dump in a Japanese directory | |
| 153 (again, until we move to Unicode internally), but this is not such an | |
| 154 imposition. | |
| 155 | |
| 156 | |
| 157 */ | |
| 158 | |
| 771 | 159 /* #### WARNING! The current sledgehammer routines have a fundamental |
| 160 problem in that they can't handle two characters mapping to a | |
| 161 single Unicode codepoint or vice-versa in a single charset table. | |
| 162 It's not clear there is any way to handle this and still make the | |
| 877 | 163 sledgehammer routines useful. |
| 164 | |
| 165 Inquiring Minds Want To Know Dept: does the above WARNING mean that | |
| 166 _if_ it happens, then it will signal error, or then it will do | |
| 167 something evil and unpredictable? Signaling an error is OK: for | |
| 168 all national standards, the national to Unicode map is an inclusion | |
| 169 (1-to-1). Any character set that does not behave that way is | |
| 1318 | 170 broken according to the Unicode standard. |
| 171 | |
| 2500 | 172 Answer: You will get an ABORT(), since the purpose of the sledgehammer |
| 1318 | 173 routines is self-checking. The above problem with non-1-to-1 mapping |
| 174 occurs in the Big5 tables, as provided by the Unicode Consortium. */ | |
| 877 | 175 |
| 771 | 176 /* #define SLEDGEHAMMER_CHECK_UNICODE */ |
| 177 | |
| 178 /* When MULE is not defined, we may still need some Unicode support -- | |
| 179 in particular, some Windows API's always want Unicode, and the way | |
| 180 we've set up the Unicode encapsulation, we may as well go ahead and | |
| 181 always use the Unicode versions of split API's. (It would be | |
| 182 trickier to not use them, and pointless -- under NT, the ANSI API's | |
| 183 call the Unicode ones anyway, so in the case of structures, we'd be | |
| 184 converting from Unicode to ANSI structures, only to have the OS | |
| 185 convert them back.) */ | |
| 186 | |
| 187 Lisp_Object Qunicode; | |
| 4096 | 188 Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7, Qutf_32; |
| 771 | 189 Lisp_Object Qneed_bom; |
| 190 | |
| 191 Lisp_Object Qutf_16_little_endian, Qutf_16_bom; | |
| 192 Lisp_Object Qutf_16_little_endian_bom; | |
| 193 | |
| 985 | 194 Lisp_Object Qutf_8_bom; |
| 195 | |
/* UTF-16 surrogate-pair arithmetic.  See the Unicode FAQ,
   http://www.unicode.org/faq/utf_bom.html#35 for this algorithm.

   (They also give another, really verbose one, as part of their
   explanation of the various planes of the encoding, but we won't use
   that.) */

/* Added to (code point >> 10) to obtain the lead surrogate. */
#define UTF_16_LEAD_OFFSET (0xD800 - (0x10000 >> 10))
/* Added to ((lead << 10) + trail) to recover the code point. */
#define UTF_16_SURROGATE_OFFSET (0x10000 - (0xD800 << 10) - 0xDC00)

/* Combine a LEAD/TRAIL surrogate pair into the code point it encodes. */
#define utf_16_surrogates_to_code(lead, trail) \
  (((lead) << 10) + (trail) + UTF_16_SURROGATE_OFFSET)

/* Split CODEPOINT (expected to be above 0xFFFF) into its surrogate pair,
   storing the halves into the lvalues LEAD and TRAIL.  CODEPOINT is
   evaluated exactly once.  The temporary deliberately avoids a leading
   double underscore, since such identifiers are reserved for the
   implementation. */
#define CODE_TO_UTF_16_SURROGATES(codepoint, lead, trail) do {  \
    int ctu16s_code_ = (codepoint);                             \
    (lead) = UTF_16_LEAD_OFFSET + (ctu16s_code_ >> 10);         \
    (trail) = 0xDC00 + (ctu16s_code_ & 0x3FF);                  \
  } while (0)
| 213 | |
| 771 | 214 #ifdef MULE |
| 215 | |
/* Element types of the conversion tables.

   Using ints for to_unicode is OK (as long as they are >= 32 bits).
   The from_unicode tables map Unicode code points back to Mule
   characters; the values stored are only 96x96 (two octets), so we can
   save space by using shorts (signedness doesn't matter).

   The pointers below are the shared "blank" tables, one per nesting
   level.  They are allocated and filled in by
   init_blank_unicode_tables (): the level-1 tables hold -1 in every
   slot, and each higher level points at the blank table one level
   below it. */
static int *to_unicode_blank_1;
static int **to_unicode_blank_2;

static short *from_unicode_blank_1;
static short **from_unicode_blank_2;
static short ***from_unicode_blank_3;
static short ****from_unicode_blank_4;
| 227 | |
/* Memory descriptions of the to_unicode tables, built up one nesting
   level at a time.  NOTE(review): the exact meaning of the numeric
   fields in each XD_BLOCK_PTR entry is defined by struct
   memory_description, which is not visible here -- confirm before
   changing any of them. */

/* Level 0: a single int entry; nothing further to describe. */
static const struct memory_description to_unicode_level_0_desc_1[] = {
  { XD_END }
};

static const struct sized_memory_description to_unicode_level_0_desc = {
  sizeof (int), to_unicode_level_0_desc_1
};

/* Level 1: a pointer to a block of 96 level-0 entries. */
static const struct memory_description to_unicode_level_1_desc_1[] = {
  { XD_BLOCK_PTR, 0, 96, { &to_unicode_level_0_desc } },
  { XD_END }
};

static const struct sized_memory_description to_unicode_level_1_desc = {
  sizeof (void *), to_unicode_level_1_desc_1
};

/* Description of a charset's to_unicode table, with one entry for the
   one-level layout and one for the two-level layout.
   NOTE(review): the second field (1, 2) presumably selects the entry by
   charset dimension -- confirm. */
static const struct memory_description to_unicode_description_1[] = {
  { XD_BLOCK_PTR, 1, 96, { &to_unicode_level_0_desc } },
  { XD_BLOCK_PTR, 2, 96, { &to_unicode_level_1_desc } },
  { XD_END }
};

/* Not static because each charset has a set of to and from tables and
   needs to describe them to pdump. */
const struct sized_memory_description to_unicode_description = {
  sizeof (void *), to_unicode_description_1
};

/* Used only for to_unicode_blank_2 */
static const struct memory_description to_unicode_level_2_desc_1[] = {
  { XD_BLOCK_PTR, 0, 96, { &to_unicode_level_1_desc } },
  { XD_END }
};
| 262 | |
/* Memory descriptions of the from_unicode tables, built up one nesting
   level at a time.  Every level is a 256-element block; only the
   innermost level holds shorts, the others hold pointers.
   NOTE(review): the exact meaning of the numeric fields in each
   XD_BLOCK_PTR entry is defined by struct memory_description, which is
   not visible here -- confirm before changing any of them. */

/* Level 0: a single short entry; nothing further to describe. */
static const struct memory_description from_unicode_level_0_desc_1[] = {
  { XD_END }
};

static const struct sized_memory_description from_unicode_level_0_desc = {
  sizeof (short), from_unicode_level_0_desc_1
};

/* Level 1: a pointer to a block of 256 level-0 entries. */
static const struct memory_description from_unicode_level_1_desc_1[] = {
  { XD_BLOCK_PTR, 0, 256, { &from_unicode_level_0_desc } },
  { XD_END }
};

static const struct sized_memory_description from_unicode_level_1_desc = {
  sizeof (void *), from_unicode_level_1_desc_1
};

/* Level 2: a pointer to a block of 256 level-1 pointers. */
static const struct memory_description from_unicode_level_2_desc_1[] = {
  { XD_BLOCK_PTR, 0, 256, { &from_unicode_level_1_desc } },
  { XD_END }
};

static const struct sized_memory_description from_unicode_level_2_desc = {
  sizeof (void *), from_unicode_level_2_desc_1
};

/* Level 3: a pointer to a block of 256 level-2 pointers. */
static const struct memory_description from_unicode_level_3_desc_1[] = {
  { XD_BLOCK_PTR, 0, 256, { &from_unicode_level_2_desc } },
  { XD_END }
};

static const struct sized_memory_description from_unicode_level_3_desc = {
  sizeof (void *), from_unicode_level_3_desc_1
};

/* Description of a charset's from_unicode table, with one entry per
   possible number of levels (1 through 4).
   NOTE(review): the second field (1..4) presumably selects the entry by
   the charset's from_unicode_levels value -- confirm. */
static const struct memory_description from_unicode_description_1[] = {
  { XD_BLOCK_PTR, 1, 256, { &from_unicode_level_0_desc } },
  { XD_BLOCK_PTR, 2, 256, { &from_unicode_level_1_desc } },
  { XD_BLOCK_PTR, 3, 256, { &from_unicode_level_2_desc } },
  { XD_BLOCK_PTR, 4, 256, { &from_unicode_level_3_desc } },
  { XD_END }
};

/* Not static because each charset has a set of to and from tables and
   needs to describe them to pdump. */
const struct sized_memory_description from_unicode_description = {
  sizeof (void *), from_unicode_description_1
};

/* Used only for from_unicode_blank_4 */
static const struct memory_description from_unicode_level_4_desc_1[] = {
  { XD_BLOCK_PTR, 0, 256, { &from_unicode_level_3_desc } },
  { XD_END }
};
| 317 | |
/* Dynamic array of Lisp_Objects.  NOTE(review): going by its name this
   presumably holds the charsets in Unicode precedence order; it is
   filled in outside the part of the file visible here -- confirm. */
static Lisp_Object_dynarr *unicode_precedence_dynarr;

/* Memory description of a Lisp_Object_dynarr: a dynarr whose elements
   are described by lisp_object_description. */
static const struct memory_description lod_description_1[] = {
  XD_DYNARR_DESC (Lisp_Object_dynarr, &lisp_object_description),
  { XD_END }
};

/* Sized wrapper around lod_description_1. */
static const struct sized_memory_description lisp_object_dynarr_description = {
  sizeof (Lisp_Object_dynarr),
  lod_description_1
};
| 329 | |
/* Lisp-visible precedence lists.  NOTE(review): going by the V prefix
   these are presumably Lisp variables set up elsewhere in this file --
   confirm. */
Lisp_Object Vlanguage_unicode_precedence_list;
Lisp_Object Vdefault_unicode_precedence_list;

Lisp_Object Qignore_first_column;

/* State for the "jit" charsets.  NOTE(review): presumably charsets
   created on the fly for code points with no existing mapping -- the
   code that uses these is outside the visible part of the file. */
Lisp_Object Vcurrent_jit_charset;
Lisp_Object Qlast_allocated_character;
Lisp_Object Qccl_encode_to_ucs_2;

Lisp_Object Vnumber_of_jit_charsets;
Lisp_Object Vlast_jit_charset_final;
Lisp_Object Vcharset_descr;
| 342 | |
| 343 | |
| 771 | 344 |
| 345 /************************************************************************/ | |
| 346 /* Unicode implementation */ | |
| 347 /************************************************************************/ | |
| 348 | |
/* Split the code point VAL into its four octets and work out how many
   table levels are needed to index it.  U1 receives the most
   significant octet (bits 24 and up), U4 the least significant one.
   LEVELS is set to 1 for values up to 0xFF, 2 up to 0xFFFF, 3 up to
   0xFFFFFF and 4 otherwise.  VAL is evaluated exactly once. */
#define BREAKUP_UNICODE_CODE(val, u1, u2, u3, u4, levels)       \
do {                                                            \
  int buc_val = (val);                                          \
                                                                \
  (u1) = buc_val >> 24;                                         \
  (u2) = (buc_val >> 16) & 0xFF;                                \
  (u3) = (buc_val >> 8) & 0xFF;                                 \
  (u4) = buc_val & 0xFF;                                        \
  if (buc_val <= 0xFF)                                          \
    (levels) = 1;                                               \
  else if (buc_val <= 0xFFFF)                                   \
    (levels) = 2;                                               \
  else if (buc_val <= 0xFFFFFF)                                 \
    (levels) = 3;                                               \
  else                                                          \
    (levels) = 4;                                               \
} while (0)
| 362 | |
| 363 static void | |
| 364 init_blank_unicode_tables (void) | |
| 365 { | |
| 366 int i; | |
| 367 | |
| 368 from_unicode_blank_1 = xnew_array (short, 256); | |
| 369 from_unicode_blank_2 = xnew_array (short *, 256); | |
| 370 from_unicode_blank_3 = xnew_array (short **, 256); | |
| 371 from_unicode_blank_4 = xnew_array (short ***, 256); | |
| 372 for (i = 0; i < 256; i++) | |
| 373 { | |
| 877 | 374 /* #### IMWTK: Why does using -1 here work? Simply because there are |
| 1318 | 375 no existing 96x96 charsets? |
| 376 | |
| 377 Answer: I don't understand the concern. -1 indicates there is no | |
| 378 entry for this particular codepoint, which is always the case for | |
| 379 blank tables. */ | |
| 771 | 380 from_unicode_blank_1[i] = (short) -1; |
| 381 from_unicode_blank_2[i] = from_unicode_blank_1; | |
| 382 from_unicode_blank_3[i] = from_unicode_blank_2; | |
| 383 from_unicode_blank_4[i] = from_unicode_blank_3; | |
| 384 } | |
| 385 | |
| 386 to_unicode_blank_1 = xnew_array (int, 96); | |
| 387 to_unicode_blank_2 = xnew_array (int *, 96); | |
| 388 for (i = 0; i < 96; i++) | |
| 389 { | |
| 877 | 390 /* Here -1 is guaranteed OK. */ |
| 771 | 391 to_unicode_blank_1[i] = -1; |
| 392 to_unicode_blank_2[i] = to_unicode_blank_1; | |
| 393 } | |
| 394 } | |
| 395 | |
| 396 static void * | |
| 397 create_new_from_unicode_table (int level) | |
| 398 { | |
| 399 switch (level) | |
| 400 { | |
| 401 /* WARNING: If you are thinking of compressing these, keep in | |
| 402 mind that sizeof (short) does not equal sizeof (short *). */ | |
| 403 case 1: | |
| 404 { | |
| 405 short *newtab = xnew_array (short, 256); | |
| 406 memcpy (newtab, from_unicode_blank_1, 256 * sizeof (short)); | |
| 407 return newtab; | |
| 408 } | |
| 409 case 2: | |
| 410 { | |
| 411 short **newtab = xnew_array (short *, 256); | |
| 412 memcpy (newtab, from_unicode_blank_2, 256 * sizeof (short *)); | |
| 413 return newtab; | |
| 414 } | |
| 415 case 3: | |
| 416 { | |
| 417 short ***newtab = xnew_array (short **, 256); | |
| 418 memcpy (newtab, from_unicode_blank_3, 256 * sizeof (short **)); | |
| 419 return newtab; | |
| 420 } | |
| 421 case 4: | |
| 422 { | |
| 423 short ****newtab = xnew_array (short ***, 256); | |
| 424 memcpy (newtab, from_unicode_blank_4, 256 * sizeof (short ***)); | |
| 425 return newtab; | |
| 426 } | |
| 427 default: | |
| 2500 | 428 ABORT (); |
| 771 | 429 return 0; |
| 430 } | |
| 431 } | |
| 432 | |
| 877 | 433 /* Allocate and blank the tables. |
| 1318 | 434 Loading them up is done by load-unicode-mapping-table. */ |
| 771 | 435 void |
| 436 init_charset_unicode_tables (Lisp_Object charset) | |
| 437 { | |
| 438 if (XCHARSET_DIMENSION (charset) == 1) | |
| 439 { | |
| 440 int *to_table = xnew_array (int, 96); | |
| 441 memcpy (to_table, to_unicode_blank_1, 96 * sizeof (int)); | |
| 442 XCHARSET_TO_UNICODE_TABLE (charset) = to_table; | |
| 443 } | |
| 444 else | |
| 445 { | |
| 446 int **to_table = xnew_array (int *, 96); | |
| 447 memcpy (to_table, to_unicode_blank_2, 96 * sizeof (int *)); | |
| 448 XCHARSET_TO_UNICODE_TABLE (charset) = to_table; | |
| 449 } | |
| 450 | |
| 451 { | |
| 2367 | 452 XCHARSET_FROM_UNICODE_TABLE (charset) = |
| 453 create_new_from_unicode_table (1); | |
| 771 | 454 XCHARSET_FROM_UNICODE_LEVELS (charset) = 1; |
| 455 } | |
| 456 } | |
| 457 | |
| 458 static void | |
| 459 free_from_unicode_table (void *table, int level) | |
| 460 { | |
| 461 int i; | |
| 462 | |
| 463 switch (level) | |
| 464 { | |
| 465 case 2: | |
| 466 { | |
| 467 short **tab = (short **) table; | |
| 468 for (i = 0; i < 256; i++) | |
| 469 { | |
| 470 if (tab[i] != from_unicode_blank_1) | |
| 471 free_from_unicode_table (tab[i], 1); | |
| 472 } | |
| 473 break; | |
| 474 } | |
| 475 case 3: | |
| 476 { | |
| 477 short ***tab = (short ***) table; | |
| 478 for (i = 0; i < 256; i++) | |
| 479 { | |
| 480 if (tab[i] != from_unicode_blank_2) | |
| 481 free_from_unicode_table (tab[i], 2); | |
| 482 } | |
| 483 break; | |
| 484 } | |
| 485 case 4: | |
| 486 { | |
| 487 short ****tab = (short ****) table; | |
| 488 for (i = 0; i < 256; i++) | |
| 489 { | |
| 490 if (tab[i] != from_unicode_blank_3) | |
| 491 free_from_unicode_table (tab[i], 3); | |
| 492 } | |
| 493 break; | |
| 494 } | |
| 495 } | |
| 496 | |
| 1726 | 497 xfree (table, void *); |
| 771 | 498 } |
| 499 | |
| 500 static void | |
| 501 free_to_unicode_table (void *table, int level) | |
| 502 { | |
| 503 if (level == 2) | |
| 504 { | |
| 505 int i; | |
| 506 int **tab = (int **) table; | |
| 507 | |
| 508 for (i = 0; i < 96; i++) | |
| 509 { | |
| 510 if (tab[i] != to_unicode_blank_1) | |
| 511 free_to_unicode_table (tab[i], 1); | |
| 512 } | |
| 513 } | |
| 514 | |
| 1726 | 515 xfree (table, void *); |
| 771 | 516 } |
| 517 | |
| 518 void | |
| 519 free_charset_unicode_tables (Lisp_Object charset) | |
| 520 { | |
| 521 free_to_unicode_table (XCHARSET_TO_UNICODE_TABLE (charset), | |
| 522 XCHARSET_DIMENSION (charset)); | |
| 523 free_from_unicode_table (XCHARSET_FROM_UNICODE_TABLE (charset), | |
| 524 XCHARSET_FROM_UNICODE_LEVELS (charset)); | |
| 525 } | |
| 526 | |
| 527 #ifdef MEMORY_USAGE_STATS | |
| 528 | |
| 529 static Bytecount | |
| 530 compute_from_unicode_table_size_1 (void *table, int level, | |
| 531 struct overhead_stats *stats) | |
| 532 { | |
| 533 int i; | |
| 534 Bytecount size = 0; | |
| 535 | |
| 536 switch (level) | |
| 537 { | |
| 538 case 2: | |
| 539 { | |
| 540 short **tab = (short **) table; | |
| 541 for (i = 0; i < 256; i++) | |
| 542 { | |
| 543 if (tab[i] != from_unicode_blank_1) | |
| 544 size += compute_from_unicode_table_size_1 (tab[i], 1, stats); | |
| 545 } | |
| 546 break; | |
| 547 } | |
| 548 case 3: | |
| 549 { | |
| 550 short ***tab = (short ***) table; | |
| 551 for (i = 0; i < 256; i++) | |
| 552 { | |
| 553 if (tab[i] != from_unicode_blank_2) | |
| 554 size += compute_from_unicode_table_size_1 (tab[i], 2, stats); | |
| 555 } | |
| 556 break; | |
| 557 } | |
| 558 case 4: | |
| 559 { | |
| 560 short ****tab = (short ****) table; | |
| 561 for (i = 0; i < 256; i++) | |
| 562 { | |
| 563 if (tab[i] != from_unicode_blank_3) | |
| 564 size += compute_from_unicode_table_size_1 (tab[i], 3, stats); | |
| 565 } | |
| 566 break; | |
| 567 } | |
| 568 } | |
| 569 | |
| 3024 | 570 size += malloced_storage_size (table, |
| 771 | 571 256 * (level == 1 ? sizeof (short) : |
| 572 sizeof (void *)), | |
| 573 stats); | |
| 574 return size; | |
| 575 } | |
| 576 | |
| 577 static Bytecount | |
| 578 compute_to_unicode_table_size_1 (void *table, int level, | |
| 579 struct overhead_stats *stats) | |
| 580 { | |
| 581 Bytecount size = 0; | |
| 582 | |
| 583 if (level == 2) | |
| 584 { | |
| 585 int i; | |
| 586 int **tab = (int **) table; | |
| 587 | |
| 588 for (i = 0; i < 96; i++) | |
| 589 { | |
| 590 if (tab[i] != to_unicode_blank_1) | |
| 591 size += compute_to_unicode_table_size_1 (tab[i], 1, stats); | |
| 592 } | |
| 593 } | |
| 594 | |
| 3024 | 595 size += malloced_storage_size (table, |
| 771 | 596 96 * (level == 1 ? sizeof (int) : |
| 597 sizeof (void *)), | |
| 598 stats); | |
| 599 return size; | |
| 600 } | |
| 601 | |
| 602 Bytecount | |
| 603 compute_from_unicode_table_size (Lisp_Object charset, | |
| 604 struct overhead_stats *stats) | |
| 605 { | |
| 606 return (compute_from_unicode_table_size_1 | |
| 607 (XCHARSET_FROM_UNICODE_TABLE (charset), | |
| 608 XCHARSET_FROM_UNICODE_LEVELS (charset), | |
| 609 stats)); | |
| 610 } | |
| 611 | |
| 612 Bytecount | |
| 613 compute_to_unicode_table_size (Lisp_Object charset, | |
| 614 struct overhead_stats *stats) | |
| 615 { | |
| 616 return (compute_to_unicode_table_size_1 | |
| 617 (XCHARSET_TO_UNICODE_TABLE (charset), | |
| 618 XCHARSET_DIMENSION (charset), | |
| 619 stats)); | |
| 620 } | |
| 621 | |
| 622 #endif | |
| 623 | |
| 624 #ifdef SLEDGEHAMMER_CHECK_UNICODE | |
| 625 | |
| 626 /* "Sledgehammer checks" are checks that verify the self-consistency | |
| 627 of an entire structure every time a change is about to be made or | |
| 628 has been made to the structure. Not fast but a pretty much | |
| 629 sure-fire way of flushing out any incorrectnesses in the algorithms | |
| 630 that create the structure. | |
| 631 | |
| 632 Checking only after a change has been made will speed things up by | |
| 633 a factor of 2, but it doesn't absolutely prove that the code just | |
| 634 checked caused the problem; perhaps it happened elsewhere, either | |
| 635 in some code you forgot to sledgehammer check or as a result of | |
| 636 data corruption. */ | |
| 637 | |
/* Assert that TAB is a real, privately owned table: it must not be any
   of the shared blank tables (from_unicode levels 1-4, to_unicode
   levels 1-2) and must not be a null pointer.  Used by the sledgehammer
   checks on tables that are expected to contain at least one entry. */
static void
assert_not_any_blank_table (void *tab)
{
  assert (tab != from_unicode_blank_1);
  assert (tab != from_unicode_blank_2);
  assert (tab != from_unicode_blank_3);
  assert (tab != from_unicode_blank_4);
  assert (tab != to_unicode_blank_1);
  assert (tab != to_unicode_blank_2);
  assert (tab);
}
| 649 | |
| 650 static void | |
| 651 sledgehammer_check_from_table (Lisp_Object charset, void *table, int level, | |
| 652 int codetop) | |
| 653 { | |
| 654 int i; | |
| 655 | |
| 656 switch (level) | |
| 657 { | |
| 658 case 1: | |
| 659 { | |
| 660 short *tab = (short *) table; | |
| 661 for (i = 0; i < 256; i++) | |
| 662 { | |
| 663 if (tab[i] != -1) | |
| 664 { | |
| 665 Lisp_Object char_charset; | |
| 666 int c1, c2; | |
| 667 | |
| 867 | 668 assert (valid_ichar_p (tab[i])); |
| 669 BREAKUP_ICHAR (tab[i], char_charset, c1, c2); | |
| 771 | 670 assert (EQ (charset, char_charset)); |
| 671 if (XCHARSET_DIMENSION (charset) == 1) | |
| 672 { | |
| 673 int *to_table = | |
| 674 (int *) XCHARSET_TO_UNICODE_TABLE (charset); | |
| 675 assert_not_any_blank_table (to_table); | |
| 676 assert (to_table[c1 - 32] == (codetop << 8) + i); | |
| 677 } | |
| 678 else | |
| 679 { | |
| 680 int **to_table = | |
| 681 (int **) XCHARSET_TO_UNICODE_TABLE (charset); | |
| 682 assert_not_any_blank_table (to_table); | |
| 683 assert_not_any_blank_table (to_table[c1 - 32]); | |
| 684 assert (to_table[c1 - 32][c2 - 32] == (codetop << 8) + i); | |
| 685 } | |
| 686 } | |
| 687 } | |
| 688 break; | |
| 689 } | |
| 690 case 2: | |
| 691 { | |
| 692 short **tab = (short **) table; | |
| 693 for (i = 0; i < 256; i++) | |
| 694 { | |
| 695 if (tab[i] != from_unicode_blank_1) | |
| 696 sledgehammer_check_from_table (charset, tab[i], 1, | |
| 697 (codetop << 8) + i); | |
| 698 } | |
| 699 break; | |
| 700 } | |
| 701 case 3: | |
| 702 { | |
| 703 short ***tab = (short ***) table; | |
| 704 for (i = 0; i < 256; i++) | |
| 705 { | |
| 706 if (tab[i] != from_unicode_blank_2) | |
| 707 sledgehammer_check_from_table (charset, tab[i], 2, | |
| 708 (codetop << 8) + i); | |
| 709 } | |
| 710 break; | |
| 711 } | |
| 712 case 4: | |
| 713 { | |
| 714 short ****tab = (short ****) table; | |
| 715 for (i = 0; i < 256; i++) | |
| 716 { | |
| 717 if (tab[i] != from_unicode_blank_3) | |
| 718 sledgehammer_check_from_table (charset, tab[i], 3, | |
| 719 (codetop << 8) + i); | |
| 720 } | |
| 721 break; | |
| 722 } | |
| 723 default: | |
| 2500 | 724 ABORT (); |
| 771 | 725 } |
| 726 } | |
| 727 | |
/* Self-check of the to_unicode table TABLE of CHARSET.

   LEVEL is the depth of TABLE (1 or 2; anything else ABORT()s).  At
   level 2 each non-blank row is checked recursively, with the row index
   passed down as CODETOP.  At level 1 every non-empty slot must map
   back, through the charset's from_unicode table, to the character that
   owns the slot, and every from_unicode sub-table on the way to that
   entry must be a real (non-blank) table. */
static void
sledgehammer_check_to_table (Lisp_Object charset, void *table, int level,
                             int codetop)
{
  int i;

  switch (level)
    {
    case 1:
      {
        int *tab = (int *) table;

        /* A 94-character charset never uses the first and last of the
           96 slots. */
        if (XCHARSET_CHARS (charset) == 94)
          {
            assert (tab[0] == -1);
            assert (tab[95] == -1);
          }

        for (i = 0; i < 96; i++)
          {
            if (tab[i] != -1)
              {
                int u4, u3, u2, u1, levels;
                Ichar ch;
                Ichar this_ch;
                short val;
                void *frtab = XCHARSET_FROM_UNICODE_TABLE (charset);

                /* The character that owns this slot. */
                if (XCHARSET_DIMENSION (charset) == 1)
                  this_ch = make_ichar (charset, i + 32, 0);
                else
                  this_ch = make_ichar (charset, codetop + 32, i + 32);

                assert (tab[i] >= 0);
                /* Note the argument order: u4 receives the most
                   significant octet, u1 the least significant one. */
                BREAKUP_UNICODE_CODE (tab[i], u4, u3, u2, u1, levels);
                assert (levels <= XCHARSET_FROM_UNICODE_LEVELS (charset));

                /* Look the code point up in the reverse table. */
                switch (XCHARSET_FROM_UNICODE_LEVELS (charset))
                  {
                  case 1: val = ((short *) frtab)[u1]; break;
                  case 2: val = ((short **) frtab)[u2][u1]; break;
                  case 3: val = ((short ***) frtab)[u3][u2][u1]; break;
                  case 4: val = ((short ****) frtab)[u4][u3][u2][u1]; break;
                  default: ABORT ();
                  }

                /* The entry holds octet 1 in its high byte and octet 2
                   in its low byte. */
                ch = make_ichar (charset, val >> 8, val & 0xFF);
                assert (ch == this_ch);

                /* Walk down from the top level, checking that each
                   table on the path is not a shared blank table. */
                switch (XCHARSET_FROM_UNICODE_LEVELS (charset))
                  {
                  case 4:
                    assert_not_any_blank_table (frtab);
                    frtab = ((short ****) frtab)[u4];
                    /* fall through */
                  case 3:
                    assert_not_any_blank_table (frtab);
                    frtab = ((short ***) frtab)[u3];
                    /* fall through */
                  case 2:
                    assert_not_any_blank_table (frtab);
                    frtab = ((short **) frtab)[u2];
                    /* fall through */
                  case 1:
                    assert_not_any_blank_table (frtab);
                    break;
                  default: ABORT ();
                  }
              }
          }
        break;
      }
    case 2:
      {
        int **tab = (int **) table;

        /* A 94-character charset never uses the first and last rows. */
        if (XCHARSET_CHARS (charset) == 94)
          {
            assert (tab[0] == to_unicode_blank_1);
            assert (tab[95] == to_unicode_blank_1);
          }

        for (i = 0; i < 96; i++)
          {
            if (tab[i] != to_unicode_blank_1)
              sledgehammer_check_to_table (charset, tab[i], 1, i);
          }
        break;
      }
    default:
      ABORT ();
    }
}
| 821 | |
| 822 static void | |
| 823 sledgehammer_check_unicode_tables (Lisp_Object charset) | |
| 824 { | |
| 825 /* verify that the blank tables have not been modified */ | |
| 826 int i; | |
| 827 int from_level = XCHARSET_FROM_UNICODE_LEVELS (charset); | |
| 828 int to_level = XCHARSET_FROM_UNICODE_LEVELS (charset); | |
| 829 | |
| 830 for (i = 0; i < 256; i++) | |
| 831 { | |
| 832 assert (from_unicode_blank_1[i] == (short) -1); | |
| 833 assert (from_unicode_blank_2[i] == from_unicode_blank_1); | |
| 834 assert (from_unicode_blank_3[i] == from_unicode_blank_2); | |
| 835 assert (from_unicode_blank_4[i] == from_unicode_blank_3); | |
| 836 } | |
| 837 | |
| 838 for (i = 0; i < 96; i++) | |
| 839 { | |
| 840 assert (to_unicode_blank_1[i] == -1); | |
| 841 assert (to_unicode_blank_2[i] == to_unicode_blank_1); | |
| 842 } | |
| 843 | |
| 844 assert (from_level >= 1 && from_level <= 4); | |
| 845 | |
| 846 sledgehammer_check_from_table (charset, | |
| 847 XCHARSET_FROM_UNICODE_TABLE (charset), | |
| 848 from_level, 0); | |
| 849 | |
| 850 sledgehammer_check_to_table (charset, | |
| 851 XCHARSET_TO_UNICODE_TABLE (charset), | |
| 852 XCHARSET_DIMENSION (charset), 0); | |
| 853 } | |
| 854 | |
| 855 #endif /* SLEDGEHAMMER_CHECK_UNICODE */ | |
| 856 | |
| 857 static void | |
| 867 | 858 set_unicode_conversion (Ichar chr, int code) |
| 771 | 859 { |
| 860 Lisp_Object charset; | |
| 861 int c1, c2; | |
| 862 | |
| 867 | 863 BREAKUP_ICHAR (chr, charset, c1, c2); |
| 771 | 864 |
| 877 | 865 /* I tried an assert on code > 255 || chr == code, but that fails because |
| 866 Mule gives many Latin characters separate code points for different | |
| 867 ISO 8859 coded character sets. Obvious in hindsight.... */ | |
| 868 assert (!EQ (charset, Vcharset_ascii) || chr == code); | |
| 869 assert (!EQ (charset, Vcharset_latin_iso8859_1) || chr == code); | |
| 870 assert (!EQ (charset, Vcharset_control_1) || chr == code); | |
| 871 | |
| 872 /* This assert is needed because it is simply unimplemented. */ | |
| 771 | 873 assert (!EQ (charset, Vcharset_composite)); |
| 874 | |
| 875 #ifdef SLEDGEHAMMER_CHECK_UNICODE | |
| 876 sledgehammer_check_unicode_tables (charset); | |
| 877 #endif | |
| 878 | |
| 2704 | 879 if (EQ(charset, Vcharset_ascii) || EQ(charset, Vcharset_control_1)) |
| 880 return; | |
| 881 | |
| 771 | 882 /* First, the char -> unicode translation */ |
| 883 | |
| 884 if (XCHARSET_DIMENSION (charset) == 1) | |
| 885 { | |
| 886 int *to_table = (int *) XCHARSET_TO_UNICODE_TABLE (charset); | |
| 887 to_table[c1 - 32] = code; | |
| 888 } | |
| 889 else | |
| 890 { | |
| 891 int **to_table_2 = (int **) XCHARSET_TO_UNICODE_TABLE (charset); | |
| 892 int *to_table_1; | |
| 893 | |
| 894 assert (XCHARSET_DIMENSION (charset) == 2); | |
| 895 to_table_1 = to_table_2[c1 - 32]; | |
| 896 if (to_table_1 == to_unicode_blank_1) | |
| 897 { | |
| 898 to_table_1 = xnew_array (int, 96); | |
| 899 memcpy (to_table_1, to_unicode_blank_1, 96 * sizeof (int)); | |
| 900 to_table_2[c1 - 32] = to_table_1; | |
| 901 } | |
| 902 to_table_1[c2 - 32] = code; | |
| 903 } | |
| 904 | |
| 905 /* Then, unicode -> char: much harder */ | |
| 906 | |
| 907 { | |
| 908 int charset_levels; | |
| 909 int u4, u3, u2, u1; | |
| 910 int code_levels; | |
| 911 BREAKUP_UNICODE_CODE (code, u4, u3, u2, u1, code_levels); | |
| 912 | |
| 913 charset_levels = XCHARSET_FROM_UNICODE_LEVELS (charset); | |
| 914 | |
| 915 /* Make sure the charset's tables have at least as many levels as | |
| 916 the code point has: Note that the charset is guaranteed to have | |
| 917 at least one level, because it was created that way */ | |
| 918 if (charset_levels < code_levels) | |
| 919 { | |
| 920 int i; | |
| 921 | |
| 922 assert (charset_levels > 0); | |
| 923 for (i = 2; i <= code_levels; i++) | |
| 924 { | |
| 925 if (charset_levels < i) | |
| 926 { | |
| 927 void *old_table = XCHARSET_FROM_UNICODE_TABLE (charset); | |
| 928 void *table = create_new_from_unicode_table (i); | |
| 929 XCHARSET_FROM_UNICODE_TABLE (charset) = table; | |
| 930 | |
| 931 switch (i) | |
| 932 { | |
| 933 case 2: | |
| 934 ((short **) table)[0] = (short *) old_table; | |
| 935 break; | |
| 936 case 3: | |
| 937 ((short ***) table)[0] = (short **) old_table; | |
| 938 break; | |
| 939 case 4: | |
| 940 ((short ****) table)[0] = (short ***) old_table; | |
| 941 break; | |
| 2500 | 942 default: ABORT (); |
| 771 | 943 } |
| 944 } | |
| 945 } | |
| 946 | |
| 947 charset_levels = code_levels; | |
| 948 XCHARSET_FROM_UNICODE_LEVELS (charset) = code_levels; | |
| 949 } | |
| 950 | |
| 951 /* Now, make sure there is a non-default table at each level */ | |
| 952 { | |
| 953 int i; | |
| 954 void *table = XCHARSET_FROM_UNICODE_TABLE (charset); | |
| 955 | |
| 956 for (i = charset_levels; i >= 2; i--) | |
| 957 { | |
| 958 switch (i) | |
| 959 { | |
| 960 case 4: | |
| 961 if (((short ****) table)[u4] == from_unicode_blank_3) | |
| 962 ((short ****) table)[u4] = | |
| 963 ((short ***) create_new_from_unicode_table (3)); | |
| 964 table = ((short ****) table)[u4]; | |
| 965 break; | |
| 966 case 3: | |
| 967 if (((short ***) table)[u3] == from_unicode_blank_2) | |
| 968 ((short ***) table)[u3] = | |
| 969 ((short **) create_new_from_unicode_table (2)); | |
| 970 table = ((short ***) table)[u3]; | |
| 971 break; | |
| 972 case 2: | |
| 973 if (((short **) table)[u2] == from_unicode_blank_1) | |
| 974 ((short **) table)[u2] = | |
| 975 ((short *) create_new_from_unicode_table (1)); | |
| 976 table = ((short **) table)[u2]; | |
| 977 break; | |
| 2500 | 978 default: ABORT (); |
| 771 | 979 } |
| 980 } | |
| 981 } | |
| 982 | |
| 983 /* Finally, set the character */ | |
| 984 | |
| 985 { | |
| 986 void *table = XCHARSET_FROM_UNICODE_TABLE (charset); | |
| 987 switch (charset_levels) | |
| 988 { | |
| 989 case 1: ((short *) table)[u1] = (c1 << 8) + c2; break; | |
| 990 case 2: ((short **) table)[u2][u1] = (c1 << 8) + c2; break; | |
| 991 case 3: ((short ***) table)[u3][u2][u1] = (c1 << 8) + c2; break; | |
| 992 case 4: ((short ****) table)[u4][u3][u2][u1] = (c1 << 8) + c2; break; | |
| 2500 | 993 default: ABORT (); |
| 771 | 994 } |
| 995 } | |
| 996 } | |
| 997 | |
| 998 #ifdef SLEDGEHAMMER_CHECK_UNICODE | |
| 999 sledgehammer_check_unicode_tables (charset); | |
| 1000 #endif | |
| 1001 } | |
| 1002 | |
| 788 | 1003 int |
| 867 | 1004 ichar_to_unicode (Ichar chr) |
| 771 | 1005 { |
| 1006 Lisp_Object charset; | |
| 1007 int c1, c2; | |
| 1008 | |
| 867 | 1009 type_checking_assert (valid_ichar_p (chr)); |
| 877 | 1010 /* This shortcut depends on the representation of an Ichar, see text.c. */ |
| 771 | 1011 if (chr < 256) |
| 1012 return (int) chr; | |
| 1013 | |
| 867 | 1014 BREAKUP_ICHAR (chr, charset, c1, c2); |
| 771 | 1015 if (EQ (charset, Vcharset_composite)) |
| 1016 return -1; /* #### don't know how to handle */ | |
| 1017 else if (XCHARSET_DIMENSION (charset) == 1) | |
| 1018 return ((int *) XCHARSET_TO_UNICODE_TABLE (charset))[c1 - 32]; | |
| 1019 else | |
| 1020 return ((int **) XCHARSET_TO_UNICODE_TABLE (charset))[c1 - 32][c2 - 32]; | |
| 1021 } | |
| 1022 | |
| 867 | 1023 static Ichar |
| 3439 | 1024 get_free_codepoint(Lisp_Object charset) |
| 1025 { | |
| 1026 Lisp_Object name = Fcharset_name(charset); | |
| 1027 Lisp_Object zeichen = Fget(name, Qlast_allocated_character, Qnil); | |
| 1028 Ichar res; | |
| 1029 | |
| 1030 /* Only allow this with the 96x96 character sets we are using for | |
| 1031 temporary Unicode support. */ | |
| 1032 assert(2 == XCHARSET_DIMENSION(charset) && 96 == XCHARSET_CHARS(charset)); | |
| 1033 | |
| 1034 if (!NILP(zeichen)) | |
| 1035 { | |
| 1036 int c1, c2; | |
| 1037 | |
| 1038 BREAKUP_ICHAR(XCHAR(zeichen), charset, c1, c2); | |
| 1039 | |
| 1040 if (127 == c1 && 127 == c2) | |
| 1041 { | |
| 1042 /* We've already used the hightest-numbered character in this | |
| 1043 set--tell our caller to create another. */ | |
| 1044 return -1; | |
| 1045 } | |
| 1046 | |
| 1047 if (127 == c2) | |
| 1048 { | |
| 1049 ++c1; | |
| 1050 c2 = 0x20; | |
| 1051 } | |
| 1052 else | |
| 1053 { | |
| 1054 ++c2; | |
| 1055 } | |
| 1056 | |
| 1057 res = make_ichar(charset, c1, c2); | |
| 1058 Fput(name, Qlast_allocated_character, make_char(res)); | |
| 1059 } | |
| 1060 else | |
| 1061 { | |
| 1062 res = make_ichar(charset, 32, 32); | |
| 1063 Fput(name, Qlast_allocated_character, make_char(res)); | |
| 1064 } | |
| 1065 return res; | |
| 1066 } | |
| 1067 | |
| 1068 /* The just-in-time creation of XEmacs characters that correspond to unknown | |
| 1069 Unicode code points happens when: | |
| 1070 | |
| 1071 1. The lookup would otherwise fail. | |
| 1072 | |
| 1073 2. The charsets array is the nil or the default. | |
| 1074 | |
| 1075 If there are no free code points in the just-in-time Unicode character | |
| 1076 set, and the charsets array is the default unicode precedence list, | |
| 1077 create a new just-in-time Unicode character set, add it at the end of the | |
| 1078 unicode precedence list, create the XEmacs character in that character | |
| 1079 set, and return it. */ | |
| 1080 | |
| 1081 static Ichar | |
| 877 | 1082 unicode_to_ichar (int code, Lisp_Object_dynarr *charsets) |
| 771 | 1083 { |
| 1084 int u1, u2, u3, u4; | |
| 1085 int code_levels; | |
| 1086 int i; | |
| 1087 int n = Dynarr_length (charsets); | |
| 1088 | |
| 1089 type_checking_assert (code >= 0); | |
| 877 | 1090 /* This shortcut depends on the representation of an Ichar, see text.c. |
| 1091 Note that it may _not_ be extended to U+00A0 to U+00FF (many ISO 8859 | |
| 893 | 1092 coded character sets have points that map into that region, so this |
| 1093 function is many-valued). */ | |
| 877 | 1094 if (code < 0xA0) |
| 867 | 1095 return (Ichar) code; |
| 771 | 1096 |
| 1097 BREAKUP_UNICODE_CODE (code, u4, u3, u2, u1, code_levels); | |
| 1098 | |
| 1099 for (i = 0; i < n; i++) | |
| 1100 { | |
| 1101 Lisp_Object charset = Dynarr_at (charsets, i); | |
| 1102 int charset_levels = XCHARSET_FROM_UNICODE_LEVELS (charset); | |
| 1103 if (charset_levels >= code_levels) | |
| 1104 { | |
| 1105 void *table = XCHARSET_FROM_UNICODE_TABLE (charset); | |
| 1106 short retval; | |
| 1107 | |
| 1108 switch (charset_levels) | |
| 1109 { | |
| 1110 case 1: retval = ((short *) table)[u1]; break; | |
| 1111 case 2: retval = ((short **) table)[u2][u1]; break; | |
| 1112 case 3: retval = ((short ***) table)[u3][u2][u1]; break; | |
| 1113 case 4: retval = ((short ****) table)[u4][u3][u2][u1]; break; | |
| 2500 | 1114 default: ABORT (); retval = 0; |
| 771 | 1115 } |
| 1116 | |
| 1117 if (retval != -1) | |
| 867 | 1118 return make_ichar (charset, retval >> 8, retval & 0xFF); |
| 771 | 1119 } |
| 1120 } | |
| 3439 | 1121 |
| 1122 /* Only do the magic just-in-time assignment if we're using the default | |
| 1123 list. */ | |
| 1124 if (unicode_precedence_dynarr == charsets) | |
| 1125 { | |
| 1126 if (NILP (Vcurrent_jit_charset) || | |
| 1127 (-1 == (i = get_free_codepoint(Vcurrent_jit_charset)))) | |
| 1128 { | |
| 3452 | 1129 Ibyte setname[32]; |
| 4268 | 1130 int number_of_jit_charsets = XINT (Vnumber_of_jit_charsets); |
| 1131 Ascbyte last_jit_charset_final = XCHAR (Vlast_jit_charset_final); | |
| 1132 | |
| 1133 /* This final byte shit is, umm, not that cool. */ | |
| 1134 assert (last_jit_charset_final >= 0x30); | |
| 3439 | 1135 |
| 3452 | 1136 /* Assertion added partly because our Win32 layer doesn't |
| 1137 support snprintf; with this, we're sure it won't overflow | |
| 1138 the buffer. */ | |
| 1139 assert(100 > number_of_jit_charsets); | |
| 1140 | |
| 4268 | 1141 qxesprintf(setname, "jit-ucs-charset-%d", number_of_jit_charsets); |
| 1142 | |
| 3439 | 1143 Vcurrent_jit_charset = Fmake_charset |
| 4268 | 1144 (intern((const CIbyte *)setname), Vcharset_descr, |
| 3439 | 1145 /* Set encode-as-utf-8 to t, to have this character set written |
| 1146 using UTF-8 escapes in escape-quoted and ctext. This | |
| 1147 sidesteps the fact that our internal character -> Unicode | |
| 1148 mapping is not stable from one invocation to the next. */ | |
| 1149 nconc2 (list2(Qencode_as_utf_8, Qt), | |
| 1150 nconc2 (list6(Qcolumns, make_int(1), Qchars, make_int(96), | |
| 1151 Qdimension, make_int(2)), | |
| 3659 | 1152 list6(Qregistries, Qunicode_registries, |
| 4268 | 1153 Qfinal, make_char(last_jit_charset_final), |
| 3439 | 1154 /* This CCL program is initialised in |
| 1155 unicode.el. */ | |
| 1156 Qccl_program, Qccl_encode_to_ucs_2)))); | |
| 4268 | 1157 |
| 1158 /* Record for the Unicode infrastructure that we've created | |
| 1159 this character set. */ | |
| 1160 Vnumber_of_jit_charsets = make_int (number_of_jit_charsets + 1); | |
| 1161 Vlast_jit_charset_final = make_char (last_jit_charset_final + 1); | |
| 3439 | 1162 |
| 1163 i = get_free_codepoint(Vcurrent_jit_charset); | |
| 1164 } | |
| 1165 | |
| 1166 if (-1 != i) | |
| 1167 { | |
| 1168 set_unicode_conversion((Ichar)i, code); | |
| 1169 /* No need to add the charset to the end of the list; it's done | |
| 1170 automatically. */ | |
| 1171 } | |
| 1172 } | |
| 1173 return (Ichar) i; | |
| 771 | 1174 } |
| 1175 | |
| 877 | 1176 /* Add charsets to precedence list. |
| 1177 LIST must be a list of charsets. Charsets which are in the list more | |
| 1178 than once are given the precedence implied by their earliest appearance. | |
| 1179 Later appearances are ignored. */ | |
| 771 | 1180 static void |
| 1181 add_charsets_to_precedence_list (Lisp_Object list, int *lbs, | |
| 1182 Lisp_Object_dynarr *dynarr) | |
| 1183 { | |
| 1184 { | |
| 1185 EXTERNAL_LIST_LOOP_2 (elt, list) | |
| 1186 { | |
| 1187 Lisp_Object charset = Fget_charset (elt); | |
| 778 | 1188 int lb = XCHARSET_LEADING_BYTE (charset); |
| 771 | 1189 if (lbs[lb - MIN_LEADING_BYTE] == 0) |
| 1190 { | |
| 877 | 1191 Dynarr_add (dynarr, charset); |
| 771 | 1192 lbs[lb - MIN_LEADING_BYTE] = 1; |
| 1193 } | |
| 1194 } | |
| 1195 } | |
| 1196 } | |
| 1197 | |
| 877 | 1198 /* Rebuild the charset precedence array. |
| 1199 The "charsets preferred for the current language" get highest precedence, | |
| 1200 followed by the "charsets preferred by default", ordered as in | |
| 1201 Vlanguage_unicode_precedence_list and Vdefault_unicode_precedence_list, | |
| 1202 respectively. All remaining charsets follow in an arbitrary order. */ | |
| 771 | 1203 void |
| 1204 recalculate_unicode_precedence (void) | |
| 1205 { | |
| 1206 int lbs[NUM_LEADING_BYTES]; | |
| 1207 int i; | |
| 1208 | |
| 1209 for (i = 0; i < NUM_LEADING_BYTES; i++) | |
| 1210 lbs[i] = 0; | |
| 1211 | |
| 1212 Dynarr_reset (unicode_precedence_dynarr); | |
| 1213 | |
| 1214 add_charsets_to_precedence_list (Vlanguage_unicode_precedence_list, | |
| 1215 lbs, unicode_precedence_dynarr); | |
| 1216 add_charsets_to_precedence_list (Vdefault_unicode_precedence_list, | |
| 1217 lbs, unicode_precedence_dynarr); | |
| 1218 | |
| 1219 for (i = 0; i < NUM_LEADING_BYTES; i++) | |
| 1220 { | |
| 1221 if (lbs[i] == 0) | |
| 1222 { | |
| 826 | 1223 Lisp_Object charset = charset_by_leading_byte (i + MIN_LEADING_BYTE); |
| 771 | 1224 if (!NILP (charset)) |
| 1225 Dynarr_add (unicode_precedence_dynarr, charset); | |
| 1226 } | |
| 1227 } | |
| 1228 } | |
| 1229 | |
| 877 | 1230 DEFUN ("unicode-precedence-list", |
| 1231 Funicode_precedence_list, | |
| 1232 0, 0, 0, /* | |
| 1233 Return the precedence order among charsets used for Unicode decoding. | |
| 1234 | |
| 1235 Value is a list of charsets, which are searched in order for a translation | |
| 1236 matching a given Unicode character. | |
| 1237 | |
| 1238 The highest precedence is given to the language-specific precedence list of | |
| 1239 charsets, defined by `set-language-unicode-precedence-list'. These are | |
| 1240 followed by charsets in the default precedence list, defined by | |
| 1241 `set-default-unicode-precedence-list'. Charsets occurring multiple times are | |
| 1242 given precedence according to their first occurrance in either list. These | |
| 1243 are followed by the remaining charsets, in some arbitrary order. | |
| 771 | 1244 |
| 1245 The language-specific precedence list is meant to be set as part of the | |
| 1246 language environment initialization; the default precedence list is meant | |
| 1247 to be set by the user. | |
| 1318 | 1248 |
| 1249 #### NOTE: This interface may be changed. | |
| 771 | 1250 */ |
| 877 | 1251 ()) |
| 1252 { | |
| 1253 int i; | |
| 1254 Lisp_Object list = Qnil; | |
| 1255 | |
| 1256 for (i = Dynarr_length (unicode_precedence_dynarr) - 1; i >= 0; i--) | |
| 1257 list = Fcons (Dynarr_at (unicode_precedence_dynarr, i), list); | |
| 1258 return list; | |
| 1259 } | |
| 1260 | |
| 1261 | |
| 1262 /* #### This interface is wrong. Cyrillic users and Chinese users are going | |
| 1263 to have varying opinions about whether ISO Cyrillic, KOI8-R, or Windows | |
| 1264 1251 should take precedence, and whether Big Five or CNS should take | |
| 1265 precedence, respectively. This means that users are sometimes going to | |
| 1266 want to set Vlanguage_unicode_precedence_list. | |
| 1267 Furthermore, this should be language-local (buffer-local would be a | |
| 1318 | 1268 reasonable approximation). |
| 1269 | |
| 1270 Answer: You are right, this needs rethinking. */ | |
| 877 | 1271 DEFUN ("set-language-unicode-precedence-list", |
| 1272 Fset_language_unicode_precedence_list, | |
| 1273 1, 1, 0, /* | |
| 1274 Set the language-specific precedence of charsets in Unicode decoding. | |
| 1275 LIST is a list of charsets. | |
| 1276 See `unicode-precedence-list' for more information. | |
| 1318 | 1277 |
| 1278 #### NOTE: This interface may be changed. | |
| 877 | 1279 */ |
| 771 | 1280 (list)) |
| 1281 { | |
| 1282 { | |
| 1283 EXTERNAL_LIST_LOOP_2 (elt, list) | |
| 1284 Fget_charset (elt); | |
| 1285 } | |
| 1286 | |
| 1287 Vlanguage_unicode_precedence_list = list; | |
| 1288 recalculate_unicode_precedence (); | |
| 1289 return Qnil; | |
| 1290 } | |
| 1291 | |
DEFUN ("language-unicode-precedence-list",
       Flanguage_unicode_precedence_list,
       0, 0, 0, /*
Return the language-specific precedence list used for Unicode decoding.
See `unicode-precedence-list' for more information.

#### NOTE: This interface may be changed.
*/
       ())
{
  /* The stored list object itself is returned, not a copy. */
  return Vlanguage_unicode_precedence_list;
}
| 1304 | |
| 1305 DEFUN ("set-default-unicode-precedence-list", | |
| 1306 Fset_default_unicode_precedence_list, | |
| 1307 1, 1, 0, /* | |
| 1308 Set the default precedence list used for Unicode decoding. | |
| 877 | 1309 This is intended to be set by the user. See |
| 1310 `unicode-precedence-list' for more information. | |
| 1318 | 1311 |
| 1312 #### NOTE: This interface may be changed. | |
| 771 | 1313 */ |
| 1314 (list)) | |
| 1315 { | |
| 1316 { | |
| 1317 EXTERNAL_LIST_LOOP_2 (elt, list) | |
| 1318 Fget_charset (elt); | |
| 1319 } | |
| 1320 | |
| 1321 Vdefault_unicode_precedence_list = list; | |
| 1322 recalculate_unicode_precedence (); | |
| 1323 return Qnil; | |
| 1324 } | |
| 1325 | |
DEFUN ("default-unicode-precedence-list",
       Fdefault_unicode_precedence_list,
       0, 0, 0, /*
Return the default precedence list used for Unicode decoding.
See `unicode-precedence-list' for more information.

#### NOTE: This interface may be changed.
*/
       ())
{
  /* The stored list object itself is returned, not a copy. */
  return Vdefault_unicode_precedence_list;
}
| 1338 | |
| 1339 DEFUN ("set-unicode-conversion", Fset_unicode_conversion, | |
| 1340 2, 2, 0, /* | |
| 1341 Add conversion information between Unicode codepoints and characters. | |
| 877 | 1342 Conversions for U+0000 to U+00FF are hardwired to ASCII, Control-1, and |
| 1343 Latin-1. Attempts to set these values will raise an error. | |
| 1344 | |
| 771 | 1345 CHARACTER is one of the following: |
| 1346 | |
| 1347 -- A character (in which case CODE must be a non-negative integer; values | |
| 1348 above 2^20 - 1 are allowed for the purpose of specifying private | |
| 877 | 1349 characters, but are illegal in standard Unicode---they will cause errors |
| 1350 when converted to utf-16) | |
| 771 | 1351 -- A vector of characters (in which case CODE must be a vector of integers |
| 1352 of the same length) | |
| 1353 */ | |
| 1354 (character, code)) | |
| 1355 { | |
| 1356 Lisp_Object charset; | |
| 877 | 1357 int ichar, unicode; |
| 771 | 1358 |
| 1359 CHECK_CHAR (character); | |
| 1360 CHECK_NATNUM (code); | |
| 1361 | |
| 877 | 1362 unicode = XINT (code); |
| 1363 ichar = XCHAR (character); | |
| 1364 charset = ichar_charset (ichar); | |
| 1365 | |
| 1366 /* The translations of ASCII, Control-1, and Latin-1 code points are | |
| 1367 hard-coded in ichar_to_unicode and unicode_to_ichar. | |
| 1368 | |
| 1369 Checking unicode < 256 && ichar != unicode is wrong because Mule gives | |
| 1370 many Latin characters code points in a few different character sets. */ | |
| 1371 if ((EQ (charset, Vcharset_ascii) || | |
| 1372 EQ (charset, Vcharset_control_1) || | |
| 1373 EQ (charset, Vcharset_latin_iso8859_1)) | |
| 1374 && unicode != ichar) | |
| 893 | 1375 signal_error (Qinvalid_argument, "Can't change Unicode translation for ASCII, Control-1 or Latin-1 character", |
| 771 | 1376 character); |
| 1377 | |
| 877 | 1378 /* #### Composite characters are not properly implemented yet. */ |
| 1379 if (EQ (charset, Vcharset_composite)) | |
| 1380 signal_error (Qinvalid_argument, "Can't set Unicode translation for Composite char", | |
| 1381 character); | |
| 1382 | |
| 1383 set_unicode_conversion (ichar, unicode); | |
| 771 | 1384 return Qnil; |
| 1385 } | |
| 1386 | |
| 1387 #endif /* MULE */ | |
| 1388 | |
| 800 | 1389 DEFUN ("char-to-unicode", Fchar_to_unicode, 1, 1, 0, /* |
| 771 | 1390 Convert character to Unicode codepoint. |
| 3025 | 1391 When there is no international support (i.e. the `mule' feature is not |
| 877 | 1392 present), this function simply does `char-to-int'. |
| 771 | 1393 */ |
| 1394 (character)) | |
| 1395 { | |
| 1396 CHECK_CHAR (character); | |
| 1397 #ifdef MULE | |
| 867 | 1398 return make_int (ichar_to_unicode (XCHAR (character))); |
| 771 | 1399 #else |
| 1400 return Fchar_to_int (character); | |
| 1401 #endif /* MULE */ | |
| 1402 } | |
| 1403 | |
| 800 | 1404 DEFUN ("unicode-to-char", Funicode_to_char, 1, 2, 0, /* |
| 771 | 1405 Convert Unicode codepoint to character. |
| 1406 CODE should be a non-negative integer. | |
| 1407 If CHARSETS is given, it should be a list of charsets, and only those | |
| 1408 charsets will be consulted, in the given order, for a translation. | |
| 1409 Otherwise, the default ordering of all charsets will be given (see | |
| 1410 `set-unicode-charset-precedence'). | |
| 1411 | |
| 3025 | 1412 When there is no international support (i.e. the `mule' feature is not |
| 877 | 1413 present), this function simply does `int-to-char' and ignores the CHARSETS |
| 1414 argument. | |
| 2622 | 1415 |
| 3439 | 1416 If the CODE would not otherwise be converted to an XEmacs character, and the |
| 1417 list of character sets to be consulted is nil or the default, a new XEmacs | |
| 1418 character will be created for it in one of the `jit-ucs-charset' Mule | |
| 4268 | 1419 character sets, and that character will be returned. |
| 1420 | |
| 1421 This is limited to around 400,000 characters per XEmacs session, though, so | |
| 1422 while normal usage will not be problematic, things like: | |
| 1423 | |
| 1424 \(dotimes (i #x110000) (decode-char 'ucs i)) | |
| 1425 | |
| 1426 will eventually error. The long-term solution to this is Unicode as an | |
| 1427 internal encoding. | |
| 771 | 1428 */ |
| 2333 | 1429 (code, USED_IF_MULE (charsets))) |
| 771 | 1430 { |
| 1431 #ifdef MULE | |
| 1432 Lisp_Object_dynarr *dyn; | |
| 1433 int lbs[NUM_LEADING_BYTES]; | |
| 1434 int c; | |
| 1435 | |
| 1436 CHECK_NATNUM (code); | |
| 1437 c = XINT (code); | |
| 1438 { | |
| 1439 EXTERNAL_LIST_LOOP_2 (elt, charsets) | |
| 1440 Fget_charset (elt); | |
| 1441 } | |
| 1442 | |
| 1443 if (NILP (charsets)) | |
| 1444 { | |
| 877 | 1445 Ichar ret = unicode_to_ichar (c, unicode_precedence_dynarr); |
| 771 | 1446 if (ret == -1) |
| 1447 return Qnil; | |
| 1448 return make_char (ret); | |
| 1449 } | |
| 1450 | |
| 1451 dyn = Dynarr_new (Lisp_Object); | |
| 1452 memset (lbs, 0, NUM_LEADING_BYTES * sizeof (int)); | |
| 1453 add_charsets_to_precedence_list (charsets, lbs, dyn); | |
| 1454 { | |
| 877 | 1455 Ichar ret = unicode_to_ichar (c, dyn); |
| 771 | 1456 Dynarr_free (dyn); |
| 1457 if (ret == -1) | |
| 1458 return Qnil; | |
| 1459 return make_char (ret); | |
| 1460 } | |
| 1461 #else | |
| 1462 CHECK_NATNUM (code); | |
| 1463 return Fint_to_char (code); | |
| 1464 #endif /* MULE */ | |
| 1465 } | |
| 1466 | |
| 872 | 1467 #ifdef MULE |
| 1468 | |
| 771 | 1469 static Lisp_Object |
| 1470 cerrar_el_fulano (Lisp_Object fulano) | |
| 1471 { | |
| 1472 FILE *file = (FILE *) get_opaque_ptr (fulano); | |
| 1473 retry_fclose (file); | |
| 1474 return Qnil; | |
| 1475 } | |
| 1476 | |
| 1318 | 1477 DEFUN ("load-unicode-mapping-table", Fload_unicode_mapping_table, |
| 771 | 1478 2, 6, 0, /* |
| 877 | 1479 Load Unicode tables with the Unicode mapping data in FILENAME for CHARSET. |
| 771 | 1480 Data is text, in the form of one translation per line -- charset |
| 1481 codepoint followed by Unicode codepoint. Numbers are decimal or hex | |
| 1482 \(preceded by 0x). Comments are marked with a #. Charset codepoints | |
| 877 | 1483 for two-dimensional charsets have the first octet stored in the |
| 771 | 1484 high 8 bits of the hex number and the second in the low 8 bits. |
| 1485 | |
| 1486 If START and END are given, only charset codepoints within the given | |
| 877 | 1487 range will be processed. (START and END apply to the codepoints in the |
| 1488 file, before OFFSET is applied.) | |
| 771 | 1489 |
| 877 | 1490 If OFFSET is given, that value will be added to all charset codepoints |
| 1491 in the file to obtain the internal charset codepoint. \(We assume | |
| 1492 that octets in the table are in the range 33 to 126 or 32 to 127. If | |
| 1493 you have a table in ku-ten form, with octets in the range 1 to 94, you | |
| 1494 will have to use an offset of 5140, i.e. 0x2020.) | |
| 771 | 1495 |
| 1496 FLAGS, if specified, control further how the tables are interpreted | |
| 877 | 1497 and are used to special-case certain known format deviations in the |
| 1498 Unicode tables or in the charset: | |
| 771 | 1499 |
| 1500 `ignore-first-column' | |
| 877 | 1501 The JIS X 0208 tables have 3 columns of data instead of 2. The first |
| 1502 column contains the Shift-JIS codepoint, which we ignore. | |
| 771 | 1503 `big5' |
| 877 | 1504 The charset codepoints are Big Five codepoints; convert it to the |
| 1505 hacked-up Mule codepoint in `chinese-big5-1' or `chinese-big5-2'. | |
| 771 | 1506 */ |
| 1507 (filename, charset, start, end, offset, flags)) | |
| 1508 { | |
| 1509 int st = 0, en = INT_MAX, of = 0; | |
| 1510 FILE *file; | |
| 1511 struct gcpro gcpro1; | |
| 1512 char line[1025]; | |
| 1513 int fondo = specpdl_depth (); | |
| 1514 int ignore_first_column = 0; | |
| 1515 int big5 = 0; | |
| 1516 | |
| 1517 CHECK_STRING (filename); | |
| 1518 charset = Fget_charset (charset); | |
| 1519 if (!NILP (start)) | |
| 1520 { | |
| 1521 CHECK_INT (start); | |
| 1522 st = XINT (start); | |
| 1523 } | |
| 1524 if (!NILP (end)) | |
| 1525 { | |
| 1526 CHECK_INT (end); | |
| 1527 en = XINT (end); | |
| 1528 } | |
| 1529 if (!NILP (offset)) | |
| 1530 { | |
| 1531 CHECK_INT (offset); | |
| 1532 of = XINT (offset); | |
| 1533 } | |
| 1534 | |
| 1535 if (!LISTP (flags)) | |
| 1536 flags = list1 (flags); | |
| 1537 | |
| 1538 { | |
| 1539 EXTERNAL_LIST_LOOP_2 (elt, flags) | |
| 1540 { | |
| 1541 if (EQ (elt, Qignore_first_column)) | |
| 1542 ignore_first_column = 1; | |
| 1543 else if (EQ (elt, Qbig5)) | |
| 1544 big5 = 1; | |
| 1545 else | |
| 1546 invalid_constant | |
| 1318 | 1547 ("Unrecognized `load-unicode-mapping-table' flag", elt); |
| 771 | 1548 } |
| 1549 } | |
| 1550 | |
| 1551 GCPRO1 (filename); | |
| 1552 filename = Fexpand_file_name (filename, Qnil); | |
| 1553 file = qxe_fopen (XSTRING_DATA (filename), READ_TEXT); | |
| 1554 if (!file) | |
| 1555 report_file_error ("Cannot open", filename); | |
| 1556 record_unwind_protect (cerrar_el_fulano, make_opaque_ptr (file)); | |
| 1557 while (fgets (line, sizeof (line), file)) | |
| 1558 { | |
| 1559 char *p = line; | |
| 1560 int cp1, cp2, endcount; | |
| 1561 int cp1high, cp1low; | |
| 1562 int dummy; | |
| 1563 | |
| 1564 while (*p) /* erase all comments out of the line */ | |
| 1565 { | |
| 1566 if (*p == '#') | |
| 1567 *p = '\0'; | |
| 1568 else | |
| 1569 p++; | |
| 1570 } | |
| 1571 /* see if line is nothing but whitespace and skip if so */ | |
| 1572 p = line + strspn (line, " \t\n\r\f"); | |
| 1573 if (!*p) | |
| 1574 continue; | |
| 1575 /* NOTE: It appears that MS Windows and Newlib sscanf() have | |
| 1576 different interpretations for whitespace (== "skip all whitespace | |
| 1577 at processing point"): Newlib requires at least one corresponding | |
| 1578 whitespace character in the input, but MS allows none. The | |
| 1579 following would be easier to write if we could count on the MS | |
| 1580 interpretation. | |
| 1581 | |
| 1582 Also, the return value does NOT include %n storage. */ | |
| 1583 if ((!ignore_first_column ? | |
| 1584 sscanf (p, "%i %i%n", &cp1, &cp2, &endcount) < 2 : | |
| 1585 sscanf (p, "%i %i %i%n", &dummy, &cp1, &cp2, &endcount) < 3) | |
| 2367 | 1586 /* #### Temporary code! Cygwin newlib fucked up scanf() handling |
| 1587 of numbers beginning 0x0... starting in 04/2004, in an attempt | |
| 1588 to fix another bug. A partial fix for this was put in in | |
| 1589 06/2004, but as of 10/2004 the value of ENDCOUNT returned in | |
| 1590 such case is still wrong. If this gets fixed soon, remove | |
| 1591 this code. --ben */ | |
| 1592 #ifndef CYGWIN_SCANF_BUG | |
| 1593 || *(p + endcount + strspn (p + endcount, " \t\n\r\f")) | |
| 1594 #endif | |
| 1595 ) | |
| 771 | 1596 { |
| 793 | 1597 warn_when_safe (Qunicode, Qwarning, |
| 771 | 1598 "Unrecognized line in translation file %s:\n%s", |
| 1599 XSTRING_DATA (filename), line); | |
| 1600 continue; | |
| 1601 } | |
| 1602 if (cp1 >= st && cp1 <= en) | |
| 1603 { | |
| 1604 cp1 += of; | |
| 1605 if (cp1 < 0 || cp1 >= 65536) | |
| 1606 { | |
| 1607 out_of_range: | |
| 793 | 1608 warn_when_safe (Qunicode, Qwarning, |
| 1609 "Out of range first codepoint 0x%x in " | |
| 1610 "translation file %s:\n%s", | |
| 771 | 1611 cp1, XSTRING_DATA (filename), line); |
| 1612 continue; | |
| 1613 } | |
| 1614 | |
| 1615 cp1high = cp1 >> 8; | |
| 1616 cp1low = cp1 & 255; | |
| 1617 | |
| 1618 if (big5) | |
| 1619 { | |
| 867 | 1620 Ichar ch = decode_big5_char (cp1high, cp1low); |
| 771 | 1621 if (ch == -1) |
| 793 | 1622 |
| 1623 warn_when_safe (Qunicode, Qwarning, | |
| 1624 "Out of range Big5 codepoint 0x%x in " | |
| 1625 "translation file %s:\n%s", | |
| 771 | 1626 cp1, XSTRING_DATA (filename), line); |
| 1627 else | |
| 1628 set_unicode_conversion (ch, cp2); | |
| 1629 } | |
| 1630 else | |
| 1631 { | |
| 1632 int l1, h1, l2, h2; | |
| 867 | 1633 Ichar emch; |
| 771 | 1634 |
| 1635 switch (XCHARSET_TYPE (charset)) | |
| 1636 { | |
| 1637 case CHARSET_TYPE_94: l1 = 33; h1 = 126; l2 = 0; h2 = 0; break; | |
| 1638 case CHARSET_TYPE_96: l1 = 32; h1 = 127; l2 = 0; h2 = 0; break; | |
| 1639 case CHARSET_TYPE_94X94: l1 = 33; h1 = 126; l2 = 33; h2 = 126; | |
| 1640 break; | |
| 1641 case CHARSET_TYPE_96X96: l1 = 32; h1 = 127; l2 = 32; h2 = 127; | |
| 1642 break; | |
| 2500 | 1643 default: ABORT (); l1 = 0; h1 = 0; l2 = 0; h2 = 0; |
| 771 | 1644 } |
| 1645 | |
| 1646 if (cp1high < l2 || cp1high > h2 || cp1low < l1 || cp1low > h1) | |
| 1647 goto out_of_range; | |
| 1648 | |
| 867 | 1649 emch = (cp1high == 0 ? make_ichar (charset, cp1low, 0) : |
| 1650 make_ichar (charset, cp1high, cp1low)); | |
| 771 | 1651 set_unicode_conversion (emch, cp2); |
| 1652 } | |
| 1653 } | |
| 1654 } | |
| 1655 | |
| 1656 if (ferror (file)) | |
| 1657 report_file_error ("IO error when reading", filename); | |
| 1658 | |
| 1659 unbind_to (fondo); /* close file */ | |
| 1660 UNGCPRO; | |
| 1661 return Qnil; | |
| 1662 } | |
| 1663 | |
| 1664 #endif /* MULE */ | |
| 1665 | |
| 1666 | |
| 1667 /************************************************************************/ | |
| 1668 /* Unicode coding system */ | |
| 1669 /************************************************************************/ | |
| 1670 | |
| 1671 struct unicode_coding_system | |
| 1672 { | |
| 1673 enum unicode_type type; | |
| 1887 | 1674 unsigned int little_endian :1; |
| 1675 unsigned int need_bom :1; | |
| 771 | 1676 }; |
| 1677 | |
/* Accessors for the fields of struct unicode_coding_system.  The
   CODING_SYSTEM_* forms take the argument that CODING_SYSTEM_TYPE_DATA
   expects; the XCODING_SYSTEM_* forms first pass their argument through
   XCODING_SYSTEM. */
#define CODING_SYSTEM_UNICODE_TYPE(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, unicode)->type)
#define XCODING_SYSTEM_UNICODE_TYPE(codesys) \
  CODING_SYSTEM_UNICODE_TYPE (XCODING_SYSTEM (codesys))
#define CODING_SYSTEM_UNICODE_LITTLE_ENDIAN(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, unicode)->little_endian)
#define XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN(codesys) \
  CODING_SYSTEM_UNICODE_LITTLE_ENDIAN (XCODING_SYSTEM (codesys))
#define CODING_SYSTEM_UNICODE_NEED_BOM(codesys) \
  (CODING_SYSTEM_TYPE_DATA (codesys, unicode)->need_bom)
#define XCODING_SYSTEM_UNICODE_NEED_BOM(codesys) \
  CODING_SYSTEM_UNICODE_NEED_BOM (XCODING_SYSTEM (codesys))
| 1690 | |
| 1691 struct unicode_coding_stream | |
| 1692 { | |
| 1693 /* decode */ | |
| 1694 unsigned char counter; | |
| 4096 | 1695 unsigned char indicated_length; |
| 771 | 1696 int seen_char; |
| 1697 /* encode */ | |
| 1698 Lisp_Object current_charset; | |
| 1699 int current_char_boundary; | |
| 1700 int wrote_bom; | |
| 1701 }; | |
| 1702 | |
/* Memory description for the Unicode coding-system data: it contains only
   the XD_END terminator, i.e. no fields are described.
   NOTE(review): presumably this is because struct unicode_coding_system
   holds no Lisp objects -- confirm against the memory_description docs. */
static const struct memory_description unicode_coding_system_description[] = {
  { XD_END }
};

DEFINE_CODING_SYSTEM_TYPE_WITH_DATA (unicode);
| 1708 | |
| 771 | 1709 static void |
| 1710 decode_unicode_char (int ch, unsigned_char_dynarr *dst, | |
| 1887 | 1711 struct unicode_coding_stream *data, |
| 1712 unsigned int ignore_bom) | |
| 771 | 1713 { |
| 1714 if (ch == 0xFEFF && !data->seen_char && ignore_bom) | |
| 1715 ; | |
| 1716 else | |
| 1717 { | |
| 1718 #ifdef MULE | |
| 877 | 1719 Ichar chr = unicode_to_ichar (ch, unicode_precedence_dynarr); |
| 771 | 1720 |
| 1721 if (chr != -1) | |
| 1722 { | |
| 867 | 1723 Ibyte work[MAX_ICHAR_LEN]; |
| 771 | 1724 int len; |
| 1725 | |
| 867 | 1726 len = set_itext_ichar (work, chr); |
| 771 | 1727 Dynarr_add_many (dst, work, len); |
| 1728 } | |
| 1729 else | |
| 1730 { | |
| 1731 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208); | |
| 1732 Dynarr_add (dst, 34 + 128); | |
| 1733 Dynarr_add (dst, 46 + 128); | |
| 1734 } | |
| 1735 #else | |
| 867 | 1736 Dynarr_add (dst, (Ibyte) ch); |
| 771 | 1737 #endif /* MULE */ |
| 1738 } | |
| 1739 | |
| 1740 data->seen_char = 1; | |
| 1741 } | |
| 1742 | |
/* Decode the raw input octet OCTET as the code point
   UNICODE_ERROR_OCTET_RANGE_START + OCTET, so that an octet which could
   not be decoded is still represented in the output.  The remaining
   arguments are passed straight through to decode_unicode_char(). */
#define DECODE_ERROR_OCTET(octet, dst, data, ignore_bom) \
  decode_unicode_char ((octet) + UNICODE_ERROR_OCTET_RANGE_START, \
                       dst, data, ignore_bom)
| 1746 | |
| 1747 static inline void | |
| 1748 indicate_invalid_utf_8 (unsigned char indicated_length, | |
| 1749 unsigned char counter, | |
| 1750 int ch, unsigned_char_dynarr *dst, | |
| 1751 struct unicode_coding_stream *data, | |
| 1752 unsigned int ignore_bom) | |
| 1753 { | |
| 1754 Binbyte stored = indicated_length - counter; | |
| 1755 Binbyte mask = "\x00\x00\xC0\xE0\xF0\xF8\xFC"[indicated_length]; | |
| 1756 | |
| 1757 while (stored > 0) | |
| 1758 { | |
| 1759 DECODE_ERROR_OCTET (((ch >> (6 * (stored - 1))) & 0x3f) | mask, | |
| 1760 dst, data, ignore_bom); | |
| 1761 mask = 0x80, stored--; | |
| 1762 } | |
| 1763 } | |
| 1764 | |
| 771 | 1765 static void |
| 1766 encode_unicode_char_1 (int code, unsigned_char_dynarr *dst, | |
| 4096 | 1767 enum unicode_type type, unsigned int little_endian, |
| 1768 int write_error_characters_as_such) | |
| 771 | 1769 { |
| 1770 switch (type) | |
| 1771 { | |
| 1772 case UNICODE_UTF_16: | |
| 1773 if (little_endian) | |
| 1774 { | |
| 3952 | 1775 if (code < 0x10000) { |
| 1776 Dynarr_add (dst, (unsigned char) (code & 255)); | |
| 1777 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); | |
| 4096 | 1778 } else if (write_error_characters_as_such && |
| 1779 code >= UNICODE_ERROR_OCTET_RANGE_START && | |
| 1780 code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) | |
| 1781 { | |
| 1782 Dynarr_add (dst, (unsigned char) ((code & 0xFF))); | |
| 1783 } | |
| 1784 else if (code < 0x110000) | |
| 1785 { | |
| 1786 /* Little endian; least significant byte first. */ | |
| 1787 int first, second; | |
| 1788 | |
| 1789 CODE_TO_UTF_16_SURROGATES(code, first, second); | |
| 1790 | |
| 1791 Dynarr_add (dst, (unsigned char) (first & 255)); | |
| 1792 Dynarr_add (dst, (unsigned char) ((first >> 8) & 255)); | |
| 1793 | |
| 1794 Dynarr_add (dst, (unsigned char) (second & 255)); | |
| 1795 Dynarr_add (dst, (unsigned char) ((second >> 8) & 255)); | |
| 1796 } | |
| 1797 else | |
| 1798 { | |
| 1799 /* Not valid Unicode. Pass U+FFFD, least significant byte | |
| 1800 first. */ | |
| 1801 Dynarr_add (dst, (unsigned char) 0xFD); | |
| 1802 Dynarr_add (dst, (unsigned char) 0xFF); | |
| 1803 } | |
| 771 | 1804 } |
| 1805 else | |
| 1806 { | |
| 3952 | 1807 if (code < 0x10000) { |
| 1808 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); | |
| 1809 Dynarr_add (dst, (unsigned char) (code & 255)); | |
| 4096 | 1810 } else if (write_error_characters_as_such && |
| 1811 code >= UNICODE_ERROR_OCTET_RANGE_START && | |
| 1812 code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) | |
| 1813 { | |
| 1814 Dynarr_add (dst, (unsigned char) ((code & 0xFF))); | |
| 1815 } | |
| 1816 else if (code < 0x110000) | |
| 1817 { | |
| 1818 /* Big endian; most significant byte first. */ | |
| 1819 int first, second; | |
| 1820 | |
| 1821 CODE_TO_UTF_16_SURROGATES(code, first, second); | |
| 1822 | |
| 1823 Dynarr_add (dst, (unsigned char) ((first >> 8) & 255)); | |
| 1824 Dynarr_add (dst, (unsigned char) (first & 255)); | |
| 1825 | |
| 1826 Dynarr_add (dst, (unsigned char) ((second >> 8) & 255)); | |
| 1827 Dynarr_add (dst, (unsigned char) (second & 255)); | |
| 1828 } | |
| 1829 else | |
| 1830 { | |
| 1831 /* Not valid Unicode. Pass U+FFFD, most significant byte | |
| 1832 first. */ | |
| 1833 Dynarr_add (dst, (unsigned char) 0xFF); | |
| 1834 Dynarr_add (dst, (unsigned char) 0xFD); | |
| 1835 } | |
| 771 | 1836 } |
| 1837 break; | |
| 1838 | |
| 1839 case UNICODE_UCS_4: | |
| 4096 | 1840 case UNICODE_UTF_32: |
| 771 | 1841 if (little_endian) |
| 1842 { | |
| 4096 | 1843 if (write_error_characters_as_such && |
| 1844 code >= UNICODE_ERROR_OCTET_RANGE_START && | |
| 1845 code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) | |
| 1846 { | |
| 1847 Dynarr_add (dst, (unsigned char) ((code & 0xFF))); | |
| 1848 } | |
| 1849 else | |
| 1850 { | |
| 1851 /* We generate and accept incorrect sequences here, which is | |
| 1852 okay, in the interest of preservation of the user's | |
| 1853 data. */ | |
| 1854 Dynarr_add (dst, (unsigned char) (code & 255)); | |
| 1855 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); | |
| 1856 Dynarr_add (dst, (unsigned char) ((code >> 16) & 255)); | |
| 1857 Dynarr_add (dst, (unsigned char) (code >> 24)); | |
| 1858 } | |
| 771 | 1859 } |
| 1860 else | |
| 1861 { | |
| 4096 | 1862 if (write_error_characters_as_such && |
| 1863 code >= UNICODE_ERROR_OCTET_RANGE_START && | |
| 1864 code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) | |
| 1865 { | |
| 1866 Dynarr_add (dst, (unsigned char) ((code & 0xFF))); | |
| 1867 } | |
| 1868 else | |
| 1869 { | |
| 1870 /* We generate and accept incorrect sequences here, which is okay, | |
| 1871 in the interest of preservation of the user's data. */ | |
| 1872 Dynarr_add (dst, (unsigned char) (code >> 24)); | |
| 1873 Dynarr_add (dst, (unsigned char) ((code >> 16) & 255)); | |
| 1874 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); | |
| 1875 Dynarr_add (dst, (unsigned char) (code & 255)); | |
| 1876 } | |
| 771 | 1877 } |
| 1878 break; | |
| 1879 | |
| 1880 case UNICODE_UTF_8: | |
| 1881 if (code <= 0x7f) | |
| 1882 { | |
| 1883 Dynarr_add (dst, (unsigned char) code); | |
| 1884 } | |
| 1885 else if (code <= 0x7ff) | |
| 1886 { | |
| 1887 Dynarr_add (dst, (unsigned char) ((code >> 6) | 0xc0)); | |
| 1888 Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80)); | |
| 1889 } | |
| 1890 else if (code <= 0xffff) | |
| 1891 { | |
| 1892 Dynarr_add (dst, (unsigned char) ((code >> 12) | 0xe0)); | |
| 1893 Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80)); | |
| 1894 Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80)); | |
| 1895 } | |
| 1896 else if (code <= 0x1fffff) | |
| 1897 { | |
| 1898 Dynarr_add (dst, (unsigned char) ((code >> 18) | 0xf0)); | |
| 1899 Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80)); | |
| 1900 Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80)); | |
| 1901 Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80)); | |
| 1902 } | |
| 1903 else if (code <= 0x3ffffff) | |
| 1904 { | |
| 4096 | 1905 |
| 1906 #if !(UNICODE_ERROR_OCTET_RANGE_START > 0x1fffff \ | |
| 1907 && UNICODE_ERROR_OCTET_RANGE_START < 0x3ffffff) | |
| 1908 #error "This code needs to be rewritten. " | |
| 1909 #endif | |
| 1910 if (write_error_characters_as_such && | |
| 1911 code >= UNICODE_ERROR_OCTET_RANGE_START && | |
| 1912 code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100)) | |
| 1913 { | |
| 1914 Dynarr_add (dst, (unsigned char) ((code & 0xFF))); | |
| 1915 } | |
| 1916 else | |
| 1917 { | |
| 1918 Dynarr_add (dst, (unsigned char) ((code >> 24) | 0xf8)); | |
| 1919 Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80)); | |
| 1920 Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80)); | |
| 1921 Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80)); | |
| 1922 Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80)); | |
| 1923 } | |
| 771 | 1924 } |
| 1925 else | |
| 1926 { | |
| 1927 Dynarr_add (dst, (unsigned char) ((code >> 30) | 0xfc)); | |
| 1928 Dynarr_add (dst, (unsigned char) (((code >> 24) & 0x3f) | 0x80)); | |
| 1929 Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80)); | |
| 1930 Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80)); | |
| 1931 Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80)); | |
| 1932 Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80)); | |
| 1933 } | |
| 1934 break; | |
| 1935 | |
| 2500 | 1936 case UNICODE_UTF_7: ABORT (); |
| 771 | 1937 |
| 2500 | 1938 default: ABORT (); |
| 771 | 1939 } |
| 1940 } | |
| 1941 | |
| 3439 | 1942 /* Also used in mule-coding.c for UTF-8 handling in ISO 2022-oriented |
| 1943 encodings. */ | |
| 1944 void | |
| 2333 | 1945 encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h, |
| 1946 int USED_IF_MULE (l), unsigned_char_dynarr *dst, | |
| 4096 | 1947 enum unicode_type type, unsigned int little_endian, |
| 1948 int write_error_characters_as_such) | |
| 771 | 1949 { |
| 1950 #ifdef MULE | |
| 867 | 1951 int code = ichar_to_unicode (make_ichar (charset, h & 127, l & 127)); |
| 771 | 1952 |
| 1953 if (code == -1) | |
| 1954 { | |
| 1955 if (type != UNICODE_UTF_16 && | |
| 1956 XCHARSET_DIMENSION (charset) == 2 && | |
| 1957 XCHARSET_CHARS (charset) == 94) | |
| 1958 { | |
| 1959 unsigned char final = XCHARSET_FINAL (charset); | |
| 1960 | |
| 1961 if (('@' <= final) && (final < 0x7f)) | |
| 1962 code = (0xe00000 + (final - '@') * 94 * 94 | |
| 1963 + ((h & 127) - 33) * 94 + (l & 127) - 33); | |
| 1964 else | |
| 1965 code = '?'; | |
| 1966 } | |
| 1967 else | |
| 1968 code = '?'; | |
| 1969 } | |
| 1970 #else | |
| 1971 int code = h; | |
| 1972 #endif /* MULE */ | |
| 1973 | |
| 4096 | 1974 encode_unicode_char_1 (code, dst, type, little_endian, |
| 1975 write_error_characters_as_such); | |
| 771 | 1976 } |
| 1977 | |
| 1978 static Bytecount | |
| 1979 unicode_convert (struct coding_stream *str, const UExtbyte *src, | |
| 1980 unsigned_char_dynarr *dst, Bytecount n) | |
| 1981 { | |
| 1982 unsigned int ch = str->ch; | |
| 1983 struct unicode_coding_stream *data = CODING_STREAM_TYPE_DATA (str, unicode); | |
| 1984 enum unicode_type type = | |
| 1985 XCODING_SYSTEM_UNICODE_TYPE (str->codesys); | |
| 1887 | 1986 unsigned int little_endian = |
| 1987 XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN (str->codesys); | |
| 1988 unsigned int ignore_bom = XCODING_SYSTEM_UNICODE_NEED_BOM (str->codesys); | |
| 771 | 1989 Bytecount orign = n; |
| 1990 | |
| 1991 if (str->direction == CODING_DECODE) | |
| 1992 { | |
| 1993 unsigned char counter = data->counter; | |
| 4096 | 1994 unsigned char indicated_length |
| 1995 = data->indicated_length; | |
| 771 | 1996 |
| 1997 while (n--) | |
| 1998 { | |
| 1999 UExtbyte c = *src++; | |
| 2000 | |
| 2001 switch (type) | |
| 2002 { | |
| 2003 case UNICODE_UTF_8: | |
| 4096 | 2004 if (0 == counter) |
| 2005 { | |
| 2006 if (0 == (c & 0x80)) | |
| 2007 { | |
| 2008 /* ASCII. */ | |
| 2009 decode_unicode_char (c, dst, data, ignore_bom); | |
| 2010 } | |
| 2011 else if (0 == (c & 0x40)) | |
| 2012 { | |
| 2013 /* Highest bit set, second highest not--there's | |
| 2014 something wrong. */ | |
| 2015 DECODE_ERROR_OCTET (c, dst, data, ignore_bom); | |
| 2016 } | |
| 2017 else if (0 == (c & 0x20)) | |
| 2018 { | |
| 2019 ch = c & 0x1f; | |
| 2020 counter = 1; | |
| 2021 indicated_length = 2; | |
| 2022 } | |
| 2023 else if (0 == (c & 0x10)) | |
| 2024 { | |
| 2025 ch = c & 0x0f; | |
| 2026 counter = 2; | |
| 2027 indicated_length = 3; | |
| 2028 } | |
| 2029 else if (0 == (c & 0x08)) | |
| 2030 { | |
| 2031 ch = c & 0x0f; | |
| 2032 counter = 3; | |
| 2033 indicated_length = 4; | |
| 2034 } | |
| 2035 else | |
| 2036 { | |
| 2037 /* We don't supports lengths longer than 4 in | |
| 2038 external-format data. */ | |
| 2039 DECODE_ERROR_OCTET (c, dst, data, ignore_bom); | |
| 2040 | |
| 2041 } | |
| 2042 } | |
| 2043 else | |
| 2044 { | |
| 2045 /* counter != 0 */ | |
| 2046 if ((0 == (c & 0x80)) || (0 != (c & 0x40))) | |
| 2047 { | |
| 2048 indicate_invalid_utf_8(indicated_length, | |
| 2049 counter, | |
| 2050 ch, dst, data, ignore_bom); | |
| 2051 if (c & 0x80) | |
| 2052 { | |
| 2053 DECODE_ERROR_OCTET (c, dst, data, ignore_bom); | |
| 2054 } | |
| 2055 else | |
| 2056 { | |
| 2057 /* The character just read is ASCII. Treat it as | |
| 2058 such. */ | |
| 2059 decode_unicode_char (c, dst, data, ignore_bom); | |
| 2060 } | |
| 2061 ch = 0; | |
| 2062 counter = 0; | |
| 2063 } | |
| 2064 else | |
| 2065 { | |
| 2066 ch = (ch << 6) | (c & 0x3f); | |
| 2067 counter--; | |
| 2068 /* Just processed the final byte. Emit the character. */ | |
| 2069 if (!counter) | |
| 2070 { | |
| 2071 /* Don't accept over-long sequences, surrogates, | |
| 2072 or codes above #x10FFFF. */ | |
| 2073 if ((ch < 0x80) || | |
| 2074 ((ch < 0x800) && indicated_length > 2) || | |
| 2075 ((ch < 0x10000) && indicated_length > 3) || | |
| 2076 valid_utf_16_surrogate(ch) || (ch > 0x110000)) | |
| 2077 { | |
| 2078 indicate_invalid_utf_8(indicated_length, | |
| 2079 counter, | |
| 2080 ch, dst, data, | |
| 2081 ignore_bom); | |
| 2082 } | |
| 2083 else | |
| 2084 { | |
| 2085 decode_unicode_char (ch, dst, data, ignore_bom); | |
| 2086 } | |
| 2087 ch = 0; | |
| 2088 } | |
| 2089 } | |
| 771 | 2090 } |
| 2091 break; | |
| 2092 | |
| 2093 case UNICODE_UTF_16: | |
| 3952 | 2094 |
| 771 | 2095 if (little_endian) |
| 2096 ch = (c << counter) | ch; | |
| 2097 else | |
| 2098 ch = (ch << 8) | c; | |
| 4096 | 2099 |
| 771 | 2100 counter += 8; |
| 3952 | 2101 |
| 4096 | 2102 if (16 == counter) |
| 2103 { | |
| 771 | 2104 int tempch = ch; |
| 4096 | 2105 |
| 2106 if (valid_utf_16_first_surrogate(ch)) | |
| 2107 { | |
| 2108 break; | |
| 2109 } | |
| 771 | 2110 ch = 0; |
| 2111 counter = 0; | |
| 2112 decode_unicode_char (tempch, dst, data, ignore_bom); | |
| 2113 } | |
| 4096 | 2114 else if (32 == counter) |
| 3952 | 2115 { |
| 2116 int tempch; | |
| 4096 | 2117 |
|
4583
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2118 if (little_endian) |
| 4096 | 2119 { |
|
4583
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2120 if (!valid_utf_16_last_surrogate(ch >> 16)) |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2121 { |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2122 DECODE_ERROR_OCTET (ch & 0xFF, dst, data, |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2123 ignore_bom); |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2124 DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2125 ignore_bom); |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2126 DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2127 ignore_bom); |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2128 DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data, |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2129 ignore_bom); |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2130 } |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2131 else |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2132 { |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2133 tempch = utf_16_surrogates_to_code((ch & 0xffff), |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2134 (ch >> 16)); |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2135 decode_unicode_char(tempch, dst, data, ignore_bom); |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2136 } |
| 4096 | 2137 } |
|
4583
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2138 else |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2139 { |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2140 if (!valid_utf_16_last_surrogate(ch & 0xFFFF)) |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2141 { |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2142 DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data, |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2143 ignore_bom); |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2144 DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2145 ignore_bom); |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2146 DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2147 ignore_bom); |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2148 DECODE_ERROR_OCTET (ch & 0xFF, dst, data, |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2149 ignore_bom); |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2150 } |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2151 else |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2152 { |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2153 tempch = utf_16_surrogates_to_code((ch >> 16), |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2154 (ch & 0xffff)); |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2155 decode_unicode_char(tempch, dst, data, ignore_bom); |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2156 } |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2157 } |
|
2669b1b7e33b
Correct little-endian UTF-16 surrogate handling.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4270
diff
changeset
|
2158 |
| 3952 | 2159 ch = 0; |
| 2160 counter = 0; | |
| 4096 | 2161 } |
| 2162 else | |
| 2163 assert(8 == counter || 24 == counter); | |
| 771 | 2164 break; |
| 2165 | |
| 2166 case UNICODE_UCS_4: | |
| 4096 | 2167 case UNICODE_UTF_32: |
| 771 | 2168 if (little_endian) |
| 2169 ch = (c << counter) | ch; | |
| 2170 else | |
| 2171 ch = (ch << 8) | c; | |
| 2172 counter += 8; | |
| 2173 if (counter == 32) | |
| 2174 { | |
| 4096 | 2175 if (ch > 0x10ffff) |
| 2176 { | |
| 2177 /* ch is not a legal Unicode character. We're fine | |
| 2178 with that in UCS-4, though not in UTF-32. */ | |
| 2179 if (UNICODE_UCS_4 == type && ch < 0x80000000) | |
| 2180 { | |
| 2181 decode_unicode_char (ch, dst, data, ignore_bom); | |
| 2182 } | |
| 2183 else if (little_endian) | |
| 2184 { | |
| 2185 DECODE_ERROR_OCTET (ch & 0xFF, dst, data, | |
| 2186 ignore_bom); | |
| 2187 DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, | |
| 2188 ignore_bom); | |
| 2189 DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, | |
| 2190 ignore_bom); | |
| 2191 DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data, | |
| 2192 ignore_bom); | |
| 2193 } | |
| 2194 else | |
| 2195 { | |
| 2196 DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data, | |
| 2197 ignore_bom); | |
| 2198 DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, | |
| 2199 ignore_bom); | |
| 2200 DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, | |
| 2201 ignore_bom); | |
| 2202 DECODE_ERROR_OCTET (ch & 0xFF, dst, data, | |
| 2203 ignore_bom); | |
| 2204 } | |
| 2205 } | |
| 2206 else | |
| 2207 { | |
| 2208 decode_unicode_char (ch, dst, data, ignore_bom); | |
| 2209 } | |
| 771 | 2210 ch = 0; |
| 2211 counter = 0; | |
| 2212 } | |
| 2213 break; | |
| 2214 | |
| 2215 case UNICODE_UTF_7: | |
| 2500 | 2216 ABORT (); |
| 771 | 2217 break; |
| 2218 | |
| 2500 | 2219 default: ABORT (); |
| 771 | 2220 } |
| 2221 | |
| 2222 } | |
| 4096 | 2223 |
| 2224 if (str->eof && ch) | |
| 2225 { | |
| 2226 switch (type) | |
| 2227 { | |
| 2228 case UNICODE_UTF_8: | |
| 2229 indicate_invalid_utf_8(indicated_length, | |
| 2230 counter, ch, dst, data, | |
| 2231 ignore_bom); | |
| 2232 break; | |
| 2233 | |
| 2234 case UNICODE_UTF_16: | |
| 2235 case UNICODE_UCS_4: | |
| 2236 case UNICODE_UTF_32: | |
| 2237 if (8 == counter) | |
| 2238 { | |
| 2239 DECODE_ERROR_OCTET (ch, dst, data, ignore_bom); | |
| 2240 } | |
| 2241 else if (16 == counter) | |
| 2242 { | |
| 2243 if (little_endian) | |
| 2244 { | |
| 2245 DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom); | |
| 2246 DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, | |
| 2247 ignore_bom); | |
| 2248 } | |
| 2249 else | |
| 2250 { | |
| 2251 DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, | |
| 2252 ignore_bom); | |
| 2253 DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom); | |
| 2254 } | |
| 2255 } | |
| 2256 else if (24 == counter) | |
| 2257 { | |
| 2258 if (little_endian) | |
| 2259 { | |
| 2260 DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, | |
| 2261 ignore_bom); | |
| 2262 DECODE_ERROR_OCTET (ch & 0xFF, dst, data, ignore_bom); | |
| 2263 DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, | |
| 2264 ignore_bom); | |
| 2265 } | |
| 2266 else | |
| 2267 { | |
| 2268 DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data, | |
| 2269 ignore_bom); | |
| 2270 DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data, | |
| 2271 ignore_bom); | |
| 2272 DECODE_ERROR_OCTET (ch & 0xFF, dst, data, | |
| 2273 ignore_bom); | |
| 2274 } | |
| 2275 } | |
| 2276 else assert(0); | |
| 2277 break; | |
| 2278 } | |
| 2279 ch = 0; | |
| 2280 } | |
| 771 | 2281 |
| 2282 data->counter = counter; | |
| 4096 | 2283 data->indicated_length = indicated_length; |
| 771 | 2284 } |
| 2285 else | |
| 2286 { | |
| 2287 unsigned char char_boundary = data->current_char_boundary; | |
| 2288 Lisp_Object charset = data->current_charset; | |
| 2289 | |
| 2290 #ifdef ENABLE_COMPOSITE_CHARS | |
| 2291 /* flags for handling composite chars. We do a little switcheroo | |
| 2292 on the source while we're outputting the composite char. */ | |
| 2293 Bytecount saved_n = 0; | |
| 867 | 2294 const Ibyte *saved_src = NULL; |
| 771 | 2295 int in_composite = 0; |
| 2296 | |
| 2297 back_to_square_n: | |
| 2298 #endif /* ENABLE_COMPOSITE_CHARS */ | |
| 2299 | |
| 2300 if (XCODING_SYSTEM_UNICODE_NEED_BOM (str->codesys) && !data->wrote_bom) | |
| 2301 { | |
| 4096 | 2302 encode_unicode_char_1 (0xFEFF, dst, type, little_endian, 1); |
| 771 | 2303 data->wrote_bom = 1; |
| 2304 } | |
| 2305 | |
| 2306 while (n--) | |
| 2307 { | |
| 867 | 2308 Ibyte c = *src++; |
| 771 | 2309 |
| 2310 #ifdef MULE | |
| 826 | 2311 if (byte_ascii_p (c)) |
| 771 | 2312 #endif /* MULE */ |
| 2313 { /* Processing ASCII character */ | |
| 2314 ch = 0; | |
| 2315 encode_unicode_char (Vcharset_ascii, c, 0, dst, type, | |
| 4096 | 2316 little_endian, 1); |
| 771 | 2317 |
| 2318 char_boundary = 1; | |
| 2319 } | |
| 2320 #ifdef MULE | |
| 867 | 2321 else if (ibyte_leading_byte_p (c) || ibyte_leading_byte_p (ch)) |
| 771 | 2322 { /* Processing Leading Byte */ |
| 2323 ch = 0; | |
| 826 | 2324 charset = charset_by_leading_byte (c); |
| 2325 if (leading_byte_prefix_p(c)) | |
| 771 | 2326 ch = c; |
| 2327 char_boundary = 0; | |
| 2328 } | |
| 2329 else | |
| 2330 { /* Processing Non-ASCII character */ | |
| 2331 char_boundary = 1; | |
| 2332 if (EQ (charset, Vcharset_control_1)) | |
| 2704 | 2333 /* See: |
| 2334 | |
| 2335 (Info-goto-node "(internals)Internal String Encoding") | |
| 2336 | |
| 2337 for the rationale behind subtracting #xa0 from the | |
| 2338 character's code. */ | |
| 2339 encode_unicode_char (Vcharset_control_1, c - 0xa0, 0, dst, | |
| 4096 | 2340 type, little_endian, 1); |
| 771 | 2341 else |
| 2342 { | |
| 2343 switch (XCHARSET_REP_BYTES (charset)) | |
| 2344 { | |
| 2345 case 2: | |
| 2346 encode_unicode_char (charset, c, 0, dst, type, | |
| 4096 | 2347 little_endian, 1); |
| 771 | 2348 break; |
| 2349 case 3: | |
| 2350 if (XCHARSET_PRIVATE_P (charset)) | |
| 2351 { | |
| 2352 encode_unicode_char (charset, c, 0, dst, type, | |
| 4096 | 2353 little_endian, 1); |
| 771 | 2354 ch = 0; |
| 2355 } | |
| 2356 else if (ch) | |
| 2357 { | |
| 2358 #ifdef ENABLE_COMPOSITE_CHARS | |
| 2359 if (EQ (charset, Vcharset_composite)) | |
| 2360 { | |
| 2361 if (in_composite) | |
| 2362 { | |
| 2363 /* #### Bother! We don't know how to | |
| 2364 handle this yet. */ | |
| 2365 encode_unicode_char (Vcharset_ascii, '~', 0, | |
| 2366 dst, type, | |
| 4096 | 2367 little_endian, 1); |
| 771 | 2368 } |
| 2369 else | |
| 2370 { | |
| 867 | 2371 Ichar emch = make_ichar (Vcharset_composite, |
| 771 | 2372 ch & 0x7F, |
| 2373 c & 0x7F); | |
| 2374 Lisp_Object lstr = | |
| 2375 composite_char_string (emch); | |
| 2376 saved_n = n; | |
| 2377 saved_src = src; | |
| 2378 in_composite = 1; | |
| 2379 src = XSTRING_DATA (lstr); | |
| 2380 n = XSTRING_LENGTH (lstr); | |
| 2381 } | |
| 2382 } | |
| 2383 else | |
| 2384 #endif /* ENABLE_COMPOSITE_CHARS */ | |
| 2385 encode_unicode_char (charset, ch, c, dst, type, | |
| 4096 | 2386 little_endian, 1); |
| 771 | 2387 ch = 0; |
| 2388 } | |
| 2389 else | |
| 2390 { | |
| 2391 ch = c; | |
| 2392 char_boundary = 0; | |
| 2393 } | |
| 2394 break; | |
| 2395 case 4: | |
| 2396 if (ch) | |
| 2397 { | |
| 2398 encode_unicode_char (charset, ch, c, dst, type, | |
| 4096 | 2399 little_endian, 1); |
| 771 | 2400 ch = 0; |
| 2401 } | |
| 2402 else | |
| 2403 { | |
| 2404 ch = c; | |
| 2405 char_boundary = 0; | |
| 2406 } | |
| 2407 break; | |
| 2408 default: | |
| 2500 | 2409 ABORT (); |
| 771 | 2410 } |
| 2411 } | |
| 2412 } | |
| 2413 #endif /* MULE */ | |
| 2414 } | |
| 2415 | |
| 2416 #ifdef ENABLE_COMPOSITE_CHARS | |
| 2417 if (in_composite) | |
| 2418 { | |
| 2419 n = saved_n; | |
| 2420 src = saved_src; | |
| 2421 in_composite = 0; | |
| 2422 goto back_to_square_n; /* Wheeeeeeeee ..... */ | |
| 2423 } | |
| 2424 #endif /* ENABLE_COMPOSITE_CHARS */ | |
| 2425 | |
| 2426 data->current_char_boundary = char_boundary; | |
| 2427 data->current_charset = charset; | |
| 2428 | |
| 2429 /* La palabra se hizo carne! */ | |
| 2430 /* A palavra fez-se carne! */ | |
| 2431 /* Whatever. */ | |
| 2432 } | |
| 2433 | |
| 2434 str->ch = ch; | |
| 2435 return orign; | |
| 2436 } | |
| 2437 | |
/* Detectors for the Unicode encodings, each followed by the categories
   listed for it.  The UTF-7 detector is commented out.
   NOTE(review): DEFINE_DETECTOR and DEFINE_DETECTOR_CATEGORY are defined
   elsewhere; presumably the order of these lines matters -- confirm
   before rearranging. */
/* DEFINE_DETECTOR (utf_7); */
DEFINE_DETECTOR (utf_8);
DEFINE_DETECTOR_CATEGORY (utf_8, utf_8);
DEFINE_DETECTOR_CATEGORY (utf_8, utf_8_bom);
DEFINE_DETECTOR (ucs_4);
DEFINE_DETECTOR_CATEGORY (ucs_4, ucs_4);
DEFINE_DETECTOR (utf_16);
DEFINE_DETECTOR_CATEGORY (utf_16, utf_16);
DEFINE_DETECTOR_CATEGORY (utf_16, utf_16_little_endian);
DEFINE_DETECTOR_CATEGORY (utf_16, utf_16_bom);
DEFINE_DETECTOR_CATEGORY (utf_16, utf_16_little_endian_bom);
| 2449 | |
/* State of the UCS-4 detector between calls to ucs_4_detect(). */
struct ucs_4_detector
{
  /* Position (0 through 3) of the next octet within the current
     four-octet unit. */
  int in_ucs_4_byte;
};
| 2454 | |
| 2455 static void | |
| 2456 ucs_4_detect (struct detection_state *st, const UExtbyte *src, | |
| 2457 Bytecount n) | |
| 2458 { | |
| 2459 struct ucs_4_detector *data = DETECTION_STATE_DATA (st, ucs_4); | |
| 2460 | |
| 2461 while (n--) | |
| 2462 { | |
| 2463 UExtbyte c = *src++; | |
| 2464 switch (data->in_ucs_4_byte) | |
| 2465 { | |
| 2466 case 0: | |
| 2467 if (c >= 128) | |
| 2468 { | |
| 2469 DET_RESULT (st, ucs_4) = DET_NEARLY_IMPOSSIBLE; | |
| 2470 return; | |
| 2471 } | |
| 2472 else | |
| 2473 data->in_ucs_4_byte++; | |
| 2474 break; | |
| 2475 case 3: | |
| 2476 data->in_ucs_4_byte = 0; | |
| 2477 break; | |
| 2478 default: | |
| 2479 data->in_ucs_4_byte++; | |
| 2480 } | |
| 2481 } | |
| 2482 | |
| 2483 /* !!#### write this for real */ | |
| 2484 DET_RESULT (st, ucs_4) = DET_AS_LIKELY_AS_UNLIKELY; | |
| 2485 } | |
| 2486 | |
/* Statistics gathered by utf_16_detect() across calls. */
struct utf_16_detector
{
  /* An odd-numbered octet and its predecessor were both 0xFF. */
  unsigned int seen_ffff : 1;
  /* The first two octets were 0xFE 0xFF. */
  unsigned int seen_forward_bom : 1;
  /* The first two octets were 0xFF 0xFE. */
  unsigned int seen_rev_bom : 1;

  /* Number of octets examined so far. */
  int byteno;
  /* The previous octet. */
  int prev_char;

  /* Octet pairs of the form 0x00 followed by CR, LF or printable ASCII
     (text), and the same pairs with the two octets swapped (rev_text). */
  int text, rev_text;
  /* Octet pairs 0x20 followed by 0x28 or 0x29 (sep), and the same pairs
     with the two octets swapped (rev_sep). */
  int sep, rev_sep;

  /* Octets that are printable ASCII or one of \n, \r, \t, \f, \v. */
  int num_ascii;
};
| 2498 | |
| 2499 static void | |
| 2500 utf_16_detect (struct detection_state *st, const UExtbyte *src, | |
| 2501 Bytecount n) | |
| 2502 { | |
| 2503 struct utf_16_detector *data = DETECTION_STATE_DATA (st, utf_16); | |
| 2504 | |
| 2505 while (n--) | |
| 2506 { | |
| 2507 UExtbyte c = *src++; | |
| 2508 int prevc = data->prev_char; | |
| 2509 if (data->byteno == 1 && c == 0xFF && prevc == 0xFE) | |
| 2510 data->seen_forward_bom = 1; | |
| 2511 else if (data->byteno == 1 && c == 0xFE && prevc == 0xFF) | |
| 2512 data->seen_rev_bom = 1; | |
| 2513 | |
| 2514 if (data->byteno & 1) | |
| 2515 { | |
| 2516 if (c == 0xFF && prevc == 0xFF) | |
| 2517 data->seen_ffff = 1; | |
| 2518 if (prevc == 0 | |
| 2519 && (c == '\r' || c == '\n' | |
| 2520 || (c >= 0x20 && c <= 0x7E))) | |
| 2521 data->text++; | |
| 2522 if (c == 0 | |
| 2523 && (prevc == '\r' || prevc == '\n' | |
| 2524 || (prevc >= 0x20 && prevc <= 0x7E))) | |
| 2525 data->rev_text++; | |
| 1267 | 2526 /* #### 0x2028 is LINE SEPARATOR and 0x2029 is PARAGRAPH SEPARATOR. |
| 2527 I used to count these in text and rev_text but that is very bad, | |
| 2528 as 0x2028 is also space + left-paren in ASCII, which is extremely | |
| 2529 common. So, what do we do with these? */ | |
| 771 | 2530 if (prevc == 0x20 && (c == 0x28 || c == 0x29)) |
| 1267 | 2531 data->sep++; |
| 771 | 2532 if (c == 0x20 && (prevc == 0x28 || prevc == 0x29)) |
| 1267 | 2533 data->rev_sep++; |
| 771 | 2534 } |
| 2535 | |
| 1267 | 2536 if ((c >= ' ' && c <= '~') || c == '\n' || c == '\r' || c == '\t' || |
| 2537 c == '\f' || c == '\v') | |
| 2538 data->num_ascii++; | |
| 771 | 2539 data->byteno++; |
| 2540 data->prev_char = c; | |
| 2541 } | |
| 2542 | |
| 2543 { | |
| 2544 int variance_indicates_big_endian = | |
| 2545 (data->text >= 10 | |
| 2546 && (data->rev_text == 0 | |
| 2547 || data->text / data->rev_text >= 10)); | |
| 2548 int variance_indicates_little_endian = | |
| 2549 (data->rev_text >= 10 | |
| 2550 && (data->text == 0 | |
| 2551 || data->rev_text / data->text >= 10)); | |
| 2552 | |
| 2553 if (data->seen_ffff) | |
| 2554 SET_DET_RESULTS (st, utf_16, DET_NEARLY_IMPOSSIBLE); | |
| 2555 else if (data->seen_forward_bom) | |
| 2556 { | |
| 2557 SET_DET_RESULTS (st, utf_16, DET_NEARLY_IMPOSSIBLE); | |
| 2558 if (variance_indicates_big_endian) | |
| 2559 DET_RESULT (st, utf_16_bom) = DET_NEAR_CERTAINTY; | |
| 2560 else if (variance_indicates_little_endian) | |
| 2561 DET_RESULT (st, utf_16_bom) = DET_SOMEWHAT_LIKELY; | |
| 2562 else | |
| 2563 DET_RESULT (st, utf_16_bom) = DET_QUITE_PROBABLE; | |
| 2564 } | |
| 2565 else if (data->seen_forward_bom) | |
| 2566 { | |
| 2567 SET_DET_RESULTS (st, utf_16, DET_NEARLY_IMPOSSIBLE); | |
| 2568 if (variance_indicates_big_endian) | |
| 2569 DET_RESULT (st, utf_16_bom) = DET_NEAR_CERTAINTY; | |
| 2570 else if (variance_indicates_little_endian) | |
| 2571 /* #### may need to rethink */ | |
| 2572 DET_RESULT (st, utf_16_bom) = DET_SOMEWHAT_LIKELY; | |
| 2573 else | |
| 2574 /* #### may need to rethink */ | |
| 2575 DET_RESULT (st, utf_16_bom) = DET_QUITE_PROBABLE; | |
| 2576 } | |
| 2577 else if (data->seen_rev_bom) | |
| 2578 { | |
| 2579 SET_DET_RESULTS (st, utf_16, DET_NEARLY_IMPOSSIBLE); | |
| 2580 if (variance_indicates_little_endian) | |
| 2581 DET_RESULT (st, utf_16_little_endian_bom) = DET_NEAR_CERTAINTY; | |
| 2582 else if (variance_indicates_big_endian) | |
| 2583 /* #### may need to rethink */ | |
| 2584 DET_RESULT (st, utf_16_little_endian_bom) = DET_SOMEWHAT_LIKELY; | |
| 2585 else | |
| 2586 /* #### may need to rethink */ | |
| 2587 DET_RESULT (st, utf_16_little_endian_bom) = DET_QUITE_PROBABLE; | |
| 2588 } | |
| 2589 else if (variance_indicates_big_endian) | |
| 2590 { | |
| 2591 SET_DET_RESULTS (st, utf_16, DET_NEARLY_IMPOSSIBLE); | |
| 2592 DET_RESULT (st, utf_16) = DET_SOMEWHAT_LIKELY; | |
| 2593 DET_RESULT (st, utf_16_little_endian) = DET_SOMEWHAT_UNLIKELY; | |
| 2594 } | |
| 2595 else if (variance_indicates_little_endian) | |
| 2596 { | |
| 2597 SET_DET_RESULTS (st, utf_16, DET_NEARLY_IMPOSSIBLE); | |
| 2598 DET_RESULT (st, utf_16) = DET_SOMEWHAT_UNLIKELY; | |
| 2599 DET_RESULT (st, utf_16_little_endian) = DET_SOMEWHAT_LIKELY; | |
| 2600 } | |
| 2601 else | |
| 1267 | 2602 { |
| 2603 /* #### FUCKME! There should really be an ASCII detector. This | |
| 2604 would rule out the need to have this built-in here as | |
| 2605 well. --ben */ | |
| 1292 | 2606 int pct_ascii = data->byteno ? (100 * data->num_ascii) / data->byteno |
| 2607 : 100; | |
| 1267 | 2608 |
| 2609 if (pct_ascii > 90) | |
| 2610 SET_DET_RESULTS (st, utf_16, DET_QUITE_IMPROBABLE); | |
| 2611 else if (pct_ascii > 75) | |
| 2612 SET_DET_RESULTS (st, utf_16, DET_SOMEWHAT_UNLIKELY); | |
| 2613 else | |
| 2614 SET_DET_RESULTS (st, utf_16, DET_AS_LIKELY_AS_UNLIKELY); | |
| 2615 } | |
| 771 | 2616 } |
| 2617 } | |
| 2618 | |
/* State kept by the utf_8 detector between calls to utf_8_detect (). */
struct utf_8_detector
{
  /* Number of bytes examined so far. */
  int byteno;
  /* The first and second bytes of the stream, remembered so that the
     third byte can complete the check for the BOM 0xEF 0xBB 0xBF. */
  int first_byte;
  int second_byte;
  /* The byte examined just before the current one. */
  int prev_byte;
  /* Number of continuation bytes still expected in the multibyte
     sequence currently being read; 0 when not inside a sequence. */
  int in_utf_8_byte;
  /* Number of continuation bytes announced by the most recent lead
     byte; used to classify a sequence once it completes. */
  int recent_utf_8_sequence;
  /* Count of bytes in 0x80..0xBF seen where a new character was
     expected. */
  int seen_bogus_utf8;
  /* Count of bytes that were not continuation bytes although one was
     expected. */
  int seen_really_bogus_utf8;
  /* Count of completed sequences with one continuation byte. */
  int seen_2byte_sequence;
  /* Count of completed sequences with two or more continuation bytes. */
  int seen_longer_sequence;
  /* Count of bytes in 0x28..0x2F that directly followed ISO_CODE_ESC. */
  int seen_iso2022_esc;
  /* Count of ISO_CODE_SI / ISO_CODE_SO bytes. */
  int seen_iso_shift;
  /* Set when the first three bytes of the stream are 0xEF 0xBB 0xBF. */
  unsigned int seen_utf_bom:1;
};
| 2635 | |
| 2636 static void | |
| 2637 utf_8_detect (struct detection_state *st, const UExtbyte *src, | |
| 2638 Bytecount n) | |
| 2639 { | |
| 2640 struct utf_8_detector *data = DETECTION_STATE_DATA (st, utf_8); | |
| 2641 | |
| 2642 while (n--) | |
| 2643 { | |
| 2644 UExtbyte c = *src++; | |
| 985 | 2645 switch (data->byteno) |
| 2646 { | |
| 2647 case 0: | |
| 2648 data->first_byte = c; | |
| 2649 break; | |
| 2650 case 1: | |
| 2651 data->second_byte = c; | |
| 2652 break; | |
| 2653 case 2: | |
| 2654 if (data->first_byte == 0xef && | |
| 2655 data->second_byte == 0xbb && | |
| 2656 c == 0xbf) | |
| 1267 | 2657 data->seen_utf_bom = 1; |
| 985 | 2658 break; |
| 2659 } | |
| 2660 | |
| 771 | 2661 switch (data->in_utf_8_byte) |
| 2662 { | |
| 2663 case 0: | |
| 1267 | 2664 if (data->prev_byte == ISO_CODE_ESC && c >= 0x28 && c <= 0x2F) |
| 2665 data->seen_iso2022_esc++; | |
| 2666 else if (c == ISO_CODE_SI || c == ISO_CODE_SO) | |
| 2667 data->seen_iso_shift++; | |
| 771 | 2668 else if (c >= 0xfc) |
| 2669 data->in_utf_8_byte = 5; | |
| 2670 else if (c >= 0xf8) | |
| 2671 data->in_utf_8_byte = 4; | |
| 2672 else if (c >= 0xf0) | |
| 2673 data->in_utf_8_byte = 3; | |
| 2674 else if (c >= 0xe0) | |
| 2675 data->in_utf_8_byte = 2; | |
| 2676 else if (c >= 0xc0) | |
| 2677 data->in_utf_8_byte = 1; | |
| 2678 else if (c >= 0x80) | |
| 1267 | 2679 data->seen_bogus_utf8++; |
| 2680 if (data->in_utf_8_byte > 0) | |
| 2681 data->recent_utf_8_sequence = data->in_utf_8_byte; | |
| 771 | 2682 break; |
| 2683 default: | |
| 2684 if ((c & 0xc0) != 0x80) | |
| 1267 | 2685 data->seen_really_bogus_utf8++; |
| 2686 else | |
| 771 | 2687 { |
| 1267 | 2688 data->in_utf_8_byte--; |
| 2689 if (data->in_utf_8_byte == 0) | |
| 2690 { | |
| 2691 if (data->recent_utf_8_sequence == 1) | |
| 2692 data->seen_2byte_sequence++; | |
| 2693 else | |
| 2694 { | |
| 2695 assert (data->recent_utf_8_sequence >= 2); | |
| 2696 data->seen_longer_sequence++; | |
| 2697 } | |
| 2698 } | |
| 771 | 2699 } |
| 2700 } | |
| 985 | 2701 |
| 2702 data->byteno++; | |
| 1267 | 2703 data->prev_byte = c; |
| 771 | 2704 } |
| 1267 | 2705 |
| 2706 /* either BOM or no BOM, but not both */ | |
| 2707 SET_DET_RESULTS (st, utf_8, DET_NEARLY_IMPOSSIBLE); | |
| 2708 | |
| 2709 | |
| 2710 if (data->seen_utf_bom) | |
| 2711 DET_RESULT (st, utf_8_bom) = DET_NEAR_CERTAINTY; | |
| 2712 else | |
| 2713 { | |
| 2714 if (data->seen_really_bogus_utf8 || | |
| 2715 data->seen_bogus_utf8 >= 2) | |
| 2716 ; /* bogus */ | |
| 2717 else if (data->seen_bogus_utf8) | |
| 2718 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; | |
| 2719 else if ((data->seen_longer_sequence >= 5 || | |
| 2720 data->seen_2byte_sequence >= 10) && | |
| 2721 (!(data->seen_iso2022_esc + data->seen_iso_shift) || | |
| 2722 (data->seen_longer_sequence * 2 + data->seen_2byte_sequence) / | |
| 2723 (data->seen_iso2022_esc + data->seen_iso_shift) >= 10)) | |
| 2724 /* heuristics, heuristics, we love heuristics */ | |
| 2725 DET_RESULT (st, utf_8) = DET_QUITE_PROBABLE; | |
| 2726 else if (data->seen_iso2022_esc || | |
| 2727 data->seen_iso_shift >= 3) | |
| 2728 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; | |
| 2729 else if (data->seen_longer_sequence || | |
| 2730 data->seen_2byte_sequence) | |
| 2731 DET_RESULT (st, utf_8) = DET_SOMEWHAT_LIKELY; | |
| 2732 else if (data->seen_iso_shift) | |
| 2733 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; | |
| 2734 else | |
| 2735 DET_RESULT (st, utf_8) = DET_AS_LIKELY_AS_UNLIKELY; | |
| 2736 } | |
| 771 | 2737 } |
| 2738 | |
| 2739 static void | |
| 2740 unicode_init_coding_stream (struct coding_stream *str) | |
| 2741 { | |
| 2742 struct unicode_coding_stream *data = | |
| 2743 CODING_STREAM_TYPE_DATA (str, unicode); | |
| 2744 xzero (*data); | |
| 2745 data->current_charset = Qnil; | |
| 2746 } | |
| 2747 | |
/* rewind_coding_stream method for the unicode coding system type.
   Rewinding needs nothing beyond returning the Unicode-specific stream
   data to its initial state, so this just reruns the init method. */
static void
unicode_rewind_coding_stream (struct coding_stream *str)
{
  unicode_init_coding_stream (str);
}
| 2753 | |
| 2754 static int | |
| 2755 unicode_putprop (Lisp_Object codesys, Lisp_Object key, Lisp_Object value) | |
| 2756 { | |
| 3767 | 2757 if (EQ (key, Qunicode_type)) |
| 771 | 2758 { |
| 2759 enum unicode_type type; | |
| 2760 | |
| 2761 if (EQ (value, Qutf_8)) | |
| 2762 type = UNICODE_UTF_8; | |
| 2763 else if (EQ (value, Qutf_16)) | |
| 2764 type = UNICODE_UTF_16; | |
| 2765 else if (EQ (value, Qutf_7)) | |
| 2766 type = UNICODE_UTF_7; | |
| 2767 else if (EQ (value, Qucs_4)) | |
| 2768 type = UNICODE_UCS_4; | |
| 4096 | 2769 else if (EQ (value, Qutf_32)) |
| 2770 type = UNICODE_UTF_32; | |
| 771 | 2771 else |
| 2772 invalid_constant ("Invalid Unicode type", key); | |
| 2773 | |
| 2774 XCODING_SYSTEM_UNICODE_TYPE (codesys) = type; | |
| 2775 } | |
| 2776 else if (EQ (key, Qlittle_endian)) | |
| 2777 XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN (codesys) = !NILP (value); | |
| 2778 else if (EQ (key, Qneed_bom)) | |
| 2779 XCODING_SYSTEM_UNICODE_NEED_BOM (codesys) = !NILP (value); | |
| 2780 else | |
| 2781 return 0; | |
| 2782 return 1; | |
| 2783 } | |
| 2784 | |
| 2785 static Lisp_Object | |
| 2786 unicode_getprop (Lisp_Object coding_system, Lisp_Object prop) | |
| 2787 { | |
| 3767 | 2788 if (EQ (prop, Qunicode_type)) |
| 771 | 2789 { |
| 2790 switch (XCODING_SYSTEM_UNICODE_TYPE (coding_system)) | |
| 2791 { | |
| 2792 case UNICODE_UTF_16: return Qutf_16; | |
| 2793 case UNICODE_UTF_8: return Qutf_8; | |
| 2794 case UNICODE_UTF_7: return Qutf_7; | |
| 2795 case UNICODE_UCS_4: return Qucs_4; | |
| 4096 | 2796 case UNICODE_UTF_32: return Qutf_32; |
| 2500 | 2797 default: ABORT (); |
| 771 | 2798 } |
| 2799 } | |
| 2800 else if (EQ (prop, Qlittle_endian)) | |
| 2801 return XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN (coding_system) ? Qt : Qnil; | |
| 2802 else if (EQ (prop, Qneed_bom)) | |
| 2803 return XCODING_SYSTEM_UNICODE_NEED_BOM (coding_system) ? Qt : Qnil; | |
| 2804 return Qunbound; | |
| 2805 } | |
| 2806 | |
| 2807 static void | |
| 2286 | 2808 unicode_print (Lisp_Object cs, Lisp_Object printcharfun, |
| 2809 int UNUSED (escapeflag)) | |
| 771 | 2810 { |
| 3767 | 2811 write_fmt_string_lisp (printcharfun, "(%s", 1, |
| 2812 unicode_getprop (cs, Qunicode_type)); | |
| 771 | 2813 if (XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN (cs)) |
| 826 | 2814 write_c_string (printcharfun, ", little-endian"); |
| 771 | 2815 if (XCODING_SYSTEM_UNICODE_NEED_BOM (cs)) |
| 826 | 2816 write_c_string (printcharfun, ", need-bom"); |
| 2817 write_c_string (printcharfun, ")"); | |
| 771 | 2818 } |
| 2819 | |
/* Return nonzero if CODESYS denotes little-endian UTF-16.

   When WIN32_ANY is defined, CODESYS is first resolved with
   Fget_coding_system (); the result is nonzero only if the coding system
   is of type `unicode', its Unicode type is UTF-16, and its
   little-endian flag is set.  When WIN32_ANY is not defined the argument
   is unused and the result is always 0. */
int
dfc_coding_system_is_unicode (
#ifdef WIN32_ANY
                              Lisp_Object codesys
#else
                              Lisp_Object UNUSED (codesys)
#endif
                              )
{
#ifdef WIN32_ANY
  codesys = Fget_coding_system (codesys);
  return (EQ (XCODING_SYSTEM_TYPE (codesys), Qunicode) &&
          XCODING_SYSTEM_UNICODE_TYPE (codesys) == UNICODE_UTF_16 &&
          XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN (codesys));

#else
  return 0;
#endif
}
| 2839 | |
| 2840 | |
| 2841 /************************************************************************/ | |
| 2842 /* Initialization */ | |
| 2843 /************************************************************************/ | |
| 2844 | |
| 2845 void | |
| 2846 syms_of_unicode (void) | |
| 2847 { | |
| 2848 #ifdef MULE | |
| 877 | 2849 DEFSUBR (Funicode_precedence_list); |
| 771 | 2850 DEFSUBR (Fset_language_unicode_precedence_list); |
| 2851 DEFSUBR (Flanguage_unicode_precedence_list); | |
| 2852 DEFSUBR (Fset_default_unicode_precedence_list); | |
| 2853 DEFSUBR (Fdefault_unicode_precedence_list); | |
| 2854 DEFSUBR (Fset_unicode_conversion); | |
| 2855 | |
| 1318 | 2856 DEFSUBR (Fload_unicode_mapping_table); |
| 771 | 2857 |
| 3439 | 2858 DEFSYMBOL (Qccl_encode_to_ucs_2); |
| 2859 DEFSYMBOL (Qlast_allocated_character); | |
| 771 | 2860 DEFSYMBOL (Qignore_first_column); |
| 3659 | 2861 |
| 2862 DEFSYMBOL (Qunicode_registries); | |
| 771 | 2863 #endif /* MULE */ |
| 2864 | |
| 800 | 2865 DEFSUBR (Fchar_to_unicode); |
| 2866 DEFSUBR (Funicode_to_char); | |
| 771 | 2867 |
| 2868 DEFSYMBOL (Qunicode); | |
| 2869 DEFSYMBOL (Qucs_4); | |
| 2870 DEFSYMBOL (Qutf_16); | |
| 4096 | 2871 DEFSYMBOL (Qutf_32); |
| 771 | 2872 DEFSYMBOL (Qutf_8); |
| 2873 DEFSYMBOL (Qutf_7); | |
| 2874 | |
| 2875 DEFSYMBOL (Qneed_bom); | |
| 2876 | |
| 2877 DEFSYMBOL (Qutf_16); | |
| 2878 DEFSYMBOL (Qutf_16_little_endian); | |
| 2879 DEFSYMBOL (Qutf_16_bom); | |
| 2880 DEFSYMBOL (Qutf_16_little_endian_bom); | |
| 985 | 2881 |
| 2882 DEFSYMBOL (Qutf_8); | |
| 2883 DEFSYMBOL (Qutf_8_bom); | |
| 771 | 2884 } |
| 2885 | |
/* Set up the `unicode' coding system type and the Unicode detectors.

   Registers the coding system type together with the methods this file
   provides for it, then initializes the utf_8, ucs_4 and utf_16
   detectors, giving each its detect method and its categories. */
void
coding_system_type_create_unicode (void)
{
  INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA (unicode, "unicode-coding-system-p");
  CODING_SYSTEM_HAS_METHOD (unicode, print);
  CODING_SYSTEM_HAS_METHOD (unicode, convert);
  CODING_SYSTEM_HAS_METHOD (unicode, init_coding_stream);
  CODING_SYSTEM_HAS_METHOD (unicode, rewind_coding_stream);
  CODING_SYSTEM_HAS_METHOD (unicode, putprop);
  CODING_SYSTEM_HAS_METHOD (unicode, getprop);

  /* UTF-8, with and without a byte-order mark. */
  INITIALIZE_DETECTOR (utf_8);
  DETECTOR_HAS_METHOD (utf_8, detect);
  INITIALIZE_DETECTOR_CATEGORY (utf_8, utf_8);
  INITIALIZE_DETECTOR_CATEGORY (utf_8, utf_8_bom);

  INITIALIZE_DETECTOR (ucs_4);
  DETECTOR_HAS_METHOD (ucs_4, detect);
  INITIALIZE_DETECTOR_CATEGORY (ucs_4, ucs_4);

  /* UTF-16 in either byte order, with and without a byte-order mark. */
  INITIALIZE_DETECTOR (utf_16);
  DETECTOR_HAS_METHOD (utf_16, detect);
  INITIALIZE_DETECTOR_CATEGORY (utf_16, utf_16);
  INITIALIZE_DETECTOR_CATEGORY (utf_16, utf_16_little_endian);
  INITIALIZE_DETECTOR_CATEGORY (utf_16, utf_16_bom);
  INITIALIZE_DETECTOR_CATEGORY (utf_16, utf_16_little_endian_bom);
}
| 2913 | |
/* Reinitialization counterpart of coding_system_type_create_unicode ():
   reinitializes only the `unicode' coding system type itself.
   NOTE(review): presumably run when starting from a dumped image --
   confirm against the callers. */
void
reinit_coding_system_type_create_unicode (void)
{
  REINITIALIZE_CODING_SYSTEM_TYPE (unicode);
}
| 2919 | |
/* Provide the `unicode' feature and, when MULE is defined, initialize
   the variables used by the Unicode conversion code: the just-in-time
   charset bookkeeping, the precedence lists, the blank conversion
   tables, and the Lisp variable `unicode-registries'. */
void
vars_of_unicode (void)
{
  Fprovide (intern ("unicode"));

#ifdef MULE
  /* Bookkeeping for the charsets created for otherwise unknown Unicode
     code points (see the description string below). */
  staticpro (&Vnumber_of_jit_charsets);
  Vnumber_of_jit_charsets = make_int (0);
  staticpro (&Vlast_jit_charset_final);
  Vlast_jit_charset_final = make_char (0x30);
  staticpro (&Vcharset_descr);
  Vcharset_descr
    = build_string ("Mule charset for otherwise unknown Unicode code points.");

  staticpro (&Vlanguage_unicode_precedence_list);
  Vlanguage_unicode_precedence_list = Qnil;

  staticpro (&Vdefault_unicode_precedence_list);
  Vdefault_unicode_precedence_list = Qnil;

  unicode_precedence_dynarr = Dynarr_new (Lisp_Object);
  dump_add_root_block_ptr (&unicode_precedence_dynarr,
                           &lisp_object_dynarr_description);



  init_blank_unicode_tables ();

  staticpro (&Vcurrent_jit_charset);
  Vcurrent_jit_charset = Qnil;

  /* Note that the "block" we are describing is a single pointer, and hence
     we could potentially use dump_add_root_block_ptr().  However, given
     the way the descriptions are written, we couldn't use them, and would
     have to write new descriptions for each of the pointers below, since
     we would have to make use of a description with an XD_BLOCK_ARRAY
     in it. */

  dump_add_root_block (&to_unicode_blank_1, sizeof (void *),
                       to_unicode_level_1_desc_1);
  dump_add_root_block (&to_unicode_blank_2, sizeof (void *),
                       to_unicode_level_2_desc_1);

  dump_add_root_block (&from_unicode_blank_1, sizeof (void *),
                       from_unicode_level_1_desc_1);
  dump_add_root_block (&from_unicode_blank_2, sizeof (void *),
                       from_unicode_level_2_desc_1);
  dump_add_root_block (&from_unicode_blank_3, sizeof (void *),
                       from_unicode_level_3_desc_1);
  dump_add_root_block (&from_unicode_blank_4, sizeof (void *),
                       from_unicode_level_4_desc_1);

  /* NOTE(review): the storage for this Lisp variable is named with the
     Q (symbol) prefix, and syms_of_unicode () also passes the same
     object to DEFSYMBOL -- confirm that this is intended. */
  DEFVAR_LISP ("unicode-registries", &Qunicode_registries /*
Vector describing the X11 registries searched when using fallback fonts.

"Fallback fonts" here includes by default those fonts used by redisplay when
displaying charsets for which the `encode-as-utf-8' property is true, and
those used when no font matching the charset's registries property has been
found (that is, they're probably Mule-specific charsets like Ethiopic or
IPA.)
*/ );
  Qunicode_registries = vector1(build_string("iso10646-1"));
#endif /* MULE */
}
