Mercurial > hg > xemacs-beta
comparison src/unicode.c @ 771:943eaba38521
[xemacs-hg @ 2002-03-13 08:51:24 by ben]
The big ben-mule-21-5 check-in!
Various files were added and deleted. See CHANGES-ben-mule.
There are still some test suite failures. No crashes, though.
Many of the failures have to do with problems in the test suite itself
rather than in the actual code. I'll be addressing these in the next
day or so -- none of the test suite failures are at all critical.
Meanwhile I'll be trying to address the biggest issues -- i.e. build
or run failures, which will almost certainly happen on various platforms.
All comments should be sent to ben@xemacs.org -- use a Cc: if necessary
when sending to mailing lists. There will be pre- and post- tags,
something like
pre-ben-mule-21-5-merge-in, and
post-ben-mule-21-5-merge-in.
author | ben |
---|---|
date | Wed, 13 Mar 2002 08:54:06 +0000 |
parents | |
children | 2923009caf47 |
comparison
equal
deleted
inserted
replaced
770:336a418893b5 | 771:943eaba38521 |
---|---|
1 /* Code to handle Unicode conversion. | |
2 Copyright (C) 2000, 2001, 2002 Ben Wing. | |
3 | |
4 This file is part of XEmacs. | |
5 | |
6 XEmacs is free software; you can redistribute it and/or modify it | |
7 under the terms of the GNU General Public License as published by the | |
8 Free Software Foundation; either version 2, or (at your option) any | |
9 later version. | |
10 | |
11 XEmacs is distributed in the hope that it will be useful, but WITHOUT | |
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
14 for more details. | |
15 | |
16 You should have received a copy of the GNU General Public License | |
17 along with XEmacs; see the file COPYING. If not, write to | |
18 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
19 Boston, MA 02111-1307, USA. */ | |
20 | |
21 /* Synched up with: FSF 20.3. Not in FSF. */ | |
22 | |
23 /* Authorship: | |
24 | |
25 Current primary author: Ben Wing <ben@xemacs.org> | |
26 | |
27 Written by Ben Wing <ben@xemacs.org>, June, 2001. | |
28 Separated out into this file, August, 2001. | |
29 Includes Unicode coding systems, some parts of which have been written | |
30 by someone else. | |
31 | |
32 As of September 2001, the detection code is here and abstraction of the | |
33 detection system is finished. the unicode detectors have been rewritten | |
34 to include multiple levels of likelihood. | |
35 */ | |
36 | |
37 #include <config.h> | |
38 #include "lisp.h" | |
39 | |
40 #include "charset.h" | |
41 #include "file-coding.h" | |
42 #include "opaque.h" | |
43 | |
44 #include "sysfile.h" | |
45 | |
46 /* #### WARNING! The current sledgehammer routines have a fundamental | |
47 problem in that they can't handle two characters mapping to a | |
48 single Unicode codepoint or vice-versa in a single charset table. | |
49 It's not clear there is any way to handle this and still make the | |
50 sledgehammer routines useful. */ | |
51 /* #define SLEDGEHAMMER_CHECK_UNICODE */ | |
52 | |
53 /* We currently use the following format for tables: | |
54 | |
55 If dimension == 1, to_unicode_table is a 96-element array of ints | |
56 (Unicode code points); else, it's a 96-element array of int * | |
57 pointers, each of which points to a 96-element array of ints. If no | |
58 elements in a row have been filled in, the pointer will point to a | |
59 default empty table; that way, memory usage is more reasonable but | |
60 lookup still fast. | |
61 | |
62 -- If from_unicode_levels == 1, from_unicode_table is a 256-element | |
63 array of shorts (octet 1 in high byte, octet 2 in low byte; we don't | |
64 store Emchars directly to save space). | |
65 | |
66 -- If from_unicode_levels == 2, from_unicode_table is a | |
67 256-element array of short * pointers, each of which points to a | |
68 256-element array of shorts. | |
69 | |
70 -- If from_unicode_levels == 3, from_unicode_table is a | |
71 256-element array of short ** pointers, each of which points to | |
72 a 256-element array of short * pointers, each of which points to | |
73 a 256-element array of shorts. | |
74 | |
75 -- If from_unicode_levels == 4, same thing but one level deeper. | |
76 | |
77 Just as for to_unicode_table, we use default tables to fill in | |
78 all entries with no values in them. | |
79 | |
80 #### An obvious space-saving optimization is to use variable-sized | |
81 tables, where each table instead of just being a 256-element array, | |
82 is a structure with a start value, an end value, and a variable | |
83 number of entries (END - START + 1). Only 8 bits are needed for | |
84 END and START, and could be stored at the end to avoid alignment | |
85 problems. However, before charging off and implementing this, | |
86 we need to consider whether it's worth it: | |
87 | |
88 (1) Most tables will be highly localized in which code points are | |
89 defined, heavily reducing the possible memory waste. Before | |
90 doing any rewriting, write some code to see how much memory is | |
91 actually being wasted (i.e. ratio of empty entries to total # of | |
92 entries) and only start rewriting if it's unacceptably high. You | |
93 have to check over all charsets. | |
94 | |
95 (2) Since entries are usually added one at a time, you have to be | |
96 very careful when creating the tables to avoid realloc()/free() | |
97 thrashing in the common case when you are in an area of high | |
98 localization and are going to end up using most entries in the | |
99 table. You'd certainly want to allow only certain sizes, not | |
100 arbitrary ones (probably powers of 2, where you want the entire | |
101 block including the START/END values to fit into a power of 2, | |
102 minus any malloc overhead if there is any -- there's none under | |
103 gmalloc.c, and probably most system malloc() functions are quite | |
104 smart nowadays and also have no overhead). You could optimize | |
105 somewhat during the in-C initializations, because you can compute | |
106 the actual usage of various tables by scanning the entries you're | |
107 going to add in a separate pass before adding them. (You could | |
108 actually do the same thing when entries are added on the Lisp | |
109 level by making the assumption that all the entries will come in | |
110 one after another before any use is made of the data. So as | |
111 they're coming in, you just store them in a big long list, and | |
112 the first time you need to retrieve an entry, you compute the | |
113 whole table at once.) You'd still have to deal with the | |
114 possibility of later entries coming in, though. | |
115 | |
116 (3) You do lose some speed using START/END values, since you need | |
117 a couple of comparisons at each level. This could easily make | |
118 each single lookup become 3-4 times slower. The Unicode book | |
119 considers this a big issue, and recommends against variable-sized | |
120 tables for this reason; however, they almost certainly have in | |
121 mind applications that primarily involve conversion of large | |
122 amounts of data. Most Unicode strings that are translated in | |
123 XEmacs are fairly small. The only place where this might matter | |
124 is in loading large files -- e.g. a 3-megabyte Unicode-encoded | |
125 file. So think about this, and maybe do a trial implementation | |
126 where you don't worry too much about the intricacies of (2) and | |
127 just implement some basic "multiply by 1.5" trick or something to | |
128 do the resizing. There is a very good FAQ on Unicode called | |
129 something like the Linux-Unicode How-To (it should be part of the | |
130 Linux How-To's, I think), that lists the url of a guy with a | |
131 whole bunch of unicode files you can use to stress-test your | |
132 implementations, and he's highly likely to have a good | |
133 multi-megabyte Unicode-encoded file (with normal text in it -- if | |
134 you created your own just by creating repeated strings of letters | |
135 and numbers, you probably wouldn't get accurate results). | |
136 */ | |
137 | |
138 /* When MULE is not defined, we may still need some Unicode support -- | |
139 in particular, some Windows API's always want Unicode, and the way | |
140 we've set up the Unicode encapsulation, we may as well go ahead and | |
141 always use the Unicode versions of split API's. (It would be | |
142 trickier to not use them, and pointless -- under NT, the ANSI API's | |
143 call the Unicode ones anyway, so in the case of structures, we'd be | |
144 converting from Unicode to ANSI structures, only to have the OS | |
145 convert them back.) */ | |
146 | |
147 Lisp_Object Qunicode; | |
148 Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7; | |
149 Lisp_Object Qneed_bom; | |
150 | |
151 Lisp_Object Qutf_16_little_endian, Qutf_16_bom; | |
152 Lisp_Object Qutf_16_little_endian_bom; | |
153 | |
154 #ifdef MULE | |
155 | |
156 static int *to_unicode_blank_1; | |
157 static int **to_unicode_blank_2; | |
158 | |
159 static short *from_unicode_blank_1; | |
160 static short **from_unicode_blank_2; | |
161 static short ***from_unicode_blank_3; | |
162 static short ****from_unicode_blank_4; | |
163 | |
164 #if 0 | |
165 | |
166 static const struct lrecord_description to_unicode_level_0_desc[] = { | |
167 { XD_END } | |
168 }; | |
169 | |
170 static const struct struct_description to_unicode_level_0_ptr_desc = { | |
171 sizeof (int), to_unicode_level_0_desc | |
172 }; | |
173 | |
174 static const struct lrecord_description to_unicode_level_1_desc[] = { | |
175 { XD_STRUCT_PTR, 0, 96, &to_unicode_level_0_ptr_desc }, | |
176 { XD_END } | |
177 }; | |
178 | |
179 static const struct struct_description to_unicode_level_1_ptr_desc = { | |
180 0, to_unicode_level_1_desc | |
181 }; | |
182 | |
183 static const struct lrecord_description to_unicode_level_2_desc[] = { | |
184 { XD_STRUCT_PTR, 0, 96, &to_unicode_level_1_ptr_desc }, | |
185 { XD_END } | |
186 }; | |
187 | |
188 /* Not static because each charset has a set of to and from tables and | |
189 needs to describe them to pdump. */ | |
190 const struct struct_description to_unicode_description[] = { | |
191 { 1, to_unicode_level_1_desc }, | |
192 { 2, to_unicode_level_2_desc }, | |
193 { XD_END } | |
194 }; | |
195 | |
196 static const struct lrecord_description from_unicode_level_0_desc[] = { | |
197 { XD_END } | |
198 }; | |
199 | |
200 static const struct struct_description from_unicode_level_0_ptr_desc = { | |
201 sizeof (short), from_unicode_level_0_desc | |
202 }; | |
203 | |
204 static const struct lrecord_description from_unicode_level_1_desc[] = { | |
205 { XD_STRUCT_PTR, 0, 256, &from_unicode_level_0_ptr_desc }, | |
206 { XD_END } | |
207 }; | |
208 | |
209 static const struct struct_description from_unicode_level_1_ptr_desc = { | |
210 0, from_unicode_level_1_desc | |
211 }; | |
212 | |
213 static const struct lrecord_description from_unicode_level_2_desc[] = { | |
214 { XD_STRUCT_PTR, 0, 256, &from_unicode_level_1_ptr_desc }, | |
215 { XD_END } | |
216 }; | |
217 | |
218 static const struct struct_description from_unicode_level_2_ptr_desc = { | |
219 0, from_unicode_level_2_desc | |
220 }; | |
221 | |
222 static const struct lrecord_description from_unicode_level_3_desc[] = { | |
223 { XD_STRUCT_PTR, 0, 256, &from_unicode_level_2_ptr_desc }, | |
224 { XD_END } | |
225 }; | |
226 | |
227 static const struct struct_description from_unicode_level_3_ptr_desc = { | |
228 0, from_unicode_level_3_desc | |
229 }; | |
230 | |
231 static const struct lrecord_description from_unicode_level_4_desc[] = { | |
232 { XD_STRUCT_PTR, 0, 256, &from_unicode_level_3_ptr_desc }, | |
233 { XD_END } | |
234 }; | |
235 | |
236 /* Not static because each charset has a set of to and from tables and | |
237 needs to describe them to pdump. */ | |
238 const struct struct_description from_unicode_description[] = { | |
239 { 1, from_unicode_level_1_desc }, | |
240 { 2, from_unicode_level_2_desc }, | |
241 { 3, from_unicode_level_3_desc }, | |
242 { 4, from_unicode_level_4_desc }, | |
243 { XD_END } | |
244 }; | |
245 | |
246 #endif /* 0 */ | |
247 | |
248 static Lisp_Object_dynarr *unicode_precedence_dynarr; | |
249 | |
250 static const struct lrecord_description lo_description_1[] = { | |
251 { XD_LISP_OBJECT, 0 }, | |
252 { XD_END } | |
253 }; | |
254 | |
255 static const struct struct_description lo_description = { | |
256 sizeof (Lisp_Object), | |
257 lo_description_1 | |
258 }; | |
259 | |
260 static const struct lrecord_description lod_description_1[] = { | |
261 XD_DYNARR_DESC (Lisp_Object_dynarr, &lo_description), | |
262 { XD_END } | |
263 }; | |
264 | |
265 static const struct struct_description lisp_object_dynarr_description = { | |
266 sizeof (Lisp_Object_dynarr), | |
267 lod_description_1 | |
268 }; | |
269 | |
270 Lisp_Object Vlanguage_unicode_precedence_list; | |
271 Lisp_Object Vdefault_unicode_precedence_list; | |
272 | |
273 Lisp_Object Qignore_first_column; | |
274 | |
275 | |
276 /************************************************************************/ | |
277 /* Unicode implementation */ | |
278 /************************************************************************/ | |
279 | |
280 #define BREAKUP_UNICODE_CODE(val, u1, u2, u3, u4, levels) \ | |
281 do { \ | |
282 int buc_val = (val); \ | |
283 \ | |
284 (u1) = buc_val >> 24; \ | |
285 (u2) = (buc_val >> 16) & 255; \ | |
286 (u3) = (buc_val >> 8) & 255; \ | |
287 (u4) = buc_val & 255; \ | |
288 (levels) = (buc_val <= 0xFF ? 1 : \ | |
289 buc_val <= 0xFFFF ? 2 : \ | |
290 buc_val <= 0xFFFFFF ? 3 : \ | |
291 4); \ | |
292 } while (0) | |
293 | |
294 static void | |
295 init_blank_unicode_tables (void) | |
296 { | |
297 int i; | |
298 | |
299 from_unicode_blank_1 = xnew_array (short, 256); | |
300 from_unicode_blank_2 = xnew_array (short *, 256); | |
301 from_unicode_blank_3 = xnew_array (short **, 256); | |
302 from_unicode_blank_4 = xnew_array (short ***, 256); | |
303 for (i = 0; i < 256; i++) | |
304 { | |
305 from_unicode_blank_1[i] = (short) -1; | |
306 from_unicode_blank_2[i] = from_unicode_blank_1; | |
307 from_unicode_blank_3[i] = from_unicode_blank_2; | |
308 from_unicode_blank_4[i] = from_unicode_blank_3; | |
309 } | |
310 | |
311 to_unicode_blank_1 = xnew_array (int, 96); | |
312 to_unicode_blank_2 = xnew_array (int *, 96); | |
313 for (i = 0; i < 96; i++) | |
314 { | |
315 to_unicode_blank_1[i] = -1; | |
316 to_unicode_blank_2[i] = to_unicode_blank_1; | |
317 } | |
318 } | |
319 | |
320 static void * | |
321 create_new_from_unicode_table (int level) | |
322 { | |
323 switch (level) | |
324 { | |
325 /* WARNING: If you are thinking of compressing these, keep in | |
326 mind that sizeof (short) does not equal sizeof (short *). */ | |
327 case 1: | |
328 { | |
329 short *newtab = xnew_array (short, 256); | |
330 memcpy (newtab, from_unicode_blank_1, 256 * sizeof (short)); | |
331 return newtab; | |
332 } | |
333 case 2: | |
334 { | |
335 short **newtab = xnew_array (short *, 256); | |
336 memcpy (newtab, from_unicode_blank_2, 256 * sizeof (short *)); | |
337 return newtab; | |
338 } | |
339 case 3: | |
340 { | |
341 short ***newtab = xnew_array (short **, 256); | |
342 memcpy (newtab, from_unicode_blank_3, 256 * sizeof (short **)); | |
343 return newtab; | |
344 } | |
345 case 4: | |
346 { | |
347 short ****newtab = xnew_array (short ***, 256); | |
348 memcpy (newtab, from_unicode_blank_4, 256 * sizeof (short ***)); | |
349 return newtab; | |
350 } | |
351 default: | |
352 abort (); | |
353 return 0; | |
354 } | |
355 } | |
356 | |
357 void | |
358 init_charset_unicode_tables (Lisp_Object charset) | |
359 { | |
360 if (XCHARSET_DIMENSION (charset) == 1) | |
361 { | |
362 int *to_table = xnew_array (int, 96); | |
363 memcpy (to_table, to_unicode_blank_1, 96 * sizeof (int)); | |
364 XCHARSET_TO_UNICODE_TABLE (charset) = to_table; | |
365 } | |
366 else | |
367 { | |
368 int **to_table = xnew_array (int *, 96); | |
369 memcpy (to_table, to_unicode_blank_2, 96 * sizeof (int *)); | |
370 XCHARSET_TO_UNICODE_TABLE (charset) = to_table; | |
371 } | |
372 | |
373 { | |
374 XCHARSET_FROM_UNICODE_TABLE (charset) = create_new_from_unicode_table (1); | |
375 XCHARSET_FROM_UNICODE_LEVELS (charset) = 1; | |
376 } | |
377 } | |
378 | |
379 static void | |
380 free_from_unicode_table (void *table, int level) | |
381 { | |
382 int i; | |
383 | |
384 switch (level) | |
385 { | |
386 case 2: | |
387 { | |
388 short **tab = (short **) table; | |
389 for (i = 0; i < 256; i++) | |
390 { | |
391 if (tab[i] != from_unicode_blank_1) | |
392 free_from_unicode_table (tab[i], 1); | |
393 } | |
394 break; | |
395 } | |
396 case 3: | |
397 { | |
398 short ***tab = (short ***) table; | |
399 for (i = 0; i < 256; i++) | |
400 { | |
401 if (tab[i] != from_unicode_blank_2) | |
402 free_from_unicode_table (tab[i], 2); | |
403 } | |
404 break; | |
405 } | |
406 case 4: | |
407 { | |
408 short ****tab = (short ****) table; | |
409 for (i = 0; i < 256; i++) | |
410 { | |
411 if (tab[i] != from_unicode_blank_3) | |
412 free_from_unicode_table (tab[i], 3); | |
413 } | |
414 break; | |
415 } | |
416 } | |
417 | |
418 xfree (table); | |
419 } | |
420 | |
421 static void | |
422 free_to_unicode_table (void *table, int level) | |
423 { | |
424 if (level == 2) | |
425 { | |
426 int i; | |
427 int **tab = (int **) table; | |
428 | |
429 for (i = 0; i < 96; i++) | |
430 { | |
431 if (tab[i] != to_unicode_blank_1) | |
432 free_to_unicode_table (tab[i], 1); | |
433 } | |
434 } | |
435 | |
436 xfree (table); | |
437 } | |
438 | |
439 void | |
440 free_charset_unicode_tables (Lisp_Object charset) | |
441 { | |
442 free_to_unicode_table (XCHARSET_TO_UNICODE_TABLE (charset), | |
443 XCHARSET_DIMENSION (charset)); | |
444 free_from_unicode_table (XCHARSET_FROM_UNICODE_TABLE (charset), | |
445 XCHARSET_FROM_UNICODE_LEVELS (charset)); | |
446 } | |
447 | |
448 #ifdef MEMORY_USAGE_STATS | |
449 | |
450 static Bytecount | |
451 compute_from_unicode_table_size_1 (void *table, int level, | |
452 struct overhead_stats *stats) | |
453 { | |
454 int i; | |
455 Bytecount size = 0; | |
456 | |
457 switch (level) | |
458 { | |
459 case 2: | |
460 { | |
461 short **tab = (short **) table; | |
462 for (i = 0; i < 256; i++) | |
463 { | |
464 if (tab[i] != from_unicode_blank_1) | |
465 size += compute_from_unicode_table_size_1 (tab[i], 1, stats); | |
466 } | |
467 break; | |
468 } | |
469 case 3: | |
470 { | |
471 short ***tab = (short ***) table; | |
472 for (i = 0; i < 256; i++) | |
473 { | |
474 if (tab[i] != from_unicode_blank_2) | |
475 size += compute_from_unicode_table_size_1 (tab[i], 2, stats); | |
476 } | |
477 break; | |
478 } | |
479 case 4: | |
480 { | |
481 short ****tab = (short ****) table; | |
482 for (i = 0; i < 256; i++) | |
483 { | |
484 if (tab[i] != from_unicode_blank_3) | |
485 size += compute_from_unicode_table_size_1 (tab[i], 3, stats); | |
486 } | |
487 break; | |
488 } | |
489 } | |
490 | |
491 size += malloced_storage_size (table, | |
492 256 * (level == 1 ? sizeof (short) : | |
493 sizeof (void *)), | |
494 stats); | |
495 return size; | |
496 } | |
497 | |
498 static Bytecount | |
499 compute_to_unicode_table_size_1 (void *table, int level, | |
500 struct overhead_stats *stats) | |
501 { | |
502 Bytecount size = 0; | |
503 | |
504 if (level == 2) | |
505 { | |
506 int i; | |
507 int **tab = (int **) table; | |
508 | |
509 for (i = 0; i < 96; i++) | |
510 { | |
511 if (tab[i] != to_unicode_blank_1) | |
512 size += compute_to_unicode_table_size_1 (tab[i], 1, stats); | |
513 } | |
514 } | |
515 | |
516 size += malloced_storage_size (table, | |
517 96 * (level == 1 ? sizeof (int) : | |
518 sizeof (void *)), | |
519 stats); | |
520 return size; | |
521 } | |
522 | |
523 Bytecount | |
524 compute_from_unicode_table_size (Lisp_Object charset, | |
525 struct overhead_stats *stats) | |
526 { | |
527 return (compute_from_unicode_table_size_1 | |
528 (XCHARSET_FROM_UNICODE_TABLE (charset), | |
529 XCHARSET_FROM_UNICODE_LEVELS (charset), | |
530 stats)); | |
531 } | |
532 | |
533 Bytecount | |
534 compute_to_unicode_table_size (Lisp_Object charset, | |
535 struct overhead_stats *stats) | |
536 { | |
537 return (compute_to_unicode_table_size_1 | |
538 (XCHARSET_TO_UNICODE_TABLE (charset), | |
539 XCHARSET_DIMENSION (charset), | |
540 stats)); | |
541 } | |
542 | |
543 #endif | |
544 | |
545 #ifdef SLEDGEHAMMER_CHECK_UNICODE | |
546 | |
547 /* "Sledgehammer checks" are checks that verify the self-consistency | |
548 of an entire structure every time a change is about to be made or | |
549 has been made to the structure. Not fast but a pretty much | |
550 sure-fire way of flushing out any incorrectnesses in the algorithms | |
551 that create the structure. | |
552 | |
553 Checking only after a change has been made will speed things up by | |
554 a factor of 2, but it doesn't absolutely prove that the code just | |
555 checked caused the problem; perhaps it happened elsewhere, either | |
556 in some code you forgot to sledgehammer check or as a result of | |
557 data corruption. */ | |
558 | |
559 static void | |
560 assert_not_any_blank_table (void *tab) | |
561 { | |
562 assert (tab != from_unicode_blank_1); | |
563 assert (tab != from_unicode_blank_2); | |
564 assert (tab != from_unicode_blank_3); | |
565 assert (tab != from_unicode_blank_4); | |
566 assert (tab != to_unicode_blank_1); | |
567 assert (tab != to_unicode_blank_2); | |
568 assert (tab); | |
569 } | |
570 | |
571 static void | |
572 sledgehammer_check_from_table (Lisp_Object charset, void *table, int level, | |
573 int codetop) | |
574 { | |
575 int i; | |
576 | |
577 switch (level) | |
578 { | |
579 case 1: | |
580 { | |
581 short *tab = (short *) table; | |
582 for (i = 0; i < 256; i++) | |
583 { | |
584 if (tab[i] != -1) | |
585 { | |
586 Lisp_Object char_charset; | |
587 int c1, c2; | |
588 | |
589 assert (valid_char_p (tab[i])); | |
590 BREAKUP_CHAR (tab[i], char_charset, c1, c2); | |
591 assert (EQ (charset, char_charset)); | |
592 if (XCHARSET_DIMENSION (charset) == 1) | |
593 { | |
594 int *to_table = | |
595 (int *) XCHARSET_TO_UNICODE_TABLE (charset); | |
596 assert_not_any_blank_table (to_table); | |
597 assert (to_table[c1 - 32] == (codetop << 8) + i); | |
598 } | |
599 else | |
600 { | |
601 int **to_table = | |
602 (int **) XCHARSET_TO_UNICODE_TABLE (charset); | |
603 assert_not_any_blank_table (to_table); | |
604 assert_not_any_blank_table (to_table[c1 - 32]); | |
605 assert (to_table[c1 - 32][c2 - 32] == (codetop << 8) + i); | |
606 } | |
607 } | |
608 } | |
609 break; | |
610 } | |
611 case 2: | |
612 { | |
613 short **tab = (short **) table; | |
614 for (i = 0; i < 256; i++) | |
615 { | |
616 if (tab[i] != from_unicode_blank_1) | |
617 sledgehammer_check_from_table (charset, tab[i], 1, | |
618 (codetop << 8) + i); | |
619 } | |
620 break; | |
621 } | |
622 case 3: | |
623 { | |
624 short ***tab = (short ***) table; | |
625 for (i = 0; i < 256; i++) | |
626 { | |
627 if (tab[i] != from_unicode_blank_2) | |
628 sledgehammer_check_from_table (charset, tab[i], 2, | |
629 (codetop << 8) + i); | |
630 } | |
631 break; | |
632 } | |
633 case 4: | |
634 { | |
635 short ****tab = (short ****) table; | |
636 for (i = 0; i < 256; i++) | |
637 { | |
638 if (tab[i] != from_unicode_blank_3) | |
639 sledgehammer_check_from_table (charset, tab[i], 3, | |
640 (codetop << 8) + i); | |
641 } | |
642 break; | |
643 } | |
644 default: | |
645 abort (); | |
646 } | |
647 } | |
648 | |
649 static void | |
650 sledgehammer_check_to_table (Lisp_Object charset, void *table, int level, | |
651 int codetop) | |
652 { | |
653 int i; | |
654 | |
655 switch (level) | |
656 { | |
657 case 1: | |
658 { | |
659 int *tab = (int *) table; | |
660 | |
661 if (XCHARSET_CHARS (charset) == 94) | |
662 { | |
663 assert (tab[0] == -1); | |
664 assert (tab[95] == -1); | |
665 } | |
666 | |
667 for (i = 0; i < 96; i++) | |
668 { | |
669 if (tab[i] != -1) | |
670 { | |
671 int u4, u3, u2, u1, levels; | |
672 Emchar ch; | |
673 Emchar this_ch; | |
674 short val; | |
675 void *frtab = XCHARSET_FROM_UNICODE_TABLE (charset); | |
676 | |
677 if (XCHARSET_DIMENSION (charset) == 1) | |
678 this_ch = MAKE_CHAR (charset, i + 32, 0); | |
679 else | |
680 this_ch = MAKE_CHAR (charset, codetop + 32, i + 32); | |
681 | |
682 assert (tab[i] >= 0); | |
683 BREAKUP_UNICODE_CODE (tab[i], u4, u3, u2, u1, levels); | |
684 assert (levels <= XCHARSET_FROM_UNICODE_LEVELS (charset)); | |
685 | |
686 switch (XCHARSET_FROM_UNICODE_LEVELS (charset)) | |
687 { | |
688 case 1: val = ((short *) frtab)[u1]; break; | |
689 case 2: val = ((short **) frtab)[u2][u1]; break; | |
690 case 3: val = ((short ***) frtab)[u3][u2][u1]; break; | |
691 case 4: val = ((short ****) frtab)[u4][u3][u2][u1]; break; | |
692 default: abort (); | |
693 } | |
694 | |
695 ch = MAKE_CHAR (charset, val >> 8, val & 0xFF); | |
696 assert (ch == this_ch); | |
697 | |
698 switch (XCHARSET_FROM_UNICODE_LEVELS (charset)) | |
699 { | |
700 case 4: | |
701 assert_not_any_blank_table (frtab); | |
702 frtab = ((short ****) frtab)[u4]; | |
703 /* fall through */ | |
704 case 3: | |
705 assert_not_any_blank_table (frtab); | |
706 frtab = ((short ***) frtab)[u3]; | |
707 /* fall through */ | |
708 case 2: | |
709 assert_not_any_blank_table (frtab); | |
710 frtab = ((short **) frtab)[u2]; | |
711 /* fall through */ | |
712 case 1: | |
713 assert_not_any_blank_table (frtab); | |
714 break; | |
715 default: abort (); | |
716 } | |
717 } | |
718 } | |
719 break; | |
720 } | |
721 case 2: | |
722 { | |
723 int **tab = (int **) table; | |
724 | |
725 if (XCHARSET_CHARS (charset) == 94) | |
726 { | |
727 assert (tab[0] == to_unicode_blank_1); | |
728 assert (tab[95] == to_unicode_blank_1); | |
729 } | |
730 | |
731 for (i = 0; i < 96; i++) | |
732 { | |
733 if (tab[i] != to_unicode_blank_1) | |
734 sledgehammer_check_to_table (charset, tab[i], 1, i); | |
735 } | |
736 break; | |
737 } | |
738 default: | |
739 abort (); | |
740 } | |
741 } | |
742 | |
743 static void | |
744 sledgehammer_check_unicode_tables (Lisp_Object charset) | |
745 { | |
746 /* verify that the blank tables have not been modified */ | |
747 int i; | |
748 int from_level = XCHARSET_FROM_UNICODE_LEVELS (charset); | |
749 int to_level = XCHARSET_FROM_UNICODE_LEVELS (charset); | |
750 | |
751 for (i = 0; i < 256; i++) | |
752 { | |
753 assert (from_unicode_blank_1[i] == (short) -1); | |
754 assert (from_unicode_blank_2[i] == from_unicode_blank_1); | |
755 assert (from_unicode_blank_3[i] == from_unicode_blank_2); | |
756 assert (from_unicode_blank_4[i] == from_unicode_blank_3); | |
757 } | |
758 | |
759 for (i = 0; i < 96; i++) | |
760 { | |
761 assert (to_unicode_blank_1[i] == -1); | |
762 assert (to_unicode_blank_2[i] == to_unicode_blank_1); | |
763 } | |
764 | |
765 assert (from_level >= 1 && from_level <= 4); | |
766 | |
767 sledgehammer_check_from_table (charset, | |
768 XCHARSET_FROM_UNICODE_TABLE (charset), | |
769 from_level, 0); | |
770 | |
771 sledgehammer_check_to_table (charset, | |
772 XCHARSET_TO_UNICODE_TABLE (charset), | |
773 XCHARSET_DIMENSION (charset), 0); | |
774 } | |
775 | |
776 #endif /* SLEDGEHAMMER_CHECK_UNICODE */ | |
777 | |
778 static void | |
779 set_unicode_conversion (Emchar chr, int code) | |
780 { | |
781 Lisp_Object charset; | |
782 int c1, c2; | |
783 | |
784 BREAKUP_CHAR (chr, charset, c1, c2); | |
785 | |
786 assert (!EQ (charset, Vcharset_ascii)); | |
787 assert (!EQ (charset, Vcharset_control_1)); | |
788 assert (!EQ (charset, Vcharset_composite)); | |
789 | |
790 #ifdef SLEDGEHAMMER_CHECK_UNICODE | |
791 sledgehammer_check_unicode_tables (charset); | |
792 #endif | |
793 | |
794 /* First, the char -> unicode translation */ | |
795 | |
796 if (XCHARSET_DIMENSION (charset) == 1) | |
797 { | |
798 int *to_table = (int *) XCHARSET_TO_UNICODE_TABLE (charset); | |
799 to_table[c1 - 32] = code; | |
800 } | |
801 else | |
802 { | |
803 int **to_table_2 = (int **) XCHARSET_TO_UNICODE_TABLE (charset); | |
804 int *to_table_1; | |
805 | |
806 assert (XCHARSET_DIMENSION (charset) == 2); | |
807 to_table_1 = to_table_2[c1 - 32]; | |
808 if (to_table_1 == to_unicode_blank_1) | |
809 { | |
810 to_table_1 = xnew_array (int, 96); | |
811 memcpy (to_table_1, to_unicode_blank_1, 96 * sizeof (int)); | |
812 to_table_2[c1 - 32] = to_table_1; | |
813 } | |
814 to_table_1[c2 - 32] = code; | |
815 } | |
816 | |
817 /* Then, unicode -> char: much harder */ | |
818 | |
819 { | |
820 int charset_levels; | |
821 int u4, u3, u2, u1; | |
822 int code_levels; | |
823 BREAKUP_UNICODE_CODE (code, u4, u3, u2, u1, code_levels); | |
824 | |
825 charset_levels = XCHARSET_FROM_UNICODE_LEVELS (charset); | |
826 | |
827 /* Make sure the charset's tables have at least as many levels as | |
828 the code point has: Note that the charset is guaranteed to have | |
829 at least one level, because it was created that way */ | |
830 if (charset_levels < code_levels) | |
831 { | |
832 int i; | |
833 | |
834 assert (charset_levels > 0); | |
835 for (i = 2; i <= code_levels; i++) | |
836 { | |
837 if (charset_levels < i) | |
838 { | |
839 void *old_table = XCHARSET_FROM_UNICODE_TABLE (charset); | |
840 void *table = create_new_from_unicode_table (i); | |
841 XCHARSET_FROM_UNICODE_TABLE (charset) = table; | |
842 | |
843 switch (i) | |
844 { | |
845 case 2: | |
846 ((short **) table)[0] = (short *) old_table; | |
847 break; | |
848 case 3: | |
849 ((short ***) table)[0] = (short **) old_table; | |
850 break; | |
851 case 4: | |
852 ((short ****) table)[0] = (short ***) old_table; | |
853 break; | |
854 default: abort (); | |
855 } | |
856 } | |
857 } | |
858 | |
859 charset_levels = code_levels; | |
860 XCHARSET_FROM_UNICODE_LEVELS (charset) = code_levels; | |
861 } | |
862 | |
863 /* Now, make sure there is a non-default table at each level */ | |
864 { | |
865 int i; | |
866 void *table = XCHARSET_FROM_UNICODE_TABLE (charset); | |
867 | |
868 for (i = charset_levels; i >= 2; i--) | |
869 { | |
870 switch (i) | |
871 { | |
872 case 4: | |
873 if (((short ****) table)[u4] == from_unicode_blank_3) | |
874 ((short ****) table)[u4] = | |
875 ((short ***) create_new_from_unicode_table (3)); | |
876 table = ((short ****) table)[u4]; | |
877 break; | |
878 case 3: | |
879 if (((short ***) table)[u3] == from_unicode_blank_2) | |
880 ((short ***) table)[u3] = | |
881 ((short **) create_new_from_unicode_table (2)); | |
882 table = ((short ***) table)[u3]; | |
883 break; | |
884 case 2: | |
885 if (((short **) table)[u2] == from_unicode_blank_1) | |
886 ((short **) table)[u2] = | |
887 ((short *) create_new_from_unicode_table (1)); | |
888 table = ((short **) table)[u2]; | |
889 break; | |
890 default: abort (); | |
891 } | |
892 } | |
893 } | |
894 | |
895 /* Finally, set the character */ | |
896 | |
897 { | |
898 void *table = XCHARSET_FROM_UNICODE_TABLE (charset); | |
899 switch (charset_levels) | |
900 { | |
901 case 1: ((short *) table)[u1] = (c1 << 8) + c2; break; | |
902 case 2: ((short **) table)[u2][u1] = (c1 << 8) + c2; break; | |
903 case 3: ((short ***) table)[u3][u2][u1] = (c1 << 8) + c2; break; | |
904 case 4: ((short ****) table)[u4][u3][u2][u1] = (c1 << 8) + c2; break; | |
905 default: abort (); | |
906 } | |
907 } | |
908 } | |
909 | |
910 #ifdef SLEDGEHAMMER_CHECK_UNICODE | |
911 sledgehammer_check_unicode_tables (charset); | |
912 #endif | |
913 } | |
914 | |
915 static int | |
916 char_to_unicode (Emchar chr) | |
917 { | |
918 Lisp_Object charset; | |
919 int c1, c2; | |
920 | |
921 type_checking_assert (valid_char_p (chr)); | |
922 if (chr < 256) | |
923 return (int) chr; | |
924 | |
925 BREAKUP_CHAR (chr, charset, c1, c2); | |
926 if (EQ (charset, Vcharset_composite)) | |
927 return -1; /* #### don't know how to handle */ | |
928 else if (XCHARSET_DIMENSION (charset) == 1) | |
929 return ((int *) XCHARSET_TO_UNICODE_TABLE (charset))[c1 - 32]; | |
930 else | |
931 return ((int **) XCHARSET_TO_UNICODE_TABLE (charset))[c1 - 32][c2 - 32]; | |
932 } | |
933 | |
934 static Emchar | |
935 unicode_to_char (int code, Lisp_Object_dynarr *charsets) | |
936 { | |
937 int u1, u2, u3, u4; | |
938 int code_levels; | |
939 int i; | |
940 int n = Dynarr_length (charsets); | |
941 | |
942 type_checking_assert (code >= 0); | |
943 if (code < 256) | |
944 return (Emchar) code; | |
945 | |
946 BREAKUP_UNICODE_CODE (code, u4, u3, u2, u1, code_levels); | |
947 | |
948 for (i = 0; i < n; i++) | |
949 { | |
950 Lisp_Object charset = Dynarr_at (charsets, i); | |
951 int charset_levels = XCHARSET_FROM_UNICODE_LEVELS (charset); | |
952 if (charset_levels >= code_levels) | |
953 { | |
954 void *table = XCHARSET_FROM_UNICODE_TABLE (charset); | |
955 short retval; | |
956 | |
957 switch (charset_levels) | |
958 { | |
959 case 1: retval = ((short *) table)[u1]; break; | |
960 case 2: retval = ((short **) table)[u2][u1]; break; | |
961 case 3: retval = ((short ***) table)[u3][u2][u1]; break; | |
962 case 4: retval = ((short ****) table)[u4][u3][u2][u1]; break; | |
963 default: abort (); retval = 0; | |
964 } | |
965 | |
966 if (retval != -1) | |
967 return MAKE_CHAR (charset, retval >> 8, retval & 0xFF); | |
968 } | |
969 } | |
970 | |
971 return (Emchar) -1; | |
972 } | |
973 | |
974 static void | |
975 add_charsets_to_precedence_list (Lisp_Object list, int *lbs, | |
976 Lisp_Object_dynarr *dynarr) | |
977 { | |
978 { | |
979 EXTERNAL_LIST_LOOP_2 (elt, list) | |
980 { | |
981 Lisp_Object charset = Fget_charset (elt); | |
982 int lb = XCHARSET_LEADING_BYTE (elt); | |
983 if (lbs[lb - MIN_LEADING_BYTE] == 0) | |
984 { | |
985 Dynarr_add (unicode_precedence_dynarr, charset); | |
986 lbs[lb - MIN_LEADING_BYTE] = 1; | |
987 } | |
988 } | |
989 } | |
990 } | |
991 | |
992 void | |
993 recalculate_unicode_precedence (void) | |
994 { | |
995 int lbs[NUM_LEADING_BYTES]; | |
996 int i; | |
997 | |
998 for (i = 0; i < NUM_LEADING_BYTES; i++) | |
999 lbs[i] = 0; | |
1000 | |
1001 Dynarr_reset (unicode_precedence_dynarr); | |
1002 | |
1003 add_charsets_to_precedence_list (Vlanguage_unicode_precedence_list, | |
1004 lbs, unicode_precedence_dynarr); | |
1005 add_charsets_to_precedence_list (Vdefault_unicode_precedence_list, | |
1006 lbs, unicode_precedence_dynarr); | |
1007 | |
1008 for (i = 0; i < NUM_LEADING_BYTES; i++) | |
1009 { | |
1010 if (lbs[i] == 0) | |
1011 { | |
1012 Lisp_Object charset = CHARSET_BY_LEADING_BYTE (i + MIN_LEADING_BYTE); | |
1013 if (!NILP (charset)) | |
1014 Dynarr_add (unicode_precedence_dynarr, charset); | |
1015 } | |
1016 } | |
1017 } | |
1018 | |
1019 DEFUN ("set-language-unicode-precedence-list", | |
1020 Fset_language_unicode_precedence_list, | |
1021 1, 1, 0, /* | |
1022 Set the language-specific precedence list used for Unicode decoding. | |
1023 This is a list of charsets, which are consulted in order for a translation | |
1024 matching a given Unicode character. If no matches are found, the charsets | |
1025 in the default precedence list (see `set-default-unicode-precedence-list') | |
1026 are consulted, and then all remaining charsets, in some arbitrary order. | |
1027 | |
1028 The language-specific precedence list is meant to be set as part of the | |
1029 language environment initialization; the default precedence list is meant | |
1030 to be set by the user. | |
1031 */ | |
1032 (list)) | |
1033 { | |
1034 { | |
1035 EXTERNAL_LIST_LOOP_2 (elt, list) | |
1036 Fget_charset (elt); | |
1037 } | |
1038 | |
1039 Vlanguage_unicode_precedence_list = list; | |
1040 recalculate_unicode_precedence (); | |
1041 return Qnil; | |
1042 } | |
1043 | |
1044 DEFUN ("language-unicode-precedence-list", | |
1045 Flanguage_unicode_precedence_list, | |
1046 0, 0, 0, /* | |
1047 Return the language-specific precedence list used for Unicode decoding. | |
1048 See `set-language-unicode-precedence-list' for more information. | |
1049 */ | |
1050 ()) | |
1051 { | |
1052 return Vlanguage_unicode_precedence_list; | |
1053 } | |
1054 | |
1055 DEFUN ("set-default-unicode-precedence-list", | |
1056 Fset_default_unicode_precedence_list, | |
1057 1, 1, 0, /* | |
1058 Set the default precedence list used for Unicode decoding. | |
1059 This is meant to be set by the user. See | |
1060 `set-language-unicode-precedence-list' for more information. | |
1061 */ | |
1062 (list)) | |
1063 { | |
1064 { | |
1065 EXTERNAL_LIST_LOOP_2 (elt, list) | |
1066 Fget_charset (elt); | |
1067 } | |
1068 | |
1069 Vdefault_unicode_precedence_list = list; | |
1070 recalculate_unicode_precedence (); | |
1071 return Qnil; | |
1072 } | |
1073 | |
1074 DEFUN ("default-unicode-precedence-list", | |
1075 Fdefault_unicode_precedence_list, | |
1076 0, 0, 0, /* | |
1077 Return the default precedence list used for Unicode decoding. | |
1078 See `set-language-unicode-precedence-list' for more information. | |
1079 */ | |
1080 ()) | |
1081 { | |
1082 return Vdefault_unicode_precedence_list; | |
1083 } | |
1084 | |
1085 DEFUN ("set-unicode-conversion", Fset_unicode_conversion, | |
1086 2, 2, 0, /* | |
1087 Add conversion information between Unicode codepoints and characters. | |
1088 CHARACTER is one of the following: | |
1089 | |
1090 -- A character (in which case CODE must be a non-negative integer; values | |
1091 above 2^20 - 1 are allowed for the purpose of specifying private | |
1092 characters, but will cause errors when converted to utf-16) | |
1093 -- A vector of characters (in which case CODE must be a vector of integers | |
1094 of the same length) | |
1095 */ | |
1096 (character, code)) | |
1097 { | |
1098 Lisp_Object charset; | |
1099 | |
1100 CHECK_CHAR (character); | |
1101 CHECK_NATNUM (code); | |
1102 | |
1103 charset = CHAR_CHARSET (XCHAR (character)); | |
1104 if (EQ (charset, Vcharset_ascii) || | |
1105 EQ (charset, Vcharset_control_1) || | |
1106 EQ (charset, Vcharset_composite)) | |
1107 signal_error (Qinvalid_argument, "Cannot set Unicode translation for ASCII, Control-1 or Composite chars", | |
1108 character); | |
1109 | |
1110 set_unicode_conversion (XCHAR (character), XINT (code)); | |
1111 return Qnil; | |
1112 } | |
1113 | |
1114 #endif /* MULE */ | |
1115 | |
1116 DEFUN ("character-to-unicode", Fcharacter_to_unicode, 1, 1, 0, /* | |
1117 Convert character to Unicode codepoint. | |
1118 When there is no international support (i.e. MULE is not defined), | |
1119 this function simply does `char-to-int'. | |
1120 */ | |
1121 (character)) | |
1122 { | |
1123 CHECK_CHAR (character); | |
1124 #ifdef MULE | |
1125 return make_int (char_to_unicode (XCHAR (character))); | |
1126 #else | |
1127 return Fchar_to_int (character); | |
1128 #endif /* MULE */ | |
1129 } | |
1130 | |
1131 DEFUN ("unicode-to-character", Funicode_to_character, 1, 2, 0, /* | |
1132 Convert Unicode codepoint to character. | |
1133 CODE should be a non-negative integer. | |
1134 If CHARSETS is given, it should be a list of charsets, and only those | |
1135 charsets will be consulted, in the given order, for a translation. | |
1136 Otherwise, the default ordering of all charsets will be given (see | |
1137 `set-unicode-charset-precedence'). | |
1138 | |
1139 When there is no international support (i.e. MULE is not defined), | |
1140 this function simply does `int-to-char' and ignores the CHARSETS | |
1141 argument.. | |
1142 */ | |
1143 (code, charsets)) | |
1144 { | |
1145 #ifdef MULE | |
1146 Lisp_Object_dynarr *dyn; | |
1147 int lbs[NUM_LEADING_BYTES]; | |
1148 int c; | |
1149 | |
1150 CHECK_NATNUM (code); | |
1151 c = XINT (code); | |
1152 { | |
1153 EXTERNAL_LIST_LOOP_2 (elt, charsets) | |
1154 Fget_charset (elt); | |
1155 } | |
1156 | |
1157 if (NILP (charsets)) | |
1158 { | |
1159 Emchar ret = unicode_to_char (c, unicode_precedence_dynarr); | |
1160 if (ret == -1) | |
1161 return Qnil; | |
1162 return make_char (ret); | |
1163 } | |
1164 | |
1165 dyn = Dynarr_new (Lisp_Object); | |
1166 memset (lbs, 0, NUM_LEADING_BYTES * sizeof (int)); | |
1167 add_charsets_to_precedence_list (charsets, lbs, dyn); | |
1168 { | |
1169 Emchar ret = unicode_to_char (c, unicode_precedence_dynarr); | |
1170 Dynarr_free (dyn); | |
1171 if (ret == -1) | |
1172 return Qnil; | |
1173 return make_char (ret); | |
1174 } | |
1175 #else | |
1176 CHECK_NATNUM (code); | |
1177 return Fint_to_char (code); | |
1178 #endif /* MULE */ | |
1179 } | |
1180 | |
1181 static Lisp_Object | |
1182 cerrar_el_fulano (Lisp_Object fulano) | |
1183 { | |
1184 FILE *file = (FILE *) get_opaque_ptr (fulano); | |
1185 retry_fclose (file); | |
1186 return Qnil; | |
1187 } | |
1188 | |
1189 #ifdef MULE | |
1190 | |
1191 DEFUN ("parse-unicode-translation-table", Fparse_unicode_translation_table, | |
1192 2, 6, 0, /* | |
1193 Parse Unicode translation data in FILENAME for CHARSET. | |
1194 Data is text, in the form of one translation per line -- charset | |
1195 codepoint followed by Unicode codepoint. Numbers are decimal or hex | |
1196 \(preceded by 0x). Comments are marked with a #. Charset codepoints | |
1197 for two-dimensional charsets should have the first octet stored in the | |
1198 high 8 bits of the hex number and the second in the low 8 bits. | |
1199 | |
1200 If START and END are given, only charset codepoints within the given | |
1201 range will be processed. If OFFSET is given, that value will be added | |
1202 to all charset codepoints in the file to obtain the internal charset | |
1203 codepoint. START and END apply to the codepoints in the file, before | |
1204 OFFSET is applied. | |
1205 | |
1206 \(Note that, as usual, we assume that octets are in the range 32 to | |
1207 127 or 33 to 126. If you have a table in kuten form, with octets in | |
1208 the range 1 to 94, you will have to use an offset of 5140, | |
1209 i.e. 0x2020.) | |
1210 | |
1211 FLAGS, if specified, control further how the tables are interpreted | |
1212 and are used to special-case certain known table weirdnesses in the | |
1213 Unicode tables: | |
1214 | |
1215 `ignore-first-column' | |
1216 Exactly as it sounds. The JIS X 0208 tables have 3 columns of data instead | |
1217 of 2; the first is the Shift-JIS codepoint. | |
1218 `big5' | |
1219 The charset codepoint is a Big Five codepoint; convert it to the | |
1220 proper hacked-up codepoint in `chinese-big5-1' or `chinese-big5-2'. | |
1221 */ | |
1222 (filename, charset, start, end, offset, flags)) | |
1223 { | |
1224 int st = 0, en = INT_MAX, of = 0; | |
1225 FILE *file; | |
1226 struct gcpro gcpro1; | |
1227 char line[1025]; | |
1228 int fondo = specpdl_depth (); | |
1229 int ignore_first_column = 0; | |
1230 int big5 = 0; | |
1231 | |
1232 CHECK_STRING (filename); | |
1233 charset = Fget_charset (charset); | |
1234 if (!NILP (start)) | |
1235 { | |
1236 CHECK_INT (start); | |
1237 st = XINT (start); | |
1238 } | |
1239 if (!NILP (end)) | |
1240 { | |
1241 CHECK_INT (end); | |
1242 en = XINT (end); | |
1243 } | |
1244 if (!NILP (offset)) | |
1245 { | |
1246 CHECK_INT (offset); | |
1247 of = XINT (offset); | |
1248 } | |
1249 | |
1250 if (!LISTP (flags)) | |
1251 flags = list1 (flags); | |
1252 | |
1253 { | |
1254 EXTERNAL_LIST_LOOP_2 (elt, flags) | |
1255 { | |
1256 if (EQ (elt, Qignore_first_column)) | |
1257 ignore_first_column = 1; | |
1258 else if (EQ (elt, Qbig5)) | |
1259 big5 = 1; | |
1260 else | |
1261 invalid_constant | |
1262 ("Unrecognized `parse-unicode-table' flag", elt); | |
1263 } | |
1264 } | |
1265 | |
1266 GCPRO1 (filename); | |
1267 filename = Fexpand_file_name (filename, Qnil); | |
1268 file = qxe_fopen (XSTRING_DATA (filename), READ_TEXT); | |
1269 if (!file) | |
1270 report_file_error ("Cannot open", filename); | |
1271 record_unwind_protect (cerrar_el_fulano, make_opaque_ptr (file)); | |
1272 while (fgets (line, sizeof (line), file)) | |
1273 { | |
1274 char *p = line; | |
1275 int cp1, cp2, endcount; | |
1276 int cp1high, cp1low; | |
1277 int dummy; | |
1278 | |
1279 while (*p) /* erase all comments out of the line */ | |
1280 { | |
1281 if (*p == '#') | |
1282 *p = '\0'; | |
1283 else | |
1284 p++; | |
1285 } | |
1286 /* see if line is nothing but whitespace and skip if so */ | |
1287 p = line + strspn (line, " \t\n\r\f"); | |
1288 if (!*p) | |
1289 continue; | |
1290 /* NOTE: It appears that MS Windows and Newlib sscanf() have | |
1291 different interpretations for whitespace (== "skip all whitespace | |
1292 at processing point"): Newlib requires at least one corresponding | |
1293 whitespace character in the input, but MS allows none. The | |
1294 following would be easier to write if we could count on the MS | |
1295 interpretation. | |
1296 | |
1297 Also, the return value does NOT include %n storage. */ | |
1298 if ((!ignore_first_column ? | |
1299 sscanf (p, "%i %i%n", &cp1, &cp2, &endcount) < 2 : | |
1300 sscanf (p, "%i %i %i%n", &dummy, &cp1, &cp2, &endcount) < 3) | |
1301 || *(p + endcount + strspn (p + endcount, " \t\n\r\f"))) | |
1302 { | |
1303 warn_when_safe (intern ("unicode"), Qnotice, | |
1304 "Unrecognized line in translation file %s:\n%s", | |
1305 XSTRING_DATA (filename), line); | |
1306 continue; | |
1307 } | |
1308 if (cp1 >= st && cp1 <= en) | |
1309 { | |
1310 cp1 += of; | |
1311 if (cp1 < 0 || cp1 >= 65536) | |
1312 { | |
1313 out_of_range: | |
1314 warn_when_safe (intern ("unicode"), Qnotice, | |
1315 "Out of range first codepoint 0x%x in translation file %s:\n%s", | |
1316 cp1, XSTRING_DATA (filename), line); | |
1317 continue; | |
1318 } | |
1319 | |
1320 cp1high = cp1 >> 8; | |
1321 cp1low = cp1 & 255; | |
1322 | |
1323 if (big5) | |
1324 { | |
1325 Emchar ch = decode_big5_char (cp1high, cp1low); | |
1326 if (ch == -1) | |
1327 warn_when_safe (intern ("unicode"), Qnotice, | |
1328 "Out of range Big5 codepoint 0x%x in translation file %s:\n%s", | |
1329 cp1, XSTRING_DATA (filename), line); | |
1330 else | |
1331 set_unicode_conversion (ch, cp2); | |
1332 } | |
1333 else | |
1334 { | |
1335 int l1, h1, l2, h2; | |
1336 Emchar emch; | |
1337 | |
1338 switch (XCHARSET_TYPE (charset)) | |
1339 { | |
1340 case CHARSET_TYPE_94: l1 = 33; h1 = 126; l2 = 0; h2 = 0; break; | |
1341 case CHARSET_TYPE_96: l1 = 32; h1 = 127; l2 = 0; h2 = 0; break; | |
1342 case CHARSET_TYPE_94X94: l1 = 33; h1 = 126; l2 = 33; h2 = 126; | |
1343 break; | |
1344 case CHARSET_TYPE_96X96: l1 = 32; h1 = 127; l2 = 32; h2 = 127; | |
1345 break; | |
1346 default: abort (); l1 = 0; h1 = 0; l2 = 0; h2 = 0; | |
1347 } | |
1348 | |
1349 if (cp1high < l2 || cp1high > h2 || cp1low < l1 || cp1low > h1) | |
1350 goto out_of_range; | |
1351 | |
1352 emch = (cp1high == 0 ? MAKE_CHAR (charset, cp1low, 0) : | |
1353 MAKE_CHAR (charset, cp1high, cp1low)); | |
1354 set_unicode_conversion (emch, cp2); | |
1355 } | |
1356 } | |
1357 } | |
1358 | |
1359 if (ferror (file)) | |
1360 report_file_error ("IO error when reading", filename); | |
1361 | |
1362 unbind_to (fondo); /* close file */ | |
1363 UNGCPRO; | |
1364 return Qnil; | |
1365 } | |
1366 | |
1367 #endif /* MULE */ | |
1368 | |
1369 | |
1370 /************************************************************************/ | |
1371 /* Unicode coding system */ | |
1372 /************************************************************************/ | |
1373 | |
1374 /* ISO 10646 UTF-16, UCS-4, UTF-8, UTF-7, etc. */ | |
1375 DEFINE_CODING_SYSTEM_TYPE (unicode); | |
1376 | |
1377 enum unicode_type | |
1378 { | |
1379 UNICODE_UTF_16, | |
1380 UNICODE_UTF_8, | |
1381 UNICODE_UTF_7, | |
1382 UNICODE_UCS_4, | |
1383 }; | |
1384 | |
1385 struct unicode_coding_system | |
1386 { | |
1387 enum unicode_type type; | |
1388 int little_endian :1; | |
1389 int need_bom :1; | |
1390 }; | |
1391 | |
1392 #define CODING_SYSTEM_UNICODE_TYPE(codesys) \ | |
1393 (CODING_SYSTEM_TYPE_DATA (codesys, unicode)->type) | |
1394 #define XCODING_SYSTEM_UNICODE_TYPE(codesys) \ | |
1395 CODING_SYSTEM_UNICODE_TYPE (XCODING_SYSTEM (codesys)) | |
1396 #define CODING_SYSTEM_UNICODE_LITTLE_ENDIAN(codesys) \ | |
1397 (CODING_SYSTEM_TYPE_DATA (codesys, unicode)->little_endian) | |
1398 #define XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN(codesys) \ | |
1399 CODING_SYSTEM_UNICODE_LITTLE_ENDIAN (XCODING_SYSTEM (codesys)) | |
1400 #define CODING_SYSTEM_UNICODE_NEED_BOM(codesys) \ | |
1401 (CODING_SYSTEM_TYPE_DATA (codesys, unicode)->need_bom) | |
1402 #define XCODING_SYSTEM_UNICODE_NEED_BOM(codesys) \ | |
1403 CODING_SYSTEM_UNICODE_NEED_BOM (XCODING_SYSTEM (codesys)) | |
1404 | |
1405 struct unicode_coding_stream | |
1406 { | |
1407 /* decode */ | |
1408 unsigned char counter; | |
1409 int seen_char; | |
1410 /* encode */ | |
1411 Lisp_Object current_charset; | |
1412 int current_char_boundary; | |
1413 int wrote_bom; | |
1414 }; | |
1415 | |
1416 static const struct lrecord_description unicode_coding_system_description[] = { | |
1417 { XD_END } | |
1418 }; | |
1419 | |
1420 /* Decode a UCS-2 or UCS-4 character into a buffer. If the lookup fails, use | |
1421 <GETA MARK> (U+3013) of JIS X 0208, which means correct character | |
1422 is not found, instead. | |
1423 #### do something more appropriate (use blob?) | |
1424 Danger, Will Robinson! Data loss. Should we signal user? */ | |
1425 static void | |
1426 decode_unicode_char (int ch, unsigned_char_dynarr *dst, | |
1427 struct unicode_coding_stream *data, int ignore_bom) | |
1428 { | |
1429 if (ch == 0xFEFF && !data->seen_char && ignore_bom) | |
1430 ; | |
1431 else | |
1432 { | |
1433 #ifdef MULE | |
1434 Emchar chr = unicode_to_char (ch, unicode_precedence_dynarr); | |
1435 | |
1436 if (chr != -1) | |
1437 { | |
1438 Intbyte work[MAX_EMCHAR_LEN]; | |
1439 int len; | |
1440 | |
1441 len = set_charptr_emchar (work, chr); | |
1442 Dynarr_add_many (dst, work, len); | |
1443 } | |
1444 else | |
1445 { | |
1446 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208); | |
1447 Dynarr_add (dst, 34 + 128); | |
1448 Dynarr_add (dst, 46 + 128); | |
1449 } | |
1450 #else | |
1451 Dynarr_add (dst, (Intbyte) ch); | |
1452 #endif /* MULE */ | |
1453 } | |
1454 | |
1455 data->seen_char = 1; | |
1456 } | |
1457 | |
1458 static void | |
1459 encode_unicode_char_1 (int code, unsigned_char_dynarr *dst, | |
1460 enum unicode_type type, int little_endian) | |
1461 { | |
1462 switch (type) | |
1463 { | |
1464 case UNICODE_UTF_16: | |
1465 if (little_endian) | |
1466 { | |
1467 Dynarr_add (dst, (unsigned char) (code & 255)); | |
1468 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); | |
1469 } | |
1470 else | |
1471 { | |
1472 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); | |
1473 Dynarr_add (dst, (unsigned char) (code & 255)); | |
1474 } | |
1475 break; | |
1476 | |
1477 case UNICODE_UCS_4: | |
1478 if (little_endian) | |
1479 { | |
1480 Dynarr_add (dst, (unsigned char) (code & 255)); | |
1481 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); | |
1482 Dynarr_add (dst, (unsigned char) ((code >> 16) & 255)); | |
1483 Dynarr_add (dst, (unsigned char) (code >> 24)); | |
1484 } | |
1485 else | |
1486 { | |
1487 Dynarr_add (dst, (unsigned char) (code >> 24)); | |
1488 Dynarr_add (dst, (unsigned char) ((code >> 16) & 255)); | |
1489 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255)); | |
1490 Dynarr_add (dst, (unsigned char) (code & 255)); | |
1491 } | |
1492 break; | |
1493 | |
1494 case UNICODE_UTF_8: | |
1495 if (code <= 0x7f) | |
1496 { | |
1497 Dynarr_add (dst, (unsigned char) code); | |
1498 } | |
1499 else if (code <= 0x7ff) | |
1500 { | |
1501 Dynarr_add (dst, (unsigned char) ((code >> 6) | 0xc0)); | |
1502 Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80)); | |
1503 } | |
1504 else if (code <= 0xffff) | |
1505 { | |
1506 Dynarr_add (dst, (unsigned char) ((code >> 12) | 0xe0)); | |
1507 Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80)); | |
1508 Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80)); | |
1509 } | |
1510 else if (code <= 0x1fffff) | |
1511 { | |
1512 Dynarr_add (dst, (unsigned char) ((code >> 18) | 0xf0)); | |
1513 Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80)); | |
1514 Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80)); | |
1515 Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80)); | |
1516 } | |
1517 else if (code <= 0x3ffffff) | |
1518 { | |
1519 Dynarr_add (dst, (unsigned char) ((code >> 24) | 0xf8)); | |
1520 Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80)); | |
1521 Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80)); | |
1522 Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80)); | |
1523 Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80)); | |
1524 } | |
1525 else | |
1526 { | |
1527 Dynarr_add (dst, (unsigned char) ((code >> 30) | 0xfc)); | |
1528 Dynarr_add (dst, (unsigned char) (((code >> 24) & 0x3f) | 0x80)); | |
1529 Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80)); | |
1530 Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80)); | |
1531 Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80)); | |
1532 Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80)); | |
1533 } | |
1534 break; | |
1535 | |
1536 case UNICODE_UTF_7: abort (); | |
1537 | |
1538 default: abort (); | |
1539 } | |
1540 } | |
1541 | |
1542 static void | |
1543 encode_unicode_char (Lisp_Object charset, int h, int l, | |
1544 unsigned_char_dynarr *dst, enum unicode_type type, | |
1545 int little_endian) | |
1546 { | |
1547 #ifdef MULE | |
1548 int code = char_to_unicode (MAKE_CHAR (charset, h & 127, l & 127)); | |
1549 | |
1550 if (code == -1) | |
1551 { | |
1552 if (type != UNICODE_UTF_16 && | |
1553 XCHARSET_DIMENSION (charset) == 2 && | |
1554 XCHARSET_CHARS (charset) == 94) | |
1555 { | |
1556 unsigned char final = XCHARSET_FINAL (charset); | |
1557 | |
1558 if (('@' <= final) && (final < 0x7f)) | |
1559 code = (0xe00000 + (final - '@') * 94 * 94 | |
1560 + ((h & 127) - 33) * 94 + (l & 127) - 33); | |
1561 else | |
1562 code = '?'; | |
1563 } | |
1564 else | |
1565 code = '?'; | |
1566 } | |
1567 #else | |
1568 int code = h; | |
1569 #endif /* MULE */ | |
1570 | |
1571 encode_unicode_char_1 (code, dst, type, little_endian); | |
1572 } | |
1573 | |
1574 static Bytecount | |
1575 unicode_convert (struct coding_stream *str, const UExtbyte *src, | |
1576 unsigned_char_dynarr *dst, Bytecount n) | |
1577 { | |
1578 unsigned int ch = str->ch; | |
1579 struct unicode_coding_stream *data = CODING_STREAM_TYPE_DATA (str, unicode); | |
1580 enum unicode_type type = | |
1581 XCODING_SYSTEM_UNICODE_TYPE (str->codesys); | |
1582 int little_endian = XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN (str->codesys); | |
1583 int ignore_bom = XCODING_SYSTEM_UNICODE_NEED_BOM (str->codesys); | |
1584 Bytecount orign = n; | |
1585 | |
1586 if (str->direction == CODING_DECODE) | |
1587 { | |
1588 unsigned char counter = data->counter; | |
1589 | |
1590 while (n--) | |
1591 { | |
1592 UExtbyte c = *src++; | |
1593 | |
1594 switch (type) | |
1595 { | |
1596 case UNICODE_UTF_8: | |
1597 switch (counter) | |
1598 { | |
1599 case 0: | |
1600 if (c >= 0xfc) | |
1601 { | |
1602 ch = c & 0x01; | |
1603 counter = 5; | |
1604 } | |
1605 else if (c >= 0xf8) | |
1606 { | |
1607 ch = c & 0x03; | |
1608 counter = 4; | |
1609 } | |
1610 else if (c >= 0xf0) | |
1611 { | |
1612 ch = c & 0x07; | |
1613 counter = 3; | |
1614 } | |
1615 else if (c >= 0xe0) | |
1616 { | |
1617 ch = c & 0x0f; | |
1618 counter = 2; | |
1619 } | |
1620 else if (c >= 0xc0) | |
1621 { | |
1622 ch = c & 0x1f; | |
1623 counter = 1; | |
1624 } | |
1625 else | |
1626 decode_unicode_char (c, dst, data, ignore_bom); | |
1627 break; | |
1628 case 1: | |
1629 ch = (ch << 6) | (c & 0x3f); | |
1630 decode_unicode_char (ch, dst, data, ignore_bom); | |
1631 ch = 0; | |
1632 counter = 0; | |
1633 break; | |
1634 default: | |
1635 ch = (ch << 6) | (c & 0x3f); | |
1636 counter--; | |
1637 } | |
1638 break; | |
1639 | |
1640 case UNICODE_UTF_16: | |
1641 if (little_endian) | |
1642 ch = (c << counter) | ch; | |
1643 else | |
1644 ch = (ch << 8) | c; | |
1645 counter += 8; | |
1646 if (counter == 16) | |
1647 { | |
1648 int tempch = ch; | |
1649 ch = 0; | |
1650 counter = 0; | |
1651 decode_unicode_char (tempch, dst, data, ignore_bom); | |
1652 } | |
1653 break; | |
1654 | |
1655 case UNICODE_UCS_4: | |
1656 if (little_endian) | |
1657 ch = (c << counter) | ch; | |
1658 else | |
1659 ch = (ch << 8) | c; | |
1660 counter += 8; | |
1661 if (counter == 32) | |
1662 { | |
1663 int tempch = ch; | |
1664 ch = 0; | |
1665 counter = 0; | |
1666 if (tempch < 0) | |
1667 { | |
1668 /* !!#### indicate an error */ | |
1669 tempch = '~'; | |
1670 } | |
1671 decode_unicode_char (tempch, dst, data, ignore_bom); | |
1672 } | |
1673 break; | |
1674 | |
1675 case UNICODE_UTF_7: | |
1676 abort (); | |
1677 break; | |
1678 | |
1679 default: abort (); | |
1680 } | |
1681 | |
1682 } | |
1683 if (str->eof) | |
1684 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst); | |
1685 | |
1686 data->counter = counter; | |
1687 } | |
1688 else | |
1689 { | |
1690 unsigned char char_boundary = data->current_char_boundary; | |
1691 Lisp_Object charset = data->current_charset; | |
1692 | |
1693 #ifdef ENABLE_COMPOSITE_CHARS | |
1694 /* flags for handling composite chars. We do a little switcheroo | |
1695 on the source while we're outputting the composite char. */ | |
1696 Bytecount saved_n = 0; | |
1697 const Intbyte *saved_src = NULL; | |
1698 int in_composite = 0; | |
1699 | |
1700 back_to_square_n: | |
1701 #endif /* ENABLE_COMPOSITE_CHARS */ | |
1702 | |
1703 if (XCODING_SYSTEM_UNICODE_NEED_BOM (str->codesys) && !data->wrote_bom) | |
1704 { | |
1705 encode_unicode_char_1 (0xFEFF, dst, type, little_endian); | |
1706 data->wrote_bom = 1; | |
1707 } | |
1708 | |
1709 while (n--) | |
1710 { | |
1711 Intbyte c = *src++; | |
1712 | |
1713 #ifdef MULE | |
1714 if (BYTE_ASCII_P (c)) | |
1715 #endif /* MULE */ | |
1716 { /* Processing ASCII character */ | |
1717 ch = 0; | |
1718 encode_unicode_char (Vcharset_ascii, c, 0, dst, type, | |
1719 little_endian); | |
1720 | |
1721 char_boundary = 1; | |
1722 } | |
1723 #ifdef MULE | |
1724 else if (INTBYTE_LEADING_BYTE_P (c) || INTBYTE_LEADING_BYTE_P (ch)) | |
1725 { /* Processing Leading Byte */ | |
1726 ch = 0; | |
1727 charset = CHARSET_BY_LEADING_BYTE (c); | |
1728 if (LEADING_BYTE_PREFIX_P(c)) | |
1729 ch = c; | |
1730 char_boundary = 0; | |
1731 } | |
1732 else | |
1733 { /* Processing Non-ASCII character */ | |
1734 char_boundary = 1; | |
1735 if (EQ (charset, Vcharset_control_1)) | |
1736 encode_unicode_char (Vcharset_control_1, c, 0, dst, | |
1737 type, little_endian); | |
1738 else | |
1739 { | |
1740 switch (XCHARSET_REP_BYTES (charset)) | |
1741 { | |
1742 case 2: | |
1743 encode_unicode_char (charset, c, 0, dst, type, | |
1744 little_endian); | |
1745 break; | |
1746 case 3: | |
1747 if (XCHARSET_PRIVATE_P (charset)) | |
1748 { | |
1749 encode_unicode_char (charset, c, 0, dst, type, | |
1750 little_endian); | |
1751 ch = 0; | |
1752 } | |
1753 else if (ch) | |
1754 { | |
1755 #ifdef ENABLE_COMPOSITE_CHARS | |
1756 if (EQ (charset, Vcharset_composite)) | |
1757 { | |
1758 if (in_composite) | |
1759 { | |
1760 /* #### Bother! We don't know how to | |
1761 handle this yet. */ | |
1762 encode_unicode_char (Vcharset_ascii, '~', 0, | |
1763 dst, type, | |
1764 little_endian); | |
1765 } | |
1766 else | |
1767 { | |
1768 Emchar emch = MAKE_CHAR (Vcharset_composite, | |
1769 ch & 0x7F, | |
1770 c & 0x7F); | |
1771 Lisp_Object lstr = | |
1772 composite_char_string (emch); | |
1773 saved_n = n; | |
1774 saved_src = src; | |
1775 in_composite = 1; | |
1776 src = XSTRING_DATA (lstr); | |
1777 n = XSTRING_LENGTH (lstr); | |
1778 } | |
1779 } | |
1780 else | |
1781 #endif /* ENABLE_COMPOSITE_CHARS */ | |
1782 encode_unicode_char (charset, ch, c, dst, type, | |
1783 little_endian); | |
1784 ch = 0; | |
1785 } | |
1786 else | |
1787 { | |
1788 ch = c; | |
1789 char_boundary = 0; | |
1790 } | |
1791 break; | |
1792 case 4: | |
1793 if (ch) | |
1794 { | |
1795 encode_unicode_char (charset, ch, c, dst, type, | |
1796 little_endian); | |
1797 ch = 0; | |
1798 } | |
1799 else | |
1800 { | |
1801 ch = c; | |
1802 char_boundary = 0; | |
1803 } | |
1804 break; | |
1805 default: | |
1806 abort (); | |
1807 } | |
1808 } | |
1809 } | |
1810 #endif /* MULE */ | |
1811 } | |
1812 | |
1813 #ifdef ENABLE_COMPOSITE_CHARS | |
1814 if (in_composite) | |
1815 { | |
1816 n = saved_n; | |
1817 src = saved_src; | |
1818 in_composite = 0; | |
1819 goto back_to_square_n; /* Wheeeeeeeee ..... */ | |
1820 } | |
1821 #endif /* ENABLE_COMPOSITE_CHARS */ | |
1822 | |
1823 data->current_char_boundary = char_boundary; | |
1824 data->current_charset = charset; | |
1825 | |
1826 /* La palabra se hizo carne! */ | |
1827 /* A palavra fez-se carne! */ | |
1828 /* Whatever. */ | |
1829 } | |
1830 | |
1831 str->ch = ch; | |
1832 return orign; | |
1833 } | |
1834 | |
1835 /* DEFINE_DETECTOR (utf_7); */ | |
1836 DEFINE_DETECTOR (utf_8); | |
1837 DEFINE_DETECTOR_CATEGORY (utf_8, utf_8); | |
1838 DEFINE_DETECTOR (ucs_4); | |
1839 DEFINE_DETECTOR_CATEGORY (ucs_4, ucs_4); | |
1840 DEFINE_DETECTOR (utf_16); | |
1841 DEFINE_DETECTOR_CATEGORY (utf_16, utf_16); | |
1842 DEFINE_DETECTOR_CATEGORY (utf_16, utf_16_little_endian); | |
1843 DEFINE_DETECTOR_CATEGORY (utf_16, utf_16_bom); | |
1844 DEFINE_DETECTOR_CATEGORY (utf_16, utf_16_little_endian_bom); | |
1845 | |
1846 struct ucs_4_detector | |
1847 { | |
1848 int in_ucs_4_byte; | |
1849 }; | |
1850 | |
1851 static void | |
1852 ucs_4_detect (struct detection_state *st, const UExtbyte *src, | |
1853 Bytecount n) | |
1854 { | |
1855 struct ucs_4_detector *data = DETECTION_STATE_DATA (st, ucs_4); | |
1856 | |
1857 while (n--) | |
1858 { | |
1859 UExtbyte c = *src++; | |
1860 switch (data->in_ucs_4_byte) | |
1861 { | |
1862 case 0: | |
1863 if (c >= 128) | |
1864 { | |
1865 DET_RESULT (st, ucs_4) = DET_NEARLY_IMPOSSIBLE; | |
1866 return; | |
1867 } | |
1868 else | |
1869 data->in_ucs_4_byte++; | |
1870 break; | |
1871 case 3: | |
1872 data->in_ucs_4_byte = 0; | |
1873 break; | |
1874 default: | |
1875 data->in_ucs_4_byte++; | |
1876 } | |
1877 } | |
1878 | |
1879 /* !!#### write this for real */ | |
1880 DET_RESULT (st, ucs_4) = DET_AS_LIKELY_AS_UNLIKELY; | |
1881 } | |
1882 | |
1883 struct utf_16_detector | |
1884 { | |
1885 unsigned int seen_ffff:1; | |
1886 unsigned int seen_forward_bom:1; | |
1887 unsigned int seen_rev_bom:1; | |
1888 int byteno; | |
1889 int prev_char; | |
1890 int text, rev_text; | |
1891 }; | |
1892 | |
1893 static void | |
1894 utf_16_detect (struct detection_state *st, const UExtbyte *src, | |
1895 Bytecount n) | |
1896 { | |
1897 struct utf_16_detector *data = DETECTION_STATE_DATA (st, utf_16); | |
1898 | |
1899 while (n--) | |
1900 { | |
1901 UExtbyte c = *src++; | |
1902 int prevc = data->prev_char; | |
1903 if (data->byteno == 1 && c == 0xFF && prevc == 0xFE) | |
1904 data->seen_forward_bom = 1; | |
1905 else if (data->byteno == 1 && c == 0xFE && prevc == 0xFF) | |
1906 data->seen_rev_bom = 1; | |
1907 | |
1908 if (data->byteno & 1) | |
1909 { | |
1910 if (c == 0xFF && prevc == 0xFF) | |
1911 data->seen_ffff = 1; | |
1912 if (prevc == 0 | |
1913 && (c == '\r' || c == '\n' | |
1914 || (c >= 0x20 && c <= 0x7E))) | |
1915 data->text++; | |
1916 if (c == 0 | |
1917 && (prevc == '\r' || prevc == '\n' | |
1918 || (prevc >= 0x20 && prevc <= 0x7E))) | |
1919 data->rev_text++; | |
1920 if (prevc == 0x20 && (c == 0x28 || c == 0x29)) | |
1921 data->text++; | |
1922 if (c == 0x20 && (prevc == 0x28 || prevc == 0x29)) | |
1923 data->rev_text++; | |
1924 } | |
1925 | |
1926 data->byteno++; | |
1927 data->prev_char = c; | |
1928 } | |
1929 | |
1930 { | |
1931 int variance_indicates_big_endian = | |
1932 (data->text >= 10 | |
1933 && (data->rev_text == 0 | |
1934 || data->text / data->rev_text >= 10)); | |
1935 int variance_indicates_little_endian = | |
1936 (data->rev_text >= 10 | |
1937 && (data->text == 0 | |
1938 || data->rev_text / data->text >= 10)); | |
1939 | |
1940 if (data->seen_ffff) | |
1941 SET_DET_RESULTS (st, utf_16, DET_NEARLY_IMPOSSIBLE); | |
1942 else if (data->seen_forward_bom) | |
1943 { | |
1944 SET_DET_RESULTS (st, utf_16, DET_NEARLY_IMPOSSIBLE); | |
1945 if (variance_indicates_big_endian) | |
1946 DET_RESULT (st, utf_16_bom) = DET_NEAR_CERTAINTY; | |
1947 else if (variance_indicates_little_endian) | |
1948 DET_RESULT (st, utf_16_bom) = DET_SOMEWHAT_LIKELY; | |
1949 else | |
1950 DET_RESULT (st, utf_16_bom) = DET_QUITE_PROBABLE; | |
1951 } | |
1952 else if (data->seen_forward_bom) | |
1953 { | |
1954 SET_DET_RESULTS (st, utf_16, DET_NEARLY_IMPOSSIBLE); | |
1955 if (variance_indicates_big_endian) | |
1956 DET_RESULT (st, utf_16_bom) = DET_NEAR_CERTAINTY; | |
1957 else if (variance_indicates_little_endian) | |
1958 /* #### may need to rethink */ | |
1959 DET_RESULT (st, utf_16_bom) = DET_SOMEWHAT_LIKELY; | |
1960 else | |
1961 /* #### may need to rethink */ | |
1962 DET_RESULT (st, utf_16_bom) = DET_QUITE_PROBABLE; | |
1963 } | |
1964 else if (data->seen_rev_bom) | |
1965 { | |
1966 SET_DET_RESULTS (st, utf_16, DET_NEARLY_IMPOSSIBLE); | |
1967 if (variance_indicates_little_endian) | |
1968 DET_RESULT (st, utf_16_little_endian_bom) = DET_NEAR_CERTAINTY; | |
1969 else if (variance_indicates_big_endian) | |
1970 /* #### may need to rethink */ | |
1971 DET_RESULT (st, utf_16_little_endian_bom) = DET_SOMEWHAT_LIKELY; | |
1972 else | |
1973 /* #### may need to rethink */ | |
1974 DET_RESULT (st, utf_16_little_endian_bom) = DET_QUITE_PROBABLE; | |
1975 } | |
1976 else if (variance_indicates_big_endian) | |
1977 { | |
1978 SET_DET_RESULTS (st, utf_16, DET_NEARLY_IMPOSSIBLE); | |
1979 DET_RESULT (st, utf_16) = DET_SOMEWHAT_LIKELY; | |
1980 DET_RESULT (st, utf_16_little_endian) = DET_SOMEWHAT_UNLIKELY; | |
1981 } | |
1982 else if (variance_indicates_little_endian) | |
1983 { | |
1984 SET_DET_RESULTS (st, utf_16, DET_NEARLY_IMPOSSIBLE); | |
1985 DET_RESULT (st, utf_16) = DET_SOMEWHAT_UNLIKELY; | |
1986 DET_RESULT (st, utf_16_little_endian) = DET_SOMEWHAT_LIKELY; | |
1987 } | |
1988 else | |
1989 SET_DET_RESULTS (st, utf_16, DET_AS_LIKELY_AS_UNLIKELY); | |
1990 } | |
1991 } | |
1992 | |
1993 struct utf_8_detector | |
1994 { | |
1995 int in_utf_8_byte; | |
1996 }; | |
1997 | |
1998 static void | |
1999 utf_8_detect (struct detection_state *st, const UExtbyte *src, | |
2000 Bytecount n) | |
2001 { | |
2002 struct utf_8_detector *data = DETECTION_STATE_DATA (st, utf_8); | |
2003 | |
2004 while (n--) | |
2005 { | |
2006 UExtbyte c = *src++; | |
2007 switch (data->in_utf_8_byte) | |
2008 { | |
2009 case 0: | |
2010 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) | |
2011 { | |
2012 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; | |
2013 return; | |
2014 } | |
2015 else if (c >= 0xfc) | |
2016 data->in_utf_8_byte = 5; | |
2017 else if (c >= 0xf8) | |
2018 data->in_utf_8_byte = 4; | |
2019 else if (c >= 0xf0) | |
2020 data->in_utf_8_byte = 3; | |
2021 else if (c >= 0xe0) | |
2022 data->in_utf_8_byte = 2; | |
2023 else if (c >= 0xc0) | |
2024 data->in_utf_8_byte = 1; | |
2025 else if (c >= 0x80) | |
2026 { | |
2027 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; | |
2028 return; | |
2029 } | |
2030 break; | |
2031 default: | |
2032 if ((c & 0xc0) != 0x80) | |
2033 { | |
2034 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY; | |
2035 return; | |
2036 } | |
2037 else | |
2038 data->in_utf_8_byte--; | |
2039 } | |
2040 } | |
2041 DET_RESULT (st, utf_8) = DET_SOMEWHAT_LIKELY; | |
2042 } | |
2043 | |
2044 static void | |
2045 unicode_init_coding_stream (struct coding_stream *str) | |
2046 { | |
2047 struct unicode_coding_stream *data = | |
2048 CODING_STREAM_TYPE_DATA (str, unicode); | |
2049 xzero (*data); | |
2050 data->current_charset = Qnil; | |
2051 } | |
2052 | |
2053 static void | |
2054 unicode_rewind_coding_stream (struct coding_stream *str) | |
2055 { | |
2056 unicode_init_coding_stream (str); | |
2057 } | |
2058 | |
2059 static int | |
2060 unicode_putprop (Lisp_Object codesys, Lisp_Object key, Lisp_Object value) | |
2061 { | |
2062 if (EQ (key, Qtype)) | |
2063 { | |
2064 enum unicode_type type; | |
2065 | |
2066 if (EQ (value, Qutf_8)) | |
2067 type = UNICODE_UTF_8; | |
2068 else if (EQ (value, Qutf_16)) | |
2069 type = UNICODE_UTF_16; | |
2070 else if (EQ (value, Qutf_7)) | |
2071 type = UNICODE_UTF_7; | |
2072 else if (EQ (value, Qucs_4)) | |
2073 type = UNICODE_UCS_4; | |
2074 else | |
2075 invalid_constant ("Invalid Unicode type", key); | |
2076 | |
2077 XCODING_SYSTEM_UNICODE_TYPE (codesys) = type; | |
2078 } | |
2079 else if (EQ (key, Qlittle_endian)) | |
2080 XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN (codesys) = !NILP (value); | |
2081 else if (EQ (key, Qneed_bom)) | |
2082 XCODING_SYSTEM_UNICODE_NEED_BOM (codesys) = !NILP (value); | |
2083 else | |
2084 return 0; | |
2085 return 1; | |
2086 } | |
2087 | |
2088 static Lisp_Object | |
2089 unicode_getprop (Lisp_Object coding_system, Lisp_Object prop) | |
2090 { | |
2091 if (EQ (prop, Qtype)) | |
2092 { | |
2093 switch (XCODING_SYSTEM_UNICODE_TYPE (coding_system)) | |
2094 { | |
2095 case UNICODE_UTF_16: return Qutf_16; | |
2096 case UNICODE_UTF_8: return Qutf_8; | |
2097 case UNICODE_UTF_7: return Qutf_7; | |
2098 case UNICODE_UCS_4: return Qucs_4; | |
2099 default: abort (); | |
2100 } | |
2101 } | |
2102 else if (EQ (prop, Qlittle_endian)) | |
2103 return XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN (coding_system) ? Qt : Qnil; | |
2104 else if (EQ (prop, Qneed_bom)) | |
2105 return XCODING_SYSTEM_UNICODE_NEED_BOM (coding_system) ? Qt : Qnil; | |
2106 return Qunbound; | |
2107 } | |
2108 | |
2109 static void | |
2110 unicode_print (Lisp_Object cs, Lisp_Object printcharfun, int escapeflag) | |
2111 { | |
2112 write_c_string ("(", printcharfun); | |
2113 print_internal (unicode_getprop (cs, Qtype), printcharfun, 0); | |
2114 if (XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN (cs)) | |
2115 write_c_string (", little-endian", printcharfun); | |
2116 if (XCODING_SYSTEM_UNICODE_NEED_BOM (cs)) | |
2117 write_c_string (", need-bom", printcharfun); | |
2118 write_c_string (")", printcharfun); | |
2119 } | |
2120 | |
2121 int | |
2122 dfc_coding_system_is_unicode (Lisp_Object codesys) | |
2123 { | |
2124 #ifdef HAVE_WIN32_CODING_SYSTEMS | |
2125 codesys = Fget_coding_system (codesys); | |
2126 return (EQ (XCODING_SYSTEM_TYPE (codesys), Qunicode) && | |
2127 XCODING_SYSTEM_UNICODE_TYPE (codesys) == UNICODE_UTF_16 && | |
2128 XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN (codesys)); | |
2129 | |
2130 #else | |
2131 return 0; | |
2132 #endif | |
2133 } | |
2134 | |
2135 | |
2136 /************************************************************************/ | |
2137 /* Initialization */ | |
2138 /************************************************************************/ | |
2139 | |
2140 void | |
2141 syms_of_unicode (void) | |
2142 { | |
2143 #ifdef MULE | |
2144 DEFSUBR (Fset_language_unicode_precedence_list); | |
2145 DEFSUBR (Flanguage_unicode_precedence_list); | |
2146 DEFSUBR (Fset_default_unicode_precedence_list); | |
2147 DEFSUBR (Fdefault_unicode_precedence_list); | |
2148 DEFSUBR (Fset_unicode_conversion); | |
2149 | |
2150 DEFSUBR (Fparse_unicode_translation_table); | |
2151 | |
2152 DEFSYMBOL (Qignore_first_column); | |
2153 #endif /* MULE */ | |
2154 | |
2155 DEFSUBR (Fcharacter_to_unicode); | |
2156 DEFSUBR (Funicode_to_character); | |
2157 | |
2158 DEFSYMBOL (Qunicode); | |
2159 DEFSYMBOL (Qucs_4); | |
2160 DEFSYMBOL (Qutf_16); | |
2161 DEFSYMBOL (Qutf_8); | |
2162 DEFSYMBOL (Qutf_7); | |
2163 | |
2164 DEFSYMBOL (Qneed_bom); | |
2165 | |
2166 DEFSYMBOL (Qutf_16); | |
2167 DEFSYMBOL (Qutf_16_little_endian); | |
2168 DEFSYMBOL (Qutf_16_bom); | |
2169 DEFSYMBOL (Qutf_16_little_endian_bom); | |
2170 } | |
2171 | |
2172 void | |
2173 coding_system_type_create_unicode (void) | |
2174 { | |
2175 INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA (unicode, "unicode-coding-system-p"); | |
2176 CODING_SYSTEM_HAS_METHOD (unicode, print); | |
2177 CODING_SYSTEM_HAS_METHOD (unicode, convert); | |
2178 CODING_SYSTEM_HAS_METHOD (unicode, init_coding_stream); | |
2179 CODING_SYSTEM_HAS_METHOD (unicode, rewind_coding_stream); | |
2180 CODING_SYSTEM_HAS_METHOD (unicode, putprop); | |
2181 CODING_SYSTEM_HAS_METHOD (unicode, getprop); | |
2182 | |
2183 INITIALIZE_DETECTOR (utf_8); | |
2184 DETECTOR_HAS_METHOD (utf_8, detect); | |
2185 INITIALIZE_DETECTOR_CATEGORY (utf_8, utf_8); | |
2186 | |
2187 INITIALIZE_DETECTOR (ucs_4); | |
2188 DETECTOR_HAS_METHOD (ucs_4, detect); | |
2189 INITIALIZE_DETECTOR_CATEGORY (ucs_4, ucs_4); | |
2190 | |
2191 INITIALIZE_DETECTOR (utf_16); | |
2192 DETECTOR_HAS_METHOD (utf_16, detect); | |
2193 INITIALIZE_DETECTOR_CATEGORY (utf_16, utf_16); | |
2194 INITIALIZE_DETECTOR_CATEGORY (utf_16, utf_16_little_endian); | |
2195 INITIALIZE_DETECTOR_CATEGORY (utf_16, utf_16_bom); | |
2196 INITIALIZE_DETECTOR_CATEGORY (utf_16, utf_16_little_endian_bom); | |
2197 } | |
2198 | |
2199 void | |
2200 reinit_coding_system_type_create_unicode (void) | |
2201 { | |
2202 REINITIALIZE_CODING_SYSTEM_TYPE (unicode); | |
2203 } | |
2204 | |
2205 void | |
2206 reinit_vars_of_unicode (void) | |
2207 { | |
2208 #ifdef MULE | |
2209 init_blank_unicode_tables (); | |
2210 #endif /* MULE */ | |
2211 } | |
2212 | |
2213 void | |
2214 vars_of_unicode (void) | |
2215 { | |
2216 reinit_vars_of_unicode (); | |
2217 | |
2218 Fprovide (intern ("unicode")); | |
2219 | |
2220 #ifdef MULE | |
2221 staticpro (&Vlanguage_unicode_precedence_list); | |
2222 Vlanguage_unicode_precedence_list = Qnil; | |
2223 | |
2224 staticpro (&Vdefault_unicode_precedence_list); | |
2225 Vdefault_unicode_precedence_list = Qnil; | |
2226 | |
2227 unicode_precedence_dynarr = Dynarr_new (Lisp_Object); | |
2228 dump_add_root_struct_ptr (&unicode_precedence_dynarr, | |
2229 &lisp_object_dynarr_description); | |
2230 #if 0 | |
2231 dump_add_root_thing (&to_unicode_blank_1, to_unicode_level_1_desc); | |
2232 dump_add_root_thing (&to_unicode_blank_2, to_unicode_level_2_desc); | |
2233 | |
2234 dump_add_root_thing (&from_unicode_blank_1, from_unicode_level_1_desc); | |
2235 dump_add_root_thing (&from_unicode_blank_2, from_unicode_level_2_desc); | |
2236 dump_add_root_thing (&from_unicode_blank_3, from_unicode_level_3_desc); | |
2237 dump_add_root_thing (&from_unicode_blank_4, from_unicode_level_4_desc); | |
2238 #endif | |
2239 | |
2240 #endif /* MULE */ | |
2241 } |