comparison src/unicode.c @ 771:943eaba38521

[xemacs-hg @ 2002-03-13 08:51:24 by ben] The big ben-mule-21-5 check-in! Various files were added and deleted. See CHANGES-ben-mule. There are still some test suite failures. No crashes, though. Many of the failures have to do with problems in the test suite itself rather than in the actual code. I'll be addressing these in the next day or so -- none of the test suite failures are at all critical. Meanwhile I'll be trying to address the biggest issues -- i.e. build or run failures, which will almost certainly happen on various platforms. All comments should be sent to ben@xemacs.org -- use a Cc: if necessary when sending to mailing lists. There will be pre- and post- tags, something like pre-ben-mule-21-5-merge-in, and post-ben-mule-21-5-merge-in.
author ben
date Wed, 13 Mar 2002 08:54:06 +0000
parents
children 2923009caf47
comparison
equal deleted inserted replaced
770:336a418893b5 771:943eaba38521
1 /* Code to handle Unicode conversion.
2 Copyright (C) 2000, 2001, 2002 Ben Wing.
3
4 This file is part of XEmacs.
5
6 XEmacs is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 2, or (at your option) any
9 later version.
10
11 XEmacs is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with XEmacs; see the file COPYING. If not, write to
18 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 Boston, MA 02111-1307, USA. */
20
21 /* Synched up with: FSF 20.3. Not in FSF. */
22
23 /* Authorship:
24
25 Current primary author: Ben Wing <ben@xemacs.org>
26
27 Written by Ben Wing <ben@xemacs.org>, June, 2001.
28 Separated out into this file, August, 2001.
29 Includes Unicode coding systems, some parts of which have been written
30 by someone else.
31
32 As of September 2001, the detection code is here and abstraction of the
33 detection system is finished. the unicode detectors have been rewritten
34 to include multiple levels of likelihood.
35 */
36
37 #include <config.h>
38 #include "lisp.h"
39
40 #include "charset.h"
41 #include "file-coding.h"
42 #include "opaque.h"
43
44 #include "sysfile.h"
45
46 /* #### WARNING! The current sledgehammer routines have a fundamental
47 problem in that they can't handle two characters mapping to a
48 single Unicode codepoint or vice-versa in a single charset table.
49 It's not clear there is any way to handle this and still make the
50 sledgehammer routines useful. */
51 /* #define SLEDGEHAMMER_CHECK_UNICODE */
52
53 /* We currently use the following format for tables:
54
55 If dimension == 1, to_unicode_table is a 96-element array of ints
56 (Unicode code points); else, it's a 96-element array of int *
57 pointers, each of which points to a 96-element array of ints. If no
58 elements in a row have been filled in, the pointer will point to a
59 default empty table; that way, memory usage is more reasonable but
60 lookup still fast.
61
62 -- If from_unicode_levels == 1, from_unicode_table is a 256-element
63 array of shorts (octet 1 in high byte, octet 2 in low byte; we don't
64 store Emchars directly to save space).
65
66 -- If from_unicode_levels == 2, from_unicode_table is a
67 256-element array of short * pointers, each of which points to a
68 256-element array of shorts.
69
70 -- If from_unicode_levels == 3, from_unicode_table is a
71 256-element array of short ** pointers, each of which points to
72 a 256-element array of short * pointers, each of which points to
73 a 256-element array of shorts.
74
75 -- If from_unicode_levels == 4, same thing but one level deeper.
76
77 Just as for to_unicode_table, we use default tables to fill in
78 all entries with no values in them.
79
80 #### An obvious space-saving optimization is to use variable-sized
81 tables, where each table instead of just being a 256-element array,
82 is a structure with a start value, an end value, and a variable
83 number of entries (END - START + 1). Only 8 bits are needed for
84 END and START, and could be stored at the end to avoid alignment
85 problems. However, before charging off and implementing this,
86 we need to consider whether it's worth it:
87
88 (1) Most tables will be highly localized in which code points are
89 defined, heavily reducing the possible memory waste. Before
90 doing any rewriting, write some code to see how much memory is
91 actually being wasted (i.e. ratio of empty entries to total # of
92 entries) and only start rewriting if it's unacceptably high. You
93 have to check over all charsets.
94
95 (2) Since entries are usually added one at a time, you have to be
96 very careful when creating the tables to avoid realloc()/free()
97 thrashing in the common case when you are in an area of high
98 localization and are going to end up using most entries in the
99 table. You'd certainly want to allow only certain sizes, not
100 arbitrary ones (probably powers of 2, where you want the entire
101 block including the START/END values to fit into a power of 2,
102 minus any malloc overhead if there is any -- there's none under
103 gmalloc.c, and probably most system malloc() functions are quite
104 smart nowadays and also have no overhead). You could optimize
105 somewhat during the in-C initializations, because you can compute
106 the actual usage of various tables by scanning the entries you're
107 going to add in a separate pass before adding them. (You could
108 actually do the same thing when entries are added on the Lisp
109 level by making the assumption that all the entries will come in
110 one after another before any use is made of the data. So as
111 they're coming in, you just store them in a big long list, and
112 the first time you need to retrieve an entry, you compute the
113 whole table at once.) You'd still have to deal with the
114 possibility of later entries coming in, though.
115
116 (3) You do lose some speed using START/END values, since you need
117 a couple of comparisons at each level. This could easily make
118 each single lookup become 3-4 times slower. The Unicode book
119 considers this a big issue, and recommends against variable-sized
120 tables for this reason; however, they almost certainly have in
121 mind applications that primarily involve conversion of large
122 amounts of data. Most Unicode strings that are translated in
123 XEmacs are fairly small. The only place where this might matter
124 is in loading large files -- e.g. a 3-megabyte Unicode-encoded
125 file. So think about this, and maybe do a trial implementation
126 where you don't worry too much about the intricacies of (2) and
127 just implement some basic "multiply by 1.5" trick or something to
128 do the resizing. There is a very good FAQ on Unicode called
129 something like the Linux-Unicode How-To (it should be part of the
130 Linux How-To's, I think), that lists the url of a guy with a
131 whole bunch of unicode files you can use to stress-test your
132 implementations, and he's highly likely to have a good
133 multi-megabyte Unicode-encoded file (with normal text in it -- if
134 you created your own just by creating repeated strings of letters
135 and numbers, you probably wouldn't get accurate results).
136 */
137
138 /* When MULE is not defined, we may still need some Unicode support --
139 in particular, some Windows API's always want Unicode, and the way
140 we've set up the Unicode encapsulation, we may as well go ahead and
141 always use the Unicode versions of split API's. (It would be
142 trickier to not use them, and pointless -- under NT, the ANSI API's
143 call the Unicode ones anyway, so in the case of structures, we'd be
144 converting from Unicode to ANSI structures, only to have the OS
145 convert them back.) */
146
147 Lisp_Object Qunicode;
148 Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7;
149 Lisp_Object Qneed_bom;
150
151 Lisp_Object Qutf_16_little_endian, Qutf_16_bom;
152 Lisp_Object Qutf_16_little_endian_bom;
153
154 #ifdef MULE
155
156 static int *to_unicode_blank_1;
157 static int **to_unicode_blank_2;
158
159 static short *from_unicode_blank_1;
160 static short **from_unicode_blank_2;
161 static short ***from_unicode_blank_3;
162 static short ****from_unicode_blank_4;
163
164 #if 0
165
166 static const struct lrecord_description to_unicode_level_0_desc[] = {
167 { XD_END }
168 };
169
170 static const struct struct_description to_unicode_level_0_ptr_desc = {
171 sizeof (int), to_unicode_level_0_desc
172 };
173
174 static const struct lrecord_description to_unicode_level_1_desc[] = {
175 { XD_STRUCT_PTR, 0, 96, &to_unicode_level_0_ptr_desc },
176 { XD_END }
177 };
178
179 static const struct struct_description to_unicode_level_1_ptr_desc = {
180 0, to_unicode_level_1_desc
181 };
182
183 static const struct lrecord_description to_unicode_level_2_desc[] = {
184 { XD_STRUCT_PTR, 0, 96, &to_unicode_level_1_ptr_desc },
185 { XD_END }
186 };
187
188 /* Not static because each charset has a set of to and from tables and
189 needs to describe them to pdump. */
190 const struct struct_description to_unicode_description[] = {
191 { 1, to_unicode_level_1_desc },
192 { 2, to_unicode_level_2_desc },
193 { XD_END }
194 };
195
196 static const struct lrecord_description from_unicode_level_0_desc[] = {
197 { XD_END }
198 };
199
200 static const struct struct_description from_unicode_level_0_ptr_desc = {
201 sizeof (short), from_unicode_level_0_desc
202 };
203
204 static const struct lrecord_description from_unicode_level_1_desc[] = {
205 { XD_STRUCT_PTR, 0, 256, &from_unicode_level_0_ptr_desc },
206 { XD_END }
207 };
208
209 static const struct struct_description from_unicode_level_1_ptr_desc = {
210 0, from_unicode_level_1_desc
211 };
212
213 static const struct lrecord_description from_unicode_level_2_desc[] = {
214 { XD_STRUCT_PTR, 0, 256, &from_unicode_level_1_ptr_desc },
215 { XD_END }
216 };
217
218 static const struct struct_description from_unicode_level_2_ptr_desc = {
219 0, from_unicode_level_2_desc
220 };
221
222 static const struct lrecord_description from_unicode_level_3_desc[] = {
223 { XD_STRUCT_PTR, 0, 256, &from_unicode_level_2_ptr_desc },
224 { XD_END }
225 };
226
227 static const struct struct_description from_unicode_level_3_ptr_desc = {
228 0, from_unicode_level_3_desc
229 };
230
231 static const struct lrecord_description from_unicode_level_4_desc[] = {
232 { XD_STRUCT_PTR, 0, 256, &from_unicode_level_3_ptr_desc },
233 { XD_END }
234 };
235
236 /* Not static because each charset has a set of to and from tables and
237 needs to describe them to pdump. */
238 const struct struct_description from_unicode_description[] = {
239 { 1, from_unicode_level_1_desc },
240 { 2, from_unicode_level_2_desc },
241 { 3, from_unicode_level_3_desc },
242 { 4, from_unicode_level_4_desc },
243 { XD_END }
244 };
245
246 #endif /* 0 */
247
248 static Lisp_Object_dynarr *unicode_precedence_dynarr;
249
250 static const struct lrecord_description lo_description_1[] = {
251 { XD_LISP_OBJECT, 0 },
252 { XD_END }
253 };
254
255 static const struct struct_description lo_description = {
256 sizeof (Lisp_Object),
257 lo_description_1
258 };
259
260 static const struct lrecord_description lod_description_1[] = {
261 XD_DYNARR_DESC (Lisp_Object_dynarr, &lo_description),
262 { XD_END }
263 };
264
265 static const struct struct_description lisp_object_dynarr_description = {
266 sizeof (Lisp_Object_dynarr),
267 lod_description_1
268 };
269
270 Lisp_Object Vlanguage_unicode_precedence_list;
271 Lisp_Object Vdefault_unicode_precedence_list;
272
273 Lisp_Object Qignore_first_column;
274
275
276 /************************************************************************/
277 /* Unicode implementation */
278 /************************************************************************/
279
280 #define BREAKUP_UNICODE_CODE(val, u1, u2, u3, u4, levels) \
281 do { \
282 int buc_val = (val); \
283 \
284 (u1) = buc_val >> 24; \
285 (u2) = (buc_val >> 16) & 255; \
286 (u3) = (buc_val >> 8) & 255; \
287 (u4) = buc_val & 255; \
288 (levels) = (buc_val <= 0xFF ? 1 : \
289 buc_val <= 0xFFFF ? 2 : \
290 buc_val <= 0xFFFFFF ? 3 : \
291 4); \
292 } while (0)
293
294 static void
295 init_blank_unicode_tables (void)
296 {
297 int i;
298
299 from_unicode_blank_1 = xnew_array (short, 256);
300 from_unicode_blank_2 = xnew_array (short *, 256);
301 from_unicode_blank_3 = xnew_array (short **, 256);
302 from_unicode_blank_4 = xnew_array (short ***, 256);
303 for (i = 0; i < 256; i++)
304 {
305 from_unicode_blank_1[i] = (short) -1;
306 from_unicode_blank_2[i] = from_unicode_blank_1;
307 from_unicode_blank_3[i] = from_unicode_blank_2;
308 from_unicode_blank_4[i] = from_unicode_blank_3;
309 }
310
311 to_unicode_blank_1 = xnew_array (int, 96);
312 to_unicode_blank_2 = xnew_array (int *, 96);
313 for (i = 0; i < 96; i++)
314 {
315 to_unicode_blank_1[i] = -1;
316 to_unicode_blank_2[i] = to_unicode_blank_1;
317 }
318 }
319
320 static void *
321 create_new_from_unicode_table (int level)
322 {
323 switch (level)
324 {
325 /* WARNING: If you are thinking of compressing these, keep in
326 mind that sizeof (short) does not equal sizeof (short *). */
327 case 1:
328 {
329 short *newtab = xnew_array (short, 256);
330 memcpy (newtab, from_unicode_blank_1, 256 * sizeof (short));
331 return newtab;
332 }
333 case 2:
334 {
335 short **newtab = xnew_array (short *, 256);
336 memcpy (newtab, from_unicode_blank_2, 256 * sizeof (short *));
337 return newtab;
338 }
339 case 3:
340 {
341 short ***newtab = xnew_array (short **, 256);
342 memcpy (newtab, from_unicode_blank_3, 256 * sizeof (short **));
343 return newtab;
344 }
345 case 4:
346 {
347 short ****newtab = xnew_array (short ***, 256);
348 memcpy (newtab, from_unicode_blank_4, 256 * sizeof (short ***));
349 return newtab;
350 }
351 default:
352 abort ();
353 return 0;
354 }
355 }
356
357 void
358 init_charset_unicode_tables (Lisp_Object charset)
359 {
360 if (XCHARSET_DIMENSION (charset) == 1)
361 {
362 int *to_table = xnew_array (int, 96);
363 memcpy (to_table, to_unicode_blank_1, 96 * sizeof (int));
364 XCHARSET_TO_UNICODE_TABLE (charset) = to_table;
365 }
366 else
367 {
368 int **to_table = xnew_array (int *, 96);
369 memcpy (to_table, to_unicode_blank_2, 96 * sizeof (int *));
370 XCHARSET_TO_UNICODE_TABLE (charset) = to_table;
371 }
372
373 {
374 XCHARSET_FROM_UNICODE_TABLE (charset) = create_new_from_unicode_table (1);
375 XCHARSET_FROM_UNICODE_LEVELS (charset) = 1;
376 }
377 }
378
379 static void
380 free_from_unicode_table (void *table, int level)
381 {
382 int i;
383
384 switch (level)
385 {
386 case 2:
387 {
388 short **tab = (short **) table;
389 for (i = 0; i < 256; i++)
390 {
391 if (tab[i] != from_unicode_blank_1)
392 free_from_unicode_table (tab[i], 1);
393 }
394 break;
395 }
396 case 3:
397 {
398 short ***tab = (short ***) table;
399 for (i = 0; i < 256; i++)
400 {
401 if (tab[i] != from_unicode_blank_2)
402 free_from_unicode_table (tab[i], 2);
403 }
404 break;
405 }
406 case 4:
407 {
408 short ****tab = (short ****) table;
409 for (i = 0; i < 256; i++)
410 {
411 if (tab[i] != from_unicode_blank_3)
412 free_from_unicode_table (tab[i], 3);
413 }
414 break;
415 }
416 }
417
418 xfree (table);
419 }
420
421 static void
422 free_to_unicode_table (void *table, int level)
423 {
424 if (level == 2)
425 {
426 int i;
427 int **tab = (int **) table;
428
429 for (i = 0; i < 96; i++)
430 {
431 if (tab[i] != to_unicode_blank_1)
432 free_to_unicode_table (tab[i], 1);
433 }
434 }
435
436 xfree (table);
437 }
438
439 void
440 free_charset_unicode_tables (Lisp_Object charset)
441 {
442 free_to_unicode_table (XCHARSET_TO_UNICODE_TABLE (charset),
443 XCHARSET_DIMENSION (charset));
444 free_from_unicode_table (XCHARSET_FROM_UNICODE_TABLE (charset),
445 XCHARSET_FROM_UNICODE_LEVELS (charset));
446 }
447
448 #ifdef MEMORY_USAGE_STATS
449
450 static Bytecount
451 compute_from_unicode_table_size_1 (void *table, int level,
452 struct overhead_stats *stats)
453 {
454 int i;
455 Bytecount size = 0;
456
457 switch (level)
458 {
459 case 2:
460 {
461 short **tab = (short **) table;
462 for (i = 0; i < 256; i++)
463 {
464 if (tab[i] != from_unicode_blank_1)
465 size += compute_from_unicode_table_size_1 (tab[i], 1, stats);
466 }
467 break;
468 }
469 case 3:
470 {
471 short ***tab = (short ***) table;
472 for (i = 0; i < 256; i++)
473 {
474 if (tab[i] != from_unicode_blank_2)
475 size += compute_from_unicode_table_size_1 (tab[i], 2, stats);
476 }
477 break;
478 }
479 case 4:
480 {
481 short ****tab = (short ****) table;
482 for (i = 0; i < 256; i++)
483 {
484 if (tab[i] != from_unicode_blank_3)
485 size += compute_from_unicode_table_size_1 (tab[i], 3, stats);
486 }
487 break;
488 }
489 }
490
491 size += malloced_storage_size (table,
492 256 * (level == 1 ? sizeof (short) :
493 sizeof (void *)),
494 stats);
495 return size;
496 }
497
498 static Bytecount
499 compute_to_unicode_table_size_1 (void *table, int level,
500 struct overhead_stats *stats)
501 {
502 Bytecount size = 0;
503
504 if (level == 2)
505 {
506 int i;
507 int **tab = (int **) table;
508
509 for (i = 0; i < 96; i++)
510 {
511 if (tab[i] != to_unicode_blank_1)
512 size += compute_to_unicode_table_size_1 (tab[i], 1, stats);
513 }
514 }
515
516 size += malloced_storage_size (table,
517 96 * (level == 1 ? sizeof (int) :
518 sizeof (void *)),
519 stats);
520 return size;
521 }
522
523 Bytecount
524 compute_from_unicode_table_size (Lisp_Object charset,
525 struct overhead_stats *stats)
526 {
527 return (compute_from_unicode_table_size_1
528 (XCHARSET_FROM_UNICODE_TABLE (charset),
529 XCHARSET_FROM_UNICODE_LEVELS (charset),
530 stats));
531 }
532
533 Bytecount
534 compute_to_unicode_table_size (Lisp_Object charset,
535 struct overhead_stats *stats)
536 {
537 return (compute_to_unicode_table_size_1
538 (XCHARSET_TO_UNICODE_TABLE (charset),
539 XCHARSET_DIMENSION (charset),
540 stats));
541 }
542
543 #endif
544
545 #ifdef SLEDGEHAMMER_CHECK_UNICODE
546
547 /* "Sledgehammer checks" are checks that verify the self-consistency
548 of an entire structure every time a change is about to be made or
549 has been made to the structure. Not fast but a pretty much
550 sure-fire way of flushing out any incorrectnesses in the algorithms
551 that create the structure.
552
553 Checking only after a change has been made will speed things up by
554 a factor of 2, but it doesn't absolutely prove that the code just
555 checked caused the problem; perhaps it happened elsewhere, either
556 in some code you forgot to sledgehammer check or as a result of
557 data corruption. */
558
559 static void
560 assert_not_any_blank_table (void *tab)
561 {
562 assert (tab != from_unicode_blank_1);
563 assert (tab != from_unicode_blank_2);
564 assert (tab != from_unicode_blank_3);
565 assert (tab != from_unicode_blank_4);
566 assert (tab != to_unicode_blank_1);
567 assert (tab != to_unicode_blank_2);
568 assert (tab);
569 }
570
571 static void
572 sledgehammer_check_from_table (Lisp_Object charset, void *table, int level,
573 int codetop)
574 {
575 int i;
576
577 switch (level)
578 {
579 case 1:
580 {
581 short *tab = (short *) table;
582 for (i = 0; i < 256; i++)
583 {
584 if (tab[i] != -1)
585 {
586 Lisp_Object char_charset;
587 int c1, c2;
588
589 assert (valid_char_p (tab[i]));
590 BREAKUP_CHAR (tab[i], char_charset, c1, c2);
591 assert (EQ (charset, char_charset));
592 if (XCHARSET_DIMENSION (charset) == 1)
593 {
594 int *to_table =
595 (int *) XCHARSET_TO_UNICODE_TABLE (charset);
596 assert_not_any_blank_table (to_table);
597 assert (to_table[c1 - 32] == (codetop << 8) + i);
598 }
599 else
600 {
601 int **to_table =
602 (int **) XCHARSET_TO_UNICODE_TABLE (charset);
603 assert_not_any_blank_table (to_table);
604 assert_not_any_blank_table (to_table[c1 - 32]);
605 assert (to_table[c1 - 32][c2 - 32] == (codetop << 8) + i);
606 }
607 }
608 }
609 break;
610 }
611 case 2:
612 {
613 short **tab = (short **) table;
614 for (i = 0; i < 256; i++)
615 {
616 if (tab[i] != from_unicode_blank_1)
617 sledgehammer_check_from_table (charset, tab[i], 1,
618 (codetop << 8) + i);
619 }
620 break;
621 }
622 case 3:
623 {
624 short ***tab = (short ***) table;
625 for (i = 0; i < 256; i++)
626 {
627 if (tab[i] != from_unicode_blank_2)
628 sledgehammer_check_from_table (charset, tab[i], 2,
629 (codetop << 8) + i);
630 }
631 break;
632 }
633 case 4:
634 {
635 short ****tab = (short ****) table;
636 for (i = 0; i < 256; i++)
637 {
638 if (tab[i] != from_unicode_blank_3)
639 sledgehammer_check_from_table (charset, tab[i], 3,
640 (codetop << 8) + i);
641 }
642 break;
643 }
644 default:
645 abort ();
646 }
647 }
648
649 static void
650 sledgehammer_check_to_table (Lisp_Object charset, void *table, int level,
651 int codetop)
652 {
653 int i;
654
655 switch (level)
656 {
657 case 1:
658 {
659 int *tab = (int *) table;
660
661 if (XCHARSET_CHARS (charset) == 94)
662 {
663 assert (tab[0] == -1);
664 assert (tab[95] == -1);
665 }
666
667 for (i = 0; i < 96; i++)
668 {
669 if (tab[i] != -1)
670 {
671 int u4, u3, u2, u1, levels;
672 Emchar ch;
673 Emchar this_ch;
674 short val;
675 void *frtab = XCHARSET_FROM_UNICODE_TABLE (charset);
676
677 if (XCHARSET_DIMENSION (charset) == 1)
678 this_ch = MAKE_CHAR (charset, i + 32, 0);
679 else
680 this_ch = MAKE_CHAR (charset, codetop + 32, i + 32);
681
682 assert (tab[i] >= 0);
683 BREAKUP_UNICODE_CODE (tab[i], u4, u3, u2, u1, levels);
684 assert (levels <= XCHARSET_FROM_UNICODE_LEVELS (charset));
685
686 switch (XCHARSET_FROM_UNICODE_LEVELS (charset))
687 {
688 case 1: val = ((short *) frtab)[u1]; break;
689 case 2: val = ((short **) frtab)[u2][u1]; break;
690 case 3: val = ((short ***) frtab)[u3][u2][u1]; break;
691 case 4: val = ((short ****) frtab)[u4][u3][u2][u1]; break;
692 default: abort ();
693 }
694
695 ch = MAKE_CHAR (charset, val >> 8, val & 0xFF);
696 assert (ch == this_ch);
697
698 switch (XCHARSET_FROM_UNICODE_LEVELS (charset))
699 {
700 case 4:
701 assert_not_any_blank_table (frtab);
702 frtab = ((short ****) frtab)[u4];
703 /* fall through */
704 case 3:
705 assert_not_any_blank_table (frtab);
706 frtab = ((short ***) frtab)[u3];
707 /* fall through */
708 case 2:
709 assert_not_any_blank_table (frtab);
710 frtab = ((short **) frtab)[u2];
711 /* fall through */
712 case 1:
713 assert_not_any_blank_table (frtab);
714 break;
715 default: abort ();
716 }
717 }
718 }
719 break;
720 }
721 case 2:
722 {
723 int **tab = (int **) table;
724
725 if (XCHARSET_CHARS (charset) == 94)
726 {
727 assert (tab[0] == to_unicode_blank_1);
728 assert (tab[95] == to_unicode_blank_1);
729 }
730
731 for (i = 0; i < 96; i++)
732 {
733 if (tab[i] != to_unicode_blank_1)
734 sledgehammer_check_to_table (charset, tab[i], 1, i);
735 }
736 break;
737 }
738 default:
739 abort ();
740 }
741 }
742
743 static void
744 sledgehammer_check_unicode_tables (Lisp_Object charset)
745 {
746 /* verify that the blank tables have not been modified */
747 int i;
748 int from_level = XCHARSET_FROM_UNICODE_LEVELS (charset);
749 int to_level = XCHARSET_FROM_UNICODE_LEVELS (charset);
750
751 for (i = 0; i < 256; i++)
752 {
753 assert (from_unicode_blank_1[i] == (short) -1);
754 assert (from_unicode_blank_2[i] == from_unicode_blank_1);
755 assert (from_unicode_blank_3[i] == from_unicode_blank_2);
756 assert (from_unicode_blank_4[i] == from_unicode_blank_3);
757 }
758
759 for (i = 0; i < 96; i++)
760 {
761 assert (to_unicode_blank_1[i] == -1);
762 assert (to_unicode_blank_2[i] == to_unicode_blank_1);
763 }
764
765 assert (from_level >= 1 && from_level <= 4);
766
767 sledgehammer_check_from_table (charset,
768 XCHARSET_FROM_UNICODE_TABLE (charset),
769 from_level, 0);
770
771 sledgehammer_check_to_table (charset,
772 XCHARSET_TO_UNICODE_TABLE (charset),
773 XCHARSET_DIMENSION (charset), 0);
774 }
775
776 #endif /* SLEDGEHAMMER_CHECK_UNICODE */
777
778 static void
779 set_unicode_conversion (Emchar chr, int code)
780 {
781 Lisp_Object charset;
782 int c1, c2;
783
784 BREAKUP_CHAR (chr, charset, c1, c2);
785
786 assert (!EQ (charset, Vcharset_ascii));
787 assert (!EQ (charset, Vcharset_control_1));
788 assert (!EQ (charset, Vcharset_composite));
789
790 #ifdef SLEDGEHAMMER_CHECK_UNICODE
791 sledgehammer_check_unicode_tables (charset);
792 #endif
793
794 /* First, the char -> unicode translation */
795
796 if (XCHARSET_DIMENSION (charset) == 1)
797 {
798 int *to_table = (int *) XCHARSET_TO_UNICODE_TABLE (charset);
799 to_table[c1 - 32] = code;
800 }
801 else
802 {
803 int **to_table_2 = (int **) XCHARSET_TO_UNICODE_TABLE (charset);
804 int *to_table_1;
805
806 assert (XCHARSET_DIMENSION (charset) == 2);
807 to_table_1 = to_table_2[c1 - 32];
808 if (to_table_1 == to_unicode_blank_1)
809 {
810 to_table_1 = xnew_array (int, 96);
811 memcpy (to_table_1, to_unicode_blank_1, 96 * sizeof (int));
812 to_table_2[c1 - 32] = to_table_1;
813 }
814 to_table_1[c2 - 32] = code;
815 }
816
817 /* Then, unicode -> char: much harder */
818
819 {
820 int charset_levels;
821 int u4, u3, u2, u1;
822 int code_levels;
823 BREAKUP_UNICODE_CODE (code, u4, u3, u2, u1, code_levels);
824
825 charset_levels = XCHARSET_FROM_UNICODE_LEVELS (charset);
826
827 /* Make sure the charset's tables have at least as many levels as
828 the code point has: Note that the charset is guaranteed to have
829 at least one level, because it was created that way */
830 if (charset_levels < code_levels)
831 {
832 int i;
833
834 assert (charset_levels > 0);
835 for (i = 2; i <= code_levels; i++)
836 {
837 if (charset_levels < i)
838 {
839 void *old_table = XCHARSET_FROM_UNICODE_TABLE (charset);
840 void *table = create_new_from_unicode_table (i);
841 XCHARSET_FROM_UNICODE_TABLE (charset) = table;
842
843 switch (i)
844 {
845 case 2:
846 ((short **) table)[0] = (short *) old_table;
847 break;
848 case 3:
849 ((short ***) table)[0] = (short **) old_table;
850 break;
851 case 4:
852 ((short ****) table)[0] = (short ***) old_table;
853 break;
854 default: abort ();
855 }
856 }
857 }
858
859 charset_levels = code_levels;
860 XCHARSET_FROM_UNICODE_LEVELS (charset) = code_levels;
861 }
862
863 /* Now, make sure there is a non-default table at each level */
864 {
865 int i;
866 void *table = XCHARSET_FROM_UNICODE_TABLE (charset);
867
868 for (i = charset_levels; i >= 2; i--)
869 {
870 switch (i)
871 {
872 case 4:
873 if (((short ****) table)[u4] == from_unicode_blank_3)
874 ((short ****) table)[u4] =
875 ((short ***) create_new_from_unicode_table (3));
876 table = ((short ****) table)[u4];
877 break;
878 case 3:
879 if (((short ***) table)[u3] == from_unicode_blank_2)
880 ((short ***) table)[u3] =
881 ((short **) create_new_from_unicode_table (2));
882 table = ((short ***) table)[u3];
883 break;
884 case 2:
885 if (((short **) table)[u2] == from_unicode_blank_1)
886 ((short **) table)[u2] =
887 ((short *) create_new_from_unicode_table (1));
888 table = ((short **) table)[u2];
889 break;
890 default: abort ();
891 }
892 }
893 }
894
895 /* Finally, set the character */
896
897 {
898 void *table = XCHARSET_FROM_UNICODE_TABLE (charset);
899 switch (charset_levels)
900 {
901 case 1: ((short *) table)[u1] = (c1 << 8) + c2; break;
902 case 2: ((short **) table)[u2][u1] = (c1 << 8) + c2; break;
903 case 3: ((short ***) table)[u3][u2][u1] = (c1 << 8) + c2; break;
904 case 4: ((short ****) table)[u4][u3][u2][u1] = (c1 << 8) + c2; break;
905 default: abort ();
906 }
907 }
908 }
909
910 #ifdef SLEDGEHAMMER_CHECK_UNICODE
911 sledgehammer_check_unicode_tables (charset);
912 #endif
913 }
914
915 static int
916 char_to_unicode (Emchar chr)
917 {
918 Lisp_Object charset;
919 int c1, c2;
920
921 type_checking_assert (valid_char_p (chr));
922 if (chr < 256)
923 return (int) chr;
924
925 BREAKUP_CHAR (chr, charset, c1, c2);
926 if (EQ (charset, Vcharset_composite))
927 return -1; /* #### don't know how to handle */
928 else if (XCHARSET_DIMENSION (charset) == 1)
929 return ((int *) XCHARSET_TO_UNICODE_TABLE (charset))[c1 - 32];
930 else
931 return ((int **) XCHARSET_TO_UNICODE_TABLE (charset))[c1 - 32][c2 - 32];
932 }
933
934 static Emchar
935 unicode_to_char (int code, Lisp_Object_dynarr *charsets)
936 {
937 int u1, u2, u3, u4;
938 int code_levels;
939 int i;
940 int n = Dynarr_length (charsets);
941
942 type_checking_assert (code >= 0);
943 if (code < 256)
944 return (Emchar) code;
945
946 BREAKUP_UNICODE_CODE (code, u4, u3, u2, u1, code_levels);
947
948 for (i = 0; i < n; i++)
949 {
950 Lisp_Object charset = Dynarr_at (charsets, i);
951 int charset_levels = XCHARSET_FROM_UNICODE_LEVELS (charset);
952 if (charset_levels >= code_levels)
953 {
954 void *table = XCHARSET_FROM_UNICODE_TABLE (charset);
955 short retval;
956
957 switch (charset_levels)
958 {
959 case 1: retval = ((short *) table)[u1]; break;
960 case 2: retval = ((short **) table)[u2][u1]; break;
961 case 3: retval = ((short ***) table)[u3][u2][u1]; break;
962 case 4: retval = ((short ****) table)[u4][u3][u2][u1]; break;
963 default: abort (); retval = 0;
964 }
965
966 if (retval != -1)
967 return MAKE_CHAR (charset, retval >> 8, retval & 0xFF);
968 }
969 }
970
971 return (Emchar) -1;
972 }
973
974 static void
975 add_charsets_to_precedence_list (Lisp_Object list, int *lbs,
976 Lisp_Object_dynarr *dynarr)
977 {
978 {
979 EXTERNAL_LIST_LOOP_2 (elt, list)
980 {
981 Lisp_Object charset = Fget_charset (elt);
982 int lb = XCHARSET_LEADING_BYTE (elt);
983 if (lbs[lb - MIN_LEADING_BYTE] == 0)
984 {
985 Dynarr_add (unicode_precedence_dynarr, charset);
986 lbs[lb - MIN_LEADING_BYTE] = 1;
987 }
988 }
989 }
990 }
991
992 void
993 recalculate_unicode_precedence (void)
994 {
995 int lbs[NUM_LEADING_BYTES];
996 int i;
997
998 for (i = 0; i < NUM_LEADING_BYTES; i++)
999 lbs[i] = 0;
1000
1001 Dynarr_reset (unicode_precedence_dynarr);
1002
1003 add_charsets_to_precedence_list (Vlanguage_unicode_precedence_list,
1004 lbs, unicode_precedence_dynarr);
1005 add_charsets_to_precedence_list (Vdefault_unicode_precedence_list,
1006 lbs, unicode_precedence_dynarr);
1007
1008 for (i = 0; i < NUM_LEADING_BYTES; i++)
1009 {
1010 if (lbs[i] == 0)
1011 {
1012 Lisp_Object charset = CHARSET_BY_LEADING_BYTE (i + MIN_LEADING_BYTE);
1013 if (!NILP (charset))
1014 Dynarr_add (unicode_precedence_dynarr, charset);
1015 }
1016 }
1017 }
1018
1019 DEFUN ("set-language-unicode-precedence-list",
1020 Fset_language_unicode_precedence_list,
1021 1, 1, 0, /*
1022 Set the language-specific precedence list used for Unicode decoding.
1023 This is a list of charsets, which are consulted in order for a translation
1024 matching a given Unicode character. If no matches are found, the charsets
1025 in the default precedence list (see `set-default-unicode-precedence-list')
1026 are consulted, and then all remaining charsets, in some arbitrary order.
1027
1028 The language-specific precedence list is meant to be set as part of the
1029 language environment initialization; the default precedence list is meant
1030 to be set by the user.
1031 */
1032 (list))
1033 {
1034 {
1035 EXTERNAL_LIST_LOOP_2 (elt, list)
1036 Fget_charset (elt);
1037 }
1038
1039 Vlanguage_unicode_precedence_list = list;
1040 recalculate_unicode_precedence ();
1041 return Qnil;
1042 }
1043
1044 DEFUN ("language-unicode-precedence-list",
1045 Flanguage_unicode_precedence_list,
1046 0, 0, 0, /*
1047 Return the language-specific precedence list used for Unicode decoding.
1048 See `set-language-unicode-precedence-list' for more information.
1049 */
1050 ())
1051 {
1052 return Vlanguage_unicode_precedence_list;
1053 }
1054
1055 DEFUN ("set-default-unicode-precedence-list",
1056 Fset_default_unicode_precedence_list,
1057 1, 1, 0, /*
1058 Set the default precedence list used for Unicode decoding.
1059 This is meant to be set by the user. See
1060 `set-language-unicode-precedence-list' for more information.
1061 */
1062 (list))
1063 {
1064 {
1065 EXTERNAL_LIST_LOOP_2 (elt, list)
1066 Fget_charset (elt);
1067 }
1068
1069 Vdefault_unicode_precedence_list = list;
1070 recalculate_unicode_precedence ();
1071 return Qnil;
1072 }
1073
1074 DEFUN ("default-unicode-precedence-list",
1075 Fdefault_unicode_precedence_list,
1076 0, 0, 0, /*
1077 Return the default precedence list used for Unicode decoding.
1078 See `set-language-unicode-precedence-list' for more information.
1079 */
1080 ())
1081 {
1082 return Vdefault_unicode_precedence_list;
1083 }
1084
1085 DEFUN ("set-unicode-conversion", Fset_unicode_conversion,
1086 2, 2, 0, /*
1087 Add conversion information between Unicode codepoints and characters.
1088 CHARACTER is one of the following:
1089
1090 -- A character (in which case CODE must be a non-negative integer; values
1091 above 2^20 - 1 are allowed for the purpose of specifying private
1092 characters, but will cause errors when converted to utf-16)
1093 -- A vector of characters (in which case CODE must be a vector of integers
1094 of the same length)
1095 */
1096 (character, code))
1097 {
1098 Lisp_Object charset;
1099
1100 CHECK_CHAR (character);
1101 CHECK_NATNUM (code);
1102
1103 charset = CHAR_CHARSET (XCHAR (character));
1104 if (EQ (charset, Vcharset_ascii) ||
1105 EQ (charset, Vcharset_control_1) ||
1106 EQ (charset, Vcharset_composite))
1107 signal_error (Qinvalid_argument, "Cannot set Unicode translation for ASCII, Control-1 or Composite chars",
1108 character);
1109
1110 set_unicode_conversion (XCHAR (character), XINT (code));
1111 return Qnil;
1112 }
1113
1114 #endif /* MULE */
1115
1116 DEFUN ("character-to-unicode", Fcharacter_to_unicode, 1, 1, 0, /*
1117 Convert character to Unicode codepoint.
1118 When there is no international support (i.e. MULE is not defined),
1119 this function simply does `char-to-int'.
1120 */
1121 (character))
1122 {
1123 CHECK_CHAR (character);
1124 #ifdef MULE
1125 return make_int (char_to_unicode (XCHAR (character)));
1126 #else
1127 return Fchar_to_int (character);
1128 #endif /* MULE */
1129 }
1130
1131 DEFUN ("unicode-to-character", Funicode_to_character, 1, 2, 0, /*
1132 Convert Unicode codepoint to character.
1133 CODE should be a non-negative integer.
1134 If CHARSETS is given, it should be a list of charsets, and only those
1135 charsets will be consulted, in the given order, for a translation.
1136 Otherwise, the default ordering of all charsets will be given (see
1137 `set-unicode-charset-precedence').
1138
1139 When there is no international support (i.e. MULE is not defined),
1140 this function simply does `int-to-char' and ignores the CHARSETS
1141 argument..
1142 */
1143 (code, charsets))
1144 {
1145 #ifdef MULE
1146 Lisp_Object_dynarr *dyn;
1147 int lbs[NUM_LEADING_BYTES];
1148 int c;
1149
1150 CHECK_NATNUM (code);
1151 c = XINT (code);
1152 {
1153 EXTERNAL_LIST_LOOP_2 (elt, charsets)
1154 Fget_charset (elt);
1155 }
1156
1157 if (NILP (charsets))
1158 {
1159 Emchar ret = unicode_to_char (c, unicode_precedence_dynarr);
1160 if (ret == -1)
1161 return Qnil;
1162 return make_char (ret);
1163 }
1164
1165 dyn = Dynarr_new (Lisp_Object);
1166 memset (lbs, 0, NUM_LEADING_BYTES * sizeof (int));
1167 add_charsets_to_precedence_list (charsets, lbs, dyn);
1168 {
1169 Emchar ret = unicode_to_char (c, unicode_precedence_dynarr);
1170 Dynarr_free (dyn);
1171 if (ret == -1)
1172 return Qnil;
1173 return make_char (ret);
1174 }
1175 #else
1176 CHECK_NATNUM (code);
1177 return Fint_to_char (code);
1178 #endif /* MULE */
1179 }
1180
1181 static Lisp_Object
1182 cerrar_el_fulano (Lisp_Object fulano)
1183 {
1184 FILE *file = (FILE *) get_opaque_ptr (fulano);
1185 retry_fclose (file);
1186 return Qnil;
1187 }
1188
1189 #ifdef MULE
1190
1191 DEFUN ("parse-unicode-translation-table", Fparse_unicode_translation_table,
1192 2, 6, 0, /*
1193 Parse Unicode translation data in FILENAME for CHARSET.
1194 Data is text, in the form of one translation per line -- charset
1195 codepoint followed by Unicode codepoint. Numbers are decimal or hex
1196 \(preceded by 0x). Comments are marked with a #. Charset codepoints
1197 for two-dimensional charsets should have the first octet stored in the
1198 high 8 bits of the hex number and the second in the low 8 bits.
1199
1200 If START and END are given, only charset codepoints within the given
1201 range will be processed. If OFFSET is given, that value will be added
1202 to all charset codepoints in the file to obtain the internal charset
1203 codepoint. START and END apply to the codepoints in the file, before
1204 OFFSET is applied.
1205
1206 \(Note that, as usual, we assume that octets are in the range 32 to
1207 127 or 33 to 126. If you have a table in kuten form, with octets in
1208 the range 1 to 94, you will have to use an offset of 5140,
1209 i.e. 0x2020.)
1210
1211 FLAGS, if specified, control further how the tables are interpreted
1212 and are used to special-case certain known table weirdnesses in the
1213 Unicode tables:
1214
1215 `ignore-first-column'
1216 Exactly as it sounds. The JIS X 0208 tables have 3 columns of data instead
1217 of 2; the first is the Shift-JIS codepoint.
1218 `big5'
1219 The charset codepoint is a Big Five codepoint; convert it to the
1220 proper hacked-up codepoint in `chinese-big5-1' or `chinese-big5-2'.
1221 */
1222 (filename, charset, start, end, offset, flags))
1223 {
1224 int st = 0, en = INT_MAX, of = 0;
1225 FILE *file;
1226 struct gcpro gcpro1;
1227 char line[1025];
1228 int fondo = specpdl_depth ();
1229 int ignore_first_column = 0;
1230 int big5 = 0;
1231
1232 CHECK_STRING (filename);
1233 charset = Fget_charset (charset);
1234 if (!NILP (start))
1235 {
1236 CHECK_INT (start);
1237 st = XINT (start);
1238 }
1239 if (!NILP (end))
1240 {
1241 CHECK_INT (end);
1242 en = XINT (end);
1243 }
1244 if (!NILP (offset))
1245 {
1246 CHECK_INT (offset);
1247 of = XINT (offset);
1248 }
1249
1250 if (!LISTP (flags))
1251 flags = list1 (flags);
1252
1253 {
1254 EXTERNAL_LIST_LOOP_2 (elt, flags)
1255 {
1256 if (EQ (elt, Qignore_first_column))
1257 ignore_first_column = 1;
1258 else if (EQ (elt, Qbig5))
1259 big5 = 1;
1260 else
1261 invalid_constant
1262 ("Unrecognized `parse-unicode-table' flag", elt);
1263 }
1264 }
1265
1266 GCPRO1 (filename);
1267 filename = Fexpand_file_name (filename, Qnil);
1268 file = qxe_fopen (XSTRING_DATA (filename), READ_TEXT);
1269 if (!file)
1270 report_file_error ("Cannot open", filename);
1271 record_unwind_protect (cerrar_el_fulano, make_opaque_ptr (file));
1272 while (fgets (line, sizeof (line), file))
1273 {
1274 char *p = line;
1275 int cp1, cp2, endcount;
1276 int cp1high, cp1low;
1277 int dummy;
1278
1279 while (*p) /* erase all comments out of the line */
1280 {
1281 if (*p == '#')
1282 *p = '\0';
1283 else
1284 p++;
1285 }
1286 /* see if line is nothing but whitespace and skip if so */
1287 p = line + strspn (line, " \t\n\r\f");
1288 if (!*p)
1289 continue;
1290 /* NOTE: It appears that MS Windows and Newlib sscanf() have
1291 different interpretations for whitespace (== "skip all whitespace
1292 at processing point"): Newlib requires at least one corresponding
1293 whitespace character in the input, but MS allows none. The
1294 following would be easier to write if we could count on the MS
1295 interpretation.
1296
1297 Also, the return value does NOT include %n storage. */
1298 if ((!ignore_first_column ?
1299 sscanf (p, "%i %i%n", &cp1, &cp2, &endcount) < 2 :
1300 sscanf (p, "%i %i %i%n", &dummy, &cp1, &cp2, &endcount) < 3)
1301 || *(p + endcount + strspn (p + endcount, " \t\n\r\f")))
1302 {
1303 warn_when_safe (intern ("unicode"), Qnotice,
1304 "Unrecognized line in translation file %s:\n%s",
1305 XSTRING_DATA (filename), line);
1306 continue;
1307 }
1308 if (cp1 >= st && cp1 <= en)
1309 {
1310 cp1 += of;
1311 if (cp1 < 0 || cp1 >= 65536)
1312 {
1313 out_of_range:
1314 warn_when_safe (intern ("unicode"), Qnotice,
1315 "Out of range first codepoint 0x%x in translation file %s:\n%s",
1316 cp1, XSTRING_DATA (filename), line);
1317 continue;
1318 }
1319
1320 cp1high = cp1 >> 8;
1321 cp1low = cp1 & 255;
1322
1323 if (big5)
1324 {
1325 Emchar ch = decode_big5_char (cp1high, cp1low);
1326 if (ch == -1)
1327 warn_when_safe (intern ("unicode"), Qnotice,
1328 "Out of range Big5 codepoint 0x%x in translation file %s:\n%s",
1329 cp1, XSTRING_DATA (filename), line);
1330 else
1331 set_unicode_conversion (ch, cp2);
1332 }
1333 else
1334 {
1335 int l1, h1, l2, h2;
1336 Emchar emch;
1337
1338 switch (XCHARSET_TYPE (charset))
1339 {
1340 case CHARSET_TYPE_94: l1 = 33; h1 = 126; l2 = 0; h2 = 0; break;
1341 case CHARSET_TYPE_96: l1 = 32; h1 = 127; l2 = 0; h2 = 0; break;
1342 case CHARSET_TYPE_94X94: l1 = 33; h1 = 126; l2 = 33; h2 = 126;
1343 break;
1344 case CHARSET_TYPE_96X96: l1 = 32; h1 = 127; l2 = 32; h2 = 127;
1345 break;
1346 default: abort (); l1 = 0; h1 = 0; l2 = 0; h2 = 0;
1347 }
1348
1349 if (cp1high < l2 || cp1high > h2 || cp1low < l1 || cp1low > h1)
1350 goto out_of_range;
1351
1352 emch = (cp1high == 0 ? MAKE_CHAR (charset, cp1low, 0) :
1353 MAKE_CHAR (charset, cp1high, cp1low));
1354 set_unicode_conversion (emch, cp2);
1355 }
1356 }
1357 }
1358
1359 if (ferror (file))
1360 report_file_error ("IO error when reading", filename);
1361
1362 unbind_to (fondo); /* close file */
1363 UNGCPRO;
1364 return Qnil;
1365 }
1366
1367 #endif /* MULE */
1368
1369
1370 /************************************************************************/
1371 /* Unicode coding system */
1372 /************************************************************************/
1373
1374 /* ISO 10646 UTF-16, UCS-4, UTF-8, UTF-7, etc. */
1375 DEFINE_CODING_SYSTEM_TYPE (unicode);
1376
1377 enum unicode_type
1378 {
1379 UNICODE_UTF_16,
1380 UNICODE_UTF_8,
1381 UNICODE_UTF_7,
1382 UNICODE_UCS_4,
1383 };
1384
1385 struct unicode_coding_system
1386 {
1387 enum unicode_type type;
1388 int little_endian :1;
1389 int need_bom :1;
1390 };
1391
1392 #define CODING_SYSTEM_UNICODE_TYPE(codesys) \
1393 (CODING_SYSTEM_TYPE_DATA (codesys, unicode)->type)
1394 #define XCODING_SYSTEM_UNICODE_TYPE(codesys) \
1395 CODING_SYSTEM_UNICODE_TYPE (XCODING_SYSTEM (codesys))
1396 #define CODING_SYSTEM_UNICODE_LITTLE_ENDIAN(codesys) \
1397 (CODING_SYSTEM_TYPE_DATA (codesys, unicode)->little_endian)
1398 #define XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN(codesys) \
1399 CODING_SYSTEM_UNICODE_LITTLE_ENDIAN (XCODING_SYSTEM (codesys))
1400 #define CODING_SYSTEM_UNICODE_NEED_BOM(codesys) \
1401 (CODING_SYSTEM_TYPE_DATA (codesys, unicode)->need_bom)
1402 #define XCODING_SYSTEM_UNICODE_NEED_BOM(codesys) \
1403 CODING_SYSTEM_UNICODE_NEED_BOM (XCODING_SYSTEM (codesys))
1404
1405 struct unicode_coding_stream
1406 {
1407 /* decode */
1408 unsigned char counter;
1409 int seen_char;
1410 /* encode */
1411 Lisp_Object current_charset;
1412 int current_char_boundary;
1413 int wrote_bom;
1414 };
1415
1416 static const struct lrecord_description unicode_coding_system_description[] = {
1417 { XD_END }
1418 };
1419
1420 /* Decode a UCS-2 or UCS-4 character into a buffer. If the lookup fails, use
1421 <GETA MARK> (U+3013) of JIS X 0208, which means correct character
1422 is not found, instead.
1423 #### do something more appropriate (use blob?)
1424 Danger, Will Robinson! Data loss. Should we signal user? */
1425 static void
1426 decode_unicode_char (int ch, unsigned_char_dynarr *dst,
1427 struct unicode_coding_stream *data, int ignore_bom)
1428 {
1429 if (ch == 0xFEFF && !data->seen_char && ignore_bom)
1430 ;
1431 else
1432 {
1433 #ifdef MULE
1434 Emchar chr = unicode_to_char (ch, unicode_precedence_dynarr);
1435
1436 if (chr != -1)
1437 {
1438 Intbyte work[MAX_EMCHAR_LEN];
1439 int len;
1440
1441 len = set_charptr_emchar (work, chr);
1442 Dynarr_add_many (dst, work, len);
1443 }
1444 else
1445 {
1446 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
1447 Dynarr_add (dst, 34 + 128);
1448 Dynarr_add (dst, 46 + 128);
1449 }
1450 #else
1451 Dynarr_add (dst, (Intbyte) ch);
1452 #endif /* MULE */
1453 }
1454
1455 data->seen_char = 1;
1456 }
1457
1458 static void
1459 encode_unicode_char_1 (int code, unsigned_char_dynarr *dst,
1460 enum unicode_type type, int little_endian)
1461 {
1462 switch (type)
1463 {
1464 case UNICODE_UTF_16:
1465 if (little_endian)
1466 {
1467 Dynarr_add (dst, (unsigned char) (code & 255));
1468 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
1469 }
1470 else
1471 {
1472 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
1473 Dynarr_add (dst, (unsigned char) (code & 255));
1474 }
1475 break;
1476
1477 case UNICODE_UCS_4:
1478 if (little_endian)
1479 {
1480 Dynarr_add (dst, (unsigned char) (code & 255));
1481 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
1482 Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
1483 Dynarr_add (dst, (unsigned char) (code >> 24));
1484 }
1485 else
1486 {
1487 Dynarr_add (dst, (unsigned char) (code >> 24));
1488 Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
1489 Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
1490 Dynarr_add (dst, (unsigned char) (code & 255));
1491 }
1492 break;
1493
1494 case UNICODE_UTF_8:
1495 if (code <= 0x7f)
1496 {
1497 Dynarr_add (dst, (unsigned char) code);
1498 }
1499 else if (code <= 0x7ff)
1500 {
1501 Dynarr_add (dst, (unsigned char) ((code >> 6) | 0xc0));
1502 Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80));
1503 }
1504 else if (code <= 0xffff)
1505 {
1506 Dynarr_add (dst, (unsigned char) ((code >> 12) | 0xe0));
1507 Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80));
1508 Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80));
1509 }
1510 else if (code <= 0x1fffff)
1511 {
1512 Dynarr_add (dst, (unsigned char) ((code >> 18) | 0xf0));
1513 Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80));
1514 Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80));
1515 Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80));
1516 }
1517 else if (code <= 0x3ffffff)
1518 {
1519 Dynarr_add (dst, (unsigned char) ((code >> 24) | 0xf8));
1520 Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80));
1521 Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80));
1522 Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80));
1523 Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80));
1524 }
1525 else
1526 {
1527 Dynarr_add (dst, (unsigned char) ((code >> 30) | 0xfc));
1528 Dynarr_add (dst, (unsigned char) (((code >> 24) & 0x3f) | 0x80));
1529 Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80));
1530 Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80));
1531 Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80));
1532 Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80));
1533 }
1534 break;
1535
1536 case UNICODE_UTF_7: abort ();
1537
1538 default: abort ();
1539 }
1540 }
1541
1542 static void
1543 encode_unicode_char (Lisp_Object charset, int h, int l,
1544 unsigned_char_dynarr *dst, enum unicode_type type,
1545 int little_endian)
1546 {
1547 #ifdef MULE
1548 int code = char_to_unicode (MAKE_CHAR (charset, h & 127, l & 127));
1549
1550 if (code == -1)
1551 {
1552 if (type != UNICODE_UTF_16 &&
1553 XCHARSET_DIMENSION (charset) == 2 &&
1554 XCHARSET_CHARS (charset) == 94)
1555 {
1556 unsigned char final = XCHARSET_FINAL (charset);
1557
1558 if (('@' <= final) && (final < 0x7f))
1559 code = (0xe00000 + (final - '@') * 94 * 94
1560 + ((h & 127) - 33) * 94 + (l & 127) - 33);
1561 else
1562 code = '?';
1563 }
1564 else
1565 code = '?';
1566 }
1567 #else
1568 int code = h;
1569 #endif /* MULE */
1570
1571 encode_unicode_char_1 (code, dst, type, little_endian);
1572 }
1573
1574 static Bytecount
1575 unicode_convert (struct coding_stream *str, const UExtbyte *src,
1576 unsigned_char_dynarr *dst, Bytecount n)
1577 {
1578 unsigned int ch = str->ch;
1579 struct unicode_coding_stream *data = CODING_STREAM_TYPE_DATA (str, unicode);
1580 enum unicode_type type =
1581 XCODING_SYSTEM_UNICODE_TYPE (str->codesys);
1582 int little_endian = XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN (str->codesys);
1583 int ignore_bom = XCODING_SYSTEM_UNICODE_NEED_BOM (str->codesys);
1584 Bytecount orign = n;
1585
1586 if (str->direction == CODING_DECODE)
1587 {
1588 unsigned char counter = data->counter;
1589
1590 while (n--)
1591 {
1592 UExtbyte c = *src++;
1593
1594 switch (type)
1595 {
1596 case UNICODE_UTF_8:
1597 switch (counter)
1598 {
1599 case 0:
1600 if (c >= 0xfc)
1601 {
1602 ch = c & 0x01;
1603 counter = 5;
1604 }
1605 else if (c >= 0xf8)
1606 {
1607 ch = c & 0x03;
1608 counter = 4;
1609 }
1610 else if (c >= 0xf0)
1611 {
1612 ch = c & 0x07;
1613 counter = 3;
1614 }
1615 else if (c >= 0xe0)
1616 {
1617 ch = c & 0x0f;
1618 counter = 2;
1619 }
1620 else if (c >= 0xc0)
1621 {
1622 ch = c & 0x1f;
1623 counter = 1;
1624 }
1625 else
1626 decode_unicode_char (c, dst, data, ignore_bom);
1627 break;
1628 case 1:
1629 ch = (ch << 6) | (c & 0x3f);
1630 decode_unicode_char (ch, dst, data, ignore_bom);
1631 ch = 0;
1632 counter = 0;
1633 break;
1634 default:
1635 ch = (ch << 6) | (c & 0x3f);
1636 counter--;
1637 }
1638 break;
1639
1640 case UNICODE_UTF_16:
1641 if (little_endian)
1642 ch = (c << counter) | ch;
1643 else
1644 ch = (ch << 8) | c;
1645 counter += 8;
1646 if (counter == 16)
1647 {
1648 int tempch = ch;
1649 ch = 0;
1650 counter = 0;
1651 decode_unicode_char (tempch, dst, data, ignore_bom);
1652 }
1653 break;
1654
1655 case UNICODE_UCS_4:
1656 if (little_endian)
1657 ch = (c << counter) | ch;
1658 else
1659 ch = (ch << 8) | c;
1660 counter += 8;
1661 if (counter == 32)
1662 {
1663 int tempch = ch;
1664 ch = 0;
1665 counter = 0;
1666 if (tempch < 0)
1667 {
1668 /* !!#### indicate an error */
1669 tempch = '~';
1670 }
1671 decode_unicode_char (tempch, dst, data, ignore_bom);
1672 }
1673 break;
1674
1675 case UNICODE_UTF_7:
1676 abort ();
1677 break;
1678
1679 default: abort ();
1680 }
1681
1682 }
1683 if (str->eof)
1684 DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
1685
1686 data->counter = counter;
1687 }
1688 else
1689 {
1690 unsigned char char_boundary = data->current_char_boundary;
1691 Lisp_Object charset = data->current_charset;
1692
1693 #ifdef ENABLE_COMPOSITE_CHARS
1694 /* flags for handling composite chars. We do a little switcheroo
1695 on the source while we're outputting the composite char. */
1696 Bytecount saved_n = 0;
1697 const Intbyte *saved_src = NULL;
1698 int in_composite = 0;
1699
1700 back_to_square_n:
1701 #endif /* ENABLE_COMPOSITE_CHARS */
1702
1703 if (XCODING_SYSTEM_UNICODE_NEED_BOM (str->codesys) && !data->wrote_bom)
1704 {
1705 encode_unicode_char_1 (0xFEFF, dst, type, little_endian);
1706 data->wrote_bom = 1;
1707 }
1708
1709 while (n--)
1710 {
1711 Intbyte c = *src++;
1712
1713 #ifdef MULE
1714 if (BYTE_ASCII_P (c))
1715 #endif /* MULE */
1716 { /* Processing ASCII character */
1717 ch = 0;
1718 encode_unicode_char (Vcharset_ascii, c, 0, dst, type,
1719 little_endian);
1720
1721 char_boundary = 1;
1722 }
1723 #ifdef MULE
1724 else if (INTBYTE_LEADING_BYTE_P (c) || INTBYTE_LEADING_BYTE_P (ch))
1725 { /* Processing Leading Byte */
1726 ch = 0;
1727 charset = CHARSET_BY_LEADING_BYTE (c);
1728 if (LEADING_BYTE_PREFIX_P(c))
1729 ch = c;
1730 char_boundary = 0;
1731 }
1732 else
1733 { /* Processing Non-ASCII character */
1734 char_boundary = 1;
1735 if (EQ (charset, Vcharset_control_1))
1736 encode_unicode_char (Vcharset_control_1, c, 0, dst,
1737 type, little_endian);
1738 else
1739 {
1740 switch (XCHARSET_REP_BYTES (charset))
1741 {
1742 case 2:
1743 encode_unicode_char (charset, c, 0, dst, type,
1744 little_endian);
1745 break;
1746 case 3:
1747 if (XCHARSET_PRIVATE_P (charset))
1748 {
1749 encode_unicode_char (charset, c, 0, dst, type,
1750 little_endian);
1751 ch = 0;
1752 }
1753 else if (ch)
1754 {
1755 #ifdef ENABLE_COMPOSITE_CHARS
1756 if (EQ (charset, Vcharset_composite))
1757 {
1758 if (in_composite)
1759 {
1760 /* #### Bother! We don't know how to
1761 handle this yet. */
1762 encode_unicode_char (Vcharset_ascii, '~', 0,
1763 dst, type,
1764 little_endian);
1765 }
1766 else
1767 {
1768 Emchar emch = MAKE_CHAR (Vcharset_composite,
1769 ch & 0x7F,
1770 c & 0x7F);
1771 Lisp_Object lstr =
1772 composite_char_string (emch);
1773 saved_n = n;
1774 saved_src = src;
1775 in_composite = 1;
1776 src = XSTRING_DATA (lstr);
1777 n = XSTRING_LENGTH (lstr);
1778 }
1779 }
1780 else
1781 #endif /* ENABLE_COMPOSITE_CHARS */
1782 encode_unicode_char (charset, ch, c, dst, type,
1783 little_endian);
1784 ch = 0;
1785 }
1786 else
1787 {
1788 ch = c;
1789 char_boundary = 0;
1790 }
1791 break;
1792 case 4:
1793 if (ch)
1794 {
1795 encode_unicode_char (charset, ch, c, dst, type,
1796 little_endian);
1797 ch = 0;
1798 }
1799 else
1800 {
1801 ch = c;
1802 char_boundary = 0;
1803 }
1804 break;
1805 default:
1806 abort ();
1807 }
1808 }
1809 }
1810 #endif /* MULE */
1811 }
1812
1813 #ifdef ENABLE_COMPOSITE_CHARS
1814 if (in_composite)
1815 {
1816 n = saved_n;
1817 src = saved_src;
1818 in_composite = 0;
1819 goto back_to_square_n; /* Wheeeeeeeee ..... */
1820 }
1821 #endif /* ENABLE_COMPOSITE_CHARS */
1822
1823 data->current_char_boundary = char_boundary;
1824 data->current_charset = charset;
1825
1826 /* La palabra se hizo carne! */
1827 /* A palavra fez-se carne! */
1828 /* Whatever. */
1829 }
1830
1831 str->ch = ch;
1832 return orign;
1833 }
1834
1835 /* DEFINE_DETECTOR (utf_7); */
1836 DEFINE_DETECTOR (utf_8);
1837 DEFINE_DETECTOR_CATEGORY (utf_8, utf_8);
1838 DEFINE_DETECTOR (ucs_4);
1839 DEFINE_DETECTOR_CATEGORY (ucs_4, ucs_4);
1840 DEFINE_DETECTOR (utf_16);
1841 DEFINE_DETECTOR_CATEGORY (utf_16, utf_16);
1842 DEFINE_DETECTOR_CATEGORY (utf_16, utf_16_little_endian);
1843 DEFINE_DETECTOR_CATEGORY (utf_16, utf_16_bom);
1844 DEFINE_DETECTOR_CATEGORY (utf_16, utf_16_little_endian_bom);
1845
1846 struct ucs_4_detector
1847 {
1848 int in_ucs_4_byte;
1849 };
1850
1851 static void
1852 ucs_4_detect (struct detection_state *st, const UExtbyte *src,
1853 Bytecount n)
1854 {
1855 struct ucs_4_detector *data = DETECTION_STATE_DATA (st, ucs_4);
1856
1857 while (n--)
1858 {
1859 UExtbyte c = *src++;
1860 switch (data->in_ucs_4_byte)
1861 {
1862 case 0:
1863 if (c >= 128)
1864 {
1865 DET_RESULT (st, ucs_4) = DET_NEARLY_IMPOSSIBLE;
1866 return;
1867 }
1868 else
1869 data->in_ucs_4_byte++;
1870 break;
1871 case 3:
1872 data->in_ucs_4_byte = 0;
1873 break;
1874 default:
1875 data->in_ucs_4_byte++;
1876 }
1877 }
1878
1879 /* !!#### write this for real */
1880 DET_RESULT (st, ucs_4) = DET_AS_LIKELY_AS_UNLIKELY;
1881 }
1882
1883 struct utf_16_detector
1884 {
1885 unsigned int seen_ffff:1;
1886 unsigned int seen_forward_bom:1;
1887 unsigned int seen_rev_bom:1;
1888 int byteno;
1889 int prev_char;
1890 int text, rev_text;
1891 };
1892
1893 static void
1894 utf_16_detect (struct detection_state *st, const UExtbyte *src,
1895 Bytecount n)
1896 {
1897 struct utf_16_detector *data = DETECTION_STATE_DATA (st, utf_16);
1898
1899 while (n--)
1900 {
1901 UExtbyte c = *src++;
1902 int prevc = data->prev_char;
1903 if (data->byteno == 1 && c == 0xFF && prevc == 0xFE)
1904 data->seen_forward_bom = 1;
1905 else if (data->byteno == 1 && c == 0xFE && prevc == 0xFF)
1906 data->seen_rev_bom = 1;
1907
1908 if (data->byteno & 1)
1909 {
1910 if (c == 0xFF && prevc == 0xFF)
1911 data->seen_ffff = 1;
1912 if (prevc == 0
1913 && (c == '\r' || c == '\n'
1914 || (c >= 0x20 && c <= 0x7E)))
1915 data->text++;
1916 if (c == 0
1917 && (prevc == '\r' || prevc == '\n'
1918 || (prevc >= 0x20 && prevc <= 0x7E)))
1919 data->rev_text++;
1920 if (prevc == 0x20 && (c == 0x28 || c == 0x29))
1921 data->text++;
1922 if (c == 0x20 && (prevc == 0x28 || prevc == 0x29))
1923 data->rev_text++;
1924 }
1925
1926 data->byteno++;
1927 data->prev_char = c;
1928 }
1929
1930 {
1931 int variance_indicates_big_endian =
1932 (data->text >= 10
1933 && (data->rev_text == 0
1934 || data->text / data->rev_text >= 10));
1935 int variance_indicates_little_endian =
1936 (data->rev_text >= 10
1937 && (data->text == 0
1938 || data->rev_text / data->text >= 10));
1939
1940 if (data->seen_ffff)
1941 SET_DET_RESULTS (st, utf_16, DET_NEARLY_IMPOSSIBLE);
1942 else if (data->seen_forward_bom)
1943 {
1944 SET_DET_RESULTS (st, utf_16, DET_NEARLY_IMPOSSIBLE);
1945 if (variance_indicates_big_endian)
1946 DET_RESULT (st, utf_16_bom) = DET_NEAR_CERTAINTY;
1947 else if (variance_indicates_little_endian)
1948 DET_RESULT (st, utf_16_bom) = DET_SOMEWHAT_LIKELY;
1949 else
1950 DET_RESULT (st, utf_16_bom) = DET_QUITE_PROBABLE;
1951 }
1952 else if (data->seen_forward_bom)
1953 {
1954 SET_DET_RESULTS (st, utf_16, DET_NEARLY_IMPOSSIBLE);
1955 if (variance_indicates_big_endian)
1956 DET_RESULT (st, utf_16_bom) = DET_NEAR_CERTAINTY;
1957 else if (variance_indicates_little_endian)
1958 /* #### may need to rethink */
1959 DET_RESULT (st, utf_16_bom) = DET_SOMEWHAT_LIKELY;
1960 else
1961 /* #### may need to rethink */
1962 DET_RESULT (st, utf_16_bom) = DET_QUITE_PROBABLE;
1963 }
1964 else if (data->seen_rev_bom)
1965 {
1966 SET_DET_RESULTS (st, utf_16, DET_NEARLY_IMPOSSIBLE);
1967 if (variance_indicates_little_endian)
1968 DET_RESULT (st, utf_16_little_endian_bom) = DET_NEAR_CERTAINTY;
1969 else if (variance_indicates_big_endian)
1970 /* #### may need to rethink */
1971 DET_RESULT (st, utf_16_little_endian_bom) = DET_SOMEWHAT_LIKELY;
1972 else
1973 /* #### may need to rethink */
1974 DET_RESULT (st, utf_16_little_endian_bom) = DET_QUITE_PROBABLE;
1975 }
1976 else if (variance_indicates_big_endian)
1977 {
1978 SET_DET_RESULTS (st, utf_16, DET_NEARLY_IMPOSSIBLE);
1979 DET_RESULT (st, utf_16) = DET_SOMEWHAT_LIKELY;
1980 DET_RESULT (st, utf_16_little_endian) = DET_SOMEWHAT_UNLIKELY;
1981 }
1982 else if (variance_indicates_little_endian)
1983 {
1984 SET_DET_RESULTS (st, utf_16, DET_NEARLY_IMPOSSIBLE);
1985 DET_RESULT (st, utf_16) = DET_SOMEWHAT_UNLIKELY;
1986 DET_RESULT (st, utf_16_little_endian) = DET_SOMEWHAT_LIKELY;
1987 }
1988 else
1989 SET_DET_RESULTS (st, utf_16, DET_AS_LIKELY_AS_UNLIKELY);
1990 }
1991 }
1992
1993 struct utf_8_detector
1994 {
1995 int in_utf_8_byte;
1996 };
1997
1998 static void
1999 utf_8_detect (struct detection_state *st, const UExtbyte *src,
2000 Bytecount n)
2001 {
2002 struct utf_8_detector *data = DETECTION_STATE_DATA (st, utf_8);
2003
2004 while (n--)
2005 {
2006 UExtbyte c = *src++;
2007 switch (data->in_utf_8_byte)
2008 {
2009 case 0:
2010 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2011 {
2012 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY;
2013 return;
2014 }
2015 else if (c >= 0xfc)
2016 data->in_utf_8_byte = 5;
2017 else if (c >= 0xf8)
2018 data->in_utf_8_byte = 4;
2019 else if (c >= 0xf0)
2020 data->in_utf_8_byte = 3;
2021 else if (c >= 0xe0)
2022 data->in_utf_8_byte = 2;
2023 else if (c >= 0xc0)
2024 data->in_utf_8_byte = 1;
2025 else if (c >= 0x80)
2026 {
2027 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY;
2028 return;
2029 }
2030 break;
2031 default:
2032 if ((c & 0xc0) != 0x80)
2033 {
2034 DET_RESULT (st, utf_8) = DET_SOMEWHAT_UNLIKELY;
2035 return;
2036 }
2037 else
2038 data->in_utf_8_byte--;
2039 }
2040 }
2041 DET_RESULT (st, utf_8) = DET_SOMEWHAT_LIKELY;
2042 }
2043
2044 static void
2045 unicode_init_coding_stream (struct coding_stream *str)
2046 {
2047 struct unicode_coding_stream *data =
2048 CODING_STREAM_TYPE_DATA (str, unicode);
2049 xzero (*data);
2050 data->current_charset = Qnil;
2051 }
2052
2053 static void
2054 unicode_rewind_coding_stream (struct coding_stream *str)
2055 {
2056 unicode_init_coding_stream (str);
2057 }
2058
2059 static int
2060 unicode_putprop (Lisp_Object codesys, Lisp_Object key, Lisp_Object value)
2061 {
2062 if (EQ (key, Qtype))
2063 {
2064 enum unicode_type type;
2065
2066 if (EQ (value, Qutf_8))
2067 type = UNICODE_UTF_8;
2068 else if (EQ (value, Qutf_16))
2069 type = UNICODE_UTF_16;
2070 else if (EQ (value, Qutf_7))
2071 type = UNICODE_UTF_7;
2072 else if (EQ (value, Qucs_4))
2073 type = UNICODE_UCS_4;
2074 else
2075 invalid_constant ("Invalid Unicode type", key);
2076
2077 XCODING_SYSTEM_UNICODE_TYPE (codesys) = type;
2078 }
2079 else if (EQ (key, Qlittle_endian))
2080 XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN (codesys) = !NILP (value);
2081 else if (EQ (key, Qneed_bom))
2082 XCODING_SYSTEM_UNICODE_NEED_BOM (codesys) = !NILP (value);
2083 else
2084 return 0;
2085 return 1;
2086 }
2087
2088 static Lisp_Object
2089 unicode_getprop (Lisp_Object coding_system, Lisp_Object prop)
2090 {
2091 if (EQ (prop, Qtype))
2092 {
2093 switch (XCODING_SYSTEM_UNICODE_TYPE (coding_system))
2094 {
2095 case UNICODE_UTF_16: return Qutf_16;
2096 case UNICODE_UTF_8: return Qutf_8;
2097 case UNICODE_UTF_7: return Qutf_7;
2098 case UNICODE_UCS_4: return Qucs_4;
2099 default: abort ();
2100 }
2101 }
2102 else if (EQ (prop, Qlittle_endian))
2103 return XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN (coding_system) ? Qt : Qnil;
2104 else if (EQ (prop, Qneed_bom))
2105 return XCODING_SYSTEM_UNICODE_NEED_BOM (coding_system) ? Qt : Qnil;
2106 return Qunbound;
2107 }
2108
2109 static void
2110 unicode_print (Lisp_Object cs, Lisp_Object printcharfun, int escapeflag)
2111 {
2112 write_c_string ("(", printcharfun);
2113 print_internal (unicode_getprop (cs, Qtype), printcharfun, 0);
2114 if (XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN (cs))
2115 write_c_string (", little-endian", printcharfun);
2116 if (XCODING_SYSTEM_UNICODE_NEED_BOM (cs))
2117 write_c_string (", need-bom", printcharfun);
2118 write_c_string (")", printcharfun);
2119 }
2120
2121 int
2122 dfc_coding_system_is_unicode (Lisp_Object codesys)
2123 {
2124 #ifdef HAVE_WIN32_CODING_SYSTEMS
2125 codesys = Fget_coding_system (codesys);
2126 return (EQ (XCODING_SYSTEM_TYPE (codesys), Qunicode) &&
2127 XCODING_SYSTEM_UNICODE_TYPE (codesys) == UNICODE_UTF_16 &&
2128 XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN (codesys));
2129
2130 #else
2131 return 0;
2132 #endif
2133 }
2134
2135
2136 /************************************************************************/
2137 /* Initialization */
2138 /************************************************************************/
2139
2140 void
2141 syms_of_unicode (void)
2142 {
2143 #ifdef MULE
2144 DEFSUBR (Fset_language_unicode_precedence_list);
2145 DEFSUBR (Flanguage_unicode_precedence_list);
2146 DEFSUBR (Fset_default_unicode_precedence_list);
2147 DEFSUBR (Fdefault_unicode_precedence_list);
2148 DEFSUBR (Fset_unicode_conversion);
2149
2150 DEFSUBR (Fparse_unicode_translation_table);
2151
2152 DEFSYMBOL (Qignore_first_column);
2153 #endif /* MULE */
2154
2155 DEFSUBR (Fcharacter_to_unicode);
2156 DEFSUBR (Funicode_to_character);
2157
2158 DEFSYMBOL (Qunicode);
2159 DEFSYMBOL (Qucs_4);
2160 DEFSYMBOL (Qutf_16);
2161 DEFSYMBOL (Qutf_8);
2162 DEFSYMBOL (Qutf_7);
2163
2164 DEFSYMBOL (Qneed_bom);
2165
2166 DEFSYMBOL (Qutf_16);
2167 DEFSYMBOL (Qutf_16_little_endian);
2168 DEFSYMBOL (Qutf_16_bom);
2169 DEFSYMBOL (Qutf_16_little_endian_bom);
2170 }
2171
2172 void
2173 coding_system_type_create_unicode (void)
2174 {
2175 INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA (unicode, "unicode-coding-system-p");
2176 CODING_SYSTEM_HAS_METHOD (unicode, print);
2177 CODING_SYSTEM_HAS_METHOD (unicode, convert);
2178 CODING_SYSTEM_HAS_METHOD (unicode, init_coding_stream);
2179 CODING_SYSTEM_HAS_METHOD (unicode, rewind_coding_stream);
2180 CODING_SYSTEM_HAS_METHOD (unicode, putprop);
2181 CODING_SYSTEM_HAS_METHOD (unicode, getprop);
2182
2183 INITIALIZE_DETECTOR (utf_8);
2184 DETECTOR_HAS_METHOD (utf_8, detect);
2185 INITIALIZE_DETECTOR_CATEGORY (utf_8, utf_8);
2186
2187 INITIALIZE_DETECTOR (ucs_4);
2188 DETECTOR_HAS_METHOD (ucs_4, detect);
2189 INITIALIZE_DETECTOR_CATEGORY (ucs_4, ucs_4);
2190
2191 INITIALIZE_DETECTOR (utf_16);
2192 DETECTOR_HAS_METHOD (utf_16, detect);
2193 INITIALIZE_DETECTOR_CATEGORY (utf_16, utf_16);
2194 INITIALIZE_DETECTOR_CATEGORY (utf_16, utf_16_little_endian);
2195 INITIALIZE_DETECTOR_CATEGORY (utf_16, utf_16_bom);
2196 INITIALIZE_DETECTOR_CATEGORY (utf_16, utf_16_little_endian_bom);
2197 }
2198
2199 void
2200 reinit_coding_system_type_create_unicode (void)
2201 {
2202 REINITIALIZE_CODING_SYSTEM_TYPE (unicode);
2203 }
2204
2205 void
2206 reinit_vars_of_unicode (void)
2207 {
2208 #ifdef MULE
2209 init_blank_unicode_tables ();
2210 #endif /* MULE */
2211 }
2212
2213 void
2214 vars_of_unicode (void)
2215 {
2216 reinit_vars_of_unicode ();
2217
2218 Fprovide (intern ("unicode"));
2219
2220 #ifdef MULE
2221 staticpro (&Vlanguage_unicode_precedence_list);
2222 Vlanguage_unicode_precedence_list = Qnil;
2223
2224 staticpro (&Vdefault_unicode_precedence_list);
2225 Vdefault_unicode_precedence_list = Qnil;
2226
2227 unicode_precedence_dynarr = Dynarr_new (Lisp_Object);
2228 dump_add_root_struct_ptr (&unicode_precedence_dynarr,
2229 &lisp_object_dynarr_description);
2230 #if 0
2231 dump_add_root_thing (&to_unicode_blank_1, to_unicode_level_1_desc);
2232 dump_add_root_thing (&to_unicode_blank_2, to_unicode_level_2_desc);
2233
2234 dump_add_root_thing (&from_unicode_blank_1, from_unicode_level_1_desc);
2235 dump_add_root_thing (&from_unicode_blank_2, from_unicode_level_2_desc);
2236 dump_add_root_thing (&from_unicode_blank_3, from_unicode_level_3_desc);
2237 dump_add_root_thing (&from_unicode_blank_4, from_unicode_level_4_desc);
2238 #endif
2239
2240 #endif /* MULE */
2241 }