Mercurial > hg > xemacs-beta
comparison lisp/unicode.el @ 4145:edb00a8b4eff
[xemacs-hg @ 2007-08-26 20:00:29 by aidan]
Generally make the language environments and coding systems a little more sane.
author | aidan |
---|---|
date | Sun, 26 Aug 2007 20:00:42 +0000 |
parents | 1abf84db2c7f |
children | a7c5de5b9880 |
comparison
equal
deleted
inserted
replaced
4144:4a08a9219456 | 4145:edb00a8b4eff |
---|---|
142 (mapcar #'(lambda (args) | 142 (mapcar #'(lambda (args) |
143 (apply 'load-unicode-mapping-table | 143 (apply 'load-unicode-mapping-table |
144 (expand-file-name (car args) undir) | 144 (expand-file-name (car args) undir) |
145 (cdr args))) | 145 (cdr args))) |
146 (cdr tables)))) | 146 (cdr tables)))) |
147 parse-args))) | 147 parse-args) |
148 ;; The default-unicode-precedence-list. We set this here to default to | |
149 ;; *not* mapping various European characters to East Asian characters; | |
150 ;; otherwise the default-unicode-precedence-list is numerically ordered | |
151 ;; by charset ID. | |
152 (set-default-unicode-precedence-list | |
153 '(ascii control-1 latin-iso8859-1 latin-iso8859-2 latin-iso8859-15 | |
154 greek-iso8859-7 hebrew-iso8859-8 ipa cyrillic-iso8859-5 | |
155 latin-iso8859-16 latin-iso8859-3 latin-iso8859-4 latin-iso8859-9 | |
156 vietnamese-viscii-lower vietnamese-viscii-upper arabic-iso8859-6 | |
157 jit-ucs-charset-0 japanese-jisx0208 japanese-jisx0208-1978 | |
158 japanese-jisx0212 japanese-jisx0213-1 japanese-jisx0213-2 | |
159 chinese-gb2312 chinese-sisheng chinese-big5-1 chinese-big5-2 | |
160 indian-is13194 korean-ksc5601 chinese-cns11643-1 chinese-cns11643-2 | |
161 chinese-isoir165 arabic-1-column arabic-2-column arabic-digit | |
162 composite ethiopic indian-1-column indian-2-column jit-ucs-charset-0 | |
163 katakana-jisx0201 lao thai-tis620 thai-xtis tibetan tibetan-1-column | |
164 latin-jisx0201 chinese-cns11643-3 chinese-cns11643-4 | |
165 chinese-cns11643-5 chinese-cns11643-6 chinese-cns11643-7)))) | |
148 | 166 |
149 (make-coding-system | 167 (make-coding-system |
150 'utf-16 'unicode | 168 'utf-16 'unicode |
151 "UTF-16" | 169 "UTF-16" |
152 '(mnemonic "UTF-16" | 170 '(mnemonic "UTF-16" |
317 ;; below. Since this file is dumped and ccl.el isn't (and | 335 ;; below. Since this file is dumped and ccl.el isn't (and |
318 ;; even when it was, it was dumped much later than this | 336 ;; even when it was, it was dumped much later than this |
319 ;; one), we can't compile the program at dump time. We can | 337 ;; one), we can't compile the program at dump time. We can |
320 ;; check at byte compile time that the program is as | 338 ;; check at byte compile time that the program is as |
321 ;; expected, though. | 339 ;; expected, though. |
322 [1 16 131127 7 98872 65823 1307 5 -65536 65313 64833 1028 147513 | 340 [1 16 131127 7 98872 65823 1307 5 -65536 65313 64833 1028 |
323 8 82009 255 22])) | 341 147513 8 82009 255 22])) |
324 (when (featurep 'mule) | 342 (when (featurep 'mule) |
325 ;; Check that the pre-existing constant reflects the intended | 343 ;; Check that the pre-existing constant reflects the intended |
326 ;; CCL program. | 344 ;; CCL program. |
327 (assert | 345 (assert |
328 (equal pre-existing | 346 (equal pre-existing |
348 "The pre-compiled CCL program appears broken. ")) | 366 "The pre-compiled CCL program appears broken. ")) |
349 pre-existing)))) | 367 pre-existing)))) |
350 (defconst ccl-encode-to-ucs-2 prog | 368 (defconst ccl-encode-to-ucs-2 prog |
351 "CCL program to transform Mule characters to UCS-2.") | 369 "CCL program to transform Mule characters to UCS-2.") |
352 (put 'ccl-encode-to-ucs-2 'ccl-program-idx | 370 (put 'ccl-encode-to-ucs-2 'ccl-program-idx |
353 (register-ccl-program 'ccl-encode-to-ucs-2 prog)))) | 371 (register-ccl-program 'ccl-encode-to-ucs-2 prog))) |
372 | |
373 ;; Now, create jit-ucs-charset-0 entries for those characters in Windows | |
374 ;; Glyph List 4 that would otherwise end up in East Asian character sets. | |
375 ;; | |
376 ;; WGL4 is a character repertoire from Microsoft that gives a guideline | |
377 ;; for font implementors as to what characters are sufficient for | |
378 ;; pan-European support. The intention of this code is to avoid the | |
379 ;; situation where these characters end up mapping to East Asian XEmacs | |
380 ;; characters, which generally clash strongly with European characters | |
381 ;; both in font choice and character width; jit-ucs-charset-0 is a | |
382 ;; single-width character set which comes before the East Asian character | |
383 ;; sets in the default-unicode-precedence-list above. | |
384 (loop for (ucs ascii-or-latin-1) | |
385 in '((#x2013 ?-) ;; U+2013 EN DASH | |
386 (#x2014 ?-) ;; U+2014 EM DASH | |
387 (#x2105 ?%) ;; U+2105 CARE OF | |
388 (#x203e ?-) ;; U+203E OVERLINE | |
389 (#x221f ?|) ;; U+221F RIGHT ANGLE | |
390 (#x2584 ?|) ;; U+2584 LOWER HALF BLOCK | |
391 (#x2588 ?|) ;; U+2588 FULL BLOCK | |
392 (#x258c ?|) ;; U+258C LEFT HALF BLOCK | |
393 (#x2550 ?|) ;; U+2550 BOX DRAWINGS DOUBLE HORIZONTAL | |
394 (#x255e ?|) ;; U+255E BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE | |
395 (#x256a ?|) ;; U+256A BOX DRAWINGS VERTICAL SINGLE & HORIZONTAL DOUBLE | |
396 (#x2561 ?|) ;; U+2561 BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE | |
397 (#x2215 ?/) ;; U+2215 DIVISION SLASH | |
398 (#x02c9 ?`) ;; U+02C9 MODIFIER LETTER MACRON | |
399 (#x2211 ?s) ;; U+2211 N-ARY SUMMATION | |
400 (#x220f ?s) ;; U+220F N-ARY PRODUCT | |
401 (#x2248 ?=) ;; U+2248 ALMOST EQUAL TO | |
402 (#x2264 ?=) ;; U+2264 LESS-THAN OR EQUAL TO | |
403 (#x2265 ?=) ;; U+2265 GREATER-THAN OR EQUAL TO | |
404 (#x201c ?') ;; U+201C LEFT DOUBLE QUOTATION MARK | |
405 (#x2026 ?.) ;; U+2026 HORIZONTAL ELLIPSIS | |
406 (#x2212 ?-) ;; U+2212 MINUS SIGN | |
407 (#x2260 ?=) ;; U+2260 NOT EQUAL TO | |
408 (#x221e ?=) ;; U+221E INFINITY | |
409 (#x2642 ?=) ;; U+2642 MALE SIGN | |
410 (#x2640 ?=) ;; U+2640 FEMALE SIGN | |
411 (#x2032 ?=) ;; U+2032 PRIME | |
412 (#x2033 ?=) ;; U+2033 DOUBLE PRIME | |
413 (#x25cb ?=) ;; U+25CB WHITE CIRCLE | |
414 (#x25cf ?=) ;; U+25CF BLACK CIRCLE | |
415 (#x25a1 ?=) ;; U+25A1 WHITE SQUARE | |
416 (#x25a0 ?=) ;; U+25A0 BLACK SQUARE | |
417 (#x25b2 ?=) ;; U+25B2 BLACK UP-POINTING TRIANGLE | |
418 (#x25bc ?=) ;; U+25BC BLACK DOWN-POINTING TRIANGLE | |
419 (#x2192 ?=) ;; U+2192 RIGHTWARDS ARROW | |
420 (#x2190 ?=) ;; U+2190 LEFTWARDS ARROW | |
421 (#x2191 ?=) ;; U+2191 UPWARDS ARROW | |
422 (#x2193 ?=) ;; U+2193 DOWNWARDS ARROW | |
423 (#x2229 ?=) ;; U+2229 INTERSECTION | |
424 (#x2202 ?=) ;; U+2202 PARTIAL DIFFERENTIAL | |
425 (#x2261 ?=) ;; U+2261 IDENTICAL TO | |
426 (#x221a ?=) ;; U+221A SQUARE ROOT | |
427 (#x222b ?=) ;; U+222B INTEGRAL | |
428 (#x2030 ?=) ;; U+2030 PER MILLE SIGN | |
429 (#x266a ?=) ;; U+266A EIGHTH NOTE | |
430 (#x2020 ?*) ;; U+2020 DAGGER | |
431 (#x2021 ?*) ;; U+2021 DOUBLE DAGGER | |
432 (#x2500 ?|) ;; U+2500 BOX DRAWINGS LIGHT HORIZONTAL | |
433 (#x2502 ?|) ;; U+2502 BOX DRAWINGS LIGHT VERTICAL | |
434 (#x250c ?|) ;; U+250C BOX DRAWINGS LIGHT DOWN AND RIGHT | |
435 (#x2510 ?|) ;; U+2510 BOX DRAWINGS LIGHT DOWN AND LEFT | |
436 (#x2518 ?|) ;; U+2518 BOX DRAWINGS LIGHT UP AND LEFT | |
437 (#x2514 ?|) ;; U+2514 BOX DRAWINGS LIGHT UP AND RIGHT | |
438 (#x251c ?|) ;; U+251C BOX DRAWINGS LIGHT VERTICAL AND RIGHT | |
439 (#x252c ?|) ;; U+252C BOX DRAWINGS LIGHT DOWN AND HORIZONTAL | |
440 (#x2524 ?|) ;; U+2524 BOX DRAWINGS LIGHT VERTICAL AND LEFT | |
441 (#x2534 ?|) ;; U+2534 BOX DRAWINGS LIGHT UP AND HORIZONTAL | |
442 (#x253c ?|) ;; U+253C BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL | |
443 (#x02da ?^) ;; U+02DA RING ABOVE | |
444 (#x2122 ?\xa9) ;; U+2122 TRADE MARK SIGN, ?,A)(B | |
445 | |
446 (#x0132 ?\xe6) ;; U+0132 LATIN CAPITAL LIGATURE IJ, ?,Af(B | |
447 (#x013f ?\xe6) ;; U+013F LATIN CAPITAL LETTER L WITH MIDDLE DOT, ?,Af(B | |
448 | |
449 (#x0133 ?\xe6) ;; U+0133 LATIN SMALL LIGATURE IJ, ?,Af(B | |
450 (#x0140 ?\xe6) ;; U+0140 LATIN SMALL LETTER L WITH MIDDLE DOT, ?,Af(B | |
451 (#x0149 ?\xe6) ;; U+0149 LATIN SMALL LETTER N PRECEDED BY APOSTROPH,?,Af(B | |
452 | |
453 (#x2194 ?|) ;; U+2194 LEFT RIGHT ARROW | |
454 (#x2660 ?*) ;; U+2660 BLACK SPADE SUIT | |
455 (#x2665 ?*) ;; U+2665 BLACK HEART SUIT | |
456 (#x2663 ?*) ;; U+2663 BLACK CLUB SUIT | |
457 (#x2592 ?|) ;; U+2592 MEDIUM SHADE | |
458 (#x2195 ?|) ;; U+2195 UP DOWN ARROW | |
459 | |
460 (#x2113 ?\xb9) ;; U+2113 SCRIPT SMALL L, ?,A9(B | |
461 (#x215b ?\xbe) ;; U+215B VULGAR FRACTION ONE EIGHTH, ?,A>(B | |
462 (#x215c ?\xbe) ;; U+215C VULGAR FRACTION THREE EIGHTHS, ?,A>(B | |
463 (#x215d ?\xbe) ;; U+215D VULGAR FRACTION FIVE EIGHTHS, ?,A>(B | |
464 (#x215e ?\xbe) ;; U+215E VULGAR FRACTION SEVEN EIGHTHS, ?,A>(B | |
465 (#x207f ?\xbe) ;; U+207F SUPERSCRIPT LATIN SMALL LETTER N, ?,A>(B | |
466 | |
467 ;; These are not in WGL 4, but are IPA characters that should not | |
468 ;; be double width. They are the only IPA characters that both | |
469 ;; occur in packages/mule-packages/leim/ipa.el and end up in East | |
470 ;; Asian character sets when that file is loaded in an XEmacs | |
471 ;; without packages. | |
472 (#x2197 ?|) ;; U+2197 NORTH EAST ARROW | |
473 (#x2199 ?|) ;; U+2199 SOUTH WEST ARROW | |
474 (#x2191 ?|) ;; U+2191 UPWARDS ARROW | |
475 (#x207f ?\xb9));; U+207F SUPERSCRIPT LATIN SMALL LETTER N, ?,A9(B | |
476 with decoded = nil | |
477 with syntax-table = (standard-syntax-table) | |
478 ;; This creates jit-ucs-charset-0 entries because: | |
479 ;; | |
480 ;; 1. If the tables are dumped, it is run at dump time before they are | |
481 ;; dumped, and as such before the relevant conversions are available | |
482 ;; (they are made available in mule/general-late.el). | |
483 ;; | |
484 ;; 2. If the tables are not dumped, it is run at dump time, long before | |
485 ;; any of the other mappings are available. | |
486 ;; | |
487 do | |
488 (setq decoded (decode-char 'ucs ucs)) | |
489 (assert (eq (char-charset decoded) | |
490 'jit-ucs-charset-0) nil | |
491 "Unexpected Unicode decoding behavior. ") | |
492 (modify-syntax-entry decoded | |
493 (string | |
494 (char-syntax ascii-or-latin-1)) | |
495 syntax-table)) | |
496 | |
497 ;; Create all the Unicode error sequences, normally as jit-ucs-charset-0 | |
498 ;; characters starting at U+200000 (which isn't a valid Unicode code | |
499 ;; point). | |
500 (loop | |
501 for i from #x00 to #xFF | |
502 ;; #xd800 is the first leading surrogate; trailing surrogates must be in | |
503 ;; the range #xdc00-#xdfff. These examples are not, so we intentionally | |
504 ;; provoke an error sequence. | |
505 do (decode-coding-string (format "\xd8\x00\x01%c" i) 'utf-16-be)) | |
506 | |
507 ;; Make them available to user code. | |
508 (defvar unicode-error-sequence-zero | |
509 (aref (decode-coding-string "\xd8\x00\x01\x00" 'utf-16-be) 3) | |
510 "The XEmacs character representing an invalid zero octet in Unicode. | |
511 | |
512 Subtract this character from each XEmacs character in an invalid sequence to | |
513 get the octet on disk. E.g. | |
514 | |
515 \(- (aref (decode-coding-string ?\\x80 'utf-8) 0) | |
516 unicode-error-characters-zero) | |
517 => ?\\x80 | |
518 | |
519 You can search for invalid sequences using | |
520 `unicode-error-sequence-regexp-range', which see. ") | |
521 | |
522 (defvar unicode-error-sequence-regexp-range | |
523 (format "%c-%c" | |
524 (aref (decode-coding-string "\xd8\x00\x01\x00" 'utf-16-be) 3) | |
525 (aref (decode-coding-string "\xd8\x00\x01\xFF" 'utf-16-be) 3)) | |
526 "Regular expression range to match Unicode error sequences in XEmacs. | |
527 | |
528 Invalid Unicode sequences on input are represented as XEmacs characters with | |
529 values starting at `unicode-error-sequence-zero', one character for each | |
530 invalid octet. Use this variable (with `re-search-forward' or | |
531 `skip-chars-forward') to search for such characters; use | |
532 `unicode-error-sequence-zero' from such characters to get a character | |
533 corresponding to the octet on disk. ")) | |
354 | 534 |
355 ;; #### UTF-7 is not yet implemented, and it's tricky to do. There's | 535 ;; #### UTF-7 is not yet implemented, and it's tricky to do. There's |
356 ;; an implementation in appendix A.1 of the Unicode Standard, Version | 536 ;; an implementation in appendix A.1 of the Unicode Standard, Version |
357 ;; 2.0, but I don't know its licensing characteristics. | 537 ;; 2.0, but I don't know its licensing characteristics. |
358 | 538 |