comparison lisp/unicode.el @ 4145:edb00a8b4eff

[xemacs-hg @ 2007-08-26 20:00:29 by aidan] Generally make the language environments and coding systems a little more sane.
author aidan
date Sun, 26 Aug 2007 20:00:42 +0000
parents 1abf84db2c7f
children a7c5de5b9880
comparison
equal deleted inserted replaced
4144:4a08a9219456 4145:edb00a8b4eff
142 (mapcar #'(lambda (args) 142 (mapcar #'(lambda (args)
143 (apply 'load-unicode-mapping-table 143 (apply 'load-unicode-mapping-table
144 (expand-file-name (car args) undir) 144 (expand-file-name (car args) undir)
145 (cdr args))) 145 (cdr args)))
146 (cdr tables)))) 146 (cdr tables))))
147 parse-args))) 147 parse-args)
148 ;; The default-unicode-precedence-list. We set this here to default to
149 ;; *not* mapping various European characters to East Asian characters;
150 ;; otherwise the default-unicode-precedence-list is numerically ordered
151 ;; by charset ID.
152 (set-default-unicode-precedence-list
153 '(ascii control-1 latin-iso8859-1 latin-iso8859-2 latin-iso8859-15
154 greek-iso8859-7 hebrew-iso8859-8 ipa cyrillic-iso8859-5
155 latin-iso8859-16 latin-iso8859-3 latin-iso8859-4 latin-iso8859-9
156 vietnamese-viscii-lower vietnamese-viscii-upper arabic-iso8859-6
157 jit-ucs-charset-0 japanese-jisx0208 japanese-jisx0208-1978
158 japanese-jisx0212 japanese-jisx0213-1 japanese-jisx0213-2
159 chinese-gb2312 chinese-sisheng chinese-big5-1 chinese-big5-2
160 indian-is13194 korean-ksc5601 chinese-cns11643-1 chinese-cns11643-2
161 chinese-isoir165 arabic-1-column arabic-2-column arabic-digit
162 composite ethiopic indian-1-column indian-2-column jit-ucs-charset-0
163 katakana-jisx0201 lao thai-tis620 thai-xtis tibetan tibetan-1-column
164 latin-jisx0201 chinese-cns11643-3 chinese-cns11643-4
165 chinese-cns11643-5 chinese-cns11643-6 chinese-cns11643-7))))
148 166
149 (make-coding-system 167 (make-coding-system
150 'utf-16 'unicode 168 'utf-16 'unicode
151 "UTF-16" 169 "UTF-16"
152 '(mnemonic "UTF-16" 170 '(mnemonic "UTF-16"
317 ;; below. Since this file is dumped and ccl.el isn't (and 335 ;; below. Since this file is dumped and ccl.el isn't (and
318 ;; even when it was, it was dumped much later than this 336 ;; even when it was, it was dumped much later than this
319 ;; one), we can't compile the program at dump time. We can 337 ;; one), we can't compile the program at dump time. We can
320 ;; check at byte compile time that the program is as 338 ;; check at byte compile time that the program is as
321 ;; expected, though. 339 ;; expected, though.
322 [1 16 131127 7 98872 65823 1307 5 -65536 65313 64833 1028 147513 340 [1 16 131127 7 98872 65823 1307 5 -65536 65313 64833 1028
323 8 82009 255 22])) 341 147513 8 82009 255 22]))
324 (when (featurep 'mule) 342 (when (featurep 'mule)
325 ;; Check that the pre-existing constant reflects the intended 343 ;; Check that the pre-existing constant reflects the intended
326 ;; CCL program. 344 ;; CCL program.
327 (assert 345 (assert
328 (equal pre-existing 346 (equal pre-existing
348 "The pre-compiled CCL program appears broken. ")) 366 "The pre-compiled CCL program appears broken. "))
349 pre-existing)))) 367 pre-existing))))
350 (defconst ccl-encode-to-ucs-2 prog 368 (defconst ccl-encode-to-ucs-2 prog
351 "CCL program to transform Mule characters to UCS-2.") 369 "CCL program to transform Mule characters to UCS-2.")
352 (put 'ccl-encode-to-ucs-2 'ccl-program-idx 370 (put 'ccl-encode-to-ucs-2 'ccl-program-idx
353 (register-ccl-program 'ccl-encode-to-ucs-2 prog)))) 371 (register-ccl-program 'ccl-encode-to-ucs-2 prog)))
372
373 ;; Now, create jit-ucs-charset-0 entries for those characters in Windows
374 ;; Glyph List 4 that would otherwise end up in East Asian character sets.
375 ;;
376 ;; WGL4 is a character repertoire from Microsoft that gives a guideline
377 ;; for font implementors as to what characters are sufficient for
378 ;; pan-European support. The intention of this code is to avoid the
379 ;; situation where these characters end up mapping to East Asian XEmacs
380 ;; characters, which generally clash strongly with European characters
381 ;; both in font choice and character width; jit-ucs-charset-0 is a
382 ;; single-width character set which comes before the East Asian character
383 ;; sets in the default-unicode-precedence-list above.
384 (loop for (ucs ascii-or-latin-1)
385 in '((#x2013 ?-) ;; U+2013 EN DASH
386 (#x2014 ?-) ;; U+2014 EM DASH
387 (#x2105 ?%) ;; U+2105 CARE OF
388 (#x203e ?-) ;; U+203E OVERLINE
389 (#x221f ?|) ;; U+221F RIGHT ANGLE
390 (#x2584 ?|) ;; U+2584 LOWER HALF BLOCK
391 (#x2588 ?|) ;; U+2588 FULL BLOCK
392 (#x258c ?|) ;; U+258C LEFT HALF BLOCK
393 (#x2550 ?|) ;; U+2550 BOX DRAWINGS DOUBLE HORIZONTAL
394 (#x255e ?|) ;; U+255E BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE
395 (#x256a ?|) ;; U+256A BOX DRAWINGS VERTICAL SINGLE & HORIZONTAL DOUBLE
396 (#x2561 ?|) ;; U+2561 BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE
397 (#x2215 ?/) ;; U+2215 DIVISION SLASH
398 (#x02c9 ?`) ;; U+02C9 MODIFIER LETTER MACRON
399 (#x2211 ?s) ;; U+2211 N-ARY SUMMATION
400 (#x220f ?s) ;; U+220F N-ARY PRODUCT
401 (#x2248 ?=) ;; U+2248 ALMOST EQUAL TO
402 (#x2264 ?=) ;; U+2264 LESS-THAN OR EQUAL TO
403 (#x2265 ?=) ;; U+2265 GREATER-THAN OR EQUAL TO
404 (#x201c ?') ;; U+201C LEFT DOUBLE QUOTATION MARK
405 (#x2026 ?.) ;; U+2026 HORIZONTAL ELLIPSIS
406 (#x2212 ?-) ;; U+2212 MINUS SIGN
407 (#x2260 ?=) ;; U+2260 NOT EQUAL TO
408 (#x221e ?=) ;; U+221E INFINITY
409 (#x2642 ?=) ;; U+2642 MALE SIGN
410 (#x2640 ?=) ;; U+2640 FEMALE SIGN
411 (#x2032 ?=) ;; U+2032 PRIME
412 (#x2033 ?=) ;; U+2033 DOUBLE PRIME
413 (#x25cb ?=) ;; U+25CB WHITE CIRCLE
414 (#x25cf ?=) ;; U+25CF BLACK CIRCLE
415 (#x25a1 ?=) ;; U+25A1 WHITE SQUARE
416 (#x25a0 ?=) ;; U+25A0 BLACK SQUARE
417 (#x25b2 ?=) ;; U+25B2 BLACK UP-POINTING TRIANGLE
418 (#x25bc ?=) ;; U+25BC BLACK DOWN-POINTING TRIANGLE
419 (#x2192 ?=) ;; U+2192 RIGHTWARDS ARROW
420 (#x2190 ?=) ;; U+2190 LEFTWARDS ARROW
421 (#x2191 ?=) ;; U+2191 UPWARDS ARROW
422 (#x2193 ?=) ;; U+2193 DOWNWARDS ARROW
423 (#x2229 ?=) ;; U+2229 INTERSECTION
424 (#x2202 ?=) ;; U+2202 PARTIAL DIFFERENTIAL
425 (#x2261 ?=) ;; U+2261 IDENTICAL TO
426 (#x221a ?=) ;; U+221A SQUARE ROOT
427 (#x222b ?=) ;; U+222B INTEGRAL
428 (#x2030 ?=) ;; U+2030 PER MILLE SIGN
429 (#x266a ?=) ;; U+266A EIGHTH NOTE
430 (#x2020 ?*) ;; U+2020 DAGGER
431 (#x2021 ?*) ;; U+2021 DOUBLE DAGGER
432 (#x2500 ?|) ;; U+2500 BOX DRAWINGS LIGHT HORIZONTAL
433 (#x2502 ?|) ;; U+2502 BOX DRAWINGS LIGHT VERTICAL
434 (#x250c ?|) ;; U+250C BOX DRAWINGS LIGHT DOWN AND RIGHT
435 (#x2510 ?|) ;; U+2510 BOX DRAWINGS LIGHT DOWN AND LEFT
436 (#x2518 ?|) ;; U+2518 BOX DRAWINGS LIGHT UP AND LEFT
437 (#x2514 ?|) ;; U+2514 BOX DRAWINGS LIGHT UP AND RIGHT
438 (#x251c ?|) ;; U+251C BOX DRAWINGS LIGHT VERTICAL AND RIGHT
439 (#x252c ?|) ;; U+252C BOX DRAWINGS LIGHT DOWN AND HORIZONTAL
440 (#x2524 ?|) ;; U+2524 BOX DRAWINGS LIGHT VERTICAL AND LEFT
441 (#x2534 ?|) ;; U+2534 BOX DRAWINGS LIGHT UP AND HORIZONTAL
442 (#x253c ?|) ;; U+253C BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL
443 (#x02da ?^) ;; U+02DA RING ABOVE
444 (#x2122 ?\xa9) ;; U+2122 TRADE MARK SIGN, ?,A)(B
445
446 (#x0132 ?\xe6) ;; U+0132 LATIN CAPITAL LIGATURE IJ, ?,Af(B
447 (#x013f ?\xe6) ;; U+013F LATIN CAPITAL LETTER L WITH MIDDLE DOT, ?,Af(B
448
449 (#x0133 ?\xe6) ;; U+0133 LATIN SMALL LIGATURE IJ, ?,Af(B
450 (#x0140 ?\xe6) ;; U+0140 LATIN SMALL LETTER L WITH MIDDLE DOT, ?,Af(B
451 (#x0149 ?\xe6) ;; U+0149 LATIN SMALL LETTER N PRECEDED BY APOSTROPH,?,Af(B
452
453 (#x2194 ?|) ;; U+2194 LEFT RIGHT ARROW
454 (#x2660 ?*) ;; U+2660 BLACK SPADE SUIT
455 (#x2665 ?*) ;; U+2665 BLACK HEART SUIT
456 (#x2663 ?*) ;; U+2663 BLACK CLUB SUIT
457 (#x2592 ?|) ;; U+2592 MEDIUM SHADE
458 (#x2195 ?|) ;; U+2195 UP DOWN ARROW
459
460 (#x2113 ?\xb9) ;; U+2113 SCRIPT SMALL L, ?,A9(B
461 (#x215b ?\xbe) ;; U+215B VULGAR FRACTION ONE EIGHTH, ?,A>(B
462 (#x215c ?\xbe) ;; U+215C VULGAR FRACTION THREE EIGHTHS, ?,A>(B
463 (#x215d ?\xbe) ;; U+215D VULGAR FRACTION FIVE EIGHTHS, ?,A>(B
464 (#x215e ?\xbe) ;; U+215E VULGAR FRACTION SEVEN EIGHTHS, ?,A>(B
465 (#x207f ?\xbe) ;; U+207F SUPERSCRIPT LATIN SMALL LETTER N, ?,A>(B
466
467 ;; These are not in WGL 4, but are IPA characters that should not
468 ;; be double width. They are the only IPA characters that both
469 ;; occur in packages/mule-packages/leim/ipa.el and end up in East
470 ;; Asian character sets when that file is loaded in an XEmacs
471 ;; without packages.
472 (#x2197 ?|) ;; U+2197 NORTH EAST ARROW
473 (#x2199 ?|) ;; U+2199 SOUTH WEST ARROW
474 (#x2191 ?|) ;; U+2191 UPWARDS ARROW
475 (#x207f ?\xb9));; U+207F SUPERSCRIPT LATIN SMALL LETTER N, ?,A9(B
476 with decoded = nil
477 with syntax-table = (standard-syntax-table)
478 ;; This creates jit-ucs-charset-0 entries because:
479 ;;
480 ;; 1. If the tables are dumped, it is run at dump time before they are
481 ;; dumped, and as such before the relevant conversions are available
482 ;; (they are made available in mule/general-late.el).
483 ;;
484 ;; 2. If the tables are not dumped, it is run at dump time, long before
485 ;; any of the other mappings are available.
486 ;;
487 do
488 (setq decoded (decode-char 'ucs ucs))
489 (assert (eq (char-charset decoded)
490 'jit-ucs-charset-0) nil
491 "Unexpected Unicode decoding behavior. ")
492 (modify-syntax-entry decoded
493 (string
494 (char-syntax ascii-or-latin-1))
495 syntax-table))
496
497 ;; Create all the Unicode error sequences, normally as jit-ucs-charset-0
498 ;; characters starting at U+200000 (which isn't a valid Unicode code
499 ;; point).
500 (loop
501 for i from #x00 to #xFF
502 ;; #xd800 is the first leading surrogate; trailing surrogates must be in
503 ;; the range #xdc00-#xdfff. These examples are not, so we intentionally
504 ;; provoke an error sequence.
505 do (decode-coding-string (format "\xd8\x00\x01%c" i) 'utf-16-be))
506
507 ;; Make them available to user code.
508 (defvar unicode-error-sequence-zero
509 (aref (decode-coding-string "\xd8\x00\x01\x00" 'utf-16-be) 3)
510 "The XEmacs character representing an invalid zero octet in Unicode.
511
512 Subtract this character from each XEmacs character in an invalid sequence to
513 get the octet on disk. E.g.
514
515 \(- (aref (decode-coding-string ?\\x80 'utf-8) 0)
516 unicode-error-characters-zero)
517 => ?\\x80
518
519 You can search for invalid sequences using
520 `unicode-error-sequence-regexp-range', which see. ")
521
522 (defvar unicode-error-sequence-regexp-range
523 (format "%c-%c"
524 (aref (decode-coding-string "\xd8\x00\x01\x00" 'utf-16-be) 3)
525 (aref (decode-coding-string "\xd8\x00\x01\xFF" 'utf-16-be) 3))
526 "Regular expression range to match Unicode error sequences in XEmacs.
527
528 Invalid Unicode sequences on input are represented as XEmacs characters with
529 values starting at `unicode-error-sequence-zero', one character for each
530 invalid octet. Use this variable (with `re-search-forward' or
531 `skip-chars-forward') to search for such characters; use
532 `unicode-error-sequence-zero' from such characters to get a character
533 corresponding to the octet on disk. "))
354 534
355 ;; #### UTF-7 is not yet implemented, and it's tricky to do. There's 535 ;; #### UTF-7 is not yet implemented, and it's tricky to do. There's
356 ;; an implementation in appendix A.1 of the Unicode Standard, Version 536 ;; an implementation in appendix A.1 of the Unicode Standard, Version
357 ;; 2.0, but I don't know its licensing characteristics. 537 ;; 2.0, but I don't know its licensing characteristics.
358 538