Mercurial > hg > xemacs-beta
diff lisp/unicode.el @ 4145:edb00a8b4eff
[xemacs-hg @ 2007-08-26 20:00:29 by aidan]
Generally make the language environments and coding systems a little more sane.
author | aidan |
---|---|
date | Sun, 26 Aug 2007 20:00:42 +0000 |
parents | 1abf84db2c7f |
children | a7c5de5b9880 |
line wrap: on
line diff
--- a/lisp/unicode.el Sat Aug 25 21:51:21 2007 +0000 +++ b/lisp/unicode.el Sun Aug 26 20:00:42 2007 +0000 @@ -144,7 +144,25 @@ (expand-file-name (car args) undir) (cdr args))) (cdr tables)))) - parse-args))) + parse-args) + ;; The default-unicode-precedence-list. We set this here to default to + ;; *not* mapping various European characters to East Asian characters; + ;; otherwise the default-unicode-precedence-list is numerically ordered + ;; by charset ID. + (set-default-unicode-precedence-list + '(ascii control-1 latin-iso8859-1 latin-iso8859-2 latin-iso8859-15 + greek-iso8859-7 hebrew-iso8859-8 ipa cyrillic-iso8859-5 + latin-iso8859-16 latin-iso8859-3 latin-iso8859-4 latin-iso8859-9 + vietnamese-viscii-lower vietnamese-viscii-upper arabic-iso8859-6 + jit-ucs-charset-0 japanese-jisx0208 japanese-jisx0208-1978 + japanese-jisx0212 japanese-jisx0213-1 japanese-jisx0213-2 + chinese-gb2312 chinese-sisheng chinese-big5-1 chinese-big5-2 + indian-is13194 korean-ksc5601 chinese-cns11643-1 chinese-cns11643-2 + chinese-isoir165 arabic-1-column arabic-2-column arabic-digit + composite ethiopic indian-1-column indian-2-column jit-ucs-charset-0 + katakana-jisx0201 lao thai-tis620 thai-xtis tibetan tibetan-1-column + latin-jisx0201 chinese-cns11643-3 chinese-cns11643-4 + chinese-cns11643-5 chinese-cns11643-6 chinese-cns11643-7)))) (make-coding-system 'utf-16 'unicode @@ -319,8 +337,8 @@ ;; one), we can't compile the program at dump time. We can ;; check at byte compile time that the program is as ;; expected, though. - [1 16 131127 7 98872 65823 1307 5 -65536 65313 64833 1028 147513 - 8 82009 255 22])) + [1 16 131127 7 98872 65823 1307 5 -65536 65313 64833 1028 + 147513 8 82009 255 22])) (when (featurep 'mule) ;; Check that the pre-existing constant reflects the intended ;; CCL program. @@ -350,7 +368,169 @@ (defconst ccl-encode-to-ucs-2 prog "CCL program to transform Mule characters to UCS-2.") (put 'ccl-encode-to-ucs-2 'ccl-program-idx - (register-ccl-program 'ccl-encode-to-ucs-2 prog)))) + (register-ccl-program 'ccl-encode-to-ucs-2 prog))) + + ;; Now, create jit-ucs-charset-0 entries for those characters in Windows + ;; Glyph List 4 that would otherwise end up in East Asian character sets. + ;; + ;; WGL4 is a character repertoire from Microsoft that gives a guideline + ;; for font implementors as to what characters are sufficient for + ;; pan-European support. The intention of this code is to avoid the + ;; situation where these characters end up mapping to East Asian XEmacs + ;; characters, which generally clash strongly with European characters + ;; both in font choice and character width; jit-ucs-charset-0 is a + ;; single-width character set which comes before the East Asian character + ;; sets in the default-unicode-precedence-list above. + (loop for (ucs ascii-or-latin-1) + in '((#x2013 ?-) ;; U+2013 EN DASH + (#x2014 ?-) ;; U+2014 EM DASH + (#x2105 ?%) ;; U+2105 CARE OF + (#x203e ?-) ;; U+203E OVERLINE + (#x221f ?|) ;; U+221F RIGHT ANGLE + (#x2584 ?|) ;; U+2584 LOWER HALF BLOCK + (#x2588 ?|) ;; U+2588 FULL BLOCK + (#x258c ?|) ;; U+258C LEFT HALF BLOCK + (#x2550 ?|) ;; U+2550 BOX DRAWINGS DOUBLE HORIZONTAL + (#x255e ?|) ;; U+255E BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE + (#x256a ?|) ;; U+256A BOX DRAWINGS VERTICAL SINGLE & HORIZONTAL DOUBLE + (#x2561 ?|) ;; U+2561 BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE + (#x2215 ?/) ;; U+2215 DIVISION SLASH + (#x02c9 ?`) ;; U+02C9 MODIFIER LETTER MACRON + (#x2211 ?s) ;; U+2211 N-ARY SUMMATION + (#x220f ?s) ;; U+220F N-ARY PRODUCT + (#x2248 ?=) ;; U+2248 ALMOST EQUAL TO + (#x2264 ?=) ;; U+2264 LESS-THAN OR EQUAL TO + (#x2265 ?=) ;; U+2265 GREATER-THAN OR EQUAL TO + (#x201c ?') ;; U+201C LEFT DOUBLE QUOTATION MARK + (#x2026 ?.) ;; U+2026 HORIZONTAL ELLIPSIS + (#x2212 ?-) ;; U+2212 MINUS SIGN + (#x2260 ?=) ;; U+2260 NOT EQUAL TO + (#x221e ?=) ;; U+221E INFINITY + (#x2642 ?=) ;; U+2642 MALE SIGN + (#x2640 ?=) ;; U+2640 FEMALE SIGN + (#x2032 ?=) ;; U+2032 PRIME + (#x2033 ?=) ;; U+2033 DOUBLE PRIME + (#x25cb ?=) ;; U+25CB WHITE CIRCLE + (#x25cf ?=) ;; U+25CF BLACK CIRCLE + (#x25a1 ?=) ;; U+25A1 WHITE SQUARE + (#x25a0 ?=) ;; U+25A0 BLACK SQUARE + (#x25b2 ?=) ;; U+25B2 BLACK UP-POINTING TRIANGLE + (#x25bc ?=) ;; U+25BC BLACK DOWN-POINTING TRIANGLE + (#x2192 ?=) ;; U+2192 RIGHTWARDS ARROW + (#x2190 ?=) ;; U+2190 LEFTWARDS ARROW + (#x2191 ?=) ;; U+2191 UPWARDS ARROW + (#x2193 ?=) ;; U+2193 DOWNWARDS ARROW + (#x2229 ?=) ;; U+2229 INTERSECTION + (#x2202 ?=) ;; U+2202 PARTIAL DIFFERENTIAL + (#x2261 ?=) ;; U+2261 IDENTICAL TO + (#x221a ?=) ;; U+221A SQUARE ROOT + (#x222b ?=) ;; U+222B INTEGRAL + (#x2030 ?=) ;; U+2030 PER MILLE SIGN + (#x266a ?=) ;; U+266A EIGHTH NOTE + (#x2020 ?*) ;; U+2020 DAGGER + (#x2021 ?*) ;; U+2021 DOUBLE DAGGER + (#x2500 ?|) ;; U+2500 BOX DRAWINGS LIGHT HORIZONTAL + (#x2502 ?|) ;; U+2502 BOX DRAWINGS LIGHT VERTICAL + (#x250c ?|) ;; U+250C BOX DRAWINGS LIGHT DOWN AND RIGHT + (#x2510 ?|) ;; U+2510 BOX DRAWINGS LIGHT DOWN AND LEFT + (#x2518 ?|) ;; U+2518 BOX DRAWINGS LIGHT UP AND LEFT + (#x2514 ?|) ;; U+2514 BOX DRAWINGS LIGHT UP AND RIGHT + (#x251c ?|) ;; U+251C BOX DRAWINGS LIGHT VERTICAL AND RIGHT + (#x252c ?|) ;; U+252C BOX DRAWINGS LIGHT DOWN AND HORIZONTAL + (#x2524 ?|) ;; U+2524 BOX DRAWINGS LIGHT VERTICAL AND LEFT + (#x2534 ?|) ;; U+2534 BOX DRAWINGS LIGHT UP AND HORIZONTAL + (#x253c ?|) ;; U+253C BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL + (#x02da ?^) ;; U+02DA RING ABOVE + (#x2122 ?\xa9) ;; U+2122 TRADE MARK SIGN, ?,A)(B + + (#x0132 ?\xe6) ;; U+0132 LATIN CAPITAL LIGATURE IJ, ?,Af(B + (#x013f ?\xe6) ;; U+013F LATIN CAPITAL LETTER L WITH MIDDLE DOT, ?,Af(B + + (#x0133 ?\xe6) ;; U+0133 LATIN SMALL LIGATURE IJ, ?,Af(B + (#x0140 ?\xe6) ;; U+0140 LATIN SMALL LETTER L WITH MIDDLE DOT, ?,Af(B + (#x0149 ?\xe6) ;; U+0149 LATIN SMALL LETTER N PRECEDED BY APOSTROPH,?,Af(B + + (#x2194 ?|) ;; U+2194 LEFT RIGHT ARROW + (#x2660 ?*) ;; U+2660 BLACK SPADE SUIT + (#x2665 ?*) ;; U+2665 BLACK HEART SUIT + (#x2663 ?*) ;; U+2663 BLACK CLUB SUIT + (#x2592 ?|) ;; U+2592 MEDIUM SHADE + (#x2195 ?|) ;; U+2195 UP DOWN ARROW + + (#x2113 ?\xb9) ;; U+2113 SCRIPT SMALL L, ?,A9(B + (#x215b ?\xbe) ;; U+215B VULGAR FRACTION ONE EIGHTH, ?,A>(B + (#x215c ?\xbe) ;; U+215C VULGAR FRACTION THREE EIGHTHS, ?,A>(B + (#x215d ?\xbe) ;; U+215D VULGAR FRACTION FIVE EIGHTHS, ?,A>(B + (#x215e ?\xbe) ;; U+215E VULGAR FRACTION SEVEN EIGHTHS, ?,A>(B + (#x207f ?\xbe) ;; U+207F SUPERSCRIPT LATIN SMALL LETTER N, ?,A>(B + + ;; These are not in WGL 4, but are IPA characters that should not + ;; be double width. They are the only IPA characters that both + ;; occur in packages/mule-packages/leim/ipa.el and end up in East + ;; Asian character sets when that file is loaded in an XEmacs + ;; without packages. + (#x2197 ?|) ;; U+2197 NORTH EAST ARROW + (#x2199 ?|) ;; U+2199 SOUTH WEST ARROW + (#x2191 ?|) ;; U+2191 UPWARDS ARROW + (#x207f ?\xb9));; U+207F SUPERSCRIPT LATIN SMALL LETTER N, ?,A9(B + with decoded = nil + with syntax-table = (standard-syntax-table) + ;; This creates jit-ucs-charset-0 entries because: + ;; + ;; 1. If the tables are dumped, it is run at dump time before they are + ;; dumped, and as such before the relevant conversions are available + ;; (they are made available in mule/general-late.el). + ;; + ;; 2. If the tables are not dumped, it is run at dump time, long before + ;; any of the other mappings are available. + ;; + do + (setq decoded (decode-char 'ucs ucs)) + (assert (eq (char-charset decoded) + 'jit-ucs-charset-0) nil + "Unexpected Unicode decoding behavior. ") + (modify-syntax-entry decoded + (string + (char-syntax ascii-or-latin-1)) + syntax-table)) + + ;; Create all the Unicode error sequences, normally as jit-ucs-charset-0 + ;; characters starting at U+200000 (which isn't a valid Unicode code + ;; point). + (loop + for i from #x00 to #xFF + ;; #xd800 is the first leading surrogate; trailing surrogates must be in + ;; the range #xdc00-#xdfff. These examples are not, so we intentionally + ;; provoke an error sequence. + do (decode-coding-string (format "\xd8\x00\x01%c" i) 'utf-16-be)) + + ;; Make them available to user code. + (defvar unicode-error-sequence-zero + (aref (decode-coding-string "\xd8\x00\x01\x00" 'utf-16-be) 3) + "The XEmacs character representing an invalid zero octet in Unicode. + +Subtract this character from each XEmacs character in an invalid sequence to +get the octet on disk. E.g. + +\(- (aref (decode-coding-string ?\\x80 'utf-8) 0) + unicode-error-characters-zero) +=> ?\\x80 + +You can search for invalid sequences using +`unicode-error-sequence-regexp-range', which see. ") + + (defvar unicode-error-sequence-regexp-range + (format "%c-%c" + (aref (decode-coding-string "\xd8\x00\x01\x00" 'utf-16-be) 3) + (aref (decode-coding-string "\xd8\x00\x01\xFF" 'utf-16-be) 3)) + "Regular expression range to match Unicode error sequences in XEmacs. + +Invalid Unicode sequences on input are represented as XEmacs characters with +values starting at `unicode-error-sequence-zero', one character for each +invalid octet. Use this variable (with `re-search-forward' or +`skip-chars-forward') to search for such characters; use +`unicode-error-sequence-zero' from such characters to get a character +corresponding to the octet on disk. ")) ;; #### UTF-7 is not yet implemented, and it's tricky to do. There's ;; an implementation in appendix A.1 of the Unicode Standard, Version