diff lisp/unicode.el @ 4145:edb00a8b4eff

[xemacs-hg @ 2007-08-26 20:00:29 by aidan] Generally make the language environments and coding systems a little more sane.
author aidan
date Sun, 26 Aug 2007 20:00:42 +0000
parents 1abf84db2c7f
children a7c5de5b9880
line wrap: on
line diff
--- a/lisp/unicode.el	Sat Aug 25 21:51:21 2007 +0000
+++ b/lisp/unicode.el	Sun Aug 26 20:00:42 2007 +0000
@@ -144,7 +144,25 @@
 				     (expand-file-name (car args) undir)
 				     (cdr args)))
 			  (cdr tables))))
-	    parse-args)))
+	    parse-args)
+    ;; The default-unicode-precedence-list. We set this here to default to
+    ;; *not* mapping various European characters to East Asian characters;
+    ;; otherwise the default-unicode-precedence-list is numerically ordered
+    ;; by charset ID.
+    (set-default-unicode-precedence-list
+     '(ascii control-1 latin-iso8859-1 latin-iso8859-2 latin-iso8859-15
+       greek-iso8859-7 hebrew-iso8859-8 ipa cyrillic-iso8859-5
+       latin-iso8859-16 latin-iso8859-3 latin-iso8859-4 latin-iso8859-9
+       vietnamese-viscii-lower vietnamese-viscii-upper arabic-iso8859-6
+       jit-ucs-charset-0 japanese-jisx0208 japanese-jisx0208-1978
+       japanese-jisx0212 japanese-jisx0213-1 japanese-jisx0213-2
+       chinese-gb2312 chinese-sisheng chinese-big5-1 chinese-big5-2
+       indian-is13194 korean-ksc5601 chinese-cns11643-1 chinese-cns11643-2
+       chinese-isoir165 arabic-1-column arabic-2-column arabic-digit
+       composite ethiopic indian-1-column indian-2-column jit-ucs-charset-0
+       katakana-jisx0201 lao thai-tis620 thai-xtis tibetan tibetan-1-column
+       latin-jisx0201 chinese-cns11643-3 chinese-cns11643-4
+       chinese-cns11643-5 chinese-cns11643-6 chinese-cns11643-7))))
 
 (make-coding-system
  'utf-16 'unicode
@@ -319,8 +337,8 @@
                   ;; one), we can't compile the program at dump time. We can
                   ;; check at byte compile time that the program is as
                   ;; expected, though.
-                  [1 16 131127 7 98872 65823 1307 5 -65536 65313 64833 1028 147513
-                     8 82009 255 22]))
+                  [1 16 131127 7 98872 65823 1307 5 -65536 65313 64833 1028
+                     147513 8 82009 255 22]))
              (when (featurep 'mule)
                ;; Check that the pre-existing constant reflects the intended
                ;; CCL program.
@@ -350,7 +368,169 @@
     (defconst ccl-encode-to-ucs-2 prog
       "CCL program to transform Mule characters to UCS-2.")
     (put 'ccl-encode-to-ucs-2 'ccl-program-idx
-         (register-ccl-program 'ccl-encode-to-ucs-2 prog))))
+         (register-ccl-program 'ccl-encode-to-ucs-2 prog)))
+
+  ;; Now, create jit-ucs-charset-0 entries for those characters in Windows
+  ;; Glyph List 4 that would otherwise end up in East Asian character sets.
+  ;; 
+  ;; WGL4 is a character repertoire from Microsoft that gives a guideline
+  ;; for font implementors as to what characters are sufficient for
+  ;; pan-European support.  The intention of this code is to avoid the
+  ;; situation where these characters end up mapping to East Asian XEmacs
+  ;; characters, which generally clash strongly with European characters
+  ;; both in font choice and character width; jit-ucs-charset-0 is a
+  ;; single-width character set which comes before the East Asian character
+  ;; sets in the default-unicode-precedence-list above.
+  (loop for (ucs ascii-or-latin-1)
+    in '((#x2013 ?-) ;; U+2013 EN DASH
+         (#x2014 ?-) ;; U+2014 EM DASH
+         (#x2105 ?%) ;; U+2105 CARE OF
+         (#x203e ?-) ;; U+203E OVERLINE
+         (#x221f ?|) ;; U+221F RIGHT ANGLE
+         (#x2584 ?|) ;; U+2584 LOWER HALF BLOCK
+         (#x2588 ?|) ;; U+2588 FULL BLOCK
+         (#x258c ?|) ;; U+258C LEFT HALF BLOCK
+         (#x2550 ?|) ;; U+2550 BOX DRAWINGS DOUBLE HORIZONTAL
+         (#x255e ?|) ;; U+255E BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE
+         (#x256a ?|) ;; U+256A BOX DRAWINGS VERTICAL SINGLE & HORIZONTAL DOUBLE
+         (#x2561 ?|) ;; U+2561 BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE
+         (#x2215 ?/) ;; U+2215 DIVISION SLASH
+         (#x02c9 ?`) ;; U+02C9 MODIFIER LETTER MACRON
+         (#x2211 ?s) ;; U+2211 N-ARY SUMMATION
+         (#x220f ?s) ;; U+220F N-ARY PRODUCT
+         (#x2248 ?=) ;; U+2248 ALMOST EQUAL TO
+         (#x2264 ?=) ;; U+2264 LESS-THAN OR EQUAL TO
+         (#x2265 ?=) ;; U+2265 GREATER-THAN OR EQUAL TO
+         (#x201c ?') ;; U+201C LEFT DOUBLE QUOTATION MARK
+         (#x2026 ?.) ;; U+2026 HORIZONTAL ELLIPSIS
+         (#x2212 ?-) ;; U+2212 MINUS SIGN
+         (#x2260 ?=) ;; U+2260 NOT EQUAL TO
+         (#x221e ?=) ;; U+221E INFINITY
+         (#x2642 ?=) ;; U+2642 MALE SIGN
+         (#x2640 ?=) ;; U+2640 FEMALE SIGN
+         (#x2032 ?=) ;; U+2032 PRIME
+         (#x2033 ?=) ;; U+2033 DOUBLE PRIME
+         (#x25cb ?=) ;; U+25CB WHITE CIRCLE
+         (#x25cf ?=) ;; U+25CF BLACK CIRCLE
+         (#x25a1 ?=) ;; U+25A1 WHITE SQUARE
+         (#x25a0 ?=) ;; U+25A0 BLACK SQUARE
+         (#x25b2 ?=) ;; U+25B2 BLACK UP-POINTING TRIANGLE
+         (#x25bc ?=) ;; U+25BC BLACK DOWN-POINTING TRIANGLE
+         (#x2192 ?=) ;; U+2192 RIGHTWARDS ARROW
+         (#x2190 ?=) ;; U+2190 LEFTWARDS ARROW
+         (#x2191 ?=) ;; U+2191 UPWARDS ARROW
+         (#x2193 ?=) ;; U+2193 DOWNWARDS ARROW
+         (#x2229 ?=) ;; U+2229 INTERSECTION
+         (#x2202 ?=) ;; U+2202 PARTIAL DIFFERENTIAL
+         (#x2261 ?=) ;; U+2261 IDENTICAL TO
+         (#x221a ?=) ;; U+221A SQUARE ROOT
+         (#x222b ?=) ;; U+222B INTEGRAL
+         (#x2030 ?=) ;; U+2030 PER MILLE SIGN
+         (#x266a ?=) ;; U+266A EIGHTH NOTE
+         (#x2020 ?*) ;; U+2020 DAGGER
+         (#x2021 ?*) ;; U+2021 DOUBLE DAGGER
+         (#x2500 ?|) ;; U+2500 BOX DRAWINGS LIGHT HORIZONTAL
+         (#x2502 ?|) ;; U+2502 BOX DRAWINGS LIGHT VERTICAL
+         (#x250c ?|) ;; U+250C BOX DRAWINGS LIGHT DOWN AND RIGHT
+         (#x2510 ?|) ;; U+2510 BOX DRAWINGS LIGHT DOWN AND LEFT
+         (#x2518 ?|) ;; U+2518 BOX DRAWINGS LIGHT UP AND LEFT
+         (#x2514 ?|) ;; U+2514 BOX DRAWINGS LIGHT UP AND RIGHT
+         (#x251c ?|) ;; U+251C BOX DRAWINGS LIGHT VERTICAL AND RIGHT
+         (#x252c ?|) ;; U+252C BOX DRAWINGS LIGHT DOWN AND HORIZONTAL
+         (#x2524 ?|) ;; U+2524 BOX DRAWINGS LIGHT VERTICAL AND LEFT
+         (#x2534 ?|) ;; U+2534 BOX DRAWINGS LIGHT UP AND HORIZONTAL
+         (#x253c ?|) ;; U+253C BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL
+         (#x02da ?^) ;; U+02DA RING ABOVE
+         (#x2122 ?\xa9) ;; U+2122 TRADE MARK SIGN, ?,A)(B
+
+         (#x0132 ?\xe6) ;; U+0132 LATIN CAPITAL LIGATURE IJ, ?,Af(B
+         (#x013f ?\xe6) ;; U+013F LATIN CAPITAL LETTER L WITH MIDDLE DOT, ?,Af(B
+
+         (#x0133 ?\xe6) ;; U+0133 LATIN SMALL LIGATURE IJ, ?,Af(B
+         (#x0140 ?\xe6) ;; U+0140 LATIN SMALL LETTER L WITH MIDDLE DOT, ?,Af(B
+         (#x0149 ?\xe6) ;; U+0149 LATIN SMALL LETTER N PRECEDED BY APOSTROPH,?,Af(B
+
+         (#x2194 ?|) ;; U+2194 LEFT RIGHT ARROW
+         (#x2660 ?*) ;; U+2660 BLACK SPADE SUIT
+         (#x2665 ?*) ;; U+2665 BLACK HEART SUIT
+         (#x2663 ?*) ;; U+2663 BLACK CLUB SUIT
+         (#x2592 ?|) ;; U+2592 MEDIUM SHADE
+         (#x2195 ?|) ;; U+2195 UP DOWN ARROW
+
+         (#x2113 ?\xb9) ;; U+2113 SCRIPT SMALL L, ?,A9(B
+         (#x215b ?\xbe) ;; U+215B VULGAR FRACTION ONE EIGHTH, ?,A>(B
+         (#x215c ?\xbe) ;; U+215C VULGAR FRACTION THREE EIGHTHS, ?,A>(B
+         (#x215d ?\xbe) ;; U+215D VULGAR FRACTION FIVE EIGHTHS, ?,A>(B
+         (#x215e ?\xbe) ;; U+215E VULGAR FRACTION SEVEN EIGHTHS, ?,A>(B
+         (#x207f ?\xbe) ;; U+207F SUPERSCRIPT LATIN SMALL LETTER N, ?,A>(B
+  
+         ;; These are not in WGL 4, but are IPA characters that should not
+         ;; be double width. They are the only IPA characters that both
+         ;; occur in packages/mule-packages/leim/ipa.el and end up in East
+         ;; Asian character sets when that file is loaded in an XEmacs
+         ;; without packages.
+         (#x2197 ?|) ;; U+2197 NORTH EAST ARROW
+         (#x2199 ?|) ;; U+2199 SOUTH WEST ARROW
+         (#x2191 ?|) ;; U+2191 UPWARDS ARROW
+         (#x207f ?\xb9));; U+207F SUPERSCRIPT LATIN SMALL LETTER N, ?,A9(B
+    with decoded = nil
+    with syntax-table = (standard-syntax-table)
+    ;; This creates jit-ucs-charset-0 entries because:
+    ;;
+    ;;    1. If the tables are dumped, it is run at dump time before they are
+    ;;    dumped, and as such before the relevant conversions are available
+    ;;    (they are made available in mule/general-late.el). 
+    ;;
+    ;;    2. If the tables are not dumped, it is run at dump time, long before
+    ;;    any of the other mappings are available.
+    ;;
+    do
+    (setq decoded (decode-char 'ucs ucs))
+    (assert (eq (char-charset decoded)
+                'jit-ucs-charset-0) nil 
+                "Unexpected Unicode decoding behavior.  ")
+    (modify-syntax-entry decoded
+                         (string 
+                          (char-syntax ascii-or-latin-1))
+                         syntax-table))
+
+  ;; Create all the Unicode error sequences, normally as jit-ucs-charset-0
+  ;; characters starting at U+200000 (which isn't a valid Unicode code
+  ;; point).
+  (loop
+    for i from #x00 to #xFF
+    ;; #xd800 is the first leading surrogate; trailing surrogates must be in
+    ;; the range #xdc00-#xdfff. These examples are not, so we intentionally
+    ;; provoke an error sequence.
+    do (decode-coding-string (format "\xd8\x00\x01%c" i) 'utf-16-be))
+
+  ;; Make them available to user code.
+  (defvar unicode-error-sequence-zero
+    (aref (decode-coding-string "\xd8\x00\x01\x00" 'utf-16-be) 3)
+    "The XEmacs character representing an invalid zero octet in Unicode.
+
+Subtract this character from each XEmacs character in an invalid sequence to
+get the octet on disk. E.g.
+
+\(- (aref (decode-coding-string ?\\x80 'utf-8) 0)
+   unicode-error-characters-zero)
+=> ?\\x80
+
+You can search for invalid sequences using
+`unicode-error-sequence-regexp-range', which see.  ")
+
+  (defvar unicode-error-sequence-regexp-range
+    (format "%c-%c"
+            (aref (decode-coding-string "\xd8\x00\x01\x00" 'utf-16-be) 3)
+            (aref (decode-coding-string "\xd8\x00\x01\xFF" 'utf-16-be) 3))
+    "Regular expression range to match Unicode error sequences in XEmacs.
+
+Invalid Unicode sequences on input are represented as XEmacs characters with
+values starting at `unicode-error-sequence-zero', one character for each
+invalid octet.  Use this variable (with `re-search-forward' or
+`skip-chars-forward') to search for such characters; use
+`unicode-error-sequence-zero' from such characters to get a character
+corresponding to the octet on disk.  "))
 
 ;; #### UTF-7 is not yet implemented, and it's tricky to do.  There's
 ;; an implementation in appendix A.1 of the Unicode Standard, Version