comparison lisp/unicode.el @ 771:943eaba38521

[xemacs-hg @ 2002-03-13 08:51:24 by ben] The big ben-mule-21-5 check-in! Various files were added and deleted. See CHANGES-ben-mule. There are still some test suite failures. No crashes, though. Many of the failures have to do with problems in the test suite itself rather than in the actual code. I'll be addressing these in the next day or so -- none of the test suite failures are at all critical. Meanwhile I'll be trying to address the biggest issues -- i.e. build or run failures, which will almost certainly happen on various platforms. All comments should be sent to ben@xemacs.org -- use a Cc: if necessary when sending to mailing lists. There will be pre- and post- tags, something like pre-ben-mule-21-5-merge-in, and post-ben-mule-21-5-merge-in.
author ben
date Wed, 13 Mar 2002 08:54:06 +0000
parents
children 2923009caf47
comparison
equal deleted inserted replaced
770:336a418893b5 771:943eaba38521
1 ;;; unicode.el --- Unicode support -*- coding: iso-2022-7bit; -*-
2
3 ;; Copyright (C) 2001 Ben Wing.
4
5 ;; Keywords: multilingual, Unicode
6
7 ;; This file is part of XEmacs.
8
9 ;; XEmacs is free software; you can redistribute it and/or modify it
10 ;; under the terms of the GNU General Public License as published by
11 ;; the Free Software Foundation; either version 2, or (at your option)
12 ;; any later version.
13
14 ;; XEmacs is distributed in the hope that it will be useful, but
15 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 ;; General Public License for more details.
18
19 ;; You should have received a copy of the GNU General Public License
20 ;; along with XEmacs; see the file COPYING. If not, write to the Free
21 ;; Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
22 ;; 02111-1307, USA.
23
24 ;;; Synched up with: Not in FSF.
25
26 ;;; Commentary:
27
28 ;; Lisp support for Unicode, e.g. initialize the translation tables.
29
30 ;;; Code:
31
32 ;; NOTE: This takes only a fraction of a second on my Pentium III
33 ;; 700MHz even with a totally optimization-disabled XEmacs.
;; Load every translation table listed in `parse-args' below.
;;
;; Each entry has the form (FILE CHARSET . REST).  FILE is expanded
;; relative to `undir' (the "unicode/unicode-consortium" subdirectory of
;; `data-directory'); CHARSET and REST are passed through unchanged as
;; the remaining arguments of `parse-unicode-translation-table'.
;; NOTE(review): REST looks like (START END OFFSET FLAG) -- a code range,
;; an offset applied to it, and a parsing flag such as `big5' or
;; `ignore-first-column' -- confirm against the definition of
;; `parse-unicode-translation-table', which is not visible here.
;;
;; File names that appear only inside comments are tables that are
;; currently not loaded.
;;
;; The value returned is the list built by `mapcar', i.e. one result of
;; `parse-unicode-translation-table' per entry, in table order.
34 (defun load-unicode-tables ()
35 "Initialize the Unicode translation tables for all standard charsets."
36 (let ((undir (expand-file-name "unicode/unicode-consortium" data-directory))
37 (parse-args
38 '(("8859-1.TXT" latin-iso8859-1 #xA0 #xFF #x-80)
39 ;; "8859-10.TXT"
40 ;; "8859-13.TXT"
41 ;; "8859-14.TXT"
42 ;; "8859-15.TXT"
43 ("8859-2.TXT" latin-iso8859-2 #xA0 #xFF #x-80)
44 ("8859-3.TXT" latin-iso8859-3 #xA0 #xFF #x-80)
45 ("8859-4.TXT" latin-iso8859-4 #xA0 #xFF #x-80)
46 ("8859-5.TXT" cyrillic-iso8859-5 #xA0 #xFF #x-80)
47 ("8859-6.TXT" arabic-iso8859-6 #xA0 #xFF #x-80)
48 ("8859-7.TXT" greek-iso8859-7 #xA0 #xFF #x-80)
49 ("8859-8.TXT" hebrew-iso8859-8 #xA0 #xFF #x-80)
50 ("8859-9.TXT" latin-iso8859-9 #xA0 #xFF #x-80)
51 ;; charset for Big5 does not matter; specifying `big5' will
52 ;; automatically make the right thing happen
53 ("BIG5.TXT" chinese-big5-1 nil nil nil big5)
;; The same file is listed twice, once per charset, each with its own
;; range and offset.
54 ("CNS11643.TXT" chinese-cns11643-1 #x10000 #x1FFFF #x-10000)
55 ("CNS11643.TXT" chinese-cns11643-2 #x20000 #x2FFFF #x-20000)
56 ;; "CP1250.TXT"
57 ;; "CP1251.TXT"
58 ;; "CP1252.TXT"
59 ;; "CP1253.TXT"
60 ;; "CP1254.TXT"
61 ;; "CP1255.TXT"
62 ;; "CP1256.TXT"
63 ;; "CP1257.TXT"
64 ;; "CP1258.TXT"
65 ;; "CP874.TXT"
66 ;; "CP932.TXT"
67 ;; "CP936.TXT"
68 ;; "CP949.TXT"
69 ;; "CP950.TXT"
70 ;; "GB12345.TXT"
71 ("GB2312.TXT" chinese-gb2312)
72 ;; "HANGUL.TXT"
73 ("JIS0201.TXT" katakana-jisx0201 #xA0 #xFF #x-80)
74 ("JIS0208.TXT" japanese-jisx0208 nil nil nil ignore-first-column)
75 ("JIS0212.TXT" japanese-jisx0212)
76 ;; "JOHAB.TXT"
77 ;; "KOI8-R.TXT"
78 ;; "KSC5601.TXT"
79 ;; note that KSC5601.TXT as currently distributed is NOT what
80 ;; it claims to be! see comments in KSX1001.TXT.
81 ("KSX1001.TXT" korean-ksc5601)
82 ;; "OLD5601.TXT"
83 ;; "SHIFTJIS.TXT"
84 )))
;; For each entry: (car args) is the file name, (cdr args) is everything
;; that follows it; `apply' spreads the latter as trailing arguments.
85 (mapcar #'(lambda (args)
86 (apply 'parse-unicode-translation-table
87 (expand-file-name (car args) undir)
88 (cdr args)))
89 parse-args)))
90
;; Thin wrapper: calls `load-unicode-tables' with no arguments and
;; returns whatever it returns.
;; NOTE(review): the name suggests this is invoked from the startup
;; sequence; the caller is not visible in this file -- confirm.
91 (defun init-unicode-at-startup ()
92 (load-unicode-tables))
93
;; Define the coding system named `utf-16'.  The second argument is
;; `unicode', the third is the string "UTF-16", and the property list
;; supplies `mnemonic', `documentation' and `type utf-16'.
;; Neither `little-endian' nor `need-bom' is set in this definition.
;; NOTE(review): presumably that means big-endian output without a BOM
;; -- confirm against the implementation of the `unicode' coding type.
94 (make-coding-system
95 'utf-16 'unicode
96 "UTF-16"
97 '(mnemonic "UTF-16"
98 documentation
99 "UTF-16 Unicode encoding -- the standard (almost-) fixed-width
100 two-byte encoding, with surrogates. It will be fixed-width if all
101 characters are in the BMP (Basic Multilingual Plane -- first 65536
102 codepoints). Cannot represent characters with codepoints above
103 0x10FFFF (a little more than 1,000,000). Unicode and ISO guarantee
104 never to encode any characters outside this range -- all the rest are
105 for private, corporate or internal use."
106 type utf-16))
107
;; Define the coding system named `utf-16-bom'.  The second argument is
;; `unicode', the third is the string "UTF-16 w/BOM", and the property
;; list supplies `mnemonic', `documentation', `type utf-16' and
;; `need-bom t'.  Per the documentation string, a BOM is written at the
;; start of a stream and stripped when reading.
108 (make-coding-system
109 'utf-16-bom 'unicode
110 "UTF-16 w/BOM"
111 '(mnemonic "UTF16-BOM"
112 documentation
113 "UTF-16 Unicode encoding with byte order mark (BOM) at the beginning.
114 The BOM is Unicode character U+FEFF -- i.e. the first two bytes are
115 0xFE and 0xFF, respectively, or reversed in a little-endian
116 representation. It has been sanctioned by the Unicode Consortium for
117 use at the beginning of a Unicode stream as a marker of the byte order
118 of the stream, and commonly appears in Unicode files under Microsoft
119 Windows, where it also functions as a magic cookie identifying a
120 Unicode file. The character is called \"ZERO WIDTH NO-BREAK SPACE\"
121 and is suitable as a byte-order marker because:
122
123 -- it has no displayable representation
124 -- due to its semantics it never normally appears at the beginning
125 of a stream
126 -- its reverse U+FFFE is not a legal Unicode character
127 -- neither byte sequence is at all likely in any other standard
128 encoding, particularly at the beginning of a stream
129
130 This coding system will insert a BOM at the beginning of a stream when
131 writing and strip it off when reading."
132 type utf-16
133 need-bom t))
134
;; Define the coding system named `utf-16-little-endian'.  The second
;; argument is `unicode', the third is the string
;; "UTF-16 Little Endian", and the property list supplies `mnemonic',
;; `documentation', `type utf-16' and `little-endian t'.  `need-bom' is
;; not set in this definition.
135 (make-coding-system
136 'utf-16-little-endian 'unicode
137 "UTF-16 Little Endian"
138 '(mnemonic "UTF16-LE"
139 documentation
140 "Little-endian version of UTF-16 Unicode encoding.
141 See `utf-16' coding system."
142 type utf-16
143 little-endian t))
144
;; Define the coding system named `utf-16-little-endian-bom'.  The
;; second argument is `unicode', the third is the string
;; "UTF-16 Little Endian w/BOM", and the property list supplies
;; `mnemonic', `documentation', `type utf-16', `little-endian t' and
;; `need-bom t'.  The mnemonic is "MSW-Unicode" because, per the
;; documentation string, this is the standard Unicode representation
;; under MS Windows.
145 (make-coding-system
146 'utf-16-little-endian-bom 'unicode
147 "UTF-16 Little Endian w/BOM"
148 '(mnemonic "MSW-Unicode"
149 documentation
150 "Little-endian version of UTF-16 Unicode encoding, with byte order mark.
151 Standard encoding for representing Unicode under MS Windows. See
152 `utf-16-bom' coding system."
153 type utf-16
154 little-endian t
155 need-bom t))
156
;; Define the coding system named `ucs-4'.  The second argument is
;; `unicode', the third is the string "UCS-4", and the property list
;; supplies `mnemonic', `documentation' and `type ucs-4'.
;; `little-endian' is not set in this definition.
157 (make-coding-system
158 'ucs-4 'unicode
159 "UCS-4"
160 '(mnemonic "UCS4"
161 documentation
162 "UCS-4 Unicode encoding -- fully fixed-width four-byte encoding."
163 type ucs-4))
164
;; Define the coding system named `ucs-4-little-endian'.  The second
;; argument is `unicode', the third is the string
;; "UCS-4 Little Endian", and the property list supplies `mnemonic',
;; `documentation', `type ucs-4' and `little-endian t'.
165 (make-coding-system
166 'ucs-4-little-endian 'unicode
167 "UCS-4 Little Endian"
168 '(mnemonic "UCS4-LE"
169 documentation
170 "Little-endian version of UCS-4 Unicode encoding. See `ucs-4' coding system."
171 type ucs-4
172 little-endian t))
173
;; Define the coding system named `utf-8'.  The second argument is
;; `unicode', the third is the string "UTF-8", and the property list
;; supplies `mnemonic', `documentation' and `type utf-8'.
;; The documentation string is user-visible text; its wording was
;; corrected ("as if it were ASCII"), nothing else is changed.
174 (make-coding-system
175 'utf-8 'unicode
176 "UTF-8"
177 '(mnemonic "UTF8"
178 documentation
179 "UTF-8 Unicode encoding -- ASCII-compatible 8-bit variable-width encoding
180 with the same principles as the Mule-internal encoding:
181
182 -- All ASCII characters (codepoints 0 through 127) are represented
183 by themselves (i.e. using one byte, with the same value as the
184 ASCII codepoint), and these bytes are disjoint from bytes
185 representing non-ASCII characters.
186
187 This means that any 8-bit clean application can safely process
188 UTF-8-encoded text as if it were ASCII, with no corruption (e.g. a
189 '/' byte is always a slash character, never the second byte of
190 some other character, as with Big5, so a pathname encoded in
191 UTF-8 can safely be split up into components and reassembled
192 again using standard ASCII processes).
193
194 -- Leading bytes and non-leading bytes in the encoding of a
195 character are disjoint, so moving backwards is easy.
196
197 -- Given only the leading byte, you know how many following bytes
198 are present.
199 "
200 type utf-8))
201
202 ;; #### UTF-7 is not yet implemented, and it's tricky to do. There's
203 ;; an implementation in appendix A.1 of the Unicode Standard, Version
204 ;; 2.0, but I don't know its licensing characteristics.
205
206 ; (make-coding-system
207 ; 'utf-7 'unicode
208 ; "UTF-7"
209 ; '(mnemonic "UTF7"
210 ; documentation
211 ; "UTF-7 Unicode encoding -- 7-bit-ASCII modal Internet-mail-compatible
212 ; encoding especially designed for headers, with the following
213 ; properties:
214
215 ; -- Only characters that are considered safe for passing through any mail
216 ; gateway without damage are used.
217
218 ; -- This is a modal encoding, with two states. The first, default
219 ; state encodes the most common Unicode characters (upper and
220 ; lowercase letters, digits, and 9 common punctuation marks) as
221 ; themselves, and the second state, entered using '+' and
222 ; terminated with '-' or any character disallowed in state 2,
223 ; encodes any Unicode characters by first converting to UTF-16,
224 ; most significant byte first, and then to a slightly modified
225 ; Base64 encoding. (Thus, UTF-7 has the same limitations on the
226 ; characters it can encode as UTF-16.)
227
228 ; -- The modified Base64 encoding deviates from standard Base64 in
229 ; that it omits the `=' pad character. This is eliminated so as to
230 ; avoid conflicts with the use of `=' as an escape in the
231 ; Quoted-Printable encoding and the related Q encoding for headers:
232 ; With this modification, non-whitespace chars in UTF-7 will be
233 ; represented in Quoted-Printable and in Q as-is, with no further
234 ; encoding.
235
236 ; For more information, see Appendix A.1 of The Unicode Standard 2.0, or
237 ; wherever it is in v3.0."
238 ; type utf-7))