Mercurial > hg > xemacs-beta
comparison lisp/unicode.el @ 771:943eaba38521
[xemacs-hg @ 2002-03-13 08:51:24 by ben]
The big ben-mule-21-5 check-in!
Various files were added and deleted. See CHANGES-ben-mule.
There are still some test suite failures. No crashes, though.
Many of the failures have to do with problems in the test suite itself
rather than in the actual code. I'll be addressing these in the next
day or so -- none of the test suite failures are at all critical.
Meanwhile I'll be trying to address the biggest issues -- i.e. build
or run failures, which will almost certainly happen on various platforms.
All comments should be sent to ben@xemacs.org -- use a Cc: if necessary
when sending to mailing lists. There will be pre- and post- tags,
something like
pre-ben-mule-21-5-merge-in, and
post-ben-mule-21-5-merge-in.
author | ben |
---|---|
date | Wed, 13 Mar 2002 08:54:06 +0000 |
parents | |
children | 2923009caf47 |
comparison
equal
deleted
inserted
replaced
770:336a418893b5 | 771:943eaba38521 |
---|---|
1 ;;; unicode.el --- Unicode support -*- coding: iso-2022-7bit; -*- | |
2 | |
3 ;; Copyright (C) 2001 Ben Wing. | |
4 | |
5 ;; Keywords: multilingual, Unicode | |
6 | |
7 ;; This file is part of XEmacs. | |
8 | |
9 ;; XEmacs is free software; you can redistribute it and/or modify it | |
10 ;; under the terms of the GNU General Public License as published by | |
11 ;; the Free Software Foundation; either version 2, or (at your option) | |
12 ;; any later version. | |
13 | |
14 ;; XEmacs is distributed in the hope that it will be useful, but | |
15 ;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
17 ;; General Public License for more details. | |
18 | |
19 ;; You should have received a copy of the GNU General Public License | |
20 ;; along with XEmacs; see the file COPYING. If not, write to the Free | |
21 ;; Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA | |
22 ;; 02111-1307, USA. | |
23 | |
24 ;;; Synched up with: Not in FSF. | |
25 | |
26 ;;; Commentary: | |
27 | |
28 ;; Lisp support for Unicode, e.g. initialize the translation tables. | |
29 | |
30 ;;; Code: | |
31 | |
;; NOTE: Loading is cheap -- the original author measured only a
;; fraction of a second on a 700 MHz Pentium III, even with an XEmacs
;; built with optimization completely disabled.
(defun load-unicode-tables ()
  "Initialize the Unicode translation tables for all standard charsets."
  (let* (;; Directory holding the Unicode Consortium mapping files.
         (table-dir
          (expand-file-name "unicode/unicode-consortium" data-directory))
         ;; Each entry is (FILENAME CHARSET [ARG ...]).  FILENAME is
         ;; resolved relative to `table-dir'; everything after it is
         ;; handed unchanged to `parse-unicode-translation-table'.
         ;; NOTE(review): the three numeric arguments look like a code
         ;; range plus an offset, and the trailing symbol like a parsing
         ;; flag -- confirm against `parse-unicode-translation-table'.
         ;; Files listed only in comments are not loaded.
         (table-specs
          '(("8859-1.TXT" latin-iso8859-1 #xA0 #xFF #x-80)
            ;; Not loaded: "8859-10.TXT" "8859-13.TXT"
            ;;             "8859-14.TXT" "8859-15.TXT"
            ("8859-2.TXT" latin-iso8859-2 #xA0 #xFF #x-80)
            ("8859-3.TXT" latin-iso8859-3 #xA0 #xFF #x-80)
            ("8859-4.TXT" latin-iso8859-4 #xA0 #xFF #x-80)
            ("8859-5.TXT" cyrillic-iso8859-5 #xA0 #xFF #x-80)
            ("8859-6.TXT" arabic-iso8859-6 #xA0 #xFF #x-80)
            ("8859-7.TXT" greek-iso8859-7 #xA0 #xFF #x-80)
            ("8859-8.TXT" hebrew-iso8859-8 #xA0 #xFF #x-80)
            ("8859-9.TXT" latin-iso8859-9 #xA0 #xFF #x-80)
            ;; The charset named for Big5 does not matter; specifying
            ;; `big5' automatically makes the right thing happen.
            ("BIG5.TXT" chinese-big5-1 nil nil nil big5)
            ("CNS11643.TXT" chinese-cns11643-1 #x10000 #x1FFFF #x-10000)
            ("CNS11643.TXT" chinese-cns11643-2 #x20000 #x2FFFF #x-20000)
            ;; Not loaded: "CP1250.TXT" "CP1251.TXT" "CP1252.TXT"
            ;;             "CP1253.TXT" "CP1254.TXT" "CP1255.TXT"
            ;;             "CP1256.TXT" "CP1257.TXT" "CP1258.TXT"
            ;;             "CP874.TXT" "CP932.TXT" "CP936.TXT"
            ;;             "CP949.TXT" "CP950.TXT" "GB12345.TXT"
            ("GB2312.TXT" chinese-gb2312)
            ;; Not loaded: "HANGUL.TXT"
            ("JIS0201.TXT" katakana-jisx0201 #xA0 #xFF #x-80)
            ("JIS0208.TXT" japanese-jisx0208 nil nil nil ignore-first-column)
            ("JIS0212.TXT" japanese-jisx0212)
            ;; Not loaded: "JOHAB.TXT" "KOI8-R.TXT" "KSC5601.TXT"
            ;; Note that KSC5601.TXT as currently distributed is NOT
            ;; what it claims to be!  See the comments in KSX1001.TXT.
            ("KSX1001.TXT" korean-ksc5601)
            ;; Not loaded: "OLD5601.TXT" "SHIFTJIS.TXT"
            )))
    ;; Parse every listed file; the list of per-file results is returned.
    (mapcar (function
             (lambda (spec)
               (apply 'parse-unicode-translation-table
                      (expand-file-name (car spec) table-dir)
                      (cdr spec))))
            table-specs)))
90 | |
;; NOTE(review): the name suggests this is invoked from the startup
;; code; the caller is not visible in this file -- confirm.
(defun init-unicode-at-startup ()
  "Perform Unicode initialization by loading the translation tables.
This is a thin wrapper around `load-unicode-tables'."
  (load-unicode-tables))
93 | |
;; Plain UTF-16: a `unicode' coding system whose `type' property is
;; `utf-16', with neither `little-endian' nor `need-bom' set.
(make-coding-system
 'utf-16 'unicode
 "UTF-16"
 (list 'mnemonic "UTF-16"
       'documentation
       "UTF-16 Unicode encoding -- the standard (almost-) fixed-width
two-byte encoding, with surrogates. It will be fixed-width if all
characters are in the BMP (Basic Multilingual Plane -- first 65536
codepoints). Cannot represent characters with codepoints above
0x10FFFF (a little more than 1,000,000). Unicode and ISO guarantee
never to encode any characters outside this range -- all the rest are
for private, corporate or internal use."
       'type 'utf-16))
107 | |
;; UTF-16 with a byte order mark: same `type' as `utf-16', plus the
;; `need-bom' property.
(make-coding-system
 'utf-16-bom 'unicode
 "UTF-16 w/BOM"
 (list 'mnemonic "UTF16-BOM"
       'documentation
       "UTF-16 Unicode encoding with byte order mark (BOM) at the beginning.
The BOM is Unicode character U+FEFF -- i.e. the first two bytes are
0xFE and 0xFF, respectively, or reversed in a little-endian
representation. It has been sanctioned by the Unicode Consortium for
use at the beginning of a Unicode stream as a marker of the byte order
of the stream, and commonly appears in Unicode files under Microsoft
Windows, where it also functions as a magic cookie identifying a
Unicode file. The character is called \"ZERO WIDTH NO-BREAK SPACE\"
and is suitable as a byte-order marker because:

-- it has no displayable representation
-- due to its semantics it never normally appears at the beginning
of a stream
-- its reverse U+FFFE is not a legal Unicode character
-- neither byte sequence is at all likely in any other standard
encoding, particularly at the beginning of a stream

This coding system will insert a BOM at the beginning of a stream when
writing and strip it off when reading."
       'type 'utf-16
       'need-bom t))
134 | |
;; Little-endian UTF-16: same `type' as `utf-16', plus the
;; `little-endian' property; no `need-bom'.
(make-coding-system
 'utf-16-little-endian 'unicode
 "UTF-16 Little Endian"
 (list 'mnemonic "UTF16-LE"
       'documentation
       "Little-endian version of UTF-16 Unicode encoding.
See `utf-16' coding system."
       'type 'utf-16
       'little-endian t))
144 | |
;; Little-endian UTF-16 with a byte order mark: sets both the
;; `little-endian' and `need-bom' properties.
(make-coding-system
 'utf-16-little-endian-bom 'unicode
 "UTF-16 Little Endian w/BOM"
 (list 'mnemonic "MSW-Unicode"
       'documentation
       "Little-endian version of UTF-16 Unicode encoding, with byte order mark.
Standard encoding for representing Unicode under MS Windows. See
`utf-16-bom' coding system."
       'type 'utf-16
       'little-endian t
       'need-bom t))
156 | |
;; UCS-4: a `unicode' coding system whose `type' property is `ucs-4',
;; without the `little-endian' property.
(make-coding-system
 'ucs-4 'unicode
 "UCS-4"
 (list 'mnemonic "UCS4"
       'documentation
       "UCS-4 Unicode encoding -- fully fixed-width four-byte encoding."
       'type 'ucs-4))
164 | |
;; Little-endian UCS-4: same `type' as `ucs-4', plus the
;; `little-endian' property.
(make-coding-system
 'ucs-4-little-endian 'unicode
 "UCS-4 Little Endian"
 (list 'mnemonic "UCS4-LE"
       'documentation
       "Little-endian version of UCS-4 Unicode encoding. See `ucs-4' coding system."
       'type 'ucs-4
       'little-endian t))
173 | |
;; UTF-8: a `unicode' coding system whose `type' property is `utf-8'.
;; The documentation string is user-visible text; its wording was
;; corrected from "as it were ASCII" to "as if it were ASCII".
(make-coding-system
 'utf-8 'unicode
 "UTF-8"
 '(mnemonic "UTF8"
   documentation
   "UTF-8 Unicode encoding -- ASCII-compatible 8-bit variable-width encoding
with the same principles as the Mule-internal encoding:

-- All ASCII characters (codepoints 0 through 127) are represented
by themselves (i.e. using one byte, with the same value as the
ASCII codepoint), and these bytes are disjoint from bytes
representing non-ASCII characters.

This means that any 8-bit clean application can safely process
UTF-8-encoded text as if it were ASCII, with no corruption (e.g. a
'/' byte is always a slash character, never the second byte of
some other character, as with Big5, so a pathname encoded in
UTF-8 can safely be split up into components and reassembled
again using standard ASCII processes).

-- Leading bytes and non-leading bytes in the encoding of a
character are disjoint, so moving backwards is easy.

-- Given only the leading byte, you know how many following bytes
are present.
"
   type utf-8))
201 | |
202 ;; #### UTF-7 is not yet implemented, and it's tricky to do. There's | |
203 ;; an implementation in appendix A.1 of the Unicode Standard, Version | |
204 ;; 2.0, but I don't know its licensing characteristics. | |
205 | |
206 ; (make-coding-system | |
207 ; 'utf-7 'unicode | |
208 ; "UTF-7" | |
209 ; '(mnemonic "UTF7" | |
210 ; documentation | |
211 ; "UTF-7 Unicode encoding -- 7-bit-ASCII modal Internet-mail-compatible | |
212 ; encoding especially designed for headers, with the following | |
213 ; properties: | |
214 | |
215 ; -- Only characters that are considered safe for passing through any mail | |
216 ; gateway without damage are used. | |
217 | |
218 ; -- This is a modal encoding, with two states. The first, default | |
219 ; state encodes the most common Unicode characters (upper and | |
220 ; lowercase letters, digits, and 9 common punctuation marks) as | |
221 ; themselves, and the second state, entered using '+' and | |
222 ; terminated with '-' or any character disallowed in state 2, | |
223 ; encodes any Unicode characters by first converting to UTF-16, | |
224 ; most significant byte first, and then to a slightly modified | |
225 ; Base64 encoding. (Thus, UTF-7 has the same limitations on the | |
226 ; characters it can encode as UTF-16.) | |
227 | |
228 ; -- The modified Base64 encoding deviates from standard Base64 in | |
229 ; that it omits the `=' pad character. This is eliminated so as to | |
230 ; avoid conflicts with the use of `=' as an escape in the | |
231 ; Quoted-Printable encoding and the related Q encoding for headers: | |
232 ; With this modification, non-whitespace chars in UTF-7 will be | |
233 ; represented in Quoted-Printable and in Q as-is, with no further | |
234 ; encoding. | |
235 | |
236 ; For more information, see Appendix A.1 of The Unicode Standard 2.0, or | |
237 ; wherever it is in v3.0." | |
238 ; type utf-7)) |