502
|
1 ;;; mule-coding.el --- Coding-system functions for Mule. -*- coding: iso-2022-7bit; -*-
|
333
|
2
|
|
3 ;; Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
|
|
4 ;; Licensed to the Free Software Foundation.
|
|
5 ;; Copyright (C) 1995 Amdahl Corporation.
|
|
6 ;; Copyright (C) 1995 Sun Microsystems.
|
|
7 ;; Copyright (C) 1997 MORIOKA Tomohiko
|
771
|
8 ;; Copyright (C) 2001 Ben Wing.
|
333
|
9
|
|
10 ;; This file is part of XEmacs.
|
|
11
|
|
12 ;; XEmacs is free software; you can redistribute it and/or modify it
|
|
13 ;; under the terms of the GNU General Public License as published by
|
|
14 ;; the Free Software Foundation; either version 2, or (at your option)
|
|
15 ;; any later version.
|
|
16
|
|
17 ;; XEmacs is distributed in the hope that it will be useful, but
|
|
18 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
19 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
20 ;; General Public License for more details.
|
|
21
|
|
22 ;; You should have received a copy of the GNU General Public License
|
444
|
23 ;; along with XEmacs; see the file COPYING. If not, write to the
|
333
|
24 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
25 ;; Boston, MA 02111-1307, USA.
|
|
26
|
|
27 ;;; Commentary:
|
|
28
|
|
29 ;;; split off of mule.el and mostly moved to coding.el
|
|
30
|
4072
|
31 ;; Needed for make-8-bit-coding-system.
|
4080
|
32 (eval-when-compile (require 'ccl))
|
4072
|
33
|
333
|
34 ;;; Code:
|
|
35
|
|
36 (defun coding-system-force-on-output (coding-system register)
|
|
37 "Return the 'force-on-output property of CODING-SYSTEM for the specified REGISTER."
|
444
|
38 (check-type register integer)
|
333
|
39 (coding-system-property
|
|
40 coding-system
|
|
41 (case register
|
|
42 (0 'force-g0-on-output)
|
|
43 (1 'force-g1-on-output)
|
|
44 (2 'force-g2-on-output)
|
|
45 (3 'force-g3-on-output)
|
|
46 (t (signal 'args-out-of-range (list register 0 3))))))
|
|
47
|
|
48 (defun coding-system-short (coding-system)
|
|
49 "Return the 'short property of CODING-SYSTEM."
|
|
50 (coding-system-property coding-system 'short))
|
|
51
|
|
52 (defun coding-system-no-ascii-eol (coding-system)
|
|
53 "Return the 'no-ascii-eol property of CODING-SYSTEM."
|
|
54 (coding-system-property coding-system 'no-ascii-eol))
|
|
55
|
|
56 (defun coding-system-no-ascii-cntl (coding-system)
|
|
57 "Return the 'no-ascii-cntl property of CODING-SYSTEM."
|
|
58 (coding-system-property coding-system 'no-ascii-cntl))
|
|
59
|
|
60 (defun coding-system-seven (coding-system)
|
|
61 "Return the 'seven property of CODING-SYSTEM."
|
|
62 (coding-system-property coding-system 'seven))
|
|
63
|
|
64 (defun coding-system-lock-shift (coding-system)
|
|
65 "Return the 'lock-shift property of CODING-SYSTEM."
|
|
66 (coding-system-property coding-system 'lock-shift))
|
|
67
|
|
68 ;;(defun coding-system-use-japanese-jisx0201-roman (coding-system)
|
|
69 ;; "Return the 'use-japanese-jisx0201-roman property of CODING-SYSTEM."
|
|
70 ;; (coding-system-property coding-system 'use-japanese-jisx0201-roman))
|
|
71
|
|
72 ;;(defun coding-system-use-japanese-jisx0208-1978 (coding-system)
|
|
73 ;; "Return the 'use-japanese-jisx0208-1978 property of CODING-SYSTEM."
|
|
74 ;; (coding-system-property coding-system 'use-japanese-jisx0208-2978))
|
|
75
|
|
76 (defun coding-system-no-iso6429 (coding-system)
|
|
77 "Return the 'no-iso6429 property of CODING-SYSTEM."
|
|
78 (coding-system-property coding-system 'no-iso6429))
|
|
79
|
|
80 (defun coding-system-ccl-encode (coding-system)
|
|
81 "Return the CCL 'encode property of CODING-SYSTEM."
|
|
82 (coding-system-property coding-system 'encode))
|
|
83
|
|
84 (defun coding-system-ccl-decode (coding-system)
|
|
85 "Return the CCL 'decode property of CODING-SYSTEM."
|
|
86 (coding-system-property coding-system 'decode))
|
|
87
|
771
|
88 (defun coding-system-iso2022-charset (coding-system register)
|
|
89 "Return the charset initially designated to REGISTER in CODING-SYSTEM.
|
|
90 The allowable range of REGISTER is 0 through 3."
|
|
91 (if (or (< register 0) (> register 3))
|
|
92 (error 'args-out-of-range "coding-system-charset REGISTER" register 0 3))
|
|
93 (coding-system-property coding-system (nth register '(charset-g0
|
|
94 charset-g1
|
|
95 charset-g2
|
|
96 charset-g3))))
|
|
97
|
333
|
98
|
|
99 ;;;; Definitions of predefined coding systems
|
|
100
|
|
101 (make-coding-system
|
|
102 'ctext 'iso2022
|
771
|
103 "Compound Text"
|
333
|
104 '(charset-g0 ascii
|
|
105 charset-g1 latin-iso8859-1
|
|
106 eol-type nil
|
|
107 mnemonic "CText"))
|
|
108
|
|
109 (make-coding-system
|
|
110 'iso-2022-8bit-ss2 'iso2022
|
771
|
111 "ISO-2022 8-bit w/SS2"
|
333
|
112 '(charset-g0 ascii
|
|
113 charset-g1 latin-iso8859-1
|
|
114 charset-g2 t ;; unspecified but can be used later.
|
|
115 short t
|
|
116 mnemonic "ISO8/SS"
|
771
|
117 documentation "ISO 2022 based 8-bit encoding using SS2 for 96-charset"
|
333
|
118 ))
|
|
119
|
|
120 (make-coding-system
|
|
121 'iso-2022-7bit-ss2 'iso2022
|
771
|
122 "ISO-2022 7-bit w/SS2"
|
333
|
123 '(charset-g0 ascii
|
|
124 charset-g2 t ;; unspecified but can be used later.
|
|
125 seven t
|
|
126 short t
|
|
127 mnemonic "ISO7/SS"
|
771
|
128 documentation "ISO 2022 based 7-bit encoding using SS2 for 96-charset"
|
333
|
129 eol-type nil))
|
|
130
|
|
131 ;; (copy-coding-system 'iso-2022-7bit-ss2 'iso-2022-jp-2)
|
|
132 (make-coding-system
|
|
133 'iso-2022-jp-2 'iso2022
|
771
|
134 "ISO-2022-JP-2"
|
333
|
135 '(charset-g0 ascii
|
|
136 charset-g2 t ;; unspecified but can be used later.
|
|
137 seven t
|
|
138 short t
|
|
139 mnemonic "ISO7/SS"
|
|
140 eol-type nil))
|
|
141
|
|
142 (make-coding-system
|
|
143 'iso-2022-7bit 'iso2022
|
771
|
144 "ISO 2022 7-bit"
|
333
|
145 '(charset-g0 ascii
|
|
146 seven t
|
|
147 short t
|
771
|
148 mnemonic "ISO7"
|
|
149 documentation "ISO-2022-based 7-bit encoding using only G0"
|
|
150 ))
|
333
|
151
|
|
152 ;; compatibility for old XEmacsen
|
771
|
153 (define-coding-system-alias 'iso-2022-7 'iso-2022-7bit)
|
333
|
154
|
|
155 (make-coding-system
|
|
156 'iso-2022-8 'iso2022
|
771
|
157 "ISO-2022 8-bit"
|
333
|
158 '(charset-g0 ascii
|
|
159 charset-g1 latin-iso8859-1
|
|
160 short t
|
|
161 mnemonic "ISO8"
|
771
|
162 documentation "ISO-2022 eight-bit coding system. No single-shift or locking-shift."
|
333
|
163 ))
|
|
164
|
|
165 (make-coding-system
|
|
166 'escape-quoted 'iso2022
|
771
|
167 "Escape-Quoted (for .ELC files)"
|
333
|
168 '(charset-g0 ascii
|
|
169 charset-g1 latin-iso8859-1
|
|
170 eol-type lf
|
|
171 escape-quoted t
|
|
172 mnemonic "ESC/Quot"
|
771
|
173 documentation "ISO-2022 eight-bit coding system with escape quoting; used for .ELC files."
|
333
|
174 ))
|
|
175
|
|
176 (make-coding-system
|
|
177 'iso-2022-lock 'iso2022
|
771
|
178 "ISO-2022 w/locking-shift"
|
333
|
179 '(charset-g0 ascii
|
|
180 charset-g1 t ;; unspecified but can be used later.
|
|
181 seven t
|
|
182 lock-shift t
|
|
183 mnemonic "ISO7/Lock"
|
771
|
184 documentation "ISO-2022 coding system using Locking-Shift for 96-charset."
|
333
|
185 ))
|
4072
|
186
|
333
|
187
|
4072
|
188 ;; This is used by people writing CCL programs, but is called at runtime.
|
|
189 (defun define-translation-hash-table (symbol table)
|
|
190 "Define SYMBOL as the name of the hash translation TABLE for use in CCL.
|
|
191
|
|
192 Analogous to `define-translation-table', but updates
|
|
193 `translation-hash-table-vector' and the table is for use in the CCL
|
|
194 `lookup-integer' and `lookup-character' functions."
|
4145
|
195 (check-argument-type #'symbolp symbol)
|
|
196 (check-argument-type #'hash-table-p table)
|
4072
|
197 (let ((len (length translation-hash-table-vector))
|
|
198 (id 0)
|
|
199 done)
|
|
200 (put symbol 'translation-hash-table table)
|
|
201 (while (not done)
|
|
202 (if (>= id len)
|
|
203 (setq translation-hash-table-vector
|
|
204 (vconcat translation-hash-table-vector [nil])))
|
|
205 (let ((slot (aref translation-hash-table-vector id)))
|
|
206 (if (or (not slot)
|
|
207 (eq (car slot) symbol))
|
|
208 (progn
|
|
209 (aset translation-hash-table-vector id (cons symbol table))
|
|
210 (setq done t))
|
|
211 (setq id (1+ id)))))
|
|
212 (put symbol 'translation-hash-table-id id)
|
|
213 id))
|
|
214
|
|
215 (defvar make-8-bit-private-use-start (decode-char 'ucs #xE000)
|
|
216 "Start of a 256 code private use area for make-8-bit-coding-system.
|
|
217
|
|
218 This is used to ensure that distinct octets on disk for a given coding
|
|
219 system map to distinct XEmacs characters, preventing a spurious changes when
|
|
220 a file is read, not changed, and then written. ")
|
|
221
|
|
222 (defun make-8-bit-generate-helper (decode-table encode-table
|
|
223 encode-failure-octet)
|
|
224 "Helper function for `make-8-bit-generate-encode-program', which see.
|
|
225
|
4145
|
226 Deals with the case where ASCII and another character set can both be
|
|
227 encoded unambiguously and completely into the coding-system; if this is so,
|
|
228 returns a list corresponding to such a ccl-program. If not, it returns nil. "
|
4072
|
229 (let ((tentative-encode-program-parts
|
|
230 (eval-when-compile
|
4295
|
231 (let* ((vec-len 128)
|
|
232 (compiled
|
4072
|
233 (append
|
|
234 (ccl-compile
|
|
235 `(1
|
|
236 (loop
|
|
237 (read-multibyte-character r0 r1)
|
|
238 (if (r0 == ,(charset-id 'ascii))
|
|
239 (write r1)
|
|
240 ((if (r0 == #xABAB)
|
|
241 ;; #xBFFE is a sentinel in the compiled
|
|
242 ;; program.
|
4295
|
243 ;; #xBFFE is a sentinel in the compiled
|
|
244 ;; program.
|
|
245 ((r0 = r1 & #x7F)
|
|
246 (write r0 ,(make-vector vec-len #xBFFE)))
|
4072
|
247 ((mule-to-unicode r0 r1)
|
|
248 (if (r0 == #xFFFD)
|
|
249 (write #xBEEF)
|
|
250 ((lookup-integer encode-table-sym r0 r3)
|
|
251 (if r7
|
|
252 (write-multibyte-character r0 r3)
|
|
253 (write #xBEEF))))))))
|
|
254 (repeat)))) nil))
|
|
255 (first-part compiled)
|
|
256 (last-part
|
|
257 (member-if-not (lambda (entr) (eq #xBFFE entr))
|
|
258 (member-if
|
|
259 (lambda (entr) (eq #xBFFE entr))
|
|
260 first-part))))
|
|
261 (while compiled
|
4295
|
262 (when (eq #xBFFE (cadr compiled))
|
|
263 (assert (= vec-len (search '(#xBFFE) (cdr compiled)
|
|
264 :test #'/=)) nil
|
|
265 "Strange ccl vector length")
|
|
266 (setcdr compiled nil))
|
4072
|
267 (setq compiled (cdr compiled)))
|
|
268 ;; Is the generated code as we expect it to be?
|
|
269 (assert (and (memq #xABAB first-part)
|
|
270 (memq #xBEEF14 last-part))
|
|
271 nil
|
|
272 "This code assumes that the constant #xBEEF is #xBEEF14 in \
|
|
273 compiled CCL code,\nand that the constant #xABAB is #xABAB. If that is
|
|
274 not the case, and it appears not to be--that's why you're getting this
|
|
275 message--it will not work. ")
|
4295
|
276 (list first-part last-part vec-len))))
|
4072
|
277 (charset-lower -1)
|
|
278 (charset-upper -1)
|
|
279 worth-trying known-charsets encode-program
|
4295
|
280 other-charset-vector ucs args-out-of-range)
|
4072
|
281
|
|
282 (loop for char across decode-table
|
|
283 do (pushnew (char-charset char) known-charsets))
|
|
284 (setq known-charsets (delq 'ascii known-charsets))
|
|
285
|
|
286 (loop for known-charset in known-charsets
|
|
287 do
|
|
288 ;; This is not possible for two dimensional charsets.
|
|
289 (when (eq 1 (charset-dimension known-charset))
|
|
290 (setq args-out-of-range t)
|
|
291 (if (eq 'control-1 known-charset)
|
|
292 (setq charset-lower 0
|
|
293 charset-upper 31)
|
|
294 ;; There should be a nicer way to get the limits here.
|
|
295 (condition-case args-out-of-range
|
|
296 (make-char known-charset #x100)
|
|
297 (args-out-of-range
|
|
298 (setq charset-lower (third args-out-of-range)
|
|
299 charset-upper (fourth args-out-of-range)))))
|
|
300 (loop
|
|
301 for i from charset-lower to charset-upper
|
|
302 always (and (setq ucs
|
|
303 (encode-char (make-char known-charset i) 'ucs))
|
|
304 (gethash ucs encode-table))
|
|
305 finally (setq worth-trying known-charset))
|
|
306
|
|
307 ;; Only trying this for one charset at a time, the first find.
|
|
308 (when worth-trying (return))
|
|
309
|
|
310 ;; Okay, this charset is not worth trying, Try the next.
|
|
311 (setq charset-lower -1
|
|
312 charset-upper -1
|
|
313 worth-trying nil)))
|
|
314
|
|
315 (when worth-trying
|
4295
|
316 (setq other-charset-vector
|
|
317 (make-vector (third tentative-encode-program-parts)
|
|
318 encode-failure-octet))
|
4072
|
319 (loop for i from charset-lower to charset-upper
|
4090
|
320 do (aset other-charset-vector i
|
4072
|
321 (gethash (encode-char (make-char worth-trying i)
|
|
322 'ucs) encode-table)))
|
|
323 (setq encode-program
|
|
324 (nsublis
|
|
325 (list (cons #xABAB (charset-id worth-trying)))
|
|
326 (nconc
|
|
327 (copy-list (first
|
|
328 tentative-encode-program-parts))
|
|
329 (append other-charset-vector nil)
|
|
330 (copy-tree (second
|
|
331 tentative-encode-program-parts))))))
|
|
332 encode-program))
|
|
333
|
|
334 (defun make-8-bit-generate-encode-program (decode-table encode-table
|
|
335 encode-failure-octet)
|
|
336 "Generate a CCL program to decode a 8-bit fixed-width charset.
|
|
337
|
|
338 DECODE-TABLE must have 256 non-cons entries, and will be regarded as
|
|
339 describing a map from the octet corresponding to an offset in the
|
|
340 table to the that entry in the table. ENCODE-TABLE is a hash table
|
|
341 map from unicode values to characters in the range [0,255].
|
|
342 ENCODE-FAILURE-OCTET describes an integer between 0 and 255
|
|
343 \(inclusive) to write in the event that a character cannot be encoded. "
|
|
344 (check-argument-type #'vectorp decode-table)
|
|
345 (check-argument-range (length decode-table) #x100 #x100)
|
|
346 (check-argument-type #'hash-table-p encode-table)
|
|
347 (check-argument-type #'integerp encode-failure-octet)
|
|
348 (check-argument-range encode-failure-octet #x00 #xFF)
|
|
349 (let ((encode-program nil)
|
|
350 (general-encode-program
|
|
351 (eval-when-compile
|
|
352 (let ((prog (append
|
|
353 (ccl-compile
|
|
354 `(1
|
|
355 (loop
|
|
356 (read-multibyte-character r0 r1)
|
|
357 (mule-to-unicode r0 r1)
|
|
358 (if (r0 == #xFFFD)
|
|
359 (write #xBEEF)
|
|
360 ((lookup-integer encode-table-sym r0 r3)
|
|
361 (if r7
|
|
362 (write-multibyte-character r0 r3)
|
|
363 (write #xBEEF))))
|
|
364 (repeat)))) nil)))
|
|
365 (assert (memq #xBEEF14 prog)
|
|
366 nil
|
|
367 "This code assumes that the constant #xBEEF is #xBEEF14 \
|
|
368 in compiled CCL code.\nIf that is not the case, and it appears not to
|
|
369 be--that's why you're getting this message--it will not work. ")
|
|
370 prog)))
|
|
371 (encode-program-with-ascii-optimisation
|
|
372 (eval-when-compile
|
|
373 (let ((prog (append
|
|
374 (ccl-compile
|
|
375 `(1
|
|
376 (loop
|
|
377 (read-multibyte-character r0 r1)
|
|
378 (if (r0 == ,(charset-id 'ascii))
|
|
379 (write r1)
|
|
380 ((mule-to-unicode r0 r1)
|
|
381 (if (r0 == #xFFFD)
|
|
382 (write #xBEEF)
|
|
383 ((lookup-integer encode-table-sym r0 r3)
|
|
384 (if r7
|
|
385 (write-multibyte-character r0 r3)
|
|
386 (write #xBEEF))))))
|
|
387 (repeat)))) nil)))
|
|
388 (assert (memq #xBEEF14 prog)
|
|
389 nil
|
|
390 "This code assumes that the constant #xBEEF is #xBEEF14 \
|
|
391 in compiled CCL code.\nIf that is not the case, and it appears not to
|
|
392 be--that's why you're getting this message--it will not work. ")
|
|
393 prog)))
|
|
394 (ascii-encodes-as-itself nil))
|
|
395
|
|
396 ;; Is this coding system ASCII-compatible? If so, we can avoid the hash
|
|
397 ;; table lookup for those characters.
|
|
398 (loop
|
|
399 for i from #x00 to #x7f
|
|
400 always (eq (int-to-char i) (gethash i encode-table))
|
|
401 finally (setq ascii-encodes-as-itself t))
|
|
402
|
|
403 ;; Note that this logic handles EBCDIC badly. For example, CP037,
|
|
404 ;; MIME name ebcdic-na, has the entire repertoire of ASCII and
|
|
405 ;; Latin 1, and thus a more optimal ccl encode program would check
|
|
406 ;; for those character sets and use tables. But for now, we do a
|
|
407 ;; hash table lookup for every character.
|
|
408 (if (null ascii-encodes-as-itself)
|
|
409 ;; General encode program. Pros; general and correct. Cons;
|
|
410 ;; slow, a hash table lookup + mule-unicode conversion is done
|
|
411 ;; for every character encoding.
|
|
412 (setq encode-program general-encode-program)
|
|
413 (setq encode-program
|
|
414 ;; Encode program with ascii-ascii mapping (based on a
|
|
415 ;; character's mule character set), and one other mule
|
|
416 ;; character set using table-based encoding, other
|
|
417 ;; character sets using hash table lookups.
|
|
418 ;; make-8-bit-non-ascii-completely-coveredp only returns
|
|
419 ;; such a mapping if some non-ASCII charset with
|
|
420 ;; characters in decode-table is entirely covered by
|
|
421 ;; encode-table.
|
|
422 (make-8-bit-generate-helper decode-table encode-table
|
|
423 encode-failure-octet))
|
|
424 (unless encode-program
|
|
425 ;; If make-8-bit-non-ascii-completely-coveredp returned nil,
|
|
426 ;; but ASCII still encodes as itself, do one-to-one mapping
|
|
427 ;; for ASCII, and a hash table lookup for everything else.
|
|
428 (setq encode-program encode-program-with-ascii-optimisation)))
|
|
429
|
|
430 (setq encode-program
|
|
431 (nsublis
|
|
432 (list (cons #xBEEF14
|
|
433 (logior (lsh encode-failure-octet 8)
|
|
434 #x14)))
|
|
435 (copy-tree encode-program)))
|
|
436 encode-program))
|
|
437
|
|
438 (defun make-8-bit-create-decode-encode-tables (unicode-map)
|
|
439 "Return a list \(DECODE-TABLE ENCODE-TABLE) given UNICODE-MAP.
|
|
440 UNICODE-MAP should be an alist mapping from integer octet values to
|
|
441 characters with UCS code points; DECODE-TABLE will be a 256-element
|
|
442 vector, and ENCODE-TABLE will be a hash table mapping from 256 numbers
|
|
443 to 256 distinct characters. "
|
|
444 (check-argument-type #'listp unicode-map)
|
|
445 (let ((decode-table (make-vector 256 nil))
|
|
446 (encode-table (make-hash-table :size 256))
|
|
447 (private-use-start (encode-char make-8-bit-private-use-start 'ucs))
|
|
448 desired-ucs)
|
|
449
|
|
450 (loop for (external internal)
|
|
451 in unicode-map
|
|
452 do
|
|
453 (aset decode-table external internal)
|
|
454 (assert (not (eq (encode-char internal 'ucs) -1))
|
|
455 nil
|
|
456 "Looks like you're calling make-8-bit-coding-system in a \
|
|
457 dumped file, \nand you're either not providing a literal UNICODE-MAP
|
|
458 or PROPS. Don't do that; make-8-bit-coding-system relies on sensible
|
|
459 Unicode mappings being available, which they are at compile time for
|
|
460 dumped files (but this requires the mentioned literals), but not, for
|
|
461 most of them, at run time. ")
|
|
462
|
|
463 (puthash (encode-char internal 'ucs)
|
|
464 ;; This is semantically an integer, but Dave Love's design
|
|
465 ;; for lookup-integer in CCL means we need to store it as a
|
|
466 ;; character.
|
|
467 (int-to-char external)
|
|
468 encode-table))
|
|
469
|
|
470 ;; Now, go through the decode table looking at the characters that
|
|
471 ;; remain nil. If the XEmacs character with that integer is already in
|
|
472 ;; the encode table, map the on-disk octet to a Unicode private use
|
|
473 ;; character. Otherwise map the on-disk octet to the XEmacs character
|
|
474 ;; with that numeric value, to make it clearer what it is.
|
|
475 (dotimes (i 256)
|
|
476 (when (null (aref decode-table i))
|
|
477 ;; Find a free code point.
|
|
478 (setq desired-ucs i)
|
|
479 (while (gethash desired-ucs encode-table)
|
|
480 ;; In the normal case, the code point chosen will be U+E0XY, where
|
|
481 ;; XY is the hexadecimal octet on disk. In pathological cases
|
|
482 ;; it'll be something else.
|
|
483 (setq desired-ucs (+ private-use-start desired-ucs)
|
|
484 private-use-start (+ private-use-start 1)))
|
4085
|
485 (puthash desired-ucs (int-to-char i) encode-table)
|
|
486 (setq desired-ucs (if (> desired-ucs #xFF)
|
|
487 (decode-char 'ucs desired-ucs)
|
|
488 ;; So we get Latin-1 when run at dump time,
|
|
489 ;; instead of JIT-allocated characters.
|
|
490 (int-to-char desired-ucs)))
|
|
491 (aset decode-table i desired-ucs)))
|
4072
|
492 (values decode-table encode-table)))
|
|
493
|
|
494 (defun make-8-bit-generate-decode-program (decode-table)
|
|
495 "Given DECODE-TABLE, generate a CCL program to decode an 8-bit charset.
|
|
496 DECODE-TABLE must have 256 non-cons entries, and will be regarded as
|
|
497 describing a map from the octet corresponding to an offset in the
|
|
498 table to the that entry in the table. "
|
|
499 (check-argument-type #'vectorp decode-table)
|
|
500 (check-argument-range (length decode-table) #x100 #x100)
|
|
501 (let ((decode-program-parts
|
|
502 (eval-when-compile
|
|
503 (let* ((compiled
|
|
504 (append
|
|
505 (ccl-compile
|
|
506 `(3
|
|
507 ((read r0)
|
|
508 (loop
|
|
509 (write-read-repeat r0 ,(make-vector
|
|
510 256 'sentinel)))))) nil))
|
|
511 (first-part compiled)
|
|
512 (last-part
|
|
513 (member-if-not #'symbolp
|
|
514 (member-if-not #'integerp first-part))))
|
|
515 ;; Chop off the sentinel sentinel sentinel [..] part.
|
|
516 (while compiled
|
|
517 (if (symbolp (cadr compiled))
|
|
518 (setcdr compiled nil))
|
|
519 (setq compiled (cdr compiled)))
|
|
520 (list first-part last-part)))))
|
|
521 (nconc
|
|
522 ;; copy-list needed, because the structure of the literal provided
|
|
523 ;; by our eval-when-compile hangs around.
|
|
524 (copy-list (first decode-program-parts))
|
|
525 (append decode-table nil)
|
|
526 (second decode-program-parts))))
|
|
527
|
4145
|
528 (defun make-8-bit-choose-category (decode-table)
|
|
529 "Given DECODE-TABLE, return an appropriate coding category.
|
|
530 DECODE-TABLE is a 256-entry vector describing the mapping from octets on
|
|
531 disk to XEmacs characters for some fixed-width 8-bit coding system. "
|
|
532 (check-argument-type #'vectorp decode-table)
|
|
533 (check-argument-range (length decode-table) #x100 #x100)
|
|
534 (block category
|
|
535 (loop
|
|
536 for i from #x80 to #xBF
|
|
537 do (unless (= i (aref decode-table i))
|
|
538 (return-from category 'no-conversion)))
|
|
539 'iso-8-1))
|
|
540
|
4072
|
541 ;;;###autoload
|
|
542 (defun make-8-bit-coding-system (name unicode-map &optional description props)
|
|
543 "Make and return a fixed-width 8-bit CCL coding system named NAME.
|
|
544 NAME must be a symbol, and UNICODE-MAP a list.
|
|
545
|
|
546 UNICODE-MAP is a plist describing a map from octets in the coding
|
|
547 system NAME (as integers) to XEmacs characters. Those XEmacs
|
|
548 characters will be used explicitly on decoding, but for encoding (most
|
|
549 relevantly, on writing to disk) XEmacs characters that map to the same
|
|
550 Unicode code point will be unified. This means that the ISO-8859-?
|
|
551 characters that map to the same Unicode code point will not be
|
|
552 distinct when written to disk, which is normally what is intended; it
|
|
553 also means that East Asian Han characters from different XEmacs
|
|
554 character sets will not be distinct when written to disk, which is
|
|
555 less often what is intended.
|
|
556
|
|
557 Any octets not mapped will be decoded into the ISO 8859-1 characters with
|
|
558 the corresponding numeric value; unless another octet maps to that
|
|
559 character, in which case the Unicode private use area will be used. This
|
|
560 avoids spurious changes to files on disk when they contain octets that would
|
|
561 be otherwise remapped to the canonical values for the corresponding
|
|
562 characters in the coding system.
|
|
563
|
|
564 DESCRIPTION and PROPS are as in `make-coding-system', which see. This
|
|
565 function also accepts two additional (optional) properties in PROPS;
|
|
566 `aliases', giving a list of aliases to be initialized for this
|
|
567 coding-system, and `encode-failure-octet', an integer between 0 and 256 to
|
|
568 write in place of XEmacs characters that cannot be encoded, defaulting to
|
|
569 the code for tilde `~'. "
|
|
570 (check-argument-type #'symbolp name)
|
|
571 (check-argument-type #'listp unicode-map)
|
|
572 (check-argument-type #'stringp
|
|
573 (or description
|
|
574 (setq description
|
|
575 (format "Coding system used for %s." name))))
|
|
576 (check-valid-plist props)
|
|
577 (let ((encode-failure-octet (or (plist-get props 'encode-failure-octet)
|
|
578 (char-to-int ?~)))
|
|
579 (aliases (plist-get props 'aliases))
|
|
580 (hash-table-sym (gentemp (format "%s-encode-table" name)))
|
|
581 encode-program decode-program result decode-table encode-table)
|
|
582
|
|
583 ;; Some more sanity checking.
|
|
584 (check-argument-range encode-failure-octet 0 #xFF)
|
|
585 (check-argument-type #'listp aliases)
|
|
586
|
|
587 ;; Don't pass on our extra data to make-coding-system.
|
|
588 (setq props (plist-remprop props 'encode-failure-octet)
|
|
589 props (plist-remprop props 'aliases))
|
|
590
|
|
591 (multiple-value-setq
|
|
592 (decode-table encode-table)
|
|
593 (make-8-bit-create-decode-encode-tables unicode-map))
|
|
594
|
|
595 ;; Register the decode-table.
|
|
596 (define-translation-hash-table hash-table-sym encode-table)
|
|
597
|
|
598 ;; Generate the programs.
|
|
599 (setq decode-program (make-8-bit-generate-decode-program decode-table)
|
|
600 encode-program (make-8-bit-generate-encode-program
|
|
601 decode-table encode-table encode-failure-octet))
|
|
602 (unless (vectorp encode-program)
|
|
603 (setq encode-program
|
|
604 (apply #'vector
|
|
605 (nsublis (list (cons 'encode-table-sym hash-table-sym))
|
|
606 (copy-tree encode-program)))))
|
|
607 (unless (vectorp decode-program)
|
|
608 (setq decode-program
|
|
609 (apply #'vector decode-program)))
|
|
610
|
|
611 ;; And now generate the actual coding system.
|
|
612 (setq result
|
|
613 (make-coding-system
|
|
614 name 'ccl
|
|
615 description
|
|
616 (plist-put (plist-put props 'decode decode-program)
|
|
617 'encode encode-program)))
|
4295
|
618 (coding-system-put name '8-bit-fixed t)
|
4145
|
619 (coding-system-put name 'category
|
|
620 (make-8-bit-choose-category decode-table))
|
4072
|
621 (loop for alias in aliases
|
|
622 do (define-coding-system-alias alias name))
|
|
623 result))
|
|
624
|
|
625 (define-compiler-macro make-8-bit-coding-system (&whole form name unicode-map
|
|
626 &optional description props)
|
|
627
|
|
628 ;; We provide the compiler macro (= macro that is expanded only on
|
|
629 ;; compilation, and that can punt to a runtime version of the
|
|
630 ;; associate function if necessary) not for reasons of speed, though
|
|
631 ;; it does speed up things at runtime a little, but because the
|
|
632 ;; Unicode mappings are available at compile time in the dumped
|
|
633 ;; files, but they are not available at run time for the vast
|
|
634 ;; majority of them.
|
|
635
|
|
636 (if (not (and (and (consp name) (eq (car name) 'quote))
|
|
637 (and (consp unicode-map) (eq (car unicode-map) 'quote))
|
|
638 (and (or (and (consp props) (eq (car props) 'quote))
|
|
639 (null props)))))
|
|
640 ;; The call does not use literals; do it at runtime.
|
|
641 form
|
|
642 (setq name (cadr name)
|
|
643 unicode-map (cadr unicode-map)
|
|
644 props (if props (cadr props)))
|
|
645 (let ((encode-failure-octet
|
|
646 (or (plist-get props 'encode-failure-octet) (char-to-int ?~)))
|
|
647 (aliases (plist-get props 'aliases))
|
|
648 encode-program decode-program
|
4103
|
649 decode-table encode-table)
|
4072
|
650
|
|
651 ;; Some sanity checking.
|
|
652 (check-argument-range encode-failure-octet 0 #xFF)
|
|
653 (check-argument-type #'listp aliases)
|
|
654
|
|
655 ;; Don't pass on our extra data to make-coding-system.
|
|
656 (setq props (plist-remprop props 'encode-failure-octet)
|
|
657 props (plist-remprop props 'aliases))
|
|
658
|
|
659 ;; Work out encode-table and decode-table.
|
|
660 (multiple-value-setq
|
|
661 (decode-table encode-table)
|
|
662 (make-8-bit-create-decode-encode-tables unicode-map))
|
|
663
|
|
664 ;; Generate the decode and encode programs.
|
|
665 (setq decode-program (make-8-bit-generate-decode-program decode-table)
|
|
666 encode-program (make-8-bit-generate-encode-program
|
|
667 decode-table encode-table encode-failure-octet))
|
|
668
|
|
669 ;; And return the generated code.
|
|
670 `(let ((encode-table-sym (gentemp (format "%s-encode-table" ',name)))
|
4103
|
671 ;; The case-fold-search bind shouldn't be necessary. If I take
|
|
672 ;; it, out, though, I get:
|
|
673 ;;
|
|
674 ;; (invalid-read-syntax "Multiply defined symbol label" 1)
|
|
675 ;;
|
|
676 ;; when the file is byte compiled.
|
|
677 (case-fold-search t))
|
4072
|
678 (define-translation-hash-table encode-table-sym ,encode-table)
|
4103
|
679 (make-coding-system
|
|
680 ',name 'ccl ,description
|
|
681 (plist-put (plist-put ',props 'decode
|
|
682 ,(apply #'vector decode-program))
|
|
683 'encode
|
|
684 (apply #'vector
|
|
685 (nsublis
|
|
686 (list (cons
|
|
687 'encode-table-sym
|
|
688 (symbol-value 'encode-table-sym)))
|
|
689 ',encode-program))))
|
4295
|
690 (coding-system-put ',name '8-bit-fixed t)
|
4145
|
691 (coding-system-put ',name 'category ',
|
|
692 (make-8-bit-choose-category decode-table))
|
4072
|
693 ,(macroexpand `(loop for alias in ',aliases
|
|
694 do (define-coding-system-alias alias
|
|
695 ',name)))
|
4103
|
696 (find-coding-system ',name)))))
|
4299
|
697
|
|
698 ;; Ideally this would be in latin.el, but code-init.el uses it.
|
|
699 (make-8-bit-coding-system
|
|
700 'iso-8859-1
|
|
701 '() ;; No differences from Latin 1.
|
|
702 "ISO-8859-1 (Latin-1)"
|
|
703 '(mnemonic "Latin 1"
|
|
704 documentation "The most used encoding of Western Europe and the Americas."
|
|
705 aliases (iso-latin-1 latin-1)))
|
|
706
|