502
|
1 ;;; mule-coding.el --- Coding-system functions for Mule. -*- coding: iso-2022-7bit; -*-
|
333
|
2
|
|
3 ;; Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
|
|
4 ;; Licensed to the Free Software Foundation.
|
|
5 ;; Copyright (C) 1995 Amdahl Corporation.
|
|
6 ;; Copyright (C) 1995 Sun Microsystems.
|
|
7 ;; Copyright (C) 1997 MORIOKA Tomohiko
|
771
|
8 ;; Copyright (C) 2001 Ben Wing.
|
333
|
9
|
|
10 ;; This file is part of XEmacs.
|
|
11
|
|
12 ;; XEmacs is free software; you can redistribute it and/or modify it
|
|
13 ;; under the terms of the GNU General Public License as published by
|
|
14 ;; the Free Software Foundation; either version 2, or (at your option)
|
|
15 ;; any later version.
|
|
16
|
|
17 ;; XEmacs is distributed in the hope that it will be useful, but
|
|
18 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
19 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
20 ;; General Public License for more details.
|
|
21
|
|
22 ;; You should have received a copy of the GNU General Public License
|
444
|
23 ;; along with XEmacs; see the file COPYING. If not, write to the
|
333
|
24 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
25 ;; Boston, MA 02111-1307, USA.
|
|
26
|
|
27 ;;; Commentary:
|
|
28
|
|
29 ;;; split off of mule.el and mostly moved to coding.el
|
|
30
|
4072
|
31 ;; Needed for make-8-bit-coding-system.
|
4080
|
32 (eval-when-compile (require 'ccl))
|
4072
|
33
|
333
|
34 ;;; Code:
|
|
35
|
|
(defun coding-system-force-on-output (coding-system register)
  "Return the force-on-output property of CODING-SYSTEM for REGISTER.
REGISTER is an integer from 0 through 3 and selects which of the
properties `force-g0-on-output' through `force-g3-on-output' is looked
up.  A non-integer REGISTER is rejected by `check-type'; an integer
outside 0..3 signals `args-out-of-range'."
  (check-type register integer)
  ;; Reject out-of-range registers before touching the coding system.
  (when (or (< register 0) (> register 3))
    (signal 'args-out-of-range (list register 0 3)))
  (coding-system-property
   coding-system
   (aref [force-g0-on-output
          force-g1-on-output
          force-g2-on-output
          force-g3-on-output]
         register)))
|
|
47
|
|
(defun coding-system-short (coding-system)
  "Return the value of the `short' property of CODING-SYSTEM."
  (coding-system-property coding-system (quote short)))
|
|
51
|
|
(defun coding-system-no-ascii-eol (coding-system)
  "Return the value of the `no-ascii-eol' property of CODING-SYSTEM."
  (coding-system-property coding-system (quote no-ascii-eol)))
|
|
55
|
|
(defun coding-system-no-ascii-cntl (coding-system)
  "Return the value of the `no-ascii-cntl' property of CODING-SYSTEM."
  (coding-system-property coding-system (quote no-ascii-cntl)))
|
|
59
|
|
(defun coding-system-seven (coding-system)
  "Return the value of the `seven' property of CODING-SYSTEM."
  (coding-system-property coding-system (quote seven)))
|
|
63
|
|
(defun coding-system-lock-shift (coding-system)
  "Return the value of the `lock-shift' property of CODING-SYSTEM."
  (coding-system-property coding-system (quote lock-shift)))
|
|
67
|
|
68 ;;(defun coding-system-use-japanese-jisx0201-roman (coding-system)
|
|
69 ;; "Return the 'use-japanese-jisx0201-roman property of CODING-SYSTEM."
|
|
70 ;; (coding-system-property coding-system 'use-japanese-jisx0201-roman))
|
|
71
|
|
;;(defun coding-system-use-japanese-jisx0208-1978 (coding-system)
;;  "Return the 'use-japanese-jisx0208-1978 property of CODING-SYSTEM."
;;  (coding-system-property coding-system 'use-japanese-jisx0208-1978))
|
|
75
|
|
(defun coding-system-no-iso6429 (coding-system)
  "Return the value of the `no-iso6429' property of CODING-SYSTEM."
  (coding-system-property coding-system (quote no-iso6429)))
|
|
79
|
|
(defun coding-system-ccl-encode (coding-system)
  "Return the value of the CCL `encode' property of CODING-SYSTEM."
  (coding-system-property coding-system (quote encode)))
|
|
83
|
|
(defun coding-system-ccl-decode (coding-system)
  "Return the value of the CCL `decode' property of CODING-SYSTEM."
  (coding-system-property coding-system (quote decode)))
|
|
87
|
771
|
(defun coding-system-iso2022-charset (coding-system register)
  "Return the charset initially designated to REGISTER in CODING-SYSTEM.
The allowable range of REGISTER is 0 through 3; it selects one of the
properties `charset-g0' through `charset-g3'.  A REGISTER outside that
range signals `args-out-of-range'."
  (if (or (< register 0) (> register 3))
      ;; The message names this function; it previously carried the
      ;; stale name `coding-system-charset'.
      (error 'args-out-of-range "coding-system-iso2022-charset REGISTER"
             register 0 3))
  (coding-system-property coding-system (nth register '(charset-g0
                                                        charset-g1
                                                        charset-g2
                                                        charset-g3))))
|
|
97
|
333
|
98
|
|
99 ;;;; Definitions of predefined coding systems
|
|
100
|
|
;; Compound Text: ASCII in G0, Latin-1 in G1.
(make-coding-system
 'ctext 'iso2022
 "Compound Text"
 '(charset-g0 ascii
   charset-g1 latin-iso8859-1
   eol-type nil
   mnemonic "CText"))

;; Latin-1, defined with the no-conversion coding system type.
(make-coding-system
 'iso-8859-1 'no-conversion
 "ISO-8859-1 (Latin-1)"
 '(eol-type nil mnemonic "Noconv"))

;; Eight-bit ISO 2022 with single-shift 2.
(make-coding-system
 'iso-2022-8bit-ss2 'iso2022
 "ISO-2022 8-bit w/SS2"
 '(charset-g0 ascii
   charset-g1 latin-iso8859-1
   charset-g2 t ;; unspecified but can be used later.
   short t
   mnemonic "ISO8/SS"
   documentation "ISO 2022 based 8-bit encoding using SS2 for 96-charset"
   ))

;; Seven-bit ISO 2022 with single-shift 2.
(make-coding-system
 'iso-2022-7bit-ss2 'iso2022
 "ISO-2022 7-bit w/SS2"
 '(charset-g0 ascii
   charset-g2 t ;; unspecified but can be used later.
   seven t
   short t
   mnemonic "ISO7/SS"
   documentation "ISO 2022 based 7-bit encoding using SS2 for 96-charset"
   eol-type nil))

;; ISO-2022-JP-2 takes the same property values as iso-2022-7bit-ss2,
;; apart from the documentation string.
;; (copy-coding-system 'iso-2022-7bit-ss2 'iso-2022-jp-2)
(make-coding-system
 'iso-2022-jp-2 'iso2022
 "ISO-2022-JP-2"
 '(charset-g0 ascii
   charset-g2 t ;; unspecified but can be used later.
   seven t
   short t
   mnemonic "ISO7/SS"
   eol-type nil))

;; Seven-bit ISO 2022 using only G0.
(make-coding-system
 'iso-2022-7bit 'iso2022
 "ISO 2022 7-bit"
 '(charset-g0 ascii
   seven t
   short t
   mnemonic "ISO7"
   documentation "ISO-2022-based 7-bit encoding using only G0"
   ))

;; compatibility for old XEmacsen
(define-coding-system-alias 'iso-2022-7 'iso-2022-7bit)

;; Eight-bit ISO 2022.
(make-coding-system
 'iso-2022-8 'iso2022
 "ISO-2022 8-bit"
 '(charset-g0 ascii
   charset-g1 latin-iso8859-1
   short t
   mnemonic "ISO8"
   documentation "ISO-2022 eight-bit coding system. No single-shift or locking-shift."
   ))

;; Escape-quoted variant, always with LF line endings.
(make-coding-system
 'escape-quoted 'iso2022
 "Escape-Quoted (for .ELC files)"
 '(charset-g0 ascii
   charset-g1 latin-iso8859-1
   eol-type lf
   escape-quoted t
   mnemonic "ESC/Quot"
   documentation "ISO-2022 eight-bit coding system with escape quoting; used for .ELC files."
   ))

;; Seven-bit ISO 2022 using locking shift.
(make-coding-system
 'iso-2022-lock 'iso2022
 "ISO-2022 w/locking-shift"
 '(charset-g0 ascii
   charset-g1 t ;; unspecified but can be used later.
   seven t
   lock-shift t
   mnemonic "ISO7/Lock"
   documentation "ISO-2022 coding system using Locking-Shift for 96-charset."
   ))
|
4072
|
191
|
333
|
192
|
4072
|
193 ;; This is used by people writing CCL programs, but is called at runtime.
|
|
(defun define-translation-hash-table (symbol table)
  "Define SYMBOL as the name of the hash translation TABLE for use in CCL.

Analogous to `define-translation-table', but updates
`translation-hash-table-vector' and the table is for use in the CCL
`lookup-integer' and `lookup-character' functions.

TABLE is stored on SYMBOL's `translation-hash-table' property and the
pair (SYMBOL . TABLE) is stored in the first slot of
`translation-hash-table-vector' that is empty or already belongs to
SYMBOL; the vector grows by one slot when no such slot exists.  The
slot index is recorded on SYMBOL's `translation-hash-table-id' property
and returned."
  (check-argument-type #'symbolp symbol)
  (check-argument-type #'hash-table-p table)
  (let ((size (length translation-hash-table-vector))
        (index 0)
        entry)
    (put symbol 'translation-hash-table table)
    ;; Skip every slot that is occupied by some other symbol.
    (while (and (< index size)
                (setq entry (aref translation-hash-table-vector index))
                (not (eq (car entry) symbol)))
      (setq index (1+ index)))
    ;; Ran off the end: append one fresh slot to hold the new entry.
    (when (>= index size)
      (setq translation-hash-table-vector
            (vconcat translation-hash-table-vector [nil])))
    (aset translation-hash-table-vector index (cons symbol table))
    (put symbol 'translation-hash-table-id index)
    index))
|
|
219
|
|
(defvar make-8-bit-private-use-start (decode-char 'ucs #xE000)
  "First character of a 256 code private use area for make-8-bit-coding-system.

Octets that a coding system leaves unmapped can be given characters from
this area, so that distinct octets on disk map to distinct XEmacs
characters.  That prevents spurious changes when a file is read, not
changed, and then written.")
|
|
226
|
|
(defun make-8-bit-generate-helper (decode-table encode-table
                                   encode-failure-octet)
  "Helper function for `make-8-bit-generate-encode-program', which see.

Deals with the case where ASCII and another character set can both be
encoded unambiguously and completely into the coding-system; if this is so,
returns a list corresponding to such a ccl-program.  If not, it returns nil.

DECODE-TABLE is scanned for the character sets it uses.  ENCODE-TABLE is
the hash table from Unicode code points to encoded octets (stored as
characters).  ENCODE-FAILURE-OCTET fills the table entries that have no
encoding."
  (let ((tentative-encode-program-parts
         (eval-when-compile
           (let* ((compiled
                   (append
                    (ccl-compile
                     `(1
                       (loop
                         (read-multibyte-character r0 r1)
                         (if (r0 == ,(charset-id 'ascii))
                             (write r1)
                           ((if (r0 == #xABAB)
                                ;; #xBFFE is a sentinel in the compiled
                                ;; program.
                                (write r1 ,(make-vector 256 #xBFFE))
                              ((mule-to-unicode r0 r1)
                               (if (r0 == #xFFFD)
                                   (write #xBEEF)
                                 ((lookup-integer encode-table-sym r0 r3)
                                  (if r7
                                      (write-multibyte-character r0 r3)
                                    (write #xBEEF))))))))
                         (repeat)))) nil))
                  (first-part compiled)
                  ;; Everything after the run of sentinel values.
                  (last-part
                   (member-if-not (lambda (entr) (eq #xBFFE entr))
                                  (member-if
                                   (lambda (entr) (eq #xBFFE entr))
                                   first-part))))
             ;; Cut FIRST-PART off just before the first sentinel.
             (while compiled
               (if (eq #xBFFE (cadr compiled))
                   (setcdr compiled nil))
               (setq compiled (cdr compiled)))
             ;; Is the generated code as we expect it to be?
             (assert (and (memq #xABAB first-part)
                          (memq #xBEEF14 last-part))
                     nil
                     "This code assumes that the constant #xBEEF is #xBEEF14 in \
compiled CCL code,\nand that the constant #xABAB is #xABAB. If that is
not the case, and it appears not to be--that's why you're getting this
message--it will not work. ")
             (list first-part last-part))))
        (charset-lower -1)
        (charset-upper -1)
        worth-trying known-charsets encode-program
        other-charset-vector ucs)

    ;; Collect the non-ASCII charsets that appear in DECODE-TABLE.
    (loop for char across decode-table
      do (pushnew (char-charset char) known-charsets))
    (setq known-charsets (delq 'ascii known-charsets))

    (loop for known-charset in known-charsets
      do
      ;; This is not possible for two dimensional charsets.
      (when (eq 1 (charset-dimension known-charset))
        (if (eq 'control-1 known-charset)
            (setq charset-lower 0
                  charset-upper 31)
          ;; There should be a nicer way to get the limits here.  The
          ;; condition is bound to a local name; the former code also
          ;; assigned a global variable called `args-out-of-range' first,
          ;; which nothing ever read.
          (condition-case range-error
              (make-char known-charset #x100)
            (args-out-of-range
             (setq charset-lower (third range-error)
                   charset-upper (fourth range-error)))))
        ;; WORTH-TRYING is only set when every position of the charset
        ;; has an entry in ENCODE-TABLE.
        (loop
          for i from charset-lower to charset-upper
          always (and (setq ucs
                            (encode-char (make-char known-charset i) 'ucs))
                      (gethash ucs encode-table))
          finally (setq worth-trying known-charset))

        ;; Only trying this for one charset at a time, the first find.
        (when worth-trying (return))

        ;; Okay, this charset is not worth trying, Try the next.
        (setq charset-lower -1
              charset-upper -1
              worth-trying nil)))

    (when worth-trying
      ;; This table replaces the run of #xBFFE sentinels in the compiled
      ;; program, so it must have exactly as many entries (256) as the
      ;; sentinel vector above: the array length and the jump offsets in
      ;; the compiled program were computed for an array of that size.
      (setq other-charset-vector (make-vector 256 encode-failure-octet))
      (loop for i from charset-lower to charset-upper
        do (aset other-charset-vector i
                 (gethash (encode-char (make-char worth-trying i)
                                       'ucs) encode-table)))
      (setq encode-program
            (nsublis
             ;; #xABAB stood in for the charset ID at compile time.
             (list (cons #xABAB (charset-id worth-trying)))
             (nconc
              (copy-list (first
                          tentative-encode-program-parts))
              (append other-charset-vector nil)
              (copy-tree (second
                          tentative-encode-program-parts))))))
    encode-program))
|
|
329
|
|
(defun make-8-bit-generate-encode-program (decode-table encode-table
                                           encode-failure-octet)
  "Generate a CCL program to encode a 8-bit fixed-width charset.

DECODE-TABLE must have 256 non-cons entries, and will be regarded as
describing a map from the octet corresponding to an offset in the
table to that entry in the table.  ENCODE-TABLE is a hash table
map from unicode values to characters in the range [0,255].
ENCODE-FAILURE-OCTET describes an integer between 0 and 255
\(inclusive) to write in the event that a character cannot be encoded.

The result is a list of CCL program words in which the compile-time
placeholder for the failure octet has been replaced."
  (check-argument-type #'vectorp decode-table)
  (check-argument-range (length decode-table) #x100 #x100)
  (check-argument-type #'hash-table-p encode-table)
  (check-argument-type #'integerp encode-failure-octet)
  (check-argument-range encode-failure-octet #x00 #xFF)
  (let ((hash-only-program
         ;; Every character goes through mule-to-unicode and a hash
         ;; table lookup.
         (eval-when-compile
           (let ((compiled (append
                            (ccl-compile
                             `(1
                               (loop
                                 (read-multibyte-character r0 r1)
                                 (mule-to-unicode r0 r1)
                                 (if (r0 == #xFFFD)
                                     (write #xBEEF)
                                   ((lookup-integer encode-table-sym r0 r3)
                                    (if r7
                                        (write-multibyte-character r0 r3)
                                      (write #xBEEF))))
                                 (repeat)))) nil)))
             (assert (memq #xBEEF14 compiled)
                     nil
                     "This code assumes that the constant #xBEEF is #xBEEF14 \
in compiled CCL code.\nIf that is not the case, and it appears not to
be--that's why you're getting this message--it will not work. ")
             compiled)))
        (ascii-shortcut-program
         ;; ASCII is written through unchanged; everything else goes
         ;; through the hash table.
         (eval-when-compile
           (let ((compiled (append
                            (ccl-compile
                             `(1
                               (loop
                                 (read-multibyte-character r0 r1)
                                 (if (r0 == ,(charset-id 'ascii))
                                     (write r1)
                                   ((mule-to-unicode r0 r1)
                                    (if (r0 == #xFFFD)
                                        (write #xBEEF)
                                      ((lookup-integer encode-table-sym r0 r3)
                                       (if r7
                                           (write-multibyte-character r0 r3)
                                         (write #xBEEF))))))
                                 (repeat)))) nil)))
             (assert (memq #xBEEF14 compiled)
                     nil
                     "This code assumes that the constant #xBEEF is #xBEEF14 \
in compiled CCL code.\nIf that is not the case, and it appears not to
be--that's why you're getting this message--it will not work. ")
             compiled)))
        ;; Is this coding system ASCII-compatible?  If so, the hash table
        ;; lookup can be avoided for those characters.
        (ascii-is-identity
         (loop
           for octet from #x00 to #x7f
           always (eq (int-to-char octet) (gethash octet encode-table))))
        chosen)

    ;; Note that this logic handles EBCDIC badly.  For example, CP037,
    ;; MIME name ebcdic-na, has the entire repertoire of ASCII and
    ;; Latin 1, and thus a more optimal ccl encode program would check
    ;; for those character sets and use tables.  But for now, we do a
    ;; hash table lookup for every character.
    (setq chosen
          (cond
           ;; General program.  Pros: general and correct.  Cons: slow.
           ((not ascii-is-identity) hash-only-program)
           ;; ASCII maps to itself and one further charset is completely
           ;; covered by ENCODE-TABLE: table-based encoding for that
           ;; charset, hash lookups for the rest.  The helper returns nil
           ;; when no such charset exists.
           ((make-8-bit-generate-helper decode-table encode-table
                                        encode-failure-octet))
           ;; ASCII maps to itself but no other charset is covered.
           (t ascii-shortcut-program)))

    ;; #xBEEF14 is what `(write #xBEEF)' compiles to; patch in the real
    ;; failure octet on a copy so the compile-time literals stay intact.
    (nsublis
     (list (cons #xBEEF14
                 (logior (lsh encode-failure-octet 8)
                         #x14)))
     (copy-tree chosen))))
|
|
433
|
|
(defun make-8-bit-create-decode-encode-tables (unicode-map)
  "Return DECODE-TABLE and ENCODE-TABLE, as `values', given UNICODE-MAP.
UNICODE-MAP should be an alist mapping from integer octet values to
characters with UCS code points; DECODE-TABLE will be a 256-element
vector, and ENCODE-TABLE will be a hash table mapping from 256 numbers
to 256 distinct characters.

Octets that UNICODE-MAP does not mention are given a code point of
their own, so that every entry of DECODE-TABLE is filled."
  (check-argument-type #'listp unicode-map)
  (let ((decode-table (make-vector 256 nil))
        (encode-table (make-hash-table :size 256))
        (private-use-start (encode-char make-8-bit-private-use-start 'ucs)))

    ;; First pass: record the explicit mappings in both directions.
    (dolist (mapping unicode-map)
      (let ((octet (car mapping))
            (character (car (cdr mapping))))
        (aset decode-table octet character)
        (assert (not (eq (encode-char character 'ucs) -1))
                nil
                "Looks like you're calling make-8-bit-coding-system in a \
dumped file, \nand you're either not providing a literal UNICODE-MAP
or PROPS. Don't do that; make-8-bit-coding-system relies on sensible
Unicode mappings being available, which they are at compile time for
dumped files (but this requires the mentioned literals), but not, for
most of them, at run time. ")
        (puthash (encode-char character 'ucs)
                 ;; This is semantically an integer, but Dave Love's design
                 ;; for lookup-integer in CCL means we need to store it as a
                 ;; character.
                 (int-to-char octet)
                 encode-table)))

    ;; Second pass: fill the octets that are still unmapped.  If the
    ;; code point equal to the octet is already in the encode table, move
    ;; to the Unicode private use area; otherwise use the XEmacs
    ;; character with that numeric value, to make it clearer what it is.
    (loop for octet from 0 below 256
      do
      (unless (aref decode-table octet)
        (let ((code-point octet))
          ;; In the normal case, the code point chosen will be U+E0XY,
          ;; where XY is the hexadecimal octet on disk.  In pathological
          ;; cases it'll be something else.
          (while (gethash code-point encode-table)
            (setq code-point (+ private-use-start code-point)
                  private-use-start (1+ private-use-start)))
          (puthash code-point (int-to-char octet) encode-table)
          (aset decode-table octet
                (if (> code-point #xFF)
                    (decode-char 'ucs code-point)
                  ;; So we get Latin-1 when run at dump time,
                  ;; instead of JIT-allocated characters.
                  (int-to-char code-point))))))
    (values decode-table encode-table)))
|
|
489
|
|
(defun make-8-bit-generate-decode-program (decode-table)
  "Given DECODE-TABLE, generate a CCL program to decode an 8-bit charset.
DECODE-TABLE must have 256 non-cons entries, and will be regarded as
describing a map from the octet corresponding to an offset in the
table to that entry in the table.

The result is a list: the words of a compiled CCL template in which the
256 placeholder entries have been replaced by the entries of
DECODE-TABLE."
  (check-argument-type #'vectorp decode-table)
  (check-argument-range (length decode-table) #x100 #x100)
  (let* ((template
          ;; Built once, at compile time: the list (HEAD TAIL), the
          ;; program words before and after the placeholder table.
          (eval-when-compile
            (let* ((head
                    (append
                     (ccl-compile
                      `(3
                        ((read r0)
                         (loop
                           (write-read-repeat r0 ,(make-vector
                                                   256 'sentinel))))))
                     nil))
                   ;; Skip the leading integers, then the run of symbols.
                   (tail
                    (member-if-not #'symbolp
                                   (member-if-not #'integerp head)))
                   (cursor head))
              ;; Chop off the sentinel sentinel sentinel [..] part.
              (while cursor
                (when (symbolp (car (cdr cursor)))
                  (setcdr cursor nil))
                (setq cursor (cdr cursor)))
              (list head tail))))
         ;; copy-list needed, because the structure of the literal
         ;; provided by our eval-when-compile hangs around.
         (prefix (copy-list (car template)))
         (table (append decode-table nil))
         (suffix (car (cdr template))))
    (nconc prefix table suffix)))
|
|
523
|
4145
|
(defun make-8-bit-choose-category (decode-table)
  "Given DECODE-TABLE, return an appropriate coding category.
DECODE-TABLE is a 256-entry vector describing the mapping from octets on
disk to XEmacs characters for some fixed-width 8-bit coding system.

The result is `iso-8-1' when every octet from #x80 through #xBF decodes
to the character with the same numeric value, and `no-conversion'
otherwise."
  (check-argument-type #'vectorp decode-table)
  (check-argument-range (length decode-table) #x100 #x100)
  (if (loop
        for octet from #x80 to #xBF
        always (= octet (aref decode-table octet)))
      'iso-8-1
    'no-conversion))
|
|
536
|
4072
|
537 ;;;###autoload
|
|
(defun make-8-bit-coding-system (name unicode-map &optional description props)
  "Make and return a fixed-width 8-bit CCL coding system named NAME.
NAME must be a symbol, and UNICODE-MAP a list.

UNICODE-MAP is an alist describing a map from octets in the coding
system NAME (as integers) to XEmacs characters; each element is a list
whose first item is the octet and whose second item is the character.
Those XEmacs characters will be used explicitly on decoding, but for
encoding (most relevantly, on writing to disk) XEmacs characters that
map to the same Unicode code point will be unified.  This means that
the ISO-8859-? characters that map to the same Unicode code point will
not be distinct when written to disk, which is normally what is
intended; it also means that East Asian Han characters from different
XEmacs character sets will not be distinct when written to disk, which
is less often what is intended.

Any octets not mapped will be decoded into the ISO 8859-1 characters with
the corresponding numeric value; unless another octet maps to that
character, in which case the Unicode private use area will be used.  This
avoids spurious changes to files on disk when they contain octets that would
be otherwise remapped to the canonical values for the corresponding
characters in the coding system.

DESCRIPTION and PROPS are as in `make-coding-system', which see.  When
DESCRIPTION is nil a default description mentioning NAME is used.  This
function also accepts two additional (optional) properties in PROPS;
`aliases', giving a list of aliases to be initialized for this
coding-system, and `encode-failure-octet', an integer between 0 and 255
\(inclusive) to write in place of XEmacs characters that cannot be
encoded, defaulting to the code for tilde `~'.  The list passed as
PROPS is not modified."
  (check-argument-type #'symbolp name)
  (check-argument-type #'listp unicode-map)
  (check-argument-type #'stringp
                       (or description
                           (setq description
                                 (format "Coding system used for %s." name))))
  (check-valid-plist props)
  ;; `plist-remprop' and `plist-put' below work by side effect.  Operate
  ;; on a copy so the caller's list (often a quoted constant) still
  ;; carries its `aliases' and `encode-failure-octet' entries afterwards.
  (setq props (copy-list props))
  (let ((encode-failure-octet (or (plist-get props 'encode-failure-octet)
                                  (char-to-int ?~)))
        (aliases (plist-get props 'aliases))
        (hash-table-sym (gentemp (format "%s-encode-table" name)))
        encode-program decode-program result decode-table encode-table)

    ;; Some more sanity checking.
    (check-argument-range encode-failure-octet 0 #xFF)
    (check-argument-type #'listp aliases)

    ;; Don't pass on our extra data to make-coding-system.
    (setq props (plist-remprop props 'encode-failure-octet)
          props (plist-remprop props 'aliases))

    (multiple-value-setq
        (decode-table encode-table)
      (make-8-bit-create-decode-encode-tables unicode-map))

    ;; Register the encode table under a fresh symbol, so the CCL
    ;; `lookup-integer' call in the encode program can refer to it.
    (define-translation-hash-table hash-table-sym encode-table)

    ;; Generate the programs.
    (setq decode-program (make-8-bit-generate-decode-program decode-table)
          encode-program (make-8-bit-generate-encode-program
                          decode-table encode-table encode-failure-octet))
    ;; The generators return lists; substitute the real table symbol for
    ;; the `encode-table-sym' placeholder and convert to vectors.
    (unless (vectorp encode-program)
      (setq encode-program
            (apply #'vector
                   (nsublis (list (cons 'encode-table-sym hash-table-sym))
                            (copy-tree encode-program)))))
    (unless (vectorp decode-program)
      (setq decode-program
            (apply #'vector decode-program)))

    ;; And now generate the actual coding system.
    (setq result
          (make-coding-system
           name 'ccl
           description
           (plist-put (plist-put props 'decode decode-program)
                      'encode encode-program)))
    (coding-system-put name 'category
                       (make-8-bit-choose-category decode-table))
    (loop for alias in aliases
      do (define-coding-system-alias alias name))
    result))
|
|
619
|
|
(define-compiler-macro make-8-bit-coding-system (&whole form name unicode-map
                                                 &optional description props)

  ;; We provide the compiler macro (= macro that is expanded only on
  ;; compilation, and that can punt to a runtime version of the
  ;; associate function if necessary) not for reasons of speed, though
  ;; it does speed up things at runtime a little, but because the
  ;; Unicode mappings are available at compile time in the dumped
  ;; files, but they are not available at run time for the vast
  ;; majority of them.
  ;;
  ;; The expansion is only used when NAME and UNICODE-MAP are quoted
  ;; literals and PROPS is either a quoted literal or absent; otherwise
  ;; FORM is returned unchanged and the work happens at run time.

  (if (not (and (and (consp name) (eq (car name) 'quote))
                (and (consp unicode-map) (eq (car unicode-map) 'quote))
                (and (or (and (consp props) (eq (car props) 'quote))
                         (null props)))))
      ;; The call does not use literals; do it at runtime.
      form
    (setq name (cadr name)
          unicode-map (cadr unicode-map)
          ;; Copy the literal: `plist-remprop' below works by side
          ;; effect, and the list is part of the source form.
          props (if props (copy-list (cadr props))))
    (let ((encode-failure-octet
           (or (plist-get props 'encode-failure-octet) (char-to-int ?~)))
          (aliases (plist-get props 'aliases))
          encode-program decode-program
          decode-table encode-table)

      ;; Some sanity checking.
      (check-argument-range encode-failure-octet 0 #xFF)
      (check-argument-type #'listp aliases)

      ;; Don't pass on our extra data to make-coding-system.
      (setq props (plist-remprop props 'encode-failure-octet)
            props (plist-remprop props 'aliases))

      ;; Work out encode-table and decode-table.
      (multiple-value-setq
          (decode-table encode-table)
        (make-8-bit-create-decode-encode-tables unicode-map))

      ;; Generate the decode and encode programs.
      (setq decode-program (make-8-bit-generate-decode-program decode-table)
            encode-program (make-8-bit-generate-encode-program
                            decode-table encode-table encode-failure-octet))

      ;; And return the generated code.
      `(let ((encode-table-sym (gentemp (format "%s-encode-table" ',name)))
             ;; The case-fold-search bind shouldn't be necessary.  If I take
             ;; it, out, though, I get:
             ;;
             ;; (invalid-read-syntax "Multiply defined symbol label" 1)
             ;;
             ;; when the file is byte compiled.
             (case-fold-search t))
         (define-translation-hash-table encode-table-sym ,encode-table)
         (make-coding-system
          ;; When no DESCRIPTION form was supplied, use the same default
          ;; text as the runtime function does.
          ',name 'ccl ,(or description
                           (format "Coding system used for %s." name))
          (plist-put (plist-put ',props 'decode
                                ,(apply #'vector decode-program))
                     'encode
                     ;; The table symbol only exists at load time, so the
                     ;; placeholder is substituted then.
                     (apply #'vector
                            (nsublis
                             (list (cons
                                    'encode-table-sym
                                    (symbol-value 'encode-table-sym)))
                             ',encode-program))))
         (coding-system-put ',name 'category
                            ',(make-8-bit-choose-category decode-table))
         ,(macroexpand `(loop for alias in ',aliases
                          do (define-coding-system-alias alias
                               ',name)))
         (find-coding-system ',name)))))
|