comparison lisp/mule/mule-coding.el @ 4072:aa28d959af41

[xemacs-hg @ 2007-07-22 22:03:29 by aidan] Add support for non-ISO2022 8 bit fixed-width coding-systems
author aidan
date Sun, 22 Jul 2007 22:04:14 +0000
parents 943eaba38521
children 476d0799d704
comparison
equal deleted inserted replaced
4071:d607d13fca67 4072:aa28d959af41
25 ;; Boston, MA 02111-1307, USA. 25 ;; Boston, MA 02111-1307, USA.
26 26
27 ;;; Commentary: 27 ;;; Commentary:
28 28
29 ;;; split off of mule.el and mostly moved to coding.el 29 ;;; split off of mule.el and mostly moved to coding.el
30
31 ;; Needed for make-8-bit-coding-system.
32 (eval-when-compile (require 'ccl "mule-ccl"))
30 33
31 ;;; Code: 34 ;;; Code:
32 35
33 (defun coding-system-force-on-output (coding-system register) 36 (defun coding-system-force-on-output (coding-system register)
34 "Return the 'force-on-output property of CODING-SYSTEM for the specified REGISTER." 37 "Return the 'force-on-output property of CODING-SYSTEM for the specified REGISTER."
183 seven t 186 seven t
184 lock-shift t 187 lock-shift t
185 mnemonic "ISO7/Lock" 188 mnemonic "ISO7/Lock"
186 documentation "ISO-2022 coding system using Locking-Shift for 96-charset." 189 documentation "ISO-2022 coding system using Locking-Shift for 96-charset."
187 )) 190 ))
188 191
189 ;;; mule-coding.el ends here 192
193 ;; This is used by people writing CCL programs, but is called at runtime.
194 (defun define-translation-hash-table (symbol table)
195 "Define SYMBOL as the name of the hash translation TABLE for use in CCL.
196
197 Analogous to `define-translation-table', but updates
198 `translation-hash-table-vector' and the table is for use in the CCL
199 `lookup-integer' and `lookup-character' functions."
200 (unless (and (symbolp symbol)
201 (hash-table-p table))
202 (error "Bad args to define-translation-hash-table"))
203 (let ((len (length translation-hash-table-vector))
204 (id 0)
205 done)
206 (put symbol 'translation-hash-table table)
207 (while (not done)
208 (if (>= id len)
209 (setq translation-hash-table-vector
210 (vconcat translation-hash-table-vector [nil])))
211 (let ((slot (aref translation-hash-table-vector id)))
212 (if (or (not slot)
213 (eq (car slot) symbol))
214 (progn
215 (aset translation-hash-table-vector id (cons symbol table))
216 (setq done t))
217 (setq id (1+ id)))))
218 (put symbol 'translation-hash-table-id id)
219 id))
220
221 (defvar make-8-bit-private-use-start (decode-char 'ucs #xE000)
222 "Start of a 256 code private use area for make-8-bit-coding-system.
223
224 This is used to ensure that distinct octets on disk for a given coding
225 system map to distinct XEmacs characters, preventing a spurious changes when
226 a file is read, not changed, and then written. ")
227
228 (defun make-8-bit-generate-helper (decode-table encode-table
229 encode-failure-octet)
230 "Helper function for `make-8-bit-generate-encode-program', which see.
231
232 Deals with the case where ASCII and another character set provide the
233 can both be encoded unambiguously into the coding-system; if this is
234 so, returns a list corresponding to such a ccl-program. If not, it
235 returns nil. "
236 (let ((tentative-encode-program-parts
237 (eval-when-compile
238 (let* ((compiled
239 (append
240 (ccl-compile
241 `(1
242 (loop
243 (read-multibyte-character r0 r1)
244 (if (r0 == ,(charset-id 'ascii))
245 (write r1)
246 ((if (r0 == #xABAB)
247 ;; #xBFFE is a sentinel in the compiled
248 ;; program.
249 (write r1 ,(make-vector 256 #xBFFE))
250 ((mule-to-unicode r0 r1)
251 (if (r0 == #xFFFD)
252 (write #xBEEF)
253 ((lookup-integer encode-table-sym r0 r3)
254 (if r7
255 (write-multibyte-character r0 r3)
256 (write #xBEEF))))))))
257 (repeat)))) nil))
258 (first-part compiled)
259 (last-part
260 (member-if-not (lambda (entr) (eq #xBFFE entr))
261 (member-if
262 (lambda (entr) (eq #xBFFE entr))
263 first-part))))
264 (while compiled
265 (if (eq #xBFFE (cadr compiled))
266 (setcdr compiled nil))
267 (setq compiled (cdr compiled)))
268 ;; Is the generated code as we expect it to be?
269 (assert (and (memq #xABAB first-part)
270 (memq #xBEEF14 last-part))
271 nil
272 "This code assumes that the constant #xBEEF is #xBEEF14 in \
273 compiled CCL code,\nand that the constant #xABAB is #xABAB. If that is
274 not the case, and it appears not to be--that's why you're getting this
275 message--it will not work. ")
276 (list first-part last-part))))
277 (charset-lower -1)
278 (charset-upper -1)
279 worth-trying known-charsets encode-program
280 other-charset-vector ucs)
281
282 (loop for char across decode-table
283 do (pushnew (char-charset char) known-charsets))
284 (setq known-charsets (delq 'ascii known-charsets))
285
286 (loop for known-charset in known-charsets
287 do
288 ;; This is not possible for two dimensional charsets.
289 (when (eq 1 (charset-dimension known-charset))
290 (setq args-out-of-range t)
291 (if (eq 'control-1 known-charset)
292 (setq charset-lower 0
293 charset-upper 31)
294 ;; There should be a nicer way to get the limits here.
295 (condition-case args-out-of-range
296 (make-char known-charset #x100)
297 (args-out-of-range
298 (setq charset-lower (third args-out-of-range)
299 charset-upper (fourth args-out-of-range)))))
300 (loop
301 for i from charset-lower to charset-upper
302 always (and (setq ucs
303 (encode-char (make-char known-charset i) 'ucs))
304 (gethash ucs encode-table))
305 finally (setq worth-trying known-charset))
306
307 ;; Only trying this for one charset at a time, the first find.
308 (when worth-trying (return))
309
310 ;; Okay, this charset is not worth trying, Try the next.
311 (setq charset-lower -1
312 charset-upper -1
313 worth-trying nil)))
314
315 (when worth-trying
316 (setq other-charset-vector (make-vector 256 encode-failure-octet))
317 (loop for i from charset-lower to charset-upper
318 do (aset other-charset-vector (+ #x80 i)
319 (gethash (encode-char (make-char worth-trying i)
320 'ucs) encode-table)))
321 (setq encode-program
322 (nsublis
323 (list (cons #xABAB (charset-id worth-trying)))
324 (nconc
325 (copy-list (first
326 tentative-encode-program-parts))
327 (append other-charset-vector nil)
328 (copy-tree (second
329 tentative-encode-program-parts))))))
330 encode-program))
331
332 (defun make-8-bit-generate-encode-program (decode-table encode-table
333 encode-failure-octet)
334 "Generate a CCL program to decode a 8-bit fixed-width charset.
335
336 DECODE-TABLE must have 256 non-cons entries, and will be regarded as
337 describing a map from the octet corresponding to an offset in the
338 table to the that entry in the table. ENCODE-TABLE is a hash table
339 map from unicode values to characters in the range [0,255].
340 ENCODE-FAILURE-OCTET describes an integer between 0 and 255
341 \(inclusive) to write in the event that a character cannot be encoded. "
342 (check-argument-type #'vectorp decode-table)
343 (check-argument-range (length decode-table) #x100 #x100)
344 (check-argument-type #'hash-table-p encode-table)
345 (check-argument-type #'integerp encode-failure-octet)
346 (check-argument-range encode-failure-octet #x00 #xFF)
347 (let ((encode-program nil)
348 (general-encode-program
349 (eval-when-compile
350 (let ((prog (append
351 (ccl-compile
352 `(1
353 (loop
354 (read-multibyte-character r0 r1)
355 (mule-to-unicode r0 r1)
356 (if (r0 == #xFFFD)
357 (write #xBEEF)
358 ((lookup-integer encode-table-sym r0 r3)
359 (if r7
360 (write-multibyte-character r0 r3)
361 (write #xBEEF))))
362 (repeat)))) nil)))
363 (assert (memq #xBEEF14 prog)
364 nil
365 "This code assumes that the constant #xBEEF is #xBEEF14 \
366 in compiled CCL code.\nIf that is not the case, and it appears not to
367 be--that's why you're getting this message--it will not work. ")
368 prog)))
369 (encode-program-with-ascii-optimisation
370 (eval-when-compile
371 (let ((prog (append
372 (ccl-compile
373 `(1
374 (loop
375 (read-multibyte-character r0 r1)
376 (if (r0 == ,(charset-id 'ascii))
377 (write r1)
378 ((mule-to-unicode r0 r1)
379 (if (r0 == #xFFFD)
380 (write #xBEEF)
381 ((lookup-integer encode-table-sym r0 r3)
382 (if r7
383 (write-multibyte-character r0 r3)
384 (write #xBEEF))))))
385 (repeat)))) nil)))
386 (assert (memq #xBEEF14 prog)
387 nil
388 "This code assumes that the constant #xBEEF is #xBEEF14 \
389 in compiled CCL code.\nIf that is not the case, and it appears not to
390 be--that's why you're getting this message--it will not work. ")
391 prog)))
392 (ascii-encodes-as-itself nil))
393
394 ;; Is this coding system ASCII-compatible? If so, we can avoid the hash
395 ;; table lookup for those characters.
396 (loop
397 for i from #x00 to #x7f
398 always (eq (int-to-char i) (gethash i encode-table))
399 finally (setq ascii-encodes-as-itself t))
400
401 ;; Note that this logic handles EBCDIC badly. For example, CP037,
402 ;; MIME name ebcdic-na, has the entire repertoire of ASCII and
403 ;; Latin 1, and thus a more optimal ccl encode program would check
404 ;; for those character sets and use tables. But for now, we do a
405 ;; hash table lookup for every character.
406 (if (null ascii-encodes-as-itself)
407 ;; General encode program. Pros; general and correct. Cons;
408 ;; slow, a hash table lookup + mule-unicode conversion is done
409 ;; for every character encoding.
410 (setq encode-program general-encode-program)
411 (setq encode-program
412 ;; Encode program with ascii-ascii mapping (based on a
413 ;; character's mule character set), and one other mule
414 ;; character set using table-based encoding, other
415 ;; character sets using hash table lookups.
416 ;; make-8-bit-non-ascii-completely-coveredp only returns
417 ;; such a mapping if some non-ASCII charset with
418 ;; characters in decode-table is entirely covered by
419 ;; encode-table.
420 (make-8-bit-generate-helper decode-table encode-table
421 encode-failure-octet))
422 (unless encode-program
423 ;; If make-8-bit-non-ascii-completely-coveredp returned nil,
424 ;; but ASCII still encodes as itself, do one-to-one mapping
425 ;; for ASCII, and a hash table lookup for everything else.
426 (setq encode-program encode-program-with-ascii-optimisation)))
427
428 (setq encode-program
429 (nsublis
430 (list (cons #xBEEF14
431 (logior (lsh encode-failure-octet 8)
432 #x14)))
433 (copy-tree encode-program)))
434 encode-program))
435
436 (defun make-8-bit-create-decode-encode-tables (unicode-map)
437 "Return a list \(DECODE-TABLE ENCODE-TABLE) given UNICODE-MAP.
438 UNICODE-MAP should be an alist mapping from integer octet values to
439 characters with UCS code points; DECODE-TABLE will be a 256-element
440 vector, and ENCODE-TABLE will be a hash table mapping from 256 numbers
441 to 256 distinct characters. "
442 (check-argument-type #'listp unicode-map)
443 (let ((decode-table (make-vector 256 nil))
444 (encode-table (make-hash-table :size 256))
445 (private-use-start (encode-char make-8-bit-private-use-start 'ucs))
446 desired-ucs)
447
448 (loop for (external internal)
449 in unicode-map
450 do
451 (aset decode-table external internal)
452 (assert (not (eq (encode-char internal 'ucs) -1))
453 nil
454 "Looks like you're calling make-8-bit-coding-system in a \
455 dumped file, \nand you're either not providing a literal UNICODE-MAP
456 or PROPS. Don't do that; make-8-bit-coding-system relies on sensible
457 Unicode mappings being available, which they are at compile time for
458 dumped files (but this requires the mentioned literals), but not, for
459 most of them, at run time. ")
460
461 (puthash (encode-char internal 'ucs)
462 ;; This is semantically an integer, but Dave Love's design
463 ;; for lookup-integer in CCL means we need to store it as a
464 ;; character.
465 (int-to-char external)
466 encode-table))
467
468 ;; Now, go through the decode table looking at the characters that
469 ;; remain nil. If the XEmacs character with that integer is already in
470 ;; the encode table, map the on-disk octet to a Unicode private use
471 ;; character. Otherwise map the on-disk octet to the XEmacs character
472 ;; with that numeric value, to make it clearer what it is.
473 (dotimes (i 256)
474 (when (null (aref decode-table i))
475 ;; Find a free code point.
476 (setq desired-ucs i)
477 (while (gethash desired-ucs encode-table)
478 ;; In the normal case, the code point chosen will be U+E0XY, where
479 ;; XY is the hexadecimal octet on disk. In pathological cases
480 ;; it'll be something else.
481 (setq desired-ucs (+ private-use-start desired-ucs)
482 private-use-start (+ private-use-start 1)))
483 (aset decode-table i (decode-char 'ucs desired-ucs))
484 (puthash desired-ucs (int-to-char i) encode-table)))
485 (values decode-table encode-table)))
486
487 (defun make-8-bit-generate-decode-program (decode-table)
488 "Given DECODE-TABLE, generate a CCL program to decode an 8-bit charset.
489 DECODE-TABLE must have 256 non-cons entries, and will be regarded as
490 describing a map from the octet corresponding to an offset in the
491 table to the that entry in the table. "
492 (check-argument-type #'vectorp decode-table)
493 (check-argument-range (length decode-table) #x100 #x100)
494 (let ((decode-program-parts
495 (eval-when-compile
496 (let* ((compiled
497 (append
498 (ccl-compile
499 `(3
500 ((read r0)
501 (loop
502 (write-read-repeat r0 ,(make-vector
503 256 'sentinel)))))) nil))
504 (first-part compiled)
505 (last-part
506 (member-if-not #'symbolp
507 (member-if-not #'integerp first-part))))
508 ;; Chop off the sentinel sentinel sentinel [..] part.
509 (while compiled
510 (if (symbolp (cadr compiled))
511 (setcdr compiled nil))
512 (setq compiled (cdr compiled)))
513 (list first-part last-part)))))
514 (nconc
515 ;; copy-list needed, because the structure of the literal provided
516 ;; by our eval-when-compile hangs around.
517 (copy-list (first decode-program-parts))
518 (append decode-table nil)
519 (second decode-program-parts))))
520
521 ;;;###autoload
522 (defun make-8-bit-coding-system (name unicode-map &optional description props)
523 "Make and return a fixed-width 8-bit CCL coding system named NAME.
524 NAME must be a symbol, and UNICODE-MAP a list.
525
526 UNICODE-MAP is a plist describing a map from octets in the coding
527 system NAME (as integers) to XEmacs characters. Those XEmacs
528 characters will be used explicitly on decoding, but for encoding (most
529 relevantly, on writing to disk) XEmacs characters that map to the same
530 Unicode code point will be unified. This means that the ISO-8859-?
531 characters that map to the same Unicode code point will not be
532 distinct when written to disk, which is normally what is intended; it
533 also means that East Asian Han characters from different XEmacs
534 character sets will not be distinct when written to disk, which is
535 less often what is intended.
536
537 Any octets not mapped will be decoded into the ISO 8859-1 characters with
538 the corresponding numeric value; unless another octet maps to that
539 character, in which case the Unicode private use area will be used. This
540 avoids spurious changes to files on disk when they contain octets that would
541 be otherwise remapped to the canonical values for the corresponding
542 characters in the coding system.
543
544 DESCRIPTION and PROPS are as in `make-coding-system', which see. This
545 function also accepts two additional (optional) properties in PROPS;
546 `aliases', giving a list of aliases to be initialized for this
547 coding-system, and `encode-failure-octet', an integer between 0 and 256 to
548 write in place of XEmacs characters that cannot be encoded, defaulting to
549 the code for tilde `~'. "
550 (check-argument-type #'symbolp name)
551 (check-argument-type #'listp unicode-map)
552 (check-argument-type #'stringp
553 (or description
554 (setq description
555 (format "Coding system used for %s." name))))
556 (check-valid-plist props)
557 (let ((encode-failure-octet (or (plist-get props 'encode-failure-octet)
558 (char-to-int ?~)))
559 (aliases (plist-get props 'aliases))
560 (hash-table-sym (gentemp (format "%s-encode-table" name)))
561 encode-program decode-program result decode-table encode-table)
562
563 ;; Some more sanity checking.
564 (check-argument-range encode-failure-octet 0 #xFF)
565 (check-argument-type #'listp aliases)
566
567 ;; Don't pass on our extra data to make-coding-system.
568 (setq props (plist-remprop props 'encode-failure-octet)
569 props (plist-remprop props 'aliases))
570
571 (multiple-value-setq
572 (decode-table encode-table)
573 (make-8-bit-create-decode-encode-tables unicode-map))
574
575 ;; Register the decode-table.
576 (define-translation-hash-table hash-table-sym encode-table)
577
578 ;; Generate the programs.
579 (setq decode-program (make-8-bit-generate-decode-program decode-table)
580 encode-program (make-8-bit-generate-encode-program
581 decode-table encode-table encode-failure-octet))
582 (unless (vectorp encode-program)
583 (setq encode-program
584 (apply #'vector
585 (nsublis (list (cons 'encode-table-sym hash-table-sym))
586 (copy-tree encode-program)))))
587 (unless (vectorp decode-program)
588 (setq decode-program
589 (apply #'vector decode-program)))
590
591 ;; And now generate the actual coding system.
592 (setq result
593 (make-coding-system
594 name 'ccl
595 description
596 (plist-put (plist-put props 'decode decode-program)
597 'encode encode-program)))
598 (coding-system-put name 'category 'iso-8-1)
599 (loop for alias in aliases
600 do (define-coding-system-alias alias name))
601 result))
602
603 ;;;###autoload
604 (define-compiler-macro make-8-bit-coding-system (&whole form name unicode-map
605 &optional description props)
606
607 ;; We provide the compiler macro (= macro that is expanded only on
608 ;; compilation, and that can punt to a runtime version of the
609 ;; associate function if necessary) not for reasons of speed, though
610 ;; it does speed up things at runtime a little, but because the
611 ;; Unicode mappings are available at compile time in the dumped
612 ;; files, but they are not available at run time for the vast
613 ;; majority of them.
614
615 (if (not (and (and (consp name) (eq (car name) 'quote))
616 (and (consp unicode-map) (eq (car unicode-map) 'quote))
617 (and (or (and (consp props) (eq (car props) 'quote))
618 (null props)))))
619 ;; The call does not use literals; do it at runtime.
620 form
621 (setq name (cadr name)
622 unicode-map (cadr unicode-map)
623 props (if props (cadr props)))
624 (let ((encode-failure-octet
625 (or (plist-get props 'encode-failure-octet) (char-to-int ?~)))
626 (aliases (plist-get props 'aliases))
627 encode-program decode-program
628 decode-table encode-table res)
629
630 ;; Some sanity checking.
631 (check-argument-range encode-failure-octet 0 #xFF)
632 (check-argument-type #'listp aliases)
633
634 ;; Don't pass on our extra data to make-coding-system.
635 (setq props (plist-remprop props 'encode-failure-octet)
636 props (plist-remprop props 'aliases))
637
638 ;; Work out encode-table and decode-table.
639 (multiple-value-setq
640 (decode-table encode-table)
641 (make-8-bit-create-decode-encode-tables unicode-map))
642
643 ;; Generate the decode and encode programs.
644 (setq decode-program (make-8-bit-generate-decode-program decode-table)
645 encode-program (make-8-bit-generate-encode-program
646 decode-table encode-table encode-failure-octet))
647
648 ;; And return the generated code.
649 `(let ((encode-table-sym (gentemp (format "%s-encode-table" ',name)))
650 result)
651 (define-translation-hash-table encode-table-sym ,encode-table)
652 (setq result
653 (make-coding-system
654 ',name 'ccl ,description
655 (plist-put (plist-put ',props 'decode
656 ,(apply #'vector decode-program))
657 'encode
658 (apply #'vector
659 (nsublis
660 (list (cons
661 'encode-table-sym
662 (symbol-value 'encode-table-sym)))
663 ',encode-program)))))
664 (coding-system-put ',name 'category 'iso-8-1)
665 ,(macroexpand `(loop for alias in ',aliases
666 do (define-coding-system-alias alias
667 ',name)))
668 'result))))
669
670