diff lisp/mule/mule-coding.el @ 4072:aa28d959af41

[xemacs-hg @ 2007-07-22 22:03:29 by aidan] Add support for non-ISO2022 8 bit fixed-width coding-systems
author aidan
date Sun, 22 Jul 2007 22:04:14 +0000
parents 943eaba38521
children 476d0799d704
line wrap: on
line diff
--- a/lisp/mule/mule-coding.el	Sun Jul 22 21:53:08 2007 +0000
+++ b/lisp/mule/mule-coding.el	Sun Jul 22 22:04:14 2007 +0000
@@ -28,6 +28,9 @@
 
 ;;; split off of mule.el and mostly moved to coding.el
 
+;; Needed for make-8-bit-coding-system. 
+(eval-when-compile (require 'ccl "mule-ccl"))
+
 ;;; Code:
 
 (defun coding-system-force-on-output (coding-system register)
@@ -185,5 +188,483 @@
    mnemonic "ISO7/Lock"
    documentation "ISO-2022 coding system using Locking-Shift for 96-charset."
    ))
+
 
-;;; mule-coding.el ends here
+;; This is used by people writing CCL programs, but is called at runtime.
+(defun define-translation-hash-table (symbol table)
+  "Define SYMBOL as the name of the hash translation TABLE for use in CCL.
+
+Analogous to `define-translation-table', but updates
+`translation-hash-table-vector' and the table is for use in the CCL
+`lookup-integer' and `lookup-character' functions."
+  (unless (and (symbolp symbol)
+	       (hash-table-p table))
+    (error "Bad args to define-translation-hash-table"))
+  (let ((len (length translation-hash-table-vector))
+	(id 0)
+	done)
+    (put symbol 'translation-hash-table table)
+    (while (not done)
+      (if (>= id len)
+	  (setq translation-hash-table-vector
+		(vconcat translation-hash-table-vector [nil])))
+      (let ((slot (aref translation-hash-table-vector id)))
+	(if (or (not slot)
+		(eq (car slot) symbol))
+	    (progn
+	      (aset translation-hash-table-vector id (cons symbol table))
+	      (setq done t))
+	  (setq id (1+ id)))))
+    (put symbol 'translation-hash-table-id id)
+    id))
+
+(defvar make-8-bit-private-use-start (decode-char 'ucs #xE000)
+  "Start of a 256 code private use area for make-8-bit-coding-system.
+
+This is used to ensure that distinct octets on disk for a given coding
+system map to distinct XEmacs characters, preventing a spurious changes when
+a file is read, not changed, and then written.  ")
+
+(defun make-8-bit-generate-helper (decode-table encode-table
+				   encode-failure-octet)
+  "Helper function for `make-8-bit-generate-encode-program', which see.
+
+Deals with the case where ASCII and another character set provide the
+can both be encoded unambiguously into the coding-system; if this is
+so, returns a list corresponding to such a ccl-program.  If not, it
+returns nil.  "
+  (let ((tentative-encode-program-parts
+	 (eval-when-compile 
+	   (let* ((compiled 
+		   (append
+                    (ccl-compile
+                     `(1
+                       (loop
+                         (read-multibyte-character r0 r1)
+                         (if (r0 == ,(charset-id 'ascii))
+                             (write r1)
+                           ((if (r0 == #xABAB)
+                                ;; #xBFFE is a sentinel in the compiled
+                                ;; program.
+                                (write r1 ,(make-vector 256 #xBFFE))
+                              ((mule-to-unicode r0 r1)
+                               (if (r0 == #xFFFD)
+                                   (write #xBEEF)
+                                 ((lookup-integer encode-table-sym r0 r3)
+                                  (if r7
+                                      (write-multibyte-character r0 r3)
+                                    (write #xBEEF))))))))
+                         (repeat)))) nil))
+		  (first-part compiled)
+		  (last-part
+		   (member-if-not (lambda (entr) (eq #xBFFE entr))
+				  (member-if
+                                   (lambda (entr) (eq #xBFFE entr))
+                                   first-part))))
+	     (while compiled
+	       (if (eq #xBFFE (cadr compiled))
+		   (setcdr compiled nil))
+	       (setq compiled (cdr compiled)))
+             ;; Is the generated code as we expect it to be?
+	     (assert (and (memq #xABAB first-part)
+			  (memq #xBEEF14 last-part))
+	    nil
+	    "This code assumes that the constant #xBEEF is #xBEEF14 in \
+compiled CCL code,\nand that the constant #xABAB is #xABAB. If that is
+not the case, and it appears not to be--that's why you're getting this
+message--it will not work.  ")
+	     (list first-part last-part))))
+	(charset-lower -1)
+	(charset-upper -1)
+	worth-trying known-charsets encode-program
+	other-charset-vector ucs)
+
+    (loop for char across decode-table
+      do (pushnew (char-charset char) known-charsets))
+    (setq known-charsets (delq 'ascii known-charsets))
+
+    (loop for known-charset in known-charsets 
+      do
+      ;; This is not possible for two dimensional charsets. 
+      (when (eq 1 (charset-dimension known-charset))
+	(setq args-out-of-range t)
+        (if (eq 'control-1 known-charset)
+            (setq charset-lower 0
+                  charset-upper 31)
+	  ;; There should be a nicer way to get the limits here.
+          (condition-case args-out-of-range
+              (make-char known-charset #x100)
+            (args-out-of-range 
+             (setq charset-lower (third args-out-of-range)
+                   charset-upper (fourth args-out-of-range)))))
+	(loop
+	  for i from charset-lower to charset-upper
+	  always (and (setq ucs 
+			    (encode-char (make-char known-charset i) 'ucs))
+		      (gethash ucs encode-table))
+	  finally (setq worth-trying known-charset))
+
+	;; Only trying this for one charset at a time, the first find.
+	(when worth-trying (return))
+
+	;; Okay, this charset is not worth trying, Try the next.
+	(setq charset-lower -1
+	      charset-upper -1
+	      worth-trying nil)))
+
+    (when worth-trying
+      (setq other-charset-vector (make-vector 256 encode-failure-octet))
+      (loop for i from charset-lower to charset-upper
+        do (aset other-charset-vector (+ #x80 i)
+		 (gethash (encode-char (make-char worth-trying i)
+				       'ucs) encode-table)))
+      (setq encode-program
+            (nsublis
+             (list (cons #xABAB (charset-id worth-trying)))
+             (nconc
+              (copy-list (first 
+                          tentative-encode-program-parts))
+              (append other-charset-vector nil)
+              (copy-tree (second 
+                          tentative-encode-program-parts))))))
+    encode-program))
+
+(defun make-8-bit-generate-encode-program (decode-table encode-table
+					   encode-failure-octet)
+  "Generate a CCL program to decode a 8-bit fixed-width charset.
+
+DECODE-TABLE must have 256 non-cons entries, and will be regarded as
+describing a map from the octet corresponding to an offset in the
+table to the that entry in the table.  ENCODE-TABLE is a hash table
+map from unicode values to characters in the range [0,255].
+ENCODE-FAILURE-OCTET describes an integer between 0 and 255
+\(inclusive) to write in the event that a character cannot be encoded.  "
+  (check-argument-type #'vectorp decode-table)
+  (check-argument-range (length decode-table) #x100 #x100)
+  (check-argument-type #'hash-table-p encode-table)
+  (check-argument-type #'integerp encode-failure-octet)
+  (check-argument-range encode-failure-octet #x00 #xFF)
+  (let ((encode-program nil)
+	(general-encode-program
+	 (eval-when-compile
+	   (let ((prog (append 
+			(ccl-compile
+			 `(1
+			   (loop
+			     (read-multibyte-character r0 r1)
+			     (mule-to-unicode r0 r1)
+			     (if (r0 == #xFFFD)
+				 (write #xBEEF)
+			       ((lookup-integer encode-table-sym r0 r3)
+				(if r7
+				    (write-multibyte-character r0 r3)
+				  (write #xBEEF))))
+			     (repeat)))) nil)))
+	     (assert (memq #xBEEF14 prog)
+		     nil
+		     "This code assumes that the constant #xBEEF is #xBEEF14 \
+in compiled CCL code.\nIf that is not the case, and it appears not to
+be--that's why you're getting this message--it will not work.  ")
+	     prog)))
+	(encode-program-with-ascii-optimisation
+	 (eval-when-compile 
+	   (let ((prog (append
+			(ccl-compile
+			 `(1
+			   (loop
+			     (read-multibyte-character r0 r1)
+			     (if (r0 == ,(charset-id 'ascii))
+				 (write r1)
+			       ((mule-to-unicode r0 r1)
+				(if (r0 == #xFFFD)
+				    (write #xBEEF)
+				  ((lookup-integer encode-table-sym r0 r3)
+				   (if r7
+				       (write-multibyte-character r0 r3)
+				     (write #xBEEF))))))
+			     (repeat)))) nil)))
+	     (assert (memq #xBEEF14 prog)
+		     nil
+		     "This code assumes that the constant #xBEEF is #xBEEF14 \
+in compiled CCL code.\nIf that is not the case, and it appears not to
+be--that's why you're getting this message--it will not work.  ")
+	     prog)))
+         (ascii-encodes-as-itself nil))
+
+    ;; Is this coding system ASCII-compatible? If so, we can avoid the hash
+    ;; table lookup for those characters. 
+    (loop
+      for i from #x00 to #x7f
+      always (eq (int-to-char i) (gethash i encode-table))
+      finally (setq ascii-encodes-as-itself t))
+
+    ;; Note that this logic handles EBCDIC badly. For example, CP037,
+    ;; MIME name ebcdic-na, has the entire repertoire of ASCII and
+    ;; Latin 1, and thus a more optimal ccl encode program would check
+    ;; for those character sets and use tables. But for now, we do a
+    ;; hash table lookup for every character.
+    (if (null ascii-encodes-as-itself)
+	;; General encode program. Pros; general and correct. Cons;
+	;; slow, a hash table lookup + mule-unicode conversion is done
+	;; for every character encoding. 
+	(setq encode-program general-encode-program)
+      (setq encode-program
+	    ;; Encode program with ascii-ascii mapping (based on a
+	    ;; character's mule character set), and one other mule
+	    ;; character set using table-based encoding, other
+	    ;; character sets using hash table lookups.
+	    ;; make-8-bit-non-ascii-completely-coveredp only returns
+	    ;; such a mapping if some non-ASCII charset with
+	    ;; characters in decode-table is entirely covered by
+	    ;; encode-table.
+	    (make-8-bit-generate-helper decode-table encode-table
+					encode-failure-octet))
+      (unless encode-program
+	;; If make-8-bit-non-ascii-completely-coveredp returned nil,
+	;; but ASCII still encodes as itself, do one-to-one mapping
+	;; for ASCII, and a hash table lookup for everything else.
+	(setq encode-program encode-program-with-ascii-optimisation)))
+
+    (setq encode-program
+          (nsublis
+           (list (cons #xBEEF14 
+                       (logior (lsh encode-failure-octet 8)
+                               #x14)))
+           (copy-tree encode-program)))
+    encode-program))
+
+(defun make-8-bit-create-decode-encode-tables (unicode-map)
+  "Return a list \(DECODE-TABLE ENCODE-TABLE) given UNICODE-MAP. 
+UNICODE-MAP should be an alist mapping from integer octet values to
+characters with UCS code points; DECODE-TABLE will be a 256-element
+vector, and ENCODE-TABLE will be a hash table mapping from 256 numbers
+to 256 distinct characters.  "
+  (check-argument-type #'listp unicode-map)
+  (let ((decode-table (make-vector 256 nil))
+        (encode-table (make-hash-table :size 256))
+	(private-use-start (encode-char make-8-bit-private-use-start 'ucs))
+	desired-ucs)
+
+    (loop for (external internal)
+      in unicode-map
+      do
+      (aset decode-table external internal)
+      (assert (not (eq (encode-char internal 'ucs) -1))
+	      nil
+	      "Looks like you're calling make-8-bit-coding-system in a \
+dumped file, \nand you're either not providing a literal UNICODE-MAP
+or PROPS. Don't do that; make-8-bit-coding-system relies on sensible
+Unicode mappings being available, which they are at compile time for
+dumped files (but this requires the mentioned literals), but not, for
+most of them, at run time.  ")
+
+      (puthash (encode-char internal 'ucs)
+	       ;; This is semantically an integer, but Dave Love's design
+	       ;; for lookup-integer in CCL means we need to store it as a
+	       ;; character.
+	       (int-to-char external)
+	       encode-table))
+
+    ;; Now, go through the decode table looking at the characters that
+    ;; remain nil. If the XEmacs character with that integer is already in
+    ;; the encode table, map the on-disk octet to a Unicode private use
+    ;; character. Otherwise map the on-disk octet to the XEmacs character
+    ;; with that numeric value, to make it clearer what it is.
+    (dotimes (i 256)
+      (when (null (aref decode-table i))
+	;; Find a free code point. 
+	(setq desired-ucs i)
+	(while (gethash desired-ucs encode-table)
+	  ;; In the normal case, the code point chosen will be U+E0XY, where
+	  ;; XY is the hexadecimal octet on disk. In pathological cases
+	  ;; it'll be something else.
+	  (setq desired-ucs (+ private-use-start desired-ucs)
+		private-use-start (+ private-use-start 1)))
+	(aset decode-table i (decode-char 'ucs desired-ucs))
+	(puthash desired-ucs (int-to-char i) encode-table)))
+    (values decode-table encode-table)))
+
+(defun make-8-bit-generate-decode-program (decode-table)
+  "Given DECODE-TABLE, generate a CCL program to decode an 8-bit charset.
+DECODE-TABLE must have 256 non-cons entries, and will be regarded as
+describing a map from the octet corresponding to an offset in the
+table to the that entry in the table.  "
+  (check-argument-type #'vectorp decode-table)
+  (check-argument-range (length decode-table) #x100 #x100)
+  (let ((decode-program-parts
+	 (eval-when-compile
+	   (let* ((compiled
+		   (append 
+		    (ccl-compile 
+		     `(3
+		       ((read r0)
+                        (loop
+			 (write-read-repeat r0 ,(make-vector
+						 256 'sentinel)))))) nil))
+		  (first-part compiled)
+		  (last-part
+		   (member-if-not #'symbolp
+				  (member-if-not #'integerp first-part))))
+	     ;; Chop off the sentinel sentinel sentinel [..] part. 
+	     (while compiled
+	       (if (symbolp (cadr compiled))
+		   (setcdr compiled nil))
+	       (setq compiled (cdr compiled)))
+	     (list first-part last-part)))))
+   (nconc
+    ;; copy-list needed, because the structure of the literal provided
+    ;; by our eval-when-compile hangs around.
+    (copy-list (first decode-program-parts))
+    (append decode-table nil)
+    (second decode-program-parts))))
+
+;;;###autoload
+(defun make-8-bit-coding-system (name unicode-map &optional description props)
+  "Make and return a fixed-width 8-bit CCL coding system named NAME.
+NAME must be a symbol, and UNICODE-MAP a list. 
+
+UNICODE-MAP is a plist describing a map from octets in the coding
+system NAME (as integers) to XEmacs characters.  Those XEmacs
+characters will be used explicitly on decoding, but for encoding (most
+relevantly, on writing to disk) XEmacs characters that map to the same
+Unicode code point will be unified.  This means that the ISO-8859-? 
+characters that map to the same Unicode code point will not be
+distinct when written to disk, which is normally what is intended; it
+also means that East Asian Han characters from different XEmacs
+character sets will not be distinct when written to disk, which is
+less often what is intended.
+
+Any octets not mapped will be decoded into the ISO 8859-1 characters with
+the corresponding numeric value; unless another octet maps to that
+character, in which case the Unicode private use area will be used.  This
+avoids spurious changes to files on disk when they contain octets that would
+be otherwise remapped to the canonical values for the corresponding
+characters in the coding system.
+
+DESCRIPTION and PROPS are as in `make-coding-system', which see.  This
+function also accepts two additional (optional) properties in PROPS;
+`aliases', giving a list of aliases to be initialized for this
+coding-system, and `encode-failure-octet', an integer between 0 and 256 to
+write in place of XEmacs characters that cannot be encoded, defaulting to
+the code for tilde `~'.  "
+  (check-argument-type #'symbolp name)
+  (check-argument-type #'listp unicode-map)
+  (check-argument-type #'stringp
+		       (or description 
+			   (setq description
+				 (format "Coding system used for %s." name))))
+  (check-valid-plist props)
+  (let  ((encode-failure-octet (or (plist-get props 'encode-failure-octet)
+				   (char-to-int ?~)))
+	 (aliases (plist-get props 'aliases))
+	 (hash-table-sym (gentemp (format "%s-encode-table" name)))
+	 encode-program decode-program result decode-table encode-table)
+
+    ;; Some more sanity checking. 
+    (check-argument-range encode-failure-octet 0 #xFF)
+    (check-argument-type #'listp aliases)
+
+    ;; Don't pass on our extra data to make-coding-system.
+    (setq props (plist-remprop props 'encode-failure-octet)
+	  props (plist-remprop props 'aliases))
+
+    (multiple-value-setq
+	(decode-table encode-table)
+      (make-8-bit-create-decode-encode-tables unicode-map))
+
+    ;; Register the decode-table. 
+    (define-translation-hash-table hash-table-sym encode-table)
+
+    ;; Generate the programs. 
+    (setq decode-program (make-8-bit-generate-decode-program decode-table)
+          encode-program (make-8-bit-generate-encode-program
+                          decode-table encode-table encode-failure-octet))
+    (unless (vectorp encode-program) 
+      (setq encode-program 
+	    (apply #'vector
+		   (nsublis (list (cons 'encode-table-sym hash-table-sym))
+			    (copy-tree encode-program)))))
+    (unless (vectorp decode-program)
+      (setq decode-program
+	    (apply #'vector decode-program)))
+
+    ;; And now generate the actual coding system.
+    (setq result 
+	  (make-coding-system 
+           name  'ccl
+           description 
+           (plist-put (plist-put props 'decode decode-program)
+                      'encode encode-program)))
+    (coding-system-put name 'category 'iso-8-1)
+    (loop for alias in aliases
+      do (define-coding-system-alias alias name))
+    result))
+
+;;;###autoload
+(define-compiler-macro make-8-bit-coding-system (&whole form name unicode-map
+						 &optional description props)
+
+  ;; We provide the compiler macro (= macro that is expanded only on
+  ;; compilation, and that can punt to a runtime version of the
+  ;; associate function if necessary) not for reasons of speed, though
+  ;; it does speed up things at runtime a little, but because the
+  ;; Unicode mappings are available at compile time in the dumped
+  ;; files, but they are not available at run time for the vast
+  ;; majority of them.
+
+  (if (not (and (and (consp name) (eq (car name) 'quote))
+		(and (consp unicode-map) (eq (car unicode-map) 'quote))
+		(and (or (and (consp props) (eq (car props) 'quote))
+			 (null props)))))
+      ;; The call does not use literals; do it at runtime.
+      form
+    (setq name (cadr name)
+	  unicode-map (cadr unicode-map)
+	  props (if props (cadr props)))
+    (let  ((encode-failure-octet
+	    (or (plist-get props 'encode-failure-octet) (char-to-int ?~)))
+	   (aliases (plist-get props 'aliases))
+	   encode-program decode-program
+	   decode-table encode-table res)
+
+      ;; Some sanity checking. 
+      (check-argument-range encode-failure-octet 0 #xFF)
+      (check-argument-type #'listp aliases)
+
+      ;; Don't pass on our extra data to make-coding-system.
+      (setq props (plist-remprop props 'encode-failure-octet)
+	    props (plist-remprop props 'aliases))
+
+      ;; Work out encode-table and decode-table. 
+      (multiple-value-setq
+	  (decode-table encode-table)
+	(make-8-bit-create-decode-encode-tables unicode-map))
+
+      ;; Generate the decode and encode programs. 
+      (setq decode-program (make-8-bit-generate-decode-program decode-table)
+	    encode-program (make-8-bit-generate-encode-program
+			    decode-table encode-table encode-failure-octet))
+
+      ;; And return the generated code. 
+      `(let ((encode-table-sym (gentemp (format "%s-encode-table" ',name)))
+             result)
+        (define-translation-hash-table encode-table-sym ,encode-table)
+        (setq result 
+              (make-coding-system 
+               ',name 'ccl ,description
+               (plist-put (plist-put ',props 'decode 
+                                     ,(apply #'vector decode-program))
+                          'encode 
+                          (apply #'vector
+                                 (nsublis
+                                  (list (cons
+                                         'encode-table-sym
+                                         (symbol-value 'encode-table-sym)))
+                                  ',encode-program)))))
+        (coding-system-put ',name 'category 'iso-8-1)
+        ,(macroexpand `(loop for alias in ',aliases
+                        do (define-coding-system-alias alias
+                             ',name)))
+        'result))))
+    
+  
\ No newline at end of file