Mercurial > hg > xemacs-beta
diff lisp/mule/mule-coding.el @ 4072:aa28d959af41
[xemacs-hg @ 2007-07-22 22:03:29 by aidan]
Add support for non-ISO2022 8 bit fixed-width coding-systems
author | aidan |
---|---|
date | Sun, 22 Jul 2007 22:04:14 +0000 |
parents | 943eaba38521 |
children | 476d0799d704 |
line wrap: on
line diff
--- a/lisp/mule/mule-coding.el Sun Jul 22 21:53:08 2007 +0000 +++ b/lisp/mule/mule-coding.el Sun Jul 22 22:04:14 2007 +0000 @@ -28,6 +28,9 @@ ;;; split off of mule.el and mostly moved to coding.el +;; Needed for make-8-bit-coding-system. +(eval-when-compile (require 'ccl "mule-ccl")) + ;;; Code: (defun coding-system-force-on-output (coding-system register) @@ -185,5 +188,483 @@ mnemonic "ISO7/Lock" documentation "ISO-2022 coding system using Locking-Shift for 96-charset." )) + -;;; mule-coding.el ends here +;; This is used by people writing CCL programs, but is called at runtime. +(defun define-translation-hash-table (symbol table) + "Define SYMBOL as the name of the hash translation TABLE for use in CCL. + +Analogous to `define-translation-table', but updates +`translation-hash-table-vector' and the table is for use in the CCL +`lookup-integer' and `lookup-character' functions." + (unless (and (symbolp symbol) + (hash-table-p table)) + (error "Bad args to define-translation-hash-table")) + (let ((len (length translation-hash-table-vector)) + (id 0) + done) + (put symbol 'translation-hash-table table) + (while (not done) + (if (>= id len) + (setq translation-hash-table-vector + (vconcat translation-hash-table-vector [nil]))) + (let ((slot (aref translation-hash-table-vector id))) + (if (or (not slot) + (eq (car slot) symbol)) + (progn + (aset translation-hash-table-vector id (cons symbol table)) + (setq done t)) + (setq id (1+ id))))) + (put symbol 'translation-hash-table-id id) + id)) + +(defvar make-8-bit-private-use-start (decode-char 'ucs #xE000) + "Start of a 256 code private use area for make-8-bit-coding-system. + +This is used to ensure that distinct octets on disk for a given coding +system map to distinct XEmacs characters, preventing a spurious changes when +a file is read, not changed, and then written. ") + +(defun make-8-bit-generate-helper (decode-table encode-table + encode-failure-octet) + "Helper function for `make-8-bit-generate-encode-program', which see. + +Deals with the case where ASCII and another character set provide the +can both be encoded unambiguously into the coding-system; if this is +so, returns a list corresponding to such a ccl-program. If not, it +returns nil. " + (let ((tentative-encode-program-parts + (eval-when-compile + (let* ((compiled + (append + (ccl-compile + `(1 + (loop + (read-multibyte-character r0 r1) + (if (r0 == ,(charset-id 'ascii)) + (write r1) + ((if (r0 == #xABAB) + ;; #xBFFE is a sentinel in the compiled + ;; program. + (write r1 ,(make-vector 256 #xBFFE)) + ((mule-to-unicode r0 r1) + (if (r0 == #xFFFD) + (write #xBEEF) + ((lookup-integer encode-table-sym r0 r3) + (if r7 + (write-multibyte-character r0 r3) + (write #xBEEF)))))))) + (repeat)))) nil)) + (first-part compiled) + (last-part + (member-if-not (lambda (entr) (eq #xBFFE entr)) + (member-if + (lambda (entr) (eq #xBFFE entr)) + first-part)))) + (while compiled + (if (eq #xBFFE (cadr compiled)) + (setcdr compiled nil)) + (setq compiled (cdr compiled))) + ;; Is the generated code as we expect it to be? + (assert (and (memq #xABAB first-part) + (memq #xBEEF14 last-part)) + nil + "This code assumes that the constant #xBEEF is #xBEEF14 in \ +compiled CCL code,\nand that the constant #xABAB is #xABAB. If that is +not the case, and it appears not to be--that's why you're getting this +message--it will not work. ") + (list first-part last-part)))) + (charset-lower -1) + (charset-upper -1) + worth-trying known-charsets encode-program + other-charset-vector ucs) + + (loop for char across decode-table + do (pushnew (char-charset char) known-charsets)) + (setq known-charsets (delq 'ascii known-charsets)) + + (loop for known-charset in known-charsets + do + ;; This is not possible for two dimensional charsets. + (when (eq 1 (charset-dimension known-charset)) + (setq args-out-of-range t) + (if (eq 'control-1 known-charset) + (setq charset-lower 0 + charset-upper 31) + ;; There should be a nicer way to get the limits here. + (condition-case args-out-of-range + (make-char known-charset #x100) + (args-out-of-range + (setq charset-lower (third args-out-of-range) + charset-upper (fourth args-out-of-range))))) + (loop + for i from charset-lower to charset-upper + always (and (setq ucs + (encode-char (make-char known-charset i) 'ucs)) + (gethash ucs encode-table)) + finally (setq worth-trying known-charset)) + + ;; Only trying this for one charset at a time, the first find. + (when worth-trying (return)) + + ;; Okay, this charset is not worth trying, Try the next. + (setq charset-lower -1 + charset-upper -1 + worth-trying nil))) + + (when worth-trying + (setq other-charset-vector (make-vector 256 encode-failure-octet)) + (loop for i from charset-lower to charset-upper + do (aset other-charset-vector (+ #x80 i) + (gethash (encode-char (make-char worth-trying i) + 'ucs) encode-table))) + (setq encode-program + (nsublis + (list (cons #xABAB (charset-id worth-trying))) + (nconc + (copy-list (first + tentative-encode-program-parts)) + (append other-charset-vector nil) + (copy-tree (second + tentative-encode-program-parts)))))) + encode-program)) + +(defun make-8-bit-generate-encode-program (decode-table encode-table + encode-failure-octet) + "Generate a CCL program to decode a 8-bit fixed-width charset. + +DECODE-TABLE must have 256 non-cons entries, and will be regarded as +describing a map from the octet corresponding to an offset in the +table to the that entry in the table. ENCODE-TABLE is a hash table +map from unicode values to characters in the range [0,255]. +ENCODE-FAILURE-OCTET describes an integer between 0 and 255 +\(inclusive) to write in the event that a character cannot be encoded. " + (check-argument-type #'vectorp decode-table) + (check-argument-range (length decode-table) #x100 #x100) + (check-argument-type #'hash-table-p encode-table) + (check-argument-type #'integerp encode-failure-octet) + (check-argument-range encode-failure-octet #x00 #xFF) + (let ((encode-program nil) + (general-encode-program + (eval-when-compile + (let ((prog (append + (ccl-compile + `(1 + (loop + (read-multibyte-character r0 r1) + (mule-to-unicode r0 r1) + (if (r0 == #xFFFD) + (write #xBEEF) + ((lookup-integer encode-table-sym r0 r3) + (if r7 + (write-multibyte-character r0 r3) + (write #xBEEF)))) + (repeat)))) nil))) + (assert (memq #xBEEF14 prog) + nil + "This code assumes that the constant #xBEEF is #xBEEF14 \ +in compiled CCL code.\nIf that is not the case, and it appears not to +be--that's why you're getting this message--it will not work. ") + prog))) + (encode-program-with-ascii-optimisation + (eval-when-compile + (let ((prog (append + (ccl-compile + `(1 + (loop + (read-multibyte-character r0 r1) + (if (r0 == ,(charset-id 'ascii)) + (write r1) + ((mule-to-unicode r0 r1) + (if (r0 == #xFFFD) + (write #xBEEF) + ((lookup-integer encode-table-sym r0 r3) + (if r7 + (write-multibyte-character r0 r3) + (write #xBEEF)))))) + (repeat)))) nil))) + (assert (memq #xBEEF14 prog) + nil + "This code assumes that the constant #xBEEF is #xBEEF14 \ +in compiled CCL code.\nIf that is not the case, and it appears not to +be--that's why you're getting this message--it will not work. ") + prog))) + (ascii-encodes-as-itself nil)) + + ;; Is this coding system ASCII-compatible? If so, we can avoid the hash + ;; table lookup for those characters. + (loop + for i from #x00 to #x7f + always (eq (int-to-char i) (gethash i encode-table)) + finally (setq ascii-encodes-as-itself t)) + + ;; Note that this logic handles EBCDIC badly. For example, CP037, + ;; MIME name ebcdic-na, has the entire repertoire of ASCII and + ;; Latin 1, and thus a more optimal ccl encode program would check + ;; for those character sets and use tables. But for now, we do a + ;; hash table lookup for every character. + (if (null ascii-encodes-as-itself) + ;; General encode program. Pros; general and correct. Cons; + ;; slow, a hash table lookup + mule-unicode conversion is done + ;; for every character encoding. + (setq encode-program general-encode-program) + (setq encode-program + ;; Encode program with ascii-ascii mapping (based on a + ;; character's mule character set), and one other mule + ;; character set using table-based encoding, other + ;; character sets using hash table lookups. + ;; make-8-bit-non-ascii-completely-coveredp only returns + ;; such a mapping if some non-ASCII charset with + ;; characters in decode-table is entirely covered by + ;; encode-table. + (make-8-bit-generate-helper decode-table encode-table + encode-failure-octet)) + (unless encode-program + ;; If make-8-bit-non-ascii-completely-coveredp returned nil, + ;; but ASCII still encodes as itself, do one-to-one mapping + ;; for ASCII, and a hash table lookup for everything else. + (setq encode-program encode-program-with-ascii-optimisation))) + + (setq encode-program + (nsublis + (list (cons #xBEEF14 + (logior (lsh encode-failure-octet 8) + #x14))) + (copy-tree encode-program))) + encode-program)) + +(defun make-8-bit-create-decode-encode-tables (unicode-map) + "Return a list \(DECODE-TABLE ENCODE-TABLE) given UNICODE-MAP. +UNICODE-MAP should be an alist mapping from integer octet values to +characters with UCS code points; DECODE-TABLE will be a 256-element +vector, and ENCODE-TABLE will be a hash table mapping from 256 numbers +to 256 distinct characters. " + (check-argument-type #'listp unicode-map) + (let ((decode-table (make-vector 256 nil)) + (encode-table (make-hash-table :size 256)) + (private-use-start (encode-char make-8-bit-private-use-start 'ucs)) + desired-ucs) + + (loop for (external internal) + in unicode-map + do + (aset decode-table external internal) + (assert (not (eq (encode-char internal 'ucs) -1)) + nil + "Looks like you're calling make-8-bit-coding-system in a \ +dumped file, \nand you're either not providing a literal UNICODE-MAP +or PROPS. Don't do that; make-8-bit-coding-system relies on sensible +Unicode mappings being available, which they are at compile time for +dumped files (but this requires the mentioned literals), but not, for +most of them, at run time. ") + + (puthash (encode-char internal 'ucs) + ;; This is semantically an integer, but Dave Love's design + ;; for lookup-integer in CCL means we need to store it as a + ;; character. + (int-to-char external) + encode-table)) + + ;; Now, go through the decode table looking at the characters that + ;; remain nil. If the XEmacs character with that integer is already in + ;; the encode table, map the on-disk octet to a Unicode private use + ;; character. Otherwise map the on-disk octet to the XEmacs character + ;; with that numeric value, to make it clearer what it is. + (dotimes (i 256) + (when (null (aref decode-table i)) + ;; Find a free code point. + (setq desired-ucs i) + (while (gethash desired-ucs encode-table) + ;; In the normal case, the code point chosen will be U+E0XY, where + ;; XY is the hexadecimal octet on disk. In pathological cases + ;; it'll be something else. + (setq desired-ucs (+ private-use-start desired-ucs) + private-use-start (+ private-use-start 1))) + (aset decode-table i (decode-char 'ucs desired-ucs)) + (puthash desired-ucs (int-to-char i) encode-table))) + (values decode-table encode-table))) + +(defun make-8-bit-generate-decode-program (decode-table) + "Given DECODE-TABLE, generate a CCL program to decode an 8-bit charset. +DECODE-TABLE must have 256 non-cons entries, and will be regarded as +describing a map from the octet corresponding to an offset in the +table to the that entry in the table. " + (check-argument-type #'vectorp decode-table) + (check-argument-range (length decode-table) #x100 #x100) + (let ((decode-program-parts + (eval-when-compile + (let* ((compiled + (append + (ccl-compile + `(3 + ((read r0) + (loop + (write-read-repeat r0 ,(make-vector + 256 'sentinel)))))) nil)) + (first-part compiled) + (last-part + (member-if-not #'symbolp + (member-if-not #'integerp first-part)))) + ;; Chop off the sentinel sentinel sentinel [..] part. + (while compiled + (if (symbolp (cadr compiled)) + (setcdr compiled nil)) + (setq compiled (cdr compiled))) + (list first-part last-part))))) + (nconc + ;; copy-list needed, because the structure of the literal provided + ;; by our eval-when-compile hangs around. + (copy-list (first decode-program-parts)) + (append decode-table nil) + (second decode-program-parts)))) + +;;;###autoload +(defun make-8-bit-coding-system (name unicode-map &optional description props) + "Make and return a fixed-width 8-bit CCL coding system named NAME. +NAME must be a symbol, and UNICODE-MAP a list. + +UNICODE-MAP is a plist describing a map from octets in the coding +system NAME (as integers) to XEmacs characters. Those XEmacs +characters will be used explicitly on decoding, but for encoding (most +relevantly, on writing to disk) XEmacs characters that map to the same +Unicode code point will be unified. This means that the ISO-8859-? +characters that map to the same Unicode code point will not be +distinct when written to disk, which is normally what is intended; it +also means that East Asian Han characters from different XEmacs +character sets will not be distinct when written to disk, which is +less often what is intended. + +Any octets not mapped will be decoded into the ISO 8859-1 characters with +the corresponding numeric value; unless another octet maps to that +character, in which case the Unicode private use area will be used. This +avoids spurious changes to files on disk when they contain octets that would +be otherwise remapped to the canonical values for the corresponding +characters in the coding system. + +DESCRIPTION and PROPS are as in `make-coding-system', which see. This +function also accepts two additional (optional) properties in PROPS; +`aliases', giving a list of aliases to be initialized for this +coding-system, and `encode-failure-octet', an integer between 0 and 256 to +write in place of XEmacs characters that cannot be encoded, defaulting to +the code for tilde `~'. " + (check-argument-type #'symbolp name) + (check-argument-type #'listp unicode-map) + (check-argument-type #'stringp + (or description + (setq description + (format "Coding system used for %s." name)))) + (check-valid-plist props) + (let ((encode-failure-octet (or (plist-get props 'encode-failure-octet) + (char-to-int ?~))) + (aliases (plist-get props 'aliases)) + (hash-table-sym (gentemp (format "%s-encode-table" name))) + encode-program decode-program result decode-table encode-table) + + ;; Some more sanity checking. + (check-argument-range encode-failure-octet 0 #xFF) + (check-argument-type #'listp aliases) + + ;; Don't pass on our extra data to make-coding-system. + (setq props (plist-remprop props 'encode-failure-octet) + props (plist-remprop props 'aliases)) + + (multiple-value-setq + (decode-table encode-table) + (make-8-bit-create-decode-encode-tables unicode-map)) + + ;; Register the decode-table. + (define-translation-hash-table hash-table-sym encode-table) + + ;; Generate the programs. + (setq decode-program (make-8-bit-generate-decode-program decode-table) + encode-program (make-8-bit-generate-encode-program + decode-table encode-table encode-failure-octet)) + (unless (vectorp encode-program) + (setq encode-program + (apply #'vector + (nsublis (list (cons 'encode-table-sym hash-table-sym)) + (copy-tree encode-program))))) + (unless (vectorp decode-program) + (setq decode-program + (apply #'vector decode-program))) + + ;; And now generate the actual coding system. + (setq result + (make-coding-system + name 'ccl + description + (plist-put (plist-put props 'decode decode-program) + 'encode encode-program))) + (coding-system-put name 'category 'iso-8-1) + (loop for alias in aliases + do (define-coding-system-alias alias name)) + result)) + +;;;###autoload +(define-compiler-macro make-8-bit-coding-system (&whole form name unicode-map + &optional description props) + + ;; We provide the compiler macro (= macro that is expanded only on + ;; compilation, and that can punt to a runtime version of the + ;; associate function if necessary) not for reasons of speed, though + ;; it does speed up things at runtime a little, but because the + ;; Unicode mappings are available at compile time in the dumped + ;; files, but they are not available at run time for the vast + ;; majority of them. + + (if (not (and (and (consp name) (eq (car name) 'quote)) + (and (consp unicode-map) (eq (car unicode-map) 'quote)) + (and (or (and (consp props) (eq (car props) 'quote)) + (null props))))) + ;; The call does not use literals; do it at runtime. + form + (setq name (cadr name) + unicode-map (cadr unicode-map) + props (if props (cadr props))) + (let ((encode-failure-octet + (or (plist-get props 'encode-failure-octet) (char-to-int ?~))) + (aliases (plist-get props 'aliases)) + encode-program decode-program + decode-table encode-table res) + + ;; Some sanity checking. + (check-argument-range encode-failure-octet 0 #xFF) + (check-argument-type #'listp aliases) + + ;; Don't pass on our extra data to make-coding-system. + (setq props (plist-remprop props 'encode-failure-octet) + props (plist-remprop props 'aliases)) + + ;; Work out encode-table and decode-table. + (multiple-value-setq + (decode-table encode-table) + (make-8-bit-create-decode-encode-tables unicode-map)) + + ;; Generate the decode and encode programs. + (setq decode-program (make-8-bit-generate-decode-program decode-table) + encode-program (make-8-bit-generate-encode-program + decode-table encode-table encode-failure-octet)) + + ;; And return the generated code. + `(let ((encode-table-sym (gentemp (format "%s-encode-table" ',name))) + result) + (define-translation-hash-table encode-table-sym ,encode-table) + (setq result + (make-coding-system + ',name 'ccl ,description + (plist-put (plist-put ',props 'decode + ,(apply #'vector decode-program)) + 'encode + (apply #'vector + (nsublis + (list (cons + 'encode-table-sym + (symbol-value 'encode-table-sym))) + ',encode-program))))) + (coding-system-put ',name 'category 'iso-8-1) + ,(macroexpand `(loop for alias in ',aliases + do (define-coding-system-alias alias + ',name))) + 'result)))) + + \ No newline at end of file