502
+ − 1 ;;; mule-coding.el --- Coding-system functions for Mule. -*- coding: iso-2022-7bit; -*-
333
+ − 2
+ − 3 ;; Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
+ − 4 ;; Licensed to the Free Software Foundation.
+ − 5 ;; Copyright (C) 1995 Amdahl Corporation.
+ − 6 ;; Copyright (C) 1995 Sun Microsystems.
+ − 7 ;; Copyright (C) 1997 MORIOKA Tomohiko
771
+ − 8 ;; Copyright (C) 2001 Ben Wing.
333
+ − 9
+ − 10 ;; This file is part of XEmacs.
+ − 11
+ − 12 ;; XEmacs is free software; you can redistribute it and/or modify it
+ − 13 ;; under the terms of the GNU General Public License as published by
+ − 14 ;; the Free Software Foundation; either version 2, or (at your option)
+ − 15 ;; any later version.
+ − 16
+ − 17 ;; XEmacs is distributed in the hope that it will be useful, but
+ − 18 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
+ − 19 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ − 20 ;; General Public License for more details.
+ − 21
+ − 22 ;; You should have received a copy of the GNU General Public License
444
+ − 23 ;; along with XEmacs; see the file COPYING. If not, write to the
333
+ − 24 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ − 25 ;; Boston, MA 02111-1307, USA.
+ − 26
+ − 27 ;;; Commentary:
+ − 28
+ − 29 ;;; split off of mule.el and mostly moved to coding.el
+ − 30
4072
+ − 31 ;; Needed for make-8-bit-coding-system.
4080
+ − 32 (eval-when-compile (require 'ccl))
4072
+ − 33
333
+ − 34 ;;; Code:
+ − 35
+ − 36 (defun coding-system-force-on-output (coding-system register)
+ − 37 "Return the 'force-on-output property of CODING-SYSTEM for the specified REGISTER."
444
+ − 38 (check-type register integer)
333
+ − 39 (coding-system-property
+ − 40 coding-system
+ − 41 (case register
+ − 42 (0 'force-g0-on-output)
+ − 43 (1 'force-g1-on-output)
+ − 44 (2 'force-g2-on-output)
+ − 45 (3 'force-g3-on-output)
+ − 46 (t (signal 'args-out-of-range (list register 0 3))))))
+ − 47
+ − 48 (defun coding-system-short (coding-system)
+ − 49 "Return the 'short property of CODING-SYSTEM."
+ − 50 (coding-system-property coding-system 'short))
+ − 51
+ − 52 (defun coding-system-no-ascii-eol (coding-system)
+ − 53 "Return the 'no-ascii-eol property of CODING-SYSTEM."
+ − 54 (coding-system-property coding-system 'no-ascii-eol))
+ − 55
+ − 56 (defun coding-system-no-ascii-cntl (coding-system)
+ − 57 "Return the 'no-ascii-cntl property of CODING-SYSTEM."
+ − 58 (coding-system-property coding-system 'no-ascii-cntl))
+ − 59
+ − 60 (defun coding-system-seven (coding-system)
+ − 61 "Return the 'seven property of CODING-SYSTEM."
+ − 62 (coding-system-property coding-system 'seven))
+ − 63
+ − 64 (defun coding-system-lock-shift (coding-system)
+ − 65 "Return the 'lock-shift property of CODING-SYSTEM."
+ − 66 (coding-system-property coding-system 'lock-shift))
+ − 67
+ − 68 ;;(defun coding-system-use-japanese-jisx0201-roman (coding-system)
+ − 69 ;; "Return the 'use-japanese-jisx0201-roman property of CODING-SYSTEM."
+ − 70 ;; (coding-system-property coding-system 'use-japanese-jisx0201-roman))
+ − 71
+ − 72 ;;(defun coding-system-use-japanese-jisx0208-1978 (coding-system)
+ − 73 ;; "Return the 'use-japanese-jisx0208-1978 property of CODING-SYSTEM."
+ − 74 ;; (coding-system-property coding-system 'use-japanese-jisx0208-2978))
+ − 75
+ − 76 (defun coding-system-no-iso6429 (coding-system)
+ − 77 "Return the 'no-iso6429 property of CODING-SYSTEM."
+ − 78 (coding-system-property coding-system 'no-iso6429))
+ − 79
+ − 80 (defun coding-system-ccl-encode (coding-system)
+ − 81 "Return the CCL 'encode property of CODING-SYSTEM."
+ − 82 (coding-system-property coding-system 'encode))
+ − 83
+ − 84 (defun coding-system-ccl-decode (coding-system)
+ − 85 "Return the CCL 'decode property of CODING-SYSTEM."
+ − 86 (coding-system-property coding-system 'decode))
+ − 87
771
+ − 88 (defun coding-system-iso2022-charset (coding-system register)
+ − 89 "Return the charset initially designated to REGISTER in CODING-SYSTEM.
+ − 90 The allowable range of REGISTER is 0 through 3."
+ − 91 (if (or (< register 0) (> register 3))
+ − 92 (error 'args-out-of-range "coding-system-charset REGISTER" register 0 3))
+ − 93 (coding-system-property coding-system (nth register '(charset-g0
+ − 94 charset-g1
+ − 95 charset-g2
+ − 96 charset-g3))))
+ − 97
333
+ − 98
+ − 99 ;;;; Definitions of predefined coding systems
+ − 100
+ − 101 (make-coding-system
+ − 102 'ctext 'iso2022
771
+ − 103 "Compound Text"
333
+ − 104 '(charset-g0 ascii
+ − 105 charset-g1 latin-iso8859-1
+ − 106 eol-type nil
+ − 107 mnemonic "CText"))
+ − 108
+ − 109 (make-coding-system
+ − 110 'iso-2022-8bit-ss2 'iso2022
771
+ − 111 "ISO-2022 8-bit w/SS2"
333
+ − 112 '(charset-g0 ascii
+ − 113 charset-g1 latin-iso8859-1
+ − 114 charset-g2 t ;; unspecified but can be used later.
+ − 115 short t
+ − 116 mnemonic "ISO8/SS"
771
+ − 117 documentation "ISO 2022 based 8-bit encoding using SS2 for 96-charset"
333
+ − 118 ))
+ − 119
+ − 120 (make-coding-system
+ − 121 'iso-2022-7bit-ss2 'iso2022
771
+ − 122 "ISO-2022 7-bit w/SS2"
333
+ − 123 '(charset-g0 ascii
+ − 124 charset-g2 t ;; unspecified but can be used later.
+ − 125 seven t
+ − 126 short t
+ − 127 mnemonic "ISO7/SS"
771
+ − 128 documentation "ISO 2022 based 7-bit encoding using SS2 for 96-charset"
333
+ − 129 eol-type nil))
+ − 130
+ − 131 ;; (copy-coding-system 'iso-2022-7bit-ss2 'iso-2022-jp-2)
+ − 132 (make-coding-system
+ − 133 'iso-2022-jp-2 'iso2022
771
+ − 134 "ISO-2022-JP-2"
333
+ − 135 '(charset-g0 ascii
+ − 136 charset-g2 t ;; unspecified but can be used later.
+ − 137 seven t
+ − 138 short t
+ − 139 mnemonic "ISO7/SS"
+ − 140 eol-type nil))
+ − 141
+ − 142 (make-coding-system
+ − 143 'iso-2022-7bit 'iso2022
771
+ − 144 "ISO 2022 7-bit"
333
+ − 145 '(charset-g0 ascii
+ − 146 seven t
+ − 147 short t
771
+ − 148 mnemonic "ISO7"
+ − 149 documentation "ISO-2022-based 7-bit encoding using only G0"
+ − 150 ))
333
+ − 151
+ − 152 ;; compatibility for old XEmacsen
771
+ − 153 (define-coding-system-alias 'iso-2022-7 'iso-2022-7bit)
333
+ − 154
+ − 155 (make-coding-system
+ − 156 'iso-2022-8 'iso2022
771
+ − 157 "ISO-2022 8-bit"
333
+ − 158 '(charset-g0 ascii
+ − 159 charset-g1 latin-iso8859-1
+ − 160 short t
+ − 161 mnemonic "ISO8"
771
+ − 162 documentation "ISO-2022 eight-bit coding system. No single-shift or locking-shift."
333
+ − 163 ))
+ − 164
+ − 165 (make-coding-system
+ − 166 'escape-quoted 'iso2022
771
+ − 167 "Escape-Quoted (for .ELC files)"
333
+ − 168 '(charset-g0 ascii
+ − 169 charset-g1 latin-iso8859-1
+ − 170 eol-type lf
+ − 171 escape-quoted t
+ − 172 mnemonic "ESC/Quot"
771
+ − 173 documentation "ISO-2022 eight-bit coding system with escape quoting; used for .ELC files."
333
+ − 174 ))
+ − 175
+ − 176 (make-coding-system
+ − 177 'iso-2022-lock 'iso2022
771
+ − 178 "ISO-2022 w/locking-shift"
333
+ − 179 '(charset-g0 ascii
+ − 180 charset-g1 t ;; unspecified but can be used later.
+ − 181 seven t
+ − 182 lock-shift t
+ − 183 mnemonic "ISO7/Lock"
771
+ − 184 documentation "ISO-2022 coding system using Locking-Shift for 96-charset."
333
+ − 185 ))
4072
+ − 186
333
+ − 187
4072
+ − 188 ;; This is used by people writing CCL programs, but is called at runtime.
+ − 189 (defun define-translation-hash-table (symbol table)
+ − 190 "Define SYMBOL as the name of the hash translation TABLE for use in CCL.
+ − 191
+ − 192 Analogous to `define-translation-table', but updates
+ − 193 `translation-hash-table-vector' and the table is for use in the CCL
+ − 194 `lookup-integer' and `lookup-character' functions."
4145
+ − 195 (check-argument-type #'symbolp symbol)
+ − 196 (check-argument-type #'hash-table-p table)
4072
+ − 197 (let ((len (length translation-hash-table-vector))
+ − 198 (id 0)
+ − 199 done)
+ − 200 (put symbol 'translation-hash-table table)
+ − 201 (while (not done)
+ − 202 (if (>= id len)
+ − 203 (setq translation-hash-table-vector
+ − 204 (vconcat translation-hash-table-vector [nil])))
+ − 205 (let ((slot (aref translation-hash-table-vector id)))
+ − 206 (if (or (not slot)
+ − 207 (eq (car slot) symbol))
+ − 208 (progn
+ − 209 (aset translation-hash-table-vector id (cons symbol table))
+ − 210 (setq done t))
+ − 211 (setq id (1+ id)))))
+ − 212 (put symbol 'translation-hash-table-id id)
+ − 213 id))
+ − 214
+ − 215 (defvar make-8-bit-private-use-start (decode-char 'ucs #xE000)
+ − 216 "Start of a 256 code private use area for make-8-bit-coding-system.
+ − 217
+ − 218 This is used to ensure that distinct octets on disk for a given coding
+ − 219 system map to distinct XEmacs characters, preventing a spurious changes when
+ − 220 a file is read, not changed, and then written. ")
+ − 221
+ − 222 (defun make-8-bit-generate-helper (decode-table encode-table
+ − 223 encode-failure-octet)
+ − 224 "Helper function for `make-8-bit-generate-encode-program', which see.
+ − 225
4145
+ − 226 Deals with the case where ASCII and another character set can both be
+ − 227 encoded unambiguously and completely into the coding-system; if this is so,
+ − 228 returns a list corresponding to such a ccl-program. If not, it returns nil. "
4072
+ − 229 (let ((tentative-encode-program-parts
+ − 230 (eval-when-compile
4295
+ − 231 (let* ((vec-len 128)
+ − 232 (compiled
4072
+ − 233 (append
+ − 234 (ccl-compile
+ − 235 `(1
+ − 236 (loop
+ − 237 (read-multibyte-character r0 r1)
+ − 238 (if (r0 == ,(charset-id 'ascii))
+ − 239 (write r1)
+ − 240 ((if (r0 == #xABAB)
+ − 241 ;; #xBFFE is a sentinel in the compiled
+ − 242 ;; program.
4295
+ − 243 ;; #xBFFE is a sentinel in the compiled
+ − 244 ;; program.
+ − 245 ((r0 = r1 & #x7F)
+ − 246 (write r0 ,(make-vector vec-len #xBFFE)))
4072
+ − 247 ((mule-to-unicode r0 r1)
+ − 248 (if (r0 == #xFFFD)
+ − 249 (write #xBEEF)
+ − 250 ((lookup-integer encode-table-sym r0 r3)
+ − 251 (if r7
+ − 252 (write-multibyte-character r0 r3)
+ − 253 (write #xBEEF))))))))
+ − 254 (repeat)))) nil))
+ − 255 (first-part compiled)
+ − 256 (last-part
+ − 257 (member-if-not (lambda (entr) (eq #xBFFE entr))
+ − 258 (member-if
+ − 259 (lambda (entr) (eq #xBFFE entr))
+ − 260 first-part))))
+ − 261 (while compiled
4295
+ − 262 (when (eq #xBFFE (cadr compiled))
+ − 263 (assert (= vec-len (search '(#xBFFE) (cdr compiled)
+ − 264 :test #'/=)) nil
+ − 265 "Strange ccl vector length")
+ − 266 (setcdr compiled nil))
4072
+ − 267 (setq compiled (cdr compiled)))
+ − 268 ;; Is the generated code as we expect it to be?
+ − 269 (assert (and (memq #xABAB first-part)
+ − 270 (memq #xBEEF14 last-part))
+ − 271 nil
+ − 272 "This code assumes that the constant #xBEEF is #xBEEF14 in \
+ − 273 compiled CCL code,\nand that the constant #xABAB is #xABAB. If that is
+ − 274 not the case, and it appears not to be--that's why you're getting this
+ − 275 message--it will not work. ")
4295
+ − 276 (list first-part last-part vec-len))))
4072
+ − 277 (charset-lower -1)
+ − 278 (charset-upper -1)
+ − 279 worth-trying known-charsets encode-program
4295
+ − 280 other-charset-vector ucs args-out-of-range)
4072
+ − 281
+ − 282 (loop for char across decode-table
+ − 283 do (pushnew (char-charset char) known-charsets))
+ − 284 (setq known-charsets (delq 'ascii known-charsets))
+ − 285
+ − 286 (loop for known-charset in known-charsets
+ − 287 do
+ − 288 ;; This is not possible for two dimensional charsets.
+ − 289 (when (eq 1 (charset-dimension known-charset))
+ − 290 (setq args-out-of-range t)
+ − 291 (if (eq 'control-1 known-charset)
+ − 292 (setq charset-lower 0
+ − 293 charset-upper 31)
+ − 294 ;; There should be a nicer way to get the limits here.
+ − 295 (condition-case args-out-of-range
+ − 296 (make-char known-charset #x100)
+ − 297 (args-out-of-range
+ − 298 (setq charset-lower (third args-out-of-range)
+ − 299 charset-upper (fourth args-out-of-range)))))
+ − 300 (loop
+ − 301 for i from charset-lower to charset-upper
+ − 302 always (and (setq ucs
+ − 303 (encode-char (make-char known-charset i) 'ucs))
+ − 304 (gethash ucs encode-table))
+ − 305 finally (setq worth-trying known-charset))
+ − 306
+ − 307 ;; Only trying this for one charset at a time, the first find.
+ − 308 (when worth-trying (return))
+ − 309
+ − 310 ;; Okay, this charset is not worth trying, Try the next.
+ − 311 (setq charset-lower -1
+ − 312 charset-upper -1
+ − 313 worth-trying nil)))
+ − 314
+ − 315 (when worth-trying
4295
+ − 316 (setq other-charset-vector
+ − 317 (make-vector (third tentative-encode-program-parts)
+ − 318 encode-failure-octet))
4072
+ − 319 (loop for i from charset-lower to charset-upper
4090
+ − 320 do (aset other-charset-vector i
4072
+ − 321 (gethash (encode-char (make-char worth-trying i)
+ − 322 'ucs) encode-table)))
+ − 323 (setq encode-program
+ − 324 (nsublis
+ − 325 (list (cons #xABAB (charset-id worth-trying)))
+ − 326 (nconc
+ − 327 (copy-list (first
+ − 328 tentative-encode-program-parts))
+ − 329 (append other-charset-vector nil)
+ − 330 (copy-tree (second
+ − 331 tentative-encode-program-parts))))))
+ − 332 encode-program))
+ − 333
+ − 334 (defun make-8-bit-generate-encode-program (decode-table encode-table
+ − 335 encode-failure-octet)
+ − 336 "Generate a CCL program to decode a 8-bit fixed-width charset.
+ − 337
+ − 338 DECODE-TABLE must have 256 non-cons entries, and will be regarded as
+ − 339 describing a map from the octet corresponding to an offset in the
+ − 340 table to the that entry in the table. ENCODE-TABLE is a hash table
+ − 341 map from unicode values to characters in the range [0,255].
+ − 342 ENCODE-FAILURE-OCTET describes an integer between 0 and 255
+ − 343 \(inclusive) to write in the event that a character cannot be encoded. "
+ − 344 (check-argument-type #'vectorp decode-table)
+ − 345 (check-argument-range (length decode-table) #x100 #x100)
+ − 346 (check-argument-type #'hash-table-p encode-table)
+ − 347 (check-argument-type #'integerp encode-failure-octet)
+ − 348 (check-argument-range encode-failure-octet #x00 #xFF)
+ − 349 (let ((encode-program nil)
+ − 350 (general-encode-program
+ − 351 (eval-when-compile
+ − 352 (let ((prog (append
+ − 353 (ccl-compile
+ − 354 `(1
+ − 355 (loop
+ − 356 (read-multibyte-character r0 r1)
+ − 357 (mule-to-unicode r0 r1)
+ − 358 (if (r0 == #xFFFD)
+ − 359 (write #xBEEF)
+ − 360 ((lookup-integer encode-table-sym r0 r3)
+ − 361 (if r7
+ − 362 (write-multibyte-character r0 r3)
+ − 363 (write #xBEEF))))
+ − 364 (repeat)))) nil)))
+ − 365 (assert (memq #xBEEF14 prog)
+ − 366 nil
+ − 367 "This code assumes that the constant #xBEEF is #xBEEF14 \
+ − 368 in compiled CCL code.\nIf that is not the case, and it appears not to
+ − 369 be--that's why you're getting this message--it will not work. ")
+ − 370 prog)))
+ − 371 (encode-program-with-ascii-optimisation
+ − 372 (eval-when-compile
+ − 373 (let ((prog (append
+ − 374 (ccl-compile
+ − 375 `(1
+ − 376 (loop
+ − 377 (read-multibyte-character r0 r1)
+ − 378 (if (r0 == ,(charset-id 'ascii))
+ − 379 (write r1)
+ − 380 ((mule-to-unicode r0 r1)
+ − 381 (if (r0 == #xFFFD)
+ − 382 (write #xBEEF)
+ − 383 ((lookup-integer encode-table-sym r0 r3)
+ − 384 (if r7
+ − 385 (write-multibyte-character r0 r3)
+ − 386 (write #xBEEF))))))
+ − 387 (repeat)))) nil)))
+ − 388 (assert (memq #xBEEF14 prog)
+ − 389 nil
+ − 390 "This code assumes that the constant #xBEEF is #xBEEF14 \
+ − 391 in compiled CCL code.\nIf that is not the case, and it appears not to
+ − 392 be--that's why you're getting this message--it will not work. ")
+ − 393 prog)))
+ − 394 (ascii-encodes-as-itself nil))
+ − 395
+ − 396 ;; Is this coding system ASCII-compatible? If so, we can avoid the hash
+ − 397 ;; table lookup for those characters.
+ − 398 (loop
+ − 399 for i from #x00 to #x7f
+ − 400 always (eq (int-to-char i) (gethash i encode-table))
+ − 401 finally (setq ascii-encodes-as-itself t))
+ − 402
+ − 403 ;; Note that this logic handles EBCDIC badly. For example, CP037,
+ − 404 ;; MIME name ebcdic-na, has the entire repertoire of ASCII and
+ − 405 ;; Latin 1, and thus a more optimal ccl encode program would check
+ − 406 ;; for those character sets and use tables. But for now, we do a
+ − 407 ;; hash table lookup for every character.
+ − 408 (if (null ascii-encodes-as-itself)
+ − 409 ;; General encode program. Pros; general and correct. Cons;
+ − 410 ;; slow, a hash table lookup + mule-unicode conversion is done
+ − 411 ;; for every character encoding.
+ − 412 (setq encode-program general-encode-program)
+ − 413 (setq encode-program
+ − 414 ;; Encode program with ascii-ascii mapping (based on a
+ − 415 ;; character's mule character set), and one other mule
+ − 416 ;; character set using table-based encoding, other
+ − 417 ;; character sets using hash table lookups.
+ − 418 ;; make-8-bit-non-ascii-completely-coveredp only returns
+ − 419 ;; such a mapping if some non-ASCII charset with
+ − 420 ;; characters in decode-table is entirely covered by
+ − 421 ;; encode-table.
+ − 422 (make-8-bit-generate-helper decode-table encode-table
+ − 423 encode-failure-octet))
+ − 424 (unless encode-program
+ − 425 ;; If make-8-bit-non-ascii-completely-coveredp returned nil,
+ − 426 ;; but ASCII still encodes as itself, do one-to-one mapping
+ − 427 ;; for ASCII, and a hash table lookup for everything else.
+ − 428 (setq encode-program encode-program-with-ascii-optimisation)))
+ − 429
+ − 430 (setq encode-program
+ − 431 (nsublis
+ − 432 (list (cons #xBEEF14
+ − 433 (logior (lsh encode-failure-octet 8)
+ − 434 #x14)))
+ − 435 (copy-tree encode-program)))
+ − 436 encode-program))
+ − 437
+ − 438 (defun make-8-bit-create-decode-encode-tables (unicode-map)
+ − 439 "Return a list \(DECODE-TABLE ENCODE-TABLE) given UNICODE-MAP.
+ − 440 UNICODE-MAP should be an alist mapping from integer octet values to
+ − 441 characters with UCS code points; DECODE-TABLE will be a 256-element
+ − 442 vector, and ENCODE-TABLE will be a hash table mapping from 256 numbers
+ − 443 to 256 distinct characters. "
+ − 444 (check-argument-type #'listp unicode-map)
+ − 445 (let ((decode-table (make-vector 256 nil))
+ − 446 (encode-table (make-hash-table :size 256))
+ − 447 (private-use-start (encode-char make-8-bit-private-use-start 'ucs))
+ − 448 desired-ucs)
+ − 449
+ − 450 (loop for (external internal)
+ − 451 in unicode-map
+ − 452 do
+ − 453 (aset decode-table external internal)
+ − 454 (assert (not (eq (encode-char internal 'ucs) -1))
+ − 455 nil
+ − 456 "Looks like you're calling make-8-bit-coding-system in a \
+ − 457 dumped file, \nand you're either not providing a literal UNICODE-MAP
+ − 458 or PROPS. Don't do that; make-8-bit-coding-system relies on sensible
+ − 459 Unicode mappings being available, which they are at compile time for
+ − 460 dumped files (but this requires the mentioned literals), but not, for
+ − 461 most of them, at run time. ")
+ − 462
+ − 463 (puthash (encode-char internal 'ucs)
+ − 464 ;; This is semantically an integer, but Dave Love's design
+ − 465 ;; for lookup-integer in CCL means we need to store it as a
+ − 466 ;; character.
+ − 467 (int-to-char external)
+ − 468 encode-table))
+ − 469
+ − 470 ;; Now, go through the decode table looking at the characters that
+ − 471 ;; remain nil. If the XEmacs character with that integer is already in
+ − 472 ;; the encode table, map the on-disk octet to a Unicode private use
+ − 473 ;; character. Otherwise map the on-disk octet to the XEmacs character
+ − 474 ;; with that numeric value, to make it clearer what it is.
+ − 475 (dotimes (i 256)
+ − 476 (when (null (aref decode-table i))
+ − 477 ;; Find a free code point.
+ − 478 (setq desired-ucs i)
+ − 479 (while (gethash desired-ucs encode-table)
+ − 480 ;; In the normal case, the code point chosen will be U+E0XY, where
+ − 481 ;; XY is the hexadecimal octet on disk. In pathological cases
+ − 482 ;; it'll be something else.
+ − 483 (setq desired-ucs (+ private-use-start desired-ucs)
+ − 484 private-use-start (+ private-use-start 1)))
4085
+ − 485 (puthash desired-ucs (int-to-char i) encode-table)
+ − 486 (setq desired-ucs (if (> desired-ucs #xFF)
+ − 487 (decode-char 'ucs desired-ucs)
+ − 488 ;; So we get Latin-1 when run at dump time,
+ − 489 ;; instead of JIT-allocated characters.
+ − 490 (int-to-char desired-ucs)))
+ − 491 (aset decode-table i desired-ucs)))
4072
+ − 492 (values decode-table encode-table)))
+ − 493
+ − 494 (defun make-8-bit-generate-decode-program (decode-table)
+ − 495 "Given DECODE-TABLE, generate a CCL program to decode an 8-bit charset.
+ − 496 DECODE-TABLE must have 256 non-cons entries, and will be regarded as
+ − 497 describing a map from the octet corresponding to an offset in the
+ − 498 table to the that entry in the table. "
+ − 499 (check-argument-type #'vectorp decode-table)
+ − 500 (check-argument-range (length decode-table) #x100 #x100)
+ − 501 (let ((decode-program-parts
+ − 502 (eval-when-compile
+ − 503 (let* ((compiled
+ − 504 (append
+ − 505 (ccl-compile
+ − 506 `(3
+ − 507 ((read r0)
+ − 508 (loop
+ − 509 (write-read-repeat r0 ,(make-vector
+ − 510 256 'sentinel)))))) nil))
+ − 511 (first-part compiled)
+ − 512 (last-part
+ − 513 (member-if-not #'symbolp
+ − 514 (member-if-not #'integerp first-part))))
+ − 515 ;; Chop off the sentinel sentinel sentinel [..] part.
+ − 516 (while compiled
+ − 517 (if (symbolp (cadr compiled))
+ − 518 (setcdr compiled nil))
+ − 519 (setq compiled (cdr compiled)))
+ − 520 (list first-part last-part)))))
+ − 521 (nconc
+ − 522 ;; copy-list needed, because the structure of the literal provided
+ − 523 ;; by our eval-when-compile hangs around.
+ − 524 (copy-list (first decode-program-parts))
+ − 525 (append decode-table nil)
+ − 526 (second decode-program-parts))))
+ − 527
4145
+ − 528 (defun make-8-bit-choose-category (decode-table)
+ − 529 "Given DECODE-TABLE, return an appropriate coding category.
+ − 530 DECODE-TABLE is a 256-entry vector describing the mapping from octets on
+ − 531 disk to XEmacs characters for some fixed-width 8-bit coding system. "
+ − 532 (check-argument-type #'vectorp decode-table)
+ − 533 (check-argument-range (length decode-table) #x100 #x100)
+ − 534 (block category
+ − 535 (loop
+ − 536 for i from #x80 to #xBF
+ − 537 do (unless (= i (aref decode-table i))
+ − 538 (return-from category 'no-conversion)))
+ − 539 'iso-8-1))
+ − 540
4072
+ − 541 ;;;###autoload
+ − 542 (defun make-8-bit-coding-system (name unicode-map &optional description props)
+ − 543 "Make and return a fixed-width 8-bit CCL coding system named NAME.
+ − 544 NAME must be a symbol, and UNICODE-MAP a list.
+ − 545
+ − 546 UNICODE-MAP is a plist describing a map from octets in the coding
+ − 547 system NAME (as integers) to XEmacs characters. Those XEmacs
+ − 548 characters will be used explicitly on decoding, but for encoding (most
+ − 549 relevantly, on writing to disk) XEmacs characters that map to the same
+ − 550 Unicode code point will be unified. This means that the ISO-8859-?
+ − 551 characters that map to the same Unicode code point will not be
+ − 552 distinct when written to disk, which is normally what is intended; it
+ − 553 also means that East Asian Han characters from different XEmacs
+ − 554 character sets will not be distinct when written to disk, which is
+ − 555 less often what is intended.
+ − 556
+ − 557 Any octets not mapped will be decoded into the ISO 8859-1 characters with
+ − 558 the corresponding numeric value; unless another octet maps to that
+ − 559 character, in which case the Unicode private use area will be used. This
+ − 560 avoids spurious changes to files on disk when they contain octets that would
+ − 561 be otherwise remapped to the canonical values for the corresponding
+ − 562 characters in the coding system.
+ − 563
+ − 564 DESCRIPTION and PROPS are as in `make-coding-system', which see. This
+ − 565 function also accepts two additional (optional) properties in PROPS;
+ − 566 `aliases', giving a list of aliases to be initialized for this
+ − 567 coding-system, and `encode-failure-octet', an integer between 0 and 256 to
+ − 568 write in place of XEmacs characters that cannot be encoded, defaulting to
+ − 569 the code for tilde `~'. "
+ − 570 (check-argument-type #'symbolp name)
+ − 571 (check-argument-type #'listp unicode-map)
+ − 572 (check-argument-type #'stringp
+ − 573 (or description
+ − 574 (setq description
+ − 575 (format "Coding system used for %s." name))))
+ − 576 (check-valid-plist props)
+ − 577 (let ((encode-failure-octet (or (plist-get props 'encode-failure-octet)
+ − 578 (char-to-int ?~)))
+ − 579 (aliases (plist-get props 'aliases))
+ − 580 (hash-table-sym (gentemp (format "%s-encode-table" name)))
+ − 581 encode-program decode-program result decode-table encode-table)
+ − 582
+ − 583 ;; Some more sanity checking.
+ − 584 (check-argument-range encode-failure-octet 0 #xFF)
+ − 585 (check-argument-type #'listp aliases)
+ − 586
+ − 587 ;; Don't pass on our extra data to make-coding-system.
+ − 588 (setq props (plist-remprop props 'encode-failure-octet)
+ − 589 props (plist-remprop props 'aliases))
+ − 590
+ − 591 (multiple-value-setq
+ − 592 (decode-table encode-table)
+ − 593 (make-8-bit-create-decode-encode-tables unicode-map))
+ − 594
+ − 595 ;; Register the decode-table.
+ − 596 (define-translation-hash-table hash-table-sym encode-table)
+ − 597
+ − 598 ;; Generate the programs.
+ − 599 (setq decode-program (make-8-bit-generate-decode-program decode-table)
+ − 600 encode-program (make-8-bit-generate-encode-program
+ − 601 decode-table encode-table encode-failure-octet))
+ − 602 (unless (vectorp encode-program)
+ − 603 (setq encode-program
+ − 604 (apply #'vector
+ − 605 (nsublis (list (cons 'encode-table-sym hash-table-sym))
+ − 606 (copy-tree encode-program)))))
+ − 607 (unless (vectorp decode-program)
+ − 608 (setq decode-program
+ − 609 (apply #'vector decode-program)))
+ − 610
+ − 611 ;; And now generate the actual coding system.
+ − 612 (setq result
+ − 613 (make-coding-system
+ − 614 name 'ccl
+ − 615 description
+ − 616 (plist-put (plist-put props 'decode decode-program)
+ − 617 'encode encode-program)))
4295
+ − 618 (coding-system-put name '8-bit-fixed t)
4145
+ − 619 (coding-system-put name 'category
+ − 620 (make-8-bit-choose-category decode-table))
4072
+ − 621 (loop for alias in aliases
+ − 622 do (define-coding-system-alias alias name))
+ − 623 result))
+ − 624
+ − 625 (define-compiler-macro make-8-bit-coding-system (&whole form name unicode-map
+ − 626 &optional description props)
+ − 627
+ − 628 ;; We provide the compiler macro (= macro that is expanded only on
+ − 629 ;; compilation, and that can punt to a runtime version of the
+ − 630 ;; associate function if necessary) not for reasons of speed, though
+ − 631 ;; it does speed up things at runtime a little, but because the
+ − 632 ;; Unicode mappings are available at compile time in the dumped
+ − 633 ;; files, but they are not available at run time for the vast
+ − 634 ;; majority of them.
+ − 635
+ − 636 (if (not (and (and (consp name) (eq (car name) 'quote))
+ − 637 (and (consp unicode-map) (eq (car unicode-map) 'quote))
+ − 638 (and (or (and (consp props) (eq (car props) 'quote))
+ − 639 (null props)))))
+ − 640 ;; The call does not use literals; do it at runtime.
+ − 641 form
+ − 642 (setq name (cadr name)
+ − 643 unicode-map (cadr unicode-map)
+ − 644 props (if props (cadr props)))
+ − 645 (let ((encode-failure-octet
+ − 646 (or (plist-get props 'encode-failure-octet) (char-to-int ?~)))
+ − 647 (aliases (plist-get props 'aliases))
+ − 648 encode-program decode-program
4103
+ − 649 decode-table encode-table)
4072
+ − 650
+ − 651 ;; Some sanity checking.
+ − 652 (check-argument-range encode-failure-octet 0 #xFF)
+ − 653 (check-argument-type #'listp aliases)
+ − 654
+ − 655 ;; Don't pass on our extra data to make-coding-system.
+ − 656 (setq props (plist-remprop props 'encode-failure-octet)
+ − 657 props (plist-remprop props 'aliases))
+ − 658
+ − 659 ;; Work out encode-table and decode-table.
+ − 660 (multiple-value-setq
+ − 661 (decode-table encode-table)
+ − 662 (make-8-bit-create-decode-encode-tables unicode-map))
+ − 663
+ − 664 ;; Generate the decode and encode programs.
+ − 665 (setq decode-program (make-8-bit-generate-decode-program decode-table)
+ − 666 encode-program (make-8-bit-generate-encode-program
+ − 667 decode-table encode-table encode-failure-octet))
+ − 668
+ − 669 ;; And return the generated code.
+ − 670 `(let ((encode-table-sym (gentemp (format "%s-encode-table" ',name)))
4103
+ − 671 ;; The case-fold-search bind shouldn't be necessary. If I take
+ − 672 ;; it, out, though, I get:
+ − 673 ;;
+ − 674 ;; (invalid-read-syntax "Multiply defined symbol label" 1)
+ − 675 ;;
+ − 676 ;; when the file is byte compiled.
+ − 677 (case-fold-search t))
4072
+ − 678 (define-translation-hash-table encode-table-sym ,encode-table)
4103
+ − 679 (make-coding-system
+ − 680 ',name 'ccl ,description
+ − 681 (plist-put (plist-put ',props 'decode
+ − 682 ,(apply #'vector decode-program))
+ − 683 'encode
+ − 684 (apply #'vector
+ − 685 (nsublis
+ − 686 (list (cons
+ − 687 'encode-table-sym
+ − 688 (symbol-value 'encode-table-sym)))
+ − 689 ',encode-program))))
4295
+ − 690 (coding-system-put ',name '8-bit-fixed t)
4145
+ − 691 (coding-system-put ',name 'category ',
+ − 692 (make-8-bit-choose-category decode-table))
4072
+ − 693 ,(macroexpand `(loop for alias in ',aliases
+ − 694 do (define-coding-system-alias alias
+ − 695 ',name)))
4103
+ − 696 (find-coding-system ',name)))))
4299
+ − 697
+ − 698 ;; Ideally this would be in latin.el, but code-init.el uses it.
+ − 699 (make-8-bit-coding-system
+ − 700 'iso-8859-1
+ − 701 '() ;; No differences from Latin 1.
+ − 702 "ISO-8859-1 (Latin-1)"
+ − 703 '(mnemonic "Latin 1"
+ − 704 documentation "The most used encoding of Western Europe and the Americas."
+ − 705 aliases (iso-latin-1 latin-1)))
+ − 706