Mercurial > hg > xemacs-beta
comparison lisp/mule/mule-coding.el @ 4072:aa28d959af41
[xemacs-hg @ 2007-07-22 22:03:29 by aidan]
Add support for non-ISO2022 8 bit fixed-width coding-systems
author | aidan |
---|---|
date | Sun, 22 Jul 2007 22:04:14 +0000 |
parents | 943eaba38521 |
children | 476d0799d704 |
comparison
equal
deleted
inserted
replaced
4071:d607d13fca67 | 4072:aa28d959af41 |
---|---|
25 ;; Boston, MA 02111-1307, USA. | 25 ;; Boston, MA 02111-1307, USA. |
26 | 26 |
27 ;;; Commentary: | 27 ;;; Commentary: |
28 | 28 |
29 ;;; split off of mule.el and mostly moved to coding.el | 29 ;;; split off of mule.el and mostly moved to coding.el |
30 | |
31 ;; Needed for make-8-bit-coding-system. | |
32 (eval-when-compile (require 'ccl "mule-ccl")) | |
30 | 33 |
31 ;;; Code: | 34 ;;; Code: |
32 | 35 |
33 (defun coding-system-force-on-output (coding-system register) | 36 (defun coding-system-force-on-output (coding-system register) |
34 "Return the 'force-on-output property of CODING-SYSTEM for the specified REGISTER." | 37 "Return the 'force-on-output property of CODING-SYSTEM for the specified REGISTER." |
183 seven t | 186 seven t |
184 lock-shift t | 187 lock-shift t |
185 mnemonic "ISO7/Lock" | 188 mnemonic "ISO7/Lock" |
186 documentation "ISO-2022 coding system using Locking-Shift for 96-charset." | 189 documentation "ISO-2022 coding system using Locking-Shift for 96-charset." |
187 )) | 190 )) |
188 | 191 |
189 ;;; mule-coding.el ends here | 192 |
193 ;; This is used by people writing CCL programs, but is called at runtime. | |
194 (defun define-translation-hash-table (symbol table) | |
195 "Define SYMBOL as the name of the hash translation TABLE for use in CCL. | |
196 | |
197 Analogous to `define-translation-table', but updates | |
198 `translation-hash-table-vector' and the table is for use in the CCL | |
199 `lookup-integer' and `lookup-character' functions." | |
200 (unless (and (symbolp symbol) | |
201 (hash-table-p table)) | |
202 (error "Bad args to define-translation-hash-table")) | |
203 (let ((len (length translation-hash-table-vector)) | |
204 (id 0) | |
205 done) | |
206 (put symbol 'translation-hash-table table) | |
207 (while (not done) | |
208 (if (>= id len) | |
209 (setq translation-hash-table-vector | |
210 (vconcat translation-hash-table-vector [nil]))) | |
211 (let ((slot (aref translation-hash-table-vector id))) | |
212 (if (or (not slot) | |
213 (eq (car slot) symbol)) | |
214 (progn | |
215 (aset translation-hash-table-vector id (cons symbol table)) | |
216 (setq done t)) | |
217 (setq id (1+ id))))) | |
218 (put symbol 'translation-hash-table-id id) | |
219 id)) | |
220 | |
221 (defvar make-8-bit-private-use-start (decode-char 'ucs #xE000) | |
222 "Start of a 256 code private use area for make-8-bit-coding-system. | |
223 | |
224 This is used to ensure that distinct octets on disk for a given coding | |
225 system map to distinct XEmacs characters, preventing a spurious changes when | |
226 a file is read, not changed, and then written. ") | |
227 | |
228 (defun make-8-bit-generate-helper (decode-table encode-table | |
229 encode-failure-octet) | |
230 "Helper function for `make-8-bit-generate-encode-program', which see. | |
231 | |
232 Deals with the case where ASCII and another character set provide the | |
233 can both be encoded unambiguously into the coding-system; if this is | |
234 so, returns a list corresponding to such a ccl-program. If not, it | |
235 returns nil. " | |
236 (let ((tentative-encode-program-parts | |
237 (eval-when-compile | |
238 (let* ((compiled | |
239 (append | |
240 (ccl-compile | |
241 `(1 | |
242 (loop | |
243 (read-multibyte-character r0 r1) | |
244 (if (r0 == ,(charset-id 'ascii)) | |
245 (write r1) | |
246 ((if (r0 == #xABAB) | |
247 ;; #xBFFE is a sentinel in the compiled | |
248 ;; program. | |
249 (write r1 ,(make-vector 256 #xBFFE)) | |
250 ((mule-to-unicode r0 r1) | |
251 (if (r0 == #xFFFD) | |
252 (write #xBEEF) | |
253 ((lookup-integer encode-table-sym r0 r3) | |
254 (if r7 | |
255 (write-multibyte-character r0 r3) | |
256 (write #xBEEF)))))))) | |
257 (repeat)))) nil)) | |
258 (first-part compiled) | |
259 (last-part | |
260 (member-if-not (lambda (entr) (eq #xBFFE entr)) | |
261 (member-if | |
262 (lambda (entr) (eq #xBFFE entr)) | |
263 first-part)))) | |
264 (while compiled | |
265 (if (eq #xBFFE (cadr compiled)) | |
266 (setcdr compiled nil)) | |
267 (setq compiled (cdr compiled))) | |
268 ;; Is the generated code as we expect it to be? | |
269 (assert (and (memq #xABAB first-part) | |
270 (memq #xBEEF14 last-part)) | |
271 nil | |
272 "This code assumes that the constant #xBEEF is #xBEEF14 in \ | |
273 compiled CCL code,\nand that the constant #xABAB is #xABAB. If that is | |
274 not the case, and it appears not to be--that's why you're getting this | |
275 message--it will not work. ") | |
276 (list first-part last-part)))) | |
277 (charset-lower -1) | |
278 (charset-upper -1) | |
279 worth-trying known-charsets encode-program | |
280 other-charset-vector ucs) | |
281 | |
282 (loop for char across decode-table | |
283 do (pushnew (char-charset char) known-charsets)) | |
284 (setq known-charsets (delq 'ascii known-charsets)) | |
285 | |
286 (loop for known-charset in known-charsets | |
287 do | |
288 ;; This is not possible for two dimensional charsets. | |
289 (when (eq 1 (charset-dimension known-charset)) | |
290 (setq args-out-of-range t) | |
291 (if (eq 'control-1 known-charset) | |
292 (setq charset-lower 0 | |
293 charset-upper 31) | |
294 ;; There should be a nicer way to get the limits here. | |
295 (condition-case args-out-of-range | |
296 (make-char known-charset #x100) | |
297 (args-out-of-range | |
298 (setq charset-lower (third args-out-of-range) | |
299 charset-upper (fourth args-out-of-range))))) | |
300 (loop | |
301 for i from charset-lower to charset-upper | |
302 always (and (setq ucs | |
303 (encode-char (make-char known-charset i) 'ucs)) | |
304 (gethash ucs encode-table)) | |
305 finally (setq worth-trying known-charset)) | |
306 | |
307 ;; Only trying this for one charset at a time, the first find. | |
308 (when worth-trying (return)) | |
309 | |
310 ;; Okay, this charset is not worth trying, Try the next. | |
311 (setq charset-lower -1 | |
312 charset-upper -1 | |
313 worth-trying nil))) | |
314 | |
315 (when worth-trying | |
316 (setq other-charset-vector (make-vector 256 encode-failure-octet)) | |
317 (loop for i from charset-lower to charset-upper | |
318 do (aset other-charset-vector (+ #x80 i) | |
319 (gethash (encode-char (make-char worth-trying i) | |
320 'ucs) encode-table))) | |
321 (setq encode-program | |
322 (nsublis | |
323 (list (cons #xABAB (charset-id worth-trying))) | |
324 (nconc | |
325 (copy-list (first | |
326 tentative-encode-program-parts)) | |
327 (append other-charset-vector nil) | |
328 (copy-tree (second | |
329 tentative-encode-program-parts)))))) | |
330 encode-program)) | |
331 | |
332 (defun make-8-bit-generate-encode-program (decode-table encode-table | |
333 encode-failure-octet) | |
334 "Generate a CCL program to decode a 8-bit fixed-width charset. | |
335 | |
336 DECODE-TABLE must have 256 non-cons entries, and will be regarded as | |
337 describing a map from the octet corresponding to an offset in the | |
338 table to the that entry in the table. ENCODE-TABLE is a hash table | |
339 map from unicode values to characters in the range [0,255]. | |
340 ENCODE-FAILURE-OCTET describes an integer between 0 and 255 | |
341 \(inclusive) to write in the event that a character cannot be encoded. " | |
342 (check-argument-type #'vectorp decode-table) | |
343 (check-argument-range (length decode-table) #x100 #x100) | |
344 (check-argument-type #'hash-table-p encode-table) | |
345 (check-argument-type #'integerp encode-failure-octet) | |
346 (check-argument-range encode-failure-octet #x00 #xFF) | |
347 (let ((encode-program nil) | |
348 (general-encode-program | |
349 (eval-when-compile | |
350 (let ((prog (append | |
351 (ccl-compile | |
352 `(1 | |
353 (loop | |
354 (read-multibyte-character r0 r1) | |
355 (mule-to-unicode r0 r1) | |
356 (if (r0 == #xFFFD) | |
357 (write #xBEEF) | |
358 ((lookup-integer encode-table-sym r0 r3) | |
359 (if r7 | |
360 (write-multibyte-character r0 r3) | |
361 (write #xBEEF)))) | |
362 (repeat)))) nil))) | |
363 (assert (memq #xBEEF14 prog) | |
364 nil | |
365 "This code assumes that the constant #xBEEF is #xBEEF14 \ | |
366 in compiled CCL code.\nIf that is not the case, and it appears not to | |
367 be--that's why you're getting this message--it will not work. ") | |
368 prog))) | |
369 (encode-program-with-ascii-optimisation | |
370 (eval-when-compile | |
371 (let ((prog (append | |
372 (ccl-compile | |
373 `(1 | |
374 (loop | |
375 (read-multibyte-character r0 r1) | |
376 (if (r0 == ,(charset-id 'ascii)) | |
377 (write r1) | |
378 ((mule-to-unicode r0 r1) | |
379 (if (r0 == #xFFFD) | |
380 (write #xBEEF) | |
381 ((lookup-integer encode-table-sym r0 r3) | |
382 (if r7 | |
383 (write-multibyte-character r0 r3) | |
384 (write #xBEEF)))))) | |
385 (repeat)))) nil))) | |
386 (assert (memq #xBEEF14 prog) | |
387 nil | |
388 "This code assumes that the constant #xBEEF is #xBEEF14 \ | |
389 in compiled CCL code.\nIf that is not the case, and it appears not to | |
390 be--that's why you're getting this message--it will not work. ") | |
391 prog))) | |
392 (ascii-encodes-as-itself nil)) | |
393 | |
394 ;; Is this coding system ASCII-compatible? If so, we can avoid the hash | |
395 ;; table lookup for those characters. | |
396 (loop | |
397 for i from #x00 to #x7f | |
398 always (eq (int-to-char i) (gethash i encode-table)) | |
399 finally (setq ascii-encodes-as-itself t)) | |
400 | |
401 ;; Note that this logic handles EBCDIC badly. For example, CP037, | |
402 ;; MIME name ebcdic-na, has the entire repertoire of ASCII and | |
403 ;; Latin 1, and thus a more optimal ccl encode program would check | |
404 ;; for those character sets and use tables. But for now, we do a | |
405 ;; hash table lookup for every character. | |
406 (if (null ascii-encodes-as-itself) | |
407 ;; General encode program. Pros; general and correct. Cons; | |
408 ;; slow, a hash table lookup + mule-unicode conversion is done | |
409 ;; for every character encoding. | |
410 (setq encode-program general-encode-program) | |
411 (setq encode-program | |
412 ;; Encode program with ascii-ascii mapping (based on a | |
413 ;; character's mule character set), and one other mule | |
414 ;; character set using table-based encoding, other | |
415 ;; character sets using hash table lookups. | |
416 ;; make-8-bit-non-ascii-completely-coveredp only returns | |
417 ;; such a mapping if some non-ASCII charset with | |
418 ;; characters in decode-table is entirely covered by | |
419 ;; encode-table. | |
420 (make-8-bit-generate-helper decode-table encode-table | |
421 encode-failure-octet)) | |
422 (unless encode-program | |
423 ;; If make-8-bit-non-ascii-completely-coveredp returned nil, | |
424 ;; but ASCII still encodes as itself, do one-to-one mapping | |
425 ;; for ASCII, and a hash table lookup for everything else. | |
426 (setq encode-program encode-program-with-ascii-optimisation))) | |
427 | |
428 (setq encode-program | |
429 (nsublis | |
430 (list (cons #xBEEF14 | |
431 (logior (lsh encode-failure-octet 8) | |
432 #x14))) | |
433 (copy-tree encode-program))) | |
434 encode-program)) | |
435 | |
436 (defun make-8-bit-create-decode-encode-tables (unicode-map) | |
437 "Return a list \(DECODE-TABLE ENCODE-TABLE) given UNICODE-MAP. | |
438 UNICODE-MAP should be an alist mapping from integer octet values to | |
439 characters with UCS code points; DECODE-TABLE will be a 256-element | |
440 vector, and ENCODE-TABLE will be a hash table mapping from 256 numbers | |
441 to 256 distinct characters. " | |
442 (check-argument-type #'listp unicode-map) | |
443 (let ((decode-table (make-vector 256 nil)) | |
444 (encode-table (make-hash-table :size 256)) | |
445 (private-use-start (encode-char make-8-bit-private-use-start 'ucs)) | |
446 desired-ucs) | |
447 | |
448 (loop for (external internal) | |
449 in unicode-map | |
450 do | |
451 (aset decode-table external internal) | |
452 (assert (not (eq (encode-char internal 'ucs) -1)) | |
453 nil | |
454 "Looks like you're calling make-8-bit-coding-system in a \ | |
455 dumped file, \nand you're either not providing a literal UNICODE-MAP | |
456 or PROPS. Don't do that; make-8-bit-coding-system relies on sensible | |
457 Unicode mappings being available, which they are at compile time for | |
458 dumped files (but this requires the mentioned literals), but not, for | |
459 most of them, at run time. ") | |
460 | |
461 (puthash (encode-char internal 'ucs) | |
462 ;; This is semantically an integer, but Dave Love's design | |
463 ;; for lookup-integer in CCL means we need to store it as a | |
464 ;; character. | |
465 (int-to-char external) | |
466 encode-table)) | |
467 | |
468 ;; Now, go through the decode table looking at the characters that | |
469 ;; remain nil. If the XEmacs character with that integer is already in | |
470 ;; the encode table, map the on-disk octet to a Unicode private use | |
471 ;; character. Otherwise map the on-disk octet to the XEmacs character | |
472 ;; with that numeric value, to make it clearer what it is. | |
473 (dotimes (i 256) | |
474 (when (null (aref decode-table i)) | |
475 ;; Find a free code point. | |
476 (setq desired-ucs i) | |
477 (while (gethash desired-ucs encode-table) | |
478 ;; In the normal case, the code point chosen will be U+E0XY, where | |
479 ;; XY is the hexadecimal octet on disk. In pathological cases | |
480 ;; it'll be something else. | |
481 (setq desired-ucs (+ private-use-start desired-ucs) | |
482 private-use-start (+ private-use-start 1))) | |
483 (aset decode-table i (decode-char 'ucs desired-ucs)) | |
484 (puthash desired-ucs (int-to-char i) encode-table))) | |
485 (values decode-table encode-table))) | |
486 | |
487 (defun make-8-bit-generate-decode-program (decode-table) | |
488 "Given DECODE-TABLE, generate a CCL program to decode an 8-bit charset. | |
489 DECODE-TABLE must have 256 non-cons entries, and will be regarded as | |
490 describing a map from the octet corresponding to an offset in the | |
491 table to the that entry in the table. " | |
492 (check-argument-type #'vectorp decode-table) | |
493 (check-argument-range (length decode-table) #x100 #x100) | |
494 (let ((decode-program-parts | |
495 (eval-when-compile | |
496 (let* ((compiled | |
497 (append | |
498 (ccl-compile | |
499 `(3 | |
500 ((read r0) | |
501 (loop | |
502 (write-read-repeat r0 ,(make-vector | |
503 256 'sentinel)))))) nil)) | |
504 (first-part compiled) | |
505 (last-part | |
506 (member-if-not #'symbolp | |
507 (member-if-not #'integerp first-part)))) | |
508 ;; Chop off the sentinel sentinel sentinel [..] part. | |
509 (while compiled | |
510 (if (symbolp (cadr compiled)) | |
511 (setcdr compiled nil)) | |
512 (setq compiled (cdr compiled))) | |
513 (list first-part last-part))))) | |
514 (nconc | |
515 ;; copy-list needed, because the structure of the literal provided | |
516 ;; by our eval-when-compile hangs around. | |
517 (copy-list (first decode-program-parts)) | |
518 (append decode-table nil) | |
519 (second decode-program-parts)))) | |
520 | |
521 ;;;###autoload | |
522 (defun make-8-bit-coding-system (name unicode-map &optional description props) | |
523 "Make and return a fixed-width 8-bit CCL coding system named NAME. | |
524 NAME must be a symbol, and UNICODE-MAP a list. | |
525 | |
526 UNICODE-MAP is a plist describing a map from octets in the coding | |
527 system NAME (as integers) to XEmacs characters. Those XEmacs | |
528 characters will be used explicitly on decoding, but for encoding (most | |
529 relevantly, on writing to disk) XEmacs characters that map to the same | |
530 Unicode code point will be unified. This means that the ISO-8859-? | |
531 characters that map to the same Unicode code point will not be | |
532 distinct when written to disk, which is normally what is intended; it | |
533 also means that East Asian Han characters from different XEmacs | |
534 character sets will not be distinct when written to disk, which is | |
535 less often what is intended. | |
536 | |
537 Any octets not mapped will be decoded into the ISO 8859-1 characters with | |
538 the corresponding numeric value; unless another octet maps to that | |
539 character, in which case the Unicode private use area will be used. This | |
540 avoids spurious changes to files on disk when they contain octets that would | |
541 be otherwise remapped to the canonical values for the corresponding | |
542 characters in the coding system. | |
543 | |
544 DESCRIPTION and PROPS are as in `make-coding-system', which see. This | |
545 function also accepts two additional (optional) properties in PROPS; | |
546 `aliases', giving a list of aliases to be initialized for this | |
547 coding-system, and `encode-failure-octet', an integer between 0 and 256 to | |
548 write in place of XEmacs characters that cannot be encoded, defaulting to | |
549 the code for tilde `~'. " | |
550 (check-argument-type #'symbolp name) | |
551 (check-argument-type #'listp unicode-map) | |
552 (check-argument-type #'stringp | |
553 (or description | |
554 (setq description | |
555 (format "Coding system used for %s." name)))) | |
556 (check-valid-plist props) | |
557 (let ((encode-failure-octet (or (plist-get props 'encode-failure-octet) | |
558 (char-to-int ?~))) | |
559 (aliases (plist-get props 'aliases)) | |
560 (hash-table-sym (gentemp (format "%s-encode-table" name))) | |
561 encode-program decode-program result decode-table encode-table) | |
562 | |
563 ;; Some more sanity checking. | |
564 (check-argument-range encode-failure-octet 0 #xFF) | |
565 (check-argument-type #'listp aliases) | |
566 | |
567 ;; Don't pass on our extra data to make-coding-system. | |
568 (setq props (plist-remprop props 'encode-failure-octet) | |
569 props (plist-remprop props 'aliases)) | |
570 | |
571 (multiple-value-setq | |
572 (decode-table encode-table) | |
573 (make-8-bit-create-decode-encode-tables unicode-map)) | |
574 | |
575 ;; Register the decode-table. | |
576 (define-translation-hash-table hash-table-sym encode-table) | |
577 | |
578 ;; Generate the programs. | |
579 (setq decode-program (make-8-bit-generate-decode-program decode-table) | |
580 encode-program (make-8-bit-generate-encode-program | |
581 decode-table encode-table encode-failure-octet)) | |
582 (unless (vectorp encode-program) | |
583 (setq encode-program | |
584 (apply #'vector | |
585 (nsublis (list (cons 'encode-table-sym hash-table-sym)) | |
586 (copy-tree encode-program))))) | |
587 (unless (vectorp decode-program) | |
588 (setq decode-program | |
589 (apply #'vector decode-program))) | |
590 | |
591 ;; And now generate the actual coding system. | |
592 (setq result | |
593 (make-coding-system | |
594 name 'ccl | |
595 description | |
596 (plist-put (plist-put props 'decode decode-program) | |
597 'encode encode-program))) | |
598 (coding-system-put name 'category 'iso-8-1) | |
599 (loop for alias in aliases | |
600 do (define-coding-system-alias alias name)) | |
601 result)) | |
602 | |
603 ;;;###autoload | |
604 (define-compiler-macro make-8-bit-coding-system (&whole form name unicode-map | |
605 &optional description props) | |
606 | |
607 ;; We provide the compiler macro (= macro that is expanded only on | |
608 ;; compilation, and that can punt to a runtime version of the | |
609 ;; associate function if necessary) not for reasons of speed, though | |
610 ;; it does speed up things at runtime a little, but because the | |
611 ;; Unicode mappings are available at compile time in the dumped | |
612 ;; files, but they are not available at run time for the vast | |
613 ;; majority of them. | |
614 | |
615 (if (not (and (and (consp name) (eq (car name) 'quote)) | |
616 (and (consp unicode-map) (eq (car unicode-map) 'quote)) | |
617 (and (or (and (consp props) (eq (car props) 'quote)) | |
618 (null props))))) | |
619 ;; The call does not use literals; do it at runtime. | |
620 form | |
621 (setq name (cadr name) | |
622 unicode-map (cadr unicode-map) | |
623 props (if props (cadr props))) | |
624 (let ((encode-failure-octet | |
625 (or (plist-get props 'encode-failure-octet) (char-to-int ?~))) | |
626 (aliases (plist-get props 'aliases)) | |
627 encode-program decode-program | |
628 decode-table encode-table res) | |
629 | |
630 ;; Some sanity checking. | |
631 (check-argument-range encode-failure-octet 0 #xFF) | |
632 (check-argument-type #'listp aliases) | |
633 | |
634 ;; Don't pass on our extra data to make-coding-system. | |
635 (setq props (plist-remprop props 'encode-failure-octet) | |
636 props (plist-remprop props 'aliases)) | |
637 | |
638 ;; Work out encode-table and decode-table. | |
639 (multiple-value-setq | |
640 (decode-table encode-table) | |
641 (make-8-bit-create-decode-encode-tables unicode-map)) | |
642 | |
643 ;; Generate the decode and encode programs. | |
644 (setq decode-program (make-8-bit-generate-decode-program decode-table) | |
645 encode-program (make-8-bit-generate-encode-program | |
646 decode-table encode-table encode-failure-octet)) | |
647 | |
648 ;; And return the generated code. | |
649 `(let ((encode-table-sym (gentemp (format "%s-encode-table" ',name))) | |
650 result) | |
651 (define-translation-hash-table encode-table-sym ,encode-table) | |
652 (setq result | |
653 (make-coding-system | |
654 ',name 'ccl ,description | |
655 (plist-put (plist-put ',props 'decode | |
656 ,(apply #'vector decode-program)) | |
657 'encode | |
658 (apply #'vector | |
659 (nsublis | |
660 (list (cons | |
661 'encode-table-sym | |
662 (symbol-value 'encode-table-sym))) | |
663 ',encode-program))))) | |
664 (coding-system-put ',name 'category 'iso-8-1) | |
665 ,(macroexpand `(loop for alias in ',aliases | |
666 do (define-coding-system-alias alias | |
667 ',name))) | |
668 'result)))) | |
669 | |
670 |