comparison lisp/mule/mule-coding.el @ 4549:68d1ca56cffa

First part of interactive checks that coding systems encode regions. 2008-01-21 Aidan Kehoe <kehoea@parhasard.net> * coding.el (decode-coding-string): (encode-coding-string): Accept GNU's NOCOPY argument for these. Todo; write compiler macros to use it. (query-coding-warning-face): New face, to show unencodable characters. (default-query-coding-region-safe-charset-skip-chars-map): New variable, a cache used by #'default-query-coding-region. (default-query-coding-region): Default implementation of #'query-coding-region, using the safe-charsets and safe-chars coding systemproperties. (query-coding-region): New function; can a given coding system encode a given region? (query-coding-string): New function; can a given coding system encode a given string? (unencodable-char-position): Function API taken from GNU; return the first unencodable position given a string and coding system. (encode-coding-char): Function API taken from GNU; return CHAR encoded using CODING-SYSTEM, or nil if CODING-SYSTEM would trash CHAR. ((unless (featurep 'mule)): Override the default query-coding-region implementation on non-Mule. * mule/mule-coding.el (make-8-bit-generate-helper): Eliminate a duplicate comment. (make-8-bit-choose-category): Simplify implementation. (8-bit-fixed-query-coding-region): Implementation of #'query-coding-region for coding systems created with #'make-8-bit-coding-system. (make-8-bit-coding-system): Initialise the #'query-coding-region implementation for these character sets. (make-8-bit-coding-system): Ditto for the compiler macro version of this function. * unicode.el (unicode-query-coding-skip-chars-arg): New variable, used by unicode-query-coding-region, initialised in mule/general-late.el. (unicode-query-coding-region): New function, the #'query-coding-region implementation for Unicode coding systems. Initialise the query-coding-function property for the Unicode coding systems to #'unicode-query-coding-region. * mule/mule-charset.el (charset-skip-chars-string): New function. Return a #'skip-chars-forward argument that skips all characters in CHARSET. (map-charset-chars): Function synced from GNU, modified to work with XEmacs. Map FUNC across the int value charset ranges of CHARSET.
author Aidan Kehoe <kehoea@parhasard.net>
date Mon, 21 Jan 2008 22:51:21 +0100
parents f4c3ffe60a4f
children 6812571bfcb9
comparison
equal deleted inserted replaced
4402:e70cc8a90e90 4549:68d1ca56cffa
236 (loop 236 (loop
237 (read-multibyte-character r0 r1) 237 (read-multibyte-character r0 r1)
238 (if (r0 == ,(charset-id 'ascii)) 238 (if (r0 == ,(charset-id 'ascii))
239 (write r1) 239 (write r1)
240 ((if (r0 == #xABAB) 240 ((if (r0 == #xABAB)
241 ;; #xBFFE is a sentinel in the compiled
242 ;; program.
243 ;; #xBFFE is a sentinel in the compiled 241 ;; #xBFFE is a sentinel in the compiled
244 ;; program. 242 ;; program.
245 ((r0 = r1 & #x7F) 243 ((r0 = r1 & #x7F)
246 (write r0 ,(make-vector vec-len #xBFFE))) 244 (write r0 ,(make-vector vec-len #xBFFE)))
247 ((mule-to-unicode r0 r1) 245 ((mule-to-unicode r0 r1)
529 "Given DECODE-TABLE, return an appropriate coding category. 527 "Given DECODE-TABLE, return an appropriate coding category.
530 DECODE-TABLE is a 256-entry vector describing the mapping from octets on 528 DECODE-TABLE is a 256-entry vector describing the mapping from octets on
531 disk to XEmacs characters for some fixed-width 8-bit coding system. " 529 disk to XEmacs characters for some fixed-width 8-bit coding system. "
532 (check-argument-type #'vectorp decode-table) 530 (check-argument-type #'vectorp decode-table)
533 (check-argument-range (length decode-table) #x100 #x100) 531 (check-argument-range (length decode-table) #x100 #x100)
534 (block category 532 (loop
535 (loop 533 named category
536 for i from #x80 to #xBF 534 for i from #x80 to #xBF
537 do (unless (= i (aref decode-table i)) 535 do (unless (= i (aref decode-table i))
538 (return-from category 'no-conversion))) 536 (return-from category 'no-conversion))
539 'iso-8-1)) 537 finally return 'iso-8-1))
538
539 (defun 8-bit-fixed-query-coding-region (begin end coding-system
540 &optional buffer errorp highlightp)
541 "The `query-coding-region' implementation for 8-bit-fixed coding systems.
542
543 Uses the `8-bit-fixed-query-from-unicode' and `8-bit-fixed-query-skip-chars'
544 coding system properties. The former is a hash table mapping from valid
545 Unicode code points to on-disk octets in the coding system; the latter a set
546 of characters as used by `skip-chars-forward'. Both of these properties are
547 generated automatically by `make-8-bit-coding-system'.
548
549 See that the documentation of `query-coding-region'; see also
550 `make-8-bit-coding-system'. "
551 (check-argument-type #'coding-system-p
552 (setq coding-system (find-coding-system coding-system)))
553 (check-argument-type #'integer-or-marker-p begin)
554 (check-argument-type #'integer-or-marker-p end)
555 (let ((from-unicode
556 (coding-system-get coding-system '8-bit-fixed-query-from-unicode))
557 (skip-chars-arg
558 (coding-system-get coding-system '8-bit-fixed-query-skip-chars))
559 (ranges (make-range-table))
560 char-after fail-range-start fail-range-end previous-fail extent
561 failed)
562 (check-type from-unicode hash-table)
563 (check-type skip-chars-arg string)
564 (save-excursion
565 (goto-char begin buffer)
566 (skip-chars-forward skip-chars-arg end buffer)
567 (while (< (point buffer) end)
568 (message
569 "fail-range-start is %S, previous-fail %S, point is %S, end is %S"
570 fail-range-start previous-fail (point buffer) end)
571 (setq char-after (char-after (point buffer) buffer)
572 fail-range-start (point buffer))
573 (message "arguments are %S %S"
574 (< (point buffer) end)
575 (not (gethash (encode-char char-after 'ucs) from-unicode)))
576 (while (and
577 (< (point buffer) end)
578 (not (gethash (encode-char char-after 'ucs) from-unicode)))
579 (forward-char 1 buffer)
580 (setq char-after (char-after (point buffer) buffer)
581 failed t))
582 (if (= fail-range-start (point buffer))
583 ;; The character can actually be encoded by the coding
584 ;; system; check the characters past it.
585 (forward-char 1 buffer)
586 ;; The character actually failed.
587 (message "past the move through, point now %S" (point buffer))
588 (when errorp
589 (error 'text-conversion-error
590 (format "Cannot encode %s using coding system"
591 (buffer-substring fail-range-start (point buffeR)
592 buffer))
593 (coding-system-name coding-system)))
594 (put-range-table fail-range-start
595 ;; If char-after is non-nil, we're not at
596 ;; the end of the buffer.
597 (setq fail-range-end (if char-after
598 (point buffer)
599 (point-max buffer)))
600 t ranges)
601 (when highlightp
602 (message "highlighting")
603 (setq extent (make-extent fail-range-start fail-range-end buffer))
604 (set-extent-priority extent (+ mouse-highlight-priority 2))
605 (set-extent-face extent 'query-coding-warning-face))
606 (skip-chars-forward skip-chars-arg end buffer))
607 (message "about to give the result, ranges %S" ranges))
608 (if failed
609 (values nil ranges)
610 (values t nil)))))
540 611
541 ;;;###autoload 612 ;;;###autoload
542 (defun make-8-bit-coding-system (name unicode-map &optional description props) 613 (defun make-8-bit-coding-system (name unicode-map &optional description props)
543 "Make and return a fixed-width 8-bit CCL coding system named NAME. 614 "Make and return a fixed-width 8-bit CCL coding system named NAME.
544 NAME must be a symbol, and UNICODE-MAP a list. 615 NAME must be a symbol, and UNICODE-MAP a list.
616 (plist-put (plist-put props 'decode decode-program) 687 (plist-put (plist-put props 'decode decode-program)
617 'encode encode-program))) 688 'encode encode-program)))
618 (coding-system-put name '8-bit-fixed t) 689 (coding-system-put name '8-bit-fixed t)
619 (coding-system-put name 'category 690 (coding-system-put name 'category
620 (make-8-bit-choose-category decode-table)) 691 (make-8-bit-choose-category decode-table))
692 (coding-system-put name '8-bit-fixed-query-skip-chars
693 (apply #'string (append decode-table nil)))
694 (coding-system-put name '8-bit-fixed-query-from-unicode encode-table)
695
696 (coding-system-put name 'query-coding-function
697 #'8-bit-fixed-query-coding-region)
698 (coding-system-put (intern (format "%s-unix" name))
699 'query-coding-function
700 #'8-bit-fixed-query-coding-region)
701 (coding-system-put (intern (format "%s-dos" name))
702 'query-coding-function
703 #'8-bit-fixed-query-coding-region)
704 (coding-system-put (intern (format "%s-mac" name))
705 'query-coding-function
706 #'8-bit-fixed-query-coding-region)
621 (loop for alias in aliases 707 (loop for alias in aliases
622 do (define-coding-system-alias alias name)) 708 do (define-coding-system-alias alias name))
623 result)) 709 result))
624 710
625 (define-compiler-macro make-8-bit-coding-system (&whole form name unicode-map 711 (define-compiler-macro make-8-bit-coding-system (&whole form name unicode-map
626 &optional description props) 712 &optional description props)
627
628 ;; We provide the compiler macro (= macro that is expanded only on 713 ;; We provide the compiler macro (= macro that is expanded only on
629 ;; compilation, and that can punt to a runtime version of the 714 ;; compilation, and that can punt to a runtime version of the
630 ;; associate function if necessary) not for reasons of speed, though 715 ;; associate function if necessary) not for reasons of speed, though
631 ;; it does speed up things at runtime a little, but because the 716 ;; it does speed up things at runtime a little, but because the
632 ;; Unicode mappings are available at compile time in the dumped 717 ;; Unicode mappings are available at compile time in the dumped
672 ;; it, out, though, I get: 757 ;; it, out, though, I get:
673 ;; 758 ;;
674 ;; (invalid-read-syntax "Multiply defined symbol label" 1) 759 ;; (invalid-read-syntax "Multiply defined symbol label" 1)
675 ;; 760 ;;
676 ;; when the file is byte compiled. 761 ;; when the file is byte compiled.
677 (case-fold-search t)) 762 (case-fold-search t)
678 (define-translation-hash-table encode-table-sym ,encode-table) 763 (encode-table ,encode-table))
764 (define-translation-hash-table encode-table-sym encode-table)
679 (make-coding-system 765 (make-coding-system
680 ',name 'ccl ,description 766 ',name 'ccl ,description
681 (plist-put (plist-put ',props 'decode 767 (plist-put (plist-put ',props 'decode
682 ,(apply #'vector decode-program)) 768 ,(apply #'vector decode-program))
683 'encode 769 'encode
686 (list (cons 772 (list (cons
687 'encode-table-sym 773 'encode-table-sym
688 (symbol-value 'encode-table-sym))) 774 (symbol-value 'encode-table-sym)))
689 ',encode-program)))) 775 ',encode-program))))
690 (coding-system-put ',name '8-bit-fixed t) 776 (coding-system-put ',name '8-bit-fixed t)
691 (coding-system-put ',name 'category ', 777 (coding-system-put ',name 'category
692 (make-8-bit-choose-category decode-table)) 778 ',(make-8-bit-choose-category decode-table))
779 (coding-system-put ',name '8-bit-fixed-query-skip-chars
780 ',(apply #'string (append decode-table nil)))
781 (coding-system-put ',name '8-bit-fixed-query-from-unicode encode-table)
782 (coding-system-put ',name 'query-coding-function
783 #'8-bit-fixed-query-coding-region)
784 (coding-system-put ',(intern (format "%s-unix" name))
785 'query-coding-function
786 #'8-bit-fixed-query-coding-region)
787 (coding-system-put ',(intern (format "%s-dos" name))
788 'query-coding-function
789 #'8-bit-fixed-query-coding-region)
790 (coding-system-put ',(intern (format "%s-mac" name))
791 'query-coding-function
792 #'8-bit-fixed-query-coding-region)
693 ,(macroexpand `(loop for alias in ',aliases 793 ,(macroexpand `(loop for alias in ',aliases
694 do (define-coding-system-alias alias 794 do (define-coding-system-alias alias
695 ',name))) 795 ',name)))
696 (find-coding-system ',name))))) 796 (find-coding-system ',name)))))
697 797
701 '() ;; No differences from Latin 1. 801 '() ;; No differences from Latin 1.
702 "ISO-8859-1 (Latin-1)" 802 "ISO-8859-1 (Latin-1)"
703 '(mnemonic "Latin 1" 803 '(mnemonic "Latin 1"
704 documentation "The most used encoding of Western Europe and the Americas." 804 documentation "The most used encoding of Western Europe and the Americas."
705 aliases (iso-latin-1 latin-1))) 805 aliases (iso-latin-1 latin-1)))
706