comparison lisp/unicode.el @ 4604:e0a8715fdb1f

Support new IGNORE-INVALID-SEQUENCESP argument, #'query-coding-region. lisp/ChangeLog addition: 2009-02-07 Aidan Kehoe <kehoea@parhasard.net> * coding.el (query-coding-clear-highlights): Rename the BUFFER argument to BUFFER-OR-STRING, describe it as possibly being a string in its documentation. (default-query-coding-region): Add a new IGNORE-INVALID-SEQUENCESP argument, document that this function does not support it. Bind case-fold-search to nil, we don't want this to influence what the function thinks is encodable or not. (query-coding-region): Add a new IGNORE-INVALID-SEQUENCESP argument, document what it does; reflect this new argument in the associated compiler macro. (query-coding-string): Add a new IGNORE-INVALID-SEQUENCESP argument, document what it does. Support the HIGHLIGHT argument correctly. * unicode.el (unicode-query-coding-region): Add a new IGNORE-INVALID-SEQUENCESP argument, document what it does, implement this. Document a potential problem. Use #'query-coding-clear-highlights instead of reimplementing it ourselves. Remove some debugging messages. * mule/arabic.el (iso-8859-6): * mule/cyrillic.el (iso-8859-5): * mule/greek.el (iso-8859-7): * mule/hebrew.el (iso-8859-8): * mule/latin.el (iso-8859-2): * mule/latin.el (iso-8859-3): * mule/latin.el (iso-8859-4): * mule/latin.el (iso-8859-14): * mule/latin.el (iso-8859-15): * mule/latin.el (iso-8859-16): * mule/latin.el (iso-8859-9): * mule/latin.el (windows-1252): * mule/mule-coding.el (iso-8859-1): Avoid the assumption that characters not given an explicit mapping in these coding systems map to the ISO 8859-1 characters corresponding to the octets on disk; this makes it much more reasonable to implement the IGNORE-INVALID-SEQUENCESP argument to query-coding-region. * mule/mule-cmds.el (set-language-info): Correct the docstring. * mule/mule-cmds.el (finish-set-language-environment): Treat invalid Unicode sequences produced from invalid-sequence-coding-system and corresponding to control characters the same as control characters in redisplay. * mule/mule-cmds.el: Document that encode-coding-char is available in coding.el * mule/mule-coding.el (make-8-bit-generate-helper): Change to return the both the encode-program generated and the relevant non-ASCII charset; update the docstring to reflect this. * mule/mule-coding.el (make-8-bit-generate-encode-program-and-skip-chars-strings): Rename this function; have it return skip-chars-strings as well as the encode program. Have these skip-chars-strings use ranges for charsets, where possible. * mule/mule-coding.el (make-8-bit-create-decode-encode-tables): Revise this to allow people to specify explicitly characters that should be undefined (= corresponding to keys in unicode-error-default-translation-table), and treating unspecified octets above #x7f as undefined by default. * mule/mule-coding.el (8-bit-fixed-query-coding-region): Add a new IGNORE-INVALID-SEQUENCESP argument, implement support for it using the 8-bit-fixed-invalid-sequences-skip-chars coding system property; remove some debugging messages. * mule/mule-coding.el (make-8-bit-coding-system): This function is dumped, autoloading it makes no sense. Document what happens when characters above #x7f are not specified, implement this. * mule/vietnamese.el: Correct spelling. tests/ChangeLog addition: 2009-02-07 Aidan Kehoe <kehoea@parhasard.net> * automated/query-coding-tests.el: Add FAILING-CASE arguments to the Assert calls, making #'q-c-debug mostly unnecessary. Remove #'q-c-debug. Add new tests that use the IGNORE-INVALID-SEQUENCESP argument to #'query-coding-region; rework the existing ones to respect it.
author Aidan Kehoe <kehoea@parhasard.net>
date Sat, 07 Feb 2009 17:13:37 +0000
parents e6a7054a9c30
children c786c3fd0740
comparison
equal deleted inserted replaced
4603:202cb69c4d87 4604:e0a8715fdb1f
615 615
616 (defvar unicode-query-coding-skip-chars-arg nil ;; Set in general-late.el 616 (defvar unicode-query-coding-skip-chars-arg nil ;; Set in general-late.el
617 "Used by `unicode-query-coding-region' to skip chars with known mappings.") 617 "Used by `unicode-query-coding-region' to skip chars with known mappings.")
618 618
619 (defun unicode-query-coding-region (begin end coding-system 619 (defun unicode-query-coding-region (begin end coding-system
620 &optional buffer errorp highlightp) 620 &optional buffer ignore-invalid-sequencesp
621 "The `query-coding-region' implementation for Unicode coding systems." 621 errorp highlightp)
622 "The `query-coding-region' implementation for Unicode coding systems.
623
624 Supports IGNORE-INVALID-SEQUENCESP, that is, XEmacs characters that reflect
625 invalid octets on disk will be treated as encodable if this argument is
626 specified, and as not encodable if it is not specified."
627
628 ;; Potential problem here; the octets that correspond to octets from #x00
629 ;; to #x7f on disk will be treated by utf-8 and utf-7 as invalid
630 ;; sequences, and thus, in theory, encodable.
631
622 (check-argument-type #'coding-system-p 632 (check-argument-type #'coding-system-p
623 (setq coding-system (find-coding-system coding-system))) 633 (setq coding-system (find-coding-system coding-system)))
624 (check-argument-type #'integer-or-marker-p begin) 634 (check-argument-type #'integer-or-marker-p begin)
625 (check-argument-type #'integer-or-marker-p end) 635 (check-argument-type #'integer-or-marker-p end)
626 (let* ((skip-chars-arg unicode-query-coding-skip-chars-arg) 636 (let* ((skip-chars-arg (concat unicode-query-coding-skip-chars-arg
637 (if ignore-invalid-sequencesp
638 unicode-invalid-sequence-regexp-range
639 "")))
627 (ranges (make-range-table)) 640 (ranges (make-range-table))
628 (looking-at-arg (concat "[" skip-chars-arg "]")) 641 (looking-at-arg (concat "[" skip-chars-arg "]"))
642 (case-fold-search nil)
629 fail-range-start fail-range-end char-after failed 643 fail-range-start fail-range-end char-after failed
630 extent) 644 extent char-unicode invalid-sequence-p failed-reason
645 previous-failed-reason)
631 (save-excursion 646 (save-excursion
632 (when highlightp 647 (when highlightp
633 (map-extents #'(lambda (extent ignored-arg) 648 (query-coding-clear-highlights begin end buffer))
634 (when (eq 'query-coding-warning-face
635 (extent-face extent))
636 (delete-extent extent))) buffer begin end))
637 (goto-char begin buffer) 649 (goto-char begin buffer)
638 (skip-chars-forward skip-chars-arg end buffer) 650 (skip-chars-forward skip-chars-arg end buffer)
639 (while (< (point buffer) end) 651 (while (< (point buffer) end)
640 ; (message
641 ; "fail-range-start is %S, point is %S, end is %S"
642 ; fail-range-start (point buffer) end)
643 (setq char-after (char-after (point buffer) buffer) 652 (setq char-after (char-after (point buffer) buffer)
644 fail-range-start (point buffer)) 653 fail-range-start (point buffer))
645 (while (and 654 (while (and
646 (< (point buffer) end) 655 (< (point buffer) end)
647 (not (looking-at looking-at-arg)) 656 (not (looking-at looking-at-arg))
648 (= -1 (char-to-unicode char-after))) 657 (or (and
658 (= -1 (setq char-unicode (char-to-unicode char-after)))
659 (setq failed-reason 'unencodable))
660 (and (not ignore-invalid-sequencesp)
661 ;; The default case, with ignore-invalid-sequencesp
662 ;; not specified:
663 ;; If the character is in the Unicode range that
664 ;; corresponds to an invalid octet, we want to
665 ;; treat it as unencodable.
666 (<= (eval-when-compile
667 (char-to-unicode
668 (aref (decode-coding-string "\xd8\x00\x00\x00"
669 'utf-16-be) 3)))
670 char-unicode)
671 (<= char-unicode
672 (eval-when-compile
673 (char-to-unicode
674 (aref (decode-coding-string "\xd8\x00\x00\xFF"
675 'utf-16-be) 3))))
676 (setq failed-reason 'invalid-sequence)))
677 (or (null previous-failed-reason)
678 (eq previous-failed-reason failed-reason)))
649 (forward-char 1 buffer) 679 (forward-char 1 buffer)
650 (setq char-after (char-after (point buffer) buffer) 680 (setq char-after (char-after (point buffer) buffer)
651 failed t)) 681 failed t
682 previous-failed-reason failed-reason))
652 (if (= fail-range-start (point buffer)) 683 (if (= fail-range-start (point buffer))
653 ;; The character can actually be encoded by the coding 684 ;; The character can actually be encoded by the coding
654 ;; system; check the characters past it. 685 ;; system; check the characters past it.
655 (forward-char 1 buffer) 686 (forward-char 1 buffer)
656 ;; Can't be encoded; note this. 687 ;; Can't be encoded; note this.
658 (error 'text-conversion-error 689 (error 'text-conversion-error
659 (format "Cannot encode %s using coding system" 690 (format "Cannot encode %s using coding system"
660 (buffer-substring fail-range-start (point buffer) 691 (buffer-substring fail-range-start (point buffer)
661 buffer)) 692 buffer))
662 (coding-system-name coding-system))) 693 (coding-system-name coding-system)))
694 (assert
695 (not (null previous-failed-reason)) t
696 "If we've got here, previous-failed-reason should be non-nil.")
663 (put-range-table fail-range-start 697 (put-range-table fail-range-start
664 ;; If char-after is non-nil, we're not at 698 ;; If char-after is non-nil, we're not at
665 ;; the end of the buffer. 699 ;; the end of the buffer.
666 (setq fail-range-end (if char-after 700 (setq fail-range-end (if char-after
667 (point buffer) 701 (point buffer)
668 (point-max buffer))) 702 (point-max buffer)))
669 t ranges) 703 previous-failed-reason ranges)
704 (setq previous-failed-reason nil)
670 (when highlightp 705 (when highlightp
671 (setq extent (make-extent fail-range-start fail-range-end buffer)) 706 (setq extent (make-extent fail-range-start fail-range-end buffer))
672 (set-extent-priority extent (+ mouse-highlight-priority 2)) 707 (set-extent-priority extent (+ mouse-highlight-priority 2))
673 (set-extent-face extent 'query-coding-warning-face))) 708 (set-extent-face extent 'query-coding-warning-face)))
674 (skip-chars-forward skip-chars-arg end buffer)) 709 (skip-chars-forward skip-chars-arg end buffer))