diff lisp/unicode.el @ 4604:e0a8715fdb1f

Support new IGNORE-INVALID-SEQUENCESP argument, #'query-coding-region. lisp/ChangeLog addition: 2009-02-07 Aidan Kehoe <kehoea@parhasard.net> * coding.el (query-coding-clear-highlights): Rename the BUFFER argument to BUFFER-OR-STRING, describe it as possibly being a string in its documentation. (default-query-coding-region): Add a new IGNORE-INVALID-SEQUENCESP argument, document that this function does not support it. Bind case-fold-search to nil, we don't want this to influence what the function thinks is encodable or not. (query-coding-region): Add a new IGNORE-INVALID-SEQUENCESP argument, document what it does; reflect this new argument in the associated compiler macro. (query-coding-string): Add a new IGNORE-INVALID-SEQUENCESP argument, document what it does. Support the HIGHLIGHT argument correctly. * unicode.el (unicode-query-coding-region): Add a new IGNORE-INVALID-SEQUENCESP argument, document what it does, implement this. Document a potential problem. Use #'query-coding-clear-highlights instead of reimplementing it ourselves. Remove some debugging messages. * mule/arabic.el (iso-8859-6): * mule/cyrillic.el (iso-8859-5): * mule/greek.el (iso-8859-7): * mule/hebrew.el (iso-8859-8): * mule/latin.el (iso-8859-2): * mule/latin.el (iso-8859-3): * mule/latin.el (iso-8859-4): * mule/latin.el (iso-8859-14): * mule/latin.el (iso-8859-15): * mule/latin.el (iso-8859-16): * mule/latin.el (iso-8859-9): * mule/latin.el (windows-1252): * mule/mule-coding.el (iso-8859-1): Avoid the assumption that characters not given an explicit mapping in these coding systems map to the ISO 8859-1 characters corresponding to the octets on disk; this makes it much more reasonable to implement the IGNORE-INVALID-SEQUENCESP argument to query-coding-region. * mule/mule-cmds.el (set-language-info): Correct the docstring. * mule/mule-cmds.el (finish-set-language-environment): Treat invalid Unicode sequences produced from invalid-sequence-coding-system and corresponding to control characters the same as control characters in redisplay. * mule/mule-cmds.el: Document that encode-coding-char is available in coding.el * mule/mule-coding.el (make-8-bit-generate-helper): Change to return the both the encode-program generated and the relevant non-ASCII charset; update the docstring to reflect this. * mule/mule-coding.el (make-8-bit-generate-encode-program-and-skip-chars-strings): Rename this function; have it return skip-chars-strings as well as the encode program. Have these skip-chars-strings use ranges for charsets, where possible. * mule/mule-coding.el (make-8-bit-create-decode-encode-tables): Revise this to allow people to specify explicitly characters that should be undefined (= corresponding to keys in unicode-error-default-translation-table), and treating unspecified octets above #x7f as undefined by default. * mule/mule-coding.el (8-bit-fixed-query-coding-region): Add a new IGNORE-INVALID-SEQUENCESP argument, implement support for it using the 8-bit-fixed-invalid-sequences-skip-chars coding system property; remove some debugging messages. * mule/mule-coding.el (make-8-bit-coding-system): This function is dumped, autoloading it makes no sense. Document what happens when characters above #x7f are not specified, implement this. * mule/vietnamese.el: Correct spelling. tests/ChangeLog addition: 2009-02-07 Aidan Kehoe <kehoea@parhasard.net> * automated/query-coding-tests.el: Add FAILING-CASE arguments to the Assert calls, making #'q-c-debug mostly unnecessary. Remove #'q-c-debug. Add new tests that use the IGNORE-INVALID-SEQUENCESP argument to #'query-coding-region; rework the existing ones to respect it.
author Aidan Kehoe <kehoea@parhasard.net>
date Sat, 07 Feb 2009 17:13:37 +0000
parents e6a7054a9c30
children c786c3fd0740
line wrap: on
line diff
--- a/lisp/unicode.el	Thu Feb 05 21:18:37 2009 -0500
+++ b/lisp/unicode.el	Sat Feb 07 17:13:37 2009 +0000
@@ -617,38 +617,69 @@
   "Used by `unicode-query-coding-region' to skip chars with known mappings.")
 
 (defun unicode-query-coding-region (begin end coding-system
-				    &optional buffer errorp highlightp)
-  "The `query-coding-region' implementation for Unicode coding systems."
+				    &optional buffer ignore-invalid-sequencesp
+                                    errorp highlightp)
+  "The `query-coding-region' implementation for Unicode coding systems.
+
+Supports IGNORE-INVALID-SEQUENCESP, that is, XEmacs characters that reflect
+invalid octets on disk will be treated as encodable if this argument is
+specified, and as not encodable if it is not specified."
+
+  ;; Potential problem here; the octets that correspond to octets from #x00
+  ;; to #x7f on disk will be treated by utf-8 and utf-7 as invalid
+  ;; sequences, and thus, in theory, encodable.
+
   (check-argument-type #'coding-system-p
                        (setq coding-system (find-coding-system coding-system)))
   (check-argument-type #'integer-or-marker-p begin)
   (check-argument-type #'integer-or-marker-p end)
-  (let* ((skip-chars-arg unicode-query-coding-skip-chars-arg)
+  (let* ((skip-chars-arg (concat unicode-query-coding-skip-chars-arg
+				 (if ignore-invalid-sequencesp
+				     unicode-invalid-sequence-regexp-range
+				   "")))
          (ranges (make-range-table))
          (looking-at-arg (concat "[" skip-chars-arg "]"))
+         (case-fold-search nil)
          fail-range-start fail-range-end char-after failed
-	 extent)
+	 extent char-unicode invalid-sequence-p failed-reason
+         previous-failed-reason)
     (save-excursion
       (when highlightp
-	(map-extents #'(lambda (extent ignored-arg)
-			 (when (eq 'query-coding-warning-face
-				   (extent-face extent))
-			   (delete-extent extent))) buffer begin end))
+        (query-coding-clear-highlights begin end buffer))
       (goto-char begin buffer)
       (skip-chars-forward skip-chars-arg end buffer)
       (while (< (point buffer) end)
-;        (message
-;         "fail-range-start is %S, point is %S, end is %S"
-;         fail-range-start (point buffer) end)
         (setq char-after (char-after (point buffer) buffer)
               fail-range-start (point buffer))
         (while (and
                 (< (point buffer) end)
                 (not (looking-at looking-at-arg))
-                (= -1 (char-to-unicode char-after)))
+                (or (and
+                     (= -1 (setq char-unicode (char-to-unicode char-after)))
+                     (setq failed-reason 'unencodable))
+                    (and (not ignore-invalid-sequencesp)
+                         ;; The default case, with ignore-invalid-sequencesp
+                         ;; not specified:
+                         ;; If the character is in the Unicode range that
+                         ;; corresponds to an invalid octet, we want to
+                         ;; treat it as unencodable.
+                         (<= (eval-when-compile 
+                               (char-to-unicode
+                                (aref (decode-coding-string "\xd8\x00\x00\x00"
+                                                        'utf-16-be) 3)))
+                             char-unicode)
+                         (<= char-unicode
+                             (eval-when-compile 
+                               (char-to-unicode
+                                (aref (decode-coding-string "\xd8\x00\x00\xFF"
+                                                            'utf-16-be) 3))))
+                         (setq failed-reason 'invalid-sequence)))
+                (or (null previous-failed-reason)
+                    (eq previous-failed-reason failed-reason)))
           (forward-char 1 buffer)
           (setq char-after (char-after (point buffer) buffer)
-                failed t))
+                failed t
+                previous-failed-reason failed-reason))
         (if (= fail-range-start (point buffer))
             ;; The character can actually be encoded by the coding
             ;; system; check the characters past it.
@@ -660,13 +691,17 @@
                            (buffer-substring fail-range-start (point buffer)
                                              buffer))
                    (coding-system-name coding-system)))
+          (assert
+           (not (null previous-failed-reason)) t
+           "If we've got here, previous-failed-reason should be non-nil.")
           (put-range-table fail-range-start
                            ;; If char-after is non-nil, we're not at
                            ;; the end of the buffer.
                            (setq fail-range-end (if char-after
                                                     (point buffer)
                                                   (point-max buffer)))
-                           t ranges)
+                           previous-failed-reason ranges)
+          (setq previous-failed-reason nil)
           (when highlightp
             (setq extent (make-extent fail-range-start fail-range-end buffer))
             (set-extent-priority extent (+ mouse-highlight-priority 2))