diff lisp/coding.el @ 4604:e0a8715fdb1f

Support new IGNORE-INVALID-SEQUENCESP argument, #'query-coding-region. lisp/ChangeLog addition: 2009-02-07 Aidan Kehoe <kehoea@parhasard.net> * coding.el (query-coding-clear-highlights): Rename the BUFFER argument to BUFFER-OR-STRING, describe it as possibly being a string in its documentation. (default-query-coding-region): Add a new IGNORE-INVALID-SEQUENCESP argument, document that this function does not support it. Bind case-fold-search to nil, we don't want this to influence what the function thinks is encodable or not. (query-coding-region): Add a new IGNORE-INVALID-SEQUENCESP argument, document what it does; reflect this new argument in the associated compiler macro. (query-coding-string): Add a new IGNORE-INVALID-SEQUENCESP argument, document what it does. Support the HIGHLIGHT argument correctly. * unicode.el (unicode-query-coding-region): Add a new IGNORE-INVALID-SEQUENCESP argument, document what it does, implement this. Document a potential problem. Use #'query-coding-clear-highlights instead of reimplementing it ourselves. Remove some debugging messages. * mule/arabic.el (iso-8859-6): * mule/cyrillic.el (iso-8859-5): * mule/greek.el (iso-8859-7): * mule/hebrew.el (iso-8859-8): * mule/latin.el (iso-8859-2): * mule/latin.el (iso-8859-3): * mule/latin.el (iso-8859-4): * mule/latin.el (iso-8859-14): * mule/latin.el (iso-8859-15): * mule/latin.el (iso-8859-16): * mule/latin.el (iso-8859-9): * mule/latin.el (windows-1252): * mule/mule-coding.el (iso-8859-1): Avoid the assumption that characters not given an explicit mapping in these coding systems map to the ISO 8859-1 characters corresponding to the octets on disk; this makes it much more reasonable to implement the IGNORE-INVALID-SEQUENCESP argument to query-coding-region. * mule/mule-cmds.el (set-language-info): Correct the docstring. * mule/mule-cmds.el (finish-set-language-environment): Treat invalid Unicode sequences produced from invalid-sequence-coding-system and corresponding to control characters the same as control characters in redisplay. * mule/mule-cmds.el: Document that encode-coding-char is available in coding.el * mule/mule-coding.el (make-8-bit-generate-helper): Change to return the both the encode-program generated and the relevant non-ASCII charset; update the docstring to reflect this. * mule/mule-coding.el (make-8-bit-generate-encode-program-and-skip-chars-strings): Rename this function; have it return skip-chars-strings as well as the encode program. Have these skip-chars-strings use ranges for charsets, where possible. * mule/mule-coding.el (make-8-bit-create-decode-encode-tables): Revise this to allow people to specify explicitly characters that should be undefined (= corresponding to keys in unicode-error-default-translation-table), and treating unspecified octets above #x7f as undefined by default. * mule/mule-coding.el (8-bit-fixed-query-coding-region): Add a new IGNORE-INVALID-SEQUENCESP argument, implement support for it using the 8-bit-fixed-invalid-sequences-skip-chars coding system property; remove some debugging messages. * mule/mule-coding.el (make-8-bit-coding-system): This function is dumped, autoloading it makes no sense. Document what happens when characters above #x7f are not specified, implement this. * mule/vietnamese.el: Correct spelling. tests/ChangeLog addition: 2009-02-07 Aidan Kehoe <kehoea@parhasard.net> * automated/query-coding-tests.el: Add FAILING-CASE arguments to the Assert calls, making #'q-c-debug mostly unnecessary. Remove #'q-c-debug. Add new tests that use the IGNORE-INVALID-SEQUENCESP argument to #'query-coding-region; rework the existing ones to respect it.
author Aidan Kehoe <kehoea@parhasard.net>
date Sat, 07 Feb 2009 17:13:37 +0000
parents 0347879667ed
children 33b8c874b2c8
line wrap: on
line diff
--- a/lisp/coding.el	Thu Feb 05 21:18:37 2009 -0500
+++ b/lisp/coding.el	Sat Feb 07 17:13:37 2009 +0000
@@ -288,11 +288,11 @@
   #s(hash-table test equal data ())
   "A map from list of charsets to `skip-chars-forward' arguments for them.")
 
-(defsubst query-coding-clear-highlights (begin end &optional buffer)
+(defsubst query-coding-clear-highlights (begin end &optional buffer-or-string)
   "Remove extent faces added by `query-coding-region' between BEGIN and END.
 
-Optional argument BUFFER is the buffer to use, and defaults to the current
-buffer.
+Optional argument BUFFER-OR-STRING is the buffer or string to use, and
+defaults to the current buffer.
 
 The HIGHLIGHTP argument to `query-coding-region' indicates that it should
 display unencodable characters using `query-coding-warning-face'.  After
@@ -300,16 +300,19 @@
   (map-extents #'(lambda (extent ignored-arg)
                    (when (eq 'query-coding-warning-face
                              (extent-face extent))
-                     (delete-extent extent))) buffer begin end))
+                     (delete-extent extent))) buffer-or-string begin end))
 
 (defun* default-query-coding-region (begin end coding-system
-				     &optional buffer errorp highlightp)
+				     &optional buffer ignore-invalid-sequencesp
+                                     errorp highlightp)
   "The default `query-coding-region' implementation.
 
 Uses the `safe-charsets' and `safe-chars' coding system properties.
 The former is a list of XEmacs character sets that can be safely
 encoded by CODING-SYSTEM; the latter a char table describing, in
-addition, characters that can be safely encoded by CODING-SYSTEM."
+addition, characters that can be safely encoded by CODING-SYSTEM.
+
+Does not support IGNORE-INVALID-SEQUENCESP."
   (check-argument-type #'coding-system-p
                        (setq coding-system (find-coding-system coding-system)))
   (check-argument-type #'integer-or-marker-p begin)
@@ -326,6 +329,7 @@
           (gethash safe-charsets
                    default-query-coding-region-safe-charset-skip-chars-map))
          (ranges (make-range-table))
+         (case-fold-search nil)
          fail-range-start fail-range-end char-after
 	 looking-at-arg failed extent)
     ;; Coding systems with a value of t for safe-charsets support everything.
@@ -401,41 +405,27 @@
 	  (values t nil))))))
 
 (defun query-coding-region (start end coding-system &optional buffer
-                            errorp highlight)
+                            ignore-invalid-sequencesp errorp highlight)
   "Work out whether CODING-SYSTEM can losslessly encode a region.
 
 START and END are the beginning and end of the region to check.
 CODING-SYSTEM is the coding system to try.
 
 Optional argument BUFFER is the buffer to check, and defaults to the current
-buffer.  Optional argument ERRORP says to signal a `text-conversion-error'
-if some character in the region cannot be encoded, and defaults to nil. 
-
-Optional argument HIGHLIGHT says to display unencodable characters in the
-region using `query-coding-warning-face'. It defaults to nil.
+buffer.
 
-This function returns a list; the intention is that callers use 
-`multiple-value-bind' or the related CL multiple value functions to deal
-with it.  The first element is `t' if the region can be encoded using
-CODING-SYSTEM, or `nil' if not.  The second element is `nil' if the region
-can be encoded using CODING-SYSTEM; otherwise, it is a range table
-describing the positions of the unencodable characters. See
-`make-range-table'."
-  (funcall (or (coding-system-get coding-system 'query-coding-function)
-               #'default-query-coding-region)
-           start end coding-system buffer errorp highlight))
+IGNORE-INVALID-SEQUENCESP, also an optional argument, says to treat XEmacs
+characters which have an unambiguous encoded representation, despite being
+undefined in what they represent, as encodable.  These chiefly arise with
+variable-length encodings like UTF-8 and UTF-16, where an invalid sequence
+is passed through to XEmacs as a sequence of characters with a defined
+correspondence to the octets on disk, but no non-error semantics; see the
+`invalid-sequence-coding-system' argument to `set-language-info'.
 
-(define-compiler-macro query-coding-region (start end coding-system
-                                            &optional buffer errorp highlight)
-  `(funcall (or (coding-system-get ,coding-system 'query-coding-function)
-                #'default-query-coding-region)
-    ,start ,end ,coding-system ,@(append (if buffer (list buffer))
-                                         (if errorp (list errorp))
-                                         (if highlight (list highlight)))))
-
-(defun query-coding-string (string coding-system &optional errorp highlight)
-  "Work out whether CODING-SYSTEM can losslessly encode STRING.
-CODING-SYSTEM is the coding system to check.
+They can also arise with fixed-length encodings like ISO 8859-7, where
+certain octets on disk have undefined values, and treating them as
+corresponding to the ISO 8859-1 characters with the same numerical values
+may lead to data that is not understood by other applications.
 
 Optional argument ERRORP says to signal a `text-conversion-error' if some
 character in the region cannot be encoded, and defaults to nil.
@@ -443,28 +433,94 @@
 Optional argument HIGHLIGHT says to display unencodable characters in the
 region using `query-coding-warning-face'. It defaults to nil.
 
-This function returns a list; the intention is that callers use use
+This function returns a list; the intention is that callers use
 `multiple-value-bind' or the related CL multiple value functions to deal
-with it.  The first element is `t' if the string can be encoded using
-CODING-SYSTEM, or `nil' if not.  The second element is `nil' if the string
+with it.  The first element is `t' if the region can be encoded using
+CODING-SYSTEM, or `nil' if not.  The second element is `nil' if the region
 can be encoded using CODING-SYSTEM; otherwise, it is a range table
-describing the positions of the unencodable characters. See
-`make-range-table'."
+describing the positions of the unencodable characters.  Ranges that
+describe characters that would be ignored were IGNORE-INVALID-SEQUENCESP
+non-nil map to the symbol `invalid-sequence'; other ranges map to the symbol
+`unencodable'.  If IGNORE-INVALID-SEQUENCESP is non-nil, all ranges will map
+to the symbol `unencodable'.  See `make-range-table' for more details of
+range tables."
+  (funcall (or (coding-system-get coding-system 'query-coding-function)
+               #'default-query-coding-region)
+           start end coding-system buffer ignore-invalid-sequencesp errorp
+           highlight))
+
+(define-compiler-macro query-coding-region (start end coding-system
+                                            &optional buffer 
+                                            ignore-invalid-sequencesp
+                                            errorp highlight)
+  `(funcall (or (coding-system-get ,coding-system 'query-coding-function)
+                #'default-query-coding-region)
+    ,start ,end ,coding-system ,@(append (when (or buffer
+                                                   ignore-invalid-sequencesp
+						   errorp highlight)
+					   (list buffer))
+                                         (when (or ignore-invalid-sequencesp
+						   errorp highlight)
+					   (list ignore-invalid-sequencesp))
+                                         (when (or errorp highlight)
+					   (list errorp))
+                                         (when highlight (list highlight)))))
+
+(defun query-coding-string (string coding-system &optional
+                            ignore-invalid-sequencesp errorp highlight)
+  "Work out whether CODING-SYSTEM can losslessly encode STRING.
+CODING-SYSTEM is the coding system to check.
+
+IGNORE-INVALID-SEQUENCESP, an optional argument, says to treat XEmacs
+characters which have an unambiguous encoded representation, despite being
+undefined in what they represent, as encodable.  These chiefly arise with
+variable-length encodings like UTF-8 and UTF-16, where an invalid sequence
+is passed through to XEmacs as a sequence of characters with a defined
+correspondence to the octets on disk, but no non-error semantics; see the
+`invalid-sequence-coding-system' argument to `set-language-info'.
+
+They can also arise with fixed-length encodings like ISO 8859-7, where
+certain octets on disk have undefined values, and treating them as
+corresponding to the ISO 8859-1 characters with the same numerical values
+may lead to data that is not understood by other applications.
+
+Optional argument ERRORP says to signal a `text-conversion-error' if some
+character in the region cannot be encoded, and defaults to nil.
+
+Optional argument HIGHLIGHT says to display unencodable characters in the
+region using `query-coding-warning-face'. It defaults to nil.
+
+This function returns a list; the intention is that callers use
+`multiple-value-bind' or the related CL multiple value functions to deal
+with it.  The first element is `t' if the region can be encoded using
+CODING-SYSTEM, or `nil' if not.  The second element is `nil' if the region
+can be encoded using CODING-SYSTEM; otherwise, it is a range table
+describing the positions of the unencodable characters.  Ranges that
+describe characters that would be ignored were IGNORE-INVALID-SEQUENCESP
+non-nil map to the symbol `invalid-sequence'; other ranges map to the symbol
+`unencodable'.  If IGNORE-INVALID-SEQUENCESP is non-nil, all ranges will map
+to the symbol `unencodable'.  See `make-range-table' for more details of
+range tables."
   (with-temp-buffer 
+    (when highlight
+      (query-coding-clear-highlights 0 (length string) string))
     (insert string)
-    (multiple-value-bind (result ranges)
+    (multiple-value-bind (result ranges extent)
         (query-coding-region (point-min) (point-max) coding-system
                              (current-buffer) errorp
-                             ;; #### Highlight won't work here,
-                             ;; query-coding-region may need to be modified.
-                             highlight)
+                             nil ignore-invalid-sequencesp)
       (unless result
-        ;; Sigh, string indices are zero-based, buffer offsets are
-        ;; one-based.
         (map-range-table
          #'(lambda (begin end value)
+	     ;; Sigh, string indices are zero-based, buffer offsets are
+	     ;; one-based.
              (remove-range-table begin end ranges)
-             (put-range-table (1- begin) (1- end) value ranges))
+             (put-range-table (decf begin) (decf end) value ranges)
+	     (when highlight
+	       (setq extent (make-extent begin end string))
+	       (set-extent-priority extent (+ mouse-highlight-priority 2))
+	       (set-extent-property extent 'duplicable t)
+	       (set-extent-face extent 'query-coding-warning-face)))
          ranges))
       (values result ranges))))