diff lisp/coding.el @ 4549:68d1ca56cffa

First part of interactive checks that coding systems encode regions. 2008-01-21 Aidan Kehoe <kehoea@parhasard.net> * coding.el (decode-coding-string): (encode-coding-string): Accept GNU's NOCOPY argument for these. Todo; write compiler macros to use it. (query-coding-warning-face): New face, to show unencodable characters. (default-query-coding-region-safe-charset-skip-chars-map): New variable, a cache used by #'default-query-coding-region. (default-query-coding-region): Default implementation of #'query-coding-region, using the safe-charsets and safe-chars coding systemproperties. (query-coding-region): New function; can a given coding system encode a given region? (query-coding-string): New function; can a given coding system encode a given string? (unencodable-char-position): Function API taken from GNU; return the first unencodable position given a string and coding system. (encode-coding-char): Function API taken from GNU; return CHAR encoded using CODING-SYSTEM, or nil if CODING-SYSTEM would trash CHAR. ((unless (featurep 'mule)): Override the default query-coding-region implementation on non-Mule. * mule/mule-coding.el (make-8-bit-generate-helper): Eliminate a duplicate comment. (make-8-bit-choose-category): Simplify implementation. (8-bit-fixed-query-coding-region): Implementation of #'query-coding-region for coding systems created with #'make-8-bit-coding-system. (make-8-bit-coding-system): Initialise the #'query-coding-region implementation for these character sets. (make-8-bit-coding-system): Ditto for the compiler macro version of this function. * unicode.el (unicode-query-coding-skip-chars-arg): New variable, used by unicode-query-coding-region, initialised in mule/general-late.el. (unicode-query-coding-region): New function, the #'query-coding-region implementation for Unicode coding systems. Initialise the query-coding-function property for the Unicode coding systems to #'unicode-query-coding-region. * mule/mule-charset.el (charset-skip-chars-string): New function. Return a #'skip-chars-forward argument that skips all characters in CHARSET. (map-charset-chars): Function synced from GNU, modified to work with XEmacs. Map FUNC across the int value charset ranges of CHARSET.
author Aidan Kehoe <kehoea@parhasard.net>
date Mon, 21 Jan 2008 22:51:21 +0100
parents dd9c1d5f5319
children 6812571bfcb9
line wrap: on
line diff
--- a/lisp/coding.el	Thu Jan 17 11:55:11 2008 +0100
+++ b/lisp/coding.el	Mon Jan 21 22:51:21 2008 +0100
@@ -125,15 +125,20 @@
   (interactive "r\nP")
   (princ (detect-coding-region start end)))
 
-(defun decode-coding-string (str coding-system)
+(defun decode-coding-string (str coding-system &optional nocopy)
   "Decode the string STR which is encoded in CODING-SYSTEM.
-Does not modify STR.  Returns the decoded string on successful conversion."
+Normally does not modify STR.  Returns the decoded string on
+successful conversion.
+Optional argument NOCOPY says that modifying STR and returning it is
+allowed."
   (with-string-as-buffer-contents
    str (decode-coding-region (point-min) (point-max) coding-system)))
 
-(defun encode-coding-string (str coding-system)
+(defun encode-coding-string (str coding-system &optional nocopy)
   "Encode the string STR using CODING-SYSTEM.
-Does not modify STR.  Returns the encoded string on successful conversion."
+Does not modify STR.  Returns the encoded string on successful conversion.
+Optional argument NOCOPY says that the original string may be returned
+if does not differ from the encoded string. "
   (with-string-as-buffer-contents
    str (encode-coding-region (point-min) (point-max) coding-system)))
 
@@ -274,4 +279,204 @@
 
 (make-compatible-variable 'enable-multibyte-characters "Unimplemented")
 
+;; Sure would be nice to be able to use defface here. 
+(copy-face 'highlight 'query-coding-warning-face)
+
+(defvar default-query-coding-region-safe-charset-skip-chars-map
+  #s(hash-table test equal data ())
+  "A map from list of charsets to `skip-chars-forward' arguments for them.")
+
+(defun default-query-coding-region (begin end coding-system
+				    &optional buffer errorp highlightp)
+  "The default `query-coding-region' implementation.
+
+Uses the `safe-charsets' and `safe-chars' coding system properties.
+The former is a list of XEmacs character sets that can be safely
+encoded by CODING-SYSTEM; the latter a char table describing, in
+addition, characters that can be safely encoded by CODING-SYSTEM."
+  (check-argument-type #'coding-system-p
+                       (setq coding-system (find-coding-system coding-system)))
+  (check-argument-type #'integer-or-marker-p begin)
+  (check-argument-type #'integer-or-marker-p end)
+  (let* ((safe-charsets
+          (coding-system-get coding-system 'safe-charsets))
+         (safe-chars (coding-system-get coding-system 'safe-chars))
+         (skip-chars-arg
+          (gethash safe-charsets
+                   default-query-coding-region-safe-charset-skip-chars-map))
+         (ranges (make-range-table))
+         fail-range-start fail-range-end previous-fail char-after
+	 looking-at-arg failed extent)
+    (unless skip-chars-arg
+      (setq skip-chars-arg
+	    (puthash safe-charsets
+		     (mapconcat #'charset-skip-chars-string
+				safe-charsets "")
+		     default-query-coding-region-safe-charset-skip-chars-map)))
+    (if (and (zerop (length skip-chars-arg)) (null safe-chars))
+	(progn
+	    ;; Uh-oh, nothing known about this coding system. Fail. 
+	    (when errorp 
+	      (error 'text-conversion-error
+		     "Coding system doesn't say what it can encode"
+		     (coding-system-name coding-system)))
+	    (put-range-table begin end t ranges)
+	    (when highlightp
+	      (setq extent (make-extent begin end buffer))
+	      (set-extent-priority extent (+ mouse-highlight-priority 2))
+	      (set-extent-face extent 'query-coding-warning-face))
+	    (values nil ranges))
+      (setq looking-at-arg (if (equal "" skip-chars-arg)
+			       ;; Regexp that will never match.
+			       #r".\{0,0\}" 
+                             (concat "[" skip-chars-arg "]")))
+      (save-excursion
+	(goto-char begin buffer)
+	(skip-chars-forward skip-chars-arg end buffer)
+	(while (< (point buffer) end)
+	  (message
+	   "fail-range-start is %S, previous-fail %S, point is %S, end is %S"
+	   fail-range-start previous-fail (point buffer) end)
+	  (setq char-after (char-after (point buffer) buffer)
+		fail-range-start (point buffer))
+	  (while (and
+		  (< (point buffer) end)
+		  (not (looking-at looking-at-arg))
+		  (or (not safe-chars)
+		      (not (get-char-table char-after safe-chars))))
+	    (forward-char 1 buffer)
+	    (setq char-after (char-after (point buffer) buffer)
+		  failed t))
+	  (if (= fail-range-start (point buffer))
+	      ;; The character can actually be encoded by the coding
+	      ;; system; check the characters past it.
+	      (forward-char 1 buffer)
+            ;; Can't be encoded; note this.
+	    (when errorp 
+	      (error 'text-conversion-error
+		     (format "Cannot encode %s using coding system"
+			     (buffer-substring fail-range-start (point buffer)
+					       buffer))
+		     (coding-system-name coding-system)))
+	    (put-range-table fail-range-start
+			     ;; If char-after is non-nil, we're not at
+			     ;; the end of the buffer.
+			     (setq fail-range-end (if char-after
+						      (point buffer)
+						    (point-max buffer)))
+			     t ranges)
+	    (when highlightp
+	      (setq extent (make-extent fail-range-start fail-range-end buffer))
+	      (set-extent-priority extent (+ mouse-highlight-priority 2))
+	      (set-extent-face extent 'query-coding-warning-face)))
+	  (skip-chars-forward skip-chars-arg end buffer))
+	(if failed
+	    (values nil ranges)
+	  (values t nil))))))
+
+(defsubst query-coding-region (start end coding-system &optional buffer
+                               errorp highlight)
+  "Work out whether CODING-SYSTEM can losslessly encode a region.
+
+START and END are the beginning and end of the region to check.
+CODING-SYSTEM is the coding system to try.
+
+Optional argument BUFFER is the buffer to check, and defaults to the current
+buffer.  Optional argument ERRORP says to signal a `text-conversion-error'
+if some character in the region cannot be encoded, and defaults to nil. 
+
+Optional argument HIGHLIGHT says to display unencodable characters in the
+region using `query-coding-warning-face'. It defaults to nil.
+
+This function returns a list; the intention is that callers use use
+`multiple-value-bind' or the related CL multiple value functions to deal
+with it.  The first element is `t' if the string can be encoded using
+CODING-SYSTEM, or `nil' if not.  The second element is `nil' if the string
+can be encoded using CODING-SYSTEM; otherwise, it is a range table
+describing the positions of the unencodable characters. See
+`make-range-table'."
+  (funcall (or (coding-system-get coding-system 'query-coding-function)
+               #'default-query-coding-region)
+           start end coding-system buffer errorp highlight))
+
+(defun query-coding-string (string coding-system &optional errorp highlight)
+  "Work out whether CODING-SYSTEM can losslessly encode STRING.
+CODING-SYSTEM is the coding system to check.
+
+Optional argument ERRORP says to signal a `text-conversion-error' if some
+character in the region cannot be encoded, and defaults to nil.
+
+Optional argument HIGHLIGHT says to display unencodable characters in the
+region using `query-coding-warning-face'. It defaults to nil.
+
+This function returns a list; the intention is that callers use use
+`multiple-value-bind' or the related CL multiple value functions to deal
+with it.  The first element is `t' if the string can be encoded using
+CODING-SYSTEM, or `nil' if not.  The second element is `nil' if the string
+can be encoded using CODING-SYSTEM; otherwise, it is a range table
+describing the positions of the unencodable characters. See
+`make-range-table'."
+  (with-temp-buffer 
+    (insert string)
+    (query-coding-region (point-min) (point-max) coding-system (current-buffer)
+                         ;; ### Will highlight work here?
+                         errorp highlight)))
+
+(defun unencodable-char-position  (start end coding-system
+                                   &optional count string) 
+  "Return position of first un-encodable character in a region.
+START and END specify the region and CODING-SYSTEM specifies the
+encoding to check.  Return nil if CODING-SYSTEM does encode the region.
+
+If optional 4th argument COUNT is non-nil, it specifies at most how
+many un-encodable characters to search.  In this case, the value is a
+list of positions.
+
+If optional 5th argument STRING is non-nil, it is a string to search
+for un-encodable characters.  In that case, START and END are indexes
+in the string."
+  (flet ((thunk ()
+	   (multiple-value-bind (result ranges)
+	       (query-coding-region start end coding-system)
+	     (if result
+		 ;; If query-coding-region thinks the entire region is
+		 ;; encodable, result will be t, and the thunk should
+		 ;; return nil, because there are no unencodable
+		 ;; positions in the region.
+                 nil
+               (if count 
+                   (block counted
+                     (map-range-table
+                      #'(lambda (begin end value)
+                          (while (and (<= begin end) (<= begin count))
+                            (push begin result)
+                            (incf begin))
+                          (if (> begin count) (return-from counted)))
+                      ranges))
+                 (map-range-table
+                  #'(lambda (begin end value)
+		      (while (<= begin end)
+			(push begin result)
+			(incf begin))) ranges))
+	       result))))
+    (if string
+	(with-temp-buffer (insert string) (thunk))
+      (thunk))))
+
+(defun encode-coding-char (char coding-system)
+  "Encode CHAR by CODING-SYSTEM and return the resulting string.
+If CODING-SYSTEM can't safely encode CHAR, return nil."
+  (check-argument-type #'characterp char)
+  (multiple-value-bind (succeededp)
+      (query-coding-string char coding-system)
+    (when succeededp
+      (encode-coding-string char coding-system))))
+
+(unless (featurep 'mule)
+  ;; If we're under non-Mule, every XEmacs character can be encoded
+  ;; with every XEmacs coding system.
+  (fset #'default-query-coding-region
+	#'(lambda (&rest ignored) (values t nil)))
+  (unintern 'default-query-coding-region-safe-charset-skip-chars-map))
+
 ;;; coding.el ends here