Mercurial > hg > xemacs-beta
changeset 4570:e6a7054a9c30
Add check-coding-systems-region, test it and others, fix some bugs.
tests/ChangeLog addition:
2008-12-28 Aidan Kehoe <kehoea@parhasard.net>
* automated/query-coding-tests.el:
Add tests for #'unencodable-char-position,
#'check-coding-systems-region, #'encode-coding-char. Remove some
debugging statements.
lisp/ChangeLog addition:
2008-12-28 Aidan Kehoe <kehoea@parhasard.net>
* coding.el (query-coding-region):
(query-coding-string):
Make these defsubsts, they're short enough and they're called
explicitly rarely enough that it make some sense. The alternative
would be compiler macros that avoid the binding of the arguments.
(unencodable-char-position):
Document where the docstring and API are from.
Correct a special case for zero--check-argument-type returns nil
when it succeeds, we can't usefully chain its result in an and
here.
(check-coding-systems-region): New. API taken from GNU; docstring
and implementation are independent.
(encode-coding-char):
Add an optional third argument, as used by recent GNU. Document
the origen of the docstring.
(default-query-coding-region): Add a short docstring to the
non-Mule implementation of this function.
* unicode.el:
Don't set the query-coding-function property for unicode coding
systems if we're on non-mule. Unintern
unicode-query-coding-region, unicode-query-coding-skip-chars-arg
in the same context.
author | Aidan Kehoe <kehoea@parhasard.net> |
---|---|
date | Sun, 28 Dec 2008 22:51:14 +0000 |
parents | 80e0588fb42f |
children | ebc01476e352 |
files | lisp/ChangeLog lisp/coding.el lisp/unicode.el tests/ChangeLog tests/automated/query-coding-tests.el |
diffstat | 5 files changed, 166 insertions(+), 61 deletions(-) [+] |
line wrap: on
line diff
--- a/lisp/ChangeLog Sun Dec 28 14:55:02 2008 +0000 +++ b/lisp/ChangeLog Sun Dec 28 22:51:14 2008 +0000 @@ -1,3 +1,28 @@ +2008-12-28 Aidan Kehoe <kehoea@parhasard.net> + + * coding.el (query-coding-region): + (query-coding-string): + Make these defsubsts, they're short enough and they're called + explicitly rarely enough that it make some sense. The alternative + would be compiler macros that avoid the binding of the arguments. + (unencodable-char-position): + Document where the docstring and API are from. + Correct a special case for zero--check-argument-type returns nil + when it succeeds, we can't usefully chain its result in an and + here. + (check-coding-systems-region): New. API taken from GNU; docstring + and implementation are independent. + (encode-coding-char): + Add an optional third argument, as used by recent GNU. Document + the origen of the docstring. + (default-query-coding-region): Add a short docstring to the + non-Mule implementation of this function. + * unicode.el: + Don't set the query-coding-function property for unicode coding + systems if we're on non-mule. Unintern + unicode-query-coding-region, unicode-query-coding-skip-chars-arg + in the same context. + 2008-12-28 Aidan Kehoe <kehoea@parhasard.net> * coding.el (default-query-coding-region):
--- a/lisp/coding.el Sun Dec 28 14:55:02 2008 +0000 +++ b/lisp/coding.el Sun Dec 28 22:51:14 2008 +0000 @@ -398,7 +398,7 @@ (values nil ranges) (values t nil)))))) -(defun query-coding-region (start end coding-system &optional buffer +(defsubst query-coding-region (start end coding-system &optional buffer errorp highlight) "Work out whether CODING-SYSTEM can losslessly encode a region. @@ -423,7 +423,7 @@ #'default-query-coding-region) start end coding-system buffer errorp highlight)) -(defun query-coding-string (string coding-system &optional errorp highlight) +(defsubst query-coding-string (string coding-system &optional errorp highlight) "Work out whether CODING-SYSTEM can losslessly encode STRING. CODING-SYSTEM is the coding system to check. @@ -446,6 +446,7 @@ ;; ### Will highlight work here? errorp highlight))) +;; Function docstring and API are taken from GNU coding.c version 1.353, GPLv2. (defun unencodable-char-position (start end coding-system &optional count string) "Return position of first un-encodable character in a region. @@ -486,9 +487,9 @@ (check-argument-type #'integer-or-marker-p start) (check-argument-type #'integer-or-marker-p end) (check-coding-system coding-system) - (and count (check-argument-type #'natnump count) - ;; Special-case zero, sigh. - (if (zerop count) (setq count 1))) + (when count (check-argument-type #'natnump count) + ;; Special-case zero, sigh. + (if (zerop count) (setq count 1))) (and string (check-argument-type #'stringp string)) (if string (with-temp-buffer @@ -496,9 +497,64 @@ (funcall thunk start end coding-system count)) (funcall thunk start end coding-system count)))) -(defun encode-coding-char (char coding-system) +;; XEmacs; this is a GPLv3 function in coding.c in GNU. This is why we have +;; both a very divergent docstring and a very divergent implementation. +(defun check-coding-systems-region (begin end coding-system-list) + "Can coding systems in CODING-SYSTEM-LIST encode text in a region? + +CODING-SYSTEM-LIST must be a list of coding systems. BEGIN and END are +normally buffer positions delimiting the region. If some coding system in +CODING-SYSTEM-LIST cannot encode the entire region, the return value of this +function is an alist mapping coding system names to lists of individual +buffer positions (not ranges) that the individual coding systems cannot +encode. + +If all coding systems in CODING-SYSTEM-LIST can encode the region, +this function returns t. This conflicts with the documented, but not +with the observed, GNU behavior. + +If BEGIN is a string, `check-coding-systems-region' ignores END, and checks +whether the coding systems can encode BEGIN. The alist that is returned +uses zero-based string indices, not one-based buffer positions. + +This function is for GNU compatibility. See also `query-coding-region'." + (let ((thunk + #'(lambda (begin end coding-system-list stringp) + (loop + for coding-system in coding-system-list + with result = nil + with intermediate = nil + with range-lambda = (if stringp + #'(lambda (begin end value) + (while (< begin end) + (push (1- begin) intermediate) + (incf begin))) + #'(lambda (begin end value) + (while (< begin end) + (push begin intermediate) + (incf begin)))) + do (setq coding-system (check-coding-system coding-system)) + (multiple-value-bind (encoded ranges) + (query-coding-region begin end coding-system) + (unless encoded + (setq intermediate (list (coding-system-name coding-system))) + (map-range-table range-lambda ranges) + (push (nreverse intermediate) result))) + finally return (or result t))))) + (if (stringp begin) + (with-temp-buffer + (insert begin) + (funcall thunk (point-min) (point-max) coding-system-list t)) + (check-argument-type #'integer-or-marker-p begin) + (check-argument-type #'integer-or-marker-p end) + (funcall thunk begin end coding-system-list nil)))) + +;; XEmacs; docstring taken from GNU, international/mule-cmds.el, revision +;; 1.311, GPLv2. +(defun encode-coding-char (char coding-system &optional charset) "Encode CHAR by CODING-SYSTEM and return the resulting string. -If CODING-SYSTEM can't safely encode CHAR, return nil." +If CODING-SYSTEM can't safely encode CHAR, return nil. +The optional third argument CHARSET is, for the moment, ignored." (check-argument-type #'characterp char) (multiple-value-bind (succeededp) (query-coding-string char coding-system) @@ -509,7 +565,9 @@ ;; If we're under non-Mule, every XEmacs character can be encoded ;; with every XEmacs coding system. (fset #'default-query-coding-region - #'(lambda (&rest ignored) (values t nil))) + #'(lambda (&rest ignored) + "Stub `query-coding-region' implementation. Always succeeds." + (values t nil))) (unintern 'default-query-coding-region-safe-charset-skip-chars-map)) ;;; coding.el ends here
--- a/lisp/unicode.el Sun Dec 28 14:55:02 2008 +0000 +++ b/lisp/unicode.el Sun Dec 28 22:51:14 2008 +0000 @@ -678,6 +678,7 @@ (loop for coding-system in (coding-system-list) + initially (unless (featurep 'mule) (return)) do (when (eq 'unicode (coding-system-type coding-system)) (coding-system-put coding-system 'query-coding-function #'unicode-query-coding-region))) @@ -691,7 +692,8 @@ (mapcar #'unintern '(ccl-encode-to-ucs-2 unicode-error-default-translation-table unicode-invalid-regexp-range frob-unicode-errors-region - unicode-error-translate-region))) + unicode-error-translate-region unicode-query-coding-region + unicode-query-coding-skip-chars-arg))) ;; #### UTF-7 is not yet implemented, and it's tricky to do. There's ;; an implementation in appendix A.1 of the Unicode Standard, Version
--- a/tests/ChangeLog Sun Dec 28 14:55:02 2008 +0000 +++ b/tests/ChangeLog Sun Dec 28 22:51:14 2008 +0000 @@ -1,3 +1,10 @@ +2008-12-28 Aidan Kehoe <kehoea@parhasard.net> + + * automated/query-coding-tests.el: + Add tests for #'unencodable-char-position, + #'check-coding-systems-region, #'encode-coding-char. Remove some + debugging statements. + 2008-12-28 Aidan Kehoe <kehoea@parhasard.net> * automated/query-coding-tests.el:
--- a/tests/automated/query-coding-tests.el Sun Dec 28 14:55:02 2008 +0000 +++ b/tests/automated/query-coding-tests.el Sun Dec 28 22:51:14 2008 +0000 @@ -91,58 +91,31 @@ coding-system)) (multiple-value-bind (query-coding-succeeded query-coding-table) (query-coding-region (point-min) (point-max) coding-system) - (q-c-debug "checking type, coding-system, q-c-s, q-c-t %S" - (list (coding-system-type coding-system) - coding-system query-coding-succeeded - query-coding-table)) - (unless (and (eq t query-coding-succeeded) - (null query-coding-table)) - (q-c-debug "(eq t query-coding-succeeded) %S, (\ -null query-coding-table) %S" (eq t query-coding-succeeded) - (null query-coding-table))) (Assert (eq t query-coding-succeeded)) (Assert (null query-coding-table))) - (q-c-debug "testing the ASCII strings for %S" coding-system) (multiple-value-bind (query-coding-succeeded query-coding-table) (query-coding-string ascii-chars-string coding-system) - (unless (and (eq t query-coding-succeeded) - (null query-coding-table)) - (q-c-debug "(eq t query-coding-succeeded) %S, (\ -null query-coding-table) %S" (eq t query-coding-succeeded) - (null query-coding-table))) (Assert (eq t query-coding-succeeded)) (Assert (null query-coding-table)))) - (q-c-debug "past the loop through the coding systems") (delete-region (point-min) (point-max)) ;; Check for success from the two Latin-1 coding systems (insert latin-1-chars-string) - (q-c-debug "point is now %S" (point)) (multiple-value-bind (query-coding-succeeded query-coding-table) (query-coding-region (point-min) (point-max) 'iso-8859-1-unix) (Assert (eq t query-coding-succeeded)) (Assert (null query-coding-table))) - (q-c-debug "point is now %S" (point)) (multiple-value-bind (query-coding-succeeded query-coding-table) (query-coding-string (buffer-string) 'iso-8859-1-unix) (Assert (eq t query-coding-succeeded)) (Assert (null query-coding-table))) - (q-c-debug "point is now %S" (point)) (multiple-value-bind (query-coding-succeeded query-coding-table) (query-coding-string (buffer-string) 'iso-latin-1-with-esc-unix) (Assert (eq t query-coding-succeeded)) (Assert (null query-coding-table))) - (q-c-debug "point is now %S" (point)) ;; Make it fail, check that it fails correctly (insert (decode-char 'ucs #x20AC)) ;; EURO SIGN (multiple-value-bind (query-coding-succeeded query-coding-table) (query-coding-region (point-min) (point-max) 'iso-8859-1-unix) - (unless (and (null query-coding-succeeded) - (equal query-coding-table - #s(range-table type start-closed-end-open data - ((257 258) t)))) - (q-c-debug "dealing with %S" 'iso-8859-1-unix) - (q-c-debug "query-coding-succeeded not null, query-coding-table \ -%S" query-coding-table)) (Assert (null query-coding-succeeded)) (Assert (equal query-coding-table #s(range-table type start-closed-end-open data @@ -153,12 +126,6 @@ ;; Stupidly, this succeeds. The behaviour is compatible with ;; GNU, though, and we encourage people not to use ;; iso-latin-1-with-esc-unix anyway: - - (unless (and query-coding-succeeded - (null query-coding-table)) - (q-c-debug "dealing with %S" 'iso-latin-1-with-esc-unix) - (q-c-debug "query-coding-succeeded %S, query-coding-table \ -%S" query-coding-succeeded query-coding-table)) (Assert query-coding-succeeded) (Assert (null query-coding-table))) ;; Check that it errors correctly. @@ -186,13 +153,6 @@ (insert ?\x80) (multiple-value-bind (query-coding-succeeded query-coding-table) (query-coding-region (point-min) (point-max) 'windows-1252-unix) - (unless (and (null query-coding-succeeded) - (equal query-coding-table - #s(range-table type start-closed-end-open data - ((257 258) t)))) - (q-c-debug "dealing with %S" 'windows-1252-unix) - (q-c-debug "query-coding-succeeded not null, query-coding-table \ -%S" query-coding-table)) (Assert (null query-coding-succeeded)) (Assert (equal query-coding-table #s(range-table type start-closed-end-open data @@ -212,17 +172,6 @@ (Assert (null query-coding-table))) (multiple-value-bind (query-coding-succeeded query-coding-table) (query-coding-region (point-min) (point-max) 'windows-1252-unix) - (unless (and (null query-coding-succeeded) - (equal query-coding-table - #s(range-table type start-closed-end-open - data ((129 131) t (132 133) t - (139 140) t (141 146) t - (155 156) t (157 161) t - (162 170) t (173 176) t - (178 187) t (189 192) t - (193 257) t)))) - (q-c-debug "query-coding-succeeded not null, query-coding-table \ -%S" query-coding-table)) (Assert (null query-coding-succeeded)) (Assert (equal query-coding-table #s(range-table type start-closed-end-open @@ -290,4 +239,68 @@ (query-coding-region (point-min) 173 coding-system nil t) (text-conversion-error (setq text-conversion-error-signalled t))) - (Assert (null text-conversion-error-signalled)))))) + (Assert (null text-conversion-error-signalled))) + + ;; Now to test #'encode-coding-char. Most of the functionality was + ;; tested in the query-coding-region tests above, so we don't go into + ;; as much detail here. + (Assert (null (encode-coding-char + (decode-char 'ucs #x20ac) 'iso-8859-1))) + (Assert (equal "\x80" (encode-coding-char + (decode-char 'ucs #x20ac) 'windows-1252))) + (delete-region (point-min) (point-max)) + + ;; And #'unencodable-char-position. + (insert latin-1-chars-string) + (insert (decode-char 'ucs #x20ac)) + (Assert (= 257 (unencodable-char-position (point-min) (point-max) + 'iso-8859-1))) + (Assert (equal '(257) (unencodable-char-position (point-min) (point-max) + 'iso-8859-1 1))) + ;; Compatiblity, sigh: + (Assert (equal '(257) (unencodable-char-position (point-min) (point-max) + 'iso-8859-1 0))) + (dotimes (i 6) (insert (decode-char 'ucs #x20ac))) + ;; Check if it stops at one: + (Assert (equal '(257) (unencodable-char-position (point-min) (point-max) + 'iso-8859-1 1))) + ;; Check if it stops at four: + (Assert (equal '(260 259 258 257) + (unencodable-char-position (point-min) (point-max) + 'iso-8859-1 4))) + ;; Check whether it stops at seven: + (Assert (equal '(263 262 261 260 259 258 257) + (unencodable-char-position (point-min) (point-max) + 'iso-8859-1 7))) + ;; Check that it still stops at seven: + (Assert (equal '(263 262 261 260 259 258 257) + (unencodable-char-position (point-min) (point-max) + 'iso-8859-1 2000))) + ;; Now, #'check-coding-systems-region. + ;; UTF-8 should certainly be able to encode these characters: + (Assert (eq t (check-coding-systems-region (point-min) (point-max) + '(utf-8)))) + (Assert (equal '((iso-8859-1 257 258 259 260 261 262 263) + (windows-1252 129 131 132 133 134 135 136 137 138 139 + 140 141 143 146 147 148 149 150 151 152 + 153 154 155 156 157 159 160)) + (sort + (check-coding-systems-region (point-min) (point-max) + '(utf-8 iso-8859-1 + windows-1252)) + ;; (The sort is to make the algorithm irrelevant.) + #'(lambda (left right) + (string< (car left) (car right)))))) + ;; Ensure that the indices are all decreased by one when passed a + ;; string: + (Assert (equal '((iso-8859-1 256 257 258 259 260 261 262) + (windows-1252 128 130 131 132 133 134 135 136 137 138 + 139 140 142 145 146 147 148 149 150 151 + 152 153 154 155 156 158 159)) + (sort + (check-coding-systems-region (buffer-string) nil + '(utf-8 iso-8859-1 + windows-1252)) + #'(lambda (left right) + (string< (car left) (car right))))))))) +