changeset 4202:a7c5de5b9880

[xemacs-hg @ 2007-10-02 10:33:04 by aidan] Fix some bugs in the Unicode error sequence Lisp API.
author aidan
date Tue, 02 Oct 2007 10:33:05 +0000
parents 6e9bd19ec103
children 0a63e5de7bdc
files lisp/ChangeLog lisp/unicode.el
diffstat 2 files changed, 115 insertions(+), 30 deletions(-) [+]
line wrap: on
line diff
--- a/lisp/ChangeLog	Tue Oct 02 07:54:08 2007 +0000
+++ b/lisp/ChangeLog	Tue Oct 02 10:33:05 2007 +0000
@@ -1,3 +1,20 @@
+2007-09-09  Aidan Kehoe  <kehoea@parhasard.net>
+
+	* unicode.el (unicode-error-default-translation-table):
+	New. A char table mapping from Unicode error octets (as
+	represented in XEmacs) to ASCII, Control-1 and latin-iso8859-1.
+
+	* unicode.el (unicode-error-sequence-regexp-range):
+	Correct it, add a build-time check that it matches every possible
+	error octet.
+
+	* unicode.el (frob-unicode-errors-region) New.
+	Apply a supplied function to each set of error sequences in a
+	region.
+	* unicode.el (unicode-error-translate-region) New.
+	Translate the error octets in a region to the corresponding
+	ASCII, control-1 and latin-1 characters. 
+
 2007-10-02  Didier Verna  <didier@xemacs.org>
 
 	* objects.el (make-face-boolean-specifier): Fix docstring, really
--- a/lisp/unicode.el	Tue Oct 02 07:54:08 2007 +0000
+++ b/lisp/unicode.el	Tue Oct 02 10:33:05 2007 +0000
@@ -496,41 +496,109 @@
 
   ;; Create all the Unicode error sequences, normally as jit-ucs-charset-0
   ;; characters starting at U+200000 (which isn't a valid Unicode code
-  ;; point).
-  (loop
-    for i from #x00 to #xFF
-    ;; #xd800 is the first leading surrogate; trailing surrogates must be in
-    ;; the range #xdc00-#xdfff. These examples are not, so we intentionally
-    ;; provoke an error sequence.
-    do (decode-coding-string (format "\xd8\x00\x01%c" i) 'utf-16-be))
+  ;; point). Make them available to user code. 
+  (defvar unicode-error-default-translation-table
+    (loop 
+      with char-table = (make-char-table 'char)
+      for i from ?\x00 to ?\xFF
+      do
+      (put-char-table (aref
+		       ;; #xd800 is the first leading surrogate;
+		       ;; trailing surrogates must be in the range
+		       ;; #xdc00-#xdfff. These examples are not, so we
+		       ;; intentionally provoke an error sequence.
+		       (decode-coding-string (format "\xd8\x00\x00%c" i)
+					     'utf-16-be)
+		       3)
+		      i
+                      char-table)
+      finally return char-table)
+    "Translation table mapping Unicode error sequences to Latin-1 chars.
 
-  ;; Make them available to user code.
-  (defvar unicode-error-sequence-zero
-    (aref (decode-coding-string "\xd8\x00\x01\x00" 'utf-16-be) 3)
-    "The XEmacs character representing an invalid zero octet in Unicode.
-
-Subtract this character from each XEmacs character in an invalid sequence to
-get the octet on disk. E.g.
-
-\(- (aref (decode-coding-string ?\\x80 'utf-8) 0)
-   unicode-error-characters-zero)
-=> ?\\x80
-
-You can search for invalid sequences using
-`unicode-error-sequence-regexp-range', which see.  ")
+To transform XEmacs Unicode error sequences to the Latin-1 characters that
+correspond to the octets on disk, you can use this variable.  ")
 
   (defvar unicode-error-sequence-regexp-range
-    (format "%c-%c"
-            (aref (decode-coding-string "\xd8\x00\x01\x00" 'utf-16-be) 3)
-            (aref (decode-coding-string "\xd8\x00\x01\xFF" 'utf-16-be) 3))
+    (format "%c%c-%c"
+            (aref (decode-coding-string "\xd8\x00\x00\x00" 'utf-16-be) 0)
+            (aref (decode-coding-string "\xd8\x00\x00\x00" 'utf-16-be) 3)
+            (aref (decode-coding-string "\xd8\x00\x00\xFF" 'utf-16-be) 3))
     "Regular expression range to match Unicode error sequences in XEmacs.
 
-Invalid Unicode sequences on input are represented as XEmacs characters with
-values starting at `unicode-error-sequence-zero', one character for each
-invalid octet.  Use this variable (with `re-search-forward' or
-`skip-chars-forward') to search for such characters; use
-`unicode-error-sequence-zero' from such characters to get a character
-corresponding to the octet on disk.  "))
+Invalid Unicode sequences on input are represented as XEmacs
+characters with values stored as the keys in
+`unicode-error-default-translation-table', one character for each
+invalid octet.  You can use this variable (with `re-search-forward' or
+`skip-chars-forward') to search for such characters; see also
+`unicode-error-translate-region'.  ")
+
+  ;; Check that the lookup table is correct, and that all the actual error
+  ;; sequences are caught by the regexp.
+  (with-temp-buffer
+    (loop
+      for i from ?\x00 to ?\xFF
+      with to-check = (make-string 20 ?\x20) 
+      with res = t
+      do 
+      (delete-region (point-min) (point-max))
+      (insert to-check)
+      (goto-char 10)
+      (insert (decode-coding-string (format "\xd8\x00\x00%c" i)
+				    'utf-16-be))
+      (backward-char)
+      (assert (= i (get-char-table (char-after (point)) 
+				   unicode-error-default-translation-table))
+	      (format "Char ?\\x%x not the expected error sequence!"
+		      i))
+
+      (goto-char (point-min))
+      (assert (re-search-forward (concat "[" 
+					 unicode-error-sequence-regexp-range
+					 "]"))
+	      nil
+	      (format "Could not find char ?\\x%x in buffer" i))))
+
+  (defun frob-unicode-errors-region (frob-function begin end &optional buffer)
+    "Call FROB-FUNCTION on the Unicode error sequences between BEGIN and END.
+
+Optional argument BUFFER specifies the buffer that should be examined for
+such sequences.  "
+    (check-argument-type #'functionp frob-function)
+    (check-argument-range begin (point-min buffer) (point-max buffer))
+    (check-argument-range end (point-min buffer) (point-max buffer))
+    (save-excursion
+      (save-restriction
+	(if buffer (set-buffer buffer))
+	(narrow-to-region begin end)
+	(goto-char (point-min))
+	(while end
+	  (setq begin
+		(progn
+		  (skip-chars-forward
+		   (concat "^" unicode-error-sequence-regexp-range))
+		  (point))
+		end (and (not (= (point) (point-max)))
+			 (progn
+			   (skip-chars-forward
+			    unicode-error-sequence-regexp-range)
+			   (point))))
+	  (if end
+	      (funcall frob-function begin end))))))
+
+  (defun unicode-error-translate-region (begin end &optional buffer table)
+    "Translate the Unicode error sequences in BUFFER between BEGIN and END.
+
+The error sequences are transformed, by default, into the ASCII,
+control-1 and latin-iso8859-1 characters with the numeric values
+corresponding to the incorrect octets encountered.  This is achieved
+by using `unicode-error-default-translation-table' (which see) for
+TABLE; you can change this by supplying another character table,
+mapping from the error sequences to the desired characters.  "
+    (unless table (setq table unicode-error-default-translation-table))
+    (frob-unicode-errors-region
+     (lambda (start finish)
+       (translate-region start finish table))
+     begin end buffer)))
 
 ;; #### UTF-7 is not yet implemented, and it's tricky to do.  There's
 ;; an implementation in appendix A.1 of the Unicode Standard, Version