comparison lisp/unicode.el @ 4202:a7c5de5b9880

[xemacs-hg @ 2007-10-02 10:33:04 by aidan] Fix some bugs in the Unicode error sequence Lisp API.
author aidan
date Tue, 02 Oct 2007 10:33:05 +0000
parents edb00a8b4eff
children 3651e9c49476
comparison
equal deleted inserted replaced
4201:6e9bd19ec103 4202:a7c5de5b9880
494 (char-syntax ascii-or-latin-1)) 494 (char-syntax ascii-or-latin-1))
495 syntax-table)) 495 syntax-table))
496 496
497 ;; Create all the Unicode error sequences, normally as jit-ucs-charset-0 497 ;; Create all the Unicode error sequences, normally as jit-ucs-charset-0
498 ;; characters starting at U+200000 (which isn't a valid Unicode code 498 ;; characters starting at U+200000 (which isn't a valid Unicode code
499 ;; point). 499 ;; point). Make them available to user code.
500 (loop 500 (defvar unicode-error-default-translation-table
501 for i from #x00 to #xFF 501 (loop
502 ;; #xd800 is the first leading surrogate; trailing surrogates must be in 502 with char-table = (make-char-table 'char)
503 ;; the range #xdc00-#xdfff. These examples are not, so we intentionally 503 for i from ?\x00 to ?\xFF
504 ;; provoke an error sequence. 504 do
505 do (decode-coding-string (format "\xd8\x00\x01%c" i) 'utf-16-be)) 505 (put-char-table (aref
506 506 ;; #xd800 is the first leading surrogate;
507 ;; Make them available to user code. 507 ;; trailing surrogates must be in the range
508 (defvar unicode-error-sequence-zero 508 ;; #xdc00-#xdfff. These examples are not, so we
509 (aref (decode-coding-string "\xd8\x00\x01\x00" 'utf-16-be) 3) 509 ;; intentionally provoke an error sequence.
510 "The XEmacs character representing an invalid zero octet in Unicode. 510 (decode-coding-string (format "\xd8\x00\x00%c" i)
511 511 'utf-16-be)
512 Subtract this character from each XEmacs character in an invalid sequence to 512 3)
513 get the octet on disk. E.g. 513 i
514 514 char-table)
515 \(- (aref (decode-coding-string ?\\x80 'utf-8) 0) 515 finally return char-table)
516 unicode-error-characters-zero) 516 "Translation table mapping Unicode error sequences to Latin-1 chars.
517 => ?\\x80 517
518 518 To transform XEmacs Unicode error sequences to the Latin-1 characters that
519 You can search for invalid sequences using 519 correspond to the octets on disk, you can use this variable. ")
520 `unicode-error-sequence-regexp-range', which see. ")
521 520
522 (defvar unicode-error-sequence-regexp-range 521 (defvar unicode-error-sequence-regexp-range
523 (format "%c-%c" 522 (format "%c%c-%c"
524 (aref (decode-coding-string "\xd8\x00\x01\x00" 'utf-16-be) 3) 523 (aref (decode-coding-string "\xd8\x00\x00\x00" 'utf-16-be) 0)
525 (aref (decode-coding-string "\xd8\x00\x01\xFF" 'utf-16-be) 3)) 524 (aref (decode-coding-string "\xd8\x00\x00\x00" 'utf-16-be) 3)
525 (aref (decode-coding-string "\xd8\x00\x00\xFF" 'utf-16-be) 3))
526 "Regular expression range to match Unicode error sequences in XEmacs. 526 "Regular expression range to match Unicode error sequences in XEmacs.
527 527
528 Invalid Unicode sequences on input are represented as XEmacs characters with 528 Invalid Unicode sequences on input are represented as XEmacs
529 values starting at `unicode-error-sequence-zero', one character for each 529 characters with values stored as the keys in
530 invalid octet. Use this variable (with `re-search-forward' or 530 `unicode-error-default-translation-table', one character for each
531 `skip-chars-forward') to search for such characters; use 531 invalid octet. You can use this variable (with `re-search-forward' or
532 `unicode-error-sequence-zero' from such characters to get a character 532 `skip-chars-forward') to search for such characters; see also
533 corresponding to the octet on disk. ")) 533 `unicode-error-translate-region'. ")
534
535 ;; Check that the lookup table is correct, and that all the actual error
536 ;; sequences are caught by the regexp.
537 (with-temp-buffer
538 (loop
539 for i from ?\x00 to ?\xFF
540 with to-check = (make-string 20 ?\x20)
541 with res = t
542 do
543 (delete-region (point-min) (point-max))
544 (insert to-check)
545 (goto-char 10)
546 (insert (decode-coding-string (format "\xd8\x00\x00%c" i)
547 'utf-16-be))
548 (backward-char)
549 (assert (= i (get-char-table (char-after (point))
550 unicode-error-default-translation-table))
551 (format "Char ?\\x%x not the expected error sequence!"
552 i))
553
554 (goto-char (point-min))
555 (assert (re-search-forward (concat "["
556 unicode-error-sequence-regexp-range
557 "]"))
558 nil
559 (format "Could not find char ?\\x%x in buffer" i))))
560
561 (defun frob-unicode-errors-region (frob-function begin end &optional buffer)
562 "Call FROB-FUNCTION on the Unicode error sequences between BEGIN and END.
563
564 Optional argument BUFFER specifies the buffer that should be examined for
565 such sequences. "
566 (check-argument-type #'functionp frob-function)
567 (check-argument-range begin (point-min buffer) (point-max buffer))
568 (check-argument-range end (point-min buffer) (point-max buffer))
569 (save-excursion
570 (save-restriction
571 (if buffer (set-buffer buffer))
572 (narrow-to-region begin end)
573 (goto-char (point-min))
574 (while end
575 (setq begin
576 (progn
577 (skip-chars-forward
578 (concat "^" unicode-error-sequence-regexp-range))
579 (point))
580 end (and (not (= (point) (point-max)))
581 (progn
582 (skip-chars-forward
583 unicode-error-sequence-regexp-range)
584 (point))))
585 (if end
586 (funcall frob-function begin end))))))
587
588 (defun unicode-error-translate-region (begin end &optional buffer table)
589 "Translate the Unicode error sequences in BUFFER between BEGIN and END.
590
591 The error sequences are transformed, by default, into the ASCII,
592 control-1 and latin-iso8859-1 characters with the numeric values
593 corresponding to the incorrect octets encountered. This is achieved
594 by using `unicode-error-default-translation-table' (which see) for
595 TABLE; you can change this by supplying another character table,
596 mapping from the error sequences to the desired characters. "
597 (unless table (setq table unicode-error-default-translation-table))
598 (frob-unicode-errors-region
599 (lambda (start finish)
600 (translate-region start finish table))
601 begin end buffer)))
534 602
535 ;; #### UTF-7 is not yet implemented, and it's tricky to do. There's 603 ;; #### UTF-7 is not yet implemented, and it's tricky to do. There's
536 ;; an implementation in appendix A.1 of the Unicode Standard, Version 604 ;; an implementation in appendix A.1 of the Unicode Standard, Version
537 ;; 2.0, but I don't know its licensing characteristics. 605 ;; 2.0, but I don't know its licensing characteristics.
538 606