Mercurial > hg > xemacs-beta
comparison lisp/unicode.el @ 4202:a7c5de5b9880
[xemacs-hg @ 2007-10-02 10:33:04 by aidan]
Fix some bugs in the Unicode error sequence Lisp API.
author | aidan |
---|---|
date | Tue, 02 Oct 2007 10:33:05 +0000 |
parents | edb00a8b4eff |
children | 3651e9c49476 |
comparison
equal
deleted
inserted
replaced
4201:6e9bd19ec103 | 4202:a7c5de5b9880 |
---|---|
494 (char-syntax ascii-or-latin-1)) | 494 (char-syntax ascii-or-latin-1)) |
495 syntax-table)) | 495 syntax-table)) |
496 | 496 |
497 ;; Create all the Unicode error sequences, normally as jit-ucs-charset-0 | 497 ;; Create all the Unicode error sequences, normally as jit-ucs-charset-0 |
498 ;; characters starting at U+200000 (which isn't a valid Unicode code | 498 ;; characters starting at U+200000 (which isn't a valid Unicode code |
499 ;; point). | 499 ;; point). Make them available to user code. |
500 (loop | 500 (defvar unicode-error-default-translation-table |
501 for i from #x00 to #xFF | 501 (loop |
502 ;; #xd800 is the first leading surrogate; trailing surrogates must be in | 502 with char-table = (make-char-table 'char) |
503 ;; the range #xdc00-#xdfff. These examples are not, so we intentionally | 503 for i from ?\x00 to ?\xFF |
504 ;; provoke an error sequence. | 504 do |
505 do (decode-coding-string (format "\xd8\x00\x01%c" i) 'utf-16-be)) | 505 (put-char-table (aref |
506 | 506 ;; #xd800 is the first leading surrogate; |
507 ;; Make them available to user code. | 507 ;; trailing surrogates must be in the range |
508 (defvar unicode-error-sequence-zero | 508 ;; #xdc00-#xdfff. These examples are not, so we |
509 (aref (decode-coding-string "\xd8\x00\x01\x00" 'utf-16-be) 3) | 509 ;; intentionally provoke an error sequence. |
510 "The XEmacs character representing an invalid zero octet in Unicode. | 510 (decode-coding-string (format "\xd8\x00\x00%c" i) |
511 | 511 'utf-16-be) |
512 Subtract this character from each XEmacs character in an invalid sequence to | 512 3) |
513 get the octet on disk. E.g. | 513 i |
514 | 514 char-table) |
515 \(- (aref (decode-coding-string ?\\x80 'utf-8) 0) | 515 finally return char-table) |
516 unicode-error-characters-zero) | 516 "Translation table mapping Unicode error sequences to Latin-1 chars. |
517 => ?\\x80 | 517 |
518 | 518 To transform XEmacs Unicode error sequences to the Latin-1 characters that |
519 You can search for invalid sequences using | 519 correspond to the octets on disk, you can use this variable. ") |
520 `unicode-error-sequence-regexp-range', which see. ") | |
521 | 520 |
522 (defvar unicode-error-sequence-regexp-range | 521 (defvar unicode-error-sequence-regexp-range |
523 (format "%c-%c" | 522 (format "%c%c-%c" |
524 (aref (decode-coding-string "\xd8\x00\x01\x00" 'utf-16-be) 3) | 523 (aref (decode-coding-string "\xd8\x00\x00\x00" 'utf-16-be) 0) |
525 (aref (decode-coding-string "\xd8\x00\x01\xFF" 'utf-16-be) 3)) | 524 (aref (decode-coding-string "\xd8\x00\x00\x00" 'utf-16-be) 3) |
525 (aref (decode-coding-string "\xd8\x00\x00\xFF" 'utf-16-be) 3)) | |
526 "Regular expression range to match Unicode error sequences in XEmacs. | 526 "Regular expression range to match Unicode error sequences in XEmacs. |
527 | 527 |
528 Invalid Unicode sequences on input are represented as XEmacs characters with | 528 Invalid Unicode sequences on input are represented as XEmacs |
529 values starting at `unicode-error-sequence-zero', one character for each | 529 characters with values stored as the keys in |
530 invalid octet. Use this variable (with `re-search-forward' or | 530 `unicode-error-default-translation-table', one character for each |
531 `skip-chars-forward') to search for such characters; use | 531 invalid octet. You can use this variable (with `re-search-forward' or |
532 `unicode-error-sequence-zero' from such characters to get a character | 532 `skip-chars-forward') to search for such characters; see also |
533 corresponding to the octet on disk. ")) | 533 `unicode-error-translate-region'. ") |
534 | |
535 ;; Check that the lookup table is correct, and that all the actual error | |
536 ;; sequences are caught by the regexp. | |
537 (with-temp-buffer | |
538 (loop | |
539 for i from ?\x00 to ?\xFF | |
540 with to-check = (make-string 20 ?\x20) | |
541 with res = t | |
542 do | |
543 (delete-region (point-min) (point-max)) | |
544 (insert to-check) | |
545 (goto-char 10) | |
546 (insert (decode-coding-string (format "\xd8\x00\x00%c" i) | |
547 'utf-16-be)) | |
548 (backward-char) | |
549 (assert (= i (get-char-table (char-after (point)) | |
550 unicode-error-default-translation-table)) | |
551 (format "Char ?\\x%x not the expected error sequence!" | |
552 i)) | |
553 | |
554 (goto-char (point-min)) | |
555 (assert (re-search-forward (concat "[" | |
556 unicode-error-sequence-regexp-range | |
557 "]")) | |
558 nil | |
559 (format "Could not find char ?\\x%x in buffer" i)))) | |
560 | |
561 (defun frob-unicode-errors-region (frob-function begin end &optional buffer) | |
562 "Call FROB-FUNCTION on the Unicode error sequences between BEGIN and END. | |
563 | |
564 Optional argument BUFFER specifies the buffer that should be examined for | |
565 such sequences. " | |
566 (check-argument-type #'functionp frob-function) | |
567 (check-argument-range begin (point-min buffer) (point-max buffer)) | |
568 (check-argument-range end (point-min buffer) (point-max buffer)) | |
569 (save-excursion | |
570 (save-restriction | |
571 (if buffer (set-buffer buffer)) | |
572 (narrow-to-region begin end) | |
573 (goto-char (point-min)) | |
574 (while end | |
575 (setq begin | |
576 (progn | |
577 (skip-chars-forward | |
578 (concat "^" unicode-error-sequence-regexp-range)) | |
579 (point)) | |
580 end (and (not (= (point) (point-max))) | |
581 (progn | |
582 (skip-chars-forward | |
583 unicode-error-sequence-regexp-range) | |
584 (point)))) | |
585 (if end | |
586 (funcall frob-function begin end)))))) | |
587 | |
588 (defun unicode-error-translate-region (begin end &optional buffer table) | |
589 "Translate the Unicode error sequences in BUFFER between BEGIN and END. | |
590 | |
591 The error sequences are transformed, by default, into the ASCII, | |
592 control-1 and latin-iso8859-1 characters with the numeric values | |
593 corresponding to the incorrect octets encountered. This is achieved | |
594 by using `unicode-error-default-translation-table' (which see) for | |
595 TABLE; you can change this by supplying another character table, | |
596 mapping from the error sequences to the desired characters. " | |
597 (unless table (setq table unicode-error-default-translation-table)) | |
598 (frob-unicode-errors-region | |
599 (lambda (start finish) | |
600 (translate-region start finish table)) | |
601 begin end buffer))) | |
534 | 602 |
535 ;; #### UTF-7 is not yet implemented, and it's tricky to do. There's | 603 ;; #### UTF-7 is not yet implemented, and it's tricky to do. There's |
536 ;; an implementation in appendix A.1 of the Unicode Standard, Version | 604 ;; an implementation in appendix A.1 of the Unicode Standard, Version |
537 ;; 2.0, but I don't know its licensing characteristics. | 605 ;; 2.0, but I don't know its licensing characteristics. |
538 | 606 |