diff lisp/mule/mule-cmds.el @ 5118:e0db3c197671 ben-lisp-object

merge up to latest default branch, doesn't compile yet
author Ben Wing <ben@xemacs.org>
date Sat, 26 Dec 2009 21:18:49 -0600
parents 02b7c7189041
children b3ea9c582280
line wrap: on
line diff
--- a/lisp/mule/mule-cmds.el	Sat Dec 26 00:20:27 2009 -0600
+++ b/lisp/mule/mule-cmds.el	Sat Dec 26 21:18:49 2009 -0600
@@ -73,6 +73,14 @@
   (let ((coding-system-for-read 'iso-2022-7bit))
     (find-file-read-only (expand-file-name "HELLO" data-directory))))
 
+(defvar system-type-file-name-coding
+  '((darwin . utf-8))
+  "A map from values of `system-type' to invariant file name coding systems.
+Used if a give system type does not vary in the coding system it uses for
+file names; otherwise, `language-info-alist' is consulted for this
+information.  This affects the `file-name' coding system alias, but not the
+`file-name-coding-system' variable, which in practice is mostly ignored. ")
+
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;                       Language Support Functions                    ;;;
@@ -217,7 +225,15 @@
 
                      If there is no value for this property, the MS Windows
                      locale is assumed to have the same name as the
-                     language environment."
+                     language environment.
+
+  invalid-sequence-coding-system
+                     VALUE is a fixed-width 8-bit coding system used to
+                     display Unicode error sequences (using a face to make
+                     it clear that the data is invalid).  In Western Europe
+                     and the Americas this is normally windows-1252; in
+                     Russia and the former Soviet Union koi8-ru or
+                     windows-1251 makes more sense."
   (if (symbolp lang-env)
       (setq lang-env (symbol-name lang-env)))
   (let (lang-slot prop-slot)
@@ -242,7 +258,8 @@
   ;; appropriately.  We just use a filter.
   (while alist
     (set-language-info lang-env (car (car alist)) (cdr (car alist)))
-    (setq alist (cdr alist))))
+    (setq alist (cdr alist)))
+  lang-env)
 
 (defun read-language-name (key prompt &optional default)
   "Read a language environment name which has information for KEY.
@@ -670,7 +687,7 @@
       (let ((func (get-language-info current-language-environment
 				     'exit-function)))
 	(run-hooks 'exit-language-environment-hook)
-	(if (fboundp func) (funcall func))))
+	(if (functionp func) (funcall func))))
   (setq current-language-environment language-name)
   (let ((default-eol-type (coding-system-eol-type
 			   default-buffer-file-coding-system)))
@@ -748,9 +765,39 @@
       (require (car required-features))
       (setq required-features (cdr required-features))))
   (let ((func (get-language-info language-name 'setup-function)))
-    (if (fboundp func)
+    (if (functionp func)
 	(funcall func)))
 
+  (let ((invalid-sequence-coding-system
+         (get-language-info language-name 'invalid-sequence-coding-system))
+        (disp-table (specifier-instance current-display-table))
+        glyph string unicode-error-lookup first-char)
+    (when (consp invalid-sequence-coding-system)
+      (setq invalid-sequence-coding-system
+            (car invalid-sequence-coding-system)))
+    (map-char-table
+     #'(lambda (key entry)
+         (setq string (decode-coding-string (string entry)
+                                            invalid-sequence-coding-system))
+         (when (= 1 (length string))
+	   ;; Treat Unicode error sequence chars as the octets
+	   ;; corresponding to those on disk:
+	   (setq unicode-error-lookup
+		 (get-char-table (aref string 0)
+				 unicode-error-default-translation-table))
+	   (when unicode-error-lookup
+	     (setq string (format "%c" unicode-error-lookup)))
+           ;; Treat control characters specially:
+           (setq first-char (aref string 0))
+           (when (or (and (>= #x00 first-char) (<= first-char #x1f))
+                     (and (>= #x80 first-char) (<= first-char #x9f)))
+	     (setq string (format "^%c" (+ ?@ (aref string 0))))))
+         (setq glyph (make-glyph (vector 'string :data string)))
+         (set-glyph-face glyph 'unicode-invalid-sequence-warning-face)
+         (put-char-table key glyph disp-table)
+         nil)
+     unicode-error-default-translation-table))
+
   ;; Fit the charsets preferences in unicode conversions for the
   ;; language environment.
   (set-language-unicode-precedence-list
@@ -892,65 +939,27 @@
 ;; Pretty description of encoded string
 
 ;; Alist of ISO 2022 control code vs the corresponding mnemonic string.
-;; (defvar iso-2022-control-alist
-;;   '((?\x1b . "ESC")
-;;     (?\x0e . "SO")
-;;     (?\x0f . "SI")
-;;     (?\x8e . "SS2")
-;;     (?\x8f . "SS3")
-;;     (?\x9b . "CSI")))
-
-;; (defun encoded-string-description (str coding-system)
-;;   "Return a pretty description of STR that is encoded by CODING-SYSTEM."
-;;   (setq str (string-as-unibyte str))
-;;   (let ((char (aref str 0))
-;;         desc)
-;;     (when (< char 128)
-;;       (setq desc (or (cdr (assq char iso-2022-control-alist))
-;;                      (char-to-string char)))
-;;       (let ((i 1)
-;;             (len (length str))) 
-;;         (while (< i len)
-;;           (setq char (aref str i))
-;;           (if (>= char 128)
-;;               (setq desc nil i len)
-;;             (setq desc (concat desc " "
-;;                                (or (cdr (assq char iso-2022-control-alist))
-;;                                    (char-to-string char)))
-;;                   i (1+ i))))))
-;;     (or desc
-;;         (mapconcat (function (lambda (x) (format "0x%02x" x))) str " "))))
+(defvar iso-2022-control-alist
+  '((?\x1b . "ESC")
+    (?\x0e . "SO")
+    (?\x0f . "SI")
+    (?\x8e . "SS2")
+    (?\x8f . "SS3")
+    (?\x9b . "CSI")))
 
-;; (defun encode-coding-char (char coding-system)
-;;   "Encode CHAR by CODING-SYSTEM and return the resulting string.
-;; If CODING-SYSTEM can't safely encode CHAR, return nil."
-;;   (if (cmpcharp char)
-;;       (setq char (car (decompose-composite-char char 'list))))
-;;   (let ((str1 (char-to-string char))
-;;         (str2 (make-string 2 char))
-;;         (safe-charsets (and coding-system
-;;                             (coding-system-get coding-system 'safe-charsets)))
-;;         enc1 enc2 i1 i2)
-;;     (when (or (eq safe-charsets t)
-;;               (memq (char-charset char) safe-charsets))
-;;       ;; We must find the encoded string of CHAR.  But, just encoding
-;;       ;; CHAR will put extra control sequences (usually to designate
-;;       ;; ASCII charset) at the tail if type of CODING is ISO 2022.
-;;       ;; To exclude such tailing bytes, we at first encode one-char
-;;       ;; string and two-char string, then check how many bytes at the
-;;       ;; tail of both encoded strings are the same.
-;; 
-;;       (setq enc1 (string-as-unibyte (encode-coding-string str1 coding-system))
-;;             i1 (length enc1)
-;;             enc2 (string-as-unibyte (encode-coding-string str2 coding-system))
-;;             i2 (length enc2))
-;;       (while (and (> i1 0) (= (aref enc1 (1- i1)) (aref enc2 (1- i2))))
-;;         (setq i1 (1- i1) i2 (1- i2)))
-;; 
-;;       ;; Now (substring enc1 i1) and (substring enc2 i2) are the same,
-;;       ;; and they are the extra control sequences at the tail to
-;;       ;; exclude.
-;;       (substring enc2 0 i2))))
+(defun encoded-string-description (str coding-system)
+  "Return a pretty description of STR that is encoded by CODING-SYSTEM."
+  ;; XEmacs; no transformation to unibyte.
+  (mapconcat
+   (if (and coding-system (eq (coding-system-type coding-system) 'iso2022))
+       ;; Try to get a pretty description for ISO 2022 escape sequences.
+       (function (lambda (x) (or (cdr (assq x iso-2022-control-alist))
+				 (format "#x%02X" x))))
+     (function (lambda (x) (format "#x%02X" x))))
+   str " "))
+
+;; XEmacs; 
+;; (defun encode-coding-char (char coding-system) in coding.el.
 
 
 ;; #### The following section is utter junk from mule-misc.el.
@@ -1012,29 +1021,135 @@
 ;; auto-language-alist deleted.  We have a more sophisticated system,
 ;; with the locales stored in the language data.
 
-(defconst langenv-to-locale-hash (make-hash-table :test 'equal))
+;; Initialised in mule/general-late.el, which is compiled after all the
+;; language support--and, thus, minority Chinese coding systems and so
+;; on--has been loaded.
+(defvar posix-charset-to-coding-system-hash nil
+  "A map from the POSIX locale charset versions of the defined coding
+systems' names, with all alpha-numeric characters removed, to the actual
+coding system names.  Used at startup when working out which coding system
+should be the default for the locale.  ")
+
+(defun parse-posix-locale-string (locale-string)
+  "Return values \(LANGUAGE REGION CHARSET MODIFIERS\) given LOCALE-STRING.
+
+LOCALE-STRING should be a POSIX locale.  If it cannot be parsed as such, this
+function returns nil. "
+  (let (language region charset modifiers locinfo)
+    (setq locale-string (downcase locale-string))
+    (cond ((string-match
+	    #r"^\([a-z0-9]\{2,2\}\)\(_[a-z0-9]\{2,2\}\)?\(\.[^@]*\)?\(@.*\)?$"
+	    locale-string)
+	   (setq language (match-string 1 locale-string)
+		 region (match-string 2 locale-string)
+		 charset (match-string 3 locale-string)
+		 modifiers (match-string 4 locale-string)
+		 region (and region (replace-in-string region "^_" ""))
+		 charset (and charset (replace-in-string charset #r"^\." ""))
+		 modifiers (and modifiers
+				(replace-in-string modifiers "^@" "")))
+	   (when (and modifiers (equal modifiers "euro") (null charset))
+	     ;; Not ideal for Latvian, say, but I don't have any locales
+	     ;; where the @euro modifier doesn't mean ISO-8859-15 in the 956
+	     ;; I have.
+	     (setq charset "iso-8859-15"))
+	   (when (and modifiers (equal modifiers "cyrillic") (null charset))
+	     ;; Feedback wanted!
+	     (setq charset "windows-1251"))
+	   (values language region charset modifiers))
+	  ((and (string-match "^[a-z0-9]+$" locale-string)
+		(assoc-ignore-case locale-string language-info-alist))
+	   (setq language (get-language-info locale-string 'locale)
+		 language (if (listp language) (car language) language))
+	   (values language region charset modifiers))
+	  ((string-match #r"^\([a-z0-9]+\)\.\([a-z0-9]+\)$" locale-string)
+	   (when (assoc-ignore-case
+		  (setq locinfo (match-string 1 locale-string))
+		  language-info-alist)
+	     (setq language (get-language-info locinfo 'locale)
+		   language (if (listp language) (car language) language)))
+	   (setq charset (match-string 2 locale-string))
+	   (values language region charset modifiers)))))
+
+(defun create-variant-language-environment (langenv coding-system)
+  "Create a variant of LANGENV with CODING-SYSTEM as its coding systems.
+
+The coding systems in question are those described in the
+`set-language-info' docstring with the property names of
+`native-coding-system' and `coding-system'.  The name of the new language
+environment is the name of the old language environment, followed by
+CODING-SYSTEM in parentheses.  Returns the name of the new language
+environment.  
+
+This function also modifies the `coding-priority' of a language
+environment.  "
+  (check-coding-system coding-system)
+  (if (symbolp langenv) (setq langenv (symbol-name langenv)))
+  (unless (setq langenv
+		(assoc-ignore-case langenv language-info-alist))
+    (error 'wrong-type-argument "Not a known language environment"))
+  (set-language-info-alist
+   (if (string-match " ([^)]+)$" (car langenv))
+       (replace-match (format " (%s)" 
+                              (upcase (symbol-name
+                                       (coding-system-name coding-system))))
+                      nil nil (car langenv))
+     (format "%s (%s)" (car langenv)
+             (upcase (symbol-name (coding-system-name coding-system)))))
+   (destructive-plist-to-alist 
+    (plist-put
+     (plist-put
+      (plist-put
+       (plist-put (alist-to-plist (cdr langenv)) 'native-coding-system
+                  coding-system)
+       'coding-system (cons coding-system
+                            (cdr (assoc 'coding-system (cdr langenv)))))
+      'coding-priority (cons coding-system
+                             (cdr (assq 'coding-priority (cdr langenv)))))
+     ;; The tutorial coding system is important; otherwise the tutorial file
+     ;; gets loaded in the variant coding system.
+     'tutorial-coding-system
+     (or (car-safe (cdr-safe (assoc 'tutorial-coding-system (cdr langenv))))
+         (car-safe (cdr-safe (assoc 'coding-system (cdr langenv)))))))))
 
 (defun get-language-environment-from-locale (locale)
   "Convert LOCALE into a language environment.
 LOCALE is a C library locale string, as returned by `current-locale'.
 Uses the `locale' property of the language environment."
-  (or (gethash locale langenv-to-locale-hash)
-      (let ((retval
-	     (block langenv
-	       (dolist (langcons language-info-alist)
-		 (let* ((lang (car langcons))
-			(locs (get-language-info lang 'locale))
-			(case-fold-search t))
-		   (dolist (loc (if (listp locs) locs (list locs)))
-		     (if (cond ((functionp loc)
-				(funcall loc locale))
-			       ((stringp loc)
-				(string-match
-				 (concat "^" loc "\\([^A-Za-z0-9]\\|$\\)")
-				 locale)))
-			 (return-from langenv lang))))))))
-	(puthash locale retval langenv-to-locale-hash)
-	retval)))
+  (block langenv
+    (multiple-value-bind (language ignored-arg charset ignored-arg)
+	(parse-posix-locale-string locale)
+      (let ((case-fold-search t)
+	    (desired-coding-system
+	     (and charset (gethash (replace-in-string charset "[^a-z0-9]" "")
+                                   posix-charset-to-coding-system-hash)))
+	    lang locs given-coding-system)
+	(dolist (langcons language-info-alist)
+	  (setq lang (car langcons)
+		locs (get-language-info lang 'locale))
+	  (dolist (loc (if (listp locs) locs (list locs)))
+	    (cond ((functionp loc)
+		   (if (funcall loc locale)
+		       (return-from langenv lang)))
+		  ((stringp loc)
+		   (when (or (equal loc language)
+			     (string-match
+			      (format "^%s\\([^A-Za-z0-9]\\|$\\)" loc)
+			      locale))
+		     (if (or (null desired-coding-system)
+			     (and desired-coding-system
+                                  (or (eq desired-coding-system
+                                          (setq given-coding-system
+                                                (get-language-info
+                                                 lang
+                                                 'native-coding-system)))
+                                      (and (listp given-coding-system)
+                                           (memq desired-coding-system
+                                                 given-coding-system)))))
+			 (return-from langenv lang)
+		       (return-from langenv
+			 (create-variant-language-environment
+			  lang desired-coding-system))))))))))))
 
 (defun mswindows-get-language-environment-from-locale (ms-locale)
   "Convert MS-LOCALE (an MS Windows locale) into a language environment.
@@ -1250,11 +1365,19 @@
       ;; set the default buffer coding system from the first element of the
       ;; list in the `coding-priority' property, under Unix.  Under Windows, it
       ;; should stay at `mswindows-multibyte', which will reference the current
-      ;; code page. (#### Does it really make sense the set the Unix default
+      ;; code page. ([Does it really make sense to set the Unix default
       ;; that way?  NOTE also that it's not the same as the native coding
       ;; system for the locale, which is correct -- the form we choose for text
-      ;; files should not necessarily have any relevant to whether we're in a
-      ;; Shift-JIS, EUC-JP, JIS, or other Japanese locale.)
+      ;; files should not necessarily have any relevance to whether we're in a
+      ;; Shift-JIS, EUC-JP, JIS, or other Japanese locale.])
+      ;;
+      ;;     On Unix--with the exception of Mac OS X--there is no way to
+      ;;     know for certain what coding system to use for file names, and
+      ;;     the environment is the best guess. If a particular user's
+      ;;     preferences differ from this, then that particular user needs
+      ;;     to edit ~/.xemacs/init.el. Aidan Kehoe, Sun Nov 26 18:11:31 CET
+      ;;     2006. OS X uses an almost-normal-form version of UTF-8. 
+
       (unless (memq system-type '(windows-nt cygwin32))
 	(set-default-buffer-file-coding-system
 	 (maybe-change-coding-system-with-eol default-coding eol-type))))
@@ -1270,7 +1393,27 @@
 	(error
 	 (warn "Invalid native-coding-system %s in language environment %s"
 	       native language-name)))
-      (define-coding-system-alias 'file-name 'native)
+      ;; These variables have magic handlers to make setting them equivalent
+      ;; to setting the file-name, terminal and keyboard coding system
+      ;; aliases. See coding.el. 
+      (setq file-name-coding-system 
+            (or 
+             (let ((fncs (assq system-type system-type-file-name-coding)))
+               (and fncs (cdr fncs)))
+             native)
+            ;; Set the default keyboard and terminal coding systems to the
+            ;; native coding system of the language environment.
+            keyboard-coding-system native
+	    terminal-coding-system native)
+
+      ;; And do the same for any TTYs. 
+      (dolist (con (console-list))
+	(when (eq 'tty (device-type (car (console-device-list con))))
+	  ;; Calling set-input-mode at the same time would be a sane thing
+	  ;; to do here. I would prefer to default to accepting eight bit
+	  ;; input and not using the top bit for Meta.
+	  (set-console-tty-coding-system con native)))
+
       ;; process output should not have EOL conversion.  under MS Windows
       ;; and Cygwin, this screws things up (`cmd' is fine with just LF and
       ;; `bash' chokes on CR-LF).
@@ -1327,12 +1470,19 @@
 	;;     locale but we should still use the right code page, etc.
 	(declare-fboundp (mswindows-set-current-locale userdef)))
       ;; Unix:
-      (let ((locstring (set-current-locale "")))
-	;; assume C lib locale and LANG env var are set correctly.  use
-	;; them to find the langenv.
-	(setq langenv
-	      (and locstring (get-language-environment-from-locale
-			      locstring)))))
+      (let (locstring)
+	;; Init the POSIX locale from the environment--this calls the C
+	;; library's setlocale(3).
+	(set-current-locale "")
+	;; Can't let locstring be the result of (set-current-locale "")
+	;; because that can return a more detailed string than we know how
+	;; to handle.
+	(setq locstring (current-locale)
+	      ;; assume C lib locale and LANG env var are set correctly.
+	      ;; use them to find the langenv.
+	      langenv
+ 	      (and locstring (get-language-environment-from-locale
+ 			      locstring)))))
     ;; All systems:
     (unless langenv (setq langenv "English"))
     (setq current-language-environment langenv)
@@ -1372,8 +1522,7 @@
     (setq Manual-use-rosetta-man nil))
   
   ;; Register available input methods by loading LEIM list file.
-  (load "leim-list.el" 'noerror 'nomessage 'nosuffix)
-  )
+  (load leim-list-file-name 'noerror 'nomessage 'nosuffix))
 
 ;; Code deleted: init-mule-tm (Enable the tm package by default)