comparison man/internals/internals.texi @ 5128:7be849cb8828 ben-lisp-object

merge
author Ben Wing <ben@xemacs.org>
date Sun, 07 Mar 2010 02:09:59 -0600
parents a9c41067dd88 e0587c615e8b
children f965e31a35f0
comparison
equal deleted inserted replaced
5127:a9c41067dd88 5128:7be849cb8828
159 that has been formatted into ASCII lists and tables. 159 that has been formatted into ASCII lists and tables.
160 160
161 Note: to define these routines, put point after the end of the definition 161 Note: to define these routines, put point after the end of the definition
162 and type C-x C-e. 162 and type C-x C-e.
163 163
164 (defun list-to-texinfo (b e) 164 (defun convert-list-to-texinfo (b e)
165 "Convert the selected region from an ASCII list to a Texinfo list." 165 "Convert the selected region from an ASCII list to a Texinfo list."
166 (interactive "r") 166 (interactive "r")
167 (save-restriction 167 (save-restriction
168 (narrow-to-region b e) 168 (narrow-to-region b e)
169 (goto-char (point-min)) 169 (goto-char (point-min))
170 (let ((dash-type "^ *-+ +") 170 (let ((dash-type "^ *\\(-+\\|o\\) +")
171 ;; allow single-letter numbering or roman numerals 171 ;; allow single-letter numbering or roman numerals
172 (letter-type "^ *[[(]?\\([a-zA-Z]\\|[IVXivx]+\\)[]).] +") 172 (letter-type "^ *[[(]?\\([a-zA-Z]\\|[IVXivx]+\\)[]).] +")
173 (num-type "^ *[[(]?[0-9]+[]).] +") 173 (num-type "^ *[[(]?[0-9]+[]).] +")
174 dash regexp) 174 dash regexp)
175 (save-excursion 175 (save-excursion
237 (insert-char ?\ (- min (current-column))) 237 (insert-char ?\ (- min (current-column)))
238 (beginning-of-line) 238 (beginning-of-line)
239 (forward-char min)) 239 (forward-char min))
240 (kill-rectangle b (point)))))) 240 (kill-rectangle b (point))))))
241 241
242 (defun table-to-texinfo (b e) 242 (defun convert-table-to-texinfo (b e)
243 "Convert the selected region from an ASCII table to a Texinfo table. 243 "Convert the selected region from an ASCII table to a Texinfo table.
244 Assumes entries are separated by a blank line, and the first sexp in 244 Assumes entries are separated by a blank line, and the first sexp in
245 each entry is the table heading." 245 each entry is the table heading."
246 (interactive "r") 246 (interactive "r")
247 (save-restriction 247 (save-restriction
281 If the region is active, do the region; otherwise, go from point to the end 281 If the region is active, do the region; otherwise, go from point to the end
282 of the buffer. This query-replaces for various kinds of conventions used 282 of the buffer. This query-replaces for various kinds of conventions used
283 in text: @code{} surrounded by ` and ' or followed by a (); @strong{} 283 in text: @code{} surrounded by ` and ' or followed by a (); @strong{}
284 surrounded by *'s; @file{} something that looks like a file name." 284 surrounded by *'s; @file{} something that looks like a file name."
285 (interactive) 285 (interactive)
286 (if (and (not no-narrow) (region-active-p)) 286 (save-excursion
287 (save-restriction 287 (if (and (not no-narrow) (region-active-p))
288 (narrow-to-region (region-beginning) (region-end)) 288 (save-restriction
289 (convert-text-to-texinfo t)) 289 (narrow-to-region (region-beginning) (region-end))
290 (let ((p (point)) 290 (goto-char (region-beginning))
291 (case-replace nil)) 291 (zmacs-deactivate-region)
292 (query-replace-regexp "`\\([^']+\\)'\\([^']\\)" "@code{\\1}\\2" nil) 292 (convert-text-to-texinfo t))
293 (goto-char p) 293 (let ((p (point))
294 (query-replace-regexp "\\(\\Sw\\)\\*\\(\\(?:\\s_\\|\\sw\\)+\\)\\*\\([^A-Za-z.}]\\)" "\\1@strong{\\2}\\3" nil) 294 (case-replace nil))
295 (goto-char p) 295 (message "Point is %d" (point))
296 (query-replace-regexp "\\(\\(\\s_\\|\\sw\\)+()\\)\\([^}]\\)" "@code{\\1}\\3" nil) 296 (query-replace-regexp "`\\([^']+\\)'\\([^']\\)" "@code{\\1}\\2" nil)
297 (goto-char p) 297 (goto-char p)
298 (query-replace-regexp "\\(\\(\\s_\\|\\sw\\)+\\.[A-Za-z]+\\)\\([^A-Za-z.}]\\)" "@file{\\1}\\3" nil) 298 (query-replace-regexp "\\(\\Sw\\)\\*\\(\\(?:\\s_\\|\\sw\\)+\\)\\*\\([^A-Za-z.}]\\)" "\\1@strong{\\2}\\3" nil)
299 ))) 299 (goto-char p)
300 (query-replace-regexp "\\(\\(\\s_\\|\\sw\\)+()\\)\\([^}]\\)" "@code{\\1}\\3" nil)
301 (goto-char p)
302 (query-replace-regexp "\\(\\(\\s_\\|\\sw\\)+\\.[A-Za-z]+\\)\\([^A-Za-z.}]\\)" "@file{\\1}\\3" nil)
303 ))))
300 304
301 4. Adding new sections: 305 4. Adding new sections:
302 ----------------------- 306 -----------------------
303 307
304 NOTE: These are in the form of macros. #### FIXME Convert them to 308 NOTE: These are in the form of macros. #### FIXME Convert them to
1236 XEmacs is a powerful, customizable text editor and development 1240 XEmacs is a powerful, customizable text editor and development
1237 environment. It began in 1991 as Lucid Emacs, which was in turn 1241 environment. It began in 1991 as Lucid Emacs, which was in turn
1238 derived from GNU Emacs, a program written by Richard Stallman of the 1242 derived from GNU Emacs, a program written by Richard Stallman of the
1239 Free Software Foundation. GNU Emacs dates back to 1985 and was 1243 Free Software Foundation. GNU Emacs dates back to 1985 and was
1240 modelled after Unipress Emacs, an editor written by James Gosling in 1244 modelled after Unipress Emacs, an editor written by James Gosling in
1241 1981 and based on a series of other "Emacs"-like editors, including 1245 1981 and based on a series of other ``Emacs''-like editors, including
1242 EINE (EINE Is Not EMACS), c. 1976, by Dan Weinreb, which ran on the 1246 EINE (EINE Is Not EMACS), c. 1976, by Dan Weinreb, which ran on the
1243 MIT Lisp Machine and was the first Emacs written in Lisp; ZWEI (ZWEI 1247 MIT Lisp Machine and was the first Emacs written in Lisp; ZWEI (ZWEI
1244 Was EINE Initially), c. 1978, by Dan Weinreb and Mike McMahon; Multics 1248 Was EINE Initially), c. 1978, by Dan Weinreb and Mike McMahon; Multics
1245 Emacs, c. 1978, by Bernie Greenberg, which was written in MacLisp and 1249 Emacs, c. 1978, by Bernie Greenberg, which was written in MacLisp and
1246 also used Lisp as its extension language; and ZMACS, c. 1980, a direct 1250 also used Lisp as its extension language; and ZMACS, c. 1980, a direct
1247 descendant of ZWEI that ran on the Symbolics LM-2, LMI LispM, and 1251 descendant of ZWEI that ran on the Symbolics LM-2, LMI LispM, and
1248 later, TI Explorer (1983-1989). These in turn were inspired by the 1252 later, TI Explorer (1983-1989). These in turn were inspired by the
1249 first Emacs, a package called EMACS, written in 1976 by Richard 1253 first Emacs, a package called EMACS, written in 1976 by Richard
1250 Stallman, Guy Steele, and Dave Moon. This was a merger of TECMAC and 1254 Stallman, Guy Steele, and Dave Moon. This was a merger of TECMAC and
1251 TMACS, a pair of "TECO-macro realtime editors" written by Guy Steele, 1255 TMACS, a pair of ``TECO-macro realtime editors'' written by Guy Steele,
1252 Dave Moon, Richard Greenblatt, Charles Frankston, et al., and added a 1256 Dave Moon, Richard Greenblatt, Charles Frankston, et al., and added a
1253 dynamic loader and Meta-key cmds. It ran under ITS (the Incompatible 1257 dynamic loader and Meta-key cmds. It ran under ITS (the Incompatible
1254 Timesharing System) on a DEC PDP 10 and under TWENEX on a Tops-20 and 1258 Timesharing System) on a DEC PDP 10 and under TWENEX on a Tops-20 and
1255 was written in TECO and PDP 10 assembly. ITS was one of the first 1259 was written in TECO and PDP 10 assembly. ITS was one of the first
1256 time-sharing operating systems and dates back well before Unix. ITS, 1260 time-sharing operating systems and dates back well before Unix. ITS,
1284 M. Stallman (RMS) and James Gosling (the creator of Java); its extension 1288 M. Stallman (RMS) and James Gosling (the creator of Java); its extension
1285 language was known as @dfn{Mocklisp}. This version of Emacs-in-C formed 1289 language was known as @dfn{Mocklisp}. This version of Emacs-in-C formed
1286 the basis for the early versions of GNU Emacs and also for Gosling's 1290 the basis for the early versions of GNU Emacs and also for Gosling's
1287 Unipress Emacs, a commercial product. Because of bad blood between the 1291 Unipress Emacs, a commercial product. Because of bad blood between the
1288 two over the issue of commercialism, RMS pretty much disowned this 1292 two over the issue of commercialism, RMS pretty much disowned this
1289 collaboration, referring to it as "Gosling Emacs". 1293 collaboration, referring to it as ``Gosling Emacs''.
1290 1294
1291 At this point we pick up with a time line of events. (A broader timeline 1295 At this point we pick up with a time line of events. (A broader timeline
1292 is available at @uref{http://www.jwz.org/doc/emacs-timeline.html, 1296 is available at @uref{http://www.jwz.org/doc/emacs-timeline.html,
1293 ``Emacs Timeline''}.) 1297 ``Emacs Timeline''}.)
1294 1298
1575 redisplay code, preliminary I18N support, code merged from GNU Emacs 1579 redisplay code, preliminary I18N support, code merged from GNU Emacs
1576 19.8 beta) 1580 19.8 beta)
1577 @item 1581 @item
1578 Version 19.9 released January 12, 1994. (Scrollbars, Athena.) 1582 Version 19.9 released January 12, 1994. (Scrollbars, Athena.)
1579 @item 1583 @item
1580 Version 19.10 released May 27, 1994. (Uses `configure'; code merged 1584 Version 19.10 released May 27, 1994. (Uses @code{configure}; code merged
1581 from GNU Emacs 19.23 beta and further merging with Epoch 4.0) Known as 1585 from GNU Emacs 19.23 beta and further merging with Epoch 4.0) Known as
1582 "Lucid Emacs" when shipped by Lucid, and as "XEmacs" when shipped by 1586 ``Lucid Emacs'' when shipped by Lucid, and as ``XEmacs'' when shipped by
1583 Sun; but Lucid went out of business a few days later and it's unclear 1587 Sun; but Lucid went out of business a few days later and it's unclear
1584 whether very many copies of 19.10 were released by Lucid. (Last release by 1588 whether very many copies of 19.10 were released by Lucid. (Last release by
1585 Jamie Zawinski.) 1589 Jamie Zawinski.)
1586 @end itemize 1590 @end itemize
1587 1591
1887 rewritten redisplay, TTY support, multi-device support, device and 1891 rewritten redisplay, TTY support, multi-device support, device and
1888 console objects, specifiers, glyphs, toolbars, horizontal scrollbars, 1892 console objects, specifiers, glyphs, toolbars, horizontal scrollbars,
1889 Lucid scrollbar widget, 3-d modeline, stay-up Lucid menus, resizable 1893 Lucid scrollbar widget, 3-d modeline, stay-up Lucid menus, resizable
1890 minibuffer, echo area is a true buffer, MD5 hashing support, expanded 1894 minibuffer, echo area is a true buffer, MD5 hashing support, expanded
1891 menubar, redone menu specification format (including menu filters), 1895 menubar, redone menu specification format (including menu filters),
1892 rewritten extents, renamed "screen" to "frame", misc-user events, 1896 rewritten extents, renamed ``screen'' to ``frame'', misc-user events,
1893 rewritten face code, rewritten mouse code, warnings system, CL 1897 rewritten face code, rewritten mouse code, warnings system, CL
1894 backquote syntax, critical C-g, code merging with GNU Emacs 19.28. 1898 backquote syntax, critical C-g, code merging with GNU Emacs 19.28.
1895 New packages Hyperbole, OOBR, hm--html-menus, viper, lazy-lock, 1899 New packages Hyperbole, OOBR, hm--html-menus, viper, lazy-lock,
1896 ksh-mode, rsz-minibuf.) 1900 ksh-mode, rsz-minibuf.)
1897 @item 1901 @item
1935 version 20.4 released February 28, 1998. 1939 version 20.4 released February 28, 1998.
1936 @item 1940 @item
1937 version 21.0.60 released December 10, 1998. (The version naming scheme was 1941 version 21.0.60 released December 10, 1998. (The version naming scheme was
1938 changed at this point: [a] the second version number is odd for stable 1942 changed at this point: [a] the second version number is odd for stable
1939 versions, even for beta versions; [b] a third version number is added, 1943 versions, even for beta versions; [b] a third version number is added,
1940 replacing the "beta xxx" ending for beta versions and allowing for 1944 replacing the ``beta xxx'' ending for beta versions and allowing for
1941 periodic maintenance releases for stable versions. Therefore, 21.0 was 1945 periodic maintenance releases for stable versions. Therefore, 21.0 was
1942 never "officially" released; similarly for 21.2, etc.) 1946 never ``officially'' released; similarly for 21.2, etc.)
1943 @item 1947 @item
1944 version 21.0.61 released January 4, 1999. 1948 version 21.0.61 released January 4, 1999.
1945 @item 1949 @item
1946 version 21.0.63 released February 3, 1999. 1950 version 21.0.63 released February 3, 1999.
1947 @item 1951 @item
1953 @item 1957 @item
1954 version 21.0.67 released March 25, 1999. 1958 version 21.0.67 released March 25, 1999.
1955 @item 1959 @item
1956 version 21.1.2 released May 14, 1999. (This is the followup to 21.0.67. 1960 version 21.1.2 released May 14, 1999. (This is the followup to 21.0.67.
1957 The second version number was bumped to indicate the beginning of the 1961 The second version number was bumped to indicate the beginning of the
1958 "stable" series.) 1962 ``stable'' series.)
1959 @item 1963 @item
1960 version 21.1.3 released June 26, 1999. 1964 version 21.1.3 released June 26, 1999.
1961 @item 1965 @item
1962 version 21.1.4 released July 8, 1999. 1966 version 21.1.4 released July 8, 1999.
1963 @item 1967 @item
2043 @item 2047 @item
2044 version 21.2.39 released December 31, 2000. 2048 version 21.2.39 released December 31, 2000.
2045 @item 2049 @item
2046 version 21.2.40 released January 8, 2001. 2050 version 21.2.40 released January 8, 2001.
2047 @item 2051 @item
2048 version 21.2.41 "Polyhymnia" released January 17, 2001. 2052 version 21.2.41 ``Polyhymnia'' released January 17, 2001.
2049 @item 2053 @item
2050 version 21.2.42 "Poseidon" released January 20, 2001. 2054 version 21.2.42 ``Poseidon'' released January 20, 2001.
2051 @item 2055 @item
2052 version 21.2.43 "Terspichore" released January 26, 2001. 2056 version 21.2.43 ``Terspichore'' released January 26, 2001.
2053 @item 2057 @item
2054 version 21.2.44 "Thalia" released February 8, 2001. 2058 version 21.2.44 ``Thalia'' released February 8, 2001.
2055 @item 2059 @item
2056 version 21.2.45 "Thelxepeia" released February 23, 2001. 2060 version 21.2.45 ``Thelxepeia'' released February 23, 2001.
2057 @item 2061 @item
2058 version 21.2.46 "Urania" released March 21, 2001. 2062 version 21.2.46 ``Urania'' released March 21, 2001.
2059 @item 2063 @item
2060 version 21.2.47 "Zephir" released April 14, 2001. 2064 version 21.2.47 ``Zephir'' released April 14, 2001.
2061 @item 2065 @item
2062 XEmacs 21.4.0 "Solid Vapor" released April 16, 2001. 2066 XEmacs 21.4.0 ``Solid Vapor'' released April 16, 2001.
2063 @item 2067 @item
2064 XEmacs 21.4.1 "Copyleft" released April 19, 2001. 2068 XEmacs 21.4.1 ``Copyleft'' released April 19, 2001.
2065 @item 2069 @item
2066 XEmacs 21.4.2 "Developer-Friendly Unix APIs" released May 10, 2001. 2070 XEmacs 21.4.2 ``Developer-Friendly Unix APIs'' released May 10, 2001.
2067 @item 2071 @item
2068 XEmacs 21.4.3 "Academic Rigor" released May 17, 2001. 2072 XEmacs 21.4.3 ``Academic Rigor'' released May 17, 2001.
2069 @item 2073 @item
2070 XEmacs 21.4.4 "Artificial Intelligence" released July 28, 2001. 2074 XEmacs 21.4.4 ``Artificial Intelligence'' released July 28, 2001.
2071 @item 2075 @item
2072 XEmacs 21.4.5 "Civil Service" released October 23, 2001. 2076 XEmacs 21.4.5 ``Civil Service'' released October 23, 2001.
2073 @item 2077 @item
2074 XEmacs 21.4.6 "Common Lisp" released December 17, 2001. 2078 XEmacs 21.4.6 ``Common Lisp'' released December 17, 2001.
2075 @item 2079 @item
2076 XEmacs 21.4.7 "Economic Science" released May 4, 2002. 2080 XEmacs 21.4.7 ``Economic Science'' released May 4, 2002.
2077 @item 2081 @item
2078 XEmacs 21.4.8 "Honest Recruiter" released May 9, 2002. 2082 XEmacs 21.4.8 ``Honest Recruiter'' released May 9, 2002.
2079 @item 2083 @item
2080 XEmacs 21.4.9 "Informed Management" released August 23, 2002. 2084 XEmacs 21.4.9 ``Informed Management'' released August 23, 2002.
2081 @item 2085 @item
2082 XEmacs 21.4.10 "Military Intelligence" released November 2, 2002. 2086 XEmacs 21.4.10 ``Military Intelligence'' released November 2, 2002.
2083 @item 2087 @item
2084 XEmacs 21.4.11 "Native Windows TTY Support" released January 3, 2003. 2088 XEmacs 21.4.11 ``Native Windows TTY Support'' released January 3, 2003.
2085 @item 2089 @item
2086 XEmacs 21.4.12 "Portable Code" released January 15, 2003. 2090 XEmacs 21.4.12 ``Portable Code'' released January 15, 2003.
2087 @item 2091 @item
2088 XEmacs 21.4.13 "Rational FORTRAN" released May 25, 2003. 2092 XEmacs 21.4.13 ``Rational FORTRAN'' released May 25, 2003.
2089 @item 2093 @item
2090 XEmacs 21.4.14 "Reasonable Discussion" released September 3, 2003. 2094 XEmacs 21.4.14 ``Reasonable Discussion'' released September 3, 2003.
2091 @item 2095 @item
2092 XEmacs 21.4.15 "Security Through Obscurity" released February 2, 2004. 2096 XEmacs 21.4.15 ``Security Through Obscurity'' released February 2, 2004.
2093 @item 2097 @item
2094 XEmacs 21.4.16 "Successful IPO" released December 5, 2004. 2098 XEmacs 21.4.16 ``Successful IPO'' released December 5, 2004.
2095 @item 2099 @item
2096 version 21.5.0 "alfalfa" released April 18, 2001. 2100 version 21.5.0 ``alfalfa'' released April 18, 2001.
2097 @item 2101 @item
2098 version 21.5.1 "anise" released May 9, 2001. 2102 version 21.5.1 ``anise'' released May 9, 2001.
2099 @item 2103 @item
2100 version 21.5.2 "artichoke" released July 28, 2001. 2104 version 21.5.2 ``artichoke'' released July 28, 2001.
2101 @item 2105 @item
2102 version 21.5.3 "asparagus" released September 7, 2001. 2106 version 21.5.3 ``asparagus'' released September 7, 2001.
2103 @item 2107 @item
2104 version 21.5.4 "bamboo" released January 8, 2002. 2108 version 21.5.4 ``bamboo'' released January 8, 2002.
2105 @item 2109 @item
2106 version 21.5.5 "beets" released March 5, 2002. 2110 version 21.5.5 ``beets'' released March 5, 2002.
2107 @item 2111 @item
2108 version 21.5.6 "bok choi" released April 5, 2002. 2112 version 21.5.6 ``bok choi'' released April 5, 2002.
2109 @item 2113 @item
2110 version 21.5.7 "broccoflower" released July 2, 2002. 2114 version 21.5.7 ``broccoflower'' released July 2, 2002.
2111 @item 2115 @item
2112 version 21.5.8 "broccoli" released July 27, 2002. 2116 version 21.5.8 ``broccoli'' released July 27, 2002.
2113 @item 2117 @item
2114 version 21.5.9 "brussels sprouts" released August 30, 2002. 2118 version 21.5.9 ``brussels sprouts'' released August 30, 2002.
2115 @item 2119 @item
2116 version 21.5.10 "burdock" released January 4, 2003. 2120 version 21.5.10 ``burdock'' released January 4, 2003.
2117 @item 2121 @item
2118 version 21.5.11 "cabbage" released February 16, 2003. 2122 version 21.5.11 ``cabbage'' released February 16, 2003.
2119 @item 2123 @item
2120 version 21.5.12 "carrot" released April 24, 2003. 2124 version 21.5.12 ``carrot'' released April 24, 2003.
2121 @item 2125 @item
2122 version 21.5.13 "cauliflower" released May 10, 2003. 2126 version 21.5.13 ``cauliflower'' released May 10, 2003.
2123 @item 2127 @item
2124 version 21.5.14 "cassava" released June 1, 2003. 2128 version 21.5.14 ``cassava'' released June 1, 2003.
2125 @item 2129 @item
2126 version 21.5.15 "celery" released September 3, 2003. 2130 version 21.5.15 ``celery'' released September 3, 2003.
2127 @item 2131 @item
2128 version 21.5.16 "celeriac" released September 26, 2003. 2132 version 21.5.16 ``celeriac'' released September 26, 2003.
2129 @item 2133 @item
2130 version 21.5.17 "chayote" released March 22, 2004. 2134 version 21.5.17 ``chayote'' released March 22, 2004.
2131 @item 2135 @item
2132 version 21.5.18 "chestnut" released October 22, 2004. 2136 version 21.5.18 ``chestnut'' released October 22, 2004.
2133 @end itemize 2137 @end itemize
2134 2138
2135 @node The XEmacs Split, XEmacs from the Outside, A History of Emacs, Top 2139 @node The XEmacs Split, XEmacs from the Outside, A History of Emacs, Top
2136 @chapter The XEmacs Split 2140 @chapter The XEmacs Split
2137 @cindex XEmacs split 2141 @cindex XEmacs split
2151 to cooperate a bit with RMS, and the two versions of Emacs will merge. In 2155 to cooperate a bit with RMS, and the two versions of Emacs will merge. In
2152 fact there have been six to seven major attempts at merging, each running 2156 fact there have been six to seven major attempts at merging, each running
2153 hundreds of messages long and all of them coming from the XEmacs side. All 2157 hundreds of messages long and all of them coming from the XEmacs side. All
2154 have failed because they have eventually come to the same conclusion, which 2158 have failed because they have eventually come to the same conclusion, which
2155 is that RMS has no real interest in cooperation at all. If you work with 2159 is that RMS has no real interest in cooperation at all. If you work with
2156 him, you have to do it his way -- "my way or the highway". Specifically: 2160 him, you have to do it his way -- ``my way or the highway''. Specifically:
2157 2161
2158 @enumerate 2162 @enumerate
2159 @item 2163 @item
2160 2164
2161 RMS insists on having legal papers signed for every bit of code that goes 2165 RMS insists on having legal papers signed for every bit of code that goes
4046 zero or more Kanji characters followed by zero or more 4050 zero or more Kanji characters followed by zero or more
4047 Hiragana characters. 4051 Hiragana characters.
4048 @end display 4052 @end display
4049 4053
4050 Then, the problem is that now we can't say that a sequence of 4054 Then, the problem is that now we can't say that a sequence of
4051 word-constituents makes up a word. For instance, both Hiragana "A" 4055 word-constituents makes up a word. For instance, both Hiragana ``A''
4052 and Kanji "KAN" are word-constituents but the sequence of these two 4056 and Kanji ``KAN'' are word-constituents but the sequence of these two
4053 letters can't be a single word. 4057 letters can't be a single word.
4054 4058
4055 So, we introduced Sextword for Japanese letters. 4059 So, we introduced Sextword for Japanese letters.
4056 @end quotation 4060 @end quotation
4057 4061
5006 @item 5010 @item
5007 Any header-file declarations of the sort 5011 Any header-file declarations of the sort
5008 5012
5009 struct foobar; 5013 struct foobar;
5010 5014
5011 go into the "types" section of lisp.h. 5015 go into the ``types'' section of @file{lisp.h}.
5012 @end itemize 5016 @end itemize
5013 5017
5014 @node Writing New Modules, Working with Lisp Objects, Introduction to Writing C Code, Rules When Writing New C Code 5018 @node Writing New Modules, Working with Lisp Objects, Introduction to Writing C Code, Rules When Writing New C Code
5015 @section Writing New Modules 5019 @section Writing New Modules
5016 @cindex writing new modules 5020 @cindex writing new modules
5664 correct it or flag it as incorrect, as described in the previous 5668 correct it or flag it as incorrect, as described in the previous
5665 paragraph. Whenever you work on a section of code, @emph{always} make 5669 paragraph. Whenever you work on a section of code, @emph{always} make
5666 sure to update any comments to be correct -- or, at the very least, flag 5670 sure to update any comments to be correct -- or, at the very least, flag
5667 them as incorrect. 5671 them as incorrect.
5668 5672
5669 To indicate a "todo" or other problem, use four pound signs -- 5673 To indicate a ``todo'' or other problem, use four pound signs --
5670 i.e. @samp{####}. 5674 i.e. @samp{####}.
5671 5675
5672 @node Adding Global Lisp Variables, Writing Macros, Writing Good Comments, Rules When Writing New C Code 5676 @node Adding Global Lisp Variables, Writing Macros, Writing Good Comments, Rules When Writing New C Code
5673 @section Adding Global Lisp Variables 5677 @section Adding Global Lisp Variables
5674 @cindex global Lisp variables, adding 5678 @cindex global Lisp variables, adding
5849 @enumerate 5853 @enumerate
5850 @item 5854 @item
5851 Anything that's an lvalue can be evaluated more than once. 5855 Anything that's an lvalue can be evaluated more than once.
5852 @item 5856 @item
5853 Macros where anything else can be evaluated more than once should 5857 Macros where anything else can be evaluated more than once should
5854 have the word "unsafe" in their name (exceptions may be made for 5858 have the word ``unsafe'' in their name (exceptions may be made for
5855 large sets of macros that evaluate arguments of certain types more 5859 large sets of macros that evaluate arguments of certain types more
5856 than once, e.g. struct buffer * arguments, when clearly indicated in 5860 than once, e.g. struct buffer * arguments, when clearly indicated in
5857 the macro documentation). These macros are generally meant to be 5861 the macro documentation). These macros are generally meant to be
5858 called only by other macros that have already stored the calling 5862 called only by other macros that have already stored the calling
5859 values in temporary variables. 5863 values in temporary variables.
5881 Capitalize macros doing stuff obviously impossible with (C) 5885 Capitalize macros doing stuff obviously impossible with (C)
5882 functions, e.g. directly modifying arguments as if they were passed by 5886 functions, e.g. directly modifying arguments as if they were passed by
5883 reference. 5887 reference.
5884 @item 5888 @item
5885 Capitalize macros that evaluate @strong{any} argument more than once regardless 5889 Capitalize macros that evaluate @strong{any} argument more than once regardless
5886 of whether that's "allowed" (e.g. buffer arguments). 5890 of whether that's ``allowed'' (e.g. buffer arguments).
5887 @item 5891 @item
5888 Capitalize macros that directly access a field in a Lisp_Object or 5892 Capitalize macros that directly access a field in a Lisp_Object or
5889 its equivalent underlying structure. In such cases, access through the 5893 its equivalent underlying structure. In such cases, access through the
5890 Lisp_Object precedes the macro with an X, and access through the underlying 5894 Lisp_Object precedes the macro with an X, and access through the underlying
5891 structure doesn't. 5895 structure doesn't.
5936 a search-and-replace is done to change type names and such. Some people 5940 a search-and-replace is done to change type names and such. Some people
5937 disagree with such changes, and certainly if done without good reason 5941 disagree with such changes, and certainly if done without good reason
5938 will just lead to headaches. But it's important to keep the code clean 5942 will just lead to headaches. But it's important to keep the code clean
5939 and understandable, and consistent naming goes a long way towards this. 5943 and understandable, and consistent naming goes a long way towards this.
5940 5944
5941 An example of the right way to do this was the so-called "great integral 5945 An example of the right way to do this was the so-called ``great integral
5942 type renaming". 5946 type renaming''.
5943 5947
5944 @menu 5948 @menu
5945 * Great Integral Type Renaming:: 5949 * Great Integral Type Renaming::
5946 * Text/Char Type Renaming:: 5950 * Text/Char Type Renaming::
5947 @end menu 5951 @end menu
5964 @item 5968 @item
5965 All integral types that measure quantities of anything are signed. Some 5969 All integral types that measure quantities of anything are signed. Some
5966 people disagree vociferously with this, but their arguments are mostly 5970 people disagree vociferously with this, but their arguments are mostly
5967 theoretical, and are vastly outweighed by the practical headaches of 5971 theoretical, and are vastly outweighed by the practical headaches of
5968 mixing signed and unsigned values, and more importantly by the far 5972 mixing signed and unsigned values, and more importantly by the far
5969 increased likelihood of inadvertent bugs: Because of the broken "viral" 5973 increased likelihood of inadvertent bugs: Because of the broken ``viral''
5970 nature of unsigned quantities in C (operations involving mixed 5974 nature of unsigned quantities in C (operations involving mixed
5971 signed/unsigned are done unsigned, when exactly the opposite is nearly 5975 signed/unsigned are done unsigned, when exactly the opposite is nearly
5972 always wanted), even a single error in declaring a quantity unsigned 5976 always wanted), even a single error in declaring a quantity unsigned
5973 that should be signed, or even the even more subtle error of comparing 5977 that should be signed, or even the even more subtle error of comparing
5974 signed and unsigned values and forgetting the necessary cast, can be 5978 signed and unsigned values and forgetting the necessary cast, can be
5975 catastrophic, as comparisons will yield wrong results. -Wsign-compare 5979 catastrophic, as comparisons will yield wrong results. @samp{-Wsign-compare}
5976 is turned on specifically to catch this, but this tends to result in a 5980 is turned on specifically to catch this, but this tends to result in a
5977 great number of warnings when mixing signed and unsigned, and the casts 5981 great number of warnings when mixing signed and unsigned, and the casts
5978 are annoying. More has been written on this elsewhere. 5982 are annoying. More has been written on this elsewhere.
5979 5983
5980 @item 5984 @item
5989 Type names should be relatively short (no more than 10 characters or 5993 Type names should be relatively short (no more than 10 characters or
5990 so), with the first letter capitalized and no underscores if they can at 5994 so), with the first letter capitalized and no underscores if they can at
5991 all be avoided. 5995 all be avoided.
5992 5996
5993 @item 5997 @item
5994 "count" == a zero-based measurement of some quantity. Includes sizes, 5998 ``count'' == a zero-based measurement of some quantity. Includes sizes,
5995 offsets, and indexes. 5999 offsets, and indexes.
5996 6000
5997 @item 6001 @item
5998 "bpos" == a one-based measurement of a position in a buffer. "Charbpos" 6002 ``bpos'' == a one-based measurement of a position in a buffer. ``Charbpos''
5999 and "Bytebpos" count text in the buffer, rather than bytes in memory; 6003 and ``Bytebpos'' count text in the buffer, rather than bytes in memory;
6000 thus Bytebpos does not directly correspond to the memory representation. 6004 thus Bytebpos does not directly correspond to the memory representation.
6001 Use "Membpos" for this. 6005 Use ``Membpos'' for this.
6002 6006
6003 @item 6007 @item
6004 "Char" refers to internal-format characters, not to the C type "char", 6008 ``Char'' refers to internal-format characters, not to the C type ``char'',
6005 which is really a byte. 6009 which is really a byte.
6006 @end itemize 6010 @end itemize
6007 6011
6008 For the actual name changes, see the script below. 6012 For the actual name changes, see the script below.
6009 6013
6094 #endif 6098 #endif
6095 6099
6096 /* There have been some arguments over what the type should be that 6100 /* There have been some arguments over what the type should be that
6097 specifies a count of bytes in a data block to be written out or read in, 6101 specifies a count of bytes in a data block to be written out or read in,
6098 using @code{Lstream_read()}, @code{Lstream_write()}, and related functions. 6102 using @code{Lstream_read()}, @code{Lstream_write()}, and related functions.
6099 Originally it was long, which worked fine; Martin "corrected" these to 6103 Originally it was long, which worked fine; Martin ``corrected'' these to
6100 size_t and ssize_t on the grounds that this is theoretically cleaner and 6104 size_t and ssize_t on the grounds that this is theoretically cleaner and
6101 is in keeping with the C standards. Unfortunately, this practice is 6105 is in keeping with the C standards. Unfortunately, this practice is
6102 horribly error-prone due to design flaws in the way that mixed 6106 horribly error-prone due to design flaws in the way that mixed
6103 signed/unsigned arithmetic happens. In fact, by doing this change, 6107 signed/unsigned arithmetic happens. In fact, by doing this change,
6104 Martin introduced a subtle but fatal error that caused the operation of 6108 Martin introduced a subtle but fatal error that caused the operation of
6469 fixed---use the @code{Known-Bug-Expect-Failure} wrapper macro to mark 6473 fixed---use the @code{Known-Bug-Expect-Failure} wrapper macro to mark
6470 them. 6474 them.
6471 6475
6472 @deffn Macro Known-Bug-Expect-Failure body 6476 @deffn Macro Known-Bug-Expect-Failure body
6473 Arrange for failing tests in @var{body} to generate messages prefixed 6477 Arrange for failing tests in @var{body} to generate messages prefixed
6474 with "KNOWN BUG:" instead of "FAIL:". @var{body} is a @code{progn}-like 6478 with ``KNOWN BUG:'' instead of ``FAIL:''. @var{body} is a @code{progn}-like
6475 body, and may contain several tests. 6479 body, and may contain several tests.
6476 @end deffn 6480 @end deffn
6477 6481
6478 A lot of the tests we run push limits; suppress Ebola warning messages 6482 A lot of the tests we run push limits; suppress Ebola warning messages
6479 with the @code{Ignore-Ebola} wrapper macro. 6483 with the @code{Ignore-Ebola} wrapper macro.
6650 with added or deleted files.} If you are lucky, the operation will 6654 with added or deleted files.} If you are lucky, the operation will
6651 simply fail. If you are less lucky, it will proceed, but make the 6655 simply fail. If you are less lucky, it will proceed, but make the
6652 adds and deletes on the main line, which you do not want at all. 6656 adds and deletes on the main line, which you do not want at all.
6653 Therefore, you must undo all adds and deletes. To find out what is 6657 Therefore, you must undo all adds and deletes. To find out what is
6654 added and deleted, use something like @code{cvs -n update >&! 6658 added and deleted, use something like @code{cvs -n update >&!
6655 cvs.out}, which does a "dry run". (You did make a backup copy first, 6659 cvs.out}, which does a ``dry run''. (You did make a backup copy first,
6656 right? What if you forgot the @samp{-n}, for example, and weren't 6660 right? What if you forgot the @samp{-n}, for example, and weren't
6657 prepared for the sudden onslaught of merging action?) Take a look at 6661 prepared for the sudden onslaught of merging action?) Take a look at
6658 the output file @file{cvs.out} and check very carefully for newly 6662 the output file @file{cvs.out} and check very carefully for newly
6659 added files (marked with an @samp{A}) and newly removed files (marked 6663 added files (marked with an @samp{A}) and newly removed files (marked
6660 with an @samp{R}). Double check that your newly added files are in 6664 with an @samp{R}). Double check that your newly added files are in
6682 crw tag -b ben-mule-21-5 6686 crw tag -b ben-mule-21-5
6683 @end example 6687 @end example
6684 6688
6685 Note that this doesn't actually do anything to your local workspace! 6689 Note that this doesn't actually do anything to your local workspace!
6686 It basically just creates another tag in the repository, identical to 6690 It basically just creates another tag in the repository, identical to
6687 the branch point tag but internally marked as a "branch tag" rather 6691 the branch point tag but internally marked as a ``branch tag'' rather
6688 than a regular tag. 6692 than a regular tag.
6689 6693
6690 @item 6694 @item
6691 Now, move your workspace onto the branch: 6695 Now, move your workspace onto the branch:
6692 6696
7016 and when you add a new element, the array automatically resizes itself 7020 and when you add a new element, the array automatically resizes itself
7017 if it isn't big enough. Dynarrs are extensively used in the redisplay 7021 if it isn't big enough. Dynarrs are extensively used in the redisplay
7018 mechanism. 7022 mechanism.
7019 7023
7020 7024
7021 A "dynamic array" is a contiguous array of fixed-size elements where there 7025 A ``dynamic array'' is a contiguous array of fixed-size elements where there
7022 is no upper limit (except available memory) on the number of elements in the 7026 is no upper limit (except available memory) on the number of elements in the
7023 array. Because the elements are maintained contiguously, space is used 7027 array. Because the elements are maintained contiguously, space is used
7024 efficiently (no per-element pointers necessary) and random access to a 7028 efficiently (no per-element pointers necessary) and random access to a
7025 particular element is in constant time. At any one point, the block of memory 7029 particular element is in constant time. At any one point, the block of memory
7026 that holds the array has an upper limit; if this limit is exceeded, the 7030 that holds the array has an upper limit; if this limit is exceeded, the
7027 memory is realloc()ed into a new array that is twice as big. Assuming that 7031 memory is @code{realloc()}ed into a new array that is twice as big. Assuming that
7028 the time to grow the array is on the order of the new size of the array 7032 the time to grow the array is on the order of the new size of the array
7029 block, this scheme has a provably constant amortized time (i.e. average 7033 block, this scheme has a provably constant amortized time (i.e. average
7030 time over all additions). 7034 time over all additions).
7031 7035
7032 When you add elements or retrieve elements, pointers are used. Note that 7036 When you add elements or retrieve elements, pointers are used. Note that
7130 onto a linked list, so they can be efficiently reused. This data type 7134 onto a linked list, so they can be efficiently reused. This data type
7131 is not much used in XEmacs currently, because it's a fairly new 7135 is not much used in XEmacs currently, because it's a fairly new
7132 addition. 7136 addition.
7133 7137
7134 7138
7135 A "block-type object" is used to efficiently allocate and free blocks 7139 A ``block-type object'' is used to efficiently allocate and free blocks
7136 of a particular size. Freed blocks are remembered in a free list and 7140 of a particular size. Freed blocks are remembered in a free list and
7137 are reused as necessary to allocate new blocks, so as to avoid as 7141 are reused as necessary to allocate new blocks, so as to avoid as
7138 much as possible making calls to malloc() and free(). 7142 much as possible making calls to @code{malloc()} and @code{free()}.
7139 7143
7140 This is a container object. Declare a block-type object of a specific type 7144 This is a container object. Declare a block-type object of a specific type
7141 as follows: 7145 as follows:
7142 7146
7143 struct mytype_blocktype @{ 7147 struct mytype_blocktype @{
8275 @code{this_one_is_unmarkable} in @code{alloc.c}). 8279 @code{this_one_is_unmarkable} in @code{alloc.c}).
8276 8280
8277 Now, the actual marking is feasible. We do so by once using the macro 8281 Now, the actual marking is feasible. We do so by once using the macro
8278 @code{MARK_RECORD_HEADER} to mark the object itself (actually the 8282 @code{MARK_RECORD_HEADER} to mark the object itself (actually the
8279 special flag in the lrecord header), and calling its special marker 8283 special flag in the lrecord header), and calling its special marker
8280 "method" @code{marker} if available. The marker method marks every 8284 ``method'' @code{marker} if available. The marker method marks every
8281 other object that is in reach from our current object. Note that these 8285 other object that is in reach from our current object. Note that these
8282 marker methods should not call @code{mark_object} recursively, but 8286 marker methods should not call @code{mark_object} recursively, but
8283 instead should return the next object from where further marking has to 8287 instead should return the next object from where further marking has to
8284 be performed. 8288 be performed.
8285 8289
8330 @code{sweep_conses}, @code{sweep_bit_vectors_1}, 8334 @code{sweep_conses}, @code{sweep_bit_vectors_1},
8331 @code{sweep_compiled_functions}, @code{sweep_floats}, 8335 @code{sweep_compiled_functions}, @code{sweep_floats},
8332 @code{sweep_symbols}, @code{sweep_extents}, @code{sweep_markers} and 8336 @code{sweep_symbols}, @code{sweep_extents}, @code{sweep_markers} and
8333 @code{sweep_events}. They are the fixed-size types cons, floats, 8337 @code{sweep_events}. They are the fixed-size types cons, floats,
8334 compiled-functions, symbol, marker, extent, and event stored in 8338 compiled-functions, symbol, marker, extent, and event stored in
8335 so-called "frob blocks", and therefore we can basically do the same on 8339 so-called ``frob blocks'', and therefore we can basically do the same on
8336 every type of object, using the same macros, especially defined only to 8340 every type of object, using the same macros, especially defined only to
8337 handle everything with respect to fixed-size blocks. The only fixed-size 8341 handle everything with respect to fixed-size blocks. The only fixed-size
8338 type that is not handled here is the fixed-size portion of strings, 8342 type that is not handled here is the fixed-size portion of strings,
8339 because we took special care of them earlier. 8343 because we took special care of them earlier.
8340 8344
10055 complicated depending on how much information we cache. In addition to 10059 complicated depending on how much information we cache. In addition to
10056 the known region, we always cache the correct conversions for point, 10060 the known region, we always cache the correct conversions for point,
10057 BEGV, and ZV, and in addition to this we cache 16 positions where the 10061 BEGV, and ZV, and in addition to this we cache 16 positions where the
10058 conversion is known. We only look in the cache or update it when we 10062 conversion is known. We only look in the cache or update it when we
10059 need to move the known region more than a certain amount (currently 50 10063 need to move the known region more than a certain amount (currently 50
10060 chars), and then we throw away a "random" value and replace it with the 10064 chars), and then we throw away a ``random'' value and replace it with the
10061 newly calculated value. 10065 newly calculated value.
10062 10066
10063 Finally, we maintain an extra flag that tracks whether the buffer is 10067 Finally, we maintain an extra flag that tracks whether the buffer is
10064 entirely ASCII, to speed up the conversions even more. This flag is 10068 entirely ASCII, to speed up the conversions even more. This flag is
10065 actually of dubious value because in an entirely-ASCII buffer the known 10069 actually of dubious value because in an entirely-ASCII buffer the known
10091 track of a shifter value (0, 1, or 2) indicating how much to shift. 10095 track of a shifter value (0, 1, or 2) indicating how much to shift.
10092 Multiplying by 3 can be implemented by doubling and then adding the 10096 Multiplying by 3 can be implemented by doubling and then adding the
10093 original value. Dividing by 3, alas, cannot be implemented in any 10097 original value. Dividing by 3, alas, cannot be implemented in any
10094 simple shift/subtract method, as far as I know; so we just do a table 10098 simple shift/subtract method, as far as I know; so we just do a table
10095 lookup. For simplicity, we use a table of size 128K, which indexes the 10099 lookup. For simplicity, we use a table of size 128K, which indexes the
10096 "divide-by-3" values for the first 64K non-negative numbers. (Note that 10100 ``divide-by-3'' values for the first 64K non-negative numbers. (Note that
10097 we can increase the size up to 384K, i.e. indexing the first 192K 10101 we can increase the size up to 384K, i.e. indexing the first 192K
10098 non-negative numbers, while still using shorts in the array.) This also 10102 non-negative numbers, while still using shorts in the array.) This also
10099 means that the size of the known region can be at most 64K for 10103 means that the size of the known region can be at most 64K for
10100 width-three characters. 10104 width-three characters.
10101 @end quotation 10105 @end quotation
10121 @item 10125 @item
10122 the position of the gap 10126 the position of the gap
10123 @item 10127 @item
10124 the last value we computed 10128 the last value we computed
10125 @item 10129 @item
10126 a set of positions that are "far away" from previously computed positions 10130 a set of positions that are ``far away'' from previously computed positions
10127 (5000 chars currently; #### perhaps should be smaller) 10131 (5000 chars currently; #### perhaps should be smaller)
10128 @end itemize 10132 @end itemize
10129 10133
10130 For each position, we @code{CONSIDER()} it. This means: 10134 For each position, we @code{CONSIDER()} it. This means:
10131 10135
10147 the simple loop in FSF with the use of @code{bytecount_to_charcount()}, 10151 the simple loop in FSF with the use of @code{bytecount_to_charcount()},
10148 @code{charcount_to_bytecount()}, @code{bytecount_to_charcount_down()}, or 10152 @code{charcount_to_bytecount()}, @code{bytecount_to_charcount_down()}, or
10149 @code{charcount_to_bytecount_down()}. (The latter two I added for this purpose.) 10153 @code{charcount_to_bytecount_down()}. (The latter two I added for this purpose.)
10150 These scan 4 or 8 bytes at a time through purely single-byte characters. 10154 These scan 4 or 8 bytes at a time through purely single-byte characters.
10151 10155
10152 If the amount we had to scan was more than our "far away" distance (5000 10156 If the amount we had to scan was more than our ``far away'' distance (5000
10153 characters, see above), then cache the new position. 10157 characters, see above), then cache the new position.
10154 10158
10155 #### Things to do: 10159 #### Things to do:
10156 10160
10157 @itemize @bullet 10161 @itemize @bullet
10158 @item 10162 @item
10159 Look at the most recent GNU Emacs to see whether anything has changed. 10163 Look at the most recent GNU Emacs to see whether anything has changed.
10160 @item 10164 @item
10161 Think about whether it makes sense to try to implement some sort of 10165 Think about whether it makes sense to try to implement some sort of
10162 known region or list of "known regions", like we had before. This would 10166 known region or list of ``known regions'', like we had before. This would
10163 be a region of entirely single-byte characters that we can check very 10167 be a region of entirely single-byte characters that we can check very
10164 quickly. (Previously I used a range of same-width characters of any 10168 quickly. (Previously I used a range of same-width characters of any
10165 size; but this adds extra complexity and slows down the scanning, and is 10169 size; but this adds extra complexity and slows down the scanning, and is
10166 probably not worth it.) As part of the scanning process in 10170 probably not worth it.) As part of the scanning process in
10167 @code{bytecount_to_charcount()} et al., we skip over chunks of entirely 10171 @code{bytecount_to_charcount()} et al., we skip over chunks of entirely
10375 In terms of reading the actual code, there are five optimizations 10379 In terms of reading the actual code, there are five optimizations
10376 (obfuscations, if you like) that have been done. 10380 (obfuscations, if you like) that have been done.
10377 10381
10378 @enumerate 10382 @enumerate
10379 @item 10383 @item
10380 An explicit "failure stack" has been substituted for recursion. 10384 An explicit ``failure stack'' has been substituted for recursion.
10381 10385
10382 @item 10386 @item
10383 The @code{match_1_operator}, @code{next_p}, and @code{next_b} functions 10387 The @code{match_1_operator}, @code{next_p}, and @code{next_b} functions
10384 are actually inlined into the @code{match} function for efficiency. 10388 are actually inlined into the @code{match} function for efficiency.
10385 Then the pointer movement is interspersed with the matching operations. 10389 Then the pointer movement is interspersed with the matching operations.
10388 If the operator uses buffer context, the buffer pointer movement is 10392 If the operator uses buffer context, the buffer pointer movement is
10389 sometimes implicit in the operations retrieving the context. 10393 sometimes implicit in the operations retrieving the context.
10390 10394
10391 @item 10395 @item
10392 Some cases are combined into short preparation for individual cases, and 10396 Some cases are combined into short preparation for individual cases, and
10393 a "fall-through" into combined code for several cases. 10397 a ``fall-through'' into combined code for several cases.
10394 10398
10395 @item 10399 @item
10396 The @code{pattern} type is not an explicit @samp{struct}. Instead, the 10400 The @code{pattern} type is not an explicit @samp{struct}. Instead, the
10397 data (including, @emph{e.g.}, @samp{range_table}) is inlined into the 10401 data (including, @emph{e.g.}, @samp{range_table}) is inlined into the
10398 compiled bytecode. This leads to bizarre code in the interpreter like 10402 compiled bytecode. This leads to bizarre code in the interpreter like
10407 @example 10411 @example
10408 ..., 'range', count, first_8_flags, second_8_flags, ..., next_op, ... 10412 ..., 'range', count, first_8_flags, second_8_flags, ..., next_op, ...
10409 @end example 10413 @end example
10410 @end enumerate 10414 @end enumerate
10411 10415
10412 But if you keep your eye on the "switch in a loop" structure, you 10416 But if you keep your eye on the ``switch in a loop'' structure, you
10413 should be able to understand the parts you need. 10417 should be able to understand the parts you need.
10414 10418
10415 @node Multilingual Support, Consoles; Devices; Frames; Windows, Text, Top 10419 @node Multilingual Support, Consoles; Devices; Frames; Windows, Text, Top
10416 @chapter Multilingual Support 10420 @chapter Multilingual Support
10417 @cindex Mule character sets and encodings 10421 @cindex Mule character sets and encodings
10869 a simple charset like ASCII, there is only one encoding normally used -- 10873 a simple charset like ASCII, there is only one encoding normally used --
10870 each character is represented by a single byte, with the same value as 10874 each character is represented by a single byte, with the same value as
10871 its code point. For more complicated charsets, however, things are not 10875 its code point. For more complicated charsets, however, things are not
10872 so obvious. Unicode version 2, for example, is a large charset with 10876 so obvious. Unicode version 2, for example, is a large charset with
10873 thousands of characters, each indexed by a 16-bit number, often 10877 thousands of characters, each indexed by a 16-bit number, often
10874 represented in hex, e.g. 0x05D0 for the Hebrew letter "aleph". One 10878 represented in hex, e.g. 0x05D0 for the Hebrew letter ``aleph''. One
10875 obvious encoding uses two bytes per character (actually two encodings, 10879 obvious encoding uses two bytes per character (actually two encodings,
10876 depending on which of the two possible byte orderings is chosen). This 10880 depending on which of the two possible byte orderings is chosen). This
10877 encoding is convenient for internal processing of Unicode text; however, 10881 encoding is convenient for internal processing of Unicode text; however,
10878 it's incompatible with ASCII, so a different encoding, e.g. UTF-8, is 10882 it's incompatible with ASCII, so a different encoding, e.g. UTF-8, is
10879 usually used for external text, for example files or e-mail. UTF-8 10883 usually used for external text, for example files or e-mail. UTF-8
10890 10894
10891 In an ASCII or single-European-character-set world, life is very simple. 10895 In an ASCII or single-European-character-set world, life is very simple.
10892 There are 256 characters, and each character is represented using the 10896 There are 256 characters, and each character is represented using the
10893 numbers 0 through 255, which fit into a single byte. With a few 10897 numbers 0 through 255, which fit into a single byte. With a few
10894 exceptions (such as case-changing operations or syntax classes like 10898 exceptions (such as case-changing operations or syntax classes like
10895 'whitespace'), "text" is simply an array of indices into a font. You 10899 @code{whitespace}), ``text'' is simply an array of indices into a font. You
10896 can get different languages simply by choosing fonts with different 10900 can get different languages simply by choosing fonts with different
10897 8-bit character sets (ISO-8859-1, -2, special-symbol fonts, etc.), and 10901 8-bit character sets (ISO-8859-1, -2, special-symbol fonts, etc.), and
10898 everything will "just work" as long as anyone else receiving your text 10902 everything will ``just work'' as long as anyone else receiving your text
10899 uses a compatible font. 10903 uses a compatible font.
10900 10904
10901 In the multi-lingual world, however, it is much more complicated. There 10905 In the multi-lingual world, however, it is much more complicated. There
10902 are a great number of different characters which are organized in a 10906 are a great number of different characters which are organized in a
10903 complex fashion into various character sets. The representation to use 10907 complex fashion into various character sets. The representation to use
10943 text as possible. No operations should ever be performed on text encoded 10947 text as possible. No operations should ever be performed on text encoded
10944 in an external representation other than simple copying, because no 10948 in an external representation other than simple copying, because no
10945 assumptions can reliably be made about the format of this text. You 10949 assumptions can reliably be made about the format of this text. You
10946 cannot assume, for example, that the end of text is terminated by a null 10950 cannot assume, for example, that the end of text is terminated by a null
10947 byte. (For example, if the text is Unicode, it will have many null bytes 10951 byte. (For example, if the text is Unicode, it will have many null bytes
10948 in it.) You cannot find the next "slash" character by searching through 10952 in it.) You cannot find the next ``slash'' character by searching through
10949 the bytes until you find a byte that looks like a "slash" character, 10953 the bytes until you find a byte that looks like a ``slash'' character,
10950 because it might actually be the second byte of a Kanji character. 10954 because it might actually be the second byte of a Kanji character.
10951 Furthermore, all text in the internal representation must be converted, 10955 Furthermore, all text in the internal representation must be converted,
10952 even if it is known to be completely ASCII, because the external 10956 even if it is known to be completely ASCII, because the external
10953 representation may not be ASCII compatible (for example, if it is 10957 representation may not be ASCII compatible (for example, if it is
10954 Unicode). 10958 Unicode).
10974 the structures of a particular external encoding and the methods required 10978 the structures of a particular external encoding and the methods required
10975 to convert to and from this encoding. A facility exists to create coding 10979 to convert to and from this encoding. A facility exists to create coding
10976 system aliases, which in essence gives a single coding system two 10980 system aliases, which in essence gives a single coding system two
10977 different names. It is effectively used in XEmacs to provide a layer of 10981 different names. It is effectively used in XEmacs to provide a layer of
10978 abstraction on top of the actual coding systems. For example, the coding 10982 abstraction on top of the actual coding systems. For example, the coding
10979 system alias "file-name" points to whichever coding system is currently 10983 system alias ``file-name'' points to whichever coding system is currently
10980 used for encoding and decoding file names as passed to or retrieved from 10984 used for encoding and decoding file names as passed to or retrieved from
10981 system calls. In general, the actual encoding will differ from system to 10985 system calls. In general, the actual encoding will differ from system to
10982 system, and also on the particular locale that the user is in. The use 10986 system, and also on the particular locale that the user is in. The use
10983 of the file-name alias effectively hides that implementation detail on 10987 of the file-name alias effectively hides that implementation detail on
10984 top of that abstract interface layer which provides a unified set of 10988 top of that abstract interface layer which provides a unified set of
11485 C = plain char, when the base type is unsigned 11489 C = plain char, when the base type is unsigned
11486 U = unsigned 11490 U = unsigned
11487 S = signed 11491 S = signed
11488 @end example 11492 @end example
11489 11493
11490 (Formerly I had a comment saying that type (e) "should be replaced with 11494 (Formerly I had a comment saying that type (e) ``should be replaced with
11491 void *". However, there are in fact many places where an unsigned char 11495 void *''. However, there are in fact many places where an unsigned char
11492 * might be used -- e.g. for ease in pointer computation, since void * 11496 * might be used -- e.g. for ease in pointer computation, since void *
11493 doesn't allow this, and for compatibility with external APIs.) 11497 doesn't allow this, and for compatibility with external APIs.)
11494 11498
11495 Note that these typedefs are purely for documentation purposes; from 11499 Note that these typedefs are purely for documentation purposes; from
11496 the C code's perspective, they are exactly equivalent to @code{char *}, 11500 the C code's perspective, they are exactly equivalent to @code{char *},
11507 @node Different Ways of Seeing Internal Text, Buffer Positions, Byte Types, Byte/Character Types; Buffer Positions; Other Typedefs 11511 @node Different Ways of Seeing Internal Text, Buffer Positions, Byte Types, Byte/Character Types; Buffer Positions; Other Typedefs
11508 @subsection Different Ways of Seeing Internal Text 11512 @subsection Different Ways of Seeing Internal Text
11509 @cindex different ways of seeing internal text 11513 @cindex different ways of seeing internal text
11510 11514
11511 There are various ways of representing internal text. The two primary 11515 There are various ways of representing internal text. The two primary
11512 ways are as an "array" of individual characters and as a 11516 ways are as an ``array'' of individual characters and as a
11513 "stream" of bytes. In the ASCII world, where there are only 256 11517 ``stream'' of bytes. In the ASCII world, where there are only 256
11514 characters at most, things are easy because each character fits into a 11518 characters at most, things are easy because each character fits into a
11515 byte. In general, however, this is not true -- see the above discussion 11519 byte. In general, however, this is not true -- see the above discussion
11516 of characters vs. encodings. 11520 of characters vs. encodings.
11517 11521
11518 In some cases, it's also important to distinguish between a stream 11522 In some cases, it's also important to distinguish between a stream
11519 representation as a series of bytes and as a series of textual units. 11523 representation as a series of bytes and as a series of textual units.
11520 This is particularly important wrt Unicode. The UTF-16 representation 11524 This is particularly important wrt Unicode. The UTF-16 representation
11521 (sometimes referred to, rather sloppily, as simply the "Unicode" format) 11525 (sometimes referred to, rather sloppily, as simply the ``Unicode'' format)
11522 represents text as a series of 16-bit units. Mostly, each unit 11526 represents text as a series of 16-bit units. Mostly, each unit
11523 corresponds to a single character, but not necessarily, as characters 11527 corresponds to a single character, but not necessarily, as characters
11524 outside of the range 0-65535 (the BMP or "Basic Multilingual Plane" of 11528 outside of the range 0-65535 (the BMP or ``Basic Multilingual Plane'' of
11525 Unicode) require two 16-bit units, through the mechanism of 11529 Unicode) require two 16-bit units, through the mechanism of
11526 "surrogates". When a series of 16-bit units is serialized into a byte 11530 ``surrogates''. When a series of 16-bit units is serialized into a byte
11527 stream, there are at least two possible representations, little-endian 11531 stream, there are at least two possible representations, little-endian
11528 and big-endian, and which one is used may depend on the native format of 11532 and big-endian, and which one is used may depend on the native format of
11529 16-bit integers in the CPU of the machine that XEmacs is running 11533 16-bit integers in the CPU of the machine that XEmacs is running
11530 on. (Similarly, UTF-32 is logically a representation with 32-bit textual 11534 on. (Similarly, UTF-32 is logically a representation with 32-bit textual
11531 units.) 11535 units.)
11538 @item 11542 @item
11539 UTF-16 has 2-byte (16-bit) units. 11543 UTF-16 has 2-byte (16-bit) units.
11540 @item 11544 @item
11541 UTF-32 has 4-byte (32-bit) units. 11545 UTF-32 has 4-byte (32-bit) units.
11542 @item 11546 @item
11543 XEmacs-internal encoding (the old "Mule" encoding) has 1-byte (8-bit) 11547 XEmacs-internal encoding (the old ``Mule'' encoding) has 1-byte (8-bit)
11544 units. 11548 units.
11545 @item 11549 @item
11546 UTF-7 technically has 7-bit units that are within the "mail-safe" range 11550 UTF-7 technically has 7-bit units that are within the ``mail-safe'' range
11547 (ASCII 32 - 126 plus a few control characters), but normally is encoded 11551 (ASCII 32 - 126 plus a few control characters), but normally is encoded
11548 in an 8-bit stream. (UTF-7 is also a modal encoding, since it has a 11552 in an 8-bit stream. (UTF-7 is also a modal encoding, since it has a
11549 normal mode where printable ASCII characters represent themselves and a 11553 normal mode where printable ASCII characters represent themselves and a
11550 shifted mode, introduced with a plus sign, where a base-64 encoding is 11554 shifted mode, introduced with a plus sign, where a base-64 encoding is
11551 used.) 11555 used.)
11606 @table @code 11610 @table @code
11607 @item Ibyte 11611 @item Ibyte
11608 The data in a buffer or string is logically made up of Ibyte objects, 11612 The data in a buffer or string is logically made up of Ibyte objects,
11609 where an Ibyte takes up the same amount of space as a char. (It is 11613 where an Ibyte takes up the same amount of space as a char. (It is
11610 declared differently, though, to catch invalid usages.) Strings stored 11614 declared differently, though, to catch invalid usages.) Strings stored
11611 using Ibytes are said to be in "internal format". The important 11615 using Ibytes are said to be in ``internal format''. The important
11612 characteristics of internal format are 11616 characteristics of internal format are
11613 11617
11614 @itemize @minus 11618 @itemize @minus
11615 @item 11619 @item
11616 ASCII characters are represented as a single Ibyte, in the range 0 - 11620 ASCII characters are represented as a single Ibyte, in the range 0 -
11659 11663
11660 This means that Ichar values are upwardly compatible with the standard 11664 This means that Ichar values are upwardly compatible with the standard
11661 8-bit representation of ASCII/ISO-8859-1. 11665 8-bit representation of ASCII/ISO-8859-1.
11662 11666
11663 @item Extbyte 11667 @item Extbyte
11664 Strings that go in or out of Emacs are in "external format", typedef'ed 11668 Strings that go in or out of Emacs are in ``external format'', typedef'ed
11665 as an array of char or a char *. There is more than one external format 11669 as an array of char or a char *. There is more than one external format
11666 (JIS, EUC, etc.) but they all have similar properties. They are modal 11670 (JIS, EUC, etc.) but they all have similar properties. They are modal
11667 encodings, which is to say that the meaning of particular bytes is not 11671 encodings, which is to say that the meaning of particular bytes is not
11668 fixed but depends on what "mode" the string is currently in (e.g. bytes 11672 fixed but depends on what ``mode'' the string is currently in (e.g. bytes
11669 in the range 0 - 0x7f might be interpreted as ASCII, or as Hiragana, or 11673 in the range 0 - 0x7f might be interpreted as ASCII, or as Hiragana, or
11670 as 2-byte Kanji, depending on the current mode). The mode starts out in 11674 as 2-byte Kanji, depending on the current mode). The mode starts out in
11671 ASCII/ISO-8859-1 and is switched using escape sequences -- for example, 11675 ASCII/ISO-8859-1 and is switched using escape sequences -- for example,
11672 in the JIS encoding, 'ESC $ B' switches to a mode where pairs of bytes 11676 in the JIS encoding, 'ESC $ B' switches to a mode where pairs of bytes
11673 in the range 0 - 0x7f are interpreted as Kanji characters. 11677 in the range 0 - 0x7f are interpreted as Kanji characters.
11693 11697
11694 There are three possible ways to specify positions in a buffer. All 11698 There are three possible ways to specify positions in a buffer. All
11695 of these are one-based: the beginning of the buffer is position or 11699 of these are one-based: the beginning of the buffer is position or
11696 index 1, and 0 is not a valid position. 11700 index 1, and 0 is not a valid position.
11697 11701
11698 As a "buffer position" (typedef Charbpos): 11702 As a ``buffer position'' (typedef Charbpos):
11699 11703
11700 This is an index specifying an offset in characters from the 11704 This is an index specifying an offset in characters from the
11701 beginning of the buffer. Note that buffer positions are 11705 beginning of the buffer. Note that buffer positions are
11702 logically @strong{between} characters, not on a character. The 11706 logically @strong{between} characters, not on a character. The
11703 difference between two buffer positions specifies the number of 11707 difference between two buffer positions specifies the number of
11704 characters between those positions. Buffer positions are the 11708 characters between those positions. Buffer positions are the
11705 only kind of position externally visible to the user. 11709 only kind of position externally visible to the user.
11706 11710
11707 As a "byte index" (typedef Bytebpos): 11711 As a ``byte index'' (typedef Bytebpos):
11708 11712
11709 This is an index over the bytes used to represent the characters 11713 This is an index over the bytes used to represent the characters
11710 in the buffer. If there is no Mule support, this is identical 11714 in the buffer. If there is no Mule support, this is identical
11711 to a buffer position, because each character is represented 11715 to a buffer position, because each character is represented
11712 using one byte. However, with Mule support, many characters 11716 using one byte. However, with Mule support, many characters
11713 require two or more bytes for their representation, and so a 11717 require two or more bytes for their representation, and so a
11714 byte index may be greater than the corresponding buffer 11718 byte index may be greater than the corresponding buffer
11715 position. 11719 position.
11716 11720
11717 As a "memory index" (typedef Membpos): 11721 As a ``memory index'' (typedef Membpos):
11718 11722
11719 This is the byte index adjusted for the gap. For positions 11723 This is the byte index adjusted for the gap. For positions
11720 before the gap, this is identical to the byte index. For 11724 before the gap, this is identical to the byte index. For
11721 positions after the gap, this is the byte index plus the gap 11725 positions after the gap, this is the byte index plus the gap
11722 size. There are two possible memory indices for the gap 11726 size. There are two possible memory indices for the gap
11723 position; the memory index at the beginning of the gap should 11727 position; the memory index at the beginning of the gap should
11724 always be used, except in code that deals with manipulating the 11728 always be used, except in code that deals with manipulating the
11725 gap, where both indices may be seen. The address of the 11729 gap, where both indices may be seen. The address of the
11726 character "at" (i.e. following) a particular position can be 11730 character ``at'' (i.e. following) a particular position can be
11727 obtained from the formula 11731 obtained from the formula
11728 11732
11729 buffer_start_address + memory_index(position) - 1 11733 buffer_start_address + memory_index(position) - 1
11730 11734
11731 except in the case of characters at the gap position. 11735 except in the case of characters at the gap position.
11830 use the buffer-level functions in buffer.h, which automatically know the 11834 use the buffer-level functions in buffer.h, which automatically know the
11831 correct format and handle the gap. 11835 correct format and handle the gap.
11832 11836
11833 Some terminology: 11837 Some terminology:
11834 11838
11835 "itext" appearing in the macros means "internal-format text" -- type 11839 "itext" appearing in the macros means "internal-format text" -- type
11836 @code{Ibyte *}. Operations on such pointers themselves, rather than on the 11840 @code{Ibyte *}. Operations on such pointers themselves, rather than on the
11837 text being pointed to, have "ibyteptr" instead of "itext" in the macro 11841 text being pointed to, have "ibyteptr" instead of "itext" in the macro
11838 name. "ichar" in the macro names means an Ichar -- the representation 11842 name. "ichar" in the macro names means an Ichar -- the representation
11839 of a character as a single integer rather than a series of bytes, as part 11843 of a character as a single integer rather than a series of bytes, as part
11840 of "itext". Many of the macros below are for converting between the 11844 of "itext". Many of the macros below are for converting between the
12039 @item 12043 @item
12040 (c) using the GCC extension (@{ ... @}). 12044 (c) using the GCC extension (@{ ... @}).
12041 @end itemize 12045 @end itemize
12042 12046
12043 Turned out that all of the above had bugs, all caused by GCC (hence the 12047 Turned out that all of the above had bugs, all caused by GCC (hence the
12044 comments about "those GCC wankers" and "ream gcc up the ass"). As for 12048 comments about ``those GCC wankers'' and ``ream gcc up the ass''). As for
12045 (a), some versions of GCC (especially on Intel platforms) had 12049 (a), some versions of GCC (especially on Intel platforms) had
12046 buggy implementations of @code{alloca()} that couldn't handle being called 12050 buggy implementations of @code{alloca()} that couldn't handle being called
12047 inside of a function call -- they just decremented the stack right in the 12051 inside of a function call -- they just decremented the stack right in the
12048 middle of pushing args. Oops, crash with stack trashing, very bad. (b) 12052 middle of pushing args. Oops, crash with stack trashing, very bad. (b)
12049 was an attempt to fix (a), and that led to further GCC crashes, esp. when 12053 was an attempt to fix (a), and that led to further GCC crashes, esp. when
13022 consistency. For example, the new Mule workspace contains Ibyte 13026 consistency. For example, the new Mule workspace contains Ibyte
13023 versions of the stdlib string functions. 13027 versions of the stdlib string functions.
13024 @item Extbyte, UExtbyte 13028 @item Extbyte, UExtbyte
13025 Pointer to text in some external format, which can be defined as all 13029 Pointer to text in some external format, which can be defined as all
13026 formats other than the internal one. The data representing a string 13030 formats other than the internal one. The data representing a string
13027 in "external" format (binary or any external encoding) is logically a 13031 in ``external'' format (binary or any external encoding) is logically a
13028 set of Extbytes. Extbyte is guaranteed to be just a char, so for 13032 set of Extbytes. Extbyte is guaranteed to be just a char, so for
13029 example strlen (Extbyte *) is OK. Extbyte is only a documentation 13033 example strlen (Extbyte *) is OK. Extbyte is only a documentation
13030 device for referring to external text. 13034 device for referring to external text.
13031 @item Ascbyte, UAscbyte 13035 @item Ascbyte, UAscbyte
13032 pure ASCII text, consisting of bytes in a string in entirely US-ASCII 13036 pure ASCII text, consisting of bytes in a string in entirely US-ASCII
13166 13170
13167 @node Mule-izing Code, , An Example of Mule-Aware Code, Coding for Mule 13171 @node Mule-izing Code, , An Example of Mule-Aware Code, Coding for Mule
13168 @subsection Mule-izing Code 13172 @subsection Mule-izing Code
13169 13173
13170 A lot of code is written without Mule in mind, and needs to be made 13174 A lot of code is written without Mule in mind, and needs to be made
13171 Mule-correct or "Mule-ized". There is really no substitute for 13175 Mule-correct or ``Mule-ized''. There is really no substitute for
13172 line-by-line analysis when doing this, but the following checklist can 13176 line-by-line analysis when doing this, but the following checklist can
13173 help: 13177 help:
13174 13178
13175 @itemize @bullet 13179 @itemize @bullet
13176 @item 13180 @item
13384 @item 13388 @item
13385 Look in the CRT sources! They come with VC++. See win32.c. 13389 Look in the CRT sources! They come with VC++. See win32.c.
13386 @end enumerate 13390 @end enumerate
13387 13391
13388 @node Locales, More about code pages, Microsoft Documentation, Microsoft Windows-Related Multilingual Issues 13392 @node Locales, More about code pages, Microsoft Documentation, Microsoft Windows-Related Multilingual Issues
13389 @subsection Locales, code pages, and other concepts of "language" 13393 @subsection Locales, code pages, and other concepts of ``language''
13390 @cindex locales, code pages, and other concepts of "language" 13394 @cindex locales, code pages, and other concepts of ``language''
13391 13395
13392 First, make sure you clearly understand the difference between the C 13396 First, make sure you clearly understand the difference between the C
13393 runtime library (CRT) and the Win32 API! See win32.c. 13397 runtime library (CRT) and the Win32 API! See win32.c.
13394 13398
13395 There are various different ways of representing the vague concept 13399 There are various different ways of representing the vague concept
13396 of "language", and it can be very confusing. So: 13400 of ``language'', and it can be very confusing. So:
13397 13401
13398 @itemize @bullet 13402 @itemize @bullet
13399 @item 13403 @item
13400 The CRT library has the concept of "locale", which is a 13404 The CRT library has the concept of ``locale'', which is a
13401 combination of language and country, and which controls the way 13405 combination of language and country, and which controls the way
13402 currency and dates are displayed, the encoding of data, etc. 13406 currency and dates are displayed, the encoding of data, etc.
13403 13407
13404 @item 13408 @item
13405 XEmacs has the concept of "language environment", more or less 13409 XEmacs has the concept of ``language environment'', more or less
13406 like a locale; although currently in most cases it just refers to 13410 like a locale; although currently in most cases it just refers to
13407 the language, and no sub-language distinctions are 13411 the language, and no sub-language distinctions are
13408 made. (Exceptions are with Chinese, which has different language 13412 made. (Exceptions are with Chinese, which has different language
13409 environments for Taiwan and mainland China, due to the different 13413 environments for Taiwan and mainland China, due to the different
13410 encodings and writing systems.) 13414 encodings and writing systems.)
13412 @item 13416 @item
13413 Windows has a number of different language concepts: 13417 Windows has a number of different language concepts:
13414 13418
13415 @enumerate 13419 @enumerate
13416 @item 13420 @item
13417 There are "languages" and "sublanguages", which correspond to 13421 There are ``languages'' and ``sublanguages'', which correspond to
13418 the languages and countries of the C library -- e.g. LANG_ENGLISH 13422 the languages and countries of the C library -- e.g. LANG_ENGLISH
13419 and SUBLANG_ENGLISH_US. These are identified by 8-bit integers, 13423 and SUBLANG_ENGLISH_US. These are identified by 8-bit integers,
13420 called the "primary language identifier" and "sublanguage 13424 called the ``primary language identifier'' and ``sublanguage
13421 identifier", respectively. These are combined into a 16-bit 13425 identifier'', respectively. These are combined into a 16-bit
13422 integer or "language identifier" by MAKELANGID(). 13426 integer or ``language identifier'' by @code{MAKELANGID()}.
13423 13427
13424 @item 13428 @item
13425 The language identifier in turn is combined with a "sort 13429 The language identifier in turn is combined with a ``sort
13426 identifier" (and optionally a "sort version") to yield a 32-bit 13430 identifier'' (and optionally a ``sort version'') to yield a 32-bit
13427 integer called a "locale identifier" (type LCID), which identifies 13431 integer called a ``locale identifier'' (type LCID), which identifies
13428 locales -- the primary means of distinguishing language/regional 13432 locales -- the primary means of distinguishing language/regional
13429 settings and similar to C library locales. 13433 settings and similar to C library locales.
13430 13434
13431 @item 13435 @item
13432 A "code page" combines the XEmacs concepts of "charset" and "coding 13436 A ``code page'' combines the XEmacs concepts of ``charset'' and ``coding
13433 system". It logically encompasses 13437 system''. It logically encompasses
13434 13438
13435 @itemize @minus 13439 @itemize @minus
13436 @item 13440 @item
13437 a set of supported characters 13441 a set of supported characters
13438 @item 13442 @item
13441 supported 13445 supported
13442 @item 13446 @item
13443 a way of encoding a series of characters into a string of bytes 13447 a way of encoding a series of characters into a string of bytes
13444 @end itemize 13448 @end itemize
13445 13449
13446 Note that the first two properties correspond to an XEmacs "charset" 13450 Note that the first two properties correspond to an XEmacs ``charset''
13447 and the latter an XEmacs "coding system". 13451 and the latter an XEmacs ``coding system''.
13448 13452
13449 Traditional encodings are either simple one-byte encodings, or 13453 Traditional encodings are either simple one-byte encodings, or
13450 combination one-byte/two-byte encodings (aka MBCS encodings, where MBCS 13454 combination one-byte/two-byte encodings (aka MBCS encodings, where MBCS
13451 stands for "Multibyte Character Set") with the following properties: 13455 stands for ``Multibyte Character Set'') with the following properties:
13452 13456
13453 @itemize @minus 13457 @itemize @minus
13454 @item 13458 @item
13455 all characters are encoded as a one-byte or two-byte sequence 13459 all characters are encoded as a one-byte or two-byte sequence
13456 @item 13460 @item
13457 the encoding is stateless (non-modal) 13461 the encoding is stateless (non-modal)
13458 @item 13462 @item
13459 the lower 128 bytes are compatible with ASCII 13463 the lower 128 bytes are compatible with ASCII
13460 @item 13464 @item
13461 in the higher bytes, the value of the first byte ("lead byte") 13465 in the higher bytes, the value of the first byte (``lead byte'')
13462 determines whether a second byte follows 13466 determines whether a second byte follows
13463 @item 13467 @item
13464 the values used for second bytes may overlap those used for first 13468 the values used for second bytes may overlap those used for first
13465 bytes, and (in some encodings) include values in the low half; thus, 13469 bytes, and (in some encodings) include values in the low half; thus,
13466 moving backwards is hard, and pure-ASCII algorithms (e.g. finding the 13470 moving backwards is hard, and pure-ASCII algorithms (e.g. finding the
13478 Every Windows locale has four associated code pages: ANSI (an 13482 Every Windows locale has four associated code pages: ANSI (an
13479 international standard or some Microsoft-created approximation; the 13483 international standard or some Microsoft-created approximation; the
13480 native code page under Windows), OEM (a DOS encoding, still used in the 13484 native code page under Windows), OEM (a DOS encoding, still used in the
13481 FAT file system), Mac (an encoding used on the Macintosh) and EBCDIC (a 13485 FAT file system), Mac (an encoding used on the Macintosh) and EBCDIC (a
13482 non-ASCII-compatible encoding used on IBM mainframes, originally based 13486 non-ASCII-compatible encoding used on IBM mainframes, originally based
13483 on the BCD or "binary-coded decimal" encoding of numbers). All code 13487 on the BCD or ``binary-coded decimal'' encoding of numbers). All code
13484 pages associated with a locale follow (as far as I know) the properties 13488 pages associated with a locale follow (as far as I know) the properties
13485 listed above for traditional code pages. More than one locale can share 13489 listed above for traditional code pages. More than one locale can share
13486 a code page -- e.g. all the Western European languages, including 13490 a code page -- e.g. all the Western European languages, including
13487 English, do. 13491 English, do.
13488 13492
13489 @item 13493 @item
13490 Windows also has an "input locale identifier" (aka "keyboard 13494 Windows also has an ``input locale identifier'' (aka ``keyboard
13491 layout id") or HKL, which is a 32-bit integer composed of the 13495 layout id'') or HKL, which is a 32-bit integer composed of the
13492 16-bit language identifier and a 16-bit "device identifier", which 13496 16-bit language identifier and a 16-bit ``device identifier'', which
13493 originally specified a particular keyboard layout (e.g. the locale 13497 originally specified a particular keyboard layout (e.g. the locale
13494 "US English" can have the QWERTY layout, the Dvorak layout, etc.), 13498 ``US English'' can have the QWERTY layout, the Dvorak layout, etc.),
13495 but has been expanded to include speech-to-text converters and 13499 but has been expanded to include speech-to-text converters and
13496 other non-keyboard ways of inputting text. Note that both the HKL 13500 other non-keyboard ways of inputting text. Note that both the HKL
13497 and LCID share the language identifier in the lower 16 bits, and in 13501 and LCID share the language identifier in the lower 16 bits, and in
13498 both cases a 0 in the upper 16 bits means "default" (sort order or 13502 both cases a 0 in the upper 16 bits means ``default'' (sort order or
13499 device), providing a way to convert between HKL's, LCID's, and 13503 device), providing a way to convert between HKL's, LCID's, and
13500 language identifiers (i.e. language/sublanguage pairs). The 13504 language identifiers (i.e. language/sublanguage pairs). The
13501 default keyboard layout for a language is (as far as I can 13505 default keyboard layout for a language is (as far as I can
13502 determine) established using the Regional Settings control panel 13506 determine) established using the Regional Settings control panel
13503 applet, where you can add input locales as combinations of language 13507 applet, where you can add input locales as combinations of language
13511 13515
13512 @node More about code pages, More about locales, Locales, Microsoft Windows-Related Multilingual Issues 13516 @node More about code pages, More about locales, Locales, Microsoft Windows-Related Multilingual Issues
13513 @subsection More about code pages 13517 @subsection More about code pages
13514 @cindex more about code pages 13518 @cindex more about code pages
13515 13519
13516 Here is what MSDN says about code pages (article "Code Pages"): 13520 Here is what MSDN says about code pages (article ``Code Pages''):
13517 13521
13518 @quotation 13522 @quotation
13519 A code page is a character set, which can include numbers, 13523 A code page is a character set, which can include numbers,
13520 punctuation marks, and other glyphs. Different languages and locales 13524 punctuation marks, and other glyphs. Different languages and locales
13521 may use different code pages. For example, ANSI code page 1252 is 13525 may use different code pages. For example, ANSI code page 1252 is
13553 13557
13554 -- The "C" locale is defined by ANSI to correspond to the locale in 13558 -- The "C" locale is defined by ANSI to correspond to the locale in
13555 which C programs have traditionally executed. The code page for the 13559 which C programs have traditionally executed. The code page for the
13556 "C" locale (code page) corresponds to the ASCII character 13560 "C" locale (code page) corresponds to the ASCII character
13557 set. For example, in the "C" locale, islower returns true for the 13561 set. For example, in the "C" locale, islower returns true for the
13558 values 0x61 ?0x7A only. In another locale, islower may return true 13562 values 0x61 to 0x7A only. In another locale, islower may return true
13559 for these as well as other values, as defined by that locale. 13563 for these as well as other values, as defined by that locale.
13560 13564
13561 Under "Locale-Dependent Routines" we notice the following setlocale 13565 Under ``Locale-Dependent Routines'' we notice the following setlocale
13562 dependencies: 13566 dependencies:
13563 13567
13564 atof, atoi, atol (LC_NUMERIC) 13568 atof, atoi, atol (LC_NUMERIC)
13565 is Routines (LC_CTYPE) 13569 is Routines (LC_CTYPE)
13566 isleadbyte (LC_CTYPE) 13570 isleadbyte (LC_CTYPE)
13589 wcstombs (LC_CTYPE) 13593 wcstombs (LC_CTYPE)
13590 wctomb (LC_CTYPE) 13594 wctomb (LC_CTYPE)
13591 _wtoi/_wtol (LC_NUMERIC) 13595 _wtoi/_wtol (LC_NUMERIC)
13592 @end quotation 13596 @end quotation
13593 13597
13594 NOTE: The above documentation doesn't clearly explain the "locale code 13598 NOTE: The above documentation doesn't clearly explain the ``locale code
13595 page" and "multibyte code page". These are two different values, 13599 page'' and ``multibyte code page''. These are two different values,
13596 maintained respectively in the CRT global variables __lc_codepage and 13600 maintained respectively in the CRT global variables __lc_codepage and
13597 __mbcodepage. Calling e.g. setlocale (LC_ALL, "JAPANESE") sets @strong{ONLY} 13601 __mbcodepage. Calling e.g. setlocale (LC_ALL, "JAPANESE") sets @strong{ONLY}
13598 __lc_codepage to 932 (the code page for Japanese), and leaves 13602 __lc_codepage to 932 (the code page for Japanese), and leaves
13599 __mbcodepage unchanged (usually 1252, i.e. Windows-ANSI). You'd have to 13603 __mbcodepage unchanged (usually 1252, i.e. Windows-ANSI). You'd have to
13600 call _setmbcp() to change __mbcodepage. Figuring out from the 13604 call _setmbcp() to change __mbcodepage. Figuring out from the
13601 documentation which routines use which code page is not so obvious. But: 13605 documentation which routines use which code page is not so obvious. But:
13602 13606
13603 @itemize @bullet 13607 @itemize @bullet
13604 @item 13608 @item
13605 from "Interpretation of Multibyte-Character Sequences" it appears that 13609 from ``Interpretation of Multibyte-Character Sequences'' it appears that
13606 all "multibyte-character routines" use the multibyte code page except for 13610 all ``multibyte-character routines'' use the multibyte code page except for
13607 mblen(), _mbstrlen(), mbstowcs(), mbtowc(), wcstombs(), and wctomb(). 13611 @code{mblen()}, @code{_mbstrlen()}, @code{mbstowcs()}, @code{mbtowc()}, @code{wcstombs()}, and @code{wctomb()}.
13608 13612
13609 @item 13613 @item
13610 from "_setmbcp": "The multibyte code page also affects 13614 from ``_setmbcp'': ``The multibyte code page also affects
13611 multibyte-character processing by the following run-time library 13615 multibyte-character processing by the following run-time library
13612 routines: _exec functions _mktemp _stat _fullpath _spawn functions 13616 routines: _exec functions _mktemp _stat _fullpath _spawn functions
13613 _tempnam _makepath _splitpath tmpnam. In addition, all run-time library 13617 _tempnam _makepath _splitpath tmpnam. In addition, all run-time library
13614 routines that receive multibyte-character argv or envp program arguments 13618 routines that receive multibyte-character argv or envp program arguments
13615 as parameters (such as the _exec and _spawn families) process these 13619 as parameters (such as the _exec and _spawn families) process these
13616 strings according to the multibyte code page. Hence these routines are 13620 strings according to the multibyte code page. Hence these routines are
13617 also affected by a call to _setmbcp that changes the multibyte code 13621 also affected by a call to _setmbcp that changes the multibyte code
13618 page." 13622 page.''
13619 @end itemize 13623 @end itemize
13620 13624
13621 Summary: from looking at the CRT source (which comes with VC++) and 13625 Summary: from looking at the CRT source (which comes with VC++) and
13622 carefully looking through the docs, it appears that: 13626 carefully looking through the docs, it appears that:
13623 13627
13624 @itemize @bullet 13628 @itemize @bullet
13625 @item 13629 @item
13626 the "locale code page" is used by all of the routines listed above 13630 the ``locale code page'' is used by all of the routines listed above
13627 under "Locale-Dependent Routines" (EXCEPT _mbccpy() and _mbclen()), 13631 under ``Locale-Dependent Routines'' (EXCEPT @code{_mbccpy()} and @code{_mbclen()}),
13628 as well as any other place that converts between multibyte and Unicode 13632 as well as any other place that converts between multibyte and Unicode
13629 strings, e.g. the startup code. 13633 strings, e.g. the startup code.
13630 @item 13634 @item
13631 the "multibyte code page" is used in all of the *mb*() routines 13635 the ``multibyte code page'' is used in all of the @code{*mb*()} routines
13632 except mblen(), _mbstrlen(), mbstowcs(), mbtowc(), wcstombs(), 13636 except @code{mblen()}, @code{_mbstrlen()}, @code{mbstowcs()}, @code{mbtowc()}, @code{wcstombs()},
13633 and wctomb(); also _exec*(), _spawn*(), _mktemp(), _stat(), _fullpath(), 13637 and @code{wctomb()}; also @code{_exec*()}, @code{_spawn*()}, @code{_mktemp()}, @code{_stat()}, @code{_fullpath()},
13634 _tempnam(), _makepath(), _splitpath(), tmpnam(), and similar functions 13638 @code{_tempnam()}, @code{_makepath()}, @code{_splitpath()}, @code{tmpnam()}, and similar functions
13635 without the leading underscore. 13639 without the leading underscore.
13636 @end itemize 13640 @end itemize
13637 13641
13638 @node More about locales, Unicode support under Windows, More about code pages, Microsoft Windows-Related Multilingual Issues 13642 @node More about locales, Unicode support under Windows, More about code pages, Microsoft Windows-Related Multilingual Issues
13639 @subsection More about locales 13643 @subsection More about locales
13642 In addition to the locale defined by the CRT, Windows (i.e. the Win32 API) 13646 In addition to the locale defined by the CRT, Windows (i.e. the Win32 API)
13643 defines various locales: 13647 defines various locales:
13644 13648
13645 @itemize @bullet 13649 @itemize @bullet
13646 @item 13650 @item
13647 The system-default locale is the locale defined under "Language 13651 The system-default locale is the locale defined under ``Language
13648 settings for the system" in the "Regional Options" control panel. This 13652 settings for the system'' in the ``Regional Options'' control panel. This
13649 is NOT user-specific, and changing it requires a reboot (at least under 13653 is NOT user-specific, and changing it requires a reboot (at least under
13650 Windows 2000). The ANSI code page of the system-default locale is 13654 Windows 2000). The ANSI code page of the system-default locale is
13651 returned by GetACP(), and you can specify this code page in calls 13655 returned by @code{GetACP()}, and you can specify this code page in calls
13652 e.g. to MultiByteToWideChar with the constant CP_ACP. 13656 e.g. to MultiByteToWideChar with the constant CP_ACP.
13653 13657
13654 @item 13658 @item
13655 The user-default locale is the locale defined under "Settings for the 13659 The user-default locale is the locale defined under ``Settings for the
13656 current user" in the "Regional Options" control panel. 13660 current user'' in the ``Regional Options'' control panel.
13657 13661
13658 @item 13662 @item
13659 There is a thread-local locale set by SetThreadLocale. #### What is this 13663 There is a thread-local locale set by SetThreadLocale. #### What is this
13660 used for? 13664 used for?
13661 @end itemize 13665 @end itemize
13662 13666
13663 The Win32 API has a bunch of multibyte functions -- all of those that 13667 The Win32 API has a bunch of multibyte functions -- all of those that
13664 end with ...A(), and on which we spend so much effort in 13668 end with ...@code{A()}, and on which we spend so much effort in
13665 intl-encap-win32.c. These appear to ALWAYS use the ANSI code page of 13669 intl-encap-win32.c. These appear to ALWAYS use the ANSI code page of
13666 the system-default locale (GetACP(), CP_ACP). Note that this applies 13670 the system-default locale (@code{GetACP()}, CP_ACP). Note that this applies
13667 also, for example, to the encoding of filenames in all file-handling 13671 also, for example, to the encoding of filenames in all file-handling
13668 routines, including the CRT ones such as open(), because they pass their 13672 routines, including the CRT ones such as @code{open()}, because they pass their
13669 args unchanged to the Win32 API. 13673 args unchanged to the Win32 API.
13670 13674
13671 @node Unicode support under Windows, The golden rules of writing Unicode-safe code, More about locales, Microsoft Windows-Related Multilingual Issues 13675 @node Unicode support under Windows, The golden rules of writing Unicode-safe code, More about locales, Microsoft Windows-Related Multilingual Issues
13672 @subsection Unicode support under Windows 13676 @subsection Unicode support under Windows
13673 @cindex unicode support under windows 13677 @cindex unicode support under windows
13681 table to convert the characters of that code page to and from Unicode, and 13685 table to convert the characters of that code page to and from Unicode, and
13682 the Win32 API itself probably (perhaps always) uses Unicode internally. 13686 the Win32 API itself probably (perhaps always) uses Unicode internally.
13683 13687
13684 Under Windows there are two different versions of all library routines that 13688 Under Windows there are two different versions of all library routines that
13685 accept or return text, those that handle Unicode text and those handling 13689 accept or return text, those that handle Unicode text and those handling
13686 "multibyte" text, i.e. variable-width ASCII-compatible text in some 13690 ``multibyte'' text, i.e. variable-width ASCII-compatible text in some
13687 national format such as EUC or Shift-JIS. Because Windows 95 basically 13691 national format such as EUC or Shift-JIS. Because Windows 95 basically
13688 doesn't support Unicode but Windows NT does, and Microsoft doesn't provide 13692 doesn't support Unicode but Windows NT does, and Microsoft doesn't provide
13689 any way of writing a single binary that will work on both systems and still 13693 any way of writing a single binary that will work on both systems and still
13690 use Unicode when it's available (although see below, Microsoft Layer for 13694 use Unicode when it's available (although see below, Microsoft Layer for
13691 Unicode), we need to provide a way of run-time conditionalizing so you 13695 Unicode), we need to provide a way of run-time conditionalizing so you
13692 could have one binary for both systems. "Unicode-splitting" refers to 13696 could have one binary for both systems. ``Unicode-splitting'' refers to
13693 writing code that will handle this properly. This means using 13697 writing code that will handle this properly. This means using
13694 Qmswindows_tstr as the external conversion format, calling the appropriate 13698 Qmswindows_tstr as the external conversion format, calling the appropriate
13695 qxe...() Unicode-split version of library functions, and doing other things 13699 qxe...() Unicode-split version of library functions, and doing other things
13696 in certain cases, e.g. when a qxe() function is not present. 13700 in certain cases, e.g. when a @code{qxe()} function is not present.
13697 13701
13698 Unicode support also requires that the various Windows APIs be 13702 Unicode support also requires that the various Windows APIs be
13699 "Unicode-encapsulated", so that they automatically call the ANSI or 13703 ``Unicode-encapsulated'', so that they automatically call the ANSI or
13700 Unicode version of the API call appropriately and handle the size 13704 Unicode version of the API call appropriately and handle the size
13701 differences in structures. What this means is: 13705 differences in structures. What this means is:
13702 13706
13703 @itemize @bullet 13707 @itemize @bullet
13704 @item 13708 @item
13705 first, note that Windows already provides a sort of encapsulation 13709 first, note that Windows already provides a sort of encapsulation
13706 of all APIs that deal with text. All such APIs are underlyingly 13710 of all APIs that deal with text. All such APIs are underlyingly
13707 provided in two versions, with an A or W suffix (ANSI or "wide" 13711 provided in two versions, with an A or W suffix (ANSI or ``wide''
13708 i.e. Unicode), and the compile-time constant UNICODE controls which is 13712 i.e. Unicode), and the compile-time constant UNICODE controls which is
13709 selected by the unsuffixed API. Same thing happens with structures, and 13713 selected by the unsuffixed API. Same thing happens with structures, and
13710 also with types, where the generic types have names beginning with T -- 13714 also with types, where the generic types have names beginning with T --
13711 TCHAR, LPTSTR, etc. Unfortunately, this is compile-time only, not 13715 TCHAR, LPTSTR, etc. Unfortunately, this is compile-time only, not
13712 run-time, so not sufficient. (Creating the necessary run-time encoding 13716 run-time, so not sufficient. (Creating the necessary run-time encoding
13721 such an API available internally.) 13725 such an API available internally.)
13722 13726
13723 @item 13727 @item
13724 what we do is provide an encapsulation of each standard Windows API call 13728 what we do is provide an encapsulation of each standard Windows API call
13725 that is split into A and W versions. current theory is to avoid all 13729 that is split into A and W versions. current theory is to avoid all
13726 preprocessor games; so we name the function with a prefix -- "qxe" 13730 preprocessor games; so we name the function with a prefix -- ``qxe''
13727 currently -- and require callers to use the prefixed name. Callers need 13731 currently -- and require callers to use the prefixed name. Callers need
13728 to explicitly use the W version of all structures, and convert text 13732 to explicitly use the W version of all structures, and convert text
13729 themselves using Qmswindows_tstr. the qxe encapsulated version will 13733 themselves using Qmswindows_tstr. the qxe encapsulated version will
13730 automatically call the appropriate A or W version depending on whether 13734 automatically call the appropriate A or W version depending on whether
13731 we're running on 9x or NT (you can force use of the A calls on NT, 13735 we're running on 9x or NT (you can force use of the A calls on NT,
13781 purpose, to make the code easier to follow for someone who's not familiar 13785 purpose, to make the code easier to follow for someone who's not familiar
13782 with it. until our library is really complete and bug-free, we should 13786 with it. until our library is really complete and bug-free, we should
13783 think twice before doing this. 13787 think twice before doing this.
13784 13788
13785 According to Microsoft documentation, only the following functions are 13789 According to Microsoft documentation, only the following functions are
13786 provided under Windows 9x to support Unicode (see MSDN page "Windows 13790 provided under Windows 9x to support Unicode (see MSDN page ``Windows
13787 95/98/Me General Limitations"): 13791 95/98/Me General Limitations''):
13788 13792
13789 EnumResourceLanguagesW 13793 EnumResourceLanguagesW
13790 EnumResourceNamesW 13794 EnumResourceNamesW
13791 EnumResourceTypesW 13795 EnumResourceTypesW
13792 ExtTextOutW 13796 ExtTextOutW
13803 MessageBoxExW 13807 MessageBoxExW
13804 MultiByteToWideChar 13808 MultiByteToWideChar
13805 TextOutW 13809 TextOutW
13806 WideCharToMultiByte 13810 WideCharToMultiByte
13807 13811
13808 also maybe GetTextExtentExPoint? (KB Q125671 "Unicode Functions Supported 13812 also maybe GetTextExtentExPoint? (KB Q125671 ``Unicode Functions Supported
13809 by Windows 95") 13813 by Windows 95'')
13810 13814
13811 Q210341 says this in addition: 13815 Q210341 says this in addition:
13812 13816
13813 @quotation 13817 @quotation
13814 SUMMARY: 13818 SUMMARY:
13829 range beyond the 256 limitation of a one-byte representation. 13833 range beyond the 256 limitation of a one-byte representation.
13830 13834
13831 The Unicode standard offers application developers an opportunity to 13835 The Unicode standard offers application developers an opportunity to
13832 work with text without the limitations of character set based 13836 work with text without the limitations of character set based
13833 systems. For more information on the Unicode standard see the 13837 systems. For more information on the Unicode standard see the
13834 "References" section of this article. Windows NT is a fully Unicode 13838 ``References'' section of this article. Windows NT is a fully Unicode
13835 capable operating system so it may be desirable to write software that 13839 capable operating system so it may be desirable to write software that
13836 supports Unicode on Windows 95. 13840 supports Unicode on Windows 95.
13837 13841
13838 Even though Windows 95 and Windows 98 are not Unicode based, they do 13842 Even though Windows 95 and Windows 98 are not Unicode based, they do
13839 provide some limited Unicode functionality. Drawing of Unicode text is 13843 provide some limited Unicode functionality. Drawing of Unicode text is
13912 @itemize @bullet 13916 @itemize @bullet
13913 @item 13917 @item
13914 wmain() is completely supported, and appropriate Unicode-formatted argv 13918 wmain() is completely supported, and appropriate Unicode-formatted argv
13915 and envp will always be passed. 13919 and envp will always be passed.
13916 @item 13920 @item
13917 Likewise, wWinMain() is completely supported. (NOTE: The docs are not at 13921 Likewise, @code{wWinMain()} is completely supported. (NOTE: The docs are not at
13918 all clear on how these various entry points interact, and imply that 13922 all clear on how these various entry points interact, and imply that
13919 a windows-subsystem program "must" use WinMain(), while a console- 13923 a windows-subsystem program ``must'' use @code{WinMain()}, while a console-
13920 subsystem program "must" use main(), and a program compiled with UNICODE 13924 subsystem program ``must'' use @code{main()}, and a program compiled with UNICODE
13921 (which we don't, see above) "must" use the w*() versions, while a program 13925 (which we don't, see above) ``must'' use the @code{w*()} versions, while a program
13922 not compiled this way "must" use the plain versions. In fact it appears 13926 not compiled this way ``must'' use the plain versions. In fact it appears
13923 that the CRT provides four different compiler entry points, namely 13927 that the CRT provides four different compiler entry points, namely
13924 w?(main|WinMain)CRTStartup, and we simply choose the one we like using 13928 w?(main|WinMain)CRTStartup, and we simply choose the one we like using
13925 the appropriate link flag.) 13929 the appropriate link flag.)
13926 @item 13930 @item
13927 _wenviron, _wputenv 13931 _wenviron, _wputenv
17939 | +--------------------------------------------------------------------+ | 17943 | +--------------------------------------------------------------------+ |
17940 | | menubar | | 17944 | | menubar | |
17941 | ###################################################################### | 17945 | ###################################################################### |
17942 | # toolbar # | 17946 | # toolbar # |
17943 | #--------------------------------------------------------------------# | 17947 | #--------------------------------------------------------------------# |
17944 | # | gutter | # | 17948 | # | internal border | # |
17945 | # |--------------------------------------------------------------| # | 17949 | # | +----------------------------------------------------------+ | # |
17946 | # | | internal border width | | # | 17950 | # | | gutter | | # |
17947 | # | | ******************************************************** | | # | 17951 | # | |-********************************************************-| | # |
17948 |w# | | * |s|v* |s* | | #w| 17952 |w# | | *@| scrollbar |v* |s* | | #w|
17949 |i# | | * |c|e* |c* | | #i| 17953 |i# | | *-+-------------------------|e* |c* | | #i|
17950 |n# | | * |r|r* |r* | | #n| 17954 |n# | | *s| |r* |r* | | #n|
17951 |d# | | * |o|t* |o* | | #d| 17955 |d# | | *c| |t* |o* | | #d|
17952 |o# | | * text area |l|.* text area |l* | | #o| 17956 |o# | | *r| |.* text area |l* | | #o|
17953 |w# | |i* |l| * |l*i| | #w| 17957 |w# |i| *o| | * |l* |i| #w|
17954 |-# | |n* |b|d* |b*n| | #-| 17958 |-# |n| *l| text area |d* |b* |n| #-|
17955 |m# | |t* |a|i* |a*t| | #m| 17959 |m# |t| *l| |i* |a* |t| #m|
17956 |a# | |.* |r|v* |r*.| | #a| 17960 |a# |e| *b| |v* |r* |e| #a|
17957 |n# t| | *-------------------------+-|i*----------------------+-* | |t #n| 17961 |n# t|r| *a| |i*----------------------+-* |r|t #n|
17958 |a# o|g|b* scrollbar | |d* scrollbar | *b|g|o #a| 17962 |a# o|n|g*r| |d* scrollbar |@*g|n|o #a|
17959 |g# o|u|o*-------------------------+-|e*----------------------+-*o|u|o #g| 17963 |g# o|a|u*-+-------------------------|e*----------------------+-*u|a|o #g|
17960 |e# l|t|r* modeline |r* modeline *r|t|l #e| 17964 |e# l|l|t* modeline |r* modeline *t|l|l #e|
17961 |r# b|t|d********************************************************d|t|b #r| 17965 |r# b| |t********************************************************t| |b #r|
17962 | # a|e|e* =..texttexttex....= |s|v* |s*e|e|a # | 17966 | # a|b|e* =..texttexttex....= |s|v* |s*e|b|a # |
17963 |d# r|r|r*o m=..texttexttextt..=o m|c|e* |c*r|r|r #d| 17967 |d# r|o|r*o m=..texttexttextt..=o m|c|e* |c*r|o|r #d|
17964 |e# | | *u a=.exttexttextte...=u a|r|r* |r* | | #e| 17968 |e# |r| *u a=.exttexttextte...=u a|r|r* |r* |r| #e|
17965 |c# | |w*t r=....texttexttex..=t r|o|t* |o*w| | #c| 17969 |c# |d| *t r=....texttexttex..=t r|o|t* |o* |d| #c|
17966 |o# | |i*s g= etc. =s g|l|.* text area |l*i| | #o| 17970 |o# |e| *s g= etc. =s g|l|.* text area |l* |e| #o|
17967 |r# | |d*i i= =i i|l| * |l*d| | #r| 17971 |r# |r| *i i= =i i|l| * |l* |r| #r|
17968 |a# | |t*d n= =d n|b|d* |b*t| | #a| 17972 |a# | | *d n= =d n|b|d* |b* | | #a|
17969 |t# | |h*e = inner text area =e |a|i* |a*h| | #t| 17973 |t# | | *e = inner text area =e |a|i* |a* | | #t|
17970 |i# | | * = = |r|v* |r* | | #i| 17974 |i# | | * = = |r|v* |r* | | #i|
17971 |o# | | *---===================---+-|i*----------------------+-* | | #o| 17975 |o# | | *---===================---+-|i*----------------------+-* | | #o|
17972 |n# | | * scrollbar | |d* scrollbar | * | | #n| 17976 |n# | | * scrollbar |@|d* scrollbar |@* | | #n|
17973 | # | | *-------------------------+-|e*----------------------+-* | | # | 17977 | # | | *-------------------------+-|e*----------------------+-* | | # |
17974 | # | | * modeline |r* modeline * | | # | 17978 | # | | * modeline |r* modeline * | | # |
17975 | # | | ******************************************************** | | # | 17979 | # | |-********************************************************-| | # |
17976 | # | | * minibuffer * | | # | 17980 | # | | gutter | | # |
17977 | # | | ******************************************************** | | # | 17981 | # | |-********************************************************-| | # |
17978 | # | | internal border width | | # | 17982 | # | |@* minibuffer *@| | # |
17979 | # |--------------------------------------------------------------| # | 17983 | # | +-********************************************************-+ | # |
17980 | # | gutter | # | 17984 | # | internal border | # |
17981 | #--------------------------------------------------------------------# | 17985 | #--------------------------------------------------------------------# |
17982 | # toolbar # | 17986 | # toolbar # |
17983 | ###################################################################### | 17987 | ###################################################################### |
17984 | window manager decoration | 17988 | window manager decoration |
17985 +------------------------------------------------------------------------+ 17989 +------------------------------------------------------------------------+
17986 17990
17987 # = boundary of client area; * = window boundaries, boundary of paned area 17991 # = boundary of client area; * = window boundaries, boundary of paned area
17988 = = boundary of inner text area; . = inside margin area 17992 = = boundary of inner text area; . = inside margin area; @ = dead boxes
17989 @end example 17993 @end example
17990 17994
17991 Note in particular what happens at the corners, where a "corner box" 17995 Note in particular what happens at the corners, where a ``corner box''
17992 occurs. Top and bottom toolbars take precedence over left and right 17996 occurs. Top and bottom toolbars take precedence over left and right
17993 toolbars, extending out horizontally into the corner boxes. Gutters 17997 toolbars, extending out horizontally into the corner boxes. Gutters
17994 work the same way. The corner box where the scrollbars meet, however, 17998 work the same way. The corner box where the scrollbars meet, however,
17995 is assigned to neither scrollbar, and is known as the "dead box"; it is 17999 is assigned to neither scrollbar, and is known as the ``dead box''; it is
17996 an area that must be cleared specially. 18000 an area that must be cleared specially. There are similar dead boxes at
18001 the bottom-right and bottom-left corners where the minibuffer and
18002 left/right gutters meet, but there is currently a bug in that these dead
18003 boxes are not explicitly cleared and may contain junk.
17997 18004
17998 @node The Frame, The Non-Client Area, Intro to Window and Frame Geometry, Window and Frame Geometry 18005 @node The Frame, The Non-Client Area, Intro to Window and Frame Geometry, Window and Frame Geometry
17999 @section The Frame 18006 @section The Frame
18000 18007
18001 The "top-level window area" is the entire area of a top-level window (or 18008 The ``top-level window area'' is the entire area of a top-level window (or
18002 "frame"). The "client area" (a term from MS Windows) is the area of a 18009 ``frame''). The ``client area'' (a term from MS Windows) is the area of a
18003 top-level window that XEmacs draws into and manages with redisplay. 18010 top-level window that XEmacs draws into and manages with redisplay.
18004 This includes the toolbar, scrollbars, gutters, dividers, text area, 18011 This includes the toolbar, scrollbars, gutters, dividers, text area,
18005 modeline and minibuffer. It does not include the menubar, title or 18012 modeline and minibuffer. It does not include the menubar, title or
18006 outer borders. The "non-client area" is the area of a top-level window 18013 outer borders. The ``non-client area'' is the area of a top-level window
18007 outside of the client area and includes the menubar, title and outer 18014 outside of the client area and includes the menubar, title and outer
18008 borders. Internally, all frame coordinates are relative to the client 18015 borders. Internally, all frame coordinates are relative to the client
18009 area. 18016 area.
18010 18017
18011 18018
18018 @item 18025 @item
18019 The outer layer is the window-manager decorations: The title and 18026 The outer layer is the window-manager decorations: The title and
18020 borders. These are controlled by the window manager, a separate process 18027 borders. These are controlled by the window manager, a separate process
18021 that controls the desktop, the location of icons, etc. When a process 18028 that controls the desktop, the location of icons, etc. When a process
18022 tries to create a window, the window manager intercepts this action and 18029 tries to create a window, the window manager intercepts this action and
18023 "reparents" the window, placing another window around it which contains 18030 ``reparents'' the window, placing another window around it which contains
18024 the window decorations, including the title bar, outer borders used for 18031 the window decorations, including the title bar, outer borders used for
18025 resizing, etc. The window manager also implements any actions involving 18032 resizing, etc. The window manager also implements any actions involving
18026 the decorations, such as the ability to resize a window by dragging its 18033 the decorations, such as the ability to resize a window by dragging its
18027 borders, move a window by dragging its title bar, etc. If there is no 18034 borders, move a window by dragging its title bar, etc. If there is no
18028 window manager or you kill it, windows will have no decorations (and 18035 window manager or you kill it, windows will have no decorations (and
18029 will lose them if they previously had any) and you will not be able to 18036 will lose them if they previously had any) and you will not be able to
18030 move or resize them. 18037 move or resize them.
18031 18038
18032 @item 18039 @item
18033 Inside of the window-manager decorations is the "shell", which is 18040 Inside of the window-manager decorations is the ``shell'', which is
18034 managed by the toolkit and widget libraries your program is linked with. 18041 managed by the toolkit and widget libraries your program is linked with.
18035 The code in @file{*-x.c} uses the Xt toolkit and various possible widget 18042 The code in @file{*-x.c} uses the Xt toolkit and various possible widget
18036 libraries built on top of Xt, such as Motif, Athena, the "Lucid" 18043 libraries built on top of Xt, such as Motif, Athena, the ``Lucid''
18037 widgets, etc. Another possibility is GTK (@file{*-gtk.c}), which implements 18044 widgets, etc. Another possibility is GTK (@file{*-gtk.c}), which implements
18038 both the toolkit and widgets. Under Xt, the "shell" window is an 18045 both the toolkit and widgets. Under Xt, the ``shell'' window is an
18039 EmacsShell widget, containing an EmacsManager widget of the same size, 18046 EmacsShell widget, containing an EmacsManager widget of the same size,
18040 which in turn contains a menubar widget and an EmacsFrame widget, inside 18047 which in turn contains a menubar widget and an EmacsFrame widget, inside
18041 of which is the client area. (The division into EmacsShell and 18048 of which is the client area. (The division into EmacsShell and
18042 EmacsManager is due to the complex and screwy geometry-management system 18049 EmacsManager is due to the complex and screwy geometry-management system
18043 in Xt [and X more generally]. The EmacsShell handles negotiation with 18050 in Xt [and X more generally]. The EmacsShell handles negotiation with
18049 18056
18050 Under Windows, the non-client area is managed by the window system. 18057 Under Windows, the non-client area is managed by the window system.
18051 There is no division such as under X. Part of the window-system API 18058 There is no division such as under X. Part of the window-system API
18052 (@file{USER.DLL}) of Win32 includes functions to control the menubars, title, 18059 (@file{USER.DLL}) of Win32 includes functions to control the menubars, title,
18053 etc. and implements the move and resize behavior. There @strong{is} an 18060 etc. and implements the move and resize behavior. There @strong{is} an
18054 equivalent of the window manager, called the "shell", but it manages 18061 equivalent of the window manager, called the ``shell'', but it manages
18055 only the desktop, not the windows themselves. The normal shell under 18062 only the desktop, not the windows themselves. The normal shell under
18056 Windows is @file{EXPLORER.EXE}; if you kill this, you will lose the bar 18063 Windows is @file{EXPLORER.EXE}; if you kill this, you will lose the bar
18057 containing the "Start" menu and tray and such, but the windows 18064 containing the ``Start'' menu and tray and such, but the windows
18058 themselves will not be affected or lose their decorations. 18065 themselves will not be affected or lose their decorations.
18059 18066
18060 18067
18061 @node The Client Area, The Paned Area, The Non-Client Area, Window and Frame Geometry 18068 @node The Client Area, The Paned Area, The Non-Client Area, Window and Frame Geometry
18062 @section The Client Area 18069 @section The Client Area
18063 18070
18064 Inside of the client area are the toolbars, the gutters (where the buffer 18071 Inside of the client area are the toolbars, the gutters (where the buffer
18065 tabs are displayed), the minibuffer, the internal border width, and one 18072 tabs are displayed), the minibuffer, the internal border width, and one
18066 or more non-overlapping "windows" (this is old Emacs terminology, from 18073 or more non-overlapping ``windows'' (this is old Emacs terminology, from
18067 before the time when frames existed at all; the standard terminology for 18074 before the time when frames existed at all; the standard terminology for
18068 this would be "pane"). Each window can contain a modeline, horizontal 18075 this would be ``pane''). Each window can contain a modeline, horizontal
18069 and/or vertical scrollbars, and (for non-rightmost windows) a vertical 18076 and/or vertical scrollbars, and (for non-rightmost windows) a vertical
18070 divider, surrounding a text area. 18077 divider, surrounding a text area.
18071 18078
18072 The dimensions of the toolbars and gutters are determined by the formula 18079 The dimensions of the toolbars and gutters are determined by the formula
18073 (THICKNESS + 2 * BORDER-THICKNESS), where "thickness" is a cover term 18080 (THICKNESS + 2 * BORDER-THICKNESS), where ``thickness'' is a cover term
18074 for height or width, as appropriate. The height and width come from 18081 for height or width, as appropriate. The height and width come from
18075 @code{default-toolbar-height} and @code{default-toolbar-width} and the specific 18082 @code{default-toolbar-height} and @code{default-toolbar-width} and the specific
18076 versions of these (@code{top-toolbar-height}, @code{left-toolbar-width}, etc.). 18083 versions of these (@code{top-toolbar-height}, @code{left-toolbar-width}, etc.).
18077 The border thickness comes from @code{default-toolbar-border-height} and 18084 The border thickness comes from @code{default-toolbar-border-height} and
18078 @code{default-toolbar-border-width}, and the specific versions of these. The 18085 @code{default-toolbar-border-width}, and the specific versions of these. The
18093 18100
18094 18101
18095 @node The Paned Area, Text Areas, The Client Area, Window and Frame Geometry 18102 @node The Paned Area, Text Areas, The Client Area, Window and Frame Geometry
18096 @section The Paned Area 18103 @section The Paned Area
18097 18104
18098 The area occupied by the "windows" is called the paned area. Note that 18105 The area occupied by the ``windows'' is called the paned area.
18099 this includes the minibuffer, which is just another window but is 18106 Unfortunately, because of the presence of the gutter @strong{between} the
18100 special-cased in XEmacs. Each window can include a horizontal and/or 18107 minibuffer and other windows, the bottom of the paned area is not
18101 vertical scrollbar, a modeline and a vertical divider to its right, as 18108 well-defined -- does it include the minibuffer (in which case it also
18102 well as the text area. Only non-rightmost windows can include a 18109 includes the bottom gutter, but none others) or does it not include
18103 vertical divider. (The minibuffer normally does not include either 18110 the minibuffer? (In which case not all windows are included.) It would
18104 modeline or scrollbars.) 18111 be cleaner to put the bottom gutter @strong{below} the minibuffer instead of
18112 above it.
18113
18114 Each window can include a horizontal and/or vertical scrollbar, a
18115 modeline and a vertical divider to its right, as well as the text area.
18116 Only non-rightmost windows can include a vertical divider. (The
18117 minibuffer normally does not include either modeline or scrollbars.)
18105 18118
18106 Note that, because the toolbars and gutters are controlled by 18119 Note that, because the toolbars and gutters are controlled by
18107 specifiers, and specifiers can have window-specific and buffer-specific 18120 specifiers, and specifiers can have window-specific and buffer-specific
18108 values, the size of the paned area can change depending on which window 18121 values, the size of the paned area can change depending on which window
18109 is selected: In other words, if the selected window or buffer changes, 18122 is selected: In other words, if the selected window or buffer changes,
18122 @code{horizontal-scrollbar-visible-p}, @code{vertical-scrollbar-visible-p}, 18135 @code{horizontal-scrollbar-visible-p}, @code{vertical-scrollbar-visible-p},
18123 @code{vertical-divider-always-visible-p}, etc. 18136 @code{vertical-divider-always-visible-p}, etc.
18124 18137
18125 In addition, it is possible to set margins in the text area using the 18138 In addition, it is possible to set margins in the text area using the
18126 specifiers @code{left-margin-width} and @code{right-margin-width}. When this is 18139 specifiers @code{left-margin-width} and @code{right-margin-width}. When this is
18127 done, only the "inner text area" (the area inside of the margins) will 18140 done, only the ``inner text area'' (the area inside of the margins) will
18128 be used for normal display of text; the margins will be used for glyphs 18141 be used for normal display of text; the margins will be used for glyphs
18129 with a layout policy of @code{outside-margin} (as set on an extent containing 18142 with a layout policy of @code{outside-margin} (as set on an extent containing
18130 the glyph by @code{set-extent-begin-glyph-layout} or 18143 the glyph by @code{set-extent-begin-glyph-layout} or
18131 @code{set-extent-end-glyph-layout}). However, the calculation of the text 18144 @code{set-extent-end-glyph-layout}). However, the calculation of the text
18132 area size (e.g. in the function @code{window-text-area-width}) includes the 18145 area size (e.g. in the function @code{window-text-area-width}) includes the
18133 margins. Which margin is used depends on whether a glyph has been set 18146 margins. Which margin is used depends on whether a glyph has been set
18134 as the begin-glyph or end-glyph of an extent (@code{set-extent-begin-glyph} 18147 as the begin-glyph or end-glyph of an extent (@code{set-extent-begin-glyph}
18135 etc.), using the left and right margins, respectively. 18148 etc.), using the left and right margins, respectively.
18136 18149
18137 Technically, the margins outside of the inner text area are known as the 18150 Technically, the margins outside of the inner text area are known as the
18138 "outside margins". The "inside margins" are in the inner text area and 18151 ``outside margins''. The ``inside margins'' are in the inner text area and
18139 constitute the whitespace between the outside margins and the first or 18152 constitute the whitespace between the outside margins and the first or
18140 last non-whitespace character in a line; their width can vary from line 18153 last non-whitespace character in a line; their width can vary from line
18141 to line. Glyphs will be placed in the inside margin if their layout 18154 to line. Glyphs will be placed in the inside margin if their layout
18142 policy is @code{inside-margin} or @code{whitespace}, with @code{whitespace} glyphs on 18155 policy is @code{inside-margin} or @code{whitespace}, with @code{whitespace} glyphs on
18143 the inside and @code{inside-margin} glyphs on the outside. Inside-margin 18156 the inside and @code{inside-margin} glyphs on the outside. Inside-margin
18148 18161
18149 18162
18150 @node The Displayable Area, Which Functions Use Which?, Text Areas, Window and Frame Geometry 18163 @node The Displayable Area, Which Functions Use Which?, Text Areas, Window and Frame Geometry
18151 @section The Displayable Area 18164 @section The Displayable Area
18152 18165
18153 The "displayable area" is not so much an actual area as a convenient 18166 The ``displayable area'' is not so much an actual area as a convenient
18154 fiction. It is the area used to convert between pixel and character 18167 fiction. It is the area used to convert between pixel and character
18155 dimensions for frames. The character dimensions for a frame (e.g. as 18168 dimensions for frames. The character dimensions for a frame (e.g. as
18156 returned by @code{frame-width} and @code{frame-height} and set by 18169 returned by @code{frame-width} and @code{frame-height} and set by
18157 @code{set-frame-width} and @code{set-frame-height}) are determined from the 18170 @code{set-frame-width} and @code{set-frame-height}) are determined from the
18158 displayable area by dividing by the pixel size of the default font as 18171 displayable area by dividing by the pixel size of the default font as
18159 instantiated in the frame. (For proportional fonts, the "average" width 18172 instantiated in the frame. (For proportional fonts, the ``average'' width
18160 is used. Under Windows, this is a built-in property of the fonts. 18173 is used. Under Windows, this is a built-in property of the fonts.
18161 Under X, this is based on the width of the lowercase 'n', or if this is 18174 Under X, this is based on the width of the lowercase 'n', or if this is
18162 zero then the width of the default character. [We prefer 'n' to the 18175 zero then the width of the default character. [We prefer 'n' to the
18163 specified default character because many X fonts have a default 18176 specified default character because many X fonts have a default
18164 character with a zero or otherwise non-representative width.]) 18177 character with a zero or otherwise non-representative width.])
18165 18178
18166 The displayable area is essentially the "theoretical" paned area of the 18179 The displayable area is essentially the ``theoretical'' gutter area of the
18167 frame excluding the rightmost and bottom-most scrollbars. In this 18180 frame, excluding the rightmost and bottom-most scrollbars. That is, it
18168 context, "theoretical" means that all calculations are based on 18181 starts from the client (or ``total'') area and then excludes the
18169 frame-level values for toolbar, gutter and scrollbar thicknesses. 18182 ``theoretical'' toolbars and bottom-most/rightmost scrollbars, and the
18170 Because these thicknesses are controlled by specifiers, and specifiers 18183 internal border width. In this context, ``theoretical'' means that all
18171 can have window-specific and buffer-specific values, these calculations 18184 calculations are based on frame-level values for toolbar and scrollbar
18172 may or may not reflect the actual size of the paned area or of the 18185 thicknesses. Because these thicknesses are controlled by specifiers,
18173 scrollbars when any particular window is selected. Note also that the 18186 and specifiers can have window-specific and buffer-specific values,
18174 "displayable area" may not even be contiguous! In particular, if the 18187 these calculations may or may not reflect the actual size of the paned
18175 frame-level value of the horizontal scrollbar height is non-zero, then 18188 area or of the scrollbars when any particular window is selected. Note
18176 the displayable area includes the paned area above and below the bottom 18189 also that the ``displayable area'' may not even be contiguous! In
18177 horizontal scrollbar but not the scrollbar itself. 18190 particular, the gutters are included, but the bottom-most and rightmost
18191 scrollbars are excluded even though they are inside of the gutters.
18192 Furthermore, if the frame-level value of the horizontal scrollbar height
18193 is non-zero, then the displayable area includes the paned area above and
18194 below the bottom horizontal scrollbar (i.e. the modeline and minibuffer)
18195 but not the scrollbar itself.
18178 18196
18179 As a further twist, the character-dimension calculations are adjusted so 18197 As a further twist, the character-dimension calculations are adjusted so
18180 that the truncation and continuation glyphs (see @code{truncation-glyph} and 18198 that the truncation and continuation glyphs (see @code{truncation-glyph} and
18181 @code{continuation-glyph}) count as a single character even if they are wider 18199 @code{continuation-glyph}) count as a single character even if they are wider
18182 than the default font width. (Technically, the character width is 18200 than the default font width. (Technically, the character width is
18185 width before dividing by the default-font width, and then adding 1 to 18203 width before dividing by the default-font width, and then adding 1 to
18186 the result.) (The ultimate motivation for this kludge as well as the 18204 the result.) (The ultimate motivation for this kludge as well as the
18187 subtraction of the scrollbars, but not the minibuffer or bottom-most 18205 subtraction of the scrollbars, but not the minibuffer or bottom-most
18188 modeline, is to maintain compatibility with TTY's.) 18206 modeline, is to maintain compatibility with TTY's.)
18189 18207
18190 Despite all these concerns and kludges, however, the "displayable area" 18208 Despite all these concerns and kludges, however, the ``displayable area''
18191 concept works well in practice and mostly ensures that by default the 18209 concept works well in practice and mostly ensures that by default the
18192 frame will actually fit 79 characters + continuation/truncation glyph. 18210 frame will actually fit 79 characters + continuation/truncation glyph.
18193 18211
18194 18212
18195 @node Which Functions Use Which?, , The Displayable Area, Window and Frame Geometry 18213 @node Which Functions Use Which?, , The Displayable Area, Window and Frame Geometry
19834 @section Event Queues 19852 @section Event Queues
19835 @cindex event queues 19853 @cindex event queues
19836 @cindex queues, event 19854 @cindex queues, event
19837 19855
19838 There are two event queues here -- the command event queue (#### which 19856 There are two event queues here -- the command event queue (#### which
19839 should be called "deferred event queue" and is in my glyph ws) and the 19857 should be called ``deferred event queue'' and is in my glyph ws) and the
19840 dispatch event queue. (MS Windows actually has an extra dispatch queue 19858 dispatch event queue. (MS Windows actually has an extra dispatch queue
19841 for non-user events and uses the generic one only for user events. This 19859 for non-user events and uses the generic one only for user events. This
19842 is because user and non-user events in Windows come through the same 19860 is because user and non-user events in Windows come through the same
19843 place -- the window procedure -- but under X, it's possible to 19861 place -- the window procedure -- but under X, it's possible to
19844 selectively process events such that we take all the user events before 19862 selectively process events such that we take all the user events before
19939 19957
19940 @item handle_magic_event_cb 19958 @item handle_magic_event_cb
19941 XEmacs calls this with an event structure which contains window-system 19959 XEmacs calls this with an event structure which contains window-system
19942 dependent information that XEmacs doesn't need to know about, but which 19960 dependent information that XEmacs doesn't need to know about, but which
19943 must happen in order. If the @code{next_event_cb} never returns an 19961 must happen in order. If the @code{next_event_cb} never returns an
19944 event of type "magic", this will never be used. 19962 event of type ``magic'', this will never be used.
19945 19963
19946 @item format_magic_event_cb 19964 @item format_magic_event_cb
19947 Called with a magic event; print a representation of the innards of the 19965 Called with a magic event; print a representation of the innards of the
19948 event to @var{PSTREAM}. 19966 event to @var{PSTREAM}.
19949 19967
19971 @item select_process_cb 19989 @item select_process_cb
19972 @item unselect_process_cb 19990 @item unselect_process_cb
19973 These callbacks tell the underlying implementation to add or remove a 19991 These callbacks tell the underlying implementation to add or remove a
19974 file descriptor from the list of fds which are polled for 19992 file descriptor from the list of fds which are polled for
19975 inferior-process input. When input becomes available on the given 19993 inferior-process input. When input becomes available on the given
19976 process connection, an event of type "process" should be generated. 19994 process connection, an event of type ``process'' should be generated.
19977 19995
19978 @item select_console_cb 19996 @item select_console_cb
19979 @item unselect_console_cb 19997 @item unselect_console_cb
19980 These callbacks tell the underlying implementation to add or remove a 19998 These callbacks tell the underlying implementation to add or remove a
19981 console from the list of consoles which are polled for user-input. 19999 console from the list of consoles which are polled for user-input.
20099 @cindex focus handling 20117 @cindex focus handling
20100 20118
20101 Ben's capsule lecture on focus: 20119 Ben's capsule lecture on focus:
20102 20120
20103 In GNU Emacs @code{select-frame} never changes the window-manager frame 20121 In GNU Emacs @code{select-frame} never changes the window-manager frame
20104 focus. All it does is change the "selected frame". This is similar to 20122 focus. All it does is change the ``selected frame''. This is similar to
20105 what happens when we call @code{select-device} or @code{select-console}. 20123 what happens when we call @code{select-device} or @code{select-console}.
20106 Whenever an event comes in (including a keyboard event), its frame is 20124 Whenever an event comes in (including a keyboard event), its frame is
20107 selected; therefore, evaluating @code{select-frame} in @samp{*scratch*} 20125 selected; therefore, evaluating @code{select-frame} in @samp{*scratch*}
20108 won't cause any effects because the next received event (in the same 20126 won't cause any effects because the next received event (in the same
20109 frame) will cause a switch back to the frame displaying 20127 frame) will cause a switch back to the frame displaying
20134 minibuffer, you essentially want to temporarily switch the WM focus to 20152 minibuffer, you essentially want to temporarily switch the WM focus to
20135 the frame with the minibuffer, and switch it back when you exit the 20153 the frame with the minibuffer, and switch it back when you exit the
20136 minibuffer. 20154 minibuffer.
20137 20155
20138 GNU Emacs solves this with the crockish @code{redirect-frame-focus}, 20156 GNU Emacs solves this with the crockish @code{redirect-frame-focus},
20139 which says "for keyboard events received from FRAME, act like they're 20157 which says ``for keyboard events received from FRAME, act like they're
20140 coming from FOCUS-FRAME". I think what this means is that, when a 20158 coming from FOCUS-FRAME''. I think what this means is that, when a
20141 keyboard event comes in and the event manager is about to select the 20159 keyboard event comes in and the event manager is about to select the
20142 event's frame, if that frame has its focus redirected, the redirected-to 20160 event's frame, if that frame has its focus redirected, the redirected-to
20143 frame is selected instead. That way, if you're in a minibufferless 20161 frame is selected instead. That way, if you're in a minibufferless
20144 frame and enter the minibuffer, then all Lisp functions that run see the 20162 frame and enter the minibuffer, then all Lisp functions that run see the
20145 selected frame as the minibuffer's frame rather than the minibufferless 20163 selected frame as the minibuffer's frame rather than the minibufferless
20149 There's also some weird logic that switches the redirected frame focus 20167 There's also some weird logic that switches the redirected frame focus
20150 from one frame to another if Lisp code explicitly calls 20168 from one frame to another if Lisp code explicitly calls
20151 @code{select-frame} (but not if @code{handle-switch-frame} is called), 20169 @code{select-frame} (but not if @code{handle-switch-frame} is called),
20152 and saves and restores the frame focus in window configurations, 20170 and saves and restores the frame focus in window configurations,
20153 etc. etc. All of this logic is heavily @code{#if 0}'d, with lots of 20171 etc. etc. All of this logic is heavily @code{#if 0}'d, with lots of
20154 comments saying "No, this approach doesn't seem to work, so I'm trying 20172 comments saying ``No, this approach doesn't seem to work, so I'm trying
20155 this ... is it reasonable? Well, I'm not sure ..." that are a red flag 20173 this ... is it reasonable? Well, I'm not sure ...'' that are a red flag
20156 indicating crockishness. 20174 indicating crockishness.
20157 20175
20158 Because of our way of doing things, we can avoid all this crock. 20176 Because of our way of doing things, we can avoid all this crock.
20159 Keyboard events never cause a select-frame (who cares what frame they're 20177 Keyboard events never cause a select-frame (who cares what frame they're
20160 associated with? They come from a console, only). We change the actual 20178 associated with? They come from a console, only). We change the actual
24933 return value should be an alist consisting of a list of all of the 24951 return value should be an alist consisting of a list of all of the
24934 defined subtypes for that coding system type along with a level of 24952 defined subtypes for that coding system type along with a level of
24935 likelihood and a list of additional properties indicating certain 24953 likelihood and a list of additional properties indicating certain
24936 features detected in the data. The extra properties returned are 24954 features detected in the data. The extra properties returned are
24937 defined entirely by the particular coding system type and are used 24955 defined entirely by the particular coding system type and are used
24938 only in the algorithm described below under "user control." However, 24956 only in the algorithm described below under ``user control.'' However,
24939 the levels of likelihood have a standard meaning as follows: 24957 the levels of likelihood have a standard meaning as follows:
24940 24958
24941 Level 4 means "near certainty" and typically indicates that a 24959 Level 4 means ``near certainty'' and typically indicates that a
24942 signature has been detected, usually at the beginning of the data, 24960 signature has been detected, usually at the beginning of the data,
24943 indicating that the data is encoded in this particular coding system 24961 indicating that the data is encoded in this particular coding system
24944 type. An example of this would be the byte order mark at the beginning 24962 type. An example of this would be the byte order mark at the beginning
24945 of UCS2 encoded data or the GZIP mark at the beginning of GZIP data. 24963 of UCS2 encoded data or the GZIP mark at the beginning of GZIP data.
24946 24964
24947 Level 3 means "highly likely" and indicates that tell-tale signs have 24965 Level 3 means ``highly likely'' and indicates that tell-tale signs have
24948 been discovered in the data that are characteristic of this particular 24966 been discovered in the data that are characteristic of this particular
24949 coding system type. Examples of this might be ISO 2022 escape 24967 coding system type. Examples of this might be ISO 2022 escape
24950 sequences or the current Unicode end of line markers at regular 24968 sequences or the current Unicode end of line markers at regular
24951 intervals. 24969 intervals.
24952 24970
24953 Level 2 means "strongly statistically likely" indicating that 24971 Level 2 means ``strongly statistically likely'' indicating that
24954 statistical analysis concludes that there's a high chance that this 24972 statistical analysis concludes that there's a high chance that this
24955 data is encoded according to this particular type. For example, this 24973 data is encoded according to this particular type. For example, this
24956 might mean that for UCS2 data, there is a high proportion of null bytes 24974 might mean that for UCS2 data, there is a high proportion of null bytes
24957 or other repeated bytes in the odd-numbered bytes of the data and a 24975 or other repeated bytes in the odd-numbered bytes of the data and a
24958 high variance in the even-numbered bytes of the data. For Shift-JIS, 24976 high variance in the even-numbered bytes of the data. For Shift-JIS,
24959 this might indicate that there were no illegal Shift-JIS sequences 24977 this might indicate that there were no illegal Shift-JIS sequences
24960 and a fairly high occurrence of common Shift-JIS characters. 24978 and a fairly high occurrence of common Shift-JIS characters.
24961 24979
24962 Level 1 means "weak statistical likelihood" meaning that there is some 24980 Level 1 means ``weak statistical likelihood'' meaning that there is some
24963 indication that the data is encoded in this coding system type. In 24981 indication that the data is encoded in this coding system type. In
24964 fact, there is a reasonable chance that it may be some other type as 24982 fact, there is a reasonable chance that it may be some other type as
24965 well. This means, for example, that no illegal sequences were 24983 well. This means, for example, that no illegal sequences were
24966 encountered and at least some data was encountered that is purposely 24984 encountered and at least some data was encountered that is purposely
24967 not in other coding system types. For Shift-JIS data, this might mean 24985 not in other coding system types. For Shift-JIS data, this might mean
24968 that some bytes in the range 128 to 159 were encountered in the data. 24986 that some bytes in the range 128 to 159 were encountered in the data.
24969 24987
24970 Level 0 means "neutral" which is to say that there's either not enough 24988 Level 0 means ``neutral'' which is to say that there's either not enough
24971 data to make any decision or that the data could well be interpreted 24989 data to make any decision or that the data could well be interpreted
24972 as this type (meaning no illegal sequences), but there is little or no 24990 as this type (meaning no illegal sequences), but there is little or no
24973 indication of anything particular to this particular type. 24991 indication of anything particular to this particular type.
24974 24992
24975 Level -1 means "weakly unlikely" meaning that some data was 24993 Level -1 means ``weakly unlikely'' meaning that some data was
24976 encountered that could conceivably be part of the coding system type 24994 encountered that could conceivably be part of the coding system type
24977 but is probably not. For example, excessively long line-lengths or 24995 but is probably not. For example, excessively long line-lengths or
24978 very rarely-encountered sequences. 24996 very rarely-encountered sequences.
24979 24997
24980 Level -2 means "strongly unlikely" meaning that typically a number 24998 Level -2 means ``strongly unlikely'' meaning that typically a number
24981 of illegal sequences were encountered. 24999 of illegal sequences were encountered.
24982 25000
24983 The algorithm to determine when to stop and indicate that the data has 25001 The algorithm to determine when to stop and indicate that the data has
24984 been detected as a particular coding system uses a priority list, 25002 been detected as a particular coding system uses a priority list,
24985 which is typically specified as part of the language environment 25003 which is typically specified as part of the language environment
24994 Japanese-language environment particular subtypes of ISO 2022 will be 25012 Japanese-language environment particular subtypes of ISO 2022 will be
24995 associated with the Japanese coding system version of those 25013 associated with the Japanese coding system version of those
24996 subtypes). It is perfectly legal and quite common in fact, to list the 25014 subtypes). It is perfectly legal and quite common in fact, to list the
24997 same subtype more than once in the priority list with successively 25015 same subtype more than once in the priority list with successively
24998 lower requirements. Other facts that can be listed in the priority 25016 lower requirements. Other facts that can be listed in the priority
24999 list for a subtype are "reject", meaning that the data should never be 25017 list for a subtype are ``reject'', meaning that the data should never be
25000 detected as this subtype, or "ask", meaning that if the data is 25018 detected as this subtype, or ``ask'', meaning that if the data is
25001 detected to be this subtype, the user will be asked whether they 25019 detected to be this subtype, the user will be asked whether they
25002 actually mean this. This latter property could be used, for example, 25020 actually mean this. This latter property could be used, for example,
25003 towards the bottom of the priority list. 25021 towards the bottom of the priority list.
25004 25022
25005 In addition there is a global variable which specifies the minimum 25023 In addition there is a global variable which specifies the minimum
25012 system, the subtype, the coding system and the associated level of 25030 system, the subtype, the coding system and the associated level of
25013 likelihood will be prominently displayed either in the echo area or in 25031 likelihood will be prominently displayed either in the echo area or in
25014 a status box somewhere. 25032 a status box somewhere.
25015 25033
25016 If no positive match is found according to the priority list, or if 25034 If no positive match is found according to the priority list, or if
25017 the matches that are found have the "ask" property on them, then the 25035 the matches that are found have the ``ask'' property on them, then the
25018 user will be presented with a list of choices of possible encodings 25036 user will be presented with a list of choices of possible encodings
25019 and asked to choose one. This list is typically sorted first by level 25037 and asked to choose one. This list is typically sorted first by level
25020 of likelihood, and then within this, by the order in which the 25038 of likelihood, and then within this, by the order in which the
25021 subtypes appear in the priority list. This list is displayed in a 25039 subtypes appear in the priority list. This list is displayed in a
25022 special kind of dialog box or other buffer allowing the user, in 25040 special kind of dialog box or other buffer allowing the user, in
25029 will be in the form of errors or warnings of various levels, some of 25047 will be in the form of errors or warnings of various levels, some of
25030 which may be severe enough to stop the decoding entirely, and some of 25048 which may be severe enough to stop the decoding entirely, and some of
25031 which may either indicate definitely malformed data but from which 25049 which may either indicate definitely malformed data but from which
25032 it's possible to recover, or simply data that appears rather 25050 it's possible to recover, or simply data that appears rather
25033 questionable. If any of these status values are reported during 25051 questionable. If any of these status values are reported during
25034 decoding, the user will be informed of this and asked "are you sure?" 25052 decoding, the user will be informed of this and asked ``are you sure?''
25035 As part of the "are you sure" dialog box or question, the user can 25053 As part of the ``are you sure'' dialog box or question, the user can
25036 display the results of the decoding to make sure it's correct. If the 25054 display the results of the decoding to make sure it's correct. If the
25037 user says "no, they're not sure," then the same list of choices as 25055 user says ``no, they're not sure,'' then the same list of choices as
25038 previously mentioned will be presented. 25056 previously mentioned will be presented.
25039 25057
25040 @subheading RFC: Autodetection 25058 @subheading RFC: Autodetection
25041 25059
25042 Also appeared under heading "Implementation of Coding System Priority 25060 Also appeared under heading "Implementation of Coding System Priority
25252 25270
25253 @enumerate 25271 @enumerate
25254 @item 25272 @item
25255 Hopefully a system general enough to handle (2)--(4) will 25273 Hopefully a system general enough to handle (2)--(4) will
25256 handle these, too, but we should watch out for gotchas like 25274 handle these, too, but we should watch out for gotchas like
25257 Unicode "plane 14" tags which (I think _both_ Ben and Olivier 25275 Unicode ``plane 14'' tags which (I think _both_ Ben and Olivier
25258 will agree) have no place in the internal representation, and 25276 will agree) have no place in the internal representation, and
25259 thus must be treated as out-of-band control sequences. I 25277 thus must be treated as out-of-band control sequences. I
25260 don't know if all such gotchas will be as easy to dispose of. 25278 don't know if all such gotchas will be as easy to dispose of.
25261 25279
25262 @item 25280 @item
25293 25311
25294 sly, it can't be perfect if any autodecoding is done; 25312 sly, it can't be perfect if any autodecoding is done;
25295 like Hrvoje should have an easily available option to 25313 like Hrvoje should have an easily available option to
25296 to this default (or an optimized approximation which 25314 to this default (or an optimized approximation which
25297 t actually read the whole file into a buffer) or simply 25315 t actually read the whole file into a buffer) or simply
25298 y everything as binary (with the "font" for binary files 25316 y everything as binary (with the ``font'' for binary files
25299 a user option). 25317 a user option).
25300 25318
25301 @item 25319 @item
25302 This implies that we should be detecting conditions in the 25320 This implies that we should be detecting conditions in the
25303 tail of the file which violate the implicit assumptions of the 25321 tail of the file which violate the implicit assumptions of the
25402 25420
25403 Date: 11/1/1999 7:24 AM 25421 Date: 11/1/1999 7:24 AM
25404 25422
25405 Stephen, thank you very much for writing this up. I think it is a good start, 25423 Stephen, thank you very much for writing this up. I think it is a good start,
25406 and definitely moving in the direction I would like to see things going: more 25424 and definitely moving in the direction I would like to see things going: more
25407 proposals, less arguing. (aka "more light, less heat") However, I have some 25425 proposals, less arguing. (aka ``more light, less heat'') However, I have some
25408 suggestions for cleaning this up: 25426 suggestions for cleaning this up:
25409 25427
25410 You should try to make it more layered. For example, you might have one 25428 You should try to make it more layered. For example, you might have one
25411 section devoted to the workings of autodetection, which starts out like this 25429 section devoted to the workings of autodetection, which starts out like this
25412 (the section numbers below are totally arbitrary): 25430 (the section numbers below are totally arbitrary):