comparison man/internals/internals.texi @ 5178:97eb4942aec8

merge
author Ben Wing <ben@xemacs.org>
date Mon, 29 Mar 2010 21:28:13 -0500
parents 8b2f75cecb89 f965e31a35f0
children 3889ef128488
comparison
equal deleted inserted replaced
5177:b785049378e3 5178:97eb4942aec8
159 that has been formatted into ASCII lists and tables. 159 that has been formatted into ASCII lists and tables.
160 160
161 Note: to define these routines, put point after the end of the definition 161 Note: to define these routines, put point after the end of the definition
162 and type C-x C-e. 162 and type C-x C-e.
163 163
164 (defun list-to-texinfo (b e) 164 (defun convert-list-to-texinfo (b e)
165 "Convert the selected region from an ASCII list to a Texinfo list." 165 "Convert the selected region from an ASCII list to a Texinfo list."
166 (interactive "r") 166 (interactive "r")
167 (save-restriction 167 (save-restriction
168 (narrow-to-region b e) 168 (narrow-to-region b e)
169 (goto-char (point-min)) 169 (goto-char (point-min))
170 (let ((dash-type "^ *-+ +") 170 (let ((dash-type "^ *\\(-+\\|o\\) +")
171 ;; allow single-letter numbering or roman numerals 171 ;; allow single-letter numbering or roman numerals
172 (letter-type "^ *[[(]?\\([a-zA-Z]\\|[IVXivx]+\\)[]).] +") 172 (letter-type "^ *[[(]?\\([a-zA-Z]\\|[IVXivx]+\\)[]).] +")
173 (num-type "^ *[[(]?[0-9]+[]).] +") 173 (num-type "^ *[[(]?[0-9]+[]).] +")
174 dash regexp) 174 dash regexp)
175 (save-excursion 175 (save-excursion
237 (insert-char ?\ (- min (current-column))) 237 (insert-char ?\ (- min (current-column)))
238 (beginning-of-line) 238 (beginning-of-line)
239 (forward-char min)) 239 (forward-char min))
240 (kill-rectangle b (point)))))) 240 (kill-rectangle b (point))))))
241 241
242 (defun table-to-texinfo (b e) 242 (defun convert-table-to-texinfo (b e)
243 "Convert the selected region from an ASCII table to a Texinfo table. 243 "Convert the selected region from an ASCII table to a Texinfo table.
244 Assumes entries are separated by a blank line, and the first sexp in 244 Assumes entries are separated by a blank line, and the first sexp in
245 each entry is the table heading." 245 each entry is the table heading."
246 (interactive "r") 246 (interactive "r")
247 (save-restriction 247 (save-restriction
281 If the region is active, do the region; otherwise, go from point to the end 281 If the region is active, do the region; otherwise, go from point to the end
282 of the buffer. This query-replaces for various kinds of conventions used 282 of the buffer. This query-replaces for various kinds of conventions used
283 in text: @code{} surrounded by ` and ' or followed by a (); @strong{} 283 in text: @code{} surrounded by ` and ' or followed by a (); @strong{}
284 surrounded by *'s; @file{} something that looks like a file name." 284 surrounded by *'s; @file{} something that looks like a file name."
285 (interactive) 285 (interactive)
286 (if (and (not no-narrow) (region-active-p)) 286 (save-excursion
287 (save-restriction 287 (if (and (not no-narrow) (region-active-p))
288 (narrow-to-region (region-beginning) (region-end)) 288 (save-restriction
289 (convert-text-to-texinfo t)) 289 (narrow-to-region (region-beginning) (region-end))
290 (let ((p (point)) 290 (goto-char (region-beginning))
291 (case-replace nil)) 291 (zmacs-deactivate-region)
292 (query-replace-regexp "`\\([^']+\\)'\\([^']\\)" "@code{\\1}\\2" nil) 292 (convert-text-to-texinfo t))
293 (goto-char p) 293 (let ((p (point))
294 (query-replace-regexp "\\(\\Sw\\)\\*\\(\\(?:\\s_\\|\\sw\\)+\\)\\*\\([^A-Za-z.}]\\)" "\\1@strong{\\2}\\3" nil) 294 (case-replace nil))
295 (goto-char p) 295 (message "Point is %d" (point))
296 (query-replace-regexp "\\(\\(\\s_\\|\\sw\\)+()\\)\\([^}]\\)" "@code{\\1}\\3" nil) 296 (query-replace-regexp "`\\([^']+\\)'\\([^']\\)" "@code{\\1}\\2" nil)
297 (goto-char p) 297 (goto-char p)
298 (query-replace-regexp "\\(\\(\\s_\\|\\sw\\)+\\.[A-Za-z]+\\)\\([^A-Za-z.}]\\)" "@file{\\1}\\3" nil) 298 (query-replace-regexp "\\(\\Sw\\)\\*\\(\\(?:\\s_\\|\\sw\\)+\\)\\*\\([^A-Za-z.}]\\)" "\\1@strong{\\2}\\3" nil)
299 ))) 299 (goto-char p)
300 (query-replace-regexp "\\(\\(\\s_\\|\\sw\\)+()\\)\\([^}]\\)" "@code{\\1}\\3" nil)
301 (goto-char p)
302 (query-replace-regexp "\\(\\(\\s_\\|\\sw\\)+\\.[A-Za-z]+\\)\\([^A-Za-z.}]\\)" "@file{\\1}\\3" nil)
303 ))))
300 304
301 4. Adding new sections: 305 4. Adding new sections:
302 ----------------------- 306 -----------------------
303 307
304 NOTE: These are in the form of macros. #### FIXME Convert them to 308 NOTE: These are in the form of macros. #### FIXME Convert them to
1236 XEmacs is a powerful, customizable text editor and development 1240 XEmacs is a powerful, customizable text editor and development
1237 environment. It began in 1991 as Lucid Emacs, which was in turn 1241 environment. It began in 1991 as Lucid Emacs, which was in turn
1238 derived from GNU Emacs, a program written by Richard Stallman of the 1242 derived from GNU Emacs, a program written by Richard Stallman of the
1239 Free Software Foundation. GNU Emacs dates back to 1985 and was 1243 Free Software Foundation. GNU Emacs dates back to 1985 and was
1240 modelled after Unipress Emacs, an editor written by James Gosling in 1244 modelled after Unipress Emacs, an editor written by James Gosling in
1241 1981 and based on a series of other "Emacs"-like editors, including 1245 1981 and based on a series of other ``Emacs''-like editors, including
1242 EINE (EINE Is Not EMACS), c. 1976, by Dan Weinreb, which ran on the 1246 EINE (EINE Is Not EMACS), c. 1976, by Dan Weinreb, which ran on the
1243 MIT Lisp Machine and was the first Emacs written in Lisp; ZWEI (ZWEI 1247 MIT Lisp Machine and was the first Emacs written in Lisp; ZWEI (ZWEI
1244 Was EINE Initially), c. 1978, by Dan Weinreb and Mike McMahon; Multics 1248 Was EINE Initially), c. 1978, by Dan Weinreb and Mike McMahon; Multics
1245 Emacs, c. 1978, by Bernie Greenberg, which was written in MacLisp and 1249 Emacs, c. 1978, by Bernie Greenberg, which was written in MacLisp and
1246 also used Lisp as its extension language; and ZMACS, c. 1980, a direct 1250 also used Lisp as its extension language; and ZMACS, c. 1980, a direct
1247 descendant of ZWEI that ran on the Symbolics LM-2, LMI LispM, and 1251 descendant of ZWEI that ran on the Symbolics LM-2, LMI LispM, and
1248 later, TI Explorer (1983-1989). These in turn were inspired by the 1252 later, TI Explorer (1983-1989). These in turn were inspired by the
1249 first Emacs, a package called EMACS, written in 1976 by Richard 1253 first Emacs, a package called EMACS, written in 1976 by Richard
1250 Stallman, Guy Steele, and Dave Moon. This was a merger of TECMAC and 1254 Stallman, Guy Steele, and Dave Moon. This was a merger of TECMAC and
1251 TMACS, a pair of "TECO-macro realtime editors" written by Guy Steele, 1255 TMACS, a pair of ``TECO-macro realtime editors'' written by Guy Steele,
1252 Dave Moon, Richard Greenblatt, Charles Frankston, et al., and added a 1256 Dave Moon, Richard Greenblatt, Charles Frankston, et al., and added a
1253 dynamic loader and Meta-key cmds. It ran under ITS (the Incompatible 1257 dynamic loader and Meta-key cmds. It ran under ITS (the Incompatible
1254 Timesharing System) on a DEC PDP 10 and under TWENEX on a Tops-20 and 1258 Timesharing System) on a DEC PDP 10 and under TWENEX on a Tops-20 and
1255 was written in TECO and PDP 10 assembly. ITS was one of the first 1259 was written in TECO and PDP 10 assembly. ITS was one of the first
1256 time-sharing operating systems and dates back well before Unix. ITS, 1260 time-sharing operating systems and dates back well before Unix. ITS,
1284 M. Stallman (RMS) and James Gosling (the creator of Java); its extension 1288 M. Stallman (RMS) and James Gosling (the creator of Java); its extension
1285 language was known as @dfn{Mocklisp}. This version of Emacs-in-C formed 1289 language was known as @dfn{Mocklisp}. This version of Emacs-in-C formed
1286 the basis for the early versions of GNU Emacs and also for Gosling's 1290 the basis for the early versions of GNU Emacs and also for Gosling's
1287 Unipress Emacs, a commercial product. Because of bad blood between the 1291 Unipress Emacs, a commercial product. Because of bad blood between the
1288 two over the issue of commercialism, RMS pretty much disowned this 1292 two over the issue of commercialism, RMS pretty much disowned this
1289 collaboration, referring to it as "Gosling Emacs". 1293 collaboration, referring to it as ``Gosling Emacs''.
1290 1294
1291 At this point we pick up with a time line of events. (A broader timeline 1295 At this point we pick up with a time line of events. (A broader timeline
1292 is available at @uref{http://www.jwz.org/doc/emacs-timeline.html, 1296 is available at @uref{http://www.jwz.org/doc/emacs-timeline.html,
1293 ``Emacs Timeline''}.) 1297 ``Emacs Timeline''}.)
1294 1298
1575 redisplay code, preliminary I18N support, code merged from GNU Emacs 1579 redisplay code, preliminary I18N support, code merged from GNU Emacs
1576 19.8 beta) 1580 19.8 beta)
1577 @item 1581 @item
1578 Version 19.9 released January 12, 1994. (Scrollbars, Athena.) 1582 Version 19.9 released January 12, 1994. (Scrollbars, Athena.)
1579 @item 1583 @item
1580 Version 19.10 released May 27, 1994. (Uses `configure'; code merged 1584 Version 19.10 released May 27, 1994. (Uses @code{configure}; code merged
1581 from GNU Emacs 19.23 beta and further merging with Epoch 4.0) Known as 1585 from GNU Emacs 19.23 beta and further merging with Epoch 4.0) Known as
1582 "Lucid Emacs" when shipped by Lucid, and as "XEmacs" when shipped by 1586 ``Lucid Emacs'' when shipped by Lucid, and as ``XEmacs'' when shipped by
1583 Sun; but Lucid went out of business a few days later and it's unclear 1587 Sun; but Lucid went out of business a few days later and it's unclear
1584 whether very many copies of 19.10 were released by Lucid. (Last release by 1588 whether very many copies of 19.10 were released by Lucid. (Last release by
1585 Jamie Zawinski.) 1589 Jamie Zawinski.)
1586 @end itemize 1590 @end itemize
1587 1591
1887 rewritten redisplay, TTY support, multi-device support, device and 1891 rewritten redisplay, TTY support, multi-device support, device and
1888 console objects, specifiers, glyphs, toolbars, horizontal scrollbars, 1892 console objects, specifiers, glyphs, toolbars, horizontal scrollbars,
1889 Lucid scrollbar widget, 3-d modeline, stay-up Lucid menus, resizable 1893 Lucid scrollbar widget, 3-d modeline, stay-up Lucid menus, resizable
1890 minibuffer, echo area is a true buffer, MD5 hashing support, expanded 1894 minibuffer, echo area is a true buffer, MD5 hashing support, expanded
1891 menubar, redone menu specification format (including menu filters), 1895 menubar, redone menu specification format (including menu filters),
1892 rewritten extents, renamed "screen" to "frame", misc-user events, 1896 rewritten extents, renamed ``screen'' to ``frame'', misc-user events,
1893 rewritten face code, rewritten mouse code, warnings system, CL 1897 rewritten face code, rewritten mouse code, warnings system, CL
1894 backquote syntax, critical C-g, code merging with GNU Emacs 19.28. 1898 backquote syntax, critical C-g, code merging with GNU Emacs 19.28.
1895 New packages Hyperbole, OOBR, hm--html-menus, viper, lazy-lock, 1899 New packages Hyperbole, OOBR, hm--html-menus, viper, lazy-lock,
1896 ksh-mode, rsz-minibuf.) 1900 ksh-mode, rsz-minibuf.)
1897 @item 1901 @item
1935 version 20.4 released February 28, 1998. 1939 version 20.4 released February 28, 1998.
1936 @item 1940 @item
1937 version 21.0.60 released December 10, 1998. (The version naming scheme was 1941 version 21.0.60 released December 10, 1998. (The version naming scheme was
1938 changed at this point: [a] the second version number is odd for stable 1942 changed at this point: [a] the second version number is odd for stable
1939 versions, even for beta versions; [b] a third version number is added, 1943 versions, even for beta versions; [b] a third version number is added,
1940 replacing the "beta xxx" ending for beta versions and allowing for 1944 replacing the ``beta xxx'' ending for beta versions and allowing for
1941 periodic maintenance releases for stable versions. Therefore, 21.0 was 1945 periodic maintenance releases for stable versions. Therefore, 21.0 was
1942 never "officially" released; similarly for 21.2, etc.) 1946 never ``officially'' released; similarly for 21.2, etc.)
1943 @item 1947 @item
1944 version 21.0.61 released January 4, 1999. 1948 version 21.0.61 released January 4, 1999.
1945 @item 1949 @item
1946 version 21.0.63 released February 3, 1999. 1950 version 21.0.63 released February 3, 1999.
1947 @item 1951 @item
1953 @item 1957 @item
1954 version 21.0.67 released March 25, 1999. 1958 version 21.0.67 released March 25, 1999.
1955 @item 1959 @item
1956 version 21.1.2 released May 14, 1999. (This is the followup to 21.0.67. 1960 version 21.1.2 released May 14, 1999. (This is the followup to 21.0.67.
1957 The second version number was bumped to indicate the beginning of the 1961 The second version number was bumped to indicate the beginning of the
1958 "stable" series.) 1962 ``stable'' series.)
1959 @item 1963 @item
1960 version 21.1.3 released June 26, 1999. 1964 version 21.1.3 released June 26, 1999.
1961 @item 1965 @item
1962 version 21.1.4 released July 8, 1999. 1966 version 21.1.4 released July 8, 1999.
1963 @item 1967 @item
2043 @item 2047 @item
2044 version 21.2.39 released December 31, 2000. 2048 version 21.2.39 released December 31, 2000.
2045 @item 2049 @item
2046 version 21.2.40 released January 8, 2001. 2050 version 21.2.40 released January 8, 2001.
2047 @item 2051 @item
2048 version 21.2.41 "Polyhymnia" released January 17, 2001. 2052 version 21.2.41 ``Polyhymnia'' released January 17, 2001.
2049 @item 2053 @item
2050 version 21.2.42 "Poseidon" released January 20, 2001. 2054 version 21.2.42 ``Poseidon'' released January 20, 2001.
2051 @item 2055 @item
2052 version 21.2.43 "Terspichore" released January 26, 2001. 2056 version 21.2.43 ``Terspichore'' released January 26, 2001.
2053 @item 2057 @item
2054 version 21.2.44 "Thalia" released February 8, 2001. 2058 version 21.2.44 ``Thalia'' released February 8, 2001.
2055 @item 2059 @item
2056 version 21.2.45 "Thelxepeia" released February 23, 2001. 2060 version 21.2.45 ``Thelxepeia'' released February 23, 2001.
2057 @item 2061 @item
2058 version 21.2.46 "Urania" released March 21, 2001. 2062 version 21.2.46 ``Urania'' released March 21, 2001.
2059 @item 2063 @item
2060 version 21.2.47 "Zephir" released April 14, 2001. 2064 version 21.2.47 ``Zephir'' released April 14, 2001.
2061 @item 2065 @item
2062 XEmacs 21.4.0 "Solid Vapor" released April 16, 2001. 2066 XEmacs 21.4.0 ``Solid Vapor'' released April 16, 2001.
2063 @item 2067 @item
2064 XEmacs 21.4.1 "Copyleft" released April 19, 2001. 2068 XEmacs 21.4.1 ``Copyleft'' released April 19, 2001.
2065 @item 2069 @item
2066 XEmacs 21.4.2 "Developer-Friendly Unix APIs" released May 10, 2001. 2070 XEmacs 21.4.2 ``Developer-Friendly Unix APIs'' released May 10, 2001.
2067 @item 2071 @item
2068 XEmacs 21.4.3 "Academic Rigor" released May 17, 2001. 2072 XEmacs 21.4.3 ``Academic Rigor'' released May 17, 2001.
2069 @item 2073 @item
2070 XEmacs 21.4.4 "Artificial Intelligence" released July 28, 2001. 2074 XEmacs 21.4.4 ``Artificial Intelligence'' released July 28, 2001.
2071 @item 2075 @item
2072 XEmacs 21.4.5 "Civil Service" released October 23, 2001. 2076 XEmacs 21.4.5 ``Civil Service'' released October 23, 2001.
2073 @item 2077 @item
2074 XEmacs 21.4.6 "Common Lisp" released December 17, 2001. 2078 XEmacs 21.4.6 ``Common Lisp'' released December 17, 2001.
2075 @item 2079 @item
2076 XEmacs 21.4.7 "Economic Science" released May 4, 2002. 2080 XEmacs 21.4.7 ``Economic Science'' released May 4, 2002.
2077 @item 2081 @item
2078 XEmacs 21.4.8 "Honest Recruiter" released May 9, 2002. 2082 XEmacs 21.4.8 ``Honest Recruiter'' released May 9, 2002.
2079 @item 2083 @item
2080 XEmacs 21.4.9 "Informed Management" released August 23, 2002. 2084 XEmacs 21.4.9 ``Informed Management'' released August 23, 2002.
2081 @item 2085 @item
2082 XEmacs 21.4.10 "Military Intelligence" released November 2, 2002. 2086 XEmacs 21.4.10 ``Military Intelligence'' released November 2, 2002.
2083 @item 2087 @item
2084 XEmacs 21.4.11 "Native Windows TTY Support" released January 3, 2003. 2088 XEmacs 21.4.11 ``Native Windows TTY Support'' released January 3, 2003.
2085 @item 2089 @item
2086 XEmacs 21.4.12 "Portable Code" released January 15, 2003. 2090 XEmacs 21.4.12 ``Portable Code'' released January 15, 2003.
2087 @item 2091 @item
2088 XEmacs 21.4.13 "Rational FORTRAN" released May 25, 2003. 2092 XEmacs 21.4.13 ``Rational FORTRAN'' released May 25, 2003.
2089 @item 2093 @item
2090 XEmacs 21.4.14 "Reasonable Discussion" released September 3, 2003. 2094 XEmacs 21.4.14 ``Reasonable Discussion'' released September 3, 2003.
2091 @item 2095 @item
2092 XEmacs 21.4.15 "Security Through Obscurity" released February 2, 2004. 2096 XEmacs 21.4.15 ``Security Through Obscurity'' released February 2, 2004.
2093 @item 2097 @item
2094 XEmacs 21.4.16 "Successful IPO" released December 5, 2004. 2098 XEmacs 21.4.16 ``Successful IPO'' released December 5, 2004.
2095 @item 2099 @item
2096 version 21.5.0 "alfalfa" released April 18, 2001. 2100 version 21.5.0 ``alfalfa'' released April 18, 2001.
2097 @item 2101 @item
2098 version 21.5.1 "anise" released May 9, 2001. 2102 version 21.5.1 ``anise'' released May 9, 2001.
2099 @item 2103 @item
2100 version 21.5.2 "artichoke" released July 28, 2001. 2104 version 21.5.2 ``artichoke'' released July 28, 2001.
2101 @item 2105 @item
2102 version 21.5.3 "asparagus" released September 7, 2001. 2106 version 21.5.3 ``asparagus'' released September 7, 2001.
2103 @item 2107 @item
2104 version 21.5.4 "bamboo" released January 8, 2002. 2108 version 21.5.4 ``bamboo'' released January 8, 2002.
2105 @item 2109 @item
2106 version 21.5.5 "beets" released March 5, 2002. 2110 version 21.5.5 ``beets'' released March 5, 2002.
2107 @item 2111 @item
2108 version 21.5.6 "bok choi" released April 5, 2002. 2112 version 21.5.6 ``bok choi'' released April 5, 2002.
2109 @item 2113 @item
2110 version 21.5.7 "broccoflower" released July 2, 2002. 2114 version 21.5.7 ``broccoflower'' released July 2, 2002.
2111 @item 2115 @item
2112 version 21.5.8 "broccoli" released July 27, 2002. 2116 version 21.5.8 ``broccoli'' released July 27, 2002.
2113 @item 2117 @item
2114 version 21.5.9 "brussels sprouts" released August 30, 2002. 2118 version 21.5.9 ``brussels sprouts'' released August 30, 2002.
2115 @item 2119 @item
2116 version 21.5.10 "burdock" released January 4, 2003. 2120 version 21.5.10 ``burdock'' released January 4, 2003.
2117 @item 2121 @item
2118 version 21.5.11 "cabbage" released February 16, 2003. 2122 version 21.5.11 ``cabbage'' released February 16, 2003.
2119 @item 2123 @item
2120 version 21.5.12 "carrot" released April 24, 2003. 2124 version 21.5.12 ``carrot'' released April 24, 2003.
2121 @item 2125 @item
2122 version 21.5.13 "cauliflower" released May 10, 2003. 2126 version 21.5.13 ``cauliflower'' released May 10, 2003.
2123 @item 2127 @item
2124 version 21.5.14 "cassava" released June 1, 2003. 2128 version 21.5.14 ``cassava'' released June 1, 2003.
2125 @item 2129 @item
2126 version 21.5.15 "celery" released September 3, 2003. 2130 version 21.5.15 ``celery'' released September 3, 2003.
2127 @item 2131 @item
2128 version 21.5.16 "celeriac" released September 26, 2003. 2132 version 21.5.16 ``celeriac'' released September 26, 2003.
2129 @item 2133 @item
2130 version 21.5.17 "chayote" released March 22, 2004. 2134 version 21.5.17 ``chayote'' released March 22, 2004.
2131 @item 2135 @item
2132 version 21.5.18 "chestnut" released October 22, 2004. 2136 version 21.5.18 ``chestnut'' released October 22, 2004.
2133 @end itemize 2137 @end itemize
2134 2138
2135 @node The XEmacs Split, XEmacs from the Outside, A History of Emacs, Top 2139 @node The XEmacs Split, XEmacs from the Outside, A History of Emacs, Top
2136 @chapter The XEmacs Split 2140 @chapter The XEmacs Split
2137 @cindex XEmacs split 2141 @cindex XEmacs split
2151 to cooperate a bit with RMS, and the two versions of Emacs will merge. In 2155 to cooperate a bit with RMS, and the two versions of Emacs will merge. In
2152 fact there have been six to seven major attempts at merging, each running 2156 fact there have been six to seven major attempts at merging, each running
2153 hundreds of messages long and all of them coming from the XEmacs side. All 2157 hundreds of messages long and all of them coming from the XEmacs side. All
2154 have failed because they have eventually come to the same conclusion, which 2158 have failed because they have eventually come to the same conclusion, which
2155 is that RMS has no real interest in cooperation at all. If you work with 2159 is that RMS has no real interest in cooperation at all. If you work with
2156 him, you have to do it his way -- "my way or the highway". Specifically: 2160 him, you have to do it his way -- ``my way or the highway''. Specifically:
2157 2161
2158 @enumerate 2162 @enumerate
2159 @item 2163 @item
2160 2164
2161 RMS insists on having legal papers signed for every bit of code that goes 2165 RMS insists on having legal papers signed for every bit of code that goes
4046 zero or more Kanji characters followed by zero or more 4050 zero or more Kanji characters followed by zero or more
4047 Hiragana characters. 4051 Hiragana characters.
4048 @end display 4052 @end display
4049 4053
4050 Then, the problem is that now we can't say that a sequence of 4054 Then, the problem is that now we can't say that a sequence of
4051 word-constituents makes up a word. For instance, both Hiragana "A" 4055 word-constituents makes up a word. For instance, both Hiragana ``A''
4052 and Kanji "KAN" are word-constituents but the sequence of these two 4056 and Kanji ``KAN'' are word-constituents but the sequence of these two
4053 letters can't be a single word. 4057 letters can't be a single word.
4054 4058
4055 So, we introduced Sextword for Japanese letters. 4059 So, we introduced Sextword for Japanese letters.
4056 @end quotation 4060 @end quotation
4057 4061
5006 @item 5010 @item
5007 Any header-file declarations of the sort 5011 Any header-file declarations of the sort
5008 5012
5009 struct foobar; 5013 struct foobar;
5010 5014
5011 go into the "types" section of lisp.h. 5015 go into the ``types'' section of @file{lisp.h}.
5012 @end itemize 5016 @end itemize
5013 5017
5014 @node Writing New Modules, Working with Lisp Objects, Introduction to Writing C Code, Rules When Writing New C Code 5018 @node Writing New Modules, Working with Lisp Objects, Introduction to Writing C Code, Rules When Writing New C Code
5015 @section Writing New Modules 5019 @section Writing New Modules
5016 @cindex writing new modules 5020 @cindex writing new modules
5269 style now forbids passing pointers to @samp{Lisp_<Type>} structures into 5273 style now forbids passing pointers to @samp{Lisp_<Type>} structures into
5270 or out of a function; instead, a @samp{Lisp_Object} should be passed or 5274 or out of a function; instead, a @samp{Lisp_Object} should be passed or
5271 returned (created using @samp{wrap_<type>}, if necessary). 5275 returned (created using @samp{wrap_<type>}, if necessary).
5272 5276
5273 @c #### declaration 5277 @c #### declaration
5274 @item DECLARE_LRECORD (<type>, Lisp_<Type>) 5278 @item DECLARE_LISP_OBJECT (<type>, Lisp_<Type>)
5275 Declares an @samp{lrecord} for @samp{<Type>}, which is the unit of 5279 Declares a Lisp object for @samp{<Type>}, which is the unit of
5276 allocation. 5280 allocation.
5277 5281
5278 @item #define X<TYPE>(x) XRECORD (x, <type>, Lisp_<Type>) 5282 @item #define X<TYPE>(x) XRECORD (x, <type>, Lisp_<Type>)
5279 Turns a @code{Lisp_Object} into a pointer to @samp{struct Lisp_<Type>}. 5283 Turns a @code{Lisp_Object} into a pointer to @samp{struct Lisp_<Type>}.
5280 5284
5336 Here is a checklist of things to do when creating a new lisp object type 5340 Here is a checklist of things to do when creating a new lisp object type
5337 named @var{foo}: 5341 named @var{foo}:
5338 5342
5339 @enumerate 5343 @enumerate
5340 @item 5344 @item
5341 create @var{foo}.h 5345 Create @var{foo}.h
5342 @item 5346 @item
5343 create @var{foo}.c 5347 Create @var{foo}.c
5344 @item 5348 @item
5345 add definitions of @code{syms_of_@var{foo}}, etc. to @file{@var{foo}.c} 5349 Add definitions of @code{syms_of_@var{foo}}, etc. to @file{@var{foo}.c}
5346 @item 5350 @item
5347 add declarations of @code{syms_of_@var{foo}}, etc. to @file{symsinit.h} 5351 Add declarations of @code{syms_of_@var{foo}}, etc. to @file{symsinit.h}
5348 @item 5352 @item
5349 add calls to @code{syms_of_@var{foo}}, etc. to @file{emacs.c} 5353 Add calls to @code{syms_of_@var{foo}}, etc. to @file{emacs.c}
5350 @item 5354 @item
5351 add definitions of macros like @code{CHECK_@var{FOO}} and 5355 Add definitions of macros like @code{CHECK_@var{FOO}} and
5352 @code{@var{FOO}P} to @file{@var{foo}.h} 5356 @code{@var{FOO}P} to @file{@var{foo}.h}
5353 @item 5357 @item
5354 add the new type index to @code{enum lrecord_type} 5358 Add the new type index to @code{enum lrecord_type}
5355 @item 5359 @item
5356 add a DEFINE_LRECORD_IMPLEMENTATION call to @file{@var{foo}.c} 5360 Add a @code{DEFINE_*_LISP_OBJECT()} to @file{@var{foo}.c}
5357 @item 5361 @item
5358 add an INIT_LRECORD_IMPLEMENTATION call to @code{syms_of_@var{foo}.c} 5362 Add an @code{INIT_LISP_OBJECT} call to @code{syms_of_@var{foo}.c}
5359 @end enumerate 5363 @end enumerate
5360 5364
5361 5365
5362 @node Writing Lisp Primitives, Writing Good Comments, Working with Lisp Objects, Rules When Writing New C Code 5366 @node Writing Lisp Primitives, Writing Good Comments, Working with Lisp Objects, Rules When Writing New C Code
5363 @section Writing Lisp Primitives 5367 @section Writing Lisp Primitives
5664 correct it or flag it as incorrect, as described in the previous 5668 correct it or flag it as incorrect, as described in the previous
5665 paragraph. Whenever you work on a section of code, @emph{always} make 5669 paragraph. Whenever you work on a section of code, @emph{always} make
5666 sure to update any comments to be correct -- or, at the very least, flag 5670 sure to update any comments to be correct -- or, at the very least, flag
5667 them as incorrect. 5671 them as incorrect.
5668 5672
5669 To indicate a "todo" or other problem, use four pound signs -- 5673 To indicate a ``todo'' or other problem, use four pound signs --
5670 i.e. @samp{####}. 5674 i.e. @samp{####}.
5671 5675
5672 @node Adding Global Lisp Variables, Writing Macros, Writing Good Comments, Rules When Writing New C Code 5676 @node Adding Global Lisp Variables, Writing Macros, Writing Good Comments, Rules When Writing New C Code
5673 @section Adding Global Lisp Variables 5677 @section Adding Global Lisp Variables
5674 @cindex global Lisp variables, adding 5678 @cindex global Lisp variables, adding
5836 functions a gcc bug, but the gcc maintainers disagree. 5840 functions a gcc bug, but the gcc maintainers disagree.
5837 5841
5838 @cindex inline functions, headers 5842 @cindex inline functions, headers
5839 @cindex header files, inline functions 5843 @cindex header files, inline functions
5840 Every header which contains inline functions, either directly by using 5844 Every header which contains inline functions, either directly by using
5841 @code{DECLARE_INLINE_HEADER} or indirectly by using @code{DECLARE_LRECORD} must 5845 @code{DECLARE_INLINE_HEADER} or indirectly by using
5842 be added to @file{inline.c}'s includes to make the optimization 5846 @code{DECLARE_LISP_OBJECT} must be added to @file{inline.c}'s includes
5843 described above work. (Optimization note: if all INLINE_HEADER 5847 to make the optimization described above work. (Optimization note: if
5844 functions are in fact inlined in all translation units, then the linker 5848 all INLINE_HEADER functions are in fact inlined in all translation
5845 can just discard @code{inline.o}, since it contains only unreferenced code). 5849 units, then the linker can just discard @code{inline.o}, since it
5850 contains only unreferenced code).
5846 5851
5847 The three golden rules of macros: 5852 The three golden rules of macros:
5848 5853
5849 @enumerate 5854 @enumerate
5850 @item 5855 @item
5851 Anything that's an lvalue can be evaluated more than once. 5856 Anything that's an lvalue can be evaluated more than once.
5852 @item 5857 @item
5853 Macros where anything else can be evaluated more than once should 5858 Macros where anything else can be evaluated more than once should
5854 have the word "unsafe" in their name (exceptions may be made for 5859 have the word ``unsafe'' in their name (exceptions may be made for
5855 large sets of macros that evaluate arguments of certain types more 5860 large sets of macros that evaluate arguments of certain types more
5856 than once, e.g. struct buffer * arguments, when clearly indicated in 5861 than once, e.g. struct buffer * arguments, when clearly indicated in
5857 the macro documentation). These macros are generally meant to be 5862 the macro documentation). These macros are generally meant to be
5858 called only by other macros that have already stored the calling 5863 called only by other macros that have already stored the calling
5859 values in temporary variables. 5864 values in temporary variables.
5881 Capitalize macros doing stuff obviously impossible with (C) 5886 Capitalize macros doing stuff obviously impossible with (C)
5882 functions, e.g. directly modifying arguments as if they were passed by 5887 functions, e.g. directly modifying arguments as if they were passed by
5883 reference. 5888 reference.
5884 @item 5889 @item
5885 Capitalize macros that evaluate @strong{any} argument more than once regardless 5890 Capitalize macros that evaluate @strong{any} argument more than once regardless
5886 of whether that's "allowed" (e.g. buffer arguments). 5891 of whether that's ``allowed'' (e.g. buffer arguments).
5887 @item 5892 @item
5888 Capitalize macros that directly access a field in a Lisp_Object or 5893 Capitalize macros that directly access a field in a Lisp_Object or
5889 its equivalent underlying structure. In such cases, access through the 5894 its equivalent underlying structure. In such cases, access through the
5890 Lisp_Object precedes the macro with an X, and access through the underlying 5895 Lisp_Object precedes the macro with an X, and access through the underlying
5891 structure doesn't. 5896 structure doesn't.
5936 a search-and-replace is done to change type names and such. Some people 5941 a search-and-replace is done to change type names and such. Some people
5937 disagree with such changes, and certainly if done without good reason 5942 disagree with such changes, and certainly if done without good reason
5938 will just lead to headaches. But it's important to keep the code clean 5943 will just lead to headaches. But it's important to keep the code clean
5939 and understandable, and consistent naming goes a long way towards this. 5944 and understandable, and consistent naming goes a long way towards this.
5940 5945
5941 An example of the right way to do this was the so-called "great integral 5946 An example of the right way to do this was the so-called ``great integral
5942 type renaming". 5947 type renaming''.
5943 5948
5944 @menu 5949 @menu
5945 * Great Integral Type Renaming:: 5950 * Great Integral Type Renaming::
5946 * Text/Char Type Renaming:: 5951 * Text/Char Type Renaming::
5947 @end menu 5952 @end menu
5964 @item 5969 @item
5965 All integral types that measure quantities of anything are signed. Some 5970 All integral types that measure quantities of anything are signed. Some
5966 people disagree vociferously with this, but their arguments are mostly 5971 people disagree vociferously with this, but their arguments are mostly
5967 theoretical, and are vastly outweighed by the practical headaches of 5972 theoretical, and are vastly outweighed by the practical headaches of
5968 mixing signed and unsigned values, and more importantly by the far 5973 mixing signed and unsigned values, and more importantly by the far
5969 increased likelihood of inadvertent bugs: Because of the broken "viral" 5974 increased likelihood of inadvertent bugs: Because of the broken ``viral''
5970 nature of unsigned quantities in C (operations involving mixed 5975 nature of unsigned quantities in C (operations involving mixed
5971 signed/unsigned are done unsigned, when exactly the opposite is nearly 5976 signed/unsigned are done unsigned, when exactly the opposite is nearly
5972 always wanted), even a single error in declaring a quantity unsigned 5977 always wanted), even a single error in declaring a quantity unsigned
5973 that should be signed, or even the even more subtle error of comparing 5978 that should be signed, or even the even more subtle error of comparing
5974 signed and unsigned values and forgetting the necessary cast, can be 5979 signed and unsigned values and forgetting the necessary cast, can be
5975 catastrophic, as comparisons will yield wrong results. -Wsign-compare 5980 catastrophic, as comparisons will yield wrong results. @samp{-Wsign-compare}
5976 is turned on specifically to catch this, but this tends to result in a 5981 is turned on specifically to catch this, but this tends to result in a
5977 great number of warnings when mixing signed and unsigned, and the casts 5982 great number of warnings when mixing signed and unsigned, and the casts
5978 are annoying. More has been written on this elsewhere. 5983 are annoying. More has been written on this elsewhere.
5979 5984
5980 @item 5985 @item
5989 Type names should be relatively short (no more than 10 characters or 5994 Type names should be relatively short (no more than 10 characters or
5990 so), with the first letter capitalized and no underscores if they can at 5995 so), with the first letter capitalized and no underscores if they can at
5991 all be avoided. 5996 all be avoided.
5992 5997
5993 @item 5998 @item
5994 "count" == a zero-based measurement of some quantity. Includes sizes, 5999 ``count'' == a zero-based measurement of some quantity. Includes sizes,
5995 offsets, and indexes. 6000 offsets, and indexes.
5996 6001
5997 @item 6002 @item
5998 "bpos" == a one-based measurement of a position in a buffer. "Charbpos" 6003 ``bpos'' == a one-based measurement of a position in a buffer. ``Charbpos''
5999 and "Bytebpos" count text in the buffer, rather than bytes in memory; 6004 and ``Bytebpos'' count text in the buffer, rather than bytes in memory;
6000 thus Bytebpos does not directly correspond to the memory representation. 6005 thus Bytebpos does not directly correspond to the memory representation.
6001 Use "Membpos" for this. 6006 Use ``Membpos'' for this.
6002 6007
6003 @item 6008 @item
6004 "Char" refers to internal-format characters, not to the C type "char", 6009 ``Char'' refers to internal-format characters, not to the C type ``char'',
6005 which is really a byte. 6010 which is really a byte.
6006 @end itemize 6011 @end itemize
6007 6012
6008 For the actual name changes, see the script below. 6013 For the actual name changes, see the script below.
6009 6014
6094 #endif 6099 #endif
6095 6100
6096 /* There have been some arguments over what the type should be that 6101 /* There have been some arguments over what the type should be that
6097 specifies a count of bytes in a data block to be written out or read in, 6102 specifies a count of bytes in a data block to be written out or read in,
6098 using @code{Lstream_read()}, @code{Lstream_write()}, and related functions. 6103 using @code{Lstream_read()}, @code{Lstream_write()}, and related functions.
6099 Originally it was long, which worked fine; Martin "corrected" these to 6104 Originally it was long, which worked fine; Martin ``corrected'' these to
6100 size_t and ssize_t on the grounds that this is theoretically cleaner and 6105 size_t and ssize_t on the grounds that this is theoretically cleaner and
6101 is in keeping with the C standards. Unfortunately, this practice is 6106 is in keeping with the C standards. Unfortunately, this practice is
6102 horribly error-prone due to design flaws in the way that mixed 6107 horribly error-prone due to design flaws in the way that mixed
6103 signed/unsigned arithmetic happens. In fact, by doing this change, 6108 signed/unsigned arithmetic happens. In fact, by doing this change,
6104 Martin introduced a subtle but fatal error that caused the operation of 6109 Martin introduced a subtle but fatal error that caused the operation of
6469 fixed---use the @code{Known-Bug-Expect-Failure} wrapper macro to mark 6474 fixed---use the @code{Known-Bug-Expect-Failure} wrapper macro to mark
6470 them. 6475 them.
6471 6476
6472 @deffn Macro Known-Bug-Expect-Failure body 6477 @deffn Macro Known-Bug-Expect-Failure body
6473 Arrange for failing tests in @var{body} to generate messages prefixed 6478 Arrange for failing tests in @var{body} to generate messages prefixed
6474 with "KNOWN BUG:" instead of "FAIL:". @var{body} is a @code{progn}-like 6479 with ``KNOWN BUG:'' instead of ``FAIL:''. @var{body} is a @code{progn}-like
6475 body, and may contain several tests. 6480 body, and may contain several tests.
6476 @end deffn 6481 @end deffn
6477 6482
6478 A lot of the tests we run push limits; suppress Ebola warning messages 6483 A lot of the tests we run push limits; suppress Ebola warning messages
6479 with the @code{Ignore-Ebola} wrapper macro. 6484 with the @code{Ignore-Ebola} wrapper macro.
6650 with added or deleted files.} If you are lucky, the operation will 6655 with added or deleted files.} If you are lucky, the operation will
6651 simply fail. If you are less lucky, it will proceed, but make the 6656 simply fail. If you are less lucky, it will proceed, but make the
6652 adds and deletes on the main line, which you do not want at all. 6657 adds and deletes on the main line, which you do not want at all.
6653 Therefore, you must undo all adds and deletes. To find out what is 6658 Therefore, you must undo all adds and deletes. To find out what is
6654 added and deleted, use something like @code{cvs -n update >&! 6659 added and deleted, use something like @code{cvs -n update >&!
6655 cvs.out}, which does a "dry run". (You did make a backup copy first, 6660 cvs.out}, which does a ``dry run''. (You did make a backup copy first,
6656 right? What if you forgot the @samp{-n}, for example, and weren't 6661 right? What if you forgot the @samp{-n}, for example, and weren't
6657 prepared for the sudden onslaught of merging action?) Take a look at 6662 prepared for the sudden onslaught of merging action?) Take a look at
6658 the output file @file{cvs.out} and check very carefully for newly 6663 the output file @file{cvs.out} and check very carefully for newly
6659 added files (marked with an @samp{A}) and newly removed files (marked 6664 added files (marked with an @samp{A}) and newly removed files (marked
6660 with an @samp{R}). Double check that your newly added files are in 6665 with an @samp{R}). Double check that your newly added files are in
6682 crw tag -b ben-mule-21-5 6687 crw tag -b ben-mule-21-5
6683 @end example 6688 @end example
6684 6689
6685 Note that this doesn't actually do anything to your local workspace! 6690 Note that this doesn't actually do anything to your local workspace!
6686 It basically just creates another tag in the repository, identical to 6691 It basically just creates another tag in the repository, identical to
6687 the branch point tag but internally marked as a "branch tag" rather 6692 the branch point tag but internally marked as a ``branch tag'' rather
6688 than a regular tag. 6693 than a regular tag.
6689 6694
6690 @item 6695 @item
6691 Now, move your workspace onto the branch: 6696 Now, move your workspace onto the branch:
6692 6697
7016 and when you add a new element, the array automatically resizes itself 7021 and when you add a new element, the array automatically resizes itself
7017 if it isn't big enough. Dynarrs are extensively used in the redisplay 7022 if it isn't big enough. Dynarrs are extensively used in the redisplay
7018 mechanism. 7023 mechanism.
7019 7024
7020 7025
7021 A "dynamic array" is a contiguous array of fixed-size elements where there 7026 A ``dynamic array'' is a contiguous array of fixed-size elements where there
7022 is no upper limit (except available memory) on the number of elements in the 7027 is no upper limit (except available memory) on the number of elements in the
7023 array. Because the elements are maintained contiguously, space is used 7028 array. Because the elements are maintained contiguously, space is used
7024 efficiently (no per-element pointers necessary) and random access to a 7029 efficiently (no per-element pointers necessary) and random access to a
7025 particular element is in constant time. At any one point, the block of memory 7030 particular element is in constant time. At any one point, the block of memory
7026 that holds the array has an upper limit; if this limit is exceeded, the 7031 that holds the array has an upper limit; if this limit is exceeded, the
7027 memory is realloc()ed into a new array that is twice as big. Assuming that 7032 memory is @code{realloc()}ed into a new array that is twice as big. Assuming that
7028 the time to grow the array is on the order of the new size of the array 7033 the time to grow the array is on the order of the new size of the array
7029 block, this scheme has a provably constant amortized time (i.e. average 7034 block, this scheme has a provably constant amortized time (i.e. average
7030 time over all additions). 7035 time over all additions).
7031 7036
7032 When you add elements or retrieve elements, pointers are used. Note that 7037 When you add elements or retrieve elements, pointers are used. Note that
7130 onto a linked list, so they can be efficiently reused. This data type 7135 onto a linked list, so they can be efficiently reused. This data type
7131 is not much used in XEmacs currently, because it's a fairly new 7136 is not much used in XEmacs currently, because it's a fairly new
7132 addition. 7137 addition.
7133 7138
7134 7139
7135 A "block-type object" is used to efficiently allocate and free blocks 7140 A ``block-type object'' is used to efficiently allocate and free blocks
7136 of a particular size. Freed blocks are remembered in a free list and 7141 of a particular size. Freed blocks are remembered in a free list and
7137 are reused as necessary to allocate new blocks, so as to avoid as 7142 are reused as necessary to allocate new blocks, so as to avoid as
7138 much as possible making calls to malloc() and free(). 7143 much as possible making calls to @code{malloc()} and @code{free()}.
7139 7144
7140 This is a container object. Declare a block-type object of a specific type 7145 This is a container object. Declare a block-type object of a specific type
7141 as follows: 7146 as follows:
7142 7147
7143 struct mytype_blocktype @{ 7148 struct mytype_blocktype @{
7750 characters. No special allocation or garbage collection is necessary 7755 characters. No special allocation or garbage collection is necessary
7751 for such objects. Lisp objects of these types do not need to be 7756 for such objects. Lisp objects of these types do not need to be
7752 @code{GCPRO}ed. 7757 @code{GCPRO}ed.
7753 @end itemize 7758 @end itemize
7754 7759
7755 In the remaining two categories, the type is stored in the object 7760 In the remaining two categories, the type is stored in the object
7756 itself. The tag for all such objects is the generic @dfn{lrecord} 7761 itself. The tag for all such objects is the generic @dfn{lrecord}
7757 (Lisp_Type_Record) tag. The first bytes of the object's structure are an 7762 (Lisp_Type_Record) tag. The first bytes of the object's structure are an
7758 integer (actually a char) characterising the object's type and some 7763 integer (actually a char) characterising the object's type and some
7759 flags, in particular the mark bit used for garbage collection. A 7764 flags, in particular the mark bit used for garbage collection. A
7760 structure describing the type is accessible thru the 7765 structure describing the type is accessible thru the
8275 @code{this_one_is_unmarkable} in @code{alloc.c}). 8280 @code{this_one_is_unmarkable} in @code{alloc.c}).
8276 8281
8277 Now, the actual marking is feasible. We do so by once using the macro 8282 Now, the actual marking is feasible. We do so by once using the macro
8278 @code{MARK_RECORD_HEADER} to mark the object itself (actually the 8283 @code{MARK_RECORD_HEADER} to mark the object itself (actually the
8279 special flag in the lrecord header), and calling its special marker 8284 special flag in the lrecord header), and calling its special marker
8280 "method" @code{marker} if available. The marker method marks every 8285 ``method'' @code{marker} if available. The marker method marks every
8281 other object that is in reach from our current object. Note that these 8286 other object that is in reach from our current object. Note that these
8282 marker methods should not call @code{mark_object} recursively, but 8287 marker methods should not call @code{mark_object} recursively, but
8283 instead should return the next object from where further marking has to 8288 instead should return the next object from where further marking has to
8284 be performed. 8289 be performed.
8285 8290
8330 @code{sweep_conses}, @code{sweep_bit_vectors_1}, 8335 @code{sweep_conses}, @code{sweep_bit_vectors_1},
8331 @code{sweep_compiled_functions}, @code{sweep_floats}, 8336 @code{sweep_compiled_functions}, @code{sweep_floats},
8332 @code{sweep_symbols}, @code{sweep_extents}, @code{sweep_markers} and 8337 @code{sweep_symbols}, @code{sweep_extents}, @code{sweep_markers} and
8333 @code{sweep_events}. They are the fixed-size types cons, floats, 8338 @code{sweep_events}. They are the fixed-size types cons, floats,
8334 compiled-functions, symbol, marker, extent, and event stored in 8339 compiled-functions, symbol, marker, extent, and event stored in
8335 so-called "frob blocks", and therefore we can basically do the same on 8340 so-called ``frob blocks'', and therefore we can basically do the same on
8336 every type of object, using the same macros, especially defined only to 8341 every type of object, using the same macros, especially defined only to
8337 handle everything with respect to fixed-size blocks. The only fixed-size 8342 handle everything with respect to fixed-size blocks. The only fixed-size
8338 type that is not handled here is the fixed-size portion of strings, 8343 type that is not handled here is the fixed-size portion of strings,
8339 because we took special care of them earlier. 8344 because we took special care of them earlier.
8340 8345
8486 @node Integers and Characters, Allocation from Frob Blocks, Garbage Collection - Step by Step, Allocation of Objects in XEmacs Lisp 8491 @node Integers and Characters, Allocation from Frob Blocks, Garbage Collection - Step by Step, Allocation of Objects in XEmacs Lisp
8487 @section Integers and Characters 8492 @section Integers and Characters
8488 @cindex integers and characters 8493 @cindex integers and characters
8489 @cindex characters, integers and 8494 @cindex characters, integers and
8490 8495
8491 Integer and character Lisp objects are created from integers using the 8496 Integer and character Lisp objects are created from integers using the
8492 macros @code{XSETINT()} and @code{XSETCHAR()} or the equivalent
8493 functions @code{make_int()} and @code{make_char()}. (These are actually 8497 functions @code{make_int()} and @code{make_char()}. (These are actually
8494 macros on most systems.) These functions basically just do some moving 8498 macros on most systems.) These functions basically just do some moving
8495 of bits around, since the integral value of the object is stored 8499 of bits around, since the integral value of the object is stored
8496 directly in the @code{Lisp_Object}. 8500 directly in the @code{Lisp_Object}.
8497 8501
8498 @code{XSETINT()} and the like will truncate values given to them that
8499 are too big; i.e. you won't get the value you expected but the tag bits
8500 will at least be correct.
8501
8502 @node Allocation from Frob Blocks, lrecords, Integers and Characters, Allocation of Objects in XEmacs Lisp 8502 @node Allocation from Frob Blocks, lrecords, Integers and Characters, Allocation of Objects in XEmacs Lisp
8503 @section Allocation from Frob Blocks 8503 @section Allocation from Frob Blocks
8504 @cindex allocation from frob blocks 8504 @cindex allocation from frob blocks
8505 @cindex frob blocks, allocation from 8505 @cindex frob blocks, allocation from
8506 8506
8507 The uninitialized memory required by a @code{Lisp_Object} of a particular type 8507 The uninitialized memory required by a @code{Lisp_Object} of a
8508 is allocated using 8508 particular type is allocated using @code{ALLOCATE_FIXED_TYPE()}. This
8509 @code{ALLOCATE_FIXED_TYPE()}. This only occurs inside of the 8509 only occurs inside of the lowest-level object-creating functions in
8510 lowest-level object-creating functions in @file{alloc.c}: 8510 @file{alloc.c}: @code{Fcons()}, @code{make_float()},
8511 @code{Fcons()}, @code{make_float()}, @code{Fmake_byte_code()}, 8511 @code{Fmake_byte_code()}, @code{Fmake_symbol()},
8512 @code{Fmake_symbol()}, @code{allocate_extent()}, 8512 @code{allocate_extent()}, @code{allocate_event()},
8513 @code{allocate_event()}, @code{Fmake_marker()}, and 8513 @code{Fmake_marker()}, and @code{make_uninit_string()}. The idea is
8514 @code{make_uninit_string()}. The idea is that, for each type, there are 8514 that, for each type, there are a number of frob blocks (each 2K in
8515 a number of frob blocks (each 2K in size); each frob block is divided up 8515 size); each frob block is divided up into object-sized chunks. Each
8516 into object-sized chunks. Each frob block will have some of these 8516 frob block will have some of these chunks that are currently assigned
8517 chunks that are currently assigned to objects, and perhaps some that are 8517 to objects, and perhaps some that are free. (If a frob block has
8518 free. (If a frob block has nothing but free chunks, it is freed at the 8518 nothing but free chunks, it is freed at the end of the garbage
8519 end of the garbage collection cycle.) The free chunks are stored in a 8519 collection cycle.) The free chunks are stored in a free list, which
8520 free list, which is chained by storing a pointer in the first four bytes 8520 is chained by storing a pointer in the first four bytes of the
8521 of the chunk. (Except for the free chunks at the end of the last frob 8521 chunk. (Except for the free chunks at the end of the last frob block,
8522 block, which are handled using an index which points past the end of the 8522 which are handled using an index which points past the end of the
8523 last-allocated chunk in the last frob block.) 8523 last-allocated chunk in the last frob block.)
8524 @code{ALLOCATE_FIXED_TYPE()} first tries to retrieve a chunk from the 8524 @code{ALLOCATE_FIXED_TYPE()} first tries to retrieve a chunk from the
8525 free list; if that fails, it calls 8525 free list; if that fails, it calls
8526 @code{ALLOCATE_FIXED_TYPE_FROM_BLOCK()}, which looks at the end of the 8526 @code{ALLOCATE_FIXED_TYPE_FROM_BLOCK()}, which looks at the end of the
8527 last frob block for space, and creates a new frob block if there is 8527 last frob block for space, and creates a new frob block if there is
8528 none. (There are actually two versions of these macros, one of which is 8528 none. (There are actually two versions of these macros, one of which
8529 more defensive but less efficient and is used for error-checking.) 8529 is more defensive but less efficient and is used for error-checking.)
8530 8530
8531 @node lrecords, Low-level allocation, Allocation from Frob Blocks, Allocation of Objects in XEmacs Lisp 8531 @node lrecords, Low-level allocation, Allocation from Frob Blocks, Allocation of Objects in XEmacs Lisp
8532 @section lrecords 8532 @section lrecords
8533 @cindex lrecords 8533 @cindex lrecords
8534 8534
8535 [see @file{lrecord.h}] 8535 [see @file{lrecord.h}]
8536 8536
8537 @strong{This node needs updating for the ``new garbage collection 8537 @strong{This node needs updating for the ``new garbage collection
8538 algorithms'' (KKCC) and the ``incremental'' collector.} 8538 algorithms'' (KKCC) and the ``incremental'' collector.}
8539 8539
8540 All lrecords have at the beginning of their structure a @code{struct 8540 All lrecords have at the beginning of their structure a @code{struct
8541 lrecord_header}. This just contains a type number and some flags, 8541 lrecord_header}. This just contains a type number and some flags,
8542 including the mark bit. All builtin type numbers are defined as 8542 including the mark bit. All builtin type numbers are defined as
8543 constants in @code{enum lrecord_type}, to allow the compiler to generate 8543 constants in @code{enum lrecord_type}, to allow the compiler to generate
8544 more efficient code for @code{@var{type}P}. The type number, thru the 8544 more efficient code for @code{@var{type}P}. The type number, thru the
8545 @code{lrecord_implementations_table}, gives access to a @code{struct 8545 @code{lrecord_implementations_table}, gives access to a @code{struct
8546 lrecord_implementation}, which is a structure containing method pointers 8546 lrecord_implementation}, which is a structure containing method pointers
8547 and such. There is one of these for each type, and it is a global, 8547 and such. There is one of these for each type, and it is a global,
8548 constant, statically-declared structure that is declared in the 8548 constant, statically-declared structure that is declared in the
8549 @code{DEFINE_LRECORD_IMPLEMENTATION()} macro. 8549 @code{DEFINE_*_LISP_OBJECT()} macro.
8550 8550
8551 Simple lrecords (of type (b) above) just have a @code{struct 8551 Frob-block lrecords just have a @code{struct lrecord_header} at their
8552 lrecord_header} at their beginning. lcrecords, however, actually have a 8552 beginning. lcrecords, however, actually have a
8553 @code{struct lcrecord_header}. This, in turn, has a @code{struct 8553 @code{struct old_lcrecord_header}. This, in turn, has a @code{struct
8554 lrecord_header} at its beginning, so sanity is preserved; but it also 8554 lrecord_header} at its beginning, so sanity is preserved; but it also
8555 has a pointer used to chain all lcrecords together, and a special ID 8555 has a pointer used to chain all lcrecords together.
8556 field used to distinguish one lcrecord from another. (This field is used
8557 only for debugging and could be removed, but the space gain is not
8558 significant.)
8559 8556
8560 @strong{lcrecords are now obsolete when using the write-barrier-based 8557 @strong{lcrecords are now obsolete when using the write-barrier-based
8561 collector.} 8558 collector.}
8562 8559
8563 Simple lrecords are created using @code{ALLOCATE_FIXED_TYPE()}, just 8560 Frob-block objects are created using @code{ALLOC_FROB_BLOCK_LISP_OBJECT()}.
8564 like for other frob blocks. The only change is that the implementation 8561 All this does is call @code{ALLOCATE_FIXED_TYPE()} to allocate an
8565 pointer must be initialized correctly. (The implementation structure for 8562 object, and @code{set_lheader_implementation()} to initialize the header.
8566 an lrecord, or rather the pointer to it, is named @code{lrecord_float}, 8563
8567 @code{lrecord_extent}, @code{lrecord_buffer}, etc.) 8564 Normal objects (i.e. lcrecords) are created using
8568 8565 @code{ALLOC_NORMAL_LISP_OBJECT()}, which takes a type name (resolved
8569 lcrecords are created using @code{alloc_lcrecord()}. This takes a 8566 internally to a structure named @code{lrecord_foo} for type
8570 size to allocate and an implementation pointer. (The size needs to be 8567 @code{foo}). If they are of variable size, however, they are created
8571 passed because some lcrecords, such as window configurations, are of 8568 with @code{ALLOC_SIZED_LISP_OBJECT()}, which takes a size to allocate
8572 variable size.) This basically just @code{malloc()}s the storage, 8569 in addition to a type. This basically just @code{malloc()}s the
8573 initializes the @code{struct lcrecord_header}, and chains the lcrecord 8570 storage, initializes the @code{struct lcrecord_header}, and chains the
8574 onto the head of the list of all lcrecords, which is stored in the 8571 lcrecord onto the head of the list of all lcrecords, which is stored
8575 variable @code{all_lcrecords}. The calls to @code{alloc_lcrecord()} 8572 in the variable @code{all_lcrecords}. The calls to the above
8576 generally occur in the lowest-level allocation function for each lrecord 8573 allocation macros generally occur in the lowest-level allocation
8577 type. 8574 function for each lrecord type.
8578 8575
8579 Whenever you create an lrecord, you need to call either 8576 Whenever you create a normal object, you need to call one of the
8580 @code{DEFINE_LRECORD_IMPLEMENTATION()} or 8577 @code{DEFINE_*_LISP_OBJECT()} macros. This needs to be
8581 @code{DEFINE_LRECORD_SEQUENCE_IMPLEMENTATION()}. This needs to be
8582 specified in a @file{.c} file, at the top level. What this actually 8578 specified in a @file{.c} file, at the top level. What this actually
8583 does is define and initialize the implementation structure for the 8579 does is define and initialize the implementation structure for the
8584 lrecord. (And possibly declares a function @code{error_check_foo()} that 8580 lrecord. (And possibly declares a function @code{error_check_foo()} that
8585 implements the @code{XFOO()} macro when error-checking is enabled.) The 8581 implements the @code{XFOO()} macro when error-checking is enabled.) The
8586 arguments to the macros are the actual type name (this is used to 8582 arguments to the macros are the actual type name (this is used to
8593 are used to encapsulate type-specific information about the object, such 8589 are used to encapsulate type-specific information about the object, such
8594 as how to print it or mark it for garbage collection, so that it's easy 8590 as how to print it or mark it for garbage collection, so that it's easy
8595 to add new object types without having to add a specific case for each 8591 to add new object types without having to add a specific case for each
8596 new type in a bunch of different places. 8592 new type in a bunch of different places.
8597 8593
8598 The difference between @code{DEFINE_LRECORD_IMPLEMENTATION()} and 8594 The various macros for defining Lisp objects are as follows:
8599 @code{DEFINE_LRECORD_SEQUENCE_IMPLEMENTATION()} is that the former is 8595
8600 used for fixed-size object types and the latter is for variable-size 8596 @itemize @bullet
8601 object types. Most object types are fixed-size; some complex 8597 @item
8602 types, however (e.g. window configurations), are variable-size. 8598 @code{DEFINE_*_LISP_OBJECT} is for objects with constant size. (Either
8603 Variable-size object types have an extra method, which is called 8599 @code{DEFINE_DUMPABLE_LISP_OBJECT} for objects that can be saved in a
8604 to determine the actual size of a particular object of that type. 8600 dumped executable, or @code{DEFINE_NODUMP_LISP_OBJECT} for objects
8605 (Currently this is only used for keeping allocation statistics.) 8601 that cannot be saved -- e.g. that contain pointers to non-persistent
8606 8602 external objects such as window-system windows.)
8607 For the purpose of keeping allocation statistics, the allocation 8603
8604 @item
8605 @code{DEFINE_*_SIZABLE_LISP_OBJECT} is for objects whose size varies.
8606 This includes some simple types such as vectors, bit vectors and
8607 opaque objects, as well as complex types, especially types such as
8608 specifiers, lstreams or coding systems that have subtypes and include
8609 subtype-specific data attached to the end of the structure.
8610 Variable-size objects have an extra method that returns the size of
8611 the object. This is not used at allocation (rather, the size is
8612 specified in the call to the allocation macro), but is used for
8613 operations such as copying a Lisp object, as well as for keeping
8614 allocation statistics.
8615
8616 @item
8617 @code{DEFINE_*_FROB_BLOCK_LISP_OBJECT} is for objects that are
8618 allocated in large blocks (``frob blocks''), which are parceled up
8619 individually. Such objects need special handling in @file{alloc.c}.
8620 This does not apply to NEW_GC, because it does this automatically.
8621
8622 @item
8623 @code{DEFINE_*_INTERNAL_LISP_OBJECT} is for ``internal'' objects that
8624 should never be visible on the Lisp level. This is a shorthand for
8625 the most common type of internal objects, which have no equal or hash
8626 method (since they generally won't appear in hash tables), no
8627 finalizer and @code{internal_object_printer()} as their print method
8628 (which prints that the object is internal and shouldn't be visible
8629 externally). For internal objects needing a finalizer, equal or hash
8630 method, or wanting to customize the print method, use the normal
8631 @code{DEFINE_*_LISP_OBJECT} mechanism for defining these objects.
8632
8633 @item
8634 @code{DEFINE_*_GENERAL_LISP_OBJECT} is for objects that need to
8635 provide one of the less common methods that are omitted on most
8636 objects. These methods include the methods supporting the unified
8637 property interface using @code{get}, @code{put}, @code{remprop} and
8638 @code{object-plist}, and (for dumpable objects only) the
8639 @code{disksaver} method.
8640
8641 @item
8642 @code{DEFINE_MODULE_*} is for objects defined in an external module.
8643 @end itemize
8644
8645 @code{MAKE_LISP_OBJECT} and @code{MAKE_MODULE_LISP_OBJECT} are what
8646 underlies all of these; they define a structure containing pointers to
8647 object methods and other info such as the size of the structure
8648 containing the object.
8649
8650 For the purpose of keeping allocation statistics, the allocation
8608 engine keeps a list of all the different types that exist. Note that, 8651 engine keeps a list of all the different types that exist. Note that,
8609 since @code{DEFINE_LRECORD_IMPLEMENTATION()} is a macro that is 8652 since @code{DEFINE_*_LISP_OBJECT()} is a macro that is
8610 specified at top-level, there is no way for it to initialize the global 8653 specified at top-level, there is no way for it to initialize the
8611 data structures containing type information, like 8654 global data structures containing type information, like
8612 @code{lrecord_implementations_table}. For this reason a call to 8655 @code{lrecord_implementations_table}. For this reason a call to
8613 @code{INIT_LRECORD_IMPLEMENTATION} must be added to the same source file 8656 @code{INIT_LISP_OBJECT()} must be added to the same source
8614 containing @code{DEFINE_LRECORD_IMPLEMENTATION}, but instead of to the 8657 file containing @code{DEFINE_*_LISP_OBJECT()}, but instead of
8615 top level, to one of the init functions, typically 8658 to the top level, to one of the init functions, typically
8616 @code{syms_of_@var{foo}.c}. @code{INIT_LRECORD_IMPLEMENTATION} must be 8659 @code{syms_of_@var{foo}.c}. @code{INIT_LISP_OBJECT()} must
8617 called before an object of this type is used. 8660 be called before an object of this type is used.
8618 8661
8619 The type number is also used to index into an array holding the number 8662 The type number is also used to index into an array holding the number
8620 of objects of each type and the total memory allocated for objects of 8663 of objects of each type and the total memory allocated for objects of
8621 that type. The statistics in this array are computed during the sweep 8664 that type. The statistics in this array are computed during the sweep
8622 stage. These statistics are returned by the call to 8665 stage. These statistics are returned by the call to
8623 @code{garbage-collect}. 8666 @code{garbage-collect}.
8624 8667
8625 Note that for every type defined with a @code{DEFINE_LRECORD_*()} 8668 Note that for every type defined with a @code{DEFINE_*_LISP_OBJECT()}
8626 macro, there needs to be a @code{DECLARE_LRECORD_IMPLEMENTATION()} 8669 macro, there needs to be a @code{DECLARE_LISP_OBJECT()} somewhere in a
8627 somewhere in a @file{.h} file, and this @file{.h} file needs to be 8670 @file{.h} file, and this @file{.h} file needs to be included by
8628 included by @file{inline.c}. 8671 @file{inline.c}.
8629 8672
8630 Furthermore, there should generally be a set of @code{XFOOBAR()}, 8673 Furthermore, there should generally be a set of @code{XFOOBAR()},
8631 @code{FOOBARP()}, etc. macros in a @file{.h} (or occasionally @file{.c}) 8674 @code{FOOBARP()}, etc. macros in a @file{.h} (or occasionally
8632 file. To create one of these, copy an existing model and modify as 8675 @file{.c}) file. To create one of these, copy an existing model and
8633 necessary. 8676 modify as necessary.
8634 8677
8635 @strong{Please note:} If you define an lrecord in an external 8678 @strong{Please note:} If you define an lrecord in an external
8636 dynamically-loaded module, you must use @code{DECLARE_EXTERNAL_LRECORD}, 8679 dynamically-loaded module, you must use
8637 @code{DEFINE_EXTERNAL_LRECORD_IMPLEMENTATION}, and 8680 @code{DECLARE_MODULE_LISP_OBJECT()},
8638 @code{DEFINE_EXTERNAL_LRECORD_SEQUENCE_IMPLEMENTATION} instead of the 8681 @code{DEFINE_MODULE_*_LISP_OBJECT()}, and
8639 non-EXTERNAL forms. These macros will dynamically add new type numbers 8682 @code{INIT_MODULE_LISP_OBJECT()} instead of the non-MODULE
8640 to the global enum that records them, whereas the non-EXTERNAL forms 8683 forms. These macros will dynamically add new type numbers to the
8641 assume that the programmer has already inserted the correct type numbers 8684 global enum that records them, whereas the non-MODULE forms assume
8642 into the enum's code at compile-time. 8685 that the programmer has already inserted the correct type numbers into
8686 the enum's code at compile-time.
8643 8687
8644 The various methods in the lrecord implementation structure are: 8688 The various methods in the lrecord implementation structure are:
8645 8689
8646 @enumerate 8690 @enumerate
8647 @item 8691 @item
8701 operating-system and window-system resources associated with the object 8745 operating-system and window-system resources associated with the object
8702 (e.g. pixmaps, fonts), etc. 8746 (e.g. pixmaps, fonts), etc.
8703 8747
8704 The finalize method can be NULL if nothing needs to be done. 8748 The finalize method can be NULL if nothing needs to be done.
8705 8749
8706 WARNING #1: The finalize method is also called at the end of the dump
8707 phase; this time with the for_disksave parameter set to non-zero. The
8708 object is @emph{not} about to disappear, so you have to make sure to
8709 @emph{not} free any extra @code{malloc()}ed memory if you're going to
8710 need it later. (Also, signal an error if there are any operating-system
8711 and window-system resources here, because they can't be dumped.)
8712
8713 Finalize methods should, as a rule, set to zero any pointers after 8750 Finalize methods should, as a rule, set to zero any pointers after
8714 they've been freed, and check to make sure pointers are not zero before 8751 they've been freed, and check to make sure pointers are not zero
8715 freeing. Although I'm pretty sure that finalize methods are not called 8752 before freeing. Although I'm pretty sure that finalize methods are
8716 twice on the same object (except for the @code{for_disksave} proviso), 8753 not called twice on the same object, we've gotten nastily burned in
8717 we've gotten nastily burned in some cases by not doing this. 8754 some cases by not doing this.
8718 8755
8719 WARNING #2: The finalize method is @emph{only} called for 8756 WARNING #1: The finalize method is @emph{only} called for
8720 lcrecords, @emph{not} for simply lrecords. If you need a 8757 normal objects, @emph{not} for frob-block objects. If you need a
8721 finalize method for simple lrecords, you have to stick 8758 finalize method for frob-block objects, you have to stick
8722 it in the @code{ADDITIONAL_FREE_foo()} macro in @file{alloc.c}. 8759 it in the @code{ADDITIONAL_FREE_foo()} macro in @file{alloc.c}.
8723 8760
8724 WARNING #3: Things are in an @emph{extremely} bizarre state 8761 WARNING #2: Things are in an @emph{extremely} bizarre state
8725 when @code{ADDITIONAL_FREE_foo()} is called, so you have to 8762 when @code{ADDITIONAL_FREE_foo()} is called, so you have to
8726 be incredibly careful when writing one of these functions. 8763 be incredibly careful when writing one of these functions.
8727 See the comment in @code{gc_sweep()}. If you ever have to add 8764 See the comment in @code{gc_sweep()}. If you ever have to add
8728 one of these, consider using an lcrecord or dealing with 8765 one of these, consider using an lcrecord or dealing with
8729 the problem in a different fashion. 8766 the problem in a different fashion.
8759 To hash two or more values together into a single value, use 8796 To hash two or more values together into a single value, use
8760 @code{HASH2()}, @code{HASH3()}, @code{HASH4()}, etc. 8797 @code{HASH2()}, @code{HASH3()}, @code{HASH4()}, etc.
8761 8798
8762 @item 8799 @item
8763 @dfn{getprop}, @dfn{putprop}, @dfn{remprop}, and @dfn{plist} methods. 8800 @dfn{getprop}, @dfn{putprop}, @dfn{remprop}, and @dfn{plist} methods.
8764 These are used for object types that have properties. I don't feel like 8801 These are used for object types that have properties, and are called
8765 documenting them here. If you create one of these objects, you have to 8802 when @code{get}, @code{put}, @code{remprop}, and @code{object-plist},
8766 use different macros to define them, 8803 respectively, are called on the object. If you create one of these
8767 i.e. @code{DEFINE_LRECORD_IMPLEMENTATION_WITH_PROPS()} or 8804 objects, you have to use a different macro to define them,
8768 @code{DEFINE_LRECORD_SEQUENCE_IMPLEMENTATION_WITH_PROPS()}. 8805 i.e. @code{DEFINE_*_GENERAL_LISP_OBJECT()}.
8769 8806
8770 @item 8807 @item
8771 A @dfn{size_in_bytes} method, when the object is of variable-size. 8808 A @dfn{size_in_bytes} method, when the object is of variable-size.
8772 (i.e. declared with a @code{_SEQUENCE_IMPLEMENTATION} macro.) This should 8809 (i.e. declared with a @code{DEFINE_*_SIZABLE_*_LISP_OBJECT} macro.)
8773 simply return the object's size in bytes, exactly as you might expect. 8810 This should simply return the object's size in bytes, exactly as you
8774 For an example, see the methods for window configurations and opaques. 8811 might expect. For an example, see the methods for lstreams and opaques.
8812
8813 @item
8814 A @dfn{disksave} method. This is called at the end of the dump phase.
8815 It is used for objects that contain pointers or handles to objects
8816 created in external libraries, such as window-system windows or file
8817 handles. Such external objects cannot be dumped, so it is necessary
8818 to release them at dump time and arrange somehow or other for them to
8819 be resurrected if necessary later on.
8820
8821 It seems that even non-dumpable objects may be around at dump time,
8822 and a disksaver may be provided. (In fact, the only object currently
8823 with a disksaver, lstream, is non-dumpable.)
8824
8825 Objects rarely need to provide this method; most of the time it will
8826 be NULL. If you want to provide this method, you have to use the
8827 @code{DEFINE_*_GENERAL_LISP_OBJECT()} macro to define your object.
8775 @end enumerate 8828 @end enumerate
8776 8829
8777 @node Low-level allocation, Cons, lrecords, Allocation of Objects in XEmacs Lisp 8830 @node Low-level allocation, Cons, lrecords, Allocation of Objects in XEmacs Lisp
8778 @section Low-level allocation 8831 @section Low-level allocation
8779 @cindex low-level allocation 8832 @cindex low-level allocation
10004 complicated depending on how much information we cache. In addition to 10057 complicated depending on how much information we cache. In addition to
10005 the known region, we always cache the correct conversions for point, 10058 the known region, we always cache the correct conversions for point,
10006 BEGV, and ZV, and in addition to this we cache 16 positions where the 10059 BEGV, and ZV, and in addition to this we cache 16 positions where the
10007 conversion is known. We only look in the cache or update it when we 10060 conversion is known. We only look in the cache or update it when we
10008 need to move the known region more than a certain amount (currently 50 10061 need to move the known region more than a certain amount (currently 50
10009 chars), and then we throw away a "random" value and replace it with the 10062 chars), and then we throw away a ``random'' value and replace it with the
10010 newly calculated value. 10063 newly calculated value.
10011 10064
10012 Finally, we maintain an extra flag that tracks whether the buffer is 10065 Finally, we maintain an extra flag that tracks whether the buffer is
10013 entirely ASCII, to speed up the conversions even more. This flag is 10066 entirely ASCII, to speed up the conversions even more. This flag is
10014 actually of dubious value because in an entirely-ASCII buffer the known 10067 actually of dubious value because in an entirely-ASCII buffer the known
10040 track of a shifter value (0, 1, or 2) indicating how much to shift. 10093 track of a shifter value (0, 1, or 2) indicating how much to shift.
10041 Multiplying by 3 can be implemented by doubling and then adding the 10094 Multiplying by 3 can be implemented by doubling and then adding the
10042 original value. Dividing by 3, alas, cannot be implemented in any 10095 original value. Dividing by 3, alas, cannot be implemented in any
10043 simple shift/subtract method, as far as I know; so we just do a table 10096 simple shift/subtract method, as far as I know; so we just do a table
10044 lookup. For simplicity, we use a table of size 128K, which indexes the 10097 lookup. For simplicity, we use a table of size 128K, which indexes the
10045 "divide-by-3" values for the first 64K non-negative numbers. (Note that 10098 ``divide-by-3'' values for the first 64K non-negative numbers. (Note that
10046 we can increase the size up to 384K, i.e. indexing the first 192K 10099 we can increase the size up to 384K, i.e. indexing the first 192K
10047 non-negative numbers, while still using shorts in the array.) This also 10100 non-negative numbers, while still using shorts in the array.) This also
10048 means that the size of the known region can be at most 64K for 10101 means that the size of the known region can be at most 64K for
10049 width-three characters. 10102 width-three characters.
10050 @end quotation 10103 @end quotation
10070 @item 10123 @item
10071 the position of the gap 10124 the position of the gap
10072 @item 10125 @item
10073 the last value we computed 10126 the last value we computed
10074 @item 10127 @item
10075 a set of positions that are "far away" from previously computed positions 10128 a set of positions that are ``far away'' from previously computed positions
10076 (5000 chars currently; #### perhaps should be smaller) 10129 (5000 chars currently; #### perhaps should be smaller)
10077 @end itemize 10130 @end itemize
10078 10131
10079 For each position, we @code{CONSIDER()} it. This means: 10132 For each position, we @code{CONSIDER()} it. This means:
10080 10133
10096 the simple loop in FSF with the use of @code{bytecount_to_charcount()}, 10149 the simple loop in FSF with the use of @code{bytecount_to_charcount()},
10097 @code{charcount_to_bytecount()}, @code{bytecount_to_charcount_down()}, or 10150 @code{charcount_to_bytecount()}, @code{bytecount_to_charcount_down()}, or
10098 @code{charcount_to_bytecount_down()}. (The latter two I added for this purpose.) 10151 @code{charcount_to_bytecount_down()}. (The latter two I added for this purpose.)
10099 These scan 4 or 8 bytes at a time through purely single-byte characters. 10152 These scan 4 or 8 bytes at a time through purely single-byte characters.
10100 10153
10101 If the amount we had to scan was more than our "far away" distance (5000 10154 If the amount we had to scan was more than our ``far away'' distance (5000
10102 characters, see above), then cache the new position. 10155 characters, see above), then cache the new position.
10103 10156
10104 #### Things to do: 10157 #### Things to do:
10105 10158
10106 @itemize @bullet 10159 @itemize @bullet
10107 @item 10160 @item
10108 Look at the most recent GNU Emacs to see whether anything has changed. 10161 Look at the most recent GNU Emacs to see whether anything has changed.
10109 @item 10162 @item
10110 Think about whether it makes sense to try to implement some sort of 10163 Think about whether it makes sense to try to implement some sort of
10111 known region or list of "known regions", like we had before. This would 10164 known region or list of ``known regions'', like we had before. This would
10112 be a region of entirely single-byte characters that we can check very 10165 be a region of entirely single-byte characters that we can check very
10113 quickly. (Previously I used a range of same-width characters of any 10166 quickly. (Previously I used a range of same-width characters of any
10114 size; but this adds extra complexity and slows down the scanning, and is 10167 size; but this adds extra complexity and slows down the scanning, and is
10115 probably not worth it.) As part of the scanning process in 10168 probably not worth it.) As part of the scanning process in
10116 @code{bytecount_to_charcount()} et al., we skip over chunks of entirely 10169 @code{bytecount_to_charcount()} et al., we skip over chunks of entirely
10324 In terms of reading the actual code, there are five optimizations 10377 In terms of reading the actual code, there are five optimizations
10325 (obfuscations, if you like) that have been done. 10378 (obfuscations, if you like) that have been done.
10326 10379
10327 @enumerate 10380 @enumerate
10328 @item 10381 @item
10329 An explicit "failure stack" has been substituted for recursion. 10382 An explicit ``failure stack'' has been substituted for recursion.
10330 10383
10331 @item 10384 @item
10332 The @code{match_1_operator}, @code{next_p}, and @code{next_b} functions 10385 The @code{match_1_operator}, @code{next_p}, and @code{next_b} functions
10333 are actually inlined into the @code{match} function for efficiency. 10386 are actually inlined into the @code{match} function for efficiency.
10334 Then the pointer movement is interspersed with the matching operations. 10387 Then the pointer movement is interspersed with the matching operations.
10337 If the operator uses buffer context, the buffer pointer movement is 10390 If the operator uses buffer context, the buffer pointer movement is
10338 sometimes implicit in the operations retrieving the context. 10391 sometimes implicit in the operations retrieving the context.
10339 10392
10340 @item 10393 @item
10341 Some cases are combined into short preparation for individual cases, and 10394 Some cases are combined into short preparation for individual cases, and
10342 a "fall-through" into combined code for several cases. 10395 a ``fall-through'' into combined code for several cases.
10343 10396
10344 @item 10397 @item
10345 The @code{pattern} type is not an explicit @samp{struct}. Instead, the 10398 The @code{pattern} type is not an explicit @samp{struct}. Instead, the
10346 data (including, @emph{e.g.}, @samp{range_table}) is inlined into the 10399 data (including, @emph{e.g.}, @samp{range_table}) is inlined into the
10347 compiled bytecode. This leads to bizarre code in the interpreter like 10400 compiled bytecode. This leads to bizarre code in the interpreter like
10356 @example 10409 @example
10357 ..., 'range', count, first_8_flags, second_8_flags, ..., next_op, ... 10410 ..., 'range', count, first_8_flags, second_8_flags, ..., next_op, ...
10358 @end example 10411 @end example
10359 @end enumerate 10412 @end enumerate
10360 10413
10361 But if you keep your eye on the "switch in a loop" structure, you 10414 But if you keep your eye on the ``switch in a loop'' structure, you
10362 should be able to understand the parts you need. 10415 should be able to understand the parts you need.
10363 10416
10364 @node Multilingual Support, Consoles; Devices; Frames; Windows, Text, Top 10417 @node Multilingual Support, Consoles; Devices; Frames; Windows, Text, Top
10365 @chapter Multilingual Support 10418 @chapter Multilingual Support
10366 @cindex Mule character sets and encodings 10419 @cindex Mule character sets and encodings
10818 a simple charset like ASCII, there is only one encoding normally used -- 10871 a simple charset like ASCII, there is only one encoding normally used --
10819 each character is represented by a single byte, with the same value as 10872 each character is represented by a single byte, with the same value as
10820 its code point. For more complicated charsets, however, things are not 10873 its code point. For more complicated charsets, however, things are not
10821 so obvious. Unicode version 2, for example, is a large charset with 10874 so obvious. Unicode version 2, for example, is a large charset with
10822 thousands of characters, each indexed by a 16-bit number, often 10875 thousands of characters, each indexed by a 16-bit number, often
10823 represented in hex, e.g. 0x05D0 for the Hebrew letter "aleph". One 10876 represented in hex, e.g. 0x05D0 for the Hebrew letter ``aleph''. One
10824 obvious encoding uses two bytes per character (actually two encodings, 10877 obvious encoding uses two bytes per character (actually two encodings,
10825 depending on which of the two possible byte orderings is chosen). This 10878 depending on which of the two possible byte orderings is chosen). This
10826 encoding is convenient for internal processing of Unicode text; however, 10879 encoding is convenient for internal processing of Unicode text; however,
10827 it's incompatible with ASCII, so a different encoding, e.g. UTF-8, is 10880 it's incompatible with ASCII, so a different encoding, e.g. UTF-8, is
10828 usually used for external text, for example files or e-mail. UTF-8 10881 usually used for external text, for example files or e-mail. UTF-8
10839 10892
10840 In an ASCII or single-European-character-set world, life is very simple. 10893 In an ASCII or single-European-character-set world, life is very simple.
10841 There are 256 characters, and each character is represented using the 10894 There are 256 characters, and each character is represented using the
10842 numbers 0 through 255, which fit into a single byte. With a few 10895 numbers 0 through 255, which fit into a single byte. With a few
10843 exceptions (such as case-changing operations or syntax classes like 10896 exceptions (such as case-changing operations or syntax classes like
10844 'whitespace'), "text" is simply an array of indices into a font. You 10897 @code{whitespace}), ``text'' is simply an array of indices into a font. You
10845 can get different languages simply by choosing fonts with different 10898 can get different languages simply by choosing fonts with different
10846 8-bit character sets (ISO-8859-1, -2, special-symbol fonts, etc.), and 10899 8-bit character sets (ISO-8859-1, -2, special-symbol fonts, etc.), and
10847 everything will "just work" as long as anyone else receiving your text 10900 everything will ``just work'' as long as anyone else receiving your text
10848 uses a compatible font. 10901 uses a compatible font.
10849 10902
10850 In the multi-lingual world, however, it is much more complicated. There 10903 In the multi-lingual world, however, it is much more complicated. There
10851 are a great number of different characters which are organized in a 10904 are a great number of different characters which are organized in a
10852 complex fashion into various character sets. The representation to use 10905 complex fashion into various character sets. The representation to use
10892 text as possible. No operations should ever be performed on text encoded 10945 text as possible. No operations should ever be performed on text encoded
10893 in an external representation other than simple copying, because no 10946 in an external representation other than simple copying, because no
10894 assumptions can reliably be made about the format of this text. You 10947 assumptions can reliably be made about the format of this text. You
10895 cannot assume, for example, that the end of text is terminated by a null 10948 cannot assume, for example, that the end of text is terminated by a null
10896 byte. (For example, if the text is Unicode, it will have many null bytes 10949 byte. (For example, if the text is Unicode, it will have many null bytes
10897 in it.) You cannot find the next "slash" character by searching through 10950 in it.) You cannot find the next ``slash'' character by searching through
10898 the bytes until you find a byte that looks like a "slash" character, 10951 the bytes until you find a byte that looks like a ``slash'' character,
10899 because it might actually be the second byte of a Kanji character. 10952 because it might actually be the second byte of a Kanji character.
10900 Furthermore, all text in the internal representation must be converted, 10953 Furthermore, all text in the internal representation must be converted,
10901 even if it is known to be completely ASCII, because the external 10954 even if it is known to be completely ASCII, because the external
10902 representation may not be ASCII compatible (for example, if it is 10955 representation may not be ASCII compatible (for example, if it is
10903 Unicode). 10956 Unicode).
10923 the structures of a particular external encoding and the methods required 10976 the structures of a particular external encoding and the methods required
10924 to convert to and from this encoding. A facility exists to create coding 10977 to convert to and from this encoding. A facility exists to create coding
10925 system aliases, which in essence gives a single coding system two 10978 system aliases, which in essence gives a single coding system two
10926 different names. It is effectively used in XEmacs to provide a layer of 10979 different names. It is effectively used in XEmacs to provide a layer of
10927 abstraction on top of the actual coding systems. For example, the coding 10980 abstraction on top of the actual coding systems. For example, the coding
10928 system alias "file-name" points to whichever coding system is currently 10981 system alias ``file-name'' points to whichever coding system is currently
10929 used for encoding and decoding file names as passed to or retrieved from 10982 used for encoding and decoding file names as passed to or retrieved from
10930 system calls. In general, the actual encoding will differ from system to 10983 system calls. In general, the actual encoding will differ from system to
10931 system, and also on the particular locale that the user is in. The use 10984 system, and also on the particular locale that the user is in. The use
10932 of the file-name alias effectively hides that implementation detail on 10985 of the file-name alias effectively hides that implementation detail on
10933 top of that abstract interface layer which provides a unified set of 10986 top of that abstract interface layer which provides a unified set of
11434 C = plain char, when the base type is unsigned 11487 C = plain char, when the base type is unsigned
11435 U = unsigned 11488 U = unsigned
11436 S = signed 11489 S = signed
11437 @end example 11490 @end example
11438 11491
11439 (Formerly I had a comment saying that type (e) "should be replaced with 11492 (Formerly I had a comment saying that type (e) ``should be replaced with
11440 void *". However, there are in fact many places where an unsigned char 11493 void *''. However, there are in fact many places where an unsigned char
11441 * might be used -- e.g. for ease in pointer computation, since void * 11494 * might be used -- e.g. for ease in pointer computation, since void *
11442 doesn't allow this, and for compatibility with external APIs.) 11495 doesn't allow this, and for compatibility with external APIs.)
11443 11496
11444 Note that these typedefs are purely for documentation purposes; from 11497 Note that these typedefs are purely for documentation purposes; from
11445 the C code's perspective, they are exactly equivalent to @code{char *}, 11498 the C code's perspective, they are exactly equivalent to @code{char *},
11456 @node Different Ways of Seeing Internal Text, Buffer Positions, Byte Types, Byte/Character Types; Buffer Positions; Other Typedefs 11509 @node Different Ways of Seeing Internal Text, Buffer Positions, Byte Types, Byte/Character Types; Buffer Positions; Other Typedefs
11457 @subsection Different Ways of Seeing Internal Text 11510 @subsection Different Ways of Seeing Internal Text
11458 @cindex different ways of seeing internal text 11511 @cindex different ways of seeing internal text
11459 11512
11460 There are various ways of representing internal text. The two primary 11513 There are various ways of representing internal text. The two primary
11461 ways are as an "array" of individual characters and as a 11514 ways are as an ``array'' of individual characters and as a
11462 "stream" of bytes. In the ASCII world, where there are only 256 11515 ``stream'' of bytes. In the ASCII world, where there are only 256
11463 characters at most, things are easy because each character fits into a 11516 characters at most, things are easy because each character fits into a
11464 byte. In general, however, this is not true -- see the above discussion 11517 byte. In general, however, this is not true -- see the above discussion
11465 of characters vs. encodings. 11518 of characters vs. encodings.
11466 11519
11467 In some cases, it's also important to distinguish between a stream 11520 In some cases, it's also important to distinguish between a stream
11468 representation as a series of bytes and as a series of textual units. 11521 representation as a series of bytes and as a series of textual units.
11469 This is particularly important with respect to Unicode. The UTF-16 representation 11522 This is particularly important with respect to Unicode. The UTF-16 representation
11470 (sometimes referred to, rather sloppily, as simply the "Unicode" format) 11523 (sometimes referred to, rather sloppily, as simply the ``Unicode'' format)
11471 represents text as a series of 16-bit units. Mostly, each unit 11524 represents text as a series of 16-bit units. Mostly, each unit
11472 corresponds to a single character, but not necessarily, as characters 11525 corresponds to a single character, but not necessarily, as characters
11473 outside of the range 0-65535 (the BMP or "Basic Multilingual Plane" of 11526 outside of the range 0-65535 (the BMP or ``Basic Multilingual Plane'' of
11474 Unicode) require two 16-bit units, through the mechanism of 11527 Unicode) require two 16-bit units, through the mechanism of
11475 "surrogates". When a series of 16-bit units is serialized into a byte 11528 ``surrogates''. When a series of 16-bit units is serialized into a byte
11476 stream, there are at least two possible representations, little-endian 11529 stream, there are at least two possible representations, little-endian
11477 and big-endian, and which one is used may depend on the native format of 11530 and big-endian, and which one is used may depend on the native format of
11478 16-bit integers in the CPU of the machine that XEmacs is running 11531 16-bit integers in the CPU of the machine that XEmacs is running
11479 on. (Similarly, UTF-32 is logically a representation with 32-bit textual 11532 on. (Similarly, UTF-32 is logically a representation with 32-bit textual
11480 units.) 11533 units.)
11487 @item 11540 @item
11488 UTF-16 has 2-byte (16-bit) units. 11541 UTF-16 has 2-byte (16-bit) units.
11489 @item 11542 @item
11490 UTF-32 has 4-byte (32-bit) units. 11543 UTF-32 has 4-byte (32-bit) units.
11491 @item 11544 @item
11492 XEmacs-internal encoding (the old "Mule" encoding) has 1-byte (8-bit) 11545 XEmacs-internal encoding (the old ``Mule'' encoding) has 1-byte (8-bit)
11493 units. 11546 units.
11494 @item 11547 @item
11495 UTF-7 technically has 7-bit units that are within the "mail-safe" range 11548 UTF-7 technically has 7-bit units that are within the ``mail-safe'' range
11496 (ASCII 32 - 126 plus a few control characters), but normally is encoded 11549 (ASCII 32 - 126 plus a few control characters), but normally is encoded
11497 in an 8-bit stream. (UTF-7 is also a modal encoding, since it has a 11550 in an 8-bit stream. (UTF-7 is also a modal encoding, since it has a
11498 normal mode where printable ASCII characters represent themselves and a 11551 normal mode where printable ASCII characters represent themselves and a
11499 shifted mode, introduced with a plus sign, where a base-64 encoding is 11552 shifted mode, introduced with a plus sign, where a base-64 encoding is
11500 used.) 11553 used.)
11555 @table @code 11608 @table @code
11556 @item Ibyte 11609 @item Ibyte
11557 The data in a buffer or string is logically made up of Ibyte objects, 11610 The data in a buffer or string is logically made up of Ibyte objects,
11558 where an Ibyte takes up the same amount of space as a char. (It is 11611 where an Ibyte takes up the same amount of space as a char. (It is
11559 declared differently, though, to catch invalid usages.) Strings stored 11612 declared differently, though, to catch invalid usages.) Strings stored
11560 using Ibytes are said to be in "internal format". The important 11613 using Ibytes are said to be in ``internal format''. The important
11561 characteristics of internal format are 11614 characteristics of internal format are
11562 11615
11563 @itemize @minus 11616 @itemize @minus
11564 @item 11617 @item
11565 ASCII characters are represented as a single Ibyte, in the range 0 - 11618 ASCII characters are represented as a single Ibyte, in the range 0 -
11608 11661
11609 This means that Ichar values are upwardly compatible with the standard 11662 This means that Ichar values are upwardly compatible with the standard
11610 8-bit representation of ASCII/ISO-8859-1. 11663 8-bit representation of ASCII/ISO-8859-1.
11611 11664
11612 @item Extbyte 11665 @item Extbyte
11613 Strings that go in or out of Emacs are in "external format", typedef'ed 11666 Strings that go in or out of Emacs are in ``external format'', typedef'ed
11614 as an array of char or a char *. There is more than one external format 11667 as an array of char or a char *. There is more than one external format
11615 (JIS, EUC, etc.) but they all have similar properties. They are modal 11668 (JIS, EUC, etc.) but they all have similar properties. They are modal
11616 encodings, which is to say that the meaning of particular bytes is not 11669 encodings, which is to say that the meaning of particular bytes is not
11617 fixed but depends on what "mode" the string is currently in (e.g. bytes 11670 fixed but depends on what ``mode'' the string is currently in (e.g. bytes
11618 in the range 0 - 0x7f might be interpreted as ASCII, or as Hiragana, or 11671 in the range 0 - 0x7f might be interpreted as ASCII, or as Hiragana, or
11619 as 2-byte Kanji, depending on the current mode). The mode starts out in 11672 as 2-byte Kanji, depending on the current mode). The mode starts out in
11620 ASCII/ISO-8859-1 and is switched using escape sequences -- for example, 11673 ASCII/ISO-8859-1 and is switched using escape sequences -- for example,
11621 in the JIS encoding, 'ESC $ B' switches to a mode where pairs of bytes 11674 in the JIS encoding, 'ESC $ B' switches to a mode where pairs of bytes
11622 in the range 0 - 0x7f are interpreted as Kanji characters. 11675 in the range 0 - 0x7f are interpreted as Kanji characters.
11642 11695
11643 There are three possible ways to specify positions in a buffer. All 11696 There are three possible ways to specify positions in a buffer. All
11644 of these are one-based: the beginning of the buffer is position or 11697 of these are one-based: the beginning of the buffer is position or
11645 index 1, and 0 is not a valid position. 11698 index 1, and 0 is not a valid position.
11646 11699
11647 As a "buffer position" (typedef Charbpos): 11700 As a ``buffer position'' (typedef Charbpos):
11648 11701
11649 This is an index specifying an offset in characters from the 11702 This is an index specifying an offset in characters from the
11650 beginning of the buffer. Note that buffer positions are 11703 beginning of the buffer. Note that buffer positions are
11651 logically @strong{between} characters, not on a character. The 11704 logically @strong{between} characters, not on a character. The
11652 difference between two buffer positions specifies the number of 11705 difference between two buffer positions specifies the number of
11653 characters between those positions. Buffer positions are the 11706 characters between those positions. Buffer positions are the
11654 only kind of position externally visible to the user. 11707 only kind of position externally visible to the user.
11655 11708
11656 As a "byte index" (typedef Bytebpos): 11709 As a ``byte index'' (typedef Bytebpos):
11657 11710
11658 This is an index over the bytes used to represent the characters 11711 This is an index over the bytes used to represent the characters
11659 in the buffer. If there is no Mule support, this is identical 11712 in the buffer. If there is no Mule support, this is identical
11660 to a buffer position, because each character is represented 11713 to a buffer position, because each character is represented
11661 using one byte. However, with Mule support, many characters 11714 using one byte. However, with Mule support, many characters
11662 require two or more bytes for their representation, and so a 11715 require two or more bytes for their representation, and so a
11663 byte index may be greater than the corresponding buffer 11716 byte index may be greater than the corresponding buffer
11664 position. 11717 position.
11665 11718
11666 As a "memory index" (typedef Membpos): 11719 As a ``memory index'' (typedef Membpos):
11667 11720
11668 This is the byte index adjusted for the gap. For positions 11721 This is the byte index adjusted for the gap. For positions
11669 before the gap, this is identical to the byte index. For 11722 before the gap, this is identical to the byte index. For
11670 positions after the gap, this is the byte index plus the gap 11723 positions after the gap, this is the byte index plus the gap
11671 size. There are two possible memory indices for the gap 11724 size. There are two possible memory indices for the gap
11672 position; the memory index at the beginning of the gap should 11725 position; the memory index at the beginning of the gap should
11673 always be used, except in code that deals with manipulating the 11726 always be used, except in code that deals with manipulating the
11674 gap, where both indices may be seen. The address of the 11727 gap, where both indices may be seen. The address of the
11675 character "at" (i.e. following) a particular position can be 11728 character ``at'' (i.e. following) a particular position can be
11676 obtained from the formula 11729 obtained from the formula
11677 11730
11678 buffer_start_address + memory_index(position) - 1 11731 buffer_start_address + memory_index(position) - 1
11679 11732
11680 except in the case of characters at the gap position. 11733 except in the case of characters at the gap position.
11779 use the buffer-level functions in buffer.h, which automatically know the 11832 use the buffer-level functions in buffer.h, which automatically know the
11780 correct format and handle the gap. 11833 correct format and handle the gap.
11781 11834
11782 Some terminology: 11835 Some terminology:
11783 11836
11784 "itext" appearing in the macros means "internal-format text" -- type 11837 "itext" appearing in the macros means "internal-format text" -- type
11785 @code{Ibyte *}. Operations on such pointers themselves, rather than on the 11838 @code{Ibyte *}. Operations on such pointers themselves, rather than on the
11786 text being pointed to, have "itext" instead of "itext" in the macro 11839 text being pointed to, have "itext" instead of "itext" in the macro
11787 name. "ichar" in the macro names means an Ichar -- the representation 11840 name. "ichar" in the macro names means an Ichar -- the representation
11788 of a character as a single integer rather than a series of bytes, as part 11841 of a character as a single integer rather than a series of bytes, as part
11789 of "itext". Many of the macros below are for converting between the 11842 of "itext". Many of the macros below are for converting between the
11988 @item 12041 @item
11989 (c) using the GCC extension (@{ ... @}). 12042 (c) using the GCC extension (@{ ... @}).
11990 @end itemize 12043 @end itemize
11991 12044
11992 Turned out that all of the above had bugs, all caused by GCC (hence the 12045 Turned out that all of the above had bugs, all caused by GCC (hence the
11993 comments about "those GCC wankers" and "ream gcc up the ass"). As for 12046 comments about ``those GCC wankers'' and ``ream gcc up the ass''). As for
11994 (a), some versions of GCC (especially on Intel platforms) had 12047 (a), some versions of GCC (especially on Intel platforms) had
11995 buggy implementations of @code{alloca()} that couldn't handle being called 12048 buggy implementations of @code{alloca()} that couldn't handle being called
11996 inside of a function call -- they just decremented the stack right in the 12049 inside of a function call -- they just decremented the stack right in the
11997 middle of pushing args. Oops, crash with stack trashing, very bad. (b) 12050 middle of pushing args. Oops, crash with stack trashing, very bad. (b)
11998 was an attempt to fix (a), and that led to further GCC crashes, esp. when 12051 was an attempt to fix (a), and that led to further GCC crashes, esp. when
12971 consistency. For example, the new Mule workspace contains Ibyte 13024 consistency. For example, the new Mule workspace contains Ibyte
12972 versions of the stdlib string functions. 13025 versions of the stdlib string functions.
12973 @item Extbyte, UExtbyte 13026 @item Extbyte, UExtbyte
12974 Pointer to text in some external format, which can be defined as all 13027 Pointer to text in some external format, which can be defined as all
12975 formats other than the internal one. The data representing a string 13028 formats other than the internal one. The data representing a string
12976 in "external" format (binary or any external encoding) is logically a 13029 in ``external'' format (binary or any external encoding) is logically a
12977 set of Extbytes. Extbyte is guaranteed to be just a char, so for 13030 set of Extbytes. Extbyte is guaranteed to be just a char, so for
12978 example strlen (Extbyte *) is OK. Extbyte is only a documentation 13031 example strlen (Extbyte *) is OK. Extbyte is only a documentation
12979 device for referring to external text. 13032 device for referring to external text.
12980 @item Ascbyte, UAscbyte 13033 @item Ascbyte, UAscbyte
12981 pure ASCII text, consisting of bytes in a string in entirely US-ASCII 13034 pure ASCII text, consisting of bytes in a string in entirely US-ASCII
13115 13168
13116 @node Mule-izing Code, , An Example of Mule-Aware Code, Coding for Mule 13169 @node Mule-izing Code, , An Example of Mule-Aware Code, Coding for Mule
13117 @subsection Mule-izing Code 13170 @subsection Mule-izing Code
13118 13171
13119 A lot of code is written without Mule in mind, and needs to be made 13172 A lot of code is written without Mule in mind, and needs to be made
13120 Mule-correct or "Mule-ized". There is really no substitute for 13173 Mule-correct or ``Mule-ized''. There is really no substitute for
13121 line-by-line analysis when doing this, but the following checklist can 13174 line-by-line analysis when doing this, but the following checklist can
13122 help: 13175 help:
13123 13176
13124 @itemize @bullet 13177 @itemize @bullet
13125 @item 13178 @item
13333 @item 13386 @item
13334 Look in the CRT sources! They come with VC++. See win32.c. 13387 Look in the CRT sources! They come with VC++. See win32.c.
13335 @end enumerate 13388 @end enumerate
13336 13389
13337 @node Locales, More about code pages, Microsoft Documentation, Microsoft Windows-Related Multilingual Issues 13390 @node Locales, More about code pages, Microsoft Documentation, Microsoft Windows-Related Multilingual Issues
13338 @subsection Locales, code pages, and other concepts of "language" 13391 @subsection Locales, code pages, and other concepts of ``language''
13339 @cindex locales, code pages, and other concepts of "language" 13392 @cindex locales, code pages, and other concepts of ``language''
13340 13393
13341 First, make sure you clearly understand the difference between the C 13394 First, make sure you clearly understand the difference between the C
13342 runtime library (CRT) and the Win32 API! See win32.c. 13395 runtime library (CRT) and the Win32 API! See win32.c.
13343 13396
13344 There are various different ways of representing the vague concept 13397 There are various different ways of representing the vague concept
13345 of "language", and it can be very confusing. So: 13398 of ``language'', and it can be very confusing. So:
13346 13399
13347 @itemize @bullet 13400 @itemize @bullet
13348 @item 13401 @item
13349 The CRT library has the concept of "locale", which is a 13402 The CRT library has the concept of ``locale'', which is a
13350 combination of language and country, and which controls the way 13403 combination of language and country, and which controls the way
13351 currency and dates are displayed, the encoding of data, etc. 13404 currency and dates are displayed, the encoding of data, etc.
13352 13405
13353 @item 13406 @item
13354 XEmacs has the concept of "language environment", more or less 13407 XEmacs has the concept of ``language environment'', more or less
13355 like a locale; although currently in most cases it just refers to 13408 like a locale; although currently in most cases it just refers to
13356 the language, and no sub-language distinctions are 13409 the language, and no sub-language distinctions are
13357 made. (Exceptions are with Chinese, which has different language 13410 made. (Exceptions are with Chinese, which has different language
13358 environments for Taiwan and mainland China, due to the different 13411 environments for Taiwan and mainland China, due to the different
13359 encodings and writing systems.) 13412 encodings and writing systems.)
13361 @item 13414 @item
13362 Windows has a number of different language concepts: 13415 Windows has a number of different language concepts:
13363 13416
13364 @enumerate 13417 @enumerate
13365 @item 13418 @item
13366 There are "languages" and "sublanguages", which correspond to 13419 There are ``languages'' and ``sublanguages'', which correspond to
13367 the languages and countries of the C library -- e.g. LANG_ENGLISH 13420 the languages and countries of the C library -- e.g. LANG_ENGLISH
13368 and SUBLANG_ENGLISH_US. These are identified by 8-bit integers, 13421 and SUBLANG_ENGLISH_US. These are identified by 8-bit integers,
13369 called the "primary language identifier" and "sublanguage 13422 called the ``primary language identifier'' and ``sublanguage
13370 identifier", respectively. These are combined into a 16-bit 13423 identifier'', respectively. These are combined into a 16-bit
13371 integer or "language identifier" by MAKELANGID(). 13424 integer or ``language identifier'' by @code{MAKELANGID()}.
13372 13425
13373 @item 13426 @item
13374 The language identifier in turn is combined with a "sort 13427 The language identifier in turn is combined with a ``sort
13375 identifier" (and optionally a "sort version") to yield a 32-bit 13428 identifier'' (and optionally a ``sort version'') to yield a 32-bit
13376 integer called a "locale identifier" (type LCID), which identifies 13429 integer called a ``locale identifier'' (type LCID), which identifies
13377 locales -- the primary means of distinguishing language/regional 13430 locales -- the primary means of distinguishing language/regional
13378 settings and similar to C library locales. 13431 settings and similar to C library locales.
13379 13432
13380 @item 13433 @item
13381 A "code page" combines the XEmacs concepts of "charset" and "coding 13434 A ``code page'' combines the XEmacs concepts of ``charset'' and ``coding
13382 system". It logically encompasses 13435 system''. It logically encompasses
13383 13436
13384 @itemize @minus 13437 @itemize @minus
13385 @item 13438 @item
13386 a set of supported characters 13439 a set of supported characters
13387 @item 13440 @item
13390 supported 13443 supported
13391 @item 13444 @item
13392 a way of encoding a series of characters into a string of bytes 13445 a way of encoding a series of characters into a string of bytes
13393 @end itemize 13446 @end itemize
13394 13447
13395 Note that the first two properties correspond to an XEmacs "charset" 13448 Note that the first two properties correspond to an XEmacs ``charset''
13396 and the latter an XEmacs "coding system". 13449 and the latter an XEmacs ``coding system''.
13397 13450
13398 Traditional encodings are either simple one-byte encodings, or 13451 Traditional encodings are either simple one-byte encodings, or
13399 combination one-byte/two-byte encodings (aka MBCS encodings, where MBCS 13452 combination one-byte/two-byte encodings (aka MBCS encodings, where MBCS
13400 stands for "Multibyte Character Set") with the following properties: 13453 stands for ``Multibyte Character Set'') with the following properties:
13401 13454
13402 @itemize @minus 13455 @itemize @minus
13403 @item 13456 @item
13404 all characters are encoded as a one-byte or two-byte sequence 13457 all characters are encoded as a one-byte or two-byte sequence
13405 @item 13458 @item
13406 the encoding is stateless (non-modal) 13459 the encoding is stateless (non-modal)
13407 @item 13460 @item
13408 the lower 128 bytes are compatible with ASCII 13461 the lower 128 bytes are compatible with ASCII
13409 @item 13462 @item
13410 in the higher bytes, the value of the first byte ("lead byte") 13463 in the higher bytes, the value of the first byte (``lead byte'')
13411 determines whether a second byte follows 13464 determines whether a second byte follows
13412 @item 13465 @item
13413 the values used for second bytes may overlap those used for first 13466 the values used for second bytes may overlap those used for first
13414 bytes, and (in some encodings) include values in the low half; thus, 13467 bytes, and (in some encodings) include values in the low half; thus,
13415 moving backwards is hard, and pure-ASCII algorithms (e.g. finding the 13468 moving backwards is hard, and pure-ASCII algorithms (e.g. finding the
13427 Every Windows locale has four associated code pages: ANSI (an 13480 Every Windows locale has four associated code pages: ANSI (an
13428 international standard or some Microsoft-created approximation; the 13481 international standard or some Microsoft-created approximation; the
13429 native code page under Windows), OEM (a DOS encoding, still used in the 13482 native code page under Windows), OEM (a DOS encoding, still used in the
13430 FAT file system), Mac (an encoding used on the Macintosh) and EBCDIC (a 13483 FAT file system), Mac (an encoding used on the Macintosh) and EBCDIC (a
13431 non-ASCII-compatible encoding used on IBM mainframes, originally based 13484 non-ASCII-compatible encoding used on IBM mainframes, originally based
13432 on the BCD or "binary-coded decimal" encoding of numbers). All code 13485 on the BCD or ``binary-coded decimal'' encoding of numbers). All code
13433 pages associated with a locale follow (as far as I know) the properties 13486 pages associated with a locale follow (as far as I know) the properties
13434 listed above for traditional code pages. More than one locale can share 13487 listed above for traditional code pages. More than one locale can share
13435 a code page -- e.g. all the Western European languages, including 13488 a code page -- e.g. all the Western European languages, including
13436 English, do. 13489 English, do.
13437 13490
13438 @item 13491 @item
13439 Windows also has an "input locale identifier" (aka "keyboard 13492 Windows also has an ``input locale identifier'' (aka ``keyboard
13440 layout id") or HKL, which is a 32-bit integer composed of the 13493 layout id'') or HKL, which is a 32-bit integer composed of the
13441 16-bit language identifier and a 16-bit "device identifier", which 13494 16-bit language identifier and a 16-bit ``device identifier'', which
13442 originally specified a particular keyboard layout (e.g. the locale 13495 originally specified a particular keyboard layout (e.g. the locale
13443 "US English" can have the QWERTY layout, the Dvorak layout, etc.), 13496 ``US English'' can have the QWERTY layout, the Dvorak layout, etc.),
13444 but has been expanded to include speech-to-text converters and 13497 but has been expanded to include speech-to-text converters and
13445 other non-keyboard ways of inputting text. Note that both the HKL 13498 other non-keyboard ways of inputting text. Note that both the HKL
13446 and LCID share the language identifier in the lower 16 bits, and in 13499 and LCID share the language identifier in the lower 16 bits, and in
13447 both cases a 0 in the upper 16 bits means "default" (sort order or 13500 both cases a 0 in the upper 16 bits means ``default'' (sort order or
13448 device), providing a way to convert between HKL's, LCID's, and 13501 device), providing a way to convert between HKL's, LCID's, and
13449 language identifiers (i.e. language/sublanguage pairs). The 13502 language identifiers (i.e. language/sublanguage pairs). The
13450 default keyboard layout for a language is (as far as I can 13503 default keyboard layout for a language is (as far as I can
13451 determine) established using the Regional Settings control panel 13504 determine) established using the Regional Settings control panel
13452 applet, where you can add input locales as combinations of language 13505 applet, where you can add input locales as combinations of language
13460 13513
13461 @node More about code pages, More about locales, Locales, Microsoft Windows-Related Multilingual Issues 13514 @node More about code pages, More about locales, Locales, Microsoft Windows-Related Multilingual Issues
13462 @subsection More about code pages 13515 @subsection More about code pages
13463 @cindex more about code pages 13516 @cindex more about code pages
13464 13517
13465 Here is what MSDN says about code pages (article "Code Pages"): 13518 Here is what MSDN says about code pages (article ``Code Pages''):
13466 13519
13467 @quotation 13520 @quotation
13468 A code page is a character set, which can include numbers, 13521 A code page is a character set, which can include numbers,
13469 punctuation marks, and other glyphs. Different languages and locales 13522 punctuation marks, and other glyphs. Different languages and locales
13470 may use different code pages. For example, ANSI code page 1252 is 13523 may use different code pages. For example, ANSI code page 1252 is
13502 13555
13503 -- The "C" locale is defined by ANSI to correspond to the locale in 13556 -- The "C" locale is defined by ANSI to correspond to the locale in
13504 which C programs have traditionally executed. The code page for the 13557 which C programs have traditionally executed. The code page for the
13505 "C" locale (code page) corresponds to the ASCII character 13558 "C" locale (code page) corresponds to the ASCII character
13506 set. For example, in the "C" locale, islower returns true for the 13559 set. For example, in the "C" locale, islower returns true for the
13507 values 0x61 ?0x7A only. In another locale, islower may return true 13560 values 0x61 to 0x7A only. In another locale, islower may return true
13508 for these as well as other values, as defined by that locale. 13561 for these as well as other values, as defined by that locale.
13509 13562
13510 Under "Locale-Dependent Routines" we notice the following setlocale 13563 Under ``Locale-Dependent Routines'' we notice the following setlocale
13511 dependencies: 13564 dependencies:
13512 13565
13513 atof, atoi, atol (LC_NUMERIC) 13566 atof, atoi, atol (LC_NUMERIC)
13514 is Routines (LC_CTYPE) 13567 is Routines (LC_CTYPE)
13515 isleadbyte (LC_CTYPE) 13568 isleadbyte (LC_CTYPE)
13538 wcstombs (LC_CTYPE) 13591 wcstombs (LC_CTYPE)
13539 wctomb (LC_CTYPE) 13592 wctomb (LC_CTYPE)
13540 _wtoi/_wtol (LC_NUMERIC) 13593 _wtoi/_wtol (LC_NUMERIC)
13541 @end quotation 13594 @end quotation
13542 13595
13543 NOTE: The above documentation doesn't clearly explain the "locale code 13596 NOTE: The above documentation doesn't clearly explain the ``locale code
13544 page" and "multibyte code page". These are two different values, 13597 page'' and ``multibyte code page''. These are two different values,
13545 maintained respectively in the CRT global variables __lc_codepage and 13598 maintained respectively in the CRT global variables __lc_codepage and
13546 __mbcodepage. Calling e.g. setlocale (LC_ALL, "JAPANESE") sets @strong{ONLY} 13599 __mbcodepage. Calling e.g. setlocale (LC_ALL, "JAPANESE") sets @strong{ONLY}
13547 __lc_codepage to 932 (the code page for Japanese), and leaves 13600 __lc_codepage to 932 (the code page for Japanese), and leaves
13548 __mbcodepage unchanged (usually 1252, i.e. Windows-ANSI). You'd have to 13601 __mbcodepage unchanged (usually 1252, i.e. Windows-ANSI). You'd have to
13549 call _setmbcp() to change __mbcodepage. Figuring out from the 13602 call _setmbcp() to change __mbcodepage. Figuring out from the
13550 documentation which routines use which code page is not so obvious. But: 13603 documentation which routines use which code page is not so obvious. But:
13551 13604
13552 @itemize @bullet 13605 @itemize @bullet
13553 @item 13606 @item
13554 from "Interpretation of Multibyte-Character Sequences" it appears that 13607 from ``Interpretation of Multibyte-Character Sequences'' it appears that
13555 all "multibyte-character routines" use the multibyte code page except for 13608 all ``multibyte-character routines'' use the multibyte code page except for
13556 mblen(), _mbstrlen(), mbstowcs(), mbtowc(), wcstombs(), and wctomb(). 13609 @code{mblen()}, @code{_mbstrlen()}, @code{mbstowcs()}, @code{mbtowc()}, @code{wcstombs()}, and @code{wctomb()}.
13557 13610
13558 @item 13611 @item
13559 from "_setmbcp": "The multibyte code page also affects 13612 from ``_setmbcp'': ``The multibyte code page also affects
13560 multibyte-character processing by the following run-time library 13613 multibyte-character processing by the following run-time library
13561 routines: _exec functions _mktemp _stat _fullpath _spawn functions 13614 routines: _exec functions _mktemp _stat _fullpath _spawn functions
13562 _tempnam _makepath _splitpath tmpnam. In addition, all run-time library 13615 _tempnam _makepath _splitpath tmpnam. In addition, all run-time library
13563 routines that receive multibyte-character argv or envp program arguments 13616 routines that receive multibyte-character argv or envp program arguments
13564 as parameters (such as the _exec and _spawn families) process these 13617 as parameters (such as the _exec and _spawn families) process these
13565 strings according to the multibyte code page. Hence these routines are 13618 strings according to the multibyte code page. Hence these routines are
13566 also affected by a call to _setmbcp that changes the multibyte code 13619 also affected by a call to _setmbcp that changes the multibyte code
13567 page." 13620 page.''
13568 @end itemize 13621 @end itemize
13569 13622
13570 Summary: from looking at the CRT source (which comes with VC++) and 13623 Summary: from looking at the CRT source (which comes with VC++) and
13571 carefully looking through the docs, it appears that: 13624 carefully looking through the docs, it appears that:
13572 13625
13573 @itemize @bullet 13626 @itemize @bullet
13574 @item 13627 @item
13575 the "locale code page" is used by all of the routines listed above 13628 the ``locale code page'' is used by all of the routines listed above
13576 under "Locale-Dependent Routines" (EXCEPT _mbccpy() and _mbclen()), 13629 under ``Locale-Dependent Routines'' (EXCEPT @code{_mbccpy()} and @code{_mbclen()}),
13577 as well as any other place that converts between multibyte and Unicode 13630 as well as any other place that converts between multibyte and Unicode
13578 strings, e.g. the startup code. 13631 strings, e.g. the startup code.
13579 @item 13632 @item
13580 the "multibyte code page" is used in all of the *mb*() routines 13633 the ``multibyte code page'' is used in all of the @code{*mb*()} routines
13581 except mblen(), _mbstrlen(), mbstowcs(), mbtowc(), wcstombs(), 13634 except @code{mblen()}, @code{_mbstrlen()}, @code{mbstowcs()}, @code{mbtowc()}, @code{wcstombs()},
13582 and wctomb(); also _exec*(), _spawn*(), _mktemp(), _stat(), _fullpath(), 13635 and @code{wctomb()}; also @code{_exec*()}, @code{_spawn*()}, @code{_mktemp()}, @code{_stat()}, @code{_fullpath()},
13583 _tempnam(), _makepath(), _splitpath(), tmpnam(), and similar functions 13636 @code{_tempnam()}, @code{_makepath()}, @code{_splitpath()}, @code{tmpnam()}, and similar functions
13584 without the leading underscore. 13637 without the leading underscore.
13585 @end itemize 13638 @end itemize
13586 13639
13587 @node More about locales, Unicode support under Windows, More about code pages, Microsoft Windows-Related Multilingual Issues 13640 @node More about locales, Unicode support under Windows, More about code pages, Microsoft Windows-Related Multilingual Issues
13588 @subsection More about locales 13641 @subsection More about locales
13591 In addition to the locale defined by the CRT, Windows (i.e. the Win32 API) 13644 In addition to the locale defined by the CRT, Windows (i.e. the Win32 API)
13592 defines various locales: 13645 defines various locales:
13593 13646
13594 @itemize @bullet 13647 @itemize @bullet
13595 @item 13648 @item
13596 The system-default locale is the locale defined under "Language 13649 The system-default locale is the locale defined under ``Language
13597 settings for the system" in the "Regional Options" control panel. This 13650 settings for the system'' in the ``Regional Options'' control panel. This
13598 is NOT user-specific, and changing it requires a reboot (at least under 13651 is NOT user-specific, and changing it requires a reboot (at least under
13599 Windows 2000). The ANSI code page of the system-default locale is 13652 Windows 2000). The ANSI code page of the system-default locale is
13600 returned by GetACP(), and you can specify this code page in calls 13653 returned by @code{GetACP()}, and you can specify this code page in calls
13601 e.g. to MultiByteToWideChar with the constant CP_ACP. 13654 e.g. to MultiByteToWideChar with the constant CP_ACP.
13602 13655
13603 @item 13656 @item
13604 The user-default locale is the locale defined under "Settings for the 13657 The user-default locale is the locale defined under ``Settings for the
13605 current user" in the "Regional Options" control panel. 13658 current user'' in the ``Regional Options'' control panel.
13606 13659
13607 @item 13660 @item
13608 There is a thread-local locale set by SetThreadLocale. #### What is this 13661 There is a thread-local locale set by SetThreadLocale. #### What is this
13609 used for? 13662 used for?
13610 @end itemize 13663 @end itemize
13611 13664
13612 The Win32 API has a bunch of multibyte functions -- all of those that 13665 The Win32 API has a bunch of multibyte functions -- all of those that
13613 end with ...A(), and on which we spend so much effort in 13666 end with ...@code{A()}, and on which we spend so much effort in
13614 intl-encap-win32.c. These appear to ALWAYS use the ANSI code page of 13667 intl-encap-win32.c. These appear to ALWAYS use the ANSI code page of
13615 the system-default locale (GetACP(), CP_ACP). Note that this applies 13668 the system-default locale (@code{GetACP()}, CP_ACP). Note that this applies
13616 also, for example, to the encoding of filenames in all file-handling 13669 also, for example, to the encoding of filenames in all file-handling
13617 routines, including the CRT ones such as open(), because they pass their 13670 routines, including the CRT ones such as @code{open()}, because they pass their
13618 args unchanged to the Win32 API. 13671 args unchanged to the Win32 API.
13619 13672
13620 @node Unicode support under Windows, The golden rules of writing Unicode-safe code, More about locales, Microsoft Windows-Related Multilingual Issues 13673 @node Unicode support under Windows, The golden rules of writing Unicode-safe code, More about locales, Microsoft Windows-Related Multilingual Issues
13621 @subsection Unicode support under Windows 13674 @subsection Unicode support under Windows
13622 @cindex unicode support under windows 13675 @cindex unicode support under windows
13630 table to convert the characters of that code page to and from Unicode, and 13683 table to convert the characters of that code page to and from Unicode, and
13631 the Win32 API itself probably (perhaps always) uses Unicode internally. 13684 the Win32 API itself probably (perhaps always) uses Unicode internally.
13632 13685
13633 Under Windows there are two different versions of all library routines that 13686 Under Windows there are two different versions of all library routines that
13634 accept or return text, those that handle Unicode text and those handling 13687 accept or return text, those that handle Unicode text and those handling
13635 "multibyte" text, i.e. variable-width ASCII-compatible text in some 13688 ``multibyte'' text, i.e. variable-width ASCII-compatible text in some
13636 national format such as EUC or Shift-JIS. Because Windows 95 basically 13689 national format such as EUC or Shift-JIS. Because Windows 95 basically
13637 doesn't support Unicode but Windows NT does, and Microsoft doesn't provide 13690 doesn't support Unicode but Windows NT does, and Microsoft doesn't provide
13638 any way of writing a single binary that will work on both systems and still 13691 any way of writing a single binary that will work on both systems and still
13639 use Unicode when it's available (although see below, Microsoft Layer for 13692 use Unicode when it's available (although see below, Microsoft Layer for
13640 Unicode), we need to provide a way of run-time conditionalizing so you 13693 Unicode), we need to provide a way of run-time conditionalizing so you
13641 could have one binary for both systems. "Unicode-splitting" refers to 13694 could have one binary for both systems. ``Unicode-splitting'' refers to
13642 writing code that will handle this properly. This means using 13695 writing code that will handle this properly. This means using
13643 Qmswindows_tstr as the external conversion format, calling the appropriate 13696 Qmswindows_tstr as the external conversion format, calling the appropriate
13644 qxe...() Unicode-split version of library functions, and doing other things 13697 qxe...() Unicode-split version of library functions, and doing other things
13645 in certain cases, e.g. when a qxe() function is not present. 13698 in certain cases, e.g. when a @code{qxe()} function is not present.
13646 13699
13647 Unicode support also requires that the various Windows APIs be 13700 Unicode support also requires that the various Windows APIs be
13648 "Unicode-encapsulated", so that they automatically call the ANSI or 13701 ``Unicode-encapsulated'', so that they automatically call the ANSI or
13649 Unicode version of the API call appropriately and handle the size 13702 Unicode version of the API call appropriately and handle the size
13650 differences in structures. What this means is: 13703 differences in structures. What this means is:
13651 13704
13652 @itemize @bullet 13705 @itemize @bullet
13653 @item 13706 @item
13654 first, note that Windows already provides a sort of encapsulation 13707 first, note that Windows already provides a sort of encapsulation
13655 of all APIs that deal with text. All such APIs are underlyingly 13708 of all APIs that deal with text. All such APIs are underlyingly
13656 provided in two versions, with an A or W suffix (ANSI or "wide" 13709 provided in two versions, with an A or W suffix (ANSI or ``wide''
13657 i.e. Unicode), and the compile-time constant UNICODE controls which is 13710 i.e. Unicode), and the compile-time constant UNICODE controls which is
13658 selected by the unsuffixed API. Same thing happens with structures, and 13711 selected by the unsuffixed API. Same thing happens with structures, and
13659 also with types, where the generic types have names beginning with T -- 13712 also with types, where the generic types have names beginning with T --
13660 TCHAR, LPTSTR, etc. Unfortunately, this is compile-time only, not 13713 TCHAR, LPTSTR, etc. Unfortunately, this is compile-time only, not
13661 run-time, so not sufficient. (Creating the necessary run-time encoding 13714 run-time, so not sufficient. (Creating the necessary run-time encoding
13670 such an API available internally.) 13723 such an API available internally.)
13671 13724
13672 @item 13725 @item
13673 what we do is provide an encapsulation of each standard Windows API call 13726 what we do is provide an encapsulation of each standard Windows API call
13674 that is split into A and W versions. current theory is to avoid all 13727 that is split into A and W versions. current theory is to avoid all
13675 preprocessor games; so we name the function with a prefix -- "qxe" 13728 preprocessor games; so we name the function with a prefix -- ``qxe''
13676 currently -- and require callers to use the prefixed name. Callers need 13729 currently -- and require callers to use the prefixed name. Callers need
13677 to explicitly use the W version of all structures, and convert text 13730 to explicitly use the W version of all structures, and convert text
13678 themselves using Qmswindows_tstr. the qxe encapsulated version will 13731 themselves using Qmswindows_tstr. the qxe encapsulated version will
13679 automatically call the appropriate A or W version depending on whether 13732 automatically call the appropriate A or W version depending on whether
13680 we're running on 9x or NT (you can force use of the A calls on NT, 13733 we're running on 9x or NT (you can force use of the A calls on NT,
13730 purpose, to make the code easier to follow for someone who's not familiar 13783 purpose, to make the code easier to follow for someone who's not familiar
13731 with it. until our library is really complete and bug-free, we should 13784 with it. until our library is really complete and bug-free, we should
13732 think twice before doing this. 13785 think twice before doing this.
13733 13786
13734 According to Microsoft documentation, only the following functions are 13787 According to Microsoft documentation, only the following functions are
13735 provided under Windows 9x to support Unicode (see MSDN page "Windows 13788 provided under Windows 9x to support Unicode (see MSDN page ``Windows
13736 95/98/Me General Limitations"): 13789 95/98/Me General Limitations''):
13737 13790
13738 EnumResourceLanguagesW 13791 EnumResourceLanguagesW
13739 EnumResourceNamesW 13792 EnumResourceNamesW
13740 EnumResourceTypesW 13793 EnumResourceTypesW
13741 ExtTextOutW 13794 ExtTextOutW
13752 MessageBoxExW 13805 MessageBoxExW
13753 MultiByteToWideChar 13806 MultiByteToWideChar
13754 TextOutW 13807 TextOutW
13755 WideCharToMultiByte 13808 WideCharToMultiByte
13756 13809
13757 also maybe GetTextExtentExPoint? (KB Q125671 "Unicode Functions Supported 13810 also maybe GetTextExtentExPoint? (KB Q125671 ``Unicode Functions Supported
13758 by Windows 95") 13811 by Windows 95'')
13759 13812
13760 Q210341 says this in addition: 13813 Q210341 says this in addition:
13761 13814
13762 @quotation 13815 @quotation
13763 SUMMARY: 13816 SUMMARY:
13778 range beyond the 256 limitation of a one-byte representation. 13831 range beyond the 256 limitation of a one-byte representation.
13779 13832
13780 The Unicode standard offers application developers an opportunity to 13833 The Unicode standard offers application developers an opportunity to
13781 work with text without the limitations of character set based 13834 work with text without the limitations of character set based
13782 systems. For more information on the Unicode standard see the 13835 systems. For more information on the Unicode standard see the
13783 "References" section of this article. Windows NT is a fully Unicode 13836 ``References'' section of this article. Windows NT is a fully Unicode
13784 capable operating system so it may be desirable to write software that 13837 capable operating system so it may be desirable to write software that
13785 supports Unicode on Windows 95. 13838 supports Unicode on Windows 95.
13786 13839
13787 Even though Windows 95 and Windows 98 are not Unicode based, they do 13840 Even though Windows 95 and Windows 98 are not Unicode based, they do
13788 provide some limited Unicode functionality. Drawing of Unicode text is 13841 provide some limited Unicode functionality. Drawing of Unicode text is
13861 @itemize @bullet 13914 @itemize @bullet
13862 @item 13915 @item
13863 wmain() is completely supported, and appropriate Unicode-formatted argv 13916 wmain() is completely supported, and appropriate Unicode-formatted argv
13864 and envp will always be passed. 13917 and envp will always be passed.
13865 @item 13918 @item
13866 Likewise, wWinMain() is completely supported. (NOTE: The docs are not at 13919 Likewise, @code{wWinMain()} is completely supported. (NOTE: The docs are not at
13867 all clear on how these various entry points interact, and imply that 13920 all clear on how these various entry points interact, and imply that
13868 a windows-subsystem program "must" use WinMain(), while a console- 13921 a windows-subsystem program ``must'' use @code{WinMain()}, while a console-
13869 subsystem program "must" use main(), and a program compiled with UNICODE 13922 subsystem program ``must'' use @code{main()}, and a program compiled with UNICODE
13870 (which we don't, see above) "must" use the w*() versions, while a program 13923 (which we don't, see above) ``must'' use the @code{w*()} versions, while a program
13871 not compiled this way "must" use the plain versions. In fact it appears 13924 not compiled this way ``must'' use the plain versions. In fact it appears
13872 that the CRT provides four different compiler entry points, namely 13925 that the CRT provides four different compiler entry points, namely
13873 w?(main|WinMain)CRTStartup, and we simply choose the one we like using 13926 w?(main|WinMain)CRTStartup, and we simply choose the one we like using
13874 the appropriate link flag.) 13927 the appropriate link flag.)
13875 @item 13928 @item
13876 _wenviron, _wputenv 13929 _wenviron, _wputenv
17888 | +--------------------------------------------------------------------+ | 17941 | +--------------------------------------------------------------------+ |
17889 | | menubar | | 17942 | | menubar | |
17890 | ###################################################################### | 17943 | ###################################################################### |
17891 | # toolbar # | 17944 | # toolbar # |
17892 | #--------------------------------------------------------------------# | 17945 | #--------------------------------------------------------------------# |
17893 | # | gutter | # | 17946 | # | internal border | # |
17894 | # |--------------------------------------------------------------| # | 17947 | # | +----------------------------------------------------------+ | # |
17895 | # | | internal border width | | # | 17948 | # | | gutter | | # |
17896 | # | | ******************************************************** | | # | 17949 | # | |-********************************************************-| | # |
17897 |w# | | * |s|v* |s* | | #w| 17950 |w# | | *@| scrollbar |v* |s* | | #w|
17898 |i# | | * |c|e* |c* | | #i| 17951 |i# | | *-+-------------------------|e* |c* | | #i|
17899 |n# | | * |r|r* |r* | | #n| 17952 |n# | | *s| |r* |r* | | #n|
17900 |d# | | * |o|t* |o* | | #d| 17953 |d# | | *c| |t* |o* | | #d|
17901 |o# | | * text area |l|.* text area |l* | | #o| 17954 |o# | | *r| |.* text area |l* | | #o|
17902 |w# | |i* |l| * |l*i| | #w| 17955 |w# |i| *o| | * |l* |i| #w|
17903 |-# | |n* |b|d* |b*n| | #-| 17956 |-# |n| *l| text area |d* |b* |n| #-|
17904 |m# | |t* |a|i* |a*t| | #m| 17957 |m# |t| *l| |i* |a* |t| #m|
17905 |a# | |.* |r|v* |r*.| | #a| 17958 |a# |e| *b| |v* |r* |e| #a|
17906 |n# t| | *-------------------------+-|i*----------------------+-* | |t #n| 17959 |n# t|r| *a| |i*----------------------+-* |r|t #n|
17907 |a# o|g|b* scrollbar | |d* scrollbar | *b|g|o #a| 17960 |a# o|n|g*r| |d* scrollbar |@*g|n|o #a|
17908 |g# o|u|o*-------------------------+-|e*----------------------+-*o|u|o #g| 17961 |g# o|a|u*-+-------------------------|e*----------------------+-*u|a|o #g|
17909 |e# l|t|r* modeline |r* modeline *r|t|l #e| 17962 |e# l|l|t* modeline |r* modeline *t|l|l #e|
17910 |r# b|t|d********************************************************d|t|b #r| 17963 |r# b| |t********************************************************t| |b #r|
17911 | # a|e|e* =..texttexttex....= |s|v* |s*e|e|a # | 17964 | # a|b|e* =..texttexttex....= |s|v* |s*e|b|a # |
17912 |d# r|r|r*o m=..texttexttextt..=o m|c|e* |c*r|r|r #d| 17965 |d# r|o|r*o m=..texttexttextt..=o m|c|e* |c*r|o|r #d|
17913 |e# | | *u a=.exttexttextte...=u a|r|r* |r* | | #e| 17966 |e# |r| *u a=.exttexttextte...=u a|r|r* |r* |r| #e|
17914 |c# | |w*t r=....texttexttex..=t r|o|t* |o*w| | #c| 17967 |c# |d| *t r=....texttexttex..=t r|o|t* |o* |d| #c|
17915 |o# | |i*s g= etc. =s g|l|.* text area |l*i| | #o| 17968 |o# |e| *s g= etc. =s g|l|.* text area |l* |e| #o|
17916 |r# | |d*i i= =i i|l| * |l*d| | #r| 17969 |r# |r| *i i= =i i|l| * |l* |r| #r|
17917 |a# | |t*d n= =d n|b|d* |b*t| | #a| 17970 |a# | | *d n= =d n|b|d* |b* | | #a|
17918 |t# | |h*e = inner text area =e |a|i* |a*h| | #t| 17971 |t# | | *e = inner text area =e |a|i* |a* | | #t|
17919 |i# | | * = = |r|v* |r* | | #i| 17972 |i# | | * = = |r|v* |r* | | #i|
17920 |o# | | *---===================---+-|i*----------------------+-* | | #o| 17973 |o# | | *---===================---+-|i*----------------------+-* | | #o|
17921 |n# | | * scrollbar | |d* scrollbar | * | | #n| 17974 |n# | | * scrollbar |@|d* scrollbar |@* | | #n|
17922 | # | | *-------------------------+-|e*----------------------+-* | | # | 17975 | # | | *-------------------------+-|e*----------------------+-* | | # |
17923 | # | | * modeline |r* modeline * | | # | 17976 | # | | * modeline |r* modeline * | | # |
17924 | # | | ******************************************************** | | # | 17977 | # | |-********************************************************-| | # |
17925 | # | | * minibuffer * | | # | 17978 | # | | gutter | | # |
17926 | # | | ******************************************************** | | # | 17979 | # | |-********************************************************-| | # |
17927 | # | | internal border width | | # | 17980 | # | |@* minibuffer *@| | # |
17928 | # |--------------------------------------------------------------| # | 17981 | # | +-********************************************************-+ | # |
17929 | # | gutter | # | 17982 | # | internal border | # |
17930 | #--------------------------------------------------------------------# | 17983 | #--------------------------------------------------------------------# |
17931 | # toolbar # | 17984 | # toolbar # |
17932 | ###################################################################### | 17985 | ###################################################################### |
17933 | window manager decoration | 17986 | window manager decoration |
17934 +------------------------------------------------------------------------+ 17987 +------------------------------------------------------------------------+
17935 17988
17936 # = boundary of client area; * = window boundaries, boundary of paned area 17989 # = boundary of client area; * = window boundaries, boundary of paned area
17937 = = boundary of inner text area; . = inside margin area 17990 = = boundary of inner text area; . = inside margin area; @ = dead boxes
17938 @end example 17991 @end example
17939 17992
17940 Note in particular what happens at the corners, where a "corner box" 17993 Note in particular what happens at the corners, where a ``corner box''
17941 occurs. Top and bottom toolbars take precedence over left and right 17994 occurs. Top and bottom toolbars take precedence over left and right
17942 toolbars, extending out horizontally into the corner boxes. Gutters 17995 toolbars, extending out horizontally into the corner boxes. Gutters
17943 work the same way. The corner box where the scrollbars meet, however, 17996 work the same way. The corner box where the scrollbars meet, however,
17944 is assigned to neither scrollbar, and is known as the "dead box"; it is 17997 is assigned to neither scrollbar, and is known as the ``dead box''; it is
17945 an area that must be cleared specially. 17998 an area that must be cleared specially. There are similar dead boxes at
17999 the bottom-right and bottom-left corners where the minibuffer and
18000 left/right gutters meet, but there is currently a bug in that these dead
18001 boxes are not explicitly cleared and may contain junk.
17946 18002
17947 @node The Frame, The Non-Client Area, Intro to Window and Frame Geometry, Window and Frame Geometry 18003 @node The Frame, The Non-Client Area, Intro to Window and Frame Geometry, Window and Frame Geometry
17948 @section The Frame 18004 @section The Frame
17949 18005
17950 The "top-level window area" is the entire area of a top-level window (or 18006 The ``top-level window area'' is the entire area of a top-level window (or
17951 "frame"). The "client area" (a term from MS Windows) is the area of a 18007 ``frame''). The ``client area'' (a term from MS Windows) is the area of a
17952 top-level window that XEmacs draws into and manages with redisplay. 18008 top-level window that XEmacs draws into and manages with redisplay.
17953 This includes the toolbar, scrollbars, gutters, dividers, text area, 18009 This includes the toolbar, scrollbars, gutters, dividers, text area,
17954 modeline and minibuffer. It does not include the menubar, title or 18010 modeline and minibuffer. It does not include the menubar, title or
17955 outer borders. The "non-client area" is the area of a top-level window 18011 outer borders. The ``non-client area'' is the area of a top-level window
17956 outside of the client area and includes the menubar, title and outer 18012 outside of the client area and includes the menubar, title and outer
17957 borders. Internally, all frame coordinates are relative to the client 18013 borders. Internally, all frame coordinates are relative to the client
17958 area. 18014 area.
17959 18015
17960 18016
17967 @item 18023 @item
17968 The outer layer is the window-manager decorations: The title and 18024 The outer layer is the window-manager decorations: The title and
17969 borders. These are controlled by the window manager, a separate process 18025 borders. These are controlled by the window manager, a separate process
17970 that controls the desktop, the location of icons, etc. When a process 18026 that controls the desktop, the location of icons, etc. When a process
17971 tries to create a window, the window manager intercepts this action and 18027 tries to create a window, the window manager intercepts this action and
17972 "reparents" the window, placing another window around it which contains 18028 ``reparents'' the window, placing another window around it which contains
17973 the window decorations, including the title bar, outer borders used for 18029 the window decorations, including the title bar, outer borders used for
17974 resizing, etc. The window manager also implements any actions involving 18030 resizing, etc. The window manager also implements any actions involving
17975 the decorations, such as the ability to resize a window by dragging its 18031 the decorations, such as the ability to resize a window by dragging its
17976 borders, move a window by dragging its title bar, etc. If there is no 18032 borders, move a window by dragging its title bar, etc. If there is no
17977 window manager or you kill it, windows will have no decorations (and 18033 window manager or you kill it, windows will have no decorations (and
17978 will lose them if they previously had any) and you will not be able to 18034 will lose them if they previously had any) and you will not be able to
17979 move or resize them. 18035 move or resize them.
17980 18036
17981 @item 18037 @item
17982 Inside of the window-manager decorations is the "shell", which is 18038 Inside of the window-manager decorations is the ``shell'', which is
17983 managed by the toolkit and widget libraries your program is linked with. 18039 managed by the toolkit and widget libraries your program is linked with.
17984 The code in @file{*-x.c} uses the Xt toolkit and various possible widget 18040 The code in @file{*-x.c} uses the Xt toolkit and various possible widget
17985 libraries built on top of Xt, such as Motif, Athena, the "Lucid" 18041 libraries built on top of Xt, such as Motif, Athena, the ``Lucid''
17986 widgets, etc. Another possibility is GTK (@file{*-gtk.c}), which implements 18042 widgets, etc. Another possibility is GTK (@file{*-gtk.c}), which implements
17987 both the toolkit and widgets. Under Xt, the "shell" window is an 18043 both the toolkit and widgets. Under Xt, the ``shell'' window is an
17988 EmacsShell widget, containing an EmacsManager widget of the same size, 18044 EmacsShell widget, containing an EmacsManager widget of the same size,
17989 which in turn contains a menubar widget and an EmacsFrame widget, inside 18045 which in turn contains a menubar widget and an EmacsFrame widget, inside
17990 of which is the client area. (The division into EmacsShell and 18046 of which is the client area. (The division into EmacsShell and
17991 EmacsManager is due to the complex and screwy geometry-management system 18047 EmacsManager is due to the complex and screwy geometry-management system
17992 in Xt [and X more generally]. The EmacsShell handles negotiation with 18048 in Xt [and X more generally]. The EmacsShell handles negotiation with
17998 18054
17999 Under Windows, the non-client area is managed by the window system. 18055 Under Windows, the non-client area is managed by the window system.
18000 There is no division such as under X. Part of the window-system API 18056 There is no division such as under X. Part of the window-system API
18001 (@file{USER.DLL}) of Win32 includes functions to control the menubars, title, 18057 (@file{USER.DLL}) of Win32 includes functions to control the menubars, title,
18002 etc. and implements the move and resize behavior. There @strong{is} an 18058 etc. and implements the move and resize behavior. There @strong{is} an
18003 equivalent of the window manager, called the "shell", but it manages 18059 equivalent of the window manager, called the ``shell'', but it manages
18004 only the desktop, not the windows themselves. The normal shell under 18060 only the desktop, not the windows themselves. The normal shell under
18005 Windows is @file{EXPLORER.EXE}; if you kill this, you will lose the bar 18061 Windows is @file{EXPLORER.EXE}; if you kill this, you will lose the bar
18006 containing the "Start" menu and tray and such, but the windows 18062 containing the ``Start'' menu and tray and such, but the windows
18007 themselves will not be affected or lose their decorations. 18063 themselves will not be affected or lose their decorations.
18008 18064
18009 18065
18010 @node The Client Area, The Paned Area, The Non-Client Area, Window and Frame Geometry 18066 @node The Client Area, The Paned Area, The Non-Client Area, Window and Frame Geometry
18011 @section The Client Area 18067 @section The Client Area
18012 18068
18013 Inside of the client area are the toolbars, the gutters (where the buffer 18069 Inside of the client area are the toolbars, the gutters (where the buffer
18014 tabs are displayed), the minibuffer, the internal border width, and one 18070 tabs are displayed), the minibuffer, the internal border width, and one
18015 or more non-overlapping "windows" (this is old Emacs terminology, from 18071 or more non-overlapping ``windows'' (this is old Emacs terminology, from
18016 before the time when frames existed at all; the standard terminology for 18072 before the time when frames existed at all; the standard terminology for
18017 this would be "pane"). Each window can contain a modeline, horizontal 18073 this would be ``pane''). Each window can contain a modeline, horizontal
18018 and/or vertical scrollbars, and (for non-rightmost windows) a vertical 18074 and/or vertical scrollbars, and (for non-rightmost windows) a vertical
18019 divider, surrounding a text area. 18075 divider, surrounding a text area.
18020 18076
18021 The dimensions of the toolbars and gutters are determined by the formula 18077 The dimensions of the toolbars and gutters are determined by the formula
18022 (THICKNESS + 2 * BORDER-THICKNESS), where "thickness" is a cover term 18078 (THICKNESS + 2 * BORDER-THICKNESS), where ``thickness'' is a cover term
18023 for height or width, as appropriate. The height and width come from 18079 for height or width, as appropriate. The height and width come from
18024 @code{default-toolbar-height} and @code{default-toolbar-width} and the specific 18080 @code{default-toolbar-height} and @code{default-toolbar-width} and the specific
18025 versions of these (@code{top-toolbar-height}, @code{left-toolbar-width}, etc.). 18081 versions of these (@code{top-toolbar-height}, @code{left-toolbar-width}, etc.).
18026 The border thickness comes from @code{default-toolbar-border-height} and 18082 The border thickness comes from @code{default-toolbar-border-height} and
18027 @code{default-toolbar-border-width}, and the specific versions of these. The 18083 @code{default-toolbar-border-width}, and the specific versions of these. The
18042 18098
18043 18099
18044 @node The Paned Area, Text Areas, The Client Area, Window and Frame Geometry 18100 @node The Paned Area, Text Areas, The Client Area, Window and Frame Geometry
18045 @section The Paned Area 18101 @section The Paned Area
18046 18102
18047 The area occupied by the "windows" is called the paned area. Note that 18103 The area occupied by the ``windows'' is called the paned area.
18048 this includes the minibuffer, which is just another window but is 18104 Unfortunately, because of the presence of the gutter @strong{between} the
18049 special-cased in XEmacs. Each window can include a horizontal and/or 18105 minibuffer and other windows, the bottom of the paned area is not
18050 vertical scrollbar, a modeline and a vertical divider to its right, as 18106 well-defined -- does it include the minibuffer (in which case it also
18051 well as the text area. Only non-rightmost windows can include a 18107 includes the bottom gutter, but none others) or does it not include
18052 vertical divider. (The minibuffer normally does not include either 18108 the minibuffer? (In which case not all windows are included.) It would
18053 modeline or scrollbars.) 18109 be cleaner to put the bottom gutter @strong{below} the minibuffer instead of
18110 above it.
18111
18112 Each window can include a horizontal and/or vertical scrollbar, a
18113 modeline and a vertical divider to its right, as well as the text area.
18114 Only non-rightmost windows can include a vertical divider. (The
18115 minibuffer normally does not include either modeline or scrollbars.)
18054 18116
18055 Note that, because the toolbars and gutters are controlled by 18117 Note that, because the toolbars and gutters are controlled by
18056 specifiers, and specifiers can have window-specific and buffer-specific 18118 specifiers, and specifiers can have window-specific and buffer-specific
18057 values, the size of the paned area can change depending on which window 18119 values, the size of the paned area can change depending on which window
18058 is selected: In other words, if the selected window or buffer changes, 18120 is selected: In other words, if the selected window or buffer changes,
18071 @code{horizontal-scrollbar-visible-p}, @code{vertical-scrollbar-visible-p}, 18133 @code{horizontal-scrollbar-visible-p}, @code{vertical-scrollbar-visible-p},
18072 @code{vertical-divider-always-visible-p}, etc. 18134 @code{vertical-divider-always-visible-p}, etc.
18073 18135
18074 In addition, it is possible to set margins in the text area using the 18136 In addition, it is possible to set margins in the text area using the
18075 specifiers @code{left-margin-width} and @code{right-margin-width}. When this is 18137 specifiers @code{left-margin-width} and @code{right-margin-width}. When this is
18076 done, only the "inner text area" (the area inside of the margins) will 18138 done, only the ``inner text area'' (the area inside of the margins) will
18077 be used for normal display of text; the margins will be used for glyphs 18139 be used for normal display of text; the margins will be used for glyphs
18078 with a layout policy of @code{outside-margin} (as set on an extent containing 18140 with a layout policy of @code{outside-margin} (as set on an extent containing
18079 the glyph by @code{set-extent-begin-glyph-layout} or 18141 the glyph by @code{set-extent-begin-glyph-layout} or
18080 @code{set-extent-end-glyph-layout}). However, the calculation of the text 18142 @code{set-extent-end-glyph-layout}). However, the calculation of the text
18081 area size (e.g. in the function @code{window-text-area-width}) includes the 18143 area size (e.g. in the function @code{window-text-area-width}) includes the
18082 margins. Which margin is used depends on whether a glyph has been set 18144 margins. Which margin is used depends on whether a glyph has been set
18083 as the begin-glyph or end-glyph of an extent (@code{set-extent-begin-glyph} 18145 as the begin-glyph or end-glyph of an extent (@code{set-extent-begin-glyph}
18084 etc.), using the left and right margins, respectively. 18146 etc.), using the left and right margins, respectively.
18085 18147
18086 Technically, the margins outside of the inner text area are known as the 18148 Technically, the margins outside of the inner text area are known as the
18087 "outside margins". The "inside margins" are in the inner text area and 18149 ``outside margins''. The ``inside margins'' are in the inner text area and
18088 constitute the whitespace between the outside margins and the first or 18150 constitute the whitespace between the outside margins and the first or
18089 last non-whitespace character in a line; their width can vary from line 18151 last non-whitespace character in a line; their width can vary from line
18090 to line. Glyphs will be placed in the inside margin if their layout 18152 to line. Glyphs will be placed in the inside margin if their layout
18091 policy is @code{inside-margin} or @code{whitespace}, with @code{whitespace} glyphs on 18153 policy is @code{inside-margin} or @code{whitespace}, with @code{whitespace} glyphs on
18092 the inside and @code{inside-margin} glyphs on the outside. Inside-margin 18154 the inside and @code{inside-margin} glyphs on the outside. Inside-margin
18097 18159
18098 18160
18099 @node The Displayable Area, Which Functions Use Which?, Text Areas, Window and Frame Geometry 18161 @node The Displayable Area, Which Functions Use Which?, Text Areas, Window and Frame Geometry
18100 @section The Displayable Area 18162 @section The Displayable Area
18101 18163
18102 The "displayable area" is not so much an actual area as a convenient 18164 The ``displayable area'' is not so much an actual area as a convenient
18103 fiction. It is the area used to convert between pixel and character 18165 fiction. It is the area used to convert between pixel and character
18104 dimensions for frames. The character dimensions for a frame (e.g. as 18166 dimensions for frames. The character dimensions for a frame (e.g. as
18105 returned by @code{frame-width} and @code{frame-height} and set by 18167 returned by @code{frame-width} and @code{frame-height} and set by
18106 @code{set-frame-width} and @code{set-frame-height}) are determined from the 18168 @code{set-frame-width} and @code{set-frame-height}) are determined from the
18107 displayable area by dividing by the pixel size of the default font as 18169 displayable area by dividing by the pixel size of the default font as
18108 instantiated in the frame. (For proportional fonts, the "average" width 18170 instantiated in the frame. (For proportional fonts, the ``average'' width
18109 is used. Under Windows, this is a built-in property of the fonts. 18171 is used. Under Windows, this is a built-in property of the fonts.
18110 Under X, this is based on the width of the lowercase 'n', or if this is 18172 Under X, this is based on the width of the lowercase 'n', or if this is
18111 zero then the width of the default character. [We prefer 'n' to the 18173 zero then the width of the default character. [We prefer 'n' to the
18112 specified default character because many X fonts have a default 18174 specified default character because many X fonts have a default
18113 character with a zero or otherwise non-representative width.]) 18175 character with a zero or otherwise non-representative width.])
18114 18176
18115 The displayable area is essentially the "theoretical" paned area of the 18177 The displayable area is essentially the ``theoretical'' gutter area of the
18116 frame excluding the rightmost and bottom-most scrollbars. In this 18178 frame, excluding the rightmost and bottom-most scrollbars. That is, it
18117 context, "theoretical" means that all calculations are based on 18179 starts from the client (or ``total'') area and then excludes the
18118 frame-level values for toolbar, gutter and scrollbar thicknesses. 18180 ``theoretical'' toolbars and bottom-most/rightmost scrollbars, and the
18119 Because these thicknesses are controlled by specifiers, and specifiers 18181 internal border width. In this context, ``theoretical'' means that all
18120 can have window-specific and buffer-specific values, these calculations 18182 calculations are based on frame-level values for toolbar and scrollbar
18121 may or may not reflect the actual size of the paned area or of the 18183 thicknesses. Because these thicknesses are controlled by specifiers,
18122 scrollbars when any particular window is selected. Note also that the 18184 and specifiers can have window-specific and buffer-specific values,
18123 "displayable area" may not even be contiguous! In particular, if the 18185 these calculations may or may not reflect the actual size of the paned
18124 frame-level value of the horizontal scrollbar height is non-zero, then 18186 area or of the scrollbars when any particular window is selected. Note
18125 the displayable area includes the paned area above and below the bottom 18187 also that the ``displayable area'' may not even be contiguous! In
18126 horizontal scrollbar but not the scrollbar itself. 18188 particular, the gutters are included, but the bottom-most and rightmost
18189 scrollbars are excluded even though they are inside of the gutters.
18190 Furthermore, if the frame-level value of the horizontal scrollbar height
18191 is non-zero, then the displayable area includes the paned area above and
18192 below the bottom horizontal scrollbar (i.e. the modeline and minibuffer)
18193 but not the scrollbar itself.
18127 18194
18128 As a further twist, the character-dimension calculations are adjusted so 18195 As a further twist, the character-dimension calculations are adjusted so
18129 that the truncation and continuation glyphs (see @code{truncation-glyph} and 18196 that the truncation and continuation glyphs (see @code{truncation-glyph} and
18130 @code{continuation-glyph}) count as a single character even if they are wider 18197 @code{continuation-glyph}) count as a single character even if they are wider
18131 than the default font width. (Technically, the character width is 18198 than the default font width. (Technically, the character width is
18134 width before dividing by the default-font width, and then adding 1 to 18201 width before dividing by the default-font width, and then adding 1 to
18135 the result.) (The ultimate motivation for this kludge as well as the 18202 the result.) (The ultimate motivation for this kludge as well as the
18136 subtraction of the scrollbars, but not the minibuffer or bottom-most 18203 subtraction of the scrollbars, but not the minibuffer or bottom-most
18137 modeline, is to maintain compatibility with TTYs.) 18204 modeline, is to maintain compatibility with TTYs.)
18138 18205
18139 Despite all these concerns and kludges, however, the "displayable area" 18206 Despite all these concerns and kludges, however, the ``displayable area''
18140 concept works well in practice and mostly ensures that by default the 18207 concept works well in practice and mostly ensures that by default the
18141 frame will actually fit 79 characters + continuation/truncation glyph. 18208 frame will actually fit 79 characters + continuation/truncation glyph.
18142 18209
18143 18210
18144 @node Which Functions Use Which?, , The Displayable Area, Window and Frame Geometry 18211 @node Which Functions Use Which?, , The Displayable Area, Window and Frame Geometry
19783 @section Event Queues 19850 @section Event Queues
19784 @cindex event queues 19851 @cindex event queues
19785 @cindex queues, event 19852 @cindex queues, event
19786 19853
19787 There are two event queues here -- the command event queue (#### which 19854 There are two event queues here -- the command event queue (#### which
19788 should be called "deferred event queue" and is in my glyph ws) and the 19855 should be called ``deferred event queue'' and is in my glyph ws) and the
19789 dispatch event queue. (MS Windows actually has an extra dispatch queue 19856 dispatch event queue. (MS Windows actually has an extra dispatch queue
19790 for non-user events and uses the generic one only for user events. This 19857 for non-user events and uses the generic one only for user events. This
19791 is because user and non-user events in Windows come through the same 19858 is because user and non-user events in Windows come through the same
19792 place -- the window procedure -- but under X, it's possible to 19859 place -- the window procedure -- but under X, it's possible to
19793 selectively process events such that we take all the user events before 19860 selectively process events such that we take all the user events before
19888 19955
19889 @item handle_magic_event_cb 19956 @item handle_magic_event_cb
19890 XEmacs calls this with an event structure which contains window-system 19957 XEmacs calls this with an event structure which contains window-system
19891 dependent information that XEmacs doesn't need to know about, but which 19958 dependent information that XEmacs doesn't need to know about, but which
19892 must happen in order. If the @code{next_event_cb} never returns an 19959 must happen in order. If the @code{next_event_cb} never returns an
19893 event of type "magic", this will never be used. 19960 event of type ``magic'', this will never be used.
19894 19961
19895 @item format_magic_event_cb 19962 @item format_magic_event_cb
19896 Called with a magic event; print a representation of the innards of the 19963 Called with a magic event; print a representation of the innards of the
19897 event to @var{PSTREAM}. 19964 event to @var{PSTREAM}.
19898 19965
19920 @item select_process_cb 19987 @item select_process_cb
19921 @item unselect_process_cb 19988 @item unselect_process_cb
19922 These callbacks tell the underlying implementation to add or remove a 19989 These callbacks tell the underlying implementation to add or remove a
19923 file descriptor from the list of fds which are polled for 19990 file descriptor from the list of fds which are polled for
19924 inferior-process input. When input becomes available on the given 19991 inferior-process input. When input becomes available on the given
19925 process connection, an event of type "process" should be generated. 19992 process connection, an event of type ``process'' should be generated.
19926 19993
19927 @item select_console_cb 19994 @item select_console_cb
19928 @item unselect_console_cb 19995 @item unselect_console_cb
19929 These callbacks tell the underlying implementation to add or remove a 19996 These callbacks tell the underlying implementation to add or remove a
19930 console from the list of consoles which are polled for user-input. 19997 console from the list of consoles which are polled for user-input.
20048 @cindex focus handling 20115 @cindex focus handling
20049 20116
20050 Ben's capsule lecture on focus: 20117 Ben's capsule lecture on focus:
20051 20118
20052 In GNU Emacs @code{select-frame} never changes the window-manager frame 20119 In GNU Emacs @code{select-frame} never changes the window-manager frame
20053 focus. All it does is change the "selected frame". This is similar to 20120 focus. All it does is change the ``selected frame''. This is similar to
20054 what happens when we call @code{select-device} or @code{select-console}. 20121 what happens when we call @code{select-device} or @code{select-console}.
20055 Whenever an event comes in (including a keyboard event), its frame is 20122 Whenever an event comes in (including a keyboard event), its frame is
20056 selected; therefore, evaluating @code{select-frame} in @samp{*scratch*} 20123 selected; therefore, evaluating @code{select-frame} in @samp{*scratch*}
20057 won't cause any effects because the next received event (in the same 20124 won't cause any effects because the next received event (in the same
20058 frame) will cause a switch back to the frame displaying 20125 frame) will cause a switch back to the frame displaying
20083 minibuffer, you essentially want to temporarily switch the WM focus to 20150 minibuffer, you essentially want to temporarily switch the WM focus to
20084 the frame with the minibuffer, and switch it back when you exit the 20151 the frame with the minibuffer, and switch it back when you exit the
20085 minibuffer. 20152 minibuffer.
20086 20153
20087 GNU Emacs solves this with the crockish @code{redirect-frame-focus}, 20154 GNU Emacs solves this with the crockish @code{redirect-frame-focus},
20088 which says "for keyboard events received from FRAME, act like they're 20155 which says ``for keyboard events received from FRAME, act like they're
20089 coming from FOCUS-FRAME". I think what this means is that, when a 20156 coming from FOCUS-FRAME''. I think what this means is that, when a
20090 keyboard event comes in and the event manager is about to select the 20157 keyboard event comes in and the event manager is about to select the
20091 event's frame, if that frame has its focus redirected, the redirected-to 20158 event's frame, if that frame has its focus redirected, the redirected-to
20092 frame is selected instead. That way, if you're in a minibufferless 20159 frame is selected instead. That way, if you're in a minibufferless
20093 frame and enter the minibuffer, then all Lisp functions that run see the 20160 frame and enter the minibuffer, then all Lisp functions that run see the
20094 selected frame as the minibuffer's frame rather than the minibufferless 20161 selected frame as the minibuffer's frame rather than the minibufferless
20098 There's also some weird logic that switches the redirected frame focus 20165 There's also some weird logic that switches the redirected frame focus
20099 from one frame to another if Lisp code explicitly calls 20166 from one frame to another if Lisp code explicitly calls
20100 @code{select-frame} (but not if @code{handle-switch-frame} is called), 20167 @code{select-frame} (but not if @code{handle-switch-frame} is called),
20101 and saves and restores the frame focus in window configurations, 20168 and saves and restores the frame focus in window configurations,
20102 etc. etc. All of this logic is heavily @code{#if 0}'d, with lots of 20169 etc. etc. All of this logic is heavily @code{#if 0}'d, with lots of
20103 comments saying "No, this approach doesn't seem to work, so I'm trying 20170 comments saying ``No, this approach doesn't seem to work, so I'm trying
20104 this ... is it reasonable? Well, I'm not sure ..." that are a red flag 20171 this ... is it reasonable? Well, I'm not sure ...'' that are a red flag
20105 indicating crockishness. 20172 indicating crockishness.
20106 20173
20107 Because of our way of doing things, we can avoid all this crock. 20174 Because of our way of doing things, we can avoid all this crock.
20108 Keyboard events never cause a select-frame (who cares what frame they're 20175 Keyboard events never cause a select-frame (who cares what frame they're
20109 associated with? They come from a console, only). We change the actual 20176 associated with? They come from a console, only). We change the actual
24882 return value should be an alist consisting of a list of all of the 24949 return value should be an alist consisting of a list of all of the
24883 defined subtypes for that coding system type along with a level of 24950 defined subtypes for that coding system type along with a level of
24884 likelihood and a list of additional properties indicating certain 24951 likelihood and a list of additional properties indicating certain
24885 features detected in the data. The extra properties returned are 24952 features detected in the data. The extra properties returned are
24886 defined entirely by the particular coding system type and are used 24953 defined entirely by the particular coding system type and are used
24887 only in the algorithm described below under "user control." However, 24954 only in the algorithm described below under ``user control.'' However,
24888 the levels of likelihood have a standard meaning as follows: 24955 the levels of likelihood have a standard meaning as follows:
24889 24956
24890 Level 4 means "near certainty" and typically indicates that a 24957 Level 4 means ``near certainty'' and typically indicates that a
24891 signature has been detected, usually at the beginning of the data, 24958 signature has been detected, usually at the beginning of the data,
24892 indicating that the data is encoded in this particular coding system 24959 indicating that the data is encoded in this particular coding system
24893 type. An example of this would be the byte order mark at the beginning 24960 type. An example of this would be the byte order mark at the beginning
24894 of UCS2 encoded data or the GZIP mark at the beginning of GZIP data. 24961 of UCS2 encoded data or the GZIP mark at the beginning of GZIP data.
24895 24962
24896 Level 3 means "highly likely" and indicates that tell-tale signs have 24963 Level 3 means ``highly likely'' and indicates that tell-tale signs have
24897 been discovered in the data that are characteristic of this particular 24964 been discovered in the data that are characteristic of this particular
24898 coding system type. Examples of this might be ISO 2022 escape 24965 coding system type. Examples of this might be ISO 2022 escape
24899 sequences or the current Unicode end of line markers at regular 24966 sequences or the current Unicode end of line markers at regular
24900 intervals. 24967 intervals.
24901 24968
24902 Level 2 means "strongly statistically likely" indicating that 24969 Level 2 means ``strongly statistically likely'' indicating that
24903 statistical analysis concludes that there's a high chance that this 24970 statistical analysis concludes that there's a high chance that this
24904 data is encoded according to this particular type. For example, this 24971 data is encoded according to this particular type. For example, this
24905 might mean that for UCS2 data, there is a high proportion of null bytes 24972 might mean that for UCS2 data, there is a high proportion of null bytes
24906 or other repeated bytes in the odd-numbered bytes of the data and a 24973 or other repeated bytes in the odd-numbered bytes of the data and a
24907 high variance in the even-numbered bytes of the data. For Shift-JIS, 24974 high variance in the even-numbered bytes of the data. For Shift-JIS,
24908 this might indicate that there were no illegal Shift-JIS sequences 24975 this might indicate that there were no illegal Shift-JIS sequences
24909 and a fairly high occurrence of common Shift-JIS characters. 24976 and a fairly high occurrence of common Shift-JIS characters.
24910 24977
24911 Level 1 means "weak statistical likelihood" meaning that there is some 24978 Level 1 means ``weak statistical likelihood'' meaning that there is some
24912 indication that the data is encoded in this coding system type. In 24979 indication that the data is encoded in this coding system type. In
24913 fact, there is a reasonable chance that it may be some other type as 24980 fact, there is a reasonable chance that it may be some other type as
24914 well. This means, for example, that no illegal sequences were 24981 well. This means, for example, that no illegal sequences were
24915 encountered and at least some data was encountered that is purposely 24982 encountered and at least some data was encountered that is purposely
24916 not in other coding system types. For Shift-JIS data, this might mean 24983 not in other coding system types. For Shift-JIS data, this might mean
24917 that some bytes in the range 128 to 159 were encountered in the data. 24984 that some bytes in the range 128 to 159 were encountered in the data.
24918 24985
24919 Level 0 means "neutral" which is to say that there's either not enough 24986 Level 0 means ``neutral'' which is to say that there's either not enough
24920 data to make any decision or that the data could well be interpreted 24987 data to make any decision or that the data could well be interpreted
24921 as this type (meaning no illegal sequences), but there is little or no 24988 as this type (meaning no illegal sequences), but there is little or no
24922 indication of anything particular to this particular type. 24989 indication of anything particular to this particular type.
24923 24990
24924 Level -1 means "weakly unlikely" meaning that some data was 24991 Level -1 means ``weakly unlikely'' meaning that some data was
24925 encountered that could conceivably be part of the coding system type 24992 encountered that could conceivably be part of the coding system type
24926 but is probably not. For example, excessively long line-lengths or 24993 but is probably not. For example, excessively long line-lengths or
24927 very rarely-encountered sequences. 24994 very rarely-encountered sequences.
24928 24995
24929 Level -2 means "strongly unlikely" meaning that typically a number 24996 Level -2 means ``strongly unlikely'' meaning that typically a number
24930 of illegal sequences were encountered. 24997 of illegal sequences were encountered.
24931 24998
24932 The algorithm to determine when to stop and indicate that the data has 24999 The algorithm to determine when to stop and indicate that the data has
24933 been detected as a particular coding system uses a priority list, 25000 been detected as a particular coding system uses a priority list,
24934 which is typically specified as part of the language environment 25001 which is typically specified as part of the language environment
24943 Japanese-language environment particular subtypes of ISO 2022 will be 25010 Japanese-language environment particular subtypes of ISO 2022 will be
24944 associated with the Japanese coding system version of those 25011 associated with the Japanese coding system version of those
24945 subtypes). It is perfectly legal, and in fact quite common, to list the 25012 subtypes). It is perfectly legal, and in fact quite common, to list the
24946 same subtype more than once in the priority list with successively 25013 same subtype more than once in the priority list with successively
24947 lower requirements. Other facts that can be listed in the priority 25014 lower requirements. Other facts that can be listed in the priority
24948 list for a subtype are "reject", meaning that the data should never be 25015 list for a subtype are ``reject'', meaning that the data should never be
24949 detected as this subtype, or "ask", meaning that if the data is 25016 detected as this subtype, or ``ask'', meaning that if the data is
24950 detected to be this subtype, the user will be asked whether they 25017 detected to be this subtype, the user will be asked whether they
24951 actually mean this. This latter property could be used, for example, 25018 actually mean this. This latter property could be used, for example,
24952 towards the bottom of the priority list. 25019 towards the bottom of the priority list.
24953 25020
24954 In addition there is a global variable which specifies the minimum 25021 In addition there is a global variable which specifies the minimum
24961 system, the subtype, the coding system and the associated level of 25028 system, the subtype, the coding system and the associated level of
24962 likelihood will be prominently displayed either in the echo area or in 25029 likelihood will be prominently displayed either in the echo area or in
24963 a status box somewhere. 25030 a status box somewhere.
24964 25031
24965 If no positive match is found according to the priority list, or if 25032 If no positive match is found according to the priority list, or if
24966 the matches that are found have the "ask" property on them, then the 25033 the matches that are found have the ``ask'' property on them, then the
24967 user will be presented with a list of choices of possible encodings 25034 user will be presented with a list of choices of possible encodings
24968 and asked to choose one. This list is typically sorted first by level 25035 and asked to choose one. This list is typically sorted first by level
24969 of likelihood, and then within this, by the order in which the 25036 of likelihood, and then within this, by the order in which the
24970 subtypes appear in the priority list. This list is displayed in a 25037 subtypes appear in the priority list. This list is displayed in a
24971 special kind of dialog box or other buffer allowing the user, in 25038 special kind of dialog box or other buffer allowing the user, in
24978 will be in the form of errors or warnings of various levels, some of 25045 will be in the form of errors or warnings of various levels, some of
24979 which may be severe enough to stop the decoding entirely, and some of 25046 which may be severe enough to stop the decoding entirely, and some of
24980 which may either indicate definitely malformed data but from which 25047 which may either indicate definitely malformed data but from which
24981 it's possible to recover, or simply data that appears rather 25048 it's possible to recover, or simply data that appears rather
24982 questionable. If any of these status values are reported during 25049 questionable. If any of these status values are reported during
24983 decoding, the user will be informed of this and asked "are you sure?" 25050 decoding, the user will be informed of this and asked ``are you sure?''
24984 As part of the "are you sure" dialog box or question, the user can 25051 As part of the ``are you sure'' dialog box or question, the user can
24985 display the results of the decoding to make sure it's correct. If the 25052 display the results of the decoding to make sure it's correct. If the
24986 user says "no, they're not sure," then the same list of choices as 25053 user says ``no, they're not sure,'' then the same list of choices as
24987 previously mentioned will be presented. 25054 previously mentioned will be presented.
24988 25055
24989 @subheading RFC: Autodetection 25056 @subheading RFC: Autodetection
24990 25057
24991 Also appeared under heading "Implementation of Coding System Priority 25058 Also appeared under heading "Implementation of Coding System Priority
25201 25268
25202 @enumerate 25269 @enumerate
25203 @item 25270 @item
25204 Hopefully a system general enough to handle (2)--(4) will 25271 Hopefully a system general enough to handle (2)--(4) will
25205 handle these, too, but we should watch out for gotchas like 25272 handle these, too, but we should watch out for gotchas like
25206 Unicode "plane 14" tags which (I think _both_ Ben and Olivier 25273 Unicode ``plane 14'' tags which (I think _both_ Ben and Olivier
25207 will agree) have no place in the internal representation, and 25274 will agree) have no place in the internal representation, and
25208 thus must be treated as out-of-band control sequences. I 25275 thus must be treated as out-of-band control sequences. I
25209 don't know if all such gotchas will be as easy to dispose of. 25276 don't know if all such gotchas will be as easy to dispose of.
25210 25277
25211 @item 25278 @item
25242 25309
25243 sly, it can't be perfect if any autodecoding is done; 25310 sly, it can't be perfect if any autodecoding is done;
25244 like Hrvoje should have an easily available option to 25311 like Hrvoje should have an easily available option to
25245 to this default (or an optimized approximation which 25312 to this default (or an optimized approximation which
25246 t actually read the whole file into a buffer) or simply 25313 t actually read the whole file into a buffer) or simply
25247 y everything as binary (with the "font" for binary files 25314 y everything as binary (with the ``font'' for binary files
25248 a user option). 25315 a user option).
25249 25316
25250 @item 25317 @item
25251 This implies that we should be detecting conditions in the 25318 This implies that we should be detecting conditions in the
25252 tail of the file which violate the implicit assumptions of the 25319 tail of the file which violate the implicit assumptions of the
25351 25418
25352 Date: 11/1/1999 7:24 AM 25419 Date: 11/1/1999 7:24 AM
25353 25420
25354 Stephen, thank you very much for writing this up. I think it is a good start, 25421 Stephen, thank you very much for writing this up. I think it is a good start,
25355 and definitely moving in the direction I would like to see things going: more 25422 and definitely moving in the direction I would like to see things going: more
25356 proposals, less arguing. (aka "more light, less heat") However, I have some 25423 proposals, less arguing. (aka ``more light, less heat'') However, I have some
25357 suggestions for cleaning this up: 25424 suggestions for cleaning this up:
25358 25425
25359 You should try to make it more layered. For example, you might have one 25426 You should try to make it more layered. For example, you might have one
25360 section devoted to the workings of autodetection, which starts out like this 25427 section devoted to the workings of autodetection, which starts out like this
25361 (the section numbers below are totally arbitrary): 25428 (the section numbers below are totally arbitrary):