comparison src/text.c @ 826:6728e641994e

[xemacs-hg @ 2002-05-05 11:30:15 by ben] syntax cache, 8-bit-format, lots of code cleanup README.packages: Update info about --package-path. i.c: Create an inheritable event and pass it on to XEmacs, so that ^C can be handled properly. Intercept ^C and signal the event. "Stop Build" in VC++ now works. bytecomp-runtime.el: Doc string changes. compat.el: Some attempts to redo this to make it truly useful and fix the "multiple versions interacting with each other" problem. Not yet done. Currently doesn't work. files.el: Use with-obsolete-variable to avoid warnings in new revert-buffer code. xemacs.mak: Split up CFLAGS into a version without flags specifying the C library. The problem seems to be that minitar depends on zlib, which depends specifically on libc.lib, not on any of the other C libraries. Unless you compile with libc.lib, you get errors -- specifically, no _errno in the other libraries, which must make it something other than an int. (#### But this doesn't seem to obtain in XEmacs, which also uses zlib, and can be linked with any of the C libraries. Maybe zlib is used differently and doesn't need errno, or maybe XEmacs provides an int errno; ... I don't understand. Makefile.in.in: Fix so that packages are around when testing. 
abbrev.c, alloc.c, buffer.c, buffer.h, bytecode.c, callint.c, casefiddle.c, casetab.c, casetab.h, charset.h, chartab.c, chartab.h, cmds.c, console-msw.h, console-stream.c, console-x.c, console.c, console.h, data.c, device-msw.c, device.c, device.h, dialog-msw.c, dialog-x.c, dired-msw.c, dired.c, doc.c, doprnt.c, dumper.c, editfns.c, elhash.c, emacs.c, eval.c, event-Xt.c, event-gtk.c, event-msw.c, event-stream.c, events.c, events.h, extents.c, extents.h, faces.c, file-coding.c, file-coding.h, fileio.c, fns.c, font-lock.c, frame-gtk.c, frame-msw.c, frame-x.c, frame.c, frame.h, glade.c, glyphs-gtk.c, glyphs-msw.c, glyphs-msw.h, glyphs-x.c, glyphs.c, glyphs.h, gui-msw.c, gui-x.c, gui.h, gutter.h, hash.h, indent.c, insdel.c, intl-win32.c, intl.c, keymap.c, lisp-disunion.h, lisp-union.h, lisp.h, lread.c, lrecord.h, lstream.c, lstream.h, marker.c, menubar-gtk.c, menubar-msw.c, menubar-x.c, menubar.c, minibuf.c, mule-ccl.c, mule-charset.c, mule-coding.c, mule-wnnfns.c, nas.c, objects-msw.c, objects-x.c, opaque.c, postgresql.c, print.c, process-nt.c, process-unix.c, process.c, process.h, profile.c, rangetab.c, redisplay-gtk.c, redisplay-msw.c, redisplay-output.c, redisplay-x.c, redisplay.c, redisplay.h, regex.c, regex.h, scrollbar-msw.c, search.c, select-x.c, specifier.c, specifier.h, symbols.c, symsinit.h, syntax.c, syntax.h, syswindows.h, tests.c, text.c, text.h, tooltalk.c, ui-byhand.c, ui-gtk.c, unicode.c, win32.c, window.c: Another big Ben patch. -- FUNCTIONALITY CHANGES: add partial support for 8-bit-fixed, 16-bit-fixed, and 32-bit-fixed formats. not quite done yet. (in particular, needs functions to actually convert the buffer.) NOTE: lots of changes to regex.c here. also, many new *_fmt() inline funs that take an Internal_Format argument. redo syntax cache code. make the cache per-buffer; keep the cache valid across calls to functions that use it. also keep it valid across insertions/deletions and extent changes, as much as is possible. 
eliminate the junky regex-reentrancy code by passing in the relevant lisp info to the regex routines as local vars. add general mechanism in extents code for signalling extent changes. fix numerous problems with the case-table implementation; yoshiki never properly transferred many algorithms from old-style to new-style case tables. redo char tables to support a default argument, so that mapping only occurs over changed args. change many chartab functions to accept Lisp_Object instead of Lisp_Char_Table *. comment out the code in font-lock.c by default, because font-lock.el no longer uses it. we should consider eliminating it entirely. Don't output bell as ^G in console-stream when not a TTY. add -mswindows-termination-handle to interface with i.c, so we can properly kill a build. add more error-checking to buffer/string macros. add some additional buffer_or_string_() funs. -- INTERFACE CHANGES AFFECTING MORE CODE: switch the arguments of write_c_string and friends to be consistent with write_fmt_string, which must have printcharfun first. change BI_* macros to BYTE_* for increased clarity; similarly for bi_* local vars. change VOID_TO_LISP to be a one-argument function. eliminate no-longer-needed CVOID_TO_LISP. -- char/string macro changes: rename MAKE_CHAR() to make_emchar() for slightly less confusion with make_char(). (The former generates an Emchar, the latter a Lisp object. Conceivably we should rename make_char() -> wrap_char() and similarly for make_int(), make_float().) Similar changes for other *CHAR* macros -- we now consistently use names with `emchar' whenever we are working with Emchars. Any remaining name with just `char' always refers to a Lisp object. rename macros with XSTRING_* to string_* except for those that reference actual fields in the Lisp_String object, following conventions used elsewhere. 
rename set_string_{data,length} macros (the only ones to work with a Lisp_String_* instead of a Lisp_Object) to set_lispstringp_* to make the difference clear. try to be consistent about caps vs. lowercase in macro/inline-fun names for chars and such, which wasn't the case before. we now reserve caps either for XFOO_ macros that reference object fields (e.g. XSTRING_DATA) or for things that have non-function semantics, e.g. directly modifying an arg (BREAKUP_EMCHAR) or evaluating an arg (any arg) more than once. otherwise, use lowercase. here is a summary of most of the macros/inline funs changed by all of the above changes: BYTE_*_P -> byte_*_p XSTRING_BYTE -> string_byte set_string_data/length -> set_lispstringp_data/length XSTRING_CHAR_LENGTH -> string_char_length XSTRING_CHAR -> string_emchar INTBYTE_FIRST_BYTE_P -> intbyte_first_byte_p INTBYTE_LEADING_BYTE_P -> intbyte_leading_byte_p charptr_copy_char -> charptr_copy_emchar LEADING_BYTE_* -> leading_byte_* CHAR_* -> EMCHAR_* *_CHAR_* -> *_EMCHAR_* *_CHAR -> *_EMCHAR CHARSET_BY_ -> charset_by_* BYTE_SHIFT_JIS* -> byte_shift_jis* BYTE_BIG5* -> byte_big5* REP_BYTES_BY_FIRST_BYTE -> rep_bytes_by_first_byte char_to_unicode -> emchar_to_unicode valid_char_p -> valid_emchar_p Change intbyte_strcmp -> qxestrcmp_c (duplicated functionality). -- INTERFACE CHANGES AFFECTING LESS CODE: use DECLARE_INLINE_HEADER in various places. remove '#ifdef emacs' from XEmacs-only files. eliminate CHAR_TABLE_VALUE(), which duplicated the functionality of get_char_table(). add BUFFER_TEXT_LOOP to simplify iterations over buffer text. define typedefs for signed and unsigned types of fixed sizes (INT_32_BIT, UINT_32_BIT, etc.). create ALIGN_FOR_TYPE as a higher-level interface onto ALIGN_SIZE; fix code to use it. add charptr_emchar_len to return the text length of the character pointed to by a ptr; use it in place of charcount_to_bytecount(..., 1). add emchar_len to return the text length of a given character. 
add types Bytexpos and Charxpos to generalize Bytebpos/Bytecount and Charbpos/Charcount, in code (particularly, the extents code and redisplay code) that works with either kind of index. rename redisplay struct params with names such as `charbpos' to e.g. `charpos' when they are e.g. a Charxpos, not a Charbpos. eliminate xxDEFUN in place of DEFUN; no longer necessary with changes awhile back to doc.c. split up big ugly combined list of EXFUNs in lisp.h on a file-by-file basis, since other prototypes are similarly split. rewrite some "*_UNSAFE" macros as inline funs and eliminate the _UNSAFE suffix. move most string code from lisp.h to text.h; the string code and text.h code is now intertwined in such a fashion that they need to be in the same place and partially interleaved. (you can't create forward references for inline funs) automated/lisp-tests.el, automated/symbol-tests.el, automated/test-harness.el: Fix test harness to output FAIL messages to stderr when in batch mode. Fix up some problems in lisp-tests/symbol-tests that were causing spurious failures.
author ben
date Sun, 05 May 2002 11:33:57 +0000
parents a634e3b7acc8
children 44478bd99873
comparison
equal deleted inserted replaced
825:eb3bc15a6e0f 826:6728e641994e
37 /************************************************************************/ 37 /************************************************************************/
38 /* long comments */ 38 /* long comments */
39 /************************************************************************/ 39 /************************************************************************/
40 40
41 /* 41 /*
42 ==========================================================================
43 1. Character Sets
44 ==========================================================================
45
46 A character set (or "charset") is an ordered set of characters.
47
48 A character (which is, BTW, a surprisingly complex concept) is, in a
49 written representation of text, the most basic written unit that has a
50 meaning of its own. It's comparable to a phoneme when analyzing words
51 in spoken speech. Just like with a phoneme (which is an abstract
52 concept, and is represented in actual spoken speech by one or more
53 allophones, ...&&#### finish this.), a character is actually an abstract
54 concept.
55
56 A particular character in a charset is indexed using one or
57 more "position codes", which are non-negative integers.
58 The number of position codes needed to identify a particular
59 character in a charset is called the "dimension" of the
60 charset. In XEmacs/Mule, all charsets have 1 or 2 dimensions,
61 and the size of all charsets (except for a few special cases)
62 is either 94, 96, 94 by 94, or 96 by 96. The range of
63 position codes used to index characters from any of these
64 types of character sets is as follows:
65
66 Charset type Position code 1 Position code 2
67 ------------------------------------------------------------
68 94 33 - 126 N/A
69 96 32 - 127 N/A
70 94x94 33 - 126 33 - 126
71 96x96 32 - 127 32 - 127
72
73 Note that in the above cases position codes do not start at
74 an expected value such as 0 or 1. The reason for this will
75 become clear later.
76
77 For example, Latin-1 is a 96-character charset, and JISX0208
78 (the Japanese national character set) is a 94x94-character
79 charset.
80
81 [Note that, although the ranges above define the *valid*
82 position codes for a charset, some of the slots in a particular
83 charset may in fact be empty. This is the case for JISX0208,
84 for example, where (e.g.) all the slots whose first
85 position code is in the range 117 - 126 are empty.]
86
87 There are three charsets that do not follow the above rules.
88 All of them have one dimension, and have ranges of position
89 codes as follows:
90
91 Charset name Position code 1
92 ------------------------------------
93 ASCII 0 - 127
94 Control-1 0 - 31
95 Composite 0 - some large number
96
97 (The upper bound of the position code for composite characters
98 has not yet been determined, but it will probably be at
99 least 16,383).
100
101 ASCII is the union of two subsidiary character sets:
102 Printing-ASCII (the printing ASCII character set,
103 consisting of position codes 33 - 126, like for a standard
104 94-character charset) and Control-ASCII (the non-printing
105 characters that would appear in a binary file with codes 0
106 - 32 and 127).
107
108 Control-1 contains the non-printing characters that would
109 appear in a binary file with codes 128 - 159.
110
111 Composite contains characters that are generated by
112 overstriking one or more characters from other charsets.
113
114 Note that some characters in ASCII, and all characters
115 in Control-1, are "control" (non-printing) characters.
116 These have no printed representation but instead control
117 some other function of the printing (e.g. TAB or 9 moves
118 the current character position to the next tab stop).
119 All other characters in all charsets are "graphic"
120 (printing) characters.
121
122 When a binary file is read in, the bytes in the file are
123 assigned to character sets as follows:
124
125 Bytes Character set Range
126 --------------------------------------------------
127 0 - 127 ASCII 0 - 127
128 128 - 159 Control-1 0 - 31
129 160 - 255 Latin-1 32 - 127
130
131 This is a bit ad-hoc but gets the job done.
132
133 ==========================================================================
134 2. Encodings
135 ==========================================================================
136
137 An "encoding" is a way of numerically representing
138 characters from one or more character sets. If an encoding
139 only encompasses one character set, then the position codes
140 for the characters in that character set could be used
141 directly. This is not possible, however, if more than one
142 character set is to be used in the encoding.
143
144 For example, the conversion detailed above between bytes in
145 a binary file and characters is effectively an encoding
146 that encompasses the three character sets ASCII, Control-1,
147 and Latin-1 in a stream of 8-bit bytes.
148
149 Thus, an encoding can be viewed as a way of encoding
150 characters from a specified group of character sets using a
151 stream of bytes, each of which contains a fixed number of
152 bits (but not necessarily 8, as in the common usage of
153 "byte").
154
155 Here are descriptions of a couple of common
156 encodings:
157
158
159 A. Japanese EUC (Extended Unix Code)
160
161 This encompasses the character sets:
162 - Printing-ASCII,
163 - Katakana-JISX0201 (half-width katakana, the right half of JISX0201).
164 - Japanese-JISX0208
165 - Japanese-JISX0212
166 It uses 8-bit bytes.
167
168 Note that Printing-ASCII and Katakana-JISX0201 are 94-character
169 charsets, while Japanese-JISX0208 and Japanese-JISX0212 are 94x94-character charsets.
170
171 The encoding is as follows:
172
173 Character set Representation (PC == position-code)
174 ------------- --------------
175 Printing-ASCII PC1
176 Japanese-JISX0208 PC1 + 0x80 | PC2 + 0x80
177 Katakana-JISX0201 0x8E | PC1 + 0x80
178
179
180 B. JIS7
181
182 This encompasses the character sets:
183 - Printing-ASCII
184 - Latin-JISX0201 (the left half of JISX0201; this character set is
185 very similar to Printing-ASCII and is a 94-character charset)
186 - Japanese-JISX0208
187 - Katakana-JISX0201
188 It uses 7-bit bytes.
189
190 Unlike Japanese EUC, this is a "modal" encoding, which
191 means that there are multiple states that the encoding can
192 be in, which affect how the bytes are to be interpreted.
193 Special sequences of bytes (called "escape sequences")
194 are used to change states.
195
196 The encoding is as follows:
197
198 Character set Representation
199 ------------- --------------
200 Printing-ASCII PC1
201 Latin-JISX0201 PC1
202 Katakana-JISX0201 PC1
203 Japanese-JISX0208 PC1 | PC2
204
205 Escape sequence ASCII equivalent Meaning
206 --------------- ---------------- -------
207 0x1B 0x28 0x42 ESC ( B invoke Printing-ASCII
208 0x1B 0x28 0x4A ESC ( J invoke Latin-JISX0201
209 0x1B 0x28 0x49 ESC ( I invoke Katakana-JISX0201
210 0x1B 0x24 0x42 ESC $ B invoke Japanese-JISX0208
211
212 Initially, Printing-ASCII is invoked.
213
214 ==========================================================================
215 3. Internal Mule Encodings
216 ==========================================================================
217
218 In XEmacs/Mule, each character set is assigned a unique number,
219 called a "leading byte". This is used in the encodings of a
220 character. Leading bytes are in the range 0x80 - 0xFF
221 (except for ASCII, which has a leading byte of 0), although
222 some leading bytes are reserved.
223
224 Charsets whose leading byte is in the range 0x80 - 0x9F are
225 called "official" and are used for built-in charsets.
226 Other charsets are called "private" and have leading bytes
227 in the range 0xA0 - 0xFF; these are user-defined charsets.
228
229 More specifically:
230
231 Character set Leading byte
232 ------------- ------------
233 ASCII 0 (0x7F in arrays indexed by leading byte)
234 Composite 0x8D
235 Dimension-1 Official 0x80 - 0x8C/0x8D
236 (0x8E is free)
237 Control-1 0x8F
238 Dimension-2 Official 0x90 - 0x99
239 (0x9A - 0x9D are free)
240 Dimension-1 Private Marker 0x9E
241 Dimension-2 Private Marker 0x9F
242 Dimension-1 Private 0xA0 - 0xEF
243 Dimension-2 Private 0xF0 - 0xFF
244
245 There are two internal encodings for characters in XEmacs/Mule.
246 One is called "string encoding" and is an 8-bit encoding that
247 is used for representing characters in a buffer or string.
248 It uses 1 to 4 bytes per character. The other is called
249 "character encoding" and is a 19-bit encoding that is used
250 for representing characters individually in a variable.
251
252 (In the following descriptions, we'll ignore composite
253 characters for the moment. We also give a general (structural)
254 overview first, followed later by the exact details.)
255
256 A. Internal String Encoding
257
258 ASCII characters are encoded using their position code directly.
259 Other characters are encoded using their leading byte followed
260 by their position code(s) with the high bit set. Characters
261 in private character sets have their leading byte prefixed with
262 a "leading byte prefix", which is either 0x9E or 0x9F. (No
263 character sets are ever assigned these leading bytes.) Specifically:
264
265 Character set Encoding (PC == position-code)
266 ------------- -------- (LB == leading-byte)
267 ASCII PC1 |
268 Control-1 LB | PC1 + 0xA0
269 Dimension-1 official LB | PC1 + 0x80
270 Dimension-1 private 0x9E | LB | PC1 + 0x80
271 Dimension-2 official LB | PC1 + 0x80 | PC2 + 0x80
272 Dimension-2 private 0x9F | LB | PC1 + 0x80 | PC2 + 0x80
273
274 The basic characteristic of this encoding is that the first byte
275 of all characters is in the range 0x00 - 0x9F, and the second and
276 following bytes of all characters are in the range 0xA0 - 0xFF.
277 This means that it is impossible to get out of sync, or more
278 specifically:
279
280 1. Given any byte position, the beginning of the character it is
281 within can be determined in constant time.
282 2. Given any byte position at the beginning of a character, the
283 beginning of the next character can be determined in constant
284 time.
285 3. Given any byte position at the beginning of a character, the
286 beginning of the previous character can be determined in constant
287 time.
288 4. Textual searches can simply treat encoded strings as if they
289 were encoded in a one-byte-per-character fashion rather than
290 the actual multi-byte encoding.
291
292 None of the standard non-modal encodings meet all of these
293 conditions. For example, EUC satisfies only (2) and (3), while
294 Shift-JIS and Big5 (not yet described) satisfy only (2). (All
295 non-modal encodings must satisfy (2), in order to be unambiguous.)
296
297 B. Internal Character Encoding
298
299 One 19-bit word represents a single character. The word is
300 separated into three fields:
301
302 Bit number: 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
303 <------------> <------------------> <------------------>
304 Field: 1 2 3
305
306 Note that fields 2 and 3 hold 7 bits each, while field 1 holds 5 bits.
307
308 Character set Field 1 Field 2 Field 3
309 ------------- ------- ------- -------
310 ASCII 0 0 PC1
311 range: (00 - 7F)
312 Control-1 0 1 PC1
313 range: (00 - 1F)
314 Dimension-1 official 0 LB - 0x7F PC1
315 range: (01 - 0D) (20 - 7F)
316 Dimension-1 private 0 LB - 0x80 PC1
317 range: (20 - 6F) (20 - 7F)
318 Dimension-2 official LB - 0x8F PC1 PC2
319 range: (01 - 0A) (20 - 7F) (20 - 7F)
320 Dimension-2 private LB - 0xE1 PC1 PC2
321 range: (0F - 1E) (20 - 7F) (20 - 7F)
322 Composite 0x1F ? ?
323
324 Note that character codes 0 - 255 are the same as the "binary encoding"
325 described above.
326
327 Most of the code in XEmacs knows nothing of the representation of a
328 character other than that values 0 - 255 represent ASCII, Control-1,
329 and Latin-1.
330
331 WARNING WARNING WARNING: The Boyer-Moore code in search.c, and the
332 code in search_buffer() that determines whether that code can be used,
333 knows that "field 3" in a character always corresponds to the last
334 byte in the textual representation of the character. (This is important
335 because the Boyer-Moore algorithm works by looking at the last byte
336 of the search string and &&#### finish this.)
337
338 ==========================================================================
339 4. Buffer Positions and Other Typedefs
340 ==========================================================================
341
342 A. Buffer Positions
343
42 There are three possible ways to specify positions in a buffer. All 344 There are three possible ways to specify positions in a buffer. All
43 of these are one-based: the beginning of the buffer is position or 345 of these are one-based: the beginning of the buffer is position or
44 index 1, and 0 is not a valid position. 346 index 1, and 0 is not a valid position.
45 347
46 As a "buffer position" (typedef Charbpos): 348 As a "buffer position" (typedef Charbpos):
76 378
77 buffer_start_address + memory_index(position) - 1 379 buffer_start_address + memory_index(position) - 1
78 380
79 except in the case of characters at the gap position. 381 except in the case of characters at the gap position.
80 382
81 Other typedefs: 383 B. Other Typedefs
82 ===============
83 384
84 Emchar: 385 Emchar:
85 ------- 386 -------
86 This typedef represents a single Emacs character, which can be 387 This typedef represents a single Emacs character, which can be
87 ASCII, ISO-8859, or some extended character, as would typically 388 ASCII, ISO-8859, or some extended character, as would typically
172 ---------- 473 ----------
173 Similar to a Charcount but represents a count of bytes. 474 Similar to a Charcount but represents a count of bytes.
174 The difference between two Bytebpos's is a Bytecount. 475 The difference between two Bytebpos's is a Bytecount.
175 476
176 477
177 Usage of the various representations: 478 C. Usage of the Various Representations
178 =====================================
179 479
180 Memory indices are used in low-level functions in insdel.c and for 480 Memory indices are used in low-level functions in insdel.c and for
181 extent endpoints and marker positions. The reason for this is that 481 extent endpoints and marker positions. The reason for this is that
182 this way, the extents and markers don't need to be updated for most 482 this way, the extents and markers don't need to be updated for most
183 insertions, which merely shrink the gap and don't move any 483 insertions, which merely shrink the gap and don't move any
205 505
206 Strings are always passed around internally using internal format. 506 Strings are always passed around internally using internal format.
207 Conversions to and from external format are performed at the time 507 Conversions to and from external format are performed at the time
208 that the data goes in or out of Emacs. 508 that the data goes in or out of Emacs.
209 509
210 Working with the various representations: 510 D. Working With the Various Representations
211 ========================================= */ 511
212 512 We write things this way because it's very important that
213 /* We write things this way because it's very important the
214 MAX_BYTEBPOS_GAP_SIZE_3 is a multiple of 3. (As it happens, 513 MAX_BYTEBPOS_GAP_SIZE_3 is a multiple of 3. (As it happens,
215 65535 is a multiple of 3, but this may not always be the 514 65535 is a multiple of 3, but this may not always be the
216 case.) */ 515 case.) #### unfinished
217 516
218 517 ==========================================================================
219 /* 518 5. Miscellaneous
220 1. Character Sets 519 ==========================================================================
221 ================= 520
222 521 A. Unicode Support
223 A character set (or "charset") is an ordered set of characters.
224 A particular character in a charset is indexed using one or
225 more "position codes", which are non-negative integers.
226 The number of position codes needed to identify a particular
227 character in a charset is called the "dimension" of the
228 charset. In XEmacs/Mule, all charsets have 1 or 2 dimensions,
229 and the size of all charsets (except for a few special cases)
230 is either 94, 96, 94 by 94, or 96 by 96. The range of
231 position codes used to index characters from any of these
232 types of character sets is as follows:
233
234 Charset type Position code 1 Position code 2
235 ------------------------------------------------------------
236 94 33 - 126 N/A
237 96 32 - 127 N/A
238 94x94 33 - 126 33 - 126
239 96x96 32 - 127 32 - 127
240
241 Note that in the above cases position codes do not start at
242 an expected value such as 0 or 1. The reason for this will
243 become clear later.
244
245 For example, Latin-1 is a 96-character charset, and JISX0208
246 (the Japanese national character set) is a 94x94-character
247 charset.
248
249 [Note that, although the ranges above define the *valid*
250 position codes for a charset, some of the slots in a particular
251 charset may in fact be empty. This is the case for JISX0208,
252 for example, where (e.g.) all the slots whose first
253 position code is in the range 118 - 127 are empty.]
254
255 There are three charsets that do not follow the above rules.
256 All of them have one dimension, and have ranges of position
257 codes as follows:
258
259 Charset name Position code 1
260 ------------------------------------
261 ASCII 0 - 127
262 Control-1 0 - 31
263 Composite 0 - some large number
264
265 (The upper bound of the position code for composite characters
266 has not yet been determined, but it will probably be at
267 least 16,383).
268
269 ASCII is the union of two subsidiary character sets:
270 Printing-ASCII (the printing ASCII character set,
271 consisting of position codes 33 - 126, like for a standard
272 94-character charset) and Control-ASCII (the non-printing
273 characters that would appear in a binary file with codes 0
274 - 32 and 127).
275
276 Control-1 contains the non-printing characters that would
277 appear in a binary file with codes 128 - 159.
278
279 Composite contains characters that are generated by
280 overstriking one or more characters from other charsets.
281
282 Note that some characters in ASCII, and all characters
283 in Control-1, are "control" (non-printing) characters.
284 These have no printed representation but instead control
285 some other function of the printing (e.g. TAB or 8 moves
286 the current character position to the next tab stop).
287 All other characters in all charsets are "graphic"
288 (printing) characters.
289
290 When a binary file is read in, the bytes in the file are
291 assigned to character sets as follows:
292
293 Bytes Character set Range
294 --------------------------------------------------
295 0 - 127 ASCII 0 - 127
296 128 - 159 Control-1 0 - 31
297 160 - 255 Latin-1 32 - 127
298
299 This is a bit ad-hoc but gets the job done.
300
301 2. Encodings
302 ============
303
304 An "encoding" is a way of numerically representing
305 characters from one or more character sets. If an encoding
306 only encompasses one character set, then the position codes
307 for the characters in that character set could be used
308 directly. This is not possible, however, if more than one
309 character set is to be used in the encoding.
310
311 For example, the conversion detailed above between bytes in
312 a binary file and characters is effectively an encoding
313 that encompasses the three character sets ASCII, Control-1,
314 and Latin-1 in a stream of 8-bit bytes.
315
316 Thus, an encoding can be viewed as a way of encoding
317 characters from a specified group of character sets using a
318 stream of bytes, each of which contains a fixed number of
319 bits (but not necessarily 8, as in the common usage of
320 "byte").
321
322 Here are descriptions of a couple of common
323 encodings:
324
325
326 A. Japanese EUC (Extended Unix Code)
327
328 This encompasses the character sets:
329 - Printing-ASCII,
330 - Katakana-JISX0201 (half-width katakana, the right half of JISX0201).
331 - Japanese-JISX0208
332 - Japanese-JISX0212
333 It uses 8-bit bytes.
334
335 Note that Printing-ASCII and Katakana-JISX0201 are 94-character
336 charsets, while Japanese-JISX0208 is a 94x94-character charset.
337
338 The encoding is as follows:
339
340 Character set Representation (PC == position-code)
341 ------------- --------------
342 Printing-ASCII PC1
343 Japanese-JISX0208 PC1 + 0x80 | PC2 + 0x80
344 Katakana-JISX0201 0x8E | PC1 + 0x80
345
346
347 B. JIS7
348
349 This encompasses the character sets:
350 - Printing-ASCII
351 - Latin-JISX0201 (the left half of JISX0201; this character set is
352 very similar to Printing-ASCII and is a 94-character charset)
353 - Japanese-JISX0208
354 - Katakana-JISX0201
355 It uses 7-bit bytes.
356
357 Unlike Japanese EUC, this is a "modal" encoding, which
358 means that there are multiple states that the encoding can
359 be in, which affect how the bytes are to be interpreted.
360 Special sequences of bytes (called "escape sequences")
361 are used to change states.
362
363 The encoding is as follows:
364
365 Character set Representation
366 ------------- --------------
367 Printing-ASCII PC1
368 Latin-JISX0201 PC1
369 Katakana-JISX0201 PC1
370 Japanese-JISX0208 PC1 | PC2
371
372 Escape sequence ASCII equivalent Meaning
373 --------------- ---------------- -------
374 0x1B 0x28 0x42 ESC ( B invoke Printing-ASCII
375 0x1B 0x28 0x4A ESC ( J invoke Latin-JISX0201
376 0x1B 0x28 0x49 ESC ( I invoke Katakana-JISX0201
377 0x1B 0x24 0x42 ESC $ B invoke Japanese-JISX0208
378
379 Initially, Printing-ASCII is invoked.
380
381 3. Internal Mule Encodings
382 ==========================
383
384 In XEmacs/Mule, each character set is assigned a unique number,
385 called a "leading byte". This is used in the encodings of a
386 character. Leading bytes are in the range 0x80 - 0xFF
387 (except for ASCII, which has a leading byte of 0), although
388 some leading bytes are reserved.
389
390 Charsets whose leading byte is in the range 0x80 - 0x9F are
391 called "official" and are used for built-in charsets.
392 Other charsets are called "private" and have leading bytes
393 in the range 0xA0 - 0xFF; these are user-defined charsets.
394
395 More specifically:
396
397 Character set Leading byte
398 ------------- ------------
399 ASCII 0 (0x7F in arrays indexed by leading byte)
400 Composite 0x8D
401 Dimension-1 Official 0x80 - 0x8C/0x8D
402 (0x8E is free)
403 Control 0x8F
404 Dimension-2 Official 0x90 - 0x99
405 (0x9A - 0x9D are free)
406 Dimension-1 Private Marker 0x9E
407 Dimension-2 Private Marker 0x9F
408 Dimension-1 Private 0xA0 - 0xEF
409 Dimension-2 Private 0xF0 - 0xFF
410
411 There are two internal encodings for characters in XEmacs/Mule.
412 One is called "string encoding" and is an 8-bit encoding that
413 is used for representing characters in a buffer or string.
414 It uses 1 to 4 bytes per character. The other is called
415 "character encoding" and is a 19-bit encoding that is used
416 for representing characters individually in a variable.
417
418 (In the following descriptions, we'll ignore composite
419 characters for the moment. We also give a general (structural)
420 overview first, followed later by the exact details.)
421
422 A. Internal String Encoding
423
424 ASCII characters are encoded using their position code directly.
425 Other characters are encoded using their leading byte followed
426 by their position code(s) with the high bit set. Characters
427 in private character sets have their leading byte prefixed with
428 a "leading byte prefix", which is either 0x9E or 0x9F. (No
429 character sets are ever assigned these leading bytes.) Specifically:
430
431 Character set Encoding (PC == position-code)
432 ------------- -------- (LB == leading-byte)
433 ASCII PC1 |
434 Control-1 LB | PC1 + 0xA0
435 Dimension-1 official LB | PC1 + 0x80
436 Dimension-1 private 0x9E | LB | PC1 + 0x80
437 Dimension-2 official LB | PC1 + 0x80 | PC2 + 0x80
438 Dimension-2 private 0x9F | LB | PC1 + 0x80 | PC2 + 0x80
439
440 The basic characteristic of this encoding is that the first byte
441 of all characters is in the range 0x00 - 0x9F, and the second and
442 following bytes of all characters are in the range 0xA0 - 0xFF.
443 This means that it is impossible to get out of sync, or more
444 specifically:
445
446 1. Given any byte position, the beginning of the character it is
447 within can be determined in constant time.
448 2. Given any byte position at the beginning of a character, the
449 beginning of the next character can be determined in constant
450 time.
451 3. Given any byte position at the beginning of a character, the
452 beginning of the previous character can be determined in constant
453 time.
454 4. Textual searches can simply treat encoded strings as if they
455 were encoded in a one-byte-per-character fashion rather than
456 the actual multi-byte encoding.
457
458 None of the standard non-modal encodings meet all of these
459 conditions. For example, EUC satisfies only (2) and (3), while
460 Shift-JIS and Big5 (not yet described) satisfy only (2). (All
461 non-modal encodings must satisfy (2), in order to be unambiguous.)
462
463 B. Internal Character Encoding
464
465 One 19-bit word represents a single character. The word is
466 separated into three fields:
467
468 Bit number: 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
469 <------------> <------------------> <------------------>
470 Field: 1 2 3
471
472 Note that fields 2 and 3 hold 7 bits each, while field 1 holds 5 bits.
473
474 Character set Field 1 Field 2 Field 3
475 ------------- ------- ------- -------
476 ASCII 0 0 PC1
477 range: (00 - 7F)
478 Control-1 0 1 PC1
479 range: (00 - 1F)
480 Dimension-1 official 0 LB - 0x7F PC1
481 range: (01 - 0D) (20 - 7F)
482 Dimension-1 private 0 LB - 0x80 PC1
483 range: (20 - 6F) (20 - 7F)
484 Dimension-2 official LB - 0x8F PC1 PC2
485 range: (01 - 0A) (20 - 7F) (20 - 7F)
486 Dimension-2 private LB - 0xE1 PC1 PC2
487 range: (0F - 1E) (20 - 7F) (20 - 7F)
488 Composite 0x1F ? ?
489
490 Note that character codes 0 - 255 are the same as the "binary encoding"
491 described above.
492 */
493
494 /*
495 About Unicode support:
496 522
497 Adding Unicode support is very desirable. Unicode will likely be a 523 Adding Unicode support is very desirable. Unicode will likely be a
498 very common representation in the future, and thus we should 524 very common representation in the future, and thus we should
499 represent Unicode characters using three bytes instead of four. 525 represent Unicode characters using three bytes instead of four.
500 This means we need to find leading bytes for Unicode. Given that 526 This means we need to find leading bytes for Unicode. Given that
506 with a little change to the functions that assume that 0x80 is the 532 with a little change to the functions that assume that 0x80 is the
507 lowest leading byte.) This means we still need to dump three 533 lowest leading byte.) This means we still need to dump three
508 leading bytes and move them into private space. The CNS charsets 534 leading bytes and move them into private space. The CNS charsets
509 are good candidates since they are rarely used, and 535 are good candidates since they are rarely used, and
510 JAPANESE_JISX0208_1978 is becoming less and less used and could 536 JAPANESE_JISX0208_1978 is becoming less and less used and could
511 also be dumped. */ 537 also be dumped.
512 538
513 539 B. Composite Characters
514 /* Composite characters are characters constructed by overstriking two 540
541 Composite characters are characters constructed by overstriking two
515 or more regular characters. 542 or more regular characters.
516 543
517 1) The old Mule implementation involves storing composite characters 544 1) The old Mule implementation involves storing composite characters
518 in a buffer as a tag followed by all of the actual characters 545 in a buffer as a tag followed by all of the actual characters
519 used to make up the composite character. I think this is a bad 546 used to make up the composite character. I think this is a bad
536 where each C[1-3] is in the range 0xA0 - 0xFF. This allows 563 where each C[1-3] is in the range 0xA0 - 0xFF. This allows
537 for slightly under 2^20 (one million) composite characters 564 for slightly under 2^20 (one million) composite characters
538 over the XEmacs process lifetime, and you only need to 565 over the XEmacs process lifetime, and you only need to
539 increase the size of a Mule character from 19 to 21 bits. 566 increase the size of a Mule character from 19 to 21 bits.
540 Or you could use 0x8D C1 C2 C3 C4, allowing for about 567 Or you could use 0x8D C1 C2 C3 C4, allowing for about
541 85 million (slightly over 2^26) composite characters. */ 568 85 million (slightly over 2^26) composite characters.
569
570 */
542 571
543 572
544 /************************************************************************/ 573 /************************************************************************/
545 /* declarations */ 574 /* declarations */
546 /************************************************************************/ 575 /************************************************************************/
558 indexed by the first byte of that representation. 587 indexed by the first byte of that representation.
559 588
560 rep_bytes_by_first_byte(c) is more efficient than the equivalent 589 rep_bytes_by_first_byte(c) is more efficient than the equivalent
561 canonical computation: 590 canonical computation:
562 591
563 XCHARSET_REP_BYTES (CHARSET_BY_LEADING_BYTE (c)) */ 592 XCHARSET_REP_BYTES (charset_by_leading_byte (c)) */
564 593
565 const Bytecount rep_bytes_by_first_byte[0xA0] = 594 const Bytecount rep_bytes_by_first_byte[0xA0] =
566 { /* 0x00 - 0x7f are for straight ASCII */ 595 { /* 0x00 - 0x7f are for straight ASCII */
567 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 596 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
568 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 597 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1008 malloc()ed string. Note that the actual number of Intbytes allocated 1037 malloc()ed string. Note that the actual number of Intbytes allocated
1009 is one more than this: the returned string is zero-terminated. */ 1038 is one more than this: the returned string is zero-terminated. */
1010 1039
1011 Intbyte * 1040 Intbyte *
1012 convert_emchar_string_into_malloced_string (Emchar *arr, int nels, 1041 convert_emchar_string_into_malloced_string (Emchar *arr, int nels,
1013 Bytecount *len_out) 1042 Bytecount *len_out)
1014 { 1043 {
1015 /* Damn zero-termination. */ 1044 /* Damn zero-termination. */
1016 Intbyte *str = (Intbyte *) alloca (nels * MAX_EMCHAR_LEN + 1); 1045 Intbyte *str = (Intbyte *) alloca (nels * MAX_EMCHAR_LEN + 1);
1017 Intbyte *strorig = str; 1046 Intbyte *strorig = str;
1018 Bytecount len; 1047 Bytecount len;
1028 if (len_out) 1057 if (len_out)
1029 *len_out = len; 1058 *len_out = len;
1030 return str; 1059 return str;
1031 } 1060 }
1032 1061
/* Body of a text-format conversion for one fixed SRCFMT/DSTFMT pair.

   This macro is meant to be expanded inside a function that has the
   variables src, srclen, srcobj, dst, dstlen, dstobj and src_used in
   scope; it always ends by executing `return'.

   With a non-NULL dst, characters are converted one at a time until the
   source is exhausted or the next character would not fit completely
   before dst + dstlen; the value returned is the number of bytes stored.
   With a NULL dst, nothing is stored and the value returned is the number
   of bytes the whole source would occupy in DSTFMT.  In both cases the
   number of source bytes walked over is stored through src_used when it
   is non-NULL. */
#define COPY_TEXT_BETWEEN_FORMATS(srcfmt, dstfmt)                         \
do                                                                        \
  {                                                                       \
    const Intbyte *src_cur = src;                                         \
    const Intbyte *src_stop = src + srclen;                               \
                                                                          \
    if (!dst)                                                             \
      {                                                                   \
        /* Measuring only: add up the converted size of every char. */    \
        Bytecount needed = 0;                                             \
                                                                          \
        while (src_cur < src_stop)                                        \
          {                                                               \
            Emchar chr = charptr_emchar_fmt (src_cur, srcfmt, srcobj);    \
                                                                          \
            needed += emchar_len_fmt (chr, dstfmt);                       \
            INC_CHARPTR_FMT (src_cur, srcfmt);                            \
          }                                                               \
        text_checking_assert (src_cur == src_stop);                       \
        if (src_used)                                                     \
          *src_used = src_cur - src;                                      \
        return needed;                                                    \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        Intbyte *dst_cur = dst;                                           \
        Intbyte *dst_stop = dst + dstlen;                                 \
                                                                          \
        while (src_cur < src_stop)                                        \
          {                                                               \
            Emchar chr = charptr_emchar_fmt (src_cur, srcfmt, srcobj);    \
            Bytecount nbytes = emchar_len_fmt (chr, dstfmt);              \
                                                                          \
            /* Never emit a partial character. */                         \
            if (dst_cur + nbytes > dst_stop)                              \
              break;                                                      \
            set_charptr_emchar_fmt (dst_cur, chr, dstfmt, dstobj);        \
            dst_cur += nbytes;                                            \
            INC_CHARPTR_FMT (src_cur, srcfmt);                            \
          }                                                               \
        text_checking_assert (src_cur <= src_stop);                       \
        if (src_used)                                                     \
          *src_used = src_cur - src;                                      \
        return dst_cur - dst;                                             \
      }                                                                   \
  }                                                                       \
while (0)
1110
1111 /* Copy as much text from SRC/SRCLEN to DST/DSTLEN as will fit, converting
1112 from SRCFMT/SRCOBJ to DSTFMT/DSTOBJ. Return number of bytes stored into
1113 DST as return value, and number of bytes copied from SRC through
1114 SRC_USED (if not NULL). If DST is NULL, don't actually store anything
1115 and just return the size needed to store all the text. Will not copy
1116 partial characters into DST. */
1117
1118 Bytecount
1119 copy_text_between_formats (const Intbyte *src, Bytecount srclen,
1120 Internal_Format srcfmt,
1121 Lisp_Object srcobj,
1122 Intbyte *dst, Bytecount dstlen,
1123 Internal_Format dstfmt,
1124 Lisp_Object dstobj,
1125 Bytecount *src_used)
1126 {
1127 if (srcfmt == dstfmt &&
1128 objects_have_same_internal_representation (srcobj, dstobj))
1129 {
1130 if (dst)
1131 {
1132 srclen = min (srclen, dstlen);
1133 srclen = validate_intbyte_string_backward (src, srclen);
1134 memcpy (dst, src, srclen);
1135 if (src_used)
1136 *src_used = srclen;
1137 return srclen;
1138 }
1139 else
1140 return srclen;
1141 }
1142 /* Everything before the final else statement is an optimization.
1143 The inner loops inside COPY_TEXT_BETWEEN_FORMATS() have a number
1144 of calls to *_fmt(), each of which has a switch statement in it.
1145 By using constants as the FMT argument, these switch statements
1146 will be optimized out of existence. */
1147 #define ELSE_FORMATS(fmt1, fmt2) \
1148 else if (srcfmt == fmt1 && dstfmt == fmt2) \
1149 COPY_TEXT_BETWEEN_FORMATS (fmt1, fmt2)
1150 ELSE_FORMATS (FORMAT_DEFAULT, FORMAT_8_BIT_FIXED);
1151 ELSE_FORMATS (FORMAT_8_BIT_FIXED, FORMAT_DEFAULT);
1152 ELSE_FORMATS (FORMAT_DEFAULT, FORMAT_32_BIT_FIXED);
1153 ELSE_FORMATS (FORMAT_32_BIT_FIXED, FORMAT_DEFAULT);
1154 else
1155 COPY_TEXT_BETWEEN_FORMATS (srcfmt, dstfmt);
1156 #undef ELSE_FORMATS
1157 }
1158
1159 /* Copy as much buffer text in BUF, starting at POS, of length LEN, as will
1160 fit into DST/DSTLEN, converting to DSTFMT. Return number of bytes
1161 stored into DST as return value, and number of bytes copied from BUF
1162 through SRC_USED (if not NULL). If DST is NULL, don't actually store
1163 anything and just return the size needed to store all the text. */
1164
1165 Bytecount
1166 copy_buffer_text_out (struct buffer *buf, Bytebpos pos,
1167 Bytecount len, Intbyte *dst, Bytecount dstlen,
1168 Internal_Format dstfmt, Lisp_Object dstobj,
1169 Bytecount *src_used)
1170 {
1171 Bytecount dst_used = 0;
1172 if (src_used)
1173 *src_used = 0;
1174
1175 {
1176 BUFFER_TEXT_LOOP (buf, pos, len, runptr, runlen)
1177 {
1178 Bytecount the_src_used, the_dst_used;
1179
1180 the_dst_used = copy_text_between_formats (runptr, runlen,
1181 BUF_FORMAT (buf),
1182 wrap_buffer (buf),
1183 dst, dstlen, dstfmt,
1184 dstobj, &the_src_used);
1185 dst_used += the_dst_used;
1186 if (src_used)
1187 *src_used += the_src_used;
1188 if (dst)
1189 {
1190 dst += the_dst_used;
1191 dstlen -= the_dst_used;
1192 if (!dstlen)
1193 break;
1194 }
1195 }
1196 }
1197
1198 return dst_used;
1199 }
1200
1033 1201
1034 /************************************************************************/ 1202 /************************************************************************/
1035 /* charset properties of strings */ 1203 /* charset properties of strings */
1036 /************************************************************************/ 1204 /************************************************************************/
1037 1205
1053 return; 1221 return;
1054 } 1222 }
1055 1223
1056 while (str < strend) 1224 while (str < strend)
1057 { 1225 {
1058 charsets[CHAR_LEADING_BYTE (charptr_emchar (str)) - MIN_LEADING_BYTE] = 1226 charsets[emchar_leading_byte (charptr_emchar (str)) - MIN_LEADING_BYTE] =
1059 1; 1227 1;
1060 INC_CHARPTR (str); 1228 INC_CHARPTR (str);
1061 } 1229 }
1062 #endif 1230 #endif
1063 } 1231 }
1081 return; 1249 return;
1082 } 1250 }
1083 1251
1084 for (i = 0; i < len; i++) 1252 for (i = 0; i < len; i++)
1085 { 1253 {
1086 charsets[CHAR_LEADING_BYTE (str[i]) - MIN_LEADING_BYTE] = 1; 1254 charsets[emchar_leading_byte (str[i]) - MIN_LEADING_BYTE] = 1;
1087 } 1255 }
1088 #endif 1256 #endif
1089 } 1257 }
1090 1258
1091 int 1259 int
1096 1264
1097 while (str < end) 1265 while (str < end)
1098 { 1266 {
1099 #ifdef MULE 1267 #ifdef MULE
1100 Emchar ch = charptr_emchar (str); 1268 Emchar ch = charptr_emchar (str);
1101 cols += XCHARSET_COLUMNS (CHAR_CHARSET (ch)); 1269 cols += XCHARSET_COLUMNS (emchar_charset (ch));
1102 #else 1270 #else
1103 cols++; 1271 cols++;
1104 #endif 1272 #endif
1105 INC_CHARPTR (str); 1273 INC_CHARPTR (str);
1106 } 1274 }
1114 #ifdef MULE 1282 #ifdef MULE
1115 int cols = 0; 1283 int cols = 0;
1116 int i; 1284 int i;
1117 1285
1118 for (i = 0; i < len; i++) 1286 for (i = 0; i < len; i++)
1119 cols += XCHARSET_COLUMNS (CHAR_CHARSET (str[i])); 1287 cols += XCHARSET_COLUMNS (emchar_charset (str[i]));
1120 1288
1121 return cols; 1289 return cols;
1122 #else /* not MULE */ 1290 #else /* not MULE */
1123 return len; 1291 return len;
1124 #endif 1292 #endif
1131 const Intbyte *end = str + len; 1299 const Intbyte *end = str + len;
1132 Charcount retval = 0; 1300 Charcount retval = 0;
1133 1301
1134 while (str < end) 1302 while (str < end)
1135 { 1303 {
1136 if (!BYTE_ASCII_P (*str)) 1304 if (!byte_ascii_p (*str))
1137 retval++; 1305 retval++;
1138 INC_CHARPTR (str); 1306 INC_CHARPTR (str);
1139 } 1307 }
1140 1308
1141 return retval; 1309 return retval;
1266 qxetextcasecmp (src, len, dst, dstlen)); 1434 qxetextcasecmp (src, len, dst, dstlen));
1267 } 1435 }
1268 } 1436 }
1269 1437
1270 Intbyte * 1438 Intbyte *
1271 eicpyout_malloc_fmt (Eistring *eistr, Bytecount *len_out, Internal_Format fmt) 1439 eicpyout_malloc_fmt (Eistring *eistr, Bytecount *len_out, Internal_Format fmt,
1440 Lisp_Object object)
1272 { 1441 {
1273 Intbyte *ptr; 1442 Intbyte *ptr;
1274 1443
1275 assert (fmt == FORMAT_DEFAULT); 1444 assert (fmt == FORMAT_DEFAULT);
1276 ptr = xnew_array (Intbyte, eistr->bytelen_ + 1); 1445 ptr = xnew_array (Intbyte, eistr->bytelen_ + 1);
1287 1456
1288 /* Optimization. Do it. Live it. Love it. */ 1457 /* Optimization. Do it. Live it. Love it. */
1289 1458
1290 #ifdef MULE 1459 #ifdef MULE
1291 1460
1292 /* We include the basic functions here that require no specific 1461 /* Skip as many ASCII bytes as possible in the memory block [PTR, END).
1293 knowledge of how data is Mule-encoded into a buffer other 1462 Return pointer to the first non-ASCII byte. optimized for long
1294 than the basic (00 - 7F), (80 - 9F), (A0 - FF) scheme. 1463 stretches of ASCII. */
1295 Anything that requires more specific knowledge goes into 1464 inline static const Intbyte *
1296 mule-charset.c. */ 1465 skip_ascii (const Intbyte *ptr, const Intbyte *end)
1297 1466 {
1298 /* Given a pointer to a text string and a length in bytes, return 1467 #ifdef EFFICIENT_INT_128_BIT
1299 the equivalent length in characters. */ 1468 # define STRIDE_TYPE INT_128_BIT
1300 1469 # define HIGH_BIT_MASK \
1301 Charcount 1470 MAKE_128_BIT_UNSIGNED_CONSTANT (0x80808080808080808080808080808080)
1302 bytecount_to_charcount (const Intbyte *ptr, Bytecount len) 1471 #elif defined (EFFICIENT_INT_64_BIT)
1303 { 1472 # define STRIDE_TYPE INT_64_BIT
1304 Charcount count = 0; 1473 # define HIGH_BIT_MASK MAKE_64_BIT_UNSIGNED_CONSTANT (0x8080808080808080)
1305 const Intbyte *end = ptr + len;
1306
1307 #if SIZEOF_LONG == 8
1308 # define STRIDE_TYPE long
1309 # define HIGH_BIT_MASK 0x8080808080808080UL
1310 #elif SIZEOF_LONG_LONG == 8 && !(defined (i386) || defined (__i386__))
1311 # define STRIDE_TYPE long long
1312 # define HIGH_BIT_MASK 0x8080808080808080ULL
1313 #elif SIZEOF_LONG == 4
1314 # define STRIDE_TYPE long
1315 # define HIGH_BIT_MASK 0x80808080UL
1316 #else 1474 #else
1317 # error Add support for 128-bit systems here 1475 # define STRIDE_TYPE INT_32_BIT
1476 # define HIGH_BIT_MASK MAKE_32_BIT_UNSIGNED_CONSTANT (0x80808080)
1318 #endif 1477 #endif
1319 1478
1320 #define ALIGN_BITS ((EMACS_UINT) (ALIGNOF (STRIDE_TYPE) - 1)) 1479 #define ALIGN_BITS ((EMACS_UINT) (ALIGNOF (STRIDE_TYPE) - 1))
1321 #define ALIGN_MASK (~ ALIGN_BITS) 1480 #define ALIGN_MASK (~ ALIGN_BITS)
1322 #define ALIGNED(ptr) ((((EMACS_UINT) ptr) & ALIGN_BITS) == 0) 1481 #define ALIGNED(ptr) ((((EMACS_UINT) ptr) & ALIGN_BITS) == 0)
1323 #define STRIDE sizeof (STRIDE_TYPE) 1482 #define STRIDE sizeof (STRIDE_TYPE)
1324 1483
1325 while (ptr < end) 1484 const unsigned STRIDE_TYPE *ascii_end;
1326 { 1485
1327 if (BYTE_ASCII_P (*ptr)) 1486 /* Need to do in 3 sections -- before alignment start, aligned chunk,
1328 { 1487 after alignment end. */
1329 /* optimize for long stretches of ASCII */ 1488 while (!ALIGNED (ptr))
1330 if (! ALIGNED (ptr)) 1489 {
1331 ptr++, count++; 1490 if (ptr == end || !byte_ascii_p (*ptr))
1332 else 1491 return ptr;
1333 { 1492 ptr++;
1334 const unsigned STRIDE_TYPE *ascii_end = 1493 }
1335 (const unsigned STRIDE_TYPE *) ptr; 1494 ascii_end = (const unsigned STRIDE_TYPE *) ptr;
1336 /* This loop screams, because we can detect ASCII 1495 /* This loop screams, because we can detect ASCII
1337 characters 4 or 8 at a time. */ 1496 characters 4 or 8 at a time. */
1338 while ((const Intbyte *) ascii_end + STRIDE <= end 1497 while ((const Intbyte *) ascii_end + STRIDE <= end
1339 && !(*ascii_end & HIGH_BIT_MASK)) 1498 && !(*ascii_end & HIGH_BIT_MASK))
1340 ascii_end++; 1499 ascii_end++;
1341 if ((Intbyte *) ascii_end == ptr) 1500 ptr = (Intbyte *) ascii_end;
1342 ptr++, count++; 1501 while (ptr < end && byte_ascii_p (*ptr))
1343 else 1502 ptr++;
1344 { 1503 return ptr;
1345 count += (Intbyte *) ascii_end - ptr; 1504 }
1346 ptr = (Intbyte *) ascii_end; 1505
1347 } 1506 /* Function equivalents of bytecount_to_charcount/charcount_to_bytecount.
1348 } 1507 These work on strings of all sizes but are more efficient than a simple
1349 } 1508 loop on large strings and probably less efficient on sufficiently small
1350 else 1509 strings. */
1351 { 1510
1352 /* optimize for successive characters from the same charset */ 1511 Charcount
1353 Intbyte leading_byte = *ptr; 1512 bytecount_to_charcount_fun (const Intbyte *ptr, Bytecount len)
1354 int bytes = REP_BYTES_BY_FIRST_BYTE (leading_byte); 1513 {
1355 while ((ptr < end) && (*ptr == leading_byte)) 1514 Charcount count = 0;
1356 ptr += bytes, count++; 1515 const Intbyte *end = ptr + len;
1357 } 1516 while (1)
1517 {
1518 const Intbyte *newptr = skip_ascii (ptr, end);
1519 count += newptr - ptr;
1520 ptr = newptr;
1521 if (ptr == end)
1522 break;
1523 {
1524 /* Optimize for successive characters from the same charset */
1525 Intbyte leading_byte = *ptr;
1526 int bytes = rep_bytes_by_first_byte (leading_byte);
1527 while (ptr < end && *ptr == leading_byte)
1528 ptr += bytes, count++;
1529 }
1358 } 1530 }
1359 1531
1360 /* Bomb out if the specified substring ends in the middle 1532 /* Bomb out if the specified substring ends in the middle
1361 of a character. Note that we might have already gotten 1533 of a character. Note that we might have already gotten
1362 a core dump above from an invalid reference, but at least 1534 a core dump above from an invalid reference, but at least
1366 text_checking_assert (ptr == end); 1538 text_checking_assert (ptr == end);
1367 1539
1368 return count; 1540 return count;
1369 } 1541 }
1370 1542
1371 /* Given a pointer to a text string and a length in characters, return
1372 the equivalent length in bytes. */
1373
1374 Bytecount 1543 Bytecount
1375 charcount_to_bytecount (const Intbyte *ptr, Charcount len) 1544 charcount_to_bytecount_fun (const Intbyte *ptr, Charcount len)
1376 { 1545 {
1377 const Intbyte *newptr = ptr; 1546 const Intbyte *newptr = ptr;
1378 1547 while (1)
1379 text_checking_assert (len >= 0); 1548 {
1380 while (len > 0) 1549 const Intbyte *newnewptr = skip_ascii (newptr, newptr + len);
1381 { 1550 len -= newnewptr - newptr;
1382 INC_CHARPTR (newptr); 1551 newptr = newnewptr;
1383 len--; 1552 if (!len)
1553 break;
1554 {
1555 /* Optimize for successive characters from the same charset */
1556 Intbyte leading_byte = *newptr;
1557 int bytes = rep_bytes_by_first_byte (leading_byte);
1558 while (len > 0 && *newptr == leading_byte)
1559 newptr += bytes, len--;
1560 }
1384 } 1561 }
1385 return newptr - ptr; 1562 return newptr - ptr;
1386 }
1387
1388 inline static void
1389 update_entirely_ascii_p_flag (struct buffer *buf)
1390 {
1391 buf->text->entirely_ascii_p = buf->text->z == buf->text->bufz;
1392 } 1563 }
1393 1564
1394 /* The next two functions are the actual meat behind the 1565 /* The next two functions are the actual meat behind the
1395 charbpos-to-bytebpos and bytebpos-to-charbpos conversions. Currently 1566 charbpos-to-bytebpos and bytebpos-to-charbpos conversions. Currently
1396 the method they use is fairly unsophisticated; see buffer.h. 1567 the method they use is fairly unsophisticated; see buffer.h.
1420 int diff_so_far; 1591 int diff_so_far;
1421 int add_to_cache = 0; 1592 int add_to_cache = 0;
1422 1593
1423 /* Check for some cached positions, for speed. */ 1594 /* Check for some cached positions, for speed. */
1424 if (x == BUF_PT (buf)) 1595 if (x == BUF_PT (buf))
1425 return BI_BUF_PT (buf); 1596 return BYTE_BUF_PT (buf);
1426 if (x == BUF_ZV (buf)) 1597 if (x == BUF_ZV (buf))
1427 return BI_BUF_ZV (buf); 1598 return BYTE_BUF_ZV (buf);
1428 if (x == BUF_BEGV (buf)) 1599 if (x == BUF_BEGV (buf))
1429 return BI_BUF_BEGV (buf); 1600 return BYTE_BUF_BEGV (buf);
1430 1601
1431 bufmin = buf->text->mule_bufmin; 1602 bufmin = buf->text->mule_bufmin;
1432 bufmax = buf->text->mule_bufmax; 1603 bufmax = buf->text->mule_bufmax;
1433 bytmin = buf->text->mule_bytmin; 1604 bytmin = buf->text->mule_bytmin;
1434 bytmax = buf->text->mule_bytmax; 1605 bytmax = buf->text->mule_bytmax;
1472 diffzv += heuristic_hack; 1643 diffzv += heuristic_hack;
1473 diffpt += heuristic_hack; 1644 diffpt += heuristic_hack;
1474 if (diffpt < diffmax && diffpt <= diffzv) 1645 if (diffpt < diffmax && diffpt <= diffzv)
1475 { 1646 {
1476 bufmax = bufmin = BUF_PT (buf); 1647 bufmax = bufmin = BUF_PT (buf);
1477 bytmax = bytmin = BI_BUF_PT (buf); 1648 bytmax = bytmin = BYTE_BUF_PT (buf);
1478 /* We set the size to 1 even though it doesn't really 1649 /* We set the size to 1 even though it doesn't really
1479 matter because the new known region contains no 1650 matter because the new known region contains no
1480 characters. We do this because this is the most 1651 characters. We do this because this is the most
1481 likely size of the characters around the new known 1652 likely size of the characters around the new known
1482 region, and we avoid potential yuckiness that is 1653 region, and we avoid potential yuckiness that is
1484 size = 1; 1655 size = 1;
1485 } 1656 }
1486 if (diffzv < diffmax) 1657 if (diffzv < diffmax)
1487 { 1658 {
1488 bufmax = bufmin = BUF_ZV (buf); 1659 bufmax = bufmin = BUF_ZV (buf);
1489 bytmax = bytmin = BI_BUF_ZV (buf); 1660 bytmax = bytmin = BYTE_BUF_ZV (buf);
1490 size = 1; 1661 size = 1;
1491 } 1662 }
1492 } 1663 }
1493 #ifdef ERROR_CHECK_TEXT 1664 #ifdef ERROR_CHECK_TEXT
1494 else if (x >= bufmin) 1665 else if (x >= bufmin)
1514 diffpt += heuristic_hack; 1685 diffpt += heuristic_hack;
1515 1686
1516 if (diffpt < diffmin && diffpt <= diffbegv) 1687 if (diffpt < diffmin && diffpt <= diffbegv)
1517 { 1688 {
1518 bufmax = bufmin = BUF_PT (buf); 1689 bufmax = bufmin = BUF_PT (buf);
1519 bytmax = bytmin = BI_BUF_PT (buf); 1690 bytmax = bytmin = BYTE_BUF_PT (buf);
1520 /* We set the size to 1 even though it doesn't really 1691 /* We set the size to 1 even though it doesn't really
1521 matter because the new known region contains no 1692 matter because the new known region contains no
1522 characters. We do this because this is the most 1693 characters. We do this because this is the most
1523 likely size of the characters around the new known 1694 likely size of the characters around the new known
1524 region, and we avoid potential yuckiness that is 1695 region, and we avoid potential yuckiness that is
1526 size = 1; 1697 size = 1;
1527 } 1698 }
1528 if (diffbegv < diffmin) 1699 if (diffbegv < diffmin)
1529 { 1700 {
1530 bufmax = bufmin = BUF_BEGV (buf); 1701 bufmax = bufmin = BUF_BEGV (buf);
1531 bytmax = bytmin = BI_BUF_BEGV (buf); 1702 bytmax = bytmin = BYTE_BUF_BEGV (buf);
1532 size = 1; 1703 size = 1;
1533 } 1704 }
1534 } 1705 }
1535 1706
1536 diff_so_far = x > bufmax ? x - bufmax : bufmin - x; 1707 diff_so_far = x > bufmax ? x - bufmax : bufmin - x;
1696 Charbpos retval; 1867 Charbpos retval;
1697 int diff_so_far; 1868 int diff_so_far;
1698 int add_to_cache = 0; 1869 int add_to_cache = 0;
1699 1870
1700 /* Check for some cached positions, for speed. */ 1871 /* Check for some cached positions, for speed. */
1701 if (x == BI_BUF_PT (buf)) 1872 if (x == BYTE_BUF_PT (buf))
1702 return BUF_PT (buf); 1873 return BUF_PT (buf);
1703 if (x == BI_BUF_ZV (buf)) 1874 if (x == BYTE_BUF_ZV (buf))
1704 return BUF_ZV (buf); 1875 return BUF_ZV (buf);
1705 if (x == BI_BUF_BEGV (buf)) 1876 if (x == BYTE_BUF_BEGV (buf))
1706 return BUF_BEGV (buf); 1877 return BUF_BEGV (buf);
1707 1878
1708 bufmin = buf->text->mule_bufmin; 1879 bufmin = buf->text->mule_bufmin;
1709 bufmax = buf->text->mule_bufmax; 1880 bufmax = buf->text->mule_bufmax;
1710 bytmin = buf->text->mule_bytmin; 1881 bytmin = buf->text->mule_bytmin;
1716 the upper bound of the known region up one character at a time, 1887 the upper bound of the known region up one character at a time,
1717 and moving the lower bound of the known region up as necessary 1888 and moving the lower bound of the known region up as necessary
1718 when the size of the character just seen changes. 1889 when the size of the character just seen changes.
1719 1890
1720 We optimize this, however, by first shifting the known region to 1891 We optimize this, however, by first shifting the known region to
1721 one of the cached points if it's close by. (We don't check BI_BEG or 1892 one of the cached points if it's close by. (We don't check BYTE_BEG or
1722 BI_Z, even though they're cached; most of the time these will be the 1893 BYTE_Z, even though they're cached; most of the time these will be the
1723 same as BI_BEGV and BI_ZV, and when they're not, they're not likely 1894 same as BYTE_BEGV and BYTE_ZV, and when they're not, they're not likely
1724 to be used.) */ 1895 to be used.) */
1725 1896
1726 if (x > bytmax) 1897 if (x > bytmax)
1727 { 1898 {
1728 Bytebpos diffmax = x - bytmax; 1899 Bytebpos diffmax = x - bytmax;
1729 Bytebpos diffpt = x - BI_BUF_PT (buf); 1900 Bytebpos diffpt = x - BYTE_BUF_PT (buf);
1730 Bytebpos diffzv = BI_BUF_ZV (buf) - x; 1901 Bytebpos diffzv = BYTE_BUF_ZV (buf) - x;
1731 /* #### This value could stand some more exploration. */ 1902 /* #### This value could stand some more exploration. */
1732 Bytecount heuristic_hack = (bytmax - bytmin) >> 2; 1903 Bytecount heuristic_hack = (bytmax - bytmin) >> 2;
1733 1904
1734 /* Check if the position is closer to PT or ZV than to the 1905 /* Check if the position is closer to PT or ZV than to the
1735 end of the known region. */ 1906 end of the known region. */
1738 diffpt = -diffpt; 1909 diffpt = -diffpt;
1739 if (diffzv < 0) 1910 if (diffzv < 0)
1740 diffzv = -diffzv; 1911 diffzv = -diffzv;
1741 1912
1742 /* But also implement a heuristic that favors the known region 1913 /* But also implement a heuristic that favors the known region
1743 over BI_PT or BI_ZV. The reason for this is that switching to 1914 over BYTE_PT or BYTE_ZV. The reason for this is that switching to
1744 BI_PT or BI_ZV will wipe out the knowledge in the known region, 1915 BYTE_PT or BYTE_ZV will wipe out the knowledge in the known region,
1745 which might be annoying if the known region is large and 1916 which might be annoying if the known region is large and
1746 BI_PT or BI_ZV is not that much closer than the end of the known 1917 BYTE_PT or BYTE_ZV is not that much closer than the end of the known
1747 region. */ 1918 region. */
1748 1919
1749 diffzv += heuristic_hack; 1920 diffzv += heuristic_hack;
1750 diffpt += heuristic_hack; 1921 diffpt += heuristic_hack;
1751 if (diffpt < diffmax && diffpt <= diffzv) 1922 if (diffpt < diffmax && diffpt <= diffzv)
1752 { 1923 {
1753 bufmax = bufmin = BUF_PT (buf); 1924 bufmax = bufmin = BUF_PT (buf);
1754 bytmax = bytmin = BI_BUF_PT (buf); 1925 bytmax = bytmin = BYTE_BUF_PT (buf);
1755 /* We set the size to 1 even though it doesn't really 1926 /* We set the size to 1 even though it doesn't really
1756 matter because the new known region contains no 1927 matter because the new known region contains no
1757 characters. We do this because this is the most 1928 characters. We do this because this is the most
1758 likely size of the characters around the new known 1929 likely size of the characters around the new known
1759 region, and we avoid potential yuckiness that is 1930 region, and we avoid potential yuckiness that is
1761 size = 1; 1932 size = 1;
1762 } 1933 }
1763 if (diffzv < diffmax) 1934 if (diffzv < diffmax)
1764 { 1935 {
1765 bufmax = bufmin = BUF_ZV (buf); 1936 bufmax = bufmin = BUF_ZV (buf);
1766 bytmax = bytmin = BI_BUF_ZV (buf); 1937 bytmax = bytmin = BYTE_BUF_ZV (buf);
1767 size = 1; 1938 size = 1;
1768 } 1939 }
1769 } 1940 }
1770 #ifdef ERROR_CHECK_TEXT 1941 #ifdef ERROR_CHECK_TEXT
1771 else if (x >= bytmin) 1942 else if (x >= bytmin)
1772 abort (); 1943 abort ();
1773 #endif 1944 #endif
1774 else 1945 else
1775 { 1946 {
1776 Bytebpos diffmin = bytmin - x; 1947 Bytebpos diffmin = bytmin - x;
1777 Bytebpos diffpt = BI_BUF_PT (buf) - x; 1948 Bytebpos diffpt = BYTE_BUF_PT (buf) - x;
1778 Bytebpos diffbegv = x - BI_BUF_BEGV (buf); 1949 Bytebpos diffbegv = x - BYTE_BUF_BEGV (buf);
1779 /* #### This value could stand some more exploration. */ 1950 /* #### This value could stand some more exploration. */
1780 Bytecount heuristic_hack = (bytmax - bytmin) >> 2; 1951 Bytecount heuristic_hack = (bytmax - bytmin) >> 2;
1781 1952
1782 if (diffpt < 0) 1953 if (diffpt < 0)
1783 diffpt = -diffpt; 1954 diffpt = -diffpt;
1791 diffpt += heuristic_hack; 1962 diffpt += heuristic_hack;
1792 1963
1793 if (diffpt < diffmin && diffpt <= diffbegv) 1964 if (diffpt < diffmin && diffpt <= diffbegv)
1794 { 1965 {
1795 bufmax = bufmin = BUF_PT (buf); 1966 bufmax = bufmin = BUF_PT (buf);
1796 bytmax = bytmin = BI_BUF_PT (buf); 1967 bytmax = bytmin = BYTE_BUF_PT (buf);
1797 /* We set the size to 1 even though it doesn't really 1968 /* We set the size to 1 even though it doesn't really
1798 matter because the new known region contains no 1969 matter because the new known region contains no
1799 characters. We do this because this is the most 1970 characters. We do this because this is the most
1800 likely size of the characters around the new known 1971 likely size of the characters around the new known
1801 region, and we avoid potential yuckiness that is 1972 region, and we avoid potential yuckiness that is
1803 size = 1; 1974 size = 1;
1804 } 1975 }
1805 if (diffbegv < diffmin) 1976 if (diffbegv < diffmin)
1806 { 1977 {
1807 bufmax = bufmin = BUF_BEGV (buf); 1978 bufmax = bufmin = BUF_BEGV (buf);
1808 bytmax = bytmin = BI_BUF_BEGV (buf); 1979 bytmax = bytmin = BYTE_BUF_BEGV (buf);
1809 size = 1; 1980 size = 1;
1810 } 1981 }
1811 } 1982 }
1812 1983
1813 diff_so_far = x > bytmax ? x - bytmax : bytmin - x; 1984 diff_so_far = x > bytmax ? x - bytmax : bytmin - x;
1979 buf->text->mule_bytebpos_cache[i] += bytelength; 2150 buf->text->mule_bytebpos_cache[i] += bytelength;
1980 } 2151 }
1981 } 2152 }
1982 2153
1983 if (start >= buf->text->mule_bufmax) 2154 if (start >= buf->text->mule_bufmax)
1984 goto done; 2155 return;
1985 2156
1986 /* The insertion is either before the known region, in which case 2157 /* The insertion is either before the known region, in which case
1987 it shoves it forward; or within the known region, in which case 2158 it shoves it forward; or within the known region, in which case
1988 it shoves the end forward. (But it may make the known region 2159 it shoves the end forward. (But it may make the known region
1989 inconsistent, so we may have to shorten it.) */ 2160 inconsistent, so we may have to shorten it.) */
2052 buf->text->mule_bufmin = end; 2223 buf->text->mule_bufmin = end;
2053 buf->text->mule_bytmin = byteend; 2224 buf->text->mule_bytmin = byteend;
2054 } 2225 }
2055 } 2226 }
2056 } 2227 }
2057 done: 2228 }
2058 update_entirely_ascii_p_flag (buf); 2229
2059 } 2230 /* Text from START to END (equivalent in Bytebpos's: from BYTE_START to
2060 2231 BYTE_END) was deleted. */
2061 /* Text from START to END (equivalent in Bytebposs: from BI_START to
2062 BI_END) was deleted. */
2063 2232
2064 void 2233 void
2065 buffer_mule_signal_deleted_region (struct buffer *buf, Charbpos start, 2234 buffer_mule_signal_deleted_region (struct buffer *buf, Charbpos start,
2066 Charbpos end, Bytebpos bi_start, 2235 Charbpos end, Bytebpos byte_start,
2067 Bytebpos bi_end) 2236 Bytebpos byte_end)
2068 { 2237 {
2069 int i; 2238 int i;
2070 2239
2071 /* Adjust the cache of known positions. */ 2240 /* Adjust the cache of known positions. */
2072 for (i = 0; i < 16; i++) 2241 for (i = 0; i < 16; i++)
2073 { 2242 {
2074 /* After the end; gets shoved backward */ 2243 /* After the end; gets shoved backward */
2075 if (buf->text->mule_charbpos_cache[i] > end) 2244 if (buf->text->mule_charbpos_cache[i] > end)
2076 { 2245 {
2077 buf->text->mule_charbpos_cache[i] -= end - start; 2246 buf->text->mule_charbpos_cache[i] -= end - start;
2078 buf->text->mule_bytebpos_cache[i] -= bi_end - bi_start; 2247 buf->text->mule_bytebpos_cache[i] -= byte_end - byte_start;
2079 } 2248 }
2080 /* In the range; moves to start of range */ 2249 /* In the range; moves to start of range */
2081 else if (buf->text->mule_charbpos_cache[i] > start) 2250 else if (buf->text->mule_charbpos_cache[i] > start)
2082 { 2251 {
2083 buf->text->mule_charbpos_cache[i] = start; 2252 buf->text->mule_charbpos_cache[i] = start;
2084 buf->text->mule_bytebpos_cache[i] = bi_start; 2253 buf->text->mule_bytebpos_cache[i] = byte_start;
2085 } 2254 }
2086 } 2255 }
2087 2256
2088 /* We don't care about any text after the end of the known region. */ 2257 /* We don't care about any text after the end of the known region. */
2089 2258
2090 end = min (end, buf->text->mule_bufmax); 2259 end = min (end, buf->text->mule_bufmax);
2091 bi_end = min (bi_end, buf->text->mule_bytmax); 2260 byte_end = min (byte_end, buf->text->mule_bytmax);
2092 if (start >= end) 2261 if (start >= end)
2093 goto done; 2262 return;
2094 2263
2095 /* The end of the known region offsets by the total amount of deletion, 2264 /* The end of the known region offsets by the total amount of deletion,
2096 since it's all before it. */ 2265 since it's all before it. */
2097 2266
2098 buf->text->mule_bufmax -= end - start; 2267 buf->text->mule_bufmax -= end - start;
2099 buf->text->mule_bytmax -= bi_end - bi_start; 2268 buf->text->mule_bytmax -= byte_end - byte_start;
2100 2269
2101 /* Now we don't care about any text after the start of the known region. */ 2270 /* Now we don't care about any text after the start of the known region. */
2102 2271
2103 end = min (end, buf->text->mule_bufmin); 2272 end = min (end, buf->text->mule_bufmin);
2104 bi_end = min (bi_end, buf->text->mule_bytmin); 2273 byte_end = min (byte_end, buf->text->mule_bytmin);
2105 if (start < end) 2274 if (start < end)
2106 { 2275 {
2107 buf->text->mule_bufmin -= end - start; 2276 buf->text->mule_bufmin -= end - start;
2108 buf->text->mule_bytmin -= bi_end - bi_start; 2277 buf->text->mule_bytmin -= byte_end - byte_start;
2109 } 2278 }
2110
2111 done:
2112 update_entirely_ascii_p_flag (buf);
2113 } 2279 }
2114 2280
2115 #endif /* MULE */ 2281 #endif /* MULE */
2116
2117 #ifdef ERROR_CHECK_TEXT
2118
2119 Bytebpos
2120 charbpos_to_bytebpos (struct buffer *buf, Charbpos x)
2121 {
2122 Bytebpos retval = real_charbpos_to_bytebpos (buf, x);
2123 ASSERT_VALID_BYTEBPOS_UNSAFE (buf, retval);
2124 return retval;
2125 }
2126
2127 Charbpos
2128 bytebpos_to_charbpos (struct buffer *buf, Bytebpos x)
2129 {
2130 ASSERT_VALID_BYTEBPOS_UNSAFE (buf, x);
2131 return real_bytebpos_to_charbpos (buf, x);
2132 }
2133
2134 #endif /* ERROR_CHECK_TEXT */
2135 2282
2136 2283
2137 /************************************************************************/ 2284 /************************************************************************/
2138 /* verifying buffer and string positions */ 2285 /* verifying buffer and string positions */
2139 /************************************************************************/ 2286 /************************************************************************/
2248 and to signal an error if the positions are out of range. 2395 and to signal an error if the positions are out of range.
2249 */ 2396 */
2250 2397
2251 void 2398 void
2252 get_buffer_range_char (struct buffer *b, Lisp_Object from, Lisp_Object to, 2399 get_buffer_range_char (struct buffer *b, Lisp_Object from, Lisp_Object to,
2253 Charbpos *from_out, Charbpos *to_out, unsigned int flags) 2400 Charbpos *from_out, Charbpos *to_out,
2401 unsigned int flags)
2254 { 2402 {
2255 /* Does not GC */ 2403 /* Does not GC */
2256 Charbpos min_allowed, max_allowed; 2404 Charbpos min_allowed, max_allowed;
2257 2405
2258 min_allowed = (flags & GB_ALLOW_PAST_ACCESSIBLE) ? 2406 min_allowed = (flags & GB_ALLOW_PAST_ACCESSIBLE) ?
2290 } 2438 }
2291 } 2439 }
2292 2440
2293 void 2441 void
2294 get_buffer_range_byte (struct buffer *b, Lisp_Object from, Lisp_Object to, 2442 get_buffer_range_byte (struct buffer *b, Lisp_Object from, Lisp_Object to,
2295 Bytebpos *from_out, Bytebpos *to_out, unsigned int flags) 2443 Bytebpos *from_out, Bytebpos *to_out,
2444 unsigned int flags)
2296 { 2445 {
2297 Charbpos s, e; 2446 Charbpos s, e;
2298 2447
2299 get_buffer_range_char (b, from, to, &s, &e, flags); 2448 get_buffer_range_char (b, from, to, &s, &e, flags);
2300 if (s >= 0) 2449 if (s >= 0)
2337 2486
2338 Charcount 2487 Charcount
2339 get_string_pos_char (Lisp_Object string, Lisp_Object pos, unsigned int flags) 2488 get_string_pos_char (Lisp_Object string, Lisp_Object pos, unsigned int flags)
2340 { 2489 {
2341 return get_string_pos_char_1 (string, pos, flags, 2490 return get_string_pos_char_1 (string, pos, flags,
2342 XSTRING_CHAR_LENGTH (string)); 2491 string_char_length (string));
2343 } 2492 }
2344 2493
2345 Bytecount 2494 Bytecount
2346 get_string_pos_byte (Lisp_Object string, Lisp_Object pos, unsigned int flags) 2495 get_string_pos_byte (Lisp_Object string, Lisp_Object pos, unsigned int flags)
2347 { 2496 {
2355 get_string_range_char (Lisp_Object string, Lisp_Object from, Lisp_Object to, 2504 get_string_range_char (Lisp_Object string, Lisp_Object from, Lisp_Object to,
2356 Charcount *from_out, Charcount *to_out, 2505 Charcount *from_out, Charcount *to_out,
2357 unsigned int flags) 2506 unsigned int flags)
2358 { 2507 {
2359 Charcount min_allowed = 0; 2508 Charcount min_allowed = 0;
2360 Charcount max_allowed = XSTRING_CHAR_LENGTH (string); 2509 Charcount max_allowed = string_char_length (string);
2361 2510
2362 if (NILP (from) && (flags & GB_ALLOW_NIL)) 2511 if (NILP (from) && (flags & GB_ALLOW_NIL))
2363 *from_out = min_allowed; 2512 *from_out = min_allowed;
2364 else 2513 else
2365 *from_out = get_string_pos_char_1 (string, from, 2514 *from_out = get_string_pos_char_1 (string, from,
2406 else 2555 else
2407 *to_out = -1; 2556 *to_out = -1;
2408 2557
2409 } 2558 }
2410 2559
2411 Charbpos 2560 Charxpos
2412 get_buffer_or_string_pos_char (Lisp_Object object, Lisp_Object pos, 2561 get_buffer_or_string_pos_char (Lisp_Object object, Lisp_Object pos,
2413 unsigned int flags) 2562 unsigned int flags)
2414 { 2563 {
2415 return STRINGP (object) ? 2564 return STRINGP (object) ?
2416 get_string_pos_char (object, pos, flags) : 2565 get_string_pos_char (object, pos, flags) :
2417 get_buffer_pos_char (XBUFFER (object), pos, flags); 2566 get_buffer_pos_char (XBUFFER (object), pos, flags);
2418 } 2567 }
2419 2568
2420 Bytebpos 2569 Bytexpos
2421 get_buffer_or_string_pos_byte (Lisp_Object object, Lisp_Object pos, 2570 get_buffer_or_string_pos_byte (Lisp_Object object, Lisp_Object pos,
2422 unsigned int flags) 2571 unsigned int flags)
2423 { 2572 {
2424 return STRINGP (object) ? 2573 return STRINGP (object) ?
2425 get_string_pos_byte (object, pos, flags) : 2574 get_string_pos_byte (object, pos, flags) :
2426 get_buffer_pos_byte (XBUFFER (object), pos, flags); 2575 get_buffer_pos_byte (XBUFFER (object), pos, flags);
2427 } 2576 }
2428 2577
2429 void 2578 void
2430 get_buffer_or_string_range_char (Lisp_Object object, Lisp_Object from, 2579 get_buffer_or_string_range_char (Lisp_Object object, Lisp_Object from,
2431 Lisp_Object to, Charbpos *from_out, 2580 Lisp_Object to, Charxpos *from_out,
2432 Charbpos *to_out, unsigned int flags) 2581 Charxpos *to_out, unsigned int flags)
2433 { 2582 {
2434 if (STRINGP (object)) 2583 if (STRINGP (object))
2435 get_string_range_char (object, from, to, from_out, to_out, flags); 2584 get_string_range_char (object, from, to, from_out, to_out, flags);
2436 else 2585 else
2437 get_buffer_range_char (XBUFFER (object), from, to, from_out, to_out, flags); 2586 get_buffer_range_char (XBUFFER (object), from, to, from_out, to_out,
2587 flags);
2438 } 2588 }
2439 2589
2440 void 2590 void
2441 get_buffer_or_string_range_byte (Lisp_Object object, Lisp_Object from, 2591 get_buffer_or_string_range_byte (Lisp_Object object, Lisp_Object from,
2442 Lisp_Object to, Bytebpos *from_out, 2592 Lisp_Object to, Bytexpos *from_out,
2443 Bytebpos *to_out, unsigned int flags) 2593 Bytexpos *to_out, unsigned int flags)
2444 { 2594 {
2445 if (STRINGP (object)) 2595 if (STRINGP (object))
2446 get_string_range_byte (object, from, to, from_out, to_out, flags); 2596 get_string_range_byte (object, from, to, from_out, to_out, flags);
2447 else 2597 else
2448 get_buffer_range_byte (XBUFFER (object), from, to, from_out, to_out, flags); 2598 get_buffer_range_byte (XBUFFER (object), from, to, from_out, to_out,
2599 flags);
2600 }
2601
2602 Charxpos
2603 buffer_or_string_accessible_begin_char (Lisp_Object object)
2604 {
2605 return STRINGP (object) ? 0 : BUF_BEGV (XBUFFER (object));
2606 }
2607
2608 Charxpos
2609 buffer_or_string_accessible_end_char (Lisp_Object object)
2610 {
2611 return STRINGP (object) ?
2612 string_char_length (object) : BUF_ZV (XBUFFER (object));
2613 }
2614
2615 Bytexpos
2616 buffer_or_string_accessible_begin_byte (Lisp_Object object)
2617 {
2618 return STRINGP (object) ? 0 : BYTE_BUF_BEGV (XBUFFER (object));
2619 }
2620
2621 Bytexpos
2622 buffer_or_string_accessible_end_byte (Lisp_Object object)
2623 {
2624 return STRINGP (object) ?
2625 XSTRING_LENGTH (object) : BYTE_BUF_ZV (XBUFFER (object));
2626 }
2627
2628 Charxpos
2629 buffer_or_string_absolute_begin_char (Lisp_Object object)
2630 {
2631 return STRINGP (object) ? 0 : BUF_BEG (XBUFFER (object));
2632 }
2633
2634 Charxpos
2635 buffer_or_string_absolute_end_char (Lisp_Object object)
2636 {
2637 return STRINGP (object) ?
2638 string_char_length (object) : BUF_Z (XBUFFER (object));
2639 }
2640
2641 Bytexpos
2642 buffer_or_string_absolute_begin_byte (Lisp_Object object)
2643 {
2644 return STRINGP (object) ? 0 : BYTE_BUF_BEG (XBUFFER (object));
2645 }
2646
2647 Bytexpos
2648 buffer_or_string_absolute_end_byte (Lisp_Object object)
2649 {
2650 return STRINGP (object) ?
2651 XSTRING_LENGTH (object) : BYTE_BUF_Z (XBUFFER (object));
2449 } 2652 }
2450 2653
2451 Charbpos 2654 Charbpos
2452 buffer_or_string_accessible_begin_char (Lisp_Object object) 2655 charbpos_clip_to_bounds (Charbpos lower, Charbpos num, Charbpos upper)
2453 { 2656 {
2454 return STRINGP (object) ? 0 : BUF_BEGV (XBUFFER (object)); 2657 return (num < lower ? lower :
2455 } 2658 num > upper ? upper :
2456 2659 num);
2457 Charbpos
2458 buffer_or_string_accessible_end_char (Lisp_Object object)
2459 {
2460 return STRINGP (object) ?
2461 XSTRING_CHAR_LENGTH (object) : BUF_ZV (XBUFFER (object));
2462 } 2660 }
2463 2661
2464 Bytebpos 2662 Bytebpos
2465 buffer_or_string_accessible_begin_byte (Lisp_Object object) 2663 bytebpos_clip_to_bounds (Bytebpos lower, Bytebpos num, Bytebpos upper)
2466 { 2664 {
2467 return STRINGP (object) ? 0 : BI_BUF_BEGV (XBUFFER (object)); 2665 return (num < lower ? lower :
2468 } 2666 num > upper ? upper :
2469 2667 num);
2470 Bytebpos 2668 }
2471 buffer_or_string_accessible_end_byte (Lisp_Object object) 2669
2472 { 2670 Charxpos
2473 return STRINGP (object) ? 2671 charxpos_clip_to_bounds (Charxpos lower, Charxpos num, Charxpos upper)
2474 XSTRING_LENGTH (object) : BI_BUF_ZV (XBUFFER (object)); 2672 {
2475 } 2673 return (num < lower ? lower :
2476 2674 num > upper ? upper :
2477 Charbpos 2675 num);
2478 buffer_or_string_absolute_begin_char (Lisp_Object object) 2676 }
2479 { 2677
2480 return STRINGP (object) ? 0 : BUF_BEG (XBUFFER (object)); 2678 Bytexpos
2481 } 2679 bytexpos_clip_to_bounds (Bytexpos lower, Bytexpos num, Bytexpos upper)
2482 2680 {
2483 Charbpos 2681 return (num < lower ? lower :
2484 buffer_or_string_absolute_end_char (Lisp_Object object) 2682 num > upper ? upper :
2485 { 2683 num);
2486 return STRINGP (object) ? 2684 }
2487 XSTRING_CHAR_LENGTH (object) : BUF_Z (XBUFFER (object)); 2685
2488 } 2686 /* These could be implemented in terms of the get_buffer_or_string()
2489 2687 functions above, but those are complicated and handle lots of weird
2490 Bytebpos 2688 cases stemming from uncertain external input. */
2491 buffer_or_string_absolute_begin_byte (Lisp_Object object) 2689
2492 { 2690 Charxpos
2493 return STRINGP (object) ? 0 : BI_BUF_BEG (XBUFFER (object)); 2691 buffer_or_string_clip_to_accessible_char (Lisp_Object object, Charxpos pos)
2494 } 2692 {
2495 2693 return (charxpos_clip_to_bounds
2496 Bytebpos 2694 (pos, buffer_or_string_accessible_begin_char (object),
2497 buffer_or_string_absolute_end_byte (Lisp_Object object) 2695 buffer_or_string_accessible_end_char (object)));
2498 { 2696 }
2499 return STRINGP (object) ? 2697
2500 XSTRING_LENGTH (object) : BI_BUF_Z (XBUFFER (object)); 2698 Bytexpos
2699 buffer_or_string_clip_to_accessible_byte (Lisp_Object object, Bytexpos pos)
2700 {
2701 return (bytexpos_clip_to_bounds
2702 (pos, buffer_or_string_accessible_begin_byte (object),
2703 buffer_or_string_accessible_end_byte (object)));
2704 }
2705
2706 Charxpos
2707 buffer_or_string_clip_to_absolute_char (Lisp_Object object, Charxpos pos)
2708 {
2709 return (charxpos_clip_to_bounds
2710 (pos, buffer_or_string_absolute_begin_char (object),
2711 buffer_or_string_absolute_end_char (object)));
2712 }
2713
2714 Bytexpos
2715 buffer_or_string_clip_to_absolute_byte (Lisp_Object object, Bytexpos pos)
2716 {
2717 return (bytexpos_clip_to_bounds
2718 (pos, buffer_or_string_absolute_begin_byte (object),
2719 buffer_or_string_absolute_end_byte (object)));
2501 } 2720 }
2502 2721
2503 2722
2504 /************************************************************************/ 2723 /************************************************************************/
2505 /* Implement TO_EXTERNAL_FORMAT, TO_INTERNAL_FORMAT */ 2724 /* Implement TO_EXTERNAL_FORMAT, TO_INTERNAL_FORMAT */
2595 { 2814 {
2596 const Intbyte *end; 2815 const Intbyte *end;
2597 for (end = ptr + len; ptr < end;) 2816 for (end = ptr + len; ptr < end;)
2598 { 2817 {
2599 Intbyte c = 2818 Intbyte c =
2600 (BYTE_ASCII_P (*ptr)) ? *ptr : 2819 (byte_ascii_p (*ptr)) ? *ptr :
2601 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) : 2820 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
2602 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) : 2821 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
2603 '~'; 2822 '~';
2604 2823
2605 Dynarr_add (conversion_out_dynarr, (Extbyte) c); 2824 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
2634 } 2853 }
2635 end = ptr + len; 2854 end = ptr + len;
2636 2855
2637 for (p = ptr; p < end; p++) 2856 for (p = ptr; p < end; p++)
2638 { 2857 {
2639 if (!BYTE_ASCII_P (*p)) 2858 if (!byte_ascii_p (*p))
2640 goto the_hard_way; 2859 goto the_hard_way;
2641 } 2860 }
2642 2861
2643 for (p = ptr; p < end; p++) 2862 for (p = ptr; p < end; p++)
2644 { 2863 {
2774 2993
2775 for (; ptr < end; ptr++) 2994 for (; ptr < end; ptr++)
2776 { 2995 {
2777 Intbyte c = *ptr; 2996 Intbyte c = *ptr;
2778 2997
2779 if (BYTE_ASCII_P (c)) 2998 if (byte_ascii_p (c))
2780 Dynarr_add (conversion_in_dynarr, c); 2999 Dynarr_add (conversion_in_dynarr, c);
2781 else if (BYTE_C1_P (c)) 3000 else if (byte_c1_p (c))
2782 { 3001 {
2783 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_CONTROL_1); 3002 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_CONTROL_1);
2784 Dynarr_add (conversion_in_dynarr, c + 0x20); 3003 Dynarr_add (conversion_in_dynarr, c + 0x20);
2785 } 3004 }
2786 else 3005 else
2817 3036
2818 for (; ptr < end; ptr += 2) 3037 for (; ptr < end; ptr += 2)
2819 { 3038 {
2820 Intbyte c = *ptr; 3039 Intbyte c = *ptr;
2821 3040
2822 if (BYTE_ASCII_P (c)) 3041 if (byte_ascii_p (c))
2823 Dynarr_add (conversion_in_dynarr, c); 3042 Dynarr_add (conversion_in_dynarr, c);
2824 #ifdef MULE 3043 #ifdef MULE
2825 else if (BYTE_C1_P (c)) 3044 else if (byte_c1_p (c))
2826 { 3045 {
2827 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_CONTROL_1); 3046 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_CONTROL_1);
2828 Dynarr_add (conversion_in_dynarr, c + 0x20); 3047 Dynarr_add (conversion_in_dynarr, c + 0x20);
2829 } 3048 }
2830 else 3049 else
2938 Intbyte lb; 3157 Intbyte lb;
2939 int c1, c2; 3158 int c1, c2;
2940 Lisp_Object charset; 3159 Lisp_Object charset;
2941 3160
2942 p = str; 3161 p = str;
2943 BREAKUP_CHAR (c, charset, c1, c2); 3162 BREAKUP_EMCHAR (c, charset, c1, c2);
2944 lb = CHAR_LEADING_BYTE (c); 3163 lb = emchar_leading_byte (c);
2945 if (LEADING_BYTE_PRIVATE_P (lb)) 3164 if (leading_byte_private_p (lb))
2946 *p++ = PRIVATE_LEADING_BYTE_PREFIX (lb); 3165 *p++ = private_leading_byte_prefix (lb);
2947 *p++ = lb; 3166 *p++ = lb;
2948 if (EQ (charset, Vcharset_control_1)) 3167 if (EQ (charset, Vcharset_control_1))
2949 c1 += 0x20; 3168 c1 += 0x20;
2950 *p++ = c1 | 0x80; 3169 *p++ = c1 | 0x80;
2951 if (c2) 3170 if (c2)
2965 Lisp_Object charset; 3184 Lisp_Object charset;
2966 3185
2967 if (i0 == LEADING_BYTE_CONTROL_1) 3186 if (i0 == LEADING_BYTE_CONTROL_1)
2968 return (Emchar) (*++str - 0x20); 3187 return (Emchar) (*++str - 0x20);
2969 3188
2970 if (LEADING_BYTE_PREFIX_P (i0)) 3189 if (leading_byte_prefix_p (i0))
2971 i0 = *++str; 3190 i0 = *++str;
2972 3191
2973 i1 = *++str & 0x7F; 3192 i1 = *++str & 0x7F;
2974 3193
2975 charset = CHARSET_BY_LEADING_BYTE (i0); 3194 charset = charset_by_leading_byte (i0);
2976 if (XCHARSET_DIMENSION (charset) == 2) 3195 if (XCHARSET_DIMENSION (charset) == 2)
2977 i2 = *++str & 0x7F; 3196 i2 = *++str & 0x7F;
2978 3197
2979 return MAKE_CHAR (charset, i1, i2); 3198 return make_emchar (charset, i1, i2);
2980 } 3199 }
2981 3200
2982 /* Return whether CH is a valid Emchar, assuming it's non-ASCII. 3201 /* Return whether CH is a valid Emchar, assuming it's non-ASCII.
2983 Do not call this directly. Use the macro valid_char_p() instead. */ 3202 Do not call this directly. Use the macro valid_emchar_p() instead. */
2984 3203
2985 int 3204 int
2986 non_ascii_valid_char_p (Emchar ch) 3205 non_ascii_valid_emchar_p (Emchar ch)
2987 { 3206 {
2988 int f1, f2, f3; 3207 int f1, f2, f3;
2989 3208
2990 /* Must have only lowest 19 bits set */ 3209 /* Must have only lowest 19 bits set */
2991 if (ch & ~0x7FFFF) 3210 if (ch & ~0x7FFFF)
2992 return 0; 3211 return 0;
2993 3212
2994 f1 = CHAR_FIELD1 (ch); 3213 f1 = emchar_field1 (ch);
2995 f2 = CHAR_FIELD2 (ch); 3214 f2 = emchar_field2 (ch);
2996 f3 = CHAR_FIELD3 (ch); 3215 f3 = emchar_field3 (ch);
2997 3216
2998 if (f1 == 0) 3217 if (f1 == 0)
2999 { 3218 {
3000 /* dimension-1 char */ 3219 /* dimension-1 char */
3001 Lisp_Object charset; 3220 Lisp_Object charset;
3002 3221
3003 /* leading byte must be correct */ 3222 /* leading byte must be correct */
3004 if (f2 < MIN_CHAR_FIELD2_OFFICIAL || 3223 if (f2 < MIN_EMCHAR_FIELD2_OFFICIAL ||
3005 (f2 > MAX_CHAR_FIELD2_OFFICIAL && f2 < MIN_CHAR_FIELD2_PRIVATE) || 3224 (f2 > MAX_EMCHAR_FIELD2_OFFICIAL && f2 < MIN_EMCHAR_FIELD2_PRIVATE) ||
3006 f2 > MAX_CHAR_FIELD2_PRIVATE) 3225 f2 > MAX_EMCHAR_FIELD2_PRIVATE)
3007 return 0; 3226 return 0;
3008 /* octet not out of range */ 3227 /* octet not out of range */
3009 if (f3 < 0x20) 3228 if (f3 < 0x20)
3010 return 0; 3229 return 0;
3011 /* charset exists */ 3230 /* charset exists */
3012 /* 3231 /*
3013 NOTE: This takes advantage of the fact that 3232 NOTE: This takes advantage of the fact that
3014 FIELD2_TO_OFFICIAL_LEADING_BYTE and 3233 FIELD2_TO_OFFICIAL_LEADING_BYTE and
3015 FIELD2_TO_PRIVATE_LEADING_BYTE are the same. 3234 FIELD2_TO_PRIVATE_LEADING_BYTE are the same.
3016 */ 3235 */
3017 charset = CHARSET_BY_LEADING_BYTE (f2 + FIELD2_TO_OFFICIAL_LEADING_BYTE); 3236 charset = charset_by_leading_byte (f2 + FIELD2_TO_OFFICIAL_LEADING_BYTE);
3018 if (EQ (charset, Qnil)) 3237 if (EQ (charset, Qnil))
3019 return 0; 3238 return 0;
3020 /* check range as per size (94 or 96) of charset */ 3239 /* check range as per size (94 or 96) of charset */
3021 return ((f3 > 0x20 && f3 < 0x7f) || XCHARSET_CHARS (charset) == 96); 3240 return ((f3 > 0x20 && f3 < 0x7f) || XCHARSET_CHARS (charset) == 96);
3022 } 3241 }
3024 { 3243 {
3025 /* dimension-2 char */ 3244 /* dimension-2 char */
3026 Lisp_Object charset; 3245 Lisp_Object charset;
3027 3246
3028 /* leading byte must be correct */ 3247 /* leading byte must be correct */
3029 if (f1 < MIN_CHAR_FIELD1_OFFICIAL || 3248 if (f1 < MIN_EMCHAR_FIELD1_OFFICIAL ||
3030 (f1 > MAX_CHAR_FIELD1_OFFICIAL && f1 < MIN_CHAR_FIELD1_PRIVATE) || 3249 (f1 > MAX_EMCHAR_FIELD1_OFFICIAL && f1 < MIN_EMCHAR_FIELD1_PRIVATE) ||
3031 f1 > MAX_CHAR_FIELD1_PRIVATE) 3250 f1 > MAX_EMCHAR_FIELD1_PRIVATE)
3032 return 0; 3251 return 0;
3033 /* octets not out of range */ 3252 /* octets not out of range */
3034 if (f2 < 0x20 || f3 < 0x20) 3253 if (f2 < 0x20 || f3 < 0x20)
3035 return 0; 3254 return 0;
3036 3255
3044 return 1; 3263 return 1;
3045 } 3264 }
3046 #endif /* ENABLE_COMPOSITE_CHARS */ 3265 #endif /* ENABLE_COMPOSITE_CHARS */
3047 3266
3048 /* charset exists */ 3267 /* charset exists */
3049 if (f1 <= MAX_CHAR_FIELD1_OFFICIAL) 3268 if (f1 <= MAX_EMCHAR_FIELD1_OFFICIAL)
3050 charset = 3269 charset =
3051 CHARSET_BY_LEADING_BYTE (f1 + FIELD1_TO_OFFICIAL_LEADING_BYTE); 3270 charset_by_leading_byte (f1 + FIELD1_TO_OFFICIAL_LEADING_BYTE);
3052 else 3271 else
3053 charset = 3272 charset =
3054 CHARSET_BY_LEADING_BYTE (f1 + FIELD1_TO_PRIVATE_LEADING_BYTE); 3273 charset_by_leading_byte (f1 + FIELD1_TO_PRIVATE_LEADING_BYTE);
3055 3274
3056 if (EQ (charset, Qnil)) 3275 if (EQ (charset, Qnil))
3057 return 0; 3276 return 0;
3058 /* check range as per size (94x94 or 96x96) of charset */ 3277 /* check range as per size (94x94 or 96x96) of charset */
3059 return ((f2 != 0x20 && f2 != 0x7F && f3 != 0x20 && f3 != 0x7F) || 3278 return ((f2 != 0x20 && f2 != 0x7F && f3 != 0x20 && f3 != 0x7F) ||
3060 XCHARSET_CHARS (charset) == 96); 3279 XCHARSET_CHARS (charset) == 96);
3061 } 3280 }
3062 } 3281 }
3063 3282
3064 /* Copy the character pointed to by SRC into DST. Do not call this 3283 /* Copy the character pointed to by SRC into DST. Do not call this
3065 directly. Use the macro charptr_copy_char() instead. 3284 directly. Use the macro charptr_copy_emchar() instead.
3066 Return the number of bytes copied. */ 3285 Return the number of bytes copied. */
3067 3286
3068 Bytecount 3287 Bytecount
3069 non_ascii_charptr_copy_char (const Intbyte *src, Intbyte *dst) 3288 non_ascii_charptr_copy_emchar (const Intbyte *src, Intbyte *dst)
3070 { 3289 {
3071 Bytecount bytes = REP_BYTES_BY_FIRST_BYTE (*src); 3290 Bytecount bytes = rep_bytes_by_first_byte (*src);
3072 Bytecount i; 3291 Bytecount i;
3073 for (i = bytes; i; i--, dst++, src++) 3292 for (i = bytes; i; i--, dst++, src++)
3074 *dst = *src; 3293 *dst = *src;
3075 return bytes; 3294 return bytes;
3076 } 3295 }
3095 Intbyte *strptr = str; 3314 Intbyte *strptr = str;
3096 Bytecount bytes; 3315 Bytecount bytes;
3097 3316
3098 str[0] = (Intbyte) ch; 3317 str[0] = (Intbyte) ch;
3099 3318
3100 for (bytes = REP_BYTES_BY_FIRST_BYTE (ch) - 1; bytes; bytes--) 3319 for (bytes = rep_bytes_by_first_byte (ch) - 1; bytes; bytes--)
3101 { 3320 {
3102 int c = Lstream_getc (stream); 3321 int c = Lstream_getc (stream);
3103 text_checking_assert (c >= 0); 3322 text_checking_assert (c >= 0);
3104 *++strptr = (Intbyte) c; 3323 *++strptr = (Intbyte) c;
3105 } 3324 }
3222 if (CHARSET_DIMENSION (cs) == 1) 3441 if (CHARSET_DIMENSION (cs) == 1)
3223 { 3442 {
3224 if (!NILP (arg2)) 3443 if (!NILP (arg2))
3225 invalid_argument 3444 invalid_argument
3226 ("Charset is of dimension one; second octet must be nil", arg2); 3445 ("Charset is of dimension one; second octet must be nil", arg2);
3227 return make_char (MAKE_CHAR (charset, a1, 0)); 3446 return make_char (make_emchar (charset, a1, 0));
3228 } 3447 }
3229 3448
3230 CHECK_INT (arg2); 3449 CHECK_INT (arg2);
3231 a2 = XINT (arg2) & 0x7f; 3450 a2 = XINT (arg2) & 0x7f;
3232 if (a2 < lowlim || a2 > highlim) 3451 if (a2 < lowlim || a2 > highlim)
3233 args_out_of_range_3 (arg2, make_int (lowlim), make_int (highlim)); 3452 args_out_of_range_3 (arg2, make_int (lowlim), make_int (highlim));
3234 3453
3235 return make_char (MAKE_CHAR (charset, a1, a2)); 3454 return make_char (make_emchar (charset, a1, a2));
3236 #else 3455 #else
3237 int a1; 3456 int a1;
3238 int lowlim, highlim; 3457 int lowlim, highlim;
3239 3458
3240 if (EQ (charset, Qascii)) lowlim = 0, highlim = 127; 3459 if (EQ (charset, Qascii)) lowlim = 0, highlim = 127;
3263 */ 3482 */
3264 (ch)) 3483 (ch))
3265 { 3484 {
3266 CHECK_CHAR_COERCE_INT (ch); 3485 CHECK_CHAR_COERCE_INT (ch);
3267 3486
3268 return XCHARSET_NAME (CHARSET_BY_LEADING_BYTE 3487 return XCHARSET_NAME (charset_by_leading_byte
3269 (CHAR_LEADING_BYTE (XCHAR (ch)))); 3488 (emchar_leading_byte (XCHAR (ch))));
3270 } 3489 }
3271 3490
3272 DEFUN ("char-octet", Fchar_octet, 1, 2, 0, /* 3491 DEFUN ("char-octet", Fchar_octet, 1, 2, 0, /*
3273 Return the octet numbered N (should be 0 or 1) of char CH. 3492 Return the octet numbered N (should be 0 or 1) of char CH.
3274 N defaults to 0 if omitted. 3493 N defaults to 0 if omitted.
3278 Lisp_Object charset; 3497 Lisp_Object charset;
3279 int octet0, octet1; 3498 int octet0, octet1;
3280 3499
3281 CHECK_CHAR_COERCE_INT (ch); 3500 CHECK_CHAR_COERCE_INT (ch);
3282 3501
3283 BREAKUP_CHAR (XCHAR (ch), charset, octet0, octet1); 3502 BREAKUP_EMCHAR (XCHAR (ch), charset, octet0, octet1);
3284 3503
3285 if (NILP (n) || EQ (n, Qzero)) 3504 if (NILP (n) || EQ (n, Qzero))
3286 return make_int (octet0); 3505 return make_int (octet0);
3287 else if (EQ (n, make_int (1))) 3506 else if (EQ (n, make_int (1)))
3288 return make_int (octet1); 3507 return make_int (octet1);
3302 int c1, c2; 3521 int c1, c2;
3303 3522
3304 GCPRO2 (charset, rc); 3523 GCPRO2 (charset, rc);
3305 CHECK_CHAR_COERCE_INT (character); 3524 CHECK_CHAR_COERCE_INT (character);
3306 3525
3307 BREAKUP_CHAR (XCHAR (character), charset, c1, c2); 3526 BREAKUP_EMCHAR (XCHAR (character), charset, c1, c2);
3308 3527
3309 if (XCHARSET_DIMENSION (Fget_charset (charset)) == 2) 3528 if (XCHARSET_DIMENSION (Fget_charset (charset)) == 2)
3310 { 3529 {
3311 rc = list3 (XCHARSET_NAME (charset), make_int (c1), make_int (c2)); 3530 rc = list3 (XCHARSET_NAME (charset), make_int (c1), make_int (c2));
3312 } 3531 }
3339 3558
3340 if (UNBOUNDP (ch)) 3559 if (UNBOUNDP (ch))
3341 { 3560 {
3342 if (composite_char_row_next >= 128) 3561 if (composite_char_row_next >= 128)
3343 invalid_operation ("No more composite chars available", lispstr); 3562 invalid_operation ("No more composite chars available", lispstr);
3344 emch = MAKE_CHAR (Vcharset_composite, composite_char_row_next, 3563 emch = make_emchar (Vcharset_composite, composite_char_row_next,
3345 composite_char_col_next); 3564 composite_char_col_next);
3346 Fputhash (make_char (emch), lispstr, 3565 Fputhash (make_char (emch), lispstr,
3347 Vcomposite_char_char2string_hash_table); 3566 Vcomposite_char_char2string_hash_table);
3348 Fputhash (lispstr, make_char (emch), 3567 Fputhash (lispstr, make_char (emch),
3349 Vcomposite_char_string2char_hash_table); 3568 Vcomposite_char_string2char_hash_table);
3367 Qunbound); 3586 Qunbound);
3368 assert (!UNBOUNDP (str)); 3587 assert (!UNBOUNDP (str));
3369 return str; 3588 return str;
3370 } 3589 }
3371 3590
3372 xxDEFUN ("make-composite-char", Fmake_composite_char, 1, 1, 0, /* 3591 DEFUN ("make-composite-char", Fmake_composite_char, 1, 1, 0, /*
3373 Convert a string into a single composite character. 3592 Convert a string into a single composite character.
3374 The character is the result of overstriking all the characters in 3593 The character is the result of overstriking all the characters in
3375 the string. 3594 the string.
3376 */ 3595 */
3377 (string)) 3596 (string))
3379 CHECK_STRING (string); 3598 CHECK_STRING (string);
3380 return make_char (lookup_composite_char (XSTRING_DATA (string), 3599 return make_char (lookup_composite_char (XSTRING_DATA (string),
3381 XSTRING_LENGTH (string))); 3600 XSTRING_LENGTH (string)));
3382 } 3601 }
3383 3602
3384 xxDEFUN ("composite-char-string", Fcomposite_char_string, 1, 1, 0, /* 3603 DEFUN ("composite-char-string", Fcomposite_char_string, 1, 1, 0, /*
3385 Return a string of the characters comprising a composite character. 3604 Return a string of the characters comprising a composite character.
3386 */ 3605 */
3387 (ch)) 3606 (ch))
3388 { 3607 {
3389 Emchar emch; 3608 Emchar emch;
3390 3609
3391 CHECK_CHAR (ch); 3610 CHECK_CHAR (ch);
3392 emch = XCHAR (ch); 3611 emch = XCHAR (ch);
3393 if (CHAR_LEADING_BYTE (emch) != LEADING_BYTE_COMPOSITE) 3612 if (emchar_leading_byte (emch) != LEADING_BYTE_COMPOSITE)
3394 invalid_argument ("Must be composite char", ch); 3613 invalid_argument ("Must be composite char", ch);
3395 return composite_char_string (emch); 3614 return composite_char_string (emch);
3396 } 3615 }
3397 #endif /* ENABLE_COMPOSITE_CHARS */ 3616 #endif /* ENABLE_COMPOSITE_CHARS */
3398 3617