771
|
1 /* Buffer manipulation primitives for XEmacs.
|
|
2 Copyright (C) 1995 Sun Microsystems, Inc.
|
1292
|
3 Copyright (C) 1995, 1996, 2000, 2001, 2002, 2003 Ben Wing.
|
771
|
4 Copyright (C) 1999 Martin Buchholz.
|
|
5
|
|
6 This file is part of XEmacs.
|
|
7
|
|
8 XEmacs is free software; you can redistribute it and/or modify it
|
|
9 under the terms of the GNU General Public License as published by the
|
|
10 Free Software Foundation; either version 2, or (at your option) any
|
|
11 later version.
|
|
12
|
|
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
|
|
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
16 for more details.
|
|
17
|
|
18 You should have received a copy of the GNU General Public License
|
|
19 along with XEmacs; see the file COPYING. If not, write to
|
|
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
21 Boston, MA 02111-1307, USA. */
|
|
22
|
|
23 /* Synched up with: Not in FSF. */
|
|
24
|
|
25 /* Authorship:
|
|
26 */
|
|
27
|
|
28 #include <config.h>
|
|
29 #include "lisp.h"
|
|
30
|
|
31 #include "buffer.h"
|
|
32 #include "charset.h"
|
|
33 #include "file-coding.h"
|
|
34 #include "lstream.h"
|
1292
|
35 #include "profile.h"
|
771
|
36
|
|
37
|
|
38 /************************************************************************/
|
|
39 /* long comments */
|
|
40 /************************************************************************/
|
|
41
|
|
42 /*
|
826
|
43 ==========================================================================
|
1292
|
44 1. Intro to Characters, Character Sets, and Encodings
|
826
|
45 ==========================================================================
|
771
|
46
|
826
|
47 A character (which is, BTW, a surprisingly complex concept) is, in a
|
|
48 written representation of text, the most basic written unit that has a
|
|
49 meaning of its own. It's comparable to a phoneme when analyzing words
|
1292
|
50 in spoken speech (for example, the sound of `t' in English, which in
|
|
51 fact has different pronunciations in different words -- aspirated in
|
|
52 `time', unaspirated in `stop', unreleased or even pronounced as a
|
|
53 glottal stop in `button', etc. -- but logically is a single concept).
|
|
54 Like a phoneme, a character is an abstract concept defined by its
|
|
55 *meaning*. The character `lowercase f', for example, can always be used
|
|
56 to represent the first letter in the word `fill', regardless of whether
|
|
57 it's drawn upright or italic, whether the `fi' combination is drawn as a
|
|
58 single ligature, whether there are serifs on the bottom of the vertical
|
|
59 stroke, etc. (These different appearances of a single character are
|
|
60 often called "graphs" or "glyphs".) Our concern when representing text
|
|
61 is on representing the abstract characters, and not on their exact
|
|
62 appearance.
|
|
63
|
|
64 A character set (or "charset"), as we define it, is a set of characters,
|
|
65 each with an associated number (or set of numbers -- see below), called
|
|
66 a "code point". It's important to understand that a character is not
|
|
67 defined by any number attached to it, but by its meaning. For example,
|
|
68 ASCII and EBCDIC are two charsets containing exactly the same characters
|
|
69 (lowercase and uppercase letters, numbers 0 through 9, particular
|
|
70 punctuation marks) but with different numberings. The `comma' character
|
|
71 in ASCII and EBCDIC, for instance, is the same character despite having
|
|
72 a different numbering. Conversely, when comparing ASCII and JIS-Roman,
|
|
73 which look the same except that the latter has a yen sign substituted
|
|
74 for the backslash, we would say that the backslash and yen sign are
|
|
75 *not* the same characters, despite having the same number (92) and
|
|
76 despite the fact that all other characters are present in both charsets,
|
|
77 with the same numbering. ASCII and JIS-Roman, then, do *not* have
|
|
78 exactly the same characters in them (ASCII has a backslash character but
|
|
79 no yen-sign character, and vice-versa for JIS-Roman), unlike ASCII and
|
|
80 EBCDIC, even though the numberings in ASCII and JIS-Roman are closer.
|
|
81
|
|
82 It's also important to distinguish between charsets and encodings. For
|
|
83 a simple charset like ASCII, there is only one encoding normally used --
|
|
84 each character is represented by a single byte, with the same value as
|
|
85 its code point. For more complicated charsets, however, things are not
|
|
86 so obvious. Unicode version 2, for example, is a large charset with
|
|
87 thousands of characters, each indexed by a 16-bit number, often
|
|
88 represented in hex, e.g. 0x05D0 for the Hebrew letter "aleph". One
|
|
89 obvious encoding uses two bytes per character (actually two encodings,
|
|
90 depending on which of the two possible byte orderings is chosen). This
|
|
91 encoding is convenient for internal processing of Unicode text; however,
|
|
92 it's incompatible with ASCII, so a different encoding, e.g. UTF-8, is
|
|
93 usually used for external text, for example files or e-mail. UTF-8
|
|
94 represents Unicode characters with one to three bytes (often extended to
|
|
95 six bytes to handle characters with up to 31-bit indices). Unicode
|
|
96 characters 00 to 7F (identical with ASCII) are directly represented with
|
|
97 one byte, and other characters with two or more bytes, each in the range
|
|
98 80 to FF.
|
|
99
|
|
100 In general, a single encoding may be able to represent more than one
|
|
101 charset.
|
|
102
|
|
103 See also man/lispref/mule.texi.
|
826
|
104
|
1292
|
105 ==========================================================================
|
|
106 2. Character Sets
|
|
107 ==========================================================================
|
|
108
|
771
|
109 A particular character in a charset is indexed using one or
|
|
110 more "position codes", which are non-negative integers.
|
|
111 The number of position codes needed to identify a particular
|
|
112 character in a charset is called the "dimension" of the
|
|
113 charset. In XEmacs/Mule, all charsets have 1 or 2 dimensions,
|
|
114 and the size of all charsets (except for a few special cases)
|
|
115 is either 94, 96, 94 by 94, or 96 by 96. The range of
|
|
116 position codes used to index characters from any of these
|
|
117 types of character sets is as follows:
|
|
118
|
|
119 Charset type Position code 1 Position code 2
|
|
120 ------------------------------------------------------------
|
|
121 94 33 - 126 N/A
|
|
122 96 32 - 127 N/A
|
|
123 94x94 33 - 126 33 - 126
|
|
124 96x96 32 - 127 32 - 127
|
|
125
|
|
126 Note that in the above cases position codes do not start at
|
|
127 an expected value such as 0 or 1. The reason for this will
|
|
128 become clear later.
|
|
129
|
|
130 For example, Latin-1 is a 96-character charset, and JISX0208
|
|
131 (the Japanese national character set) is a 94x94-character
|
|
132 charset.
|
|
133
|
|
134 [Note that, although the ranges above define the *valid*
|
|
135 position codes for a charset, some of the slots in a particular
|
|
136 charset may in fact be empty. This is the case for JISX0208,
|
|
137 for example, where (e.g.) all the slots whose first
|
|
138 position code is in the range 117 - 126 are empty.]
|
|
139
|
|
140 There are three charsets that do not follow the above rules.
|
|
141 All of them have one dimension, and have ranges of position
|
|
142 codes as follows:
|
|
143
|
|
144 Charset name Position code 1
|
|
145 ------------------------------------
|
|
146 ASCII 0 - 127
|
|
147 Control-1 0 - 31
|
|
148 Composite 0 - some large number
|
|
149
|
|
150 (The upper bound of the position code for composite characters
|
|
151 has not yet been determined, but it will probably be at
|
|
152 least 16,383).
|
|
153
|
|
154 ASCII is the union of two subsidiary character sets:
|
|
155 Printing-ASCII (the printing ASCII character set,
|
|
156 consisting of position codes 33 - 126, like for a standard
|
|
157 94-character charset) and Control-ASCII (the non-printing
|
|
158 characters that would appear in a binary file with codes 0
|
|
159 - 32 and 127).
|
|
160
|
|
161 Control-1 contains the non-printing characters that would
|
|
162 appear in a binary file with codes 128 - 159.
|
|
163
|
|
164 Composite contains characters that are generated by
|
|
165 overstriking one or more characters from other charsets.
|
|
166
|
|
167 Note that some characters in ASCII, and all characters
|
|
168 in Control-1, are "control" (non-printing) characters.
|
|
169 These have no printed representation but instead control
|
|
170 some other function of the printing (e.g. TAB or 9 moves
|
|
171 the current character position to the next tab stop).
|
|
172 All other characters in all charsets are "graphic"
|
|
173 (printing) characters.
|
|
174
|
|
175 When a binary file is read in, the bytes in the file are
|
|
176 assigned to character sets as follows:
|
|
177
|
|
178 Bytes Character set Range
|
|
179 --------------------------------------------------
|
|
180 0 - 127 ASCII 0 - 127
|
|
181 128 - 159 Control-1 0 - 31
|
|
182 160 - 255 Latin-1 32 - 127
|
|
183
|
|
184 This is a bit ad-hoc but gets the job done.
|
|
185
|
826
|
186 ==========================================================================
|
1292
|
187 3. Encodings
|
826
|
188 ==========================================================================
|
771
|
189
|
|
190 An "encoding" is a way of numerically representing
|
|
191 characters from one or more character sets. If an encoding
|
|
192 only encompasses one character set, then the position codes
|
|
193 for the characters in that character set could be used
|
|
194 directly. This is not possible, however, if more than one
|
|
195 character set is to be used in the encoding.
|
|
196
|
|
197 For example, the conversion detailed above between bytes in
|
|
198 a binary file and characters is effectively an encoding
|
|
199 that encompasses the three character sets ASCII, Control-1,
|
|
200 and Latin-1 in a stream of 8-bit bytes.
|
|
201
|
|
202 Thus, an encoding can be viewed as a way of encoding
|
|
203 characters from a specified group of character sets using a
|
|
204 stream of bytes, each of which contains a fixed number of
|
|
205 bits (but not necessarily 8, as in the common usage of
|
|
206 "byte").
|
|
207
|
|
208 Here are descriptions of a couple of common
|
|
209 encodings:
|
|
210
|
|
211
|
|
212 A. Japanese EUC (Extended Unix Code)
|
|
213
|
|
214 This encompasses the character sets:
|
|
215 - Printing-ASCII,
|
|
216 - Katakana-JISX0201 (half-width katakana, the right half of JISX0201).
|
|
217 - Japanese-JISX0208
|
|
218 - Japanese-JISX0212
|
|
219 It uses 8-bit bytes.
|
|
220
|
|
221 Note that Printing-ASCII and Katakana-JISX0201 are 94-character
|
|
222 charsets, while Japanese-JISX0208 is a 94x94-character charset.
|
|
223
|
|
224 The encoding is as follows:
|
|
225
|
|
226 Character set Representation (PC == position-code)
|
|
227 ------------- --------------
|
|
228 Printing-ASCII PC1
|
|
229 Japanese-JISX0208 PC1 + 0x80 | PC2 + 0x80
|
|
230 Katakana-JISX0201 0x8E | PC1 + 0x80
|
|
231
|
|
232
|
|
233 B. JIS7
|
|
234
|
|
235 This encompasses the character sets:
|
|
236 - Printing-ASCII
|
|
237 - Latin-JISX0201 (the left half of JISX0201; this character set is
|
|
238 very similar to Printing-ASCII and is a 94-character charset)
|
|
239 - Japanese-JISX0208
|
|
240 - Katakana-JISX0201
|
|
241 It uses 7-bit bytes.
|
|
242
|
|
243 Unlike Japanese EUC, this is a "modal" encoding, which
|
|
244 means that there are multiple states that the encoding can
|
|
245 be in, which affect how the bytes are to be interpreted.
|
|
246 Special sequences of bytes (called "escape sequences")
|
|
247 are used to change states.
|
|
248
|
|
249 The encoding is as follows:
|
|
250
|
|
251 Character set Representation
|
|
252 ------------- --------------
|
|
253 Printing-ASCII PC1
|
|
254 Latin-JISX0201 PC1
|
|
255 Katakana-JISX0201 PC1
|
|
256 Japanese-JISX0208 PC1 | PC2
|
|
257
|
|
258 Escape sequence ASCII equivalent Meaning
|
|
259 --------------- ---------------- -------
|
|
260 0x1B 0x28 0x42 ESC ( B invoke Printing-ASCII
|
|
261 0x1B 0x28 0x4A ESC ( J invoke Latin-JISX0201
|
|
262 0x1B 0x28 0x49 ESC ( I invoke Katakana-JISX0201
|
|
263 0x1B 0x24 0x42 ESC $ B invoke Japanese-JISX0208
|
|
264
|
|
265 Initially, Printing-ASCII is invoked.
|
|
266
|
826
|
267 ==========================================================================
|
1292
|
268 4. Internal Mule Encodings
|
826
|
269 ==========================================================================
|
771
|
270
|
|
271 In XEmacs/Mule, each character set is assigned a unique number,
|
|
272 called a "leading byte". This is used in the encodings of a
|
|
273 character. Leading bytes are in the range 0x80 - 0xFF
|
|
274 (except for ASCII, which has a leading byte of 0), although
|
|
275 some leading bytes are reserved.
|
|
276
|
|
277 Charsets whose leading byte is in the range 0x80 - 0x9F are
|
|
278 called "official" and are used for built-in charsets.
|
|
279 Other charsets are called "private" and have leading bytes
|
|
280 in the range 0xA0 - 0xFF; these are user-defined charsets.
|
|
281
|
|
282 More specifically:
|
|
283
|
|
284 Character set Leading byte
|
|
285 ------------- ------------
|
|
286 ASCII 0 (0x7F in arrays indexed by leading byte)
|
|
287 Composite 0x8D
|
|
288 Dimension-1 Official 0x80 - 0x8C/0x8D
|
|
289 (0x8E is free)
|
|
290 Control 0x8F
|
|
291 Dimension-2 Official 0x90 - 0x99
|
|
292 (0x9A - 0x9D are free)
|
|
293 Dimension-1 Private Marker 0x9E
|
|
294 Dimension-2 Private Marker 0x9F
|
|
295 Dimension-1 Private 0xA0 - 0xEF
|
|
296 Dimension-2 Private 0xF0 - 0xFF
|
|
297
|
|
298 There are two internal encodings for characters in XEmacs/Mule.
|
|
299 One is called "string encoding" and is an 8-bit encoding that
|
|
300 is used for representing characters in a buffer or string.
|
|
301 It uses 1 to 4 bytes per character. The other is called
|
|
302 "character encoding" and is a 19-bit encoding that is used
|
|
303 for representing characters individually in a variable.
|
|
304
|
|
305 (In the following descriptions, we'll ignore composite
|
|
306 characters for the moment. We also give a general (structural)
|
|
307 overview first, followed later by the exact details.)
|
|
308
|
|
309 A. Internal String Encoding
|
|
310
|
|
311 ASCII characters are encoded using their position code directly.
|
|
312 Other characters are encoded using their leading byte followed
|
|
313 by their position code(s) with the high bit set. Characters
|
|
314 in private character sets have their leading byte prefixed with
|
|
315 a "leading byte prefix", which is either 0x9E or 0x9F. (No
|
|
316 character sets are ever assigned these leading bytes.) Specifically:
|
|
317
|
|
318 Character set Encoding (PC == position-code)
|
|
319 ------------- -------- (LB == leading-byte)
|
|
320 ASCII PC1 |
|
|
321 Control-1 LB | PC1 + 0xA0
|
|
322 Dimension-1 official LB | PC1 + 0x80
|
|
323 Dimension-1 private 0x9E | LB | PC1 + 0x80
|
|
324 Dimension-2 official LB | PC1 + 0x80 | PC2 + 0x80
|
|
325 Dimension-2 private 0x9F | LB | PC1 + 0x80 | PC2 + 0x80
|
|
326
|
|
327 The basic characteristic of this encoding is that the first byte
|
|
328 of all characters is in the range 0x00 - 0x9F, and the second and
|
|
329 following bytes of all characters are in the range 0xA0 - 0xFF.
|
|
330 This means that it is impossible to get out of sync, or more
|
|
331 specifically:
|
|
332
|
|
333 1. Given any byte position, the beginning of the character it is
|
|
334 within can be determined in constant time.
|
|
335 2. Given any byte position at the beginning of a character, the
|
|
336 beginning of the next character can be determined in constant
|
|
337 time.
|
|
338 3. Given any byte position at the beginning of a character, the
|
|
339 beginning of the previous character can be determined in constant
|
|
340 time.
|
|
341 4. Textual searches can simply treat encoded strings as if they
|
|
342 were encoded in a one-byte-per-character fashion rather than
|
|
343 the actual multi-byte encoding.
|
|
344
|
|
345 None of the standard non-modal encodings meet all of these
|
|
346 conditions. For example, EUC satisfies only (2) and (3), while
|
|
347 Shift-JIS and Big5 (not yet described) satisfy only (2). (All
|
|
348 non-modal encodings must satisfy (2), in order to be unambiguous.)
|
|
349
|
|
350 B. Internal Character Encoding
|
|
351
|
|
352 One 19-bit word represents a single character. The word is
|
|
353 separated into three fields:
|
|
354
|
|
355 Bit number: 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
|
|
356 <------------> <------------------> <------------------>
|
|
357 Field: 1 2 3
|
|
358
|
|
359 Note that fields 2 and 3 hold 7 bits each, while field 1 holds 5 bits.
|
|
360
|
|
361 Character set Field 1 Field 2 Field 3
|
|
362 ------------- ------- ------- -------
|
|
363 ASCII 0 0 PC1
|
|
364 range: (00 - 7F)
|
|
365 Control-1 0 1 PC1
|
|
366 range: (00 - 1F)
|
|
367 Dimension-1 official 0 LB - 0x7F PC1
|
|
368 range: (01 - 0D) (20 - 7F)
|
|
369 Dimension-1 private 0 LB - 0x80 PC1
|
|
370 range: (20 - 6F) (20 - 7F)
|
|
371 Dimension-2 official LB - 0x8F PC1 PC2
|
|
372 range: (01 - 0A) (20 - 7F) (20 - 7F)
|
|
373 Dimension-2 private LB - 0xE1 PC1 PC2
|
|
374 range: (0F - 1E) (20 - 7F) (20 - 7F)
|
|
375 Composite 0x1F ? ?
|
|
376
|
|
377 Note that character codes 0 - 255 are the same as the "binary encoding"
|
|
378 described above.
|
826
|
379
|
|
380 Most of the code in XEmacs knows nothing of the representation of a
|
|
381 character other than that values 0 - 255 represent ASCII, Control 1,
|
|
382 and Latin 1.
|
|
383
|
|
384 WARNING WARNING WARNING: The Boyer-Moore code in search.c, and the
|
|
385 code in search_buffer() that determines whether that code can be used,
|
|
386 knows that "field 3" in a character always corresponds to the last
|
|
387 byte in the textual representation of the character. (This is important
|
|
388 because the Boyer-Moore algorithm works by looking at the last byte
|
|
389 of the search string and &&#### finish this.
|
|
390
|
|
391 ==========================================================================
|
1292
|
392 5. Buffer Positions and Other Typedefs
|
826
|
393 ==========================================================================
|
|
394
|
|
395 A. Buffer Positions
|
|
396
|
|
397 There are three possible ways to specify positions in a buffer. All
|
|
398 of these are one-based: the beginning of the buffer is position or
|
|
399 index 1, and 0 is not a valid position.
|
|
400
|
|
401 As a "buffer position" (typedef Charbpos):
|
|
402
|
|
403 This is an index specifying an offset in characters from the
|
|
404 beginning of the buffer. Note that buffer positions are
|
|
405 logically *between* characters, not on a character. The
|
|
406 difference between two buffer positions specifies the number of
|
|
407 characters between those positions. Buffer positions are the
|
|
408 only kind of position externally visible to the user.
|
|
409
|
|
410 As a "byte index" (typedef Bytebpos):
|
|
411
|
|
412 This is an index over the bytes used to represent the characters
|
|
413 in the buffer. If there is no Mule support, this is identical
|
|
414 to a buffer position, because each character is represented
|
|
415 using one byte. However, with Mule support, many characters
|
|
416 require two or more bytes for their representation, and so a
|
|
417 byte index may be greater than the corresponding buffer
|
|
418 position.
|
|
419
|
|
420 As a "memory index" (typedef Membpos):
|
|
421
|
|
422 This is the byte index adjusted for the gap. For positions
|
|
423 before the gap, this is identical to the byte index. For
|
|
424 positions after the gap, this is the byte index plus the gap
|
|
425 size. There are two possible memory indices for the gap
|
|
426 position; the memory index at the beginning of the gap should
|
|
427 always be used, except in code that deals with manipulating the
|
|
428 gap, where both indices may be seen. The address of the
|
|
429 character "at" (i.e. following) a particular position can be
|
|
430 obtained from the formula
|
|
431
|
|
432 buffer_start_address + memory_index(position) - 1
|
|
433
|
|
434 except in the case of characters at the gap position.
|
|
435
|
|
436 B. Other Typedefs
|
|
437
|
867
|
438 Ichar:
|
1292
|
439 ------
|
826
|
440 This typedef represents a single Emacs character, which can be
|
|
441 ASCII, ISO-8859, or some extended character, as would typically
|
|
442 be used for Kanji. Note that the representation of a character
|
867
|
443 as an Ichar is *not* the same as the representation of that
|
826
|
444 same character in a string; thus, you cannot do the standard
|
|
445 C trick of passing a pointer to a character to a function that
|
|
446 expects a string.
|
|
447
|
867
|
448 An Ichar takes up 19 bits of representation and (for code
|
826
|
449 compatibility and such) is compatible with an int. This
|
|
450 representation is visible on the Lisp level. The important
|
867
|
451 characteristics of the Ichar representation are
|
826
|
452
|
|
453 -- values 0x00 - 0x7f represent ASCII.
|
|
454 -- values 0x80 - 0xff represent the right half of ISO-8859-1.
|
|
455 -- values 0x100 and up represent all other characters.
|
|
456
|
867
|
457 This means that Ichar values are upwardly compatible with
|
826
|
458 the standard 8-bit representation of ASCII/ISO-8859-1.
|
|
459
|
867
|
460 Ibyte:
|
1292
|
461 ------
|
867
|
462 The data in a buffer or string is logically made up of Ibyte
|
|
463 objects, where an Ibyte takes up the same amount of space as a
|
826
|
464 char. (It is declared differently, though, to catch invalid
|
867
|
465 usages.) Strings stored using Ibytes are said to be in
|
826
|
466 "internal format". The important characteristics of internal
|
|
467 format are
|
|
468
|
867
|
469 -- ASCII characters are represented as a single Ibyte,
|
826
|
470 in the range 0 - 0x7f.
|
867
|
471 -- All other characters are represented as an Ibyte in
|
|
472 the range 0x80 - 0x9f followed by one or more Ibytes
|
826
|
473 in the range 0xa0 to 0xff.
|
|
474
|
|
475 This leads to a number of desirable properties:
|
|
476
|
|
477 -- Given the position of the beginning of a character,
|
|
478 you can find the beginning of the next or previous
|
|
479 character in constant time.
|
|
480 -- When searching for a substring or an ASCII character
|
|
481 within the string, you need merely use standard
|
|
482 searching routines.
|
|
483
|
1292
|
484 Extbyte:
|
|
485 --------
|
826
|
486 Strings that go in or out of Emacs are in "external format",
|
|
487 typedef'ed as an array of char or a char *. There is more
|
|
488 than one external format (JIS, EUC, etc.) but they all
|
|
489 have similar properties. They are modal encodings,
|
|
490 which is to say that the meaning of particular bytes is
|
|
491 not fixed but depends on what "mode" the string is currently
|
|
492 in (e.g. bytes in the range 0 - 0x7f might be
|
|
493 interpreted as ASCII, or as Hiragana, or as 2-byte Kanji,
|
|
494 depending on the current mode). The mode starts out in
|
|
495 ASCII/ISO-8859-1 and is switched using escape sequences --
|
|
496 for example, in the JIS encoding, 'ESC $ B' switches to a
|
|
497 mode where pairs of bytes in the range 0 - 0x7f
|
|
498 are interpreted as Kanji characters.
|
|
499
|
|
500 External-formatted data is generally desirable for passing
|
|
501 data between programs because it is upwardly compatible
|
|
502 with standard ASCII/ISO-8859-1 strings and may require
|
|
503 less space than internal encodings such as the one
|
|
504 described above. In addition, some encodings (e.g. JIS)
|
|
505 keep all characters (except the ESC used to switch modes)
|
|
506 in the printing ASCII range 0x20 - 0x7e, which results in
|
|
507 a much higher probability that the data will avoid being
|
|
508 garbled in transmission. Externally-formatted data is
|
|
509 generally not very convenient to work with, however, and
|
|
510 for this reason is usually converted to internal format
|
|
511 before any work is done on the string.
|
|
512
|
|
513 NOTE: filenames need to be in external format so that
|
|
514 ISO-8859-1 characters come out correctly.
|
|
515
|
|
516 Charcount:
|
|
517 ----------
|
|
518 This typedef represents a count of characters, such as
|
|
519 a character offset into a string or the number of
|
|
520 characters between two positions in a buffer. The
|
|
521 difference between two Charbpos's is a Charcount, and
|
|
522 character positions in a string are represented using
|
|
523 a Charcount.
|
|
524
|
|
525 Bytecount:
|
|
526 ----------
|
|
527 Similar to a Charcount but represents a count of bytes.
|
|
528 The difference between two Bytebpos's is a Bytecount.
|
|
529
|
|
530
|
|
531 C. Usage of the Various Representations
|
|
532
|
|
533 Memory indices are used in low-level functions in insdel.c and for
|
|
534 extent endpoints and marker positions. The reason for this is that
|
|
535 this way, the extents and markers don't need to be updated for most
|
|
536 insertions, which merely shrink the gap and don't move any
|
|
537 characters around in memory.
|
|
538
|
|
539 (The beginning-of-gap memory index simplifies insertions w.r.t.
|
|
540 markers, because text usually gets inserted after markers. For
|
|
541 extents, it is merely for consistency, because text can get
|
|
542 inserted either before or after an extent's endpoint depending on
|
|
543 the open/closedness of the endpoint.)
|
|
544
|
|
545 Byte indices are used in other code that needs to be fast,
|
|
546 such as the searching, redisplay, and extent-manipulation code.
|
|
547
|
|
548 Buffer positions are used in all other code. This is because this
|
|
549 representation is easiest to work with (especially since Lisp
|
|
550 code always uses buffer positions), necessitates the fewest
|
|
551 changes to existing code, and is the safest (e.g. if the text gets
|
|
552 shifted underneath a buffer position, it will still point to a
|
|
553 character; if text is shifted under a byte index, it might point
|
|
554 to the middle of a character, which would be bad).
|
|
555
|
|
556 Similarly, Charcounts are used in all code that deals with strings
|
|
557 except for code that needs to be fast, which uses Bytecounts.
|
|
558
|
|
559 Strings are always passed around internally using internal format.
|
|
560 Conversions to and from external format are performed at the time
|
|
561 that the data goes in or out of Emacs.
|
|
562
|
|
563 D. Working With the Various Representations
|
|
564
|
|
565 We write things this way because it's very important that
|
|
566 MAX_BYTEBPOS_GAP_SIZE_3 is a multiple of 3. (As it happens,
|
|
567 65535 is a multiple of 3, but this may not always be the
|
|
568 case. #### unfinished
|
|
569
|
|
570 ==========================================================================
|
1292
|
571 6. Miscellaneous
|
826
|
572 ==========================================================================
|
|
573
|
|
574 A. Unicode Support
|
771
|
575
|
1292
|
576 Unicode support is very desirable. Currently we know how to handle
|
|
577 externally-encoded Unicode data in various encodings -- UTF-16, UTF-8,
|
|
578 etc. However, we really need to represent Unicode characters internally
|
|
579 as-is, rather than converting to some language-specific character set.
|
|
580 For efficiency, we should represent Unicode characters using 3 bytes
|
|
581 rather than 4. This means we need to find leading bytes for Unicode.
|
|
582 Given that there are 65,536 characters in Unicode and we can attach
|
|
583 96x96 = 9,216 characters per leading byte, we need eight leading bytes
|
|
584 for Unicode. We currently have four free (0x9A - 0x9D), and with a
|
|
585 little bit of rearranging we can get five: ASCII doesn't really need to
|
|
586 take up a leading byte. (We could just as well use 0x7F, with a little
|
|
587 change to the functions that assume that 0x80 is the lowest leading
|
|
588 byte.) This means we still need to dump three leading bytes and move
|
|
589 them into private space. The CNS charsets are good candidates since
|
|
590 they are rarely used, and JAPANESE_JISX0208_1978 is becoming less and
|
|
591 less used and could also be dumped.
|
826
|
592
|
|
593 B. Composite Characters
|
|
594
|
|
595 Composite characters are characters constructed by overstriking two
|
771
|
596 or more regular characters.
|
|
597
|
|
598 1) The old Mule implementation involves storing composite characters
|
|
599 in a buffer as a tag followed by all of the actual characters
|
|
600 used to make up the composite character. I think this is a bad
|
|
601 idea; it greatly complicates code that wants to handle strings
|
|
602 one character at a time because it has to deal with the possibility
|
|
603 of great big ungainly characters. It's much more reasonable to
|
|
604 simply store an index into a table of composite characters.
|
|
605
|
|
606 2) The current implementation only allows for 16,384 separate
|
|
607 composite characters over the lifetime of the XEmacs process.
|
|
608 This could become a potential problem if the user
|
|
609 edited lots of different files that use composite characters.
|
|
610 Due to FSF bogosity, increasing the number of allowable
|
|
611 composite characters under Mule would decrease the number
|
|
612 of possible faces that can exist. Mule already has shrunk
|
|
613 this to 2048, and further shrinkage would become uncomfortable.
|
|
614 No such problems exist in XEmacs.
|
|
615
|
|
616 Composite characters could be represented as 0x8D C1 C2 C3,
|
|
617 where each C[1-3] is in the range 0xA0 - 0xFF. This allows
|
|
618 for slightly under 2^20 (one million) composite characters
|
|
619 over the XEmacs process lifetime, and you only need to
|
|
620 increase the size of a Mule character from 19 to 21 bits.
|
|
621 Or you could use 0x8D C1 C2 C3 C4, allowing for about
|
826
|
622 85 million (slightly over 2^26) composite characters.
|
|
623
|
|
624 */
|
771
|
625
|
|
626
|
|
627 /************************************************************************/
|
|
628 /* declarations */
|
|
629 /************************************************************************/
|
|
630
|
|
631 Eistring the_eistring_zero_init, the_eistring_malloc_zero_init;
|
|
632
|
|
633 #define MAX_CHARBPOS_GAP_SIZE_3 (65535/3)
|
|
634 #define MAX_BYTEBPOS_GAP_SIZE_3 (3 * MAX_CHARBPOS_GAP_SIZE_3)
|
|
635
|
|
636 short three_to_one_table[1 + MAX_BYTEBPOS_GAP_SIZE_3];
|
|
637
|
|
638 #ifdef MULE
|
|
639
|
|
640 /* Table of number of bytes in the string representation of a character
|
|
641 indexed by the first byte of that representation.
|
|
642
|
|
643 rep_bytes_by_first_byte(c) is more efficient than the equivalent
|
|
644 canonical computation:
|
|
645
|
826
|
646 XCHARSET_REP_BYTES (charset_by_leading_byte (c)) */
|
771
|
647
|
|
648 const Bytecount rep_bytes_by_first_byte[0xA0] =
|
|
649 { /* 0x00 - 0x7f are for straight ASCII */
|
|
650 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
651 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
652 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
653 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
654 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
655 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
656 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
657 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
658 /* 0x80 - 0x8f are for Dimension-1 official charsets */
|
|
659 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
|
660 /* 0x90 - 0x9d are for Dimension-2 official charsets */
|
|
661 /* 0x9e is for Dimension-1 private charsets */
|
|
662 /* 0x9f is for Dimension-2 private charsets */
|
|
663 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4
|
|
664 };
|
|
665
|
|
666 #ifdef ENABLE_COMPOSITE_CHARS
|
|
667
|
|
668 /* Hash tables for composite chars. One maps string representing
|
|
669 composed chars to their equivalent chars; one goes the
|
|
670 other way. */
|
|
671 Lisp_Object Vcomposite_char_char2string_hash_table;
|
|
672 Lisp_Object Vcomposite_char_string2char_hash_table;
|
|
673
|
|
674 static int composite_char_row_next;
|
|
675 static int composite_char_col_next;
|
|
676
|
|
677 #endif /* ENABLE_COMPOSITE_CHARS */
|
|
678
|
|
679 #endif /* MULE */
|
|
680
|
1292
|
681 Lisp_Object QSin_char_byte_conversion;
|
|
682 Lisp_Object QSin_internal_external_conversion;
|
|
683
|
771
|
684
|
|
685 /************************************************************************/
|
|
686 /* qxestr***() functions */
|
|
687 /************************************************************************/
|
|
688
|
|
689 /* Most are inline functions in lisp.h */
|
|
690
|
|
691 int
|
867
|
692 qxesprintf (Ibyte *buffer, const CIbyte *format, ...)
|
771
|
693 {
|
|
694 va_list args;
|
|
695 int retval;
|
|
696
|
|
697 va_start (args, format);
|
|
698 retval = vsprintf ((char *) buffer, format, args);
|
|
699 va_end (args);
|
|
700
|
|
701 return retval;
|
|
702 }
|
|
703
|
|
704 /* strcasecmp() implementation from BSD */
|
867
|
705 static Ibyte strcasecmp_charmap[] = {
|
1429
|
706 0000, 0001, 0002, 0003, 0004, 0005, 0006, 0007,
|
|
707 0010, 0011, 0012, 0013, 0014, 0015, 0016, 0017,
|
|
708 0020, 0021, 0022, 0023, 0024, 0025, 0026, 0027,
|
|
709 0030, 0031, 0032, 0033, 0034, 0035, 0036, 0037,
|
|
710 0040, 0041, 0042, 0043, 0044, 0045, 0046, 0047,
|
|
711 0050, 0051, 0052, 0053, 0054, 0055, 0056, 0057,
|
|
712 0060, 0061, 0062, 0063, 0064, 0065, 0066, 0067,
|
|
713 0070, 0071, 0072, 0073, 0074, 0075, 0076, 0077,
|
|
714 0100, 0141, 0142, 0143, 0144, 0145, 0146, 0147,
|
|
715 0150, 0151, 0152, 0153, 0154, 0155, 0156, 0157,
|
|
716 0160, 0161, 0162, 0163, 0164, 0165, 0166, 0167,
|
|
717 0170, 0171, 0172, 0133, 0134, 0135, 0136, 0137,
|
|
718 0140, 0141, 0142, 0143, 0144, 0145, 0146, 0147,
|
|
719 0150, 0151, 0152, 0153, 0154, 0155, 0156, 0157,
|
|
720 0160, 0161, 0162, 0163, 0164, 0165, 0166, 0167,
|
|
721 0170, 0171, 0172, 0173, 0174, 0175, 0176, 0177,
|
|
722 0200, 0201, 0202, 0203, 0204, 0205, 0206, 0207,
|
|
723 0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217,
|
|
724 0220, 0221, 0222, 0223, 0224, 0225, 0226, 0227,
|
|
725 0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237,
|
|
726 0240, 0241, 0242, 0243, 0244, 0245, 0246, 0247,
|
|
727 0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257,
|
|
728 0260, 0261, 0262, 0263, 0264, 0265, 0266, 0267,
|
|
729 0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277,
|
|
730 0300, 0301, 0302, 0303, 0304, 0305, 0306, 0307,
|
|
731 0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317,
|
|
732 0320, 0321, 0322, 0323, 0324, 0325, 0326, 0327,
|
|
733 0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337,
|
|
734 0340, 0341, 0342, 0343, 0344, 0345, 0346, 0347,
|
|
735 0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357,
|
|
736 0360, 0361, 0362, 0363, 0364, 0365, 0366, 0367,
|
|
737 0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377
|
771
|
738 };
|
|
739
|
|
740 /* A version that works like generic strcasecmp() -- only collapsing
|
|
741 case in ASCII A-Z/a-z. This is safe on Mule strings due to the
|
|
742 current representation.
|
|
743
|
|
744 This version was written by some Berkeley coder, favoring
|
|
745 nanosecond improvements over clarity. In all other versions below,
|
|
746 we use symmetrical algorithms that may sacrifice a few machine
|
|
747 cycles but are MUCH MUCH clearer, which counts a lot more.
|
|
748 */
|
|
749
|
|
750 int
|
867
|
751 qxestrcasecmp (const Ibyte *s1, const Ibyte *s2)
|
771
|
752 {
|
867
|
753 Ibyte *cm = strcasecmp_charmap;
|
771
|
754
|
|
755 while (cm[*s1] == cm[*s2++])
|
|
756 if (*s1++ == '\0')
|
|
757 return (0);
|
|
758
|
|
759 return (cm[*s1] - cm[*--s2]);
|
|
760 }
|
|
761
|
|
762 int
|
|
763 ascii_strcasecmp (const Char_ASCII *s1, const Char_ASCII *s2)
|
|
764 {
|
867
|
765 return qxestrcasecmp ((const Ibyte *) s1, (const Ibyte *) s2);
|
771
|
766 }
|
|
767
|
|
768 int
|
867
|
769 qxestrcasecmp_c (const Ibyte *s1, const Char_ASCII *s2)
|
771
|
770 {
|
867
|
771 return qxestrcasecmp (s1, (const Ibyte *) s2);
|
771
|
772 }
|
|
773
|
|
774 /* An internationalized version that collapses case in a general fashion.
|
|
775 */
|
|
776
|
|
777 int
|
867
|
778 qxestrcasecmp_i18n (const Ibyte *s1, const Ibyte *s2)
|
771
|
779 {
|
|
780 while (*s1 && *s2)
|
|
781 {
|
867
|
782 if (DOWNCASE (0, itext_ichar (s1)) !=
|
|
783 DOWNCASE (0, itext_ichar (s2)))
|
771
|
784 break;
|
867
|
785 INC_IBYTEPTR (s1);
|
|
786 INC_IBYTEPTR (s2);
|
771
|
787 }
|
|
788
|
867
|
789 return (DOWNCASE (0, itext_ichar (s1)) -
|
|
790 DOWNCASE (0, itext_ichar (s2)));
|
771
|
791 }
|
|
792
|
|
/* The only difference between these next two and
   qxememcasecmp()/qxememcasecmp_i18n() is that these two will stop if
   both strings are equal and less than LEN in length, while
   the mem...() versions would run off the end. */
|
|
797
|
|
798 int
|
867
|
799 qxestrncasecmp (const Ibyte *s1, const Ibyte *s2, Bytecount len)
|
771
|
800 {
|
867
|
801 Ibyte *cm = strcasecmp_charmap;
|
771
|
802
|
|
803 while (len--)
|
|
804 {
|
|
805 int diff = cm[*s1] - cm[*s2];
|
|
806 if (diff != 0)
|
|
807 return diff;
|
|
808 if (!*s1)
|
|
809 return 0;
|
|
810 s1++, s2++;
|
|
811 }
|
|
812
|
|
813 return 0;
|
|
814 }
|
|
815
|
|
816 int
|
|
817 ascii_strncasecmp (const Char_ASCII *s1, const Char_ASCII *s2, Bytecount len)
|
|
818 {
|
867
|
819 return qxestrncasecmp ((const Ibyte *) s1, (const Ibyte *) s2, len);
|
771
|
820 }
|
|
821
|
|
822 int
|
867
|
823 qxestrncasecmp_c (const Ibyte *s1, const Char_ASCII *s2, Bytecount len)
|
771
|
824 {
|
867
|
825 return qxestrncasecmp (s1, (const Ibyte *) s2, len);
|
771
|
826 }
|
|
827
|
801
|
/* Compare LEN_FROM_S1 worth of characters from S1 with the same number of
   characters from S2, case insensitive.  NOTE: Downcasing can convert
   characters from one length in bytes to another, so reversing S1 and S2
   is *NOT* a symmetric operation!  You must choose a length that agrees
   with S1. */
|
|
833
|
771
|
834 int
|
867
|
835 qxestrncasecmp_i18n (const Ibyte *s1, const Ibyte *s2,
|
801
|
836 Bytecount len_from_s1)
|
771
|
837 {
|
801
|
838 while (len_from_s1 > 0)
|
771
|
839 {
|
867
|
840 const Ibyte *old_s1 = s1;
|
|
841 int diff = (DOWNCASE (0, itext_ichar (s1)) -
|
|
842 DOWNCASE (0, itext_ichar (s2)));
|
771
|
843 if (diff != 0)
|
|
844 return diff;
|
|
845 if (!*s1)
|
|
846 return 0;
|
867
|
847 INC_IBYTEPTR (s1);
|
|
848 INC_IBYTEPTR (s2);
|
801
|
849 len_from_s1 -= s1 - old_s1;
|
771
|
850 }
|
|
851
|
|
852 return 0;
|
|
853 }
|
|
854
|
|
855 int
|
867
|
856 qxememcmp (const Ibyte *s1, const Ibyte *s2, Bytecount len)
|
771
|
857 {
|
|
858 return memcmp (s1, s2, len);
|
|
859 }
|
|
860
|
|
861 int
|
867
|
862 qxememcmp4 (const Ibyte *s1, Bytecount len1,
|
|
863 const Ibyte *s2, Bytecount len2)
|
801
|
864 {
|
|
865 int retval = qxememcmp (s1, s2, min (len1, len2));
|
|
866 if (retval)
|
|
867 return retval;
|
|
868 return len1 - len2;
|
|
869 }
|
|
870
|
|
871 int
|
867
|
872 qxememcasecmp (const Ibyte *s1, const Ibyte *s2, Bytecount len)
|
771
|
873 {
|
867
|
874 Ibyte *cm = strcasecmp_charmap;
|
771
|
875
|
|
876 while (len--)
|
|
877 {
|
|
878 int diff = cm[*s1] - cm[*s2];
|
|
879 if (diff != 0)
|
|
880 return diff;
|
|
881 s1++, s2++;
|
|
882 }
|
|
883
|
|
884 return 0;
|
|
885 }
|
|
886
|
|
887 int
|
867
|
888 qxememcasecmp4 (const Ibyte *s1, Bytecount len1,
|
|
889 const Ibyte *s2, Bytecount len2)
|
771
|
890 {
|
801
|
891 int retval = qxememcasecmp (s1, s2, min (len1, len2));
|
|
892 if (retval)
|
|
893 return retval;
|
|
894 return len1 - len2;
|
|
895 }
|
|
896
|
|
897 /* Do a character-by-character comparison, returning "which is greater" by
|
867
|
898 comparing the Ichar values. (#### Should have option to compare Unicode
|
801
|
899 points) */
|
|
900
|
|
901 int
|
867
|
902 qxetextcmp (const Ibyte *s1, Bytecount len1,
|
|
903 const Ibyte *s2, Bytecount len2)
|
801
|
904 {
|
|
905 while (len1 > 0 && len2 > 0)
|
771
|
906 {
|
867
|
907 const Ibyte *old_s1 = s1;
|
|
908 const Ibyte *old_s2 = s2;
|
|
909 int diff = itext_ichar (s1) - itext_ichar (s2);
|
801
|
910 if (diff != 0)
|
|
911 return diff;
|
867
|
912 INC_IBYTEPTR (s1);
|
|
913 INC_IBYTEPTR (s2);
|
801
|
914 len1 -= s1 - old_s1;
|
|
915 len2 -= s2 - old_s2;
|
|
916 }
|
|
917
|
|
918 assert (len1 >= 0 && len2 >= 0);
|
|
919 return len1 - len2;
|
|
920 }
|
|
921
|
|
922 int
|
867
|
923 qxetextcmp_matching (const Ibyte *s1, Bytecount len1,
|
|
924 const Ibyte *s2, Bytecount len2,
|
801
|
925 Charcount *matching)
|
|
926 {
|
|
927 *matching = 0;
|
|
928 while (len1 > 0 && len2 > 0)
|
|
929 {
|
867
|
930 const Ibyte *old_s1 = s1;
|
|
931 const Ibyte *old_s2 = s2;
|
|
932 int diff = itext_ichar (s1) - itext_ichar (s2);
|
801
|
933 if (diff != 0)
|
|
934 return diff;
|
867
|
935 INC_IBYTEPTR (s1);
|
|
936 INC_IBYTEPTR (s2);
|
801
|
937 len1 -= s1 - old_s1;
|
|
938 len2 -= s2 - old_s2;
|
|
939 (*matching)++;
|
|
940 }
|
|
941
|
|
942 assert (len1 >= 0 && len2 >= 0);
|
|
943 return len1 - len2;
|
|
944 }
|
|
945
|
|
/* Do a character-by-character comparison, returning "which is greater" by
   comparing the Ichar values, case insensitively (by downcasing both
   first). (#### Should have option to compare Unicode points)

   In this case, both lengths must be specified because downcasing can
   convert characters from one length in bytes to another; therefore, two
   blocks of text of different length might be equal.  If both compare
   equal up to the limit in length of one but not the other, the longer one
   is "greater". */
|
|
955
|
|
956 int
|
867
|
957 qxetextcasecmp (const Ibyte *s1, Bytecount len1,
|
|
958 const Ibyte *s2, Bytecount len2)
|
801
|
959 {
|
|
960 while (len1 > 0 && len2 > 0)
|
|
961 {
|
867
|
962 const Ibyte *old_s1 = s1;
|
|
963 const Ibyte *old_s2 = s2;
|
|
964 int diff = (DOWNCASE (0, itext_ichar (s1)) -
|
|
965 DOWNCASE (0, itext_ichar (s2)));
|
771
|
966 if (diff != 0)
|
|
967 return diff;
|
867
|
968 INC_IBYTEPTR (s1);
|
|
969 INC_IBYTEPTR (s2);
|
801
|
970 len1 -= s1 - old_s1;
|
|
971 len2 -= s2 - old_s2;
|
771
|
972 }
|
|
973
|
801
|
974 assert (len1 >= 0 && len2 >= 0);
|
|
975 return len1 - len2;
|
|
976 }
|
|
977
|
|
978 /* Like qxetextcasecmp() but also return number of characters at
|
|
979 beginning that match. */
|
|
980
|
|
981 int
|
867
|
982 qxetextcasecmp_matching (const Ibyte *s1, Bytecount len1,
|
|
983 const Ibyte *s2, Bytecount len2,
|
801
|
984 Charcount *matching)
|
|
985 {
|
|
986 *matching = 0;
|
|
987 while (len1 > 0 && len2 > 0)
|
|
988 {
|
867
|
989 const Ibyte *old_s1 = s1;
|
|
990 const Ibyte *old_s2 = s2;
|
|
991 int diff = (DOWNCASE (0, itext_ichar (s1)) -
|
|
992 DOWNCASE (0, itext_ichar (s2)));
|
801
|
993 if (diff != 0)
|
|
994 return diff;
|
867
|
995 INC_IBYTEPTR (s1);
|
|
996 INC_IBYTEPTR (s2);
|
801
|
997 len1 -= s1 - old_s1;
|
|
998 len2 -= s2 - old_s2;
|
|
999 (*matching)++;
|
|
1000 }
|
|
1001
|
|
1002 assert (len1 >= 0 && len2 >= 0);
|
|
1003 return len1 - len2;
|
771
|
1004 }
|
|
1005
|
|
1006 int
|
|
1007 lisp_strcasecmp (Lisp_Object s1, Lisp_Object s2)
|
|
1008 {
|
867
|
1009 Ibyte *cm = strcasecmp_charmap;
|
|
1010 Ibyte *p1 = XSTRING_DATA (s1);
|
|
1011 Ibyte *p2 = XSTRING_DATA (s2);
|
|
1012 Ibyte *e1 = p1 + XSTRING_LENGTH (s1);
|
|
1013 Ibyte *e2 = p2 + XSTRING_LENGTH (s2);
|
771
|
1014
|
|
1015 /* again, we use a symmetric algorithm and favor clarity over
|
|
1016 nanosecond improvements. */
|
|
1017 while (1)
|
|
1018 {
|
|
1019 /* if we reached the end of either string, compare lengths.
|
|
1020 do NOT compare the final null byte against anything, in case
|
|
1021 the other string also has a null byte at that position. */
|
|
1022 if (p1 == e1 || p2 == e2)
|
|
1023 return e1 - e2;
|
|
1024 if (cm[*p1] != cm[*p2])
|
|
1025 return cm[*p1] - cm[*p2];
|
|
1026 p1++, p2++;
|
|
1027 }
|
|
1028 }
|
|
1029
|
|
1030 int
|
|
1031 lisp_strcasecmp_i18n (Lisp_Object s1, Lisp_Object s2)
|
|
1032 {
|
801
|
1033 return qxetextcasecmp (XSTRING_DATA (s1), XSTRING_LENGTH (s1),
|
|
1034 XSTRING_DATA (s2), XSTRING_LENGTH (s2));
|
771
|
1035 }
|
|
1036
|
|
1037
|
|
1038 /************************************************************************/
|
|
1039 /* conversion between textual representations */
|
|
1040 /************************************************************************/
|
|
1041
|
|
1042 /* NOTE: Does not reset the Dynarr. */
|
|
1043
|
|
1044 void
|
867
|
1045 convert_ibyte_string_into_ichar_dynarr (const Ibyte *str, Bytecount len,
|
|
1046 Ichar_dynarr *dyn)
|
771
|
1047 {
|
867
|
1048 const Ibyte *strend = str + len;
|
771
|
1049
|
|
1050 while (str < strend)
|
|
1051 {
|
867
|
1052 Ichar ch = itext_ichar (str);
|
771
|
1053 Dynarr_add (dyn, ch);
|
867
|
1054 INC_IBYTEPTR (str);
|
771
|
1055 }
|
|
1056 }
|
|
1057
|
|
1058 Charcount
|
867
|
1059 convert_ibyte_string_into_ichar_string (const Ibyte *str, Bytecount len,
|
|
1060 Ichar *arr)
|
771
|
1061 {
|
867
|
1062 const Ibyte *strend = str + len;
|
771
|
1063 Charcount newlen = 0;
|
|
1064 while (str < strend)
|
|
1065 {
|
867
|
1066 Ichar ch = itext_ichar (str);
|
771
|
1067 arr[newlen++] = ch;
|
867
|
1068 INC_IBYTEPTR (str);
|
771
|
1069 }
|
|
1070 return newlen;
|
|
1071 }
|
|
1072
|
867
|
1073 /* Convert an array of Ichars into the equivalent string representation.
|
|
1074 Store into the given Ibyte dynarr. Does not reset the dynarr.
|
771
|
1075 Does not add a terminating zero. */
|
|
1076
|
|
1077 void
|
867
|
1078 convert_ichar_string_into_ibyte_dynarr (Ichar *arr, int nels,
|
|
1079 Ibyte_dynarr *dyn)
|
771
|
1080 {
|
867
|
1081 Ibyte str[MAX_ICHAR_LEN];
|
771
|
1082 int i;
|
|
1083
|
|
1084 for (i = 0; i < nels; i++)
|
|
1085 {
|
867
|
1086 Bytecount len = set_itext_ichar (str, arr[i]);
|
771
|
1087 Dynarr_add_many (dyn, str, len);
|
|
1088 }
|
|
1089 }
|
|
1090
|
867
|
1091 /* Convert an array of Ichars into the equivalent string representation.
|
771
|
1092 Malloc the space needed for this and return it. If LEN_OUT is not a
|
867
|
1093 NULL pointer, store into LEN_OUT the number of Ibytes in the
|
|
1094 malloc()ed string. Note that the actual number of Ibytes allocated
|
771
|
1095 is one more than this: the returned string is zero-terminated. */
|
|
1096
|
867
|
1097 Ibyte *
|
|
1098 convert_ichar_string_into_malloced_string (Ichar *arr, int nels,
|
826
|
1099 Bytecount *len_out)
|
771
|
1100 {
|
|
1101 /* Damn zero-termination. */
|
867
|
1102 Ibyte *str = (Ibyte *) ALLOCA (nels * MAX_ICHAR_LEN + 1);
|
|
1103 Ibyte *strorig = str;
|
771
|
1104 Bytecount len;
|
|
1105
|
|
1106 int i;
|
|
1107
|
|
1108 for (i = 0; i < nels; i++)
|
867
|
1109 str += set_itext_ichar (str, arr[i]);
|
771
|
1110 *str = '\0';
|
|
1111 len = str - strorig;
|
867
|
1112 str = (Ibyte *) xmalloc (1 + len);
|
771
|
1113 memcpy (str, strorig, 1 + len);
|
|
1114 if (len_out)
|
|
1115 *len_out = len;
|
|
1116 return str;
|
|
1117 }
|
|
1118
|
826
|
/* Body of copy_text_between_formats() for one SRCFMT/DSTFMT pair.  Uses
   that function's SRC, SRCLEN, SRCOBJ, DST, DSTLEN, DSTOBJ and SRC_USED
   directly, and returns from it. */
#define COPY_TEXT_BETWEEN_FORMATS(srcfmt, dstfmt)			\
do									\
{									\
  const Ibyte *srclim = src + srclen;					\
  const Ibyte *srcpos = src;						\
									\
  if (dst)								\
    {									\
      Ibyte *dstlim = dst + dstlen;					\
      Ibyte *dstpos = dst;						\
									\
      while (srcpos < srclim)						\
	{								\
	  Ichar ich = itext_ichar_fmt (srcpos, srcfmt, srcobj);		\
	  Bytecount ichlen = ichar_len_fmt (ich, dstfmt);		\
									\
	  /* Never store a partial character. */			\
	  if (dstpos + ichlen > dstlim)					\
	    break;							\
	  set_itext_ichar_fmt (dstpos, ich, dstfmt, dstobj);		\
	  dstpos += ichlen;						\
	  INC_IBYTEPTR_FMT (srcpos, srcfmt);				\
	}								\
      text_checking_assert (srcpos <= srclim);				\
      if (src_used)							\
	*src_used = srcpos - src;					\
      return dstpos - dst;						\
    }									\
  else									\
    {									\
      /* No destination: only add up the size the text would need. */	\
      Bytecount needed = 0;						\
									\
      while (srcpos < srclim)						\
	{								\
	  needed += ichar_len_fmt (itext_ichar_fmt (srcpos, srcfmt,	\
						    srcobj), dstfmt);	\
	  INC_IBYTEPTR_FMT (srcpos, srcfmt);				\
	}								\
      text_checking_assert (srcpos == srclim);				\
      if (src_used)							\
	*src_used = srcpos - src;					\
      return needed;							\
    }									\
}									\
while (0)
|
|
1167
|
|
1168 /* Copy as much text from SRC/SRCLEN to DST/DSTLEN as will fit, converting
|
|
1169 from SRCFMT/SRCOBJ to DSTFMT/DSTOBJ. Return number of bytes stored into
|
|
1170 DST as return value, and number of bytes copied from SRC through
|
|
1171 SRC_USED (if not NULL). If DST is NULL, don't actually store anything
|
|
1172 and just return the size needed to store all the text. Will not copy
|
|
1173 partial characters into DST. */
|
|
1174
|
|
1175 Bytecount
|
867
|
1176 copy_text_between_formats (const Ibyte *src, Bytecount srclen,
|
826
|
1177 Internal_Format srcfmt,
|
2333
|
1178 Lisp_Object USED_IF_MULE (srcobj),
|
867
|
1179 Ibyte *dst, Bytecount dstlen,
|
826
|
1180 Internal_Format dstfmt,
|
2333
|
1181 Lisp_Object USED_IF_MULE (dstobj),
|
826
|
1182 Bytecount *src_used)
|
|
1183 {
|
|
1184 if (srcfmt == dstfmt &&
|
|
1185 objects_have_same_internal_representation (srcobj, dstobj))
|
|
1186 {
|
|
1187 if (dst)
|
|
1188 {
|
|
1189 srclen = min (srclen, dstlen);
|
867
|
1190 srclen = validate_ibyte_string_backward (src, srclen);
|
826
|
1191 memcpy (dst, src, srclen);
|
|
1192 if (src_used)
|
|
1193 *src_used = srclen;
|
|
1194 return srclen;
|
|
1195 }
|
|
1196 else
|
|
1197 return srclen;
|
|
1198 }
|
|
1199 /* Everything before the final else statement is an optimization.
|
|
1200 The inner loops inside COPY_TEXT_BETWEEN_FORMATS() have a number
|
|
1201 of calls to *_fmt(), each of which has a switch statement in it.
|
|
1202 By using constants as the FMT argument, these switch statements
|
|
1203 will be optimized out of existence. */
|
|
1204 #define ELSE_FORMATS(fmt1, fmt2) \
|
|
1205 else if (srcfmt == fmt1 && dstfmt == fmt2) \
|
|
1206 COPY_TEXT_BETWEEN_FORMATS (fmt1, fmt2)
|
|
1207 ELSE_FORMATS (FORMAT_DEFAULT, FORMAT_8_BIT_FIXED);
|
|
1208 ELSE_FORMATS (FORMAT_8_BIT_FIXED, FORMAT_DEFAULT);
|
|
1209 ELSE_FORMATS (FORMAT_DEFAULT, FORMAT_32_BIT_FIXED);
|
|
1210 ELSE_FORMATS (FORMAT_32_BIT_FIXED, FORMAT_DEFAULT);
|
|
1211 else
|
|
1212 COPY_TEXT_BETWEEN_FORMATS (srcfmt, dstfmt);
|
|
1213 #undef ELSE_FORMATS
|
|
1214 }
|
|
1215
|
|
1216 /* Copy as much buffer text in BUF, starting at POS, of length LEN, as will
|
|
1217 fit into DST/DSTLEN, converting to DSTFMT. Return number of bytes
|
|
1218 stored into DST as return value, and number of bytes copied from BUF
|
|
1219 through SRC_USED (if not NULL). If DST is NULL, don't actually store
|
|
1220 anything and just return the size needed to store all the text. */
|
|
1221
|
|
1222 Bytecount
|
|
1223 copy_buffer_text_out (struct buffer *buf, Bytebpos pos,
|
867
|
1224 Bytecount len, Ibyte *dst, Bytecount dstlen,
|
826
|
1225 Internal_Format dstfmt, Lisp_Object dstobj,
|
|
1226 Bytecount *src_used)
|
|
1227 {
|
|
1228 Bytecount dst_used = 0;
|
|
1229 if (src_used)
|
|
1230 *src_used = 0;
|
|
1231
|
|
1232 {
|
|
1233 BUFFER_TEXT_LOOP (buf, pos, len, runptr, runlen)
|
|
1234 {
|
|
1235 Bytecount the_src_used, the_dst_used;
|
|
1236
|
|
1237 the_dst_used = copy_text_between_formats (runptr, runlen,
|
|
1238 BUF_FORMAT (buf),
|
|
1239 wrap_buffer (buf),
|
|
1240 dst, dstlen, dstfmt,
|
|
1241 dstobj, &the_src_used);
|
|
1242 dst_used += the_dst_used;
|
|
1243 if (src_used)
|
|
1244 *src_used += the_src_used;
|
|
1245 if (dst)
|
|
1246 {
|
|
1247 dst += the_dst_used;
|
|
1248 dstlen -= the_dst_used;
|
841
|
1249 /* Stop if we didn't use all of the source text. Also stop
|
|
1250 if the destination is full. We need the first test because
|
|
1251 there might be a couple bytes left in the destination, but
|
|
1252 not enough to fit a full character. The first test will in
|
|
1253 fact catch the vast majority of cases where the destination
|
|
1254 is empty, too -- but in case the destination holds *exactly*
|
|
1255 the run length, we put in the second check. (It shouldn't
|
|
1256 really matter though -- next time through we'll just get a
|
|
1257 0.) */
|
|
1258 if (the_src_used < runlen || !dstlen)
|
826
|
1259 break;
|
|
1260 }
|
|
1261 }
|
|
1262 }
|
|
1263
|
|
1264 return dst_used;
|
|
1265 }
|
|
1266
|
771
|
1267
|
|
1268 /************************************************************************/
|
|
1269 /* charset properties of strings */
|
|
1270 /************************************************************************/
|
|
1271
|
|
1272 void
|
2333
|
1273 find_charsets_in_ibyte_string (unsigned char *charsets,
|
|
1274 const Ibyte *USED_IF_MULE (str),
|
|
1275 Bytecount USED_IF_MULE (len))
|
771
|
1276 {
|
|
1277 #ifndef MULE
|
|
1278 /* Telescope this. */
|
|
1279 charsets[0] = 1;
|
|
1280 #else
|
867
|
1281 const Ibyte *strend = str + len;
|
771
|
1282 memset (charsets, 0, NUM_LEADING_BYTES);
|
|
1283
|
|
1284 /* #### SJT doesn't like this. */
|
|
1285 if (len == 0)
|
|
1286 {
|
|
1287 charsets[XCHARSET_LEADING_BYTE (Vcharset_ascii) - MIN_LEADING_BYTE] = 1;
|
|
1288 return;
|
|
1289 }
|
|
1290
|
|
1291 while (str < strend)
|
|
1292 {
|
867
|
1293 charsets[ichar_leading_byte (itext_ichar (str)) - MIN_LEADING_BYTE] =
|
771
|
1294 1;
|
867
|
1295 INC_IBYTEPTR (str);
|
771
|
1296 }
|
|
1297 #endif
|
|
1298 }
|
|
1299
|
|
1300 void
|
2333
|
1301 find_charsets_in_ichar_string (unsigned char *charsets,
|
|
1302 const Ichar *USED_IF_MULE (str),
|
|
1303 Charcount USED_IF_MULE (len))
|
771
|
1304 {
|
|
1305 #ifndef MULE
|
|
1306 /* Telescope this. */
|
|
1307 charsets[0] = 1;
|
|
1308 #else
|
|
1309 int i;
|
|
1310
|
|
1311 memset (charsets, 0, NUM_LEADING_BYTES);
|
|
1312
|
|
1313 /* #### SJT doesn't like this. */
|
|
1314 if (len == 0)
|
|
1315 {
|
|
1316 charsets[XCHARSET_LEADING_BYTE (Vcharset_ascii) - MIN_LEADING_BYTE] = 1;
|
|
1317 return;
|
|
1318 }
|
|
1319
|
|
1320 for (i = 0; i < len; i++)
|
|
1321 {
|
867
|
1322 charsets[ichar_leading_byte (str[i]) - MIN_LEADING_BYTE] = 1;
|
771
|
1323 }
|
|
1324 #endif
|
|
1325 }
|
|
1326
|
|
1327 int
|
867
|
1328 ibyte_string_displayed_columns (const Ibyte *str, Bytecount len)
|
771
|
1329 {
|
|
1330 int cols = 0;
|
867
|
1331 const Ibyte *end = str + len;
|
771
|
1332
|
|
1333 while (str < end)
|
|
1334 {
|
|
1335 #ifdef MULE
|
867
|
1336 Ichar ch = itext_ichar (str);
|
|
1337 cols += XCHARSET_COLUMNS (ichar_charset (ch));
|
771
|
1338 #else
|
|
1339 cols++;
|
|
1340 #endif
|
867
|
1341 INC_IBYTEPTR (str);
|
771
|
1342 }
|
|
1343
|
|
1344 return cols;
|
|
1345 }
|
|
1346
|
|
1347 int
|
2333
|
1348 ichar_string_displayed_columns (const Ichar *USED_IF_MULE (str), Charcount len)
|
771
|
1349 {
|
|
1350 #ifdef MULE
|
|
1351 int cols = 0;
|
|
1352 int i;
|
|
1353
|
|
1354 for (i = 0; i < len; i++)
|
867
|
1355 cols += XCHARSET_COLUMNS (ichar_charset (str[i]));
|
771
|
1356
|
|
1357 return cols;
|
|
1358 #else /* not MULE */
|
|
1359 return len;
|
|
1360 #endif
|
|
1361 }
|
|
1362
|
|
1363 Charcount
|
2333
|
1364 ibyte_string_nonascii_chars (const Ibyte *USED_IF_MULE (str),
|
|
1365 Bytecount USED_IF_MULE (len))
|
771
|
1366 {
|
|
1367 #ifdef MULE
|
867
|
1368 const Ibyte *end = str + len;
|
771
|
1369 Charcount retval = 0;
|
|
1370
|
|
1371 while (str < end)
|
|
1372 {
|
826
|
1373 if (!byte_ascii_p (*str))
|
771
|
1374 retval++;
|
867
|
1375 INC_IBYTEPTR (str);
|
771
|
1376 }
|
|
1377
|
|
1378 return retval;
|
|
1379 #else
|
|
1380 return 0;
|
|
1381 #endif
|
|
1382 }
|
|
1383
|
|
1384
|
|
1385 /***************************************************************************/
|
|
1386 /* Eistring helper functions */
|
|
1387 /***************************************************************************/
|
|
1388
|
|
1389 int
|
867
|
1390 eistr_casefiddle_1 (Ibyte *olddata, Bytecount len, Ibyte *newdata,
|
771
|
1391 int downp)
|
|
1392 {
|
867
|
1393 Ibyte *endp = olddata + len;
|
|
1394 Ibyte *newp = newdata;
|
771
|
1395 int changedp = 0;
|
|
1396
|
|
1397 while (olddata < endp)
|
|
1398 {
|
867
|
1399 Ichar c = itext_ichar (olddata);
|
|
1400 Ichar newc;
|
771
|
1401
|
|
1402 if (downp)
|
|
1403 newc = DOWNCASE (0, c);
|
|
1404 else
|
|
1405 newc = UPCASE (0, c);
|
|
1406
|
|
1407 if (c != newc)
|
|
1408 changedp = 1;
|
|
1409
|
867
|
1410 newp += set_itext_ichar (newp, newc);
|
|
1411 INC_IBYTEPTR (olddata);
|
771
|
1412 }
|
|
1413
|
|
1414 *newp = '\0';
|
|
1415
|
|
1416 return changedp ? newp - newdata : 0;
|
|
1417 }
|
|
1418
|
|
/* Return a buffer size that is at least NEEDED_SIZE.  OLDBUFSIZE is
   returned unchanged if it already suffices; otherwise it is grown by
   repeated multiplication by 3/2, never going below 32. */
int
eifind_large_enough_buffer (int oldbufsize, int needed_size)
{
  int size;

  for (size = oldbufsize; size < needed_size; )
    size = max (size * 3 / 2, 32);

  return size;
}
|
|
1430
|
|
1431 void
|
|
1432 eito_malloc_1 (Eistring *ei)
|
|
1433 {
|
|
1434 if (ei->mallocp_)
|
|
1435 return;
|
|
1436 ei->mallocp_ = 1;
|
|
1437 if (ei->data_)
|
|
1438 {
|
867
|
1439 Ibyte *newdata;
|
771
|
1440
|
|
1441 ei->max_size_allocated_ =
|
|
1442 eifind_large_enough_buffer (0, ei->bytelen_ + 1);
|
867
|
1443 newdata = (Ibyte *) xmalloc (ei->max_size_allocated_);
|
771
|
1444 memcpy (newdata, ei->data_, ei->bytelen_ + 1);
|
|
1445 ei->data_ = newdata;
|
|
1446 }
|
|
1447
|
|
1448 if (ei->extdata_)
|
|
1449 {
|
|
1450 Extbyte *newdata = (Extbyte *) xmalloc (ei->extlen_ + 2);
|
|
1451
|
|
1452 memcpy (newdata, ei->extdata_, ei->extlen_);
|
|
1453 /* Double null-terminate in case of Unicode data */
|
|
1454 newdata[ei->extlen_] = '\0';
|
|
1455 newdata[ei->extlen_ + 1] = '\0';
|
|
1456 ei->extdata_ = newdata;
|
|
1457 }
|
|
1458 }
|
|
1459
|
|
1460 int
|
|
1461 eicmp_1 (Eistring *ei, Bytecount off, Charcount charoff,
|
867
|
1462 Bytecount len, Charcount charlen, const Ibyte *data,
|
771
|
1463 const Eistring *ei2, int is_c, int fold_case)
|
|
1464 {
|
|
1465 assert ((off < 0) != (charoff < 0));
|
|
1466 if (off < 0)
|
|
1467 {
|
|
1468 off = charcount_to_bytecount (ei->data_, charoff);
|
|
1469 if (charlen < 0)
|
|
1470 len = -1;
|
|
1471 else
|
|
1472 len = charcount_to_bytecount (ei->data_ + off, charlen);
|
|
1473 }
|
|
1474 if (len < 0)
|
|
1475 len = ei->bytelen_ - off;
|
|
1476
|
|
1477 assert (off >= 0 && off <= ei->bytelen_);
|
|
1478 assert (len >= 0 && off + len <= ei->bytelen_);
|
|
1479 assert ((data == 0) != (ei == 0));
|
|
1480 assert ((is_c != 0) == (data != 0));
|
|
1481 assert (fold_case >= 0 && fold_case <= 2);
|
|
1482
|
|
1483 {
|
|
1484 Bytecount dstlen;
|
867
|
1485 const Ibyte *src = ei->data_, *dst;
|
771
|
1486
|
|
1487 if (data)
|
|
1488 {
|
|
1489 dst = data;
|
|
1490 dstlen = qxestrlen (data);
|
|
1491 }
|
|
1492 else
|
|
1493 {
|
|
1494 dst = ei2->data_;
|
|
1495 dstlen = ei2->bytelen_;
|
|
1496 }
|
|
1497
|
|
1498 if (is_c)
|
|
1499 EI_ASSERT_ASCII ((Char_ASCII *) dst, dstlen);
|
|
1500
|
801
|
1501 return (fold_case == 0 ? qxememcmp4 (src, len, dst, dstlen) :
|
|
1502 fold_case == 1 ? qxememcasecmp4 (src, len, dst, dstlen) :
|
|
1503 qxetextcasecmp (src, len, dst, dstlen));
|
771
|
1504 }
|
|
1505 }
|
|
1506
|
867
|
1507 Ibyte *
|
826
|
1508 eicpyout_malloc_fmt (Eistring *eistr, Bytecount *len_out, Internal_Format fmt,
|
2286
|
1509 Lisp_Object UNUSED (object))
|
771
|
1510 {
|
867
|
1511 Ibyte *ptr;
|
771
|
1512
|
|
1513 assert (fmt == FORMAT_DEFAULT);
|
867
|
1514 ptr = xnew_array (Ibyte, eistr->bytelen_ + 1);
|
771
|
1515 if (len_out)
|
|
1516 *len_out = eistr->bytelen_;
|
|
1517 memcpy (ptr, eistr->data_, eistr->bytelen_ + 1);
|
|
1518 return ptr;
|
|
1519 }
|
|
1520
|
|
1521
|
|
1522 /************************************************************************/
|
|
1523 /* Charcount/Bytecount conversion */
|
|
1524 /************************************************************************/
|
|
1525
|
|
1526 /* Optimization. Do it. Live it. Love it. */
|
|
1527
|
|
1528 #ifdef MULE
|
|
1529
|
826
|
1530 /* Skip as many ASCII bytes as possible in the memory block [PTR, END).
|
|
1531 Return pointer to the first non-ASCII byte. optimized for long
|
|
1532 stretches of ASCII. */
|
867
|
1533 inline static const Ibyte *
|
|
1534 skip_ascii (const Ibyte *ptr, const Ibyte *end)
|
771
|
1535 {
|
826
|
1536 #ifdef EFFICIENT_INT_128_BIT
|
|
1537 # define STRIDE_TYPE INT_128_BIT
|
|
1538 # define HIGH_BIT_MASK \
|
|
1539 MAKE_128_BIT_UNSIGNED_CONSTANT (0x80808080808080808080808080808080)
|
|
1540 #elif defined (EFFICIENT_INT_64_BIT)
|
|
1541 # define STRIDE_TYPE INT_64_BIT
|
|
1542 # define HIGH_BIT_MASK MAKE_64_BIT_UNSIGNED_CONSTANT (0x8080808080808080)
|
771
|
1543 #else
|
826
|
1544 # define STRIDE_TYPE INT_32_BIT
|
|
1545 # define HIGH_BIT_MASK MAKE_32_BIT_UNSIGNED_CONSTANT (0x80808080)
|
771
|
1546 #endif
|
|
1547
|
|
1548 #define ALIGN_BITS ((EMACS_UINT) (ALIGNOF (STRIDE_TYPE) - 1))
|
|
1549 #define ALIGN_MASK (~ ALIGN_BITS)
|
|
1550 #define ALIGNED(ptr) ((((EMACS_UINT) ptr) & ALIGN_BITS) == 0)
|
|
1551 #define STRIDE sizeof (STRIDE_TYPE)
|
|
1552
|
826
|
1553 const unsigned STRIDE_TYPE *ascii_end;
|
|
1554
|
|
1555 /* Need to do in 3 sections -- before alignment start, aligned chunk,
|
|
1556 after alignment end. */
|
|
1557 while (!ALIGNED (ptr))
|
771
|
1558 {
|
826
|
1559 if (ptr == end || !byte_ascii_p (*ptr))
|
|
1560 return ptr;
|
|
1561 ptr++;
|
|
1562 }
|
|
1563 ascii_end = (const unsigned STRIDE_TYPE *) ptr;
|
|
1564 /* This loop screams, because we can detect ASCII
|
|
1565 characters 4 or 8 at a time. */
|
867
|
1566 while ((const Ibyte *) ascii_end + STRIDE <= end
|
826
|
1567 && !(*ascii_end & HIGH_BIT_MASK))
|
|
1568 ascii_end++;
|
867
|
1569 ptr = (Ibyte *) ascii_end;
|
826
|
1570 while (ptr < end && byte_ascii_p (*ptr))
|
|
1571 ptr++;
|
|
1572 return ptr;
|
|
1573 }
|
|
1574
|
|
1575 /* Function equivalents of bytecount_to_charcount/charcount_to_bytecount.
|
|
1576 These work on strings of all sizes but are more efficient than a simple
|
|
1577 loop on large strings and probably less efficient on sufficiently small
|
|
1578 strings. */
|
|
1579
|
|
1580 Charcount
|
867
|
1581 bytecount_to_charcount_fun (const Ibyte *ptr, Bytecount len)
|
826
|
1582 {
|
|
1583 Charcount count = 0;
|
867
|
1584 const Ibyte *end = ptr + len;
|
826
|
1585 while (1)
|
|
1586 {
|
867
|
1587 const Ibyte *newptr = skip_ascii (ptr, end);
|
826
|
1588 count += newptr - ptr;
|
|
1589 ptr = newptr;
|
|
1590 if (ptr == end)
|
|
1591 break;
|
|
1592 {
|
|
1593 /* Optimize for successive characters from the same charset */
|
867
|
1594 Ibyte leading_byte = *ptr;
|
826
|
1595 int bytes = rep_bytes_by_first_byte (leading_byte);
|
|
1596 while (ptr < end && *ptr == leading_byte)
|
|
1597 ptr += bytes, count++;
|
|
1598 }
|
771
|
1599 }
|
|
1600
|
|
1601 /* Bomb out if the specified substring ends in the middle
|
|
1602 of a character. Note that we might have already gotten
|
|
1603 a core dump above from an invalid reference, but at least
|
|
1604 we will get no farther than here.
|
|
1605
|
|
1606 This also catches len < 0. */
|
800
|
1607 text_checking_assert (ptr == end);
|
771
|
1608
|
|
1609 return count;
|
|
1610 }
|
|
1611
|
|
1612 Bytecount
|
867
|
1613 charcount_to_bytecount_fun (const Ibyte *ptr, Charcount len)
|
771
|
1614 {
|
867
|
1615 const Ibyte *newptr = ptr;
|
826
|
1616 while (1)
|
771
|
1617 {
|
867
|
1618 const Ibyte *newnewptr = skip_ascii (newptr, newptr + len);
|
826
|
1619 len -= newnewptr - newptr;
|
|
1620 newptr = newnewptr;
|
|
1621 if (!len)
|
|
1622 break;
|
|
1623 {
|
|
1624 /* Optimize for successive characters from the same charset */
|
867
|
1625 Ibyte leading_byte = *newptr;
|
826
|
1626 int bytes = rep_bytes_by_first_byte (leading_byte);
|
|
1627 while (len > 0 && *newptr == leading_byte)
|
|
1628 newptr += bytes, len--;
|
|
1629 }
|
771
|
1630 }
|
|
1631 return newptr - ptr;
|
|
1632 }
|
|
1633
|
|
1634 /* The next two functions are the actual meat behind the
|
|
1635 charbpos-to-bytebpos and bytebpos-to-charbpos conversions. Currently
|
|
1636 the method they use is fairly unsophisticated; see buffer.h.
|
|
1637
|
|
1638 Note that charbpos_to_bytebpos_func() is probably the most-called
|
|
1639 function in all of XEmacs. Therefore, it must be FAST FAST FAST.
|
|
1640 This is the reason why so much of the code is duplicated.
|
|
1641
|
|
1642 Similar considerations apply to bytebpos_to_charbpos_func(), although
|
|
1643 less so because the function is not called so often.
|
|
1644
|
|
1645 #### At some point this should use a more sophisticated method;
|
|
1646 see buffer.h. */
|
|
1647
|
|
1648 static int not_very_random_number;
|
|
1649
|
|
1650 Bytebpos
|
|
1651 charbpos_to_bytebpos_func (struct buffer *buf, Charbpos x)
|
|
1652 {
|
|
1653 Charbpos bufmin;
|
|
1654 Charbpos bufmax;
|
|
1655 Bytebpos bytmin;
|
|
1656 Bytebpos bytmax;
|
|
1657 int size;
|
|
1658 int forward_p;
|
|
1659 Bytebpos retval;
|
|
1660 int diff_so_far;
|
|
1661 int add_to_cache = 0;
|
1292
|
1662 PROFILE_DECLARE ();
|
771
|
1663
|
|
1664 /* Check for some cached positions, for speed. */
|
|
1665 if (x == BUF_PT (buf))
|
826
|
1666 return BYTE_BUF_PT (buf);
|
771
|
1667 if (x == BUF_ZV (buf))
|
826
|
1668 return BYTE_BUF_ZV (buf);
|
771
|
1669 if (x == BUF_BEGV (buf))
|
826
|
1670 return BYTE_BUF_BEGV (buf);
|
771
|
1671
|
1292
|
1672 PROFILE_RECORD_ENTERING_SECTION (QSin_char_byte_conversion);
|
|
1673
|
771
|
1674 bufmin = buf->text->mule_bufmin;
|
|
1675 bufmax = buf->text->mule_bufmax;
|
|
1676 bytmin = buf->text->mule_bytmin;
|
|
1677 bytmax = buf->text->mule_bytmax;
|
|
1678 size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p;
|
|
1679
|
|
1680 /* The basic idea here is that we shift the "known region" up or down
|
|
1681 until it overlaps the specified position. We do this by moving
|
|
1682 the upper bound of the known region up one character at a time,
|
|
1683 and moving the lower bound of the known region up as necessary
|
|
1684 when the size of the character just seen changes.
|
|
1685
|
|
1686 We optimize this, however, by first shifting the known region to
|
|
1687 one of the cached points if it's close by. (We don't check BEG or
|
|
1688 Z, even though they're cached; most of the time these will be the
|
|
1689 same as BEGV and ZV, and when they're not, they're not likely
|
|
1690 to be used.) */
|
|
1691
|
|
1692 if (x > bufmax)
|
|
1693 {
|
|
1694 Charbpos diffmax = x - bufmax;
|
|
1695 Charbpos diffpt = x - BUF_PT (buf);
|
|
1696 Charbpos diffzv = BUF_ZV (buf) - x;
|
|
1697 /* #### This value could stand some more exploration. */
|
|
1698 Charcount heuristic_hack = (bufmax - bufmin) >> 2;
|
|
1699
|
|
1700 /* Check if the position is closer to PT or ZV than to the
|
|
1701 end of the known region. */
|
|
1702
|
|
1703 if (diffpt < 0)
|
|
1704 diffpt = -diffpt;
|
|
1705 if (diffzv < 0)
|
|
1706 diffzv = -diffzv;
|
|
1707
|
|
1708 /* But also implement a heuristic that favors the known region
|
|
1709 over PT or ZV. The reason for this is that switching to
|
|
1710 PT or ZV will wipe out the knowledge in the known region,
|
|
1711 which might be annoying if the known region is large and
|
|
1712 PT or ZV is not that much closer than the end of the known
|
|
1713 region. */
|
|
1714
|
|
1715 diffzv += heuristic_hack;
|
|
1716 diffpt += heuristic_hack;
|
|
1717 if (diffpt < diffmax && diffpt <= diffzv)
|
|
1718 {
|
|
1719 bufmax = bufmin = BUF_PT (buf);
|
826
|
1720 bytmax = bytmin = BYTE_BUF_PT (buf);
|
771
|
1721 /* We set the size to 1 even though it doesn't really
|
|
1722 matter because the new known region contains no
|
|
1723 characters. We do this because this is the most
|
|
1724 likely size of the characters around the new known
|
|
1725 region, and we avoid potential yuckiness that is
|
|
1726 done when size == 3. */
|
|
1727 size = 1;
|
|
1728 }
|
|
1729 if (diffzv < diffmax)
|
|
1730 {
|
|
1731 bufmax = bufmin = BUF_ZV (buf);
|
826
|
1732 bytmax = bytmin = BYTE_BUF_ZV (buf);
|
771
|
1733 size = 1;
|
|
1734 }
|
|
1735 }
|
800
|
1736 #ifdef ERROR_CHECK_TEXT
|
771
|
1737 else if (x >= bufmin)
|
|
1738 abort ();
|
|
1739 #endif
|
|
1740 else
|
|
1741 {
|
|
1742 Charbpos diffmin = bufmin - x;
|
|
1743 Charbpos diffpt = BUF_PT (buf) - x;
|
|
1744 Charbpos diffbegv = x - BUF_BEGV (buf);
|
|
1745 /* #### This value could stand some more exploration. */
|
|
1746 Charcount heuristic_hack = (bufmax - bufmin) >> 2;
|
|
1747
|
|
1748 if (diffpt < 0)
|
|
1749 diffpt = -diffpt;
|
|
1750 if (diffbegv < 0)
|
|
1751 diffbegv = -diffbegv;
|
|
1752
|
|
1753 /* But also implement a heuristic that favors the known region --
|
|
1754 see above. */
|
|
1755
|
|
1756 diffbegv += heuristic_hack;
|
|
1757 diffpt += heuristic_hack;
|
|
1758
|
|
1759 if (diffpt < diffmin && diffpt <= diffbegv)
|
|
1760 {
|
|
1761 bufmax = bufmin = BUF_PT (buf);
|
826
|
1762 bytmax = bytmin = BYTE_BUF_PT (buf);
|
771
|
1763 /* We set the size to 1 even though it doesn't really
|
|
1764 matter because the new known region contains no
|
|
1765 characters. We do this because this is the most
|
|
1766 likely size of the characters around the new known
|
|
1767 region, and we avoid potential yuckiness that is
|
|
1768 done when size == 3. */
|
|
1769 size = 1;
|
|
1770 }
|
|
1771 if (diffbegv < diffmin)
|
|
1772 {
|
|
1773 bufmax = bufmin = BUF_BEGV (buf);
|
826
|
1774 bytmax = bytmin = BYTE_BUF_BEGV (buf);
|
771
|
1775 size = 1;
|
|
1776 }
|
|
1777 }
|
|
1778
|
|
1779 diff_so_far = x > bufmax ? x - bufmax : bufmin - x;
|
|
1780 if (diff_so_far > 50)
|
|
1781 {
|
|
1782 /* If we have to move more than a certain amount, then look
|
|
1783 into our cache. */
|
|
1784 int minval = INT_MAX;
|
|
1785 int found = 0;
|
|
1786 int i;
|
|
1787
|
|
1788 add_to_cache = 1;
|
|
1789 /* I considered keeping the positions ordered. This would speed
|
|
1790 up this loop, but updating the cache would take longer, so
|
|
1791 it doesn't seem like it would really matter. */
|
|
1792 for (i = 0; i < 16; i++)
|
|
1793 {
|
|
1794 int diff = buf->text->mule_charbpos_cache[i] - x;
|
|
1795
|
|
1796 if (diff < 0)
|
|
1797 diff = -diff;
|
|
1798 if (diff < minval)
|
|
1799 {
|
|
1800 minval = diff;
|
|
1801 found = i;
|
|
1802 }
|
|
1803 }
|
|
1804
|
|
1805 if (minval < diff_so_far)
|
|
1806 {
|
|
1807 bufmax = bufmin = buf->text->mule_charbpos_cache[found];
|
|
1808 bytmax = bytmin = buf->text->mule_bytebpos_cache[found];
|
|
1809 size = 1;
|
|
1810 }
|
|
1811 }
|
|
1812
|
|
1813 /* It's conceivable that the caching above could lead to X being
|
|
1814 the same as one of the range edges. */
|
|
1815 if (x >= bufmax)
|
|
1816 {
|
|
1817 Bytebpos newmax;
|
|
1818 Bytecount newsize;
|
|
1819
|
|
1820 forward_p = 1;
|
|
1821 while (x > bufmax)
|
|
1822 {
|
|
1823 newmax = bytmax;
|
|
1824
|
|
1825 INC_BYTEBPOS (buf, newmax);
|
|
1826 newsize = newmax - bytmax;
|
|
1827 if (newsize != size)
|
|
1828 {
|
|
1829 bufmin = bufmax;
|
|
1830 bytmin = bytmax;
|
|
1831 size = newsize;
|
|
1832 }
|
|
1833 bytmax = newmax;
|
|
1834 bufmax++;
|
|
1835 }
|
|
1836 retval = bytmax;
|
|
1837
|
|
1838 /* #### Should go past the found location to reduce the number
|
|
1839 of times that this function is called */
|
|
1840 }
|
|
1841 else /* x < bufmin */
|
|
1842 {
|
|
1843 Bytebpos newmin;
|
|
1844 Bytecount newsize;
|
|
1845
|
|
1846 forward_p = 0;
|
|
1847 while (x < bufmin)
|
|
1848 {
|
|
1849 newmin = bytmin;
|
|
1850
|
|
1851 DEC_BYTEBPOS (buf, newmin);
|
|
1852 newsize = bytmin - newmin;
|
|
1853 if (newsize != size)
|
|
1854 {
|
|
1855 bufmax = bufmin;
|
|
1856 bytmax = bytmin;
|
|
1857 size = newsize;
|
|
1858 }
|
|
1859 bytmin = newmin;
|
|
1860 bufmin--;
|
|
1861 }
|
|
1862 retval = bytmin;
|
|
1863
|
|
1864 /* #### Should go past the found location to reduce the number
|
|
1865 of times that this function is called
|
|
1866 */
|
|
1867 }
|
|
1868
|
|
1869 /* If size is three, than we have to max sure that the range we
|
|
1870 discovered isn't too large, because we use a fixed-length
|
|
1871 table to divide by 3. */
|
|
1872
|
|
1873 if (size == 3)
|
|
1874 {
|
|
1875 int gap = bytmax - bytmin;
|
|
1876 buf->text->mule_three_p = 1;
|
|
1877 buf->text->mule_shifter = 1;
|
|
1878
|
|
1879 if (gap > MAX_BYTEBPOS_GAP_SIZE_3)
|
|
1880 {
|
|
1881 if (forward_p)
|
|
1882 {
|
|
1883 bytmin = bytmax - MAX_BYTEBPOS_GAP_SIZE_3;
|
|
1884 bufmin = bufmax - MAX_CHARBPOS_GAP_SIZE_3;
|
|
1885 }
|
|
1886 else
|
|
1887 {
|
|
1888 bytmax = bytmin + MAX_BYTEBPOS_GAP_SIZE_3;
|
|
1889 bufmax = bufmin + MAX_CHARBPOS_GAP_SIZE_3;
|
|
1890 }
|
|
1891 }
|
|
1892 }
|
|
1893 else
|
|
1894 {
|
|
1895 buf->text->mule_three_p = 0;
|
|
1896 if (size == 4)
|
|
1897 buf->text->mule_shifter = 2;
|
|
1898 else
|
|
1899 buf->text->mule_shifter = size - 1;
|
|
1900 }
|
|
1901
|
|
1902 buf->text->mule_bufmin = bufmin;
|
|
1903 buf->text->mule_bufmax = bufmax;
|
|
1904 buf->text->mule_bytmin = bytmin;
|
|
1905 buf->text->mule_bytmax = bytmax;
|
|
1906
|
|
1907 if (add_to_cache)
|
|
1908 {
|
|
1909 int replace_loc;
|
|
1910
|
|
1911 /* We throw away a "random" cached value and replace it with
|
|
1912 the new value. It doesn't actually have to be very random
|
|
1913 at all, just evenly distributed.
|
|
1914
|
|
1915 #### It would be better to use a least-recently-used algorithm
|
|
1916 or something that tries to space things out, but I'm not sure
|
|
1917 it's worth it to go to the trouble of maintaining that. */
|
|
1918 not_very_random_number += 621;
|
|
1919 replace_loc = not_very_random_number & 15;
|
|
1920 buf->text->mule_charbpos_cache[replace_loc] = x;
|
|
1921 buf->text->mule_bytebpos_cache[replace_loc] = retval;
|
|
1922 }
|
|
1923
|
1292
|
1924 PROFILE_RECORD_EXITING_SECTION (QSin_char_byte_conversion);
|
|
1925
|
771
|
1926 return retval;
|
|
1927 }
|
|
1928
|
|
1929 /* The logic in this function is almost identical to the logic in
|
|
1930 the previous function. */
|
|
1931
|
|
1932 Charbpos
|
|
1933 bytebpos_to_charbpos_func (struct buffer *buf, Bytebpos x)
|
|
1934 {
|
|
1935 Charbpos bufmin;
|
|
1936 Charbpos bufmax;
|
|
1937 Bytebpos bytmin;
|
|
1938 Bytebpos bytmax;
|
|
1939 int size;
|
|
1940 int forward_p;
|
|
1941 Charbpos retval;
|
|
1942 int diff_so_far;
|
|
1943 int add_to_cache = 0;
|
1292
|
1944 PROFILE_DECLARE ();
|
771
|
1945
|
|
1946 /* Check for some cached positions, for speed. */
|
826
|
1947 if (x == BYTE_BUF_PT (buf))
|
771
|
1948 return BUF_PT (buf);
|
826
|
1949 if (x == BYTE_BUF_ZV (buf))
|
771
|
1950 return BUF_ZV (buf);
|
826
|
1951 if (x == BYTE_BUF_BEGV (buf))
|
771
|
1952 return BUF_BEGV (buf);
|
|
1953
|
1292
|
1954 PROFILE_RECORD_ENTERING_SECTION (QSin_char_byte_conversion);
|
|
1955
|
771
|
1956 bufmin = buf->text->mule_bufmin;
|
|
1957 bufmax = buf->text->mule_bufmax;
|
|
1958 bytmin = buf->text->mule_bytmin;
|
|
1959 bytmax = buf->text->mule_bytmax;
|
|
1960 size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p;
|
|
1961
|
|
1962 /* The basic idea here is that we shift the "known region" up or down
|
|
1963 until it overlaps the specified position. We do this by moving
|
|
1964 the upper bound of the known region up one character at a time,
|
|
1965 and moving the lower bound of the known region up as necessary
|
|
1966 when the size of the character just seen changes.
|
|
1967
|
|
1968 We optimize this, however, by first shifting the known region to
|
826
|
1969 one of the cached points if it's close by. (We don't check BYTE_BEG or
|
|
1970 BYTE_Z, even though they're cached; most of the time these will be the
|
|
1971 same as BYTE_BEGV and BYTE_ZV, and when they're not, they're not likely
|
771
|
1972 to be used.) */
|
|
1973
|
|
1974 if (x > bytmax)
|
|
1975 {
|
|
1976 Bytebpos diffmax = x - bytmax;
|
826
|
1977 Bytebpos diffpt = x - BYTE_BUF_PT (buf);
|
|
1978 Bytebpos diffzv = BYTE_BUF_ZV (buf) - x;
|
771
|
1979 /* #### This value could stand some more exploration. */
|
|
1980 Bytecount heuristic_hack = (bytmax - bytmin) >> 2;
|
|
1981
|
|
1982 /* Check if the position is closer to PT or ZV than to the
|
|
1983 end of the known region. */
|
|
1984
|
|
1985 if (diffpt < 0)
|
|
1986 diffpt = -diffpt;
|
|
1987 if (diffzv < 0)
|
|
1988 diffzv = -diffzv;
|
|
1989
|
|
1990 /* But also implement a heuristic that favors the known region
|
826
|
1991 over BYTE_PT or BYTE_ZV. The reason for this is that switching to
|
|
1992 BYTE_PT or BYTE_ZV will wipe out the knowledge in the known region,
|
771
|
1993 which might be annoying if the known region is large and
|
826
|
1994 BYTE_PT or BYTE_ZV is not that much closer than the end of the known
|
771
|
1995 region. */
|
|
1996
|
|
1997 diffzv += heuristic_hack;
|
|
1998 diffpt += heuristic_hack;
|
|
1999 if (diffpt < diffmax && diffpt <= diffzv)
|
|
2000 {
|
|
2001 bufmax = bufmin = BUF_PT (buf);
|
826
|
2002 bytmax = bytmin = BYTE_BUF_PT (buf);
|
771
|
2003 /* We set the size to 1 even though it doesn't really
|
|
2004 matter because the new known region contains no
|
|
2005 characters. We do this because this is the most
|
|
2006 likely size of the characters around the new known
|
|
2007 region, and we avoid potential yuckiness that is
|
|
2008 done when size == 3. */
|
|
2009 size = 1;
|
|
2010 }
|
|
2011 if (diffzv < diffmax)
|
|
2012 {
|
|
2013 bufmax = bufmin = BUF_ZV (buf);
|
826
|
2014 bytmax = bytmin = BYTE_BUF_ZV (buf);
|
771
|
2015 size = 1;
|
|
2016 }
|
|
2017 }
|
800
|
2018 #ifdef ERROR_CHECK_TEXT
|
771
|
2019 else if (x >= bytmin)
|
|
2020 abort ();
|
|
2021 #endif
|
|
2022 else
|
|
2023 {
|
|
2024 Bytebpos diffmin = bytmin - x;
|
826
|
2025 Bytebpos diffpt = BYTE_BUF_PT (buf) - x;
|
|
2026 Bytebpos diffbegv = x - BYTE_BUF_BEGV (buf);
|
771
|
2027 /* #### This value could stand some more exploration. */
|
|
2028 Bytecount heuristic_hack = (bytmax - bytmin) >> 2;
|
|
2029
|
|
2030 if (diffpt < 0)
|
|
2031 diffpt = -diffpt;
|
|
2032 if (diffbegv < 0)
|
|
2033 diffbegv = -diffbegv;
|
|
2034
|
|
2035 /* But also implement a heuristic that favors the known region --
|
|
2036 see above. */
|
|
2037
|
|
2038 diffbegv += heuristic_hack;
|
|
2039 diffpt += heuristic_hack;
|
|
2040
|
|
2041 if (diffpt < diffmin && diffpt <= diffbegv)
|
|
2042 {
|
|
2043 bufmax = bufmin = BUF_PT (buf);
|
826
|
2044 bytmax = bytmin = BYTE_BUF_PT (buf);
|
771
|
2045 /* We set the size to 1 even though it doesn't really
|
|
2046 matter because the new known region contains no
|
|
2047 characters. We do this because this is the most
|
|
2048 likely size of the characters around the new known
|
|
2049 region, and we avoid potential yuckiness that is
|
|
2050 done when size == 3. */
|
|
2051 size = 1;
|
|
2052 }
|
|
2053 if (diffbegv < diffmin)
|
|
2054 {
|
|
2055 bufmax = bufmin = BUF_BEGV (buf);
|
826
|
2056 bytmax = bytmin = BYTE_BUF_BEGV (buf);
|
771
|
2057 size = 1;
|
|
2058 }
|
|
2059 }
|
|
2060
|
|
2061 diff_so_far = x > bytmax ? x - bytmax : bytmin - x;
|
|
2062 if (diff_so_far > 50)
|
|
2063 {
|
|
2064 /* If we have to move more than a certain amount, then look
|
|
2065 into our cache. */
|
|
2066 int minval = INT_MAX;
|
|
2067 int found = 0;
|
|
2068 int i;
|
|
2069
|
|
2070 add_to_cache = 1;
|
|
2071 /* I considered keeping the positions ordered. This would speed
|
|
2072 up this loop, but updating the cache would take longer, so
|
|
2073 it doesn't seem like it would really matter. */
|
|
2074 for (i = 0; i < 16; i++)
|
|
2075 {
|
|
2076 int diff = buf->text->mule_bytebpos_cache[i] - x;
|
|
2077
|
|
2078 if (diff < 0)
|
|
2079 diff = -diff;
|
|
2080 if (diff < minval)
|
|
2081 {
|
|
2082 minval = diff;
|
|
2083 found = i;
|
|
2084 }
|
|
2085 }
|
|
2086
|
|
2087 if (minval < diff_so_far)
|
|
2088 {
|
|
2089 bufmax = bufmin = buf->text->mule_charbpos_cache[found];
|
|
2090 bytmax = bytmin = buf->text->mule_bytebpos_cache[found];
|
|
2091 size = 1;
|
|
2092 }
|
|
2093 }
|
|
2094
|
|
2095 /* It's conceivable that the caching above could lead to X being
|
|
2096 the same as one of the range edges. */
|
|
2097 if (x >= bytmax)
|
|
2098 {
|
|
2099 Bytebpos newmax;
|
|
2100 Bytecount newsize;
|
|
2101
|
|
2102 forward_p = 1;
|
|
2103 while (x > bytmax)
|
|
2104 {
|
|
2105 newmax = bytmax;
|
|
2106
|
|
2107 INC_BYTEBPOS (buf, newmax);
|
|
2108 newsize = newmax - bytmax;
|
|
2109 if (newsize != size)
|
|
2110 {
|
|
2111 bufmin = bufmax;
|
|
2112 bytmin = bytmax;
|
|
2113 size = newsize;
|
|
2114 }
|
|
2115 bytmax = newmax;
|
|
2116 bufmax++;
|
|
2117 }
|
|
2118 retval = bufmax;
|
|
2119
|
|
2120 /* #### Should go past the found location to reduce the number
|
|
2121 of times that this function is called */
|
|
2122 }
|
|
2123 else /* x <= bytmin */
|
|
2124 {
|
|
2125 Bytebpos newmin;
|
|
2126 Bytecount newsize;
|
|
2127
|
|
2128 forward_p = 0;
|
|
2129 while (x < bytmin)
|
|
2130 {
|
|
2131 newmin = bytmin;
|
|
2132
|
|
2133 DEC_BYTEBPOS (buf, newmin);
|
|
2134 newsize = bytmin - newmin;
|
|
2135 if (newsize != size)
|
|
2136 {
|
|
2137 bufmax = bufmin;
|
|
2138 bytmax = bytmin;
|
|
2139 size = newsize;
|
|
2140 }
|
|
2141 bytmin = newmin;
|
|
2142 bufmin--;
|
|
2143 }
|
|
2144 retval = bufmin;
|
|
2145
|
|
2146 /* #### Should go past the found location to reduce the number
|
|
2147 of times that this function is called
|
|
2148 */
|
|
2149 }
|
|
2150
|
|
2151 /* If size is three, than we have to max sure that the range we
|
|
2152 discovered isn't too large, because we use a fixed-length
|
|
2153 table to divide by 3. */
|
|
2154
|
|
2155 if (size == 3)
|
|
2156 {
|
|
2157 int gap = bytmax - bytmin;
|
|
2158 buf->text->mule_three_p = 1;
|
|
2159 buf->text->mule_shifter = 1;
|
|
2160
|
|
2161 if (gap > MAX_BYTEBPOS_GAP_SIZE_3)
|
|
2162 {
|
|
2163 if (forward_p)
|
|
2164 {
|
|
2165 bytmin = bytmax - MAX_BYTEBPOS_GAP_SIZE_3;
|
|
2166 bufmin = bufmax - MAX_CHARBPOS_GAP_SIZE_3;
|
|
2167 }
|
|
2168 else
|
|
2169 {
|
|
2170 bytmax = bytmin + MAX_BYTEBPOS_GAP_SIZE_3;
|
|
2171 bufmax = bufmin + MAX_CHARBPOS_GAP_SIZE_3;
|
|
2172 }
|
|
2173 }
|
|
2174 }
|
|
2175 else
|
|
2176 {
|
|
2177 buf->text->mule_three_p = 0;
|
|
2178 if (size == 4)
|
|
2179 buf->text->mule_shifter = 2;
|
|
2180 else
|
|
2181 buf->text->mule_shifter = size - 1;
|
|
2182 }
|
|
2183
|
|
2184 buf->text->mule_bufmin = bufmin;
|
|
2185 buf->text->mule_bufmax = bufmax;
|
|
2186 buf->text->mule_bytmin = bytmin;
|
|
2187 buf->text->mule_bytmax = bytmax;
|
|
2188
|
|
2189 if (add_to_cache)
|
|
2190 {
|
|
2191 int replace_loc;
|
|
2192
|
|
2193 /* We throw away a "random" cached value and replace it with
|
|
2194 the new value. It doesn't actually have to be very random
|
|
2195 at all, just evenly distributed.
|
|
2196
|
|
2197 #### It would be better to use a least-recently-used algorithm
|
|
2198 or something that tries to space things out, but I'm not sure
|
|
2199 it's worth it to go to the trouble of maintaining that. */
|
|
2200 not_very_random_number += 621;
|
|
2201 replace_loc = not_very_random_number & 15;
|
|
2202 buf->text->mule_charbpos_cache[replace_loc] = retval;
|
|
2203 buf->text->mule_bytebpos_cache[replace_loc] = x;
|
|
2204 }
|
|
2205
|
1292
|
2206 PROFILE_RECORD_EXITING_SECTION (QSin_char_byte_conversion);
|
|
2207
|
771
|
2208 return retval;
|
|
2209 }
|
|
2210
|
|
2211 /* Text of length BYTELENGTH and CHARLENGTH (in different units)
|
|
2212 was inserted at charbpos START. */
|
|
2213
|
|
2214 void
|
|
2215 buffer_mule_signal_inserted_region (struct buffer *buf, Charbpos start,
|
|
2216 Bytecount bytelength,
|
|
2217 Charcount charlength)
|
|
2218 {
|
|
2219 int size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p;
|
|
2220 int i;
|
|
2221
|
|
2222 /* Adjust the cache of known positions. */
|
|
2223 for (i = 0; i < 16; i++)
|
|
2224 {
|
|
2225
|
|
2226 if (buf->text->mule_charbpos_cache[i] > start)
|
|
2227 {
|
|
2228 buf->text->mule_charbpos_cache[i] += charlength;
|
|
2229 buf->text->mule_bytebpos_cache[i] += bytelength;
|
|
2230 }
|
|
2231 }
|
|
2232
|
|
2233 if (start >= buf->text->mule_bufmax)
|
826
|
2234 return;
|
771
|
2235
|
|
2236 /* The insertion is either before the known region, in which case
|
|
2237 it shoves it forward; or within the known region, in which case
|
|
2238 it shoves the end forward. (But it may make the known region
|
|
2239 inconsistent, so we may have to shorten it.) */
|
|
2240
|
|
2241 if (start <= buf->text->mule_bufmin)
|
|
2242 {
|
|
2243 buf->text->mule_bufmin += charlength;
|
|
2244 buf->text->mule_bufmax += charlength;
|
|
2245 buf->text->mule_bytmin += bytelength;
|
|
2246 buf->text->mule_bytmax += bytelength;
|
|
2247 }
|
|
2248 else
|
|
2249 {
|
|
2250 Charbpos end = start + charlength;
|
|
2251 /* the insertion point divides the known region in two.
|
|
2252 Keep the longer half, at least, and expand into the
|
|
2253 inserted chunk as much as possible. */
|
|
2254
|
|
2255 if (start - buf->text->mule_bufmin > buf->text->mule_bufmax - start)
|
|
2256 {
|
|
2257 Bytebpos bytestart = (buf->text->mule_bytmin
|
|
2258 + size * (start - buf->text->mule_bufmin));
|
|
2259 Bytebpos bytenew;
|
|
2260
|
|
2261 while (start < end)
|
|
2262 {
|
|
2263 bytenew = bytestart;
|
|
2264 INC_BYTEBPOS (buf, bytenew);
|
|
2265 if (bytenew - bytestart != size)
|
|
2266 break;
|
|
2267 start++;
|
|
2268 bytestart = bytenew;
|
|
2269 }
|
|
2270 if (start != end)
|
|
2271 {
|
|
2272 buf->text->mule_bufmax = start;
|
|
2273 buf->text->mule_bytmax = bytestart;
|
|
2274 }
|
|
2275 else
|
|
2276 {
|
|
2277 buf->text->mule_bufmax += charlength;
|
|
2278 buf->text->mule_bytmax += bytelength;
|
|
2279 }
|
|
2280 }
|
|
2281 else
|
|
2282 {
|
|
2283 Bytebpos byteend = (buf->text->mule_bytmin
|
|
2284 + size * (start - buf->text->mule_bufmin)
|
|
2285 + bytelength);
|
|
2286 Bytebpos bytenew;
|
|
2287
|
|
2288 buf->text->mule_bufmax += charlength;
|
|
2289 buf->text->mule_bytmax += bytelength;
|
|
2290
|
|
2291 while (end > start)
|
|
2292 {
|
|
2293 bytenew = byteend;
|
|
2294 DEC_BYTEBPOS (buf, bytenew);
|
|
2295 if (byteend - bytenew != size)
|
|
2296 break;
|
|
2297 end--;
|
|
2298 byteend = bytenew;
|
|
2299 }
|
|
2300 if (start != end)
|
|
2301 {
|
|
2302 buf->text->mule_bufmin = end;
|
|
2303 buf->text->mule_bytmin = byteend;
|
|
2304 }
|
|
2305 }
|
|
2306 }
|
|
2307 }
|
|
2308
|
826
|
2309 /* Text from START to END (equivalent in Bytebpos's: from BYTE_START to
|
|
2310 BYTE_END) was deleted. */
|
771
|
2311
|
|
2312 void
|
|
2313 buffer_mule_signal_deleted_region (struct buffer *buf, Charbpos start,
|
826
|
2314 Charbpos end, Bytebpos byte_start,
|
|
2315 Bytebpos byte_end)
|
771
|
2316 {
|
|
2317 int i;
|
|
2318
|
|
2319 /* Adjust the cache of known positions. */
|
|
2320 for (i = 0; i < 16; i++)
|
|
2321 {
|
|
2322 /* After the end; gets shoved backward */
|
|
2323 if (buf->text->mule_charbpos_cache[i] > end)
|
|
2324 {
|
|
2325 buf->text->mule_charbpos_cache[i] -= end - start;
|
826
|
2326 buf->text->mule_bytebpos_cache[i] -= byte_end - byte_start;
|
771
|
2327 }
|
|
2328 /* In the range; moves to start of range */
|
|
2329 else if (buf->text->mule_charbpos_cache[i] > start)
|
|
2330 {
|
|
2331 buf->text->mule_charbpos_cache[i] = start;
|
826
|
2332 buf->text->mule_bytebpos_cache[i] = byte_start;
|
771
|
2333 }
|
|
2334 }
|
|
2335
|
|
2336 /* We don't care about any text after the end of the known region. */
|
|
2337
|
|
2338 end = min (end, buf->text->mule_bufmax);
|
826
|
2339 byte_end = min (byte_end, buf->text->mule_bytmax);
|
771
|
2340 if (start >= end)
|
826
|
2341 return;
|
771
|
2342
|
|
2343 /* The end of the known region offsets by the total amount of deletion,
|
|
2344 since it's all before it. */
|
|
2345
|
|
2346 buf->text->mule_bufmax -= end - start;
|
826
|
2347 buf->text->mule_bytmax -= byte_end - byte_start;
|
771
|
2348
|
|
2349 /* Now we don't care about any text after the start of the known region. */
|
|
2350
|
|
2351 end = min (end, buf->text->mule_bufmin);
|
826
|
2352 byte_end = min (byte_end, buf->text->mule_bytmin);
|
771
|
2353 if (start < end)
|
|
2354 {
|
|
2355 buf->text->mule_bufmin -= end - start;
|
826
|
2356 buf->text->mule_bytmin -= byte_end - byte_start;
|
771
|
2357 }
|
|
2358 }
|
|
2359
|
|
2360 #endif /* MULE */
|
|
2361
|
|
2362
|
|
2363 /************************************************************************/
|
|
2364 /* verifying buffer and string positions */
|
|
2365 /************************************************************************/
|
|
2366
|
|
2367 /* Functions below are tagged with either _byte or _char indicating
|
|
2368 whether they return byte or character positions. For a buffer,
|
|
2369 a character position is a "Charbpos" and a byte position is a "Bytebpos".
|
|
2370 For strings, these are sometimes typed using "Charcount" and
|
|
2371 "Bytecount". */
|
|
2372
|
|
2373 /* Flags for the functions below are:
|
|
2374
|
|
2375 GB_ALLOW_PAST_ACCESSIBLE
|
|
2376
|
|
2377 Allow positions to range over the entire buffer (BUF_BEG to BUF_Z),
|
|
2378 rather than just the accessible portion (BUF_BEGV to BUF_ZV).
|
|
2379 For strings, this flag has no effect.
|
|
2380
|
|
2381 GB_COERCE_RANGE
|
|
2382
|
|
2383 If the position is outside the allowable range, return the lower
|
|
2384 or upper bound of the range, whichever is closer to the specified
|
|
2385 position.
|
|
2386
|
|
2387 GB_NO_ERROR_IF_BAD
|
|
2388
|
|
2389 If the position is outside the allowable range, return -1.
|
|
2390
|
|
2391 GB_NEGATIVE_FROM_END
|
|
2392
|
|
2393 If a value is negative, treat it as an offset from the end.
|
|
2394 Only applies to strings.
|
|
2395
|
|
2396 The following additional flags apply only to the functions
|
|
2397 that return ranges:
|
|
2398
|
|
2399 GB_ALLOW_NIL
|
|
2400
|
|
2401 Either or both positions can be nil. If FROM is nil,
|
|
2402 FROM_OUT will contain the lower bound of the allowed range.
|
|
2403 If TO is nil, TO_OUT will contain the upper bound of the
|
|
2404 allowed range.
|
|
2405
|
|
2406 GB_CHECK_ORDER
|
|
2407
|
|
2408 FROM must contain the lower bound and TO the upper bound
|
|
2409 of the range. If the positions are reversed, an error is
|
|
2410 signalled.
|
|
2411
|
|
2412 The following is a combination flag:
|
|
2413
|
|
2414 GB_HISTORICAL_STRING_BEHAVIOR
|
|
2415
|
|
2416 Equivalent to (GB_NEGATIVE_FROM_END | GB_ALLOW_NIL).
|
|
2417 */
|
|
2418
|
|
2419 /* Return a buffer position stored in a Lisp_Object. Full
|
|
2420 error-checking is done on the position. Flags can be specified to
|
|
2421 control the behavior of out-of-range values. The default behavior
|
|
2422 is to require that the position is within the accessible part of
|
|
2423 the buffer (BEGV and ZV), and to signal an error if the position is
|
|
2424 out of range.
|
|
2425
|
|
2426 */
|
|
2427
|
|
2428 Charbpos
|
|
2429 get_buffer_pos_char (struct buffer *b, Lisp_Object pos, unsigned int flags)
|
|
2430 {
|
|
2431 /* Does not GC */
|
|
2432 Charbpos ind;
|
|
2433 Charbpos min_allowed, max_allowed;
|
|
2434
|
|
2435 CHECK_INT_COERCE_MARKER (pos);
|
|
2436 ind = XINT (pos);
|
|
2437 min_allowed = flags & GB_ALLOW_PAST_ACCESSIBLE ? BUF_BEG (b) : BUF_BEGV (b);
|
|
2438 max_allowed = flags & GB_ALLOW_PAST_ACCESSIBLE ? BUF_Z (b) : BUF_ZV (b);
|
|
2439
|
|
2440 if (ind < min_allowed || ind > max_allowed)
|
|
2441 {
|
|
2442 if (flags & GB_COERCE_RANGE)
|
|
2443 ind = ind < min_allowed ? min_allowed : max_allowed;
|
|
2444 else if (flags & GB_NO_ERROR_IF_BAD)
|
|
2445 ind = -1;
|
|
2446 else
|
|
2447 {
|
793
|
2448 Lisp_Object buffer = wrap_buffer (b);
|
|
2449
|
771
|
2450 args_out_of_range (buffer, pos);
|
|
2451 }
|
|
2452 }
|
|
2453
|
|
2454 return ind;
|
|
2455 }
|
|
2456
|
|
2457 Bytebpos
|
|
2458 get_buffer_pos_byte (struct buffer *b, Lisp_Object pos, unsigned int flags)
|
|
2459 {
|
|
2460 Charbpos bpos = get_buffer_pos_char (b, pos, flags);
|
|
2461 if (bpos < 0) /* could happen with GB_NO_ERROR_IF_BAD */
|
|
2462 return -1;
|
|
2463 return charbpos_to_bytebpos (b, bpos);
|
|
2464 }
|
|
2465
|
|
2466 /* Return a pair of buffer positions representing a range of text,
|
|
2467 taken from a pair of Lisp_Objects. Full error-checking is
|
|
2468 done on the positions. Flags can be specified to control the
|
|
2469 behavior of out-of-range values. The default behavior is to
|
|
2470 allow the range bounds to be specified in either order
|
|
2471 (however, FROM_OUT will always be the lower bound of the range
|
|
2472 and TO_OUT the upper bound),to require that the positions
|
|
2473 are within the accessible part of the buffer (BEGV and ZV),
|
|
2474 and to signal an error if the positions are out of range.
|
|
2475 */
|
|
2476
|
|
2477 void
|
|
2478 get_buffer_range_char (struct buffer *b, Lisp_Object from, Lisp_Object to,
|
826
|
2479 Charbpos *from_out, Charbpos *to_out,
|
|
2480 unsigned int flags)
|
771
|
2481 {
|
|
2482 /* Does not GC */
|
|
2483 Charbpos min_allowed, max_allowed;
|
|
2484
|
|
2485 min_allowed = (flags & GB_ALLOW_PAST_ACCESSIBLE) ?
|
|
2486 BUF_BEG (b) : BUF_BEGV (b);
|
|
2487 max_allowed = (flags & GB_ALLOW_PAST_ACCESSIBLE) ?
|
|
2488 BUF_Z (b) : BUF_ZV (b);
|
|
2489
|
|
2490 if (NILP (from) && (flags & GB_ALLOW_NIL))
|
|
2491 *from_out = min_allowed;
|
|
2492 else
|
|
2493 *from_out = get_buffer_pos_char (b, from, flags | GB_NO_ERROR_IF_BAD);
|
|
2494
|
|
2495 if (NILP (to) && (flags & GB_ALLOW_NIL))
|
|
2496 *to_out = max_allowed;
|
|
2497 else
|
|
2498 *to_out = get_buffer_pos_char (b, to, flags | GB_NO_ERROR_IF_BAD);
|
|
2499
|
|
2500 if ((*from_out < 0 || *to_out < 0) && !(flags & GB_NO_ERROR_IF_BAD))
|
|
2501 {
|
793
|
2502 Lisp_Object buffer = wrap_buffer (b);
|
|
2503
|
771
|
2504 args_out_of_range_3 (buffer, from, to);
|
|
2505 }
|
|
2506
|
|
2507 if (*from_out >= 0 && *to_out >= 0 && *from_out > *to_out)
|
|
2508 {
|
|
2509 if (flags & GB_CHECK_ORDER)
|
|
2510 invalid_argument_2 ("start greater than end", from, to);
|
|
2511 else
|
|
2512 {
|
|
2513 Charbpos temp = *from_out;
|
|
2514 *from_out = *to_out;
|
|
2515 *to_out = temp;
|
|
2516 }
|
|
2517 }
|
|
2518 }
|
|
2519
|
|
2520 void
|
|
2521 get_buffer_range_byte (struct buffer *b, Lisp_Object from, Lisp_Object to,
|
826
|
2522 Bytebpos *from_out, Bytebpos *to_out,
|
|
2523 unsigned int flags)
|
771
|
2524 {
|
|
2525 Charbpos s, e;
|
|
2526
|
|
2527 get_buffer_range_char (b, from, to, &s, &e, flags);
|
|
2528 if (s >= 0)
|
|
2529 *from_out = charbpos_to_bytebpos (b, s);
|
|
2530 else /* could happen with GB_NO_ERROR_IF_BAD */
|
|
2531 *from_out = -1;
|
|
2532 if (e >= 0)
|
|
2533 *to_out = charbpos_to_bytebpos (b, e);
|
|
2534 else
|
|
2535 *to_out = -1;
|
|
2536 }
|
|
2537
|
|
2538 static Charcount
|
|
2539 get_string_pos_char_1 (Lisp_Object string, Lisp_Object pos, unsigned int flags,
|
|
2540 Charcount known_length)
|
|
2541 {
|
|
2542 Charcount ccpos;
|
|
2543 Charcount min_allowed = 0;
|
|
2544 Charcount max_allowed = known_length;
|
|
2545
|
|
2546 /* Computation of KNOWN_LENGTH is potentially expensive so we pass
|
|
2547 it in. */
|
|
2548 CHECK_INT (pos);
|
|
2549 ccpos = XINT (pos);
|
|
2550 if (ccpos < 0 && flags & GB_NEGATIVE_FROM_END)
|
|
2551 ccpos += max_allowed;
|
|
2552
|
|
2553 if (ccpos < min_allowed || ccpos > max_allowed)
|
|
2554 {
|
|
2555 if (flags & GB_COERCE_RANGE)
|
|
2556 ccpos = ccpos < min_allowed ? min_allowed : max_allowed;
|
|
2557 else if (flags & GB_NO_ERROR_IF_BAD)
|
|
2558 ccpos = -1;
|
|
2559 else
|
|
2560 args_out_of_range (string, pos);
|
|
2561 }
|
|
2562
|
|
2563 return ccpos;
|
|
2564 }
|
|
2565
|
|
2566 Charcount
|
|
2567 get_string_pos_char (Lisp_Object string, Lisp_Object pos, unsigned int flags)
|
|
2568 {
|
|
2569 return get_string_pos_char_1 (string, pos, flags,
|
826
|
2570 string_char_length (string));
|
771
|
2571 }
|
|
2572
|
|
2573 Bytecount
|
|
2574 get_string_pos_byte (Lisp_Object string, Lisp_Object pos, unsigned int flags)
|
|
2575 {
|
|
2576 Charcount ccpos = get_string_pos_char (string, pos, flags);
|
|
2577 if (ccpos < 0) /* could happen with GB_NO_ERROR_IF_BAD */
|
|
2578 return -1;
|
793
|
2579 return string_index_char_to_byte (string, ccpos);
|
771
|
2580 }
|
|
2581
|
|
2582 void
|
|
2583 get_string_range_char (Lisp_Object string, Lisp_Object from, Lisp_Object to,
|
|
2584 Charcount *from_out, Charcount *to_out,
|
|
2585 unsigned int flags)
|
|
2586 {
|
|
2587 Charcount min_allowed = 0;
|
826
|
2588 Charcount max_allowed = string_char_length (string);
|
771
|
2589
|
|
2590 if (NILP (from) && (flags & GB_ALLOW_NIL))
|
|
2591 *from_out = min_allowed;
|
|
2592 else
|
|
2593 *from_out = get_string_pos_char_1 (string, from,
|
|
2594 flags | GB_NO_ERROR_IF_BAD,
|
|
2595 max_allowed);
|
|
2596
|
|
2597 if (NILP (to) && (flags & GB_ALLOW_NIL))
|
|
2598 *to_out = max_allowed;
|
|
2599 else
|
|
2600 *to_out = get_string_pos_char_1 (string, to,
|
|
2601 flags | GB_NO_ERROR_IF_BAD,
|
|
2602 max_allowed);
|
|
2603
|
|
2604 if ((*from_out < 0 || *to_out < 0) && !(flags & GB_NO_ERROR_IF_BAD))
|
|
2605 args_out_of_range_3 (string, from, to);
|
|
2606
|
|
2607 if (*from_out >= 0 && *to_out >= 0 && *from_out > *to_out)
|
|
2608 {
|
|
2609 if (flags & GB_CHECK_ORDER)
|
|
2610 invalid_argument_2 ("start greater than end", from, to);
|
|
2611 else
|
|
2612 {
|
|
2613 Charbpos temp = *from_out;
|
|
2614 *from_out = *to_out;
|
|
2615 *to_out = temp;
|
|
2616 }
|
|
2617 }
|
|
2618 }
|
|
2619
|
|
2620 void
|
|
2621 get_string_range_byte (Lisp_Object string, Lisp_Object from, Lisp_Object to,
|
|
2622 Bytecount *from_out, Bytecount *to_out,
|
|
2623 unsigned int flags)
|
|
2624 {
|
|
2625 Charcount s, e;
|
|
2626
|
|
2627 get_string_range_char (string, from, to, &s, &e, flags);
|
|
2628 if (s >= 0)
|
793
|
2629 *from_out = string_index_char_to_byte (string, s);
|
771
|
2630 else /* could happen with GB_NO_ERROR_IF_BAD */
|
|
2631 *from_out = -1;
|
|
2632 if (e >= 0)
|
793
|
2633 *to_out = string_index_char_to_byte (string, e);
|
771
|
2634 else
|
|
2635 *to_out = -1;
|
|
2636
|
|
2637 }
|
|
2638
|
826
|
2639 Charxpos
|
771
|
2640 get_buffer_or_string_pos_char (Lisp_Object object, Lisp_Object pos,
|
|
2641 unsigned int flags)
|
|
2642 {
|
|
2643 return STRINGP (object) ?
|
|
2644 get_string_pos_char (object, pos, flags) :
|
|
2645 get_buffer_pos_char (XBUFFER (object), pos, flags);
|
|
2646 }
|
|
2647
|
826
|
2648 Bytexpos
|
771
|
2649 get_buffer_or_string_pos_byte (Lisp_Object object, Lisp_Object pos,
|
|
2650 unsigned int flags)
|
|
2651 {
|
|
2652 return STRINGP (object) ?
|
|
2653 get_string_pos_byte (object, pos, flags) :
|
|
2654 get_buffer_pos_byte (XBUFFER (object), pos, flags);
|
|
2655 }
|
|
2656
|
|
2657 void
|
|
2658 get_buffer_or_string_range_char (Lisp_Object object, Lisp_Object from,
|
826
|
2659 Lisp_Object to, Charxpos *from_out,
|
|
2660 Charxpos *to_out, unsigned int flags)
|
771
|
2661 {
|
|
2662 if (STRINGP (object))
|
|
2663 get_string_range_char (object, from, to, from_out, to_out, flags);
|
|
2664 else
|
826
|
2665 get_buffer_range_char (XBUFFER (object), from, to, from_out, to_out,
|
|
2666 flags);
|
771
|
2667 }
|
|
2668
|
|
2669 void
|
|
2670 get_buffer_or_string_range_byte (Lisp_Object object, Lisp_Object from,
|
826
|
2671 Lisp_Object to, Bytexpos *from_out,
|
|
2672 Bytexpos *to_out, unsigned int flags)
|
771
|
2673 {
|
|
2674 if (STRINGP (object))
|
|
2675 get_string_range_byte (object, from, to, from_out, to_out, flags);
|
|
2676 else
|
826
|
2677 get_buffer_range_byte (XBUFFER (object), from, to, from_out, to_out,
|
|
2678 flags);
|
771
|
2679 }
|
|
2680
|
826
|
2681 Charxpos
|
771
|
2682 buffer_or_string_accessible_begin_char (Lisp_Object object)
|
|
2683 {
|
|
2684 return STRINGP (object) ? 0 : BUF_BEGV (XBUFFER (object));
|
|
2685 }
|
|
2686
|
826
|
2687 Charxpos
|
771
|
2688 buffer_or_string_accessible_end_char (Lisp_Object object)
|
|
2689 {
|
|
2690 return STRINGP (object) ?
|
826
|
2691 string_char_length (object) : BUF_ZV (XBUFFER (object));
|
771
|
2692 }
|
|
2693
|
826
|
2694 Bytexpos
|
771
|
2695 buffer_or_string_accessible_begin_byte (Lisp_Object object)
|
|
2696 {
|
826
|
2697 return STRINGP (object) ? 0 : BYTE_BUF_BEGV (XBUFFER (object));
|
771
|
2698 }
|
|
2699
|
826
|
2700 Bytexpos
|
771
|
2701 buffer_or_string_accessible_end_byte (Lisp_Object object)
|
|
2702 {
|
|
2703 return STRINGP (object) ?
|
826
|
2704 XSTRING_LENGTH (object) : BYTE_BUF_ZV (XBUFFER (object));
|
771
|
2705 }
|
|
2706
|
826
|
2707 Charxpos
|
771
|
2708 buffer_or_string_absolute_begin_char (Lisp_Object object)
|
|
2709 {
|
|
2710 return STRINGP (object) ? 0 : BUF_BEG (XBUFFER (object));
|
|
2711 }
|
|
2712
|
826
|
2713 Charxpos
|
771
|
2714 buffer_or_string_absolute_end_char (Lisp_Object object)
|
|
2715 {
|
|
2716 return STRINGP (object) ?
|
826
|
2717 string_char_length (object) : BUF_Z (XBUFFER (object));
|
|
2718 }
|
|
2719
|
|
2720 Bytexpos
|
|
2721 buffer_or_string_absolute_begin_byte (Lisp_Object object)
|
|
2722 {
|
|
2723 return STRINGP (object) ? 0 : BYTE_BUF_BEG (XBUFFER (object));
|
|
2724 }
|
|
2725
|
|
2726 Bytexpos
|
|
2727 buffer_or_string_absolute_end_byte (Lisp_Object object)
|
|
2728 {
|
|
2729 return STRINGP (object) ?
|
|
2730 XSTRING_LENGTH (object) : BYTE_BUF_Z (XBUFFER (object));
|
|
2731 }
|
|
2732
|
|
2733 Charbpos
|
|
2734 charbpos_clip_to_bounds (Charbpos lower, Charbpos num, Charbpos upper)
|
|
2735 {
|
|
2736 return (num < lower ? lower :
|
|
2737 num > upper ? upper :
|
|
2738 num);
|
771
|
2739 }
|
|
2740
|
|
2741 Bytebpos
|
826
|
2742 bytebpos_clip_to_bounds (Bytebpos lower, Bytebpos num, Bytebpos upper)
|
|
2743 {
|
|
2744 return (num < lower ? lower :
|
|
2745 num > upper ? upper :
|
|
2746 num);
|
|
2747 }
|
|
2748
|
|
2749 Charxpos
|
|
2750 charxpos_clip_to_bounds (Charxpos lower, Charxpos num, Charxpos upper)
|
771
|
2751 {
|
826
|
2752 return (num < lower ? lower :
|
|
2753 num > upper ? upper :
|
|
2754 num);
|
|
2755 }
|
|
2756
|
|
2757 Bytexpos
|
|
2758 bytexpos_clip_to_bounds (Bytexpos lower, Bytexpos num, Bytexpos upper)
|
|
2759 {
|
|
2760 return (num < lower ? lower :
|
|
2761 num > upper ? upper :
|
|
2762 num);
|
771
|
2763 }
|
|
2764
|
826
|
2765 /* These could be implemented in terms of the get_buffer_or_string()
|
|
2766 functions above, but those are complicated and handle lots of weird
|
|
2767 cases stemming from uncertain external input. */
|
|
2768
|
|
2769 Charxpos
|
|
2770 buffer_or_string_clip_to_accessible_char (Lisp_Object object, Charxpos pos)
|
|
2771 {
|
|
2772 return (charxpos_clip_to_bounds
|
|
2773 (pos, buffer_or_string_accessible_begin_char (object),
|
|
2774 buffer_or_string_accessible_end_char (object)));
|
|
2775 }
|
|
2776
|
|
2777 Bytexpos
|
|
2778 buffer_or_string_clip_to_accessible_byte (Lisp_Object object, Bytexpos pos)
|
771
|
2779 {
|
826
|
2780 return (bytexpos_clip_to_bounds
|
|
2781 (pos, buffer_or_string_accessible_begin_byte (object),
|
|
2782 buffer_or_string_accessible_end_byte (object)));
|
|
2783 }
|
|
2784
|
|
2785 Charxpos
|
|
2786 buffer_or_string_clip_to_absolute_char (Lisp_Object object, Charxpos pos)
|
|
2787 {
|
|
2788 return (charxpos_clip_to_bounds
|
|
2789 (pos, buffer_or_string_absolute_begin_char (object),
|
|
2790 buffer_or_string_absolute_end_char (object)));
|
|
2791 }
|
|
2792
|
|
2793 Bytexpos
|
|
2794 buffer_or_string_clip_to_absolute_byte (Lisp_Object object, Bytexpos pos)
|
|
2795 {
|
|
2796 return (bytexpos_clip_to_bounds
|
|
2797 (pos, buffer_or_string_absolute_begin_byte (object),
|
|
2798 buffer_or_string_absolute_end_byte (object)));
|
771
|
2799 }
|
|
2800
|
|
2801
|
|
2802 /************************************************************************/
|
|
2803 /* Implement TO_EXTERNAL_FORMAT, TO_INTERNAL_FORMAT */
|
|
2804 /************************************************************************/
|
|
2805
|
|
2806 typedef struct
|
|
2807 {
|
867
|
2808 Dynarr_declare (Ibyte_dynarr *);
|
|
2809 } Ibyte_dynarr_dynarr;
|
771
|
2810
|
|
2811 typedef struct
|
|
2812 {
|
|
2813 Dynarr_declare (Extbyte_dynarr *);
|
|
2814 } Extbyte_dynarr_dynarr;
|
|
2815
|
|
2816 static Extbyte_dynarr_dynarr *conversion_out_dynarr_list;
|
867
|
2817 static Ibyte_dynarr_dynarr *conversion_in_dynarr_list;
|
771
|
2818
|
|
2819 static int dfc_convert_to_external_format_in_use;
|
|
2820 static int dfc_convert_to_internal_format_in_use;
|
|
2821
|
|
2822 void
|
|
2823 dfc_convert_to_external_format (dfc_conversion_type source_type,
|
|
2824 dfc_conversion_data *source,
|
|
2825 Lisp_Object coding_system,
|
|
2826 dfc_conversion_type sink_type,
|
|
2827 dfc_conversion_data *sink)
|
|
2828 {
|
|
2829 /* It's guaranteed that many callers are not prepared for GC here,
|
|
2830 esp. given that this code conversion occurs in many very hidden
|
|
2831 places. */
|
1292
|
2832 int count;
|
771
|
2833 Extbyte_dynarr *conversion_out_dynarr;
|
1292
|
2834 PROFILE_DECLARE ();
|
|
2835
|
|
2836 PROFILE_RECORD_ENTERING_SECTION (QSin_internal_external_conversion);
|
|
2837
|
|
2838 count = begin_gc_forbidden ();
|
771
|
2839
|
|
2840 type_checking_assert
|
|
2841 (((source_type == DFC_TYPE_DATA) ||
|
|
2842 (source_type == DFC_TYPE_LISP_LSTREAM && LSTREAMP (source->lisp_object)) ||
|
|
2843 (source_type == DFC_TYPE_LISP_STRING && STRINGP (source->lisp_object)))
|
|
2844 &&
|
|
2845 ((sink_type == DFC_TYPE_DATA) ||
|
|
2846 (sink_type == DFC_TYPE_LISP_LSTREAM && LSTREAMP (source->lisp_object))));
|
|
2847
|
|
2848 if (Dynarr_length (conversion_out_dynarr_list) <=
|
|
2849 dfc_convert_to_external_format_in_use)
|
|
2850 Dynarr_add (conversion_out_dynarr_list, Dynarr_new (Extbyte));
|
|
2851 conversion_out_dynarr = Dynarr_at (conversion_out_dynarr_list,
|
|
2852 dfc_convert_to_external_format_in_use);
|
|
2853 Dynarr_reset (conversion_out_dynarr);
|
|
2854
|
853
|
2855 internal_bind_int (&dfc_convert_to_external_format_in_use,
|
|
2856 dfc_convert_to_external_format_in_use + 1);
|
|
2857
|
771
|
2858 coding_system = get_coding_system_for_text_file (coding_system, 0);
|
|
2859
|
|
2860 /* Here we optimize in the case where the coding system does no
|
|
2861 conversion. However, we don't want to optimize in case the source
|
|
2862 or sink is an lstream, since writing to an lstream can cause a
|
|
2863 garbage collection, and this could be problematic if the source
|
|
2864 is a lisp string. */
|
|
2865 if (source_type != DFC_TYPE_LISP_LSTREAM &&
|
|
2866 sink_type != DFC_TYPE_LISP_LSTREAM &&
|
|
2867 coding_system_is_binary (coding_system))
|
|
2868 {
|
867
|
2869 const Ibyte *ptr;
|
771
|
2870 Bytecount len;
|
|
2871
|
|
2872 if (source_type == DFC_TYPE_LISP_STRING)
|
|
2873 {
|
|
2874 ptr = XSTRING_DATA (source->lisp_object);
|
|
2875 len = XSTRING_LENGTH (source->lisp_object);
|
|
2876 }
|
|
2877 else
|
|
2878 {
|
867
|
2879 ptr = (Ibyte *) source->data.ptr;
|
771
|
2880 len = source->data.len;
|
|
2881 }
|
|
2882
|
|
2883 #ifdef MULE
|
|
2884 {
|
867
|
2885 const Ibyte *end;
|
771
|
2886 for (end = ptr + len; ptr < end;)
|
|
2887 {
|
867
|
2888 Ibyte c =
|
826
|
2889 (byte_ascii_p (*ptr)) ? *ptr :
|
771
|
2890 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
|
|
2891 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
|
|
2892 '~';
|
|
2893
|
|
2894 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
|
867
|
2895 INC_IBYTEPTR (ptr);
|
771
|
2896 }
|
800
|
2897 text_checking_assert (ptr == end);
|
771
|
2898 }
|
|
2899 #else
|
|
2900 Dynarr_add_many (conversion_out_dynarr, ptr, len);
|
|
2901 #endif
|
|
2902
|
|
2903 }
|
1315
|
2904 #ifdef WIN32_ANY
|
771
|
2905 /* Optimize the common case involving Unicode where only ASCII is involved */
|
|
2906 else if (source_type != DFC_TYPE_LISP_LSTREAM &&
|
|
2907 sink_type != DFC_TYPE_LISP_LSTREAM &&
|
|
2908 dfc_coding_system_is_unicode (coding_system))
|
|
2909 {
|
867
|
2910 const Ibyte *ptr, *p;
|
771
|
2911 Bytecount len;
|
867
|
2912 const Ibyte *end;
|
771
|
2913
|
|
2914 if (source_type == DFC_TYPE_LISP_STRING)
|
|
2915 {
|
|
2916 ptr = XSTRING_DATA (source->lisp_object);
|
|
2917 len = XSTRING_LENGTH (source->lisp_object);
|
|
2918 }
|
|
2919 else
|
|
2920 {
|
867
|
2921 ptr = (Ibyte *) source->data.ptr;
|
771
|
2922 len = source->data.len;
|
|
2923 }
|
|
2924 end = ptr + len;
|
|
2925
|
|
2926 for (p = ptr; p < end; p++)
|
|
2927 {
|
826
|
2928 if (!byte_ascii_p (*p))
|
771
|
2929 goto the_hard_way;
|
|
2930 }
|
|
2931
|
|
2932 for (p = ptr; p < end; p++)
|
|
2933 {
|
|
2934 Dynarr_add (conversion_out_dynarr, (Extbyte) (*p));
|
|
2935 Dynarr_add (conversion_out_dynarr, (Extbyte) '\0');
|
|
2936 }
|
|
2937 }
|
1315
|
2938 #endif /* WIN32_ANY */
|
771
|
2939 else
|
|
2940 {
|
|
2941 Lisp_Object streams_to_delete[3];
|
|
2942 int delete_count;
|
|
2943 Lisp_Object instream, outstream;
|
|
2944 Lstream *reader, *writer;
|
|
2945
|
1315
|
2946 #ifdef WIN32_ANY
|
771
|
2947 the_hard_way:
|
1315
|
2948 #endif /* WIN32_ANY */
|
771
|
2949 delete_count = 0;
|
|
2950 if (source_type == DFC_TYPE_LISP_LSTREAM)
|
|
2951 instream = source->lisp_object;
|
|
2952 else if (source_type == DFC_TYPE_DATA)
|
|
2953 streams_to_delete[delete_count++] = instream =
|
|
2954 make_fixed_buffer_input_stream (source->data.ptr, source->data.len);
|
|
2955 else
|
|
2956 {
|
|
2957 type_checking_assert (source_type == DFC_TYPE_LISP_STRING);
|
|
2958 streams_to_delete[delete_count++] = instream =
|
|
2959 /* This will GCPRO the Lisp string */
|
|
2960 make_lisp_string_input_stream (source->lisp_object, 0, -1);
|
|
2961 }
|
|
2962
|
|
2963 if (sink_type == DFC_TYPE_LISP_LSTREAM)
|
|
2964 outstream = sink->lisp_object;
|
|
2965 else
|
|
2966 {
|
|
2967 type_checking_assert (sink_type == DFC_TYPE_DATA);
|
|
2968 streams_to_delete[delete_count++] = outstream =
|
|
2969 make_dynarr_output_stream
|
|
2970 ((unsigned_char_dynarr *) conversion_out_dynarr);
|
|
2971 }
|
|
2972
|
|
2973 streams_to_delete[delete_count++] = outstream =
|
800
|
2974 make_coding_output_stream (XLSTREAM (outstream), coding_system,
|
|
2975 CODING_ENCODE, 0);
|
771
|
2976
|
|
2977 reader = XLSTREAM (instream);
|
|
2978 writer = XLSTREAM (outstream);
|
|
2979 /* decoding_stream will gc-protect outstream */
|
1204
|
2980 {
|
|
2981 struct gcpro gcpro1, gcpro2;
|
|
2982 GCPRO2 (instream, outstream);
|
|
2983
|
|
2984 while (1)
|
|
2985 {
|
|
2986 Bytecount size_in_bytes;
|
|
2987 char tempbuf[1024]; /* some random amount */
|
|
2988
|
|
2989 size_in_bytes = Lstream_read (reader, tempbuf, sizeof (tempbuf));
|
|
2990
|
|
2991 if (size_in_bytes == 0)
|
|
2992 break;
|
|
2993 else if (size_in_bytes < 0)
|
|
2994 signal_error (Qtext_conversion_error,
|
|
2995 "Error converting to external format", Qunbound);
|
|
2996
|
|
2997 if (Lstream_write (writer, tempbuf, size_in_bytes) < 0)
|
|
2998 signal_error (Qtext_conversion_error,
|
|
2999 "Error converting to external format", Qunbound);
|
|
3000 }
|
|
3001
|
|
3002 /* Closing writer will close any stream at the other end of writer. */
|
|
3003 Lstream_close (writer);
|
|
3004 Lstream_close (reader);
|
|
3005 UNGCPRO;
|
|
3006 }
|
771
|
3007
|
|
3008 /* The idea is that this function will create no garbage. */
|
|
3009 while (delete_count)
|
|
3010 Lstream_delete (XLSTREAM (streams_to_delete [--delete_count]));
|
|
3011 }
|
|
3012
|
|
3013 unbind_to (count);
|
|
3014
|
|
3015 if (sink_type != DFC_TYPE_LISP_LSTREAM)
|
|
3016 {
|
|
3017 sink->data.len = Dynarr_length (conversion_out_dynarr);
|
|
3018 /* double zero-extend because we may be dealing with Unicode data */
|
|
3019 Dynarr_add (conversion_out_dynarr, '\0');
|
|
3020 Dynarr_add (conversion_out_dynarr, '\0');
|
|
3021 sink->data.ptr = Dynarr_atp (conversion_out_dynarr, 0);
|
|
3022 }
|
1292
|
3023
|
|
3024 PROFILE_RECORD_EXITING_SECTION (QSin_internal_external_conversion);
|
771
|
3025 }
|
|
3026
|
|
3027 void
|
|
3028 dfc_convert_to_internal_format (dfc_conversion_type source_type,
|
|
3029 dfc_conversion_data *source,
|
|
3030 Lisp_Object coding_system,
|
|
3031 dfc_conversion_type sink_type,
|
|
3032 dfc_conversion_data *sink)
|
|
3033 {
|
|
3034 /* It's guaranteed that many callers are not prepared for GC here,
|
|
3035 esp. given that this code conversion occurs in many very hidden
|
|
3036 places. */
|
1292
|
3037 int count;
|
867
|
3038 Ibyte_dynarr *conversion_in_dynarr;
|
1292
|
3039 PROFILE_DECLARE ();
|
|
3040
|
|
3041 PROFILE_RECORD_ENTERING_SECTION (QSin_internal_external_conversion);
|
|
3042
|
|
3043 count = begin_gc_forbidden ();
|
771
|
3044
|
|
3045 type_checking_assert
|
|
3046 ((source_type == DFC_TYPE_DATA ||
|
|
3047 source_type == DFC_TYPE_LISP_LSTREAM)
|
|
3048 &&
|
|
3049 (sink_type == DFC_TYPE_DATA ||
|
|
3050 sink_type == DFC_TYPE_LISP_LSTREAM));
|
|
3051
|
|
3052 if (Dynarr_length (conversion_in_dynarr_list) <=
|
|
3053 dfc_convert_to_internal_format_in_use)
|
867
|
3054 Dynarr_add (conversion_in_dynarr_list, Dynarr_new (Ibyte));
|
771
|
3055 conversion_in_dynarr = Dynarr_at (conversion_in_dynarr_list,
|
|
3056 dfc_convert_to_internal_format_in_use);
|
|
3057 Dynarr_reset (conversion_in_dynarr);
|
|
3058
|
853
|
3059 internal_bind_int (&dfc_convert_to_internal_format_in_use,
|
|
3060 dfc_convert_to_internal_format_in_use + 1);
|
|
3061
|
771
|
3062 coding_system = get_coding_system_for_text_file (coding_system, 1);
|
|
3063
|
|
3064 if (source_type != DFC_TYPE_LISP_LSTREAM &&
|
|
3065 sink_type != DFC_TYPE_LISP_LSTREAM &&
|
|
3066 coding_system_is_binary (coding_system))
|
|
3067 {
|
|
3068 #ifdef MULE
|
867
|
3069 const Ibyte *ptr = (const Ibyte *) source->data.ptr;
|
771
|
3070 Bytecount len = source->data.len;
|
867
|
3071 const Ibyte *end = ptr + len;
|
771
|
3072
|
|
3073 for (; ptr < end; ptr++)
|
|
3074 {
|
867
|
3075 Ibyte c = *ptr;
|
771
|
3076
|
826
|
3077 if (byte_ascii_p (c))
|
771
|
3078 Dynarr_add (conversion_in_dynarr, c);
|
826
|
3079 else if (byte_c1_p (c))
|
771
|
3080 {
|
|
3081 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_CONTROL_1);
|
|
3082 Dynarr_add (conversion_in_dynarr, c + 0x20);
|
|
3083 }
|
|
3084 else
|
|
3085 {
|
|
3086 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_LATIN_ISO8859_1);
|
|
3087 Dynarr_add (conversion_in_dynarr, c);
|
|
3088 }
|
|
3089 }
|
|
3090 #else
|
|
3091 Dynarr_add_many (conversion_in_dynarr, source->data.ptr, source->data.len);
|
|
3092 #endif
|
|
3093 }
|
1315
|
3094 #ifdef WIN32_ANY
|
1292
|
3095 /* Optimize the common case involving Unicode where only ASCII/Latin-1 is
|
|
3096 involved */
|
771
|
3097 else if (source_type != DFC_TYPE_LISP_LSTREAM &&
|
|
3098 sink_type != DFC_TYPE_LISP_LSTREAM &&
|
|
3099 dfc_coding_system_is_unicode (coding_system))
|
|
3100 {
|
867
|
3101 const Ibyte *ptr = (const Ibyte *) source->data.ptr + 1;
|
771
|
3102 Bytecount len = source->data.len;
|
867
|
3103 const Ibyte *end = ptr + len;
|
771
|
3104
|
|
3105 if (len & 1)
|
|
3106 goto the_hard_way;
|
|
3107
|
|
3108 for (; ptr < end; ptr += 2)
|
|
3109 {
|
|
3110 if (*ptr)
|
|
3111 goto the_hard_way;
|
|
3112 }
|
|
3113
|
867
|
3114 ptr = (const Ibyte *) source->data.ptr;
|
771
|
3115 end = ptr + len;
|
|
3116
|
|
3117 for (; ptr < end; ptr += 2)
|
|
3118 {
|
867
|
3119 Ibyte c = *ptr;
|
771
|
3120
|
826
|
3121 if (byte_ascii_p (c))
|
771
|
3122 Dynarr_add (conversion_in_dynarr, c);
|
|
3123 #ifdef MULE
|
826
|
3124 else if (byte_c1_p (c))
|
771
|
3125 {
|
|
3126 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_CONTROL_1);
|
|
3127 Dynarr_add (conversion_in_dynarr, c + 0x20);
|
|
3128 }
|
|
3129 else
|
|
3130 {
|
|
3131 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_LATIN_ISO8859_1);
|
|
3132 Dynarr_add (conversion_in_dynarr, c);
|
|
3133 }
|
|
3134 #endif /* MULE */
|
|
3135 }
|
|
3136 }
|
1315
|
3137 #endif /* WIN32_ANY */
|
771
|
3138 else
|
|
3139 {
|
|
3140 Lisp_Object streams_to_delete[3];
|
|
3141 int delete_count;
|
|
3142 Lisp_Object instream, outstream;
|
|
3143 Lstream *reader, *writer;
|
|
3144
|
1315
|
3145 #ifdef WIN32_ANY
|
771
|
3146 the_hard_way:
|
1315
|
3147 #endif /* WIN32_ANY */
|
771
|
3148 delete_count = 0;
|
|
3149 if (source_type == DFC_TYPE_LISP_LSTREAM)
|
|
3150 instream = source->lisp_object;
|
|
3151 else
|
|
3152 {
|
|
3153 type_checking_assert (source_type == DFC_TYPE_DATA);
|
|
3154 streams_to_delete[delete_count++] = instream =
|
|
3155 make_fixed_buffer_input_stream (source->data.ptr, source->data.len);
|
|
3156 }
|
|
3157
|
|
3158 if (sink_type == DFC_TYPE_LISP_LSTREAM)
|
|
3159 outstream = sink->lisp_object;
|
|
3160 else
|
|
3161 {
|
|
3162 type_checking_assert (sink_type == DFC_TYPE_DATA);
|
|
3163 streams_to_delete[delete_count++] = outstream =
|
|
3164 make_dynarr_output_stream
|
|
3165 ((unsigned_char_dynarr *) conversion_in_dynarr);
|
|
3166 }
|
|
3167
|
|
3168 streams_to_delete[delete_count++] = outstream =
|
800
|
3169 make_coding_output_stream (XLSTREAM (outstream), coding_system,
|
|
3170 CODING_DECODE, 0);
|
771
|
3171
|
|
3172 reader = XLSTREAM (instream);
|
|
3173 writer = XLSTREAM (outstream);
|
1204
|
3174 {
|
|
3175 struct gcpro gcpro1, gcpro2;
|
|
3176 /* outstream will gc-protect its sink stream, if necessary */
|
|
3177 GCPRO2 (instream, outstream);
|
|
3178
|
|
3179 while (1)
|
|
3180 {
|
|
3181 Bytecount size_in_bytes;
|
|
3182 char tempbuf[1024]; /* some random amount */
|
|
3183
|
|
3184 size_in_bytes = Lstream_read (reader, tempbuf, sizeof (tempbuf));
|
|
3185
|
|
3186 if (size_in_bytes == 0)
|
|
3187 break;
|
|
3188 else if (size_in_bytes < 0)
|
|
3189 signal_error (Qtext_conversion_error,
|
|
3190 "Error converting to internal format", Qunbound);
|
|
3191
|
|
3192 if (Lstream_write (writer, tempbuf, size_in_bytes) < 0)
|
|
3193 signal_error (Qtext_conversion_error,
|
|
3194 "Error converting to internal format", Qunbound);
|
|
3195 }
|
|
3196
|
|
3197 /* Closing writer will close any stream at the other end of writer. */
|
|
3198 Lstream_close (writer);
|
|
3199 Lstream_close (reader);
|
|
3200 UNGCPRO;
|
|
3201 }
|
771
|
3202
|
|
3203 /* The idea is that this function will create no garbage. */
|
|
3204 while (delete_count)
|
|
3205 Lstream_delete (XLSTREAM (streams_to_delete [--delete_count]));
|
|
3206 }
|
|
3207
|
|
3208 unbind_to (count);
|
|
3209
|
|
3210 if (sink_type != DFC_TYPE_LISP_LSTREAM)
|
|
3211 {
|
|
3212 sink->data.len = Dynarr_length (conversion_in_dynarr);
|
|
3213 Dynarr_add (conversion_in_dynarr, '\0'); /* remember to NUL-terminate! */
|
|
3214 /* The macros don't currently distinguish between internal and
|
|
3215 external sinks, and allocate and copy two extra bytes in both
|
|
3216 cases. So we add a second zero, just like for external data
|
|
3217 (in that case, because we may be converting to Unicode). */
|
|
3218 Dynarr_add (conversion_in_dynarr, '\0');
|
|
3219 sink->data.ptr = Dynarr_atp (conversion_in_dynarr, 0);
|
|
3220 }
|
1292
|
3221
|
|
3222 PROFILE_RECORD_EXITING_SECTION (QSin_internal_external_conversion);
|
771
|
3223 }
|
|
3224
|
1318
|
3225 /* ----------------------------------------------------------------------- */
|
|
3226 /* New-style DFC converters (data is returned rather than stored into var) */
|
|
3227 /* ----------------------------------------------------------------------- */
|
|
3228
|
|
3229 /* We handle here the cases where SRC is a Lisp_Object, internal data
|
|
3230 (sized or unsized), or external data (sized or unsized), and return type
|
|
3231 is unsized alloca() or malloc() data. If the return type is a
|
|
3232 Lisp_Object, use build_ext_string() for unsized external data,
|
|
3233 make_ext_string() for sized external data. If the return type needs to
|
|
3234 be sized data, use the *_TO_SIZED_*() macros, and for other more
|
|
3235 complicated cases, use the original TO_*_FORMAT() macros. */
|
|
3236
|
|
3237 static void
|
|
3238 new_dfc_convert_now_damn_it (const void *src, Bytecount src_size,
|
|
3239 enum new_dfc_src_type type,
|
|
3240 void **dst, Bytecount *dst_size,
|
|
3241 Lisp_Object codesys)
|
|
3242 {
|
|
3243 /* #### In the case of alloca(), it would be a bit more efficient, for
|
|
3244 small strings, to use static Dynarr's like are used internally in
|
|
3245 TO_*_FORMAT(), or some other way of avoiding malloc() followed by
|
|
3246 free(). I doubt it really matters, though. */
|
|
3247
|
|
3248 switch (type)
|
|
3249 {
|
|
3250 case DFC_EXTERNAL:
|
|
3251 TO_INTERNAL_FORMAT (C_STRING, src,
|
|
3252 MALLOC, (*dst, *dst_size), codesys);
|
|
3253 break;
|
|
3254
|
|
3255 case DFC_SIZED_EXTERNAL:
|
|
3256 TO_INTERNAL_FORMAT (DATA, (src, src_size),
|
|
3257 MALLOC, (*dst, *dst_size), codesys);
|
|
3258 break;
|
|
3259
|
|
3260 case DFC_INTERNAL:
|
|
3261 TO_EXTERNAL_FORMAT (C_STRING, src,
|
|
3262 MALLOC, (*dst, *dst_size), codesys);
|
|
3263 break;
|
|
3264
|
|
3265 case DFC_SIZED_INTERNAL:
|
|
3266 TO_EXTERNAL_FORMAT (DATA, (src, src_size),
|
|
3267 MALLOC, (*dst, *dst_size), codesys);
|
|
3268 break;
|
|
3269
|
|
3270 case DFC_LISP_STRING:
|
|
3271 TO_EXTERNAL_FORMAT (LISP_STRING, VOID_TO_LISP (src),
|
|
3272 MALLOC, (*dst, *dst_size), codesys);
|
|
3273 break;
|
|
3274
|
|
3275 default:
|
|
3276 abort ();
|
|
3277 }
|
|
3278 }
|
|
3279
|
|
3280 void *
|
|
3281 new_dfc_convert_malloc (const void *src, Bytecount src_size,
|
|
3282 enum new_dfc_src_type type, Lisp_Object codesys)
|
|
3283 {
|
|
3284 void *dst;
|
|
3285 Bytecount dst_size;
|
|
3286
|
|
3287 new_dfc_convert_now_damn_it (src, src_size, type, &dst, &dst_size, codesys);
|
|
3288 return dst;
|
|
3289 }
|
|
3290
|
|
3291 /* For alloca(), things are trickier because the calling function needs to
|
|
3292 allocate. This means that the caller needs to do the following:
|
|
3293
|
|
3294 (a) invoke us to do the conversion, remember the data and return the size.
|
|
3295 (b) alloca() the proper size.
|
|
3296 (c) invoke us again to copy the data.
|
|
3297
|
|
3298 We need to handle the possibility of two or more invocations of the
|
|
3299 converter in the same expression. In such cases it's conceivable that
|
|
3300 the evaluation of the sub-expressions will be overlapping (e.g. one size
|
|
3301 function called, then the other one called, then the copy functions
|
|
3302 called). To handle this, we keep a list of active data, indexed by the
|
|
3303 src expression. (We use the stringize operator to avoid evaluating the
|
|
3304 expression multiple times.) If the caller uses the exact same src
|
|
3305 expression twice in two converter calls in the same subexpression, we
|
|
3306 will lose, but at least we can check for this and abort(). We could
|
|
3307 conceivably try to index on other parameters as well, but there is not
|
|
3308 really any point. */
|
|
3309
|
|
3310 typedef struct
|
|
3311 {
|
|
3312 const char *srctext;
|
|
3313 void *dst;
|
|
3314 Bytecount dst_size;
|
|
3315 } dfc_e2c_vals;
|
|
3316
|
|
3317 typedef struct
|
|
3318 {
|
|
3319 Dynarr_declare (dfc_e2c_vals);
|
|
3320 } dfc_e2c_vals_dynarr;
|
|
3321
|
|
3322 static dfc_e2c_vals_dynarr *active_dfc_e2c;
|
|
3323
|
|
3324 static int
|
|
3325 find_pos_of_existing_active_dfc_e2c (const char *srctext)
|
|
3326 {
|
|
3327 dfc_e2c_vals *vals = NULL;
|
|
3328 int i;
|
|
3329
|
|
3330 for (i = 0; i < Dynarr_length (active_dfc_e2c); i++)
|
|
3331 {
|
|
3332 vals = Dynarr_atp (active_dfc_e2c, i);
|
|
3333 if (vals->srctext == srctext)
|
|
3334 return i;
|
|
3335 }
|
|
3336
|
|
3337 return -1;
|
|
3338 }
|
|
3339
|
|
3340 void *
|
|
3341 new_dfc_convert_alloca (const char *srctext, void *alloca_data)
|
|
3342 {
|
|
3343 dfc_e2c_vals *vals;
|
|
3344 int i = find_pos_of_existing_active_dfc_e2c (srctext);
|
|
3345
|
|
3346 assert (i >= 0);
|
|
3347 vals = Dynarr_atp (active_dfc_e2c, i);
|
|
3348 assert (alloca_data);
|
|
3349 memcpy (alloca_data, vals->dst, vals->dst_size + 2);
|
1726
|
3350 xfree (vals->dst, void *);
|
1318
|
3351 Dynarr_delete (active_dfc_e2c, i);
|
|
3352 return alloca_data;
|
|
3353 }
|
|
3354
|
|
3355 Bytecount
|
|
3356 new_dfc_convert_size (const char *srctext, const void *src,
|
|
3357 Bytecount src_size, enum new_dfc_src_type type,
|
|
3358 Lisp_Object codesys)
|
|
3359 {
|
|
3360 dfc_e2c_vals vals;
|
|
3361
|
|
3362 assert (find_pos_of_existing_active_dfc_e2c (srctext) < 0);
|
|
3363
|
|
3364 vals.srctext = srctext;
|
|
3365
|
|
3366 new_dfc_convert_now_damn_it (src, src_size, type, &vals.dst, &vals.dst_size,
|
|
3367 codesys);
|
|
3368
|
|
3369 Dynarr_add (active_dfc_e2c, vals);
|
|
3370 /* The size is always + 2 because we have double zero-termination at the
|
|
3371 end of all data (for Unicode-correctness). */
|
|
3372 return vals.dst_size + 2;
|
|
3373 }
|
|
3374
|
771
|
3375
|
|
3376 /************************************************************************/
|
867
|
3377 /* Basic Ichar functions */
|
771
|
3378 /************************************************************************/
|
|
3379
|
|
3380 #ifdef MULE
|
|
3381
|
|
3382 /* Convert a non-ASCII Mule character C into a one-character Mule-encoded
|
|
3383 string in STR. Returns the number of bytes stored.
|
867
|
3384 Do not call this directly. Use the macro set_itext_ichar() instead.
|
771
|
3385 */
|
|
3386
|
|
3387 Bytecount
|
867
|
3388 non_ascii_set_itext_ichar (Ibyte *str, Ichar c)
|
771
|
3389 {
|
867
|
3390 Ibyte *p;
|
|
3391 Ibyte lb;
|
771
|
3392 int c1, c2;
|
|
3393 Lisp_Object charset;
|
|
3394
|
|
3395 p = str;
|
867
|
3396 BREAKUP_ICHAR (c, charset, c1, c2);
|
|
3397 lb = ichar_leading_byte (c);
|
826
|
3398 if (leading_byte_private_p (lb))
|
|
3399 *p++ = private_leading_byte_prefix (lb);
|
771
|
3400 *p++ = lb;
|
|
3401 if (EQ (charset, Vcharset_control_1))
|
|
3402 c1 += 0x20;
|
|
3403 *p++ = c1 | 0x80;
|
|
3404 if (c2)
|
|
3405 *p++ = c2 | 0x80;
|
|
3406
|
|
3407 return (p - str);
|
|
3408 }
|
|
3409
|
|
3410 /* Return the first character from a Mule-encoded string in STR,
|
|
3411 assuming it's non-ASCII. Do not call this directly.
|
867
|
3412 Use the macro itext_ichar() instead. */
|
|
3413
|
|
3414 Ichar
|
|
3415 non_ascii_itext_ichar (const Ibyte *str)
|
771
|
3416 {
|
867
|
3417 Ibyte i0 = *str, i1, i2 = 0;
|
771
|
3418 Lisp_Object charset;
|
|
3419
|
|
3420 if (i0 == LEADING_BYTE_CONTROL_1)
|
867
|
3421 return (Ichar) (*++str - 0x20);
|
771
|
3422
|
826
|
3423 if (leading_byte_prefix_p (i0))
|
771
|
3424 i0 = *++str;
|
|
3425
|
|
3426 i1 = *++str & 0x7F;
|
|
3427
|
826
|
3428 charset = charset_by_leading_byte (i0);
|
771
|
3429 if (XCHARSET_DIMENSION (charset) == 2)
|
|
3430 i2 = *++str & 0x7F;
|
|
3431
|
867
|
3432 return make_ichar (charset, i1, i2);
|
771
|
3433 }
|
|
3434
|
867
|
3435 /* Return whether CH is a valid Ichar, assuming it's non-ASCII.
|
|
3436 Do not call this directly. Use the macro valid_ichar_p() instead. */
|
771
|
3437
|
|
3438 int
|
867
|
3439 non_ascii_valid_ichar_p (Ichar ch)
|
771
|
3440 {
|
|
3441 int f1, f2, f3;
|
|
3442
|
|
3443 /* Must have only lowest 19 bits set */
|
|
3444 if (ch & ~0x7FFFF)
|
|
3445 return 0;
|
|
3446
|
867
|
3447 f1 = ichar_field1 (ch);
|
|
3448 f2 = ichar_field2 (ch);
|
|
3449 f3 = ichar_field3 (ch);
|
771
|
3450
|
|
3451 if (f1 == 0)
|
|
3452 {
|
|
3453 /* dimension-1 char */
|
|
3454 Lisp_Object charset;
|
|
3455
|
|
3456 /* leading byte must be correct */
|
867
|
3457 if (f2 < MIN_ICHAR_FIELD2_OFFICIAL ||
|
|
3458 (f2 > MAX_ICHAR_FIELD2_OFFICIAL && f2 < MIN_ICHAR_FIELD2_PRIVATE) ||
|
|
3459 f2 > MAX_ICHAR_FIELD2_PRIVATE)
|
771
|
3460 return 0;
|
|
3461 /* octet not out of range */
|
|
3462 if (f3 < 0x20)
|
|
3463 return 0;
|
|
3464 /* charset exists */
|
|
3465 /*
|
|
3466 NOTE: This takes advantage of the fact that
|
|
3467 FIELD2_TO_OFFICIAL_LEADING_BYTE and
|
|
3468 FIELD2_TO_PRIVATE_LEADING_BYTE are the same.
|
|
3469 */
|
826
|
3470 charset = charset_by_leading_byte (f2 + FIELD2_TO_OFFICIAL_LEADING_BYTE);
|
771
|
3471 if (EQ (charset, Qnil))
|
|
3472 return 0;
|
|
3473 /* check range as per size (94 or 96) of charset */
|
|
3474 return ((f3 > 0x20 && f3 < 0x7f) || XCHARSET_CHARS (charset) == 96);
|
|
3475 }
|
|
3476 else
|
|
3477 {
|
|
3478 /* dimension-2 char */
|
|
3479 Lisp_Object charset;
|
|
3480
|
|
3481 /* leading byte must be correct */
|
867
|
3482 if (f1 < MIN_ICHAR_FIELD1_OFFICIAL ||
|
|
3483 (f1 > MAX_ICHAR_FIELD1_OFFICIAL && f1 < MIN_ICHAR_FIELD1_PRIVATE) ||
|
|
3484 f1 > MAX_ICHAR_FIELD1_PRIVATE)
|
771
|
3485 return 0;
|
|
3486 /* octets not out of range */
|
|
3487 if (f2 < 0x20 || f3 < 0x20)
|
|
3488 return 0;
|
|
3489
|
|
3490 #ifdef ENABLE_COMPOSITE_CHARS
|
|
3491 if (f1 + FIELD1_TO_OFFICIAL_LEADING_BYTE == LEADING_BYTE_COMPOSITE)
|
|
3492 {
|
|
3493 if (UNBOUNDP (Fgethash (make_int (ch),
|
|
3494 Vcomposite_char_char2string_hash_table,
|
|
3495 Qunbound)))
|
|
3496 return 0;
|
|
3497 return 1;
|
|
3498 }
|
|
3499 #endif /* ENABLE_COMPOSITE_CHARS */
|
|
3500
|
|
3501 /* charset exists */
|
867
|
3502 if (f1 <= MAX_ICHAR_FIELD1_OFFICIAL)
|
771
|
3503 charset =
|
826
|
3504 charset_by_leading_byte (f1 + FIELD1_TO_OFFICIAL_LEADING_BYTE);
|
771
|
3505 else
|
|
3506 charset =
|
826
|
3507 charset_by_leading_byte (f1 + FIELD1_TO_PRIVATE_LEADING_BYTE);
|
771
|
3508
|
|
3509 if (EQ (charset, Qnil))
|
|
3510 return 0;
|
|
3511 /* check range as per size (94x94 or 96x96) of charset */
|
|
3512 return ((f2 != 0x20 && f2 != 0x7F && f3 != 0x20 && f3 != 0x7F) ||
|
|
3513 XCHARSET_CHARS (charset) == 96);
|
|
3514 }
|
|
3515 }
|
|
3516
|
|
3517 /* Copy the character pointed to by SRC into DST. Do not call this
|
867
|
3518 directly. Use the macro itext_copy_ichar() instead.
|
771
|
3519 Return the number of bytes copied. */
|
|
3520
|
|
3521 Bytecount
|
867
|
3522 non_ascii_itext_copy_ichar (const Ibyte *src, Ibyte *dst)
|
771
|
3523 {
|
826
|
3524 Bytecount bytes = rep_bytes_by_first_byte (*src);
|
771
|
3525 Bytecount i;
|
|
3526 for (i = bytes; i; i--, dst++, src++)
|
|
3527 *dst = *src;
|
|
3528 return bytes;
|
|
3529 }
|
|
3530
|
|
3531 #endif /* MULE */
|
|
3532
|
|
3533
|
|
3534 /************************************************************************/
|
867
|
3535 /* streams of Ichars */
|
771
|
3536 /************************************************************************/
|
|
3537
|
|
3538 #ifdef MULE
|
|
3539
|
867
|
3540 /* Treat a stream as a stream of Ichar's rather than a stream of bytes.
|
771
|
3541 The functions below are not meant to be called directly; use
|
|
3542 the macros in insdel.h. */
|
|
3543
|
867
|
3544 Ichar
|
|
3545 Lstream_get_ichar_1 (Lstream *stream, int ch)
|
771
|
3546 {
|
867
|
3547 Ibyte str[MAX_ICHAR_LEN];
|
|
3548 Ibyte *strptr = str;
|
771
|
3549 Bytecount bytes;
|
|
3550
|
867
|
3551 str[0] = (Ibyte) ch;
|
771
|
3552
|
826
|
3553 for (bytes = rep_bytes_by_first_byte (ch) - 1; bytes; bytes--)
|
771
|
3554 {
|
|
3555 int c = Lstream_getc (stream);
|
800
|
3556 text_checking_assert (c >= 0);
|
867
|
3557 *++strptr = (Ibyte) c;
|
771
|
3558 }
|
867
|
3559 return itext_ichar (str);
|
771
|
3560 }
|
|
3561
|
|
3562 int
|
867
|
3563 Lstream_fput_ichar (Lstream *stream, Ichar ch)
|
771
|
3564 {
|
867
|
3565 Ibyte str[MAX_ICHAR_LEN];
|
|
3566 Bytecount len = set_itext_ichar (str, ch);
|
771
|
3567 return Lstream_write (stream, str, len);
|
|
3568 }
|
|
3569
|
|
3570 void
|
867
|
3571 Lstream_funget_ichar (Lstream *stream, Ichar ch)
|
771
|
3572 {
|
867
|
3573 Ibyte str[MAX_ICHAR_LEN];
|
|
3574 Bytecount len = set_itext_ichar (str, ch);
|
771
|
3575 Lstream_unread (stream, str, len);
|
|
3576 }
|
|
3577
|
|
3578 #endif /* MULE */
|
|
3579
|
|
3580
|
|
3581 /************************************************************************/
|
|
3582 /* Lisp primitives for working with characters */
|
|
3583 /************************************************************************/
|
|
3584
|
|
3585 DEFUN ("make-char", Fmake_char, 2, 3, 0, /*
|
|
3586 Make a character from CHARSET and octets ARG1 and ARG2.
|
|
3587 ARG2 is required only for characters from two-dimensional charsets.
|
|
3588
|
|
3589 Each octet should be in the range 32 through 127 for a 96 or 96x96
|
|
3590 charset and 33 through 126 for a 94 or 94x94 charset. (Most charsets
|
|
3591 are either 96 or 94x94.) Note that this is 32 more than the values
|
|
3592 typically given for 94x94 charsets. When two octets are required, the
|
|
3593 order is "standard" -- the same as appears in ISO-2022 encodings,
|
|
3594 reference tables, etc.
|
|
3595
|
|
3596 \(Note the following non-obvious result: Computerized translation
|
|
3597 tables often encode the two octets as the high and low bytes,
|
|
3598 respectively, of a hex short, while when there's only one octet, it
|
|
3599 goes in the low byte. When decoding such a value, you need to treat
|
|
3600 the two cases differently when calling make-char: One is (make-char
|
|
3601 CHARSET HIGH LOW), the other is (make-char CHARSET LOW).)
|
|
3602
|
|
3603 For example, (make-char 'latin-iso8859-2 185) or (make-char
|
|
3604 'latin-iso8859-2 57) will return the Latin 2 character s with caron.
|
|
3605
|
|
3606 As another example, the Japanese character for "kawa" (stream), which
|
|
3607 looks something like this:
|
|
3608
|
|
3609 | |
|
|
3610 | | |
|
|
3611 | | |
|
|
3612 | | |
|
|
3613 / |
|
|
3614
|
|
3615 appears in the Unicode Standard (version 2.0) on page 7-287 with the
|
|
3616 following values (see also page 7-4):
|
|
3617
|
|
3618 U 5DDD (Unicode)
|
|
3619 G 0-2008 (GB 2312-80)
|
|
3620 J 0-3278 (JIS X 0208-1990)
|
|
3621 K 0-8425 (KS C 5601-1987)
|
|
3622 B A474 (Big Five)
|
|
3623 C 1-4455 (CNS 11643-1986 (1st plane))
|
|
3624 A 213C34 (ANSI Z39.64-1989)
|
|
3625
|
|
3626 These are equivalent to:
|
|
3627
|
|
3628 \(make-char 'chinese-gb2312 52 40)
|
|
3629 \(make-char 'japanese-jisx0208 64 110)
|
|
3630 \(make-char 'korean-ksc5601 116 57)
|
|
3631 \(make-char 'chinese-cns11643-1 76 87)
|
|
3632 \(decode-big5-char '(164 . 116))
|
|
3633
|
|
3634 \(All codes above are two decimal numbers except for Big Five and ANSI
|
|
3635 Z39.64, which we don't support. We add 32 to each of the decimal
|
|
3636 numbers. Big Five is split in a rather hackish fashion into two
|
|
3637 charsets, `big5-1' and `big5-2', due to its excessive size -- 94x157,
|
|
3638 with the first codepoint in the range 0xA1 to 0xFE and the second in
|
|
3639 the range 0x40 to 0x7E or 0xA1 to 0xFE. `decode-big5-char' is used to
|
|
3640 generate the char from its codes, and `encode-big5-char' extracts the
|
|
3641 codes.)
|
|
3642
|
|
3643 When compiled without MULE, this function does not do much, but it's
|
|
3644 provided for compatibility. In this case, the following CHARSET symbols
|
|
3645 are allowed:
|
|
3646
|
|
3647 `ascii' -- ARG1 should be in the range 0 through 127.
|
|
3648 `control-1' -- ARG1 should be in the range 128 through 159.
|
|
3649 else -- ARG1 is coerced to be between 0 and 255, and then the high
|
|
3650 bit is set.
|
|
3651
|
|
3652 `int-to-char of the resulting ARG1' is returned, and ARG2 is always ignored.
|
|
3653 */
|
2333
|
3654 (charset, arg1, USED_IF_MULE (arg2)))
|
771
|
3655 {
|
|
3656 #ifdef MULE
|
|
3657 Lisp_Charset *cs;
|
|
3658 int a1, a2;
|
|
3659 int lowlim, highlim;
|
|
3660
|
|
3661 charset = Fget_charset (charset);
|
|
3662 cs = XCHARSET (charset);
|
|
3663
|
788
|
3664 get_charset_limits (charset, &lowlim, &highlim);
|
771
|
3665
|
|
3666 CHECK_INT (arg1);
|
|
3667 /* It is useful (and safe, according to Olivier Galibert) to strip
|
|
3668 the 8th bit off ARG1 and ARG2 because it allows programmers to
|
|
3669 write (make-char 'latin-iso8859-2 CODE) where code is the actual
|
|
3670 Latin 2 code of the character. */
|
|
3671 a1 = XINT (arg1) & 0x7f;
|
|
3672 if (a1 < lowlim || a1 > highlim)
|
|
3673 args_out_of_range_3 (arg1, make_int (lowlim), make_int (highlim));
|
|
3674
|
|
3675 if (CHARSET_DIMENSION (cs) == 1)
|
|
3676 {
|
|
3677 if (!NILP (arg2))
|
|
3678 invalid_argument
|
|
3679 ("Charset is of dimension one; second octet must be nil", arg2);
|
867
|
3680 return make_char (make_ichar (charset, a1, 0));
|
771
|
3681 }
|
|
3682
|
|
3683 CHECK_INT (arg2);
|
|
3684 a2 = XINT (arg2) & 0x7f;
|
|
3685 if (a2 < lowlim || a2 > highlim)
|
|
3686 args_out_of_range_3 (arg2, make_int (lowlim), make_int (highlim));
|
|
3687
|
867
|
3688 return make_char (make_ichar (charset, a1, a2));
|
771
|
3689 #else
|
|
3690 int a1;
|
|
3691 int lowlim, highlim;
|
|
3692
|
|
3693 if (EQ (charset, Qascii)) lowlim = 0, highlim = 127;
|
|
3694 else if (EQ (charset, Qcontrol_1)) lowlim = 0, highlim = 31;
|
|
3695 else lowlim = 0, highlim = 127;
|
|
3696
|
|
3697 CHECK_INT (arg1);
|
|
3698 /* It is useful (and safe, according to Olivier Galibert) to strip
|
|
3699 the 8th bit off ARG1 and ARG2 because it allows programmers to
|
|
3700 write (make-char 'latin-iso8859-2 CODE) where code is the actual
|
|
3701 Latin 2 code of the character. */
|
|
3702 a1 = XINT (arg1) & 0x7f;
|
|
3703 if (a1 < lowlim || a1 > highlim)
|
|
3704 args_out_of_range_3 (arg1, make_int (lowlim), make_int (highlim));
|
|
3705
|
|
3706 if (EQ (charset, Qascii))
|
|
3707 return make_char (a1);
|
|
3708 return make_char (a1 + 128);
|
|
3709 #endif /* MULE */
|
|
3710 }
|
|
3711
|
|
3712 #ifdef MULE
|
|
3713
|
|
3714 DEFUN ("char-charset", Fchar_charset, 1, 1, 0, /*
|
|
3715 Return the character set of char CH.
|
|
3716 */
|
|
3717 (ch))
|
|
3718 {
|
|
3719 CHECK_CHAR_COERCE_INT (ch);
|
|
3720
|
826
|
3721 return XCHARSET_NAME (charset_by_leading_byte
|
867
|
3722 (ichar_leading_byte (XCHAR (ch))));
|
771
|
3723 }
|
|
3724
|
|
3725 DEFUN ("char-octet", Fchar_octet, 1, 2, 0, /*
|
|
3726 Return the octet numbered N (should be 0 or 1) of char CH.
|
|
3727 N defaults to 0 if omitted.
|
|
3728 */
|
|
3729 (ch, n))
|
|
3730 {
|
|
3731 Lisp_Object charset;
|
|
3732 int octet0, octet1;
|
|
3733
|
|
3734 CHECK_CHAR_COERCE_INT (ch);
|
|
3735
|
867
|
3736 BREAKUP_ICHAR (XCHAR (ch), charset, octet0, octet1);
|
771
|
3737
|
|
3738 if (NILP (n) || EQ (n, Qzero))
|
|
3739 return make_int (octet0);
|
|
3740 else if (EQ (n, make_int (1)))
|
|
3741 return make_int (octet1);
|
|
3742 else
|
|
3743 invalid_constant ("Octet number must be 0 or 1", n);
|
|
3744 }
|
|
3745
|
|
3746 DEFUN ("split-char", Fsplit_char, 1, 1, 0, /*
|
|
3747 Return list of charset and one or two position-codes of CHAR.
|
|
3748 */
|
|
3749 (character))
|
|
3750 {
|
|
3751 /* This function can GC */
|
|
3752 struct gcpro gcpro1, gcpro2;
|
|
3753 Lisp_Object charset = Qnil;
|
|
3754 Lisp_Object rc = Qnil;
|
|
3755 int c1, c2;
|
|
3756
|
|
3757 GCPRO2 (charset, rc);
|
|
3758 CHECK_CHAR_COERCE_INT (character);
|
|
3759
|
867
|
3760 BREAKUP_ICHAR (XCHAR (character), charset, c1, c2);
|
771
|
3761
|
|
3762 if (XCHARSET_DIMENSION (Fget_charset (charset)) == 2)
|
|
3763 {
|
|
3764 rc = list3 (XCHARSET_NAME (charset), make_int (c1), make_int (c2));
|
|
3765 }
|
|
3766 else
|
|
3767 {
|
|
3768 rc = list2 (XCHARSET_NAME (charset), make_int (c1));
|
|
3769 }
|
|
3770 UNGCPRO;
|
|
3771
|
|
3772 return rc;
|
|
3773 }
|
|
3774
|
|
3775 #endif /* MULE */
|
|
3776
|
|
3777
|
|
3778 /************************************************************************/
|
|
3779 /* composite character functions */
|
|
3780 /************************************************************************/
|
|
3781
|
|
3782 #ifdef ENABLE_COMPOSITE_CHARS
|
|
3783
|
867
|
3784 Ichar
|
|
3785 lookup_composite_char (Ibyte *str, int len)
|
771
|
3786 {
|
|
3787 Lisp_Object lispstr = make_string (str, len);
|
|
3788 Lisp_Object ch = Fgethash (lispstr,
|
|
3789 Vcomposite_char_string2char_hash_table,
|
|
3790 Qunbound);
|
867
|
3791 Ichar emch;
|
771
|
3792
|
|
3793 if (UNBOUNDP (ch))
|
|
3794 {
|
|
3795 if (composite_char_row_next >= 128)
|
|
3796 invalid_operation ("No more composite chars available", lispstr);
|
867
|
3797 emch = make_ichar (Vcharset_composite, composite_char_row_next,
|
771
|
3798 composite_char_col_next);
|
|
3799 Fputhash (make_char (emch), lispstr,
|
|
3800 Vcomposite_char_char2string_hash_table);
|
|
3801 Fputhash (lispstr, make_char (emch),
|
|
3802 Vcomposite_char_string2char_hash_table);
|
|
3803 composite_char_col_next++;
|
|
3804 if (composite_char_col_next >= 128)
|
|
3805 {
|
|
3806 composite_char_col_next = 32;
|
|
3807 composite_char_row_next++;
|
|
3808 }
|
|
3809 }
|
|
3810 else
|
|
3811 emch = XCHAR (ch);
|
|
3812 return emch;
|
|
3813 }
|
|
3814
|
|
3815 Lisp_Object
|
867
|
3816 composite_char_string (Ichar ch)
|
771
|
3817 {
|
|
3818 Lisp_Object str = Fgethash (make_char (ch),
|
|
3819 Vcomposite_char_char2string_hash_table,
|
|
3820 Qunbound);
|
|
3821 assert (!UNBOUNDP (str));
|
|
3822 return str;
|
|
3823 }
|
|
3824
|
826
|
3825 DEFUN ("make-composite-char", Fmake_composite_char, 1, 1, 0, /*
|
771
|
3826 Convert a string into a single composite character.
|
|
3827 The character is the result of overstriking all the characters in
|
|
3828 the string.
|
|
3829 */
|
|
3830 (string))
|
|
3831 {
|
|
3832 CHECK_STRING (string);
|
|
3833 return make_char (lookup_composite_char (XSTRING_DATA (string),
|
|
3834 XSTRING_LENGTH (string)));
|
|
3835 }
|
|
3836
|
826
|
3837 DEFUN ("composite-char-string", Fcomposite_char_string, 1, 1, 0, /*
|
771
|
3838 Return a string of the characters comprising a composite character.
|
|
3839 */
|
|
3840 (ch))
|
|
3841 {
|
867
|
3842 Ichar emch;
|
771
|
3843
|
|
3844 CHECK_CHAR (ch);
|
|
3845 emch = XCHAR (ch);
|
867
|
3846 if (ichar_leading_byte (emch) != LEADING_BYTE_COMPOSITE)
|
771
|
3847 invalid_argument ("Must be composite char", ch);
|
|
3848 return composite_char_string (emch);
|
|
3849 }
|
|
3850 #endif /* ENABLE_COMPOSITE_CHARS */
|
|
3851
|
|
3852
|
|
3853 /************************************************************************/
|
|
3854 /* initialization */
|
|
3855 /************************************************************************/
|
|
3856
|
|
3857 void
|
1204
|
3858 reinit_eistring_early (void)
|
771
|
3859 {
|
|
3860 the_eistring_malloc_zero_init = the_eistring_zero_init;
|
|
3861 the_eistring_malloc_zero_init.mallocp_ = 1;
|
|
3862 }
|
|
3863
|
|
/* Early one-time Eistring initialization.  All of the work is done by
   reinit_eistring_early(). */

void
init_eistring_once_early (void)
{
  reinit_eistring_early ();
}
|
|
3869
|
|
3870 void
|
771
|
3871 syms_of_text (void)
|
|
3872 {
|
|
3873 DEFSUBR (Fmake_char);
|
|
3874
|
|
3875 #ifdef MULE
|
|
3876 DEFSUBR (Fchar_charset);
|
|
3877 DEFSUBR (Fchar_octet);
|
|
3878 DEFSUBR (Fsplit_char);
|
|
3879
|
|
3880 #ifdef ENABLE_COMPOSITE_CHARS
|
|
3881 DEFSUBR (Fmake_composite_char);
|
|
3882 DEFSUBR (Fcomposite_char_string);
|
|
3883 #endif
|
|
3884 #endif /* MULE */
|
|
3885 }
|
|
3886
|
|
3887 void
|
|
3888 reinit_vars_of_text (void)
|
|
3889 {
|
|
3890 int i;
|
|
3891
|
867
|
3892 conversion_in_dynarr_list = Dynarr_new2 (Ibyte_dynarr_dynarr,
|
|
3893 Ibyte_dynarr *);
|
771
|
3894 conversion_out_dynarr_list = Dynarr_new2 (Extbyte_dynarr_dynarr,
|
|
3895 Extbyte_dynarr *);
|
1318
|
3896 active_dfc_e2c = Dynarr_new (dfc_e2c_vals);
|
771
|
3897
|
|
3898 for (i = 0; i <= MAX_BYTEBPOS_GAP_SIZE_3; i++)
|
|
3899 three_to_one_table[i] = i / 3;
|
|
3900 }
|
|
3901
|
|
3902 void
|
|
3903 vars_of_text (void)
|
|
3904 {
|
|
3905 reinit_vars_of_text ();
|
|
3906
|
1292
|
3907 QSin_char_byte_conversion = build_msg_string ("(in char-byte conversion)");
|
|
3908 staticpro (&QSin_char_byte_conversion);
|
|
3909 QSin_internal_external_conversion =
|
|
3910 build_msg_string ("(in internal-external conversion)");
|
|
3911 staticpro (&QSin_internal_external_conversion);
|
|
3912
|
771
|
3913 #ifdef ENABLE_COMPOSITE_CHARS
|
|
3914 /* #### not dumped properly */
|
|
3915 composite_char_row_next = 32;
|
|
3916 composite_char_col_next = 32;
|
|
3917
|
|
3918 Vcomposite_char_string2char_hash_table =
|
|
3919 make_lisp_hash_table (500, HASH_TABLE_NON_WEAK, HASH_TABLE_EQUAL);
|
|
3920 Vcomposite_char_char2string_hash_table =
|
|
3921 make_lisp_hash_table (500, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
|
|
3922 staticpro (&Vcomposite_char_string2char_hash_table);
|
|
3923 staticpro (&Vcomposite_char_char2string_hash_table);
|
|
3924 #endif /* ENABLE_COMPOSITE_CHARS */
|
|
3925 }
|