771
|
1 /* Buffer manipulation primitives for XEmacs.
|
|
2 Copyright (C) 1995 Sun Microsystems, Inc.
|
|
3 Copyright (C) 1995, 1996, 2000, 2001, 2002 Ben Wing.
|
|
4 Copyright (C) 1999 Martin Buchholz.
|
|
5
|
|
6 This file is part of XEmacs.
|
|
7
|
|
8 XEmacs is free software; you can redistribute it and/or modify it
|
|
9 under the terms of the GNU General Public License as published by the
|
|
10 Free Software Foundation; either version 2, or (at your option) any
|
|
11 later version.
|
|
12
|
|
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
|
|
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
16 for more details.
|
|
17
|
|
18 You should have received a copy of the GNU General Public License
|
|
19 along with XEmacs; see the file COPYING. If not, write to
|
|
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
21 Boston, MA 02111-1307, USA. */
|
|
22
|
|
23 /* Synched up with: Not in FSF. */
|
|
24
|
|
25 /* Authorship:
|
|
26 */
|
|
27
|
|
28 #include <config.h>
|
|
29 #include "lisp.h"
|
|
30
|
|
31 #include "buffer.h"
|
|
32 #include "charset.h"
|
|
33 #include "file-coding.h"
|
|
34 #include "lstream.h"
|
|
35
|
|
36
|
|
37 /************************************************************************/
|
|
38 /* long comments */
|
|
39 /************************************************************************/
|
|
40
|
|
41 /*
|
|
42 There are three possible ways to specify positions in a buffer. All
|
|
43 of these are one-based: the beginning of the buffer is position or
|
|
44 index 1, and 0 is not a valid position.
|
|
45
|
|
46 As a "buffer position" (typedef Charbpos):
|
|
47
|
|
48 This is an index specifying an offset in characters from the
|
|
49 beginning of the buffer. Note that buffer positions are
|
|
50 logically *between* characters, not on a character. The
|
|
51 difference between two buffer positions specifies the number of
|
|
52 characters between those positions. Buffer positions are the
|
|
53 only kind of position externally visible to the user.
|
|
54
|
|
55 As a "byte index" (typedef Bytebpos):
|
|
56
|
|
57 This is an index over the bytes used to represent the characters
|
|
58 in the buffer. If there is no Mule support, this is identical
|
|
59 to a buffer position, because each character is represented
|
|
60 using one byte. However, with Mule support, many characters
|
|
61 require two or more bytes for their representation, and so a
|
|
62 byte index may be greater than the corresponding buffer
|
|
63 position.
|
|
64
|
|
65 As a "memory index" (typedef Membpos):
|
|
66
|
|
67 This is the byte index adjusted for the gap. For positions
|
|
68 before the gap, this is identical to the byte index. For
|
|
69 positions after the gap, this is the byte index plus the gap
|
|
70 size. There are two possible memory indices for the gap
|
|
71 position; the memory index at the beginning of the gap should
|
|
72 always be used, except in code that deals with manipulating the
|
|
73 gap, where both indices may be seen. The address of the
|
|
74 character "at" (i.e. following) a particular position can be
|
|
75 obtained from the formula
|
|
76
|
|
77 buffer_start_address + memory_index(position) - 1
|
|
78
|
|
79 except in the case of characters at the gap position.
|
|
80
|
|
81 Other typedefs:
|
|
82 ===============
|
|
83
|
|
84 Emchar:
|
|
85 -------
|
|
86 This typedef represents a single Emacs character, which can be
|
|
87 ASCII, ISO-8859, or some extended character, as would typically
|
|
88 be used for Kanji. Note that the representation of a character
|
|
89 as an Emchar is *not* the same as the representation of that
|
|
90 same character in a string; thus, you cannot do the standard
|
|
91 C trick of passing a pointer to a character to a function that
|
|
92 expects a string.
|
|
93
|
|
94 An Emchar takes up 19 bits of representation and (for code
|
|
95 compatibility and such) is compatible with an int. This
|
|
96 representation is visible on the Lisp level. The important
|
|
97 characteristics of the Emchar representation are
|
|
98
|
|
99 -- values 0x00 - 0x7f represent ASCII.
|
|
100 -- values 0x80 - 0xff represent the right half of ISO-8859-1.
|
|
101 -- values 0x100 and up represent all other characters.
|
|
102
|
|
103 This means that Emchar values are upwardly compatible with
|
|
104 the standard 8-bit representation of ASCII/ISO-8859-1.
|
|
105
|
|
106 Intbyte:
|
|
107 --------
|
|
108 The data in a buffer or string is logically made up of Intbyte
|
|
109 objects, where an Intbyte takes up the same amount of space as a
|
|
110 char. (It is declared differently, though, to catch invalid
|
|
111 usages.) Strings stored using Intbytes are said to be in
|
|
112 "internal format". The important characteristics of internal
|
|
113 format are
|
|
114
|
|
115 -- ASCII characters are represented as a single Intbyte,
|
|
116 in the range 0 - 0x7f.
|
|
117 -- All other characters are represented as an Intbyte in
|
|
118 the range 0x80 - 0x9f followed by one or more Intbytes
|
|
119 in the range 0xa0 to 0xff.
|
|
120
|
|
121 This leads to a number of desirable properties:
|
|
122
|
|
123 -- Given the position of the beginning of a character,
|
|
124 you can find the beginning of the next or previous
|
|
125 character in constant time.
|
|
126 -- When searching for a substring or an ASCII character
|
|
127 within the string, you need merely use standard
|
|
128 searching routines.
|
|
129
|
|
130 array of char:
|
|
131 --------------
|
|
132 Strings that go in or out of Emacs are in "external format",
|
|
133 typedef'ed as an array of char or a char *. There is more
|
|
134 than one external format (JIS, EUC, etc.) but they all
|
|
135 have similar properties. They are modal encodings,
|
|
136 which is to say that the meaning of particular bytes is
|
|
137 not fixed but depends on what "mode" the string is currently
|
|
138 in (e.g. bytes in the range 0 - 0x7f might be
|
|
139 interpreted as ASCII, or as Hiragana, or as 2-byte Kanji,
|
|
140 depending on the current mode). The mode starts out in
|
|
141 ASCII/ISO-8859-1 and is switched using escape sequences --
|
|
142 for example, in the JIS encoding, 'ESC $ B' switches to a
|
|
143 mode where pairs of bytes in the range 0 - 0x7f
|
|
144 are interpreted as Kanji characters.
|
|
145
|
|
146 External-formatted data is generally desirable for passing
|
|
147 data between programs because it is upwardly compatible
|
|
148 with standard ASCII/ISO-8859-1 strings and may require
|
|
149 less space than internal encodings such as the one
|
|
150 described above. In addition, some encodings (e.g. JIS)
|
|
151 keep all characters (except the ESC used to switch modes)
|
|
152 in the printing ASCII range 0x20 - 0x7e, which results in
|
|
153 a much higher probability that the data will avoid being
|
|
154 garbled in transmission. Externally-formatted data is
|
|
155 generally not very convenient to work with, however, and
|
|
156 for this reason is usually converted to internal format
|
|
157 before any work is done on the string.
|
|
158
|
|
159 NOTE: filenames need to be in external format so that
|
|
160 ISO-8859-1 characters come out correctly.
|
|
161
|
|
162 Charcount:
|
|
163 ----------
|
|
164 This typedef represents a count of characters, such as
|
|
165 a character offset into a string or the number of
|
|
166 characters between two positions in a buffer. The
|
|
167 difference between two Charbpos's is a Charcount, and
|
|
168 character positions in a string are represented using
|
|
169 a Charcount.
|
|
170
|
|
171 Bytecount:
|
|
172 ----------
|
|
173 Similar to a Charcount but represents a count of bytes.
|
|
174 The difference between two Bytebpos's is a Bytecount.
|
|
175
|
|
176
|
|
177 Usage of the various representations:
|
|
178 =====================================
|
|
179
|
|
180 Memory indices are used in low-level functions in insdel.c and for
|
|
181 extent endpoints and marker positions. The reason for this is that
|
|
182 this way, the extents and markers don't need to be updated for most
|
|
183 insertions, which merely shrink the gap and don't move any
|
|
184 characters around in memory.
|
|
185
|
|
186 (The beginning-of-gap memory index simplifies insertions w.r.t.
|
|
187 markers, because text usually gets inserted after markers. For
|
|
188 extents, it is merely for consistency, because text can get
|
|
189 inserted either before or after an extent's endpoint depending on
|
|
190 the open/closedness of the endpoint.)
|
|
191
|
|
192 Byte indices are used in other code that needs to be fast,
|
|
193 such as the searching, redisplay, and extent-manipulation code.
|
|
194
|
|
195 Buffer positions are used in all other code. This is because this
|
|
196 representation is easiest to work with (especially since Lisp
|
|
197 code always uses buffer positions), necessitates the fewest
|
|
198 changes to existing code, and is the safest (e.g. if the text gets
|
|
199 shifted underneath a buffer position, it will still point to a
|
|
200 character; if text is shifted under a byte index, it might point
|
|
201 to the middle of a character, which would be bad).
|
|
202
|
|
203 Similarly, Charcounts are used in all code that deals with strings
|
|
204 except for code that needs to be fast, which uses Bytecounts.
|
|
205
|
|
206 Strings are always passed around internally using internal format.
|
|
207 Conversions to and from external format are performed at the time
|
|
208 that the data goes in or out of Emacs.
|
|
209
|
|
210 Working with the various representations:
|
|
211 ========================================= */
|
|
212
|
|
213 /* We write things this way because it's very important that
|
|
214 MAX_BYTEBPOS_GAP_SIZE_3 is a multiple of 3. (As it happens,
|
|
215 65535 is a multiple of 3, but this may not always be the
|
|
216 case.) */
|
|
217
|
|
218
|
|
219 /*
|
|
220 1. Character Sets
|
|
221 =================
|
|
222
|
|
223 A character set (or "charset") is an ordered set of characters.
|
|
224 A particular character in a charset is indexed using one or
|
|
225 more "position codes", which are non-negative integers.
|
|
226 The number of position codes needed to identify a particular
|
|
227 character in a charset is called the "dimension" of the
|
|
228 charset. In XEmacs/Mule, all charsets have 1 or 2 dimensions,
|
|
229 and the size of all charsets (except for a few special cases)
|
|
230 is either 94, 96, 94 by 94, or 96 by 96. The range of
|
|
231 position codes used to index characters from any of these
|
|
232 types of character sets is as follows:
|
|
233
|
|
234 Charset type Position code 1 Position code 2
|
|
235 ------------------------------------------------------------
|
|
236 94 33 - 126 N/A
|
|
237 96 32 - 127 N/A
|
|
238 94x94 33 - 126 33 - 126
|
|
239 96x96 32 - 127 32 - 127
|
|
240
|
|
241 Note that in the above cases position codes do not start at
|
|
242 an expected value such as 0 or 1. The reason for this will
|
|
243 become clear later.
|
|
244
|
|
245 For example, Latin-1 is a 96-character charset, and JISX0208
|
|
246 (the Japanese national character set) is a 94x94-character
|
|
247 charset.
|
|
248
|
|
249 [Note that, although the ranges above define the *valid*
|
|
250 position codes for a charset, some of the slots in a particular
|
|
251 charset may in fact be empty. This is the case for JISX0208,
|
|
252 for example, where (e.g.) all the slots whose first
|
|
253 position code is in the range 118 - 127 are empty.]
|
|
254
|
|
255 There are three charsets that do not follow the above rules.
|
|
256 All of them have one dimension, and have ranges of position
|
|
257 codes as follows:
|
|
258
|
|
259 Charset name Position code 1
|
|
260 ------------------------------------
|
|
261 ASCII 0 - 127
|
|
262 Control-1 0 - 31
|
|
263 Composite 0 - some large number
|
|
264
|
|
265 (The upper bound of the position code for composite characters
|
|
266 has not yet been determined, but it will probably be at
|
|
267 least 16,383).
|
|
268
|
|
269 ASCII is the union of two subsidiary character sets:
|
|
270 Printing-ASCII (the printing ASCII character set,
|
|
271 consisting of position codes 33 - 126, like for a standard
|
|
272 94-character charset) and Control-ASCII (the non-printing
|
|
273 characters that would appear in a binary file with codes 0
|
|
274 - 32 and 127).
|
|
275
|
|
276 Control-1 contains the non-printing characters that would
|
|
277 appear in a binary file with codes 128 - 159.
|
|
278
|
|
279 Composite contains characters that are generated by
|
|
280 overstriking one or more characters from other charsets.
|
|
281
|
|
282 Note that some characters in ASCII, and all characters
|
|
283 in Control-1, are "control" (non-printing) characters.
|
|
284 These have no printed representation but instead control
|
|
285 some other function of the printing (e.g. TAB or 9 moves
|
|
286 the current character position to the next tab stop).
|
|
287 All other characters in all charsets are "graphic"
|
|
288 (printing) characters.
|
|
289
|
|
290 When a binary file is read in, the bytes in the file are
|
|
291 assigned to character sets as follows:
|
|
292
|
|
293 Bytes Character set Range
|
|
294 --------------------------------------------------
|
|
295 0 - 127 ASCII 0 - 127
|
|
296 128 - 159 Control-1 0 - 31
|
|
297 160 - 255 Latin-1 32 - 127
|
|
298
|
|
299 This is a bit ad-hoc but gets the job done.
|
|
300
|
|
301 2. Encodings
|
|
302 ============
|
|
303
|
|
304 An "encoding" is a way of numerically representing
|
|
305 characters from one or more character sets. If an encoding
|
|
306 only encompasses one character set, then the position codes
|
|
307 for the characters in that character set could be used
|
|
308 directly. This is not possible, however, if more than one
|
|
309 character set is to be used in the encoding.
|
|
310
|
|
311 For example, the conversion detailed above between bytes in
|
|
312 a binary file and characters is effectively an encoding
|
|
313 that encompasses the three character sets ASCII, Control-1,
|
|
314 and Latin-1 in a stream of 8-bit bytes.
|
|
315
|
|
316 Thus, an encoding can be viewed as a way of encoding
|
|
317 characters from a specified group of character sets using a
|
|
318 stream of bytes, each of which contains a fixed number of
|
|
319 bits (but not necessarily 8, as in the common usage of
|
|
320 "byte").
|
|
321
|
|
322 Here are descriptions of a couple of common
|
|
323 encodings:
|
|
324
|
|
325
|
|
326 A. Japanese EUC (Extended Unix Code)
|
|
327
|
|
328 This encompasses the character sets:
|
|
329 - Printing-ASCII,
|
|
330 - Katakana-JISX0201 (half-width katakana, the right half of JISX0201).
|
|
331 - Japanese-JISX0208
|
|
332 - Japanese-JISX0212
|
|
333 It uses 8-bit bytes.
|
|
334
|
|
335 Note that Printing-ASCII and Katakana-JISX0201 are 94-character
|
|
336 charsets, while Japanese-JISX0208 is a 94x94-character charset.
|
|
337
|
|
338 The encoding is as follows:
|
|
339
|
|
340 Character set Representation (PC == position-code)
|
|
341 ------------- --------------
|
|
342 Printing-ASCII PC1
|
|
343 Japanese-JISX0208 PC1 + 0x80 | PC2 + 0x80
|
|
344 Katakana-JISX0201 0x8E | PC1 + 0x80
|
|
345
|
|
346
|
|
347 B. JIS7
|
|
348
|
|
349 This encompasses the character sets:
|
|
350 - Printing-ASCII
|
|
351 - Latin-JISX0201 (the left half of JISX0201; this character set is
|
|
352 very similar to Printing-ASCII and is a 94-character charset)
|
|
353 - Japanese-JISX0208
|
|
354 - Katakana-JISX0201
|
|
355 It uses 7-bit bytes.
|
|
356
|
|
357 Unlike Japanese EUC, this is a "modal" encoding, which
|
|
358 means that there are multiple states that the encoding can
|
|
359 be in, which affect how the bytes are to be interpreted.
|
|
360 Special sequences of bytes (called "escape sequences")
|
|
361 are used to change states.
|
|
362
|
|
363 The encoding is as follows:
|
|
364
|
|
365 Character set Representation
|
|
366 ------------- --------------
|
|
367 Printing-ASCII PC1
|
|
368 Latin-JISX0201 PC1
|
|
369 Katakana-JISX0201 PC1
|
|
370 Japanese-JISX0208 PC1 | PC2
|
|
371
|
|
372 Escape sequence ASCII equivalent Meaning
|
|
373 --------------- ---------------- -------
|
|
374 0x1B 0x28 0x42 ESC ( B invoke Printing-ASCII
|
|
375 0x1B 0x28 0x4A ESC ( J invoke Latin-JISX0201
|
|
376 0x1B 0x28 0x49 ESC ( I invoke Katakana-JISX0201
|
|
377 0x1B 0x24 0x42 ESC $ B invoke Japanese-JISX0208
|
|
378
|
|
379 Initially, Printing-ASCII is invoked.
|
|
380
|
|
381 3. Internal Mule Encodings
|
|
382 ==========================
|
|
383
|
|
384 In XEmacs/Mule, each character set is assigned a unique number,
|
|
385 called a "leading byte". This is used in the encodings of a
|
|
386 character. Leading bytes are in the range 0x80 - 0xFF
|
|
387 (except for ASCII, which has a leading byte of 0), although
|
|
388 some leading bytes are reserved.
|
|
389
|
|
390 Charsets whose leading byte is in the range 0x80 - 0x9F are
|
|
391 called "official" and are used for built-in charsets.
|
|
392 Other charsets are called "private" and have leading bytes
|
|
393 in the range 0xA0 - 0xFF; these are user-defined charsets.
|
|
394
|
|
395 More specifically:
|
|
396
|
|
397 Character set Leading byte
|
|
398 ------------- ------------
|
|
399 ASCII 0 (0x7F in arrays indexed by leading byte)
|
|
400 Composite 0x8D
|
|
401 Dimension-1 Official 0x80 - 0x8C/0x8D
|
|
402 (0x8E is free)
|
|
403 Control 0x8F
|
|
404 Dimension-2 Official 0x90 - 0x99
|
|
405 (0x9A - 0x9D are free)
|
|
406 Dimension-1 Private Marker 0x9E
|
|
407 Dimension-2 Private Marker 0x9F
|
|
408 Dimension-1 Private 0xA0 - 0xEF
|
|
409 Dimension-2 Private 0xF0 - 0xFF
|
|
410
|
|
411 There are two internal encodings for characters in XEmacs/Mule.
|
|
412 One is called "string encoding" and is an 8-bit encoding that
|
|
413 is used for representing characters in a buffer or string.
|
|
414 It uses 1 to 4 bytes per character. The other is called
|
|
415 "character encoding" and is a 19-bit encoding that is used
|
|
416 for representing characters individually in a variable.
|
|
417
|
|
418 (In the following descriptions, we'll ignore composite
|
|
419 characters for the moment. We also give a general (structural)
|
|
420 overview first, followed later by the exact details.)
|
|
421
|
|
422 A. Internal String Encoding
|
|
423
|
|
424 ASCII characters are encoded using their position code directly.
|
|
425 Other characters are encoded using their leading byte followed
|
|
426 by their position code(s) with the high bit set. Characters
|
|
427 in private character sets have their leading byte prefixed with
|
|
428 a "leading byte prefix", which is either 0x9E or 0x9F. (No
|
|
429 character sets are ever assigned these leading bytes.) Specifically:
|
|
430
|
|
431 Character set Encoding (PC == position-code)
|
|
432 ------------- -------- (LB == leading-byte)
|
|
433 ASCII PC1 |
|
|
434 Control-1 LB | PC1 + 0xA0
|
|
435 Dimension-1 official LB | PC1 + 0x80
|
|
436 Dimension-1 private 0x9E | LB | PC1 + 0x80
|
|
437 Dimension-2 official LB | PC1 + 0x80 | PC2 + 0x80
|
|
438 Dimension-2 private 0x9F | LB | PC1 + 0x80 | PC2 + 0x80
|
|
439
|
|
440 The basic characteristic of this encoding is that the first byte
|
|
441 of all characters is in the range 0x00 - 0x9F, and the second and
|
|
442 following bytes of all characters are in the range 0xA0 - 0xFF.
|
|
443 This means that it is impossible to get out of sync, or more
|
|
444 specifically:
|
|
445
|
|
446 1. Given any byte position, the beginning of the character it is
|
|
447 within can be determined in constant time.
|
|
448 2. Given any byte position at the beginning of a character, the
|
|
449 beginning of the next character can be determined in constant
|
|
450 time.
|
|
451 3. Given any byte position at the beginning of a character, the
|
|
452 beginning of the previous character can be determined in constant
|
|
453 time.
|
|
454 4. Textual searches can simply treat encoded strings as if they
|
|
455 were encoded in a one-byte-per-character fashion rather than
|
|
456 the actual multi-byte encoding.
|
|
457
|
|
458 None of the standard non-modal encodings meet all of these
|
|
459 conditions. For example, EUC satisfies only (2) and (3), while
|
|
460 Shift-JIS and Big5 (not yet described) satisfy only (2). (All
|
|
461 non-modal encodings must satisfy (2), in order to be unambiguous.)
|
|
462
|
|
463 B. Internal Character Encoding
|
|
464
|
|
465 One 19-bit word represents a single character. The word is
|
|
466 separated into three fields:
|
|
467
|
|
468 Bit number: 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
|
|
469 <------------> <------------------> <------------------>
|
|
470 Field: 1 2 3
|
|
471
|
|
472 Note that fields 2 and 3 hold 7 bits each, while field 1 holds 5 bits.
|
|
473
|
|
474 Character set Field 1 Field 2 Field 3
|
|
475 ------------- ------- ------- -------
|
|
476 ASCII 0 0 PC1
|
|
477 range: (00 - 7F)
|
|
478 Control-1 0 1 PC1
|
|
479 range: (00 - 1F)
|
|
480 Dimension-1 official 0 LB - 0x7F PC1
|
|
481 range: (01 - 0D) (20 - 7F)
|
|
482 Dimension-1 private 0 LB - 0x80 PC1
|
|
483 range: (20 - 6F) (20 - 7F)
|
|
484 Dimension-2 official LB - 0x8F PC1 PC2
|
|
485 range: (01 - 0A) (20 - 7F) (20 - 7F)
|
|
486 Dimension-2 private LB - 0xE1 PC1 PC2
|
|
487 range: (0F - 1E) (20 - 7F) (20 - 7F)
|
|
488 Composite 0x1F ? ?
|
|
489
|
|
490 Note that character codes 0 - 255 are the same as the "binary encoding"
|
|
491 described above.
|
|
492 */
|
|
493
|
|
494 /*
|
|
495 About Unicode support:
|
|
496
|
|
497 Adding Unicode support is very desirable. Unicode will likely be a
|
|
498 very common representation in the future, and thus we should
|
|
499 represent Unicode characters using three bytes instead of four.
|
|
500 This means we need to find leading bytes for Unicode. Given that
|
|
501 there are 65,536 characters in Unicode and we can attach 96x96 =
|
|
502 9,216 characters per leading byte, we need eight leading bytes for
|
|
503 Unicode. We currently have four free (0x9A - 0x9D), and with a
|
|
504 little bit of rearranging we can get five: ASCII doesn't really
|
|
505 need to take up a leading byte. (We could just as well use 0x7F,
|
|
506 with a little change to the functions that assume that 0x80 is the
|
|
507 lowest leading byte.) This means we still need to dump three
|
|
508 leading bytes and move them into private space. The CNS charsets
|
|
509 are good candidates since they are rarely used, and
|
|
510 JAPANESE_JISX0208_1978 is becoming less and less used and could
|
|
511 also be dumped. */
|
|
512
|
|
513
|
|
514 /* Composite characters are characters constructed by overstriking two
|
|
515 or more regular characters.
|
|
516
|
|
517 1) The old Mule implementation involves storing composite characters
|
|
518 in a buffer as a tag followed by all of the actual characters
|
|
519 used to make up the composite character. I think this is a bad
|
|
520 idea; it greatly complicates code that wants to handle strings
|
|
521 one character at a time because it has to deal with the possibility
|
|
522 of great big ungainly characters. It's much more reasonable to
|
|
523 simply store an index into a table of composite characters.
|
|
524
|
|
525 2) The current implementation only allows for 16,384 separate
|
|
526 composite characters over the lifetime of the XEmacs process.
|
|
527 This could become a potential problem if the user
|
|
528 edited lots of different files that use composite characters.
|
|
529 Due to FSF bogosity, increasing the number of allowable
|
|
530 composite characters under Mule would decrease the number
|
|
531 of possible faces that can exist. Mule already has shrunk
|
|
532 this to 2048, and further shrinkage would become uncomfortable.
|
|
533 No such problems exist in XEmacs.
|
|
534
|
|
535 Composite characters could be represented as 0x8D C1 C2 C3,
|
|
536 where each C[1-3] is in the range 0xA0 - 0xFF. This allows
|
|
537 for slightly under 2^20 (one million) composite characters
|
|
538 over the XEmacs process lifetime, and you only need to
|
|
539 increase the size of a Mule character from 19 to 21 bits.
|
|
540 Or you could use 0x8D C1 C2 C3 C4, allowing for about
|
|
541 85 million (slightly over 2^26) composite characters. */
|
|
542
|
|
543
|
|
544 /************************************************************************/
|
|
545 /* declarations */
|
|
546 /************************************************************************/
|
|
547
|
|
548 Eistring the_eistring_zero_init, the_eistring_malloc_zero_init;
|
|
549
|
|
550 #define MAX_CHARBPOS_GAP_SIZE_3 (65535/3)
|
|
551 #define MAX_BYTEBPOS_GAP_SIZE_3 (3 * MAX_CHARBPOS_GAP_SIZE_3)
|
|
552
|
|
553 short three_to_one_table[1 + MAX_BYTEBPOS_GAP_SIZE_3];
|
|
554
|
|
555 #ifdef MULE
|
|
556
|
|
557 /* Table of number of bytes in the string representation of a character
|
|
558 indexed by the first byte of that representation.
|
|
559
|
|
560 rep_bytes_by_first_byte(c) is more efficient than the equivalent
|
|
561 canonical computation:
|
|
562
|
|
563 XCHARSET_REP_BYTES (CHARSET_BY_LEADING_BYTE (c)) */
|
|
564
|
|
565 const Bytecount rep_bytes_by_first_byte[0xA0] =
|
|
566 { /* 0x00 - 0x7f are for straight ASCII */
|
|
567 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
568 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
569 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
570 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
571 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
572 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
573 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
574 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
575 /* 0x80 - 0x8f are for Dimension-1 official charsets */
|
|
576 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
|
577 /* 0x90 - 0x9d are for Dimension-2 official charsets */
|
|
578 /* 0x9e is for Dimension-1 private charsets */
|
|
579 /* 0x9f is for Dimension-2 private charsets */
|
|
580 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4
|
|
581 };
|
|
582
|
|
583 #ifdef ENABLE_COMPOSITE_CHARS
|
|
584
|
|
585 /* Hash tables for composite chars. One maps string representing
|
|
586 composed chars to their equivalent chars; one goes the
|
|
587 other way. */
|
|
588 Lisp_Object Vcomposite_char_char2string_hash_table;
|
|
589 Lisp_Object Vcomposite_char_string2char_hash_table;
|
|
590
|
|
591 static int composite_char_row_next;
|
|
592 static int composite_char_col_next;
|
|
593
|
|
594 #endif /* ENABLE_COMPOSITE_CHARS */
|
|
595
|
|
596 #endif /* MULE */
|
|
597
|
|
598
|
|
599 /************************************************************************/
|
|
600 /* qxestr***() functions */
|
|
601 /************************************************************************/
|
|
602
|
|
603 /* Most are inline functions in lisp.h */
|
|
604
|
|
605 int
|
|
606 qxesprintf (Intbyte *buffer, const CIntbyte *format, ...)
|
|
607 {
|
|
608 va_list args;
|
|
609 int retval;
|
|
610
|
|
611 va_start (args, format);
|
|
612 retval = vsprintf ((char *) buffer, format, args);
|
|
613 va_end (args);
|
|
614
|
|
615 return retval;
|
|
616 }
|
|
617
|
|
618 /* strcasecmp() implementation from BSD */
|
|
619 static Intbyte strcasecmp_charmap[] = {
|
|
620 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
|
|
621 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
|
|
622 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
|
|
623 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
|
|
624 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
|
|
625 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
|
|
626 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
|
|
627 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
|
|
628 '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
|
|
629 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
|
|
630 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
|
|
631 '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
|
|
632 '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
|
|
633 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
|
|
634 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
|
|
635 '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
|
|
636 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
|
|
637 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
|
|
638 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
|
|
639 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
|
|
640 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
|
|
641 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
|
|
642 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
|
|
643 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
|
|
644 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
|
|
645 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
|
|
646 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
|
|
647 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
|
|
648 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
|
|
649 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
|
|
650 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
|
|
651 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
|
|
652 };
|
|
653
|
|
654 /* A version that works like generic strcasecmp() -- only collapsing
|
|
655 case in ASCII A-Z/a-z. This is safe on Mule strings due to the
|
|
656 current representation.
|
|
657
|
|
658 This version was written by some Berkeley coder, favoring
|
|
659 nanosecond improvements over clarity. In all other versions below,
|
|
660 we use symmetrical algorithms that may sacrifice a few machine
|
|
661 cycles but are MUCH MUCH clearer, which counts a lot more.
|
|
662 */
|
|
663
|
|
664 int
|
|
665 qxestrcasecmp (const Intbyte *s1, const Intbyte *s2)
|
|
666 {
|
|
667 Intbyte *cm = strcasecmp_charmap;
|
|
668
|
|
669 while (cm[*s1] == cm[*s2++])
|
|
670 if (*s1++ == '\0')
|
|
671 return (0);
|
|
672
|
|
673 return (cm[*s1] - cm[*--s2]);
|
|
674 }
|
|
675
|
|
676 int
|
|
677 ascii_strcasecmp (const Char_ASCII *s1, const Char_ASCII *s2)
|
|
678 {
|
|
679 return qxestrcasecmp ((const Intbyte *) s1, (const Intbyte *) s2);
|
|
680 }
|
|
681
|
|
682 int
|
|
683 qxestrcasecmp_c (const Intbyte *s1, const Char_ASCII *s2)
|
|
684 {
|
|
685 return qxestrcasecmp (s1, (const Intbyte *) s2);
|
|
686 }
|
|
687
|
|
688 /* An internationalized version that collapses case in a general fashion.
|
|
689 */
|
|
690
|
|
691 int
|
|
692 qxestrcasecmp_i18n (const Intbyte *s1, const Intbyte *s2)
|
|
693 {
|
|
694 while (*s1 && *s2)
|
|
695 {
|
|
696 if (DOWNCASE (0, charptr_emchar (s1)) !=
|
|
697 DOWNCASE (0, charptr_emchar (s2)))
|
|
698 break;
|
|
699 INC_CHARPTR (s1);
|
|
700 INC_CHARPTR (s2);
|
|
701 }
|
|
702
|
|
703 return (DOWNCASE (0, charptr_emchar (s1)) -
|
|
704 DOWNCASE (0, charptr_emchar (s2)));
|
|
705 }
|
|
706
|
|
707 /* The only difference between these next two and
|
|
708 qxememcasecmp()/qxememcasecmp_i18n() is that these two will stop if
|
|
709 both strings are equal and less than LEN in length, while
|
|
710 the mem...() versions would run off the end. */
|
|
711
|
|
712 int
|
|
713 qxestrncasecmp (const Intbyte *s1, const Intbyte *s2, Bytecount len)
|
|
714 {
|
|
715 Intbyte *cm = strcasecmp_charmap;
|
|
716
|
|
717 while (len--)
|
|
718 {
|
|
719 int diff = cm[*s1] - cm[*s2];
|
|
720 if (diff != 0)
|
|
721 return diff;
|
|
722 if (!*s1)
|
|
723 return 0;
|
|
724 s1++, s2++;
|
|
725 }
|
|
726
|
|
727 return 0;
|
|
728 }
|
|
729
|
|
730 int
|
|
731 ascii_strncasecmp (const Char_ASCII *s1, const Char_ASCII *s2, Bytecount len)
|
|
732 {
|
|
733 return qxestrncasecmp ((const Intbyte *) s1, (const Intbyte *) s2, len);
|
|
734 }
|
|
735
|
|
736 int
|
|
737 qxestrncasecmp_c (const Intbyte *s1, const Char_ASCII *s2, Bytecount len)
|
|
738 {
|
|
739 return qxestrncasecmp (s1, (const Intbyte *) s2, len);
|
|
740 }
|
|
741
|
801
|
/* Compare LEN_FROM_S1 worth of characters from S1 with the same number of
   characters from S2, case insensitive.  NOTE: Downcasing can convert
   characters from one length in bytes to another, so reversing S1 and S2
   is *NOT* a symmetric operation!  You must choose a length that agrees
   with S1. */
|
|
747
|
771
|
748 int
|
801
|
749 qxestrncasecmp_i18n (const Intbyte *s1, const Intbyte *s2,
|
|
750 Bytecount len_from_s1)
|
771
|
751 {
|
801
|
752 while (len_from_s1 > 0)
|
771
|
753 {
|
|
754 const Intbyte *old_s1 = s1;
|
|
755 int diff = (DOWNCASE (0, charptr_emchar (s1)) -
|
|
756 DOWNCASE (0, charptr_emchar (s2)));
|
|
757 if (diff != 0)
|
|
758 return diff;
|
|
759 if (!*s1)
|
|
760 return 0;
|
|
761 INC_CHARPTR (s1);
|
|
762 INC_CHARPTR (s2);
|
801
|
763 len_from_s1 -= s1 - old_s1;
|
771
|
764 }
|
|
765
|
|
766 return 0;
|
|
767 }
|
|
768
|
|
769 int
|
|
770 qxememcmp (const Intbyte *s1, const Intbyte *s2, Bytecount len)
|
|
771 {
|
|
772 return memcmp (s1, s2, len);
|
|
773 }
|
|
774
|
|
775 int
|
801
|
776 qxememcmp4 (const Intbyte *s1, Bytecount len1,
|
|
777 const Intbyte *s2, Bytecount len2)
|
|
778 {
|
|
779 int retval = qxememcmp (s1, s2, min (len1, len2));
|
|
780 if (retval)
|
|
781 return retval;
|
|
782 return len1 - len2;
|
|
783 }
|
|
784
|
|
785 int
|
771
|
786 qxememcasecmp (const Intbyte *s1, const Intbyte *s2, Bytecount len)
|
|
787 {
|
|
788 Intbyte *cm = strcasecmp_charmap;
|
|
789
|
|
790 while (len--)
|
|
791 {
|
|
792 int diff = cm[*s1] - cm[*s2];
|
|
793 if (diff != 0)
|
|
794 return diff;
|
|
795 s1++, s2++;
|
|
796 }
|
|
797
|
|
798 return 0;
|
|
799 }
|
|
800
|
|
801 int
|
801
|
802 qxememcasecmp4 (const Intbyte *s1, Bytecount len1,
|
|
803 const Intbyte *s2, Bytecount len2)
|
771
|
804 {
|
801
|
805 int retval = qxememcasecmp (s1, s2, min (len1, len2));
|
|
806 if (retval)
|
|
807 return retval;
|
|
808 return len1 - len2;
|
|
809 }
|
|
810
|
|
811 /* Do a character-by-character comparison, returning "which is greater" by
|
|
812 comparing the Emchar values. (#### Should have option to compare Unicode
|
|
813 points) */
|
|
814
|
|
815 int
|
|
816 qxetextcmp (const Intbyte *s1, Bytecount len1,
|
|
817 const Intbyte *s2, Bytecount len2)
|
|
818 {
|
|
819 while (len1 > 0 && len2 > 0)
|
771
|
820 {
|
|
821 const Intbyte *old_s1 = s1;
|
801
|
822 const Intbyte *old_s2 = s2;
|
|
823 int diff = charptr_emchar (s1) - charptr_emchar (s2);
|
|
824 if (diff != 0)
|
|
825 return diff;
|
|
826 INC_CHARPTR (s1);
|
|
827 INC_CHARPTR (s2);
|
|
828 len1 -= s1 - old_s1;
|
|
829 len2 -= s2 - old_s2;
|
|
830 }
|
|
831
|
|
832 assert (len1 >= 0 && len2 >= 0);
|
|
833 return len1 - len2;
|
|
834 }
|
|
835
|
|
836 int
|
|
837 qxetextcmp_matching (const Intbyte *s1, Bytecount len1,
|
|
838 const Intbyte *s2, Bytecount len2,
|
|
839 Charcount *matching)
|
|
840 {
|
|
841 *matching = 0;
|
|
842 while (len1 > 0 && len2 > 0)
|
|
843 {
|
|
844 const Intbyte *old_s1 = s1;
|
|
845 const Intbyte *old_s2 = s2;
|
|
846 int diff = charptr_emchar (s1) - charptr_emchar (s2);
|
|
847 if (diff != 0)
|
|
848 return diff;
|
|
849 INC_CHARPTR (s1);
|
|
850 INC_CHARPTR (s2);
|
|
851 len1 -= s1 - old_s1;
|
|
852 len2 -= s2 - old_s2;
|
|
853 (*matching)++;
|
|
854 }
|
|
855
|
|
856 assert (len1 >= 0 && len2 >= 0);
|
|
857 return len1 - len2;
|
|
858 }
|
|
859
|
|
/* Do a character-by-character comparison, returning "which is greater" by
   comparing the Emchar values, case insensitively (by downcasing both
   first). (#### Should have option to compare Unicode points)

   In this case, both lengths must be specified because downcasing can
   convert characters from one length in bytes to another; therefore, two
   blocks of text of different length might be equal.  If both compare
   equal up to the limit in length of one but not the other, the longer one
   is "greater". */
|
|
869
|
|
870 int
|
|
871 qxetextcasecmp (const Intbyte *s1, Bytecount len1,
|
|
872 const Intbyte *s2, Bytecount len2)
|
|
873 {
|
|
874 while (len1 > 0 && len2 > 0)
|
|
875 {
|
|
876 const Intbyte *old_s1 = s1;
|
|
877 const Intbyte *old_s2 = s2;
|
771
|
878 int diff = (DOWNCASE (0, charptr_emchar (s1)) -
|
|
879 DOWNCASE (0, charptr_emchar (s2)));
|
|
880 if (diff != 0)
|
|
881 return diff;
|
|
882 INC_CHARPTR (s1);
|
|
883 INC_CHARPTR (s2);
|
801
|
884 len1 -= s1 - old_s1;
|
|
885 len2 -= s2 - old_s2;
|
771
|
886 }
|
|
887
|
801
|
888 assert (len1 >= 0 && len2 >= 0);
|
|
889 return len1 - len2;
|
|
890 }
|
|
891
|
|
892 /* Like qxetextcasecmp() but also return number of characters at
|
|
893 beginning that match. */
|
|
894
|
|
895 int
|
|
896 qxetextcasecmp_matching (const Intbyte *s1, Bytecount len1,
|
|
897 const Intbyte *s2, Bytecount len2,
|
|
898 Charcount *matching)
|
|
899 {
|
|
900 *matching = 0;
|
|
901 while (len1 > 0 && len2 > 0)
|
|
902 {
|
|
903 const Intbyte *old_s1 = s1;
|
|
904 const Intbyte *old_s2 = s2;
|
|
905 int diff = (DOWNCASE (0, charptr_emchar (s1)) -
|
|
906 DOWNCASE (0, charptr_emchar (s2)));
|
|
907 if (diff != 0)
|
|
908 return diff;
|
|
909 INC_CHARPTR (s1);
|
|
910 INC_CHARPTR (s2);
|
|
911 len1 -= s1 - old_s1;
|
|
912 len2 -= s2 - old_s2;
|
|
913 (*matching)++;
|
|
914 }
|
|
915
|
|
916 assert (len1 >= 0 && len2 >= 0);
|
|
917 return len1 - len2;
|
771
|
918 }
|
|
919
|
|
920 int
|
|
921 lisp_strcasecmp (Lisp_Object s1, Lisp_Object s2)
|
|
922 {
|
|
923 Intbyte *cm = strcasecmp_charmap;
|
|
924 Intbyte *p1 = XSTRING_DATA (s1);
|
|
925 Intbyte *p2 = XSTRING_DATA (s2);
|
|
926 Intbyte *e1 = p1 + XSTRING_LENGTH (s1);
|
|
927 Intbyte *e2 = p2 + XSTRING_LENGTH (s2);
|
|
928
|
|
929 /* again, we use a symmetric algorithm and favor clarity over
|
|
930 nanosecond improvements. */
|
|
931 while (1)
|
|
932 {
|
|
933 /* if we reached the end of either string, compare lengths.
|
|
934 do NOT compare the final null byte against anything, in case
|
|
935 the other string also has a null byte at that position. */
|
|
936 if (p1 == e1 || p2 == e2)
|
|
937 return e1 - e2;
|
|
938 if (cm[*p1] != cm[*p2])
|
|
939 return cm[*p1] - cm[*p2];
|
|
940 p1++, p2++;
|
|
941 }
|
|
942 }
|
|
943
|
|
944 int
|
|
945 lisp_strcasecmp_i18n (Lisp_Object s1, Lisp_Object s2)
|
|
946 {
|
801
|
947 return qxetextcasecmp (XSTRING_DATA (s1), XSTRING_LENGTH (s1),
|
|
948 XSTRING_DATA (s2), XSTRING_LENGTH (s2));
|
771
|
949 }
|
|
950
|
|
951
|
|
952 /************************************************************************/
|
|
953 /* conversion between textual representations */
|
|
954 /************************************************************************/
|
|
955
|
|
956 /* NOTE: Does not reset the Dynarr. */
|
|
957
|
|
958 void
|
|
959 convert_intbyte_string_into_emchar_dynarr (const Intbyte *str, Bytecount len,
|
|
960 Emchar_dynarr *dyn)
|
|
961 {
|
|
962 const Intbyte *strend = str + len;
|
|
963
|
|
964 while (str < strend)
|
|
965 {
|
|
966 Emchar ch = charptr_emchar (str);
|
|
967 Dynarr_add (dyn, ch);
|
|
968 INC_CHARPTR (str);
|
|
969 }
|
|
970 }
|
|
971
|
|
972 Charcount
|
|
973 convert_intbyte_string_into_emchar_string (const Intbyte *str, Bytecount len,
|
|
974 Emchar *arr)
|
|
975 {
|
|
976 const Intbyte *strend = str + len;
|
|
977 Charcount newlen = 0;
|
|
978 while (str < strend)
|
|
979 {
|
|
980 Emchar ch = charptr_emchar (str);
|
|
981 arr[newlen++] = ch;
|
|
982 INC_CHARPTR (str);
|
|
983 }
|
|
984 return newlen;
|
|
985 }
|
|
986
|
|
987 /* Convert an array of Emchars into the equivalent string representation.
|
|
988 Store into the given Intbyte dynarr. Does not reset the dynarr.
|
|
989 Does not add a terminating zero. */
|
|
990
|
|
991 void
|
|
992 convert_emchar_string_into_intbyte_dynarr (Emchar *arr, int nels,
|
|
993 Intbyte_dynarr *dyn)
|
|
994 {
|
|
995 Intbyte str[MAX_EMCHAR_LEN];
|
|
996 int i;
|
|
997
|
|
998 for (i = 0; i < nels; i++)
|
|
999 {
|
|
1000 Bytecount len = set_charptr_emchar (str, arr[i]);
|
|
1001 Dynarr_add_many (dyn, str, len);
|
|
1002 }
|
|
1003 }
|
|
1004
|
|
1005 /* Convert an array of Emchars into the equivalent string representation.
|
|
1006 Malloc the space needed for this and return it. If LEN_OUT is not a
|
|
1007 NULL pointer, store into LEN_OUT the number of Intbytes in the
|
|
1008 malloc()ed string. Note that the actual number of Intbytes allocated
|
|
1009 is one more than this: the returned string is zero-terminated. */
|
|
1010
|
|
1011 Intbyte *
|
|
1012 convert_emchar_string_into_malloced_string (Emchar *arr, int nels,
|
|
1013 Bytecount *len_out)
|
|
1014 {
|
|
1015 /* Damn zero-termination. */
|
|
1016 Intbyte *str = (Intbyte *) alloca (nels * MAX_EMCHAR_LEN + 1);
|
|
1017 Intbyte *strorig = str;
|
|
1018 Bytecount len;
|
|
1019
|
|
1020 int i;
|
|
1021
|
|
1022 for (i = 0; i < nels; i++)
|
|
1023 str += set_charptr_emchar (str, arr[i]);
|
|
1024 *str = '\0';
|
|
1025 len = str - strorig;
|
|
1026 str = (Intbyte *) xmalloc (1 + len);
|
|
1027 memcpy (str, strorig, 1 + len);
|
|
1028 if (len_out)
|
|
1029 *len_out = len;
|
|
1030 return str;
|
|
1031 }
|
|
1032
|
|
1033
|
|
1034 /************************************************************************/
|
|
1035 /* charset properties of strings */
|
|
1036 /************************************************************************/
|
|
1037
|
|
1038 void
|
|
1039 find_charsets_in_intbyte_string (unsigned char *charsets, const Intbyte *str,
|
|
1040 Bytecount len)
|
|
1041 {
|
|
1042 #ifndef MULE
|
|
1043 /* Telescope this. */
|
|
1044 charsets[0] = 1;
|
|
1045 #else
|
|
1046 const Intbyte *strend = str + len;
|
|
1047 memset (charsets, 0, NUM_LEADING_BYTES);
|
|
1048
|
|
1049 /* #### SJT doesn't like this. */
|
|
1050 if (len == 0)
|
|
1051 {
|
|
1052 charsets[XCHARSET_LEADING_BYTE (Vcharset_ascii) - MIN_LEADING_BYTE] = 1;
|
|
1053 return;
|
|
1054 }
|
|
1055
|
|
1056 while (str < strend)
|
|
1057 {
|
|
1058 charsets[CHAR_LEADING_BYTE (charptr_emchar (str)) - MIN_LEADING_BYTE] =
|
|
1059 1;
|
|
1060 INC_CHARPTR (str);
|
|
1061 }
|
|
1062 #endif
|
|
1063 }
|
|
1064
|
|
1065 void
|
|
1066 find_charsets_in_emchar_string (unsigned char *charsets, const Emchar *str,
|
|
1067 Charcount len)
|
|
1068 {
|
|
1069 #ifndef MULE
|
|
1070 /* Telescope this. */
|
|
1071 charsets[0] = 1;
|
|
1072 #else
|
|
1073 int i;
|
|
1074
|
|
1075 memset (charsets, 0, NUM_LEADING_BYTES);
|
|
1076
|
|
1077 /* #### SJT doesn't like this. */
|
|
1078 if (len == 0)
|
|
1079 {
|
|
1080 charsets[XCHARSET_LEADING_BYTE (Vcharset_ascii) - MIN_LEADING_BYTE] = 1;
|
|
1081 return;
|
|
1082 }
|
|
1083
|
|
1084 for (i = 0; i < len; i++)
|
|
1085 {
|
|
1086 charsets[CHAR_LEADING_BYTE (str[i]) - MIN_LEADING_BYTE] = 1;
|
|
1087 }
|
|
1088 #endif
|
|
1089 }
|
|
1090
|
|
1091 int
|
|
1092 intbyte_string_displayed_columns (const Intbyte *str, Bytecount len)
|
|
1093 {
|
|
1094 int cols = 0;
|
|
1095 const Intbyte *end = str + len;
|
|
1096
|
|
1097 while (str < end)
|
|
1098 {
|
|
1099 #ifdef MULE
|
|
1100 Emchar ch = charptr_emchar (str);
|
|
1101 cols += XCHARSET_COLUMNS (CHAR_CHARSET (ch));
|
|
1102 #else
|
|
1103 cols++;
|
|
1104 #endif
|
|
1105 INC_CHARPTR (str);
|
|
1106 }
|
|
1107
|
|
1108 return cols;
|
|
1109 }
|
|
1110
|
|
1111 int
|
|
1112 emchar_string_displayed_columns (const Emchar *str, Charcount len)
|
|
1113 {
|
|
1114 #ifdef MULE
|
|
1115 int cols = 0;
|
|
1116 int i;
|
|
1117
|
|
1118 for (i = 0; i < len; i++)
|
|
1119 cols += XCHARSET_COLUMNS (CHAR_CHARSET (str[i]));
|
|
1120
|
|
1121 return cols;
|
|
1122 #else /* not MULE */
|
|
1123 return len;
|
|
1124 #endif
|
|
1125 }
|
|
1126
|
|
1127 Charcount
|
|
1128 intbyte_string_nonascii_chars (const Intbyte *str, Bytecount len)
|
|
1129 {
|
|
1130 #ifdef MULE
|
|
1131 const Intbyte *end = str + len;
|
|
1132 Charcount retval = 0;
|
|
1133
|
|
1134 while (str < end)
|
|
1135 {
|
|
1136 if (!BYTE_ASCII_P (*str))
|
|
1137 retval++;
|
|
1138 INC_CHARPTR (str);
|
|
1139 }
|
|
1140
|
|
1141 return retval;
|
|
1142 #else
|
|
1143 return 0;
|
|
1144 #endif
|
|
1145 }
|
|
1146
|
|
1147
|
|
1148 /***************************************************************************/
|
|
1149 /* Eistring helper functions */
|
|
1150 /***************************************************************************/
|
|
1151
|
|
1152 int
|
|
1153 eistr_casefiddle_1 (Intbyte *olddata, Bytecount len, Intbyte *newdata,
|
|
1154 int downp)
|
|
1155 {
|
|
1156 Intbyte *endp = olddata + len;
|
|
1157 Intbyte *newp = newdata;
|
|
1158 int changedp = 0;
|
|
1159
|
|
1160 while (olddata < endp)
|
|
1161 {
|
|
1162 Emchar c = charptr_emchar (olddata);
|
|
1163 Emchar newc;
|
|
1164
|
|
1165 if (downp)
|
|
1166 newc = DOWNCASE (0, c);
|
|
1167 else
|
|
1168 newc = UPCASE (0, c);
|
|
1169
|
|
1170 if (c != newc)
|
|
1171 changedp = 1;
|
|
1172
|
|
1173 newp += set_charptr_emchar (newp, newc);
|
|
1174 INC_CHARPTR (olddata);
|
|
1175 }
|
|
1176
|
|
1177 *newp = '\0';
|
|
1178
|
|
1179 return changedp ? newp - newdata : 0;
|
|
1180 }
|
|
1181
|
|
/* Return a buffer size that is at least NEEDED_SIZE, obtained by
   repeatedly growing OLDBUFSIZE by half (with a floor of 32) until it
   is large enough.  If OLDBUFSIZE already suffices it is returned
   unchanged.

   The growth step is written as X + X/2, which yields the same value
   as X * 3 / 2 but cannot overflow before the division.  If even that
   sum would exceed INT_MAX, NEEDED_SIZE itself is returned instead of
   wrapping around. */
int
eifind_large_enough_buffer (int oldbufsize, int needed_size)
{
  while (oldbufsize < needed_size)
    {
      if (oldbufsize <= 0)
        /* Growing a non-positive size by half cannot exceed the floor. */
        oldbufsize = 32;
      else
        {
          int half = oldbufsize / 2;

          if (oldbufsize > INT_MAX - half)
            return needed_size;
          oldbufsize += half;
          if (oldbufsize < 32)
            oldbufsize = 32;
        }
    }

  return oldbufsize;
}
|
|
1193
|
|
1194 void
|
|
1195 eito_malloc_1 (Eistring *ei)
|
|
1196 {
|
|
1197 if (ei->mallocp_)
|
|
1198 return;
|
|
1199 ei->mallocp_ = 1;
|
|
1200 if (ei->data_)
|
|
1201 {
|
|
1202 Intbyte *newdata;
|
|
1203
|
|
1204 ei->max_size_allocated_ =
|
|
1205 eifind_large_enough_buffer (0, ei->bytelen_ + 1);
|
|
1206 newdata = (Intbyte *) xmalloc (ei->max_size_allocated_);
|
|
1207 memcpy (newdata, ei->data_, ei->bytelen_ + 1);
|
|
1208 ei->data_ = newdata;
|
|
1209 }
|
|
1210
|
|
1211 if (ei->extdata_)
|
|
1212 {
|
|
1213 Extbyte *newdata = (Extbyte *) xmalloc (ei->extlen_ + 2);
|
|
1214
|
|
1215 memcpy (newdata, ei->extdata_, ei->extlen_);
|
|
1216 /* Double null-terminate in case of Unicode data */
|
|
1217 newdata[ei->extlen_] = '\0';
|
|
1218 newdata[ei->extlen_ + 1] = '\0';
|
|
1219 ei->extdata_ = newdata;
|
|
1220 }
|
|
1221 }
|
|
1222
|
|
1223 int
|
|
1224 eicmp_1 (Eistring *ei, Bytecount off, Charcount charoff,
|
|
1225 Bytecount len, Charcount charlen, const Intbyte *data,
|
|
1226 const Eistring *ei2, int is_c, int fold_case)
|
|
1227 {
|
|
1228 assert ((off < 0) != (charoff < 0));
|
|
1229 if (off < 0)
|
|
1230 {
|
|
1231 off = charcount_to_bytecount (ei->data_, charoff);
|
|
1232 if (charlen < 0)
|
|
1233 len = -1;
|
|
1234 else
|
|
1235 len = charcount_to_bytecount (ei->data_ + off, charlen);
|
|
1236 }
|
|
1237 if (len < 0)
|
|
1238 len = ei->bytelen_ - off;
|
|
1239
|
|
1240 assert (off >= 0 && off <= ei->bytelen_);
|
|
1241 assert (len >= 0 && off + len <= ei->bytelen_);
|
|
1242 assert ((data == 0) != (ei == 0));
|
|
1243 assert ((is_c != 0) == (data != 0));
|
|
1244 assert (fold_case >= 0 && fold_case <= 2);
|
|
1245
|
|
1246 {
|
|
1247 Bytecount dstlen;
|
|
1248 const Intbyte *src = ei->data_, *dst;
|
|
1249
|
|
1250 if (data)
|
|
1251 {
|
|
1252 dst = data;
|
|
1253 dstlen = qxestrlen (data);
|
|
1254 }
|
|
1255 else
|
|
1256 {
|
|
1257 dst = ei2->data_;
|
|
1258 dstlen = ei2->bytelen_;
|
|
1259 }
|
|
1260
|
|
1261 if (is_c)
|
|
1262 EI_ASSERT_ASCII ((Char_ASCII *) dst, dstlen);
|
|
1263
|
801
|
1264 return (fold_case == 0 ? qxememcmp4 (src, len, dst, dstlen) :
|
|
1265 fold_case == 1 ? qxememcasecmp4 (src, len, dst, dstlen) :
|
|
1266 qxetextcasecmp (src, len, dst, dstlen));
|
771
|
1267 }
|
|
1268 }
|
|
1269
|
|
1270 Intbyte *
|
|
1271 eicpyout_malloc_fmt (Eistring *eistr, Bytecount *len_out, Internal_Format fmt)
|
|
1272 {
|
|
1273 Intbyte *ptr;
|
|
1274
|
|
1275 assert (fmt == FORMAT_DEFAULT);
|
|
1276 ptr = xnew_array (Intbyte, eistr->bytelen_ + 1);
|
|
1277 if (len_out)
|
|
1278 *len_out = eistr->bytelen_;
|
|
1279 memcpy (ptr, eistr->data_, eistr->bytelen_ + 1);
|
|
1280 return ptr;
|
|
1281 }
|
|
1282
|
|
1283
|
|
1284 /************************************************************************/
|
|
1285 /* Charcount/Bytecount conversion */
|
|
1286 /************************************************************************/
|
|
1287
|
|
1288 /* Optimization. Do it. Live it. Love it. */
|
|
1289
|
|
1290 #ifdef MULE
|
|
1291
|
|
1292 /* We include the basic functions here that require no specific
|
|
1293 knowledge of how data is Mule-encoded into a buffer other
|
|
1294 than the basic (00 - 7F), (80 - 9F), (A0 - FF) scheme.
|
|
1295 Anything that requires more specific knowledge goes into
|
|
1296 mule-charset.c. */
|
|
1297
|
|
1298 /* Given a pointer to a text string and a length in bytes, return
|
|
1299 the equivalent length in characters. */
|
|
1300
|
|
1301 Charcount
|
|
1302 bytecount_to_charcount (const Intbyte *ptr, Bytecount len)
|
|
1303 {
|
|
1304 Charcount count = 0;
|
|
1305 const Intbyte *end = ptr + len;
|
|
1306
|
|
1307 #if SIZEOF_LONG == 8
|
|
1308 # define STRIDE_TYPE long
|
|
1309 # define HIGH_BIT_MASK 0x8080808080808080UL
|
|
1310 #elif SIZEOF_LONG_LONG == 8 && !(defined (i386) || defined (__i386__))
|
|
1311 # define STRIDE_TYPE long long
|
|
1312 # define HIGH_BIT_MASK 0x8080808080808080ULL
|
|
1313 #elif SIZEOF_LONG == 4
|
|
1314 # define STRIDE_TYPE long
|
|
1315 # define HIGH_BIT_MASK 0x80808080UL
|
|
1316 #else
|
|
1317 # error Add support for 128-bit systems here
|
|
1318 #endif
|
|
1319
|
|
1320 #define ALIGN_BITS ((EMACS_UINT) (ALIGNOF (STRIDE_TYPE) - 1))
|
|
1321 #define ALIGN_MASK (~ ALIGN_BITS)
|
|
1322 #define ALIGNED(ptr) ((((EMACS_UINT) ptr) & ALIGN_BITS) == 0)
|
|
1323 #define STRIDE sizeof (STRIDE_TYPE)
|
|
1324
|
|
1325 while (ptr < end)
|
|
1326 {
|
|
1327 if (BYTE_ASCII_P (*ptr))
|
|
1328 {
|
|
1329 /* optimize for long stretches of ASCII */
|
|
1330 if (! ALIGNED (ptr))
|
|
1331 ptr++, count++;
|
|
1332 else
|
|
1333 {
|
|
1334 const unsigned STRIDE_TYPE *ascii_end =
|
|
1335 (const unsigned STRIDE_TYPE *) ptr;
|
|
1336 /* This loop screams, because we can detect ASCII
|
|
1337 characters 4 or 8 at a time. */
|
|
1338 while ((const Intbyte *) ascii_end + STRIDE <= end
|
|
1339 && !(*ascii_end & HIGH_BIT_MASK))
|
|
1340 ascii_end++;
|
|
1341 if ((Intbyte *) ascii_end == ptr)
|
|
1342 ptr++, count++;
|
|
1343 else
|
|
1344 {
|
|
1345 count += (Intbyte *) ascii_end - ptr;
|
|
1346 ptr = (Intbyte *) ascii_end;
|
|
1347 }
|
|
1348 }
|
|
1349 }
|
|
1350 else
|
|
1351 {
|
|
1352 /* optimize for successive characters from the same charset */
|
|
1353 Intbyte leading_byte = *ptr;
|
|
1354 int bytes = REP_BYTES_BY_FIRST_BYTE (leading_byte);
|
|
1355 while ((ptr < end) && (*ptr == leading_byte))
|
|
1356 ptr += bytes, count++;
|
|
1357 }
|
|
1358 }
|
|
1359
|
|
1360 /* Bomb out if the specified substring ends in the middle
|
|
1361 of a character. Note that we might have already gotten
|
|
1362 a core dump above from an invalid reference, but at least
|
|
1363 we will get no farther than here.
|
|
1364
|
|
1365 This also catches len < 0. */
|
800
|
1366 text_checking_assert (ptr == end);
|
771
|
1367
|
|
1368 return count;
|
|
1369 }
|
|
1370
|
|
1371 /* Given a pointer to a text string and a length in characters, return
|
|
1372 the equivalent length in bytes. */
|
|
1373
|
|
1374 Bytecount
|
|
1375 charcount_to_bytecount (const Intbyte *ptr, Charcount len)
|
|
1376 {
|
|
1377 const Intbyte *newptr = ptr;
|
|
1378
|
800
|
1379 text_checking_assert (len >= 0);
|
771
|
1380 while (len > 0)
|
|
1381 {
|
|
1382 INC_CHARPTR (newptr);
|
|
1383 len--;
|
|
1384 }
|
|
1385 return newptr - ptr;
|
|
1386 }
|
|
1387
|
|
1388 inline static void
|
|
1389 update_entirely_ascii_p_flag (struct buffer *buf)
|
|
1390 {
|
|
1391 buf->text->entirely_ascii_p =
|
|
1392 (buf->text->mule_bufmin == 1 &&
|
|
1393 buf->text->mule_bufmax == buf->text->bufz &&
|
|
1394 !buf->text->mule_shifter &&
|
|
1395 !buf->text->mule_three_p);
|
|
1396 }
|
|
1397
|
|
1398 /* The next two functions are the actual meat behind the
|
|
1399 charbpos-to-bytebpos and bytebpos-to-charbpos conversions. Currently
|
|
1400 the method they use is fairly unsophisticated; see buffer.h.
|
|
1401
|
|
1402 Note that charbpos_to_bytebpos_func() is probably the most-called
|
|
1403 function in all of XEmacs. Therefore, it must be FAST FAST FAST.
|
|
1404 This is the reason why so much of the code is duplicated.
|
|
1405
|
|
1406 Similar considerations apply to bytebpos_to_charbpos_func(), although
|
|
1407 less so because the function is not called so often.
|
|
1408
|
|
1409 #### At some point this should use a more sophisticated method;
|
|
1410 see buffer.h. */
|
|
1411
|
|
/* Running counter from which charbpos_to_bytebpos_func() derives the
   position-cache slot to overwrite next. */
static int not_very_random_number;
|
|
1413
|
|
1414 Bytebpos
|
|
1415 charbpos_to_bytebpos_func (struct buffer *buf, Charbpos x)
|
|
1416 {
|
|
1417 Charbpos bufmin;
|
|
1418 Charbpos bufmax;
|
|
1419 Bytebpos bytmin;
|
|
1420 Bytebpos bytmax;
|
|
1421 int size;
|
|
1422 int forward_p;
|
|
1423 Bytebpos retval;
|
|
1424 int diff_so_far;
|
|
1425 int add_to_cache = 0;
|
|
1426
|
|
1427 /* Check for some cached positions, for speed. */
|
|
1428 if (x == BUF_PT (buf))
|
|
1429 return BI_BUF_PT (buf);
|
|
1430 if (x == BUF_ZV (buf))
|
|
1431 return BI_BUF_ZV (buf);
|
|
1432 if (x == BUF_BEGV (buf))
|
|
1433 return BI_BUF_BEGV (buf);
|
|
1434
|
|
1435 bufmin = buf->text->mule_bufmin;
|
|
1436 bufmax = buf->text->mule_bufmax;
|
|
1437 bytmin = buf->text->mule_bytmin;
|
|
1438 bytmax = buf->text->mule_bytmax;
|
|
1439 size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p;
|
|
1440
|
|
1441 /* The basic idea here is that we shift the "known region" up or down
|
|
1442 until it overlaps the specified position. We do this by moving
|
|
1443 the upper bound of the known region up one character at a time,
|
|
1444 and moving the lower bound of the known region up as necessary
|
|
1445 when the size of the character just seen changes.
|
|
1446
|
|
1447 We optimize this, however, by first shifting the known region to
|
|
1448 one of the cached points if it's close by. (We don't check BEG or
|
|
1449 Z, even though they're cached; most of the time these will be the
|
|
1450 same as BEGV and ZV, and when they're not, they're not likely
|
|
1451 to be used.) */
|
|
1452
|
|
1453 if (x > bufmax)
|
|
1454 {
|
|
1455 Charbpos diffmax = x - bufmax;
|
|
1456 Charbpos diffpt = x - BUF_PT (buf);
|
|
1457 Charbpos diffzv = BUF_ZV (buf) - x;
|
|
1458 /* #### This value could stand some more exploration. */
|
|
1459 Charcount heuristic_hack = (bufmax - bufmin) >> 2;
|
|
1460
|
|
1461 /* Check if the position is closer to PT or ZV than to the
|
|
1462 end of the known region. */
|
|
1463
|
|
1464 if (diffpt < 0)
|
|
1465 diffpt = -diffpt;
|
|
1466 if (diffzv < 0)
|
|
1467 diffzv = -diffzv;
|
|
1468
|
|
1469 /* But also implement a heuristic that favors the known region
|
|
1470 over PT or ZV. The reason for this is that switching to
|
|
1471 PT or ZV will wipe out the knowledge in the known region,
|
|
1472 which might be annoying if the known region is large and
|
|
1473 PT or ZV is not that much closer than the end of the known
|
|
1474 region. */
|
|
1475
|
|
1476 diffzv += heuristic_hack;
|
|
1477 diffpt += heuristic_hack;
|
|
1478 if (diffpt < diffmax && diffpt <= diffzv)
|
|
1479 {
|
|
1480 bufmax = bufmin = BUF_PT (buf);
|
|
1481 bytmax = bytmin = BI_BUF_PT (buf);
|
|
1482 /* We set the size to 1 even though it doesn't really
|
|
1483 matter because the new known region contains no
|
|
1484 characters. We do this because this is the most
|
|
1485 likely size of the characters around the new known
|
|
1486 region, and we avoid potential yuckiness that is
|
|
1487 done when size == 3. */
|
|
1488 size = 1;
|
|
1489 }
|
|
1490 if (diffzv < diffmax)
|
|
1491 {
|
|
1492 bufmax = bufmin = BUF_ZV (buf);
|
|
1493 bytmax = bytmin = BI_BUF_ZV (buf);
|
|
1494 size = 1;
|
|
1495 }
|
|
1496 }
|
800
|
1497 #ifdef ERROR_CHECK_TEXT
|
771
|
1498 else if (x >= bufmin)
|
|
1499 abort ();
|
|
1500 #endif
|
|
1501 else
|
|
1502 {
|
|
1503 Charbpos diffmin = bufmin - x;
|
|
1504 Charbpos diffpt = BUF_PT (buf) - x;
|
|
1505 Charbpos diffbegv = x - BUF_BEGV (buf);
|
|
1506 /* #### This value could stand some more exploration. */
|
|
1507 Charcount heuristic_hack = (bufmax - bufmin) >> 2;
|
|
1508
|
|
1509 if (diffpt < 0)
|
|
1510 diffpt = -diffpt;
|
|
1511 if (diffbegv < 0)
|
|
1512 diffbegv = -diffbegv;
|
|
1513
|
|
1514 /* But also implement a heuristic that favors the known region --
|
|
1515 see above. */
|
|
1516
|
|
1517 diffbegv += heuristic_hack;
|
|
1518 diffpt += heuristic_hack;
|
|
1519
|
|
1520 if (diffpt < diffmin && diffpt <= diffbegv)
|
|
1521 {
|
|
1522 bufmax = bufmin = BUF_PT (buf);
|
|
1523 bytmax = bytmin = BI_BUF_PT (buf);
|
|
1524 /* We set the size to 1 even though it doesn't really
|
|
1525 matter because the new known region contains no
|
|
1526 characters. We do this because this is the most
|
|
1527 likely size of the characters around the new known
|
|
1528 region, and we avoid potential yuckiness that is
|
|
1529 done when size == 3. */
|
|
1530 size = 1;
|
|
1531 }
|
|
1532 if (diffbegv < diffmin)
|
|
1533 {
|
|
1534 bufmax = bufmin = BUF_BEGV (buf);
|
|
1535 bytmax = bytmin = BI_BUF_BEGV (buf);
|
|
1536 size = 1;
|
|
1537 }
|
|
1538 }
|
|
1539
|
|
1540 diff_so_far = x > bufmax ? x - bufmax : bufmin - x;
|
|
1541 if (diff_so_far > 50)
|
|
1542 {
|
|
1543 /* If we have to move more than a certain amount, then look
|
|
1544 into our cache. */
|
|
1545 int minval = INT_MAX;
|
|
1546 int found = 0;
|
|
1547 int i;
|
|
1548
|
|
1549 add_to_cache = 1;
|
|
1550 /* I considered keeping the positions ordered. This would speed
|
|
1551 up this loop, but updating the cache would take longer, so
|
|
1552 it doesn't seem like it would really matter. */
|
|
1553 for (i = 0; i < 16; i++)
|
|
1554 {
|
|
1555 int diff = buf->text->mule_charbpos_cache[i] - x;
|
|
1556
|
|
1557 if (diff < 0)
|
|
1558 diff = -diff;
|
|
1559 if (diff < minval)
|
|
1560 {
|
|
1561 minval = diff;
|
|
1562 found = i;
|
|
1563 }
|
|
1564 }
|
|
1565
|
|
1566 if (minval < diff_so_far)
|
|
1567 {
|
|
1568 bufmax = bufmin = buf->text->mule_charbpos_cache[found];
|
|
1569 bytmax = bytmin = buf->text->mule_bytebpos_cache[found];
|
|
1570 size = 1;
|
|
1571 }
|
|
1572 }
|
|
1573
|
|
1574 /* It's conceivable that the caching above could lead to X being
|
|
1575 the same as one of the range edges. */
|
|
1576 if (x >= bufmax)
|
|
1577 {
|
|
1578 Bytebpos newmax;
|
|
1579 Bytecount newsize;
|
|
1580
|
|
1581 forward_p = 1;
|
|
1582 while (x > bufmax)
|
|
1583 {
|
|
1584 newmax = bytmax;
|
|
1585
|
|
1586 INC_BYTEBPOS (buf, newmax);
|
|
1587 newsize = newmax - bytmax;
|
|
1588 if (newsize != size)
|
|
1589 {
|
|
1590 bufmin = bufmax;
|
|
1591 bytmin = bytmax;
|
|
1592 size = newsize;
|
|
1593 }
|
|
1594 bytmax = newmax;
|
|
1595 bufmax++;
|
|
1596 }
|
|
1597 retval = bytmax;
|
|
1598
|
|
1599 /* #### Should go past the found location to reduce the number
|
|
1600 of times that this function is called */
|
|
1601 }
|
|
1602 else /* x < bufmin */
|
|
1603 {
|
|
1604 Bytebpos newmin;
|
|
1605 Bytecount newsize;
|
|
1606
|
|
1607 forward_p = 0;
|
|
1608 while (x < bufmin)
|
|
1609 {
|
|
1610 newmin = bytmin;
|
|
1611
|
|
1612 DEC_BYTEBPOS (buf, newmin);
|
|
1613 newsize = bytmin - newmin;
|
|
1614 if (newsize != size)
|
|
1615 {
|
|
1616 bufmax = bufmin;
|
|
1617 bytmax = bytmin;
|
|
1618 size = newsize;
|
|
1619 }
|
|
1620 bytmin = newmin;
|
|
1621 bufmin--;
|
|
1622 }
|
|
1623 retval = bytmin;
|
|
1624
|
|
1625 /* #### Should go past the found location to reduce the number
|
|
1626 of times that this function is called
|
|
1627 */
|
|
1628 }
|
|
1629
|
|
1630 /* If size is three, than we have to max sure that the range we
|
|
1631 discovered isn't too large, because we use a fixed-length
|
|
1632 table to divide by 3. */
|
|
1633
|
|
1634 if (size == 3)
|
|
1635 {
|
|
1636 int gap = bytmax - bytmin;
|
|
1637 buf->text->mule_three_p = 1;
|
|
1638 buf->text->mule_shifter = 1;
|
|
1639
|
|
1640 if (gap > MAX_BYTEBPOS_GAP_SIZE_3)
|
|
1641 {
|
|
1642 if (forward_p)
|
|
1643 {
|
|
1644 bytmin = bytmax - MAX_BYTEBPOS_GAP_SIZE_3;
|
|
1645 bufmin = bufmax - MAX_CHARBPOS_GAP_SIZE_3;
|
|
1646 }
|
|
1647 else
|
|
1648 {
|
|
1649 bytmax = bytmin + MAX_BYTEBPOS_GAP_SIZE_3;
|
|
1650 bufmax = bufmin + MAX_CHARBPOS_GAP_SIZE_3;
|
|
1651 }
|
|
1652 }
|
|
1653 }
|
|
1654 else
|
|
1655 {
|
|
1656 buf->text->mule_three_p = 0;
|
|
1657 if (size == 4)
|
|
1658 buf->text->mule_shifter = 2;
|
|
1659 else
|
|
1660 buf->text->mule_shifter = size - 1;
|
|
1661 }
|
|
1662
|
|
1663 buf->text->mule_bufmin = bufmin;
|
|
1664 buf->text->mule_bufmax = bufmax;
|
|
1665 buf->text->mule_bytmin = bytmin;
|
|
1666 buf->text->mule_bytmax = bytmax;
|
|
1667 update_entirely_ascii_p_flag (buf);
|
|
1668
|
|
1669 if (add_to_cache)
|
|
1670 {
|
|
1671 int replace_loc;
|
|
1672
|
|
1673 /* We throw away a "random" cached value and replace it with
|
|
1674 the new value. It doesn't actually have to be very random
|
|
1675 at all, just evenly distributed.
|
|
1676
|
|
1677 #### It would be better to use a least-recently-used algorithm
|
|
1678 or something that tries to space things out, but I'm not sure
|
|
1679 it's worth it to go to the trouble of maintaining that. */
|
|
1680 not_very_random_number += 621;
|
|
1681 replace_loc = not_very_random_number & 15;
|
|
1682 buf->text->mule_charbpos_cache[replace_loc] = x;
|
|
1683 buf->text->mule_bytebpos_cache[replace_loc] = retval;
|
|
1684 }
|
|
1685
|
|
1686 return retval;
|
|
1687 }
|
|
1688
|
|
1689 /* The logic in this function is almost identical to the logic in
|
|
1690 the previous function. */
|
|
1691
|
|
1692 Charbpos
|
|
1693 bytebpos_to_charbpos_func (struct buffer *buf, Bytebpos x)
|
|
1694 {
|
|
1695 Charbpos bufmin;
|
|
1696 Charbpos bufmax;
|
|
1697 Bytebpos bytmin;
|
|
1698 Bytebpos bytmax;
|
|
1699 int size;
|
|
1700 int forward_p;
|
|
1701 Charbpos retval;
|
|
1702 int diff_so_far;
|
|
1703 int add_to_cache = 0;
|
|
1704
|
|
1705 /* Check for some cached positions, for speed. */
|
|
1706 if (x == BI_BUF_PT (buf))
|
|
1707 return BUF_PT (buf);
|
|
1708 if (x == BI_BUF_ZV (buf))
|
|
1709 return BUF_ZV (buf);
|
|
1710 if (x == BI_BUF_BEGV (buf))
|
|
1711 return BUF_BEGV (buf);
|
|
1712
|
|
1713 bufmin = buf->text->mule_bufmin;
|
|
1714 bufmax = buf->text->mule_bufmax;
|
|
1715 bytmin = buf->text->mule_bytmin;
|
|
1716 bytmax = buf->text->mule_bytmax;
|
|
1717 size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p;
|
|
1718
|
|
1719 /* The basic idea here is that we shift the "known region" up or down
|
|
1720 until it overlaps the specified position. We do this by moving
|
|
1721 the upper bound of the known region up one character at a time,
|
|
1722 and moving the lower bound of the known region up as necessary
|
|
1723 when the size of the character just seen changes.
|
|
1724
|
|
1725 We optimize this, however, by first shifting the known region to
|
|
1726 one of the cached points if it's close by. (We don't check BI_BEG or
|
|
1727 BI_Z, even though they're cached; most of the time these will be the
|
|
1728 same as BI_BEGV and BI_ZV, and when they're not, they're not likely
|
|
1729 to be used.) */
|
|
1730
|
|
1731 if (x > bytmax)
|
|
1732 {
|
|
1733 Bytebpos diffmax = x - bytmax;
|
|
1734 Bytebpos diffpt = x - BI_BUF_PT (buf);
|
|
1735 Bytebpos diffzv = BI_BUF_ZV (buf) - x;
|
|
1736 /* #### This value could stand some more exploration. */
|
|
1737 Bytecount heuristic_hack = (bytmax - bytmin) >> 2;
|
|
1738
|
|
1739 /* Check if the position is closer to PT or ZV than to the
|
|
1740 end of the known region. */
|
|
1741
|
|
1742 if (diffpt < 0)
|
|
1743 diffpt = -diffpt;
|
|
1744 if (diffzv < 0)
|
|
1745 diffzv = -diffzv;
|
|
1746
|
|
1747 /* But also implement a heuristic that favors the known region
|
|
1748 over BI_PT or BI_ZV. The reason for this is that switching to
|
|
1749 BI_PT or BI_ZV will wipe out the knowledge in the known region,
|
|
1750 which might be annoying if the known region is large and
|
|
1751 BI_PT or BI_ZV is not that much closer than the end of the known
|
|
1752 region. */
|
|
1753
|
|
1754 diffzv += heuristic_hack;
|
|
1755 diffpt += heuristic_hack;
|
|
1756 if (diffpt < diffmax && diffpt <= diffzv)
|
|
1757 {
|
|
1758 bufmax = bufmin = BUF_PT (buf);
|
|
1759 bytmax = bytmin = BI_BUF_PT (buf);
|
|
1760 /* We set the size to 1 even though it doesn't really
|
|
1761 matter because the new known region contains no
|
|
1762 characters. We do this because this is the most
|
|
1763 likely size of the characters around the new known
|
|
1764 region, and we avoid potential yuckiness that is
|
|
1765 done when size == 3. */
|
|
1766 size = 1;
|
|
1767 }
|
|
1768 if (diffzv < diffmax)
|
|
1769 {
|
|
1770 bufmax = bufmin = BUF_ZV (buf);
|
|
1771 bytmax = bytmin = BI_BUF_ZV (buf);
|
|
1772 size = 1;
|
|
1773 }
|
|
1774 }
|
800
|
1775 #ifdef ERROR_CHECK_TEXT
|
771
|
1776 else if (x >= bytmin)
|
|
1777 abort ();
|
|
1778 #endif
|
|
1779 else
|
|
1780 {
|
|
1781 Bytebpos diffmin = bytmin - x;
|
|
1782 Bytebpos diffpt = BI_BUF_PT (buf) - x;
|
|
1783 Bytebpos diffbegv = x - BI_BUF_BEGV (buf);
|
|
1784 /* #### This value could stand some more exploration. */
|
|
1785 Bytecount heuristic_hack = (bytmax - bytmin) >> 2;
|
|
1786
|
|
1787 if (diffpt < 0)
|
|
1788 diffpt = -diffpt;
|
|
1789 if (diffbegv < 0)
|
|
1790 diffbegv = -diffbegv;
|
|
1791
|
|
1792 /* But also implement a heuristic that favors the known region --
|
|
1793 see above. */
|
|
1794
|
|
1795 diffbegv += heuristic_hack;
|
|
1796 diffpt += heuristic_hack;
|
|
1797
|
|
1798 if (diffpt < diffmin && diffpt <= diffbegv)
|
|
1799 {
|
|
1800 bufmax = bufmin = BUF_PT (buf);
|
|
1801 bytmax = bytmin = BI_BUF_PT (buf);
|
|
1802 /* We set the size to 1 even though it doesn't really
|
|
1803 matter because the new known region contains no
|
|
1804 characters. We do this because this is the most
|
|
1805 likely size of the characters around the new known
|
|
1806 region, and we avoid potential yuckiness that is
|
|
1807 done when size == 3. */
|
|
1808 size = 1;
|
|
1809 }
|
|
1810 if (diffbegv < diffmin)
|
|
1811 {
|
|
1812 bufmax = bufmin = BUF_BEGV (buf);
|
|
1813 bytmax = bytmin = BI_BUF_BEGV (buf);
|
|
1814 size = 1;
|
|
1815 }
|
|
1816 }
|
|
1817
|
|
1818 diff_so_far = x > bytmax ? x - bytmax : bytmin - x;
|
|
1819 if (diff_so_far > 50)
|
|
1820 {
|
|
1821 /* If we have to move more than a certain amount, then look
|
|
1822 into our cache. */
|
|
1823 int minval = INT_MAX;
|
|
1824 int found = 0;
|
|
1825 int i;
|
|
1826
|
|
1827 add_to_cache = 1;
|
|
1828 /* I considered keeping the positions ordered. This would speed
|
|
1829 up this loop, but updating the cache would take longer, so
|
|
1830 it doesn't seem like it would really matter. */
|
|
1831 for (i = 0; i < 16; i++)
|
|
1832 {
|
|
1833 int diff = buf->text->mule_bytebpos_cache[i] - x;
|
|
1834
|
|
1835 if (diff < 0)
|
|
1836 diff = -diff;
|
|
1837 if (diff < minval)
|
|
1838 {
|
|
1839 minval = diff;
|
|
1840 found = i;
|
|
1841 }
|
|
1842 }
|
|
1843
|
|
1844 if (minval < diff_so_far)
|
|
1845 {
|
|
1846 bufmax = bufmin = buf->text->mule_charbpos_cache[found];
|
|
1847 bytmax = bytmin = buf->text->mule_bytebpos_cache[found];
|
|
1848 size = 1;
|
|
1849 }
|
|
1850 }
|
|
1851
|
|
1852 /* It's conceivable that the caching above could lead to X being
|
|
1853 the same as one of the range edges. */
|
|
1854 if (x >= bytmax)
|
|
1855 {
|
|
1856 Bytebpos newmax;
|
|
1857 Bytecount newsize;
|
|
1858
|
|
1859 forward_p = 1;
|
|
1860 while (x > bytmax)
|
|
1861 {
|
|
1862 newmax = bytmax;
|
|
1863
|
|
1864 INC_BYTEBPOS (buf, newmax);
|
|
1865 newsize = newmax - bytmax;
|
|
1866 if (newsize != size)
|
|
1867 {
|
|
1868 bufmin = bufmax;
|
|
1869 bytmin = bytmax;
|
|
1870 size = newsize;
|
|
1871 }
|
|
1872 bytmax = newmax;
|
|
1873 bufmax++;
|
|
1874 }
|
|
1875 retval = bufmax;
|
|
1876
|
|
1877 /* #### Should go past the found location to reduce the number
|
|
1878 of times that this function is called */
|
|
1879 }
|
|
1880 else /* x <= bytmin */
|
|
1881 {
|
|
1882 Bytebpos newmin;
|
|
1883 Bytecount newsize;
|
|
1884
|
|
1885 forward_p = 0;
|
|
1886 while (x < bytmin)
|
|
1887 {
|
|
1888 newmin = bytmin;
|
|
1889
|
|
1890 DEC_BYTEBPOS (buf, newmin);
|
|
1891 newsize = bytmin - newmin;
|
|
1892 if (newsize != size)
|
|
1893 {
|
|
1894 bufmax = bufmin;
|
|
1895 bytmax = bytmin;
|
|
1896 size = newsize;
|
|
1897 }
|
|
1898 bytmin = newmin;
|
|
1899 bufmin--;
|
|
1900 }
|
|
1901 retval = bufmin;
|
|
1902
|
|
1903 /* #### Should go past the found location to reduce the number
|
|
1904 of times that this function is called
|
|
1905 */
|
|
1906 }
|
|
1907
|
|
1908 /* If size is three, then we have to make sure that the range we
|
|
1909 discovered isn't too large, because we use a fixed-length
|
|
1910 table to divide by 3. */
|
|
1911
|
|
1912 if (size == 3)
|
|
1913 {
|
|
1914 int gap = bytmax - bytmin;
|
|
1915 buf->text->mule_three_p = 1;
|
|
1916 buf->text->mule_shifter = 1;
|
|
1917
|
|
1918 if (gap > MAX_BYTEBPOS_GAP_SIZE_3)
|
|
1919 {
|
|
1920 if (forward_p)
|
|
1921 {
|
|
1922 bytmin = bytmax - MAX_BYTEBPOS_GAP_SIZE_3;
|
|
1923 bufmin = bufmax - MAX_CHARBPOS_GAP_SIZE_3;
|
|
1924 }
|
|
1925 else
|
|
1926 {
|
|
1927 bytmax = bytmin + MAX_BYTEBPOS_GAP_SIZE_3;
|
|
1928 bufmax = bufmin + MAX_CHARBPOS_GAP_SIZE_3;
|
|
1929 }
|
|
1930 }
|
|
1931 }
|
|
1932 else
|
|
1933 {
|
|
1934 buf->text->mule_three_p = 0;
|
|
1935 if (size == 4)
|
|
1936 buf->text->mule_shifter = 2;
|
|
1937 else
|
|
1938 buf->text->mule_shifter = size - 1;
|
|
1939 }
|
|
1940
|
|
1941 buf->text->mule_bufmin = bufmin;
|
|
1942 buf->text->mule_bufmax = bufmax;
|
|
1943 buf->text->mule_bytmin = bytmin;
|
|
1944 buf->text->mule_bytmax = bytmax;
|
|
1945 update_entirely_ascii_p_flag (buf);
|
|
1946
|
|
1947 if (add_to_cache)
|
|
1948 {
|
|
1949 int replace_loc;
|
|
1950
|
|
1951 /* We throw away a "random" cached value and replace it with
|
|
1952 the new value. It doesn't actually have to be very random
|
|
1953 at all, just evenly distributed.
|
|
1954
|
|
1955 #### It would be better to use a least-recently-used algorithm
|
|
1956 or something that tries to space things out, but I'm not sure
|
|
1957 it's worth it to go to the trouble of maintaining that. */
|
|
1958 not_very_random_number += 621;
|
|
1959 replace_loc = not_very_random_number & 15;
|
|
1960 buf->text->mule_charbpos_cache[replace_loc] = retval;
|
|
1961 buf->text->mule_bytebpos_cache[replace_loc] = x;
|
|
1962 }
|
|
1963
|
|
1964 return retval;
|
|
1965 }
|
|
1966
|
|
1967 /* Text of length BYTELENGTH and CHARLENGTH (in different units)
|
|
1968 was inserted at charbpos START. */
|
|
1969
|
|
1970 void
|
|
1971 buffer_mule_signal_inserted_region (struct buffer *buf, Charbpos start,
|
|
1972 Bytecount bytelength,
|
|
1973 Charcount charlength)
|
|
1974 {
|
|
1975 int size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p;
|
|
1976 int i;
|
|
1977
|
|
1978 /* Adjust the cache of known positions. */
|
|
1979 for (i = 0; i < 16; i++)
|
|
1980 {
|
|
1981
|
|
1982 if (buf->text->mule_charbpos_cache[i] > start)
|
|
1983 {
|
|
1984 buf->text->mule_charbpos_cache[i] += charlength;
|
|
1985 buf->text->mule_bytebpos_cache[i] += bytelength;
|
|
1986 }
|
|
1987 }
|
|
1988
|
|
1989 if (start >= buf->text->mule_bufmax)
|
|
1990 goto done;
|
|
1991
|
|
1992 /* The insertion is either before the known region, in which case
|
|
1993 it shoves it forward; or within the known region, in which case
|
|
1994 it shoves the end forward. (But it may make the known region
|
|
1995 inconsistent, so we may have to shorten it.) */
|
|
1996
|
|
1997 if (start <= buf->text->mule_bufmin)
|
|
1998 {
|
|
1999 buf->text->mule_bufmin += charlength;
|
|
2000 buf->text->mule_bufmax += charlength;
|
|
2001 buf->text->mule_bytmin += bytelength;
|
|
2002 buf->text->mule_bytmax += bytelength;
|
|
2003 }
|
|
2004 else
|
|
2005 {
|
|
2006 Charbpos end = start + charlength;
|
|
2007 /* the insertion point divides the known region in two.
|
|
2008 Keep the longer half, at least, and expand into the
|
|
2009 inserted chunk as much as possible. */
|
|
2010
|
|
2011 if (start - buf->text->mule_bufmin > buf->text->mule_bufmax - start)
|
|
2012 {
|
|
2013 Bytebpos bytestart = (buf->text->mule_bytmin
|
|
2014 + size * (start - buf->text->mule_bufmin));
|
|
2015 Bytebpos bytenew;
|
|
2016
|
|
2017 while (start < end)
|
|
2018 {
|
|
2019 bytenew = bytestart;
|
|
2020 INC_BYTEBPOS (buf, bytenew);
|
|
2021 if (bytenew - bytestart != size)
|
|
2022 break;
|
|
2023 start++;
|
|
2024 bytestart = bytenew;
|
|
2025 }
|
|
2026 if (start != end)
|
|
2027 {
|
|
2028 buf->text->mule_bufmax = start;
|
|
2029 buf->text->mule_bytmax = bytestart;
|
|
2030 }
|
|
2031 else
|
|
2032 {
|
|
2033 buf->text->mule_bufmax += charlength;
|
|
2034 buf->text->mule_bytmax += bytelength;
|
|
2035 }
|
|
2036 }
|
|
2037 else
|
|
2038 {
|
|
2039 Bytebpos byteend = (buf->text->mule_bytmin
|
|
2040 + size * (start - buf->text->mule_bufmin)
|
|
2041 + bytelength);
|
|
2042 Bytebpos bytenew;
|
|
2043
|
|
2044 buf->text->mule_bufmax += charlength;
|
|
2045 buf->text->mule_bytmax += bytelength;
|
|
2046
|
|
2047 while (end > start)
|
|
2048 {
|
|
2049 bytenew = byteend;
|
|
2050 DEC_BYTEBPOS (buf, bytenew);
|
|
2051 if (byteend - bytenew != size)
|
|
2052 break;
|
|
2053 end--;
|
|
2054 byteend = bytenew;
|
|
2055 }
|
|
2056 if (start != end)
|
|
2057 {
|
|
2058 buf->text->mule_bufmin = end;
|
|
2059 buf->text->mule_bytmin = byteend;
|
|
2060 }
|
|
2061 }
|
|
2062 }
|
|
2063 done:
|
|
2064 update_entirely_ascii_p_flag (buf);
|
|
2065 }
|
|
2066
|
|
2067 /* Text from START to END (equivalent in Bytebposs: from BI_START to
|
|
2068 BI_END) was deleted. */
|
|
2069
|
|
2070 void
|
|
2071 buffer_mule_signal_deleted_region (struct buffer *buf, Charbpos start,
|
|
2072 Charbpos end, Bytebpos bi_start,
|
|
2073 Bytebpos bi_end)
|
|
2074 {
|
|
2075 int i;
|
|
2076
|
|
2077 /* Adjust the cache of known positions. */
|
|
2078 for (i = 0; i < 16; i++)
|
|
2079 {
|
|
2080 /* After the end; gets shoved backward */
|
|
2081 if (buf->text->mule_charbpos_cache[i] > end)
|
|
2082 {
|
|
2083 buf->text->mule_charbpos_cache[i] -= end - start;
|
|
2084 buf->text->mule_bytebpos_cache[i] -= bi_end - bi_start;
|
|
2085 }
|
|
2086 /* In the range; moves to start of range */
|
|
2087 else if (buf->text->mule_charbpos_cache[i] > start)
|
|
2088 {
|
|
2089 buf->text->mule_charbpos_cache[i] = start;
|
|
2090 buf->text->mule_bytebpos_cache[i] = bi_start;
|
|
2091 }
|
|
2092 }
|
|
2093
|
|
2094 /* We don't care about any text after the end of the known region. */
|
|
2095
|
|
2096 end = min (end, buf->text->mule_bufmax);
|
|
2097 bi_end = min (bi_end, buf->text->mule_bytmax);
|
|
2098 if (start >= end)
|
|
2099 goto done;
|
|
2100
|
|
2101 /* The end of the known region offsets by the total amount of deletion,
|
|
2102 since it's all before it. */
|
|
2103
|
|
2104 buf->text->mule_bufmax -= end - start;
|
|
2105 buf->text->mule_bytmax -= bi_end - bi_start;
|
|
2106
|
|
2107 /* Now we don't care about any text after the start of the known region. */
|
|
2108
|
|
2109 end = min (end, buf->text->mule_bufmin);
|
|
2110 bi_end = min (bi_end, buf->text->mule_bytmin);
|
|
2111 if (start < end)
|
|
2112 {
|
|
2113 buf->text->mule_bufmin -= end - start;
|
|
2114 buf->text->mule_bytmin -= bi_end - bi_start;
|
|
2115 }
|
|
2116
|
|
2117 done:
|
|
2118 update_entirely_ascii_p_flag (buf);
|
|
2119 }
|
|
2120
|
|
2121 #endif /* MULE */
|
|
2122
|
800
|
2123 #ifdef ERROR_CHECK_TEXT
|
771
|
2124
|
|
2125 Bytebpos
|
|
2126 charbpos_to_bytebpos (struct buffer *buf, Charbpos x)
|
|
2127 {
|
|
2128 Bytebpos retval = real_charbpos_to_bytebpos (buf, x);
|
|
2129 ASSERT_VALID_BYTEBPOS_UNSAFE (buf, retval);
|
|
2130 return retval;
|
|
2131 }
|
|
2132
|
|
2133 Charbpos
|
|
2134 bytebpos_to_charbpos (struct buffer *buf, Bytebpos x)
|
|
2135 {
|
|
2136 ASSERT_VALID_BYTEBPOS_UNSAFE (buf, x);
|
|
2137 return real_bytebpos_to_charbpos (buf, x);
|
|
2138 }
|
|
2139
|
800
|
2140 #endif /* ERROR_CHECK_TEXT */
|
771
|
2141
|
|
2142
|
|
2143 /************************************************************************/
|
|
2144 /* verifying buffer and string positions */
|
|
2145 /************************************************************************/
|
|
2146
|
|
2147 /* Functions below are tagged with either _byte or _char indicating
|
|
2148 whether they return byte or character positions. For a buffer,
|
|
2149 a character position is a "Charbpos" and a byte position is a "Bytebpos".
|
|
2150 For strings, these are sometimes typed using "Charcount" and
|
|
2151 "Bytecount". */
|
|
2152
|
|
2153 /* Flags for the functions below are:
|
|
2154
|
|
2155 GB_ALLOW_PAST_ACCESSIBLE
|
|
2156
|
|
2157 Allow positions to range over the entire buffer (BUF_BEG to BUF_Z),
|
|
2158 rather than just the accessible portion (BUF_BEGV to BUF_ZV).
|
|
2159 For strings, this flag has no effect.
|
|
2160
|
|
2161 GB_COERCE_RANGE
|
|
2162
|
|
2163 If the position is outside the allowable range, return the lower
|
|
2164 or upper bound of the range, whichever is closer to the specified
|
|
2165 position.
|
|
2166
|
|
2167 GB_NO_ERROR_IF_BAD
|
|
2168
|
|
2169 If the position is outside the allowable range, return -1.
|
|
2170
|
|
2171 GB_NEGATIVE_FROM_END
|
|
2172
|
|
2173 If a value is negative, treat it as an offset from the end.
|
|
2174 Only applies to strings.
|
|
2175
|
|
2176 The following additional flags apply only to the functions
|
|
2177 that return ranges:
|
|
2178
|
|
2179 GB_ALLOW_NIL
|
|
2180
|
|
2181 Either or both positions can be nil. If FROM is nil,
|
|
2182 FROM_OUT will contain the lower bound of the allowed range.
|
|
2183 If TO is nil, TO_OUT will contain the upper bound of the
|
|
2184 allowed range.
|
|
2185
|
|
2186 GB_CHECK_ORDER
|
|
2187
|
|
2188 FROM must contain the lower bound and TO the upper bound
|
|
2189 of the range. If the positions are reversed, an error is
|
|
2190 signalled.
|
|
2191
|
|
2192 The following is a combination flag:
|
|
2193
|
|
2194 GB_HISTORICAL_STRING_BEHAVIOR
|
|
2195
|
|
2196 Equivalent to (GB_NEGATIVE_FROM_END | GB_ALLOW_NIL).
|
|
2197 */
|
|
2198
|
|
2199 /* Return a buffer position stored in a Lisp_Object. Full
|
|
2200 error-checking is done on the position. Flags can be specified to
|
|
2201 control the behavior of out-of-range values. The default behavior
|
|
2202 is to require that the position is within the accessible part of
|
|
2203 the buffer (BEGV and ZV), and to signal an error if the position is
|
|
2204 out of range.
|
|
2205
|
|
2206 */
|
|
2207
|
|
2208 Charbpos
|
|
2209 get_buffer_pos_char (struct buffer *b, Lisp_Object pos, unsigned int flags)
|
|
2210 {
|
|
2211 /* Does not GC */
|
|
2212 Charbpos ind;
|
|
2213 Charbpos min_allowed, max_allowed;
|
|
2214
|
|
2215 CHECK_INT_COERCE_MARKER (pos);
|
|
2216 ind = XINT (pos);
|
|
2217 min_allowed = flags & GB_ALLOW_PAST_ACCESSIBLE ? BUF_BEG (b) : BUF_BEGV (b);
|
|
2218 max_allowed = flags & GB_ALLOW_PAST_ACCESSIBLE ? BUF_Z (b) : BUF_ZV (b);
|
|
2219
|
|
2220 if (ind < min_allowed || ind > max_allowed)
|
|
2221 {
|
|
2222 if (flags & GB_COERCE_RANGE)
|
|
2223 ind = ind < min_allowed ? min_allowed : max_allowed;
|
|
2224 else if (flags & GB_NO_ERROR_IF_BAD)
|
|
2225 ind = -1;
|
|
2226 else
|
|
2227 {
|
793
|
2228 Lisp_Object buffer = wrap_buffer (b);
|
|
2229
|
771
|
2230 args_out_of_range (buffer, pos);
|
|
2231 }
|
|
2232 }
|
|
2233
|
|
2234 return ind;
|
|
2235 }
|
|
2236
|
|
2237 Bytebpos
|
|
2238 get_buffer_pos_byte (struct buffer *b, Lisp_Object pos, unsigned int flags)
|
|
2239 {
|
|
2240 Charbpos bpos = get_buffer_pos_char (b, pos, flags);
|
|
2241 if (bpos < 0) /* could happen with GB_NO_ERROR_IF_BAD */
|
|
2242 return -1;
|
|
2243 return charbpos_to_bytebpos (b, bpos);
|
|
2244 }
|
|
2245
|
|
2246 /* Return a pair of buffer positions representing a range of text,
|
|
2247 taken from a pair of Lisp_Objects. Full error-checking is
|
|
2248 done on the positions. Flags can be specified to control the
|
|
2249 behavior of out-of-range values. The default behavior is to
|
|
2250 allow the range bounds to be specified in either order
|
|
2251 (however, FROM_OUT will always be the lower bound of the range
|
|
2252 and TO_OUT the upper bound),to require that the positions
|
|
2253 are within the accessible part of the buffer (BEGV and ZV),
|
|
2254 and to signal an error if the positions are out of range.
|
|
2255 */
|
|
2256
|
|
2257 void
|
|
2258 get_buffer_range_char (struct buffer *b, Lisp_Object from, Lisp_Object to,
|
|
2259 Charbpos *from_out, Charbpos *to_out, unsigned int flags)
|
|
2260 {
|
|
2261 /* Does not GC */
|
|
2262 Charbpos min_allowed, max_allowed;
|
|
2263
|
|
2264 min_allowed = (flags & GB_ALLOW_PAST_ACCESSIBLE) ?
|
|
2265 BUF_BEG (b) : BUF_BEGV (b);
|
|
2266 max_allowed = (flags & GB_ALLOW_PAST_ACCESSIBLE) ?
|
|
2267 BUF_Z (b) : BUF_ZV (b);
|
|
2268
|
|
2269 if (NILP (from) && (flags & GB_ALLOW_NIL))
|
|
2270 *from_out = min_allowed;
|
|
2271 else
|
|
2272 *from_out = get_buffer_pos_char (b, from, flags | GB_NO_ERROR_IF_BAD);
|
|
2273
|
|
2274 if (NILP (to) && (flags & GB_ALLOW_NIL))
|
|
2275 *to_out = max_allowed;
|
|
2276 else
|
|
2277 *to_out = get_buffer_pos_char (b, to, flags | GB_NO_ERROR_IF_BAD);
|
|
2278
|
|
2279 if ((*from_out < 0 || *to_out < 0) && !(flags & GB_NO_ERROR_IF_BAD))
|
|
2280 {
|
793
|
2281 Lisp_Object buffer = wrap_buffer (b);
|
|
2282
|
771
|
2283 args_out_of_range_3 (buffer, from, to);
|
|
2284 }
|
|
2285
|
|
2286 if (*from_out >= 0 && *to_out >= 0 && *from_out > *to_out)
|
|
2287 {
|
|
2288 if (flags & GB_CHECK_ORDER)
|
|
2289 invalid_argument_2 ("start greater than end", from, to);
|
|
2290 else
|
|
2291 {
|
|
2292 Charbpos temp = *from_out;
|
|
2293 *from_out = *to_out;
|
|
2294 *to_out = temp;
|
|
2295 }
|
|
2296 }
|
|
2297 }
|
|
2298
|
|
2299 void
|
|
2300 get_buffer_range_byte (struct buffer *b, Lisp_Object from, Lisp_Object to,
|
|
2301 Bytebpos *from_out, Bytebpos *to_out, unsigned int flags)
|
|
2302 {
|
|
2303 Charbpos s, e;
|
|
2304
|
|
2305 get_buffer_range_char (b, from, to, &s, &e, flags);
|
|
2306 if (s >= 0)
|
|
2307 *from_out = charbpos_to_bytebpos (b, s);
|
|
2308 else /* could happen with GB_NO_ERROR_IF_BAD */
|
|
2309 *from_out = -1;
|
|
2310 if (e >= 0)
|
|
2311 *to_out = charbpos_to_bytebpos (b, e);
|
|
2312 else
|
|
2313 *to_out = -1;
|
|
2314 }
|
|
2315
|
|
2316 static Charcount
|
|
2317 get_string_pos_char_1 (Lisp_Object string, Lisp_Object pos, unsigned int flags,
|
|
2318 Charcount known_length)
|
|
2319 {
|
|
2320 Charcount ccpos;
|
|
2321 Charcount min_allowed = 0;
|
|
2322 Charcount max_allowed = known_length;
|
|
2323
|
|
2324 /* Computation of KNOWN_LENGTH is potentially expensive so we pass
|
|
2325 it in. */
|
|
2326 CHECK_INT (pos);
|
|
2327 ccpos = XINT (pos);
|
|
2328 if (ccpos < 0 && flags & GB_NEGATIVE_FROM_END)
|
|
2329 ccpos += max_allowed;
|
|
2330
|
|
2331 if (ccpos < min_allowed || ccpos > max_allowed)
|
|
2332 {
|
|
2333 if (flags & GB_COERCE_RANGE)
|
|
2334 ccpos = ccpos < min_allowed ? min_allowed : max_allowed;
|
|
2335 else if (flags & GB_NO_ERROR_IF_BAD)
|
|
2336 ccpos = -1;
|
|
2337 else
|
|
2338 args_out_of_range (string, pos);
|
|
2339 }
|
|
2340
|
|
2341 return ccpos;
|
|
2342 }
|
|
2343
|
|
2344 Charcount
|
|
2345 get_string_pos_char (Lisp_Object string, Lisp_Object pos, unsigned int flags)
|
|
2346 {
|
|
2347 return get_string_pos_char_1 (string, pos, flags,
|
|
2348 XSTRING_CHAR_LENGTH (string));
|
|
2349 }
|
|
2350
|
|
2351 Bytecount
|
|
2352 get_string_pos_byte (Lisp_Object string, Lisp_Object pos, unsigned int flags)
|
|
2353 {
|
|
2354 Charcount ccpos = get_string_pos_char (string, pos, flags);
|
|
2355 if (ccpos < 0) /* could happen with GB_NO_ERROR_IF_BAD */
|
|
2356 return -1;
|
793
|
2357 return string_index_char_to_byte (string, ccpos);
|
771
|
2358 }
|
|
2359
|
|
2360 void
|
|
2361 get_string_range_char (Lisp_Object string, Lisp_Object from, Lisp_Object to,
|
|
2362 Charcount *from_out, Charcount *to_out,
|
|
2363 unsigned int flags)
|
|
2364 {
|
|
2365 Charcount min_allowed = 0;
|
|
2366 Charcount max_allowed = XSTRING_CHAR_LENGTH (string);
|
|
2367
|
|
2368 if (NILP (from) && (flags & GB_ALLOW_NIL))
|
|
2369 *from_out = min_allowed;
|
|
2370 else
|
|
2371 *from_out = get_string_pos_char_1 (string, from,
|
|
2372 flags | GB_NO_ERROR_IF_BAD,
|
|
2373 max_allowed);
|
|
2374
|
|
2375 if (NILP (to) && (flags & GB_ALLOW_NIL))
|
|
2376 *to_out = max_allowed;
|
|
2377 else
|
|
2378 *to_out = get_string_pos_char_1 (string, to,
|
|
2379 flags | GB_NO_ERROR_IF_BAD,
|
|
2380 max_allowed);
|
|
2381
|
|
2382 if ((*from_out < 0 || *to_out < 0) && !(flags & GB_NO_ERROR_IF_BAD))
|
|
2383 args_out_of_range_3 (string, from, to);
|
|
2384
|
|
2385 if (*from_out >= 0 && *to_out >= 0 && *from_out > *to_out)
|
|
2386 {
|
|
2387 if (flags & GB_CHECK_ORDER)
|
|
2388 invalid_argument_2 ("start greater than end", from, to);
|
|
2389 else
|
|
2390 {
|
|
2391 Charbpos temp = *from_out;
|
|
2392 *from_out = *to_out;
|
|
2393 *to_out = temp;
|
|
2394 }
|
|
2395 }
|
|
2396 }
|
|
2397
|
|
2398 void
|
|
2399 get_string_range_byte (Lisp_Object string, Lisp_Object from, Lisp_Object to,
|
|
2400 Bytecount *from_out, Bytecount *to_out,
|
|
2401 unsigned int flags)
|
|
2402 {
|
|
2403 Charcount s, e;
|
|
2404
|
|
2405 get_string_range_char (string, from, to, &s, &e, flags);
|
|
2406 if (s >= 0)
|
793
|
2407 *from_out = string_index_char_to_byte (string, s);
|
771
|
2408 else /* could happen with GB_NO_ERROR_IF_BAD */
|
|
2409 *from_out = -1;
|
|
2410 if (e >= 0)
|
793
|
2411 *to_out = string_index_char_to_byte (string, e);
|
771
|
2412 else
|
|
2413 *to_out = -1;
|
|
2414
|
|
2415 }
|
|
2416
|
|
2417 Charbpos
|
|
2418 get_buffer_or_string_pos_char (Lisp_Object object, Lisp_Object pos,
|
|
2419 unsigned int flags)
|
|
2420 {
|
|
2421 return STRINGP (object) ?
|
|
2422 get_string_pos_char (object, pos, flags) :
|
|
2423 get_buffer_pos_char (XBUFFER (object), pos, flags);
|
|
2424 }
|
|
2425
|
|
2426 Bytebpos
|
|
2427 get_buffer_or_string_pos_byte (Lisp_Object object, Lisp_Object pos,
|
|
2428 unsigned int flags)
|
|
2429 {
|
|
2430 return STRINGP (object) ?
|
|
2431 get_string_pos_byte (object, pos, flags) :
|
|
2432 get_buffer_pos_byte (XBUFFER (object), pos, flags);
|
|
2433 }
|
|
2434
|
|
2435 void
|
|
2436 get_buffer_or_string_range_char (Lisp_Object object, Lisp_Object from,
|
|
2437 Lisp_Object to, Charbpos *from_out,
|
|
2438 Charbpos *to_out, unsigned int flags)
|
|
2439 {
|
|
2440 if (STRINGP (object))
|
|
2441 get_string_range_char (object, from, to, from_out, to_out, flags);
|
|
2442 else
|
|
2443 get_buffer_range_char (XBUFFER (object), from, to, from_out, to_out, flags);
|
|
2444 }
|
|
2445
|
|
2446 void
|
|
2447 get_buffer_or_string_range_byte (Lisp_Object object, Lisp_Object from,
|
|
2448 Lisp_Object to, Bytebpos *from_out,
|
|
2449 Bytebpos *to_out, unsigned int flags)
|
|
2450 {
|
|
2451 if (STRINGP (object))
|
|
2452 get_string_range_byte (object, from, to, from_out, to_out, flags);
|
|
2453 else
|
|
2454 get_buffer_range_byte (XBUFFER (object), from, to, from_out, to_out, flags);
|
|
2455 }
|
|
2456
|
|
2457 Charbpos
|
|
2458 buffer_or_string_accessible_begin_char (Lisp_Object object)
|
|
2459 {
|
|
2460 return STRINGP (object) ? 0 : BUF_BEGV (XBUFFER (object));
|
|
2461 }
|
|
2462
|
|
2463 Charbpos
|
|
2464 buffer_or_string_accessible_end_char (Lisp_Object object)
|
|
2465 {
|
|
2466 return STRINGP (object) ?
|
|
2467 XSTRING_CHAR_LENGTH (object) : BUF_ZV (XBUFFER (object));
|
|
2468 }
|
|
2469
|
|
2470 Bytebpos
|
|
2471 buffer_or_string_accessible_begin_byte (Lisp_Object object)
|
|
2472 {
|
|
2473 return STRINGP (object) ? 0 : BI_BUF_BEGV (XBUFFER (object));
|
|
2474 }
|
|
2475
|
|
2476 Bytebpos
|
|
2477 buffer_or_string_accessible_end_byte (Lisp_Object object)
|
|
2478 {
|
|
2479 return STRINGP (object) ?
|
|
2480 XSTRING_LENGTH (object) : BI_BUF_ZV (XBUFFER (object));
|
|
2481 }
|
|
2482
|
|
2483 Charbpos
|
|
2484 buffer_or_string_absolute_begin_char (Lisp_Object object)
|
|
2485 {
|
|
2486 return STRINGP (object) ? 0 : BUF_BEG (XBUFFER (object));
|
|
2487 }
|
|
2488
|
|
2489 Charbpos
|
|
2490 buffer_or_string_absolute_end_char (Lisp_Object object)
|
|
2491 {
|
|
2492 return STRINGP (object) ?
|
|
2493 XSTRING_CHAR_LENGTH (object) : BUF_Z (XBUFFER (object));
|
|
2494 }
|
|
2495
|
|
2496 Bytebpos
|
|
2497 buffer_or_string_absolute_begin_byte (Lisp_Object object)
|
|
2498 {
|
|
2499 return STRINGP (object) ? 0 : BI_BUF_BEG (XBUFFER (object));
|
|
2500 }
|
|
2501
|
|
2502 Bytebpos
|
|
2503 buffer_or_string_absolute_end_byte (Lisp_Object object)
|
|
2504 {
|
|
2505 return STRINGP (object) ?
|
|
2506 XSTRING_LENGTH (object) : BI_BUF_Z (XBUFFER (object));
|
|
2507 }
|
|
2508
|
|
2509
|
|
2510 /************************************************************************/
|
|
2511 /* Implement TO_EXTERNAL_FORMAT, TO_INTERNAL_FORMAT */
|
|
2512 /************************************************************************/
|
|
2513
|
|
2514 typedef struct
|
|
2515 {
|
|
2516 Dynarr_declare (Intbyte_dynarr *);
|
|
2517 } Intbyte_dynarr_dynarr;
|
|
2518
|
|
2519 typedef struct
|
|
2520 {
|
|
2521 Dynarr_declare (Extbyte_dynarr *);
|
|
2522 } Extbyte_dynarr_dynarr;
|
|
2523
|
|
2524 static Extbyte_dynarr_dynarr *conversion_out_dynarr_list;
|
|
2525 static Intbyte_dynarr_dynarr *conversion_in_dynarr_list;
|
|
2526
|
|
2527 static int dfc_convert_to_external_format_in_use;
|
|
2528 static int dfc_convert_to_internal_format_in_use;
|
|
2529
|
|
2530 static Lisp_Object
|
|
2531 dfc_convert_to_external_format_reset_in_use (Lisp_Object value)
|
|
2532 {
|
|
2533 dfc_convert_to_external_format_in_use = XINT (value);
|
|
2534 return Qnil;
|
|
2535 }
|
|
2536
|
|
2537 static Lisp_Object
|
|
2538 dfc_convert_to_internal_format_reset_in_use (Lisp_Object value)
|
|
2539 {
|
|
2540 dfc_convert_to_internal_format_in_use = XINT (value);
|
|
2541 return Qnil;
|
|
2542 }
|
|
2543
|
|
2544 void
|
|
2545 dfc_convert_to_external_format (dfc_conversion_type source_type,
|
|
2546 dfc_conversion_data *source,
|
|
2547 Lisp_Object coding_system,
|
|
2548 dfc_conversion_type sink_type,
|
|
2549 dfc_conversion_data *sink)
|
|
2550 {
|
|
2551 /* It's guaranteed that many callers are not prepared for GC here,
|
|
2552 esp. given that this code conversion occurs in many very hidden
|
|
2553 places. */
|
|
2554 int count = begin_gc_forbidden ();
|
|
2555 Extbyte_dynarr *conversion_out_dynarr;
|
|
2556
|
|
2557 type_checking_assert
|
|
2558 (((source_type == DFC_TYPE_DATA) ||
|
|
2559 (source_type == DFC_TYPE_LISP_LSTREAM && LSTREAMP (source->lisp_object)) ||
|
|
2560 (source_type == DFC_TYPE_LISP_STRING && STRINGP (source->lisp_object)))
|
|
2561 &&
|
|
2562 ((sink_type == DFC_TYPE_DATA) ||
|
|
2563 (sink_type == DFC_TYPE_LISP_LSTREAM && LSTREAMP (source->lisp_object))));
|
|
2564
|
|
2565 record_unwind_protect (dfc_convert_to_external_format_reset_in_use,
|
|
2566 make_int (dfc_convert_to_external_format_in_use));
|
|
2567 if (Dynarr_length (conversion_out_dynarr_list) <=
|
|
2568 dfc_convert_to_external_format_in_use)
|
|
2569 Dynarr_add (conversion_out_dynarr_list, Dynarr_new (Extbyte));
|
|
2570 conversion_out_dynarr = Dynarr_at (conversion_out_dynarr_list,
|
|
2571 dfc_convert_to_external_format_in_use);
|
|
2572 dfc_convert_to_external_format_in_use++;
|
|
2573 Dynarr_reset (conversion_out_dynarr);
|
|
2574
|
|
2575 coding_system = get_coding_system_for_text_file (coding_system, 0);
|
|
2576
|
|
2577 /* Here we optimize in the case where the coding system does no
|
|
2578 conversion. However, we don't want to optimize in case the source
|
|
2579 or sink is an lstream, since writing to an lstream can cause a
|
|
2580 garbage collection, and this could be problematic if the source
|
|
2581 is a lisp string. */
|
|
2582 if (source_type != DFC_TYPE_LISP_LSTREAM &&
|
|
2583 sink_type != DFC_TYPE_LISP_LSTREAM &&
|
|
2584 coding_system_is_binary (coding_system))
|
|
2585 {
|
|
2586 const Intbyte *ptr;
|
|
2587 Bytecount len;
|
|
2588
|
|
2589 if (source_type == DFC_TYPE_LISP_STRING)
|
|
2590 {
|
|
2591 ptr = XSTRING_DATA (source->lisp_object);
|
|
2592 len = XSTRING_LENGTH (source->lisp_object);
|
|
2593 }
|
|
2594 else
|
|
2595 {
|
|
2596 ptr = (Intbyte *) source->data.ptr;
|
|
2597 len = source->data.len;
|
|
2598 }
|
|
2599
|
|
2600 #ifdef MULE
|
|
2601 {
|
|
2602 const Intbyte *end;
|
|
2603 for (end = ptr + len; ptr < end;)
|
|
2604 {
|
|
2605 Intbyte c =
|
|
2606 (BYTE_ASCII_P (*ptr)) ? *ptr :
|
|
2607 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
|
|
2608 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
|
|
2609 '~';
|
|
2610
|
|
2611 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
|
|
2612 INC_CHARPTR (ptr);
|
|
2613 }
|
800
|
2614 text_checking_assert (ptr == end);
|
771
|
2615 }
|
|
2616 #else
|
|
2617 Dynarr_add_many (conversion_out_dynarr, ptr, len);
|
|
2618 #endif
|
|
2619
|
|
2620 }
|
|
2621 #ifdef HAVE_WIN32_CODING_SYSTEMS
|
|
2622 /* Optimize the common case involving Unicode where only ASCII is involved */
|
|
2623 else if (source_type != DFC_TYPE_LISP_LSTREAM &&
|
|
2624 sink_type != DFC_TYPE_LISP_LSTREAM &&
|
|
2625 dfc_coding_system_is_unicode (coding_system))
|
|
2626 {
|
|
2627 const Intbyte *ptr, *p;
|
|
2628 Bytecount len;
|
|
2629 const Intbyte *end;
|
|
2630
|
|
2631 if (source_type == DFC_TYPE_LISP_STRING)
|
|
2632 {
|
|
2633 ptr = XSTRING_DATA (source->lisp_object);
|
|
2634 len = XSTRING_LENGTH (source->lisp_object);
|
|
2635 }
|
|
2636 else
|
|
2637 {
|
|
2638 ptr = (Intbyte *) source->data.ptr;
|
|
2639 len = source->data.len;
|
|
2640 }
|
|
2641 end = ptr + len;
|
|
2642
|
|
2643 for (p = ptr; p < end; p++)
|
|
2644 {
|
|
2645 if (!BYTE_ASCII_P (*p))
|
|
2646 goto the_hard_way;
|
|
2647 }
|
|
2648
|
|
2649 for (p = ptr; p < end; p++)
|
|
2650 {
|
|
2651 Dynarr_add (conversion_out_dynarr, (Extbyte) (*p));
|
|
2652 Dynarr_add (conversion_out_dynarr, (Extbyte) '\0');
|
|
2653 }
|
|
2654 }
|
|
2655 #endif /* HAVE_WIN32_CODING_SYSTEMS */
|
|
2656 else
|
|
2657 {
|
|
2658 Lisp_Object streams_to_delete[3];
|
|
2659 int delete_count;
|
|
2660 Lisp_Object instream, outstream;
|
|
2661 Lstream *reader, *writer;
|
|
2662 struct gcpro gcpro1, gcpro2;
|
|
2663
|
|
2664 #ifdef HAVE_WIN32_CODING_SYSTEMS
|
|
2665 the_hard_way:
|
|
2666 #endif /* HAVE_WIN32_CODING_SYSTEMS */
|
|
2667 delete_count = 0;
|
|
2668 if (source_type == DFC_TYPE_LISP_LSTREAM)
|
|
2669 instream = source->lisp_object;
|
|
2670 else if (source_type == DFC_TYPE_DATA)
|
|
2671 streams_to_delete[delete_count++] = instream =
|
|
2672 make_fixed_buffer_input_stream (source->data.ptr, source->data.len);
|
|
2673 else
|
|
2674 {
|
|
2675 type_checking_assert (source_type == DFC_TYPE_LISP_STRING);
|
|
2676 streams_to_delete[delete_count++] = instream =
|
|
2677 /* This will GCPRO the Lisp string */
|
|
2678 make_lisp_string_input_stream (source->lisp_object, 0, -1);
|
|
2679 }
|
|
2680
|
|
2681 if (sink_type == DFC_TYPE_LISP_LSTREAM)
|
|
2682 outstream = sink->lisp_object;
|
|
2683 else
|
|
2684 {
|
|
2685 type_checking_assert (sink_type == DFC_TYPE_DATA);
|
|
2686 streams_to_delete[delete_count++] = outstream =
|
|
2687 make_dynarr_output_stream
|
|
2688 ((unsigned_char_dynarr *) conversion_out_dynarr);
|
|
2689 }
|
|
2690
|
|
2691 streams_to_delete[delete_count++] = outstream =
|
800
|
2692 make_coding_output_stream (XLSTREAM (outstream), coding_system,
|
|
2693 CODING_ENCODE, 0);
|
771
|
2694
|
|
2695 reader = XLSTREAM (instream);
|
|
2696 writer = XLSTREAM (outstream);
|
|
2697 /* decoding_stream will gc-protect outstream */
|
|
2698 GCPRO2 (instream, outstream);
|
|
2699
|
|
2700 while (1)
|
|
2701 {
|
|
2702 Bytecount size_in_bytes;
|
|
2703 char tempbuf[1024]; /* some random amount */
|
|
2704
|
|
2705 size_in_bytes = Lstream_read (reader, tempbuf, sizeof (tempbuf));
|
|
2706
|
|
2707 if (size_in_bytes == 0)
|
|
2708 break;
|
|
2709 else if (size_in_bytes < 0)
|
|
2710 signal_error (Qtext_conversion_error,
|
|
2711 "Error converting to external format", Qunbound);
|
|
2712
|
|
2713 if (Lstream_write (writer, tempbuf, size_in_bytes) < 0)
|
|
2714 signal_error (Qtext_conversion_error,
|
|
2715 "Error converting to external format", Qunbound);
|
|
2716 }
|
|
2717
|
|
2718 /* Closing writer will close any stream at the other end of writer. */
|
|
2719 Lstream_close (writer);
|
|
2720 Lstream_close (reader);
|
|
2721 UNGCPRO;
|
|
2722
|
|
2723 /* The idea is that this function will create no garbage. */
|
|
2724 while (delete_count)
|
|
2725 Lstream_delete (XLSTREAM (streams_to_delete [--delete_count]));
|
|
2726 }
|
|
2727
|
|
2728 unbind_to (count);
|
|
2729
|
|
2730 if (sink_type != DFC_TYPE_LISP_LSTREAM)
|
|
2731 {
|
|
2732 sink->data.len = Dynarr_length (conversion_out_dynarr);
|
|
2733 /* double zero-extend because we may be dealing with Unicode data */
|
|
2734 Dynarr_add (conversion_out_dynarr, '\0');
|
|
2735 Dynarr_add (conversion_out_dynarr, '\0');
|
|
2736 sink->data.ptr = Dynarr_atp (conversion_out_dynarr, 0);
|
|
2737 }
|
|
2738 }
|
|
2739
|
|
2740 void
|
|
2741 dfc_convert_to_internal_format (dfc_conversion_type source_type,
|
|
2742 dfc_conversion_data *source,
|
|
2743 Lisp_Object coding_system,
|
|
2744 dfc_conversion_type sink_type,
|
|
2745 dfc_conversion_data *sink)
|
|
2746 {
|
|
2747 /* It's guaranteed that many callers are not prepared for GC here,
|
|
2748 esp. given that this code conversion occurs in many very hidden
|
|
2749 places. */
|
|
2750 int count = begin_gc_forbidden ();
|
|
2751 Intbyte_dynarr *conversion_in_dynarr;
|
|
2752
|
|
2753 type_checking_assert
|
|
2754 ((source_type == DFC_TYPE_DATA ||
|
|
2755 source_type == DFC_TYPE_LISP_LSTREAM)
|
|
2756 &&
|
|
2757 (sink_type == DFC_TYPE_DATA ||
|
|
2758 sink_type == DFC_TYPE_LISP_LSTREAM));
|
|
2759
|
|
2760 record_unwind_protect (dfc_convert_to_internal_format_reset_in_use,
|
|
2761 make_int (dfc_convert_to_internal_format_in_use));
|
|
2762 if (Dynarr_length (conversion_in_dynarr_list) <=
|
|
2763 dfc_convert_to_internal_format_in_use)
|
|
2764 Dynarr_add (conversion_in_dynarr_list, Dynarr_new (Intbyte));
|
|
2765 conversion_in_dynarr = Dynarr_at (conversion_in_dynarr_list,
|
|
2766 dfc_convert_to_internal_format_in_use);
|
|
2767 dfc_convert_to_internal_format_in_use++;
|
|
2768 Dynarr_reset (conversion_in_dynarr);
|
|
2769
|
|
2770 coding_system = get_coding_system_for_text_file (coding_system, 1);
|
|
2771
|
|
2772 if (source_type != DFC_TYPE_LISP_LSTREAM &&
|
|
2773 sink_type != DFC_TYPE_LISP_LSTREAM &&
|
|
2774 coding_system_is_binary (coding_system))
|
|
2775 {
|
|
2776 #ifdef MULE
|
|
2777 const Intbyte *ptr = (const Intbyte *) source->data.ptr;
|
|
2778 Bytecount len = source->data.len;
|
|
2779 const Intbyte *end = ptr + len;
|
|
2780
|
|
2781 for (; ptr < end; ptr++)
|
|
2782 {
|
|
2783 Intbyte c = *ptr;
|
|
2784
|
|
2785 if (BYTE_ASCII_P (c))
|
|
2786 Dynarr_add (conversion_in_dynarr, c);
|
|
2787 else if (BYTE_C1_P (c))
|
|
2788 {
|
|
2789 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_CONTROL_1);
|
|
2790 Dynarr_add (conversion_in_dynarr, c + 0x20);
|
|
2791 }
|
|
2792 else
|
|
2793 {
|
|
2794 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_LATIN_ISO8859_1);
|
|
2795 Dynarr_add (conversion_in_dynarr, c);
|
|
2796 }
|
|
2797 }
|
|
2798 #else
|
|
2799 Dynarr_add_many (conversion_in_dynarr, source->data.ptr, source->data.len);
|
|
2800 #endif
|
|
2801 }
|
|
2802 #ifdef HAVE_WIN32_CODING_SYSTEMS
|
|
2803 /* Optimize the common case involving Unicode where only ASCII/Latin-1 is involved */
|
|
2804 else if (source_type != DFC_TYPE_LISP_LSTREAM &&
|
|
2805 sink_type != DFC_TYPE_LISP_LSTREAM &&
|
|
2806 dfc_coding_system_is_unicode (coding_system))
|
|
2807 {
|
|
2808 const Intbyte *ptr = (const Intbyte *) source->data.ptr + 1;
|
|
2809 Bytecount len = source->data.len;
|
|
2810 const Intbyte *end = ptr + len;
|
|
2811
|
|
2812 if (len & 1)
|
|
2813 goto the_hard_way;
|
|
2814
|
|
2815 for (; ptr < end; ptr += 2)
|
|
2816 {
|
|
2817 if (*ptr)
|
|
2818 goto the_hard_way;
|
|
2819 }
|
|
2820
|
|
2821 ptr = (const Intbyte *) source->data.ptr;
|
|
2822 end = ptr + len;
|
|
2823
|
|
2824 for (; ptr < end; ptr += 2)
|
|
2825 {
|
|
2826 Intbyte c = *ptr;
|
|
2827
|
|
2828 if (BYTE_ASCII_P (c))
|
|
2829 Dynarr_add (conversion_in_dynarr, c);
|
|
2830 #ifdef MULE
|
|
2831 else if (BYTE_C1_P (c))
|
|
2832 {
|
|
2833 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_CONTROL_1);
|
|
2834 Dynarr_add (conversion_in_dynarr, c + 0x20);
|
|
2835 }
|
|
2836 else
|
|
2837 {
|
|
2838 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_LATIN_ISO8859_1);
|
|
2839 Dynarr_add (conversion_in_dynarr, c);
|
|
2840 }
|
|
2841 #endif /* MULE */
|
|
2842 }
|
|
2843 }
|
|
2844 #endif /* HAVE_WIN32_CODING_SYSTEMS */
|
|
2845 else
|
|
2846 {
|
|
2847 Lisp_Object streams_to_delete[3];
|
|
2848 int delete_count;
|
|
2849 Lisp_Object instream, outstream;
|
|
2850 Lstream *reader, *writer;
|
|
2851 struct gcpro gcpro1, gcpro2;
|
|
2852
|
|
2853 #ifdef HAVE_WIN32_CODING_SYSTEMS
|
|
2854 the_hard_way:
|
|
2855 #endif /* HAVE_WIN32_CODING_SYSTEMS */
|
|
2856 delete_count = 0;
|
|
2857 if (source_type == DFC_TYPE_LISP_LSTREAM)
|
|
2858 instream = source->lisp_object;
|
|
2859 else
|
|
2860 {
|
|
2861 type_checking_assert (source_type == DFC_TYPE_DATA);
|
|
2862 streams_to_delete[delete_count++] = instream =
|
|
2863 make_fixed_buffer_input_stream (source->data.ptr, source->data.len);
|
|
2864 }
|
|
2865
|
|
2866 if (sink_type == DFC_TYPE_LISP_LSTREAM)
|
|
2867 outstream = sink->lisp_object;
|
|
2868 else
|
|
2869 {
|
|
2870 type_checking_assert (sink_type == DFC_TYPE_DATA);
|
|
2871 streams_to_delete[delete_count++] = outstream =
|
|
2872 make_dynarr_output_stream
|
|
2873 ((unsigned_char_dynarr *) conversion_in_dynarr);
|
|
2874 }
|
|
2875
|
|
2876 streams_to_delete[delete_count++] = outstream =
|
800
|
2877 make_coding_output_stream (XLSTREAM (outstream), coding_system,
|
|
2878 CODING_DECODE, 0);
|
771
|
2879
|
|
2880 reader = XLSTREAM (instream);
|
|
2881 writer = XLSTREAM (outstream);
|
|
2882 /* outstream will gc-protect its sink stream, if necessary */
|
|
2883 GCPRO2 (instream, outstream);
|
|
2884
|
|
2885 while (1)
|
|
2886 {
|
|
2887 Bytecount size_in_bytes;
|
|
2888 char tempbuf[1024]; /* some random amount */
|
|
2889
|
|
2890 size_in_bytes = Lstream_read (reader, tempbuf, sizeof (tempbuf));
|
|
2891
|
|
2892 if (size_in_bytes == 0)
|
|
2893 break;
|
|
2894 else if (size_in_bytes < 0)
|
|
2895 signal_error (Qtext_conversion_error,
|
|
2896 "Error converting to internal format", Qunbound);
|
|
2897
|
|
2898 if (Lstream_write (writer, tempbuf, size_in_bytes) < 0)
|
|
2899 signal_error (Qtext_conversion_error,
|
|
2900 "Error converting to internal format", Qunbound);
|
|
2901 }
|
|
2902
|
|
2903 /* Closing writer will close any stream at the other end of writer. */
|
|
2904 Lstream_close (writer);
|
|
2905 Lstream_close (reader);
|
|
2906 UNGCPRO;
|
|
2907
|
|
2908 /* The idea is that this function will create no garbage. */
|
|
2909 while (delete_count)
|
|
2910 Lstream_delete (XLSTREAM (streams_to_delete [--delete_count]));
|
|
2911 }
|
|
2912
|
|
2913 unbind_to (count);
|
|
2914
|
|
2915 if (sink_type != DFC_TYPE_LISP_LSTREAM)
|
|
2916 {
|
|
2917 sink->data.len = Dynarr_length (conversion_in_dynarr);
|
|
2918 Dynarr_add (conversion_in_dynarr, '\0'); /* remember to NUL-terminate! */
|
|
2919 /* The macros don't currently distinguish between internal and
|
|
2920 external sinks, and allocate and copy two extra bytes in both
|
|
2921 cases. So we add a second zero, just like for external data
|
|
2922 (in that case, because we may be converting to Unicode). */
|
|
2923 Dynarr_add (conversion_in_dynarr, '\0');
|
|
2924 sink->data.ptr = Dynarr_atp (conversion_in_dynarr, 0);
|
|
2925 }
|
|
2926 }
|
|
2927
|
|
2928
|
|
2929 /************************************************************************/
|
|
2930 /* Basic Emchar functions */
|
|
2931 /************************************************************************/
|
|
2932
|
|
2933 #ifdef MULE
|
|
2934
|
|
2935 /* Convert a non-ASCII Mule character C into a one-character Mule-encoded
|
|
2936 string in STR. Returns the number of bytes stored.
|
|
2937 Do not call this directly. Use the macro set_charptr_emchar() instead.
|
|
2938 */
|
|
2939
|
|
2940 Bytecount
|
|
2941 non_ascii_set_charptr_emchar (Intbyte *str, Emchar c)
|
|
2942 {
|
|
2943 Intbyte *p;
|
|
2944 Intbyte lb;
|
|
2945 int c1, c2;
|
|
2946 Lisp_Object charset;
|
|
2947
|
|
2948 p = str;
|
|
2949 BREAKUP_CHAR (c, charset, c1, c2);
|
|
2950 lb = CHAR_LEADING_BYTE (c);
|
|
2951 if (LEADING_BYTE_PRIVATE_P (lb))
|
|
2952 *p++ = PRIVATE_LEADING_BYTE_PREFIX (lb);
|
|
2953 *p++ = lb;
|
|
2954 if (EQ (charset, Vcharset_control_1))
|
|
2955 c1 += 0x20;
|
|
2956 *p++ = c1 | 0x80;
|
|
2957 if (c2)
|
|
2958 *p++ = c2 | 0x80;
|
|
2959
|
|
2960 return (p - str);
|
|
2961 }
|
|
2962
|
|
2963 /* Return the first character from a Mule-encoded string in STR,
|
|
2964 assuming it's non-ASCII. Do not call this directly.
|
|
2965 Use the macro charptr_emchar() instead. */
|
|
2966
|
|
2967 Emchar
|
|
2968 non_ascii_charptr_emchar (const Intbyte *str)
|
|
2969 {
|
|
2970 Intbyte i0 = *str, i1, i2 = 0;
|
|
2971 Lisp_Object charset;
|
|
2972
|
|
2973 if (i0 == LEADING_BYTE_CONTROL_1)
|
|
2974 return (Emchar) (*++str - 0x20);
|
|
2975
|
|
2976 if (LEADING_BYTE_PREFIX_P (i0))
|
|
2977 i0 = *++str;
|
|
2978
|
|
2979 i1 = *++str & 0x7F;
|
|
2980
|
|
2981 charset = CHARSET_BY_LEADING_BYTE (i0);
|
|
2982 if (XCHARSET_DIMENSION (charset) == 2)
|
|
2983 i2 = *++str & 0x7F;
|
|
2984
|
|
2985 return MAKE_CHAR (charset, i1, i2);
|
|
2986 }
|
|
2987
|
|
2988 /* Return whether CH is a valid Emchar, assuming it's non-ASCII.
|
|
2989 Do not call this directly. Use the macro valid_char_p() instead. */
|
|
2990
|
|
2991 int
|
|
2992 non_ascii_valid_char_p (Emchar ch)
|
|
2993 {
|
|
2994 int f1, f2, f3;
|
|
2995
|
|
2996 /* Must have only lowest 19 bits set */
|
|
2997 if (ch & ~0x7FFFF)
|
|
2998 return 0;
|
|
2999
|
|
3000 f1 = CHAR_FIELD1 (ch);
|
|
3001 f2 = CHAR_FIELD2 (ch);
|
|
3002 f3 = CHAR_FIELD3 (ch);
|
|
3003
|
|
3004 if (f1 == 0)
|
|
3005 {
|
|
3006 /* dimension-1 char */
|
|
3007 Lisp_Object charset;
|
|
3008
|
|
3009 /* leading byte must be correct */
|
|
3010 if (f2 < MIN_CHAR_FIELD2_OFFICIAL ||
|
|
3011 (f2 > MAX_CHAR_FIELD2_OFFICIAL && f2 < MIN_CHAR_FIELD2_PRIVATE) ||
|
|
3012 f2 > MAX_CHAR_FIELD2_PRIVATE)
|
|
3013 return 0;
|
|
3014 /* octet not out of range */
|
|
3015 if (f3 < 0x20)
|
|
3016 return 0;
|
|
3017 /* charset exists */
|
|
3018 /*
|
|
3019 NOTE: This takes advantage of the fact that
|
|
3020 FIELD2_TO_OFFICIAL_LEADING_BYTE and
|
|
3021 FIELD2_TO_PRIVATE_LEADING_BYTE are the same.
|
|
3022 */
|
|
3023 charset = CHARSET_BY_LEADING_BYTE (f2 + FIELD2_TO_OFFICIAL_LEADING_BYTE);
|
|
3024 if (EQ (charset, Qnil))
|
|
3025 return 0;
|
|
3026 /* check range as per size (94 or 96) of charset */
|
|
3027 return ((f3 > 0x20 && f3 < 0x7f) || XCHARSET_CHARS (charset) == 96);
|
|
3028 }
|
|
3029 else
|
|
3030 {
|
|
3031 /* dimension-2 char */
|
|
3032 Lisp_Object charset;
|
|
3033
|
|
3034 /* leading byte must be correct */
|
|
3035 if (f1 < MIN_CHAR_FIELD1_OFFICIAL ||
|
|
3036 (f1 > MAX_CHAR_FIELD1_OFFICIAL && f1 < MIN_CHAR_FIELD1_PRIVATE) ||
|
|
3037 f1 > MAX_CHAR_FIELD1_PRIVATE)
|
|
3038 return 0;
|
|
3039 /* octets not out of range */
|
|
3040 if (f2 < 0x20 || f3 < 0x20)
|
|
3041 return 0;
|
|
3042
|
|
3043 #ifdef ENABLE_COMPOSITE_CHARS
|
|
3044 if (f1 + FIELD1_TO_OFFICIAL_LEADING_BYTE == LEADING_BYTE_COMPOSITE)
|
|
3045 {
|
|
3046 if (UNBOUNDP (Fgethash (make_int (ch),
|
|
3047 Vcomposite_char_char2string_hash_table,
|
|
3048 Qunbound)))
|
|
3049 return 0;
|
|
3050 return 1;
|
|
3051 }
|
|
3052 #endif /* ENABLE_COMPOSITE_CHARS */
|
|
3053
|
|
3054 /* charset exists */
|
|
3055 if (f1 <= MAX_CHAR_FIELD1_OFFICIAL)
|
|
3056 charset =
|
|
3057 CHARSET_BY_LEADING_BYTE (f1 + FIELD1_TO_OFFICIAL_LEADING_BYTE);
|
|
3058 else
|
|
3059 charset =
|
|
3060 CHARSET_BY_LEADING_BYTE (f1 + FIELD1_TO_PRIVATE_LEADING_BYTE);
|
|
3061
|
|
3062 if (EQ (charset, Qnil))
|
|
3063 return 0;
|
|
3064 /* check range as per size (94x94 or 96x96) of charset */
|
|
3065 return ((f2 != 0x20 && f2 != 0x7F && f3 != 0x20 && f3 != 0x7F) ||
|
|
3066 XCHARSET_CHARS (charset) == 96);
|
|
3067 }
|
|
3068 }
|
|
3069
|
|
3070 /* Copy the character pointed to by SRC into DST. Do not call this
|
|
3071 directly. Use the macro charptr_copy_char() instead.
|
|
3072 Return the number of bytes copied. */
|
|
3073
|
|
3074 Bytecount
|
|
3075 non_ascii_charptr_copy_char (const Intbyte *src, Intbyte *dst)
|
|
3076 {
|
|
3077 Bytecount bytes = REP_BYTES_BY_FIRST_BYTE (*src);
|
|
3078 Bytecount i;
|
|
3079 for (i = bytes; i; i--, dst++, src++)
|
|
3080 *dst = *src;
|
|
3081 return bytes;
|
|
3082 }
|
|
3083
|
|
3084 #endif /* MULE */
|
|
3085
|
|
3086
|
|
3087 /************************************************************************/
|
|
3088 /* streams of Emchars */
|
|
3089 /************************************************************************/
|
|
3090
|
|
3091 #ifdef MULE
|
|
3092
|
|
3093 /* Treat a stream as a stream of Emchar's rather than a stream of bytes.
|
|
3094 The functions below are not meant to be called directly; use
|
|
3095 the macros in insdel.h. */
|
|
3096
|
|
3097 Emchar
|
|
3098 Lstream_get_emchar_1 (Lstream *stream, int ch)
|
|
3099 {
|
|
3100 Intbyte str[MAX_EMCHAR_LEN];
|
|
3101 Intbyte *strptr = str;
|
|
3102 Bytecount bytes;
|
|
3103
|
|
3104 str[0] = (Intbyte) ch;
|
|
3105
|
|
3106 for (bytes = REP_BYTES_BY_FIRST_BYTE (ch) - 1; bytes; bytes--)
|
|
3107 {
|
|
3108 int c = Lstream_getc (stream);
|
800
|
3109 text_checking_assert (c >= 0);
|
771
|
3110 *++strptr = (Intbyte) c;
|
|
3111 }
|
|
3112 return charptr_emchar (str);
|
|
3113 }
|
|
3114
|
|
3115 int
|
|
3116 Lstream_fput_emchar (Lstream *stream, Emchar ch)
|
|
3117 {
|
|
3118 Intbyte str[MAX_EMCHAR_LEN];
|
|
3119 Bytecount len = set_charptr_emchar (str, ch);
|
|
3120 return Lstream_write (stream, str, len);
|
|
3121 }
|
|
3122
|
|
3123 void
|
|
3124 Lstream_funget_emchar (Lstream *stream, Emchar ch)
|
|
3125 {
|
|
3126 Intbyte str[MAX_EMCHAR_LEN];
|
|
3127 Bytecount len = set_charptr_emchar (str, ch);
|
|
3128 Lstream_unread (stream, str, len);
|
|
3129 }
|
|
3130
|
|
3131 #endif /* MULE */
|
|
3132
|
|
3133
|
|
3134 /************************************************************************/
|
|
3135 /* Lisp primitives for working with characters */
|
|
3136 /************************************************************************/
|
|
3137
|
|
3138 DEFUN ("make-char", Fmake_char, 2, 3, 0, /*
|
|
3139 Make a character from CHARSET and octets ARG1 and ARG2.
|
|
3140 ARG2 is required only for characters from two-dimensional charsets.
|
|
3141
|
|
3142 Each octet should be in the range 32 through 127 for a 96 or 96x96
|
|
3143 charset and 33 through 126 for a 94 or 94x94 charset. (Most charsets
|
|
3144 are either 96 or 94x94.) Note that this is 32 more than the values
|
|
3145 typically given for 94x94 charsets. When two octets are required, the
|
|
3146 order is "standard" -- the same as appears in ISO-2022 encodings,
|
|
3147 reference tables, etc.
|
|
3148
|
|
3149 \(Note the following non-obvious result: Computerized translation
|
|
3150 tables often encode the two octets as the high and low bytes,
|
|
3151 respectively, of a hex short, while when there's only one octet, it
|
|
3152 goes in the low byte. When decoding such a value, you need to treat
|
|
3153 the two cases differently when calling make-char: One is (make-char
|
|
3154 CHARSET HIGH LOW), the other is (make-char CHARSET LOW).)
|
|
3155
|
|
3156 For example, (make-char 'latin-iso8859-2 185) or (make-char
|
|
3157 'latin-iso8859-2 57) will return the Latin 2 character s with caron.
|
|
3158
|
|
3159 As another example, the Japanese character for "kawa" (stream), which
|
|
3160 looks something like this:
|
|
3161
|
|
3162 | |
|
|
3163 | | |
|
|
3164 | | |
|
|
3165 | | |
|
|
3166 / |
|
|
3167
|
|
3168 appears in the Unicode Standard (version 2.0) on page 7-287 with the
|
|
3169 following values (see also page 7-4):
|
|
3170
|
|
3171 U 5DDD (Unicode)
|
|
3172 G 0-2008 (GB 2312-80)
|
|
3173 J 0-3278 (JIS X 0208-1990)
|
|
3174 K 0-8425 (KS C 5601-1987)
|
|
3175 B A474 (Big Five)
|
|
3176 C 1-4455 (CNS 11643-1986 (1st plane))
|
|
3177 A 213C34 (ANSI Z39.64-1989)
|
|
3178
|
|
3179 These are equivalent to:
|
|
3180
|
|
3181 \(make-char 'chinese-gb2312 52 40)
|
|
3182 \(make-char 'japanese-jisx0208 64 110)
|
|
3183 \(make-char 'korean-ksc5601 116 57)
|
|
3184 \(make-char 'chinese-cns11643-1 76 87)
|
|
3185 \(decode-big5-char '(164 . 116))
|
|
3186
|
|
3187 \(All codes above are two decimal numbers except for Big Five and ANSI
|
|
3188 Z39.64, which we don't support. We add 32 to each of the decimal
|
|
3189 numbers. Big Five is split in a rather hackish fashion into two
|
|
3190 charsets, `big5-1' and `big5-2', due to its excessive size -- 94x157,
|
|
3191 with the first codepoint in the range 0xA1 to 0xFE and the second in
|
|
3192 the range 0x40 to 0x7E or 0xA1 to 0xFE. `decode-big5-char' is used to
|
|
3193 generate the char from its codes, and `encode-big5-char' extracts the
|
|
3194 codes.)
|
|
3195
|
|
3196 When compiled without MULE, this function does not do much, but it's
|
|
3197 provided for compatibility. In this case, the following CHARSET symbols
|
|
3198 are allowed:
|
|
3199
|
|
3200 `ascii' -- ARG1 should be in the range 0 through 127.
|
|
3201 `control-1' -- ARG1 should be in the range 128 through 159.
|
|
3202 else -- ARG1 is coerced to be between 0 and 255, and then the high
|
|
3203 bit is set.
|
|
3204
|
|
3205 `int-to-char of the resulting ARG1' is returned, and ARG2 is always ignored.
|
|
3206 */
|
|
3207 (charset, arg1, arg2))
|
|
3208 {
|
|
3209 #ifdef MULE
|
|
3210 Lisp_Charset *cs;
|
|
3211 int a1, a2;
|
|
3212 int lowlim, highlim;
|
|
3213
|
|
3214 charset = Fget_charset (charset);
|
|
3215 cs = XCHARSET (charset);
|
|
3216
|
788
|
3217 get_charset_limits (charset, &lowlim, &highlim);
|
771
|
3218
|
|
3219 CHECK_INT (arg1);
|
|
3220 /* It is useful (and safe, according to Olivier Galibert) to strip
|
|
3221 the 8th bit off ARG1 and ARG2 because it allows programmers to
|
|
3222 write (make-char 'latin-iso8859-2 CODE) where code is the actual
|
|
3223 Latin 2 code of the character. */
|
|
3224 a1 = XINT (arg1) & 0x7f;
|
|
3225 if (a1 < lowlim || a1 > highlim)
|
|
3226 args_out_of_range_3 (arg1, make_int (lowlim), make_int (highlim));
|
|
3227
|
|
3228 if (CHARSET_DIMENSION (cs) == 1)
|
|
3229 {
|
|
3230 if (!NILP (arg2))
|
|
3231 invalid_argument
|
|
3232 ("Charset is of dimension one; second octet must be nil", arg2);
|
|
3233 return make_char (MAKE_CHAR (charset, a1, 0));
|
|
3234 }
|
|
3235
|
|
3236 CHECK_INT (arg2);
|
|
3237 a2 = XINT (arg2) & 0x7f;
|
|
3238 if (a2 < lowlim || a2 > highlim)
|
|
3239 args_out_of_range_3 (arg2, make_int (lowlim), make_int (highlim));
|
|
3240
|
|
3241 return make_char (MAKE_CHAR (charset, a1, a2));
|
|
3242 #else
|
|
3243 int a1;
|
|
3244 int lowlim, highlim;
|
|
3245
|
|
3246 if (EQ (charset, Qascii)) lowlim = 0, highlim = 127;
|
|
3247 else if (EQ (charset, Qcontrol_1)) lowlim = 0, highlim = 31;
|
|
3248 else lowlim = 0, highlim = 127;
|
|
3249
|
|
3250 CHECK_INT (arg1);
|
|
3251 /* It is useful (and safe, according to Olivier Galibert) to strip
|
|
3252 the 8th bit off ARG1 and ARG2 because it allows programmers to
|
|
3253 write (make-char 'latin-iso8859-2 CODE) where code is the actual
|
|
3254 Latin 2 code of the character. */
|
|
3255 a1 = XINT (arg1) & 0x7f;
|
|
3256 if (a1 < lowlim || a1 > highlim)
|
|
3257 args_out_of_range_3 (arg1, make_int (lowlim), make_int (highlim));
|
|
3258
|
|
3259 if (EQ (charset, Qascii))
|
|
3260 return make_char (a1);
|
|
3261 return make_char (a1 + 128);
|
|
3262 #endif /* MULE */
|
|
3263 }
|
|
3264
|
|
3265 #ifdef MULE
|
|
3266
|
|
3267 DEFUN ("char-charset", Fchar_charset, 1, 1, 0, /*
|
|
3268 Return the character set of char CH.
|
|
3269 */
|
|
3270 (ch))
|
|
3271 {
|
|
3272 CHECK_CHAR_COERCE_INT (ch);
|
|
3273
|
|
3274 return XCHARSET_NAME (CHARSET_BY_LEADING_BYTE
|
|
3275 (CHAR_LEADING_BYTE (XCHAR (ch))));
|
|
3276 }
|
|
3277
|
|
3278 DEFUN ("char-octet", Fchar_octet, 1, 2, 0, /*
|
|
3279 Return the octet numbered N (should be 0 or 1) of char CH.
|
|
3280 N defaults to 0 if omitted.
|
|
3281 */
|
|
3282 (ch, n))
|
|
3283 {
|
|
3284 Lisp_Object charset;
|
|
3285 int octet0, octet1;
|
|
3286
|
|
3287 CHECK_CHAR_COERCE_INT (ch);
|
|
3288
|
|
3289 BREAKUP_CHAR (XCHAR (ch), charset, octet0, octet1);
|
|
3290
|
|
3291 if (NILP (n) || EQ (n, Qzero))
|
|
3292 return make_int (octet0);
|
|
3293 else if (EQ (n, make_int (1)))
|
|
3294 return make_int (octet1);
|
|
3295 else
|
|
3296 invalid_constant ("Octet number must be 0 or 1", n);
|
|
3297 }
|
|
3298
|
|
3299 DEFUN ("split-char", Fsplit_char, 1, 1, 0, /*
|
|
3300 Return list of charset and one or two position-codes of CHAR.
|
|
3301 */
|
|
3302 (character))
|
|
3303 {
|
|
3304 /* This function can GC */
|
|
3305 struct gcpro gcpro1, gcpro2;
|
|
3306 Lisp_Object charset = Qnil;
|
|
3307 Lisp_Object rc = Qnil;
|
|
3308 int c1, c2;
|
|
3309
|
|
3310 GCPRO2 (charset, rc);
|
|
3311 CHECK_CHAR_COERCE_INT (character);
|
|
3312
|
|
3313 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
|
|
3314
|
|
3315 if (XCHARSET_DIMENSION (Fget_charset (charset)) == 2)
|
|
3316 {
|
|
3317 rc = list3 (XCHARSET_NAME (charset), make_int (c1), make_int (c2));
|
|
3318 }
|
|
3319 else
|
|
3320 {
|
|
3321 rc = list2 (XCHARSET_NAME (charset), make_int (c1));
|
|
3322 }
|
|
3323 UNGCPRO;
|
|
3324
|
|
3325 return rc;
|
|
3326 }
|
|
3327
|
|
3328 #endif /* MULE */
|
|
3329
|
|
3330
|
|
3331 /************************************************************************/
|
|
3332 /* composite character functions */
|
|
3333 /************************************************************************/
|
|
3334
|
|
3335 #ifdef ENABLE_COMPOSITE_CHARS
|
|
3336
|
|
3337 Emchar
|
|
3338 lookup_composite_char (Intbyte *str, int len)
|
|
3339 {
|
|
3340 Lisp_Object lispstr = make_string (str, len);
|
|
3341 Lisp_Object ch = Fgethash (lispstr,
|
|
3342 Vcomposite_char_string2char_hash_table,
|
|
3343 Qunbound);
|
|
3344 Emchar emch;
|
|
3345
|
|
3346 if (UNBOUNDP (ch))
|
|
3347 {
|
|
3348 if (composite_char_row_next >= 128)
|
|
3349 invalid_operation ("No more composite chars available", lispstr);
|
|
3350 emch = MAKE_CHAR (Vcharset_composite, composite_char_row_next,
|
|
3351 composite_char_col_next);
|
|
3352 Fputhash (make_char (emch), lispstr,
|
|
3353 Vcomposite_char_char2string_hash_table);
|
|
3354 Fputhash (lispstr, make_char (emch),
|
|
3355 Vcomposite_char_string2char_hash_table);
|
|
3356 composite_char_col_next++;
|
|
3357 if (composite_char_col_next >= 128)
|
|
3358 {
|
|
3359 composite_char_col_next = 32;
|
|
3360 composite_char_row_next++;
|
|
3361 }
|
|
3362 }
|
|
3363 else
|
|
3364 emch = XCHAR (ch);
|
|
3365 return emch;
|
|
3366 }
|
|
3367
|
|
3368 Lisp_Object
|
|
3369 composite_char_string (Emchar ch)
|
|
3370 {
|
|
3371 Lisp_Object str = Fgethash (make_char (ch),
|
|
3372 Vcomposite_char_char2string_hash_table,
|
|
3373 Qunbound);
|
|
3374 assert (!UNBOUNDP (str));
|
|
3375 return str;
|
|
3376 }
|
|
3377
|
|
3378 xxDEFUN ("make-composite-char", Fmake_composite_char, 1, 1, 0, /*
|
|
3379 Convert a string into a single composite character.
|
|
3380 The character is the result of overstriking all the characters in
|
|
3381 the string.
|
|
3382 */
|
|
3383 (string))
|
|
3384 {
|
|
3385 CHECK_STRING (string);
|
|
3386 return make_char (lookup_composite_char (XSTRING_DATA (string),
|
|
3387 XSTRING_LENGTH (string)));
|
|
3388 }
|
|
3389
|
|
3390 xxDEFUN ("composite-char-string", Fcomposite_char_string, 1, 1, 0, /*
|
|
3391 Return a string of the characters comprising a composite character.
|
|
3392 */
|
|
3393 (ch))
|
|
3394 {
|
|
3395 Emchar emch;
|
|
3396
|
|
3397 CHECK_CHAR (ch);
|
|
3398 emch = XCHAR (ch);
|
|
3399 if (CHAR_LEADING_BYTE (emch) != LEADING_BYTE_COMPOSITE)
|
|
3400 invalid_argument ("Must be composite char", ch);
|
|
3401 return composite_char_string (emch);
|
|
3402 }
|
|
3403 #endif /* ENABLE_COMPOSITE_CHARS */
|
|
3404
|
|
3405
|
|
3406 /************************************************************************/
|
|
3407 /* initialization */
|
|
3408 /************************************************************************/
|
|
3409
|
|
3410 void
|
|
3411 init_eistring_once_early (void)
|
|
3412 {
|
|
3413 the_eistring_malloc_zero_init = the_eistring_zero_init;
|
|
3414 the_eistring_malloc_zero_init.mallocp_ = 1;
|
|
3415 }
|
|
3416
|
|
3417 void
|
|
3418 syms_of_text (void)
|
|
3419 {
|
|
3420 DEFSUBR (Fmake_char);
|
|
3421
|
|
3422 #ifdef MULE
|
|
3423 DEFSUBR (Fchar_charset);
|
|
3424 DEFSUBR (Fchar_octet);
|
|
3425 DEFSUBR (Fsplit_char);
|
|
3426
|
|
3427 #ifdef ENABLE_COMPOSITE_CHARS
|
|
3428 DEFSUBR (Fmake_composite_char);
|
|
3429 DEFSUBR (Fcomposite_char_string);
|
|
3430 #endif
|
|
3431 #endif /* MULE */
|
|
3432 }
|
|
3433
|
|
3434 void
|
|
3435 reinit_vars_of_text (void)
|
|
3436 {
|
|
3437 int i;
|
|
3438
|
|
3439 conversion_in_dynarr_list = Dynarr_new2 (Intbyte_dynarr_dynarr,
|
|
3440 Intbyte_dynarr *);
|
|
3441 conversion_out_dynarr_list = Dynarr_new2 (Extbyte_dynarr_dynarr,
|
|
3442 Extbyte_dynarr *);
|
|
3443
|
|
3444 /* #### Olivier, why does this need to be reinitted? */
|
|
3445 for (i = 0; i <= MAX_BYTEBPOS_GAP_SIZE_3; i++)
|
|
3446 three_to_one_table[i] = i / 3;
|
|
3447 }
|
|
3448
|
|
/* Initialize this file's variables: run reinit_vars_of_text() and,
   when composite characters are enabled, reset the composite row and
   column counters to 32 and create and staticpro the two composite
   hash tables. */
void
vars_of_text (void)
{
  reinit_vars_of_text ();

#ifdef ENABLE_COMPOSITE_CHARS
  /* #### not dumped properly */
  composite_char_row_next = 32;
  composite_char_col_next = 32;

  /* string -> char lookups compare by contents (EQUAL). */
  Vcomposite_char_string2char_hash_table =
    make_lisp_hash_table (500, HASH_TABLE_NON_WEAK, HASH_TABLE_EQUAL);
  staticpro (&Vcomposite_char_string2char_hash_table);

  /* char -> string lookups compare by identity (EQ). */
  Vcomposite_char_char2string_hash_table =
    make_lisp_hash_table (500, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
  staticpro (&Vcomposite_char_char2string_hash_table);
#endif /* ENABLE_COMPOSITE_CHARS */
}
|