Mercurial > hg > xemacs-beta
comparison src/text.c @ 771:943eaba38521
[xemacs-hg @ 2002-03-13 08:51:24 by ben]
The big ben-mule-21-5 check-in!
Various files were added and deleted. See CHANGES-ben-mule.
There are still some test suite failures. No crashes, though.
Many of the failures have to do with problems in the test suite itself
rather than in the actual code. I'll be addressing these in the next
day or so -- none of the test suite failures are at all critical.
Meanwhile I'll be trying to address the biggest issues -- i.e. build
or run failures, which will almost certainly happen on various platforms.
All comments should be sent to ben@xemacs.org -- use a Cc: if necessary
when sending to mailing lists. There will be pre- and post- tags,
something like
pre-ben-mule-21-5-merge-in, and
post-ben-mule-21-5-merge-in.
author | ben |
---|---|
date | Wed, 13 Mar 2002 08:54:06 +0000 |
parents | |
children | 026c5bf9c134 |
comparison
equal
deleted
inserted
replaced
770:336a418893b5 | 771:943eaba38521 |
---|---|
1 /* Buffer manipulation primitives for XEmacs. | |
2 Copyright (C) 1995 Sun Microsystems, Inc. | |
3 Copyright (C) 1995, 1996, 2000, 2001, 2002 Ben Wing. | |
4 Copyright (C) 1999 Martin Buchholz. | |
5 | |
6 This file is part of XEmacs. | |
7 | |
8 XEmacs is free software; you can redistribute it and/or modify it | |
9 under the terms of the GNU General Public License as published by the | |
10 Free Software Foundation; either version 2, or (at your option) any | |
11 later version. | |
12 | |
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT | |
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
16 for more details. | |
17 | |
18 You should have received a copy of the GNU General Public License | |
19 along with XEmacs; see the file COPYING. If not, write to | |
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
21 Boston, MA 02111-1307, USA. */ | |
22 | |
23 /* Synched up with: Not in FSF. */ | |
24 | |
25 /* Authorship: | |
26 */ | |
27 | |
28 #include <config.h> | |
29 #include "lisp.h" | |
30 | |
31 #include "buffer.h" | |
32 #include "charset.h" | |
33 #include "file-coding.h" | |
34 #include "lstream.h" | |
35 | |
36 | |
37 /************************************************************************/ | |
38 /* long comments */ | |
39 /************************************************************************/ | |
40 | |
41 /* | |
42 There are three possible ways to specify positions in a buffer. All | |
43 of these are one-based: the beginning of the buffer is position or | |
44 index 1, and 0 is not a valid position. | |
45 | |
46 As a "buffer position" (typedef Charbpos): | |
47 | |
48 This is an index specifying an offset in characters from the | |
49 beginning of the buffer. Note that buffer positions are | |
50 logically *between* characters, not on a character. The | |
51 difference between two buffer positions specifies the number of | |
52 characters between those positions. Buffer positions are the | |
53 only kind of position externally visible to the user. | |
54 | |
55 As a "byte index" (typedef Bytebpos): | |
56 | |
57 This is an index over the bytes used to represent the characters | |
58 in the buffer. If there is no Mule support, this is identical | |
59 to a buffer position, because each character is represented | |
60 using one byte. However, with Mule support, many characters | |
61 require two or more bytes for their representation, and so a | |
62 byte index may be greater than the corresponding buffer | |
63 position. | |
64 | |
65 As a "memory index" (typedef Membpos): | |
66 | |
67 This is the byte index adjusted for the gap. For positions | |
68 before the gap, this is identical to the byte index. For | |
69 positions after the gap, this is the byte index plus the gap | |
70 size. There are two possible memory indices for the gap | |
71 position; the memory index at the beginning of the gap should | |
72 always be used, except in code that deals with manipulating the | |
73 gap, where both indices may be seen. The address of the | |
74 character "at" (i.e. following) a particular position can be | |
75 obtained from the formula | |
76 | |
77 buffer_start_address + memory_index(position) - 1 | |
78 | |
79 except in the case of characters at the gap position. | |
80 | |
81 Other typedefs: | |
82 =============== | |
83 | |
84 Emchar: | |
85 ------- | |
86 This typedef represents a single Emacs character, which can be | |
87 ASCII, ISO-8859, or some extended character, as would typically | |
88 be used for Kanji. Note that the representation of a character | |
89 as an Emchar is *not* the same as the representation of that | |
90 same character in a string; thus, you cannot do the standard | |
91 C trick of passing a pointer to a character to a function that | |
92 expects a string. | |
93 | |
94 An Emchar takes up 19 bits of representation and (for code | |
95 compatibility and such) is compatible with an int. This | |
96 representation is visible on the Lisp level. The important | |
97 characteristics of the Emchar representation are | |
98 | |
99 -- values 0x00 - 0x7f represent ASCII. | |
100 -- values 0x80 - 0xff represent the right half of ISO-8859-1. | |
101 -- values 0x100 and up represent all other characters. | |
102 | |
103 This means that Emchar values are upwardly compatible with | |
104 the standard 8-bit representation of ASCII/ISO-8859-1. | |
105 | |
106 Intbyte: | |
107 -------- | |
108 The data in a buffer or string is logically made up of Intbyte | |
109 objects, where an Intbyte takes up the same amount of space as a | |
110 char. (It is declared differently, though, to catch invalid | |
111 usages.) Strings stored using Intbytes are said to be in | |
112 "internal format". The important characteristics of internal | |
113 format are | |
114 | |
115 -- ASCII characters are represented as a single Intbyte, | |
116 in the range 0 - 0x7f. | |
117 -- All other characters are represented as an Intbyte in | |
118 the range 0x80 - 0x9f followed by one or more Intbytes | |
119 in the range 0xa0 to 0xff. | |
120 | |
121 This leads to a number of desirable properties: | |
122 | |
123 -- Given the position of the beginning of a character, | |
124 you can find the beginning of the next or previous | |
125 character in constant time. | |
126 -- When searching for a substring or an ASCII character | |
127 within the string, you need merely use standard | |
128 searching routines. | |
129 | |
130 array of char: | |
131 -------------- | |
132 Strings that go in or out of Emacs are in "external format", | |
133 typedef'ed as an array of char or a char *. There is more | |
134 than one external format (JIS, EUC, etc.) but they all | |
135 have similar properties. They are modal encodings, | |
136 which is to say that the meaning of particular bytes is | |
137 not fixed but depends on what "mode" the string is currently | |
138 in (e.g. bytes in the range 0 - 0x7f might be | |
139 interpreted as ASCII, or as Hiragana, or as 2-byte Kanji, | |
140 depending on the current mode). The mode starts out in | |
141 ASCII/ISO-8859-1 and is switched using escape sequences -- | |
142 for example, in the JIS encoding, 'ESC $ B' switches to a | |
143 mode where pairs of bytes in the range 0 - 0x7f | |
144 are interpreted as Kanji characters. | |
145 | |
146 External-formatted data is generally desirable for passing | |
147 data between programs because it is upwardly compatible | |
148 with standard ASCII/ISO-8859-1 strings and may require | |
149 less space than internal encodings such as the one | |
150 described above. In addition, some encodings (e.g. JIS) | |
151 keep all characters (except the ESC used to switch modes) | |
152 in the printing ASCII range 0x20 - 0x7e, which results in | |
153 a much higher probability that the data will avoid being | |
154 garbled in transmission. Externally-formatted data is | |
155 generally not very convenient to work with, however, and | |
156 for this reason is usually converted to internal format | |
157 before any work is done on the string. | |
158 | |
159 NOTE: filenames need to be in external format so that | |
160 ISO-8859-1 characters come out correctly. | |
161 | |
162 Charcount: | |
163 ---------- | |
164 This typedef represents a count of characters, such as | |
165 a character offset into a string or the number of | |
166 characters between two positions in a buffer. The | |
167 difference between two Charbpos's is a Charcount, and | |
168 character positions in a string are represented using | |
169 a Charcount. | |
170 | |
171 Bytecount: | |
172 ---------- | |
173 Similar to a Charcount but represents a count of bytes. | |
174 The difference between two Bytebpos's is a Bytecount. | |
175 | |
176 | |
177 Usage of the various representations: | |
178 ===================================== | |
179 | |
180 Memory indices are used in low-level functions in insdel.c and for | |
181 extent endpoints and marker positions. The reason for this is that | |
182 this way, the extents and markers don't need to be updated for most | |
183 insertions, which merely shrink the gap and don't move any | |
184 characters around in memory. | |
185 | |
186 (The beginning-of-gap memory index simplifies insertions w.r.t. | |
187 markers, because text usually gets inserted after markers. For | |
188 extents, it is merely for consistency, because text can get | |
189 inserted either before or after an extent's endpoint depending on | |
190 the open/closedness of the endpoint.) | |
191 | |
192 Byte indices are used in other code that needs to be fast, | |
193 such as the searching, redisplay, and extent-manipulation code. | |
194 | |
195 Buffer positions are used in all other code. This is because this | |
196 representation is easiest to work with (especially since Lisp | |
197 code always uses buffer positions), necessitates the fewest | |
198 changes to existing code, and is the safest (e.g. if the text gets | |
199 shifted underneath a buffer position, it will still point to a | |
200 character; if text is shifted under a byte index, it might point | |
201 to the middle of a character, which would be bad). | |
202 | |
203 Similarly, Charcounts are used in all code that deals with strings | |
204 except for code that needs to be fast, which uses Bytecounts. | |
205 | |
206 Strings are always passed around internally using internal format. | |
207 Conversions to and from external format are performed at the time | |
208 that the data goes in or out of Emacs. | |
209 | |
210 Working with the various representations: | |
211 ========================================= */ | |
212 | |
213 /* We write things this way because it's very important that | |
214 MAX_BYTEBPOS_GAP_SIZE_3 is a multiple of 3. (As it happens, | |
215 65535 is a multiple of 3, but this may not always be the | |
216 case.) */ | |
217 | |
218 | |
219 /* | |
220 1. Character Sets | |
221 ================= | |
222 | |
223 A character set (or "charset") is an ordered set of characters. | |
224 A particular character in a charset is indexed using one or | |
225 more "position codes", which are non-negative integers. | |
226 The number of position codes needed to identify a particular | |
227 character in a charset is called the "dimension" of the | |
228 charset. In XEmacs/Mule, all charsets have 1 or 2 dimensions, | |
229 and the size of all charsets (except for a few special cases) | |
230 is either 94, 96, 94 by 94, or 96 by 96. The range of | |
231 position codes used to index characters from any of these | |
232 types of character sets is as follows: | |
233 | |
234 Charset type Position code 1 Position code 2 | |
235 ------------------------------------------------------------ | |
236 94 33 - 126 N/A | |
237 96 32 - 127 N/A | |
238 94x94 33 - 126 33 - 126 | |
239 96x96 32 - 127 32 - 127 | |
240 | |
241 Note that in the above cases position codes do not start at | |
242 an expected value such as 0 or 1. The reason for this will | |
243 become clear later. | |
244 | |
245 For example, Latin-1 is a 96-character charset, and JISX0208 | |
246 (the Japanese national character set) is a 94x94-character | |
247 charset. | |
248 | |
249 [Note that, although the ranges above define the *valid* | |
250 position codes for a charset, some of the slots in a particular | |
251 charset may in fact be empty. This is the case for JISX0208, | |
252 for example, where (e.g.) all the slots whose first | |
253 position code is in the range 118 - 127 are empty.] | |
254 | |
255 There are three charsets that do not follow the above rules. | |
256 All of them have one dimension, and have ranges of position | |
257 codes as follows: | |
258 | |
259 Charset name Position code 1 | |
260 ------------------------------------ | |
261 ASCII 0 - 127 | |
262 Control-1 0 - 31 | |
263 Composite 0 - some large number | |
264 | |
265 (The upper bound of the position code for composite characters | |
266 has not yet been determined, but it will probably be at | |
267 least 16,383). | |
268 | |
269 ASCII is the union of two subsidiary character sets: | |
270 Printing-ASCII (the printing ASCII character set, | |
271 consisting of position codes 33 - 126, like for a standard | |
272 94-character charset) and Control-ASCII (the non-printing | |
273 characters that would appear in a binary file with codes 0 | |
274 - 32 and 127). | |
275 | |
276 Control-1 contains the non-printing characters that would | |
277 appear in a binary file with codes 128 - 159. | |
278 | |
279 Composite contains characters that are generated by | |
280 overstriking one or more characters from other charsets. | |
281 | |
282 Note that some characters in ASCII, and all characters | |
283 in Control-1, are "control" (non-printing) characters. | |
284 These have no printed representation but instead control | |
285 some other function of the printing (e.g. TAB or 9 moves | |
286 the current character position to the next tab stop). | |
287 All other characters in all charsets are "graphic" | |
288 (printing) characters. | |
289 | |
290 When a binary file is read in, the bytes in the file are | |
291 assigned to character sets as follows: | |
292 | |
293 Bytes Character set Range | |
294 -------------------------------------------------- | |
295 0 - 127 ASCII 0 - 127 | |
296 128 - 159 Control-1 0 - 31 | |
297 160 - 255 Latin-1 32 - 127 | |
298 | |
299 This is a bit ad-hoc but gets the job done. | |
300 | |
301 2. Encodings | |
302 ============ | |
303 | |
304 An "encoding" is a way of numerically representing | |
305 characters from one or more character sets. If an encoding | |
306 only encompasses one character set, then the position codes | |
307 for the characters in that character set could be used | |
308 directly. This is not possible, however, if more than one | |
309 character set is to be used in the encoding. | |
310 | |
311 For example, the conversion detailed above between bytes in | |
312 a binary file and characters is effectively an encoding | |
313 that encompasses the three character sets ASCII, Control-1, | |
314 and Latin-1 in a stream of 8-bit bytes. | |
315 | |
316 Thus, an encoding can be viewed as a way of encoding | |
317 characters from a specified group of character sets using a | |
318 stream of bytes, each of which contains a fixed number of | |
319 bits (but not necessarily 8, as in the common usage of | |
320 "byte"). | |
321 | |
322 Here are descriptions of a couple of common | |
323 encodings: | |
324 | |
325 | |
326 A. Japanese EUC (Extended Unix Code) | |
327 | |
328 This encompasses the character sets: | |
329 - Printing-ASCII, | |
330 - Katakana-JISX0201 (half-width katakana, the right half of JISX0201). | |
331 - Japanese-JISX0208 | |
332 - Japanese-JISX0212 | |
333 It uses 8-bit bytes. | |
334 | |
335 Note that Printing-ASCII and Katakana-JISX0201 are 94-character | |
336 charsets, while Japanese-JISX0208 is a 94x94-character charset. | |
337 | |
338 The encoding is as follows: | |
339 | |
340 Character set Representation (PC == position-code) | |
341 ------------- -------------- | |
342 Printing-ASCII PC1 | |
343 Japanese-JISX0208 PC1 + 0x80 | PC2 + 0x80 | |
344 Katakana-JISX0201 0x8E | PC1 + 0x80 | |
345 | |
346 | |
347 B. JIS7 | |
348 | |
349 This encompasses the character sets: | |
350 - Printing-ASCII | |
351 - Latin-JISX0201 (the left half of JISX0201; this character set is | |
352 very similar to Printing-ASCII and is a 94-character charset) | |
353 - Japanese-JISX0208 | |
354 - Katakana-JISX0201 | |
355 It uses 7-bit bytes. | |
356 | |
357 Unlike Japanese EUC, this is a "modal" encoding, which | |
358 means that there are multiple states that the encoding can | |
359 be in, which affect how the bytes are to be interpreted. | |
360 Special sequences of bytes (called "escape sequences") | |
361 are used to change states. | |
362 | |
363 The encoding is as follows: | |
364 | |
365 Character set Representation | |
366 ------------- -------------- | |
367 Printing-ASCII PC1 | |
368 Latin-JISX0201 PC1 | |
369 Katakana-JISX0201 PC1 | |
370 Japanese-JISX0208 PC1 | PC2 | |
371 | |
372 Escape sequence ASCII equivalent Meaning | |
373 --------------- ---------------- ------- | |
374 0x1B 0x28 0x42 ESC ( B invoke Printing-ASCII | |
375 0x1B 0x28 0x4A ESC ( J invoke Latin-JISX0201 | |
376 0x1B 0x28 0x49 ESC ( I invoke Katakana-JISX0201 | |
377 0x1B 0x24 0x42 ESC $ B invoke Japanese-JISX0208 | |
378 | |
379 Initially, Printing-ASCII is invoked. | |
380 | |
381 3. Internal Mule Encodings | |
382 ========================== | |
383 | |
384 In XEmacs/Mule, each character set is assigned a unique number, | |
385 called a "leading byte". This is used in the encodings of a | |
386 character. Leading bytes are in the range 0x80 - 0xFF | |
387 (except for ASCII, which has a leading byte of 0), although | |
388 some leading bytes are reserved. | |
389 | |
390 Charsets whose leading byte is in the range 0x80 - 0x9F are | |
391 called "official" and are used for built-in charsets. | |
392 Other charsets are called "private" and have leading bytes | |
393 in the range 0xA0 - 0xFF; these are user-defined charsets. | |
394 | |
395 More specifically: | |
396 | |
397 Character set Leading byte | |
398 ------------- ------------ | |
399 ASCII 0 (0x7F in arrays indexed by leading byte) | |
400 Composite 0x8D | |
401 Dimension-1 Official 0x80 - 0x8C/0x8D | |
402 (0x8E is free) | |
403 Control 0x8F | |
404 Dimension-2 Official 0x90 - 0x99 | |
405 (0x9A - 0x9D are free) | |
406 Dimension-1 Private Marker 0x9E | |
407 Dimension-2 Private Marker 0x9F | |
408 Dimension-1 Private 0xA0 - 0xEF | |
409 Dimension-2 Private 0xF0 - 0xFF | |
410 | |
411 There are two internal encodings for characters in XEmacs/Mule. | |
412 One is called "string encoding" and is an 8-bit encoding that | |
413 is used for representing characters in a buffer or string. | |
414 It uses 1 to 4 bytes per character. The other is called | |
415 "character encoding" and is a 19-bit encoding that is used | |
416 for representing characters individually in a variable. | |
417 | |
418 (In the following descriptions, we'll ignore composite | |
419 characters for the moment. We also give a general (structural) | |
420 overview first, followed later by the exact details.) | |
421 | |
422 A. Internal String Encoding | |
423 | |
424 ASCII characters are encoded using their position code directly. | |
425 Other characters are encoded using their leading byte followed | |
426 by their position code(s) with the high bit set. Characters | |
427 in private character sets have their leading byte prefixed with | |
428 a "leading byte prefix", which is either 0x9E or 0x9F. (No | |
429 character sets are ever assigned these leading bytes.) Specifically: | |
430 | |
431 Character set Encoding (PC == position-code) | |
432 ------------- -------- (LB == leading-byte) | |
433 ASCII PC1 | | |
434 Control-1 LB | PC1 + 0xA0 | |
435 Dimension-1 official LB | PC1 + 0x80 | |
436 Dimension-1 private 0x9E | LB | PC1 + 0x80 | |
437 Dimension-2 official LB | PC1 + 0x80 | PC2 + 0x80 | |
438 Dimension-2 private 0x9F | LB | PC1 + 0x80 | PC2 + 0x80 | |
439 | |
440 The basic characteristic of this encoding is that the first byte | |
441 of all characters is in the range 0x00 - 0x9F, and the second and | |
442 following bytes of all characters are in the range 0xA0 - 0xFF. | |
443 This means that it is impossible to get out of sync, or more | |
444 specifically: | |
445 | |
446 1. Given any byte position, the beginning of the character it is | |
447 within can be determined in constant time. | |
448 2. Given any byte position at the beginning of a character, the | |
449 beginning of the next character can be determined in constant | |
450 time. | |
451 3. Given any byte position at the beginning of a character, the | |
452 beginning of the previous character can be determined in constant | |
453 time. | |
454 4. Textual searches can simply treat encoded strings as if they | |
455 were encoded in a one-byte-per-character fashion rather than | |
456 the actual multi-byte encoding. | |
457 | |
458 None of the standard non-modal encodings meet all of these | |
459 conditions. For example, EUC satisfies only (2) and (3), while | |
460 Shift-JIS and Big5 (not yet described) satisfy only (2). (All | |
461 non-modal encodings must satisfy (2), in order to be unambiguous.) | |
462 | |
463 B. Internal Character Encoding | |
464 | |
465 One 19-bit word represents a single character. The word is | |
466 separated into three fields: | |
467 | |
468 Bit number: 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00 | |
469 <------------> <------------------> <------------------> | |
470 Field: 1 2 3 | |
471 | |
472 Note that fields 2 and 3 hold 7 bits each, while field 1 holds 5 bits. | |
473 | |
474 Character set Field 1 Field 2 Field 3 | |
475 ------------- ------- ------- ------- | |
476 ASCII 0 0 PC1 | |
477 range: (00 - 7F) | |
478 Control-1 0 1 PC1 | |
479 range: (00 - 1F) | |
480 Dimension-1 official 0 LB - 0x7F PC1 | |
481 range: (01 - 0D) (20 - 7F) | |
482 Dimension-1 private 0 LB - 0x80 PC1 | |
483 range: (20 - 6F) (20 - 7F) | |
484 Dimension-2 official LB - 0x8F PC1 PC2 | |
485 range: (01 - 0A) (20 - 7F) (20 - 7F) | |
486 Dimension-2 private LB - 0xE1 PC1 PC2 | |
487 range: (0F - 1E) (20 - 7F) (20 - 7F) | |
488 Composite 0x1F ? ? | |
489 | |
490 Note that character codes 0 - 255 are the same as the "binary encoding" | |
491 described above. | |
492 */ | |
493 | |
494 /* | |
495 About Unicode support: | |
496 | |
497 Adding Unicode support is very desirable. Unicode will likely be a | |
498 very common representation in the future, and thus we should | |
499 represent Unicode characters using three bytes instead of four. | |
500 This means we need to find leading bytes for Unicode. Given that | |
501 there are 65,536 characters in Unicode and we can attach 96x96 = | |
502 9,216 characters per leading byte, we need eight leading bytes for | |
503 Unicode. We currently have four free (0x9A - 0x9D), and with a | |
504 little bit of rearranging we can get five: ASCII doesn't really | |
505 need to take up a leading byte. (We could just as well use 0x7F, | |
506 with a little change to the functions that assume that 0x80 is the | |
507 lowest leading byte.) This means we still need to dump three | |
508 leading bytes and move them into private space. The CNS charsets | |
509 are good candidates since they are rarely used, and | |
510 JAPANESE_JISX0208_1978 is becoming less and less used and could | |
511 also be dumped. */ | |
512 | |
513 | |
514 /* Composite characters are characters constructed by overstriking two | |
515 or more regular characters. | |
516 | |
517 1) The old Mule implementation involves storing composite characters | |
518 in a buffer as a tag followed by all of the actual characters | |
519 used to make up the composite character. I think this is a bad | |
520 idea; it greatly complicates code that wants to handle strings | |
521 one character at a time because it has to deal with the possibility | |
522 of great big ungainly characters. It's much more reasonable to | |
523 simply store an index into a table of composite characters. | |
524 | |
525 2) The current implementation only allows for 16,384 separate | |
526 composite characters over the lifetime of the XEmacs process. | |
527 This could become a potential problem if the user | |
528 edited lots of different files that use composite characters. | |
529 Due to FSF bogosity, increasing the number of allowable | |
530 composite characters under Mule would decrease the number | |
531 of possible faces that can exist. Mule already has shrunk | |
532 this to 2048, and further shrinkage would become uncomfortable. | |
533 No such problems exist in XEmacs. | |
534 | |
535 Composite characters could be represented as 0x8D C1 C2 C3, | |
536 where each C[1-3] is in the range 0xA0 - 0xFF. This allows | |
537 for slightly under 2^20 (one million) composite characters | |
538 over the XEmacs process lifetime, and you only need to | |
539 increase the size of a Mule character from 19 to 21 bits. | |
540 Or you could use 0x8D C1 C2 C3 C4, allowing for about | |
541 85 million (slightly over 2^26) composite characters. */ | |
542 | |
543 | |
544 /************************************************************************/ | |
545 /* declarations */ | |
546 /************************************************************************/ | |
547 | |
548 Eistring the_eistring_zero_init, the_eistring_malloc_zero_init; | |
549 | |
550 #define MAX_CHARBPOS_GAP_SIZE_3 (65535/3) | |
551 #define MAX_BYTEBPOS_GAP_SIZE_3 (3 * MAX_CHARBPOS_GAP_SIZE_3) | |
552 | |
553 short three_to_one_table[1 + MAX_BYTEBPOS_GAP_SIZE_3]; | |
554 | |
555 #ifdef MULE | |
556 | |
557 /* Table of number of bytes in the string representation of a character | |
558 indexed by the first byte of that representation. | |
559 | |
560 rep_bytes_by_first_byte(c) is more efficient than the equivalent | |
561 canonical computation: | |
562 | |
563 XCHARSET_REP_BYTES (CHARSET_BY_LEADING_BYTE (c)) */ | |
564 | |
565 const Bytecount rep_bytes_by_first_byte[0xA0] = | |
566 { /* 0x00 - 0x7f are for straight ASCII */ | |
567 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
568 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
569 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
570 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
571 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
572 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
573 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
574 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
575 /* 0x80 - 0x8f are for Dimension-1 official charsets */ | |
576 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |
577 /* 0x90 - 0x9d are for Dimension-2 official charsets */ | |
578 /* 0x9e is for Dimension-1 private charsets */ | |
579 /* 0x9f is for Dimension-2 private charsets */ | |
580 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4 | |
581 }; | |
582 | |
583 #ifdef ENABLE_COMPOSITE_CHARS | |
584 | |
585 /* Hash tables for composite chars. One maps string representing | |
586 composed chars to their equivalent chars; one goes the | |
587 other way. */ | |
588 Lisp_Object Vcomposite_char_char2string_hash_table; | |
589 Lisp_Object Vcomposite_char_string2char_hash_table; | |
590 | |
591 static int composite_char_row_next; | |
592 static int composite_char_col_next; | |
593 | |
594 #endif /* ENABLE_COMPOSITE_CHARS */ | |
595 | |
596 #endif /* MULE */ | |
597 | |
598 | |
599 /************************************************************************/ | |
600 /* qxestr***() functions */ | |
601 /************************************************************************/ | |
602 | |
603 /* Most are inline functions in lisp.h */ | |
604 | |
605 int | |
606 qxesprintf (Intbyte *buffer, const CIntbyte *format, ...) | |
607 { | |
608 va_list args; | |
609 int retval; | |
610 | |
611 va_start (args, format); | |
612 retval = vsprintf ((char *) buffer, format, args); | |
613 va_end (args); | |
614 | |
615 return retval; | |
616 } | |
617 | |
618 /* strcasecmp() implementation from BSD */ | |
619 static Intbyte strcasecmp_charmap[] = { | |
620 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', | |
621 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', | |
622 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', | |
623 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', | |
624 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047', | |
625 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057', | |
626 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', | |
627 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077', | |
628 '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147', | |
629 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', | |
630 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', | |
631 '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137', | |
632 '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147', | |
633 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', | |
634 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', | |
635 '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177', | |
636 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207', | |
637 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217', | |
638 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227', | |
639 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237', | |
640 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247', | |
641 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257', | |
642 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267', | |
643 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277', | |
644 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307', | |
645 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317', | |
646 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327', | |
647 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337', | |
648 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347', | |
649 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', | |
650 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367', | |
651 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377', | |
652 }; | |
653 | |
654 /* A version that works like generic strcasecmp() -- only collapsing | |
655 case in ASCII A-Z/a-z. This is safe on Mule strings due to the | |
656 current representation. | |
657 | |
658 This version was written by some Berkeley coder, favoring | |
659 nanosecond improvements over clarity. In all other versions below, | |
660 we use symmetrical algorithms that may sacrifice a few machine | |
661 cycles but are MUCH MUCH clearer, which counts a lot more. | |
662 */ | |
663 | |
664 int | |
665 qxestrcasecmp (const Intbyte *s1, const Intbyte *s2) | |
666 { | |
667 Intbyte *cm = strcasecmp_charmap; | |
668 | |
669 while (cm[*s1] == cm[*s2++]) | |
670 if (*s1++ == '\0') | |
671 return (0); | |
672 | |
673 return (cm[*s1] - cm[*--s2]); | |
674 } | |
675 | |
676 int | |
677 ascii_strcasecmp (const Char_ASCII *s1, const Char_ASCII *s2) | |
678 { | |
679 return qxestrcasecmp ((const Intbyte *) s1, (const Intbyte *) s2); | |
680 } | |
681 | |
682 int | |
683 qxestrcasecmp_c (const Intbyte *s1, const Char_ASCII *s2) | |
684 { | |
685 return qxestrcasecmp (s1, (const Intbyte *) s2); | |
686 } | |
687 | |
688 /* An internationalized version that collapses case in a general fashion. | |
689 */ | |
690 | |
691 int | |
692 qxestrcasecmp_i18n (const Intbyte *s1, const Intbyte *s2) | |
693 { | |
694 while (*s1 && *s2) | |
695 { | |
696 if (DOWNCASE (0, charptr_emchar (s1)) != | |
697 DOWNCASE (0, charptr_emchar (s2))) | |
698 break; | |
699 INC_CHARPTR (s1); | |
700 INC_CHARPTR (s2); | |
701 } | |
702 | |
703 return (DOWNCASE (0, charptr_emchar (s1)) - | |
704 DOWNCASE (0, charptr_emchar (s2))); | |
705 } | |
706 | |
707 /* The only difference between these next two and | |
708 qxememcasecmp()/qxememcasecmp_i18n() is that these two will stop if | |
709 both strings are equal and less than LEN in length, while | |
710 the mem...() versions would run off the end. */ | |
711 | |
712 int | |
713 qxestrncasecmp (const Intbyte *s1, const Intbyte *s2, Bytecount len) | |
714 { | |
715 Intbyte *cm = strcasecmp_charmap; | |
716 | |
717 while (len--) | |
718 { | |
719 int diff = cm[*s1] - cm[*s2]; | |
720 if (diff != 0) | |
721 return diff; | |
722 if (!*s1) | |
723 return 0; | |
724 s1++, s2++; | |
725 } | |
726 | |
727 return 0; | |
728 } | |
729 | |
730 int | |
731 ascii_strncasecmp (const Char_ASCII *s1, const Char_ASCII *s2, Bytecount len) | |
732 { | |
733 return qxestrncasecmp ((const Intbyte *) s1, (const Intbyte *) s2, len); | |
734 } | |
735 | |
736 int | |
737 qxestrncasecmp_c (const Intbyte *s1, const Char_ASCII *s2, Bytecount len) | |
738 { | |
739 return qxestrncasecmp (s1, (const Intbyte *) s2, len); | |
740 } | |
741 | |
742 int | |
743 qxestrncasecmp_i18n (const Intbyte *s1, const Intbyte *s2, Bytecount len) | |
744 { | |
745 while (len > 0) | |
746 { | |
747 const Intbyte *old_s1 = s1; | |
748 int diff = (DOWNCASE (0, charptr_emchar (s1)) - | |
749 DOWNCASE (0, charptr_emchar (s2))); | |
750 if (diff != 0) | |
751 return diff; | |
752 if (!*s1) | |
753 return 0; | |
754 INC_CHARPTR (s1); | |
755 INC_CHARPTR (s2); | |
756 len -= s1 - old_s1; | |
757 } | |
758 | |
759 return 0; | |
760 } | |
761 | |
762 int | |
763 qxememcmp (const Intbyte *s1, const Intbyte *s2, Bytecount len) | |
764 { | |
765 return memcmp (s1, s2, len); | |
766 } | |
767 | |
768 int | |
769 qxememcasecmp (const Intbyte *s1, const Intbyte *s2, Bytecount len) | |
770 { | |
771 Intbyte *cm = strcasecmp_charmap; | |
772 | |
773 while (len--) | |
774 { | |
775 int diff = cm[*s1] - cm[*s2]; | |
776 if (diff != 0) | |
777 return diff; | |
778 s1++, s2++; | |
779 } | |
780 | |
781 return 0; | |
782 } | |
783 | |
784 int | |
785 qxememcasecmp_i18n (const Intbyte *s1, const Intbyte *s2, Bytecount len) | |
786 { | |
787 while (len > 0) | |
788 { | |
789 const Intbyte *old_s1 = s1; | |
790 int diff = (DOWNCASE (0, charptr_emchar (s1)) - | |
791 DOWNCASE (0, charptr_emchar (s2))); | |
792 if (diff != 0) | |
793 return diff; | |
794 INC_CHARPTR (s1); | |
795 INC_CHARPTR (s2); | |
796 len -= s1 - old_s1; | |
797 } | |
798 | |
799 return 0; | |
800 } | |
801 | |
802 int | |
803 lisp_strcasecmp (Lisp_Object s1, Lisp_Object s2) | |
804 { | |
805 Intbyte *cm = strcasecmp_charmap; | |
806 Intbyte *p1 = XSTRING_DATA (s1); | |
807 Intbyte *p2 = XSTRING_DATA (s2); | |
808 Intbyte *e1 = p1 + XSTRING_LENGTH (s1); | |
809 Intbyte *e2 = p2 + XSTRING_LENGTH (s2); | |
810 | |
811 /* again, we use a symmetric algorithm and favor clarity over | |
812 nanosecond improvements. */ | |
813 while (1) | |
814 { | |
815 /* if we reached the end of either string, compare lengths. | |
816 do NOT compare the final null byte against anything, in case | |
817 the other string also has a null byte at that position. */ | |
818 if (p1 == e1 || p2 == e2) | |
819 return e1 - e2; | |
820 if (cm[*p1] != cm[*p2]) | |
821 return cm[*p1] - cm[*p2]; | |
822 p1++, p2++; | |
823 } | |
824 } | |
825 | |
826 int | |
827 lisp_strcasecmp_i18n (Lisp_Object s1, Lisp_Object s2) | |
828 { | |
829 Intbyte *p1 = XSTRING_DATA (s1); | |
830 Intbyte *p2 = XSTRING_DATA (s2); | |
831 Intbyte *e1 = p1 + XSTRING_LENGTH (s1); | |
832 Intbyte *e2 = p2 + XSTRING_LENGTH (s2); | |
833 | |
834 /* again, we use a symmetric algorithm and favor clarity over | |
835 nanosecond improvements. */ | |
836 while (1) | |
837 { | |
838 /* if we reached the end of either string, compare lengths. | |
839 do NOT compare the final null byte against anything, in case | |
840 the other string also has a null byte at that position. */ | |
841 assert (p1 <= e1); | |
842 assert (p2 <= e2); | |
843 if (p1 == e1 || p2 == e2) | |
844 return e1 - e2; | |
845 if (DOWNCASE (0, charptr_emchar (p1)) != | |
846 DOWNCASE (0, charptr_emchar (p2))) | |
847 return (DOWNCASE (0, charptr_emchar (p1)) - | |
848 DOWNCASE (0, charptr_emchar (p2))); | |
849 INC_CHARPTR (p1); | |
850 INC_CHARPTR (p2); | |
851 } | |
852 } | |
853 | |
854 | |
855 /************************************************************************/ | |
856 /* conversion between textual representations */ | |
857 /************************************************************************/ | |
858 | |
859 /* NOTE: Does not reset the Dynarr. */ | |
860 | |
861 void | |
862 convert_intbyte_string_into_emchar_dynarr (const Intbyte *str, Bytecount len, | |
863 Emchar_dynarr *dyn) | |
864 { | |
865 const Intbyte *strend = str + len; | |
866 | |
867 while (str < strend) | |
868 { | |
869 Emchar ch = charptr_emchar (str); | |
870 Dynarr_add (dyn, ch); | |
871 INC_CHARPTR (str); | |
872 } | |
873 } | |
874 | |
875 Charcount | |
876 convert_intbyte_string_into_emchar_string (const Intbyte *str, Bytecount len, | |
877 Emchar *arr) | |
878 { | |
879 const Intbyte *strend = str + len; | |
880 Charcount newlen = 0; | |
881 while (str < strend) | |
882 { | |
883 Emchar ch = charptr_emchar (str); | |
884 arr[newlen++] = ch; | |
885 INC_CHARPTR (str); | |
886 } | |
887 return newlen; | |
888 } | |
889 | |
890 /* Convert an array of Emchars into the equivalent string representation. | |
891 Store into the given Intbyte dynarr. Does not reset the dynarr. | |
892 Does not add a terminating zero. */ | |
893 | |
894 void | |
895 convert_emchar_string_into_intbyte_dynarr (Emchar *arr, int nels, | |
896 Intbyte_dynarr *dyn) | |
897 { | |
898 Intbyte str[MAX_EMCHAR_LEN]; | |
899 int i; | |
900 | |
901 for (i = 0; i < nels; i++) | |
902 { | |
903 Bytecount len = set_charptr_emchar (str, arr[i]); | |
904 Dynarr_add_many (dyn, str, len); | |
905 } | |
906 } | |
907 | |
908 /* Convert an array of Emchars into the equivalent string representation. | |
909 Malloc the space needed for this and return it. If LEN_OUT is not a | |
910 NULL pointer, store into LEN_OUT the number of Intbytes in the | |
911 malloc()ed string. Note that the actual number of Intbytes allocated | |
912 is one more than this: the returned string is zero-terminated. */ | |
913 | |
914 Intbyte * | |
915 convert_emchar_string_into_malloced_string (Emchar *arr, int nels, | |
916 Bytecount *len_out) | |
917 { | |
918 /* Damn zero-termination. */ | |
919 Intbyte *str = (Intbyte *) alloca (nels * MAX_EMCHAR_LEN + 1); | |
920 Intbyte *strorig = str; | |
921 Bytecount len; | |
922 | |
923 int i; | |
924 | |
925 for (i = 0; i < nels; i++) | |
926 str += set_charptr_emchar (str, arr[i]); | |
927 *str = '\0'; | |
928 len = str - strorig; | |
929 str = (Intbyte *) xmalloc (1 + len); | |
930 memcpy (str, strorig, 1 + len); | |
931 if (len_out) | |
932 *len_out = len; | |
933 return str; | |
934 } | |
935 | |
936 | |
937 /************************************************************************/ | |
938 /* charset properties of strings */ | |
939 /************************************************************************/ | |
940 | |
941 void | |
942 find_charsets_in_intbyte_string (unsigned char *charsets, const Intbyte *str, | |
943 Bytecount len) | |
944 { | |
945 #ifndef MULE | |
946 /* Telescope this. */ | |
947 charsets[0] = 1; | |
948 #else | |
949 const Intbyte *strend = str + len; | |
950 memset (charsets, 0, NUM_LEADING_BYTES); | |
951 | |
952 /* #### SJT doesn't like this. */ | |
953 if (len == 0) | |
954 { | |
955 charsets[XCHARSET_LEADING_BYTE (Vcharset_ascii) - MIN_LEADING_BYTE] = 1; | |
956 return; | |
957 } | |
958 | |
959 while (str < strend) | |
960 { | |
961 charsets[CHAR_LEADING_BYTE (charptr_emchar (str)) - MIN_LEADING_BYTE] = | |
962 1; | |
963 INC_CHARPTR (str); | |
964 } | |
965 #endif | |
966 } | |
967 | |
968 void | |
969 find_charsets_in_emchar_string (unsigned char *charsets, const Emchar *str, | |
970 Charcount len) | |
971 { | |
972 #ifndef MULE | |
973 /* Telescope this. */ | |
974 charsets[0] = 1; | |
975 #else | |
976 int i; | |
977 | |
978 memset (charsets, 0, NUM_LEADING_BYTES); | |
979 | |
980 /* #### SJT doesn't like this. */ | |
981 if (len == 0) | |
982 { | |
983 charsets[XCHARSET_LEADING_BYTE (Vcharset_ascii) - MIN_LEADING_BYTE] = 1; | |
984 return; | |
985 } | |
986 | |
987 for (i = 0; i < len; i++) | |
988 { | |
989 charsets[CHAR_LEADING_BYTE (str[i]) - MIN_LEADING_BYTE] = 1; | |
990 } | |
991 #endif | |
992 } | |
993 | |
994 int | |
995 intbyte_string_displayed_columns (const Intbyte *str, Bytecount len) | |
996 { | |
997 int cols = 0; | |
998 const Intbyte *end = str + len; | |
999 | |
1000 while (str < end) | |
1001 { | |
1002 #ifdef MULE | |
1003 Emchar ch = charptr_emchar (str); | |
1004 cols += XCHARSET_COLUMNS (CHAR_CHARSET (ch)); | |
1005 #else | |
1006 cols++; | |
1007 #endif | |
1008 INC_CHARPTR (str); | |
1009 } | |
1010 | |
1011 return cols; | |
1012 } | |
1013 | |
1014 int | |
1015 emchar_string_displayed_columns (const Emchar *str, Charcount len) | |
1016 { | |
1017 #ifdef MULE | |
1018 int cols = 0; | |
1019 int i; | |
1020 | |
1021 for (i = 0; i < len; i++) | |
1022 cols += XCHARSET_COLUMNS (CHAR_CHARSET (str[i])); | |
1023 | |
1024 return cols; | |
1025 #else /* not MULE */ | |
1026 return len; | |
1027 #endif | |
1028 } | |
1029 | |
1030 Charcount | |
1031 intbyte_string_nonascii_chars (const Intbyte *str, Bytecount len) | |
1032 { | |
1033 #ifdef MULE | |
1034 const Intbyte *end = str + len; | |
1035 Charcount retval = 0; | |
1036 | |
1037 while (str < end) | |
1038 { | |
1039 if (!BYTE_ASCII_P (*str)) | |
1040 retval++; | |
1041 INC_CHARPTR (str); | |
1042 } | |
1043 | |
1044 return retval; | |
1045 #else | |
1046 return 0; | |
1047 #endif | |
1048 } | |
1049 | |
1050 | |
1051 /***************************************************************************/ | |
1052 /* Eistring helper functions */ | |
1053 /***************************************************************************/ | |
1054 | |
1055 int | |
1056 eistr_casefiddle_1 (Intbyte *olddata, Bytecount len, Intbyte *newdata, | |
1057 int downp) | |
1058 { | |
1059 Intbyte *endp = olddata + len; | |
1060 Intbyte *newp = newdata; | |
1061 int changedp = 0; | |
1062 | |
1063 while (olddata < endp) | |
1064 { | |
1065 Emchar c = charptr_emchar (olddata); | |
1066 Emchar newc; | |
1067 | |
1068 if (downp) | |
1069 newc = DOWNCASE (0, c); | |
1070 else | |
1071 newc = UPCASE (0, c); | |
1072 | |
1073 if (c != newc) | |
1074 changedp = 1; | |
1075 | |
1076 newp += set_charptr_emchar (newp, newc); | |
1077 INC_CHARPTR (olddata); | |
1078 } | |
1079 | |
1080 *newp = '\0'; | |
1081 | |
1082 return changedp ? newp - newdata : 0; | |
1083 } | |
1084 | |
/* Return a buffer size that is at least NEEDED_SIZE, starting from
   OLDBUFSIZE.  If OLDBUFSIZE already suffices it is returned unchanged.
   Otherwise it is grown repeatedly by a factor of 3/2 (with a floor of
   32) until it is large enough.  If a further growth step would overflow
   an int, NEEDED_SIZE itself is returned instead. */
int
eifind_large_enough_buffer (int oldbufsize, int needed_size)
{
  while (oldbufsize < needed_size)
    {
      int grown;

      if (oldbufsize <= 0)
        /* Scaling a non-positive size cannot get above the floor. */
        grown = 0;
      else if (oldbufsize > INT_MAX - oldbufsize / 2)
        /* 3/2 growth is not representable; settle for an exact fit. */
        return needed_size;
      else
        /* x + x/2 equals x*3/2 under truncating division, but without
           the overflow-prone intermediate x*3. */
        grown = oldbufsize + oldbufsize / 2;

      oldbufsize = grown < 32 ? 32 : grown;
    }

  return oldbufsize;
}
1096 | |
1097 void | |
1098 eito_malloc_1 (Eistring *ei) | |
1099 { | |
1100 if (ei->mallocp_) | |
1101 return; | |
1102 ei->mallocp_ = 1; | |
1103 if (ei->data_) | |
1104 { | |
1105 Intbyte *newdata; | |
1106 | |
1107 ei->max_size_allocated_ = | |
1108 eifind_large_enough_buffer (0, ei->bytelen_ + 1); | |
1109 newdata = (Intbyte *) xmalloc (ei->max_size_allocated_); | |
1110 memcpy (newdata, ei->data_, ei->bytelen_ + 1); | |
1111 ei->data_ = newdata; | |
1112 } | |
1113 | |
1114 if (ei->extdata_) | |
1115 { | |
1116 Extbyte *newdata = (Extbyte *) xmalloc (ei->extlen_ + 2); | |
1117 | |
1118 memcpy (newdata, ei->extdata_, ei->extlen_); | |
1119 /* Double null-terminate in case of Unicode data */ | |
1120 newdata[ei->extlen_] = '\0'; | |
1121 newdata[ei->extlen_ + 1] = '\0'; | |
1122 ei->extdata_ = newdata; | |
1123 } | |
1124 } | |
1125 | |
1126 int | |
1127 eicmp_1 (Eistring *ei, Bytecount off, Charcount charoff, | |
1128 Bytecount len, Charcount charlen, const Intbyte *data, | |
1129 const Eistring *ei2, int is_c, int fold_case) | |
1130 { | |
1131 assert ((off < 0) != (charoff < 0)); | |
1132 if (off < 0) | |
1133 { | |
1134 off = charcount_to_bytecount (ei->data_, charoff); | |
1135 if (charlen < 0) | |
1136 len = -1; | |
1137 else | |
1138 len = charcount_to_bytecount (ei->data_ + off, charlen); | |
1139 } | |
1140 if (len < 0) | |
1141 len = ei->bytelen_ - off; | |
1142 | |
1143 assert (off >= 0 && off <= ei->bytelen_); | |
1144 assert (len >= 0 && off + len <= ei->bytelen_); | |
1145 assert ((data == 0) != (ei == 0)); | |
1146 assert ((is_c != 0) == (data != 0)); | |
1147 assert (fold_case >= 0 && fold_case <= 2); | |
1148 | |
1149 { | |
1150 Bytecount dstlen; | |
1151 int result; | |
1152 const Intbyte *src = ei->data_, *dst; | |
1153 Bytecount cmplen; | |
1154 | |
1155 if (data) | |
1156 { | |
1157 dst = data; | |
1158 dstlen = qxestrlen (data); | |
1159 } | |
1160 else | |
1161 { | |
1162 dst = ei2->data_; | |
1163 dstlen = ei2->bytelen_; | |
1164 } | |
1165 | |
1166 if (is_c) | |
1167 EI_ASSERT_ASCII ((Char_ASCII *) dst, dstlen); | |
1168 | |
1169 cmplen = min (len, dstlen); | |
1170 result = (fold_case == 0 ? qxememcmp (src, dst, cmplen) : | |
1171 fold_case == 1 ? qxememcasecmp (src, dst, cmplen) : | |
1172 qxememcasecmp_i18n (src, dst, cmplen)); | |
1173 | |
1174 if (result) | |
1175 return result; | |
1176 | |
1177 return len - dstlen; | |
1178 } | |
1179 } | |
1180 | |
1181 Intbyte * | |
1182 eicpyout_malloc_fmt (Eistring *eistr, Bytecount *len_out, Internal_Format fmt) | |
1183 { | |
1184 Intbyte *ptr; | |
1185 | |
1186 assert (fmt == FORMAT_DEFAULT); | |
1187 ptr = xnew_array (Intbyte, eistr->bytelen_ + 1); | |
1188 if (len_out) | |
1189 *len_out = eistr->bytelen_; | |
1190 memcpy (ptr, eistr->data_, eistr->bytelen_ + 1); | |
1191 return ptr; | |
1192 } | |
1193 | |
1194 | |
1195 /************************************************************************/ | |
1196 /* Charcount/Bytecount conversion */ | |
1197 /************************************************************************/ | |
1198 | |
1199 /* Optimization. Do it. Live it. Love it. */ | |
1200 | |
1201 #ifdef MULE | |
1202 | |
1203 /* We include the basic functions here that require no specific | |
1204 knowledge of how data is Mule-encoded into a buffer other | |
1205 than the basic (00 - 7F), (80 - 9F), (A0 - FF) scheme. | |
1206 Anything that requires more specific knowledge goes into | |
1207 mule-charset.c. */ | |
1208 | |
1209 /* Given a pointer to a text string and a length in bytes, return | |
1210 the equivalent length in characters. */ | |
1211 | |
1212 Charcount | |
1213 bytecount_to_charcount (const Intbyte *ptr, Bytecount len) | |
1214 { | |
1215 Charcount count = 0; | |
1216 const Intbyte *end = ptr + len; | |
1217 | |
1218 #if SIZEOF_LONG == 8 | |
1219 # define STRIDE_TYPE long | |
1220 # define HIGH_BIT_MASK 0x8080808080808080UL | |
1221 #elif SIZEOF_LONG_LONG == 8 && !(defined (i386) || defined (__i386__)) | |
1222 # define STRIDE_TYPE long long | |
1223 # define HIGH_BIT_MASK 0x8080808080808080ULL | |
1224 #elif SIZEOF_LONG == 4 | |
1225 # define STRIDE_TYPE long | |
1226 # define HIGH_BIT_MASK 0x80808080UL | |
1227 #else | |
1228 # error Add support for 128-bit systems here | |
1229 #endif | |
1230 | |
1231 #define ALIGN_BITS ((EMACS_UINT) (ALIGNOF (STRIDE_TYPE) - 1)) | |
1232 #define ALIGN_MASK (~ ALIGN_BITS) | |
1233 #define ALIGNED(ptr) ((((EMACS_UINT) ptr) & ALIGN_BITS) == 0) | |
1234 #define STRIDE sizeof (STRIDE_TYPE) | |
1235 | |
1236 while (ptr < end) | |
1237 { | |
1238 if (BYTE_ASCII_P (*ptr)) | |
1239 { | |
1240 /* optimize for long stretches of ASCII */ | |
1241 if (! ALIGNED (ptr)) | |
1242 ptr++, count++; | |
1243 else | |
1244 { | |
1245 const unsigned STRIDE_TYPE *ascii_end = | |
1246 (const unsigned STRIDE_TYPE *) ptr; | |
1247 /* This loop screams, because we can detect ASCII | |
1248 characters 4 or 8 at a time. */ | |
1249 while ((const Intbyte *) ascii_end + STRIDE <= end | |
1250 && !(*ascii_end & HIGH_BIT_MASK)) | |
1251 ascii_end++; | |
1252 if ((Intbyte *) ascii_end == ptr) | |
1253 ptr++, count++; | |
1254 else | |
1255 { | |
1256 count += (Intbyte *) ascii_end - ptr; | |
1257 ptr = (Intbyte *) ascii_end; | |
1258 } | |
1259 } | |
1260 } | |
1261 else | |
1262 { | |
1263 /* optimize for successive characters from the same charset */ | |
1264 Intbyte leading_byte = *ptr; | |
1265 int bytes = REP_BYTES_BY_FIRST_BYTE (leading_byte); | |
1266 while ((ptr < end) && (*ptr == leading_byte)) | |
1267 ptr += bytes, count++; | |
1268 } | |
1269 } | |
1270 | |
1271 /* Bomb out if the specified substring ends in the middle | |
1272 of a character. Note that we might have already gotten | |
1273 a core dump above from an invalid reference, but at least | |
1274 we will get no farther than here. | |
1275 | |
1276 This also catches len < 0. */ | |
1277 charbpos_checking_assert (ptr == end); | |
1278 | |
1279 return count; | |
1280 } | |
1281 | |
1282 /* Given a pointer to a text string and a length in characters, return | |
1283 the equivalent length in bytes. */ | |
1284 | |
1285 Bytecount | |
1286 charcount_to_bytecount (const Intbyte *ptr, Charcount len) | |
1287 { | |
1288 const Intbyte *newptr = ptr; | |
1289 | |
1290 charbpos_checking_assert (len >= 0); | |
1291 while (len > 0) | |
1292 { | |
1293 INC_CHARPTR (newptr); | |
1294 len--; | |
1295 } | |
1296 return newptr - ptr; | |
1297 } | |
1298 | |
1299 inline static void | |
1300 update_entirely_ascii_p_flag (struct buffer *buf) | |
1301 { | |
1302 buf->text->entirely_ascii_p = | |
1303 (buf->text->mule_bufmin == 1 && | |
1304 buf->text->mule_bufmax == buf->text->bufz && | |
1305 !buf->text->mule_shifter && | |
1306 !buf->text->mule_three_p); | |
1307 } | |
1308 | |
1309 /* The next two functions are the actual meat behind the | |
1310 charbpos-to-bytebpos and bytebpos-to-charbpos conversions. Currently | |
1311 the method they use is fairly unsophisticated; see buffer.h. | |
1312 | |
1313 Note that charbpos_to_bytebpos_func() is probably the most-called | |
1314 function in all of XEmacs. Therefore, it must be FAST FAST FAST. | |
1315 This is the reason why so much of the code is duplicated. | |
1316 | |
1317 Similar considerations apply to bytebpos_to_charbpos_func(), although | |
1318 less so because the function is not called so often. | |
1319 | |
1320 #### At some point this should use a more sophisticated method; | |
1321 see buffer.h. */ | |
1322 | |
1323 static int not_very_random_number; | |
1324 | |
1325 Bytebpos | |
1326 charbpos_to_bytebpos_func (struct buffer *buf, Charbpos x) | |
1327 { | |
1328 Charbpos bufmin; | |
1329 Charbpos bufmax; | |
1330 Bytebpos bytmin; | |
1331 Bytebpos bytmax; | |
1332 int size; | |
1333 int forward_p; | |
1334 Bytebpos retval; | |
1335 int diff_so_far; | |
1336 int add_to_cache = 0; | |
1337 | |
1338 /* Check for some cached positions, for speed. */ | |
1339 if (x == BUF_PT (buf)) | |
1340 return BI_BUF_PT (buf); | |
1341 if (x == BUF_ZV (buf)) | |
1342 return BI_BUF_ZV (buf); | |
1343 if (x == BUF_BEGV (buf)) | |
1344 return BI_BUF_BEGV (buf); | |
1345 | |
1346 bufmin = buf->text->mule_bufmin; | |
1347 bufmax = buf->text->mule_bufmax; | |
1348 bytmin = buf->text->mule_bytmin; | |
1349 bytmax = buf->text->mule_bytmax; | |
1350 size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p; | |
1351 | |
1352 /* The basic idea here is that we shift the "known region" up or down | |
1353 until it overlaps the specified position. We do this by moving | |
1354 the upper bound of the known region up one character at a time, | |
1355 and moving the lower bound of the known region up as necessary | |
1356 when the size of the character just seen changes. | |
1357 | |
1358 We optimize this, however, by first shifting the known region to | |
1359 one of the cached points if it's close by. (We don't check BEG or | |
1360 Z, even though they're cached; most of the time these will be the | |
1361 same as BEGV and ZV, and when they're not, they're not likely | |
1362 to be used.) */ | |
1363 | |
1364 if (x > bufmax) | |
1365 { | |
1366 Charbpos diffmax = x - bufmax; | |
1367 Charbpos diffpt = x - BUF_PT (buf); | |
1368 Charbpos diffzv = BUF_ZV (buf) - x; | |
1369 /* #### This value could stand some more exploration. */ | |
1370 Charcount heuristic_hack = (bufmax - bufmin) >> 2; | |
1371 | |
1372 /* Check if the position is closer to PT or ZV than to the | |
1373 end of the known region. */ | |
1374 | |
1375 if (diffpt < 0) | |
1376 diffpt = -diffpt; | |
1377 if (diffzv < 0) | |
1378 diffzv = -diffzv; | |
1379 | |
1380 /* But also implement a heuristic that favors the known region | |
1381 over PT or ZV. The reason for this is that switching to | |
1382 PT or ZV will wipe out the knowledge in the known region, | |
1383 which might be annoying if the known region is large and | |
1384 PT or ZV is not that much closer than the end of the known | |
1385 region. */ | |
1386 | |
1387 diffzv += heuristic_hack; | |
1388 diffpt += heuristic_hack; | |
1389 if (diffpt < diffmax && diffpt <= diffzv) | |
1390 { | |
1391 bufmax = bufmin = BUF_PT (buf); | |
1392 bytmax = bytmin = BI_BUF_PT (buf); | |
1393 /* We set the size to 1 even though it doesn't really | |
1394 matter because the new known region contains no | |
1395 characters. We do this because this is the most | |
1396 likely size of the characters around the new known | |
1397 region, and we avoid potential yuckiness that is | |
1398 done when size == 3. */ | |
1399 size = 1; | |
1400 } | |
1401 if (diffzv < diffmax) | |
1402 { | |
1403 bufmax = bufmin = BUF_ZV (buf); | |
1404 bytmax = bytmin = BI_BUF_ZV (buf); | |
1405 size = 1; | |
1406 } | |
1407 } | |
1408 #ifdef ERROR_CHECK_CHARBPOS | |
1409 else if (x >= bufmin) | |
1410 abort (); | |
1411 #endif | |
1412 else | |
1413 { | |
1414 Charbpos diffmin = bufmin - x; | |
1415 Charbpos diffpt = BUF_PT (buf) - x; | |
1416 Charbpos diffbegv = x - BUF_BEGV (buf); | |
1417 /* #### This value could stand some more exploration. */ | |
1418 Charcount heuristic_hack = (bufmax - bufmin) >> 2; | |
1419 | |
1420 if (diffpt < 0) | |
1421 diffpt = -diffpt; | |
1422 if (diffbegv < 0) | |
1423 diffbegv = -diffbegv; | |
1424 | |
1425 /* But also implement a heuristic that favors the known region -- | |
1426 see above. */ | |
1427 | |
1428 diffbegv += heuristic_hack; | |
1429 diffpt += heuristic_hack; | |
1430 | |
1431 if (diffpt < diffmin && diffpt <= diffbegv) | |
1432 { | |
1433 bufmax = bufmin = BUF_PT (buf); | |
1434 bytmax = bytmin = BI_BUF_PT (buf); | |
1435 /* We set the size to 1 even though it doesn't really | |
1436 matter because the new known region contains no | |
1437 characters. We do this because this is the most | |
1438 likely size of the characters around the new known | |
1439 region, and we avoid potential yuckiness that is | |
1440 done when size == 3. */ | |
1441 size = 1; | |
1442 } | |
1443 if (diffbegv < diffmin) | |
1444 { | |
1445 bufmax = bufmin = BUF_BEGV (buf); | |
1446 bytmax = bytmin = BI_BUF_BEGV (buf); | |
1447 size = 1; | |
1448 } | |
1449 } | |
1450 | |
1451 diff_so_far = x > bufmax ? x - bufmax : bufmin - x; | |
1452 if (diff_so_far > 50) | |
1453 { | |
1454 /* If we have to move more than a certain amount, then look | |
1455 into our cache. */ | |
1456 int minval = INT_MAX; | |
1457 int found = 0; | |
1458 int i; | |
1459 | |
1460 add_to_cache = 1; | |
1461 /* I considered keeping the positions ordered. This would speed | |
1462 up this loop, but updating the cache would take longer, so | |
1463 it doesn't seem like it would really matter. */ | |
1464 for (i = 0; i < 16; i++) | |
1465 { | |
1466 int diff = buf->text->mule_charbpos_cache[i] - x; | |
1467 | |
1468 if (diff < 0) | |
1469 diff = -diff; | |
1470 if (diff < minval) | |
1471 { | |
1472 minval = diff; | |
1473 found = i; | |
1474 } | |
1475 } | |
1476 | |
1477 if (minval < diff_so_far) | |
1478 { | |
1479 bufmax = bufmin = buf->text->mule_charbpos_cache[found]; | |
1480 bytmax = bytmin = buf->text->mule_bytebpos_cache[found]; | |
1481 size = 1; | |
1482 } | |
1483 } | |
1484 | |
1485 /* It's conceivable that the caching above could lead to X being | |
1486 the same as one of the range edges. */ | |
1487 if (x >= bufmax) | |
1488 { | |
1489 Bytebpos newmax; | |
1490 Bytecount newsize; | |
1491 | |
1492 forward_p = 1; | |
1493 while (x > bufmax) | |
1494 { | |
1495 newmax = bytmax; | |
1496 | |
1497 INC_BYTEBPOS (buf, newmax); | |
1498 newsize = newmax - bytmax; | |
1499 if (newsize != size) | |
1500 { | |
1501 bufmin = bufmax; | |
1502 bytmin = bytmax; | |
1503 size = newsize; | |
1504 } | |
1505 bytmax = newmax; | |
1506 bufmax++; | |
1507 } | |
1508 retval = bytmax; | |
1509 | |
1510 /* #### Should go past the found location to reduce the number | |
1511 of times that this function is called */ | |
1512 } | |
1513 else /* x < bufmin */ | |
1514 { | |
1515 Bytebpos newmin; | |
1516 Bytecount newsize; | |
1517 | |
1518 forward_p = 0; | |
1519 while (x < bufmin) | |
1520 { | |
1521 newmin = bytmin; | |
1522 | |
1523 DEC_BYTEBPOS (buf, newmin); | |
1524 newsize = bytmin - newmin; | |
1525 if (newsize != size) | |
1526 { | |
1527 bufmax = bufmin; | |
1528 bytmax = bytmin; | |
1529 size = newsize; | |
1530 } | |
1531 bytmin = newmin; | |
1532 bufmin--; | |
1533 } | |
1534 retval = bytmin; | |
1535 | |
1536 /* #### Should go past the found location to reduce the number | |
1537 of times that this function is called | |
1538 */ | |
1539 } | |
1540 | |
1541 /* If size is three, than we have to max sure that the range we | |
1542 discovered isn't too large, because we use a fixed-length | |
1543 table to divide by 3. */ | |
1544 | |
1545 if (size == 3) | |
1546 { | |
1547 int gap = bytmax - bytmin; | |
1548 buf->text->mule_three_p = 1; | |
1549 buf->text->mule_shifter = 1; | |
1550 | |
1551 if (gap > MAX_BYTEBPOS_GAP_SIZE_3) | |
1552 { | |
1553 if (forward_p) | |
1554 { | |
1555 bytmin = bytmax - MAX_BYTEBPOS_GAP_SIZE_3; | |
1556 bufmin = bufmax - MAX_CHARBPOS_GAP_SIZE_3; | |
1557 } | |
1558 else | |
1559 { | |
1560 bytmax = bytmin + MAX_BYTEBPOS_GAP_SIZE_3; | |
1561 bufmax = bufmin + MAX_CHARBPOS_GAP_SIZE_3; | |
1562 } | |
1563 } | |
1564 } | |
1565 else | |
1566 { | |
1567 buf->text->mule_three_p = 0; | |
1568 if (size == 4) | |
1569 buf->text->mule_shifter = 2; | |
1570 else | |
1571 buf->text->mule_shifter = size - 1; | |
1572 } | |
1573 | |
1574 buf->text->mule_bufmin = bufmin; | |
1575 buf->text->mule_bufmax = bufmax; | |
1576 buf->text->mule_bytmin = bytmin; | |
1577 buf->text->mule_bytmax = bytmax; | |
1578 update_entirely_ascii_p_flag (buf); | |
1579 | |
1580 if (add_to_cache) | |
1581 { | |
1582 int replace_loc; | |
1583 | |
1584 /* We throw away a "random" cached value and replace it with | |
1585 the new value. It doesn't actually have to be very random | |
1586 at all, just evenly distributed. | |
1587 | |
1588 #### It would be better to use a least-recently-used algorithm | |
1589 or something that tries to space things out, but I'm not sure | |
1590 it's worth it to go to the trouble of maintaining that. */ | |
1591 not_very_random_number += 621; | |
1592 replace_loc = not_very_random_number & 15; | |
1593 buf->text->mule_charbpos_cache[replace_loc] = x; | |
1594 buf->text->mule_bytebpos_cache[replace_loc] = retval; | |
1595 } | |
1596 | |
1597 return retval; | |
1598 } | |
1599 | |
1600 /* The logic in this function is almost identical to the logic in | |
1601 the previous function. */ | |
1602 | |
1603 Charbpos | |
1604 bytebpos_to_charbpos_func (struct buffer *buf, Bytebpos x) | |
1605 { | |
1606 Charbpos bufmin; | |
1607 Charbpos bufmax; | |
1608 Bytebpos bytmin; | |
1609 Bytebpos bytmax; | |
1610 int size; | |
1611 int forward_p; | |
1612 Charbpos retval; | |
1613 int diff_so_far; | |
1614 int add_to_cache = 0; | |
1615 | |
1616 /* Check for some cached positions, for speed. */ | |
1617 if (x == BI_BUF_PT (buf)) | |
1618 return BUF_PT (buf); | |
1619 if (x == BI_BUF_ZV (buf)) | |
1620 return BUF_ZV (buf); | |
1621 if (x == BI_BUF_BEGV (buf)) | |
1622 return BUF_BEGV (buf); | |
1623 | |
1624 bufmin = buf->text->mule_bufmin; | |
1625 bufmax = buf->text->mule_bufmax; | |
1626 bytmin = buf->text->mule_bytmin; | |
1627 bytmax = buf->text->mule_bytmax; | |
1628 size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p; | |
1629 | |
1630 /* The basic idea here is that we shift the "known region" up or down | |
1631 until it overlaps the specified position. We do this by moving | |
1632 the upper bound of the known region up one character at a time, | |
1633 and moving the lower bound of the known region up as necessary | |
1634 when the size of the character just seen changes. | |
1635 | |
1636 We optimize this, however, by first shifting the known region to | |
1637 one of the cached points if it's close by. (We don't check BI_BEG or | |
1638 BI_Z, even though they're cached; most of the time these will be the | |
1639 same as BI_BEGV and BI_ZV, and when they're not, they're not likely | |
1640 to be used.) */ | |
1641 | |
1642 if (x > bytmax) | |
1643 { | |
1644 Bytebpos diffmax = x - bytmax; | |
1645 Bytebpos diffpt = x - BI_BUF_PT (buf); | |
1646 Bytebpos diffzv = BI_BUF_ZV (buf) - x; | |
1647 /* #### This value could stand some more exploration. */ | |
1648 Bytecount heuristic_hack = (bytmax - bytmin) >> 2; | |
1649 | |
1650 /* Check if the position is closer to PT or ZV than to the | |
1651 end of the known region. */ | |
1652 | |
1653 if (diffpt < 0) | |
1654 diffpt = -diffpt; | |
1655 if (diffzv < 0) | |
1656 diffzv = -diffzv; | |
1657 | |
1658 /* But also implement a heuristic that favors the known region | |
1659 over BI_PT or BI_ZV. The reason for this is that switching to | |
1660 BI_PT or BI_ZV will wipe out the knowledge in the known region, | |
1661 which might be annoying if the known region is large and | |
1662 BI_PT or BI_ZV is not that much closer than the end of the known | |
1663 region. */ | |
1664 | |
1665 diffzv += heuristic_hack; | |
1666 diffpt += heuristic_hack; | |
1667 if (diffpt < diffmax && diffpt <= diffzv) | |
1668 { | |
1669 bufmax = bufmin = BUF_PT (buf); | |
1670 bytmax = bytmin = BI_BUF_PT (buf); | |
1671 /* We set the size to 1 even though it doesn't really | |
1672 matter because the new known region contains no | |
1673 characters. We do this because this is the most | |
1674 likely size of the characters around the new known | |
1675 region, and we avoid potential yuckiness that is | |
1676 done when size == 3. */ | |
1677 size = 1; | |
1678 } | |
1679 if (diffzv < diffmax) | |
1680 { | |
1681 bufmax = bufmin = BUF_ZV (buf); | |
1682 bytmax = bytmin = BI_BUF_ZV (buf); | |
1683 size = 1; | |
1684 } | |
1685 } | |
1686 #ifdef ERROR_CHECK_CHARBPOS | |
1687 else if (x >= bytmin) | |
1688 abort (); | |
1689 #endif | |
1690 else | |
1691 { | |
1692 Bytebpos diffmin = bytmin - x; | |
1693 Bytebpos diffpt = BI_BUF_PT (buf) - x; | |
1694 Bytebpos diffbegv = x - BI_BUF_BEGV (buf); | |
1695 /* #### This value could stand some more exploration. */ | |
1696 Bytecount heuristic_hack = (bytmax - bytmin) >> 2; | |
1697 | |
1698 if (diffpt < 0) | |
1699 diffpt = -diffpt; | |
1700 if (diffbegv < 0) | |
1701 diffbegv = -diffbegv; | |
1702 | |
1703 /* But also implement a heuristic that favors the known region -- | |
1704 see above. */ | |
1705 | |
1706 diffbegv += heuristic_hack; | |
1707 diffpt += heuristic_hack; | |
1708 | |
1709 if (diffpt < diffmin && diffpt <= diffbegv) | |
1710 { | |
1711 bufmax = bufmin = BUF_PT (buf); | |
1712 bytmax = bytmin = BI_BUF_PT (buf); | |
1713 /* We set the size to 1 even though it doesn't really | |
1714 matter because the new known region contains no | |
1715 characters. We do this because this is the most | |
1716 likely size of the characters around the new known | |
1717 region, and we avoid potential yuckiness that is | |
1718 done when size == 3. */ | |
1719 size = 1; | |
1720 } | |
1721 if (diffbegv < diffmin) | |
1722 { | |
1723 bufmax = bufmin = BUF_BEGV (buf); | |
1724 bytmax = bytmin = BI_BUF_BEGV (buf); | |
1725 size = 1; | |
1726 } | |
1727 } | |
1728 | |
1729 diff_so_far = x > bytmax ? x - bytmax : bytmin - x; | |
1730 if (diff_so_far > 50) | |
1731 { | |
1732 /* If we have to move more than a certain amount, then look | |
1733 into our cache. */ | |
1734 int minval = INT_MAX; | |
1735 int found = 0; | |
1736 int i; | |
1737 | |
1738 add_to_cache = 1; | |
1739 /* I considered keeping the positions ordered. This would speed | |
1740 up this loop, but updating the cache would take longer, so | |
1741 it doesn't seem like it would really matter. */ | |
1742 for (i = 0; i < 16; i++) | |
1743 { | |
1744 int diff = buf->text->mule_bytebpos_cache[i] - x; | |
1745 | |
1746 if (diff < 0) | |
1747 diff = -diff; | |
1748 if (diff < minval) | |
1749 { | |
1750 minval = diff; | |
1751 found = i; | |
1752 } | |
1753 } | |
1754 | |
1755 if (minval < diff_so_far) | |
1756 { | |
1757 bufmax = bufmin = buf->text->mule_charbpos_cache[found]; | |
1758 bytmax = bytmin = buf->text->mule_bytebpos_cache[found]; | |
1759 size = 1; | |
1760 } | |
1761 } | |
1762 | |
1763 /* It's conceivable that the caching above could lead to X being | |
1764 the same as one of the range edges. */ | |
1765 if (x >= bytmax) | |
1766 { | |
1767 Bytebpos newmax; | |
1768 Bytecount newsize; | |
1769 | |
1770 forward_p = 1; | |
1771 while (x > bytmax) | |
1772 { | |
1773 newmax = bytmax; | |
1774 | |
1775 INC_BYTEBPOS (buf, newmax); | |
1776 newsize = newmax - bytmax; | |
1777 if (newsize != size) | |
1778 { | |
1779 bufmin = bufmax; | |
1780 bytmin = bytmax; | |
1781 size = newsize; | |
1782 } | |
1783 bytmax = newmax; | |
1784 bufmax++; | |
1785 } | |
1786 retval = bufmax; | |
1787 | |
1788 /* #### Should go past the found location to reduce the number | |
1789 of times that this function is called */ | |
1790 } | |
1791 else /* x <= bytmin */ | |
1792 { | |
1793 Bytebpos newmin; | |
1794 Bytecount newsize; | |
1795 | |
1796 forward_p = 0; | |
1797 while (x < bytmin) | |
1798 { | |
1799 newmin = bytmin; | |
1800 | |
1801 DEC_BYTEBPOS (buf, newmin); | |
1802 newsize = bytmin - newmin; | |
1803 if (newsize != size) | |
1804 { | |
1805 bufmax = bufmin; | |
1806 bytmax = bytmin; | |
1807 size = newsize; | |
1808 } | |
1809 bytmin = newmin; | |
1810 bufmin--; | |
1811 } | |
1812 retval = bufmin; | |
1813 | |
1814 /* #### Should go past the found location to reduce the number | |
1815 of times that this function is called | |
1816 */ | |
1817 } | |
1818 | |
1819 /* If size is three, then we have to make sure that the range we | |
1820 discovered isn't too large, because we use a fixed-length | |
1821 table to divide by 3. */ | |
1822 | |
1823 if (size == 3) | |
1824 { | |
1825 int gap = bytmax - bytmin; | |
1826 buf->text->mule_three_p = 1; | |
1827 buf->text->mule_shifter = 1; | |
1828 | |
1829 if (gap > MAX_BYTEBPOS_GAP_SIZE_3) | |
1830 { | |
1831 if (forward_p) | |
1832 { | |
1833 bytmin = bytmax - MAX_BYTEBPOS_GAP_SIZE_3; | |
1834 bufmin = bufmax - MAX_CHARBPOS_GAP_SIZE_3; | |
1835 } | |
1836 else | |
1837 { | |
1838 bytmax = bytmin + MAX_BYTEBPOS_GAP_SIZE_3; | |
1839 bufmax = bufmin + MAX_CHARBPOS_GAP_SIZE_3; | |
1840 } | |
1841 } | |
1842 } | |
1843 else | |
1844 { | |
1845 buf->text->mule_three_p = 0; | |
1846 if (size == 4) | |
1847 buf->text->mule_shifter = 2; | |
1848 else | |
1849 buf->text->mule_shifter = size - 1; | |
1850 } | |
1851 | |
1852 buf->text->mule_bufmin = bufmin; | |
1853 buf->text->mule_bufmax = bufmax; | |
1854 buf->text->mule_bytmin = bytmin; | |
1855 buf->text->mule_bytmax = bytmax; | |
1856 update_entirely_ascii_p_flag (buf); | |
1857 | |
1858 if (add_to_cache) | |
1859 { | |
1860 int replace_loc; | |
1861 | |
1862 /* We throw away a "random" cached value and replace it with | |
1863 the new value. It doesn't actually have to be very random | |
1864 at all, just evenly distributed. | |
1865 | |
1866 #### It would be better to use a least-recently-used algorithm | |
1867 or something that tries to space things out, but I'm not sure | |
1868 it's worth it to go to the trouble of maintaining that. */ | |
1869 not_very_random_number += 621; | |
1870 replace_loc = not_very_random_number & 15; | |
1871 buf->text->mule_charbpos_cache[replace_loc] = retval; | |
1872 buf->text->mule_bytebpos_cache[replace_loc] = x; | |
1873 } | |
1874 | |
1875 return retval; | |
1876 } | |
1877 | |
1878 /* Text of length BYTELENGTH and CHARLENGTH (in different units) | |
1879 was inserted at charbpos START. */ | |
1880 | |
1881 void | |
1882 buffer_mule_signal_inserted_region (struct buffer *buf, Charbpos start, | |
1883 Bytecount bytelength, | |
1884 Charcount charlength) | |
1885 { | |
1886 int size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p; | |
1887 int i; | |
1888 | |
1889 /* Adjust the cache of known positions. */ | |
1890 for (i = 0; i < 16; i++) | |
1891 { | |
1892 | |
1893 if (buf->text->mule_charbpos_cache[i] > start) | |
1894 { | |
1895 buf->text->mule_charbpos_cache[i] += charlength; | |
1896 buf->text->mule_bytebpos_cache[i] += bytelength; | |
1897 } | |
1898 } | |
1899 | |
1900 if (start >= buf->text->mule_bufmax) | |
1901 goto done; | |
1902 | |
1903 /* The insertion is either before the known region, in which case | |
1904 it shoves it forward; or within the known region, in which case | |
1905 it shoves the end forward. (But it may make the known region | |
1906 inconsistent, so we may have to shorten it.) */ | |
1907 | |
1908 if (start <= buf->text->mule_bufmin) | |
1909 { | |
1910 buf->text->mule_bufmin += charlength; | |
1911 buf->text->mule_bufmax += charlength; | |
1912 buf->text->mule_bytmin += bytelength; | |
1913 buf->text->mule_bytmax += bytelength; | |
1914 } | |
1915 else | |
1916 { | |
1917 Charbpos end = start + charlength; | |
1918 /* the insertion point divides the known region in two. | |
1919 Keep the longer half, at least, and expand into the | |
1920 inserted chunk as much as possible. */ | |
1921 | |
1922 if (start - buf->text->mule_bufmin > buf->text->mule_bufmax - start) | |
1923 { | |
1924 Bytebpos bytestart = (buf->text->mule_bytmin | |
1925 + size * (start - buf->text->mule_bufmin)); | |
1926 Bytebpos bytenew; | |
1927 | |
1928 while (start < end) | |
1929 { | |
1930 bytenew = bytestart; | |
1931 INC_BYTEBPOS (buf, bytenew); | |
1932 if (bytenew - bytestart != size) | |
1933 break; | |
1934 start++; | |
1935 bytestart = bytenew; | |
1936 } | |
1937 if (start != end) | |
1938 { | |
1939 buf->text->mule_bufmax = start; | |
1940 buf->text->mule_bytmax = bytestart; | |
1941 } | |
1942 else | |
1943 { | |
1944 buf->text->mule_bufmax += charlength; | |
1945 buf->text->mule_bytmax += bytelength; | |
1946 } | |
1947 } | |
1948 else | |
1949 { | |
1950 Bytebpos byteend = (buf->text->mule_bytmin | |
1951 + size * (start - buf->text->mule_bufmin) | |
1952 + bytelength); | |
1953 Bytebpos bytenew; | |
1954 | |
1955 buf->text->mule_bufmax += charlength; | |
1956 buf->text->mule_bytmax += bytelength; | |
1957 | |
1958 while (end > start) | |
1959 { | |
1960 bytenew = byteend; | |
1961 DEC_BYTEBPOS (buf, bytenew); | |
1962 if (byteend - bytenew != size) | |
1963 break; | |
1964 end--; | |
1965 byteend = bytenew; | |
1966 } | |
1967 if (start != end) | |
1968 { | |
1969 buf->text->mule_bufmin = end; | |
1970 buf->text->mule_bytmin = byteend; | |
1971 } | |
1972 } | |
1973 } | |
1974 done: | |
1975 update_entirely_ascii_p_flag (buf); | |
1976 } | |
1977 | |
1978 /* Text from START to END (equivalent in Bytebposs: from BI_START to | |
1979 BI_END) was deleted. */ | |
1980 | |
1981 void | |
1982 buffer_mule_signal_deleted_region (struct buffer *buf, Charbpos start, | |
1983 Charbpos end, Bytebpos bi_start, | |
1984 Bytebpos bi_end) | |
1985 { | |
1986 int i; | |
1987 | |
1988 /* Adjust the cache of known positions. */ | |
1989 for (i = 0; i < 16; i++) | |
1990 { | |
1991 /* After the end; gets shoved backward */ | |
1992 if (buf->text->mule_charbpos_cache[i] > end) | |
1993 { | |
1994 buf->text->mule_charbpos_cache[i] -= end - start; | |
1995 buf->text->mule_bytebpos_cache[i] -= bi_end - bi_start; | |
1996 } | |
1997 /* In the range; moves to start of range */ | |
1998 else if (buf->text->mule_charbpos_cache[i] > start) | |
1999 { | |
2000 buf->text->mule_charbpos_cache[i] = start; | |
2001 buf->text->mule_bytebpos_cache[i] = bi_start; | |
2002 } | |
2003 } | |
2004 | |
2005 /* We don't care about any text after the end of the known region. */ | |
2006 | |
2007 end = min (end, buf->text->mule_bufmax); | |
2008 bi_end = min (bi_end, buf->text->mule_bytmax); | |
2009 if (start >= end) | |
2010 goto done; | |
2011 | |
2012 /* The end of the known region offsets by the total amount of deletion, | |
2013 since it's all before it. */ | |
2014 | |
2015 buf->text->mule_bufmax -= end - start; | |
2016 buf->text->mule_bytmax -= bi_end - bi_start; | |
2017 | |
2018 /* Now we don't care about any text after the start of the known region. */ | |
2019 | |
2020 end = min (end, buf->text->mule_bufmin); | |
2021 bi_end = min (bi_end, buf->text->mule_bytmin); | |
2022 if (start < end) | |
2023 { | |
2024 buf->text->mule_bufmin -= end - start; | |
2025 buf->text->mule_bytmin -= bi_end - bi_start; | |
2026 } | |
2027 | |
2028 done: | |
2029 update_entirely_ascii_p_flag (buf); | |
2030 } | |
2031 | |
2032 #endif /* MULE */ | |
2033 | |
2034 #ifdef ERROR_CHECK_CHARBPOS | |
2035 | |
2036 Bytebpos | |
2037 charbpos_to_bytebpos (struct buffer *buf, Charbpos x) | |
2038 { | |
2039 Bytebpos retval = real_charbpos_to_bytebpos (buf, x); | |
2040 ASSERT_VALID_BYTEBPOS_UNSAFE (buf, retval); | |
2041 return retval; | |
2042 } | |
2043 | |
2044 Charbpos | |
2045 bytebpos_to_charbpos (struct buffer *buf, Bytebpos x) | |
2046 { | |
2047 ASSERT_VALID_BYTEBPOS_UNSAFE (buf, x); | |
2048 return real_bytebpos_to_charbpos (buf, x); | |
2049 } | |
2050 | |
2051 #endif /* ERROR_CHECK_CHARBPOS */ | |
2052 | |
2053 | |
2054 /************************************************************************/ | |
2055 /* verifying buffer and string positions */ | |
2056 /************************************************************************/ | |
2057 | |
2058 /* Functions below are tagged with either _byte or _char indicating | |
2059 whether they return byte or character positions. For a buffer, | |
2060 a character position is a "Charbpos" and a byte position is a "Bytebpos". | |
2061 For strings, these are sometimes typed using "Charcount" and | |
2062 "Bytecount". */ | |
2063 | |
2064 /* Flags for the functions below are: | |
2065 | |
2066 GB_ALLOW_PAST_ACCESSIBLE | |
2067 | |
2068 Allow positions to range over the entire buffer (BUF_BEG to BUF_Z), | |
2069 rather than just the accessible portion (BUF_BEGV to BUF_ZV). | |
2070 For strings, this flag has no effect. | |
2071 | |
2072 GB_COERCE_RANGE | |
2073 | |
2074 If the position is outside the allowable range, return the lower | |
2075 or upper bound of the range, whichever is closer to the specified | |
2076 position. | |
2077 | |
2078 GB_NO_ERROR_IF_BAD | |
2079 | |
2080 If the position is outside the allowable range, return -1. | |
2081 | |
2082 GB_NEGATIVE_FROM_END | |
2083 | |
2084 If a value is negative, treat it as an offset from the end. | |
2085 Only applies to strings. | |
2086 | |
2087 The following additional flags apply only to the functions | |
2088 that return ranges: | |
2089 | |
2090 GB_ALLOW_NIL | |
2091 | |
2092 Either or both positions can be nil. If FROM is nil, | |
2093 FROM_OUT will contain the lower bound of the allowed range. | |
2094 If TO is nil, TO_OUT will contain the upper bound of the | |
2095 allowed range. | |
2096 | |
2097 GB_CHECK_ORDER | |
2098 | |
2099 FROM must contain the lower bound and TO the upper bound | |
2100 of the range. If the positions are reversed, an error is | |
2101 signalled. | |
2102 | |
2103 The following is a combination flag: | |
2104 | |
2105 GB_HISTORICAL_STRING_BEHAVIOR | |
2106 | |
2107 Equivalent to (GB_NEGATIVE_FROM_END | GB_ALLOW_NIL). | |
2108 */ | |
2109 | |
2110 /* Return a buffer position stored in a Lisp_Object. Full | |
2111 error-checking is done on the position. Flags can be specified to | |
2112 control the behavior of out-of-range values. The default behavior | |
2113 is to require that the position is within the accessible part of | |
2114 the buffer (BEGV and ZV), and to signal an error if the position is | |
2115 out of range. | |
2116 | |
2117 */ | |
2118 | |
2119 Charbpos | |
2120 get_buffer_pos_char (struct buffer *b, Lisp_Object pos, unsigned int flags) | |
2121 { | |
2122 /* Does not GC */ | |
2123 Charbpos ind; | |
2124 Charbpos min_allowed, max_allowed; | |
2125 | |
2126 CHECK_INT_COERCE_MARKER (pos); | |
2127 ind = XINT (pos); | |
2128 min_allowed = flags & GB_ALLOW_PAST_ACCESSIBLE ? BUF_BEG (b) : BUF_BEGV (b); | |
2129 max_allowed = flags & GB_ALLOW_PAST_ACCESSIBLE ? BUF_Z (b) : BUF_ZV (b); | |
2130 | |
2131 if (ind < min_allowed || ind > max_allowed) | |
2132 { | |
2133 if (flags & GB_COERCE_RANGE) | |
2134 ind = ind < min_allowed ? min_allowed : max_allowed; | |
2135 else if (flags & GB_NO_ERROR_IF_BAD) | |
2136 ind = -1; | |
2137 else | |
2138 { | |
2139 Lisp_Object buffer; | |
2140 XSETBUFFER (buffer, b); | |
2141 args_out_of_range (buffer, pos); | |
2142 } | |
2143 } | |
2144 | |
2145 return ind; | |
2146 } | |
2147 | |
2148 Bytebpos | |
2149 get_buffer_pos_byte (struct buffer *b, Lisp_Object pos, unsigned int flags) | |
2150 { | |
2151 Charbpos bpos = get_buffer_pos_char (b, pos, flags); | |
2152 if (bpos < 0) /* could happen with GB_NO_ERROR_IF_BAD */ | |
2153 return -1; | |
2154 return charbpos_to_bytebpos (b, bpos); | |
2155 } | |
2156 | |
2157 /* Return a pair of buffer positions representing a range of text, | |
2158 taken from a pair of Lisp_Objects. Full error-checking is | |
2159 done on the positions. Flags can be specified to control the | |
2160 behavior of out-of-range values. The default behavior is to | |
2161 allow the range bounds to be specified in either order | |
2162 (however, FROM_OUT will always be the lower bound of the range | |
2163 and TO_OUT the upper bound),to require that the positions | |
2164 are within the accessible part of the buffer (BEGV and ZV), | |
2165 and to signal an error if the positions are out of range. | |
2166 */ | |
2167 | |
2168 void | |
2169 get_buffer_range_char (struct buffer *b, Lisp_Object from, Lisp_Object to, | |
2170 Charbpos *from_out, Charbpos *to_out, unsigned int flags) | |
2171 { | |
2172 /* Does not GC */ | |
2173 Charbpos min_allowed, max_allowed; | |
2174 | |
2175 min_allowed = (flags & GB_ALLOW_PAST_ACCESSIBLE) ? | |
2176 BUF_BEG (b) : BUF_BEGV (b); | |
2177 max_allowed = (flags & GB_ALLOW_PAST_ACCESSIBLE) ? | |
2178 BUF_Z (b) : BUF_ZV (b); | |
2179 | |
2180 if (NILP (from) && (flags & GB_ALLOW_NIL)) | |
2181 *from_out = min_allowed; | |
2182 else | |
2183 *from_out = get_buffer_pos_char (b, from, flags | GB_NO_ERROR_IF_BAD); | |
2184 | |
2185 if (NILP (to) && (flags & GB_ALLOW_NIL)) | |
2186 *to_out = max_allowed; | |
2187 else | |
2188 *to_out = get_buffer_pos_char (b, to, flags | GB_NO_ERROR_IF_BAD); | |
2189 | |
2190 if ((*from_out < 0 || *to_out < 0) && !(flags & GB_NO_ERROR_IF_BAD)) | |
2191 { | |
2192 Lisp_Object buffer; | |
2193 XSETBUFFER (buffer, b); | |
2194 args_out_of_range_3 (buffer, from, to); | |
2195 } | |
2196 | |
2197 if (*from_out >= 0 && *to_out >= 0 && *from_out > *to_out) | |
2198 { | |
2199 if (flags & GB_CHECK_ORDER) | |
2200 invalid_argument_2 ("start greater than end", from, to); | |
2201 else | |
2202 { | |
2203 Charbpos temp = *from_out; | |
2204 *from_out = *to_out; | |
2205 *to_out = temp; | |
2206 } | |
2207 } | |
2208 } | |
2209 | |
2210 void | |
2211 get_buffer_range_byte (struct buffer *b, Lisp_Object from, Lisp_Object to, | |
2212 Bytebpos *from_out, Bytebpos *to_out, unsigned int flags) | |
2213 { | |
2214 Charbpos s, e; | |
2215 | |
2216 get_buffer_range_char (b, from, to, &s, &e, flags); | |
2217 if (s >= 0) | |
2218 *from_out = charbpos_to_bytebpos (b, s); | |
2219 else /* could happen with GB_NO_ERROR_IF_BAD */ | |
2220 *from_out = -1; | |
2221 if (e >= 0) | |
2222 *to_out = charbpos_to_bytebpos (b, e); | |
2223 else | |
2224 *to_out = -1; | |
2225 } | |
2226 | |
2227 static Charcount | |
2228 get_string_pos_char_1 (Lisp_Object string, Lisp_Object pos, unsigned int flags, | |
2229 Charcount known_length) | |
2230 { | |
2231 Charcount ccpos; | |
2232 Charcount min_allowed = 0; | |
2233 Charcount max_allowed = known_length; | |
2234 | |
2235 /* Computation of KNOWN_LENGTH is potentially expensive so we pass | |
2236 it in. */ | |
2237 CHECK_INT (pos); | |
2238 ccpos = XINT (pos); | |
2239 if (ccpos < 0 && flags & GB_NEGATIVE_FROM_END) | |
2240 ccpos += max_allowed; | |
2241 | |
2242 if (ccpos < min_allowed || ccpos > max_allowed) | |
2243 { | |
2244 if (flags & GB_COERCE_RANGE) | |
2245 ccpos = ccpos < min_allowed ? min_allowed : max_allowed; | |
2246 else if (flags & GB_NO_ERROR_IF_BAD) | |
2247 ccpos = -1; | |
2248 else | |
2249 args_out_of_range (string, pos); | |
2250 } | |
2251 | |
2252 return ccpos; | |
2253 } | |
2254 | |
2255 Charcount | |
2256 get_string_pos_char (Lisp_Object string, Lisp_Object pos, unsigned int flags) | |
2257 { | |
2258 return get_string_pos_char_1 (string, pos, flags, | |
2259 XSTRING_CHAR_LENGTH (string)); | |
2260 } | |
2261 | |
2262 Bytecount | |
2263 get_string_pos_byte (Lisp_Object string, Lisp_Object pos, unsigned int flags) | |
2264 { | |
2265 Charcount ccpos = get_string_pos_char (string, pos, flags); | |
2266 if (ccpos < 0) /* could happen with GB_NO_ERROR_IF_BAD */ | |
2267 return -1; | |
2268 return XSTRING_INDEX_CHAR_TO_BYTE (string, ccpos); | |
2269 } | |
2270 | |
2271 void | |
2272 get_string_range_char (Lisp_Object string, Lisp_Object from, Lisp_Object to, | |
2273 Charcount *from_out, Charcount *to_out, | |
2274 unsigned int flags) | |
2275 { | |
2276 Charcount min_allowed = 0; | |
2277 Charcount max_allowed = XSTRING_CHAR_LENGTH (string); | |
2278 | |
2279 if (NILP (from) && (flags & GB_ALLOW_NIL)) | |
2280 *from_out = min_allowed; | |
2281 else | |
2282 *from_out = get_string_pos_char_1 (string, from, | |
2283 flags | GB_NO_ERROR_IF_BAD, | |
2284 max_allowed); | |
2285 | |
2286 if (NILP (to) && (flags & GB_ALLOW_NIL)) | |
2287 *to_out = max_allowed; | |
2288 else | |
2289 *to_out = get_string_pos_char_1 (string, to, | |
2290 flags | GB_NO_ERROR_IF_BAD, | |
2291 max_allowed); | |
2292 | |
2293 if ((*from_out < 0 || *to_out < 0) && !(flags & GB_NO_ERROR_IF_BAD)) | |
2294 args_out_of_range_3 (string, from, to); | |
2295 | |
2296 if (*from_out >= 0 && *to_out >= 0 && *from_out > *to_out) | |
2297 { | |
2298 if (flags & GB_CHECK_ORDER) | |
2299 invalid_argument_2 ("start greater than end", from, to); | |
2300 else | |
2301 { | |
2302 Charbpos temp = *from_out; | |
2303 *from_out = *to_out; | |
2304 *to_out = temp; | |
2305 } | |
2306 } | |
2307 } | |
2308 | |
2309 void | |
2310 get_string_range_byte (Lisp_Object string, Lisp_Object from, Lisp_Object to, | |
2311 Bytecount *from_out, Bytecount *to_out, | |
2312 unsigned int flags) | |
2313 { | |
2314 Charcount s, e; | |
2315 | |
2316 get_string_range_char (string, from, to, &s, &e, flags); | |
2317 if (s >= 0) | |
2318 *from_out = XSTRING_INDEX_CHAR_TO_BYTE (string, s); | |
2319 else /* could happen with GB_NO_ERROR_IF_BAD */ | |
2320 *from_out = -1; | |
2321 if (e >= 0) | |
2322 *to_out = XSTRING_INDEX_CHAR_TO_BYTE (string, e); | |
2323 else | |
2324 *to_out = -1; | |
2325 | |
2326 } | |
2327 | |
2328 Charbpos | |
2329 get_buffer_or_string_pos_char (Lisp_Object object, Lisp_Object pos, | |
2330 unsigned int flags) | |
2331 { | |
2332 return STRINGP (object) ? | |
2333 get_string_pos_char (object, pos, flags) : | |
2334 get_buffer_pos_char (XBUFFER (object), pos, flags); | |
2335 } | |
2336 | |
2337 Bytebpos | |
2338 get_buffer_or_string_pos_byte (Lisp_Object object, Lisp_Object pos, | |
2339 unsigned int flags) | |
2340 { | |
2341 return STRINGP (object) ? | |
2342 get_string_pos_byte (object, pos, flags) : | |
2343 get_buffer_pos_byte (XBUFFER (object), pos, flags); | |
2344 } | |
2345 | |
2346 void | |
2347 get_buffer_or_string_range_char (Lisp_Object object, Lisp_Object from, | |
2348 Lisp_Object to, Charbpos *from_out, | |
2349 Charbpos *to_out, unsigned int flags) | |
2350 { | |
2351 if (STRINGP (object)) | |
2352 get_string_range_char (object, from, to, from_out, to_out, flags); | |
2353 else | |
2354 get_buffer_range_char (XBUFFER (object), from, to, from_out, to_out, flags); | |
2355 } | |
2356 | |
2357 void | |
2358 get_buffer_or_string_range_byte (Lisp_Object object, Lisp_Object from, | |
2359 Lisp_Object to, Bytebpos *from_out, | |
2360 Bytebpos *to_out, unsigned int flags) | |
2361 { | |
2362 if (STRINGP (object)) | |
2363 get_string_range_byte (object, from, to, from_out, to_out, flags); | |
2364 else | |
2365 get_buffer_range_byte (XBUFFER (object), from, to, from_out, to_out, flags); | |
2366 } | |
2367 | |
2368 Charbpos | |
2369 buffer_or_string_accessible_begin_char (Lisp_Object object) | |
2370 { | |
2371 return STRINGP (object) ? 0 : BUF_BEGV (XBUFFER (object)); | |
2372 } | |
2373 | |
2374 Charbpos | |
2375 buffer_or_string_accessible_end_char (Lisp_Object object) | |
2376 { | |
2377 return STRINGP (object) ? | |
2378 XSTRING_CHAR_LENGTH (object) : BUF_ZV (XBUFFER (object)); | |
2379 } | |
2380 | |
2381 Bytebpos | |
2382 buffer_or_string_accessible_begin_byte (Lisp_Object object) | |
2383 { | |
2384 return STRINGP (object) ? 0 : BI_BUF_BEGV (XBUFFER (object)); | |
2385 } | |
2386 | |
2387 Bytebpos | |
2388 buffer_or_string_accessible_end_byte (Lisp_Object object) | |
2389 { | |
2390 return STRINGP (object) ? | |
2391 XSTRING_LENGTH (object) : BI_BUF_ZV (XBUFFER (object)); | |
2392 } | |
2393 | |
2394 Charbpos | |
2395 buffer_or_string_absolute_begin_char (Lisp_Object object) | |
2396 { | |
2397 return STRINGP (object) ? 0 : BUF_BEG (XBUFFER (object)); | |
2398 } | |
2399 | |
2400 Charbpos | |
2401 buffer_or_string_absolute_end_char (Lisp_Object object) | |
2402 { | |
2403 return STRINGP (object) ? | |
2404 XSTRING_CHAR_LENGTH (object) : BUF_Z (XBUFFER (object)); | |
2405 } | |
2406 | |
2407 Bytebpos | |
2408 buffer_or_string_absolute_begin_byte (Lisp_Object object) | |
2409 { | |
2410 return STRINGP (object) ? 0 : BI_BUF_BEG (XBUFFER (object)); | |
2411 } | |
2412 | |
2413 Bytebpos | |
2414 buffer_or_string_absolute_end_byte (Lisp_Object object) | |
2415 { | |
2416 return STRINGP (object) ? | |
2417 XSTRING_LENGTH (object) : BI_BUF_Z (XBUFFER (object)); | |
2418 } | |
2419 | |
2420 | |
2421 /************************************************************************/ | |
2422 /* Implement TO_EXTERNAL_FORMAT, TO_INTERNAL_FORMAT */ | |
2423 /************************************************************************/ | |
2424 | |
2425 typedef struct | |
2426 { | |
2427 Dynarr_declare (Intbyte_dynarr *); | |
2428 } Intbyte_dynarr_dynarr; | |
2429 | |
2430 typedef struct | |
2431 { | |
2432 Dynarr_declare (Extbyte_dynarr *); | |
2433 } Extbyte_dynarr_dynarr; | |
2434 | |
2435 static Extbyte_dynarr_dynarr *conversion_out_dynarr_list; | |
2436 static Intbyte_dynarr_dynarr *conversion_in_dynarr_list; | |
2437 | |
2438 static int dfc_convert_to_external_format_in_use; | |
2439 static int dfc_convert_to_internal_format_in_use; | |
2440 | |
2441 static Lisp_Object | |
2442 dfc_convert_to_external_format_reset_in_use (Lisp_Object value) | |
2443 { | |
2444 dfc_convert_to_external_format_in_use = XINT (value); | |
2445 return Qnil; | |
2446 } | |
2447 | |
2448 static Lisp_Object | |
2449 dfc_convert_to_internal_format_reset_in_use (Lisp_Object value) | |
2450 { | |
2451 dfc_convert_to_internal_format_in_use = XINT (value); | |
2452 return Qnil; | |
2453 } | |
2454 | |
2455 void | |
2456 dfc_convert_to_external_format (dfc_conversion_type source_type, | |
2457 dfc_conversion_data *source, | |
2458 Lisp_Object coding_system, | |
2459 dfc_conversion_type sink_type, | |
2460 dfc_conversion_data *sink) | |
2461 { | |
2462 /* It's guaranteed that many callers are not prepared for GC here, | |
2463 esp. given that this code conversion occurs in many very hidden | |
2464 places. */ | |
2465 int count = begin_gc_forbidden (); | |
2466 Extbyte_dynarr *conversion_out_dynarr; | |
2467 | |
2468 type_checking_assert | |
2469 (((source_type == DFC_TYPE_DATA) || | |
2470 (source_type == DFC_TYPE_LISP_LSTREAM && LSTREAMP (source->lisp_object)) || | |
2471 (source_type == DFC_TYPE_LISP_STRING && STRINGP (source->lisp_object))) | |
2472 && | |
2473 ((sink_type == DFC_TYPE_DATA) || | |
2474 (sink_type == DFC_TYPE_LISP_LSTREAM && LSTREAMP (source->lisp_object)))); | |
2475 | |
2476 record_unwind_protect (dfc_convert_to_external_format_reset_in_use, | |
2477 make_int (dfc_convert_to_external_format_in_use)); | |
2478 if (Dynarr_length (conversion_out_dynarr_list) <= | |
2479 dfc_convert_to_external_format_in_use) | |
2480 Dynarr_add (conversion_out_dynarr_list, Dynarr_new (Extbyte)); | |
2481 conversion_out_dynarr = Dynarr_at (conversion_out_dynarr_list, | |
2482 dfc_convert_to_external_format_in_use); | |
2483 dfc_convert_to_external_format_in_use++; | |
2484 Dynarr_reset (conversion_out_dynarr); | |
2485 | |
2486 coding_system = get_coding_system_for_text_file (coding_system, 0); | |
2487 | |
2488 /* Here we optimize in the case where the coding system does no | |
2489 conversion. However, we don't want to optimize in case the source | |
2490 or sink is an lstream, since writing to an lstream can cause a | |
2491 garbage collection, and this could be problematic if the source | |
2492 is a lisp string. */ | |
2493 if (source_type != DFC_TYPE_LISP_LSTREAM && | |
2494 sink_type != DFC_TYPE_LISP_LSTREAM && | |
2495 coding_system_is_binary (coding_system)) | |
2496 { | |
2497 const Intbyte *ptr; | |
2498 Bytecount len; | |
2499 | |
2500 if (source_type == DFC_TYPE_LISP_STRING) | |
2501 { | |
2502 ptr = XSTRING_DATA (source->lisp_object); | |
2503 len = XSTRING_LENGTH (source->lisp_object); | |
2504 } | |
2505 else | |
2506 { | |
2507 ptr = (Intbyte *) source->data.ptr; | |
2508 len = source->data.len; | |
2509 } | |
2510 | |
2511 #ifdef MULE | |
2512 { | |
2513 const Intbyte *end; | |
2514 for (end = ptr + len; ptr < end;) | |
2515 { | |
2516 Intbyte c = | |
2517 (BYTE_ASCII_P (*ptr)) ? *ptr : | |
2518 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) : | |
2519 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) : | |
2520 '~'; | |
2521 | |
2522 Dynarr_add (conversion_out_dynarr, (Extbyte) c); | |
2523 INC_CHARPTR (ptr); | |
2524 } | |
2525 charbpos_checking_assert (ptr == end); | |
2526 } | |
2527 #else | |
2528 Dynarr_add_many (conversion_out_dynarr, ptr, len); | |
2529 #endif | |
2530 | |
2531 } | |
2532 #ifdef HAVE_WIN32_CODING_SYSTEMS | |
2533 /* Optimize the common case involving Unicode where only ASCII is involved */ | |
2534 else if (source_type != DFC_TYPE_LISP_LSTREAM && | |
2535 sink_type != DFC_TYPE_LISP_LSTREAM && | |
2536 dfc_coding_system_is_unicode (coding_system)) | |
2537 { | |
2538 const Intbyte *ptr, *p; | |
2539 Bytecount len; | |
2540 const Intbyte *end; | |
2541 | |
2542 if (source_type == DFC_TYPE_LISP_STRING) | |
2543 { | |
2544 ptr = XSTRING_DATA (source->lisp_object); | |
2545 len = XSTRING_LENGTH (source->lisp_object); | |
2546 } | |
2547 else | |
2548 { | |
2549 ptr = (Intbyte *) source->data.ptr; | |
2550 len = source->data.len; | |
2551 } | |
2552 end = ptr + len; | |
2553 | |
2554 for (p = ptr; p < end; p++) | |
2555 { | |
2556 if (!BYTE_ASCII_P (*p)) | |
2557 goto the_hard_way; | |
2558 } | |
2559 | |
2560 for (p = ptr; p < end; p++) | |
2561 { | |
2562 Dynarr_add (conversion_out_dynarr, (Extbyte) (*p)); | |
2563 Dynarr_add (conversion_out_dynarr, (Extbyte) '\0'); | |
2564 } | |
2565 } | |
2566 #endif /* HAVE_WIN32_CODING_SYSTEMS */ | |
2567 else | |
2568 { | |
2569 Lisp_Object streams_to_delete[3]; | |
2570 int delete_count; | |
2571 Lisp_Object instream, outstream; | |
2572 Lstream *reader, *writer; | |
2573 struct gcpro gcpro1, gcpro2; | |
2574 | |
2575 #ifdef HAVE_WIN32_CODING_SYSTEMS | |
2576 the_hard_way: | |
2577 #endif /* HAVE_WIN32_CODING_SYSTEMS */ | |
2578 delete_count = 0; | |
2579 if (source_type == DFC_TYPE_LISP_LSTREAM) | |
2580 instream = source->lisp_object; | |
2581 else if (source_type == DFC_TYPE_DATA) | |
2582 streams_to_delete[delete_count++] = instream = | |
2583 make_fixed_buffer_input_stream (source->data.ptr, source->data.len); | |
2584 else | |
2585 { | |
2586 type_checking_assert (source_type == DFC_TYPE_LISP_STRING); | |
2587 streams_to_delete[delete_count++] = instream = | |
2588 /* This will GCPRO the Lisp string */ | |
2589 make_lisp_string_input_stream (source->lisp_object, 0, -1); | |
2590 } | |
2591 | |
2592 if (sink_type == DFC_TYPE_LISP_LSTREAM) | |
2593 outstream = sink->lisp_object; | |
2594 else | |
2595 { | |
2596 type_checking_assert (sink_type == DFC_TYPE_DATA); | |
2597 streams_to_delete[delete_count++] = outstream = | |
2598 make_dynarr_output_stream | |
2599 ((unsigned_char_dynarr *) conversion_out_dynarr); | |
2600 } | |
2601 | |
2602 streams_to_delete[delete_count++] = outstream = | |
2603 make_coding_output_stream (XLSTREAM (outstream), coding_system, CODING_ENCODE); | |
2604 | |
2605 reader = XLSTREAM (instream); | |
2606 writer = XLSTREAM (outstream); | |
2607 /* decoding_stream will gc-protect outstream */ | |
2608 GCPRO2 (instream, outstream); | |
2609 | |
2610 while (1) | |
2611 { | |
2612 Bytecount size_in_bytes; | |
2613 char tempbuf[1024]; /* some random amount */ | |
2614 | |
2615 size_in_bytes = Lstream_read (reader, tempbuf, sizeof (tempbuf)); | |
2616 | |
2617 if (size_in_bytes == 0) | |
2618 break; | |
2619 else if (size_in_bytes < 0) | |
2620 signal_error (Qtext_conversion_error, | |
2621 "Error converting to external format", Qunbound); | |
2622 | |
2623 if (Lstream_write (writer, tempbuf, size_in_bytes) < 0) | |
2624 signal_error (Qtext_conversion_error, | |
2625 "Error converting to external format", Qunbound); | |
2626 } | |
2627 | |
2628 /* Closing writer will close any stream at the other end of writer. */ | |
2629 Lstream_close (writer); | |
2630 Lstream_close (reader); | |
2631 UNGCPRO; | |
2632 | |
2633 /* The idea is that this function will create no garbage. */ | |
2634 while (delete_count) | |
2635 Lstream_delete (XLSTREAM (streams_to_delete [--delete_count])); | |
2636 } | |
2637 | |
2638 unbind_to (count); | |
2639 | |
2640 if (sink_type != DFC_TYPE_LISP_LSTREAM) | |
2641 { | |
2642 sink->data.len = Dynarr_length (conversion_out_dynarr); | |
2643 /* double zero-extend because we may be dealing with Unicode data */ | |
2644 Dynarr_add (conversion_out_dynarr, '\0'); | |
2645 Dynarr_add (conversion_out_dynarr, '\0'); | |
2646 sink->data.ptr = Dynarr_atp (conversion_out_dynarr, 0); | |
2647 } | |
2648 } | |
2649 | |
2650 void | |
2651 dfc_convert_to_internal_format (dfc_conversion_type source_type, | |
2652 dfc_conversion_data *source, | |
2653 Lisp_Object coding_system, | |
2654 dfc_conversion_type sink_type, | |
2655 dfc_conversion_data *sink) | |
2656 { | |
2657 /* It's guaranteed that many callers are not prepared for GC here, | |
2658 esp. given that this code conversion occurs in many very hidden | |
2659 places. */ | |
2660 int count = begin_gc_forbidden (); | |
2661 Intbyte_dynarr *conversion_in_dynarr; | |
2662 | |
2663 type_checking_assert | |
2664 ((source_type == DFC_TYPE_DATA || | |
2665 source_type == DFC_TYPE_LISP_LSTREAM) | |
2666 && | |
2667 (sink_type == DFC_TYPE_DATA || | |
2668 sink_type == DFC_TYPE_LISP_LSTREAM)); | |
2669 | |
2670 record_unwind_protect (dfc_convert_to_internal_format_reset_in_use, | |
2671 make_int (dfc_convert_to_internal_format_in_use)); | |
2672 if (Dynarr_length (conversion_in_dynarr_list) <= | |
2673 dfc_convert_to_internal_format_in_use) | |
2674 Dynarr_add (conversion_in_dynarr_list, Dynarr_new (Intbyte)); | |
2675 conversion_in_dynarr = Dynarr_at (conversion_in_dynarr_list, | |
2676 dfc_convert_to_internal_format_in_use); | |
2677 dfc_convert_to_internal_format_in_use++; | |
2678 Dynarr_reset (conversion_in_dynarr); | |
2679 | |
2680 coding_system = get_coding_system_for_text_file (coding_system, 1); | |
2681 | |
2682 if (source_type != DFC_TYPE_LISP_LSTREAM && | |
2683 sink_type != DFC_TYPE_LISP_LSTREAM && | |
2684 coding_system_is_binary (coding_system)) | |
2685 { | |
2686 #ifdef MULE | |
2687 const Intbyte *ptr = (const Intbyte *) source->data.ptr; | |
2688 Bytecount len = source->data.len; | |
2689 const Intbyte *end = ptr + len; | |
2690 | |
2691 for (; ptr < end; ptr++) | |
2692 { | |
2693 Intbyte c = *ptr; | |
2694 | |
2695 if (BYTE_ASCII_P (c)) | |
2696 Dynarr_add (conversion_in_dynarr, c); | |
2697 else if (BYTE_C1_P (c)) | |
2698 { | |
2699 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_CONTROL_1); | |
2700 Dynarr_add (conversion_in_dynarr, c + 0x20); | |
2701 } | |
2702 else | |
2703 { | |
2704 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_LATIN_ISO8859_1); | |
2705 Dynarr_add (conversion_in_dynarr, c); | |
2706 } | |
2707 } | |
2708 #else | |
2709 Dynarr_add_many (conversion_in_dynarr, source->data.ptr, source->data.len); | |
2710 #endif | |
2711 } | |
2712 #ifdef HAVE_WIN32_CODING_SYSTEMS | |
2713 /* Optimize the common case involving Unicode where only ASCII/Latin-1 is involved */ | |
2714 else if (source_type != DFC_TYPE_LISP_LSTREAM && | |
2715 sink_type != DFC_TYPE_LISP_LSTREAM && | |
2716 dfc_coding_system_is_unicode (coding_system)) | |
2717 { | |
2718 const Intbyte *ptr = (const Intbyte *) source->data.ptr + 1; | |
2719 Bytecount len = source->data.len; | |
2720 const Intbyte *end = ptr + len; | |
2721 | |
2722 if (len & 1) | |
2723 goto the_hard_way; | |
2724 | |
2725 for (; ptr < end; ptr += 2) | |
2726 { | |
2727 if (*ptr) | |
2728 goto the_hard_way; | |
2729 } | |
2730 | |
2731 ptr = (const Intbyte *) source->data.ptr; | |
2732 end = ptr + len; | |
2733 | |
2734 for (; ptr < end; ptr += 2) | |
2735 { | |
2736 Intbyte c = *ptr; | |
2737 | |
2738 if (BYTE_ASCII_P (c)) | |
2739 Dynarr_add (conversion_in_dynarr, c); | |
2740 #ifdef MULE | |
2741 else if (BYTE_C1_P (c)) | |
2742 { | |
2743 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_CONTROL_1); | |
2744 Dynarr_add (conversion_in_dynarr, c + 0x20); | |
2745 } | |
2746 else | |
2747 { | |
2748 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_LATIN_ISO8859_1); | |
2749 Dynarr_add (conversion_in_dynarr, c); | |
2750 } | |
2751 #endif /* MULE */ | |
2752 } | |
2753 } | |
2754 #endif /* HAVE_WIN32_CODING_SYSTEMS */ | |
2755 else | |
2756 { | |
2757 Lisp_Object streams_to_delete[3]; | |
2758 int delete_count; | |
2759 Lisp_Object instream, outstream; | |
2760 Lstream *reader, *writer; | |
2761 struct gcpro gcpro1, gcpro2; | |
2762 | |
2763 #ifdef HAVE_WIN32_CODING_SYSTEMS | |
2764 the_hard_way: | |
2765 #endif /* HAVE_WIN32_CODING_SYSTEMS */ | |
2766 delete_count = 0; | |
2767 if (source_type == DFC_TYPE_LISP_LSTREAM) | |
2768 instream = source->lisp_object; | |
2769 else | |
2770 { | |
2771 type_checking_assert (source_type == DFC_TYPE_DATA); | |
2772 streams_to_delete[delete_count++] = instream = | |
2773 make_fixed_buffer_input_stream (source->data.ptr, source->data.len); | |
2774 } | |
2775 | |
2776 if (sink_type == DFC_TYPE_LISP_LSTREAM) | |
2777 outstream = sink->lisp_object; | |
2778 else | |
2779 { | |
2780 type_checking_assert (sink_type == DFC_TYPE_DATA); | |
2781 streams_to_delete[delete_count++] = outstream = | |
2782 make_dynarr_output_stream | |
2783 ((unsigned_char_dynarr *) conversion_in_dynarr); | |
2784 } | |
2785 | |
2786 streams_to_delete[delete_count++] = outstream = | |
2787 make_coding_output_stream (XLSTREAM (outstream), coding_system, CODING_DECODE); | |
2788 | |
2789 reader = XLSTREAM (instream); | |
2790 writer = XLSTREAM (outstream); | |
2791 /* outstream will gc-protect its sink stream, if necessary */ | |
2792 GCPRO2 (instream, outstream); | |
2793 | |
2794 while (1) | |
2795 { | |
2796 Bytecount size_in_bytes; | |
2797 char tempbuf[1024]; /* some random amount */ | |
2798 | |
2799 size_in_bytes = Lstream_read (reader, tempbuf, sizeof (tempbuf)); | |
2800 | |
2801 if (size_in_bytes == 0) | |
2802 break; | |
2803 else if (size_in_bytes < 0) | |
2804 signal_error (Qtext_conversion_error, | |
2805 "Error converting to internal format", Qunbound); | |
2806 | |
2807 if (Lstream_write (writer, tempbuf, size_in_bytes) < 0) | |
2808 signal_error (Qtext_conversion_error, | |
2809 "Error converting to internal format", Qunbound); | |
2810 } | |
2811 | |
2812 /* Closing writer will close any stream at the other end of writer. */ | |
2813 Lstream_close (writer); | |
2814 Lstream_close (reader); | |
2815 UNGCPRO; | |
2816 | |
2817 /* The idea is that this function will create no garbage. */ | |
2818 while (delete_count) | |
2819 Lstream_delete (XLSTREAM (streams_to_delete [--delete_count])); | |
2820 } | |
2821 | |
2822 unbind_to (count); | |
2823 | |
2824 if (sink_type != DFC_TYPE_LISP_LSTREAM) | |
2825 { | |
2826 sink->data.len = Dynarr_length (conversion_in_dynarr); | |
2827 Dynarr_add (conversion_in_dynarr, '\0'); /* remember to NUL-terminate! */ | |
2828 /* The macros don't currently distinguish between internal and | |
2829 external sinks, and allocate and copy two extra bytes in both | |
2830 cases. So we add a second zero, just like for external data | |
2831 (in that case, because we may be converting to Unicode). */ | |
2832 Dynarr_add (conversion_in_dynarr, '\0'); | |
2833 sink->data.ptr = Dynarr_atp (conversion_in_dynarr, 0); | |
2834 } | |
2835 } | |
2836 | |
2837 | |
2838 /************************************************************************/ | |
2839 /* Basic Emchar functions */ | |
2840 /************************************************************************/ | |
2841 | |
2842 #ifdef MULE | |
2843 | |
2844 /* Convert a non-ASCII Mule character C into a one-character Mule-encoded | |
2845 string in STR. Returns the number of bytes stored. | |
2846 Do not call this directly. Use the macro set_charptr_emchar() instead. | |
2847 */ | |
2848 | |
2849 Bytecount | |
2850 non_ascii_set_charptr_emchar (Intbyte *str, Emchar c) | |
2851 { | |
2852 Intbyte *p; | |
2853 Intbyte lb; | |
2854 int c1, c2; | |
2855 Lisp_Object charset; | |
2856 | |
2857 p = str; | |
2858 BREAKUP_CHAR (c, charset, c1, c2); | |
2859 lb = CHAR_LEADING_BYTE (c); | |
2860 if (LEADING_BYTE_PRIVATE_P (lb)) | |
2861 *p++ = PRIVATE_LEADING_BYTE_PREFIX (lb); | |
2862 *p++ = lb; | |
2863 if (EQ (charset, Vcharset_control_1)) | |
2864 c1 += 0x20; | |
2865 *p++ = c1 | 0x80; | |
2866 if (c2) | |
2867 *p++ = c2 | 0x80; | |
2868 | |
2869 return (p - str); | |
2870 } | |
2871 | |
2872 /* Return the first character from a Mule-encoded string in STR, | |
2873 assuming it's non-ASCII. Do not call this directly. | |
2874 Use the macro charptr_emchar() instead. */ | |
2875 | |
2876 Emchar | |
2877 non_ascii_charptr_emchar (const Intbyte *str) | |
2878 { | |
2879 Intbyte i0 = *str, i1, i2 = 0; | |
2880 Lisp_Object charset; | |
2881 | |
2882 if (i0 == LEADING_BYTE_CONTROL_1) | |
2883 return (Emchar) (*++str - 0x20); | |
2884 | |
2885 if (LEADING_BYTE_PREFIX_P (i0)) | |
2886 i0 = *++str; | |
2887 | |
2888 i1 = *++str & 0x7F; | |
2889 | |
2890 charset = CHARSET_BY_LEADING_BYTE (i0); | |
2891 if (XCHARSET_DIMENSION (charset) == 2) | |
2892 i2 = *++str & 0x7F; | |
2893 | |
2894 return MAKE_CHAR (charset, i1, i2); | |
2895 } | |
2896 | |
2897 /* Return whether CH is a valid Emchar, assuming it's non-ASCII. | |
2898 Do not call this directly. Use the macro valid_char_p() instead. */ | |
2899 | |
2900 int | |
2901 non_ascii_valid_char_p (Emchar ch) | |
2902 { | |
2903 int f1, f2, f3; | |
2904 | |
2905 /* Must have only lowest 19 bits set */ | |
2906 if (ch & ~0x7FFFF) | |
2907 return 0; | |
2908 | |
2909 f1 = CHAR_FIELD1 (ch); | |
2910 f2 = CHAR_FIELD2 (ch); | |
2911 f3 = CHAR_FIELD3 (ch); | |
2912 | |
2913 if (f1 == 0) | |
2914 { | |
2915 /* dimension-1 char */ | |
2916 Lisp_Object charset; | |
2917 | |
2918 /* leading byte must be correct */ | |
2919 if (f2 < MIN_CHAR_FIELD2_OFFICIAL || | |
2920 (f2 > MAX_CHAR_FIELD2_OFFICIAL && f2 < MIN_CHAR_FIELD2_PRIVATE) || | |
2921 f2 > MAX_CHAR_FIELD2_PRIVATE) | |
2922 return 0; | |
2923 /* octet not out of range */ | |
2924 if (f3 < 0x20) | |
2925 return 0; | |
2926 /* charset exists */ | |
2927 /* | |
2928 NOTE: This takes advantage of the fact that | |
2929 FIELD2_TO_OFFICIAL_LEADING_BYTE and | |
2930 FIELD2_TO_PRIVATE_LEADING_BYTE are the same. | |
2931 */ | |
2932 charset = CHARSET_BY_LEADING_BYTE (f2 + FIELD2_TO_OFFICIAL_LEADING_BYTE); | |
2933 if (EQ (charset, Qnil)) | |
2934 return 0; | |
2935 /* check range as per size (94 or 96) of charset */ | |
2936 return ((f3 > 0x20 && f3 < 0x7f) || XCHARSET_CHARS (charset) == 96); | |
2937 } | |
2938 else | |
2939 { | |
2940 /* dimension-2 char */ | |
2941 Lisp_Object charset; | |
2942 | |
2943 /* leading byte must be correct */ | |
2944 if (f1 < MIN_CHAR_FIELD1_OFFICIAL || | |
2945 (f1 > MAX_CHAR_FIELD1_OFFICIAL && f1 < MIN_CHAR_FIELD1_PRIVATE) || | |
2946 f1 > MAX_CHAR_FIELD1_PRIVATE) | |
2947 return 0; | |
2948 /* octets not out of range */ | |
2949 if (f2 < 0x20 || f3 < 0x20) | |
2950 return 0; | |
2951 | |
2952 #ifdef ENABLE_COMPOSITE_CHARS | |
2953 if (f1 + FIELD1_TO_OFFICIAL_LEADING_BYTE == LEADING_BYTE_COMPOSITE) | |
2954 { | |
2955 if (UNBOUNDP (Fgethash (make_int (ch), | |
2956 Vcomposite_char_char2string_hash_table, | |
2957 Qunbound))) | |
2958 return 0; | |
2959 return 1; | |
2960 } | |
2961 #endif /* ENABLE_COMPOSITE_CHARS */ | |
2962 | |
2963 /* charset exists */ | |
2964 if (f1 <= MAX_CHAR_FIELD1_OFFICIAL) | |
2965 charset = | |
2966 CHARSET_BY_LEADING_BYTE (f1 + FIELD1_TO_OFFICIAL_LEADING_BYTE); | |
2967 else | |
2968 charset = | |
2969 CHARSET_BY_LEADING_BYTE (f1 + FIELD1_TO_PRIVATE_LEADING_BYTE); | |
2970 | |
2971 if (EQ (charset, Qnil)) | |
2972 return 0; | |
2973 /* check range as per size (94x94 or 96x96) of charset */ | |
2974 return ((f2 != 0x20 && f2 != 0x7F && f3 != 0x20 && f3 != 0x7F) || | |
2975 XCHARSET_CHARS (charset) == 96); | |
2976 } | |
2977 } | |
2978 | |
2979 /* Copy the character pointed to by SRC into DST. Do not call this | |
2980 directly. Use the macro charptr_copy_char() instead. | |
2981 Return the number of bytes copied. */ | |
2982 | |
2983 Bytecount | |
2984 non_ascii_charptr_copy_char (const Intbyte *src, Intbyte *dst) | |
2985 { | |
2986 Bytecount bytes = REP_BYTES_BY_FIRST_BYTE (*src); | |
2987 Bytecount i; | |
2988 for (i = bytes; i; i--, dst++, src++) | |
2989 *dst = *src; | |
2990 return bytes; | |
2991 } | |
2992 | |
2993 #endif /* MULE */ | |
2994 | |
2995 | |
2996 /************************************************************************/ | |
2997 /* streams of Emchars */ | |
2998 /************************************************************************/ | |
2999 | |
3000 #ifdef MULE | |
3001 | |
3002 /* Treat a stream as a stream of Emchar's rather than a stream of bytes. | |
3003 The functions below are not meant to be called directly; use | |
3004 the macros in insdel.h. */ | |
3005 | |
3006 Emchar | |
3007 Lstream_get_emchar_1 (Lstream *stream, int ch) | |
3008 { | |
3009 Intbyte str[MAX_EMCHAR_LEN]; | |
3010 Intbyte *strptr = str; | |
3011 Bytecount bytes; | |
3012 | |
3013 str[0] = (Intbyte) ch; | |
3014 | |
3015 for (bytes = REP_BYTES_BY_FIRST_BYTE (ch) - 1; bytes; bytes--) | |
3016 { | |
3017 int c = Lstream_getc (stream); | |
3018 charbpos_checking_assert (c >= 0); | |
3019 *++strptr = (Intbyte) c; | |
3020 } | |
3021 return charptr_emchar (str); | |
3022 } | |
3023 | |
3024 int | |
3025 Lstream_fput_emchar (Lstream *stream, Emchar ch) | |
3026 { | |
3027 Intbyte str[MAX_EMCHAR_LEN]; | |
3028 Bytecount len = set_charptr_emchar (str, ch); | |
3029 return Lstream_write (stream, str, len); | |
3030 } | |
3031 | |
3032 void | |
3033 Lstream_funget_emchar (Lstream *stream, Emchar ch) | |
3034 { | |
3035 Intbyte str[MAX_EMCHAR_LEN]; | |
3036 Bytecount len = set_charptr_emchar (str, ch); | |
3037 Lstream_unread (stream, str, len); | |
3038 } | |
3039 | |
3040 #endif /* MULE */ | |
3041 | |
3042 | |
3043 /************************************************************************/ | |
3044 /* Lisp primitives for working with characters */ | |
3045 /************************************************************************/ | |
3046 | |
3047 DEFUN ("make-char", Fmake_char, 2, 3, 0, /* | |
3048 Make a character from CHARSET and octets ARG1 and ARG2. | |
3049 ARG2 is required only for characters from two-dimensional charsets. | |
3050 | |
3051 Each octet should be in the range 32 through 127 for a 96 or 96x96 | |
3052 charset and 33 through 126 for a 94 or 94x94 charset. (Most charsets | |
3053 are either 96 or 94x94.) Note that this is 32 more than the values | |
3054 typically given for 94x94 charsets. When two octets are required, the | |
3055 order is "standard" -- the same as appears in ISO-2022 encodings, | |
3056 reference tables, etc. | |
3057 | |
3058 \(Note the following non-obvious result: Computerized translation | |
3059 tables often encode the two octets as the high and low bytes, | |
3060 respectively, of a hex short, while when there's only one octet, it | |
3061 goes in the low byte. When decoding such a value, you need to treat | |
3062 the two cases differently when calling make-char: One is (make-char | |
3063 CHARSET HIGH LOW), the other is (make-char CHARSET LOW).) | |
3064 | |
3065 For example, (make-char 'latin-iso8859-2 185) or (make-char | |
3066 'latin-iso8859-2 57) will return the Latin 2 character s with caron. | |
3067 | |
3068 As another example, the Japanese character for "kawa" (stream), which | |
3069 looks something like this: | |
3070 | |
3071 | | | |
3072 | | | | |
3073 | | | | |
3074 | | | | |
3075 / | | |
3076 | |
3077 appears in the Unicode Standard (version 2.0) on page 7-287 with the | |
3078 following values (see also page 7-4): | |
3079 | |
3080 U 5DDD (Unicode) | |
3081 G 0-2008 (GB 2312-80) | |
3082 J 0-3278 (JIS X 0208-1990) | |
3083 K 0-8425 (KS C 5601-1987) | |
3084 B A474 (Big Five) | |
3085 C 1-4455 (CNS 11643-1986 (1st plane)) | |
3086 A 213C34 (ANSI Z39.64-1989) | |
3087 | |
3088 These are equivalent to: | |
3089 | |
3090 \(make-char 'chinese-gb2312 52 40) | |
3091 \(make-char 'japanese-jisx0208 64 110) | |
3092 \(make-char 'korean-ksc5601 116 57) | |
3093 \(make-char 'chinese-cns11643-1 76 87) | |
3094 \(decode-big5-char '(164 . 116)) | |
3095 | |
3096 \(All codes above are two decimal numbers except for Big Five and ANSI | |
3097 Z39.64, which we don't support. We add 32 to each of the decimal | |
3098 numbers. Big Five is split in a rather hackish fashion into two | |
3099 charsets, `big5-1' and `big5-2', due to its excessive size -- 94x157, | |
3100 with the first codepoint in the range 0xA1 to 0xFE and the second in | |
3101 the range 0x40 to 0x7E or 0xA1 to 0xFE. `decode-big5-char' is used to | |
3102 generate the char from its codes, and `encode-big5-char' extracts the | |
3103 codes.) | |
3104 | |
3105 When compiled without MULE, this function does not do much, but it's | |
3106 provided for compatibility. In this case, the following CHARSET symbols | |
3107 are allowed: | |
3108 | |
3109 `ascii' -- ARG1 should be in the range 0 through 127. | |
3110 `control-1' -- ARG1 should be in the range 128 through 159. | |
3111 else -- ARG1 is coerced to be between 0 and 255, and then the high | |
3112 bit is set. | |
3113 | |
3114 `int-to-char of the resulting ARG1' is returned, and ARG2 is always ignored. | |
3115 */ | |
3116 (charset, arg1, arg2)) | |
3117 { | |
3118 #ifdef MULE | |
3119 Lisp_Charset *cs; | |
3120 int a1, a2; | |
3121 int lowlim, highlim; | |
3122 | |
3123 charset = Fget_charset (charset); | |
3124 cs = XCHARSET (charset); | |
3125 | |
3126 if (EQ (charset, Vcharset_ascii)) lowlim = 0, highlim = 127; | |
3127 else if (EQ (charset, Vcharset_control_1)) lowlim = 0, highlim = 31; | |
3128 else if (CHARSET_CHARS (cs) == 94) lowlim = 33, highlim = 126; | |
3129 else /* CHARSET_CHARS (cs) == 96) */ lowlim = 32, highlim = 127; | |
3130 | |
3131 CHECK_INT (arg1); | |
3132 /* It is useful (and safe, according to Olivier Galibert) to strip | |
3133 the 8th bit off ARG1 and ARG2 because it allows programmers to | |
3134 write (make-char 'latin-iso8859-2 CODE) where code is the actual | |
3135 Latin 2 code of the character. */ | |
3136 a1 = XINT (arg1) & 0x7f; | |
3137 if (a1 < lowlim || a1 > highlim) | |
3138 args_out_of_range_3 (arg1, make_int (lowlim), make_int (highlim)); | |
3139 | |
3140 if (CHARSET_DIMENSION (cs) == 1) | |
3141 { | |
3142 if (!NILP (arg2)) | |
3143 invalid_argument | |
3144 ("Charset is of dimension one; second octet must be nil", arg2); | |
3145 return make_char (MAKE_CHAR (charset, a1, 0)); | |
3146 } | |
3147 | |
3148 CHECK_INT (arg2); | |
3149 a2 = XINT (arg2) & 0x7f; | |
3150 if (a2 < lowlim || a2 > highlim) | |
3151 args_out_of_range_3 (arg2, make_int (lowlim), make_int (highlim)); | |
3152 | |
3153 return make_char (MAKE_CHAR (charset, a1, a2)); | |
3154 #else | |
3155 int a1; | |
3156 int lowlim, highlim; | |
3157 | |
3158 if (EQ (charset, Qascii)) lowlim = 0, highlim = 127; | |
3159 else if (EQ (charset, Qcontrol_1)) lowlim = 0, highlim = 31; | |
3160 else lowlim = 0, highlim = 127; | |
3161 | |
3162 CHECK_INT (arg1); | |
3163 /* It is useful (and safe, according to Olivier Galibert) to strip | |
3164 the 8th bit off ARG1 and ARG2 because it allows programmers to | |
3165 write (make-char 'latin-iso8859-2 CODE) where code is the actual | |
3166 Latin 2 code of the character. */ | |
3167 a1 = XINT (arg1) & 0x7f; | |
3168 if (a1 < lowlim || a1 > highlim) | |
3169 args_out_of_range_3 (arg1, make_int (lowlim), make_int (highlim)); | |
3170 | |
3171 if (EQ (charset, Qascii)) | |
3172 return make_char (a1); | |
3173 return make_char (a1 + 128); | |
3174 #endif /* MULE */ | |
3175 } | |
3176 | |
3177 #ifdef MULE | |
3178 | |
3179 DEFUN ("char-charset", Fchar_charset, 1, 1, 0, /* | |
3180 Return the character set of char CH. | |
3181 */ | |
3182 (ch)) | |
3183 { | |
3184 CHECK_CHAR_COERCE_INT (ch); | |
3185 | |
3186 return XCHARSET_NAME (CHARSET_BY_LEADING_BYTE | |
3187 (CHAR_LEADING_BYTE (XCHAR (ch)))); | |
3188 } | |
3189 | |
3190 DEFUN ("char-octet", Fchar_octet, 1, 2, 0, /* | |
3191 Return the octet numbered N (should be 0 or 1) of char CH. | |
3192 N defaults to 0 if omitted. | |
3193 */ | |
3194 (ch, n)) | |
3195 { | |
3196 Lisp_Object charset; | |
3197 int octet0, octet1; | |
3198 | |
3199 CHECK_CHAR_COERCE_INT (ch); | |
3200 | |
3201 BREAKUP_CHAR (XCHAR (ch), charset, octet0, octet1); | |
3202 | |
3203 if (NILP (n) || EQ (n, Qzero)) | |
3204 return make_int (octet0); | |
3205 else if (EQ (n, make_int (1))) | |
3206 return make_int (octet1); | |
3207 else | |
3208 invalid_constant ("Octet number must be 0 or 1", n); | |
3209 } | |
3210 | |
3211 DEFUN ("split-char", Fsplit_char, 1, 1, 0, /* | |
3212 Return list of charset and one or two position-codes of CHAR. | |
3213 */ | |
3214 (character)) | |
3215 { | |
3216 /* This function can GC */ | |
3217 struct gcpro gcpro1, gcpro2; | |
3218 Lisp_Object charset = Qnil; | |
3219 Lisp_Object rc = Qnil; | |
3220 int c1, c2; | |
3221 | |
3222 GCPRO2 (charset, rc); | |
3223 CHECK_CHAR_COERCE_INT (character); | |
3224 | |
3225 BREAKUP_CHAR (XCHAR (character), charset, c1, c2); | |
3226 | |
3227 if (XCHARSET_DIMENSION (Fget_charset (charset)) == 2) | |
3228 { | |
3229 rc = list3 (XCHARSET_NAME (charset), make_int (c1), make_int (c2)); | |
3230 } | |
3231 else | |
3232 { | |
3233 rc = list2 (XCHARSET_NAME (charset), make_int (c1)); | |
3234 } | |
3235 UNGCPRO; | |
3236 | |
3237 return rc; | |
3238 } | |
3239 | |
3240 #endif /* MULE */ | |
3241 | |
3242 | |
3243 /************************************************************************/ | |
3244 /* composite character functions */ | |
3245 /************************************************************************/ | |
3246 | |
3247 #ifdef ENABLE_COMPOSITE_CHARS | |
3248 | |
3249 Emchar | |
3250 lookup_composite_char (Intbyte *str, int len) | |
3251 { | |
3252 Lisp_Object lispstr = make_string (str, len); | |
3253 Lisp_Object ch = Fgethash (lispstr, | |
3254 Vcomposite_char_string2char_hash_table, | |
3255 Qunbound); | |
3256 Emchar emch; | |
3257 | |
3258 if (UNBOUNDP (ch)) | |
3259 { | |
3260 if (composite_char_row_next >= 128) | |
3261 invalid_operation ("No more composite chars available", lispstr); | |
3262 emch = MAKE_CHAR (Vcharset_composite, composite_char_row_next, | |
3263 composite_char_col_next); | |
3264 Fputhash (make_char (emch), lispstr, | |
3265 Vcomposite_char_char2string_hash_table); | |
3266 Fputhash (lispstr, make_char (emch), | |
3267 Vcomposite_char_string2char_hash_table); | |
3268 composite_char_col_next++; | |
3269 if (composite_char_col_next >= 128) | |
3270 { | |
3271 composite_char_col_next = 32; | |
3272 composite_char_row_next++; | |
3273 } | |
3274 } | |
3275 else | |
3276 emch = XCHAR (ch); | |
3277 return emch; | |
3278 } | |
3279 | |
3280 Lisp_Object | |
3281 composite_char_string (Emchar ch) | |
3282 { | |
3283 Lisp_Object str = Fgethash (make_char (ch), | |
3284 Vcomposite_char_char2string_hash_table, | |
3285 Qunbound); | |
3286 assert (!UNBOUNDP (str)); | |
3287 return str; | |
3288 } | |
3289 | |
3290 xxDEFUN ("make-composite-char", Fmake_composite_char, 1, 1, 0, /* | |
3291 Convert a string into a single composite character. | |
3292 The character is the result of overstriking all the characters in | |
3293 the string. | |
3294 */ | |
3295 (string)) | |
3296 { | |
3297 CHECK_STRING (string); | |
3298 return make_char (lookup_composite_char (XSTRING_DATA (string), | |
3299 XSTRING_LENGTH (string))); | |
3300 } | |
3301 | |
3302 xxDEFUN ("composite-char-string", Fcomposite_char_string, 1, 1, 0, /* | |
3303 Return a string of the characters comprising a composite character. | |
3304 */ | |
3305 (ch)) | |
3306 { | |
3307 Emchar emch; | |
3308 | |
3309 CHECK_CHAR (ch); | |
3310 emch = XCHAR (ch); | |
3311 if (CHAR_LEADING_BYTE (emch) != LEADING_BYTE_COMPOSITE) | |
3312 invalid_argument ("Must be composite char", ch); | |
3313 return composite_char_string (emch); | |
3314 } | |
3315 #endif /* ENABLE_COMPOSITE_CHARS */ | |
3316 | |
3317 | |
3318 /************************************************************************/ | |
3319 /* initialization */ | |
3320 /************************************************************************/ | |
3321 | |
3322 void | |
3323 init_eistring_once_early (void) | |
3324 { | |
3325 the_eistring_malloc_zero_init = the_eistring_zero_init; | |
3326 the_eistring_malloc_zero_init.mallocp_ = 1; | |
3327 } | |
3328 | |
3329 void | |
3330 syms_of_text (void) | |
3331 { | |
3332 DEFSUBR (Fmake_char); | |
3333 | |
3334 #ifdef MULE | |
3335 DEFSUBR (Fchar_charset); | |
3336 DEFSUBR (Fchar_octet); | |
3337 DEFSUBR (Fsplit_char); | |
3338 | |
3339 #ifdef ENABLE_COMPOSITE_CHARS | |
3340 DEFSUBR (Fmake_composite_char); | |
3341 DEFSUBR (Fcomposite_char_string); | |
3342 #endif | |
3343 #endif /* MULE */ | |
3344 } | |
3345 | |
3346 void | |
3347 reinit_vars_of_text (void) | |
3348 { | |
3349 int i; | |
3350 | |
3351 conversion_in_dynarr_list = Dynarr_new2 (Intbyte_dynarr_dynarr, | |
3352 Intbyte_dynarr *); | |
3353 conversion_out_dynarr_list = Dynarr_new2 (Extbyte_dynarr_dynarr, | |
3354 Extbyte_dynarr *); | |
3355 | |
3356 /* #### Olivier, why does this need to be reinitted? */ | |
3357 for (i = 0; i <= MAX_BYTEBPOS_GAP_SIZE_3; i++) | |
3358 three_to_one_table[i] = i / 3; | |
3359 } | |
3360 | |
/* Initialize this file's variables: everything reinit_vars_of_text()
   sets up, plus, when composite characters are enabled, the allocation
   counters and the two lookup tables, which are staticpro'd. */

void
vars_of_text (void)
{
  reinit_vars_of_text ();

#ifdef ENABLE_COMPOSITE_CHARS
  /* #### not dumped properly */
  /* Both allocation counters start at 32. */
  composite_char_row_next = 32;
  composite_char_col_next = 32;

  /* string -> char table; keys are compared with `equal'. */
  Vcomposite_char_string2char_hash_table =
    make_lisp_hash_table (500, HASH_TABLE_NON_WEAK, HASH_TABLE_EQUAL);
  staticpro (&Vcomposite_char_string2char_hash_table);

  /* char -> string table; keys are compared with `eq'. */
  Vcomposite_char_char2string_hash_table =
    make_lisp_hash_table (500, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
  staticpro (&Vcomposite_char_char2string_hash_table);
#endif /* ENABLE_COMPOSITE_CHARS */
}