comparison src/text.c @ 771:943eaba38521

[xemacs-hg @ 2002-03-13 08:51:24 by ben] The big ben-mule-21-5 check-in! Various files were added and deleted. See CHANGES-ben-mule. There are still some test suite failures. No crashes, though. Many of the failures have to do with problems in the test suite itself rather than in the actual code. I'll be addressing these in the next day or so -- none of the test suite failures are at all critical. Meanwhile I'll be trying to address the biggest issues -- i.e. build or run failures, which will almost certainly happen on various platforms. All comments should be sent to ben@xemacs.org -- use a Cc: if necessary when sending to mailing lists. There will be pre- and post- tags, something like pre-ben-mule-21-5-merge-in, and post-ben-mule-21-5-merge-in.
author ben
date Wed, 13 Mar 2002 08:54:06 +0000
parents
children 026c5bf9c134
comparison
equal deleted inserted replaced
770:336a418893b5 771:943eaba38521
1 /* Buffer manipulation primitives for XEmacs.
2 Copyright (C) 1995 Sun Microsystems, Inc.
3 Copyright (C) 1995, 1996, 2000, 2001, 2002 Ben Wing.
4 Copyright (C) 1999 Martin Buchholz.
5
6 This file is part of XEmacs.
7
8 XEmacs is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 2, or (at your option) any
11 later version.
12
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with XEmacs; see the file COPYING. If not, write to
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
22
23 /* Synched up with: Not in FSF. */
24
25 /* Authorship:
26 */
27
28 #include <config.h>
29 #include "lisp.h"
30
31 #include "buffer.h"
32 #include "charset.h"
33 #include "file-coding.h"
34 #include "lstream.h"
35
36
37 /************************************************************************/
38 /* long comments */
39 /************************************************************************/
40
41 /*
42 There are three possible ways to specify positions in a buffer. All
43 of these are one-based: the beginning of the buffer is position or
44 index 1, and 0 is not a valid position.
45
46 As a "buffer position" (typedef Charbpos):
47
48 This is an index specifying an offset in characters from the
49 beginning of the buffer. Note that buffer positions are
50 logically *between* characters, not on a character. The
51 difference between two buffer positions specifies the number of
52 characters between those positions. Buffer positions are the
53 only kind of position externally visible to the user.
54
55 As a "byte index" (typedef Bytebpos):
56
57 This is an index over the bytes used to represent the characters
58 in the buffer. If there is no Mule support, this is identical
59 to a buffer position, because each character is represented
60 using one byte. However, with Mule support, many characters
61 require two or more bytes for their representation, and so a
62 byte index may be greater than the corresponding buffer
63 position.
64
65 As a "memory index" (typedef Membpos):
66
67 This is the byte index adjusted for the gap. For positions
68 before the gap, this is identical to the byte index. For
69 positions after the gap, this is the byte index plus the gap
70 size. There are two possible memory indices for the gap
71 position; the memory index at the beginning of the gap should
72 always be used, except in code that deals with manipulating the
73 gap, where both indices may be seen. The address of the
74 character "at" (i.e. following) a particular position can be
75 obtained from the formula
76
77 buffer_start_address + memory_index(position) - 1
78
79 except in the case of characters at the gap position.
80
81 Other typedefs:
82 ===============
83
84 Emchar:
85 -------
86 This typedef represents a single Emacs character, which can be
87 ASCII, ISO-8859, or some extended character, as would typically
88 be used for Kanji. Note that the representation of a character
89 as an Emchar is *not* the same as the representation of that
90 same character in a string; thus, you cannot do the standard
91 C trick of passing a pointer to a character to a function that
92 expects a string.
93
94 An Emchar takes up 19 bits of representation and (for code
95 compatibility and such) is compatible with an int. This
96 representation is visible on the Lisp level. The important
97 characteristics of the Emchar representation are
98
99 -- values 0x00 - 0x7f represent ASCII.
100 -- values 0x80 - 0xff represent the right half of ISO-8859-1.
101 -- values 0x100 and up represent all other characters.
102
103 This means that Emchar values are upwardly compatible with
104 the standard 8-bit representation of ASCII/ISO-8859-1.
105
106 Intbyte:
107 --------
108 The data in a buffer or string is logically made up of Intbyte
109 objects, where an Intbyte takes up the same amount of space as a
110 char. (It is declared differently, though, to catch invalid
111 usages.) Strings stored using Intbytes are said to be in
112 "internal format". The important characteristics of internal
113 format are
114
115 -- ASCII characters are represented as a single Intbyte,
116 in the range 0 - 0x7f.
117 -- All other characters are represented as an Intbyte in
118 the range 0x80 - 0x9f followed by one or more Intbytes
119 in the range 0xa0 to 0xff.
120
121 This leads to a number of desirable properties:
122
123 -- Given the position of the beginning of a character,
124 you can find the beginning of the next or previous
125 character in constant time.
126 -- When searching for a substring or an ASCII character
127 within the string, you need merely use standard
128 searching routines.
129
130 array of char:
131 --------------
132 Strings that go in or out of Emacs are in "external format",
133 typedef'ed as an array of char or a char *. There is more
134 than one external format (JIS, EUC, etc.) but they all
135 have similar properties. They are modal encodings,
136 which is to say that the meaning of particular bytes is
137 not fixed but depends on what "mode" the string is currently
138 in (e.g. bytes in the range 0 - 0x7f might be
139 interpreted as ASCII, or as Hiragana, or as 2-byte Kanji,
140 depending on the current mode). The mode starts out in
141 ASCII/ISO-8859-1 and is switched using escape sequences --
142 for example, in the JIS encoding, 'ESC $ B' switches to a
143 mode where pairs of bytes in the range 0 - 0x7f
144 are interpreted as Kanji characters.
145
146 External-formatted data is generally desirable for passing
147 data between programs because it is upwardly compatible
148 with standard ASCII/ISO-8859-1 strings and may require
149 less space than internal encodings such as the one
150 described above. In addition, some encodings (e.g. JIS)
151 keep all characters (except the ESC used to switch modes)
152 in the printing ASCII range 0x20 - 0x7e, which results in
153 a much higher probability that the data will avoid being
154 garbled in transmission. Externally-formatted data is
155 generally not very convenient to work with, however, and
156 for this reason is usually converted to internal format
157 before any work is done on the string.
158
159 NOTE: filenames need to be in external format so that
160 ISO-8859-1 characters come out correctly.
161
162 Charcount:
163 ----------
164 This typedef represents a count of characters, such as
165 a character offset into a string or the number of
166 characters between two positions in a buffer. The
167 difference between two Charbpos's is a Charcount, and
168 character positions in a string are represented using
169 a Charcount.
170
171 Bytecount:
172 ----------
173 Similar to a Charcount but represents a count of bytes.
174 The difference between two Bytebpos's is a Bytecount.
175
176
177 Usage of the various representations:
178 =====================================
179
180 Memory indices are used in low-level functions in insdel.c and for
181 extent endpoints and marker positions. The reason for this is that
182 this way, the extents and markers don't need to be updated for most
183 insertions, which merely shrink the gap and don't move any
184 characters around in memory.
185
186 (The beginning-of-gap memory index simplifies insertions w.r.t.
187 markers, because text usually gets inserted after markers. For
188 extents, it is merely for consistency, because text can get
189 inserted either before or after an extent's endpoint depending on
190 the open/closedness of the endpoint.)
191
192 Byte indices are used in other code that needs to be fast,
193 such as the searching, redisplay, and extent-manipulation code.
194
195 Buffer positions are used in all other code. This is because this
196 representation is easiest to work with (especially since Lisp
197 code always uses buffer positions), necessitates the fewest
198 changes to existing code, and is the safest (e.g. if the text gets
199 shifted underneath a buffer position, it will still point to a
200 character; if text is shifted under a byte index, it might point
201 to the middle of a character, which would be bad).
202
203 Similarly, Charcounts are used in all code that deals with strings
204 except for code that needs to be fast, which uses Bytecounts.
205
206 Strings are always passed around internally using internal format.
207 Conversions to and from external format are performed at the time
208 that the data goes in or out of Emacs.
209
210 Working with the various representations:
211 ========================================= */
212
213 /* We write things this way because it's very important that
214 MAX_BYTEBPOS_GAP_SIZE_3 is a multiple of 3. (As it happens,
215 65535 is a multiple of 3, but this may not always be the
216 case.) */
217
218
219 /*
220 1. Character Sets
221 =================
222
223 A character set (or "charset") is an ordered set of characters.
224 A particular character in a charset is indexed using one or
225 more "position codes", which are non-negative integers.
226 The number of position codes needed to identify a particular
227 character in a charset is called the "dimension" of the
228 charset. In XEmacs/Mule, all charsets have 1 or 2 dimensions,
229 and the size of all charsets (except for a few special cases)
230 is either 94, 96, 94 by 94, or 96 by 96. The range of
231 position codes used to index characters from any of these
232 types of character sets is as follows:
233
234 Charset type Position code 1 Position code 2
235 ------------------------------------------------------------
236 94 33 - 126 N/A
237 96 32 - 127 N/A
238 94x94 33 - 126 33 - 126
239 96x96 32 - 127 32 - 127
240
241 Note that in the above cases position codes do not start at
242 an expected value such as 0 or 1. The reason for this will
243 become clear later.
244
245 For example, Latin-1 is a 96-character charset, and JISX0208
246 (the Japanese national character set) is a 94x94-character
247 charset.
248
249 [Note that, although the ranges above define the *valid*
250 position codes for a charset, some of the slots in a particular
251 charset may in fact be empty. This is the case for JISX0208,
252 for example, where (e.g.) all the slots whose first
253 position code is in the range 118 - 127 are empty.]
254
255 There are three charsets that do not follow the above rules.
256 All of them have one dimension, and have ranges of position
257 codes as follows:
258
259 Charset name Position code 1
260 ------------------------------------
261 ASCII 0 - 127
262 Control-1 0 - 31
263 Composite 0 - some large number
264
265 (The upper bound of the position code for composite characters
266 has not yet been determined, but it will probably be at
267 least 16,383).
268
269 ASCII is the union of two subsidiary character sets:
270 Printing-ASCII (the printing ASCII character set,
271 consisting of position codes 33 - 126, like for a standard
272 94-character charset) and Control-ASCII (the non-printing
273 characters that would appear in a binary file with codes 0
274 - 32 and 127).
275
276 Control-1 contains the non-printing characters that would
277 appear in a binary file with codes 128 - 159.
278
279 Composite contains characters that are generated by
280 overstriking one or more characters from other charsets.
281
282 Note that some characters in ASCII, and all characters
283 in Control-1, are "control" (non-printing) characters.
284 These have no printed representation but instead control
285 some other function of the printing (e.g. TAB or 9 moves
286 the current character position to the next tab stop).
287 All other characters in all charsets are "graphic"
288 (printing) characters.
289
290 When a binary file is read in, the bytes in the file are
291 assigned to character sets as follows:
292
293 Bytes Character set Range
294 --------------------------------------------------
295 0 - 127 ASCII 0 - 127
296 128 - 159 Control-1 0 - 31
297 160 - 255 Latin-1 32 - 127
298
299 This is a bit ad-hoc but gets the job done.
300
301 2. Encodings
302 ============
303
304 An "encoding" is a way of numerically representing
305 characters from one or more character sets. If an encoding
306 only encompasses one character set, then the position codes
307 for the characters in that character set could be used
308 directly. This is not possible, however, if more than one
309 character set is to be used in the encoding.
310
311 For example, the conversion detailed above between bytes in
312 a binary file and characters is effectively an encoding
313 that encompasses the three character sets ASCII, Control-1,
314 and Latin-1 in a stream of 8-bit bytes.
315
316 Thus, an encoding can be viewed as a way of encoding
317 characters from a specified group of character sets using a
318 stream of bytes, each of which contains a fixed number of
319 bits (but not necessarily 8, as in the common usage of
320 "byte").
321
322 Here are descriptions of a couple of common
323 encodings:
324
325
326 A. Japanese EUC (Extended Unix Code)
327
328 This encompasses the character sets:
329 - Printing-ASCII,
330 - Katakana-JISX0201 (half-width katakana, the right half of JISX0201).
331 - Japanese-JISX0208
332 - Japanese-JISX0212
333 It uses 8-bit bytes.
334
335 Note that Printing-ASCII and Katakana-JISX0201 are 94-character
336 charsets, while Japanese-JISX0208 is a 94x94-character charset.
337
338 The encoding is as follows:
339
340 Character set Representation (PC == position-code)
341 ------------- --------------
342 Printing-ASCII PC1
343 Japanese-JISX0208 PC1 + 0x80 | PC2 + 0x80
344 Katakana-JISX0201 0x8E | PC1 + 0x80
345
346
347 B. JIS7
348
349 This encompasses the character sets:
350 - Printing-ASCII
351 - Latin-JISX0201 (the left half of JISX0201; this character set is
352 very similar to Printing-ASCII and is a 94-character charset)
353 - Japanese-JISX0208
354 - Katakana-JISX0201
355 It uses 7-bit bytes.
356
357 Unlike Japanese EUC, this is a "modal" encoding, which
358 means that there are multiple states that the encoding can
359 be in, which affect how the bytes are to be interpreted.
360 Special sequences of bytes (called "escape sequences")
361 are used to change states.
362
363 The encoding is as follows:
364
365 Character set Representation
366 ------------- --------------
367 Printing-ASCII PC1
368 Latin-JISX0201 PC1
369 Katakana-JISX0201 PC1
370 Japanese-JISX0208 PC1 | PC2
371
372 Escape sequence ASCII equivalent Meaning
373 --------------- ---------------- -------
374 0x1B 0x28 0x42 ESC ( B invoke Printing-ASCII
375 0x1B 0x28 0x4A ESC ( J invoke Latin-JISX0201
376 0x1B 0x28 0x49 ESC ( I invoke Katakana-JISX0201
377 0x1B 0x24 0x42 ESC $ B invoke Japanese-JISX0208
378
379 Initially, Printing-ASCII is invoked.
380
381 3. Internal Mule Encodings
382 ==========================
383
384 In XEmacs/Mule, each character set is assigned a unique number,
385 called a "leading byte". This is used in the encodings of a
386 character. Leading bytes are in the range 0x80 - 0xFF
387 (except for ASCII, which has a leading byte of 0), although
388 some leading bytes are reserved.
389
390 Charsets whose leading byte is in the range 0x80 - 0x9F are
391 called "official" and are used for built-in charsets.
392 Other charsets are called "private" and have leading bytes
393 in the range 0xA0 - 0xFF; these are user-defined charsets.
394
395 More specifically:
396
397 Character set Leading byte
398 ------------- ------------
399 ASCII 0 (0x7F in arrays indexed by leading byte)
400 Composite 0x8D
401 Dimension-1 Official 0x80 - 0x8C/0x8D
402 (0x8E is free)
403 Control 0x8F
404 Dimension-2 Official 0x90 - 0x99
405 (0x9A - 0x9D are free)
406 Dimension-1 Private Marker 0x9E
407 Dimension-2 Private Marker 0x9F
408 Dimension-1 Private 0xA0 - 0xEF
409 Dimension-2 Private 0xF0 - 0xFF
410
411 There are two internal encodings for characters in XEmacs/Mule.
412 One is called "string encoding" and is an 8-bit encoding that
413 is used for representing characters in a buffer or string.
414 It uses 1 to 4 bytes per character. The other is called
415 "character encoding" and is a 19-bit encoding that is used
416 for representing characters individually in a variable.
417
418 (In the following descriptions, we'll ignore composite
419 characters for the moment. We also give a general (structural)
420 overview first, followed later by the exact details.)
421
422 A. Internal String Encoding
423
424 ASCII characters are encoded using their position code directly.
425 Other characters are encoded using their leading byte followed
426 by their position code(s) with the high bit set. Characters
427 in private character sets have their leading byte prefixed with
428 a "leading byte prefix", which is either 0x9E or 0x9F. (No
429 character sets are ever assigned these leading bytes.) Specifically:
430
431 Character set Encoding (PC == position-code)
432 ------------- -------- (LB == leading-byte)
433 ASCII PC1 |
434 Control-1 LB | PC1 + 0xA0
435 Dimension-1 official LB | PC1 + 0x80
436 Dimension-1 private 0x9E | LB | PC1 + 0x80
437 Dimension-2 official LB | PC1 + 0x80 | PC2 + 0x80
438 Dimension-2 private 0x9F | LB | PC1 + 0x80 | PC2 + 0x80
439
440 The basic characteristic of this encoding is that the first byte
441 of all characters is in the range 0x00 - 0x9F, and the second and
442 following bytes of all characters are in the range 0xA0 - 0xFF.
443 This means that it is impossible to get out of sync, or more
444 specifically:
445
446 1. Given any byte position, the beginning of the character it is
447 within can be determined in constant time.
448 2. Given any byte position at the beginning of a character, the
449 beginning of the next character can be determined in constant
450 time.
451 3. Given any byte position at the beginning of a character, the
452 beginning of the previous character can be determined in constant
453 time.
454 4. Textual searches can simply treat encoded strings as if they
455 were encoded in a one-byte-per-character fashion rather than
456 the actual multi-byte encoding.
457
458 None of the standard non-modal encodings meet all of these
459 conditions. For example, EUC satisfies only (2) and (3), while
460 Shift-JIS and Big5 (not yet described) satisfy only (2). (All
461 non-modal encodings must satisfy (2), in order to be unambiguous.)
462
463 B. Internal Character Encoding
464
465 One 19-bit word represents a single character. The word is
466 separated into three fields:
467
468 Bit number: 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
469 <------------> <------------------> <------------------>
470 Field: 1 2 3
471
472 Note that fields 2 and 3 hold 7 bits each, while field 1 holds 5 bits.
473
474 Character set Field 1 Field 2 Field 3
475 ------------- ------- ------- -------
476 ASCII 0 0 PC1
477 range: (00 - 7F)
478 Control-1 0 1 PC1
479 range: (00 - 1F)
480 Dimension-1 official 0 LB - 0x7F PC1
481 range: (01 - 0D) (20 - 7F)
482 Dimension-1 private 0 LB - 0x80 PC1
483 range: (20 - 6F) (20 - 7F)
484 Dimension-2 official LB - 0x8F PC1 PC2
485 range: (01 - 0A) (20 - 7F) (20 - 7F)
486 Dimension-2 private LB - 0xE1 PC1 PC2
487 range: (0F - 1E) (20 - 7F) (20 - 7F)
488 Composite 0x1F ? ?
489
490 Note that character codes 0 - 255 are the same as the "binary encoding"
491 described above.
492 */
493
494 /*
495 About Unicode support:
496
497 Adding Unicode support is very desirable. Unicode will likely be a
498 very common representation in the future, and thus we should
499 represent Unicode characters using three bytes instead of four.
500 This means we need to find leading bytes for Unicode. Given that
501 there are 65,536 characters in Unicode and we can attach 96x96 =
502 9,216 characters per leading byte, we need eight leading bytes for
503 Unicode. We currently have four free (0x9A - 0x9D), and with a
504 little bit of rearranging we can get five: ASCII doesn't really
505 need to take up a leading byte. (We could just as well use 0x7F,
506 with a little change to the functions that assume that 0x80 is the
507 lowest leading byte.) This means we still need to dump three
508 leading bytes and move them into private space. The CNS charsets
509 are good candidates since they are rarely used, and
510 JAPANESE_JISX0208_1978 is becoming less and less used and could
511 also be dumped. */
512
513
514 /* Composite characters are characters constructed by overstriking two
515 or more regular characters.
516
517 1) The old Mule implementation involves storing composite characters
518 in a buffer as a tag followed by all of the actual characters
519 used to make up the composite character. I think this is a bad
520 idea; it greatly complicates code that wants to handle strings
521 one character at a time because it has to deal with the possibility
522 of great big ungainly characters. It's much more reasonable to
523 simply store an index into a table of composite characters.
524
525 2) The current implementation only allows for 16,384 separate
526 composite characters over the lifetime of the XEmacs process.
527 This could become a potential problem if the user
528 edited lots of different files that use composite characters.
529 Due to FSF bogosity, increasing the number of allowable
530 composite characters under Mule would decrease the number
531 of possible faces that can exist. Mule already has shrunk
532 this to 2048, and further shrinkage would become uncomfortable.
533 No such problems exist in XEmacs.
534
535 Composite characters could be represented as 0x8D C1 C2 C3,
536 where each C[1-3] is in the range 0xA0 - 0xFF. This allows
537 for slightly under 2^20 (one million) composite characters
538 over the XEmacs process lifetime, and you only need to
539 increase the size of a Mule character from 19 to 21 bits.
540 Or you could use 0x8D C1 C2 C3 C4, allowing for about
541 85 million (slightly over 2^26) composite characters. */
542
543
544 /************************************************************************/
545 /* declarations */
546 /************************************************************************/
547
548 Eistring the_eistring_zero_init, the_eistring_malloc_zero_init;
549
550 #define MAX_CHARBPOS_GAP_SIZE_3 (65535/3)
551 #define MAX_BYTEBPOS_GAP_SIZE_3 (3 * MAX_CHARBPOS_GAP_SIZE_3)
552
553 short three_to_one_table[1 + MAX_BYTEBPOS_GAP_SIZE_3];
554
#ifdef MULE

/* rep_bytes_by_first_byte[b] is the number of bytes in the string
   representation of a character whose representation starts with
   byte B.  Only first bytes (0x00 - 0x9f) have entries.

   Looking a value up here is more efficient than the equivalent
   canonical computation:

   XCHARSET_REP_BYTES (CHARSET_BY_LEADING_BYTE (c)) */

const Bytecount rep_bytes_by_first_byte[0xA0] =
{
  /* 0x00 - 0x7f: straight ASCII, one byte each */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x00 - 0x0f */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 - 0x1f */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x20 - 0x2f */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x30 - 0x3f */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40 - 0x4f */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x50 - 0x5f */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60 - 0x6f */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x70 - 0x7f */

  /* 0x80 - 0x8f: Dimension-1 official charsets */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

  /* 0x90 - 0x9d: Dimension-2 official charsets
     0x9e:        Dimension-1 private charsets */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

  /* 0x9f:        Dimension-2 private charsets */
  4
};

#ifdef ENABLE_COMPOSITE_CHARS

/* Two hash tables for composite chars: one maps a string representing
   composed chars to the equivalent char, the other maps the char back
   to the string. */
Lisp_Object Vcomposite_char_char2string_hash_table;
Lisp_Object Vcomposite_char_string2char_hash_table;

static int composite_char_row_next;
static int composite_char_col_next;

#endif /* ENABLE_COMPOSITE_CHARS */

#endif /* MULE */
597
598
599 /************************************************************************/
600 /* qxestr***() functions */
601 /************************************************************************/
602
603 /* Most are inline functions in lisp.h */
604
605 int
606 qxesprintf (Intbyte *buffer, const CIntbyte *format, ...)
607 {
608 va_list args;
609 int retval;
610
611 va_start (args, format);
612 retval = vsprintf ((char *) buffer, format, args);
613 va_end (args);
614
615 return retval;
616 }
617
618 /* strcasecmp() implementation from BSD */
619 static Intbyte strcasecmp_charmap[] = {
620 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
621 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
622 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
623 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
624 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
625 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
626 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
627 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
628 '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
629 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
630 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
631 '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
632 '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
633 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
634 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
635 '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
636 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
637 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
638 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
639 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
640 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
641 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
642 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
643 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
644 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
645 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
646 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
647 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
648 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
649 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
650 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
651 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
652 };
653
654 /* A version that works like generic strcasecmp() -- only collapsing
655 case in ASCII A-Z/a-z. This is safe on Mule strings due to the
656 current representation.
657
658 This version was written by some Berkeley coder, favoring
659 nanosecond improvements over clarity. In all other versions below,
660 we use symmetrical algorithms that may sacrifice a few machine
661 cycles but are MUCH MUCH clearer, which counts a lot more.
662 */
663
664 int
665 qxestrcasecmp (const Intbyte *s1, const Intbyte *s2)
666 {
667 Intbyte *cm = strcasecmp_charmap;
668
669 while (cm[*s1] == cm[*s2++])
670 if (*s1++ == '\0')
671 return (0);
672
673 return (cm[*s1] - cm[*--s2]);
674 }
675
676 int
677 ascii_strcasecmp (const Char_ASCII *s1, const Char_ASCII *s2)
678 {
679 return qxestrcasecmp ((const Intbyte *) s1, (const Intbyte *) s2);
680 }
681
682 int
683 qxestrcasecmp_c (const Intbyte *s1, const Char_ASCII *s2)
684 {
685 return qxestrcasecmp (s1, (const Intbyte *) s2);
686 }
687
688 /* An internationalized version that collapses case in a general fashion.
689 */
690
691 int
692 qxestrcasecmp_i18n (const Intbyte *s1, const Intbyte *s2)
693 {
694 while (*s1 && *s2)
695 {
696 if (DOWNCASE (0, charptr_emchar (s1)) !=
697 DOWNCASE (0, charptr_emchar (s2)))
698 break;
699 INC_CHARPTR (s1);
700 INC_CHARPTR (s2);
701 }
702
703 return (DOWNCASE (0, charptr_emchar (s1)) -
704 DOWNCASE (0, charptr_emchar (s2)));
705 }
706
707 /* The only difference between these next two and
708 qxememcasecmp()/qxememcasecmp_i18n() is that these two will stop if
709 both strings are equal and less than LEN in length, while
710 the mem...() versions would would run off the end. */
711
712 int
713 qxestrncasecmp (const Intbyte *s1, const Intbyte *s2, Bytecount len)
714 {
715 Intbyte *cm = strcasecmp_charmap;
716
717 while (len--)
718 {
719 int diff = cm[*s1] - cm[*s2];
720 if (diff != 0)
721 return diff;
722 if (!*s1)
723 return 0;
724 s1++, s2++;
725 }
726
727 return 0;
728 }
729
730 int
731 ascii_strncasecmp (const Char_ASCII *s1, const Char_ASCII *s2, Bytecount len)
732 {
733 return qxestrncasecmp ((const Intbyte *) s1, (const Intbyte *) s2, len);
734 }
735
736 int
737 qxestrncasecmp_c (const Intbyte *s1, const Char_ASCII *s2, Bytecount len)
738 {
739 return qxestrncasecmp (s1, (const Intbyte *) s2, len);
740 }
741
742 int
743 qxestrncasecmp_i18n (const Intbyte *s1, const Intbyte *s2, Bytecount len)
744 {
745 while (len > 0)
746 {
747 const Intbyte *old_s1 = s1;
748 int diff = (DOWNCASE (0, charptr_emchar (s1)) -
749 DOWNCASE (0, charptr_emchar (s2)));
750 if (diff != 0)
751 return diff;
752 if (!*s1)
753 return 0;
754 INC_CHARPTR (s1);
755 INC_CHARPTR (s2);
756 len -= s1 - old_s1;
757 }
758
759 return 0;
760 }
761
762 int
763 qxememcmp (const Intbyte *s1, const Intbyte *s2, Bytecount len)
764 {
765 return memcmp (s1, s2, len);
766 }
767
768 int
769 qxememcasecmp (const Intbyte *s1, const Intbyte *s2, Bytecount len)
770 {
771 Intbyte *cm = strcasecmp_charmap;
772
773 while (len--)
774 {
775 int diff = cm[*s1] - cm[*s2];
776 if (diff != 0)
777 return diff;
778 s1++, s2++;
779 }
780
781 return 0;
782 }
783
784 int
785 qxememcasecmp_i18n (const Intbyte *s1, const Intbyte *s2, Bytecount len)
786 {
787 while (len > 0)
788 {
789 const Intbyte *old_s1 = s1;
790 int diff = (DOWNCASE (0, charptr_emchar (s1)) -
791 DOWNCASE (0, charptr_emchar (s2)));
792 if (diff != 0)
793 return diff;
794 INC_CHARPTR (s1);
795 INC_CHARPTR (s2);
796 len -= s1 - old_s1;
797 }
798
799 return 0;
800 }
801
802 int
803 lisp_strcasecmp (Lisp_Object s1, Lisp_Object s2)
804 {
805 Intbyte *cm = strcasecmp_charmap;
806 Intbyte *p1 = XSTRING_DATA (s1);
807 Intbyte *p2 = XSTRING_DATA (s2);
808 Intbyte *e1 = p1 + XSTRING_LENGTH (s1);
809 Intbyte *e2 = p2 + XSTRING_LENGTH (s2);
810
811 /* again, we use a symmetric algorithm and favor clarity over
812 nanosecond improvements. */
813 while (1)
814 {
815 /* if we reached the end of either string, compare lengths.
816 do NOT compare the final null byte against anything, in case
817 the other string also has a null byte at that position. */
818 if (p1 == e1 || p2 == e2)
819 return e1 - e2;
820 if (cm[*p1] != cm[*p2])
821 return cm[*p1] - cm[*p2];
822 p1++, p2++;
823 }
824 }
825
826 int
827 lisp_strcasecmp_i18n (Lisp_Object s1, Lisp_Object s2)
828 {
829 Intbyte *p1 = XSTRING_DATA (s1);
830 Intbyte *p2 = XSTRING_DATA (s2);
831 Intbyte *e1 = p1 + XSTRING_LENGTH (s1);
832 Intbyte *e2 = p2 + XSTRING_LENGTH (s2);
833
834 /* again, we use a symmetric algorithm and favor clarity over
835 nanosecond improvements. */
836 while (1)
837 {
838 /* if we reached the end of either string, compare lengths.
839 do NOT compare the final null byte against anything, in case
840 the other string also has a null byte at that position. */
841 assert (p1 <= e1);
842 assert (p2 <= e2);
843 if (p1 == e1 || p2 == e2)
844 return e1 - e2;
845 if (DOWNCASE (0, charptr_emchar (p1)) !=
846 DOWNCASE (0, charptr_emchar (p2)))
847 return (DOWNCASE (0, charptr_emchar (p1)) -
848 DOWNCASE (0, charptr_emchar (p2)));
849 INC_CHARPTR (p1);
850 INC_CHARPTR (p2);
851 }
852 }
853
854
855 /************************************************************************/
856 /* conversion between textual representations */
857 /************************************************************************/
858
859 /* NOTE: Does not reset the Dynarr. */
860
861 void
862 convert_intbyte_string_into_emchar_dynarr (const Intbyte *str, Bytecount len,
863 Emchar_dynarr *dyn)
864 {
865 const Intbyte *strend = str + len;
866
867 while (str < strend)
868 {
869 Emchar ch = charptr_emchar (str);
870 Dynarr_add (dyn, ch);
871 INC_CHARPTR (str);
872 }
873 }
874
875 Charcount
876 convert_intbyte_string_into_emchar_string (const Intbyte *str, Bytecount len,
877 Emchar *arr)
878 {
879 const Intbyte *strend = str + len;
880 Charcount newlen = 0;
881 while (str < strend)
882 {
883 Emchar ch = charptr_emchar (str);
884 arr[newlen++] = ch;
885 INC_CHARPTR (str);
886 }
887 return newlen;
888 }
889
890 /* Convert an array of Emchars into the equivalent string representation.
891 Store into the given Intbyte dynarr. Does not reset the dynarr.
892 Does not add a terminating zero. */
893
894 void
895 convert_emchar_string_into_intbyte_dynarr (Emchar *arr, int nels,
896 Intbyte_dynarr *dyn)
897 {
898 Intbyte str[MAX_EMCHAR_LEN];
899 int i;
900
901 for (i = 0; i < nels; i++)
902 {
903 Bytecount len = set_charptr_emchar (str, arr[i]);
904 Dynarr_add_many (dyn, str, len);
905 }
906 }
907
908 /* Convert an array of Emchars into the equivalent string representation.
909 Malloc the space needed for this and return it. If LEN_OUT is not a
910 NULL pointer, store into LEN_OUT the number of Intbytes in the
911 malloc()ed string. Note that the actual number of Intbytes allocated
912 is one more than this: the returned string is zero-terminated. */
913
914 Intbyte *
915 convert_emchar_string_into_malloced_string (Emchar *arr, int nels,
916 Bytecount *len_out)
917 {
918 /* Damn zero-termination. */
919 Intbyte *str = (Intbyte *) alloca (nels * MAX_EMCHAR_LEN + 1);
920 Intbyte *strorig = str;
921 Bytecount len;
922
923 int i;
924
925 for (i = 0; i < nels; i++)
926 str += set_charptr_emchar (str, arr[i]);
927 *str = '\0';
928 len = str - strorig;
929 str = (Intbyte *) xmalloc (1 + len);
930 memcpy (str, strorig, 1 + len);
931 if (len_out)
932 *len_out = len;
933 return str;
934 }
935
936
937 /************************************************************************/
938 /* charset properties of strings */
939 /************************************************************************/
940
941 void
942 find_charsets_in_intbyte_string (unsigned char *charsets, const Intbyte *str,
943 Bytecount len)
944 {
945 #ifndef MULE
946 /* Telescope this. */
947 charsets[0] = 1;
948 #else
949 const Intbyte *strend = str + len;
950 memset (charsets, 0, NUM_LEADING_BYTES);
951
952 /* #### SJT doesn't like this. */
953 if (len == 0)
954 {
955 charsets[XCHARSET_LEADING_BYTE (Vcharset_ascii) - MIN_LEADING_BYTE] = 1;
956 return;
957 }
958
959 while (str < strend)
960 {
961 charsets[CHAR_LEADING_BYTE (charptr_emchar (str)) - MIN_LEADING_BYTE] =
962 1;
963 INC_CHARPTR (str);
964 }
965 #endif
966 }
967
968 void
969 find_charsets_in_emchar_string (unsigned char *charsets, const Emchar *str,
970 Charcount len)
971 {
972 #ifndef MULE
973 /* Telescope this. */
974 charsets[0] = 1;
975 #else
976 int i;
977
978 memset (charsets, 0, NUM_LEADING_BYTES);
979
980 /* #### SJT doesn't like this. */
981 if (len == 0)
982 {
983 charsets[XCHARSET_LEADING_BYTE (Vcharset_ascii) - MIN_LEADING_BYTE] = 1;
984 return;
985 }
986
987 for (i = 0; i < len; i++)
988 {
989 charsets[CHAR_LEADING_BYTE (str[i]) - MIN_LEADING_BYTE] = 1;
990 }
991 #endif
992 }
993
994 int
995 intbyte_string_displayed_columns (const Intbyte *str, Bytecount len)
996 {
997 int cols = 0;
998 const Intbyte *end = str + len;
999
1000 while (str < end)
1001 {
1002 #ifdef MULE
1003 Emchar ch = charptr_emchar (str);
1004 cols += XCHARSET_COLUMNS (CHAR_CHARSET (ch));
1005 #else
1006 cols++;
1007 #endif
1008 INC_CHARPTR (str);
1009 }
1010
1011 return cols;
1012 }
1013
1014 int
1015 emchar_string_displayed_columns (const Emchar *str, Charcount len)
1016 {
1017 #ifdef MULE
1018 int cols = 0;
1019 int i;
1020
1021 for (i = 0; i < len; i++)
1022 cols += XCHARSET_COLUMNS (CHAR_CHARSET (str[i]));
1023
1024 return cols;
1025 #else /* not MULE */
1026 return len;
1027 #endif
1028 }
1029
1030 Charcount
1031 intbyte_string_nonascii_chars (const Intbyte *str, Bytecount len)
1032 {
1033 #ifdef MULE
1034 const Intbyte *end = str + len;
1035 Charcount retval = 0;
1036
1037 while (str < end)
1038 {
1039 if (!BYTE_ASCII_P (*str))
1040 retval++;
1041 INC_CHARPTR (str);
1042 }
1043
1044 return retval;
1045 #else
1046 return 0;
1047 #endif
1048 }
1049
1050
1051 /***************************************************************************/
1052 /* Eistring helper functions */
1053 /***************************************************************************/
1054
1055 int
1056 eistr_casefiddle_1 (Intbyte *olddata, Bytecount len, Intbyte *newdata,
1057 int downp)
1058 {
1059 Intbyte *endp = olddata + len;
1060 Intbyte *newp = newdata;
1061 int changedp = 0;
1062
1063 while (olddata < endp)
1064 {
1065 Emchar c = charptr_emchar (olddata);
1066 Emchar newc;
1067
1068 if (downp)
1069 newc = DOWNCASE (0, c);
1070 else
1071 newc = UPCASE (0, c);
1072
1073 if (c != newc)
1074 changedp = 1;
1075
1076 newp += set_charptr_emchar (newp, newc);
1077 INC_CHARPTR (olddata);
1078 }
1079
1080 *newp = '\0';
1081
1082 return changedp ? newp - newdata : 0;
1083 }
1084
/* Return a buffer size of at least NEEDED_SIZE, starting from
   OLDBUFSIZE.  While the size is too small it is grown by a factor of
   3/2 (integer arithmetic) and raised to at least 32.  OLDBUFSIZE is
   returned unchanged when it is already large enough. */
int
eifind_large_enough_buffer (int oldbufsize, int needed_size)
{
  int size = oldbufsize;

  for (; size < needed_size; )
    {
      size = size * 3 / 2;
      if (size < 32)
	size = 32;
    }

  return size;
}
1096
1097 void
1098 eito_malloc_1 (Eistring *ei)
1099 {
1100 if (ei->mallocp_)
1101 return;
1102 ei->mallocp_ = 1;
1103 if (ei->data_)
1104 {
1105 Intbyte *newdata;
1106
1107 ei->max_size_allocated_ =
1108 eifind_large_enough_buffer (0, ei->bytelen_ + 1);
1109 newdata = (Intbyte *) xmalloc (ei->max_size_allocated_);
1110 memcpy (newdata, ei->data_, ei->bytelen_ + 1);
1111 ei->data_ = newdata;
1112 }
1113
1114 if (ei->extdata_)
1115 {
1116 Extbyte *newdata = (Extbyte *) xmalloc (ei->extlen_ + 2);
1117
1118 memcpy (newdata, ei->extdata_, ei->extlen_);
1119 /* Double null-terminate in case of Unicode data */
1120 newdata[ei->extlen_] = '\0';
1121 newdata[ei->extlen_ + 1] = '\0';
1122 ei->extdata_ = newdata;
1123 }
1124 }
1125
1126 int
1127 eicmp_1 (Eistring *ei, Bytecount off, Charcount charoff,
1128 Bytecount len, Charcount charlen, const Intbyte *data,
1129 const Eistring *ei2, int is_c, int fold_case)
1130 {
1131 assert ((off < 0) != (charoff < 0));
1132 if (off < 0)
1133 {
1134 off = charcount_to_bytecount (ei->data_, charoff);
1135 if (charlen < 0)
1136 len = -1;
1137 else
1138 len = charcount_to_bytecount (ei->data_ + off, charlen);
1139 }
1140 if (len < 0)
1141 len = ei->bytelen_ - off;
1142
1143 assert (off >= 0 && off <= ei->bytelen_);
1144 assert (len >= 0 && off + len <= ei->bytelen_);
1145 assert ((data == 0) != (ei == 0));
1146 assert ((is_c != 0) == (data != 0));
1147 assert (fold_case >= 0 && fold_case <= 2);
1148
1149 {
1150 Bytecount dstlen;
1151 int result;
1152 const Intbyte *src = ei->data_, *dst;
1153 Bytecount cmplen;
1154
1155 if (data)
1156 {
1157 dst = data;
1158 dstlen = qxestrlen (data);
1159 }
1160 else
1161 {
1162 dst = ei2->data_;
1163 dstlen = ei2->bytelen_;
1164 }
1165
1166 if (is_c)
1167 EI_ASSERT_ASCII ((Char_ASCII *) dst, dstlen);
1168
1169 cmplen = min (len, dstlen);
1170 result = (fold_case == 0 ? qxememcmp (src, dst, cmplen) :
1171 fold_case == 1 ? qxememcasecmp (src, dst, cmplen) :
1172 qxememcasecmp_i18n (src, dst, cmplen));
1173
1174 if (result)
1175 return result;
1176
1177 return len - dstlen;
1178 }
1179 }
1180
1181 Intbyte *
1182 eicpyout_malloc_fmt (Eistring *eistr, Bytecount *len_out, Internal_Format fmt)
1183 {
1184 Intbyte *ptr;
1185
1186 assert (fmt == FORMAT_DEFAULT);
1187 ptr = xnew_array (Intbyte, eistr->bytelen_ + 1);
1188 if (len_out)
1189 *len_out = eistr->bytelen_;
1190 memcpy (ptr, eistr->data_, eistr->bytelen_ + 1);
1191 return ptr;
1192 }
1193
1194
1195 /************************************************************************/
1196 /* Charcount/Bytecount conversion */
1197 /************************************************************************/
1198
1199 /* Optimization. Do it. Live it. Love it. */
1200
1201 #ifdef MULE
1202
1203 /* We include the basic functions here that require no specific
1204 knowledge of how data is Mule-encoded into a buffer other
1205 than the basic (00 - 7F), (80 - 9F), (A0 - FF) scheme.
1206 Anything that requires more specific knowledge goes into
1207 mule-charset.c. */
1208
1209 /* Given a pointer to a text string and a length in bytes, return
1210 the equivalent length in characters. */
1211
/* Implementation note: the scan alternates between two fast paths, one
   for runs of ASCII bytes (examined a machine word at a time once PTR
   is suitably aligned) and one for runs of characters that share the
   same leading byte. */
1212 Charcount
1213 bytecount_to_charcount (const Intbyte *ptr, Bytecount len)
1214 {
1215 Charcount count = 0;
1216 const Intbyte *end = ptr + len;
1217
/* Choose the word type used to test several bytes at once.
   HIGH_BIT_MASK has the top bit of every byte set, so
   (word & HIGH_BIT_MASK) is zero only when no byte in the word has its
   high bit set. */
1218 #if SIZEOF_LONG == 8
1219 # define STRIDE_TYPE long
1220 # define HIGH_BIT_MASK 0x8080808080808080UL
1221 #elif SIZEOF_LONG_LONG == 8 && !(defined (i386) || defined (__i386__))
1222 # define STRIDE_TYPE long long
1223 # define HIGH_BIT_MASK 0x8080808080808080ULL
1224 #elif SIZEOF_LONG == 4
1225 # define STRIDE_TYPE long
1226 # define HIGH_BIT_MASK 0x80808080UL
1227 #else
1228 # error Add support for 128-bit systems here
1229 #endif
1230
/* ALIGNED (ptr) is true when PTR's address is a multiple of
   ALIGNOF (STRIDE_TYPE); STRIDE is the number of bytes in one word.
   None of these macros is #undef'ed within this function. */
1231 #define ALIGN_BITS ((EMACS_UINT) (ALIGNOF (STRIDE_TYPE) - 1))
1232 #define ALIGN_MASK (~ ALIGN_BITS)
1233 #define ALIGNED(ptr) ((((EMACS_UINT) ptr) & ALIGN_BITS) == 0)
1234 #define STRIDE sizeof (STRIDE_TYPE)
1235
1236 while (ptr < end)
1237 {
1238 if (BYTE_ASCII_P (*ptr))
1239 {
1240 /* optimize for long stretches of ASCII */
/* Unaligned: consume single ASCII bytes until PTR is aligned. */
1241 if (! ALIGNED (ptr))
1242 ptr++, count++;
1243 else
1244 {
1245 const unsigned STRIDE_TYPE *ascii_end =
1246 (const unsigned STRIDE_TYPE *) ptr;
1247 /* This loop screams, because we can detect ASCII
1248 characters 4 or 8 at a time. */
/* Only whole words that lie entirely before END are examined. */
1249 while ((const Intbyte *) ascii_end + STRIDE <= end
1250 && !(*ascii_end & HIGH_BIT_MASK))
1251 ascii_end++;
/* No whole word qualified (fewer than STRIDE bytes remain, or the
   first word contains a byte with its high bit set): count just the
   current byte, which BYTE_ASCII_P already accepted above. */
1252 if ((Intbyte *) ascii_end == ptr)
1253 ptr++, count++;
1254 else
1255 {
/* Every byte skipped was ASCII, so bytes skipped == characters. */
1256 count += (Intbyte *) ascii_end - ptr;
1257 ptr = (Intbyte *) ascii_end;
1258 }
1259 }
1260 }
1261 else
1262 {
1263 /* optimize for successive characters from the same charset */
/* The character length is looked up once from the leading byte and
   reused for as long as the following characters start with the same
   byte. */
1264 Intbyte leading_byte = *ptr;
1265 int bytes = REP_BYTES_BY_FIRST_BYTE (leading_byte);
1266 while ((ptr < end) && (*ptr == leading_byte))
1267 ptr += bytes, count++;
1268 }
1269 }
1270
1271 /* Bomb out if the specified substring ends in the middle
1272 of a character. Note that we might have already gotten
1273 a core dump above from an invalid reference, but at least
1274 we will get no farther than here.
1275
1276 This also catches len < 0. */
1277 charbpos_checking_assert (ptr == end);
1278
1279 return count;
1280 }
1281
1282 /* Given a pointer to a text string and a length in characters, return
1283 the equivalent length in bytes. */
1284
1285 Bytecount
1286 charcount_to_bytecount (const Intbyte *ptr, Charcount len)
1287 {
1288 const Intbyte *newptr = ptr;
1289
1290 charbpos_checking_assert (len >= 0);
1291 while (len > 0)
1292 {
1293 INC_CHARPTR (newptr);
1294 len--;
1295 }
1296 return newptr - ptr;
1297 }
1298
1299 inline static void
1300 update_entirely_ascii_p_flag (struct buffer *buf)
1301 {
1302 buf->text->entirely_ascii_p =
1303 (buf->text->mule_bufmin == 1 &&
1304 buf->text->mule_bufmax == buf->text->bufz &&
1305 !buf->text->mule_shifter &&
1306 !buf->text->mule_three_p);
1307 }
1308
1309 /* The next two functions are the actual meat behind the
1310 charbpos-to-bytebpos and bytebpos-to-charbpos conversions. Currently
1311 the method they use is fairly unsophisticated; see buffer.h.
1312
1313 Note that charbpos_to_bytebpos_func() is probably the most-called
1314 function in all of XEmacs. Therefore, it must be FAST FAST FAST.
1315 This is the reason why so much of the code is duplicated.
1316
1317 Similar considerations apply to bytebpos_to_charbpos_func(), although
1318 less so because the function is not called so often.
1319
1320 #### At some point this should use a more sophisticated method;
1321 see buffer.h. */
1322
/* Counter used to choose which of the 16 position-cache slots to
   overwrite: it is advanced by 621 on each cache insertion and its low
   four bits select the slot.  Both conversion functions below update
   it. */
1323 static int not_very_random_number;
1324
/* Convert character position X in BUF to the corresponding byte
   position.

   Outline: return immediately if X is PT, ZV or BEGV; otherwise pick a
   starting point (the current known region, PT, ZV/BEGV, or an entry of
   the 16-slot position cache), walk one character at a time from there
   to X, and store the resulting known region back into buf->text.  When
   X was more than 50 characters from the chosen starting region, the
   (X, result) pair is also written into the position cache. */
1325 Bytebpos
1326 charbpos_to_bytebpos_func (struct buffer *buf, Charbpos x)
1327 {
1328 Charbpos bufmin;
1329 Charbpos bufmax;
1330 Bytebpos bytmin;
1331 Bytebpos bytmax;
1332 int size;
1333 int forward_p;
1334 Bytebpos retval;
1335 int diff_so_far;
1336 int add_to_cache = 0;
1337
1338 /* Check for some cached positions, for speed. */
1339 if (x == BUF_PT (buf))
1340 return BI_BUF_PT (buf);
1341 if (x == BUF_ZV (buf))
1342 return BI_BUF_ZV (buf);
1343 if (x == BUF_BEGV (buf))
1344 return BI_BUF_BEGV (buf);
1345
1346 bufmin = buf->text->mule_bufmin;
1347 bufmax = buf->text->mule_bufmax;
1348 bytmin = buf->text->mule_bytmin;
1349 bytmax = buf->text->mule_bytmax;
/* Bytes per character in the known region: 1 << mule_shifter, plus one
   when mule_three_p is set (shifter 1 + three_p gives 3). */
1350 size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p;
1351
1352 /* The basic idea here is that we shift the "known region" up or down
1353 until it overlaps the specified position. We do this by moving
1354 the upper bound of the known region up one character at a time,
1355 and moving the lower bound of the known region up as necessary
1356 when the size of the character just seen changes.
1357
1358 We optimize this, however, by first shifting the known region to
1359 one of the cached points if it's close by. (We don't check BEG or
1360 Z, even though they're cached; most of the time these will be the
1361 same as BEGV and ZV, and when they're not, they're not likely
1362 to be used.) */
1363
1364 if (x > bufmax)
1365 {
1366 Charbpos diffmax = x - bufmax;
1367 Charbpos diffpt = x - BUF_PT (buf);
1368 Charbpos diffzv = BUF_ZV (buf) - x;
1369 /* #### This value could stand some more exploration. */
1370 Charcount heuristic_hack = (bufmax - bufmin) >> 2;
1371
1372 /* Check if the position is closer to PT or ZV than to the
1373 end of the known region. */
1374
1375 if (diffpt < 0)
1376 diffpt = -diffpt;
1377 if (diffzv < 0)
1378 diffzv = -diffzv;
1379
1380 /* But also implement a heuristic that favors the known region
1381 over PT or ZV. The reason for this is that switching to
1382 PT or ZV will wipe out the knowledge in the known region,
1383 which might be annoying if the known region is large and
1384 PT or ZV is not that much closer than the end of the known
1385 region. */
1386
1387 diffzv += heuristic_hack;
1388 diffpt += heuristic_hack;
1389 if (diffpt < diffmax && diffpt <= diffzv)
1390 {
1391 bufmax = bufmin = BUF_PT (buf);
1392 bytmax = bytmin = BI_BUF_PT (buf);
1393 /* We set the size to 1 even though it doesn't really
1394 matter because the new known region contains no
1395 characters. We do this because this is the most
1396 likely size of the characters around the new known
1397 region, and we avoid potential yuckiness that is
1398 done when size == 3. */
1399 size = 1;
1400 }
/* NOTE(review): this test is not an `else if'.  When both tests pass,
   ZV replaces PT as the starting point even though the first test
   established diffpt <= diffzv.  The walk below still reaches X from
   either starting point, so this looks like a cost issue only --
   confirm whether `else if' was intended. */
1401 if (diffzv < diffmax)
1402 {
1403 bufmax = bufmin = BUF_ZV (buf);
1404 bytmax = bytmin = BI_BUF_ZV (buf);
1405 size = 1;
1406 }
1407 }
1408 #ifdef ERROR_CHECK_CHARBPOS
1409 else if (x >= bufmin)
1410 abort ();
1411 #endif
1412 else
1413 {
1414 Charbpos diffmin = bufmin - x;
1415 Charbpos diffpt = BUF_PT (buf) - x;
1416 Charbpos diffbegv = x - BUF_BEGV (buf);
1417 /* #### This value could stand some more exploration. */
1418 Charcount heuristic_hack = (bufmax - bufmin) >> 2;
1419
1420 if (diffpt < 0)
1421 diffpt = -diffpt;
1422 if (diffbegv < 0)
1423 diffbegv = -diffbegv;
1424
1425 /* But also implement a heuristic that favors the known region --
1426 see above. */
1427
1428 diffbegv += heuristic_hack;
1429 diffpt += heuristic_hack;
1430
1431 if (diffpt < diffmin && diffpt <= diffbegv)
1432 {
1433 bufmax = bufmin = BUF_PT (buf);
1434 bytmax = bytmin = BI_BUF_PT (buf);
1435 /* We set the size to 1 even though it doesn't really
1436 matter because the new known region contains no
1437 characters. We do this because this is the most
1438 likely size of the characters around the new known
1439 region, and we avoid potential yuckiness that is
1440 done when size == 3. */
1441 size = 1;
1442 }
/* NOTE(review): as in the forward case, this is not an `else if', so
   BEGV can replace a PT that was just selected. */
1443 if (diffbegv < diffmin)
1444 {
1445 bufmax = bufmin = BUF_BEGV (buf);
1446 bytmax = bytmin = BI_BUF_BEGV (buf);
1447 size = 1;
1448 }
1449 }
1450
/* Distance in characters from X to the nearer edge of the region chosen
   so far. */
1451 diff_so_far = x > bufmax ? x - bufmax : bufmin - x;
1452 if (diff_so_far > 50)
1453 {
1454 /* If we have to move more than a certain amount, then look
1455 into our cache. */
1456 int minval = INT_MAX;
1457 int found = 0;
1458 int i;
1459
1460 add_to_cache = 1;
1461 /* I considered keeping the positions ordered. This would speed
1462 up this loop, but updating the cache would take longer, so
1463 it doesn't seem like it would really matter. */
1464 for (i = 0; i < 16; i++)
1465 {
1466 int diff = buf->text->mule_charbpos_cache[i] - x;
1467
1468 if (diff < 0)
1469 diff = -diff;
1470 if (diff < minval)
1471 {
1472 minval = diff;
1473 found = i;
1474 }
1475 }
1476
/* Restart from the nearest cache entry only if it is strictly closer
   than the region chosen above. */
1477 if (minval < diff_so_far)
1478 {
1479 bufmax = bufmin = buf->text->mule_charbpos_cache[found];
1480 bytmax = bytmin = buf->text->mule_bytebpos_cache[found];
1481 size = 1;
1482 }
1483 }
1484
1485 /* It's conceivable that the caching above could lead to X being
1486 the same as one of the range edges. */
1487 if (x >= bufmax)
1488 {
1489 Bytebpos newmax;
1490 Bytecount newsize;
1491
1492 forward_p = 1;
/* Walk forward one character at a time.  Whenever the byte length of
   the character just crossed differs from SIZE, the known region is
   restarted at that character. */
1493 while (x > bufmax)
1494 {
1495 newmax = bytmax;
1496
1497 INC_BYTEBPOS (buf, newmax);
1498 newsize = newmax - bytmax;
1499 if (newsize != size)
1500 {
1501 bufmin = bufmax;
1502 bytmin = bytmax;
1503 size = newsize;
1504 }
1505 bytmax = newmax;
1506 bufmax++;
1507 }
1508 retval = bytmax;
1509
1510 /* #### Should go past the found location to reduce the number
1511 of times that this function is called */
1512 }
1513 else /* x < bufmin */
1514 {
1515 Bytebpos newmin;
1516 Bytecount newsize;
1517
1518 forward_p = 0;
/* Mirror image of the forward walk: step backward, restarting the known
   region whenever the character length changes. */
1519 while (x < bufmin)
1520 {
1521 newmin = bytmin;
1522
1523 DEC_BYTEBPOS (buf, newmin);
1524 newsize = bytmin - newmin;
1525 if (newsize != size)
1526 {
1527 bufmax = bufmin;
1528 bytmax = bytmin;
1529 size = newsize;
1530 }
1531 bytmin = newmin;
1532 bufmin--;
1533 }
1534 retval = bytmin;
1535
1536 /* #### Should go past the found location to reduce the number
1537 of times that this function is called
1538 */
1539 }
1540
1541 /* If size is three, then we have to make sure that the range we
1542 discovered isn't too large, because we use a fixed-length
1543 table to divide by 3. */
1544
1545 if (size == 3)
1546 {
1547 int gap = bytmax - bytmin;
1548 buf->text->mule_three_p = 1;
1549 buf->text->mule_shifter = 1;
1550
1551 if (gap > MAX_BYTEBPOS_GAP_SIZE_3)
1552 {
/* Trim the region on the side away from the direction of travel, so the
   edge nearest X is kept. */
1553 if (forward_p)
1554 {
1555 bytmin = bytmax - MAX_BYTEBPOS_GAP_SIZE_3;
1556 bufmin = bufmax - MAX_CHARBPOS_GAP_SIZE_3;
1557 }
1558 else
1559 {
1560 bytmax = bytmin + MAX_BYTEBPOS_GAP_SIZE_3;
1561 bufmax = bufmin + MAX_CHARBPOS_GAP_SIZE_3;
1562 }
1563 }
1564 }
1565 else
1566 {
/* Sizes 1, 2 and 4 are stored as shifter values 0, 1 and 2. */
1567 buf->text->mule_three_p = 0;
1568 if (size == 4)
1569 buf->text->mule_shifter = 2;
1570 else
1571 buf->text->mule_shifter = size - 1;
1572 }
1573
1574 buf->text->mule_bufmin = bufmin;
1575 buf->text->mule_bufmax = bufmax;
1576 buf->text->mule_bytmin = bytmin;
1577 buf->text->mule_bytmax = bytmax;
1578 update_entirely_ascii_p_flag (buf);
1579
1580 if (add_to_cache)
1581 {
1582 int replace_loc;
1583
1584 /* We throw away a "random" cached value and replace it with
1585 the new value. It doesn't actually have to be very random
1586 at all, just evenly distributed.
1587
1588 #### It would be better to use a least-recently-used algorithm
1589 or something that tries to space things out, but I'm not sure
1590 it's worth it to go to the trouble of maintaining that. */
1591 not_very_random_number += 621;
1592 replace_loc = not_very_random_number & 15;
1593 buf->text->mule_charbpos_cache[replace_loc] = x;
1594 buf->text->mule_bytebpos_cache[replace_loc] = retval;
1595 }
1596
1597 return retval;
1598 }
1599
1600 /* The logic in this function is almost identical to the logic in
1601 the previous function. */
1602
/* Convert byte position X in BUF to the corresponding character
   position.

   Outline: return immediately if X is the byte position of PT, ZV or
   BEGV; otherwise pick a starting point (the current known region, PT,
   ZV/BEGV, or an entry of the 16-slot position cache), walk one
   character at a time from there to X, and store the resulting known
   region back into buf->text.  Distances here are measured in bytes.
   When X was more than 50 bytes from the chosen starting region, the
   (result, X) pair is also written into the position cache. */
1603 Charbpos
1604 bytebpos_to_charbpos_func (struct buffer *buf, Bytebpos x)
1605 {
1606 Charbpos bufmin;
1607 Charbpos bufmax;
1608 Bytebpos bytmin;
1609 Bytebpos bytmax;
1610 int size;
1611 int forward_p;
1612 Charbpos retval;
1613 int diff_so_far;
1614 int add_to_cache = 0;
1615
1616 /* Check for some cached positions, for speed. */
1617 if (x == BI_BUF_PT (buf))
1618 return BUF_PT (buf);
1619 if (x == BI_BUF_ZV (buf))
1620 return BUF_ZV (buf);
1621 if (x == BI_BUF_BEGV (buf))
1622 return BUF_BEGV (buf);
1623
1624 bufmin = buf->text->mule_bufmin;
1625 bufmax = buf->text->mule_bufmax;
1626 bytmin = buf->text->mule_bytmin;
1627 bytmax = buf->text->mule_bytmax;
/* Bytes per character in the known region: 1 << mule_shifter, plus one
   when mule_three_p is set (shifter 1 + three_p gives 3). */
1628 size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p;
1629
1630 /* The basic idea here is that we shift the "known region" up or down
1631 until it overlaps the specified position. We do this by moving
1632 the upper bound of the known region up one character at a time,
1633 and moving the lower bound of the known region up as necessary
1634 when the size of the character just seen changes.
1635
1636 We optimize this, however, by first shifting the known region to
1637 one of the cached points if it's close by. (We don't check BI_BEG or
1638 BI_Z, even though they're cached; most of the time these will be the
1639 same as BI_BEGV and BI_ZV, and when they're not, they're not likely
1640 to be used.) */
1641
1642 if (x > bytmax)
1643 {
1644 Bytebpos diffmax = x - bytmax;
1645 Bytebpos diffpt = x - BI_BUF_PT (buf);
1646 Bytebpos diffzv = BI_BUF_ZV (buf) - x;
1647 /* #### This value could stand some more exploration. */
1648 Bytecount heuristic_hack = (bytmax - bytmin) >> 2;
1649
1650 /* Check if the position is closer to PT or ZV than to the
1651 end of the known region. */
1652
1653 if (diffpt < 0)
1654 diffpt = -diffpt;
1655 if (diffzv < 0)
1656 diffzv = -diffzv;
1657
1658 /* But also implement a heuristic that favors the known region
1659 over BI_PT or BI_ZV. The reason for this is that switching to
1660 BI_PT or BI_ZV will wipe out the knowledge in the known region,
1661 which might be annoying if the known region is large and
1662 BI_PT or BI_ZV is not that much closer than the end of the known
1663 region. */
1664
1665 diffzv += heuristic_hack;
1666 diffpt += heuristic_hack;
1667 if (diffpt < diffmax && diffpt <= diffzv)
1668 {
1669 bufmax = bufmin = BUF_PT (buf);
1670 bytmax = bytmin = BI_BUF_PT (buf);
1671 /* We set the size to 1 even though it doesn't really
1672 matter because the new known region contains no
1673 characters. We do this because this is the most
1674 likely size of the characters around the new known
1675 region, and we avoid potential yuckiness that is
1676 done when size == 3. */
1677 size = 1;
1678 }
/* NOTE(review): this test is not an `else if'.  When both tests pass,
   ZV replaces PT as the starting point even though the first test
   established diffpt <= diffzv.  The walk below still reaches X from
   either starting point, so this looks like a cost issue only --
   confirm whether `else if' was intended. */
1679 if (diffzv < diffmax)
1680 {
1681 bufmax = bufmin = BUF_ZV (buf);
1682 bytmax = bytmin = BI_BUF_ZV (buf);
1683 size = 1;
1684 }
1685 }
1686 #ifdef ERROR_CHECK_CHARBPOS
1687 else if (x >= bytmin)
1688 abort ();
1689 #endif
1690 else
1691 {
1692 Bytebpos diffmin = bytmin - x;
1693 Bytebpos diffpt = BI_BUF_PT (buf) - x;
1694 Bytebpos diffbegv = x - BI_BUF_BEGV (buf);
1695 /* #### This value could stand some more exploration. */
1696 Bytecount heuristic_hack = (bytmax - bytmin) >> 2;
1697
1698 if (diffpt < 0)
1699 diffpt = -diffpt;
1700 if (diffbegv < 0)
1701 diffbegv = -diffbegv;
1702
1703 /* But also implement a heuristic that favors the known region --
1704 see above. */
1705
1706 diffbegv += heuristic_hack;
1707 diffpt += heuristic_hack;
1708
1709 if (diffpt < diffmin && diffpt <= diffbegv)
1710 {
1711 bufmax = bufmin = BUF_PT (buf);
1712 bytmax = bytmin = BI_BUF_PT (buf);
1713 /* We set the size to 1 even though it doesn't really
1714 matter because the new known region contains no
1715 characters. We do this because this is the most
1716 likely size of the characters around the new known
1717 region, and we avoid potential yuckiness that is
1718 done when size == 3. */
1719 size = 1;
1720 }
/* NOTE(review): as in the forward case, this is not an `else if', so
   BEGV can replace a PT that was just selected. */
1721 if (diffbegv < diffmin)
1722 {
1723 bufmax = bufmin = BUF_BEGV (buf);
1724 bytmax = bytmin = BI_BUF_BEGV (buf);
1725 size = 1;
1726 }
1727 }
1728
/* Distance in bytes from X to the nearer edge of the region chosen so
   far. */
1729 diff_so_far = x > bytmax ? x - bytmax : bytmin - x;
1730 if (diff_so_far > 50)
1731 {
1732 /* If we have to move more than a certain amount, then look
1733 into our cache. */
1734 int minval = INT_MAX;
1735 int found = 0;
1736 int i;
1737
1738 add_to_cache = 1;
1739 /* I considered keeping the positions ordered. This would speed
1740 up this loop, but updating the cache would take longer, so
1741 it doesn't seem like it would really matter. */
1742 for (i = 0; i < 16; i++)
1743 {
1744 int diff = buf->text->mule_bytebpos_cache[i] - x;
1745
1746 if (diff < 0)
1747 diff = -diff;
1748 if (diff < minval)
1749 {
1750 minval = diff;
1751 found = i;
1752 }
1753 }
1754
/* Restart from the nearest cache entry only if it is strictly closer
   than the region chosen above. */
1755 if (minval < diff_so_far)
1756 {
1757 bufmax = bufmin = buf->text->mule_charbpos_cache[found];
1758 bytmax = bytmin = buf->text->mule_bytebpos_cache[found];
1759 size = 1;
1760 }
1761 }
1762
1763 /* It's conceivable that the caching above could lead to X being
1764 the same as one of the range edges. */
1765 if (x >= bytmax)
1766 {
1767 Bytebpos newmax;
1768 Bytecount newsize;
1769
1770 forward_p = 1;
/* Walk forward one character at a time.  Whenever the byte length of
   the character just crossed differs from SIZE, the known region is
   restarted at that character. */
1771 while (x > bytmax)
1772 {
1773 newmax = bytmax;
1774
1775 INC_BYTEBPOS (buf, newmax);
1776 newsize = newmax - bytmax;
1777 if (newsize != size)
1778 {
1779 bufmin = bufmax;
1780 bytmin = bytmax;
1781 size = newsize;
1782 }
1783 bytmax = newmax;
1784 bufmax++;
1785 }
1786 retval = bufmax;
1787
1788 /* #### Should go past the found location to reduce the number
1789 of times that this function is called */
1790 }
1791 else /* x <= bytmin */
1792 {
1793 Bytebpos newmin;
1794 Bytecount newsize;
1795
1796 forward_p = 0;
/* Mirror image of the forward walk: step backward, restarting the known
   region whenever the character length changes. */
1797 while (x < bytmin)
1798 {
1799 newmin = bytmin;
1800
1801 DEC_BYTEBPOS (buf, newmin);
1802 newsize = bytmin - newmin;
1803 if (newsize != size)
1804 {
1805 bufmax = bufmin;
1806 bytmax = bytmin;
1807 size = newsize;
1808 }
1809 bytmin = newmin;
1810 bufmin--;
1811 }
1812 retval = bufmin;
1813
1814 /* #### Should go past the found location to reduce the number
1815 of times that this function is called
1816 */
1817 }
1818
1819 /* If size is three, then we have to make sure that the range we
1820 discovered isn't too large, because we use a fixed-length
1821 table to divide by 3. */
1822
1823 if (size == 3)
1824 {
1825 int gap = bytmax - bytmin;
1826 buf->text->mule_three_p = 1;
1827 buf->text->mule_shifter = 1;
1828
1829 if (gap > MAX_BYTEBPOS_GAP_SIZE_3)
1830 {
/* Trim the region on the side away from the direction of travel, so the
   edge nearest X is kept. */
1831 if (forward_p)
1832 {
1833 bytmin = bytmax - MAX_BYTEBPOS_GAP_SIZE_3;
1834 bufmin = bufmax - MAX_CHARBPOS_GAP_SIZE_3;
1835 }
1836 else
1837 {
1838 bytmax = bytmin + MAX_BYTEBPOS_GAP_SIZE_3;
1839 bufmax = bufmin + MAX_CHARBPOS_GAP_SIZE_3;
1840 }
1841 }
1842 }
1843 else
1844 {
/* Sizes 1, 2 and 4 are stored as shifter values 0, 1 and 2. */
1845 buf->text->mule_three_p = 0;
1846 if (size == 4)
1847 buf->text->mule_shifter = 2;
1848 else
1849 buf->text->mule_shifter = size - 1;
1850 }
1851
1852 buf->text->mule_bufmin = bufmin;
1853 buf->text->mule_bufmax = bufmax;
1854 buf->text->mule_bytmin = bytmin;
1855 buf->text->mule_bytmax = bytmax;
1856 update_entirely_ascii_p_flag (buf);
1857
1858 if (add_to_cache)
1859 {
1860 int replace_loc;
1861
1862 /* We throw away a "random" cached value and replace it with
1863 the new value. It doesn't actually have to be very random
1864 at all, just evenly distributed.
1865
1866 #### It would be better to use a least-recently-used algorithm
1867 or something that tries to space things out, but I'm not sure
1868 it's worth it to go to the trouble of maintaining that. */
1869 not_very_random_number += 621;
1870 replace_loc = not_very_random_number & 15;
1871 buf->text->mule_charbpos_cache[replace_loc] = retval;
1872 buf->text->mule_bytebpos_cache[replace_loc] = x;
1873 }
1874
1875 return retval;
1876 }
1877
1878 /* Text of length BYTELENGTH and CHARLENGTH (in different units)
1879 was inserted at charbpos START. */
1880
/* Update BUF's position cache and known region after an insertion.
   Cached positions after START are shifted; the known region is shifted,
   extended or shortened depending on where START falls relative to it;
   finally the entirely_ascii_p flag is recomputed. */
1881 void
1882 buffer_mule_signal_inserted_region (struct buffer *buf, Charbpos start,
1883 Bytecount bytelength,
1884 Charcount charlength)
1885 {
/* Bytes per character in the known region: 1 << mule_shifter, plus one
   when mule_three_p is set. */
1886 int size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p;
1887 int i;
1888
1889 /* Adjust the cache of known positions. */
1890 for (i = 0; i < 16; i++)
1891 {
1892
/* Only entries strictly after START move; an entry exactly at START
   stays where it is. */
1893 if (buf->text->mule_charbpos_cache[i] > start)
1894 {
1895 buf->text->mule_charbpos_cache[i] += charlength;
1896 buf->text->mule_bytebpos_cache[i] += bytelength;
1897 }
1898 }
1899
/* Insertion at or after the end of the known region leaves the region
   untouched. */
1900 if (start >= buf->text->mule_bufmax)
1901 goto done;
1902
1903 /* The insertion is either before the known region, in which case
1904 it shoves it forward; or within the known region, in which case
1905 it shoves the end forward. (But it may make the known region
1906 inconsistent, so we may have to shorten it.) */
1907
1908 if (start <= buf->text->mule_bufmin)
1909 {
1910 buf->text->mule_bufmin += charlength;
1911 buf->text->mule_bufmax += charlength;
1912 buf->text->mule_bytmin += bytelength;
1913 buf->text->mule_bytmax += bytelength;
1914 }
1915 else
1916 {
1917 Charbpos end = start + charlength;
1918 /* the insertion point divides the known region in two.
1919 Keep the longer half, at least, and expand into the
1920 inserted chunk as much as possible. */
1921
1922 if (start - buf->text->mule_bufmin > buf->text->mule_bufmax - start)
1923 {
/* The part before the insertion point is the longer one.  BYTESTART is
   the byte position of START, computed from the region's fixed
   character size. */
1924 Bytebpos bytestart = (buf->text->mule_bytmin
1925 + size * (start - buf->text->mule_bufmin));
1926 Bytebpos bytenew;
1927
/* Advance through the inserted text for as long as each character is
   exactly SIZE bytes. */
1928 while (start < end)
1929 {
1930 bytenew = bytestart;
1931 INC_BYTEBPOS (buf, bytenew);
1932 if (bytenew - bytestart != size)
1933 break;
1934 start++;
1935 bytestart = bytenew;
1936 }
/* Stopped early: the region now ends at the first inserted character
   of a different size.  Otherwise the whole insertion fits and the
   upper end simply moves by the inserted amount. */
1937 if (start != end)
1938 {
1939 buf->text->mule_bufmax = start;
1940 buf->text->mule_bytmax = bytestart;
1941 }
1942 else
1943 {
1944 buf->text->mule_bufmax += charlength;
1945 buf->text->mule_bytmax += bytelength;
1946 }
1947 }
1948 else
1949 {
/* The part after the insertion point is at least as long.  BYTEEND is
   the byte position just after the inserted text. */
1950 Bytebpos byteend = (buf->text->mule_bytmin
1951 + size * (start - buf->text->mule_bufmin)
1952 + bytelength);
1953 Bytebpos bytenew;
1954
1955 buf->text->mule_bufmax += charlength;
1956 buf->text->mule_bytmax += bytelength;
1957
/* Walk backward through the inserted text for as long as each
   character is exactly SIZE bytes. */
1958 while (end > start)
1959 {
1960 bytenew = byteend;
1961 DEC_BYTEBPOS (buf, bytenew);
1962 if (byteend - bytenew != size)
1963 break;
1964 end--;
1965 byteend = bytenew;
1966 }
/* Stopped early: the region now begins just after the offending
   character.  If the whole insertion fits, the lower end is left
   unchanged. */
1967 if (start != end)
1968 {
1969 buf->text->mule_bufmin = end;
1970 buf->text->mule_bytmin = byteend;
1971 }
1972 }
1973 }
1974 done:
1975 update_entirely_ascii_p_flag (buf);
1976 }
1977
1978 /* Text from START to END (equivalent in Bytebposs: from BI_START to
1979 BI_END) was deleted. */
1980
/* Update BUF's position cache and known region after a deletion.
   START/END are character positions and BI_START/BI_END the matching
   byte positions of the deleted text.  The entirely_ascii_p flag is
   recomputed on every path. */
1981 void
1982 buffer_mule_signal_deleted_region (struct buffer *buf, Charbpos start,
1983 Charbpos end, Bytebpos bi_start,
1984 Bytebpos bi_end)
1985 {
1986 int i;
1987
1988 /* Adjust the cache of known positions. */
1989 for (i = 0; i < 16; i++)
1990 {
1991 /* After the end; gets shoved backward */
1992 if (buf->text->mule_charbpos_cache[i] > end)
1993 {
1994 buf->text->mule_charbpos_cache[i] -= end - start;
1995 buf->text->mule_bytebpos_cache[i] -= bi_end - bi_start;
1996 }
1997 /* In the range; moves to start of range */
1998 else if (buf->text->mule_charbpos_cache[i] > start)
1999 {
2000 buf->text->mule_charbpos_cache[i] = start;
2001 buf->text->mule_bytebpos_cache[i] = bi_start;
2002 }
2003 }
2004
2005 /* We don't care about any text after the end of the known region. */
2006
/* From here on END and BI_END are clamped copies; the cache adjustment
   above already used the unclamped values. */
2007 end = min (end, buf->text->mule_bufmax);
2008 bi_end = min (bi_end, buf->text->mule_bytmax);
/* Nothing was deleted at or before the end of the known region. */
2009 if (start >= end)
2010 goto done;
2011
2012 /* The end of the known region offsets by the total amount of deletion,
2013 since it's all before it. */
2014
2015 buf->text->mule_bufmax -= end - start;
2016 buf->text->mule_bytmax -= bi_end - bi_start;
2017
2018 /* Now we don't care about any text after the start of the known region. */
2019
/* Second clamp: only the part of the deletion that lies before the
   start of the known region moves the region's lower bound. */
2020 end = min (end, buf->text->mule_bufmin);
2021 bi_end = min (bi_end, buf->text->mule_bytmin);
2022 if (start < end)
2023 {
2024 buf->text->mule_bufmin -= end - start;
2025 buf->text->mule_bytmin -= bi_end - bi_start;
2026 }
2027
2028 done:
2029 update_entirely_ascii_p_flag (buf);
2030 }
2031
2032 #endif /* MULE */
2033
2034 #ifdef ERROR_CHECK_CHARBPOS
2035
2036 Bytebpos
2037 charbpos_to_bytebpos (struct buffer *buf, Charbpos x)
2038 {
2039 Bytebpos retval = real_charbpos_to_bytebpos (buf, x);
2040 ASSERT_VALID_BYTEBPOS_UNSAFE (buf, retval);
2041 return retval;
2042 }
2043
2044 Charbpos
2045 bytebpos_to_charbpos (struct buffer *buf, Bytebpos x)
2046 {
2047 ASSERT_VALID_BYTEBPOS_UNSAFE (buf, x);
2048 return real_bytebpos_to_charbpos (buf, x);
2049 }
2050
2051 #endif /* ERROR_CHECK_CHARBPOS */
2052
2053
2054 /************************************************************************/
2055 /* verifying buffer and string positions */
2056 /************************************************************************/
2057
2058 /* Functions below are tagged with either _byte or _char indicating
2059 whether they return byte or character positions. For a buffer,
2060 a character position is a "Charbpos" and a byte position is a "Bytebpos".
2061 For strings, these are sometimes typed using "Charcount" and
2062 "Bytecount". */
2063
2064 /* Flags for the functions below are:
2065
2066 GB_ALLOW_PAST_ACCESSIBLE
2067
2068 Allow positions to range over the entire buffer (BUF_BEG to BUF_Z),
2069 rather than just the accessible portion (BUF_BEGV to BUF_ZV).
2070 For strings, this flag has no effect.
2071
2072 GB_COERCE_RANGE
2073
2074 If the position is outside the allowable range, return the lower
2075 or upper bound of the range, whichever is closer to the specified
2076 position.
2077
2078 GB_NO_ERROR_IF_BAD
2079
2080 If the position is outside the allowable range, return -1.
2081
2082 GB_NEGATIVE_FROM_END
2083
2084 If a value is negative, treat it as an offset from the end.
2085 Only applies to strings.
2086
2087 The following additional flags apply only to the functions
2088 that return ranges:
2089
2090 GB_ALLOW_NIL
2091
2092 Either or both positions can be nil. If FROM is nil,
2093 FROM_OUT will contain the lower bound of the allowed range.
2094 If TO is nil, TO_OUT will contain the upper bound of the
2095 allowed range.
2096
2097 GB_CHECK_ORDER
2098
2099 FROM must contain the lower bound and TO the upper bound
2100 of the range. If the positions are reversed, an error is
2101 signalled.
2102
2103 The following is a combination flag:
2104
2105 GB_HISTORICAL_STRING_BEHAVIOR
2106
2107 Equivalent to (GB_NEGATIVE_FROM_END | GB_ALLOW_NIL).
2108 */
2109
2110 /* Return a buffer position stored in a Lisp_Object. Full
2111 error-checking is done on the position. Flags can be specified to
2112 control the behavior of out-of-range values. The default behavior
2113 is to require that the position is within the accessible part of
2114 the buffer (BEGV and ZV), and to signal an error if the position is
2115 out of range.
2116
2117 */
2118
2119 Charbpos
2120 get_buffer_pos_char (struct buffer *b, Lisp_Object pos, unsigned int flags)
2121 {
2122 /* Does not GC */
2123 Charbpos ind;
2124 Charbpos min_allowed, max_allowed;
2125
2126 CHECK_INT_COERCE_MARKER (pos);
2127 ind = XINT (pos);
2128 min_allowed = flags & GB_ALLOW_PAST_ACCESSIBLE ? BUF_BEG (b) : BUF_BEGV (b);
2129 max_allowed = flags & GB_ALLOW_PAST_ACCESSIBLE ? BUF_Z (b) : BUF_ZV (b);
2130
2131 if (ind < min_allowed || ind > max_allowed)
2132 {
2133 if (flags & GB_COERCE_RANGE)
2134 ind = ind < min_allowed ? min_allowed : max_allowed;
2135 else if (flags & GB_NO_ERROR_IF_BAD)
2136 ind = -1;
2137 else
2138 {
2139 Lisp_Object buffer;
2140 XSETBUFFER (buffer, b);
2141 args_out_of_range (buffer, pos);
2142 }
2143 }
2144
2145 return ind;
2146 }
2147
2148 Bytebpos
2149 get_buffer_pos_byte (struct buffer *b, Lisp_Object pos, unsigned int flags)
2150 {
2151 Charbpos bpos = get_buffer_pos_char (b, pos, flags);
2152 if (bpos < 0) /* could happen with GB_NO_ERROR_IF_BAD */
2153 return -1;
2154 return charbpos_to_bytebpos (b, bpos);
2155 }
2156
2157 /* Return a pair of buffer positions representing a range of text,
2158 taken from a pair of Lisp_Objects. Full error-checking is
2159 done on the positions. Flags can be specified to control the
2160 behavior of out-of-range values. The default behavior is to
2161 allow the range bounds to be specified in either order
2162 (however, FROM_OUT will always be the lower bound of the range
2163 and TO_OUT the upper bound),to require that the positions
2164 are within the accessible part of the buffer (BEGV and ZV),
2165 and to signal an error if the positions are out of range.
2166 */
2167
2168 void
2169 get_buffer_range_char (struct buffer *b, Lisp_Object from, Lisp_Object to,
2170 Charbpos *from_out, Charbpos *to_out, unsigned int flags)
2171 {
2172 /* Does not GC */
2173 Charbpos min_allowed, max_allowed;
2174
2175 min_allowed = (flags & GB_ALLOW_PAST_ACCESSIBLE) ?
2176 BUF_BEG (b) : BUF_BEGV (b);
2177 max_allowed = (flags & GB_ALLOW_PAST_ACCESSIBLE) ?
2178 BUF_Z (b) : BUF_ZV (b);
2179
2180 if (NILP (from) && (flags & GB_ALLOW_NIL))
2181 *from_out = min_allowed;
2182 else
2183 *from_out = get_buffer_pos_char (b, from, flags | GB_NO_ERROR_IF_BAD);
2184
2185 if (NILP (to) && (flags & GB_ALLOW_NIL))
2186 *to_out = max_allowed;
2187 else
2188 *to_out = get_buffer_pos_char (b, to, flags | GB_NO_ERROR_IF_BAD);
2189
2190 if ((*from_out < 0 || *to_out < 0) && !(flags & GB_NO_ERROR_IF_BAD))
2191 {
2192 Lisp_Object buffer;
2193 XSETBUFFER (buffer, b);
2194 args_out_of_range_3 (buffer, from, to);
2195 }
2196
2197 if (*from_out >= 0 && *to_out >= 0 && *from_out > *to_out)
2198 {
2199 if (flags & GB_CHECK_ORDER)
2200 invalid_argument_2 ("start greater than end", from, to);
2201 else
2202 {
2203 Charbpos temp = *from_out;
2204 *from_out = *to_out;
2205 *to_out = temp;
2206 }
2207 }
2208 }
2209
2210 void
2211 get_buffer_range_byte (struct buffer *b, Lisp_Object from, Lisp_Object to,
2212 Bytebpos *from_out, Bytebpos *to_out, unsigned int flags)
2213 {
2214 Charbpos s, e;
2215
2216 get_buffer_range_char (b, from, to, &s, &e, flags);
2217 if (s >= 0)
2218 *from_out = charbpos_to_bytebpos (b, s);
2219 else /* could happen with GB_NO_ERROR_IF_BAD */
2220 *from_out = -1;
2221 if (e >= 0)
2222 *to_out = charbpos_to_bytebpos (b, e);
2223 else
2224 *to_out = -1;
2225 }
2226
2227 static Charcount
2228 get_string_pos_char_1 (Lisp_Object string, Lisp_Object pos, unsigned int flags,
2229 Charcount known_length)
2230 {
2231 Charcount ccpos;
2232 Charcount min_allowed = 0;
2233 Charcount max_allowed = known_length;
2234
2235 /* Computation of KNOWN_LENGTH is potentially expensive so we pass
2236 it in. */
2237 CHECK_INT (pos);
2238 ccpos = XINT (pos);
2239 if (ccpos < 0 && flags & GB_NEGATIVE_FROM_END)
2240 ccpos += max_allowed;
2241
2242 if (ccpos < min_allowed || ccpos > max_allowed)
2243 {
2244 if (flags & GB_COERCE_RANGE)
2245 ccpos = ccpos < min_allowed ? min_allowed : max_allowed;
2246 else if (flags & GB_NO_ERROR_IF_BAD)
2247 ccpos = -1;
2248 else
2249 args_out_of_range (string, pos);
2250 }
2251
2252 return ccpos;
2253 }
2254
2255 Charcount
2256 get_string_pos_char (Lisp_Object string, Lisp_Object pos, unsigned int flags)
2257 {
2258 return get_string_pos_char_1 (string, pos, flags,
2259 XSTRING_CHAR_LENGTH (string));
2260 }
2261
2262 Bytecount
2263 get_string_pos_byte (Lisp_Object string, Lisp_Object pos, unsigned int flags)
2264 {
2265 Charcount ccpos = get_string_pos_char (string, pos, flags);
2266 if (ccpos < 0) /* could happen with GB_NO_ERROR_IF_BAD */
2267 return -1;
2268 return XSTRING_INDEX_CHAR_TO_BYTE (string, ccpos);
2269 }
2270
2271 void
2272 get_string_range_char (Lisp_Object string, Lisp_Object from, Lisp_Object to,
2273 Charcount *from_out, Charcount *to_out,
2274 unsigned int flags)
2275 {
2276 Charcount min_allowed = 0;
2277 Charcount max_allowed = XSTRING_CHAR_LENGTH (string);
2278
2279 if (NILP (from) && (flags & GB_ALLOW_NIL))
2280 *from_out = min_allowed;
2281 else
2282 *from_out = get_string_pos_char_1 (string, from,
2283 flags | GB_NO_ERROR_IF_BAD,
2284 max_allowed);
2285
2286 if (NILP (to) && (flags & GB_ALLOW_NIL))
2287 *to_out = max_allowed;
2288 else
2289 *to_out = get_string_pos_char_1 (string, to,
2290 flags | GB_NO_ERROR_IF_BAD,
2291 max_allowed);
2292
2293 if ((*from_out < 0 || *to_out < 0) && !(flags & GB_NO_ERROR_IF_BAD))
2294 args_out_of_range_3 (string, from, to);
2295
2296 if (*from_out >= 0 && *to_out >= 0 && *from_out > *to_out)
2297 {
2298 if (flags & GB_CHECK_ORDER)
2299 invalid_argument_2 ("start greater than end", from, to);
2300 else
2301 {
2302 Charbpos temp = *from_out;
2303 *from_out = *to_out;
2304 *to_out = temp;
2305 }
2306 }
2307 }
2308
2309 void
2310 get_string_range_byte (Lisp_Object string, Lisp_Object from, Lisp_Object to,
2311 Bytecount *from_out, Bytecount *to_out,
2312 unsigned int flags)
2313 {
2314 Charcount s, e;
2315
2316 get_string_range_char (string, from, to, &s, &e, flags);
2317 if (s >= 0)
2318 *from_out = XSTRING_INDEX_CHAR_TO_BYTE (string, s);
2319 else /* could happen with GB_NO_ERROR_IF_BAD */
2320 *from_out = -1;
2321 if (e >= 0)
2322 *to_out = XSTRING_INDEX_CHAR_TO_BYTE (string, e);
2323 else
2324 *to_out = -1;
2325
2326 }
2327
2328 Charbpos
2329 get_buffer_or_string_pos_char (Lisp_Object object, Lisp_Object pos,
2330 unsigned int flags)
2331 {
2332 return STRINGP (object) ?
2333 get_string_pos_char (object, pos, flags) :
2334 get_buffer_pos_char (XBUFFER (object), pos, flags);
2335 }
2336
2337 Bytebpos
2338 get_buffer_or_string_pos_byte (Lisp_Object object, Lisp_Object pos,
2339 unsigned int flags)
2340 {
2341 return STRINGP (object) ?
2342 get_string_pos_byte (object, pos, flags) :
2343 get_buffer_pos_byte (XBUFFER (object), pos, flags);
2344 }
2345
2346 void
2347 get_buffer_or_string_range_char (Lisp_Object object, Lisp_Object from,
2348 Lisp_Object to, Charbpos *from_out,
2349 Charbpos *to_out, unsigned int flags)
2350 {
2351 if (STRINGP (object))
2352 get_string_range_char (object, from, to, from_out, to_out, flags);
2353 else
2354 get_buffer_range_char (XBUFFER (object), from, to, from_out, to_out, flags);
2355 }
2356
2357 void
2358 get_buffer_or_string_range_byte (Lisp_Object object, Lisp_Object from,
2359 Lisp_Object to, Bytebpos *from_out,
2360 Bytebpos *to_out, unsigned int flags)
2361 {
2362 if (STRINGP (object))
2363 get_string_range_byte (object, from, to, from_out, to_out, flags);
2364 else
2365 get_buffer_range_byte (XBUFFER (object), from, to, from_out, to_out, flags);
2366 }
2367
2368 Charbpos
2369 buffer_or_string_accessible_begin_char (Lisp_Object object)
2370 {
2371 return STRINGP (object) ? 0 : BUF_BEGV (XBUFFER (object));
2372 }
2373
2374 Charbpos
2375 buffer_or_string_accessible_end_char (Lisp_Object object)
2376 {
2377 return STRINGP (object) ?
2378 XSTRING_CHAR_LENGTH (object) : BUF_ZV (XBUFFER (object));
2379 }
2380
2381 Bytebpos
2382 buffer_or_string_accessible_begin_byte (Lisp_Object object)
2383 {
2384 return STRINGP (object) ? 0 : BI_BUF_BEGV (XBUFFER (object));
2385 }
2386
2387 Bytebpos
2388 buffer_or_string_accessible_end_byte (Lisp_Object object)
2389 {
2390 return STRINGP (object) ?
2391 XSTRING_LENGTH (object) : BI_BUF_ZV (XBUFFER (object));
2392 }
2393
2394 Charbpos
2395 buffer_or_string_absolute_begin_char (Lisp_Object object)
2396 {
2397 return STRINGP (object) ? 0 : BUF_BEG (XBUFFER (object));
2398 }
2399
2400 Charbpos
2401 buffer_or_string_absolute_end_char (Lisp_Object object)
2402 {
2403 return STRINGP (object) ?
2404 XSTRING_CHAR_LENGTH (object) : BUF_Z (XBUFFER (object));
2405 }
2406
2407 Bytebpos
2408 buffer_or_string_absolute_begin_byte (Lisp_Object object)
2409 {
2410 return STRINGP (object) ? 0 : BI_BUF_BEG (XBUFFER (object));
2411 }
2412
2413 Bytebpos
2414 buffer_or_string_absolute_end_byte (Lisp_Object object)
2415 {
2416 return STRINGP (object) ?
2417 XSTRING_LENGTH (object) : BI_BUF_Z (XBUFFER (object));
2418 }
2419
2420
2421 /************************************************************************/
2422 /* Implement TO_EXTERNAL_FORMAT, TO_INTERNAL_FORMAT */
2423 /************************************************************************/
2424
2425 typedef struct
2426 {
2427 Dynarr_declare (Intbyte_dynarr *);
2428 } Intbyte_dynarr_dynarr;
2429
2430 typedef struct
2431 {
2432 Dynarr_declare (Extbyte_dynarr *);
2433 } Extbyte_dynarr_dynarr;
2434
2435 static Extbyte_dynarr_dynarr *conversion_out_dynarr_list;
2436 static Intbyte_dynarr_dynarr *conversion_in_dynarr_list;
2437
2438 static int dfc_convert_to_external_format_in_use;
2439 static int dfc_convert_to_internal_format_in_use;
2440
2441 static Lisp_Object
2442 dfc_convert_to_external_format_reset_in_use (Lisp_Object value)
2443 {
2444 dfc_convert_to_external_format_in_use = XINT (value);
2445 return Qnil;
2446 }
2447
2448 static Lisp_Object
2449 dfc_convert_to_internal_format_reset_in_use (Lisp_Object value)
2450 {
2451 dfc_convert_to_internal_format_in_use = XINT (value);
2452 return Qnil;
2453 }
2454
2455 void
2456 dfc_convert_to_external_format (dfc_conversion_type source_type,
2457 dfc_conversion_data *source,
2458 Lisp_Object coding_system,
2459 dfc_conversion_type sink_type,
2460 dfc_conversion_data *sink)
2461 {
2462 /* It's guaranteed that many callers are not prepared for GC here,
2463 esp. given that this code conversion occurs in many very hidden
2464 places. */
2465 int count = begin_gc_forbidden ();
2466 Extbyte_dynarr *conversion_out_dynarr;
2467
2468 type_checking_assert
2469 (((source_type == DFC_TYPE_DATA) ||
2470 (source_type == DFC_TYPE_LISP_LSTREAM && LSTREAMP (source->lisp_object)) ||
2471 (source_type == DFC_TYPE_LISP_STRING && STRINGP (source->lisp_object)))
2472 &&
2473 ((sink_type == DFC_TYPE_DATA) ||
2474 (sink_type == DFC_TYPE_LISP_LSTREAM && LSTREAMP (source->lisp_object))));
2475
2476 record_unwind_protect (dfc_convert_to_external_format_reset_in_use,
2477 make_int (dfc_convert_to_external_format_in_use));
2478 if (Dynarr_length (conversion_out_dynarr_list) <=
2479 dfc_convert_to_external_format_in_use)
2480 Dynarr_add (conversion_out_dynarr_list, Dynarr_new (Extbyte));
2481 conversion_out_dynarr = Dynarr_at (conversion_out_dynarr_list,
2482 dfc_convert_to_external_format_in_use);
2483 dfc_convert_to_external_format_in_use++;
2484 Dynarr_reset (conversion_out_dynarr);
2485
2486 coding_system = get_coding_system_for_text_file (coding_system, 0);
2487
2488 /* Here we optimize in the case where the coding system does no
2489 conversion. However, we don't want to optimize in case the source
2490 or sink is an lstream, since writing to an lstream can cause a
2491 garbage collection, and this could be problematic if the source
2492 is a lisp string. */
2493 if (source_type != DFC_TYPE_LISP_LSTREAM &&
2494 sink_type != DFC_TYPE_LISP_LSTREAM &&
2495 coding_system_is_binary (coding_system))
2496 {
2497 const Intbyte *ptr;
2498 Bytecount len;
2499
2500 if (source_type == DFC_TYPE_LISP_STRING)
2501 {
2502 ptr = XSTRING_DATA (source->lisp_object);
2503 len = XSTRING_LENGTH (source->lisp_object);
2504 }
2505 else
2506 {
2507 ptr = (Intbyte *) source->data.ptr;
2508 len = source->data.len;
2509 }
2510
2511 #ifdef MULE
2512 {
2513 const Intbyte *end;
2514 for (end = ptr + len; ptr < end;)
2515 {
2516 Intbyte c =
2517 (BYTE_ASCII_P (*ptr)) ? *ptr :
2518 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
2519 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
2520 '~';
2521
2522 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
2523 INC_CHARPTR (ptr);
2524 }
2525 charbpos_checking_assert (ptr == end);
2526 }
2527 #else
2528 Dynarr_add_many (conversion_out_dynarr, ptr, len);
2529 #endif
2530
2531 }
2532 #ifdef HAVE_WIN32_CODING_SYSTEMS
2533 /* Optimize the common case involving Unicode where only ASCII is involved */
2534 else if (source_type != DFC_TYPE_LISP_LSTREAM &&
2535 sink_type != DFC_TYPE_LISP_LSTREAM &&
2536 dfc_coding_system_is_unicode (coding_system))
2537 {
2538 const Intbyte *ptr, *p;
2539 Bytecount len;
2540 const Intbyte *end;
2541
2542 if (source_type == DFC_TYPE_LISP_STRING)
2543 {
2544 ptr = XSTRING_DATA (source->lisp_object);
2545 len = XSTRING_LENGTH (source->lisp_object);
2546 }
2547 else
2548 {
2549 ptr = (Intbyte *) source->data.ptr;
2550 len = source->data.len;
2551 }
2552 end = ptr + len;
2553
2554 for (p = ptr; p < end; p++)
2555 {
2556 if (!BYTE_ASCII_P (*p))
2557 goto the_hard_way;
2558 }
2559
2560 for (p = ptr; p < end; p++)
2561 {
2562 Dynarr_add (conversion_out_dynarr, (Extbyte) (*p));
2563 Dynarr_add (conversion_out_dynarr, (Extbyte) '\0');
2564 }
2565 }
2566 #endif /* HAVE_WIN32_CODING_SYSTEMS */
2567 else
2568 {
2569 Lisp_Object streams_to_delete[3];
2570 int delete_count;
2571 Lisp_Object instream, outstream;
2572 Lstream *reader, *writer;
2573 struct gcpro gcpro1, gcpro2;
2574
2575 #ifdef HAVE_WIN32_CODING_SYSTEMS
2576 the_hard_way:
2577 #endif /* HAVE_WIN32_CODING_SYSTEMS */
2578 delete_count = 0;
2579 if (source_type == DFC_TYPE_LISP_LSTREAM)
2580 instream = source->lisp_object;
2581 else if (source_type == DFC_TYPE_DATA)
2582 streams_to_delete[delete_count++] = instream =
2583 make_fixed_buffer_input_stream (source->data.ptr, source->data.len);
2584 else
2585 {
2586 type_checking_assert (source_type == DFC_TYPE_LISP_STRING);
2587 streams_to_delete[delete_count++] = instream =
2588 /* This will GCPRO the Lisp string */
2589 make_lisp_string_input_stream (source->lisp_object, 0, -1);
2590 }
2591
2592 if (sink_type == DFC_TYPE_LISP_LSTREAM)
2593 outstream = sink->lisp_object;
2594 else
2595 {
2596 type_checking_assert (sink_type == DFC_TYPE_DATA);
2597 streams_to_delete[delete_count++] = outstream =
2598 make_dynarr_output_stream
2599 ((unsigned_char_dynarr *) conversion_out_dynarr);
2600 }
2601
2602 streams_to_delete[delete_count++] = outstream =
2603 make_coding_output_stream (XLSTREAM (outstream), coding_system, CODING_ENCODE);
2604
2605 reader = XLSTREAM (instream);
2606 writer = XLSTREAM (outstream);
2607 /* decoding_stream will gc-protect outstream */
2608 GCPRO2 (instream, outstream);
2609
2610 while (1)
2611 {
2612 Bytecount size_in_bytes;
2613 char tempbuf[1024]; /* some random amount */
2614
2615 size_in_bytes = Lstream_read (reader, tempbuf, sizeof (tempbuf));
2616
2617 if (size_in_bytes == 0)
2618 break;
2619 else if (size_in_bytes < 0)
2620 signal_error (Qtext_conversion_error,
2621 "Error converting to external format", Qunbound);
2622
2623 if (Lstream_write (writer, tempbuf, size_in_bytes) < 0)
2624 signal_error (Qtext_conversion_error,
2625 "Error converting to external format", Qunbound);
2626 }
2627
2628 /* Closing writer will close any stream at the other end of writer. */
2629 Lstream_close (writer);
2630 Lstream_close (reader);
2631 UNGCPRO;
2632
2633 /* The idea is that this function will create no garbage. */
2634 while (delete_count)
2635 Lstream_delete (XLSTREAM (streams_to_delete [--delete_count]));
2636 }
2637
2638 unbind_to (count);
2639
2640 if (sink_type != DFC_TYPE_LISP_LSTREAM)
2641 {
2642 sink->data.len = Dynarr_length (conversion_out_dynarr);
2643 /* double zero-extend because we may be dealing with Unicode data */
2644 Dynarr_add (conversion_out_dynarr, '\0');
2645 Dynarr_add (conversion_out_dynarr, '\0');
2646 sink->data.ptr = Dynarr_atp (conversion_out_dynarr, 0);
2647 }
2648 }
2649
2650 void
2651 dfc_convert_to_internal_format (dfc_conversion_type source_type,
2652 dfc_conversion_data *source,
2653 Lisp_Object coding_system,
2654 dfc_conversion_type sink_type,
2655 dfc_conversion_data *sink)
2656 {
2657 /* It's guaranteed that many callers are not prepared for GC here,
2658 esp. given that this code conversion occurs in many very hidden
2659 places. */
2660 int count = begin_gc_forbidden ();
2661 Intbyte_dynarr *conversion_in_dynarr;
2662
2663 type_checking_assert
2664 ((source_type == DFC_TYPE_DATA ||
2665 source_type == DFC_TYPE_LISP_LSTREAM)
2666 &&
2667 (sink_type == DFC_TYPE_DATA ||
2668 sink_type == DFC_TYPE_LISP_LSTREAM));
2669
2670 record_unwind_protect (dfc_convert_to_internal_format_reset_in_use,
2671 make_int (dfc_convert_to_internal_format_in_use));
2672 if (Dynarr_length (conversion_in_dynarr_list) <=
2673 dfc_convert_to_internal_format_in_use)
2674 Dynarr_add (conversion_in_dynarr_list, Dynarr_new (Intbyte));
2675 conversion_in_dynarr = Dynarr_at (conversion_in_dynarr_list,
2676 dfc_convert_to_internal_format_in_use);
2677 dfc_convert_to_internal_format_in_use++;
2678 Dynarr_reset (conversion_in_dynarr);
2679
2680 coding_system = get_coding_system_for_text_file (coding_system, 1);
2681
2682 if (source_type != DFC_TYPE_LISP_LSTREAM &&
2683 sink_type != DFC_TYPE_LISP_LSTREAM &&
2684 coding_system_is_binary (coding_system))
2685 {
2686 #ifdef MULE
2687 const Intbyte *ptr = (const Intbyte *) source->data.ptr;
2688 Bytecount len = source->data.len;
2689 const Intbyte *end = ptr + len;
2690
2691 for (; ptr < end; ptr++)
2692 {
2693 Intbyte c = *ptr;
2694
2695 if (BYTE_ASCII_P (c))
2696 Dynarr_add (conversion_in_dynarr, c);
2697 else if (BYTE_C1_P (c))
2698 {
2699 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_CONTROL_1);
2700 Dynarr_add (conversion_in_dynarr, c + 0x20);
2701 }
2702 else
2703 {
2704 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_LATIN_ISO8859_1);
2705 Dynarr_add (conversion_in_dynarr, c);
2706 }
2707 }
2708 #else
2709 Dynarr_add_many (conversion_in_dynarr, source->data.ptr, source->data.len);
2710 #endif
2711 }
2712 #ifdef HAVE_WIN32_CODING_SYSTEMS
2713 /* Optimize the common case involving Unicode where only ASCII/Latin-1 is involved */
2714 else if (source_type != DFC_TYPE_LISP_LSTREAM &&
2715 sink_type != DFC_TYPE_LISP_LSTREAM &&
2716 dfc_coding_system_is_unicode (coding_system))
2717 {
2718 const Intbyte *ptr = (const Intbyte *) source->data.ptr + 1;
2719 Bytecount len = source->data.len;
2720 const Intbyte *end = ptr + len;
2721
2722 if (len & 1)
2723 goto the_hard_way;
2724
2725 for (; ptr < end; ptr += 2)
2726 {
2727 if (*ptr)
2728 goto the_hard_way;
2729 }
2730
2731 ptr = (const Intbyte *) source->data.ptr;
2732 end = ptr + len;
2733
2734 for (; ptr < end; ptr += 2)
2735 {
2736 Intbyte c = *ptr;
2737
2738 if (BYTE_ASCII_P (c))
2739 Dynarr_add (conversion_in_dynarr, c);
2740 #ifdef MULE
2741 else if (BYTE_C1_P (c))
2742 {
2743 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_CONTROL_1);
2744 Dynarr_add (conversion_in_dynarr, c + 0x20);
2745 }
2746 else
2747 {
2748 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_LATIN_ISO8859_1);
2749 Dynarr_add (conversion_in_dynarr, c);
2750 }
2751 #endif /* MULE */
2752 }
2753 }
2754 #endif /* HAVE_WIN32_CODING_SYSTEMS */
2755 else
2756 {
2757 Lisp_Object streams_to_delete[3];
2758 int delete_count;
2759 Lisp_Object instream, outstream;
2760 Lstream *reader, *writer;
2761 struct gcpro gcpro1, gcpro2;
2762
2763 #ifdef HAVE_WIN32_CODING_SYSTEMS
2764 the_hard_way:
2765 #endif /* HAVE_WIN32_CODING_SYSTEMS */
2766 delete_count = 0;
2767 if (source_type == DFC_TYPE_LISP_LSTREAM)
2768 instream = source->lisp_object;
2769 else
2770 {
2771 type_checking_assert (source_type == DFC_TYPE_DATA);
2772 streams_to_delete[delete_count++] = instream =
2773 make_fixed_buffer_input_stream (source->data.ptr, source->data.len);
2774 }
2775
2776 if (sink_type == DFC_TYPE_LISP_LSTREAM)
2777 outstream = sink->lisp_object;
2778 else
2779 {
2780 type_checking_assert (sink_type == DFC_TYPE_DATA);
2781 streams_to_delete[delete_count++] = outstream =
2782 make_dynarr_output_stream
2783 ((unsigned_char_dynarr *) conversion_in_dynarr);
2784 }
2785
2786 streams_to_delete[delete_count++] = outstream =
2787 make_coding_output_stream (XLSTREAM (outstream), coding_system, CODING_DECODE);
2788
2789 reader = XLSTREAM (instream);
2790 writer = XLSTREAM (outstream);
2791 /* outstream will gc-protect its sink stream, if necessary */
2792 GCPRO2 (instream, outstream);
2793
2794 while (1)
2795 {
2796 Bytecount size_in_bytes;
2797 char tempbuf[1024]; /* some random amount */
2798
2799 size_in_bytes = Lstream_read (reader, tempbuf, sizeof (tempbuf));
2800
2801 if (size_in_bytes == 0)
2802 break;
2803 else if (size_in_bytes < 0)
2804 signal_error (Qtext_conversion_error,
2805 "Error converting to internal format", Qunbound);
2806
2807 if (Lstream_write (writer, tempbuf, size_in_bytes) < 0)
2808 signal_error (Qtext_conversion_error,
2809 "Error converting to internal format", Qunbound);
2810 }
2811
2812 /* Closing writer will close any stream at the other end of writer. */
2813 Lstream_close (writer);
2814 Lstream_close (reader);
2815 UNGCPRO;
2816
2817 /* The idea is that this function will create no garbage. */
2818 while (delete_count)
2819 Lstream_delete (XLSTREAM (streams_to_delete [--delete_count]));
2820 }
2821
2822 unbind_to (count);
2823
2824 if (sink_type != DFC_TYPE_LISP_LSTREAM)
2825 {
2826 sink->data.len = Dynarr_length (conversion_in_dynarr);
2827 Dynarr_add (conversion_in_dynarr, '\0'); /* remember to NUL-terminate! */
2828 /* The macros don't currently distinguish between internal and
2829 external sinks, and allocate and copy two extra bytes in both
2830 cases. So we add a second zero, just like for external data
2831 (in that case, because we may be converting to Unicode). */
2832 Dynarr_add (conversion_in_dynarr, '\0');
2833 sink->data.ptr = Dynarr_atp (conversion_in_dynarr, 0);
2834 }
2835 }
2836
2837
2838 /************************************************************************/
2839 /* Basic Emchar functions */
2840 /************************************************************************/
2841
2842 #ifdef MULE
2843
2844 /* Convert a non-ASCII Mule character C into a one-character Mule-encoded
2845 string in STR. Returns the number of bytes stored.
2846 Do not call this directly. Use the macro set_charptr_emchar() instead.
2847 */
2848
2849 Bytecount
2850 non_ascii_set_charptr_emchar (Intbyte *str, Emchar c)
2851 {
2852 Intbyte *p;
2853 Intbyte lb;
2854 int c1, c2;
2855 Lisp_Object charset;
2856
2857 p = str;
2858 BREAKUP_CHAR (c, charset, c1, c2);
2859 lb = CHAR_LEADING_BYTE (c);
2860 if (LEADING_BYTE_PRIVATE_P (lb))
2861 *p++ = PRIVATE_LEADING_BYTE_PREFIX (lb);
2862 *p++ = lb;
2863 if (EQ (charset, Vcharset_control_1))
2864 c1 += 0x20;
2865 *p++ = c1 | 0x80;
2866 if (c2)
2867 *p++ = c2 | 0x80;
2868
2869 return (p - str);
2870 }
2871
2872 /* Return the first character from a Mule-encoded string in STR,
2873 assuming it's non-ASCII. Do not call this directly.
2874 Use the macro charptr_emchar() instead. */
2875
2876 Emchar
2877 non_ascii_charptr_emchar (const Intbyte *str)
2878 {
2879 Intbyte i0 = *str, i1, i2 = 0;
2880 Lisp_Object charset;
2881
2882 if (i0 == LEADING_BYTE_CONTROL_1)
2883 return (Emchar) (*++str - 0x20);
2884
2885 if (LEADING_BYTE_PREFIX_P (i0))
2886 i0 = *++str;
2887
2888 i1 = *++str & 0x7F;
2889
2890 charset = CHARSET_BY_LEADING_BYTE (i0);
2891 if (XCHARSET_DIMENSION (charset) == 2)
2892 i2 = *++str & 0x7F;
2893
2894 return MAKE_CHAR (charset, i1, i2);
2895 }
2896
2897 /* Return whether CH is a valid Emchar, assuming it's non-ASCII.
2898 Do not call this directly. Use the macro valid_char_p() instead. */
2899
2900 int
2901 non_ascii_valid_char_p (Emchar ch)
2902 {
2903 int f1, f2, f3;
2904
2905 /* Must have only lowest 19 bits set */
2906 if (ch & ~0x7FFFF)
2907 return 0;
2908
2909 f1 = CHAR_FIELD1 (ch);
2910 f2 = CHAR_FIELD2 (ch);
2911 f3 = CHAR_FIELD3 (ch);
2912
2913 if (f1 == 0)
2914 {
2915 /* dimension-1 char */
2916 Lisp_Object charset;
2917
2918 /* leading byte must be correct */
2919 if (f2 < MIN_CHAR_FIELD2_OFFICIAL ||
2920 (f2 > MAX_CHAR_FIELD2_OFFICIAL && f2 < MIN_CHAR_FIELD2_PRIVATE) ||
2921 f2 > MAX_CHAR_FIELD2_PRIVATE)
2922 return 0;
2923 /* octet not out of range */
2924 if (f3 < 0x20)
2925 return 0;
2926 /* charset exists */
2927 /*
2928 NOTE: This takes advantage of the fact that
2929 FIELD2_TO_OFFICIAL_LEADING_BYTE and
2930 FIELD2_TO_PRIVATE_LEADING_BYTE are the same.
2931 */
2932 charset = CHARSET_BY_LEADING_BYTE (f2 + FIELD2_TO_OFFICIAL_LEADING_BYTE);
2933 if (EQ (charset, Qnil))
2934 return 0;
2935 /* check range as per size (94 or 96) of charset */
2936 return ((f3 > 0x20 && f3 < 0x7f) || XCHARSET_CHARS (charset) == 96);
2937 }
2938 else
2939 {
2940 /* dimension-2 char */
2941 Lisp_Object charset;
2942
2943 /* leading byte must be correct */
2944 if (f1 < MIN_CHAR_FIELD1_OFFICIAL ||
2945 (f1 > MAX_CHAR_FIELD1_OFFICIAL && f1 < MIN_CHAR_FIELD1_PRIVATE) ||
2946 f1 > MAX_CHAR_FIELD1_PRIVATE)
2947 return 0;
2948 /* octets not out of range */
2949 if (f2 < 0x20 || f3 < 0x20)
2950 return 0;
2951
2952 #ifdef ENABLE_COMPOSITE_CHARS
2953 if (f1 + FIELD1_TO_OFFICIAL_LEADING_BYTE == LEADING_BYTE_COMPOSITE)
2954 {
2955 if (UNBOUNDP (Fgethash (make_int (ch),
2956 Vcomposite_char_char2string_hash_table,
2957 Qunbound)))
2958 return 0;
2959 return 1;
2960 }
2961 #endif /* ENABLE_COMPOSITE_CHARS */
2962
2963 /* charset exists */
2964 if (f1 <= MAX_CHAR_FIELD1_OFFICIAL)
2965 charset =
2966 CHARSET_BY_LEADING_BYTE (f1 + FIELD1_TO_OFFICIAL_LEADING_BYTE);
2967 else
2968 charset =
2969 CHARSET_BY_LEADING_BYTE (f1 + FIELD1_TO_PRIVATE_LEADING_BYTE);
2970
2971 if (EQ (charset, Qnil))
2972 return 0;
2973 /* check range as per size (94x94 or 96x96) of charset */
2974 return ((f2 != 0x20 && f2 != 0x7F && f3 != 0x20 && f3 != 0x7F) ||
2975 XCHARSET_CHARS (charset) == 96);
2976 }
2977 }
2978
2979 /* Copy the character pointed to by SRC into DST. Do not call this
2980 directly. Use the macro charptr_copy_char() instead.
2981 Return the number of bytes copied. */
2982
2983 Bytecount
2984 non_ascii_charptr_copy_char (const Intbyte *src, Intbyte *dst)
2985 {
2986 Bytecount bytes = REP_BYTES_BY_FIRST_BYTE (*src);
2987 Bytecount i;
2988 for (i = bytes; i; i--, dst++, src++)
2989 *dst = *src;
2990 return bytes;
2991 }
2992
2993 #endif /* MULE */
2994
2995
2996 /************************************************************************/
2997 /* streams of Emchars */
2998 /************************************************************************/
2999
3000 #ifdef MULE
3001
3002 /* Treat a stream as a stream of Emchar's rather than a stream of bytes.
3003 The functions below are not meant to be called directly; use
3004 the macros in insdel.h. */
3005
3006 Emchar
3007 Lstream_get_emchar_1 (Lstream *stream, int ch)
3008 {
3009 Intbyte str[MAX_EMCHAR_LEN];
3010 Intbyte *strptr = str;
3011 Bytecount bytes;
3012
3013 str[0] = (Intbyte) ch;
3014
3015 for (bytes = REP_BYTES_BY_FIRST_BYTE (ch) - 1; bytes; bytes--)
3016 {
3017 int c = Lstream_getc (stream);
3018 charbpos_checking_assert (c >= 0);
3019 *++strptr = (Intbyte) c;
3020 }
3021 return charptr_emchar (str);
3022 }
3023
3024 int
3025 Lstream_fput_emchar (Lstream *stream, Emchar ch)
3026 {
3027 Intbyte str[MAX_EMCHAR_LEN];
3028 Bytecount len = set_charptr_emchar (str, ch);
3029 return Lstream_write (stream, str, len);
3030 }
3031
3032 void
3033 Lstream_funget_emchar (Lstream *stream, Emchar ch)
3034 {
3035 Intbyte str[MAX_EMCHAR_LEN];
3036 Bytecount len = set_charptr_emchar (str, ch);
3037 Lstream_unread (stream, str, len);
3038 }
3039
3040 #endif /* MULE */
3041
3042
3043 /************************************************************************/
3044 /* Lisp primitives for working with characters */
3045 /************************************************************************/
3046
3047 DEFUN ("make-char", Fmake_char, 2, 3, 0, /*
3048 Make a character from CHARSET and octets ARG1 and ARG2.
3049 ARG2 is required only for characters from two-dimensional charsets.
3050
3051 Each octet should be in the range 32 through 127 for a 96 or 96x96
3052 charset and 33 through 126 for a 94 or 94x94 charset. (Most charsets
3053 are either 96 or 94x94.) Note that this is 32 more than the values
3054 typically given for 94x94 charsets. When two octets are required, the
3055 order is "standard" -- the same as appears in ISO-2022 encodings,
3056 reference tables, etc.
3057
3058 \(Note the following non-obvious result: Computerized translation
3059 tables often encode the two octets as the high and low bytes,
3060 respectively, of a hex short, while when there's only one octet, it
3061 goes in the low byte. When decoding such a value, you need to treat
3062 the two cases differently when calling make-char: One is (make-char
3063 CHARSET HIGH LOW), the other is (make-char CHARSET LOW).)
3064
3065 For example, (make-char 'latin-iso8859-2 185) or (make-char
3066 'latin-iso8859-2 57) will return the Latin 2 character s with caron.
3067
3068 As another example, the Japanese character for "kawa" (stream), which
3069 looks something like this:
3070
3071 | |
3072 | | |
3073 | | |
3074 | | |
3075 / |
3076
3077 appears in the Unicode Standard (version 2.0) on page 7-287 with the
3078 following values (see also page 7-4):
3079
3080 U 5DDD (Unicode)
3081 G 0-2008 (GB 2312-80)
3082 J 0-3278 (JIS X 0208-1990)
3083 K 0-8425 (KS C 5601-1987)
3084 B A474 (Big Five)
3085 C 1-4455 (CNS 11643-1986 (1st plane))
3086 A 213C34 (ANSI Z39.64-1989)
3087
3088 These are equivalent to:
3089
3090 \(make-char 'chinese-gb2312 52 40)
3091 \(make-char 'japanese-jisx0208 64 110)
3092 \(make-char 'korean-ksc5601 116 57)
3093 \(make-char 'chinese-cns11643-1 76 87)
3094 \(decode-big5-char '(164 . 116))
3095
3096 \(All codes above are two decimal numbers except for Big Five and ANSI
3097 Z39.64, which we don't support. We add 32 to each of the decimal
3098 numbers. Big Five is split in a rather hackish fashion into two
3099 charsets, `big5-1' and `big5-2', due to its excessive size -- 94x157,
3100 with the first codepoint in the range 0xA1 to 0xFE and the second in
3101 the range 0x40 to 0x7E or 0xA1 to 0xFE. `decode-big5-char' is used to
3102 generate the char from its codes, and `encode-big5-char' extracts the
3103 codes.)
3104
3105 When compiled without MULE, this function does not do much, but it's
3106 provided for compatibility. In this case, the following CHARSET symbols
3107 are allowed:
3108
3109 `ascii' -- ARG1 should be in the range 0 through 127.
3110 `control-1' -- ARG1 should be in the range 128 through 159.
3111 else -- ARG1 is coerced to be between 0 and 255, and then the high
3112 bit is set.
3113
3114 `int-to-char of the resulting ARG1' is returned, and ARG2 is always ignored.
3115 */
3116 (charset, arg1, arg2))
3117 {
3118 #ifdef MULE
3119 Lisp_Charset *cs;
3120 int a1, a2;
3121 int lowlim, highlim;
3122
3123 charset = Fget_charset (charset);
3124 cs = XCHARSET (charset);
3125
3126 if (EQ (charset, Vcharset_ascii)) lowlim = 0, highlim = 127;
3127 else if (EQ (charset, Vcharset_control_1)) lowlim = 0, highlim = 31;
3128 else if (CHARSET_CHARS (cs) == 94) lowlim = 33, highlim = 126;
3129 else /* CHARSET_CHARS (cs) == 96) */ lowlim = 32, highlim = 127;
3130
3131 CHECK_INT (arg1);
3132 /* It is useful (and safe, according to Olivier Galibert) to strip
3133 the 8th bit off ARG1 and ARG2 because it allows programmers to
3134 write (make-char 'latin-iso8859-2 CODE) where code is the actual
3135 Latin 2 code of the character. */
3136 a1 = XINT (arg1) & 0x7f;
3137 if (a1 < lowlim || a1 > highlim)
3138 args_out_of_range_3 (arg1, make_int (lowlim), make_int (highlim));
3139
3140 if (CHARSET_DIMENSION (cs) == 1)
3141 {
3142 if (!NILP (arg2))
3143 invalid_argument
3144 ("Charset is of dimension one; second octet must be nil", arg2);
3145 return make_char (MAKE_CHAR (charset, a1, 0));
3146 }
3147
3148 CHECK_INT (arg2);
3149 a2 = XINT (arg2) & 0x7f;
3150 if (a2 < lowlim || a2 > highlim)
3151 args_out_of_range_3 (arg2, make_int (lowlim), make_int (highlim));
3152
3153 return make_char (MAKE_CHAR (charset, a1, a2));
3154 #else
3155 int a1;
3156 int lowlim, highlim;
3157
3158 if (EQ (charset, Qascii)) lowlim = 0, highlim = 127;
3159 else if (EQ (charset, Qcontrol_1)) lowlim = 0, highlim = 31;
3160 else lowlim = 0, highlim = 127;
3161
3162 CHECK_INT (arg1);
3163 /* It is useful (and safe, according to Olivier Galibert) to strip
3164 the 8th bit off ARG1 and ARG2 because it allows programmers to
3165 write (make-char 'latin-iso8859-2 CODE) where code is the actual
3166 Latin 2 code of the character. */
3167 a1 = XINT (arg1) & 0x7f;
3168 if (a1 < lowlim || a1 > highlim)
3169 args_out_of_range_3 (arg1, make_int (lowlim), make_int (highlim));
3170
3171 if (EQ (charset, Qascii))
3172 return make_char (a1);
3173 return make_char (a1 + 128);
3174 #endif /* MULE */
3175 }
3176
3177 #ifdef MULE
3178
3179 DEFUN ("char-charset", Fchar_charset, 1, 1, 0, /*
3180 Return the character set of char CH.
3181 */
3182 (ch))
3183 {
3184 CHECK_CHAR_COERCE_INT (ch);
3185
3186 return XCHARSET_NAME (CHARSET_BY_LEADING_BYTE
3187 (CHAR_LEADING_BYTE (XCHAR (ch))));
3188 }
3189
3190 DEFUN ("char-octet", Fchar_octet, 1, 2, 0, /*
3191 Return the octet numbered N (should be 0 or 1) of char CH.
3192 N defaults to 0 if omitted.
3193 */
3194 (ch, n))
3195 {
3196 Lisp_Object charset;
3197 int octet0, octet1;
3198
3199 CHECK_CHAR_COERCE_INT (ch);
3200
3201 BREAKUP_CHAR (XCHAR (ch), charset, octet0, octet1);
3202
3203 if (NILP (n) || EQ (n, Qzero))
3204 return make_int (octet0);
3205 else if (EQ (n, make_int (1)))
3206 return make_int (octet1);
3207 else
3208 invalid_constant ("Octet number must be 0 or 1", n);
3209 }
3210
3211 DEFUN ("split-char", Fsplit_char, 1, 1, 0, /*
3212 Return list of charset and one or two position-codes of CHAR.
3213 */
3214 (character))
3215 {
3216 /* This function can GC */
3217 struct gcpro gcpro1, gcpro2;
3218 Lisp_Object charset = Qnil;
3219 Lisp_Object rc = Qnil;
3220 int c1, c2;
3221
3222 GCPRO2 (charset, rc);
3223 CHECK_CHAR_COERCE_INT (character);
3224
3225 BREAKUP_CHAR (XCHAR (character), charset, c1, c2);
3226
3227 if (XCHARSET_DIMENSION (Fget_charset (charset)) == 2)
3228 {
3229 rc = list3 (XCHARSET_NAME (charset), make_int (c1), make_int (c2));
3230 }
3231 else
3232 {
3233 rc = list2 (XCHARSET_NAME (charset), make_int (c1));
3234 }
3235 UNGCPRO;
3236
3237 return rc;
3238 }
3239
3240 #endif /* MULE */
3241
3242
3243 /************************************************************************/
3244 /* composite character functions */
3245 /************************************************************************/
3246
3247 #ifdef ENABLE_COMPOSITE_CHARS
3248
3249 Emchar
3250 lookup_composite_char (Intbyte *str, int len)
3251 {
3252 Lisp_Object lispstr = make_string (str, len);
3253 Lisp_Object ch = Fgethash (lispstr,
3254 Vcomposite_char_string2char_hash_table,
3255 Qunbound);
3256 Emchar emch;
3257
3258 if (UNBOUNDP (ch))
3259 {
3260 if (composite_char_row_next >= 128)
3261 invalid_operation ("No more composite chars available", lispstr);
3262 emch = MAKE_CHAR (Vcharset_composite, composite_char_row_next,
3263 composite_char_col_next);
3264 Fputhash (make_char (emch), lispstr,
3265 Vcomposite_char_char2string_hash_table);
3266 Fputhash (lispstr, make_char (emch),
3267 Vcomposite_char_string2char_hash_table);
3268 composite_char_col_next++;
3269 if (composite_char_col_next >= 128)
3270 {
3271 composite_char_col_next = 32;
3272 composite_char_row_next++;
3273 }
3274 }
3275 else
3276 emch = XCHAR (ch);
3277 return emch;
3278 }
3279
3280 Lisp_Object
3281 composite_char_string (Emchar ch)
3282 {
3283 Lisp_Object str = Fgethash (make_char (ch),
3284 Vcomposite_char_char2string_hash_table,
3285 Qunbound);
3286 assert (!UNBOUNDP (str));
3287 return str;
3288 }
3289
3290 xxDEFUN ("make-composite-char", Fmake_composite_char, 1, 1, 0, /*
3291 Convert a string into a single composite character.
3292 The character is the result of overstriking all the characters in
3293 the string.
3294 */
3295 (string))
3296 {
3297 CHECK_STRING (string);
3298 return make_char (lookup_composite_char (XSTRING_DATA (string),
3299 XSTRING_LENGTH (string)));
3300 }
3301
3302 xxDEFUN ("composite-char-string", Fcomposite_char_string, 1, 1, 0, /*
3303 Return a string of the characters comprising a composite character.
3304 */
3305 (ch))
3306 {
3307 Emchar emch;
3308
3309 CHECK_CHAR (ch);
3310 emch = XCHAR (ch);
3311 if (CHAR_LEADING_BYTE (emch) != LEADING_BYTE_COMPOSITE)
3312 invalid_argument ("Must be composite char", ch);
3313 return composite_char_string (emch);
3314 }
3315 #endif /* ENABLE_COMPOSITE_CHARS */
3316
3317
3318 /************************************************************************/
3319 /* initialization */
3320 /************************************************************************/
3321
/* Initialize the_eistring_malloc_zero_init as a copy of
   the_eistring_zero_init with its mallocp_ member set to 1.
   NOTE(review): presumably this is the initializer used for
   malloc()-backed Eistrings -- confirm against the declarations
   in text.h. */
void
init_eistring_once_early (void)
{
  /* Start from a copy of the_eistring_zero_init ... */
  the_eistring_malloc_zero_init = the_eistring_zero_init;
  /* ... and differ from it only in the mallocp_ flag. */
  the_eistring_malloc_zero_init.mallocp_ = 1;
}
3328
3329 void
3330 syms_of_text (void)
3331 {
3332 DEFSUBR (Fmake_char);
3333
3334 #ifdef MULE
3335 DEFSUBR (Fchar_charset);
3336 DEFSUBR (Fchar_octet);
3337 DEFSUBR (Fsplit_char);
3338
3339 #ifdef ENABLE_COMPOSITE_CHARS
3340 DEFSUBR (Fmake_composite_char);
3341 DEFSUBR (Fcomposite_char_string);
3342 #endif
3343 #endif /* MULE */
3344 }
3345
3346 void
3347 reinit_vars_of_text (void)
3348 {
3349 int i;
3350
3351 conversion_in_dynarr_list = Dynarr_new2 (Intbyte_dynarr_dynarr,
3352 Intbyte_dynarr *);
3353 conversion_out_dynarr_list = Dynarr_new2 (Extbyte_dynarr_dynarr,
3354 Extbyte_dynarr *);
3355
3356 /* #### Olivier, why does this need to be reinitted? */
3357 for (i = 0; i <= MAX_BYTEBPOS_GAP_SIZE_3; i++)
3358 three_to_one_table[i] = i / 3;
3359 }
3360
/* Initialize this file's variables: run reinit_vars_of_text(), and,
   when composite characters are enabled, reset the next free
   (row, column) slot to (32, 32) and create and staticpro the two
   composite-character hash tables. */
void
vars_of_text (void)
{
  reinit_vars_of_text ();

#ifdef ENABLE_COMPOSITE_CHARS
  /* #### not dumped properly */
  composite_char_col_next = 32;
  composite_char_row_next = 32;

  /* string -> char lookups compare with `equal'; char -> string
     lookups compare with `eq'. */
  Vcomposite_char_string2char_hash_table =
    make_lisp_hash_table (500, HASH_TABLE_NON_WEAK, HASH_TABLE_EQUAL);
  Vcomposite_char_char2string_hash_table =
    make_lisp_hash_table (500, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
  staticpro (&Vcomposite_char_string2char_hash_table);
  staticpro (&Vcomposite_char_char2string_hash_table);
#endif /* ENABLE_COMPOSITE_CHARS */
}