771
+ − 1 /* Buffer manipulation primitives for XEmacs.
+ − 2 Copyright (C) 1995 Sun Microsystems, Inc.
+ − 3 Copyright (C) 1995, 1996, 2000, 2001, 2002 Ben Wing.
+ − 4 Copyright (C) 1999 Martin Buchholz.
+ − 5
+ − 6 This file is part of XEmacs.
+ − 7
+ − 8 XEmacs is free software; you can redistribute it and/or modify it
+ − 9 under the terms of the GNU General Public License as published by the
+ − 10 Free Software Foundation; either version 2, or (at your option) any
+ − 11 later version.
+ − 12
+ − 13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
+ − 14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ − 15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ − 16 for more details.
+ − 17
+ − 18 You should have received a copy of the GNU General Public License
+ − 19 along with XEmacs; see the file COPYING. If not, write to
+ − 20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ − 21 Boston, MA 02111-1307, USA. */
+ − 22
+ − 23 /* Synched up with: Not in FSF. */
+ − 24
+ − 25 /* Authorship:
+ − 26 */
+ − 27
+ − 28 #include <config.h>
+ − 29 #include "lisp.h"
+ − 30
+ − 31 #include "buffer.h"
+ − 32 #include "charset.h"
+ − 33 #include "file-coding.h"
+ − 34 #include "lstream.h"
+ − 35
+ − 36
+ − 37 /************************************************************************/
+ − 38 /* long comments */
+ − 39 /************************************************************************/
+ − 40
+ − 41 /*
826
+ − 42 ==========================================================================
+ − 43 1. Character Sets
+ − 44 ==========================================================================
771
+ − 45
+ − 46 A character set (or "charset") is an ordered set of characters.
826
+ − 47
+ − 48 A character (which is, BTW, a surprisingly complex concept) is, in a
+ − 49 written representation of text, the most basic written unit that has a
+ − 50 meaning of its own. It's comparable to a phoneme when analyzing words
+ − 51 in spoken speech. Just like with a phoneme (which is an abstract
+ − 52 concept, and is represented in actual spoken speech by one or more
+ − 53 allophones), a character is actually an abstract
+ − 54 concept. &&#### finish this.
+ − 55
771
+ − 56 A particular character in a charset is indexed using one or
+ − 57 more "position codes", which are non-negative integers.
+ − 58 The number of position codes needed to identify a particular
+ − 59 character in a charset is called the "dimension" of the
+ − 60 charset. In XEmacs/Mule, all charsets have 1 or 2 dimensions,
+ − 61 and the size of all charsets (except for a few special cases)
+ − 62 is either 94, 96, 94 by 94, or 96 by 96. The range of
+ − 63 position codes used to index characters from any of these
+ − 64 types of character sets is as follows:
+ − 65
+ − 66 Charset type Position code 1 Position code 2
+ − 67 ------------------------------------------------------------
+ − 68 94 33 - 126 N/A
+ − 69 96 32 - 127 N/A
+ − 70 94x94 33 - 126 33 - 126
+ − 71 96x96 32 - 127 32 - 127
+ − 72
+ − 73 Note that in the above cases position codes do not start at
+ − 74 an expected value such as 0 or 1. The reason for this will
+ − 75 become clear later.
+ − 76
+ − 77 For example, Latin-1 is a 96-character charset, and JISX0208
+ − 78 (the Japanese national character set) is a 94x94-character
+ − 79 charset.
+ − 80
+ − 81 [Note that, although the ranges above define the *valid*
+ − 82 position codes for a charset, some of the slots in a particular
+ − 83 charset may in fact be empty. This is the case for JISX0208,
+ − 84 for example, where (e.g.) all the slots whose first
+ − 85 position code is in the range 118 - 126 are empty.]
+ − 86
+ − 87 There are three charsets that do not follow the above rules.
+ − 88 All of them have one dimension, and have ranges of position
+ − 89 codes as follows:
+ − 90
+ − 91 Charset name Position code 1
+ − 92 ------------------------------------
+ − 93 ASCII 0 - 127
+ − 94 Control-1 0 - 31
+ − 95 Composite 0 - some large number
+ − 96
+ − 97 (The upper bound of the position code for composite characters
+ − 98 has not yet been determined, but it will probably be at
+ − 99 least 16,383).
+ − 100
+ − 101 ASCII is the union of two subsidiary character sets:
+ − 102 Printing-ASCII (the printing ASCII character set,
+ − 103 consisting of position codes 33 - 126, like for a standard
+ − 104 94-character charset) and Control-ASCII (the non-printing
+ − 105 characters that would appear in a binary file with codes 0
+ − 106 - 32 and 127).
+ − 107
+ − 108 Control-1 contains the non-printing characters that would
+ − 109 appear in a binary file with codes 128 - 159.
+ − 110
+ − 111 Composite contains characters that are generated by
+ − 112 overstriking one or more characters from other charsets.
+ − 113
+ − 114 Note that some characters in ASCII, and all characters
+ − 115 in Control-1, are "control" (non-printing) characters.
+ − 116 These have no printed representation but instead control
+ − 117 some other function of the printing (e.g. TAB, code 9, moves
+ − 118 the current character position to the next tab stop).
+ − 119 All other characters in all charsets are "graphic"
+ − 120 (printing) characters.
+ − 121
+ − 122 When a binary file is read in, the bytes in the file are
+ − 123 assigned to character sets as follows:
+ − 124
+ − 125 Bytes Character set Range
+ − 126 --------------------------------------------------
+ − 127 0 - 127 ASCII 0 - 127
+ − 128 128 - 159 Control-1 0 - 31
+ − 129 160 - 255 Latin-1 32 - 127
+ − 130
+ − 131 This is a bit ad-hoc but gets the job done.
+ − 132
826
+ − 133 ==========================================================================
+ − 134 2. Encodings
+ − 135 ==========================================================================
771
+ − 136
+ − 137 An "encoding" is a way of numerically representing
+ − 138 characters from one or more character sets. If an encoding
+ − 139 only encompasses one character set, then the position codes
+ − 140 for the characters in that character set could be used
+ − 141 directly. This is not possible, however, if more than one
+ − 142 character set is to be used in the encoding.
+ − 143
+ − 144 For example, the conversion detailed above between bytes in
+ − 145 a binary file and characters is effectively an encoding
+ − 146 that encompasses the three character sets ASCII, Control-1,
+ − 147 and Latin-1 in a stream of 8-bit bytes.
+ − 148
+ − 149 Thus, an encoding can be viewed as a way of encoding
+ − 150 characters from a specified group of character sets using a
+ − 151 stream of bytes, each of which contains a fixed number of
+ − 152 bits (but not necessarily 8, as in the common usage of
+ − 153 "byte").
+ − 154
+ − 155 Here are descriptions of a couple of common
+ − 156 encodings:
+ − 157
+ − 158
+ − 159 A. Japanese EUC (Extended Unix Code)
+ − 160
+ − 161 This encompasses the character sets:
+ − 162 - Printing-ASCII,
+ − 163 - Katakana-JISX0201 (half-width katakana, the right half of JISX0201).
+ − 164 - Japanese-JISX0208
+ − 165 - Japanese-JISX0212
+ − 166 It uses 8-bit bytes.
+ − 167
+ − 168 Note that Printing-ASCII and Katakana-JISX0201 are 94-character
+ − 169 charsets, while Japanese-JISX0208 is a 94x94-character charset.
+ − 170
+ − 171 The encoding is as follows:
+ − 172
+ − 173 Character set Representation (PC == position-code)
+ − 174 ------------- --------------
+ − 175 Printing-ASCII PC1
+ − 176 Japanese-JISX0208 PC1 + 0x80 | PC2 + 0x80
+ − 177 Katakana-JISX0201 0x8E | PC1 + 0x80
+ − 178
+ − 179
+ − 180 B. JIS7
+ − 181
+ − 182 This encompasses the character sets:
+ − 183 - Printing-ASCII
+ − 184 - Latin-JISX0201 (the left half of JISX0201; this character set is
+ − 185 very similar to Printing-ASCII and is a 94-character charset)
+ − 186 - Japanese-JISX0208
+ − 187 - Katakana-JISX0201
+ − 188 It uses 7-bit bytes.
+ − 189
+ − 190 Unlike Japanese EUC, this is a "modal" encoding, which
+ − 191 means that there are multiple states that the encoding can
+ − 192 be in, which affect how the bytes are to be interpreted.
+ − 193 Special sequences of bytes (called "escape sequences")
+ − 194 are used to change states.
+ − 195
+ − 196 The encoding is as follows:
+ − 197
+ − 198 Character set Representation
+ − 199 ------------- --------------
+ − 200 Printing-ASCII PC1
+ − 201 Latin-JISX0201 PC1
+ − 202 Katakana-JISX0201 PC1
+ − 203 Japanese-JISX0208 PC1 | PC2
+ − 204
+ − 205 Escape sequence ASCII equivalent Meaning
+ − 206 --------------- ---------------- -------
+ − 207 0x1B 0x28 0x42 ESC ( B invoke Printing-ASCII
+ − 208 0x1B 0x28 0x4A ESC ( J invoke Latin-JISX0201
+ − 209 0x1B 0x28 0x49 ESC ( I invoke Katakana-JISX0201
+ − 210 0x1B 0x24 0x42 ESC $ B invoke Japanese-JISX0208
+ − 211
+ − 212 Initially, Printing-ASCII is invoked.
+ − 213
826
+ − 214 ==========================================================================
+ − 215 3. Internal Mule Encodings
+ − 216 ==========================================================================
771
+ − 217
+ − 218 In XEmacs/Mule, each character set is assigned a unique number,
+ − 219 called a "leading byte". This is used in the encodings of a
+ − 220 character. Leading bytes are in the range 0x80 - 0xFF
+ − 221 (except for ASCII, which has a leading byte of 0), although
+ − 222 some leading bytes are reserved.
+ − 223
+ − 224 Charsets whose leading byte is in the range 0x80 - 0x9F are
+ − 225 called "official" and are used for built-in charsets.
+ − 226 Other charsets are called "private" and have leading bytes
+ − 227 in the range 0xA0 - 0xFF; these are user-defined charsets.
+ − 228
+ − 229 More specifically:
+ − 230
+ − 231 Character set Leading byte
+ − 232 ------------- ------------
+ − 233 ASCII 0 (0x7F in arrays indexed by leading byte)
+ − 234 Composite 0x8D
+ − 235 Dimension-1 Official 0x80 - 0x8C/0x8D
+ − 236 (0x8E is free)
+ − 237 Control 0x8F
+ − 238 Dimension-2 Official 0x90 - 0x99
+ − 239 (0x9A - 0x9D are free)
+ − 240 Dimension-1 Private Marker 0x9E
+ − 241 Dimension-2 Private Marker 0x9F
+ − 242 Dimension-1 Private 0xA0 - 0xEF
+ − 243 Dimension-2 Private 0xF0 - 0xFF
+ − 244
+ − 245 There are two internal encodings for characters in XEmacs/Mule.
+ − 246 One is called "string encoding" and is an 8-bit encoding that
+ − 247 is used for representing characters in a buffer or string.
+ − 248 It uses 1 to 4 bytes per character. The other is called
+ − 249 "character encoding" and is a 19-bit encoding that is used
+ − 250 for representing characters individually in a variable.
+ − 251
+ − 252 (In the following descriptions, we'll ignore composite
+ − 253 characters for the moment. We also give a general (structural)
+ − 254 overview first, followed later by the exact details.)
+ − 255
+ − 256 A. Internal String Encoding
+ − 257
+ − 258 ASCII characters are encoded using their position code directly.
+ − 259 Other characters are encoded using their leading byte followed
+ − 260 by their position code(s) with the high bit set. Characters
+ − 261 in private character sets have their leading byte prefixed with
+ − 262 a "leading byte prefix", which is either 0x9E or 0x9F. (No
+ − 263 character sets are ever assigned these leading bytes.) Specifically:
+ − 264
+ − 265 Character set Encoding (PC == position-code)
+ − 266 ------------- -------- (LB == leading-byte)
+ − 267 ASCII PC1 |
+ − 268 Control-1 LB | PC1 + 0xA0
+ − 269 Dimension-1 official LB | PC1 + 0x80
+ − 270 Dimension-1 private 0x9E | LB | PC1 + 0x80
+ − 271 Dimension-2 official LB | PC1 + 0x80 | PC2 + 0x80
+ − 272 Dimension-2 private 0x9F | LB | PC1 + 0x80 | PC2 + 0x80
+ − 273
+ − 274 The basic characteristic of this encoding is that the first byte
+ − 275 of all characters is in the range 0x00 - 0x9F, and the second and
+ − 276 following bytes of all characters are in the range 0xA0 - 0xFF.
+ − 277 This means that it is impossible to get out of sync, or more
+ − 278 specifically:
+ − 279
+ − 280 1. Given any byte position, the beginning of the character it is
+ − 281 within can be determined in constant time.
+ − 282 2. Given any byte position at the beginning of a character, the
+ − 283 beginning of the next character can be determined in constant
+ − 284 time.
+ − 285 3. Given any byte position at the beginning of a character, the
+ − 286 beginning of the previous character can be determined in constant
+ − 287 time.
+ − 288 4. Textual searches can simply treat encoded strings as if they
+ − 289 were encoded in a one-byte-per-character fashion rather than
+ − 290 the actual multi-byte encoding.
+ − 291
+ − 292 None of the standard non-modal encodings meet all of these
+ − 293 conditions. For example, EUC satisfies only (2) and (3), while
+ − 294 Shift-JIS and Big5 (not yet described) satisfy only (2). (All
+ − 295 non-modal encodings must satisfy (2), in order to be unambiguous.)
+ − 296
+ − 297 B. Internal Character Encoding
+ − 298
+ − 299 One 19-bit word represents a single character. The word is
+ − 300 separated into three fields:
+ − 301
+ − 302 Bit number: 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
+ − 303 <------------> <------------------> <------------------>
+ − 304 Field: 1 2 3
+ − 305
+ − 306 Note that fields 2 and 3 hold 7 bits each, while field 1 holds 5 bits.
+ − 307
+ − 308 Character set Field 1 Field 2 Field 3
+ − 309 ------------- ------- ------- -------
+ − 310 ASCII 0 0 PC1
+ − 311 range: (00 - 7F)
+ − 312 Control-1 0 1 PC1
+ − 313 range: (00 - 1F)
+ − 314 Dimension-1 official 0 LB - 0x7F PC1
+ − 315 range: (01 - 0D) (20 - 7F)
+ − 316 Dimension-1 private 0 LB - 0x80 PC1
+ − 317 range: (20 - 6F) (20 - 7F)
+ − 318 Dimension-2 official LB - 0x8F PC1 PC2
+ − 319 range: (01 - 0A) (20 - 7F) (20 - 7F)
+ − 320 Dimension-2 private LB - 0xE1 PC1 PC2
+ − 321 range: (0F - 1E) (20 - 7F) (20 - 7F)
+ − 322 Composite 0x1F ? ?
+ − 323
+ − 324 Note that character codes 0 - 255 are the same as the "binary encoding"
+ − 325 described above.
826
+ − 326
+ − 327 Most of the code in XEmacs knows nothing of the representation of a
+ − 328 character other than that values 0 - 255 represent ASCII, Control 1,
+ − 329 and Latin 1.
+ − 330
+ − 331 WARNING WARNING WARNING: The Boyer-Moore code in search.c, and the
+ − 332 code in search_buffer() that determines whether that code can be used,
+ − 333 knows that "field 3" in a character always corresponds to the last
+ − 334 byte in the textual representation of the character. (This is important
+ − 335 because the Boyer-Moore algorithm works by looking at the last byte
+ − 336 of the search string and &&#### finish this.
+ − 337
+ − 338 ==========================================================================
+ − 339 4. Buffer Positions and Other Typedefs
+ − 340 ==========================================================================
+ − 341
+ − 342 A. Buffer Positions
+ − 343
+ − 344 There are three possible ways to specify positions in a buffer. All
+ − 345 of these are one-based: the beginning of the buffer is position or
+ − 346 index 1, and 0 is not a valid position.
+ − 347
+ − 348 As a "buffer position" (typedef Charbpos):
+ − 349
+ − 350 This is an index specifying an offset in characters from the
+ − 351 beginning of the buffer. Note that buffer positions are
+ − 352 logically *between* characters, not on a character. The
+ − 353 difference between two buffer positions specifies the number of
+ − 354 characters between those positions. Buffer positions are the
+ − 355 only kind of position externally visible to the user.
+ − 356
+ − 357 As a "byte index" (typedef Bytebpos):
+ − 358
+ − 359 This is an index over the bytes used to represent the characters
+ − 360 in the buffer. If there is no Mule support, this is identical
+ − 361 to a buffer position, because each character is represented
+ − 362 using one byte. However, with Mule support, many characters
+ − 363 require two or more bytes for their representation, and so a
+ − 364 byte index may be greater than the corresponding buffer
+ − 365 position.
+ − 366
+ − 367 As a "memory index" (typedef Membpos):
+ − 368
+ − 369 This is the byte index adjusted for the gap. For positions
+ − 370 before the gap, this is identical to the byte index. For
+ − 371 positions after the gap, this is the byte index plus the gap
+ − 372 size. There are two possible memory indices for the gap
+ − 373 position; the memory index at the beginning of the gap should
+ − 374 always be used, except in code that deals with manipulating the
+ − 375 gap, where both indices may be seen. The address of the
+ − 376 character "at" (i.e. following) a particular position can be
+ − 377 obtained from the formula
+ − 378
+ − 379 buffer_start_address + memory_index(position) - 1
+ − 380
+ − 381 except in the case of characters at the gap position.
+ − 382
+ − 383 B. Other Typedefs
+ − 384
867
+ − 385 Ichar:
826
+ − 386 -------
+ − 387 This typedef represents a single Emacs character, which can be
+ − 388 ASCII, ISO-8859, or some extended character, as would typically
+ − 389 be used for Kanji. Note that the representation of a character
867
+ − 390 as an Ichar is *not* the same as the representation of that
826
+ − 391 same character in a string; thus, you cannot do the standard
+ − 392 C trick of passing a pointer to a character to a function that
+ − 393 expects a string.
+ − 394
867
+ − 395 An Ichar takes up 19 bits of representation and (for code
826
+ − 396 compatibility and such) is compatible with an int. This
+ − 397 representation is visible on the Lisp level. The important
867
+ − 398 characteristics of the Ichar representation are
826
+ − 399
+ − 400 -- values 0x00 - 0x7f represent ASCII.
+ − 401 -- values 0x80 - 0xff represent the right half of ISO-8859-1.
+ − 402 -- values 0x100 and up represent all other characters.
+ − 403
867
+ − 404 This means that Ichar values are upwardly compatible with
826
+ − 405 the standard 8-bit representation of ASCII/ISO-8859-1.
+ − 406
867
+ − 407 Ibyte:
826
+ − 408 --------
867
+ − 409 The data in a buffer or string is logically made up of Ibyte
+ − 410 objects, where an Ibyte takes up the same amount of space as a
826
+ − 411 char. (It is declared differently, though, to catch invalid
867
+ − 412 usages.) Strings stored using Ibytes are said to be in
826
+ − 413 "internal format". The important characteristics of internal
+ − 414 format are
+ − 415
867
+ − 416 -- ASCII characters are represented as a single Ibyte,
826
+ − 417 in the range 0 - 0x7f.
867
+ − 418 -- All other characters are represented as an Ibyte in
+ − 419 the range 0x80 - 0x9f followed by one or more Ibytes
826
+ − 420 in the range 0xa0 to 0xff.
+ − 421
+ − 422 This leads to a number of desirable properties:
+ − 423
+ − 424 -- Given the position of the beginning of a character,
+ − 425 you can find the beginning of the next or previous
+ − 426 character in constant time.
+ − 427 -- When searching for a substring or an ASCII character
+ − 428 within the string, you need merely use standard
+ − 429 searching routines.
+ − 430
+ − 431 array of char:
+ − 432 --------------
+ − 433 Strings that go in or out of Emacs are in "external format",
+ − 434 typedef'ed as an array of char or a char *. There is more
+ − 435 than one external format (JIS, EUC, etc.) but they all
+ − 436 have similar properties. They are modal encodings,
+ − 437 which is to say that the meaning of particular bytes is
+ − 438 not fixed but depends on what "mode" the string is currently
+ − 439 in (e.g. bytes in the range 0 - 0x7f might be
+ − 440 interpreted as ASCII, or as Hiragana, or as 2-byte Kanji,
+ − 441 depending on the current mode). The mode starts out in
+ − 442 ASCII/ISO-8859-1 and is switched using escape sequences --
+ − 443 for example, in the JIS encoding, 'ESC $ B' switches to a
+ − 444 mode where pairs of bytes in the range 0 - 0x7f
+ − 445 are interpreted as Kanji characters.
+ − 446
+ − 447 External-formatted data is generally desirable for passing
+ − 448 data between programs because it is upwardly compatible
+ − 449 with standard ASCII/ISO-8859-1 strings and may require
+ − 450 less space than internal encodings such as the one
+ − 451 described above. In addition, some encodings (e.g. JIS)
+ − 452 keep all characters (except the ESC used to switch modes)
+ − 453 in the printing ASCII range 0x20 - 0x7e, which results in
+ − 454 a much higher probability that the data will avoid being
+ − 455 garbled in transmission. Externally-formatted data is
+ − 456 generally not very convenient to work with, however, and
+ − 457 for this reason is usually converted to internal format
+ − 458 before any work is done on the string.
+ − 459
+ − 460 NOTE: filenames need to be in external format so that
+ − 461 ISO-8859-1 characters come out correctly.
+ − 462
+ − 463 Charcount:
+ − 464 ----------
+ − 465 This typedef represents a count of characters, such as
+ − 466 a character offset into a string or the number of
+ − 467 characters between two positions in a buffer. The
+ − 468 difference between two Charbpos's is a Charcount, and
+ − 469 character positions in a string are represented using
+ − 470 a Charcount.
+ − 471
+ − 472 Bytecount:
+ − 473 ----------
+ − 474 Similar to a Charcount but represents a count of bytes.
+ − 475 The difference between two Bytebpos's is a Bytecount.
+ − 476
+ − 477
+ − 478 C. Usage of the Various Representations
+ − 479
+ − 480 Memory indices are used in low-level functions in insdel.c and for
+ − 481 extent endpoints and marker positions. The reason for this is that
+ − 482 this way, the extents and markers don't need to be updated for most
+ − 483 insertions, which merely shrink the gap and don't move any
+ − 484 characters around in memory.
+ − 485
+ − 486 (The beginning-of-gap memory index simplifies insertions w.r.t.
+ − 487 markers, because text usually gets inserted after markers. For
+ − 488 extents, it is merely for consistency, because text can get
+ − 489 inserted either before or after an extent's endpoint depending on
+ − 490 the open/closedness of the endpoint.)
+ − 491
+ − 492 Byte indices are used in other code that needs to be fast,
+ − 493 such as the searching, redisplay, and extent-manipulation code.
+ − 494
+ − 495 Buffer positions are used in all other code. This is because this
+ − 496 representation is easiest to work with (especially since Lisp
+ − 497 code always uses buffer positions), necessitates the fewest
+ − 498 changes to existing code, and is the safest (e.g. if the text gets
+ − 499 shifted underneath a buffer position, it will still point to a
+ − 500 character; if text is shifted under a byte index, it might point
+ − 501 to the middle of a character, which would be bad).
+ − 502
+ − 503 Similarly, Charcounts are used in all code that deals with strings
+ − 504 except for code that needs to be fast, which uses Bytecounts.
+ − 505
+ − 506 Strings are always passed around internally using internal format.
+ − 507 Conversions to and from external format are performed at the time
+ − 508 that the data goes in or out of Emacs.
+ − 509
+ − 510 D. Working With the Various Representations
+ − 511
+ − 512 We write things this way because it's very important that
+ − 513 MAX_BYTEBPOS_GAP_SIZE_3 is a multiple of 3. (As it happens,
+ − 514 65535 is a multiple of 3, but this may not always be the
+ − 515 case.) #### unfinished
+ − 516
+ − 517 ==========================================================================
+ − 518 5. Miscellaneous
+ − 519 ==========================================================================
+ − 520
+ − 521 A. Unicode Support
771
+ − 522
+ − 523 Adding Unicode support is very desirable. Unicode will likely be a
+ − 524 very common representation in the future, and thus we should
+ − 525 represent Unicode characters using three bytes instead of four.
+ − 526 This means we need to find leading bytes for Unicode. Given that
+ − 527 there are 65,536 characters in Unicode and we can attach 96x96 =
+ − 528 9,216 characters per leading byte, we need eight leading bytes for
+ − 529 Unicode. We currently have four free (0x9A - 0x9D), and with a
+ − 530 little bit of rearranging we can get five: ASCII doesn't really
+ − 531 need to take up a leading byte. (We could just as well use 0x7F,
+ − 532 with a little change to the functions that assume that 0x80 is the
+ − 533 lowest leading byte.) This means we still need to dump three
+ − 534 leading bytes and move them into private space. The CNS charsets
+ − 535 are good candidates since they are rarely used, and
+ − 536 JAPANESE_JISX0208_1978 is becoming less and less used and could
826
+ − 537 also be dumped.
+ − 538
+ − 539 B. Composite Characters
+ − 540
+ − 541 Composite characters are characters constructed by overstriking two
771
+ − 542 or more regular characters.
+ − 543
+ − 544 1) The old Mule implementation involves storing composite characters
+ − 545 in a buffer as a tag followed by all of the actual characters
+ − 546 used to make up the composite character. I think this is a bad
+ − 547 idea; it greatly complicates code that wants to handle strings
+ − 548 one character at a time because it has to deal with the possibility
+ − 549 of great big ungainly characters. It's much more reasonable to
+ − 550 simply store an index into a table of composite characters.
+ − 551
+ − 552 2) The current implementation only allows for 16,384 separate
+ − 553 composite characters over the lifetime of the XEmacs process.
+ − 554 This could become a potential problem if the user
+ − 555 edited lots of different files that use composite characters.
+ − 556 Due to FSF bogosity, increasing the number of allowable
+ − 557 composite characters under Mule would decrease the number
+ − 558 of possible faces that can exist. Mule already has shrunk
+ − 559 this to 2048, and further shrinkage would become uncomfortable.
+ − 560 No such problems exist in XEmacs.
+ − 561
+ − 562 Composite characters could be represented as 0x8D C1 C2 C3,
+ − 563 where each C[1-3] is in the range 0xA0 - 0xFF. This allows
+ − 564 for slightly under 2^20 (one million) composite characters
+ − 565 over the XEmacs process lifetime, and you only need to
+ − 566 increase the size of a Mule character from 19 to 21 bits.
+ − 567 Or you could use 0x8D C1 C2 C3 C4, allowing for about
826
+ − 568 85 million (slightly over 2^26) composite characters.
+ − 569
+ − 570 */
771
+ − 571
+ − 572
+ − 573 /************************************************************************/
+ − 574 /* declarations */
+ − 575 /************************************************************************/
+ − 576
+ − 577 Eistring the_eistring_zero_init, the_eistring_malloc_zero_init;
+ − 578
+ − 579 #define MAX_CHARBPOS_GAP_SIZE_3 (65535/3)
+ − 580 #define MAX_BYTEBPOS_GAP_SIZE_3 (3 * MAX_CHARBPOS_GAP_SIZE_3)
+ − 581
+ − 582 short three_to_one_table[1 + MAX_BYTEBPOS_GAP_SIZE_3];
+ − 583
+ − 584 #ifdef MULE
+ − 585
+ − 586 /* Table of number of bytes in the string representation of a character
+ − 587 indexed by the first byte of that representation.
+ − 588
+ − 589 rep_bytes_by_first_byte(c) is more efficient than the equivalent
+ − 590 canonical computation:
+ − 591
826
+ − 592 XCHARSET_REP_BYTES (charset_by_leading_byte (c)) */
771
+ − 593
+ − 594 const Bytecount rep_bytes_by_first_byte[0xA0] =
+ − 595 { /* 0x00 - 0x7f are for straight ASCII */
+ − 596 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ − 597 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ − 598 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ − 599 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ − 600 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ − 601 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ − 602 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ − 603 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ − 604 /* 0x80 - 0x8f are for Dimension-1 official charsets */
+ − 605 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ − 606 /* 0x90 - 0x9d are for Dimension-2 official charsets */
+ − 607 /* 0x9e is for Dimension-1 private charsets */
+ − 608 /* 0x9f is for Dimension-2 private charsets */
+ − 609 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4
+ − 610 };
+ − 611
+ − 612 #ifdef ENABLE_COMPOSITE_CHARS
+ − 613
+ − 614 /* Hash tables for composite chars. One maps string representing
+ − 615 composed chars to their equivalent chars; one goes the
+ − 616 other way. */
+ − 617 Lisp_Object Vcomposite_char_char2string_hash_table;
+ − 618 Lisp_Object Vcomposite_char_string2char_hash_table;
+ − 619
+ − 620 static int composite_char_row_next;
+ − 621 static int composite_char_col_next;
+ − 622
+ − 623 #endif /* ENABLE_COMPOSITE_CHARS */
+ − 624
+ − 625 #endif /* MULE */
+ − 626
+ − 627
+ − 628 /************************************************************************/
+ − 629 /* qxestr***() functions */
+ − 630 /************************************************************************/
+ − 631
+ − 632 /* Most are inline functions in lisp.h */
+ − 633
+ − 634 int
867
+ − 635 qxesprintf (Ibyte *buffer, const CIbyte *format, ...)
771
+ − 636 {
+ − 637 va_list args;
+ − 638 int retval;
+ − 639
+ − 640 va_start (args, format);
+ − 641 retval = vsprintf ((char *) buffer, format, args);
+ − 642 va_end (args);
+ − 643
+ − 644 return retval;
+ − 645 }
+ − 646
+ − 647 /* strcasecmp() implementation from BSD */
867
+ − 648 static Ibyte strcasecmp_charmap[] = {
771
+ − 649 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
+ − 650 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
+ − 651 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
+ − 652 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
+ − 653 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
+ − 654 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
+ − 655 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
+ − 656 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
+ − 657 '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
+ − 658 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
+ − 659 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
+ − 660 '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
+ − 661 '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
+ − 662 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
+ − 663 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
+ − 664 '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
+ − 665 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
+ − 666 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
+ − 667 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
+ − 668 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
+ − 669 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
+ − 670 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
+ − 671 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
+ − 672 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
+ − 673 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
+ − 674 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
+ − 675 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
+ − 676 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
+ − 677 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
+ − 678 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
+ − 679 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
+ − 680 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
+ − 681 };
+ − 682
+ − 683 /* A version that works like generic strcasecmp() -- only collapsing
+ − 684 case in ASCII A-Z/a-z. This is safe on Mule strings due to the
+ − 685 current representation.
+ − 686
+ − 687 This version was written by some Berkeley coder, favoring
+ − 688 nanosecond improvements over clarity. In all other versions below,
+ − 689 we use symmetrical algorithms that may sacrifice a few machine
+ − 690 cycles but are MUCH MUCH clearer, which counts a lot more.
+ − 691 */
+ − 692
+ − 693 int
867
+ − 694 qxestrcasecmp (const Ibyte *s1, const Ibyte *s2)
771
+ − 695 {
867
+ − 696 Ibyte *cm = strcasecmp_charmap;
771
+ − 697
+ − 698 while (cm[*s1] == cm[*s2++])
+ − 699 if (*s1++ == '\0')
+ − 700 return (0);
+ − 701
+ − 702 return (cm[*s1] - cm[*--s2]);
+ − 703 }
+ − 704
+ − 705 int
+ − 706 ascii_strcasecmp (const Char_ASCII *s1, const Char_ASCII *s2)
+ − 707 {
867
+ − 708 return qxestrcasecmp ((const Ibyte *) s1, (const Ibyte *) s2);
771
+ − 709 }
+ − 710
+ − 711 int
867
+ − 712 qxestrcasecmp_c (const Ibyte *s1, const Char_ASCII *s2)
771
+ − 713 {
867
+ − 714 return qxestrcasecmp (s1, (const Ibyte *) s2);
771
+ − 715 }
+ − 716
+ − 717 /* An internationalized version that collapses case in a general fashion.
+ − 718 */
+ − 719
+ − 720 int
867
+ − 721 qxestrcasecmp_i18n (const Ibyte *s1, const Ibyte *s2)
771
+ − 722 {
+ − 723 while (*s1 && *s2)
+ − 724 {
867
+ − 725 if (DOWNCASE (0, itext_ichar (s1)) !=
+ − 726 DOWNCASE (0, itext_ichar (s2)))
771
+ − 727 break;
867
+ − 728 INC_IBYTEPTR (s1);
+ − 729 INC_IBYTEPTR (s2);
771
+ − 730 }
+ − 731
867
+ − 732 return (DOWNCASE (0, itext_ichar (s1)) -
+ − 733 DOWNCASE (0, itext_ichar (s2)));
771
+ − 734 }
+ − 735
+ − 736 /* The only difference between these next two and
+ − 737 qxememcasecmp()/qxememcasecmp_i18n() is that these two will stop if
+ − 738 both strings are equal and less than LEN in length, while
+ − 739 the mem...() versions would would run off the end. */
+ − 740
+ − 741 int
867
+ − 742 qxestrncasecmp (const Ibyte *s1, const Ibyte *s2, Bytecount len)
771
+ − 743 {
867
+ − 744 Ibyte *cm = strcasecmp_charmap;
771
+ − 745
+ − 746 while (len--)
+ − 747 {
+ − 748 int diff = cm[*s1] - cm[*s2];
+ − 749 if (diff != 0)
+ − 750 return diff;
+ − 751 if (!*s1)
+ − 752 return 0;
+ − 753 s1++, s2++;
+ − 754 }
+ − 755
+ − 756 return 0;
+ − 757 }
+ − 758
+ − 759 int
+ − 760 ascii_strncasecmp (const Char_ASCII *s1, const Char_ASCII *s2, Bytecount len)
+ − 761 {
867
+ − 762 return qxestrncasecmp ((const Ibyte *) s1, (const Ibyte *) s2, len);
771
+ − 763 }
+ − 764
+ − 765 int
867
+ − 766 qxestrncasecmp_c (const Ibyte *s1, const Char_ASCII *s2, Bytecount len)
771
+ − 767 {
867
+ − 768 return qxestrncasecmp (s1, (const Ibyte *) s2, len);
771
+ − 769 }
+ − 770
801
+ − 771 /* Compare LEN_FROM_S1 worth of characters from S1 with the same number of
+ − 772 characters from S2, case insensitive. NOTE: Downcasing can convert
+ − 773 characters from one length in bytes to another, so reversing S1 and S2
+ − 774 is *NOT* a symmetric operations! You must choose a length that agrees
+ − 775 with S1. */
+ − 776
771
+ − 777 int
867
+ − 778 qxestrncasecmp_i18n (const Ibyte *s1, const Ibyte *s2,
801
+ − 779 Bytecount len_from_s1)
771
+ − 780 {
801
+ − 781 while (len_from_s1 > 0)
771
+ − 782 {
867
+ − 783 const Ibyte *old_s1 = s1;
+ − 784 int diff = (DOWNCASE (0, itext_ichar (s1)) -
+ − 785 DOWNCASE (0, itext_ichar (s2)));
771
+ − 786 if (diff != 0)
+ − 787 return diff;
+ − 788 if (!*s1)
+ − 789 return 0;
867
+ − 790 INC_IBYTEPTR (s1);
+ − 791 INC_IBYTEPTR (s2);
801
+ − 792 len_from_s1 -= s1 - old_s1;
771
+ − 793 }
+ − 794
+ − 795 return 0;
+ − 796 }
+ − 797
+ − 798 int
867
+ − 799 qxememcmp (const Ibyte *s1, const Ibyte *s2, Bytecount len)
771
+ − 800 {
+ − 801 return memcmp (s1, s2, len);
+ − 802 }
+ − 803
+ − 804 int
867
+ − 805 qxememcmp4 (const Ibyte *s1, Bytecount len1,
+ − 806 const Ibyte *s2, Bytecount len2)
801
+ − 807 {
+ − 808 int retval = qxememcmp (s1, s2, min (len1, len2));
+ − 809 if (retval)
+ − 810 return retval;
+ − 811 return len1 - len2;
+ − 812 }
+ − 813
+ − 814 int
867
+ − 815 qxememcasecmp (const Ibyte *s1, const Ibyte *s2, Bytecount len)
771
+ − 816 {
867
+ − 817 Ibyte *cm = strcasecmp_charmap;
771
+ − 818
+ − 819 while (len--)
+ − 820 {
+ − 821 int diff = cm[*s1] - cm[*s2];
+ − 822 if (diff != 0)
+ − 823 return diff;
+ − 824 s1++, s2++;
+ − 825 }
+ − 826
+ − 827 return 0;
+ − 828 }
+ − 829
+ − 830 int
867
+ − 831 qxememcasecmp4 (const Ibyte *s1, Bytecount len1,
+ − 832 const Ibyte *s2, Bytecount len2)
771
+ − 833 {
801
+ − 834 int retval = qxememcasecmp (s1, s2, min (len1, len2));
+ − 835 if (retval)
+ − 836 return retval;
+ − 837 return len1 - len2;
+ − 838 }
+ − 839
+ − 840 /* Do a character-by-character comparison, returning "which is greater" by
867
+ − 841 comparing the Ichar values. (#### Should have option to compare Unicode
801
+ − 842 points) */
+ − 843
+ − 844 int
867
+ − 845 qxetextcmp (const Ibyte *s1, Bytecount len1,
+ − 846 const Ibyte *s2, Bytecount len2)
801
+ − 847 {
+ − 848 while (len1 > 0 && len2 > 0)
771
+ − 849 {
867
+ − 850 const Ibyte *old_s1 = s1;
+ − 851 const Ibyte *old_s2 = s2;
+ − 852 int diff = itext_ichar (s1) - itext_ichar (s2);
801
+ − 853 if (diff != 0)
+ − 854 return diff;
867
+ − 855 INC_IBYTEPTR (s1);
+ − 856 INC_IBYTEPTR (s2);
801
+ − 857 len1 -= s1 - old_s1;
+ − 858 len2 -= s2 - old_s2;
+ − 859 }
+ − 860
+ − 861 assert (len1 >= 0 && len2 >= 0);
+ − 862 return len1 - len2;
+ − 863 }
+ − 864
+ − 865 int
867
+ − 866 qxetextcmp_matching (const Ibyte *s1, Bytecount len1,
+ − 867 const Ibyte *s2, Bytecount len2,
801
+ − 868 Charcount *matching)
+ − 869 {
+ − 870 *matching = 0;
+ − 871 while (len1 > 0 && len2 > 0)
+ − 872 {
867
+ − 873 const Ibyte *old_s1 = s1;
+ − 874 const Ibyte *old_s2 = s2;
+ − 875 int diff = itext_ichar (s1) - itext_ichar (s2);
801
+ − 876 if (diff != 0)
+ − 877 return diff;
867
+ − 878 INC_IBYTEPTR (s1);
+ − 879 INC_IBYTEPTR (s2);
801
+ − 880 len1 -= s1 - old_s1;
+ − 881 len2 -= s2 - old_s2;
+ − 882 (*matching)++;
+ − 883 }
+ − 884
+ − 885 assert (len1 >= 0 && len2 >= 0);
+ − 886 return len1 - len2;
+ − 887 }
+ − 888
+ − 889 /* Do a character-by-character comparison, returning "which is greater" by
867
+ − 890 comparing the Ichar values, case insensitively (by downcasing both
801
+ − 891 first). (#### Should have option to compare Unicode points)
+ − 892
+ − 893 In this case, both lengths must be specified becaused downcasing can
+ − 894 convert characters from one length in bytes to another; therefore, two
+ − 895 blocks of text of different length might be equal. If both compare
+ − 896 equal up to the limit in length of one but not the other, the longer one
+ − 897 is "greater". */
+ − 898
+ − 899 int
867
+ − 900 qxetextcasecmp (const Ibyte *s1, Bytecount len1,
+ − 901 const Ibyte *s2, Bytecount len2)
801
+ − 902 {
+ − 903 while (len1 > 0 && len2 > 0)
+ − 904 {
867
+ − 905 const Ibyte *old_s1 = s1;
+ − 906 const Ibyte *old_s2 = s2;
+ − 907 int diff = (DOWNCASE (0, itext_ichar (s1)) -
+ − 908 DOWNCASE (0, itext_ichar (s2)));
771
+ − 909 if (diff != 0)
+ − 910 return diff;
867
+ − 911 INC_IBYTEPTR (s1);
+ − 912 INC_IBYTEPTR (s2);
801
+ − 913 len1 -= s1 - old_s1;
+ − 914 len2 -= s2 - old_s2;
771
+ − 915 }
+ − 916
801
+ − 917 assert (len1 >= 0 && len2 >= 0);
+ − 918 return len1 - len2;
+ − 919 }
+ − 920
+ − 921 /* Like qxetextcasecmp() but also return number of characters at
+ − 922 beginning that match. */
+ − 923
+ − 924 int
867
+ − 925 qxetextcasecmp_matching (const Ibyte *s1, Bytecount len1,
+ − 926 const Ibyte *s2, Bytecount len2,
801
+ − 927 Charcount *matching)
+ − 928 {
+ − 929 *matching = 0;
+ − 930 while (len1 > 0 && len2 > 0)
+ − 931 {
867
+ − 932 const Ibyte *old_s1 = s1;
+ − 933 const Ibyte *old_s2 = s2;
+ − 934 int diff = (DOWNCASE (0, itext_ichar (s1)) -
+ − 935 DOWNCASE (0, itext_ichar (s2)));
801
+ − 936 if (diff != 0)
+ − 937 return diff;
867
+ − 938 INC_IBYTEPTR (s1);
+ − 939 INC_IBYTEPTR (s2);
801
+ − 940 len1 -= s1 - old_s1;
+ − 941 len2 -= s2 - old_s2;
+ − 942 (*matching)++;
+ − 943 }
+ − 944
+ − 945 assert (len1 >= 0 && len2 >= 0);
+ − 946 return len1 - len2;
771
+ − 947 }
+ − 948
+ − 949 int
+ − 950 lisp_strcasecmp (Lisp_Object s1, Lisp_Object s2)
+ − 951 {
867
+ − 952 Ibyte *cm = strcasecmp_charmap;
+ − 953 Ibyte *p1 = XSTRING_DATA (s1);
+ − 954 Ibyte *p2 = XSTRING_DATA (s2);
+ − 955 Ibyte *e1 = p1 + XSTRING_LENGTH (s1);
+ − 956 Ibyte *e2 = p2 + XSTRING_LENGTH (s2);
771
+ − 957
+ − 958 /* again, we use a symmetric algorithm and favor clarity over
+ − 959 nanosecond improvements. */
+ − 960 while (1)
+ − 961 {
+ − 962 /* if we reached the end of either string, compare lengths.
+ − 963 do NOT compare the final null byte against anything, in case
+ − 964 the other string also has a null byte at that position. */
+ − 965 if (p1 == e1 || p2 == e2)
+ − 966 return e1 - e2;
+ − 967 if (cm[*p1] != cm[*p2])
+ − 968 return cm[*p1] - cm[*p2];
+ − 969 p1++, p2++;
+ − 970 }
+ − 971 }
+ − 972
+ − 973 int
+ − 974 lisp_strcasecmp_i18n (Lisp_Object s1, Lisp_Object s2)
+ − 975 {
801
+ − 976 return qxetextcasecmp (XSTRING_DATA (s1), XSTRING_LENGTH (s1),
+ − 977 XSTRING_DATA (s2), XSTRING_LENGTH (s2));
771
+ − 978 }
+ − 979
+ − 980
+ − 981 /************************************************************************/
+ − 982 /* conversion between textual representations */
+ − 983 /************************************************************************/
+ − 984
+ − 985 /* NOTE: Does not reset the Dynarr. */
+ − 986
+ − 987 void
867
+ − 988 convert_ibyte_string_into_ichar_dynarr (const Ibyte *str, Bytecount len,
+ − 989 Ichar_dynarr *dyn)
771
+ − 990 {
867
+ − 991 const Ibyte *strend = str + len;
771
+ − 992
+ − 993 while (str < strend)
+ − 994 {
867
+ − 995 Ichar ch = itext_ichar (str);
771
+ − 996 Dynarr_add (dyn, ch);
867
+ − 997 INC_IBYTEPTR (str);
771
+ − 998 }
+ − 999 }
+ − 1000
+ − 1001 Charcount
867
+ − 1002 convert_ibyte_string_into_ichar_string (const Ibyte *str, Bytecount len,
+ − 1003 Ichar *arr)
771
+ − 1004 {
867
+ − 1005 const Ibyte *strend = str + len;
771
+ − 1006 Charcount newlen = 0;
+ − 1007 while (str < strend)
+ − 1008 {
867
+ − 1009 Ichar ch = itext_ichar (str);
771
+ − 1010 arr[newlen++] = ch;
867
+ − 1011 INC_IBYTEPTR (str);
771
+ − 1012 }
+ − 1013 return newlen;
+ − 1014 }
+ − 1015
867
+ − 1016 /* Convert an array of Ichars into the equivalent string representation.
+ − 1017 Store into the given Ibyte dynarr. Does not reset the dynarr.
771
+ − 1018 Does not add a terminating zero. */
+ − 1019
+ − 1020 void
867
+ − 1021 convert_ichar_string_into_ibyte_dynarr (Ichar *arr, int nels,
+ − 1022 Ibyte_dynarr *dyn)
771
+ − 1023 {
867
+ − 1024 Ibyte str[MAX_ICHAR_LEN];
771
+ − 1025 int i;
+ − 1026
+ − 1027 for (i = 0; i < nels; i++)
+ − 1028 {
867
+ − 1029 Bytecount len = set_itext_ichar (str, arr[i]);
771
+ − 1030 Dynarr_add_many (dyn, str, len);
+ − 1031 }
+ − 1032 }
+ − 1033
867
+ − 1034 /* Convert an array of Ichars into the equivalent string representation.
771
+ − 1035 Malloc the space needed for this and return it. If LEN_OUT is not a
867
+ − 1036 NULL pointer, store into LEN_OUT the number of Ibytes in the
+ − 1037 malloc()ed string. Note that the actual number of Ibytes allocated
771
+ − 1038 is one more than this: the returned string is zero-terminated. */
+ − 1039
867
+ − 1040 Ibyte *
+ − 1041 convert_ichar_string_into_malloced_string (Ichar *arr, int nels,
826
+ − 1042 Bytecount *len_out)
771
+ − 1043 {
+ − 1044 /* Damn zero-termination. */
867
+ − 1045 Ibyte *str = (Ibyte *) ALLOCA (nels * MAX_ICHAR_LEN + 1);
+ − 1046 Ibyte *strorig = str;
771
+ − 1047 Bytecount len;
+ − 1048
+ − 1049 int i;
+ − 1050
+ − 1051 for (i = 0; i < nels; i++)
867
+ − 1052 str += set_itext_ichar (str, arr[i]);
771
+ − 1053 *str = '\0';
+ − 1054 len = str - strorig;
867
+ − 1055 str = (Ibyte *) xmalloc (1 + len);
771
+ − 1056 memcpy (str, strorig, 1 + len);
+ − 1057 if (len_out)
+ − 1058 *len_out = len;
+ − 1059 return str;
+ − 1060 }
+ − 1061
826
/* Body of copy_text_between_formats() for one (SRCFMT, DSTFMT) pair.
   Expands in a context where src, srclen, srcobj, dst, dstlen, dstobj
   and src_used are in scope, and always returns from the enclosing
   function.

   With DST null, nothing is stored: the whole source is scanned and the
   number of bytes needed in DSTFMT is returned.  Otherwise characters
   are converted one at a time until the source is exhausted or the next
   character would not fit in DST; the number of bytes stored is
   returned.  Either way, the number of source bytes consumed is stored
   through src_used if it is non-null.  */
#define COPY_TEXT_BETWEEN_FORMATS(srcfmt, dstfmt)                       \
do                                                                      \
{                                                                       \
  const Ibyte *in_ = src;                                               \
  const Ibyte *in_limit_ = src + srclen;                                \
                                                                        \
  if (!dst)                                                             \
    {                                                                   \
      Bytecount needed_ = 0;                                            \
                                                                        \
      while (in_ < in_limit_)                                           \
        {                                                               \
          Ichar ch_ = itext_ichar_fmt (in_, srcfmt, srcobj);            \
                                                                        \
          needed_ += ichar_len_fmt (ch_, dstfmt);                       \
          INC_IBYTEPTR_FMT (in_, srcfmt);                               \
        }                                                               \
      text_checking_assert (in_ == in_limit_);                          \
      if (src_used)                                                     \
        *src_used = in_ - src;                                          \
      return needed_;                                                   \
    }                                                                   \
  else                                                                  \
    {                                                                   \
      Ibyte *out_ = dst;                                                \
      Ibyte *out_limit_ = dst + dstlen;                                 \
                                                                        \
      while (in_ < in_limit_)                                           \
        {                                                               \
          Ichar ch_ = itext_ichar_fmt (in_, srcfmt, srcobj);            \
          Bytecount chlen_ = ichar_len_fmt (ch_, dstfmt);               \
                                                                        \
          /* Never store a partial character. */                        \
          if (out_ + chlen_ > out_limit_)                               \
            break;                                                      \
          set_itext_ichar_fmt (out_, ch_, dstfmt, dstobj);              \
          out_ += chlen_;                                               \
          INC_IBYTEPTR_FMT (in_, srcfmt);                               \
        }                                                               \
      text_checking_assert (in_ <= in_limit_);                          \
      if (src_used)                                                     \
        *src_used = in_ - src;                                          \
      return out_ - dst;                                                \
    }                                                                   \
}                                                                       \
while (0)
+ − 1110
+ − 1111 /* Copy as much text from SRC/SRCLEN to DST/DSTLEN as will fit, converting
+ − 1112 from SRCFMT/SRCOBJ to DSTFMT/DSTOBJ. Return number of bytes stored into
+ − 1113 DST as return value, and number of bytes copied from SRC through
+ − 1114 SRC_USED (if not NULL). If DST is NULL, don't actually store anything
+ − 1115 and just return the size needed to store all the text. Will not copy
+ − 1116 partial characters into DST. */
+ − 1117
+ − 1118 Bytecount
867
+ − 1119 copy_text_between_formats (const Ibyte *src, Bytecount srclen,
826
+ − 1120 Internal_Format srcfmt,
+ − 1121 Lisp_Object srcobj,
867
+ − 1122 Ibyte *dst, Bytecount dstlen,
826
+ − 1123 Internal_Format dstfmt,
+ − 1124 Lisp_Object dstobj,
+ − 1125 Bytecount *src_used)
+ − 1126 {
+ − 1127 if (srcfmt == dstfmt &&
+ − 1128 objects_have_same_internal_representation (srcobj, dstobj))
+ − 1129 {
+ − 1130 if (dst)
+ − 1131 {
+ − 1132 srclen = min (srclen, dstlen);
867
+ − 1133 srclen = validate_ibyte_string_backward (src, srclen);
826
+ − 1134 memcpy (dst, src, srclen);
+ − 1135 if (src_used)
+ − 1136 *src_used = srclen;
+ − 1137 return srclen;
+ − 1138 }
+ − 1139 else
+ − 1140 return srclen;
+ − 1141 }
+ − 1142 /* Everything before the final else statement is an optimization.
+ − 1143 The inner loops inside COPY_TEXT_BETWEEN_FORMATS() have a number
+ − 1144 of calls to *_fmt(), each of which has a switch statement in it.
+ − 1145 By using constants as the FMT argument, these switch statements
+ − 1146 will be optimized out of existence. */
+ − 1147 #define ELSE_FORMATS(fmt1, fmt2) \
+ − 1148 else if (srcfmt == fmt1 && dstfmt == fmt2) \
+ − 1149 COPY_TEXT_BETWEEN_FORMATS (fmt1, fmt2)
+ − 1150 ELSE_FORMATS (FORMAT_DEFAULT, FORMAT_8_BIT_FIXED);
+ − 1151 ELSE_FORMATS (FORMAT_8_BIT_FIXED, FORMAT_DEFAULT);
+ − 1152 ELSE_FORMATS (FORMAT_DEFAULT, FORMAT_32_BIT_FIXED);
+ − 1153 ELSE_FORMATS (FORMAT_32_BIT_FIXED, FORMAT_DEFAULT);
+ − 1154 else
+ − 1155 COPY_TEXT_BETWEEN_FORMATS (srcfmt, dstfmt);
+ − 1156 #undef ELSE_FORMATS
+ − 1157 }
+ − 1158
+ − 1159 /* Copy as much buffer text in BUF, starting at POS, of length LEN, as will
+ − 1160 fit into DST/DSTLEN, converting to DSTFMT. Return number of bytes
+ − 1161 stored into DST as return value, and number of bytes copied from BUF
+ − 1162 through SRC_USED (if not NULL). If DST is NULL, don't actually store
+ − 1163 anything and just return the size needed to store all the text. */
+ − 1164
+ − 1165 Bytecount
+ − 1166 copy_buffer_text_out (struct buffer *buf, Bytebpos pos,
867
+ − 1167 Bytecount len, Ibyte *dst, Bytecount dstlen,
826
+ − 1168 Internal_Format dstfmt, Lisp_Object dstobj,
+ − 1169 Bytecount *src_used)
+ − 1170 {
+ − 1171 Bytecount dst_used = 0;
+ − 1172 if (src_used)
+ − 1173 *src_used = 0;
+ − 1174
+ − 1175 {
+ − 1176 BUFFER_TEXT_LOOP (buf, pos, len, runptr, runlen)
+ − 1177 {
+ − 1178 Bytecount the_src_used, the_dst_used;
+ − 1179
+ − 1180 the_dst_used = copy_text_between_formats (runptr, runlen,
+ − 1181 BUF_FORMAT (buf),
+ − 1182 wrap_buffer (buf),
+ − 1183 dst, dstlen, dstfmt,
+ − 1184 dstobj, &the_src_used);
+ − 1185 dst_used += the_dst_used;
+ − 1186 if (src_used)
+ − 1187 *src_used += the_src_used;
+ − 1188 if (dst)
+ − 1189 {
+ − 1190 dst += the_dst_used;
+ − 1191 dstlen -= the_dst_used;
841
+ − 1192 /* Stop if we didn't use all of the source text. Also stop
+ − 1193 if the destination is full. We need the first test because
+ − 1194 there might be a couple bytes left in the destination, but
+ − 1195 not enough to fit a full character. The first test will in
+ − 1196 fact catch the vast majority of cases where the destination
+ − 1197 is empty, too -- but in case the destination holds *exactly*
+ − 1198 the run length, we put in the second check. (It shouldn't
+ − 1199 really matter though -- next time through we'll just get a
+ − 1200 0.) */
+ − 1201 if (the_src_used < runlen || !dstlen)
826
+ − 1202 break;
+ − 1203 }
+ − 1204 }
+ − 1205 }
+ − 1206
+ − 1207 return dst_used;
+ − 1208 }
+ − 1209
771
+ − 1210
+ − 1211 /************************************************************************/
+ − 1212 /* charset properties of strings */
+ − 1213 /************************************************************************/
+ − 1214
+ − 1215 void
867
+ − 1216 find_charsets_in_ibyte_string (unsigned char *charsets, const Ibyte *str,
771
+ − 1217 Bytecount len)
+ − 1218 {
+ − 1219 #ifndef MULE
+ − 1220 /* Telescope this. */
+ − 1221 charsets[0] = 1;
+ − 1222 #else
867
+ − 1223 const Ibyte *strend = str + len;
771
+ − 1224 memset (charsets, 0, NUM_LEADING_BYTES);
+ − 1225
+ − 1226 /* #### SJT doesn't like this. */
+ − 1227 if (len == 0)
+ − 1228 {
+ − 1229 charsets[XCHARSET_LEADING_BYTE (Vcharset_ascii) - MIN_LEADING_BYTE] = 1;
+ − 1230 return;
+ − 1231 }
+ − 1232
+ − 1233 while (str < strend)
+ − 1234 {
867
+ − 1235 charsets[ichar_leading_byte (itext_ichar (str)) - MIN_LEADING_BYTE] =
771
+ − 1236 1;
867
+ − 1237 INC_IBYTEPTR (str);
771
+ − 1238 }
+ − 1239 #endif
+ − 1240 }
+ − 1241
+ − 1242 void
867
+ − 1243 find_charsets_in_ichar_string (unsigned char *charsets, const Ichar *str,
771
+ − 1244 Charcount len)
+ − 1245 {
+ − 1246 #ifndef MULE
+ − 1247 /* Telescope this. */
+ − 1248 charsets[0] = 1;
+ − 1249 #else
+ − 1250 int i;
+ − 1251
+ − 1252 memset (charsets, 0, NUM_LEADING_BYTES);
+ − 1253
+ − 1254 /* #### SJT doesn't like this. */
+ − 1255 if (len == 0)
+ − 1256 {
+ − 1257 charsets[XCHARSET_LEADING_BYTE (Vcharset_ascii) - MIN_LEADING_BYTE] = 1;
+ − 1258 return;
+ − 1259 }
+ − 1260
+ − 1261 for (i = 0; i < len; i++)
+ − 1262 {
867
+ − 1263 charsets[ichar_leading_byte (str[i]) - MIN_LEADING_BYTE] = 1;
771
+ − 1264 }
+ − 1265 #endif
+ − 1266 }
+ − 1267
+ − 1268 int
867
+ − 1269 ibyte_string_displayed_columns (const Ibyte *str, Bytecount len)
771
+ − 1270 {
+ − 1271 int cols = 0;
867
+ − 1272 const Ibyte *end = str + len;
771
+ − 1273
+ − 1274 while (str < end)
+ − 1275 {
+ − 1276 #ifdef MULE
867
+ − 1277 Ichar ch = itext_ichar (str);
+ − 1278 cols += XCHARSET_COLUMNS (ichar_charset (ch));
771
+ − 1279 #else
+ − 1280 cols++;
+ − 1281 #endif
867
+ − 1282 INC_IBYTEPTR (str);
771
+ − 1283 }
+ − 1284
+ − 1285 return cols;
+ − 1286 }
+ − 1287
+ − 1288 int
867
+ − 1289 ichar_string_displayed_columns (const Ichar *str, Charcount len)
771
+ − 1290 {
+ − 1291 #ifdef MULE
+ − 1292 int cols = 0;
+ − 1293 int i;
+ − 1294
+ − 1295 for (i = 0; i < len; i++)
867
+ − 1296 cols += XCHARSET_COLUMNS (ichar_charset (str[i]));
771
+ − 1297
+ − 1298 return cols;
+ − 1299 #else /* not MULE */
+ − 1300 return len;
+ − 1301 #endif
+ − 1302 }
+ − 1303
+ − 1304 Charcount
867
+ − 1305 ibyte_string_nonascii_chars (const Ibyte *str, Bytecount len)
771
+ − 1306 {
+ − 1307 #ifdef MULE
867
+ − 1308 const Ibyte *end = str + len;
771
+ − 1309 Charcount retval = 0;
+ − 1310
+ − 1311 while (str < end)
+ − 1312 {
826
+ − 1313 if (!byte_ascii_p (*str))
771
+ − 1314 retval++;
867
+ − 1315 INC_IBYTEPTR (str);
771
+ − 1316 }
+ − 1317
+ − 1318 return retval;
+ − 1319 #else
+ − 1320 return 0;
+ − 1321 #endif
+ − 1322 }
+ − 1323
+ − 1324
+ − 1325 /***************************************************************************/
+ − 1326 /* Eistring helper functions */
+ − 1327 /***************************************************************************/
+ − 1328
+ − 1329 int
867
+ − 1330 eistr_casefiddle_1 (Ibyte *olddata, Bytecount len, Ibyte *newdata,
771
+ − 1331 int downp)
+ − 1332 {
867
+ − 1333 Ibyte *endp = olddata + len;
+ − 1334 Ibyte *newp = newdata;
771
+ − 1335 int changedp = 0;
+ − 1336
+ − 1337 while (olddata < endp)
+ − 1338 {
867
+ − 1339 Ichar c = itext_ichar (olddata);
+ − 1340 Ichar newc;
771
+ − 1341
+ − 1342 if (downp)
+ − 1343 newc = DOWNCASE (0, c);
+ − 1344 else
+ − 1345 newc = UPCASE (0, c);
+ − 1346
+ − 1347 if (c != newc)
+ − 1348 changedp = 1;
+ − 1349
867
+ − 1350 newp += set_itext_ichar (newp, newc);
+ − 1351 INC_IBYTEPTR (olddata);
771
+ − 1352 }
+ − 1353
+ − 1354 *newp = '\0';
+ − 1355
+ − 1356 return changedp ? newp - newdata : 0;
+ − 1357 }
+ − 1358
/* Return a buffer size that is at least NEEDED_SIZE, obtained by growing
   OLDBUFSIZE by 50% at a time (with a minimum of 32 once any growth is
   needed).  If OLDBUFSIZE is already large enough it is returned
   unchanged.  If growing by 50% would exceed INT_MAX, NEEDED_SIZE itself
   is returned.  */
int
eifind_large_enough_buffer (int oldbufsize, int needed_size)
{
  while (oldbufsize < needed_size)
    {
      int half = oldbufsize / 2;

      /* Growing further would overflow an int; settle for exactly what
         was asked for.  */
      if (oldbufsize > INT_MAX - half)
        return needed_size;

      /* Same value as oldbufsize * 3 / 2, but without an intermediate
         product that can overflow while the result still fits.  */
      oldbufsize += half;
      if (oldbufsize < 32)
        oldbufsize = 32;
    }

  return oldbufsize;
}
+ − 1370
+ − 1371 void
+ − 1372 eito_malloc_1 (Eistring *ei)
+ − 1373 {
+ − 1374 if (ei->mallocp_)
+ − 1375 return;
+ − 1376 ei->mallocp_ = 1;
+ − 1377 if (ei->data_)
+ − 1378 {
867
+ − 1379 Ibyte *newdata;
771
+ − 1380
+ − 1381 ei->max_size_allocated_ =
+ − 1382 eifind_large_enough_buffer (0, ei->bytelen_ + 1);
867
+ − 1383 newdata = (Ibyte *) xmalloc (ei->max_size_allocated_);
771
+ − 1384 memcpy (newdata, ei->data_, ei->bytelen_ + 1);
+ − 1385 ei->data_ = newdata;
+ − 1386 }
+ − 1387
+ − 1388 if (ei->extdata_)
+ − 1389 {
+ − 1390 Extbyte *newdata = (Extbyte *) xmalloc (ei->extlen_ + 2);
+ − 1391
+ − 1392 memcpy (newdata, ei->extdata_, ei->extlen_);
+ − 1393 /* Double null-terminate in case of Unicode data */
+ − 1394 newdata[ei->extlen_] = '\0';
+ − 1395 newdata[ei->extlen_ + 1] = '\0';
+ − 1396 ei->extdata_ = newdata;
+ − 1397 }
+ − 1398 }
+ − 1399
+ − 1400 int
+ − 1401 eicmp_1 (Eistring *ei, Bytecount off, Charcount charoff,
867
+ − 1402 Bytecount len, Charcount charlen, const Ibyte *data,
771
+ − 1403 const Eistring *ei2, int is_c, int fold_case)
+ − 1404 {
+ − 1405 assert ((off < 0) != (charoff < 0));
+ − 1406 if (off < 0)
+ − 1407 {
+ − 1408 off = charcount_to_bytecount (ei->data_, charoff);
+ − 1409 if (charlen < 0)
+ − 1410 len = -1;
+ − 1411 else
+ − 1412 len = charcount_to_bytecount (ei->data_ + off, charlen);
+ − 1413 }
+ − 1414 if (len < 0)
+ − 1415 len = ei->bytelen_ - off;
+ − 1416
+ − 1417 assert (off >= 0 && off <= ei->bytelen_);
+ − 1418 assert (len >= 0 && off + len <= ei->bytelen_);
+ − 1419 assert ((data == 0) != (ei == 0));
+ − 1420 assert ((is_c != 0) == (data != 0));
+ − 1421 assert (fold_case >= 0 && fold_case <= 2);
+ − 1422
+ − 1423 {
+ − 1424 Bytecount dstlen;
867
+ − 1425 const Ibyte *src = ei->data_, *dst;
771
+ − 1426
+ − 1427 if (data)
+ − 1428 {
+ − 1429 dst = data;
+ − 1430 dstlen = qxestrlen (data);
+ − 1431 }
+ − 1432 else
+ − 1433 {
+ − 1434 dst = ei2->data_;
+ − 1435 dstlen = ei2->bytelen_;
+ − 1436 }
+ − 1437
+ − 1438 if (is_c)
+ − 1439 EI_ASSERT_ASCII ((Char_ASCII *) dst, dstlen);
+ − 1440
801
+ − 1441 return (fold_case == 0 ? qxememcmp4 (src, len, dst, dstlen) :
+ − 1442 fold_case == 1 ? qxememcasecmp4 (src, len, dst, dstlen) :
+ − 1443 qxetextcasecmp (src, len, dst, dstlen));
771
+ − 1444 }
+ − 1445 }
+ − 1446
867
+ − 1447 Ibyte *
826
+ − 1448 eicpyout_malloc_fmt (Eistring *eistr, Bytecount *len_out, Internal_Format fmt,
+ − 1449 Lisp_Object object)
771
+ − 1450 {
867
+ − 1451 Ibyte *ptr;
771
+ − 1452
+ − 1453 assert (fmt == FORMAT_DEFAULT);
867
+ − 1454 ptr = xnew_array (Ibyte, eistr->bytelen_ + 1);
771
+ − 1455 if (len_out)
+ − 1456 *len_out = eistr->bytelen_;
+ − 1457 memcpy (ptr, eistr->data_, eistr->bytelen_ + 1);
+ − 1458 return ptr;
+ − 1459 }
+ − 1460
+ − 1461
+ − 1462 /************************************************************************/
+ − 1463 /* Charcount/Bytecount conversion */
+ − 1464 /************************************************************************/
+ − 1465
+ − 1466 /* Optimization. Do it. Live it. Love it. */
+ − 1467
+ − 1468 #ifdef MULE
+ − 1469
826
+ − 1470 /* Skip as many ASCII bytes as possible in the memory block [PTR, END).
+ − 1471 Return pointer to the first non-ASCII byte. optimized for long
+ − 1472 stretches of ASCII. */
867
+ − 1473 inline static const Ibyte *
+ − 1474 skip_ascii (const Ibyte *ptr, const Ibyte *end)
771
+ − 1475 {
826
+ − 1476 #ifdef EFFICIENT_INT_128_BIT
+ − 1477 # define STRIDE_TYPE INT_128_BIT
+ − 1478 # define HIGH_BIT_MASK \
+ − 1479 MAKE_128_BIT_UNSIGNED_CONSTANT (0x80808080808080808080808080808080)
+ − 1480 #elif defined (EFFICIENT_INT_64_BIT)
+ − 1481 # define STRIDE_TYPE INT_64_BIT
+ − 1482 # define HIGH_BIT_MASK MAKE_64_BIT_UNSIGNED_CONSTANT (0x8080808080808080)
771
+ − 1483 #else
826
+ − 1484 # define STRIDE_TYPE INT_32_BIT
+ − 1485 # define HIGH_BIT_MASK MAKE_32_BIT_UNSIGNED_CONSTANT (0x80808080)
771
+ − 1486 #endif
+ − 1487
+ − 1488 #define ALIGN_BITS ((EMACS_UINT) (ALIGNOF (STRIDE_TYPE) - 1))
+ − 1489 #define ALIGN_MASK (~ ALIGN_BITS)
+ − 1490 #define ALIGNED(ptr) ((((EMACS_UINT) ptr) & ALIGN_BITS) == 0)
+ − 1491 #define STRIDE sizeof (STRIDE_TYPE)
+ − 1492
826
+ − 1493 const unsigned STRIDE_TYPE *ascii_end;
+ − 1494
+ − 1495 /* Need to do in 3 sections -- before alignment start, aligned chunk,
+ − 1496 after alignment end. */
+ − 1497 while (!ALIGNED (ptr))
771
+ − 1498 {
826
+ − 1499 if (ptr == end || !byte_ascii_p (*ptr))
+ − 1500 return ptr;
+ − 1501 ptr++;
+ − 1502 }
+ − 1503 ascii_end = (const unsigned STRIDE_TYPE *) ptr;
+ − 1504 /* This loop screams, because we can detect ASCII
+ − 1505 characters 4 or 8 at a time. */
867
+ − 1506 while ((const Ibyte *) ascii_end + STRIDE <= end
826
+ − 1507 && !(*ascii_end & HIGH_BIT_MASK))
+ − 1508 ascii_end++;
867
+ − 1509 ptr = (Ibyte *) ascii_end;
826
+ − 1510 while (ptr < end && byte_ascii_p (*ptr))
+ − 1511 ptr++;
+ − 1512 return ptr;
+ − 1513 }
+ − 1514
+ − 1515 /* Function equivalents of bytecount_to_charcount/charcount_to_bytecount.
+ − 1516 These work on strings of all sizes but are more efficient than a simple
+ − 1517 loop on large strings and probably less efficient on sufficiently small
+ − 1518 strings. */
+ − 1519
+ − 1520 Charcount
867
+ − 1521 bytecount_to_charcount_fun (const Ibyte *ptr, Bytecount len)
826
+ − 1522 {
+ − 1523 Charcount count = 0;
867
+ − 1524 const Ibyte *end = ptr + len;
826
+ − 1525 while (1)
+ − 1526 {
867
+ − 1527 const Ibyte *newptr = skip_ascii (ptr, end);
826
+ − 1528 count += newptr - ptr;
+ − 1529 ptr = newptr;
+ − 1530 if (ptr == end)
+ − 1531 break;
+ − 1532 {
+ − 1533 /* Optimize for successive characters from the same charset */
867
+ − 1534 Ibyte leading_byte = *ptr;
826
+ − 1535 int bytes = rep_bytes_by_first_byte (leading_byte);
+ − 1536 while (ptr < end && *ptr == leading_byte)
+ − 1537 ptr += bytes, count++;
+ − 1538 }
771
+ − 1539 }
+ − 1540
+ − 1541 /* Bomb out if the specified substring ends in the middle
+ − 1542 of a character. Note that we might have already gotten
+ − 1543 a core dump above from an invalid reference, but at least
+ − 1544 we will get no farther than here.
+ − 1545
+ − 1546 This also catches len < 0. */
800
+ − 1547 text_checking_assert (ptr == end);
771
+ − 1548
+ − 1549 return count;
+ − 1550 }
+ − 1551
+ − 1552 Bytecount
867
+ − 1553 charcount_to_bytecount_fun (const Ibyte *ptr, Charcount len)
771
+ − 1554 {
867
+ − 1555 const Ibyte *newptr = ptr;
826
+ − 1556 while (1)
771
+ − 1557 {
867
+ − 1558 const Ibyte *newnewptr = skip_ascii (newptr, newptr + len);
826
+ − 1559 len -= newnewptr - newptr;
+ − 1560 newptr = newnewptr;
+ − 1561 if (!len)
+ − 1562 break;
+ − 1563 {
+ − 1564 /* Optimize for successive characters from the same charset */
867
+ − 1565 Ibyte leading_byte = *newptr;
826
+ − 1566 int bytes = rep_bytes_by_first_byte (leading_byte);
+ − 1567 while (len > 0 && *newptr == leading_byte)
+ − 1568 newptr += bytes, len--;
+ − 1569 }
771
+ − 1570 }
+ − 1571 return newptr - ptr;
+ − 1572 }
+ − 1573
+ − 1574 /* The next two functions are the actual meat behind the
+ − 1575 charbpos-to-bytebpos and bytebpos-to-charbpos conversions. Currently
+ − 1576 the method they use is fairly unsophisticated; see buffer.h.
+ − 1577
+ − 1578 Note that charbpos_to_bytebpos_func() is probably the most-called
+ − 1579 function in all of XEmacs. Therefore, it must be FAST FAST FAST.
+ − 1580 This is the reason why so much of the code is duplicated.
+ − 1581
+ − 1582 Similar considerations apply to bytebpos_to_charbpos_func(), although
+ − 1583 less so because the function is not called so often.
+ − 1584
+ − 1585 #### At some point this should use a more sophisticated method;
+ − 1586 see buffer.h. */
+ − 1587
+ − 1588 static int not_very_random_number;
+ − 1589
+ − 1590 Bytebpos
+ − 1591 charbpos_to_bytebpos_func (struct buffer *buf, Charbpos x)
+ − 1592 {
+ − 1593 Charbpos bufmin;
+ − 1594 Charbpos bufmax;
+ − 1595 Bytebpos bytmin;
+ − 1596 Bytebpos bytmax;
+ − 1597 int size;
+ − 1598 int forward_p;
+ − 1599 Bytebpos retval;
+ − 1600 int diff_so_far;
+ − 1601 int add_to_cache = 0;
+ − 1602
+ − 1603 /* Check for some cached positions, for speed. */
+ − 1604 if (x == BUF_PT (buf))
826
+ − 1605 return BYTE_BUF_PT (buf);
771
+ − 1606 if (x == BUF_ZV (buf))
826
+ − 1607 return BYTE_BUF_ZV (buf);
771
+ − 1608 if (x == BUF_BEGV (buf))
826
+ − 1609 return BYTE_BUF_BEGV (buf);
771
+ − 1610
+ − 1611 bufmin = buf->text->mule_bufmin;
+ − 1612 bufmax = buf->text->mule_bufmax;
+ − 1613 bytmin = buf->text->mule_bytmin;
+ − 1614 bytmax = buf->text->mule_bytmax;
+ − 1615 size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p;
+ − 1616
+ − 1617 /* The basic idea here is that we shift the "known region" up or down
+ − 1618 until it overlaps the specified position. We do this by moving
+ − 1619 the upper bound of the known region up one character at a time,
+ − 1620 and moving the lower bound of the known region up as necessary
+ − 1621 when the size of the character just seen changes.
+ − 1622
+ − 1623 We optimize this, however, by first shifting the known region to
+ − 1624 one of the cached points if it's close by. (We don't check BEG or
+ − 1625 Z, even though they're cached; most of the time these will be the
+ − 1626 same as BEGV and ZV, and when they're not, they're not likely
+ − 1627 to be used.) */
+ − 1628
+ − 1629 if (x > bufmax)
+ − 1630 {
+ − 1631 Charbpos diffmax = x - bufmax;
+ − 1632 Charbpos diffpt = x - BUF_PT (buf);
+ − 1633 Charbpos diffzv = BUF_ZV (buf) - x;
+ − 1634 /* #### This value could stand some more exploration. */
+ − 1635 Charcount heuristic_hack = (bufmax - bufmin) >> 2;
+ − 1636
+ − 1637 /* Check if the position is closer to PT or ZV than to the
+ − 1638 end of the known region. */
+ − 1639
+ − 1640 if (diffpt < 0)
+ − 1641 diffpt = -diffpt;
+ − 1642 if (diffzv < 0)
+ − 1643 diffzv = -diffzv;
+ − 1644
+ − 1645 /* But also implement a heuristic that favors the known region
+ − 1646 over PT or ZV. The reason for this is that switching to
+ − 1647 PT or ZV will wipe out the knowledge in the known region,
+ − 1648 which might be annoying if the known region is large and
+ − 1649 PT or ZV is not that much closer than the end of the known
+ − 1650 region. */
+ − 1651
+ − 1652 diffzv += heuristic_hack;
+ − 1653 diffpt += heuristic_hack;
+ − 1654 if (diffpt < diffmax && diffpt <= diffzv)
+ − 1655 {
+ − 1656 bufmax = bufmin = BUF_PT (buf);
826
+ − 1657 bytmax = bytmin = BYTE_BUF_PT (buf);
771
+ − 1658 /* We set the size to 1 even though it doesn't really
+ − 1659 matter because the new known region contains no
+ − 1660 characters. We do this because this is the most
+ − 1661 likely size of the characters around the new known
+ − 1662 region, and we avoid potential yuckiness that is
+ − 1663 done when size == 3. */
+ − 1664 size = 1;
+ − 1665 }
+ − 1666 if (diffzv < diffmax)
+ − 1667 {
+ − 1668 bufmax = bufmin = BUF_ZV (buf);
826
+ − 1669 bytmax = bytmin = BYTE_BUF_ZV (buf);
771
+ − 1670 size = 1;
+ − 1671 }
+ − 1672 }
800
+ − 1673 #ifdef ERROR_CHECK_TEXT
771
+ − 1674 else if (x >= bufmin)
+ − 1675 abort ();
+ − 1676 #endif
+ − 1677 else
+ − 1678 {
+ − 1679 Charbpos diffmin = bufmin - x;
+ − 1680 Charbpos diffpt = BUF_PT (buf) - x;
+ − 1681 Charbpos diffbegv = x - BUF_BEGV (buf);
+ − 1682 /* #### This value could stand some more exploration. */
+ − 1683 Charcount heuristic_hack = (bufmax - bufmin) >> 2;
+ − 1684
+ − 1685 if (diffpt < 0)
+ − 1686 diffpt = -diffpt;
+ − 1687 if (diffbegv < 0)
+ − 1688 diffbegv = -diffbegv;
+ − 1689
+ − 1690 /* But also implement a heuristic that favors the known region --
+ − 1691 see above. */
+ − 1692
+ − 1693 diffbegv += heuristic_hack;
+ − 1694 diffpt += heuristic_hack;
+ − 1695
+ − 1696 if (diffpt < diffmin && diffpt <= diffbegv)
+ − 1697 {
+ − 1698 bufmax = bufmin = BUF_PT (buf);
826
+ − 1699 bytmax = bytmin = BYTE_BUF_PT (buf);
771
+ − 1700 /* We set the size to 1 even though it doesn't really
+ − 1701 matter because the new known region contains no
+ − 1702 characters. We do this because this is the most
+ − 1703 likely size of the characters around the new known
+ − 1704 region, and we avoid potential yuckiness that is
+ − 1705 done when size == 3. */
+ − 1706 size = 1;
+ − 1707 }
+ − 1708 if (diffbegv < diffmin)
+ − 1709 {
+ − 1710 bufmax = bufmin = BUF_BEGV (buf);
826
+ − 1711 bytmax = bytmin = BYTE_BUF_BEGV (buf);
771
+ − 1712 size = 1;
+ − 1713 }
+ − 1714 }
+ − 1715
+ − 1716 diff_so_far = x > bufmax ? x - bufmax : bufmin - x;
+ − 1717 if (diff_so_far > 50)
+ − 1718 {
+ − 1719 /* If we have to move more than a certain amount, then look
+ − 1720 into our cache. */
+ − 1721 int minval = INT_MAX;
+ − 1722 int found = 0;
+ − 1723 int i;
+ − 1724
+ − 1725 add_to_cache = 1;
+ − 1726 /* I considered keeping the positions ordered. This would speed
+ − 1727 up this loop, but updating the cache would take longer, so
+ − 1728 it doesn't seem like it would really matter. */
+ − 1729 for (i = 0; i < 16; i++)
+ − 1730 {
+ − 1731 int diff = buf->text->mule_charbpos_cache[i] - x;
+ − 1732
+ − 1733 if (diff < 0)
+ − 1734 diff = -diff;
+ − 1735 if (diff < minval)
+ − 1736 {
+ − 1737 minval = diff;
+ − 1738 found = i;
+ − 1739 }
+ − 1740 }
+ − 1741
+ − 1742 if (minval < diff_so_far)
+ − 1743 {
+ − 1744 bufmax = bufmin = buf->text->mule_charbpos_cache[found];
+ − 1745 bytmax = bytmin = buf->text->mule_bytebpos_cache[found];
+ − 1746 size = 1;
+ − 1747 }
+ − 1748 }
+ − 1749
+ − 1750 /* It's conceivable that the caching above could lead to X being
+ − 1751 the same as one of the range edges. */
+ − 1752 if (x >= bufmax)
+ − 1753 {
+ − 1754 Bytebpos newmax;
+ − 1755 Bytecount newsize;
+ − 1756
+ − 1757 forward_p = 1;
+ − 1758 while (x > bufmax)
+ − 1759 {
+ − 1760 newmax = bytmax;
+ − 1761
+ − 1762 INC_BYTEBPOS (buf, newmax);
+ − 1763 newsize = newmax - bytmax;
+ − 1764 if (newsize != size)
+ − 1765 {
+ − 1766 bufmin = bufmax;
+ − 1767 bytmin = bytmax;
+ − 1768 size = newsize;
+ − 1769 }
+ − 1770 bytmax = newmax;
+ − 1771 bufmax++;
+ − 1772 }
+ − 1773 retval = bytmax;
+ − 1774
+ − 1775 /* #### Should go past the found location to reduce the number
+ − 1776 of times that this function is called */
+ − 1777 }
+ − 1778 else /* x < bufmin */
+ − 1779 {
+ − 1780 Bytebpos newmin;
+ − 1781 Bytecount newsize;
+ − 1782
+ − 1783 forward_p = 0;
+ − 1784 while (x < bufmin)
+ − 1785 {
+ − 1786 newmin = bytmin;
+ − 1787
+ − 1788 DEC_BYTEBPOS (buf, newmin);
+ − 1789 newsize = bytmin - newmin;
+ − 1790 if (newsize != size)
+ − 1791 {
+ − 1792 bufmax = bufmin;
+ − 1793 bytmax = bytmin;
+ − 1794 size = newsize;
+ − 1795 }
+ − 1796 bytmin = newmin;
+ − 1797 bufmin--;
+ − 1798 }
+ − 1799 retval = bytmin;
+ − 1800
+ − 1801 /* #### Should go past the found location to reduce the number
+ − 1802 of times that this function is called
+ − 1803 */
+ − 1804 }
+ − 1805
+ − 1806 /* If size is three, then we have to make sure that the range we
+ − 1807 discovered isn't too large, because we use a fixed-length
+ − 1808 table to divide by 3. */
+ − 1809
+ − 1810 if (size == 3)
+ − 1811 {
+ − 1812 int gap = bytmax - bytmin;
+ − 1813 buf->text->mule_three_p = 1;
+ − 1814 buf->text->mule_shifter = 1;
+ − 1815
+ − 1816 if (gap > MAX_BYTEBPOS_GAP_SIZE_3)
+ − 1817 {
+ − 1818 if (forward_p)
+ − 1819 {
+ − 1820 bytmin = bytmax - MAX_BYTEBPOS_GAP_SIZE_3;
+ − 1821 bufmin = bufmax - MAX_CHARBPOS_GAP_SIZE_3;
+ − 1822 }
+ − 1823 else
+ − 1824 {
+ − 1825 bytmax = bytmin + MAX_BYTEBPOS_GAP_SIZE_3;
+ − 1826 bufmax = bufmin + MAX_CHARBPOS_GAP_SIZE_3;
+ − 1827 }
+ − 1828 }
+ − 1829 }
+ − 1830 else
+ − 1831 {
+ − 1832 buf->text->mule_three_p = 0;
+ − 1833 if (size == 4)
+ − 1834 buf->text->mule_shifter = 2;
+ − 1835 else
+ − 1836 buf->text->mule_shifter = size - 1;
+ − 1837 }
+ − 1838
+ − 1839 buf->text->mule_bufmin = bufmin;
+ − 1840 buf->text->mule_bufmax = bufmax;
+ − 1841 buf->text->mule_bytmin = bytmin;
+ − 1842 buf->text->mule_bytmax = bytmax;
+ − 1843
+ − 1844 if (add_to_cache)
+ − 1845 {
+ − 1846 int replace_loc;
+ − 1847
+ − 1848 /* We throw away a "random" cached value and replace it with
+ − 1849 the new value. It doesn't actually have to be very random
+ − 1850 at all, just evenly distributed.
+ − 1851
+ − 1852 #### It would be better to use a least-recently-used algorithm
+ − 1853 or something that tries to space things out, but I'm not sure
+ − 1854 it's worth it to go to the trouble of maintaining that. */
+ − 1855 not_very_random_number += 621;
+ − 1856 replace_loc = not_very_random_number & 15;
+ − 1857 buf->text->mule_charbpos_cache[replace_loc] = x;
+ − 1858 buf->text->mule_bytebpos_cache[replace_loc] = retval;
+ − 1859 }
+ − 1860
+ − 1861 return retval;
+ − 1862 }
+ − 1863
+ − 1864 /* The logic in this function is almost identical to the logic in
+ − 1865 the previous function. */
+ − 1866
+ − 1867 Charbpos
+ − 1868 bytebpos_to_charbpos_func (struct buffer *buf, Bytebpos x)
+ − 1869 {
+ − 1870 Charbpos bufmin;
+ − 1871 Charbpos bufmax;
+ − 1872 Bytebpos bytmin;
+ − 1873 Bytebpos bytmax;
+ − 1874 int size;
+ − 1875 int forward_p;
+ − 1876 Charbpos retval;
+ − 1877 int diff_so_far;
+ − 1878 int add_to_cache = 0;
+ − 1879
+ − 1880 /* Check for some cached positions, for speed. */
826
+ − 1881 if (x == BYTE_BUF_PT (buf))
771
+ − 1882 return BUF_PT (buf);
826
+ − 1883 if (x == BYTE_BUF_ZV (buf))
771
+ − 1884 return BUF_ZV (buf);
826
+ − 1885 if (x == BYTE_BUF_BEGV (buf))
771
+ − 1886 return BUF_BEGV (buf);
+ − 1887
+ − 1888 bufmin = buf->text->mule_bufmin;
+ − 1889 bufmax = buf->text->mule_bufmax;
+ − 1890 bytmin = buf->text->mule_bytmin;
+ − 1891 bytmax = buf->text->mule_bytmax;
+ − 1892 size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p;
+ − 1893
+ − 1894 /* The basic idea here is that we shift the "known region" up or down
+ − 1895 until it overlaps the specified position. We do this by moving
+ − 1896 the upper bound of the known region up one character at a time,
+ − 1897 and moving the lower bound of the known region up as necessary
+ − 1898 when the size of the character just seen changes.
+ − 1899
+ − 1900 We optimize this, however, by first shifting the known region to
826
+ − 1901 one of the cached points if it's close by. (We don't check BYTE_BEG or
+ − 1902 BYTE_Z, even though they're cached; most of the time these will be the
+ − 1903 same as BYTE_BEGV and BYTE_ZV, and when they're not, they're not likely
771
+ − 1904 to be used.) */
+ − 1905
+ − 1906 if (x > bytmax)
+ − 1907 {
+ − 1908 Bytebpos diffmax = x - bytmax;
826
+ − 1909 Bytebpos diffpt = x - BYTE_BUF_PT (buf);
+ − 1910 Bytebpos diffzv = BYTE_BUF_ZV (buf) - x;
771
+ − 1911 /* #### This value could stand some more exploration. */
+ − 1912 Bytecount heuristic_hack = (bytmax - bytmin) >> 2;
+ − 1913
+ − 1914 /* Check if the position is closer to PT or ZV than to the
+ − 1915 end of the known region. */
+ − 1916
+ − 1917 if (diffpt < 0)
+ − 1918 diffpt = -diffpt;
+ − 1919 if (diffzv < 0)
+ − 1920 diffzv = -diffzv;
+ − 1921
+ − 1922 /* But also implement a heuristic that favors the known region
826
+ − 1923 over BYTE_PT or BYTE_ZV. The reason for this is that switching to
+ − 1924 BYTE_PT or BYTE_ZV will wipe out the knowledge in the known region,
771
+ − 1925 which might be annoying if the known region is large and
826
+ − 1926 BYTE_PT or BYTE_ZV is not that much closer than the end of the known
771
+ − 1927 region. */
+ − 1928
+ − 1929 diffzv += heuristic_hack;
+ − 1930 diffpt += heuristic_hack;
+ − 1931 if (diffpt < diffmax && diffpt <= diffzv)
+ − 1932 {
+ − 1933 bufmax = bufmin = BUF_PT (buf);
826
+ − 1934 bytmax = bytmin = BYTE_BUF_PT (buf);
771
+ − 1935 /* We set the size to 1 even though it doesn't really
+ − 1936 matter because the new known region contains no
+ − 1937 characters. We do this because this is the most
+ − 1938 likely size of the characters around the new known
+ − 1939 region, and we avoid potential yuckiness that is
+ − 1940 done when size == 3. */
+ − 1941 size = 1;
+ − 1942 }
+ − 1943 if (diffzv < diffmax)
+ − 1944 {
+ − 1945 bufmax = bufmin = BUF_ZV (buf);
826
+ − 1946 bytmax = bytmin = BYTE_BUF_ZV (buf);
771
+ − 1947 size = 1;
+ − 1948 }
+ − 1949 }
800
+ − 1950 #ifdef ERROR_CHECK_TEXT
771
+ − 1951 else if (x >= bytmin)
+ − 1952 abort ();
+ − 1953 #endif
+ − 1954 else
+ − 1955 {
+ − 1956 Bytebpos diffmin = bytmin - x;
826
+ − 1957 Bytebpos diffpt = BYTE_BUF_PT (buf) - x;
+ − 1958 Bytebpos diffbegv = x - BYTE_BUF_BEGV (buf);
771
+ − 1959 /* #### This value could stand some more exploration. */
+ − 1960 Bytecount heuristic_hack = (bytmax - bytmin) >> 2;
+ − 1961
+ − 1962 if (diffpt < 0)
+ − 1963 diffpt = -diffpt;
+ − 1964 if (diffbegv < 0)
+ − 1965 diffbegv = -diffbegv;
+ − 1966
+ − 1967 /* But also implement a heuristic that favors the known region --
+ − 1968 see above. */
+ − 1969
+ − 1970 diffbegv += heuristic_hack;
+ − 1971 diffpt += heuristic_hack;
+ − 1972
+ − 1973 if (diffpt < diffmin && diffpt <= diffbegv)
+ − 1974 {
+ − 1975 bufmax = bufmin = BUF_PT (buf);
826
+ − 1976 bytmax = bytmin = BYTE_BUF_PT (buf);
771
+ − 1977 /* We set the size to 1 even though it doesn't really
+ − 1978 matter because the new known region contains no
+ − 1979 characters. We do this because this is the most
+ − 1980 likely size of the characters around the new known
+ − 1981 region, and we avoid potential yuckiness that is
+ − 1982 done when size == 3. */
+ − 1983 size = 1;
+ − 1984 }
+ − 1985 if (diffbegv < diffmin)
+ − 1986 {
+ − 1987 bufmax = bufmin = BUF_BEGV (buf);
826
+ − 1988 bytmax = bytmin = BYTE_BUF_BEGV (buf);
771
+ − 1989 size = 1;
+ − 1990 }
+ − 1991 }
+ − 1992
+ − 1993 diff_so_far = x > bytmax ? x - bytmax : bytmin - x;
+ − 1994 if (diff_so_far > 50)
+ − 1995 {
+ − 1996 /* If we have to move more than a certain amount, then look
+ − 1997 into our cache. */
+ − 1998 int minval = INT_MAX;
+ − 1999 int found = 0;
+ − 2000 int i;
+ − 2001
+ − 2002 add_to_cache = 1;
+ − 2003 /* I considered keeping the positions ordered. This would speed
+ − 2004 up this loop, but updating the cache would take longer, so
+ − 2005 it doesn't seem like it would really matter. */
+ − 2006 for (i = 0; i < 16; i++)
+ − 2007 {
+ − 2008 int diff = buf->text->mule_bytebpos_cache[i] - x;
+ − 2009
+ − 2010 if (diff < 0)
+ − 2011 diff = -diff;
+ − 2012 if (diff < minval)
+ − 2013 {
+ − 2014 minval = diff;
+ − 2015 found = i;
+ − 2016 }
+ − 2017 }
+ − 2018
+ − 2019 if (minval < diff_so_far)
+ − 2020 {
+ − 2021 bufmax = bufmin = buf->text->mule_charbpos_cache[found];
+ − 2022 bytmax = bytmin = buf->text->mule_bytebpos_cache[found];
+ − 2023 size = 1;
+ − 2024 }
+ − 2025 }
+ − 2026
+ − 2027 /* It's conceivable that the caching above could lead to X being
+ − 2028 the same as one of the range edges. */
+ − 2029 if (x >= bytmax)
+ − 2030 {
+ − 2031 Bytebpos newmax;
+ − 2032 Bytecount newsize;
+ − 2033
+ − 2034 forward_p = 1;
+ − 2035 while (x > bytmax)
+ − 2036 {
+ − 2037 newmax = bytmax;
+ − 2038
+ − 2039 INC_BYTEBPOS (buf, newmax);
+ − 2040 newsize = newmax - bytmax;
+ − 2041 if (newsize != size)
+ − 2042 {
+ − 2043 bufmin = bufmax;
+ − 2044 bytmin = bytmax;
+ − 2045 size = newsize;
+ − 2046 }
+ − 2047 bytmax = newmax;
+ − 2048 bufmax++;
+ − 2049 }
+ − 2050 retval = bufmax;
+ − 2051
+ − 2052 /* #### Should go past the found location to reduce the number
+ − 2053 of times that this function is called */
+ − 2054 }
+ − 2055 else /* x <= bytmin */
+ − 2056 {
+ − 2057 Bytebpos newmin;
+ − 2058 Bytecount newsize;
+ − 2059
+ − 2060 forward_p = 0;
+ − 2061 while (x < bytmin)
+ − 2062 {
+ − 2063 newmin = bytmin;
+ − 2064
+ − 2065 DEC_BYTEBPOS (buf, newmin);
+ − 2066 newsize = bytmin - newmin;
+ − 2067 if (newsize != size)
+ − 2068 {
+ − 2069 bufmax = bufmin;
+ − 2070 bytmax = bytmin;
+ − 2071 size = newsize;
+ − 2072 }
+ − 2073 bytmin = newmin;
+ − 2074 bufmin--;
+ − 2075 }
+ − 2076 retval = bufmin;
+ − 2077
+ − 2078 /* #### Should go past the found location to reduce the number
+ − 2079 of times that this function is called
+ − 2080 */
+ − 2081 }
+ − 2082
+ − 2083 /* If size is three, than we have to max sure that the range we
+ − 2084 discovered isn't too large, because we use a fixed-length
+ − 2085 table to divide by 3. */
+ − 2086
+ − 2087 if (size == 3)
+ − 2088 {
+ − 2089 int gap = bytmax - bytmin;
+ − 2090 buf->text->mule_three_p = 1;
+ − 2091 buf->text->mule_shifter = 1;
+ − 2092
+ − 2093 if (gap > MAX_BYTEBPOS_GAP_SIZE_3)
+ − 2094 {
+ − 2095 if (forward_p)
+ − 2096 {
+ − 2097 bytmin = bytmax - MAX_BYTEBPOS_GAP_SIZE_3;
+ − 2098 bufmin = bufmax - MAX_CHARBPOS_GAP_SIZE_3;
+ − 2099 }
+ − 2100 else
+ − 2101 {
+ − 2102 bytmax = bytmin + MAX_BYTEBPOS_GAP_SIZE_3;
+ − 2103 bufmax = bufmin + MAX_CHARBPOS_GAP_SIZE_3;
+ − 2104 }
+ − 2105 }
+ − 2106 }
+ − 2107 else
+ − 2108 {
+ − 2109 buf->text->mule_three_p = 0;
+ − 2110 if (size == 4)
+ − 2111 buf->text->mule_shifter = 2;
+ − 2112 else
+ − 2113 buf->text->mule_shifter = size - 1;
+ − 2114 }
+ − 2115
+ − 2116 buf->text->mule_bufmin = bufmin;
+ − 2117 buf->text->mule_bufmax = bufmax;
+ − 2118 buf->text->mule_bytmin = bytmin;
+ − 2119 buf->text->mule_bytmax = bytmax;
+ − 2120
+ − 2121 if (add_to_cache)
+ − 2122 {
+ − 2123 int replace_loc;
+ − 2124
+ − 2125 /* We throw away a "random" cached value and replace it with
+ − 2126 the new value. It doesn't actually have to be very random
+ − 2127 at all, just evenly distributed.
+ − 2128
+ − 2129 #### It would be better to use a least-recently-used algorithm
+ − 2130 or something that tries to space things out, but I'm not sure
+ − 2131 it's worth it to go to the trouble of maintaining that. */
+ − 2132 not_very_random_number += 621;
+ − 2133 replace_loc = not_very_random_number & 15;
+ − 2134 buf->text->mule_charbpos_cache[replace_loc] = retval;
+ − 2135 buf->text->mule_bytebpos_cache[replace_loc] = x;
+ − 2136 }
+ − 2137
+ − 2138 return retval;
+ − 2139 }
+ − 2140
+ − 2141 /* Text of length BYTELENGTH and CHARLENGTH (in different units)
+ − 2142 was inserted at charbpos START. */
+ − 2143
+ − 2144 void
+ − 2145 buffer_mule_signal_inserted_region (struct buffer *buf, Charbpos start,
+ − 2146 Bytecount bytelength,
+ − 2147 Charcount charlength)
+ − 2148 {
+ − 2149 int size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p;
+ − 2150 int i;
+ − 2151
+ − 2152 /* Adjust the cache of known positions. */
+ − 2153 for (i = 0; i < 16; i++)
+ − 2154 {
+ − 2155
+ − 2156 if (buf->text->mule_charbpos_cache[i] > start)
+ − 2157 {
+ − 2158 buf->text->mule_charbpos_cache[i] += charlength;
+ − 2159 buf->text->mule_bytebpos_cache[i] += bytelength;
+ − 2160 }
+ − 2161 }
+ − 2162
+ − 2163 if (start >= buf->text->mule_bufmax)
826
+ − 2164 return;
771
+ − 2165
+ − 2166 /* The insertion is either before the known region, in which case
+ − 2167 it shoves it forward; or within the known region, in which case
+ − 2168 it shoves the end forward. (But it may make the known region
+ − 2169 inconsistent, so we may have to shorten it.) */
+ − 2170
+ − 2171 if (start <= buf->text->mule_bufmin)
+ − 2172 {
+ − 2173 buf->text->mule_bufmin += charlength;
+ − 2174 buf->text->mule_bufmax += charlength;
+ − 2175 buf->text->mule_bytmin += bytelength;
+ − 2176 buf->text->mule_bytmax += bytelength;
+ − 2177 }
+ − 2178 else
+ − 2179 {
+ − 2180 Charbpos end = start + charlength;
+ − 2181 /* the insertion point divides the known region in two.
+ − 2182 Keep the longer half, at least, and expand into the
+ − 2183 inserted chunk as much as possible. */
+ − 2184
+ − 2185 if (start - buf->text->mule_bufmin > buf->text->mule_bufmax - start)
+ − 2186 {
+ − 2187 Bytebpos bytestart = (buf->text->mule_bytmin
+ − 2188 + size * (start - buf->text->mule_bufmin));
+ − 2189 Bytebpos bytenew;
+ − 2190
+ − 2191 while (start < end)
+ − 2192 {
+ − 2193 bytenew = bytestart;
+ − 2194 INC_BYTEBPOS (buf, bytenew);
+ − 2195 if (bytenew - bytestart != size)
+ − 2196 break;
+ − 2197 start++;
+ − 2198 bytestart = bytenew;
+ − 2199 }
+ − 2200 if (start != end)
+ − 2201 {
+ − 2202 buf->text->mule_bufmax = start;
+ − 2203 buf->text->mule_bytmax = bytestart;
+ − 2204 }
+ − 2205 else
+ − 2206 {
+ − 2207 buf->text->mule_bufmax += charlength;
+ − 2208 buf->text->mule_bytmax += bytelength;
+ − 2209 }
+ − 2210 }
+ − 2211 else
+ − 2212 {
+ − 2213 Bytebpos byteend = (buf->text->mule_bytmin
+ − 2214 + size * (start - buf->text->mule_bufmin)
+ − 2215 + bytelength);
+ − 2216 Bytebpos bytenew;
+ − 2217
+ − 2218 buf->text->mule_bufmax += charlength;
+ − 2219 buf->text->mule_bytmax += bytelength;
+ − 2220
+ − 2221 while (end > start)
+ − 2222 {
+ − 2223 bytenew = byteend;
+ − 2224 DEC_BYTEBPOS (buf, bytenew);
+ − 2225 if (byteend - bytenew != size)
+ − 2226 break;
+ − 2227 end--;
+ − 2228 byteend = bytenew;
+ − 2229 }
+ − 2230 if (start != end)
+ − 2231 {
+ − 2232 buf->text->mule_bufmin = end;
+ − 2233 buf->text->mule_bytmin = byteend;
+ − 2234 }
+ − 2235 }
+ − 2236 }
+ − 2237 }
+ − 2238
826
+ − 2239 /* Text from START to END (equivalent in Bytebpos's: from BYTE_START to
+ − 2240 BYTE_END) was deleted. */
771
+ − 2241
+ − 2242 void
+ − 2243 buffer_mule_signal_deleted_region (struct buffer *buf, Charbpos start,
826
+ − 2244 Charbpos end, Bytebpos byte_start,
+ − 2245 Bytebpos byte_end)
771
+ − 2246 {
+ − 2247 int i;
+ − 2248
+ − 2249 /* Adjust the cache of known positions. */
+ − 2250 for (i = 0; i < 16; i++)
+ − 2251 {
+ − 2252 /* After the end; gets shoved backward */
+ − 2253 if (buf->text->mule_charbpos_cache[i] > end)
+ − 2254 {
+ − 2255 buf->text->mule_charbpos_cache[i] -= end - start;
826
+ − 2256 buf->text->mule_bytebpos_cache[i] -= byte_end - byte_start;
771
+ − 2257 }
+ − 2258 /* In the range; moves to start of range */
+ − 2259 else if (buf->text->mule_charbpos_cache[i] > start)
+ − 2260 {
+ − 2261 buf->text->mule_charbpos_cache[i] = start;
826
+ − 2262 buf->text->mule_bytebpos_cache[i] = byte_start;
771
+ − 2263 }
+ − 2264 }
+ − 2265
+ − 2266 /* We don't care about any text after the end of the known region. */
+ − 2267
+ − 2268 end = min (end, buf->text->mule_bufmax);
826
+ − 2269 byte_end = min (byte_end, buf->text->mule_bytmax);
771
+ − 2270 if (start >= end)
826
+ − 2271 return;
771
+ − 2272
+ − 2273 /* The end of the known region offsets by the total amount of deletion,
+ − 2274 since it's all before it. */
+ − 2275
+ − 2276 buf->text->mule_bufmax -= end - start;
826
+ − 2277 buf->text->mule_bytmax -= byte_end - byte_start;
771
+ − 2278
+ − 2279 /* Now we don't care about any text after the start of the known region. */
+ − 2280
+ − 2281 end = min (end, buf->text->mule_bufmin);
826
+ − 2282 byte_end = min (byte_end, buf->text->mule_bytmin);
771
+ − 2283 if (start < end)
+ − 2284 {
+ − 2285 buf->text->mule_bufmin -= end - start;
826
+ − 2286 buf->text->mule_bytmin -= byte_end - byte_start;
771
+ − 2287 }
+ − 2288 }
+ − 2289
+ − 2290 #endif /* MULE */
+ − 2291
+ − 2292
+ − 2293 /************************************************************************/
+ − 2294 /* verifying buffer and string positions */
+ − 2295 /************************************************************************/
+ − 2296
+ − 2297 /* Functions below are tagged with either _byte or _char indicating
+ − 2298 whether they return byte or character positions. For a buffer,
+ − 2299 a character position is a "Charbpos" and a byte position is a "Bytebpos".
+ − 2300 For strings, these are sometimes typed using "Charcount" and
+ − 2301 "Bytecount". */
+ − 2302
+ − 2303 /* Flags for the functions below are:
+ − 2304
+ − 2305 GB_ALLOW_PAST_ACCESSIBLE
+ − 2306
+ − 2307 Allow positions to range over the entire buffer (BUF_BEG to BUF_Z),
+ − 2308 rather than just the accessible portion (BUF_BEGV to BUF_ZV).
+ − 2309 For strings, this flag has no effect.
+ − 2310
+ − 2311 GB_COERCE_RANGE
+ − 2312
+ − 2313 If the position is outside the allowable range, return the lower
+ − 2314 or upper bound of the range, whichever is closer to the specified
+ − 2315 position.
+ − 2316
+ − 2317 GB_NO_ERROR_IF_BAD
+ − 2318
+ − 2319 If the position is outside the allowable range, return -1.
+ − 2320
+ − 2321 GB_NEGATIVE_FROM_END
+ − 2322
+ − 2323 If a value is negative, treat it as an offset from the end.
+ − 2324 Only applies to strings.
+ − 2325
+ − 2326 The following additional flags apply only to the functions
+ − 2327 that return ranges:
+ − 2328
+ − 2329 GB_ALLOW_NIL
+ − 2330
+ − 2331 Either or both positions can be nil. If FROM is nil,
+ − 2332 FROM_OUT will contain the lower bound of the allowed range.
+ − 2333 If TO is nil, TO_OUT will contain the upper bound of the
+ − 2334 allowed range.
+ − 2335
+ − 2336 GB_CHECK_ORDER
+ − 2337
+ − 2338 FROM must contain the lower bound and TO the upper bound
+ − 2339 of the range. If the positions are reversed, an error is
+ − 2340 signalled.
+ − 2341
+ − 2342 The following is a combination flag:
+ − 2343
+ − 2344 GB_HISTORICAL_STRING_BEHAVIOR
+ − 2345
+ − 2346 Equivalent to (GB_NEGATIVE_FROM_END | GB_ALLOW_NIL).
+ − 2347 */
+ − 2348
+ − 2349 /* Return a buffer position stored in a Lisp_Object. Full
+ − 2350 error-checking is done on the position. Flags can be specified to
+ − 2351 control the behavior of out-of-range values. The default behavior
+ − 2352 is to require that the position is within the accessible part of
+ − 2353 the buffer (BEGV and ZV), and to signal an error if the position is
+ − 2354 out of range.
+ − 2355
+ − 2356 */
+ − 2357
+ − 2358 Charbpos
+ − 2359 get_buffer_pos_char (struct buffer *b, Lisp_Object pos, unsigned int flags)
+ − 2360 {
+ − 2361 /* Does not GC */
+ − 2362 Charbpos ind;
+ − 2363 Charbpos min_allowed, max_allowed;
+ − 2364
+ − 2365 CHECK_INT_COERCE_MARKER (pos);
+ − 2366 ind = XINT (pos);
+ − 2367 min_allowed = flags & GB_ALLOW_PAST_ACCESSIBLE ? BUF_BEG (b) : BUF_BEGV (b);
+ − 2368 max_allowed = flags & GB_ALLOW_PAST_ACCESSIBLE ? BUF_Z (b) : BUF_ZV (b);
+ − 2369
+ − 2370 if (ind < min_allowed || ind > max_allowed)
+ − 2371 {
+ − 2372 if (flags & GB_COERCE_RANGE)
+ − 2373 ind = ind < min_allowed ? min_allowed : max_allowed;
+ − 2374 else if (flags & GB_NO_ERROR_IF_BAD)
+ − 2375 ind = -1;
+ − 2376 else
+ − 2377 {
793
+ − 2378 Lisp_Object buffer = wrap_buffer (b);
+ − 2379
771
+ − 2380 args_out_of_range (buffer, pos);
+ − 2381 }
+ − 2382 }
+ − 2383
+ − 2384 return ind;
+ − 2385 }
+ − 2386
+ − 2387 Bytebpos
+ − 2388 get_buffer_pos_byte (struct buffer *b, Lisp_Object pos, unsigned int flags)
+ − 2389 {
+ − 2390 Charbpos bpos = get_buffer_pos_char (b, pos, flags);
+ − 2391 if (bpos < 0) /* could happen with GB_NO_ERROR_IF_BAD */
+ − 2392 return -1;
+ − 2393 return charbpos_to_bytebpos (b, bpos);
+ − 2394 }
+ − 2395
+ − 2396 /* Return a pair of buffer positions representing a range of text,
+ − 2397 taken from a pair of Lisp_Objects. Full error-checking is
+ − 2398 done on the positions. Flags can be specified to control the
+ − 2399 behavior of out-of-range values. The default behavior is to
+ − 2400 allow the range bounds to be specified in either order
+ − 2401 (however, FROM_OUT will always be the lower bound of the range
+ − 2402 and TO_OUT the upper bound),to require that the positions
+ − 2403 are within the accessible part of the buffer (BEGV and ZV),
+ − 2404 and to signal an error if the positions are out of range.
+ − 2405 */
+ − 2406
+ − 2407 void
+ − 2408 get_buffer_range_char (struct buffer *b, Lisp_Object from, Lisp_Object to,
826
+ − 2409 Charbpos *from_out, Charbpos *to_out,
+ − 2410 unsigned int flags)
771
+ − 2411 {
+ − 2412 /* Does not GC */
+ − 2413 Charbpos min_allowed, max_allowed;
+ − 2414
+ − 2415 min_allowed = (flags & GB_ALLOW_PAST_ACCESSIBLE) ?
+ − 2416 BUF_BEG (b) : BUF_BEGV (b);
+ − 2417 max_allowed = (flags & GB_ALLOW_PAST_ACCESSIBLE) ?
+ − 2418 BUF_Z (b) : BUF_ZV (b);
+ − 2419
+ − 2420 if (NILP (from) && (flags & GB_ALLOW_NIL))
+ − 2421 *from_out = min_allowed;
+ − 2422 else
+ − 2423 *from_out = get_buffer_pos_char (b, from, flags | GB_NO_ERROR_IF_BAD);
+ − 2424
+ − 2425 if (NILP (to) && (flags & GB_ALLOW_NIL))
+ − 2426 *to_out = max_allowed;
+ − 2427 else
+ − 2428 *to_out = get_buffer_pos_char (b, to, flags | GB_NO_ERROR_IF_BAD);
+ − 2429
+ − 2430 if ((*from_out < 0 || *to_out < 0) && !(flags & GB_NO_ERROR_IF_BAD))
+ − 2431 {
793
+ − 2432 Lisp_Object buffer = wrap_buffer (b);
+ − 2433
771
+ − 2434 args_out_of_range_3 (buffer, from, to);
+ − 2435 }
+ − 2436
+ − 2437 if (*from_out >= 0 && *to_out >= 0 && *from_out > *to_out)
+ − 2438 {
+ − 2439 if (flags & GB_CHECK_ORDER)
+ − 2440 invalid_argument_2 ("start greater than end", from, to);
+ − 2441 else
+ − 2442 {
+ − 2443 Charbpos temp = *from_out;
+ − 2444 *from_out = *to_out;
+ − 2445 *to_out = temp;
+ − 2446 }
+ − 2447 }
+ − 2448 }
+ − 2449
+ − 2450 void
+ − 2451 get_buffer_range_byte (struct buffer *b, Lisp_Object from, Lisp_Object to,
826
+ − 2452 Bytebpos *from_out, Bytebpos *to_out,
+ − 2453 unsigned int flags)
771
+ − 2454 {
+ − 2455 Charbpos s, e;
+ − 2456
+ − 2457 get_buffer_range_char (b, from, to, &s, &e, flags);
+ − 2458 if (s >= 0)
+ − 2459 *from_out = charbpos_to_bytebpos (b, s);
+ − 2460 else /* could happen with GB_NO_ERROR_IF_BAD */
+ − 2461 *from_out = -1;
+ − 2462 if (e >= 0)
+ − 2463 *to_out = charbpos_to_bytebpos (b, e);
+ − 2464 else
+ − 2465 *to_out = -1;
+ − 2466 }
+ − 2467
+ − 2468 static Charcount
+ − 2469 get_string_pos_char_1 (Lisp_Object string, Lisp_Object pos, unsigned int flags,
+ − 2470 Charcount known_length)
+ − 2471 {
+ − 2472 Charcount ccpos;
+ − 2473 Charcount min_allowed = 0;
+ − 2474 Charcount max_allowed = known_length;
+ − 2475
+ − 2476 /* Computation of KNOWN_LENGTH is potentially expensive so we pass
+ − 2477 it in. */
+ − 2478 CHECK_INT (pos);
+ − 2479 ccpos = XINT (pos);
+ − 2480 if (ccpos < 0 && flags & GB_NEGATIVE_FROM_END)
+ − 2481 ccpos += max_allowed;
+ − 2482
+ − 2483 if (ccpos < min_allowed || ccpos > max_allowed)
+ − 2484 {
+ − 2485 if (flags & GB_COERCE_RANGE)
+ − 2486 ccpos = ccpos < min_allowed ? min_allowed : max_allowed;
+ − 2487 else if (flags & GB_NO_ERROR_IF_BAD)
+ − 2488 ccpos = -1;
+ − 2489 else
+ − 2490 args_out_of_range (string, pos);
+ − 2491 }
+ − 2492
+ − 2493 return ccpos;
+ − 2494 }
+ − 2495
+ − 2496 Charcount
+ − 2497 get_string_pos_char (Lisp_Object string, Lisp_Object pos, unsigned int flags)
+ − 2498 {
+ − 2499 return get_string_pos_char_1 (string, pos, flags,
826
+ − 2500 string_char_length (string));
771
+ − 2501 }
+ − 2502
+ − 2503 Bytecount
+ − 2504 get_string_pos_byte (Lisp_Object string, Lisp_Object pos, unsigned int flags)
+ − 2505 {
+ − 2506 Charcount ccpos = get_string_pos_char (string, pos, flags);
+ − 2507 if (ccpos < 0) /* could happen with GB_NO_ERROR_IF_BAD */
+ − 2508 return -1;
793
+ − 2509 return string_index_char_to_byte (string, ccpos);
771
+ − 2510 }
+ − 2511
+ − 2512 void
+ − 2513 get_string_range_char (Lisp_Object string, Lisp_Object from, Lisp_Object to,
+ − 2514 Charcount *from_out, Charcount *to_out,
+ − 2515 unsigned int flags)
+ − 2516 {
+ − 2517 Charcount min_allowed = 0;
826
+ − 2518 Charcount max_allowed = string_char_length (string);
771
+ − 2519
+ − 2520 if (NILP (from) && (flags & GB_ALLOW_NIL))
+ − 2521 *from_out = min_allowed;
+ − 2522 else
+ − 2523 *from_out = get_string_pos_char_1 (string, from,
+ − 2524 flags | GB_NO_ERROR_IF_BAD,
+ − 2525 max_allowed);
+ − 2526
+ − 2527 if (NILP (to) && (flags & GB_ALLOW_NIL))
+ − 2528 *to_out = max_allowed;
+ − 2529 else
+ − 2530 *to_out = get_string_pos_char_1 (string, to,
+ − 2531 flags | GB_NO_ERROR_IF_BAD,
+ − 2532 max_allowed);
+ − 2533
+ − 2534 if ((*from_out < 0 || *to_out < 0) && !(flags & GB_NO_ERROR_IF_BAD))
+ − 2535 args_out_of_range_3 (string, from, to);
+ − 2536
+ − 2537 if (*from_out >= 0 && *to_out >= 0 && *from_out > *to_out)
+ − 2538 {
+ − 2539 if (flags & GB_CHECK_ORDER)
+ − 2540 invalid_argument_2 ("start greater than end", from, to);
+ − 2541 else
+ − 2542 {
+ − 2543 Charbpos temp = *from_out;
+ − 2544 *from_out = *to_out;
+ − 2545 *to_out = temp;
+ − 2546 }
+ − 2547 }
+ − 2548 }
+ − 2549
+ − 2550 void
+ − 2551 get_string_range_byte (Lisp_Object string, Lisp_Object from, Lisp_Object to,
+ − 2552 Bytecount *from_out, Bytecount *to_out,
+ − 2553 unsigned int flags)
+ − 2554 {
+ − 2555 Charcount s, e;
+ − 2556
+ − 2557 get_string_range_char (string, from, to, &s, &e, flags);
+ − 2558 if (s >= 0)
793
+ − 2559 *from_out = string_index_char_to_byte (string, s);
771
+ − 2560 else /* could happen with GB_NO_ERROR_IF_BAD */
+ − 2561 *from_out = -1;
+ − 2562 if (e >= 0)
793
+ − 2563 *to_out = string_index_char_to_byte (string, e);
771
+ − 2564 else
+ − 2565 *to_out = -1;
+ − 2566
+ − 2567 }
+ − 2568
826
+ − 2569 Charxpos
771
+ − 2570 get_buffer_or_string_pos_char (Lisp_Object object, Lisp_Object pos,
+ − 2571 unsigned int flags)
+ − 2572 {
+ − 2573 return STRINGP (object) ?
+ − 2574 get_string_pos_char (object, pos, flags) :
+ − 2575 get_buffer_pos_char (XBUFFER (object), pos, flags);
+ − 2576 }
+ − 2577
826
+ − 2578 Bytexpos
771
+ − 2579 get_buffer_or_string_pos_byte (Lisp_Object object, Lisp_Object pos,
+ − 2580 unsigned int flags)
+ − 2581 {
+ − 2582 return STRINGP (object) ?
+ − 2583 get_string_pos_byte (object, pos, flags) :
+ − 2584 get_buffer_pos_byte (XBUFFER (object), pos, flags);
+ − 2585 }
+ − 2586
+ − 2587 void
+ − 2588 get_buffer_or_string_range_char (Lisp_Object object, Lisp_Object from,
826
+ − 2589 Lisp_Object to, Charxpos *from_out,
+ − 2590 Charxpos *to_out, unsigned int flags)
771
+ − 2591 {
+ − 2592 if (STRINGP (object))
+ − 2593 get_string_range_char (object, from, to, from_out, to_out, flags);
+ − 2594 else
826
+ − 2595 get_buffer_range_char (XBUFFER (object), from, to, from_out, to_out,
+ − 2596 flags);
771
+ − 2597 }
+ − 2598
+ − 2599 void
+ − 2600 get_buffer_or_string_range_byte (Lisp_Object object, Lisp_Object from,
826
+ − 2601 Lisp_Object to, Bytexpos *from_out,
+ − 2602 Bytexpos *to_out, unsigned int flags)
771
+ − 2603 {
+ − 2604 if (STRINGP (object))
+ − 2605 get_string_range_byte (object, from, to, from_out, to_out, flags);
+ − 2606 else
826
+ − 2607 get_buffer_range_byte (XBUFFER (object), from, to, from_out, to_out,
+ − 2608 flags);
771
+ − 2609 }
+ − 2610
826
+ − 2611 Charxpos
771
+ − 2612 buffer_or_string_accessible_begin_char (Lisp_Object object)
+ − 2613 {
+ − 2614 return STRINGP (object) ? 0 : BUF_BEGV (XBUFFER (object));
+ − 2615 }
+ − 2616
826
+ − 2617 Charxpos
771
+ − 2618 buffer_or_string_accessible_end_char (Lisp_Object object)
+ − 2619 {
+ − 2620 return STRINGP (object) ?
826
+ − 2621 string_char_length (object) : BUF_ZV (XBUFFER (object));
771
+ − 2622 }
+ − 2623
826
+ − 2624 Bytexpos
771
+ − 2625 buffer_or_string_accessible_begin_byte (Lisp_Object object)
+ − 2626 {
826
+ − 2627 return STRINGP (object) ? 0 : BYTE_BUF_BEGV (XBUFFER (object));
771
+ − 2628 }
+ − 2629
826
+ − 2630 Bytexpos
771
+ − 2631 buffer_or_string_accessible_end_byte (Lisp_Object object)
+ − 2632 {
+ − 2633 return STRINGP (object) ?
826
+ − 2634 XSTRING_LENGTH (object) : BYTE_BUF_ZV (XBUFFER (object));
771
+ − 2635 }
+ − 2636
826
+ − 2637 Charxpos
771
+ − 2638 buffer_or_string_absolute_begin_char (Lisp_Object object)
+ − 2639 {
+ − 2640 return STRINGP (object) ? 0 : BUF_BEG (XBUFFER (object));
+ − 2641 }
+ − 2642
826
+ − 2643 Charxpos
771
+ − 2644 buffer_or_string_absolute_end_char (Lisp_Object object)
+ − 2645 {
+ − 2646 return STRINGP (object) ?
826
+ − 2647 string_char_length (object) : BUF_Z (XBUFFER (object));
+ − 2648 }
+ − 2649
+ − 2650 Bytexpos
+ − 2651 buffer_or_string_absolute_begin_byte (Lisp_Object object)
+ − 2652 {
+ − 2653 return STRINGP (object) ? 0 : BYTE_BUF_BEG (XBUFFER (object));
+ − 2654 }
+ − 2655
+ − 2656 Bytexpos
+ − 2657 buffer_or_string_absolute_end_byte (Lisp_Object object)
+ − 2658 {
+ − 2659 return STRINGP (object) ?
+ − 2660 XSTRING_LENGTH (object) : BYTE_BUF_Z (XBUFFER (object));
+ − 2661 }
+ − 2662
+ − 2663 Charbpos
+ − 2664 charbpos_clip_to_bounds (Charbpos lower, Charbpos num, Charbpos upper)
+ − 2665 {
+ − 2666 return (num < lower ? lower :
+ − 2667 num > upper ? upper :
+ − 2668 num);
771
+ − 2669 }
+ − 2670
+ − 2671 Bytebpos
826
+ − 2672 bytebpos_clip_to_bounds (Bytebpos lower, Bytebpos num, Bytebpos upper)
+ − 2673 {
+ − 2674 return (num < lower ? lower :
+ − 2675 num > upper ? upper :
+ − 2676 num);
+ − 2677 }
+ − 2678
+ − 2679 Charxpos
+ − 2680 charxpos_clip_to_bounds (Charxpos lower, Charxpos num, Charxpos upper)
771
+ − 2681 {
826
+ − 2682 return (num < lower ? lower :
+ − 2683 num > upper ? upper :
+ − 2684 num);
+ − 2685 }
+ − 2686
+ − 2687 Bytexpos
+ − 2688 bytexpos_clip_to_bounds (Bytexpos lower, Bytexpos num, Bytexpos upper)
+ − 2689 {
+ − 2690 return (num < lower ? lower :
+ − 2691 num > upper ? upper :
+ − 2692 num);
771
+ − 2693 }
+ − 2694
826
+ − 2695 /* These could be implemented in terms of the get_buffer_or_string()
+ − 2696 functions above, but those are complicated and handle lots of weird
+ − 2697 cases stemming from uncertain external input. */
+ − 2698
+ − 2699 Charxpos
+ − 2700 buffer_or_string_clip_to_accessible_char (Lisp_Object object, Charxpos pos)
+ − 2701 {
+ − 2702 return (charxpos_clip_to_bounds
+ − 2703 (pos, buffer_or_string_accessible_begin_char (object),
+ − 2704 buffer_or_string_accessible_end_char (object)));
+ − 2705 }
+ − 2706
+ − 2707 Bytexpos
+ − 2708 buffer_or_string_clip_to_accessible_byte (Lisp_Object object, Bytexpos pos)
771
+ − 2709 {
826
+ − 2710 return (bytexpos_clip_to_bounds
+ − 2711 (pos, buffer_or_string_accessible_begin_byte (object),
+ − 2712 buffer_or_string_accessible_end_byte (object)));
+ − 2713 }
+ − 2714
+ − 2715 Charxpos
+ − 2716 buffer_or_string_clip_to_absolute_char (Lisp_Object object, Charxpos pos)
+ − 2717 {
+ − 2718 return (charxpos_clip_to_bounds
+ − 2719 (pos, buffer_or_string_absolute_begin_char (object),
+ − 2720 buffer_or_string_absolute_end_char (object)));
+ − 2721 }
+ − 2722
+ − 2723 Bytexpos
+ − 2724 buffer_or_string_clip_to_absolute_byte (Lisp_Object object, Bytexpos pos)
+ − 2725 {
+ − 2726 return (bytexpos_clip_to_bounds
+ − 2727 (pos, buffer_or_string_absolute_begin_byte (object),
+ − 2728 buffer_or_string_absolute_end_byte (object)));
771
+ − 2729 }
+ − 2730
+ − 2731
+ − 2732 /************************************************************************/
+ − 2733 /* Implement TO_EXTERNAL_FORMAT, TO_INTERNAL_FORMAT */
+ − 2734 /************************************************************************/
+ − 2735
/* A dynarr whose elements are pointers to Ibyte dynarrs. */
typedef struct
{
  Dynarr_declare (Ibyte_dynarr *);
} Ibyte_dynarr_dynarr;

/* A dynarr whose elements are pointers to Extbyte dynarrs. */
typedef struct
{
  Dynarr_declare (Extbyte_dynarr *);
} Extbyte_dynarr_dynarr;

/* Scratch buffers for dfc_convert_to_external_format() and
   dfc_convert_to_internal_format() respectively.  A conversion uses the
   entry whose index equals the matching *_in_use counter below, adding a
   new dynarr to the list when no such entry exists yet. */
static Extbyte_dynarr_dynarr *conversion_out_dynarr_list;
static Ibyte_dynarr_dynarr *conversion_in_dynarr_list;

/* Index of the scratch buffer the next conversion will use.  Each
   conversion function binds its counter to the current value plus one
   (with internal_bind_int) before doing the actual conversion work. */
static int dfc_convert_to_external_format_in_use;
static int dfc_convert_to_internal_format_in_use;
+ − 2751
+ − 2752 void
+ − 2753 dfc_convert_to_external_format (dfc_conversion_type source_type,
+ − 2754 dfc_conversion_data *source,
+ − 2755 Lisp_Object coding_system,
+ − 2756 dfc_conversion_type sink_type,
+ − 2757 dfc_conversion_data *sink)
+ − 2758 {
+ − 2759 /* It's guaranteed that many callers are not prepared for GC here,
+ − 2760 esp. given that this code conversion occurs in many very hidden
+ − 2761 places. */
+ − 2762 int count = begin_gc_forbidden ();
+ − 2763 Extbyte_dynarr *conversion_out_dynarr;
+ − 2764
+ − 2765 type_checking_assert
+ − 2766 (((source_type == DFC_TYPE_DATA) ||
+ − 2767 (source_type == DFC_TYPE_LISP_LSTREAM && LSTREAMP (source->lisp_object)) ||
+ − 2768 (source_type == DFC_TYPE_LISP_STRING && STRINGP (source->lisp_object)))
+ − 2769 &&
+ − 2770 ((sink_type == DFC_TYPE_DATA) ||
+ − 2771 (sink_type == DFC_TYPE_LISP_LSTREAM && LSTREAMP (source->lisp_object))));
+ − 2772
+ − 2773 if (Dynarr_length (conversion_out_dynarr_list) <=
+ − 2774 dfc_convert_to_external_format_in_use)
+ − 2775 Dynarr_add (conversion_out_dynarr_list, Dynarr_new (Extbyte));
+ − 2776 conversion_out_dynarr = Dynarr_at (conversion_out_dynarr_list,
+ − 2777 dfc_convert_to_external_format_in_use);
+ − 2778 Dynarr_reset (conversion_out_dynarr);
+ − 2779
853
+ − 2780 internal_bind_int (&dfc_convert_to_external_format_in_use,
+ − 2781 dfc_convert_to_external_format_in_use + 1);
+ − 2782
771
+ − 2783 coding_system = get_coding_system_for_text_file (coding_system, 0);
+ − 2784
+ − 2785 /* Here we optimize in the case where the coding system does no
+ − 2786 conversion. However, we don't want to optimize in case the source
+ − 2787 or sink is an lstream, since writing to an lstream can cause a
+ − 2788 garbage collection, and this could be problematic if the source
+ − 2789 is a lisp string. */
+ − 2790 if (source_type != DFC_TYPE_LISP_LSTREAM &&
+ − 2791 sink_type != DFC_TYPE_LISP_LSTREAM &&
+ − 2792 coding_system_is_binary (coding_system))
+ − 2793 {
867
+ − 2794 const Ibyte *ptr;
771
+ − 2795 Bytecount len;
+ − 2796
+ − 2797 if (source_type == DFC_TYPE_LISP_STRING)
+ − 2798 {
+ − 2799 ptr = XSTRING_DATA (source->lisp_object);
+ − 2800 len = XSTRING_LENGTH (source->lisp_object);
+ − 2801 }
+ − 2802 else
+ − 2803 {
867
+ − 2804 ptr = (Ibyte *) source->data.ptr;
771
+ − 2805 len = source->data.len;
+ − 2806 }
+ − 2807
+ − 2808 #ifdef MULE
+ − 2809 {
867
+ − 2810 const Ibyte *end;
771
+ − 2811 for (end = ptr + len; ptr < end;)
+ − 2812 {
867
+ − 2813 Ibyte c =
826
+ − 2814 (byte_ascii_p (*ptr)) ? *ptr :
771
+ − 2815 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
+ − 2816 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
+ − 2817 '~';
+ − 2818
+ − 2819 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
867
+ − 2820 INC_IBYTEPTR (ptr);
771
+ − 2821 }
800
+ − 2822 text_checking_assert (ptr == end);
771
+ − 2823 }
+ − 2824 #else
+ − 2825 Dynarr_add_many (conversion_out_dynarr, ptr, len);
+ − 2826 #endif
+ − 2827
+ − 2828 }
+ − 2829 #ifdef HAVE_WIN32_CODING_SYSTEMS
+ − 2830 /* Optimize the common case involving Unicode where only ASCII is involved */
+ − 2831 else if (source_type != DFC_TYPE_LISP_LSTREAM &&
+ − 2832 sink_type != DFC_TYPE_LISP_LSTREAM &&
+ − 2833 dfc_coding_system_is_unicode (coding_system))
+ − 2834 {
867
+ − 2835 const Ibyte *ptr, *p;
771
+ − 2836 Bytecount len;
867
+ − 2837 const Ibyte *end;
771
+ − 2838
+ − 2839 if (source_type == DFC_TYPE_LISP_STRING)
+ − 2840 {
+ − 2841 ptr = XSTRING_DATA (source->lisp_object);
+ − 2842 len = XSTRING_LENGTH (source->lisp_object);
+ − 2843 }
+ − 2844 else
+ − 2845 {
867
+ − 2846 ptr = (Ibyte *) source->data.ptr;
771
+ − 2847 len = source->data.len;
+ − 2848 }
+ − 2849 end = ptr + len;
+ − 2850
+ − 2851 for (p = ptr; p < end; p++)
+ − 2852 {
826
+ − 2853 if (!byte_ascii_p (*p))
771
+ − 2854 goto the_hard_way;
+ − 2855 }
+ − 2856
+ − 2857 for (p = ptr; p < end; p++)
+ − 2858 {
+ − 2859 Dynarr_add (conversion_out_dynarr, (Extbyte) (*p));
+ − 2860 Dynarr_add (conversion_out_dynarr, (Extbyte) '\0');
+ − 2861 }
+ − 2862 }
+ − 2863 #endif /* HAVE_WIN32_CODING_SYSTEMS */
+ − 2864 else
+ − 2865 {
+ − 2866 Lisp_Object streams_to_delete[3];
+ − 2867 int delete_count;
+ − 2868 Lisp_Object instream, outstream;
+ − 2869 Lstream *reader, *writer;
+ − 2870 struct gcpro gcpro1, gcpro2;
+ − 2871
+ − 2872 #ifdef HAVE_WIN32_CODING_SYSTEMS
+ − 2873 the_hard_way:
+ − 2874 #endif /* HAVE_WIN32_CODING_SYSTEMS */
+ − 2875 delete_count = 0;
+ − 2876 if (source_type == DFC_TYPE_LISP_LSTREAM)
+ − 2877 instream = source->lisp_object;
+ − 2878 else if (source_type == DFC_TYPE_DATA)
+ − 2879 streams_to_delete[delete_count++] = instream =
+ − 2880 make_fixed_buffer_input_stream (source->data.ptr, source->data.len);
+ − 2881 else
+ − 2882 {
+ − 2883 type_checking_assert (source_type == DFC_TYPE_LISP_STRING);
+ − 2884 streams_to_delete[delete_count++] = instream =
+ − 2885 /* This will GCPRO the Lisp string */
+ − 2886 make_lisp_string_input_stream (source->lisp_object, 0, -1);
+ − 2887 }
+ − 2888
+ − 2889 if (sink_type == DFC_TYPE_LISP_LSTREAM)
+ − 2890 outstream = sink->lisp_object;
+ − 2891 else
+ − 2892 {
+ − 2893 type_checking_assert (sink_type == DFC_TYPE_DATA);
+ − 2894 streams_to_delete[delete_count++] = outstream =
+ − 2895 make_dynarr_output_stream
+ − 2896 ((unsigned_char_dynarr *) conversion_out_dynarr);
+ − 2897 }
+ − 2898
+ − 2899 streams_to_delete[delete_count++] = outstream =
800
+ − 2900 make_coding_output_stream (XLSTREAM (outstream), coding_system,
+ − 2901 CODING_ENCODE, 0);
771
+ − 2902
+ − 2903 reader = XLSTREAM (instream);
+ − 2904 writer = XLSTREAM (outstream);
+ − 2905 /* decoding_stream will gc-protect outstream */
+ − 2906 GCPRO2 (instream, outstream);
+ − 2907
+ − 2908 while (1)
+ − 2909 {
+ − 2910 Bytecount size_in_bytes;
+ − 2911 char tempbuf[1024]; /* some random amount */
+ − 2912
+ − 2913 size_in_bytes = Lstream_read (reader, tempbuf, sizeof (tempbuf));
+ − 2914
+ − 2915 if (size_in_bytes == 0)
+ − 2916 break;
+ − 2917 else if (size_in_bytes < 0)
+ − 2918 signal_error (Qtext_conversion_error,
+ − 2919 "Error converting to external format", Qunbound);
+ − 2920
+ − 2921 if (Lstream_write (writer, tempbuf, size_in_bytes) < 0)
+ − 2922 signal_error (Qtext_conversion_error,
+ − 2923 "Error converting to external format", Qunbound);
+ − 2924 }
+ − 2925
+ − 2926 /* Closing writer will close any stream at the other end of writer. */
+ − 2927 Lstream_close (writer);
+ − 2928 Lstream_close (reader);
+ − 2929 UNGCPRO;
+ − 2930
+ − 2931 /* The idea is that this function will create no garbage. */
+ − 2932 while (delete_count)
+ − 2933 Lstream_delete (XLSTREAM (streams_to_delete [--delete_count]));
+ − 2934 }
+ − 2935
+ − 2936 unbind_to (count);
+ − 2937
+ − 2938 if (sink_type != DFC_TYPE_LISP_LSTREAM)
+ − 2939 {
+ − 2940 sink->data.len = Dynarr_length (conversion_out_dynarr);
+ − 2941 /* double zero-extend because we may be dealing with Unicode data */
+ − 2942 Dynarr_add (conversion_out_dynarr, '\0');
+ − 2943 Dynarr_add (conversion_out_dynarr, '\0');
+ − 2944 sink->data.ptr = Dynarr_atp (conversion_out_dynarr, 0);
+ − 2945 }
+ − 2946 }
+ − 2947
+ − 2948 void
+ − 2949 dfc_convert_to_internal_format (dfc_conversion_type source_type,
+ − 2950 dfc_conversion_data *source,
+ − 2951 Lisp_Object coding_system,
+ − 2952 dfc_conversion_type sink_type,
+ − 2953 dfc_conversion_data *sink)
+ − 2954 {
+ − 2955 /* It's guaranteed that many callers are not prepared for GC here,
+ − 2956 esp. given that this code conversion occurs in many very hidden
+ − 2957 places. */
+ − 2958 int count = begin_gc_forbidden ();
867
+ − 2959 Ibyte_dynarr *conversion_in_dynarr;
771
+ − 2960
+ − 2961 type_checking_assert
+ − 2962 ((source_type == DFC_TYPE_DATA ||
+ − 2963 source_type == DFC_TYPE_LISP_LSTREAM)
+ − 2964 &&
+ − 2965 (sink_type == DFC_TYPE_DATA ||
+ − 2966 sink_type == DFC_TYPE_LISP_LSTREAM));
+ − 2967
+ − 2968 if (Dynarr_length (conversion_in_dynarr_list) <=
+ − 2969 dfc_convert_to_internal_format_in_use)
867
+ − 2970 Dynarr_add (conversion_in_dynarr_list, Dynarr_new (Ibyte));
771
+ − 2971 conversion_in_dynarr = Dynarr_at (conversion_in_dynarr_list,
+ − 2972 dfc_convert_to_internal_format_in_use);
+ − 2973 Dynarr_reset (conversion_in_dynarr);
+ − 2974
853
+ − 2975 internal_bind_int (&dfc_convert_to_internal_format_in_use,
+ − 2976 dfc_convert_to_internal_format_in_use + 1);
+ − 2977
771
+ − 2978 coding_system = get_coding_system_for_text_file (coding_system, 1);
+ − 2979
+ − 2980 if (source_type != DFC_TYPE_LISP_LSTREAM &&
+ − 2981 sink_type != DFC_TYPE_LISP_LSTREAM &&
+ − 2982 coding_system_is_binary (coding_system))
+ − 2983 {
+ − 2984 #ifdef MULE
867
+ − 2985 const Ibyte *ptr = (const Ibyte *) source->data.ptr;
771
+ − 2986 Bytecount len = source->data.len;
867
+ − 2987 const Ibyte *end = ptr + len;
771
+ − 2988
+ − 2989 for (; ptr < end; ptr++)
+ − 2990 {
867
+ − 2991 Ibyte c = *ptr;
771
+ − 2992
826
+ − 2993 if (byte_ascii_p (c))
771
+ − 2994 Dynarr_add (conversion_in_dynarr, c);
826
+ − 2995 else if (byte_c1_p (c))
771
+ − 2996 {
+ − 2997 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_CONTROL_1);
+ − 2998 Dynarr_add (conversion_in_dynarr, c + 0x20);
+ − 2999 }
+ − 3000 else
+ − 3001 {
+ − 3002 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_LATIN_ISO8859_1);
+ − 3003 Dynarr_add (conversion_in_dynarr, c);
+ − 3004 }
+ − 3005 }
+ − 3006 #else
+ − 3007 Dynarr_add_many (conversion_in_dynarr, source->data.ptr, source->data.len);
+ − 3008 #endif
+ − 3009 }
+ − 3010 #ifdef HAVE_WIN32_CODING_SYSTEMS
+ − 3011 /* Optimize the common case involving Unicode where only ASCII/Latin-1 is involved */
+ − 3012 else if (source_type != DFC_TYPE_LISP_LSTREAM &&
+ − 3013 sink_type != DFC_TYPE_LISP_LSTREAM &&
+ − 3014 dfc_coding_system_is_unicode (coding_system))
+ − 3015 {
867
+ − 3016 const Ibyte *ptr = (const Ibyte *) source->data.ptr + 1;
771
+ − 3017 Bytecount len = source->data.len;
867
+ − 3018 const Ibyte *end = ptr + len;
771
+ − 3019
+ − 3020 if (len & 1)
+ − 3021 goto the_hard_way;
+ − 3022
+ − 3023 for (; ptr < end; ptr += 2)
+ − 3024 {
+ − 3025 if (*ptr)
+ − 3026 goto the_hard_way;
+ − 3027 }
+ − 3028
867
+ − 3029 ptr = (const Ibyte *) source->data.ptr;
771
+ − 3030 end = ptr + len;
+ − 3031
+ − 3032 for (; ptr < end; ptr += 2)
+ − 3033 {
867
+ − 3034 Ibyte c = *ptr;
771
+ − 3035
826
+ − 3036 if (byte_ascii_p (c))
771
+ − 3037 Dynarr_add (conversion_in_dynarr, c);
+ − 3038 #ifdef MULE
826
+ − 3039 else if (byte_c1_p (c))
771
+ − 3040 {
+ − 3041 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_CONTROL_1);
+ − 3042 Dynarr_add (conversion_in_dynarr, c + 0x20);
+ − 3043 }
+ − 3044 else
+ − 3045 {
+ − 3046 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_LATIN_ISO8859_1);
+ − 3047 Dynarr_add (conversion_in_dynarr, c);
+ − 3048 }
+ − 3049 #endif /* MULE */
+ − 3050 }
+ − 3051 }
+ − 3052 #endif /* HAVE_WIN32_CODING_SYSTEMS */
+ − 3053 else
+ − 3054 {
+ − 3055 Lisp_Object streams_to_delete[3];
+ − 3056 int delete_count;
+ − 3057 Lisp_Object instream, outstream;
+ − 3058 Lstream *reader, *writer;
+ − 3059 struct gcpro gcpro1, gcpro2;
+ − 3060
+ − 3061 #ifdef HAVE_WIN32_CODING_SYSTEMS
+ − 3062 the_hard_way:
+ − 3063 #endif /* HAVE_WIN32_CODING_SYSTEMS */
+ − 3064 delete_count = 0;
+ − 3065 if (source_type == DFC_TYPE_LISP_LSTREAM)
+ − 3066 instream = source->lisp_object;
+ − 3067 else
+ − 3068 {
+ − 3069 type_checking_assert (source_type == DFC_TYPE_DATA);
+ − 3070 streams_to_delete[delete_count++] = instream =
+ − 3071 make_fixed_buffer_input_stream (source->data.ptr, source->data.len);
+ − 3072 }
+ − 3073
+ − 3074 if (sink_type == DFC_TYPE_LISP_LSTREAM)
+ − 3075 outstream = sink->lisp_object;
+ − 3076 else
+ − 3077 {
+ − 3078 type_checking_assert (sink_type == DFC_TYPE_DATA);
+ − 3079 streams_to_delete[delete_count++] = outstream =
+ − 3080 make_dynarr_output_stream
+ − 3081 ((unsigned_char_dynarr *) conversion_in_dynarr);
+ − 3082 }
+ − 3083
+ − 3084 streams_to_delete[delete_count++] = outstream =
800
+ − 3085 make_coding_output_stream (XLSTREAM (outstream), coding_system,
+ − 3086 CODING_DECODE, 0);
771
+ − 3087
+ − 3088 reader = XLSTREAM (instream);
+ − 3089 writer = XLSTREAM (outstream);
+ − 3090 /* outstream will gc-protect its sink stream, if necessary */
+ − 3091 GCPRO2 (instream, outstream);
+ − 3092
+ − 3093 while (1)
+ − 3094 {
+ − 3095 Bytecount size_in_bytes;
+ − 3096 char tempbuf[1024]; /* some random amount */
+ − 3097
+ − 3098 size_in_bytes = Lstream_read (reader, tempbuf, sizeof (tempbuf));
+ − 3099
+ − 3100 if (size_in_bytes == 0)
+ − 3101 break;
+ − 3102 else if (size_in_bytes < 0)
+ − 3103 signal_error (Qtext_conversion_error,
+ − 3104 "Error converting to internal format", Qunbound);
+ − 3105
+ − 3106 if (Lstream_write (writer, tempbuf, size_in_bytes) < 0)
+ − 3107 signal_error (Qtext_conversion_error,
+ − 3108 "Error converting to internal format", Qunbound);
+ − 3109 }
+ − 3110
+ − 3111 /* Closing writer will close any stream at the other end of writer. */
+ − 3112 Lstream_close (writer);
+ − 3113 Lstream_close (reader);
+ − 3114 UNGCPRO;
+ − 3115
+ − 3116 /* The idea is that this function will create no garbage. */
+ − 3117 while (delete_count)
+ − 3118 Lstream_delete (XLSTREAM (streams_to_delete [--delete_count]));
+ − 3119 }
+ − 3120
+ − 3121 unbind_to (count);
+ − 3122
+ − 3123 if (sink_type != DFC_TYPE_LISP_LSTREAM)
+ − 3124 {
+ − 3125 sink->data.len = Dynarr_length (conversion_in_dynarr);
+ − 3126 Dynarr_add (conversion_in_dynarr, '\0'); /* remember to NUL-terminate! */
+ − 3127 /* The macros don't currently distinguish between internal and
+ − 3128 external sinks, and allocate and copy two extra bytes in both
+ − 3129 cases. So we add a second zero, just like for external data
+ − 3130 (in that case, because we may be converting to Unicode). */
+ − 3131 Dynarr_add (conversion_in_dynarr, '\0');
+ − 3132 sink->data.ptr = Dynarr_atp (conversion_in_dynarr, 0);
+ − 3133 }
+ − 3134 }
+ − 3135
+ − 3136
+ − 3137 /************************************************************************/
867
+ − 3138 /* Basic Ichar functions */
771
+ − 3139 /************************************************************************/
+ − 3140
+ − 3141 #ifdef MULE
+ − 3142
+ − 3143 /* Convert a non-ASCII Mule character C into a one-character Mule-encoded
+ − 3144 string in STR. Returns the number of bytes stored.
867
+ − 3145 Do not call this directly. Use the macro set_itext_ichar() instead.
771
+ − 3146 */
+ − 3147
+ − 3148 Bytecount
867
+ − 3149 non_ascii_set_itext_ichar (Ibyte *str, Ichar c)
771
+ − 3150 {
867
+ − 3151 Ibyte *p;
+ − 3152 Ibyte lb;
771
+ − 3153 int c1, c2;
+ − 3154 Lisp_Object charset;
+ − 3155
+ − 3156 p = str;
867
+ − 3157 BREAKUP_ICHAR (c, charset, c1, c2);
+ − 3158 lb = ichar_leading_byte (c);
826
+ − 3159 if (leading_byte_private_p (lb))
+ − 3160 *p++ = private_leading_byte_prefix (lb);
771
+ − 3161 *p++ = lb;
+ − 3162 if (EQ (charset, Vcharset_control_1))
+ − 3163 c1 += 0x20;
+ − 3164 *p++ = c1 | 0x80;
+ − 3165 if (c2)
+ − 3166 *p++ = c2 | 0x80;
+ − 3167
+ − 3168 return (p - str);
+ − 3169 }
+ − 3170
+ − 3171 /* Return the first character from a Mule-encoded string in STR,
+ − 3172 assuming it's non-ASCII. Do not call this directly.
867
+ − 3173 Use the macro itext_ichar() instead. */
+ − 3174
+ − 3175 Ichar
+ − 3176 non_ascii_itext_ichar (const Ibyte *str)
771
+ − 3177 {
867
+ − 3178 Ibyte i0 = *str, i1, i2 = 0;
771
+ − 3179 Lisp_Object charset;
+ − 3180
+ − 3181 if (i0 == LEADING_BYTE_CONTROL_1)
867
+ − 3182 return (Ichar) (*++str - 0x20);
771
+ − 3183
826
+ − 3184 if (leading_byte_prefix_p (i0))
771
+ − 3185 i0 = *++str;
+ − 3186
+ − 3187 i1 = *++str & 0x7F;
+ − 3188
826
+ − 3189 charset = charset_by_leading_byte (i0);
771
+ − 3190 if (XCHARSET_DIMENSION (charset) == 2)
+ − 3191 i2 = *++str & 0x7F;
+ − 3192
867
+ − 3193 return make_ichar (charset, i1, i2);
771
+ − 3194 }
+ − 3195
867
+ − 3196 /* Return whether CH is a valid Ichar, assuming it's non-ASCII.
+ − 3197 Do not call this directly. Use the macro valid_ichar_p() instead. */
771
+ − 3198
+ − 3199 int
867
+ − 3200 non_ascii_valid_ichar_p (Ichar ch)
771
+ − 3201 {
+ − 3202 int f1, f2, f3;
+ − 3203
+ − 3204 /* Must have only lowest 19 bits set */
+ − 3205 if (ch & ~0x7FFFF)
+ − 3206 return 0;
+ − 3207
867
+ − 3208 f1 = ichar_field1 (ch);
+ − 3209 f2 = ichar_field2 (ch);
+ − 3210 f3 = ichar_field3 (ch);
771
+ − 3211
+ − 3212 if (f1 == 0)
+ − 3213 {
+ − 3214 /* dimension-1 char */
+ − 3215 Lisp_Object charset;
+ − 3216
+ − 3217 /* leading byte must be correct */
867
+ − 3218 if (f2 < MIN_ICHAR_FIELD2_OFFICIAL ||
+ − 3219 (f2 > MAX_ICHAR_FIELD2_OFFICIAL && f2 < MIN_ICHAR_FIELD2_PRIVATE) ||
+ − 3220 f2 > MAX_ICHAR_FIELD2_PRIVATE)
771
+ − 3221 return 0;
+ − 3222 /* octet not out of range */
+ − 3223 if (f3 < 0x20)
+ − 3224 return 0;
+ − 3225 /* charset exists */
+ − 3226 /*
+ − 3227 NOTE: This takes advantage of the fact that
+ − 3228 FIELD2_TO_OFFICIAL_LEADING_BYTE and
+ − 3229 FIELD2_TO_PRIVATE_LEADING_BYTE are the same.
+ − 3230 */
826
+ − 3231 charset = charset_by_leading_byte (f2 + FIELD2_TO_OFFICIAL_LEADING_BYTE);
771
+ − 3232 if (EQ (charset, Qnil))
+ − 3233 return 0;
+ − 3234 /* check range as per size (94 or 96) of charset */
+ − 3235 return ((f3 > 0x20 && f3 < 0x7f) || XCHARSET_CHARS (charset) == 96);
+ − 3236 }
+ − 3237 else
+ − 3238 {
+ − 3239 /* dimension-2 char */
+ − 3240 Lisp_Object charset;
+ − 3241
+ − 3242 /* leading byte must be correct */
867
+ − 3243 if (f1 < MIN_ICHAR_FIELD1_OFFICIAL ||
+ − 3244 (f1 > MAX_ICHAR_FIELD1_OFFICIAL && f1 < MIN_ICHAR_FIELD1_PRIVATE) ||
+ − 3245 f1 > MAX_ICHAR_FIELD1_PRIVATE)
771
+ − 3246 return 0;
+ − 3247 /* octets not out of range */
+ − 3248 if (f2 < 0x20 || f3 < 0x20)
+ − 3249 return 0;
+ − 3250
+ − 3251 #ifdef ENABLE_COMPOSITE_CHARS
+ − 3252 if (f1 + FIELD1_TO_OFFICIAL_LEADING_BYTE == LEADING_BYTE_COMPOSITE)
+ − 3253 {
+ − 3254 if (UNBOUNDP (Fgethash (make_int (ch),
+ − 3255 Vcomposite_char_char2string_hash_table,
+ − 3256 Qunbound)))
+ − 3257 return 0;
+ − 3258 return 1;
+ − 3259 }
+ − 3260 #endif /* ENABLE_COMPOSITE_CHARS */
+ − 3261
+ − 3262 /* charset exists */
867
+ − 3263 if (f1 <= MAX_ICHAR_FIELD1_OFFICIAL)
771
+ − 3264 charset =
826
+ − 3265 charset_by_leading_byte (f1 + FIELD1_TO_OFFICIAL_LEADING_BYTE);
771
+ − 3266 else
+ − 3267 charset =
826
+ − 3268 charset_by_leading_byte (f1 + FIELD1_TO_PRIVATE_LEADING_BYTE);
771
+ − 3269
+ − 3270 if (EQ (charset, Qnil))
+ − 3271 return 0;
+ − 3272 /* check range as per size (94x94 or 96x96) of charset */
+ − 3273 return ((f2 != 0x20 && f2 != 0x7F && f3 != 0x20 && f3 != 0x7F) ||
+ − 3274 XCHARSET_CHARS (charset) == 96);
+ − 3275 }
+ − 3276 }
+ − 3277
+ − 3278 /* Copy the character pointed to by SRC into DST. Do not call this
867
+ − 3279 directly. Use the macro itext_copy_ichar() instead.
771
+ − 3280 Return the number of bytes copied. */
+ − 3281
+ − 3282 Bytecount
867
+ − 3283 non_ascii_itext_copy_ichar (const Ibyte *src, Ibyte *dst)
771
+ − 3284 {
826
+ − 3285 Bytecount bytes = rep_bytes_by_first_byte (*src);
771
+ − 3286 Bytecount i;
+ − 3287 for (i = bytes; i; i--, dst++, src++)
+ − 3288 *dst = *src;
+ − 3289 return bytes;
+ − 3290 }
+ − 3291
+ − 3292 #endif /* MULE */
+ − 3293
+ − 3294
+ − 3295 /************************************************************************/
867
+ − 3296 /* streams of Ichars */
771
+ − 3297 /************************************************************************/
+ − 3298
+ − 3299 #ifdef MULE
+ − 3300
867
+ − 3301 /* Treat a stream as a stream of Ichar's rather than a stream of bytes.
771
+ − 3302 The functions below are not meant to be called directly; use
+ − 3303 the macros in insdel.h. */
+ − 3304
867
+ − 3305 Ichar
+ − 3306 Lstream_get_ichar_1 (Lstream *stream, int ch)
771
+ − 3307 {
867
+ − 3308 Ibyte str[MAX_ICHAR_LEN];
+ − 3309 Ibyte *strptr = str;
771
+ − 3310 Bytecount bytes;
+ − 3311
867
+ − 3312 str[0] = (Ibyte) ch;
771
+ − 3313
826
+ − 3314 for (bytes = rep_bytes_by_first_byte (ch) - 1; bytes; bytes--)
771
+ − 3315 {
+ − 3316 int c = Lstream_getc (stream);
800
+ − 3317 text_checking_assert (c >= 0);
867
+ − 3318 *++strptr = (Ibyte) c;
771
+ − 3319 }
867
+ − 3320 return itext_ichar (str);
771
+ − 3321 }
+ − 3322
+ − 3323 int
867
+ − 3324 Lstream_fput_ichar (Lstream *stream, Ichar ch)
771
+ − 3325 {
867
+ − 3326 Ibyte str[MAX_ICHAR_LEN];
+ − 3327 Bytecount len = set_itext_ichar (str, ch);
771
+ − 3328 return Lstream_write (stream, str, len);
+ − 3329 }
+ − 3330
+ − 3331 void
867
+ − 3332 Lstream_funget_ichar (Lstream *stream, Ichar ch)
771
+ − 3333 {
867
+ − 3334 Ibyte str[MAX_ICHAR_LEN];
+ − 3335 Bytecount len = set_itext_ichar (str, ch);
771
+ − 3336 Lstream_unread (stream, str, len);
+ − 3337 }
+ − 3338
+ − 3339 #endif /* MULE */
+ − 3340
+ − 3341
+ − 3342 /************************************************************************/
+ − 3343 /* Lisp primitives for working with characters */
+ − 3344 /************************************************************************/
+ − 3345
+ − 3346 DEFUN ("make-char", Fmake_char, 2, 3, 0, /*
+ − 3347 Make a character from CHARSET and octets ARG1 and ARG2.
+ − 3348 ARG2 is required only for characters from two-dimensional charsets.
+ − 3349
+ − 3350 Each octet should be in the range 32 through 127 for a 96 or 96x96
+ − 3351 charset and 33 through 126 for a 94 or 94x94 charset. (Most charsets
+ − 3352 are either 96 or 94x94.) Note that this is 32 more than the values
+ − 3353 typically given for 94x94 charsets. When two octets are required, the
+ − 3354 order is "standard" -- the same as appears in ISO-2022 encodings,
+ − 3355 reference tables, etc.
+ − 3356
+ − 3357 \(Note the following non-obvious result: Computerized translation
+ − 3358 tables often encode the two octets as the high and low bytes,
+ − 3359 respectively, of a hex short, while when there's only one octet, it
+ − 3360 goes in the low byte. When decoding such a value, you need to treat
+ − 3361 the two cases differently when calling make-char: One is (make-char
+ − 3362 CHARSET HIGH LOW), the other is (make-char CHARSET LOW).)
+ − 3363
+ − 3364 For example, (make-char 'latin-iso8859-2 185) or (make-char
+ − 3365 'latin-iso8859-2 57) will return the Latin 2 character s with caron.
+ − 3366
+ − 3367 As another example, the Japanese character for "kawa" (stream), which
+ − 3368 looks something like this:
+ − 3369
+ − 3370 | |
+ − 3371 | | |
+ − 3372 | | |
+ − 3373 | | |
+ − 3374 / |
+ − 3375
+ − 3376 appears in the Unicode Standard (version 2.0) on page 7-287 with the
+ − 3377 following values (see also page 7-4):
+ − 3378
+ − 3379 U 5DDD (Unicode)
+ − 3380 G 0-2008 (GB 2312-80)
+ − 3381 J 0-3278 (JIS X 0208-1990)
+ − 3382 K 0-8425 (KS C 5601-1987)
+ − 3383 B A474 (Big Five)
+ − 3384 C 1-4455 (CNS 11643-1986 (1st plane))
+ − 3385 A 213C34 (ANSI Z39.64-1989)
+ − 3386
+ − 3387 These are equivalent to:
+ − 3388
+ − 3389 \(make-char 'chinese-gb2312 52 40)
+ − 3390 \(make-char 'japanese-jisx0208 64 110)
+ − 3391 \(make-char 'korean-ksc5601 116 57)
+ − 3392 \(make-char 'chinese-cns11643-1 76 87)
+ − 3393 \(decode-big5-char '(164 . 116))
+ − 3394
+ − 3395 \(All codes above are two decimal numbers except for Big Five and ANSI
+ − 3396 Z39.64, which we don't support. We add 32 to each of the decimal
+ − 3397 numbers. Big Five is split in a rather hackish fashion into two
+ − 3398 charsets, `big5-1' and `big5-2', due to its excessive size -- 94x157,
+ − 3399 with the first codepoint in the range 0xA1 to 0xFE and the second in
+ − 3400 the range 0x40 to 0x7E or 0xA1 to 0xFE. `decode-big5-char' is used to
+ − 3401 generate the char from its codes, and `encode-big5-char' extracts the
+ − 3402 codes.)
+ − 3403
+ − 3404 When compiled without MULE, this function does not do much, but it's
+ − 3405 provided for compatibility. In this case, the following CHARSET symbols
+ − 3406 are allowed:
+ − 3407
+ − 3408 `ascii' -- ARG1 should be in the range 0 through 127.
+ − 3409 `control-1' -- ARG1 should be in the range 128 through 159.
+ − 3410 else -- ARG1 is coerced to be between 0 and 255, and then the high
+ − 3411 bit is set.
+ − 3412
+ − 3413 `int-to-char of the resulting ARG1' is returned, and ARG2 is always ignored.
+ − 3414 */
+ − 3415 (charset, arg1, arg2))
+ − 3416 {
+ − 3417 #ifdef MULE
+ − 3418 Lisp_Charset *cs;
+ − 3419 int a1, a2;
+ − 3420 int lowlim, highlim;
+ − 3421
+ − 3422 charset = Fget_charset (charset);
+ − 3423 cs = XCHARSET (charset);
+ − 3424
788
+ − 3425 get_charset_limits (charset, &lowlim, &highlim);
771
+ − 3426
+ − 3427 CHECK_INT (arg1);
+ − 3428 /* It is useful (and safe, according to Olivier Galibert) to strip
+ − 3429 the 8th bit off ARG1 and ARG2 because it allows programmers to
+ − 3430 write (make-char 'latin-iso8859-2 CODE) where code is the actual
+ − 3431 Latin 2 code of the character. */
+ − 3432 a1 = XINT (arg1) & 0x7f;
+ − 3433 if (a1 < lowlim || a1 > highlim)
+ − 3434 args_out_of_range_3 (arg1, make_int (lowlim), make_int (highlim));
+ − 3435
+ − 3436 if (CHARSET_DIMENSION (cs) == 1)
+ − 3437 {
+ − 3438 if (!NILP (arg2))
+ − 3439 invalid_argument
+ − 3440 ("Charset is of dimension one; second octet must be nil", arg2);
867
+ − 3441 return make_char (make_ichar (charset, a1, 0));
771
+ − 3442 }
+ − 3443
+ − 3444 CHECK_INT (arg2);
+ − 3445 a2 = XINT (arg2) & 0x7f;
+ − 3446 if (a2 < lowlim || a2 > highlim)
+ − 3447 args_out_of_range_3 (arg2, make_int (lowlim), make_int (highlim));
+ − 3448
867
+ − 3449 return make_char (make_ichar (charset, a1, a2));
771
+ − 3450 #else
+ − 3451 int a1;
+ − 3452 int lowlim, highlim;
+ − 3453
+ − 3454 if (EQ (charset, Qascii)) lowlim = 0, highlim = 127;
+ − 3455 else if (EQ (charset, Qcontrol_1)) lowlim = 0, highlim = 31;
+ − 3456 else lowlim = 0, highlim = 127;
+ − 3457
+ − 3458 CHECK_INT (arg1);
+ − 3459 /* It is useful (and safe, according to Olivier Galibert) to strip
+ − 3460 the 8th bit off ARG1 and ARG2 because it allows programmers to
+ − 3461 write (make-char 'latin-iso8859-2 CODE) where code is the actual
+ − 3462 Latin 2 code of the character. */
+ − 3463 a1 = XINT (arg1) & 0x7f;
+ − 3464 if (a1 < lowlim || a1 > highlim)
+ − 3465 args_out_of_range_3 (arg1, make_int (lowlim), make_int (highlim));
+ − 3466
+ − 3467 if (EQ (charset, Qascii))
+ − 3468 return make_char (a1);
+ − 3469 return make_char (a1 + 128);
+ − 3470 #endif /* MULE */
+ − 3471 }
+ − 3472
+ − 3473 #ifdef MULE
+ − 3474
+ − 3475 DEFUN ("char-charset", Fchar_charset, 1, 1, 0, /*
+ − 3476 Return the character set of char CH.
+ − 3477 */
+ − 3478 (ch))
+ − 3479 {
+ − 3480 CHECK_CHAR_COERCE_INT (ch);
+ − 3481
826
+ − 3482 return XCHARSET_NAME (charset_by_leading_byte
867
+ − 3483 (ichar_leading_byte (XCHAR (ch))));
771
+ − 3484 }
+ − 3485
+ − 3486 DEFUN ("char-octet", Fchar_octet, 1, 2, 0, /*
+ − 3487 Return the octet numbered N (should be 0 or 1) of char CH.
+ − 3488 N defaults to 0 if omitted.
+ − 3489 */
+ − 3490 (ch, n))
+ − 3491 {
+ − 3492 Lisp_Object charset;
+ − 3493 int octet0, octet1;
+ − 3494
+ − 3495 CHECK_CHAR_COERCE_INT (ch);
+ − 3496
867
+ − 3497 BREAKUP_ICHAR (XCHAR (ch), charset, octet0, octet1);
771
+ − 3498
+ − 3499 if (NILP (n) || EQ (n, Qzero))
+ − 3500 return make_int (octet0);
+ − 3501 else if (EQ (n, make_int (1)))
+ − 3502 return make_int (octet1);
+ − 3503 else
+ − 3504 invalid_constant ("Octet number must be 0 or 1", n);
+ − 3505 }
+ − 3506
+ − 3507 DEFUN ("split-char", Fsplit_char, 1, 1, 0, /*
+ − 3508 Return list of charset and one or two position-codes of CHAR.
+ − 3509 */
+ − 3510 (character))
+ − 3511 {
+ − 3512 /* This function can GC */
+ − 3513 struct gcpro gcpro1, gcpro2;
+ − 3514 Lisp_Object charset = Qnil;
+ − 3515 Lisp_Object rc = Qnil;
+ − 3516 int c1, c2;
+ − 3517
+ − 3518 GCPRO2 (charset, rc);
+ − 3519 CHECK_CHAR_COERCE_INT (character);
+ − 3520
867
+ − 3521 BREAKUP_ICHAR (XCHAR (character), charset, c1, c2);
771
+ − 3522
+ − 3523 if (XCHARSET_DIMENSION (Fget_charset (charset)) == 2)
+ − 3524 {
+ − 3525 rc = list3 (XCHARSET_NAME (charset), make_int (c1), make_int (c2));
+ − 3526 }
+ − 3527 else
+ − 3528 {
+ − 3529 rc = list2 (XCHARSET_NAME (charset), make_int (c1));
+ − 3530 }
+ − 3531 UNGCPRO;
+ − 3532
+ − 3533 return rc;
+ − 3534 }
+ − 3535
+ − 3536 #endif /* MULE */
+ − 3537
+ − 3538
+ − 3539 /************************************************************************/
+ − 3540 /* composite character functions */
+ − 3541 /************************************************************************/
+ − 3542
+ − 3543 #ifdef ENABLE_COMPOSITE_CHARS
+ − 3544
867
+ − 3545 Ichar
+ − 3546 lookup_composite_char (Ibyte *str, int len)
771
+ − 3547 {
+ − 3548 Lisp_Object lispstr = make_string (str, len);
+ − 3549 Lisp_Object ch = Fgethash (lispstr,
+ − 3550 Vcomposite_char_string2char_hash_table,
+ − 3551 Qunbound);
867
+ − 3552 Ichar emch;
771
+ − 3553
+ − 3554 if (UNBOUNDP (ch))
+ − 3555 {
+ − 3556 if (composite_char_row_next >= 128)
+ − 3557 invalid_operation ("No more composite chars available", lispstr);
867
+ − 3558 emch = make_ichar (Vcharset_composite, composite_char_row_next,
771
+ − 3559 composite_char_col_next);
+ − 3560 Fputhash (make_char (emch), lispstr,
+ − 3561 Vcomposite_char_char2string_hash_table);
+ − 3562 Fputhash (lispstr, make_char (emch),
+ − 3563 Vcomposite_char_string2char_hash_table);
+ − 3564 composite_char_col_next++;
+ − 3565 if (composite_char_col_next >= 128)
+ − 3566 {
+ − 3567 composite_char_col_next = 32;
+ − 3568 composite_char_row_next++;
+ − 3569 }
+ − 3570 }
+ − 3571 else
+ − 3572 emch = XCHAR (ch);
+ − 3573 return emch;
+ − 3574 }
+ − 3575
+ − 3576 Lisp_Object
867
+ − 3577 composite_char_string (Ichar ch)
771
+ − 3578 {
+ − 3579 Lisp_Object str = Fgethash (make_char (ch),
+ − 3580 Vcomposite_char_char2string_hash_table,
+ − 3581 Qunbound);
+ − 3582 assert (!UNBOUNDP (str));
+ − 3583 return str;
+ − 3584 }
+ − 3585
826
+ − 3586 DEFUN ("make-composite-char", Fmake_composite_char, 1, 1, 0, /*
771
+ − 3587 Convert a string into a single composite character.
+ − 3588 The character is the result of overstriking all the characters in
+ − 3589 the string.
+ − 3590 */
+ − 3591 (string))
+ − 3592 {
+ − 3593 CHECK_STRING (string);
+ − 3594 return make_char (lookup_composite_char (XSTRING_DATA (string),
+ − 3595 XSTRING_LENGTH (string)));
+ − 3596 }
+ − 3597
826
+ − 3598 DEFUN ("composite-char-string", Fcomposite_char_string, 1, 1, 0, /*
771
+ − 3599 Return a string of the characters comprising a composite character.
+ − 3600 */
+ − 3601 (ch))
+ − 3602 {
867
+ − 3603 Ichar emch;
771
+ − 3604
+ − 3605 CHECK_CHAR (ch);
+ − 3606 emch = XCHAR (ch);
867
+ − 3607 if (ichar_leading_byte (emch) != LEADING_BYTE_COMPOSITE)
771
+ − 3608 invalid_argument ("Must be composite char", ch);
+ − 3609 return composite_char_string (emch);
+ − 3610 }
+ − 3611 #endif /* ENABLE_COMPOSITE_CHARS */
+ − 3612
+ − 3613
+ − 3614 /************************************************************************/
+ − 3615 /* initialization */
+ − 3616 /************************************************************************/
+ − 3617
+ − 3618 void
814
+ − 3619 reinit_eistring_once_early (void)
771
+ − 3620 {
+ − 3621 the_eistring_malloc_zero_init = the_eistring_zero_init;
+ − 3622 the_eistring_malloc_zero_init.mallocp_ = 1;
+ − 3623 }
+ − 3624
/* First-time initialization of the eistring templates; all of the
   work is done by reinit_eistring_once_early (). */
void
init_eistring_once_early (void)
{
  reinit_eistring_once_early ();
}
+ − 3630
+ − 3631 void
771
+ − 3632 syms_of_text (void)
+ − 3633 {
+ − 3634 DEFSUBR (Fmake_char);
+ − 3635
+ − 3636 #ifdef MULE
+ − 3637 DEFSUBR (Fchar_charset);
+ − 3638 DEFSUBR (Fchar_octet);
+ − 3639 DEFSUBR (Fsplit_char);
+ − 3640
+ − 3641 #ifdef ENABLE_COMPOSITE_CHARS
+ − 3642 DEFSUBR (Fmake_composite_char);
+ − 3643 DEFSUBR (Fcomposite_char_string);
+ − 3644 #endif
+ − 3645 #endif /* MULE */
+ − 3646 }
+ − 3647
+ − 3648 void
+ − 3649 reinit_vars_of_text (void)
+ − 3650 {
+ − 3651 int i;
+ − 3652
867
+ − 3653 conversion_in_dynarr_list = Dynarr_new2 (Ibyte_dynarr_dynarr,
+ − 3654 Ibyte_dynarr *);
771
+ − 3655 conversion_out_dynarr_list = Dynarr_new2 (Extbyte_dynarr_dynarr,
+ − 3656 Extbyte_dynarr *);
+ − 3657
+ − 3658 /* #### Olivier, why does this need to be reinitted? */
+ − 3659 for (i = 0; i <= MAX_BYTEBPOS_GAP_SIZE_3; i++)
+ − 3660 three_to_one_table[i] = i / 3;
+ − 3661 }
+ − 3662
/* Initialize this file's variables: run the reinit step, and, when
   composite characters are enabled, reset the allocation counters
   and create the two composite-character hash tables. */
void
vars_of_text (void)
{
  reinit_vars_of_text ();

#ifdef ENABLE_COMPOSITE_CHARS
  /* #### not dumped properly */
  /* Both allocation counters start at 32. */
  composite_char_row_next = 32;
  composite_char_col_next = 32;

  /* The string->char table is created with HASH_TABLE_EQUAL and the
     char->string table with HASH_TABLE_EQ. */
  Vcomposite_char_string2char_hash_table =
    make_lisp_hash_table (500, HASH_TABLE_NON_WEAK, HASH_TABLE_EQUAL);
  Vcomposite_char_char2string_hash_table =
    make_lisp_hash_table (500, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);

  /* Register both table variables with staticpro. */
  staticpro (&Vcomposite_char_string2char_hash_table);
  staticpro (&Vcomposite_char_char2string_hash_table);
#endif /* ENABLE_COMPOSITE_CHARS */
}