771
+ − 1 /* Buffer manipulation primitives for XEmacs.
+ − 2 Copyright (C) 1995 Sun Microsystems, Inc.
1292
+ − 3 Copyright (C) 1995, 1996, 2000, 2001, 2002, 2003 Ben Wing.
771
+ − 4 Copyright (C) 1999 Martin Buchholz.
+ − 5
+ − 6 This file is part of XEmacs.
+ − 7
+ − 8 XEmacs is free software; you can redistribute it and/or modify it
+ − 9 under the terms of the GNU General Public License as published by the
+ − 10 Free Software Foundation; either version 2, or (at your option) any
+ − 11 later version.
+ − 12
+ − 13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
+ − 14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ − 15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ − 16 for more details.
+ − 17
+ − 18 You should have received a copy of the GNU General Public License
+ − 19 along with XEmacs; see the file COPYING. If not, write to
+ − 20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ − 21 Boston, MA 02111-1307, USA. */
+ − 22
+ − 23 /* Synched up with: Not in FSF. */
+ − 24
+ − 25 /* Authorship:
+ − 26 */
+ − 27
+ − 28 #include <config.h>
+ − 29 #include "lisp.h"
+ − 30
+ − 31 #include "buffer.h"
+ − 32 #include "charset.h"
+ − 33 #include "file-coding.h"
+ − 34 #include "lstream.h"
1292
+ − 35 #include "profile.h"
771
+ − 36
+ − 37
+ − 38 /************************************************************************/
+ − 39 /* long comments */
+ − 40 /************************************************************************/
+ − 41
+ − 42 /*
826
+ − 43 ==========================================================================
1292
+ − 44 1. Intro to Characters, Character Sets, and Encodings
826
+ − 45 ==========================================================================
771
+ − 46
826
+ − 47 A character (which is, BTW, a surprisingly complex concept) is, in a
+ − 48 written representation of text, the most basic written unit that has a
+ − 49 meaning of its own. It's comparable to a phoneme when analyzing words
1292
+ − 50 in spoken speech (for example, the sound of `t' in English, which in
+ − 51 fact has different pronunciations in different words -- aspirated in
+ − 52 `time', unaspirated in `stop', unreleased or even pronounced as a
+ − 53 glottal stop in `button', etc. -- but logically is a single concept).
+ − 54 Like a phoneme, a character is an abstract concept defined by its
+ − 55 *meaning*. The character `lowercase f', for example, can always be used
+ − 56 to represent the first letter in the word `fill', regardless of whether
+ − 57 it's drawn upright or italic, whether the `fi' combination is drawn as a
+ − 58 single ligature, whether there are serifs on the bottom of the vertical
+ − 59 stroke, etc. (These different appearances of a single character are
+ − 60 often called "graphs" or "glyphs".) Our concern when representing text
+ − 61 is on representing the abstract characters, and not on their exact
+ − 62 appearance.
+ − 63
+ − 64 A character set (or "charset"), as we define it, is a set of characters,
+ − 65 each with an associated number (or set of numbers -- see below), called
+ − 66 a "code point". It's important to understand that a character is not
+ − 67 defined by any number attached to it, but by its meaning. For example,
+ − 68 ASCII and EBCDIC are two charsets containing exactly the same characters
+ − 69 (lowercase and uppercase letters, numbers 0 through 9, particular
+ − 70 punctuation marks) but with different numberings. The `comma' character
+ − 71 in ASCII and EBCDIC, for instance, is the same character despite having
+ − 72 a different numbering. Conversely, when comparing ASCII and JIS-Roman,
+ − 73 which look the same except that the latter has a yen sign substituted
+ − 74 for the backslash, we would say that the backslash and yen sign are
+ − 75 *not* the same characters, despite having the same number (92) and
+ − 76 despite the fact that all other characters are present in both charsets,
+ − 77 with the same numbering. ASCII and JIS-Roman, then, do *not* have
+ − 78 exactly the same characters in them (ASCII has a backslash character but
+ − 79 no yen-sign character, and vice-versa for JIS-Roman), unlike ASCII and
+ − 80 EBCDIC, even though the numberings in ASCII and JIS-Roman are closer.
+ − 81
+ − 82 It's also important to distinguish between charsets and encodings. For
+ − 83 a simple charset like ASCII, there is only one encoding normally used --
+ − 84 each character is represented by a single byte, with the same value as
+ − 85 its code point. For more complicated charsets, however, things are not
+ − 86 so obvious. Unicode version 2, for example, is a large charset with
+ − 87 thousands of characters, each indexed by a 16-bit number, often
+ − 88 represented in hex, e.g. 0x05D0 for the Hebrew letter "aleph". One
+ − 89 obvious encoding uses two bytes per character (actually two encodings,
+ − 90 depending on which of the two possible byte orderings is chosen). This
+ − 91 encoding is convenient for internal processing of Unicode text; however,
+ − 92 it's incompatible with ASCII, so a different encoding, e.g. UTF-8, is
+ − 93 usually used for external text, for example files or e-mail. UTF-8
+ − 94 represents Unicode characters with one to three bytes (often extended to
+ − 95 six bytes to handle characters with up to 31-bit indices). Unicode
+ − 96 characters 00 to 7F (identical with ASCII) are directly represented with
+ − 97 one byte, and other characters with two or more bytes, each in the range
+ − 98 80 to FF.
+ − 99
+ − 100 In general, a single encoding may be able to represent more than one
+ − 101 charset.
+ − 102
+ − 103 See also man/lispref/mule.texi.
826
+ − 104
1292
+ − 105 ==========================================================================
+ − 106 2. Character Sets
+ − 107 ==========================================================================
+ − 108
771
+ − 109 A particular character in a charset is indexed using one or
+ − 110 more "position codes", which are non-negative integers.
+ − 111 The number of position codes needed to identify a particular
+ − 112 character in a charset is called the "dimension" of the
+ − 113 charset. In XEmacs/Mule, all charsets have 1 or 2 dimensions,
+ − 114 and the size of all charsets (except for a few special cases)
+ − 115 is either 94, 96, 94 by 94, or 96 by 96. The range of
+ − 116 position codes used to index characters from any of these
+ − 117 types of character sets is as follows:
+ − 118
+ − 119 Charset type Position code 1 Position code 2
+ − 120 ------------------------------------------------------------
+ − 121 94 33 - 126 N/A
+ − 122 96 32 - 127 N/A
+ − 123 94x94 33 - 126 33 - 126
+ − 124 96x96 32 - 127 32 - 127
+ − 125
+ − 126 Note that in the above cases position codes do not start at
+ − 127 an expected value such as 0 or 1. The reason for this will
+ − 128 become clear later.
+ − 129
+ − 130 For example, Latin-1 is a 96-character charset, and JISX0208
+ − 131 (the Japanese national character set) is a 94x94-character
+ − 132 charset.
+ − 133
+ − 134 [Note that, although the ranges above define the *valid*
+ − 135 position codes for a charset, some of the slots in a particular
+ − 136 charset may in fact be empty. This is the case for JISX0208,
+ − 137 for example, where (e.g.) all the slots whose first
+ − 138 position code is in the range 118 - 127 are empty.]
+ − 139
+ − 140 There are three charsets that do not follow the above rules.
+ − 141 All of them have one dimension, and have ranges of position
+ − 142 codes as follows:
+ − 143
+ − 144 Charset name Position code 1
+ − 145 ------------------------------------
+ − 146 ASCII 0 - 127
+ − 147 Control-1 0 - 31
+ − 148 Composite 0 - some large number
+ − 149
+ − 150 (The upper bound of the position code for composite characters
+ − 151 has not yet been determined, but it will probably be at
+ − 152 least 16,383).
+ − 153
+ − 154 ASCII is the union of two subsidiary character sets:
+ − 155 Printing-ASCII (the printing ASCII character set,
+ − 156 consisting of position codes 33 - 126, like for a standard
+ − 157 94-character charset) and Control-ASCII (the non-printing
+ − 158 characters that would appear in a binary file with codes 0
+ − 159 - 32 and 127).
+ − 160
+ − 161 Control-1 contains the non-printing characters that would
+ − 162 appear in a binary file with codes 128 - 159.
+ − 163
+ − 164 Composite contains characters that are generated by
+ − 165 overstriking one or more characters from other charsets.
+ − 166
+ − 167 Note that some characters in ASCII, and all characters
+ − 168 in Control-1, are "control" (non-printing) characters.
+ − 169 These have no printed representation but instead control
+ − 170 some other function of the printing (e.g. TAB or 9 moves
+ − 171 the current character position to the next tab stop).
+ − 172 All other characters in all charsets are "graphic"
+ − 173 (printing) characters.
+ − 174
+ − 175 When a binary file is read in, the bytes in the file are
+ − 176 assigned to character sets as follows:
+ − 177
+ − 178 Bytes Character set Range
+ − 179 --------------------------------------------------
+ − 180 0 - 127 ASCII 0 - 127
+ − 181 128 - 159 Control-1 0 - 31
+ − 182 160 - 255 Latin-1 32 - 127
+ − 183
+ − 184 This is a bit ad-hoc but gets the job done.
+ − 185
826
+ − 186 ==========================================================================
1292
+ − 187 3. Encodings
826
+ − 188 ==========================================================================
771
+ − 189
+ − 190 An "encoding" is a way of numerically representing
+ − 191 characters from one or more character sets. If an encoding
+ − 192 only encompasses one character set, then the position codes
+ − 193 for the characters in that character set could be used
+ − 194 directly. This is not possible, however, if more than one
+ − 195 character set is to be used in the encoding.
+ − 196
+ − 197 For example, the conversion detailed above between bytes in
+ − 198 a binary file and characters is effectively an encoding
+ − 199 that encompasses the three character sets ASCII, Control-1,
+ − 200 and Latin-1 in a stream of 8-bit bytes.
+ − 201
+ − 202 Thus, an encoding can be viewed as a way of encoding
+ − 203 characters from a specified group of character sets using a
+ − 204 stream of bytes, each of which contains a fixed number of
+ − 205 bits (but not necessarily 8, as in the common usage of
+ − 206 "byte").
+ − 207
+ − 208 Here are descriptions of a couple of common
+ − 209 encodings:
+ − 210
+ − 211
+ − 212 A. Japanese EUC (Extended Unix Code)
+ − 213
+ − 214 This encompasses the character sets:
+ − 215 - Printing-ASCII,
+ − 216 - Katakana-JISX0201 (half-width katakana, the right half of JISX0201).
+ − 217 - Japanese-JISX0208
+ − 218 - Japanese-JISX0212
+ − 219 It uses 8-bit bytes.
+ − 220
+ − 221 Note that Printing-ASCII and Katakana-JISX0201 are 94-character
+ − 222 charsets, while Japanese-JISX0208 is a 94x94-character charset.
+ − 223
+ − 224 The encoding is as follows:
+ − 225
+ − 226 Character set Representation (PC == position-code)
+ − 227 ------------- --------------
+ − 228 Printing-ASCII PC1
+ − 229 Japanese-JISX0208 PC1 + 0x80 | PC2 + 0x80
+ − 230 Katakana-JISX0201 0x8E | PC1 + 0x80
+ − 231
+ − 232
+ − 233 B. JIS7
+ − 234
+ − 235 This encompasses the character sets:
+ − 236 - Printing-ASCII
+ − 237 - Latin-JISX0201 (the left half of JISX0201; this character set is
+ − 238 very similar to Printing-ASCII and is a 94-character charset)
+ − 239 - Japanese-JISX0208
+ − 240 - Katakana-JISX0201
+ − 241 It uses 7-bit bytes.
+ − 242
+ − 243 Unlike Japanese EUC, this is a "modal" encoding, which
+ − 244 means that there are multiple states that the encoding can
+ − 245 be in, which affect how the bytes are to be interpreted.
+ − 246 Special sequences of bytes (called "escape sequences")
+ − 247 are used to change states.
+ − 248
+ − 249 The encoding is as follows:
+ − 250
+ − 251 Character set Representation
+ − 252 ------------- --------------
+ − 253 Printing-ASCII PC1
+ − 254 Latin-JISX0201 PC1
+ − 255 Katakana-JISX0201 PC1
+ − 256 Japanese-JISX0208 PC1 | PC2
+ − 257
+ − 258 Escape sequence ASCII equivalent Meaning
+ − 259 --------------- ---------------- -------
+ − 260 0x1B 0x28 0x42 ESC ( B invoke Printing-ASCII
+ − 261 0x1B 0x28 0x4A ESC ( J invoke Latin-JISX0201
+ − 262 0x1B 0x28 0x49 ESC ( I invoke Katakana-JISX0201
+ − 263 0x1B 0x24 0x42 ESC $ B invoke Japanese-JISX0208
+ − 264
+ − 265 Initially, Printing-ASCII is invoked.
+ − 266
826
+ − 267 ==========================================================================
1292
+ − 268 4. Internal Mule Encodings
826
+ − 269 ==========================================================================
771
+ − 270
+ − 271 In XEmacs/Mule, each character set is assigned a unique number,
+ − 272 called a "leading byte". This is used in the encodings of a
+ − 273 character. Leading bytes are in the range 0x80 - 0xFF
+ − 274 (except for ASCII, which has a leading byte of 0), although
+ − 275 some leading bytes are reserved.
+ − 276
+ − 277 Charsets whose leading byte is in the range 0x80 - 0x9F are
+ − 278 called "official" and are used for built-in charsets.
+ − 279 Other charsets are called "private" and have leading bytes
+ − 280 in the range 0xA0 - 0xFF; these are user-defined charsets.
+ − 281
+ − 282 More specifically:
+ − 283
+ − 284 Character set Leading byte
+ − 285 ------------- ------------
+ − 286 ASCII 0 (0x7F in arrays indexed by leading byte)
+ − 287 Composite 0x8D
+ − 288 Dimension-1 Official 0x80 - 0x8C/0x8D
+ − 289 (0x8E is free)
+ − 290 Control 0x8F
+ − 291 Dimension-2 Official 0x90 - 0x99
+ − 292 (0x9A - 0x9D are free)
+ − 293 Dimension-1 Private Marker 0x9E
+ − 294 Dimension-2 Private Marker 0x9F
+ − 295 Dimension-1 Private 0xA0 - 0xEF
+ − 296 Dimension-2 Private 0xF0 - 0xFF
+ − 297
+ − 298 There are two internal encodings for characters in XEmacs/Mule.
+ − 299 One is called "string encoding" and is an 8-bit encoding that
+ − 300 is used for representing characters in a buffer or string.
+ − 301 It uses 1 to 4 bytes per character. The other is called
+ − 302 "character encoding" and is a 19-bit encoding that is used
+ − 303 for representing characters individually in a variable.
+ − 304
+ − 305 (In the following descriptions, we'll ignore composite
+ − 306 characters for the moment. We also give a general (structural)
+ − 307 overview first, followed later by the exact details.)
+ − 308
+ − 309 A. Internal String Encoding
+ − 310
+ − 311 ASCII characters are encoded using their position code directly.
+ − 312 Other characters are encoded using their leading byte followed
+ − 313 by their position code(s) with the high bit set. Characters
+ − 314 in private character sets have their leading byte prefixed with
+ − 315 a "leading byte prefix", which is either 0x9E or 0x9F. (No
+ − 316 character sets are ever assigned these leading bytes.) Specifically:
+ − 317
+ − 318 Character set Encoding (PC == position-code)
+ − 319 ------------- -------- (LB == leading-byte)
+ − 320 ASCII PC1 |
+ − 321 Control-1 LB | PC1 + 0xA0
+ − 322 Dimension-1 official LB | PC1 + 0x80
+ − 323 Dimension-1 private 0x9E | LB | PC1 + 0x80
+ − 324 Dimension-2 official LB | PC1 + 0x80 | PC2 + 0x80
+ − 325 Dimension-2 private 0x9F | LB | PC1 + 0x80 | PC2 + 0x80
+ − 326
+ − 327 The basic characteristic of this encoding is that the first byte
+ − 328 of all characters is in the range 0x00 - 0x9F, and the second and
+ − 329 following bytes of all characters are in the range 0xA0 - 0xFF.
+ − 330 This means that it is impossible to get out of sync, or more
+ − 331 specifically:
+ − 332
+ − 333 1. Given any byte position, the beginning of the character it is
+ − 334 within can be determined in constant time.
+ − 335 2. Given any byte position at the beginning of a character, the
+ − 336 beginning of the next character can be determined in constant
+ − 337 time.
+ − 338 3. Given any byte position at the beginning of a character, the
+ − 339 beginning of the previous character can be determined in constant
+ − 340 time.
+ − 341 4. Textual searches can simply treat encoded strings as if they
+ − 342 were encoded in a one-byte-per-character fashion rather than
+ − 343 the actual multi-byte encoding.
+ − 344
+ − 345 None of the standard non-modal encodings meet all of these
+ − 346 conditions. For example, EUC satisfies only (2) and (3), while
+ − 347 Shift-JIS and Big5 (not yet described) satisfy only (2). (All
+ − 348 non-modal encodings must satisfy (2), in order to be unambiguous.)
+ − 349
+ − 350 B. Internal Character Encoding
+ − 351
+ − 352 One 19-bit word represents a single character. The word is
+ − 353 separated into three fields:
+ − 354
+ − 355 Bit number: 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
+ − 356 <------------> <------------------> <------------------>
+ − 357 Field: 1 2 3
+ − 358
+ − 359 Note that fields 2 and 3 hold 7 bits each, while field 1 holds 5 bits.
+ − 360
+ − 361 Character set Field 1 Field 2 Field 3
+ − 362 ------------- ------- ------- -------
+ − 363 ASCII 0 0 PC1
+ − 364 range: (00 - 7F)
+ − 365 Control-1 0 1 PC1
+ − 366 range: (00 - 1F)
+ − 367 Dimension-1 official 0 LB - 0x7F PC1
+ − 368 range: (01 - 0D) (20 - 7F)
+ − 369 Dimension-1 private 0 LB - 0x80 PC1
+ − 370 range: (20 - 6F) (20 - 7F)
+ − 371 Dimension-2 official LB - 0x8F PC1 PC2
+ − 372 range: (01 - 0A) (20 - 7F) (20 - 7F)
+ − 373 Dimension-2 private LB - 0xE1 PC1 PC2
+ − 374 range: (0F - 1E) (20 - 7F) (20 - 7F)
+ − 375 Composite 0x1F ? ?
+ − 376
+ − 377 Note that character codes 0 - 255 are the same as the "binary encoding"
+ − 378 described above.
826
+ − 379
+ − 380 Most of the code in XEmacs knows nothing of the representation of a
+ − 381 character other than that values 0 - 255 represent ASCII, Control 1,
+ − 382 and Latin 1.
+ − 383
+ − 384 WARNING WARNING WARNING: The Boyer-Moore code in search.c, and the
+ − 385 code in search_buffer() that determines whether that code can be used,
+ − 386 knows that "field 3" in a character always corresponds to the last
+ − 387 byte in the textual representation of the character. (This is important
+ − 388 because the Boyer-Moore algorithm works by looking at the last byte
+ − 389 of the search string and &&#### finish this.
+ − 390
+ − 391 ==========================================================================
1292
+ − 392 5. Buffer Positions and Other Typedefs
826
+ − 393 ==========================================================================
+ − 394
+ − 395 A. Buffer Positions
+ − 396
+ − 397 There are three possible ways to specify positions in a buffer. All
+ − 398 of these are one-based: the beginning of the buffer is position or
+ − 399 index 1, and 0 is not a valid position.
+ − 400
+ − 401 As a "buffer position" (typedef Charbpos):
+ − 402
+ − 403 This is an index specifying an offset in characters from the
+ − 404 beginning of the buffer. Note that buffer positions are
+ − 405 logically *between* characters, not on a character. The
+ − 406 difference between two buffer positions specifies the number of
+ − 407 characters between those positions. Buffer positions are the
+ − 408 only kind of position externally visible to the user.
+ − 409
+ − 410 As a "byte index" (typedef Bytebpos):
+ − 411
+ − 412 This is an index over the bytes used to represent the characters
+ − 413 in the buffer. If there is no Mule support, this is identical
+ − 414 to a buffer position, because each character is represented
+ − 415 using one byte. However, with Mule support, many characters
+ − 416 require two or more bytes for their representation, and so a
+ − 417 byte index may be greater than the corresponding buffer
+ − 418 position.
+ − 419
+ − 420 As a "memory index" (typedef Membpos):
+ − 421
+ − 422 This is the byte index adjusted for the gap. For positions
+ − 423 before the gap, this is identical to the byte index. For
+ − 424 positions after the gap, this is the byte index plus the gap
+ − 425 size. There are two possible memory indices for the gap
+ − 426 position; the memory index at the beginning of the gap should
+ − 427 always be used, except in code that deals with manipulating the
+ − 428 gap, where both indices may be seen. The address of the
+ − 429 character "at" (i.e. following) a particular position can be
+ − 430 obtained from the formula
+ − 431
+ − 432 buffer_start_address + memory_index(position) - 1
+ − 433
+ − 434 except in the case of characters at the gap position.
+ − 435
+ − 436 B. Other Typedefs
+ − 437
867
+ − 438 Ichar:
1292
+ − 439 ------
826
+ − 440 This typedef represents a single Emacs character, which can be
+ − 441 ASCII, ISO-8859, or some extended character, as would typically
+ − 442 be used for Kanji. Note that the representation of a character
867
+ − 443 as an Ichar is *not* the same as the representation of that
826
+ − 444 same character in a string; thus, you cannot do the standard
+ − 445 C trick of passing a pointer to a character to a function that
+ − 446 expects a string.
+ − 447
867
+ − 448 An Ichar takes up 19 bits of representation and (for code
826
+ − 449 compatibility and such) is compatible with an int. This
+ − 450 representation is visible on the Lisp level. The important
867
+ − 451 characteristics of the Ichar representation are
826
+ − 452
+ − 453 -- values 0x00 - 0x7f represent ASCII.
+ − 454 -- values 0x80 - 0xff represent the right half of ISO-8859-1.
+ − 455 -- values 0x100 and up represent all other characters.
+ − 456
867
+ − 457 This means that Ichar values are upwardly compatible with
826
+ − 458 the standard 8-bit representation of ASCII/ISO-8859-1.
+ − 459
867
+ − 460 Ibyte:
1292
+ − 461 ------
867
+ − 462 The data in a buffer or string is logically made up of Ibyte
+ − 463 objects, where an Ibyte takes up the same amount of space as a
826
+ − 464 char. (It is declared differently, though, to catch invalid
867
+ − 465 usages.) Strings stored using Ibytes are said to be in
826
+ − 466 "internal format". The important characteristics of internal
+ − 467 format are
+ − 468
867
+ − 469 -- ASCII characters are represented as a single Ibyte,
826
+ − 470 in the range 0 - 0x7f.
867
+ − 471 -- All other characters are represented as an Ibyte in
+ − 472 the range 0x80 - 0x9f followed by one or more Ibytes
826
+ − 473 in the range 0xa0 to 0xff.
+ − 474
+ − 475 This leads to a number of desirable properties:
+ − 476
+ − 477 -- Given the position of the beginning of a character,
+ − 478 you can find the beginning of the next or previous
+ − 479 character in constant time.
+ − 480 -- When searching for a substring or an ASCII character
+ − 481 within the string, you need merely use standard
+ − 482 searching routines.
+ − 483
1292
+ − 484 Extbyte:
+ − 485 --------
826
+ − 486 Strings that go in or out of Emacs are in "external format",
+ − 487 typedef'ed as an array of char or a char *. There is more
+ − 488 than one external format (JIS, EUC, etc.) but they all
+ − 489 have similar properties. Many (e.g. JIS) are modal encodings,
+ − 490 which is to say that the meaning of particular bytes is
+ − 491 not fixed but depends on what "mode" the string is currently
+ − 492 in (e.g. bytes in the range 0 - 0x7f might be
+ − 493 interpreted as ASCII, or as Hiragana, or as 2-byte Kanji,
+ − 494 depending on the current mode). The mode starts out in
+ − 495 ASCII/ISO-8859-1 and is switched using escape sequences --
+ − 496 for example, in the JIS encoding, 'ESC $ B' switches to a
+ − 497 mode where pairs of bytes in the range 0 - 0x7f
+ − 498 are interpreted as Kanji characters.
+ − 499
+ − 500 External-formatted data is generally desirable for passing
+ − 501 data between programs because it is upwardly compatible
+ − 502 with standard ASCII/ISO-8859-1 strings and may require
+ − 503 less space than internal encodings such as the one
+ − 504 described above. In addition, some encodings (e.g. JIS)
+ − 505 keep all characters (except the ESC used to switch modes)
+ − 506 in the printing ASCII range 0x20 - 0x7e, which results in
+ − 507 a much higher probability that the data will avoid being
+ − 508 garbled in transmission. Externally-formatted data is
+ − 509 generally not very convenient to work with, however, and
+ − 510 for this reason is usually converted to internal format
+ − 511 before any work is done on the string.
+ − 512
+ − 513 NOTE: filenames need to be in external format so that
+ − 514 ISO-8859-1 characters come out correctly.
+ − 515
+ − 516 Charcount:
+ − 517 ----------
+ − 518 This typedef represents a count of characters, such as
+ − 519 a character offset into a string or the number of
+ − 520 characters between two positions in a buffer. The
+ − 521 difference between two Charbpos's is a Charcount, and
+ − 522 character positions in a string are represented using
+ − 523 a Charcount.
+ − 524
+ − 525 Bytecount:
+ − 526 ----------
+ − 527 Similar to a Charcount but represents a count of bytes.
+ − 528 The difference between two Bytebpos's is a Bytecount.
+ − 529
+ − 530
+ − 531 C. Usage of the Various Representations
+ − 532
+ − 533 Memory indices are used in low-level functions in insdel.c and for
+ − 534 extent endpoints and marker positions. The reason for this is that
+ − 535 this way, the extents and markers don't need to be updated for most
+ − 536 insertions, which merely shrink the gap and don't move any
+ − 537 characters around in memory.
+ − 538
+ − 539 (The beginning-of-gap memory index simplifies insertions w.r.t.
+ − 540 markers, because text usually gets inserted after markers. For
+ − 541 extents, it is merely for consistency, because text can get
+ − 542 inserted either before or after an extent's endpoint depending on
+ − 543 the open/closedness of the endpoint.)
+ − 544
+ − 545 Byte indices are used in other code that needs to be fast,
+ − 546 such as the searching, redisplay, and extent-manipulation code.
+ − 547
+ − 548 Buffer positions are used in all other code. This is because this
+ − 549 representation is easiest to work with (especially since Lisp
+ − 550 code always uses buffer positions), necessitates the fewest
+ − 551 changes to existing code, and is the safest (e.g. if the text gets
+ − 552 shifted underneath a buffer position, it will still point to a
+ − 553 character; if text is shifted under a byte index, it might point
+ − 554 to the middle of a character, which would be bad).
+ − 555
+ − 556 Similarly, Charcounts are used in all code that deals with strings
+ − 557 except for code that needs to be fast, which uses Bytecounts.
+ − 558
+ − 559 Strings are always passed around internally using internal format.
+ − 560 Conversions to and from external format are performed at the time
+ − 561 that the data goes in or out of Emacs.
+ − 562
+ − 563 D. Working With the Various Representations
+ − 564
+ − 565 We write things this way because it's very important that
+ − 566 MAX_BYTEBPOS_GAP_SIZE_3 is a multiple of 3. (As it happens,
+ − 567 65535 is a multiple of 3, but this may not always be the
+ − 568 case. #### unfinished
+ − 569
+ − 570 ==========================================================================
1292
+ − 571 6. Miscellaneous
826
+ − 572 ==========================================================================
+ − 573
+ − 574 A. Unicode Support
771
+ − 575
1292
+ − 576 Unicode support is very desirable. Currently we know how to handle
+ − 577 externally-encoded Unicode data in various encodings -- UTF-16, UTF-8,
+ − 578 etc. However, we really need to represent Unicode characters internally
+ − 579 as-is, rather than converting to some language-specific character set.
+ − 580 For efficiency, we should represent Unicode characters using 3 bytes
+ − 581 rather than 4. This means we need to find leading bytes for Unicode.
+ − 582 Given that there are 65,536 characters in Unicode and we can attach
+ − 583 96x96 = 9,216 characters per leading byte, we need eight leading bytes
+ − 584 for Unicode. We currently have four free (0x9A - 0x9D), and with a
+ − 585 little bit of rearranging we can get five: ASCII doesn't really need to
+ − 586 take up a leading byte. (We could just as well use 0x7F, with a little
+ − 587 change to the functions that assume that 0x80 is the lowest leading
+ − 588 byte.) This means we still need to dump three leading bytes and move
+ − 589 them into private space. The CNS charsets are good candidates since
+ − 590 they are rarely used, and JAPANESE_JISX0208_1978 is becoming less and
+ − 591 less used and could also be dumped.
826
+ − 592
+ − 593 B. Composite Characters
+ − 594
+ − 595 Composite characters are characters constructed by overstriking two
771
+ − 596 or more regular characters.
+ − 597
+ − 598 1) The old Mule implementation involves storing composite characters
+ − 599 in a buffer as a tag followed by all of the actual characters
+ − 600 used to make up the composite character. I think this is a bad
+ − 601 idea; it greatly complicates code that wants to handle strings
+ − 602 one character at a time because it has to deal with the possibility
+ − 603 of great big ungainly characters. It's much more reasonable to
+ − 604 simply store an index into a table of composite characters.
+ − 605
+ − 606 2) The current implementation only allows for 16,384 separate
+ − 607 composite characters over the lifetime of the XEmacs process.
+ − 608 This could become a potential problem if the user
+ − 609 edited lots of different files that use composite characters.
+ − 610 Due to FSF bogosity, increasing the number of allowable
+ − 611 composite characters under Mule would decrease the number
+ − 612 of possible faces that can exist. Mule already has shrunk
+ − 613 this to 2048, and further shrinkage would become uncomfortable.
+ − 614 No such problems exist in XEmacs.
+ − 615
+ − 616 Composite characters could be represented as 0x8D C1 C2 C3,
+ − 617 where each C[1-3] is in the range 0xA0 - 0xFF. This allows
+ − 618 for slightly under 2^20 (one million) composite characters
+ − 619 over the XEmacs process lifetime, and you only need to
+ − 620 increase the size of a Mule character from 19 to 21 bits.
+ − 621 Or you could use 0x8D C1 C2 C3 C4, allowing for about
826
+ − 622 85 million (slightly over 2^26) composite characters.
+ − 623
+ − 624 */
771
+ − 625
+ − 626
+ − 627 /************************************************************************/
+ − 628 /* declarations */
+ − 629 /************************************************************************/
+ − 630
+ − 631 Eistring the_eistring_zero_init, the_eistring_malloc_zero_init;
+ − 632
/* The byte limit is derived from the character limit, rather than being
   written as a literal, so that it is a multiple of 3 by construction. */
#define MAX_CHARBPOS_GAP_SIZE_3 (65535/3)
#define MAX_BYTEBPOS_GAP_SIZE_3 (3 * MAX_CHARBPOS_GAP_SIZE_3)

/* Holds one entry for every index 0 .. MAX_BYTEBPOS_GAP_SIZE_3 inclusive.
   NOTE(review): the table is filled in elsewhere; presumably entry I holds
   I / 3 -- confirm against the initialization code. */
short three_to_one_table[MAX_BYTEBPOS_GAP_SIZE_3 + 1];
+ − 637
+ − 638 #ifdef MULE
+ − 639
/* Table of number of bytes in the string representation of a character
   indexed by the first byte of that representation.

   rep_bytes_by_first_byte(c) is more efficient than the equivalent
   canonical computation:

   XCHARSET_REP_BYTES (charset_by_leading_byte (c))

   The table has 0xA0 entries, i.e. it covers first bytes 0x00 - 0x9f
   only; anything from 0xa0 up is not a valid index. */

const Bytecount rep_bytes_by_first_byte[0xA0] =
{ /* 0x00 - 0x7f are for straight ASCII */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 - 0x8f are for Dimension-1 official charsets */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 - 0x9d are for Dimension-2 official charsets */
  /* 0x9e is for Dimension-1 private charsets */
  /* 0x9f is for Dimension-2 private charsets */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4
};
+ − 665
+ − 666 #ifdef ENABLE_COMPOSITE_CHARS
+ − 667
/* Hash tables for composite chars.  One maps string representing
   composed chars to their equivalent chars; one goes the
   other way. */
Lisp_Object Vcomposite_char_char2string_hash_table;
Lisp_Object Vcomposite_char_string2char_hash_table;

/* NOTE(review): presumably the row and column that will be handed out
   to the next newly created composite character -- confirm at the point
   of use, which is outside this part of the file. */
static int composite_char_row_next;
static int composite_char_col_next;
+ − 676
+ − 677 #endif /* ENABLE_COMPOSITE_CHARS */
+ − 678
+ − 679 #endif /* MULE */
+ − 680
1292
/* Section tags for the profiler.  QSin_char_byte_conversion is what
   charbpos_to_bytebpos_func() passes to
   PROFILE_RECORD_ENTERING_SECTION().  NOTE(review):
   QSin_internal_external_conversion is presumably used the same way by
   the internal/external conversion code -- its use is not in this part
   of the file. */
Lisp_Object QSin_char_byte_conversion;
Lisp_Object QSin_internal_external_conversion;
+ − 683
771
+ − 684
+ − 685 /************************************************************************/
+ − 686 /* qxestr***() functions */
+ − 687 /************************************************************************/
+ − 688
+ − 689 /* Most are inline functions in lisp.h */
+ − 690
+ − 691 int
867
+ − 692 qxesprintf (Ibyte *buffer, const CIbyte *format, ...)
771
+ − 693 {
+ − 694 va_list args;
+ − 695 int retval;
+ − 696
+ − 697 va_start (args, format);
+ − 698 retval = vsprintf ((char *) buffer, format, args);
+ − 699 va_end (args);
+ − 700
+ − 701 return retval;
+ − 702 }
+ − 703
/* strcasecmp() implementation from BSD */

/* Case-folding map with one entry per byte value.  Entries 0101 - 0132
   (ASCII `A' - `Z') hold the corresponding lower-case letter
   (0141 - 0172); every other entry holds its own index. */
static Ibyte strcasecmp_charmap[] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  /* upper-case letters fold onto the lower-case row below */
  '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
  '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
  /* bytes with the high bit set map to themselves */
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
+ − 739
+ − 740 /* A version that works like generic strcasecmp() -- only collapsing
+ − 741 case in ASCII A-Z/a-z. This is safe on Mule strings due to the
+ − 742 current representation.
+ − 743
+ − 744 This version was written by some Berkeley coder, favoring
+ − 745 nanosecond improvements over clarity. In all other versions below,
+ − 746 we use symmetrical algorithms that may sacrifice a few machine
+ − 747 cycles but are MUCH MUCH clearer, which counts a lot more.
+ − 748 */
+ − 749
+ − 750 int
867
+ − 751 qxestrcasecmp (const Ibyte *s1, const Ibyte *s2)
771
+ − 752 {
867
+ − 753 Ibyte *cm = strcasecmp_charmap;
771
+ − 754
+ − 755 while (cm[*s1] == cm[*s2++])
+ − 756 if (*s1++ == '\0')
+ − 757 return (0);
+ − 758
+ − 759 return (cm[*s1] - cm[*--s2]);
+ − 760 }
+ − 761
+ − 762 int
+ − 763 ascii_strcasecmp (const Char_ASCII *s1, const Char_ASCII *s2)
+ − 764 {
867
+ − 765 return qxestrcasecmp ((const Ibyte *) s1, (const Ibyte *) s2);
771
+ − 766 }
+ − 767
+ − 768 int
867
+ − 769 qxestrcasecmp_c (const Ibyte *s1, const Char_ASCII *s2)
771
+ − 770 {
867
+ − 771 return qxestrcasecmp (s1, (const Ibyte *) s2);
771
+ − 772 }
+ − 773
+ − 774 /* An internationalized version that collapses case in a general fashion.
+ − 775 */
+ − 776
+ − 777 int
867
+ − 778 qxestrcasecmp_i18n (const Ibyte *s1, const Ibyte *s2)
771
+ − 779 {
+ − 780 while (*s1 && *s2)
+ − 781 {
867
+ − 782 if (DOWNCASE (0, itext_ichar (s1)) !=
+ − 783 DOWNCASE (0, itext_ichar (s2)))
771
+ − 784 break;
867
+ − 785 INC_IBYTEPTR (s1);
+ − 786 INC_IBYTEPTR (s2);
771
+ − 787 }
+ − 788
867
+ − 789 return (DOWNCASE (0, itext_ichar (s1)) -
+ − 790 DOWNCASE (0, itext_ichar (s2)));
771
+ − 791 }
+ − 792
+ − 793 /* The only difference between these next two and
+ − 794 qxememcasecmp()/qxememcasecmp_i18n() is that these two will stop if
+ − 795 both strings are equal and less than LEN in length, while
+ − 796 the mem...() versions would run off the end. */
+ − 797
+ − 798 int
867
+ − 799 qxestrncasecmp (const Ibyte *s1, const Ibyte *s2, Bytecount len)
771
+ − 800 {
867
+ − 801 Ibyte *cm = strcasecmp_charmap;
771
+ − 802
+ − 803 while (len--)
+ − 804 {
+ − 805 int diff = cm[*s1] - cm[*s2];
+ − 806 if (diff != 0)
+ − 807 return diff;
+ − 808 if (!*s1)
+ − 809 return 0;
+ − 810 s1++, s2++;
+ − 811 }
+ − 812
+ − 813 return 0;
+ − 814 }
+ − 815
+ − 816 int
+ − 817 ascii_strncasecmp (const Char_ASCII *s1, const Char_ASCII *s2, Bytecount len)
+ − 818 {
867
+ − 819 return qxestrncasecmp ((const Ibyte *) s1, (const Ibyte *) s2, len);
771
+ − 820 }
+ − 821
+ − 822 int
867
+ − 823 qxestrncasecmp_c (const Ibyte *s1, const Char_ASCII *s2, Bytecount len)
771
+ − 824 {
867
+ − 825 return qxestrncasecmp (s1, (const Ibyte *) s2, len);
771
+ − 826 }
+ − 827
801
+ − 828 /* Compare LEN_FROM_S1 worth of characters from S1 with the same number of
+ − 829 characters from S2, case insensitive. NOTE: Downcasing can convert
+ − 830 characters from one length in bytes to another, so reversing S1 and S2
+ − 831 is *NOT* a symmetric operations! You must choose a length that agrees
+ − 832 with S1. */
+ − 833
771
+ − 834 int
867
+ − 835 qxestrncasecmp_i18n (const Ibyte *s1, const Ibyte *s2,
801
+ − 836 Bytecount len_from_s1)
771
+ − 837 {
801
+ − 838 while (len_from_s1 > 0)
771
+ − 839 {
867
+ − 840 const Ibyte *old_s1 = s1;
+ − 841 int diff = (DOWNCASE (0, itext_ichar (s1)) -
+ − 842 DOWNCASE (0, itext_ichar (s2)));
771
+ − 843 if (diff != 0)
+ − 844 return diff;
+ − 845 if (!*s1)
+ − 846 return 0;
867
+ − 847 INC_IBYTEPTR (s1);
+ − 848 INC_IBYTEPTR (s2);
801
+ − 849 len_from_s1 -= s1 - old_s1;
771
+ − 850 }
+ − 851
+ − 852 return 0;
+ − 853 }
+ − 854
+ − 855 int
867
+ − 856 qxememcmp (const Ibyte *s1, const Ibyte *s2, Bytecount len)
771
+ − 857 {
+ − 858 return memcmp (s1, s2, len);
+ − 859 }
+ − 860
+ − 861 int
867
+ − 862 qxememcmp4 (const Ibyte *s1, Bytecount len1,
+ − 863 const Ibyte *s2, Bytecount len2)
801
+ − 864 {
+ − 865 int retval = qxememcmp (s1, s2, min (len1, len2));
+ − 866 if (retval)
+ − 867 return retval;
+ − 868 return len1 - len2;
+ − 869 }
+ − 870
+ − 871 int
867
+ − 872 qxememcasecmp (const Ibyte *s1, const Ibyte *s2, Bytecount len)
771
+ − 873 {
867
+ − 874 Ibyte *cm = strcasecmp_charmap;
771
+ − 875
+ − 876 while (len--)
+ − 877 {
+ − 878 int diff = cm[*s1] - cm[*s2];
+ − 879 if (diff != 0)
+ − 880 return diff;
+ − 881 s1++, s2++;
+ − 882 }
+ − 883
+ − 884 return 0;
+ − 885 }
+ − 886
+ − 887 int
867
+ − 888 qxememcasecmp4 (const Ibyte *s1, Bytecount len1,
+ − 889 const Ibyte *s2, Bytecount len2)
771
+ − 890 {
801
+ − 891 int retval = qxememcasecmp (s1, s2, min (len1, len2));
+ − 892 if (retval)
+ − 893 return retval;
+ − 894 return len1 - len2;
+ − 895 }
+ − 896
+ − 897 /* Do a character-by-character comparison, returning "which is greater" by
867
+ − 898 comparing the Ichar values. (#### Should have option to compare Unicode
801
+ − 899 points) */
+ − 900
+ − 901 int
867
+ − 902 qxetextcmp (const Ibyte *s1, Bytecount len1,
+ − 903 const Ibyte *s2, Bytecount len2)
801
+ − 904 {
+ − 905 while (len1 > 0 && len2 > 0)
771
+ − 906 {
867
+ − 907 const Ibyte *old_s1 = s1;
+ − 908 const Ibyte *old_s2 = s2;
+ − 909 int diff = itext_ichar (s1) - itext_ichar (s2);
801
+ − 910 if (diff != 0)
+ − 911 return diff;
867
+ − 912 INC_IBYTEPTR (s1);
+ − 913 INC_IBYTEPTR (s2);
801
+ − 914 len1 -= s1 - old_s1;
+ − 915 len2 -= s2 - old_s2;
+ − 916 }
+ − 917
+ − 918 assert (len1 >= 0 && len2 >= 0);
+ − 919 return len1 - len2;
+ − 920 }
+ − 921
+ − 922 int
867
+ − 923 qxetextcmp_matching (const Ibyte *s1, Bytecount len1,
+ − 924 const Ibyte *s2, Bytecount len2,
801
+ − 925 Charcount *matching)
+ − 926 {
+ − 927 *matching = 0;
+ − 928 while (len1 > 0 && len2 > 0)
+ − 929 {
867
+ − 930 const Ibyte *old_s1 = s1;
+ − 931 const Ibyte *old_s2 = s2;
+ − 932 int diff = itext_ichar (s1) - itext_ichar (s2);
801
+ − 933 if (diff != 0)
+ − 934 return diff;
867
+ − 935 INC_IBYTEPTR (s1);
+ − 936 INC_IBYTEPTR (s2);
801
+ − 937 len1 -= s1 - old_s1;
+ − 938 len2 -= s2 - old_s2;
+ − 939 (*matching)++;
+ − 940 }
+ − 941
+ − 942 assert (len1 >= 0 && len2 >= 0);
+ − 943 return len1 - len2;
+ − 944 }
+ − 945
+ − 946 /* Do a character-by-character comparison, returning "which is greater" by
867
+ − 947 comparing the Ichar values, case insensitively (by downcasing both
801
+ − 948 first). (#### Should have option to compare Unicode points)
+ − 949
+ − 950 In this case, both lengths must be specified becaused downcasing can
+ − 951 convert characters from one length in bytes to another; therefore, two
+ − 952 blocks of text of different length might be equal. If both compare
+ − 953 equal up to the limit in length of one but not the other, the longer one
+ − 954 is "greater". */
+ − 955
+ − 956 int
867
+ − 957 qxetextcasecmp (const Ibyte *s1, Bytecount len1,
+ − 958 const Ibyte *s2, Bytecount len2)
801
+ − 959 {
+ − 960 while (len1 > 0 && len2 > 0)
+ − 961 {
867
+ − 962 const Ibyte *old_s1 = s1;
+ − 963 const Ibyte *old_s2 = s2;
+ − 964 int diff = (DOWNCASE (0, itext_ichar (s1)) -
+ − 965 DOWNCASE (0, itext_ichar (s2)));
771
+ − 966 if (diff != 0)
+ − 967 return diff;
867
+ − 968 INC_IBYTEPTR (s1);
+ − 969 INC_IBYTEPTR (s2);
801
+ − 970 len1 -= s1 - old_s1;
+ − 971 len2 -= s2 - old_s2;
771
+ − 972 }
+ − 973
801
+ − 974 assert (len1 >= 0 && len2 >= 0);
+ − 975 return len1 - len2;
+ − 976 }
+ − 977
+ − 978 /* Like qxetextcasecmp() but also return number of characters at
+ − 979 beginning that match. */
+ − 980
+ − 981 int
867
+ − 982 qxetextcasecmp_matching (const Ibyte *s1, Bytecount len1,
+ − 983 const Ibyte *s2, Bytecount len2,
801
+ − 984 Charcount *matching)
+ − 985 {
+ − 986 *matching = 0;
+ − 987 while (len1 > 0 && len2 > 0)
+ − 988 {
867
+ − 989 const Ibyte *old_s1 = s1;
+ − 990 const Ibyte *old_s2 = s2;
+ − 991 int diff = (DOWNCASE (0, itext_ichar (s1)) -
+ − 992 DOWNCASE (0, itext_ichar (s2)));
801
+ − 993 if (diff != 0)
+ − 994 return diff;
867
+ − 995 INC_IBYTEPTR (s1);
+ − 996 INC_IBYTEPTR (s2);
801
+ − 997 len1 -= s1 - old_s1;
+ − 998 len2 -= s2 - old_s2;
+ − 999 (*matching)++;
+ − 1000 }
+ − 1001
+ − 1002 assert (len1 >= 0 && len2 >= 0);
+ − 1003 return len1 - len2;
771
+ − 1004 }
+ − 1005
+ − 1006 int
+ − 1007 lisp_strcasecmp (Lisp_Object s1, Lisp_Object s2)
+ − 1008 {
867
+ − 1009 Ibyte *cm = strcasecmp_charmap;
+ − 1010 Ibyte *p1 = XSTRING_DATA (s1);
+ − 1011 Ibyte *p2 = XSTRING_DATA (s2);
+ − 1012 Ibyte *e1 = p1 + XSTRING_LENGTH (s1);
+ − 1013 Ibyte *e2 = p2 + XSTRING_LENGTH (s2);
771
+ − 1014
+ − 1015 /* again, we use a symmetric algorithm and favor clarity over
+ − 1016 nanosecond improvements. */
+ − 1017 while (1)
+ − 1018 {
+ − 1019 /* if we reached the end of either string, compare lengths.
+ − 1020 do NOT compare the final null byte against anything, in case
+ − 1021 the other string also has a null byte at that position. */
+ − 1022 if (p1 == e1 || p2 == e2)
+ − 1023 return e1 - e2;
+ − 1024 if (cm[*p1] != cm[*p2])
+ − 1025 return cm[*p1] - cm[*p2];
+ − 1026 p1++, p2++;
+ − 1027 }
+ − 1028 }
+ − 1029
+ − 1030 int
+ − 1031 lisp_strcasecmp_i18n (Lisp_Object s1, Lisp_Object s2)
+ − 1032 {
801
+ − 1033 return qxetextcasecmp (XSTRING_DATA (s1), XSTRING_LENGTH (s1),
+ − 1034 XSTRING_DATA (s2), XSTRING_LENGTH (s2));
771
+ − 1035 }
+ − 1036
+ − 1037
+ − 1038 /************************************************************************/
+ − 1039 /* conversion between textual representations */
+ − 1040 /************************************************************************/
+ − 1041
+ − 1042 /* NOTE: Does not reset the Dynarr. */
+ − 1043
+ − 1044 void
867
+ − 1045 convert_ibyte_string_into_ichar_dynarr (const Ibyte *str, Bytecount len,
+ − 1046 Ichar_dynarr *dyn)
771
+ − 1047 {
867
+ − 1048 const Ibyte *strend = str + len;
771
+ − 1049
+ − 1050 while (str < strend)
+ − 1051 {
867
+ − 1052 Ichar ch = itext_ichar (str);
771
+ − 1053 Dynarr_add (dyn, ch);
867
+ − 1054 INC_IBYTEPTR (str);
771
+ − 1055 }
+ − 1056 }
+ − 1057
+ − 1058 Charcount
867
+ − 1059 convert_ibyte_string_into_ichar_string (const Ibyte *str, Bytecount len,
+ − 1060 Ichar *arr)
771
+ − 1061 {
867
+ − 1062 const Ibyte *strend = str + len;
771
+ − 1063 Charcount newlen = 0;
+ − 1064 while (str < strend)
+ − 1065 {
867
+ − 1066 Ichar ch = itext_ichar (str);
771
+ − 1067 arr[newlen++] = ch;
867
+ − 1068 INC_IBYTEPTR (str);
771
+ − 1069 }
+ − 1070 return newlen;
+ − 1071 }
+ − 1072
867
+ − 1073 /* Convert an array of Ichars into the equivalent string representation.
+ − 1074 Store into the given Ibyte dynarr. Does not reset the dynarr.
771
+ − 1075 Does not add a terminating zero. */
+ − 1076
+ − 1077 void
867
+ − 1078 convert_ichar_string_into_ibyte_dynarr (Ichar *arr, int nels,
+ − 1079 Ibyte_dynarr *dyn)
771
+ − 1080 {
867
+ − 1081 Ibyte str[MAX_ICHAR_LEN];
771
+ − 1082 int i;
+ − 1083
+ − 1084 for (i = 0; i < nels; i++)
+ − 1085 {
867
+ − 1086 Bytecount len = set_itext_ichar (str, arr[i]);
771
+ − 1087 Dynarr_add_many (dyn, str, len);
+ − 1088 }
+ − 1089 }
+ − 1090
867
+ − 1091 /* Convert an array of Ichars into the equivalent string representation.
771
+ − 1092 Malloc the space needed for this and return it. If LEN_OUT is not a
867
+ − 1093 NULL pointer, store into LEN_OUT the number of Ibytes in the
+ − 1094 malloc()ed string. Note that the actual number of Ibytes allocated
771
+ − 1095 is one more than this: the returned string is zero-terminated. */
+ − 1096
867
+ − 1097 Ibyte *
+ − 1098 convert_ichar_string_into_malloced_string (Ichar *arr, int nels,
826
+ − 1099 Bytecount *len_out)
771
+ − 1100 {
+ − 1101 /* Damn zero-termination. */
867
+ − 1102 Ibyte *str = (Ibyte *) ALLOCA (nels * MAX_ICHAR_LEN + 1);
+ − 1103 Ibyte *strorig = str;
771
+ − 1104 Bytecount len;
+ − 1105
+ − 1106 int i;
+ − 1107
+ − 1108 for (i = 0; i < nels; i++)
867
+ − 1109 str += set_itext_ichar (str, arr[i]);
771
+ − 1110 *str = '\0';
+ − 1111 len = str - strorig;
867
+ − 1112 str = (Ibyte *) xmalloc (1 + len);
771
+ − 1113 memcpy (str, strorig, 1 + len);
+ − 1114 if (len_out)
+ − 1115 *len_out = len;
+ − 1116 return str;
+ − 1117 }
+ − 1118
826
/* Body of copy_text_between_formats() for one source/destination format
   pair.  The macro is not self-contained: it uses the variables src,
   srclen, srcobj, dst, dstlen, dstobj and src_used of the function it is
   expanded in, and it contains `return' statements, so it always leaves
   that function.

   With a non-NULL dst, characters are converted one at a time until the
   source is used up or the next character no longer fits; the value
   returned is the number of bytes stored.  With a NULL dst, nothing is
   stored and the value returned is the number of bytes the whole
   converted text would need.  In both cases the number of source bytes
   consumed is stored through src_used if that is not NULL. */
#define COPY_TEXT_BETWEEN_FORMATS(srcfmt, dstfmt)			\
do									\
{									\
  if (dst)								\
    {									\
      Ibyte *dstend = dst + dstlen;					\
      Ibyte *dstp = dst;						\
      const Ibyte *srcend = src + srclen;				\
      const Ibyte *srcp = src;						\
									\
      while (srcp < srcend)						\
	{								\
	  Ichar ch = itext_ichar_fmt (srcp, srcfmt, srcobj);		\
	  Bytecount len = ichar_len_fmt (ch, dstfmt);			\
									\
	  if (dstp + len <= dstend)					\
	    {								\
	      set_itext_ichar_fmt (dstp, ch, dstfmt, dstobj);		\
	      dstp += len;						\
	    }								\
	  else								\
	    break;							\
	  INC_IBYTEPTR_FMT (srcp, srcfmt);				\
	}								\
      text_checking_assert (srcp <= srcend);				\
      if (src_used)							\
	*src_used = srcp - src;						\
      return dstp - dst;						\
    }									\
  else									\
    {									\
      const Ibyte *srcend = src + srclen;				\
      const Ibyte *srcp = src;						\
      Bytecount total = 0;						\
									\
      while (srcp < srcend)						\
	{								\
	  total += ichar_len_fmt (itext_ichar_fmt (srcp, srcfmt,	\
						   srcobj), dstfmt);	\
	  INC_IBYTEPTR_FMT (srcp, srcfmt);				\
	}								\
      text_checking_assert (srcp == srcend);				\
      if (src_used)							\
	*src_used = srcp - src;						\
      return total;							\
    }									\
}									\
while (0)
+ − 1167
+ − 1168 /* Copy as much text from SRC/SRCLEN to DST/DSTLEN as will fit, converting
+ − 1169 from SRCFMT/SRCOBJ to DSTFMT/DSTOBJ. Return number of bytes stored into
+ − 1170 DST as return value, and number of bytes copied from SRC through
+ − 1171 SRC_USED (if not NULL). If DST is NULL, don't actually store anything
+ − 1172 and just return the size needed to store all the text. Will not copy
+ − 1173 partial characters into DST. */
+ − 1174
+ − 1175 Bytecount
867
+ − 1176 copy_text_between_formats (const Ibyte *src, Bytecount srclen,
826
+ − 1177 Internal_Format srcfmt,
+ − 1178 Lisp_Object srcobj,
867
+ − 1179 Ibyte *dst, Bytecount dstlen,
826
+ − 1180 Internal_Format dstfmt,
+ − 1181 Lisp_Object dstobj,
+ − 1182 Bytecount *src_used)
+ − 1183 {
+ − 1184 if (srcfmt == dstfmt &&
+ − 1185 objects_have_same_internal_representation (srcobj, dstobj))
+ − 1186 {
+ − 1187 if (dst)
+ − 1188 {
+ − 1189 srclen = min (srclen, dstlen);
867
+ − 1190 srclen = validate_ibyte_string_backward (src, srclen);
826
+ − 1191 memcpy (dst, src, srclen);
+ − 1192 if (src_used)
+ − 1193 *src_used = srclen;
+ − 1194 return srclen;
+ − 1195 }
+ − 1196 else
+ − 1197 return srclen;
+ − 1198 }
+ − 1199 /* Everything before the final else statement is an optimization.
+ − 1200 The inner loops inside COPY_TEXT_BETWEEN_FORMATS() have a number
+ − 1201 of calls to *_fmt(), each of which has a switch statement in it.
+ − 1202 By using constants as the FMT argument, these switch statements
+ − 1203 will be optimized out of existence. */
+ − 1204 #define ELSE_FORMATS(fmt1, fmt2) \
+ − 1205 else if (srcfmt == fmt1 && dstfmt == fmt2) \
+ − 1206 COPY_TEXT_BETWEEN_FORMATS (fmt1, fmt2)
+ − 1207 ELSE_FORMATS (FORMAT_DEFAULT, FORMAT_8_BIT_FIXED);
+ − 1208 ELSE_FORMATS (FORMAT_8_BIT_FIXED, FORMAT_DEFAULT);
+ − 1209 ELSE_FORMATS (FORMAT_DEFAULT, FORMAT_32_BIT_FIXED);
+ − 1210 ELSE_FORMATS (FORMAT_32_BIT_FIXED, FORMAT_DEFAULT);
+ − 1211 else
+ − 1212 COPY_TEXT_BETWEEN_FORMATS (srcfmt, dstfmt);
+ − 1213 #undef ELSE_FORMATS
+ − 1214 }
+ − 1215
+ − 1216 /* Copy as much buffer text in BUF, starting at POS, of length LEN, as will
+ − 1217 fit into DST/DSTLEN, converting to DSTFMT. Return number of bytes
+ − 1218 stored into DST as return value, and number of bytes copied from BUF
+ − 1219 through SRC_USED (if not NULL). If DST is NULL, don't actually store
+ − 1220 anything and just return the size needed to store all the text. */
+ − 1221
+ − 1222 Bytecount
+ − 1223 copy_buffer_text_out (struct buffer *buf, Bytebpos pos,
867
+ − 1224 Bytecount len, Ibyte *dst, Bytecount dstlen,
826
+ − 1225 Internal_Format dstfmt, Lisp_Object dstobj,
+ − 1226 Bytecount *src_used)
+ − 1227 {
+ − 1228 Bytecount dst_used = 0;
+ − 1229 if (src_used)
+ − 1230 *src_used = 0;
+ − 1231
+ − 1232 {
+ − 1233 BUFFER_TEXT_LOOP (buf, pos, len, runptr, runlen)
+ − 1234 {
+ − 1235 Bytecount the_src_used, the_dst_used;
+ − 1236
+ − 1237 the_dst_used = copy_text_between_formats (runptr, runlen,
+ − 1238 BUF_FORMAT (buf),
+ − 1239 wrap_buffer (buf),
+ − 1240 dst, dstlen, dstfmt,
+ − 1241 dstobj, &the_src_used);
+ − 1242 dst_used += the_dst_used;
+ − 1243 if (src_used)
+ − 1244 *src_used += the_src_used;
+ − 1245 if (dst)
+ − 1246 {
+ − 1247 dst += the_dst_used;
+ − 1248 dstlen -= the_dst_used;
841
+ − 1249 /* Stop if we didn't use all of the source text. Also stop
+ − 1250 if the destination is full. We need the first test because
+ − 1251 there might be a couple bytes left in the destination, but
+ − 1252 not enough to fit a full character. The first test will in
+ − 1253 fact catch the vast majority of cases where the destination
+ − 1254 is empty, too -- but in case the destination holds *exactly*
+ − 1255 the run length, we put in the second check. (It shouldn't
+ − 1256 really matter though -- next time through we'll just get a
+ − 1257 0.) */
+ − 1258 if (the_src_used < runlen || !dstlen)
826
+ − 1259 break;
+ − 1260 }
+ − 1261 }
+ − 1262 }
+ − 1263
+ − 1264 return dst_used;
+ − 1265 }
+ − 1266
771
+ − 1267
+ − 1268 /************************************************************************/
+ − 1269 /* charset properties of strings */
+ − 1270 /************************************************************************/
+ − 1271
+ − 1272 void
867
+ − 1273 find_charsets_in_ibyte_string (unsigned char *charsets, const Ibyte *str,
771
+ − 1274 Bytecount len)
+ − 1275 {
+ − 1276 #ifndef MULE
+ − 1277 /* Telescope this. */
+ − 1278 charsets[0] = 1;
+ − 1279 #else
867
+ − 1280 const Ibyte *strend = str + len;
771
+ − 1281 memset (charsets, 0, NUM_LEADING_BYTES);
+ − 1282
+ − 1283 /* #### SJT doesn't like this. */
+ − 1284 if (len == 0)
+ − 1285 {
+ − 1286 charsets[XCHARSET_LEADING_BYTE (Vcharset_ascii) - MIN_LEADING_BYTE] = 1;
+ − 1287 return;
+ − 1288 }
+ − 1289
+ − 1290 while (str < strend)
+ − 1291 {
867
+ − 1292 charsets[ichar_leading_byte (itext_ichar (str)) - MIN_LEADING_BYTE] =
771
+ − 1293 1;
867
+ − 1294 INC_IBYTEPTR (str);
771
+ − 1295 }
+ − 1296 #endif
+ − 1297 }
+ − 1298
+ − 1299 void
867
+ − 1300 find_charsets_in_ichar_string (unsigned char *charsets, const Ichar *str,
771
+ − 1301 Charcount len)
+ − 1302 {
+ − 1303 #ifndef MULE
+ − 1304 /* Telescope this. */
+ − 1305 charsets[0] = 1;
+ − 1306 #else
+ − 1307 int i;
+ − 1308
+ − 1309 memset (charsets, 0, NUM_LEADING_BYTES);
+ − 1310
+ − 1311 /* #### SJT doesn't like this. */
+ − 1312 if (len == 0)
+ − 1313 {
+ − 1314 charsets[XCHARSET_LEADING_BYTE (Vcharset_ascii) - MIN_LEADING_BYTE] = 1;
+ − 1315 return;
+ − 1316 }
+ − 1317
+ − 1318 for (i = 0; i < len; i++)
+ − 1319 {
867
+ − 1320 charsets[ichar_leading_byte (str[i]) - MIN_LEADING_BYTE] = 1;
771
+ − 1321 }
+ − 1322 #endif
+ − 1323 }
+ − 1324
+ − 1325 int
867
+ − 1326 ibyte_string_displayed_columns (const Ibyte *str, Bytecount len)
771
+ − 1327 {
+ − 1328 int cols = 0;
867
+ − 1329 const Ibyte *end = str + len;
771
+ − 1330
+ − 1331 while (str < end)
+ − 1332 {
+ − 1333 #ifdef MULE
867
+ − 1334 Ichar ch = itext_ichar (str);
+ − 1335 cols += XCHARSET_COLUMNS (ichar_charset (ch));
771
+ − 1336 #else
+ − 1337 cols++;
+ − 1338 #endif
867
+ − 1339 INC_IBYTEPTR (str);
771
+ − 1340 }
+ − 1341
+ − 1342 return cols;
+ − 1343 }
+ − 1344
+ − 1345 int
867
+ − 1346 ichar_string_displayed_columns (const Ichar *str, Charcount len)
771
+ − 1347 {
+ − 1348 #ifdef MULE
+ − 1349 int cols = 0;
+ − 1350 int i;
+ − 1351
+ − 1352 for (i = 0; i < len; i++)
867
+ − 1353 cols += XCHARSET_COLUMNS (ichar_charset (str[i]));
771
+ − 1354
+ − 1355 return cols;
+ − 1356 #else /* not MULE */
+ − 1357 return len;
+ − 1358 #endif
+ − 1359 }
+ − 1360
+ − 1361 Charcount
867
+ − 1362 ibyte_string_nonascii_chars (const Ibyte *str, Bytecount len)
771
+ − 1363 {
+ − 1364 #ifdef MULE
867
+ − 1365 const Ibyte *end = str + len;
771
+ − 1366 Charcount retval = 0;
+ − 1367
+ − 1368 while (str < end)
+ − 1369 {
826
+ − 1370 if (!byte_ascii_p (*str))
771
+ − 1371 retval++;
867
+ − 1372 INC_IBYTEPTR (str);
771
+ − 1373 }
+ − 1374
+ − 1375 return retval;
+ − 1376 #else
+ − 1377 return 0;
+ − 1378 #endif
+ − 1379 }
+ − 1380
+ − 1381
+ − 1382 /***************************************************************************/
+ − 1383 /* Eistring helper functions */
+ − 1384 /***************************************************************************/
+ − 1385
+ − 1386 int
867
+ − 1387 eistr_casefiddle_1 (Ibyte *olddata, Bytecount len, Ibyte *newdata,
771
+ − 1388 int downp)
+ − 1389 {
867
+ − 1390 Ibyte *endp = olddata + len;
+ − 1391 Ibyte *newp = newdata;
771
+ − 1392 int changedp = 0;
+ − 1393
+ − 1394 while (olddata < endp)
+ − 1395 {
867
+ − 1396 Ichar c = itext_ichar (olddata);
+ − 1397 Ichar newc;
771
+ − 1398
+ − 1399 if (downp)
+ − 1400 newc = DOWNCASE (0, c);
+ − 1401 else
+ − 1402 newc = UPCASE (0, c);
+ − 1403
+ − 1404 if (c != newc)
+ − 1405 changedp = 1;
+ − 1406
867
+ − 1407 newp += set_itext_ichar (newp, newc);
+ − 1408 INC_IBYTEPTR (olddata);
771
+ − 1409 }
+ − 1410
+ − 1411 *newp = '\0';
+ − 1412
+ − 1413 return changedp ? newp - newdata : 0;
+ − 1414 }
+ − 1415
/* Return a buffer size that is at least NEEDED_SIZE, arrived at by
   growing OLDBUFSIZE by half (rounding down) as many times as
   necessary, with 32 as the smallest size ever produced by growing.
   If OLDBUFSIZE is already big enough it is returned unchanged. */

int
eifind_large_enough_buffer (int oldbufsize, int needed_size)
{
  while (oldbufsize < needed_size)
    {
      /* Same value as oldbufsize * 3 / 2, but without the intermediate
	 product, which overflows an int for sizes above INT_MAX / 3
	 even when the grown size itself is representable. */
      oldbufsize += oldbufsize / 2;
      if (oldbufsize < 32)
	oldbufsize = 32;
    }

  return oldbufsize;
}
+ − 1427
+ − 1428 void
+ − 1429 eito_malloc_1 (Eistring *ei)
+ − 1430 {
+ − 1431 if (ei->mallocp_)
+ − 1432 return;
+ − 1433 ei->mallocp_ = 1;
+ − 1434 if (ei->data_)
+ − 1435 {
867
+ − 1436 Ibyte *newdata;
771
+ − 1437
+ − 1438 ei->max_size_allocated_ =
+ − 1439 eifind_large_enough_buffer (0, ei->bytelen_ + 1);
867
+ − 1440 newdata = (Ibyte *) xmalloc (ei->max_size_allocated_);
771
+ − 1441 memcpy (newdata, ei->data_, ei->bytelen_ + 1);
+ − 1442 ei->data_ = newdata;
+ − 1443 }
+ − 1444
+ − 1445 if (ei->extdata_)
+ − 1446 {
+ − 1447 Extbyte *newdata = (Extbyte *) xmalloc (ei->extlen_ + 2);
+ − 1448
+ − 1449 memcpy (newdata, ei->extdata_, ei->extlen_);
+ − 1450 /* Double null-terminate in case of Unicode data */
+ − 1451 newdata[ei->extlen_] = '\0';
+ − 1452 newdata[ei->extlen_ + 1] = '\0';
+ − 1453 ei->extdata_ = newdata;
+ − 1454 }
+ − 1455 }
+ − 1456
+ − 1457 int
+ − 1458 eicmp_1 (Eistring *ei, Bytecount off, Charcount charoff,
867
+ − 1459 Bytecount len, Charcount charlen, const Ibyte *data,
771
+ − 1460 const Eistring *ei2, int is_c, int fold_case)
+ − 1461 {
+ − 1462 assert ((off < 0) != (charoff < 0));
+ − 1463 if (off < 0)
+ − 1464 {
+ − 1465 off = charcount_to_bytecount (ei->data_, charoff);
+ − 1466 if (charlen < 0)
+ − 1467 len = -1;
+ − 1468 else
+ − 1469 len = charcount_to_bytecount (ei->data_ + off, charlen);
+ − 1470 }
+ − 1471 if (len < 0)
+ − 1472 len = ei->bytelen_ - off;
+ − 1473
+ − 1474 assert (off >= 0 && off <= ei->bytelen_);
+ − 1475 assert (len >= 0 && off + len <= ei->bytelen_);
+ − 1476 assert ((data == 0) != (ei == 0));
+ − 1477 assert ((is_c != 0) == (data != 0));
+ − 1478 assert (fold_case >= 0 && fold_case <= 2);
+ − 1479
+ − 1480 {
+ − 1481 Bytecount dstlen;
867
+ − 1482 const Ibyte *src = ei->data_, *dst;
771
+ − 1483
+ − 1484 if (data)
+ − 1485 {
+ − 1486 dst = data;
+ − 1487 dstlen = qxestrlen (data);
+ − 1488 }
+ − 1489 else
+ − 1490 {
+ − 1491 dst = ei2->data_;
+ − 1492 dstlen = ei2->bytelen_;
+ − 1493 }
+ − 1494
+ − 1495 if (is_c)
+ − 1496 EI_ASSERT_ASCII ((Char_ASCII *) dst, dstlen);
+ − 1497
801
+ − 1498 return (fold_case == 0 ? qxememcmp4 (src, len, dst, dstlen) :
+ − 1499 fold_case == 1 ? qxememcasecmp4 (src, len, dst, dstlen) :
+ − 1500 qxetextcasecmp (src, len, dst, dstlen));
771
+ − 1501 }
+ − 1502 }
+ − 1503
867
+ − 1504 Ibyte *
826
+ − 1505 eicpyout_malloc_fmt (Eistring *eistr, Bytecount *len_out, Internal_Format fmt,
+ − 1506 Lisp_Object object)
771
+ − 1507 {
867
+ − 1508 Ibyte *ptr;
771
+ − 1509
+ − 1510 assert (fmt == FORMAT_DEFAULT);
867
+ − 1511 ptr = xnew_array (Ibyte, eistr->bytelen_ + 1);
771
+ − 1512 if (len_out)
+ − 1513 *len_out = eistr->bytelen_;
+ − 1514 memcpy (ptr, eistr->data_, eistr->bytelen_ + 1);
+ − 1515 return ptr;
+ − 1516 }
+ − 1517
+ − 1518
+ − 1519 /************************************************************************/
+ − 1520 /* Charcount/Bytecount conversion */
+ − 1521 /************************************************************************/
+ − 1522
+ − 1523 /* Optimization. Do it. Live it. Love it. */
+ − 1524
+ − 1525 #ifdef MULE
+ − 1526
826
/* Skip as many ASCII bytes as possible in the memory block [PTR, END).
   Return pointer to the first non-ASCII byte, or END if there is none.
   Optimized for long stretches of ASCII: the aligned middle part of the
   block is examined one machine word at a time.

   The helper macros defined inside the function are not #undef'ed
   afterwards, so they remain defined for the rest of the file. */
inline static const Ibyte *
skip_ascii (const Ibyte *ptr, const Ibyte *end)
{
  /* STRIDE_TYPE is the widest integer type flagged as efficient;
     HIGH_BIT_MASK has the top bit of every byte of that type set, so a
     word is all-ASCII exactly when (word & HIGH_BIT_MASK) is zero. */
#ifdef EFFICIENT_INT_128_BIT
# define STRIDE_TYPE INT_128_BIT
# define HIGH_BIT_MASK \
    MAKE_128_BIT_UNSIGNED_CONSTANT (0x80808080808080808080808080808080)
#elif defined (EFFICIENT_INT_64_BIT)
# define STRIDE_TYPE INT_64_BIT
# define HIGH_BIT_MASK MAKE_64_BIT_UNSIGNED_CONSTANT (0x8080808080808080)
#else
# define STRIDE_TYPE INT_32_BIT
# define HIGH_BIT_MASK MAKE_32_BIT_UNSIGNED_CONSTANT (0x80808080)
#endif

#define ALIGN_BITS ((EMACS_UINT) (ALIGNOF (STRIDE_TYPE) - 1))
#define ALIGN_MASK (~ ALIGN_BITS)
#define ALIGNED(ptr) ((((EMACS_UINT) ptr) & ALIGN_BITS) == 0)
#define STRIDE sizeof (STRIDE_TYPE)

  const unsigned STRIDE_TYPE *ascii_end;

  /* Need to do in 3 sections -- before alignment start, aligned chunk,
     after alignment end. */

  /* Section 1: single bytes up to the first aligned address. */
  while (!ALIGNED (ptr))
    {
      if (ptr == end || !byte_ascii_p (*ptr))
	return ptr;
      ptr++;
    }
  ascii_end = (const unsigned STRIDE_TYPE *) ptr;
  /* Section 2: whole words, for as long as a full word lies before END
     and none of its bytes has the high bit set.  This loop screams,
     because we can detect ASCII characters a word (4, 8 or 16 bytes) at
     a time. */
  while ((const Ibyte *) ascii_end + STRIDE <= end
	 && !(*ascii_end & HIGH_BIT_MASK))
    ascii_end++;
  ptr = (Ibyte *) ascii_end;
  /* Section 3: single bytes again, to find the exact stopping point in
     the last word looked at, or in the tail shorter than a word. */
  while (ptr < end && byte_ascii_p (*ptr))
    ptr++;
  return ptr;
}
+ − 1571
+ − 1572 /* Function equivalents of bytecount_to_charcount/charcount_to_bytecount.
+ − 1573 These work on strings of all sizes but are more efficient than a simple
+ − 1574 loop on large strings and probably less efficient on sufficiently small
+ − 1575 strings. */
+ − 1576
+ − 1577 Charcount
867
+ − 1578 bytecount_to_charcount_fun (const Ibyte *ptr, Bytecount len)
826
+ − 1579 {
+ − 1580 Charcount count = 0;
867
+ − 1581 const Ibyte *end = ptr + len;
826
+ − 1582 while (1)
+ − 1583 {
867
+ − 1584 const Ibyte *newptr = skip_ascii (ptr, end);
826
+ − 1585 count += newptr - ptr;
+ − 1586 ptr = newptr;
+ − 1587 if (ptr == end)
+ − 1588 break;
+ − 1589 {
+ − 1590 /* Optimize for successive characters from the same charset */
867
+ − 1591 Ibyte leading_byte = *ptr;
826
+ − 1592 int bytes = rep_bytes_by_first_byte (leading_byte);
+ − 1593 while (ptr < end && *ptr == leading_byte)
+ − 1594 ptr += bytes, count++;
+ − 1595 }
771
+ − 1596 }
+ − 1597
+ − 1598 /* Bomb out if the specified substring ends in the middle
+ − 1599 of a character. Note that we might have already gotten
+ − 1600 a core dump above from an invalid reference, but at least
+ − 1601 we will get no farther than here.
+ − 1602
+ − 1603 This also catches len < 0. */
800
+ − 1604 text_checking_assert (ptr == end);
771
+ − 1605
+ − 1606 return count;
+ − 1607 }
+ − 1608
+ − 1609 Bytecount
867
+ − 1610 charcount_to_bytecount_fun (const Ibyte *ptr, Charcount len)
771
+ − 1611 {
867
+ − 1612 const Ibyte *newptr = ptr;
826
+ − 1613 while (1)
771
+ − 1614 {
867
+ − 1615 const Ibyte *newnewptr = skip_ascii (newptr, newptr + len);
826
+ − 1616 len -= newnewptr - newptr;
+ − 1617 newptr = newnewptr;
+ − 1618 if (!len)
+ − 1619 break;
+ − 1620 {
+ − 1621 /* Optimize for successive characters from the same charset */
867
+ − 1622 Ibyte leading_byte = *newptr;
826
+ − 1623 int bytes = rep_bytes_by_first_byte (leading_byte);
+ − 1624 while (len > 0 && *newptr == leading_byte)
+ − 1625 newptr += bytes, len--;
+ − 1626 }
771
+ − 1627 }
+ − 1628 return newptr - ptr;
+ − 1629 }
+ − 1630
+ − 1631 /* The next two functions are the actual meat behind the
+ − 1632 charbpos-to-bytebpos and bytebpos-to-charbpos conversions. Currently
+ − 1633 the method they use is fairly unsophisticated; see buffer.h.
+ − 1634
+ − 1635 Note that charbpos_to_bytebpos_func() is probably the most-called
+ − 1636 function in all of XEmacs. Therefore, it must be FAST FAST FAST.
+ − 1637 This is the reason why so much of the code is duplicated.
+ − 1638
+ − 1639 Similar considerations apply to bytebpos_to_charbpos_func(), although
+ − 1640 less so because the function is not called so often.
+ − 1641
+ − 1642 #### At some point this should use a more sophisticated method;
+ − 1643 see buffer.h. */
+ − 1644
+ − 1645 static int not_very_random_number;
+ − 1646
+ − 1647 Bytebpos
+ − 1648 charbpos_to_bytebpos_func (struct buffer *buf, Charbpos x)
+ − 1649 {
+ − 1650 Charbpos bufmin;
+ − 1651 Charbpos bufmax;
+ − 1652 Bytebpos bytmin;
+ − 1653 Bytebpos bytmax;
+ − 1654 int size;
+ − 1655 int forward_p;
+ − 1656 Bytebpos retval;
+ − 1657 int diff_so_far;
+ − 1658 int add_to_cache = 0;
1292
+ − 1659 PROFILE_DECLARE ();
771
+ − 1660
+ − 1661 /* Check for some cached positions, for speed. */
+ − 1662 if (x == BUF_PT (buf))
826
+ − 1663 return BYTE_BUF_PT (buf);
771
+ − 1664 if (x == BUF_ZV (buf))
826
+ − 1665 return BYTE_BUF_ZV (buf);
771
+ − 1666 if (x == BUF_BEGV (buf))
826
+ − 1667 return BYTE_BUF_BEGV (buf);
771
+ − 1668
1292
+ − 1669 PROFILE_RECORD_ENTERING_SECTION (QSin_char_byte_conversion);
+ − 1670
771
+ − 1671 bufmin = buf->text->mule_bufmin;
+ − 1672 bufmax = buf->text->mule_bufmax;
+ − 1673 bytmin = buf->text->mule_bytmin;
+ − 1674 bytmax = buf->text->mule_bytmax;
+ − 1675 size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p;
+ − 1676
+ − 1677 /* The basic idea here is that we shift the "known region" up or down
+ − 1678 until it overlaps the specified position. We do this by moving
+ − 1679 the upper bound of the known region up one character at a time,
+ − 1680 and moving the lower bound of the known region up as necessary
+ − 1681 when the size of the character just seen changes.
+ − 1682
+ − 1683 We optimize this, however, by first shifting the known region to
+ − 1684 one of the cached points if it's close by. (We don't check BEG or
+ − 1685 Z, even though they're cached; most of the time these will be the
+ − 1686 same as BEGV and ZV, and when they're not, they're not likely
+ − 1687 to be used.) */
+ − 1688
+ − 1689 if (x > bufmax)
+ − 1690 {
+ − 1691 Charbpos diffmax = x - bufmax;
+ − 1692 Charbpos diffpt = x - BUF_PT (buf);
+ − 1693 Charbpos diffzv = BUF_ZV (buf) - x;
+ − 1694 /* #### This value could stand some more exploration. */
+ − 1695 Charcount heuristic_hack = (bufmax - bufmin) >> 2;
+ − 1696
+ − 1697 /* Check if the position is closer to PT or ZV than to the
+ − 1698 end of the known region. */
+ − 1699
+ − 1700 if (diffpt < 0)
+ − 1701 diffpt = -diffpt;
+ − 1702 if (diffzv < 0)
+ − 1703 diffzv = -diffzv;
+ − 1704
+ − 1705 /* But also implement a heuristic that favors the known region
+ − 1706 over PT or ZV. The reason for this is that switching to
+ − 1707 PT or ZV will wipe out the knowledge in the known region,
+ − 1708 which might be annoying if the known region is large and
+ − 1709 PT or ZV is not that much closer than the end of the known
+ − 1710 region. */
+ − 1711
+ − 1712 diffzv += heuristic_hack;
+ − 1713 diffpt += heuristic_hack;
+ − 1714 if (diffpt < diffmax && diffpt <= diffzv)
+ − 1715 {
+ − 1716 bufmax = bufmin = BUF_PT (buf);
826
+ − 1717 bytmax = bytmin = BYTE_BUF_PT (buf);
771
+ − 1718 /* We set the size to 1 even though it doesn't really
+ − 1719 matter because the new known region contains no
+ − 1720 characters. We do this because this is the most
+ − 1721 likely size of the characters around the new known
+ − 1722 region, and we avoid potential yuckiness that is
+ − 1723 done when size == 3. */
+ − 1724 size = 1;
+ − 1725 }
+ − 1726 if (diffzv < diffmax)
+ − 1727 {
+ − 1728 bufmax = bufmin = BUF_ZV (buf);
826
+ − 1729 bytmax = bytmin = BYTE_BUF_ZV (buf);
771
+ − 1730 size = 1;
+ − 1731 }
+ − 1732 }
800
+ − 1733 #ifdef ERROR_CHECK_TEXT
771
+ − 1734 else if (x >= bufmin)
+ − 1735 abort ();
+ − 1736 #endif
+ − 1737 else
+ − 1738 {
+ − 1739 Charbpos diffmin = bufmin - x;
+ − 1740 Charbpos diffpt = BUF_PT (buf) - x;
+ − 1741 Charbpos diffbegv = x - BUF_BEGV (buf);
+ − 1742 /* #### This value could stand some more exploration. */
+ − 1743 Charcount heuristic_hack = (bufmax - bufmin) >> 2;
+ − 1744
+ − 1745 if (diffpt < 0)
+ − 1746 diffpt = -diffpt;
+ − 1747 if (diffbegv < 0)
+ − 1748 diffbegv = -diffbegv;
+ − 1749
+ − 1750 /* But also implement a heuristic that favors the known region --
+ − 1751 see above. */
+ − 1752
+ − 1753 diffbegv += heuristic_hack;
+ − 1754 diffpt += heuristic_hack;
+ − 1755
+ − 1756 if (diffpt < diffmin && diffpt <= diffbegv)
+ − 1757 {
+ − 1758 bufmax = bufmin = BUF_PT (buf);
826
+ − 1759 bytmax = bytmin = BYTE_BUF_PT (buf);
771
+ − 1760 /* We set the size to 1 even though it doesn't really
+ − 1761 matter because the new known region contains no
+ − 1762 characters. We do this because this is the most
+ − 1763 likely size of the characters around the new known
+ − 1764 region, and we avoid potential yuckiness that is
+ − 1765 done when size == 3. */
+ − 1766 size = 1;
+ − 1767 }
+ − 1768 if (diffbegv < diffmin)
+ − 1769 {
+ − 1770 bufmax = bufmin = BUF_BEGV (buf);
826
+ − 1771 bytmax = bytmin = BYTE_BUF_BEGV (buf);
771
+ − 1772 size = 1;
+ − 1773 }
+ − 1774 }
+ − 1775
+ − 1776 diff_so_far = x > bufmax ? x - bufmax : bufmin - x;
+ − 1777 if (diff_so_far > 50)
+ − 1778 {
+ − 1779 /* If we have to move more than a certain amount, then look
+ − 1780 into our cache. */
+ − 1781 int minval = INT_MAX;
+ − 1782 int found = 0;
+ − 1783 int i;
+ − 1784
+ − 1785 add_to_cache = 1;
+ − 1786 /* I considered keeping the positions ordered. This would speed
+ − 1787 up this loop, but updating the cache would take longer, so
+ − 1788 it doesn't seem like it would really matter. */
+ − 1789 for (i = 0; i < 16; i++)
+ − 1790 {
+ − 1791 int diff = buf->text->mule_charbpos_cache[i] - x;
+ − 1792
+ − 1793 if (diff < 0)
+ − 1794 diff = -diff;
+ − 1795 if (diff < minval)
+ − 1796 {
+ − 1797 minval = diff;
+ − 1798 found = i;
+ − 1799 }
+ − 1800 }
+ − 1801
+ − 1802 if (minval < diff_so_far)
+ − 1803 {
+ − 1804 bufmax = bufmin = buf->text->mule_charbpos_cache[found];
+ − 1805 bytmax = bytmin = buf->text->mule_bytebpos_cache[found];
+ − 1806 size = 1;
+ − 1807 }
+ − 1808 }
+ − 1809
+ − 1810 /* It's conceivable that the caching above could lead to X being
+ − 1811 the same as one of the range edges. */
+ − 1812 if (x >= bufmax)
+ − 1813 {
+ − 1814 Bytebpos newmax;
+ − 1815 Bytecount newsize;
+ − 1816
+ − 1817 forward_p = 1;
+ − 1818 while (x > bufmax)
+ − 1819 {
+ − 1820 newmax = bytmax;
+ − 1821
+ − 1822 INC_BYTEBPOS (buf, newmax);
+ − 1823 newsize = newmax - bytmax;
+ − 1824 if (newsize != size)
+ − 1825 {
+ − 1826 bufmin = bufmax;
+ − 1827 bytmin = bytmax;
+ − 1828 size = newsize;
+ − 1829 }
+ − 1830 bytmax = newmax;
+ − 1831 bufmax++;
+ − 1832 }
+ − 1833 retval = bytmax;
+ − 1834
+ − 1835 /* #### Should go past the found location to reduce the number
+ − 1836 of times that this function is called */
+ − 1837 }
+ − 1838 else /* x < bufmin */
+ − 1839 {
+ − 1840 Bytebpos newmin;
+ − 1841 Bytecount newsize;
+ − 1842
+ − 1843 forward_p = 0;
+ − 1844 while (x < bufmin)
+ − 1845 {
+ − 1846 newmin = bytmin;
+ − 1847
+ − 1848 DEC_BYTEBPOS (buf, newmin);
+ − 1849 newsize = bytmin - newmin;
+ − 1850 if (newsize != size)
+ − 1851 {
+ − 1852 bufmax = bufmin;
+ − 1853 bytmax = bytmin;
+ − 1854 size = newsize;
+ − 1855 }
+ − 1856 bytmin = newmin;
+ − 1857 bufmin--;
+ − 1858 }
+ − 1859 retval = bytmin;
+ − 1860
+ − 1861 /* #### Should go past the found location to reduce the number
+ − 1862 of times that this function is called
+ − 1863 */
+ − 1864 }
+ − 1865
+ − 1866 /* If size is three, than we have to max sure that the range we
+ − 1867 discovered isn't too large, because we use a fixed-length
+ − 1868 table to divide by 3. */
+ − 1869
+ − 1870 if (size == 3)
+ − 1871 {
+ − 1872 int gap = bytmax - bytmin;
+ − 1873 buf->text->mule_three_p = 1;
+ − 1874 buf->text->mule_shifter = 1;
+ − 1875
+ − 1876 if (gap > MAX_BYTEBPOS_GAP_SIZE_3)
+ − 1877 {
+ − 1878 if (forward_p)
+ − 1879 {
+ − 1880 bytmin = bytmax - MAX_BYTEBPOS_GAP_SIZE_3;
+ − 1881 bufmin = bufmax - MAX_CHARBPOS_GAP_SIZE_3;
+ − 1882 }
+ − 1883 else
+ − 1884 {
+ − 1885 bytmax = bytmin + MAX_BYTEBPOS_GAP_SIZE_3;
+ − 1886 bufmax = bufmin + MAX_CHARBPOS_GAP_SIZE_3;
+ − 1887 }
+ − 1888 }
+ − 1889 }
+ − 1890 else
+ − 1891 {
+ − 1892 buf->text->mule_three_p = 0;
+ − 1893 if (size == 4)
+ − 1894 buf->text->mule_shifter = 2;
+ − 1895 else
+ − 1896 buf->text->mule_shifter = size - 1;
+ − 1897 }
+ − 1898
+ − 1899 buf->text->mule_bufmin = bufmin;
+ − 1900 buf->text->mule_bufmax = bufmax;
+ − 1901 buf->text->mule_bytmin = bytmin;
+ − 1902 buf->text->mule_bytmax = bytmax;
+ − 1903
+ − 1904 if (add_to_cache)
+ − 1905 {
+ − 1906 int replace_loc;
+ − 1907
+ − 1908 /* We throw away a "random" cached value and replace it with
+ − 1909 the new value. It doesn't actually have to be very random
+ − 1910 at all, just evenly distributed.
+ − 1911
+ − 1912 #### It would be better to use a least-recently-used algorithm
+ − 1913 or something that tries to space things out, but I'm not sure
+ − 1914 it's worth it to go to the trouble of maintaining that. */
+ − 1915 not_very_random_number += 621;
+ − 1916 replace_loc = not_very_random_number & 15;
+ − 1917 buf->text->mule_charbpos_cache[replace_loc] = x;
+ − 1918 buf->text->mule_bytebpos_cache[replace_loc] = retval;
+ − 1919 }
+ − 1920
1292
+ − 1921 PROFILE_RECORD_EXITING_SECTION (QSin_char_byte_conversion);
+ − 1922
771
+ − 1923 return retval;
+ − 1924 }
+ − 1925
+ − 1926 /* The logic in this function is almost identical to the logic in
+ − 1927 the previous function. */
+ − 1928
+ − 1929 Charbpos
+ − 1930 bytebpos_to_charbpos_func (struct buffer *buf, Bytebpos x)
+ − 1931 {
+ − 1932 Charbpos bufmin;
+ − 1933 Charbpos bufmax;
+ − 1934 Bytebpos bytmin;
+ − 1935 Bytebpos bytmax;
+ − 1936 int size;
+ − 1937 int forward_p;
+ − 1938 Charbpos retval;
+ − 1939 int diff_so_far;
+ − 1940 int add_to_cache = 0;
1292
+ − 1941 PROFILE_DECLARE ();
771
+ − 1942
+ − 1943 /* Check for some cached positions, for speed. */
826
+ − 1944 if (x == BYTE_BUF_PT (buf))
771
+ − 1945 return BUF_PT (buf);
826
+ − 1946 if (x == BYTE_BUF_ZV (buf))
771
+ − 1947 return BUF_ZV (buf);
826
+ − 1948 if (x == BYTE_BUF_BEGV (buf))
771
+ − 1949 return BUF_BEGV (buf);
+ − 1950
1292
+ − 1951 PROFILE_RECORD_ENTERING_SECTION (QSin_char_byte_conversion);
+ − 1952
771
+ − 1953 bufmin = buf->text->mule_bufmin;
+ − 1954 bufmax = buf->text->mule_bufmax;
+ − 1955 bytmin = buf->text->mule_bytmin;
+ − 1956 bytmax = buf->text->mule_bytmax;
+ − 1957 size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p;
+ − 1958
+ − 1959 /* The basic idea here is that we shift the "known region" up or down
+ − 1960 until it overlaps the specified position. We do this by moving
+ − 1961 the upper bound of the known region up one character at a time,
+ − 1962 and moving the lower bound of the known region up as necessary
+ − 1963 when the size of the character just seen changes.
+ − 1964
+ − 1965 We optimize this, however, by first shifting the known region to
826
+ − 1966 one of the cached points if it's close by. (We don't check BYTE_BEG or
+ − 1967 BYTE_Z, even though they're cached; most of the time these will be the
+ − 1968 same as BYTE_BEGV and BYTE_ZV, and when they're not, they're not likely
771
+ − 1969 to be used.) */
+ − 1970
+ − 1971 if (x > bytmax)
+ − 1972 {
+ − 1973 Bytebpos diffmax = x - bytmax;
826
+ − 1974 Bytebpos diffpt = x - BYTE_BUF_PT (buf);
+ − 1975 Bytebpos diffzv = BYTE_BUF_ZV (buf) - x;
771
+ − 1976 /* #### This value could stand some more exploration. */
+ − 1977 Bytecount heuristic_hack = (bytmax - bytmin) >> 2;
+ − 1978
+ − 1979 /* Check if the position is closer to PT or ZV than to the
+ − 1980 end of the known region. */
+ − 1981
+ − 1982 if (diffpt < 0)
+ − 1983 diffpt = -diffpt;
+ − 1984 if (diffzv < 0)
+ − 1985 diffzv = -diffzv;
+ − 1986
+ − 1987 /* But also implement a heuristic that favors the known region
826
+ − 1988 over BYTE_PT or BYTE_ZV. The reason for this is that switching to
+ − 1989 BYTE_PT or BYTE_ZV will wipe out the knowledge in the known region,
771
+ − 1990 which might be annoying if the known region is large and
826
+ − 1991 BYTE_PT or BYTE_ZV is not that much closer than the end of the known
771
+ − 1992 region. */
+ − 1993
+ − 1994 diffzv += heuristic_hack;
+ − 1995 diffpt += heuristic_hack;
+ − 1996 if (diffpt < diffmax && diffpt <= diffzv)
+ − 1997 {
+ − 1998 bufmax = bufmin = BUF_PT (buf);
826
+ − 1999 bytmax = bytmin = BYTE_BUF_PT (buf);
771
+ − 2000 /* We set the size to 1 even though it doesn't really
+ − 2001 matter because the new known region contains no
+ − 2002 characters. We do this because this is the most
+ − 2003 likely size of the characters around the new known
+ − 2004 region, and we avoid potential yuckiness that is
+ − 2005 done when size == 3. */
+ − 2006 size = 1;
+ − 2007 }
+ − 2008 if (diffzv < diffmax)
+ − 2009 {
+ − 2010 bufmax = bufmin = BUF_ZV (buf);
826
+ − 2011 bytmax = bytmin = BYTE_BUF_ZV (buf);
771
+ − 2012 size = 1;
+ − 2013 }
+ − 2014 }
800
+ − 2015 #ifdef ERROR_CHECK_TEXT
771
+ − 2016 else if (x >= bytmin)
+ − 2017 abort ();
+ − 2018 #endif
+ − 2019 else
+ − 2020 {
+ − 2021 Bytebpos diffmin = bytmin - x;
826
+ − 2022 Bytebpos diffpt = BYTE_BUF_PT (buf) - x;
+ − 2023 Bytebpos diffbegv = x - BYTE_BUF_BEGV (buf);
771
+ − 2024 /* #### This value could stand some more exploration. */
+ − 2025 Bytecount heuristic_hack = (bytmax - bytmin) >> 2;
+ − 2026
+ − 2027 if (diffpt < 0)
+ − 2028 diffpt = -diffpt;
+ − 2029 if (diffbegv < 0)
+ − 2030 diffbegv = -diffbegv;
+ − 2031
+ − 2032 /* But also implement a heuristic that favors the known region --
+ − 2033 see above. */
+ − 2034
+ − 2035 diffbegv += heuristic_hack;
+ − 2036 diffpt += heuristic_hack;
+ − 2037
+ − 2038 if (diffpt < diffmin && diffpt <= diffbegv)
+ − 2039 {
+ − 2040 bufmax = bufmin = BUF_PT (buf);
826
+ − 2041 bytmax = bytmin = BYTE_BUF_PT (buf);
771
+ − 2042 /* We set the size to 1 even though it doesn't really
+ − 2043 matter because the new known region contains no
+ − 2044 characters. We do this because this is the most
+ − 2045 likely size of the characters around the new known
+ − 2046 region, and we avoid potential yuckiness that is
+ − 2047 done when size == 3. */
+ − 2048 size = 1;
+ − 2049 }
+ − 2050 if (diffbegv < diffmin)
+ − 2051 {
+ − 2052 bufmax = bufmin = BUF_BEGV (buf);
826
+ − 2053 bytmax = bytmin = BYTE_BUF_BEGV (buf);
771
+ − 2054 size = 1;
+ − 2055 }
+ − 2056 }
+ − 2057
+ − 2058 diff_so_far = x > bytmax ? x - bytmax : bytmin - x;
+ − 2059 if (diff_so_far > 50)
+ − 2060 {
+ − 2061 /* If we have to move more than a certain amount, then look
+ − 2062 into our cache. */
+ − 2063 int minval = INT_MAX;
+ − 2064 int found = 0;
+ − 2065 int i;
+ − 2066
+ − 2067 add_to_cache = 1;
+ − 2068 /* I considered keeping the positions ordered. This would speed
+ − 2069 up this loop, but updating the cache would take longer, so
+ − 2070 it doesn't seem like it would really matter. */
+ − 2071 for (i = 0; i < 16; i++)
+ − 2072 {
+ − 2073 int diff = buf->text->mule_bytebpos_cache[i] - x;
+ − 2074
+ − 2075 if (diff < 0)
+ − 2076 diff = -diff;
+ − 2077 if (diff < minval)
+ − 2078 {
+ − 2079 minval = diff;
+ − 2080 found = i;
+ − 2081 }
+ − 2082 }
+ − 2083
+ − 2084 if (minval < diff_so_far)
+ − 2085 {
+ − 2086 bufmax = bufmin = buf->text->mule_charbpos_cache[found];
+ − 2087 bytmax = bytmin = buf->text->mule_bytebpos_cache[found];
+ − 2088 size = 1;
+ − 2089 }
+ − 2090 }
+ − 2091
+ − 2092 /* It's conceivable that the caching above could lead to X being
+ − 2093 the same as one of the range edges. */
+ − 2094 if (x >= bytmax)
+ − 2095 {
+ − 2096 Bytebpos newmax;
+ − 2097 Bytecount newsize;
+ − 2098
+ − 2099 forward_p = 1;
+ − 2100 while (x > bytmax)
+ − 2101 {
+ − 2102 newmax = bytmax;
+ − 2103
+ − 2104 INC_BYTEBPOS (buf, newmax);
+ − 2105 newsize = newmax - bytmax;
+ − 2106 if (newsize != size)
+ − 2107 {
+ − 2108 bufmin = bufmax;
+ − 2109 bytmin = bytmax;
+ − 2110 size = newsize;
+ − 2111 }
+ − 2112 bytmax = newmax;
+ − 2113 bufmax++;
+ − 2114 }
+ − 2115 retval = bufmax;
+ − 2116
+ − 2117 /* #### Should go past the found location to reduce the number
+ − 2118 of times that this function is called */
+ − 2119 }
+ − 2120 else /* x <= bytmin */
+ − 2121 {
+ − 2122 Bytebpos newmin;
+ − 2123 Bytecount newsize;
+ − 2124
+ − 2125 forward_p = 0;
+ − 2126 while (x < bytmin)
+ − 2127 {
+ − 2128 newmin = bytmin;
+ − 2129
+ − 2130 DEC_BYTEBPOS (buf, newmin);
+ − 2131 newsize = bytmin - newmin;
+ − 2132 if (newsize != size)
+ − 2133 {
+ − 2134 bufmax = bufmin;
+ − 2135 bytmax = bytmin;
+ − 2136 size = newsize;
+ − 2137 }
+ − 2138 bytmin = newmin;
+ − 2139 bufmin--;
+ − 2140 }
+ − 2141 retval = bufmin;
+ − 2142
+ − 2143 /* #### Should go past the found location to reduce the number
+ − 2144 of times that this function is called
+ − 2145 */
+ − 2146 }
+ − 2147
+ − 2148 /* If size is three, than we have to max sure that the range we
+ − 2149 discovered isn't too large, because we use a fixed-length
+ − 2150 table to divide by 3. */
+ − 2151
+ − 2152 if (size == 3)
+ − 2153 {
+ − 2154 int gap = bytmax - bytmin;
+ − 2155 buf->text->mule_three_p = 1;
+ − 2156 buf->text->mule_shifter = 1;
+ − 2157
+ − 2158 if (gap > MAX_BYTEBPOS_GAP_SIZE_3)
+ − 2159 {
+ − 2160 if (forward_p)
+ − 2161 {
+ − 2162 bytmin = bytmax - MAX_BYTEBPOS_GAP_SIZE_3;
+ − 2163 bufmin = bufmax - MAX_CHARBPOS_GAP_SIZE_3;
+ − 2164 }
+ − 2165 else
+ − 2166 {
+ − 2167 bytmax = bytmin + MAX_BYTEBPOS_GAP_SIZE_3;
+ − 2168 bufmax = bufmin + MAX_CHARBPOS_GAP_SIZE_3;
+ − 2169 }
+ − 2170 }
+ − 2171 }
+ − 2172 else
+ − 2173 {
+ − 2174 buf->text->mule_three_p = 0;
+ − 2175 if (size == 4)
+ − 2176 buf->text->mule_shifter = 2;
+ − 2177 else
+ − 2178 buf->text->mule_shifter = size - 1;
+ − 2179 }
+ − 2180
+ − 2181 buf->text->mule_bufmin = bufmin;
+ − 2182 buf->text->mule_bufmax = bufmax;
+ − 2183 buf->text->mule_bytmin = bytmin;
+ − 2184 buf->text->mule_bytmax = bytmax;
+ − 2185
+ − 2186 if (add_to_cache)
+ − 2187 {
+ − 2188 int replace_loc;
+ − 2189
+ − 2190 /* We throw away a "random" cached value and replace it with
+ − 2191 the new value. It doesn't actually have to be very random
+ − 2192 at all, just evenly distributed.
+ − 2193
+ − 2194 #### It would be better to use a least-recently-used algorithm
+ − 2195 or something that tries to space things out, but I'm not sure
+ − 2196 it's worth it to go to the trouble of maintaining that. */
+ − 2197 not_very_random_number += 621;
+ − 2198 replace_loc = not_very_random_number & 15;
+ − 2199 buf->text->mule_charbpos_cache[replace_loc] = retval;
+ − 2200 buf->text->mule_bytebpos_cache[replace_loc] = x;
+ − 2201 }
+ − 2202
1292
+ − 2203 PROFILE_RECORD_EXITING_SECTION (QSin_char_byte_conversion);
+ − 2204
771
+ − 2205 return retval;
+ − 2206 }
+ − 2207
+ − 2208 /* Text of length BYTELENGTH and CHARLENGTH (in different units)
+ − 2209 was inserted at charbpos START. */
+ − 2210
+ − 2211 void
+ − 2212 buffer_mule_signal_inserted_region (struct buffer *buf, Charbpos start,
+ − 2213 Bytecount bytelength,
+ − 2214 Charcount charlength)
+ − 2215 {
+ − 2216 int size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p;
+ − 2217 int i;
+ − 2218
+ − 2219 /* Adjust the cache of known positions. */
+ − 2220 for (i = 0; i < 16; i++)
+ − 2221 {
+ − 2222
+ − 2223 if (buf->text->mule_charbpos_cache[i] > start)
+ − 2224 {
+ − 2225 buf->text->mule_charbpos_cache[i] += charlength;
+ − 2226 buf->text->mule_bytebpos_cache[i] += bytelength;
+ − 2227 }
+ − 2228 }
+ − 2229
+ − 2230 if (start >= buf->text->mule_bufmax)
826
+ − 2231 return;
771
+ − 2232
+ − 2233 /* The insertion is either before the known region, in which case
+ − 2234 it shoves it forward; or within the known region, in which case
+ − 2235 it shoves the end forward. (But it may make the known region
+ − 2236 inconsistent, so we may have to shorten it.) */
+ − 2237
+ − 2238 if (start <= buf->text->mule_bufmin)
+ − 2239 {
+ − 2240 buf->text->mule_bufmin += charlength;
+ − 2241 buf->text->mule_bufmax += charlength;
+ − 2242 buf->text->mule_bytmin += bytelength;
+ − 2243 buf->text->mule_bytmax += bytelength;
+ − 2244 }
+ − 2245 else
+ − 2246 {
+ − 2247 Charbpos end = start + charlength;
+ − 2248 /* the insertion point divides the known region in two.
+ − 2249 Keep the longer half, at least, and expand into the
+ − 2250 inserted chunk as much as possible. */
+ − 2251
+ − 2252 if (start - buf->text->mule_bufmin > buf->text->mule_bufmax - start)
+ − 2253 {
+ − 2254 Bytebpos bytestart = (buf->text->mule_bytmin
+ − 2255 + size * (start - buf->text->mule_bufmin));
+ − 2256 Bytebpos bytenew;
+ − 2257
+ − 2258 while (start < end)
+ − 2259 {
+ − 2260 bytenew = bytestart;
+ − 2261 INC_BYTEBPOS (buf, bytenew);
+ − 2262 if (bytenew - bytestart != size)
+ − 2263 break;
+ − 2264 start++;
+ − 2265 bytestart = bytenew;
+ − 2266 }
+ − 2267 if (start != end)
+ − 2268 {
+ − 2269 buf->text->mule_bufmax = start;
+ − 2270 buf->text->mule_bytmax = bytestart;
+ − 2271 }
+ − 2272 else
+ − 2273 {
+ − 2274 buf->text->mule_bufmax += charlength;
+ − 2275 buf->text->mule_bytmax += bytelength;
+ − 2276 }
+ − 2277 }
+ − 2278 else
+ − 2279 {
+ − 2280 Bytebpos byteend = (buf->text->mule_bytmin
+ − 2281 + size * (start - buf->text->mule_bufmin)
+ − 2282 + bytelength);
+ − 2283 Bytebpos bytenew;
+ − 2284
+ − 2285 buf->text->mule_bufmax += charlength;
+ − 2286 buf->text->mule_bytmax += bytelength;
+ − 2287
+ − 2288 while (end > start)
+ − 2289 {
+ − 2290 bytenew = byteend;
+ − 2291 DEC_BYTEBPOS (buf, bytenew);
+ − 2292 if (byteend - bytenew != size)
+ − 2293 break;
+ − 2294 end--;
+ − 2295 byteend = bytenew;
+ − 2296 }
+ − 2297 if (start != end)
+ − 2298 {
+ − 2299 buf->text->mule_bufmin = end;
+ − 2300 buf->text->mule_bytmin = byteend;
+ − 2301 }
+ − 2302 }
+ − 2303 }
+ − 2304 }
+ − 2305
826
+ − 2306 /* Text from START to END (equivalent in Bytebpos's: from BYTE_START to
+ − 2307 BYTE_END) was deleted. */
771
+ − 2308
+ − 2309 void
+ − 2310 buffer_mule_signal_deleted_region (struct buffer *buf, Charbpos start,
826
+ − 2311 Charbpos end, Bytebpos byte_start,
+ − 2312 Bytebpos byte_end)
771
+ − 2313 {
+ − 2314 int i;
+ − 2315
+ − 2316 /* Adjust the cache of known positions. */
+ − 2317 for (i = 0; i < 16; i++)
+ − 2318 {
+ − 2319 /* After the end; gets shoved backward */
+ − 2320 if (buf->text->mule_charbpos_cache[i] > end)
+ − 2321 {
+ − 2322 buf->text->mule_charbpos_cache[i] -= end - start;
826
+ − 2323 buf->text->mule_bytebpos_cache[i] -= byte_end - byte_start;
771
+ − 2324 }
+ − 2325 /* In the range; moves to start of range */
+ − 2326 else if (buf->text->mule_charbpos_cache[i] > start)
+ − 2327 {
+ − 2328 buf->text->mule_charbpos_cache[i] = start;
826
+ − 2329 buf->text->mule_bytebpos_cache[i] = byte_start;
771
+ − 2330 }
+ − 2331 }
+ − 2332
+ − 2333 /* We don't care about any text after the end of the known region. */
+ − 2334
+ − 2335 end = min (end, buf->text->mule_bufmax);
826
+ − 2336 byte_end = min (byte_end, buf->text->mule_bytmax);
771
+ − 2337 if (start >= end)
826
+ − 2338 return;
771
+ − 2339
+ − 2340 /* The end of the known region offsets by the total amount of deletion,
+ − 2341 since it's all before it. */
+ − 2342
+ − 2343 buf->text->mule_bufmax -= end - start;
826
+ − 2344 buf->text->mule_bytmax -= byte_end - byte_start;
771
+ − 2345
+ − 2346 /* Now we don't care about any text after the start of the known region. */
+ − 2347
+ − 2348 end = min (end, buf->text->mule_bufmin);
826
+ − 2349 byte_end = min (byte_end, buf->text->mule_bytmin);
771
+ − 2350 if (start < end)
+ − 2351 {
+ − 2352 buf->text->mule_bufmin -= end - start;
826
+ − 2353 buf->text->mule_bytmin -= byte_end - byte_start;
771
+ − 2354 }
+ − 2355 }
+ − 2356
+ − 2357 #endif /* MULE */
+ − 2358
+ − 2359
+ − 2360 /************************************************************************/
+ − 2361 /* verifying buffer and string positions */
+ − 2362 /************************************************************************/
+ − 2363
+ − 2364 /* Functions below are tagged with either _byte or _char indicating
+ − 2365 whether they return byte or character positions. For a buffer,
+ − 2366 a character position is a "Charbpos" and a byte position is a "Bytebpos".
+ − 2367 For strings, these are sometimes typed using "Charcount" and
+ − 2368 "Bytecount". */
+ − 2369
+ − 2370 /* Flags for the functions below are:
+ − 2371
+ − 2372 GB_ALLOW_PAST_ACCESSIBLE
+ − 2373
+ − 2374 Allow positions to range over the entire buffer (BUF_BEG to BUF_Z),
+ − 2375 rather than just the accessible portion (BUF_BEGV to BUF_ZV).
+ − 2376 For strings, this flag has no effect.
+ − 2377
+ − 2378 GB_COERCE_RANGE
+ − 2379
+ − 2380 If the position is outside the allowable range, return the lower
+ − 2381 or upper bound of the range, whichever is closer to the specified
+ − 2382 position.
+ − 2383
+ − 2384 GB_NO_ERROR_IF_BAD
+ − 2385
+ − 2386 If the position is outside the allowable range, return -1.
+ − 2387
+ − 2388 GB_NEGATIVE_FROM_END
+ − 2389
+ − 2390 If a value is negative, treat it as an offset from the end.
+ − 2391 Only applies to strings.
+ − 2392
+ − 2393 The following additional flags apply only to the functions
+ − 2394 that return ranges:
+ − 2395
+ − 2396 GB_ALLOW_NIL
+ − 2397
+ − 2398 Either or both positions can be nil. If FROM is nil,
+ − 2399 FROM_OUT will contain the lower bound of the allowed range.
+ − 2400 If TO is nil, TO_OUT will contain the upper bound of the
+ − 2401 allowed range.
+ − 2402
+ − 2403 GB_CHECK_ORDER
+ − 2404
+ − 2405 FROM must contain the lower bound and TO the upper bound
+ − 2406 of the range. If the positions are reversed, an error is
+ − 2407 signalled.
+ − 2408
+ − 2409 The following is a combination flag:
+ − 2410
+ − 2411 GB_HISTORICAL_STRING_BEHAVIOR
+ − 2412
+ − 2413 Equivalent to (GB_NEGATIVE_FROM_END | GB_ALLOW_NIL).
+ − 2414 */
+ − 2415
+ − 2416 /* Return a buffer position stored in a Lisp_Object. Full
+ − 2417 error-checking is done on the position. Flags can be specified to
+ − 2418 control the behavior of out-of-range values. The default behavior
+ − 2419 is to require that the position is within the accessible part of
+ − 2420 the buffer (BEGV and ZV), and to signal an error if the position is
+ − 2421 out of range.
+ − 2422
+ − 2423 */
+ − 2424
+ − 2425 Charbpos
+ − 2426 get_buffer_pos_char (struct buffer *b, Lisp_Object pos, unsigned int flags)
+ − 2427 {
+ − 2428 /* Does not GC */
+ − 2429 Charbpos ind;
+ − 2430 Charbpos min_allowed, max_allowed;
+ − 2431
+ − 2432 CHECK_INT_COERCE_MARKER (pos);
+ − 2433 ind = XINT (pos);
+ − 2434 min_allowed = flags & GB_ALLOW_PAST_ACCESSIBLE ? BUF_BEG (b) : BUF_BEGV (b);
+ − 2435 max_allowed = flags & GB_ALLOW_PAST_ACCESSIBLE ? BUF_Z (b) : BUF_ZV (b);
+ − 2436
+ − 2437 if (ind < min_allowed || ind > max_allowed)
+ − 2438 {
+ − 2439 if (flags & GB_COERCE_RANGE)
+ − 2440 ind = ind < min_allowed ? min_allowed : max_allowed;
+ − 2441 else if (flags & GB_NO_ERROR_IF_BAD)
+ − 2442 ind = -1;
+ − 2443 else
+ − 2444 {
793
+ − 2445 Lisp_Object buffer = wrap_buffer (b);
+ − 2446
771
+ − 2447 args_out_of_range (buffer, pos);
+ − 2448 }
+ − 2449 }
+ − 2450
+ − 2451 return ind;
+ − 2452 }
+ − 2453
+ − 2454 Bytebpos
+ − 2455 get_buffer_pos_byte (struct buffer *b, Lisp_Object pos, unsigned int flags)
+ − 2456 {
+ − 2457 Charbpos bpos = get_buffer_pos_char (b, pos, flags);
+ − 2458 if (bpos < 0) /* could happen with GB_NO_ERROR_IF_BAD */
+ − 2459 return -1;
+ − 2460 return charbpos_to_bytebpos (b, bpos);
+ − 2461 }
+ − 2462
+ − 2463 /* Return a pair of buffer positions representing a range of text,
+ − 2464 taken from a pair of Lisp_Objects. Full error-checking is
+ − 2465 done on the positions. Flags can be specified to control the
+ − 2466 behavior of out-of-range values. The default behavior is to
+ − 2467 allow the range bounds to be specified in either order
+ − 2468 (however, FROM_OUT will always be the lower bound of the range
+ − 2469 and TO_OUT the upper bound),to require that the positions
+ − 2470 are within the accessible part of the buffer (BEGV and ZV),
+ − 2471 and to signal an error if the positions are out of range.
+ − 2472 */
+ − 2473
+ − 2474 void
+ − 2475 get_buffer_range_char (struct buffer *b, Lisp_Object from, Lisp_Object to,
826
+ − 2476 Charbpos *from_out, Charbpos *to_out,
+ − 2477 unsigned int flags)
771
+ − 2478 {
+ − 2479 /* Does not GC */
+ − 2480 Charbpos min_allowed, max_allowed;
+ − 2481
+ − 2482 min_allowed = (flags & GB_ALLOW_PAST_ACCESSIBLE) ?
+ − 2483 BUF_BEG (b) : BUF_BEGV (b);
+ − 2484 max_allowed = (flags & GB_ALLOW_PAST_ACCESSIBLE) ?
+ − 2485 BUF_Z (b) : BUF_ZV (b);
+ − 2486
+ − 2487 if (NILP (from) && (flags & GB_ALLOW_NIL))
+ − 2488 *from_out = min_allowed;
+ − 2489 else
+ − 2490 *from_out = get_buffer_pos_char (b, from, flags | GB_NO_ERROR_IF_BAD);
+ − 2491
+ − 2492 if (NILP (to) && (flags & GB_ALLOW_NIL))
+ − 2493 *to_out = max_allowed;
+ − 2494 else
+ − 2495 *to_out = get_buffer_pos_char (b, to, flags | GB_NO_ERROR_IF_BAD);
+ − 2496
+ − 2497 if ((*from_out < 0 || *to_out < 0) && !(flags & GB_NO_ERROR_IF_BAD))
+ − 2498 {
793
+ − 2499 Lisp_Object buffer = wrap_buffer (b);
+ − 2500
771
+ − 2501 args_out_of_range_3 (buffer, from, to);
+ − 2502 }
+ − 2503
+ − 2504 if (*from_out >= 0 && *to_out >= 0 && *from_out > *to_out)
+ − 2505 {
+ − 2506 if (flags & GB_CHECK_ORDER)
+ − 2507 invalid_argument_2 ("start greater than end", from, to);
+ − 2508 else
+ − 2509 {
+ − 2510 Charbpos temp = *from_out;
+ − 2511 *from_out = *to_out;
+ − 2512 *to_out = temp;
+ − 2513 }
+ − 2514 }
+ − 2515 }
+ − 2516
+ − 2517 void
+ − 2518 get_buffer_range_byte (struct buffer *b, Lisp_Object from, Lisp_Object to,
826
+ − 2519 Bytebpos *from_out, Bytebpos *to_out,
+ − 2520 unsigned int flags)
771
+ − 2521 {
+ − 2522 Charbpos s, e;
+ − 2523
+ − 2524 get_buffer_range_char (b, from, to, &s, &e, flags);
+ − 2525 if (s >= 0)
+ − 2526 *from_out = charbpos_to_bytebpos (b, s);
+ − 2527 else /* could happen with GB_NO_ERROR_IF_BAD */
+ − 2528 *from_out = -1;
+ − 2529 if (e >= 0)
+ − 2530 *to_out = charbpos_to_bytebpos (b, e);
+ − 2531 else
+ − 2532 *to_out = -1;
+ − 2533 }
+ − 2534
+ − 2535 static Charcount
+ − 2536 get_string_pos_char_1 (Lisp_Object string, Lisp_Object pos, unsigned int flags,
+ − 2537 Charcount known_length)
+ − 2538 {
+ − 2539 Charcount ccpos;
+ − 2540 Charcount min_allowed = 0;
+ − 2541 Charcount max_allowed = known_length;
+ − 2542
+ − 2543 /* Computation of KNOWN_LENGTH is potentially expensive so we pass
+ − 2544 it in. */
+ − 2545 CHECK_INT (pos);
+ − 2546 ccpos = XINT (pos);
+ − 2547 if (ccpos < 0 && flags & GB_NEGATIVE_FROM_END)
+ − 2548 ccpos += max_allowed;
+ − 2549
+ − 2550 if (ccpos < min_allowed || ccpos > max_allowed)
+ − 2551 {
+ − 2552 if (flags & GB_COERCE_RANGE)
+ − 2553 ccpos = ccpos < min_allowed ? min_allowed : max_allowed;
+ − 2554 else if (flags & GB_NO_ERROR_IF_BAD)
+ − 2555 ccpos = -1;
+ − 2556 else
+ − 2557 args_out_of_range (string, pos);
+ − 2558 }
+ − 2559
+ − 2560 return ccpos;
+ − 2561 }
+ − 2562
+ − 2563 Charcount
+ − 2564 get_string_pos_char (Lisp_Object string, Lisp_Object pos, unsigned int flags)
+ − 2565 {
+ − 2566 return get_string_pos_char_1 (string, pos, flags,
826
+ − 2567 string_char_length (string));
771
+ − 2568 }
+ − 2569
+ − 2570 Bytecount
+ − 2571 get_string_pos_byte (Lisp_Object string, Lisp_Object pos, unsigned int flags)
+ − 2572 {
+ − 2573 Charcount ccpos = get_string_pos_char (string, pos, flags);
+ − 2574 if (ccpos < 0) /* could happen with GB_NO_ERROR_IF_BAD */
+ − 2575 return -1;
793
+ − 2576 return string_index_char_to_byte (string, ccpos);
771
+ − 2577 }
+ − 2578
+ − 2579 void
+ − 2580 get_string_range_char (Lisp_Object string, Lisp_Object from, Lisp_Object to,
+ − 2581 Charcount *from_out, Charcount *to_out,
+ − 2582 unsigned int flags)
+ − 2583 {
+ − 2584 Charcount min_allowed = 0;
826
+ − 2585 Charcount max_allowed = string_char_length (string);
771
+ − 2586
+ − 2587 if (NILP (from) && (flags & GB_ALLOW_NIL))
+ − 2588 *from_out = min_allowed;
+ − 2589 else
+ − 2590 *from_out = get_string_pos_char_1 (string, from,
+ − 2591 flags | GB_NO_ERROR_IF_BAD,
+ − 2592 max_allowed);
+ − 2593
+ − 2594 if (NILP (to) && (flags & GB_ALLOW_NIL))
+ − 2595 *to_out = max_allowed;
+ − 2596 else
+ − 2597 *to_out = get_string_pos_char_1 (string, to,
+ − 2598 flags | GB_NO_ERROR_IF_BAD,
+ − 2599 max_allowed);
+ − 2600
+ − 2601 if ((*from_out < 0 || *to_out < 0) && !(flags & GB_NO_ERROR_IF_BAD))
+ − 2602 args_out_of_range_3 (string, from, to);
+ − 2603
+ − 2604 if (*from_out >= 0 && *to_out >= 0 && *from_out > *to_out)
+ − 2605 {
+ − 2606 if (flags & GB_CHECK_ORDER)
+ − 2607 invalid_argument_2 ("start greater than end", from, to);
+ − 2608 else
+ − 2609 {
+ − 2610 Charbpos temp = *from_out;
+ − 2611 *from_out = *to_out;
+ − 2612 *to_out = temp;
+ − 2613 }
+ − 2614 }
+ − 2615 }
+ − 2616
+ − 2617 void
+ − 2618 get_string_range_byte (Lisp_Object string, Lisp_Object from, Lisp_Object to,
+ − 2619 Bytecount *from_out, Bytecount *to_out,
+ − 2620 unsigned int flags)
+ − 2621 {
+ − 2622 Charcount s, e;
+ − 2623
+ − 2624 get_string_range_char (string, from, to, &s, &e, flags);
+ − 2625 if (s >= 0)
793
+ − 2626 *from_out = string_index_char_to_byte (string, s);
771
+ − 2627 else /* could happen with GB_NO_ERROR_IF_BAD */
+ − 2628 *from_out = -1;
+ − 2629 if (e >= 0)
793
+ − 2630 *to_out = string_index_char_to_byte (string, e);
771
+ − 2631 else
+ − 2632 *to_out = -1;
+ − 2633
+ − 2634 }
+ − 2635
826
+ − 2636 Charxpos
771
+ − 2637 get_buffer_or_string_pos_char (Lisp_Object object, Lisp_Object pos,
+ − 2638 unsigned int flags)
+ − 2639 {
+ − 2640 return STRINGP (object) ?
+ − 2641 get_string_pos_char (object, pos, flags) :
+ − 2642 get_buffer_pos_char (XBUFFER (object), pos, flags);
+ − 2643 }
+ − 2644
826
+ − 2645 Bytexpos
771
+ − 2646 get_buffer_or_string_pos_byte (Lisp_Object object, Lisp_Object pos,
+ − 2647 unsigned int flags)
+ − 2648 {
+ − 2649 return STRINGP (object) ?
+ − 2650 get_string_pos_byte (object, pos, flags) :
+ − 2651 get_buffer_pos_byte (XBUFFER (object), pos, flags);
+ − 2652 }
+ − 2653
+ − 2654 void
+ − 2655 get_buffer_or_string_range_char (Lisp_Object object, Lisp_Object from,
826
+ − 2656 Lisp_Object to, Charxpos *from_out,
+ − 2657 Charxpos *to_out, unsigned int flags)
771
+ − 2658 {
+ − 2659 if (STRINGP (object))
+ − 2660 get_string_range_char (object, from, to, from_out, to_out, flags);
+ − 2661 else
826
+ − 2662 get_buffer_range_char (XBUFFER (object), from, to, from_out, to_out,
+ − 2663 flags);
771
+ − 2664 }
+ − 2665
+ − 2666 void
+ − 2667 get_buffer_or_string_range_byte (Lisp_Object object, Lisp_Object from,
826
+ − 2668 Lisp_Object to, Bytexpos *from_out,
+ − 2669 Bytexpos *to_out, unsigned int flags)
771
+ − 2670 {
+ − 2671 if (STRINGP (object))
+ − 2672 get_string_range_byte (object, from, to, from_out, to_out, flags);
+ − 2673 else
826
+ − 2674 get_buffer_range_byte (XBUFFER (object), from, to, from_out, to_out,
+ − 2675 flags);
771
+ − 2676 }
+ − 2677
826
+ − 2678 Charxpos
771
+ − 2679 buffer_or_string_accessible_begin_char (Lisp_Object object)
+ − 2680 {
+ − 2681 return STRINGP (object) ? 0 : BUF_BEGV (XBUFFER (object));
+ − 2682 }
+ − 2683
826
+ − 2684 Charxpos
771
+ − 2685 buffer_or_string_accessible_end_char (Lisp_Object object)
+ − 2686 {
+ − 2687 return STRINGP (object) ?
826
+ − 2688 string_char_length (object) : BUF_ZV (XBUFFER (object));
771
+ − 2689 }
+ − 2690
826
+ − 2691 Bytexpos
771
+ − 2692 buffer_or_string_accessible_begin_byte (Lisp_Object object)
+ − 2693 {
826
+ − 2694 return STRINGP (object) ? 0 : BYTE_BUF_BEGV (XBUFFER (object));
771
+ − 2695 }
+ − 2696
826
+ − 2697 Bytexpos
771
+ − 2698 buffer_or_string_accessible_end_byte (Lisp_Object object)
+ − 2699 {
+ − 2700 return STRINGP (object) ?
826
+ − 2701 XSTRING_LENGTH (object) : BYTE_BUF_ZV (XBUFFER (object));
771
+ − 2702 }
+ − 2703
826
+ − 2704 Charxpos
771
+ − 2705 buffer_or_string_absolute_begin_char (Lisp_Object object)
+ − 2706 {
+ − 2707 return STRINGP (object) ? 0 : BUF_BEG (XBUFFER (object));
+ − 2708 }
+ − 2709
826
+ − 2710 Charxpos
771
+ − 2711 buffer_or_string_absolute_end_char (Lisp_Object object)
+ − 2712 {
+ − 2713 return STRINGP (object) ?
826
+ − 2714 string_char_length (object) : BUF_Z (XBUFFER (object));
+ − 2715 }
+ − 2716
+ − 2717 Bytexpos
+ − 2718 buffer_or_string_absolute_begin_byte (Lisp_Object object)
+ − 2719 {
+ − 2720 return STRINGP (object) ? 0 : BYTE_BUF_BEG (XBUFFER (object));
+ − 2721 }
+ − 2722
+ − 2723 Bytexpos
+ − 2724 buffer_or_string_absolute_end_byte (Lisp_Object object)
+ − 2725 {
+ − 2726 return STRINGP (object) ?
+ − 2727 XSTRING_LENGTH (object) : BYTE_BUF_Z (XBUFFER (object));
+ − 2728 }
+ − 2729
+ − 2730 Charbpos
+ − 2731 charbpos_clip_to_bounds (Charbpos lower, Charbpos num, Charbpos upper)
+ − 2732 {
+ − 2733 return (num < lower ? lower :
+ − 2734 num > upper ? upper :
+ − 2735 num);
771
+ − 2736 }
+ − 2737
+ − 2738 Bytebpos
826
+ − 2739 bytebpos_clip_to_bounds (Bytebpos lower, Bytebpos num, Bytebpos upper)
+ − 2740 {
+ − 2741 return (num < lower ? lower :
+ − 2742 num > upper ? upper :
+ − 2743 num);
+ − 2744 }
+ − 2745
+ − 2746 Charxpos
+ − 2747 charxpos_clip_to_bounds (Charxpos lower, Charxpos num, Charxpos upper)
771
+ − 2748 {
826
+ − 2749 return (num < lower ? lower :
+ − 2750 num > upper ? upper :
+ − 2751 num);
+ − 2752 }
+ − 2753
+ − 2754 Bytexpos
+ − 2755 bytexpos_clip_to_bounds (Bytexpos lower, Bytexpos num, Bytexpos upper)
+ − 2756 {
+ − 2757 return (num < lower ? lower :
+ − 2758 num > upper ? upper :
+ − 2759 num);
771
+ − 2760 }
+ − 2761
826
+ − 2762 /* These could be implemented in terms of the get_buffer_or_string()
+ − 2763 functions above, but those are complicated and handle lots of weird
+ − 2764 cases stemming from uncertain external input. */
+ − 2765
+ − 2766 Charxpos
+ − 2767 buffer_or_string_clip_to_accessible_char (Lisp_Object object, Charxpos pos)
+ − 2768 {
+ − 2769 return (charxpos_clip_to_bounds
+ − 2770 (pos, buffer_or_string_accessible_begin_char (object),
+ − 2771 buffer_or_string_accessible_end_char (object)));
+ − 2772 }
+ − 2773
+ − 2774 Bytexpos
+ − 2775 buffer_or_string_clip_to_accessible_byte (Lisp_Object object, Bytexpos pos)
771
+ − 2776 {
826
+ − 2777 return (bytexpos_clip_to_bounds
+ − 2778 (pos, buffer_or_string_accessible_begin_byte (object),
+ − 2779 buffer_or_string_accessible_end_byte (object)));
+ − 2780 }
+ − 2781
+ − 2782 Charxpos
+ − 2783 buffer_or_string_clip_to_absolute_char (Lisp_Object object, Charxpos pos)
+ − 2784 {
+ − 2785 return (charxpos_clip_to_bounds
+ − 2786 (pos, buffer_or_string_absolute_begin_char (object),
+ − 2787 buffer_or_string_absolute_end_char (object)));
+ − 2788 }
+ − 2789
+ − 2790 Bytexpos
+ − 2791 buffer_or_string_clip_to_absolute_byte (Lisp_Object object, Bytexpos pos)
+ − 2792 {
+ − 2793 return (bytexpos_clip_to_bounds
+ − 2794 (pos, buffer_or_string_absolute_begin_byte (object),
+ − 2795 buffer_or_string_absolute_end_byte (object)));
771
+ − 2796 }
+ − 2797
+ − 2798
+ − 2799 /************************************************************************/
+ − 2800 /* Implement TO_EXTERNAL_FORMAT, TO_INTERNAL_FORMAT */
+ − 2801 /************************************************************************/
+ − 2802
/* A dynarr whose elements are pointers to Ibyte dynarrs; the type of
   conversion_in_dynarr_list. */
+ − 2803 typedef struct
+ − 2804 {
867
+ − 2805 Dynarr_declare (Ibyte_dynarr *);
+ − 2806 } Ibyte_dynarr_dynarr;
771
+ − 2807
/* A dynarr whose elements are pointers to Extbyte dynarrs; the type of
   conversion_out_dynarr_list. */
+ − 2808 typedef struct
+ − 2809 {
+ − 2810 Dynarr_declare (Extbyte_dynarr *);
+ − 2811 } Extbyte_dynarr_dynarr;
+ − 2812
/* Scratch output buffers for dfc_convert_to_external_format() and
   dfc_convert_to_internal_format() respectively.  Each converter picks the
   element at index *_in_use, appending a fresh dynarr when the list is not
   long enough, and resets it before use. */
+ − 2813 static Extbyte_dynarr_dynarr *conversion_out_dynarr_list;
867
+ − 2814 static Ibyte_dynarr_dynarr *conversion_in_dynarr_list;
771
+ − 2815
/* Index of the scratch buffer the next conversion call will use.  Each
   converter raises its counter by one with internal_bind_int() while it
   runs (presumably restored when the binding is unwound by unbind_to()),
   so a conversion started during another one gets its own buffer. */
+ − 2816 static int dfc_convert_to_external_format_in_use;
+ − 2817 static int dfc_convert_to_internal_format_in_use;
+ − 2818
+ − 2819 void
+ − 2820 dfc_convert_to_external_format (dfc_conversion_type source_type,
+ − 2821 dfc_conversion_data *source,
+ − 2822 Lisp_Object coding_system,
+ − 2823 dfc_conversion_type sink_type,
+ − 2824 dfc_conversion_data *sink)
+ − 2825 {
+ − 2826 /* It's guaranteed that many callers are not prepared for GC here,
+ − 2827 esp. given that this code conversion occurs in many very hidden
+ − 2828 places. */
1292
+ − 2829 int count;
771
+ − 2830 Extbyte_dynarr *conversion_out_dynarr;
1292
+ − 2831 PROFILE_DECLARE ();
+ − 2832
+ − 2833 PROFILE_RECORD_ENTERING_SECTION (QSin_internal_external_conversion);
+ − 2834
+ − 2835 count = begin_gc_forbidden ();
771
+ − 2836
+ − 2837 type_checking_assert
+ − 2838 (((source_type == DFC_TYPE_DATA) ||
+ − 2839 (source_type == DFC_TYPE_LISP_LSTREAM && LSTREAMP (source->lisp_object)) ||
+ − 2840 (source_type == DFC_TYPE_LISP_STRING && STRINGP (source->lisp_object)))
+ − 2841 &&
+ − 2842 ((sink_type == DFC_TYPE_DATA) ||
+ − 2843 (sink_type == DFC_TYPE_LISP_LSTREAM && LSTREAMP (source->lisp_object))));
+ − 2844
+ − 2845 if (Dynarr_length (conversion_out_dynarr_list) <=
+ − 2846 dfc_convert_to_external_format_in_use)
+ − 2847 Dynarr_add (conversion_out_dynarr_list, Dynarr_new (Extbyte));
+ − 2848 conversion_out_dynarr = Dynarr_at (conversion_out_dynarr_list,
+ − 2849 dfc_convert_to_external_format_in_use);
+ − 2850 Dynarr_reset (conversion_out_dynarr);
+ − 2851
853
+ − 2852 internal_bind_int (&dfc_convert_to_external_format_in_use,
+ − 2853 dfc_convert_to_external_format_in_use + 1);
+ − 2854
771
+ − 2855 coding_system = get_coding_system_for_text_file (coding_system, 0);
+ − 2856
+ − 2857 /* Here we optimize in the case where the coding system does no
+ − 2858 conversion. However, we don't want to optimize in case the source
+ − 2859 or sink is an lstream, since writing to an lstream can cause a
+ − 2860 garbage collection, and this could be problematic if the source
+ − 2861 is a lisp string. */
+ − 2862 if (source_type != DFC_TYPE_LISP_LSTREAM &&
+ − 2863 sink_type != DFC_TYPE_LISP_LSTREAM &&
+ − 2864 coding_system_is_binary (coding_system))
+ − 2865 {
867
+ − 2866 const Ibyte *ptr;
771
+ − 2867 Bytecount len;
+ − 2868
+ − 2869 if (source_type == DFC_TYPE_LISP_STRING)
+ − 2870 {
+ − 2871 ptr = XSTRING_DATA (source->lisp_object);
+ − 2872 len = XSTRING_LENGTH (source->lisp_object);
+ − 2873 }
+ − 2874 else
+ − 2875 {
867
+ − 2876 ptr = (Ibyte *) source->data.ptr;
771
+ − 2877 len = source->data.len;
+ − 2878 }
+ − 2879
+ − 2880 #ifdef MULE
+ − 2881 {
867
+ − 2882 const Ibyte *end;
771
+ − 2883 for (end = ptr + len; ptr < end;)
+ − 2884 {
867
+ − 2885 Ibyte c =
826
+ − 2886 (byte_ascii_p (*ptr)) ? *ptr :
771
+ − 2887 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
+ − 2888 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
+ − 2889 '~';
+ − 2890
+ − 2891 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
867
+ − 2892 INC_IBYTEPTR (ptr);
771
+ − 2893 }
800
+ − 2894 text_checking_assert (ptr == end);
771
+ − 2895 }
+ − 2896 #else
+ − 2897 Dynarr_add_many (conversion_out_dynarr, ptr, len);
+ − 2898 #endif
+ − 2899
+ − 2900 }
1315
+ − 2901 #ifdef WIN32_ANY
771
+ − 2902 /* Optimize the common case involving Unicode where only ASCII is involved */
+ − 2903 else if (source_type != DFC_TYPE_LISP_LSTREAM &&
+ − 2904 sink_type != DFC_TYPE_LISP_LSTREAM &&
+ − 2905 dfc_coding_system_is_unicode (coding_system))
+ − 2906 {
867
+ − 2907 const Ibyte *ptr, *p;
771
+ − 2908 Bytecount len;
867
+ − 2909 const Ibyte *end;
771
+ − 2910
+ − 2911 if (source_type == DFC_TYPE_LISP_STRING)
+ − 2912 {
+ − 2913 ptr = XSTRING_DATA (source->lisp_object);
+ − 2914 len = XSTRING_LENGTH (source->lisp_object);
+ − 2915 }
+ − 2916 else
+ − 2917 {
867
+ − 2918 ptr = (Ibyte *) source->data.ptr;
771
+ − 2919 len = source->data.len;
+ − 2920 }
+ − 2921 end = ptr + len;
+ − 2922
+ − 2923 for (p = ptr; p < end; p++)
+ − 2924 {
826
+ − 2925 if (!byte_ascii_p (*p))
771
+ − 2926 goto the_hard_way;
+ − 2927 }
+ − 2928
+ − 2929 for (p = ptr; p < end; p++)
+ − 2930 {
+ − 2931 Dynarr_add (conversion_out_dynarr, (Extbyte) (*p));
+ − 2932 Dynarr_add (conversion_out_dynarr, (Extbyte) '\0');
+ − 2933 }
+ − 2934 }
1315
+ − 2935 #endif /* WIN32_ANY */
771
+ − 2936 else
+ − 2937 {
+ − 2938 Lisp_Object streams_to_delete[3];
+ − 2939 int delete_count;
+ − 2940 Lisp_Object instream, outstream;
+ − 2941 Lstream *reader, *writer;
+ − 2942
1315
+ − 2943 #ifdef WIN32_ANY
771
+ − 2944 the_hard_way:
1315
+ − 2945 #endif /* WIN32_ANY */
771
+ − 2946 delete_count = 0;
+ − 2947 if (source_type == DFC_TYPE_LISP_LSTREAM)
+ − 2948 instream = source->lisp_object;
+ − 2949 else if (source_type == DFC_TYPE_DATA)
+ − 2950 streams_to_delete[delete_count++] = instream =
+ − 2951 make_fixed_buffer_input_stream (source->data.ptr, source->data.len);
+ − 2952 else
+ − 2953 {
+ − 2954 type_checking_assert (source_type == DFC_TYPE_LISP_STRING);
+ − 2955 streams_to_delete[delete_count++] = instream =
+ − 2956 /* This will GCPRO the Lisp string */
+ − 2957 make_lisp_string_input_stream (source->lisp_object, 0, -1);
+ − 2958 }
+ − 2959
+ − 2960 if (sink_type == DFC_TYPE_LISP_LSTREAM)
+ − 2961 outstream = sink->lisp_object;
+ − 2962 else
+ − 2963 {
+ − 2964 type_checking_assert (sink_type == DFC_TYPE_DATA);
+ − 2965 streams_to_delete[delete_count++] = outstream =
+ − 2966 make_dynarr_output_stream
+ − 2967 ((unsigned_char_dynarr *) conversion_out_dynarr);
+ − 2968 }
+ − 2969
+ − 2970 streams_to_delete[delete_count++] = outstream =
800
+ − 2971 make_coding_output_stream (XLSTREAM (outstream), coding_system,
+ − 2972 CODING_ENCODE, 0);
771
+ − 2973
+ − 2974 reader = XLSTREAM (instream);
+ − 2975 writer = XLSTREAM (outstream);
+ − 2976 /* decoding_stream will gc-protect outstream */
1204
+ − 2977 {
+ − 2978 struct gcpro gcpro1, gcpro2;
+ − 2979 GCPRO2 (instream, outstream);
+ − 2980
+ − 2981 while (1)
+ − 2982 {
+ − 2983 Bytecount size_in_bytes;
+ − 2984 char tempbuf[1024]; /* some random amount */
+ − 2985
+ − 2986 size_in_bytes = Lstream_read (reader, tempbuf, sizeof (tempbuf));
+ − 2987
+ − 2988 if (size_in_bytes == 0)
+ − 2989 break;
+ − 2990 else if (size_in_bytes < 0)
+ − 2991 signal_error (Qtext_conversion_error,
+ − 2992 "Error converting to external format", Qunbound);
+ − 2993
+ − 2994 if (Lstream_write (writer, tempbuf, size_in_bytes) < 0)
+ − 2995 signal_error (Qtext_conversion_error,
+ − 2996 "Error converting to external format", Qunbound);
+ − 2997 }
+ − 2998
+ − 2999 /* Closing writer will close any stream at the other end of writer. */
+ − 3000 Lstream_close (writer);
+ − 3001 Lstream_close (reader);
+ − 3002 UNGCPRO;
+ − 3003 }
771
+ − 3004
+ − 3005 /* The idea is that this function will create no garbage. */
+ − 3006 while (delete_count)
+ − 3007 Lstream_delete (XLSTREAM (streams_to_delete [--delete_count]));
+ − 3008 }
+ − 3009
+ − 3010 unbind_to (count);
+ − 3011
+ − 3012 if (sink_type != DFC_TYPE_LISP_LSTREAM)
+ − 3013 {
+ − 3014 sink->data.len = Dynarr_length (conversion_out_dynarr);
+ − 3015 /* double zero-extend because we may be dealing with Unicode data */
+ − 3016 Dynarr_add (conversion_out_dynarr, '\0');
+ − 3017 Dynarr_add (conversion_out_dynarr, '\0');
+ − 3018 sink->data.ptr = Dynarr_atp (conversion_out_dynarr, 0);
+ − 3019 }
1292
+ − 3020
+ − 3021 PROFILE_RECORD_EXITING_SECTION (QSin_internal_external_conversion);
771
+ − 3022 }
+ − 3023
+ − 3024 void
+ − 3025 dfc_convert_to_internal_format (dfc_conversion_type source_type,
+ − 3026 dfc_conversion_data *source,
+ − 3027 Lisp_Object coding_system,
+ − 3028 dfc_conversion_type sink_type,
+ − 3029 dfc_conversion_data *sink)
+ − 3030 {
+ − 3031 /* It's guaranteed that many callers are not prepared for GC here,
+ − 3032 esp. given that this code conversion occurs in many very hidden
+ − 3033 places. */
1292
+ − 3034 int count;
867
+ − 3035 Ibyte_dynarr *conversion_in_dynarr;
1292
+ − 3036 PROFILE_DECLARE ();
+ − 3037
+ − 3038 PROFILE_RECORD_ENTERING_SECTION (QSin_internal_external_conversion);
+ − 3039
+ − 3040 count = begin_gc_forbidden ();
771
+ − 3041
+ − 3042 type_checking_assert
+ − 3043 ((source_type == DFC_TYPE_DATA ||
+ − 3044 source_type == DFC_TYPE_LISP_LSTREAM)
+ − 3045 &&
+ − 3046 (sink_type == DFC_TYPE_DATA ||
+ − 3047 sink_type == DFC_TYPE_LISP_LSTREAM));
+ − 3048
+ − 3049 if (Dynarr_length (conversion_in_dynarr_list) <=
+ − 3050 dfc_convert_to_internal_format_in_use)
867
+ − 3051 Dynarr_add (conversion_in_dynarr_list, Dynarr_new (Ibyte));
771
+ − 3052 conversion_in_dynarr = Dynarr_at (conversion_in_dynarr_list,
+ − 3053 dfc_convert_to_internal_format_in_use);
+ − 3054 Dynarr_reset (conversion_in_dynarr);
+ − 3055
853
+ − 3056 internal_bind_int (&dfc_convert_to_internal_format_in_use,
+ − 3057 dfc_convert_to_internal_format_in_use + 1);
+ − 3058
771
+ − 3059 coding_system = get_coding_system_for_text_file (coding_system, 1);
+ − 3060
+ − 3061 if (source_type != DFC_TYPE_LISP_LSTREAM &&
+ − 3062 sink_type != DFC_TYPE_LISP_LSTREAM &&
+ − 3063 coding_system_is_binary (coding_system))
+ − 3064 {
+ − 3065 #ifdef MULE
867
+ − 3066 const Ibyte *ptr = (const Ibyte *) source->data.ptr;
771
+ − 3067 Bytecount len = source->data.len;
867
+ − 3068 const Ibyte *end = ptr + len;
771
+ − 3069
+ − 3070 for (; ptr < end; ptr++)
+ − 3071 {
867
+ − 3072 Ibyte c = *ptr;
771
+ − 3073
826
+ − 3074 if (byte_ascii_p (c))
771
+ − 3075 Dynarr_add (conversion_in_dynarr, c);
826
+ − 3076 else if (byte_c1_p (c))
771
+ − 3077 {
+ − 3078 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_CONTROL_1);
+ − 3079 Dynarr_add (conversion_in_dynarr, c + 0x20);
+ − 3080 }
+ − 3081 else
+ − 3082 {
+ − 3083 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_LATIN_ISO8859_1);
+ − 3084 Dynarr_add (conversion_in_dynarr, c);
+ − 3085 }
+ − 3086 }
+ − 3087 #else
+ − 3088 Dynarr_add_many (conversion_in_dynarr, source->data.ptr, source->data.len);
+ − 3089 #endif
+ − 3090 }
1315
+ − 3091 #ifdef WIN32_ANY
1292
+ − 3092 /* Optimize the common case involving Unicode where only ASCII/Latin-1 is
+ − 3093 involved */
771
+ − 3094 else if (source_type != DFC_TYPE_LISP_LSTREAM &&
+ − 3095 sink_type != DFC_TYPE_LISP_LSTREAM &&
+ − 3096 dfc_coding_system_is_unicode (coding_system))
+ − 3097 {
867
+ − 3098 const Ibyte *ptr = (const Ibyte *) source->data.ptr + 1;
771
+ − 3099 Bytecount len = source->data.len;
867
+ − 3100 const Ibyte *end = ptr + len;
771
+ − 3101
+ − 3102 if (len & 1)
+ − 3103 goto the_hard_way;
+ − 3104
+ − 3105 for (; ptr < end; ptr += 2)
+ − 3106 {
+ − 3107 if (*ptr)
+ − 3108 goto the_hard_way;
+ − 3109 }
+ − 3110
867
+ − 3111 ptr = (const Ibyte *) source->data.ptr;
771
+ − 3112 end = ptr + len;
+ − 3113
+ − 3114 for (; ptr < end; ptr += 2)
+ − 3115 {
867
+ − 3116 Ibyte c = *ptr;
771
+ − 3117
826
+ − 3118 if (byte_ascii_p (c))
771
+ − 3119 Dynarr_add (conversion_in_dynarr, c);
+ − 3120 #ifdef MULE
826
+ − 3121 else if (byte_c1_p (c))
771
+ − 3122 {
+ − 3123 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_CONTROL_1);
+ − 3124 Dynarr_add (conversion_in_dynarr, c + 0x20);
+ − 3125 }
+ − 3126 else
+ − 3127 {
+ − 3128 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_LATIN_ISO8859_1);
+ − 3129 Dynarr_add (conversion_in_dynarr, c);
+ − 3130 }
+ − 3131 #endif /* MULE */
+ − 3132 }
+ − 3133 }
1315
+ − 3134 #endif /* WIN32_ANY */
771
+ − 3135 else
+ − 3136 {
+ − 3137 Lisp_Object streams_to_delete[3];
+ − 3138 int delete_count;
+ − 3139 Lisp_Object instream, outstream;
+ − 3140 Lstream *reader, *writer;
+ − 3141
1315
+ − 3142 #ifdef WIN32_ANY
771
+ − 3143 the_hard_way:
1315
+ − 3144 #endif /* WIN32_ANY */
771
+ − 3145 delete_count = 0;
+ − 3146 if (source_type == DFC_TYPE_LISP_LSTREAM)
+ − 3147 instream = source->lisp_object;
+ − 3148 else
+ − 3149 {
+ − 3150 type_checking_assert (source_type == DFC_TYPE_DATA);
+ − 3151 streams_to_delete[delete_count++] = instream =
+ − 3152 make_fixed_buffer_input_stream (source->data.ptr, source->data.len);
+ − 3153 }
+ − 3154
+ − 3155 if (sink_type == DFC_TYPE_LISP_LSTREAM)
+ − 3156 outstream = sink->lisp_object;
+ − 3157 else
+ − 3158 {
+ − 3159 type_checking_assert (sink_type == DFC_TYPE_DATA);
+ − 3160 streams_to_delete[delete_count++] = outstream =
+ − 3161 make_dynarr_output_stream
+ − 3162 ((unsigned_char_dynarr *) conversion_in_dynarr);
+ − 3163 }
+ − 3164
+ − 3165 streams_to_delete[delete_count++] = outstream =
800
+ − 3166 make_coding_output_stream (XLSTREAM (outstream), coding_system,
+ − 3167 CODING_DECODE, 0);
771
+ − 3168
+ − 3169 reader = XLSTREAM (instream);
+ − 3170 writer = XLSTREAM (outstream);
1204
+ − 3171 {
+ − 3172 struct gcpro gcpro1, gcpro2;
+ − 3173 /* outstream will gc-protect its sink stream, if necessary */
+ − 3174 GCPRO2 (instream, outstream);
+ − 3175
+ − 3176 while (1)
+ − 3177 {
+ − 3178 Bytecount size_in_bytes;
+ − 3179 char tempbuf[1024]; /* some random amount */
+ − 3180
+ − 3181 size_in_bytes = Lstream_read (reader, tempbuf, sizeof (tempbuf));
+ − 3182
+ − 3183 if (size_in_bytes == 0)
+ − 3184 break;
+ − 3185 else if (size_in_bytes < 0)
+ − 3186 signal_error (Qtext_conversion_error,
+ − 3187 "Error converting to internal format", Qunbound);
+ − 3188
+ − 3189 if (Lstream_write (writer, tempbuf, size_in_bytes) < 0)
+ − 3190 signal_error (Qtext_conversion_error,
+ − 3191 "Error converting to internal format", Qunbound);
+ − 3192 }
+ − 3193
+ − 3194 /* Closing writer will close any stream at the other end of writer. */
+ − 3195 Lstream_close (writer);
+ − 3196 Lstream_close (reader);
+ − 3197 UNGCPRO;
+ − 3198 }
771
+ − 3199
+ − 3200 /* The idea is that this function will create no garbage. */
+ − 3201 while (delete_count)
+ − 3202 Lstream_delete (XLSTREAM (streams_to_delete [--delete_count]));
+ − 3203 }
+ − 3204
+ − 3205 unbind_to (count);
+ − 3206
+ − 3207 if (sink_type != DFC_TYPE_LISP_LSTREAM)
+ − 3208 {
+ − 3209 sink->data.len = Dynarr_length (conversion_in_dynarr);
+ − 3210 Dynarr_add (conversion_in_dynarr, '\0'); /* remember to NUL-terminate! */
+ − 3211 /* The macros don't currently distinguish between internal and
+ − 3212 external sinks, and allocate and copy two extra bytes in both
+ − 3213 cases. So we add a second zero, just like for external data
+ − 3214 (in that case, because we may be converting to Unicode). */
+ − 3215 Dynarr_add (conversion_in_dynarr, '\0');
+ − 3216 sink->data.ptr = Dynarr_atp (conversion_in_dynarr, 0);
+ − 3217 }
1292
+ − 3218
+ − 3219 PROFILE_RECORD_EXITING_SECTION (QSin_internal_external_conversion);
771
+ − 3220 }
+ − 3221
1318
+ − 3222 /* ----------------------------------------------------------------------- */
+ − 3223 /* New-style DFC converters (data is returned rather than stored into var) */
+ − 3224 /* ----------------------------------------------------------------------- */
+ − 3225
+ − 3226 /* We handle here the cases where SRC is a Lisp_Object, internal data
+ − 3227 (sized or unsized), or external data (sized or unsized), and return type
+ − 3228 is unsized alloca() or malloc() data. If the return type is a
+ − 3229 Lisp_Object, use build_ext_string() for unsized external data,
+ − 3230 make_ext_string() for sized external data. If the return type needs to
+ − 3231 be sized data, use the *_TO_SIZED_*() macros, and for other more
+ − 3232 complicated cases, use the original TO_*_FORMAT() macros. */
+ − 3233
+ − 3234 static void
+ − 3235 new_dfc_convert_now_damn_it (const void *src, Bytecount src_size,
+ − 3236 enum new_dfc_src_type type,
+ − 3237 void **dst, Bytecount *dst_size,
+ − 3238 Lisp_Object codesys)
+ − 3239 {
+ − 3240 /* #### In the case of alloca(), it would be a bit more efficient, for
+ − 3241 small strings, to use static Dynarr's like are used internally in
+ − 3242 TO_*_FORMAT(), or some other way of avoiding malloc() followed by
+ − 3243 free(). I doubt it really matters, though. */
+ − 3244
+ − 3245 switch (type)
+ − 3246 {
+ − 3247 case DFC_EXTERNAL:
+ − 3248 TO_INTERNAL_FORMAT (C_STRING, src,
+ − 3249 MALLOC, (*dst, *dst_size), codesys);
+ − 3250 break;
+ − 3251
+ − 3252 case DFC_SIZED_EXTERNAL:
+ − 3253 TO_INTERNAL_FORMAT (DATA, (src, src_size),
+ − 3254 MALLOC, (*dst, *dst_size), codesys);
+ − 3255 break;
+ − 3256
+ − 3257 case DFC_INTERNAL:
+ − 3258 TO_EXTERNAL_FORMAT (C_STRING, src,
+ − 3259 MALLOC, (*dst, *dst_size), codesys);
+ − 3260 break;
+ − 3261
+ − 3262 case DFC_SIZED_INTERNAL:
+ − 3263 TO_EXTERNAL_FORMAT (DATA, (src, src_size),
+ − 3264 MALLOC, (*dst, *dst_size), codesys);
+ − 3265 break;
+ − 3266
+ − 3267 case DFC_LISP_STRING:
+ − 3268 TO_EXTERNAL_FORMAT (LISP_STRING, VOID_TO_LISP (src),
+ − 3269 MALLOC, (*dst, *dst_size), codesys);
+ − 3270 break;
+ − 3271
+ − 3272 default:
+ − 3273 abort ();
+ − 3274 }
+ − 3275 }
+ − 3276
+ − 3277 void *
+ − 3278 new_dfc_convert_malloc (const void *src, Bytecount src_size,
+ − 3279 enum new_dfc_src_type type, Lisp_Object codesys)
+ − 3280 {
+ − 3281 void *dst;
+ − 3282 Bytecount dst_size;
+ − 3283
+ − 3284 new_dfc_convert_now_damn_it (src, src_size, type, &dst, &dst_size, codesys);
+ − 3285 return dst;
+ − 3286 }
+ − 3287
+ − 3288 /* For alloca(), things are trickier because the calling function needs to
+ − 3289 allocate. This means that the caller needs to do the following:
+ − 3290
+ − 3291 (a) invoke us to do the conversion, remember the data and return the size.
+ − 3292 (b) alloca() the proper size.
+ − 3293 (c) invoke us again to copy the data.
+ − 3294
+ − 3295 We need to handle the possibility of two or more invocations of the
+ − 3296 converter in the same expression. In such cases it's conceivable that
+ − 3297 the evaluation of the sub-expressions will be overlapping (e.g. one size
+ − 3298 function called, then the other one called, then the copy functions
+ − 3299 called). To handle this, we keep a list of active data, indexed by the
+ − 3300 src expression. (We use the stringize operator to avoid evaluating the
+ − 3301 expression multiple times.) If the caller uses the exact same src
+ − 3302 expression twice in two converter calls in the same subexpression, we
+ − 3303 will lose, but at least we can check for this and abort(). We could
+ − 3304 conceivably try to index on other parameters as well, but there is not
+ − 3305 really any point. */
+ − 3306
/* One pending alloca()-style conversion: the result produced by
   new_dfc_convert_size() that is waiting to be collected by
   new_dfc_convert_alloca(). */
+ − 3307 typedef struct
+ − 3308 {
/* Key identifying the conversion; entries are matched by comparing this
   pointer, not the text it points to. */
+ − 3309 const char *srctext;
/* Converted data; copied out and then released with xfree() by
   new_dfc_convert_alloca(). */
+ − 3310 void *dst;
/* Length of DST, not counting the two trailing zero bytes. */
+ − 3311 Bytecount dst_size;
+ − 3312 } dfc_e2c_vals;
+ − 3313
/* Dynarr of dfc_e2c_vals entries. */
+ − 3314 typedef struct
+ − 3315 {
+ − 3316 Dynarr_declare (dfc_e2c_vals);
+ − 3317 } dfc_e2c_vals_dynarr;
+ − 3318
/* Conversions whose size has been computed but whose data has not yet
   been copied into the caller's alloca() storage. */
+ − 3319 static dfc_e2c_vals_dynarr *active_dfc_e2c;
+ − 3320
+ − 3321 static int
+ − 3322 find_pos_of_existing_active_dfc_e2c (const char *srctext)
+ − 3323 {
+ − 3324 dfc_e2c_vals *vals = NULL;
+ − 3325 int i;
+ − 3326
+ − 3327 for (i = 0; i < Dynarr_length (active_dfc_e2c); i++)
+ − 3328 {
+ − 3329 vals = Dynarr_atp (active_dfc_e2c, i);
+ − 3330 if (vals->srctext == srctext)
+ − 3331 return i;
+ − 3332 }
+ − 3333
+ − 3334 return -1;
+ − 3335 }
+ − 3336
+ − 3337 void *
+ − 3338 new_dfc_convert_alloca (const char *srctext, void *alloca_data)
+ − 3339 {
+ − 3340 dfc_e2c_vals *vals;
+ − 3341 int i = find_pos_of_existing_active_dfc_e2c (srctext);
+ − 3342
+ − 3343 assert (i >= 0);
+ − 3344 vals = Dynarr_atp (active_dfc_e2c, i);
+ − 3345 assert (alloca_data);
+ − 3346 memcpy (alloca_data, vals->dst, vals->dst_size + 2);
+ − 3347 xfree (vals->dst);
+ − 3348 Dynarr_delete (active_dfc_e2c, i);
+ − 3349 return alloca_data;
+ − 3350 }
+ − 3351
+ − 3352 Bytecount
+ − 3353 new_dfc_convert_size (const char *srctext, const void *src,
+ − 3354 Bytecount src_size, enum new_dfc_src_type type,
+ − 3355 Lisp_Object codesys)
+ − 3356 {
+ − 3357 dfc_e2c_vals vals;
+ − 3358
+ − 3359 assert (find_pos_of_existing_active_dfc_e2c (srctext) < 0);
+ − 3360
+ − 3361 vals.srctext = srctext;
+ − 3362
+ − 3363 new_dfc_convert_now_damn_it (src, src_size, type, &vals.dst, &vals.dst_size,
+ − 3364 codesys);
+ − 3365
+ − 3366 Dynarr_add (active_dfc_e2c, vals);
+ − 3367 /* The size is always + 2 because we have double zero-termination at the
+ − 3368 end of all data (for Unicode-correctness). */
+ − 3369 return vals.dst_size + 2;
+ − 3370 }
+ − 3371
771
+ − 3372
+ − 3373 /************************************************************************/
867
+ − 3374 /* Basic Ichar functions */
771
+ − 3375 /************************************************************************/
+ − 3376
+ − 3377 #ifdef MULE
+ − 3378
+ − 3379 /* Convert a non-ASCII Mule character C into a one-character Mule-encoded
+ − 3380 string in STR. Returns the number of bytes stored.
867
+ − 3381 Do not call this directly. Use the macro set_itext_ichar() instead.
771
+ − 3382 */
+ − 3383
+ − 3384 Bytecount
867
+ − 3385 non_ascii_set_itext_ichar (Ibyte *str, Ichar c)
771
+ − 3386 {
867
+ − 3387 Ibyte *p;
+ − 3388 Ibyte lb;
771
+ − 3389 int c1, c2;
+ − 3390 Lisp_Object charset;
+ − 3391
+ − 3392 p = str;
867
+ − 3393 BREAKUP_ICHAR (c, charset, c1, c2);
+ − 3394 lb = ichar_leading_byte (c);
826
+ − 3395 if (leading_byte_private_p (lb))
+ − 3396 *p++ = private_leading_byte_prefix (lb);
771
+ − 3397 *p++ = lb;
+ − 3398 if (EQ (charset, Vcharset_control_1))
+ − 3399 c1 += 0x20;
+ − 3400 *p++ = c1 | 0x80;
+ − 3401 if (c2)
+ − 3402 *p++ = c2 | 0x80;
+ − 3403
+ − 3404 return (p - str);
+ − 3405 }
+ − 3406
+ − 3407 /* Return the first character from a Mule-encoded string in STR,
+ − 3408 assuming it's non-ASCII. Do not call this directly.
867
+ − 3409 Use the macro itext_ichar() instead. */
+ − 3410
+ − 3411 Ichar
+ − 3412 non_ascii_itext_ichar (const Ibyte *str)
771
+ − 3413 {
867
+ − 3414 Ibyte i0 = *str, i1, i2 = 0;
771
+ − 3415 Lisp_Object charset;
+ − 3416
+ − 3417 if (i0 == LEADING_BYTE_CONTROL_1)
867
+ − 3418 return (Ichar) (*++str - 0x20);
771
+ − 3419
826
+ − 3420 if (leading_byte_prefix_p (i0))
771
+ − 3421 i0 = *++str;
+ − 3422
+ − 3423 i1 = *++str & 0x7F;
+ − 3424
826
+ − 3425 charset = charset_by_leading_byte (i0);
771
+ − 3426 if (XCHARSET_DIMENSION (charset) == 2)
+ − 3427 i2 = *++str & 0x7F;
+ − 3428
867
+ − 3429 return make_ichar (charset, i1, i2);
771
+ − 3430 }
+ − 3431
867
+ − 3432 /* Return whether CH is a valid Ichar, assuming it's non-ASCII.
+ − 3433 Do not call this directly. Use the macro valid_ichar_p() instead. */
771
+ − 3434
+ − 3435 int
867
+ − 3436 non_ascii_valid_ichar_p (Ichar ch)
771
+ − 3437 {
+ − 3438 int f1, f2, f3;
+ − 3439
+ − 3440 /* Must have only lowest 19 bits set */
+ − 3441 if (ch & ~0x7FFFF)
+ − 3442 return 0;
+ − 3443
867
+ − 3444 f1 = ichar_field1 (ch);
+ − 3445 f2 = ichar_field2 (ch);
+ − 3446 f3 = ichar_field3 (ch);
771
+ − 3447
+ − 3448 if (f1 == 0)
+ − 3449 {
+ − 3450 /* dimension-1 char */
+ − 3451 Lisp_Object charset;
+ − 3452
+ − 3453 /* leading byte must be correct */
867
+ − 3454 if (f2 < MIN_ICHAR_FIELD2_OFFICIAL ||
+ − 3455 (f2 > MAX_ICHAR_FIELD2_OFFICIAL && f2 < MIN_ICHAR_FIELD2_PRIVATE) ||
+ − 3456 f2 > MAX_ICHAR_FIELD2_PRIVATE)
771
+ − 3457 return 0;
+ − 3458 /* octet not out of range */
+ − 3459 if (f3 < 0x20)
+ − 3460 return 0;
+ − 3461 /* charset exists */
+ − 3462 /*
+ − 3463 NOTE: This takes advantage of the fact that
+ − 3464 FIELD2_TO_OFFICIAL_LEADING_BYTE and
+ − 3465 FIELD2_TO_PRIVATE_LEADING_BYTE are the same.
+ − 3466 */
826
+ − 3467 charset = charset_by_leading_byte (f2 + FIELD2_TO_OFFICIAL_LEADING_BYTE);
771
+ − 3468 if (EQ (charset, Qnil))
+ − 3469 return 0;
+ − 3470 /* check range as per size (94 or 96) of charset */
+ − 3471 return ((f3 > 0x20 && f3 < 0x7f) || XCHARSET_CHARS (charset) == 96);
+ − 3472 }
+ − 3473 else
+ − 3474 {
+ − 3475 /* dimension-2 char */
+ − 3476 Lisp_Object charset;
+ − 3477
+ − 3478 /* leading byte must be correct */
867
+ − 3479 if (f1 < MIN_ICHAR_FIELD1_OFFICIAL ||
+ − 3480 (f1 > MAX_ICHAR_FIELD1_OFFICIAL && f1 < MIN_ICHAR_FIELD1_PRIVATE) ||
+ − 3481 f1 > MAX_ICHAR_FIELD1_PRIVATE)
771
+ − 3482 return 0;
+ − 3483 /* octets not out of range */
+ − 3484 if (f2 < 0x20 || f3 < 0x20)
+ − 3485 return 0;
+ − 3486
+ − 3487 #ifdef ENABLE_COMPOSITE_CHARS
+ − 3488 if (f1 + FIELD1_TO_OFFICIAL_LEADING_BYTE == LEADING_BYTE_COMPOSITE)
+ − 3489 {
+ − 3490 if (UNBOUNDP (Fgethash (make_int (ch),
+ − 3491 Vcomposite_char_char2string_hash_table,
+ − 3492 Qunbound)))
+ − 3493 return 0;
+ − 3494 return 1;
+ − 3495 }
+ − 3496 #endif /* ENABLE_COMPOSITE_CHARS */
+ − 3497
+ − 3498 /* charset exists */
867
+ − 3499 if (f1 <= MAX_ICHAR_FIELD1_OFFICIAL)
771
+ − 3500 charset =
826
+ − 3501 charset_by_leading_byte (f1 + FIELD1_TO_OFFICIAL_LEADING_BYTE);
771
+ − 3502 else
+ − 3503 charset =
826
+ − 3504 charset_by_leading_byte (f1 + FIELD1_TO_PRIVATE_LEADING_BYTE);
771
+ − 3505
+ − 3506 if (EQ (charset, Qnil))
+ − 3507 return 0;
+ − 3508 /* check range as per size (94x94 or 96x96) of charset */
+ − 3509 return ((f2 != 0x20 && f2 != 0x7F && f3 != 0x20 && f3 != 0x7F) ||
+ − 3510 XCHARSET_CHARS (charset) == 96);
+ − 3511 }
+ − 3512 }
+ − 3513
+ − 3514 /* Copy the character pointed to by SRC into DST. Do not call this
867
+ − 3515 directly. Use the macro itext_copy_ichar() instead.
771
+ − 3516 Return the number of bytes copied. */
+ − 3517
+ − 3518 Bytecount
867
+ − 3519 non_ascii_itext_copy_ichar (const Ibyte *src, Ibyte *dst)
771
+ − 3520 {
826
+ − 3521 Bytecount bytes = rep_bytes_by_first_byte (*src);
771
+ − 3522 Bytecount i;
+ − 3523 for (i = bytes; i; i--, dst++, src++)
+ − 3524 *dst = *src;
+ − 3525 return bytes;
+ − 3526 }
+ − 3527
+ − 3528 #endif /* MULE */
+ − 3529
+ − 3530
+ − 3531 /************************************************************************/
867
+ − 3532 /* streams of Ichars */
771
+ − 3533 /************************************************************************/
+ − 3534
+ − 3535 #ifdef MULE
+ − 3536
867
+ − 3537 /* Treat a stream as a stream of Ichar's rather than a stream of bytes.
771
+ − 3538 The functions below are not meant to be called directly; use
+ − 3539 the macros in insdel.h. */
+ − 3540
867
+ − 3541 Ichar
+ − 3542 Lstream_get_ichar_1 (Lstream *stream, int ch)
771
+ − 3543 {
867
+ − 3544 Ibyte str[MAX_ICHAR_LEN];
+ − 3545 Ibyte *strptr = str;
771
+ − 3546 Bytecount bytes;
+ − 3547
867
+ − 3548 str[0] = (Ibyte) ch;
771
+ − 3549
826
+ − 3550 for (bytes = rep_bytes_by_first_byte (ch) - 1; bytes; bytes--)
771
+ − 3551 {
+ − 3552 int c = Lstream_getc (stream);
800
+ − 3553 text_checking_assert (c >= 0);
867
+ − 3554 *++strptr = (Ibyte) c;
771
+ − 3555 }
867
+ − 3556 return itext_ichar (str);
771
+ − 3557 }
+ − 3558
+ − 3559 int
867
+ − 3560 Lstream_fput_ichar (Lstream *stream, Ichar ch)
771
+ − 3561 {
867
+ − 3562 Ibyte str[MAX_ICHAR_LEN];
+ − 3563 Bytecount len = set_itext_ichar (str, ch);
771
+ − 3564 return Lstream_write (stream, str, len);
+ − 3565 }
+ − 3566
+ − 3567 void
867
+ − 3568 Lstream_funget_ichar (Lstream *stream, Ichar ch)
771
+ − 3569 {
867
+ − 3570 Ibyte str[MAX_ICHAR_LEN];
+ − 3571 Bytecount len = set_itext_ichar (str, ch);
771
+ − 3572 Lstream_unread (stream, str, len);
+ − 3573 }
+ − 3574
+ − 3575 #endif /* MULE */
+ − 3576
+ − 3577
+ − 3578 /************************************************************************/
+ − 3579 /* Lisp primitives for working with characters */
+ − 3580 /************************************************************************/
+ − 3581
+ − 3582 DEFUN ("make-char", Fmake_char, 2, 3, 0, /*
+ − 3583 Make a character from CHARSET and octets ARG1 and ARG2.
+ − 3584 ARG2 is required only for characters from two-dimensional charsets.
+ − 3585
+ − 3586 Each octet should be in the range 32 through 127 for a 96 or 96x96
+ − 3587 charset and 33 through 126 for a 94 or 94x94 charset. (Most charsets
+ − 3588 are either 96 or 94x94.) Note that this is 32 more than the values
+ − 3589 typically given for 94x94 charsets. When two octets are required, the
+ − 3590 order is "standard" -- the same as appears in ISO-2022 encodings,
+ − 3591 reference tables, etc.
+ − 3592
+ − 3593 \(Note the following non-obvious result: Computerized translation
+ − 3594 tables often encode the two octets as the high and low bytes,
+ − 3595 respectively, of a hex short, while when there's only one octet, it
+ − 3596 goes in the low byte. When decoding such a value, you need to treat
+ − 3597 the two cases differently when calling make-char: One is (make-char
+ − 3598 CHARSET HIGH LOW), the other is (make-char CHARSET LOW).)
+ − 3599
+ − 3600 For example, (make-char 'latin-iso8859-2 185) or (make-char
+ − 3601 'latin-iso8859-2 57) will return the Latin 2 character s with caron.
+ − 3602
+ − 3603 As another example, the Japanese character for "kawa" (stream), which
+ − 3604 looks something like this:
+ − 3605
+ − 3606 | |
+ − 3607 | | |
+ − 3608 | | |
+ − 3609 | | |
+ − 3610 / |
+ − 3611
+ − 3612 appears in the Unicode Standard (version 2.0) on page 7-287 with the
+ − 3613 following values (see also page 7-4):
+ − 3614
+ − 3615 U 5DDD (Unicode)
+ − 3616 G 0-2008 (GB 2312-80)
+ − 3617 J 0-3278 (JIS X 0208-1990)
+ − 3618 K 0-8425 (KS C 5601-1987)
+ − 3619 B A474 (Big Five)
+ − 3620 C 1-4455 (CNS 11643-1986 (1st plane))
+ − 3621 A 213C34 (ANSI Z39.64-1989)
+ − 3622
+ − 3623 These are equivalent to:
+ − 3624
+ − 3625 \(make-char 'chinese-gb2312 52 40)
+ − 3626 \(make-char 'japanese-jisx0208 64 110)
+ − 3627 \(make-char 'korean-ksc5601 116 57)
+ − 3628 \(make-char 'chinese-cns11643-1 76 87)
+ − 3629 \(decode-big5-char '(164 . 116))
+ − 3630
+ − 3631 \(All codes above are two decimal numbers except for Big Five and ANSI
+ − 3632 Z39.64, which we don't support. We add 32 to each of the decimal
+ − 3633 numbers. Big Five is split in a rather hackish fashion into two
+ − 3634 charsets, `big5-1' and `big5-2', due to its excessive size -- 94x157,
+ − 3635 with the first codepoint in the range 0xA1 to 0xFE and the second in
+ − 3636 the range 0x40 to 0x7E or 0xA1 to 0xFE. `decode-big5-char' is used to
+ − 3637 generate the char from its codes, and `encode-big5-char' extracts the
+ − 3638 codes.)
+ − 3639
+ − 3640 When compiled without MULE, this function does not do much, but it's
+ − 3641 provided for compatibility. In this case, the following CHARSET symbols
+ − 3642 are allowed:
+ − 3643
+ − 3644 `ascii' -- ARG1 should be in the range 0 through 127.
+ − 3645 `control-1' -- ARG1 should be in the range 128 through 159.
+ − 3646 else -- ARG1 is coerced to be between 0 and 255, and then the high
+ − 3647 bit is set.
+ − 3648
+ − 3649 `int-to-char of the resulting ARG1' is returned, and ARG2 is always ignored.
+ − 3650 */
+ − 3651 (charset, arg1, arg2))
+ − 3652 {
+ − 3653 #ifdef MULE
+ − 3654 Lisp_Charset *cs;
+ − 3655 int a1, a2;
+ − 3656 int lowlim, highlim;
+ − 3657
+ − 3658 charset = Fget_charset (charset);
+ − 3659 cs = XCHARSET (charset);
+ − 3660
788
+ − 3661 get_charset_limits (charset, &lowlim, &highlim);
771
+ − 3662
+ − 3663 CHECK_INT (arg1);
+ − 3664 /* It is useful (and safe, according to Olivier Galibert) to strip
+ − 3665 the 8th bit off ARG1 and ARG2 because it allows programmers to
+ − 3666 write (make-char 'latin-iso8859-2 CODE) where code is the actual
+ − 3667 Latin 2 code of the character. */
+ − 3668 a1 = XINT (arg1) & 0x7f;
+ − 3669 if (a1 < lowlim || a1 > highlim)
+ − 3670 args_out_of_range_3 (arg1, make_int (lowlim), make_int (highlim));
+ − 3671
+ − 3672 if (CHARSET_DIMENSION (cs) == 1)
+ − 3673 {
+ − 3674 if (!NILP (arg2))
+ − 3675 invalid_argument
+ − 3676 ("Charset is of dimension one; second octet must be nil", arg2);
867
+ − 3677 return make_char (make_ichar (charset, a1, 0));
771
+ − 3678 }
+ − 3679
+ − 3680 CHECK_INT (arg2);
+ − 3681 a2 = XINT (arg2) & 0x7f;
+ − 3682 if (a2 < lowlim || a2 > highlim)
+ − 3683 args_out_of_range_3 (arg2, make_int (lowlim), make_int (highlim));
+ − 3684
867
+ − 3685 return make_char (make_ichar (charset, a1, a2));
771
+ − 3686 #else
+ − 3687 int a1;
+ − 3688 int lowlim, highlim;
+ − 3689
+ − 3690 if (EQ (charset, Qascii)) lowlim = 0, highlim = 127;
+ − 3691 else if (EQ (charset, Qcontrol_1)) lowlim = 0, highlim = 31;
+ − 3692 else lowlim = 0, highlim = 127;
+ − 3693
+ − 3694 CHECK_INT (arg1);
+ − 3695 /* It is useful (and safe, according to Olivier Galibert) to strip
+ − 3696 the 8th bit off ARG1 and ARG2 because it allows programmers to
+ − 3697 write (make-char 'latin-iso8859-2 CODE) where code is the actual
+ − 3698 Latin 2 code of the character. */
+ − 3699 a1 = XINT (arg1) & 0x7f;
+ − 3700 if (a1 < lowlim || a1 > highlim)
+ − 3701 args_out_of_range_3 (arg1, make_int (lowlim), make_int (highlim));
+ − 3702
+ − 3703 if (EQ (charset, Qascii))
+ − 3704 return make_char (a1);
+ − 3705 return make_char (a1 + 128);
+ − 3706 #endif /* MULE */
+ − 3707 }
+ − 3708
+ − 3709 #ifdef MULE
+ − 3710
+ − 3711 DEFUN ("char-charset", Fchar_charset, 1, 1, 0, /*
+ − 3712 Return the character set of char CH.
+ − 3713 */
+ − 3714 (ch))
+ − 3715 {
+ − 3716 CHECK_CHAR_COERCE_INT (ch);
+ − 3717
826
+ − 3718 return XCHARSET_NAME (charset_by_leading_byte
867
+ − 3719 (ichar_leading_byte (XCHAR (ch))));
771
+ − 3720 }
+ − 3721
+ − 3722 DEFUN ("char-octet", Fchar_octet, 1, 2, 0, /*
+ − 3723 Return the octet numbered N (should be 0 or 1) of char CH.
+ − 3724 N defaults to 0 if omitted.
+ − 3725 */
+ − 3726 (ch, n))
+ − 3727 {
+ − 3728 Lisp_Object charset;
+ − 3729 int octet0, octet1;
+ − 3730
+ − 3731 CHECK_CHAR_COERCE_INT (ch);
+ − 3732
867
+ − 3733 BREAKUP_ICHAR (XCHAR (ch), charset, octet0, octet1);
771
+ − 3734
+ − 3735 if (NILP (n) || EQ (n, Qzero))
+ − 3736 return make_int (octet0);
+ − 3737 else if (EQ (n, make_int (1)))
+ − 3738 return make_int (octet1);
+ − 3739 else
+ − 3740 invalid_constant ("Octet number must be 0 or 1", n);
+ − 3741 }
+ − 3742
+ − 3743 DEFUN ("split-char", Fsplit_char, 1, 1, 0, /*
+ − 3744 Return list of charset and one or two position-codes of CHAR.
+ − 3745 */
+ − 3746 (character))
+ − 3747 {
+ − 3748 /* This function can GC */
+ − 3749 struct gcpro gcpro1, gcpro2;
+ − 3750 Lisp_Object charset = Qnil;
+ − 3751 Lisp_Object rc = Qnil;
+ − 3752 int c1, c2;
+ − 3753
+ − 3754 GCPRO2 (charset, rc);
+ − 3755 CHECK_CHAR_COERCE_INT (character);
+ − 3756
867
+ − 3757 BREAKUP_ICHAR (XCHAR (character), charset, c1, c2);
771
+ − 3758
+ − 3759 if (XCHARSET_DIMENSION (Fget_charset (charset)) == 2)
+ − 3760 {
+ − 3761 rc = list3 (XCHARSET_NAME (charset), make_int (c1), make_int (c2));
+ − 3762 }
+ − 3763 else
+ − 3764 {
+ − 3765 rc = list2 (XCHARSET_NAME (charset), make_int (c1));
+ − 3766 }
+ − 3767 UNGCPRO;
+ − 3768
+ − 3769 return rc;
+ − 3770 }
+ − 3771
+ − 3772 #endif /* MULE */
+ − 3773
+ − 3774
+ − 3775 /************************************************************************/
+ − 3776 /* composite character functions */
+ − 3777 /************************************************************************/
+ − 3778
+ − 3779 #ifdef ENABLE_COMPOSITE_CHARS
+ − 3780
867
+ − 3781 Ichar
+ − 3782 lookup_composite_char (Ibyte *str, int len)
771
+ − 3783 {
+ − 3784 Lisp_Object lispstr = make_string (str, len);
+ − 3785 Lisp_Object ch = Fgethash (lispstr,
+ − 3786 Vcomposite_char_string2char_hash_table,
+ − 3787 Qunbound);
867
+ − 3788 Ichar emch;
771
+ − 3789
+ − 3790 if (UNBOUNDP (ch))
+ − 3791 {
+ − 3792 if (composite_char_row_next >= 128)
+ − 3793 invalid_operation ("No more composite chars available", lispstr);
867
+ − 3794 emch = make_ichar (Vcharset_composite, composite_char_row_next,
771
+ − 3795 composite_char_col_next);
+ − 3796 Fputhash (make_char (emch), lispstr,
+ − 3797 Vcomposite_char_char2string_hash_table);
+ − 3798 Fputhash (lispstr, make_char (emch),
+ − 3799 Vcomposite_char_string2char_hash_table);
+ − 3800 composite_char_col_next++;
+ − 3801 if (composite_char_col_next >= 128)
+ − 3802 {
+ − 3803 composite_char_col_next = 32;
+ − 3804 composite_char_row_next++;
+ − 3805 }
+ − 3806 }
+ − 3807 else
+ − 3808 emch = XCHAR (ch);
+ − 3809 return emch;
+ − 3810 }
+ − 3811
+ − 3812 Lisp_Object
867
+ − 3813 composite_char_string (Ichar ch)
771
+ − 3814 {
+ − 3815 Lisp_Object str = Fgethash (make_char (ch),
+ − 3816 Vcomposite_char_char2string_hash_table,
+ − 3817 Qunbound);
+ − 3818 assert (!UNBOUNDP (str));
+ − 3819 return str;
+ − 3820 }
+ − 3821
826
+ − 3822 DEFUN ("make-composite-char", Fmake_composite_char, 1, 1, 0, /*
771
+ − 3823 Convert a string into a single composite character.
+ − 3824 The character is the result of overstriking all the characters in
+ − 3825 the string.
+ − 3826 */
+ − 3827 (string))
+ − 3828 {
+ − 3829 CHECK_STRING (string);
+ − 3830 return make_char (lookup_composite_char (XSTRING_DATA (string),
+ − 3831 XSTRING_LENGTH (string)));
+ − 3832 }
+ − 3833
826
+ − 3834 DEFUN ("composite-char-string", Fcomposite_char_string, 1, 1, 0, /*
771
+ − 3835 Return a string of the characters comprising a composite character.
+ − 3836 */
+ − 3837 (ch))
+ − 3838 {
867
+ − 3839 Ichar emch;
771
+ − 3840
+ − 3841 CHECK_CHAR (ch);
+ − 3842 emch = XCHAR (ch);
867
+ − 3843 if (ichar_leading_byte (emch) != LEADING_BYTE_COMPOSITE)
771
+ − 3844 invalid_argument ("Must be composite char", ch);
+ − 3845 return composite_char_string (emch);
+ − 3846 }
+ − 3847 #endif /* ENABLE_COMPOSITE_CHARS */
+ − 3848
+ − 3849
+ − 3850 /************************************************************************/
+ − 3851 /* initialization */
+ − 3852 /************************************************************************/
+ − 3853
+ − 3854 void
1204
+ − 3855 reinit_eistring_early (void)
771
+ − 3856 {
+ − 3857 the_eistring_malloc_zero_init = the_eistring_zero_init;
+ − 3858 the_eistring_malloc_zero_init.mallocp_ = 1;
+ − 3859 }
+ − 3860
/* One-time early Eistring setup; all of the work is done by
   reinit_eistring_early (). */
void
init_eistring_once_early (void)
{
  reinit_eistring_early ();
}
+ − 3866
+ − 3867 void
771
+ − 3868 syms_of_text (void)
+ − 3869 {
+ − 3870 DEFSUBR (Fmake_char);
+ − 3871
+ − 3872 #ifdef MULE
+ − 3873 DEFSUBR (Fchar_charset);
+ − 3874 DEFSUBR (Fchar_octet);
+ − 3875 DEFSUBR (Fsplit_char);
+ − 3876
+ − 3877 #ifdef ENABLE_COMPOSITE_CHARS
+ − 3878 DEFSUBR (Fmake_composite_char);
+ − 3879 DEFSUBR (Fcomposite_char_string);
+ − 3880 #endif
+ − 3881 #endif /* MULE */
+ − 3882 }
+ − 3883
+ − 3884 void
+ − 3885 reinit_vars_of_text (void)
+ − 3886 {
+ − 3887 int i;
+ − 3888
867
+ − 3889 conversion_in_dynarr_list = Dynarr_new2 (Ibyte_dynarr_dynarr,
+ − 3890 Ibyte_dynarr *);
771
+ − 3891 conversion_out_dynarr_list = Dynarr_new2 (Extbyte_dynarr_dynarr,
+ − 3892 Extbyte_dynarr *);
1318
+ − 3893 active_dfc_e2c = Dynarr_new (dfc_e2c_vals);
771
+ − 3894
+ − 3895 for (i = 0; i <= MAX_BYTEBPOS_GAP_SIZE_3; i++)
+ − 3896 three_to_one_table[i] = i / 3;
+ − 3897 }
+ − 3898
+ − 3899 void
+ − 3900 vars_of_text (void)
+ − 3901 {
+ − 3902 reinit_vars_of_text ();
+ − 3903
1292
+ − 3904 QSin_char_byte_conversion = build_msg_string ("(in char-byte conversion)");
+ − 3905 staticpro (&QSin_char_byte_conversion);
+ − 3906 QSin_internal_external_conversion =
+ − 3907 build_msg_string ("(in internal-external conversion)");
+ − 3908 staticpro (&QSin_internal_external_conversion);
+ − 3909
771
+ − 3910 #ifdef ENABLE_COMPOSITE_CHARS
+ − 3911 /* #### not dumped properly */
+ − 3912 composite_char_row_next = 32;
+ − 3913 composite_char_col_next = 32;
+ − 3914
+ − 3915 Vcomposite_char_string2char_hash_table =
+ − 3916 make_lisp_hash_table (500, HASH_TABLE_NON_WEAK, HASH_TABLE_EQUAL);
+ − 3917 Vcomposite_char_char2string_hash_table =
+ − 3918 make_lisp_hash_table (500, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
+ − 3919 staticpro (&Vcomposite_char_string2char_hash_table);
+ − 3920 staticpro (&Vcomposite_char_char2string_hash_table);
+ − 3921 #endif /* ENABLE_COMPOSITE_CHARS */
+ − 3922 }