2367
+ − 1 /* Text manipulation primitives for XEmacs.
771
+ − 2 Copyright (C) 1995 Sun Microsystems, Inc.
2367
+ − 3 Copyright (C) 1995, 1996, 2000, 2001, 2002, 2003, 2004 Ben Wing.
771
+ − 4 Copyright (C) 1999 Martin Buchholz.
+ − 5
+ − 6 This file is part of XEmacs.
+ − 7
+ − 8 XEmacs is free software; you can redistribute it and/or modify it
+ − 9 under the terms of the GNU General Public License as published by the
+ − 10 Free Software Foundation; either version 2, or (at your option) any
+ − 11 later version.
+ − 12
+ − 13 XEmacs is distributed in the hope that it will be useful, but WITHOUT
+ − 14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ − 15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ − 16 for more details.
+ − 17
+ − 18 You should have received a copy of the GNU General Public License
+ − 19 along with XEmacs; see the file COPYING. If not, write to
+ − 20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ − 21 Boston, MA 02111-1307, USA. */
+ − 22
+ − 23 /* Synched up with: Not in FSF. */
+ − 24
+ − 25 /* Authorship:
+ − 26 */
+ − 27
+ − 28 #include <config.h>
+ − 29 #include "lisp.h"
+ − 30
+ − 31 #include "buffer.h"
+ − 32 #include "charset.h"
+ − 33 #include "file-coding.h"
+ − 34 #include "lstream.h"
1292
+ − 35 #include "profile.h"
771
+ − 36
+ − 37
+ − 38 /************************************************************************/
+ − 39 /* long comments */
+ − 40 /************************************************************************/
+ − 41
2367
+ − 42 /* NB: Everything below was written by Ben Wing except as otherwise noted. */
+ − 43
+ − 44 /************************************************************************/
+ − 45 /* */
+ − 46 /* */
+ − 47 /* Part A: More carefully-written documentation */
+ − 48 /* */
+ − 49 /* */
+ − 50 /************************************************************************/
+ − 51
+ − 52 /* Authorship: Ben Wing
+ − 53
771
+ − 54
826
+ − 55 ==========================================================================
2367
+ − 56 7. Handling non-default formats
826
+ − 57 ==========================================================================
771
+ − 58
2367
+ − 59 We support, at least to some extent, formats other than the default
+ − 60 variable-width format, for speed; all of these alternative formats are
+ − 61 fixed-width. Currently we only handle these non-default formats in
+ − 62 buffers, because access to their text is strictly controlled and thus
+ − 63 the details of the format mostly compartmentalized. The only really
+ − 64 tricky part is the search code -- the regex, Boyer-Moore, and
+ − 65 simple-search algorithms in search.c and regex.c. All other code that
+ − 66 knows directly about the buffer representation is the basic code to
+ − 67 modify or retrieve the buffer text.
+ − 68
+ − 69 Supporting fixed-width formats in Lisp strings is harder, but possible
+ − 70 -- FSF currently does this, for example. In this case, however,
+ − 71 probably only 8-bit-fixed is reasonable for Lisp strings -- getting
+ − 72 non-ASCII-compatible fixed-width formats to work is much, much harder
+ − 73 because a lot of code assumes that strings are ASCII-compatible
+ − 74 (i.e. ASCII + other characters represented exclusively using high-bit
+ − 75 bytes) and a lot of code mixes Lisp strings and non-Lisp strings freely.
+ − 76
+ − 77 The different possible fixed-width formats are 8-bit fixed, 16-bit
+ − 78 fixed, and 32-bit fixed. The latter can represent all possible
+ − 79 characters, but at a substantial memory penalty. The other two can
+ − 80 represent only a subset of the possible characters. How these subsets
+ − 81 are defined can be simple or very tricky.
+ − 82
+ − 83 Currently we support only the default format and the 8-bit fixed format,
+ − 84 and in the latter, we only allow these to be the first 256 characters in
+ − 85 an Ichar (ASCII and Latin 1).
+ − 86
+ − 87 One reasonable approach for 8-bit fixed is to allow the upper half to
+ − 88 represent any 1-byte charset, which is specified on a per-buffer basis.
+ − 89 This should work fairly well in practice since most documents are in
+ − 90 only one foreign language (possibly with some English mixed in). I
+ − 91 think FSF does something like this; or at least, they have something
+ − 92 called nonascii-translation-table and use it when converting from
+ − 93 8-bit-fixed text ("unibyte text") to default text ("multibyte text").
+ − 94 With 16-bit fixed, you could do something like assign chunks of the 64K
+ − 95 worth of characters to charsets as they're encountered in documents.
+ − 96 This should work well with most Asian documents.
+ − 97
+ − 98 If/when we switch to using Unicode internally, we might have formats more
+ − 99 like this:
+ − 100
+ − 101 -- UTF-8 or some extension as the default format. Perl uses an
+ − 102 extension that handles 64-bit chars and requires as much as 13 bytes per
+ − 103 char, vs. the standard of 31-bit chars and 6 bytes max. UTF-8 has the
+ − 104 same basic properties as our own variable-width format (see text.c,
+ − 105 Internal String Encoding) and so most code would not need to be changed.
+ − 106
+ − 107 -- UTF-16 as a "pseudo-fixed" format (i.e. 16-bit fixed plus surrogates
+ − 108 for representing characters not in the BMP, aka >= 65536). The vast
+ − 109 majority of documents will have no surrogates in them so byte/char
+ − 110 conversion will be very fast.
+ − 111
+ − 112 -- an 8-bit fixed format, like currently.
+ − 113
+ − 114 -- possibly, UCS-4 as a 32-bit fixed format.
+ − 115
+ − 116 The fixed-width formats essentially treat the buffer as an array of
+ − 117 8-bit, 16-bit or 32-bit integers. This means that how they are stored
+ − 118 in memory (in particular, big-endian or little-endian) depends on the
+ − 119 native format of the machine's processor. It also means we have to
+ − 120 worry a bit about alignment (basically, we just need to keep the gap an
+ − 121 integral size of the character size, and get things aligned properly
+ − 122 when converting the buffer between formats).
826
+ − 123
+ − 124 ==========================================================================
2367
+ − 125 8. Using UTF-16 as the default text format
826
+ − 126 ==========================================================================
+ − 127
2367
+ − 128 NOTE: The Eistring API is (or should be) Mule-correct even without
+ − 129 an ASCII-compatible internal representation.
+ − 130
+ − 131 #### Currently, the assumption that text units are one byte in size is
+ − 132 embedded throughout XEmacs, and `Ibyte *' is used where `Itext *' should
+ − 133 be. The way to fix this is to (among other things)
+ − 134
+ − 135 (a) review all places referencing `Ibyte' and `Ibyte *', change them to
+ − 136 use Itext, and fix up the code.
+ − 137 (b) change XSTRING_DATA to be of type Itext *
+ − 138 (c) review all uses of XSTRING_DATA
+ − 139 (d) eliminate XSTRING_LENGTH, splitting it into XSTRING_BYTE_LENGTH and
+ − 140 XSTRING_TEXT_LENGTH and reviewing all places referencing this
+ − 141 (e) make similar changes to other API's that refer to the "length" of
+ − 142 something, such as qxestrlen() and eilen()
+ − 143 (f) review all use of `CIbyte *'. Currently this is usually a way of
+ − 144 passing literal ASCII text strings in places that want internal text.
+ − 145 Either create separate _ascii() and _itext() versions of the
+ − 146 functions taking CIbyte *, or make use of something like the
+ − 147 WEXTTEXT() macro, which will generate wide strings as appropriate.
+ − 148 (g) review all uses of Bytecount and see which ones should be Textcount.
+ − 149 (h) put in error-checking code that will be tripped as often as possible
+ − 150 when doing anything with internal text, and check to see that ASCII
+ − 151 text has not mistakenly filtered in. This should be fairly easy as
+ − 152 ASCII text will generally be entirely spaces and letters whereas every
+ − 153 second byte of Unicode text will generally be a null byte. Either we
+ − 154 abort if the second bytes are entirely letters and numbers, or,
+ − 155 perhaps better, do the equivalent of a non-MULE build, where we should
+ − 156 be dealing entirely with 8-bit characters, and assert that the high
+ − 157 bytes of each pair are null.
+ − 158 (i) review places where xmalloc() is called. If we convert each use of
+ − 159 xmalloc() to instead be xnew_array() or some other typed routine,
+ − 160 then we will find every place that allocates space for Itext and
+ − 161 assumes it is based on one-byte units.
+ − 162 (j) encourage the use of ITEXT_ZTERM_SIZE instead of '+ 1' whenever we
+ − 163 are adding space for a zero-terminator, to emphasize what we are
+ − 164 doing and make sure the calculations are correct. Similarly for
+ − 165 EXTTEXT_ZTERM_SIZE.
+ − 166 (k) Note that the qxestr*() functions, among other things, will need to
+ − 167 be rewritten.
+ − 168
+ − 169 Note that this is a lot of work, and is not high on the list of priorities
+ − 170 currently.
826
+ − 171
+ − 172 ==========================================================================
2367
+ − 173 9. Miscellaneous
826
+ − 174 ==========================================================================
+ − 175
+ − 176 A. Unicode Support
771
+ − 177
1292
+ − 178 Unicode support is very desirable. Currently we know how to handle
+ − 179 externally-encoded Unicode data in various encodings -- UTF-16, UTF-8,
+ − 180 etc. However, we really need to represent Unicode characters internally
+ − 181 as-is, rather than converting to some language-specific character set.
+ − 182 For efficiency, we should represent Unicode characters using 3 bytes
+ − 183 rather than 4. This means we need to find leading bytes for Unicode.
+ − 184 Given that there are 65,536 characters in Unicode and we can attach
+ − 185 96x96 = 9,216 characters per leading byte, we need eight leading bytes
+ − 186 for Unicode. We currently have four free (0x9A - 0x9D), and with a
+ − 187 little bit of rearranging we can get five: ASCII doesn't really need to
+ − 188 take up a leading byte. (We could just as well use 0x7F, with a little
+ − 189 change to the functions that assume that 0x80 is the lowest leading
+ − 190 byte.) This means we still need to dump three leading bytes and move
+ − 191 them into private space. The CNS charsets are good candidates since
+ − 192 they are rarely used, and JAPANESE_JISX0208_1978 is becoming less and
+ − 193 less used and could also be dumped.
826
+ − 194
+ − 195 B. Composite Characters
+ − 196
+ − 197 Composite characters are characters constructed by overstriking two
771
+ − 198 or more regular characters.
+ − 199
+ − 200 1) The old Mule implementation involves storing composite characters
+ − 201 in a buffer as a tag followed by all of the actual characters
+ − 202 used to make up the composite character. I think this is a bad
+ − 203 idea; it greatly complicates code that wants to handle strings
+ − 204 one character at a time because it has to deal with the possibility
+ − 205 of great big ungainly characters. It's much more reasonable to
+ − 206 simply store an index into a table of composite characters.
+ − 207
+ − 208 2) The current implementation only allows for 16,384 separate
+ − 209 composite characters over the lifetime of the XEmacs process.
+ − 210 This could become a potential problem if the user
+ − 211 edited lots of different files that use composite characters.
+ − 212 Due to FSF bogosity, increasing the number of allowable
+ − 213 composite characters under Mule would decrease the number
+ − 214 of possible faces that can exist. Mule already has shrunk
+ − 215 this to 2048, and further shrinkage would become uncomfortable.
+ − 216 No such problems exist in XEmacs.
+ − 217
+ − 218 Composite characters could be represented as 0x8D C1 C2 C3,
+ − 219 where each C[1-3] is in the range 0xA0 - 0xFF. This allows
+ − 220 for slightly under 2^20 (one million) composite characters
+ − 221 over the XEmacs process lifetime, and you only need to
+ − 222 increase the size of a Mule character from 19 to 21 bits.
+ − 223 Or you could use 0x8D C1 C2 C3 C4, allowing for about
826
+ − 224 85 million (slightly over 2^26) composite characters.
+ − 225
2367
+ − 226 ==========================================================================
+ − 227 10. Internal API's
+ − 228 ==========================================================================
+ − 229
+ − 230 All of these are documented in more detail in text.h.
+ − 231
+ − 232 @enumerate
+ − 233 @item
+ − 234 Basic internal-format API's
+ − 235
+ − 236 These are simple functions and macros to convert between text
+ − 237 representation and characters, move forward and back in text, etc.
+ − 238
+ − 239 @item
+ − 240 The DFC API
+ − 241
+ − 242 This is for conversion between internal and external text. Note that
+ − 243 there is also the "new DFC" API, which *returns* a pointer to the
+ − 244 converted text (in alloca space), rather than storing it into a
+ − 245 variable.
+ − 246
+ − 247 @item
+ − 248 The Eistring API
+ − 249
+ − 250 (This API is currently under-used.) When doing simple things with
+ − 251 internal text, the basic internal-format API's are enough. But to do
+ − 252 things like delete or replace a substring, concatenate various strings,
+ − 253 etc. is difficult to do cleanly because of the allocation issues.
+ − 254 The Eistring API is designed to deal with this, and provides a clean
+ − 255 way of modifying and building up internal text. (Note that the former
+ − 256 lack of this API has meant that some code uses Lisp strings to do
+ − 257 similar manipulations, resulting in excess garbage and increased
+ − 258 garbage collection.)
+ − 259
+ − 260 NOTE: The Eistring API is (or should be) Mule-correct even without
+ − 261 an ASCII-compatible internal representation.
+ − 262 @end enumerate
+ − 263
+ − 264 ==========================================================================
+ − 265 11. Other Sources of Documentation
+ − 266 ==========================================================================
+ − 267
+ − 268 man/lispref/mule.texi
+ − 269 @enumerate
+ − 270 @item
+ − 271 another intro to characters, encodings, etc; #### Merge with the
+ − 272 above info
+ − 273 @item
+ − 274 documentation of ISO-2022
+ − 275 @item
+ − 276 The charset and coding-system Lisp API's
+ − 277 @item
+ − 278 The CCL conversion language for writing encoding conversions
+ − 279 @item
+ − 280 The Latin-Unity package for unifying Latin charsets
+ − 281 @end enumerate
+ − 282
+ − 283 man/internals/internals.texi (the Internals manual)
+ − 284 @enumerate
+ − 285 @item
+ − 286 "Coding for Mule" -- how to write Mule-aware code
+ − 287 @item
+ − 288 "Modules for Internationalization"
+ − 289 @item
+ − 290 "The Text in a Buffer" -- more about the different ways of
+ − 291 viewing buffer positions; #### Merge with the above info
+ − 292 @item
+ − 293 "MULE Character Sets and Encodings" -- yet another intro
+ − 294 to characters, encodings, etc; #### Merge with the
+ − 295 above info; also some documentation of Japanese EUC and JIS7,
+ − 296 and CCL internals
+ − 297 @end enumerate
+ − 298
+ − 299 text.h -- info about specific XEmacs-C API's for handling internal and
+ − 300 external text
+ − 301
+ − 302 intl-win32.c -- Windows-specific I18N information
+ − 303
+ − 304 lisp.h -- some info appears alongside the definitions of the basic
+ − 305 character-related types
+ − 306
+ − 307 unicode.c -- documentation about Unicode translation tables
826
+ − 308 */
771
+ − 309
2367
+ − 310
+ − 311 /************************************************************************/
+ − 312 /* */
+ − 313 /* */
+ − 314 /* Part B: Random proposals for work to be done */
+ − 315 /* */
+ − 316 /* */
+ − 317 /************************************************************************/
+ − 318
+ − 319
+ − 320 /*
+ − 321
+ − 322
+ − 323 ==========================================================================
+ − 324 - Mule design issues (ben)
+ − 325 ==========================================================================
+ − 326
+ − 327 circa 1999
+ − 328
+ − 329 Here is a more detailed list of Mule-related projects that we will be
+ − 330 working on. They are more or less ordered according to how we will
+ − 331 proceed, but it's not exact. In particular, there will probably be
+ − 332 time overlap among adjacent projects.
+ − 333
+ − 334 @enumerate
+ − 335 @item
+ − 336 Modify the internal/external conversion macros to allow for
+ − 337 MS Windows support.
+ − 338
+ − 339 @item
+ − 340 Modify the buffer macros to allow for more than one internal
+ − 341 representation, e.g. fixed width and variable width.
+ − 342
+ − 343 @item
+ − 344 Review the existing Mule code, especially the lisp code, for code
+ − 345 quality issues and improve the cleanliness of it. Also work on
+ − 346 creating a specification for the Mule API.
+ − 347
+ − 348 @item
+ − 349 Write some more automated mule tests.
+ − 350
+ − 351 @item
+ − 352 Integrate Tomohiko's UTF-2000 code, fixing it up so that nothing is
+ − 353 broken when the UTF-2000 configure option is not enabled.
+ − 354
+ − 355 @item
+ − 356 Fix up the MS Windows code to be Mule-correct, so that you can
+ − 357 compile with Mule support under MS windows and have a working
+ − 358 XEmacs, at least just with Latin-1.
+ − 359
+ − 360 @item
+ − 361 Implement a scheme to guarantee no corruption of files, even with
+ − 362 an incorrect coding system - in particular, guarantee no corruption
+ − 363 of binary files.
+ − 364
+ − 365 @item
+ − 366 Make the text property support in XEmacs robust with respect to
+ − 367 string and text operations, so that the `no corruption' support in
+ − 368 the previous entry works properly, even if a lot of cutting and
+ − 369 pasting is done.
+ − 370
+ − 371 @item
+ − 372 Improve the handling of auto-detection so that, when there is any
+ − 373 possibility at all of mistake, the user is informed of the detected
+ − 374 encoding and given the choice of choosing other possibilities.
+ − 375
+ − 376 @item
+ − 377 Improve the support for different language environments in XEmacs,
+ − 378 for example, the priority of coding systems used in auto-detection
+ − 379 should properly reflect the language environment. This probably
+ − 380 necessitates rethinking the current `coding system priority'
+ − 381 scheme.
+ − 382
+ − 383 @item
+ − 384 Do quality work to improve the existing UTF-2000 implementation.
+ − 385
+ − 386 @item
+ − 387 Implement preliminary support for 8-bit fixed width
+ − 388 representation. First, we will only implement 7-bit support, and
+ − 389 will fall back to variable width as soon as any non-ASCII
+ − 390 character is encountered. Then we will improve the support to
+ − 391 handle an arbitrary character set in the upper half of the 8-bit space.
+ − 392
+ − 393 @item
+ − 394 Investigate any remaining hurdles to making --with-mule be the
+ − 395 default configure option.
+ − 396 @end enumerate
+ − 397
+ − 398 ==========================================================================
+ − 399 - Mule design issues (stephen)
+ − 400 ==========================================================================
+ − 401
+ − 402 What I see as Mule priorities (in rough benefit order, I am not taking
+ − 403 account of difficulty, nor the fact that some - eg 8 & 10 - will
+ − 404 probably come as packages):
+ − 405
+ − 406 @enumerate
+ − 407 @item
+ − 408 Fix the autodetect problem (by making the coding priority list
+ − 409 user-configurable, as short as he likes, even null, with "binary"
+ − 410 as the default).
+ − 411 @item
+ − 412 Document the language environments and other Mule "APIs" as
+ − 413 implemented (since there is no real design spec). Check to see
+ − 414 how and where they are broken.
+ − 415 @item
+ − 416 Make the Mule menu useful to non-ISO-2022-literate folks.
+ − 417 @item
+ − 418 Redo the lstreams stuff to make it easy and robust to "pipeline",
+ − 419 eg, libz | gnupg | jis2mule.
+ − 420 @item
+ − 421 Make Custom Mule-aware. (This probably depends on a sensible
+ − 422 fonts model.)
+ − 423 @item
+ − 424 Implement the "literal byte stream" memory feature.
+ − 425 @item
+ − 426 Study the FSF implementation of Mule for background for 7 & 8.
+ − 427 @item
+ − 428 Identify desirable Mule features (eg, i18n-ized messages as above,
+ − 429 collating tables by language environment, etc). (New features
+ − 430 might have priority as high as 9.)
+ − 431 @item
+ − 432 Specify Mule UIs, APIs, etc, and design and (re)implement them.
+ − 433 @item
+ − 434 Implement the 8-bit-wide buffer optimization.
+ − 435 @item
+ − 436 Move the internal encoding to UTF-32 (subject to Olivier's caveats
+ − 437 regarding compose characters), with the variable-width char
+ − 438 buffers using UTF-8.
+ − 439 @item
+ − 440 Implement the 16- and 32-bit-wide buffer optimizations.
+ − 441 @end enumerate
+ − 442
+ − 443 ==========================================================================
+ − 444 - Mule design issues "short term" (ben)
+ − 445 ==========================================================================
+ − 446
+ − 447 @enumerate
+ − 448 @item
+ − 449 Finish changes in fixup/directory, get in CVS.
+ − 450
+ − 451 (Test with and without "quick-build", to see if really faster)
+ − 452 (need autoconf)
+ − 453
+ − 454 @item
+ − 455 Finish up Windows/Mule changes. Outline of this elsewhere; Do
+ − 456 *minimal* effort.
+ − 457
+ − 458 @item
+ − 459 Continue work on Windows stability, e.g. go through existing notes
+ − 460 on Windows Mule-ization + extract all info.
+ − 461
+ − 462 @item
+ − 463 Get Unicode translation tables integrated.
+ − 464
+ − 465 Finish UCS2/UTF16 coding system.
+ − 466
+ − 467 @item
+ − 468 Make sure coding system priority list is language-environment specific.
+ − 469
+ − 470 @item
+ − 471 Consider moving language selection Menu up to be parallel with Mule menu.
+ − 472
+ − 473 @item
+ − 474 Check to make sure we grok the default locale at startup under
+ − 475 Windows and understand the Windows locales. Finish implementation
+ − 476 of mswindows-multibyte and make sure it groks all the locales.
+ − 477
+ − 478 @item
+ − 479 Do the above as best as we can without using Unicode tables.
+ − 480
+ − 481 @item
+ − 482 Start tagging all text with a language text property,
+ − 483 indicating the current language environment when the text was input.
+ − 484
+ − 485 @item
+ − 486 Make sure we correctly accept input of non-ASCII chars
+ − 487 (probably already do!)
+ − 488
+ − 489 @item
+ − 490 Implement active language/keyboard switching under Windows.
+ − 491
+ − 492 @item
+ − 493 Look into implementing support for "MS IME" protocol (Microsoft
+ − 494 fancy built-in Asian input methods).
+ − 495
+ − 496 @item
+ − 497 Redo implementation of mswindows-multibyte and internal display to
+ − 498 entirely use translation to/from Unicode for increased accuracy.
+ − 499
+ − 500 @item
+ − 501 Implement buf<->char improvements from FSF. Also implement
+ − 502 my string byte<->char optimization structure.
+ − 503
+ − 504 @item
+ − 505 Integrate all Mule DOCS from 20.6 or 21.0. Try to add sections
+ − 506 for what we've added.
+ − 507
+ − 508 @item
+ − 509 Implement 8-bit fixed width optimizations. Then work on 16-bit.
+ − 510 @end enumerate
+ − 511
+ − 512 ==========================================================================
+ − 513 - Mule design issues (more) (ben)
+ − 514 ==========================================================================
+ − 515
+ − 516 Get minimal Mule for Windows working using Ikeyama's patches. At
+ − 517 first, rely on his conversion of internal -> external
+ − 518 locale-specific but very soon (as soon as we get translation
+ − 519 tables) can switch to using Unicode versions of display funs, which
+ − 520 will allow many more charsets to be handled and in a more
+ − 521 consistent fashion.
+ − 522
+ − 523 i.e. to convert an internal string to an external format, at first
+ − 524 we use our own knowledge of the Microsoft locale file formats but
+ − 525 an alternative is to convert to Unicode and use Microsoft's
+ − 526 convert-Unicode-to-locale encoding functions. This gains us a
+ − 527 great deal of generality, since in practice all charset caching
+ − 528 points can be wrapped into Unicode caching points.
+ − 529
+ − 530 This requires adding UCS2 support, which I'm doing. This support
+ − 531 would let us convert internal -> Unicode, which is exactly what we
+ − 532 want.
+ − 533
+ − 534 At first, though, I would do the UCS2 support, but leave the
+ − 535 existing way of doing things in redisplay. Meanwhile, I'd go
+ − 536 through and fix up the places in the code that assume we are
+ − 537 dealing with unibytes.
+ − 538
+ − 539 After this, the font problems will be fixed, and we should have a
+ − 540 pretty well working XEmacs + MULE under Windows. The only real
+ − 541 other work is the clipboard code, which should be straightforward.
+ − 542
+ − 543 ==========================================================================
+ − 544 - Mule design discussion
+ − 545 ==========================================================================
+ − 546
+ − 547 --------------------------------------------------------------------------
+ − 548
+ − 549 Ben
+ − 550
+ − 551 April 11, 2000
+ − 552
+ − 553 Well yes, this was the whole point of my "no lossage" proposal of being
+ − 554 able to undo any coding-system transformation on a buffer. The idea was
+ − 555 to figure out which transformations were definitely reversible, and for
+ − 556 all the others, cache the original text in a text property. This way, you
+ − 557 could probably still do a fairly good job at constructing a good reversal
+ − 558 even after you've gone into the text and added, deleted, and rearranged
+ − 559 some things.
+ − 560
+ − 561 But you could implement it much more simply and usefully by just
+ − 562 determining, for any text being decoded into mule-internal, can we go back
+ − 563 and read the source again? If not, remember the entire file (GNUS
+ − 564 message, etc) in text properties. Then, implement the UI interface (like
+ − 565 Netscape's) on top of that. This way, you have something that at least
+ − 566 works, but it might be inefficient. All we would need to do is work on
+ − 567 making the
+ − 568 underlying implementation more efficient.
+ − 569
+ − 570 Are you interested in doing this? It would be a huge win for users.
+ − 571 Hrvoje Niksic wrote:
+ − 572
+ − 573 > Ben Wing <ben@666.com> writes:
+ − 574 >
+ − 575 > > let me know exactly what "rethink" functionality you want and i'll
+ − 576 > > come up with an interface. perhaps you just want something like
+ − 577 > > netscape's encoding menu, where if you switch encodings, it reloads
+ − 578 > > and reencodes?
+ − 579 >
+ − 580 > It might be a bit more complex than that. In many cases, it's hard or
+ − 581 > impossible to meaningfully "reload" -- for instance, this
+ − 582 > functionality should be available while editing a Gnus message, as
+ − 583 > well as while visiting a file.
+ − 584 >
+ − 585 > For the special case of Latin-N <-> Latin-M conversion, things could
+ − 586 > be done easily -- to convert from N to M, you only need to convert
+ − 587 > internal representation back to N, and then convert it forth to M.
+ − 588
+ − 589 --------------------------------------------------------------------------
+ − 590 April 11, 2000
+ − 591
+ − 592 Well yes, this was the whole point of my "no lossage" proposal of being
+ − 593 able to undo any coding-system transformation on a buffer. The idea was
+ − 594 to figure out which transformations were definitely reversible, and for
+ − 595 all the others, cache the original text in a text property. This way, you
+ − 596 could probably still do a fairly good job at constructing a good reversal
+ − 597 even after you've gone into the text and added, deleted, and rearranged
+ − 598 some things.
+ − 599
+ − 600 But you could implement it much more simply and usefully by just
+ − 601 determining, for any text being decoded into mule-internal, can we go back
+ − 602 and read the source again? If not, remember the entire file (GNUS
+ − 603 message, etc) in text properties. Then, implement the UI interface (like
+ − 604 Netscape's) on top of that. This way, you have something that at least
+ − 605 works, but it might be inefficient. All we would need to do is work on
+ − 606 making the
+ − 607 underlying implementation more efficient.
+ − 608
+ − 609 Are you interested in doing this? It would be a huge win for users.
+ − 610 Hrvoje Niksic wrote:
+ − 611
+ − 612 > Ben Wing <ben@666.com> writes:
+ − 613 >
+ − 614 > > let me know exactly what "rethink" functionality you want and i'll
+ − 615 > > come up with an interface. perhaps you just want something like
+ − 616 > > netscape's encoding menu, where if you switch encodings, it reloads
+ − 617 > > and reencodes?
+ − 618 >
+ − 619 > It might be a bit more complex than that. In many cases, it's hard or
+ − 620 > impossible to meaningfully "reload" -- for instance, this
+ − 621 > functionality should be available while editing a Gnus message, as
+ − 622 > well as while visiting a file.
+ − 623 >
+ − 624 > For the special case of Latin-N <-> Latin-M conversion, things could
+ − 625 > be done easily -- to convert from N to M, you only need to convert
+ − 626 > internal representation back to N, and then convert it forth to M.
+ − 627
+ − 628
+ − 629 ------------------------------------------------------------------------
+ − 630
+ − 631 ==========================================================================
+ − 632 - Redoing translation macros [old]
+ − 633 ==========================================================================
+ − 634
+ − 635 Currently the translation macros (the macros with names such as
+ − 636 GET_C_STRING_CTEXT_DATA_ALLOCA) have names that are difficult to parse
+ − 637 or remember, and are not all that general. In the process of
+ − 638 reviewing the Windows code so that it could be muleized, I discovered
+ − 639 that these macros need to be extended in various ways to allow for
+ − 640 the Windows code to be easily muleized.
+ − 641
+ − 642 Since the macros needed to be changed anyways, I figured it would be a
+ − 643 good time to redo them properly. I propose new macros which have
+ − 644 names like this:
+ − 645
+ − 646 @itemize @bullet
+ − 647 @item
+ − 648 <A>_TO_EXTERNAL_FORMAT_<B>
+ − 649 @item
+ − 650 <A>_TO_EXTERNAL_FORMAT_<B>_1
+ − 651 @item
+ − 652 <C>_TO_INTERNAL_FORMAT_<D>
+ − 653 @item
+ − 654 <C>_TO_INTERNAL_FORMAT_<D>_1
+ − 655 @end itemize
+ − 656
+ − 657 A and C represent the source of the data, and B and D represent the
+ − 658 sink of the data.
+ − 659
+ − 660 All of these macros call either the functions
+ − 661 convert_to_external_format or convert_to_internal_format internally,
+ − 662 with some massaging of the arguments.
+ − 663
+ − 664 All of these macros take the following arguments:
+ − 665
+ − 666 @itemize @bullet
+ − 667 @item
+ − 668 First, one or two arguments indicating the source of the data.
+ − 669 @item
+ − 670 Second, an argument indicating the coding system. (In order to avoid
+ − 671 an excessive number of macros, we no longer provide separate macros
+ − 672 for specific coding systems.)
+ − 673 @item
+ − 674 Third, one or two arguments indicating the sink of the data.
+ − 675 @item
+ − 676 Fourth, optionally, arguments indicating the error behavior and the
+ − 677 warning class (these arguments are only present in the _1 versions
+ − 678 of the macros). The other, shorter named macros are trivial
+ − 679 interfaces onto these macros with the error behavior being
+ − 680 ERROR_ME_WARN, with the warning class being Vstandard_warning_class.
+ − 681 @end itemize
+ − 682
+ − 683 <A> can be one of the following:
+ − 684 @itemize @bullet
+ − 685 @item
+ − 686 LISP (which means a Lisp string) Takes one argument, a Lisp Object.
+ − 687 @item
+ − 688 LSTREAM (which indicates an lstream) Takes one argument, an
+ − 689 lstream. The data is read from the lstream until EOF is reached.
+ − 690 @item
+ − 691 DATA (which indicates a raw memory area) Takes two arguments, a
+ − 692 pointer and a length in bytes.
+ − 693 (You must never use this if the source of the data is a Lisp string,
+ − 694 because of the possibility of relocation during garbage collection.)
+ − 695 @end itemize
+ − 696
+ − 697 <B> can be one of the following:
+ − 698 @itemize @bullet
+ − 699 @item
+ − 700 ALLOCA (which means that the resulting data is stored in alloca()ed
+ − 701 memory. Two arguments should be specified, a pointer and a length,
+ − 702 which should be lvalues.)
+ − 703 @item
+ − 704 MALLOC (which means that the resulting data is stored in malloc()ed
+ − 705 memory. Two arguments should be specified, a pointer and a
+ − 706 length. The memory must be free()d by the caller.)
+ − 707 @item
+ − 708 OPAQUE (which means the resulting data is stored in an opaque Lisp
+ − 709 Object. This takes one argument, an lvalue Lisp Object.)
+ − 710 @item
+ − 711 LSTREAM. The data is written to an lstream.
+ − 712 @end itemize
+ − 713
+ − 714 <C> can be one of the following:
+ − 715 @itemize @bullet
+ − 716 @item
+ − 717 DATA
+ − 718 @item
+ − 719 LSTREAM
+ − 720 @end itemize
+ − 721 (just like <A> above)
+ − 722
+ − 723 <D> can be one of the following:
+ − 724 @itemize @bullet
+ − 725 @item
+ − 726 ALLOCA
+ − 727 @item
+ − 728 MALLOC
+ − 729 @item
+ − 730 LISP This means a Lisp String.
+ − 731 @item
+ − 732 BUFFER The resulting data is inserted into a buffer at the buffer's
+ − 733 value of point.
+ − 734 @item
+ − 735 LSTREAM The data is written to the lstream.
+ − 736 @end itemize
+ − 737
+ − 738
+ − 739 Note that I have eliminated the FORMAT argument of previous macros,
+ − 740 and replaced it with a coding system. This was made possible by
+ − 741 coding system aliases. In place of old `format's, we use a `virtual
+ − 742 coding system', which is aliased to the actual coding system.
+ − 743
+ − 744 The value of the coding system argument can be anything that is legal
+ − 745 input to get_coding_system, i.e. a symbol or a coding system object.
+ − 746
+ − 747 ==========================================================================
+ − 748 - creation of generic macros for accessing internally formatted data [old]
+ − 749 ==========================================================================
+ − 750
+ − 751 I have a design; it's all written down (I did it in Tsukuba), and I just have
+ − 752 to have it transcribed. It's higher level than the macros, though; it's Lisp
+ − 753 primitives that I'm designing.
+ − 754
+ − 755 As for the design of the macros, don't worry so much about all files having to
+ − 756 get included (which is inevitable with macros), but about how the files are
+ − 757 separated. Your design might go like this:
+ − 758
+ − 759 @enumerate
+ − 760 @item
+ − 761 you have generic macro interfaces, which specify a particular
+ − 762 behavior but not an implementation. these generic macros have
+ − 763 complementary versions for buffers and for strings (and the buffer
+ − 764 or string is an argument to all of the macros), and do such things
+ − 765 as convert between byte and char indices, retrieve the character at
+ − 766 a particular byte or char index, increment or decrement a byte
+ − 767 index to the beginning of the next or previous character, indicate
+ − 768 the number of bytes occupied by the character at a particular byte
+ − 769 or character index, etc. These are similar to what's already out
+ − 770 there except that they confound buffers and strings and that they
+ − 771 can also work with actual char *'s, which I think is a really bad
+ − 772 idea because it encourages code to "assume" that the representation
+ − 773 is ASCII compatible, which it might not be (e.g. 16-bit fixed
+ − 774 width). In fact, one thing I'm planning on doing is redefining
+ − 775 Bufbyte as a struct, for debugging purposes, to catch all places
+ − 776 that cavalierly compare them with ASCII char's. Note also that I
+ − 777 really want to rename Bufpos and Bytind, which are confusing and
+ − 778 wrong in that they also apply to strings. They should be Bytepos
+ − 779 and Charpos, or something like that, to go along with Bytecount and
+ − 780 Charcount. Bufbyte is similarly a misnomer and should be
+ − 781 Intbyte -- a byte in the internal string representation (any of the
+ − 782 internal representations) of a string or buffer. Corresponding to
+ − 783 this is Extbyte (which we already have), a byte in any external
+ − 784 string representation. We also have Extcount, which makes sense,
+ − 785 and we might possibly want Extcharcount, the number of characters
+ − 786 in an external string representation; but that gets sticky in modal
+ − 787 encodings, and it's not clear how useful it would be.
+ − 788
+ − 789 @item
+ − 790 for all generic macro interfaces, there are specific versions of
+ − 791 each of them for each possible representation (pure ASCII in the
+ − 792 non-Mule world, Mule standard, UTF-8, 8-bit fixed, 16-bit fixed,
+ − 793 32-bit fixed, etc.; there may well be more than one possible 16-bit
+ − 794 fixed version, as well). Each representation has a corresponding
+ − 795 prefix, e.g. MULE_ or FIXED16_ or whatever, which is prefixed onto
+ − 796 the generic macro names. The resulting macros perform the
+ − 797 operation defined for the macro, but assume, and only work
+ − 798 correctly with, text in the corresponding representation.
+ − 799
+ − 800 @item
+ − 801 The definition of the generic versions merely conditionalizes on
+ − 802 the appropriate things (i.e. bit flags in the buffer or string
+ − 803 object) and calls the appropriate representation-specific version.
+ − 804 There may be more than one definition (protected by ifdefs, of
+ − 805 course), or one definition that is amalgamated out of many ifdef'ed
+ − 806 sections.
+ − 807
+ − 808 @item
+ − 809 You should probably put each different representation in its own
+ − 810 header file, e.g. charset-mule.h or charset-fixed16.h or
+ − 811 charset-ascii.h or whatever. Then put the main macros into
+ − 812 charset.h, and conditionalize in this file appropriately to include
+ − 813 the other ones. That way, code that actually needs to play around
+ − 814 with internal-format text at this level can include "charset.h"
+ − 815 (certainly a much better place than buffer.h), and everyone else
+ − 816 uses higher-level routines. The representation-specific macros
+ − 817 should not normally be used *directly* at all; they are invoked
+ − 818 automatically from the generic macros. However, code that needs to
+ − 819 be highly, highly optimized might choose to take a loop and write
+ − 820 two versions of it, one for each representation, to avoid the
+ − 821 per-loop-iteration cost of a comparison. Until the macro interface
+ − 822 is rock stable and solid, we should strongly discourage such
+ − 823 nanosecond optimizations.
+ − 824 @end enumerate
+ − 825
+ − 826 ==========================================================================
+ − 827 - UTF-16 compatible representation
+ − 828 ==========================================================================
+ − 829
+ − 830 NOTE: One possible default internal representation that was compatible
+ − 831 with UTF16 but allowed all possible chars in UCS4 would be to take a
+ − 832 more-or-less unused range of 2048 chars (not from the private area
+ − 833 because Microsoft actually uses up most or all of it with EUDC chars).
+ − 834 Let's say we picked A400 - ABFF. Then, we'd have:
+ − 835
+ − 836 0000 - FFFF Simple chars
+ − 837
+ − 838 D[8-B]xx D[C-F]xx Surrogate char, represents 1M chars
+ − 839
+ − 840 A[4-B]xx D[C-F]xx D[C-F]xx Surrogate char, represents 2G chars
+ − 841
+ − 842 This is exactly the same number of chars as UCS-4 handles, and it follows the
+ − 843 same property as UTF8 and Mule-internal:
+ − 844
+ − 845 @enumerate
+ − 846 @item
+ − 847 There are two disjoint groupings of units, one representing leading units
+ − 848 and one representing non-leading units.
+ − 849 @item
+ − 850 Given a leading unit, you immediately know how many units follow to make
+ − 851 up a valid char, irrespective of any other context.
+ − 852 @end enumerate
+ − 853
+ − 854 Note that A4xx is actually currently assigned to Yi. Since this is an
+ − 855 internal representation, we could just move these elsewhere.
+ − 856
+ − 857 An alternative is to pick two disjoint ranges, e.g. 2D00 - 2DFF and
+ − 858 A500 - ABFF.
+ − 859
+ − 860 ==========================================================================
+ − 861 New API for char->font mapping
+ − 862 ==========================================================================
+ − 863 - ; supersedes charset-registry and CCL;
+ − 864 supports all windows systems; powerful enough for Unicode; etc.
+ − 865
+ − 866 (charset-font-mapping charset)
+ − 867
+ − 868 font-mapping-specifier string
+ − 869
+ − 870 char-font-mapping-table
+ − 871
+ − 872 char-table, specifier; elements of char table are either strings (which
+ − 873 specify a registry or comparable font property), or vectors of a string
+ − 874 (same) followed by keyword-value pairs (optional). The only allowable
+ − 875 keyword currently is :ccl-program, which specifies a CCL program to map
+ − 876 the characters into font indices. Other keywords may be added
+ − 877 e.g. allowing Elisp fragments instead of CCL programs, also allowed is
+ − 878 [inherit], which inherits from the next less-specific char-table in the
+ − 879 specifier.
+ − 880
+ − 881 The preferred interface onto this mapping (which should be portable
+ − 882 across Emacsen) is
+ − 883
+ − 884 (set-char-font-mapping key value &optional locale tag-set how-to-add)
+ − 885
+ − 886 where key is a char, range or charset (as for put-char-table), value is
+ − 887 as above, and the other arguments are standard for specifiers. This
+ − 888 automatically creates a char table in the locale, as necessary (all
+ − 889 elements default to [inherit]). On GNU Emacs, some specifiers arguments
+ − 890 may be unimplemented.
+ − 891
+ − 892 (char-font-mapping key value &optional locale)
+ − 893 works vaguely like get-specifier? But does inheritance processing.
+ − 894 locale should clearly default here to current-buffer
+ − 895
+ − 896 #### should get-specifier as well? Would make it work most like
+ − 897 #### buffer-local variables.
+ − 898
+ − 899 NB. set-charset-registry and set-charset-ccl-program are obsoleted.
+ − 900
+ − 901 ==========================================================================
+ − 902 Implementing fixed-width 8,16,32 bit buffer optimizations
+ − 903 ==========================================================================
+ − 904
+ − 905 Add set-buffer-optimization (buffer &rest keywords) for
+ − 906 controlling these things.
+ − 907
+ − 908 Also, put in hack so that correct arglist can be retrieved by
+ − 909 Lisp code.
+ − 910
+ − 911 Look at the way keyword primitives are currently handled; make
+ − 912 sure it works and is documented, etc.
+ − 913
+ − 914 Implement 8-bit fixed width optimization. Take the things that
+ − 915 know about the actual implementation and put them in a single
+ − 916 file, in essence creating an abstraction layer to allow
+ − 917 pluggable internal representations. Implement a fairly general
+ − 918 scheme for mapping between character codes in the 8 bits or 16
+ − 919 bits representation and on actual charset characters. As part of
+ − 920 set-buffer-optimization, you can specify a list of character sets
+ − 921 to be used in the 8 bit to 16 bit, etc. world. You can also
+ − 922 request that the buffer be in 8, 16, etc. if possible.
+ − 923
+ − 924 -> set defaults wrt this.
+ − 925 -> perhaps this should be just buffer properties.
+ − 926 -> this brings up the idea of default properties on an object.
+ − 927 -> Implement default-put, default-get, etc.
+ − 928
+ − 929 What happens when a character not assigned in the range gets
+ − 930 added? Then, must convert to variable width of some sort.
+ − 931
+ − 932 Note: at first, possibly we just convert whole hog to get things
+ − 933 right. Then we'd have to poy alternative to characters that got
+ − 934 added + deleted that were unassigned in the fixed width. When
+ − 935 this goes to zero and there's been enough time (heuristics), we
+ − 936 go back to fixed.
+ − 937
+ − 938 Side note: We could dynamically build up the set of assigned
+ − 939 chars as they go. Conceivably this could even go down to the
+ − 940 single char level: Just keep a big array of mapping from 16 bit
+ − 941 values to chars, and add to it every time a char is encountered
+ − 942 that wasn't there before. Problem: need inverse mapping.
+ − 943
+ − 944 -> Possibility; chars are actual objects, not just numbers.
+ − 945 Then you could keep track of such info in the chars itself.
+ − 946 *Think about this.*
+ − 947
+ − 948 Eventually, we might consider allowing mixed fixed-width,
+ − 949 variable-width buffer encodings. Then, we use range tables to
+ − 950 indicate which sections are fixed and which variable and INC_CHAR does
+ − 951 something like this: binary search to find the current range, which
+ − 952 indicates whether it's fixed or variable, and tells us what the
+ − 953 increment is. We can cache this info and use it next time to speed
+ − 954 up.
+ − 955
+ − 956 -> We will then have two partially shared range tables - one for
+ − 957 overall fixed width vs. variable width, and possibly one containing
+ − 958 this same info, but partitioning the variable width in one. Maybe
+ − 959 need fancier nested range table model.
+ − 960
+ − 961 ==========================================================================
+ − 962 Expansion of display table and case mapping table support for all
+ − 963 chars, not just ASCII/Latin1.
+ − 964 ==========================================================================
+ − 965
+ − 966 ==========================================================================
+ − 967 Improved flexibility for display tables, and evaluation of its
+ − 968 features to make sure it meshes with and complements the char<->font
+ − 969 mapping API mentioned earlier
+ − 970 ==========================================================================
+ − 971
+ − 972 ==========================================================================
+ − 973 String access speedup:
+ − 974 ==========================================================================
+ − 975
+ − 976 For strings larger than some size in bytes (10?), keep extra fields of
+ − 977 info: length in chars, and a (char, byte) pair in the middle to speed
+ − 978 up sequential access.
+ − 979
+ − 980 (Better idea: do this for any size string, but only if it contains
+ − 981 non-ASCII chars. Then if info is missing, we know string is
+ − 982 ASCII-only.)
+ − 983
+ − 984 Use a string-extra-info object, replacing string property slot and
+ − 985 containing fields for string mod tick, string extents, string props,
+ − 986 and string char length, and cached (char,byte) pair.
+ − 987 string-extra-info (or string-auxiliary?) objects could be in frob
+ − 988 blocks, esp. if creating frob blocks is easy + worth it.
+ − 989
+ − 990 - Caching of char<->byte conversions in strings - should make nearly
+ − 991 all operations on strings O(N)
+ − 992
+ − 993 ==========================================================================
+ − 994 Improvements in buffer char<->byte mapping
+ − 995 ==========================================================================
+ − 996
+ − 997 - Range table implementation - especially when there are few runs of
+ − 998 different widths, e.g. recently converted from fixed-width
+ − 999 optimization to variable width
+ − 1000
+ − 1001 Range Tables to speed up Bufpos <-> Bytind caching
+ − 1002 ==================================================
+ − 1003
+ − 1004 This describes an alternative implementation using ranges. We
+ − 1005 maintain a range table of all spans of characters of a fixed width.
+ − 1006 Updating this table could take time if there are a large number of
+ − 1007 spans; but constant factors of operations should be quick. This method really wins
+ − 1008 when you have 8-bit buffers just converted to variable width, where
+ − 1009 there will be few spans. More specifically, lookup in this range
+ − 1010 table is O(log N) and can be done with simple binary search, which is
+ − 1011 very fast. If we maintain the ranges using a gap array, updating this
+ − 1012 table will be fast for local operations, which is most of the time.
+ − 1013
+ − 1014 We will also provide (at first, at least) a Lisp function to set the
+ − 1015 caching mechanism explicitly - either range tables or the existing
+ − 1016 implementation. Eventually, we want to improve things, to the point
+ − 1017 where we automatically pick the right caching for the situation and
+ − 1018 have more caching schemes implemented.
+ − 1019
+ − 1020 ==========================================================================
+ − 1021 - Robustify Text Properties
+ − 1022 ==========================================================================
+ − 1023
+ − 1024 ==========================================================================
+ − 1025 Support for unified internal representation, e.g. Unicode
+ − 1026 ==========================================================================
+ − 1027
+ − 1028 Start tagging all text with a language text property,
+ − 1029 indicating the current language environment when the text was input.
+ − 1030 (needs "Robustify Text Properties")
+ − 1031
+ − 1032 ==========================================================================
+ − 1033 - Generalized Coding Systems
+ − 1034 ==========================================================================
+ − 1035
+ − 1036 - Lisp API for Defining Coding Systems
+ − 1037
+ − 1038 User-defined coding systems.
+ − 1039
+ − 1040 (define-coding-system-type 'type
+ − 1041 :encode-function fun
+ − 1042 :decode-function fun
+ − 1043 :detect-function fun
+ − 1044 :buffering (number = at least this many chars
+ − 1045 line = buffer up to end of line
+ − 1046 regexp = buffer until this regexp is found in match
+ − 1047 source data. match data will be appropriate when fun is
+ − 1048 called
+ − 1049
+ − 1050 encode fun is called as
+ − 1051
+ − 1052 (encode instream outstream)
+ − 1053
+ − 1054 should read data from instream and write converted result onto
+ − 1055 outstream. Can leave some data stuff in stream, it will reappear
+ − 1056 next time. Generally, there is a finite amount of data in instream
+ − 1057 and further attempts to read lead to would-block errors or retvals.
+ − 1058 Can use instream properties to record state. May use read-stream
+ − 1059 functionality to read everything into a vector or string.
+ − 1060
+ − 1061 ->Need vectors + string exposed to resizing of Lisp implementation
+ − 1062 where necessary.
+ − 1063
+ − 1064 ==========================================================================
+ − 1065 Support Windows Active Kbd Switching, Far East IME API (done already?)
+ − 1066 ==========================================================================
+ − 1067
+ − 1068 ==========================================================================
+ − 1069 - UI/design changes for Coding System Pipelining
+ − 1070 ==========================================================================
+ − 1071
+ − 1072 ------------------------------------------------------------------
+ − 1073 CODING-SYSTEM CHAINS
+ − 1074 ------------------------------------------------------------------
+ − 1075
+ − 1076 sjt sez:
+ − 1077
+ − 1078 There should be no elementary coding systems in the Lisp API, only
+ − 1079 chains. Chains should be declared, not computed, as a sequence of coding
+ − 1080 formats. (Probably the internal representation can be a vector for
+ − 1081 efficiency but programmers would probably rather work with lists.) A
+ − 1082 stream has a token type. Most streams are octet streams. Text is a
+ − 1083 stream of characters (in _internal_ format; a file on disk is not text!)
+ − 1084 An octet-stream has no implicit semantics, so its format must always be
+ − 1085 specified. The only type currently having semantics is characters. This
+ − 1086 means that the chain [euc-jp -> internal -> shift_jis] may be specified
+ − 1087 (euc-jp, shift_jis), and if no euc-jp -> shift_jis converter is
+ − 1088 available, then the chain is automatically constructed. (N.B. If we
+ − 1089 have fixed width buffers in the future, then we could have ASCII -> 8-bit
+ − 1090 char -> 16-bit char -> ISO-2022-JP (with escape sequences).)
+ − 1091
+ − 1092 EOL handling is a char <-> char coding. It should not be part of another
+ − 1093 coding system except as a convenience for users. For text coding,
+ − 1094 automatically insert EOL handlers between char <-> octet boundaries.
+ − 1095
+ − 1096 ------------------------------------------------------------------
+ − 1097 ABOUT DETECTION
+ − 1098 ------------------------------------------------------------------
+ − 1099
+ − 1100
+ − 1101 ------------------------------------------------------------------
+ − 1102 EFFICIENCY OF CODING CONVERSION WITH MULTIPLE COPIES/CHAINS
+ − 1103 ------------------------------------------------------------------
+ − 1104
+ − 1105 A comment in encode_decode_coding_region():
+ − 1106
+ − 1107 The chain of streams looks like this:
+ − 1108
+ − 1109 [BUFFER] <----- (( read from/send to loop ))
+ − 1110 ------> [CHAR->BYTE i.e. ENCODE AS BINARY if source is
+ − 1111 in bytes]
+ − 1112 ------> [ENCODE/DECODE AS SPECIFIED]
+ − 1113 ------> [BYTE->CHAR i.e. DECODE AS BINARY
+ − 1114 if sink is in bytes]
+ − 1115 ------> [AUTODETECT EOL if
+ − 1116 we're decoding and
+ − 1117 coding system calls
+ − 1118 for this]
+ − 1119 ------> [BUFFER]
+ − 1120
+ − 1121 sjt (?) responds:
+ − 1122
+ − 1123 Of course, this is just horrible. BYTE<->CHAR should only be available
+ − 1124 to I/O routines. It should not be visible to Mule proper.
+ − 1125
+ − 1126 A comment on the implementation. Hrvoje and Kyle worry about the
+ − 1127 inefficiency of repeated copying among buffers that chained coding
+ − 1128 systems entail. But this may not be as time inefficient as it appears
+ − 1129 in the Mule ("house rules") context. The issue is how do you do chain
+ − 1130 coding systems without copying? In theory you could have
+ − 1131
+ − 1132 IChar external_to_raw (ExtChar *cp, State *s);
+ − 1133 IChar decode_utf16 (IChar c, State *s);
+ − 1134 IChar decode_crlf (ExtChar *cp, State *s);
+ − 1135
+ − 1136 typedef Ichar (*Converter[]) (Ichar, State*);
+ − 1137
+ − 1138 Converter utf16[2] = { &decode_utf16, &decode_crlf };
+ − 1139
+ − 1140 void convert (ExtChar *inbuf, IChar *outbuf, Converter cvtr)
+ − 1141 {
+ − 1142 int i;
+ − 1143 ExtChar c;
+ − 1144 State s;
+ − 1145
+ − 1146 while (c = external_to_raw (*inbuf++, &s))
+ − 1147 {
+ − 1148 for (i = 0; i < sizeof(cvtr)/sizeof(Converter); ++i)
+ − 1149 if (s.ready)
+ − 1150 c = (*cvtr[i]) (c, &s);
+ − 1151 }
+ − 1152 if (s.ready)
+ − 1153 *outbuf++ = c;
+ − 1154 }
+ − 1155
+ − 1156 But this is a lot of function calls; what Ben is doing is basically
+ − 1157 reducing this to one call per buffer-full. The only way to avoid this
+ − 1158 is to hardcode all the "interesting" coding systems, maybe using
+ − 1159 inline or macros to give structure. But this is still a huge amount
+ − 1160 of work, and code.
+ − 1161
+ − 1162 One advantage to the call-per-char approach is that we might be able
+ − 1163 to do something about the marker/extent destruction that coding
+ − 1164 normally entails.
+ − 1165
+ − 1166 ben sez:
+ − 1167
+ − 1168 it should be possible to preserve the markers/extents without
+ − 1169 switching completely to one-call-per-char -- we could at least do one
+ − 1170 call per "run", where a run is more or less the maximal stretch of
+ − 1171 text not overlapping any markers or extent boundaries. (It's a bit
+ − 1172 more complicated if we want to properly support the different extent
+ − 1173 begins/ends; in some cases we might have to pump a single character
+ − 1174 adjacent to where two extents meet.) The "stateless" way that I wrote
+ − 1175 all of the conversion routines may be a real hassle but it allows
+ − 1176 something like this to work without too much problem -- pump in one
+ − 1177 run at a time into one end of the chain, do a flush after each
+ − 1178 iteration, and stick what comes out the other end in its place.
+ − 1179
+ − 1180 ------------------------------------------------------------------
+ − 1181 ABOUT FORMATS
+ − 1182 ------------------------------------------------------------------
+ − 1183
+ − 1184 when calling make-coding-system, the name can be a cons of (format1 .
+ − 1185 format2), specifying that it decodes format1->format2 and encodes the other
+ − 1186 way. if only one name is given, that is assumed to be format1, and the
+ − 1187 other is either `external' or `internal' depending on the end type.
+ − 1188 normally the user when decoding gives the decoding order in formats, but
+ − 1189 can leave off the last one, `internal', which is assumed. a multichain
+ − 1190 might look like gzip|multibyte|unicode, using the coding systems named
+ − 1191 `gzip', `(unicode . multibyte)' and `unicode'. the way this actually works
+ − 1192 is by searching for gzip->multibyte; if not found, look for gzip->external
+ − 1193 or gzip->internal. (In general we automatically do conversion between
+ − 1194 internal and external as necessary: thus gzip|crlf does the expected, and
+ − 1195 maps to gzip->external, external->internal, crlf->internal, which when
+ − 1196 fully specified would be gzip|external:external|internal:crlf|internal --
+ − 1197 see below.) To forcibly fit together two converters that have explicitly
+ − 1198 specified and incompatible names (say you have unicode->multibyte and
+ − 1199 iso8859-1->ebcdic and you know that the multibyte and iso8859-1 in this
+ − 1200 case are compatible), you can force-cast using :, like this:
+ − 1201 ebcdic|iso8859-1:multibyte|unicode. (again, if you force-cast between
+ − 1202 internal and external formats, the conversion happens automatically.)
+ − 1203
+ − 1204 --------------------------------------------------------------------------
+ − 1205 ABOUT PDUMP, UNICODE, AND RUNNING XEMACS FROM A DIRECTORY WITH WEIRD CHARS
+ − 1206 --------------------------------------------------------------------------
+ − 1207
+ − 1208 -- there's the problem that XEmacs can't be run in a directory with
+ − 1209 non-ASCII/Latin-1 chars in it, since it will be doing Unicode
+ − 1210 processing before we've had a chance to load the tables. In fact,
+ − 1211 even finding the tables in such a situation is problematic using
+ − 1212 the normal commands. my idea is to eventually load the stuff
+ − 1213 extremely extremely early, at the same time as the pdump data gets
+ − 1214 loaded. in fact, the unicode table data (stored in an efficient
+ − 1215 binary format) can even be stuck into the pdump file (which would
+ − 1216 mean as a resource to the executable, for windows). we'd need to
+ − 1217 extend pdump a bit: to allow for attaching extra data to the pdump
+ − 1218 file. (something like pdump_attach_extra_data (addr, length)
+ − 1219 returns a number of some sort, an index into the file, which you
+ − 1220 can then retrieve with pdump_load_extra_data(), which returns an
+ − 1221 addr (mmap()ed or loaded), and later you pdump_unload_extra_data()
+ − 1222 when finished. we'd probably also need
+ − 1223 pdump_attach_extra_data_append(), which appends data to the data
+ − 1224 just written out with pdump_attach_extra_data(). this way,
+ − 1225 multiple tables in memory can be written out into one contiguous
+ − 1226 table. (we'd use the tar-like trick of allowing new blocks to be
+ − 1227 written without going back to change the old blocks -- we just rely
+ − 1228 on the end of file/end of memory.) this same mechanism could be
+ − 1229 extracted out of pdump and used to handle the non-pdump situation
+ − 1230 (or alternatively, we could just dump either the memory image of
+ − 1231 the tables themselves or the compressed binary version). in the
+ − 1232 case of extra unicode tables not known about at compile time that
+ − 1233 get loaded before dumping, we either just dump them into the image
+ − 1234 (pdump and all) or extract them into the compressed binary format,
+ − 1235 free the original tables, and treat them like all other tables.
+ − 1236
+ − 1237
+ − 1238 ==========================================================================
+ − 1239 - Generalized language appropriate word wrapping (requires
+ − 1240 layout-exposing API defined in BIDI section)
+ − 1241 ==========================================================================
+ − 1242
+ − 1243 ==========================================================================
+ − 1244 - Make Custom Mule-aware
+ − 1245 ==========================================================================
+ − 1246
+ − 1247 ==========================================================================
+ − 1248 - Composite character support
+ − 1249 ==========================================================================
+ − 1250
+ − 1251 ==========================================================================
+ − 1252 - Language appropriate sorting and searching
+ − 1253 ==========================================================================
+ − 1254
+ − 1255 ==========================================================================
+ − 1256 - Glyph shaping for Arabic and Devanagari
+ − 1257 ==========================================================================
+ − 1258
+ − 1259 - (needs to be handled mostly
+ − 1260 at C level, as part of layout; luckily it's entirely local in its
+ − 1261 changes, as this is not hard)
+ − 1262
+ − 1263
+ − 1264 ==========================================================================
+ − 1265 Consider moving language selection Menu up to be parallel with Mule menu
+ − 1266 ==========================================================================
+ − 1267
+ − 1268 */
+ − 1269
+ − 1270
771
+ − 1271
+ − 1272 /************************************************************************/
+ − 1273 /* declarations */
+ − 1274 /************************************************************************/
+ − 1275
/* Two file-scope Eistring objects with external linkage.
   NOTE(review): judging by the names, these are the all-zero templates
   used to initialize Eistrings (plain and malloc-backed variants) --
   confirm against the Eistring macros in the headers. */
+ − 1276 Eistring the_eistring_zero_init, the_eistring_malloc_zero_init;
+ − 1277
/* Limits used to size three_to_one_table.  The first is 65535/3 = 21845;
   the second is three times that, i.e. 65535 (65535 is an exact multiple
   of 3, so nothing is lost to the integer division). */
+ − 1278 #define MAX_CHARBPOS_GAP_SIZE_3 (65535/3)
+ − 1279 #define MAX_BYTEBPOS_GAP_SIZE_3 (3 * MAX_CHARBPOS_GAP_SIZE_3)
+ − 1280
/* Table of shorts with one entry for every index from 0 through
   MAX_BYTEBPOS_GAP_SIZE_3 inclusive (hence the "1 +").
   NOTE(review): the name suggests entry I holds I / 3, so that a byte
   gap can be turned into a char gap without dividing; the table is
   filled in elsewhere -- confirm against its initialization. */
+ − 1281 short three_to_one_table[1 + MAX_BYTEBPOS_GAP_SIZE_3];
+ − 1282
+ − 1283 #ifdef MULE
+ − 1284
+ − 1285 /* Table of number of bytes in the string representation of a character
+ − 1286 indexed by the first byte of that representation.
+ − 1287
+ − 1288 rep_bytes_by_first_byte(c) is more efficient than the equivalent
+ − 1289 canonical computation:
+ − 1290
826
+ − 1291 XCHARSET_REP_BYTES (charset_by_leading_byte (c)) */
771
+ − 1292
/* Only first bytes 0x00 - 0x9f are covered (0xA0 entries); each row
   below holds 16 consecutive first-byte values. */
+ − 1293 const Bytecount rep_bytes_by_first_byte[0xA0] =
+ − 1294 { /* 0x00 - 0x7f are for straight ASCII */
+ − 1295 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ − 1296 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ − 1297 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ − 1298 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ − 1299 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ − 1300 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ − 1301 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ − 1302 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ − 1303 /* 0x80 - 0x8f are for Dimension-1 official charsets */
+ − 1304 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ − 1305 /* 0x90 - 0x9d are for Dimension-2 official charsets */
+ − 1306 /* 0x9e is for Dimension-1 private charsets */
+ − 1307 /* 0x9f is for Dimension-2 private charsets */
/* So 0x90 - 0x9e (the first fifteen entries of this row) are 3 bytes,
   and only 0x9f (the last entry) is 4 bytes. */
+ − 1308 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4
+ − 1309 };
+ − 1310
+ − 1311 #ifdef ENABLE_COMPOSITE_CHARS
+ − 1312
+ − 1313 /* Hash tables for composite chars. One maps string representing
+ − 1314 composed chars to their equivalent chars; one goes the
+ − 1315 other way. */
+ − 1316 Lisp_Object Vcomposite_char_char2string_hash_table;
+ − 1317 Lisp_Object Vcomposite_char_string2char_hash_table;
+ − 1318
+ − 1319 static int composite_char_row_next;
+ − 1320 static int composite_char_col_next;
+ − 1321
+ − 1322 #endif /* ENABLE_COMPOSITE_CHARS */
+ − 1323
+ − 1324 #endif /* MULE */
+ − 1325
1292
+ − 1326 Lisp_Object QSin_char_byte_conversion;
+ − 1327 Lisp_Object QSin_internal_external_conversion;
+ − 1328
771
+ − 1329
+ − 1330 /************************************************************************/
+ − 1331 /* qxestr***() functions */
+ − 1332 /************************************************************************/
+ − 1333
+ − 1334 /* Most are inline functions in lisp.h */
+ − 1335
+ − 1336 int
867
+ − 1337 qxesprintf (Ibyte *buffer, const CIbyte *format, ...)
771
+ − 1338 {
+ − 1339 va_list args;
+ − 1340 int retval;
+ − 1341
+ − 1342 va_start (args, format);
2367
+ − 1343 retval = vsprintf ((Chbyte *) buffer, format, args);
771
+ − 1344 va_end (args);
+ − 1345
+ − 1346 return retval;
+ − 1347 }
+ − 1348
+ − 1349 /* strcasecmp() implementation from BSD */
867
+ − 1350 static Ibyte strcasecmp_charmap[] = {
1429
+ − 1351 0000, 0001, 0002, 0003, 0004, 0005, 0006, 0007,
+ − 1352 0010, 0011, 0012, 0013, 0014, 0015, 0016, 0017,
+ − 1353 0020, 0021, 0022, 0023, 0024, 0025, 0026, 0027,
+ − 1354 0030, 0031, 0032, 0033, 0034, 0035, 0036, 0037,
+ − 1355 0040, 0041, 0042, 0043, 0044, 0045, 0046, 0047,
+ − 1356 0050, 0051, 0052, 0053, 0054, 0055, 0056, 0057,
+ − 1357 0060, 0061, 0062, 0063, 0064, 0065, 0066, 0067,
+ − 1358 0070, 0071, 0072, 0073, 0074, 0075, 0076, 0077,
+ − 1359 0100, 0141, 0142, 0143, 0144, 0145, 0146, 0147,
+ − 1360 0150, 0151, 0152, 0153, 0154, 0155, 0156, 0157,
+ − 1361 0160, 0161, 0162, 0163, 0164, 0165, 0166, 0167,
+ − 1362 0170, 0171, 0172, 0133, 0134, 0135, 0136, 0137,
+ − 1363 0140, 0141, 0142, 0143, 0144, 0145, 0146, 0147,
+ − 1364 0150, 0151, 0152, 0153, 0154, 0155, 0156, 0157,
+ − 1365 0160, 0161, 0162, 0163, 0164, 0165, 0166, 0167,
+ − 1366 0170, 0171, 0172, 0173, 0174, 0175, 0176, 0177,
+ − 1367 0200, 0201, 0202, 0203, 0204, 0205, 0206, 0207,
+ − 1368 0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217,
+ − 1369 0220, 0221, 0222, 0223, 0224, 0225, 0226, 0227,
+ − 1370 0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237,
+ − 1371 0240, 0241, 0242, 0243, 0244, 0245, 0246, 0247,
+ − 1372 0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257,
+ − 1373 0260, 0261, 0262, 0263, 0264, 0265, 0266, 0267,
+ − 1374 0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277,
+ − 1375 0300, 0301, 0302, 0303, 0304, 0305, 0306, 0307,
+ − 1376 0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317,
+ − 1377 0320, 0321, 0322, 0323, 0324, 0325, 0326, 0327,
+ − 1378 0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337,
+ − 1379 0340, 0341, 0342, 0343, 0344, 0345, 0346, 0347,
+ − 1380 0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357,
+ − 1381 0360, 0361, 0362, 0363, 0364, 0365, 0366, 0367,
+ − 1382 0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377
771
+ − 1383 };
+ − 1384
+ − 1385 /* A version that works like generic strcasecmp() -- only collapsing
+ − 1386 case in ASCII A-Z/a-z. This is safe on Mule strings due to the
+ − 1387 current representation.
+ − 1388
+ − 1389 This version was written by some Berkeley coder, favoring
+ − 1390 nanosecond improvements over clarity. In all other versions below,
+ − 1391 we use symmetrical algorithms that may sacrifice a few machine
+ − 1392 cycles but are MUCH MUCH clearer, which counts a lot more.
+ − 1393 */
+ − 1394
+ − 1395 int
867
+ − 1396 qxestrcasecmp (const Ibyte *s1, const Ibyte *s2)
771
+ − 1397 {
867
+ − 1398 Ibyte *cm = strcasecmp_charmap;
771
+ − 1399
+ − 1400 while (cm[*s1] == cm[*s2++])
+ − 1401 if (*s1++ == '\0')
+ − 1402 return (0);
+ − 1403
+ − 1404 return (cm[*s1] - cm[*--s2]);
+ − 1405 }
+ − 1406
+ − 1407 int
2367
+ − 1408 ascii_strcasecmp (const Ascbyte *s1, const Ascbyte *s2)
771
+ − 1409 {
867
+ − 1410 return qxestrcasecmp ((const Ibyte *) s1, (const Ibyte *) s2);
771
+ − 1411 }
+ − 1412
+ − 1413 int
2367
+ − 1414 qxestrcasecmp_ascii (const Ibyte *s1, const Ascbyte *s2)
771
+ − 1415 {
867
+ − 1416 return qxestrcasecmp (s1, (const Ibyte *) s2);
771
+ − 1417 }
+ − 1418
+ − 1419 /* An internationalized version that collapses case in a general fashion.
+ − 1420 */
+ − 1421
+ − 1422 int
867
+ − 1423 qxestrcasecmp_i18n (const Ibyte *s1, const Ibyte *s2)
771
+ − 1424 {
+ − 1425 while (*s1 && *s2)
+ − 1426 {
867
+ − 1427 if (DOWNCASE (0, itext_ichar (s1)) !=
+ − 1428 DOWNCASE (0, itext_ichar (s2)))
771
+ − 1429 break;
867
+ − 1430 INC_IBYTEPTR (s1);
+ − 1431 INC_IBYTEPTR (s2);
771
+ − 1432 }
+ − 1433
867
+ − 1434 return (DOWNCASE (0, itext_ichar (s1)) -
+ − 1435 DOWNCASE (0, itext_ichar (s2)));
771
+ − 1436 }
+ − 1437
+ − 1438 /* The only difference between these next two and
+ − 1439 qxememcasecmp()/qxememcasecmp_i18n() is that these two will stop if
+ − 1440 both strings are equal and less than LEN in length, while
+ − 1441 the mem...() versions would would run off the end. */
+ − 1442
+ − 1443 int
867
+ − 1444 qxestrncasecmp (const Ibyte *s1, const Ibyte *s2, Bytecount len)
771
+ − 1445 {
867
+ − 1446 Ibyte *cm = strcasecmp_charmap;
771
+ − 1447
+ − 1448 while (len--)
+ − 1449 {
+ − 1450 int diff = cm[*s1] - cm[*s2];
+ − 1451 if (diff != 0)
+ − 1452 return diff;
+ − 1453 if (!*s1)
+ − 1454 return 0;
+ − 1455 s1++, s2++;
+ − 1456 }
+ − 1457
+ − 1458 return 0;
+ − 1459 }
+ − 1460
+ − 1461 int
2367
+ − 1462 ascii_strncasecmp (const Ascbyte *s1, const Ascbyte *s2, Bytecount len)
771
+ − 1463 {
867
+ − 1464 return qxestrncasecmp ((const Ibyte *) s1, (const Ibyte *) s2, len);
771
+ − 1465 }
+ − 1466
+ − 1467 int
2367
+ − 1468 qxestrncasecmp_ascii (const Ibyte *s1, const Ascbyte *s2, Bytecount len)
771
+ − 1469 {
867
+ − 1470 return qxestrncasecmp (s1, (const Ibyte *) s2, len);
771
+ − 1471 }
+ − 1472
801
+ − 1473 /* Compare LEN_FROM_S1 worth of characters from S1 with the same number of
+ − 1474 characters from S2, case insensitive. NOTE: Downcasing can convert
+ − 1475 characters from one length in bytes to another, so reversing S1 and S2
+ − 1476 is *NOT* a symmetric operations! You must choose a length that agrees
+ − 1477 with S1. */
+ − 1478
771
+ − 1479 int
867
+ − 1480 qxestrncasecmp_i18n (const Ibyte *s1, const Ibyte *s2,
801
+ − 1481 Bytecount len_from_s1)
771
+ − 1482 {
801
+ − 1483 while (len_from_s1 > 0)
771
+ − 1484 {
867
+ − 1485 const Ibyte *old_s1 = s1;
+ − 1486 int diff = (DOWNCASE (0, itext_ichar (s1)) -
+ − 1487 DOWNCASE (0, itext_ichar (s2)));
771
+ − 1488 if (diff != 0)
+ − 1489 return diff;
+ − 1490 if (!*s1)
+ − 1491 return 0;
867
+ − 1492 INC_IBYTEPTR (s1);
+ − 1493 INC_IBYTEPTR (s2);
801
+ − 1494 len_from_s1 -= s1 - old_s1;
771
+ − 1495 }
+ − 1496
+ − 1497 return 0;
+ − 1498 }
+ − 1499
+ − 1500 int
867
+ − 1501 qxememcmp (const Ibyte *s1, const Ibyte *s2, Bytecount len)
771
+ − 1502 {
+ − 1503 return memcmp (s1, s2, len);
+ − 1504 }
+ − 1505
+ − 1506 int
867
+ − 1507 qxememcmp4 (const Ibyte *s1, Bytecount len1,
+ − 1508 const Ibyte *s2, Bytecount len2)
801
+ − 1509 {
+ − 1510 int retval = qxememcmp (s1, s2, min (len1, len2));
+ − 1511 if (retval)
+ − 1512 return retval;
+ − 1513 return len1 - len2;
+ − 1514 }
+ − 1515
+ − 1516 int
867
+ − 1517 qxememcasecmp (const Ibyte *s1, const Ibyte *s2, Bytecount len)
771
+ − 1518 {
867
+ − 1519 Ibyte *cm = strcasecmp_charmap;
771
+ − 1520
+ − 1521 while (len--)
+ − 1522 {
+ − 1523 int diff = cm[*s1] - cm[*s2];
+ − 1524 if (diff != 0)
+ − 1525 return diff;
+ − 1526 s1++, s2++;
+ − 1527 }
+ − 1528
+ − 1529 return 0;
+ − 1530 }
+ − 1531
+ − 1532 int
867
+ − 1533 qxememcasecmp4 (const Ibyte *s1, Bytecount len1,
+ − 1534 const Ibyte *s2, Bytecount len2)
771
+ − 1535 {
801
+ − 1536 int retval = qxememcasecmp (s1, s2, min (len1, len2));
+ − 1537 if (retval)
+ − 1538 return retval;
+ − 1539 return len1 - len2;
+ − 1540 }
+ − 1541
+ − 1542 /* Do a character-by-character comparison, returning "which is greater" by
867
+ − 1543 comparing the Ichar values. (#### Should have option to compare Unicode
801
+ − 1544 points) */
+ − 1545
+ − 1546 int
867
+ − 1547 qxetextcmp (const Ibyte *s1, Bytecount len1,
+ − 1548 const Ibyte *s2, Bytecount len2)
801
+ − 1549 {
+ − 1550 while (len1 > 0 && len2 > 0)
771
+ − 1551 {
867
+ − 1552 const Ibyte *old_s1 = s1;
+ − 1553 const Ibyte *old_s2 = s2;
+ − 1554 int diff = itext_ichar (s1) - itext_ichar (s2);
801
+ − 1555 if (diff != 0)
+ − 1556 return diff;
867
+ − 1557 INC_IBYTEPTR (s1);
+ − 1558 INC_IBYTEPTR (s2);
801
+ − 1559 len1 -= s1 - old_s1;
+ − 1560 len2 -= s2 - old_s2;
+ − 1561 }
+ − 1562
+ − 1563 assert (len1 >= 0 && len2 >= 0);
+ − 1564 return len1 - len2;
+ − 1565 }
+ − 1566
+ − 1567 int
867
+ − 1568 qxetextcmp_matching (const Ibyte *s1, Bytecount len1,
+ − 1569 const Ibyte *s2, Bytecount len2,
801
+ − 1570 Charcount *matching)
+ − 1571 {
+ − 1572 *matching = 0;
+ − 1573 while (len1 > 0 && len2 > 0)
+ − 1574 {
867
+ − 1575 const Ibyte *old_s1 = s1;
+ − 1576 const Ibyte *old_s2 = s2;
+ − 1577 int diff = itext_ichar (s1) - itext_ichar (s2);
801
+ − 1578 if (diff != 0)
+ − 1579 return diff;
867
+ − 1580 INC_IBYTEPTR (s1);
+ − 1581 INC_IBYTEPTR (s2);
801
+ − 1582 len1 -= s1 - old_s1;
+ − 1583 len2 -= s2 - old_s2;
+ − 1584 (*matching)++;
+ − 1585 }
+ − 1586
+ − 1587 assert (len1 >= 0 && len2 >= 0);
+ − 1588 return len1 - len2;
+ − 1589 }
+ − 1590
+ − 1591 /* Do a character-by-character comparison, returning "which is greater" by
867
+ − 1592 comparing the Ichar values, case insensitively (by downcasing both
801
+ − 1593 first). (#### Should have option to compare Unicode points)
+ − 1594
+ − 1595 In this case, both lengths must be specified becaused downcasing can
+ − 1596 convert characters from one length in bytes to another; therefore, two
+ − 1597 blocks of text of different length might be equal. If both compare
+ − 1598 equal up to the limit in length of one but not the other, the longer one
+ − 1599 is "greater". */
+ − 1600
+ − 1601 int
867
+ − 1602 qxetextcasecmp (const Ibyte *s1, Bytecount len1,
+ − 1603 const Ibyte *s2, Bytecount len2)
801
+ − 1604 {
+ − 1605 while (len1 > 0 && len2 > 0)
+ − 1606 {
867
+ − 1607 const Ibyte *old_s1 = s1;
+ − 1608 const Ibyte *old_s2 = s2;
+ − 1609 int diff = (DOWNCASE (0, itext_ichar (s1)) -
+ − 1610 DOWNCASE (0, itext_ichar (s2)));
771
+ − 1611 if (diff != 0)
+ − 1612 return diff;
867
+ − 1613 INC_IBYTEPTR (s1);
+ − 1614 INC_IBYTEPTR (s2);
801
+ − 1615 len1 -= s1 - old_s1;
+ − 1616 len2 -= s2 - old_s2;
771
+ − 1617 }
+ − 1618
801
+ − 1619 assert (len1 >= 0 && len2 >= 0);
+ − 1620 return len1 - len2;
+ − 1621 }
+ − 1622
+ − 1623 /* Like qxetextcasecmp() but also return number of characters at
+ − 1624 beginning that match. */
+ − 1625
+ − 1626 int
867
+ − 1627 qxetextcasecmp_matching (const Ibyte *s1, Bytecount len1,
+ − 1628 const Ibyte *s2, Bytecount len2,
801
+ − 1629 Charcount *matching)
+ − 1630 {
+ − 1631 *matching = 0;
+ − 1632 while (len1 > 0 && len2 > 0)
+ − 1633 {
867
+ − 1634 const Ibyte *old_s1 = s1;
+ − 1635 const Ibyte *old_s2 = s2;
+ − 1636 int diff = (DOWNCASE (0, itext_ichar (s1)) -
+ − 1637 DOWNCASE (0, itext_ichar (s2)));
801
+ − 1638 if (diff != 0)
+ − 1639 return diff;
867
+ − 1640 INC_IBYTEPTR (s1);
+ − 1641 INC_IBYTEPTR (s2);
801
+ − 1642 len1 -= s1 - old_s1;
+ − 1643 len2 -= s2 - old_s2;
+ − 1644 (*matching)++;
+ − 1645 }
+ − 1646
+ − 1647 assert (len1 >= 0 && len2 >= 0);
+ − 1648 return len1 - len2;
771
+ − 1649 }
+ − 1650
+ − 1651 int
+ − 1652 lisp_strcasecmp (Lisp_Object s1, Lisp_Object s2)
+ − 1653 {
867
+ − 1654 Ibyte *cm = strcasecmp_charmap;
+ − 1655 Ibyte *p1 = XSTRING_DATA (s1);
+ − 1656 Ibyte *p2 = XSTRING_DATA (s2);
+ − 1657 Ibyte *e1 = p1 + XSTRING_LENGTH (s1);
+ − 1658 Ibyte *e2 = p2 + XSTRING_LENGTH (s2);
771
+ − 1659
+ − 1660 /* again, we use a symmetric algorithm and favor clarity over
+ − 1661 nanosecond improvements. */
+ − 1662 while (1)
+ − 1663 {
+ − 1664 /* if we reached the end of either string, compare lengths.
+ − 1665 do NOT compare the final null byte against anything, in case
+ − 1666 the other string also has a null byte at that position. */
+ − 1667 if (p1 == e1 || p2 == e2)
+ − 1668 return e1 - e2;
+ − 1669 if (cm[*p1] != cm[*p2])
+ − 1670 return cm[*p1] - cm[*p2];
+ − 1671 p1++, p2++;
+ − 1672 }
+ − 1673 }
+ − 1674
+ − 1675 int
+ − 1676 lisp_strcasecmp_i18n (Lisp_Object s1, Lisp_Object s2)
+ − 1677 {
801
+ − 1678 return qxetextcasecmp (XSTRING_DATA (s1), XSTRING_LENGTH (s1),
+ − 1679 XSTRING_DATA (s2), XSTRING_LENGTH (s2));
771
+ − 1680 }
+ − 1681
2367
+ − 1682 /* Compare a wide string with an ASCII string */
+ − 1683
+ − 1684 int
+ − 1685 wcscmp_ascii (const wchar_t *s1, const Ascbyte *s2)
+ − 1686 {
+ − 1687 while (*s1 && *s2)
+ − 1688 {
+ − 1689 if (*s1 != *s2)
+ − 1690 break;
+ − 1691 s1++, s2++;
+ − 1692 }
+ − 1693
+ − 1694 return *s1 - *s2;
+ − 1695 }
+ − 1696
+ − 1697 int
+ − 1698 wcsncmp_ascii (const wchar_t *s1, const Ascbyte *s2, Charcount len)
+ − 1699 {
+ − 1700 while (len--)
+ − 1701 {
+ − 1702 int diff = *s1 - *s2;
+ − 1703 if (diff != 0)
+ − 1704 return diff;
+ − 1705 if (!*s1)
+ − 1706 return 0;
+ − 1707 s1++, s2++;
+ − 1708 }
+ − 1709
+ − 1710 return 0;
+ − 1711 }
+ − 1712
771
+ − 1713
+ − 1714 /************************************************************************/
+ − 1715 /* conversion between textual representations */
+ − 1716 /************************************************************************/
+ − 1717
+ − 1718 /* NOTE: Does not reset the Dynarr. */
+ − 1719
+ − 1720 void
867
+ − 1721 convert_ibyte_string_into_ichar_dynarr (const Ibyte *str, Bytecount len,
2367
+ − 1722 Ichar_dynarr *dyn)
771
+ − 1723 {
867
+ − 1724 const Ibyte *strend = str + len;
771
+ − 1725
+ − 1726 while (str < strend)
+ − 1727 {
867
+ − 1728 Ichar ch = itext_ichar (str);
771
+ − 1729 Dynarr_add (dyn, ch);
867
+ − 1730 INC_IBYTEPTR (str);
771
+ − 1731 }
+ − 1732 }
+ − 1733
+ − 1734 Charcount
867
+ − 1735 convert_ibyte_string_into_ichar_string (const Ibyte *str, Bytecount len,
2367
+ − 1736 Ichar *arr)
771
+ − 1737 {
867
+ − 1738 const Ibyte *strend = str + len;
771
+ − 1739 Charcount newlen = 0;
+ − 1740 while (str < strend)
+ − 1741 {
867
+ − 1742 Ichar ch = itext_ichar (str);
771
+ − 1743 arr[newlen++] = ch;
867
+ − 1744 INC_IBYTEPTR (str);
771
+ − 1745 }
+ − 1746 return newlen;
+ − 1747 }
+ − 1748
867
+ − 1749 /* Convert an array of Ichars into the equivalent string representation.
+ − 1750 Store into the given Ibyte dynarr. Does not reset the dynarr.
771
+ − 1751 Does not add a terminating zero. */
+ − 1752
+ − 1753 void
867
+ − 1754 convert_ichar_string_into_ibyte_dynarr (Ichar *arr, int nels,
+ − 1755 Ibyte_dynarr *dyn)
771
+ − 1756 {
867
+ − 1757 Ibyte str[MAX_ICHAR_LEN];
771
+ − 1758 int i;
+ − 1759
+ − 1760 for (i = 0; i < nels; i++)
+ − 1761 {
867
+ − 1762 Bytecount len = set_itext_ichar (str, arr[i]);
771
+ − 1763 Dynarr_add_many (dyn, str, len);
+ − 1764 }
+ − 1765 }
+ − 1766
867
+ − 1767 /* Convert an array of Ichars into the equivalent string representation.
771
+ − 1768 Malloc the space needed for this and return it. If LEN_OUT is not a
867
+ − 1769 NULL pointer, store into LEN_OUT the number of Ibytes in the
+ − 1770 malloc()ed string. Note that the actual number of Ibytes allocated
771
+ − 1771 is one more than this: the returned string is zero-terminated. */
+ − 1772
867
+ − 1773 Ibyte *
+ − 1774 convert_ichar_string_into_malloced_string (Ichar *arr, int nels,
826
+ − 1775 Bytecount *len_out)
771
+ − 1776 {
+ − 1777 /* Damn zero-termination. */
2367
+ − 1778 Ibyte *str = alloca_ibytes (nels * MAX_ICHAR_LEN + 1);
867
+ − 1779 Ibyte *strorig = str;
771
+ − 1780 Bytecount len;
+ − 1781
+ − 1782 int i;
+ − 1783
+ − 1784 for (i = 0; i < nels; i++)
867
+ − 1785 str += set_itext_ichar (str, arr[i]);
771
+ − 1786 *str = '\0';
+ − 1787 len = str - strorig;
2367
+ − 1788 str = xnew_ibytes (1 + len);
771
+ − 1789 memcpy (str, strorig, 1 + len);
+ − 1790 if (len_out)
+ − 1791 *len_out = len;
+ − 1792 return str;
+ − 1793 }
+ − 1794
826
/* Body of copy_text_between_formats() for one particular pair of
   formats.  Expects SRC, SRCLEN, SRCOBJ, DST, DSTLEN, DSTOBJ and
   SRC_USED to be in scope, and returns from the enclosing function.

   With a non-NULL DST, characters are converted one at a time until
   either the source is exhausted or the next character would not fit
   entirely in the destination; the return value is the number of bytes
   stored.  With a NULL DST, nothing is stored and the return value is
   the number of bytes the whole converted text would occupy.  In both
   cases the number of source bytes processed is stored through SRC_USED
   if it is non-NULL. */
#define COPY_TEXT_BETWEEN_FORMATS(srcfmt, dstfmt)                       \
do                                                                      \
{                                                                       \
  const Ibyte *srcend = src + srclen;                                   \
  const Ibyte *srcp = src;                                              \
                                                                        \
  if (dst)                                                              \
    {                                                                   \
      Ibyte *dstend = dst + dstlen;                                     \
      Ibyte *dstp = dst;                                                \
                                                                        \
      while (srcp < srcend)                                             \
        {                                                               \
          Ichar ch = itext_ichar_fmt (srcp, srcfmt, srcobj);            \
          Bytecount len = ichar_len_fmt (ch, dstfmt);                   \
                                                                        \
          /* Never store a partial character. */                        \
          if (dstp + len > dstend)                                      \
            break;                                                      \
          set_itext_ichar_fmt (dstp, ch, dstfmt, dstobj);               \
          dstp += len;                                                  \
          INC_IBYTEPTR_FMT (srcp, srcfmt);                              \
        }                                                               \
      text_checking_assert (srcp <= srcend);                            \
      if (src_used)                                                     \
        *src_used = srcp - src;                                         \
      return dstp - dst;                                                \
    }                                                                   \
  else                                                                  \
    {                                                                   \
      Bytecount total = 0;                                              \
                                                                        \
      while (srcp < srcend)                                             \
        {                                                               \
          total += ichar_len_fmt (itext_ichar_fmt (srcp, srcfmt,        \
                                                   srcobj), dstfmt);    \
          INC_IBYTEPTR_FMT (srcp, srcfmt);                              \
        }                                                               \
      text_checking_assert (srcp == srcend);                            \
      if (src_used)                                                     \
        *src_used = srcp - src;                                         \
      return total;                                                     \
    }                                                                   \
}                                                                       \
while (0)
+ − 1843
+ − 1844 /* Copy as much text from SRC/SRCLEN to DST/DSTLEN as will fit, converting
+ − 1845 from SRCFMT/SRCOBJ to DSTFMT/DSTOBJ. Return number of bytes stored into
+ − 1846 DST as return value, and number of bytes copied from SRC through
+ − 1847 SRC_USED (if not NULL). If DST is NULL, don't actually store anything
+ − 1848 and just return the size needed to store all the text. Will not copy
+ − 1849 partial characters into DST. */
+ − 1850
+ − 1851 Bytecount
867
+ − 1852 copy_text_between_formats (const Ibyte *src, Bytecount srclen,
826
+ − 1853 Internal_Format srcfmt,
2333
+ − 1854 Lisp_Object USED_IF_MULE (srcobj),
867
+ − 1855 Ibyte *dst, Bytecount dstlen,
826
+ − 1856 Internal_Format dstfmt,
2333
+ − 1857 Lisp_Object USED_IF_MULE (dstobj),
826
+ − 1858 Bytecount *src_used)
+ − 1859 {
+ − 1860 if (srcfmt == dstfmt &&
+ − 1861 objects_have_same_internal_representation (srcobj, dstobj))
+ − 1862 {
+ − 1863 if (dst)
+ − 1864 {
+ − 1865 srclen = min (srclen, dstlen);
867
+ − 1866 srclen = validate_ibyte_string_backward (src, srclen);
826
+ − 1867 memcpy (dst, src, srclen);
+ − 1868 if (src_used)
+ − 1869 *src_used = srclen;
+ − 1870 return srclen;
+ − 1871 }
+ − 1872 else
+ − 1873 return srclen;
+ − 1874 }
+ − 1875 /* Everything before the final else statement is an optimization.
+ − 1876 The inner loops inside COPY_TEXT_BETWEEN_FORMATS() have a number
+ − 1877 of calls to *_fmt(), each of which has a switch statement in it.
+ − 1878 By using constants as the FMT argument, these switch statements
+ − 1879 will be optimized out of existence. */
+ − 1880 #define ELSE_FORMATS(fmt1, fmt2) \
+ − 1881 else if (srcfmt == fmt1 && dstfmt == fmt2) \
+ − 1882 COPY_TEXT_BETWEEN_FORMATS (fmt1, fmt2)
+ − 1883 ELSE_FORMATS (FORMAT_DEFAULT, FORMAT_8_BIT_FIXED);
+ − 1884 ELSE_FORMATS (FORMAT_8_BIT_FIXED, FORMAT_DEFAULT);
+ − 1885 ELSE_FORMATS (FORMAT_DEFAULT, FORMAT_32_BIT_FIXED);
+ − 1886 ELSE_FORMATS (FORMAT_32_BIT_FIXED, FORMAT_DEFAULT);
+ − 1887 else
+ − 1888 COPY_TEXT_BETWEEN_FORMATS (srcfmt, dstfmt);
+ − 1889 #undef ELSE_FORMATS
+ − 1890 }
+ − 1891
+ − 1892 /* Copy as much buffer text in BUF, starting at POS, of length LEN, as will
+ − 1893 fit into DST/DSTLEN, converting to DSTFMT. Return number of bytes
+ − 1894 stored into DST as return value, and number of bytes copied from BUF
+ − 1895 through SRC_USED (if not NULL). If DST is NULL, don't actually store
+ − 1896 anything and just return the size needed to store all the text. */
+ − 1897
+ − 1898 Bytecount
+ − 1899 copy_buffer_text_out (struct buffer *buf, Bytebpos pos,
867
+ − 1900 Bytecount len, Ibyte *dst, Bytecount dstlen,
826
+ − 1901 Internal_Format dstfmt, Lisp_Object dstobj,
+ − 1902 Bytecount *src_used)
+ − 1903 {
+ − 1904 Bytecount dst_used = 0;
+ − 1905 if (src_used)
+ − 1906 *src_used = 0;
+ − 1907
+ − 1908 {
+ − 1909 BUFFER_TEXT_LOOP (buf, pos, len, runptr, runlen)
+ − 1910 {
+ − 1911 Bytecount the_src_used, the_dst_used;
+ − 1912
+ − 1913 the_dst_used = copy_text_between_formats (runptr, runlen,
+ − 1914 BUF_FORMAT (buf),
+ − 1915 wrap_buffer (buf),
+ − 1916 dst, dstlen, dstfmt,
+ − 1917 dstobj, &the_src_used);
+ − 1918 dst_used += the_dst_used;
+ − 1919 if (src_used)
+ − 1920 *src_used += the_src_used;
+ − 1921 if (dst)
+ − 1922 {
+ − 1923 dst += the_dst_used;
+ − 1924 dstlen -= the_dst_used;
841
+ − 1925 /* Stop if we didn't use all of the source text. Also stop
+ − 1926 if the destination is full. We need the first test because
+ − 1927 there might be a couple bytes left in the destination, but
+ − 1928 not enough to fit a full character. The first test will in
+ − 1929 fact catch the vast majority of cases where the destination
+ − 1930 is empty, too -- but in case the destination holds *exactly*
+ − 1931 the run length, we put in the second check. (It shouldn't
+ − 1932 really matter though -- next time through we'll just get a
+ − 1933 0.) */
+ − 1934 if (the_src_used < runlen || !dstlen)
826
+ − 1935 break;
+ − 1936 }
+ − 1937 }
+ − 1938 }
+ − 1939
+ − 1940 return dst_used;
+ − 1941 }
+ − 1942
771
+ − 1943
+ − 1944 /************************************************************************/
+ − 1945 /* charset properties of strings */
+ − 1946 /************************************************************************/
+ − 1947
+ − 1948 void
2333
+ − 1949 find_charsets_in_ibyte_string (unsigned char *charsets,
+ − 1950 const Ibyte *USED_IF_MULE (str),
+ − 1951 Bytecount USED_IF_MULE (len))
771
+ − 1952 {
+ − 1953 #ifndef MULE
+ − 1954 /* Telescope this. */
+ − 1955 charsets[0] = 1;
+ − 1956 #else
867
+ − 1957 const Ibyte *strend = str + len;
771
+ − 1958 memset (charsets, 0, NUM_LEADING_BYTES);
+ − 1959
+ − 1960 /* #### SJT doesn't like this. */
+ − 1961 if (len == 0)
+ − 1962 {
+ − 1963 charsets[XCHARSET_LEADING_BYTE (Vcharset_ascii) - MIN_LEADING_BYTE] = 1;
+ − 1964 return;
+ − 1965 }
+ − 1966
+ − 1967 while (str < strend)
+ − 1968 {
867
+ − 1969 charsets[ichar_leading_byte (itext_ichar (str)) - MIN_LEADING_BYTE] =
771
+ − 1970 1;
867
+ − 1971 INC_IBYTEPTR (str);
771
+ − 1972 }
+ − 1973 #endif
+ − 1974 }
+ − 1975
+ − 1976 void
2333
+ − 1977 find_charsets_in_ichar_string (unsigned char *charsets,
+ − 1978 const Ichar *USED_IF_MULE (str),
+ − 1979 Charcount USED_IF_MULE (len))
771
+ − 1980 {
+ − 1981 #ifndef MULE
+ − 1982 /* Telescope this. */
+ − 1983 charsets[0] = 1;
+ − 1984 #else
+ − 1985 int i;
+ − 1986
+ − 1987 memset (charsets, 0, NUM_LEADING_BYTES);
+ − 1988
+ − 1989 /* #### SJT doesn't like this. */
+ − 1990 if (len == 0)
+ − 1991 {
+ − 1992 charsets[XCHARSET_LEADING_BYTE (Vcharset_ascii) - MIN_LEADING_BYTE] = 1;
+ − 1993 return;
+ − 1994 }
+ − 1995
+ − 1996 for (i = 0; i < len; i++)
+ − 1997 {
867
+ − 1998 charsets[ichar_leading_byte (str[i]) - MIN_LEADING_BYTE] = 1;
771
+ − 1999 }
+ − 2000 #endif
+ − 2001 }
+ − 2002
+ − 2003 int
867
+ − 2004 ibyte_string_displayed_columns (const Ibyte *str, Bytecount len)
771
+ − 2005 {
+ − 2006 int cols = 0;
867
+ − 2007 const Ibyte *end = str + len;
771
+ − 2008
+ − 2009 while (str < end)
+ − 2010 {
+ − 2011 #ifdef MULE
867
+ − 2012 Ichar ch = itext_ichar (str);
+ − 2013 cols += XCHARSET_COLUMNS (ichar_charset (ch));
771
+ − 2014 #else
+ − 2015 cols++;
+ − 2016 #endif
867
+ − 2017 INC_IBYTEPTR (str);
771
+ − 2018 }
+ − 2019
+ − 2020 return cols;
+ − 2021 }
+ − 2022
+ − 2023 int
2333
+ − 2024 ichar_string_displayed_columns (const Ichar *USED_IF_MULE (str), Charcount len)
771
+ − 2025 {
+ − 2026 #ifdef MULE
+ − 2027 int cols = 0;
+ − 2028 int i;
+ − 2029
+ − 2030 for (i = 0; i < len; i++)
867
+ − 2031 cols += XCHARSET_COLUMNS (ichar_charset (str[i]));
771
+ − 2032
+ − 2033 return cols;
+ − 2034 #else /* not MULE */
+ − 2035 return len;
+ − 2036 #endif
+ − 2037 }
+ − 2038
+ − 2039 Charcount
2333
+ − 2040 ibyte_string_nonascii_chars (const Ibyte *USED_IF_MULE (str),
+ − 2041 Bytecount USED_IF_MULE (len))
771
+ − 2042 {
+ − 2043 #ifdef MULE
867
+ − 2044 const Ibyte *end = str + len;
771
+ − 2045 Charcount retval = 0;
+ − 2046
+ − 2047 while (str < end)
+ − 2048 {
826
+ − 2049 if (!byte_ascii_p (*str))
771
+ − 2050 retval++;
867
+ − 2051 INC_IBYTEPTR (str);
771
+ − 2052 }
+ − 2053
+ − 2054 return retval;
+ − 2055 #else
+ − 2056 return 0;
+ − 2057 #endif
+ − 2058 }
+ − 2059
+ − 2060
+ − 2061 /***************************************************************************/
+ − 2062 /* Eistring helper functions */
+ − 2063 /***************************************************************************/
+ − 2064
+ − 2065 int
867
+ − 2066 eistr_casefiddle_1 (Ibyte *olddata, Bytecount len, Ibyte *newdata,
771
+ − 2067 int downp)
+ − 2068 {
867
+ − 2069 Ibyte *endp = olddata + len;
+ − 2070 Ibyte *newp = newdata;
771
+ − 2071 int changedp = 0;
+ − 2072
+ − 2073 while (olddata < endp)
+ − 2074 {
867
+ − 2075 Ichar c = itext_ichar (olddata);
+ − 2076 Ichar newc;
771
+ − 2077
+ − 2078 if (downp)
+ − 2079 newc = DOWNCASE (0, c);
+ − 2080 else
+ − 2081 newc = UPCASE (0, c);
+ − 2082
+ − 2083 if (c != newc)
+ − 2084 changedp = 1;
+ − 2085
867
+ − 2086 newp += set_itext_ichar (newp, newc);
+ − 2087 INC_IBYTEPTR (olddata);
771
+ − 2088 }
+ − 2089
+ − 2090 *newp = '\0';
+ − 2091
+ − 2092 return changedp ? newp - newdata : 0;
+ − 2093 }
+ − 2094
/* Return a buffer size that is at least NEEDED_SIZE.  If OLDBUFSIZE is
   already large enough it is returned unchanged.  Otherwise it is grown
   repeatedly by a factor of 3/2 (but to no less than 32) until it
   suffices.  Should growing further overflow an int, NEEDED_SIZE itself
   is returned, which is still large enough. */
int
eifind_large_enough_buffer (int oldbufsize, int needed_size)
{
  /* Largest int value, computed locally so no extra header is needed. */
  const int int_max = (int) (~0U >> 1);

  while (oldbufsize < needed_size)
    {
      /* oldbufsize * 3 below must not overflow. */
      if (oldbufsize > int_max / 3)
        return needed_size;

      /* A non-positive size grows to the minimum of 32 in any case;
         skipping the multiplication keeps very negative values from
         overflowing too. */
      oldbufsize = oldbufsize > 0 ? oldbufsize * 3 / 2 : 0;
      if (oldbufsize < 32)
        oldbufsize = 32;
    }

  return oldbufsize;
}
+ − 2106
+ − 2107 void
+ − 2108 eito_malloc_1 (Eistring *ei)
+ − 2109 {
+ − 2110 if (ei->mallocp_)
+ − 2111 return;
+ − 2112 ei->mallocp_ = 1;
+ − 2113 if (ei->data_)
+ − 2114 {
867
+ − 2115 Ibyte *newdata;
771
+ − 2116
+ − 2117 ei->max_size_allocated_ =
+ − 2118 eifind_large_enough_buffer (0, ei->bytelen_ + 1);
2367
+ − 2119 newdata = xnew_ibytes (ei->max_size_allocated_);
771
+ − 2120 memcpy (newdata, ei->data_, ei->bytelen_ + 1);
+ − 2121 ei->data_ = newdata;
+ − 2122 }
+ − 2123
+ − 2124 if (ei->extdata_)
+ − 2125 {
2367
+ − 2126 Extbyte *newdata = xnew_extbytes (ei->extlen_ + 2);
771
+ − 2127
+ − 2128 memcpy (newdata, ei->extdata_, ei->extlen_);
+ − 2129 /* Double null-terminate in case of Unicode data */
+ − 2130 newdata[ei->extlen_] = '\0';
+ − 2131 newdata[ei->extlen_ + 1] = '\0';
+ − 2132 ei->extdata_ = newdata;
+ − 2133 }
+ − 2134 }
+ − 2135
+ − 2136 int
+ − 2137 eicmp_1 (Eistring *ei, Bytecount off, Charcount charoff,
867
+ − 2138 Bytecount len, Charcount charlen, const Ibyte *data,
2421
+ − 2139 const Eistring *ei2, int is_ascii, int fold_case)
771
+ − 2140 {
+ − 2141 assert ((off < 0) != (charoff < 0));
+ − 2142 if (off < 0)
+ − 2143 {
+ − 2144 off = charcount_to_bytecount (ei->data_, charoff);
+ − 2145 if (charlen < 0)
+ − 2146 len = -1;
+ − 2147 else
+ − 2148 len = charcount_to_bytecount (ei->data_ + off, charlen);
+ − 2149 }
+ − 2150 if (len < 0)
+ − 2151 len = ei->bytelen_ - off;
+ − 2152
+ − 2153 assert (off >= 0 && off <= ei->bytelen_);
+ − 2154 assert (len >= 0 && off + len <= ei->bytelen_);
+ − 2155 assert ((data == 0) != (ei == 0));
2421
+ − 2156 assert ((is_ascii != 0) == (data != 0));
771
+ − 2157 assert (fold_case >= 0 && fold_case <= 2);
+ − 2158
+ − 2159 {
+ − 2160 Bytecount dstlen;
867
+ − 2161 const Ibyte *src = ei->data_, *dst;
771
+ − 2162
+ − 2163 if (data)
+ − 2164 {
+ − 2165 dst = data;
+ − 2166 dstlen = qxestrlen (data);
+ − 2167 }
+ − 2168 else
+ − 2169 {
+ − 2170 dst = ei2->data_;
+ − 2171 dstlen = ei2->bytelen_;
+ − 2172 }
+ − 2173
2421
+ − 2174 if (is_ascii)
2367
+ − 2175 ASSERT_ASCTEXT_ASCII_LEN ((Ascbyte *) dst, dstlen);
771
+ − 2176
801
+ − 2177 return (fold_case == 0 ? qxememcmp4 (src, len, dst, dstlen) :
+ − 2178 fold_case == 1 ? qxememcasecmp4 (src, len, dst, dstlen) :
+ − 2179 qxetextcasecmp (src, len, dst, dstlen));
771
+ − 2180 }
+ − 2181 }
+ − 2182
867
+ − 2183 Ibyte *
826
+ − 2184 eicpyout_malloc_fmt (Eistring *eistr, Bytecount *len_out, Internal_Format fmt,
2286
+ − 2185 Lisp_Object UNUSED (object))
771
+ − 2186 {
867
+ − 2187 Ibyte *ptr;
771
+ − 2188
+ − 2189 assert (fmt == FORMAT_DEFAULT);
867
+ − 2190 ptr = xnew_array (Ibyte, eistr->bytelen_ + 1);
771
+ − 2191 if (len_out)
+ − 2192 *len_out = eistr->bytelen_;
+ − 2193 memcpy (ptr, eistr->data_, eistr->bytelen_ + 1);
+ − 2194 return ptr;
+ − 2195 }
+ − 2196
+ − 2197
+ − 2198 /************************************************************************/
+ − 2199 /* Charcount/Bytecount conversion */
+ − 2200 /************************************************************************/
+ − 2201
+ − 2202 /* Optimization. Do it. Live it. Love it. */
+ − 2203
+ − 2204 #ifdef MULE
+ − 2205
826
/* STRIDE_TYPE is the widest integer type considered efficient on this
   machine; HIGH_BIT_MASK has the high bit set in every byte of a value
   of that type, so a word AND HIGH_BIT_MASK is zero exactly when all of
   the word's bytes have their high bit clear. */
#ifdef EFFICIENT_INT_128_BIT
# define STRIDE_TYPE INT_128_BIT
# define HIGH_BIT_MASK \
    MAKE_128_BIT_UNSIGNED_CONSTANT (0x80808080808080808080808080808080)
#elif defined (EFFICIENT_INT_64_BIT)
# define STRIDE_TYPE INT_64_BIT
# define HIGH_BIT_MASK MAKE_64_BIT_UNSIGNED_CONSTANT (0x8080808080808080)
#else
# define STRIDE_TYPE INT_32_BIT
# define HIGH_BIT_MASK MAKE_32_BIT_UNSIGNED_CONSTANT (0x80808080)
#endif

/* ALIGN_BITS: the low address bits that must be zero for a pointer to be
   suitably aligned for STRIDE_TYPE.  ALIGN_MASK: its complement.
   ALIGNED (ptr): nonzero if PTR is so aligned.  STRIDE: the size of
   STRIDE_TYPE in bytes.  The macro argument and the expansions are
   fully parenthesized so that they are safe in any expression. */
#define ALIGN_BITS ((EMACS_UINT) (ALIGNOF (STRIDE_TYPE) - 1))
#define ALIGN_MASK (~ ALIGN_BITS)
#define ALIGNED(ptr) ((((EMACS_UINT) (ptr)) & ALIGN_BITS) == 0)
#define STRIDE (sizeof (STRIDE_TYPE))
+ − 2222
2367
+ − 2223 /* Skip as many ASCII bytes as possible in the memory block [PTR, END).
+ − 2224 Return pointer to the first non-ASCII byte. optimized for long
+ − 2225 stretches of ASCII. */
+ − 2226 inline static const Ibyte *
+ − 2227 skip_ascii (const Ibyte *ptr, const Ibyte *end)
+ − 2228 {
826
+ − 2229 const unsigned STRIDE_TYPE *ascii_end;
+ − 2230
+ − 2231 /* Need to do in 3 sections -- before alignment start, aligned chunk,
+ − 2232 after alignment end. */
+ − 2233 while (!ALIGNED (ptr))
771
+ − 2234 {
826
+ − 2235 if (ptr == end || !byte_ascii_p (*ptr))
+ − 2236 return ptr;
+ − 2237 ptr++;
+ − 2238 }
+ − 2239 ascii_end = (const unsigned STRIDE_TYPE *) ptr;
+ − 2240 /* This loop screams, because we can detect ASCII
+ − 2241 characters 4 or 8 at a time. */
867
+ − 2242 while ((const Ibyte *) ascii_end + STRIDE <= end
826
+ − 2243 && !(*ascii_end & HIGH_BIT_MASK))
+ − 2244 ascii_end++;
867
+ − 2245 ptr = (Ibyte *) ascii_end;
826
+ − 2246 while (ptr < end && byte_ascii_p (*ptr))
+ − 2247 ptr++;
+ − 2248 return ptr;
+ − 2249 }
+ − 2250
2367
+ − 2251 /* Skip as many ASCII bytes as possible in the memory block [END, PTR),
+ − 2252 going downwards. Return pointer to the location above the first
+ − 2253 non-ASCII byte. Optimized for long stretches of ASCII. */
+ − 2254 inline static const Ibyte *
+ − 2255 skip_ascii_down (const Ibyte *ptr, const Ibyte *end)
+ − 2256 {
+ − 2257 const unsigned STRIDE_TYPE *ascii_end;
+ − 2258
+ − 2259 /* Need to do in 3 sections -- before alignment start, aligned chunk,
+ − 2260 after alignment end. */
+ − 2261 while (!ALIGNED (ptr))
+ − 2262 {
+ − 2263 if (ptr == end || !byte_ascii_p (*(ptr - 1)))
+ − 2264 return ptr;
+ − 2265 ptr--;
+ − 2266 }
+ − 2267 ascii_end = (const unsigned STRIDE_TYPE *) ptr - 1;
+ − 2268 /* This loop screams, because we can detect ASCII
+ − 2269 characters 4 or 8 at a time. */
+ − 2270 while ((const Ibyte *) ascii_end >= end
+ − 2271 && !(*ascii_end & HIGH_BIT_MASK))
+ − 2272 ascii_end--;
+ − 2273 ptr = (Ibyte *) (ascii_end + 1);
+ − 2274 while (ptr > end && byte_ascii_p (*(ptr - 1)))
+ − 2275 ptr--;
+ − 2276 return ptr;
+ − 2277 }
+ − 2278
826
+ − 2279 /* Function equivalents of bytecount_to_charcount/charcount_to_bytecount.
+ − 2280 These work on strings of all sizes but are more efficient than a simple
+ − 2281 loop on large strings and probably less efficient on sufficiently small
+ − 2282 strings. */
+ − 2283
+ − 2284 Charcount
867
+ − 2285 bytecount_to_charcount_fun (const Ibyte *ptr, Bytecount len)
826
+ − 2286 {
+ − 2287 Charcount count = 0;
867
+ − 2288 const Ibyte *end = ptr + len;
826
+ − 2289 while (1)
+ − 2290 {
867
+ − 2291 const Ibyte *newptr = skip_ascii (ptr, end);
826
+ − 2292 count += newptr - ptr;
+ − 2293 ptr = newptr;
+ − 2294 if (ptr == end)
+ − 2295 break;
+ − 2296 {
+ − 2297 /* Optimize for successive characters from the same charset */
867
+ − 2298 Ibyte leading_byte = *ptr;
826
+ − 2299 int bytes = rep_bytes_by_first_byte (leading_byte);
+ − 2300 while (ptr < end && *ptr == leading_byte)
+ − 2301 ptr += bytes, count++;
+ − 2302 }
771
+ − 2303 }
+ − 2304
+ − 2305 /* Bomb out if the specified substring ends in the middle
+ − 2306 of a character. Note that we might have already gotten
+ − 2307 a core dump above from an invalid reference, but at least
+ − 2308 we will get no farther than here.
+ − 2309
+ − 2310 This also catches len < 0. */
800
+ − 2311 text_checking_assert (ptr == end);
771
+ − 2312
+ − 2313 return count;
+ − 2314 }
+ − 2315
+ − 2316 Bytecount
867
+ − 2317 charcount_to_bytecount_fun (const Ibyte *ptr, Charcount len)
771
+ − 2318 {
867
+ − 2319 const Ibyte *newptr = ptr;
826
+ − 2320 while (1)
771
+ − 2321 {
867
+ − 2322 const Ibyte *newnewptr = skip_ascii (newptr, newptr + len);
826
+ − 2323 len -= newnewptr - newptr;
+ − 2324 newptr = newnewptr;
+ − 2325 if (!len)
+ − 2326 break;
+ − 2327 {
+ − 2328 /* Optimize for successive characters from the same charset */
867
+ − 2329 Ibyte leading_byte = *newptr;
826
+ − 2330 int bytes = rep_bytes_by_first_byte (leading_byte);
+ − 2331 while (len > 0 && *newptr == leading_byte)
+ − 2332 newptr += bytes, len--;
+ − 2333 }
771
+ − 2334 }
+ − 2335 return newptr - ptr;
+ − 2336 }
+ − 2337
2367
+ − 2338 /* Function equivalent of charcount_to_bytecount_down. This works on strings
+ − 2339 of all sizes but is more efficient than a simple loop on large strings
+ − 2340 and probably less efficient on sufficiently small strings. */
+ − 2341
+ − 2342 Bytecount
+ − 2343 charcount_to_bytecount_down_fun (const Ibyte *ptr, Charcount len)
+ − 2344 {
+ − 2345 const Ibyte *newptr = ptr;
+ − 2346 while (1)
+ − 2347 {
+ − 2348 const Ibyte *newnewptr = skip_ascii_down (newptr, newptr - len);
+ − 2349 len -= newptr - newnewptr;
+ − 2350 newptr = newnewptr;
+ − 2351 /* Skip over all non-ASCII chars, counting the length and
+ − 2352 stopping if it's zero */
+ − 2353 while (len && !byte_ascii_p (*(newptr - 1)))
+ − 2354 if (ibyte_first_byte_p (*--newptr))
+ − 2355 len--;
+ − 2356 if (!len)
+ − 2357 break;
+ − 2358 }
+ − 2359 text_checking_assert (ptr - newptr >= 0);
+ − 2360 return ptr - newptr;
+ − 2361 }
+ − 2362
771
+ − 2363 /* The next two functions are the actual meat behind the
+ − 2364 charbpos-to-bytebpos and bytebpos-to-charbpos conversions. Currently
+ − 2365 the method they use is fairly unsophisticated; see buffer.h.
+ − 2366
+ − 2367 Note that charbpos_to_bytebpos_func() is probably the most-called
+ − 2368 function in all of XEmacs. Therefore, it must be FAST FAST FAST.
+ − 2369 This is the reason why so much of the code is duplicated.
+ − 2370
+ − 2371 Similar considerations apply to bytebpos_to_charbpos_func(), although
+ − 2372 less so because the function is not called so often.
2367
+ − 2373 */
+ − 2374
+ − 2375 /*
+ − 2376
+ − 2377 Info on Byte-Char conversion:
+ − 2378
+ − 2379 (Info-goto-node "(internals)Byte-Char Position Conversion")
+ − 2380 */
+ − 2381
#ifdef OLD_BYTE_CHAR
/* Running counter used by the old conversion code to choose which cache
   slot to overwrite. */
static int not_very_random_number;
#endif /* OLD_BYTE_CHAR */

/* When defined, the conversion functions walk from the nearest known
   position one character at a time with INC_BYTEBPOS()/DEC_BYTEBPOS()
   instead of using the charcount_to_bytecount() family. */
#define OLD_LOOP

/* If we are this many characters away from any known position, cache the
   new position in the buffer's char-byte cache. */
#define FAR_AWAY_DISTANCE 5000
+ − 2391
+ − 2392 /* Converting between character positions and byte positions. */
+ − 2393
+ − 2394 /* There are several places in the buffer where we know
+ − 2395 the correspondence: BEG, BEGV, PT, GPT, ZV and Z,
+ − 2396 and everywhere there is a marker. So we find the one of these places
+ − 2397 that is closest to the specified position, and scan from there. */
+ − 2398
/* This macro is a subroutine of charbpos_to_bytebpos_func.  It offers the
   known correspondence (CHARPOS, BYTEPOS) as a candidate bound for X.

   It relies on the caller's locals x, retval, best_above, best_above_byte,
   best_below and best_below_byte, and on the label `done'.  An exact hit,
   or a bracket that turns out to be all single-byte characters, stores
   the answer in retval and jumps to done.

   Note that it is desirable that BYTEPOS is not evaluated
   except when we really want its value. */

#define CONSIDER(CHARPOS, BYTEPOS)                                      \
do                                                                      \
{                                                                       \
  Charbpos candidate = (CHARPOS);                                       \
  int narrowed;                                                         \
                                                                        \
  if (candidate == x)                                                   \
    {                                                                   \
      retval = (BYTEPOS);                                               \
      goto done;                                                        \
    }                                                                   \
                                                                        \
  narrowed = 0;                                                         \
  if (candidate > x)                                                    \
    {                                                                   \
      if (candidate < best_above)                                       \
        {                                                               \
          best_above = candidate;                                       \
          best_above_byte = (BYTEPOS);                                  \
          narrowed = 1;                                                 \
        }                                                               \
    }                                                                   \
  else if (candidate > best_below)                                      \
    {                                                                   \
      best_below = candidate;                                           \
      best_below_byte = (BYTEPOS);                                      \
      narrowed = 1;                                                     \
    }                                                                   \
                                                                        \
  if (narrowed                                                          \
      && best_above - best_below == best_above_byte - best_below_byte)  \
    {                                                                   \
      retval = best_below_byte + (x - best_below);                      \
      goto done;                                                        \
    }                                                                   \
}                                                                       \
while (0)
+ − 2440
771
+ − 2441
/* Return the byte position in BUF corresponding to character position X.

   Outline:
   1. Start with BEG and Z as the known positions below and above X.
   2. Tighten that bracket with PT, GPT, BEGV, ZV, the most recently
      computed position, and the entries of the position cache (newest
      first).  CONSIDER jumps straight to `done' on an exact hit, or when
      the bracket is found to contain only single-byte characters.
   3. Otherwise scan character by character from whichever end of the
      bracket is closer to X.
   4. Remember the result as the most recently computed position, and
      also append it to the position cache if the scan covered more than
      FAR_AWAY_DISTANCE characters.

   Both arms of the final if/else end in `goto done', so the code inside
   the trailing `#ifdef OLD_BYTE_CHAR' section cannot be reached from
   above even when that symbol is defined. */
Bytebpos
charbpos_to_bytebpos_func (struct buffer *buf, Charbpos x)
{
#ifdef OLD_BYTE_CHAR
  Charbpos bufmin;
  Charbpos bufmax;
  Bytebpos bytmin;
  Bytebpos bytmax;
  int size;
  int forward_p;
  int diff_so_far;
  int add_to_cache = 0;
#endif /* OLD_BYTE_CHAR */

  Charbpos best_above, best_below;
  Bytebpos best_above_byte, best_below_byte;
  int i;
  struct buffer_text *t;
  Bytebpos retval;

  PROFILE_DECLARE ();

  PROFILE_RECORD_ENTERING_SECTION (QSin_char_byte_conversion);

  best_above = BUF_Z (buf);
  best_above_byte = BYTE_BUF_Z (buf);

  /* In this case, we simply have all one-byte characters.  But this should
     have been intercepted before, in charbpos_to_bytebpos(). */
  text_checking_assert (best_above != best_above_byte);

  best_below = BUF_BEG (buf);
  best_below_byte = BYTE_BUF_BEG (buf);

  /* We find in best_above and best_above_byte
     the closest known point above CHARPOS,
     and in best_below and best_below_byte
     the closest known point below CHARPOS,

     If at any point we can tell that the space between those
     two best approximations is all single-byte,
     we interpolate the result immediately.  */

  CONSIDER (BUF_PT (buf), BYTE_BUF_PT (buf));
  CONSIDER (BUF_GPT (buf), BYTE_BUF_GPT (buf));
  CONSIDER (BUF_BEGV (buf), BYTE_BUF_BEGV (buf));
  CONSIDER (BUF_ZV (buf), BYTE_BUF_ZV (buf));

  t = buf->text;
  CONSIDER (t->cached_charpos, t->cached_bytepos);

  /* Check the most recently entered positions first */

  for (i = t->next_cache_pos - 1; i >= 0; i--)
    {
      CONSIDER (t->mule_charbpos_cache[i], t->mule_bytebpos_cache[i]);

      /* If we are down to a range of 50 chars,
         don't bother checking any other markers;
         scan the intervening chars directly now.  */
      if (best_above - best_below < 50)
        break;
    }

  /* We get here if we did not exactly hit one of the known places.
     We have one known above and one known below.
     Scan, counting characters, from whichever one is closer.  */

  if (x - best_below < best_above - x)
    {
      /* Decide before scanning, since the scan moves best_below up to x. */
      int record = x - best_below > FAR_AWAY_DISTANCE;

#ifdef OLD_LOOP /* old code */
      while (best_below != x)
        {
          best_below++;
          INC_BYTEBPOS (buf, best_below_byte);
        }
#else
      text_checking_assert (BUF_FORMAT (buf) == FORMAT_DEFAULT);
      /* The gap should not occur between best_below and x, or we will be
         screwed in using charcount_to_bytecount().  It should not be exactly
         at x either, because we already should have caught that. */
      text_checking_assert
        (BUF_CEILING_OF_IGNORE_ACCESSIBLE (buf, best_below) > x);

      /* Using charcount_to_bytecount() is potentially a lot faster than a
         simple loop using INC_BYTEBPOS() because (a) the checks for gap
         and buffer format are factored out instead of getting checked
         every time; (b) the checking goes 4 or 8 bytes at a time in ASCII
         text.
      */
      best_below_byte +=
        charcount_to_bytecount
        (BYTE_BUF_BYTE_ADDRESS (buf, best_below_byte), x - best_below);
      best_below = x;
#endif /* OLD_LOOP */

      /* If this position is quite far from the nearest known position,
         cache the correspondence.

         NB FSF does this: "... by creating a marker here.
         It will last until the next GC."
      */

      if (record)
        {
          /* If we have run out of positions to record, discard some of the
             old ones.  I used to use a circular buffer, which avoids the
             need to block-move any memory.  But it makes it more difficult
             to keep track of which positions haven't been used -- commonly
             we haven't yet filled out anywhere near the whole set of
             positions and don't want to check them all.  We should not be
             recording that often, and block-moving is extremely fast in
             any case. --ben */
          if (t->next_cache_pos == NUM_CACHED_POSITIONS)
            {
              memmove (t->mule_charbpos_cache,
                       t->mule_charbpos_cache + NUM_MOVED_POSITIONS,
                       sizeof (Charbpos) *
                       (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS));
              memmove (t->mule_bytebpos_cache,
                       t->mule_bytebpos_cache + NUM_MOVED_POSITIONS,
                       sizeof (Bytebpos) *
                       (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS));
              t->next_cache_pos -= NUM_MOVED_POSITIONS;
            }
          t->mule_charbpos_cache[t->next_cache_pos] = best_below;
          t->mule_bytebpos_cache[t->next_cache_pos] = best_below_byte;
          t->next_cache_pos++;
        }

      /* Always remember the most recent result. */
      t->cached_charpos = best_below;
      t->cached_bytepos = best_below_byte;

      retval = best_below_byte;
      text_checking_assert (best_below_byte >= best_below);
      goto done;
    }
  else
    {
      /* Decide before scanning, since the scan moves best_above down to
         x. */
      int record = best_above - x > FAR_AWAY_DISTANCE;

#ifdef OLD_LOOP
      while (best_above != x)
        {
          best_above--;
          DEC_BYTEBPOS (buf, best_above_byte);
        }
#else
      text_checking_assert (BUF_FORMAT (buf) == FORMAT_DEFAULT);
      /* The gap should not occur between best_above and x, or we will be
         screwed in using charcount_to_bytecount_down().  It should not be
         exactly at x either, because we already should have caught
         that. */
      text_checking_assert
        (BUF_FLOOR_OF_IGNORE_ACCESSIBLE (buf, best_above) < x);

      /* Using charcount_to_bytecount_down() is potentially a lot faster
         than a simple loop using DEC_BYTEBPOS(); see above. */
      best_above_byte -=
        charcount_to_bytecount_down
        /* BYTE_BUF_BYTE_ADDRESS will return a value on the high side of the
           gap if we are at the gap, which is the wrong side.  So do the
           following trick instead. */
        (BYTE_BUF_BYTE_ADDRESS_BEFORE (buf, best_above_byte) + 1,
         best_above - x);
      best_above = x;
#endif /* OLD_LOOP */


      /* If this position is quite far from the nearest known position,
         cache the correspondence.

         NB FSF does this: "... by creating a marker here.
         It will last until the next GC."
      */
      if (record)
        {
          /* Cache full: drop the NUM_MOVED_POSITIONS oldest entries, as in
             the forward case above. */
          if (t->next_cache_pos == NUM_CACHED_POSITIONS)
            {
              memmove (t->mule_charbpos_cache,
                       t->mule_charbpos_cache + NUM_MOVED_POSITIONS,
                       sizeof (Charbpos) *
                       (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS));
              memmove (t->mule_bytebpos_cache,
                       t->mule_bytebpos_cache + NUM_MOVED_POSITIONS,
                       sizeof (Bytebpos) *
                       (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS));
              t->next_cache_pos -= NUM_MOVED_POSITIONS;
            }
          t->mule_charbpos_cache[t->next_cache_pos] = best_above;
          t->mule_bytebpos_cache[t->next_cache_pos] = best_above_byte;
          t->next_cache_pos++;
        }

      /* Always remember the most recent result. */
      t->cached_charpos = best_above;
      t->cached_bytepos = best_above_byte;

      retval = best_above_byte;
      text_checking_assert (best_above_byte >= best_above);
      goto done;
    }

#ifdef OLD_BYTE_CHAR

  /* NOTE(review): both arms of the if/else above end in `goto done', so
     nothing below is reachable from above; kept as the previous
     implementation. */

  bufmin = buf->text->mule_bufmin;
  bufmax = buf->text->mule_bufmax;
  bytmin = buf->text->mule_bytmin;
  bytmax = buf->text->mule_bytmax;
  size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p;

  /* The basic idea here is that we shift the "known region" up or down
     until it overlaps the specified position.  We do this by moving
     the upper bound of the known region up one character at a time,
     and moving the lower bound of the known region up as necessary
     when the size of the character just seen changes.

     We optimize this, however, by first shifting the known region to
     one of the cached points if it's close by. (We don't check BEG or
     Z, even though they're cached; most of the time these will be the
     same as BEGV and ZV, and when they're not, they're not likely
     to be used.) */

  if (x > bufmax)
    {
      Charbpos diffmax = x - bufmax;
      Charbpos diffpt = x - BUF_PT (buf);
      Charbpos diffzv = BUF_ZV (buf) - x;
      /* #### This value could stand some more exploration. */
      Charcount heuristic_hack = (bufmax - bufmin) >> 2;

      /* Check if the position is closer to PT or ZV than to the
         end of the known region. */

      if (diffpt < 0)
        diffpt = -diffpt;
      if (diffzv < 0)
        diffzv = -diffzv;

      /* But also implement a heuristic that favors the known region
         over PT or ZV.  The reason for this is that switching to
         PT or ZV will wipe out the knowledge in the known region,
         which might be annoying if the known region is large and
         PT or ZV is not that much closer than the end of the known
         region. */

      diffzv += heuristic_hack;
      diffpt += heuristic_hack;
      if (diffpt < diffmax && diffpt <= diffzv)
        {
          bufmax = bufmin = BUF_PT (buf);
          bytmax = bytmin = BYTE_BUF_PT (buf);
          /* We set the size to 1 even though it doesn't really
             matter because the new known region contains no
             characters.  We do this because this is the most
             likely size of the characters around the new known
             region, and we avoid potential yuckiness that is
             done when size == 3. */
          size = 1;
        }
      if (diffzv < diffmax)
        {
          bufmax = bufmin = BUF_ZV (buf);
          bytmax = bytmin = BYTE_BUF_ZV (buf);
          size = 1;
        }
    }
#ifdef ERROR_CHECK_TEXT
  else if (x >= bufmin)
    ABORT ();
#endif
  else
    {
      Charbpos diffmin = bufmin - x;
      Charbpos diffpt = BUF_PT (buf) - x;
      Charbpos diffbegv = x - BUF_BEGV (buf);
      /* #### This value could stand some more exploration. */
      Charcount heuristic_hack = (bufmax - bufmin) >> 2;

      if (diffpt < 0)
        diffpt = -diffpt;
      if (diffbegv < 0)
        diffbegv = -diffbegv;

      /* But also implement a heuristic that favors the known region --
         see above. */

      diffbegv += heuristic_hack;
      diffpt += heuristic_hack;

      if (diffpt < diffmin && diffpt <= diffbegv)
        {
          bufmax = bufmin = BUF_PT (buf);
          bytmax = bytmin = BYTE_BUF_PT (buf);
          /* We set the size to 1 even though it doesn't really
             matter because the new known region contains no
             characters.  We do this because this is the most
             likely size of the characters around the new known
             region, and we avoid potential yuckiness that is
             done when size == 3. */
          size = 1;
        }
      if (diffbegv < diffmin)
        {
          bufmax = bufmin = BUF_BEGV (buf);
          bytmax = bytmin = BYTE_BUF_BEGV (buf);
          size = 1;
        }
    }

  diff_so_far = x > bufmax ? x - bufmax : bufmin - x;
  if (diff_so_far > 50)
    {
      /* If we have to move more than a certain amount, then look
         into our cache. */
      int minval = INT_MAX;
      int found = 0;
      int i;

      add_to_cache = 1;
      /* I considered keeping the positions ordered.  This would speed
         up this loop, but updating the cache would take longer, so
         it doesn't seem like it would really matter. */
      for (i = 0; i < NUM_CACHED_POSITIONS; i++)
        {
          int diff = buf->text->mule_charbpos_cache[i] - x;

          if (diff < 0)
            diff = -diff;
          if (diff < minval)
            {
              minval = diff;
              found = i;
            }
        }

      if (minval < diff_so_far)
        {
          bufmax = bufmin = buf->text->mule_charbpos_cache[found];
          bytmax = bytmin = buf->text->mule_bytebpos_cache[found];
          size = 1;
        }
    }

  /* It's conceivable that the caching above could lead to X being
     the same as one of the range edges. */
  if (x >= bufmax)
    {
      Bytebpos newmax;
      Bytecount newsize;

      forward_p = 1;
      while (x > bufmax)
        {
          newmax = bytmax;

          INC_BYTEBPOS (buf, newmax);
          newsize = newmax - bytmax;
          if (newsize != size)
            {
              bufmin = bufmax;
              bytmin = bytmax;
              size = newsize;
            }
          bytmax = newmax;
          bufmax++;
        }
      retval = bytmax;

      /* #### Should go past the found location to reduce the number
         of times that this function is called */
    }
  else /* x < bufmin */
    {
      Bytebpos newmin;
      Bytecount newsize;

      forward_p = 0;
      while (x < bufmin)
        {
          newmin = bytmin;

          DEC_BYTEBPOS (buf, newmin);
          newsize = bytmin - newmin;
          if (newsize != size)
            {
              bufmax = bufmin;
              bytmax = bytmin;
              size = newsize;
            }
          bytmin = newmin;
          bufmin--;
        }
      retval = bytmin;

      /* #### Should go past the found location to reduce the number
         of times that this function is called
         */
    }

  /* If size is three, then we have to make sure that the range we
     discovered isn't too large, because we use a fixed-length
     table to divide by 3. */

  if (size == 3)
    {
      int gap = bytmax - bytmin;
      buf->text->mule_three_p = 1;
      buf->text->mule_shifter = 1;

      if (gap > MAX_BYTEBPOS_GAP_SIZE_3)
        {
          if (forward_p)
            {
              bytmin = bytmax - MAX_BYTEBPOS_GAP_SIZE_3;
              bufmin = bufmax - MAX_CHARBPOS_GAP_SIZE_3;
            }
          else
            {
              bytmax = bytmin + MAX_BYTEBPOS_GAP_SIZE_3;
              bufmax = bufmin + MAX_CHARBPOS_GAP_SIZE_3;
            }
        }
    }
  else
    {
      buf->text->mule_three_p = 0;
      if (size == 4)
        buf->text->mule_shifter = 2;
      else
        buf->text->mule_shifter = size - 1;
    }

  buf->text->mule_bufmin = bufmin;
  buf->text->mule_bufmax = bufmax;
  buf->text->mule_bytmin = bytmin;
  buf->text->mule_bytmax = bytmax;

  if (add_to_cache)
    {
      int replace_loc;

      /* We throw away a "random" cached value and replace it with
         the new value.  It doesn't actually have to be very random
         at all, just evenly distributed.

         #### It would be better to use a least-recently-used algorithm
         or something that tries to space things out, but I'm not sure
         it's worth it to go to the trouble of maintaining that. */
      not_very_random_number += 621;
      /* NOTE(review): `& 15' selects one of 16 slots, while the search
         loop above runs over NUM_CACHED_POSITIONS entries -- confirm the
         two agree before enabling OLD_BYTE_CHAR. */
      replace_loc = not_very_random_number & 15;
      buf->text->mule_charbpos_cache[replace_loc] = x;
      buf->text->mule_bytebpos_cache[replace_loc] = retval;
    }

#endif /* OLD_BYTE_CHAR */

done:
  PROFILE_RECORD_EXITING_SECTION (QSin_char_byte_conversion);

  return retval;
}
+ − 2905
2367
+ − 2906 #undef CONSIDER
+ − 2907
/* bytebpos_to_charbpos_func returns the char position corresponding to a
   given byte position. */
+ − 2909
/* This macro is a subroutine of bytebpos_to_charbpos_func.
   It is used when BYTEPOS is actually the byte position.  It offers the
   known correspondence (BYTEPOS, CHARPOS) as a candidate bound for X.

   It relies on the caller's locals x, retval, best_above, best_above_byte,
   best_below and best_below_byte, and on the label `done'.  An exact hit,
   or a bracket that turns out to be all single-byte characters, stores
   the answer in retval and jumps to done.  CHARPOS is evaluated only when
   its value is actually needed. */

#define CONSIDER(BYTEPOS, CHARPOS)                                      \
do                                                                      \
{                                                                       \
  Bytebpos candidate = (BYTEPOS);                                       \
  int narrowed;                                                         \
                                                                        \
  if (candidate == x)                                                   \
    {                                                                   \
      retval = (CHARPOS);                                               \
      goto done;                                                        \
    }                                                                   \
                                                                        \
  narrowed = 0;                                                         \
  if (candidate > x)                                                    \
    {                                                                   \
      if (candidate < best_above_byte)                                  \
        {                                                               \
          best_above = (CHARPOS);                                       \
          best_above_byte = candidate;                                  \
          narrowed = 1;                                                 \
        }                                                               \
    }                                                                   \
  else if (candidate > best_below_byte)                                 \
    {                                                                   \
      best_below = (CHARPOS);                                           \
      best_below_byte = candidate;                                      \
      narrowed = 1;                                                     \
    }                                                                   \
                                                                        \
  if (narrowed                                                          \
      && best_above - best_below == best_above_byte - best_below_byte)  \
    {                                                                   \
      retval = best_below + (x - best_below_byte);                      \
      goto done;                                                        \
    }                                                                   \
}                                                                       \
while (0)
+ − 2950
771
+ − 2951 /* The logic in this function is almost identical to the logic in
+ − 2952 the previous function. */
+ − 2953
+ − 2954 Charbpos
+ − 2955 bytebpos_to_charbpos_func (struct buffer *buf, Bytebpos x)
+ − 2956 {
2367
+ − 2957 #ifdef OLD_BYTE_CHAR
771
+ − 2958 Charbpos bufmin;
+ − 2959 Charbpos bufmax;
+ − 2960 Bytebpos bytmin;
+ − 2961 Bytebpos bytmax;
+ − 2962 int size;
+ − 2963 int forward_p;
+ − 2964 int diff_so_far;
+ − 2965 int add_to_cache = 0;
2367
+ − 2966 #endif /* OLD_BYTE_CHAR */
+ − 2967
+ − 2968 Charbpos best_above, best_above_byte;
+ − 2969 Bytebpos best_below, best_below_byte;
+ − 2970 int i;
+ − 2971 struct buffer_text *t;
+ − 2972 Charbpos retval;
+ − 2973
1292
+ − 2974 PROFILE_DECLARE ();
771
+ − 2975
1292
+ − 2976 PROFILE_RECORD_ENTERING_SECTION (QSin_char_byte_conversion);
+ − 2977
2367
+ − 2978 best_above = BUF_Z (buf);
+ − 2979 best_above_byte = BYTE_BUF_Z (buf);
+ − 2980
+ − 2981 /* In this case, we simply have all one-byte characters. But this should
+ − 2982 have been intercepted before, in bytebpos_to_charbpos(). */
+ − 2983 text_checking_assert (best_above != best_above_byte);
+ − 2984
+ − 2985 best_below = BUF_BEG (buf);
+ − 2986 best_below_byte = BYTE_BUF_BEG (buf);
+ − 2987
+ − 2988 CONSIDER (BYTE_BUF_PT (buf), BUF_PT (buf));
+ − 2989 CONSIDER (BYTE_BUF_GPT (buf), BUF_GPT (buf));
+ − 2990 CONSIDER (BYTE_BUF_BEGV (buf), BUF_BEGV (buf));
+ − 2991 CONSIDER (BYTE_BUF_ZV (buf), BUF_ZV (buf));
+ − 2992
+ − 2993 t = buf->text;
+ − 2994 CONSIDER (t->cached_bytepos, t->cached_charpos);
+ − 2995
+ − 2996 /* Check the most recently entered positions first */
+ − 2997
+ − 2998 for (i = t->next_cache_pos - 1; i >= 0; i--)
+ − 2999 {
+ − 3000 CONSIDER (t->mule_bytebpos_cache[i], t->mule_charbpos_cache[i]);
+ − 3001
+ − 3002 /* If we are down to a range of 50 chars,
+ − 3003 don't bother checking any other markers;
+ − 3004 scan the intervening chars directly now. */
+ − 3005 if (best_above - best_below < 50)
+ − 3006 break;
+ − 3007 }
+ − 3008
+ − 3009 /* We get here if we did not exactly hit one of the known places.
+ − 3010 We have one known above and one known below.
+ − 3011 Scan, counting characters, from whichever one is closer. */
+ − 3012
+ − 3013 if (x - best_below_byte < best_above_byte - x)
+ − 3014 {
+ − 3015 int record = x - best_below_byte > 5000;
+ − 3016
+ − 3017 #ifdef OLD_LOOP /* old code */
+ − 3018 while (best_below_byte < x)
+ − 3019 {
+ − 3020 best_below++;
+ − 3021 INC_BYTEBPOS (buf, best_below_byte);
+ − 3022 }
+ − 3023 #else
+ − 3024 text_checking_assert (BUF_FORMAT (buf) == FORMAT_DEFAULT);
+ − 3025 /* The gap should not occur between best_below and x, or we will be
+ − 3026 screwed in using charcount_to_bytecount(). It should not be exactly
+ − 3027 at x either, because we already should have caught that. */
+ − 3028 text_checking_assert
+ − 3029 (BYTE_BUF_CEILING_OF_IGNORE_ACCESSIBLE (buf, best_below_byte) > x);
+ − 3030
+ − 3031 /* Using bytecount_to_charcount() is potentially a lot faster than
+ − 3032 a simple loop above using INC_BYTEBPOS(); see above.
+ − 3033 */
+ − 3034 best_below +=
+ − 3035 bytecount_to_charcount
+ − 3036 (BYTE_BUF_BYTE_ADDRESS (buf, best_below_byte), x - best_below_byte);
+ − 3037 best_below_byte = x;
+ − 3038 #endif
+ − 3039
+ − 3040 /* If this position is quite far from the nearest known position,
+ − 3041 cache the correspondence.
+ − 3042
+ − 3043 NB FSF does this: "... by creating a marker here.
+ − 3044 It will last until the next GC."
+ − 3045 */
+ − 3046
+ − 3047 if (record)
+ − 3048 {
+ − 3049 if (t->next_cache_pos == NUM_CACHED_POSITIONS)
+ − 3050 {
+ − 3051 memmove (t->mule_charbpos_cache,
+ − 3052 t->mule_charbpos_cache + NUM_MOVED_POSITIONS,
+ − 3053 sizeof (Charbpos) *
+ − 3054 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS));
+ − 3055 memmove (t->mule_bytebpos_cache,
+ − 3056 t->mule_bytebpos_cache + NUM_MOVED_POSITIONS,
+ − 3057 sizeof (Bytebpos) *
+ − 3058 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS));
+ − 3059 t->next_cache_pos -= NUM_MOVED_POSITIONS;
+ − 3060 }
+ − 3061 t->mule_charbpos_cache[t->next_cache_pos] = best_below;
+ − 3062 t->mule_bytebpos_cache[t->next_cache_pos] = best_below_byte;
+ − 3063 t->next_cache_pos++;
+ − 3064 }
+ − 3065
+ − 3066
+ − 3067 t->cached_charpos = best_below;
+ − 3068 t->cached_bytepos = best_below_byte;
+ − 3069
+ − 3070 retval = best_below;
+ − 3071 text_checking_assert (best_below_byte >= best_below);
+ − 3072 goto done;
+ − 3073 }
+ − 3074 else
+ − 3075 {
+ − 3076 int record = best_above_byte - x > 5000;
+ − 3077
+ − 3078 #ifdef OLD_LOOP /* old code */
+ − 3079 while (best_above_byte > x)
+ − 3080 {
+ − 3081 best_above--;
+ − 3082 DEC_BYTEBPOS (buf, best_above_byte);
+ − 3083 }
+ − 3084 #else
+ − 3085 text_checking_assert (BUF_FORMAT (buf) == FORMAT_DEFAULT);
+ − 3086 /* The gap should not occur between best_above and x, or we will be
+ − 3087 screwed in using bytecount_to_charcount_down(). It should not be
+ − 3088 exactly at x either, because we already should have caught
+ − 3089 that. */
+ − 3090 text_checking_assert
+ − 3091 (BYTE_BUF_FLOOR_OF_IGNORE_ACCESSIBLE (buf, best_above_byte) < x);
+ − 3092
+ − 3093 /* Using bytecount_to_charcount_down() is potentially a lot faster
+ − 3094 than a simple loop using INC_BYTEBPOS(); see above. */
+ − 3095 best_above -=
+ − 3096 bytecount_to_charcount_down
+ − 3097 /* BYTE_BUF_BYTE_ADDRESS will return a value on the high side of the
+ − 3098 gap if we are at the gap, which is the wrong side. So do the
+ − 3099 following trick instead. */
+ − 3100 (BYTE_BUF_BYTE_ADDRESS_BEFORE (buf, best_above_byte) + 1,
+ − 3101 best_above_byte - x);
+ − 3102 best_above_byte = x;
+ − 3103 #endif
+ − 3104
+ − 3105
+ − 3106 /* If this position is quite far from the nearest known position,
+ − 3107 cache the correspondence.
+ − 3108
+ − 3109 NB FSF does this: "... by creating a marker here.
+ − 3110 It will last until the next GC."
+ − 3111 */
+ − 3112 if (record)
+ − 3113 {
+ − 3114 if (t->next_cache_pos == NUM_CACHED_POSITIONS)
+ − 3115 {
+ − 3116 memmove (t->mule_charbpos_cache,
+ − 3117 t->mule_charbpos_cache + NUM_MOVED_POSITIONS,
+ − 3118 sizeof (Charbpos) *
+ − 3119 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS));
+ − 3120 memmove (t->mule_bytebpos_cache,
+ − 3121 t->mule_bytebpos_cache + NUM_MOVED_POSITIONS,
+ − 3122 sizeof (Bytebpos) *
+ − 3123 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS));
+ − 3124 t->next_cache_pos -= NUM_MOVED_POSITIONS;
+ − 3125 }
+ − 3126 t->mule_charbpos_cache[t->next_cache_pos] = best_above;
+ − 3127 t->mule_bytebpos_cache[t->next_cache_pos] = best_above_byte;
+ − 3128 t->next_cache_pos++;
+ − 3129 }
+ − 3130
+ − 3131 t->cached_charpos = best_above;
+ − 3132 t->cached_bytepos = best_above_byte;
+ − 3133
+ − 3134 retval = best_above;
+ − 3135 text_checking_assert (best_above_byte >= best_above);
+ − 3136 goto done;
+ − 3137 }
+ − 3138
+ − 3139 #ifdef OLD_BYTE_CHAR
+ − 3140
771
+ − 3141 bufmin = buf->text->mule_bufmin;
+ − 3142 bufmax = buf->text->mule_bufmax;
+ − 3143 bytmin = buf->text->mule_bytmin;
+ − 3144 bytmax = buf->text->mule_bytmax;
+ − 3145 size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p;
+ − 3146
+ − 3147 /* The basic idea here is that we shift the "known region" up or down
+ − 3148 until it overlaps the specified position. We do this by moving
+ − 3149 the upper bound of the known region up one character at a time,
+ − 3150 and moving the lower bound of the known region up as necessary
+ − 3151 when the size of the character just seen changes.
+ − 3152
+ − 3153 We optimize this, however, by first shifting the known region to
826
+ − 3154 one of the cached points if it's close by. (We don't check BYTE_BEG or
+ − 3155 BYTE_Z, even though they're cached; most of the time these will be the
+ − 3156 same as BYTE_BEGV and BYTE_ZV, and when they're not, they're not likely
771
+ − 3157 to be used.) */
+ − 3158
+ − 3159 if (x > bytmax)
+ − 3160 {
+ − 3161 Bytebpos diffmax = x - bytmax;
826
+ − 3162 Bytebpos diffpt = x - BYTE_BUF_PT (buf);
+ − 3163 Bytebpos diffzv = BYTE_BUF_ZV (buf) - x;
771
+ − 3164 /* #### This value could stand some more exploration. */
+ − 3165 Bytecount heuristic_hack = (bytmax - bytmin) >> 2;
+ − 3166
+ − 3167 /* Check if the position is closer to PT or ZV than to the
+ − 3168 end of the known region. */
+ − 3169
+ − 3170 if (diffpt < 0)
+ − 3171 diffpt = -diffpt;
+ − 3172 if (diffzv < 0)
+ − 3173 diffzv = -diffzv;
+ − 3174
+ − 3175 /* But also implement a heuristic that favors the known region
826
+ − 3176 over BYTE_PT or BYTE_ZV. The reason for this is that switching to
+ − 3177 BYTE_PT or BYTE_ZV will wipe out the knowledge in the known region,
771
+ − 3178 which might be annoying if the known region is large and
826
+ − 3179 BYTE_PT or BYTE_ZV is not that much closer than the end of the known
771
+ − 3180 region. */
+ − 3181
+ − 3182 diffzv += heuristic_hack;
+ − 3183 diffpt += heuristic_hack;
+ − 3184 if (diffpt < diffmax && diffpt <= diffzv)
+ − 3185 {
+ − 3186 bufmax = bufmin = BUF_PT (buf);
826
+ − 3187 bytmax = bytmin = BYTE_BUF_PT (buf);
771
+ − 3188 /* We set the size to 1 even though it doesn't really
+ − 3189 matter because the new known region contains no
+ − 3190 characters. We do this because this is the most
+ − 3191 likely size of the characters around the new known
+ − 3192 region, and we avoid potential yuckiness that is
+ − 3193 done when size == 3. */
+ − 3194 size = 1;
+ − 3195 }
+ − 3196 if (diffzv < diffmax)
+ − 3197 {
+ − 3198 bufmax = bufmin = BUF_ZV (buf);
826
+ − 3199 bytmax = bytmin = BYTE_BUF_ZV (buf);
771
+ − 3200 size = 1;
+ − 3201 }
+ − 3202 }
800
+ − 3203 #ifdef ERROR_CHECK_TEXT
771
+ − 3204 else if (x >= bytmin)
2500
+ − 3205 ABORT ();
771
+ − 3206 #endif
+ − 3207 else
+ − 3208 {
+ − 3209 Bytebpos diffmin = bytmin - x;
826
+ − 3210 Bytebpos diffpt = BYTE_BUF_PT (buf) - x;
+ − 3211 Bytebpos diffbegv = x - BYTE_BUF_BEGV (buf);
771
+ − 3212 /* #### This value could stand some more exploration. */
+ − 3213 Bytecount heuristic_hack = (bytmax - bytmin) >> 2;
+ − 3214
+ − 3215 if (diffpt < 0)
+ − 3216 diffpt = -diffpt;
+ − 3217 if (diffbegv < 0)
+ − 3218 diffbegv = -diffbegv;
+ − 3219
+ − 3220 /* But also implement a heuristic that favors the known region --
+ − 3221 see above. */
+ − 3222
+ − 3223 diffbegv += heuristic_hack;
+ − 3224 diffpt += heuristic_hack;
+ − 3225
+ − 3226 if (diffpt < diffmin && diffpt <= diffbegv)
+ − 3227 {
+ − 3228 bufmax = bufmin = BUF_PT (buf);
826
+ − 3229 bytmax = bytmin = BYTE_BUF_PT (buf);
771
+ − 3230 /* We set the size to 1 even though it doesn't really
+ − 3231 matter because the new known region contains no
+ − 3232 characters. We do this because this is the most
+ − 3233 likely size of the characters around the new known
+ − 3234 region, and we avoid potential yuckiness that is
+ − 3235 done when size == 3. */
+ − 3236 size = 1;
+ − 3237 }
+ − 3238 if (diffbegv < diffmin)
+ − 3239 {
+ − 3240 bufmax = bufmin = BUF_BEGV (buf);
826
+ − 3241 bytmax = bytmin = BYTE_BUF_BEGV (buf);
771
+ − 3242 size = 1;
+ − 3243 }
+ − 3244 }
+ − 3245
+ − 3246 diff_so_far = x > bytmax ? x - bytmax : bytmin - x;
+ − 3247 if (diff_so_far > 50)
+ − 3248 {
+ − 3249 /* If we have to move more than a certain amount, then look
+ − 3250 into our cache. */
+ − 3251 int minval = INT_MAX;
+ − 3252 int found = 0;
+ − 3253 int i;
+ − 3254
+ − 3255 add_to_cache = 1;
+ − 3256 /* I considered keeping the positions ordered. This would speed
+ − 3257 up this loop, but updating the cache would take longer, so
+ − 3258 it doesn't seem like it would really matter. */
2367
+ − 3259 for (i = 0; i < NUM_CACHED_POSITIONS; i++)
771
+ − 3260 {
+ − 3261 int diff = buf->text->mule_bytebpos_cache[i] - x;
+ − 3262
+ − 3263 if (diff < 0)
+ − 3264 diff = -diff;
+ − 3265 if (diff < minval)
+ − 3266 {
+ − 3267 minval = diff;
+ − 3268 found = i;
+ − 3269 }
+ − 3270 }
+ − 3271
+ − 3272 if (minval < diff_so_far)
+ − 3273 {
+ − 3274 bufmax = bufmin = buf->text->mule_charbpos_cache[found];
+ − 3275 bytmax = bytmin = buf->text->mule_bytebpos_cache[found];
+ − 3276 size = 1;
+ − 3277 }
+ − 3278 }
+ − 3279
+ − 3280 /* It's conceivable that the caching above could lead to X being
+ − 3281 the same as one of the range edges. */
+ − 3282 if (x >= bytmax)
+ − 3283 {
+ − 3284 Bytebpos newmax;
+ − 3285 Bytecount newsize;
+ − 3286
+ − 3287 forward_p = 1;
+ − 3288 while (x > bytmax)
+ − 3289 {
+ − 3290 newmax = bytmax;
+ − 3291
+ − 3292 INC_BYTEBPOS (buf, newmax);
+ − 3293 newsize = newmax - bytmax;
+ − 3294 if (newsize != size)
+ − 3295 {
+ − 3296 bufmin = bufmax;
+ − 3297 bytmin = bytmax;
+ − 3298 size = newsize;
+ − 3299 }
+ − 3300 bytmax = newmax;
+ − 3301 bufmax++;
+ − 3302 }
+ − 3303 retval = bufmax;
+ − 3304
+ − 3305 /* #### Should go past the found location to reduce the number
+ − 3306 of times that this function is called */
+ − 3307 }
+ − 3308 else /* x <= bytmin */
+ − 3309 {
+ − 3310 Bytebpos newmin;
+ − 3311 Bytecount newsize;
+ − 3312
+ − 3313 forward_p = 0;
+ − 3314 while (x < bytmin)
+ − 3315 {
+ − 3316 newmin = bytmin;
+ − 3317
+ − 3318 DEC_BYTEBPOS (buf, newmin);
+ − 3319 newsize = bytmin - newmin;
+ − 3320 if (newsize != size)
+ − 3321 {
+ − 3322 bufmax = bufmin;
+ − 3323 bytmax = bytmin;
+ − 3324 size = newsize;
+ − 3325 }
+ − 3326 bytmin = newmin;
+ − 3327 bufmin--;
+ − 3328 }
+ − 3329 retval = bufmin;
+ − 3330
+ − 3331 /* #### Should go past the found location to reduce the number
+ − 3332 of times that this function is called
+ − 3333 */
+ − 3334 }
+ − 3335
+ − 3336 /* If size is three, then we have to make sure that the range we
+ − 3337 discovered isn't too large, because we use a fixed-length
+ − 3338 table to divide by 3. */
+ − 3339
+ − 3340 if (size == 3)
+ − 3341 {
+ − 3342 int gap = bytmax - bytmin;
+ − 3343 buf->text->mule_three_p = 1;
+ − 3344 buf->text->mule_shifter = 1;
+ − 3345
+ − 3346 if (gap > MAX_BYTEBPOS_GAP_SIZE_3)
+ − 3347 {
+ − 3348 if (forward_p)
+ − 3349 {
+ − 3350 bytmin = bytmax - MAX_BYTEBPOS_GAP_SIZE_3;
+ − 3351 bufmin = bufmax - MAX_CHARBPOS_GAP_SIZE_3;
+ − 3352 }
+ − 3353 else
+ − 3354 {
+ − 3355 bytmax = bytmin + MAX_BYTEBPOS_GAP_SIZE_3;
+ − 3356 bufmax = bufmin + MAX_CHARBPOS_GAP_SIZE_3;
+ − 3357 }
+ − 3358 }
+ − 3359 }
+ − 3360 else
+ − 3361 {
+ − 3362 buf->text->mule_three_p = 0;
+ − 3363 if (size == 4)
+ − 3364 buf->text->mule_shifter = 2;
+ − 3365 else
+ − 3366 buf->text->mule_shifter = size - 1;
+ − 3367 }
+ − 3368
+ − 3369 buf->text->mule_bufmin = bufmin;
+ − 3370 buf->text->mule_bufmax = bufmax;
+ − 3371 buf->text->mule_bytmin = bytmin;
+ − 3372 buf->text->mule_bytmax = bytmax;
+ − 3373
+ − 3374 if (add_to_cache)
+ − 3375 {
+ − 3376 int replace_loc;
+ − 3377
+ − 3378 /* We throw away a "random" cached value and replace it with
+ − 3379 the new value. It doesn't actually have to be very random
+ − 3380 at all, just evenly distributed.
+ − 3381
+ − 3382 #### It would be better to use a least-recently-used algorithm
+ − 3383 or something that tries to space things out, but I'm not sure
+ − 3384 it's worth it to go to the trouble of maintaining that. */
+ − 3385 not_very_random_number += 621;
+ − 3386 replace_loc = not_very_random_number & 15;
+ − 3387 buf->text->mule_charbpos_cache[replace_loc] = retval;
+ − 3388 buf->text->mule_bytebpos_cache[replace_loc] = x;
+ − 3389 }
2367
+ − 3390 #endif /* OLD_BYTE_CHAR */
+ − 3391
+ − 3392 done:
1292
+ − 3393 PROFILE_RECORD_EXITING_SECTION (QSin_char_byte_conversion);
+ − 3394
771
+ − 3395 return retval;
+ − 3396 }
+ − 3397
+ − 3398 /* Text of length BYTELENGTH and CHARLENGTH (in different units)
+ − 3399 was inserted at charbpos START. */
+ − 3400
+ − 3401 void
+ − 3402 buffer_mule_signal_inserted_region (struct buffer *buf, Charbpos start,
+ − 3403 Bytecount bytelength,
+ − 3404 Charcount charlength)
+ − 3405 {
2367
+ − 3406 #ifdef OLD_BYTE_CHAR
771
+ − 3407 int size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p;
2367
+ − 3408 #endif /* OLD_BYTE_CHAR */
771
+ − 3409 int i;
+ − 3410
+ − 3411 /* Adjust the cache of known positions. */
2367
+ − 3412 for (i = 0; i < buf->text->next_cache_pos; i++)
771
+ − 3413 {
+ − 3414
+ − 3415 if (buf->text->mule_charbpos_cache[i] > start)
+ − 3416 {
+ − 3417 buf->text->mule_charbpos_cache[i] += charlength;
+ − 3418 buf->text->mule_bytebpos_cache[i] += bytelength;
+ − 3419 }
+ − 3420 }
+ − 3421
2367
+ − 3422 /* Adjust the special cached position. */
+ − 3423
+ − 3424 if (buf->text->cached_charpos > start)
+ − 3425 {
+ − 3426 buf->text->cached_charpos += charlength;
+ − 3427 buf->text->cached_bytepos += bytelength;
+ − 3428 }
+ − 3429
+ − 3430 #ifdef OLD_BYTE_CHAR
771
+ − 3431 if (start >= buf->text->mule_bufmax)
826
+ − 3432 return;
771
+ − 3433
+ − 3434 /* The insertion is either before the known region, in which case
+ − 3435 it shoves it forward; or within the known region, in which case
+ − 3436 it shoves the end forward. (But it may make the known region
+ − 3437 inconsistent, so we may have to shorten it.) */
+ − 3438
+ − 3439 if (start <= buf->text->mule_bufmin)
+ − 3440 {
+ − 3441 buf->text->mule_bufmin += charlength;
+ − 3442 buf->text->mule_bufmax += charlength;
+ − 3443 buf->text->mule_bytmin += bytelength;
+ − 3444 buf->text->mule_bytmax += bytelength;
+ − 3445 }
+ − 3446 else
+ − 3447 {
+ − 3448 Charbpos end = start + charlength;
+ − 3449 /* the insertion point divides the known region in two.
+ − 3450 Keep the longer half, at least, and expand into the
+ − 3451 inserted chunk as much as possible. */
+ − 3452
+ − 3453 if (start - buf->text->mule_bufmin > buf->text->mule_bufmax - start)
+ − 3454 {
+ − 3455 Bytebpos bytestart = (buf->text->mule_bytmin
+ − 3456 + size * (start - buf->text->mule_bufmin));
+ − 3457 Bytebpos bytenew;
+ − 3458
+ − 3459 while (start < end)
+ − 3460 {
+ − 3461 bytenew = bytestart;
+ − 3462 INC_BYTEBPOS (buf, bytenew);
+ − 3463 if (bytenew - bytestart != size)
+ − 3464 break;
+ − 3465 start++;
+ − 3466 bytestart = bytenew;
+ − 3467 }
+ − 3468 if (start != end)
+ − 3469 {
+ − 3470 buf->text->mule_bufmax = start;
+ − 3471 buf->text->mule_bytmax = bytestart;
+ − 3472 }
+ − 3473 else
+ − 3474 {
+ − 3475 buf->text->mule_bufmax += charlength;
+ − 3476 buf->text->mule_bytmax += bytelength;
+ − 3477 }
+ − 3478 }
+ − 3479 else
+ − 3480 {
+ − 3481 Bytebpos byteend = (buf->text->mule_bytmin
+ − 3482 + size * (start - buf->text->mule_bufmin)
+ − 3483 + bytelength);
+ − 3484 Bytebpos bytenew;
+ − 3485
+ − 3486 buf->text->mule_bufmax += charlength;
+ − 3487 buf->text->mule_bytmax += bytelength;
+ − 3488
+ − 3489 while (end > start)
+ − 3490 {
+ − 3491 bytenew = byteend;
+ − 3492 DEC_BYTEBPOS (buf, bytenew);
+ − 3493 if (byteend - bytenew != size)
+ − 3494 break;
+ − 3495 end--;
+ − 3496 byteend = bytenew;
+ − 3497 }
+ − 3498 if (start != end)
+ − 3499 {
+ − 3500 buf->text->mule_bufmin = end;
+ − 3501 buf->text->mule_bytmin = byteend;
+ − 3502 }
+ − 3503 }
+ − 3504 }
2367
+ − 3505 #endif /* OLD_BYTE_CHAR */
771
+ − 3506 }
+ − 3507
826
+ − 3508 /* Text from START to END (equivalent in Bytebpos's: from BYTE_START to
+ − 3509 BYTE_END) was deleted. */
771
+ − 3510
+ − 3511 void
+ − 3512 buffer_mule_signal_deleted_region (struct buffer *buf, Charbpos start,
826
+ − 3513 Charbpos end, Bytebpos byte_start,
+ − 3514 Bytebpos byte_end)
771
+ − 3515 {
+ − 3516 int i;
+ − 3517
+ − 3518 /* Adjust the cache of known positions. */
2367
+ − 3519 for (i = 0; i < buf->text->next_cache_pos; i++)
771
+ − 3520 {
+ − 3521 /* After the end; gets shoved backward */
+ − 3522 if (buf->text->mule_charbpos_cache[i] > end)
+ − 3523 {
+ − 3524 buf->text->mule_charbpos_cache[i] -= end - start;
826
+ − 3525 buf->text->mule_bytebpos_cache[i] -= byte_end - byte_start;
771
+ − 3526 }
+ − 3527 /* In the range; moves to start of range */
+ − 3528 else if (buf->text->mule_charbpos_cache[i] > start)
+ − 3529 {
+ − 3530 buf->text->mule_charbpos_cache[i] = start;
826
+ − 3531 buf->text->mule_bytebpos_cache[i] = byte_start;
771
+ − 3532 }
+ − 3533 }
+ − 3534
2367
+ − 3535 /* Adjust the special cached position. */
+ − 3536
+ − 3537 /* After the end; gets shoved backward */
+ − 3538 if (buf->text->cached_charpos > end)
+ − 3539 {
+ − 3540 buf->text->cached_charpos -= end - start;
+ − 3541 buf->text->cached_bytepos -= byte_end - byte_start;
+ − 3542 }
+ − 3543 /* In the range; moves to start of range */
+ − 3544 else if (buf->text->cached_charpos > start)
+ − 3545 {
+ − 3546 buf->text->cached_charpos = start;
+ − 3547 buf->text->cached_bytepos = byte_start;
+ − 3548 }
+ − 3549
+ − 3550 #ifdef OLD_BYTE_CHAR
771
+ − 3551 /* We don't care about any text after the end of the known region. */
+ − 3552
+ − 3553 end = min (end, buf->text->mule_bufmax);
826
+ − 3554 byte_end = min (byte_end, buf->text->mule_bytmax);
771
+ − 3555 if (start >= end)
826
+ − 3556 return;
771
+ − 3557
+ − 3558 /* The end of the known region offsets by the total amount of deletion,
+ − 3559 since it's all before it. */
+ − 3560
+ − 3561 buf->text->mule_bufmax -= end - start;
826
+ − 3562 buf->text->mule_bytmax -= byte_end - byte_start;
771
+ − 3563
+ − 3564 /* Now we don't care about any text after the start of the known region. */
+ − 3565
+ − 3566 end = min (end, buf->text->mule_bufmin);
826
+ − 3567 byte_end = min (byte_end, buf->text->mule_bytmin);
771
+ − 3568 if (start < end)
+ − 3569 {
+ − 3570 buf->text->mule_bufmin -= end - start;
826
+ − 3571 buf->text->mule_bytmin -= byte_end - byte_start;
771
+ − 3572 }
2367
+ − 3573 #endif /* OLD_BYTE_CHAR */
771
+ − 3574 }
+ − 3575
+ − 3576 #endif /* MULE */
+ − 3577
+ − 3578
+ − 3579 /************************************************************************/
+ − 3580 /* verifying buffer and string positions */
+ − 3581 /************************************************************************/
+ − 3582
+ − 3583 /* Functions below are tagged with either _byte or _char indicating
+ − 3584 whether they return byte or character positions. For a buffer,
+ − 3585 a character position is a "Charbpos" and a byte position is a "Bytebpos".
+ − 3586 For strings, these are sometimes typed using "Charcount" and
+ − 3587 "Bytecount". */
+ − 3588
+ − 3589 /* Flags for the functions below are:
+ − 3590
+ − 3591 GB_ALLOW_PAST_ACCESSIBLE
+ − 3592
+ − 3593 Allow positions to range over the entire buffer (BUF_BEG to BUF_Z),
+ − 3594 rather than just the accessible portion (BUF_BEGV to BUF_ZV).
+ − 3595 For strings, this flag has no effect.
+ − 3596
+ − 3597 GB_COERCE_RANGE
+ − 3598
+ − 3599 If the position is outside the allowable range, return the lower
+ − 3600 or upper bound of the range, whichever is closer to the specified
+ − 3601 position.
+ − 3602
+ − 3603 GB_NO_ERROR_IF_BAD
+ − 3604
+ − 3605 If the position is outside the allowable range, return -1.
+ − 3606
+ − 3607 GB_NEGATIVE_FROM_END
+ − 3608
+ − 3609 If a value is negative, treat it as an offset from the end.
+ − 3610 Only applies to strings.
+ − 3611
+ − 3612 The following additional flags apply only to the functions
+ − 3613 that return ranges:
+ − 3614
+ − 3615 GB_ALLOW_NIL
+ − 3616
+ − 3617 Either or both positions can be nil. If FROM is nil,
+ − 3618 FROM_OUT will contain the lower bound of the allowed range.
+ − 3619 If TO is nil, TO_OUT will contain the upper bound of the
+ − 3620 allowed range.
+ − 3621
+ − 3622 GB_CHECK_ORDER
+ − 3623
+ − 3624 FROM must contain the lower bound and TO the upper bound
+ − 3625 of the range. If the positions are reversed, an error is
+ − 3626 signalled.
+ − 3627
+ − 3628 The following is a combination flag:
+ − 3629
+ − 3630 GB_HISTORICAL_STRING_BEHAVIOR
+ − 3631
+ − 3632 Equivalent to (GB_NEGATIVE_FROM_END | GB_ALLOW_NIL).
+ − 3633 */
+ − 3634
+ − 3635 /* Return a buffer position stored in a Lisp_Object. Full
+ − 3636 error-checking is done on the position. Flags can be specified to
+ − 3637 control the behavior of out-of-range values. The default behavior
+ − 3638 is to require that the position is within the accessible part of
+ − 3639 the buffer (BEGV and ZV), and to signal an error if the position is
+ − 3640 out of range.
+ − 3641
+ − 3642 */
+ − 3643
+ − 3644 Charbpos
+ − 3645 get_buffer_pos_char (struct buffer *b, Lisp_Object pos, unsigned int flags)
+ − 3646 {
+ − 3647 /* Does not GC */
+ − 3648 Charbpos ind;
+ − 3649 Charbpos min_allowed, max_allowed;
+ − 3650
+ − 3651 CHECK_INT_COERCE_MARKER (pos);
+ − 3652 ind = XINT (pos);
+ − 3653 min_allowed = flags & GB_ALLOW_PAST_ACCESSIBLE ? BUF_BEG (b) : BUF_BEGV (b);
+ − 3654 max_allowed = flags & GB_ALLOW_PAST_ACCESSIBLE ? BUF_Z (b) : BUF_ZV (b);
+ − 3655
+ − 3656 if (ind < min_allowed || ind > max_allowed)
+ − 3657 {
+ − 3658 if (flags & GB_COERCE_RANGE)
+ − 3659 ind = ind < min_allowed ? min_allowed : max_allowed;
+ − 3660 else if (flags & GB_NO_ERROR_IF_BAD)
+ − 3661 ind = -1;
+ − 3662 else
+ − 3663 {
793
+ − 3664 Lisp_Object buffer = wrap_buffer (b);
+ − 3665
771
+ − 3666 args_out_of_range (buffer, pos);
+ − 3667 }
+ − 3668 }
+ − 3669
+ − 3670 return ind;
+ − 3671 }
+ − 3672
+ − 3673 Bytebpos
+ − 3674 get_buffer_pos_byte (struct buffer *b, Lisp_Object pos, unsigned int flags)
+ − 3675 {
+ − 3676 Charbpos bpos = get_buffer_pos_char (b, pos, flags);
+ − 3677 if (bpos < 0) /* could happen with GB_NO_ERROR_IF_BAD */
+ − 3678 return -1;
+ − 3679 return charbpos_to_bytebpos (b, bpos);
+ − 3680 }
+ − 3681
+ − 3682 /* Return a pair of buffer positions representing a range of text,
+ − 3683 taken from a pair of Lisp_Objects. Full error-checking is
+ − 3684 done on the positions. Flags can be specified to control the
+ − 3685 behavior of out-of-range values. The default behavior is to
+ − 3686 allow the range bounds to be specified in either order
+ − 3687 (however, FROM_OUT will always be the lower bound of the range
+ − 3688 and TO_OUT the upper bound),to require that the positions
+ − 3689 are within the accessible part of the buffer (BEGV and ZV),
+ − 3690 and to signal an error if the positions are out of range.
+ − 3691 */
+ − 3692
+ − 3693 void
+ − 3694 get_buffer_range_char (struct buffer *b, Lisp_Object from, Lisp_Object to,
826
+ − 3695 Charbpos *from_out, Charbpos *to_out,
+ − 3696 unsigned int flags)
771
+ − 3697 {
+ − 3698 /* Does not GC */
+ − 3699 Charbpos min_allowed, max_allowed;
+ − 3700
+ − 3701 min_allowed = (flags & GB_ALLOW_PAST_ACCESSIBLE) ?
+ − 3702 BUF_BEG (b) : BUF_BEGV (b);
+ − 3703 max_allowed = (flags & GB_ALLOW_PAST_ACCESSIBLE) ?
+ − 3704 BUF_Z (b) : BUF_ZV (b);
+ − 3705
+ − 3706 if (NILP (from) && (flags & GB_ALLOW_NIL))
+ − 3707 *from_out = min_allowed;
+ − 3708 else
+ − 3709 *from_out = get_buffer_pos_char (b, from, flags | GB_NO_ERROR_IF_BAD);
+ − 3710
+ − 3711 if (NILP (to) && (flags & GB_ALLOW_NIL))
+ − 3712 *to_out = max_allowed;
+ − 3713 else
+ − 3714 *to_out = get_buffer_pos_char (b, to, flags | GB_NO_ERROR_IF_BAD);
+ − 3715
+ − 3716 if ((*from_out < 0 || *to_out < 0) && !(flags & GB_NO_ERROR_IF_BAD))
+ − 3717 {
793
+ − 3718 Lisp_Object buffer = wrap_buffer (b);
+ − 3719
771
+ − 3720 args_out_of_range_3 (buffer, from, to);
+ − 3721 }
+ − 3722
+ − 3723 if (*from_out >= 0 && *to_out >= 0 && *from_out > *to_out)
+ − 3724 {
+ − 3725 if (flags & GB_CHECK_ORDER)
+ − 3726 invalid_argument_2 ("start greater than end", from, to);
+ − 3727 else
+ − 3728 {
+ − 3729 Charbpos temp = *from_out;
+ − 3730 *from_out = *to_out;
+ − 3731 *to_out = temp;
+ − 3732 }
+ − 3733 }
+ − 3734 }
+ − 3735
+ − 3736 void
+ − 3737 get_buffer_range_byte (struct buffer *b, Lisp_Object from, Lisp_Object to,
826
+ − 3738 Bytebpos *from_out, Bytebpos *to_out,
+ − 3739 unsigned int flags)
771
+ − 3740 {
+ − 3741 Charbpos s, e;
+ − 3742
+ − 3743 get_buffer_range_char (b, from, to, &s, &e, flags);
+ − 3744 if (s >= 0)
+ − 3745 *from_out = charbpos_to_bytebpos (b, s);
+ − 3746 else /* could happen with GB_NO_ERROR_IF_BAD */
+ − 3747 *from_out = -1;
+ − 3748 if (e >= 0)
+ − 3749 *to_out = charbpos_to_bytebpos (b, e);
+ − 3750 else
+ − 3751 *to_out = -1;
+ − 3752 }
+ − 3753
+ − 3754 static Charcount
+ − 3755 get_string_pos_char_1 (Lisp_Object string, Lisp_Object pos, unsigned int flags,
+ − 3756 Charcount known_length)
+ − 3757 {
+ − 3758 Charcount ccpos;
+ − 3759 Charcount min_allowed = 0;
+ − 3760 Charcount max_allowed = known_length;
+ − 3761
+ − 3762 /* Computation of KNOWN_LENGTH is potentially expensive so we pass
+ − 3763 it in. */
+ − 3764 CHECK_INT (pos);
+ − 3765 ccpos = XINT (pos);
+ − 3766 if (ccpos < 0 && flags & GB_NEGATIVE_FROM_END)
+ − 3767 ccpos += max_allowed;
+ − 3768
+ − 3769 if (ccpos < min_allowed || ccpos > max_allowed)
+ − 3770 {
+ − 3771 if (flags & GB_COERCE_RANGE)
+ − 3772 ccpos = ccpos < min_allowed ? min_allowed : max_allowed;
+ − 3773 else if (flags & GB_NO_ERROR_IF_BAD)
+ − 3774 ccpos = -1;
+ − 3775 else
+ − 3776 args_out_of_range (string, pos);
+ − 3777 }
+ − 3778
+ − 3779 return ccpos;
+ − 3780 }
+ − 3781
+ − 3782 Charcount
+ − 3783 get_string_pos_char (Lisp_Object string, Lisp_Object pos, unsigned int flags)
+ − 3784 {
+ − 3785 return get_string_pos_char_1 (string, pos, flags,
826
+ − 3786 string_char_length (string));
771
+ − 3787 }
+ − 3788
+ − 3789 Bytecount
+ − 3790 get_string_pos_byte (Lisp_Object string, Lisp_Object pos, unsigned int flags)
+ − 3791 {
+ − 3792 Charcount ccpos = get_string_pos_char (string, pos, flags);
+ − 3793 if (ccpos < 0) /* could happen with GB_NO_ERROR_IF_BAD */
+ − 3794 return -1;
793
+ − 3795 return string_index_char_to_byte (string, ccpos);
771
+ − 3796 }
+ − 3797
+ − 3798 void
+ − 3799 get_string_range_char (Lisp_Object string, Lisp_Object from, Lisp_Object to,
+ − 3800 Charcount *from_out, Charcount *to_out,
+ − 3801 unsigned int flags)
+ − 3802 {
+ − 3803 Charcount min_allowed = 0;
826
+ − 3804 Charcount max_allowed = string_char_length (string);
771
+ − 3805
+ − 3806 if (NILP (from) && (flags & GB_ALLOW_NIL))
+ − 3807 *from_out = min_allowed;
+ − 3808 else
+ − 3809 *from_out = get_string_pos_char_1 (string, from,
+ − 3810 flags | GB_NO_ERROR_IF_BAD,
+ − 3811 max_allowed);
+ − 3812
+ − 3813 if (NILP (to) && (flags & GB_ALLOW_NIL))
+ − 3814 *to_out = max_allowed;
+ − 3815 else
+ − 3816 *to_out = get_string_pos_char_1 (string, to,
+ − 3817 flags | GB_NO_ERROR_IF_BAD,
+ − 3818 max_allowed);
+ − 3819
+ − 3820 if ((*from_out < 0 || *to_out < 0) && !(flags & GB_NO_ERROR_IF_BAD))
+ − 3821 args_out_of_range_3 (string, from, to);
+ − 3822
+ − 3823 if (*from_out >= 0 && *to_out >= 0 && *from_out > *to_out)
+ − 3824 {
+ − 3825 if (flags & GB_CHECK_ORDER)
+ − 3826 invalid_argument_2 ("start greater than end", from, to);
+ − 3827 else
+ − 3828 {
+ − 3829 Charbpos temp = *from_out;
+ − 3830 *from_out = *to_out;
+ − 3831 *to_out = temp;
+ − 3832 }
+ − 3833 }
+ − 3834 }
+ − 3835
+ − 3836 void
+ − 3837 get_string_range_byte (Lisp_Object string, Lisp_Object from, Lisp_Object to,
+ − 3838 Bytecount *from_out, Bytecount *to_out,
+ − 3839 unsigned int flags)
+ − 3840 {
+ − 3841 Charcount s, e;
+ − 3842
+ − 3843 get_string_range_char (string, from, to, &s, &e, flags);
+ − 3844 if (s >= 0)
793
+ − 3845 *from_out = string_index_char_to_byte (string, s);
771
+ − 3846 else /* could happen with GB_NO_ERROR_IF_BAD */
+ − 3847 *from_out = -1;
+ − 3848 if (e >= 0)
793
+ − 3849 *to_out = string_index_char_to_byte (string, e);
771
+ − 3850 else
+ − 3851 *to_out = -1;
+ − 3852
+ − 3853 }
+ − 3854
826
+ − 3855 Charxpos
771
+ − 3856 get_buffer_or_string_pos_char (Lisp_Object object, Lisp_Object pos,
+ − 3857 unsigned int flags)
+ − 3858 {
+ − 3859 return STRINGP (object) ?
+ − 3860 get_string_pos_char (object, pos, flags) :
+ − 3861 get_buffer_pos_char (XBUFFER (object), pos, flags);
+ − 3862 }
+ − 3863
826
+ − 3864 Bytexpos
771
+ − 3865 get_buffer_or_string_pos_byte (Lisp_Object object, Lisp_Object pos,
+ − 3866 unsigned int flags)
+ − 3867 {
+ − 3868 return STRINGP (object) ?
+ − 3869 get_string_pos_byte (object, pos, flags) :
+ − 3870 get_buffer_pos_byte (XBUFFER (object), pos, flags);
+ − 3871 }
+ − 3872
+ − 3873 void
+ − 3874 get_buffer_or_string_range_char (Lisp_Object object, Lisp_Object from,
826
+ − 3875 Lisp_Object to, Charxpos *from_out,
+ − 3876 Charxpos *to_out, unsigned int flags)
771
+ − 3877 {
+ − 3878 if (STRINGP (object))
+ − 3879 get_string_range_char (object, from, to, from_out, to_out, flags);
+ − 3880 else
826
+ − 3881 get_buffer_range_char (XBUFFER (object), from, to, from_out, to_out,
+ − 3882 flags);
771
+ − 3883 }
+ − 3884
+ − 3885 void
+ − 3886 get_buffer_or_string_range_byte (Lisp_Object object, Lisp_Object from,
826
+ − 3887 Lisp_Object to, Bytexpos *from_out,
+ − 3888 Bytexpos *to_out, unsigned int flags)
771
+ − 3889 {
+ − 3890 if (STRINGP (object))
+ − 3891 get_string_range_byte (object, from, to, from_out, to_out, flags);
+ − 3892 else
826
+ − 3893 get_buffer_range_byte (XBUFFER (object), from, to, from_out, to_out,
+ − 3894 flags);
771
+ − 3895 }
+ − 3896
826
+ − 3897 Charxpos
771
+ − 3898 buffer_or_string_accessible_begin_char (Lisp_Object object)
+ − 3899 {
+ − 3900 return STRINGP (object) ? 0 : BUF_BEGV (XBUFFER (object));
+ − 3901 }
+ − 3902
826
+ − 3903 Charxpos
771
+ − 3904 buffer_or_string_accessible_end_char (Lisp_Object object)
+ − 3905 {
+ − 3906 return STRINGP (object) ?
826
+ − 3907 string_char_length (object) : BUF_ZV (XBUFFER (object));
771
+ − 3908 }
+ − 3909
826
+ − 3910 Bytexpos
771
+ − 3911 buffer_or_string_accessible_begin_byte (Lisp_Object object)
+ − 3912 {
826
+ − 3913 return STRINGP (object) ? 0 : BYTE_BUF_BEGV (XBUFFER (object));
771
+ − 3914 }
+ − 3915
826
+ − 3916 Bytexpos
771
+ − 3917 buffer_or_string_accessible_end_byte (Lisp_Object object)
+ − 3918 {
+ − 3919 return STRINGP (object) ?
826
+ − 3920 XSTRING_LENGTH (object) : BYTE_BUF_ZV (XBUFFER (object));
771
+ − 3921 }
+ − 3922
826
+ − 3923 Charxpos
771
+ − 3924 buffer_or_string_absolute_begin_char (Lisp_Object object)
+ − 3925 {
+ − 3926 return STRINGP (object) ? 0 : BUF_BEG (XBUFFER (object));
+ − 3927 }
+ − 3928
826
+ − 3929 Charxpos
771
+ − 3930 buffer_or_string_absolute_end_char (Lisp_Object object)
+ − 3931 {
+ − 3932 return STRINGP (object) ?
826
+ − 3933 string_char_length (object) : BUF_Z (XBUFFER (object));
+ − 3934 }
+ − 3935
+ − 3936 Bytexpos
+ − 3937 buffer_or_string_absolute_begin_byte (Lisp_Object object)
+ − 3938 {
+ − 3939 return STRINGP (object) ? 0 : BYTE_BUF_BEG (XBUFFER (object));
+ − 3940 }
+ − 3941
+ − 3942 Bytexpos
+ − 3943 buffer_or_string_absolute_end_byte (Lisp_Object object)
+ − 3944 {
+ − 3945 return STRINGP (object) ?
+ − 3946 XSTRING_LENGTH (object) : BYTE_BUF_Z (XBUFFER (object));
+ − 3947 }
+ − 3948
+ − 3949 Charbpos
+ − 3950 charbpos_clip_to_bounds (Charbpos lower, Charbpos num, Charbpos upper)
+ − 3951 {
+ − 3952 return (num < lower ? lower :
+ − 3953 num > upper ? upper :
+ − 3954 num);
771
+ − 3955 }
+ − 3956
+ − 3957 Bytebpos
826
+ − 3958 bytebpos_clip_to_bounds (Bytebpos lower, Bytebpos num, Bytebpos upper)
+ − 3959 {
+ − 3960 return (num < lower ? lower :
+ − 3961 num > upper ? upper :
+ − 3962 num);
+ − 3963 }
+ − 3964
+ − 3965 Charxpos
+ − 3966 charxpos_clip_to_bounds (Charxpos lower, Charxpos num, Charxpos upper)
771
+ − 3967 {
826
+ − 3968 return (num < lower ? lower :
+ − 3969 num > upper ? upper :
+ − 3970 num);
+ − 3971 }
+ − 3972
+ − 3973 Bytexpos
+ − 3974 bytexpos_clip_to_bounds (Bytexpos lower, Bytexpos num, Bytexpos upper)
+ − 3975 {
+ − 3976 return (num < lower ? lower :
+ − 3977 num > upper ? upper :
+ − 3978 num);
771
+ − 3979 }
+ − 3980
826
+ − 3981 /* These could be implemented in terms of the get_buffer_or_string()
+ − 3982 functions above, but those are complicated and handle lots of weird
+ − 3983 cases stemming from uncertain external input. */
+ − 3984
+ − 3985 Charxpos
+ − 3986 buffer_or_string_clip_to_accessible_char (Lisp_Object object, Charxpos pos)
+ − 3987 {
+ − 3988 return (charxpos_clip_to_bounds
+ − 3989 (pos, buffer_or_string_accessible_begin_char (object),
+ − 3990 buffer_or_string_accessible_end_char (object)));
+ − 3991 }
+ − 3992
+ − 3993 Bytexpos
+ − 3994 buffer_or_string_clip_to_accessible_byte (Lisp_Object object, Bytexpos pos)
771
+ − 3995 {
826
+ − 3996 return (bytexpos_clip_to_bounds
+ − 3997 (pos, buffer_or_string_accessible_begin_byte (object),
+ − 3998 buffer_or_string_accessible_end_byte (object)));
+ − 3999 }
+ − 4000
+ − 4001 Charxpos
+ − 4002 buffer_or_string_clip_to_absolute_char (Lisp_Object object, Charxpos pos)
+ − 4003 {
+ − 4004 return (charxpos_clip_to_bounds
+ − 4005 (pos, buffer_or_string_absolute_begin_char (object),
+ − 4006 buffer_or_string_absolute_end_char (object)));
+ − 4007 }
+ − 4008
+ − 4009 Bytexpos
+ − 4010 buffer_or_string_clip_to_absolute_byte (Lisp_Object object, Bytexpos pos)
+ − 4011 {
+ − 4012 return (bytexpos_clip_to_bounds
+ − 4013 (pos, buffer_or_string_absolute_begin_byte (object),
+ − 4014 buffer_or_string_absolute_end_byte (object)));
771
+ − 4015 }
+ − 4016
+ − 4017
+ − 4018 /************************************************************************/
+ − 4019 /* Implement TO_EXTERNAL_FORMAT, TO_INTERNAL_FORMAT */
+ − 4020 /************************************************************************/
+ − 4021
+ − 4022 typedef struct
+ − 4023 {
867
+ − 4024 Dynarr_declare (Ibyte_dynarr *);
+ − 4025 } Ibyte_dynarr_dynarr;
771
+ − 4026
+ − 4027 typedef struct
+ − 4028 {
+ − 4029 Dynarr_declare (Extbyte_dynarr *);
+ − 4030 } Extbyte_dynarr_dynarr;
+ − 4031
+ − 4032 static Extbyte_dynarr_dynarr *conversion_out_dynarr_list;
867
+ − 4033 static Ibyte_dynarr_dynarr *conversion_in_dynarr_list;
771
+ − 4034
+ − 4035 static int dfc_convert_to_external_format_in_use;
+ − 4036 static int dfc_convert_to_internal_format_in_use;
+ − 4037
+ − 4038 void
+ − 4039 dfc_convert_to_external_format (dfc_conversion_type source_type,
+ − 4040 dfc_conversion_data *source,
+ − 4041 Lisp_Object coding_system,
+ − 4042 dfc_conversion_type sink_type,
+ − 4043 dfc_conversion_data *sink)
+ − 4044 {
+ − 4045 /* It's guaranteed that many callers are not prepared for GC here,
+ − 4046 esp. given that this code conversion occurs in many very hidden
+ − 4047 places. */
1292
+ − 4048 int count;
771
+ − 4049 Extbyte_dynarr *conversion_out_dynarr;
1292
+ − 4050 PROFILE_DECLARE ();
+ − 4051
2367
+ − 4052 assert (!inhibit_non_essential_conversion_operations);
1292
+ − 4053 PROFILE_RECORD_ENTERING_SECTION (QSin_internal_external_conversion);
+ − 4054
+ − 4055 count = begin_gc_forbidden ();
771
+ − 4056
+ − 4057 type_checking_assert
+ − 4058 (((source_type == DFC_TYPE_DATA) ||
+ − 4059 (source_type == DFC_TYPE_LISP_LSTREAM && LSTREAMP (source->lisp_object)) ||
+ − 4060 (source_type == DFC_TYPE_LISP_STRING && STRINGP (source->lisp_object)))
+ − 4061 &&
+ − 4062 ((sink_type == DFC_TYPE_DATA) ||
+ − 4063 (sink_type == DFC_TYPE_LISP_LSTREAM && LSTREAMP (source->lisp_object))));
+ − 4064
+ − 4065 if (Dynarr_length (conversion_out_dynarr_list) <=
+ − 4066 dfc_convert_to_external_format_in_use)
+ − 4067 Dynarr_add (conversion_out_dynarr_list, Dynarr_new (Extbyte));
+ − 4068 conversion_out_dynarr = Dynarr_at (conversion_out_dynarr_list,
+ − 4069 dfc_convert_to_external_format_in_use);
+ − 4070 Dynarr_reset (conversion_out_dynarr);
+ − 4071
853
+ − 4072 internal_bind_int (&dfc_convert_to_external_format_in_use,
+ − 4073 dfc_convert_to_external_format_in_use + 1);
+ − 4074
771
+ − 4075 coding_system = get_coding_system_for_text_file (coding_system, 0);
+ − 4076
+ − 4077 /* Here we optimize in the case where the coding system does no
+ − 4078 conversion. However, we don't want to optimize in case the source
+ − 4079 or sink is an lstream, since writing to an lstream can cause a
+ − 4080 garbage collection, and this could be problematic if the source
+ − 4081 is a lisp string. */
+ − 4082 if (source_type != DFC_TYPE_LISP_LSTREAM &&
+ − 4083 sink_type != DFC_TYPE_LISP_LSTREAM &&
+ − 4084 coding_system_is_binary (coding_system))
+ − 4085 {
867
+ − 4086 const Ibyte *ptr;
771
+ − 4087 Bytecount len;
+ − 4088
+ − 4089 if (source_type == DFC_TYPE_LISP_STRING)
+ − 4090 {
+ − 4091 ptr = XSTRING_DATA (source->lisp_object);
+ − 4092 len = XSTRING_LENGTH (source->lisp_object);
+ − 4093 }
+ − 4094 else
+ − 4095 {
867
+ − 4096 ptr = (Ibyte *) source->data.ptr;
771
+ − 4097 len = source->data.len;
+ − 4098 }
+ − 4099
+ − 4100 #ifdef MULE
+ − 4101 {
867
+ − 4102 const Ibyte *end;
771
+ − 4103 for (end = ptr + len; ptr < end;)
+ − 4104 {
867
+ − 4105 Ibyte c =
826
+ − 4106 (byte_ascii_p (*ptr)) ? *ptr :
771
+ − 4107 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
+ − 4108 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) :
+ − 4109 '~';
+ − 4110
+ − 4111 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
867
+ − 4112 INC_IBYTEPTR (ptr);
771
+ − 4113 }
800
+ − 4114 text_checking_assert (ptr == end);
771
+ − 4115 }
+ − 4116 #else
+ − 4117 Dynarr_add_many (conversion_out_dynarr, ptr, len);
+ − 4118 #endif
+ − 4119
+ − 4120 }
1315
+ − 4121 #ifdef WIN32_ANY
771
+ − 4122 /* Optimize the common case involving Unicode where only ASCII is involved */
+ − 4123 else if (source_type != DFC_TYPE_LISP_LSTREAM &&
+ − 4124 sink_type != DFC_TYPE_LISP_LSTREAM &&
+ − 4125 dfc_coding_system_is_unicode (coding_system))
+ − 4126 {
867
+ − 4127 const Ibyte *ptr, *p;
771
+ − 4128 Bytecount len;
867
+ − 4129 const Ibyte *end;
771
+ − 4130
+ − 4131 if (source_type == DFC_TYPE_LISP_STRING)
+ − 4132 {
+ − 4133 ptr = XSTRING_DATA (source->lisp_object);
+ − 4134 len = XSTRING_LENGTH (source->lisp_object);
+ − 4135 }
+ − 4136 else
+ − 4137 {
867
+ − 4138 ptr = (Ibyte *) source->data.ptr;
771
+ − 4139 len = source->data.len;
+ − 4140 }
+ − 4141 end = ptr + len;
+ − 4142
+ − 4143 for (p = ptr; p < end; p++)
+ − 4144 {
826
+ − 4145 if (!byte_ascii_p (*p))
771
+ − 4146 goto the_hard_way;
+ − 4147 }
+ − 4148
+ − 4149 for (p = ptr; p < end; p++)
+ − 4150 {
+ − 4151 Dynarr_add (conversion_out_dynarr, (Extbyte) (*p));
+ − 4152 Dynarr_add (conversion_out_dynarr, (Extbyte) '\0');
+ − 4153 }
+ − 4154 }
1315
+ − 4155 #endif /* WIN32_ANY */
771
+ − 4156 else
+ − 4157 {
+ − 4158 Lisp_Object streams_to_delete[3];
+ − 4159 int delete_count;
+ − 4160 Lisp_Object instream, outstream;
+ − 4161 Lstream *reader, *writer;
+ − 4162
1315
+ − 4163 #ifdef WIN32_ANY
771
+ − 4164 the_hard_way:
1315
+ − 4165 #endif /* WIN32_ANY */
771
+ − 4166 delete_count = 0;
+ − 4167 if (source_type == DFC_TYPE_LISP_LSTREAM)
+ − 4168 instream = source->lisp_object;
+ − 4169 else if (source_type == DFC_TYPE_DATA)
+ − 4170 streams_to_delete[delete_count++] = instream =
+ − 4171 make_fixed_buffer_input_stream (source->data.ptr, source->data.len);
+ − 4172 else
+ − 4173 {
+ − 4174 type_checking_assert (source_type == DFC_TYPE_LISP_STRING);
+ − 4175 streams_to_delete[delete_count++] = instream =
+ − 4176 /* This will GCPRO the Lisp string */
+ − 4177 make_lisp_string_input_stream (source->lisp_object, 0, -1);
+ − 4178 }
+ − 4179
+ − 4180 if (sink_type == DFC_TYPE_LISP_LSTREAM)
+ − 4181 outstream = sink->lisp_object;
+ − 4182 else
+ − 4183 {
+ − 4184 type_checking_assert (sink_type == DFC_TYPE_DATA);
+ − 4185 streams_to_delete[delete_count++] = outstream =
+ − 4186 make_dynarr_output_stream
+ − 4187 ((unsigned_char_dynarr *) conversion_out_dynarr);
+ − 4188 }
+ − 4189
+ − 4190 streams_to_delete[delete_count++] = outstream =
800
+ − 4191 make_coding_output_stream (XLSTREAM (outstream), coding_system,
+ − 4192 CODING_ENCODE, 0);
771
+ − 4193
+ − 4194 reader = XLSTREAM (instream);
+ − 4195 writer = XLSTREAM (outstream);
+ − 4196 /* decoding_stream will gc-protect outstream */
1204
+ − 4197 {
+ − 4198 struct gcpro gcpro1, gcpro2;
+ − 4199 GCPRO2 (instream, outstream);
+ − 4200
+ − 4201 while (1)
+ − 4202 {
+ − 4203 Bytecount size_in_bytes;
+ − 4204 char tempbuf[1024]; /* some random amount */
+ − 4205
+ − 4206 size_in_bytes = Lstream_read (reader, tempbuf, sizeof (tempbuf));
+ − 4207
+ − 4208 if (size_in_bytes == 0)
+ − 4209 break;
+ − 4210 else if (size_in_bytes < 0)
+ − 4211 signal_error (Qtext_conversion_error,
+ − 4212 "Error converting to external format", Qunbound);
+ − 4213
+ − 4214 if (Lstream_write (writer, tempbuf, size_in_bytes) < 0)
+ − 4215 signal_error (Qtext_conversion_error,
+ − 4216 "Error converting to external format", Qunbound);
+ − 4217 }
+ − 4218
+ − 4219 /* Closing writer will close any stream at the other end of writer. */
+ − 4220 Lstream_close (writer);
+ − 4221 Lstream_close (reader);
+ − 4222 UNGCPRO;
+ − 4223 }
771
+ − 4224
+ − 4225 /* The idea is that this function will create no garbage. */
+ − 4226 while (delete_count)
+ − 4227 Lstream_delete (XLSTREAM (streams_to_delete [--delete_count]));
+ − 4228 }
+ − 4229
+ − 4230 unbind_to (count);
+ − 4231
+ − 4232 if (sink_type != DFC_TYPE_LISP_LSTREAM)
+ − 4233 {
+ − 4234 sink->data.len = Dynarr_length (conversion_out_dynarr);
+ − 4235 /* double zero-extend because we may be dealing with Unicode data */
+ − 4236 Dynarr_add (conversion_out_dynarr, '\0');
+ − 4237 Dynarr_add (conversion_out_dynarr, '\0');
+ − 4238 sink->data.ptr = Dynarr_atp (conversion_out_dynarr, 0);
+ − 4239 }
1292
+ − 4240
+ − 4241 PROFILE_RECORD_EXITING_SECTION (QSin_internal_external_conversion);
771
+ − 4242 }
+ − 4243
+ − 4244 void
+ − 4245 dfc_convert_to_internal_format (dfc_conversion_type source_type,
+ − 4246 dfc_conversion_data *source,
+ − 4247 Lisp_Object coding_system,
+ − 4248 dfc_conversion_type sink_type,
+ − 4249 dfc_conversion_data *sink)
+ − 4250 {
+ − 4251 /* It's guaranteed that many callers are not prepared for GC here,
+ − 4252 esp. given that this code conversion occurs in many very hidden
+ − 4253 places. */
1292
+ − 4254 int count;
867
+ − 4255 Ibyte_dynarr *conversion_in_dynarr;
2421
+ − 4256 Lisp_Object underlying_cs;
1292
+ − 4257 PROFILE_DECLARE ();
+ − 4258
2367
+ − 4259 assert (!inhibit_non_essential_conversion_operations);
1292
+ − 4260 PROFILE_RECORD_ENTERING_SECTION (QSin_internal_external_conversion);
+ − 4261
+ − 4262 count = begin_gc_forbidden ();
771
+ − 4263
+ − 4264 type_checking_assert
+ − 4265 ((source_type == DFC_TYPE_DATA ||
+ − 4266 source_type == DFC_TYPE_LISP_LSTREAM)
+ − 4267 &&
+ − 4268 (sink_type == DFC_TYPE_DATA ||
+ − 4269 sink_type == DFC_TYPE_LISP_LSTREAM));
+ − 4270
+ − 4271 if (Dynarr_length (conversion_in_dynarr_list) <=
+ − 4272 dfc_convert_to_internal_format_in_use)
867
+ − 4273 Dynarr_add (conversion_in_dynarr_list, Dynarr_new (Ibyte));
771
+ − 4274 conversion_in_dynarr = Dynarr_at (conversion_in_dynarr_list,
+ − 4275 dfc_convert_to_internal_format_in_use);
+ − 4276 Dynarr_reset (conversion_in_dynarr);
+ − 4277
853
+ − 4278 internal_bind_int (&dfc_convert_to_internal_format_in_use,
+ − 4279 dfc_convert_to_internal_format_in_use + 1);
+ − 4280
2421
+ − 4281 /* The second call does the equivalent of both calls, but we need
+ − 4282 the result after the first call (which wraps just a to-text
+ − 4283 converter) as well as the result after the second call (which
+ − 4284 also wraps an EOL-detection converter). */
+ − 4285 underlying_cs = get_coding_system_for_text_file (coding_system, 0);
+ − 4286 coding_system = get_coding_system_for_text_file (underlying_cs, 1);
771
+ − 4287
+ − 4288 if (source_type != DFC_TYPE_LISP_LSTREAM &&
+ − 4289 sink_type != DFC_TYPE_LISP_LSTREAM &&
2421
+ − 4290 coding_system_is_binary (underlying_cs))
771
+ − 4291 {
+ − 4292 #ifdef MULE
2421
+ − 4293 const Ibyte *ptr;
771
+ − 4294 Bytecount len = source->data.len;
2421
+ − 4295 const Ibyte *end;
+ − 4296
+ − 4297 /* Make sure no EOL conversion is needed. With a little work we
+ − 4298 could handle EOL conversion as well but it may not be needed as an
+ − 4299 optimization. */
+ − 4300 if (!EQ (coding_system, underlying_cs))
+ − 4301 {
+ − 4302 for (ptr = (const Ibyte *) source->data.ptr, end = ptr + len;
+ − 4303 ptr < end; ptr++)
+ − 4304 {
+ − 4305 if (*ptr == '\r' || *ptr == '\n')
+ − 4306 goto the_hard_way;
+ − 4307 }
+ − 4308 }
+ − 4309
+ − 4310 for (ptr = (const Ibyte *) source->data.ptr, end = ptr + len;
+ − 4311 ptr < end; ptr++)
771
+ − 4312 {
867
+ − 4313 Ibyte c = *ptr;
771
+ − 4314
826
+ − 4315 if (byte_ascii_p (c))
771
+ − 4316 Dynarr_add (conversion_in_dynarr, c);
826
+ − 4317 else if (byte_c1_p (c))
771
+ − 4318 {
+ − 4319 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_CONTROL_1);
+ − 4320 Dynarr_add (conversion_in_dynarr, c + 0x20);
+ − 4321 }
+ − 4322 else
+ − 4323 {
+ − 4324 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_LATIN_ISO8859_1);
+ − 4325 Dynarr_add (conversion_in_dynarr, c);
+ − 4326 }
+ − 4327 }
+ − 4328 #else
+ − 4329 Dynarr_add_many (conversion_in_dynarr, source->data.ptr, source->data.len);
+ − 4330 #endif
+ − 4331 }
1315
+ − 4332 #ifdef WIN32_ANY
1292
+ − 4333 /* Optimize the common case involving Unicode where only ASCII/Latin-1 is
+ − 4334 involved */
771
+ − 4335 else if (source_type != DFC_TYPE_LISP_LSTREAM &&
+ − 4336 sink_type != DFC_TYPE_LISP_LSTREAM &&
2421
+ − 4337 dfc_coding_system_is_unicode (underlying_cs))
771
+ − 4338 {
2421
+ − 4339 const Ibyte *ptr;
771
+ − 4340 Bytecount len = source->data.len;
2421
+ − 4341 const Ibyte *end;
771
+ − 4342
+ − 4343 if (len & 1)
+ − 4344 goto the_hard_way;
+ − 4345
2421
+ − 4346 /* Make sure only ASCII/Latin-1 is involved */
+ − 4347 for (ptr = (const Ibyte *) source->data.ptr + 1, end = ptr + len;
+ − 4348 ptr < end; ptr += 2)
771
+ − 4349 {
+ − 4350 if (*ptr)
+ − 4351 goto the_hard_way;
+ − 4352 }
+ − 4353
2421
+ − 4354 /* Make sure no EOL conversion is needed. With a little work we
+ − 4355 could handle EOL conversion as well but it may not be needed as an
+ − 4356 optimization. */
+ − 4357 if (!EQ (coding_system, underlying_cs))
+ − 4358 {
+ − 4359 for (ptr = (const Ibyte *) source->data.ptr, end = ptr + len;
+ − 4360 ptr < end; ptr += 2)
+ − 4361 {
+ − 4362 if (*ptr == '\r' || *ptr == '\n')
+ − 4363 goto the_hard_way;
+ − 4364 }
+ − 4365 }
+ − 4366
+ − 4367 for (ptr = (const Ibyte *) source->data.ptr, end = ptr + len;
+ − 4368 ptr < end; ptr += 2)
771
+ − 4369 {
867
+ − 4370 Ibyte c = *ptr;
771
+ − 4371
826
+ − 4372 if (byte_ascii_p (c))
771
+ − 4373 Dynarr_add (conversion_in_dynarr, c);
+ − 4374 #ifdef MULE
826
+ − 4375 else if (byte_c1_p (c))
771
+ − 4376 {
+ − 4377 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_CONTROL_1);
+ − 4378 Dynarr_add (conversion_in_dynarr, c + 0x20);
+ − 4379 }
+ − 4380 else
+ − 4381 {
+ − 4382 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_LATIN_ISO8859_1);
+ − 4383 Dynarr_add (conversion_in_dynarr, c);
+ − 4384 }
+ − 4385 #endif /* MULE */
+ − 4386 }
+ − 4387 }
1315
+ − 4388 #endif /* WIN32_ANY */
771
+ − 4389 else
+ − 4390 {
+ − 4391 Lisp_Object streams_to_delete[3];
+ − 4392 int delete_count;
+ − 4393 Lisp_Object instream, outstream;
+ − 4394 Lstream *reader, *writer;
+ − 4395
2421
+ − 4396 #if defined (WIN32_ANY) || defined (MULE)
771
+ − 4397 the_hard_way:
2421
+ − 4398 #endif
771
+ − 4399 delete_count = 0;
+ − 4400 if (source_type == DFC_TYPE_LISP_LSTREAM)
+ − 4401 instream = source->lisp_object;
+ − 4402 else
+ − 4403 {
+ − 4404 type_checking_assert (source_type == DFC_TYPE_DATA);
+ − 4405 streams_to_delete[delete_count++] = instream =
+ − 4406 make_fixed_buffer_input_stream (source->data.ptr, source->data.len);
+ − 4407 }
+ − 4408
+ − 4409 if (sink_type == DFC_TYPE_LISP_LSTREAM)
+ − 4410 outstream = sink->lisp_object;
+ − 4411 else
+ − 4412 {
+ − 4413 type_checking_assert (sink_type == DFC_TYPE_DATA);
+ − 4414 streams_to_delete[delete_count++] = outstream =
+ − 4415 make_dynarr_output_stream
+ − 4416 ((unsigned_char_dynarr *) conversion_in_dynarr);
+ − 4417 }
+ − 4418
+ − 4419 streams_to_delete[delete_count++] = outstream =
800
+ − 4420 make_coding_output_stream (XLSTREAM (outstream), coding_system,
+ − 4421 CODING_DECODE, 0);
771
+ − 4422
+ − 4423 reader = XLSTREAM (instream);
+ − 4424 writer = XLSTREAM (outstream);
1204
+ − 4425 {
+ − 4426 struct gcpro gcpro1, gcpro2;
+ − 4427 /* outstream will gc-protect its sink stream, if necessary */
+ − 4428 GCPRO2 (instream, outstream);
+ − 4429
+ − 4430 while (1)
+ − 4431 {
+ − 4432 Bytecount size_in_bytes;
+ − 4433 char tempbuf[1024]; /* some random amount */
+ − 4434
+ − 4435 size_in_bytes = Lstream_read (reader, tempbuf, sizeof (tempbuf));
+ − 4436
+ − 4437 if (size_in_bytes == 0)
+ − 4438 break;
+ − 4439 else if (size_in_bytes < 0)
+ − 4440 signal_error (Qtext_conversion_error,
+ − 4441 "Error converting to internal format", Qunbound);
+ − 4442
+ − 4443 if (Lstream_write (writer, tempbuf, size_in_bytes) < 0)
+ − 4444 signal_error (Qtext_conversion_error,
+ − 4445 "Error converting to internal format", Qunbound);
+ − 4446 }
+ − 4447
+ − 4448 /* Closing writer will close any stream at the other end of writer. */
+ − 4449 Lstream_close (writer);
+ − 4450 Lstream_close (reader);
+ − 4451 UNGCPRO;
+ − 4452 }
771
+ − 4453
+ − 4454 /* The idea is that this function will create no garbage. */
+ − 4455 while (delete_count)
+ − 4456 Lstream_delete (XLSTREAM (streams_to_delete [--delete_count]));
+ − 4457 }
+ − 4458
+ − 4459 unbind_to (count);
+ − 4460
+ − 4461 if (sink_type != DFC_TYPE_LISP_LSTREAM)
+ − 4462 {
+ − 4463 sink->data.len = Dynarr_length (conversion_in_dynarr);
+ − 4464 Dynarr_add (conversion_in_dynarr, '\0'); /* remember to NUL-terminate! */
+ − 4465 /* The macros don't currently distinguish between internal and
+ − 4466 external sinks, and allocate and copy two extra bytes in both
+ − 4467 cases. So we add a second zero, just like for external data
+ − 4468 (in that case, because we may be converting to Unicode). */
+ − 4469 Dynarr_add (conversion_in_dynarr, '\0');
+ − 4470 sink->data.ptr = Dynarr_atp (conversion_in_dynarr, 0);
+ − 4471 }
1292
+ − 4472
+ − 4473 PROFILE_RECORD_EXITING_SECTION (QSin_internal_external_conversion);
771
+ − 4474 }
+ − 4475
1318
+ − 4476 /* ----------------------------------------------------------------------- */
2367
+ − 4477 /* Alloca-conversion helpers */
+ − 4478 /* ----------------------------------------------------------------------- */
+ − 4479
+ − 4480 /* For alloca(), things are trickier because the calling function needs to
+ − 4481 allocate. This means that the caller needs to do the following:
+ − 4482
+ − 4483 (a) invoke us to do the conversion, remember the data and return the size.
+ − 4484 (b) alloca() the proper size.
+ − 4485 (c) invoke us again to copy the data.
+ − 4486
+ − 4487 We need to handle the possibility of two or more invocations of the
+ − 4488 converter in the same expression. In such cases it's conceivable that
+ − 4489 the evaluation of the sub-expressions will be overlapping (e.g. one size
+ − 4490 function called, then the other one called, then the copy functions
+ − 4491 called). To handle this, we keep a list of active data, indexed by the
+ − 4492 src expression. (We use the stringize operator to avoid evaluating the
+ − 4493 expression multiple times.) If the caller uses the exact same src
+ − 4494 expression twice in two converter calls in the same subexpression, we
2500
+ − 4495 will lose, but at least we can check for this and ABORT(). We could
2367
+ − 4496 conceivably try to index on other parameters as well, but there is not
+ − 4497 really any point. */
+ − 4498
+ − 4499 alloca_convert_vals_dynarr *active_alloca_convert;
+ − 4500
+ − 4501 int
+ − 4502 find_pos_of_existing_active_alloca_convert (const char *srctext)
+ − 4503 {
+ − 4504 alloca_convert_vals *vals = NULL;
+ − 4505 int i;
+ − 4506
+ − 4507 if (!active_alloca_convert)
+ − 4508 active_alloca_convert = Dynarr_new (alloca_convert_vals);
+ − 4509
+ − 4510 for (i = 0; i < Dynarr_length (active_alloca_convert); i++)
+ − 4511 {
+ − 4512 vals = Dynarr_atp (active_alloca_convert, i);
2385
+ − 4513 /* On my system, two different occurrences of the same stringized
+ − 4514 argument always point to the same string. However, on someone
+ − 4515 else's system, that wasn't the case. We check for equality
+ − 4516 first, since it seems systems work my way more than the other
+ − 4517 way. */
+ − 4518 if (vals->srctext == srctext || !strcmp (vals->srctext, srctext))
2367
+ − 4519 return i;
+ − 4520 }
+ − 4521
+ − 4522 return -1;
+ − 4523 }
+ − 4524
+ − 4525 /* ----------------------------------------------------------------------- */
1318
+ − 4526 /* New-style DFC converters (data is returned rather than stored into var) */
+ − 4527 /* ----------------------------------------------------------------------- */
+ − 4528
+ − 4529 /* We handle here the cases where SRC is a Lisp_Object, internal data
+ − 4530 (sized or unsized), or external data (sized or unsized), and return type
+ − 4531 is unsized alloca() or malloc() data. If the return type is a
+ − 4532 Lisp_Object, use build_ext_string() for unsized external data,
+ − 4533 make_ext_string() for sized external data. If the return type needs to
+ − 4534 be sized data, use the *_TO_SIZED_*() macros, and for other more
+ − 4535 complicated cases, use the original TO_*_FORMAT() macros. */
+ − 4536
+ − 4537 static void
+ − 4538 new_dfc_convert_now_damn_it (const void *src, Bytecount src_size,
+ − 4539 enum new_dfc_src_type type,
+ − 4540 void **dst, Bytecount *dst_size,
+ − 4541 Lisp_Object codesys)
+ − 4542 {
+ − 4543 /* #### In the case of alloca(), it would be a bit more efficient, for
+ − 4544 small strings, to use static Dynarr's like are used internally in
+ − 4545 TO_*_FORMAT(), or some other way of avoiding malloc() followed by
+ − 4546 free(). I doubt it really matters, though. */
+ − 4547
+ − 4548 switch (type)
+ − 4549 {
+ − 4550 case DFC_EXTERNAL:
+ − 4551 TO_INTERNAL_FORMAT (C_STRING, src,
+ − 4552 MALLOC, (*dst, *dst_size), codesys);
+ − 4553 break;
+ − 4554
+ − 4555 case DFC_SIZED_EXTERNAL:
+ − 4556 TO_INTERNAL_FORMAT (DATA, (src, src_size),
+ − 4557 MALLOC, (*dst, *dst_size), codesys);
+ − 4558 break;
+ − 4559
+ − 4560 case DFC_INTERNAL:
+ − 4561 TO_EXTERNAL_FORMAT (C_STRING, src,
+ − 4562 MALLOC, (*dst, *dst_size), codesys);
+ − 4563 break;
+ − 4564
+ − 4565 case DFC_SIZED_INTERNAL:
+ − 4566 TO_EXTERNAL_FORMAT (DATA, (src, src_size),
+ − 4567 MALLOC, (*dst, *dst_size), codesys);
+ − 4568 break;
+ − 4569
+ − 4570 case DFC_LISP_STRING:
+ − 4571 TO_EXTERNAL_FORMAT (LISP_STRING, VOID_TO_LISP (src),
+ − 4572 MALLOC, (*dst, *dst_size), codesys);
+ − 4573 break;
+ − 4574
+ − 4575 default:
2500
+ − 4576 ABORT ();
1318
+ − 4577 }
2367
+ − 4578
+ − 4579 /* The size is always + 2 because we have double zero-termination at the
+ − 4580 end of all data (for Unicode-correctness). */
+ − 4581 *dst_size += 2;
+ − 4582 }
+ − 4583
+ − 4584 Bytecount
+ − 4585 new_dfc_convert_size (const char *srctext, const void *src,
+ − 4586 Bytecount src_size, enum new_dfc_src_type type,
+ − 4587 Lisp_Object codesys)
+ − 4588 {
+ − 4589 alloca_convert_vals vals;
+ − 4590
+ − 4591 assert (find_pos_of_existing_active_alloca_convert (srctext) < 0);
+ − 4592
+ − 4593 vals.srctext = srctext;
+ − 4594
+ − 4595 new_dfc_convert_now_damn_it (src, src_size, type, &vals.dst, &vals.dst_size,
+ − 4596 codesys);
+ − 4597
+ − 4598 Dynarr_add (active_alloca_convert, vals);
+ − 4599 return vals.dst_size;
+ − 4600 }
+ − 4601
+ − 4602 void *
+ − 4603 new_dfc_convert_copy_data (const char *srctext, void *alloca_data)
+ − 4604 {
+ − 4605 alloca_convert_vals *vals;
+ − 4606 int i = find_pos_of_existing_active_alloca_convert (srctext);
+ − 4607
+ − 4608 assert (i >= 0);
+ − 4609 vals = Dynarr_atp (active_alloca_convert, i);
+ − 4610 assert (alloca_data);
+ − 4611 memcpy (alloca_data, vals->dst, vals->dst_size);
+ − 4612 xfree (vals->dst, void *);
+ − 4613 Dynarr_delete (active_alloca_convert, i);
+ − 4614 return alloca_data;
1318
+ − 4615 }
+ − 4616
+ − 4617 void *
+ − 4618 new_dfc_convert_malloc (const void *src, Bytecount src_size,
+ − 4619 enum new_dfc_src_type type, Lisp_Object codesys)
+ − 4620 {
+ − 4621 void *dst;
+ − 4622 Bytecount dst_size;
+ − 4623
+ − 4624 new_dfc_convert_now_damn_it (src, src_size, type, &dst, &dst_size, codesys);
+ − 4625 return dst;
+ − 4626 }
+ − 4627
771
+ − 4628
+ − 4629 /************************************************************************/
867
+ − 4630 /* Basic Ichar functions */
771
+ − 4631 /************************************************************************/
+ − 4632
+ − 4633 #ifdef MULE
+ − 4634
+ − 4635 /* Convert a non-ASCII Mule character C into a one-character Mule-encoded
+ − 4636 string in STR. Returns the number of bytes stored.
867
+ − 4637 Do not call this directly. Use the macro set_itext_ichar() instead.
771
+ − 4638 */
+ − 4639
+ − 4640 Bytecount
867
+ − 4641 non_ascii_set_itext_ichar (Ibyte *str, Ichar c)
771
+ − 4642 {
867
+ − 4643 Ibyte *p;
+ − 4644 Ibyte lb;
771
+ − 4645 int c1, c2;
+ − 4646 Lisp_Object charset;
+ − 4647
+ − 4648 p = str;
867
+ − 4649 BREAKUP_ICHAR (c, charset, c1, c2);
+ − 4650 lb = ichar_leading_byte (c);
826
+ − 4651 if (leading_byte_private_p (lb))
+ − 4652 *p++ = private_leading_byte_prefix (lb);
771
+ − 4653 *p++ = lb;
+ − 4654 if (EQ (charset, Vcharset_control_1))
+ − 4655 c1 += 0x20;
+ − 4656 *p++ = c1 | 0x80;
+ − 4657 if (c2)
+ − 4658 *p++ = c2 | 0x80;
+ − 4659
+ − 4660 return (p - str);
+ − 4661 }
+ − 4662
+ − 4663 /* Return the first character from a Mule-encoded string in STR,
+ − 4664 assuming it's non-ASCII. Do not call this directly.
867
+ − 4665 Use the macro itext_ichar() instead. */
+ − 4666
+ − 4667 Ichar
+ − 4668 non_ascii_itext_ichar (const Ibyte *str)
771
+ − 4669 {
867
+ − 4670 Ibyte i0 = *str, i1, i2 = 0;
771
+ − 4671 Lisp_Object charset;
+ − 4672
+ − 4673 if (i0 == LEADING_BYTE_CONTROL_1)
867
+ − 4674 return (Ichar) (*++str - 0x20);
771
+ − 4675
826
+ − 4676 if (leading_byte_prefix_p (i0))
771
+ − 4677 i0 = *++str;
+ − 4678
+ − 4679 i1 = *++str & 0x7F;
+ − 4680
826
+ − 4681 charset = charset_by_leading_byte (i0);
771
+ − 4682 if (XCHARSET_DIMENSION (charset) == 2)
+ − 4683 i2 = *++str & 0x7F;
+ − 4684
867
+ − 4685 return make_ichar (charset, i1, i2);
771
+ − 4686 }
+ − 4687
867
+ − 4688 /* Return whether CH is a valid Ichar, assuming it's non-ASCII.
+ − 4689 Do not call this directly. Use the macro valid_ichar_p() instead. */
771
+ − 4690
+ − 4691 int
867
+ − 4692 non_ascii_valid_ichar_p (Ichar ch)
771
+ − 4693 {
+ − 4694 int f1, f2, f3;
+ − 4695
+ − 4696 /* Must have only lowest 19 bits set */
+ − 4697 if (ch & ~0x7FFFF)
+ − 4698 return 0;
+ − 4699
867
+ − 4700 f1 = ichar_field1 (ch);
+ − 4701 f2 = ichar_field2 (ch);
+ − 4702 f3 = ichar_field3 (ch);
771
+ − 4703
+ − 4704 if (f1 == 0)
+ − 4705 {
+ − 4706 /* dimension-1 char */
+ − 4707 Lisp_Object charset;
+ − 4708
+ − 4709 /* leading byte must be correct */
867
+ − 4710 if (f2 < MIN_ICHAR_FIELD2_OFFICIAL ||
+ − 4711 (f2 > MAX_ICHAR_FIELD2_OFFICIAL && f2 < MIN_ICHAR_FIELD2_PRIVATE) ||
+ − 4712 f2 > MAX_ICHAR_FIELD2_PRIVATE)
771
+ − 4713 return 0;
+ − 4714 /* octet not out of range */
+ − 4715 if (f3 < 0x20)
+ − 4716 return 0;
+ − 4717 /* charset exists */
+ − 4718 /*
+ − 4719 NOTE: This takes advantage of the fact that
+ − 4720 FIELD2_TO_OFFICIAL_LEADING_BYTE and
+ − 4721 FIELD2_TO_PRIVATE_LEADING_BYTE are the same.
+ − 4722 */
826
+ − 4723 charset = charset_by_leading_byte (f2 + FIELD2_TO_OFFICIAL_LEADING_BYTE);
771
+ − 4724 if (EQ (charset, Qnil))
+ − 4725 return 0;
+ − 4726 /* check range as per size (94 or 96) of charset */
+ − 4727 return ((f3 > 0x20 && f3 < 0x7f) || XCHARSET_CHARS (charset) == 96);
+ − 4728 }
+ − 4729 else
+ − 4730 {
+ − 4731 /* dimension-2 char */
+ − 4732 Lisp_Object charset;
+ − 4733
+ − 4734 /* leading byte must be correct */
867
+ − 4735 if (f1 < MIN_ICHAR_FIELD1_OFFICIAL ||
+ − 4736 (f1 > MAX_ICHAR_FIELD1_OFFICIAL && f1 < MIN_ICHAR_FIELD1_PRIVATE) ||
+ − 4737 f1 > MAX_ICHAR_FIELD1_PRIVATE)
771
+ − 4738 return 0;
+ − 4739 /* octets not out of range */
+ − 4740 if (f2 < 0x20 || f3 < 0x20)
+ − 4741 return 0;
+ − 4742
+ − 4743 #ifdef ENABLE_COMPOSITE_CHARS
+ − 4744 if (f1 + FIELD1_TO_OFFICIAL_LEADING_BYTE == LEADING_BYTE_COMPOSITE)
+ − 4745 {
+ − 4746 if (UNBOUNDP (Fgethash (make_int (ch),
+ − 4747 Vcomposite_char_char2string_hash_table,
+ − 4748 Qunbound)))
+ − 4749 return 0;
+ − 4750 return 1;
+ − 4751 }
+ − 4752 #endif /* ENABLE_COMPOSITE_CHARS */
+ − 4753
+ − 4754 /* charset exists */
867
+ − 4755 if (f1 <= MAX_ICHAR_FIELD1_OFFICIAL)
771
+ − 4756 charset =
826
+ − 4757 charset_by_leading_byte (f1 + FIELD1_TO_OFFICIAL_LEADING_BYTE);
771
+ − 4758 else
+ − 4759 charset =
826
+ − 4760 charset_by_leading_byte (f1 + FIELD1_TO_PRIVATE_LEADING_BYTE);
771
+ − 4761
+ − 4762 if (EQ (charset, Qnil))
+ − 4763 return 0;
+ − 4764 /* check range as per size (94x94 or 96x96) of charset */
+ − 4765 return ((f2 != 0x20 && f2 != 0x7F && f3 != 0x20 && f3 != 0x7F) ||
+ − 4766 XCHARSET_CHARS (charset) == 96);
+ − 4767 }
+ − 4768 }
+ − 4769
+ − 4770 /* Copy the character pointed to by SRC into DST. Do not call this
867
+ − 4771 directly. Use the macro itext_copy_ichar() instead.
771
+ − 4772 Return the number of bytes copied. */
+ − 4773
+ − 4774 Bytecount
867
+ − 4775 non_ascii_itext_copy_ichar (const Ibyte *src, Ibyte *dst)
771
+ − 4776 {
826
+ − 4777 Bytecount bytes = rep_bytes_by_first_byte (*src);
771
+ − 4778 Bytecount i;
+ − 4779 for (i = bytes; i; i--, dst++, src++)
+ − 4780 *dst = *src;
+ − 4781 return bytes;
+ − 4782 }
+ − 4783
+ − 4784 #endif /* MULE */
+ − 4785
+ − 4786
+ − 4787 /************************************************************************/
867
+ − 4788 /* streams of Ichars */
771
+ − 4789 /************************************************************************/
+ − 4790
+ − 4791 #ifdef MULE
+ − 4792
867
+ − 4793 /* Treat a stream as a stream of Ichar's rather than a stream of bytes.
771
+ − 4794 The functions below are not meant to be called directly; use
+ − 4795 the macros in insdel.h. */
+ − 4796
867
+ − 4797 Ichar
+ − 4798 Lstream_get_ichar_1 (Lstream *stream, int ch)
771
+ − 4799 {
867
+ − 4800 Ibyte str[MAX_ICHAR_LEN];
+ − 4801 Ibyte *strptr = str;
771
+ − 4802 Bytecount bytes;
+ − 4803
867
+ − 4804 str[0] = (Ibyte) ch;
771
+ − 4805
826
+ − 4806 for (bytes = rep_bytes_by_first_byte (ch) - 1; bytes; bytes--)
771
+ − 4807 {
+ − 4808 int c = Lstream_getc (stream);
800
+ − 4809 text_checking_assert (c >= 0);
867
+ − 4810 *++strptr = (Ibyte) c;
771
+ − 4811 }
867
+ − 4812 return itext_ichar (str);
771
+ − 4813 }
+ − 4814
+ − 4815 int
867
+ − 4816 Lstream_fput_ichar (Lstream *stream, Ichar ch)
771
+ − 4817 {
867
+ − 4818 Ibyte str[MAX_ICHAR_LEN];
+ − 4819 Bytecount len = set_itext_ichar (str, ch);
771
+ − 4820 return Lstream_write (stream, str, len);
+ − 4821 }
+ − 4822
+ − 4823 void
867
+ − 4824 Lstream_funget_ichar (Lstream *stream, Ichar ch)
771
+ − 4825 {
867
+ − 4826 Ibyte str[MAX_ICHAR_LEN];
+ − 4827 Bytecount len = set_itext_ichar (str, ch);
771
+ − 4828 Lstream_unread (stream, str, len);
+ − 4829 }
+ − 4830
+ − 4831 #endif /* MULE */
+ − 4832
+ − 4833
+ − 4834 /************************************************************************/
+ − 4835 /* Lisp primitives for working with characters */
+ − 4836 /************************************************************************/
+ − 4837
+ − 4838 DEFUN ("make-char", Fmake_char, 2, 3, 0, /*
+ − 4839 Make a character from CHARSET and octets ARG1 and ARG2.
+ − 4840 ARG2 is required only for characters from two-dimensional charsets.
+ − 4841
+ − 4842 Each octet should be in the range 32 through 127 for a 96 or 96x96
+ − 4843 charset and 33 through 126 for a 94 or 94x94 charset. (Most charsets
+ − 4844 are either 96 or 94x94.) Note that this is 32 more than the values
+ − 4845 typically given for 94x94 charsets. When two octets are required, the
+ − 4846 order is "standard" -- the same as appears in ISO-2022 encodings,
+ − 4847 reference tables, etc.
+ − 4848
+ − 4849 \(Note the following non-obvious result: Computerized translation
+ − 4850 tables often encode the two octets as the high and low bytes,
+ − 4851 respectively, of a hex short, while when there's only one octet, it
+ − 4852 goes in the low byte. When decoding such a value, you need to treat
+ − 4853 the two cases differently when calling make-char: One is (make-char
+ − 4854 CHARSET HIGH LOW), the other is (make-char CHARSET LOW).)
+ − 4855
+ − 4856 For example, (make-char 'latin-iso8859-2 185) or (make-char
+ − 4857 'latin-iso8859-2 57) will return the Latin 2 character s with caron.
+ − 4858
+ − 4859 As another example, the Japanese character for "kawa" (stream), which
+ − 4860 looks something like this:
+ − 4861
+ − 4862 | |
+ − 4863 | | |
+ − 4864 | | |
+ − 4865 | | |
+ − 4866 / |
+ − 4867
+ − 4868 appears in the Unicode Standard (version 2.0) on page 7-287 with the
+ − 4869 following values (see also page 7-4):
+ − 4870
+ − 4871 U 5DDD (Unicode)
+ − 4872 G 0-2008 (GB 2312-80)
+ − 4873 J 0-3278 (JIS X 0208-1990)
+ − 4874 K 0-8425 (KS C 5601-1987)
+ − 4875 B A474 (Big Five)
+ − 4876 C 1-4455 (CNS 11643-1986 (1st plane))
+ − 4877 A 213C34 (ANSI Z39.64-1989)
+ − 4878
+ − 4879 These are equivalent to:
+ − 4880
+ − 4881 \(make-char 'chinese-gb2312 52 40)
+ − 4882 \(make-char 'japanese-jisx0208 64 110)
+ − 4883 \(make-char 'korean-ksc5601 116 57)
+ − 4884 \(make-char 'chinese-cns11643-1 76 87)
+ − 4885 \(decode-big5-char '(164 . 116))
+ − 4886
+ − 4887 \(All codes above are two decimal numbers except for Big Five and ANSI
+ − 4888 Z39.64, which we don't support. We add 32 to each of the decimal
+ − 4889 numbers. Big Five is split in a rather hackish fashion into two
+ − 4890 charsets, `big5-1' and `big5-2', due to its excessive size -- 94x157,
+ − 4891 with the first codepoint in the range 0xA1 to 0xFE and the second in
+ − 4892 the range 0x40 to 0x7E or 0xA1 to 0xFE. `decode-big5-char' is used to
+ − 4893 generate the char from its codes, and `encode-big5-char' extracts the
+ − 4894 codes.)
+ − 4895
+ − 4896 When compiled without MULE, this function does not do much, but it's
+ − 4897 provided for compatibility. In this case, the following CHARSET symbols
+ − 4898 are allowed:
+ − 4899
+ − 4900 `ascii' -- ARG1 should be in the range 0 through 127.
+ − 4901 `control-1' -- ARG1 should be in the range 128 through 159.
+ − 4902 else -- ARG1 is coerced to be between 0 and 255, and then the high
+ − 4903 bit is set.
+ − 4904
+ − 4905 `int-to-char of the resulting ARG1' is returned, and ARG2 is always ignored.
+ − 4906 */
2333
+ − 4907 (charset, arg1, USED_IF_MULE (arg2)))
771
+ − 4908 {
+ − 4909 #ifdef MULE
+ − 4910 Lisp_Charset *cs;
+ − 4911 int a1, a2;
+ − 4912 int lowlim, highlim;
+ − 4913
+ − 4914 charset = Fget_charset (charset);
+ − 4915 cs = XCHARSET (charset);
+ − 4916
788
+ − 4917 get_charset_limits (charset, &lowlim, &highlim);
771
+ − 4918
+ − 4919 CHECK_INT (arg1);
+ − 4920 /* It is useful (and safe, according to Olivier Galibert) to strip
+ − 4921 the 8th bit off ARG1 and ARG2 because it allows programmers to
+ − 4922 write (make-char 'latin-iso8859-2 CODE) where code is the actual
+ − 4923 Latin 2 code of the character. */
+ − 4924 a1 = XINT (arg1) & 0x7f;
+ − 4925 if (a1 < lowlim || a1 > highlim)
+ − 4926 args_out_of_range_3 (arg1, make_int (lowlim), make_int (highlim));
+ − 4927
+ − 4928 if (CHARSET_DIMENSION (cs) == 1)
+ − 4929 {
+ − 4930 if (!NILP (arg2))
+ − 4931 invalid_argument
+ − 4932 ("Charset is of dimension one; second octet must be nil", arg2);
867
+ − 4933 return make_char (make_ichar (charset, a1, 0));
771
+ − 4934 }
+ − 4935
+ − 4936 CHECK_INT (arg2);
+ − 4937 a2 = XINT (arg2) & 0x7f;
+ − 4938 if (a2 < lowlim || a2 > highlim)
+ − 4939 args_out_of_range_3 (arg2, make_int (lowlim), make_int (highlim));
+ − 4940
867
+ − 4941 return make_char (make_ichar (charset, a1, a2));
771
+ − 4942 #else
+ − 4943 int a1;
+ − 4944 int lowlim, highlim;
+ − 4945
+ − 4946 if (EQ (charset, Qascii)) lowlim = 0, highlim = 127;
+ − 4947 else if (EQ (charset, Qcontrol_1)) lowlim = 0, highlim = 31;
+ − 4948 else lowlim = 0, highlim = 127;
+ − 4949
+ − 4950 CHECK_INT (arg1);
+ − 4951 /* It is useful (and safe, according to Olivier Galibert) to strip
+ − 4952 the 8th bit off ARG1 and ARG2 because it allows programmers to
+ − 4953 write (make-char 'latin-iso8859-2 CODE) where code is the actual
+ − 4954 Latin 2 code of the character. */
+ − 4955 a1 = XINT (arg1) & 0x7f;
+ − 4956 if (a1 < lowlim || a1 > highlim)
+ − 4957 args_out_of_range_3 (arg1, make_int (lowlim), make_int (highlim));
+ − 4958
+ − 4959 if (EQ (charset, Qascii))
+ − 4960 return make_char (a1);
+ − 4961 return make_char (a1 + 128);
+ − 4962 #endif /* MULE */
+ − 4963 }
+ − 4964
+ − 4965 #ifdef MULE
+ − 4966
+ − 4967 DEFUN ("char-charset", Fchar_charset, 1, 1, 0, /*
+ − 4968 Return the character set of char CH.
+ − 4969 */
+ − 4970 (ch))
+ − 4971 {
+ − 4972 CHECK_CHAR_COERCE_INT (ch);
+ − 4973
826
+ − 4974 return XCHARSET_NAME (charset_by_leading_byte
867
+ − 4975 (ichar_leading_byte (XCHAR (ch))));
771
+ − 4976 }
+ − 4977
+ − 4978 DEFUN ("char-octet", Fchar_octet, 1, 2, 0, /*
+ − 4979 Return the octet numbered N (should be 0 or 1) of char CH.
+ − 4980 N defaults to 0 if omitted.
+ − 4981 */
+ − 4982 (ch, n))
+ − 4983 {
+ − 4984 Lisp_Object charset;
+ − 4985 int octet0, octet1;
+ − 4986
+ − 4987 CHECK_CHAR_COERCE_INT (ch);
+ − 4988
867
+ − 4989 BREAKUP_ICHAR (XCHAR (ch), charset, octet0, octet1);
771
+ − 4990
+ − 4991 if (NILP (n) || EQ (n, Qzero))
+ − 4992 return make_int (octet0);
+ − 4993 else if (EQ (n, make_int (1)))
+ − 4994 return make_int (octet1);
+ − 4995 else
+ − 4996 invalid_constant ("Octet number must be 0 or 1", n);
+ − 4997 }
+ − 4998
+ − 4999 DEFUN ("split-char", Fsplit_char, 1, 1, 0, /*
+ − 5000 Return list of charset and one or two position-codes of CHAR.
+ − 5001 */
+ − 5002 (character))
+ − 5003 {
+ − 5004 /* This function can GC */
+ − 5005 struct gcpro gcpro1, gcpro2;
+ − 5006 Lisp_Object charset = Qnil;
+ − 5007 Lisp_Object rc = Qnil;
+ − 5008 int c1, c2;
+ − 5009
+ − 5010 GCPRO2 (charset, rc);
+ − 5011 CHECK_CHAR_COERCE_INT (character);
+ − 5012
867
+ − 5013 BREAKUP_ICHAR (XCHAR (character), charset, c1, c2);
771
+ − 5014
+ − 5015 if (XCHARSET_DIMENSION (Fget_charset (charset)) == 2)
+ − 5016 {
+ − 5017 rc = list3 (XCHARSET_NAME (charset), make_int (c1), make_int (c2));
+ − 5018 }
+ − 5019 else
+ − 5020 {
+ − 5021 rc = list2 (XCHARSET_NAME (charset), make_int (c1));
+ − 5022 }
+ − 5023 UNGCPRO;
+ − 5024
+ − 5025 return rc;
+ − 5026 }
+ − 5027
+ − 5028 #endif /* MULE */
+ − 5029
+ − 5030
+ − 5031 /************************************************************************/
+ − 5032 /* composite character functions */
+ − 5033 /************************************************************************/
+ − 5034
+ − 5035 #ifdef ENABLE_COMPOSITE_CHARS
+ − 5036
867
+ − 5037 Ichar
+ − 5038 lookup_composite_char (Ibyte *str, int len)
771
+ − 5039 {
+ − 5040 Lisp_Object lispstr = make_string (str, len);
+ − 5041 Lisp_Object ch = Fgethash (lispstr,
+ − 5042 Vcomposite_char_string2char_hash_table,
+ − 5043 Qunbound);
867
+ − 5044 Ichar emch;
771
+ − 5045
+ − 5046 if (UNBOUNDP (ch))
+ − 5047 {
+ − 5048 if (composite_char_row_next >= 128)
+ − 5049 invalid_operation ("No more composite chars available", lispstr);
867
+ − 5050 emch = make_ichar (Vcharset_composite, composite_char_row_next,
771
+ − 5051 composite_char_col_next);
+ − 5052 Fputhash (make_char (emch), lispstr,
+ − 5053 Vcomposite_char_char2string_hash_table);
+ − 5054 Fputhash (lispstr, make_char (emch),
+ − 5055 Vcomposite_char_string2char_hash_table);
+ − 5056 composite_char_col_next++;
+ − 5057 if (composite_char_col_next >= 128)
+ − 5058 {
+ − 5059 composite_char_col_next = 32;
+ − 5060 composite_char_row_next++;
+ − 5061 }
+ − 5062 }
+ − 5063 else
+ − 5064 emch = XCHAR (ch);
+ − 5065 return emch;
+ − 5066 }
+ − 5067
+ − 5068 Lisp_Object
867
+ − 5069 composite_char_string (Ichar ch)
771
+ − 5070 {
+ − 5071 Lisp_Object str = Fgethash (make_char (ch),
+ − 5072 Vcomposite_char_char2string_hash_table,
+ − 5073 Qunbound);
+ − 5074 assert (!UNBOUNDP (str));
+ − 5075 return str;
+ − 5076 }
+ − 5077
826
+ − 5078 DEFUN ("make-composite-char", Fmake_composite_char, 1, 1, 0, /*
771
+ − 5079 Convert a string into a single composite character.
+ − 5080 The character is the result of overstriking all the characters in
+ − 5081 the string.
+ − 5082 */
+ − 5083 (string))
+ − 5084 {
+ − 5085 CHECK_STRING (string);
+ − 5086 return make_char (lookup_composite_char (XSTRING_DATA (string),
+ − 5087 XSTRING_LENGTH (string)));
+ − 5088 }
+ − 5089
826
+ − 5090 DEFUN ("composite-char-string", Fcomposite_char_string, 1, 1, 0, /*
771
+ − 5091 Return a string of the characters comprising a composite character.
+ − 5092 */
+ − 5093 (ch))
+ − 5094 {
867
+ − 5095 Ichar emch;
771
+ − 5096
+ − 5097 CHECK_CHAR (ch);
+ − 5098 emch = XCHAR (ch);
867
+ − 5099 if (ichar_leading_byte (emch) != LEADING_BYTE_COMPOSITE)
771
+ − 5100 invalid_argument ("Must be composite char", ch);
+ − 5101 return composite_char_string (emch);
+ − 5102 }
+ − 5103 #endif /* ENABLE_COMPOSITE_CHARS */
+ − 5104
+ − 5105
+ − 5106 /************************************************************************/
+ − 5107 /* initialization */
+ − 5108 /************************************************************************/
+ − 5109
+ − 5110 void
1204
+ − 5111 reinit_eistring_early (void)
771
+ − 5112 {
+ − 5113 the_eistring_malloc_zero_init = the_eistring_zero_init;
+ − 5114 the_eistring_malloc_zero_init.mallocp_ = 1;
+ − 5115 }
+ − 5116
/* One-time early initialization of the eistring templates.  All of the
   work is shared with reinit_eistring_early (), so simply delegate. */
void
init_eistring_once_early (void)
{
  reinit_eistring_early ();
}
+ − 5122
+ − 5123 void
771
+ − 5124 syms_of_text (void)
+ − 5125 {
+ − 5126 DEFSUBR (Fmake_char);
+ − 5127
+ − 5128 #ifdef MULE
+ − 5129 DEFSUBR (Fchar_charset);
+ − 5130 DEFSUBR (Fchar_octet);
+ − 5131 DEFSUBR (Fsplit_char);
+ − 5132
+ − 5133 #ifdef ENABLE_COMPOSITE_CHARS
+ − 5134 DEFSUBR (Fmake_composite_char);
+ − 5135 DEFSUBR (Fcomposite_char_string);
+ − 5136 #endif
+ − 5137 #endif /* MULE */
+ − 5138 }
+ − 5139
+ − 5140 void
+ − 5141 reinit_vars_of_text (void)
+ − 5142 {
+ − 5143 int i;
+ − 5144
867
+ − 5145 conversion_in_dynarr_list = Dynarr_new2 (Ibyte_dynarr_dynarr,
+ − 5146 Ibyte_dynarr *);
771
+ − 5147 conversion_out_dynarr_list = Dynarr_new2 (Extbyte_dynarr_dynarr,
+ − 5148 Extbyte_dynarr *);
+ − 5149
+ − 5150 for (i = 0; i <= MAX_BYTEBPOS_GAP_SIZE_3; i++)
+ − 5151 three_to_one_table[i] = i / 3;
+ − 5152 }
+ − 5153
+ − 5154 void
+ − 5155 vars_of_text (void)
+ − 5156 {
1292
+ − 5157 QSin_char_byte_conversion = build_msg_string ("(in char-byte conversion)");
+ − 5158 staticpro (&QSin_char_byte_conversion);
+ − 5159 QSin_internal_external_conversion =
+ − 5160 build_msg_string ("(in internal-external conversion)");
+ − 5161 staticpro (&QSin_internal_external_conversion);
+ − 5162
771
+ − 5163 #ifdef ENABLE_COMPOSITE_CHARS
+ − 5164 /* #### not dumped properly */
+ − 5165 composite_char_row_next = 32;
+ − 5166 composite_char_col_next = 32;
+ − 5167
+ − 5168 Vcomposite_char_string2char_hash_table =
+ − 5169 make_lisp_hash_table (500, HASH_TABLE_NON_WEAK, HASH_TABLE_EQUAL);
+ − 5170 Vcomposite_char_char2string_hash_table =
+ − 5171 make_lisp_hash_table (500, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
+ − 5172 staticpro (&Vcomposite_char_string2char_hash_table);
+ − 5173 staticpro (&Vcomposite_char_char2string_hash_table);
+ − 5174 #endif /* ENABLE_COMPOSITE_CHARS */
+ − 5175 }