Mercurial > hg > xemacs-beta
annotate src/text.c @ 5774:7a538e1a4676
Use skip_ascii() in no_conversion_convert() when encoding.
src/ChangeLog addition:
2013-12-19 Aidan Kehoe <kehoea@parhasard.net>
* text.c:
* text.h:
* text.h (skip_ascii):
Move skip_ascii (), the very fast inline function from the
bytecount-to-charcount code, to text.h, to allow the coding
systems to use it too as needed.
* file-coding.c (no_conversion_convert):
Use skip_ascii() as appropriate here, halving the time taken to
write large files in my tests (again, relevant to VM buffers, but
not a panacea to our issues with them.)
author | Aidan Kehoe <kehoea@parhasard.net> |
---|---|
date | Thu, 19 Dec 2013 18:13:11 +0000 |
parents | 56144c8593a8 |
children | 0cb4f494a548 |
rev | line source |
---|---|
2367 | 1 /* Text manipulation primitives for XEmacs. |
771 | 2 Copyright (C) 1995 Sun Microsystems, Inc. |
2367 | 3 Copyright (C) 1995, 1996, 2000, 2001, 2002, 2003, 2004 Ben Wing. |
771 | 4 Copyright (C) 1999 Martin Buchholz. |
5 | |
6 This file is part of XEmacs. | |
7 | |
5402
308d34e9f07d
Changed bulk of GPLv2 or later files identified by script
Mats Lidell <matsl@xemacs.org>
parents:
5191
diff
changeset
|
8 XEmacs is free software: you can redistribute it and/or modify it |
771 | 9 under the terms of the GNU General Public License as published by the |
5402
308d34e9f07d
Changed bulk of GPLv2 or later files identified by script
Mats Lidell <matsl@xemacs.org>
parents:
5191
diff
changeset
|
10 Free Software Foundation, either version 3 of the License, or (at your |
308d34e9f07d
Changed bulk of GPLv2 or later files identified by script
Mats Lidell <matsl@xemacs.org>
parents:
5191
diff
changeset
|
11 option) any later version. |
771 | 12 |
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT | |
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
16 for more details. | |
17 | |
18 You should have received a copy of the GNU General Public License | |
5402
308d34e9f07d
Changed bulk of GPLv2 or later files identified by script
Mats Lidell <matsl@xemacs.org>
parents:
5191
diff
changeset
|
19 along with XEmacs. If not, see <http://www.gnu.org/licenses/>. */ |
771 | 20 |
21 /* Synched up with: Not in FSF. */ | |
22 | |
23 /* Authorship: | |
24 */ | |
25 | |
26 #include <config.h> | |
27 #include "lisp.h" | |
28 | |
29 #include "buffer.h" | |
30 #include "charset.h" | |
31 #include "file-coding.h" | |
32 #include "lstream.h" | |
1292 | 33 #include "profile.h" |
771 | 34 |
35 | |
36 /************************************************************************/ | |
37 /* long comments */ | |
38 /************************************************************************/ | |
39 | |
2367 | 40 /* NB: Everything below was written by Ben Wing except as otherwise noted. */ |
41 | |
42 /************************************************************************/ | |
43 /* */ | |
44 /* */ | |
45 /* Part A: More carefully-written documentation */ | |
46 /* */ | |
47 /* */ | |
48 /************************************************************************/ | |
49 | |
50 /* Authorship: Ben Wing | |
51 | |
771 | 52 |
826 | 53 ========================================================================== |
2367 | 54 7. Handling non-default formats |
826 | 55 ========================================================================== |
771 | 56 |
2367 | 57 We support, at least to some extent, formats other than the default |
58 variable-width format, for speed; all of these alternative formats are | |
59 fixed-width. Currently we only handle these non-default formats in | |
60 buffers, because access to their text is strictly controlled and thus | |
61 the details of the format mostly compartmentalized. The only really | |
62 tricky part is the search code -- the regex, Boyer-Moore, and | |
63 simple-search algorithms in search.c and regex.c. All other code that | |
64 knows directly about the buffer representation is the basic code to | |
65 modify or retrieve the buffer text. | |
66 | |
67 Supporting fixed-width formats in Lisp strings is harder, but possible | |
68 -- FSF currently does this, for example. In this case, however, | |
69 probably only 8-bit-fixed is reasonable for Lisp strings -- getting | |
70 non-ASCII-compatible fixed-width formats to work is much, much harder | |
71 because a lot of code assumes that strings are ASCII-compatible | |
72 (i.e. ASCII + other characters represented exclusively using high-bit | |
73 bytes) and a lot of code mixes Lisp strings and non-Lisp strings freely. | |
74 | |
75 The different possible fixed-width formats are 8-bit fixed, 16-bit | |
76 fixed, and 32-bit fixed. The latter can represent all possible | |
77 characters, but at a substantial memory penalty. The other two can | |
78 represent only a subset of the possible characters. How these subsets | |
79 are defined can be simple or very tricky. | |
80 | |
81 Currently we support only the default format and the 8-bit fixed format, | |
82 and in the latter, we only allow these to be the first 256 characters in | |
83 an Ichar (ASCII and Latin 1). | |
84 | |
85 One reasonable approach for 8-bit fixed is to allow the upper half to | |
86 represent any 1-byte charset, which is specified on a per-buffer basis. | |
87 This should work fairly well in practice since most documents are in | |
88 only one foreign language (possibly with some English mixed in). I | |
89 think FSF does something like this; or at least, they have something | |
90 called nonascii-translation-table and use it when converting from | |
91 8-bit-fixed text ("unibyte text") to default text ("multibyte text"). | |
92 With 16-bit fixed, you could do something like assign chunks of the 64K | |
93 worth of characters to charsets as they're encountered in documents. | |
94 This should work well with most Asian documents. | |
95 | |
96 If/when we switch to using Unicode internally, we might have formats more | |
97 like this: | |
98 | |
99 -- UTF-8 or some extension as the default format. Perl uses an | |
100 extension that handles 64-bit chars and requires as much as 13 bytes per | |
101 char, vs. the standard of 31-bit chars and 6 bytes max. UTF-8 has the | |
102 same basic properties as our own variable-width format (see text.c, | |
103 Internal String Encoding) and so most code would not need to be changed. | |
104 | |
105 -- UTF-16 as a "pseudo-fixed" format (i.e. 16-bit fixed plus surrogates | |
106 for representing characters not in the BMP, aka >= 65536). The vast | |
107 majority of documents will have no surrogates in them so byte/char | |
108 conversion will be very fast. | |
109 | |
110 -- an 8-bit fixed format, like currently. | |
111 | |
112 -- possibly, UCS-4 as a 32-bit fixed format. | |
113 | |
114 The fixed-width formats essentially treat the buffer as an array of | |
115 8-bit, 16-bit or 32-bit integers. This means that how they are stored | |
116 in memory (in particular, big-endian or little-endian) depends on the | |
117 native format of the machine's processor. It also means we have to | |
118 worry a bit about alignment (basically, we just need to keep the gap an | |
119 integral multiple of the character size, and get things aligned properly | |
120 when converting the buffer between formats). | |
826 | 121 |
122 ========================================================================== | |
2367 | 123 8. Using UTF-16 as the default text format |
826 | 124 ========================================================================== |
125 | |
2367 | 126 NOTE: The Eistring API is (or should be) Mule-correct even without |
127 an ASCII-compatible internal representation. | |
128 | |
129 #### Currently, the assumption that text units are one byte in size is | |
130 embedded throughout XEmacs, and `Ibyte *' is used where `Itext *' should | |
131 be. The way to fix this is to (among other things) | |
132 | |
133 (a) review all places referencing `Ibyte' and `Ibyte *', change them to | |
134 use Itext, and fix up the code. | |
135 (b) change XSTRING_DATA to be of type Itext * | |
136 (c) review all uses of XSTRING_DATA | |
137 (d) eliminate XSTRING_LENGTH, splitting it into XSTRING_BYTE_LENGTH and | |
138 XSTRING_TEXT_LENGTH and reviewing all places referencing this | |
139 (e) make similar changes to other API's that refer to the "length" of | |
140 something, such as qxestrlen() and eilen() | |
141 (f) review all uses of `CIbyte *'. Currently this is usually a way of | |
142 passing literal ASCII text strings in places that want internal text. | |
143 Either create separate _ascii() and _itext() versions of the | |
144 functions taking CIbyte *, or make use of something like the | |
145 WEXTTEXT() macro, which will generate wide strings as appropriate. | |
146 (g) review all uses of Bytecount and see which ones should be Textcount. | |
147 (h) put in error-checking code that will be tripped as often as possible | |
148 when doing anything with internal text, and check to see that ASCII | |
149 text has not mistakenly filtered in. This should be fairly easy as | |
150 ASCII text will generally be entirely spaces and letters whereas every | |
151 second byte of Unicode text will generally be a null byte. Either we | |
152 abort if the second bytes are entirely letters and numbers, or, | |
153 perhaps better, do the equivalent of a non-MULE build, where we should | |
154 be dealing entirely with 8-bit characters, and assert that the high | |
155 bytes of each pair are null. | |
156 (i) review places where xmalloc() is called. If we convert each use of | |
157 xmalloc() to instead be xnew_array() or some other typed routine, | |
158 then we will find every place that allocates space for Itext and | |
159 assumes it is based on one-byte units. | |
160 (j) encourage the use of ITEXT_ZTERM_SIZE instead of '+ 1' whenever we | |
161 are adding space for a zero-terminator, to emphasize what we are | |
162 doing and make sure the calculations are correct. Similarly for | |
163 EXTTEXT_ZTERM_SIZE. | |
164 (k) Note that the qxestr*() functions, among other things, will need to | |
165 be rewritten. | |
166 | |
167 Note that this is a lot of work, and is not high on the list of priorities | |
168 currently. | |
826 | 169 |
170 ========================================================================== | |
2367 | 171 9. Miscellaneous |
826 | 172 ========================================================================== |
173 | |
174 A. Unicode Support | |
771 | 175 |
1292 | 176 Unicode support is very desirable. Currently we know how to handle |
177 externally-encoded Unicode data in various encodings -- UTF-16, UTF-8, | |
178 etc. However, we really need to represent Unicode characters internally | |
179 as-is, rather than converting to some language-specific character set. | |
180 For efficiency, we should represent Unicode characters using 3 bytes | |
181 rather than 4. This means we need to find leading bytes for Unicode. | |
182 Given that there are 65,536 characters in Unicode and we can attach | |
183 96x96 = 9,216 characters per leading byte, we need eight leading bytes | |
184 for Unicode. We currently have four free (0x9A - 0x9D), and with a | |
185 little bit of rearranging we can get five: ASCII doesn't really need to | |
186 take up a leading byte. (We could just as well use 0x7F, with a little | |
187 change to the functions that assume that 0x80 is the lowest leading | |
188 byte.) This means we still need to dump three leading bytes and move | |
189 them into private space. The CNS charsets are good candidates since | |
190 they are rarely used, and JAPANESE_JISX0208_1978 is becoming less and | |
191 less used and could also be dumped. | |
826 | 192 |
193 B. Composite Characters | |
194 | |
195 Composite characters are characters constructed by overstriking two | |
771 | 196 or more regular characters. |
197 | |
198 1) The old Mule implementation involves storing composite characters | |
199 in a buffer as a tag followed by all of the actual characters | |
200 used to make up the composite character. I think this is a bad | |
201 idea; it greatly complicates code that wants to handle strings | |
202 one character at a time because it has to deal with the possibility | |
203 of great big ungainly characters. It's much more reasonable to | |
204 simply store an index into a table of composite characters. | |
205 | |
206 2) The current implementation only allows for 16,384 separate | |
207 composite characters over the lifetime of the XEmacs process. | |
208 This could become a potential problem if the user | |
209 edited lots of different files that use composite characters. | |
210 Due to FSF bogosity, increasing the number of allowable | |
211 composite characters under Mule would decrease the number | |
212 of possible faces that can exist. Mule already has shrunk | |
213 this to 2048, and further shrinkage would become uncomfortable. | |
214 No such problems exist in XEmacs. | |
215 | |
3498 | 216 Composite characters could be represented as 0x8D C1 C2 C3, where each |
217 C[1-3] is in the range 0xA0 - 0xFF. This allows for slightly under | |
218 2^20 (one million) composite characters over the XEmacs process | |
219 lifetime. Or you could use 0x8D C1 C2 C3 C4, allowing for about 85 | |
220 million (slightly over 2^26) composite characters. | |
826 | 221 |
2367 | 222 ========================================================================== |
223 10. Internal API's | |
224 ========================================================================== | |
225 | |
226 All of these are documented in more detail in text.h. | |
227 | |
228 @enumerate | |
229 @item | |
230 Basic internal-format API's | |
231 | |
232 These are simple functions and macros to convert between text | |
233 representation and characters, move forward and back in text, etc. | |
234 | |
235 @item | |
236 The DFC API | |
237 | |
238 This is for conversion between internal and external text. Note that | |
239 there is also the "new DFC" API, which *returns* a pointer to the | |
240 converted text (in alloca space), rather than storing it into a | |
241 variable. | |
242 | |
243 @item | |
244 The Eistring API | |
245 | |
4073 | 246 \(This API is currently under-used) When doing simple things with |
2367 | 247 internal text, the basic internal-format API's are enough. But to do |
248 things like delete or replace a substring, concatenate various strings, | |
249 etc. is difficult to do cleanly because of the allocation issues. | |
250 The Eistring API is designed to deal with this, and provides a clean | |
251 way of modifying and building up internal text. (Note that the former | |
252 lack of this API has meant that some code uses Lisp strings to do | |
253 similar manipulations, resulting in excess garbage and increased | |
254 garbage collection.) | |
255 | |
256 NOTE: The Eistring API is (or should be) Mule-correct even without | |
257 an ASCII-compatible internal representation. | |
258 @end enumerate | |
259 | |
260 ========================================================================== | |
261 11. Other Sources of Documentation | |
262 ========================================================================== | |
263 | |
264 man/lispref/mule.texi | |
265 @enumerate | |
266 @item | |
267 another intro to characters, encodings, etc; #### Merge with the | |
268 above info | |
269 @item | |
270 documentation of ISO-2022 | |
271 @item | |
272 The charset and coding-system Lisp API's | |
273 @item | |
274 The CCL conversion language for writing encoding conversions | |
275 @item | |
276 The Latin-Unity package for unifying Latin charsets | |
277 @end enumerate | |
278 | |
279 man/internals/internals.texi (the Internals manual) | |
280 @enumerate | |
281 @item | |
282 "Coding for Mule" -- how to write Mule-aware code | |
283 @item | |
284 "Modules for Internationalization" | |
285 @item | |
286 "The Text in a Buffer" -- more about the different ways of | |
287 viewing buffer positions; #### Merge with the above info | |
288 @item | |
289 "MULE Character Sets and Encodings" -- yet another intro | |
290 to characters, encodings, etc; #### Merge with the | |
291 above info; also some documentation of Japanese EUC and JIS7, | |
292 and CCL internals | |
293 @end enumerate | |
294 | |
295 text.h -- info about specific XEmacs-C API's for handling internal and | |
296 external text | |
297 | |
298 intl-win32.c -- Windows-specific I18N information | |
299 | |
300 lisp.h -- some info appears alongside the definitions of the basic | |
301 character-related types | |
302 | |
303 unicode.c -- documentation about Unicode translation tables | |
826 | 304 */ |
771 | 305 |
2367 | 306 |
307 /************************************************************************/ | |
308 /* */ | |
309 /* */ | |
310 /* Part B: Random proposals for work to be done */ | |
311 /* */ | |
312 /* */ | |
313 /************************************************************************/ | |
314 | |
315 | |
316 /* | |
317 | |
318 | |
319 ========================================================================== | |
320 - Mule design issues (ben) | |
321 ========================================================================== | |
322 | |
323 circa 1999 | |
324 | |
325 Here is a more detailed list of Mule-related projects that we will be | |
326 working on. They are more or less ordered according to how we will | |
327 proceed, but it's not exact. In particular, there will probably be | |
328 time overlap among adjacent projects. | |
329 | |
330 @enumerate | |
331 @item | |
332 Modify the internal/external conversion macros to allow for | |
333 MS Windows support. | |
334 | |
335 @item | |
336 Modify the buffer macros to allow for more than one internal | |
337 representation, e.g. fixed width and variable width. | |
338 | |
339 @item | |
340 Review the existing Mule code, especially the lisp code, for code | |
341 quality issues and improve the cleanliness of it. Also work on | |
342 creating a specification for the Mule API. | |
343 | |
344 @item | |
345 Write some more automated mule tests. | |
346 | |
347 @item | |
348 Integrate Tomohiko's UTF-2000 code, fixing it up so that nothing is | |
349 broken when the UTF-2000 configure option is not enabled. | |
350 | |
351 @item | |
352 Fix up the MS Windows code to be Mule-correct, so that you can | |
353 compile with Mule support under MS windows and have a working | |
354 XEmacs, at least just with Latin-1. | |
355 | |
356 @item | |
357 Implement a scheme to guarantee no corruption of files, even with | |
358 an incorrect coding system - in particular, guarantee no corruption | |
359 of binary files. | |
360 | |
361 @item | |
362 Make the text property support in XEmacs robust with respect to | |
363 string and text operations, so that the `no corruption' support in | |
364 the previous entry works properly, even if a lot of cutting and | |
365 pasting is done. | |
366 | |
367 @item | |
368 Improve the handling of auto-detection so that, when there is any | |
369 possibility at all of mistake, the user is informed of the detected | |
370 encoding and given the choice of choosing other possibilities. | |
371 | |
372 @item | |
373 Improve the support for different language environments in XEmacs, | |
374 for example, the priority of coding systems used in auto-detection | |
375 should properly reflect the language environment. This probably | |
376 necessitates rethinking the current `coding system priority' | |
377 scheme. | |
378 | |
379 @item | |
380 Do quality work to improve the existing UTF-2000 implementation. | |
381 | |
382 @item | |
383 Implement preliminary support for 8-bit fixed width | |
384 representation. First, we will only implement 7-bit support, and | |
385 will fall back to variable width as soon as any non-ASCII | |
386 character is encountered. Then we will improve the support to | |
387 handle an arbitrary character set in the upper half of the 8-bit space. | |
388 | |
389 @item | |
390 Investigate any remaining hurdles to making --with-mule be the | |
391 default configure option. | |
392 @end enumerate | |
393 | |
394 ========================================================================== | |
395 - Mule design issues (stephen) | |
396 ========================================================================== | |
397 | |
398 What I see as Mule priorities (in rough benefit order, I am not taking | |
399 account of difficulty, nor the fact that some - eg 8 & 10 - will | |
400 probably come as packages): | |
401 | |
402 @enumerate | |
403 @item | |
404 Fix the autodetect problem (by making the coding priority list | |
405 user-configurable, as short as he likes, even null, with "binary" | |
406 as the default). | |
407 @item | |
408 Document the language environments and other Mule "APIs" as | |
409 implemented (since there is no real design spec). Check to see | |
410 how and where they are broken. | |
411 @item | |
412 Make the Mule menu useful to non-ISO-2022-literate folks. | |
413 @item | |
414 Redo the lstreams stuff to make it easy and robust to "pipeline", | |
415 eg, libz | gnupg | jis2mule. | |
416 @item | |
417 Make Custom Mule-aware. (This probably depends on a sensible | |
418 fonts model.) | |
419 @item | |
420 Implement the "literal byte stream" memory feature. | |
421 @item | |
422 Study the FSF implementation of Mule for background for 7 & 8. | |
423 @item | |
424 Identify desirable Mule features (eg, i18n-ized messages as above, | |
425 collating tables by language environment, etc). (New features | |
426 might have priority as high as 9.) | |
427 @item | |
428 Specify Mule UIs, APIs, etc, and design and (re)implement them. | |
429 @item | |
430 Implement the 8-bit-wide buffer optimization. | |
431 @item | |
432 Move the internal encoding to UTF-32 (subject to Olivier's caveats | |
433 regarding compose characters), with the variable-width char | |
434 buffers using UTF-8. | |
435 @item | |
436 Implement the 16- and 32-bit-wide buffer optimizations. | |
437 @end enumerate | |
438 | |
439 ========================================================================== | |
440 - Mule design issues "short term" (ben) | |
441 ========================================================================== | |
442 | |
443 @enumerate | |
444 @item | |
445 Finish changes in fixup/directory, get in CVS. | |
446 | |
447 (Test with and without "quick-build", to see if really faster) | |
448 (need autoconf) | |
449 | |
450 @item | |
451 Finish up Windows/Mule changes. Outline of this elsewhere; Do | |
452 *minimal* effort. | |
453 | |
454 @item | |
455 Continue work on Windows stability, e.g. go through existing notes | |
456 on Windows Mule-ization + extract all info. | |
457 | |
458 @item | |
459 Get Unicode translation tables integrated. | |
460 | |
461 Finish UCS2/UTF16 coding system. | |
462 | |
463 @item | |
464 Make sure coding system priority list is language-environment specific. | |
465 | |
466 @item | |
467 Consider moving language selection Menu up to be parallel with Mule menu. | |
468 | |
469 @item | |
470 Check to make sure we grok the default locale at startup under | |
471 Windows and understand the Windows locales. Finish implementation | |
472 of mswindows-multibyte and make sure it groks all the locales. | |
473 | |
474 @item | |
475 Do the above as best as we can without using Unicode tables. | |
476 | |
477 @item | |
478 Start tagging all text with a language text property, | |
479 indicating the current language environment when the text was input. | |
480 | |
481 @item | |
482 Make sure we correctly accept input of non-ASCII chars | |
483 (probably already do!) | |
484 | |
485 @item | |
486 Implement active language/keyboard switching under Windows. | |
487 | |
488 @item | |
489 Look into implementing support for "MS IME" protocol (Microsoft | |
490 fancy built-in Asian input methods). | |
491 | |
492 @item | |
493 Redo implementation of mswindows-multibyte and internal display to | |
494 entirely use translation to/from Unicode for increased accuracy. | |
495 | |
496 @item | |
497 Implement buf<->char improvements from FSF. Also implement | |
498 my string byte<->char optimization structure. | |
499 | |
500 @item | |
501 Integrate all Mule DOCS from 20.6 or 21.0. Try to add sections | |
502 for what we've added. | |
503 | |
504 @item | |
505 Implement 8-bit fixed width optimizations. Then work on 16-bit. | |
506 @end enumerate | |
507 | |
508 ========================================================================== | |
509 - Mule design issues (more) (ben) | |
510 ========================================================================== | |
511 | |
512 Get minimal Mule for Windows working using Ikeyama's patches. At | |
513 first, rely on his conversion of internal -> external | |
514 locale-specific but very soon (as soon as we get translation | |
515 tables) can switch to using Unicode versions of display funs, which | |
516 will allow many more charsets to be handled and in a more | |
517 consistent fashion. | |
518 | |
519 i.e. to convert an internal string to an external format, at first | |
520 we use our own knowledge of the Microsoft locale file formats but | |
521 an alternative is to convert to Unicode and use Microsoft's | |
522 convert-Unicode-to-locale encoding functions. This gains us a | |
523 great deal of generality, since in practice all charset caching | |
524 points can be wrapped into Unicode caching points. | |
525 | |
526 This requires adding UCS2 support, which I'm doing. This support | |
527 would let us convert internal -> Unicode, which is exactly what we | |
528 want. | |
529 | |
530 At first, though, I would do the UCS2 support, but leave the | |
531 existing way of doing things in redisplay. Meanwhile, I'd go | |
532 through and fix up the places in the code that assume we are | |
533 dealing with unibytes. | |
534 | |
535 After this, the font problems will be fixed, and we should have a | |
536 pretty well working XEmacs + MULE under Windows. The only real | |
537 other work is the clipboard code, which should be straightforward. | |
538 | |
539 ========================================================================== | |
540 - Mule design discussion | |
541 ========================================================================== | |
542 | |
543 -------------------------------------------------------------------------- | |
544 | |
545 Ben | |
546 | |
547 April 11, 2000 | |
548 | |
549 Well yes, this was the whole point of my "no lossage" proposal of being | |
550 able to undo any coding-system transformation on a buffer. The idea was | |
5384
3889ef128488
Fix misspelled words, and some grammar, across the entire source tree.
Jerry James <james@xemacs.org>
parents:
5191
diff
changeset
|
551 to figure out which transformations were definitely reversible, and for |
2367 | 552 all the others, cache the original text in a text property. This way, you |
553 could probably still do a fairly good job at constructing a good reversal | |
554 even after you've gone into the text and added, deleted, and rearranged | |
555 some things. | |
556 | |
557 But you could implement it much more simply and usefully by just | |
558 determining, for any text being decoded into mule-internal, can we go back | |
559 and read the source again? If not, remember the entire file (GNUS | |
560 message, etc) in text properties. Then, implement the UI interface (like | |
561 Netscape's) on top of that. This way, you have something that at least | |
562 works, but it might be inefficient. All we would need to do is work on | |
563 making the | |
564 underlying implementation more efficient. | |
565 | |
566 Are you interested in doing this? It would be a huge win for users. | |
567 Hrvoje Niksic wrote: | |
568 | |
569 > Ben Wing <ben@666.com> writes: | |
570 > | |
571 > > let me know exactly what "rethink" functionality you want and i'll | |
572 > > come up with an interface. perhaps you just want something like | |
573 > > netscape's encoding menu, where if you switch encodings, it reloads | |
574 > > and reencodes? | |
575 > | |
576 > It might be a bit more complex than that. In many cases, it's hard or | |
577 > impossible to meaningfully "reload" -- for instance, this | |
578 > functionality should be available while editing a Gnus message, as | |
579 > well as while visiting a file. | |
580 > | |
581 > For the special case of Latin-N <-> Latin-M conversion, things could | |
582 > be done easily -- to convert from N to M, you only need to convert | |
583 > internal representation back to N, and then convert it forth to M. | |
584 | |
585 -------------------------------------------------------------------------- | |
586 April 11, 2000 | |
587 | |
588 Well yes, this was the whole point of my "no lossage" proposal of being | |
589 able to undo any coding-system transformation on a buffer. The idea was | |
5384
3889ef128488
Fix misspelled words, and some grammar, across the entire source tree.
Jerry James <james@xemacs.org>
parents:
5191
diff
changeset
|
590 to figure out which transformations were definitely reversible, and for |
2367 | 591 all the others, cache the original text in a text property. This way, you |
592 could probably still do a fairly good job at constructing a good reversal | |
593 even after you've gone into the text and added, deleted, and rearranged | |
594 some things. | |
595 | |
596 But you could implement it much more simply and usefully by just | |
597 determining, for any text being decoded into mule-internal, can we go back | |
598 and read the source again? If not, remember the entire file (GNUS | |
599 message, etc) in text properties. Then, implement the UI interface (like | |
600 Netscape's) on top of that. This way, you have something that at least | |
601 works, but it might be inefficient. All we would need to do is work on | |
602 making the | |
603 underlying implementation more efficient. | |
604 | |
605 Are you interested in doing this? It would be a huge win for users. | |
606 Hrvoje Niksic wrote: | |
607 | |
608 > Ben Wing <ben@666.com> writes: | |
609 > | |
610 > > let me know exactly what "rethink" functionality you want and i'll | |
611 > > come up with an interface. perhaps you just want something like | |
612 > > netscape's encoding menu, where if you switch encodings, it reloads | |
613 > > and reencodes? | |
614 > | |
615 > It might be a bit more complex than that. In many cases, it's hard or | |
616 > impossible to meaningfully "reload" -- for instance, this | |
617 > functionality should be available while editing a Gnus message, as | |
618 > well as while visiting a file. | |
619 > | |
620 > For the special case of Latin-N <-> Latin-M conversion, things could | |
621 > be done easily -- to convert from N to M, you only need to convert | |
622 > internal representation back to N, and then convert it forth to M. | |
623 | |
624 | |
625 ------------------------------------------------------------------------ | |
626 | |
627 ========================================================================== | |
628 - Redoing translation macros [old] | |
629 ========================================================================== | |
630 | |
631 Currently the translation macros (the macros with names such as | |
632 GET_C_STRING_CTEXT_DATA_ALLOCA) have names that are difficult to parse | |
633 or remember, and are not all that general. In the process of | |
634 reviewing the Windows code so that it could be muleized, I discovered | |
635 that these macros need to be extended in various ways to allow for | |
636 the Windows code to be easily muleized. | |
637 | |
638 Since the macros needed to be changed anyways, I figured it would be a | |
639 good time to redo them properly. I propose new macros which have | |
640 names like this: | |
641 | |
642 @itemize @bullet | |
643 @item | |
644 <A>_TO_EXTERNAL_FORMAT_<B> | |
645 @item | |
646 <A>_TO_EXTERNAL_FORMAT_<B>_1 | |
647 @item | |
648 <C>_TO_INTERNAL_FORMAT_<D> | |
649 @item | |
650 <C>_TO_INTERNAL_FORMAT_<D>_1 | |
651 @end itemize | |
652 | |
653 A and C represent the source of the data, and B and D represent the | |
654 sink of the data. | |
655 | |
656 All of these macros call either the functions | |
657 convert_to_external_format or convert_to_internal_format internally, | |
658 with some massaging of the arguments. | |
659 | |
660 All of these macros take the following arguments: | |
661 | |
662 @itemize @bullet | |
663 @item | |
664 First, one or two arguments indicating the source of the data. | |
665 @item | |
666 Second, an argument indicating the coding system. (In order to avoid | |
667 an excessive number of macros, we no longer provide separate macros | |
668 for specific coding systems.) | |
669 @item | |
670 Third, one or two arguments indicating the sink of the data. | |
671 @item | |
672 Fourth, optionally, arguments indicating the error behavior and the | |
673 warning class (these arguments are only present in the _1 versions | |
674 of the macros). The other, shorter named macros are trivial | |
675 interfaces onto these macros with the error behavior being | |
676 ERROR_ME_WARN, with the warning class being Vstandard_warning_class. | |
677 @end itemize | |
678 | |
679 <A> can be one of the following: | |
680 @itemize @bullet | |
681 @item | |
682 LISP (which means a Lisp string) Takes one argument, a Lisp Object. | |
683 @item | |
684 LSTREAM (which indicates an lstream) Takes one argument, an | |
685 lstream. The data is read from the lstream until EOF is reached. | |
686 @item | |
687 DATA (which indicates a raw memory area) Takes two arguments, a | |
688 pointer and a length in bytes. | |
689 (You must never use this if the source of the data is a Lisp string, | |
690 because of the possibility of relocation during garbage collection.) | |
691 @end itemize | |
692 | |
693 <B> can be one of the following: | |
694 @itemize @bullet | |
695 @item | |
696 ALLOCA (which means that the resulting data is stored in alloca()ed | |
697 memory. Two arguments should be specified, a pointer and a length, | |
698 which should be lvalues.) | |
699 @item | |
700 MALLOC (which means that the resulting data is stored in malloc()ed | |
701 memory. Two arguments should be specified, a pointer and a | |
702 length. The memory must be free()d by the caller.) | |
703 @item | |
704 OPAQUE (which means the resulting data is stored in an opaque Lisp | |
705 Object. This takes one argument, an lvalue Lisp Object.) | |
706 @item | |
707 LSTREAM. The data is written to an lstream. | |
708 @end itemize | |
709 | |
710 <C> can be one of the following: | |
711 @itemize @bullet | |
712 @item | |
713 DATA | |
714 @item | |
715 LSTREAM | |
716 @end itemize | |
717 (just like <A> above) | |
718 | |
719 <D> can be one of the following: | |
720 @itemize @bullet | |
721 @item | |
722 ALLOCA | |
723 @item | |
724 MALLOC | |
725 @item | |
726 LISP This means a Lisp String. | |
727 @item | |
728 BUFFER The resulting data is inserted into a buffer at the buffer's | |
729 value of point. | |
730 @item | |
731 LSTREAM The data is written to the lstream. | |
732 @end itemize | |
733 | |
734 | |
735 Note that I have eliminated the FORMAT argument of previous macros, | |
736 and replaced it with a coding system. This was made possible by | |
737 coding system aliases. In place of old `format's, we use a `virtual | |
738 coding system', which is aliased to the actual coding system. | |
739 | |
740 The value of the coding system argument can be anything that is legal | |
741 input to get_coding_system, i.e. a symbol or a coding system object. | |
742 | |
743 ========================================================================== | |
744 - creation of generic macros for accessing internally formatted data [old] | |
745 ========================================================================== | |
746 | |
747 I have a design; it's all written down (I did it in Tsukuba), and I just have | |
748 to have it transcribed. It's higher level than the macros, though; it's Lisp | |
749 primitives that I'm designing. | |
750 | |
751 As for the design of the macros, don't worry so much about all files having to | |
752 get included (which is inevitable with macros), but about how the files are | |
753 separated. Your design might go like this: | |
754 | |
755 @enumerate | |
756 @item | |
757 you have generic macro interfaces, which specify a particular | |
758 behavior but not an implementation. these generic macros have | |
759 complementary versions for buffers and for strings (and the buffer | |
760 or string is an argument to all of the macros), and do such things | |
761 as convert between byte and char indices, retrieve the character at | |
762 a particular byte or char index, increment or decrement a byte | |
763 index to the beginning of the next or previous character, indicate | |
764 the number of bytes occupied by the character at a particular byte | |
765 or character index, etc. These are similar to what's already out | |
766 there except that they confound buffers and strings and that they | |
767 can also work with actual char *'s, which I think is a really bad | |
768 idea because it encourages code to "assume" that the representation | |
769 is ASCII compatible, which it might not be (e.g. 16-bit fixed | |
770 width). In fact, one thing I'm planning on doing is redefining | |
771 Bufbyte as a struct, for debugging purposes, to catch all places | |
772 that cavalierly compare them with ASCII char's. Note also that I | |
773 really want to rename Bufpos and Bytind, which are confusing and | |
774 wrong in that they also apply to strings. They should be Bytepos | |
775 and Charpos, or something like that, to go along with Bytecount and | |
776 Charcount. Similarly, Bufbyte is a misnomer and should be | |
777 Intbyte -- a byte in the internal string representation (any of the | |
778 internal representations) of a string or buffer. Corresponding to | |
779 this is Extbyte (which we already have), a byte in any external | |
780 string representation. We also have Extcount, which makes sense, | |
781 and we might possibly want Extcharcount, the number of characters | |
782 in an external string representation; but that gets sticky in modal | |
783 encodings, and it's not clear how useful it would be. | |
784 | |
785 @item | |
786 for all generic macro interfaces, there are specific versions of | |
787 each of them for each possible representation (pure ASCII in the | |
788 non-Mule world, Mule standard, UTF-8, 8-bit fixed, 16-bit fixed, | |
789 32-bit fixed, etc.; there may well be more than one possible 16-bit | |
790 fixed version, as well). Each representation has a corresponding | |
791 prefix, e.g. MULE_ or FIXED16_ or whatever, which is prefixed onto | |
792 the generic macro names. The resulting macros perform the | |
793 operation defined for the macro, but assume, and only work | |
794 correctly with, text in the corresponding representation. | |
795 | |
796 @item | |
797 The definition of the generic versions merely conditionalizes on | |
798 the appropriate things (i.e. bit flags in the buffer or string | |
799 object) and calls the appropriate representation-specific version. | |
800 There may be more than one definition (protected by ifdefs, of | |
801 course), or one definition that is amalgamated out of many ifdef'ed | |
802 sections. | |
803 | |
804 @item | |
805 You should probably put each different representation in its own | |
806 header file, e.g. charset-mule.h or charset-fixed16.h or | |
807 charset-ascii.h or whatever. Then put the main macros into | |
808 charset.h, and conditionalize in this file appropriately to include | |
809 the other ones. That way, code that actually needs to play around | |
810 with internal-format text at this level can include "charset.h" | |
811 (certainly a much better place than buffer.h), and everyone else | |
812 uses higher-level routines. The representation-specific macros | |
813 should not normally be used *directly* at all; they are invoked | |
814 automatically from the generic macros. However, code that needs to | |
815 be highly, highly optimized might choose to take a loop and write | |
816 two versions of it, one for each representation, to avoid the | |
817 per-loop-iteration cost of a comparison. Until the macro interface | |
818 is rock stable and solid, we should strongly discourage such | |
819 nanosecond optimizations. | |
820 @end enumerate | |
821 | |
822 ========================================================================== | |
823 - UTF-16 compatible representation | |
824 ========================================================================== | |
825 | |
826 NOTE: One possible default internal representation that was compatible | |
827 with UTF16 but allowed all possible chars in UCS4 would be to take a | |
828 more-or-less unused range of 2048 chars (not from the private area | |
829 because Microsoft actually uses up most or all of it with EUDC chars). | |
830 Let's say we picked A400 - ABFF. Then, we'd have: | |
831 | |
832 0000 - FFFF Simple chars | |
833 | |
834 D[8-B]xx D[C-F]xx Surrogate char, represents 1M chars | |
835 | |
836 A[4-B]xx D[C-F]xx D[C-F]xx Surrogate char, represents 2G chars | |
837 | |
838 This is exactly the same number of chars as UCS-4 handles, and it follows the | |
839 same property as UTF8 and Mule-internal: | |
840 | |
841 @enumerate | |
842 @item | |
843 There are two disjoint groupings of units, one representing leading units | |
844 and one representing non-leading units. | |
845 @item | |
846 Given a leading unit, you immediately know how many units follow to make | |
847 up a valid char, irrespective of any other context. | |
848 @end enumerate | |
849 | |
850 Note that A4xx is actually currently assigned to Yi. Since this is an | |
851 internal representation, we could just move these elsewhere. | |
852 | |
853 An alternative is to pick two disjoint ranges, e.g. 2D00 - 2DFF and | |
854 A500 - ABFF. | |
855 | |
856 ========================================================================== | |
857 New API for char->font mapping | |
858 ========================================================================== | |
859 - ; supersedes charset-registry and CCL; | |
860 supports all windows systems; powerful enough for Unicode; etc. | |
861 | |
862 (charset-font-mapping charset) | |
863 | |
864 font-mapping-specifier string | |
865 | |
866 char-font-mapping-table | |
867 | |
868 char-table, specifier; elements of char table are either strings (which | |
869 specify a registry or comparable font property), or vectors of a string | |
870 (same) followed by keyword-value pairs (optional). The only allowable | |
871 keyword currently is :ccl-program, which specifies a CCL program to map | |
872 the characters into font indices. Other keywords may be added | |
873 e.g. allowing Elisp fragments instead of CCL programs, also allowed is | |
874 [inherit], which inherits from the next less-specific char-table in the | |
875 specifier. | |
876 | |
877 The preferred interface onto this mapping (which should be portable | |
878 across Emacsen) is | |
879 | |
880 (set-char-font-mapping key value &optional locale tag-set how-to-add) | |
881 | |
882 where key is a char, range or charset (as for put-char-table), value is | |
883 as above, and the other arguments are standard for specifiers. This | |
884 automatically creates a char table in the locale, as necessary (all | |
885 elements default to [inherit]). On GNU Emacs, some specifier arguments | |
886 may be unimplemented. | |
887 | |
888 (char-font-mapping key value &optional locale) | |
889 works vaguely like get-specifier? But does inheritance processing. | |
890 locale should clearly default here to current-buffer | |
891 | |
892 #### should get-specifier as well? Would make it work most like | |
893 #### buffer-local variables. | |
894 | |
895 NB. set-charset-registry and set-charset-ccl-program are obsoleted. | |
896 | |
897 ========================================================================== | |
898 Implementing fixed-width 8,16,32 bit buffer optimizations | |
899 ========================================================================== | |
900 | |
901 Add set-buffer-optimization (buffer &rest keywords) for | |
902 controlling these things. | |
903 | |
904 Also, put in hack so that correct arglist can be retrieved by | |
905 Lisp code. | |
906 | |
907 Look at the way keyword primitives are currently handled; make | |
908 sure it works and is documented, etc. | |
909 | |
910 Implement 8-bit fixed width optimization. Take the things that | |
911 know about the actual implementation and put them in a single | |
912 file, in essence creating an abstraction layer to allow | |
913 pluggable internal representations. Implement a fairly general | |
914 scheme for mapping between character codes in the 8 bits or 16 | |
915 bits representation and on actual charset characters. As part of | |
916 set-buffer-optimization, you can specify a list of character sets | |
917 to be used in the 8 bit to 16 bit, etc. world. You can also | |
918 request that the buffer be in 8, 16, etc. if possible. | |
919 | |
920 -> set defaults wrt this. | |
921 -> perhaps this should be just buffer properties. | |
922 -> this brings up the idea of default properties on an object. | |
923 -> Implement default-put, default-get, etc. | |
924 | |
925 What happens when a character not assigned in the range gets | |
926 added? Then, must convert to variable width of some sort. | |
927 | |
928 Note: at first, possibly we just convert whole hog to get things | |
929 right. Then we'd have to pay attention to characters that got | |
930 added + deleted that were unassigned in the fixed width. When | |
931 this goes to zero and there's been enough time (heuristics), we | |
932 go back to fixed. | |
933 | |
934 Side note: We could dynamically build up the set of assigned | |
935 chars as they go. Conceivably this could even go down to the | |
936 single char level: Just keep a big array of mapping from 16 bit | |
937 values to chars, and add to it every time a char has been encountered | |
938 that wasn't there before. Problem: need inverse mapping. | |
939 | |
940 -> Possibility; chars are actual objects, not just numbers. | |
941 Then you could keep track of such info in the chars itself. | |
942 *Think about this.* | |
943 | |
944 Eventually, we might consider allowing mixed fixed-width, | |
945 variable-width buffer encodings. Then, we use range tables to | |
946 indicate which sections are fixed and which variable and INC_CHAR does | |
947 something like this: binary search to find the current range, which | |
948 indicates whether it's fixed or variable, and tells us what the | |
949 increment is. We can cache this info and use it next time to speed | |
950 up. | |
951 | |
952 -> We will then have two partially shared range tables - one for | |
953 overall fixed width vs. variable width, and possibly one containing | |
954 this same info, but partitioning the variable width in one. Maybe | |
955 need fancier nested range table model. | |
956 | |
957 ========================================================================== | |
958 Expansion of display table and case mapping table support for all | |
959 chars, not just ASCII/Latin1. | |
960 ========================================================================== | |
961 | |
962 ========================================================================== | |
963 Improved flexibility for display tables, and evaluation of its | |
964 features to make sure it meshes with and complements the char<->font | |
965 mapping API mentioned earlier | |
966 ========================================================================== | |
967 | |
968 ========================================================================== | |
969 String access speedup: | |
970 ========================================================================== | |
971 | |
972 For strings larger than some size in bytes (10?), keep extra fields of | |
973 info: length in chars, and a (char, byte) pair in the middle to speed | |
974 up sequential access. | |
975 | |
976 (Better idea: do this for any size string, but only if it contains | |
977 non-ASCII chars. Then if info is missing, we know string is | |
978 ASCII-only.) | |
979 | |
980 Use a string-extra-info object, replacing string property slot and | |
981 containing fields for string mod tick, string extents, string props, | |
982 and string char length, and cached (char,byte) pair. | |
983 string-extra-info (or string-auxiliary?) objects could be in frob | |
984 blocks, esp. if creating frob blocks is easy + worth it. | |
985 | |
986 - Caching of char<->byte conversions in strings - should make nearly | |
987 all operations on strings O(N) | |
988 | |
989 ========================================================================== | |
990 Improvements in buffer char<->byte mapping | |
991 ========================================================================== | |
992 | |
993 - Range table implementation - especially when there are few runs of | |
994 different widths, e.g. recently converted from fixed-width | |
995 optimization to variable width | |
996 | |
997 Range Tables to speed up Bufpos <-> Bytind caching | |
998 ================================================== | |
999 | |
1000 This describes an alternative implementation using ranges. We | |
1001 maintain a range table of all spans of characters of a fixed width. | |
1002 Updating this table could take time if there are a large number of | |
1003 spans; but constant factors of operations should be quick. This method really wins | |
1004 when you have 8-bit buffers just converted to variable width, where | |
1005 there will be few spans. More specifically, lookup in this range | |
1006 table is O(log N) and can be done with simple binary search, which is | |
1007 very fast. If we maintain the ranges using a gap array, updating this | |
1008 table will be fast for local operations, which is most of the time. | |
1009 | |
1010 We will also provide (at first, at least) a Lisp function to set the | |
1011 caching mechanism explicitly - either range tables or the existing | |
1012 implementation. Eventually, we want to improve things, to the point | |
1013 where we automatically pick the right caching for the situation and | |
1014 have more caching schemes implemented. | |
1015 | |
1016 ========================================================================== | |
1017 - Robustify Text Properties | |
1018 ========================================================================== | |
1019 | |
1020 ========================================================================== | |
1021 Support for unified internal representation, e.g. Unicode | |
1022 ========================================================================== | |
1023 | |
1024 Start tagging all text with a language text property, | |
1025 indicating the current language environment when the text was input. | |
1026 (needs "Robustify Text Properties") | |
1027 | |
1028 ========================================================================== | |
1029 - Generalized Coding Systems | |
1030 ========================================================================== | |
1031 | |
1032 - Lisp API for Defining Coding Systems | |
1033 | |
1034 User-defined coding systems. | |
1035 | |
1036 (define-coding-system-type 'type | |
1037 :encode-function fun | |
1038 :decode-function fun | |
1039 :detect-function fun | |
1040 :buffering (number = at least this many chars | |
1041 line = buffer up to end of line | |
1042 regexp = buffer until this regexp is found in match | |
1043 source data. match data will be appropriate when fun is | |
1044 called | |
1045 | |
1046 encode fun is called as | |
1047 | |
1048 (encode instream outstream) | |
1049 | |
1050 should read data from instream and write converted result onto | |
1051 outstream. Can leave some data stuff in stream, it will reappear | |
1052 next time. Generally, there is a finite amount of data in instream | |
1053 and further attempts to read lead to would-block errors or retvals. | |
1054 Can use instream properties to record state. May use read-stream | |
1055 functionality to read everything into a vector or string. | |
1056 | |
1057 ->Need vectors + string exposed to resizing of Lisp implementation | |
1058 where necessary. | |
1059 | |
1060 ========================================================================== | |
1061 Support Windows Active Kbd Switching, Far East IME API (done already?) | |
1062 ========================================================================== | |
1063 | |
1064 ========================================================================== | |
1065 - UI/design changes for Coding System Pipelining | |
1066 ========================================================================== | |
1067 | |
1068 ------------------------------------------------------------------ | |
1069 CODING-SYSTEM CHAINS | |
1070 ------------------------------------------------------------------ | |
1071 | |
1072 sjt sez: | |
1073 | |
1074 There should be no elementary coding systems in the Lisp API, only | |
1075 chains. Chains should be declared, not computed, as a sequence of coding | |
1076 formats. (Probably the internal representation can be a vector for | |
1077 efficiency but programmers would probably rather work with lists.) A | |
1078 stream has a token type. Most streams are octet streams. Text is a | |
1079 stream of characters (in _internal_ format; a file on disk is not text!) | |
1080 An octet-stream has no implicit semantics, so its format must always be | |
1081 specified. The only type currently having semantics is characters. This | |
1082 means that the chain (euc-jp -> internal -> shift_jis) may be specified | |
1083 (euc-jp, shift_jis), and if no euc-jp -> shift_jis converter is | |
1084 available, then the chain is automatically constructed. (N.B. If we | |
1085 have fixed width buffers in the future, then we could have ASCII -> 8-bit | |
1086 char -> 16-bit char -> ISO-2022-JP (with escape sequences).) | |
1087 | |
1088 EOL handling is a char <-> char coding. It should not be part of another | |
1089 coding system except as a convenience for users. For text coding, | |
1090 automatically insert EOL handlers between char <-> octet boundaries. | |
1091 | |
1092 ------------------------------------------------------------------ | |
1093 ABOUT DETECTION | |
1094 ------------------------------------------------------------------ | |
1095 | |
1096 | |
1097 ------------------------------------------------------------------ | |
1098 EFFICIENCY OF CODING CONVERSION WITH MULTIPLE COPIES/CHAINS | |
1099 ------------------------------------------------------------------ | |
1100 | |
1101 A comment in encode_decode_coding_region(): | |
1102 | |
1103 The chain of streams looks like this: | |
1104 | |
1105 [BUFFER] <----- (( read from/send to loop )) | |
1106 ------> [CHAR->BYTE i.e. ENCODE AS BINARY if source is | |
1107 in bytes] | |
1108 ------> [ENCODE/DECODE AS SPECIFIED] | |
1109 ------> [BYTE->CHAR i.e. DECODE AS BINARY | |
1110 if sink is in bytes] | |
1111 ------> [AUTODETECT EOL if | |
1112 we're decoding and | |
1113 coding system calls | |
1114 for this] | |
1115 ------> [BUFFER] | |
1116 | |
1117 sjt (?) responds: | |
1118 | |
1119 Of course, this is just horrible. BYTE<->CHAR should only be available | |
1120 to I/O routines. It should not be visible to Mule proper. | |
1121 | |
1122 A comment on the implementation. Hrvoje and Kyle worry about the | |
1123 inefficiency of repeated copying among buffers that chained coding | |
1124 systems entail. But this may not be as time inefficient as it appears | |
1125 in the Mule ("house rules") context. The issue is how do you chain | |
1126 coding systems without copying? In theory you could have | |
1127 | |
1128 IChar external_to_raw (ExtChar *cp, State *s); | |
1129 IChar decode_utf16 (IChar c, State *s); | |
1130 IChar decode_crlf (ExtChar *cp, State *s); | |
1131 | |
1132 typedef Ichar (*Converter[]) (Ichar, State*); | |
1133 | |
1134 Converter utf16[2] = { &decode_utf16, &decode_crlf }; | |
1135 | |
1136 void convert (ExtChar *inbuf, IChar *outbuf, Converter cvtr) | |
1137 { | |
1138 int i; | |
1139 ExtChar c; | |
1140 State s; | |
1141 | |
1142 while (c = external_to_raw (*inbuf++, &s)) | |
1143 { | |
1144 for (i = 0; i < sizeof(cvtr)/sizeof(Converter); ++i) | |
1145 if (s.ready) | |
1146 c = (*cvtr[i]) (c, &s); | |
1147 } | |
1148 if (s.ready) | |
1149 *outbuf++ = c; | |
1150 } | |
1151 | |
1152 But this is a lot of function calls; what Ben is doing is basically | |
1153 reducing this to one call per buffer-full. The only way to avoid this | |
1154 is to hardcode all the "interesting" coding systems, maybe using | |
1155 inline or macros to give structure. But this is still a huge amount | |
1156 of work, and code. | |
1157 | |
1158 One advantage to the call-per-char approach is that we might be able | |
1159 to do something about the marker/extent destruction that coding | |
1160 normally entails. | |
1161 | |
1162 ben sez: | |
1163 | |
1164 it should be possible to preserve the markers/extents without | |
1165 switching completely to one-call-per-char -- we could at least do one | |
1166 call per "run", where a run is more or less the maximal stretch of | |
1167 text not overlapping any markers or extent boundaries. (It's a bit | |
1168 more complicated if we want to properly support the different extent | |
1169 begins/ends; in some cases we might have to pump a single character | |
1170 adjacent to where two extents meet.) The "stateless" way that I wrote | |
1171 all of the conversion routines may be a real hassle but it allows | |
1172 something like this to work without too much problem -- pump in one | |
1173 run at a time into one end of the chain, do a flush after each | |
1174 iteration, and stick what comes out the other end in its place. | |
1175 | |
1176 ------------------------------------------------------------------ | |
1177 ABOUT FORMATS | |
1178 ------------------------------------------------------------------ | |
1179 | |
1180 when calling make-coding-system, the name can be a cons of (format1 . | |
1181 format2), specifying that it decodes format1->format2 and encodes the other | |
1182 way. if only one name is given, that is assumed to be format1, and the | |
1183 other is either `external' or `internal' depending on the end type. | |
1184 normally the user when decoding gives the decoding order in formats, but | |
1185 can leave off the last one, `internal', which is assumed. a multichain | |
1186 might look like gzip|multibyte|unicode, using the coding systems named | |
1187 `gzip', `(unicode . multibyte)' and `unicode'. the way this actually works | |
1188 is by searching for gzip->multibyte; if not found, look for gzip->external | |
1189 or gzip->internal. (In general we automatically do conversion between | |
1190 internal and external as necessary: thus gzip|crlf does the expected, and | |
1191 maps to gzip->external, external->internal, crlf->internal, which when | |
1192 fully specified would be gzip|external:external|internal:crlf|internal -- | |
1193 see below.) To forcibly fit together two converters that have explicitly | |
1194 specified and incompatible names (say you have unicode->multibyte and | |
1195 iso8859-1->ebcdic and you know that the multibyte and iso8859-1 in this | |
1196 case are compatible), you can force-cast using :, like this: | |
1197 ebcdic|iso8859-1:multibyte|unicode. (again, if you force-cast between | |
1198 internal and external formats, the conversion happens automatically.) | |
1199 | |
1200 -------------------------------------------------------------------------- | |
1201 ABOUT PDUMP, UNICODE, AND RUNNING XEMACS FROM A DIRECTORY WITH WEIRD CHARS | |
1202 -------------------------------------------------------------------------- | |
1203 | |
1204 -- there's the problem that XEmacs can't be run in a directory with | |
1205 non-ASCII/Latin-1 chars in it, since it will be doing Unicode | |
1206 processing before we've had a chance to load the tables. In fact, | |
1207 even finding the tables in such a situation is problematic using | |
1208 the normal commands. my idea is to eventually load the stuff | |
1209 extremely extremely early, at the same time as the pdump data gets | |
1210 loaded. in fact, the unicode table data (stored in an efficient | |
1211 binary format) can even be stuck into the pdump file (which would | |
1212 mean as a resource to the executable, for windows). we'd need to | |
1213 extend pdump a bit: to allow for attaching extra data to the pdump | |
1214 file. (something like pdump_attach_extra_data (addr, length) | |
1215 returns a number of some sort, an index into the file, which you | |
1216 can then retrieve with pdump_load_extra_data(), which returns an | |
1217 addr (mmap()ed or loaded), and later you pdump_unload_extra_data() | |
1218 when finished. we'd probably also need | |
1219 pdump_attach_extra_data_append(), which appends data to the data | |
1220 just written out with pdump_attach_extra_data(). this way, | |
1221 multiple tables in memory can be written out into one contiguous | |
1222 table. (we'd use the tar-like trick of allowing new blocks to be | |
1223 written without going back to change the old blocks -- we just rely | |
1224 on the end of file/end of memory.) this same mechanism could be | |
1225 extracted out of pdump and used to handle the non-pdump situation | |
1226 (or alternatively, we could just dump either the memory image of | |
1227 the tables themselves or the compressed binary version). in the | |
1228 case of extra unicode tables not known about at compile time that | |
1229 get loaded before dumping, we either just dump them into the image | |
1230 (pdump and all) or extract them into the compressed binary format, | |
1231 free the original tables, and treat them like all other tables. | |
1232 | |
1233 | |
1234 ========================================================================== | |
1235 - Generalized language appropriate word wrapping (requires | |
1236 layout-exposing API defined in BIDI section) | |
1237 ========================================================================== | |
1238 | |
1239 ========================================================================== | |
1240 - Make Custom Mule-aware | |
1241 ========================================================================== | |
1242 | |
1243 ========================================================================== | |
1244 - Composite character support | |
1245 ========================================================================== | |
1246 | |
1247 ========================================================================== | |
1248 - Language appropriate sorting and searching | |
1249 ========================================================================== | |
1250 | |
1251 ========================================================================== | |
1252 - Glyph shaping for Arabic and Devanagari | |
1253 ========================================================================== | |
1254 | |
1255 - (needs to be handled mostly | |
1256 at C level, as part of layout; luckily it's entirely local in its | |
1257 changes, as this is not hard) | |
1258 | |
1259 | |
1260 ========================================================================== | |
1261 Consider moving language selection Menu up to be parallel with Mule menu | |
1262 ========================================================================== | |
1263 | |
1264 */ | |
1265 | |
1266 | |
771 | 1267 |
1268 /************************************************************************/ | |
1269 /* declarations */ | |
1270 /************************************************************************/ | |
1271 | |
1272 Eistring the_eistring_zero_init, the_eistring_malloc_zero_init; | |
1273 | |
1274 #define MAX_CHARBPOS_GAP_SIZE_3 (65535/3) | |
1275 #define MAX_BYTEBPOS_GAP_SIZE_3 (3 * MAX_CHARBPOS_GAP_SIZE_3) | |
1276 | |
1277 short three_to_one_table[1 + MAX_BYTEBPOS_GAP_SIZE_3]; | |
1278 | |
1279 #ifdef MULE | |
1280 | |
1281 /* Table of number of bytes in the string representation of a character | |
1282 indexed by the first byte of that representation. | |
1283 | |
1284 rep_bytes_by_first_byte(c) is more efficient than the equivalent | |
1285 canonical computation: | |
1286 | |
826 | 1287 XCHARSET_REP_BYTES (charset_by_leading_byte (c)) */ |
771 | 1288 |
1289 const Bytecount rep_bytes_by_first_byte[0xA0] = | |
1290 { /* 0x00 - 0x7f are for straight ASCII */ | |
1291 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1292 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1293 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1294 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1295 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1296 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1297 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1298 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1299 /* 0x80 - 0x8f are for Dimension-1 official charsets */ | |
1300 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |
1301 /* 0x90 - 0x9d are for Dimension-2 official charsets */ | |
1302 /* 0x9e is for Dimension-1 private charsets */ | |
1303 /* 0x9f is for Dimension-2 private charsets */ | |
1304 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4 | |
1305 }; | |
1306 | |
1307 #ifdef ENABLE_COMPOSITE_CHARS | |
1308 | |
1309 /* Hash tables for composite chars. One maps string representing | |
1310 composed chars to their equivalent chars; one goes the | |
1311 other way. */ | |
1312 Lisp_Object Vcomposite_char_char2string_hash_table; | |
1313 Lisp_Object Vcomposite_char_string2char_hash_table; | |
1314 | |
1315 static int composite_char_row_next; | |
1316 static int composite_char_col_next; | |
1317 | |
1318 #endif /* ENABLE_COMPOSITE_CHARS */ | |
1319 | |
1320 #endif /* MULE */ | |
1321 | |
1292 | 1322 Lisp_Object QSin_char_byte_conversion; |
1323 Lisp_Object QSin_internal_external_conversion; | |
1324 | |
771 | 1325 |
1326 /************************************************************************/ | |
1327 /* qxestr***() functions */ | |
1328 /************************************************************************/ | |
1329 | |
1330 /* Most are inline functions in lisp.h */ | |
1331 | |
1332 int | |
867 | 1333 qxesprintf (Ibyte *buffer, const CIbyte *format, ...) |
771 | 1334 { |
1335 va_list args; | |
1336 int retval; | |
1337 | |
1338 va_start (args, format); | |
2367 | 1339 retval = vsprintf ((Chbyte *) buffer, format, args); |
771 | 1340 va_end (args); |
1341 | |
1342 return retval; | |
1343 } | |
1344 | |
1345 /* strcasecmp() implementation from BSD */ | |
867 | 1346 static Ibyte strcasecmp_charmap[] = { |
1429 | 1347 0000, 0001, 0002, 0003, 0004, 0005, 0006, 0007, |
1348 0010, 0011, 0012, 0013, 0014, 0015, 0016, 0017, | |
1349 0020, 0021, 0022, 0023, 0024, 0025, 0026, 0027, | |
1350 0030, 0031, 0032, 0033, 0034, 0035, 0036, 0037, | |
1351 0040, 0041, 0042, 0043, 0044, 0045, 0046, 0047, | |
1352 0050, 0051, 0052, 0053, 0054, 0055, 0056, 0057, | |
1353 0060, 0061, 0062, 0063, 0064, 0065, 0066, 0067, | |
1354 0070, 0071, 0072, 0073, 0074, 0075, 0076, 0077, | |
1355 0100, 0141, 0142, 0143, 0144, 0145, 0146, 0147, | |
1356 0150, 0151, 0152, 0153, 0154, 0155, 0156, 0157, | |
1357 0160, 0161, 0162, 0163, 0164, 0165, 0166, 0167, | |
1358 0170, 0171, 0172, 0133, 0134, 0135, 0136, 0137, | |
1359 0140, 0141, 0142, 0143, 0144, 0145, 0146, 0147, | |
1360 0150, 0151, 0152, 0153, 0154, 0155, 0156, 0157, | |
1361 0160, 0161, 0162, 0163, 0164, 0165, 0166, 0167, | |
1362 0170, 0171, 0172, 0173, 0174, 0175, 0176, 0177, | |
1363 0200, 0201, 0202, 0203, 0204, 0205, 0206, 0207, | |
1364 0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217, | |
1365 0220, 0221, 0222, 0223, 0224, 0225, 0226, 0227, | |
1366 0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237, | |
1367 0240, 0241, 0242, 0243, 0244, 0245, 0246, 0247, | |
1368 0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257, | |
1369 0260, 0261, 0262, 0263, 0264, 0265, 0266, 0267, | |
1370 0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277, | |
1371 0300, 0301, 0302, 0303, 0304, 0305, 0306, 0307, | |
1372 0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317, | |
1373 0320, 0321, 0322, 0323, 0324, 0325, 0326, 0327, | |
1374 0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337, | |
1375 0340, 0341, 0342, 0343, 0344, 0345, 0346, 0347, | |
1376 0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357, | |
1377 0360, 0361, 0362, 0363, 0364, 0365, 0366, 0367, | |
1378 0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377 | |
771 | 1379 }; |
1380 | |
1381 /* A version that works like generic strcasecmp() -- only collapsing | |
1382 case in ASCII A-Z/a-z. This is safe on Mule strings due to the | |
1383 current representation. | |
1384 | |
1385 This version was written by some Berkeley coder, favoring | |
1386 nanosecond improvements over clarity. In all other versions below, | |
1387 we use symmetrical algorithms that may sacrifice a few machine | |
1388 cycles but are MUCH MUCH clearer, which counts a lot more. | |
1389 */ | |
1390 | |
1391 int | |
867 | 1392 qxestrcasecmp (const Ibyte *s1, const Ibyte *s2) |
771 | 1393 { |
867 | 1394 Ibyte *cm = strcasecmp_charmap; |
771 | 1395 |
1396 while (cm[*s1] == cm[*s2++]) | |
1397 if (*s1++ == '\0') | |
1398 return (0); | |
1399 | |
1400 return (cm[*s1] - cm[*--s2]); | |
1401 } | |
1402 | |
1403 int | |
2367 | 1404 ascii_strcasecmp (const Ascbyte *s1, const Ascbyte *s2) |
771 | 1405 { |
867 | 1406 return qxestrcasecmp ((const Ibyte *) s1, (const Ibyte *) s2); |
771 | 1407 } |
1408 | |
1409 int | |
2367 | 1410 qxestrcasecmp_ascii (const Ibyte *s1, const Ascbyte *s2) |
771 | 1411 { |
867 | 1412 return qxestrcasecmp (s1, (const Ibyte *) s2); |
771 | 1413 } |
1414 | |
1415 /* An internationalized version that collapses case in a general fashion. | |
1416 */ | |
1417 | |
1418 int | |
867 | 1419 qxestrcasecmp_i18n (const Ibyte *s1, const Ibyte *s2) |
771 | 1420 { |
1421 while (*s1 && *s2) | |
1422 { | |
4906
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1423 if (CANONCASE (0, itext_ichar (s1)) != |
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1424 CANONCASE (0, itext_ichar (s2))) |
771 | 1425 break; |
867 | 1426 INC_IBYTEPTR (s1); |
1427 INC_IBYTEPTR (s2); | |
771 | 1428 } |
1429 | |
4906
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1430 return (CANONCASE (0, itext_ichar (s1)) - |
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1431 CANONCASE (0, itext_ichar (s2))); |
771 | 1432 } |
1433 | |
1434 /* The only difference between these next two and | |
1435 qxememcasecmp()/qxememcasecmp_i18n() is that these two will stop if | |
1436 both strings are equal and less than LEN in length, while | |
1437 the mem...() versions would run off the end. */ | |
1438 | |
1439 int | |
867 | 1440 qxestrncasecmp (const Ibyte *s1, const Ibyte *s2, Bytecount len) |
771 | 1441 { |
867 | 1442 Ibyte *cm = strcasecmp_charmap; |
771 | 1443 |
1444 while (len--) | |
1445 { | |
1446 int diff = cm[*s1] - cm[*s2]; | |
1447 if (diff != 0) | |
1448 return diff; | |
1449 if (!*s1) | |
1450 return 0; | |
1451 s1++, s2++; | |
1452 } | |
1453 | |
1454 return 0; | |
1455 } | |
1456 | |
1457 int | |
2367 | 1458 ascii_strncasecmp (const Ascbyte *s1, const Ascbyte *s2, Bytecount len) |
771 | 1459 { |
867 | 1460 return qxestrncasecmp ((const Ibyte *) s1, (const Ibyte *) s2, len); |
771 | 1461 } |
1462 | |
1463 int | |
2367 | 1464 qxestrncasecmp_ascii (const Ibyte *s1, const Ascbyte *s2, Bytecount len) |
771 | 1465 { |
867 | 1466 return qxestrncasecmp (s1, (const Ibyte *) s2, len); |
771 | 1467 } |
1468 | |
801 | 1469 /* Compare LEN_FROM_S1 worth of characters from S1 with the same number of |
1470 characters from S2, case insensitive. NOTE: Downcasing can convert | |
1471 characters from one length in bytes to another, so reversing S1 and S2 | |
1472 is *NOT* a symmetric operations! You must choose a length that agrees | |
1473 with S1. */ | |
1474 | |
771 | 1475 int |
867 | 1476 qxestrncasecmp_i18n (const Ibyte *s1, const Ibyte *s2, |
801 | 1477 Bytecount len_from_s1) |
771 | 1478 { |
801 | 1479 while (len_from_s1 > 0) |
771 | 1480 { |
867 | 1481 const Ibyte *old_s1 = s1; |
4906
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1482 int diff = (CANONCASE (0, itext_ichar (s1)) - |
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1483 CANONCASE (0, itext_ichar (s2))); |
771 | 1484 if (diff != 0) |
1485 return diff; | |
1486 if (!*s1) | |
1487 return 0; | |
867 | 1488 INC_IBYTEPTR (s1); |
1489 INC_IBYTEPTR (s2); | |
801 | 1490 len_from_s1 -= s1 - old_s1; |
771 | 1491 } |
1492 | |
1493 return 0; | |
1494 } | |
1495 | |
1496 int | |
867 | 1497 qxememcmp (const Ibyte *s1, const Ibyte *s2, Bytecount len) |
771 | 1498 { |
1499 return memcmp (s1, s2, len); | |
1500 } | |
1501 | |
1502 int | |
867 | 1503 qxememcmp4 (const Ibyte *s1, Bytecount len1, |
1504 const Ibyte *s2, Bytecount len2) | |
801 | 1505 { |
1506 int retval = qxememcmp (s1, s2, min (len1, len2)); | |
1507 if (retval) | |
1508 return retval; | |
1509 return len1 - len2; | |
1510 } | |
1511 | |
1512 int | |
867 | 1513 qxememcasecmp (const Ibyte *s1, const Ibyte *s2, Bytecount len) |
771 | 1514 { |
867 | 1515 Ibyte *cm = strcasecmp_charmap; |
771 | 1516 |
1517 while (len--) | |
1518 { | |
1519 int diff = cm[*s1] - cm[*s2]; | |
1520 if (diff != 0) | |
1521 return diff; | |
1522 s1++, s2++; | |
1523 } | |
1524 | |
1525 return 0; | |
1526 } | |
1527 | |
1528 int | |
867 | 1529 qxememcasecmp4 (const Ibyte *s1, Bytecount len1, |
1530 const Ibyte *s2, Bytecount len2) | |
771 | 1531 { |
801 | 1532 int retval = qxememcasecmp (s1, s2, min (len1, len2)); |
1533 if (retval) | |
1534 return retval; | |
1535 return len1 - len2; | |
1536 } | |
1537 | |
1538 /* Do a character-by-character comparison, returning "which is greater" by | |
867 | 1539 comparing the Ichar values. (#### Should have option to compare Unicode |
801 | 1540 points) */ |
1541 | |
1542 int | |
867 | 1543 qxetextcmp (const Ibyte *s1, Bytecount len1, |
1544 const Ibyte *s2, Bytecount len2) | |
801 | 1545 { |
1546 while (len1 > 0 && len2 > 0) | |
771 | 1547 { |
867 | 1548 const Ibyte *old_s1 = s1; |
1549 const Ibyte *old_s2 = s2; | |
1550 int diff = itext_ichar (s1) - itext_ichar (s2); | |
801 | 1551 if (diff != 0) |
1552 return diff; | |
867 | 1553 INC_IBYTEPTR (s1); |
1554 INC_IBYTEPTR (s2); | |
801 | 1555 len1 -= s1 - old_s1; |
1556 len2 -= s2 - old_s2; | |
1557 } | |
1558 | |
1559 assert (len1 >= 0 && len2 >= 0); | |
1560 return len1 - len2; | |
1561 } | |
1562 | |
1563 int | |
867 | 1564 qxetextcmp_matching (const Ibyte *s1, Bytecount len1, |
1565 const Ibyte *s2, Bytecount len2, | |
801 | 1566 Charcount *matching) |
1567 { | |
1568 *matching = 0; | |
1569 while (len1 > 0 && len2 > 0) | |
1570 { | |
867 | 1571 const Ibyte *old_s1 = s1; |
1572 const Ibyte *old_s2 = s2; | |
1573 int diff = itext_ichar (s1) - itext_ichar (s2); | |
801 | 1574 if (diff != 0) |
1575 return diff; | |
867 | 1576 INC_IBYTEPTR (s1); |
1577 INC_IBYTEPTR (s2); | |
801 | 1578 len1 -= s1 - old_s1; |
1579 len2 -= s2 - old_s2; | |
1580 (*matching)++; | |
1581 } | |
1582 | |
1583 assert (len1 >= 0 && len2 >= 0); | |
1584 return len1 - len2; | |
1585 } | |
1586 | |
1587 /* Do a character-by-character comparison, returning "which is greater" by | |
867 | 1588 comparing the Ichar values, case insensitively (by downcasing both |
801 | 1589 first). (#### Should have option to compare Unicode points) |
1590 | |
1591 In this case, both lengths must be specified becaused downcasing can | |
1592 convert characters from one length in bytes to another; therefore, two | |
1593 blocks of text of different length might be equal. If both compare | |
1594 equal up to the limit in length of one but not the other, the longer one | |
1595 is "greater". */ | |
1596 | |
1597 int | |
867 | 1598 qxetextcasecmp (const Ibyte *s1, Bytecount len1, |
1599 const Ibyte *s2, Bytecount len2) | |
801 | 1600 { |
1601 while (len1 > 0 && len2 > 0) | |
1602 { | |
867 | 1603 const Ibyte *old_s1 = s1; |
1604 const Ibyte *old_s2 = s2; | |
4906
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1605 int diff = (CANONCASE (0, itext_ichar (s1)) - |
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1606 CANONCASE (0, itext_ichar (s2))); |
771 | 1607 if (diff != 0) |
1608 return diff; | |
867 | 1609 INC_IBYTEPTR (s1); |
1610 INC_IBYTEPTR (s2); | |
801 | 1611 len1 -= s1 - old_s1; |
1612 len2 -= s2 - old_s2; | |
771 | 1613 } |
1614 | |
801 | 1615 assert (len1 >= 0 && len2 >= 0); |
1616 return len1 - len2; | |
1617 } | |
1618 | |
1619 /* Like qxetextcasecmp() but also return number of characters at | |
1620 beginning that match. */ | |
1621 | |
1622 int | |
867 | 1623 qxetextcasecmp_matching (const Ibyte *s1, Bytecount len1, |
1624 const Ibyte *s2, Bytecount len2, | |
801 | 1625 Charcount *matching) |
1626 { | |
1627 *matching = 0; | |
1628 while (len1 > 0 && len2 > 0) | |
1629 { | |
867 | 1630 const Ibyte *old_s1 = s1; |
1631 const Ibyte *old_s2 = s2; | |
4906
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1632 int diff = (CANONCASE (0, itext_ichar (s1)) - |
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1633 CANONCASE (0, itext_ichar (s2))); |
801 | 1634 if (diff != 0) |
1635 return diff; | |
867 | 1636 INC_IBYTEPTR (s1); |
1637 INC_IBYTEPTR (s2); | |
801 | 1638 len1 -= s1 - old_s1; |
1639 len2 -= s2 - old_s2; | |
1640 (*matching)++; | |
1641 } | |
1642 | |
1643 assert (len1 >= 0 && len2 >= 0); | |
1644 return len1 - len2; | |
771 | 1645 } |
1646 | |
1647 int | |
4906
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1648 lisp_strcasecmp_ascii (Lisp_Object s1, Lisp_Object s2) |
771 | 1649 { |
867 | 1650 Ibyte *cm = strcasecmp_charmap; |
1651 Ibyte *p1 = XSTRING_DATA (s1); | |
1652 Ibyte *p2 = XSTRING_DATA (s2); | |
1653 Ibyte *e1 = p1 + XSTRING_LENGTH (s1); | |
1654 Ibyte *e2 = p2 + XSTRING_LENGTH (s2); | |
771 | 1655 |
1656 /* again, we use a symmetric algorithm and favor clarity over | |
1657 nanosecond improvements. */ | |
1658 while (1) | |
1659 { | |
1660 /* if we reached the end of either string, compare lengths. | |
1661 do NOT compare the final null byte against anything, in case | |
1662 the other string also has a null byte at that position. */ | |
1663 if (p1 == e1 || p2 == e2) | |
1664 return e1 - e2; | |
1665 if (cm[*p1] != cm[*p2]) | |
1666 return cm[*p1] - cm[*p2]; | |
1667 p1++, p2++; | |
1668 } | |
1669 } | |
1670 | |
1671 int | |
1672 lisp_strcasecmp_i18n (Lisp_Object s1, Lisp_Object s2) | |
1673 { | |
801 | 1674 return qxetextcasecmp (XSTRING_DATA (s1), XSTRING_LENGTH (s1), |
1675 XSTRING_DATA (s2), XSTRING_LENGTH (s2)); | |
771 | 1676 } |
1677 | |
2367 | 1678 /* Compare a wide string with an ASCII string */ |
1679 | |
1680 int | |
1681 wcscmp_ascii (const wchar_t *s1, const Ascbyte *s2) | |
1682 { | |
1683 while (*s1 && *s2) | |
1684 { | |
2956 | 1685 if (*s1 != (wchar_t) *s2) |
2367 | 1686 break; |
1687 s1++, s2++; | |
1688 } | |
1689 | |
1690 return *s1 - *s2; | |
1691 } | |
1692 | |
1693 int | |
1694 wcsncmp_ascii (const wchar_t *s1, const Ascbyte *s2, Charcount len) | |
1695 { | |
1696 while (len--) | |
1697 { | |
1698 int diff = *s1 - *s2; | |
1699 if (diff != 0) | |
1700 return diff; | |
1701 if (!*s1) | |
1702 return 0; | |
1703 s1++, s2++; | |
1704 } | |
1705 | |
1706 return 0; | |
1707 } | |
1708 | |
771 | 1709 |
1710 /************************************************************************/ | |
1711 /* conversion between textual representations */ | |
1712 /************************************************************************/ | |
1713 | |
1714 /* NOTE: Does not reset the Dynarr. */ | |
1715 | |
1716 void | |
867 | 1717 convert_ibyte_string_into_ichar_dynarr (const Ibyte *str, Bytecount len, |
2367 | 1718 Ichar_dynarr *dyn) |
771 | 1719 { |
867 | 1720 const Ibyte *strend = str + len; |
771 | 1721 |
1722 while (str < strend) | |
1723 { | |
867 | 1724 Ichar ch = itext_ichar (str); |
771 | 1725 Dynarr_add (dyn, ch); |
867 | 1726 INC_IBYTEPTR (str); |
771 | 1727 } |
1728 } | |
1729 | |
1730 Charcount | |
867 | 1731 convert_ibyte_string_into_ichar_string (const Ibyte *str, Bytecount len, |
2367 | 1732 Ichar *arr) |
771 | 1733 { |
867 | 1734 const Ibyte *strend = str + len; |
771 | 1735 Charcount newlen = 0; |
1736 while (str < strend) | |
1737 { | |
867 | 1738 Ichar ch = itext_ichar (str); |
771 | 1739 arr[newlen++] = ch; |
867 | 1740 INC_IBYTEPTR (str); |
771 | 1741 } |
1742 return newlen; | |
1743 } | |
1744 | |
867 | 1745 /* Convert an array of Ichars into the equivalent string representation. |
1746 Store into the given Ibyte dynarr. Does not reset the dynarr. | |
771 | 1747 Does not add a terminating zero. */ |
1748 | |
1749 void | |
867 | 1750 convert_ichar_string_into_ibyte_dynarr (Ichar *arr, int nels, |
1751 Ibyte_dynarr *dyn) | |
771 | 1752 { |
867 | 1753 Ibyte str[MAX_ICHAR_LEN]; |
771 | 1754 int i; |
1755 | |
1756 for (i = 0; i < nels; i++) | |
1757 { | |
867 | 1758 Bytecount len = set_itext_ichar (str, arr[i]); |
771 | 1759 Dynarr_add_many (dyn, str, len); |
1760 } | |
1761 } | |
1762 | |
867 | 1763 /* Convert an array of Ichars into the equivalent string representation. |
771 | 1764 Malloc the space needed for this and return it. If LEN_OUT is not a |
867 | 1765 NULL pointer, store into LEN_OUT the number of Ibytes in the |
1766 malloc()ed string. Note that the actual number of Ibytes allocated | |
771 | 1767 is one more than this: the returned string is zero-terminated. */ |
1768 | |
867 | 1769 Ibyte * |
1770 convert_ichar_string_into_malloced_string (Ichar *arr, int nels, | |
826 | 1771 Bytecount *len_out) |
771 | 1772 { |
1773 /* Damn zero-termination. */ | |
2367 | 1774 Ibyte *str = alloca_ibytes (nels * MAX_ICHAR_LEN + 1); |
867 | 1775 Ibyte *strorig = str; |
771 | 1776 Bytecount len; |
1777 | |
1778 int i; | |
1779 | |
1780 for (i = 0; i < nels; i++) | |
867 | 1781 str += set_itext_ichar (str, arr[i]); |
771 | 1782 *str = '\0'; |
1783 len = str - strorig; | |
2367 | 1784 str = xnew_ibytes (1 + len); |
771 | 1785 memcpy (str, strorig, 1 + len); |
1786 if (len_out) | |
1787 *len_out = len; | |
1788 return str; | |
1789 } | |
1790 | |
/* Body of copy_text_between_formats() for one (SRCFMT, DSTFMT) pair.
   Expands to code that uses the enclosing function's src, srclen, srcobj,
   dst, dstlen, dstobj and src_used, and that always returns from the
   enclosing function.  With a NULL dst only the required output size is
   computed; otherwise whole characters are converted until the source is
   used up or the next character no longer fits in the destination. */
#define COPY_TEXT_BETWEEN_FORMATS(srcfmt, dstfmt)                       \
do                                                                      \
{                                                                       \
  const Ibyte *in_ptr = src;                                            \
  const Ibyte *in_limit = src + srclen;                                 \
                                                                        \
  if (!dst)                                                             \
    {                                                                   \
      Bytecount needed = 0;                                             \
                                                                        \
      while (in_ptr < in_limit)                                         \
        {                                                               \
          needed += ichar_len_fmt (itext_ichar_fmt (in_ptr, srcfmt,     \
                                                    srcobj), dstfmt);   \
          INC_IBYTEPTR_FMT (in_ptr, srcfmt);                            \
        }                                                               \
      text_checking_assert (in_ptr == in_limit);                        \
      if (src_used)                                                     \
        *src_used = in_ptr - src;                                       \
      return needed;                                                    \
    }                                                                   \
  else                                                                  \
    {                                                                   \
      Ibyte *out_ptr = dst;                                             \
      Ibyte *out_limit = dst + dstlen;                                  \
                                                                        \
      while (in_ptr < in_limit)                                         \
        {                                                               \
          Ichar ch = itext_ichar_fmt (in_ptr, srcfmt, srcobj);          \
          Bytecount chlen = ichar_len_fmt (ch, dstfmt);                 \
                                                                        \
          if (out_ptr + chlen > out_limit)                              \
            break;                                                      \
          (void) set_itext_ichar_fmt (out_ptr, ch, dstfmt, dstobj);     \
          out_ptr += chlen;                                             \
          INC_IBYTEPTR_FMT (in_ptr, srcfmt);                            \
        }                                                               \
      text_checking_assert (in_ptr <= in_limit);                        \
      if (src_used)                                                     \
        *src_used = in_ptr - src;                                       \
      return out_ptr - dst;                                             \
    }                                                                   \
}                                                                       \
while (0)
1839 | |
1840 /* Copy as much text from SRC/SRCLEN to DST/DSTLEN as will fit, converting | |
1841 from SRCFMT/SRCOBJ to DSTFMT/DSTOBJ. Return number of bytes stored into | |
1842 DST as return value, and number of bytes copied from SRC through | |
1843 SRC_USED (if not NULL). If DST is NULL, don't actually store anything | |
1844 and just return the size needed to store all the text. Will not copy | |
1845 partial characters into DST. */ | |
1846 | |
1847 Bytecount | |
867 | 1848 copy_text_between_formats (const Ibyte *src, Bytecount srclen, |
826 | 1849 Internal_Format srcfmt, |
2333 | 1850 Lisp_Object USED_IF_MULE (srcobj), |
867 | 1851 Ibyte *dst, Bytecount dstlen, |
826 | 1852 Internal_Format dstfmt, |
2333 | 1853 Lisp_Object USED_IF_MULE (dstobj), |
826 | 1854 Bytecount *src_used) |
1855 { | |
1856 if (srcfmt == dstfmt && | |
1857 objects_have_same_internal_representation (srcobj, dstobj)) | |
1858 { | |
1859 if (dst) | |
1860 { | |
1861 srclen = min (srclen, dstlen); | |
867 | 1862 srclen = validate_ibyte_string_backward (src, srclen); |
826 | 1863 memcpy (dst, src, srclen); |
1864 if (src_used) | |
1865 *src_used = srclen; | |
1866 return srclen; | |
1867 } | |
1868 else | |
1869 return srclen; | |
1870 } | |
1871 /* Everything before the final else statement is an optimization. | |
1872 The inner loops inside COPY_TEXT_BETWEEN_FORMATS() have a number | |
1873 of calls to *_fmt(), each of which has a switch statement in it. | |
1874 By using constants as the FMT argument, these switch statements | |
1875 will be optimized out of existence. */ | |
1876 #define ELSE_FORMATS(fmt1, fmt2) \ | |
1877 else if (srcfmt == fmt1 && dstfmt == fmt2) \ | |
1878 COPY_TEXT_BETWEEN_FORMATS (fmt1, fmt2) | |
1879 ELSE_FORMATS (FORMAT_DEFAULT, FORMAT_8_BIT_FIXED); | |
1880 ELSE_FORMATS (FORMAT_8_BIT_FIXED, FORMAT_DEFAULT); | |
1881 ELSE_FORMATS (FORMAT_DEFAULT, FORMAT_32_BIT_FIXED); | |
1882 ELSE_FORMATS (FORMAT_32_BIT_FIXED, FORMAT_DEFAULT); | |
1883 else | |
1884 COPY_TEXT_BETWEEN_FORMATS (srcfmt, dstfmt); | |
1885 #undef ELSE_FORMATS | |
1886 } | |
1887 | |
1888 /* Copy as much buffer text in BUF, starting at POS, of length LEN, as will | |
1889 fit into DST/DSTLEN, converting to DSTFMT. Return number of bytes | |
1890 stored into DST as return value, and number of bytes copied from BUF | |
1891 through SRC_USED (if not NULL). If DST is NULL, don't actually store | |
1892 anything and just return the size needed to store all the text. */ | |
1893 | |
1894 Bytecount | |
1895 copy_buffer_text_out (struct buffer *buf, Bytebpos pos, | |
867 | 1896 Bytecount len, Ibyte *dst, Bytecount dstlen, |
826 | 1897 Internal_Format dstfmt, Lisp_Object dstobj, |
1898 Bytecount *src_used) | |
1899 { | |
1900 Bytecount dst_used = 0; | |
1901 if (src_used) | |
1902 *src_used = 0; | |
1903 | |
1904 { | |
1905 BUFFER_TEXT_LOOP (buf, pos, len, runptr, runlen) | |
1906 { | |
1907 Bytecount the_src_used, the_dst_used; | |
1908 | |
1909 the_dst_used = copy_text_between_formats (runptr, runlen, | |
1910 BUF_FORMAT (buf), | |
1911 wrap_buffer (buf), | |
1912 dst, dstlen, dstfmt, | |
1913 dstobj, &the_src_used); | |
1914 dst_used += the_dst_used; | |
1915 if (src_used) | |
1916 *src_used += the_src_used; | |
1917 if (dst) | |
1918 { | |
1919 dst += the_dst_used; | |
1920 dstlen -= the_dst_used; | |
841 | 1921 /* Stop if we didn't use all of the source text. Also stop |
1922 if the destination is full. We need the first test because | |
1923 there might be a couple bytes left in the destination, but | |
1924 not enough to fit a full character. The first test will in | |
1925 fact catch the vast majority of cases where the destination | |
1926 is empty, too -- but in case the destination holds *exactly* | |
1927 the run length, we put in the second check. (It shouldn't | |
1928 really matter though -- next time through we'll just get a | |
1929 0.) */ | |
1930 if (the_src_used < runlen || !dstlen) | |
826 | 1931 break; |
1932 } | |
1933 } | |
1934 } | |
1935 | |
1936 return dst_used; | |
1937 } | |
1938 | |
771 | 1939 |
1940 /************************************************************************/ | |
1941 /* charset properties of strings */ | |
1942 /************************************************************************/ | |
1943 | |
1944 void | |
2333 | 1945 find_charsets_in_ibyte_string (unsigned char *charsets, |
1946 const Ibyte *USED_IF_MULE (str), | |
1947 Bytecount USED_IF_MULE (len)) | |
771 | 1948 { |
1949 #ifndef MULE | |
1950 /* Telescope this. */ | |
1951 charsets[0] = 1; | |
1952 #else | |
867 | 1953 const Ibyte *strend = str + len; |
771 | 1954 memset (charsets, 0, NUM_LEADING_BYTES); |
1955 | |
1956 /* #### SJT doesn't like this. */ | |
1957 if (len == 0) | |
1958 { | |
1959 charsets[XCHARSET_LEADING_BYTE (Vcharset_ascii) - MIN_LEADING_BYTE] = 1; | |
1960 return; | |
1961 } | |
1962 | |
1963 while (str < strend) | |
1964 { | |
867 | 1965 charsets[ichar_leading_byte (itext_ichar (str)) - MIN_LEADING_BYTE] = |
771 | 1966 1; |
867 | 1967 INC_IBYTEPTR (str); |
771 | 1968 } |
1969 #endif | |
1970 } | |
1971 | |
1972 void | |
2333 | 1973 find_charsets_in_ichar_string (unsigned char *charsets, |
1974 const Ichar *USED_IF_MULE (str), | |
1975 Charcount USED_IF_MULE (len)) | |
771 | 1976 { |
1977 #ifndef MULE | |
1978 /* Telescope this. */ | |
1979 charsets[0] = 1; | |
1980 #else | |
1981 int i; | |
1982 | |
1983 memset (charsets, 0, NUM_LEADING_BYTES); | |
1984 | |
1985 /* #### SJT doesn't like this. */ | |
1986 if (len == 0) | |
1987 { | |
1988 charsets[XCHARSET_LEADING_BYTE (Vcharset_ascii) - MIN_LEADING_BYTE] = 1; | |
1989 return; | |
1990 } | |
1991 | |
1992 for (i = 0; i < len; i++) | |
1993 { | |
867 | 1994 charsets[ichar_leading_byte (str[i]) - MIN_LEADING_BYTE] = 1; |
771 | 1995 } |
1996 #endif | |
1997 } | |
1998 | |
3571 | 1999 /* A couple of these functions should only be called on a Mule build. */ |
2000 #ifdef MULE | |
2001 #define ASSERT_BUILT_WITH_MULE() assert(1) | |
2002 #else /* MULE */ | |
2003 #define ASSERT_BUILT_WITH_MULE() assert(0) | |
2004 #endif /* MULE */ | |
2005 | |
771 | 2006 int |
867 | 2007 ibyte_string_displayed_columns (const Ibyte *str, Bytecount len) |
771 | 2008 { |
2009 int cols = 0; | |
867 | 2010 const Ibyte *end = str + len; |
3571 | 2011 Ichar ch; |
2012 | |
2013 ASSERT_BUILT_WITH_MULE(); | |
771 | 2014 |
2015 while (str < end) | |
2016 { | |
3571 | 2017 ch = itext_ichar (str); |
867 | 2018 cols += XCHARSET_COLUMNS (ichar_charset (ch)); |
2019 INC_IBYTEPTR (str); | |
771 | 2020 } |
2021 | |
2022 return cols; | |
2023 } | |
2024 | |
2025 int | |
3571 | 2026 ichar_string_displayed_columns (const Ichar * USED_IF_MULE(str), Charcount len) |
771 | 2027 { |
2028 int cols = 0; | |
2029 int i; | |
2030 | |
3571 | 2031 ASSERT_BUILT_WITH_MULE(); |
2032 | |
771 | 2033 for (i = 0; i < len; i++) |
867 | 2034 cols += XCHARSET_COLUMNS (ichar_charset (str[i])); |
771 | 2035 |
2036 return cols; | |
2037 } | |
2038 | |
2039 Charcount | |
2333 | 2040 ibyte_string_nonascii_chars (const Ibyte *USED_IF_MULE (str), |
2041 Bytecount USED_IF_MULE (len)) | |
771 | 2042 { |
2043 #ifdef MULE | |
867 | 2044 const Ibyte *end = str + len; |
771 | 2045 Charcount retval = 0; |
2046 | |
2047 while (str < end) | |
2048 { | |
826 | 2049 if (!byte_ascii_p (*str)) |
771 | 2050 retval++; |
867 | 2051 INC_IBYTEPTR (str); |
771 | 2052 } |
2053 | |
2054 return retval; | |
2055 #else | |
2056 return 0; | |
2057 #endif | |
2058 } | |
2059 | |
2060 | |
2061 /***************************************************************************/ | |
2062 /* Eistring helper functions */ | |
2063 /***************************************************************************/ | |
2064 | |
2065 int | |
867 | 2066 eistr_casefiddle_1 (Ibyte *olddata, Bytecount len, Ibyte *newdata, |
771 | 2067 int downp) |
2068 { | |
867 | 2069 Ibyte *endp = olddata + len; |
2070 Ibyte *newp = newdata; | |
771 | 2071 int changedp = 0; |
2072 | |
2073 while (olddata < endp) | |
2074 { | |
867 | 2075 Ichar c = itext_ichar (olddata); |
2076 Ichar newc; | |
771 | 2077 |
2078 if (downp) | |
2079 newc = DOWNCASE (0, c); | |
2080 else | |
2081 newc = UPCASE (0, c); | |
2082 | |
2083 if (c != newc) | |
2084 changedp = 1; | |
2085 | |
867 | 2086 newp += set_itext_ichar (newp, newc); |
2087 INC_IBYTEPTR (olddata); | |
771 | 2088 } |
2089 | |
2090 *newp = '\0'; | |
2091 | |
2092 return changedp ? newp - newdata : 0; | |
2093 } | |
2094 | |
/* Return a buffer size of at least NEEDED_SIZE, starting from OLDBUFSIZE.
   If OLDBUFSIZE already suffices it is returned unchanged; otherwise the
   size is grown by a factor of 3/2 per step, never dropping below 32,
   until it is large enough. */
int
eifind_large_enough_buffer (int oldbufsize, int needed_size)
{
  int size = oldbufsize;

  while (size < needed_size)
    {
      size = size * 3 / 2;
      if (size < 32)
        size = 32;
    }

  return size;
}
2106 | |
2107 void | |
2108 eito_malloc_1 (Eistring *ei) | |
2109 { | |
2110 if (ei->mallocp_) | |
2111 return; | |
2112 ei->mallocp_ = 1; | |
2113 if (ei->data_) | |
2114 { | |
867 | 2115 Ibyte *newdata; |
771 | 2116 |
2117 ei->max_size_allocated_ = | |
2118 eifind_large_enough_buffer (0, ei->bytelen_ + 1); | |
2367 | 2119 newdata = xnew_ibytes (ei->max_size_allocated_); |
771 | 2120 memcpy (newdata, ei->data_, ei->bytelen_ + 1); |
2121 ei->data_ = newdata; | |
2122 } | |
2123 | |
2124 if (ei->extdata_) | |
2125 { | |
2367 | 2126 Extbyte *newdata = xnew_extbytes (ei->extlen_ + 2); |
771 | 2127 |
2128 memcpy (newdata, ei->extdata_, ei->extlen_); | |
2129 /* Double null-terminate in case of Unicode data */ | |
2130 newdata[ei->extlen_] = '\0'; | |
2131 newdata[ei->extlen_ + 1] = '\0'; | |
2132 ei->extdata_ = newdata; | |
2133 } | |
2134 } | |
2135 | |
2136 int | |
2137 eicmp_1 (Eistring *ei, Bytecount off, Charcount charoff, | |
867 | 2138 Bytecount len, Charcount charlen, const Ibyte *data, |
2421 | 2139 const Eistring *ei2, int is_ascii, int fold_case) |
771 | 2140 { |
3462 | 2141 assert ((data == 0) != (ei == 0)); |
2142 assert ((is_ascii != 0) == (data != 0)); | |
2143 assert (fold_case >= 0 && fold_case <= 2); | |
771 | 2144 assert ((off < 0) != (charoff < 0)); |
3462 | 2145 |
771 | 2146 if (off < 0) |
2147 { | |
2148 off = charcount_to_bytecount (ei->data_, charoff); | |
2149 if (charlen < 0) | |
2150 len = -1; | |
2151 else | |
2152 len = charcount_to_bytecount (ei->data_ + off, charlen); | |
2153 } | |
2154 if (len < 0) | |
2155 len = ei->bytelen_ - off; | |
2156 | |
2157 assert (off >= 0 && off <= ei->bytelen_); | |
2158 assert (len >= 0 && off + len <= ei->bytelen_); | |
2159 | |
2160 { | |
2161 Bytecount dstlen; | |
867 | 2162 const Ibyte *src = ei->data_, *dst; |
771 | 2163 |
2164 if (data) | |
2165 { | |
2166 dst = data; | |
2167 dstlen = qxestrlen (data); | |
2168 } | |
2169 else | |
2170 { | |
2171 dst = ei2->data_; | |
2172 dstlen = ei2->bytelen_; | |
2173 } | |
2174 | |
2421 | 2175 if (is_ascii) |
2367 | 2176 ASSERT_ASCTEXT_ASCII_LEN ((Ascbyte *) dst, dstlen); |
771 | 2177 |
801 | 2178 return (fold_case == 0 ? qxememcmp4 (src, len, dst, dstlen) : |
2179 fold_case == 1 ? qxememcasecmp4 (src, len, dst, dstlen) : | |
2180 qxetextcasecmp (src, len, dst, dstlen)); | |
771 | 2181 } |
2182 } | |
2183 | |
867 | 2184 Ibyte * |
826 | 2185 eicpyout_malloc_fmt (Eistring *eistr, Bytecount *len_out, Internal_Format fmt, |
2286 | 2186 Lisp_Object UNUSED (object)) |
771 | 2187 { |
867 | 2188 Ibyte *ptr; |
771 | 2189 |
2190 assert (fmt == FORMAT_DEFAULT); | |
867 | 2191 ptr = xnew_array (Ibyte, eistr->bytelen_ + 1); |
771 | 2192 if (len_out) |
2193 *len_out = eistr->bytelen_; | |
2194 memcpy (ptr, eistr->data_, eistr->bytelen_ + 1); | |
2195 return ptr; | |
2196 } | |
2197 | |
2198 | |
2199 /************************************************************************/ | |
2200 /* Charcount/Bytecount conversion */ | |
2201 /************************************************************************/ | |
2202 | |
2203 /* Optimization. Do it. Live it. Love it. */ | |
2204 | |
2205 #ifdef MULE | |
2206 | |
826 | 2207 /* Function equivalents of bytecount_to_charcount/charcount_to_bytecount. |
2208 These work on strings of all sizes but are more efficient than a simple | |
2209 loop on large strings and probably less efficient on sufficiently small | |
2210 strings. */ | |
2211 | |
2212 Charcount | |
867 | 2213 bytecount_to_charcount_fun (const Ibyte *ptr, Bytecount len) |
826 | 2214 { |
2215 Charcount count = 0; | |
867 | 2216 const Ibyte *end = ptr + len; |
826 | 2217 while (1) |
2218 { | |
867 | 2219 const Ibyte *newptr = skip_ascii (ptr, end); |
826 | 2220 count += newptr - ptr; |
2221 ptr = newptr; | |
2222 if (ptr == end) | |
2223 break; | |
2224 { | |
2225 /* Optimize for successive characters from the same charset */ | |
867 | 2226 Ibyte leading_byte = *ptr; |
826 | 2227 int bytes = rep_bytes_by_first_byte (leading_byte); |
2228 while (ptr < end && *ptr == leading_byte) | |
2229 ptr += bytes, count++; | |
2230 } | |
771 | 2231 } |
2232 | |
2233 /* Bomb out if the specified substring ends in the middle | |
2234 of a character. Note that we might have already gotten | |
2235 a core dump above from an invalid reference, but at least | |
2236 we will get no farther than here. | |
2237 | |
2238 This also catches len < 0. */ | |
800 | 2239 text_checking_assert (ptr == end); |
771 | 2240 |
2241 return count; | |
2242 } | |
2243 | |
2244 Bytecount | |
867 | 2245 charcount_to_bytecount_fun (const Ibyte *ptr, Charcount len) |
771 | 2246 { |
867 | 2247 const Ibyte *newptr = ptr; |
826 | 2248 while (1) |
771 | 2249 { |
867 | 2250 const Ibyte *newnewptr = skip_ascii (newptr, newptr + len); |
826 | 2251 len -= newnewptr - newptr; |
2252 newptr = newnewptr; | |
2253 if (!len) | |
2254 break; | |
2255 { | |
2256 /* Optimize for successive characters from the same charset */ | |
867 | 2257 Ibyte leading_byte = *newptr; |
826 | 2258 int bytes = rep_bytes_by_first_byte (leading_byte); |
2259 while (len > 0 && *newptr == leading_byte) | |
2260 newptr += bytes, len--; | |
2261 } | |
771 | 2262 } |
2263 return newptr - ptr; | |
2264 } | |
2265 | |
2367 | 2266 /* Function equivalent of charcount_to_bytecount_down. This works on strings |
2267 of all sizes but is more efficient than a simple loop on large strings | |
2268 and probably less efficient on sufficiently small strings. */ | |
2269 | |
2270 Bytecount | |
2271 charcount_to_bytecount_down_fun (const Ibyte *ptr, Charcount len) | |
2272 { | |
2273 const Ibyte *newptr = ptr; | |
2274 while (1) | |
2275 { | |
2276 const Ibyte *newnewptr = skip_ascii_down (newptr, newptr - len); | |
2277 len -= newptr - newnewptr; | |
2278 newptr = newnewptr; | |
2279 /* Skip over all non-ASCII chars, counting the length and | |
2280 stopping if it's zero */ | |
2281 while (len && !byte_ascii_p (*(newptr - 1))) | |
2282 if (ibyte_first_byte_p (*--newptr)) | |
2283 len--; | |
2284 if (!len) | |
2285 break; | |
2286 } | |
2287 text_checking_assert (ptr - newptr >= 0); | |
2288 return ptr - newptr; | |
2289 } | |
2290 | |
771 | 2291 /* The next two functions are the actual meat behind the |
2292 charbpos-to-bytebpos and bytebpos-to-charbpos conversions. Currently | |
2293 the method they use is fairly unsophisticated; see buffer.h. | |
2294 | |
2295 Note that charbpos_to_bytebpos_func() is probably the most-called | |
2296 function in all of XEmacs. Therefore, it must be FAST FAST FAST. | |
2297 This is the reason why so much of the code is duplicated. | |
2298 | |
2299 Similar considerations apply to bytebpos_to_charbpos_func(), although | |
2300 less so because the function is not called so often. | |
2367 | 2301 */ |
2302 | |
2303 /* | |
2304 | |
2305 Info on Byte-Char conversion: | |
2306 | |
2307 (Info-goto-node "(internals)Byte-Char Position Conversion") | |
2308 */ | |
2309 | |
2310 #ifdef OLD_BYTE_CHAR | |
771 | 2311 static int not_very_random_number; |
2367 | 2312 #endif /* OLD_BYTE_CHAR */ |
2313 | |
2314 #define OLD_LOOP | |
2315 | |
2316 /* If we are this many characters away from any known position, cache the | |
2317 new position in the buffer's char-byte cache. */ | |
2318 #define FAR_AWAY_DISTANCE 5000 | |
2319 | |
2320 /* Converting between character positions and byte positions. */ | |
2321 | |
2322 /* There are several places in the buffer where we know | |
2323 the correspondence: BEG, BEGV, PT, GPT, ZV and Z, | |
2324 and everywhere there is a marker. So we find the one of these places | |
2325 that is closest to the specified position, and scan from there. */ | |
2326 | |
/* Helper macro for charbpos_to_bytebpos_func.  It offers one known
   (CHARPOS, BYTEPOS) pair as an anchor for locating character position
   `x', using the caller's locals x, retval, best_above, best_above_byte,
   best_below and best_below_byte, and the caller's label `done'.

   - If CHARPOS is exactly x, the answer is BYTEPOS.
   - Otherwise the pair replaces the current upper or lower anchor when it
     lies closer to x.
   - Whenever an anchor moves and the span between the two anchors is as
     many bytes as characters, the answer is interpolated directly.

   BYTEPOS is deliberately evaluated only on the paths that need its
   value. */

#define CONSIDER(CHARPOS, BYTEPOS)                                      \
  do                                                                    \
    {                                                                   \
      const Charbpos candidate_charpos = (CHARPOS);                     \
      int anchor_moved = 0;                                             \
                                                                        \
      if (candidate_charpos == x)                                       \
        {                                                               \
          retval = (BYTEPOS);                                           \
          goto done;                                                    \
        }                                                               \
                                                                        \
      if (candidate_charpos > x)                                        \
        {                                                               \
          if (candidate_charpos < best_above)                           \
            {                                                           \
              best_above = candidate_charpos;                           \
              best_above_byte = (BYTEPOS);                              \
              anchor_moved = 1;                                         \
            }                                                           \
        }                                                               \
      else if (candidate_charpos > best_below)                          \
        {                                                               \
          best_below = candidate_charpos;                               \
          best_below_byte = (BYTEPOS);                                  \
          anchor_moved = 1;                                             \
        }                                                               \
                                                                        \
      if (anchor_moved                                                  \
          && (best_above - best_below                                   \
              == best_above_byte - best_below_byte))                    \
        {                                                               \
          retval = best_below_byte + (x - best_below);                  \
          goto done;                                                    \
        }                                                               \
    }                                                                   \
  while (0)
2368 | |
771 | 2369 |
2370 Bytebpos | |
2371 charbpos_to_bytebpos_func (struct buffer *buf, Charbpos x) | |
2372 { | |
2367 | 2373 #ifdef OLD_BYTE_CHAR |
771 | 2374 Charbpos bufmin; |
2375 Charbpos bufmax; | |
2376 Bytebpos bytmin; | |
2377 Bytebpos bytmax; | |
2378 int size; | |
2379 int forward_p; | |
2380 int diff_so_far; | |
2381 int add_to_cache = 0; | |
2367 | 2382 #endif /* OLD_BYTE_CHAR */ |
2383 | |
2384 Charbpos best_above, best_below; | |
2385 Bytebpos best_above_byte, best_below_byte; | |
2386 int i; | |
2387 struct buffer_text *t; | |
2388 Bytebpos retval; | |
2389 | |
1292 | 2390 PROFILE_DECLARE (); |
771 | 2391 |
1292 | 2392 PROFILE_RECORD_ENTERING_SECTION (QSin_char_byte_conversion); |
2393 | |
2367 | 2394 best_above = BUF_Z (buf); |
2395 best_above_byte = BYTE_BUF_Z (buf); | |
2396 | |
2397 /* In this case, we simply have all one-byte characters. But this should | |
2398 have been intercepted before, in charbpos_to_bytebpos(). */ | |
2399 text_checking_assert (best_above != best_above_byte); | |
2400 | |
2401 best_below = BUF_BEG (buf); | |
2402 best_below_byte = BYTE_BUF_BEG (buf); | |
2403 | |
2404 /* We find in best_above and best_above_byte | |
2405 the closest known point above CHARPOS, | |
2406 and in best_below and best_below_byte | |
2407 the closest known point below CHARPOS, | |
2408 | |
2409 If at any point we can tell that the space between those | |
2410 two best approximations is all single-byte, | |
2411 we interpolate the result immediately. */ | |
2412 | |
2413 CONSIDER (BUF_PT (buf), BYTE_BUF_PT (buf)); | |
2414 CONSIDER (BUF_GPT (buf), BYTE_BUF_GPT (buf)); | |
2415 CONSIDER (BUF_BEGV (buf), BYTE_BUF_BEGV (buf)); | |
2416 CONSIDER (BUF_ZV (buf), BYTE_BUF_ZV (buf)); | |
2417 | |
2418 t = buf->text; | |
2419 CONSIDER (t->cached_charpos, t->cached_bytepos); | |
2420 | |
2421 /* Check the most recently entered positions first */ | |
2422 | |
2423 for (i = t->next_cache_pos - 1; i >= 0; i--) | |
2424 { | |
2425 CONSIDER (t->mule_charbpos_cache[i], t->mule_bytebpos_cache[i]); | |
2426 | |
2427 /* If we are down to a range of 50 chars, | |
2428 don't bother checking any other markers; | |
2429 scan the intervening chars directly now. */ | |
2430 if (best_above - best_below < 50) | |
2431 break; | |
2432 } | |
2433 | |
2434 /* We get here if we did not exactly hit one of the known places. | |
2435 We have one known above and one known below. | |
2436 Scan, counting characters, from whichever one is closer. */ | |
2437 | |
2438 if (x - best_below < best_above - x) | |
2439 { | |
2440 int record = x - best_below > FAR_AWAY_DISTANCE; | |
2441 | |
2442 #ifdef OLD_LOOP /* old code */ | |
2443 while (best_below != x) | |
2444 { | |
2445 best_below++; | |
2446 INC_BYTEBPOS (buf, best_below_byte); | |
2447 } | |
2448 #else | |
2449 text_checking_assert (BUF_FORMAT (buf) == FORMAT_DEFAULT); | |
2450 /* The gap should not occur between best_below and x, or we will be | |
2451 screwed in using charcount_to_bytecount(). It should not be exactly | |
2452 at x either, because we already should have caught that. */ | |
2453 text_checking_assert | |
2454 (BUF_CEILING_OF_IGNORE_ACCESSIBLE (buf, best_below) > x); | |
2455 | |
2456 /* Using charcount_to_bytecount() is potentially a lot faster than a | |
2457 simple loop using INC_BYTEBPOS() because (a) the checks for gap | |
2458 and buffer format are factored out instead of getting checked | |
2459 every time; (b) the checking goes 4 or 8 bytes at a time in ASCII | |
2460 text. | |
2461 */ | |
2462 best_below_byte += | |
2463 charcount_to_bytecount | |
2464 (BYTE_BUF_BYTE_ADDRESS (buf, best_below_byte), x - best_below); | |
2465 best_below = x; | |
2466 #endif /* 0 */ | |
2467 | |
2468 /* If this position is quite far from the nearest known position, | |
2469 cache the correspondence. | |
2470 | |
2471 NB FSF does this: "... by creating a marker here. | |
2472 It will last until the next GC." | |
2473 */ | |
2474 | |
2475 if (record) | |
2476 { | |
2477 /* If we have run out of positions to record, discard some of the | |
2478 old ones. I used to use a circular buffer, which avoids the | |
2479 need to block-move any memory. But it makes it more difficult | |
2480 to keep track of which positions haven't been used -- commonly | |
2481 we haven't yet filled out anywhere near the whole set of | |
2482 positions and don't want to check them all. We should not be | |
2483 recording that often, and block-moving is extremely fast in | |
2484 any case. --ben */ | |
2485 if (t->next_cache_pos == NUM_CACHED_POSITIONS) | |
2486 { | |
2487 memmove (t->mule_charbpos_cache, | |
2488 t->mule_charbpos_cache + NUM_MOVED_POSITIONS, | |
2489 sizeof (Charbpos) * | |
2490 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS)); | |
2491 memmove (t->mule_bytebpos_cache, | |
2492 t->mule_bytebpos_cache + NUM_MOVED_POSITIONS, | |
2493 sizeof (Bytebpos) * | |
2494 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS)); | |
2495 t->next_cache_pos -= NUM_MOVED_POSITIONS; | |
2496 } | |
2497 t->mule_charbpos_cache[t->next_cache_pos] = best_below; | |
2498 t->mule_bytebpos_cache[t->next_cache_pos] = best_below_byte; | |
2499 t->next_cache_pos++; | |
2500 } | |
2501 | |
2502 t->cached_charpos = best_below; | |
2503 t->cached_bytepos = best_below_byte; | |
2504 | |
2505 retval = best_below_byte; | |
2506 text_checking_assert (best_below_byte >= best_below); | |
2507 goto done; | |
2508 } | |
2509 else | |
2510 { | |
2511 int record = best_above - x > FAR_AWAY_DISTANCE; | |
2512 | |
2513 #ifdef OLD_LOOP | |
2514 while (best_above != x) | |
2515 { | |
2516 best_above--; | |
2517 DEC_BYTEBPOS (buf, best_above_byte); | |
2518 } | |
2519 #else | |
2520 text_checking_assert (BUF_FORMAT (buf) == FORMAT_DEFAULT); | |
2521 /* The gap should not occur between best_above and x, or we will be | |
2522 screwed in using charcount_to_bytecount_down(). It should not be | |
2523 exactly at x either, because we already should have caught | |
2524 that. */ | |
2525 text_checking_assert | |
2526 (BUF_FLOOR_OF_IGNORE_ACCESSIBLE (buf, best_above) < x); | |
2527 | |
2528 /* Using charcount_to_bytecount_down() is potentially a lot faster | |
2529 than a simple loop using DEC_BYTEBPOS(); see above. */ | |
2530 best_above_byte -= | |
2531 charcount_to_bytecount_down | |
2532 /* BYTE_BUF_BYTE_ADDRESS will return a value on the high side of the | |
2533 gap if we are at the gap, which is the wrong side. So do the | |
2534 following trick instead. */ | |
2535 (BYTE_BUF_BYTE_ADDRESS_BEFORE (buf, best_above_byte) + 1, | |
2536 best_above - x); | |
2537 best_above = x; | |
2538 #endif /* SLEDGEHAMMER_CHECK_TEXT */ | |
2539 | |
2540 | |
2541 /* If this position is quite far from the nearest known position, | |
2542 cache the correspondence. | |
2543 | |
2544 NB FSF does this: "... by creating a marker here. | |
2545 It will last until the next GC." | |
2546 */ | |
2547 if (record) | |
2548 { | |
2549 if (t->next_cache_pos == NUM_CACHED_POSITIONS) | |
2550 { | |
2551 memmove (t->mule_charbpos_cache, | |
2552 t->mule_charbpos_cache + NUM_MOVED_POSITIONS, | |
2553 sizeof (Charbpos) * | |
2554 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS)); | |
2555 memmove (t->mule_bytebpos_cache, | |
2556 t->mule_bytebpos_cache + NUM_MOVED_POSITIONS, | |
2557 sizeof (Bytebpos) * | |
2558 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS)); | |
2559 t->next_cache_pos -= NUM_MOVED_POSITIONS; | |
2560 } | |
2561 t->mule_charbpos_cache[t->next_cache_pos] = best_above; | |
2562 t->mule_bytebpos_cache[t->next_cache_pos] = best_above_byte; | |
2563 t->next_cache_pos++; | |
2564 } | |
2565 | |
2566 t->cached_charpos = best_above; | |
2567 t->cached_bytepos = best_above_byte; | |
2568 | |
2569 retval = best_above_byte; | |
2570 text_checking_assert (best_above_byte >= best_above); | |
2571 goto done; | |
2572 } | |
2573 | |
2574 #ifdef OLD_BYTE_CHAR | |
2575 | |
771 | 2576 bufmin = buf->text->mule_bufmin; |
2577 bufmax = buf->text->mule_bufmax; | |
2578 bytmin = buf->text->mule_bytmin; | |
2579 bytmax = buf->text->mule_bytmax; | |
2580 size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p; | |
2581 | |
2582 /* The basic idea here is that we shift the "known region" up or down | |
2583 until it overlaps the specified position. We do this by moving | |
2584 the upper bound of the known region up one character at a time, | |
2585 and moving the lower bound of the known region up as necessary | |
2586 when the size of the character just seen changes. | |
2587 | |
2588 We optimize this, however, by first shifting the known region to | |
2589 one of the cached points if it's close by. (We don't check BEG or | |
2590 Z, even though they're cached; most of the time these will be the | |
2591 same as BEGV and ZV, and when they're not, they're not likely | |
2592 to be used.) */ | |
2593 | |
2594 if (x > bufmax) | |
2595 { | |
2596 Charbpos diffmax = x - bufmax; | |
2597 Charbpos diffpt = x - BUF_PT (buf); | |
2598 Charbpos diffzv = BUF_ZV (buf) - x; | |
2599 /* #### This value could stand some more exploration. */ | |
2600 Charcount heuristic_hack = (bufmax - bufmin) >> 2; | |
2601 | |
2602 /* Check if the position is closer to PT or ZV than to the | |
2603 end of the known region. */ | |
2604 | |
2605 if (diffpt < 0) | |
2606 diffpt = -diffpt; | |
2607 if (diffzv < 0) | |
2608 diffzv = -diffzv; | |
2609 | |
2610 /* But also implement a heuristic that favors the known region | |
2611 over PT or ZV. The reason for this is that switching to | |
2612 PT or ZV will wipe out the knowledge in the known region, | |
2613 which might be annoying if the known region is large and | |
2614 PT or ZV is not that much closer than the end of the known | |
2615 region. */ | |
2616 | |
2617 diffzv += heuristic_hack; | |
2618 diffpt += heuristic_hack; | |
2619 if (diffpt < diffmax && diffpt <= diffzv) | |
2620 { | |
2621 bufmax = bufmin = BUF_PT (buf); | |
826 | 2622 bytmax = bytmin = BYTE_BUF_PT (buf); |
771 | 2623 /* We set the size to 1 even though it doesn't really |
2624 matter because the new known region contains no | |
2625 characters. We do this because this is the most | |
2626 likely size of the characters around the new known | |
2627 region, and we avoid potential yuckiness that is | |
2628 done when size == 3. */ | |
2629 size = 1; | |
2630 } | |
2631 if (diffzv < diffmax) | |
2632 { | |
2633 bufmax = bufmin = BUF_ZV (buf); | |
826 | 2634 bytmax = bytmin = BYTE_BUF_ZV (buf); |
771 | 2635 size = 1; |
2636 } | |
2637 } | |
800 | 2638 #ifdef ERROR_CHECK_TEXT |
771 | 2639 else if (x >= bufmin) |
2500 | 2640 ABORT (); |
771 | 2641 #endif |
2642 else | |
2643 { | |
2644 Charbpos diffmin = bufmin - x; | |
2645 Charbpos diffpt = BUF_PT (buf) - x; | |
2646 Charbpos diffbegv = x - BUF_BEGV (buf); | |
2647 /* #### This value could stand some more exploration. */ | |
2648 Charcount heuristic_hack = (bufmax - bufmin) >> 2; | |
2649 | |
2650 if (diffpt < 0) | |
2651 diffpt = -diffpt; | |
2652 if (diffbegv < 0) | |
2653 diffbegv = -diffbegv; | |
2654 | |
2655 /* But also implement a heuristic that favors the known region -- | |
2656 see above. */ | |
2657 | |
2658 diffbegv += heuristic_hack; | |
2659 diffpt += heuristic_hack; | |
2660 | |
2661 if (diffpt < diffmin && diffpt <= diffbegv) | |
2662 { | |
2663 bufmax = bufmin = BUF_PT (buf); | |
826 | 2664 bytmax = bytmin = BYTE_BUF_PT (buf); |
771 | 2665 /* We set the size to 1 even though it doesn't really |
2666 matter because the new known region contains no | |
2667 characters. We do this because this is the most | |
2668 likely size of the characters around the new known | |
2669 region, and we avoid potential yuckiness that is | |
2670 done when size == 3. */ | |
2671 size = 1; | |
2672 } | |
2673 if (diffbegv < diffmin) | |
2674 { | |
2675 bufmax = bufmin = BUF_BEGV (buf); | |
826 | 2676 bytmax = bytmin = BYTE_BUF_BEGV (buf); |
771 | 2677 size = 1; |
2678 } | |
2679 } | |
2680 | |
2681 diff_so_far = x > bufmax ? x - bufmax : bufmin - x; | |
2682 if (diff_so_far > 50) | |
2683 { | |
2684 /* If we have to move more than a certain amount, then look | |
2685 into our cache. */ | |
2686 int minval = INT_MAX; | |
2687 int found = 0; | |
2688 int i; | |
2689 | |
2690 add_to_cache = 1; | |
2691 /* I considered keeping the positions ordered. This would speed | |
2692 up this loop, but updating the cache would take longer, so | |
2693 it doesn't seem like it would really matter. */ | |
2367 | 2694 for (i = 0; i < NUM_CACHED_POSITIONS; i++) |
771 | 2695 { |
2696 int diff = buf->text->mule_charbpos_cache[i] - x; | |
2697 | |
2698 if (diff < 0) | |
2699 diff = -diff; | |
2700 if (diff < minval) | |
2701 { | |
2702 minval = diff; | |
2703 found = i; | |
2704 } | |
2705 } | |
2706 | |
2707 if (minval < diff_so_far) | |
2708 { | |
2709 bufmax = bufmin = buf->text->mule_charbpos_cache[found]; | |
2710 bytmax = bytmin = buf->text->mule_bytebpos_cache[found]; | |
2711 size = 1; | |
2712 } | |
2713 } | |
2714 | |
2715 /* It's conceivable that the caching above could lead to X being | |
2716 the same as one of the range edges. */ | |
2717 if (x >= bufmax) | |
2718 { | |
2719 Bytebpos newmax; | |
2720 Bytecount newsize; | |
2721 | |
2722 forward_p = 1; | |
2723 while (x > bufmax) | |
2724 { | |
2725 newmax = bytmax; | |
2726 | |
2727 INC_BYTEBPOS (buf, newmax); | |
2728 newsize = newmax - bytmax; | |
2729 if (newsize != size) | |
2730 { | |
2731 bufmin = bufmax; | |
2732 bytmin = bytmax; | |
2733 size = newsize; | |
2734 } | |
2735 bytmax = newmax; | |
2736 bufmax++; | |
2737 } | |
2738 retval = bytmax; | |
2739 | |
2740 /* #### Should go past the found location to reduce the number | |
2741 of times that this function is called */ | |
2742 } | |
2743 else /* x < bufmin */ | |
2744 { | |
2745 Bytebpos newmin; | |
2746 Bytecount newsize; | |
2747 | |
2748 forward_p = 0; | |
2749 while (x < bufmin) | |
2750 { | |
2751 newmin = bytmin; | |
2752 | |
2753 DEC_BYTEBPOS (buf, newmin); | |
2754 newsize = bytmin - newmin; | |
2755 if (newsize != size) | |
2756 { | |
2757 bufmax = bufmin; | |
2758 bytmax = bytmin; | |
2759 size = newsize; | |
2760 } | |
2761 bytmin = newmin; | |
2762 bufmin--; | |
2763 } | |
2764 retval = bytmin; | |
2765 | |
2766 /* #### Should go past the found location to reduce the number | |
2767 of times that this function is called | |
2768 */ | |
2769 } | |
2770 | |
2771 /* If size is three, than we have to max sure that the range we | |
2772 discovered isn't too large, because we use a fixed-length | |
2773 table to divide by 3. */ | |
2774 | |
2775 if (size == 3) | |
2776 { | |
2777 int gap = bytmax - bytmin; | |
2778 buf->text->mule_three_p = 1; | |
2779 buf->text->mule_shifter = 1; | |
2780 | |
2781 if (gap > MAX_BYTEBPOS_GAP_SIZE_3) | |
2782 { | |
2783 if (forward_p) | |
2784 { | |
2785 bytmin = bytmax - MAX_BYTEBPOS_GAP_SIZE_3; | |
2786 bufmin = bufmax - MAX_CHARBPOS_GAP_SIZE_3; | |
2787 } | |
2788 else | |
2789 { | |
2790 bytmax = bytmin + MAX_BYTEBPOS_GAP_SIZE_3; | |
2791 bufmax = bufmin + MAX_CHARBPOS_GAP_SIZE_3; | |
2792 } | |
2793 } | |
2794 } | |
2795 else | |
2796 { | |
2797 buf->text->mule_three_p = 0; | |
2798 if (size == 4) | |
2799 buf->text->mule_shifter = 2; | |
2800 else | |
2801 buf->text->mule_shifter = size - 1; | |
2802 } | |
2803 | |
2804 buf->text->mule_bufmin = bufmin; | |
2805 buf->text->mule_bufmax = bufmax; | |
2806 buf->text->mule_bytmin = bytmin; | |
2807 buf->text->mule_bytmax = bytmax; | |
2808 | |
2809 if (add_to_cache) | |
2810 { | |
2811 int replace_loc; | |
2812 | |
2813 /* We throw away a "random" cached value and replace it with | |
2814 the new value. It doesn't actually have to be very random | |
2815 at all, just evenly distributed. | |
2816 | |
2817 #### It would be better to use a least-recently-used algorithm | |
2818 or something that tries to space things out, but I'm not sure | |
2819 it's worth it to go to the trouble of maintaining that. */ | |
2820 not_very_random_number += 621; | |
2821 replace_loc = not_very_random_number & 15; | |
2822 buf->text->mule_charbpos_cache[replace_loc] = x; | |
2823 buf->text->mule_bytebpos_cache[replace_loc] = retval; | |
2824 } | |
2825 | |
2367 | 2826 #endif /* OLD_BYTE_CHAR */ |
2827 | |
2828 done: | |
1292 | 2829 PROFILE_RECORD_EXITING_SECTION (QSin_char_byte_conversion); |
2830 | |
771 | 2831 return retval; |
2832 } | |
2833 | |
2367 | 2834 #undef CONSIDER |
2835 | |
2836 /* bytepos_to_charpos returns the char position corresponding to BYTEPOS. */ | |
2837 | |
/* Helper macro for bytebpos_to_charbpos_func.  It offers one known
   (BYTEPOS, CHARPOS) pair as an anchor for locating byte position `x',
   using the caller's locals x, retval, best_above, best_above_byte,
   best_below and best_below_byte, and the caller's label `done'.

   - If BYTEPOS is exactly x, the answer is CHARPOS.
   - Otherwise the pair replaces the current upper or lower anchor when it
     lies closer to x.
   - Whenever an anchor moves and the span between the two anchors is as
     many bytes as characters, the answer is interpolated directly.

   CHARPOS is evaluated only on the paths that need its value. */

#define CONSIDER(BYTEPOS, CHARPOS)                                      \
  do                                                                    \
    {                                                                   \
      const Bytebpos candidate_bytepos = (BYTEPOS);                     \
      int anchor_moved = 0;                                             \
                                                                        \
      if (candidate_bytepos == x)                                       \
        {                                                               \
          retval = (CHARPOS);                                           \
          goto done;                                                    \
        }                                                               \
                                                                        \
      if (candidate_bytepos > x)                                        \
        {                                                               \
          if (candidate_bytepos < best_above_byte)                      \
            {                                                           \
              best_above = (CHARPOS);                                   \
              best_above_byte = candidate_bytepos;                      \
              anchor_moved = 1;                                         \
            }                                                           \
        }                                                               \
      else if (candidate_bytepos > best_below_byte)                     \
        {                                                               \
          best_below = (CHARPOS);                                       \
          best_below_byte = candidate_bytepos;                          \
          anchor_moved = 1;                                             \
        }                                                               \
                                                                        \
      if (anchor_moved                                                  \
          && (best_above - best_below                                   \
              == best_above_byte - best_below_byte))                    \
        {                                                               \
          retval = best_below + (x - best_below_byte);                  \
          goto done;                                                    \
        }                                                               \
    }                                                                   \
  while (0)
2878 | |
771 | 2879 /* The logic in this function is almost identical to the logic in |
2880 the previous function. */ | |
2881 | |
2882 Charbpos | |
2883 bytebpos_to_charbpos_func (struct buffer *buf, Bytebpos x) | |
2884 { | |
2367 | 2885 #ifdef OLD_BYTE_CHAR |
771 | 2886 Charbpos bufmin; |
2887 Charbpos bufmax; | |
2888 Bytebpos bytmin; | |
2889 Bytebpos bytmax; | |
2890 int size; | |
2891 int forward_p; | |
2892 int diff_so_far; | |
2893 int add_to_cache = 0; | |
2367 | 2894 #endif /* OLD_BYTE_CHAR */ |
2895 | |
2896 Charbpos best_above, best_above_byte; | |
2897 Bytebpos best_below, best_below_byte; | |
2898 int i; | |
2899 struct buffer_text *t; | |
2900 Charbpos retval; | |
2901 | |
1292 | 2902 PROFILE_DECLARE (); |
771 | 2903 |
1292 | 2904 PROFILE_RECORD_ENTERING_SECTION (QSin_char_byte_conversion); |
2905 | |
2367 | 2906 best_above = BUF_Z (buf); |
2907 best_above_byte = BYTE_BUF_Z (buf); | |
2908 | |
2909 /* In this case, we simply have all one-byte characters. But this should | |
2910 have been intercepted before, in bytebpos_to_charbpos(). */ | |
2911 text_checking_assert (best_above != best_above_byte); | |
2912 | |
2913 best_below = BUF_BEG (buf); | |
2914 best_below_byte = BYTE_BUF_BEG (buf); | |
2915 | |
2916 CONSIDER (BYTE_BUF_PT (buf), BUF_PT (buf)); | |
2917 CONSIDER (BYTE_BUF_GPT (buf), BUF_GPT (buf)); | |
2918 CONSIDER (BYTE_BUF_BEGV (buf), BUF_BEGV (buf)); | |
2919 CONSIDER (BYTE_BUF_ZV (buf), BUF_ZV (buf)); | |
2920 | |
2921 t = buf->text; | |
2922 CONSIDER (t->cached_bytepos, t->cached_charpos); | |
2923 | |
2924 /* Check the most recently entered positions first */ | |
2925 | |
2926 for (i = t->next_cache_pos - 1; i >= 0; i--) | |
2927 { | |
2928 CONSIDER (t->mule_bytebpos_cache[i], t->mule_charbpos_cache[i]); | |
2929 | |
2930 /* If we are down to a range of 50 chars, | |
2931 don't bother checking any other markers; | |
2932 scan the intervening chars directly now. */ | |
2933 if (best_above - best_below < 50) | |
2934 break; | |
2935 } | |
2936 | |
2937 /* We get here if we did not exactly hit one of the known places. | |
2938 We have one known above and one known below. | |
2939 Scan, counting characters, from whichever one is closer. */ | |
2940 | |
2941 if (x - best_below_byte < best_above_byte - x) | |
2942 { | |
2943 int record = x - best_below_byte > 5000; | |
2944 | |
2945 #ifdef OLD_LOOP /* old code */ | |
4526
38493c0fb952
Fix accidental deletion in src/text.c.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4525
diff
changeset
|
2946 while (best_below_byte < x) |
2367 | 2947 { |
2948 best_below++; | |
2949 INC_BYTEBPOS (buf, best_below_byte); | |
2950 } | |
2951 #else | |
2952 text_checking_assert (BUF_FORMAT (buf) == FORMAT_DEFAULT); | |
2953 /* The gap should not occur between best_below and x, or we will be | |
2954 screwed in using charcount_to_bytecount(). It should not be exactly | |
2955 at x either, because we already should have caught that. */ | |
2956 text_checking_assert | |
2957 (BYTE_BUF_CEILING_OF_IGNORE_ACCESSIBLE (buf, best_below_byte) > x); | |
2958 | |
2959 /* Using bytecount_to_charcount() is potentially a lot faster than | |
2960 a simple loop above using INC_BYTEBPOS(); see above. | |
2961 */ | |
2962 best_below += | |
2963 bytecount_to_charcount | |
2964 (BYTE_BUF_BYTE_ADDRESS (buf, best_below_byte), x - best_below_byte); | |
2965 best_below_byte = x; | |
2966 #endif | |
2967 | |
2968 /* If this position is quite far from the nearest known position, | |
2969 cache the correspondence. | |
2970 | |
2971 NB FSF does this: "... by creating a marker here. | |
2972 It will last until the next GC." | |
2973 */ | |
2974 | |
2975 if (record) | |
2976 { | |
2977 if (t->next_cache_pos == NUM_CACHED_POSITIONS) | |
2978 { | |
2979 memmove (t->mule_charbpos_cache, | |
2980 t->mule_charbpos_cache + NUM_MOVED_POSITIONS, | |
2981 sizeof (Charbpos) * | |
2982 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS)); | |
2983 memmove (t->mule_bytebpos_cache, | |
2984 t->mule_bytebpos_cache + NUM_MOVED_POSITIONS, | |
2985 sizeof (Bytebpos) * | |
2986 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS)); | |
2987 t->next_cache_pos -= NUM_MOVED_POSITIONS; | |
2988 } | |
2989 t->mule_charbpos_cache[t->next_cache_pos] = best_below; | |
2990 t->mule_bytebpos_cache[t->next_cache_pos] = best_below_byte; | |
2991 t->next_cache_pos++; | |
2992 } | |
2993 | |
2994 | |
2995 t->cached_charpos = best_below; | |
2996 t->cached_bytepos = best_below_byte; | |
2997 | |
2998 retval = best_below; | |
2999 text_checking_assert (best_below_byte >= best_below); | |
3000 goto done; | |
3001 } | |
3002 else | |
3003 { | |
3004 int record = best_above_byte - x > 5000; | |
3005 | |
3006 #ifdef OLD_LOOP /* old code */ | |
3007 while (best_above_byte > x) | |
3008 { | |
3009 best_above--; | |
3010 DEC_BYTEBPOS (buf, best_above_byte); | |
3011 } | |
3012 #else | |
3013 text_checking_assert (BUF_FORMAT (buf) == FORMAT_DEFAULT); | |
3014 /* The gap should not occur between best_above and x, or we will be | |
3015 screwed in using bytecount_to_charcount_down(). It should not be | |
3016 exactly at x either, because we already should have caught | |
3017 that. */ | |
3018 text_checking_assert | |
3019 (BYTE_BUF_FLOOR_OF_IGNORE_ACCESSIBLE (buf, best_above_byte) < x); | |
3020 | |
3021 /* Using bytecount_to_charcount_down() is potentially a lot faster | |
3022 than a simple loop using INC_BYTEBPOS(); see above. */ | |
3023 best_above -= | |
3024 bytecount_to_charcount_down | |
3025 /* BYTE_BUF_BYTE_ADDRESS will return a value on the high side of the | |
3026 gap if we are at the gap, which is the wrong side. So do the | |
3027 following trick instead. */ | |
3028 (BYTE_BUF_BYTE_ADDRESS_BEFORE (buf, best_above_byte) + 1, | |
3029 best_above_byte - x); | |
3030 best_above_byte = x; | |
3031 #endif | |
3032 | |
3033 | |
3034 /* If this position is quite far from the nearest known position, | |
3035 cache the correspondence. | |
3036 | |
3037 NB FSF does this: "... by creating a marker here. | |
3038 It will last until the next GC." | |
3039 */ | |
3040 if (record) | |
3041 { | |
3042 if (t->next_cache_pos == NUM_CACHED_POSITIONS) | |
3043 { | |
3044 memmove (t->mule_charbpos_cache, | |
3045 t->mule_charbpos_cache + NUM_MOVED_POSITIONS, | |
3046 sizeof (Charbpos) * | |
3047 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS)); | |
3048 memmove (t->mule_bytebpos_cache, | |
3049 t->mule_bytebpos_cache + NUM_MOVED_POSITIONS, | |
3050 sizeof (Bytebpos) * | |
3051 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS)); | |
3052 t->next_cache_pos -= NUM_MOVED_POSITIONS; | |
3053 } | |
3054 t->mule_charbpos_cache[t->next_cache_pos] = best_above; | |
3055 t->mule_bytebpos_cache[t->next_cache_pos] = best_above_byte; | |
3056 t->next_cache_pos++; | |
3057 } | |
3058 | |
3059 t->cached_charpos = best_above; | |
3060 t->cached_bytepos = best_above_byte; | |
3061 | |
3062 retval = best_above; | |
3063 text_checking_assert (best_above_byte >= best_above); | |
3064 goto done; | |
3065 } | |
3066 | |
3067 #ifdef OLD_BYTE_CHAR | |
3068 | |
771 | 3069 bufmin = buf->text->mule_bufmin; |
3070 bufmax = buf->text->mule_bufmax; | |
3071 bytmin = buf->text->mule_bytmin; | |
3072 bytmax = buf->text->mule_bytmax; | |
3073 size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p; | |
3074 | |
3075 /* The basic idea here is that we shift the "known region" up or down | |
3076 until it overlaps the specified position. We do this by moving | |
3077 the upper bound of the known region up one character at a time, | |
3078 and moving the lower bound of the known region up as necessary | |
3079 when the size of the character just seen changes. | |
3080 | |
3081 We optimize this, however, by first shifting the known region to | |
826 | 3082 one of the cached points if it's close by. (We don't check BYTE_BEG or |
3083 BYTE_Z, even though they're cached; most of the time these will be the | |
3084 same as BYTE_BEGV and BYTE_ZV, and when they're not, they're not likely | |
771 | 3085 to be used.) */ |
3086 | |
3087 if (x > bytmax) | |
3088 { | |
3089 Bytebpos diffmax = x - bytmax; | |
826 | 3090 Bytebpos diffpt = x - BYTE_BUF_PT (buf); |
3091 Bytebpos diffzv = BYTE_BUF_ZV (buf) - x; | |
771 | 3092 /* #### This value could stand some more exploration. */ |
3093 Bytecount heuristic_hack = (bytmax - bytmin) >> 2; | |
3094 | |
3095 /* Check if the position is closer to PT or ZV than to the | |
3096 end of the known region. */ | |
3097 | |
3098 if (diffpt < 0) | |
3099 diffpt = -diffpt; | |
3100 if (diffzv < 0) | |
3101 diffzv = -diffzv; | |
3102 | |
3103 /* But also implement a heuristic that favors the known region | |
826 | 3104 over BYTE_PT or BYTE_ZV. The reason for this is that switching to |
3105 BYTE_PT or BYTE_ZV will wipe out the knowledge in the known region, | |
771 | 3106 which might be annoying if the known region is large and |
826 | 3107 BYTE_PT or BYTE_ZV is not that much closer than the end of the known |
771 | 3108 region. */ |
3109 | |
3110 diffzv += heuristic_hack; | |
3111 diffpt += heuristic_hack; | |
3112 if (diffpt < diffmax && diffpt <= diffzv) | |
3113 { | |
3114 bufmax = bufmin = BUF_PT (buf); | |
826 | 3115 bytmax = bytmin = BYTE_BUF_PT (buf); |
771 | 3116 /* We set the size to 1 even though it doesn't really |
3117 matter because the new known region contains no | |
3118 characters. We do this because this is the most | |
3119 likely size of the characters around the new known | |
3120 region, and we avoid potential yuckiness that is | |
3121 done when size == 3. */ | |
3122 size = 1; | |
3123 } | |
3124 if (diffzv < diffmax) | |
3125 { | |
3126 bufmax = bufmin = BUF_ZV (buf); | |
826 | 3127 bytmax = bytmin = BYTE_BUF_ZV (buf); |
771 | 3128 size = 1; |
3129 } | |
3130 } | |
800 | 3131 #ifdef ERROR_CHECK_TEXT |
771 | 3132 else if (x >= bytmin) |
2500 | 3133 ABORT (); |
771 | 3134 #endif |
3135 else | |
3136 { | |
3137 Bytebpos diffmin = bytmin - x; | |
826 | 3138 Bytebpos diffpt = BYTE_BUF_PT (buf) - x; |
3139 Bytebpos diffbegv = x - BYTE_BUF_BEGV (buf); | |
771 | 3140 /* #### This value could stand some more exploration. */ |
3141 Bytecount heuristic_hack = (bytmax - bytmin) >> 2; | |
3142 | |
3143 if (diffpt < 0) | |
3144 diffpt = -diffpt; | |
3145 if (diffbegv < 0) | |
3146 diffbegv = -diffbegv; | |
3147 | |
3148 /* But also implement a heuristic that favors the known region -- | |
3149 see above. */ | |
3150 | |
3151 diffbegv += heuristic_hack; | |
3152 diffpt += heuristic_hack; | |
3153 | |
3154 if (diffpt < diffmin && diffpt <= diffbegv) | |
3155 { | |
3156 bufmax = bufmin = BUF_PT (buf); | |
826 | 3157 bytmax = bytmin = BYTE_BUF_PT (buf); |
771 | 3158 /* We set the size to 1 even though it doesn't really |
3159 matter because the new known region contains no | |
3160 characters. We do this because this is the most | |
3161 likely size of the characters around the new known | |
3162 region, and we avoid potential yuckiness that is | |
3163 done when size == 3. */ | |
3164 size = 1; | |
3165 } | |
3166 if (diffbegv < diffmin) | |
3167 { | |
3168 bufmax = bufmin = BUF_BEGV (buf); | |
826 | 3169 bytmax = bytmin = BYTE_BUF_BEGV (buf); |
771 | 3170 size = 1; |
3171 } | |
3172 } | |
3173 | |
3174 diff_so_far = x > bytmax ? x - bytmax : bytmin - x; | |
3175 if (diff_so_far > 50) | |
3176 { | |
3177 /* If we have to move more than a certain amount, then look | |
3178 into our cache. */ | |
3179 int minval = INT_MAX; | |
3180 int found = 0; | |
3181 int i; | |
3182 | |
3183 add_to_cache = 1; | |
3184 /* I considered keeping the positions ordered. This would speed | |
3185 up this loop, but updating the cache would take longer, so | |
3186 it doesn't seem like it would really matter. */ | |
2367 | 3187 for (i = 0; i < NUM_CACHED_POSITIONS; i++) |
771 | 3188 { |
3189 int diff = buf->text->mule_bytebpos_cache[i] - x; | |
3190 | |
3191 if (diff < 0) | |
3192 diff = -diff; | |
3193 if (diff < minval) | |
3194 { | |
3195 minval = diff; | |
3196 found = i; | |
3197 } | |
3198 } | |
3199 | |
3200 if (minval < diff_so_far) | |
3201 { | |
3202 bufmax = bufmin = buf->text->mule_charbpos_cache[found]; | |
3203 bytmax = bytmin = buf->text->mule_bytebpos_cache[found]; | |
3204 size = 1; | |
3205 } | |
3206 } | |
3207 | |
3208 /* It's conceivable that the caching above could lead to X being | |
3209 the same as one of the range edges. */ | |
3210 if (x >= bytmax) | |
3211 { | |
3212 Bytebpos newmax; | |
3213 Bytecount newsize; | |
3214 | |
3215 forward_p = 1; | |
3216 while (x > bytmax) | |
3217 { | |
3218 newmax = bytmax; | |
3219 | |
3220 INC_BYTEBPOS (buf, newmax); | |
3221 newsize = newmax - bytmax; | |
3222 if (newsize != size) | |
3223 { | |
3224 bufmin = bufmax; | |
3225 bytmin = bytmax; | |
3226 size = newsize; | |
3227 } | |
3228 bytmax = newmax; | |
3229 bufmax++; | |
3230 } | |
3231 retval = bufmax; | |
3232 | |
3233 /* #### Should go past the found location to reduce the number | |
3234 of times that this function is called */ | |
3235 } | |
3236 else /* x <= bytmin */ | |
3237 { | |
3238 Bytebpos newmin; | |
3239 Bytecount newsize; | |
3240 | |
3241 forward_p = 0; | |
3242 while (x < bytmin) | |
3243 { | |
3244 newmin = bytmin; | |
3245 | |
3246 DEC_BYTEBPOS (buf, newmin); | |
3247 newsize = bytmin - newmin; | |
3248 if (newsize != size) | |
3249 { | |
3250 bufmax = bufmin; | |
3251 bytmax = bytmin; | |
3252 size = newsize; | |
3253 } | |
3254 bytmin = newmin; | |
3255 bufmin--; | |
3256 } | |
3257 retval = bufmin; | |
3258 | |
3259 /* #### Should go past the found location to reduce the number | |
3260 of times that this function is called | |
3261 */ | |
3262 } | |
3263 | |
3264 /* If size is three, then we have to make sure that the range we | |
3265 discovered isn't too large, because we use a fixed-length | |
3266 table to divide by 3. */ | |
3267 | |
3268 if (size == 3) | |
3269 { | |
3270 int gap = bytmax - bytmin; | |
3271 buf->text->mule_three_p = 1; | |
3272 buf->text->mule_shifter = 1; | |
3273 | |
3274 if (gap > MAX_BYTEBPOS_GAP_SIZE_3) | |
3275 { | |
3276 if (forward_p) | |
3277 { | |
3278 bytmin = bytmax - MAX_BYTEBPOS_GAP_SIZE_3; | |
3279 bufmin = bufmax - MAX_CHARBPOS_GAP_SIZE_3; | |
3280 } | |
3281 else | |
3282 { | |
3283 bytmax = bytmin + MAX_BYTEBPOS_GAP_SIZE_3; | |
3284 bufmax = bufmin + MAX_CHARBPOS_GAP_SIZE_3; | |
3285 } | |
3286 } | |
3287 } | |
3288 else | |
3289 { | |
3290 buf->text->mule_three_p = 0; | |
3291 if (size == 4) | |
3292 buf->text->mule_shifter = 2; | |
3293 else | |
3294 buf->text->mule_shifter = size - 1; | |
3295 } | |
3296 | |
3297 buf->text->mule_bufmin = bufmin; | |
3298 buf->text->mule_bufmax = bufmax; | |
3299 buf->text->mule_bytmin = bytmin; | |
3300 buf->text->mule_bytmax = bytmax; | |
3301 | |
3302 if (add_to_cache) | |
3303 { | |
3304 int replace_loc; | |
3305 | |
3306 /* We throw away a "random" cached value and replace it with | |
3307 the new value. It doesn't actually have to be very random | |
3308 at all, just evenly distributed. | |
3309 | |
3310 #### It would be better to use a least-recently-used algorithm | |
3311 or something that tries to space things out, but I'm not sure | |
3312 it's worth it to go to the trouble of maintaining that. */ | |
3313 not_very_random_number += 621; | |
3314 replace_loc = not_very_random_number & 15; | |
3315 buf->text->mule_charbpos_cache[replace_loc] = retval; | |
3316 buf->text->mule_bytebpos_cache[replace_loc] = x; | |
3317 } | |
2367 | 3318 #endif /* OLD_BYTE_CHAR */ |
3319 | |
3320 done: | |
1292 | 3321 PROFILE_RECORD_EXITING_SECTION (QSin_char_byte_conversion); |
3322 | |
771 | 3323 return retval; |
3324 } | |
3325 | |
3326 /* Text of length BYTELENGTH and CHARLENGTH (in different units) | |
3327 was inserted at charbpos START. */ | |
3328 | |
3329 void | |
3330 buffer_mule_signal_inserted_region (struct buffer *buf, Charbpos start, | |
3331 Bytecount bytelength, | |
3332 Charcount charlength) | |
3333 { | |
2367 | 3334 #ifdef OLD_BYTE_CHAR |
771 | 3335 int size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p; |
2367 | 3336 #endif /* OLD_BYTE_CHAR */ |
771 | 3337 int i; |
3338 | |
3339 /* Adjust the cache of known positions. */ | |
2367 | 3340 for (i = 0; i < buf->text->next_cache_pos; i++) |
771 | 3341 { |
3342 | |
3343 if (buf->text->mule_charbpos_cache[i] > start) | |
3344 { | |
3345 buf->text->mule_charbpos_cache[i] += charlength; | |
3346 buf->text->mule_bytebpos_cache[i] += bytelength; | |
3347 } | |
3348 } | |
3349 | |
2367 | 3350 /* Adjust the special cached position. */ |
3351 | |
3352 if (buf->text->cached_charpos > start) | |
3353 { | |
3354 buf->text->cached_charpos += charlength; | |
3355 buf->text->cached_bytepos += bytelength; | |
3356 } | |
3357 | |
3358 #ifdef OLD_BYTE_CHAR | |
771 | 3359 if (start >= buf->text->mule_bufmax) |
826 | 3360 return; |
771 | 3361 |
3362 /* The insertion is either before the known region, in which case | |
3363 it shoves it forward; or within the known region, in which case | |
3364 it shoves the end forward. (But it may make the known region | |
3365 inconsistent, so we may have to shorten it.) */ | |
3366 | |
3367 if (start <= buf->text->mule_bufmin) | |
3368 { | |
3369 buf->text->mule_bufmin += charlength; | |
3370 buf->text->mule_bufmax += charlength; | |
3371 buf->text->mule_bytmin += bytelength; | |
3372 buf->text->mule_bytmax += bytelength; | |
3373 } | |
3374 else | |
3375 { | |
3376 Charbpos end = start + charlength; | |
3377 /* the insertion point divides the known region in two. | |
3378 Keep the longer half, at least, and expand into the | |
3379 inserted chunk as much as possible. */ | |
3380 | |
3381 if (start - buf->text->mule_bufmin > buf->text->mule_bufmax - start) | |
3382 { | |
3383 Bytebpos bytestart = (buf->text->mule_bytmin | |
3384 + size * (start - buf->text->mule_bufmin)); | |
3385 Bytebpos bytenew; | |
3386 | |
3387 while (start < end) | |
3388 { | |
3389 bytenew = bytestart; | |
3390 INC_BYTEBPOS (buf, bytenew); | |
3391 if (bytenew - bytestart != size) | |
3392 break; | |
3393 start++; | |
3394 bytestart = bytenew; | |
3395 } | |
3396 if (start != end) | |
3397 { | |
3398 buf->text->mule_bufmax = start; | |
3399 buf->text->mule_bytmax = bytestart; | |
3400 } | |
3401 else | |
3402 { | |
3403 buf->text->mule_bufmax += charlength; | |
3404 buf->text->mule_bytmax += bytelength; | |
3405 } | |
3406 } | |
3407 else | |
3408 { | |
3409 Bytebpos byteend = (buf->text->mule_bytmin | |
3410 + size * (start - buf->text->mule_bufmin) | |
3411 + bytelength); | |
3412 Bytebpos bytenew; | |
3413 | |
3414 buf->text->mule_bufmax += charlength; | |
3415 buf->text->mule_bytmax += bytelength; | |
3416 | |
3417 while (end > start) | |
3418 { | |
3419 bytenew = byteend; | |
3420 DEC_BYTEBPOS (buf, bytenew); | |
3421 if (byteend - bytenew != size) | |
3422 break; | |
3423 end--; | |
3424 byteend = bytenew; | |
3425 } | |
3426 if (start != end) | |
3427 { | |
3428 buf->text->mule_bufmin = end; | |
3429 buf->text->mule_bytmin = byteend; | |
3430 } | |
3431 } | |
3432 } | |
2367 | 3433 #endif /* OLD_BYTE_CHAR */ |
771 | 3434 } |
3435 | |
826 | 3436 /* Text from START to END (equivalent in Bytebpos's: from BYTE_START to |
3437 BYTE_END) was deleted. */ | |
771 | 3438 |
3439 void | |
3440 buffer_mule_signal_deleted_region (struct buffer *buf, Charbpos start, | |
826 | 3441 Charbpos end, Bytebpos byte_start, |
3442 Bytebpos byte_end) | |
771 | 3443 { |
3444 int i; | |
3445 | |
3446 /* Adjust the cache of known positions. */ | |
2367 | 3447 for (i = 0; i < buf->text->next_cache_pos; i++) |
771 | 3448 { |
3449 /* After the end; gets shoved backward */ | |
3450 if (buf->text->mule_charbpos_cache[i] > end) | |
3451 { | |
3452 buf->text->mule_charbpos_cache[i] -= end - start; | |
826 | 3453 buf->text->mule_bytebpos_cache[i] -= byte_end - byte_start; |
771 | 3454 } |
3455 /* In the range; moves to start of range */ | |
3456 else if (buf->text->mule_charbpos_cache[i] > start) | |
3457 { | |
3458 buf->text->mule_charbpos_cache[i] = start; | |
826 | 3459 buf->text->mule_bytebpos_cache[i] = byte_start; |
771 | 3460 } |
3461 } | |
3462 | |
2367 | 3463 /* Adjust the special cached position. */ |
3464 | |
3465 /* After the end; gets shoved backward */ | |
3466 if (buf->text->cached_charpos > end) | |
3467 { | |
3468 buf->text->cached_charpos -= end - start; | |
3469 buf->text->cached_bytepos -= byte_end - byte_start; | |
3470 } | |
3471 /* In the range; moves to start of range */ | |
3472 else if (buf->text->cached_charpos > start) | |
3473 { | |
3474 buf->text->cached_charpos = start; | |
3475 buf->text->cached_bytepos = byte_start; | |
3476 } | |
3477 | |
3478 #ifdef OLD_BYTE_CHAR | |
771 | 3479 /* We don't care about any text after the end of the known region. */ |
3480 | |
3481 end = min (end, buf->text->mule_bufmax); | |
826 | 3482 byte_end = min (byte_end, buf->text->mule_bytmax); |
771 | 3483 if (start >= end) |
826 | 3484 return; |
771 | 3485 |
3486 /* The end of the known region offsets by the total amount of deletion, | |
3487 since it's all before it. */ | |
3488 | |
3489 buf->text->mule_bufmax -= end - start; | |
826 | 3490 buf->text->mule_bytmax -= byte_end - byte_start; |
771 | 3491 |
3492 /* Now we don't care about any text after the start of the known region. */ | |
3493 | |
3494 end = min (end, buf->text->mule_bufmin); | |
826 | 3495 byte_end = min (byte_end, buf->text->mule_bytmin); |
771 | 3496 if (start < end) |
3497 { | |
3498 buf->text->mule_bufmin -= end - start; | |
826 | 3499 buf->text->mule_bytmin -= byte_end - byte_start; |
771 | 3500 } |
2367 | 3501 #endif /* OLD_BYTE_CHAR */ |
771 | 3502 } |
3503 | |
3504 #endif /* MULE */ | |
3505 | |
3506 | |
3507 /************************************************************************/ | |
3508 /* verifying buffer and string positions */ | |
3509 /************************************************************************/ | |
3510 | |
3511 /* Functions below are tagged with either _byte or _char indicating | |
3512 whether they return byte or character positions. For a buffer, | |
3513 a character position is a "Charbpos" and a byte position is a "Bytebpos". | |
3514 For strings, these are sometimes typed using "Charcount" and | |
3515 "Bytecount". */ | |
3516 | |
3517 /* Flags for the functions below are: | |
3518 | |
3519 GB_ALLOW_PAST_ACCESSIBLE | |
3520 | |
3521 Allow positions to range over the entire buffer (BUF_BEG to BUF_Z), | |
3522 rather than just the accessible portion (BUF_BEGV to BUF_ZV). | |
3523 For strings, this flag has no effect. | |
3524 | |
3525 GB_COERCE_RANGE | |
3526 | |
3527 If the position is outside the allowable range, return the lower | |
3528 or upper bound of the range, whichever is closer to the specified | |
3529 position. | |
3530 | |
3531 GB_NO_ERROR_IF_BAD | |
3532 | |
3533 If the position is outside the allowable range, return -1. | |
3534 | |
3535 GB_NEGATIVE_FROM_END | |
3536 | |
3537 If a value is negative, treat it as an offset from the end. | |
3538 Only applies to strings. | |
3539 | |
3540 The following additional flags apply only to the functions | |
3541 that return ranges: | |
3542 | |
3543 GB_ALLOW_NIL | |
3544 | |
3545 Either or both positions can be nil. If FROM is nil, | |
3546 FROM_OUT will contain the lower bound of the allowed range. | |
3547 If TO is nil, TO_OUT will contain the upper bound of the | |
3548 allowed range. | |
3549 | |
3550 GB_CHECK_ORDER | |
3551 | |
3552 FROM must contain the lower bound and TO the upper bound | |
3553 of the range. If the positions are reversed, an error is | |
3554 signalled. | |
3555 | |
3556 The following is a combination flag: | |
3557 | |
3558 GB_HISTORICAL_STRING_BEHAVIOR | |
3559 | |
3560 Equivalent to (GB_NEGATIVE_FROM_END | GB_ALLOW_NIL). | |
3561 */ | |
3562 | |
3563 /* Return a buffer position stored in a Lisp_Object. Full | |
3564 error-checking is done on the position. Flags can be specified to | |
3565 control the behavior of out-of-range values. The default behavior | |
3566 is to require that the position is within the accessible part of | |
3567 the buffer (BEGV and ZV), and to signal an error if the position is | |
3568 out of range. | |
3569 | |
3570 */ | |
3571 | |
3572 Charbpos | |
3573 get_buffer_pos_char (struct buffer *b, Lisp_Object pos, unsigned int flags) | |
3574 { | |
3575 /* Does not GC */ | |
3576 Charbpos ind; | |
3577 Charbpos min_allowed, max_allowed; | |
3578 | |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
3579 CHECK_FIXNUM_COERCE_MARKER (pos); |
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
3580 ind = XFIXNUM (pos); |
771 | 3581 min_allowed = flags & GB_ALLOW_PAST_ACCESSIBLE ? BUF_BEG (b) : BUF_BEGV (b); |
3582 max_allowed = flags & GB_ALLOW_PAST_ACCESSIBLE ? BUF_Z (b) : BUF_ZV (b); | |
3583 | |
3584 if (ind < min_allowed || ind > max_allowed) | |
3585 { | |
3586 if (flags & GB_COERCE_RANGE) | |
3587 ind = ind < min_allowed ? min_allowed : max_allowed; | |
3588 else if (flags & GB_NO_ERROR_IF_BAD) | |
3589 ind = -1; | |
3590 else | |
3591 { | |
793 | 3592 Lisp_Object buffer = wrap_buffer (b); |
3593 | |
771 | 3594 args_out_of_range (buffer, pos); |
3595 } | |
3596 } | |
3597 | |
3598 return ind; | |
3599 } | |
3600 | |
3601 Bytebpos | |
3602 get_buffer_pos_byte (struct buffer *b, Lisp_Object pos, unsigned int flags) | |
3603 { | |
3604 Charbpos bpos = get_buffer_pos_char (b, pos, flags); | |
3605 if (bpos < 0) /* could happen with GB_NO_ERROR_IF_BAD */ | |
3606 return -1; | |
3607 return charbpos_to_bytebpos (b, bpos); | |
3608 } | |
3609 | |
3610 /* Return a pair of buffer positions representing a range of text, | |
3611 taken from a pair of Lisp_Objects. Full error-checking is | |
3612 done on the positions. Flags can be specified to control the | |
3613 behavior of out-of-range values. The default behavior is to | |
3614 allow the range bounds to be specified in either order | |
3615 (however, FROM_OUT will always be the lower bound of the range | |
3616 and TO_OUT the upper bound),to require that the positions | |
3617 are within the accessible part of the buffer (BEGV and ZV), | |
3618 and to signal an error if the positions are out of range. | |
3619 */ | |
3620 | |
3621 void | |
3622 get_buffer_range_char (struct buffer *b, Lisp_Object from, Lisp_Object to, | |
826 | 3623 Charbpos *from_out, Charbpos *to_out, |
3624 unsigned int flags) | |
771 | 3625 { |
3626 /* Does not GC */ | |
3627 Charbpos min_allowed, max_allowed; | |
3628 | |
3629 min_allowed = (flags & GB_ALLOW_PAST_ACCESSIBLE) ? | |
3630 BUF_BEG (b) : BUF_BEGV (b); | |
3631 max_allowed = (flags & GB_ALLOW_PAST_ACCESSIBLE) ? | |
3632 BUF_Z (b) : BUF_ZV (b); | |
3633 | |
3634 if (NILP (from) && (flags & GB_ALLOW_NIL)) | |
3635 *from_out = min_allowed; | |
3636 else | |
3637 *from_out = get_buffer_pos_char (b, from, flags | GB_NO_ERROR_IF_BAD); | |
3638 | |
3639 if (NILP (to) && (flags & GB_ALLOW_NIL)) | |
3640 *to_out = max_allowed; | |
3641 else | |
3642 *to_out = get_buffer_pos_char (b, to, flags | GB_NO_ERROR_IF_BAD); | |
3643 | |
3644 if ((*from_out < 0 || *to_out < 0) && !(flags & GB_NO_ERROR_IF_BAD)) | |
3645 { | |
793 | 3646 Lisp_Object buffer = wrap_buffer (b); |
3647 | |
771 | 3648 args_out_of_range_3 (buffer, from, to); |
3649 } | |
3650 | |
3651 if (*from_out >= 0 && *to_out >= 0 && *from_out > *to_out) | |
3652 { | |
3653 if (flags & GB_CHECK_ORDER) | |
3654 invalid_argument_2 ("start greater than end", from, to); | |
3655 else | |
3656 { | |
3657 Charbpos temp = *from_out; | |
3658 *from_out = *to_out; | |
3659 *to_out = temp; | |
3660 } | |
3661 } | |
3662 } | |
3663 | |
3664 void | |
3665 get_buffer_range_byte (struct buffer *b, Lisp_Object from, Lisp_Object to, | |
826 | 3666 Bytebpos *from_out, Bytebpos *to_out, |
3667 unsigned int flags) | |
771 | 3668 { |
3669 Charbpos s, e; | |
3670 | |
3671 get_buffer_range_char (b, from, to, &s, &e, flags); | |
3672 if (s >= 0) | |
3673 *from_out = charbpos_to_bytebpos (b, s); | |
3674 else /* could happen with GB_NO_ERROR_IF_BAD */ | |
3675 *from_out = -1; | |
3676 if (e >= 0) | |
3677 *to_out = charbpos_to_bytebpos (b, e); | |
3678 else | |
3679 *to_out = -1; | |
3680 } | |
3681 | |
3682 static Charcount | |
3683 get_string_pos_char_1 (Lisp_Object string, Lisp_Object pos, unsigned int flags, | |
3684 Charcount known_length) | |
3685 { | |
3686 Charcount ccpos; | |
3687 Charcount min_allowed = 0; | |
3688 Charcount max_allowed = known_length; | |
3689 | |
3690 /* Computation of KNOWN_LENGTH is potentially expensive so we pass | |
3691 it in. */ | |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
3692 CHECK_FIXNUM (pos); |
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
3693 ccpos = XFIXNUM (pos); |
771 | 3694 if (ccpos < 0 && flags & GB_NEGATIVE_FROM_END) |
3695 ccpos += max_allowed; | |
3696 | |
3697 if (ccpos < min_allowed || ccpos > max_allowed) | |
3698 { | |
3699 if (flags & GB_COERCE_RANGE) | |
3700 ccpos = ccpos < min_allowed ? min_allowed : max_allowed; | |
3701 else if (flags & GB_NO_ERROR_IF_BAD) | |
3702 ccpos = -1; | |
3703 else | |
3704 args_out_of_range (string, pos); | |
3705 } | |
3706 | |
3707 return ccpos; | |
3708 } | |
3709 | |
3710 Charcount | |
3711 get_string_pos_char (Lisp_Object string, Lisp_Object pos, unsigned int flags) | |
3712 { | |
3713 return get_string_pos_char_1 (string, pos, flags, | |
826 | 3714 string_char_length (string)); |
771 | 3715 } |
3716 | |
3717 Bytecount | |
3718 get_string_pos_byte (Lisp_Object string, Lisp_Object pos, unsigned int flags) | |
3719 { | |
3720 Charcount ccpos = get_string_pos_char (string, pos, flags); | |
3721 if (ccpos < 0) /* could happen with GB_NO_ERROR_IF_BAD */ | |
3722 return -1; | |
793 | 3723 return string_index_char_to_byte (string, ccpos); |
771 | 3724 } |
3725 | |
3726 void | |
3727 get_string_range_char (Lisp_Object string, Lisp_Object from, Lisp_Object to, | |
3728 Charcount *from_out, Charcount *to_out, | |
3729 unsigned int flags) | |
3730 { | |
3731 Charcount min_allowed = 0; | |
826 | 3732 Charcount max_allowed = string_char_length (string); |
771 | 3733 |
3734 if (NILP (from) && (flags & GB_ALLOW_NIL)) | |
3735 *from_out = min_allowed; | |
3736 else | |
3737 *from_out = get_string_pos_char_1 (string, from, | |
3738 flags | GB_NO_ERROR_IF_BAD, | |
3739 max_allowed); | |
3740 | |
3741 if (NILP (to) && (flags & GB_ALLOW_NIL)) | |
3742 *to_out = max_allowed; | |
3743 else | |
3744 *to_out = get_string_pos_char_1 (string, to, | |
3745 flags | GB_NO_ERROR_IF_BAD, | |
3746 max_allowed); | |
3747 | |
3748 if ((*from_out < 0 || *to_out < 0) && !(flags & GB_NO_ERROR_IF_BAD)) | |
3749 args_out_of_range_3 (string, from, to); | |
3750 | |
3751 if (*from_out >= 0 && *to_out >= 0 && *from_out > *to_out) | |
3752 { | |
3753 if (flags & GB_CHECK_ORDER) | |
3754 invalid_argument_2 ("start greater than end", from, to); | |
3755 else | |
3756 { | |
3757 Charbpos temp = *from_out; | |
3758 *from_out = *to_out; | |
3759 *to_out = temp; | |
3760 } | |
3761 } | |
3762 } | |
3763 | |
3764 void | |
3765 get_string_range_byte (Lisp_Object string, Lisp_Object from, Lisp_Object to, | |
3766 Bytecount *from_out, Bytecount *to_out, | |
3767 unsigned int flags) | |
3768 { | |
3769 Charcount s, e; | |
3770 | |
3771 get_string_range_char (string, from, to, &s, &e, flags); | |
3772 if (s >= 0) | |
793 | 3773 *from_out = string_index_char_to_byte (string, s); |
771 | 3774 else /* could happen with GB_NO_ERROR_IF_BAD */ |
3775 *from_out = -1; | |
3776 if (e >= 0) | |
793 | 3777 *to_out = string_index_char_to_byte (string, e); |
771 | 3778 else |
3779 *to_out = -1; | |
3780 | |
3781 } | |
3782 | |
826 | 3783 Charxpos |
771 | 3784 get_buffer_or_string_pos_char (Lisp_Object object, Lisp_Object pos, |
3785 unsigned int flags) | |
3786 { | |
3787 return STRINGP (object) ? | |
3788 get_string_pos_char (object, pos, flags) : | |
3789 get_buffer_pos_char (XBUFFER (object), pos, flags); | |
3790 } | |
3791 | |
826 | 3792 Bytexpos |
771 | 3793 get_buffer_or_string_pos_byte (Lisp_Object object, Lisp_Object pos, |
3794 unsigned int flags) | |
3795 { | |
3796 return STRINGP (object) ? | |
3797 get_string_pos_byte (object, pos, flags) : | |
3798 get_buffer_pos_byte (XBUFFER (object), pos, flags); | |
3799 } | |
3800 | |
3801 void | |
3802 get_buffer_or_string_range_char (Lisp_Object object, Lisp_Object from, | |
826 | 3803 Lisp_Object to, Charxpos *from_out, |
3804 Charxpos *to_out, unsigned int flags) | |
771 | 3805 { |
3806 if (STRINGP (object)) | |
3807 get_string_range_char (object, from, to, from_out, to_out, flags); | |
3808 else | |
826 | 3809 get_buffer_range_char (XBUFFER (object), from, to, from_out, to_out, |
3810 flags); | |
771 | 3811 } |
3812 | |
3813 void | |
3814 get_buffer_or_string_range_byte (Lisp_Object object, Lisp_Object from, | |
826 | 3815 Lisp_Object to, Bytexpos *from_out, |
3816 Bytexpos *to_out, unsigned int flags) | |
771 | 3817 { |
3818 if (STRINGP (object)) | |
3819 get_string_range_byte (object, from, to, from_out, to_out, flags); | |
3820 else | |
826 | 3821 get_buffer_range_byte (XBUFFER (object), from, to, from_out, to_out, |
3822 flags); | |
771 | 3823 } |
3824 | |
826 | 3825 Charxpos |
771 | 3826 buffer_or_string_accessible_begin_char (Lisp_Object object) |
3827 { | |
3828 return STRINGP (object) ? 0 : BUF_BEGV (XBUFFER (object)); | |
3829 } | |
3830 | |
826 | 3831 Charxpos |
771 | 3832 buffer_or_string_accessible_end_char (Lisp_Object object) |
3833 { | |
3834 return STRINGP (object) ? | |
826 | 3835 string_char_length (object) : BUF_ZV (XBUFFER (object)); |
771 | 3836 } |
3837 | |
826 | 3838 Bytexpos |
771 | 3839 buffer_or_string_accessible_begin_byte (Lisp_Object object) |
3840 { | |
826 | 3841 return STRINGP (object) ? 0 : BYTE_BUF_BEGV (XBUFFER (object)); |
771 | 3842 } |
3843 | |
826 | 3844 Bytexpos |
771 | 3845 buffer_or_string_accessible_end_byte (Lisp_Object object) |
3846 { | |
3847 return STRINGP (object) ? | |
826 | 3848 XSTRING_LENGTH (object) : BYTE_BUF_ZV (XBUFFER (object)); |
771 | 3849 } |
3850 | |
826 | 3851 Charxpos |
771 | 3852 buffer_or_string_absolute_begin_char (Lisp_Object object) |
3853 { | |
3854 return STRINGP (object) ? 0 : BUF_BEG (XBUFFER (object)); | |
3855 } | |
3856 | |
826 | 3857 Charxpos |
771 | 3858 buffer_or_string_absolute_end_char (Lisp_Object object) |
3859 { | |
3860 return STRINGP (object) ? | |
826 | 3861 string_char_length (object) : BUF_Z (XBUFFER (object)); |
3862 } | |
3863 | |
3864 Bytexpos | |
3865 buffer_or_string_absolute_begin_byte (Lisp_Object object) | |
3866 { | |
3867 return STRINGP (object) ? 0 : BYTE_BUF_BEG (XBUFFER (object)); | |
3868 } | |
3869 | |
3870 Bytexpos | |
3871 buffer_or_string_absolute_end_byte (Lisp_Object object) | |
3872 { | |
3873 return STRINGP (object) ? | |
3874 XSTRING_LENGTH (object) : BYTE_BUF_Z (XBUFFER (object)); | |
3875 } | |
3876 | |
3877 Charbpos | |
3878 charbpos_clip_to_bounds (Charbpos lower, Charbpos num, Charbpos upper) | |
3879 { | |
3880 return (num < lower ? lower : | |
3881 num > upper ? upper : | |
3882 num); | |
771 | 3883 } |
3884 | |
3885 Bytebpos | |
826 | 3886 bytebpos_clip_to_bounds (Bytebpos lower, Bytebpos num, Bytebpos upper) |
3887 { | |
3888 return (num < lower ? lower : | |
3889 num > upper ? upper : | |
3890 num); | |
3891 } | |
3892 | |
3893 Charxpos | |
3894 charxpos_clip_to_bounds (Charxpos lower, Charxpos num, Charxpos upper) | |
771 | 3895 { |
826 | 3896 return (num < lower ? lower : |
3897 num > upper ? upper : | |
3898 num); | |
3899 } | |
3900 | |
3901 Bytexpos | |
3902 bytexpos_clip_to_bounds (Bytexpos lower, Bytexpos num, Bytexpos upper) | |
3903 { | |
3904 return (num < lower ? lower : | |
3905 num > upper ? upper : | |
3906 num); | |
771 | 3907 } |
3908 | |
826 | 3909 /* These could be implemented in terms of the get_buffer_or_string() |
3910 functions above, but those are complicated and handle lots of weird | |
3911 cases stemming from uncertain external input. */ | |
3912 | |
3913 Charxpos | |
3914 buffer_or_string_clip_to_accessible_char (Lisp_Object object, Charxpos pos) | |
3915 { | |
3916 return (charxpos_clip_to_bounds | |
3917 (pos, buffer_or_string_accessible_begin_char (object), | |
3918 buffer_or_string_accessible_end_char (object))); | |
3919 } | |
3920 | |
3921 Bytexpos | |
3922 buffer_or_string_clip_to_accessible_byte (Lisp_Object object, Bytexpos pos) | |
771 | 3923 { |
826 | 3924 return (bytexpos_clip_to_bounds |
3925 (pos, buffer_or_string_accessible_begin_byte (object), | |
3926 buffer_or_string_accessible_end_byte (object))); | |
3927 } | |
3928 | |
3929 Charxpos | |
3930 buffer_or_string_clip_to_absolute_char (Lisp_Object object, Charxpos pos) | |
3931 { | |
3932 return (charxpos_clip_to_bounds | |
3933 (pos, buffer_or_string_absolute_begin_char (object), | |
3934 buffer_or_string_absolute_end_char (object))); | |
3935 } | |
3936 | |
3937 Bytexpos | |
3938 buffer_or_string_clip_to_absolute_byte (Lisp_Object object, Bytexpos pos) | |
3939 { | |
3940 return (bytexpos_clip_to_bounds | |
3941 (pos, buffer_or_string_absolute_begin_byte (object), | |
3942 buffer_or_string_absolute_end_byte (object))); | |
771 | 3943 } |
3944 | |
3945 | |
3946 /************************************************************************/ | |
3947 /* Implement TO_EXTERNAL_FORMAT, TO_INTERNAL_FORMAT */ | |
3948 /************************************************************************/ | |
3949 | |
3950 typedef struct | |
3951 { | |
867 | 3952 Dynarr_declare (Ibyte_dynarr *); |
3953 } Ibyte_dynarr_dynarr; | |
771 | 3954 |
3955 typedef struct | |
3956 { | |
3957 Dynarr_declare (Extbyte_dynarr *); | |
3958 } Extbyte_dynarr_dynarr; | |
3959 | |
3960 static Extbyte_dynarr_dynarr *conversion_out_dynarr_list; | |
867 | 3961 static Ibyte_dynarr_dynarr *conversion_in_dynarr_list; |
771 | 3962 |
3963 static int dfc_convert_to_external_format_in_use; | |
3964 static int dfc_convert_to_internal_format_in_use; | |
3965 | |
3966 void | |
3967 dfc_convert_to_external_format (dfc_conversion_type source_type, | |
3968 dfc_conversion_data *source, | |
3969 Lisp_Object coding_system, | |
3970 dfc_conversion_type sink_type, | |
3971 dfc_conversion_data *sink) | |
3972 { | |
3973 /* It's guaranteed that many callers are not prepared for GC here, | |
3974 esp. given that this code conversion occurs in many very hidden | |
3975 places. */ | |
1292 | 3976 int count; |
771 | 3977 Extbyte_dynarr *conversion_out_dynarr; |
1292 | 3978 PROFILE_DECLARE (); |
3979 | |
2367 | 3980 assert (!inhibit_non_essential_conversion_operations); |
1292 | 3981 PROFILE_RECORD_ENTERING_SECTION (QSin_internal_external_conversion); |
3982 | |
3983 count = begin_gc_forbidden (); | |
771 | 3984 |
3985 type_checking_assert | |
3986 (((source_type == DFC_TYPE_DATA) || | |
3987 (source_type == DFC_TYPE_LISP_LSTREAM && LSTREAMP (source->lisp_object)) || | |
3988 (source_type == DFC_TYPE_LISP_STRING && STRINGP (source->lisp_object))) | |
3989 && | |
3990 ((sink_type == DFC_TYPE_DATA) || | |
3991 (sink_type == DFC_TYPE_LISP_LSTREAM && LSTREAMP (source->lisp_object)))); | |
3992 | |
3993 if (Dynarr_length (conversion_out_dynarr_list) <= | |
3994 dfc_convert_to_external_format_in_use) | |
3995 Dynarr_add (conversion_out_dynarr_list, Dynarr_new (Extbyte)); | |
3996 conversion_out_dynarr = Dynarr_at (conversion_out_dynarr_list, | |
3997 dfc_convert_to_external_format_in_use); | |
3998 Dynarr_reset (conversion_out_dynarr); | |
3999 | |
853 | 4000 internal_bind_int (&dfc_convert_to_external_format_in_use, |
4001 dfc_convert_to_external_format_in_use + 1); | |
4002 | |
771 | 4003 coding_system = get_coding_system_for_text_file (coding_system, 0); |
4004 | |
4005 /* Here we optimize in the case where the coding system does no | |
4006 conversion. However, we don't want to optimize in case the source | |
4007 or sink is an lstream, since writing to an lstream can cause a | |
4008 garbage collection, and this could be problematic if the source | |
4009 is a lisp string. */ | |
4010 if (source_type != DFC_TYPE_LISP_LSTREAM && | |
4011 sink_type != DFC_TYPE_LISP_LSTREAM && | |
4012 coding_system_is_binary (coding_system)) | |
4013 { | |
867 | 4014 const Ibyte *ptr; |
771 | 4015 Bytecount len; |
4016 | |
4017 if (source_type == DFC_TYPE_LISP_STRING) | |
4018 { | |
4019 ptr = XSTRING_DATA (source->lisp_object); | |
4020 len = XSTRING_LENGTH (source->lisp_object); | |
4021 } | |
4022 else | |
4023 { | |
867 | 4024 ptr = (Ibyte *) source->data.ptr; |
771 | 4025 len = source->data.len; |
4026 } | |
4027 | |
4028 #ifdef MULE | |
4029 { | |
867 | 4030 const Ibyte *end; |
771 | 4031 for (end = ptr + len; ptr < end;) |
4032 { | |
867 | 4033 Ibyte c = |
826 | 4034 (byte_ascii_p (*ptr)) ? *ptr : |
771 | 4035 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) : |
4036 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) : | |
4037 '~'; | |
4038 | |
4039 Dynarr_add (conversion_out_dynarr, (Extbyte) c); | |
867 | 4040 INC_IBYTEPTR (ptr); |
771 | 4041 } |
800 | 4042 text_checking_assert (ptr == end); |
771 | 4043 } |
4044 #else | |
4045 Dynarr_add_many (conversion_out_dynarr, ptr, len); | |
4046 #endif | |
4047 | |
4048 } | |
1315 | 4049 #ifdef WIN32_ANY |
771 | 4050 /* Optimize the common case involving Unicode where only ASCII is involved */ |
4051 else if (source_type != DFC_TYPE_LISP_LSTREAM && | |
4052 sink_type != DFC_TYPE_LISP_LSTREAM && | |
4053 dfc_coding_system_is_unicode (coding_system)) | |
4054 { | |
867 | 4055 const Ibyte *ptr, *p; |
771 | 4056 Bytecount len; |
867 | 4057 const Ibyte *end; |
771 | 4058 |
4059 if (source_type == DFC_TYPE_LISP_STRING) | |
4060 { | |
4061 ptr = XSTRING_DATA (source->lisp_object); | |
4062 len = XSTRING_LENGTH (source->lisp_object); | |
4063 } | |
4064 else | |
4065 { | |
867 | 4066 ptr = (Ibyte *) source->data.ptr; |
771 | 4067 len = source->data.len; |
4068 } | |
4069 end = ptr + len; | |
4070 | |
4071 for (p = ptr; p < end; p++) | |
4072 { | |
826 | 4073 if (!byte_ascii_p (*p)) |
771 | 4074 goto the_hard_way; |
4075 } | |
4076 | |
4077 for (p = ptr; p < end; p++) | |
4078 { | |
4079 Dynarr_add (conversion_out_dynarr, (Extbyte) (*p)); | |
4080 Dynarr_add (conversion_out_dynarr, (Extbyte) '\0'); | |
4081 } | |
4082 } | |
1315 | 4083 #endif /* WIN32_ANY */ |
771 | 4084 else |
4085 { | |
4086 Lisp_Object streams_to_delete[3]; | |
4087 int delete_count; | |
4088 Lisp_Object instream, outstream; | |
4089 Lstream *reader, *writer; | |
4090 | |
1315 | 4091 #ifdef WIN32_ANY |
771 | 4092 the_hard_way: |
1315 | 4093 #endif /* WIN32_ANY */ |
771 | 4094 delete_count = 0; |
4095 if (source_type == DFC_TYPE_LISP_LSTREAM) | |
4096 instream = source->lisp_object; | |
4097 else if (source_type == DFC_TYPE_DATA) | |
4098 streams_to_delete[delete_count++] = instream = | |
4099 make_fixed_buffer_input_stream (source->data.ptr, source->data.len); | |
4100 else | |
4101 { | |
4102 type_checking_assert (source_type == DFC_TYPE_LISP_STRING); | |
4103 streams_to_delete[delete_count++] = instream = | |
4104 /* This will GCPRO the Lisp string */ | |
4105 make_lisp_string_input_stream (source->lisp_object, 0, -1); | |
4106 } | |
4107 | |
4108 if (sink_type == DFC_TYPE_LISP_LSTREAM) | |
4109 outstream = sink->lisp_object; | |
4110 else | |
4111 { | |
4112 type_checking_assert (sink_type == DFC_TYPE_DATA); | |
4113 streams_to_delete[delete_count++] = outstream = | |
4114 make_dynarr_output_stream | |
4115 ((unsigned_char_dynarr *) conversion_out_dynarr); | |
4116 } | |
4117 | |
4118 streams_to_delete[delete_count++] = outstream = | |
800 | 4119 make_coding_output_stream (XLSTREAM (outstream), coding_system, |
4120 CODING_ENCODE, 0); | |
771 | 4121 |
4122 reader = XLSTREAM (instream); | |
4123 writer = XLSTREAM (outstream); | |
4124 /* decoding_stream will gc-protect outstream */ | |
1204 | 4125 { |
4126 struct gcpro gcpro1, gcpro2; | |
4127 GCPRO2 (instream, outstream); | |
4128 | |
4129 while (1) | |
4130 { | |
4131 Bytecount size_in_bytes; | |
4132 char tempbuf[1024]; /* some random amount */ | |
4133 | |
4134 size_in_bytes = Lstream_read (reader, tempbuf, sizeof (tempbuf)); | |
4135 | |
4136 if (size_in_bytes == 0) | |
4137 break; | |
4138 else if (size_in_bytes < 0) | |
4139 signal_error (Qtext_conversion_error, | |
4140 "Error converting to external format", Qunbound); | |
4141 | |
4142 if (Lstream_write (writer, tempbuf, size_in_bytes) < 0) | |
4143 signal_error (Qtext_conversion_error, | |
4144 "Error converting to external format", Qunbound); | |
4145 } | |
4146 | |
4147 /* Closing writer will close any stream at the other end of writer. */ | |
4148 Lstream_close (writer); | |
4149 Lstream_close (reader); | |
4150 UNGCPRO; | |
4151 } | |
771 | 4152 |
4153 /* The idea is that this function will create no garbage. */ | |
4154 while (delete_count) | |
4155 Lstream_delete (XLSTREAM (streams_to_delete [--delete_count])); | |
4156 } | |
4157 | |
4158 unbind_to (count); | |
4159 | |
4160 if (sink_type != DFC_TYPE_LISP_LSTREAM) | |
4161 { | |
4162 sink->data.len = Dynarr_length (conversion_out_dynarr); | |
4163 /* double zero-extend because we may be dealing with Unicode data */ | |
4164 Dynarr_add (conversion_out_dynarr, '\0'); | |
4165 Dynarr_add (conversion_out_dynarr, '\0'); | |
4967 | 4166 sink->data.ptr = Dynarr_begin (conversion_out_dynarr); |
771 | 4167 } |
1292 | 4168 |
4169 PROFILE_RECORD_EXITING_SECTION (QSin_internal_external_conversion); | |
771 | 4170 } |
4171 | |
/* Convert external data to internal format.

   SOURCE is either a block of bytes (DFC_TYPE_DATA) or an lstream
   (DFC_TYPE_LISP_LSTREAM); SINK is likewise either data or an lstream.
   CODING_SYSTEM is first passed through get_coding_system_for_text_file ().

   When the sink is data, the result is accumulated in a scratch dynarr
   and returned through sink->data.ptr / sink->data.len.  Two NUL bytes
   follow the data and are not counted in sink->data.len.  The pointer
   refers to the dynarr's own storage, and that dynarr is reset at the
   start of any later call made at the same nesting depth.

   GC is forbidden from begin_gc_forbidden () until unbind_to (count).  */
4172 void | |
4173 dfc_convert_to_internal_format (dfc_conversion_type source_type, | |
4174 dfc_conversion_data *source, | |
4175 Lisp_Object coding_system, | |
4176 dfc_conversion_type sink_type, | |
4177 dfc_conversion_data *sink) | |
4178 { | |
4179 /* It's guaranteed that many callers are not prepared for GC here, | |
4180 esp. given that this code conversion occurs in many very hidden | |
4181 places. */ | |
1292 | 4182 int count; |
867 | 4183 Ibyte_dynarr *conversion_in_dynarr; |
2421 | 4184 Lisp_Object underlying_cs; |
1292 | 4185 PROFILE_DECLARE (); |
4186 | |
2367 | 4187 assert (!inhibit_non_essential_conversion_operations); |
1292 | 4188 PROFILE_RECORD_ENTERING_SECTION (QSin_internal_external_conversion); |
4189 | |
4190 count = begin_gc_forbidden (); | |
771 | 4191 |
4192 type_checking_assert | |
4193 ((source_type == DFC_TYPE_DATA || | |
4194 source_type == DFC_TYPE_LISP_LSTREAM) | |
4195 && | |
4196 (sink_type == DFC_TYPE_DATA || | |
4197 sink_type == DFC_TYPE_LISP_LSTREAM)); | |
4198 | |
/* Select the scratch dynarr for the current nesting depth, creating it
   if the list is not yet that long, and empty it. */
4199 if (Dynarr_length (conversion_in_dynarr_list) <= | |
4200 dfc_convert_to_internal_format_in_use) | |
867 | 4201 Dynarr_add (conversion_in_dynarr_list, Dynarr_new (Ibyte)); |
771 | 4202 conversion_in_dynarr = Dynarr_at (conversion_in_dynarr_list, |
4203 dfc_convert_to_internal_format_in_use); | |
4204 Dynarr_reset (conversion_in_dynarr); | |
4205 | |
/* Bump the nesting depth for the duration of the conversion.
   NOTE(review): presumably unbind_to (count) below restores it --
   confirm against internal_bind_int (). */
853 | 4206 internal_bind_int (&dfc_convert_to_internal_format_in_use, |
4207 dfc_convert_to_internal_format_in_use + 1); | |
4208 | |
2421 | 4209 /* The second call does the equivalent of both calls, but we need |
4210 the result after the first call (which wraps just a to-text | |
4211 converter) as well as the result after the second call (which | |
4212 also wraps an EOL-detection converter). */ | |
4213 underlying_cs = get_coding_system_for_text_file (coding_system, 0); | |
4214 coding_system = get_coding_system_for_text_file (underlying_cs, 1); | |
771 | 4215 |
/* Fast path 1: binary coding system with plain data at both ends.
   Under MULE each input byte is emitted as itself (ASCII), as
   LEADING_BYTE_CONTROL_1 followed by c + 0x20, or as
   LEADING_BYTE_LATIN_ISO8859_1 followed by c.  Without MULE the bytes
   are copied unchanged. */
4216 if (source_type != DFC_TYPE_LISP_LSTREAM && | |
4217 sink_type != DFC_TYPE_LISP_LSTREAM && | |
2421 | 4218 coding_system_is_binary (underlying_cs)) |
771 | 4219 { |
4220 #ifdef MULE | |
2421 | 4221 const Ibyte *ptr; |
771 | 4222 Bytecount len = source->data.len; |
2421 | 4223 const Ibyte *end; |
4224 | |
4225 /* Make sure no EOL conversion is needed. With a little work we | |
4226 could handle EOL conversion as well but it may not be needed as an | |
4227 optimization. */ | |
4228 if (!EQ (coding_system, underlying_cs)) | |
4229 { | |
4230 for (ptr = (const Ibyte *) source->data.ptr, end = ptr + len; | |
4231 ptr < end; ptr++) | |
4232 { | |
4233 if (*ptr == '\r' || *ptr == '\n') | |
4234 goto the_hard_way; | |
4235 } | |
4236 } | |
4237 | |
4238 for (ptr = (const Ibyte *) source->data.ptr, end = ptr + len; | |
4239 ptr < end; ptr++) | |
771 | 4240 { |
867 | 4241 Ibyte c = *ptr; |
771 | 4242 |
826 | 4243 if (byte_ascii_p (c)) |
771 | 4244 Dynarr_add (conversion_in_dynarr, c); |
826 | 4245 else if (byte_c1_p (c)) |
771 | 4246 { |
4247 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_CONTROL_1); | |
4248 Dynarr_add (conversion_in_dynarr, c + 0x20); | |
4249 } | |
4250 else | |
4251 { | |
4252 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_LATIN_ISO8859_1); | |
4253 Dynarr_add (conversion_in_dynarr, c); | |
4254 } | |
4255 } | |
4256 #else | |
4257 Dynarr_add_many (conversion_in_dynarr, source->data.ptr, source->data.len); | |
4258 #endif | |
4259 } | |
1315 | 4260 #ifdef WIN32_ANY |
1292 | 4261 /* Optimize the common case involving Unicode where only ASCII/Latin-1 is |
4262 involved */ | |
771 | 4263 else if (source_type != DFC_TYPE_LISP_LSTREAM && |
4264 sink_type != DFC_TYPE_LISP_LSTREAM && | |
2421 | 4265 dfc_coding_system_is_unicode (underlying_cs)) |
771 | 4266 { |
2421 | 4267 const Ibyte *ptr; |
771 | 4268 Bytecount len = source->data.len; |
2421 | 4269 const Ibyte *end; |
771 | 4270 |
/* An odd byte count cannot be a whole number of 2-byte units. */
4271 if (len & 1) | |
4272 goto the_hard_way; | |
4273 | |
/* The loop below starts at offset 1 and steps by 2, i.e. it inspects
   the second byte of every 2-byte unit; any nonzero value there sends
   us to the general path. */
2421 | 4274 /* Make sure only ASCII/Latin-1 is involved */ |
4275 for (ptr = (const Ibyte *) source->data.ptr + 1, end = ptr + len; | |
4276 ptr < end; ptr += 2) | |
771 | 4277 { |
4278 if (*ptr) | |
4279 goto the_hard_way; | |
4280 } | |
4281 | |
2421 | 4282 /* Make sure no EOL conversion is needed. With a little work we |
4283 could handle EOL conversion as well but it may not be needed as an | |
4284 optimization. */ | |
4285 if (!EQ (coding_system, underlying_cs)) | |
4286 { | |
4287 for (ptr = (const Ibyte *) source->data.ptr, end = ptr + len; | |
4288 ptr < end; ptr += 2) | |
4289 { | |
4290 if (*ptr == '\r' || *ptr == '\n') | |
4291 goto the_hard_way; | |
4292 } | |
4293 } | |
4294 | |
/* Emit one internal character per 2-byte unit, using only the first
   byte of each unit. */
4295 for (ptr = (const Ibyte *) source->data.ptr, end = ptr + len; | |
4296 ptr < end; ptr += 2) | |
771 | 4297 { |
867 | 4298 Ibyte c = *ptr; |
771 | 4299 |
826 | 4300 if (byte_ascii_p (c)) |
771 | 4301 Dynarr_add (conversion_in_dynarr, c); |
4302 #ifdef MULE | |
826 | 4303 else if (byte_c1_p (c)) |
771 | 4304 { |
4305 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_CONTROL_1); | |
4306 Dynarr_add (conversion_in_dynarr, c + 0x20); | |
4307 } | |
4308 else | |
4309 { | |
4310 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_LATIN_ISO8859_1); | |
4311 Dynarr_add (conversion_in_dynarr, c); | |
4312 } | |
4313 #endif /* MULE */ | |
4314 } | |
4315 } | |
1315 | 4316 #endif /* WIN32_ANY */ |
/* General path: wrap source and sink in lstreams as needed, put a
   decoding stream in front of the sink, and pump the data through in
   1024-byte chunks.  Streams created here are recorded in
   streams_to_delete (at most three: input, dynarr output, decoder). */
771 | 4317 else |
4318 { | |
4319 Lisp_Object streams_to_delete[3]; | |
4320 int delete_count; | |
4321 Lisp_Object instream, outstream; | |
4322 Lstream *reader, *writer; | |
4323 | |
2421 | 4324 #if defined (WIN32_ANY) || defined (MULE) |
771 | 4325 the_hard_way: |
2421 | 4326 #endif |
771 | 4327 delete_count = 0; |
4328 if (source_type == DFC_TYPE_LISP_LSTREAM) | |
4329 instream = source->lisp_object; | |
4330 else | |
4331 { | |
4332 type_checking_assert (source_type == DFC_TYPE_DATA); | |
4333 streams_to_delete[delete_count++] = instream = | |
4334 make_fixed_buffer_input_stream (source->data.ptr, source->data.len); | |
4335 } | |
4336 | |
4337 if (sink_type == DFC_TYPE_LISP_LSTREAM) | |
4338 outstream = sink->lisp_object; | |
4339 else | |
4340 { | |
4341 type_checking_assert (sink_type == DFC_TYPE_DATA); | |
4342 streams_to_delete[delete_count++] = outstream = | |
4343 make_dynarr_output_stream | |
4344 ((unsigned_char_dynarr *) conversion_in_dynarr); | |
4345 } | |
4346 | |
4347 streams_to_delete[delete_count++] = outstream = | |
800 | 4348 make_coding_output_stream (XLSTREAM (outstream), coding_system, |
4349 CODING_DECODE, 0); | |
771 | 4350 |
4351 reader = XLSTREAM (instream); | |
4352 writer = XLSTREAM (outstream); | |
1204 | 4353 { |
4354 struct gcpro gcpro1, gcpro2; | |
4355 /* outstream will gc-protect its sink stream, if necessary */ | |
4356 GCPRO2 (instream, outstream); | |
4357 | |
4358 while (1) | |
4359 { | |
4360 Bytecount size_in_bytes; | |
4361 char tempbuf[1024]; /* some random amount */ | |
4362 | |
4363 size_in_bytes = Lstream_read (reader, tempbuf, sizeof (tempbuf)); | |
4364 | |
4365 if (size_in_bytes == 0) | |
4366 break; | |
4367 else if (size_in_bytes < 0) | |
4368 signal_error (Qtext_conversion_error, | |
4369 "Error converting to internal format", Qunbound); | |
4370 | |
4371 if (Lstream_write (writer, tempbuf, size_in_bytes) < 0) | |
4372 signal_error (Qtext_conversion_error, | |
4373 "Error converting to internal format", Qunbound); | |
4374 } | |
4375 | |
4376 /* Closing writer will close any stream at the other end of writer. */ | |
4377 Lstream_close (writer); | |
4378 Lstream_close (reader); | |
4379 UNGCPRO; | |
4380 } | |
771 | 4381 |
4382 /* The idea is that this function will create no garbage. */ | |
4383 while (delete_count) | |
4384 Lstream_delete (XLSTREAM (streams_to_delete [--delete_count])); | |
4385 } | |
4386 | |
4387 unbind_to (count); | |
4388 | |
/* For a data sink, record the length before the terminators are
   appended, so sink->data.len excludes the two trailing NULs. */
4389 if (sink_type != DFC_TYPE_LISP_LSTREAM) | |
4390 { | |
4391 sink->data.len = Dynarr_length (conversion_in_dynarr); | |
4392 Dynarr_add (conversion_in_dynarr, '\0'); /* remember to NUL-terminate! */ | |
4393 /* The macros don't currently distinguish between internal and | |
4394 external sinks, and allocate and copy two extra bytes in both | |
4395 cases. So we add a second zero, just like for external data | |
4396 (in that case, because we may be converting to Unicode). */ | |
4397 Dynarr_add (conversion_in_dynarr, '\0'); | |
4967 | 4398 sink->data.ptr = Dynarr_begin (conversion_in_dynarr); |
771 | 4399 } |
1292 | 4400 |
4401 PROFILE_RECORD_EXITING_SECTION (QSin_internal_external_conversion); | |
771 | 4402 } |
4403 | |
1318 | 4404 /* ----------------------------------------------------------------------- */ |
2367 | 4405 /* Alloca-conversion helpers */ |
4406 /* ----------------------------------------------------------------------- */ | |
4407 | |
4408 /* For alloca(), things are trickier because the calling function needs to | |
4409 allocate. This means that the caller needs to do the following: | |
4410 | |
4411 (a) invoke us to do the conversion, remember the data and return the size. | |
4412 (b) alloca() the proper size. | |
4413 (c) invoke us again to copy the data. | |
4414 | |
4415 We need to handle the possibility of two or more invocations of the | |
4416 converter in the same expression. In such cases it's conceivable that | |
4417 the evaluation of the sub-expressions will be overlapping (e.g. one size | |
4418 function called, then the other one called, then the copy functions | |
4419 called). To handle this, we keep a list of active data, indexed by the | |
4420 src expression. (We use the stringize operator to avoid evaluating the | |
4421 expression multiple times.) If the caller uses the exact same src | |
4422 expression twice in two converter calls in the same subexpression, we | |
2500 | 4423 will lose, but at least we can check for this and ABORT(). We could |
2367 | 4424 conceivably try to index on other parameters as well, but there is not |
4425 really any point. */ | |
4426 | |
4427 alloca_convert_vals_dynarr *active_alloca_convert; | |
4428 | |
4429 int | |
4430 find_pos_of_existing_active_alloca_convert (const char *srctext) | |
4431 { | |
4432 alloca_convert_vals *vals = NULL; | |
4433 int i; | |
4434 | |
4435 if (!active_alloca_convert) | |
4436 active_alloca_convert = Dynarr_new (alloca_convert_vals); | |
4437 | |
4438 for (i = 0; i < Dynarr_length (active_alloca_convert); i++) | |
4439 { | |
4440 vals = Dynarr_atp (active_alloca_convert, i); | |
2385 | 4441 /* On my system, two different occurrences of the same stringized |
4442 argument always point to the same string. However, on someone | |
4443 else's system, that wasn't the case. We check for equality | |
4444 first, since it seems systems work my way more than the other | |
4445 way. */ | |
4446 if (vals->srctext == srctext || !strcmp (vals->srctext, srctext)) | |
2367 | 4447 return i; |
4448 } | |
4449 | |
4450 return -1; | |
4451 } | |
4452 | |
4453 /* ----------------------------------------------------------------------- */ | |
1318 | 4454 /* New-style DFC converters (data is returned rather than stored into var) */ |
4455 /* ----------------------------------------------------------------------- */ | |
4456 | |
4457 /* We handle here the cases where SRC is a Lisp_Object, internal data | |
4458 (sized or unsized), or external data (sized or unsized), and return type | |
4459 is unsized alloca() or malloc() data. If the return type is a | |
4953
304aebb79cd3
function renamings to track names of char typedefs
Ben Wing <ben@xemacs.org>
parents:
4952
diff
changeset
|
4460 Lisp_Object, use build_extstring() for unsized external data, |
304aebb79cd3
function renamings to track names of char typedefs
Ben Wing <ben@xemacs.org>
parents:
4952
diff
changeset
|
4461 make_extstring() for sized external data. If the return type needs to |
1318 | 4462 be sized data, use the *_TO_SIZED_*() macros, and for other more |
4463 complicated cases, use the original TO_*_FORMAT() macros. */ | |
4464 | |
4465 static void | |
4466 new_dfc_convert_now_damn_it (const void *src, Bytecount src_size, | |
4467 enum new_dfc_src_type type, | |
4468 void **dst, Bytecount *dst_size, | |
4469 Lisp_Object codesys) | |
4470 { | |
4471 /* #### In the case of alloca(), it would be a bit more efficient, for | |
4472 small strings, to use static Dynarr's like are used internally in | |
4473 TO_*_FORMAT(), or some other way of avoiding malloc() followed by | |
4474 free(). I doubt it really matters, though. */ | |
4475 | |
4476 switch (type) | |
4477 { | |
4478 case DFC_EXTERNAL: | |
4479 TO_INTERNAL_FORMAT (C_STRING, src, | |
4480 MALLOC, (*dst, *dst_size), codesys); | |
4481 break; | |
4482 | |
4483 case DFC_SIZED_EXTERNAL: | |
4484 TO_INTERNAL_FORMAT (DATA, (src, src_size), | |
4485 MALLOC, (*dst, *dst_size), codesys); | |
4486 break; | |
4487 | |
4488 case DFC_INTERNAL: | |
4489 TO_EXTERNAL_FORMAT (C_STRING, src, | |
4490 MALLOC, (*dst, *dst_size), codesys); | |
4491 break; | |
4492 | |
4493 case DFC_SIZED_INTERNAL: | |
4494 TO_EXTERNAL_FORMAT (DATA, (src, src_size), | |
4495 MALLOC, (*dst, *dst_size), codesys); | |
4496 break; | |
4497 | |
4498 case DFC_LISP_STRING: | |
5013 | 4499 TO_EXTERNAL_FORMAT (LISP_STRING, GET_LISP_FROM_VOID (src), |
1318 | 4500 MALLOC, (*dst, *dst_size), codesys); |
4501 break; | |
4502 | |
4503 default: | |
2500 | 4504 ABORT (); |
1318 | 4505 } |
2367 | 4506 |
4507 /* The size is always + 2 because we have double zero-termination at the | |
4508 end of all data (for Unicode-correctness). */ | |
4509 *dst_size += 2; | |
4510 } | |
4511 | |
4512 Bytecount | |
4513 new_dfc_convert_size (const char *srctext, const void *src, | |
4514 Bytecount src_size, enum new_dfc_src_type type, | |
4515 Lisp_Object codesys) | |
4516 { | |
4517 alloca_convert_vals vals; | |
4518 | |
2721 | 4519 int i = find_pos_of_existing_active_alloca_convert (srctext); |
4520 assert (i < 0); | |
2367 | 4521 |
4522 vals.srctext = srctext; | |
4523 | |
4524 new_dfc_convert_now_damn_it (src, src_size, type, &vals.dst, &vals.dst_size, | |
4525 codesys); | |
4526 | |
4527 Dynarr_add (active_alloca_convert, vals); | |
4528 return vals.dst_size; | |
4529 } | |
4530 | |
4531 void * | |
4532 new_dfc_convert_copy_data (const char *srctext, void *alloca_data) | |
4533 { | |
4534 alloca_convert_vals *vals; | |
4535 int i = find_pos_of_existing_active_alloca_convert (srctext); | |
4536 | |
4537 assert (i >= 0); | |
4538 vals = Dynarr_atp (active_alloca_convert, i); | |
4539 assert (alloca_data); | |
4540 memcpy (alloca_data, vals->dst, vals->dst_size); | |
4976
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4967
diff
changeset
|
4541 xfree (vals->dst); |
2367 | 4542 Dynarr_delete (active_alloca_convert, i); |
4543 return alloca_data; | |
1318 | 4544 } |
4545 | |
4546 void * | |
4547 new_dfc_convert_malloc (const void *src, Bytecount src_size, | |
4548 enum new_dfc_src_type type, Lisp_Object codesys) | |
4549 { | |
4550 void *dst; | |
4551 Bytecount dst_size; | |
4552 | |
4553 new_dfc_convert_now_damn_it (src, src_size, type, &dst, &dst_size, codesys); | |
4554 return dst; | |
4555 } | |
4556 | |
771 | 4557 |
4558 /************************************************************************/ | |
867 | 4559 /* Basic Ichar functions */ |
771 | 4560 /************************************************************************/ |
4561 | |
4562 #ifdef MULE | |
4563 | |
4564 /* Convert a non-ASCII Mule character C into a one-character Mule-encoded | |
4565 string in STR. Returns the number of bytes stored. | |
867 | 4566 Do not call this directly. Use the macro set_itext_ichar() instead. |
771 | 4567 */ |
4568 | |
4569 Bytecount | |
867 | 4570 non_ascii_set_itext_ichar (Ibyte *str, Ichar c) |
771 | 4571 { |
867 | 4572 Ibyte *p; |
4573 Ibyte lb; | |
771 | 4574 int c1, c2; |
4575 Lisp_Object charset; | |
4576 | |
4577 p = str; | |
867 | 4578 BREAKUP_ICHAR (c, charset, c1, c2); |
4579 lb = ichar_leading_byte (c); | |
826 | 4580 if (leading_byte_private_p (lb)) |
4581 *p++ = private_leading_byte_prefix (lb); | |
771 | 4582 *p++ = lb; |
4583 if (EQ (charset, Vcharset_control_1)) | |
4584 c1 += 0x20; | |
4585 *p++ = c1 | 0x80; | |
4586 if (c2) | |
4587 *p++ = c2 | 0x80; | |
4588 | |
4589 return (p - str); | |
4590 } | |
4591 | |
4592 /* Return the first character from a Mule-encoded string in STR, | |
4593 assuming it's non-ASCII. Do not call this directly. | |
867 | 4594 Use the macro itext_ichar() instead. */ |
4595 | |
4596 Ichar | |
4597 non_ascii_itext_ichar (const Ibyte *str) | |
771 | 4598 { |
867 | 4599 Ibyte i0 = *str, i1, i2 = 0; |
771 | 4600 Lisp_Object charset; |
4601 | |
4602 if (i0 == LEADING_BYTE_CONTROL_1) | |
867 | 4603 return (Ichar) (*++str - 0x20); |
771 | 4604 |
826 | 4605 if (leading_byte_prefix_p (i0)) |
771 | 4606 i0 = *++str; |
4607 | |
4608 i1 = *++str & 0x7F; | |
4609 | |
826 | 4610 charset = charset_by_leading_byte (i0); |
771 | 4611 if (XCHARSET_DIMENSION (charset) == 2) |
4612 i2 = *++str & 0x7F; | |
4613 | |
867 | 4614 return make_ichar (charset, i1, i2); |
771 | 4615 } |
4616 | |
867 | 4617 /* Return whether CH is a valid Ichar, assuming it's non-ASCII. |
4618 Do not call this directly. Use the macro valid_ichar_p() instead. */ | |
771 | 4619 |
4620 int | |
867 | 4621 non_ascii_valid_ichar_p (Ichar ch) |
771 | 4622 { |
4623 int f1, f2, f3; | |
4624 | |
3498 | 4625 /* Must have only lowest 21 bits set */ |
4626 if (ch & ~0x1FFFFF) | |
771 | 4627 return 0; |
4628 | |
867 | 4629 f1 = ichar_field1 (ch); |
4630 f2 = ichar_field2 (ch); | |
4631 f3 = ichar_field3 (ch); | |
771 | 4632 |
4633 if (f1 == 0) | |
4634 { | |
4635 /* dimension-1 char */ | |
4636 Lisp_Object charset; | |
4637 | |
4638 /* leading byte must be correct */ | |
867 | 4639 if (f2 < MIN_ICHAR_FIELD2_OFFICIAL || |
4640 (f2 > MAX_ICHAR_FIELD2_OFFICIAL && f2 < MIN_ICHAR_FIELD2_PRIVATE) || | |
4641 f2 > MAX_ICHAR_FIELD2_PRIVATE) | |
771 | 4642 return 0; |
4643 /* octet not out of range */ | |
4644 if (f3 < 0x20) | |
4645 return 0; | |
4646 /* charset exists */ | |
4647 /* | |
4648 NOTE: This takes advantage of the fact that | |
4649 FIELD2_TO_OFFICIAL_LEADING_BYTE and | |
4650 FIELD2_TO_PRIVATE_LEADING_BYTE are the same. | |
4651 */ | |
826 | 4652 charset = charset_by_leading_byte (f2 + FIELD2_TO_OFFICIAL_LEADING_BYTE); |
771 | 4653 if (EQ (charset, Qnil)) |
4654 return 0; | |
4655 /* check range as per size (94 or 96) of charset */ | |
4656 return ((f3 > 0x20 && f3 < 0x7f) || XCHARSET_CHARS (charset) == 96); | |
4657 } | |
4658 else | |
4659 { | |
4660 /* dimension-2 char */ | |
4661 Lisp_Object charset; | |
4662 | |
4663 /* leading byte must be correct */ | |
867 | 4664 if (f1 < MIN_ICHAR_FIELD1_OFFICIAL || |
4665 (f1 > MAX_ICHAR_FIELD1_OFFICIAL && f1 < MIN_ICHAR_FIELD1_PRIVATE) || | |
4666 f1 > MAX_ICHAR_FIELD1_PRIVATE) | |
771 | 4667 return 0; |
4668 /* octets not out of range */ | |
4669 if (f2 < 0x20 || f3 < 0x20) | |
4670 return 0; | |
4671 | |
4672 #ifdef ENABLE_COMPOSITE_CHARS | |
4673 if (f1 + FIELD1_TO_OFFICIAL_LEADING_BYTE == LEADING_BYTE_COMPOSITE) | |
4674 { | |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4675 if (UNBOUNDP (Fgethash (make_fixnum (ch), |
771 | 4676 Vcomposite_char_char2string_hash_table, |
4677 Qunbound))) | |
4678 return 0; | |
4679 return 1; | |
4680 } | |
4681 #endif /* ENABLE_COMPOSITE_CHARS */ | |
4682 | |
4683 /* charset exists */ | |
867 | 4684 if (f1 <= MAX_ICHAR_FIELD1_OFFICIAL) |
771 | 4685 charset = |
826 | 4686 charset_by_leading_byte (f1 + FIELD1_TO_OFFICIAL_LEADING_BYTE); |
771 | 4687 else |
4688 charset = | |
826 | 4689 charset_by_leading_byte (f1 + FIELD1_TO_PRIVATE_LEADING_BYTE); |
771 | 4690 |
4691 if (EQ (charset, Qnil)) | |
4692 return 0; | |
4693 /* check range as per size (94x94 or 96x96) of charset */ | |
4694 return ((f2 != 0x20 && f2 != 0x7F && f3 != 0x20 && f3 != 0x7F) || | |
4695 XCHARSET_CHARS (charset) == 96); | |
4696 } | |
4697 } | |
4698 | |
4699 /* Copy the character pointed to by SRC into DST. Do not call this | |
867 | 4700 directly. Use the macro itext_copy_ichar() instead. |
771 | 4701 Return the number of bytes copied. */ |
4702 | |
4703 Bytecount | |
867 | 4704 non_ascii_itext_copy_ichar (const Ibyte *src, Ibyte *dst) |
771 | 4705 { |
826 | 4706 Bytecount bytes = rep_bytes_by_first_byte (*src); |
771 | 4707 Bytecount i; |
4708 for (i = bytes; i; i--, dst++, src++) | |
4709 *dst = *src; | |
4710 return bytes; | |
4711 } | |
4712 | |
4713 #endif /* MULE */ | |
4714 | |
4715 | |
4716 /************************************************************************/ | |
867 | 4717 /* streams of Ichars */ |
771 | 4718 /************************************************************************/ |
4719 | |
4720 #ifdef MULE | |
4721 | |
867 | 4722 /* Treat a stream as a stream of Ichar's rather than a stream of bytes. |
771 | 4723 The functions below are not meant to be called directly; use |
4724 the macros in insdel.h. */ | |
4725 | |
867 | 4726 Ichar |
4727 Lstream_get_ichar_1 (Lstream *stream, int ch) | |
771 | 4728 { |
867 | 4729 Ibyte str[MAX_ICHAR_LEN]; |
4730 Ibyte *strptr = str; | |
771 | 4731 Bytecount bytes; |
4732 | |
867 | 4733 str[0] = (Ibyte) ch; |
771 | 4734 |
826 | 4735 for (bytes = rep_bytes_by_first_byte (ch) - 1; bytes; bytes--) |
771 | 4736 { |
4737 int c = Lstream_getc (stream); | |
800 | 4738 text_checking_assert (c >= 0); |
867 | 4739 *++strptr = (Ibyte) c; |
771 | 4740 } |
867 | 4741 return itext_ichar (str); |
771 | 4742 } |
4743 | |
4744 int | |
867 | 4745 Lstream_fput_ichar (Lstream *stream, Ichar ch) |
771 | 4746 { |
867 | 4747 Ibyte str[MAX_ICHAR_LEN]; |
4748 Bytecount len = set_itext_ichar (str, ch); | |
771 | 4749 return Lstream_write (stream, str, len); |
4750 } | |
4751 | |
4752 void | |
867 | 4753 Lstream_funget_ichar (Lstream *stream, Ichar ch) |
771 | 4754 { |
867 | 4755 Ibyte str[MAX_ICHAR_LEN]; |
4756 Bytecount len = set_itext_ichar (str, ch); | |
771 | 4757 Lstream_unread (stream, str, len); |
4758 } | |
4759 | |
4760 #endif /* MULE */ | |
4761 | |
4762 | |
4763 /************************************************************************/ | |
4764 /* Lisp primitives for working with characters */ | |
4765 /************************************************************************/ | |
4766 | |
4767 DEFUN ("make-char", Fmake_char, 2, 3, 0, /* | |
4768 Make a character from CHARSET and octets ARG1 and ARG2. | |
4769 ARG2 is required only for characters from two-dimensional charsets. | |
4770 | |
4771 Each octet should be in the range 32 through 127 for a 96 or 96x96 | |
4772 charset and 33 through 126 for a 94 or 94x94 charset. (Most charsets | |
4773 are either 96 or 94x94.) Note that this is 32 more than the values | |
4774 typically given for 94x94 charsets. When two octets are required, the | |
4775 order is "standard" -- the same as appears in ISO-2022 encodings, | |
4776 reference tables, etc. | |
4777 | |
4778 \(Note the following non-obvious result: Computerized translation | |
4779 tables often encode the two octets as the high and low bytes, | |
4780 respectively, of a hex short, while when there's only one octet, it | |
4781 goes in the low byte. When decoding such a value, you need to treat | |
4782 the two cases differently when calling make-char: One is (make-char | |
4783 CHARSET HIGH LOW), the other is (make-char CHARSET LOW).) | |
4784 | |
4785 For example, (make-char 'latin-iso8859-2 185) or (make-char | |
4786 'latin-iso8859-2 57) will return the Latin 2 character s with caron. | |
4787 | |
4788 As another example, the Japanese character for "kawa" (stream), which | |
4789 looks something like this: | |
4790 | |
4791 | | | |
4792 | | | | |
4793 | | | | |
4794 | | | | |
4795 / | | |
4796 | |
4797 appears in the Unicode Standard (version 2.0) on page 7-287 with the | |
4798 following values (see also page 7-4): | |
4799 | |
4800 U 5DDD (Unicode) | |
4801 G 0-2008 (GB 2312-80) | |
4802 J 0-3278 (JIS X 0208-1990) | |
4803 K 0-8425 (KS C 5601-1987) | |
4804 B A474 (Big Five) | |
4805 C 1-4455 (CNS 11643-1986 (1st plane)) | |
4806 A 213C34 (ANSI Z39.64-1989) | |
4807 | |
4808 These are equivalent to: | |
4809 | |
4810 \(make-char 'chinese-gb2312 52 40) | |
4811 \(make-char 'japanese-jisx0208 64 110) | |
4812 \(make-char 'korean-ksc5601 116 57) | |
4813 \(make-char 'chinese-cns11643-1 76 87) | |
4814 \(decode-big5-char '(164 . 116)) | |
4815 | |
4816 \(All codes above are two decimal numbers except for Big Five and ANSI | |
4817 Z39.64, which we don't support. We add 32 to each of the decimal | |
4818 numbers. Big Five is split in a rather hackish fashion into two | |
4819 charsets, `big5-1' and `big5-2', due to its excessive size -- 94x157, | |
4820 with the first codepoint in the range 0xA1 to 0xFE and the second in | |
4821 the range 0x40 to 0x7E or 0xA1 to 0xFE. `decode-big5-char' is used to | |
4822 generate the char from its codes, and `encode-big5-char' extracts the | |
4823 codes.) | |
4824 | |
4825 When compiled without MULE, this function does not do much, but it's | |
4826 provided for compatibility. In this case, the following CHARSET symbols | |
4827 are allowed: | |
4828 | |
4829 `ascii' -- ARG1 should be in the range 0 through 127. | |
4830 `control-1' -- ARG1 should be in the range 128 through 159. | |
4831 else -- ARG1 is coerced to be between 0 and 255, and then the high | |
4832 bit is set. | |
4833 | |
4834 `int-to-char of the resulting ARG1' is returned, and ARG2 is always ignored. | |
4835 */ | |
2333 | 4836 (charset, arg1, USED_IF_MULE (arg2))) |
771 | 4837 { |
4838 #ifdef MULE | |
4839 Lisp_Charset *cs; | |
4840 int a1, a2; | |
4841 int lowlim, highlim; | |
4842 | |
4843 charset = Fget_charset (charset); | |
4844 cs = XCHARSET (charset); | |
4845 | |
788 | 4846 get_charset_limits (charset, &lowlim, &highlim); |
771 | 4847 |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4848 CHECK_FIXNUM (arg1); |
771 | 4849 /* It is useful (and safe, according to Olivier Galibert) to strip |
4850 the 8th bit off ARG1 and ARG2 because it allows programmers to | |
4851 write (make-char 'latin-iso8859-2 CODE) where code is the actual | |
4852 Latin 2 code of the character. */ | |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4853 a1 = XFIXNUM (arg1) & 0x7f; |
771 | 4854 if (a1 < lowlim || a1 > highlim) |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4855 args_out_of_range_3 (arg1, make_fixnum (lowlim), make_fixnum (highlim)); |
771 | 4856 |
4857 if (CHARSET_DIMENSION (cs) == 1) | |
4858 { | |
4859 if (!NILP (arg2)) | |
4860 invalid_argument | |
4861 ("Charset is of dimension one; second octet must be nil", arg2); | |
867 | 4862 return make_char (make_ichar (charset, a1, 0)); |
771 | 4863 } |
4864 | |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4865 CHECK_FIXNUM (arg2); |
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4866 a2 = XFIXNUM (arg2) & 0x7f; |
771 | 4867 if (a2 < lowlim || a2 > highlim) |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4868 args_out_of_range_3 (arg2, make_fixnum (lowlim), make_fixnum (highlim)); |
771 | 4869 |
867 | 4870 return make_char (make_ichar (charset, a1, a2)); |
771 | 4871 #else |
4872 int a1; | |
4873 int lowlim, highlim; | |
4874 | |
4875 if (EQ (charset, Qascii)) lowlim = 0, highlim = 127; | |
4876 else if (EQ (charset, Qcontrol_1)) lowlim = 0, highlim = 31; | |
4877 else lowlim = 0, highlim = 127; | |
4878 | |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4879 CHECK_FIXNUM (arg1); |
771 | 4880 /* It is useful (and safe, according to Olivier Galibert) to strip |
4881 the 8th bit off ARG1 and ARG2 because it allows programmers to | |
4882 write (make-char 'latin-iso8859-2 CODE) where code is the actual | |
4883 Latin 2 code of the character. */ | |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4884 a1 = XFIXNUM (arg1) & 0x7f; |
771 | 4885 if (a1 < lowlim || a1 > highlim) |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4886 args_out_of_range_3 (arg1, make_fixnum (lowlim), make_fixnum (highlim)); |
771 | 4887 |
4888 if (EQ (charset, Qascii)) | |
4889 return make_char (a1); | |
4890 return make_char (a1 + 128); | |
4891 #endif /* MULE */ | |
4892 } | |
4893 | |
4894 #ifdef MULE | |
4895 | |
4896 DEFUN ("char-charset", Fchar_charset, 1, 1, 0, /* | |
4897 Return the character set of char CH. | |
4898 */ | |
4899 (ch)) | |
4900 { | |
4901 CHECK_CHAR_COERCE_INT (ch); | |
4902 | |
826 | 4903 return XCHARSET_NAME (charset_by_leading_byte |
867 | 4904 (ichar_leading_byte (XCHAR (ch)))); |
771 | 4905 } |
4906 | |
4907 DEFUN ("char-octet", Fchar_octet, 1, 2, 0, /* | |
4908 Return the octet numbered N (should be 0 or 1) of char CH. | |
4909 N defaults to 0 if omitted. | |
4910 */ | |
4911 (ch, n)) | |
4912 { | |
4913 Lisp_Object charset; | |
4914 int octet0, octet1; | |
4915 | |
4916 CHECK_CHAR_COERCE_INT (ch); | |
4917 | |
867 | 4918 BREAKUP_ICHAR (XCHAR (ch), charset, octet0, octet1); |
771 | 4919 |
4920 if (NILP (n) || EQ (n, Qzero)) | |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4921 return make_fixnum (octet0); |
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4922 else if (EQ (n, make_fixnum (1))) |
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4923 return make_fixnum (octet1); |
771 | 4924 else |
4925 invalid_constant ("Octet number must be 0 or 1", n); | |
4926 } | |
4927 | |
3724 | 4928 #endif /* MULE */ |
4929 | |
771 | 4930 DEFUN ("split-char", Fsplit_char, 1, 1, 0, /* |
4931 Return list of charset and one or two position-codes of CHAR. | |
4932 */ | |
4933 (character)) | |
4934 { | |
4935 /* This function can GC */ | |
4936 struct gcpro gcpro1, gcpro2; | |
4937 Lisp_Object charset = Qnil; | |
4938 Lisp_Object rc = Qnil; | |
4939 int c1, c2; | |
4940 | |
4941 GCPRO2 (charset, rc); | |
4942 CHECK_CHAR_COERCE_INT (character); | |
4943 | |
867 | 4944 BREAKUP_ICHAR (XCHAR (character), charset, c1, c2); |
771 | 4945 |
3724 | 4946 if (XCHARSET_DIMENSION (charset) == 2) |
771 | 4947 { |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4948 rc = list3 (XCHARSET_NAME (charset), make_fixnum (c1), make_fixnum (c2)); |
771 | 4949 } |
4950 else | |
4951 { | |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4952 rc = list2 (XCHARSET_NAME (charset), make_fixnum (c1)); |
771 | 4953 } |
4954 UNGCPRO; | |
4955 | |
4956 return rc; | |
4957 } | |
4958 | |
4959 | |
4960 /************************************************************************/ | |
4961 /* composite character functions */ | |
4962 /************************************************************************/ | |
4963 | |
4964 #ifdef ENABLE_COMPOSITE_CHARS | |
4965 | |
867 | 4966 Ichar |
4967 lookup_composite_char (Ibyte *str, int len) | |
771 | 4968 { |
4969 Lisp_Object lispstr = make_string (str, len); | |
4970 Lisp_Object ch = Fgethash (lispstr, | |
4971 Vcomposite_char_string2char_hash_table, | |
4972 Qunbound); | |
867 | 4973 Ichar emch; |
771 | 4974 |
4975 if (UNBOUNDP (ch)) | |
4976 { | |
4977 if (composite_char_row_next >= 128) | |
4978 invalid_operation ("No more composite chars available", lispstr); | |
867 | 4979 emch = make_ichar (Vcharset_composite, composite_char_row_next, |
771 | 4980 composite_char_col_next); |
4981 Fputhash (make_char (emch), lispstr, | |
4982 Vcomposite_char_char2string_hash_table); | |
4983 Fputhash (lispstr, make_char (emch), | |
4984 Vcomposite_char_string2char_hash_table); | |
4985 composite_char_col_next++; | |
4986 if (composite_char_col_next >= 128) | |
4987 { | |
4988 composite_char_col_next = 32; | |
4989 composite_char_row_next++; | |
4990 } | |
4991 } | |
4992 else | |
4993 emch = XCHAR (ch); | |
4994 return emch; | |
4995 } | |
4996 | |
4997 Lisp_Object | |
867 | 4998 composite_char_string (Ichar ch) |
771 | 4999 { |
5000 Lisp_Object str = Fgethash (make_char (ch), | |
5001 Vcomposite_char_char2string_hash_table, | |
5002 Qunbound); | |
5003 assert (!UNBOUNDP (str)); | |
5004 return str; | |
5005 } | |
5006 | |
826 | 5007 DEFUN ("make-composite-char", Fmake_composite_char, 1, 1, 0, /* |
771 | 5008 Convert a string into a single composite character. |
5009 The character is the result of overstriking all the characters in | |
5010 the string. | |
5011 */ | |
5012 (string)) | |
5013 { | |
5014 CHECK_STRING (string); | |
5015 return make_char (lookup_composite_char (XSTRING_DATA (string), | |
5016 XSTRING_LENGTH (string))); | |
5017 } | |
5018 | |
826 | 5019 DEFUN ("composite-char-string", Fcomposite_char_string, 1, 1, 0, /* |
771 | 5020 Return a string of the characters comprising a composite character. |
5021 */ | |
5022 (ch)) | |
5023 { | |
867 | 5024 Ichar emch; |
771 | 5025 |
5026 CHECK_CHAR (ch); | |
5027 emch = XCHAR (ch); | |
867 | 5028 if (ichar_leading_byte (emch) != LEADING_BYTE_COMPOSITE) |
771 | 5029 invalid_argument ("Must be composite char", ch); |
5030 return composite_char_string (emch); | |
5031 } | |
5032 #endif /* ENABLE_COMPOSITE_CHARS */ | |
5033 | |
5034 | |
5035 /************************************************************************/ | |
5036 /* initialization */ | |
5037 /************************************************************************/ | |
5038 | |
5039 void | |
1204 | 5040 reinit_eistring_early (void) |
771 | 5041 { |
5042 the_eistring_malloc_zero_init = the_eistring_zero_init; | |
5043 the_eistring_malloc_zero_init.mallocp_ = 1; | |
5044 } | |
5045 | |
/* One-time early eistring setup; all of the work is done by
   reinit_eistring_early (). */
void
init_eistring_once_early (void)
{
  reinit_eistring_early ();
}
5051 | |
5052 void | |
771 | 5053 syms_of_text (void) |
5054 { | |
5055 DEFSUBR (Fmake_char); | |
3724 | 5056 DEFSUBR (Fsplit_char); |
771 | 5057 |
5058 #ifdef MULE | |
5059 DEFSUBR (Fchar_charset); | |
5060 DEFSUBR (Fchar_octet); | |
5061 | |
5062 #ifdef ENABLE_COMPOSITE_CHARS | |
5063 DEFSUBR (Fmake_composite_char); | |
5064 DEFSUBR (Fcomposite_char_string); | |
5065 #endif | |
5066 #endif /* MULE */ | |
5067 } | |
5068 | |
5069 void | |
5070 reinit_vars_of_text (void) | |
5071 { | |
5072 int i; | |
5073 | |
867 | 5074 conversion_in_dynarr_list = Dynarr_new2 (Ibyte_dynarr_dynarr, |
5075 Ibyte_dynarr *); | |
771 | 5076 conversion_out_dynarr_list = Dynarr_new2 (Extbyte_dynarr_dynarr, |
5077 Extbyte_dynarr *); | |
5078 | |
5079 for (i = 0; i <= MAX_BYTEBPOS_GAP_SIZE_3; i++) | |
5080 three_to_one_table[i] = i / 3; | |
5081 } | |
5082 | |
5083 void | |
5084 vars_of_text (void) | |
5085 { | |
4952
19a72041c5ed
Mule-izing, various fixes related to char * arguments
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
5086 QSin_char_byte_conversion = build_defer_string ("(in char-byte conversion)"); |
1292 | 5087 staticpro (&QSin_char_byte_conversion); |
5088 QSin_internal_external_conversion = | |
4952
19a72041c5ed
Mule-izing, various fixes related to char * arguments
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
5089 build_defer_string ("(in internal-external conversion)"); |
1292 | 5090 staticpro (&QSin_internal_external_conversion); |
5091 | |
771 | 5092 #ifdef ENABLE_COMPOSITE_CHARS |
5093 /* #### not dumped properly */ | |
5094 composite_char_row_next = 32; | |
5095 composite_char_col_next = 32; | |
5096 | |
5097 Vcomposite_char_string2char_hash_table = | |
5191
71ee43b8a74d
Add #'equalp as a hash test by default; add #'define-hash-table-test, GNU API
Aidan Kehoe <kehoea@parhasard.net>
parents:
5013
diff
changeset
|
5098 make_lisp_hash_table (500, HASH_TABLE_NON_WEAK, Qequal); |
771 | 5099 Vcomposite_char_char2string_hash_table = |
5191
71ee43b8a74d
Add #'equalp as a hash test by default; add #'define-hash-table-test, GNU API
Aidan Kehoe <kehoea@parhasard.net>
parents:
5013
diff
changeset
|
5100 make_lisp_hash_table (500, HASH_TABLE_NON_WEAK, Qeq); |
771 | 5101 staticpro (&Vcomposite_char_string2char_hash_table); |
5102 staticpro (&Vcomposite_char_char2string_hash_table); | |
5103 #endif /* ENABLE_COMPOSITE_CHARS */ | |
5104 } |