Mercurial > hg > xemacs-beta
annotate src/text.c @ 5636:07256dcc0c8b
Add missing foreback specifier values to the GUI Element face.
They were missing for an inexplicable reason in my initial patch, leading to
nil color instances in the whole hierarchy of widget faces.
-------------------- ChangeLog entries follow: --------------------
src/ChangeLog addition:
2012-01-03 Didier Verna <didier@xemacs.org>
* faces.c (complex_vars_of_faces): Add missing foreback specifier
values to the GUI Element face.
author | Didier Verna <didier@lrde.epita.fr> |
---|---|
date | Tue, 03 Jan 2012 11:25:06 +0100 |
parents | 56144c8593a8 |
children | 7a538e1a4676 |
rev | line source |
---|---|
2367 | 1 /* Text manipulation primitives for XEmacs. |
771 | 2 Copyright (C) 1995 Sun Microsystems, Inc. |
2367 | 3 Copyright (C) 1995, 1996, 2000, 2001, 2002, 2003, 2004 Ben Wing. |
771 | 4 Copyright (C) 1999 Martin Buchholz. |
5 | |
6 This file is part of XEmacs. | |
7 | |
5402
308d34e9f07d
Changed bulk of GPLv2 or later files identified by script
Mats Lidell <matsl@xemacs.org>
parents:
5191
diff
changeset
|
8 XEmacs is free software: you can redistribute it and/or modify it |
771 | 9 under the terms of the GNU General Public License as published by the |
5402
308d34e9f07d
Changed bulk of GPLv2 or later files identified by script
Mats Lidell <matsl@xemacs.org>
parents:
5191
diff
changeset
|
10 Free Software Foundation, either version 3 of the License, or (at your |
308d34e9f07d
Changed bulk of GPLv2 or later files identified by script
Mats Lidell <matsl@xemacs.org>
parents:
5191
diff
changeset
|
11 option) any later version. |
771 | 12 |
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT | |
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
16 for more details. | |
17 | |
18 You should have received a copy of the GNU General Public License | |
5402
308d34e9f07d
Changed bulk of GPLv2 or later files identified by script
Mats Lidell <matsl@xemacs.org>
parents:
5191
diff
changeset
|
19 along with XEmacs. If not, see <http://www.gnu.org/licenses/>. */ |
771 | 20 |
21 /* Synched up with: Not in FSF. */ | |
22 | |
23 /* Authorship: | |
24 */ | |
25 | |
26 #include <config.h> | |
27 #include "lisp.h" | |
28 | |
29 #include "buffer.h" | |
30 #include "charset.h" | |
31 #include "file-coding.h" | |
32 #include "lstream.h" | |
1292 | 33 #include "profile.h" |
771 | 34 |
35 | |
36 /************************************************************************/ | |
37 /* long comments */ | |
38 /************************************************************************/ | |
39 | |
2367 | 40 /* NB: Everything below was written by Ben Wing except as otherwise noted. */ |
41 | |
42 /************************************************************************/ | |
43 /* */ | |
44 /* */ | |
45 /* Part A: More carefully-written documentation */ | |
46 /* */ | |
47 /* */ | |
48 /************************************************************************/ | |
49 | |
50 /* Authorship: Ben Wing | |
51 | |
771 | 52 |
826 | 53 ========================================================================== |
2367 | 54 7. Handling non-default formats |
826 | 55 ========================================================================== |
771 | 56 |
2367 | 57 We support, at least to some extent, formats other than the default |
58 variable-width format, for speed; all of these alternative formats are | |
59 fixed-width. Currently we only handle these non-default formats in | |
60 buffers, because access to their text is strictly controlled and thus | |
61 the details of the format mostly compartmentalized. The only really | |
62 tricky part is the search code -- the regex, Boyer-Moore, and | |
63 simple-search algorithms in search.c and regex.c. All other code that | |
64 knows directly about the buffer representation is the basic code to | |
65 modify or retrieve the buffer text. | |
66 | |
67 Supporting fixed-width formats in Lisp strings is harder, but possible | |
68 -- FSF currently does this, for example. In this case, however, | |
69 probably only 8-bit-fixed is reasonable for Lisp strings -- getting | |
70 non-ASCII-compatible fixed-width formats to work is much, much harder | |
71 because a lot of code assumes that strings are ASCII-compatible | |
72 (i.e. ASCII + other characters represented exclusively using high-bit | |
73 bytes) and a lot of code mixes Lisp strings and non-Lisp strings freely. | |
74 | |
75 The different possible fixed-width formats are 8-bit fixed, 16-bit | |
76 fixed, and 32-bit fixed. The latter can represent all possible | |
77 characters, but at a substantial memory penalty. The other two can | |
78 represent only a subset of the possible characters. How these subsets | |
79 are defined can be simple or very tricky. | |
80 | |
81 Currently we support only the default format and the 8-bit fixed format, | |
82 and the latter may only hold the first 256 characters in | |
83 an Ichar (ASCII and Latin 1). | |
84 | |
85 One reasonable approach for 8-bit fixed is to allow the upper half to | |
86 represent any 1-byte charset, which is specified on a per-buffer basis. | |
87 This should work fairly well in practice since most documents are in | |
88 only one foreign language (possibly with some English mixed in). I | |
89 think FSF does something like this; or at least, they have something | |
90 called nonascii-translation-table and use it when converting from | |
91 8-bit-fixed text ("unibyte text") to default text ("multibyte text"). | |
92 With 16-bit fixed, you could do something like assign chunks of the 64K | |
93 worth of characters to charsets as they're encountered in documents. | |
94 This should work well with most Asian documents. | |
95 | |
96 If/when we switch to using Unicode internally, we might have formats more | |
97 like this: | |
98 | |
99 -- UTF-8 or some extension as the default format. Perl uses an | |
100 extension that handles 64-bit chars and requires as much as 13 bytes per | |
101 char, vs. the standard of 31-bit chars and 6 bytes max. UTF-8 has the | |
102 same basic properties as our own variable-width format (see text.c, | |
103 Internal String Encoding) and so most code would not need to be changed. | |
104 | |
105 -- UTF-16 as a "pseudo-fixed" format (i.e. 16-bit fixed plus surrogates | |
106 for representing characters not in the BMP, aka >= 65536). The vast | |
107 majority of documents will have no surrogates in them so byte/char | |
108 conversion will be very fast. | |
109 | |
110 -- an 8-bit fixed format, like currently. | |
111 | |
112 -- possibly, UCS-4 as a 32-bit fixed format. | |
113 | |
114 The fixed-width formats essentially treat the buffer as an array of | |
115 8-bit, 16-bit or 32-bit integers. This means that how they are stored | |
116 in memory (in particular, big-endian or little-endian) depends on the | |
117 native format of the machine's processor. It also means we have to | |
118 worry a bit about alignment (basically, we just need to keep the gap an | |
119 integral multiple of the character size, and get things aligned properly | |
120 when converting the buffer between formats). | |
826 | 121 |
122 ========================================================================== | |
2367 | 123 8. Using UTF-16 as the default text format |
826 | 124 ========================================================================== |
125 | |
2367 | 126 NOTE: The Eistring API is (or should be) Mule-correct even without |
127 an ASCII-compatible internal representation. | |
128 | |
129 #### Currently, the assumption that text units are one byte in size is | |
130 embedded throughout XEmacs, and `Ibyte *' is used where `Itext *' should | |
131 be. The way to fix this is to (among other things) | |
132 | |
133 (a) review all places referencing `Ibyte' and `Ibyte *', change them to | |
134 use Itext, and fix up the code. | |
135 (b) change XSTRING_DATA to be of type Itext * | |
136 (c) review all uses of XSTRING_DATA | |
137 (d) eliminate XSTRING_LENGTH, splitting it into XSTRING_BYTE_LENGTH and | |
138 XSTRING_TEXT_LENGTH and reviewing all places referencing this | |
139 (e) make similar changes to other API's that refer to the "length" of | |
140 something, such as qxestrlen() and eilen() | |
141 (f) review all uses of `CIbyte *'. Currently this is usually a way of | |
142 passing literal ASCII text strings in places that want internal text. | |
143 Either create separate _ascii() and _itext() versions of the | |
144 functions taking CIbyte *, or make use of something like the | |
145 WEXTTEXT() macro, which will generate wide strings as appropriate. | |
146 (g) review all uses of Bytecount and see which ones should be Textcount. | |
147 (h) put in error-checking code that will be tripped as often as possible | |
148 when doing anything with internal text, and check to see that ASCII | |
149 text has not mistakenly filtered in. This should be fairly easy as | |
150 ASCII text will generally be entirely spaces and letters whereas every | |
151 second byte of Unicode text will generally be a null byte. Either we | |
152 abort if the second bytes are entirely letters and numbers, or, | |
153 perhaps better, do the equivalent of a non-MULE build, where we should | |
154 be dealing entirely with 8-bit characters, and assert that the high | |
155 bytes of each pair are null. | |
156 (i) review places where xmalloc() is called. If we convert each use of | |
157 xmalloc() to instead be xnew_array() or some other typed routine, | |
158 then we will find every place that allocates space for Itext and | |
159 assumes it is based on one-byte units. | |
160 (j) encourage the use of ITEXT_ZTERM_SIZE instead of '+ 1' whenever we | |
161 are adding space for a zero-terminator, to emphasize what we are | |
162 doing and make sure the calculations are correct. Similarly for | |
163 EXTTEXT_ZTERM_SIZE. | |
164 (k) Note that the qxestr*() functions, among other things, will need to | |
165 be rewritten. | |
166 | |
167 Note that this is a lot of work, and is not high on the list of priorities | |
168 currently. | |
826 | 169 |
170 ========================================================================== | |
2367 | 171 9. Miscellaneous |
826 | 172 ========================================================================== |
173 | |
174 A. Unicode Support | |
771 | 175 |
1292 | 176 Unicode support is very desirable. Currently we know how to handle |
177 externally-encoded Unicode data in various encodings -- UTF-16, UTF-8, | |
178 etc. However, we really need to represent Unicode characters internally | |
179 as-is, rather than converting to some language-specific character set. | |
180 For efficiency, we should represent Unicode characters using 3 bytes | |
181 rather than 4. This means we need to find leading bytes for Unicode. | |
182 Given that there are 65,536 characters in Unicode and we can attach | |
183 96x96 = 9,216 characters per leading byte, we need eight leading bytes | |
184 for Unicode. We currently have four free (0x9A - 0x9D), and with a | |
185 little bit of rearranging we can get five: ASCII doesn't really need to | |
186 take up a leading byte. (We could just as well use 0x7F, with a little | |
187 change to the functions that assume that 0x80 is the lowest leading | |
188 byte.) This means we still need to dump three leading bytes and move | |
189 them into private space. The CNS charsets are good candidates since | |
190 they are rarely used, and JAPANESE_JISX0208_1978 is becoming less and | |
191 less used and could also be dumped. | |
826 | 192 |
193 B. Composite Characters | |
194 | |
195 Composite characters are characters constructed by overstriking two | |
771 | 196 or more regular characters. |
197 | |
198 1) The old Mule implementation involves storing composite characters | |
199 in a buffer as a tag followed by all of the actual characters | |
200 used to make up the composite character. I think this is a bad | |
201 idea; it greatly complicates code that wants to handle strings | |
202 one character at a time because it has to deal with the possibility | |
203 of great big ungainly characters. It's much more reasonable to | |
204 simply store an index into a table of composite characters. | |
205 | |
206 2) The current implementation only allows for 16,384 separate | |
207 composite characters over the lifetime of the XEmacs process. | |
208 This could become a potential problem if the user | |
209 edited lots of different files that use composite characters. | |
210 Due to FSF bogosity, increasing the number of allowable | |
211 composite characters under Mule would decrease the number | |
212 of possible faces that can exist. Mule already has shrunk | |
213 this to 2048, and further shrinkage would become uncomfortable. | |
214 No such problems exist in XEmacs. | |
215 | |
3498 | 216 Composite characters could be represented as 0x8D C1 C2 C3, where each |
217 C[1-3] is in the range 0xA0 - 0xFF. This allows for slightly under | |
218 2^20 (one million) composite characters over the XEmacs process | |
219 lifetime. Or you could use 0x8D C1 C2 C3 C4, allowing for about 85 | |
220 million (slightly over 2^26) composite characters. | |
826 | 221 |
2367 | 222 ========================================================================== |
223 10. Internal API's | |
224 ========================================================================== | |
225 | |
226 All of these are documented in more detail in text.h. | |
227 | |
228 @enumerate | |
229 @item | |
230 Basic internal-format API's | |
231 | |
232 These are simple functions and macros to convert between text | |
233 representation and characters, move forward and back in text, etc. | |
234 | |
235 @item | |
236 The DFC API | |
237 | |
238 This is for conversion between internal and external text. Note that | |
239 there is also the "new DFC" API, which *returns* a pointer to the | |
240 converted text (in alloca space), rather than storing it into a | |
241 variable. | |
242 | |
243 @item | |
244 The Eistring API | |
245 | |
4073 | 246 \(This API is currently under-used) When doing simple things with |
2367 | 247 internal text, the basic internal-format API's are enough. But to do |
248 things like delete or replace a substring, concatenate various strings, | |
249 etc. is difficult to do cleanly because of the allocation issues. | |
250 The Eistring API is designed to deal with this, and provides a clean | |
251 way of modifying and building up internal text. (Note that the former | |
252 lack of this API has meant that some code uses Lisp strings to do | |
253 similar manipulations, resulting in excess garbage and increased | |
254 garbage collection.) | |
255 | |
256 NOTE: The Eistring API is (or should be) Mule-correct even without | |
257 an ASCII-compatible internal representation. | |
258 @end enumerate | |
259 | |
260 ========================================================================== | |
261 11. Other Sources of Documentation | |
262 ========================================================================== | |
263 | |
264 man/lispref/mule.texi | |
265 @enumerate | |
266 @item | |
267 another intro to characters, encodings, etc; #### Merge with the | |
268 above info | |
269 @item | |
270 documentation of ISO-2022 | |
271 @item | |
272 The charset and coding-system Lisp API's | |
273 @item | |
274 The CCL conversion language for writing encoding conversions | |
275 @item | |
276 The Latin-Unity package for unifying Latin charsets | |
277 @end enumerate | |
278 | |
279 man/internals/internals.texi (the Internals manual) | |
280 @enumerate | |
281 @item | |
282 "Coding for Mule" -- how to write Mule-aware code | |
283 @item | |
284 "Modules for Internationalization" | |
285 @item | |
286 "The Text in a Buffer" -- more about the different ways of | |
287 viewing buffer positions; #### Merge with the above info | |
288 @item | |
289 "MULE Character Sets and Encodings" -- yet another intro | |
290 to characters, encodings, etc; #### Merge with the | |
291 above info; also some documentation of Japanese EUC and JIS7, | |
292 and CCL internals | |
293 @end enumerate | |
294 | |
295 text.h -- info about specific XEmacs-C API's for handling internal and | |
296 external text | |
297 | |
298 intl-win32.c -- Windows-specific I18N information | |
299 | |
300 lisp.h -- some info appears alongside the definitions of the basic | |
301 character-related types | |
302 | |
303 unicode.c -- documentation about Unicode translation tables | |
826 | 304 */ |
771 | 305 |
2367 | 306 |
307 /************************************************************************/ | |
308 /* */ | |
309 /* */ | |
310 /* Part B: Random proposals for work to be done */ | |
311 /* */ | |
312 /* */ | |
313 /************************************************************************/ | |
314 | |
315 | |
316 /* | |
317 | |
318 | |
319 ========================================================================== | |
320 - Mule design issues (ben) | |
321 ========================================================================== | |
322 | |
323 circa 1999 | |
324 | |
325 Here is a more detailed list of Mule-related projects that we will be | |
326 working on. They are more or less ordered according to how we will | |
327 proceed, but it's not exact. In particular, there will probably be | |
328 time overlap among adjacent projects. | |
329 | |
330 @enumerate | |
331 @item | |
332 Modify the internal/external conversion macros to allow for | |
333 MS Windows support. | |
334 | |
335 @item | |
336 Modify the buffer macros to allow for more than one internal | |
337 representation, e.g. fixed width and variable width. | |
338 | |
339 @item | |
340 Review the existing Mule code, especially the lisp code, for code | |
341 quality issues and improve the cleanliness of it. Also work on | |
342 creating a specification for the Mule API. | |
343 | |
344 @item | |
345 Write some more automated mule tests. | |
346 | |
347 @item | |
348 Integrate Tomohiko's UTF-2000 code, fixing it up so that nothing is | |
349 broken when the UTF-2000 configure option is not enabled. | |
350 | |
351 @item | |
352 Fix up the MS Windows code to be Mule-correct, so that you can | |
353 compile with Mule support under MS windows and have a working | |
354 XEmacs, at least just with Latin-1. | |
355 | |
356 @item | |
357 Implement a scheme to guarantee no corruption of files, even with | |
358 an incorrect coding system - in particular, guarantee no corruption | |
359 of binary files. | |
360 | |
361 @item | |
362 Make the text property support in XEmacs robust with respect to | |
363 string and text operations, so that the `no corruption' support in | |
364 the previous entry works properly, even if a lot of cutting and | |
365 pasting is done. | |
366 | |
367 @item | |
368 Improve the handling of auto-detection so that, when there is any | |
369 possibility at all of mistake, the user is informed of the detected | |
370 encoding and given the choice of choosing other possibilities. | |
371 | |
372 @item | |
373 Improve the support for different language environments in XEmacs, | |
374 for example, the priority of coding systems used in auto-detection | |
375 should properly reflect the language environment. This probably | |
376 necessitates rethinking the current `coding system priority' | |
377 scheme. | |
378 | |
379 @item | |
380 Do quality work to improve the existing UTF-2000 implementation. | |
381 | |
382 @item | |
383 Implement preliminary support for 8-bit fixed width | |
384 representation. First, we will only implement 7-bit support, and | |
385 will fall back to variable width as soon as any non-ASCII | |
386 character is encountered. Then we will improve the support to | |
387 handle an arbitrary character set in the upper half of the 8-bit space. | |
388 | |
389 @item | |
390 Investigate any remaining hurdles to making --with-mule be the | |
391 default configure option. | |
392 @end enumerate | |
393 | |
394 ========================================================================== | |
395 - Mule design issues (stephen) | |
396 ========================================================================== | |
397 | |
398 What I see as Mule priorities (in rough benefit order, I am not taking | |
399 account of difficulty, nor the fact that some - eg 8 & 10 - will | |
400 probably come as packages): | |
401 | |
402 @enumerate | |
403 @item | |
404 Fix the autodetect problem (by making the coding priority list | |
405 user-configurable, as short as he likes, even null, with "binary" | |
406 as the default). | |
407 @item | |
408 Document the language environments and other Mule "APIs" as | |
409 implemented (since there is no real design spec). Check to see | |
410 how and where they are broken. | |
411 @item | |
412 Make the Mule menu useful to non-ISO-2022-literate folks. | |
413 @item | |
414 Redo the lstreams stuff to make it easy and robust to "pipeline", | |
415 eg, libz | gnupg | jis2mule. | |
416 @item | |
417 Make Custom Mule-aware. (This probably depends on a sensible | |
418 fonts model.) | |
419 @item | |
420 Implement the "literal byte stream" memory feature. | |
421 @item | |
422 Study the FSF implementation of Mule for background for 7 & 8. | |
423 @item | |
424 Identify desirable Mule features (eg, i18n-ized messages as above, | |
425 collating tables by language environment, etc). (New features | |
426 might have priority as high as 9.) | |
427 @item | |
428 Specify Mule UIs, APIs, etc, and design and (re)implement them. | |
429 @item | |
430 Implement the 8-bit-wide buffer optimization. | |
431 @item | |
432 Move the internal encoding to UTF-32 (subject to Olivier's caveats | |
433 regarding compose characters), with the variable-width char | |
434 buffers using UTF-8. | |
435 @item | |
436 Implement the 16- and 32-bit-wide buffer optimizations. | |
437 @end enumerate | |
438 | |
439 ========================================================================== | |
440 - Mule design issues "short term" (ben) | |
441 ========================================================================== | |
442 | |
443 @enumerate | |
444 @item | |
445 Finish changes in fixup/directory, get in CVS. | |
446 | |
447 (Test with and without "quick-build", to see if really faster) | |
448 (need autoconf) | |
449 | |
450 @item | |
451 Finish up Windows/Mule changes. Outline of this elsewhere; Do | |
452 *minimal* effort. | |
453 | |
454 @item | |
455 Continue work on Windows stability, e.g. go through existing notes | |
456 on Windows Mule-ization + extract all info. | |
457 | |
458 @item | |
459 Get Unicode translation tables integrated. | |
460 | |
461 Finish UCS2/UTF16 coding system. | |
462 | |
463 @item | |
464 Make sure coding system priority list is language-environment specific. | |
465 | |
466 @item | |
467 Consider moving language selection Menu up to be parallel with Mule menu. | |
468 | |
469 @item | |
470 Check to make sure we grok the default locale at startup under | |
471 Windows and understand the Windows locales. Finish implementation | |
472 of mswindows-multibyte and make sure it groks all the locales. | |
473 | |
474 @item | |
475 Do the above as best as we can without using Unicode tables. | |
476 | |
477 @item | |
478 Start tagging all text with a language text property, | |
479 indicating the current language environment when the text was input. | |
480 | |
481 @item | |
482 Make sure we correctly accept input of non-ASCII chars | |
483 (probably already do!) | |
484 | |
485 @item | |
486 Implement active language/keyboard switching under Windows. | |
487 | |
488 @item | |
489 Look into implementing support for "MS IME" protocol (Microsoft | |
490 fancy built-in Asian input methods). | |
491 | |
492 @item | |
493 Redo implementation of mswindows-multibyte and internal display to | |
494 entirely use translation to/from Unicode for increased accuracy. | |
495 | |
496 @item | |
497 Implement buf<->char improvements from FSF. Also implement | |
498 my string byte<->char optimization structure. | |
499 | |
500 @item | |
501 Integrate all Mule DOCS from 20.6 or 21.0. Try to add sections | |
502 for what we've added. | |
503 | |
504 @item | |
505 Implement 8-bit fixed width optimizations. Then work on 16-bit. | |
506 @end enumerate | |
507 | |
508 ========================================================================== | |
509 - Mule design issues (more) (ben) | |
510 ========================================================================== | |
511 | |
512 Get minimal Mule for Windows working using Ikeyama's patches. At | |
513 first, rely on his conversion of internal -> external | |
514 locale-specific but very soon (as soon as we get translation | |
515 tables) can switch to using Unicode versions of display funs, which | |
516 will allow many more charsets to be handled and in a more | |
517 consistent fashion. | |
518 | |
519 i.e. to convert an internal string to an external format, at first | |
520 we use our own knowledge of the Microsoft locale file formats but | |
521 an alternative is to convert to Unicode and use Microsoft's | |
522 convert-Unicode-to-locale encoding functions. This gains us a | |
523 great deal of generality, since in practice all charset caching | |
524 points can be wrapped into Unicode caching points. | |
525 | |
526 This requires adding UCS2 support, which I'm doing. This support | |
527 would let us convert internal -> Unicode, which is exactly what we | |
528 want. | |
529 | |
530 At first, though, I would do the UCS2 support, but leave the | |
531 existing way of doing things in redisplay. Meanwhile, I'd go | |
532 through and fix up the places in the code that assume we are | |
533 dealing with unibytes. | |
534 | |
535 After this, the font problems will be fixed, and we should have a | |
536 pretty well working XEmacs + MULE under Windows. The only real | |
537 other work is the clipboard code, which should be straightforward. | |
538 | |
539 ========================================================================== | |
540 - Mule design discussion | |
541 ========================================================================== | |
542 | |
543 -------------------------------------------------------------------------- | |
544 | |
545 Ben | |
546 | |
547 April 11, 2000 | |
548 | |
549 Well yes, this was the whole point of my "no lossage" proposal of being | |
550 able to undo any coding-system transformation on a buffer. The idea was | |
5384
3889ef128488
Fix misspelled words, and some grammar, across the entire source tree.
Jerry James <james@xemacs.org>
parents:
5191
diff
changeset
|
551 to figure out which transformations were definitely reversible, and for |
2367 | 552 all the others, cache the original text in a text property. This way, you |
553 could probably still do a fairly good job at constructing a good reversal | |
554 even after you've gone into the text and added, deleted, and rearranged | |
555 some things. | |
556 | |
557 But you could implement it much more simply and usefully by just | |
558 determining, for any text being decoded into mule-internal, can we go back | |
559 and read the source again? If not, remember the entire file (GNUS | |
560 message, etc) in text properties. Then, implement the UI interface (like | |
561 Netscape's) on top of that. This way, you have something that at least | |
562 works, but it might be inefficient. All we would need to do is work on | |
563 making the | |
564 underlying implementation more efficient. | |
565 | |
566 Are you interested in doing this? It would be a huge win for users. | |
567 Hrvoje Niksic wrote: | |
568 | |
569 > Ben Wing <ben@666.com> writes: | |
570 > | |
571 > > let me know exactly what "rethink" functionality you want and i'll | |
572 > > come up with an interface. perhaps you just want something like | |
573 > > netscape's encoding menu, where if you switch encodings, it reloads | |
574 > > and reencodes? | |
575 > | |
576 > It might be a bit more complex than that. In many cases, it's hard or | |
577 > impossible to meaningfully "reload" -- for instance, this | |
578 > functionality should be available while editing a Gnus message, as | |
579 > well as while visiting a file. | |
580 > | |
581 > For the special case of Latin-N <-> Latin-M conversion, things could | |
582 > be done easily -- to convert from N to M, you only need to convert | |
583 > internal representation back to N, and then convert it forth to M. | |
584 | |
585 -------------------------------------------------------------------------- | |
586 April 11, 2000 | |
587 | |
588 Well yes, this was the whole point of my "no lossage" proposal of being | |
589 able to undo any coding-system transformation on a buffer. The idea was | |
5384
3889ef128488
Fix misspelled words, and some grammar, across the entire source tree.
Jerry James <james@xemacs.org>
parents:
5191
diff
changeset
|
590 to figure out which transformations were definitely reversible, and for |
2367 | 591 all the others, cache the original text in a text property. This way, you |
592 could probably still do a fairly good job at constructing a good reversal | |
593 even after you've gone into the text and added, deleted, and rearranged | |
594 some things. | |
595 | |
596 But you could implement it much more simply and usefully by just | |
597 determining, for any text being decoded into mule-internal, can we go back | |
598 and read the source again? If not, remember the entire file (GNUS | |
599 message, etc) in text properties. Then, implement the UI interface (like | |
600 Netscape's) on top of that. This way, you have something that at least | |
601 works, but it might be inefficient. All we would need to do is work on | |
602 making the | |
603 underlying implementation more efficient. | |
604 | |
605 Are you interested in doing this? It would be a huge win for users. | |
606 Hrvoje Niksic wrote: | |
607 | |
608 > Ben Wing <ben@666.com> writes: | |
609 > | |
610 > > let me know exactly what "rethink" functionality you want and i'll | |
611 > > come up with an interface. perhaps you just want something like | |
612 > > netscape's encoding menu, where if you switch encodings, it reloads | |
613 > > and reencodes? | |
614 > | |
615 > It might be a bit more complex than that. In many cases, it's hard or | |
616 > impossible to meaningfully "reload" -- for instance, this | |
617 > functionality should be available while editing a Gnus message, as | |
618 > well as while visiting a file. | |
619 > | |
620 > For the special case of Latin-N <-> Latin-M conversion, things could | |
621 > be done easily -- to convert from N to M, you only need to convert | |
622 > internal representation back to N, and then convert it forth to M. | |
623 | |
624 | |
625 ------------------------------------------------------------------------ | |
626 | |
627 ========================================================================== | |
628 - Redoing translation macros [old] | |
629 ========================================================================== | |
630 | |
631 Currently the translation macros (the macros with names such as | |
632 GET_C_STRING_CTEXT_DATA_ALLOCA) have names that are difficult to parse | |
633 or remember, and are not all that general. In the process of | |
634 reviewing the Windows code so that it could be muleized, I discovered | |
635 that these macros need to be extended in various ways to allow for | |
636 the Windows code to be easily muleized. | |
637 | |
638 Since the macros needed to be changed anyways, I figured it would be a | |
639 good time to redo them properly. I propose new macros which have | |
640 names like this: | |
641 | |
642 @itemize @bullet | |
643 @item | |
644 <A>_TO_EXTERNAL_FORMAT_<B> | |
645 @item | |
646 <A>_TO_EXTERNAL_FORMAT_<B>_1 | |
647 @item | |
648 <C>_TO_INTERNAL_FORMAT_<D> | |
649 @item | |
650 <C>_TO_INTERNAL_FORMAT_<D>_1 | |
651 @end itemize | |
652 | |
653 A and C represent the source of the data, and B and D represent the | |
654 sink of the data. | |
655 | |
656 All of these macros call either the functions | |
657 convert_to_external_format or convert_to_internal_format internally, | |
658 with some massaging of the arguments. | |
659 | |
660 All of these macros take the following arguments: | |
661 | |
662 @itemize @bullet | |
663 @item | |
664 First, one or two arguments indicating the source of the data. | |
665 @item | |
666 Second, an argument indicating the coding system. (In order to avoid | |
667 an excessive number of macros, we no longer provide separate macros | |
668 for specific coding systems.) | |
669 @item | |
670 Third, one or two arguments indicating the sink of the data. | |
671 @item | |
672 Fourth, optionally, arguments indicating the error behavior and the | |
673 warning class (these arguments are only present in the _1 versions | |
674 of the macros). The other, shorter named macros are trivial | |
675 interfaces onto these macros with the error behavior being | |
676 ERROR_ME_WARN, with the warning class being Vstandard_warning_class. | |
677 @end itemize | |
678 | |
679 <A> can be one of the following: | |
680 @itemize @bullet | |
681 @item | |
682 LISP (which means a Lisp string) Takes one argument, a Lisp Object. | |
683 @item | |
684 LSTREAM (which indicates an lstream) Takes one argument, an | |
685 lstream. The data is read from the lstream until EOF is reached. | |
686 @item | |
687 DATA (which indicates a raw memory area) Takes two arguments, a | |
688 pointer and a length in bytes. | |
689 (You must never use this if the source of the data is a Lisp string, | |
690 because of the possibility of relocation during garbage collection.) | |
691 @end itemize | |
692 | |
693 <B> can be one of the following: | |
694 @itemize @bullet | |
695 @item | |
696 ALLOCA (which means that the resulting data is stored in alloca()ed | |
697 memory. Two arguments should be specified, a pointer and a length, | |
698 which should be lvalues.) | |
699 @item | |
700 MALLOC (which means that the resulting data is stored in malloc()ed | |
701 memory. Two arguments should be specified, a pointer and a | |
702 length. The memory must be free()d by the caller.) | |
703 @item | |
704 OPAQUE (which means the resulting data is stored in an opaque Lisp | |
705 Object. This takes one argument, an lvalue Lisp Object.) | |
706 @item | |
707 LSTREAM. The data is written to an lstream. | |
708 @end itemize | |
709 | |
710 <C> can be one of the following: | |
711 @itemize @bullet | |
712 @item | |
713 DATA | |
714 @item | |
715 LSTREAM | |
716 @end itemize | |
717 (just like <A> above) | |
718 | |
719 <D> can be one of the following: | |
720 @itemize @bullet | |
721 @item | |
722 ALLOCA | |
723 @item | |
724 MALLOC | |
725 @item | |
726 LISP This means a Lisp String. | |
727 @item | |
728 BUFFER The resulting data is inserted into a buffer at the buffer's | |
729 value of point. | |
730 @item | |
731 LSTREAM The data is written to the lstream. | |
732 @end itemize | |
733 | |
734 | |
735 Note that I have eliminated the FORMAT argument of previous macros, | |
736 and replaced it with a coding system. This was made possible by | |
737 coding system aliases. In place of old `format's, we use a `virtual | |
738 coding system', which is aliased to the actual coding system. | |
739 | |
740 The value of the coding system argument can be anything that is legal | |
741 input to get_coding_system, i.e. a symbol or a coding system object. | |
742 | |
743 ========================================================================== | |
744 - creation of generic macros for accessing internally formatted data [old] | |
745 ========================================================================== | |
746 | |
747 I have a design; it's all written down (I did it in Tsukuba), and I just have | |
748 to have it transcribed. It's higher level than the macros, though; it's Lisp | |
749 primitives that I'm designing. | |
750 | |
751 As for the design of the macros, don't worry so much about all files having to | |
752 get included (which is inevitable with macros), but about how the files are | |
753 separated. Your design might go like this: | |
754 | |
755 @enumerate | |
756 @item | |
757 you have generic macro interfaces, which specify a particular | |
758 behavior but not an implementation. these generic macros have | |
759 complementary versions for buffers and for strings (and the buffer | |
760 or string is an argument to all of the macros), and do such things | |
761 as convert between byte and char indices, retrieve the character at | |
762 a particular byte or char index, increment or decrement a byte | |
763 index to the beginning of the next or previous character, indicate | |
764 the number of bytes occupied by the character at a particular byte | |
765 or character index, etc. These are similar to what's already out | |
766 there except that they confound buffers and strings and that they | |
767 can also work with actual char *'s, which I think is a really bad | |
768 idea because it encourages code to "assume" that the representation | |
769 is ASCII compatible, which it might not be (e.g. 16-bit fixed | |
770 width). In fact, one thing I'm planning on doing is redefining | |
771 Bufbyte as a struct, for debugging purposes, to catch all places | |
772 that cavalierly compare them with ASCII char's. Note also that I | |
773 really want to rename Bufpos and Bytind, which are confusing and | |
774 wrong in that they also apply to strings. They should be Bytepos | |
775 and Charpos, or something like that, to go along with Bytecount and | |
776 Charcount. Similarly, Bufbyte is a misnomer and should be | |
777 Intbyte -- a byte in the internal string representation (any of the | |
778 internal representations) of a string or buffer. Corresponding to | |
779 this is Extbyte (which we already have), a byte in any external | |
780 string representation. We also have Extcount, which makes sense, | |
781 and we might possibly want Extcharcount, the number of characters | |
782 in an external string representation; but that gets sticky in modal | |
783 encodings, and it's not clear how useful it would be. | |
784 | |
785 @item | |
786 for all generic macro interfaces, there are specific versions of | |
787 each of them for each possible representation (pure ASCII in the | |
788 non-Mule world, Mule standard, UTF-8, 8-bit fixed, 16-bit fixed, | |
789 32-bit fixed, etc.; there may well be more than one possible 16-bit | |
790 fixed version, as well). Each representation has a corresponding | |
791 prefix, e.g. MULE_ or FIXED16_ or whatever, which is prefixed onto | |
792 the generic macro names. The resulting macros perform the | |
793 operation defined for the macro, but assume, and only work | |
794 correctly with, text in the corresponding representation. | |
795 | |
796 @item | |
797 The definition of the generic versions merely conditionalizes on | |
798 the appropriate things (i.e. bit flags in the buffer or string | |
799 object) and calls the appropriate representation-specific version. | |
800 There may be more than one definition (protected by ifdefs, of | |
801 course), or one definition that is amalgamated out of many ifdef'ed | |
802 sections. | |
803 | |
804 @item | |
805 You should probably put each different representation in its own | |
806 header file, e.g. charset-mule.h or charset-fixed16.h or | |
807 charset-ascii.h or whatever. Then put the main macros into | |
808 charset.h, and conditionalize in this file appropriately to include | |
809 the other ones. That way, code that actually needs to play around | |
810 with internal-format text at this level can include "charset.h" | |
811 (certainly a much better place than buffer.h), and everyone else | |
812 uses higher-level routines. The representation-specific macros | |
813 should not normally be used *directly* at all; they are invoked | |
814 automatically from the generic macros. However, code that needs to | |
815 be highly, highly optimized might choose to take a loop and write | |
816 two versions of it, one for each representation, to avoid the | |
817 per-loop-iteration cost of a comparison. Until the macro interface | |
818 is rock stable and solid, we should strongly discourage such | |
819 nanosecond optimizations. | |
820 @end enumerate | |
821 | |
822 ========================================================================== | |
823 - UTF-16 compatible representation | |
824 ========================================================================== | |
825 | |
826 NOTE: One possible default internal representation that was compatible | |
827 with UTF16 but allowed all possible chars in UCS4 would be to take a | |
828 more-or-less unused range of 2048 chars (not from the private area | |
829 because Microsoft actually uses up most or all of it with EUDC chars). | |
830 Let's say we picked A400 - ABFF. Then, we'd have: | |
831 | |
832 0000 - FFFF Simple chars | |
833 | |
834 D[8-B]xx D[C-F]xx Surrogate char, represents 1M chars | |
835 | |
836 A[4-B]xx D[C-F]xx D[C-F]xx Surrogate char, represents 2G chars | |
837 | |
838 This is exactly the same number of chars as UCS-4 handles, and it follows the | |
839 same property as UTF8 and Mule-internal: | |
840 | |
841 @enumerate | |
842 @item | |
843 There are two disjoint groupings of units, one representing leading units | |
844 and one representing non-leading units. | |
845 @item | |
846 Given a leading unit, you immediately know how many units follow to make | |
847 up a valid char, irrespective of any other context. | |
848 @end enumerate | |
849 | |
850 Note that A4xx is actually currently assigned to Yi. Since this is an | |
851 internal representation, we could just move these elsewhere. | |
852 | |
853 An alternative is to pick two disjoint ranges, e.g. 2D00 - 2DFF and | |
854 A500 - ABFF. | |
855 | |
856 ========================================================================== | |
857 New API for char->font mapping | |
858 ========================================================================== | |
859 - ; supersedes charset-registry and CCL; | |
860 supports all windows systems; powerful enough for Unicode; etc. | |
861 | |
862 (charset-font-mapping charset) | |
863 | |
864 font-mapping-specifier string | |
865 | |
866 char-font-mapping-table | |
867 | |
868 char-table, specifier; elements of char table are either strings (which | |
869 specify a registry or comparable font property), or vectors of a string | |
870 (same) followed by keyword-value pairs (optional). The only allowable | |
871 keyword currently is :ccl-program, which specifies a CCL program to map | |
872 the characters into font indices. Other keywords may be added, | |
873 e.g. allowing Elisp fragments instead of CCL programs. Also allowed is | |
874 [inherit], which inherits from the next less-specific char-table in the | |
875 specifier. | |
876 | |
877 The preferred interface onto this mapping (which should be portable | |
878 across Emacsen) is | |
879 | |
880 (set-char-font-mapping key value &optional locale tag-set how-to-add) | |
881 | |
882 where key is a char, range or charset (as for put-char-table), value is | |
883 as above, and the other arguments are standard for specifiers. This | |
884 automatically creates a char table in the locale, as necessary (all | |
885 elements default to [inherit]). On GNU Emacs, some specifier arguments | |
886 may be unimplemented. | |
887 | |
888 (char-font-mapping key value &optional locale) | |
889 works vaguely like get-specifier? But does inheritance processing. | |
890 locale should clearly default here to current-buffer | |
891 | |
892 #### should get-specifier as well? Would make it work most like | |
893 #### buffer-local variables. | |
894 | |
895 NB. set-charset-registry and set-charset-ccl-program are obsoleted. | |
896 | |
897 ========================================================================== | |
898 Implementing fixed-width 8,16,32 bit buffer optimizations | |
899 ========================================================================== | |
900 | |
901 Add set-buffer-optimization (buffer &rest keywords) for | |
902 controlling these things. | |
903 | |
904 Also, put in hack so that correct arglist can be retrieved by | |
905 Lisp code. | |
906 | |
907 Look at the way keyword primitives are currently handled; make | |
908 sure it works and is documented, etc. | |
909 | |
910 Implement 8-bit fixed width optimization. Take the things that | |
911 know about the actual implementation and put them in a single | |
912 file, in essence creating an abstraction layer to allow | |
913 pluggable internal representations. Implement a fairly general | |
914 scheme for mapping between character codes in the 8 bits or 16 | |
915 bits representation and on actual charset characters. As part of | |
916 set-buffer-optimization, you can specify a list of character sets | |
917 to be used in the 8 bit to 16 bit, etc. world. You can also | |
918 request that the buffer be in 8, 16, etc. if possible. | |
919 | |
920 -> set defaults wrt this. | |
921 -> perhaps this should be just buffer properties. | |
922 -> this brings up the idea of default properties on an object. | |
923 -> Implement default-put, default-get, etc. | |
924 | |
925 What happens when a character not assigned in the range gets | |
926 added? Then, must convert to variable width of some sort. | |
927 | |
928 Note: at first, possibly we just convert whole hog to get things | |
929 right. Then we'd have to pay attention to characters that got | |
930 added + deleted that were unassigned in the fixed width. When | |
931 this goes to zero and there's been enough time (heuristics), we | |
932 go back to fixed. | |
933 | |
934 Side note: We could dynamically build up the set of assigned | |
935 chars as they go. Conceivably this could even go down to the | |
936 single char level: Just keep a big array of mapping from 16 bit | |
937 values to chars, and add to it every time a char has been encountered | |
938 that wasn't there before. Problem: need inverse mapping. | |
939 | |
940 -> Possibility; chars are actual objects, not just numbers. | |
941 Then you could keep track of such info in the chars itself. | |
942 *Think about this.* | |
943 | |
944 Eventually, we might consider allowing mixed fixed-width, | |
945 variable-width buffer encodings. Then, we use range tables to | |
946 indicate which sections are fixed and which variable and INC_CHAR does | |
947 something like this: binary search to find the current range, which | |
948 indicates whether it's fixed or variable, and tells us what the | |
949 increment is. We can cache this info and use it next time to speed | |
950 up. | |
951 | |
952 -> We will then have two partially shared range tables - one for | |
953 overall fixed width vs. variable width, and possibly one containing | |
954 this same info, but partitioning the variable width in one. Maybe | |
955 need fancier nested range table model. | |
956 | |
957 ========================================================================== | |
958 Expansion of display table and case mapping table support for all | |
959 chars, not just ASCII/Latin1. | |
960 ========================================================================== | |
961 | |
962 ========================================================================== | |
963 Improved flexibility for display tables, and evaluation of its | |
964 features to make sure it meshes with and complements the char<->font | |
965 mapping API mentioned earlier | |
966 ========================================================================== | |
967 | |
968 ========================================================================== | |
969 String access speedup: | |
970 ========================================================================== | |
971 | |
972 For strings larger than some size in bytes (10?), keep extra fields of | |
973 info: length in chars, and a (char, byte) pair in the middle to speed | |
974 up sequential access. | |
975 | |
976 (Better idea: do this for any size string, but only if it contains | |
977 non-ASCII chars. Then if info is missing, we know string is | |
978 ASCII-only.) | |
979 | |
980 Use a string-extra-info object, replacing string property slot and | |
981 containing fields for string mod tick, string extents, string props, | |
982 and string char length, and cached (char,byte) pair. | |
983 string-extra-info (or string-auxiliary?) objects could be in frob | |
984 blocks, esp. if creating frob blocks is easy + worth it. | |
985 | |
986 - Caching of char<->byte conversions in strings - should make nearly | |
987 all operations on strings O(N) | |
988 | |
989 ========================================================================== | |
990 Improvements in buffer char<->byte mapping | |
991 ========================================================================== | |
992 | |
993 - Range table implementation - especially when there are few runs of | |
994 different widths, e.g. recently converted from fixed-width | |
995 optimization to variable width | |
996 | |
997 Range Tables to speed up Bufpos <-> Bytind caching | |
998 ================================================== | |
999 | |
1000 This describes an alternative implementation using ranges. We | |
1001 maintain a range table of all spans of characters of a fixed width. | |
1002 Updating this table could take time if there are a large number of | |
1003 spans; but constant factors of operations should be quick. This method really wins | |
1004 when you have 8-bit buffers just converted to variable width, where | |
1005 there will be few spans. More specifically, lookup in this range | |
1006 table is O(log N) and can be done with simple binary search, which is | |
1007 very fast. If we maintain the ranges using a gap array, updating this | |
1008 table will be fast for local operations, which is most of the time. | |
1009 | |
1010 We will also provide (at first, at least) a Lisp function to set the | |
1011 caching mechanism explicitly - either range tables or the existing | |
1012 implementation. Eventually, we want to improve things, to the point | |
1013 where we automatically pick the right caching for the situation and | |
1014 have more caching schemes implemented. | |
1015 | |
1016 ========================================================================== | |
1017 - Robustify Text Properties | |
1018 ========================================================================== | |
1019 | |
1020 ========================================================================== | |
1021 Support for unified internal representation, e.g. Unicode | |
1022 ========================================================================== | |
1023 | |
1024 Start tagging all text with a language text property, | |
1025 indicating the current language environment when the text was input. | |
1026 (needs "Robustify Text Properties") | |
1027 | |
1028 ========================================================================== | |
1029 - Generalized Coding Systems | |
1030 ========================================================================== | |
1031 | |
1032 - Lisp API for Defining Coding Systems | |
1033 | |
1034 User-defined coding systems. | |
1035 | |
1036 (define-coding-system-type 'type | |
1037 :encode-function fun | |
1038 :decode-function fun | |
1039 :detect-function fun | |
1040 :buffering (number = at least this many chars | |
1041 line = buffer up to end of line | |
1042 regexp = buffer until this regexp is found in match | |
1043 source data. match data will be appropriate when fun is | |
1044 called | |
1045 | |
1046 encode fun is called as | |
1047 | |
1048 (encode instream outstream) | |
1049 | |
1050 should read data from instream and write converted result onto | |
1051 outstream. Can leave some data stuff in stream, it will reappear | |
1052 next time. Generally, there is a finite amount of data in instream | |
1053 and further attempts to read lead to would-block errors or retvals. | |
1054 Can use instream properties to record state. May use read-stream | |
1055 functionality to read everything into a vector or string. | |
1056 | |
1057 ->Need vectors + string exposed to resizing of Lisp implementation | |
1058 where necessary. | |
1059 | |
1060 ========================================================================== | |
1061 Support Windows Active Kbd Switching, Far East IME API (done already?) | |
1062 ========================================================================== | |
1063 | |
1064 ========================================================================== | |
1065 - UI/design changes for Coding System Pipelining | |
1066 ========================================================================== | |
1067 | |
1068 ------------------------------------------------------------------ | |
1069 CODING-SYSTEM CHAINS | |
1070 ------------------------------------------------------------------ | |
1071 | |
1072 sjt sez: | |
1073 | |
1074 There should be no elementary coding systems in the Lisp API, only | |
1075 chains. Chains should be declared, not computed, as a sequence of coding | |
1076 formats. (Probably the internal representation can be a vector for | |
1077 efficiency but programmers would probably rather work with lists.) A | |
1078 stream has a token type. Most streams are octet streams. Text is a | |
1079 stream of characters (in _internal_ format; a file on disk is not text!) | |
1080 An octet-stream has no implicit semantics, so its format must always be | |
1081 specified. The only type currently having semantics is characters. This | |
1082 means that the chain [euc-jp -> internal -> shift_jis] may be specified | |
1083 (euc-jp, shift_jis), and if no euc-jp -> shift_jis converter is | |
1084 available, then the chain is automatically constructed. (N.B. If we | |
1085 have fixed width buffers in the future, then we could have ASCII -> 8-bit | |
1086 char -> 16-bit char -> ISO-2022-JP (with escape sequences).) | |
1087 | |
1088 EOL handling is a char <-> char coding. It should not be part of another | |
1089 coding system except as a convenience for users. For text coding, | |
1090 automatically insert EOL handlers between char <-> octet boundaries. | |
1091 | |
1092 ------------------------------------------------------------------ | |
1093 ABOUT DETECTION | |
1094 ------------------------------------------------------------------ | |
1095 | |
1096 | |
1097 ------------------------------------------------------------------ | |
1098 EFFICIENCY OF CODING CONVERSION WITH MULTIPLE COPIES/CHAINS | |
1099 ------------------------------------------------------------------ | |
1100 | |
1101 A comment in encode_decode_coding_region(): | |
1102 | |
1103 The chain of streams looks like this: | |
1104 | |
1105 [BUFFER] <----- (( read from/send to loop )) | |
1106 ------> [CHAR->BYTE i.e. ENCODE AS BINARY if source is | |
1107 in bytes] | |
1108 ------> [ENCODE/DECODE AS SPECIFIED] | |
1109 ------> [BYTE->CHAR i.e. DECODE AS BINARY | |
1110 if sink is in bytes] | |
1111 ------> [AUTODETECT EOL if | |
1112 we're decoding and | |
1113 coding system calls | |
1114 for this] | |
1115 ------> [BUFFER] | |
1116 | |
1117 sjt (?) responds: | |
1118 | |
1119 Of course, this is just horrible. BYTE<->CHAR should only be available | |
1120 to I/O routines. It should not be visible to Mule proper. | |
1121 | |
1122 A comment on the implementation. Hrvoje and Kyle worry about the | |
1123 inefficiency of repeated copying among buffers that chained coding | |
1124 systems entail. But this may not be as time inefficient as it appears | |
1125 in the Mule ("house rules") context. The issue is how do you chain | |
1126 coding systems without copying? In theory you could have | |
1127 | |
1128 IChar external_to_raw (ExtChar *cp, State *s); | |
1129 IChar decode_utf16 (IChar c, State *s); | |
1130 IChar decode_crlf (ExtChar *cp, State *s); | |
1131 | |
1132 typedef Ichar (*Converter[]) (Ichar, State*); | |
1133 | |
1134 Converter utf16[2] = { &decode_utf16, &decode_crlf }; | |
1135 | |
1136 void convert (ExtChar *inbuf, IChar *outbuf, Converter cvtr) | |
1137 { | |
1138 int i; | |
1139 ExtChar c; | |
1140 State s; | |
1141 | |
1142 while (c = external_to_raw (*inbuf++, &s)) | |
1143 { | |
1144 for (i = 0; i < sizeof(cvtr)/sizeof(Converter); ++i) | |
1145 if (s.ready) | |
1146 c = (*cvtr[i]) (c, &s); | |
1147 } | |
1148 if (s.ready) | |
1149 *outbuf++ = c; | |
1150 } | |
1151 | |
1152 But this is a lot of function calls; what Ben is doing is basically | |
1153 reducing this to one call per buffer-full. The only way to avoid this | |
1154 is to hardcode all the "interesting" coding systems, maybe using | |
1155 inline or macros to give structure. But this is still a huge amount | |
1156 of work, and code. | |
1157 | |
1158 One advantage to the call-per-char approach is that we might be able | |
1159 to do something about the marker/extent destruction that coding | |
1160 normally entails. | |
1161 | |
1162 ben sez: | |
1163 | |
1164 it should be possible to preserve the markers/extents without | |
1165 switching completely to one-call-per-char -- we could at least do one | |
1166 call per "run", where a run is more or less the maximal stretch of | |
1167 text not overlapping any markers or extent boundaries. (It's a bit | |
1168 more complicated if we want to properly support the different extent | |
1169 begins/ends; in some cases we might have to pump a single character | |
1170 adjacent to where two extents meet.) The "stateless" way that I wrote | |
1171 all of the conversion routines may be a real hassle but it allows | |
1172 something like this to work without too much problem -- pump in one | |
1173 run at a time into one end of the chain, do a flush after each | |
1174 iteration, and stick what comes out the other end in its place. | |
1175 | |
1176 ------------------------------------------------------------------ | |
1177 ABOUT FORMATS | |
1178 ------------------------------------------------------------------ | |
1179 | |
1180 when calling make-coding-system, the name can be a cons of (format1 . | |
1181 format2), specifying that it decodes format1->format2 and encodes the other | |
1182 way. if only one name is given, that is assumed to be format1, and the | |
1183 other is either `external' or `internal' depending on the end type. | |
1184 normally the user when decoding gives the decoding order in formats, but | |
1185 can leave off the last one, `internal', which is assumed. a multichain | |
1186 might look like gzip|multibyte|unicode, using the coding systems named | |
1187 `gzip', `(unicode . multibyte)' and `unicode'. the way this actually works | |
1188 is by searching for gzip->multibyte; if not found, look for gzip->external | |
1189 or gzip->internal. (In general we automatically do conversion between | |
1190 internal and external as necessary: thus gzip|crlf does the expected, and | |
1191 maps to gzip->external, external->internal, crlf->internal, which when | |
1192 fully specified would be gzip|external:external|internal:crlf|internal -- | |
1193 see below.) To forcibly fit together two converters that have explicitly | |
1194 specified and incompatible names (say you have unicode->multibyte and | |
1195 iso8859-1->ebcdic and you know that the multibyte and iso8859-1 in this | |
1196 case are compatible), you can force-cast using :, like this: | |
1197 ebcdic|iso8859-1:multibyte|unicode. (again, if you force-cast between | |
1198 internal and external formats, the conversion happens automatically.) | |
1199 | |
1200 -------------------------------------------------------------------------- | |
1201 ABOUT PDUMP, UNICODE, AND RUNNING XEMACS FROM A DIRECTORY WITH WEIRD CHARS | |
1202 -------------------------------------------------------------------------- | |
1203 | |
1204 -- there's the problem that XEmacs can't be run in a directory with | |
1205 non-ASCII/Latin-1 chars in it, since it will be doing Unicode | |
1206 processing before we've had a chance to load the tables. In fact, | |
1207 even finding the tables in such a situation is problematic using | |
1208 the normal commands. my idea is to eventually load the stuff | |
1209 extremely extremely early, at the same time as the pdump data gets | |
1210 loaded. in fact, the unicode table data (stored in an efficient | |
1211 binary format) can even be stuck into the pdump file (which would | |
1212 mean as a resource to the executable, for windows). we'd need to | |
1213 extend pdump a bit: to allow for attaching extra data to the pdump | |
1214 file. (something like pdump_attach_extra_data (addr, length) | |
1215 returns a number of some sort, an index into the file, which you | |
1216 can then retrieve with pdump_load_extra_data(), which returns an | |
1217 addr (mmap()ed or loaded), and later you pdump_unload_extra_data() | |
1218 when finished. we'd probably also need | |
1219 pdump_attach_extra_data_append(), which appends data to the data | |
1220 just written out with pdump_attach_extra_data(). this way, | |
1221 multiple tables in memory can be written out into one contiguous | |
1222 table. (we'd use the tar-like trick of allowing new blocks to be | |
1223 written without going back to change the old blocks -- we just rely | |
1224 on the end of file/end of memory.) this same mechanism could be | |
1225 extracted out of pdump and used to handle the non-pdump situation | |
1226 (or alternatively, we could just dump either the memory image of | |
1227 the tables themselves or the compressed binary version). in the | |
1228 case of extra unicode tables not known about at compile time that | |
1229 get loaded before dumping, we either just dump them into the image | |
1230 (pdump and all) or extract them into the compressed binary format, | |
1231 free the original tables, and treat them like all other tables. | |
1232 | |
1233 | |
1234 ========================================================================== | |
1235 - Generalized language appropriate word wrapping (requires | |
1236 layout-exposing API defined in BIDI section) | |
1237 ========================================================================== | |
1238 | |
1239 ========================================================================== | |
1240 - Make Custom Mule-aware | |
1241 ========================================================================== | |
1242 | |
1243 ========================================================================== | |
1244 - Composite character support | |
1245 ========================================================================== | |
1246 | |
1247 ========================================================================== | |
1248 - Language appropriate sorting and searching | |
1249 ========================================================================== | |
1250 | |
1251 ========================================================================== | |
1252 - Glyph shaping for Arabic and Devanagari | |
1253 ========================================================================== | |
1254 | |
1255 - (needs to be handled mostly | |
1256 at C level, as part of layout; luckily it's entirely local in its | |
1257 changes, as this is not hard) | |
1258 | |
1259 | |
1260 ========================================================================== | |
1261 Consider moving language selection Menu up to be parallel with Mule menu | |
1262 ========================================================================== | |
1263 | |
1264 */ | |
1265 | |
1266 | |
771 | 1267 |
1268 /************************************************************************/ | |
1269 /* declarations */ | |
1270 /************************************************************************/ | |
1271 /* Two file-scope Eistring objects with no initializer, so C zero-fills them. NOTE(review): presumably templates for resetting Eistrings -- confirm at the use sites. */ | |
1272 Eistring the_eistring_zero_init, the_eistring_malloc_zero_init; | |
1273 | |
1274 #define MAX_CHARBPOS_GAP_SIZE_3 (65535/3) | |
1275 #define MAX_BYTEBPOS_GAP_SIZE_3 (3 * MAX_CHARBPOS_GAP_SIZE_3) | |
1276 | |
1277 short three_to_one_table[1 + MAX_BYTEBPOS_GAP_SIZE_3]; | |
1278 | |
1279 #ifdef MULE | |
1280 | |
1281 /* Table of number of bytes in the string representation of a character | |
1282 indexed by the first byte of that representation. | |
1283 | |
1284 rep_bytes_by_first_byte(c) is more efficient than the equivalent | |
1285 canonical computation: | |
1286 | |
826 | 1287 XCHARSET_REP_BYTES (charset_by_leading_byte (c)) */ |
771 | 1288 |
/* The array has 0xA0 entries, so only first bytes 0x00 - 0x9f may be used
   as an index.  Entries: 1 for 0x00 - 0x7f, 2 for 0x80 - 0x8f, 3 for
   0x90 - 0x9e, 4 for 0x9f. */
1289 const Bytecount rep_bytes_by_first_byte[0xA0] = | |
1290 { /* 0x00 - 0x7f are for straight ASCII */ | |
1291 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1292 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1293 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1294 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1295 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1296 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1297 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1298 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1299 /* 0x80 - 0x8f are for Dimension-1 official charsets */ | |
1300 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |
1301 /* 0x90 - 0x9d are for Dimension-2 official charsets */ | |
1302 /* 0x9e is for Dimension-1 private charsets */ | |
1303 /* 0x9f is for Dimension-2 private charsets */ | |
1304 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4 | |
1305 }; | |
1306 | |
1307 #ifdef ENABLE_COMPOSITE_CHARS | |
1308 | |
1309 /* Hash tables for composite chars. One maps string representing | |
1310 composed chars to their equivalent chars; one goes the | |
1311 other way. */ | |
1312 Lisp_Object Vcomposite_char_char2string_hash_table; | |
1313 Lisp_Object Vcomposite_char_string2char_hash_table; | |
1314 | |
1315 static int composite_char_row_next; | |
1316 static int composite_char_col_next; | |
1317 | |
1318 #endif /* ENABLE_COMPOSITE_CHARS */ | |
1319 | |
1320 #endif /* MULE */ | |
1321 | |
1292 | 1322 Lisp_Object QSin_char_byte_conversion; |
1323 Lisp_Object QSin_internal_external_conversion; | |
1324 | |
771 | 1325 |
1326 /************************************************************************/ | |
1327 /* qxestr***() functions */ | |
1328 /************************************************************************/ | |
1329 | |
1330 /* Most are inline functions in lisp.h */ | |
1331 | |
1332 int | |
867 | 1333 qxesprintf (Ibyte *buffer, const CIbyte *format, ...) |
771 | 1334 { |
1335 va_list args; | |
1336 int retval; | |
1337 | |
1338 va_start (args, format); | |
2367 | 1339 retval = vsprintf ((Chbyte *) buffer, format, args); |
771 | 1340 va_end (args); |
1341 | |
1342 return retval; | |
1343 } | |
1344 | |
1345 /* strcasecmp() implementation from BSD */ | |
/* 256-entry byte-folding table, written in octal.  Every entry maps to
   itself except indices 0101 - 0132 ('A' - 'Z'), which hold 0141 - 0172
   ('a' - 'z').  Entry 0 is 0, and no other byte maps to 0. */
867 | 1346 static Ibyte strcasecmp_charmap[] = { |
1429 | 1347 0000, 0001, 0002, 0003, 0004, 0005, 0006, 0007, |
1348 0010, 0011, 0012, 0013, 0014, 0015, 0016, 0017, | |
1349 0020, 0021, 0022, 0023, 0024, 0025, 0026, 0027, | |
1350 0030, 0031, 0032, 0033, 0034, 0035, 0036, 0037, | |
1351 0040, 0041, 0042, 0043, 0044, 0045, 0046, 0047, | |
1352 0050, 0051, 0052, 0053, 0054, 0055, 0056, 0057, | |
1353 0060, 0061, 0062, 0063, 0064, 0065, 0066, 0067, | |
1354 0070, 0071, 0072, 0073, 0074, 0075, 0076, 0077, | |
1355 0100, 0141, 0142, 0143, 0144, 0145, 0146, 0147, | |
1356 0150, 0151, 0152, 0153, 0154, 0155, 0156, 0157, | |
1357 0160, 0161, 0162, 0163, 0164, 0165, 0166, 0167, | |
1358 0170, 0171, 0172, 0133, 0134, 0135, 0136, 0137, | |
1359 0140, 0141, 0142, 0143, 0144, 0145, 0146, 0147, | |
1360 0150, 0151, 0152, 0153, 0154, 0155, 0156, 0157, | |
1361 0160, 0161, 0162, 0163, 0164, 0165, 0166, 0167, | |
1362 0170, 0171, 0172, 0173, 0174, 0175, 0176, 0177, | |
1363 0200, 0201, 0202, 0203, 0204, 0205, 0206, 0207, | |
1364 0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217, | |
1365 0220, 0221, 0222, 0223, 0224, 0225, 0226, 0227, | |
1366 0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237, | |
1367 0240, 0241, 0242, 0243, 0244, 0245, 0246, 0247, | |
1368 0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257, | |
1369 0260, 0261, 0262, 0263, 0264, 0265, 0266, 0267, | |
1370 0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277, | |
1371 0300, 0301, 0302, 0303, 0304, 0305, 0306, 0307, | |
1372 0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317, | |
1373 0320, 0321, 0322, 0323, 0324, 0325, 0326, 0327, | |
1374 0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337, | |
1375 0340, 0341, 0342, 0343, 0344, 0345, 0346, 0347, | |
1376 0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357, | |
1377 0360, 0361, 0362, 0363, 0364, 0365, 0366, 0367, | |
1378 0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377 | |
771 | 1379 }; |
1380 | |
1381 /* A version that works like generic strcasecmp() -- only collapsing | |
1382 case in ASCII A-Z/a-z. This is safe on Mule strings due to the | |
1383 current representation. | |
1384 | |
1385 This version was written by some Berkeley coder, favoring | |
1386 nanosecond improvements over clarity. In all other versions below, | |
1387 we use symmetrical algorithms that may sacrifice a few machine | |
1388 cycles but are MUCH MUCH clearer, which counts a lot more. | |
1389 */ | |
1390 | |
1391 int | |
867 | 1392 qxestrcasecmp (const Ibyte *s1, const Ibyte *s2) |
771 | 1393 { |
867 | 1394 Ibyte *cm = strcasecmp_charmap; |
771 | 1395 |
1396 while (cm[*s1] == cm[*s2++]) | |
1397 if (*s1++ == '\0') | |
1398 return (0); | |
1399 | |
1400 return (cm[*s1] - cm[*--s2]); | |
1401 } | |
1402 | |
1403 int | |
2367 | 1404 ascii_strcasecmp (const Ascbyte *s1, const Ascbyte *s2) |
771 | 1405 { |
867 | 1406 return qxestrcasecmp ((const Ibyte *) s1, (const Ibyte *) s2); |
771 | 1407 } |
1408 | |
1409 int | |
2367 | 1410 qxestrcasecmp_ascii (const Ibyte *s1, const Ascbyte *s2) |
771 | 1411 { |
867 | 1412 return qxestrcasecmp (s1, (const Ibyte *) s2); |
771 | 1413 } |
1414 | |
1415 /* An internationalized version that collapses case in a general fashion. | |
1416 */ | |
1417 | |
1418 int | |
867 | 1419 qxestrcasecmp_i18n (const Ibyte *s1, const Ibyte *s2) |
771 | 1420 { |
1421 while (*s1 && *s2) | |
1422 { | |
4906
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1423 if (CANONCASE (0, itext_ichar (s1)) != |
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1424 CANONCASE (0, itext_ichar (s2))) |
771 | 1425 break; |
867 | 1426 INC_IBYTEPTR (s1); |
1427 INC_IBYTEPTR (s2); | |
771 | 1428 } |
1429 | |
4906
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1430 return (CANONCASE (0, itext_ichar (s1)) - |
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1431 CANONCASE (0, itext_ichar (s2))); |
771 | 1432 } |
1433 | |
1434 /* The only difference between these next two and | |
1435 qxememcasecmp()/qxememcasecmp_i18n() is that these two will stop if | |
1436 both strings are equal and less than LEN in length, while | |
1437 the mem...() versions would would run off the end. */ | |
1438 | |
1439 int | |
867 | 1440 qxestrncasecmp (const Ibyte *s1, const Ibyte *s2, Bytecount len) |
771 | 1441 { |
867 | 1442 Ibyte *cm = strcasecmp_charmap; |
771 | 1443 |
1444 while (len--) | |
1445 { | |
1446 int diff = cm[*s1] - cm[*s2]; | |
1447 if (diff != 0) | |
1448 return diff; | |
1449 if (!*s1) | |
1450 return 0; | |
1451 s1++, s2++; | |
1452 } | |
1453 | |
1454 return 0; | |
1455 } | |
1456 | |
1457 int | |
2367 | 1458 ascii_strncasecmp (const Ascbyte *s1, const Ascbyte *s2, Bytecount len) |
771 | 1459 { |
867 | 1460 return qxestrncasecmp ((const Ibyte *) s1, (const Ibyte *) s2, len); |
771 | 1461 } |
1462 | |
1463 int | |
2367 | 1464 qxestrncasecmp_ascii (const Ibyte *s1, const Ascbyte *s2, Bytecount len) |
771 | 1465 { |
867 | 1466 return qxestrncasecmp (s1, (const Ibyte *) s2, len); |
771 | 1467 } |
1468 | |
801 | 1469 /* Compare LEN_FROM_S1 worth of characters from S1 with the same number of |
1470 characters from S2, case insensitive. NOTE: Downcasing can convert | |
1471 characters from one length in bytes to another, so reversing S1 and S2 | |
1472 is *NOT* a symmetric operations! You must choose a length that agrees | |
1473 with S1. */ | |
1474 | |
771 | 1475 int |
867 | 1476 qxestrncasecmp_i18n (const Ibyte *s1, const Ibyte *s2, |
801 | 1477 Bytecount len_from_s1) |
771 | 1478 { |
801 | 1479 while (len_from_s1 > 0) |
771 | 1480 { |
867 | 1481 const Ibyte *old_s1 = s1; |
4906
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1482 int diff = (CANONCASE (0, itext_ichar (s1)) - |
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1483 CANONCASE (0, itext_ichar (s2))); |
771 | 1484 if (diff != 0) |
1485 return diff; | |
1486 if (!*s1) | |
1487 return 0; | |
867 | 1488 INC_IBYTEPTR (s1); |
1489 INC_IBYTEPTR (s2); | |
801 | 1490 len_from_s1 -= s1 - old_s1; |
771 | 1491 } |
1492 | |
1493 return 0; | |
1494 } | |
1495 | |
1496 int | |
867 | 1497 qxememcmp (const Ibyte *s1, const Ibyte *s2, Bytecount len) |
771 | 1498 { |
1499 return memcmp (s1, s2, len); | |
1500 } | |
1501 | |
1502 int | |
867 | 1503 qxememcmp4 (const Ibyte *s1, Bytecount len1, |
1504 const Ibyte *s2, Bytecount len2) | |
801 | 1505 { |
1506 int retval = qxememcmp (s1, s2, min (len1, len2)); | |
1507 if (retval) | |
1508 return retval; | |
1509 return len1 - len2; | |
1510 } | |
1511 | |
1512 int | |
867 | 1513 qxememcasecmp (const Ibyte *s1, const Ibyte *s2, Bytecount len) |
771 | 1514 { |
867 | 1515 Ibyte *cm = strcasecmp_charmap; |
771 | 1516 |
1517 while (len--) | |
1518 { | |
1519 int diff = cm[*s1] - cm[*s2]; | |
1520 if (diff != 0) | |
1521 return diff; | |
1522 s1++, s2++; | |
1523 } | |
1524 | |
1525 return 0; | |
1526 } | |
1527 | |
1528 int | |
867 | 1529 qxememcasecmp4 (const Ibyte *s1, Bytecount len1, |
1530 const Ibyte *s2, Bytecount len2) | |
771 | 1531 { |
801 | 1532 int retval = qxememcasecmp (s1, s2, min (len1, len2)); |
1533 if (retval) | |
1534 return retval; | |
1535 return len1 - len2; | |
1536 } | |
1537 | |
1538 /* Do a character-by-character comparison, returning "which is greater" by | |
867 | 1539 comparing the Ichar values. (#### Should have option to compare Unicode |
801 | 1540 points) */ |
1541 | |
1542 int | |
867 | 1543 qxetextcmp (const Ibyte *s1, Bytecount len1, |
1544 const Ibyte *s2, Bytecount len2) | |
801 | 1545 { |
1546 while (len1 > 0 && len2 > 0) | |
771 | 1547 { |
867 | 1548 const Ibyte *old_s1 = s1; |
1549 const Ibyte *old_s2 = s2; | |
1550 int diff = itext_ichar (s1) - itext_ichar (s2); | |
801 | 1551 if (diff != 0) |
1552 return diff; | |
867 | 1553 INC_IBYTEPTR (s1); |
1554 INC_IBYTEPTR (s2); | |
801 | 1555 len1 -= s1 - old_s1; |
1556 len2 -= s2 - old_s2; | |
1557 } | |
1558 | |
1559 assert (len1 >= 0 && len2 >= 0); | |
1560 return len1 - len2; | |
1561 } | |
1562 | |
1563 int | |
867 | 1564 qxetextcmp_matching (const Ibyte *s1, Bytecount len1, |
1565 const Ibyte *s2, Bytecount len2, | |
801 | 1566 Charcount *matching) |
1567 { | |
1568 *matching = 0; | |
1569 while (len1 > 0 && len2 > 0) | |
1570 { | |
867 | 1571 const Ibyte *old_s1 = s1; |
1572 const Ibyte *old_s2 = s2; | |
1573 int diff = itext_ichar (s1) - itext_ichar (s2); | |
801 | 1574 if (diff != 0) |
1575 return diff; | |
867 | 1576 INC_IBYTEPTR (s1); |
1577 INC_IBYTEPTR (s2); | |
801 | 1578 len1 -= s1 - old_s1; |
1579 len2 -= s2 - old_s2; | |
1580 (*matching)++; | |
1581 } | |
1582 | |
1583 assert (len1 >= 0 && len2 >= 0); | |
1584 return len1 - len2; | |
1585 } | |
1586 | |
1587 /* Do a character-by-character comparison, returning "which is greater" by | |
867 | 1588 comparing the Ichar values, case insensitively (by downcasing both |
801 | 1589 first). (#### Should have option to compare Unicode points) |
1590 | |
1591 In this case, both lengths must be specified becaused downcasing can | |
1592 convert characters from one length in bytes to another; therefore, two | |
1593 blocks of text of different length might be equal. If both compare | |
1594 equal up to the limit in length of one but not the other, the longer one | |
1595 is "greater". */ | |
1596 | |
1597 int | |
867 | 1598 qxetextcasecmp (const Ibyte *s1, Bytecount len1, |
1599 const Ibyte *s2, Bytecount len2) | |
801 | 1600 { |
1601 while (len1 > 0 && len2 > 0) | |
1602 { | |
867 | 1603 const Ibyte *old_s1 = s1; |
1604 const Ibyte *old_s2 = s2; | |
4906
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1605 int diff = (CANONCASE (0, itext_ichar (s1)) - |
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1606 CANONCASE (0, itext_ichar (s2))); |
771 | 1607 if (diff != 0) |
1608 return diff; | |
867 | 1609 INC_IBYTEPTR (s1); |
1610 INC_IBYTEPTR (s2); | |
801 | 1611 len1 -= s1 - old_s1; |
1612 len2 -= s2 - old_s2; | |
771 | 1613 } |
1614 | |
801 | 1615 assert (len1 >= 0 && len2 >= 0); |
1616 return len1 - len2; | |
1617 } | |
1618 | |
1619 /* Like qxetextcasecmp() but also return number of characters at | |
1620 beginning that match. */ | |
1621 | |
1622 int | |
867 | 1623 qxetextcasecmp_matching (const Ibyte *s1, Bytecount len1, |
1624 const Ibyte *s2, Bytecount len2, | |
801 | 1625 Charcount *matching) |
1626 { | |
1627 *matching = 0; | |
1628 while (len1 > 0 && len2 > 0) | |
1629 { | |
867 | 1630 const Ibyte *old_s1 = s1; |
1631 const Ibyte *old_s2 = s2; | |
4906
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1632 int diff = (CANONCASE (0, itext_ichar (s1)) - |
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1633 CANONCASE (0, itext_ichar (s2))); |
801 | 1634 if (diff != 0) |
1635 return diff; | |
867 | 1636 INC_IBYTEPTR (s1); |
1637 INC_IBYTEPTR (s2); | |
801 | 1638 len1 -= s1 - old_s1; |
1639 len2 -= s2 - old_s2; | |
1640 (*matching)++; | |
1641 } | |
1642 | |
1643 assert (len1 >= 0 && len2 >= 0); | |
1644 return len1 - len2; | |
771 | 1645 } |
1646 | |
1647 int | |
4906
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1648 lisp_strcasecmp_ascii (Lisp_Object s1, Lisp_Object s2) |
771 | 1649 { |
867 | 1650 Ibyte *cm = strcasecmp_charmap; |
1651 Ibyte *p1 = XSTRING_DATA (s1); | |
1652 Ibyte *p2 = XSTRING_DATA (s2); | |
1653 Ibyte *e1 = p1 + XSTRING_LENGTH (s1); | |
1654 Ibyte *e2 = p2 + XSTRING_LENGTH (s2); | |
771 | 1655 |
1656 /* again, we use a symmetric algorithm and favor clarity over | |
1657 nanosecond improvements. */ | |
1658 while (1) | |
1659 { | |
1660 /* if we reached the end of either string, compare lengths. | |
1661 do NOT compare the final null byte against anything, in case | |
1662 the other string also has a null byte at that position. */ | |
1663 if (p1 == e1 || p2 == e2) | |
1664 return e1 - e2; | |
1665 if (cm[*p1] != cm[*p2]) | |
1666 return cm[*p1] - cm[*p2]; | |
1667 p1++, p2++; | |
1668 } | |
1669 } | |
1670 | |
1671 int | |
1672 lisp_strcasecmp_i18n (Lisp_Object s1, Lisp_Object s2) | |
1673 { | |
801 | 1674 return qxetextcasecmp (XSTRING_DATA (s1), XSTRING_LENGTH (s1), |
1675 XSTRING_DATA (s2), XSTRING_LENGTH (s2)); | |
771 | 1676 } |
1677 | |
2367 | 1678 /* Compare a wide string with an ASCII string */ |
1679 | |
1680 int | |
1681 wcscmp_ascii (const wchar_t *s1, const Ascbyte *s2) | |
1682 { | |
1683 while (*s1 && *s2) | |
1684 { | |
2956 | 1685 if (*s1 != (wchar_t) *s2) |
2367 | 1686 break; |
1687 s1++, s2++; | |
1688 } | |
1689 | |
1690 return *s1 - *s2; | |
1691 } | |
1692 | |
1693 int | |
1694 wcsncmp_ascii (const wchar_t *s1, const Ascbyte *s2, Charcount len) | |
1695 { | |
1696 while (len--) | |
1697 { | |
1698 int diff = *s1 - *s2; | |
1699 if (diff != 0) | |
1700 return diff; | |
1701 if (!*s1) | |
1702 return 0; | |
1703 s1++, s2++; | |
1704 } | |
1705 | |
1706 return 0; | |
1707 } | |
1708 | |
771 | 1709 |
1710 /************************************************************************/ | |
1711 /* conversion between textual representations */ | |
1712 /************************************************************************/ | |
1713 | |
1714 /* NOTE: Does not reset the Dynarr. */ | |
1715 | |
1716 void | |
867 | 1717 convert_ibyte_string_into_ichar_dynarr (const Ibyte *str, Bytecount len, |
2367 | 1718 Ichar_dynarr *dyn) |
771 | 1719 { |
867 | 1720 const Ibyte *strend = str + len; |
771 | 1721 |
1722 while (str < strend) | |
1723 { | |
867 | 1724 Ichar ch = itext_ichar (str); |
771 | 1725 Dynarr_add (dyn, ch); |
867 | 1726 INC_IBYTEPTR (str); |
771 | 1727 } |
1728 } | |
1729 | |
1730 Charcount | |
867 | 1731 convert_ibyte_string_into_ichar_string (const Ibyte *str, Bytecount len, |
2367 | 1732 Ichar *arr) |
771 | 1733 { |
867 | 1734 const Ibyte *strend = str + len; |
771 | 1735 Charcount newlen = 0; |
1736 while (str < strend) | |
1737 { | |
867 | 1738 Ichar ch = itext_ichar (str); |
771 | 1739 arr[newlen++] = ch; |
867 | 1740 INC_IBYTEPTR (str); |
771 | 1741 } |
1742 return newlen; | |
1743 } | |
1744 | |
867 | 1745 /* Convert an array of Ichars into the equivalent string representation. |
1746 Store into the given Ibyte dynarr. Does not reset the dynarr. | |
771 | 1747 Does not add a terminating zero. */ |
1748 | |
1749 void | |
867 | 1750 convert_ichar_string_into_ibyte_dynarr (Ichar *arr, int nels, |
1751 Ibyte_dynarr *dyn) | |
771 | 1752 { |
867 | 1753 Ibyte str[MAX_ICHAR_LEN]; |
771 | 1754 int i; |
1755 | |
1756 for (i = 0; i < nels; i++) | |
1757 { | |
867 | 1758 Bytecount len = set_itext_ichar (str, arr[i]); |
771 | 1759 Dynarr_add_many (dyn, str, len); |
1760 } | |
1761 } | |
1762 | |
867 | 1763 /* Convert an array of Ichars into the equivalent string representation. |
771 | 1764 Malloc the space needed for this and return it. If LEN_OUT is not a |
867 | 1765 NULL pointer, store into LEN_OUT the number of Ibytes in the |
1766 malloc()ed string. Note that the actual number of Ibytes allocated | |
771 | 1767 is one more than this: the returned string is zero-terminated. */ |
1768 | |
867 | 1769 Ibyte * |
1770 convert_ichar_string_into_malloced_string (Ichar *arr, int nels, | |
826 | 1771 Bytecount *len_out) |
771 | 1772 { |
1773 /* Damn zero-termination. */ | |
2367 | 1774 Ibyte *str = alloca_ibytes (nels * MAX_ICHAR_LEN + 1); |
867 | 1775 Ibyte *strorig = str; |
771 | 1776 Bytecount len; |
1777 | |
1778 int i; | |
1779 | |
1780 for (i = 0; i < nels; i++) | |
867 | 1781 str += set_itext_ichar (str, arr[i]); |
771 | 1782 *str = '\0'; |
1783 len = str - strorig; | |
2367 | 1784 str = xnew_ibytes (1 + len); |
771 | 1785 memcpy (str, strorig, 1 + len); |
1786 if (len_out) | |
1787 *len_out = len; | |
1788 return str; | |
1789 } | |
1790 | |
/* Conversion loop used as a function body fragment.  It expects the
   names src, srclen, srcobj, dst, dstlen, dstobj and src_used to be in
   scope where it is expanded, and it RETURNS from the enclosing function
   on both branches.

   DST non-NULL: convert one character at a time from SRCFMT to DSTFMT,
   stopping before the first character that would not fit below
   dst + dstlen.  Stores the number of source bytes consumed through
   src_used (if non-NULL) and returns the number of bytes written.

   DST NULL: nothing is written; returns the total number of bytes the
   converted text would need, and stores the number of source bytes
   scanned through src_used (if non-NULL). */
826 | 1791 #define COPY_TEXT_BETWEEN_FORMATS(srcfmt, dstfmt) \ |
1792 do \ | |
1793 { \ | |
1794 if (dst) \ | |
1795 { \ | |
867 | 1796 Ibyte *dstend = dst + dstlen; \ |
1797 Ibyte *dstp = dst; \ | |
1798 const Ibyte *srcend = src + srclen; \ | |
1799 const Ibyte *srcp = src; \ | |
826 | 1800 \ |
1801 while (srcp < srcend) \ | |
1802 { \ | |
867 | 1803 Ichar ch = itext_ichar_fmt (srcp, srcfmt, srcobj); \ |
1804 Bytecount len = ichar_len_fmt (ch, dstfmt); \ | |
826 | 1805 \ |
1806 if (dstp + len <= dstend) \ | |
1807 { \ | |
2956 | 1808 (void) set_itext_ichar_fmt (dstp, ch, dstfmt, dstobj); \ |
826 | 1809 dstp += len; \ |
1810 } \ | |
1811 else \ | |
1812 break; \ | |
867 | 1813 INC_IBYTEPTR_FMT (srcp, srcfmt); \ |
826 | 1814 } \ |
1815 text_checking_assert (srcp <= srcend); \ | |
1816 if (src_used) \ | |
1817 *src_used = srcp - src; \ | |
1818 return dstp - dst; \ | |
1819 } \ | |
1820 else \ | |
1821 { \ | |
867 | 1822 const Ibyte *srcend = src + srclen; \ |
1823 const Ibyte *srcp = src; \ | |
826 | 1824 Bytecount total = 0; \ |
1825 \ | |
1826 while (srcp < srcend) \ | |
1827 { \ | |
867 | 1828 total += ichar_len_fmt (itext_ichar_fmt (srcp, srcfmt, \ |
826 | 1829 srcobj), dstfmt); \ |
867 | 1830 INC_IBYTEPTR_FMT (srcp, srcfmt); \ |
826 | 1831 } \ |
1832 text_checking_assert (srcp == srcend); \ | |
1833 if (src_used) \ | |
1834 *src_used = srcp - src; \ | |
1835 return total; \ | |
1836 } \ | |
1837 } \ | |
1838 while (0) | |
1839 | |
1840 /* Copy as much text from SRC/SRCLEN to DST/DSTLEN as will fit, converting | |
1841 from SRCFMT/SRCOBJ to DSTFMT/DSTOBJ. Return number of bytes stored into | |
1842 DST as return value, and number of bytes copied from SRC through | |
1843 SRC_USED (if not NULL). If DST is NULL, don't actually store anything | |
1844 and just return the size needed to store all the text. Will not copy | |
1845 partial characters into DST. */ | |
1846 | |
1847 Bytecount | |
867 | 1848 copy_text_between_formats (const Ibyte *src, Bytecount srclen, |
826 | 1849 Internal_Format srcfmt, |
2333 | 1850 Lisp_Object USED_IF_MULE (srcobj), |
867 | 1851 Ibyte *dst, Bytecount dstlen, |
826 | 1852 Internal_Format dstfmt, |
2333 | 1853 Lisp_Object USED_IF_MULE (dstobj), |
826 | 1854 Bytecount *src_used) |
1855 { | |
1856 if (srcfmt == dstfmt && | |
1857 objects_have_same_internal_representation (srcobj, dstobj)) | |
1858 { | |
1859 if (dst) | |
1860 { | |
1861 srclen = min (srclen, dstlen); | |
867 | 1862 srclen = validate_ibyte_string_backward (src, srclen); |
826 | 1863 memcpy (dst, src, srclen); |
1864 if (src_used) | |
1865 *src_used = srclen; | |
1866 return srclen; | |
1867 } | |
1868 else | |
1869 return srclen; | |
1870 } | |
1871 /* Everything before the final else statement is an optimization. | |
1872 The inner loops inside COPY_TEXT_BETWEEN_FORMATS() have a number | |
1873 of calls to *_fmt(), each of which has a switch statement in it. | |
1874 By using constants as the FMT argument, these switch statements | |
1875 will be optimized out of existence. */ | |
1876 #define ELSE_FORMATS(fmt1, fmt2) \ | |
1877 else if (srcfmt == fmt1 && dstfmt == fmt2) \ | |
1878 COPY_TEXT_BETWEEN_FORMATS (fmt1, fmt2) | |
1879 ELSE_FORMATS (FORMAT_DEFAULT, FORMAT_8_BIT_FIXED); | |
1880 ELSE_FORMATS (FORMAT_8_BIT_FIXED, FORMAT_DEFAULT); | |
1881 ELSE_FORMATS (FORMAT_DEFAULT, FORMAT_32_BIT_FIXED); | |
1882 ELSE_FORMATS (FORMAT_32_BIT_FIXED, FORMAT_DEFAULT); | |
1883 else | |
1884 COPY_TEXT_BETWEEN_FORMATS (srcfmt, dstfmt); | |
1885 #undef ELSE_FORMATS | |
1886 } | |
1887 | |
1888 /* Copy as much buffer text in BUF, starting at POS, of length LEN, as will | |
1889 fit into DST/DSTLEN, converting to DSTFMT. Return number of bytes | |
1890 stored into DST as return value, and number of bytes copied from BUF | |
1891 through SRC_USED (if not NULL). If DST is NULL, don't actually store | |
1892 anything and just return the size needed to store all the text. */ | |
/* The text is handed to copy_text_between_formats() one run at a time
   (RUNPTR/RUNLEN are supplied by BUFFER_TEXT_LOOP), and the per-run
   results are summed into DST_USED and *SRC_USED.

   NOTE(review): the_src_used has no initializer and is added to
   *SRC_USED after each call; this relies on copy_text_between_formats()
   storing through its SRC_USED argument on every path, including when
   DST is NULL -- verify. */
1893 | |
1894 Bytecount | |
1895 copy_buffer_text_out (struct buffer *buf, Bytebpos pos, | |
867 | 1896 Bytecount len, Ibyte *dst, Bytecount dstlen, |
826 | 1897 Internal_Format dstfmt, Lisp_Object dstobj, |
1898 Bytecount *src_used) | |
1899 { | |
1900 Bytecount dst_used = 0; | |
1901 if (src_used) | |
1902 *src_used = 0; | |
1903 | |
1904 { | |
1905 BUFFER_TEXT_LOOP (buf, pos, len, runptr, runlen) | |
1906 { | |
1907 Bytecount the_src_used, the_dst_used; | |
1908 | |
1909 the_dst_used = copy_text_between_formats (runptr, runlen, | |
1910 BUF_FORMAT (buf), | |
1911 wrap_buffer (buf), | |
1912 dst, dstlen, dstfmt, | |
1913 dstobj, &the_src_used); | |
1914 dst_used += the_dst_used; | |
1915 if (src_used) | |
1916 *src_used += the_src_used; | |
/* Only when actually storing: advance the output window past what
   this run produced. */
1917 if (dst) | |
1918 { | |
1919 dst += the_dst_used; | |
1920 dstlen -= the_dst_used; | |
841 | 1921 /* Stop if we didn't use all of the source text. Also stop |
1922 if the destination is full. We need the first test because | |
1923 there might be a couple bytes left in the destination, but | |
1924 not enough to fit a full character. The first test will in | |
1925 fact catch the vast majority of cases where the destination | |
1926 is empty, too -- but in case the destination holds *exactly* | |
1927 the run length, we put in the second check. (It shouldn't | |
1928 really matter though -- next time through we'll just get a | |
1929 0.) */ | |
1930 if (the_src_used < runlen || !dstlen) | |
826 | 1931 break; |
1932 } | |
1933 } | |
1934 } | |
1935 | |
1936 return dst_used; | |
1937 } | |
1938 | |
771 | 1939 |
1940 /************************************************************************/ | |
1941 /* charset properties of strings */ | |
1942 /************************************************************************/ | |
1943 | |
1944 void | |
2333 | 1945 find_charsets_in_ibyte_string (unsigned char *charsets, |
1946 const Ibyte *USED_IF_MULE (str), | |
1947 Bytecount USED_IF_MULE (len)) | |
771 | 1948 { |
1949 #ifndef MULE | |
1950 /* Telescope this. */ | |
1951 charsets[0] = 1; | |
1952 #else | |
867 | 1953 const Ibyte *strend = str + len; |
771 | 1954 memset (charsets, 0, NUM_LEADING_BYTES); |
1955 | |
1956 /* #### SJT doesn't like this. */ | |
1957 if (len == 0) | |
1958 { | |
1959 charsets[XCHARSET_LEADING_BYTE (Vcharset_ascii) - MIN_LEADING_BYTE] = 1; | |
1960 return; | |
1961 } | |
1962 | |
1963 while (str < strend) | |
1964 { | |
867 | 1965 charsets[ichar_leading_byte (itext_ichar (str)) - MIN_LEADING_BYTE] = |
771 | 1966 1; |
867 | 1967 INC_IBYTEPTR (str); |
771 | 1968 } |
1969 #endif | |
1970 } | |
1971 | |
1972 void | |
2333 | 1973 find_charsets_in_ichar_string (unsigned char *charsets, |
1974 const Ichar *USED_IF_MULE (str), | |
1975 Charcount USED_IF_MULE (len)) | |
771 | 1976 { |
1977 #ifndef MULE | |
1978 /* Telescope this. */ | |
1979 charsets[0] = 1; | |
1980 #else | |
1981 int i; | |
1982 | |
1983 memset (charsets, 0, NUM_LEADING_BYTES); | |
1984 | |
1985 /* #### SJT doesn't like this. */ | |
1986 if (len == 0) | |
1987 { | |
1988 charsets[XCHARSET_LEADING_BYTE (Vcharset_ascii) - MIN_LEADING_BYTE] = 1; | |
1989 return; | |
1990 } | |
1991 | |
1992 for (i = 0; i < len; i++) | |
1993 { | |
867 | 1994 charsets[ichar_leading_byte (str[i]) - MIN_LEADING_BYTE] = 1; |
771 | 1995 } |
1996 #endif | |
1997 } | |
1998 | |
3571 | 1999 /* A couple of these functions should only be called on a Mule build:
   without MULE the macro below expands to assert(0), which fails as soon
   as the function is entered. */ |
2000 #ifdef MULE | |
2001 #define ASSERT_BUILT_WITH_MULE() assert(1) | |
2002 #else /* MULE */ | |
2003 #define ASSERT_BUILT_WITH_MULE() assert(0) | |
2004 #endif /* MULE */ | |
2005 | |
771 | 2006 int |
867 | 2007 ibyte_string_displayed_columns (const Ibyte *str, Bytecount len) |
771 | 2008 { |
2009 int cols = 0; | |
867 | 2010 const Ibyte *end = str + len; |
3571 | 2011 Ichar ch; |
2012 | |
2013 ASSERT_BUILT_WITH_MULE(); | |
771 | 2014 |
2015 while (str < end) | |
2016 { | |
3571 | 2017 ch = itext_ichar (str); |
867 | 2018 cols += XCHARSET_COLUMNS (ichar_charset (ch)); |
2019 INC_IBYTEPTR (str); | |
771 | 2020 } |
2021 | |
2022 return cols; | |
2023 } | |
2024 | |
2025 int | |
3571 | 2026 ichar_string_displayed_columns (const Ichar * USED_IF_MULE(str), Charcount len) |
771 | 2027 { |
2028 int cols = 0; | |
2029 int i; | |
2030 | |
3571 | 2031 ASSERT_BUILT_WITH_MULE(); |
2032 | |
771 | 2033 for (i = 0; i < len; i++) |
867 | 2034 cols += XCHARSET_COLUMNS (ichar_charset (str[i])); |
771 | 2035 |
2036 return cols; | |
2037 } | |
2038 | |
2039 Charcount | |
2333 | 2040 ibyte_string_nonascii_chars (const Ibyte *USED_IF_MULE (str), |
2041 Bytecount USED_IF_MULE (len)) | |
771 | 2042 { |
2043 #ifdef MULE | |
867 | 2044 const Ibyte *end = str + len; |
771 | 2045 Charcount retval = 0; |
2046 | |
2047 while (str < end) | |
2048 { | |
826 | 2049 if (!byte_ascii_p (*str)) |
771 | 2050 retval++; |
867 | 2051 INC_IBYTEPTR (str); |
771 | 2052 } |
2053 | |
2054 return retval; | |
2055 #else | |
2056 return 0; | |
2057 #endif | |
2058 } | |
2059 | |
2060 | |
2061 /***************************************************************************/ | |
2062 /* Eistring helper functions */ | |
2063 /***************************************************************************/ | |
2064 | |
2065 int | |
867 | 2066 eistr_casefiddle_1 (Ibyte *olddata, Bytecount len, Ibyte *newdata, |
771 | 2067 int downp) |
2068 { | |
867 | 2069 Ibyte *endp = olddata + len; |
2070 Ibyte *newp = newdata; | |
771 | 2071 int changedp = 0; |
2072 | |
2073 while (olddata < endp) | |
2074 { | |
867 | 2075 Ichar c = itext_ichar (olddata); |
2076 Ichar newc; | |
771 | 2077 |
2078 if (downp) | |
2079 newc = DOWNCASE (0, c); | |
2080 else | |
2081 newc = UPCASE (0, c); | |
2082 | |
2083 if (c != newc) | |
2084 changedp = 1; | |
2085 | |
867 | 2086 newp += set_itext_ichar (newp, newc); |
2087 INC_IBYTEPTR (olddata); | |
771 | 2088 } |
2089 | |
2090 *newp = '\0'; | |
2091 | |
2092 return changedp ? newp - newdata : 0; | |
2093 } | |
2094 | |
/* Return a buffer size that is at least NEEDED_SIZE, obtained by growing
   OLDBUFSIZE by a factor of 3/2 (with a floor of 32) until it is large
   enough.  If OLDBUFSIZE already suffices it is returned unchanged.

   If the next growth step would not fit in an int, NEEDED_SIZE itself is
   returned instead of overflowing. */
int
eifind_large_enough_buffer (int oldbufsize, int needed_size)
{
  /* Largest value an int can hold, computed without <limits.h>. */
  const unsigned int largest_int = ((unsigned int) -1) >> 1;

  while (oldbufsize < needed_size)
    {
      unsigned int grown;

      if (oldbufsize < 22)
	{
	  /* 3/2 of anything below 22 is below the floor of 32. */
	  oldbufsize = 32;
	  continue;
	}

      /* N + N/2 equals N * 3 / 2 for non-negative N, and cannot wrap in
	 unsigned arithmetic. */
      grown = (unsigned int) oldbufsize + (unsigned int) oldbufsize / 2;
      if (grown > largest_int)
	return needed_size;
      oldbufsize = (int) grown;
    }

  return oldbufsize;
}
2106 | |
2107 void | |
2108 eito_malloc_1 (Eistring *ei) | |
2109 { | |
2110 if (ei->mallocp_) | |
2111 return; | |
2112 ei->mallocp_ = 1; | |
2113 if (ei->data_) | |
2114 { | |
867 | 2115 Ibyte *newdata; |
771 | 2116 |
2117 ei->max_size_allocated_ = | |
2118 eifind_large_enough_buffer (0, ei->bytelen_ + 1); | |
2367 | 2119 newdata = xnew_ibytes (ei->max_size_allocated_); |
771 | 2120 memcpy (newdata, ei->data_, ei->bytelen_ + 1); |
2121 ei->data_ = newdata; | |
2122 } | |
2123 | |
2124 if (ei->extdata_) | |
2125 { | |
2367 | 2126 Extbyte *newdata = xnew_extbytes (ei->extlen_ + 2); |
771 | 2127 |
2128 memcpy (newdata, ei->extdata_, ei->extlen_); | |
2129 /* Double null-terminate in case of Unicode data */ | |
2130 newdata[ei->extlen_] = '\0'; | |
2131 newdata[ei->extlen_ + 1] = '\0'; | |
2132 ei->extdata_ = newdata; | |
2133 } | |
2134 } | |
2135 | |
2136 int | |
2137 eicmp_1 (Eistring *ei, Bytecount off, Charcount charoff, | |
867 | 2138 Bytecount len, Charcount charlen, const Ibyte *data, |
2421 | 2139 const Eistring *ei2, int is_ascii, int fold_case) |
771 | 2140 { |
3462 | 2141 assert ((data == 0) != (ei == 0)); |
2142 assert ((is_ascii != 0) == (data != 0)); | |
2143 assert (fold_case >= 0 && fold_case <= 2); | |
771 | 2144 assert ((off < 0) != (charoff < 0)); |
3462 | 2145 |
771 | 2146 if (off < 0) |
2147 { | |
2148 off = charcount_to_bytecount (ei->data_, charoff); | |
2149 if (charlen < 0) | |
2150 len = -1; | |
2151 else | |
2152 len = charcount_to_bytecount (ei->data_ + off, charlen); | |
2153 } | |
2154 if (len < 0) | |
2155 len = ei->bytelen_ - off; | |
2156 | |
2157 assert (off >= 0 && off <= ei->bytelen_); | |
2158 assert (len >= 0 && off + len <= ei->bytelen_); | |
2159 | |
2160 { | |
2161 Bytecount dstlen; | |
867 | 2162 const Ibyte *src = ei->data_, *dst; |
771 | 2163 |
2164 if (data) | |
2165 { | |
2166 dst = data; | |
2167 dstlen = qxestrlen (data); | |
2168 } | |
2169 else | |
2170 { | |
2171 dst = ei2->data_; | |
2172 dstlen = ei2->bytelen_; | |
2173 } | |
2174 | |
2421 | 2175 if (is_ascii) |
2367 | 2176 ASSERT_ASCTEXT_ASCII_LEN ((Ascbyte *) dst, dstlen); |
771 | 2177 |
801 | 2178 return (fold_case == 0 ? qxememcmp4 (src, len, dst, dstlen) : |
2179 fold_case == 1 ? qxememcasecmp4 (src, len, dst, dstlen) : | |
2180 qxetextcasecmp (src, len, dst, dstlen)); | |
771 | 2181 } |
2182 } | |
2183 | |
867 | 2184 Ibyte * |
826 | 2185 eicpyout_malloc_fmt (Eistring *eistr, Bytecount *len_out, Internal_Format fmt, |
2286 | 2186 Lisp_Object UNUSED (object)) |
771 | 2187 { |
867 | 2188 Ibyte *ptr; |
771 | 2189 |
2190 assert (fmt == FORMAT_DEFAULT); | |
867 | 2191 ptr = xnew_array (Ibyte, eistr->bytelen_ + 1); |
771 | 2192 if (len_out) |
2193 *len_out = eistr->bytelen_; | |
2194 memcpy (ptr, eistr->data_, eistr->bytelen_ + 1); | |
2195 return ptr; | |
2196 } | |
2197 | |
2198 | |
2199 /************************************************************************/ | |
2200 /* Charcount/Bytecount conversion */ | |
2201 /************************************************************************/ | |
2202 | |
2203 /* Optimization. Do it. Live it. Love it. */ | |
2204 | |
2205 #ifdef MULE | |
2206 | |
/* STRIDE_TYPE is the integer type used by skip_ascii() and
   skip_ascii_down() to examine several bytes at once; the widest type
   flagged as efficient is chosen.  HIGH_BIT_MASK has the high bit of
   every byte of that type set, so (word & HIGH_BIT_MASK) is zero exactly
   when every byte in WORD has its high bit clear. */
#ifdef EFFICIENT_INT_128_BIT
# define STRIDE_TYPE INT_128_BIT
# define HIGH_BIT_MASK \
    MAKE_128_BIT_UNSIGNED_CONSTANT (0x80808080808080808080808080808080)
#elif defined (EFFICIENT_INT_64_BIT)
# define STRIDE_TYPE INT_64_BIT
# define HIGH_BIT_MASK MAKE_64_BIT_UNSIGNED_CONSTANT (0x8080808080808080)
#else
# define STRIDE_TYPE INT_32_BIT
# define HIGH_BIT_MASK MAKE_32_BIT_UNSIGNED_CONSTANT (0x80808080)
#endif

/* Low address bits that must be zero for a STRIDE_TYPE-aligned pointer. */
#define ALIGN_BITS ((EMACS_UINT) (ALIGNOF (STRIDE_TYPE) - 1))
#define ALIGN_MASK (~ ALIGN_BITS)
/* Nonzero if PTR is suitably aligned for a STRIDE_TYPE access.  The
   argument is parenthesized so that expressions such as `p + 1' are cast
   as a whole. */
#define ALIGNED(ptr) ((((EMACS_UINT) (ptr)) & ALIGN_BITS) == 0)
/* Number of bytes examined per step of the word-at-a-time loops. */
#define STRIDE (sizeof (STRIDE_TYPE))
2223 | |
2367 | 2224 /* Skip as many ASCII bytes as possible in the memory block [PTR, END). |
2225 Return pointer to the first non-ASCII byte. optimized for long | |
2226 stretches of ASCII. */ | |
2227 inline static const Ibyte * | |
2228 skip_ascii (const Ibyte *ptr, const Ibyte *end) | |
2229 { | |
826 | 2230 const unsigned STRIDE_TYPE *ascii_end; |
2231 | |
2232 /* Need to do in 3 sections -- before alignment start, aligned chunk, | |
2233 after alignment end. */ | |
2234 while (!ALIGNED (ptr)) | |
771 | 2235 { |
826 | 2236 if (ptr == end || !byte_ascii_p (*ptr)) |
2237 return ptr; | |
2238 ptr++; | |
2239 } | |
2240 ascii_end = (const unsigned STRIDE_TYPE *) ptr; | |
2241 /* This loop screams, because we can detect ASCII | |
2242 characters 4 or 8 at a time. */ | |
867 | 2243 while ((const Ibyte *) ascii_end + STRIDE <= end |
826 | 2244 && !(*ascii_end & HIGH_BIT_MASK)) |
2245 ascii_end++; | |
867 | 2246 ptr = (Ibyte *) ascii_end; |
826 | 2247 while (ptr < end && byte_ascii_p (*ptr)) |
2248 ptr++; | |
2249 return ptr; | |
2250 } | |
2251 | |
2367 | 2252 /* Skip as many ASCII bytes as possible in the memory block [END, PTR), |
2253 going downwards. Return pointer to the location above the first | |
2254 non-ASCII byte. Optimized for long stretches of ASCII. */ | |
2255 inline static const Ibyte * | |
2256 skip_ascii_down (const Ibyte *ptr, const Ibyte *end) | |
2257 { | |
2258 const unsigned STRIDE_TYPE *ascii_end; | |
2259 | |
2260 /* Need to do in 3 sections -- before alignment start, aligned chunk, | |
2261 after alignment end. */ | |
2262 while (!ALIGNED (ptr)) | |
2263 { | |
2264 if (ptr == end || !byte_ascii_p (*(ptr - 1))) | |
2265 return ptr; | |
2266 ptr--; | |
2267 } | |
2268 ascii_end = (const unsigned STRIDE_TYPE *) ptr - 1; | |
2269 /* This loop screams, because we can detect ASCII | |
2270 characters 4 or 8 at a time. */ | |
2271 while ((const Ibyte *) ascii_end >= end | |
2272 && !(*ascii_end & HIGH_BIT_MASK)) | |
2273 ascii_end--; | |
2274 ptr = (Ibyte *) (ascii_end + 1); | |
2275 while (ptr > end && byte_ascii_p (*(ptr - 1))) | |
2276 ptr--; | |
2277 return ptr; | |
2278 } | |
2279 | |
826 | 2280 /* Function equivalents of bytecount_to_charcount/charcount_to_bytecount. |
2281 These work on strings of all sizes but are more efficient than a simple | |
2282 loop on large strings and probably less efficient on sufficiently small | |
2283 strings. */ | |
2284 | |
2285 Charcount | |
867 | 2286 bytecount_to_charcount_fun (const Ibyte *ptr, Bytecount len) |
826 | 2287 { |
2288 Charcount count = 0; | |
867 | 2289 const Ibyte *end = ptr + len; |
826 | 2290 while (1) |
2291 { | |
867 | 2292 const Ibyte *newptr = skip_ascii (ptr, end); |
826 | 2293 count += newptr - ptr; |
2294 ptr = newptr; | |
2295 if (ptr == end) | |
2296 break; | |
2297 { | |
2298 /* Optimize for successive characters from the same charset */ | |
867 | 2299 Ibyte leading_byte = *ptr; |
826 | 2300 int bytes = rep_bytes_by_first_byte (leading_byte); |
2301 while (ptr < end && *ptr == leading_byte) | |
2302 ptr += bytes, count++; | |
2303 } | |
771 | 2304 } |
2305 | |
2306 /* Bomb out if the specified substring ends in the middle | |
2307 of a character. Note that we might have already gotten | |
2308 a core dump above from an invalid reference, but at least | |
2309 we will get no farther than here. | |
2310 | |
2311 This also catches len < 0. */ | |
800 | 2312 text_checking_assert (ptr == end); |
771 | 2313 |
2314 return count; | |
2315 } | |
2316 | |
2317 Bytecount | |
867 | 2318 charcount_to_bytecount_fun (const Ibyte *ptr, Charcount len) |
771 | 2319 { |
867 | 2320 const Ibyte *newptr = ptr; |
826 | 2321 while (1) |
771 | 2322 { |
867 | 2323 const Ibyte *newnewptr = skip_ascii (newptr, newptr + len); |
826 | 2324 len -= newnewptr - newptr; |
2325 newptr = newnewptr; | |
2326 if (!len) | |
2327 break; | |
2328 { | |
2329 /* Optimize for successive characters from the same charset */ | |
867 | 2330 Ibyte leading_byte = *newptr; |
826 | 2331 int bytes = rep_bytes_by_first_byte (leading_byte); |
2332 while (len > 0 && *newptr == leading_byte) | |
2333 newptr += bytes, len--; | |
2334 } | |
771 | 2335 } |
2336 return newptr - ptr; | |
2337 } | |
2338 | |
2367 | 2339 /* Function equivalent of charcount_to_bytecount_down. This works on strings |
2340 of all sizes but is more efficient than a simple loop on large strings | |
2341 and probably less efficient on sufficiently small strings. */ | |
2342 | |
2343 Bytecount | |
2344 charcount_to_bytecount_down_fun (const Ibyte *ptr, Charcount len) | |
2345 { | |
2346 const Ibyte *newptr = ptr; | |
2347 while (1) | |
2348 { | |
2349 const Ibyte *newnewptr = skip_ascii_down (newptr, newptr - len); | |
2350 len -= newptr - newnewptr; | |
2351 newptr = newnewptr; | |
2352 /* Skip over all non-ASCII chars, counting the length and | |
2353 stopping if it's zero */ | |
2354 while (len && !byte_ascii_p (*(newptr - 1))) | |
2355 if (ibyte_first_byte_p (*--newptr)) | |
2356 len--; | |
2357 if (!len) | |
2358 break; | |
2359 } | |
2360 text_checking_assert (ptr - newptr >= 0); | |
2361 return ptr - newptr; | |
2362 } | |
2363 | |
771 | 2364 /* The next two functions are the actual meat behind the |
2365 charbpos-to-bytebpos and bytebpos-to-charbpos conversions. Currently | |
2366 the method they use is fairly unsophisticated; see buffer.h. | |
2367 | |
2368 Note that charbpos_to_bytebpos_func() is probably the most-called | |
2369 function in all of XEmacs. Therefore, it must be FAST FAST FAST. | |
2370 This is the reason why so much of the code is duplicated. | |
2371 | |
2372 Similar considerations apply to bytebpos_to_charbpos_func(), although | |
2373 less so because the function is not called so often. | |
2367 | 2374 */ |
2375 | |
2376 /* | |
2377 | |
2378 Info on Byte-Char conversion: | |
2379 | |
2380 (Info-goto-node "(internals)Byte-Char Position Conversion") | |
2381 */ | |
2382 | |
/* Counter used only by the OLD_BYTE_CHAR code in charbpos_to_bytebpos_func:
   it is bumped on every cache insertion and its low bits choose which
   cache slot to overwrite. */
2383 #ifdef OLD_BYTE_CHAR |
771 | 2384 static int not_very_random_number; |
2367 | 2385 #endif /* OLD_BYTE_CHAR */ |
2386 |
/* When OLD_LOOP is defined, the conversion functions below walk from the
   nearest known position one character at a time with INC_BYTEBPOS() /
   DEC_BYTEBPOS() instead of calling charcount_to_bytecount() and
   friends. */
2387 #define OLD_LOOP |
2388 |
2389 /* If we are more than this many characters away from any known position,
2390 cache the new position in the buffer's char-byte cache. */ |
2391 #define FAR_AWAY_DISTANCE 5000 |
2392 | |
2393 /* Converting between character positions and byte positions. */ | |
2394 | |
2395 /* There are several places in the buffer where we know | |
2396 the correspondence: BEG, BEGV, PT, GPT, ZV and Z, | |
2397 and everywhere there is a marker. So we find the one of these places | |
2398 that is closest to the specified position, and scan from there. */ | |
2399 | |
2400 /* This macro is a subroutine of charbpos_to_bytebpos_func. |
2401 Note that it is desirable that BYTEPOS is not evaluated |
2402 except when we really want its value.

   It offers the known correspondence (CHARPOS, BYTEPOS) as a candidate
   bound for the character position being converted.  It must be expanded
   where the variables x, retval, best_above, best_above_byte, best_below
   and best_below_byte and the label `done' are in scope:

   -- if CHARPOS equals x, retval is set to BYTEPOS and control jumps to
      `done';
   -- otherwise, if CHARPOS lies above x and is lower than best_above, it
      becomes the new upper bound; if it lies below x and is higher than
      best_below, it becomes the new lower bound;
   -- if a bound changed and the two bounds are now as far apart in bytes
      as in characters (i.e. everything between them is single-byte),
      retval is computed by interpolation and control jumps to `done'.

   CHARPOS is evaluated exactly once; BYTEPOS at most once. */ |
2403 |
2404 #define CONSIDER(CHARPOS, BYTEPOS) \ |
2405 do \ |
2406 { \ |
2407 Charbpos this_charpos = (CHARPOS); \ |
2408 int changed = 0; \ |
2409 \ |
2410 if (this_charpos == x) \ |
2411 { \ |
2412 retval = (BYTEPOS); \ |
2413 goto done; \ |
2414 } \ |
2415 else if (this_charpos > x) \ |
2416 { \ |
2417 if (this_charpos < best_above) \ |
2418 { \ |
2419 best_above = this_charpos; \ |
2420 best_above_byte = (BYTEPOS); \ |
2421 changed = 1; \ |
2422 } \ |
2423 } \ |
2424 else if (this_charpos > best_below) \ |
2425 { \ |
2426 best_below = this_charpos; \ |
2427 best_below_byte = (BYTEPOS); \ |
2428 changed = 1; \ |
2429 } \ |
2430 \ |
2431 if (changed) \ |
2432 { \ |
2433 if (best_above - best_below == best_above_byte - best_below_byte) \ |
2434 { \ |
2435 retval = best_below_byte + (x - best_below); \ |
2436 goto done; \ |
2437 } \ |
2438 } \ |
2439 } \ |
2440 while (0) |
2441 | |
771 | 2442 |
/* Return the byte position in BUF corresponding to the character
   position X.

   The function starts from positions whose char/byte correspondence is
   already known -- BUF_BEG/BUF_Z, BUF_PT, BUF_GPT, BUF_BEGV, BUF_ZV, the
   single most recent result (cached_charpos/cached_bytepos) and the
   entries of the mule_charbpos_cache/mule_bytebpos_cache arrays -- and
   uses CONSIDER to narrow them down to the closest known position on
   each side of X.  It then scans from whichever side is closer.

   Side effects on buf->text: the result is always stored in
   cached_charpos/cached_bytepos, and when the scan covered more than
   FAR_AWAY_DISTANCE characters it is also appended to the position
   cache (discarding the NUM_MOVED_POSITIONS oldest entries when the
   cache is full). */
771 | 2443 Bytebpos |
2444 charbpos_to_bytebpos_func (struct buffer *buf, Charbpos x) |
2445 { |
2367 | 2446 #ifdef OLD_BYTE_CHAR |
771 | 2447 Charbpos bufmin; |
2448 Charbpos bufmax; |
2449 Bytebpos bytmin; |
2450 Bytebpos bytmax; |
2451 int size; |
2452 int forward_p; |
2453 int diff_so_far; |
2454 int add_to_cache = 0; |
2367 | 2455 #endif /* OLD_BYTE_CHAR */ |
2456 |
2457 Charbpos best_above, best_below; |
2458 Bytebpos best_above_byte, best_below_byte; |
2459 int i; |
2460 struct buffer_text *t; |
2461 Bytebpos retval; |
2462 |
1292 | 2463 PROFILE_DECLARE (); |
771 | 2464 |
1292 | 2465 PROFILE_RECORD_ENTERING_SECTION (QSin_char_byte_conversion); |
2466 |
2367 | 2467 best_above = BUF_Z (buf); |
2468 best_above_byte = BYTE_BUF_Z (buf); |
2469 |
2470 /* In this case, we simply have all one-byte characters. But this should |
2471 have been intercepted before, in charbpos_to_bytebpos(). */ |
2472 text_checking_assert (best_above != best_above_byte); |
2473 |
2474 best_below = BUF_BEG (buf); |
2475 best_below_byte = BYTE_BUF_BEG (buf); |
2476 |
2477 /* We find in best_above and best_above_byte |
2478 the closest known point above X, |
2479 and in best_below and best_below_byte |
2480 the closest known point below X. |
2481 |
2482 If at any point we can tell that the space between those |
2483 two best approximations is all single-byte, |
2484 we interpolate the result immediately. */ |
2485 |
2486 CONSIDER (BUF_PT (buf), BYTE_BUF_PT (buf)); |
2487 CONSIDER (BUF_GPT (buf), BYTE_BUF_GPT (buf)); |
2488 CONSIDER (BUF_BEGV (buf), BYTE_BUF_BEGV (buf)); |
2489 CONSIDER (BUF_ZV (buf), BYTE_BUF_ZV (buf)); |
2490 |
2491 t = buf->text; |
2492 CONSIDER (t->cached_charpos, t->cached_bytepos); |
2493 |
2494 /* Check the most recently entered positions first */ |
2495 |
2496 for (i = t->next_cache_pos - 1; i >= 0; i--) |
2497 { |
2498 CONSIDER (t->mule_charbpos_cache[i], t->mule_bytebpos_cache[i]); |
2499 |
2500 /* If we are down to a range of 50 chars, |
2501 don't bother checking any other markers; |
2502 scan the intervening chars directly now. */ |
2503 if (best_above - best_below < 50) |
2504 break; |
2505 } |
2506 |
2507 /* We get here if we did not exactly hit one of the known places. |
2508 We have one known above and one known below. |
2509 Scan, counting characters, from whichever one is closer. */ |
2510 |
2511 if (x - best_below < best_above - x) |
2512 { |
2513 int record = x - best_below > FAR_AWAY_DISTANCE; |
2514 |
2515 #ifdef OLD_LOOP /* old code */ |
2516 while (best_below != x) |
2517 { |
2518 best_below++; |
2519 INC_BYTEBPOS (buf, best_below_byte); |
2520 } |
2521 #else |
2522 text_checking_assert (BUF_FORMAT (buf) == FORMAT_DEFAULT); |
2523 /* The gap should not occur between best_below and x, or we will be |
2524 screwed in using charcount_to_bytecount(). It should not be exactly |
2525 at x either, because we already should have caught that. */ |
2526 text_checking_assert |
2527 (BUF_CEILING_OF_IGNORE_ACCESSIBLE (buf, best_below) > x); |
2528 |
2529 /* Using charcount_to_bytecount() is potentially a lot faster than a |
2530 simple loop using INC_BYTEBPOS() because (a) the checks for gap |
2531 and buffer format are factored out instead of getting checked |
2532 every time; (b) the checking goes 4 or 8 bytes at a time in ASCII |
2533 text. |
2534 */ |
2535 best_below_byte += |
2536 charcount_to_bytecount |
2537 (BYTE_BUF_BYTE_ADDRESS (buf, best_below_byte), x - best_below); |
2538 best_below = x; |
2539 #endif /* OLD_LOOP */ |
2540 |
2541 /* If this position is quite far from the nearest known position, |
2542 cache the correspondence. |
2543 |
2544 NB FSF does this: "... by creating a marker here. |
2545 It will last until the next GC." |
2546 */ |
2547 |
2548 if (record) |
2549 { |
2550 /* If we have run out of positions to record, discard some of the |
2551 old ones. I used to use a circular buffer, which avoids the |
2552 need to block-move any memory. But it makes it more difficult |
2553 to keep track of which positions haven't been used -- commonly |
2554 we haven't yet filled out anywhere near the whole set of |
2555 positions and don't want to check them all. We should not be |
2556 recording that often, and block-moving is extremely fast in |
2557 any case. --ben */ |
2558 if (t->next_cache_pos == NUM_CACHED_POSITIONS) |
2559 { |
2560 memmove (t->mule_charbpos_cache, |
2561 t->mule_charbpos_cache + NUM_MOVED_POSITIONS, |
2562 sizeof (Charbpos) * |
2563 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS)); |
2564 memmove (t->mule_bytebpos_cache, |
2565 t->mule_bytebpos_cache + NUM_MOVED_POSITIONS, |
2566 sizeof (Bytebpos) * |
2567 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS)); |
2568 t->next_cache_pos -= NUM_MOVED_POSITIONS; |
2569 } |
2570 t->mule_charbpos_cache[t->next_cache_pos] = best_below; |
2571 t->mule_bytebpos_cache[t->next_cache_pos] = best_below_byte; |
2572 t->next_cache_pos++; |
2573 } |
2574 |
2575 t->cached_charpos = best_below; |
2576 t->cached_bytepos = best_below_byte; |
2577 |
2578 retval = best_below_byte; |
2579 text_checking_assert (best_below_byte >= best_below); |
2580 goto done; |
2581 } |
2582 else |
2583 { |
2584 int record = best_above - x > FAR_AWAY_DISTANCE; |
2585 |
2586 #ifdef OLD_LOOP |
2587 while (best_above != x) |
2588 { |
2589 best_above--; |
2590 DEC_BYTEBPOS (buf, best_above_byte); |
2591 } |
2592 #else |
2593 text_checking_assert (BUF_FORMAT (buf) == FORMAT_DEFAULT); |
2594 /* The gap should not occur between best_above and x, or we will be |
2595 screwed in using charcount_to_bytecount_down(). It should not be |
2596 exactly at x either, because we already should have caught |
2597 that. */ |
2598 text_checking_assert |
2599 (BUF_FLOOR_OF_IGNORE_ACCESSIBLE (buf, best_above) < x); |
2600 |
2601 /* Using charcount_to_bytecount_down() is potentially a lot faster |
2602 than a simple loop using DEC_BYTEBPOS(); see above. */ |
2603 best_above_byte -= |
2604 charcount_to_bytecount_down |
2605 /* BYTE_BUF_BYTE_ADDRESS will return a value on the high side of the |
2606 gap if we are at the gap, which is the wrong side. So do the |
2607 following trick instead. */ |
2608 (BYTE_BUF_BYTE_ADDRESS_BEFORE (buf, best_above_byte) + 1, |
2609 best_above - x); |
2610 best_above = x; |
2611 #endif /* OLD_LOOP */ |
2612 |
2613 |
2614 /* If this position is quite far from the nearest known position, |
2615 cache the correspondence. |
2616 |
2617 NB FSF does this: "... by creating a marker here. |
2618 It will last until the next GC." |
2619 */ |
2620 if (record) |
2621 { |
2622 if (t->next_cache_pos == NUM_CACHED_POSITIONS) |
2623 { |
2624 memmove (t->mule_charbpos_cache, |
2625 t->mule_charbpos_cache + NUM_MOVED_POSITIONS, |
2626 sizeof (Charbpos) * |
2627 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS)); |
2628 memmove (t->mule_bytebpos_cache, |
2629 t->mule_bytebpos_cache + NUM_MOVED_POSITIONS, |
2630 sizeof (Bytebpos) * |
2631 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS)); |
2632 t->next_cache_pos -= NUM_MOVED_POSITIONS; |
2633 } |
2634 t->mule_charbpos_cache[t->next_cache_pos] = best_above; |
2635 t->mule_bytebpos_cache[t->next_cache_pos] = best_above_byte; |
2636 t->next_cache_pos++; |
2637 } |
2638 |
2639 t->cached_charpos = best_above; |
2640 t->cached_bytepos = best_above_byte; |
2641 |
2642 retval = best_above_byte; |
2643 text_checking_assert (best_above_byte >= best_above); |
2644 goto done; |
2645 } |
2646 |
/* NOTE: both branches of the if/else above end in `goto done', so the
   OLD_BYTE_CHAR code below is never executed even when it is compiled
   in.  It is the older "known region" implementation. */
2647 #ifdef OLD_BYTE_CHAR |
2648 |
771 | 2649 bufmin = buf->text->mule_bufmin; |
2650 bufmax = buf->text->mule_bufmax; |
2651 bytmin = buf->text->mule_bytmin; |
2652 bytmax = buf->text->mule_bytmax; |
2653 size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p; |
2654 |
2655 /* The basic idea here is that we shift the "known region" up or down |
2656 until it overlaps the specified position. We do this by moving |
2657 the upper bound of the known region up one character at a time, |
2658 and moving the lower bound of the known region up as necessary |
2659 when the size of the character just seen changes. |
2660 |
2661 We optimize this, however, by first shifting the known region to |
2662 one of the cached points if it's close by. (We don't check BEG or |
2663 Z, even though they're cached; most of the time these will be the |
2664 same as BEGV and ZV, and when they're not, they're not likely |
2665 to be used.) */ |
2666 |
2667 if (x > bufmax) |
2668 { |
2669 Charbpos diffmax = x - bufmax; |
2670 Charbpos diffpt = x - BUF_PT (buf); |
2671 Charbpos diffzv = BUF_ZV (buf) - x; |
2672 /* #### This value could stand some more exploration. */ |
2673 Charcount heuristic_hack = (bufmax - bufmin) >> 2; |
2674 |
2675 /* Check if the position is closer to PT or ZV than to the |
2676 end of the known region. */ |
2677 |
2678 if (diffpt < 0) |
2679 diffpt = -diffpt; |
2680 if (diffzv < 0) |
2681 diffzv = -diffzv; |
2682 |
2683 /* But also implement a heuristic that favors the known region |
2684 over PT or ZV. The reason for this is that switching to |
2685 PT or ZV will wipe out the knowledge in the known region, |
2686 which might be annoying if the known region is large and |
2687 PT or ZV is not that much closer than the end of the known |
2688 region. */ |
2689 |
2690 diffzv += heuristic_hack; |
2691 diffpt += heuristic_hack; |
2692 if (diffpt < diffmax && diffpt <= diffzv) |
2693 { |
2694 bufmax = bufmin = BUF_PT (buf); |
826 | 2695 bytmax = bytmin = BYTE_BUF_PT (buf); |
771 | 2696 /* We set the size to 1 even though it doesn't really |
2697 matter because the new known region contains no |
2698 characters. We do this because this is the most |
2699 likely size of the characters around the new known |
2700 region, and we avoid potential yuckiness that is |
2701 done when size == 3. */ |
2702 size = 1; |
2703 } |
2704 if (diffzv < diffmax) |
2705 { |
2706 bufmax = bufmin = BUF_ZV (buf); |
826 | 2707 bytmax = bytmin = BYTE_BUF_ZV (buf); |
771 | 2708 size = 1; |
2709 } |
2710 } |
800 | 2711 #ifdef ERROR_CHECK_TEXT |
771 | 2712 else if (x >= bufmin) |
2500 | 2713 ABORT (); |
771 | 2714 #endif |
2715 else |
2716 { |
2717 Charbpos diffmin = bufmin - x; |
2718 Charbpos diffpt = BUF_PT (buf) - x; |
2719 Charbpos diffbegv = x - BUF_BEGV (buf); |
2720 /* #### This value could stand some more exploration. */ |
2721 Charcount heuristic_hack = (bufmax - bufmin) >> 2; |
2722 |
2723 if (diffpt < 0) |
2724 diffpt = -diffpt; |
2725 if (diffbegv < 0) |
2726 diffbegv = -diffbegv; |
2727 |
2728 /* But also implement a heuristic that favors the known region -- |
2729 see above. */ |
2730 |
2731 diffbegv += heuristic_hack; |
2732 diffpt += heuristic_hack; |
2733 |
2734 if (diffpt < diffmin && diffpt <= diffbegv) |
2735 { |
2736 bufmax = bufmin = BUF_PT (buf); |
826 | 2737 bytmax = bytmin = BYTE_BUF_PT (buf); |
771 | 2738 /* We set the size to 1 even though it doesn't really |
2739 matter because the new known region contains no |
2740 characters. We do this because this is the most |
2741 likely size of the characters around the new known |
2742 region, and we avoid potential yuckiness that is |
2743 done when size == 3. */ |
2744 size = 1; |
2745 } |
2746 if (diffbegv < diffmin) |
2747 { |
2748 bufmax = bufmin = BUF_BEGV (buf); |
826 | 2749 bytmax = bytmin = BYTE_BUF_BEGV (buf); |
771 | 2750 size = 1; |
2751 } |
2752 } |
2753 |
2754 diff_so_far = x > bufmax ? x - bufmax : bufmin - x; |
2755 if (diff_so_far > 50) |
2756 { |
2757 /* If we have to move more than a certain amount, then look |
2758 into our cache. */ |
2759 int minval = INT_MAX; |
2760 int found = 0; |
2761 int i; |
2762 |
2763 add_to_cache = 1; |
2764 /* I considered keeping the positions ordered. This would speed |
2765 up this loop, but updating the cache would take longer, so |
2766 it doesn't seem like it would really matter. */ |
2367 | 2767 for (i = 0; i < NUM_CACHED_POSITIONS; i++) |
771 | 2768 { |
2769 int diff = buf->text->mule_charbpos_cache[i] - x; |
2770 |
2771 if (diff < 0) |
2772 diff = -diff; |
2773 if (diff < minval) |
2774 { |
2775 minval = diff; |
2776 found = i; |
2777 } |
2778 } |
2779 |
2780 if (minval < diff_so_far) |
2781 { |
2782 bufmax = bufmin = buf->text->mule_charbpos_cache[found]; |
2783 bytmax = bytmin = buf->text->mule_bytebpos_cache[found]; |
2784 size = 1; |
2785 } |
2786 } |
2787 |
2788 /* It's conceivable that the caching above could lead to X being |
2789 the same as one of the range edges. */ |
2790 if (x >= bufmax) |
2791 { |
2792 Bytebpos newmax; |
2793 Bytecount newsize; |
2794 |
2795 forward_p = 1; |
2796 while (x > bufmax) |
2797 { |
2798 newmax = bytmax; |
2799 |
2800 INC_BYTEBPOS (buf, newmax); |
2801 newsize = newmax - bytmax; |
2802 if (newsize != size) |
2803 { |
2804 bufmin = bufmax; |
2805 bytmin = bytmax; |
2806 size = newsize; |
2807 } |
2808 bytmax = newmax; |
2809 bufmax++; |
2810 } |
2811 retval = bytmax; |
2812 |
2813 /* #### Should go past the found location to reduce the number |
2814 of times that this function is called */ |
2815 } |
2816 else /* x < bufmin */ |
2817 { |
2818 Bytebpos newmin; |
2819 Bytecount newsize; |
2820 |
2821 forward_p = 0; |
2822 while (x < bufmin) |
2823 { |
2824 newmin = bytmin; |
2825 |
2826 DEC_BYTEBPOS (buf, newmin); |
2827 newsize = bytmin - newmin; |
2828 if (newsize != size) |
2829 { |
2830 bufmax = bufmin; |
2831 bytmax = bytmin; |
2832 size = newsize; |
2833 } |
2834 bytmin = newmin; |
2835 bufmin--; |
2836 } |
2837 retval = bytmin; |
2838 |
2839 /* #### Should go past the found location to reduce the number |
2840 of times that this function is called |
2841 */ |
2842 } |
2843 |
2844 /* If size is three, then we have to make sure that the range we |
2845 discovered isn't too large, because we use a fixed-length |
2846 table to divide by 3. */ |
2847 |
2848 if (size == 3) |
2849 { |
2850 int gap = bytmax - bytmin; |
2851 buf->text->mule_three_p = 1; |
2852 buf->text->mule_shifter = 1; |
2853 |
2854 if (gap > MAX_BYTEBPOS_GAP_SIZE_3) |
2855 { |
2856 if (forward_p) |
2857 { |
2858 bytmin = bytmax - MAX_BYTEBPOS_GAP_SIZE_3; |
2859 bufmin = bufmax - MAX_CHARBPOS_GAP_SIZE_3; |
2860 } |
2861 else |
2862 { |
2863 bytmax = bytmin + MAX_BYTEBPOS_GAP_SIZE_3; |
2864 bufmax = bufmin + MAX_CHARBPOS_GAP_SIZE_3; |
2865 } |
2866 } |
2867 } |
2868 else |
2869 { |
2870 buf->text->mule_three_p = 0; |
2871 if (size == 4) |
2872 buf->text->mule_shifter = 2; |
2873 else |
2874 buf->text->mule_shifter = size - 1; |
2875 } |
2876 |
2877 buf->text->mule_bufmin = bufmin; |
2878 buf->text->mule_bufmax = bufmax; |
2879 buf->text->mule_bytmin = bytmin; |
2880 buf->text->mule_bytmax = bytmax; |
2881 |
2882 if (add_to_cache) |
2883 { |
2884 int replace_loc; |
2885 |
2886 /* We throw away a "random" cached value and replace it with |
2887 the new value. It doesn't actually have to be very random |
2888 at all, just evenly distributed. |
2889 |
2890 #### It would be better to use a least-recently-used algorithm |
2891 or something that tries to space things out, but I'm not sure |
2892 it's worth it to go to the trouble of maintaining that. */ |
2893 not_very_random_number += 621; |
2894 replace_loc = not_very_random_number & 15; |
2895 buf->text->mule_charbpos_cache[replace_loc] = x; |
2896 buf->text->mule_bytebpos_cache[replace_loc] = retval; |
2897 } |
2898 |
2367 | 2899 #endif /* OLD_BYTE_CHAR */ |
2900 |
2901 done: |
1292 | 2902 PROFILE_RECORD_EXITING_SECTION (QSin_char_byte_conversion); |
2903 |
771 | 2904 return retval; |
2905 } |
2906 | |
2367 | 2907 #undef CONSIDER |
2908 | |
2909 /* bytepos_to_charpos returns the char position corresponding to BYTEPOS. */ | |
2910 | |
2911 /* This macro is a subroutine of bytebpos_to_charbpos_func. |
2912 It is used when BYTEPOS is actually the byte position.

   It offers the known correspondence (BYTEPOS, CHARPOS) as a candidate
   bound for the byte position being converted.  It must be expanded
   where the variables x, retval, best_above, best_above_byte, best_below
   and best_below_byte and the label `done' are in scope:

   -- if BYTEPOS equals x, retval is set to CHARPOS and control jumps to
      `done';
   -- otherwise, if BYTEPOS lies above x and is lower than
      best_above_byte, it becomes the new upper bound; if it lies below x
      and is higher than best_below_byte, it becomes the new lower bound;
   -- if a bound changed and the two bounds are now as far apart in bytes
      as in characters (i.e. everything between them is single-byte),
      retval is computed by interpolation and control jumps to `done'.

   BYTEPOS is evaluated exactly once; CHARPOS at most once. */ |
2913 |
2914 #define CONSIDER(BYTEPOS, CHARPOS) \ |
2915 do \ |
2916 { \ |
2917 Bytebpos this_bytepos = (BYTEPOS); \ |
2918 int changed = 0; \ |
2919 \ |
2920 if (this_bytepos == x) \ |
2921 { \ |
2922 retval = (CHARPOS); \ |
2923 goto done; \ |
2924 } \ |
2925 else if (this_bytepos > x) \ |
2926 { \ |
2927 if (this_bytepos < best_above_byte) \ |
2928 { \ |
2929 best_above = (CHARPOS); \ |
2930 best_above_byte = this_bytepos; \ |
2931 changed = 1; \ |
2932 } \ |
2933 } \ |
2934 else if (this_bytepos > best_below_byte) \ |
2935 { \ |
2936 best_below = (CHARPOS); \ |
2937 best_below_byte = this_bytepos; \ |
2938 changed = 1; \ |
2939 } \ |
2940 \ |
2941 if (changed) \ |
2942 { \ |
2943 if (best_above - best_below == best_above_byte - best_below_byte) \ |
2944 { \ |
2945 retval = best_below + (x - best_below_byte); \ |
2946 goto done; \ |
2947 } \ |
2948 } \ |
2949 } \ |
2950 while (0) |
2951 | |
771 | 2952 /* The logic in this function is almost identical to the logic in |
2953 the previous function. */ | |
2954 | |
2955 Charbpos | |
2956 bytebpos_to_charbpos_func (struct buffer *buf, Bytebpos x) | |
2957 { | |
2367 | 2958 #ifdef OLD_BYTE_CHAR |
771 | 2959 Charbpos bufmin; |
2960 Charbpos bufmax; | |
2961 Bytebpos bytmin; | |
2962 Bytebpos bytmax; | |
2963 int size; | |
2964 int forward_p; | |
2965 int diff_so_far; | |
2966 int add_to_cache = 0; | |
2367 | 2967 #endif /* OLD_BYTE_CHAR */ |
2968 | |
2969 Charbpos best_above, best_above_byte; | |
2970 Bytebpos best_below, best_below_byte; | |
2971 int i; | |
2972 struct buffer_text *t; | |
2973 Charbpos retval; | |
2974 | |
1292 | 2975 PROFILE_DECLARE (); |
771 | 2976 |
1292 | 2977 PROFILE_RECORD_ENTERING_SECTION (QSin_char_byte_conversion); |
2978 | |
2367 | 2979 best_above = BUF_Z (buf); |
2980 best_above_byte = BYTE_BUF_Z (buf); | |
2981 | |
2982 /* In this case, we simply have all one-byte characters. But this should | |
2983 have been intercepted before, in bytebpos_to_charbpos(). */ | |
2984 text_checking_assert (best_above != best_above_byte); | |
2985 | |
2986 best_below = BUF_BEG (buf); | |
2987 best_below_byte = BYTE_BUF_BEG (buf); | |
2988 | |
2989 CONSIDER (BYTE_BUF_PT (buf), BUF_PT (buf)); | |
2990 CONSIDER (BYTE_BUF_GPT (buf), BUF_GPT (buf)); | |
2991 CONSIDER (BYTE_BUF_BEGV (buf), BUF_BEGV (buf)); | |
2992 CONSIDER (BYTE_BUF_ZV (buf), BUF_ZV (buf)); | |
2993 | |
2994 t = buf->text; | |
2995 CONSIDER (t->cached_bytepos, t->cached_charpos); | |
2996 | |
2997 /* Check the most recently entered positions first */ | |
2998 | |
2999 for (i = t->next_cache_pos - 1; i >= 0; i--) | |
3000 { | |
3001 CONSIDER (t->mule_bytebpos_cache[i], t->mule_charbpos_cache[i]); | |
3002 | |
3003 /* If we are down to a range of 50 chars, | |
3004 don't bother checking any other markers; | |
3005 scan the intervening chars directly now. */ | |
3006 if (best_above - best_below < 50) | |
3007 break; | |
3008 } | |
3009 | |
3010 /* We get here if we did not exactly hit one of the known places. | |
3011 We have one known above and one known below. | |
3012 Scan, counting characters, from whichever one is closer. */ | |
3013 | |
3014 if (x - best_below_byte < best_above_byte - x) | |
3015 { | |
3016 int record = x - best_below_byte > 5000; | |
3017 | |
3018 #ifdef OLD_LOOP /* old code */ | |
4526
38493c0fb952
Fix accidental deletion in src/text.c.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4525
diff
changeset
|
3019 while (best_below_byte < x) |
2367 | 3020 { |
3021 best_below++; | |
3022 INC_BYTEBPOS (buf, best_below_byte); | |
3023 } | |
3024 #else | |
3025 text_checking_assert (BUF_FORMAT (buf) == FORMAT_DEFAULT); | |
3026 /* The gap should not occur between best_below and x, or we will be | |
3027 screwed in using charcount_to_bytecount(). It should not be exactly | |
3028 at x either, because we already should have caught that. */ | |
3029 text_checking_assert | |
3030 (BYTE_BUF_CEILING_OF_IGNORE_ACCESSIBLE (buf, best_below_byte) > x); | |
3031 | |
3032 /* Using bytecount_to_charcount() is potentially a lot faster than | |
3033 a simple loop above using INC_BYTEBPOS(); see above. | |
3034 */ | |
3035 best_below += | |
3036 bytecount_to_charcount | |
3037 (BYTE_BUF_BYTE_ADDRESS (buf, best_below_byte), x - best_below_byte); | |
3038 best_below_byte = x; | |
3039 #endif | |
3040 | |
3041 /* If this position is quite far from the nearest known position, | |
3042 cache the correspondence. | |
3043 | |
3044 NB FSF does this: "... by creating a marker here. | |
3045 It will last until the next GC." | |
3046 */ | |
3047 | |
3048 if (record) | |
3049 { | |
3050 if (t->next_cache_pos == NUM_CACHED_POSITIONS) | |
3051 { | |
3052 memmove (t->mule_charbpos_cache, | |
3053 t->mule_charbpos_cache + NUM_MOVED_POSITIONS, | |
3054 sizeof (Charbpos) * | |
3055 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS)); | |
3056 memmove (t->mule_bytebpos_cache, | |
3057 t->mule_bytebpos_cache + NUM_MOVED_POSITIONS, | |
3058 sizeof (Bytebpos) * | |
3059 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS)); | |
3060 t->next_cache_pos -= NUM_MOVED_POSITIONS; | |
3061 } | |
3062 t->mule_charbpos_cache[t->next_cache_pos] = best_below; | |
3063 t->mule_bytebpos_cache[t->next_cache_pos] = best_below_byte; | |
3064 t->next_cache_pos++; | |
3065 } | |
3066 | |
3067 | |
3068 t->cached_charpos = best_below; | |
3069 t->cached_bytepos = best_below_byte; | |
3070 | |
3071 retval = best_below; | |
3072 text_checking_assert (best_below_byte >= best_below); | |
3073 goto done; | |
3074 } | |
3075 else | |
3076 { | |
3077 int record = best_above_byte - x > 5000; | |
3078 | |
3079 #ifdef OLD_LOOP /* old code */ | |
3080 while (best_above_byte > x) | |
3081 { | |
3082 best_above--; | |
3083 DEC_BYTEBPOS (buf, best_above_byte); | |
3084 } | |
3085 #else | |
3086 text_checking_assert (BUF_FORMAT (buf) == FORMAT_DEFAULT); | |
3087 /* The gap should not occur between best_above and x, or we will be | |
3088 screwed in using bytecount_to_charcount_down(). It should not be | |
3089 exactly at x either, because we already should have caught | |
3090 that. */ | |
3091 text_checking_assert | |
3092 (BYTE_BUF_FLOOR_OF_IGNORE_ACCESSIBLE (buf, best_above_byte) < x); | |
3093 | |
3094 /* Using bytecount_to_charcount_down() is potentially a lot faster | |
3095 than a simple loop using INC_BYTEBPOS(); see above. */ | |
3096 best_above -= | |
3097 bytecount_to_charcount_down | |
3098 /* BYTE_BUF_BYTE_ADDRESS will return a value on the high side of the | |
3099 gap if we are at the gap, which is the wrong side. So do the | |
3100 following trick instead. */ | |
3101 (BYTE_BUF_BYTE_ADDRESS_BEFORE (buf, best_above_byte) + 1, | |
3102 best_above_byte - x); | |
3103 best_above_byte = x; | |
3104 #endif | |
3105 | |
3106 | |
3107 /* If this position is quite far from the nearest known position, | |
3108 cache the correspondence. | |
3109 | |
3110 NB FSF does this: "... by creating a marker here. | |
3111 It will last until the next GC." | |
3112 */ | |
3113 if (record) | |
3114 { | |
3115 if (t->next_cache_pos == NUM_CACHED_POSITIONS) | |
3116 { | |
3117 memmove (t->mule_charbpos_cache, | |
3118 t->mule_charbpos_cache + NUM_MOVED_POSITIONS, | |
3119 sizeof (Charbpos) * | |
3120 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS)); | |
3121 memmove (t->mule_bytebpos_cache, | |
3122 t->mule_bytebpos_cache + NUM_MOVED_POSITIONS, | |
3123 sizeof (Bytebpos) * | |
3124 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS)); | |
3125 t->next_cache_pos -= NUM_MOVED_POSITIONS; | |
3126 } | |
3127 t->mule_charbpos_cache[t->next_cache_pos] = best_above; | |
3128 t->mule_bytebpos_cache[t->next_cache_pos] = best_above_byte; | |
3129 t->next_cache_pos++; | |
3130 } | |
3131 | |
3132 t->cached_charpos = best_above; | |
3133 t->cached_bytepos = best_above_byte; | |
3134 | |
3135 retval = best_above; | |
3136 text_checking_assert (best_above_byte >= best_above); | |
3137 goto done; | |
3138 } | |
3139 | |
3140 #ifdef OLD_BYTE_CHAR | |
3141 | |
771 | 3142 bufmin = buf->text->mule_bufmin; |
3143 bufmax = buf->text->mule_bufmax; | |
3144 bytmin = buf->text->mule_bytmin; | |
3145 bytmax = buf->text->mule_bytmax; | |
3146 size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p; | |
3147 | |
3148 /* The basic idea here is that we shift the "known region" up or down | |
3149 until it overlaps the specified position. We do this by moving | |
3150 the upper bound of the known region up one character at a time, | |
3151 and moving the lower bound of the known region up as necessary | |
3152 when the size of the character just seen changes. | |
3153 | |
3154 We optimize this, however, by first shifting the known region to | |
826 | 3155 one of the cached points if it's close by. (We don't check BYTE_BEG or |
3156 BYTE_Z, even though they're cached; most of the time these will be the | |
3157 same as BYTE_BEGV and BYTE_ZV, and when they're not, they're not likely | |
771 | 3158 to be used.) */ |
3159 | |
3160 if (x > bytmax) | |
3161 { | |
3162 Bytebpos diffmax = x - bytmax; | |
826 | 3163 Bytebpos diffpt = x - BYTE_BUF_PT (buf); |
3164 Bytebpos diffzv = BYTE_BUF_ZV (buf) - x; | |
771 | 3165 /* #### This value could stand some more exploration. */ |
3166 Bytecount heuristic_hack = (bytmax - bytmin) >> 2; | |
3167 | |
3168 /* Check if the position is closer to PT or ZV than to the | |
3169 end of the known region. */ | |
3170 | |
3171 if (diffpt < 0) | |
3172 diffpt = -diffpt; | |
3173 if (diffzv < 0) | |
3174 diffzv = -diffzv; | |
3175 | |
3176 /* But also implement a heuristic that favors the known region | |
826 | 3177 over BYTE_PT or BYTE_ZV. The reason for this is that switching to |
3178 BYTE_PT or BYTE_ZV will wipe out the knowledge in the known region, | |
771 | 3179 which might be annoying if the known region is large and |
826 | 3180 BYTE_PT or BYTE_ZV is not that much closer than the end of the known |
771 | 3181 region. */ |
3182 | |
3183 diffzv += heuristic_hack; | |
3184 diffpt += heuristic_hack; | |
3185 if (diffpt < diffmax && diffpt <= diffzv) | |
3186 { | |
3187 bufmax = bufmin = BUF_PT (buf); | |
826 | 3188 bytmax = bytmin = BYTE_BUF_PT (buf); |
771 | 3189 /* We set the size to 1 even though it doesn't really |
3190 matter because the new known region contains no | |
3191 characters. We do this because this is the most | |
3192 likely size of the characters around the new known | |
3193 region, and we avoid potential yuckiness that is | |
3194 done when size == 3. */ | |
3195 size = 1; | |
3196 } | |
3197 if (diffzv < diffmax) | |
3198 { | |
3199 bufmax = bufmin = BUF_ZV (buf); | |
826 | 3200 bytmax = bytmin = BYTE_BUF_ZV (buf); |
771 | 3201 size = 1; |
3202 } | |
3203 } | |
800 | 3204 #ifdef ERROR_CHECK_TEXT |
771 | 3205 else if (x >= bytmin) |
2500 | 3206 ABORT (); |
771 | 3207 #endif |
3208 else | |
3209 { | |
3210 Bytebpos diffmin = bytmin - x; | |
826 | 3211 Bytebpos diffpt = BYTE_BUF_PT (buf) - x; |
3212 Bytebpos diffbegv = x - BYTE_BUF_BEGV (buf); | |
771 | 3213 /* #### This value could stand some more exploration. */ |
3214 Bytecount heuristic_hack = (bytmax - bytmin) >> 2; | |
3215 | |
3216 if (diffpt < 0) | |
3217 diffpt = -diffpt; | |
3218 if (diffbegv < 0) | |
3219 diffbegv = -diffbegv; | |
3220 | |
3221 /* But also implement a heuristic that favors the known region -- | |
3222 see above. */ | |
3223 | |
3224 diffbegv += heuristic_hack; | |
3225 diffpt += heuristic_hack; | |
3226 | |
3227 if (diffpt < diffmin && diffpt <= diffbegv) | |
3228 { | |
3229 bufmax = bufmin = BUF_PT (buf); | |
826 | 3230 bytmax = bytmin = BYTE_BUF_PT (buf); |
771 | 3231 /* We set the size to 1 even though it doesn't really |
3232 matter because the new known region contains no | |
3233 characters. We do this because this is the most | |
3234 likely size of the characters around the new known | |
3235 region, and we avoid potential yuckiness that is | |
3236 done when size == 3. */ | |
3237 size = 1; | |
3238 } | |
3239 if (diffbegv < diffmin) | |
3240 { | |
3241 bufmax = bufmin = BUF_BEGV (buf); | |
826 | 3242 bytmax = bytmin = BYTE_BUF_BEGV (buf); |
771 | 3243 size = 1; |
3244 } | |
3245 } | |
3246 | |
3247 diff_so_far = x > bytmax ? x - bytmax : bytmin - x; | |
3248 if (diff_so_far > 50) | |
3249 { | |
3250 /* If we have to move more than a certain amount, then look | |
3251 into our cache. */ | |
3252 int minval = INT_MAX; | |
3253 int found = 0; | |
3254 int i; | |
3255 | |
3256 add_to_cache = 1; | |
3257 /* I considered keeping the positions ordered. This would speed | |
3258 up this loop, but updating the cache would take longer, so | |
3259 it doesn't seem like it would really matter. */ | |
2367 | 3260 for (i = 0; i < NUM_CACHED_POSITIONS; i++) |
771 | 3261 { |
3262 int diff = buf->text->mule_bytebpos_cache[i] - x; | |
3263 | |
3264 if (diff < 0) | |
3265 diff = -diff; | |
3266 if (diff < minval) | |
3267 { | |
3268 minval = diff; | |
3269 found = i; | |
3270 } | |
3271 } | |
3272 | |
3273 if (minval < diff_so_far) | |
3274 { | |
3275 bufmax = bufmin = buf->text->mule_charbpos_cache[found]; | |
3276 bytmax = bytmin = buf->text->mule_bytebpos_cache[found]; | |
3277 size = 1; | |
3278 } | |
3279 } | |
3280 | |
3281 /* It's conceivable that the caching above could lead to X being | |
3282 the same as one of the range edges. */ | |
3283 if (x >= bytmax) | |
3284 { | |
3285 Bytebpos newmax; | |
3286 Bytecount newsize; | |
3287 | |
3288 forward_p = 1; | |
3289 while (x > bytmax) | |
3290 { | |
3291 newmax = bytmax; | |
3292 | |
3293 INC_BYTEBPOS (buf, newmax); | |
3294 newsize = newmax - bytmax; | |
3295 if (newsize != size) | |
3296 { | |
3297 bufmin = bufmax; | |
3298 bytmin = bytmax; | |
3299 size = newsize; | |
3300 } | |
3301 bytmax = newmax; | |
3302 bufmax++; | |
3303 } | |
3304 retval = bufmax; | |
3305 | |
3306 /* #### Should go past the found location to reduce the number | |
3307 of times that this function is called */ | |
3308 } | |
3309 else /* x <= bytmin */ | |
3310 { | |
3311 Bytebpos newmin; | |
3312 Bytecount newsize; | |
3313 | |
3314 forward_p = 0; | |
3315 while (x < bytmin) | |
3316 { | |
3317 newmin = bytmin; | |
3318 | |
3319 DEC_BYTEBPOS (buf, newmin); | |
3320 newsize = bytmin - newmin; | |
3321 if (newsize != size) | |
3322 { | |
3323 bufmax = bufmin; | |
3324 bytmax = bytmin; | |
3325 size = newsize; | |
3326 } | |
3327 bytmin = newmin; | |
3328 bufmin--; | |
3329 } | |
3330 retval = bufmin; | |
3331 | |
3332 /* #### Should go past the found location to reduce the number | |
3333 of times that this function is called | |
3334 */ | |
3335 } | |
3336 | |
3337 /* If size is three, then we have to make sure that the range we | |
3338 discovered isn't too large, because we use a fixed-length | |
3339 table to divide by 3. */ | |
3340 | |
3341 if (size == 3) | |
3342 { | |
3343 int gap = bytmax - bytmin; | |
3344 buf->text->mule_three_p = 1; | |
3345 buf->text->mule_shifter = 1; | |
3346 | |
3347 if (gap > MAX_BYTEBPOS_GAP_SIZE_3) | |
3348 { | |
3349 if (forward_p) | |
3350 { | |
3351 bytmin = bytmax - MAX_BYTEBPOS_GAP_SIZE_3; | |
3352 bufmin = bufmax - MAX_CHARBPOS_GAP_SIZE_3; | |
3353 } | |
3354 else | |
3355 { | |
3356 bytmax = bytmin + MAX_BYTEBPOS_GAP_SIZE_3; | |
3357 bufmax = bufmin + MAX_CHARBPOS_GAP_SIZE_3; | |
3358 } | |
3359 } | |
3360 } | |
3361 else | |
3362 { | |
3363 buf->text->mule_three_p = 0; | |
3364 if (size == 4) | |
3365 buf->text->mule_shifter = 2; | |
3366 else | |
3367 buf->text->mule_shifter = size - 1; | |
3368 } | |
3369 | |
3370 buf->text->mule_bufmin = bufmin; | |
3371 buf->text->mule_bufmax = bufmax; | |
3372 buf->text->mule_bytmin = bytmin; | |
3373 buf->text->mule_bytmax = bytmax; | |
3374 | |
3375 if (add_to_cache) | |
3376 { | |
3377 int replace_loc; | |
3378 | |
3379 /* We throw away a "random" cached value and replace it with | |
3380 the new value. It doesn't actually have to be very random | |
3381 at all, just evenly distributed. | |
3382 | |
3383 #### It would be better to use a least-recently-used algorithm | |
3384 or something that tries to space things out, but I'm not sure | |
3385 it's worth it to go to the trouble of maintaining that. */ | |
3386 not_very_random_number += 621; | |
3387 replace_loc = not_very_random_number & 15; | |
3388 buf->text->mule_charbpos_cache[replace_loc] = retval; | |
3389 buf->text->mule_bytebpos_cache[replace_loc] = x; | |
3390 } | |
2367 | 3391 #endif /* OLD_BYTE_CHAR */ |
3392 | |
3393 done: | |
1292 | 3394 PROFILE_RECORD_EXITING_SECTION (QSin_char_byte_conversion); |
3395 | |
771 | 3396 return retval; |
3397 } | |
3398 | |
3399 /* Text of length BYTELENGTH and CHARLENGTH (in different units) | |
3400 was inserted at charbpos START. */ | |
3401 | |
3402 void | |
3403 buffer_mule_signal_inserted_region (struct buffer *buf, Charbpos start, | |
3404 Bytecount bytelength, | |
3405 Charcount charlength) | |
3406 { | |
2367 | 3407 #ifdef OLD_BYTE_CHAR |
771 | 3408 int size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p; |
2367 | 3409 #endif /* OLD_BYTE_CHAR */ |
771 | 3410 int i; |
3411 | |
3412 /* Adjust the cache of known positions. */ | |
2367 | 3413 for (i = 0; i < buf->text->next_cache_pos; i++) |
771 | 3414 { |
3415 | |
3416 if (buf->text->mule_charbpos_cache[i] > start) | |
3417 { | |
3418 buf->text->mule_charbpos_cache[i] += charlength; | |
3419 buf->text->mule_bytebpos_cache[i] += bytelength; | |
3420 } | |
3421 } | |
3422 | |
2367 | 3423 /* Adjust the special cached position. */ |
3424 | |
3425 if (buf->text->cached_charpos > start) | |
3426 { | |
3427 buf->text->cached_charpos += charlength; | |
3428 buf->text->cached_bytepos += bytelength; | |
3429 } | |
3430 | |
3431 #ifdef OLD_BYTE_CHAR | |
771 | 3432 if (start >= buf->text->mule_bufmax) |
826 | 3433 return; |
771 | 3434 |
3435 /* The insertion is either before the known region, in which case | |
3436 it shoves it forward; or within the known region, in which case | |
3437 it shoves the end forward. (But it may make the known region | |
3438 inconsistent, so we may have to shorten it.) */ | |
3439 | |
3440 if (start <= buf->text->mule_bufmin) | |
3441 { | |
3442 buf->text->mule_bufmin += charlength; | |
3443 buf->text->mule_bufmax += charlength; | |
3444 buf->text->mule_bytmin += bytelength; | |
3445 buf->text->mule_bytmax += bytelength; | |
3446 } | |
3447 else | |
3448 { | |
3449 Charbpos end = start + charlength; | |
3450 /* the insertion point divides the known region in two. | |
3451 Keep the longer half, at least, and expand into the | |
3452 inserted chunk as much as possible. */ | |
3453 | |
3454 if (start - buf->text->mule_bufmin > buf->text->mule_bufmax - start) | |
3455 { | |
3456 Bytebpos bytestart = (buf->text->mule_bytmin | |
3457 + size * (start - buf->text->mule_bufmin)); | |
3458 Bytebpos bytenew; | |
3459 | |
3460 while (start < end) | |
3461 { | |
3462 bytenew = bytestart; | |
3463 INC_BYTEBPOS (buf, bytenew); | |
3464 if (bytenew - bytestart != size) | |
3465 break; | |
3466 start++; | |
3467 bytestart = bytenew; | |
3468 } | |
3469 if (start != end) | |
3470 { | |
3471 buf->text->mule_bufmax = start; | |
3472 buf->text->mule_bytmax = bytestart; | |
3473 } | |
3474 else | |
3475 { | |
3476 buf->text->mule_bufmax += charlength; | |
3477 buf->text->mule_bytmax += bytelength; | |
3478 } | |
3479 } | |
3480 else | |
3481 { | |
3482 Bytebpos byteend = (buf->text->mule_bytmin | |
3483 + size * (start - buf->text->mule_bufmin) | |
3484 + bytelength); | |
3485 Bytebpos bytenew; | |
3486 | |
3487 buf->text->mule_bufmax += charlength; | |
3488 buf->text->mule_bytmax += bytelength; | |
3489 | |
3490 while (end > start) | |
3491 { | |
3492 bytenew = byteend; | |
3493 DEC_BYTEBPOS (buf, bytenew); | |
3494 if (byteend - bytenew != size) | |
3495 break; | |
3496 end--; | |
3497 byteend = bytenew; | |
3498 } | |
3499 if (start != end) | |
3500 { | |
3501 buf->text->mule_bufmin = end; | |
3502 buf->text->mule_bytmin = byteend; | |
3503 } | |
3504 } | |
3505 } | |
2367 | 3506 #endif /* OLD_BYTE_CHAR */ |
771 | 3507 } |
3508 | |
826 | 3509 /* Text from START to END (equivalent in Bytebpos's: from BYTE_START to |
3510 BYTE_END) was deleted. */ | |
771 | 3511 |
3512 void | |
3513 buffer_mule_signal_deleted_region (struct buffer *buf, Charbpos start, | |
826 | 3514 Charbpos end, Bytebpos byte_start, |
3515 Bytebpos byte_end) | |
771 | 3516 { |
3517 int i; | |
3518 | |
3519 /* Adjust the cache of known positions. */ | |
2367 | 3520 for (i = 0; i < buf->text->next_cache_pos; i++) |
771 | 3521 { |
3522 /* After the end; gets shoved backward */ | |
3523 if (buf->text->mule_charbpos_cache[i] > end) | |
3524 { | |
3525 buf->text->mule_charbpos_cache[i] -= end - start; | |
826 | 3526 buf->text->mule_bytebpos_cache[i] -= byte_end - byte_start; |
771 | 3527 } |
3528 /* In the range; moves to start of range */ | |
3529 else if (buf->text->mule_charbpos_cache[i] > start) | |
3530 { | |
3531 buf->text->mule_charbpos_cache[i] = start; | |
826 | 3532 buf->text->mule_bytebpos_cache[i] = byte_start; |
771 | 3533 } |
3534 } | |
3535 | |
2367 | 3536 /* Adjust the special cached position. */ |
3537 | |
3538 /* After the end; gets shoved backward */ | |
3539 if (buf->text->cached_charpos > end) | |
3540 { | |
3541 buf->text->cached_charpos -= end - start; | |
3542 buf->text->cached_bytepos -= byte_end - byte_start; | |
3543 } | |
3544 /* In the range; moves to start of range */ | |
3545 else if (buf->text->cached_charpos > start) | |
3546 { | |
3547 buf->text->cached_charpos = start; | |
3548 buf->text->cached_bytepos = byte_start; | |
3549 } | |
3550 | |
3551 #ifdef OLD_BYTE_CHAR | |
771 | 3552 /* We don't care about any text after the end of the known region. */ |
3553 | |
3554 end = min (end, buf->text->mule_bufmax); | |
826 | 3555 byte_end = min (byte_end, buf->text->mule_bytmax); |
771 | 3556 if (start >= end) |
826 | 3557 return; |
771 | 3558 |
3559 /* The end of the known region offsets by the total amount of deletion, | |
3560 since it's all before it. */ | |
3561 | |
3562 buf->text->mule_bufmax -= end - start; | |
826 | 3563 buf->text->mule_bytmax -= byte_end - byte_start; |
771 | 3564 |
3565 /* Now we don't care about any text after the start of the known region. */ | |
3566 | |
3567 end = min (end, buf->text->mule_bufmin); | |
826 | 3568 byte_end = min (byte_end, buf->text->mule_bytmin); |
771 | 3569 if (start < end) |
3570 { | |
3571 buf->text->mule_bufmin -= end - start; | |
826 | 3572 buf->text->mule_bytmin -= byte_end - byte_start; |
771 | 3573 } |
2367 | 3574 #endif /* OLD_BYTE_CHAR */ |
771 | 3575 } |
3576 | |
3577 #endif /* MULE */ | |
3578 | |
3579 | |
3580 /************************************************************************/ | |
3581 /* verifying buffer and string positions */ | |
3582 /************************************************************************/ | |
3583 | |
3584 /* Functions below are tagged with either _byte or _char indicating | |
3585 whether they return byte or character positions. For a buffer, | |
3586 a character position is a "Charbpos" and a byte position is a "Bytebpos". | |
3587 For strings, these are sometimes typed using "Charcount" and | |
3588 "Bytecount". */ | |
3589 | |
3590 /* Flags for the functions below are: | |
3591 | |
3592 GB_ALLOW_PAST_ACCESSIBLE | |
3593 | |
3594 Allow positions to range over the entire buffer (BUF_BEG to BUF_Z), | |
3595 rather than just the accessible portion (BUF_BEGV to BUF_ZV). | |
3596 For strings, this flag has no effect. | |
3597 | |
3598 GB_COERCE_RANGE | |
3599 | |
3600 If the position is outside the allowable range, return the lower | |
3601 or upper bound of the range, whichever is closer to the specified | |
3602 position. | |
3603 | |
3604 GB_NO_ERROR_IF_BAD | |
3605 | |
3606 If the position is outside the allowable range, return -1. | |
3607 | |
3608 GB_NEGATIVE_FROM_END | |
3609 | |
3610 If a value is negative, treat it as an offset from the end. | |
3611 Only applies to strings. | |
3612 | |
3613 The following additional flags apply only to the functions | |
3614 that return ranges: | |
3615 | |
3616 GB_ALLOW_NIL | |
3617 | |
3618 Either or both positions can be nil. If FROM is nil, | |
3619 FROM_OUT will contain the lower bound of the allowed range. | |
3620 If TO is nil, TO_OUT will contain the upper bound of the | |
3621 allowed range. | |
3622 | |
3623 GB_CHECK_ORDER | |
3624 | |
3625 FROM must contain the lower bound and TO the upper bound | |
3626 of the range. If the positions are reversed, an error is | |
3627 signalled. | |
3628 | |
3629 The following is a combination flag: | |
3630 | |
3631 GB_HISTORICAL_STRING_BEHAVIOR | |
3632 | |
3633 Equivalent to (GB_NEGATIVE_FROM_END | GB_ALLOW_NIL). | |
3634 */ | |
3635 | |
3636 /* Return a buffer position stored in a Lisp_Object. Full | |
3637 error-checking is done on the position. Flags can be specified to | |
3638 control the behavior of out-of-range values. The default behavior | |
3639 is to require that the position is within the accessible part of | |
3640 the buffer (BEGV and ZV), and to signal an error if the position is | |
3641 out of range. | |
3642 | |
3643 */ | |
3644 | |
3645 Charbpos | |
3646 get_buffer_pos_char (struct buffer *b, Lisp_Object pos, unsigned int flags) | |
3647 { | |
3648 /* Does not GC */ | |
3649 Charbpos ind; | |
3650 Charbpos min_allowed, max_allowed; | |
3651 | |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
3652 CHECK_FIXNUM_COERCE_MARKER (pos); |
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
3653 ind = XFIXNUM (pos); |
771 | 3654 min_allowed = flags & GB_ALLOW_PAST_ACCESSIBLE ? BUF_BEG (b) : BUF_BEGV (b); |
3655 max_allowed = flags & GB_ALLOW_PAST_ACCESSIBLE ? BUF_Z (b) : BUF_ZV (b); | |
3656 | |
3657 if (ind < min_allowed || ind > max_allowed) | |
3658 { | |
3659 if (flags & GB_COERCE_RANGE) | |
3660 ind = ind < min_allowed ? min_allowed : max_allowed; | |
3661 else if (flags & GB_NO_ERROR_IF_BAD) | |
3662 ind = -1; | |
3663 else | |
3664 { | |
793 | 3665 Lisp_Object buffer = wrap_buffer (b); |
3666 | |
771 | 3667 args_out_of_range (buffer, pos); |
3668 } | |
3669 } | |
3670 | |
3671 return ind; | |
3672 } | |
3673 | |
3674 Bytebpos | |
3675 get_buffer_pos_byte (struct buffer *b, Lisp_Object pos, unsigned int flags) | |
3676 { | |
3677 Charbpos bpos = get_buffer_pos_char (b, pos, flags); | |
3678 if (bpos < 0) /* could happen with GB_NO_ERROR_IF_BAD */ | |
3679 return -1; | |
3680 return charbpos_to_bytebpos (b, bpos); | |
3681 } | |
3682 | |
3683 /* Return a pair of buffer positions representing a range of text, | |
3684 taken from a pair of Lisp_Objects. Full error-checking is | |
3685 done on the positions. Flags can be specified to control the | |
3686 behavior of out-of-range values. The default behavior is to | |
3687 allow the range bounds to be specified in either order | |
3688 (however, FROM_OUT will always be the lower bound of the range | |
3689 and TO_OUT the upper bound),to require that the positions | |
3690 are within the accessible part of the buffer (BEGV and ZV), | |
3691 and to signal an error if the positions are out of range. | |
3692 */ | |
3693 | |
3694 void | |
3695 get_buffer_range_char (struct buffer *b, Lisp_Object from, Lisp_Object to, | |
826 | 3696 Charbpos *from_out, Charbpos *to_out, |
3697 unsigned int flags) | |
771 | 3698 { |
3699 /* Does not GC */ | |
3700 Charbpos min_allowed, max_allowed; | |
3701 | |
3702 min_allowed = (flags & GB_ALLOW_PAST_ACCESSIBLE) ? | |
3703 BUF_BEG (b) : BUF_BEGV (b); | |
3704 max_allowed = (flags & GB_ALLOW_PAST_ACCESSIBLE) ? | |
3705 BUF_Z (b) : BUF_ZV (b); | |
3706 | |
3707 if (NILP (from) && (flags & GB_ALLOW_NIL)) | |
3708 *from_out = min_allowed; | |
3709 else | |
3710 *from_out = get_buffer_pos_char (b, from, flags | GB_NO_ERROR_IF_BAD); | |
3711 | |
3712 if (NILP (to) && (flags & GB_ALLOW_NIL)) | |
3713 *to_out = max_allowed; | |
3714 else | |
3715 *to_out = get_buffer_pos_char (b, to, flags | GB_NO_ERROR_IF_BAD); | |
3716 | |
3717 if ((*from_out < 0 || *to_out < 0) && !(flags & GB_NO_ERROR_IF_BAD)) | |
3718 { | |
793 | 3719 Lisp_Object buffer = wrap_buffer (b); |
3720 | |
771 | 3721 args_out_of_range_3 (buffer, from, to); |
3722 } | |
3723 | |
3724 if (*from_out >= 0 && *to_out >= 0 && *from_out > *to_out) | |
3725 { | |
3726 if (flags & GB_CHECK_ORDER) | |
3727 invalid_argument_2 ("start greater than end", from, to); | |
3728 else | |
3729 { | |
3730 Charbpos temp = *from_out; | |
3731 *from_out = *to_out; | |
3732 *to_out = temp; | |
3733 } | |
3734 } | |
3735 } | |
3736 | |
3737 void | |
3738 get_buffer_range_byte (struct buffer *b, Lisp_Object from, Lisp_Object to, | |
826 | 3739 Bytebpos *from_out, Bytebpos *to_out, |
3740 unsigned int flags) | |
771 | 3741 { |
3742 Charbpos s, e; | |
3743 | |
3744 get_buffer_range_char (b, from, to, &s, &e, flags); | |
3745 if (s >= 0) | |
3746 *from_out = charbpos_to_bytebpos (b, s); | |
3747 else /* could happen with GB_NO_ERROR_IF_BAD */ | |
3748 *from_out = -1; | |
3749 if (e >= 0) | |
3750 *to_out = charbpos_to_bytebpos (b, e); | |
3751 else | |
3752 *to_out = -1; | |
3753 } | |
3754 | |
3755 static Charcount | |
3756 get_string_pos_char_1 (Lisp_Object string, Lisp_Object pos, unsigned int flags, | |
3757 Charcount known_length) | |
3758 { | |
3759 Charcount ccpos; | |
3760 Charcount min_allowed = 0; | |
3761 Charcount max_allowed = known_length; | |
3762 | |
3763 /* Computation of KNOWN_LENGTH is potentially expensive so we pass | |
3764 it in. */ | |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
3765 CHECK_FIXNUM (pos); |
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
3766 ccpos = XFIXNUM (pos); |
771 | 3767 if (ccpos < 0 && flags & GB_NEGATIVE_FROM_END) |
3768 ccpos += max_allowed; | |
3769 | |
3770 if (ccpos < min_allowed || ccpos > max_allowed) | |
3771 { | |
3772 if (flags & GB_COERCE_RANGE) | |
3773 ccpos = ccpos < min_allowed ? min_allowed : max_allowed; | |
3774 else if (flags & GB_NO_ERROR_IF_BAD) | |
3775 ccpos = -1; | |
3776 else | |
3777 args_out_of_range (string, pos); | |
3778 } | |
3779 | |
3780 return ccpos; | |
3781 } | |
3782 | |
3783 Charcount | |
3784 get_string_pos_char (Lisp_Object string, Lisp_Object pos, unsigned int flags) | |
3785 { | |
3786 return get_string_pos_char_1 (string, pos, flags, | |
826 | 3787 string_char_length (string)); |
771 | 3788 } |
3789 | |
3790 Bytecount | |
3791 get_string_pos_byte (Lisp_Object string, Lisp_Object pos, unsigned int flags) | |
3792 { | |
3793 Charcount ccpos = get_string_pos_char (string, pos, flags); | |
3794 if (ccpos < 0) /* could happen with GB_NO_ERROR_IF_BAD */ | |
3795 return -1; | |
793 | 3796 return string_index_char_to_byte (string, ccpos); |
771 | 3797 } |
3798 | |
3799 void | |
3800 get_string_range_char (Lisp_Object string, Lisp_Object from, Lisp_Object to, | |
3801 Charcount *from_out, Charcount *to_out, | |
3802 unsigned int flags) | |
3803 { | |
3804 Charcount min_allowed = 0; | |
826 | 3805 Charcount max_allowed = string_char_length (string); |
771 | 3806 |
3807 if (NILP (from) && (flags & GB_ALLOW_NIL)) | |
3808 *from_out = min_allowed; | |
3809 else | |
3810 *from_out = get_string_pos_char_1 (string, from, | |
3811 flags | GB_NO_ERROR_IF_BAD, | |
3812 max_allowed); | |
3813 | |
3814 if (NILP (to) && (flags & GB_ALLOW_NIL)) | |
3815 *to_out = max_allowed; | |
3816 else | |
3817 *to_out = get_string_pos_char_1 (string, to, | |
3818 flags | GB_NO_ERROR_IF_BAD, | |
3819 max_allowed); | |
3820 | |
3821 if ((*from_out < 0 || *to_out < 0) && !(flags & GB_NO_ERROR_IF_BAD)) | |
3822 args_out_of_range_3 (string, from, to); | |
3823 | |
3824 if (*from_out >= 0 && *to_out >= 0 && *from_out > *to_out) | |
3825 { | |
3826 if (flags & GB_CHECK_ORDER) | |
3827 invalid_argument_2 ("start greater than end", from, to); | |
3828 else | |
3829 { | |
3830 Charbpos temp = *from_out; | |
3831 *from_out = *to_out; | |
3832 *to_out = temp; | |
3833 } | |
3834 } | |
3835 } | |
3836 | |
3837 void | |
3838 get_string_range_byte (Lisp_Object string, Lisp_Object from, Lisp_Object to, | |
3839 Bytecount *from_out, Bytecount *to_out, | |
3840 unsigned int flags) | |
3841 { | |
3842 Charcount s, e; | |
3843 | |
3844 get_string_range_char (string, from, to, &s, &e, flags); | |
3845 if (s >= 0) | |
793 | 3846 *from_out = string_index_char_to_byte (string, s); |
771 | 3847 else /* could happen with GB_NO_ERROR_IF_BAD */ |
3848 *from_out = -1; | |
3849 if (e >= 0) | |
793 | 3850 *to_out = string_index_char_to_byte (string, e); |
771 | 3851 else |
3852 *to_out = -1; | |
3853 | |
3854 } | |
3855 | |
826 | 3856 Charxpos |
771 | 3857 get_buffer_or_string_pos_char (Lisp_Object object, Lisp_Object pos, |
3858 unsigned int flags) | |
3859 { | |
3860 return STRINGP (object) ? | |
3861 get_string_pos_char (object, pos, flags) : | |
3862 get_buffer_pos_char (XBUFFER (object), pos, flags); | |
3863 } | |
3864 | |
826 | 3865 Bytexpos |
771 | 3866 get_buffer_or_string_pos_byte (Lisp_Object object, Lisp_Object pos, |
3867 unsigned int flags) | |
3868 { | |
3869 return STRINGP (object) ? | |
3870 get_string_pos_byte (object, pos, flags) : | |
3871 get_buffer_pos_byte (XBUFFER (object), pos, flags); | |
3872 } | |
3873 | |
3874 void | |
3875 get_buffer_or_string_range_char (Lisp_Object object, Lisp_Object from, | |
826 | 3876 Lisp_Object to, Charxpos *from_out, |
3877 Charxpos *to_out, unsigned int flags) | |
771 | 3878 { |
3879 if (STRINGP (object)) | |
3880 get_string_range_char (object, from, to, from_out, to_out, flags); | |
3881 else | |
826 | 3882 get_buffer_range_char (XBUFFER (object), from, to, from_out, to_out, |
3883 flags); | |
771 | 3884 } |
3885 | |
3886 void | |
3887 get_buffer_or_string_range_byte (Lisp_Object object, Lisp_Object from, | |
826 | 3888 Lisp_Object to, Bytexpos *from_out, |
3889 Bytexpos *to_out, unsigned int flags) | |
771 | 3890 { |
3891 if (STRINGP (object)) | |
3892 get_string_range_byte (object, from, to, from_out, to_out, flags); | |
3893 else | |
826 | 3894 get_buffer_range_byte (XBUFFER (object), from, to, from_out, to_out, |
3895 flags); | |
771 | 3896 } |
3897 | |
826 | 3898 Charxpos |
771 | 3899 buffer_or_string_accessible_begin_char (Lisp_Object object) |
3900 { | |
3901 return STRINGP (object) ? 0 : BUF_BEGV (XBUFFER (object)); | |
3902 } | |
3903 | |
826 | 3904 Charxpos |
771 | 3905 buffer_or_string_accessible_end_char (Lisp_Object object) |
3906 { | |
3907 return STRINGP (object) ? | |
826 | 3908 string_char_length (object) : BUF_ZV (XBUFFER (object)); |
771 | 3909 } |
3910 | |
826 | 3911 Bytexpos |
771 | 3912 buffer_or_string_accessible_begin_byte (Lisp_Object object) |
3913 { | |
826 | 3914 return STRINGP (object) ? 0 : BYTE_BUF_BEGV (XBUFFER (object)); |
771 | 3915 } |
3916 | |
826 | 3917 Bytexpos |
771 | 3918 buffer_or_string_accessible_end_byte (Lisp_Object object) |
3919 { | |
3920 return STRINGP (object) ? | |
826 | 3921 XSTRING_LENGTH (object) : BYTE_BUF_ZV (XBUFFER (object)); |
771 | 3922 } |
3923 | |
826 | 3924 Charxpos |
771 | 3925 buffer_or_string_absolute_begin_char (Lisp_Object object) |
3926 { | |
3927 return STRINGP (object) ? 0 : BUF_BEG (XBUFFER (object)); | |
3928 } | |
3929 | |
826 | 3930 Charxpos |
771 | 3931 buffer_or_string_absolute_end_char (Lisp_Object object) |
3932 { | |
3933 return STRINGP (object) ? | |
826 | 3934 string_char_length (object) : BUF_Z (XBUFFER (object)); |
3935 } | |
3936 | |
3937 Bytexpos | |
3938 buffer_or_string_absolute_begin_byte (Lisp_Object object) | |
3939 { | |
3940 return STRINGP (object) ? 0 : BYTE_BUF_BEG (XBUFFER (object)); | |
3941 } | |
3942 | |
3943 Bytexpos | |
3944 buffer_or_string_absolute_end_byte (Lisp_Object object) | |
3945 { | |
3946 return STRINGP (object) ? | |
3947 XSTRING_LENGTH (object) : BYTE_BUF_Z (XBUFFER (object)); | |
3948 } | |
3949 | |
3950 Charbpos | |
3951 charbpos_clip_to_bounds (Charbpos lower, Charbpos num, Charbpos upper) | |
3952 { | |
3953 return (num < lower ? lower : | |
3954 num > upper ? upper : | |
3955 num); | |
771 | 3956 } |
3957 | |
3958 Bytebpos | |
826 | 3959 bytebpos_clip_to_bounds (Bytebpos lower, Bytebpos num, Bytebpos upper) |
3960 { | |
3961 return (num < lower ? lower : | |
3962 num > upper ? upper : | |
3963 num); | |
3964 } | |
3965 | |
3966 Charxpos | |
3967 charxpos_clip_to_bounds (Charxpos lower, Charxpos num, Charxpos upper) | |
771 | 3968 { |
826 | 3969 return (num < lower ? lower : |
3970 num > upper ? upper : | |
3971 num); | |
3972 } | |
3973 | |
3974 Bytexpos | |
3975 bytexpos_clip_to_bounds (Bytexpos lower, Bytexpos num, Bytexpos upper) | |
3976 { | |
3977 return (num < lower ? lower : | |
3978 num > upper ? upper : | |
3979 num); | |
771 | 3980 } |
3981 | |
826 | 3982 /* These could be implemented in terms of the get_buffer_or_string() |
3983 functions above, but those are complicated and handle lots of weird | |
3984 cases stemming from uncertain external input. */ | |
3985 | |
3986 Charxpos | |
3987 buffer_or_string_clip_to_accessible_char (Lisp_Object object, Charxpos pos) | |
3988 { | |
3989 return (charxpos_clip_to_bounds | |
3990 (pos, buffer_or_string_accessible_begin_char (object), | |
3991 buffer_or_string_accessible_end_char (object))); | |
3992 } | |
3993 | |
3994 Bytexpos | |
3995 buffer_or_string_clip_to_accessible_byte (Lisp_Object object, Bytexpos pos) | |
771 | 3996 { |
826 | 3997 return (bytexpos_clip_to_bounds |
3998 (pos, buffer_or_string_accessible_begin_byte (object), | |
3999 buffer_or_string_accessible_end_byte (object))); | |
4000 } | |
4001 | |
4002 Charxpos | |
4003 buffer_or_string_clip_to_absolute_char (Lisp_Object object, Charxpos pos) | |
4004 { | |
4005 return (charxpos_clip_to_bounds | |
4006 (pos, buffer_or_string_absolute_begin_char (object), | |
4007 buffer_or_string_absolute_end_char (object))); | |
4008 } | |
4009 | |
4010 Bytexpos | |
4011 buffer_or_string_clip_to_absolute_byte (Lisp_Object object, Bytexpos pos) | |
4012 { | |
4013 return (bytexpos_clip_to_bounds | |
4014 (pos, buffer_or_string_absolute_begin_byte (object), | |
4015 buffer_or_string_absolute_end_byte (object))); | |
771 | 4016 } |
4017 | |
4018 | |
4019 /************************************************************************/ | |
4020 /* Implement TO_EXTERNAL_FORMAT, TO_INTERNAL_FORMAT */ | |
4021 /************************************************************************/ | |
4022 | |
4023 typedef struct | |
4024 { | |
867 | 4025 Dynarr_declare (Ibyte_dynarr *); |
4026 } Ibyte_dynarr_dynarr; | |
771 | 4027 |
4028 typedef struct | |
4029 { | |
4030 Dynarr_declare (Extbyte_dynarr *); | |
4031 } Extbyte_dynarr_dynarr; | |
4032 | |
4033 static Extbyte_dynarr_dynarr *conversion_out_dynarr_list; | |
867 | 4034 static Ibyte_dynarr_dynarr *conversion_in_dynarr_list; |
771 | 4035 |
4036 static int dfc_convert_to_external_format_in_use; | |
4037 static int dfc_convert_to_internal_format_in_use; | |
4038 | |
4039 void | |
4040 dfc_convert_to_external_format (dfc_conversion_type source_type, | |
4041 dfc_conversion_data *source, | |
4042 Lisp_Object coding_system, | |
4043 dfc_conversion_type sink_type, | |
4044 dfc_conversion_data *sink) | |
4045 { | |
4046 /* It's guaranteed that many callers are not prepared for GC here, | |
4047 esp. given that this code conversion occurs in many very hidden | |
4048 places. */ | |
1292 | 4049 int count; |
771 | 4050 Extbyte_dynarr *conversion_out_dynarr; |
1292 | 4051 PROFILE_DECLARE (); |
4052 | |
2367 | 4053 assert (!inhibit_non_essential_conversion_operations); |
1292 | 4054 PROFILE_RECORD_ENTERING_SECTION (QSin_internal_external_conversion); |
4055 | |
4056 count = begin_gc_forbidden (); | |
771 | 4057 |
4058 type_checking_assert | |
4059 (((source_type == DFC_TYPE_DATA) || | |
4060 (source_type == DFC_TYPE_LISP_LSTREAM && LSTREAMP (source->lisp_object)) || | |
4061 (source_type == DFC_TYPE_LISP_STRING && STRINGP (source->lisp_object))) | |
4062 && | |
4063 ((sink_type == DFC_TYPE_DATA) || | |
4064 (sink_type == DFC_TYPE_LISP_LSTREAM && LSTREAMP (source->lisp_object)))); | |
4065 | |
4066 if (Dynarr_length (conversion_out_dynarr_list) <= | |
4067 dfc_convert_to_external_format_in_use) | |
4068 Dynarr_add (conversion_out_dynarr_list, Dynarr_new (Extbyte)); | |
4069 conversion_out_dynarr = Dynarr_at (conversion_out_dynarr_list, | |
4070 dfc_convert_to_external_format_in_use); | |
4071 Dynarr_reset (conversion_out_dynarr); | |
4072 | |
853 | 4073 internal_bind_int (&dfc_convert_to_external_format_in_use, |
4074 dfc_convert_to_external_format_in_use + 1); | |
4075 | |
771 | 4076 coding_system = get_coding_system_for_text_file (coding_system, 0); |
4077 | |
4078 /* Here we optimize in the case where the coding system does no | |
4079 conversion. However, we don't want to optimize in case the source | |
4080 or sink is an lstream, since writing to an lstream can cause a | |
4081 garbage collection, and this could be problematic if the source | |
4082 is a lisp string. */ | |
4083 if (source_type != DFC_TYPE_LISP_LSTREAM && | |
4084 sink_type != DFC_TYPE_LISP_LSTREAM && | |
4085 coding_system_is_binary (coding_system)) | |
4086 { | |
867 | 4087 const Ibyte *ptr; |
771 | 4088 Bytecount len; |
4089 | |
4090 if (source_type == DFC_TYPE_LISP_STRING) | |
4091 { | |
4092 ptr = XSTRING_DATA (source->lisp_object); | |
4093 len = XSTRING_LENGTH (source->lisp_object); | |
4094 } | |
4095 else | |
4096 { | |
867 | 4097 ptr = (Ibyte *) source->data.ptr; |
771 | 4098 len = source->data.len; |
4099 } | |
4100 | |
4101 #ifdef MULE | |
4102 { | |
867 | 4103 const Ibyte *end; |
771 | 4104 for (end = ptr + len; ptr < end;) |
4105 { | |
867 | 4106 Ibyte c = |
826 | 4107 (byte_ascii_p (*ptr)) ? *ptr : |
771 | 4108 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) : |
4109 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) : | |
4110 '~'; | |
4111 | |
4112 Dynarr_add (conversion_out_dynarr, (Extbyte) c); | |
867 | 4113 INC_IBYTEPTR (ptr); |
771 | 4114 } |
800 | 4115 text_checking_assert (ptr == end); |
771 | 4116 } |
4117 #else | |
4118 Dynarr_add_many (conversion_out_dynarr, ptr, len); | |
4119 #endif | |
4120 | |
4121 } | |
1315 | 4122 #ifdef WIN32_ANY |
771 | 4123 /* Optimize the common case involving Unicode where only ASCII is involved */ |
4124 else if (source_type != DFC_TYPE_LISP_LSTREAM && | |
4125 sink_type != DFC_TYPE_LISP_LSTREAM && | |
4126 dfc_coding_system_is_unicode (coding_system)) | |
4127 { | |
867 | 4128 const Ibyte *ptr, *p; |
771 | 4129 Bytecount len; |
867 | 4130 const Ibyte *end; |
771 | 4131 |
4132 if (source_type == DFC_TYPE_LISP_STRING) | |
4133 { | |
4134 ptr = XSTRING_DATA (source->lisp_object); | |
4135 len = XSTRING_LENGTH (source->lisp_object); | |
4136 } | |
4137 else | |
4138 { | |
867 | 4139 ptr = (Ibyte *) source->data.ptr; |
771 | 4140 len = source->data.len; |
4141 } | |
4142 end = ptr + len; | |
4143 | |
4144 for (p = ptr; p < end; p++) | |
4145 { | |
826 | 4146 if (!byte_ascii_p (*p)) |
771 | 4147 goto the_hard_way; |
4148 } | |
4149 | |
4150 for (p = ptr; p < end; p++) | |
4151 { | |
4152 Dynarr_add (conversion_out_dynarr, (Extbyte) (*p)); | |
4153 Dynarr_add (conversion_out_dynarr, (Extbyte) '\0'); | |
4154 } | |
4155 } | |
1315 | 4156 #endif /* WIN32_ANY */ |
771 | 4157 else |
4158 { | |
4159 Lisp_Object streams_to_delete[3]; | |
4160 int delete_count; | |
4161 Lisp_Object instream, outstream; | |
4162 Lstream *reader, *writer; | |
4163 | |
1315 | 4164 #ifdef WIN32_ANY |
771 | 4165 the_hard_way: |
1315 | 4166 #endif /* WIN32_ANY */ |
771 | 4167 delete_count = 0; |
4168 if (source_type == DFC_TYPE_LISP_LSTREAM) | |
4169 instream = source->lisp_object; | |
4170 else if (source_type == DFC_TYPE_DATA) | |
4171 streams_to_delete[delete_count++] = instream = | |
4172 make_fixed_buffer_input_stream (source->data.ptr, source->data.len); | |
4173 else | |
4174 { | |
4175 type_checking_assert (source_type == DFC_TYPE_LISP_STRING); | |
4176 streams_to_delete[delete_count++] = instream = | |
4177 /* This will GCPRO the Lisp string */ | |
4178 make_lisp_string_input_stream (source->lisp_object, 0, -1); | |
4179 } | |
4180 | |
4181 if (sink_type == DFC_TYPE_LISP_LSTREAM) | |
4182 outstream = sink->lisp_object; | |
4183 else | |
4184 { | |
4185 type_checking_assert (sink_type == DFC_TYPE_DATA); | |
4186 streams_to_delete[delete_count++] = outstream = | |
4187 make_dynarr_output_stream | |
4188 ((unsigned_char_dynarr *) conversion_out_dynarr); | |
4189 } | |
4190 | |
4191 streams_to_delete[delete_count++] = outstream = | |
800 | 4192 make_coding_output_stream (XLSTREAM (outstream), coding_system, |
4193 CODING_ENCODE, 0); | |
771 | 4194 |
4195 reader = XLSTREAM (instream); | |
4196 writer = XLSTREAM (outstream); | |
4197 /* decoding_stream will gc-protect outstream */ | |
1204 | 4198 { |
4199 struct gcpro gcpro1, gcpro2; | |
4200 GCPRO2 (instream, outstream); | |
4201 | |
4202 while (1) | |
4203 { | |
4204 Bytecount size_in_bytes; | |
4205 char tempbuf[1024]; /* some random amount */ | |
4206 | |
4207 size_in_bytes = Lstream_read (reader, tempbuf, sizeof (tempbuf)); | |
4208 | |
4209 if (size_in_bytes == 0) | |
4210 break; | |
4211 else if (size_in_bytes < 0) | |
4212 signal_error (Qtext_conversion_error, | |
4213 "Error converting to external format", Qunbound); | |
4214 | |
4215 if (Lstream_write (writer, tempbuf, size_in_bytes) < 0) | |
4216 signal_error (Qtext_conversion_error, | |
4217 "Error converting to external format", Qunbound); | |
4218 } | |
4219 | |
4220 /* Closing writer will close any stream at the other end of writer. */ | |
4221 Lstream_close (writer); | |
4222 Lstream_close (reader); | |
4223 UNGCPRO; | |
4224 } | |
771 | 4225 |
4226 /* The idea is that this function will create no garbage. */ | |
4227 while (delete_count) | |
4228 Lstream_delete (XLSTREAM (streams_to_delete [--delete_count])); | |
4229 } | |
4230 | |
4231 unbind_to (count); | |
4232 | |
4233 if (sink_type != DFC_TYPE_LISP_LSTREAM) | |
4234 { | |
4235 sink->data.len = Dynarr_length (conversion_out_dynarr); | |
4236 /* double zero-extend because we may be dealing with Unicode data */ | |
4237 Dynarr_add (conversion_out_dynarr, '\0'); | |
4238 Dynarr_add (conversion_out_dynarr, '\0'); | |
4967 | 4239 sink->data.ptr = Dynarr_begin (conversion_out_dynarr); |
771 | 4240 } |
1292 | 4241 |
4242 PROFILE_RECORD_EXITING_SECTION (QSin_internal_external_conversion); | |
771 | 4243 } |
4244 | |
4245 void | |
4246 dfc_convert_to_internal_format (dfc_conversion_type source_type, | |
4247 dfc_conversion_data *source, | |
4248 Lisp_Object coding_system, | |
4249 dfc_conversion_type sink_type, | |
4250 dfc_conversion_data *sink) | |
4251 { | |
4252 /* It's guaranteed that many callers are not prepared for GC here, | |
4253 esp. given that this code conversion occurs in many very hidden | |
4254 places. */ | |
1292 | 4255 int count; |
867 | 4256 Ibyte_dynarr *conversion_in_dynarr; |
2421 | 4257 Lisp_Object underlying_cs; |
1292 | 4258 PROFILE_DECLARE (); |
4259 | |
2367 | 4260 assert (!inhibit_non_essential_conversion_operations); |
1292 | 4261 PROFILE_RECORD_ENTERING_SECTION (QSin_internal_external_conversion); |
4262 | |
4263 count = begin_gc_forbidden (); | |
771 | 4264 |
4265 type_checking_assert | |
4266 ((source_type == DFC_TYPE_DATA || | |
4267 source_type == DFC_TYPE_LISP_LSTREAM) | |
4268 && | |
4269 (sink_type == DFC_TYPE_DATA || | |
4270 sink_type == DFC_TYPE_LISP_LSTREAM)); | |
4271 | |
4272 if (Dynarr_length (conversion_in_dynarr_list) <= | |
4273 dfc_convert_to_internal_format_in_use) | |
867 | 4274 Dynarr_add (conversion_in_dynarr_list, Dynarr_new (Ibyte)); |
771 | 4275 conversion_in_dynarr = Dynarr_at (conversion_in_dynarr_list, |
4276 dfc_convert_to_internal_format_in_use); | |
4277 Dynarr_reset (conversion_in_dynarr); | |
4278 | |
853 | 4279 internal_bind_int (&dfc_convert_to_internal_format_in_use, |
4280 dfc_convert_to_internal_format_in_use + 1); | |
4281 | |
2421 | 4282 /* The second call does the equivalent of both calls, but we need |
4283 the result after the first call (which wraps just a to-text | |
4284 converter) as well as the result after the second call (which | |
4285 also wraps an EOL-detection converter). */ | |
4286 underlying_cs = get_coding_system_for_text_file (coding_system, 0); | |
4287 coding_system = get_coding_system_for_text_file (underlying_cs, 1); | |
771 | 4288 |
4289 if (source_type != DFC_TYPE_LISP_LSTREAM && | |
4290 sink_type != DFC_TYPE_LISP_LSTREAM && | |
2421 | 4291 coding_system_is_binary (underlying_cs)) |
771 | 4292 { |
4293 #ifdef MULE | |
2421 | 4294 const Ibyte *ptr; |
771 | 4295 Bytecount len = source->data.len; |
2421 | 4296 const Ibyte *end; |
4297 | |
4298 /* Make sure no EOL conversion is needed. With a little work we | |
4299 could handle EOL conversion as well but it may not be needed as an | |
4300 optimization. */ | |
4301 if (!EQ (coding_system, underlying_cs)) | |
4302 { | |
4303 for (ptr = (const Ibyte *) source->data.ptr, end = ptr + len; | |
4304 ptr < end; ptr++) | |
4305 { | |
4306 if (*ptr == '\r' || *ptr == '\n') | |
4307 goto the_hard_way; | |
4308 } | |
4309 } | |
4310 | |
4311 for (ptr = (const Ibyte *) source->data.ptr, end = ptr + len; | |
4312 ptr < end; ptr++) | |
771 | 4313 { |
867 | 4314 Ibyte c = *ptr; |
771 | 4315 |
826 | 4316 if (byte_ascii_p (c)) |
771 | 4317 Dynarr_add (conversion_in_dynarr, c); |
826 | 4318 else if (byte_c1_p (c)) |
771 | 4319 { |
4320 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_CONTROL_1); | |
4321 Dynarr_add (conversion_in_dynarr, c + 0x20); | |
4322 } | |
4323 else | |
4324 { | |
4325 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_LATIN_ISO8859_1); | |
4326 Dynarr_add (conversion_in_dynarr, c); | |
4327 } | |
4328 } | |
4329 #else | |
4330 Dynarr_add_many (conversion_in_dynarr, source->data.ptr, source->data.len); | |
4331 #endif | |
4332 } | |
1315 | 4333 #ifdef WIN32_ANY |
1292 | 4334 /* Optimize the common case involving Unicode where only ASCII/Latin-1 is |
4335 involved */ | |
771 | 4336 else if (source_type != DFC_TYPE_LISP_LSTREAM && |
4337 sink_type != DFC_TYPE_LISP_LSTREAM && | |
2421 | 4338 dfc_coding_system_is_unicode (underlying_cs)) |
771 | 4339 { |
2421 | 4340 const Ibyte *ptr; |
771 | 4341 Bytecount len = source->data.len; |
2421 | 4342 const Ibyte *end; |
771 | 4343 |
4344 if (len & 1) | |
4345 goto the_hard_way; | |
4346 | |
2421 | 4347 /* Make sure only ASCII/Latin-1 is involved */ |
4348 for (ptr = (const Ibyte *) source->data.ptr + 1, end = ptr + len; | |
4349 ptr < end; ptr += 2) | |
771 | 4350 { |
4351 if (*ptr) | |
4352 goto the_hard_way; | |
4353 } | |
4354 | |
2421 | 4355 /* Make sure no EOL conversion is needed. With a little work we |
4356 could handle EOL conversion as well but it may not be needed as an | |
4357 optimization. */ | |
4358 if (!EQ (coding_system, underlying_cs)) | |
4359 { | |
4360 for (ptr = (const Ibyte *) source->data.ptr, end = ptr + len; | |
4361 ptr < end; ptr += 2) | |
4362 { | |
4363 if (*ptr == '\r' || *ptr == '\n') | |
4364 goto the_hard_way; | |
4365 } | |
4366 } | |
4367 | |
4368 for (ptr = (const Ibyte *) source->data.ptr, end = ptr + len; | |
4369 ptr < end; ptr += 2) | |
771 | 4370 { |
867 | 4371 Ibyte c = *ptr; |
771 | 4372 |
826 | 4373 if (byte_ascii_p (c)) |
771 | 4374 Dynarr_add (conversion_in_dynarr, c); |
4375 #ifdef MULE | |
826 | 4376 else if (byte_c1_p (c)) |
771 | 4377 { |
4378 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_CONTROL_1); | |
4379 Dynarr_add (conversion_in_dynarr, c + 0x20); | |
4380 } | |
4381 else | |
4382 { | |
4383 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_LATIN_ISO8859_1); | |
4384 Dynarr_add (conversion_in_dynarr, c); | |
4385 } | |
4386 #endif /* MULE */ | |
4387 } | |
4388 } | |
1315 | 4389 #endif /* WIN32_ANY */ |
771 | 4390 else |
4391 { | |
4392 Lisp_Object streams_to_delete[3]; | |
4393 int delete_count; | |
4394 Lisp_Object instream, outstream; | |
4395 Lstream *reader, *writer; | |
4396 | |
2421 | 4397 #if defined (WIN32_ANY) || defined (MULE) |
771 | 4398 the_hard_way: |
2421 | 4399 #endif |
771 | 4400 delete_count = 0; |
4401 if (source_type == DFC_TYPE_LISP_LSTREAM) | |
4402 instream = source->lisp_object; | |
4403 else | |
4404 { | |
4405 type_checking_assert (source_type == DFC_TYPE_DATA); | |
4406 streams_to_delete[delete_count++] = instream = | |
4407 make_fixed_buffer_input_stream (source->data.ptr, source->data.len); | |
4408 } | |
4409 | |
4410 if (sink_type == DFC_TYPE_LISP_LSTREAM) | |
4411 outstream = sink->lisp_object; | |
4412 else | |
4413 { | |
4414 type_checking_assert (sink_type == DFC_TYPE_DATA); | |
4415 streams_to_delete[delete_count++] = outstream = | |
4416 make_dynarr_output_stream | |
4417 ((unsigned_char_dynarr *) conversion_in_dynarr); | |
4418 } | |
4419 | |
4420 streams_to_delete[delete_count++] = outstream = | |
800 | 4421 make_coding_output_stream (XLSTREAM (outstream), coding_system, |
4422 CODING_DECODE, 0); | |
771 | 4423 |
4424 reader = XLSTREAM (instream); | |
4425 writer = XLSTREAM (outstream); | |
1204 | 4426 { |
4427 struct gcpro gcpro1, gcpro2; | |
4428 /* outstream will gc-protect its sink stream, if necessary */ | |
4429 GCPRO2 (instream, outstream); | |
4430 | |
4431 while (1) | |
4432 { | |
4433 Bytecount size_in_bytes; | |
4434 char tempbuf[1024]; /* some random amount */ | |
4435 | |
4436 size_in_bytes = Lstream_read (reader, tempbuf, sizeof (tempbuf)); | |
4437 | |
4438 if (size_in_bytes == 0) | |
4439 break; | |
4440 else if (size_in_bytes < 0) | |
4441 signal_error (Qtext_conversion_error, | |
4442 "Error converting to internal format", Qunbound); | |
4443 | |
4444 if (Lstream_write (writer, tempbuf, size_in_bytes) < 0) | |
4445 signal_error (Qtext_conversion_error, | |
4446 "Error converting to internal format", Qunbound); | |
4447 } | |
4448 | |
4449 /* Closing writer will close any stream at the other end of writer. */ | |
4450 Lstream_close (writer); | |
4451 Lstream_close (reader); | |
4452 UNGCPRO; | |
4453 } | |
771 | 4454 |
4455 /* The idea is that this function will create no garbage. */ | |
4456 while (delete_count) | |
4457 Lstream_delete (XLSTREAM (streams_to_delete [--delete_count])); | |
4458 } | |
4459 | |
4460 unbind_to (count); | |
4461 | |
4462 if (sink_type != DFC_TYPE_LISP_LSTREAM) | |
4463 { | |
4464 sink->data.len = Dynarr_length (conversion_in_dynarr); | |
4465 Dynarr_add (conversion_in_dynarr, '\0'); /* remember to NUL-terminate! */ | |
4466 /* The macros don't currently distinguish between internal and | |
4467 external sinks, and allocate and copy two extra bytes in both | |
4468 cases. So we add a second zero, just like for external data | |
4469 (in that case, because we may be converting to Unicode). */ | |
4470 Dynarr_add (conversion_in_dynarr, '\0'); | |
4967 | 4471 sink->data.ptr = Dynarr_begin (conversion_in_dynarr); |
771 | 4472 } |
1292 | 4473 |
4474 PROFILE_RECORD_EXITING_SECTION (QSin_internal_external_conversion); | |
771 | 4475 } |
4476 | |
1318 | 4477 /* ----------------------------------------------------------------------- */ |
2367 | 4478 /* Alloca-conversion helpers */ |
4479 /* ----------------------------------------------------------------------- */ | |
4480 | |
4481 /* For alloca(), things are trickier because the calling function needs to | |
4482 allocate. This means that the caller needs to do the following: | |
4483 | |
4484 (a) invoke us to do the conversion, remember the data and return the size. | |
4485 (b) alloca() the proper size. | |
4486 (c) invoke us again to copy the data. | |
4487 | |
4488 We need to handle the possibility of two or more invocations of the | |
4489 converter in the same expression. In such cases it's conceivable that | |
4490 the evaluation of the sub-expressions will be overlapping (e.g. one size | |
4491 function called, then the other one called, then the copy functions | |
4492 called). To handle this, we keep a list of active data, indexed by the | |
4493 src expression. (We use the stringize operator to avoid evaluating the | |
4494 expression multiple times.) If the caller uses the exact same src | |
4495 expression twice in two converter calls in the same subexpression, we | |
2500 | 4496 will lose, but at least we can check for this and ABORT(). We could |
2367 | 4497 conceivably try to index on other parameters as well, but there is not |
4498 really any point. */ | |
4499 | |
4500 alloca_convert_vals_dynarr *active_alloca_convert; | |
4501 | |
4502 int | |
4503 find_pos_of_existing_active_alloca_convert (const char *srctext) | |
4504 { | |
4505 alloca_convert_vals *vals = NULL; | |
4506 int i; | |
4507 | |
4508 if (!active_alloca_convert) | |
4509 active_alloca_convert = Dynarr_new (alloca_convert_vals); | |
4510 | |
4511 for (i = 0; i < Dynarr_length (active_alloca_convert); i++) | |
4512 { | |
4513 vals = Dynarr_atp (active_alloca_convert, i); | |
2385 | 4514 /* On my system, two different occurrences of the same stringized |
4515 argument always point to the same string. However, on someone | |
4516 else's system, that wasn't the case. We check for equality | |
4517 first, since it seems systems work my way more than the other | |
4518 way. */ | |
4519 if (vals->srctext == srctext || !strcmp (vals->srctext, srctext)) | |
2367 | 4520 return i; |
4521 } | |
4522 | |
4523 return -1; | |
4524 } | |
4525 | |
4526 /* ----------------------------------------------------------------------- */ | |
1318 | 4527 /* New-style DFC converters (data is returned rather than stored into var) */ |
4528 /* ----------------------------------------------------------------------- */ | |
4529 | |
4530 /* We handle here the cases where SRC is a Lisp_Object, internal data | |
4531 (sized or unsized), or external data (sized or unsized), and return type | |
4532 is unsized alloca() or malloc() data. If the return type is a | |
4953
304aebb79cd3
function renamings to track names of char typedefs
Ben Wing <ben@xemacs.org>
parents:
4952
diff
changeset
|
4533 Lisp_Object, use build_extstring() for unsized external data, |
304aebb79cd3
function renamings to track names of char typedefs
Ben Wing <ben@xemacs.org>
parents:
4952
diff
changeset
|
4534 make_extstring() for sized external data. If the return type needs to |
1318 | 4535 be sized data, use the *_TO_SIZED_*() macros, and for other more |
4536 complicated cases, use the original TO_*_FORMAT() macros. */ | |
4537 | |
4538 static void | |
4539 new_dfc_convert_now_damn_it (const void *src, Bytecount src_size, | |
4540 enum new_dfc_src_type type, | |
4541 void **dst, Bytecount *dst_size, | |
4542 Lisp_Object codesys) | |
4543 { | |
4544 /* #### In the case of alloca(), it would be a bit more efficient, for | |
4545 small strings, to use static Dynarr's like are used internally in | |
4546 TO_*_FORMAT(), or some other way of avoiding malloc() followed by | |
4547 free(). I doubt it really matters, though. */ | |
4548 | |
4549 switch (type) | |
4550 { | |
4551 case DFC_EXTERNAL: | |
4552 TO_INTERNAL_FORMAT (C_STRING, src, | |
4553 MALLOC, (*dst, *dst_size), codesys); | |
4554 break; | |
4555 | |
4556 case DFC_SIZED_EXTERNAL: | |
4557 TO_INTERNAL_FORMAT (DATA, (src, src_size), | |
4558 MALLOC, (*dst, *dst_size), codesys); | |
4559 break; | |
4560 | |
4561 case DFC_INTERNAL: | |
4562 TO_EXTERNAL_FORMAT (C_STRING, src, | |
4563 MALLOC, (*dst, *dst_size), codesys); | |
4564 break; | |
4565 | |
4566 case DFC_SIZED_INTERNAL: | |
4567 TO_EXTERNAL_FORMAT (DATA, (src, src_size), | |
4568 MALLOC, (*dst, *dst_size), codesys); | |
4569 break; | |
4570 | |
4571 case DFC_LISP_STRING: | |
5013 | 4572 TO_EXTERNAL_FORMAT (LISP_STRING, GET_LISP_FROM_VOID (src), |
1318 | 4573 MALLOC, (*dst, *dst_size), codesys); |
4574 break; | |
4575 | |
4576 default: | |
2500 | 4577 ABORT (); |
1318 | 4578 } |
2367 | 4579 |
4580 /* The size is always + 2 because we have double zero-termination at the | |
4581 end of all data (for Unicode-correctness). */ | |
4582 *dst_size += 2; | |
4583 } | |
4584 | |
4585 Bytecount | |
4586 new_dfc_convert_size (const char *srctext, const void *src, | |
4587 Bytecount src_size, enum new_dfc_src_type type, | |
4588 Lisp_Object codesys) | |
4589 { | |
4590 alloca_convert_vals vals; | |
4591 | |
2721 | 4592 int i = find_pos_of_existing_active_alloca_convert (srctext); |
4593 assert (i < 0); | |
2367 | 4594 |
4595 vals.srctext = srctext; | |
4596 | |
4597 new_dfc_convert_now_damn_it (src, src_size, type, &vals.dst, &vals.dst_size, | |
4598 codesys); | |
4599 | |
4600 Dynarr_add (active_alloca_convert, vals); | |
4601 return vals.dst_size; | |
4602 } | |
4603 | |
4604 void * | |
4605 new_dfc_convert_copy_data (const char *srctext, void *alloca_data) | |
4606 { | |
4607 alloca_convert_vals *vals; | |
4608 int i = find_pos_of_existing_active_alloca_convert (srctext); | |
4609 | |
4610 assert (i >= 0); | |
4611 vals = Dynarr_atp (active_alloca_convert, i); | |
4612 assert (alloca_data); | |
4613 memcpy (alloca_data, vals->dst, vals->dst_size); | |
4976
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4967
diff
changeset
|
4614 xfree (vals->dst); |
2367 | 4615 Dynarr_delete (active_alloca_convert, i); |
4616 return alloca_data; | |
1318 | 4617 } |
4618 | |
4619 void * | |
4620 new_dfc_convert_malloc (const void *src, Bytecount src_size, | |
4621 enum new_dfc_src_type type, Lisp_Object codesys) | |
4622 { | |
4623 void *dst; | |
4624 Bytecount dst_size; | |
4625 | |
4626 new_dfc_convert_now_damn_it (src, src_size, type, &dst, &dst_size, codesys); | |
4627 return dst; | |
4628 } | |
4629 | |
771 | 4630 |
4631 /************************************************************************/ | |
867 | 4632 /* Basic Ichar functions */ |
771 | 4633 /************************************************************************/ |
4634 | |
4635 #ifdef MULE | |
4636 | |
4637 /* Convert a non-ASCII Mule character C into a one-character Mule-encoded | |
4638 string in STR. Returns the number of bytes stored. | |
867 | 4639 Do not call this directly. Use the macro set_itext_ichar() instead. |
771 | 4640 */ |
4641 | |
4642 Bytecount | |
867 | 4643 non_ascii_set_itext_ichar (Ibyte *str, Ichar c) |
771 | 4644 { |
867 | 4645 Ibyte *p; |
4646 Ibyte lb; | |
771 | 4647 int c1, c2; |
4648 Lisp_Object charset; | |
4649 | |
4650 p = str; | |
867 | 4651 BREAKUP_ICHAR (c, charset, c1, c2); |
4652 lb = ichar_leading_byte (c); | |
826 | 4653 if (leading_byte_private_p (lb)) |
4654 *p++ = private_leading_byte_prefix (lb); | |
771 | 4655 *p++ = lb; |
4656 if (EQ (charset, Vcharset_control_1)) | |
4657 c1 += 0x20; | |
4658 *p++ = c1 | 0x80; | |
4659 if (c2) | |
4660 *p++ = c2 | 0x80; | |
4661 | |
4662 return (p - str); | |
4663 } | |
4664 | |
4665 /* Return the first character from a Mule-encoded string in STR, | |
4666 assuming it's non-ASCII. Do not call this directly. | |
867 | 4667 Use the macro itext_ichar() instead. */ |
4668 | |
4669 Ichar | |
4670 non_ascii_itext_ichar (const Ibyte *str) | |
771 | 4671 { |
867 | 4672 Ibyte i0 = *str, i1, i2 = 0; |
771 | 4673 Lisp_Object charset; |
4674 | |
4675 if (i0 == LEADING_BYTE_CONTROL_1) | |
867 | 4676 return (Ichar) (*++str - 0x20); |
771 | 4677 |
826 | 4678 if (leading_byte_prefix_p (i0)) |
771 | 4679 i0 = *++str; |
4680 | |
4681 i1 = *++str & 0x7F; | |
4682 | |
826 | 4683 charset = charset_by_leading_byte (i0); |
771 | 4684 if (XCHARSET_DIMENSION (charset) == 2) |
4685 i2 = *++str & 0x7F; | |
4686 | |
867 | 4687 return make_ichar (charset, i1, i2); |
771 | 4688 } |
4689 | |
867 | 4690 /* Return whether CH is a valid Ichar, assuming it's non-ASCII. |
4691 Do not call this directly. Use the macro valid_ichar_p() instead. */ | |
771 | 4692 |
4693 int | |
867 | 4694 non_ascii_valid_ichar_p (Ichar ch) |
771 | 4695 { |
4696 int f1, f2, f3; | |
4697 | |
3498 | 4698 /* Must have only lowest 21 bits set */ |
4699 if (ch & ~0x1FFFFF) | |
771 | 4700 return 0; |
4701 | |
867 | 4702 f1 = ichar_field1 (ch); |
4703 f2 = ichar_field2 (ch); | |
4704 f3 = ichar_field3 (ch); | |
771 | 4705 |
4706 if (f1 == 0) | |
4707 { | |
4708 /* dimension-1 char */ | |
4709 Lisp_Object charset; | |
4710 | |
4711 /* leading byte must be correct */ | |
867 | 4712 if (f2 < MIN_ICHAR_FIELD2_OFFICIAL || |
4713 (f2 > MAX_ICHAR_FIELD2_OFFICIAL && f2 < MIN_ICHAR_FIELD2_PRIVATE) || | |
4714 f2 > MAX_ICHAR_FIELD2_PRIVATE) | |
771 | 4715 return 0; |
4716 /* octet not out of range */ | |
4717 if (f3 < 0x20) | |
4718 return 0; | |
4719 /* charset exists */ | |
4720 /* | |
4721 NOTE: This takes advantage of the fact that | |
4722 FIELD2_TO_OFFICIAL_LEADING_BYTE and | |
4723 FIELD2_TO_PRIVATE_LEADING_BYTE are the same. | |
4724 */ | |
826 | 4725 charset = charset_by_leading_byte (f2 + FIELD2_TO_OFFICIAL_LEADING_BYTE); |
771 | 4726 if (EQ (charset, Qnil)) |
4727 return 0; | |
4728 /* check range as per size (94 or 96) of charset */ | |
4729 return ((f3 > 0x20 && f3 < 0x7f) || XCHARSET_CHARS (charset) == 96); | |
4730 } | |
4731 else | |
4732 { | |
4733 /* dimension-2 char */ | |
4734 Lisp_Object charset; | |
4735 | |
4736 /* leading byte must be correct */ | |
867 | 4737 if (f1 < MIN_ICHAR_FIELD1_OFFICIAL || |
4738 (f1 > MAX_ICHAR_FIELD1_OFFICIAL && f1 < MIN_ICHAR_FIELD1_PRIVATE) || | |
4739 f1 > MAX_ICHAR_FIELD1_PRIVATE) | |
771 | 4740 return 0; |
4741 /* octets not out of range */ | |
4742 if (f2 < 0x20 || f3 < 0x20) | |
4743 return 0; | |
4744 | |
4745 #ifdef ENABLE_COMPOSITE_CHARS | |
4746 if (f1 + FIELD1_TO_OFFICIAL_LEADING_BYTE == LEADING_BYTE_COMPOSITE) | |
4747 { | |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4748 if (UNBOUNDP (Fgethash (make_fixnum (ch), |
771 | 4749 Vcomposite_char_char2string_hash_table, |
4750 Qunbound))) | |
4751 return 0; | |
4752 return 1; | |
4753 } | |
4754 #endif /* ENABLE_COMPOSITE_CHARS */ | |
4755 | |
4756 /* charset exists */ | |
867 | 4757 if (f1 <= MAX_ICHAR_FIELD1_OFFICIAL) |
771 | 4758 charset = |
826 | 4759 charset_by_leading_byte (f1 + FIELD1_TO_OFFICIAL_LEADING_BYTE); |
771 | 4760 else |
4761 charset = | |
826 | 4762 charset_by_leading_byte (f1 + FIELD1_TO_PRIVATE_LEADING_BYTE); |
771 | 4763 |
4764 if (EQ (charset, Qnil)) | |
4765 return 0; | |
4766 /* check range as per size (94x94 or 96x96) of charset */ | |
4767 return ((f2 != 0x20 && f2 != 0x7F && f3 != 0x20 && f3 != 0x7F) || | |
4768 XCHARSET_CHARS (charset) == 96); | |
4769 } | |
4770 } | |
4771 | |
4772 /* Copy the character pointed to by SRC into DST. Do not call this | |
867 | 4773 directly. Use the macro itext_copy_ichar() instead. |
771 | 4774 Return the number of bytes copied. */ |
4775 | |
4776 Bytecount | |
867 | 4777 non_ascii_itext_copy_ichar (const Ibyte *src, Ibyte *dst) |
771 | 4778 { |
826 | 4779 Bytecount bytes = rep_bytes_by_first_byte (*src); |
771 | 4780 Bytecount i; |
4781 for (i = bytes; i; i--, dst++, src++) | |
4782 *dst = *src; | |
4783 return bytes; | |
4784 } | |
4785 | |
4786 #endif /* MULE */ | |
4787 | |
4788 | |
4789 /************************************************************************/ | |
867 | 4790 /* streams of Ichars */ |
771 | 4791 /************************************************************************/ |
4792 | |
4793 #ifdef MULE | |
4794 | |
867 | 4795 /* Treat a stream as a stream of Ichar's rather than a stream of bytes. |
771 | 4796 The functions below are not meant to be called directly; use |
4797 the macros in insdel.h. */ | |
4798 | |
867 | 4799 Ichar |
4800 Lstream_get_ichar_1 (Lstream *stream, int ch) | |
771 | 4801 { |
867 | 4802 Ibyte str[MAX_ICHAR_LEN]; |
4803 Ibyte *strptr = str; | |
771 | 4804 Bytecount bytes; |
4805 | |
867 | 4806 str[0] = (Ibyte) ch; |
771 | 4807 |
826 | 4808 for (bytes = rep_bytes_by_first_byte (ch) - 1; bytes; bytes--) |
771 | 4809 { |
4810 int c = Lstream_getc (stream); | |
800 | 4811 text_checking_assert (c >= 0); |
867 | 4812 *++strptr = (Ibyte) c; |
771 | 4813 } |
867 | 4814 return itext_ichar (str); |
771 | 4815 } |
4816 | |
4817 int | |
867 | 4818 Lstream_fput_ichar (Lstream *stream, Ichar ch) |
771 | 4819 { |
867 | 4820 Ibyte str[MAX_ICHAR_LEN]; |
4821 Bytecount len = set_itext_ichar (str, ch); | |
771 | 4822 return Lstream_write (stream, str, len); |
4823 } | |
4824 | |
4825 void | |
867 | 4826 Lstream_funget_ichar (Lstream *stream, Ichar ch) |
771 | 4827 { |
867 | 4828 Ibyte str[MAX_ICHAR_LEN]; |
4829 Bytecount len = set_itext_ichar (str, ch); | |
771 | 4830 Lstream_unread (stream, str, len); |
4831 } | |
4832 | |
4833 #endif /* MULE */ | |
4834 | |
4835 | |
4836 /************************************************************************/ | |
4837 /* Lisp primitives for working with characters */ | |
4838 /************************************************************************/ | |
4839 | |
4840 DEFUN ("make-char", Fmake_char, 2, 3, 0, /* | |
4841 Make a character from CHARSET and octets ARG1 and ARG2. | |
4842 ARG2 is required only for characters from two-dimensional charsets. | |
4843 | |
4844 Each octet should be in the range 32 through 127 for a 96 or 96x96 | |
4845 charset and 33 through 126 for a 94 or 94x94 charset. (Most charsets | |
4846 are either 96 or 94x94.) Note that this is 32 more than the values | |
4847 typically given for 94x94 charsets. When two octets are required, the | |
4848 order is "standard" -- the same as appears in ISO-2022 encodings, | |
4849 reference tables, etc. | |
4850 | |
4851 \(Note the following non-obvious result: Computerized translation | |
4852 tables often encode the two octets as the high and low bytes, | |
4853 respectively, of a hex short, while when there's only one octet, it | |
4854 goes in the low byte. When decoding such a value, you need to treat | |
4855 the two cases differently when calling make-char: One is (make-char | |
4856 CHARSET HIGH LOW), the other is (make-char CHARSET LOW).) | |
4857 | |
4858 For example, (make-char 'latin-iso8859-2 185) or (make-char | |
4859 'latin-iso8859-2 57) will return the Latin 2 character s with caron. | |
4860 | |
4861 As another example, the Japanese character for "kawa" (stream), which | |
4862 looks something like this: | |
4863 | |
4864 | | | |
4865 | | | | |
4866 | | | | |
4867 | | | | |
4868 / | | |
4869 | |
4870 appears in the Unicode Standard (version 2.0) on page 7-287 with the | |
4871 following values (see also page 7-4): | |
4872 | |
4873 U 5DDD (Unicode) | |
4874 G 0-2008 (GB 2312-80) | |
4875 J 0-3278 (JIS X 0208-1990) | |
4876 K 0-8425 (KS C 5601-1987) | |
4877 B A474 (Big Five) | |
4878 C 1-4455 (CNS 11643-1986 (1st plane)) | |
4879 A 213C34 (ANSI Z39.64-1989) | |
4880 | |
4881 These are equivalent to: | |
4882 | |
4883 \(make-char 'chinese-gb2312 52 40) | |
4884 \(make-char 'japanese-jisx0208 64 110) | |
4885 \(make-char 'korean-ksc5601 116 57) | |
4886 \(make-char 'chinese-cns11643-1 76 87) | |
4887 \(decode-big5-char '(164 . 116)) | |
4888 | |
4889 \(All codes above are two decimal numbers except for Big Five and ANSI | |
4890 Z39.64, which we don't support. We add 32 to each of the decimal | |
4891 numbers. Big Five is split in a rather hackish fashion into two | |
4892 charsets, `big5-1' and `big5-2', due to its excessive size -- 94x157, | |
4893 with the first codepoint in the range 0xA1 to 0xFE and the second in | |
4894 the range 0x40 to 0x7E or 0xA1 to 0xFE. `decode-big5-char' is used to | |
4895 generate the char from its codes, and `encode-big5-char' extracts the | |
4896 codes.) | |
4897 | |
4898 When compiled without MULE, this function does not do much, but it's | |
4899 provided for compatibility. In this case, the following CHARSET symbols | |
4900 are allowed: | |
4901 | |
4902 `ascii' -- ARG1 should be in the range 0 through 127. | |
4903 `control-1' -- ARG1 should be in the range 128 through 159. | |
4904 else -- ARG1 is coerced to be between 0 and 255, and then the high | |
4905 bit is set. | |
4906 | |
4907 `int-to-char of the resulting ARG1' is returned, and ARG2 is always ignored. | |
4908 */ | |
2333 | 4909 (charset, arg1, USED_IF_MULE (arg2))) |
771 | 4910 { |
4911 #ifdef MULE | |
4912 Lisp_Charset *cs; | |
4913 int a1, a2; | |
4914 int lowlim, highlim; | |
4915 | |
4916 charset = Fget_charset (charset); | |
4917 cs = XCHARSET (charset); | |
4918 | |
788 | 4919 get_charset_limits (charset, &lowlim, &highlim); |
771 | 4920 |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4921 CHECK_FIXNUM (arg1); |
771 | 4922 /* It is useful (and safe, according to Olivier Galibert) to strip |
4923 the 8th bit off ARG1 and ARG2 because it allows programmers to | |
4924 write (make-char 'latin-iso8859-2 CODE) where code is the actual | |
4925 Latin 2 code of the character. */ | |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4926 a1 = XFIXNUM (arg1) & 0x7f; |
771 | 4927 if (a1 < lowlim || a1 > highlim) |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4928 args_out_of_range_3 (arg1, make_fixnum (lowlim), make_fixnum (highlim)); |
771 | 4929 |
4930 if (CHARSET_DIMENSION (cs) == 1) | |
4931 { | |
4932 if (!NILP (arg2)) | |
4933 invalid_argument | |
4934 ("Charset is of dimension one; second octet must be nil", arg2); | |
867 | 4935 return make_char (make_ichar (charset, a1, 0)); |
771 | 4936 } |
4937 | |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4938 CHECK_FIXNUM (arg2); |
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4939 a2 = XFIXNUM (arg2) & 0x7f; |
771 | 4940 if (a2 < lowlim || a2 > highlim) |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4941 args_out_of_range_3 (arg2, make_fixnum (lowlim), make_fixnum (highlim)); |
771 | 4942 |
867 | 4943 return make_char (make_ichar (charset, a1, a2)); |
771 | 4944 #else |
4945 int a1; | |
4946 int lowlim, highlim; | |
4947 | |
4948 if (EQ (charset, Qascii)) lowlim = 0, highlim = 127; | |
4949 else if (EQ (charset, Qcontrol_1)) lowlim = 0, highlim = 31; | |
4950 else lowlim = 0, highlim = 127; | |
4951 | |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4952 CHECK_FIXNUM (arg1); |
771 | 4953 /* It is useful (and safe, according to Olivier Galibert) to strip |
4954 the 8th bit off ARG1 and ARG2 because it allows programmers to | |
4955 write (make-char 'latin-iso8859-2 CODE) where code is the actual | |
4956 Latin 2 code of the character. */ | |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4957 a1 = XFIXNUM (arg1) & 0x7f; |
771 | 4958 if (a1 < lowlim || a1 > highlim) |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4959 args_out_of_range_3 (arg1, make_fixnum (lowlim), make_fixnum (highlim)); |
771 | 4960 |
4961 if (EQ (charset, Qascii)) | |
4962 return make_char (a1); | |
4963 return make_char (a1 + 128); | |
4964 #endif /* MULE */ | |
4965 } | |
4966 | |
4967 #ifdef MULE | |
4968 | |
4969 DEFUN ("char-charset", Fchar_charset, 1, 1, 0, /* | |
4970 Return the character set of char CH. | |
4971 */ | |
4972 (ch)) | |
4973 { | |
4974 CHECK_CHAR_COERCE_INT (ch); | |
4975 | |
826 | 4976 return XCHARSET_NAME (charset_by_leading_byte |
867 | 4977 (ichar_leading_byte (XCHAR (ch)))); |
771 | 4978 } |
4979 | |
4980 DEFUN ("char-octet", Fchar_octet, 1, 2, 0, /* | |
4981 Return the octet numbered N (should be 0 or 1) of char CH. | |
4982 N defaults to 0 if omitted. | |
4983 */ | |
4984 (ch, n)) | |
4985 { | |
4986 Lisp_Object charset; | |
4987 int octet0, octet1; | |
4988 | |
4989 CHECK_CHAR_COERCE_INT (ch); | |
4990 | |
867 | 4991 BREAKUP_ICHAR (XCHAR (ch), charset, octet0, octet1); |
771 | 4992 |
4993 if (NILP (n) || EQ (n, Qzero)) | |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4994 return make_fixnum (octet0); |
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4995 else if (EQ (n, make_fixnum (1))) |
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
4996 return make_fixnum (octet1); |
771 | 4997 else |
4998 invalid_constant ("Octet number must be 0 or 1", n); | |
4999 } | |
5000 | |
3724 | 5001 #endif /* MULE */ |
5002 | |
771 | 5003 DEFUN ("split-char", Fsplit_char, 1, 1, 0, /* |
5004 Return list of charset and one or two position-codes of CHAR. | |
5005 */ | |
5006 (character)) | |
5007 { | |
5008 /* This function can GC */ | |
5009 struct gcpro gcpro1, gcpro2; | |
5010 Lisp_Object charset = Qnil; | |
5011 Lisp_Object rc = Qnil; | |
5012 int c1, c2; | |
5013 | |
5014 GCPRO2 (charset, rc); | |
5015 CHECK_CHAR_COERCE_INT (character); | |
5016 | |
867 | 5017 BREAKUP_ICHAR (XCHAR (character), charset, c1, c2); |
771 | 5018 |
3724 | 5019 if (XCHARSET_DIMENSION (charset) == 2) |
771 | 5020 { |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
5021 rc = list3 (XCHARSET_NAME (charset), make_fixnum (c1), make_fixnum (c2)); |
771 | 5022 } |
5023 else | |
5024 { | |
5581
56144c8593a8
Mechanically change INT to FIXNUM in our sources.
Aidan Kehoe <kehoea@parhasard.net>
parents:
5474
diff
changeset
|
5025 rc = list2 (XCHARSET_NAME (charset), make_fixnum (c1)); |
771 | 5026 } |
5027 UNGCPRO; | |
5028 | |
5029 return rc; | |
5030 } | |
5031 | |
5032 | |
5033 /************************************************************************/ | |
5034 /* composite character functions */ | |
5035 /************************************************************************/ | |
5036 | |
5037 #ifdef ENABLE_COMPOSITE_CHARS | |
5038 | |
867 | 5039 Ichar |
5040 lookup_composite_char (Ibyte *str, int len) | |
771 | 5041 { |
5042 Lisp_Object lispstr = make_string (str, len); | |
5043 Lisp_Object ch = Fgethash (lispstr, | |
5044 Vcomposite_char_string2char_hash_table, | |
5045 Qunbound); | |
867 | 5046 Ichar emch; |
771 | 5047 |
5048 if (UNBOUNDP (ch)) | |
5049 { | |
5050 if (composite_char_row_next >= 128) | |
5051 invalid_operation ("No more composite chars available", lispstr); | |
867 | 5052 emch = make_ichar (Vcharset_composite, composite_char_row_next, |
771 | 5053 composite_char_col_next); |
5054 Fputhash (make_char (emch), lispstr, | |
5055 Vcomposite_char_char2string_hash_table); | |
5056 Fputhash (lispstr, make_char (emch), | |
5057 Vcomposite_char_string2char_hash_table); | |
5058 composite_char_col_next++; | |
5059 if (composite_char_col_next >= 128) | |
5060 { | |
5061 composite_char_col_next = 32; | |
5062 composite_char_row_next++; | |
5063 } | |
5064 } | |
5065 else | |
5066 emch = XCHAR (ch); | |
5067 return emch; | |
5068 } | |
5069 | |
5070 Lisp_Object | |
867 | 5071 composite_char_string (Ichar ch) |
771 | 5072 { |
5073 Lisp_Object str = Fgethash (make_char (ch), | |
5074 Vcomposite_char_char2string_hash_table, | |
5075 Qunbound); | |
5076 assert (!UNBOUNDP (str)); | |
5077 return str; | |
5078 } | |
5079 | |
826 | 5080 DEFUN ("make-composite-char", Fmake_composite_char, 1, 1, 0, /* |
771 | 5081 Convert a string into a single composite character. |
5082 The character is the result of overstriking all the characters in | |
5083 the string. | |
5084 */ | |
5085 (string)) | |
5086 { | |
5087 CHECK_STRING (string); | |
5088 return make_char (lookup_composite_char (XSTRING_DATA (string), | |
5089 XSTRING_LENGTH (string))); | |
5090 } | |
5091 | |
826 | 5092 DEFUN ("composite-char-string", Fcomposite_char_string, 1, 1, 0, /* |
771 | 5093 Return a string of the characters comprising a composite character. |
5094 */ | |
5095 (ch)) | |
5096 { | |
867 | 5097 Ichar emch; |
771 | 5098 |
5099 CHECK_CHAR (ch); | |
5100 emch = XCHAR (ch); | |
867 | 5101 if (ichar_leading_byte (emch) != LEADING_BYTE_COMPOSITE) |
771 | 5102 invalid_argument ("Must be composite char", ch); |
5103 return composite_char_string (emch); | |
5104 } | |
5105 #endif /* ENABLE_COMPOSITE_CHARS */ | |
5106 | |
5107 | |
5108 /************************************************************************/ | |
5109 /* initialization */ | |
5110 /************************************************************************/ | |
5111 | |
5112 void | |
1204 | 5113 reinit_eistring_early (void) |
771 | 5114 { |
5115 the_eistring_malloc_zero_init = the_eistring_zero_init; | |
5116 the_eistring_malloc_zero_init.mallocp_ = 1; | |
5117 } | |
5118 | |
/* One-time early eistring setup.  It currently consists solely of the
   work done by reinit_eistring_early (). */
void
init_eistring_once_early (void)
{
  reinit_eistring_early ();
}
5124 | |
5125 void | |
771 | 5126 syms_of_text (void) |
5127 { | |
5128 DEFSUBR (Fmake_char); | |
3724 | 5129 DEFSUBR (Fsplit_char); |
771 | 5130 |
5131 #ifdef MULE | |
5132 DEFSUBR (Fchar_charset); | |
5133 DEFSUBR (Fchar_octet); | |
5134 | |
5135 #ifdef ENABLE_COMPOSITE_CHARS | |
5136 DEFSUBR (Fmake_composite_char); | |
5137 DEFSUBR (Fcomposite_char_string); | |
5138 #endif | |
5139 #endif /* MULE */ | |
5140 } | |
5141 | |
5142 void | |
5143 reinit_vars_of_text (void) | |
5144 { | |
5145 int i; | |
5146 | |
867 | 5147 conversion_in_dynarr_list = Dynarr_new2 (Ibyte_dynarr_dynarr, |
5148 Ibyte_dynarr *); | |
771 | 5149 conversion_out_dynarr_list = Dynarr_new2 (Extbyte_dynarr_dynarr, |
5150 Extbyte_dynarr *); | |
5151 | |
5152 for (i = 0; i <= MAX_BYTEBPOS_GAP_SIZE_3; i++) | |
5153 three_to_one_table[i] = i / 3; | |
5154 } | |
5155 | |
5156 void | |
5157 vars_of_text (void) | |
5158 { | |
4952
19a72041c5ed
Mule-izing, various fixes related to char * arguments
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
5159 QSin_char_byte_conversion = build_defer_string ("(in char-byte conversion)"); |
1292 | 5160 staticpro (&QSin_char_byte_conversion); |
5161 QSin_internal_external_conversion = | |
4952
19a72041c5ed
Mule-izing, various fixes related to char * arguments
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
5162 build_defer_string ("(in internal-external conversion)"); |
1292 | 5163 staticpro (&QSin_internal_external_conversion); |
5164 | |
771 | 5165 #ifdef ENABLE_COMPOSITE_CHARS |
5166 /* #### not dumped properly */ | |
5167 composite_char_row_next = 32; | |
5168 composite_char_col_next = 32; | |
5169 | |
5170 Vcomposite_char_string2char_hash_table = | |
5191
71ee43b8a74d
Add #'equalp as a hash test by default; add #'define-hash-table-test, GNU API
Aidan Kehoe <kehoea@parhasard.net>
parents:
5013
diff
changeset
|
5171 make_lisp_hash_table (500, HASH_TABLE_NON_WEAK, Qequal); |
771 | 5172 Vcomposite_char_char2string_hash_table = |
5191
71ee43b8a74d
Add #'equalp as a hash test by default; add #'define-hash-table-test, GNU API
Aidan Kehoe <kehoea@parhasard.net>
parents:
5013
diff
changeset
|
5173 make_lisp_hash_table (500, HASH_TABLE_NON_WEAK, Qeq); |
771 | 5174 staticpro (&Vcomposite_char_string2char_hash_table); |
5175 staticpro (&Vcomposite_char_char2string_hash_table); | |
5176 #endif /* ENABLE_COMPOSITE_CHARS */ | |
5177 } |