Mercurial > hg > xemacs-beta
annotate src/text.c @ 4976:16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
-------------------- ChangeLog entries follow: --------------------
src/ChangeLog addition:
2010-02-04 Ben Wing <ben@xemacs.org>
* alloc.c (release_breathing_space):
* alloc.c (resize_string):
* alloc.c (sweep_lcrecords_1):
* alloc.c (SWEEP_FIXED_TYPE_BLOCK_1):
* alloc.c (ADDITIONAL_FREE_compiled_function):
* alloc.c (compact_string_chars):
* alloc.c (ADDITIONAL_FREE_string):
* alloc.c (sweep_strings):
* alloca.c (xemacs_c_alloca):
* alsaplay.c (alsa_play_sound_file):
* buffer.c (init_initial_directory):
* buffer.h:
* buffer.h (BUFFER_FREE):
* console-stream.c (stream_delete_console):
* console-tty.c (free_tty_console_struct):
* data.c (Fnumber_to_string):
* device-gtk.c (gtk_init_device):
* device-gtk.c (free_gtk_device_struct):
* device-gtk.c (gtk_delete_device):
* device-msw.c (mswindows_delete_device):
* device-msw.c (msprinter_delete_device):
* device-tty.c (free_tty_device_struct):
* device-tty.c (tty_delete_device):
* device-x.c (x_init_device):
* device-x.c (free_x_device_struct):
* device-x.c (x_delete_device):
* dialog-msw.c (handle_directory_dialog_box):
* dialog-x.c (dbox_descriptor_to_widget_value):
* dired-msw.c (Fmswindows_insert_directory):
* dired.c (free_user_cache):
* dired.c (user_name_completion_unwind):
* doc.c (unparesseuxify_doc_string):
* doc.c (Fsubstitute_command_keys):
* doprnt.c (emacs_doprnt_1):
* dumper.c (pdump_load_finish):
* dumper.c (pdump_file_free):
* dumper.c (pdump_file_unmap):
* dynarr.c:
* dynarr.c (Dynarr_free):
* editfns.c (uncache_home_directory):
* editfns.c (Fset_time_zone_rule):
* elhash.c:
* elhash.c (pdump_reorganize_hash_table):
* elhash.c (maphash_unwind):
* emacs.c (make_arg_list_1):
* emacs.c (free_argc_argv):
* emacs.c (sort_args):
* emacs.c (Frunning_temacs_p):
* emodules.c (attempt_module_delete):
* eval.c (free_pointer):
* event-Xt.c (unselect_filedesc):
* event-Xt.c (emacs_Xt_select_process):
* event-gtk.c (unselect_filedesc):
* event-gtk.c (dragndrop_data_received):
* event-msw.c (winsock_closer):
* event-msw.c (mswindows_dde_callback):
* event-msw.c (mswindows_wnd_proc):
* event-stream.c (finalize_command_builder):
* event-stream.c (free_command_builder):
* extents.c (free_gap_array):
* extents.c (free_extent_list):
* extents.c (free_soe):
* extents.c (extent_fragment_delete):
* extents.c (extent_priority_sort_function):
* file-coding.c (make_coding_system_1):
* file-coding.c (coding_finalizer):
* file-coding.c (set_coding_stream_coding_system):
* file-coding.c (chain_finalize_coding_stream_1):
* file-coding.c (chain_finalize):
* file-coding.c (free_detection_state):
* file-coding.c (coding_category_symbol_to_id):
* fileio.c:
* fileio.c (Ffile_name_directory):
* fileio.c (if):
* fileio.c (Ffile_symlink_p):
* filelock.c (FREE_LOCK_INFO):
* filelock.c (current_lock_owner):
* font-mgr.c (Ffc_name_unparse):
* font-mgr.c (Ffc_pattern_duplicate):
* frame-gtk.c (gtk_delete_frame):
* frame-msw.c (mswindows_delete_frame):
* frame-msw.c (msprinter_delete_frame):
* frame-x.c (x_cde_destroy_callback):
* frame-x.c (Fcde_start_drag_internal):
* frame-x.c (x_cde_transfer_callback):
* frame-x.c (x_delete_frame):
* frame.c (update_frame_title):
* frame.c (Fset_frame_pointer):
* gc.c (register_for_finalization):
* gccache-gtk.c (free_gc_cache):
* gccache-gtk.c (gc_cache_lookup):
* gccache-x.c (free_gc_cache):
* gccache-x.c (gc_cache_lookup):
* glyphs-eimage.c:
* glyphs-eimage.c (jpeg_instantiate_unwind):
* glyphs-eimage.c (gif_instantiate_unwind):
* glyphs-eimage.c (png_instantiate_unwind):
* glyphs-eimage.c (png_instantiate):
* glyphs-eimage.c (tiff_instantiate_unwind):
* glyphs-gtk.c (convert_EImage_to_GDKImage):
* glyphs-gtk.c (gtk_finalize_image_instance):
* glyphs-gtk.c (gtk_init_image_instance_from_eimage):
* glyphs-gtk.c (gtk_xpm_instantiate):
* glyphs-msw.c (convert_EImage_to_DIBitmap):
* glyphs-msw.c (mswindows_init_image_instance_from_eimage):
* glyphs-msw.c (mswindows_initialize_image_instance_mask):
* glyphs-msw.c (xpm_to_eimage):
* glyphs-msw.c (mswindows_xpm_instantiate):
* glyphs-msw.c (xbm_create_bitmap_from_data):
* glyphs-msw.c (mswindows_finalize_image_instance):
* glyphs-x.c (convert_EImage_to_XImage):
* glyphs-x.c (x_finalize_image_instance):
* glyphs-x.c (x_init_image_instance_from_eimage):
* glyphs-x.c (x_xpm_instantiate):
* gui-x.c (free_popup_widget_value_tree):
* hash.c (free_hash_table):
* hash.c (grow_hash_table):
* hash.c (pregrow_hash_table_if_necessary):
* imgproc.c (build_EImage_quantable):
* insdel.c (uninit_buffer_text):
* intl-win32.c (convert_multibyte_to_internal_malloc):
* intl.c:
* intl.c (Fset_current_locale):
* keymap.c:
* keymap.c (where_is_recursive_mapper):
* keymap.c (where_is_internal):
* lisp.h:
* lisp.h (xfree):
* lstream.c (Lstream_close):
* lstream.c (resizing_buffer_closer):
* mule-coding.c:
* mule-coding.c (iso2022_finalize_detection_state):
* nt.c:
* nt.c (mswindows_get_long_filename):
* nt.c (nt_get_resource):
* nt.c (init_mswindows_environment):
* nt.c (get_cached_volume_information):
* nt.c (mswindows_opendir):
* nt.c (mswindows_closedir):
* nt.c (mswindows_readdir):
* nt.c (mswindows_stat):
* nt.c (mswindows_getdcwd):
* nt.c (Fmswindows_long_file_name):
* ntplay.c (nt_play_sound_file):
* ntplay.c (play_sound_data_1):
* number-gmp.c (gmp_free):
* number-gmp.c (init_number_gmp):
* number-mp.c (bignum_to_string):
* number-mp.c (BIGNUM_TO_TYPE):
* number.c (bignum_print):
* number.c (bignum_convfree):
* number.c (ratio_print):
* number.c (bigfloat_print):
* number.c (bigfloat_finalize):
* objects-gtk.c (gtk_finalize_color_instance):
* objects-gtk.c (gtk_finalize_font_instance):
* objects-msw.c (mswindows_finalize_color_instance):
* objects-msw.c (mswindows_finalize_font_instance):
* objects-tty.c (tty_finalize_color_instance):
* objects-tty.c (tty_finalize_font_instance):
* objects-tty.c (tty_font_list):
* objects-x.c (x_finalize_color_instance):
* objects-x.c (x_finalize_font_instance):
* process.c:
* process.c (finalize_process):
* realpath.c:
* redisplay.c (add_propagation_runes):
* regex.c:
* regex.c (xfree):
* regex.c (REGEX_FREE_STACK):
* regex.c (FREE_STACK_RETURN):
* regex.c (regex_compile):
* regex.c (regexec):
* regex.c (regfree):
* scrollbar-gtk.c (gtk_free_scrollbar_instance):
* scrollbar-gtk.c (gtk_release_scrollbar_instance):
* scrollbar-msw.c (mswindows_free_scrollbar_instance):
* scrollbar-msw.c (unshow_that_mofo):
* scrollbar-x.c (x_free_scrollbar_instance):
* scrollbar-x.c (x_release_scrollbar_instance):
* select-gtk.c (emacs_gtk_selection_handle):
* select-msw.c (mswindows_own_selection):
* select-x.c:
* select-x.c (x_handle_selection_request):
* select-x.c (unexpect_property_change):
* select-x.c (x_handle_property_notify):
* select-x.c (receive_incremental_selection):
* select-x.c (x_get_window_property_as_lisp_data):
* select-x.c (Fx_get_cutbuffer_internal):
* specifier.c (finalize_specifier):
* syntax.c (uninit_buffer_syntax_cache):
* sysdep.c (qxe_allocating_getcwd):
* sysdep.c (qxe_lstat):
* sysdep.c (copy_in_passwd):
* sysdep.c (qxe_ctime):
* sysdep.c (closedir):
* sysdep.c (DIRSIZ):
* termcap.c (tgetent):
* termcap.c (tprint):
* tests.c (Ftest_data_format_conversion):
* text.c (new_dfc_convert_copy_data):
* text.h (eifree):
* text.h (eito_alloca):
* text.h (eito_external):
* toolbar-msw.c (mswindows_output_toolbar):
* ui-gtk.c (CONVERT_RETVAL):
* ui-gtk.c (__allocate_object_storage):
* unicode.c (free_from_unicode_table):
* unicode.c (free_to_unicode_table):
* unicode.c (free_charset_unicode_tables):
* win32.c (mswindows_read_link_1):
Rename: xfree(VAL, TYPE)->xfree(VAL)
Command used:
gr 'xfree *\((.*),.*\);' 'xfree (\1);' *.[ch]
Followed by grepping for 'xfree.*,' and fixing anything left.
Rationale: Having to specify the TYPE argument is annoying and
error-prone. It was originally put in to work around warnings
due to strict aliasing but years and years ago I rewrote it
in a way that doesn't use the TYPE argument at all and no one
has complained since then. (And anyway, XEmacs is far from
ever being in compliance with strict aliasing and would require
far-reaching changes to get that way.)
author | Ben Wing <ben@xemacs.org> |
---|---|
date | Thu, 04 Feb 2010 07:28:14 -0600 |
parents | 0d4c9d0f6a8d |
children | ae48681c47fa |
rev | line source |
---|---|
2367 | 1 /* Text manipulation primitives for XEmacs. |
771 | 2 Copyright (C) 1995 Sun Microsystems, Inc. |
2367 | 3 Copyright (C) 1995, 1996, 2000, 2001, 2002, 2003, 2004 Ben Wing. |
771 | 4 Copyright (C) 1999 Martin Buchholz. |
5 | |
6 This file is part of XEmacs. | |
7 | |
8 XEmacs is free software; you can redistribute it and/or modify it | |
9 under the terms of the GNU General Public License as published by the | |
10 Free Software Foundation; either version 2, or (at your option) any | |
11 later version. | |
12 | |
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT | |
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
16 for more details. | |
17 | |
18 You should have received a copy of the GNU General Public License | |
19 along with XEmacs; see the file COPYING. If not, write to | |
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
21 Boston, MA 02111-1307, USA. */ | |
22 | |
23 /* Synched up with: Not in FSF. */ | |
24 | |
25 /* Authorship: | |
26 */ | |
27 | |
28 #include <config.h> | |
29 #include "lisp.h" | |
30 | |
31 #include "buffer.h" | |
32 #include "charset.h" | |
33 #include "file-coding.h" | |
34 #include "lstream.h" | |
1292 | 35 #include "profile.h" |
771 | 36 |
37 | |
38 /************************************************************************/ | |
39 /* long comments */ | |
40 /************************************************************************/ | |
41 | |
2367 | 42 /* NB: Everything below was written by Ben Wing except as otherwise noted. */ |
43 | |
44 /************************************************************************/ | |
45 /* */ | |
46 /* */ | |
47 /* Part A: More carefully-written documentation */ | |
48 /* */ | |
49 /* */ | |
50 /************************************************************************/ | |
51 | |
52 /* Authorship: Ben Wing | |
53 | |
771 | 54 |
826 | 55 ========================================================================== |
2367 | 56 7. Handling non-default formats |
826 | 57 ========================================================================== |
771 | 58 |
2367 | 59 We support, at least to some extent, formats other than the default |
60 variable-width format, for speed; all of these alternative formats are | |
61 fixed-width. Currently we only handle these non-default formats in | |
62 buffers, because access to their text is strictly controlled and thus | |
63 the details of the format mostly compartmentalized. The only really | |
64 tricky part is the search code -- the regex, Boyer-Moore, and | |
65 simple-search algorithms in search.c and regex.c. All other code that | |
66 knows directly about the buffer representation is the basic code to | |
67 modify or retrieve the buffer text. | |
68 | |
69 Supporting fixed-width formats in Lisp strings is harder, but possible | |
70 -- FSF currently does this, for example. In this case, however, | |
71 probably only 8-bit-fixed is reasonable for Lisp strings -- getting | |
72 non-ASCII-compatible fixed-width formats to work is much, much harder | |
73 because a lot of code assumes that strings are ASCII-compatible | |
74 (i.e. ASCII + other characters represented exclusively using high-bit | |
75 bytes) and a lot of code mixes Lisp strings and non-Lisp strings freely. | |
76 | |
77 The different possible fixed-width formats are 8-bit fixed, 16-bit | |
78 fixed, and 32-bit fixed. The latter can represent all possible | |
79 characters, but at a substantial memory penalty. The other two can | |
80 represent only a subset of the possible characters. How these subsets | |
81 are defined can be simple or very tricky. | |
82 | |
83 Currently we support only the default format and the 8-bit fixed format, | |
84 and in the latter, we only allow these to be the first 256 characters in | |
85 an Ichar (ASCII and Latin 1). | |
86 | |
87 One reasonable approach for 8-bit fixed is to allow the upper half to | |
88 represent any 1-byte charset, which is specified on a per-buffer basis. | |
89 This should work fairly well in practice since most documents are in | |
90 only one foreign language (possibly with some English mixed in). I | |
91 think FSF does something like this; or at least, they have something | |
92 called nonascii-translation-table and use it when converting from | |
93 8-bit-fixed text ("unibyte text") to default text ("multibyte text"). | |
94 With 16-bit fixed, you could do something like assign chunks of the 64K | |
95 worth of characters to charsets as they're encountered in documents. | |
96 This should work well with most Asian documents. | |
97 | |
98 If/when we switch to using Unicode internally, we might have formats more | |
99 like this: | |
100 | |
101 -- UTF-8 or some extension as the default format. Perl uses an | |
102 extension that handles 64-bit chars and requires as much as 13 bytes per | |
103 char, vs. the standard of 31-bit chars and 6 bytes max. UTF-8 has the | |
104 same basic properties as our own variable-width format (see text.c, | |
105 Internal String Encoding) and so most code would not need to be changed. | |
106 | |
107 -- UTF-16 as a "pseudo-fixed" format (i.e. 16-bit fixed plus surrogates | |
108 for representing characters not in the BMP, aka >= 65536). The vast | |
109 majority of documents will have no surrogates in them so byte/char | |
110 conversion will be very fast. | |
111 | |
112 -- an 8-bit fixed format, like currently. | |
113 | |
114 -- possibly, UCS-4 as a 32-bit fixed format. | |
115 | |
116 The fixed-width formats essentially treat the buffer as an array of | |
117 8-bit, 16-bit or 32-bit integers. This means that how they are stored | |
118 in memory (in particular, big-endian or little-endian) depends on the | |
119 native format of the machine's processor. It also means we have to | |
120 worry a bit about alignment (basically, we just need to keep the gap an | |
121 integral multiple of the character size, and get things aligned properly | |
122 when converting the buffer between formats). | |
826 | 123 |
124 ========================================================================== | |
2367 | 125 8. Using UTF-16 as the default text format |
826 | 126 ========================================================================== |
127 | |
2367 | 128 NOTE: The Eistring API is (or should be) Mule-correct even without |
129 an ASCII-compatible internal representation. | |
130 | |
131 #### Currently, the assumption that text units are one byte in size is | |
132 embedded throughout XEmacs, and `Ibyte *' is used where `Itext *' should | |
133 be. The way to fix this is to (among other things) | |
134 | |
135 (a) review all places referencing `Ibyte' and `Ibyte *', change them to | |
136 use Itext, and fix up the code. | |
137 (b) change XSTRING_DATA to be of type Itext * | |
138 (c) review all uses of XSTRING_DATA | |
139 (d) eliminate XSTRING_LENGTH, splitting it into XSTRING_BYTE_LENGTH and | |
140 XSTRING_TEXT_LENGTH and reviewing all places referencing this | |
141 (e) make similar changes to other API's that refer to the "length" of | |
142 something, such as qxestrlen() and eilen() | |
143 (f) review all uses of `CIbyte *'. Currently this is usually a way of | |
144 passing literal ASCII text strings in places that want internal text. | |
145 Either create separate _ascii() and _itext() versions of the | |
146 functions taking CIbyte *, or make use of something like the | |
147 WEXTTEXT() macro, which will generate wide strings as appropriate. | |
148 (g) review all uses of Bytecount and see which ones should be Textcount. | |
149 (h) put in error-checking code that will be tripped as often as possible | |
150 when doing anything with internal text, and check to see that ASCII | |
151 text has not mistakenly filtered in. This should be fairly easy as | |
152 ASCII text will generally be entirely spaces and letters whereas every | |
153 second byte of Unicode text will generally be a null byte. Either we | |
154 abort if the second bytes are entirely letters and numbers, or, | |
155 perhaps better, do the equivalent of a non-MULE build, where we should | |
156 be dealing entirely with 8-bit characters, and assert that the high | |
157 bytes of each pair are null. | |
158 (i) review places where xmalloc() is called. If we convert each use of | |
159 xmalloc() to instead be xnew_array() or some other typed routine, | |
160 then we will find every place that allocates space for Itext and | |
161 assumes it is based on one-byte units. | |
162 (j) encourage the use of ITEXT_ZTERM_SIZE instead of '+ 1' whenever we | |
163 are adding space for a zero-terminator, to emphasize what we are | |
164 doing and make sure the calculations are correct. Similarly for | |
165 EXTTEXT_ZTERM_SIZE. | |
166 (k) Note that the qxestr*() functions, among other things, will need to | |
167 be rewritten. | |
168 | |
169 Note that this is a lot of work, and is not high on the list of priorities | |
170 currently. | |
826 | 171 |
172 ========================================================================== | |
2367 | 173 9. Miscellaneous |
826 | 174 ========================================================================== |
175 | |
176 A. Unicode Support | |
771 | 177 |
1292 | 178 Unicode support is very desirable. Currently we know how to handle |
179 externally-encoded Unicode data in various encodings -- UTF-16, UTF-8, | |
180 etc. However, we really need to represent Unicode characters internally | |
181 as-is, rather than converting to some language-specific character set. | |
182 For efficiency, we should represent Unicode characters using 3 bytes | |
183 rather than 4. This means we need to find leading bytes for Unicode. | |
184 Given that there are 65,536 characters in Unicode and we can attach | |
185 96x96 = 9,216 characters per leading byte, we need eight leading bytes | |
186 for Unicode. We currently have four free (0x9A - 0x9D), and with a | |
187 little bit of rearranging we can get five: ASCII doesn't really need to | |
188 take up a leading byte. (We could just as well use 0x7F, with a little | |
189 change to the functions that assume that 0x80 is the lowest leading | |
190 byte.) This means we still need to dump three leading bytes and move | |
191 them into private space. The CNS charsets are good candidates since | |
192 they are rarely used, and JAPANESE_JISX0208_1978 is becoming less and | |
193 less used and could also be dumped. | |
826 | 194 |
195 B. Composite Characters | |
196 | |
197 Composite characters are characters constructed by overstriking two | |
771 | 198 or more regular characters. |
199 | |
200 1) The old Mule implementation involves storing composite characters | |
201 in a buffer as a tag followed by all of the actual characters | |
202 used to make up the composite character. I think this is a bad | |
203 idea; it greatly complicates code that wants to handle strings | |
204 one character at a time because it has to deal with the possibility | |
205 of great big ungainly characters. It's much more reasonable to | |
206 simply store an index into a table of composite characters. | |
207 | |
208 2) The current implementation only allows for 16,384 separate | |
209 composite characters over the lifetime of the XEmacs process. | |
210 This could become a potential problem if the user | |
211 edited lots of different files that use composite characters. | |
212 Due to FSF bogosity, increasing the number of allowable | |
213 composite characters under Mule would decrease the number | |
214 of possible faces that can exist. Mule already has shrunk | |
215 this to 2048, and further shrinkage would become uncomfortable. | |
216 No such problems exist in XEmacs. | |
217 | |
3498 | 218 Composite characters could be represented as 0x8D C1 C2 C3, where each |
219 C[1-3] is in the range 0xA0 - 0xFF. This allows for slightly under | |
220 2^20 (one million) composite characters over the XEmacs process | |
221 lifetime. Or you could use 0x8D C1 C2 C3 C4, allowing for about 85 | |
222 million (slightly over 2^26) composite characters. | |
826 | 223 |
2367 | 224 ========================================================================== |
225 10. Internal API's | |
226 ========================================================================== | |
227 | |
228 All of these are documented in more detail in text.h. | |
229 | |
230 @enumerate | |
231 @item | |
232 Basic internal-format API's | |
233 | |
234 These are simple functions and macros to convert between text | |
235 representation and characters, move forward and back in text, etc. | |
236 | |
237 @item | |
238 The DFC API | |
239 | |
240 This is for conversion between internal and external text. Note that | |
241 there is also the "new DFC" API, which *returns* a pointer to the | |
242 converted text (in alloca space), rather than storing it into a | |
243 variable. | |
244 | |
245 @item | |
246 The Eistring API | |
247 | |
4073 | 248 \(This API is currently under-used) When doing simple things with |
2367 | 249 internal text, the basic internal-format API's are enough. But to do |
250 things like delete or replace a substring, concatenate various strings, | |
251 etc. is difficult to do cleanly because of the allocation issues. | |
252 The Eistring API is designed to deal with this, and provides a clean | |
253 way of modifying and building up internal text. (Note that the former | |
254 lack of this API has meant that some code uses Lisp strings to do | |
255 similar manipulations, resulting in excess garbage and increased | |
256 garbage collection.) | |
257 | |
258 NOTE: The Eistring API is (or should be) Mule-correct even without | |
259 an ASCII-compatible internal representation. | |
260 @end enumerate | |
261 | |
262 ========================================================================== | |
263 11. Other Sources of Documentation | |
264 ========================================================================== | |
265 | |
266 man/lispref/mule.texi | |
267 @enumerate | |
268 @item | |
269 another intro to characters, encodings, etc; #### Merge with the | |
270 above info | |
271 @item | |
272 documentation of ISO-2022 | |
273 @item | |
274 The charset and coding-system Lisp API's | |
275 @item | |
276 The CCL conversion language for writing encoding conversions | |
277 @item | |
278 The Latin-Unity package for unifying Latin charsets | |
279 @end enumerate | |
280 | |
281 man/internals/internals.texi (the Internals manual) | |
282 @enumerate | |
283 @item | |
284 "Coding for Mule" -- how to write Mule-aware code | |
285 @item | |
286 "Modules for Internationalization" | |
287 @item | |
288 "The Text in a Buffer" -- more about the different ways of | |
289 viewing buffer positions; #### Merge with the above info | |
290 @item | |
291 "MULE Character Sets and Encodings" -- yet another intro | |
292 to characters, encodings, etc; #### Merge with the | |
293 above info; also some documentation of Japanese EUC and JIS7, | |
294 and CCL internals | |
295 @end enumerate | |
296 | |
297 text.h -- info about specific XEmacs-C API's for handling internal and | |
298 external text | |
299 | |
300 intl-win32.c -- Windows-specific I18N information | |
301 | |
302 lisp.h -- some info appears alongside the definitions of the basic | |
303 character-related types | |
304 | |
305 unicode.c -- documentation about Unicode translation tables | |
826 | 306 */ |
771 | 307 |
2367 | 308 |
309 /************************************************************************/ | |
310 /* */ | |
311 /* */ | |
312 /* Part B: Random proposals for work to be done */ | |
313 /* */ | |
314 /* */ | |
315 /************************************************************************/ | |
316 | |
317 | |
318 /* | |
319 | |
320 | |
321 ========================================================================== | |
322 - Mule design issues (ben) | |
323 ========================================================================== | |
324 | |
325 circa 1999 | |
326 | |
327 Here is a more detailed list of Mule-related projects that we will be | |
328 working on. They are more or less ordered according to how we will | |
329 proceed, but it's not exact. In particular, there will probably be | |
330 time overlap among adjacent projects. | |
331 | |
332 @enumerate | |
333 @item | |
334 Modify the internal/external conversion macros to allow for | |
335 MS Windows support. | |
336 | |
337 @item | |
338 Modify the buffer macros to allow for more than one internal | |
339 representation, e.g. fixed width and variable width. | |
340 | |
341 @item | |
342 Review the existing Mule code, especially the lisp code, for code | |
343 quality issues and improve the cleanliness of it. Also work on | |
344 creating a specification for the Mule API. | |
345 | |
346 @item | |
347 Write some more automated mule tests. | |
348 | |
349 @item | |
350 Integrate Tomohiko's UTF-2000 code, fixing it up so that nothing is | |
351 broken when the UTF-2000 configure option is not enabled. | |
352 | |
353 @item | |
354 Fix up the MS Windows code to be Mule-correct, so that you can | |
355 compile with Mule support under MS Windows and have a working | |
356 XEmacs, at least just with Latin-1. | |
357 | |
358 @item | |
359 Implement a scheme to guarantee no corruption of files, even with | |
360 an incorrect coding system - in particular, guarantee no corruption | |
361 of binary files. | |
362 | |
363 @item | |
364 Make the text property support in XEmacs robust with respect to | |
365 string and text operations, so that the `no corruption' support in | |
366 the previous entry works properly, even if a lot of cutting and | |
367 pasting is done. | |
368 | |
369 @item | |
370 Improve the handling of auto-detection so that, when there is any | |
371 possibility at all of mistake, the user is informed of the detected | |
372 encoding and given the choice of choosing other possibilities. | |
373 | |
374 @item | |
375 Improve the support for different language environments in XEmacs, | |
376 for example, the priority of coding systems used in auto-detection | |
377 should properly reflect the language environment. This probably | |
378 necessitates rethinking the current `coding system priority' | |
379 scheme. | |
380 | |
381 @item | |
382 Do quality work to improve the existing UTF-2000 implementation. | |
383 | |
384 @item | |
385 Implement preliminary support for 8-bit fixed width | |
386 representation. First, we will only implement 7-bit support, and | |
387 will fall back to variable width as soon as any non-ASCII | |
388 character is encountered. Then we will improve the support to | |
389 handle an arbitrary character set in the upper half of the 8-bit space. | |
390 | |
391 @item | |
392 Investigate any remaining hurdles to making --with-mule be the | |
393 default configure option. | |
394 @end enumerate | |
395 | |
396 ========================================================================== | |
397 - Mule design issues (stephen) | |
398 ========================================================================== | |
399 | |
400 What I see as Mule priorities (in rough benefit order, I am not taking | |
401 account of difficulty, nor the fact that some - eg 8 & 10 - will | |
402 probably come as packages): | |
403 | |
404 @enumerate | |
405 @item | |
406 Fix the autodetect problem (by making the coding priority list | |
407 user-configurable, as short as he likes, even null, with "binary" | |
408 as the default). | |
409 @item | |
410 Document the language environments and other Mule "APIs" as | |
411 implemented (since there is no real design spec). Check to see | |
412 how and where they are broken. | |
413 @item | |
414 Make the Mule menu useful to non-ISO-2022-literate folks. | |
415 @item | |
416 Redo the lstreams stuff to make it easy and robust to "pipeline", | |
417 eg, libz | gnupg | jis2mule. | |
418 @item | |
419 Make Custom Mule-aware. (This probably depends on a sensible | |
420 fonts model.) | |
421 @item | |
422 Implement the "literal byte stream" memory feature. | |
423 @item | |
424 Study the FSF implementation of Mule for background for 7 & 8. | |
425 @item | |
426 Identify desirable Mule features (eg, i18n-ized messages as above, | |
427 collating tables by language environment, etc). (New features | |
428 might have priority as high as 9.) | |
429 @item | |
430 Specify Mule UIs, APIs, etc, and design and (re)implement them. | |
431 @item | |
432 Implement the 8-bit-wide buffer optimization. | |
433 @item | |
434 Move the internal encoding to UTF-32 (subject to Olivier's caveats | |
435 regarding compose characters), with the variable-width char | |
436 buffers using UTF-8. | |
437 @item | |
438 Implement the 16- and 32-bit-wide buffer optimizations. | |
439 @end enumerate | |
440 | |
441 ========================================================================== | |
442 - Mule design issues "short term" (ben) | |
443 ========================================================================== | |
444 | |
445 @enumerate | |
446 @item | |
447 Finish changes in fixup/directory, get in CVS. | |
448 | |
449 (Test with and without "quick-build", to see if really faster) | |
450 (need autoconf) | |
451 | |
452 @item | |
453 Finish up Windows/Mule changes. Outline of this elsewhere; Do | |
454 *minimal* effort. | |
455 | |
456 @item | |
457 Continue work on Windows stability, e.g. go through existing notes | |
458 on Windows Mule-ization + extract all info. | |
459 | |
460 @item | |
461 Get Unicode translation tables integrated. | |
462 | |
463 Finish UCS2/UTF16 coding system. | |
464 | |
465 @item | |
466 Make sure coding system priority list is language-environment specific. | |
467 | |
468 @item | |
469 Consider moving language selection Menu up to be parallel with Mule menu. | |
470 | |
471 @item | |
472 Check to make sure we grok the default locale at startup under | |
473 Windows and understand the Windows locales. Finish implementation | |
474 of mswindows-multibyte and make sure it groks all the locales. | |
475 | |
476 @item | |
477 Do the above as best as we can without using Unicode tables. | |
478 | |
479 @item | |
480 Start tagging all text with a language text property, | |
481 indicating the current language environment when the text was input. | |
482 | |
483 @item | |
484 Make sure we correctly accept input of non-ASCII chars | |
485 (probably already do!) | |
486 | |
487 @item | |
488 Implement active language/keyboard switching under Windows. | |
489 | |
490 @item | |
491 Look into implementing support for "MS IME" protocol (Microsoft | |
492 fancy built-in Asian input methods). | |
493 | |
494 @item | |
495 Redo implementation of mswindows-multibyte and internal display to | |
496 entirely use translation to/from Unicode for increased accuracy. | |
497 | |
498 @item | |
499 Implement buf<->char improvements from FSF. Also implement | |
500 my string byte<->char optimization structure. | |
501 | |
502 @item | |
503 Integrate all Mule DOCS from 20.6 or 21.0. Try to add sections | |
504 for what we've added. | |
505 | |
506 @item | |
507 Implement 8-bit fixed width optimizations. Then work on 16-bit. | |
508 @end enumerate | |
509 | |
510 ========================================================================== | |
511 - Mule design issues (more) (ben) | |
512 ========================================================================== | |
513 | |
514 Get minimal Mule for Windows working using Ikeyama's patches. At | |
515 first, rely on his conversion of internal -> external | |
516 locale-specific but very soon (as soon as we get translation | |
517 tables) can switch to using Unicode versions of display funs, which | |
518 will allow many more charsets to be handled and in a more | |
519 consistent fashion. | |
520 | |
521 i.e. to convert an internal string to an external format, at first | |
522 we use our own knowledge of the Microsoft locale file formats but | |
523 an alternative is to convert to Unicode and use Microsoft's | |
524 convert-Unicode-to-locale encoding functions. This gains us a | |
525 great deal of generality, since in practice all charset caching | |
526 points can be wrapped into Unicode caching points. | |
527 | |
528 This requires adding UCS2 support, which I'm doing. This support | |
529 would let us convert internal -> Unicode, which is exactly what we | |
530 want. | |
531 | |
532 At first, though, I would do the UCS2 support, but leave the | |
533 existing way of doing things in redisplay. Meanwhile, I'd go | |
534 through and fix up the places in the code that assume we are | |
535 dealing with unibytes. | |
536 | |
537 After this, the font problems will be fixed, and we should have a | |
538 pretty well working XEmacs + MULE under Windows. The only real | |
539 other work is the clipboard code, which should be straightforward. | |
540 | |
541 ========================================================================== | |
542 - Mule design discussion | |
543 ========================================================================== | |
544 | |
545 -------------------------------------------------------------------------- | |
546 | |
547 Ben | |
548 | |
549 April 11, 2000 | |
550 | |
551 Well yes, this was the whole point of my "no lossage" proposal of being | |
552 able to undo any coding-system transformation on a buffer. The idea was | |
553 to figure out which transformations were definitely reversible, and for | |
554 all the others, cache the original text in a text property. This way, you | |
555 could probably still do a fairly good job at constructing a good reversal | |
556 even after you've gone into the text and added, deleted, and rearranged | |
557 some things. | |
558 | |
559 But you could implement it much more simply and usefully by just | |
560 determining, for any text being decoded into mule-internal, can we go back | |
561 and read the source again? If not, remember the entire file (GNUS | |
562 message, etc) in text properties. Then, implement the UI interface (like | |
563 Netscape's) on top of that. This way, you have something that at least | |
564 works, but it might be inefficient. All we would need to do is work on | |
565 making the | |
566 underlying implementation more efficient. | |
567 | |
568 Are you interested in doing this? It would be a huge win for users. | |
569 Hrvoje Niksic wrote: | |
570 | |
571 > Ben Wing <ben@666.com> writes: | |
572 > | |
573 > > let me know exactly what "rethink" functionality you want and i'll | |
574 > > come up with an interface. perhaps you just want something like | |
575 > > netscape's encoding menu, where if you switch encodings, it reloads | |
576 > > and reencodes? | |
577 > | |
578 > It might be a bit more complex than that. In many cases, it's hard or | |
579 > impossible to meaningfully "reload" -- for instance, this | |
580 > functionality should be available while editing a Gnus message, as | |
581 > well as while visiting a file. | |
582 > | |
583 > For the special case of Latin-N <-> Latin-M conversion, things could | |
584 > be done easily -- to convert from N to M, you only need to convert | |
585 > internal representation back to N, and then convert it forth to M. | |
586 | |
587 -------------------------------------------------------------------------- | |
588 April 11, 2000 | |
589 | |
590 Well yes, this was the whole point of my "no lossage" proposal of being | |
591 able to undo any coding-system transformation on a buffer. The idea was | |
592 to figure out which transformations were definitely reversible, and for | |
593 all the others, cache the original text in a text property. This way, you | |
594 could probably still do a fairly good job at constructing a good reversal | |
595 even after you've gone into the text and added, deleted, and rearranged | |
596 some things. | |
597 | |
598 But you could implement it much more simply and usefully by just | |
599 determining, for any text being decoded into mule-internal, can we go back | |
600 and read the source again? If not, remember the entire file (GNUS | |
601 message, etc) in text properties. Then, implement the UI interface (like | |
602 Netscape's) on top of that. This way, you have something that at least | |
603 works, but it might be inefficient. All we would need to do is work on | |
604 making the | |
605 underlying implementation more efficient. | |
606 | |
607 Are you interested in doing this? It would be a huge win for users. | |
608 Hrvoje Niksic wrote: | |
609 | |
610 > Ben Wing <ben@666.com> writes: | |
611 > | |
612 > > let me know exactly what "rethink" functionality you want and i'll | |
613 > > come up with an interface. perhaps you just want something like | |
614 > > netscape's encoding menu, where if you switch encodings, it reloads | |
615 > > and reencodes? | |
616 > | |
617 > It might be a bit more complex than that. In many cases, it's hard or | |
618 > impossible to meaningfully "reload" -- for instance, this | |
619 > functionality should be available while editing a Gnus message, as | |
620 > well as while visiting a file. | |
621 > | |
622 > For the special case of Latin-N <-> Latin-M conversion, things could | |
623 > be done easily -- to convert from N to M, you only need to convert | |
624 > internal representation back to N, and then convert it forth to M. | |
625 | |
626 | |
627 ------------------------------------------------------------------------ | |
628 | |
629 ========================================================================== | |
630 - Redoing translation macros [old] | |
631 ========================================================================== | |
632 | |
633 Currently the translation macros (the macros with names such as | |
634 GET_C_STRING_CTEXT_DATA_ALLOCA) have names that are difficult to parse | |
635 or remember, and are not all that general. In the process of | |
636 reviewing the Windows code so that it could be muleized, I discovered | |
637 that these macros need to be extended in various ways to allow for | |
638 the Windows code to be easily muleized. | |
639 | |
640 Since the macros needed to be changed anyways, I figured it would be a | |
641 good time to redo them properly. I propose new macros which have | |
642 names like this: | |
643 | |
644 @itemize @bullet | |
645 @item | |
646 <A>_TO_EXTERNAL_FORMAT_<B> | |
647 @item | |
648 <A>_TO_EXTERNAL_FORMAT_<B>_1 | |
649 @item | |
650 <C>_TO_INTERNAL_FORMAT_<D> | |
651 @item | |
652 <C>_TO_INTERNAL_FORMAT_<D>_1 | |
653 @end itemize | |
654 | |
655 A and C represent the source of the data, and B and D represent the | |
656 sink of the data. | |
657 | |
658 All of these macros call either the functions | |
659 convert_to_external_format or convert_to_internal_format internally, | |
660 with some massaging of the arguments. | |
661 | |
662 All of these macros take the following arguments: | |
663 | |
664 @itemize @bullet | |
665 @item | |
666 First, one or two arguments indicating the source of the data. | |
667 @item | |
668 Second, an argument indicating the coding system. (In order to avoid | |
669 an excessive number of macros, we no longer provide separate macros | |
670 for specific coding systems.) | |
671 @item | |
672 Third, one or two arguments indicating the sink of the data. | |
673 @item | |
674 Fourth, optionally, arguments indicating the error behavior and the | |
675 warning class (these arguments are only present in the _1 versions | |
676 of the macros). The other, shorter named macros are trivial | |
677 interfaces onto these macros with the error behavior being | |
678 ERROR_ME_WARN, with the warning class being Vstandard_warning_class. | |
679 @end itemize | |
680 | |
681 <A> can be one of the following: | |
682 @itemize @bullet | |
683 @item | |
684 LISP (which means a Lisp string) Takes one argument, a Lisp Object. | |
685 @item | |
686 LSTREAM (which indicates an lstream) Takes one argument, an | |
687 lstream. The data is read from the lstream until EOF is reached. | |
688 @item | |
689 DATA (which indicates a raw memory area) Takes two arguments, a | |
690 pointer and a length in bytes. | |
691 (You must never use this if the source of the data is a Lisp string, | |
692 because of the possibility of relocation during garbage collection.) | |
693 @end itemize | |
694 | |
695 <B> can be one of the following: | |
696 @itemize @bullet | |
697 @item | |
698 ALLOCA (which means that the resulting data is stored in alloca()ed | |
699 memory. Two arguments should be specified, a pointer and a length, | |
700 which should be lvalues.) | |
701 @item | |
702 MALLOC (which means that the resulting data is stored in malloc()ed | |
703 memory. Two arguments should be specified, a pointer and a | |
704 length. The memory must be free()d by the caller.) | |
705 @item | |
706 OPAQUE (which means the resulting data is stored in an opaque Lisp | |
707 Object. This takes one argument, an lvalue Lisp Object.) | |
708 @item | |
709 LSTREAM. The data is written to an lstream. | |
710 @end itemize | |
711 | |
712 <C> can be one of the following: | |
713 @itemize @bullet | |
714 @item | |
715 DATA | |
716 @item | |
717 LSTREAM | |
718 @end itemize | |
719 (just like <A> above) | |
720 | |
721 <D> can be one of | |
722 @itemize @bullet | |
723 @item | |
724 ALLOCA | |
725 @item | |
726 MALLOC | |
727 @item | |
728 LISP This means a Lisp String. | |
729 @item | |
730 BUFFER The resulting data is inserted into a buffer at the buffer's | |
731 value of point. | |
732 @item | |
733 LSTREAM The data is written to the lstream. | |
734 @end itemize | |
735 | |
736 | |
737 Note that I have eliminated the FORMAT argument of previous macros, | |
738 and replaced it with a coding system. This was made possible by | |
739 coding system aliases. In place of old `format's, we use a `virtual | |
740 coding system', which is aliased to the actual coding system. | |
741 | |
742 The value of the coding system argument can be anything that is legal | |
743 input to get_coding_system, i.e. a symbol or a coding system object. | |
744 | |
745 ========================================================================== | |
746 - creation of generic macros for accessing internally formatted data [old] | |
747 ========================================================================== | |
748 | |
749 I have a design; it's all written down (I did it in Tsukuba), and I just have | |
750 to have it transcribed. It's higher level than the macros, though; it's Lisp | |
751 primitives that I'm designing. | |
752 | |
753 As for the design of the macros, don't worry so much about all files having to | |
754 get included (which is inevitable with macros), but about how the files are | |
755 separated. Your design might go like this: | |
756 | |
757 @enumerate | |
758 @item | |
759 you have generic macro interfaces, which specify a particular | |
760 behavior but not an implementation. these generic macros have | |
761 complementary versions for buffers and for strings (and the buffer | |
762 or string is an argument to all of the macros), and do such things | |
763 as convert between byte and char indices, retrieve the character at | |
764 a particular byte or char index, increment or decrement a byte | |
765 index to the beginning of the next or previous character, indicate | |
766 the number of bytes occupied by the character at a particular byte | |
767 or character index, etc. These are similar to what's already out | |
768 there except that they confound buffers and strings and that they | |
769 can also work with actual char *'s, which I think is a really bad | |
770 idea because it encourages code to "assume" that the representation | |
771 is ASCII compatible, which it might not be (e.g. 16-bit fixed | |
772 width). In fact, one thing I'm planning on doing is redefining | |
773 Bufbyte as a struct, for debugging purposes, to catch all places | |
774 that cavalierly compare them with ASCII char's. Note also that I | |
775 really want to rename Bufpos and Bytind, which are confusing and | |
776 wrong in that they also apply to strings. They should be Bytepos | |
777 and Charpos, or something like that, to go along with Bytecount and | |
778 Charcount. Similarly, Bufbyte is a misnomer and should be | |
779 Intbyte -- a byte in the internal string representation (any of the | |
780 internal representations) of a string or buffer. Corresponding to | |
781 this is Extbyte (which we already have), a byte in any external | |
782 string representation. We also have Extcount, which makes sense, | |
783 and we might possibly want Extcharcount, the number of characters | |
784 in an external string representation; but that gets sticky in modal | |
785 encodings, and it's not clear how useful it would be. | |
786 | |
787 @item | |
788 for all generic macro interfaces, there are specific versions of | |
789 each of them for each possible representation (pure ASCII in the | |
790 non-Mule world, Mule standard, UTF-8, 8-bit fixed, 16-bit fixed, | |
791 32-bit fixed, etc.; there may well be more than one possible 16-bit | |
792 fixed version, as well). Each representation has a corresponding | |
793 prefix, e.g. MULE_ or FIXED16_ or whatever, which is prefixed onto | |
794 the generic macro names. The resulting macros perform the | |
795 operation defined for the macro, but assume, and only work | |
796 correctly with, text in the corresponding representation. | |
797 | |
798 @item | |
799 The definition of the generic versions merely conditionalizes on | |
800 the appropriate things (i.e. bit flags in the buffer or string | |
801 object) and calls the appropriate representation-specific version. | |
802 There may be more than one definition (protected by ifdefs, of | |
803 course), or one definition that is amalgamated out of many ifdef'ed | |
804 sections. | |
805 | |
806 @item | |
807 You should probably put each different representation in its own | |
808 header file, e.g. charset-mule.h or charset-fixed16.h or | |
809 charset-ascii.h or whatever. Then put the main macros into | |
810 charset.h, and conditionalize in this file appropriately to include | |
811 the other ones. That way, code that actually needs to play around | |
812 with internal-format text at this level can include "charset.h" | |
813 (certainly a much better place than buffer.h), and everyone else | |
814 uses higher-level routines. The representation-specific macros | |
815 should not normally be used *directly* at all; they are invoked | |
816 automatically from the generic macros. However, code that needs to | |
817 be highly, highly optimized might choose to take a loop and write | |
818 two versions of it, one for each representation, to avoid the | |
819 per-loop-iteration cost of a comparison. Until the macro interface | |
820 is rock stable and solid, we should strongly discourage such | |
821 nanosecond optimizations. | |
822 @end enumerate | |
823 | |
824 ========================================================================== | |
825 - UTF-16 compatible representation | |
826 ========================================================================== | |
827 | |
828 NOTE: One possible default internal representation that was compatible | |
829 with UTF16 but allowed all possible chars in UCS4 would be to take a | |
830 more-or-less unused range of 2048 chars (not from the private area | |
831 because Microsoft actually uses up most or all of it with EUDC chars). | |
832 Let's say we picked A400 - ABFF. Then, we'd have: | |
833 | |
834 0000 - FFFF Simple chars | |
835 | |
836 D[8-B]xx D[C-F]xx Surrogate char, represents 1M chars | |
837 | |
838 A[4-B]xx D[C-F]xx D[C-F]xx Surrogate char, represents 2G chars | |
839 | |
840 This is exactly the same number of chars as UCS-4 handles, and it follows the | |
841 same property as UTF8 and Mule-internal: | |
842 | |
843 @enumerate | |
844 @item | |
845 There are two disjoint groupings of units, one representing leading units | |
846 and one representing non-leading units. | |
847 @item | |
848 Given a leading unit, you immediately know how many units follow to make | |
849 up a valid char, irrespective of any other context. | |
850 @end enumerate | |
851 | |
852 Note that A4xx is actually currently assigned to Yi. Since this is an | |
853 internal representation, we could just move these elsewhere. | |
854 | |
855 An alternative is to pick two disjoint ranges, e.g. 2D00 - 2DFF and | |
856 A500 - ABFF. | |
857 | |
858 ========================================================================== | |
859 New API for char->font mapping | |
860 ========================================================================== | |
861 - ; supersedes charset-registry and CCL; | |
862 supports all windows systems; powerful enough for Unicode; etc. | |
863 | |
864 (charset-font-mapping charset) | |
865 | |
866 font-mapping-specifier string | |
867 | |
868 char-font-mapping-table | |
869 | |
870 char-table, specifier; elements of char table are either strings (which | |
871 specify a registry or comparable font property), or vectors of a string | |
872 (same) followed by keyword-value pairs (optional). The only allowable | |
873 keyword currently is :ccl-program, which specifies a CCL program to map | |
874 the characters into font indices. Other keywords may be added | |
875 e.g. allowing Elisp fragments instead of CCL programs, also allowed is | |
876 [inherit], which inherits from the next less-specific char-table in the | |
877 specifier. | |
878 | |
879 The preferred interface onto this mapping (which should be portable | |
880 across Emacsen) is | |
881 | |
882 (set-char-font-mapping key value &optional locale tag-set how-to-add) | |
883 | |
884 where key is a char, range or charset (as for put-char-table), value is | |
885 as above, and the other arguments are standard for specifiers. This | |
886 automatically creates a char table in the locale, as necessary (all | |
887 elements default to [inherit]). On GNU Emacs, some specifiers arguments | |
888 may be unimplemented. | |
889 | |
890 (char-font-mapping key value &optional locale) | |
891 works vaguely like get-specifier? But does inheritance processing. | |
892 locale should clearly default here to current-buffer | |
893 | |
894 #### should get-specifier as well? Would make it work most like | |
895 #### buffer-local variables. | |
896 | |
897 NB. set-charset-registry and set-charset-ccl-program are obsoleted. | |
898 | |
899 ========================================================================== | |
900 Implementing fixed-width 8,16,32 bit buffer optimizations | |
901 ========================================================================== | |
902 | |
903 Add set-buffer-optimization (buffer &rest keywords) for | |
904 controlling these things. | |
905 | |
906 Also, put in hack so that correct arglist can be retrieved by | |
907 Lisp code. | |
908 | |
909 Look at the way keyword primitives are currently handled; make | |
910 sure it works and is documented, etc. | |
911 | |
912 Implement 8-bit fixed width optimization. Take the things that | |
913 know about the actual implementation and put them in a single | |
914 file, in essence creating an abstraction layer to allow | |
915 pluggable internal representations. Implement a fairly general | |
916 scheme for mapping between character codes in the 8 bits or 16 | |
917 bits representation and on actual charset characters. As part of | |
918 set-buffer-optimization, you can specify a list of character sets | |
919 to be used in the 8 bit to 16 bit, etc. world. You can also | |
920 request that the buffer be in 8, 16, etc. if possible. | |
921 | |
922 -> set defaults wrt this. | |
923 -> perhaps this should be just buffer properties. | |
924 -> this brings up the idea of default properties on an object. | |
925 -> Implement default-put, default-get, etc. | |
926 | |
927 What happens when a character not assigned in the range gets | |
928 added? Then, must convert to variable width of some sort. | |
929 | |
930 Note: at first, possibly we just convert whole hog to get things | |
931 right. Then we'd have to poy alternative to characters that got | |
932 added + deleted that were unassigned in the fixed width. When | |
933 this goes to zero and there's been enough time (heuristics), we | |
934 go back to fixed. | |
935 | |
936 Side note: We could dynamically build up the set of assigned | |
937 chars as they go. Conceivably this could even go down to the | |
938 single char level: Just keep a big array of mapping from 16 bit | |
939 values to chars, and add to it each time a char has been encountered | |
940 that wasn't there before. Problem: need inverse mapping. | |
941 | |
942 -> Possibility; chars are actual objects, not just numbers. | |
943 Then you could keep track of such info in the chars itself. | |
944 *Think about this.* | |
945 | |
946 Eventually, we might consider allowing mixed fixed-width, | |
947 variable-width buffer encodings. Then, we use range tables to | |
948 indicate which sections are fixed and which variable and INC_CHAR does | |
949 something like this: binary search to find the current range, which | |
950 indicates whether it's fixed or variable, and tells us what the | |
951 increment is. We can cache this info and use it next time to speed | |
952 up. | |
953 | |
954 -> We will then have two partially shared range tables - one for | |
955 overall fixed width vs. variable width, and possibly one containing | |
956 this same info, but partitioning the variable width in one. Maybe | |
957 need fancier nested range table model. | |
958 | |
959 ========================================================================== | |
960 Expansion of display table and case mapping table support for all | |
961 chars, not just ASCII/Latin1. | |
962 ========================================================================== | |
963 | |
964 ========================================================================== | |
965 Improved flexibility for display tables, and evaluation of its | |
966 features to make sure it meshes with and complements the char<->font | |
967 mapping API mentioned earlier | |
968 ========================================================================== | |
969 | |
970 ========================================================================== | |
971 String access speedup: | |
972 ========================================================================== | |
973 | |
974 For strings larger than some size in bytes (10?), keep extra fields of | |
975 info: length in chars, and a (char, byte) pair in the middle to speed | |
976 up sequential access. | |
977 | |
978 (Better idea: do this for any size string, but only if it contains | |
979 non-ASCII chars. Then if info is missing, we know string is | |
980 ASCII-only.) | |
981 | |
982 Use a string-extra-info object, replacing string property slot and | |
983 containing fields for string mod tick, string extents, string props, | |
984 and string char length, and cached (char,byte) pair. | |
985 string-extra-info (or string-auxiliary?) objects could be in frob | |
986 blocks, esp. if creating frob blocks is easy + worth it. | |
987 | |
988 - Caching of char<->byte conversions in strings - should make nearly | |
989 all operations on strings O(N) | |
990 | |
991 ========================================================================== | |
992 Improvements in buffer char<->byte mapping | |
993 ========================================================================== | |
994 | |
995 - Range table implementation - especially when there are few runs of | |
996 different widths, e.g. recently converted from fixed-width | |
997 optimization to variable width | |
998 | |
999 Range Tables to speed up Bufpos <-> Bytind caching | |
1000 ================================================== | |
1001 | |
1002 This describes an alternative implementation using ranges. We | |
1003 maintain a range table of all spans of characters of a fixed width. | |
1004 Updating this table could take time if there are a large number of | |
1005 spans; but constant factors of operations should be quick. This method really wins | |
1006 when you have 8-bit buffers just converted to variable width, where | |
1007 there will be few spans. More specifically, lookup in this range | |
1008 table is O(log N) and can be done with simple binary search, which is | |
1009 very fast. If we maintain the ranges using a gap array, updating this | |
1010 table will be fast for local operations, which is most of the time. | |
1011 | |
1012 We will also provide (at first, at least) a Lisp function to set the | |
1013 caching mechanism explicitly - either range tables or the existing | |
1014 implementation. Eventually, we want to improve things, to the point | |
1015 where we automatically pick the right caching for the situation and | |
1016 have more caching schemes implemented. | |
1017 | |
1018 ========================================================================== | |
1019 - Robustify Text Properties | |
1020 ========================================================================== | |
1021 | |
1022 ========================================================================== | |
1023 Support for unified internal representation, e.g. Unicode | |
1024 ========================================================================== | |
1025 | |
1026 Start tagging all text with a language text property, | |
1027 indicating the current language environment when the text was input. | |
1028 (needs "Robustify Text Properties") | |
1029 | |
1030 ========================================================================== | |
1031 - Generalized Coding Systems | |
1032 ========================================================================== | |
1033 | |
1034 - Lisp API for Defining Coding Systems | |
1035 | |
1036 User-defined coding systems. | |
1037 | |
1038 (define-coding-system-type 'type | |
1039 :encode-function fun | |
1040 :decode-function fun | |
1041 :detect-function fun | |
1042 :buffering (number = at least this many chars | |
1043 line = buffer up to end of line | |
1044 regexp = buffer until this regexp is found in match | |
1045 source data. match data will be appropriate when fun is | |
1046 called | |
1047 | |
1048 encode fun is called as | |
1049 | |
1050 (encode instream outstream) | |
1051 | |
1052 should read data from instream and write converted result onto | |
1053 outstream. Can leave some data stuff in stream, it will reappear | |
1054 next time. Generally, there is a finite amount of data in instream | |
1055 and further attempts to read lead to would-block errors or retvals. | |
1056 Can use instream properties to record state. May use read-stream | |
1057 functionality to read everything into a vector or string. | |
1058 | |
1059 ->Need vectors + string exposed to resizing of Lisp implementation | |
1060 where necessary. | |
1061 | |
1062 ========================================================================== | |
1063 Support Windows Active Kbd Switching, Far East IME API (done already?) | |
1064 ========================================================================== | |
1065 | |
1066 ========================================================================== | |
1067 - UI/design changes for Coding System Pipelining | |
1068 ========================================================================== | |
1069 | |
1070 ------------------------------------------------------------------ | |
1071 CODING-SYSTEM CHAINS | |
1072 ------------------------------------------------------------------ | |
1073 | |
1074 sjt sez: | |
1075 | |
1076 There should be no elementary coding systems in the Lisp API, only | |
1077 chains. Chains should be declared, not computed, as a sequence of coding | |
1078 formats. (Probably the internal representation can be a vector for | |
1079 efficiency but programmers would probably rather work with lists.) A | |
1080 stream has a token type. Most streams are octet streams. Text is a | |
1081 stream of characters (in _internal_ format; a file on disk is not text!) | |
1082 An octet-stream has no implicit semantics, so its format must always be | |
1083 specified. The only type currently having semantics is characters. This | |
1084 means that the chain (euc-jp -> internal -> shift_jis) may be specified | |
1085 (euc-jp, shift_jis), and if no euc-jp -> shift_jis converter is | |
1086 available, then the chain is automatically constructed. (N.B. If we | |
1087 have fixed width buffers in the future, then we could have ASCII -> 8-bit | |
1088 char -> 16-bit char -> ISO-2022-JP (with escape sequences).) | |
1089 | |
1090 EOL handling is a char <-> char coding. It should not be part of another | |
1091 coding system except as a convenience for users. For text coding, | |
1092 automatically insert EOL handlers between char <-> octet boundaries. | |
1093 | |
1094 ------------------------------------------------------------------ | |
1095 ABOUT DETECTION | |
1096 ------------------------------------------------------------------ | |
1097 | |
1098 | |
1099 ------------------------------------------------------------------ | |
1100 EFFICIENCY OF CODING CONVERSION WITH MULTIPLE COPIES/CHAINS | |
1101 ------------------------------------------------------------------ | |
1102 | |
1103 A comment in encode_decode_coding_region(): | |
1104 | |
1105 The chain of streams looks like this: | |
1106 | |
1107 [BUFFER] <----- (( read from/send to loop )) | |
1108 ------> [CHAR->BYTE i.e. ENCODE AS BINARY if source is | |
1109 in bytes] | |
1110 ------> [ENCODE/DECODE AS SPECIFIED] | |
1111 ------> [BYTE->CHAR i.e. DECODE AS BINARY | |
1112 if sink is in bytes] | |
1113 ------> [AUTODETECT EOL if | |
1114 we're decoding and | |
1115 coding system calls | |
1116 for this] | |
1117 ------> [BUFFER] | |
1118 | |
1119 sjt (?) responds: | |
1120 | |
1121 Of course, this is just horrible. BYTE<->CHAR should only be available | |
1122 to I/O routines. It should not be visible to Mule proper. | |
1123 | |
1124 A comment on the implementation. Hrvoje and Kyle worry about the | |
1125 inefficiency of repeated copying among buffers that chained coding | |
1126 systems entail. But this may not be as time inefficient as it appears | |
1127 in the Mule ("house rules") context. The issue is how do you chain | |
1128 coding systems without copying? In theory you could have | |
1129 | |
1130 IChar external_to_raw (ExtChar *cp, State *s); | |
1131 IChar decode_utf16 (IChar c, State *s); | |
1132 IChar decode_crlf (ExtChar *cp, State *s); | |
1133 | |
1134 typedef Ichar (*Converter[]) (Ichar, State*); | |
1135 | |
1136 Converter utf16[2] = { &decode_utf16, &decode_crlf }; | |
1137 | |
1138 void convert (ExtChar *inbuf, IChar *outbuf, Converter cvtr) | |
1139 { | |
1140 int i; | |
1141 ExtChar c; | |
1142 State s; | |
1143 | |
1144 while (c = external_to_raw (*inbuf++, &s)) | |
1145 { | |
1146 for (i = 0; i < sizeof(cvtr)/sizeof(Converter); ++i) | |
1147 if (s.ready) | |
1148 c = (*cvtr[i]) (c, &s); | |
1149 } | |
1150 if (s.ready) | |
1151 *outbuf++ = c; | |
1152 } | |
1153 | |
1154 But this is a lot of function calls; what Ben is doing is basically | |
1155 reducing this to one call per buffer-full. The only way to avoid this | |
1156 is to hardcode all the "interesting" coding systems, maybe using | |
1157 inline or macros to give structure. But this is still a huge amount | |
1158 of work, and code. | |
1159 | |
1160 One advantage to the call-per-char approach is that we might be able | |
1161 to do something about the marker/extent destruction that coding | |
1162 normally entails. | |
1163 | |
1164 ben sez: | |
1165 | |
1166 it should be possible to preserve the markers/extents without | |
1167 switching completely to one-call-per-char -- we could at least do one | |
1168 call per "run", where a run is more or less the maximal stretch of | |
1169 text not overlapping any markers or extent boundaries. (It's a bit | |
1170 more complicated if we want to properly support the different extent | |
1171 begins/ends; in some cases we might have to pump a single character | |
1172 adjacent to where two extents meet.) The "stateless" way that I wrote | |
1173 all of the conversion routines may be a real hassle but it allows | |
1174 something like this to work without too much problem -- pump in one | |
1175 run at a time into one end of the chain, do a flush after each | |
1176 iteration, and stick what comes out the other end in its place. | |
1177 | |
1178 ------------------------------------------------------------------ | |
1179 ABOUT FORMATS | |
1180 ------------------------------------------------------------------ | |
1181 | |
1182 when calling make-coding-system, the name can be a cons of (format1 . | |
1183 format2), specifying that it decodes format1->format2 and encodes the other | |
1184 way. if only one name is given, that is assumed to be format1, and the | |
1185 other is either `external' or `internal' depending on the end type. | |
1186 normally the user when decoding gives the decoding order in formats, but | |
1187 can leave off the last one, `internal', which is assumed. a multichain | |
1188 might look like gzip|multibyte|unicode, using the coding systems named | |
1189 `gzip', `(unicode . multibyte)' and `unicode'. the way this actually works | |
1190 is by searching for gzip->multibyte; if not found, look for gzip->external | |
1191 or gzip->internal. (In general we automatically do conversion between | |
1192 internal and external as necessary: thus gzip|crlf does the expected, and | |
1193 maps to gzip->external, external->internal, crlf->internal, which when | |
1194 fully specified would be gzip|external:external|internal:crlf|internal -- | |
1195 see below.) To forcibly fit together two converters that have explicitly | |
1196 specified and incompatible names (say you have unicode->multibyte and | |
1197 iso8859-1->ebcdic and you know that the multibyte and iso8859-1 in this | |
1198 case are compatible), you can force-cast using :, like this: | |
1199 ebcdic|iso8859-1:multibyte|unicode. (again, if you force-cast between | |
1200 internal and external formats, the conversion happens automatically.) | |
1201 | |
1202 -------------------------------------------------------------------------- | |
1203 ABOUT PDUMP, UNICODE, AND RUNNING XEMACS FROM A DIRECTORY WITH WEIRD CHARS | |
1204 -------------------------------------------------------------------------- | |
1205 | |
1206 -- there's the problem that XEmacs can't be run in a directory with | |
1207 non-ASCII/Latin-1 chars in it, since it will be doing Unicode | |
1208 processing before we've had a chance to load the tables. In fact, | |
1209 even finding the tables in such a situation is problematic using | |
1210 the normal commands. my idea is to eventually load the stuff | |
1211 extremely extremely early, at the same time as the pdump data gets | |
1212 loaded. in fact, the unicode table data (stored in an efficient | |
1213 binary format) can even be stuck into the pdump file (which would | |
1214 mean as a resource to the executable, for windows). we'd need to | |
1215 extend pdump a bit: to allow for attaching extra data to the pdump | |
1216 file. (something like pdump_attach_extra_data (addr, length) | |
1217 returns a number of some sort, an index into the file, which you | |
1218 can then retrieve with pdump_load_extra_data(), which returns an | |
1219 addr (mmap()ed or loaded), and later you pdump_unload_extra_data() | |
1220 when finished.) we'd probably also need | |
1221 pdump_attach_extra_data_append(), which appends data to the data | |
1222 just written out with pdump_attach_extra_data(). this way, | |
1223 multiple tables in memory can be written out into one contiguous | |
1224 table. (we'd use the tar-like trick of allowing new blocks to be | |
1225 written without going back to change the old blocks -- we just rely | |
1226 on the end of file/end of memory.) this same mechanism could be | |
1227 extracted out of pdump and used to handle the non-pdump situation | |
1228 (or alternatively, we could just dump either the memory image of | |
1229 the tables themselves or the compressed binary version). in the | |
1230 case of extra unicode tables not known about at compile time that | |
1231 get loaded before dumping, we either just dump them into the image | |
1232 (pdump and all) or extract them into the compressed binary format, | |
1233 free the original tables, and treat them like all other tables. | |
1234 | |
1235 | |
1236 ========================================================================== | |
1237 - Generalized language appropriate word wrapping (requires | |
1238 layout-exposing API defined in BIDI section) | |
1239 ========================================================================== | |
1240 | |
1241 ========================================================================== | |
1242 - Make Custom Mule-aware | |
1243 ========================================================================== | |
1244 | |
1245 ========================================================================== | |
1246 - Composite character support | |
1247 ========================================================================== | |
1248 | |
1249 ========================================================================== | |
1250 - Language appropriate sorting and searching | |
1251 ========================================================================== | |
1252 | |
1253 ========================================================================== | |
1254 - Glyph shaping for Arabic and Devanagari | |
1255 ========================================================================== | |
1256 | |
1257 - (needs to be handled mostly | |
1258 at C level, as part of layout; luckily it's entirely local in its | |
1259 changes, as this is not hard) | |
1260 | |
1261 | |
1262 ========================================================================== | |
1263 Consider moving language selection Menu up to be parallel with Mule menu | |
1264 ========================================================================== | |
1265 | |
1266 */ | |
1267 | |
1268 | |
771 | 1269 |
1270 /************************************************************************/ | |
1271 /* declarations */ | |
1272 /************************************************************************/ | |
1273 | |
1274 Eistring the_eistring_zero_init, the_eistring_malloc_zero_init; | |
1275 | |
1276 #define MAX_CHARBPOS_GAP_SIZE_3 (65535/3) | |
1277 #define MAX_BYTEBPOS_GAP_SIZE_3 (3 * MAX_CHARBPOS_GAP_SIZE_3) | |
1278 | |
1279 short three_to_one_table[1 + MAX_BYTEBPOS_GAP_SIZE_3]; | |
1280 | |
1281 #ifdef MULE | |
1282 | |
1283 /* Table of number of bytes in the string representation of a character | |
1284 indexed by the first byte of that representation. | |
1285 | |
1286 rep_bytes_by_first_byte(c) is more efficient than the equivalent | |
1287 canonical computation: | |
1288 | |
826 | 1289 XCHARSET_REP_BYTES (charset_by_leading_byte (c)) */ |
771 | 1290 |
1291 const Bytecount rep_bytes_by_first_byte[0xA0] = | |
1292 { /* 0x00 - 0x7f are for straight ASCII */ | |
1293 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1294 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1295 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1296 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1297 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1298 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1299 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1300 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
1301 /* 0x80 - 0x8f are for Dimension-1 official charsets */ | |
1302 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |
1303 /* 0x90 - 0x9d are for Dimension-2 official charsets */ | |
1304 /* 0x9e is for Dimension-1 private charsets */ | |
1305 /* 0x9f is for Dimension-2 private charsets */ | |
1306 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4 | |
1307 }; | |
1308 | |
1309 #ifdef ENABLE_COMPOSITE_CHARS | |
1310 | |
1311 /* Hash tables for composite chars. One maps string representing | |
1312 composed chars to their equivalent chars; one goes the | |
1313 other way. */ | |
1314 Lisp_Object Vcomposite_char_char2string_hash_table; | |
1315 Lisp_Object Vcomposite_char_string2char_hash_table; | |
1316 | |
1317 static int composite_char_row_next; | |
1318 static int composite_char_col_next; | |
1319 | |
1320 #endif /* ENABLE_COMPOSITE_CHARS */ | |
1321 | |
1322 #endif /* MULE */ | |
1323 | |
1292 | 1324 Lisp_Object QSin_char_byte_conversion; |
1325 Lisp_Object QSin_internal_external_conversion; | |
1326 | |
771 | 1327 |
1328 /************************************************************************/ | |
1329 /* qxestr***() functions */ | |
1330 /************************************************************************/ | |
1331 | |
1332 /* Most are inline functions in lisp.h */ | |
1333 | |
1334 int | |
867 | 1335 qxesprintf (Ibyte *buffer, const CIbyte *format, ...) |
771 | 1336 { |
1337 va_list args; | |
1338 int retval; | |
1339 | |
1340 va_start (args, format); | |
2367 | 1341 retval = vsprintf ((Chbyte *) buffer, format, args); |
771 | 1342 va_end (args); |
1343 | |
1344 return retval; | |
1345 } | |
1346 | |
1347 /* strcasecmp() implementation from BSD */ | |
867 | 1348 static Ibyte strcasecmp_charmap[] = { |
1429 | 1349 0000, 0001, 0002, 0003, 0004, 0005, 0006, 0007, |
1350 0010, 0011, 0012, 0013, 0014, 0015, 0016, 0017, | |
1351 0020, 0021, 0022, 0023, 0024, 0025, 0026, 0027, | |
1352 0030, 0031, 0032, 0033, 0034, 0035, 0036, 0037, | |
1353 0040, 0041, 0042, 0043, 0044, 0045, 0046, 0047, | |
1354 0050, 0051, 0052, 0053, 0054, 0055, 0056, 0057, | |
1355 0060, 0061, 0062, 0063, 0064, 0065, 0066, 0067, | |
1356 0070, 0071, 0072, 0073, 0074, 0075, 0076, 0077, | |
1357 0100, 0141, 0142, 0143, 0144, 0145, 0146, 0147, | |
1358 0150, 0151, 0152, 0153, 0154, 0155, 0156, 0157, | |
1359 0160, 0161, 0162, 0163, 0164, 0165, 0166, 0167, | |
1360 0170, 0171, 0172, 0133, 0134, 0135, 0136, 0137, | |
1361 0140, 0141, 0142, 0143, 0144, 0145, 0146, 0147, | |
1362 0150, 0151, 0152, 0153, 0154, 0155, 0156, 0157, | |
1363 0160, 0161, 0162, 0163, 0164, 0165, 0166, 0167, | |
1364 0170, 0171, 0172, 0173, 0174, 0175, 0176, 0177, | |
1365 0200, 0201, 0202, 0203, 0204, 0205, 0206, 0207, | |
1366 0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217, | |
1367 0220, 0221, 0222, 0223, 0224, 0225, 0226, 0227, | |
1368 0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237, | |
1369 0240, 0241, 0242, 0243, 0244, 0245, 0246, 0247, | |
1370 0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257, | |
1371 0260, 0261, 0262, 0263, 0264, 0265, 0266, 0267, | |
1372 0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277, | |
1373 0300, 0301, 0302, 0303, 0304, 0305, 0306, 0307, | |
1374 0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317, | |
1375 0320, 0321, 0322, 0323, 0324, 0325, 0326, 0327, | |
1376 0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337, | |
1377 0340, 0341, 0342, 0343, 0344, 0345, 0346, 0347, | |
1378 0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357, | |
1379 0360, 0361, 0362, 0363, 0364, 0365, 0366, 0367, | |
1380 0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377 | |
771 | 1381 }; |
1382 | |
1383 /* A version that works like generic strcasecmp() -- only collapsing | |
1384 case in ASCII A-Z/a-z. This is safe on Mule strings due to the | |
1385 current representation. | |
1386 | |
1387 This version was written by some Berkeley coder, favoring | |
1388 nanosecond improvements over clarity. In all other versions below, | |
1389 we use symmetrical algorithms that may sacrifice a few machine | |
1390 cycles but are MUCH MUCH clearer, which counts a lot more. | |
1391 */ | |
1392 | |
1393 int | |
867 | 1394 qxestrcasecmp (const Ibyte *s1, const Ibyte *s2) |
771 | 1395 { |
867 | 1396 Ibyte *cm = strcasecmp_charmap; |
771 | 1397 |
1398 while (cm[*s1] == cm[*s2++]) | |
1399 if (*s1++ == '\0') | |
1400 return (0); | |
1401 | |
1402 return (cm[*s1] - cm[*--s2]); | |
1403 } | |
1404 | |
1405 int | |
2367 | 1406 ascii_strcasecmp (const Ascbyte *s1, const Ascbyte *s2) |
771 | 1407 { |
867 | 1408 return qxestrcasecmp ((const Ibyte *) s1, (const Ibyte *) s2); |
771 | 1409 } |
1410 | |
1411 int | |
2367 | 1412 qxestrcasecmp_ascii (const Ibyte *s1, const Ascbyte *s2) |
771 | 1413 { |
867 | 1414 return qxestrcasecmp (s1, (const Ibyte *) s2); |
771 | 1415 } |
1416 | |
1417 /* An internationalized version that collapses case in a general fashion. | |
1418 */ | |
1419 | |
1420 int | |
867 | 1421 qxestrcasecmp_i18n (const Ibyte *s1, const Ibyte *s2) |
771 | 1422 { |
1423 while (*s1 && *s2) | |
1424 { | |
4906
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1425 if (CANONCASE (0, itext_ichar (s1)) != |
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1426 CANONCASE (0, itext_ichar (s2))) |
771 | 1427 break; |
867 | 1428 INC_IBYTEPTR (s1); |
1429 INC_IBYTEPTR (s2); | |
771 | 1430 } |
1431 | |
4906
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1432 return (CANONCASE (0, itext_ichar (s1)) - |
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1433 CANONCASE (0, itext_ichar (s2))); |
771 | 1434 } |
1435 | |
1436 /* The only difference between these next two and | |
1437 qxememcasecmp()/qxememcasecmp_i18n() is that these two will stop if | |
1438 both strings are equal and less than LEN in length, while | |
1439 the mem...() versions would run off the end. */ | |
1440 | |
1441 int | |
867 | 1442 qxestrncasecmp (const Ibyte *s1, const Ibyte *s2, Bytecount len) |
771 | 1443 { |
867 | 1444 Ibyte *cm = strcasecmp_charmap; |
771 | 1445 |
1446 while (len--) | |
1447 { | |
1448 int diff = cm[*s1] - cm[*s2]; | |
1449 if (diff != 0) | |
1450 return diff; | |
1451 if (!*s1) | |
1452 return 0; | |
1453 s1++, s2++; | |
1454 } | |
1455 | |
1456 return 0; | |
1457 } | |
1458 | |
1459 int | |
2367 | 1460 ascii_strncasecmp (const Ascbyte *s1, const Ascbyte *s2, Bytecount len) |
771 | 1461 { |
867 | 1462 return qxestrncasecmp ((const Ibyte *) s1, (const Ibyte *) s2, len); |
771 | 1463 } |
1464 | |
1465 int | |
2367 | 1466 qxestrncasecmp_ascii (const Ibyte *s1, const Ascbyte *s2, Bytecount len) |
771 | 1467 { |
867 | 1468 return qxestrncasecmp (s1, (const Ibyte *) s2, len); |
771 | 1469 } |
1470 | |
801 | 1471 /* Compare LEN_FROM_S1 worth of characters from S1 with the same number of |
1472 characters from S2, case insensitive. NOTE: Downcasing can convert | |
1473 characters from one length in bytes to another, so reversing S1 and S2 | |
1474 is *NOT* a symmetric operation! You must choose a length that agrees | |
1475 with S1. */ | |
1476 | |
771 | 1477 int |
867 | 1478 qxestrncasecmp_i18n (const Ibyte *s1, const Ibyte *s2, |
801 | 1479 Bytecount len_from_s1) |
771 | 1480 { |
801 | 1481 while (len_from_s1 > 0) |
771 | 1482 { |
867 | 1483 const Ibyte *old_s1 = s1; |
4906
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1484 int diff = (CANONCASE (0, itext_ichar (s1)) - |
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1485 CANONCASE (0, itext_ichar (s2))); |
771 | 1486 if (diff != 0) |
1487 return diff; | |
1488 if (!*s1) | |
1489 return 0; | |
867 | 1490 INC_IBYTEPTR (s1); |
1491 INC_IBYTEPTR (s2); | |
801 | 1492 len_from_s1 -= s1 - old_s1; |
771 | 1493 } |
1494 | |
1495 return 0; | |
1496 } | |
1497 | |
1498 int | |
867 | 1499 qxememcmp (const Ibyte *s1, const Ibyte *s2, Bytecount len) |
771 | 1500 { |
1501 return memcmp (s1, s2, len); | |
1502 } | |
1503 | |
1504 int | |
867 | 1505 qxememcmp4 (const Ibyte *s1, Bytecount len1, |
1506 const Ibyte *s2, Bytecount len2) | |
801 | 1507 { |
1508 int retval = qxememcmp (s1, s2, min (len1, len2)); | |
1509 if (retval) | |
1510 return retval; | |
1511 return len1 - len2; | |
1512 } | |
1513 | |
1514 int | |
867 | 1515 qxememcasecmp (const Ibyte *s1, const Ibyte *s2, Bytecount len) |
771 | 1516 { |
867 | 1517 Ibyte *cm = strcasecmp_charmap; |
771 | 1518 |
1519 while (len--) | |
1520 { | |
1521 int diff = cm[*s1] - cm[*s2]; | |
1522 if (diff != 0) | |
1523 return diff; | |
1524 s1++, s2++; | |
1525 } | |
1526 | |
1527 return 0; | |
1528 } | |
1529 | |
1530 int | |
867 | 1531 qxememcasecmp4 (const Ibyte *s1, Bytecount len1, |
1532 const Ibyte *s2, Bytecount len2) | |
771 | 1533 { |
801 | 1534 int retval = qxememcasecmp (s1, s2, min (len1, len2)); |
1535 if (retval) | |
1536 return retval; | |
1537 return len1 - len2; | |
1538 } | |
1539 | |
1540 /* Do a character-by-character comparison, returning "which is greater" by | |
867 | 1541 comparing the Ichar values. (#### Should have option to compare Unicode |
801 | 1542 points) */ |
1543 | |
1544 int | |
867 | 1545 qxetextcmp (const Ibyte *s1, Bytecount len1, |
1546 const Ibyte *s2, Bytecount len2) | |
801 | 1547 { |
1548 while (len1 > 0 && len2 > 0) | |
771 | 1549 { |
867 | 1550 const Ibyte *old_s1 = s1; |
1551 const Ibyte *old_s2 = s2; | |
1552 int diff = itext_ichar (s1) - itext_ichar (s2); | |
801 | 1553 if (diff != 0) |
1554 return diff; | |
867 | 1555 INC_IBYTEPTR (s1); |
1556 INC_IBYTEPTR (s2); | |
801 | 1557 len1 -= s1 - old_s1; |
1558 len2 -= s2 - old_s2; | |
1559 } | |
1560 | |
1561 assert (len1 >= 0 && len2 >= 0); | |
1562 return len1 - len2; | |
1563 } | |
1564 | |
1565 int | |
867 | 1566 qxetextcmp_matching (const Ibyte *s1, Bytecount len1, |
1567 const Ibyte *s2, Bytecount len2, | |
801 | 1568 Charcount *matching) |
1569 { | |
1570 *matching = 0; | |
1571 while (len1 > 0 && len2 > 0) | |
1572 { | |
867 | 1573 const Ibyte *old_s1 = s1; |
1574 const Ibyte *old_s2 = s2; | |
1575 int diff = itext_ichar (s1) - itext_ichar (s2); | |
801 | 1576 if (diff != 0) |
1577 return diff; | |
867 | 1578 INC_IBYTEPTR (s1); |
1579 INC_IBYTEPTR (s2); | |
801 | 1580 len1 -= s1 - old_s1; |
1581 len2 -= s2 - old_s2; | |
1582 (*matching)++; | |
1583 } | |
1584 | |
1585 assert (len1 >= 0 && len2 >= 0); | |
1586 return len1 - len2; | |
1587 } | |
1588 | |
1589 /* Do a character-by-character comparison, returning "which is greater" by | |
867 | 1590 comparing the Ichar values, case insensitively (by downcasing both |
801 | 1591 first). (#### Should have option to compare Unicode points) |
1592 | |
1593 In this case, both lengths must be specified because downcasing can | |
1594 convert characters from one length in bytes to another; therefore, two | |
1595 blocks of text of different length might be equal. If both compare | |
1596 equal up to the limit in length of one but not the other, the longer one | |
1597 is "greater". */ | |
1598 | |
1599 int | |
867 | 1600 qxetextcasecmp (const Ibyte *s1, Bytecount len1, |
1601 const Ibyte *s2, Bytecount len2) | |
801 | 1602 { |
1603 while (len1 > 0 && len2 > 0) | |
1604 { | |
867 | 1605 const Ibyte *old_s1 = s1; |
1606 const Ibyte *old_s2 = s2; | |
4906
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1607 int diff = (CANONCASE (0, itext_ichar (s1)) - |
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1608 CANONCASE (0, itext_ichar (s2))); |
771 | 1609 if (diff != 0) |
1610 return diff; | |
867 | 1611 INC_IBYTEPTR (s1); |
1612 INC_IBYTEPTR (s2); | |
801 | 1613 len1 -= s1 - old_s1; |
1614 len2 -= s2 - old_s2; | |
771 | 1615 } |
1616 | |
801 | 1617 assert (len1 >= 0 && len2 >= 0); |
1618 return len1 - len2; | |
1619 } | |
1620 | |
1621 /* Like qxetextcasecmp() but also return number of characters at | |
1622 beginning that match. */ | |
1623 | |
1624 int | |
867 | 1625 qxetextcasecmp_matching (const Ibyte *s1, Bytecount len1, |
1626 const Ibyte *s2, Bytecount len2, | |
801 | 1627 Charcount *matching) |
1628 { | |
1629 *matching = 0; | |
1630 while (len1 > 0 && len2 > 0) | |
1631 { | |
867 | 1632 const Ibyte *old_s1 = s1; |
1633 const Ibyte *old_s2 = s2; | |
4906
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1634 int diff = (CANONCASE (0, itext_ichar (s1)) - |
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1635 CANONCASE (0, itext_ichar (s2))); |
801 | 1636 if (diff != 0) |
1637 return diff; | |
867 | 1638 INC_IBYTEPTR (s1); |
1639 INC_IBYTEPTR (s2); | |
801 | 1640 len1 -= s1 - old_s1; |
1641 len2 -= s2 - old_s2; | |
1642 (*matching)++; | |
1643 } | |
1644 | |
1645 assert (len1 >= 0 && len2 >= 0); | |
1646 return len1 - len2; | |
771 | 1647 } |
1648 | |
1649 int | |
4906
6ef8256a020a
implement equalp in C, fix case-folding, add equal() method for keymaps
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
1650 lisp_strcasecmp_ascii (Lisp_Object s1, Lisp_Object s2) |
771 | 1651 { |
867 | 1652 Ibyte *cm = strcasecmp_charmap; |
1653 Ibyte *p1 = XSTRING_DATA (s1); | |
1654 Ibyte *p2 = XSTRING_DATA (s2); | |
1655 Ibyte *e1 = p1 + XSTRING_LENGTH (s1); | |
1656 Ibyte *e2 = p2 + XSTRING_LENGTH (s2); | |
771 | 1657 |
1658 /* again, we use a symmetric algorithm and favor clarity over | |
1659 nanosecond improvements. */ | |
1660 while (1) | |
1661 { | |
1662 /* if we reached the end of either string, compare lengths. | |
1663 do NOT compare the final null byte against anything, in case | |
1664 the other string also has a null byte at that position. */ | |
1665 if (p1 == e1 || p2 == e2) | |
1666 return e1 - e2; | |
1667 if (cm[*p1] != cm[*p2]) | |
1668 return cm[*p1] - cm[*p2]; | |
1669 p1++, p2++; | |
1670 } | |
1671 } | |
1672 | |
1673 int | |
1674 lisp_strcasecmp_i18n (Lisp_Object s1, Lisp_Object s2) | |
1675 { | |
801 | 1676 return qxetextcasecmp (XSTRING_DATA (s1), XSTRING_LENGTH (s1), |
1677 XSTRING_DATA (s2), XSTRING_LENGTH (s2)); | |
771 | 1678 } |
1679 | |
2367 | 1680 /* Compare a wide string with an ASCII string */ |
1681 | |
1682 int | |
1683 wcscmp_ascii (const wchar_t *s1, const Ascbyte *s2) | |
1684 { | |
1685 while (*s1 && *s2) | |
1686 { | |
2956 | 1687 if (*s1 != (wchar_t) *s2) |
2367 | 1688 break; |
1689 s1++, s2++; | |
1690 } | |
1691 | |
1692 return *s1 - *s2; | |
1693 } | |
1694 | |
1695 int | |
1696 wcsncmp_ascii (const wchar_t *s1, const Ascbyte *s2, Charcount len) | |
1697 { | |
1698 while (len--) | |
1699 { | |
1700 int diff = *s1 - *s2; | |
1701 if (diff != 0) | |
1702 return diff; | |
1703 if (!*s1) | |
1704 return 0; | |
1705 s1++, s2++; | |
1706 } | |
1707 | |
1708 return 0; | |
1709 } | |
1710 | |
771 | 1711 |
1712 /************************************************************************/ | |
1713 /* conversion between textual representations */ | |
1714 /************************************************************************/ | |
1715 | |
1716 /* NOTE: Does not reset the Dynarr. */ | |
1717 | |
1718 void | |
867 | 1719 convert_ibyte_string_into_ichar_dynarr (const Ibyte *str, Bytecount len, |
2367 | 1720 Ichar_dynarr *dyn) |
771 | 1721 { |
867 | 1722 const Ibyte *strend = str + len; |
771 | 1723 |
1724 while (str < strend) | |
1725 { | |
867 | 1726 Ichar ch = itext_ichar (str); |
771 | 1727 Dynarr_add (dyn, ch); |
867 | 1728 INC_IBYTEPTR (str); |
771 | 1729 } |
1730 } | |
1731 | |
1732 Charcount | |
867 | 1733 convert_ibyte_string_into_ichar_string (const Ibyte *str, Bytecount len, |
2367 | 1734 Ichar *arr) |
771 | 1735 { |
867 | 1736 const Ibyte *strend = str + len; |
771 | 1737 Charcount newlen = 0; |
1738 while (str < strend) | |
1739 { | |
867 | 1740 Ichar ch = itext_ichar (str); |
771 | 1741 arr[newlen++] = ch; |
867 | 1742 INC_IBYTEPTR (str); |
771 | 1743 } |
1744 return newlen; | |
1745 } | |
1746 | |
867 | 1747 /* Convert an array of Ichars into the equivalent string representation. |
1748 Store into the given Ibyte dynarr. Does not reset the dynarr. | |
771 | 1749 Does not add a terminating zero. */ |
1750 | |
1751 void | |
867 | 1752 convert_ichar_string_into_ibyte_dynarr (Ichar *arr, int nels, |
1753 Ibyte_dynarr *dyn) | |
771 | 1754 { |
867 | 1755 Ibyte str[MAX_ICHAR_LEN]; |
771 | 1756 int i; |
1757 | |
1758 for (i = 0; i < nels; i++) | |
1759 { | |
867 | 1760 Bytecount len = set_itext_ichar (str, arr[i]); |
771 | 1761 Dynarr_add_many (dyn, str, len); |
1762 } | |
1763 } | |
1764 | |
867 | 1765 /* Convert an array of Ichars into the equivalent string representation. |
771 | 1766 Malloc the space needed for this and return it. If LEN_OUT is not a |
867 | 1767 NULL pointer, store into LEN_OUT the number of Ibytes in the |
1768 malloc()ed string. Note that the actual number of Ibytes allocated | |
771 | 1769 is one more than this: the returned string is zero-terminated. */ |
1770 | |
867 | 1771 Ibyte * |
1772 convert_ichar_string_into_malloced_string (Ichar *arr, int nels, | |
826 | 1773 Bytecount *len_out) |
771 | 1774 { |
1775 /* Damn zero-termination. */ | |
2367 | 1776 Ibyte *str = alloca_ibytes (nels * MAX_ICHAR_LEN + 1); |
867 | 1777 Ibyte *strorig = str; |
771 | 1778 Bytecount len; |
1779 | |
1780 int i; | |
1781 | |
1782 for (i = 0; i < nels; i++) | |
867 | 1783 str += set_itext_ichar (str, arr[i]); |
771 | 1784 *str = '\0'; |
1785 len = str - strorig; | |
2367 | 1786 str = xnew_ibytes (1 + len); |
771 | 1787 memcpy (str, strorig, 1 + len); |
1788 if (len_out) | |
1789 *len_out = len; | |
1790 return str; | |
1791 } | |
1792 | |
826 | 1793 #define COPY_TEXT_BETWEEN_FORMATS(srcfmt, dstfmt) \ |
1794 do \ | |
1795 { \ | |
1796 if (dst) \ | |
1797 { \ | |
867 | 1798 Ibyte *dstend = dst + dstlen; \ |
1799 Ibyte *dstp = dst; \ | |
1800 const Ibyte *srcend = src + srclen; \ | |
1801 const Ibyte *srcp = src; \ | |
826 | 1802 \ |
1803 while (srcp < srcend) \ | |
1804 { \ | |
867 | 1805 Ichar ch = itext_ichar_fmt (srcp, srcfmt, srcobj); \ |
1806 Bytecount len = ichar_len_fmt (ch, dstfmt); \ | |
826 | 1807 \ |
1808 if (dstp + len <= dstend) \ | |
1809 { \ | |
2956 | 1810 (void) set_itext_ichar_fmt (dstp, ch, dstfmt, dstobj); \ |
826 | 1811 dstp += len; \ |
1812 } \ | |
1813 else \ | |
1814 break; \ | |
867 | 1815 INC_IBYTEPTR_FMT (srcp, srcfmt); \ |
826 | 1816 } \ |
1817 text_checking_assert (srcp <= srcend); \ | |
1818 if (src_used) \ | |
1819 *src_used = srcp - src; \ | |
1820 return dstp - dst; \ | |
1821 } \ | |
1822 else \ | |
1823 { \ | |
867 | 1824 const Ibyte *srcend = src + srclen; \ |
1825 const Ibyte *srcp = src; \ | |
826 | 1826 Bytecount total = 0; \ |
1827 \ | |
1828 while (srcp < srcend) \ | |
1829 { \ | |
867 | 1830 total += ichar_len_fmt (itext_ichar_fmt (srcp, srcfmt, \ |
826 | 1831 srcobj), dstfmt); \ |
867 | 1832 INC_IBYTEPTR_FMT (srcp, srcfmt); \ |
826 | 1833 } \ |
1834 text_checking_assert (srcp == srcend); \ | |
1835 if (src_used) \ | |
1836 *src_used = srcp - src; \ | |
1837 return total; \ | |
1838 } \ | |
1839 } \ | |
1840 while (0) | |
1841 | |
1842 /* Copy as much text from SRC/SRCLEN to DST/DSTLEN as will fit, converting | |
1843 from SRCFMT/SRCOBJ to DSTFMT/DSTOBJ. Return number of bytes stored into | |
1844 DST as return value, and number of bytes copied from SRC through | |
1845 SRC_USED (if not NULL). If DST is NULL, don't actually store anything | |
1846 and just return the size needed to store all the text. Will not copy | |
1847 partial characters into DST. */ | |
1848 | |
1849 Bytecount | |
867 | 1850 copy_text_between_formats (const Ibyte *src, Bytecount srclen, |
826 | 1851 Internal_Format srcfmt, |
2333 | 1852 Lisp_Object USED_IF_MULE (srcobj), |
867 | 1853 Ibyte *dst, Bytecount dstlen, |
826 | 1854 Internal_Format dstfmt, |
2333 | 1855 Lisp_Object USED_IF_MULE (dstobj), |
826 | 1856 Bytecount *src_used) |
1857 { | |
1858 if (srcfmt == dstfmt && | |
1859 objects_have_same_internal_representation (srcobj, dstobj)) | |
1860 { | |
1861 if (dst) | |
1862 { | |
1863 srclen = min (srclen, dstlen); | |
867 | 1864 srclen = validate_ibyte_string_backward (src, srclen); |
826 | 1865 memcpy (dst, src, srclen); |
1866 if (src_used) | |
1867 *src_used = srclen; | |
1868 return srclen; | |
1869 } | |
1870 else | |
1871 return srclen; | |
1872 } | |
1873 /* Everything before the final else statement is an optimization. | |
1874 The inner loops inside COPY_TEXT_BETWEEN_FORMATS() have a number | |
1875 of calls to *_fmt(), each of which has a switch statement in it. | |
1876 By using constants as the FMT argument, these switch statements | |
1877 will be optimized out of existence. */ | |
1878 #define ELSE_FORMATS(fmt1, fmt2) \ | |
1879 else if (srcfmt == fmt1 && dstfmt == fmt2) \ | |
1880 COPY_TEXT_BETWEEN_FORMATS (fmt1, fmt2) | |
1881 ELSE_FORMATS (FORMAT_DEFAULT, FORMAT_8_BIT_FIXED); | |
1882 ELSE_FORMATS (FORMAT_8_BIT_FIXED, FORMAT_DEFAULT); | |
1883 ELSE_FORMATS (FORMAT_DEFAULT, FORMAT_32_BIT_FIXED); | |
1884 ELSE_FORMATS (FORMAT_32_BIT_FIXED, FORMAT_DEFAULT); | |
1885 else | |
1886 COPY_TEXT_BETWEEN_FORMATS (srcfmt, dstfmt); | |
1887 #undef ELSE_FORMATS | |
1888 } | |
1889 | |
1890 /* Copy as much buffer text in BUF, starting at POS, of length LEN, as will | |
1891 fit into DST/DSTLEN, converting to DSTFMT. Return number of bytes | |
1892 stored into DST as return value, and number of bytes copied from BUF | |
1893 through SRC_USED (if not NULL). If DST is NULL, don't actually store | |
1894 anything and just return the size needed to store all the text. */ | |
1895 | |
1896 Bytecount | |
1897 copy_buffer_text_out (struct buffer *buf, Bytebpos pos, | |
867 | 1898 Bytecount len, Ibyte *dst, Bytecount dstlen, |
826 | 1899 Internal_Format dstfmt, Lisp_Object dstobj, |
1900 Bytecount *src_used) | |
1901 { | |
1902 Bytecount dst_used = 0; | |
1903 if (src_used) | |
1904 *src_used = 0; | |
1905 | |
1906 { | |
1907 BUFFER_TEXT_LOOP (buf, pos, len, runptr, runlen) | |
1908 { | |
1909 Bytecount the_src_used, the_dst_used; | |
1910 | |
1911 the_dst_used = copy_text_between_formats (runptr, runlen, | |
1912 BUF_FORMAT (buf), | |
1913 wrap_buffer (buf), | |
1914 dst, dstlen, dstfmt, | |
1915 dstobj, &the_src_used); | |
1916 dst_used += the_dst_used; | |
1917 if (src_used) | |
1918 *src_used += the_src_used; | |
1919 if (dst) | |
1920 { | |
1921 dst += the_dst_used; | |
1922 dstlen -= the_dst_used; | |
841 | 1923 /* Stop if we didn't use all of the source text. Also stop |
1924 if the destination is full. We need the first test because | |
1925 there might be a couple bytes left in the destination, but | |
1926 not enough to fit a full character. The first test will in | |
1927 fact catch the vast majority of cases where the destination | |
1928 is full, too -- but in case the destination holds *exactly* | |
1929 the run length, we put in the second check. (It shouldn't | |
1930 really matter though -- next time through we'll just get a | |
1931 0.) */ | |
1932 if (the_src_used < runlen || !dstlen) | |
826 | 1933 break; |
1934 } | |
1935 } | |
1936 } | |
1937 | |
1938 return dst_used; | |
1939 } | |
1940 | |
771 | 1941 |
1942 /************************************************************************/ | |
1943 /* charset properties of strings */ | |
1944 /************************************************************************/ | |
1945 | |
1946 void | |
2333 | 1947 find_charsets_in_ibyte_string (unsigned char *charsets, |
1948 const Ibyte *USED_IF_MULE (str), | |
1949 Bytecount USED_IF_MULE (len)) | |
771 | 1950 { |
1951 #ifndef MULE | |
1952 /* Telescope this. */ | |
1953 charsets[0] = 1; | |
1954 #else | |
867 | 1955 const Ibyte *strend = str + len; |
771 | 1956 memset (charsets, 0, NUM_LEADING_BYTES); |
1957 | |
1958 /* #### SJT doesn't like this. */ | |
1959 if (len == 0) | |
1960 { | |
1961 charsets[XCHARSET_LEADING_BYTE (Vcharset_ascii) - MIN_LEADING_BYTE] = 1; | |
1962 return; | |
1963 } | |
1964 | |
1965 while (str < strend) | |
1966 { | |
867 | 1967 charsets[ichar_leading_byte (itext_ichar (str)) - MIN_LEADING_BYTE] = |
771 | 1968 1; |
867 | 1969 INC_IBYTEPTR (str); |
771 | 1970 } |
1971 #endif | |
1972 } | |
1973 | |
1974 void | |
2333 | 1975 find_charsets_in_ichar_string (unsigned char *charsets, |
1976 const Ichar *USED_IF_MULE (str), | |
1977 Charcount USED_IF_MULE (len)) | |
771 | 1978 { |
1979 #ifndef MULE | |
1980 /* Telescope this. */ | |
1981 charsets[0] = 1; | |
1982 #else | |
1983 int i; | |
1984 | |
1985 memset (charsets, 0, NUM_LEADING_BYTES); | |
1986 | |
1987 /* #### SJT doesn't like this. */ | |
1988 if (len == 0) | |
1989 { | |
1990 charsets[XCHARSET_LEADING_BYTE (Vcharset_ascii) - MIN_LEADING_BYTE] = 1; | |
1991 return; | |
1992 } | |
1993 | |
1994 for (i = 0; i < len; i++) | |
1995 { | |
867 | 1996 charsets[ichar_leading_byte (str[i]) - MIN_LEADING_BYTE] = 1; |
771 | 1997 } |
1998 #endif | |
1999 } | |
2000 | |
/* A couple of the functions below should only be called on a Mule build.
   ASSERT_BUILT_WITH_MULE() is a no-op assertion when MULE is defined and an
   assertion that always fails when it is not. */
#ifdef MULE
#define ASSERT_BUILT_WITH_MULE() assert (1)
#else /* not MULE */
#define ASSERT_BUILT_WITH_MULE() assert (0)
#endif /* not MULE */
2007 | |
771 | 2008 int |
867 | 2009 ibyte_string_displayed_columns (const Ibyte *str, Bytecount len) |
771 | 2010 { |
2011 int cols = 0; | |
867 | 2012 const Ibyte *end = str + len; |
3571 | 2013 Ichar ch; |
2014 | |
2015 ASSERT_BUILT_WITH_MULE(); | |
771 | 2016 |
2017 while (str < end) | |
2018 { | |
3571 | 2019 ch = itext_ichar (str); |
867 | 2020 cols += XCHARSET_COLUMNS (ichar_charset (ch)); |
2021 INC_IBYTEPTR (str); | |
771 | 2022 } |
2023 | |
2024 return cols; | |
2025 } | |
2026 | |
2027 int | |
3571 | 2028 ichar_string_displayed_columns (const Ichar * USED_IF_MULE(str), Charcount len) |
771 | 2029 { |
2030 int cols = 0; | |
2031 int i; | |
2032 | |
3571 | 2033 ASSERT_BUILT_WITH_MULE(); |
2034 | |
771 | 2035 for (i = 0; i < len; i++) |
867 | 2036 cols += XCHARSET_COLUMNS (ichar_charset (str[i])); |
771 | 2037 |
2038 return cols; | |
2039 } | |
2040 | |
2041 Charcount | |
2333 | 2042 ibyte_string_nonascii_chars (const Ibyte *USED_IF_MULE (str), |
2043 Bytecount USED_IF_MULE (len)) | |
771 | 2044 { |
2045 #ifdef MULE | |
867 | 2046 const Ibyte *end = str + len; |
771 | 2047 Charcount retval = 0; |
2048 | |
2049 while (str < end) | |
2050 { | |
826 | 2051 if (!byte_ascii_p (*str)) |
771 | 2052 retval++; |
867 | 2053 INC_IBYTEPTR (str); |
771 | 2054 } |
2055 | |
2056 return retval; | |
2057 #else | |
2058 return 0; | |
2059 #endif | |
2060 } | |
2061 | |
2062 | |
2063 /***************************************************************************/ | |
2064 /* Eistring helper functions */ | |
2065 /***************************************************************************/ | |
2066 | |
2067 int | |
867 | 2068 eistr_casefiddle_1 (Ibyte *olddata, Bytecount len, Ibyte *newdata, |
771 | 2069 int downp) |
2070 { | |
867 | 2071 Ibyte *endp = olddata + len; |
2072 Ibyte *newp = newdata; | |
771 | 2073 int changedp = 0; |
2074 | |
2075 while (olddata < endp) | |
2076 { | |
867 | 2077 Ichar c = itext_ichar (olddata); |
2078 Ichar newc; | |
771 | 2079 |
2080 if (downp) | |
2081 newc = DOWNCASE (0, c); | |
2082 else | |
2083 newc = UPCASE (0, c); | |
2084 | |
2085 if (c != newc) | |
2086 changedp = 1; | |
2087 | |
867 | 2088 newp += set_itext_ichar (newp, newc); |
2089 INC_IBYTEPTR (olddata); | |
771 | 2090 } |
2091 | |
2092 *newp = '\0'; | |
2093 | |
2094 return changedp ? newp - newdata : 0; | |
2095 } | |
2096 | |
/* Return a buffer size of at least NEEDED_SIZE, reached by repeatedly
   growing OLDBUFSIZE by 50% (with a floor of 32).  OLDBUFSIZE is returned
   unchanged when it is already large enough.

   The growth is computed so that it cannot overflow: if another 50% step
   would exceed INT_MAX, INT_MAX itself is returned, which is necessarily
   >= NEEDED_SIZE. */
int
eifind_large_enough_buffer (int oldbufsize, int needed_size)
{
  while (oldbufsize < needed_size)
    {
      int grown;

      if (oldbufsize <= 0)
	/* Growing a non-positive size never gets above the floor. */
	grown = 0;
      else if (oldbufsize > INT_MAX - oldbufsize / 2)
	/* oldbufsize * 3 / 2 is not representable; saturate. */
	return INT_MAX;
      else
	/* Same value as oldbufsize * 3 / 2, without the large
	   intermediate product. */
	grown = oldbufsize + oldbufsize / 2;

      oldbufsize = grown < 32 ? 32 : grown;
    }

  return oldbufsize;
}
2108 | |
2109 void | |
2110 eito_malloc_1 (Eistring *ei) | |
2111 { | |
2112 if (ei->mallocp_) | |
2113 return; | |
2114 ei->mallocp_ = 1; | |
2115 if (ei->data_) | |
2116 { | |
867 | 2117 Ibyte *newdata; |
771 | 2118 |
2119 ei->max_size_allocated_ = | |
2120 eifind_large_enough_buffer (0, ei->bytelen_ + 1); | |
2367 | 2121 newdata = xnew_ibytes (ei->max_size_allocated_); |
771 | 2122 memcpy (newdata, ei->data_, ei->bytelen_ + 1); |
2123 ei->data_ = newdata; | |
2124 } | |
2125 | |
2126 if (ei->extdata_) | |
2127 { | |
2367 | 2128 Extbyte *newdata = xnew_extbytes (ei->extlen_ + 2); |
771 | 2129 |
2130 memcpy (newdata, ei->extdata_, ei->extlen_); | |
2131 /* Double null-terminate in case of Unicode data */ | |
2132 newdata[ei->extlen_] = '\0'; | |
2133 newdata[ei->extlen_ + 1] = '\0'; | |
2134 ei->extdata_ = newdata; | |
2135 } | |
2136 } | |
2137 | |
2138 int | |
2139 eicmp_1 (Eistring *ei, Bytecount off, Charcount charoff, | |
867 | 2140 Bytecount len, Charcount charlen, const Ibyte *data, |
2421 | 2141 const Eistring *ei2, int is_ascii, int fold_case) |
771 | 2142 { |
3462 | 2143 assert ((data == 0) != (ei == 0)); |
2144 assert ((is_ascii != 0) == (data != 0)); | |
2145 assert (fold_case >= 0 && fold_case <= 2); | |
771 | 2146 assert ((off < 0) != (charoff < 0)); |
3462 | 2147 |
771 | 2148 if (off < 0) |
2149 { | |
2150 off = charcount_to_bytecount (ei->data_, charoff); | |
2151 if (charlen < 0) | |
2152 len = -1; | |
2153 else | |
2154 len = charcount_to_bytecount (ei->data_ + off, charlen); | |
2155 } | |
2156 if (len < 0) | |
2157 len = ei->bytelen_ - off; | |
2158 | |
2159 assert (off >= 0 && off <= ei->bytelen_); | |
2160 assert (len >= 0 && off + len <= ei->bytelen_); | |
2161 | |
2162 { | |
2163 Bytecount dstlen; | |
867 | 2164 const Ibyte *src = ei->data_, *dst; |
771 | 2165 |
2166 if (data) | |
2167 { | |
2168 dst = data; | |
2169 dstlen = qxestrlen (data); | |
2170 } | |
2171 else | |
2172 { | |
2173 dst = ei2->data_; | |
2174 dstlen = ei2->bytelen_; | |
2175 } | |
2176 | |
2421 | 2177 if (is_ascii) |
2367 | 2178 ASSERT_ASCTEXT_ASCII_LEN ((Ascbyte *) dst, dstlen); |
771 | 2179 |
801 | 2180 return (fold_case == 0 ? qxememcmp4 (src, len, dst, dstlen) : |
2181 fold_case == 1 ? qxememcasecmp4 (src, len, dst, dstlen) : | |
2182 qxetextcasecmp (src, len, dst, dstlen)); | |
771 | 2183 } |
2184 } | |
2185 | |
867 | 2186 Ibyte * |
826 | 2187 eicpyout_malloc_fmt (Eistring *eistr, Bytecount *len_out, Internal_Format fmt, |
2286 | 2188 Lisp_Object UNUSED (object)) |
771 | 2189 { |
867 | 2190 Ibyte *ptr; |
771 | 2191 |
2192 assert (fmt == FORMAT_DEFAULT); | |
867 | 2193 ptr = xnew_array (Ibyte, eistr->bytelen_ + 1); |
771 | 2194 if (len_out) |
2195 *len_out = eistr->bytelen_; | |
2196 memcpy (ptr, eistr->data_, eistr->bytelen_ + 1); | |
2197 return ptr; | |
2198 } | |
2199 | |
2200 | |
2201 /************************************************************************/ | |
2202 /* Charcount/Bytecount conversion */ | |
2203 /************************************************************************/ | |
2204 | |
2205 /* Optimization. Do it. Live it. Love it. */ | |
2206 | |
2207 #ifdef MULE | |
2208 | |
/* STRIDE_TYPE is the integer type used to examine several bytes of text at
   once: the 128-bit or 64-bit type when the corresponding EFFICIENT_INT_*
   macro is defined, otherwise the 32-bit type.  HIGH_BIT_MASK has the top
   bit of every byte of that type set, so a word ANDed with it is zero
   exactly when all of its bytes are below 0x80. */
#ifdef EFFICIENT_INT_128_BIT
# define STRIDE_TYPE INT_128_BIT
# define HIGH_BIT_MASK \
    MAKE_128_BIT_UNSIGNED_CONSTANT (0x80808080808080808080808080808080)
#elif defined (EFFICIENT_INT_64_BIT)
# define STRIDE_TYPE INT_64_BIT
# define HIGH_BIT_MASK MAKE_64_BIT_UNSIGNED_CONSTANT (0x8080808080808080)
#else
# define STRIDE_TYPE INT_32_BIT
# define HIGH_BIT_MASK MAKE_32_BIT_UNSIGNED_CONSTANT (0x80808080)
#endif

/* Low address bits that must be clear for a STRIDE_TYPE access. */
#define ALIGN_BITS ((EMACS_UINT) (ALIGNOF (STRIDE_TYPE) - 1))
#define ALIGN_MASK (~ ALIGN_BITS)
/* Non-zero if PTR is suitably aligned for STRIDE_TYPE.  The argument is
   parenthesized so that the cast applies to the whole expression. */
#define ALIGNED(ptr) ((((EMACS_UINT) (ptr)) & ALIGN_BITS) == 0)
/* Number of bytes examined per step. */
#define STRIDE (sizeof (STRIDE_TYPE))
2225 | |
2367 | 2226 /* Skip as many ASCII bytes as possible in the memory block [PTR, END). |
2227 Return pointer to the first non-ASCII byte. optimized for long | |
2228 stretches of ASCII. */ | |
2229 inline static const Ibyte * | |
2230 skip_ascii (const Ibyte *ptr, const Ibyte *end) | |
2231 { | |
826 | 2232 const unsigned STRIDE_TYPE *ascii_end; |
2233 | |
2234 /* Need to do in 3 sections -- before alignment start, aligned chunk, | |
2235 after alignment end. */ | |
2236 while (!ALIGNED (ptr)) | |
771 | 2237 { |
826 | 2238 if (ptr == end || !byte_ascii_p (*ptr)) |
2239 return ptr; | |
2240 ptr++; | |
2241 } | |
2242 ascii_end = (const unsigned STRIDE_TYPE *) ptr; | |
2243 /* This loop screams, because we can detect ASCII | |
2244 characters 4 or 8 at a time. */ | |
867 | 2245 while ((const Ibyte *) ascii_end + STRIDE <= end |
826 | 2246 && !(*ascii_end & HIGH_BIT_MASK)) |
2247 ascii_end++; | |
867 | 2248 ptr = (Ibyte *) ascii_end; |
826 | 2249 while (ptr < end && byte_ascii_p (*ptr)) |
2250 ptr++; | |
2251 return ptr; | |
2252 } | |
2253 | |
2367 | 2254 /* Skip as many ASCII bytes as possible in the memory block [END, PTR), |
2255 going downwards. Return pointer to the location above the first | |
2256 non-ASCII byte. Optimized for long stretches of ASCII. */ | |
2257 inline static const Ibyte * | |
2258 skip_ascii_down (const Ibyte *ptr, const Ibyte *end) | |
2259 { | |
2260 const unsigned STRIDE_TYPE *ascii_end; | |
2261 | |
2262 /* Need to do in 3 sections -- before alignment start, aligned chunk, | |
2263 after alignment end. */ | |
2264 while (!ALIGNED (ptr)) | |
2265 { | |
2266 if (ptr == end || !byte_ascii_p (*(ptr - 1))) | |
2267 return ptr; | |
2268 ptr--; | |
2269 } | |
2270 ascii_end = (const unsigned STRIDE_TYPE *) ptr - 1; | |
2271 /* This loop screams, because we can detect ASCII | |
2272 characters 4 or 8 at a time. */ | |
2273 while ((const Ibyte *) ascii_end >= end | |
2274 && !(*ascii_end & HIGH_BIT_MASK)) | |
2275 ascii_end--; | |
2276 ptr = (Ibyte *) (ascii_end + 1); | |
2277 while (ptr > end && byte_ascii_p (*(ptr - 1))) | |
2278 ptr--; | |
2279 return ptr; | |
2280 } | |
2281 | |
826 | 2282 /* Function equivalents of bytecount_to_charcount/charcount_to_bytecount. |
2283 These work on strings of all sizes but are more efficient than a simple | |
2284 loop on large strings and probably less efficient on sufficiently small | |
2285 strings. */ | |
2286 | |
2287 Charcount | |
867 | 2288 bytecount_to_charcount_fun (const Ibyte *ptr, Bytecount len) |
826 | 2289 { |
2290 Charcount count = 0; | |
867 | 2291 const Ibyte *end = ptr + len; |
826 | 2292 while (1) |
2293 { | |
867 | 2294 const Ibyte *newptr = skip_ascii (ptr, end); |
826 | 2295 count += newptr - ptr; |
2296 ptr = newptr; | |
2297 if (ptr == end) | |
2298 break; | |
2299 { | |
2300 /* Optimize for successive characters from the same charset */ | |
867 | 2301 Ibyte leading_byte = *ptr; |
826 | 2302 int bytes = rep_bytes_by_first_byte (leading_byte); |
2303 while (ptr < end && *ptr == leading_byte) | |
2304 ptr += bytes, count++; | |
2305 } | |
771 | 2306 } |
2307 | |
2308 /* Bomb out if the specified substring ends in the middle | |
2309 of a character. Note that we might have already gotten | |
2310 a core dump above from an invalid reference, but at least | |
2311 we will get no farther than here. | |
2312 | |
2313 This also catches len < 0. */ | |
800 | 2314 text_checking_assert (ptr == end); |
771 | 2315 |
2316 return count; | |
2317 } | |
2318 | |
2319 Bytecount | |
867 | 2320 charcount_to_bytecount_fun (const Ibyte *ptr, Charcount len) |
771 | 2321 { |
867 | 2322 const Ibyte *newptr = ptr; |
826 | 2323 while (1) |
771 | 2324 { |
867 | 2325 const Ibyte *newnewptr = skip_ascii (newptr, newptr + len); |
826 | 2326 len -= newnewptr - newptr; |
2327 newptr = newnewptr; | |
2328 if (!len) | |
2329 break; | |
2330 { | |
2331 /* Optimize for successive characters from the same charset */ | |
867 | 2332 Ibyte leading_byte = *newptr; |
826 | 2333 int bytes = rep_bytes_by_first_byte (leading_byte); |
2334 while (len > 0 && *newptr == leading_byte) | |
2335 newptr += bytes, len--; | |
2336 } | |
771 | 2337 } |
2338 return newptr - ptr; | |
2339 } | |
2340 | |
2367 | 2341 /* Function equivalent of charcount_to_bytecount_down. This works on strings |
2342 of all sizes but is more efficient than a simple loop on large strings | |
2343 and probably less efficient on sufficiently small strings. */ | |
2344 | |
2345 Bytecount | |
2346 charcount_to_bytecount_down_fun (const Ibyte *ptr, Charcount len) | |
2347 { | |
2348 const Ibyte *newptr = ptr; | |
2349 while (1) | |
2350 { | |
2351 const Ibyte *newnewptr = skip_ascii_down (newptr, newptr - len); | |
2352 len -= newptr - newnewptr; | |
2353 newptr = newnewptr; | |
2354 /* Skip over all non-ASCII chars, counting the length and | |
2355 stopping if it's zero */ | |
2356 while (len && !byte_ascii_p (*(newptr - 1))) | |
2357 if (ibyte_first_byte_p (*--newptr)) | |
2358 len--; | |
2359 if (!len) | |
2360 break; | |
2361 } | |
2362 text_checking_assert (ptr - newptr >= 0); | |
2363 return ptr - newptr; | |
2364 } | |
2365 | |
771 | 2366 /* The next two functions are the actual meat behind the |
2367 charbpos-to-bytebpos and bytebpos-to-charbpos conversions. Currently | |
2368 the method they use is fairly unsophisticated; see buffer.h. | |
2369 | |
2370 Note that charbpos_to_bytebpos_func() is probably the most-called | |
2371 function in all of XEmacs. Therefore, it must be FAST FAST FAST. | |
2372 This is the reason why so much of the code is duplicated. | |
2373 | |
2374 Similar considerations apply to bytebpos_to_charbpos_func(), although | |
2375 less so because the function is not called so often. | |
2367 | 2376 */ |
2377 | |
2378 /* | |
2379 | |
2380 Info on Byte-Char conversion: | |
2381 | |
2382 (Info-goto-node "(internals)Byte-Char Position Conversion") | |
2383 */ | |
2384 | |
/* Counter used only by the OLD_BYTE_CHAR conversion code: it is advanced by
   a fixed step and masked to pick which cached position to overwrite. */
2385 #ifdef OLD_BYTE_CHAR | |
771 | 2386 static int not_very_random_number; |
2367 | 2387 #endif /* OLD_BYTE_CHAR */ |
2388 | |
/* When defined, charbpos_to_bytebpos_func() walks one character at a time
   with INC_BYTEBPOS()/DEC_BYTEBPOS() instead of using
   charcount_to_bytecount()/charcount_to_bytecount_down(). */
2389 #define OLD_LOOP | |
2390 | |
2391 /* If we are this many characters away from any known position, cache the | |
2392 new position in the buffer's char-byte cache. */ | |
2393 #define FAR_AWAY_DISTANCE 5000 | |
2394 | |
2395 /* Converting between character positions and byte positions. */ | |
2396 | |
2397 /* There are several places in the buffer where we know | |
2398 the correspondence: BEG, BEGV, PT, GPT, ZV and Z, | |
2399 and everywhere there is a marker. So we find the one of these places | |
2400 that is closest to the specified position, and scan from there. */ | |
2401 | |
/* Helper for charbpos_to_bytebpos_func: take into account one known
   (CHARPOS, BYTEPOS) correspondence.  It expects the locals x, retval,
   best_above, best_above_byte, best_below and best_below_byte and the label
   `done' in the enclosing function.  BYTEPOS is deliberately evaluated only
   on the paths that actually store its value. */

#define CONSIDER(CHARPOS, BYTEPOS)					\
do									\
  {									\
    Charbpos known_charpos = (CHARPOS);					\
    int narrowed = 0;							\
									\
    /* Exact hit: nothing left to compute. */				\
    if (known_charpos == x)						\
      {									\
	retval = (BYTEPOS);						\
	goto done;							\
      }									\
									\
    if (known_charpos < x)						\
      {									\
	/* Candidate for a tighter lower bound. */			\
	if (known_charpos > best_below)					\
	  {								\
	    best_below = known_charpos;					\
	    best_below_byte = (BYTEPOS);				\
	    narrowed = 1;						\
	  }								\
      }									\
    else if (known_charpos < best_above)				\
      {									\
	/* Candidate for a tighter upper bound. */			\
	best_above = known_charpos;					\
	best_above_byte = (BYTEPOS);					\
	narrowed = 1;							\
      }									\
									\
    /* If the bounds are as far apart in bytes as in characters, every	\
       character between them is one byte long: interpolate. */		\
    if (narrowed							\
	&& best_above - best_below == best_above_byte - best_below_byte) \
      {									\
	retval = best_below_byte + (x - best_below);			\
	goto done;							\
      }									\
  }									\
while (0)
2443 | |
771 | 2444 |
/* Return the byte position in BUF that corresponds to character position X.

   The search starts from the known correspondences (BEG and Z, then PT,
   GPT, BEGV, ZV, the single cached pair and the entries of the position
   cache, newest first), keeping the closest one below and the closest one
   above X.  If X is one of them, or the two bounds turn out to be separated
   only by one-byte characters, the answer is produced directly by
   CONSIDER().  Otherwise the text is scanned from whichever bound is closer
   to X.  The result is always stored in cached_charpos/cached_bytepos, and
   additionally appended to the position cache when the scan covered more
   than FAR_AWAY_DISTANCE characters. */
2445 Bytebpos | |
2446 charbpos_to_bytebpos_func (struct buffer *buf, Charbpos x) | |
2447 { | |
2367 | 2448 #ifdef OLD_BYTE_CHAR |
771 | 2449 Charbpos bufmin; |
2450 Charbpos bufmax; | |
2451 Bytebpos bytmin; | |
2452 Bytebpos bytmax; | |
2453 int size; | |
2454 int forward_p; | |
2455 int diff_so_far; | |
2456 int add_to_cache = 0; | |
2367 | 2457 #endif /* OLD_BYTE_CHAR */ |
2458 | |
2459 Charbpos best_above, best_below; | |
2460 Bytebpos best_above_byte, best_below_byte; | |
2461 int i; | |
2462 struct buffer_text *t; | |
2463 Bytebpos retval; | |
2464 | |
1292 | 2465 PROFILE_DECLARE (); |
771 | 2466 |
1292 | 2467 PROFILE_RECORD_ENTERING_SECTION (QSin_char_byte_conversion); |
2468 | |
2367 | 2469 best_above = BUF_Z (buf); |
2470 best_above_byte = BYTE_BUF_Z (buf); | |
2471 | |
2472 /* In this case, we simply have all one-byte characters. But this should | |
2473 have been intercepted before, in charbpos_to_bytebpos(). */ | |
2474 text_checking_assert (best_above != best_above_byte); | |
2475 | |
2476 best_below = BUF_BEG (buf); | |
2477 best_below_byte = BYTE_BUF_BEG (buf); | |
2478 | |
2479 /* We find in best_above and best_above_byte | |
2480 the closest known point above CHARPOS, | |
2481 and in best_below and best_below_byte | |
2482 the closest known point below CHARPOS, | |
2483 | |
2484 If at any point we can tell that the space between those | |
2485 two best approximations is all single-byte, | |
2486 we interpolate the result immediately. */ | |
2487 | |
2488 CONSIDER (BUF_PT (buf), BYTE_BUF_PT (buf)); | |
2489 CONSIDER (BUF_GPT (buf), BYTE_BUF_GPT (buf)); | |
2490 CONSIDER (BUF_BEGV (buf), BYTE_BUF_BEGV (buf)); | |
2491 CONSIDER (BUF_ZV (buf), BYTE_BUF_ZV (buf)); | |
2492 | |
2493 t = buf->text; | |
2494 CONSIDER (t->cached_charpos, t->cached_bytepos); | |
2495 | |
2496 /* Check the most recently entered positions first */ | |
2497 | |
2498 for (i = t->next_cache_pos - 1; i >= 0; i--) | |
2499 { | |
2500 CONSIDER (t->mule_charbpos_cache[i], t->mule_bytebpos_cache[i]); | |
2501 | |
2502 /* If we are down to a range of 50 chars, | |
2503 don't bother checking any other markers; | |
2504 scan the intervening chars directly now. */ | |
2505 if (best_above - best_below < 50) | |
2506 break; | |
2507 } | |
2508 | |
2509 /* We get here if we did not exactly hit one of the known places. | |
2510 We have one known above and one known below. | |
2511 Scan, counting characters, from whichever one is closer. */ | |
2512 | |
2513 if (x - best_below < best_above - x) | |
2514 { | |
2515 int record = x - best_below > FAR_AWAY_DISTANCE; | |
2516 | |
2517 #ifdef OLD_LOOP /* old code */ | |
2518 while (best_below != x) | |
2519 { | |
2520 best_below++; | |
2521 INC_BYTEBPOS (buf, best_below_byte); | |
2522 } | |
2523 #else | |
2524 text_checking_assert (BUF_FORMAT (buf) == FORMAT_DEFAULT); | |
2525 /* The gap should not occur between best_below and x, or we will be | |
2526 screwed in using charcount_to_bytecount(). It should not be exactly | |
2527 at x either, because we already should have caught that. */ | |
2528 text_checking_assert | |
2529 (BUF_CEILING_OF_IGNORE_ACCESSIBLE (buf, best_below) > x); | |
2530 | |
2531 /* Using charcount_to_bytecount() is potentially a lot faster than a | |
2532 simple loop using INC_BYTEBPOS() because (a) the checks for gap | |
2533 and buffer format are factored out instead of getting checked | |
2534 every time; (b) the checking goes 4 or 8 bytes at a time in ASCII | |
2535 text. | |
2536 */ | |
2537 best_below_byte += | |
2538 charcount_to_bytecount | |
2539 (BYTE_BUF_BYTE_ADDRESS (buf, best_below_byte), x - best_below); | |
2540 best_below = x; | |
2541 #endif /* OLD_LOOP */ | |
2542 | |
2543 /* If this position is quite far from the nearest known position, | |
2544 cache the correspondence. | |
2545 | |
2546 NB FSF does this: "... by creating a marker here. | |
2547 It will last until the next GC." | |
2548 */ | |
2549 | |
2550 if (record) | |
2551 { | |
2552 /* If we have run out of positions to record, discard some of the | |
2553 old ones. I used to use a circular buffer, which avoids the | |
2554 need to block-move any memory. But it makes it more difficult | |
2555 to keep track of which positions haven't been used -- commonly | |
2556 we haven't yet filled out anywhere near the whole set of | |
2557 positions and don't want to check them all. We should not be | |
2558 recording that often, and block-moving is extremely fast in | |
2559 any case. --ben */ | |
2560 if (t->next_cache_pos == NUM_CACHED_POSITIONS) | |
2561 { | |
2562 memmove (t->mule_charbpos_cache, | |
2563 t->mule_charbpos_cache + NUM_MOVED_POSITIONS, | |
2564 sizeof (Charbpos) * | |
2565 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS)); | |
2566 memmove (t->mule_bytebpos_cache, | |
2567 t->mule_bytebpos_cache + NUM_MOVED_POSITIONS, | |
2568 sizeof (Bytebpos) * | |
2569 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS)); | |
2570 t->next_cache_pos -= NUM_MOVED_POSITIONS; | |
2571 } | |
2572 t->mule_charbpos_cache[t->next_cache_pos] = best_below; | |
2573 t->mule_bytebpos_cache[t->next_cache_pos] = best_below_byte; | |
2574 t->next_cache_pos++; | |
2575 } | |
2576 | |
2577 t->cached_charpos = best_below; | |
2578 t->cached_bytepos = best_below_byte; | |
2579 | |
2580 retval = best_below_byte; | |
2581 text_checking_assert (best_below_byte >= best_below); | |
2582 goto done; | |
2583 } | |
2584 else | |
2585 { | |
2586 int record = best_above - x > FAR_AWAY_DISTANCE; | |
2587 | |
2588 #ifdef OLD_LOOP | |
2589 while (best_above != x) | |
2590 { | |
2591 best_above--; | |
2592 DEC_BYTEBPOS (buf, best_above_byte); | |
2593 } | |
2594 #else | |
2595 text_checking_assert (BUF_FORMAT (buf) == FORMAT_DEFAULT); | |
2596 /* The gap should not occur between best_above and x, or we will be | |
2597 screwed in using charcount_to_bytecount_down(). It should not be | |
2598 exactly at x either, because we already should have caught | |
2599 that. */ | |
2600 text_checking_assert | |
2601 (BUF_FLOOR_OF_IGNORE_ACCESSIBLE (buf, best_above) < x); | |
2602 | |
2603 /* Using charcount_to_bytecount_down() is potentially a lot faster | |
2604 than a simple loop using DEC_BYTEBPOS(); see above. */ | |
2605 best_above_byte -= | |
2606 charcount_to_bytecount_down | |
2607 /* BYTE_BUF_BYTE_ADDRESS will return a value on the high side of the | |
2608 gap if we are at the gap, which is the wrong side. So do the | |
2609 following trick instead. */ | |
2610 (BYTE_BUF_BYTE_ADDRESS_BEFORE (buf, best_above_byte) + 1, | |
2611 best_above - x); | |
2612 best_above = x; | |
2613 #endif /* OLD_LOOP */ | |
2614 | |
2615 | |
2616 /* If this position is quite far from the nearest known position, | |
2617 cache the correspondence. | |
2618 | |
2619 NB FSF does this: "... by creating a marker here. | |
2620 It will last until the next GC." | |
2621 */ | |
2622 if (record) | |
2623 { | |
2624 if (t->next_cache_pos == NUM_CACHED_POSITIONS) | |
2625 { | |
2626 memmove (t->mule_charbpos_cache, | |
2627 t->mule_charbpos_cache + NUM_MOVED_POSITIONS, | |
2628 sizeof (Charbpos) * | |
2629 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS)); | |
2630 memmove (t->mule_bytebpos_cache, | |
2631 t->mule_bytebpos_cache + NUM_MOVED_POSITIONS, | |
2632 sizeof (Bytebpos) * | |
2633 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS)); | |
2634 t->next_cache_pos -= NUM_MOVED_POSITIONS; | |
2635 } | |
2636 t->mule_charbpos_cache[t->next_cache_pos] = best_above; | |
2637 t->mule_bytebpos_cache[t->next_cache_pos] = best_above_byte; | |
2638 t->next_cache_pos++; | |
2639 } | |
2640 | |
2641 t->cached_charpos = best_above; | |
2642 t->cached_bytepos = best_above_byte; | |
2643 | |
2644 retval = best_above_byte; | |
2645 text_checking_assert (best_above_byte >= best_above); | |
2646 goto done; | |
2647 } | |
2648 | |
/* NOTE: both arms of the if/else above end in `goto done', so the
   OLD_BYTE_CHAR code below is never reached even when OLD_BYTE_CHAR is
   defined. */
2649 #ifdef OLD_BYTE_CHAR | |
2650 | |
771 | 2651 bufmin = buf->text->mule_bufmin; |
2652 bufmax = buf->text->mule_bufmax; | |
2653 bytmin = buf->text->mule_bytmin; | |
2654 bytmax = buf->text->mule_bytmax; | |
2655 size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p; | |
2656 | |
2657 /* The basic idea here is that we shift the "known region" up or down | |
2658 until it overlaps the specified position. We do this by moving | |
2659 the upper bound of the known region up one character at a time, | |
2660 and moving the lower bound of the known region up as necessary | |
2661 when the size of the character just seen changes. | |
2662 | |
2663 We optimize this, however, by first shifting the known region to | |
2664 one of the cached points if it's close by. (We don't check BEG or | |
2665 Z, even though they're cached; most of the time these will be the | |
2666 same as BEGV and ZV, and when they're not, they're not likely | |
2667 to be used.) */ | |
2668 | |
2669 if (x > bufmax) | |
2670 { | |
2671 Charbpos diffmax = x - bufmax; | |
2672 Charbpos diffpt = x - BUF_PT (buf); | |
2673 Charbpos diffzv = BUF_ZV (buf) - x; | |
2674 /* #### This value could stand some more exploration. */ | |
2675 Charcount heuristic_hack = (bufmax - bufmin) >> 2; | |
2676 | |
2677 /* Check if the position is closer to PT or ZV than to the | |
2678 end of the known region. */ | |
2679 | |
2680 if (diffpt < 0) | |
2681 diffpt = -diffpt; | |
2682 if (diffzv < 0) | |
2683 diffzv = -diffzv; | |
2684 | |
2685 /* But also implement a heuristic that favors the known region | |
2686 over PT or ZV. The reason for this is that switching to | |
2687 PT or ZV will wipe out the knowledge in the known region, | |
2688 which might be annoying if the known region is large and | |
2689 PT or ZV is not that much closer than the end of the known | |
2690 region. */ | |
2691 | |
2692 diffzv += heuristic_hack; | |
2693 diffpt += heuristic_hack; | |
2694 if (diffpt < diffmax && diffpt <= diffzv) | |
2695 { | |
2696 bufmax = bufmin = BUF_PT (buf); | |
826 | 2697 bytmax = bytmin = BYTE_BUF_PT (buf); |
771 | 2698 /* We set the size to 1 even though it doesn't really |
2699 matter because the new known region contains no | |
2700 characters. We do this because this is the most | |
2701 likely size of the characters around the new known | |
2702 region, and we avoid potential yuckiness that is | |
2703 done when size == 3. */ | |
2704 size = 1; | |
2705 } | |
2706 if (diffzv < diffmax) | |
2707 { | |
2708 bufmax = bufmin = BUF_ZV (buf); | |
826 | 2709 bytmax = bytmin = BYTE_BUF_ZV (buf); |
771 | 2710 size = 1; |
2711 } | |
2712 } | |
800 | 2713 #ifdef ERROR_CHECK_TEXT |
771 | 2714 else if (x >= bufmin) |
2500 | 2715 ABORT (); |
771 | 2716 #endif |
2717 else | |
2718 { | |
2719 Charbpos diffmin = bufmin - x; | |
2720 Charbpos diffpt = BUF_PT (buf) - x; | |
2721 Charbpos diffbegv = x - BUF_BEGV (buf); | |
2722 /* #### This value could stand some more exploration. */ | |
2723 Charcount heuristic_hack = (bufmax - bufmin) >> 2; | |
2724 | |
2725 if (diffpt < 0) | |
2726 diffpt = -diffpt; | |
2727 if (diffbegv < 0) | |
2728 diffbegv = -diffbegv; | |
2729 | |
2730 /* But also implement a heuristic that favors the known region -- | |
2731 see above. */ | |
2732 | |
2733 diffbegv += heuristic_hack; | |
2734 diffpt += heuristic_hack; | |
2735 | |
2736 if (diffpt < diffmin && diffpt <= diffbegv) | |
2737 { | |
2738 bufmax = bufmin = BUF_PT (buf); | |
826 | 2739 bytmax = bytmin = BYTE_BUF_PT (buf); |
771 | 2740 /* We set the size to 1 even though it doesn't really |
2741 matter because the new known region contains no | |
2742 characters. We do this because this is the most | |
2743 likely size of the characters around the new known | |
2744 region, and we avoid potential yuckiness that is | |
2745 done when size == 3. */ | |
2746 size = 1; | |
2747 } | |
2748 if (diffbegv < diffmin) | |
2749 { | |
2750 bufmax = bufmin = BUF_BEGV (buf); | |
826 | 2751 bytmax = bytmin = BYTE_BUF_BEGV (buf); |
771 | 2752 size = 1; |
2753 } | |
2754 } | |
2755 | |
2756 diff_so_far = x > bufmax ? x - bufmax : bufmin - x; | |
2757 if (diff_so_far > 50) | |
2758 { | |
2759 /* If we have to move more than a certain amount, then look | |
2760 into our cache. */ | |
2761 int minval = INT_MAX; | |
2762 int found = 0; | |
2763 int i; | |
2764 | |
2765 add_to_cache = 1; | |
2766 /* I considered keeping the positions ordered. This would speed | |
2767 up this loop, but updating the cache would take longer, so | |
2768 it doesn't seem like it would really matter. */ | |
2367 | 2769 for (i = 0; i < NUM_CACHED_POSITIONS; i++) |
771 | 2770 { |
2771 int diff = buf->text->mule_charbpos_cache[i] - x; | |
2772 | |
2773 if (diff < 0) | |
2774 diff = -diff; | |
2775 if (diff < minval) | |
2776 { | |
2777 minval = diff; | |
2778 found = i; | |
2779 } | |
2780 } | |
2781 | |
2782 if (minval < diff_so_far) | |
2783 { | |
2784 bufmax = bufmin = buf->text->mule_charbpos_cache[found]; | |
2785 bytmax = bytmin = buf->text->mule_bytebpos_cache[found]; | |
2786 size = 1; | |
2787 } | |
2788 } | |
2789 | |
2790 /* It's conceivable that the caching above could lead to X being | |
2791 the same as one of the range edges. */ | |
2792 if (x >= bufmax) | |
2793 { | |
2794 Bytebpos newmax; | |
2795 Bytecount newsize; | |
2796 | |
2797 forward_p = 1; | |
2798 while (x > bufmax) | |
2799 { | |
2800 newmax = bytmax; | |
2801 | |
2802 INC_BYTEBPOS (buf, newmax); | |
2803 newsize = newmax - bytmax; | |
2804 if (newsize != size) | |
2805 { | |
2806 bufmin = bufmax; | |
2807 bytmin = bytmax; | |
2808 size = newsize; | |
2809 } | |
2810 bytmax = newmax; | |
2811 bufmax++; | |
2812 } | |
2813 retval = bytmax; | |
2814 | |
2815 /* #### Should go past the found location to reduce the number | |
2816 of times that this function is called */ | |
2817 } | |
2818 else /* x < bufmin */ | |
2819 { | |
2820 Bytebpos newmin; | |
2821 Bytecount newsize; | |
2822 | |
2823 forward_p = 0; | |
2824 while (x < bufmin) | |
2825 { | |
2826 newmin = bytmin; | |
2827 | |
2828 DEC_BYTEBPOS (buf, newmin); | |
2829 newsize = bytmin - newmin; | |
2830 if (newsize != size) | |
2831 { | |
2832 bufmax = bufmin; | |
2833 bytmax = bytmin; | |
2834 size = newsize; | |
2835 } | |
2836 bytmin = newmin; | |
2837 bufmin--; | |
2838 } | |
2839 retval = bytmin; | |
2840 | |
2841 /* #### Should go past the found location to reduce the number | |
2842 of times that this function is called | |
2843 */ | |
2844 } | |
2845 | |
2846 /* If size is three, then we have to make sure that the range we | |
2847 discovered isn't too large, because we use a fixed-length | |
2848 table to divide by 3. */ | |
2849 | |
2850 if (size == 3) | |
2851 { | |
2852 int gap = bytmax - bytmin; | |
2853 buf->text->mule_three_p = 1; | |
2854 buf->text->mule_shifter = 1; | |
2855 | |
2856 if (gap > MAX_BYTEBPOS_GAP_SIZE_3) | |
2857 { | |
2858 if (forward_p) | |
2859 { | |
2860 bytmin = bytmax - MAX_BYTEBPOS_GAP_SIZE_3; | |
2861 bufmin = bufmax - MAX_CHARBPOS_GAP_SIZE_3; | |
2862 } | |
2863 else | |
2864 { | |
2865 bytmax = bytmin + MAX_BYTEBPOS_GAP_SIZE_3; | |
2866 bufmax = bufmin + MAX_CHARBPOS_GAP_SIZE_3; | |
2867 } | |
2868 } | |
2869 } | |
2870 else | |
2871 { | |
2872 buf->text->mule_three_p = 0; | |
2873 if (size == 4) | |
2874 buf->text->mule_shifter = 2; | |
2875 else | |
2876 buf->text->mule_shifter = size - 1; | |
2877 } | |
2878 | |
2879 buf->text->mule_bufmin = bufmin; | |
2880 buf->text->mule_bufmax = bufmax; | |
2881 buf->text->mule_bytmin = bytmin; | |
2882 buf->text->mule_bytmax = bytmax; | |
2883 | |
2884 if (add_to_cache) | |
2885 { | |
2886 int replace_loc; | |
2887 | |
2888 /* We throw away a "random" cached value and replace it with | |
2889 the new value. It doesn't actually have to be very random | |
2890 at all, just evenly distributed. | |
2891 | |
2892 #### It would be better to use a least-recently-used algorithm | |
2893 or something that tries to space things out, but I'm not sure | |
2894 it's worth it to go to the trouble of maintaining that. */ | |
2895 not_very_random_number += 621; | |
2896 replace_loc = not_very_random_number & 15; | |
2897 buf->text->mule_charbpos_cache[replace_loc] = x; | |
2898 buf->text->mule_bytebpos_cache[replace_loc] = retval; | |
2899 } | |
2900 | |
2367 | 2901 #endif /* OLD_BYTE_CHAR */ |
2902 | |
2903 done: | |
1292 | 2904 PROFILE_RECORD_EXITING_SECTION (QSin_char_byte_conversion); |
2905 | |
771 | 2906 return retval; |
2907 } | |
2908 | |
2367 | 2909 #undef CONSIDER |
2910 | |
2911 /* bytepos_to_charpos returns the char position corresponding to BYTEPOS. */ | |
2912 | |
2913 /* This macro is a subroutine of bytebpos_to_charbpos_func. | |
2914 It is used when BYTEPOS is actually the byte position. */ | |
2915 | |
/* Fold one known (BYTEPOS, CHARPOS) pair into the bracketing state of
   the function expanding this macro.  The expansion site must provide
   the variables x, retval, best_above, best_above_byte, best_below and
   best_below_byte, plus a label named `done'.

   - If BYTEPOS equals x, CHARPOS is the answer: store it in retval and
     jump to done.
   - Otherwise, if the pair lies above x and is closer than the current
     upper bracket, or lies below x and is closer than the current lower
     bracket, it replaces that bracket.
   - Whenever a bracket was replaced and the character distance between
     the brackets equals the byte distance between them, the answer is
     computed by plain arithmetic from the lower bracket and we jump to
     done.

   BYTEPOS and CHARPOS are each evaluated at most once.  */

#define CONSIDER(BYTEPOS, CHARPOS)                                      \
do                                                                      \
{                                                                       \
  Bytebpos known_byte = (BYTEPOS);                                      \
  int tightened = 0;                                                    \
                                                                        \
  if (known_byte == x)                                                  \
    {                                                                   \
      retval = (CHARPOS);                                               \
      goto done;                                                        \
    }                                                                   \
                                                                        \
  if (known_byte > x && known_byte < best_above_byte)                   \
    {                                                                   \
      best_above_byte = known_byte;                                     \
      best_above = (CHARPOS);                                           \
      tightened = 1;                                                    \
    }                                                                   \
  else if (known_byte < x && known_byte > best_below_byte)              \
    {                                                                   \
      best_below_byte = known_byte;                                     \
      best_below = (CHARPOS);                                           \
      tightened = 1;                                                    \
    }                                                                   \
                                                                        \
  if (tightened                                                         \
      && best_above - best_below == best_above_byte - best_below_byte)  \
    {                                                                   \
      retval = best_below + (x - best_below_byte);                      \
      goto done;                                                        \
    }                                                                   \
}                                                                       \
while (0)
2953 | |
771 | 2954 /* The logic in this function is almost identical to the logic in |
2955 the previous function. */ | |
2956 | |
2957 Charbpos | |
2958 bytebpos_to_charbpos_func (struct buffer *buf, Bytebpos x) | |
2959 { | |
2367 | 2960 #ifdef OLD_BYTE_CHAR |
771 | 2961 Charbpos bufmin; |
2962 Charbpos bufmax; | |
2963 Bytebpos bytmin; | |
2964 Bytebpos bytmax; | |
2965 int size; | |
2966 int forward_p; | |
2967 int diff_so_far; | |
2968 int add_to_cache = 0; | |
2367 | 2969 #endif /* OLD_BYTE_CHAR */ |
2970 | |
2971 Charbpos best_above, best_above_byte; | |
2972 Bytebpos best_below, best_below_byte; | |
2973 int i; | |
2974 struct buffer_text *t; | |
2975 Charbpos retval; | |
2976 | |
1292 | 2977 PROFILE_DECLARE (); |
771 | 2978 |
1292 | 2979 PROFILE_RECORD_ENTERING_SECTION (QSin_char_byte_conversion); |
2980 | |
2367 | 2981 best_above = BUF_Z (buf); |
2982 best_above_byte = BYTE_BUF_Z (buf); | |
2983 | |
2984 /* In this case, we simply have all one-byte characters. But this should | |
2985 have been intercepted before, in bytebpos_to_charbpos(). */ | |
2986 text_checking_assert (best_above != best_above_byte); | |
2987 | |
2988 best_below = BUF_BEG (buf); | |
2989 best_below_byte = BYTE_BUF_BEG (buf); | |
2990 | |
2991 CONSIDER (BYTE_BUF_PT (buf), BUF_PT (buf)); | |
2992 CONSIDER (BYTE_BUF_GPT (buf), BUF_GPT (buf)); | |
2993 CONSIDER (BYTE_BUF_BEGV (buf), BUF_BEGV (buf)); | |
2994 CONSIDER (BYTE_BUF_ZV (buf), BUF_ZV (buf)); | |
2995 | |
2996 t = buf->text; | |
2997 CONSIDER (t->cached_bytepos, t->cached_charpos); | |
2998 | |
2999 /* Check the most recently entered positions first */ | |
3000 | |
3001 for (i = t->next_cache_pos - 1; i >= 0; i--) | |
3002 { | |
3003 CONSIDER (t->mule_bytebpos_cache[i], t->mule_charbpos_cache[i]); | |
3004 | |
3005 /* If we are down to a range of 50 chars, | |
3006 don't bother checking any other markers; | |
3007 scan the intervening chars directly now. */ | |
3008 if (best_above - best_below < 50) | |
3009 break; | |
3010 } | |
3011 | |
3012 /* We get here if we did not exactly hit one of the known places. | |
3013 We have one known above and one known below. | |
3014 Scan, counting characters, from whichever one is closer. */ | |
3015 | |
3016 if (x - best_below_byte < best_above_byte - x) | |
3017 { | |
3018 int record = x - best_below_byte > 5000; | |
3019 | |
3020 #ifdef OLD_LOOP /* old code */ | |
4526
38493c0fb952
Fix accidental deletion in src/text.c.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4525
diff
changeset
|
3021 while (best_below_byte < x) |
2367 | 3022 { |
3023 best_below++; | |
3024 INC_BYTEBPOS (buf, best_below_byte); | |
3025 } | |
3026 #else | |
3027 text_checking_assert (BUF_FORMAT (buf) == FORMAT_DEFAULT); | |
3028 /* The gap should not occur between best_below and x, or we will be | |
3029 screwed in using charcount_to_bytecount(). It should not be exactly | |
3030 at x either, because we already should have caught that. */ | |
3031 text_checking_assert | |
3032 (BYTE_BUF_CEILING_OF_IGNORE_ACCESSIBLE (buf, best_below_byte) > x); | |
3033 | |
3034 /* Using bytecount_to_charcount() is potentially a lot faster than | |
3035 a simple loop above using INC_BYTEBPOS(); see above. | |
3036 */ | |
3037 best_below += | |
3038 bytecount_to_charcount | |
3039 (BYTE_BUF_BYTE_ADDRESS (buf, best_below_byte), x - best_below_byte); | |
3040 best_below_byte = x; | |
3041 #endif | |
3042 | |
3043 /* If this position is quite far from the nearest known position, | |
3044 cache the correspondence. | |
3045 | |
3046 NB FSF does this: "... by creating a marker here. | |
3047 It will last until the next GC." | |
3048 */ | |
3049 | |
3050 if (record) | |
3051 { | |
3052 if (t->next_cache_pos == NUM_CACHED_POSITIONS) | |
3053 { | |
3054 memmove (t->mule_charbpos_cache, | |
3055 t->mule_charbpos_cache + NUM_MOVED_POSITIONS, | |
3056 sizeof (Charbpos) * | |
3057 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS)); | |
3058 memmove (t->mule_bytebpos_cache, | |
3059 t->mule_bytebpos_cache + NUM_MOVED_POSITIONS, | |
3060 sizeof (Bytebpos) * | |
3061 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS)); | |
3062 t->next_cache_pos -= NUM_MOVED_POSITIONS; | |
3063 } | |
3064 t->mule_charbpos_cache[t->next_cache_pos] = best_below; | |
3065 t->mule_bytebpos_cache[t->next_cache_pos] = best_below_byte; | |
3066 t->next_cache_pos++; | |
3067 } | |
3068 | |
3069 | |
3070 t->cached_charpos = best_below; | |
3071 t->cached_bytepos = best_below_byte; | |
3072 | |
3073 retval = best_below; | |
3074 text_checking_assert (best_below_byte >= best_below); | |
3075 goto done; | |
3076 } | |
3077 else | |
3078 { | |
3079 int record = best_above_byte - x > 5000; | |
3080 | |
3081 #ifdef OLD_LOOP /* old code */ | |
3082 while (best_above_byte > x) | |
3083 { | |
3084 best_above--; | |
3085 DEC_BYTEBPOS (buf, best_above_byte); | |
3086 } | |
3087 #else | |
3088 text_checking_assert (BUF_FORMAT (buf) == FORMAT_DEFAULT); | |
3089 /* The gap should not occur between best_above and x, or we will be | |
3090 screwed in using bytecount_to_charcount_down(). It should not be | |
3091 exactly at x either, because we already should have caught | |
3092 that. */ | |
3093 text_checking_assert | |
3094 (BYTE_BUF_FLOOR_OF_IGNORE_ACCESSIBLE (buf, best_above_byte) < x); | |
3095 | |
3096 /* Using bytecount_to_charcount_down() is potentially a lot faster | |
3097 than a simple loop using INC_BYTEBPOS(); see above. */ | |
3098 best_above -= | |
3099 bytecount_to_charcount_down | |
3100 /* BYTE_BUF_BYTE_ADDRESS will return a value on the high side of the | |
3101 gap if we are at the gap, which is the wrong side. So do the | |
3102 following trick instead. */ | |
3103 (BYTE_BUF_BYTE_ADDRESS_BEFORE (buf, best_above_byte) + 1, | |
3104 best_above_byte - x); | |
3105 best_above_byte = x; | |
3106 #endif | |
3107 | |
3108 | |
3109 /* If this position is quite far from the nearest known position, | |
3110 cache the correspondence. | |
3111 | |
3112 NB FSF does this: "... by creating a marker here. | |
3113 It will last until the next GC." | |
3114 */ | |
3115 if (record) | |
3116 { | |
3117 if (t->next_cache_pos == NUM_CACHED_POSITIONS) | |
3118 { | |
3119 memmove (t->mule_charbpos_cache, | |
3120 t->mule_charbpos_cache + NUM_MOVED_POSITIONS, | |
3121 sizeof (Charbpos) * | |
3122 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS)); | |
3123 memmove (t->mule_bytebpos_cache, | |
3124 t->mule_bytebpos_cache + NUM_MOVED_POSITIONS, | |
3125 sizeof (Bytebpos) * | |
3126 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS)); | |
3127 t->next_cache_pos -= NUM_MOVED_POSITIONS; | |
3128 } | |
3129 t->mule_charbpos_cache[t->next_cache_pos] = best_above; | |
3130 t->mule_bytebpos_cache[t->next_cache_pos] = best_above_byte; | |
3131 t->next_cache_pos++; | |
3132 } | |
3133 | |
3134 t->cached_charpos = best_above; | |
3135 t->cached_bytepos = best_above_byte; | |
3136 | |
3137 retval = best_above; | |
3138 text_checking_assert (best_above_byte >= best_above); | |
3139 goto done; | |
3140 } | |
3141 | |
3142 #ifdef OLD_BYTE_CHAR | |
3143 | |
771 | 3144 bufmin = buf->text->mule_bufmin; |
3145 bufmax = buf->text->mule_bufmax; | |
3146 bytmin = buf->text->mule_bytmin; | |
3147 bytmax = buf->text->mule_bytmax; | |
3148 size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p; | |
3149 | |
3150 /* The basic idea here is that we shift the "known region" up or down | |
3151 until it overlaps the specified position. We do this by moving | |
3152 the upper bound of the known region up one character at a time, | |
3153 and moving the lower bound of the known region up as necessary | |
3154 when the size of the character just seen changes. | |
3155 | |
3156 We optimize this, however, by first shifting the known region to | |
826 | 3157 one of the cached points if it's close by. (We don't check BYTE_BEG or |
3158 BYTE_Z, even though they're cached; most of the time these will be the | |
3159 same as BYTE_BEGV and BYTE_ZV, and when they're not, they're not likely | |
771 | 3160 to be used.) */ |
3161 | |
3162 if (x > bytmax) | |
3163 { | |
3164 Bytebpos diffmax = x - bytmax; | |
826 | 3165 Bytebpos diffpt = x - BYTE_BUF_PT (buf); |
3166 Bytebpos diffzv = BYTE_BUF_ZV (buf) - x; | |
771 | 3167 /* #### This value could stand some more exploration. */ |
3168 Bytecount heuristic_hack = (bytmax - bytmin) >> 2; | |
3169 | |
3170 /* Check if the position is closer to PT or ZV than to the | |
3171 end of the known region. */ | |
3172 | |
3173 if (diffpt < 0) | |
3174 diffpt = -diffpt; | |
3175 if (diffzv < 0) | |
3176 diffzv = -diffzv; | |
3177 | |
3178 /* But also implement a heuristic that favors the known region | |
826 | 3179 over BYTE_PT or BYTE_ZV. The reason for this is that switching to |
3180 BYTE_PT or BYTE_ZV will wipe out the knowledge in the known region, | |
771 | 3181 which might be annoying if the known region is large and |
826 | 3182 BYTE_PT or BYTE_ZV is not that much closer than the end of the known |
771 | 3183 region. */ |
3184 | |
3185 diffzv += heuristic_hack; | |
3186 diffpt += heuristic_hack; | |
3187 if (diffpt < diffmax && diffpt <= diffzv) | |
3188 { | |
3189 bufmax = bufmin = BUF_PT (buf); | |
826 | 3190 bytmax = bytmin = BYTE_BUF_PT (buf); |
771 | 3191 /* We set the size to 1 even though it doesn't really |
3192 matter because the new known region contains no | |
3193 characters. We do this because this is the most | |
3194 likely size of the characters around the new known | |
3195 region, and we avoid potential yuckiness that is | |
3196 done when size == 3. */ | |
3197 size = 1; | |
3198 } | |
3199 if (diffzv < diffmax) | |
3200 { | |
3201 bufmax = bufmin = BUF_ZV (buf); | |
826 | 3202 bytmax = bytmin = BYTE_BUF_ZV (buf); |
771 | 3203 size = 1; |
3204 } | |
3205 } | |
800 | 3206 #ifdef ERROR_CHECK_TEXT |
771 | 3207 else if (x >= bytmin) |
2500 | 3208 ABORT (); |
771 | 3209 #endif |
3210 else | |
3211 { | |
3212 Bytebpos diffmin = bytmin - x; | |
826 | 3213 Bytebpos diffpt = BYTE_BUF_PT (buf) - x; |
3214 Bytebpos diffbegv = x - BYTE_BUF_BEGV (buf); | |
771 | 3215 /* #### This value could stand some more exploration. */ |
3216 Bytecount heuristic_hack = (bytmax - bytmin) >> 2; | |
3217 | |
3218 if (diffpt < 0) | |
3219 diffpt = -diffpt; | |
3220 if (diffbegv < 0) | |
3221 diffbegv = -diffbegv; | |
3222 | |
3223 /* But also implement a heuristic that favors the known region -- | |
3224 see above. */ | |
3225 | |
3226 diffbegv += heuristic_hack; | |
3227 diffpt += heuristic_hack; | |
3228 | |
3229 if (diffpt < diffmin && diffpt <= diffbegv) | |
3230 { | |
3231 bufmax = bufmin = BUF_PT (buf); | |
826 | 3232 bytmax = bytmin = BYTE_BUF_PT (buf); |
771 | 3233 /* We set the size to 1 even though it doesn't really |
3234 matter because the new known region contains no | |
3235 characters. We do this because this is the most | |
3236 likely size of the characters around the new known | |
3237 region, and we avoid potential yuckiness that is | |
3238 done when size == 3. */ | |
3239 size = 1; | |
3240 } | |
3241 if (diffbegv < diffmin) | |
3242 { | |
3243 bufmax = bufmin = BUF_BEGV (buf); | |
826 | 3244 bytmax = bytmin = BYTE_BUF_BEGV (buf); |
771 | 3245 size = 1; |
3246 } | |
3247 } | |
3248 | |
3249 diff_so_far = x > bytmax ? x - bytmax : bytmin - x; | |
3250 if (diff_so_far > 50) | |
3251 { | |
3252 /* If we have to move more than a certain amount, then look | |
3253 into our cache. */ | |
3254 int minval = INT_MAX; | |
3255 int found = 0; | |
3256 int i; | |
3257 | |
3258 add_to_cache = 1; | |
3259 /* I considered keeping the positions ordered. This would speed | |
3260 up this loop, but updating the cache would take longer, so | |
3261 it doesn't seem like it would really matter. */ | |
2367 | 3262 for (i = 0; i < NUM_CACHED_POSITIONS; i++) |
771 | 3263 { |
3264 int diff = buf->text->mule_bytebpos_cache[i] - x; | |
3265 | |
3266 if (diff < 0) | |
3267 diff = -diff; | |
3268 if (diff < minval) | |
3269 { | |
3270 minval = diff; | |
3271 found = i; | |
3272 } | |
3273 } | |
3274 | |
3275 if (minval < diff_so_far) | |
3276 { | |
3277 bufmax = bufmin = buf->text->mule_charbpos_cache[found]; | |
3278 bytmax = bytmin = buf->text->mule_bytebpos_cache[found]; | |
3279 size = 1; | |
3280 } | |
3281 } | |
3282 | |
3283 /* It's conceivable that the caching above could lead to X being | |
3284 the same as one of the range edges. */ | |
3285 if (x >= bytmax) | |
3286 { | |
3287 Bytebpos newmax; | |
3288 Bytecount newsize; | |
3289 | |
3290 forward_p = 1; | |
3291 while (x > bytmax) | |
3292 { | |
3293 newmax = bytmax; | |
3294 | |
3295 INC_BYTEBPOS (buf, newmax); | |
3296 newsize = newmax - bytmax; | |
3297 if (newsize != size) | |
3298 { | |
3299 bufmin = bufmax; | |
3300 bytmin = bytmax; | |
3301 size = newsize; | |
3302 } | |
3303 bytmax = newmax; | |
3304 bufmax++; | |
3305 } | |
3306 retval = bufmax; | |
3307 | |
3308 /* #### Should go past the found location to reduce the number | |
3309 of times that this function is called */ | |
3310 } | |
3311 else /* x <= bytmin */ | |
3312 { | |
3313 Bytebpos newmin; | |
3314 Bytecount newsize; | |
3315 | |
3316 forward_p = 0; | |
3317 while (x < bytmin) | |
3318 { | |
3319 newmin = bytmin; | |
3320 | |
3321 DEC_BYTEBPOS (buf, newmin); | |
3322 newsize = bytmin - newmin; | |
3323 if (newsize != size) | |
3324 { | |
3325 bufmax = bufmin; | |
3326 bytmax = bytmin; | |
3327 size = newsize; | |
3328 } | |
3329 bytmin = newmin; | |
3330 bufmin--; | |
3331 } | |
3332 retval = bufmin; | |
3333 | |
3334 /* #### Should go past the found location to reduce the number | |
3335 of times that this function is called | |
3336 */ | |
3337 } | |
3338 | |
3339 /* If size is three, than we have to max sure that the range we | |
3340 discovered isn't too large, because we use a fixed-length | |
3341 table to divide by 3. */ | |
3342 | |
3343 if (size == 3) | |
3344 { | |
3345 int gap = bytmax - bytmin; | |
3346 buf->text->mule_three_p = 1; | |
3347 buf->text->mule_shifter = 1; | |
3348 | |
3349 if (gap > MAX_BYTEBPOS_GAP_SIZE_3) | |
3350 { | |
3351 if (forward_p) | |
3352 { | |
3353 bytmin = bytmax - MAX_BYTEBPOS_GAP_SIZE_3; | |
3354 bufmin = bufmax - MAX_CHARBPOS_GAP_SIZE_3; | |
3355 } | |
3356 else | |
3357 { | |
3358 bytmax = bytmin + MAX_BYTEBPOS_GAP_SIZE_3; | |
3359 bufmax = bufmin + MAX_CHARBPOS_GAP_SIZE_3; | |
3360 } | |
3361 } | |
3362 } | |
3363 else | |
3364 { | |
3365 buf->text->mule_three_p = 0; | |
3366 if (size == 4) | |
3367 buf->text->mule_shifter = 2; | |
3368 else | |
3369 buf->text->mule_shifter = size - 1; | |
3370 } | |
3371 | |
3372 buf->text->mule_bufmin = bufmin; | |
3373 buf->text->mule_bufmax = bufmax; | |
3374 buf->text->mule_bytmin = bytmin; | |
3375 buf->text->mule_bytmax = bytmax; | |
3376 | |
3377 if (add_to_cache) | |
3378 { | |
3379 int replace_loc; | |
3380 | |
3381 /* We throw away a "random" cached value and replace it with | |
3382 the new value. It doesn't actually have to be very random | |
3383 at all, just evenly distributed. | |
3384 | |
3385 #### It would be better to use a least-recently-used algorithm | |
3386 or something that tries to space things out, but I'm not sure | |
3387 it's worth it to go to the trouble of maintaining that. */ | |
3388 not_very_random_number += 621; | |
3389 replace_loc = not_very_random_number & 15; | |
3390 buf->text->mule_charbpos_cache[replace_loc] = retval; | |
3391 buf->text->mule_bytebpos_cache[replace_loc] = x; | |
3392 } | |
2367 | 3393 #endif /* OLD_BYTE_CHAR */ |
3394 | |
3395 done: | |
1292 | 3396 PROFILE_RECORD_EXITING_SECTION (QSin_char_byte_conversion); |
3397 | |
771 | 3398 return retval; |
3399 } | |
3400 | |
3401 /* Text of length BYTELENGTH and CHARLENGTH (in different units) | |
3402 was inserted at charbpos START. */ | |
3403 | |
3404 void | |
3405 buffer_mule_signal_inserted_region (struct buffer *buf, Charbpos start, | |
3406 Bytecount bytelength, | |
3407 Charcount charlength) | |
3408 { | |
2367 | 3409 #ifdef OLD_BYTE_CHAR |
771 | 3410 int size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p; |
2367 | 3411 #endif /* OLD_BYTE_CHAR */ |
771 | 3412 int i; |
3413 | |
3414 /* Adjust the cache of known positions. */ | |
2367 | 3415 for (i = 0; i < buf->text->next_cache_pos; i++) |
771 | 3416 { |
3417 | |
3418 if (buf->text->mule_charbpos_cache[i] > start) | |
3419 { | |
3420 buf->text->mule_charbpos_cache[i] += charlength; | |
3421 buf->text->mule_bytebpos_cache[i] += bytelength; | |
3422 } | |
3423 } | |
3424 | |
2367 | 3425 /* Adjust the special cached position. */ |
3426 | |
3427 if (buf->text->cached_charpos > start) | |
3428 { | |
3429 buf->text->cached_charpos += charlength; | |
3430 buf->text->cached_bytepos += bytelength; | |
3431 } | |
3432 | |
3433 #ifdef OLD_BYTE_CHAR | |
771 | 3434 if (start >= buf->text->mule_bufmax) |
826 | 3435 return; |
771 | 3436 |
3437 /* The insertion is either before the known region, in which case | |
3438 it shoves it forward; or within the known region, in which case | |
3439 it shoves the end forward. (But it may make the known region | |
3440 inconsistent, so we may have to shorten it.) */ | |
3441 | |
3442 if (start <= buf->text->mule_bufmin) | |
3443 { | |
3444 buf->text->mule_bufmin += charlength; | |
3445 buf->text->mule_bufmax += charlength; | |
3446 buf->text->mule_bytmin += bytelength; | |
3447 buf->text->mule_bytmax += bytelength; | |
3448 } | |
3449 else | |
3450 { | |
3451 Charbpos end = start + charlength; | |
3452 /* the insertion point divides the known region in two. | |
3453 Keep the longer half, at least, and expand into the | |
3454 inserted chunk as much as possible. */ | |
3455 | |
3456 if (start - buf->text->mule_bufmin > buf->text->mule_bufmax - start) | |
3457 { | |
3458 Bytebpos bytestart = (buf->text->mule_bytmin | |
3459 + size * (start - buf->text->mule_bufmin)); | |
3460 Bytebpos bytenew; | |
3461 | |
3462 while (start < end) | |
3463 { | |
3464 bytenew = bytestart; | |
3465 INC_BYTEBPOS (buf, bytenew); | |
3466 if (bytenew - bytestart != size) | |
3467 break; | |
3468 start++; | |
3469 bytestart = bytenew; | |
3470 } | |
3471 if (start != end) | |
3472 { | |
3473 buf->text->mule_bufmax = start; | |
3474 buf->text->mule_bytmax = bytestart; | |
3475 } | |
3476 else | |
3477 { | |
3478 buf->text->mule_bufmax += charlength; | |
3479 buf->text->mule_bytmax += bytelength; | |
3480 } | |
3481 } | |
3482 else | |
3483 { | |
3484 Bytebpos byteend = (buf->text->mule_bytmin | |
3485 + size * (start - buf->text->mule_bufmin) | |
3486 + bytelength); | |
3487 Bytebpos bytenew; | |
3488 | |
3489 buf->text->mule_bufmax += charlength; | |
3490 buf->text->mule_bytmax += bytelength; | |
3491 | |
3492 while (end > start) | |
3493 { | |
3494 bytenew = byteend; | |
3495 DEC_BYTEBPOS (buf, bytenew); | |
3496 if (byteend - bytenew != size) | |
3497 break; | |
3498 end--; | |
3499 byteend = bytenew; | |
3500 } | |
3501 if (start != end) | |
3502 { | |
3503 buf->text->mule_bufmin = end; | |
3504 buf->text->mule_bytmin = byteend; | |
3505 } | |
3506 } | |
3507 } | |
2367 | 3508 #endif /* OLD_BYTE_CHAR */ |
771 | 3509 } |
3510 | |
826 | 3511 /* Text from START to END (equivalent in Bytebpos's: from BYTE_START to |
3512 BYTE_END) was deleted. */ | |
771 | 3513 |
3514 void | |
3515 buffer_mule_signal_deleted_region (struct buffer *buf, Charbpos start, | |
826 | 3516 Charbpos end, Bytebpos byte_start, |
3517 Bytebpos byte_end) | |
771 | 3518 { |
3519 int i; | |
3520 | |
3521 /* Adjust the cache of known positions. */ | |
2367 | 3522 for (i = 0; i < buf->text->next_cache_pos; i++) |
771 | 3523 { |
3524 /* After the end; gets shoved backward */ | |
3525 if (buf->text->mule_charbpos_cache[i] > end) | |
3526 { | |
3527 buf->text->mule_charbpos_cache[i] -= end - start; | |
826 | 3528 buf->text->mule_bytebpos_cache[i] -= byte_end - byte_start; |
771 | 3529 } |
3530 /* In the range; moves to start of range */ | |
3531 else if (buf->text->mule_charbpos_cache[i] > start) | |
3532 { | |
3533 buf->text->mule_charbpos_cache[i] = start; | |
826 | 3534 buf->text->mule_bytebpos_cache[i] = byte_start; |
771 | 3535 } |
3536 } | |
3537 | |
2367 | 3538 /* Adjust the special cached position. */ |
3539 | |
3540 /* After the end; gets shoved backward */ | |
3541 if (buf->text->cached_charpos > end) | |
3542 { | |
3543 buf->text->cached_charpos -= end - start; | |
3544 buf->text->cached_bytepos -= byte_end - byte_start; | |
3545 } | |
3546 /* In the range; moves to start of range */ | |
3547 else if (buf->text->cached_charpos > start) | |
3548 { | |
3549 buf->text->cached_charpos = start; | |
3550 buf->text->cached_bytepos = byte_start; | |
3551 } | |
3552 | |
3553 #ifdef OLD_BYTE_CHAR | |
771 | 3554 /* We don't care about any text after the end of the known region. */ |
3555 | |
3556 end = min (end, buf->text->mule_bufmax); | |
826 | 3557 byte_end = min (byte_end, buf->text->mule_bytmax); |
771 | 3558 if (start >= end) |
826 | 3559 return; |
771 | 3560 |
3561 /* The end of the known region offsets by the total amount of deletion, | |
3562 since it's all before it. */ | |
3563 | |
3564 buf->text->mule_bufmax -= end - start; | |
826 | 3565 buf->text->mule_bytmax -= byte_end - byte_start; |
771 | 3566 |
3567 /* Now we don't care about any text after the start of the known region. */ | |
3568 | |
3569 end = min (end, buf->text->mule_bufmin); | |
826 | 3570 byte_end = min (byte_end, buf->text->mule_bytmin); |
771 | 3571 if (start < end) |
3572 { | |
3573 buf->text->mule_bufmin -= end - start; | |
826 | 3574 buf->text->mule_bytmin -= byte_end - byte_start; |
771 | 3575 } |
2367 | 3576 #endif /* OLD_BYTE_CHAR */ |
771 | 3577 } |
3578 | |
3579 #endif /* MULE */ | |
3580 | |
3581 | |
3582 /************************************************************************/ | |
3583 /* verifying buffer and string positions */ | |
3584 /************************************************************************/ | |
3585 | |
3586 /* Functions below are tagged with either _byte or _char indicating | |
3587 whether they return byte or character positions. For a buffer, | |
3588 a character position is a "Charbpos" and a byte position is a "Bytebpos". | |
3589 For strings, these are sometimes typed using "Charcount" and | |
3590 "Bytecount". */ | |
3591 | |
3592 /* Flags for the functions below are: | |
3593 | |
3594 GB_ALLOW_PAST_ACCESSIBLE | |
3595 | |
3596 Allow positions to range over the entire buffer (BUF_BEG to BUF_Z), | |
3597 rather than just the accessible portion (BUF_BEGV to BUF_ZV). | |
3598 For strings, this flag has no effect. | |
3599 | |
3600 GB_COERCE_RANGE | |
3601 | |
3602 If the position is outside the allowable range, return the lower | |
3603 or upper bound of the range, whichever is closer to the specified | |
3604 position. | |
3605 | |
3606 GB_NO_ERROR_IF_BAD | |
3607 | |
3608 If the position is outside the allowable range, return -1. | |
3609 | |
3610 GB_NEGATIVE_FROM_END | |
3611 | |
3612 If a value is negative, treat it as an offset from the end. | |
3613 Only applies to strings. | |
3614 | |
3615 The following additional flags apply only to the functions | |
3616 that return ranges: | |
3617 | |
3618 GB_ALLOW_NIL | |
3619 | |
3620 Either or both positions can be nil. If FROM is nil, | |
3621 FROM_OUT will contain the lower bound of the allowed range. | |
3622 If TO is nil, TO_OUT will contain the upper bound of the | |
3623 allowed range. | |
3624 | |
3625 GB_CHECK_ORDER | |
3626 | |
3627 FROM must contain the lower bound and TO the upper bound | |
3628 of the range. If the positions are reversed, an error is | |
3629 signalled. | |
3630 | |
3631 The following is a combination flag: | |
3632 | |
3633 GB_HISTORICAL_STRING_BEHAVIOR | |
3634 | |
3635 Equivalent to (GB_NEGATIVE_FROM_END | GB_ALLOW_NIL). | |
3636 */ | |
3637 | |
3638 /* Return a buffer position stored in a Lisp_Object. Full | |
3639 error-checking is done on the position. Flags can be specified to | |
3640 control the behavior of out-of-range values. The default behavior | |
3641 is to require that the position is within the accessible part of | |
3642 the buffer (BEGV and ZV), and to signal an error if the position is | |
3643 out of range. | |
3644 | |
3645 */ | |
3646 | |
3647 Charbpos | |
3648 get_buffer_pos_char (struct buffer *b, Lisp_Object pos, unsigned int flags) | |
3649 { | |
3650 /* Does not GC */ | |
3651 Charbpos ind; | |
3652 Charbpos min_allowed, max_allowed; | |
3653 | |
3654 CHECK_INT_COERCE_MARKER (pos); | |
3655 ind = XINT (pos); | |
3656 min_allowed = flags & GB_ALLOW_PAST_ACCESSIBLE ? BUF_BEG (b) : BUF_BEGV (b); | |
3657 max_allowed = flags & GB_ALLOW_PAST_ACCESSIBLE ? BUF_Z (b) : BUF_ZV (b); | |
3658 | |
3659 if (ind < min_allowed || ind > max_allowed) | |
3660 { | |
3661 if (flags & GB_COERCE_RANGE) | |
3662 ind = ind < min_allowed ? min_allowed : max_allowed; | |
3663 else if (flags & GB_NO_ERROR_IF_BAD) | |
3664 ind = -1; | |
3665 else | |
3666 { | |
793 | 3667 Lisp_Object buffer = wrap_buffer (b); |
3668 | |
771 | 3669 args_out_of_range (buffer, pos); |
3670 } | |
3671 } | |
3672 | |
3673 return ind; | |
3674 } | |
3675 | |
3676 Bytebpos | |
3677 get_buffer_pos_byte (struct buffer *b, Lisp_Object pos, unsigned int flags) | |
3678 { | |
3679 Charbpos bpos = get_buffer_pos_char (b, pos, flags); | |
3680 if (bpos < 0) /* could happen with GB_NO_ERROR_IF_BAD */ | |
3681 return -1; | |
3682 return charbpos_to_bytebpos (b, bpos); | |
3683 } | |
3684 | |
3685 /* Return a pair of buffer positions representing a range of text, | |
3686 taken from a pair of Lisp_Objects. Full error-checking is | |
3687 done on the positions. Flags can be specified to control the | |
3688 behavior of out-of-range values. The default behavior is to | |
3689 allow the range bounds to be specified in either order | |
3690 (however, FROM_OUT will always be the lower bound of the range | |
3691 and TO_OUT the upper bound),to require that the positions | |
3692 are within the accessible part of the buffer (BEGV and ZV), | |
3693 and to signal an error if the positions are out of range. | |
3694 */ | |
3695 | |
3696 void | |
3697 get_buffer_range_char (struct buffer *b, Lisp_Object from, Lisp_Object to, | |
826 | 3698 Charbpos *from_out, Charbpos *to_out, |
3699 unsigned int flags) | |
771 | 3700 { |
3701 /* Does not GC */ | |
3702 Charbpos min_allowed, max_allowed; | |
3703 | |
3704 min_allowed = (flags & GB_ALLOW_PAST_ACCESSIBLE) ? | |
3705 BUF_BEG (b) : BUF_BEGV (b); | |
3706 max_allowed = (flags & GB_ALLOW_PAST_ACCESSIBLE) ? | |
3707 BUF_Z (b) : BUF_ZV (b); | |
3708 | |
3709 if (NILP (from) && (flags & GB_ALLOW_NIL)) | |
3710 *from_out = min_allowed; | |
3711 else | |
3712 *from_out = get_buffer_pos_char (b, from, flags | GB_NO_ERROR_IF_BAD); | |
3713 | |
3714 if (NILP (to) && (flags & GB_ALLOW_NIL)) | |
3715 *to_out = max_allowed; | |
3716 else | |
3717 *to_out = get_buffer_pos_char (b, to, flags | GB_NO_ERROR_IF_BAD); | |
3718 | |
3719 if ((*from_out < 0 || *to_out < 0) && !(flags & GB_NO_ERROR_IF_BAD)) | |
3720 { | |
793 | 3721 Lisp_Object buffer = wrap_buffer (b); |
3722 | |
771 | 3723 args_out_of_range_3 (buffer, from, to); |
3724 } | |
3725 | |
3726 if (*from_out >= 0 && *to_out >= 0 && *from_out > *to_out) | |
3727 { | |
3728 if (flags & GB_CHECK_ORDER) | |
3729 invalid_argument_2 ("start greater than end", from, to); | |
3730 else | |
3731 { | |
3732 Charbpos temp = *from_out; | |
3733 *from_out = *to_out; | |
3734 *to_out = temp; | |
3735 } | |
3736 } | |
3737 } | |
3738 | |
3739 void | |
3740 get_buffer_range_byte (struct buffer *b, Lisp_Object from, Lisp_Object to, | |
826 | 3741 Bytebpos *from_out, Bytebpos *to_out, |
3742 unsigned int flags) | |
771 | 3743 { |
3744 Charbpos s, e; | |
3745 | |
3746 get_buffer_range_char (b, from, to, &s, &e, flags); | |
3747 if (s >= 0) | |
3748 *from_out = charbpos_to_bytebpos (b, s); | |
3749 else /* could happen with GB_NO_ERROR_IF_BAD */ | |
3750 *from_out = -1; | |
3751 if (e >= 0) | |
3752 *to_out = charbpos_to_bytebpos (b, e); | |
3753 else | |
3754 *to_out = -1; | |
3755 } | |
3756 | |
3757 static Charcount | |
3758 get_string_pos_char_1 (Lisp_Object string, Lisp_Object pos, unsigned int flags, | |
3759 Charcount known_length) | |
3760 { | |
3761 Charcount ccpos; | |
3762 Charcount min_allowed = 0; | |
3763 Charcount max_allowed = known_length; | |
3764 | |
3765 /* Computation of KNOWN_LENGTH is potentially expensive so we pass | |
3766 it in. */ | |
3767 CHECK_INT (pos); | |
3768 ccpos = XINT (pos); | |
3769 if (ccpos < 0 && flags & GB_NEGATIVE_FROM_END) | |
3770 ccpos += max_allowed; | |
3771 | |
3772 if (ccpos < min_allowed || ccpos > max_allowed) | |
3773 { | |
3774 if (flags & GB_COERCE_RANGE) | |
3775 ccpos = ccpos < min_allowed ? min_allowed : max_allowed; | |
3776 else if (flags & GB_NO_ERROR_IF_BAD) | |
3777 ccpos = -1; | |
3778 else | |
3779 args_out_of_range (string, pos); | |
3780 } | |
3781 | |
3782 return ccpos; | |
3783 } | |
3784 | |
3785 Charcount | |
3786 get_string_pos_char (Lisp_Object string, Lisp_Object pos, unsigned int flags) | |
3787 { | |
3788 return get_string_pos_char_1 (string, pos, flags, | |
826 | 3789 string_char_length (string)); |
771 | 3790 } |
3791 | |
3792 Bytecount | |
3793 get_string_pos_byte (Lisp_Object string, Lisp_Object pos, unsigned int flags) | |
3794 { | |
3795 Charcount ccpos = get_string_pos_char (string, pos, flags); | |
3796 if (ccpos < 0) /* could happen with GB_NO_ERROR_IF_BAD */ | |
3797 return -1; | |
793 | 3798 return string_index_char_to_byte (string, ccpos); |
771 | 3799 } |
3800 | |
3801 void | |
3802 get_string_range_char (Lisp_Object string, Lisp_Object from, Lisp_Object to, | |
3803 Charcount *from_out, Charcount *to_out, | |
3804 unsigned int flags) | |
3805 { | |
3806 Charcount min_allowed = 0; | |
826 | 3807 Charcount max_allowed = string_char_length (string); |
771 | 3808 |
3809 if (NILP (from) && (flags & GB_ALLOW_NIL)) | |
3810 *from_out = min_allowed; | |
3811 else | |
3812 *from_out = get_string_pos_char_1 (string, from, | |
3813 flags | GB_NO_ERROR_IF_BAD, | |
3814 max_allowed); | |
3815 | |
3816 if (NILP (to) && (flags & GB_ALLOW_NIL)) | |
3817 *to_out = max_allowed; | |
3818 else | |
3819 *to_out = get_string_pos_char_1 (string, to, | |
3820 flags | GB_NO_ERROR_IF_BAD, | |
3821 max_allowed); | |
3822 | |
3823 if ((*from_out < 0 || *to_out < 0) && !(flags & GB_NO_ERROR_IF_BAD)) | |
3824 args_out_of_range_3 (string, from, to); | |
3825 | |
3826 if (*from_out >= 0 && *to_out >= 0 && *from_out > *to_out) | |
3827 { | |
3828 if (flags & GB_CHECK_ORDER) | |
3829 invalid_argument_2 ("start greater than end", from, to); | |
3830 else | |
3831 { | |
3832 Charbpos temp = *from_out; | |
3833 *from_out = *to_out; | |
3834 *to_out = temp; | |
3835 } | |
3836 } | |
3837 } | |
3838 | |
3839 void | |
3840 get_string_range_byte (Lisp_Object string, Lisp_Object from, Lisp_Object to, | |
3841 Bytecount *from_out, Bytecount *to_out, | |
3842 unsigned int flags) | |
3843 { | |
3844 Charcount s, e; | |
3845 | |
3846 get_string_range_char (string, from, to, &s, &e, flags); | |
3847 if (s >= 0) | |
793 | 3848 *from_out = string_index_char_to_byte (string, s); |
771 | 3849 else /* could happen with GB_NO_ERROR_IF_BAD */ |
3850 *from_out = -1; | |
3851 if (e >= 0) | |
793 | 3852 *to_out = string_index_char_to_byte (string, e); |
771 | 3853 else |
3854 *to_out = -1; | |
3855 | |
3856 } | |
3857 | |
826 | 3858 Charxpos |
771 | 3859 get_buffer_or_string_pos_char (Lisp_Object object, Lisp_Object pos, |
3860 unsigned int flags) | |
3861 { | |
3862 return STRINGP (object) ? | |
3863 get_string_pos_char (object, pos, flags) : | |
3864 get_buffer_pos_char (XBUFFER (object), pos, flags); | |
3865 } | |
3866 | |
826 | 3867 Bytexpos |
771 | 3868 get_buffer_or_string_pos_byte (Lisp_Object object, Lisp_Object pos, |
3869 unsigned int flags) | |
3870 { | |
3871 return STRINGP (object) ? | |
3872 get_string_pos_byte (object, pos, flags) : | |
3873 get_buffer_pos_byte (XBUFFER (object), pos, flags); | |
3874 } | |
3875 | |
3876 void | |
3877 get_buffer_or_string_range_char (Lisp_Object object, Lisp_Object from, | |
826 | 3878 Lisp_Object to, Charxpos *from_out, |
3879 Charxpos *to_out, unsigned int flags) | |
771 | 3880 { |
3881 if (STRINGP (object)) | |
3882 get_string_range_char (object, from, to, from_out, to_out, flags); | |
3883 else | |
826 | 3884 get_buffer_range_char (XBUFFER (object), from, to, from_out, to_out, |
3885 flags); | |
771 | 3886 } |
3887 | |
3888 void | |
3889 get_buffer_or_string_range_byte (Lisp_Object object, Lisp_Object from, | |
826 | 3890 Lisp_Object to, Bytexpos *from_out, |
3891 Bytexpos *to_out, unsigned int flags) | |
771 | 3892 { |
3893 if (STRINGP (object)) | |
3894 get_string_range_byte (object, from, to, from_out, to_out, flags); | |
3895 else | |
826 | 3896 get_buffer_range_byte (XBUFFER (object), from, to, from_out, to_out, |
3897 flags); | |
771 | 3898 } |
3899 | |
826 | 3900 Charxpos |
771 | 3901 buffer_or_string_accessible_begin_char (Lisp_Object object) |
3902 { | |
3903 return STRINGP (object) ? 0 : BUF_BEGV (XBUFFER (object)); | |
3904 } | |
3905 | |
826 | 3906 Charxpos |
771 | 3907 buffer_or_string_accessible_end_char (Lisp_Object object) |
3908 { | |
3909 return STRINGP (object) ? | |
826 | 3910 string_char_length (object) : BUF_ZV (XBUFFER (object)); |
771 | 3911 } |
3912 | |
826 | 3913 Bytexpos |
771 | 3914 buffer_or_string_accessible_begin_byte (Lisp_Object object) |
3915 { | |
826 | 3916 return STRINGP (object) ? 0 : BYTE_BUF_BEGV (XBUFFER (object)); |
771 | 3917 } |
3918 | |
826 | 3919 Bytexpos |
771 | 3920 buffer_or_string_accessible_end_byte (Lisp_Object object) |
3921 { | |
3922 return STRINGP (object) ? | |
826 | 3923 XSTRING_LENGTH (object) : BYTE_BUF_ZV (XBUFFER (object)); |
771 | 3924 } |
3925 | |
826 | 3926 Charxpos |
771 | 3927 buffer_or_string_absolute_begin_char (Lisp_Object object) |
3928 { | |
3929 return STRINGP (object) ? 0 : BUF_BEG (XBUFFER (object)); | |
3930 } | |
3931 | |
826 | 3932 Charxpos |
771 | 3933 buffer_or_string_absolute_end_char (Lisp_Object object) |
3934 { | |
3935 return STRINGP (object) ? | |
826 | 3936 string_char_length (object) : BUF_Z (XBUFFER (object)); |
3937 } | |
3938 | |
3939 Bytexpos | |
3940 buffer_or_string_absolute_begin_byte (Lisp_Object object) | |
3941 { | |
3942 return STRINGP (object) ? 0 : BYTE_BUF_BEG (XBUFFER (object)); | |
3943 } | |
3944 | |
3945 Bytexpos | |
3946 buffer_or_string_absolute_end_byte (Lisp_Object object) | |
3947 { | |
3948 return STRINGP (object) ? | |
3949 XSTRING_LENGTH (object) : BYTE_BUF_Z (XBUFFER (object)); | |
3950 } | |
3951 | |
3952 Charbpos | |
3953 charbpos_clip_to_bounds (Charbpos lower, Charbpos num, Charbpos upper) | |
3954 { | |
3955 return (num < lower ? lower : | |
3956 num > upper ? upper : | |
3957 num); | |
771 | 3958 } |
3959 | |
3960 Bytebpos | |
826 | 3961 bytebpos_clip_to_bounds (Bytebpos lower, Bytebpos num, Bytebpos upper) |
3962 { | |
3963 return (num < lower ? lower : | |
3964 num > upper ? upper : | |
3965 num); | |
3966 } | |
3967 | |
3968 Charxpos | |
3969 charxpos_clip_to_bounds (Charxpos lower, Charxpos num, Charxpos upper) | |
771 | 3970 { |
826 | 3971 return (num < lower ? lower : |
3972 num > upper ? upper : | |
3973 num); | |
3974 } | |
3975 | |
3976 Bytexpos | |
3977 bytexpos_clip_to_bounds (Bytexpos lower, Bytexpos num, Bytexpos upper) | |
3978 { | |
3979 return (num < lower ? lower : | |
3980 num > upper ? upper : | |
3981 num); | |
771 | 3982 } |
3983 | |
826 | 3984 /* These could be implemented in terms of the get_buffer_or_string() |
3985 functions above, but those are complicated and handle lots of weird | |
3986 cases stemming from uncertain external input. */ | |
3987 | |
3988 Charxpos | |
3989 buffer_or_string_clip_to_accessible_char (Lisp_Object object, Charxpos pos) | |
3990 { | |
3991 return (charxpos_clip_to_bounds | |
3992 (pos, buffer_or_string_accessible_begin_char (object), | |
3993 buffer_or_string_accessible_end_char (object))); | |
3994 } | |
3995 | |
3996 Bytexpos | |
3997 buffer_or_string_clip_to_accessible_byte (Lisp_Object object, Bytexpos pos) | |
771 | 3998 { |
826 | 3999 return (bytexpos_clip_to_bounds |
4000 (pos, buffer_or_string_accessible_begin_byte (object), | |
4001 buffer_or_string_accessible_end_byte (object))); | |
4002 } | |
4003 | |
4004 Charxpos | |
4005 buffer_or_string_clip_to_absolute_char (Lisp_Object object, Charxpos pos) | |
4006 { | |
4007 return (charxpos_clip_to_bounds | |
4008 (pos, buffer_or_string_absolute_begin_char (object), | |
4009 buffer_or_string_absolute_end_char (object))); | |
4010 } | |
4011 | |
4012 Bytexpos | |
4013 buffer_or_string_clip_to_absolute_byte (Lisp_Object object, Bytexpos pos) | |
4014 { | |
4015 return (bytexpos_clip_to_bounds | |
4016 (pos, buffer_or_string_absolute_begin_byte (object), | |
4017 buffer_or_string_absolute_end_byte (object))); | |
771 | 4018 } |
4019 | |
4020 | |
4021 /************************************************************************/ | |
4022 /* Implement TO_EXTERNAL_FORMAT, TO_INTERNAL_FORMAT */ | |
4023 /************************************************************************/ | |
4024 | |
/* A dynarr whose elements are pointers to Ibyte dynarrs; used as the
   pool of output buffers for conversion to internal format. */
typedef struct
{
  Dynarr_declare (Ibyte_dynarr *);
} Ibyte_dynarr_dynarr;

/* A dynarr whose elements are pointers to Extbyte dynarrs; used as the
   pool of output buffers for conversion to external format. */
typedef struct
{
  Dynarr_declare (Extbyte_dynarr *);
} Extbyte_dynarr_dynarr;

/* Pools of result buffers.  Each conversion call uses the buffer at
   index *_in_use of the matching list, adding a new one when the list
   is not long enough. */
static Extbyte_dynarr_dynarr *conversion_out_dynarr_list;
static Ibyte_dynarr_dynarr *conversion_in_dynarr_list;

/* Index of the next pool buffer to hand out.  Each conversion function
   binds its counter to the current value plus one (internal_bind_int)
   for the duration of the conversion -- presumably so that a nested
   conversion gets a different buffer; confirm against the callers. */
static int dfc_convert_to_external_format_in_use;
static int dfc_convert_to_internal_format_in_use;
4040 | |
4041 void | |
4042 dfc_convert_to_external_format (dfc_conversion_type source_type, | |
4043 dfc_conversion_data *source, | |
4044 Lisp_Object coding_system, | |
4045 dfc_conversion_type sink_type, | |
4046 dfc_conversion_data *sink) | |
4047 { | |
4048 /* It's guaranteed that many callers are not prepared for GC here, | |
4049 esp. given that this code conversion occurs in many very hidden | |
4050 places. */ | |
1292 | 4051 int count; |
771 | 4052 Extbyte_dynarr *conversion_out_dynarr; |
1292 | 4053 PROFILE_DECLARE (); |
4054 | |
2367 | 4055 assert (!inhibit_non_essential_conversion_operations); |
1292 | 4056 PROFILE_RECORD_ENTERING_SECTION (QSin_internal_external_conversion); |
4057 | |
4058 count = begin_gc_forbidden (); | |
771 | 4059 |
4060 type_checking_assert | |
4061 (((source_type == DFC_TYPE_DATA) || | |
4062 (source_type == DFC_TYPE_LISP_LSTREAM && LSTREAMP (source->lisp_object)) || | |
4063 (source_type == DFC_TYPE_LISP_STRING && STRINGP (source->lisp_object))) | |
4064 && | |
4065 ((sink_type == DFC_TYPE_DATA) || | |
4066 (sink_type == DFC_TYPE_LISP_LSTREAM && LSTREAMP (source->lisp_object)))); | |
4067 | |
4068 if (Dynarr_length (conversion_out_dynarr_list) <= | |
4069 dfc_convert_to_external_format_in_use) | |
4070 Dynarr_add (conversion_out_dynarr_list, Dynarr_new (Extbyte)); | |
4071 conversion_out_dynarr = Dynarr_at (conversion_out_dynarr_list, | |
4072 dfc_convert_to_external_format_in_use); | |
4073 Dynarr_reset (conversion_out_dynarr); | |
4074 | |
853 | 4075 internal_bind_int (&dfc_convert_to_external_format_in_use, |
4076 dfc_convert_to_external_format_in_use + 1); | |
4077 | |
771 | 4078 coding_system = get_coding_system_for_text_file (coding_system, 0); |
4079 | |
4080 /* Here we optimize in the case where the coding system does no | |
4081 conversion. However, we don't want to optimize in case the source | |
4082 or sink is an lstream, since writing to an lstream can cause a | |
4083 garbage collection, and this could be problematic if the source | |
4084 is a lisp string. */ | |
4085 if (source_type != DFC_TYPE_LISP_LSTREAM && | |
4086 sink_type != DFC_TYPE_LISP_LSTREAM && | |
4087 coding_system_is_binary (coding_system)) | |
4088 { | |
867 | 4089 const Ibyte *ptr; |
771 | 4090 Bytecount len; |
4091 | |
4092 if (source_type == DFC_TYPE_LISP_STRING) | |
4093 { | |
4094 ptr = XSTRING_DATA (source->lisp_object); | |
4095 len = XSTRING_LENGTH (source->lisp_object); | |
4096 } | |
4097 else | |
4098 { | |
867 | 4099 ptr = (Ibyte *) source->data.ptr; |
771 | 4100 len = source->data.len; |
4101 } | |
4102 | |
4103 #ifdef MULE | |
4104 { | |
867 | 4105 const Ibyte *end; |
771 | 4106 for (end = ptr + len; ptr < end;) |
4107 { | |
867 | 4108 Ibyte c = |
826 | 4109 (byte_ascii_p (*ptr)) ? *ptr : |
771 | 4110 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) : |
4111 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) : | |
4112 '~'; | |
4113 | |
4114 Dynarr_add (conversion_out_dynarr, (Extbyte) c); | |
867 | 4115 INC_IBYTEPTR (ptr); |
771 | 4116 } |
800 | 4117 text_checking_assert (ptr == end); |
771 | 4118 } |
4119 #else | |
4120 Dynarr_add_many (conversion_out_dynarr, ptr, len); | |
4121 #endif | |
4122 | |
4123 } | |
1315 | 4124 #ifdef WIN32_ANY |
771 | 4125 /* Optimize the common case involving Unicode where only ASCII is involved */ |
4126 else if (source_type != DFC_TYPE_LISP_LSTREAM && | |
4127 sink_type != DFC_TYPE_LISP_LSTREAM && | |
4128 dfc_coding_system_is_unicode (coding_system)) | |
4129 { | |
867 | 4130 const Ibyte *ptr, *p; |
771 | 4131 Bytecount len; |
867 | 4132 const Ibyte *end; |
771 | 4133 |
4134 if (source_type == DFC_TYPE_LISP_STRING) | |
4135 { | |
4136 ptr = XSTRING_DATA (source->lisp_object); | |
4137 len = XSTRING_LENGTH (source->lisp_object); | |
4138 } | |
4139 else | |
4140 { | |
867 | 4141 ptr = (Ibyte *) source->data.ptr; |
771 | 4142 len = source->data.len; |
4143 } | |
4144 end = ptr + len; | |
4145 | |
4146 for (p = ptr; p < end; p++) | |
4147 { | |
826 | 4148 if (!byte_ascii_p (*p)) |
771 | 4149 goto the_hard_way; |
4150 } | |
4151 | |
4152 for (p = ptr; p < end; p++) | |
4153 { | |
4154 Dynarr_add (conversion_out_dynarr, (Extbyte) (*p)); | |
4155 Dynarr_add (conversion_out_dynarr, (Extbyte) '\0'); | |
4156 } | |
4157 } | |
1315 | 4158 #endif /* WIN32_ANY */ |
771 | 4159 else |
4160 { | |
4161 Lisp_Object streams_to_delete[3]; | |
4162 int delete_count; | |
4163 Lisp_Object instream, outstream; | |
4164 Lstream *reader, *writer; | |
4165 | |
1315 | 4166 #ifdef WIN32_ANY |
771 | 4167 the_hard_way: |
1315 | 4168 #endif /* WIN32_ANY */ |
771 | 4169 delete_count = 0; |
4170 if (source_type == DFC_TYPE_LISP_LSTREAM) | |
4171 instream = source->lisp_object; | |
4172 else if (source_type == DFC_TYPE_DATA) | |
4173 streams_to_delete[delete_count++] = instream = | |
4174 make_fixed_buffer_input_stream (source->data.ptr, source->data.len); | |
4175 else | |
4176 { | |
4177 type_checking_assert (source_type == DFC_TYPE_LISP_STRING); | |
4178 streams_to_delete[delete_count++] = instream = | |
4179 /* This will GCPRO the Lisp string */ | |
4180 make_lisp_string_input_stream (source->lisp_object, 0, -1); | |
4181 } | |
4182 | |
4183 if (sink_type == DFC_TYPE_LISP_LSTREAM) | |
4184 outstream = sink->lisp_object; | |
4185 else | |
4186 { | |
4187 type_checking_assert (sink_type == DFC_TYPE_DATA); | |
4188 streams_to_delete[delete_count++] = outstream = | |
4189 make_dynarr_output_stream | |
4190 ((unsigned_char_dynarr *) conversion_out_dynarr); | |
4191 } | |
4192 | |
4193 streams_to_delete[delete_count++] = outstream = | |
800 | 4194 make_coding_output_stream (XLSTREAM (outstream), coding_system, |
4195 CODING_ENCODE, 0); | |
771 | 4196 |
4197 reader = XLSTREAM (instream); | |
4198 writer = XLSTREAM (outstream); | |
4199 /* decoding_stream will gc-protect outstream */ | |
1204 | 4200 { |
4201 struct gcpro gcpro1, gcpro2; | |
4202 GCPRO2 (instream, outstream); | |
4203 | |
4204 while (1) | |
4205 { | |
4206 Bytecount size_in_bytes; | |
4207 char tempbuf[1024]; /* some random amount */ | |
4208 | |
4209 size_in_bytes = Lstream_read (reader, tempbuf, sizeof (tempbuf)); | |
4210 | |
4211 if (size_in_bytes == 0) | |
4212 break; | |
4213 else if (size_in_bytes < 0) | |
4214 signal_error (Qtext_conversion_error, | |
4215 "Error converting to external format", Qunbound); | |
4216 | |
4217 if (Lstream_write (writer, tempbuf, size_in_bytes) < 0) | |
4218 signal_error (Qtext_conversion_error, | |
4219 "Error converting to external format", Qunbound); | |
4220 } | |
4221 | |
4222 /* Closing writer will close any stream at the other end of writer. */ | |
4223 Lstream_close (writer); | |
4224 Lstream_close (reader); | |
4225 UNGCPRO; | |
4226 } | |
771 | 4227 |
4228 /* The idea is that this function will create no garbage. */ | |
4229 while (delete_count) | |
4230 Lstream_delete (XLSTREAM (streams_to_delete [--delete_count])); | |
4231 } | |
4232 | |
4233 unbind_to (count); | |
4234 | |
4235 if (sink_type != DFC_TYPE_LISP_LSTREAM) | |
4236 { | |
4237 sink->data.len = Dynarr_length (conversion_out_dynarr); | |
4238 /* double zero-extend because we may be dealing with Unicode data */ | |
4239 Dynarr_add (conversion_out_dynarr, '\0'); | |
4240 Dynarr_add (conversion_out_dynarr, '\0'); | |
4967 | 4241 sink->data.ptr = Dynarr_begin (conversion_out_dynarr); |
771 | 4242 } |
1292 | 4243 |
4244 PROFILE_RECORD_EXITING_SECTION (QSin_internal_external_conversion); | |
771 | 4245 } |
4246 | |
4247 void | |
4248 dfc_convert_to_internal_format (dfc_conversion_type source_type, | |
4249 dfc_conversion_data *source, | |
4250 Lisp_Object coding_system, | |
4251 dfc_conversion_type sink_type, | |
4252 dfc_conversion_data *sink) | |
4253 { | |
4254 /* It's guaranteed that many callers are not prepared for GC here, | |
4255 esp. given that this code conversion occurs in many very hidden | |
4256 places. */ | |
1292 | 4257 int count; |
867 | 4258 Ibyte_dynarr *conversion_in_dynarr; |
2421 | 4259 Lisp_Object underlying_cs; |
1292 | 4260 PROFILE_DECLARE (); |
4261 | |
2367 | 4262 assert (!inhibit_non_essential_conversion_operations); |
1292 | 4263 PROFILE_RECORD_ENTERING_SECTION (QSin_internal_external_conversion); |
4264 | |
4265 count = begin_gc_forbidden (); | |
771 | 4266 |
4267 type_checking_assert | |
4268 ((source_type == DFC_TYPE_DATA || | |
4269 source_type == DFC_TYPE_LISP_LSTREAM) | |
4270 && | |
4271 (sink_type == DFC_TYPE_DATA || | |
4272 sink_type == DFC_TYPE_LISP_LSTREAM)); | |
4273 | |
4274 if (Dynarr_length (conversion_in_dynarr_list) <= | |
4275 dfc_convert_to_internal_format_in_use) | |
867 | 4276 Dynarr_add (conversion_in_dynarr_list, Dynarr_new (Ibyte)); |
771 | 4277 conversion_in_dynarr = Dynarr_at (conversion_in_dynarr_list, |
4278 dfc_convert_to_internal_format_in_use); | |
4279 Dynarr_reset (conversion_in_dynarr); | |
4280 | |
853 | 4281 internal_bind_int (&dfc_convert_to_internal_format_in_use, |
4282 dfc_convert_to_internal_format_in_use + 1); | |
4283 | |
2421 | 4284 /* The second call does the equivalent of both calls, but we need |
4285 the result after the first call (which wraps just a to-text | |
4286 converter) as well as the result after the second call (which | |
4287 also wraps an EOL-detection converter). */ | |
4288 underlying_cs = get_coding_system_for_text_file (coding_system, 0); | |
4289 coding_system = get_coding_system_for_text_file (underlying_cs, 1); | |
771 | 4290 |
4291 if (source_type != DFC_TYPE_LISP_LSTREAM && | |
4292 sink_type != DFC_TYPE_LISP_LSTREAM && | |
2421 | 4293 coding_system_is_binary (underlying_cs)) |
771 | 4294 { |
4295 #ifdef MULE | |
2421 | 4296 const Ibyte *ptr; |
771 | 4297 Bytecount len = source->data.len; |
2421 | 4298 const Ibyte *end; |
4299 | |
4300 /* Make sure no EOL conversion is needed. With a little work we | |
4301 could handle EOL conversion as well but it may not be needed as an | |
4302 optimization. */ | |
4303 if (!EQ (coding_system, underlying_cs)) | |
4304 { | |
4305 for (ptr = (const Ibyte *) source->data.ptr, end = ptr + len; | |
4306 ptr < end; ptr++) | |
4307 { | |
4308 if (*ptr == '\r' || *ptr == '\n') | |
4309 goto the_hard_way; | |
4310 } | |
4311 } | |
4312 | |
4313 for (ptr = (const Ibyte *) source->data.ptr, end = ptr + len; | |
4314 ptr < end; ptr++) | |
771 | 4315 { |
867 | 4316 Ibyte c = *ptr; |
771 | 4317 |
826 | 4318 if (byte_ascii_p (c)) |
771 | 4319 Dynarr_add (conversion_in_dynarr, c); |
826 | 4320 else if (byte_c1_p (c)) |
771 | 4321 { |
4322 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_CONTROL_1); | |
4323 Dynarr_add (conversion_in_dynarr, c + 0x20); | |
4324 } | |
4325 else | |
4326 { | |
4327 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_LATIN_ISO8859_1); | |
4328 Dynarr_add (conversion_in_dynarr, c); | |
4329 } | |
4330 } | |
4331 #else | |
4332 Dynarr_add_many (conversion_in_dynarr, source->data.ptr, source->data.len); | |
4333 #endif | |
4334 } | |
1315 | 4335 #ifdef WIN32_ANY |
1292 | 4336 /* Optimize the common case involving Unicode where only ASCII/Latin-1 is |
4337 involved */ | |
771 | 4338 else if (source_type != DFC_TYPE_LISP_LSTREAM && |
4339 sink_type != DFC_TYPE_LISP_LSTREAM && | |
2421 | 4340 dfc_coding_system_is_unicode (underlying_cs)) |
771 | 4341 { |
2421 | 4342 const Ibyte *ptr; |
771 | 4343 Bytecount len = source->data.len; |
2421 | 4344 const Ibyte *end; |
771 | 4345 |
4346 if (len & 1) | |
4347 goto the_hard_way; | |
4348 | |
2421 | 4349 /* Make sure only ASCII/Latin-1 is involved */ |
4350 for (ptr = (const Ibyte *) source->data.ptr + 1, end = ptr + len; | |
4351 ptr < end; ptr += 2) | |
771 | 4352 { |
4353 if (*ptr) | |
4354 goto the_hard_way; | |
4355 } | |
4356 | |
2421 | 4357 /* Make sure no EOL conversion is needed. With a little work we |
4358 could handle EOL conversion as well but it may not be needed as an | |
4359 optimization. */ | |
4360 if (!EQ (coding_system, underlying_cs)) | |
4361 { | |
4362 for (ptr = (const Ibyte *) source->data.ptr, end = ptr + len; | |
4363 ptr < end; ptr += 2) | |
4364 { | |
4365 if (*ptr == '\r' || *ptr == '\n') | |
4366 goto the_hard_way; | |
4367 } | |
4368 } | |
4369 | |
4370 for (ptr = (const Ibyte *) source->data.ptr, end = ptr + len; | |
4371 ptr < end; ptr += 2) | |
771 | 4372 { |
867 | 4373 Ibyte c = *ptr; |
771 | 4374 |
826 | 4375 if (byte_ascii_p (c)) |
771 | 4376 Dynarr_add (conversion_in_dynarr, c); |
4377 #ifdef MULE | |
826 | 4378 else if (byte_c1_p (c)) |
771 | 4379 { |
4380 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_CONTROL_1); | |
4381 Dynarr_add (conversion_in_dynarr, c + 0x20); | |
4382 } | |
4383 else | |
4384 { | |
4385 Dynarr_add (conversion_in_dynarr, LEADING_BYTE_LATIN_ISO8859_1); | |
4386 Dynarr_add (conversion_in_dynarr, c); | |
4387 } | |
4388 #endif /* MULE */ | |
4389 } | |
4390 } | |
1315 | 4391 #endif /* WIN32_ANY */ |
771 | 4392 else |
4393 { | |
4394 Lisp_Object streams_to_delete[3]; | |
4395 int delete_count; | |
4396 Lisp_Object instream, outstream; | |
4397 Lstream *reader, *writer; | |
4398 | |
2421 | 4399 #if defined (WIN32_ANY) || defined (MULE) |
771 | 4400 the_hard_way: |
2421 | 4401 #endif |
771 | 4402 delete_count = 0; |
4403 if (source_type == DFC_TYPE_LISP_LSTREAM) | |
4404 instream = source->lisp_object; | |
4405 else | |
4406 { | |
4407 type_checking_assert (source_type == DFC_TYPE_DATA); | |
4408 streams_to_delete[delete_count++] = instream = | |
4409 make_fixed_buffer_input_stream (source->data.ptr, source->data.len); | |
4410 } | |
4411 | |
4412 if (sink_type == DFC_TYPE_LISP_LSTREAM) | |
4413 outstream = sink->lisp_object; | |
4414 else | |
4415 { | |
4416 type_checking_assert (sink_type == DFC_TYPE_DATA); | |
4417 streams_to_delete[delete_count++] = outstream = | |
4418 make_dynarr_output_stream | |
4419 ((unsigned_char_dynarr *) conversion_in_dynarr); | |
4420 } | |
4421 | |
4422 streams_to_delete[delete_count++] = outstream = | |
800 | 4423 make_coding_output_stream (XLSTREAM (outstream), coding_system, |
4424 CODING_DECODE, 0); | |
771 | 4425 |
4426 reader = XLSTREAM (instream); | |
4427 writer = XLSTREAM (outstream); | |
1204 | 4428 { |
4429 struct gcpro gcpro1, gcpro2; | |
4430 /* outstream will gc-protect its sink stream, if necessary */ | |
4431 GCPRO2 (instream, outstream); | |
4432 | |
4433 while (1) | |
4434 { | |
4435 Bytecount size_in_bytes; | |
4436 char tempbuf[1024]; /* some random amount */ | |
4437 | |
4438 size_in_bytes = Lstream_read (reader, tempbuf, sizeof (tempbuf)); | |
4439 | |
4440 if (size_in_bytes == 0) | |
4441 break; | |
4442 else if (size_in_bytes < 0) | |
4443 signal_error (Qtext_conversion_error, | |
4444 "Error converting to internal format", Qunbound); | |
4445 | |
4446 if (Lstream_write (writer, tempbuf, size_in_bytes) < 0) | |
4447 signal_error (Qtext_conversion_error, | |
4448 "Error converting to internal format", Qunbound); | |
4449 } | |
4450 | |
4451 /* Closing writer will close any stream at the other end of writer. */ | |
4452 Lstream_close (writer); | |
4453 Lstream_close (reader); | |
4454 UNGCPRO; | |
4455 } | |
771 | 4456 |
4457 /* The idea is that this function will create no garbage. */ | |
4458 while (delete_count) | |
4459 Lstream_delete (XLSTREAM (streams_to_delete [--delete_count])); | |
4460 } | |
4461 | |
4462 unbind_to (count); | |
4463 | |
4464 if (sink_type != DFC_TYPE_LISP_LSTREAM) | |
4465 { | |
4466 sink->data.len = Dynarr_length (conversion_in_dynarr); | |
4467 Dynarr_add (conversion_in_dynarr, '\0'); /* remember to NUL-terminate! */ | |
4468 /* The macros don't currently distinguish between internal and | |
4469 external sinks, and allocate and copy two extra bytes in both | |
4470 cases. So we add a second zero, just like for external data | |
4471 (in that case, because we may be converting to Unicode). */ | |
4472 Dynarr_add (conversion_in_dynarr, '\0'); | |
4967 | 4473 sink->data.ptr = Dynarr_begin (conversion_in_dynarr); |
771 | 4474 } |
1292 | 4475 |
4476 PROFILE_RECORD_EXITING_SECTION (QSin_internal_external_conversion); | |
771 | 4477 } |
4478 | |
1318 | 4479 /* ----------------------------------------------------------------------- */ |
2367 | 4480 /* Alloca-conversion helpers */ |
4481 /* ----------------------------------------------------------------------- */ | |
4482 | |
/* For alloca(), things are trickier because the calling function needs to
   allocate.  This means that the caller needs to do the following:

   (a) invoke us to do the conversion, remember the data and return the size.
   (b) alloca() the proper size.
   (c) invoke us again to copy the data.

   We need to handle the possibility of two or more invocations of the
   converter in the same expression.  In such cases it's conceivable that
   the evaluation of the sub-expressions will overlap (e.g. one size
   function is called, then the other one, and only then the copy
   functions).  To handle this, we keep a list of active data, indexed by
   the src expression.  (We use the stringize operator to avoid evaluating
   the expression multiple times.)  If the caller uses the exact same src
   expression in two converter calls in the same subexpression, we will
   lose, but at least we can check for this and ABORT().  We could
   conceivably try to index on other parameters as well, but there is not
   really any point. */
4501 | |
4502 alloca_convert_vals_dynarr *active_alloca_convert; | |
4503 | |
4504 int | |
4505 find_pos_of_existing_active_alloca_convert (const char *srctext) | |
4506 { | |
4507 alloca_convert_vals *vals = NULL; | |
4508 int i; | |
4509 | |
4510 if (!active_alloca_convert) | |
4511 active_alloca_convert = Dynarr_new (alloca_convert_vals); | |
4512 | |
4513 for (i = 0; i < Dynarr_length (active_alloca_convert); i++) | |
4514 { | |
4515 vals = Dynarr_atp (active_alloca_convert, i); | |
2385 | 4516 /* On my system, two different occurrences of the same stringized |
4517 argument always point to the same string. However, on someone | |
4518 else's system, that wasn't the case. We check for equality | |
4519 first, since it seems systems work my way more than the other | |
4520 way. */ | |
4521 if (vals->srctext == srctext || !strcmp (vals->srctext, srctext)) | |
2367 | 4522 return i; |
4523 } | |
4524 | |
4525 return -1; | |
4526 } | |
4527 | |
4528 /* ----------------------------------------------------------------------- */ | |
1318 | 4529 /* New-style DFC converters (data is returned rather than stored into var) */ |
4530 /* ----------------------------------------------------------------------- */ | |
4531 | |
/* We handle here the cases where SRC is a Lisp_Object, internal data
   (sized or unsized), or external data (sized or unsized), and return type
   is unsized alloca() or malloc() data.  If the return type is a
   Lisp_Object, use build_extstring() for unsized external data,
   make_extstring() for sized external data.  If the return type needs to
   be sized data, use the *_TO_SIZED_*() macros, and for other more
   complicated cases, use the original TO_*_FORMAT() macros. */
4539 | |
/* Perform one conversion of SRC according to TYPE and CODESYS, storing
   the malloc()ed result in *DST and its size in *DST_SIZE.

   TYPE selects both the direction and how SRC is interpreted:
     DFC_EXTERNAL        unsized external data  -> internal format
     DFC_SIZED_EXTERNAL  SRC_SIZE bytes of external data -> internal format
     DFC_INTERNAL        unsized internal data  -> external format
     DFC_SIZED_INTERNAL  SRC_SIZE bytes of internal data -> external format
     DFC_LISP_STRING     SRC is a Lisp string passed as a void *
                                                -> external format
   Any other value aborts.  SRC_SIZE is used only by the two sized
   cases.  On return *DST_SIZE has been increased by 2 (see the comment
   at the end of the function). */
static void
new_dfc_convert_now_damn_it (const void *src, Bytecount src_size,
                             enum new_dfc_src_type type,
                             void **dst, Bytecount *dst_size,
                             Lisp_Object codesys)
{
  /* #### In the case of alloca(), it would be a bit more efficient, for
     small strings, to use static Dynarr's like are used internally in
     TO_*_FORMAT(), or some other way of avoiding malloc() followed by
     free().  I doubt it really matters, though. */

  switch (type)
    {
    case DFC_EXTERNAL:
      TO_INTERNAL_FORMAT (C_STRING, src,
                          MALLOC, (*dst, *dst_size), codesys);
      break;

    case DFC_SIZED_EXTERNAL:
      TO_INTERNAL_FORMAT (DATA, (src, src_size),
                          MALLOC, (*dst, *dst_size), codesys);
      break;

    case DFC_INTERNAL:
      TO_EXTERNAL_FORMAT (C_STRING, src,
                          MALLOC, (*dst, *dst_size), codesys);
      break;

    case DFC_SIZED_INTERNAL:
      TO_EXTERNAL_FORMAT (DATA, (src, src_size),
                          MALLOC, (*dst, *dst_size), codesys);
      break;

    case DFC_LISP_STRING:
      /* SRC carries a Lisp_Object disguised as a pointer. */
      TO_EXTERNAL_FORMAT (LISP_STRING, VOID_TO_LISP (src),
                          MALLOC, (*dst, *dst_size), codesys);
      break;

    default:
      ABORT ();
    }

  /* The size is always + 2 because we have double zero-termination at the
     end of all data (for Unicode-correctness). */
  *dst_size += 2;
}
4586 | |
4587 Bytecount | |
4588 new_dfc_convert_size (const char *srctext, const void *src, | |
4589 Bytecount src_size, enum new_dfc_src_type type, | |
4590 Lisp_Object codesys) | |
4591 { | |
4592 alloca_convert_vals vals; | |
4593 | |
2721 | 4594 int i = find_pos_of_existing_active_alloca_convert (srctext); |
4595 assert (i < 0); | |
2367 | 4596 |
4597 vals.srctext = srctext; | |
4598 | |
4599 new_dfc_convert_now_damn_it (src, src_size, type, &vals.dst, &vals.dst_size, | |
4600 codesys); | |
4601 | |
4602 Dynarr_add (active_alloca_convert, vals); | |
4603 return vals.dst_size; | |
4604 } | |
4605 | |
4606 void * | |
4607 new_dfc_convert_copy_data (const char *srctext, void *alloca_data) | |
4608 { | |
4609 alloca_convert_vals *vals; | |
4610 int i = find_pos_of_existing_active_alloca_convert (srctext); | |
4611 | |
4612 assert (i >= 0); | |
4613 vals = Dynarr_atp (active_alloca_convert, i); | |
4614 assert (alloca_data); | |
4615 memcpy (alloca_data, vals->dst, vals->dst_size); | |
4976
16112448d484
Rename xfree(FOO, TYPE) -> xfree(FOO)
Ben Wing <ben@xemacs.org>
parents:
4967
diff
changeset
|
4616 xfree (vals->dst); |
2367 | 4617 Dynarr_delete (active_alloca_convert, i); |
4618 return alloca_data; | |
1318 | 4619 } |
4620 | |
4621 void * | |
4622 new_dfc_convert_malloc (const void *src, Bytecount src_size, | |
4623 enum new_dfc_src_type type, Lisp_Object codesys) | |
4624 { | |
4625 void *dst; | |
4626 Bytecount dst_size; | |
4627 | |
4628 new_dfc_convert_now_damn_it (src, src_size, type, &dst, &dst_size, codesys); | |
4629 return dst; | |
4630 } | |
4631 | |
771 | 4632 |
4633 /************************************************************************/ | |
867 | 4634 /* Basic Ichar functions */ |
771 | 4635 /************************************************************************/ |
4636 | |
4637 #ifdef MULE | |
4638 | |
4639 /* Convert a non-ASCII Mule character C into a one-character Mule-encoded | |
4640 string in STR. Returns the number of bytes stored. | |
867 | 4641 Do not call this directly. Use the macro set_itext_ichar() instead. |
771 | 4642 */ |
4643 | |
4644 Bytecount | |
867 | 4645 non_ascii_set_itext_ichar (Ibyte *str, Ichar c) |
771 | 4646 { |
867 | 4647 Ibyte *p; |
4648 Ibyte lb; | |
771 | 4649 int c1, c2; |
4650 Lisp_Object charset; | |
4651 | |
4652 p = str; | |
867 | 4653 BREAKUP_ICHAR (c, charset, c1, c2); |
4654 lb = ichar_leading_byte (c); | |
826 | 4655 if (leading_byte_private_p (lb)) |
4656 *p++ = private_leading_byte_prefix (lb); | |
771 | 4657 *p++ = lb; |
4658 if (EQ (charset, Vcharset_control_1)) | |
4659 c1 += 0x20; | |
4660 *p++ = c1 | 0x80; | |
4661 if (c2) | |
4662 *p++ = c2 | 0x80; | |
4663 | |
4664 return (p - str); | |
4665 } | |
4666 | |
4667 /* Return the first character from a Mule-encoded string in STR, | |
4668 assuming it's non-ASCII. Do not call this directly. | |
867 | 4669 Use the macro itext_ichar() instead. */ |
4670 | |
4671 Ichar | |
4672 non_ascii_itext_ichar (const Ibyte *str) | |
771 | 4673 { |
867 | 4674 Ibyte i0 = *str, i1, i2 = 0; |
771 | 4675 Lisp_Object charset; |
4676 | |
4677 if (i0 == LEADING_BYTE_CONTROL_1) | |
867 | 4678 return (Ichar) (*++str - 0x20); |
771 | 4679 |
826 | 4680 if (leading_byte_prefix_p (i0)) |
771 | 4681 i0 = *++str; |
4682 | |
4683 i1 = *++str & 0x7F; | |
4684 | |
826 | 4685 charset = charset_by_leading_byte (i0); |
771 | 4686 if (XCHARSET_DIMENSION (charset) == 2) |
4687 i2 = *++str & 0x7F; | |
4688 | |
867 | 4689 return make_ichar (charset, i1, i2); |
771 | 4690 } |
4691 | |
867 | 4692 /* Return whether CH is a valid Ichar, assuming it's non-ASCII. |
4693 Do not call this directly. Use the macro valid_ichar_p() instead. */ | |
771 | 4694 |
4695 int | |
867 | 4696 non_ascii_valid_ichar_p (Ichar ch) |
771 | 4697 { |
4698 int f1, f2, f3; | |
4699 | |
3498 | 4700 /* Must have only lowest 21 bits set */ |
4701 if (ch & ~0x1FFFFF) | |
771 | 4702 return 0; |
4703 | |
867 | 4704 f1 = ichar_field1 (ch); |
4705 f2 = ichar_field2 (ch); | |
4706 f3 = ichar_field3 (ch); | |
771 | 4707 |
4708 if (f1 == 0) | |
4709 { | |
4710 /* dimension-1 char */ | |
4711 Lisp_Object charset; | |
4712 | |
4713 /* leading byte must be correct */ | |
867 | 4714 if (f2 < MIN_ICHAR_FIELD2_OFFICIAL || |
4715 (f2 > MAX_ICHAR_FIELD2_OFFICIAL && f2 < MIN_ICHAR_FIELD2_PRIVATE) || | |
4716 f2 > MAX_ICHAR_FIELD2_PRIVATE) | |
771 | 4717 return 0; |
4718 /* octet not out of range */ | |
4719 if (f3 < 0x20) | |
4720 return 0; | |
4721 /* charset exists */ | |
4722 /* | |
4723 NOTE: This takes advantage of the fact that | |
4724 FIELD2_TO_OFFICIAL_LEADING_BYTE and | |
4725 FIELD2_TO_PRIVATE_LEADING_BYTE are the same. | |
4726 */ | |
826 | 4727 charset = charset_by_leading_byte (f2 + FIELD2_TO_OFFICIAL_LEADING_BYTE); |
771 | 4728 if (EQ (charset, Qnil)) |
4729 return 0; | |
4730 /* check range as per size (94 or 96) of charset */ | |
4731 return ((f3 > 0x20 && f3 < 0x7f) || XCHARSET_CHARS (charset) == 96); | |
4732 } | |
4733 else | |
4734 { | |
4735 /* dimension-2 char */ | |
4736 Lisp_Object charset; | |
4737 | |
4738 /* leading byte must be correct */ | |
867 | 4739 if (f1 < MIN_ICHAR_FIELD1_OFFICIAL || |
4740 (f1 > MAX_ICHAR_FIELD1_OFFICIAL && f1 < MIN_ICHAR_FIELD1_PRIVATE) || | |
4741 f1 > MAX_ICHAR_FIELD1_PRIVATE) | |
771 | 4742 return 0; |
4743 /* octets not out of range */ | |
4744 if (f2 < 0x20 || f3 < 0x20) | |
4745 return 0; | |
4746 | |
4747 #ifdef ENABLE_COMPOSITE_CHARS | |
4748 if (f1 + FIELD1_TO_OFFICIAL_LEADING_BYTE == LEADING_BYTE_COMPOSITE) | |
4749 { | |
4750 if (UNBOUNDP (Fgethash (make_int (ch), | |
4751 Vcomposite_char_char2string_hash_table, | |
4752 Qunbound))) | |
4753 return 0; | |
4754 return 1; | |
4755 } | |
4756 #endif /* ENABLE_COMPOSITE_CHARS */ | |
4757 | |
4758 /* charset exists */ | |
867 | 4759 if (f1 <= MAX_ICHAR_FIELD1_OFFICIAL) |
771 | 4760 charset = |
826 | 4761 charset_by_leading_byte (f1 + FIELD1_TO_OFFICIAL_LEADING_BYTE); |
771 | 4762 else |
4763 charset = | |
826 | 4764 charset_by_leading_byte (f1 + FIELD1_TO_PRIVATE_LEADING_BYTE); |
771 | 4765 |
4766 if (EQ (charset, Qnil)) | |
4767 return 0; | |
4768 /* check range as per size (94x94 or 96x96) of charset */ | |
4769 return ((f2 != 0x20 && f2 != 0x7F && f3 != 0x20 && f3 != 0x7F) || | |
4770 XCHARSET_CHARS (charset) == 96); | |
4771 } | |
4772 } | |
4773 | |
4774 /* Copy the character pointed to by SRC into DST. Do not call this | |
867 | 4775 directly. Use the macro itext_copy_ichar() instead. |
771 | 4776 Return the number of bytes copied. */ |
4777 | |
4778 Bytecount | |
867 | 4779 non_ascii_itext_copy_ichar (const Ibyte *src, Ibyte *dst) |
771 | 4780 { |
826 | 4781 Bytecount bytes = rep_bytes_by_first_byte (*src); |
771 | 4782 Bytecount i; |
4783 for (i = bytes; i; i--, dst++, src++) | |
4784 *dst = *src; | |
4785 return bytes; | |
4786 } | |
4787 | |
4788 #endif /* MULE */ | |
4789 | |
4790 | |
4791 /************************************************************************/ | |
867 | 4792 /* streams of Ichars */ |
771 | 4793 /************************************************************************/ |
4794 | |
4795 #ifdef MULE | |
4796 | |
867 | 4797 /* Treat a stream as a stream of Ichar's rather than a stream of bytes. |
771 | 4798 The functions below are not meant to be called directly; use |
4799 the macros in insdel.h. */ | |
4800 | |
867 | 4801 Ichar |
4802 Lstream_get_ichar_1 (Lstream *stream, int ch) | |
771 | 4803 { |
867 | 4804 Ibyte str[MAX_ICHAR_LEN]; |
4805 Ibyte *strptr = str; | |
771 | 4806 Bytecount bytes; |
4807 | |
867 | 4808 str[0] = (Ibyte) ch; |
771 | 4809 |
826 | 4810 for (bytes = rep_bytes_by_first_byte (ch) - 1; bytes; bytes--) |
771 | 4811 { |
4812 int c = Lstream_getc (stream); | |
800 | 4813 text_checking_assert (c >= 0); |
867 | 4814 *++strptr = (Ibyte) c; |
771 | 4815 } |
867 | 4816 return itext_ichar (str); |
771 | 4817 } |
4818 | |
4819 int | |
867 | 4820 Lstream_fput_ichar (Lstream *stream, Ichar ch) |
771 | 4821 { |
867 | 4822 Ibyte str[MAX_ICHAR_LEN]; |
4823 Bytecount len = set_itext_ichar (str, ch); | |
771 | 4824 return Lstream_write (stream, str, len); |
4825 } | |
4826 | |
4827 void | |
867 | 4828 Lstream_funget_ichar (Lstream *stream, Ichar ch) |
771 | 4829 { |
867 | 4830 Ibyte str[MAX_ICHAR_LEN]; |
4831 Bytecount len = set_itext_ichar (str, ch); | |
771 | 4832 Lstream_unread (stream, str, len); |
4833 } | |
4834 | |
4835 #endif /* MULE */ | |
4836 | |
4837 | |
4838 /************************************************************************/ | |
4839 /* Lisp primitives for working with characters */ | |
4840 /************************************************************************/ | |
4841 | |
4842 DEFUN ("make-char", Fmake_char, 2, 3, 0, /* | |
4843 Make a character from CHARSET and octets ARG1 and ARG2. | |
4844 ARG2 is required only for characters from two-dimensional charsets. | |
4845 | |
4846 Each octet should be in the range 32 through 127 for a 96 or 96x96 | |
4847 charset and 33 through 126 for a 94 or 94x94 charset. (Most charsets | |
4848 are either 96 or 94x94.) Note that this is 32 more than the values | |
4849 typically given for 94x94 charsets. When two octets are required, the | |
4850 order is "standard" -- the same as appears in ISO-2022 encodings, | |
4851 reference tables, etc. | |
4852 | |
4853 \(Note the following non-obvious result: Computerized translation | |
4854 tables often encode the two octets as the high and low bytes, | |
4855 respectively, of a hex short, while when there's only one octet, it | |
4856 goes in the low byte. When decoding such a value, you need to treat | |
4857 the two cases differently when calling make-char: One is (make-char | |
4858 CHARSET HIGH LOW), the other is (make-char CHARSET LOW).) | |
4859 | |
4860 For example, (make-char 'latin-iso8859-2 185) or (make-char | |
4861 'latin-iso8859-2 57) will return the Latin 2 character s with caron. | |
4862 | |
4863 As another example, the Japanese character for "kawa" (stream), which | |
4864 looks something like this: | |
4865 | |
4866 | | | |
4867 | | | | |
4868 | | | | |
4869 | | | | |
4870 / | | |
4871 | |
4872 appears in the Unicode Standard (version 2.0) on page 7-287 with the | |
4873 following values (see also page 7-4): | |
4874 | |
4875 U 5DDD (Unicode) | |
4876 G 0-2008 (GB 2312-80) | |
4877 J 0-3278 (JIS X 0208-1990) | |
4878 K 0-8425 (KS C 5601-1987) | |
4879 B A474 (Big Five) | |
4880 C 1-4455 (CNS 11643-1986 (1st plane)) | |
4881 A 213C34 (ANSI Z39.64-1989) | |
4882 | |
4883 These are equivalent to: | |
4884 | |
4885 \(make-char 'chinese-gb2312 52 40) | |
4886 \(make-char 'japanese-jisx0208 64 110) | |
4887 \(make-char 'korean-ksc5601 116 57) | |
4888 \(make-char 'chinese-cns11643-1 76 87) | |
4889 \(decode-big5-char '(164 . 116)) | |
4890 | |
4891 \(All codes above are two decimal numbers except for Big Five and ANSI | |
4892 Z39.64, which we don't support. We add 32 to each of the decimal | |
4893 numbers. Big Five is split in a rather hackish fashion into two | |
4894 charsets, `big5-1' and `big5-2', due to its excessive size -- 94x157, | |
4895 with the first codepoint in the range 0xA1 to 0xFE and the second in | |
4896 the range 0x40 to 0x7E or 0xA1 to 0xFE. `decode-big5-char' is used to | |
4897 generate the char from its codes, and `encode-big5-char' extracts the | |
4898 codes.) | |
4899 | |
4900 When compiled without MULE, this function does not do much, but it's | |
4901 provided for compatibility. In this case, the following CHARSET symbols | |
4902 are allowed: | |
4903 | |
4904 `ascii' -- ARG1 should be in the range 0 through 127. | |
4905 `control-1' -- ARG1 should be in the range 128 through 159. | |
4906 else -- ARG1 is coerced to be between 0 and 255, and then the high | |
4907 bit is set. | |
4908 | |
4909 `int-to-char of the resulting ARG1' is returned, and ARG2 is always ignored. | |
4910 */ | |
2333 | 4911 (charset, arg1, USED_IF_MULE (arg2))) |
771 | 4912 { |
4913 #ifdef MULE | |
4914 Lisp_Charset *cs; | |
4915 int a1, a2; | |
4916 int lowlim, highlim; | |
4917 | |
4918 charset = Fget_charset (charset); | |
4919 cs = XCHARSET (charset); | |
4920 | |
788 | 4921 get_charset_limits (charset, &lowlim, &highlim); |
771 | 4922 |
4923 CHECK_INT (arg1); | |
4924 /* It is useful (and safe, according to Olivier Galibert) to strip | |
4925 the 8th bit off ARG1 and ARG2 because it allows programmers to | |
4926 write (make-char 'latin-iso8859-2 CODE) where code is the actual | |
4927 Latin 2 code of the character. */ | |
4928 a1 = XINT (arg1) & 0x7f; | |
4929 if (a1 < lowlim || a1 > highlim) | |
4930 args_out_of_range_3 (arg1, make_int (lowlim), make_int (highlim)); | |
4931 | |
4932 if (CHARSET_DIMENSION (cs) == 1) | |
4933 { | |
4934 if (!NILP (arg2)) | |
4935 invalid_argument | |
4936 ("Charset is of dimension one; second octet must be nil", arg2); | |
867 | 4937 return make_char (make_ichar (charset, a1, 0)); |
771 | 4938 } |
4939 | |
4940 CHECK_INT (arg2); | |
4941 a2 = XINT (arg2) & 0x7f; | |
4942 if (a2 < lowlim || a2 > highlim) | |
4943 args_out_of_range_3 (arg2, make_int (lowlim), make_int (highlim)); | |
4944 | |
867 | 4945 return make_char (make_ichar (charset, a1, a2)); |
771 | 4946 #else |
4947 int a1; | |
4948 int lowlim, highlim; | |
4949 | |
4950 if (EQ (charset, Qascii)) lowlim = 0, highlim = 127; | |
4951 else if (EQ (charset, Qcontrol_1)) lowlim = 0, highlim = 31; | |
4952 else lowlim = 0, highlim = 127; | |
4953 | |
4954 CHECK_INT (arg1); | |
4955 /* It is useful (and safe, according to Olivier Galibert) to strip | |
4956 the 8th bit off ARG1 and ARG2 because it allows programmers to | |
4957 write (make-char 'latin-iso8859-2 CODE) where code is the actual | |
4958 Latin 2 code of the character. */ | |
4959 a1 = XINT (arg1) & 0x7f; | |
4960 if (a1 < lowlim || a1 > highlim) | |
4961 args_out_of_range_3 (arg1, make_int (lowlim), make_int (highlim)); | |
4962 | |
4963 if (EQ (charset, Qascii)) | |
4964 return make_char (a1); | |
4965 return make_char (a1 + 128); | |
4966 #endif /* MULE */ | |
4967 } | |
4968 | |
4969 #ifdef MULE | |
4970 | |
4971 DEFUN ("char-charset", Fchar_charset, 1, 1, 0, /* | |
4972 Return the character set of char CH. | |
4973 */ | |
4974 (ch)) | |
4975 { | |
4976 CHECK_CHAR_COERCE_INT (ch); | |
4977 | |
826 | 4978 return XCHARSET_NAME (charset_by_leading_byte |
867 | 4979 (ichar_leading_byte (XCHAR (ch)))); |
771 | 4980 } |
4981 | |
4982 DEFUN ("char-octet", Fchar_octet, 1, 2, 0, /* | |
4983 Return the octet numbered N (should be 0 or 1) of char CH. | |
4984 N defaults to 0 if omitted. | |
4985 */ | |
4986 (ch, n)) | |
4987 { | |
4988 Lisp_Object charset; | |
4989 int octet0, octet1; | |
4990 | |
4991 CHECK_CHAR_COERCE_INT (ch); | |
4992 | |
867 | 4993 BREAKUP_ICHAR (XCHAR (ch), charset, octet0, octet1); |
771 | 4994 |
4995 if (NILP (n) || EQ (n, Qzero)) | |
4996 return make_int (octet0); | |
4997 else if (EQ (n, make_int (1))) | |
4998 return make_int (octet1); | |
4999 else | |
5000 invalid_constant ("Octet number must be 0 or 1", n); | |
5001 } | |
5002 | |
3724 | 5003 #endif /* MULE */ |
5004 | |
771 | 5005 DEFUN ("split-char", Fsplit_char, 1, 1, 0, /* |
5006 Return list of charset and one or two position-codes of CHAR. | |
5007 */ | |
5008 (character)) | |
5009 { | |
5010 /* This function can GC */ | |
5011 struct gcpro gcpro1, gcpro2; | |
5012 Lisp_Object charset = Qnil; | |
5013 Lisp_Object rc = Qnil; | |
5014 int c1, c2; | |
5015 | |
5016 GCPRO2 (charset, rc); | |
5017 CHECK_CHAR_COERCE_INT (character); | |
5018 | |
867 | 5019 BREAKUP_ICHAR (XCHAR (character), charset, c1, c2); |
771 | 5020 |
3724 | 5021 if (XCHARSET_DIMENSION (charset) == 2) |
771 | 5022 { |
5023 rc = list3 (XCHARSET_NAME (charset), make_int (c1), make_int (c2)); | |
5024 } | |
5025 else | |
5026 { | |
5027 rc = list2 (XCHARSET_NAME (charset), make_int (c1)); | |
5028 } | |
5029 UNGCPRO; | |
5030 | |
5031 return rc; | |
5032 } | |
5033 | |
5034 | |
5035 /************************************************************************/ | |
5036 /* composite character functions */ | |
5037 /************************************************************************/ | |
5038 | |
5039 #ifdef ENABLE_COMPOSITE_CHARS | |
5040 | |
867 | 5041 Ichar |
5042 lookup_composite_char (Ibyte *str, int len) | |
771 | 5043 { |
5044 Lisp_Object lispstr = make_string (str, len); | |
5045 Lisp_Object ch = Fgethash (lispstr, | |
5046 Vcomposite_char_string2char_hash_table, | |
5047 Qunbound); | |
867 | 5048 Ichar emch; |
771 | 5049 |
5050 if (UNBOUNDP (ch)) | |
5051 { | |
5052 if (composite_char_row_next >= 128) | |
5053 invalid_operation ("No more composite chars available", lispstr); | |
867 | 5054 emch = make_ichar (Vcharset_composite, composite_char_row_next, |
771 | 5055 composite_char_col_next); |
5056 Fputhash (make_char (emch), lispstr, | |
5057 Vcomposite_char_char2string_hash_table); | |
5058 Fputhash (lispstr, make_char (emch), | |
5059 Vcomposite_char_string2char_hash_table); | |
5060 composite_char_col_next++; | |
5061 if (composite_char_col_next >= 128) | |
5062 { | |
5063 composite_char_col_next = 32; | |
5064 composite_char_row_next++; | |
5065 } | |
5066 } | |
5067 else | |
5068 emch = XCHAR (ch); | |
5069 return emch; | |
5070 } | |
5071 | |
5072 Lisp_Object | |
867 | 5073 composite_char_string (Ichar ch) |
771 | 5074 { |
5075 Lisp_Object str = Fgethash (make_char (ch), | |
5076 Vcomposite_char_char2string_hash_table, | |
5077 Qunbound); | |
5078 assert (!UNBOUNDP (str)); | |
5079 return str; | |
5080 } | |
5081 | |
826 | 5082 DEFUN ("make-composite-char", Fmake_composite_char, 1, 1, 0, /* |
771 | 5083 Convert a string into a single composite character. |
5084 The character is the result of overstriking all the characters in | |
5085 the string. | |
5086 */ | |
5087 (string)) | |
5088 { | |
5089 CHECK_STRING (string); | |
5090 return make_char (lookup_composite_char (XSTRING_DATA (string), | |
5091 XSTRING_LENGTH (string))); | |
5092 } | |
5093 | |
826 | 5094 DEFUN ("composite-char-string", Fcomposite_char_string, 1, 1, 0, /* |
771 | 5095 Return a string of the characters comprising a composite character. |
5096 */ | |
5097 (ch)) | |
5098 { | |
867 | 5099 Ichar emch; |
771 | 5100 |
5101 CHECK_CHAR (ch); | |
5102 emch = XCHAR (ch); | |
867 | 5103 if (ichar_leading_byte (emch) != LEADING_BYTE_COMPOSITE) |
771 | 5104 invalid_argument ("Must be composite char", ch); |
5105 return composite_char_string (emch); | |
5106 } | |
5107 #endif /* ENABLE_COMPOSITE_CHARS */ | |
5108 | |
5109 | |
5110 /************************************************************************/ | |
5111 /* initialization */ | |
5112 /************************************************************************/ | |
5113 | |
5114 void | |
1204 | 5115 reinit_eistring_early (void) |
771 | 5116 { |
5117 the_eistring_malloc_zero_init = the_eistring_zero_init; | |
5118 the_eistring_malloc_zero_init.mallocp_ = 1; | |
5119 } | |
5120 | |
/* One-time early initialization of the Eistring templates; simply
   performs the same work as reinit_eistring_early(). */

void
init_eistring_once_early (void)
{
  reinit_eistring_early ();
}
5126 | |
5127 void | |
771 | 5128 syms_of_text (void) |
5129 { | |
5130 DEFSUBR (Fmake_char); | |
3724 | 5131 DEFSUBR (Fsplit_char); |
771 | 5132 |
5133 #ifdef MULE | |
5134 DEFSUBR (Fchar_charset); | |
5135 DEFSUBR (Fchar_octet); | |
5136 | |
5137 #ifdef ENABLE_COMPOSITE_CHARS | |
5138 DEFSUBR (Fmake_composite_char); | |
5139 DEFSUBR (Fcomposite_char_string); | |
5140 #endif | |
5141 #endif /* MULE */ | |
5142 } | |
5143 | |
5144 void | |
5145 reinit_vars_of_text (void) | |
5146 { | |
5147 int i; | |
5148 | |
867 | 5149 conversion_in_dynarr_list = Dynarr_new2 (Ibyte_dynarr_dynarr, |
5150 Ibyte_dynarr *); | |
771 | 5151 conversion_out_dynarr_list = Dynarr_new2 (Extbyte_dynarr_dynarr, |
5152 Extbyte_dynarr *); | |
5153 | |
5154 for (i = 0; i <= MAX_BYTEBPOS_GAP_SIZE_3; i++) | |
5155 three_to_one_table[i] = i / 3; | |
5156 } | |
5157 | |
5158 void | |
5159 vars_of_text (void) | |
5160 { | |
4952
19a72041c5ed
Mule-izing, various fixes related to char * arguments
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
5161 QSin_char_byte_conversion = build_defer_string ("(in char-byte conversion)"); |
1292 | 5162 staticpro (&QSin_char_byte_conversion); |
5163 QSin_internal_external_conversion = | |
4952
19a72041c5ed
Mule-izing, various fixes related to char * arguments
Ben Wing <ben@xemacs.org>
parents:
4526
diff
changeset
|
5164 build_defer_string ("(in internal-external conversion)"); |
1292 | 5165 staticpro (&QSin_internal_external_conversion); |
5166 | |
771 | 5167 #ifdef ENABLE_COMPOSITE_CHARS |
5168 /* #### not dumped properly */ | |
5169 composite_char_row_next = 32; | |
5170 composite_char_col_next = 32; | |
5171 | |
5172 Vcomposite_char_string2char_hash_table = | |
5173 make_lisp_hash_table (500, HASH_TABLE_NON_WEAK, HASH_TABLE_EQUAL); | |
5174 Vcomposite_char_char2string_hash_table = | |
5175 make_lisp_hash_table (500, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ); | |
5176 staticpro (&Vcomposite_char_string2char_hash_table); | |
5177 staticpro (&Vcomposite_char_char2string_hash_table); | |
5178 #endif /* ENABLE_COMPOSITE_CHARS */ | |
5179 } |