comparison src/text.c @ 2367:ecf1ebac70d8

[xemacs-hg @ 2004-11-04 23:05:23 by ben] commit mega-patch configure.in: Turn off -Winline and -Wchar-subscripts. Use the right set of cflags when compiling modules. Rewrite ldap configuration to separate the inclusion of lber (needed in recent Cygwin) from the basic checks for the needed libraries. add a function for MAKE_JUNK_C; initially code was added to generate xemacs.def using this, but it will need to be rewritten. add an rm -f for junk.c to avoid weird Cygwin bug with cp -f onto an existing file. Sort list of auto-detected functions and eliminate unused checks for stpcpy, setlocale and getwd. Add autodetection of Cygwin scanf problems BETA: Rewrite section on configure to indicate what flags are important and what not. digest-doc.c, make-dump-id.c, profile.c, sorted-doc.c: Add proper decls for main(). make-msgfile.c: Document that this is old junk. Move proposal to text.c. make-msgfile.lex: Move proposal to text.c. make-mswin-unicode.pl: Convert error-generating code so that the entire message will be seen as a single unrecognized token. mule/mule-ccl.el: Update docs. lispref/mule.texi: Update CCL docs. ldap/eldap.c: Mule-ize. Use EXTERNAL_LIST_LOOP_2 instead of deleted EXTERNAL_LIST_LOOP. * XEmacs 21.5.18 "chestnut" is released. --------------------------------------------------------------- MULE-RELATED WORK: --------------------------------------------------------------- --------------------------- byte-char conversion --------------------------- buffer.c, buffer.h, insdel.c, text.c: Port FSF algorithm for byte-char conversion, replacing broken previous version. Track the char position of the gap. Add functions to do char-byte conversion downwards as well as upwards. Move comments about algorithm workings to internals manual. 
--------------------------- work on types --------------------------- alloc.c, console-x-impl.h, dump-data.c, dump-data.h, dumper.c, dialog-msw.c, dired-msw.c, doc.c, editfns.c, esd.c, event-gtk.h, event-msw.c, events.c, file-coding.c, file-coding.h, fns.c, glyphs-eimage.c, glyphs-gtk.c, glyphs-msw.c, glyphs-shared.c, glyphs-x.c, glyphs.c, glyphs.h, gui.c, hpplay.c, imgproc.c, intl-win32.c, lrecord.h, lstream.c, keymap.c, lisp.h, libsst.c, linuxplay.c, miscplay.c, miscplay.h, mule-coding.c, nas.c, nt.c, ntheap.c, ntplay.c, objects-msw.c, objects-tty.c, objects-x.c, print.c, process-nt.c, process.c, redisplay.h, select-common.h, select-gtk.c, select-x.c, sgiplay.c, sound.c, sound.h, sunplay.c, sysfile.h, sysdep.c, syswindows.h, text.c, unexnt.c, win32.c, xgccache.c: Further work on types. This creates a full set of types for all the basic semantics of `char' that I have so far identified, so that its semantics can always be identified for the purposes of proper Mule-safe code, and the raw use of `char' always avoided. (1) More type renaming, for consistency of naming. Char_ASCII -> Ascbyte UChar_ASCII -> UAscbyte Char_Binary -> CBinbyte UChar_Binary -> Binbyte SChar_Binary -> SBinbyte (2) Introduce Rawbyte, CRawbyte, Boolbyte, Chbyte, UChbyte, and Bitbyte and use them. (3) New types Itext, Wexttext and Textcount for separating out the concepts of bytes and textual units (different under UTF-16 and UTF-32, which are potential internal encodings). (4) qxestr*_c -> qxestr*_ascii. lisp.h: New; goes with other qxe() functions. #### Maybe goes in a different section. lisp.h: Group generic int-type defs together with EMACS_INT defs. lisp.h: * lisp.h (WEXTTEXT_IS_WIDE) New defns. lisp.h: New type to replace places where int occurs as a boolean. It's signed because occasionally people may want to use -1 as an error value, and because unsigned ints are viral -- see comments in the internals manual against using them. dynarr.c: int -> Bytecount. 
--------------------------- Mule-izing --------------------------- device-x.c: Partially Mule-ize. dumper.c, dumper.h: Mule-ize. Use Rawbyte. Use stderr_out not printf. Use wext_*(). sysdep.c, syswindows.h, text.c: New Wexttext API for manipulation of external text that may be Unicode (e.g. startup code under Windows). emacs.c: Mule-ize. Properly deal with argv in external encoding. Use wext_*() and Wexttext. Use Rawbyte. #if 0 some old junk on SCO that is unlikely to be correct. Rewrite allocation code in run-temacs. emacs.c, symsinit.h, win32.c: Rename win32 init function and call it even earlier, to initialize mswindows_9x_p even earlier, for use in startup code (XEUNICODE_P). process.c: Use _wenviron not environ under Windows, to get Unicode environment variables. event-Xt.c: Mule-ize drag-n-drop related stuff. dragdrop.c, dragdrop.h, frame-x.c: Mule-ize. text.h: Add some more stand-in defines for particular kinds of conversion; use in Mule-ization work in frame-x.c etc. --------------------------- Freshening --------------------------- intl-auto-encap-win32.c, intl-auto-encap-win32.h: Regenerate. --------------------------- Unicode-work --------------------------- intl-win32.c, syswindows.h: Factor out common options to MultiByteToWideChar and WideCharToMultiByte. Add convert_unicode_to_multibyte_malloc() and convert_unicode_to_multibyte_dynarr() and use. Add stuff for alloca() conversion of multibyte/unicode. alloc.c: Use dfc_external_data_len() in case of unicode coding system. alloc.c, mule-charset.c: Don't zero out and reinit charset Unicode tables. This fucks up dump-time loading. Anyway, either we load them at dump time or run time, never both. unicode.c: Dump the blank tables as well. 
--------------------------------------------------------------- DOCUMENTATION, MOSTLY MULE-RELATED: --------------------------------------------------------------- EmacsFrame.c, emodules.c, event-Xt.c, fileio.c, input-method-xlib.c, mule-wnnfns.c, redisplay-gtk.c, redisplay-tty.c, redisplay-x.c, regex.c, sysdep.c: Add comment about Mule work needed. text.h: Add more documentation describing why DFC routines were not written to return their value. Add some other DFC documentation. console-msw.c, console-msw.h: Add pointer to docs in win32.c. emacs.c: Add comments on sources of doc info. text.c, charset.h, unicode.c, intl-win32.c, intl-encap-win32.c, text.h, file-coding.c, mule-coding.c: Collect background comments and related to text matters and internationalization, and proposals for work to be done, in text.c or Internals manual, stuff related to specific textual API's in text.h, and stuff related to internal implementation of Unicode conversion in unicode.c. Put lots of pointers to the comments to make them easier to find. s/mingw32.h, s/win32-common.h, s/win32-native.h, s/windowsnt.h, win32.c: Add bunches of new documentation on the different kinds of builds and environments under Windows and how they work. Collect this info in win32.c. Add pointers to these docs in the relevant s/* files. emacs.c: Document places with long comments. Remove comment about exiting, move to internals manual, put in pointer. event-stream.c: Move docs about event queues and focus to internals manual, put in pointer. events.h: Move docs about event stream callbacks to internals manual, put in pointer. profile.c, redisplay.c, signal.c: Move documentation to the Internals manual. process-nt.c: Add pointer to comment in win32-native.el. lisp.h: Add comments about some comment conventions. lisp.h: Add comment about the second argument. device-msw.c, redisplay-msw.c: @@#### comments are out-of-date. 
--------------------------------------------------------------- PDUMP WORK (MOTIVATED BY UNICODE CHANGES) --------------------------------------------------------------- alloc.c, buffer.c, bytecode.c, console-impl.h, console.c, device.c, dumper.c, lrecord.h, elhash.c, emodules.h, events.c, extents.c, frame.c, glyphs.c, glyphs.h, mule-charset.c, mule-coding.c, objects.c, profile.c, rangetab.c, redisplay.c, specifier.c, specifier.h, window.c, lstream.c, file-coding.h, file-coding.c: PDUMP: Properly implement dump_add_root_block(), which never worked before, and is necessary for dumping Unicode tables. Pdump name changes for accuracy: XD_STRUCT_PTR -> XD_BLOCK_PTR. XD_STRUCT_ARRAY -> XD_BLOCK_ARRAY. XD_C_STRING -> XD_ASCII_STRING. *_structure_* -> *_block_*. lrecord.h: some comments added about dump_add_root_block() vs dump_add_root_block_ptr(). extents.c: remove incorrect comment about pdump problems with gap array. --------------------------------------------------------------- ALLOCATION --------------------------------------------------------------- abbrev.c, alloc.c, bytecode.c, casefiddle.c, device-msw.c, device-x.c, dired-msw.c, doc.c, doprnt.c, dragdrop.c, editfns.c, emodules.c, file-coding.c, fileio.c, filelock.c, fns.c, glyphs-eimage.c, glyphs-gtk.c, glyphs-msw.c, glyphs-x.c, gui-msw.c, gui-x.c, imgproc.c, intl-win32.c, lread.c, menubar-gtk.c, menubar.c, nt.c, objects-msw.c, objects-x.c, print.c, process-nt.c, process-unix.c, process.c, realpath.c, redisplay.c, search.c, select-common.c, symbols.c, sysdep.c, syswindows.h, text.c, text.h, ui-byhand.c: New macros {alloca,xnew}_{itext,{i,ext,raw,bin,asc}bytes} for more convenient allocation of these commonly requested items. Modify functions to use alloca_ibytes, alloca_array, alloca_extbytes, xnew_ibytes, etc. also XREALLOC_ARRAY, xnew. alloc.c: Rewrite the allocation functions to factor out repeated code. Add assertions for freeing dumped data. lisp.h: Moved down and consolidated with other allocation stuff. 
lisp.h, dynarr.c: New functions for allocation that's very efficient when mostly in LIFO order. lisp.h, text.c, text.h: Factor out some stuff for general use by alloca()-conversion funs. text.h, lisp.h: Fill out convenience routines for allocating various kinds of bytes and put them in lisp.h. Use them in place of xmalloc(), ALLOCA(). text.h: Fill out the convenience functions so the _MALLOC() kinds match the alloca() kinds. --------------------------------------------------------------- ERROR-CHECKING --------------------------------------------------------------- text.h: Create ASSERT_ASCTEXT_ASCII() and ASSERT_ASCTEXT_ASCII_LEN() from similar Eistring checkers and change the Eistring checkers to use them instead. --------------------------------------------------------------- MACROS IN LISP.H --------------------------------------------------------------- lisp.h: Redo GCPRO declarations. Create a "base" set of functions that can be used to generate any kind of gcpro sets -- regular, ngcpro, nngcpro, private ones used in GC_EXTERNAL_LIST_LOOP_2. buffer.c, callint.c, chartab.c, console-msw.c, device-x.c, dialog-msw.c, dired.c, extents.c, ui-gtk.c, rangetab.c, nt.c, mule-coding.c, minibuf.c, menubar-msw.c, menubar.c, menubar-gtk.c, lread.c, lisp.h, gutter.c, glyphs.c, glyphs-widget.c, fns.c, fileio.c, file-coding.c, specifier.c: Eliminate EXTERNAL_LIST_LOOP, which does not check for circularities. Use EXTERNAL_LIST_LOOP_2 instead or EXTERNAL_LIST_LOOP_3 or EXTERNAL_PROPERTY_LIST_LOOP_3 or GC_EXTERNAL_LIST_LOOP_2 (new macro). Removed/redid comments on EXTERNAL_LIST_LOOP. --------------------------------------------------------------- SPACING FIXES --------------------------------------------------------------- callint.c, hftctl.c, number-gmp.c, process-unix.c: Spacing fixes. 
--------------------------------------------------------------- FIX FOR GEOMETRY PROBLEM IN FIRST FRAME --------------------------------------------------------------- unicode.c: Add workaround for newlib bug in sscanf() [should be fixed by release 1.5.12 of Cygwin]. toolbar.c: bug fix for problem of initial frame being 77 chars wide on Windows. will be overridden by my other ws. --------------------------------------------------------------- FIX FOR LEAKING PROCESS HANDLES: --------------------------------------------------------------- process-nt.c: Fixes for leaking handles. Inspired by work done by Adrian Aichner <adrian@xemacs.org>. --------------------------------------------------------------- FIX FOR CYGWIN BUG (Unicode-related): --------------------------------------------------------------- unicode.c: Add workaround for newlib bug in sscanf() [should be fixed by release 1.5.12 of Cygwin]. --------------------------------------------------------------- WARNING FIXES: --------------------------------------------------------------- console-stream.c: `reinit' is unused. compiler.h, event-msw.c, frame-msw.c, intl-encap-win32.c, text.h: Add stuff to deal with ANSI-aliasing warnings I got. regex.c: Gather includes together to avoid warning. --------------------------------------------------------------- CHANGES TO INITIALIZATION ROUTINES: --------------------------------------------------------------- buffer.c, emacs.c, console.c, debug.c, device-x.c, device.c, dragdrop.c, emodules.c, eval.c, event-Xt.c, event-gtk.c, event-msw.c, event-stream.c, event-tty.c, events.c, extents.c, faces.c, file-coding.c, fileio.c, font-lock.c, frame-msw.c, glyphs-widget.c, glyphs.c, gui-x.c, insdel.c, lread.c, lstream.c, menubar-gtk.c, menubar-x.c, minibuf.c, mule-wnnfns.c, objects-msw.c, objects.c, print.c, scrollbar-x.c, search.c, select-x.c, text.c, undo.c, unicode.c, window.c, symsinit.h: Call reinit_*() functions directly from emacs.c, for clarity. 
Factor out some redundant init code. Move disallowed stuff that had crept into vars_of_glyphs() into complex_vars_of_glyphs(). Call init_eval_semi_early() from eval.c not in the middle of vars_of_() in emacs.c since there should be no order dependency in the latter calls. --------------------------------------------------------------- ARMAGEDDON: --------------------------------------------------------------- alloc.c, emacs.c, lisp.h, print.c: Rename inhibit_non_essential_printing_operations to inhibit_non_essential_conversion_operations. text.c: Assert on !inhibit_non_essential_conversion_operations. console-msw.c, print.c: Don't do conversion in SetConsoleTitle or FindWindow to avoid problems during armageddon. Put #errors for NON_ASCII_INTERNAL_FORMAT in places where problems would arise. --------------------------------------------------------------- CHANGES TO THE BUILD PROCEDURE: --------------------------------------------------------------- config.h.in, s/cxux.h, s/usg5-4-2.h, m/powerpc.h: Add comment about correct ordering of this file. Rearrange everything to follow this -- put all #undefs together and before the s&m files. Add undefs for HAVE_ALLOCA, C_ALLOCA, BROKEN_ALLOCA_IN_FUNCTION_CALLS, STACK_DIRECTION. Remove unused HAVE_STPCPY, HAVE_GETWD, HAVE_SETLOCALE. m/gec63.h: Deleted; totally broken, not used at all, not in FSF. 
m/7300.h, m/acorn.h, m/alliant-2800.h, m/alliant.h, m/altos.h, m/amdahl.h, m/apollo.h, m/att3b.h, m/aviion.h, m/celerity.h, m/clipper.h, m/cnvrgnt.h, m/convex.h, m/cydra5.h, m/delta.h, m/delta88k.h, m/dpx2.h, m/elxsi.h, m/ews4800r.h, m/gould.h, m/hp300bsd.h, m/hp800.h, m/hp9000s300.h, m/i860.h, m/ibmps2-aix.h, m/ibmrs6000.h, m/ibmrt-aix.h, m/ibmrt.h, m/intel386.h, m/iris4d.h, m/iris5d.h, m/iris6d.h, m/irist.h, m/isi-ov.h, m/luna88k.h, m/m68k.h, m/masscomp.h, m/mg1.h, m/mips-nec.h, m/mips-siemens.h, m/mips.h, m/news.h, m/nh3000.h, m/nh4000.h, m/ns32000.h, m/orion105.h, m/pfa50.h, m/plexus.h, m/pmax.h, m/powerpc.h, m/pyrmips.h, m/sequent-ptx.h, m/sequent.h, m/sgi-challenge.h, m/symmetry.h, m/tad68k.h, m/tahoe.h, m/targon31.h, m/tekxd88.h, m/template.h, m/tower32.h, m/tower32v3.h, m/ustation.h, m/vax.h, m/wicat.h, m/xps100.h: Delete C_ALLOCA, HAVE_ALLOCA, STACK_DIRECTION, BROKEN_ALLOCA_IN_FUNCTION_CALLS. All of this is auto-detected. When in doubt, I followed recent FSF sources, which also have these things deleted.
author ben
date Thu, 04 Nov 2004 23:08:28 +0000
parents ba4677f54a05
children 6b957313bd8e
comparison
equal deleted inserted replaced
2366:2a392e0c390a 2367:ecf1ebac70d8
1 /* Buffer manipulation primitives for XEmacs. 1 /* Text manipulation primitives for XEmacs.
2 Copyright (C) 1995 Sun Microsystems, Inc. 2 Copyright (C) 1995 Sun Microsystems, Inc.
3 Copyright (C) 1995, 1996, 2000, 2001, 2002, 2003 Ben Wing. 3 Copyright (C) 1995, 1996, 2000, 2001, 2002, 2003, 2004 Ben Wing.
4 Copyright (C) 1999 Martin Buchholz. 4 Copyright (C) 1999 Martin Buchholz.
5 5
6 This file is part of XEmacs. 6 This file is part of XEmacs.
7 7
8 XEmacs is free software; you can redistribute it and/or modify it 8 XEmacs is free software; you can redistribute it and/or modify it
37 37
38 /************************************************************************/ 38 /************************************************************************/
39 /* long comments */ 39 /* long comments */
40 /************************************************************************/ 40 /************************************************************************/
41 41
42 /* 42 /* NB: Everything below was written by Ben Wing except as otherwise noted. */
43 ========================================================================== 43
44 1. Intro to Characters, Character Sets, and Encodings 44 /************************************************************************/
45 ========================================================================== 45 /* */
46 46 /* */
47 A character (which is, BTW, a surprisingly complex concept) is, in a 47 /* Part A: More carefully-written documentation */
48 written representation of text, the most basic written unit that has a 48 /* */
49 meaning of its own. It's comparable to a phoneme when analyzing words 49 /* */
50 in spoken speech (for example, the sound of `t' in English, which in 50 /************************************************************************/
51 fact has different pronunciations in different words -- aspirated in 51
52 `time', unaspirated in `stop', unreleased or even pronounced as a 52 /* Authorship: Ben Wing
53 glottal stop in `button', etc. -- but logically is a single concept). 53
54 Like a phoneme, a character is an abstract concept defined by its 54
55 *meaning*. The character `lowercase f', for example, can always be used 55 ==========================================================================
56 to represent the first letter in the word `fill', regardless of whether 56 7. Handling non-default formats
57 it's drawn upright or italic, whether the `fi' combination is drawn as a 57 ==========================================================================
58 single ligature, whether there are serifs on the bottom of the vertical 58
59 stroke, etc. (These different appearances of a single character are 59 We support, at least to some extent, formats other than the default
60 often called "graphs" or "glyphs".) Our concern when representing text 60 variable-width format, for speed; all of these alternative formats are
61 is on representing the abstract characters, and not on their exact 61 fixed-width. Currently we only handle these non-default formats in
62 appearance. 62 buffers, because access to their text is strictly controlled and thus
63 63 the details of the format mostly compartmentalized. The only really
64 A character set (or "charset"), as we define it, is a set of characters, 64 tricky part is the search code -- the regex, Boyer-Moore, and
65 each with an associated number (or set of numbers -- see below), called 65 simple-search algorithms in search.c and regex.c. All other code that
66 a "code point". It's important to understand that a character is not 66 knows directly about the buffer representation is the basic code to
67 defined by any number attached to it, but by its meaning. For example, 67 modify or retrieve the buffer text.
68 ASCII and EBCDIC are two charsets containing exactly the same characters 68
69 (lowercase and uppercase letters, numbers 0 through 9, particular 69 Supporting fixed-width formats in Lisp strings is harder, but possible
70 punctuation marks) but with different numberings. The `comma' character 70 -- FSF currently does this, for example. In this case, however,
71 in ASCII and EBCDIC, for instance, is the same character despite having 71 probably only 8-bit-fixed is reasonable for Lisp strings -- getting
72 a different numbering. Conversely, when comparing ASCII and JIS-Roman, 72 non-ASCII-compatible fixed-width formats to work is much, much harder
73 which look the same except that the latter has a yen sign substituted 73 because a lot of code assumes that strings are ASCII-compatible
74 for the backslash, we would say that the backslash and yen sign are 74 (i.e. ASCII + other characters represented exclusively using high-bit
75 *not* the same characters, despite having the same number (95) and 75 bytes) and a lot of code mixes Lisp strings and non-Lisp strings freely.
76 despite the fact that all other characters are present in both charsets, 76
77 with the same numbering. ASCII and JIS-Roman, then, do *not* have 77 The different possible fixed-width formats are 8-bit fixed, 16-bit
78 exactly the same characters in them (ASCII has a backslash character but 78 fixed, and 32-bit fixed. The latter can represent all possible
79 no yen-sign character, and vice-versa for JIS-Roman), unlike ASCII and 79 characters, but at a substantial memory penalty. The other two can
80 EBCDIC, even though the numberings in ASCII and JIS-Roman are closer. 80 represent only a subset of the possible characters. How these subsets
81 81 are defined can be simple or very tricky.
82 It's also important to distinguish between charsets and encodings. For 82
83 a simple charset like ASCII, there is only one encoding normally used -- 83 Currently we support only the default format and the 8-bit fixed format,
84 each character is represented by a single byte, with the same value as 84 and in the latter, we only allow these to be the first 256 characters in
85 its code point. For more complicated charsets, however, things are not 85 an Ichar (ASCII and Latin 1).
86 so obvious. Unicode version 2, for example, is a large charset with
87 thousands of characters, each indexed by a 16-bit number, often
88 represented in hex, e.g. 0x05D0 for the Hebrew letter "aleph". One
89 obvious encoding uses two bytes per character (actually two encodings,
90 depending on which of the two possible byte orderings is chosen). This
91 encoding is convenient for internal processing of Unicode text; however,
92 it's incompatible with ASCII, so a different encoding, e.g. UTF-8, is
93 usually used for external text, for example files or e-mail. UTF-8
94 represents Unicode characters with one to three bytes (often extended to
95 six bytes to handle characters with up to 31-bit indices). Unicode
96 characters 00 to 7F (identical with ASCII) are directly represented with
97 one byte, and other characters with two or more bytes, each in the range
98 80 to FF.
99
100 In general, a single encoding may be able to represent more than one
101 charset.
102
103 See also man/lispref/mule.texi.
104 86
105 ========================================================================== 87 One reasonable approach for 8-bit fixed is to allow the upper half to
106 2. Character Sets 88 represent any 1-byte charset, which is specified on a per-buffer basis.
107 ========================================================================== 89 This should work fairly well in practice since most documents are in
108 90 only one foreign language (possibly with some English mixed in). I
109 A particular character in a charset is indexed using one or 91 think FSF does something like this; or at least, they have something
110 more "position codes", which are non-negative integers. 92 called nonascii-translation-table and use it when converting from
111 The number of position codes needed to identify a particular 93 8-bit-fixed text ("unibyte text") to default text ("multibyte text").
112 character in a charset is called the "dimension" of the 94 With 16-bit fixed, you could do something like assign chunks of the 64K
113 charset. In XEmacs/Mule, all charsets have 1 or 2 dimensions, 95 worth of characters to charsets as they're encountered in documents.
114 and the size of all charsets (except for a few special cases) 96 This should work well with most Asian documents.
115 is either 94, 96, 94 by 94, or 96 by 96. The range of 97
116 position codes used to index characters from any of these 98 If/when we switch to using Unicode internally, we might have formats more
117 types of character sets is as follows: 99 like this:
118 100
119 Charset type Position code 1 Position code 2 101 -- UTF-8 or some extension as the default format. Perl uses an
120 ------------------------------------------------------------ 102 extension that handles 64-bit chars and requires as many as 13 bytes per
121 94 33 - 126 N/A 103 char, vs. the standard of 31-bit chars and 6 bytes max. UTF-8 has the
122 96 32 - 127 N/A 104 same basic properties as our own variable-width format (see text.c,
123 94x94 33 - 126 33 - 126 105 Internal String Encoding) and so most code would not need to be changed.
124 96x96 32 - 127 32 - 127 106
125 107 -- UTF-16 as a "pseudo-fixed" format (i.e. 16-bit fixed plus surrogates
126 Note that in the above cases position codes do not start at 108 for representing characters not in the BMP, aka >= 65536). The vast
127 an expected value such as 0 or 1. The reason for this will 109 majority of documents will have no surrogates in them so byte/char
128 become clear later. 110 conversion will be very fast.
129 111
130 For example, Latin-1 is a 96-character charset, and JISX0208 112 -- an 8-bit fixed format, like currently.
131 (the Japanese national character set) is a 94x94-character
132 charset.
133
134 [Note that, although the ranges above define the *valid*
135 position codes for a charset, some of the slots in a particular
136 charset may in fact be empty. This is the case for JISX0208,
137 for example, where (e.g.) all the slots whose first
138 position code is in the range 118 - 127 are empty.]
139
140 There are three charsets that do not follow the above rules.
141 All of them have one dimension, and have ranges of position
142 codes as follows:
143
144 Charset name Position code 1
145 ------------------------------------
146 ASCII 0 - 127
147 Control-1 0 - 31
148 Composite 0 - some large number
149
150 (The upper bound of the position code for composite characters
151 has not yet been determined, but it will probably be at
152 least 16,383).
153
154 ASCII is the union of two subsidiary character sets:
155 Printing-ASCII (the printing ASCII character set,
156 consisting of position codes 33 - 126, like for a standard
157 94-character charset) and Control-ASCII (the non-printing
158 characters that would appear in a binary file with codes 0
159 - 32 and 127).
160
161 Control-1 contains the non-printing characters that would
162 appear in a binary file with codes 128 - 159.
163
164 Composite contains characters that are generated by
165 overstriking one or more characters from other charsets.
166
167 Note that some characters in ASCII, and all characters
168 in Control-1, are "control" (non-printing) characters.
169 These have no printed representation but instead control
170 some other function of the printing (e.g. TAB or 9 moves
171 the current character position to the next tab stop).
172 All other characters in all charsets are "graphic"
173 (printing) characters.
174
175 When a binary file is read in, the bytes in the file are
176 assigned to character sets as follows:
177
178 Bytes Character set Range
179 --------------------------------------------------
180 0 - 127 ASCII 0 - 127
181 128 - 159 Control-1 0 - 31
182 160 - 255 Latin-1 32 - 127
183
184 This is a bit ad-hoc but gets the job done.
185
186 ==========================================================================
187 3. Encodings
188 ==========================================================================
189
190 An "encoding" is a way of numerically representing
191 characters from one or more character sets. If an encoding
192 only encompasses one character set, then the position codes
193 for the characters in that character set could be used
194 directly. This is not possible, however, if more than one
195 character set is to be used in the encoding.
196
197 For example, the conversion detailed above between bytes in
198 a binary file and characters is effectively an encoding
199 that encompasses the three character sets ASCII, Control-1,
200 and Latin-1 in a stream of 8-bit bytes.
201
202 Thus, an encoding can be viewed as a way of encoding
203 characters from a specified group of character sets using a
204 stream of bytes, each of which contains a fixed number of
205 bits (but not necessarily 8, as in the common usage of
206 "byte").
207
208 Here are descriptions of a couple of common
209 encodings:
210
211
212 A. Japanese EUC (Extended Unix Code)
213
214 This encompasses the character sets:
215 - Printing-ASCII,
216 - Katakana-JISX0201 (half-width katakana, the right half of JISX0201).
217 - Japanese-JISX0208
218 - Japanese-JISX0212
219 It uses 8-bit bytes.
220
221 Note that Printing-ASCII and Katakana-JISX0201 are 94-character
222 charsets, while Japanese-JISX0208 is a 94x94-character charset.
223
224 The encoding is as follows:
225
226 Character set Representation (PC == position-code)
227 ------------- --------------
228 Printing-ASCII PC1
229 Japanese-JISX0208 PC1 + 0x80 | PC2 + 0x80
230 Katakana-JISX0201 0x8E | PC1 + 0x80
231
232
233 B. JIS7
234
235 This encompasses the character sets:
236 - Printing-ASCII
237 - Latin-JISX0201 (the left half of JISX0201; this character set is
238 very similar to Printing-ASCII and is a 94-character charset)
239 - Japanese-JISX0208
240 - Katakana-JISX0201
241 It uses 7-bit bytes.
242
243 Unlike Japanese EUC, this is a "modal" encoding, which
244 means that there are multiple states that the encoding can
245 be in, which affect how the bytes are to be interpreted.
246 Special sequences of bytes (called "escape sequences")
247 are used to change states.
248
249 The encoding is as follows:
250
251 Character set Representation
252 ------------- --------------
253 Printing-ASCII PC1
254 Latin-JISX0201 PC1
255 Katakana-JISX0201 PC1
256 Japanese-JISX0208 PC1 | PC2
257
258 Escape sequence ASCII equivalent Meaning
259 --------------- ---------------- -------
260 0x1B 0x28 0x42 ESC ( B invoke Printing-ASCII
261 0x1B 0x28 0x4A ESC ( J invoke Latin-JISX0201
262 0x1B 0x28 0x49 ESC ( I invoke Katakana-JISX0201
263 0x1B 0x24 0x42 ESC $ B invoke Japanese-JISX0208
264
265 Initially, Printing-ASCII is invoked.
266
267 ==========================================================================
268 4. Internal Mule Encodings
269 ==========================================================================
270
271 In XEmacs/Mule, each character set is assigned a unique number,
272 called a "leading byte". This is used in the encodings of a
273 character. Leading bytes are in the range 0x80 - 0xFF
274 (except for ASCII, which has a leading byte of 0), although
275 some leading bytes are reserved.
276
277 Charsets whose leading byte is in the range 0x80 - 0x9F are
278 called "official" and are used for built-in charsets.
279 Other charsets are called "private" and have leading bytes
280 in the range 0xA0 - 0xFF; these are user-defined charsets.
281
282 More specifically:
283
284 Character set Leading byte
285 ------------- ------------
286 ASCII 0 (0x7F in arrays indexed by leading byte)
287 Composite 0x8D
288 Dimension-1 Official 0x80 - 0x8C/0x8D
289 (0x8E is free)
290 Control 0x8F
291 Dimension-2 Official 0x90 - 0x99
292 (0x9A - 0x9D are free)
293 Dimension-1 Private Marker 0x9E
294 Dimension-2 Private Marker 0x9F
295 Dimension-1 Private 0xA0 - 0xEF
296 Dimension-2 Private 0xF0 - 0xFF
297
298 There are two internal encodings for characters in XEmacs/Mule.
299 One is called "string encoding" and is an 8-bit encoding that
300 is used for representing characters in a buffer or string.
301 It uses 1 to 4 bytes per character. The other is called
302 "character encoding" and is a 19-bit encoding that is used
303 for representing characters individually in a variable.
304
305 (In the following descriptions, we'll ignore composite
306 characters for the moment. We also give a general (structural)
307 overview first, followed later by the exact details.)
308
309 A. Internal String Encoding
310
311 ASCII characters are encoded using their position code directly.
312 Other characters are encoded using their leading byte followed
313 by their position code(s) with the high bit set. Characters
314 in private character sets have their leading byte prefixed with
315 a "leading byte prefix", which is either 0x9E or 0x9F. (No
316 character sets are ever assigned these leading bytes.) Specifically:
317
318 Character set Encoding (PC == position-code)
319 ------------- -------- (LB == leading-byte)
320 ASCII PC1 |
321 Control-1 LB | PC1 + 0xA0
322 Dimension-1 official LB | PC1 + 0x80
323 Dimension-1 private 0x9E | LB | PC1 + 0x80
324 Dimension-2 official LB | PC1 + 0x80 | PC2 + 0x80
325 Dimension-2 private 0x9F | LB | PC1 + 0x80 | PC2 + 0x80
326
327 The basic characteristic of this encoding is that the first byte
328 of all characters is in the range 0x00 - 0x9F, and the second and
329 following bytes of all characters are in the range 0xA0 - 0xFF.
330 This means that it is impossible to get out of sync, or more
331 specifically:
332
333 1. Given any byte position, the beginning of the character it is
334 within can be determined in constant time.
335 2. Given any byte position at the beginning of a character, the
336 beginning of the next character can be determined in constant
337 time.
338 3. Given any byte position at the beginning of a character, the
339 beginning of the previous character can be determined in constant
340 time.
341 4. Textual searches can simply treat encoded strings as if they
342 were encoded in a one-byte-per-character fashion rather than
343 the actual multi-byte encoding.
344
345 None of the standard non-modal encodings meet all of these
346 conditions. For example, EUC satisfies only (2) and (3), while
347 Shift-JIS and Big5 (not yet described) satisfy only (2). (All
348 non-modal encodings must satisfy (2), in order to be unambiguous.)
349
350 B. Internal Character Encoding
351
352 One 19-bit word represents a single character. The word is
353 separated into three fields:
354
355 Bit number: 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
356 <------------> <------------------> <------------------>
357 Field: 1 2 3
358
359 Note that fields 2 and 3 hold 7 bits each, while field 1 holds 5 bits.
360
361 Character set Field 1 Field 2 Field 3
362 ------------- ------- ------- -------
363 ASCII 0 0 PC1
364 range: (00 - 7F)
365 Control-1 0 1 PC1
366 range: (00 - 1F)
367 Dimension-1 official 0 LB - 0x7F PC1
368 range: (01 - 0D) (20 - 7F)
369 Dimension-1 private 0 LB - 0x80 PC1
370 range: (20 - 6F) (20 - 7F)
371 Dimension-2 official LB - 0x8F PC1 PC2
372 range: (01 - 0A) (20 - 7F) (20 - 7F)
373 Dimension-2 private LB - 0xE1 PC1 PC2
374 range: (0F - 1E) (20 - 7F) (20 - 7F)
375 Composite 0x1F ? ?
376
377 Note that character codes 0 - 255 are the same as the "binary encoding"
378 described above.
379
380 Most of the code in XEmacs knows nothing of the representation of a
381 character other than that values 0 - 255 represent ASCII, Control 1,
382 and Latin 1.
383
384 WARNING WARNING WARNING: The Boyer-Moore code in search.c, and the
385 code in search_buffer() that determines whether that code can be used,
386 knows that "field 3" in a character always corresponds to the last
387 byte in the textual representation of the character. (This is important
388 because the Boyer-Moore algorithm works by looking at the last byte
389 of the search string and &&#### finish this.
390
391 ==========================================================================
392 5. Buffer Positions and Other Typedefs
393 ==========================================================================
394
395 A. Buffer Positions
396 113
397 There are three possible ways to specify positions in a buffer. All 114 -- possibly, UCS-4 as a 32-bit fixed format.
398 of these are one-based: the beginning of the buffer is position or 115
399 index 1, and 0 is not a valid position. 116 The fixed-width formats essentially treat the buffer as an array of
400 117 8-bit, 16-bit or 32-bit integers. This means that how they are stored
401 As a "buffer position" (typedef Charbpos): 118 in memory (in particular, big-endian or little-endian) depends on the
402 119 native format of the machine's processor. It also means we have to
403 This is an index specifying an offset in characters from the 120 worry a bit about alignment (basically, we just need to keep the gap an
404 beginning of the buffer. Note that buffer positions are 121 integral size of the character size, and get things aligned properly
405 logically *between* characters, not on a character. The 122 when converting the buffer between formats).
406 difference between two buffer positions specifies the number of 123
407 characters between those positions. Buffer positions are the 124 ==========================================================================
408 only kind of position externally visible to the user. 125 8. Using UTF-16 as the default text format
409 126 ==========================================================================
410 As a "byte index" (typedef Bytebpos): 127
411 128 NOTE: The Eistring API is (or should be) Mule-correct even without
412 This is an index over the bytes used to represent the characters 129 an ASCII-compatible internal representation.
413 in the buffer. If there is no Mule support, this is identical 130
414 to a buffer position, because each character is represented 131 #### Currently, the assumption that text units are one byte in size is
415 using one byte. However, with Mule support, many characters 132 embedded throughout XEmacs, and `Ibyte *' is used where `Itext *' should
416 require two or more bytes for their representation, and so a 133 be. The way to fix this is to (among other things)
417 byte index may be greater than the corresponding buffer 134
418 position. 135 (a) review all places referencing `Ibyte' and `Ibyte *', change them to
419 136 use Itext, and fix up the code.
420 As a "memory index" (typedef Membpos): 137 (b) change XSTRING_DATA to be of type Itext *
421 138 (c) review all uses of XSTRING_DATA
422 This is the byte index adjusted for the gap. For positions 139 (d) eliminate XSTRING_LENGTH, splitting it into XSTRING_BYTE_LENGTH and
423 before the gap, this is identical to the byte index. For 140 XSTRING_TEXT_LENGTH and reviewing all places referencing this
424 positions after the gap, this is the byte index plus the gap 141 (e) make similar changes to other API's that refer to the "length" of
425 size. There are two possible memory indices for the gap 142 something, such as qxestrlen() and eilen()
426 position; the memory index at the beginning of the gap should 143 (f) review all use of `CIbyte *'. Currently this is usually a way of
427 always be used, except in code that deals with manipulating the 144 passing literal ASCII text strings in places that want internal text.
428 gap, where both indices may be seen. The address of the 145 Either create separate _ascii() and _itext() versions of the
429 character "at" (i.e. following) a particular position can be 146 functions taking CIbyte *, or make use of something like the
430 obtained from the formula 147 WEXTTEXT() macro, which will generate wide strings as appropriate.
431 148 (g) review all uses of Bytecount and see which ones should be Textcount.
432 buffer_start_address + memory_index(position) - 1 149 (h) put in error-checking code that will be tripped as often as possible
433 150 when doing anything with internal text, and check to see that ASCII
434 except in the case of characters at the gap position. 151 text has not mistakenly filtered in. This should be fairly easy as
435 152 ASCII text will generally be entirely spaces and letters whereas every
436 B. Other Typedefs 153 second byte of Unicode text will generally be a null byte. Either we
437 154 abort if the second bytes are entirely letters and numbers, or,
438 Ichar: 155 perhaps better, do the equivalent of a non-MULE build, where we should
439 ------ 156 be dealing entirely with 8-bit characters, and assert that the high
440 This typedef represents a single Emacs character, which can be 157 bytes of each pair are null.
441 ASCII, ISO-8859, or some extended character, as would typically 158 (i) review places where xmalloc() is called. If we convert each use of
442 be used for Kanji. Note that the representation of a character 159 xmalloc() to instead be xnew_array() or some other typed routine,
443 as an Ichar is *not* the same as the representation of that 160 then we will find every place that allocates space for Itext and
444 same character in a string; thus, you cannot do the standard 161 assumes it is based on one-byte units.
445 C trick of passing a pointer to a character to a function that 162 (j) encourage the use of ITEXT_ZTERM_SIZE instead of '+ 1' whenever we
446 expects a string. 163 are adding space for a zero-terminator, to emphasize what we are
447 164 doing and make sure the calculations are correct. Similarly for
448 An Ichar takes up 19 bits of representation and (for code 165 EXTTEXT_ZTERM_SIZE.
449 compatibility and such) is compatible with an int. This 166 (k) Note that the qxestr*() functions, among other things, will need to
450 representation is visible on the Lisp level. The important 167 be rewritten.
451 characteristics of the Ichar representation are 168
452 169 Note that this is a lot of work, and is not high on the list of priorities
453 -- values 0x00 - 0x7f represent ASCII. 170 currently.
454 -- values 0x80 - 0xff represent the right half of ISO-8859-1. 171
455 -- values 0x100 and up represent all other characters. 172 ==========================================================================
456 173 9. Miscellaneous
457 This means that Ichar values are upwardly compatible with
458 the standard 8-bit representation of ASCII/ISO-8859-1.
459
460 Ibyte:
461 ------
462 The data in a buffer or string is logically made up of Ibyte
463 objects, where an Ibyte takes up the same amount of space as a
464 char. (It is declared differently, though, to catch invalid
465 usages.) Strings stored using Ibytes are said to be in
466 "internal format". The important characteristics of internal
467 format are
468
469 -- ASCII characters are represented as a single Ibyte,
470 in the range 0 - 0x7f.
471 -- All other characters are represented as an Ibyte in
472 the range 0x80 - 0x9f followed by one or more Ibytes
473 in the range 0xa0 to 0xff.
474
475 This leads to a number of desirable properties:
476
477 -- Given the position of the beginning of a character,
478 you can find the beginning of the next or previous
479 character in constant time.
480 -- When searching for a substring or an ASCII character
481 within the string, you need merely use standard
482 searching routines.
483
484 Extbyte:
485 --------
486 Strings that go in or out of Emacs are in "external format",
487 typedef'ed as an array of char or a char *. There is more
488 than one external format (JIS, EUC, etc.) but they all
489 have similar properties. They are modal encodings,
490 which is to say that the meaning of particular bytes is
491 not fixed but depends on what "mode" the string is currently
492 in (e.g. bytes in the range 0 - 0x7f might be
493 interpreted as ASCII, or as Hiragana, or as 2-byte Kanji,
494 depending on the current mode). The mode starts out in
495 ASCII/ISO-8859-1 and is switched using escape sequences --
496 for example, in the JIS encoding, 'ESC $ B' switches to a
497 mode where pairs of bytes in the range 0 - 0x7f
498 are interpreted as Kanji characters.
499
500 External-formatted data is generally desirable for passing
501 data between programs because it is upwardly compatible
502 with standard ASCII/ISO-8859-1 strings and may require
503 less space than internal encodings such as the one
504 described above. In addition, some encodings (e.g. JIS)
505 keep all characters (except the ESC used to switch modes)
506 in the printing ASCII range 0x20 - 0x7e, which results in
507 a much higher probability that the data will avoid being
508 garbled in transmission. Externally-formatted data is
509 generally not very convenient to work with, however, and
510 for this reason is usually converted to internal format
511 before any work is done on the string.
512
513 NOTE: filenames need to be in external format so that
514 ISO-8859-1 characters come out correctly.
515
516 Charcount:
517 ----------
518 This typedef represents a count of characters, such as
519 a character offset into a string or the number of
520 characters between two positions in a buffer. The
521 difference between two Charbpos's is a Charcount, and
522 character positions in a string are represented using
523 a Charcount.
524
525 Bytecount:
526 ----------
527 Similar to a Charcount but represents a count of bytes.
528 The difference between two Bytebpos's is a Bytecount.
529
530
531 C. Usage of the Various Representations
532
533 Memory indices are used in low-level functions in insdel.c and for
534 extent endpoints and marker positions. The reason for this is that
535 this way, the extents and markers don't need to be updated for most
536 insertions, which merely shrink the gap and don't move any
537 characters around in memory.
538
539 (The beginning-of-gap memory index simplifies insertions w.r.t.
540 markers, because text usually gets inserted after markers. For
541 extents, it is merely for consistency, because text can get
542 inserted either before or after an extent's endpoint depending on
543 the open/closedness of the endpoint.)
544
545 Byte indices are used in other code that needs to be fast,
546 such as the searching, redisplay, and extent-manipulation code.
547
548 Buffer positions are used in all other code. This is because this
549 representation is easiest to work with (especially since Lisp
550 code always uses buffer positions), necessitates the fewest
551 changes to existing code, and is the safest (e.g. if the text gets
552 shifted underneath a buffer position, it will still point to a
553 character; if text is shifted under a byte index, it might point
554 to the middle of a character, which would be bad).
555
556 Similarly, Charcounts are used in all code that deals with strings
557 except for code that needs to be fast, which uses Bytecounts.
558
559 Strings are always passed around internally using internal format.
560 Conversions to and from external format are performed at the time
561 that the data goes in or out of Emacs.
562
563 D. Working With the Various Representations
564
565 We write things this way because it's very important that
566 MAX_BYTEBPOS_GAP_SIZE_3 is a multiple of 3. (As it happens,
567 65535 is a multiple of 3, but this may not always be the
568 case. #### unfinished
569
570 ==========================================================================
571 6. Miscellaneous
572 ========================================================================== 174 ==========================================================================
573 175
574 A. Unicode Support 176 A. Unicode Support
575 177
576 Unicode support is very desirable. Currently we know how to handle 178 Unicode support is very desirable. Currently we know how to handle
619 over the XEmacs process lifetime, and you only need to 221 over the XEmacs process lifetime, and you only need to
620 increase the size of a Mule character from 19 to 21 bits. 222 increase the size of a Mule character from 19 to 21 bits.
621 Or you could use 0x8D C1 C2 C3 C4, allowing for about 223 Or you could use 0x8D C1 C2 C3 C4, allowing for about
622 85 million (slightly over 2^26) composite characters. 224 85 million (slightly over 2^26) composite characters.
623 225
226 ==========================================================================
227 10. Internal API's
228 ==========================================================================
229
230 All of these are documented in more detail in text.h.
231
232 @enumerate
233 @item
234 Basic internal-format API's
235
236 These are simple functions and macros to convert between text
237 representation and characters, move forward and back in text, etc.
238
239 @item
240 The DFC API
241
242 This is for conversion between internal and external text. Note that
243 there is also the "new DFC" API, which *returns* a pointer to the
244 converted text (in alloca space), rather than storing it into a
245 variable.
246
247 @item
248 The Eistring API
249
250 (This API is currently under-used) When doing simple things with
251 internal text, the basic internal-format API's are enough. But to do
252 things like delete or replace a substring, concatenate various strings,
253 etc. is difficult to do cleanly because of the allocation issues.
254 The Eistring API is designed to deal with this, and provides a clean
255 way of modifying and building up internal text. (Note that the former
256 lack of this API has meant that some code uses Lisp strings to do
257 similar manipulations, resulting in excess garbage and increased
258 garbage collection.)
259
260 NOTE: The Eistring API is (or should be) Mule-correct even without
261 an ASCII-compatible internal representation.
262 @end enumerate
263
264 ==========================================================================
265 11. Other Sources of Documentation
266 ==========================================================================
267
268 man/lispref/mule.texi
269 @enumerate
270 @item
271 another intro to characters, encodings, etc; #### Merge with the
272 above info
273 @item
274 documentation of ISO-2022
275 @item
276 The charset and coding-system Lisp API's
277 @item
278 The CCL conversion language for writing encoding conversions
279 @item
280 The Latin-Unity package for unifying Latin charsets
281 @end enumerate
282
283 man/internals/internals.texi (the Internals manual)
284 @enumerate
285 @item
286 "Coding for Mule" -- how to write Mule-aware code
287 @item
288 "Modules for Internationalization"
289 @item
290 "The Text in a Buffer" -- more about the different ways of
291 viewing buffer positions; #### Merge with the above info
292 @item
293 "MULE Character Sets and Encodings" -- yet another intro
294 to characters, encodings, etc; #### Merge with the
295 above info; also some documentation of Japanese EUC and JIS7,
296 and CCL internals
297 @end enumerate
298
299 text.h -- info about specific XEmacs-C API's for handling internal and
300 external text
301
302 intl-win32.c -- Windows-specific I18N information
303
304 lisp.h -- some info appears alongside the definitions of the basic
305 character-related types
306
307 unicode.c -- documentation about Unicode translation tables
624 */ 308 */
309
310
311 /************************************************************************/
312 /* */
313 /* */
314 /* Part B: Random proposals for work to be done */
315 /* */
316 /* */
317 /************************************************************************/
318
319
320 /*
321
322
323 ==========================================================================
324 - Mule design issues (ben)
325 ==========================================================================
326
327 circa 1999
328
329 Here is a more detailed list of Mule-related projects that we will be
330 working on. They are more or less ordered according to how we will
331 proceed, but it's not exact. In particular, there will probably be
332 time overlap among adjacent projects.
333
334 @enumerate
335 @item
336 Modify the internal/external conversion macros to allow for
337 MS Windows support.
338
339 @item
340 Modify the buffer macros to allow for more than one internal
341 representation, e.g. fixed width and variable width.
342
343 @item
344 Review the existing Mule code, especially the lisp code, for code
345 quality issues and improve the cleanliness of it. Also work on
346 creating a specification for the Mule API.
347
348 @item
349 Write some more automated mule tests.
350
351 @item
352 Integrate Tomohiko's UTF-2000 code, fixing it up so that nothing is
353 broken when the UTF-2000 configure option is not enabled.
354
355 @item
356 Fix up the MS Windows code to be Mule-correct, so that you can
357 compile with Mule support under MS windows and have a working
358 XEmacs, at least just with Latin-1.
359
360 @item
361 Implement a scheme to guarantee no corruption of files, even with
362 an incorrect coding system - in particular, guarantee no corruption
363 of binary files.
364
365 @item
366 Make the text property support in XEmacs robust with respect to
367 string and text operations, so that the `no corruption' support in
368 the previous entry works properly, even if a lot of cutting and
369 pasting is done.
370
371 @item
372 Improve the handling of auto-detection so that, when there is any
373 possibility at all of mistake, the user is informed of the detected
374 encoding and given the choice of choosing other possibilities.
375
376 @item
377 Improve the support for different language environments in XEmacs,
378 for example, the priority of coding systems used in auto-detection
379 should properly reflect the language environment. This probably
380 necessitates rethinking the current `coding system priority'
381 scheme.
382
383 @item
384 Do quality work to improve the existing UTF-2000 implementation.
385
386 @item
387 Implement preliminary support for 8-bit fixed width
388 representation. First, we will only implement 7-bit support, and
389 will fall back to variable width as soon as any non-ASCII
390 character is encountered. Then we will improve the support to
391 handle an arbitrary character set in the upper half of the 8-bit space.
392
393 @item
394 Investigate any remaining hurdles to making --with-mule be the
395 default configure option.
396 @end enumerate
397
398 ==========================================================================
399 - Mule design issues (stephen)
400 ==========================================================================
401
402 What I see as Mule priorities (in rough benefit order, I am not taking
403 account of difficulty, nor the fact that some - eg 8 & 10 - will
404 probably come as packages):
405
406 @enumerate
407 @item
408 Fix the autodetect problem (by making the coding priority list
409 user-configurable, as short as he likes, even null, with "binary"
410 as the default).
411 @item
412 Document the language environments and other Mule "APIs" as
413 implemented (since there is no real design spec). Check to see
414 how and where they are broken.
415 @item
416 Make the Mule menu useful to non-ISO-2022-literate folks.
417 @item
418 Redo the lstreams stuff to make it easy and robust to "pipeline",
419 eg, libz | gnupg | jis2mule.
420 @item
421 Make Custom Mule-aware. (This probably depends on a sensible
422 fonts model.)
423 @item
424 Implement the "literal byte stream" memory feature.
425 @item
426 Study the FSF implementation of Mule for background for 7 & 8.
427 @item
428 Identify desirable Mule features (eg, i18n-ized messages as above,
429 collating tables by language environment, etc). (New features
430 might have priority as high as 9.)
431 @item
432 Specify Mule UIs, APIs, etc, and design and (re)implement them.
433 @item
434 Implement the 8-bit-wide buffer optimization.
435 @item
436 Move the internal encoding to UTF-32 (subject to Olivier's caveats
437 regarding compose characters), with the variable-width char
438 buffers using UTF-8.
439 @item
440 Implement the 16- and 32-bit-wide buffer optimizations.
441 @end enumerate
442
443 ==========================================================================
444 - Mule design issues "short term" (ben)
445 ==========================================================================
446
447 @enumerate
448 @item
449 Finish changes in fixup/directory, get in CVS.
450
451 (Test with and without "quick-build", to see if really faster)
452 (need autoconf)
453
454 @item
455 Finish up Windows/Mule changes. Outline of this elsewhere; Do
456 *minimal* effort.
457
458 @item
459 Continue work on Windows stability, e.g. go through existing notes
460 on Windows Mule-ization + extract all info.
461
462 @item
463 Get Unicode translation tables integrated.
464
465 Finish UCS2/UTF16 coding system.
466
467 @item
468 Make sure coding system priority list is language-environment specific.
469
470 @item
471 Consider moving language selection Menu up to be parallel with Mule menu.
472
473 @item
474 Check to make sure we grok the default locale at startup under
475 Windows and understand the Windows locales. Finish implementation
476 of mswindows-multibyte and make sure it groks all the locales.
477
478 @item
479 Do the above as best as we can without using Unicode tables.
480
481 @item
482 Start tagging all text with a language text property,
483 indicating the current language environment when the text was input.
484
485 @item
486 Make sure we correctly accept input of non-ASCII chars
487 (probably already do!)
488
489 @item
490 Implement active language/keyboard switching under Windows.
491
492 @item
493 Look into implementing support for "MS IME" protocol (Microsoft
494 fancy built-in Asian input methods).
495
496 @item
497 Redo implementation of mswindows-multibyte and internal display to
498 entirely use translation to/from Unicode for increased accuracy.
499
500 @item
501 Implement buf<->char improvements from FSF. Also implement
502 my string byte<->char optimization structure.
503
504 @item
505 Integrate all Mule DOCS from 20.6 or 21.0. Try to add sections
506 for what we've added.
507
508 @item
509 Implement 8-bit fixed width optimizations. Then work on 16-bit.
510 @end enumerate
511
512 ==========================================================================
513 - Mule design issues (more) (ben)
514 ==========================================================================
515
516 Get minimal Mule for Windows working using Ikeyama's patches. At
517 first, rely on his conversion of internal -> external
518 locale-specific but very soon (as soon as we get translation
519 tables) we can switch to using Unicode versions of display funs, which
520 will allow many more charsets to be handled and in a more
521 consistent fashion.
522
523 i.e. to convert an internal string to an external format, at first
524 we use our own knowledge of the Microsoft locale file formats but
525 an alternative is to convert to Unicode and use Microsoft's
526 convert-Unicode-to-locale encoding functions. This gains us a
527 great deal of generality, since in practice all charset caching
528 points can be wrapped into Unicode caching points.
529
530 This requires adding UCS2 support, which I'm doing. This support
531 would let us convert internal -> Unicode, which is exactly what we
532 want.
533
534 At first, though, I would do the UCS2 support, but leave the
535 existing way of doing things in redisplay. Meanwhile, I'd go
536 through and fix up the places in the code that assume we are
537 dealing with unibytes.
538
539 After this, the font problems will be fixed, and we should have a
540 pretty well working XEmacs + MULE under Windows. The only real
541 other work is the clipboard code, which should be straightforward.
542
543 ==========================================================================
544 - Mule design discussion
545 ==========================================================================
546
547 --------------------------------------------------------------------------
548
549 Ben
550
551 April 11, 2000
552
553 Well yes, this was the whole point of my "no lossage" proposal of being
554 able to undo any coding-system transformation on a buffer. The idea was
555 to figure out which transformations were definitely reversible, and for
556 all the others, cache the original text in a text property. This way, you
557 could probably still do a fairly good job at constructing a good reversal
558 even after you've gone into the text and added, deleted, and rearranged
559 some things.
560
561 But you could implement it much more simply and usefully by just
562 determining, for any text being decoded into mule-internal, can we go back
563 and read the source again? If not, remember the entire file (GNUS
564 message, etc) in text properties. Then, implement the UI interface (like
565 Netscape's) on top of that. This way, you have something that at least
566 works, but it might be inefficient. All we would need to do is work on
567 making the
568 underlying implementation more efficient.
569
570 Are you interested in doing this? It would be a huge win for users.
571 Hrvoje Niksic wrote:
572
573 > Ben Wing <ben@666.com> writes:
574 >
575 > > let me know exactly what "rethink" functionality you want and i'll
576 > > come up with an interface. perhaps you just want something like
577 > > netscape's encoding menu, where if you switch encodings, it reloads
578 > > and reencodes?
579 >
580 > It might be a bit more complex than that. In many cases, it's hard or
581 > impossible to meaningfully "reload" -- for instance, this
582 > functionality should be available while editing a Gnus message, as
583 > well as while visiting a file.
584 >
585 > For the special case of Latin-N <-> Latin-M conversion, things could
586 > be done easily -- to convert from N to M, you only need to convert
587 > internal representation back to N, and then convert it forth to M.
588
589 --------------------------------------------------------------------------
590 April 11, 2000
591
592 Well yes, this was the whole point of my "no lossage" proposal of being
593 able to undo any coding-system transformation on a buffer. The idea was
594 to figure out which transformations were definitely reversible, and for
595 all the others, cache the original text in a text property. This way, you
596 could probably still do a fairly good job at constructing a good reversal
597 even after you've gone into the text and added, deleted, and rearranged
598 some things.
599
600 But you could implement it much more simply and usefully by just
601 determining, for any text being decoded into mule-internal, can we go back
602 and read the source again? If not, remember the entire file (GNUS
603 message, etc) in text properties. Then, implement the UI interface (like
604 Netscape's) on top of that. This way, you have something that at least
605 works, but it might be inefficient. All we would need to do is work on
606 making the
607 underlying implementation more efficient.
608
609 Are you interested in doing this? It would be a huge win for users.
610 Hrvoje Niksic wrote:
611
612 > Ben Wing <ben@666.com> writes:
613 >
614 > > let me know exactly what "rethink" functionality you want and i'll
615 > > come up with an interface. perhaps you just want something like
616 > > netscape's encoding menu, where if you switch encodings, it reloads
617 > > and reencodes?
618 >
619 > It might be a bit more complex than that. In many cases, it's hard or
620 > impossible to meaningfully "reload" -- for instance, this
621 > functionality should be available while editing a Gnus message, as
622 > well as while visiting a file.
623 >
624 > For the special case of Latin-N <-> Latin-M conversion, things could
625 > be done easily -- to convert from N to M, you only need to convert
626 > internal representation back to N, and then convert it forth to M.
627
628
629 ------------------------------------------------------------------------
630
631 ==========================================================================
632 - Redoing translation macros [old]
633 ==========================================================================
634
635 Currently the translation macros (the macros with names such as
636 GET_C_STRING_CTEXT_DATA_ALLOCA) have names that are difficult to parse
637 or remember, and are not all that general. In the process of
638 reviewing the Windows code so that it could be muleized, I discovered
639 that these macros need to be extended in various ways to allow for
640 the Windows code to be easily muleized.
641
642 Since the macros needed to be changed anyways, I figured it would be a
643 good time to redo them properly. I propose new macros which have
644 names like this:
645
646 @itemize @bullet
647 @item
648 <A>_TO_EXTERNAL_FORMAT_<B>
649 @item
650 <A>_TO_EXTERNAL_FORMAT_<B>_1
651 @item
652 <C>_TO_INTERNAL_FORMAT_<D>
653 @item
654 <C>_TO_INTERNAL_FORMAT_<D>_1
655 @end itemize
656
657 A and C represent the source of the data, and B and D represent the
658 sink of the data.
659
660 All of these macros call either the functions
661 convert_to_external_format or convert_to_internal_format internally,
662 with some massaging of the arguments.
663
664 All of these macros take the following arguments:
665
666 @itemize @bullet
667 @item
668 First, one or two arguments indicating the source of the data.
669 @item
670 Second, an argument indicating the coding system. (In order to avoid
671 an excessive number of macros, we no longer provide separate macros
672 for specific coding systems.)
673 @item
674 Third, one or two arguments indicating the sink of the data.
675 @item
676 Fourth, optionally, arguments indicating the error behavior and the
677 warning class (these arguments are only present in the _1 versions
678 of the macros). The other, shorter named macros are trivial
679 interfaces onto these macros with the error behavior being
680 ERROR_ME_WARN, with the warning class being Vstandard_warning_class.
681 @end itemize
682
683 <A> can be one of the following:
684 @itemize @bullet
685 @item
686 LISP (which means a Lisp string) Takes one argument, a Lisp Object.
687 @item
688 LSTREAM (which indicates an lstream) Takes one argument, an
689 lstream. The data is read from the lstream until EOF is reached.
690 @item
691 DATA (which indicates a raw memory area) Takes two arguments, a
692 pointer and a length in bytes.
693 (You must never use this if the source of the data is a Lisp string,
694 because of the possibility of relocation during garbage collection.)
695 @end itemize
696
697 <B> can be one of the following:
698 @itemize @bullet
699 @item
700 ALLOCA (which means that the resulting data is stored in alloca()ed
701 memory. Two arguments should be specified, a pointer and a length,
702 which should be lvalues.)
703 @item
704 MALLOC (which means that the resulting data is stored in malloc()ed
705 memory. Two arguments should be specified, a pointer and a
706 length. The memory must be free()d by the caller.)
707 @item
708 OPAQUE (which means the resulting data is stored in an opaque Lisp
709 Object. This takes one argument, an lvalue Lisp Object.)
710 @item
711 LSTREAM. The data is written to an lstream.
712 @end itemize
713
714 <C> can be one of the following:
715 @itemize @bullet
716 @item
717 DATA
718 @item
719 LSTREAM
720 @end itemize
721 (just like <A> above)
722
723 <D> can be one of the following:
724 @itemize @bullet
725 @item
726 ALLOCA
727 @item
728 MALLOC
729 @item
730 LISP This means a Lisp String.
731 @item
732 BUFFER The resulting data is inserted into a buffer at the buffer's
733 value of point.
734 @item
735 LSTREAM The data is written to the lstream.
736 @end itemize
737
738
739 Note that I have eliminated the FORMAT argument of previous macros,
740 and replaced it with a coding system. This was made possible by
741 coding system aliases. In place of old `format's, we use a `virtual
742 coding system', which is aliased to the actual coding system.
743
744 The value of the coding system argument can be anything that is legal
745 input to get_coding_system, i.e. a symbol or a coding system object.
746
747 ==========================================================================
748 - creation of generic macros for accessing internally formatted data [old]
749 ==========================================================================
750
751 I have a design; it's all written down (I did it in Tsukuba), and I just have
752 to have it transcribed. It's higher level than the macros, though; it's Lisp
753 primitives that I'm designing.
754
755 As for the design of the macros, don't worry so much about all files having to
756 get included (which is inevitable with macros), but about how the files are
757 separated. Your design might go like this:
758
759 @enumerate
760 @item
761 you have generic macro interfaces, which specify a particular
762 behavior but not an implementation. these generic macros have
763 complementary versions for buffers and for strings (and the buffer
764 or string is an argument to all of the macros), and do such things
765 as convert between byte and char indices, retrieve the character at
766 a particular byte or char index, increment or decrement a byte
767 index to the beginning of the next or previous character, indicate
768 the number of bytes occupied by the character at a particular byte
769 or character index, etc. These are similar to what's already out
770 there except that they confound buffers and strings and that they
771 can also work with actual char *'s, which I think is a really bad
772 idea because it encourages code to "assume" that the representation
773 is ASCII compatible, which it might not be (e.g. 16-bit fixed
774 width). In fact, one thing I'm planning on doing is redefining
775 Bufbyte as a struct, for debugging purposes, to catch all places
776 that cavalierly compare them with ASCII char's. Note also that I
777 really want to rename Bufpos and Bytind, which are confusing and
778 wrong in that they also apply to strings. They should be Bytepos
779 and Charpos, or something like that, to go along with Bytecount and
780 Charcount. Similarly, Bufbyte is a misnomer and should be
781 Intbyte -- a byte in the internal string representation (any of the
782 internal representations) of a string or buffer. Corresponding to
783 this is Extbyte (which we already have), a byte in any external
784 string representation. We also have Extcount, which makes sense,
785 and we might possibly want Extcharcount, the number of characters
786 in an external string representation; but that gets sticky in modal
787 encodings, and it's not clear how useful it would be.
788
789 @item
790 for all generic macro interfaces, there are specific versions of
791 each of them for each possible representation (pure ASCII in the
792 non-Mule world, Mule standard, UTF-8, 8-bit fixed, 16-bit fixed,
793 32-bit fixed, etc.; there may well be more than one possible 16-bit
794 fixed version, as well). Each representation has a corresponding
795 prefix, e.g. MULE_ or FIXED16_ or whatever, which is prefixed onto
796 the generic macro names. The resulting macros perform the
797 operation defined for the macro, but assume, and only work
798 correctly with, text in the corresponding representation.
799
800 @item
801 The definition of the generic versions merely conditionalizes on
802 the appropriate things (i.e. bit flags in the buffer or string
803 object) and calls the appropriate representation-specific version.
804 There may be more than one definition (protected by ifdefs, of
805 course), or one definition that is amalgamated out of many ifdef'ed
806 sections.
807
808 @item
809 You should probably put each different representation in its own
810 header file, e.g. charset-mule.h or charset-fixed16.h or
811 charset-ascii.h or whatever. Then put the main macros into
812 charset.h, and conditionalize in this file appropriately to include
813 the other ones. That way, code that actually needs to play around
814 with internal-format text at this level can include "charset.h"
815 (certainly a much better place than buffer.h), and everyone else
816 uses higher-level routines. The representation-specific macros
817 should not normally be used *directly* at all; they are invoked
818 automatically from the generic macros. However, code that needs to
819 be highly, highly optimized might choose to take a loop and write
820 two versions of it, one for each representation, to avoid the
821 per-loop-iteration cost of a comparison. Until the macro interface
822 is rock stable and solid, we should strongly discourage such
823 nanosecond optimizations.
824 @end enumerate
825
826 ==========================================================================
827 - UTF-16 compatible representation
828 ==========================================================================
829
830 NOTE: One possible default internal representation that was compatible
831 with UTF16 but allowed all possible chars in UCS4 would be to take a
832 more-or-less unused range of 2048 chars (not from the private area
833 because Microsoft actually uses up most or all of it with EUDC chars).
834 Let's say we picked A400 - ABFF. Then, we'd have:
835
836 0000 - FFFF Simple chars
837
838 D[8-B]xx D[C-F]xx Surrogate char, represents 1M chars
839
840 A[4-B]xx D[C-F]xx D[C-F]xx Surrogate char, represents 2G chars
841
842 This is exactly the same number of chars as UCS-4 handles, and it follows the
843 same property as UTF8 and Mule-internal:
844
845 @enumerate
846 @item
847 There are two disjoint groupings of units, one representing leading units
848 and one representing non-leading units.
849 @item
850 Given a leading unit, you immediately know how many units follow to make
851 up a valid char, irrespective of any other context.
852 @end enumerate
853
854 Note that A4xx is actually currently assigned to Yi. Since this is an
855 internal representation, we could just move these elsewhere.
856
857 An alternative is to pick two disjoint ranges, e.g. 2D00 - 2DFF and
858 A500 - ABFF.
859
860 ==========================================================================
861 New API for char->font mapping
862 ==========================================================================
863 - ; supersedes charset-registry and CCL;
864 supports all windows systems; powerful enough for Unicode; etc.
865
866 (charset-font-mapping charset)
867
868 font-mapping-specifier string
869
870 char-font-mapping-table
871
872 char-table, specifier; elements of char table are either strings (which
873 specify a registry or comparable font property), or vectors of a string
874 (same) followed by keyword-value pairs (optional). The only allowable
875 keyword currently is :ccl-program, which specifies a CCL program to map
876 the characters into font indices. Other keywords may be added,
877 e.g. allowing Elisp fragments instead of CCL programs. Also allowed is
878 [inherit], which inherits from the next less-specific char-table in the
879 specifier.
880
881 The preferred interface onto this mapping (which should be portable
882 across Emacsen) is
883
884 (set-char-font-mapping key value &optional locale tag-set how-to-add)
885
886 where key is a char, range or charset (as for put-char-table), value is
887 as above, and the other arguments are standard for specifiers. This
888 automatically creates a char table in the locale, as necessary (all
889 elements default to [inherit]). On GNU Emacs, some specifier arguments
890 may be unimplemented.
891
892 (char-font-mapping key value &optional locale)
893 works vaguely like get-specifier? But does inheritance processing.
894 locale should clearly default here to current-buffer
895
896 #### should get-specifier as well? Would make it work most like
897 #### buffer-local variables.
898
899 NB. set-charset-registry and set-charset-ccl-program are obsoleted.
900
901 ==========================================================================
902 Implementing fixed-width 8,16,32 bit buffer optimizations
903 ==========================================================================
904
905 Add set-buffer-optimization (buffer &rest keywords) for
906 controlling these things.
907
908 Also, put in hack so that correct arglist can be retrieved by
909 Lisp code.
910
911 Look at the way keyword primitives are currently handled; make
912 sure it works and is documented, etc.
913
914 Implement 8-bit fixed width optimization. Take the things that
915 know about the actual implementation and put them in a single
916 file, in essence creating an abstraction layer to allow
917 pluggable internal representations. Implement a fairly general
918 scheme for mapping between character codes in the 8-bit or 16-bit
919 representation and actual charset characters. As part of
920 set-buffer-optimization, you can specify a list of character sets
921 to be used in the 8 bit to 16 bit, etc. world. You can also
922 request that the buffer be in 8, 16, etc. if possible.
923
924 -> set defaults wrt this.
925 -> perhaps this should be just buffer properties.
926 -> this brings up the idea of default properties on an object.
927 -> Implement default-put, default-get, etc.
928
929 What happens when a character not assigned in the range gets
930 added? Then, must convert to variable width of some sort.
931
932 Note: at first, possibly we just convert whole hog to get things
933 right. Then we'd have to pay attention to characters that got
934 added + deleted that were unassigned in the fixed width. When
935 this goes to zero and there's been enough time (heuristics), we
936 go back to fixed.
937
938 Side note: We could dynamically build up the set of assigned
939 chars as they go. Conceivably this could even go down to the
940 single char level: Just keep a big array of mapping from 16 bit
941 values to chars, and add to it each time a char is encountered
942 that wasn't there before. Problem: need inverse mapping.
943
944 -> Possibility; chars are actual objects, not just numbers.
945 Then you could keep track of such info in the chars itself.
946 *Think about this.*
947
948 Eventually, we might consider allowing mixed fixed-width,
949 variable-width buffer encodings. Then, we use range tables to
950 indicate which sections are fixed and which variable and INC_CHAR does
951 something like this: binary search to find the current range, which
952 indicates whether it's fixed or variable, and tells us what the
953 increment is. We can cache this info and use it next time to speed
954 up.
955
956 -> We will then have two partially shared range tables - one for
957 overall fixed width vs. variable width, and possibly one containing
958 this same info, but partitioning the variable width in one. Maybe
959 need fancier nested range table model.
960
961 ==========================================================================
962 Expansion of display table and case mapping table support for all
963 chars, not just ASCII/Latin1.
964 ==========================================================================
965
966 ==========================================================================
967 Improved flexibility for display tables, and evaluation of its
968 features to make sure it meshes with and complements the char<->font
969 mapping API mentioned earlier
970 ==========================================================================
971
972 ==========================================================================
973 String access speedup:
974 ==========================================================================
975
976 For strings larger than some size in bytes (10?), keep extra fields of
977 info: length in chars, and a (char, byte) pair in the middle to speed
978 up sequential access.
979
980 (Better idea: do this for any size string, but only if it contains
981 non-ASCII chars. Then if info is missing, we know string is
982 ASCII-only.)
983
984 Use a string-extra-info object, replacing string property slot and
985 containing fields for string mod tick, string extents, string props,
986 and string char length, and cached (char,byte) pair.
987 string-extra-info (or string-auxiliary?) objects could be in frob
988 blocks, esp. if creating frob blocks is easy + worth it.
989
990 - Caching of char<->byte conversions in strings - should make nearly
991 all operations on strings O(N)
992
993 ==========================================================================
994 Improvements in buffer char<->byte mapping
995 ==========================================================================
996
997 - Range table implementation - especially when there are few runs of
998 different widths, e.g. recently converted from fixed-width
999 optimization to variable width
1000
1001 Range Tables to speed up Bufpos <-> Bytind caching
1002 ==================================================
1003
1004 This describes an alternative implementation using ranges. We
1005 maintain a range table of all spans of characters of a fixed width.
1006 Updating this table could take time if there are a large number of
1007 spans; but constant factors of operations should be quick. This method really wins
1008 when you have 8-bit buffers just converted to variable width, where
1009 there will be few spans. More specifically, lookup in this range
1010 table is O(log N) and can be done with simple binary search, which is
1011 very fast. If we maintain the ranges using a gap array, updating this
1012 table will be fast for local operations, which is most of the time.
1013
1014 We will also provide (at first, at least) a Lisp function to set the
1015 caching mechanism explicitly - either range tables or the existing
1016 implementation. Eventually, we want to improve things, to the point
1017 where we automatically pick the right caching for the situation and
1018 have more caching schemes implemented.
1019
1020 ==========================================================================
1021 - Robustify Text Properties
1022 ==========================================================================
1023
1024 ==========================================================================
1025 Support for unified internal representation, e.g. Unicode
1026 ==========================================================================
1027
1028 Start tagging all text with a language text property,
1029 indicating the current language environment when the text was input.
1030 (needs "Robustify Text Properties")
1031
1032 ==========================================================================
1033 - Generalized Coding Systems
1034 ==========================================================================
1035
1036 - Lisp API for Defining Coding Systems
1037
1038 User-defined coding systems.
1039
1040 (define-coding-system-type 'type
1041 :encode-function fun
1042 :decode-function fun
1043 :detect-function fun
1044 :buffering (number = at least this many chars
1045 line = buffer up to end of line
1046 regexp = buffer until this regexp is found in match
1047 source data. match data will be appropriate when fun is
1048 called
1049
1050 encode fun is called as
1051
1052 (encode instream outstream)
1053
1054 should read data from instream and write converted result onto
1055 outstream. Can leave some data stuff in stream, it will reappear
1056 next time. Generally, there is a finite amount of data in instream
1057 and further attempts to read lead to would-block errors or retvals.
1058 Can use instream properties to record state. May use read-stream
1059 functionality to read everything into a vector or string.
1060
1061 ->Need vectors + string exposed to resizing of Lisp implementation
1062 where necessary.
1063
1064 ==========================================================================
1065 Support Windows Active Kbd Switching, Far East IME API (done already?)
1066 ==========================================================================
1067
1068 ==========================================================================
1069 - UI/design changes for Coding System Pipelining
1070 ==========================================================================
1071
1072 ------------------------------------------------------------------
1073 CODING-SYSTEM CHAINS
1074 ------------------------------------------------------------------
1075
1076 sjt sez:
1077
1078 There should be no elementary coding systems in the Lisp API, only
1079 chains. Chains should be declared, not computed, as a sequence of coding
1080 formats. (Probably the internal representation can be a vector for
1081 efficiency but programmers would probably rather work with lists.) A
1082 stream has a token type. Most streams are octet streams. Text is a
1083 stream of characters (in _internal_ format; a file on disk is not text!)
1084 An octet-stream has no implicit semantics, so its format must always be
1085 specified. The only type currently having semantics is characters. This
1086 means that the chain (euc-jp -> internal -> shift_jis) may be specified
1087 (euc-jp, shift_jis), and if no euc-jp -> shift_jis converter is
1088 available, then the chain is automatically constructed. (N.B. If we
1089 have fixed width buffers in the future, then we could have ASCII -> 8-bit
1090 char -> 16-bit char -> ISO-2022-JP (with escape sequences).)
1091
1092 EOL handling is a char <-> char coding. It should not be part of another
1093 coding system except as a convenience for users. For text coding,
1094 automatically insert EOL handlers between char <-> octet boundaries.
1095
1096 ------------------------------------------------------------------
1097 ABOUT DETECTION
1098 ------------------------------------------------------------------
1099
1100
1101 ------------------------------------------------------------------
1102 EFFICIENCY OF CODING CONVERSION WITH MULTIPLE COPIES/CHAINS
1103 ------------------------------------------------------------------
1104
1105 A comment in encode_decode_coding_region():
1106
1107 The chain of streams looks like this:
1108
1109 [BUFFER] <----- (( read from/send to loop ))
1110 ------> [CHAR->BYTE i.e. ENCODE AS BINARY if source is
1111 in bytes]
1112 ------> [ENCODE/DECODE AS SPECIFIED]
1113 ------> [BYTE->CHAR i.e. DECODE AS BINARY
1114 if sink is in bytes]
1115 ------> [AUTODETECT EOL if
1116 we're decoding and
1117 coding system calls
1118 for this]
1119 ------> [BUFFER]
1120
1121 sjt (?) responds:
1122
1123 Of course, this is just horrible. BYTE<->CHAR should only be available
1124 to I/O routines. It should not be visible to Mule proper.
1125
1126 A comment on the implementation. Hrvoje and Kyle worry about the
1127 inefficiency of repeated copying among buffers that chained coding
1128 systems entail. But this may not be as time inefficient as it appears
1129 in the Mule ("house rules") context. The issue is how do you chain
1130 coding systems without copying? In theory you could have
1131
1132 IChar external_to_raw (ExtChar *cp, State *s);
1133 IChar decode_utf16 (IChar c, State *s);
1134 IChar decode_crlf (ExtChar *cp, State *s);
1135
1136 typedef Ichar (*Converter[]) (Ichar, State*);
1137
1138 Converter utf16[2] = { &decode_utf16, &decode_crlf };
1139
1140 void convert (ExtChar *inbuf, IChar *outbuf, Converter cvtr)
1141 {
1142 int i;
1143 ExtChar c;
1144 State s;
1145
1146 while (c = external_to_raw (*inbuf++, &s))
1147 {
1148 for (i = 0; i < sizeof(cvtr)/sizeof(Converter); ++i)
1149 if (s.ready)
1150 c = (*cvtr[i]) (c, &s);
1151 }
1152 if (s.ready)
1153 *outbuf++ = c;
1154 }
1155
1156 But this is a lot of function calls; what Ben is doing is basically
1157 reducing this to one call per buffer-full. The only way to avoid this
1158 is to hardcode all the "interesting" coding systems, maybe using
1159 inline or macros to give structure. But this is still a huge amount
1160 of work, and code.
1161
1162 One advantage to the call-per-char approach is that we might be able
1163 to do something about the marker/extent destruction that coding
1164 normally entails.
1165
1166 ben sez:
1167
1168 it should be possible to preserve the markers/extents without
1169 switching completely to one-call-per-char -- we could at least do one
1170 call per "run", where a run is more or less the maximal stretch of
1171 text not overlapping any markers or extent boundaries. (It's a bit
1172 more complicated if we want to properly support the different extent
1173 begins/ends; in some cases we might have to pump a single character
1174 adjacent to where two extents meet.) The "stateless" way that I wrote
1175 all of the conversion routines may be a real hassle but it allows
1176 something like this to work without too much problem -- pump in one
1177 run at a time into one end of the chain, do a flush after each
1178 iteration, and stick what comes out the other end in its place.
1179
1180 ------------------------------------------------------------------
1181 ABOUT FORMATS
1182 ------------------------------------------------------------------
1183
1184 when calling make-coding-system, the name can be a cons of (format1 .
1185 format2), specifying that it decodes format1->format2 and encodes the other
1186 way. if only one name is given, that is assumed to be format1, and the
1187 other is either `external' or `internal' depending on the end type.
1188 normally the user when decoding gives the decoding order in formats, but
1189 can leave off the last one, `internal', which is assumed. a multichain
1190 might look like gzip|multibyte|unicode, using the coding systems named
1191 `gzip', `(unicode . multibyte)' and `unicode'. the way this actually works
1192 is by searching for gzip->multibyte; if not found, look for gzip->external
1193 or gzip->internal. (In general we automatically do conversion between
1194 internal and external as necessary: thus gzip|crlf does the expected, and
1195 maps to gzip->external, external->internal, crlf->internal, which when
1196 fully specified would be gzip|external:external|internal:crlf|internal --
1197 see below.) To forcibly fit together two converters that have explicitly
1198 specified and incompatible names (say you have unicode->multibyte and
1199 iso8859-1->ebcdic and you know that the multibyte and iso8859-1 in this
1200 case are compatible), you can force-cast using :, like this:
1201 ebcdic|iso8859-1:multibyte|unicode. (again, if you force-cast between
1202 internal and external formats, the conversion happens automatically.)
1203
1204 --------------------------------------------------------------------------
1205 ABOUT PDUMP, UNICODE, AND RUNNING XEMACS FROM A DIRECTORY WITH WEIRD CHARS
1206 --------------------------------------------------------------------------
1207
1208 -- there's the problem that XEmacs can't be run in a directory with
1209 non-ASCII/Latin-1 chars in it, since it will be doing Unicode
1210 processing before we've had a chance to load the tables. In fact,
1211 even finding the tables in such a situation is problematic using
1212 the normal commands. my idea is to eventually load the stuff
1213 extremely extremely early, at the same time as the pdump data gets
1214 loaded. in fact, the unicode table data (stored in an efficient
1215 binary format) can even be stuck into the pdump file (which would
1216 mean as a resource to the executable, for windows). we'd need to
1217 extend pdump a bit: to allow for attaching extra data to the pdump
1218 file. (something like pdump_attach_extra_data (addr, length)
1219 returns a number of some sort, an index into the file, which you
1220 can then retrieve with pdump_load_extra_data(), which returns an
1221 addr (mmap()ed or loaded), and later you pdump_unload_extra_data()
1222 when finished. we'd probably also need
1223 pdump_attach_extra_data_append(), which appends data to the data
1224 just written out with pdump_attach_extra_data(). this way,
1225 multiple tables in memory can be written out into one contiguous
1226 table. (we'd use the tar-like trick of allowing new blocks to be
1227 written without going back to change the old blocks -- we just rely
1228 on the end of file/end of memory.) this same mechanism could be
1229 extracted out of pdump and used to handle the non-pdump situation
1230 (or alternatively, we could just dump either the memory image of
1231 the tables themselves or the compressed binary version). in the
1232 case of extra unicode tables not known about at compile time that
1233 get loaded before dumping, we either just dump them into the image
1234 (pdump and all) or extract them into the compressed binary format,
1235 free the original tables, and treat them like all other tables.
1236
1237
1238 ==========================================================================
1239 - Generalized language appropriate word wrapping (requires
1240 layout-exposing API defined in BIDI section)
1241 ==========================================================================
1242
1243 ==========================================================================
1244 - Make Custom Mule-aware
1245 ==========================================================================
1246
1247 ==========================================================================
1248 - Composite character support
1249 ==========================================================================
1250
1251 ==========================================================================
1252 - Language appropriate sorting and searching
1253 ==========================================================================
1254
1255 ==========================================================================
1256 - Glyph shaping for Arabic and Devanagari
1257 ==========================================================================
1258
1259 - (needs to be handled mostly
1260 at C level, as part of layout; luckily it's entirely local in its
1261 changes, so this is not hard)
1262
1263
1264 ==========================================================================
1265 Consider moving language selection Menu up to be parallel with Mule menu
1266 ==========================================================================
1267
1268 */
1269
625 1270
626 1271
627 /************************************************************************/ 1272 /************************************************************************/
628 /* declarations */ 1273 /* declarations */
629 /************************************************************************/ 1274 /************************************************************************/
693 { 1338 {
694 va_list args; 1339 va_list args;
695 int retval; 1340 int retval;
696 1341
697 va_start (args, format); 1342 va_start (args, format);
698 retval = vsprintf ((char *) buffer, format, args); 1343 retval = vsprintf ((Chbyte *) buffer, format, args);
699 va_end (args); 1344 va_end (args);
700 1345
701 return retval; 1346 return retval;
702 } 1347 }
703 1348
758 1403
759 return (cm[*s1] - cm[*--s2]); 1404 return (cm[*s1] - cm[*--s2]);
760 } 1405 }
761 1406
762 int 1407 int
763 ascii_strcasecmp (const Char_ASCII *s1, const Char_ASCII *s2) 1408 ascii_strcasecmp (const Ascbyte *s1, const Ascbyte *s2)
764 { 1409 {
765 return qxestrcasecmp ((const Ibyte *) s1, (const Ibyte *) s2); 1410 return qxestrcasecmp ((const Ibyte *) s1, (const Ibyte *) s2);
766 } 1411 }
767 1412
768 int 1413 int
769 qxestrcasecmp_c (const Ibyte *s1, const Char_ASCII *s2) 1414 qxestrcasecmp_ascii (const Ibyte *s1, const Ascbyte *s2)
770 { 1415 {
771 return qxestrcasecmp (s1, (const Ibyte *) s2); 1416 return qxestrcasecmp (s1, (const Ibyte *) s2);
772 } 1417 }
773 1418
774 /* An internationalized version that collapses case in a general fashion. 1419 /* An internationalized version that collapses case in a general fashion.
812 1457
813 return 0; 1458 return 0;
814 } 1459 }
815 1460
816 int 1461 int
817 ascii_strncasecmp (const Char_ASCII *s1, const Char_ASCII *s2, Bytecount len) 1462 ascii_strncasecmp (const Ascbyte *s1, const Ascbyte *s2, Bytecount len)
818 { 1463 {
819 return qxestrncasecmp ((const Ibyte *) s1, (const Ibyte *) s2, len); 1464 return qxestrncasecmp ((const Ibyte *) s1, (const Ibyte *) s2, len);
820 } 1465 }
821 1466
822 int 1467 int
823 qxestrncasecmp_c (const Ibyte *s1, const Char_ASCII *s2, Bytecount len) 1468 qxestrncasecmp_ascii (const Ibyte *s1, const Ascbyte *s2, Bytecount len)
824 { 1469 {
825 return qxestrncasecmp (s1, (const Ibyte *) s2, len); 1470 return qxestrncasecmp (s1, (const Ibyte *) s2, len);
826 } 1471 }
827 1472
828 /* Compare LEN_FROM_S1 worth of characters from S1 with the same number of 1473 /* Compare LEN_FROM_S1 worth of characters from S1 with the same number of
1032 { 1677 {
1033 return qxetextcasecmp (XSTRING_DATA (s1), XSTRING_LENGTH (s1), 1678 return qxetextcasecmp (XSTRING_DATA (s1), XSTRING_LENGTH (s1),
1034 XSTRING_DATA (s2), XSTRING_LENGTH (s2)); 1679 XSTRING_DATA (s2), XSTRING_LENGTH (s2));
1035 } 1680 }
1036 1681
1682 /* Compare a wide string with an ASCII string */
1683
1684 int
1685 wcscmp_ascii (const wchar_t *s1, const Ascbyte *s2)
1686 {
1687 while (*s1 && *s2)
1688 {
1689 if (*s1 != *s2)
1690 break;
1691 s1++, s2++;
1692 }
1693
1694 return *s1 - *s2;
1695 }
1696
1697 int
1698 wcsncmp_ascii (const wchar_t *s1, const Ascbyte *s2, Charcount len)
1699 {
1700 while (len--)
1701 {
1702 int diff = *s1 - *s2;
1703 if (diff != 0)
1704 return diff;
1705 if (!*s1)
1706 return 0;
1707 s1++, s2++;
1708 }
1709
1710 return 0;
1711 }
1712
1037 1713
1038 /************************************************************************/ 1714 /************************************************************************/
1039 /* conversion between textual representations */ 1715 /* conversion between textual representations */
1040 /************************************************************************/ 1716 /************************************************************************/
1041 1717
1042 /* NOTE: Does not reset the Dynarr. */ 1718 /* NOTE: Does not reset the Dynarr. */
1043 1719
1044 void 1720 void
1045 convert_ibyte_string_into_ichar_dynarr (const Ibyte *str, Bytecount len, 1721 convert_ibyte_string_into_ichar_dynarr (const Ibyte *str, Bytecount len,
1046 Ichar_dynarr *dyn) 1722 Ichar_dynarr *dyn)
1047 { 1723 {
1048 const Ibyte *strend = str + len; 1724 const Ibyte *strend = str + len;
1049 1725
1050 while (str < strend) 1726 while (str < strend)
1051 { 1727 {
1055 } 1731 }
1056 } 1732 }
1057 1733
1058 Charcount 1734 Charcount
1059 convert_ibyte_string_into_ichar_string (const Ibyte *str, Bytecount len, 1735 convert_ibyte_string_into_ichar_string (const Ibyte *str, Bytecount len,
1060 Ichar *arr) 1736 Ichar *arr)
1061 { 1737 {
1062 const Ibyte *strend = str + len; 1738 const Ibyte *strend = str + len;
1063 Charcount newlen = 0; 1739 Charcount newlen = 0;
1064 while (str < strend) 1740 while (str < strend)
1065 { 1741 {
1097 Ibyte * 1773 Ibyte *
1098 convert_ichar_string_into_malloced_string (Ichar *arr, int nels, 1774 convert_ichar_string_into_malloced_string (Ichar *arr, int nels,
1099 Bytecount *len_out) 1775 Bytecount *len_out)
1100 { 1776 {
1101 /* Damn zero-termination. */ 1777 /* Damn zero-termination. */
1102 Ibyte *str = (Ibyte *) ALLOCA (nels * MAX_ICHAR_LEN + 1); 1778 Ibyte *str = alloca_ibytes (nels * MAX_ICHAR_LEN + 1);
1103 Ibyte *strorig = str; 1779 Ibyte *strorig = str;
1104 Bytecount len; 1780 Bytecount len;
1105 1781
1106 int i; 1782 int i;
1107 1783
1108 for (i = 0; i < nels; i++) 1784 for (i = 0; i < nels; i++)
1109 str += set_itext_ichar (str, arr[i]); 1785 str += set_itext_ichar (str, arr[i]);
1110 *str = '\0'; 1786 *str = '\0';
1111 len = str - strorig; 1787 len = str - strorig;
1112 str = (Ibyte *) xmalloc (1 + len); 1788 str = xnew_ibytes (1 + len);
1113 memcpy (str, strorig, 1 + len); 1789 memcpy (str, strorig, 1 + len);
1114 if (len_out) 1790 if (len_out)
1115 *len_out = len; 1791 *len_out = len;
1116 return str; 1792 return str;
1117 } 1793 }
1438 { 2114 {
1439 Ibyte *newdata; 2115 Ibyte *newdata;
1440 2116
1441 ei->max_size_allocated_ = 2117 ei->max_size_allocated_ =
1442 eifind_large_enough_buffer (0, ei->bytelen_ + 1); 2118 eifind_large_enough_buffer (0, ei->bytelen_ + 1);
1443 newdata = (Ibyte *) xmalloc (ei->max_size_allocated_); 2119 newdata = xnew_ibytes (ei->max_size_allocated_);
1444 memcpy (newdata, ei->data_, ei->bytelen_ + 1); 2120 memcpy (newdata, ei->data_, ei->bytelen_ + 1);
1445 ei->data_ = newdata; 2121 ei->data_ = newdata;
1446 } 2122 }
1447 2123
1448 if (ei->extdata_) 2124 if (ei->extdata_)
1449 { 2125 {
1450 Extbyte *newdata = (Extbyte *) xmalloc (ei->extlen_ + 2); 2126 Extbyte *newdata = xnew_extbytes (ei->extlen_ + 2);
1451 2127
1452 memcpy (newdata, ei->extdata_, ei->extlen_); 2128 memcpy (newdata, ei->extdata_, ei->extlen_);
1453 /* Double null-terminate in case of Unicode data */ 2129 /* Double null-terminate in case of Unicode data */
1454 newdata[ei->extlen_] = '\0'; 2130 newdata[ei->extlen_] = '\0';
1455 newdata[ei->extlen_ + 1] = '\0'; 2131 newdata[ei->extlen_ + 1] = '\0';
1494 dst = ei2->data_; 2170 dst = ei2->data_;
1495 dstlen = ei2->bytelen_; 2171 dstlen = ei2->bytelen_;
1496 } 2172 }
1497 2173
1498 if (is_c) 2174 if (is_c)
1499 EI_ASSERT_ASCII ((Char_ASCII *) dst, dstlen); 2175 ASSERT_ASCTEXT_ASCII_LEN ((Ascbyte *) dst, dstlen);
1500 2176
1501 return (fold_case == 0 ? qxememcmp4 (src, len, dst, dstlen) : 2177 return (fold_case == 0 ? qxememcmp4 (src, len, dst, dstlen) :
1502 fold_case == 1 ? qxememcasecmp4 (src, len, dst, dstlen) : 2178 fold_case == 1 ? qxememcasecmp4 (src, len, dst, dstlen) :
1503 qxetextcasecmp (src, len, dst, dstlen)); 2179 qxetextcasecmp (src, len, dst, dstlen));
1504 } 2180 }
1525 2201
1526 /* Optimization. Do it. Live it. Love it. */ 2202 /* Optimization. Do it. Live it. Love it. */
1527 2203
1528 #ifdef MULE 2204 #ifdef MULE
1529 2205
1530 /* Skip as many ASCII bytes as possible in the memory block [PTR, END).
1531 Return pointer to the first non-ASCII byte. optimized for long
1532 stretches of ASCII. */
1533 inline static const Ibyte *
1534 skip_ascii (const Ibyte *ptr, const Ibyte *end)
1535 {
1536 #ifdef EFFICIENT_INT_128_BIT 2206 #ifdef EFFICIENT_INT_128_BIT
1537 # define STRIDE_TYPE INT_128_BIT 2207 # define STRIDE_TYPE INT_128_BIT
1538 # define HIGH_BIT_MASK \ 2208 # define HIGH_BIT_MASK \
1539 MAKE_128_BIT_UNSIGNED_CONSTANT (0x80808080808080808080808080808080) 2209 MAKE_128_BIT_UNSIGNED_CONSTANT (0x80808080808080808080808080808080)
1540 #elif defined (EFFICIENT_INT_64_BIT) 2210 #elif defined (EFFICIENT_INT_64_BIT)
1548 #define ALIGN_BITS ((EMACS_UINT) (ALIGNOF (STRIDE_TYPE) - 1)) 2218 #define ALIGN_BITS ((EMACS_UINT) (ALIGNOF (STRIDE_TYPE) - 1))
1549 #define ALIGN_MASK (~ ALIGN_BITS) 2219 #define ALIGN_MASK (~ ALIGN_BITS)
1550 #define ALIGNED(ptr) ((((EMACS_UINT) ptr) & ALIGN_BITS) == 0) 2220 #define ALIGNED(ptr) ((((EMACS_UINT) ptr) & ALIGN_BITS) == 0)
1551 #define STRIDE sizeof (STRIDE_TYPE) 2221 #define STRIDE sizeof (STRIDE_TYPE)
1552 2222
2223 /* Skip as many ASCII bytes as possible in the memory block [PTR, END).
2224 Return pointer to the first non-ASCII byte. optimized for long
2225 stretches of ASCII. */
2226 inline static const Ibyte *
2227 skip_ascii (const Ibyte *ptr, const Ibyte *end)
2228 {
1553 const unsigned STRIDE_TYPE *ascii_end; 2229 const unsigned STRIDE_TYPE *ascii_end;
1554 2230
1555 /* Need to do in 3 sections -- before alignment start, aligned chunk, 2231 /* Need to do in 3 sections -- before alignment start, aligned chunk,
1556 after alignment end. */ 2232 after alignment end. */
1557 while (!ALIGNED (ptr)) 2233 while (!ALIGNED (ptr))
1567 && !(*ascii_end & HIGH_BIT_MASK)) 2243 && !(*ascii_end & HIGH_BIT_MASK))
1568 ascii_end++; 2244 ascii_end++;
1569 ptr = (Ibyte *) ascii_end; 2245 ptr = (Ibyte *) ascii_end;
1570 while (ptr < end && byte_ascii_p (*ptr)) 2246 while (ptr < end && byte_ascii_p (*ptr))
1571 ptr++; 2247 ptr++;
2248 return ptr;
2249 }
2250
2251 /* Skip as many ASCII bytes as possible in the memory block [END, PTR),
2252 going downwards. Return pointer to the location above the first
2253 non-ASCII byte. Optimized for long stretches of ASCII. */
2254 inline static const Ibyte *
2255 skip_ascii_down (const Ibyte *ptr, const Ibyte *end)
2256 {
2257 const unsigned STRIDE_TYPE *ascii_end;
2258
2259 /* Need to do in 3 sections -- before alignment start, aligned chunk,
2260 after alignment end. */
2261 while (!ALIGNED (ptr))
2262 {
2263 if (ptr == end || !byte_ascii_p (*(ptr - 1)))
2264 return ptr;
2265 ptr--;
2266 }
2267 ascii_end = (const unsigned STRIDE_TYPE *) ptr - 1;
2268 /* This loop screams, because we can detect ASCII
2269 characters 4 or 8 at a time. */
2270 while ((const Ibyte *) ascii_end >= end
2271 && !(*ascii_end & HIGH_BIT_MASK))
2272 ascii_end--;
2273 ptr = (Ibyte *) (ascii_end + 1);
2274 while (ptr > end && byte_ascii_p (*(ptr - 1)))
2275 ptr--;
1572 return ptr; 2276 return ptr;
1573 } 2277 }
1574 2278
1575 /* Function equivalents of bytecount_to_charcount/charcount_to_bytecount. 2279 /* Function equivalents of bytecount_to_charcount/charcount_to_bytecount.
1576 These work on strings of all sizes but are more efficient than a simple 2280 These work on strings of all sizes but are more efficient than a simple
1629 } 2333 }
1630 } 2334 }
1631 return newptr - ptr; 2335 return newptr - ptr;
1632 } 2336 }
1633 2337
2338 /* Function equivalent of charcount_to_bytecount_down. This works on strings
2339 of all sizes but is more efficient than a simple loop on large strings
2340 and probably less efficient on sufficiently small strings. */
2341
2342 Bytecount
2343 charcount_to_bytecount_down_fun (const Ibyte *ptr, Charcount len)
2344 {
2345 const Ibyte *newptr = ptr;
2346 while (1)
2347 {
2348 const Ibyte *newnewptr = skip_ascii_down (newptr, newptr - len);
2349 len -= newptr - newnewptr;
2350 newptr = newnewptr;
2351 /* Skip over all non-ASCII chars, counting the length and
2352 stopping if it's zero */
2353 while (len && !byte_ascii_p (*(newptr - 1)))
2354 if (ibyte_first_byte_p (*--newptr))
2355 len--;
2356 if (!len)
2357 break;
2358 }
2359 text_checking_assert (ptr - newptr >= 0);
2360 return ptr - newptr;
2361 }
2362
1634 /* The next two functions are the actual meat behind the 2363 /* The next two functions are the actual meat behind the
1635 charbpos-to-bytebpos and bytebpos-to-charbpos conversions. Currently 2364 charbpos-to-bytebpos and bytebpos-to-charbpos conversions. Currently
1636 the method they use is fairly unsophisticated; see buffer.h. 2365 the method they use is fairly unsophisticated; see buffer.h.
1637 2366
1638 Note that charbpos_to_bytebpos_func() is probably the most-called 2367 Note that charbpos_to_bytebpos_func() is probably the most-called
1639 function in all of XEmacs. Therefore, it must be FAST FAST FAST. 2368 function in all of XEmacs. Therefore, it must be FAST FAST FAST.
1640 This is the reason why so much of the code is duplicated. 2369 This is the reason why so much of the code is duplicated.
1641 2370
1642 Similar considerations apply to bytebpos_to_charbpos_func(), although 2371 Similar considerations apply to bytebpos_to_charbpos_func(), although
1643 less so because the function is not called so often. 2372 less so because the function is not called so often.
1644 2373 */
1645 #### At some point this should use a more sophisticated method; 2374
1646 see buffer.h. */ 2375 /*
1647 2376
2377 Info on Byte-Char conversion:
2378
2379 (Info-goto-node "(internals)Byte-Char Position Conversion")
2380 */
2381
2382 #ifdef OLD_BYTE_CHAR
1648 static int not_very_random_number; 2383 static int not_very_random_number;
2384 #endif /* OLD_BYTE_CHAR */
2385
2386 #define OLD_LOOP
2387
2388 /* If we are this many characters away from any known position, cache the
2389 new position in the buffer's char-byte cache. */
2390 #define FAR_AWAY_DISTANCE 5000
2391
2392 /* Converting between character positions and byte positions. */
2393
2394 /* There are several places in the buffer where we know
2395 the correspondence: BEG, BEGV, PT, GPT, ZV and Z,
2396 and everywhere there is a marker. So we find the one of these places
2397 that is closest to the specified position, and scan from there. */
2398
/* This macro is a subroutine of charbpos_to_bytebpos_func.  It offers
   one known (CHARPOS, BYTEPOS) correspondence to the search for the
   character position `x'.  It relies on these names from the expansion
   site: x, retval, best_above, best_above_byte, best_below,
   best_below_byte, and the label `done'.

   - If CHARPOS is exactly x, the answer is BYTEPOS: jump to `done'.
   - Otherwise, if the pair is a tighter upper or lower bound than the
     one currently held, it replaces that bound.
   - Whenever a bound was replaced and the char distance between the two
     bounds equals their byte distance, the result is interpolated
     directly and we jump to `done'.

   Note that it is desirable that BYTEPOS is not evaluated except when
   we really want its value, so it is expanded only in the branches that
   store it. */

#define CONSIDER(CHARPOS, BYTEPOS)					\
do									\
  {									\
    Charbpos cand_charpos = (CHARPOS);					\
    int tightened;							\
									\
    if (cand_charpos == x)						\
      {									\
	retval = (BYTEPOS);						\
	goto done;							\
      }									\
									\
    tightened = 1;							\
    if (cand_charpos > x && cand_charpos < best_above)			\
      {									\
	best_above = cand_charpos;					\
	best_above_byte = (BYTEPOS);					\
      }									\
    else if (cand_charpos < x && cand_charpos > best_below)		\
      {									\
	best_below = cand_charpos;					\
	best_below_byte = (BYTEPOS);					\
      }									\
    else								\
      tightened = 0;							\
									\
    if (tightened							\
	&& (best_above - best_below					\
	    == best_above_byte - best_below_byte))			\
      {									\
	retval = best_below_byte + (x - best_below);			\
	goto done;							\
      }									\
  }									\
while (0)
2440
1649 2441
1650 Bytebpos 2442 Bytebpos
1651 charbpos_to_bytebpos_func (struct buffer *buf, Charbpos x) 2443 charbpos_to_bytebpos_func (struct buffer *buf, Charbpos x)
1652 { 2444 {
2445 #ifdef OLD_BYTE_CHAR
1653 Charbpos bufmin; 2446 Charbpos bufmin;
1654 Charbpos bufmax; 2447 Charbpos bufmax;
1655 Bytebpos bytmin; 2448 Bytebpos bytmin;
1656 Bytebpos bytmax; 2449 Bytebpos bytmax;
1657 int size; 2450 int size;
1658 int forward_p; 2451 int forward_p;
1659 Bytebpos retval;
1660 int diff_so_far; 2452 int diff_so_far;
1661 int add_to_cache = 0; 2453 int add_to_cache = 0;
2454 #endif /* OLD_BYTE_CHAR */
2455
2456 Charbpos best_above, best_below;
2457 Bytebpos best_above_byte, best_below_byte;
2458 int i;
2459 struct buffer_text *t;
2460 Bytebpos retval;
2461
1662 PROFILE_DECLARE (); 2462 PROFILE_DECLARE ();
1663 2463
1664 /* Check for some cached positions, for speed. */
1665 if (x == BUF_PT (buf))
1666 return BYTE_BUF_PT (buf);
1667 if (x == BUF_ZV (buf))
1668 return BYTE_BUF_ZV (buf);
1669 if (x == BUF_BEGV (buf))
1670 return BYTE_BUF_BEGV (buf);
1671
1672 PROFILE_RECORD_ENTERING_SECTION (QSin_char_byte_conversion); 2464 PROFILE_RECORD_ENTERING_SECTION (QSin_char_byte_conversion);
2465
2466 best_above = BUF_Z (buf);
2467 best_above_byte = BYTE_BUF_Z (buf);
2468
2469 /* In this case, we simply have all one-byte characters. But this should
2470 have been intercepted before, in charbpos_to_bytebpos(). */
2471 text_checking_assert (best_above != best_above_byte);
2472
2473 best_below = BUF_BEG (buf);
2474 best_below_byte = BYTE_BUF_BEG (buf);
2475
2476 /* We find in best_above and best_above_byte
2477 the closest known point above CHARPOS,
2478 and in best_below and best_below_byte
2479 the closest known point below CHARPOS,
2480
2481 If at any point we can tell that the space between those
2482 two best approximations is all single-byte,
2483 we interpolate the result immediately. */
2484
2485 CONSIDER (BUF_PT (buf), BYTE_BUF_PT (buf));
2486 CONSIDER (BUF_GPT (buf), BYTE_BUF_GPT (buf));
2487 CONSIDER (BUF_BEGV (buf), BYTE_BUF_BEGV (buf));
2488 CONSIDER (BUF_ZV (buf), BYTE_BUF_ZV (buf));
2489
2490 t = buf->text;
2491 CONSIDER (t->cached_charpos, t->cached_bytepos);
2492
2493 /* Check the most recently entered positions first */
2494
2495 for (i = t->next_cache_pos - 1; i >= 0; i--)
2496 {
2497 CONSIDER (t->mule_charbpos_cache[i], t->mule_bytebpos_cache[i]);
2498
2499 /* If we are down to a range of 50 chars,
2500 don't bother checking any other markers;
2501 scan the intervening chars directly now. */
2502 if (best_above - best_below < 50)
2503 break;
2504 }
2505
2506 /* We get here if we did not exactly hit one of the known places.
2507 We have one known above and one known below.
2508 Scan, counting characters, from whichever one is closer. */
2509
2510 if (x - best_below < best_above - x)
2511 {
2512 int record = x - best_below > FAR_AWAY_DISTANCE;
2513
2514 #ifdef OLD_LOOP /* old code */
2515 while (best_below != x)
2516 {
2517 best_below++;
2518 INC_BYTEBPOS (buf, best_below_byte);
2519 }
2520 #else
2521 text_checking_assert (BUF_FORMAT (buf) == FORMAT_DEFAULT);
2522 /* The gap should not occur between best_below and x, or we will be
2523 screwed in using charcount_to_bytecount(). It should not be exactly
2524 at x either, because we already should have caught that. */
2525 text_checking_assert
2526 (BUF_CEILING_OF_IGNORE_ACCESSIBLE (buf, best_below) > x);
2527
2528 /* Using charcount_to_bytecount() is potentially a lot faster than a
2529 simple loop using INC_BYTEBPOS() because (a) the checks for gap
2530 and buffer format are factored out instead of getting checked
2531 every time; (b) the checking goes 4 or 8 bytes at a time in ASCII
2532 text.
2533 */
2534 best_below_byte +=
2535 charcount_to_bytecount
2536 (BYTE_BUF_BYTE_ADDRESS (buf, best_below_byte), x - best_below);
2537 best_below = x;
2538 #endif /* 0 */
2539
2540 /* If this position is quite far from the nearest known position,
2541 cache the correspondence.
2542
2543 NB FSF does this: "... by creating a marker here.
2544 It will last until the next GC."
2545 */
2546
2547 if (record)
2548 {
2549 /* If we have run out of positions to record, discard some of the
2550 old ones. I used to use a circular buffer, which avoids the
2551 need to block-move any memory. But it makes it more difficult
2552 to keep track of which positions haven't been used -- commonly
2553 we haven't yet filled out anywhere near the whole set of
2554 positions and don't want to check them all. We should not be
2555 recording that often, and block-moving is extremely fast in
2556 any case. --ben */
2557 if (t->next_cache_pos == NUM_CACHED_POSITIONS)
2558 {
2559 memmove (t->mule_charbpos_cache,
2560 t->mule_charbpos_cache + NUM_MOVED_POSITIONS,
2561 sizeof (Charbpos) *
2562 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS));
2563 memmove (t->mule_bytebpos_cache,
2564 t->mule_bytebpos_cache + NUM_MOVED_POSITIONS,
2565 sizeof (Bytebpos) *
2566 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS));
2567 t->next_cache_pos -= NUM_MOVED_POSITIONS;
2568 }
2569 t->mule_charbpos_cache[t->next_cache_pos] = best_below;
2570 t->mule_bytebpos_cache[t->next_cache_pos] = best_below_byte;
2571 t->next_cache_pos++;
2572 }
2573
2574 t->cached_charpos = best_below;
2575 t->cached_bytepos = best_below_byte;
2576
2577 retval = best_below_byte;
2578 text_checking_assert (best_below_byte >= best_below);
2579 goto done;
2580 }
2581 else
2582 {
2583 int record = best_above - x > FAR_AWAY_DISTANCE;
2584
2585 #ifdef OLD_LOOP
2586 while (best_above != x)
2587 {
2588 best_above--;
2589 DEC_BYTEBPOS (buf, best_above_byte);
2590 }
2591 #else
2592 text_checking_assert (BUF_FORMAT (buf) == FORMAT_DEFAULT);
2593 /* The gap should not occur between best_above and x, or we will be
2594 screwed in using charcount_to_bytecount_down(). It should not be
2595 exactly at x either, because we already should have caught
2596 that. */
2597 text_checking_assert
2598 (BUF_FLOOR_OF_IGNORE_ACCESSIBLE (buf, best_above) < x);
2599
2600 /* Using charcount_to_bytecount_down() is potentially a lot faster
2601 than a simple loop using DEC_BYTEBPOS(); see above. */
2602 best_above_byte -=
2603 charcount_to_bytecount_down
2604 /* BYTE_BUF_BYTE_ADDRESS will return a value on the high side of the
2605 gap if we are at the gap, which is the wrong side. So do the
2606 following trick instead. */
2607 (BYTE_BUF_BYTE_ADDRESS_BEFORE (buf, best_above_byte) + 1,
2608 best_above - x);
2609 best_above = x;
2610 #endif /* SLEDGEHAMMER_CHECK_TEXT */
2611
2612
2613 /* If this position is quite far from the nearest known position,
2614 cache the correspondence.
2615
2616 NB FSF does this: "... by creating a marker here.
2617 It will last until the next GC."
2618 */
2619 if (record)
2620 {
2621 if (t->next_cache_pos == NUM_CACHED_POSITIONS)
2622 {
2623 memmove (t->mule_charbpos_cache,
2624 t->mule_charbpos_cache + NUM_MOVED_POSITIONS,
2625 sizeof (Charbpos) *
2626 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS));
2627 memmove (t->mule_bytebpos_cache,
2628 t->mule_bytebpos_cache + NUM_MOVED_POSITIONS,
2629 sizeof (Bytebpos) *
2630 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS));
2631 t->next_cache_pos -= NUM_MOVED_POSITIONS;
2632 }
2633 t->mule_charbpos_cache[t->next_cache_pos] = best_above;
2634 t->mule_bytebpos_cache[t->next_cache_pos] = best_above_byte;
2635 t->next_cache_pos++;
2636 }
2637
2638 t->cached_charpos = best_above;
2639 t->cached_bytepos = best_above_byte;
2640
2641 retval = best_above_byte;
2642 text_checking_assert (best_above_byte >= best_above);
2643 goto done;
2644 }
2645
2646 #ifdef OLD_BYTE_CHAR
1673 2647
1674 bufmin = buf->text->mule_bufmin; 2648 bufmin = buf->text->mule_bufmin;
1675 bufmax = buf->text->mule_bufmax; 2649 bufmax = buf->text->mule_bufmax;
1676 bytmin = buf->text->mule_bytmin; 2650 bytmin = buf->text->mule_bytmin;
1677 bytmax = buf->text->mule_bytmax; 2651 bytmax = buf->text->mule_bytmax;
1787 2761
1788 add_to_cache = 1; 2762 add_to_cache = 1;
1789 /* I considered keeping the positions ordered. This would speed 2763 /* I considered keeping the positions ordered. This would speed
1790 up this loop, but updating the cache would take longer, so 2764 up this loop, but updating the cache would take longer, so
1791 it doesn't seem like it would really matter. */ 2765 it doesn't seem like it would really matter. */
1792 for (i = 0; i < 16; i++) 2766 for (i = 0; i < NUM_CACHED_POSITIONS; i++)
1793 { 2767 {
1794 int diff = buf->text->mule_charbpos_cache[i] - x; 2768 int diff = buf->text->mule_charbpos_cache[i] - x;
1795 2769
1796 if (diff < 0) 2770 if (diff < 0)
1797 diff = -diff; 2771 diff = -diff;
1919 replace_loc = not_very_random_number & 15; 2893 replace_loc = not_very_random_number & 15;
1920 buf->text->mule_charbpos_cache[replace_loc] = x; 2894 buf->text->mule_charbpos_cache[replace_loc] = x;
1921 buf->text->mule_bytebpos_cache[replace_loc] = retval; 2895 buf->text->mule_bytebpos_cache[replace_loc] = retval;
1922 } 2896 }
1923 2897
2898 #endif /* OLD_BYTE_CHAR */
2899
2900 done:
1924 PROFILE_RECORD_EXITING_SECTION (QSin_char_byte_conversion); 2901 PROFILE_RECORD_EXITING_SECTION (QSin_char_byte_conversion);
1925 2902
1926 return retval; 2903 return retval;
1927 } 2904 }
2905
2906 #undef CONSIDER
2907
2908 /* bytepos_to_charpos returns the char position corresponding to BYTEPOS. */
2909
/* This macro is a subroutine of bytebpos_to_charbpos_func.  It offers
   one known (BYTEPOS, CHARPOS) correspondence to the search for the
   byte position `x'; here BYTEPOS is actually the byte position.  It
   relies on these names from the expansion site: x, retval, best_above,
   best_above_byte, best_below, best_below_byte, and the label `done'.

   - If BYTEPOS is exactly x, the answer is CHARPOS: jump to `done'.
   - Otherwise, if the pair is a tighter upper or lower bound than the
     one currently held, it replaces that bound.
   - Whenever a bound was replaced and the char distance between the two
     bounds equals their byte distance, the result is interpolated
     directly and we jump to `done'.

   CHARPOS is expanded only in the branches that store it. */

#define CONSIDER(BYTEPOS, CHARPOS)					\
do									\
  {									\
    Bytebpos cand_bytepos = (BYTEPOS);					\
    int tightened;							\
									\
    if (cand_bytepos == x)						\
      {									\
	retval = (CHARPOS);						\
	goto done;							\
      }									\
									\
    tightened = 1;							\
    if (cand_bytepos > x && cand_bytepos < best_above_byte)		\
      {									\
	best_above = (CHARPOS);						\
	best_above_byte = cand_bytepos;					\
      }									\
    else if (cand_bytepos < x && cand_bytepos > best_below_byte)	\
      {									\
	best_below = (CHARPOS);						\
	best_below_byte = cand_bytepos;					\
      }									\
    else								\
      tightened = 0;							\
									\
    if (tightened							\
	&& (best_above - best_below					\
	    == best_above_byte - best_below_byte))			\
      {									\
	retval = best_below + (x - best_below_byte);			\
	goto done;							\
      }									\
  }									\
while (0)
1928 2950
1929 /* The logic in this function is almost identical to the logic in 2951 /* The logic in this function is almost identical to the logic in
1930 the previous function. */ 2952 the previous function. */
1931 2953
1932 Charbpos 2954 Charbpos
1933 bytebpos_to_charbpos_func (struct buffer *buf, Bytebpos x) 2955 bytebpos_to_charbpos_func (struct buffer *buf, Bytebpos x)
1934 { 2956 {
2957 #ifdef OLD_BYTE_CHAR
1935 Charbpos bufmin; 2958 Charbpos bufmin;
1936 Charbpos bufmax; 2959 Charbpos bufmax;
1937 Bytebpos bytmin; 2960 Bytebpos bytmin;
1938 Bytebpos bytmax; 2961 Bytebpos bytmax;
1939 int size; 2962 int size;
1940 int forward_p; 2963 int forward_p;
1941 Charbpos retval;
1942 int diff_so_far; 2964 int diff_so_far;
1943 int add_to_cache = 0; 2965 int add_to_cache = 0;
2966 #endif /* OLD_BYTE_CHAR */
2967
2968 Charbpos best_above, best_above_byte;
2969 Bytebpos best_below, best_below_byte;
2970 int i;
2971 struct buffer_text *t;
2972 Charbpos retval;
2973
1944 PROFILE_DECLARE (); 2974 PROFILE_DECLARE ();
1945 2975
1946 /* Check for some cached positions, for speed. */
1947 if (x == BYTE_BUF_PT (buf))
1948 return BUF_PT (buf);
1949 if (x == BYTE_BUF_ZV (buf))
1950 return BUF_ZV (buf);
1951 if (x == BYTE_BUF_BEGV (buf))
1952 return BUF_BEGV (buf);
1953
1954 PROFILE_RECORD_ENTERING_SECTION (QSin_char_byte_conversion); 2976 PROFILE_RECORD_ENTERING_SECTION (QSin_char_byte_conversion);
2977
2978 best_above = BUF_Z (buf);
2979 best_above_byte = BYTE_BUF_Z (buf);
2980
2981 /* In this case, we simply have all one-byte characters. But this should
2982 have been intercepted before, in bytebpos_to_charbpos(). */
2983 text_checking_assert (best_above != best_above_byte);
2984
2985 best_below = BUF_BEG (buf);
2986 best_below_byte = BYTE_BUF_BEG (buf);
2987
2988 CONSIDER (BYTE_BUF_PT (buf), BUF_PT (buf));
2989 CONSIDER (BYTE_BUF_GPT (buf), BUF_GPT (buf));
2990 CONSIDER (BYTE_BUF_BEGV (buf), BUF_BEGV (buf));
2991 CONSIDER (BYTE_BUF_ZV (buf), BUF_ZV (buf));
2992
2993 t = buf->text;
2994 CONSIDER (t->cached_bytepos, t->cached_charpos);
2995
2996 /* Check the most recently entered positions first */
2997
2998 for (i = t->next_cache_pos - 1; i >= 0; i--)
2999 {
3000 CONSIDER (t->mule_bytebpos_cache[i], t->mule_charbpos_cache[i]);
3001
3002 /* If we are down to a range of 50 chars,
3003 don't bother checking any other markers;
3004 scan the intervening chars directly now. */
3005 if (best_above - best_below < 50)
3006 break;
3007 }
3008
3009 /* We get here if we did not exactly hit one of the known places.
3010 We have one known above and one known below.
3011 Scan, counting characters, from whichever one is closer. */
3012
3013 if (x - best_below_byte < best_above_byte - x)
3014 {
3015 int record = x - best_below_byte > 5000;
3016
3017 #ifdef OLD_LOOP /* old code */
3018 while (best_below_byte < x)
3019 {
3020 best_below++;
3021 INC_BYTEBPOS (buf, best_below_byte);
3022 }
3023 #else
3024 text_checking_assert (BUF_FORMAT (buf) == FORMAT_DEFAULT);
3025 /* The gap should not occur between best_below_byte and x, or we will
3026 be screwed in using bytecount_to_charcount(). It should not be exactly
3027 at x either, because we already should have caught that. */
3028 text_checking_assert
3029 (BYTE_BUF_CEILING_OF_IGNORE_ACCESSIBLE (buf, best_below_byte) > x);
3030
3031 /* Using bytecount_to_charcount() is potentially a lot faster than
3032 a simple loop above using INC_BYTEBPOS(); see above.
3033 */
3034 best_below +=
3035 bytecount_to_charcount
3036 (BYTE_BUF_BYTE_ADDRESS (buf, best_below_byte), x - best_below_byte);
3037 best_below_byte = x;
3038 #endif
3039
3040 /* If this position is quite far from the nearest known position,
3041 cache the correspondence.
3042
3043 NB FSF does this: "... by creating a marker here.
3044 It will last until the next GC."
3045 */
3046
3047 if (record)
3048 {
3049 if (t->next_cache_pos == NUM_CACHED_POSITIONS)
3050 {
3051 memmove (t->mule_charbpos_cache,
3052 t->mule_charbpos_cache + NUM_MOVED_POSITIONS,
3053 sizeof (Charbpos) *
3054 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS));
3055 memmove (t->mule_bytebpos_cache,
3056 t->mule_bytebpos_cache + NUM_MOVED_POSITIONS,
3057 sizeof (Bytebpos) *
3058 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS));
3059 t->next_cache_pos -= NUM_MOVED_POSITIONS;
3060 }
3061 t->mule_charbpos_cache[t->next_cache_pos] = best_below;
3062 t->mule_bytebpos_cache[t->next_cache_pos] = best_below_byte;
3063 t->next_cache_pos++;
3064 }
3065
3066
3067 t->cached_charpos = best_below;
3068 t->cached_bytepos = best_below_byte;
3069
3070 retval = best_below;
3071 text_checking_assert (best_below_byte >= best_below);
3072 goto done;
3073 }
3074 else
3075 {
3076 int record = best_above_byte - x > 5000;
3077
3078 #ifdef OLD_LOOP /* old code */
3079 while (best_above_byte > x)
3080 {
3081 best_above--;
3082 DEC_BYTEBPOS (buf, best_above_byte);
3083 }
3084 #else
3085 text_checking_assert (BUF_FORMAT (buf) == FORMAT_DEFAULT);
3086 /* The gap should not occur between best_above and x, or we will be
3087 screwed in using bytecount_to_charcount_down(). It should not be
3088 exactly at x either, because we already should have caught
3089 that. */
3090 text_checking_assert
3091 (BYTE_BUF_FLOOR_OF_IGNORE_ACCESSIBLE (buf, best_above_byte) < x);
3092
3093 /* Using bytecount_to_charcount_down() is potentially a lot faster
3094 than a simple loop using DEC_BYTEBPOS(); see above. */
3095 best_above -=
3096 bytecount_to_charcount_down
3097 /* BYTE_BUF_BYTE_ADDRESS will return a value on the high side of the
3098 gap if we are at the gap, which is the wrong side. So do the
3099 following trick instead. */
3100 (BYTE_BUF_BYTE_ADDRESS_BEFORE (buf, best_above_byte) + 1,
3101 best_above_byte - x);
3102 best_above_byte = x;
3103 #endif
3104
3105
3106 /* If this position is quite far from the nearest known position,
3107 cache the correspondence.
3108
3109 NB FSF does this: "... by creating a marker here.
3110 It will last until the next GC."
3111 */
3112 if (record)
3113 {
3114 if (t->next_cache_pos == NUM_CACHED_POSITIONS)
3115 {
3116 memmove (t->mule_charbpos_cache,
3117 t->mule_charbpos_cache + NUM_MOVED_POSITIONS,
3118 sizeof (Charbpos) *
3119 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS));
3120 memmove (t->mule_bytebpos_cache,
3121 t->mule_bytebpos_cache + NUM_MOVED_POSITIONS,
3122 sizeof (Bytebpos) *
3123 (NUM_CACHED_POSITIONS - NUM_MOVED_POSITIONS));
3124 t->next_cache_pos -= NUM_MOVED_POSITIONS;
3125 }
3126 t->mule_charbpos_cache[t->next_cache_pos] = best_above;
3127 t->mule_bytebpos_cache[t->next_cache_pos] = best_above_byte;
3128 t->next_cache_pos++;
3129 }
3130
3131 t->cached_charpos = best_above;
3132 t->cached_bytepos = best_above_byte;
3133
3134 retval = best_above;
3135 text_checking_assert (best_above_byte >= best_above);
3136 goto done;
3137 }
3138
3139 #ifdef OLD_BYTE_CHAR
1955 3140
1956 bufmin = buf->text->mule_bufmin; 3141 bufmin = buf->text->mule_bufmin;
1957 bufmax = buf->text->mule_bufmax; 3142 bufmax = buf->text->mule_bufmax;
1958 bytmin = buf->text->mule_bytmin; 3143 bytmin = buf->text->mule_bytmin;
1959 bytmax = buf->text->mule_bytmax; 3144 bytmax = buf->text->mule_bytmax;
2069 3254
2070 add_to_cache = 1; 3255 add_to_cache = 1;
2071 /* I considered keeping the positions ordered. This would speed 3256 /* I considered keeping the positions ordered. This would speed
2072 up this loop, but updating the cache would take longer, so 3257 up this loop, but updating the cache would take longer, so
2073 it doesn't seem like it would really matter. */ 3258 it doesn't seem like it would really matter. */
2074 for (i = 0; i < 16; i++) 3259 for (i = 0; i < NUM_CACHED_POSITIONS; i++)
2075 { 3260 {
2076 int diff = buf->text->mule_bytebpos_cache[i] - x; 3261 int diff = buf->text->mule_bytebpos_cache[i] - x;
2077 3262
2078 if (diff < 0) 3263 if (diff < 0)
2079 diff = -diff; 3264 diff = -diff;
2200 not_very_random_number += 621; 3385 not_very_random_number += 621;
2201 replace_loc = not_very_random_number & 15; 3386 replace_loc = not_very_random_number & 15;
2202 buf->text->mule_charbpos_cache[replace_loc] = retval; 3387 buf->text->mule_charbpos_cache[replace_loc] = retval;
2203 buf->text->mule_bytebpos_cache[replace_loc] = x; 3388 buf->text->mule_bytebpos_cache[replace_loc] = x;
2204 } 3389 }
2205 3390 #endif /* OLD_BYTE_CHAR */
3391
3392 done:
2206 PROFILE_RECORD_EXITING_SECTION (QSin_char_byte_conversion); 3393 PROFILE_RECORD_EXITING_SECTION (QSin_char_byte_conversion);
2207 3394
2208 return retval; 3395 return retval;
2209 } 3396 }
2210 3397
2214 void 3401 void
2215 buffer_mule_signal_inserted_region (struct buffer *buf, Charbpos start, 3402 buffer_mule_signal_inserted_region (struct buffer *buf, Charbpos start,
2216 Bytecount bytelength, 3403 Bytecount bytelength,
2217 Charcount charlength) 3404 Charcount charlength)
2218 { 3405 {
3406 #ifdef OLD_BYTE_CHAR
2219 int size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p; 3407 int size = (1 << buf->text->mule_shifter) + !!buf->text->mule_three_p;
3408 #endif /* OLD_BYTE_CHAR */
2220 int i; 3409 int i;
2221 3410
2222 /* Adjust the cache of known positions. */ 3411 /* Adjust the cache of known positions. */
2223 for (i = 0; i < 16; i++) 3412 for (i = 0; i < buf->text->next_cache_pos; i++)
2224 { 3413 {
2225 3414
2226 if (buf->text->mule_charbpos_cache[i] > start) 3415 if (buf->text->mule_charbpos_cache[i] > start)
2227 { 3416 {
2228 buf->text->mule_charbpos_cache[i] += charlength; 3417 buf->text->mule_charbpos_cache[i] += charlength;
2229 buf->text->mule_bytebpos_cache[i] += bytelength; 3418 buf->text->mule_bytebpos_cache[i] += bytelength;
2230 } 3419 }
2231 } 3420 }
2232 3421
3422 /* Adjust the special cached position. */
3423
3424 if (buf->text->cached_charpos > start)
3425 {
3426 buf->text->cached_charpos += charlength;
3427 buf->text->cached_bytepos += bytelength;
3428 }
3429
3430 #ifdef OLD_BYTE_CHAR
2233 if (start >= buf->text->mule_bufmax) 3431 if (start >= buf->text->mule_bufmax)
2234 return; 3432 return;
2235 3433
2236 /* The insertion is either before the known region, in which case 3434 /* The insertion is either before the known region, in which case
2237 it shoves it forward; or within the known region, in which case 3435 it shoves it forward; or within the known region, in which case
2302 buf->text->mule_bufmin = end; 3500 buf->text->mule_bufmin = end;
2303 buf->text->mule_bytmin = byteend; 3501 buf->text->mule_bytmin = byteend;
2304 } 3502 }
2305 } 3503 }
2306 } 3504 }
3505 #endif /* OLD_BYTE_CHAR */
2307 } 3506 }
2308 3507
2309 /* Text from START to END (equivalent in Bytebpos's: from BYTE_START to 3508 /* Text from START to END (equivalent in Bytebpos's: from BYTE_START to
2310 BYTE_END) was deleted. */ 3509 BYTE_END) was deleted. */
2311 3510
2315 Bytebpos byte_end) 3514 Bytebpos byte_end)
2316 { 3515 {
2317 int i; 3516 int i;
2318 3517
2319 /* Adjust the cache of known positions. */ 3518 /* Adjust the cache of known positions. */
2320 for (i = 0; i < 16; i++) 3519 for (i = 0; i < buf->text->next_cache_pos; i++)
2321 { 3520 {
2322 /* After the end; gets shoved backward */ 3521 /* After the end; gets shoved backward */
2323 if (buf->text->mule_charbpos_cache[i] > end) 3522 if (buf->text->mule_charbpos_cache[i] > end)
2324 { 3523 {
2325 buf->text->mule_charbpos_cache[i] -= end - start; 3524 buf->text->mule_charbpos_cache[i] -= end - start;
2331 buf->text->mule_charbpos_cache[i] = start; 3530 buf->text->mule_charbpos_cache[i] = start;
2332 buf->text->mule_bytebpos_cache[i] = byte_start; 3531 buf->text->mule_bytebpos_cache[i] = byte_start;
2333 } 3532 }
2334 } 3533 }
2335 3534
3535 /* Adjust the special cached position. */
3536
3537 /* After the end; gets shoved backward */
3538 if (buf->text->cached_charpos > end)
3539 {
3540 buf->text->cached_charpos -= end - start;
3541 buf->text->cached_bytepos -= byte_end - byte_start;
3542 }
3543 /* In the range; moves to start of range */
3544 else if (buf->text->cached_charpos > start)
3545 {
3546 buf->text->cached_charpos = start;
3547 buf->text->cached_bytepos = byte_start;
3548 }
3549
3550 #ifdef OLD_BYTE_CHAR
2336 /* We don't care about any text after the end of the known region. */ 3551 /* We don't care about any text after the end of the known region. */
2337 3552
2338 end = min (end, buf->text->mule_bufmax); 3553 end = min (end, buf->text->mule_bufmax);
2339 byte_end = min (byte_end, buf->text->mule_bytmax); 3554 byte_end = min (byte_end, buf->text->mule_bytmax);
2340 if (start >= end) 3555 if (start >= end)
2353 if (start < end) 3568 if (start < end)
2354 { 3569 {
2355 buf->text->mule_bufmin -= end - start; 3570 buf->text->mule_bufmin -= end - start;
2356 buf->text->mule_bytmin -= byte_end - byte_start; 3571 buf->text->mule_bytmin -= byte_end - byte_start;
2357 } 3572 }
3573 #endif /* OLD_BYTE_CHAR */
2358 } 3574 }
2359 3575
2360 #endif /* MULE */ 3576 #endif /* MULE */
2361 3577
2362 3578
2831 places. */ 4047 places. */
2832 int count; 4048 int count;
2833 Extbyte_dynarr *conversion_out_dynarr; 4049 Extbyte_dynarr *conversion_out_dynarr;
2834 PROFILE_DECLARE (); 4050 PROFILE_DECLARE ();
2835 4051
4052 assert (!inhibit_non_essential_conversion_operations);
2836 PROFILE_RECORD_ENTERING_SECTION (QSin_internal_external_conversion); 4053 PROFILE_RECORD_ENTERING_SECTION (QSin_internal_external_conversion);
2837 4054
2838 count = begin_gc_forbidden (); 4055 count = begin_gc_forbidden ();
2839 4056
2840 type_checking_assert 4057 type_checking_assert
3036 places. */ 4253 places. */
3037 int count; 4254 int count;
3038 Ibyte_dynarr *conversion_in_dynarr; 4255 Ibyte_dynarr *conversion_in_dynarr;
3039 PROFILE_DECLARE (); 4256 PROFILE_DECLARE ();
3040 4257
4258 assert (!inhibit_non_essential_conversion_operations);
3041 PROFILE_RECORD_ENTERING_SECTION (QSin_internal_external_conversion); 4259 PROFILE_RECORD_ENTERING_SECTION (QSin_internal_external_conversion);
3042 4260
3043 count = begin_gc_forbidden (); 4261 count = begin_gc_forbidden ();
3044 4262
3045 type_checking_assert 4263 type_checking_assert
3221 4439
3222 PROFILE_RECORD_EXITING_SECTION (QSin_internal_external_conversion); 4440 PROFILE_RECORD_EXITING_SECTION (QSin_internal_external_conversion);
3223 } 4441 }
3224 4442
3225 /* ----------------------------------------------------------------------- */ 4443 /* ----------------------------------------------------------------------- */
3226 /* New-style DFC converters (data is returned rather than stored into var) */ 4444 /* Alloca-conversion helpers */
3227 /* ----------------------------------------------------------------------- */ 4445 /* ----------------------------------------------------------------------- */
3228
3229 /* We handle here the cases where SRC is a Lisp_Object, internal data
3230 (sized or unsized), or external data (sized or unsized), and return type
3231 is unsized alloca() or malloc() data. If the return type is a
3232 Lisp_Object, use build_ext_string() for unsized external data,
3233 make_ext_string() for sized external data. If the return type needs to
3234 be sized data, use the *_TO_SIZED_*() macros, and for other more
3235 complicated cases, use the original TO_*_FORMAT() macros. */
3236
3237 static void
3238 new_dfc_convert_now_damn_it (const void *src, Bytecount src_size,
3239 enum new_dfc_src_type type,
3240 void **dst, Bytecount *dst_size,
3241 Lisp_Object codesys)
3242 {
3243 /* #### In the case of alloca(), it would be a bit more efficient, for
3244 small strings, to use static Dynarr's like are used internally in
3245 TO_*_FORMAT(), or some other way of avoiding malloc() followed by
3246 free(). I doubt it really matters, though. */
3247
3248 switch (type)
3249 {
3250 case DFC_EXTERNAL:
3251 TO_INTERNAL_FORMAT (C_STRING, src,
3252 MALLOC, (*dst, *dst_size), codesys);
3253 break;
3254
3255 case DFC_SIZED_EXTERNAL:
3256 TO_INTERNAL_FORMAT (DATA, (src, src_size),
3257 MALLOC, (*dst, *dst_size), codesys);
3258 break;
3259
3260 case DFC_INTERNAL:
3261 TO_EXTERNAL_FORMAT (C_STRING, src,
3262 MALLOC, (*dst, *dst_size), codesys);
3263 break;
3264
3265 case DFC_SIZED_INTERNAL:
3266 TO_EXTERNAL_FORMAT (DATA, (src, src_size),
3267 MALLOC, (*dst, *dst_size), codesys);
3268 break;
3269
3270 case DFC_LISP_STRING:
3271 TO_EXTERNAL_FORMAT (LISP_STRING, VOID_TO_LISP (src),
3272 MALLOC, (*dst, *dst_size), codesys);
3273 break;
3274
3275 default:
3276 abort ();
3277 }
3278 }
3279
3280 void *
3281 new_dfc_convert_malloc (const void *src, Bytecount src_size,
3282 enum new_dfc_src_type type, Lisp_Object codesys)
3283 {
3284 void *dst;
3285 Bytecount dst_size;
3286
3287 new_dfc_convert_now_damn_it (src, src_size, type, &dst, &dst_size, codesys);
3288 return dst;
3289 }
3290 4446
3291 /* For alloca(), things are trickier because the calling function needs to 4447 /* For alloca(), things are trickier because the calling function needs to
3292 allocate. This means that the caller needs to do the following: 4448 allocate. This means that the caller needs to do the following:
3293 4449
3294 (a) invoke us to do the conversion, remember the data and return the size. 4450 (a) invoke us to do the conversion, remember the data and return the size.
3305 expression twice in two converter calls in the same subexpression, we 4461 expression twice in two converter calls in the same subexpression, we
3306 will lose, but at least we can check for this and abort(). We could 4462 will lose, but at least we can check for this and abort(). We could
3307 conceivably try to index on other parameters as well, but there is not 4463 conceivably try to index on other parameters as well, but there is not
3308 really any point. */ 4464 really any point. */
3309 4465
3310 typedef struct 4466 alloca_convert_vals_dynarr *active_alloca_convert;
3311 { 4467
3312 const char *srctext; 4468 int
3313 void *dst; 4469 find_pos_of_existing_active_alloca_convert (const char *srctext)
3314 Bytecount dst_size; 4470 {
3315 } dfc_e2c_vals; 4471 alloca_convert_vals *vals = NULL;
3316
3317 typedef struct
3318 {
3319 Dynarr_declare (dfc_e2c_vals);
3320 } dfc_e2c_vals_dynarr;
3321
3322 static dfc_e2c_vals_dynarr *active_dfc_e2c;
3323
3324 static int
3325 find_pos_of_existing_active_dfc_e2c (const char *srctext)
3326 {
3327 dfc_e2c_vals *vals = NULL;
3328 int i; 4472 int i;
3329 4473
3330 for (i = 0; i < Dynarr_length (active_dfc_e2c); i++) 4474 if (!active_alloca_convert)
3331 { 4475 active_alloca_convert = Dynarr_new (alloca_convert_vals);
3332 vals = Dynarr_atp (active_dfc_e2c, i); 4476
4477 for (i = 0; i < Dynarr_length (active_alloca_convert); i++)
4478 {
4479 vals = Dynarr_atp (active_alloca_convert, i);
3333 if (vals->srctext == srctext) 4480 if (vals->srctext == srctext)
3334 return i; 4481 return i;
3335 } 4482 }
3336 4483
3337 return -1; 4484 return -1;
3338 } 4485 }
3339 4486
3340 void * 4487 /* ----------------------------------------------------------------------- */
3341 new_dfc_convert_alloca (const char *srctext, void *alloca_data) 4488 /* New-style DFC converters (data is returned rather than stored into var) */
3342 { 4489 /* ----------------------------------------------------------------------- */
3343 dfc_e2c_vals *vals; 4490
3344 int i = find_pos_of_existing_active_dfc_e2c (srctext); 4491 /* We handle here the cases where SRC is a Lisp_Object, internal data
3345 4492 (sized or unsized), or external data (sized or unsized), and return type
3346 assert (i >= 0); 4493 is unsized alloca() or malloc() data. If the return type is a
3347 vals = Dynarr_atp (active_dfc_e2c, i); 4494 Lisp_Object, use build_ext_string() for unsized external data,
3348 assert (alloca_data); 4495 make_ext_string() for sized external data. If the return type needs to
3349 memcpy (alloca_data, vals->dst, vals->dst_size + 2); 4496 be sized data, use the *_TO_SIZED_*() macros, and for other more
3350 xfree (vals->dst, void *); 4497 complicated cases, use the original TO_*_FORMAT() macros. */
3351 Dynarr_delete (active_dfc_e2c, i); 4498
3352 return alloca_data; 4499 static void
4500 new_dfc_convert_now_damn_it (const void *src, Bytecount src_size,
4501 enum new_dfc_src_type type,
4502 void **dst, Bytecount *dst_size,
4503 Lisp_Object codesys)
4504 {
4505 /* #### In the case of alloca(), it would be a bit more efficient, for
4506 small strings, to use static Dynarr's like are used internally in
4507 TO_*_FORMAT(), or some other way of avoiding malloc() followed by
4508 free(). I doubt it really matters, though. */
4509
4510 switch (type)
4511 {
4512 case DFC_EXTERNAL:
4513 TO_INTERNAL_FORMAT (C_STRING, src,
4514 MALLOC, (*dst, *dst_size), codesys);
4515 break;
4516
4517 case DFC_SIZED_EXTERNAL:
4518 TO_INTERNAL_FORMAT (DATA, (src, src_size),
4519 MALLOC, (*dst, *dst_size), codesys);
4520 break;
4521
4522 case DFC_INTERNAL:
4523 TO_EXTERNAL_FORMAT (C_STRING, src,
4524 MALLOC, (*dst, *dst_size), codesys);
4525 break;
4526
4527 case DFC_SIZED_INTERNAL:
4528 TO_EXTERNAL_FORMAT (DATA, (src, src_size),
4529 MALLOC, (*dst, *dst_size), codesys);
4530 break;
4531
4532 case DFC_LISP_STRING:
4533 TO_EXTERNAL_FORMAT (LISP_STRING, VOID_TO_LISP (src),
4534 MALLOC, (*dst, *dst_size), codesys);
4535 break;
4536
4537 default:
4538 abort ();
4539 }
4540
4541 /* The size is always + 2 because we have double zero-termination at the
4542 end of all data (for Unicode-correctness). */
4543 *dst_size += 2;
3353 } 4544 }
3354 4545
3355 Bytecount 4546 Bytecount
3356 new_dfc_convert_size (const char *srctext, const void *src, 4547 new_dfc_convert_size (const char *srctext, const void *src,
3357 Bytecount src_size, enum new_dfc_src_type type, 4548 Bytecount src_size, enum new_dfc_src_type type,
3358 Lisp_Object codesys) 4549 Lisp_Object codesys)
3359 { 4550 {
3360 dfc_e2c_vals vals; 4551 alloca_convert_vals vals;
3361 4552
3362 assert (find_pos_of_existing_active_dfc_e2c (srctext) < 0); 4553 assert (find_pos_of_existing_active_alloca_convert (srctext) < 0);
3363 4554
3364 vals.srctext = srctext; 4555 vals.srctext = srctext;
3365 4556
3366 new_dfc_convert_now_damn_it (src, src_size, type, &vals.dst, &vals.dst_size, 4557 new_dfc_convert_now_damn_it (src, src_size, type, &vals.dst, &vals.dst_size,
3367 codesys); 4558 codesys);
3368 4559
3369 Dynarr_add (active_dfc_e2c, vals); 4560 Dynarr_add (active_alloca_convert, vals);
3370 /* The size is always + 2 because we have double zero-termination at the 4561 return vals.dst_size;
3371 end of all data (for Unicode-correctness). */ 4562 }
3372 return vals.dst_size + 2; 4563
4564 void *
4565 new_dfc_convert_copy_data (const char *srctext, void *alloca_data)
4566 {
4567 alloca_convert_vals *vals;
4568 int i = find_pos_of_existing_active_alloca_convert (srctext);
4569
4570 assert (i >= 0);
4571 vals = Dynarr_atp (active_alloca_convert, i);
4572 assert (alloca_data);
4573 memcpy (alloca_data, vals->dst, vals->dst_size);
4574 xfree (vals->dst, void *);
4575 Dynarr_delete (active_alloca_convert, i);
4576 return alloca_data;
4577 }
4578
4579 void *
4580 new_dfc_convert_malloc (const void *src, Bytecount src_size,
4581 enum new_dfc_src_type type, Lisp_Object codesys)
4582 {
4583 void *dst;
4584 Bytecount dst_size;
4585
4586 new_dfc_convert_now_damn_it (src, src_size, type, &dst, &dst_size, codesys);
4587 return dst;
3373 } 4588 }
3374 4589
3375 4590
3376 /************************************************************************/ 4591 /************************************************************************/
3377 /* Basic Ichar functions */ 4592 /* Basic Ichar functions */
3891 5106
3892 conversion_in_dynarr_list = Dynarr_new2 (Ibyte_dynarr_dynarr, 5107 conversion_in_dynarr_list = Dynarr_new2 (Ibyte_dynarr_dynarr,
3893 Ibyte_dynarr *); 5108 Ibyte_dynarr *);
3894 conversion_out_dynarr_list = Dynarr_new2 (Extbyte_dynarr_dynarr, 5109 conversion_out_dynarr_list = Dynarr_new2 (Extbyte_dynarr_dynarr,
3895 Extbyte_dynarr *); 5110 Extbyte_dynarr *);
3896 active_dfc_e2c = Dynarr_new (dfc_e2c_vals);
3897 5111
3898 for (i = 0; i <= MAX_BYTEBPOS_GAP_SIZE_3; i++) 5112 for (i = 0; i <= MAX_BYTEBPOS_GAP_SIZE_3; i++)
3899 three_to_one_table[i] = i / 3; 5113 three_to_one_table[i] = i / 3;
3900 } 5114 }
3901 5115
3902 void 5116 void
3903 vars_of_text (void) 5117 vars_of_text (void)
3904 { 5118 {
3905 reinit_vars_of_text ();
3906
3907 QSin_char_byte_conversion = build_msg_string ("(in char-byte conversion)"); 5119 QSin_char_byte_conversion = build_msg_string ("(in char-byte conversion)");
3908 staticpro (&QSin_char_byte_conversion); 5120 staticpro (&QSin_char_byte_conversion);
3909 QSin_internal_external_conversion = 5121 QSin_internal_external_conversion =
3910 build_msg_string ("(in internal-external conversion)"); 5122 build_msg_string ("(in internal-external conversion)");
3911 staticpro (&QSin_internal_external_conversion); 5123 staticpro (&QSin_internal_external_conversion);