comparison src/file-coding.c @ 2367:ecf1ebac70d8

[xemacs-hg @ 2004-11-04 23:05:23 by ben] commit mega-patch configure.in: Turn off -Winline and -Wchar-subscripts. Use the right set of cflags when compiling modules. Rewrite ldap configuration to separate the inclusion of lber (needed in recent Cygwin) from the basic checks for the needed libraries. add a function for MAKE_JUNK_C; initially code was added to generate xemacs.def using this, but it will need to be rewritten. add an rm -f for junk.c to avoid weird Cygwin bug with cp -f onto an existing file. Sort list of auto-detected functions and eliminate unused checks for stpcpy, setlocale and getwd. Add autodetection of Cygwin scanf problems BETA: Rewrite section on configure to indicate what flags are important and what not. digest-doc.c, make-dump-id.c, profile.c, sorted-doc.c: Add proper decls for main(). make-msgfile.c: Document that this is old junk. Move proposal to text.c. make-msgfile.lex: Move proposal to text.c. make-mswin-unicode.pl: Convert error-generating code so that the entire message will be seen as a single unrecognized token. mule/mule-ccl.el: Update docs. lispref/mule.texi: Update CCL docs. ldap/eldap.c: Mule-ize. Use EXTERNAL_LIST_LOOP_2 instead of deleted EXTERNAL_LIST_LOOP. * XEmacs 21.5.18 "chestnut" is released. --------------------------------------------------------------- MULE-RELATED WORK: --------------------------------------------------------------- --------------------------- byte-char conversion --------------------------- buffer.c, buffer.h, insdel.c, text.c: Port FSF algorithm for byte-char conversion, replacing broken previous version. Track the char position of the gap. Add functions to do char-byte conversion downwards as well as upwards. Move comments about algorithm workings to internals manual. 
--------------------------- work on types --------------------------- alloc.c, console-x-impl.h, dump-data.c, dump-data.h, dumper.c, dialog-msw.c, dired-msw.c, doc.c, editfns.c, esd.c, event-gtk.h, event-msw.c, events.c, file-coding.c, file-coding.h, fns.c, glyphs-eimage.c, glyphs-gtk.c, glyphs-msw.c, glyphs-shared.c, glyphs-x.c, glyphs.c, glyphs.h, gui.c, hpplay.c, imgproc.c, intl-win32.c, lrecord.h, lstream.c, keymap.c, lisp.h, libsst.c, linuxplay.c, miscplay.c, miscplay.h, mule-coding.c, nas.c, nt.c, ntheap.c, ntplay.c, objects-msw.c, objects-tty.c, objects-x.c, print.c, process-nt.c, process.c, redisplay.h, select-common.h, select-gtk.c, select-x.c, sgiplay.c, sound.c, sound.h, sunplay.c, sysfile.h, sysdep.c, syswindows.h, text.c, unexnt.c, win32.c, xgccache.c: Further work on types. This creates a full set of types for all the basic semantics of `char' that I have so far identified, so that its semantics can always be identified for the purposes of proper Mule-safe code, and the raw use of `char' always avoided. (1) More type renaming, for consistency of naming. Char_ASCII -> Ascbyte UChar_ASCII -> UAscbyte Char_Binary -> CBinbyte UChar_Binary -> Binbyte SChar_Binary -> SBinbyte (2) Introduce Rawbyte, CRawbyte, Boolbyte, Chbyte, UChbyte, and Bitbyte and use them. (3) New types Itext, Wexttext and Textcount for separating out the concepts of bytes and textual units (different under UTF-16 and UTF-32, which are potential internal encodings). (4) qxestr*_c -> qxestr*_ascii. lisp.h: New; goes with other qxe() functions. #### Maybe goes in a different section. lisp.h: Group generic int-type defs together with EMACS_INT defs. lisp.h: * lisp.h (WEXTTEXT_IS_WIDE) New defns. lisp.h: New type to replace places where int occurs as a boolean. It's signed because occasionally people may want to use -1 as an error value, and because unsigned ints are viral -- see comments in the internals manual against using them. dynarr.c: int -> Bytecount. 
--------------------------- Mule-izing --------------------------- device-x.c: Partially Mule-ize. dumper.c, dumper.h: Mule-ize. Use Rawbyte. Use stderr_out not printf. Use wext_*(). sysdep.c, syswindows.h, text.c: New Wexttext API for manipulation of external text that may be Unicode (e.g. startup code under Windows). emacs.c: Mule-ize. Properly deal with argv in external encoding. Use wext_*() and Wexttext. Use Rawbyte. #if 0 some old junk on SCO that is unlikely to be correct. Rewrite allocation code in run-temacs. emacs.c, symsinit.h, win32.c: Rename win32 init function and call it even earlier, to initialize mswindows_9x_p even earlier, for use in startup code (XEUNICODE_P). process.c: Use _wenviron not environ under Windows, to get Unicode environment variables. event-Xt.c: Mule-ize drag-n-drop related stuff. dragdrop.c, dragdrop.h, frame-x.c: Mule-ize. text.h: Add some more stand-in defines for particular kinds of conversion; use in Mule-ization work in frame-x.c etc. --------------------------- Freshening --------------------------- intl-auto-encap-win32.c, intl-auto-encap-win32.h: Regenerate. --------------------------- Unicode-work --------------------------- intl-win32.c, syswindows.h: Factor out common options to MultiByteToWideChar and WideCharToMultiByte. Add convert_unicode_to_multibyte_malloc() and convert_unicode_to_multibyte_dynarr() and use. Add stuff for alloca() conversion of multibyte/unicode. alloc.c: Use dfc_external_data_len() in case of unicode coding system. alloc.c, mule-charset.c: Don't zero out and reinit charset Unicode tables. This fucks up dump-time loading. Anyway, either we load them at dump time or run time, never both. unicode.c: Dump the blank tables as well. 
--------------------------------------------------------------- DOCUMENTATION, MOSTLY MULE-RELATED: --------------------------------------------------------------- EmacsFrame.c, emodules.c, event-Xt.c, fileio.c, input-method-xlib.c, mule-wnnfns.c, redisplay-gtk.c, redisplay-tty.c, redisplay-x.c, regex.c, sysdep.c: Add comment about Mule work needed. text.h: Add more documentation describing why DFC routines were not written to return their value. Add some other DFC documentation. console-msw.c, console-msw.h: Add pointer to docs in win32.c. emacs.c: Add comments on sources of doc info. text.c, charset.h, unicode.c, intl-win32.c, intl-encap-win32.c, text.h, file-coding.c, mule-coding.c: Collect background comments and related to text matters and internationalization, and proposals for work to be done, in text.c or Internals manual, stuff related to specific textual API's in text.h, and stuff related to internal implementation of Unicode conversion in unicode.c. Put lots of pointers to the comments to make them easier to find. s/mingw32.h, s/win32-common.h, s/win32-native.h, s/windowsnt.h, win32.c: Add bunches of new documentation on the different kinds of builds and environments under Windows and how they work. Collect this info in win32.c. Add pointers to these docs in the relevant s/* files. emacs.c: Document places with long comments. Remove comment about exiting, move to internals manual, put in pointer. event-stream.c: Move docs about event queues and focus to internals manual, put in pointer. events.h: Move docs about event stream callbacks to internals manual, put in pointer. profile.c, redisplay.c, signal.c: Move documentation to the Internals manual. process-nt.c: Add pointer to comment in win32-native.el. lisp.h: Add comments about some comment conventions. lisp.h: Add comment about the second argument. device-msw.c, redisplay-msw.c: @@#### comments are out-of-date. 
--------------------------------------------------------------- PDUMP WORK (MOTIVATED BY UNICODE CHANGES) --------------------------------------------------------------- alloc.c, buffer.c, bytecode.c, console-impl.h, console.c, device.c, dumper.c, lrecord.h, elhash.c, emodules.h, events.c, extents.c, frame.c, glyphs.c, glyphs.h, mule-charset.c, mule-coding.c, objects.c, profile.c, rangetab.c, redisplay.c, specifier.c, specifier.h, window.c, lstream.c, file-coding.h, file-coding.c: PDUMP: Properly implement dump_add_root_block(), which never worked before, and is necessary for dumping Unicode tables. Pdump name changes for accuracy: XD_STRUCT_PTR -> XD_BLOCK_PTR. XD_STRUCT_ARRAY -> XD_BLOCK_ARRAY. XD_C_STRING -> XD_ASCII_STRING. *_structure_* -> *_block_*. lrecord.h: some comments added about dump_add_root_block() vs dump_add_root_block_ptr(). extents.c: remove incorrect comment about pdump problems with gap array. --------------------------------------------------------------- ALLOCATION --------------------------------------------------------------- abbrev.c, alloc.c, bytecode.c, casefiddle.c, device-msw.c, device-x.c, dired-msw.c, doc.c, doprnt.c, dragdrop.c, editfns.c, emodules.c, file-coding.c, fileio.c, filelock.c, fns.c, glyphs-eimage.c, glyphs-gtk.c, glyphs-msw.c, glyphs-x.c, gui-msw.c, gui-x.c, imgproc.c, intl-win32.c, lread.c, menubar-gtk.c, menubar.c, nt.c, objects-msw.c, objects-x.c, print.c, process-nt.c, process-unix.c, process.c, realpath.c, redisplay.c, search.c, select-common.c, symbols.c, sysdep.c, syswindows.h, text.c, text.h, ui-byhand.c: New macros {alloca,xnew}_{itext,{i,ext,raw,bin,asc}bytes} for more convenient allocation of these commonly requested items. Modify functions to use alloca_ibytes, alloca_array, alloca_extbytes, xnew_ibytes, etc. also XREALLOC_ARRAY, xnew. alloc.c: Rewrite the allocation functions to factor out repeated code. Add assertions for freeing dumped data. lisp.h: Moved down and consolidated with other allocation stuff. 
lisp.h, dynarr.c: New functions for allocation that's very efficient when mostly in LIFO order. lisp.h, text.c, text.h: Factor out some stuff for general use by alloca()-conversion funs. text.h, lisp.h: Fill out convenience routines for allocating various kinds of bytes and put them in lisp.h. Use them in place of xmalloc(), ALLOCA(). text.h: Fill out the convenience functions so the _MALLOC() kinds match the alloca() kinds. --------------------------------------------------------------- ERROR-CHECKING --------------------------------------------------------------- text.h: Create ASSERT_ASCTEXT_ASCII() and ASSERT_ASCTEXT_ASCII_LEN() from similar Eistring checkers and change the Eistring checkers to use them instead. --------------------------------------------------------------- MACROS IN LISP.H --------------------------------------------------------------- lisp.h: Redo GCPRO declarations. Create a "base" set of functions that can be used to generate any kind of gcpro sets -- regular, ngcpro, nngcpro, private ones used in GC_EXTERNAL_LIST_LOOP_2. buffer.c, callint.c, chartab.c, console-msw.c, device-x.c, dialog-msw.c, dired.c, extents.c, ui-gtk.c, rangetab.c, nt.c, mule-coding.c, minibuf.c, menubar-msw.c, menubar.c, menubar-gtk.c, lread.c, lisp.h, gutter.c, glyphs.c, glyphs-widget.c, fns.c, fileio.c, file-coding.c, specifier.c: Eliminate EXTERNAL_LIST_LOOP, which does not check for circularities. Use EXTERNAL_LIST_LOOP_2 instead or EXTERNAL_LIST_LOOP_3 or EXTERNAL_PROPERTY_LIST_LOOP_3 or GC_EXTERNAL_LIST_LOOP_2 (new macro). Removed/redid comments on EXTERNAL_LIST_LOOP. --------------------------------------------------------------- SPACING FIXES --------------------------------------------------------------- callint.c, hftctl.c, number-gmp.c, process-unix.c: Spacing fixes. 
--------------------------------------------------------------- FIX FOR GEOMETRY PROBLEM IN FIRST FRAME --------------------------------------------------------------- unicode.c: Add workaround for newlib bug in sscanf() [should be fixed by release 1.5.12 of Cygwin]. toolbar.c: bug fix for problem of initial frame being 77 chars wide on Windows. will be overridden by my other ws. --------------------------------------------------------------- FIX FOR LEAKING PROCESS HANDLES: --------------------------------------------------------------- process-nt.c: Fixes for leaking handles. Inspired by work done by Adrian Aichner <adrian@xemacs.org>. --------------------------------------------------------------- FIX FOR CYGWIN BUG (Unicode-related): --------------------------------------------------------------- unicode.c: Add workaround for newlib bug in sscanf() [should be fixed by release 1.5.12 of Cygwin]. --------------------------------------------------------------- WARNING FIXES: --------------------------------------------------------------- console-stream.c: `reinit' is unused. compiler.h, event-msw.c, frame-msw.c, intl-encap-win32.c, text.h: Add stuff to deal with ANSI-aliasing warnings I got. regex.c: Gather includes together to avoid warning. --------------------------------------------------------------- CHANGES TO INITIALIZATION ROUTINES: --------------------------------------------------------------- buffer.c, emacs.c, console.c, debug.c, device-x.c, device.c, dragdrop.c, emodules.c, eval.c, event-Xt.c, event-gtk.c, event-msw.c, event-stream.c, event-tty.c, events.c, extents.c, faces.c, file-coding.c, fileio.c, font-lock.c, frame-msw.c, glyphs-widget.c, glyphs.c, gui-x.c, insdel.c, lread.c, lstream.c, menubar-gtk.c, menubar-x.c, minibuf.c, mule-wnnfns.c, objects-msw.c, objects.c, print.c, scrollbar-x.c, search.c, select-x.c, text.c, undo.c, unicode.c, window.c, symsinit.h: Call reinit_*() functions directly from emacs.c, for clarity. 
Factor out some redundant init code. Move disallowed stuff that had crept into vars_of_glyphs() into complex_vars_of_glyphs(). Call init_eval_semi_early() from eval.c not in the middle of vars_of_() in emacs.c since there should be no order dependency in the latter calls. --------------------------------------------------------------- ARMAGEDDON: --------------------------------------------------------------- alloc.c, emacs.c, lisp.h, print.c: Rename inhibit_non_essential_printing_operations to inhibit_non_essential_conversion_operations. text.c: Assert on !inhibit_non_essential_conversion_operations. console-msw.c, print.c: Don't do conversion in SetConsoleTitle or FindWindow to avoid problems during armageddon. Put #errors for NON_ASCII_INTERNAL_FORMAT in places where problems would arise. --------------------------------------------------------------- CHANGES TO THE BUILD PROCEDURE: --------------------------------------------------------------- config.h.in, s/cxux.h, s/usg5-4-2.h, m/powerpc.h: Add comment about correct ordering of this file. Rearrange everything to follow this -- put all #undefs together and before the s&m files. Add undefs for HAVE_ALLOCA, C_ALLOCA, BROKEN_ALLOCA_IN_FUNCTION_CALLS, STACK_DIRECTION. Remove unused HAVE_STPCPY, HAVE_GETWD, HAVE_SETLOCALE. m/gec63.h: Deleted; totally broken, not used at all, not in FSF. 
m/7300.h, m/acorn.h, m/alliant-2800.h, m/alliant.h, m/altos.h, m/amdahl.h, m/apollo.h, m/att3b.h, m/aviion.h, m/celerity.h, m/clipper.h, m/cnvrgnt.h, m/convex.h, m/cydra5.h, m/delta.h, m/delta88k.h, m/dpx2.h, m/elxsi.h, m/ews4800r.h, m/gould.h, m/hp300bsd.h, m/hp800.h, m/hp9000s300.h, m/i860.h, m/ibmps2-aix.h, m/ibmrs6000.h, m/ibmrt-aix.h, m/ibmrt.h, m/intel386.h, m/iris4d.h, m/iris5d.h, m/iris6d.h, m/irist.h, m/isi-ov.h, m/luna88k.h, m/m68k.h, m/masscomp.h, m/mg1.h, m/mips-nec.h, m/mips-siemens.h, m/mips.h, m/news.h, m/nh3000.h, m/nh4000.h, m/ns32000.h, m/orion105.h, m/pfa50.h, m/plexus.h, m/pmax.h, m/powerpc.h, m/pyrmips.h, m/sequent-ptx.h, m/sequent.h, m/sgi-challenge.h, m/symmetry.h, m/tad68k.h, m/tahoe.h, m/targon31.h, m/tekxd88.h, m/template.h, m/tower32.h, m/tower32v3.h, m/ustation.h, m/vax.h, m/wicat.h, m/xps100.h: Delete C_ALLOCA, HAVE_ALLOCA, STACK_DIRECTION, BROKEN_ALLOCA_IN_FUNCTION_CALLS. All of this is auto-detected. When in doubt, I followed recent FSF sources, which also have these things deleted.
author ben
date Thu, 04 Nov 2004 23:08:28 +0000
parents 13a418960a88
children 3d8143fc88e1
comparison
equal deleted inserted replaced
2366:2a392e0c390a 2367:ecf1ebac70d8
67 67
68 October 2001, Ben Wing: HAVE_CODING_SYSTEMS is always now defined. 68 October 2001, Ben Wing: HAVE_CODING_SYSTEMS is always now defined.
69 Removed the conditionals. 69 Removed the conditionals.
70 */ 70 */
71 71
72 /* sjt sez:
73
74 There should be no elementary coding systems in the Lisp API, only chains.
75 Chains should be declared, not computed, as a sequence of coding formats.
76 (Probably the internal representation can be a vector for efficiency but
77 programmers would probably rather work with lists.) A stream has a token
78 type. Most streams are octet streams. Text is a stream of characters (in
79 _internal_ format; a file on disk is not text!) An octet-stream has no
80 implicit semantics, so its format must always be specified. The only type
81 currently having semantics is characters. This means that the chain [euc-jp
82 -> internal -> shift_jis] may be specified (euc-jp, shift_jis), and if no
83 euc-jp -> shift_jis converter is available, then the chain is automatically
84 constructed. (N.B. If we have fixed width buffers in the future, then we
85 could have ASCII -> 8-bit char -> 16-bit char -> ISO-2022-JP (with escape
86 sequences).)
87
88 EOL handling is a char <-> char coding. It should not be part of another
89 coding system except as a convenience for users. For text coding,
90 automatically insert EOL handlers between char <-> octet boundaries.
91 */
92
93 /* Comments about future work
94
95 ------------------------------------------------------------------
96 ABOUT DETECTION
97 ------------------------------------------------------------------
98
99 however, in general the detection code has major problems and needs lots
100 of work:
101
102 -- instead of merely "yes" or "no" for particular categories, we need a
103 more flexible system, with various levels of likelihood. Currently
104 I've created a system with six levels, as follows:
105
106 [see file-coding.h]
107
108 Let's consider what this might mean for an ASCII text detector. (In
109 order to have accurate detection, especially given the iteration I
110 proposed below, we need active detectors for *all* types of data we
111 might reasonably encounter, such as ASCII text files, binary files,
112 and possibly other sorts of ASCII files, and not assume that simply
113 "falling back to no detection" will work at all well.)
114
115 An ASCII text detector DOES NOT report ASCII text as level 0, since
116 that's what the detector is looking for. Such a detector ideally
117 wants all bytes in the range 0x20 - 0x7E (no high bytes!), except for
118 whitespace control chars and perhaps a few others; LF, CR, or CRLF
119 sequences at regular intervals (where "regular" might mean an average
120 < 100 chars and 99% < 300 for code and other stuff of the "text file
121 w/line breaks" variety, but for the "text file w/o line breaks"
122 variety, excluding blank lines, averages could easily be 600 or more
123 with 2000-3000 char "lines" not so uncommon); similar statistical
124 variance between odds and evens (not Unicode); frequent occurrences of
125 the space character; letters more common than non-letters; etc. Also
126 checking for too little variability between frequencies of characters
127 and for exclusion of particular characters based on character ranges
128 can catch ASCII encodings like base-64, UUEncode, UTF-7, etc.
129 Granted, this doesn't even apply to everything called "ASCII", and we
130 could potentially distinguish off ASCII for code, ASCII for text,
131 etc. as separate categories. However, it does give us a lot to work
132 off of, in deciding what likelihood to choose -- and it shows there's
133 in fact a lot of detectable patterns to look for even in something
134 seemingly so generic as ASCII. The detector would report most text
135 files in level 1 or level 2. EUC encodings, Shift-JIS, etc. probably
136 go to level -1 because they also pass the EOL test and all other tests
137 for the ASCII part of the text, but have lots of high bytes, which in
138 essence turn them into binary. Aberrant text files like something in
139 BASE64 encoding might get placed in level 0, because they pass most
140 tests but fail dramatically the frequency test; but they should not be
141 reported as any lower, because that would cause explicit prompting,
142 and the user should be able to read any valid text file without prompting.
143 The escape sequences and the base-64-type checks might send 7-bit
144 iso2022 to 0, but probably not -1, for similar reasons.
145
146 -- The assumed algorithm for the above detection levels is to in essence
147 sort categories first by detection level and then by priority.
148 Perhaps, however, we would want smarter algorithms, or at least
149 something user-controllable -- in particular, when (other than no
150 category at level 0 or greater) do we prompt the user to pick a
151 category?
152
153 -- Improvements in how the detection algorithm works: we want to handle
154 lots of different ways something could be encoded, including multiple
155 stacked encodings. trying to specify a series of detection levels
156 (check for base64 first, then check for gzip, then check for an i18n
157 decoding, then for crlf) won't generally work. for example, what
158 about the same encoding appearing more than once? for example, take
159 euc-jp, base64'd, then gzip'd, then base64'd again: this could well
160 happen, and you could specify the encodings specifically as
161 base64|gzip|base64|euc-jp, but we'd like to autodetect it without
162 worrying about exactly what order these things appear in. we should
163 allow for iterating over detection/decoding cycles until we reach
164 some maximum (we got stuck in a loop, due to incorrect category
165 tables or detection algorithms), have no reported detection levels
166 over -1, or we end up with no change after a decoding pass (i.e. the
167 coding system associated with a chosen category was `no-conversion'
168 or something equivalent). it might make sense to divide things into
169 two phases (internal and external), where the internal phase has a
170 separate category list and would probably mostly end up handling EOL
171 detection; but the more i think about it, the more i disagree. with
172 properly written detectors, and properly organized tables (in
173 general, those decodings that are more "distinctive" and thus
174 detectable with greater certainty go lower on the list), we shouldn't
175 need two phases. for example, let's say the example above was also
176 in CRLF format. The EOL detector (which really detects *plain text*
177 with a particular EOL type) would return at most level 0 for all
178 results until the text file is reached, whereas the base64, gzip or
179 euc-jp decoders will return higher. Once the text file is reached,
180 the EOL detector will return 0 or higher for the CRLF encoding, and
181 all other detectors will return 0 or lower; thus, we will successfully
182 proceed through CRLF decoding, or at worst prompt the user. (The only
183 external-vs-internal distinction that might make sense here is to
184 favor coding systems of the correct source type over those that
185 require conversion between external and internal; if done right, this
186 could allow the CRLF detector to return level 1 for all CRLF-encoded
187 text files, even those that look like Base-64 or similar encoding, so
188 that CRLF encoding will always get decoded without prompting, but not
189 interfere with other decoders. On the other hand, this
190 external-vs-internal distinction may not matter at all -- with
191 automatic internal-external conversion, CRLF decoding can occur
192 before or after decoding of euc-jp, base64, iso2022, or similar,
193 without any difference in the final results.)
194
195 #### What are we trying to say? In base64, the CRLF decoding before
196 base64 decoding is irrelevant, they will be thrown out as whitespace
197 is not significant in base64.
198
199 [sjt considers all of this to be rather bogus. Ideas like "greater
200 certainty" and "distinctive" can and should be quantified. The issue
201 of proper table organization should be a question of optimization.]
202
203 [sjt wonders if it might not be a good idea to use Unicode's newline
204 character as the internal representation so that (for non-Unicode
205 coding systems) we can catch EOL bugs on Unix too.]
206
207 -- There need to be two priority lists and two
208 category->coding-system lists. One is general, the other
209 langenv-specific. The user sets the former, the langenv
210 the latter. The langenv-specific entries take precedence
211 over the others. This works similarly to the
212 langenv-specific Unicode charset priority list.
213
214 -- The simple list of coding categories per detectors is not enough.
215 Instead of coding categories, we need parameters. For example,
216 Unicode might have separate detectors for UTF-8, UTF-7, UTF-16,
217 and perhaps UCS-4; or UTF-16/UCS-4 would be one detection type.
218 UTF-16 would have parameters such as "little-endian" and "needs BOM",
219 and possibly another one like "collapse/expand/leave alone composite
220 sequences" once we add this support. Usually these parameters
221 correspond directly to a coding system parameter. Different
222 likelihood values can be specified for each parameter as well as for
223 the detection type as a whole. The user can specify particular
224 coding systems for a particular combination of detection type and
225 parameters, or can give "default parameters" associated with a
226 detection type. In the latter case, we create a new coding system as
227 necessary that corresponds to the detected type and parameters.
228
229 -- a better means of presentation. rather than just coming up
230 with the new file decoded according to the detected coding
231 system, allow the user to browse through the file and
232 conveniently reject it if it looks wrong; then detection
233 starts again, but with that possibility removed. in cases where
234 certainty is low and thus more than one possibility is presented,
235 the user can browse each one and select one or reject them all.
236
237 -- fail-safe: even after the user has made a choice, if they
238 later on realize they have the wrong coding system, they can
239 go back, and we've squirreled away the original data so they
240 can start the process over. this may be tricky.
241
242 -- using a larger buffer for detection. we use just a small
243 piece, which can give quite random results. we may need to
244 buffer up all the data we look through because we can't
245 necessarily rewind. the idea is we proceed until we get a
246 result that's at least at a certain level of certainty
247 (e.g. "probable") or we reached a maximum limit of how much
248 we want to buffer.
249
250 -- dealing with interactive systems. we might need to go ahead
251 and present the data before we've finished detection, and
252 then re-decode it, perhaps multiple times, as we get better
253 detection results.
254
255 -- Clearly some of these are more important than others. at the
256 very least, the "better means of presentation" should be
257 implemented as soon as possible, along with a very simple means
258 of fail-safe whenever the data is readily available, e.g. it's
259 coming from a file, which is the most common scenario.
260
261 --ben [at least that's what sjt thinks]
262
263 *****
264
265 While this is clearly something of an improvement over earlier designs,
266 it doesn't deal with the most important issue: to do better than categories
267 (which in the medium term is mostly going to mean "which flavor of Unicode
268 is this?"), we need to look at statistical behavior rather than ruling out
269 categories via presence of specific sequences. This means the stream
270 processor should
271
272 (1) keep octet distributions (octet, 2-, 3-, 4- octet sequences)
273 (2) in some kind of compressed form
274 (3) look for "skip features" (eg, characteristic behavior of leading
275 bytes for UTF-7, UTF-8, UTF-16, Mule code)
276 (4) pick up certain "simple" regexps
277 (5) provide "triggers" to determine when statistical detectors should be
278 invoked, such as octet count
279 (6) and "magic" like Unicode signatures or file(1) magic.
280
281 --sjt
282
283
284 ------------------------------------------------------------------
285 ABOUT FORMATS
286 ------------------------------------------------------------------
287
288 when calling make-coding-system, the name can be a cons of (format1 .
289 format2), specifying that it decodes format1->format2 and encodes the other
290 way. if only one name is given, that is assumed to be format1, and the
291 other is either `external' or `internal' depending on the end type.
292 normally the user when decoding gives the decoding order in formats, but
293 can leave off the last one, `internal', which is assumed. a multichain
294 might look like gzip|multibyte|unicode, using the coding systems named
295 `gzip', `(unicode . multibyte)' and `unicode'. the way this actually works
296 is by searching for gzip->multibyte; if not found, look for gzip->external
297 or gzip->internal. (In general we automatically do conversion between
298 internal and external as necessary: thus gzip|crlf does the expected, and
299 maps to gzip->external, external->internal, crlf->internal, which when
300 fully specified would be gzip|external:external|internal:crlf|internal --
301 see below.) To forcibly fit together two converters that have explicitly
302 specified and incompatible names (say you have unicode->multibyte and
303 iso8859-1->ebcdic and you know that the multibyte and iso8859-1 in this
304 case are compatible), you can force-cast using :, like this:
305 ebcdic|iso8859-1:multibyte|unicode. (again, if you force-cast between
306 internal and external formats, the conversion happens automatically.)
307
308 --------------------------------------------------------------------------
309 ABOUT PDUMP, UNICODE, AND RUNNING XEMACS FROM A DIRECTORY WITH WEIRD CHARS
310 --------------------------------------------------------------------------
311
312 -- there's the problem that XEmacs can't be run in a directory with
313 non-ASCII/Latin-1 chars in it, since it will be doing Unicode
314 processing before we've had a chance to load the tables. In fact,
315 even finding the tables in such a situation is problematic using
316 the normal commands. my idea is to eventually load the stuff
317 extremely extremely early, at the same time as the pdump data gets
318 loaded. in fact, the unicode table data (stored in an efficient
319 binary format) can even be stuck into the pdump file (which would
320 mean as a resource to the executable, for windows). we'd need to
321 extend pdump a bit: to allow for attaching extra data to the pdump
322 file. (something like pdump_attach_extra_data (addr, length)
323 returns a number of some sort, an index into the file, which you
324 can then retrieve with pdump_load_extra_data(), which returns an
325 addr (mmap()ed or loaded), and later you pdump_unload_extra_data()
326 when finished. we'd probably also need
327 pdump_attach_extra_data_append(), which appends data to the data
328 just written out with pdump_attach_extra_data(). this way,
329 multiple tables in memory can be written out into one contiguous
330 table. (we'd use the tar-like trick of allowing new blocks to be
331 written without going back to change the old blocks -- we just rely
332 on the end of file/end of memory.) this same mechanism could be
333 extracted out of pdump and used to handle the non-pdump situation
334 (or alternatively, we could just dump either the memory image of
335 the tables themselves or the compressed binary version). in the
336 case of extra unicode tables not known about at compile time that
337 get loaded before dumping, we either just dump them into the image
338 (pdump and all) or extract them into the compressed binary format,
339 free the original tables, and treat them like all other tables.
340
341 --------------------------------------------------------------------------
342 HANDLING WRITING A FILE SAFELY, WITHOUT DATA LOSS
343 --------------------------------------------------------------------------
344
345 -- When writing a file, we need error detection; otherwise somebody
346 will create a Unicode file without realizing the coding system
347 of the buffer is Raw, and then lose all the non-ASCII/Latin-1
348 text when it's written out. We need two levels:
349
350 1. first, a "safe-charset" level that checks before any actual
351 encoding to see if all characters in the document can safely
352 be represented using the given coding system. FSF has a
353 "safe-charset" property of coding systems, but it's stupid
354 because this information can be automatically derived from
355 the coding system, at least the vast majority of the time.
356 What we need is some sort of
357 alternative-coding-system-precedence-list, langenv-specific,
358 where everything on it can be checked for safe charsets and
359 then the user is given a list of possibilities. When the user
360 does "save with specified encoding", they should see the same
361 precedence list. Again like with other precedence lists,
362 there's also a global one, and presumably all coding systems
363 not on the other list get appended to the end (and perhaps not
364 checked at all when doing safe-checking?). safe-checking
365 should work something like this: compile a list of all
366 charsets used in the buffer, along with a count of chars
367 used. that way, "slightly unsafe" coding systems can perhaps
368 be presented at the end, which will lose only a few characters
369 and are perhaps what the users were looking for.
370
371 [sjt sez this whole step is a crock. If a universal coding system
372 is unacceptable, the user had better know what he/she is doing,
373 and explicitly specify a lossy encoding.
374 In principle, we can simply check for characters being writable as
375 we go along. Eg, via an "unrepresentable character handler." We
376 still have the buffer contents. If we can't successfully save,
377 then ask the user what to do. (Do we ever simply destroy previous
378 file version before completing a write?)]
379
380 2. when actually writing out, we need error checking in case an
381 individual char in a charset can't be written even though the
382 charsets are safe. again, the user gets the choice of other
383 reasonable coding systems.
384
385 [sjt -- something is very confused, here; safe charsets should be
386 defined as those charsets all of whose characters can be encoded.]
387
388 3. same thing (error checking, list of alternatives, etc.) needs
389 to happen when reading! all of this will be a lot of work!
390
391
392 --ben
393
394 I don't much like Ben's scheme. First, this isn't an issue of I/O,
395 it's a coding issue. It can happen in many places, not just on stream
396 I/O. Error checking should take place on all translations. Second,
397 the two-pass algorithm should be avoided if possible. In some cases
398 (eg, output to a tty) we won't be able to go back and change the
399 previously output data. Third, the whole idea of having a buffer full
400 of arbitrary characters which we're going to somehow shoehorn into a
401 file based on some twit user's less than informed idea of a coding system
402 is kind of laughable from the start. If we're going to say that a buffer
403 has a coding system, shouldn't we enforce restrictions on what you can
404 put into it? Fourth, what's the point of having safe charsets if some
405 of the characters in them are unsafe? Fifth, what makes you think we're
406 going to have a list of charsets? It seems to me that there might be
407 reasons to have user-defined charsets (eg, "German" vs "French" subsets
408 of ISO 8859/15). Sixth, the idea of having language environment determine
409 precedence doesn't seem very useful to me. Users who are working with a
410 language that corresponds to the language environment are not going to
411 run into safe charsets problems. It's users who are outside of their
412 usual language environment who run into trouble. Also, the reason for
413 specifying anything other than a universal coding system is normally
414 restrictions imposed by other users or applications. Seventh, the
415 statistical feedback isn't terribly useful. Users rarely "want" a
416 coding system, they want their file saved in a useful way. We could
417 add a FORCE argument to conversions for those who really want a specific
418 coding system. But mostly, a user might want to edit out a few unsafe
419 characters. So (up to some maximum) we should keep a list of unsafe
420 text positions, and provide a convenient function for traversing them.
421
422 --sjt
423 */
424
425 #include <config.h> 72 #include <config.h>
426 #include "lisp.h" 73 #include "lisp.h"
427 74
428 #include "buffer.h" 75 #include "buffer.h"
429 #include "elhash.h" 76 #include "elhash.h"
457 } coding_system_type_entry_dynarr; 104 } coding_system_type_entry_dynarr;
458 105
459 static coding_system_type_entry_dynarr *the_coding_system_type_entry_dynarr; 106 static coding_system_type_entry_dynarr *the_coding_system_type_entry_dynarr;
460 107
461 static const struct memory_description cste_description_1[] = { 108 static const struct memory_description cste_description_1[] = {
462 { XD_STRUCT_PTR, offsetof (coding_system_type_entry, meths), 1, &coding_system_methods_description }, 109 { XD_BLOCK_PTR, offsetof (coding_system_type_entry, meths), 1, &coding_system_methods_description },
463 { XD_END } 110 { XD_END }
464 }; 111 };
465 112
466 static const struct sized_memory_description cste_description = { 113 static const struct sized_memory_description cste_description = {
467 sizeof (coding_system_type_entry), 114 sizeof (coding_system_type_entry),
521 }; 168 };
522 169
523 static const struct memory_description struct_detector_description_1[] 170 static const struct memory_description struct_detector_description_1[]
524 = 171 =
525 { 172 {
526 { XD_STRUCT_PTR, offsetof (struct detector, cats), 1, 173 { XD_BLOCK_PTR, offsetof (struct detector, cats), 1,
527 &detector_category_dynarr_description }, 174 &detector_category_dynarr_description },
528 { XD_END } 175 { XD_END }
529 }; 176 };
530 177
531 static const struct sized_memory_description struct_detector_description = 178 static const struct sized_memory_description struct_detector_description =
696 { -1 }, 343 { -1 },
697 }; 344 };
698 345
699 static const struct memory_description coding_system_description[] = 346 static const struct memory_description coding_system_description[] =
700 { 347 {
701 { XD_STRUCT_PTR, offsetof (Lisp_Coding_System, methods), 1, 348 { XD_BLOCK_PTR, offsetof (Lisp_Coding_System, methods), 1,
702 &coding_system_methods_description }, 349 &coding_system_methods_description },
703 #define MARKED_SLOT(x) { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, x) }, 350 #define MARKED_SLOT(x) { XD_LISP_OBJECT, offsetof (Lisp_Coding_System, x) },
704 #define MARKED_SLOT_ARRAY(slot, size) \ 351 #define MARKED_SLOT_ARRAY(slot, size) \
705 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, slot), size }, 352 { XD_LISP_OBJECT_ARRAY, offsetof (Lisp_Coding_System, slot), size },
706 #include "coding-system-slots.h" 353 #include "coding-system-slots.h"
707 { XD_STRUCT_ARRAY, offsetof (Lisp_Coding_System, data), 1, 354 { XD_BLOCK_ARRAY, offsetof (Lisp_Coding_System, data), 1,
708 coding_system_extra_description_map }, 355 coding_system_extra_description_map },
709 { XD_END } 356 { XD_END }
710 }; 357 };
711 358
712 static const struct memory_description coding_system_empty_extra_description_1[] = 359 static const struct memory_description coding_system_empty_extra_description_1[] =
1091 } 738 }
1092 } 739 }
1093 740
1094 struct subsidiary_type 741 struct subsidiary_type
1095 { 742 {
1096 Char_ASCII *extension; 743 Ascbyte *extension;
1097 Char_ASCII *mnemonic_ext; 744 Ascbyte *mnemonic_ext;
1098 enum eol_type eol; 745 enum eol_type eol;
1099 }; 746 };
1100 747
1101 static struct subsidiary_type coding_subsidiary_list[] = 748 static struct subsidiary_type coding_subsidiary_list[] =
1102 { { "-unix", "", EOL_LF }, 749 { { "-unix", "", EOL_LF },
1106 /* kludge */ 753 /* kludge */
1107 static void 754 static void
1108 setup_eol_coding_systems (Lisp_Object codesys) 755 setup_eol_coding_systems (Lisp_Object codesys)
1109 { 756 {
1110 int len = XSTRING_LENGTH (XSYMBOL (XCODING_SYSTEM_NAME (codesys))->name); 757 int len = XSTRING_LENGTH (XSYMBOL (XCODING_SYSTEM_NAME (codesys))->name);
1111 Ibyte *codesys_name = (Ibyte *) ALLOCA (len + 7); 758 Ibyte *codesys_name = alloca_ibytes (len + 7);
1112 int mlen = -1; 759 int mlen = -1;
1113 Ibyte *codesys_mnemonic = 0; 760 Ibyte *codesys_mnemonic = 0;
1114 Lisp_Object codesys_name_sym, sub_codesys; 761 Lisp_Object codesys_name_sym, sub_codesys;
1115 int i; 762 int i;
1116 763
1118 XSTRING_DATA (XSYMBOL (XCODING_SYSTEM_NAME (codesys))->name), len); 765 XSTRING_DATA (XSYMBOL (XCODING_SYSTEM_NAME (codesys))->name), len);
1119 766
1120 if (STRINGP (XCODING_SYSTEM_MNEMONIC (codesys))) 767 if (STRINGP (XCODING_SYSTEM_MNEMONIC (codesys)))
1121 { 768 {
1122 mlen = XSTRING_LENGTH (XCODING_SYSTEM_MNEMONIC (codesys)); 769 mlen = XSTRING_LENGTH (XCODING_SYSTEM_MNEMONIC (codesys));
1123 codesys_mnemonic = (Ibyte *) ALLOCA (mlen + 7); 770 codesys_mnemonic = alloca_ibytes (mlen + 7);
1124 memcpy (codesys_mnemonic, 771 memcpy (codesys_mnemonic,
1125 XSTRING_DATA (XCODING_SYSTEM_MNEMONIC (codesys)), mlen); 772 XSTRING_DATA (XCODING_SYSTEM_MNEMONIC (codesys)), mlen);
1126 } 773 }
1127 774
1128 /* Create three "subsidiary" coding systems, decoding data encoded using 775 /* Create three "subsidiary" coding systems, decoding data encoded using
1138 (decodes byte->char), we need to coerce it to one by the appropriate 785 (decodes byte->char), we need to coerce it to one by the appropriate
1139 wrapping in CANONICAL. */ 786 wrapping in CANONICAL. */
1140 787
1141 for (i = 0; i < countof (coding_subsidiary_list); i++) 788 for (i = 0; i < countof (coding_subsidiary_list); i++)
1142 { 789 {
1143 Char_ASCII *extension = coding_subsidiary_list[i].extension; 790 Ascbyte *extension = coding_subsidiary_list[i].extension;
1144 Char_ASCII *mnemonic_ext = coding_subsidiary_list[i].mnemonic_ext; 791 Ascbyte *mnemonic_ext = coding_subsidiary_list[i].mnemonic_ext;
1145 enum eol_type eol = coding_subsidiary_list[i].eol; 792 enum eol_type eol = coding_subsidiary_list[i].eol;
1146 793
1147 qxestrcpy_c (codesys_name + len, extension); 794 qxestrcpy_ascii (codesys_name + len, extension);
1148 codesys_name_sym = intern_int (codesys_name); 795 codesys_name_sym = intern_int (codesys_name);
1149 if (mlen != -1) 796 if (mlen != -1)
1150 qxestrcpy_c (codesys_mnemonic + mlen, mnemonic_ext); 797 qxestrcpy_ascii (codesys_mnemonic + mlen, mnemonic_ext);
1151 798
1152 sub_codesys = Fcopy_coding_system (codesys, codesys_name_sym); 799 sub_codesys = Fcopy_coding_system (codesys, codesys_name_sym);
1153 if (mlen != -1) 800 if (mlen != -1)
1154 XCODING_SYSTEM_MNEMONIC (sub_codesys) = 801 XCODING_SYSTEM_MNEMONIC (sub_codesys) =
1155 build_intstring (codesys_mnemonic); 802 build_intstring (codesys_mnemonic);
1215 crazy crap is based on existing behavior in other Mule versions, 862 crazy crap is based on existing behavior in other Mule versions,
1216 including FSF Emacs.) 863 including FSF Emacs.)
1217 */ 864 */
1218 865
1219 static Lisp_Object 866 static Lisp_Object
1220 make_coding_system_1 (Lisp_Object name_or_existing, Char_ASCII *prefix, 867 make_coding_system_1 (Lisp_Object name_or_existing, Ascbyte *prefix,
1221 Lisp_Object type, Lisp_Object description, 868 Lisp_Object type, Lisp_Object description,
1222 Lisp_Object props) 869 Lisp_Object props)
1223 { 870 {
1224 Lisp_Coding_System *cs; 871 Lisp_Coding_System *cs;
1225 int need_to_setup_eol_systems = 1; 872 int need_to_setup_eol_systems = 1;
1382 1029
1383 return csobj; 1030 return csobj;
1384 } 1031 }
1385 1032
1386 Lisp_Object 1033 Lisp_Object
1387 make_internal_coding_system (Lisp_Object existing, Char_ASCII *prefix, 1034 make_internal_coding_system (Lisp_Object existing, Ascbyte *prefix,
1388 Lisp_Object type, Lisp_Object description, 1035 Lisp_Object type, Lisp_Object description,
1389 Lisp_Object props) 1036 Lisp_Object props)
1390 { 1037 {
1391 return make_coding_system_1 (existing, prefix, type, description, props); 1038 return make_coding_system_1 (existing, prefix, type, description, props);
1392 } 1039 }
2078 1725
2079 extern const struct sized_memory_description chain_coding_stream_description; 1726 extern const struct sized_memory_description chain_coding_stream_description;
2080 extern const struct sized_memory_description undecided_coding_stream_description; 1727 extern const struct sized_memory_description undecided_coding_stream_description;
2081 1728
2082 static const struct memory_description coding_stream_data_description_1 []= { 1729 static const struct memory_description coding_stream_data_description_1 []= {
2083 { XD_STRUCT_PTR, chain_coding_system, 1, &chain_coding_stream_description}, 1730 { XD_BLOCK_PTR, chain_coding_system, 1, &chain_coding_stream_description},
2084 { XD_STRUCT_PTR, undecided_coding_system, 1, &undecided_coding_stream_description}, 1731 { XD_BLOCK_PTR, undecided_coding_system, 1, &undecided_coding_stream_description},
2085 { XD_END } 1732 { XD_END }
2086 }; 1733 };
2087 1734
2088 static const struct sized_memory_description coding_stream_data_description = { 1735 static const struct sized_memory_description coding_stream_data_description = {
2089 sizeof (void *), coding_stream_data_description_1 1736 sizeof (void *), coding_stream_data_description_1
2579 we're decoding and 2226 we're decoding and
2580 coding system calls 2227 coding system calls
2581 for this] 2228 for this]
2582 ------> [BUFFER] 2229 ------> [BUFFER]
2583 */ 2230 */
2584 /* Of course, this is just horrible. BYTE<->CHAR should only be available 2231
2585 to I/O routines. It should not be visible to Mule proper. 2232 /* #### See comment
2586 2233
2587 A comment on the implementation. Hrvoje and Kyle worry about the 2234 EFFICIENCY OF CODING CONVERSION WITH MULTIPLE COPIES/CHAINS
2588 inefficiency of repeated copying among buffers that chained coding 2235
2589 systems entail. But this may not be as time inefficient as it appears 2236 in text.c.
2590 in the Mule ("house rules") context. The issue is how do you do chain 2237 */
2591 coding systems without copying? In theory you could have 2238
2592
2593 IChar external_to_raw (ExtChar *cp, State *s);
2594 IChar decode_utf16 (IChar c, State *s);
2595 IChar decode_crlf (ExtChar *cp, State *s);
2596
2597 typedef Ichar (*Converter[]) (Ichar, State*);
2598
2599 Converter utf16[2] = { &decode_utf16, &decode_crlf };
2600
2601 void convert (ExtChar *inbuf, IChar *outbuf, Converter cvtr)
2602 {
2603 int i;
2604 ExtChar c;
2605 State s;
2606
2607 while (c = external_to_raw (*inbuf++, &s))
2608 {
2609 for (i = 0; i < sizeof(cvtr)/sizeof(Converter); ++i)
2610 if (s.ready)
2611 c = (*cvtr[i]) (c, &s);
2612 }
2613 if (s.ready)
2614 *outbuf++ = c;
2615 }
2616
2617 But this is a lot of function calls; what Ben is doing is basically
2618 reducing this to one call per buffer-full. The only way to avoid this
2619 is to hardcode all the "interesting" coding systems, maybe using
2620 inline or macros to give structure. But this is still a huge amount
2621 of work, and code.
2622
2623 One advantage to the call-per-char approach is that we might be able
2624 to do something about the marker/extent destruction that coding
2625 normally entails.
2626 */
2627 while (1) 2239 while (1)
2628 { 2240 {
2629 char tempbuf[1024]; /* some random amount */ 2241 char tempbuf[1024]; /* some random amount */
2630 Charbpos newpos, even_newer_pos; 2242 Charbpos newpos, even_newer_pos;
2631 Charbpos oldpos = lisp_buffer_stream_startpos (istr); 2243 Charbpos oldpos = lisp_buffer_stream_startpos (istr);
2715 int lstream_count; 2327 int lstream_count;
2716 }; 2328 };
2717 2329
2718 static const struct memory_description chain_coding_system_description[] = { 2330 static const struct memory_description chain_coding_system_description[] = {
2719 { XD_INT, offsetof (struct chain_coding_system, count) }, 2331 { XD_INT, offsetof (struct chain_coding_system, count) },
2720 { XD_STRUCT_PTR, offsetof (struct chain_coding_system, chain), 2332 { XD_BLOCK_PTR, offsetof (struct chain_coding_system, chain),
2721 XD_INDIRECT (0, 0), &lisp_object_description }, 2333 XD_INDIRECT (0, 0), &lisp_object_description },
2722 { XD_LISP_OBJECT, offsetof (struct chain_coding_system, 2334 { XD_LISP_OBJECT, offsetof (struct chain_coding_system,
2723 canonicalize_after_coding) }, 2335 canonicalize_after_coding) },
2724 { XD_END } 2336 { XD_END }
2725 }; 2337 };
2726 2338
2727 static const struct memory_description chain_coding_stream_description_1 [] = { 2339 static const struct memory_description chain_coding_stream_description_1 [] = {
2728 { XD_INT, offsetof (struct chain_coding_stream, lstream_count) }, 2340 { XD_INT, offsetof (struct chain_coding_stream, lstream_count) },
2729 { XD_STRUCT_PTR, offsetof (struct chain_coding_stream, lstreams), 2341 { XD_BLOCK_PTR, offsetof (struct chain_coding_stream, lstreams),
2730 XD_INDIRECT (0, 0), &lisp_object_description }, 2342 XD_INDIRECT (0, 0), &lisp_object_description },
2731 { XD_END } 2343 { XD_END }
2732 }; 2344 };
2733 2345
2734 const struct sized_memory_description chain_coding_stream_description = { 2346 const struct sized_memory_description chain_coding_stream_description = {
2983 static int 2595 static int
2984 chain_putprop (Lisp_Object codesys, Lisp_Object key, Lisp_Object value) 2596 chain_putprop (Lisp_Object codesys, Lisp_Object key, Lisp_Object value)
2985 { 2597 {
2986 if (EQ (key, Qchain)) 2598 if (EQ (key, Qchain))
2987 { 2599 {
2988 Lisp_Object tail;
2989 Lisp_Object *cslist; 2600 Lisp_Object *cslist;
2990 int count = 0; 2601 int count = 0;
2991 int i; 2602 int i;
2992 2603
2993 EXTERNAL_LIST_LOOP (tail, value) 2604 {
2994 { 2605 EXTERNAL_LIST_LOOP_2 (elt, value)
2995 Fget_coding_system (XCAR (tail)); 2606 {
2996 count++; 2607 Fget_coding_system (elt);
2997 } 2608 count++;
2609 }
2610 }
2998 2611
2999 cslist = xnew_array (Lisp_Object, count); 2612 cslist = xnew_array (Lisp_Object, count);
3000 XCODING_SYSTEM_CHAIN_CHAIN (codesys) = cslist; 2613 XCODING_SYSTEM_CHAIN_CHAIN (codesys) = cslist;
3001 2614
3002 count = 0; 2615 count = 0;
3003 EXTERNAL_LIST_LOOP (tail, value) 2616 {
3004 { 2617 EXTERNAL_LIST_LOOP_2 (elt, value)
3005 cslist[count] = Fget_coding_system (XCAR (tail)); 2618 {
3006 count++; 2619 cslist[count] = Fget_coding_system (elt);
3007 } 2620 count++;
2621 }
2622 }
3008 2623
3009 XCODING_SYSTEM_CHAIN_COUNT (codesys) = count; 2624 XCODING_SYSTEM_CHAIN_COUNT (codesys) = count;
3010 2625
3011 for (i = 0; i < count - 1; i++) 2626 for (i = 0; i < count - 1; i++)
3012 { 2627 {
3467 { XD_END } 3082 { XD_END }
3468 }; 3083 };
3469 3084
3470 static const struct memory_description undecided_coding_stream_description_1 [] = { 3085 static const struct memory_description undecided_coding_stream_description_1 [] = {
3471 { XD_LISP_OBJECT, offsetof (struct undecided_coding_stream, actual) }, 3086 { XD_LISP_OBJECT, offsetof (struct undecided_coding_stream, actual) },
3472 { XD_STRUCT_ARRAY, offsetof (struct undecided_coding_stream, c), 3087 { XD_BLOCK_ARRAY, offsetof (struct undecided_coding_stream, c),
3473 1, &chain_coding_stream_description }, 3088 1, &chain_coding_stream_description },
3474 { XD_END } 3089 { XD_END }
3475 }; 3090 };
3476 3091
3477 const struct sized_memory_description undecided_coding_stream_description = { 3092 const struct sized_memory_description undecided_coding_stream_description = {
4322 (list)) 3937 (list))
4323 { 3938 {
4324 int *category_to_priority = 3939 int *category_to_priority =
4325 alloca_array (int, coding_detector_category_count); 3940 alloca_array (int, coding_detector_category_count);
4326 int i, j; 3941 int i, j;
4327 Lisp_Object rest;
4328 3942
4329 /* First generate a list that maps coding categories to priorities. */ 3943 /* First generate a list that maps coding categories to priorities. */
4330 3944
4331 for (i = 0; i < coding_detector_category_count; i++) 3945 for (i = 0; i < coding_detector_category_count; i++)
4332 category_to_priority[i] = -1; 3946 category_to_priority[i] = -1;
4333 3947
4334 /* Highest priority comes from the specified list. */ 3948 /* Highest priority comes from the specified list. */
4335 i = 0; 3949 i = 0;
4336 EXTERNAL_LIST_LOOP (rest, list) 3950 {
4337 { 3951 EXTERNAL_LIST_LOOP_2 (elt, list)
4338 int cat = coding_category_symbol_to_id (XCAR (rest)); 3952 {
4339 3953 int cat = coding_category_symbol_to_id (elt);
4340 if (category_to_priority[cat] >= 0) 3954
4341 sferror ("Duplicate coding category in list", XCAR (rest)); 3955 if (category_to_priority[cat] >= 0)
4342 category_to_priority[cat] = i++; 3956 sferror ("Duplicate coding category in list", elt);
4343 } 3957 category_to_priority[cat] = i++;
3958 }
3959 }
4344 3960
4345 /* Now go through the existing categories by priority to retrieve 3961 /* Now go through the existing categories by priority to retrieve
4346 the categories not yet specified and preserve their priority 3962 the categories not yet specified and preserve their priority
4347 order. */ 3963 order. */
4348 for (j = 0; j < coding_detector_category_count; j++) 3964 for (j = 0; j < coding_detector_category_count; j++)
4835 staticpro (&Vcoding_system_hash_table); 4451 staticpro (&Vcoding_system_hash_table);
4836 Vcoding_system_hash_table = 4452 Vcoding_system_hash_table =
4837 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ); 4453 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ);
4838 4454
4839 the_coding_system_type_entry_dynarr = Dynarr_new (coding_system_type_entry); 4455 the_coding_system_type_entry_dynarr = Dynarr_new (coding_system_type_entry);
4840 dump_add_root_struct_ptr (&the_coding_system_type_entry_dynarr, 4456 dump_add_root_block_ptr (&the_coding_system_type_entry_dynarr,
4841 &csted_description); 4457 &csted_description);
4842 4458
4843 Vcoding_system_type_list = Qnil; 4459 Vcoding_system_type_list = Qnil;
4844 staticpro (&Vcoding_system_type_list); 4460 staticpro (&Vcoding_system_type_list);
4845 4461
4853 4469
4854 dump_add_opaque (coding_category_by_priority, 4470 dump_add_opaque (coding_category_by_priority,
4855 sizeof (coding_category_by_priority)); 4471 sizeof (coding_category_by_priority));
4856 4472
4857 all_coding_detectors = Dynarr_new2 (detector_dynarr, struct detector); 4473 all_coding_detectors = Dynarr_new2 (detector_dynarr, struct detector);
4858 dump_add_root_struct_ptr (&all_coding_detectors, 4474 dump_add_root_block_ptr (&all_coding_detectors,
4859 &detector_dynarr_description); 4475 &detector_dynarr_description);
4860 4476
4861 dump_add_opaque_int (&coding_system_tick); 4477 dump_add_opaque_int (&coding_system_tick);
4862 dump_add_opaque_int (&coding_detector_count); 4478 dump_add_opaque_int (&coding_detector_count);
4863 dump_add_opaque_int (&coding_detector_category_count); 4479 dump_add_opaque_int (&coding_detector_category_count);
4953 } 4569 }
4954 4570
4955 void 4571 void
4956 vars_of_file_coding (void) 4572 vars_of_file_coding (void)
4957 { 4573 {
4958 reinit_vars_of_file_coding ();
4959
4960 /* We always have file-coding support */ 4574 /* We always have file-coding support */
4961 Fprovide (intern ("file-coding")); 4575 Fprovide (intern ("file-coding"));
4962 4576
4963 QScoding_system_cookie = build_string (";;;###coding system: "); 4577 QScoding_system_cookie = build_string (";;;###coding system: ");
4964 staticpro (&QScoding_system_cookie); 4578 staticpro (&QScoding_system_cookie);