771
|
1 /* Header for encoding conversion functions; coding-system object.
|
|
2 #### rename me to coding-system.h
|
428
|
3 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
|
|
4 Copyright (C) 1995 Sun Microsystems, Inc.
|
793
|
5 Copyright (C) 2000, 2001, 2002 Ben Wing.
|
428
|
6
|
|
7 This file is part of XEmacs.
|
|
8
|
|
9 XEmacs is free software; you can redistribute it and/or modify it
|
|
10 under the terms of the GNU General Public License as published by the
|
|
11 Free Software Foundation; either version 2, or (at your option) any
|
|
12 later version.
|
|
13
|
|
14 XEmacs is distributed in the hope that it will be useful, but WITHOUT
|
|
15 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
17 for more details.
|
|
18
|
|
19 You should have received a copy of the GNU General Public License
|
|
20 along with XEmacs; see the file COPYING. If not, write to
|
|
21 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
22 Boston, MA 02111-1307, USA. */
|
|
23
|
|
24 /* Synched up with: Mule 2.3. Not in FSF. */
|
|
25
|
771
|
26 /* Authorship:
|
|
27
|
|
28 Current primary author: Ben Wing <ben@xemacs.org>
|
|
29
|
|
30 Written by Ben Wing <ben@xemacs.org> for XEmacs, 1995, loosely based
|
|
31 on code written 91.10.09 by K.Handa <handa@etl.go.jp>.
|
|
32 Rewritten again 2000-2001 by Ben Wing to support properly
|
|
33 abstracted coding systems.
|
|
34 September 2001: Finished last part of abstraction, the detection
|
|
35 mechanism.
|
|
36 */
|
428
|
37
|
440
|
38 #ifndef INCLUDED_file_coding_h_
|
|
39 #define INCLUDED_file_coding_h_
|
428
|
40
|
771
|
41 /* Capsule description of the different structures, what their purpose is,
|
|
42 how they fit together, and where various bits of data are stored.
|
|
43
|
|
44 A "coding system" is an algorithm for converting data in one format into
|
|
45 data in another format. Currently most of the coding systems we have
|
|
46 created concern internationalized text, and convert between the XEmacs
|
|
47 internal format for multilingual text, and various external
|
|
48 representations of such text. However, any such conversion is possible,
|
|
49 for example, compressing or uncompressing text using the gzip algorithm.
|
|
50 All coding systems provide both encode and decode routines, so that the
|
|
51 conversion can go both ways.
|
|
52
|
|
53 The way we handle this is by dividing the various potential coding
|
|
54 systems into types, analogous to classes in C++. Each coding system
|
|
55 type encompasses a series of related coding systems that it can
|
|
56 implement, and it has properties which control how exactly the encoding
|
|
57 works. A particular set of values for each of the properties makes up a
|
|
58 "coding system", and specifies one particular encoding. A `struct
|
|
59 Lisp_Coding_System' object encapsulates those settings -- its type, the
|
|
60 values chosen for all properties of that type, a name for the coding
|
|
61 system, some documentation.
|
|
62
|
|
63 In addition, there are of course methods associated with a coding system
|
|
64 type, implementing the encoding, decoding, etc. These are stored in a
|
|
65 `struct coding_system_methods' object, one per coding-system type, which
|
|
66 contains mostly function pointers. This is retrievable from the
|
|
67 coding-system object (i.e. the struct Lisp_Coding_System), which has a
|
|
68 pointer to it.
|
|
69
|
|
70 In order to actually use a coding system to do an encoding or decoding
|
|
71 operation, you need to use a coding Lstream.
|
|
72
|
|
73 Now let's look more at attached data. All coding systems have certain
|
|
74 common data fields -- name, type, documentation, etc. -- as well as a
|
|
75 bunch more that are defined by the coding system type. To handle this
|
|
76 cleanly, each coding system type defines a structure that holds just the
|
|
77 fields of data particular to it, and calls it e.g. `struct
|
|
78 iso2022_coding_system' for coding system type `iso2022'. When the
|
|
79 memory block holding the coding system object is created, it is sized
|
|
80 such that it can hold both the struct Lisp_Coding_System and the struct
|
|
81 iso2022_coding_system (or whatever) directly following it. (This is a
|
|
82 common trick; another possibility is to have a void * pointer in the
|
|
83 struct Lisp_Coding_System, which points to another memory block holding
|
|
84 the struct iso2022_coding_system.) A macro is provided
|
|
85 (CODING_SYSTEM_TYPE_DATA) to retrieve a pointer of the right type to the
|
|
86 type-specific data contained within the overall `struct
|
|
87 Lisp_Coding_System' block.
|
|
88
|
|
89 Lstreams, similarly, are objects of type `struct lstream' holding data
|
|
90 about the stream operation (how much data has been read or written, any
|
|
91 buffered data, any error conditions, etc.), and like coding systems have
|
|
92 different types. They have a structure called `Lstream_implementation',
|
|
93 one per lstream type, exactly analogous to `struct
|
|
94 coding_system_methods'. In addition, they have type-specific data
|
|
95 (specifying, e.g., the file number, FILE *, memory location, other
|
|
96 lstream, etc. to read the data from or write it to, and for conversion
|
|
97 processes, the current state of the process -- are we decoding ASCII or
|
|
98 Kanji characters? are we in the middle of processing an escape
|
|
99 sequence? etc.). This type-specific data is stored in a structure
|
|
100 named `struct coding_stream'. Just like for coding systems, the
|
|
101 type-independent data in the `struct lstream' and the type-dependent
|
|
102 data in the `struct coding_stream' are stored together in the same
|
|
103 memory block.
|
428
|
104
|
771
|
105 Now things get a bit tricky. The `struct coding_stream' is
|
|
106 type-specific from the point of view of an lstream, but not from the
|
|
107 point of view of a coding system. It contains only general data about
|
|
108 the conversion process, e.g. the name of the coding system used for
|
|
109 conversion, the lstream that we take data from or write it to (depending
|
|
110 on whether this was created as a read stream or a write stream), a
|
|
111 buffer to hold extra data we retrieved but can't send on yet, some
|
|
112 flags, etc. It also needs some data specific to the particular coding
|
|
113 system and thus to the particular operation going on. This data is held
|
|
114 in a structure named (e.g.) `struct iso2022_coding_stream', and it's
|
|
115 held in a separate memory block and pointed to by the generic `struct
|
|
116 coding_stream'. It's not glommed into a single memory block both
|
|
117 because that would require making changes to the generic lstream code
|
|
118 and more importantly because the coding system used in a particular
|
|
119 coding lstream can be changed at any point during the lifetime of the
|
|
120 lstream, and possibly multiple times. (For example, it can be set using
|
|
121 the Lisp primitives `set-process-input-coding-system' and
|
|
122 `set-console-tty-input-coding-system', as well as getting set when a
|
|
123 conversion operation was started with coding system `undecided' and the
|
|
124 correct coding system was then detected.)
|
428
|
125
|
771
|
126 IMPORTANT NOTE: There are at least two ancillary data structures
|
|
127 associated with a coding system type. (There may also be detection data;
|
|
128 see elsewhere.) It's important, when writing a coding system type, to
|
|
129 keep straight which type of data goes where. In particular, `struct
|
|
130 foo_coding_system' is attached to the coding system object itself. This
|
|
131 is a permanent object and there's only one per coding system. It's
|
|
132 created once, usually at init time, and never destroyed. So, `struct
|
|
133 foo_coding_system' should in general not contain dynamic data! (Just
|
|
134 data describing the properties of the coding system.) In particular,
|
|
135 *NO* data about any conversion in progress. There may be many
|
|
136 conversions going on simultaneously using a particular coding system,
|
|
137 and by storing conversion data in the coding system, these conversions
|
|
138 will overwrite each other's data.
|
|
139
|
|
140 Instead, use the lstream object, whose purpose is to encapsulate a
|
|
141 particular conversion and all associated data. From the lstream object,
|
|
142 you can get the struct coding_stream using something like
|
|
143
|
|
144 struct coding_stream *str = LSTREAM_TYPE_DATA (lstr, coding);
|
|
145
|
|
146 But usually this structure is already passed to you as one of the
|
|
147 parameters of the method being invoked.
|
|
148
|
|
149 From the struct coding_stream, you can retrieve the
|
|
150 coding-system-type-specific data using something like
|
|
151
|
|
152 struct foo_coding_stream *data = CODING_STREAM_TYPE_DATA (str, foo);
|
|
153
|
|
154 Then, use this structure to hold all data relevant to the particular
|
|
155 conversion being done.
|
|
156
|
|
157 Initialize this structure whenever init_coding_stream_method is called
|
|
158 (this may happen more than once), and finalize it (free resources, etc.)
|
|
159 when finalize_coding_stream_method is called.
|
|
160 */
|
|
161
|
|
/* Forward declarations of structures that are used by, or defined
   later in, this header. */
struct coding_stream;
struct coding_system_methods;
struct detection_state;

/* Description of `struct coding_system_methods'; handed to
   dump_add_root_struct_ptr() by INITIALIZE_CODING_SYSTEM_TYPE. */
extern const struct struct_description coding_system_methods_description;
|
|
168
|
|
/* What kind of data (characters or bytes) sits at each end of a coding
   system.  This is the type returned by the conversion_end_type method
   of `struct coding_system_methods'. */
enum source_sink_type
{
  DECODES_CHARACTER_TO_BYTE = 0,
  DECODES_BYTE_TO_BYTE = 1,
  DECODES_BYTE_TO_CHARACTER = 2,
  DECODES_CHARACTER_TO_CHARACTER = 3
};
|
|
176
|
|
/* End-of-line conventions a coding system can request.

   The first three values double as indices into the three-element `eol'
   array of `struct Lisp_Coding_System' (see CODING_SYSTEM_EOL_LF,
   CODING_SYSTEM_EOL_CRLF and CODING_SYSTEM_EOL_CR), so their order and
   values must not change.

   No comma after the last enumerator: a trailing comma is not valid in
   C89, and the other enum in this header does not use one either. */
enum eol_type
{
  EOL_LF,          /* index 0 of `eol' */
  EOL_CRLF,        /* index 1 of `eol' */
  EOL_CR,          /* index 2 of `eol' */
  EOL_AUTODETECT   /* EOL type is to be autodetected; not an index */
};
|
|
184
|
|
185 struct Lisp_Coding_System
|
|
186 {
|
|
187 struct lcrecord_header header;
|
771
|
188 struct coding_system_methods *methods;
|
428
|
189
|
771
|
190 /* Name and description of this coding system. The description
|
|
191 should be suitable for a menu entry. */
|
440
|
192 Lisp_Object name;
|
771
|
193 Lisp_Object description;
|
428
|
194
|
|
195 /* Mnemonic string displayed in the modeline when this coding
|
|
196 system is active for a particular buffer. */
|
|
197 Lisp_Object mnemonic;
|
|
198
|
771
|
199 /* Long documentation on the coding system. */
|
|
200 Lisp_Object documentation;
|
|
201 /* Functions to handle additional conversion after reading or before
|
|
202 writing. #### This mechanism should be replaced by the ability to
|
|
203 simply create new coding system types. */
|
440
|
204 Lisp_Object post_read_conversion;
|
|
205 Lisp_Object pre_write_conversion;
|
428
|
206
|
771
|
207 /* If this coding system is not of the correct type for text file
|
|
208 conversion (i.e. decodes byte->char), we wrap it with appropriate
|
|
209 char<->byte converters. This is created dynamically, when it's
|
|
210 needed, and cached here. */
|
|
211 Lisp_Object text_file_wrapper;
|
|
212
|
|
213 /* If true, this is an internal coding system, which will not show up in
|
|
214 coding-system-list unless a special parameter is given to it. */
|
|
215 int internal_p;
|
|
216
|
|
217 /* ------------------------ junk to handle EOL -------------------------
|
|
218 I had hoped that we could handle this without lots of special-case
|
|
219 code, but it appears not to be the case if we want to maintain
|
|
220 compatibility with the existing way. However, at least with the way
|
|
221 we do things now, we avoid EOL junk in most of the coding system
|
|
222 methods themselves, or in the decode/encode functions. The EOL
|
|
223 special-case code is limited to coding-system creation and to the
|
|
224 convert-eol and undecided coding system types. */
|
|
225
|
|
226 /* If this coding system wants autodetection of the EOL type, then at the
|
|
227 appropriate time we wrap this coding system with
|
|
228 convert-eol-autodetect. (We do NOT do this at creation time because
|
|
229 then we end up with multiple convert-eols wrapped into the final
|
|
230 result -- esp. with autodetection using `undecided' -- leading to a
|
|
231 big mess.) We cache the wrapped coding system here. */
|
|
232 Lisp_Object auto_eol_wrapper;
|
|
233
|
|
234 /* Eol type requested by user. */
|
|
235 enum eol_type eol_type;
|
428
|
236
|
|
237 /* Subsidiary coding systems that specify a particular type of EOL
|
|
238 marking, rather than autodetecting it. These will only be non-nil
|
771
|
239 if (eol_type == EOL_AUTODETECT). These are chains. */
|
|
240 Lisp_Object eol[3];
|
|
241 /* If this coding system is a subsidiary, this element points back to its
|
|
242 parent. */
|
|
243 Lisp_Object subsidiary_parent;
|
428
|
244
|
771
|
245 /* At decoding or encoding time, we use the following coding system, if
|
|
246 it exists, in place of the coding system object. This is how we
|
|
247 handle coding systems with EOL types of CRLF or CR. Formerly, we did
|
|
248 the canonicalization at creation time, returning a chain in place of
|
|
249 the original coding system; but that interferes with
|
|
250 `coding-system-property' and causes other complications. CANONICAL is
|
|
251 used when determining the end types of a coding system.
|
|
252 canonicalize-after-coding also consults CANONICAL (it has to, because
|
|
253 the data in the lstream is based on CANONICAL, not on the original
|
|
254 coding system). */
|
|
255 Lisp_Object canonical;
|
|
256
|
|
257 /* type-specific extra data attached to a coding_system */
|
|
258 char data[1];
|
428
|
259 };
|
|
260 typedef struct Lisp_Coding_System Lisp_Coding_System;
|
|
261
|
440
|
/* Lrecord declaration for coding-system objects, followed by the
   unwrap / wrap / predicate / check macros layered on the generic
   record macros. */
DECLARE_LRECORD (coding_system, Lisp_Coding_System);
#define XCODING_SYSTEM(obj) XRECORD (obj, coding_system, Lisp_Coding_System)
#define wrap_coding_system(ptr) wrap_record (ptr, coding_system)
#define CODING_SYSTEMP(obj) RECORDP (obj, coding_system)
#define CHECK_CODING_SYSTEM(obj) CHECK_RECORD (obj, coding_system)
#define CONCHECK_CODING_SYSTEM(obj) CONCHECK_RECORD (obj, coding_system)
|
|
268
|
771
|
269 struct coding_system_methods
|
|
270 {
|
|
271 Lisp_Object type;
|
|
272 Lisp_Object predicate_symbol;
|
|
273
|
|
274 /* Implementation specific methods: */
|
|
275
|
|
276 /* Init method: Initialize coding-system data. Optional. */
|
|
277 void (*init_method) (Lisp_Object coding_system);
|
|
278
|
|
279 /* Mark method: Mark any Lisp objects in the type-specific data
|
|
280 attached to the coding-system object. Optional. */
|
|
281 void (*mark_method) (Lisp_Object coding_system);
|
|
282
|
|
283 /* Print method: Print the type-specific properties of this coding
|
|
284 system, as part of `print'-ing the object. If this method is defined
|
|
285 and prints anything, it should print a space as the first thing it
|
|
286 does. Optional. */
|
|
287 void (*print_method) (Lisp_Object cs, Lisp_Object printcharfun,
|
|
288 int escapeflag);
|
|
289
|
|
290 /* Canonicalize method: Convert this coding system to another one; called
|
|
291 once, at creation time, after all properties have been parsed. The
|
|
292 returned value should be a coding system created with
|
|
293 make_internal_coding_system() (passing the existing coding system as the
|
|
294 first argument), and will become the coding system returned by
|
|
295 `make-coding-system'. Optional.
|
|
296
|
|
297 NOTE: There are *three* different uses of "canonical" or "canonicalize"
|
|
298 w.r.t. coding systems, and it's important to keep them straight.
|
|
299
|
|
300 1. The canonicalize method. Used to specify a different coding
|
|
301 system, used when doing conversions, in place of the actual coding
|
|
302 system itself. Stored in the CANONICAL field of a coding system.
|
|
303
|
|
304 2. The canonicalize-after-coding method. Used to return the encoding
|
|
305 that was "actually" used to decode some text, such that this
|
|
306 particular encoding can be used to encode the text again with the
|
|
307 expectation that the result will be the same as the original encoding.
|
|
308 Particularly important with auto-detecting coding systems.
|
|
309
|
|
310 3. From the perspective of aliases, a "canonical" coding system is one
|
|
311 that's not an alias to some other coding system, and "canonicalization"
|
|
312 is the process of traversing the alias pointers to find the canonical
|
|
313 coding system that's equivalent to the alias.
|
|
314 */
|
|
315 Lisp_Object (*canonicalize_method) (Lisp_Object coding_system);
|
|
316
|
|
317 /* Canonicalize after coding method: Convert this coding system to
|
|
318 another one, after coding (usually decoding) has finished. This is
|
|
319 meant to be used by auto-detecting coding systems, which should return
|
|
320 the actually detected coding system. Optional. */
|
|
321 Lisp_Object (*canonicalize_after_coding_method)
|
|
322 (struct coding_stream *str);
|
|
323
|
|
324 /* Convert method: Decode or encode the data in SRC of size N, writing
|
|
325 the results into the Dynarr DST. If the conversion_end_type method
|
|
326 indicates that the source is characters (as opposed to bytes), you are
|
|
327 guaranteed to get only whole characters in the data in SRC/N. STR, a
|
|
328 struct coding_stream, stores all necessary state and other info about
|
|
329 the conversion. Coding-specific state (struct TYPE_coding_stream) can
|
|
330 be retrieved from STR using CODING_STREAM_TYPE_DATA(). Return value
|
|
331 indicates the number of bytes of the *INPUT* that were converted (not
|
|
332 the number of bytes written to the Dynarr!). This can be less than
|
|
333 the total amount of input passed in; if so, the remainder is
|
|
334 considered "rejected" and will appear again at the beginning of the
|
|
335 data passed in the next time the convert method is called. When EOF
|
|
336 is returned on the other end and there's no more data, the convert
|
|
337 method will be called one last time, STR->eof set and the passed-in
|
|
338 data will consist only of any rejected data from the previous
|
|
339 call. (At this point, file handles and similar resources can be
|
|
340 closed, but do NOT arbitrarily free data structures in the
|
|
341 type-specific data, because there are operations that can be done on
|
|
342 closed streams to query the results of the processing -- specifically,
|
|
343 for coding streams, there's the canonicalize_after_coding() method.)
|
|
344 Required. */
|
|
345 Bytecount (*convert_method) (struct coding_stream *str,
|
|
346 const unsigned char *src,
|
|
347 unsigned_char_dynarr *dst, Bytecount n);
|
|
348
|
|
349 /* Coding mark method: Mark any Lisp objects in the type-specific data
|
|
350 attached to `struct coding_stream'. Optional. */
|
|
351 void (*mark_coding_stream_method) (struct coding_stream *str);
|
|
352
|
|
353 /* Init coding stream method: Initialize the type-specific data attached
|
|
354 to the coding stream (i.e. in struct TYPE_coding_stream), when the
|
|
355 coding stream is opened. The type-specific data will be zeroed out.
|
|
356 Optional. */
|
|
357 void (*init_coding_stream_method) (struct coding_stream *str);
|
|
358
|
|
359 /* Rewind coding stream method: Reset any necessary type-specific data as
|
|
360 a result of the stream being rewound. Optional. */
|
|
361 void (*rewind_coding_stream_method) (struct coding_stream *str);
|
|
362
|
|
363 /* Finalize coding stream method: Clean up the type-specific data
|
|
364 attached to the coding stream (i.e. in struct TYPE_coding_stream).
|
|
365 Happens when the Lstream is deleted using Lstream_delete() or is
|
|
366 garbage-collected. Most streams are deleted after they've been used,
|
|
367 so it's less likely (but still possible) that allocated data will
|
|
368 stick around until GC time. (File handles can also be closed when EOF
|
|
369 is signalled; but some data must stick around after this point, for
|
|
370 the benefit of canonicalize_after_coding. See the convert method.)
|
|
371 Called only once (NOT called at disksave time). Optional. */
|
|
372 void (*finalize_coding_stream_method) (struct coding_stream *str);
|
|
373
|
|
374 /* Finalize method: Clean up type-specific data (e.g. free allocated
|
|
375 data) attached to the coding system (i.e. in struct
|
|
376 TYPE_coding_system), when the coding system is about to be garbage
|
|
377 collected. (Currently not called.) Called only once (NOT called at
|
|
378 disksave time). Optional. */
|
|
379 void (*finalize_method) (Lisp_Object codesys);
|
|
380
|
|
381 /* Conversion end type method: Does this coding system encode bytes ->
|
|
382 characters, characters -> characters, bytes -> bytes, or
|
|
383 characters -> bytes?. Default is characters -> bytes. Optional. */
|
|
384 enum source_sink_type (*conversion_end_type_method) (Lisp_Object codesys);
|
|
385
|
|
386 /* Putprop method: Set the value of a type-specific property. If
|
|
387 the property name is unrecognized, return 0. If the value is disallowed
|
|
388 or erroneous, signal an error. Currently called only at creation time.
|
|
389 Optional. */
|
|
390 int (*putprop_method) (Lisp_Object codesys,
|
|
391 Lisp_Object key,
|
|
392 Lisp_Object value);
|
|
393
|
|
394 /* Getprop method: Return the value of a type-specific property. If
|
|
395 the property name is unrecognized, return Qunbound. Optional.
|
|
396 */
|
|
397 Lisp_Object (*getprop_method) (Lisp_Object coding_system,
|
|
398 Lisp_Object prop);
|
|
399
|
|
400 /* These next three are set as part of the call to
|
|
401 INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA. */
|
|
402
|
|
403 /* Description of the extra data (struct foo_coding_system) attached to a
|
|
404 coding system, for pdump purposes. NOTE: All offsets must have
|
|
405 coding_system_data_offset added to them! */
|
|
406 const struct lrecord_description *extra_description;
|
|
407 /* size of struct foo_coding_system -- extra data associated with
|
|
408 the coding system */
|
|
409 int extra_data_size;
|
|
410 /* size of struct foo_coding_stream -- extra data associated with the
|
|
411 struct coding_stream, needed for each active coding process
|
|
412 using this coding system. note that we can have more than one
|
|
413 process active at once (simply by creating more than one coding
|
|
414 lstream using this coding system), so we can't store this data in
|
|
415 the coding system object. */
|
|
416 int coding_data_size;
|
|
417 };
|
|
418
|
|
419 /***** Calling a coding-system method *****/
|
|
420
|
|
/* The slot for method METH in the method table of CODESYS (a
   Lisp_Coding_System *). */
#define RAW_CODESYSMETH(codesys, meth) ((codesys)->methods->meth##_method)

/* 1 if CODESYS's type provides method METH, else 0. */
#define HAS_CODESYSMETH_P(codesys, meth) (!!RAW_CODESYSMETH (codesys, meth))

/* Call method METH of CODESYS unconditionally.  ARGLIST is the
   parenthesized argument list. */
#define CODESYSMETH(codesys, meth, arglist) \
  (((codesys)->methods->meth##_method) arglist)

/* Call a void-returning coding-system method, if it exists.  CODESYS is
   evaluated only once. */
#define MAYBE_CODESYSMETH(codesys, meth, arglist) \
  do \
    { \
      Lisp_Coding_System *maybe_codesysmeth_cs = (codesys); \
      if (HAS_CODESYSMETH_P (maybe_codesysmeth_cs, meth)) \
        CODESYSMETH (maybe_codesysmeth_cs, meth, arglist); \
    } \
  while (0)

/* Call a coding-system method, if it exists, or return GIVEN.
   NOTE: Multiply-evaluates CODESYS. */
#define CODESYSMETH_OR_GIVEN(codesys, meth, arglist, given) \
  (HAS_CODESYSMETH_P (codesys, meth) \
   ? CODESYSMETH (codesys, meth, arglist) \
   : (given))

/* The same three operations, starting from a Lisp_Object OBJ that holds
   a coding system. */
#define XCODESYSMETH(obj, meth, arglist) \
  CODESYSMETH (XCODING_SYSTEM (obj), meth, arglist)
#define MAYBE_XCODESYSMETH(obj, meth, arglist) \
  MAYBE_CODESYSMETH (XCODING_SYSTEM (obj), meth, arglist)
#define XCODESYSMETH_OR_GIVEN(obj, meth, arglist, given) \
  CODESYSMETH_OR_GIVEN (XCODING_SYSTEM (obj), meth, arglist, given)
|
|
444
|
|
445
|
|
446 /***** Defining new coding-system types *****/
|
|
447
|
|
448 #define coding_system_data_offset (offsetof (Lisp_Coding_System, data))
|
|
449 extern const struct lrecord_description coding_system_empty_extra_description[];
|
|
450
|
800
|
/* DECLARE_CODING_SYSTEM_TYPE (TYPE) declares the method-table pointer
   TYPE_coding_system_methods.  With ERROR_CHECK_TYPES it additionally
   defines three inline checking helpers:

   error_check_TYPE_coding_system_data (codesys)
     asserts that CODESYS is of type TYPE and that the type has extra
     data (this catches accidental use of INITIALIZE_CODING_SYSTEM_TYPE
     in place of INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA), then returns
     CODESYS's data as a struct TYPE_coding_system *.

   error_check_TYPE_coding_stream_data (cstr)
     asserts that the coding system of CSTR is of type TYPE, then
     returns CSTR's data as a struct TYPE_coding_stream *.

   error_check_TYPE_coding_system_type (lobj)
     unwraps LOBJ, asserts that it is of type TYPE and returns the
     Lisp_Coding_System pointer. */
#ifdef ERROR_CHECK_TYPES
#define DECLARE_CODING_SYSTEM_TYPE(type) \
 \
extern struct coding_system_methods * type##_coding_system_methods; \
 \
DECLARE_INLINE_HEADER ( \
struct type##_coding_system * \
error_check_##type##_coding_system_data (Lisp_Coding_System *codesys) \
) \
{ \
  assert (CODING_SYSTEM_TYPE_P (codesys, type)); \
  assert (codesys->methods->extra_data_size > 0); \
  return (struct type##_coding_system *) codesys->data; \
} \
 \
DECLARE_INLINE_HEADER ( \
struct type##_coding_stream * \
error_check_##type##_coding_stream_data (struct coding_stream *cstr) \
) \
{ \
  assert (XCODING_SYSTEM_TYPE_P (cstr->codesys, type)); \
  return (struct type##_coding_stream *) cstr->data; \
} \
 \
DECLARE_INLINE_HEADER ( \
Lisp_Coding_System * \
error_check_##type##_coding_system_type (Lisp_Object lobj) \
) \
{ \
  Lisp_Coding_System *codesys = XCODING_SYSTEM (lobj); \
  assert (CODING_SYSTEM_TYPE_P (codesys, type)); \
  return codesys; \
} \
 \
DECLARE_NOTHING
#else /* not ERROR_CHECK_TYPES */
#define DECLARE_CODING_SYSTEM_TYPE(type) \
extern struct coding_system_methods * type##_coding_system_methods
#endif /* ERROR_CHECK_TYPES */
|
771
|
491
|
|
/* Define the method-table pointer for coding-system type TYPE; the
   counterpart of DECLARE_CODING_SYSTEM_TYPE. */
#define DEFINE_CODING_SYSTEM_TYPE(type) \
  struct coding_system_methods * type##_coding_system_methods

/* Create the zeroed method table for type TY, fill in its type (QTY),
   its (empty) extra description and its predicate symbol PRED_SYM, add
   it to the coding-system type list and register the pointer as a dump
   root. */
#define INITIALIZE_CODING_SYSTEM_TYPE(ty, pred_sym) \
  do \
    { \
      ty##_coding_system_methods = \
        xnew_and_zero (struct coding_system_methods); \
      ty##_coding_system_methods->extra_description = \
        coding_system_empty_extra_description; \
      ty##_coding_system_methods->type = Q##ty; \
      defsymbol_nodump (&ty##_coding_system_methods->predicate_symbol, \
                        pred_sym); \
      add_entry_to_coding_system_type_list (ty##_coding_system_methods); \
      dump_add_root_struct_ptr (&ty##_coding_system_methods, \
                                &coding_system_methods_description); \
    } \
  while (0)

/* Re-staticpro (without dumping) the predicate symbol of type TYPE. */
#define REINITIALIZE_CODING_SYSTEM_TYPE(type) \
  do \
    { \
      staticpro_nodump (&type##_coding_system_methods->predicate_symbol); \
    } \
  while (0)
|
|
511
|
|
512 /* This assumes the existence of the following three definitions:
|
|
513
|
|
514 struct foo_coding_system (attached to the coding system)
|
|
515 struct foo_coding_stream (per coding process, attached to the
|
|
516 struct coding_stream)
|
|
517 const struct foo_coding_system_description[] (pdump description of
|
|
518 struct foo_coding_system)
|
|
519
|
|
520 NOTE: The description must have coding_system_data_offset added to
|
|
521 all offsets in it! For an example of how to do things, see
|
|
522 chain_coding_system_description.
|
|
523 */
|
|
/* Like INITIALIZE_CODING_SYSTEM_TYPE, but additionally records the pdump
   description TYPE_coding_system_description and the sizes of
   struct TYPE_coding_system and struct TYPE_coding_stream. */
#define INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA(type, pred_sym) \
  do \
    { \
      INITIALIZE_CODING_SYSTEM_TYPE (type, pred_sym); \
      type##_coding_system_methods->extra_description = \
        type##_coding_system_description; \
      type##_coding_system_methods->extra_data_size = \
        sizeof (struct type##_coding_system); \
      type##_coding_system_methods->coding_data_size = \
        sizeof (struct type##_coding_stream); \
    } \
  while (0)
|
|
534
|
|
/* Install the function TY_SLOT as the SLOT method of coding-system type
   TY, i.e. declare that the type has that method.  For use in
   initialization routines. */
#define CODING_SYSTEM_HAS_METHOD(ty, slot) \
  (ty##_coding_system_methods->slot##_method = ty##_##slot)
|
|
539
|
|
540 /***** Macros for accessing coding-system types *****/
|
|
541
|
|
/* True if CODESYS (a Lisp_Coding_System *) uses the method table of
   coding-system type TY. */
#define CODING_SYSTEM_TYPE_P(codesys, ty) \
  (ty##_coding_system_methods == (codesys)->methods)

/* The same test for a Lisp_Object OBJ that holds a coding system. */
#define XCODING_SYSTEM_TYPE_P(obj, ty) \
  CODING_SYSTEM_TYPE_P (XCODING_SYSTEM (obj), ty)
|
|
546
|
800
|
/* Pointer to the type-specific data (struct TY_coding_system) of
   CODESYS, a Lisp_Coding_System *.  With ERROR_CHECK_TYPES the access
   goes through the checking helper created by
   DECLARE_CODING_SYSTEM_TYPE. */
#ifdef ERROR_CHECK_TYPES
# define CODING_SYSTEM_TYPE_DATA(codesys, ty) \
   error_check_##ty##_coding_system_data (codesys)
#else
# define CODING_SYSTEM_TYPE_DATA(codesys, ty) \
   ((struct ty##_coding_system *) (codesys)->data)
#endif /* ERROR_CHECK_TYPES */

/* The same, starting from a Lisp_Object OBJ that holds a coding
   system. */
#define XCODING_SYSTEM_TYPE_DATA(obj, ty) \
  CODING_SYSTEM_TYPE_DATA (XCODING_SYSTEM_OF_TYPE (obj, ty), ty)
|
|
558
|
800
|
/* XCODING_SYSTEM_OF_TYPE (x, type): unwrap the Lisp_Object X into a
   Lisp_Coding_System *.  With ERROR_CHECK_TYPES this goes through the
   checking helper created by DECLARE_CODING_SYSTEM_TYPE, which asserts
   that the coding system is of type TYPE.

   XSETCODING_SYSTEM_OF_TYPE (x, p, type): store the coding system P
   (a Lisp_Coding_System *) into the Lisp_Object lvalue X.  With
   ERROR_CHECK_TYPES, assert afterwards that it is of type TYPE.

   The assertion uses CODING_SYSTEM_TYPE_P, which takes a
   Lisp_Coding_System *.  CODING_SYSTEM_TYPEP must not be used here with
   an unwrapped pointer: it expects a Lisp_Object and unwraps it
   itself. */
#ifdef ERROR_CHECK_TYPES
# define XCODING_SYSTEM_OF_TYPE(x, type) \
   error_check_##type##_coding_system_type (x)
# define XSETCODING_SYSTEM_OF_TYPE(x, p, type) do \
{ \
  x = wrap_coding_system (p); \
  assert (CODING_SYSTEM_TYPE_P (XCODING_SYSTEM (x), type)); \
} while (0)
#else
# define XCODING_SYSTEM_OF_TYPE(x, type) XCODING_SYSTEM (x)
# define XSETCODING_SYSTEM_OF_TYPE(x, p, type) do \
{ \
  x = wrap_coding_system (p); \
} while (0)
#endif /* ERROR_CHECK_TYPES */
|
|
574
|
|
/* True if the Lisp_Object OBJ is a coding system of type TY.  (Compare
   CODING_SYSTEM_TYPE_P, which takes a Lisp_Coding_System *.) */
#define CODING_SYSTEM_TYPEP(obj, ty) \
  (CODING_SYSTEMP (obj) && CODING_SYSTEM_TYPE_P (XCODING_SYSTEM (obj), ty))

/* Check that X is a coding system of type TYPE; otherwise call
   dead_wrong_type_argument() with the type's predicate symbol. */
#define CHECK_CODING_SYSTEM_OF_TYPE(x, type) \
  do \
    { \
      CHECK_CODING_SYSTEM (x); \
      if (!CODING_SYSTEM_TYPE_P (XCODING_SYSTEM (x), type)) \
        dead_wrong_type_argument \
          (type##_coding_system_methods->predicate_symbol, x); \
    } \
  while (0)

/* Like CHECK_CODING_SYSTEM_OF_TYPE, but on a mismatch assign to X the
   value returned by wrong_type_argument(), so X must be an lvalue. */
#define CONCHECK_CODING_SYSTEM_OF_TYPE(x, type) \
  do \
    { \
      CONCHECK_CODING_SYSTEM (x); \
      if (!(CODING_SYSTEM_TYPEP (x, type))) \
        x = wrong_type_argument \
          (type##_coding_system_methods->predicate_symbol, x); \
    } \
  while (0)
|
|
589
|
|
/* Field accessors for a Lisp_Coding_System pointer CS.  Each expands to
   a plain member access. */
#define CODING_SYSTEM_METHODS(cs) ((cs)->methods)
#define CODING_SYSTEM_NAME(cs) ((cs)->name)
#define CODING_SYSTEM_DESCRIPTION(cs) ((cs)->description)
#define CODING_SYSTEM_TYPE(cs) ((cs)->methods->type)
#define CODING_SYSTEM_MNEMONIC(cs) ((cs)->mnemonic)
#define CODING_SYSTEM_DOCUMENTATION(cs) ((cs)->documentation)
#define CODING_SYSTEM_POST_READ_CONVERSION(cs) ((cs)->post_read_conversion)
#define CODING_SYSTEM_PRE_WRITE_CONVERSION(cs) ((cs)->pre_write_conversion)
#define CODING_SYSTEM_EOL_TYPE(cs) ((cs)->eol_type)

/* Subsidiary coding systems, one per explicit EOL type. */
#define CODING_SYSTEM_EOL_LF(cs) ((cs)->eol[EOL_LF])
#define CODING_SYSTEM_EOL_CRLF(cs) ((cs)->eol[EOL_CRLF])
#define CODING_SYSTEM_EOL_CR(cs) ((cs)->eol[EOL_CR])

#define CODING_SYSTEM_TEXT_FILE_WRAPPER(cs) ((cs)->text_file_wrapper)
#define CODING_SYSTEM_AUTO_EOL_WRAPPER(cs) ((cs)->auto_eol_wrapper)
#define CODING_SYSTEM_SUBSIDIARY_PARENT(cs) ((cs)->subsidiary_parent)
#define CODING_SYSTEM_CANONICAL(cs) ((cs)->canonical)

/* Accessors for the type-specific data of a `chain' coding system. */
#define CODING_SYSTEM_CHAIN_CHAIN(cs) \
  (CODING_SYSTEM_TYPE_DATA (cs, chain)->chain)
#define CODING_SYSTEM_CHAIN_COUNT(cs) \
  (CODING_SYSTEM_TYPE_DATA (cs, chain)->count)
#define CODING_SYSTEM_CHAIN_CANONICALIZE_AFTER_CODING(cs) \
  (CODING_SYSTEM_TYPE_DATA (cs, chain)->canonicalize_after_coding)
|
428
|
615
|
771
|
/* The same accessors for a Lisp_Object OBJ that holds a coding system:
   each one unwraps OBJ with XCODING_SYSTEM and applies the matching
   CODING_SYSTEM_* accessor. */
#define XCODING_SYSTEM_METHODS(obj) \
  CODING_SYSTEM_METHODS (XCODING_SYSTEM (obj))
#define XCODING_SYSTEM_NAME(obj) \
  CODING_SYSTEM_NAME (XCODING_SYSTEM (obj))
#define XCODING_SYSTEM_DESCRIPTION(obj) \
  CODING_SYSTEM_DESCRIPTION (XCODING_SYSTEM (obj))
#define XCODING_SYSTEM_TYPE(obj) \
  CODING_SYSTEM_TYPE (XCODING_SYSTEM (obj))
#define XCODING_SYSTEM_MNEMONIC(obj) \
  CODING_SYSTEM_MNEMONIC (XCODING_SYSTEM (obj))
#define XCODING_SYSTEM_DOCUMENTATION(obj) \
  CODING_SYSTEM_DOCUMENTATION (XCODING_SYSTEM (obj))
#define XCODING_SYSTEM_POST_READ_CONVERSION(obj) \
  CODING_SYSTEM_POST_READ_CONVERSION (XCODING_SYSTEM (obj))
#define XCODING_SYSTEM_PRE_WRITE_CONVERSION(obj) \
  CODING_SYSTEM_PRE_WRITE_CONVERSION (XCODING_SYSTEM (obj))
#define XCODING_SYSTEM_EOL_TYPE(obj) \
  CODING_SYSTEM_EOL_TYPE (XCODING_SYSTEM (obj))
#define XCODING_SYSTEM_EOL_LF(obj) \
  CODING_SYSTEM_EOL_LF (XCODING_SYSTEM (obj))
#define XCODING_SYSTEM_EOL_CRLF(obj) \
  CODING_SYSTEM_EOL_CRLF (XCODING_SYSTEM (obj))
#define XCODING_SYSTEM_EOL_CR(obj) \
  CODING_SYSTEM_EOL_CR (XCODING_SYSTEM (obj))
#define XCODING_SYSTEM_TEXT_FILE_WRAPPER(obj) \
  CODING_SYSTEM_TEXT_FILE_WRAPPER (XCODING_SYSTEM (obj))
#define XCODING_SYSTEM_AUTO_EOL_WRAPPER(obj) \
  CODING_SYSTEM_AUTO_EOL_WRAPPER (XCODING_SYSTEM (obj))
#define XCODING_SYSTEM_SUBSIDIARY_PARENT(obj) \
  CODING_SYSTEM_SUBSIDIARY_PARENT (XCODING_SYSTEM (obj))
#define XCODING_SYSTEM_CANONICAL(obj) \
  CODING_SYSTEM_CANONICAL (XCODING_SYSTEM (obj))

/* Lisp_Object variants of the `chain' accessors. */
#define XCODING_SYSTEM_CHAIN_CHAIN(obj) \
  CODING_SYSTEM_CHAIN_CHAIN (XCODING_SYSTEM (obj))
#define XCODING_SYSTEM_CHAIN_COUNT(obj) \
  CODING_SYSTEM_CHAIN_COUNT (XCODING_SYSTEM (obj))
#define XCODING_SYSTEM_CHAIN_CANONICALIZE_AFTER_CODING(obj) \
  CODING_SYSTEM_CHAIN_CANONICALIZE_AFTER_CODING (XCODING_SYSTEM (obj))
|
428
|
655
|
771
|
656 /**************************************************/
|
|
657 /* Detection */
|
|
658 /**************************************************/
|
428
|
659
|
771
|
/* Capacity of the `categories' array in struct detection_state. */
#define MAX_DETECTOR_CATEGORIES 256
/* Capacity of the `data_offset' array in struct detection_state. */
#define MAX_DETECTORS 64

/* NOTE(review): presumably the upper bound on how many bytes of input the
   detection code examines -- confirm against its users. */
#define MAX_BYTES_PROCESSED_FOR_DETECTION 65536
|
428
|
664
|
771
|
665 struct detection_state
|
428
|
666 {
|
771
|
667 int seen_non_ascii;
|
|
668 Bytecount bytes_seen;
|
428
|
669
|
771
|
670 char categories[MAX_DETECTOR_CATEGORIES];
|
|
671 Bytecount data_offset[MAX_DETECTORS];
|
|
672 /* ... more data follows; data_offset[detector_##TYPE] points to
|
|
673 the data for that type */
|
428
|
674 };
|
|
675
|
771
|
/* Pointer to the private data of detector TYPE inside detection state ST:
   the address of ST advanced by ST->data_offset[detector_TYPE] bytes, cast
   to `struct TYPE_detector *'.  ST is evaluated twice. */
#define DETECTION_STATE_DATA(st, type)                                  \
  ((struct type##_detector *)                                           \
   ((char *) (st) + (st)->data_offset[detector_##type]))
|
428
|
679
|
448
|
680 /* Distinguishable categories of encodings.
|
|
681
|
|
682 This list determines the initial priority of the categories.
|
|
683
|
|
684 For better or worse, currently Mule files are encoded in 7-bit ISO 2022.
|
|
685 For this reason, under Mule ISO_7 gets highest priority.
|
|
686
|
|
687 Putting NO_CONVERSION second prevents "binary corruption" in the
|
|
688 default case in all but the (presumably) extremely rare case of a
|
|
689 binary file which contains redundant escape sequences but no 8-bit
|
|
690 characters.
|
|
691
|
|
692 The remaining priorities are based on perceived "internationalization
|
|
693 political correctness." An exception is UCS-4 at the bottom, since
|
|
694 basically everything is compatible with UCS-4, but it is likely to
|
|
695 be very rare as an external encoding. */
|
|
696
|
771
|
/* Codes of the control characters that carry ISO 2022 functions.  They
   are also used by the detection routines of other coding system types. */
#define ISO_CODE_LF     0x0A    /* line-feed */
#define ISO_CODE_CR     0x0D    /* carriage-return */
#define ISO_CODE_SO     0x0E    /* shift-out */
#define ISO_CODE_SI     0x0F    /* shift-in */
#define ISO_CODE_ESC    0x1B    /* escape */
#define ISO_CODE_DEL    0x7F    /* delete */
#define ISO_CODE_SS2    0x8E    /* single-shift-2 */
#define ISO_CODE_SS3    0x8F    /* single-shift-3 */
#define ISO_CODE_CSI    0x9B    /* control-sequence-introducer */
|
|
709
|
|
/* Confidence levels a detector can report for a category, from
   DET_HIGHEST (4) down to DET_LOWEST (-3). */
enum detection_result
{
  /* Basically means a magic cookie was seen indicating this type, or
     something similar. */
  DET_NEAR_CERTAINTY = 4,
  /* Alias for the top of the scale. */
  DET_HIGHEST = DET_NEAR_CERTAINTY,

  /* Characteristics seen that are unlikely to be other coding system
     types -- e.g. ISO-2022 escape sequences, or perhaps a consistent
     pattern of alternating zero bytes in UTF-16, along with Unicode LF or
     CRLF sequences at regular intervals.  (Zero bytes are unlikely or
     impossible in most text encodings.) */
  DET_QUITE_PROBABLE = 3,

  /* Strong or medium statistical likelihood.  At least some
     characteristics seen that match what's normally found in this
     encoding -- e.g. in Shift-JIS, a number of two-byte Japanese character
     sequences in the right range, and nothing out of range; or in Unicode,
     much higher statistical variance in the odd bytes than in the even
     bytes, or vice-versa (perhaps the presence of regular EOL sequences
     would bump this too to DET_QUITE_PROBABLE).  This is quite often a
     statistical test. */
  DET_SOMEWHAT_LIKELY = 2,

  /* Weak statistical likelihood.  Pretty much any features at all that
     characterize this encoding, and nothing that rules against it. */
  DET_SLIGHTLY_LIKELY = 1,

  /* Default state.  Perhaps it indicates pure ASCII or something
     similarly vague seen in Shift-JIS, or, exactly as the level says, it
     might mean in a statistical-based detector that the pros and cons are
     balanced out.  This is also the lowest level that will be accepted by
     the auto-detector without asking the user: If all available detectors
     report lower levels for all categories with attached coding systems,
     the user will be shown the results and explicitly prompted for action.
     The user will also be prompted if this is the highest available level
     and more than one detector reports the level.  (See below about the
     consequent necessity of an "ASCII" detector, which will return level 1
     or higher for most plain text files.) */
  DET_AS_LIKELY_AS_UNLIKELY = 0,

  /* Some characteristics seen that are unusual for this encoding --
     e.g. unusual control characters in a plain-text encoding, lots of
     8-bit characters, or little statistical variance in the odd and even
     bytes in UTF-16. */
  DET_SOMEWHAT_UNLIKELY = -1,

  /* This indicates that there is very little chance the data is in the
     right format; this is probably the lowest level you can get when
     presenting random binary data to a text file, because there are no
     "specific sequences" you can see that would totally rule out
     recognition. */
  DET_QUITE_IMPROBABLE = -2,

  /* An erroneous sequence was seen. */
  DET_NEARLY_IMPOSSIBLE = -3,
  /* Alias for the bottom of the scale. */
  DET_LOWEST = DET_NEARLY_IMPOSSIBLE
};
|
|
761
|
|
/* Running counters from which detector and detector-category ids are
   allocated: INITIALIZE_DETECTOR post-increments the first,
   INITIALIZE_DETECTOR_CATEGORY the second. */
extern int coding_detector_count;
extern int coding_detector_category_count;
|
|
764
|
|
765 struct detector_category
|
428
|
766 {
|
771
|
767 int id;
|
|
768 Lisp_Object sym;
|
|
769 };
|
|
770
|
|
771 typedef struct
|
|
772 {
|
|
773 Dynarr_declare (struct detector_category);
|
|
774 } detector_category_dynarr;
|
|
775
|
|
776 struct detector
|
|
777 {
|
|
778 int id;
|
|
779 detector_category_dynarr *cats;
|
|
780 Bytecount data_size;
|
|
781 /* Detect method: Required. */
|
|
782 void (*detect_method) (struct detection_state *st,
|
|
783 const unsigned char *src, Bytecount n);
|
|
784 /* Finalize detection state method: Clean up any allocated data in the
|
|
785 detection state. Called only once (NOT called at disksave time).
|
|
786 Optional. */
|
|
787 void (*finalize_detection_state_method) (struct detection_state *st);
|
428
|
788 };
|
|
789
|
771
|
/* Lvalue holding the detection result for category CAT in detection
   state ST. */
#define DET_RESULT(st, cat)                                             \
  ((st)->categories[detector_category_##cat])

/* In detection state ST, set every detection result that belongs to
   detector DET to RESULT. */
#define SET_DET_RESULTS(st, det, result)                                \
  set_detection_results (st, detector_##det, result)
|
|
797
|
|
798 typedef struct
|
|
799 {
|
|
800 Dynarr_declare (struct detector);
|
|
801 } detector_dynarr;
|
|
802
|
|
803 extern detector_dynarr *all_coding_detectors;
|
|
804
|
|
/* Category CAT is identified by the int variable detector_category_CAT.
   DEFINE_DETECTOR_CATEGORY defines that variable and
   DECLARE_DETECTOR_CATEGORY declares it `extern'; neither uses its
   DETECTOR argument. */
#define DEFINE_DETECTOR_CATEGORY(detector, cat)                         \
  int detector_category_##cat
#define DECLARE_DETECTOR_CATEGORY(detector, cat)                        \
  extern int detector_category_##cat

/* Register category CAT under detector DETECTOR: take the next id from
   coding_detector_category_count, hand the id variable to
   dump_add_opaque_int (), and append an entry carrying the id and the
   symbol Q##CAT to the `cats' array of DETECTOR's entry in
   all_coding_detectors. */
#define INITIALIZE_DETECTOR_CATEGORY(detector, cat)                     \
do {                                                                    \
  struct detector_category category_entry;                              \
                                                                        \
  xzero (category_entry);                                               \
  detector_category_##cat = coding_detector_category_count++;           \
  dump_add_opaque_int (&detector_category_##cat);                       \
  category_entry.id = detector_category_##cat;                          \
  category_entry.sym = Q##cat;                                          \
  Dynarr_add (Dynarr_at (all_coding_detectors,                          \
                         detector_##detector).cats,                     \
              category_entry);                                          \
} while (0)
|
|
820
|
|
/* Detector DETECTOR is identified by the int variable detector_DETECTOR.
   DEFINE_DETECTOR defines that variable; DECLARE_DETECTOR declares it
   `extern'. */
#define DEFINE_DETECTOR(Detector)                                       \
  int detector_##Detector
#define DECLARE_DETECTOR(Detector)                                      \
  extern int detector_##Detector

/* Register detector DETECTOR: take the next id from
   coding_detector_count, hand the id variable to dump_add_opaque_int (),
   and append to all_coding_detectors a zeroed entry carrying the id, a
   fresh category array and the size of struct DETECTOR_detector. */
#define INITIALIZE_DETECTOR(Detector)                                   \
do {                                                                    \
  struct detector new_entry;                                            \
                                                                        \
  xzero (new_entry);                                                    \
  detector_##Detector = coding_detector_count++;                        \
  dump_add_opaque_int (&detector_##Detector);                           \
  new_entry.id = detector_##Detector;                                   \
  new_entry.cats = Dynarr_new2 (detector_category_dynarr,               \
                                struct detector_category);              \
  new_entry.data_size = sizeof (struct Detector##_detector);            \
  Dynarr_add (all_coding_detectors, new_entry);                         \
} while (0)

/* Store the function DETECTOR_METH in the METH_method slot of DETECTOR's
   entry in all_coding_detectors.  Expands to an unparenthesized
   assignment expression. */
#define DETECTOR_HAS_METHOD(Detector, Meth)                             \
  Dynarr_at (all_coding_detectors, detector_##Detector).Meth##_method = \
    Detector##_##Meth
|
771
|
840
|
|
841
|
|
842 /**************************************************/
|
|
843 /* Decoding/Encoding */
|
|
844 /**************************************************/
|
|
845
|
|
/* Names one end of a conversion -- the source or the sink -- e.g. when
   asking whether that end, when encoding, is specified in characters.
   NOTE(review): the earlier wording of this comment spoke of
   SOURCEP == 1 for the source and SOURCEP == 0 for the sink; the
   enumerators below are CODING_SOURCE == 0 and CODING_SINK == 1, so
   confirm which convention callers rely on. */

enum source_or_sink
{
  CODING_SOURCE = 0,
  CODING_SINK = 1
};

/* Direction of a conversion. */
enum encode_decode
{
  CODING_ENCODE = 0,
  CODING_DECODE = 1
};
|
|
860
|
|
861 /* Data structure attached to an lstream of type `coding',
|
|
862 containing values specific to the coding process. Additional
|
|
863 data is stored in the DATA field below; the exact form of that data
|
|
864 is controlled by the type of the coding system that governs the
|
|
865 conversion (field CODESYS). CODESYS may be set at any time
|
|
866 throughout the lifetime of the lstream and possibly more than once.
|
|
867 See long comment above for more info. */
|
|
868
|
|
869 struct coding_stream
|
|
870 {
|
|
871 /* Coding system that governs the conversion. */
|
|
872 Lisp_Object codesys;
|
|
873 /* Original coding system, pre-canonicalization. */
|
|
874 Lisp_Object orig_codesys;
|
|
875
|
|
876 /* Back pointer to current stream. */
|
|
877 Lstream *us;
|
|
878
|
|
879 /* Stream that we read the unprocessed data from or write the processed
|
|
880 data to. */
|
|
881 Lstream *other_end;
|
|
882
|
|
883 /* In order to handle both reading to and writing from a coding stream,
|
|
884 we phrase the conversion methods like write methods -- we can
|
|
885 implement reading in terms of a write method but not vice-versa,
|
|
886 because the write method is forced to take only what it's given but
|
|
887 the read method can read more data from the other end if necessary.
|
|
888 On the other hand, the write method is free to generate all the data
|
|
889 it wants (and just write it to the other end), but the the read method
|
|
890 can return only as much as was asked for, so we need to implement our
|
|
891 own buffering. */
|
|
892
|
|
893 /* If we are reading, then we can return only a fixed amount of data, but
|
|
894 the converter is free to return as much as it wants, so we direct it
|
|
895 to store the data here and lop off chunks as we need them. If we are
|
|
896 writing, we use this because the converter takes a Dynarr but we are
|
|
897 supposed to write into a fixed buffer. (NOTE: This introduces an extra
|
|
898 memory copy.) */
|
|
899 unsigned_char_dynarr *convert_to;
|
|
900
|
|
901 /* The conversion method might reject some of the data -- this typically
|
|
902 includes partial characters, partial escape sequences, etc. When
|
|
903 writing, we just pass the rejection up to the Lstream module, and it
|
|
904 will buffer the data. When reading, however, we need to do the
|
|
905 buffering ourselves, and we put it here, combined with newly read
|
|
906 data. */
|
|
907 unsigned_char_dynarr *convert_from;
|
|
908
|
|
909 /* If set, this is the last chunk of data being processed. When this is
|
|
910 finished, output any necessary terminating control characters, escape
|
|
911 sequences, etc. */
|
|
912 unsigned int eof:1;
|
|
913
|
|
914 /* CH holds a partially built-up character. This is really part of the
|
|
915 state-dependent data and should be moved there. */
|
|
916 unsigned int ch;
|
|
917
|
|
918 /* Coding-system-specific data holding extra state about the
|
|
919 conversion. Logically a struct TYPE_coding_stream; a pointer
|
800
|
920 to such a struct, with (when ERROR_CHECK_TYPES is defined)
|
771
|
921 error-checking that this is really a structure of that type
|
|
922 (checking the corresponding coding system type) can be retrieved using
|
|
923 CODING_STREAM_TYPE_DATA(). Allocated at the same time that
|
|
924 CODESYS is set (which may occur at any time, even multiple times,
|
|
925 during the lifetime of the stream). The size comes from
|
|
926 methods->coding_data_size. */
|
|
927 void *data;
|
|
928
|
|
929 enum encode_decode direction;
|
|
930
|
800
|
931 /* If set, don't close the stream at the other end when being closed. */
|
|
932 unsigned int no_close_other:1;
|
802
|
933 /* If set, read only one byte at a time from other end to avoid any
|
|
934 possible blocking. */
|
|
935 unsigned int one_byte_at_a_time:1;
|
814
|
936 /* If set, and we're a read stream, we init char mode on ourselves as
|
|
937 necessary to prevent the caller from getting partial characters. (the
|
|
938 default) */
|
|
939 unsigned int set_char_mode_on_us_when_reading:1;
|
800
|
940
|
771
|
941 /* #### Temporary test */
|
|
942 unsigned int finalized:1;
|
|
943 };
|
|
944
|
|
/* The struct coding_stream attached to lstream STREAM. */
#define CODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, coding)

/* The `data' member of coding stream S, viewed as a
   `struct TYPE_coding_stream *'.  With ERROR_CHECK_TYPES the access goes
   through error_check_TYPE_coding_stream_data () instead of a plain
   cast. */
#ifndef ERROR_CHECK_TYPES
# define CODING_STREAM_TYPE_DATA(s, type) \
  ((struct type##_coding_stream *) (s)->data)
#else
# define CODING_STREAM_TYPE_DATA(s, type) \
  error_check_##type##_coding_stream_data (s)
#endif
|
|
954
|
|
/* C should be a binary character in the range 0 - 255; convert
   to internal format and add to Dynarr DST.

   Under MULE, a byte accepted by byte_ascii_p () is added unchanged; a
   byte accepted by byte_c1_p () is added as LEADING_BYTE_CONTROL_1
   followed by C + 0x20; any other byte is added as
   LEADING_BYTE_LATIN_ISO8859_1 followed by C.  Without MULE the byte is
   added unchanged.

   C may be evaluated more than once, so it must not have side effects.
   It is parenthesized where it takes part in arithmetic so that an
   argument containing a lower-precedence operator (e.g. `a | b') is
   handled correctly. */

#ifdef MULE

#define DECODE_ADD_BINARY_CHAR(c, dst)                          \
do {                                                            \
  if (byte_ascii_p (c))                                         \
    Dynarr_add (dst, c);                                        \
  else if (byte_c1_p (c))                                       \
    {                                                           \
      Dynarr_add (dst, LEADING_BYTE_CONTROL_1);                 \
      Dynarr_add (dst, (c) + 0x20);                             \
    }                                                           \
  else                                                          \
    {                                                           \
      Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1);           \
      Dynarr_add (dst, c);                                      \
    }                                                           \
} while (0)

#else /* not MULE */

#define DECODE_ADD_BINARY_CHAR(c, dst)                          \
do {                                                            \
  Dynarr_add (dst, c);                                          \
} while (0)

#endif /* MULE */
|
|
984
|
|
/* If CH is nonzero, add it to Dynarr DST by way of
   DECODE_ADD_BINARY_CHAR and then reset it to 0.  CH must be a
   modifiable lvalue and is evaluated more than once. */
#define DECODE_OUTPUT_PARTIAL_CHAR(ch, dst)     \
do {                                            \
  if (ch)                                       \
    {                                           \
      DECODE_ADD_BINARY_CHAR (ch, dst);         \
      ch = 0;                                   \
    }                                           \
} while (0)
|
428
|
993
|
|
994 #ifdef MULE
|
|
/* Convert the Shift-JIS code (SJ1, SJ2) into the internal string
   representation (C1, C2).  (The leading byte is assumed.)  SJ1 and SJ2
   are each evaluated once; C1 and C2 must be assignable. */

#define DECODE_SHIFT_JIS(sj1, sj2, c1, c2)                              \
do {                                                                    \
  int sjis_lead_ = (sj1);                                               \
  int sjis_trail_ = (sj2);                                              \
  int sjis_high_ = (sjis_lead_ >= 0xe0);                                \
                                                                        \
  if (sjis_trail_ < 0x9f)                                               \
    {                                                                   \
      (c1) = (sjis_lead_ << 1) - (sjis_high_ ? 0xe1 : 0x61);            \
      (c2) = sjis_trail_ + ((sjis_trail_ >= 0x7f) ? 0x60 : 0x61);       \
    }                                                                   \
  else                                                                  \
    {                                                                   \
      (c1) = (sjis_lead_ << 1) - (sjis_high_ ? 0xe0 : 0x60);            \
      (c2) = sjis_trail_ + 2;                                           \
    }                                                                   \
} while (0)
|
|
1008
|
|
/* Convert the internal string representation of a Shift-JIS character
   (C1, C2) into the Shift-JIS code (SJ1, SJ2).  The leading byte is
   assumed.  C1 and C2 are each evaluated once; SJ1 and SJ2 must be
   assignable. */

#define ENCODE_SHIFT_JIS(c1, c2, sj1, sj2)                              \
do {                                                                    \
  int sjis_row_ = (c1);                                                 \
  int sjis_col_ = (c2);                                                 \
  int sjis_base_ = (sjis_row_ < 0xdf) ? 0x30 : 0x70;                    \
                                                                        \
  if (sjis_row_ & 1)                                                    \
    {                                                                   \
      (sj1) = (sjis_row_ >> 1) + sjis_base_ + 1;                        \
      (sj2) = sjis_col_ - ((sjis_col_ >= 0xe0) ? 0x60 : 0x61);          \
    }                                                                   \
  else                                                                  \
    {                                                                   \
      (sj1) = (sjis_row_ >> 1) + sjis_base_;                            \
      (sj2) = sjis_col_ - 2;                                            \
    }                                                                   \
} while (0)
|
|
1023 #endif /* MULE */
|
|
1024
|
771
|
/* Coding system types declared unconditionally. */
DECLARE_CODING_SYSTEM_TYPE (no_conversion);
DECLARE_CODING_SYSTEM_TYPE (convert_eol);
#if 0
DECLARE_CODING_SYSTEM_TYPE (text_file_wrapper);
#endif /* 0 */
DECLARE_CODING_SYSTEM_TYPE (undecided);
DECLARE_CODING_SYSTEM_TYPE (chain);

/* Declared only when DEBUG_XEMACS is defined. */
#ifdef DEBUG_XEMACS
DECLARE_CODING_SYSTEM_TYPE (internal);
#endif

/* Declared only when MULE is defined. */
#ifdef MULE
DECLARE_CODING_SYSTEM_TYPE (iso2022);
DECLARE_CODING_SYSTEM_TYPE (ccl);
DECLARE_CODING_SYSTEM_TYPE (shift_jis);
DECLARE_CODING_SYSTEM_TYPE (big5);
#endif

/* Declared only when HAVE_ZLIB is defined. */
#ifdef HAVE_ZLIB
DECLARE_CODING_SYSTEM_TYPE (gzip);
#endif

DECLARE_CODING_SYSTEM_TYPE (unicode);

/* Declared only when HAVE_WIN32_CODING_SYSTEMS is defined. */
#ifdef HAVE_WIN32_CODING_SYSTEMS
DECLARE_CODING_SYSTEM_TYPE (mswindows_multibyte_to_unicode);
DECLARE_CODING_SYSTEM_TYPE (mswindows_multibyte);
#endif
|
771
|
1054
|
|
1055 Lisp_Object coding_stream_detected_coding_system (Lstream *stream);
|
|
1056 Lisp_Object coding_stream_coding_system (Lstream *stream);
|
|
1057 void set_coding_stream_coding_system (Lstream *stream,
|
|
1058 Lisp_Object codesys);
|
|
1059 Lisp_Object detect_coding_stream (Lisp_Object stream);
|
867
|
1060 Ichar decode_big5_char (int o1, int o2);
|
771
|
1061 void add_entry_to_coding_system_type_list (struct coding_system_methods *m);
|
|
1062 Lisp_Object make_internal_coding_system (Lisp_Object existing,
|
|
1063 Char_ASCII *prefix,
|
|
1064 Lisp_Object type,
|
|
1065 Lisp_Object description,
|
|
1066 Lisp_Object props);
|
802
|
1067
|
814
|
/* Flag bits (bits 16, 17 and 18).
   NOTE(review): presumably these are passed as the FLAGS argument of
   make_coding_input_stream () / make_coding_output_stream () and control
   the no_close_other, one_byte_at_a_time and
   set_char_mode_on_us_when_reading members of struct coding_stream --
   confirm against the implementation. */
#define LSTREAM_FL_NO_CLOSE_OTHER                       (1 << 16)
#define LSTREAM_FL_READ_ONE_BYTE_AT_A_TIME              (1 << 17)
#define LSTREAM_FL_NO_INIT_CHAR_MODE_WHEN_READING       (1 << 18)
|
|
1071
|
771
|
1072 Lisp_Object make_coding_input_stream (Lstream *stream, Lisp_Object codesys,
|
800
|
1073 enum encode_decode direction,
|
802
|
1074 int flags);
|
771
|
1075 Lisp_Object make_coding_output_stream (Lstream *stream, Lisp_Object codesys,
|
800
|
1076 enum encode_decode direction,
|
802
|
1077 int flags);
|
771
|
1078 void set_detection_results (struct detection_state *st, int detector,
|
|
1079 int given);
|
428
|
1080
|
440
|
1081 #endif /* INCLUDED_file_coding_h_ */
|
|
1082
|