771
+ − 1 /* Header for encoding conversion functions; coding-system object.
+ − 2 #### rename me to coding-system.h
428
+ − 3 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
+ − 4 Copyright (C) 1995 Sun Microsystems, Inc.
793
+ − 5 Copyright (C) 2000, 2001, 2002 Ben Wing.
428
+ − 6
+ − 7 This file is part of XEmacs.
+ − 8
+ − 9 XEmacs is free software; you can redistribute it and/or modify it
+ − 10 under the terms of the GNU General Public License as published by the
+ − 11 Free Software Foundation; either version 2, or (at your option) any
+ − 12 later version.
+ − 13
+ − 14 XEmacs is distributed in the hope that it will be useful, but WITHOUT
+ − 15 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ − 16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ − 17 for more details.
+ − 18
+ − 19 You should have received a copy of the GNU General Public License
+ − 20 along with XEmacs; see the file COPYING. If not, write to
+ − 21 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ − 22 Boston, MA 02111-1307, USA. */
+ − 23
+ − 24 /* Synched up with: Mule 2.3. Not in FSF. */
+ − 25
771
+ − 26 /* Authorship:
+ − 27
+ − 28 Current primary author: Ben Wing <ben@xemacs.org>
+ − 29
+ − 30 Written by Ben Wing <ben@xemacs.org> for XEmacs, 1995, loosely based
+ − 31 on code written 91.10.09 by K.Handa <handa@etl.go.jp>.
+ − 32 Rewritten again 2000-2001 by Ben Wing to support properly
+ − 33 abstracted coding systems.
+ − 34 September 2001: Finished last part of abstraction, the detection
+ − 35 mechanism.
+ − 36 */
428
+ − 37
440
+ − 38 #ifndef INCLUDED_file_coding_h_
+ − 39 #define INCLUDED_file_coding_h_
428
+ − 40
771
+ − 41 /* Capsule description of the different structures, what their purpose is,
+ − 42 how they fit together, and where various bits of data are stored.
+ − 43
2297
+ − 44 A "coding system" is an algorithm for converting stream data in one format
+ − 45 into stream data in another format. Currently most of the coding systems
+ − 46 we have created concern internationalized text, and convert between the
+ − 47 XEmacs internal format for multilingual text, and various external
771
+ − 48 representations of such text. However, any such conversion is possible,
+ − 49 for example, compressing or uncompressing text using the gzip algorithm.
+ − 50 All coding systems provide both encode and decode routines, so that the
2297
+ − 51 conversion can go both ways. Unfortunately encoding and decoding may not
+ − 52 be exact inverses, even for a specific instance of a coding system. Care
+ − 53 must be taken when this is not the case.
771
+ − 54
+ − 55 The way we handle this is by dividing the various potential coding
+ − 56 systems into types, analogous to classes in C++. Each coding system
+ − 57 type encompasses a series of related coding systems that it can
+ − 58 implement, and it has properties which control how exactly the encoding
+ − 59 works. A particular set of values for each of the properties makes up a
+ − 60 "coding system", and specifies one particular encoding. A `struct
+ − 61 Lisp_Coding_System' object encapsulates those settings -- its type, the
+ − 62 values chosen for all properties of that type, a name for the coding
+ − 63 system, some documentation.
+ − 64
+ − 65 In addition, there are of course methods associated with a coding system
+ − 66 type, implementing the encoding, decoding, etc. These are stored in a
+ − 67 `struct coding_system_methods' object, one per coding-system type, which
+ − 68 contains mostly function pointers. This is retrievable from the
+ − 69 coding-system object (i.e. the struct Lisp_Coding_System), which has a
+ − 70 pointer to it.
+ − 71
+ − 72 In order to actually use a coding system to do an encoding or decoding
+ − 73 operation, you need to use a coding Lstream.
+ − 74
+ − 75 Now let's look more at attached data. All coding systems have certain
+ − 76 common data fields -- name, type, documentation, etc. -- as well as a
+ − 77 bunch more that are defined by the coding system type. To handle this
+ − 78 cleanly, each coding system type defines a structure that holds just the
+ − 79 fields of data particular to it, and calls it e.g. `struct
+ − 80 iso2022_coding_system' for coding system type `iso2022'. When the
+ − 81 memory block holding the coding system object is created, it is sized
+ − 82 such that it can hold both the struct Lisp_Coding_System and the struct
+ − 83 iso2022_coding_system (or whatever) directly following it. (This is a
+ − 84 common trick; another possibility is to have a void * pointer in the
+ − 85 struct Lisp_Coding_System, which points to another memory block holding
+ − 86 the struct iso2022_coding_system.) A macro is provided
+ − 87 (CODING_SYSTEM_TYPE_DATA) to retrieve a pointer of the right type to the
+ − 88 type-specific data contained within the overall `struct
+ − 89 Lisp_Coding_System' block.
+ − 90
+ − 91 Lstreams, similarly, are objects of type `struct lstream' holding data
+ − 92 about the stream operation (how much data has been read or written, any
+ − 93 buffered data, any error conditions, etc.), and like coding systems have
+ − 94 different types. They have a structure called `Lstream_implementation',
+ − 95 one per lstream type, exactly analogous to `struct
+ − 96 coding_system_methods'. In addition, they have type-specific data
+ − 97 (specifying, e.g., the file number, FILE *, memory location, other
+ − 98 lstream, etc. to read the data from or write it to, and for conversion
+ − 99 processes, the current state of the process -- are we decoding ASCII or
+ − 100 Kanji characters? are we in the middle of processing an escape
+ − 101 sequence? etc.). This type-specific data is stored in a structure
+ − 102 named `struct coding_stream'. Just like for coding systems, the
+ − 103 type-independent data in the `struct lstream' and the type-dependent
+ − 104 data in the `struct coding_stream' are stored together in the same
+ − 105 memory block.
428
+ − 106
771
+ − 107 Now things get a bit tricky. The `struct coding_stream' is
+ − 108 type-specific from the point of view of an lstream, but not from the
+ − 109 point of view of a coding system. It contains only general data about
+ − 110 the conversion process, e.g. the name of the coding system used for
+ − 111 conversion, the lstream that we take data from or write it to (depending
+ − 112 on whether this was created as a read stream or a write stream), a
+ − 113 buffer to hold extra data we retrieved but can't send on yet, some
+ − 114 flags, etc. It also needs some data specific to the particular coding
+ − 115 system and thus to the particular operation going on. This data is held
+ − 116 in a structure named (e.g.) `struct iso2022_coding_stream', and it's
+ − 117 held in a separate memory block and pointed to by the generic `struct
+ − 118 coding_stream'. It's not glommed into a single memory block both
+ − 119 because that would require making changes to the generic lstream code
+ − 120 and more importantly because the coding system used in a particular
+ − 121 coding lstream can be changed at any point during the lifetime of the
+ − 122 lstream, and possibly multiple times. (For example, it can be set using
+ − 123 the Lisp primitives `set-process-input-coding-system' and
+ − 124 `set-console-tty-input-coding-system', as well as getting set when a
+ − 125 conversion operation was started with coding system `undecided' and the
2297
+ − 126 correct coding system was then detected.) #### This suggests implementing
+ − 127 compound text extended segments by saving the state of the ctext stream,
+ − 128 and installing an appropriate coding system for the duration of the segment.
428
+ − 129
771
+ − 130 IMPORTANT NOTE: There are at least two ancillary data structures
+ − 131 associated with a coding system type. (There may also be detection data;
+ − 132 see elsewhere.) It's important, when writing a coding system type, to
+ − 133 keep straight which type of data goes where. In particular, `struct
+ − 134 foo_coding_system' is attached to the coding system object itself. This
+ − 135 is a permanent object and there's only one per coding system. It's
+ − 136 created once, usually at init time, and never destroyed. So, `struct
+ − 137 foo_coding_system' should in general not contain dynamic data! (Just
+ − 138 data describing the properties of the coding system.) In particular,
+ − 139 *NO* data about any conversion in progress. There may be many
+ − 140 conversions going on simultaneously using a particular coding system,
+ − 141 and by storing conversion data in the coding system, these conversions
+ − 142 will overwrite each other's data.
+ − 143
+ − 144 Instead, use the lstream object, whose purpose is to encapsulate a
+ − 145 particular conversion and all associated data. From the lstream object,
+ − 146 you can get the struct coding_stream using something like
+ − 147
+ − 148 struct coding_stream *str = LSTREAM_TYPE_DATA (lstr, coding);
+ − 149
+ − 150 But usually this structure is already passed to you as one of the
+ − 151 parameters of the method being invoked.
+ − 152
+ − 153 From the struct coding_stream, you can retrieve the
+ − 154 coding-system-type-specific data using something like
+ − 155
+ − 156 struct foo_coding_stream *data = CODING_STREAM_TYPE_DATA (str, foo);
+ − 157
+ − 158 Then, use this structure to hold all data relevant to the particular
+ − 159 conversion being done.
+ − 160
+ − 161 Initialize this structure whenever init_coding_stream_method is called
+ − 162 (this may happen more than once), and finalize it (free resources, etc.)
+ − 163 when finalize_coding_stream_method is called.
+ − 164 */
+ − 165
+ − 166 struct coding_stream;
+ − 167 struct detection_state;
+ − 168
1204
+ − 169 extern const struct sized_memory_description coding_system_methods_description;
771
+ − 170
+ − 171 struct coding_system_methods;
+ − 172
/* What a conversion has at each of its two ends: bytes or characters.
   This is the return type of the conversion_end_type method in struct
   coding_system_methods.  Going by the enumerator names, each value
   names the source and sink as seen in the decoding direction. */
+ − 173 enum source_sink_type
428
+ − 174 {
771
+ − 175 DECODES_CHARACTER_TO_BYTE,
+ − 176 DECODES_BYTE_TO_BYTE,
+ − 177 DECODES_BYTE_TO_CHARACTER,
+ − 178 DECODES_CHARACTER_TO_CHARACTER
428
+ − 179 };
+ − 180
/* End-of-line conventions.  EOL_LF, EOL_CRLF and EOL_CR are also used as
   indices into the `eol' slot of a coding system (see
   CODING_SYSTEM_EOL_LF and friends).  EOL_AUTODETECT presumably means the
   convention is to be determined from the data -- TODO confirm in the
   implementation file. */
+ − 181 enum eol_type
+ − 182 {
+ − 183 EOL_LF,
+ − 184 EOL_CRLF,
771
+ − 185 EOL_CR,
1429
+ − 186 EOL_AUTODETECT
428
+ − 187 };
+ − 188
/* The coding-system object itself.  It holds the type-independent data
   and, starting at `data', the type-specific struct TYPE_coding_system;
   use CODING_SYSTEM_TYPE_DATA to get a correctly typed pointer to the
   latter. */
+ − 189 struct Lisp_Coding_System
+ − 190 {
+ − 191 struct lcrecord_header header;
771
+ − 192 struct coding_system_methods *methods; /* method table of this coding system's type */
428
+ − 193
1204
/* The Lisp_Object slots are declared by including coding-system-slots.h
   with MARKED_SLOT defined to emit one field per slot.
   CODING_SYSTEM_SLOT_DECLARATION presumably tells that file which
   expansion is wanted -- TODO confirm there. */
+ − 194 #define CODING_SYSTEM_SLOT_DECLARATION
+ − 195 #define MARKED_SLOT(x) Lisp_Object x;
+ − 196 #include "coding-system-slots.h"
771
+ − 197
1204
+ − 198 /* Eol type requested by user. See comment about EOL junk in
+ − 199 coding-system-slots.h. */
771
+ − 200 enum eol_type eol_type;
428
+ − 201
2132
+ − 202 /* If true, this is an internal coding system, which will not show up in
+ − 203 coding-system-list unless a special parameter is given to it. */
+ − 204 int internal_p;
+ − 205
771
+ − 206 /* type-specific extra data attached to a coding_system */
/* Declared with one element only; the real size of the trailing data is
   recorded in methods->extra_data_size. */
+ − 207 char data[1];
428
+ − 208 };
+ − 209 typedef struct Lisp_Coding_System Lisp_Coding_System;
+ − 210
440
/* Lrecord glue for coding-system objects.  XCODING_SYSTEM turns a
   Lisp_Object into a Lisp_Coding_System *, wrap_coding_system goes the
   other way, CODING_SYSTEMP is the type predicate, and the CHECK_ /
   CONCHECK_ forms defer to CHECK_RECORD / CONCHECK_RECORD (defined
   elsewhere). */
+ − 211 DECLARE_LRECORD (coding_system, Lisp_Coding_System);
+ − 212 #define XCODING_SYSTEM(x) XRECORD (x, coding_system, Lisp_Coding_System)
617
+ − 213 #define wrap_coding_system(p) wrap_record (p, coding_system)
428
+ − 214 #define CODING_SYSTEMP(x) RECORDP (x, coding_system)
+ − 215 #define CHECK_CODING_SYSTEM(x) CHECK_RECORD (x, coding_system)
+ − 216 #define CONCHECK_CODING_SYSTEM(x) CONCHECK_RECORD (x, coding_system)
+ − 217
1204
/* One enumerator per coding-system type.  INITIALIZE_CODING_SYSTEM_TYPE
   stores TYPE_coding_system (built by token pasting) in the `enumtype'
   field of the type's method table, so every type initialized with that
   macro needs an enumerator of exactly that name here. */
+ − 218 enum coding_system_variant
+ − 219 {
+ − 220 no_conversion_coding_system,
+ − 221 convert_eol_coding_system,
+ − 222 undecided_coding_system,
+ − 223 chain_coding_system,
+ − 224 text_file_wrapper_coding_system,
+ − 225 internal_coding_system,
+ − 226 gzip_coding_system,
+ − 227 mswindows_multibyte_to_unicode_coding_system,
+ − 228 mswindows_multibyte_coding_system,
+ − 229 iso2022_coding_system,
+ − 230 ccl_coding_system,
+ − 231 shift_jis_coding_system,
+ − 232 big5_coding_system,
1429
+ − 233 unicode_coding_system
1204
+ − 234 };
+ − 235
771
/* Method table for one coding-system type.  INITIALIZE_CODING_SYSTEM_TYPE
   allocates it zeroed, so any method that is not installed afterwards
   (see CODING_SYSTEM_HAS_METHOD) stays a null pointer; that is what
   HAS_CODESYSMETH_P tests. */
+ − 236 struct coding_system_methods
+ − 237 {
+ − 238 Lisp_Object type; /* set to Q<type> by INITIALIZE_CODING_SYSTEM_TYPE */
+ − 239 Lisp_Object predicate_symbol; /* passed to wrong_type_argument by the CHECK_..._OF_TYPE macros */
+ − 240
1204
+ − 241 /* Type expressed as an enum, needed for KKCC marking of the
+ − 242 type-specific lstream data; copied into the struct coding_stream. */
+ − 243
+ − 244 enum coding_system_variant enumtype;
+ − 245
771
+ − 246 /* Implementation specific methods: */
+ − 247
+ − 248 /* Init method: Initialize coding-system data. Optional. */
+ − 249 void (*init_method) (Lisp_Object coding_system);
+ − 250
+ − 251 /* Mark method: Mark any Lisp objects in the type-specific data
+ − 252 attached to the coding-system object. Optional. */
+ − 253 void (*mark_method) (Lisp_Object coding_system);
+ − 254
+ − 255 /* Print method: Print the type-specific properties of this coding
+ − 256 system, as part of `print'-ing the object. If this method is defined
+ − 257 and prints anything, it should print a space as the first thing it
+ − 258 does. Optional. */
+ − 259 void (*print_method) (Lisp_Object cs, Lisp_Object printcharfun,
+ − 260 int escapeflag);
+ − 261
+ − 262 /* Canonicalize method: Convert this coding system to another one; called
+ − 263 once, at creation time, after all properties have been parsed. The
+ − 264 returned value should be a coding system created with
+ − 265 make_internal_coding_system() (passing the existing coding system as the
+ − 266 first argument), and will become the coding system returned by
+ − 267 `make-coding-system'. Optional.
+ − 268
+ − 269 NOTE: There are *three* different uses of "canonical" or "canonicalize"
+ − 270 w.r.t. coding systems, and it's important to keep them straight.
+ − 271
+ − 272 1. The canonicalize method. Used to specify a different coding
+ − 273 system, used when doing conversions, in place of the actual coding
+ − 274 system itself. Stored in the CANONICAL field of a coding system.
+ − 275
+ − 276 2. The canonicalize-after-coding method. Used to return the encoding
+ − 277 that was "actually" used to decode some text, such that this
+ − 278 particular encoding can be used to encode the text again with the
+ − 279 expectation that the result will be the same as the original encoding.
+ − 280 Particularly important with auto-detecting coding systems.
+ − 281
+ − 282 3. From the perspective of aliases, a "canonical" coding system is one
+ − 283 that's not an alias to some other coding system, and "canonicalization"
+ − 284 is the process of traversing the alias pointers to find the canonical
+ − 285 coding system that's equivalent to the alias.
+ − 286 */
+ − 287 Lisp_Object (*canonicalize_method) (Lisp_Object coding_system);
+ − 288
+ − 289 /* Canonicalize after coding method: Convert this coding system to
+ − 290 another one, after coding (usually decoding) has finished. This is
+ − 291 meant to be used by auto-detecting coding systems, which should return
+ − 292 the actually detected coding system. Optional. */
+ − 293 Lisp_Object (*canonicalize_after_coding_method)
+ − 294 (struct coding_stream *str);
+ − 295
+ − 296 /* Convert method: Decode or encode the data in SRC of size N, writing
+ − 297 the results into the Dynarr DST. If the conversion_end_type method
+ − 298 indicates that the source is characters (as opposed to bytes), you are
+ − 299 guaranteed to get only whole characters in the data in SRC/N. STR, a
+ − 300 struct coding_stream, stores all necessary state and other info about
+ − 301 the conversion. Coding-specific state (struct TYPE_coding_stream) can
+ − 302 be retrieved from STR using CODING_STREAM_TYPE_DATA(). Return value
+ − 303 indicates the number of bytes of the *INPUT* that were converted (not
+ − 304 the number of bytes written to the Dynarr!). This can be less than
+ − 305 the total amount of input passed in; if so, the remainder is
+ − 306 considered "rejected" and will appear again at the beginning of the
+ − 307 data passed in the next time the convert method is called. When EOF
+ − 308 is returned on the other end and there's no more data, the convert
+ − 309 method will be called one last time, STR->eof set and the passed-in
+ − 310 data will consist only of any rejected data from the previous
+ − 311 call. (At this point, file handles and similar resources can be
+ − 312 closed, but do NOT arbitrarily free data structures in the
+ − 313 type-specific data, because there are operations that can be done on
+ − 314 closed streams to query the results of the processing -- specifically,
+ − 315 for coding streams, there's the canonicalize_after_coding() method.)
+ − 316 Required. */
+ − 317 Bytecount (*convert_method) (struct coding_stream *str,
+ − 318 const unsigned char *src,
+ − 319 unsigned_char_dynarr *dst, Bytecount n);
+ − 320
+ − 321 /* Coding mark method: Mark any Lisp objects in the type-specific data
+ − 322 attached to `struct coding_stream'. Optional. */
+ − 323 void (*mark_coding_stream_method) (struct coding_stream *str);
+ − 324
+ − 325 /* Init coding stream method: Initialize the type-specific data attached
+ − 326 to the coding stream (i.e. in struct TYPE_coding_stream), when the
+ − 327 coding stream is opened. The type-specific data will be zeroed out.
+ − 328 Optional. */
+ − 329 void (*init_coding_stream_method) (struct coding_stream *str);
+ − 330
+ − 331 /* Rewind coding stream method: Reset any necessary type-specific data as
+ − 332 a result of the stream being rewound. Optional. */
+ − 333 void (*rewind_coding_stream_method) (struct coding_stream *str);
+ − 334
+ − 335 /* Finalize coding stream method: Clean up the type-specific data
+ − 336 attached to the coding stream (i.e. in struct TYPE_coding_stream).
+ − 337 Happens when the Lstream is deleted using Lstream_delete() or is
+ − 338 garbage-collected. Most streams are deleted after they've been used,
+ − 339 so it's less likely (but still possible) that allocated data will
+ − 340 stick around until GC time. (File handles can also be closed when EOF
+ − 341 is signalled; but some data must stick around after this point, for
+ − 342 the benefit of canonicalize_after_coding. See the convert method.)
+ − 343 Called only once (NOT called at disksave time). Optional. */
+ − 344 void (*finalize_coding_stream_method) (struct coding_stream *str);
+ − 345
+ − 346 /* Finalize method: Clean up type-specific data (e.g. free allocated
+ − 347 data) attached to the coding system (i.e. in struct
+ − 348 TYPE_coding_system), when the coding system is about to be garbage
+ − 349 collected. (Currently not called.) Called only once (NOT called at
+ − 350 disksave time). Optional. */
+ − 351 void (*finalize_method) (Lisp_Object codesys);
+ − 352
+ − 353 /* Conversion end type method: Does this coding system encode bytes ->
+ − 354 characters, characters -> characters, bytes -> bytes, or
+ − 355 characters -> bytes? Default is characters -> bytes. Optional. */
+ − 356 enum source_sink_type (*conversion_end_type_method) (Lisp_Object codesys);
+ − 357
+ − 358 /* Putprop method: Set the value of a type-specific property. If
+ − 359 the property name is unrecognized, return 0. If the value is disallowed
+ − 360 or erroneous, signal an error. Currently called only at creation time.
+ − 361 Optional. */
+ − 362 int (*putprop_method) (Lisp_Object codesys,
+ − 363 Lisp_Object key,
+ − 364 Lisp_Object value);
+ − 365
+ − 366 /* Getprop method: Return the value of a type-specific property. If
+ − 367 the property name is unrecognized, return Qunbound. Optional.
+ − 368 */
+ − 369 Lisp_Object (*getprop_method) (Lisp_Object coding_system,
+ − 370 Lisp_Object prop);
+ − 371
+ − 372 /* These next three are set as part of the call to
+ − 373 INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA. */
+ − 374
+ − 375 /* Description of the extra data (struct foo_coding_system) attached to a
1204
+ − 376 coding system, for pdump purposes. */
+ − 377 const struct sized_memory_description *extra_description;
771
+ − 378 /* size of struct foo_coding_system -- extra data associated with
+ − 379 the coding system */
+ − 380 int extra_data_size;
+ − 381 /* size of struct foo_coding_stream -- extra data associated with the
+ − 382 struct coding_stream, needed for each active coding process
+ − 383 using this coding system. note that we can have more than one
+ − 384 process active at once (simply by creating more than one coding
+ − 385 lstream using this coding system), so we can't store this data in
+ − 386 the coding system object. */
+ − 387 int coding_data_size;
+ − 388 };
+ − 389
+ − 390 /***** Calling a coding-system method *****/
+ − 391
/* In the macros below CS is a Lisp_Coding_System *, M is a method name
   without its `_method' suffix, and ARGS is a parenthesized argument
   list.  RAW_CODESYSMETH yields the function pointer, HAS_CODESYSMETH_P
   tests it for non-null, and CODESYSMETH calls it without checking. */
+ − 392 #define RAW_CODESYSMETH(cs, m) ((cs)->methods->m##_method)
+ − 393 #define HAS_CODESYSMETH_P(cs, m) (!!RAW_CODESYSMETH (cs, m))
+ − 394 #define CODESYSMETH(cs, m, args) (((cs)->methods->m##_method) args)
+ − 395
+ − 396 /* Call a void-returning coding-system method, if it exists. */
/* CS is evaluated once, into a local, before the test and the call. */
+ − 397 #define MAYBE_CODESYSMETH(cs, m, args) do { \
+ − 398 Lisp_Coding_System *maybe_codesysmeth_cs = (cs); \
+ − 399 if (HAS_CODESYSMETH_P (maybe_codesysmeth_cs, m)) \
+ − 400 CODESYSMETH (maybe_codesysmeth_cs, m, args); \
+ − 401 } while (0)
+ − 402
+ − 403 /* Call a coding-system method, if it exists, or return GIVEN.
+ − 404 NOTE: Multiply-evaluates CS. */
+ − 405 #define CODESYSMETH_OR_GIVEN(cs, m, args, given) \
+ − 406 (HAS_CODESYSMETH_P (cs, m) ? \
+ − 407 CODESYSMETH (cs, m, args) : (given))
+ − 408
/* Same three operations, but CS is a Lisp_Object that is first unwrapped
   with XCODING_SYSTEM.  XCODESYSMETH_OR_GIVEN inherits the multiple
   evaluation of CS from CODESYSMETH_OR_GIVEN. */
+ − 409 #define XCODESYSMETH(cs, m, args) \
+ − 410 CODESYSMETH (XCODING_SYSTEM (cs), m, args)
+ − 411 #define MAYBE_XCODESYSMETH(cs, m, args) \
+ − 412 MAYBE_CODESYSMETH (XCODING_SYSTEM (cs), m, args)
+ − 413 #define XCODESYSMETH_OR_GIVEN(cs, m, args, given) \
+ − 414 CODESYSMETH_OR_GIVEN (XCODING_SYSTEM (cs), m, args, given)
+ − 415
+ − 416
+ − 417 /***** Defining new coding-system types *****/
+ − 418
1204
+ − 419 extern const struct sized_memory_description coding_system_empty_extra_description;
771
+ − 420
800
/* DECLARE_CODING_SYSTEM_TYPE (TYPE) declares the method-table pointer
   TYPE_coding_system_methods.  When ERROR_CHECK_TYPES is defined it also
   defines three inline checking helpers:

   error_check_TYPE_coding_system_data (cs)  -- asserts CS is of type TYPE
     and has extra data, then returns cs->data as struct TYPE_coding_system *
     (this is what CODING_SYSTEM_TYPE_DATA expands to in checking builds);
   error_check_TYPE_coding_stream_data (s)   -- asserts the stream's coding
     system is of type TYPE, then returns s->data as
     struct TYPE_coding_stream *;
   error_check_TYPE_coding_system_type (obj) -- unwraps OBJ, asserts it is
     of type TYPE, and returns the Lisp_Coding_System * (this is what
     XCODING_SYSTEM_OF_TYPE expands to in checking builds).

   Neither variant supplies a trailing semicolon. */
+ − 421 #ifdef ERROR_CHECK_TYPES
771
+ − 422 #define DECLARE_CODING_SYSTEM_TYPE(type) \
+ − 423 \
+ − 424 extern struct coding_system_methods * type##_coding_system_methods; \
826
+ − 425 DECLARE_INLINE_HEADER ( \
+ − 426 struct type##_coding_system * \
771
+ − 427 error_check_##type##_coding_system_data (Lisp_Coding_System *cs) \
826
+ − 428 ) \
771
+ − 429 { \
+ − 430 assert (CODING_SYSTEM_TYPE_P (cs, type)); \
+ − 431 /* Catch accidental use of INITIALIZE_CODING_SYSTEM_TYPE in place \
+ − 432 of INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA. */ \
+ − 433 assert (cs->methods->extra_data_size > 0); \
+ − 434 return (struct type##_coding_system *) cs->data; \
+ − 435 } \
+ − 436 \
826
+ − 437 DECLARE_INLINE_HEADER ( \
+ − 438 struct type##_coding_stream * \
771
+ − 439 error_check_##type##_coding_stream_data (struct coding_stream *s) \
826
+ − 440 ) \
771
+ − 441 { \
+ − 442 assert (XCODING_SYSTEM_TYPE_P (s->codesys, type)); \
+ − 443 return (struct type##_coding_stream *) s->data; \
+ − 444 } \
+ − 445 \
826
+ − 446 DECLARE_INLINE_HEADER ( \
+ − 447 Lisp_Coding_System * \
771
+ − 448 error_check_##type##_coding_system_type (Lisp_Object obj) \
826
+ − 449 ) \
771
+ − 450 { \
+ − 451 Lisp_Coding_System *cs = XCODING_SYSTEM (obj); \
+ − 452 assert (CODING_SYSTEM_TYPE_P (cs, type)); \
+ − 453 return cs; \
+ − 454 } \
+ − 455 \
+ − 456 DECLARE_NOTHING
+ − 457 #else
+ − 458 #define DECLARE_CODING_SYSTEM_TYPE(type) \
+ − 459 extern struct coding_system_methods * type##_coding_system_methods
800
+ − 460 #endif /* ERROR_CHECK_TYPES */
771
+ − 461
/* Define the method-table pointer TYPE_coding_system_methods for a type
   that has no type-specific data.  No trailing semicolon is supplied. */
+ − 462 #define DEFINE_CODING_SYSTEM_TYPE(type) \
+ − 463 struct coding_system_methods * type##_coding_system_methods
+ − 464
1204
/* As above, and additionally define the static sized description
   TYPE_coding_system_description_0, which
   INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA installs as the type's
   extra_description.  Both struct TYPE_coding_system and
   TYPE_coding_system_description must be visible at the point of use.
   No trailing semicolon is supplied. */
+ − 465 #define DEFINE_CODING_SYSTEM_TYPE_WITH_DATA(type) \
+ − 466 struct coding_system_methods * type##_coding_system_methods; \
+ − 467 static const struct sized_memory_description \
+ − 468 type##_coding_system_description_0 = { \
+ − 469 sizeof (struct type##_coding_system), \
+ − 470 type##_coding_system_description \
+ − 471 }
+ − 472
771
/* Create and register the method table for coding-system type TY:
   - allocate TY_coding_system_methods, zero-filled;
   - record the type symbol (QTY) and the enum tag (TY_coding_system);
   - install the empty extra-data description (the _WITH_DATA variant
     replaces it afterwards);
   - define the predicate symbol from PRED_SYM;
   - add the table to the coding-system type list and register the
     pointer with dump_add_root_block_ptr.
   Both QTY and the enumerator TY_coding_system must already exist. */
+ − 473 #define INITIALIZE_CODING_SYSTEM_TYPE(ty, pred_sym) do { \
+ − 474 ty##_coding_system_methods = \
+ − 475 xnew_and_zero (struct coding_system_methods); \
+ − 476 ty##_coding_system_methods->type = Q##ty; \
+ − 477 ty##_coding_system_methods->extra_description = \
1204
+ − 478 &coding_system_empty_extra_description; \
+ − 479 ty##_coding_system_methods->enumtype = ty##_coding_system; \
771
+ − 480 defsymbol_nodump (&ty##_coding_system_methods->predicate_symbol, \
+ − 481 pred_sym); \
+ − 482 add_entry_to_coding_system_type_list (ty##_coding_system_methods); \
2367
+ − 483 dump_add_root_block_ptr (&ty##_coding_system_methods, \
771
+ − 484 &coding_system_methods_description); \
+ − 485 } while (0)
+ − 486
/* Re-register the predicate symbol of an existing method table with
   staticpro_nodump.  Presumably for use after a dump has been loaded,
   since the symbol was created with defsymbol_nodump -- TODO confirm
   against the callers. */
+ − 487 #define REINITIALIZE_CODING_SYSTEM_TYPE(type) do { \
+ − 488 staticpro_nodump (&type##_coding_system_methods->predicate_symbol); \
+ − 489 } while (0)
+ − 490
+ − 491 /* This assumes the existence of two structures and a description array:
+ − 492
+ − 493 struct foo_coding_system (attached to the coding system)
+ − 494 struct foo_coding_stream (per coding process, attached to the
+ − 495 struct coding_stream)
1204
+ − 496 const struct memory_description foo_coding_system_description[]
+ − 497 (data description of struct foo_coding_system)
771
+ − 498
1204
+ − 499 For an example of how to do the description, see
771
+ − 500 chain_coding_system_description.
+ − 501 */
/* Runs INITIALIZE_CODING_SYSTEM_TYPE and then fills in the three
   data-related fields of the method table: extra_data_size,
   extra_description (pointing at TYPE_coding_system_description_0, which
   DEFINE_CODING_SYSTEM_TYPE_WITH_DATA defines) and coding_data_size. */
+ − 502 #define INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA(type, pred_sym) \
+ − 503 do { \
+ − 504 INITIALIZE_CODING_SYSTEM_TYPE (type, pred_sym); \
+ − 505 type##_coding_system_methods->extra_data_size = \
+ − 506 sizeof (struct type##_coding_system); \
+ − 507 type##_coding_system_methods->extra_description = \
1204
+ − 508 &type##_coding_system_description_0; \
771
+ − 509 type##_coding_system_methods->coding_data_size = \
+ − 510 sizeof (struct type##_coding_stream); \
+ − 511 } while (0)
+ − 512
+ − 513 /* Declare that coding-system-type TYPE has method METH; used in
+ − 514 initialization routines */
/* Concretely: stores the function named TYPE_METH in the METH_method slot
   of TYPE_coding_system_methods, so a function with exactly that pasted
   name must be in scope. */
+ − 515 #define CODING_SYSTEM_HAS_METHOD(type, meth) \
+ − 516 (type##_coding_system_methods->meth##_method = type##_##meth)
+ − 517
+ − 518 /***** Macros for accessing coding-system types *****/
+ − 519
/* Is the coding system of type TYPE?  The test compares method-table
   pointers.  CODING_SYSTEM_TYPE_P takes a Lisp_Coding_System *;
   XCODING_SYSTEM_TYPE_P takes a Lisp_Object and unwraps it first. */
+ − 520 #define CODING_SYSTEM_TYPE_P(cs, type) \
+ − 521 ((cs)->methods == type##_coding_system_methods)
+ − 522 #define XCODING_SYSTEM_TYPE_P(cs, type) \
+ − 523 CODING_SYSTEM_TYPE_P (XCODING_SYSTEM (cs), type)
+ − 524
800
/* Return a struct TYPE_coding_system * to the type-specific data of the
   Lisp_Coding_System * CS.  With ERROR_CHECK_TYPES this goes through the
   checking helper generated by DECLARE_CODING_SYSTEM_TYPE; otherwise it
   is a plain cast of cs->data. */
+ − 525 #ifdef ERROR_CHECK_TYPES
771
+ − 526 # define CODING_SYSTEM_TYPE_DATA(cs, type) \
+ − 527 error_check_##type##_coding_system_data (cs)
+ − 528 #else
+ − 529 # define CODING_SYSTEM_TYPE_DATA(cs, type) \
+ − 530 ((struct type##_coding_system *) \
+ − 531 (cs)->data)
+ − 532 #endif
+ − 533
/* Same, but CS is a Lisp_Object. */
+ − 534 #define XCODING_SYSTEM_TYPE_DATA(cs, type) \
+ − 535 CODING_SYSTEM_TYPE_DATA (XCODING_SYSTEM_OF_TYPE (cs, type), type)
+ − 536
800
+ − 537 #ifdef ERROR_CHECK_TYPES
771
+ − 538 # define XCODING_SYSTEM_OF_TYPE(x, type) \
+ − 539 error_check_##type##_coding_system_type (x)
+ − 540 # define XSETCODING_SYSTEM_OF_TYPE(x, p, type) do \
+ − 541 { \
793
+ − 542 x = wrap_coding_system (p); \
+ − 543 assert (CODING_SYSTEM_TYPEP (XCODING_SYSTEM (x), type)); \
771
+ − 544 } while (0)
+ − 545 #else
+ − 546 # define XCODING_SYSTEM_OF_TYPE(x, type) XCODING_SYSTEM (x)
793
+ − 547 # define XSETCODING_SYSTEM_OF_TYPE(x, p, type) do \
+ − 548 { \
+ − 549 x = wrap_coding_system (p); \
+ − 550 } while (0)
771
+ − 551 #endif /* ERROR_CHECK_TYPE_CHECK */
+ − 552
/* Is the Lisp_Object X a coding system of type TYPE?  Unlike
   CODING_SYSTEM_TYPE_P, this takes a Lisp_Object (it tests
   CODING_SYSTEMP and unwraps X itself), so do not pass it a
   Lisp_Coding_System *. */
+ − 553 #define CODING_SYSTEM_TYPEP(x, type) \
+ − 554 (CODING_SYSTEMP (x) && CODING_SYSTEM_TYPE_P (XCODING_SYSTEM (x), type))
/* Check that X is a coding system of type TYPE; on a type mismatch call
   dead_wrong_type_argument with the type's predicate symbol. */
+ − 555 #define CHECK_CODING_SYSTEM_OF_TYPE(x, type) do { \
+ − 556 CHECK_CODING_SYSTEM (x); \
+ − 557 if (!CODING_SYSTEM_TYPE_P (XCODING_SYSTEM (x), type)) \
+ − 558 dead_wrong_type_argument \
+ − 559 (type##_coding_system_methods->predicate_symbol, x); \
+ − 560 } while (0)
/* Like the above, but uses wrong_type_argument and assigns its return
   value back to X, so X must be an lvalue. */
+ − 561 #define CONCHECK_CODING_SYSTEM_OF_TYPE(x, type) do { \
+ − 562 CONCHECK_CODING_SYSTEM (x); \
+ − 563 if (!(CODING_SYSTEM_TYPEP (x, type))) \
+ − 564 x = wrong_type_argument \
+ − 565 (type##_coding_system_methods->predicate_symbol, x); \
+ − 566 } while (0)
+ − 567
/* Field accessors; CODESYS is a Lisp_Coding_System *.  Most of the slots
   named here (name, description, mnemonic, eol, ...) do not appear
   literally in struct Lisp_Coding_System; presumably they are the slots
   declared through coding-system-slots.h -- TODO confirm there.
   CODING_SYSTEM_TYPE reads the type from the method table rather than
   from the object itself. */
+ − 568 #define CODING_SYSTEM_METHODS(codesys) ((codesys)->methods)
428
+ − 569 #define CODING_SYSTEM_NAME(codesys) ((codesys)->name)
771
+ − 570 #define CODING_SYSTEM_DESCRIPTION(codesys) ((codesys)->description)
+ − 571 #define CODING_SYSTEM_TYPE(codesys) ((codesys)->methods->type)
428
+ − 572 #define CODING_SYSTEM_MNEMONIC(codesys) ((codesys)->mnemonic)
771
+ − 573 #define CODING_SYSTEM_DOCUMENTATION(codesys) ((codesys)->documentation)
428
+ − 574 #define CODING_SYSTEM_POST_READ_CONVERSION(codesys) \
+ − 575 ((codesys)->post_read_conversion)
+ − 576 #define CODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) \
+ − 577 ((codesys)->pre_write_conversion)
+ − 578 #define CODING_SYSTEM_EOL_TYPE(codesys) ((codesys)->eol_type)
771
+ − 579 #define CODING_SYSTEM_EOL_LF(codesys) ((codesys)->eol[EOL_LF])
+ − 580 #define CODING_SYSTEM_EOL_CRLF(codesys) ((codesys)->eol[EOL_CRLF])
+ − 581 #define CODING_SYSTEM_EOL_CR(codesys) ((codesys)->eol[EOL_CR])
+ − 582 #define CODING_SYSTEM_TEXT_FILE_WRAPPER(codesys) ((codesys)->text_file_wrapper)
+ − 583 #define CODING_SYSTEM_AUTO_EOL_WRAPPER(codesys) ((codesys)->auto_eol_wrapper)
+ − 584 #define CODING_SYSTEM_SUBSIDIARY_PARENT(codesys) ((codesys)->subsidiary_parent)
+ − 585 #define CODING_SYSTEM_CANONICAL(codesys) ((codesys)->canonical)
428
+ − 586
771
/* Accessors for the type-specific data of a `chain' coding system; they
   go through CODING_SYSTEM_TYPE_DATA, so in ERROR_CHECK_TYPES builds the
   type of CODESYS is asserted. */
+ − 587 #define CODING_SYSTEM_CHAIN_CHAIN(codesys) \
+ − 588 (CODING_SYSTEM_TYPE_DATA (codesys, chain)->chain)
+ − 589 #define CODING_SYSTEM_CHAIN_COUNT(codesys) \
+ − 590 (CODING_SYSTEM_TYPE_DATA (codesys, chain)->count)
+ − 591 #define CODING_SYSTEM_CHAIN_CANONICALIZE_AFTER_CODING(codesys) \
+ − 592 (CODING_SYSTEM_TYPE_DATA (codesys, chain)->canonicalize_after_coding)
428
+ − 593
771
/* Lisp_Object variants of the CODING_SYSTEM_* accessors: each one unwraps
   CODESYS with XCODING_SYSTEM and forwards to the accessor of the same
   name without the leading X. */
+ − 594 #define XCODING_SYSTEM_METHODS(codesys) \
+ − 595 CODING_SYSTEM_METHODS (XCODING_SYSTEM (codesys))
428
+ − 596 #define XCODING_SYSTEM_NAME(codesys) \
+ − 597 CODING_SYSTEM_NAME (XCODING_SYSTEM (codesys))
771
+ − 598 #define XCODING_SYSTEM_DESCRIPTION(codesys) \
+ − 599 CODING_SYSTEM_DESCRIPTION (XCODING_SYSTEM (codesys))
428
+ − 600 #define XCODING_SYSTEM_TYPE(codesys) \
+ − 601 CODING_SYSTEM_TYPE (XCODING_SYSTEM (codesys))
+ − 602 #define XCODING_SYSTEM_MNEMONIC(codesys) \
+ − 603 CODING_SYSTEM_MNEMONIC (XCODING_SYSTEM (codesys))
771
+ − 604 #define XCODING_SYSTEM_DOCUMENTATION(codesys) \
+ − 605 CODING_SYSTEM_DOCUMENTATION (XCODING_SYSTEM (codesys))
428
+ − 606 #define XCODING_SYSTEM_POST_READ_CONVERSION(codesys) \
+ − 607 CODING_SYSTEM_POST_READ_CONVERSION (XCODING_SYSTEM (codesys))
+ − 608 #define XCODING_SYSTEM_PRE_WRITE_CONVERSION(codesys) \
+ − 609 CODING_SYSTEM_PRE_WRITE_CONVERSION (XCODING_SYSTEM (codesys))
+ − 610 #define XCODING_SYSTEM_EOL_TYPE(codesys) \
+ − 611 CODING_SYSTEM_EOL_TYPE (XCODING_SYSTEM (codesys))
+ − 612 #define XCODING_SYSTEM_EOL_LF(codesys) \
+ − 613 CODING_SYSTEM_EOL_LF (XCODING_SYSTEM (codesys))
+ − 614 #define XCODING_SYSTEM_EOL_CRLF(codesys) \
+ − 615 CODING_SYSTEM_EOL_CRLF (XCODING_SYSTEM (codesys))
+ − 616 #define XCODING_SYSTEM_EOL_CR(codesys) \
+ − 617 CODING_SYSTEM_EOL_CR (XCODING_SYSTEM (codesys))
771
+ − 618 #define XCODING_SYSTEM_TEXT_FILE_WRAPPER(codesys) \
+ − 619 CODING_SYSTEM_TEXT_FILE_WRAPPER (XCODING_SYSTEM (codesys))
+ − 620 #define XCODING_SYSTEM_AUTO_EOL_WRAPPER(codesys) \
+ − 621 CODING_SYSTEM_AUTO_EOL_WRAPPER (XCODING_SYSTEM (codesys))
+ − 622 #define XCODING_SYSTEM_SUBSIDIARY_PARENT(codesys) \
+ − 623 CODING_SYSTEM_SUBSIDIARY_PARENT (XCODING_SYSTEM (codesys))
+ − 624 #define XCODING_SYSTEM_CANONICAL(codesys) \
+ − 625 CODING_SYSTEM_CANONICAL (XCODING_SYSTEM (codesys))
428
+ − 626
771
/* Lisp_Object variants of the `chain' type-data accessors. */
+ − 627 #define XCODING_SYSTEM_CHAIN_CHAIN(codesys) \
+ − 628 CODING_SYSTEM_CHAIN_CHAIN (XCODING_SYSTEM (codesys))
+ − 629 #define XCODING_SYSTEM_CHAIN_COUNT(codesys) \
+ − 630 CODING_SYSTEM_CHAIN_COUNT (XCODING_SYSTEM (codesys))
+ − 631 #define XCODING_SYSTEM_CHAIN_CANONICALIZE_AFTER_CODING(codesys) \
+ − 632 CODING_SYSTEM_CHAIN_CANONICALIZE_AFTER_CODING (XCODING_SYSTEM (codesys))
428
+ − 633
771
+ − 634 /**************************************************/
+ − 635 /* Detection */
+ − 636 /**************************************************/
428
+ − 637
771
+ − 638 #define MAX_DETECTOR_CATEGORIES 256
+ − 639 #define MAX_DETECTORS 64
428
+ − 640
771
+ − 641 #define MAX_BYTES_PROCESSED_FOR_DETECTION 65536
428
+ − 642
771
+ − 643 struct detection_state
428
+ − 644 {
771
+ − 645 int seen_non_ascii;
+ − 646 Bytecount bytes_seen;
428
+ − 647
771
+ − 648 char categories[MAX_DETECTOR_CATEGORIES];
+ − 649 Bytecount data_offset[MAX_DETECTORS];
+ − 650 /* ... more data follows; data_offset[detector_##TYPE] points to
+ − 651 the data for that type */
428
+ − 652 };
+ − 653
771
+ − 654 #define DETECTION_STATE_DATA(st, type) \
+ − 655 ((struct type##_detector *) \
+ − 656 ((char *) (st) + (st)->data_offset[detector_##type]))
428
+ − 657
448
+ − 658 /* Distinguishable categories of encodings.
+ − 659
+ − 660 This list determines the initial priority of the categories.
+ − 661
+ − 662 For better or worse, currently Mule files are encoded in 7-bit ISO 2022.
+ − 663 For this reason, under Mule ISO_7 gets highest priority.
+ − 664
+ − 665 Putting NO_CONVERSION second prevents "binary corruption" in the
+ − 666 default case in all but the (presumably) extremely rare case of a
+ − 667 binary file which contains redundant escape sequences but no 8-bit
+ − 668 characters.
+ − 669
+ − 670 The remaining priorities are based on perceived "internationalization
+ − 671 political correctness." An exception is UCS-4 at the bottom, since
+ − 672 basically everything is compatible with UCS-4, but it is likely to
+ − 673 be very rare as an external encoding. */
+ − 674
771
/* Codes of the control characters that implement ISO 2022's control
   functions.  The detection routines of other coding system types use
   them as well. */

/* C0 controls and DEL. */
#define ISO_CODE_LF   0x0A   /* line-feed */
#define ISO_CODE_CR   0x0D   /* carriage-return */
#define ISO_CODE_SO   0x0E   /* shift-out */
#define ISO_CODE_SI   0x0F   /* shift-in */
#define ISO_CODE_ESC  0x1B   /* escape */
#define ISO_CODE_DEL  0x7F   /* delete */

/* C1 controls. */
#define ISO_CODE_SS2  0x8E   /* single-shift-2 */
#define ISO_CODE_SS3  0x8F   /* single-shift-3 */
#define ISO_CODE_CSI  0x9B   /* control-sequence-introduce */
+ − 687
/* Confidence levels a detector can report for a category, from
   DET_HIGHEST (most confident that the data is in this encoding) down to
   DET_LOWEST (an outright error was seen). */
enum detection_result
{
  /* Alias for the top of the scale. */
  DET_HIGHEST = 4,

  /* A magic cookie identifying this type was seen, or something of
     similar strength. */
  DET_NEAR_CERTAINTY = 4,

  /* Features were seen that other coding system types are unlikely to
     produce -- for instance ISO-2022 escape sequences, or a consistent
     pattern of alternating zero bytes in UTF-16 together with Unicode LF
     or CRLF sequences at regular intervals.  (Most text encodings make
     zero bytes unlikely or impossible.) */
  DET_QUITE_PROBABLE = 3,

  /* Strong or medium statistical likelihood: at least some of the
     features normally found in this encoding are present.  Examples: in
     Shift-JIS, several two-byte Japanese character sequences in the right
     range and nothing out of range; in Unicode, much higher statistical
     variance in the odd bytes than in the even bytes or vice-versa
     (regular EOL sequences might raise this to DET_QUITE_PROBABLE as
     well).  Quite often the result of a statistical test. */
  DET_SOMEWHAT_LIKELY = 2,

  /* Weak statistical likelihood: more or less any feature at all that
     characterizes this encoding, with nothing that rules against it. */
  DET_SLIGHTLY_LIKELY = 1,

  /* The default state.  It may stand for pure ASCII or something equally
     vague seen in Shift-JIS, or -- taking the name literally -- for a
     statistical detector whose pros and cons cancel out.  It is also the
     lowest level the auto-detector accepts without asking the user: when
     every available detector reports lower levels for all categories with
     attached coding systems, the user is shown the results and explicitly
     prompted for action.  The user is also prompted when this is the
     highest available level and more than one detector reports it.  (See
     below about the consequent necessity of an "ASCII" detector, which
     will return level 1 or higher for most plain text files.) */
  DET_AS_LIKELY_AS_UNLIKELY = 0,

  /* Some features were seen that are unusual for this encoding, e.g.
     unusual control characters in a plain-text encoding, lots of 8-bit
     characters, or little statistical variance in the odd and even bytes
     in UTF-16. */
  DET_SOMEWHAT_UNLIKELY = -1,

  /* Very little chance that the data is in the right format.  Probably
     the lowest level reachable when random binary data is presented to a
     text file, since no "specific sequences" exist that would totally
     rule out recognition. */
  DET_QUITE_IMPROBABLE = -2,

  /* An erroneous sequence was seen. */
  DET_NEARLY_IMPOSSIBLE = -3,

  /* Alias for the bottom of the scale. */
  DET_LOWEST = -3
};
+ − 739
+ − 740 extern int coding_detector_count;
+ − 741 extern int coding_detector_category_count;
+ − 742
+ − 743 struct detector_category
428
+ − 744 {
771
+ − 745 int id;
+ − 746 Lisp_Object sym;
+ − 747 };
+ − 748
+ − 749 typedef struct
+ − 750 {
+ − 751 Dynarr_declare (struct detector_category);
+ − 752 } detector_category_dynarr;
+ − 753
+ − 754 struct detector
+ − 755 {
+ − 756 int id;
+ − 757 detector_category_dynarr *cats;
+ − 758 Bytecount data_size;
+ − 759 /* Detect method: Required. */
+ − 760 void (*detect_method) (struct detection_state *st,
+ − 761 const unsigned char *src, Bytecount n);
+ − 762 /* Finalize detection state method: Clean up any allocated data in the
+ − 763 detection state. Called only once (NOT called at disksave time).
+ − 764 Optional. */
+ − 765 void (*finalize_detection_state_method) (struct detection_state *st);
428
+ − 766 };
+ − 767
771
/* DET_RESULT (ST, CAT): lvalue holding the detection result for category
   CAT in detection state ST. */
#define DET_RESULT(st, cat) \
  ((st)->categories[detector_category_##cat])

/* SET_DET_RESULTS (ST, DET, RESULT): in state ST, set every detection
   result associated with detector DET to RESULT. */
#define SET_DET_RESULTS(st, det, result) \
  set_detection_results (st, detector_##det, result)
+ − 775
+ − 776 typedef struct
+ − 777 {
+ − 778 Dynarr_declare (struct detector);
+ − 779 } detector_dynarr;
+ − 780
+ − 781 extern detector_dynarr *all_coding_detectors;
+ − 782
+ − 783 #define DEFINE_DETECTOR_CATEGORY(detector, cat) \
+ − 784 int detector_category_##cat
+ − 785 #define DECLARE_DETECTOR_CATEGORY(detector, cat) \
+ − 786 extern int detector_category_##cat
+ − 787 #define INITIALIZE_DETECTOR_CATEGORY(detector, cat) \
+ − 788 do { \
+ − 789 struct detector_category dog; \
+ − 790 xzero (dog); \
+ − 791 detector_category_##cat = coding_detector_category_count++; \
+ − 792 dump_add_opaque_int (&detector_category_##cat); \
+ − 793 dog.id = detector_category_##cat; \
+ − 794 dog.sym = Q##cat; \
+ − 795 Dynarr_add (Dynarr_at (all_coding_detectors, detector_##detector).cats, \
+ − 796 dog); \
+ − 797 } while (0)
+ − 798
+ − 799 #define DEFINE_DETECTOR(Detector) \
+ − 800 int detector_##Detector
+ − 801 #define DECLARE_DETECTOR(Detector) \
+ − 802 extern int detector_##Detector
+ − 803 #define INITIALIZE_DETECTOR(Detector) \
+ − 804 do { \
+ − 805 struct detector det; \
+ − 806 xzero (det); \
+ − 807 detector_##Detector = coding_detector_count++; \
+ − 808 dump_add_opaque_int (&detector_##Detector); \
+ − 809 det.id = detector_##Detector; \
+ − 810 det.cats = Dynarr_new2 (detector_category_dynarr, \
+ − 811 struct detector_category); \
+ − 812 det.data_size = sizeof (struct Detector##_detector); \
+ − 813 Dynarr_add (all_coding_detectors, det); \
+ − 814 } while (0)
+ − 815 #define DETECTOR_HAS_METHOD(Detector, Meth) \
+ − 816 Dynarr_at (all_coding_detectors, detector_##Detector).Meth##_method = \
802
+ − 817 Detector##_##Meth
771
+ − 818
+ − 819
+ − 820 /**************************************************/
+ − 821 /* Decoding/Encoding */
+ − 822 /**************************************************/
+ − 823
/* Names the end of a conversion being asked about: its source or its
   sink.  (Original note: is the source (SOURCEP == 1) or sink
   (SOURCEP == 0) when encoding specified in characters?) */
enum source_or_sink
{
  CODING_SOURCE = 0,
  CODING_SINK = 1
};

/* Direction of a conversion. */
enum encode_decode
{
  CODING_ENCODE = 0,
  CODING_DECODE = 1
};
+ − 838
+ − 839 /* Data structure attached to an lstream of type `coding',
+ − 840 containing values specific to the coding process. Additional
+ − 841 data is stored in the DATA field below; the exact form of that data
+ − 842 is controlled by the type of the coding system that governs the
+ − 843 conversion (field CODESYS). CODESYS may be set at any time
+ − 844 throughout the lifetime of the lstream and possibly more than once.
+ − 845 See long comment above for more info. */
+ − 846
+ − 847 struct coding_stream
+ − 848 {
1204
+ − 849 /* Enumerated constant listing which type of console this is (TTY, X,
+ − 850 MS-Windows, etc.). This duplicates the method structure in
+ − 851 XCODING_SYSTEM (str->codesys)->methods->type, which formerly was the
+ − 852 only way to determine the coding system type. We need this constant
+ − 853 now for KKCC, so that it can be used in an XD_UNION clause to
+ − 854 determine the Lisp objects in the type-specific data. */
+ − 855 enum coding_system_variant type;
+ − 856
771
+ − 857 /* Coding system that governs the conversion. */
+ − 858 Lisp_Object codesys;
+ − 859 /* Original coding system, pre-canonicalization. */
+ − 860 Lisp_Object orig_codesys;
+ − 861
+ − 862 /* Back pointer to current stream. */
+ − 863 Lstream *us;
+ − 864
+ − 865 /* Stream that we read the unprocessed data from or write the processed
+ − 866 data to. */
+ − 867 Lstream *other_end;
+ − 868
+ − 869 /* In order to handle both reading to and writing from a coding stream,
+ − 870 we phrase the conversion methods like write methods -- we can
+ − 871 implement reading in terms of a write method but not vice-versa,
+ − 872 because the write method is forced to take only what it's given but
+ − 873 the read method can read more data from the other end if necessary.
+ − 874 On the other hand, the write method is free to generate all the data
2297
+ − 875 it wants (and just write it to the other end), but the read method
771
+ − 876 can return only as much as was asked for, so we need to implement our
+ − 877 own buffering. */
+ − 878
+ − 879 /* If we are reading, then we can return only a fixed amount of data, but
+ − 880 the converter is free to return as much as it wants, so we direct it
+ − 881 to store the data here and lop off chunks as we need them. If we are
+ − 882 writing, we use this because the converter takes a Dynarr but we are
+ − 883 supposed to write into a fixed buffer. (NOTE: This introduces an extra
+ − 884 memory copy.) */
+ − 885 unsigned_char_dynarr *convert_to;
+ − 886
+ − 887 /* The conversion method might reject some of the data -- this typically
+ − 888 includes partial characters, partial escape sequences, etc. When
+ − 889 writing, we just pass the rejection up to the Lstream module, and it
+ − 890 will buffer the data. When reading, however, we need to do the
+ − 891 buffering ourselves, and we put it here, combined with newly read
+ − 892 data. */
+ − 893 unsigned_char_dynarr *convert_from;
+ − 894
+ − 895 /* If set, this is the last chunk of data being processed. When this is
+ − 896 finished, output any necessary terminating control characters, escape
+ − 897 sequences, etc. */
+ − 898 unsigned int eof:1;
+ − 899
+ − 900 /* CH holds a partially built-up character. This is really part of the
+ − 901 state-dependent data and should be moved there. */
+ − 902 unsigned int ch;
+ − 903
+ − 904 /* Coding-system-specific data holding extra state about the
+ − 905 conversion. Logically a struct TYPE_coding_stream; a pointer
800
+ − 906 to such a struct, with (when ERROR_CHECK_TYPES is defined)
771
+ − 907 error-checking that this is really a structure of that type
+ − 908 (checking the corresponding coding system type) can be retrieved using
+ − 909 CODING_STREAM_TYPE_DATA(). Allocated at the same time that
+ − 910 CODESYS is set (which may occur at any time, even multiple times,
+ − 911 during the lifetime of the stream). The size comes from
+ − 912 methods->coding_data_size. */
+ − 913 void *data;
+ − 914
+ − 915 enum encode_decode direction;
+ − 916
800
+ − 917 /* If set, don't close the stream at the other end when being closed. */
+ − 918 unsigned int no_close_other:1;
802
+ − 919 /* If set, read only one byte at a time from other end to avoid any
+ − 920 possible blocking. */
+ − 921 unsigned int one_byte_at_a_time:1;
814
+ − 922 /* If set, and we're a read stream, we init char mode on ourselves as
+ − 923 necessary to prevent the caller from getting partial characters. (the
+ − 924 default) */
+ − 925 unsigned int set_char_mode_on_us_when_reading:1;
800
+ − 926
771
+ − 927 /* #### Temporary test */
+ − 928 unsigned int finalized:1;
+ − 929 };
+ − 930
/* The coding-specific data of lstream STREAM, obtained through
   LSTREAM_TYPE_DATA () for lstream type `coding'. */
#define CODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, coding)

/* The type-specific data of coding stream S, as a
   `struct TYPE_coding_stream *'.  With ERROR_CHECK_TYPES defined the
   lookup goes through error_check_TYPE_coding_stream_data (); otherwise
   it is a plain cast of S->data. */
#ifndef ERROR_CHECK_TYPES
# define CODING_STREAM_TYPE_DATA(s, type) \
    ((struct type##_coding_stream *) (s)->data)
#else
# define CODING_STREAM_TYPE_DATA(s, type) \
    error_check_##type##_coding_stream_data (s)
#endif
+ − 940
/* DECODE_ADD_BINARY_CHAR (C, DST): C should be a binary character in the
   range 0 - 255; convert it to internal format and add it to Dynarr DST.

   Under MULE, a byte for which byte_ascii_p () holds is added unchanged;
   a byte for which byte_c1_p () holds is added as LEADING_BYTE_CONTROL_1
   followed by C + 0x20; any other byte is added as
   LEADING_BYTE_LATIN_ISO8859_1 followed by C.  Without MULE the byte is
   added unchanged.

   C may be evaluated more than once, so it must not have side effects.
   It is parenthesized wherever it takes part in arithmetic, so that an
   argument such as `a | b' is not regrouped by operator precedence. */

#ifdef MULE

#define DECODE_ADD_BINARY_CHAR(c, dst)                          \
do {                                                            \
  if (byte_ascii_p (c))                                         \
    Dynarr_add (dst, (c));                                      \
  else if (byte_c1_p (c))                                       \
    {                                                           \
      Dynarr_add (dst, LEADING_BYTE_CONTROL_1);                 \
      Dynarr_add (dst, (c) + 0x20);                             \
    }                                                           \
  else                                                          \
    {                                                           \
      Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1);           \
      Dynarr_add (dst, (c));                                    \
    }                                                           \
} while (0)

#else /* not MULE */

#define DECODE_ADD_BINARY_CHAR(c, dst)                          \
do {                                                            \
  Dynarr_add (dst, (c));                                        \
} while (0)

#endif /* MULE */

/* DECODE_OUTPUT_PARTIAL_CHAR (CH, DST): if the lvalue CH is nonzero, add
   it to Dynarr DST with DECODE_ADD_BINARY_CHAR () and reset CH to 0.
   Does nothing when CH is already 0. */
#define DECODE_OUTPUT_PARTIAL_CHAR(ch, dst)                     \
do {                                                            \
  if (ch)                                                       \
    {                                                           \
      DECODE_ADD_BINARY_CHAR (ch, dst);                         \
      (ch) = 0;                                                 \
    }                                                           \
} while (0)
428
+ − 979
+ − 980 #ifdef MULE
/* Convert shift-JIS code (sj1, sj2) into internal string
   representation (c1, c2). (The leading byte is assumed.)

   SJ1 and SJ2 are each evaluated exactly once, before C1 or C2 is
   written, so an output may alias an input.  C1 and C2 must be lvalues.
   The temporaries carry a macro-specific prefix so that they cannot
   capture a caller's variable of the same name. */

#define DECODE_SHIFT_JIS(sj1, sj2, c1, c2)                              \
do {                                                                    \
  const int dsj_lead_ = (sj1);                                          \
  const int dsj_trail_ = (sj2);                                         \
                                                                        \
  if (dsj_trail_ >= 0x9f)                                               \
    {                                                                   \
      (c1) = (dsj_lead_ << 1) - ((dsj_lead_ >= 0xe0) ? 0xe0 : 0x60);    \
      (c2) = dsj_trail_ + 2;                                            \
    }                                                                   \
  else                                                                  \
    {                                                                   \
      (c1) = (dsj_lead_ << 1) - ((dsj_lead_ >= 0xe0) ? 0xe1 : 0x61);    \
      (c2) = dsj_trail_ + ((dsj_trail_ >= 0x7f) ? 0x60 : 0x61);         \
    }                                                                   \
} while (0)

/* Convert the internal string representation of a Shift-JIS character
   (c1, c2) into Shift-JIS code (sj1, sj2).  The leading byte is
   assumed.

   C1 and C2 are each evaluated exactly once, before SJ1 or SJ2 is
   written, so an output may alias an input.  SJ1 and SJ2 must be
   lvalues. */

#define ENCODE_SHIFT_JIS(c1, c2, sj1, sj2)                              \
do {                                                                    \
  const int esj_first_ = (c1);                                          \
  const int esj_second_ = (c2);                                         \
                                                                        \
  if (esj_first_ & 1)                                                   \
    {                                                                   \
      (sj1) = (esj_first_ >> 1) + ((esj_first_ < 0xdf) ? 0x31 : 0x71);  \
      (sj2) = esj_second_ - ((esj_second_ >= 0xe0) ? 0x60 : 0x61);      \
    }                                                                   \
  else                                                                  \
    {                                                                   \
      (sj1) = (esj_first_ >> 1) + ((esj_first_ < 0xdf) ? 0x30 : 0x70);  \
      (sj2) = esj_second_ - 2;                                          \
    }                                                                   \
} while (0)
+ − 1009 #endif /* MULE */
+ − 1010
771
/* Coding system types declared unconditionally. */
DECLARE_CODING_SYSTEM_TYPE (no_conversion);
DECLARE_CODING_SYSTEM_TYPE (convert_eol);
#if 0
DECLARE_CODING_SYSTEM_TYPE (text_file_wrapper);
#endif /* 0 */
DECLARE_CODING_SYSTEM_TYPE (undecided);
DECLARE_CODING_SYSTEM_TYPE (chain);
DECLARE_CODING_SYSTEM_TYPE (unicode);

/* Declared only in builds with DEBUG_XEMACS. */
#ifdef DEBUG_XEMACS
DECLARE_CODING_SYSTEM_TYPE (internal);
#endif /* DEBUG_XEMACS */

/* Declared only in builds with MULE. */
#ifdef MULE
DECLARE_CODING_SYSTEM_TYPE (iso2022);
DECLARE_CODING_SYSTEM_TYPE (ccl);
DECLARE_CODING_SYSTEM_TYPE (shift_jis);
DECLARE_CODING_SYSTEM_TYPE (big5);
#endif /* MULE */

/* Declared only in builds with HAVE_ZLIB. */
#ifdef HAVE_ZLIB
DECLARE_CODING_SYSTEM_TYPE (gzip);
#endif /* HAVE_ZLIB */

/* Declared only in builds with WIN32_ANY. */
#ifdef WIN32_ANY
DECLARE_CODING_SYSTEM_TYPE (mswindows_multibyte_to_unicode);
DECLARE_CODING_SYSTEM_TYPE (mswindows_multibyte);
#endif /* WIN32_ANY */
771
+ − 1040
+ − 1041 Lisp_Object coding_stream_detected_coding_system (Lstream *stream);
+ − 1042 Lisp_Object coding_stream_coding_system (Lstream *stream);
+ − 1043 void set_coding_stream_coding_system (Lstream *stream,
+ − 1044 Lisp_Object codesys);
+ − 1045 Lisp_Object detect_coding_stream (Lisp_Object stream);
867
+ − 1046 Ichar decode_big5_char (int o1, int o2);
771
+ − 1047 void add_entry_to_coding_system_type_list (struct coding_system_methods *m);
+ − 1048 Lisp_Object make_internal_coding_system (Lisp_Object existing,
2367
+ − 1049 Ascbyte *prefix,
771
+ − 1050 Lisp_Object type,
+ − 1051 Lisp_Object description,
+ − 1052 Lisp_Object props);
802
+ − 1053
814
/* Flag bits for coding lstreams.
   NOTE(review): presumably these are what the `flags' argument of
   make_coding_input_stream () / make_coding_output_stream () accepts,
   matching the no_close_other, one_byte_at_a_time and
   set_char_mode_on_us_when_reading bits of struct coding_stream --
   confirm against the implementation. */
#define LSTREAM_FL_NO_CLOSE_OTHER                  (1 << 16)
#define LSTREAM_FL_READ_ONE_BYTE_AT_A_TIME         (1 << 17)
#define LSTREAM_FL_NO_INIT_CHAR_MODE_WHEN_READING  (1 << 18)
+ − 1057
771
+ − 1058 Lisp_Object make_coding_input_stream (Lstream *stream, Lisp_Object codesys,
800
+ − 1059 enum encode_decode direction,
802
+ − 1060 int flags);
771
+ − 1061 Lisp_Object make_coding_output_stream (Lstream *stream, Lisp_Object codesys,
800
+ − 1062 enum encode_decode direction,
802
+ − 1063 int flags);
771
+ − 1064 void set_detection_results (struct detection_state *st, int detector,
+ − 1065 int given);
428
+ − 1066
440
+ − 1067 #endif /* INCLUDED_file_coding_h_ */
+ − 1068