comparison src/mule-coding.c @ 70:131b0175ea99 r20-0b30

Import from CVS: tag r20-0b30
author cvs
date Mon, 13 Aug 2007 09:02:59 +0200
parents
children 54cc21c15cbb
comparison
equal deleted inserted replaced
69:804d1389bcd6 70:131b0175ea99
1 /* Code conversion functions.
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
3 Copyright (C) 1995 Sun Microsystems, Inc.
4
5 This file is part of XEmacs.
6
7 XEmacs is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 2, or (at your option) any
10 later version.
11
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with XEmacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /* Synched up with: Mule 2.3. Not in FSF. */
23
24 /* Rewritten by Ben Wing <wing@666.com>. */
25
26 #include <config.h>
27 #include "lisp.h"
28
29 #include "buffer.h"
30 #include "elhash.h"
31 #include "insdel.h"
32 #include "lstream.h"
33 #include "mule-coding.h"
34
35 Lisp_Object Qfile_coding_system, Qcoding_system_error;
36
37 Lisp_Object Vkeyboard_coding_system;
38 Lisp_Object Vterminal_coding_system;
39 Lisp_Object Vprocess_input_coding_system;
40 Lisp_Object Vprocess_output_coding_system;
41 Lisp_Object Vpathname_coding_system;
42
43 /* Table of symbols identifying each coding category. */
44 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1];
45
46 /* Coding system currently associated with each coding category. */
47 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1];
48
49 /* Table of all coding categories in decreasing order of priority.
50 This describes a permutation of the possible coding categories. */
51 int coding_category_by_priority[CODING_CATEGORY_LAST + 1];
52
53 Lisp_Object Qcoding_system_p;
54
55 Lisp_Object Qbig5, Qshift_jis, Qno_conversion, Qccl, Qiso2022;
56 /* Qinternal in general.c */
57
58 Lisp_Object Qmnemonic, Qeol_type;
59 Lisp_Object Qcr, Qcrlf, Qlf;
60 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf;
61 Lisp_Object Qpost_read_conversion;
62 Lisp_Object Qpre_write_conversion;
63
64 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3;
65 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output;
66 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output;
67 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift;
68 Lisp_Object Qno_iso6429, Qescape_quoted;
69 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion;
70
71 Lisp_Object Qencode, Qdecode;
72
73 Lisp_Object Qctext;
74
75 Lisp_Object Vcoding_system_hashtable;
76
77 /* Additional information used by the ISO2022 decoder and detector. */
78 struct iso2022_decoder
79 {
80 /* CHARSET holds the character sets currently assigned to the G0
81 through G3 variables. It is initialized from the array
82 INITIAL_CHARSET in CODESYS. */
83 Lisp_Object charset[4];
84
85 /* Which registers are currently invoked into the left (GL) and
86 right (GR) halves of the 8-bit encoding space? */
87 int register_left, register_right;
88
89 /* ISO_ESC holds a value indicating part of an escape sequence
90 that has already been seen. */
91 enum iso_esc_flag esc;
92
93 /* This records the bytes we've seen so far in an escape sequence,
94 in case the sequence is invalid (we spit out the bytes unchanged). */
95 unsigned char esc_bytes[8];
96
97 /* Index for next byte to store in ISO escape sequence. */
98 int esc_bytes_index;
99
100 /* Stuff seen so far when composing a string. */
101 unsigned_char_dynarr *composite_chars;
102
103 /* If we saw an invalid designation sequence for a particular
104 register, we flag it here and switch to ASCII. The next time we
105 see a valid designation for this register, we turn off the flag
106 and do the designation normally, but pretend the sequence was
107 invalid. The effect of all this is that (most of the time) the
108 escape sequences for both the switch to the unknown charset, and
109 the switch back to the known charset, get inserted literally into
110 the buffer and saved out as such. The hope is that we can
111 preserve the escape sequences so that the resulting written out
112 file makes sense. If we don't do any of this, the designation
113 to the invalid charset will be preserved but that switch back
114 to the known charset will probably get eaten because it was
115 the same charset that was already present in the register. */
116 unsigned char invalid_designated[4];
117
118 /* We try to do similar things as above for direction-switching
119 sequences. If we encountered a direction switch while an
120 invalid designation was present, or an invalid designation
121 just after a direction switch (i.e. no valid designation
122 encountered yet), we insert the direction-switch escape
123 sequence literally into the output stream, and later on
124 insert the corresponding direction-restoring escape sequence
125 literally also. */
126 int switched_dir_and_no_valid_charset_yet :1;
127 int invalid_switch_dir :1;
128
129 /* Tells the decoder to output the escape sequence literally
130 even though it was valid. Used in the games we play to
131 avoid lossage when we encounter invalid designations. */
132 int output_literally :1;
133 /* We encountered a direction switch followed by an invalid
134 designation. We didn't output the direction switch
135 literally because we didn't know about the invalid designation;
136 but we have to do so now. */
137 int output_direction_sequence :1;
138 };
139
140 Lisp_Object Fcopy_coding_system (Lisp_Object old_coding_system,
141 Lisp_Object new_name);
142 struct detection_state;
143 static int detect_coding_shift_jis (struct detection_state *st,
144 CONST unsigned char *src,
145 unsigned int n);
146 static void decode_coding_shift_jis (Lstream *decoding,
147 CONST unsigned char *src,
148 unsigned_char_dynarr *dst,
149 unsigned int n);
150 static void encode_coding_shift_jis (Lstream *encoding,
151 CONST unsigned char *src,
152 unsigned_char_dynarr *dst,
153 unsigned int n);
154 static int detect_coding_big5 (struct detection_state *st,
155 CONST unsigned char *src,
156 unsigned int n);
157 static void decode_coding_big5 (Lstream *decoding,
158 CONST unsigned char *src,
159 unsigned_char_dynarr *dst, unsigned int n);
160 static void encode_coding_big5 (Lstream *encoding,
161 CONST unsigned char *src,
162 unsigned_char_dynarr *dst, unsigned int n);
163 static int postprocess_iso2022_mask (int mask);
164 static void reset_iso2022 (Lisp_Object coding_system,
165 struct iso2022_decoder *iso);
166 static int detect_coding_iso2022 (struct detection_state *st,
167 CONST unsigned char *src,
168 unsigned int n);
169 static void decode_coding_iso2022 (Lstream *decoding,
170 CONST unsigned char *src,
171 unsigned_char_dynarr *dst, unsigned int n);
172 static void encode_coding_iso2022 (Lstream *encoding,
173 CONST unsigned char *src,
174 unsigned_char_dynarr *dst, unsigned int n);
175 static void decode_coding_no_conversion (Lstream *decoding,
176 CONST unsigned char *src,
177 unsigned_char_dynarr *dst,
178 unsigned int n);
179 static void encode_coding_no_conversion (Lstream *encoding,
180 CONST unsigned char *src,
181 unsigned_char_dynarr *dst,
182 unsigned int n);
183 static void mule_decode (Lstream *decoding, CONST unsigned char *src,
184 unsigned_char_dynarr *dst, unsigned int n);
185 static void mule_encode (Lstream *encoding, CONST unsigned char *src,
186 unsigned_char_dynarr *dst, unsigned int n);
187
188 struct codesys_prop
189 {
190 Lisp_Object sym;
191 int prop_type;
192 };
193
194 typedef struct codesys_prop_dynarr_type
195 {
196 Dynarr_declare (struct codesys_prop);
197 } codesys_prop_dynarr;
198
199 codesys_prop_dynarr *the_codesys_prop_dynarr;
200
201 enum codesys_prop_enum
202 {
203 CODESYS_PROP_ALL_OK,
204 CODESYS_PROP_ISO2022,
205 CODESYS_PROP_CCL
206 };
207
208
209 /************************************************************************/
210 /* Coding system functions */
211 /************************************************************************/
212
213 static Lisp_Object mark_coding_system (Lisp_Object, void (*) (Lisp_Object));
214 static void print_coding_system (Lisp_Object, Lisp_Object, int);
215 static void finalize_coding_system (void *header, int for_disksave);
216
217 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system,
218 mark_coding_system, print_coding_system,
219 finalize_coding_system,
220 0, 0, struct Lisp_Coding_System);
221
222 static Lisp_Object
223 mark_coding_system (Lisp_Object obj, void (*markobj) (Lisp_Object))
224 {
225 struct Lisp_Coding_System *codesys = XCODING_SYSTEM (obj);
226 int i;
227
228 (markobj) (CODING_SYSTEM_NAME (codesys));
229 (markobj) (CODING_SYSTEM_DOC_STRING (codesys));
230 (markobj) (CODING_SYSTEM_MNEMONIC (codesys));
231 (markobj) (CODING_SYSTEM_EOL_LF (codesys));
232 (markobj) (CODING_SYSTEM_EOL_CRLF (codesys));
233 (markobj) (CODING_SYSTEM_EOL_CR (codesys));
234 switch (CODING_SYSTEM_TYPE (codesys))
235 {
236 case CODESYS_ISO2022:
237 for (i = 0; i < 4; i++)
238 (markobj) (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i));
239 if (codesys->iso2022.input_conv)
240 {
241 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++)
242 {
243 struct charset_conversion_spec *ccs =
244 Dynarr_atp (codesys->iso2022.input_conv, i);
245 (markobj) (ccs->from_charset);
246 (markobj) (ccs->to_charset);
247 }
248 }
249 if (codesys->iso2022.output_conv)
250 {
251 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++)
252 {
253 struct charset_conversion_spec *ccs =
254 Dynarr_atp (codesys->iso2022.output_conv, i);
255 (markobj) (ccs->from_charset);
256 (markobj) (ccs->to_charset);
257 }
258 }
259 break;
260
261 case CODESYS_CCL:
262 (markobj) (CODING_SYSTEM_CCL_DECODE (codesys));
263 (markobj) (CODING_SYSTEM_CCL_ENCODE (codesys));
264 break;
265 default:
266 break;
267 }
268
269 (markobj) (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys));
270 return CODING_SYSTEM_POST_READ_CONVERSION (codesys);
271 }
272
273 static void
274 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun,
275 int escapeflag)
276 {
277 struct Lisp_Coding_System *c = XCODING_SYSTEM (obj);
278 if (print_readably)
279 error ("printing unreadable object #<coding_system 0x%x>",
280 c->header.uid);
281
282 write_c_string ("#<coding_system ", printcharfun);
283 print_internal (c->name, printcharfun, 1);
284 write_c_string (">", printcharfun);
285 }
286
287 static void
288 finalize_coding_system (void *header, int for_disksave)
289 {
290 struct Lisp_Coding_System *c = (struct Lisp_Coding_System *) header;
291 /* Since coding systems never go away, this function is not
292 necessary. But it would be necessary if we changed things
293 so that coding systems could go away. */
294 if (!for_disksave) /* see comment in lstream.c */
295 {
296 switch (CODING_SYSTEM_TYPE (c))
297 {
298 case CODESYS_ISO2022:
299 if (c->iso2022.input_conv)
300 {
301 Dynarr_free (c->iso2022.input_conv);
302 c->iso2022.input_conv = 0;
303 }
304 if (c->iso2022.output_conv)
305 {
306 Dynarr_free (c->iso2022.output_conv);
307 c->iso2022.output_conv = 0;
308 }
309 break;
310
311 default:
312 break;
313 }
314 }
315 }
316
317 static int
318 symbol_to_eol_type (Lisp_Object symbol)
319 {
320 CHECK_SYMBOL (symbol);
321 if (NILP (symbol)) return EOL_AUTODETECT;
322 else if (EQ (symbol, Qlf)) return EOL_LF;
323 else if (EQ (symbol, Qcrlf)) return EOL_CRLF;
324 else if (EQ (symbol, Qcr)) return EOL_CR;
325 else
326 signal_simple_error ("Unrecognized eol type", symbol);
327
328 return 0; /* not reached */
329 }
330
331 static Lisp_Object
332 eol_type_to_symbol (int eol_type)
333 {
334 switch (eol_type)
335 {
336 case EOL_LF: return Qlf;
337 case EOL_CRLF: return Qcrlf;
338 case EOL_CR: return Qcr;
339 case EOL_AUTODETECT: return Qnil;
340 default: abort ();
341 }
342
343 return Qnil; /* not reached */
344 }
345
346 static void
347 setup_eol_coding_systems (struct Lisp_Coding_System *codesys)
348 {
349 Lisp_Object codesys_obj = Qnil;
350 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name);
351 char *codesys_name = (char *) alloca (len + 7);
352 Lisp_Object codesys_name_sym, sub_codesys_obj;
353
354 /* kludge */
355
356 XSETCODING_SYSTEM (codesys_obj, codesys);
357
358 memcpy (codesys_name,
359 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len);
360
361 #define DEFINE_SUB_CODESYS(op_sys, Type) do { \
362 strcpy (codesys_name + len, "-" op_sys); \
363 codesys_name_sym = intern (codesys_name); \
364 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \
365 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \
366 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \
367 } while (0)
368
369 DEFINE_SUB_CODESYS("unix", EOL_LF);
370 DEFINE_SUB_CODESYS("dos", EOL_CRLF);
371 DEFINE_SUB_CODESYS("mac", EOL_CR);
372 }
373
374 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /*
375 T if OBJECT is a coding system.
376 A coding system is an object that defines how text containing multiple
377 character sets is encoded into a stream of (typically 8-bit) bytes.
378 The coding system is used to decode the stream into a series of
379 characters (which may be from multiple charsets) when the text is read
380 from a file or process, and is used to encode the text back into the
381 same format when it is written out to a file or process.
382
383 For example, many ISO2022-compliant coding systems (such as Compound
384 Text, which is used for inter-client data under the X Window System)
385 use escape sequences to switch between different charsets -- Japanese
386 Kanji, for example, is invoked with \"ESC $ ( B\"; ASCII is invoked
387 with \"ESC ( B\"; and Cyrillic is invoked with \"ESC - L\". See
388 `make-coding-system' for more information.
389
390 Coding systems are normally identified using a symbol, and the
391 symbol is accepted in place of the actual coding system object whenever
392 a coding system is called for. (This is similar to how faces work.)
393 */
394 (object))
395 {
396 return CODING_SYSTEMP (object) ? Qt : Qnil;
397 }
398
399 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /*
400 Retrieve the coding system of the given name.
401
402 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply
403 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol.
404 If there is no such coding system, nil is returned. Otherwise the
405 associated coding system object is returned.
406 */
407 (coding_system_or_name))
408 {
409 if (NILP (coding_system_or_name))
410 coding_system_or_name = Qbinary;
411 if (CODING_SYSTEMP (coding_system_or_name))
412 return coding_system_or_name;
413 CHECK_SYMBOL (coding_system_or_name);
414
415 return Fgethash (coding_system_or_name, Vcoding_system_hashtable, Qnil);
416 }
417
418 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /*
419 Retrieve the coding system of the given name.
420 Same as `find-coding-system' except that if there is no such
421 coding system, an error is signaled instead of returning nil.
422 */
423 (name))
424 {
425 Lisp_Object coding_system = Ffind_coding_system (name);
426
427 if (NILP (coding_system))
428 signal_simple_error ("No such coding system", name);
429 return coding_system;
430 }
431
432 /* We store the coding systems in hash tables with the names as the key and the
433 actual coding system object as the value. Occasionally we need to use them
434 in a list format. These routines provide us with that. */
435 struct coding_system_list_closure
436 {
437 Lisp_Object *coding_system_list;
438 };
439
440 static void
441 add_coding_system_to_list_mapper (CONST void *hash_key, void *hash_contents,
442 void *coding_system_list_closure)
443 {
444 /* This function can GC */
445 Lisp_Object key, contents;
446 Lisp_Object *coding_system_list;
447 struct coding_system_list_closure *chcl = coding_system_list_closure;
448 CVOID_TO_LISP (key, hash_key);
449 VOID_TO_LISP (contents, hash_contents);
450 coding_system_list = chcl->coding_system_list;
451
452 *coding_system_list = Fcons (XCODING_SYSTEM (contents)->name,
453 *coding_system_list);
454 }
455
456 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /*
457 Return a list of the names of all defined coding systems.
458 */
459 ())
460 {
461 Lisp_Object coding_system_list = Qnil;
462 struct gcpro gcpro1;
463 struct coding_system_list_closure coding_system_list_closure;
464
465 GCPRO1 (coding_system_list);
466 coding_system_list_closure.coding_system_list = &coding_system_list;
467 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hashtable,
468 &coding_system_list_closure);
469 UNGCPRO;
470
471 return coding_system_list;
472 }
473
474 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /*
475 Return the name of the given coding system.
476 */
477 (coding_system))
478 {
479 coding_system = Fget_coding_system (coding_system);
480 return (XCODING_SYSTEM_NAME (coding_system));
481 }
482
483 static struct Lisp_Coding_System *
484 allocate_coding_system (int type, Lisp_Object name)
485 {
486 struct Lisp_Coding_System *codesys;
487
488 codesys = (struct Lisp_Coding_System *)
489 alloc_lcrecord (sizeof (struct Lisp_Coding_System), lrecord_coding_system);
490
491 zero_lcrecord (codesys);
492 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil;
493 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil;
494 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT;
495 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil;
496 CODING_SYSTEM_EOL_CR (codesys) = Qnil;
497 CODING_SYSTEM_EOL_LF (codesys) = Qnil;
498 CODING_SYSTEM_TYPE (codesys) = type;
499
500 if (type == CODESYS_ISO2022)
501 {
502 int i;
503 for (i = 0; i < 4; i++)
504 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil;
505 }
506 else if (type == CODESYS_CCL)
507 {
508 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil;
509 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil;
510 }
511
512 CODING_SYSTEM_NAME (codesys) = name;
513
514 return codesys;
515 }
516
517 /* Given a list of charset conversion specs as specified in a Lisp
518 program, parse it into STORE_HERE. */
519
520 static void
521 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here,
522 Lisp_Object spec_list)
523 {
524 Lisp_Object rest;
525
526 EXTERNAL_LIST_LOOP (rest, spec_list)
527 {
528 Lisp_Object car = XCAR (rest);
529 Lisp_Object from, to;
530 struct charset_conversion_spec spec;
531
532 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car))))
533 signal_simple_error ("Invalid charset conversion spec", car);
534 from = Fget_charset (XCAR (car));
535 to = Fget_charset (XCAR (XCDR (car)));
536 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to))
537 signal_simple_error_2
538 ("Attempted conversion between different charset types",
539 from, to);
540 spec.from_charset = from;
541 spec.to_charset = to;
542
543 Dynarr_add (store_here, spec);
544 }
545 }
546
547 /* Given a dynarr LOAD_HERE of internally-stored charset conversion
548 specs, return the equivalent as the Lisp programmer would see it.
549
550 If LOAD_HERE is 0, return Qnil. */
551
552 static Lisp_Object
553 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here)
554 {
555 int i;
556 Lisp_Object result = Qnil;
557
558 if (!load_here)
559 return Qnil;
560 for (i = 0; i < Dynarr_length (load_here); i++)
561 {
562 struct charset_conversion_spec *ccs =
563 Dynarr_atp (load_here, i);
564 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result);
565 }
566
567 return Fnreverse (result);
568 }
569
570 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /*
571 Register symbol NAME as a coding system.
572
573 TYPE describes the conversion method used and should be one of
574
575 nil or 'autodetect
576 Automatic conversion. XEmacs attempts to detect the coding system
577 used in the file.
578 'no-conversion
579 No conversion. Use this for binary files and such. On output,
580 graphic characters that are not in ASCII or Latin-1 will be
581 replaced by a ?. (For a no-conversion-encoded buffer, these
582 characters will only be present if you explicitly insert them.)
583 'shift-jis
584 Shift-JIS (a Japanese encoding commonly used in PC operating systems).
585 'iso2022
586 Any ISO2022-compliant encoding. Among other things, this includes
587 JIS (the Japanese encoding commonly used for e-mail), EUC (the
588 standard Unix encoding for Japanese and other languages), and
589 Compound Text (the encoding used in X11). You can specify more
590 specific information about the conversion with the FLAGS argument.
591 'big5
592 Big5 (the encoding commonly used for Taiwanese).
593 'ccl
594 The conversion is performed using a user-written pseudo-code
595 program. CCL (Code Conversion Language) is the name of this
596 pseudo-code.
597 'internal
598 Write out or read in the raw contents of the memory representing
599 the buffer's text. This is primarily useful for debugging
600 purposes, and is only enabled when XEmacs has been compiled with
601 DEBUG_XEMACS defined (via the --debug configure option).
602 WARNING: Reading in a file using 'internal conversion can result
603 in an internal inconsistency in the memory representing a
604 buffer's text, which will produce unpredictable results and may
605 cause XEmacs to crash. Under normal circumstances you should
606 never use 'internal conversion.
607
608 DOC-STRING is a string describing the coding system.
609
610 PROPS is a property list, describing the specific nature of the
611 character set. Recognized properties are:
612
613 'mnemonic
614 String to be displayed in the modeline when this coding system is
615 active.
616
617 'eol-type
618 End-of-line conversion to be used. It should be one of
619
620 nil
621 Automatically detect the end-of-line type (LF, CRLF,
622 or CR). Also generate subsidiary coding systems named
623 `NAME-unix', `NAME-dos', and `NAME-mac', that are
624 identical to this coding system but have an EOL-TYPE
625 value of 'lf, 'crlf, and 'cr, respectively.
626 'lf
627 The end of a line is marked externally using ASCII LF.
628 Since this is also the way that XEmacs represents an
629 end-of-line internally, specifying this option results
630 in no end-of-line conversion. This is the standard
631 format for Unix text files.
632 'crlf
633 The end of a line is marked externally using ASCII
634 CRLF. This is the standard format for MS-DOS text
635 files.
636 'cr
637 The end of a line is marked externally using ASCII CR.
638 This is the standard format for Macintosh text files.
639 t
640 Automatically detect the end-of-line type but do not
641 generate subsidiary coding systems. (This value is
642 converted to nil when stored internally, and
643 `coding-system-property' will return nil.)
644
645 'post-read-conversion
646 Function called after a file has been read in, to perform the
647 decoding. Called with two arguments, BEG and END, denoting
648 a region of the current buffer to be decoded.
649
650 'pre-write-conversion
651 Function called before a file is written out, to perform the
652 encoding. Called with two arguments, BEG and END, denoting
653 a region of the current buffer to be encoded.
654
655
656 The following additional properties are recognized if TYPE is 'iso2022:
657
658 'charset-g0
659 'charset-g1
660 'charset-g2
661 'charset-g3
662 The character set initially designated to the G0 - G3 registers.
663 The value should be one of
664
665 -- A charset object (designate that character set)
666 -- nil (do not ever use this register)
667 -- t (no character set is initially designated to
668 the register, but may be later on; this automatically
669 sets the corresponding `force-g*-on-output' property)
670
671 'force-g0-on-output
672 'force-g1-on-output
673 'force-g2-on-output
674 'force-g2-on-output
675 If non-nil, send an explicit designation sequence on output before
676 using the specified register.
677
678 'short
679 If non-nil, use the short forms \"ESC $ @\", \"ESC $ A\", and
680 \"ESC $ B\" on output in place of the full designation sequences
681 \"ESC $ ( @\", \"ESC $ ( A\", and \"ESC $ ( B\".
682
683 'no-ascii-eol
684 If non-nil, don't designate ASCII to G0 at each end of line on output.
685 Setting this to non-nil also suppresses other state-resetting that
686 normally happens at the end of a line.
687
688 'no-ascii-cntl
689 If non-nil, don't designate ASCII to G0 before control chars on output.
690
691 'seven
692 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit
693 environment.
694
695 'lock-shift
696 If non-nil, use locking-shift (SO/SI) instead of single-shift
697 or designation by escape sequence.
698
699 'no-iso6429
700 If non-nil, don't use ISO6429's direction specification.
701
702 'escape-quoted
703 If non-nil, literal control characters that are the same as
704 the beginning of a recognized ISO2022 or ISO6429 escape sequence
705 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E),
706 SS3 (0x8F), and CSI (0x9B)) are \"quoted\" with an escape character
707 so that they can be properly distinguished from an escape sequence.
708 (Note that doing this results in a non-portable encoding.) This
709 encoding flag is used for byte-compiled files. Note that ESC
710 is a good choice for a quoting character because there are no
711 escape sequences whose second byte is a character from the Control-0
712 or Control-1 character sets; this is explicitly disallowed by the
713 ISO2022 standard.
714
715 'input-charset-conversion
716 A list of conversion specifications, specifying conversion of
717 characters in one charset to another when decoding is performed.
718 Each specification is a list of two elements: the source charset,
719 and the destination charset.
720
721 'output-charset-conversion
722 A list of conversion specifications, specifying conversion of
723 characters in one charset to another when encoding is performed.
724 The form of each specification is the same as for
725 'input-charset-conversion.
726
727
728 The following additional properties are recognized (and required)
729 if TYPE is 'ccl:
730
731 'decode
732 CCL program used for decoding (converting to internal format).
733
734 'encode
735 CCL program used for encoding (converting to external format).
736 */
737 (name, type, doc_string, props))
738 {
739 struct Lisp_Coding_System *codesys;
740 Lisp_Object rest, key, value;
741 int ty;
742 int need_to_setup_eol_systems = 1;
743
744 /* Convert type to constant */
745 if (NILP (type) || EQ (type, Qautodetect)) { ty = CODESYS_AUTODETECT; }
746 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; }
747 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; }
748 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; }
749 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; }
750 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; }
751 #ifdef DEBUG_XEMACS
752 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; }
753 #endif
754 else
755 signal_simple_error ("Invalid coding system type", type);
756
757 CHECK_SYMBOL (name);
758
759 codesys = allocate_coding_system (ty, name);
760
761 if (NILP (doc_string))
762 doc_string = build_string ("");
763 else
764 CHECK_STRING (doc_string);
765 CODING_SYSTEM_DOC_STRING (codesys) = doc_string;
766
767 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props)
768 {
769 if (EQ (key, Qmnemonic))
770 {
771 if (!NILP (value))
772 CHECK_STRING (value);
773 CODING_SYSTEM_MNEMONIC (codesys) = value;
774 }
775
776 else if (EQ (key, Qeol_type))
777 {
778 need_to_setup_eol_systems = NILP (value);
779 if (EQ (value, Qt))
780 value = Qnil;
781 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value);
782 }
783
784 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value;
785 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value;
786 else if (ty == CODESYS_ISO2022)
787 {
788 #define FROB_INITIAL_CHARSET(charset_num) \
789 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \
790 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value))
791
792 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0);
793 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1);
794 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2);
795 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3);
796
797 #define FROB_FORCE_CHARSET(charset_num) \
798 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value)
799
800 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0);
801 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1);
802 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2);
803 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3);
804
805 #define FROB_BOOLEAN_PROPERTY(prop) \
806 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value)
807
808 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT);
809 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL);
810 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL);
811 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN);
812 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT);
813 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429);
814 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED);
815
816 else if (EQ (key, Qinput_charset_conversion))
817 {
818 codesys->iso2022.input_conv =
819 Dynarr_new (struct charset_conversion_spec);
820 parse_charset_conversion_specs (codesys->iso2022.input_conv,
821 value);
822 }
823 else if (EQ (key, Qoutput_charset_conversion))
824 {
825 codesys->iso2022.output_conv =
826 Dynarr_new (struct charset_conversion_spec);
827 parse_charset_conversion_specs (codesys->iso2022.output_conv,
828 value);
829 }
830 else
831 signal_simple_error ("Unrecognized property", key);
832 }
833 else if (EQ (type, Qccl))
834 {
835 if (EQ (key, Qdecode))
836 {
837 CHECK_VECTOR (value);
838 CODING_SYSTEM_CCL_DECODE (codesys) = value;
839 }
840 else if (EQ (key, Qencode))
841 {
842 CHECK_VECTOR (value);
843 CODING_SYSTEM_CCL_ENCODE (codesys) = value;
844 }
845 else
846 signal_simple_error ("Unrecognized property", key);
847 }
848 else
849 signal_simple_error ("Unrecognized property", key);
850 }
851
852 if (need_to_setup_eol_systems)
853 setup_eol_coding_systems (codesys);
854
855 {
856 Lisp_Object codesys_obj;
857 XSETCODING_SYSTEM (codesys_obj, codesys);
858 Fputhash (name, codesys_obj, Vcoding_system_hashtable);
859 return codesys_obj;
860 }
861 }
862
863 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /*
864 Copy OLD-CODING-SYSTEM to NEW-NAME.
865 If NEW-NAME does not name an existing coding system, a new one will
866 be created.
867 */
868 (old_coding_system, new_name))
869 {
870 Lisp_Object new_coding_system;
871 old_coding_system = Fget_coding_system (old_coding_system);
872 new_coding_system = Ffind_coding_system (new_name);
873 if (NILP (new_coding_system))
874 {
875 XSETCODING_SYSTEM (new_coding_system,
876 allocate_coding_system
877 (XCODING_SYSTEM_TYPE (old_coding_system),
878 new_name));
879 Fputhash (new_name, new_coding_system, Vcoding_system_hashtable);
880 }
881
882 {
883 struct Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system);
884 struct Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system);
885 memcpy (((char *) to ) + sizeof (to->header),
886 ((char *) from) + sizeof (from->header),
887 sizeof (*from) - sizeof (from->header));
888 to->name = new_name;
889 }
890 return new_coding_system;
891 }
892
893 static Lisp_Object
894 subsidiary_coding_system (Lisp_Object coding_system, int eol_type)
895 {
896 struct Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system);
897 Lisp_Object new_coding_system;
898
899 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
900 return coding_system;
901 if (eol_type == EOL_AUTODETECT)
902 return coding_system;
903
904 switch (eol_type)
905 {
906 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break;
907 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break;
908 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break;
909 default: abort ();
910 }
911
912 return NILP (new_coding_system) ? coding_system : new_coding_system;
913 }
914
915 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /*
916 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE.
917 */
918 (coding_system, eol_type))
919 {
920 coding_system = Fget_coding_system (coding_system);
921
922 return subsidiary_coding_system (coding_system,
923 symbol_to_eol_type (eol_type));
924 }
925
926
927 /************************************************************************/
928 /* Coding system accessors */
929 /************************************************************************/
930
931 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /*
932 Return the doc string for CODING-SYSTEM.
933 */
934 (coding_system))
935 {
936 coding_system = Fget_coding_system (coding_system);
937 return XCODING_SYSTEM_DOC_STRING (coding_system);
938 }
939
940 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /*
941 Return the type of CODING-SYSTEM.
942 */
943 (coding_system))
944 {
945 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system)))
946 {
947 case CODESYS_AUTODETECT: return Qautodetect;
948 case CODESYS_SHIFT_JIS: return Qshift_jis;
949 case CODESYS_ISO2022: return Qiso2022;
950 case CODESYS_BIG5: return Qbig5;
951 case CODESYS_CCL: return Qccl;
952 case CODESYS_NO_CONVERSION: return Qno_conversion;
953 #ifdef DEBUG_XEMACS
954 case CODESYS_INTERNAL: return Qinternal;
955 #endif
956 default:
957 abort ();
958 }
959
960 return Qnil; /* not reached */
961 }
962
963 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /*
964 Return the PROP property of CODING-SYSTEM.
965 */
966 (coding_system, prop))
967 {
968 int i, ok = 0;
969 enum coding_system_type type;
970
971 coding_system = Fget_coding_system (coding_system);
972 CHECK_SYMBOL (prop);
973 type = XCODING_SYSTEM_TYPE (coding_system);
974
975 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++)
976 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop))
977 {
978 ok = 1;
979 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type)
980 {
981 case CODESYS_PROP_ALL_OK:
982 break;
983
984 case CODESYS_PROP_ISO2022:
985 if (type != CODESYS_ISO2022)
986 signal_simple_error
987 ("Property only valid in ISO2022 coding systems",
988 prop);
989 break;
990
991 case CODESYS_PROP_CCL:
992 if (type != CODESYS_CCL)
993 signal_simple_error
994 ("Property only valid in CCL coding systems",
995 prop);
996 break;
997
998 default:
999 abort ();
1000 }
1001 }
1002
1003 if (!ok)
1004 signal_simple_error ("Unrecognized property", prop);
1005
1006 if (EQ (prop, Qname))
1007 return XCODING_SYSTEM_NAME (coding_system);
1008 else if (EQ (prop, Qtype))
1009 return Fcoding_system_type (coding_system);
1010 else if (EQ (prop, Qdoc_string))
1011 return XCODING_SYSTEM_DOC_STRING (coding_system);
1012 else if (EQ (prop, Qmnemonic))
1013 return XCODING_SYSTEM_MNEMONIC (coding_system);
1014 else if (EQ (prop, Qeol_type))
1015 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system));
1016 else if (EQ (prop, Qeol_lf))
1017 return XCODING_SYSTEM_EOL_LF (coding_system);
1018 else if (EQ (prop, Qeol_crlf))
1019 return XCODING_SYSTEM_EOL_CRLF (coding_system);
1020 else if (EQ (prop, Qeol_cr))
1021 return XCODING_SYSTEM_EOL_CR (coding_system);
1022 else if (EQ (prop, Qpost_read_conversion))
1023 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system);
1024 else if (EQ (prop, Qpre_write_conversion))
1025 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system);
1026 else if (type == CODESYS_ISO2022)
1027 {
1028 #define INITIAL_CHARSET(charset_num) \
1029 (XCHARSET_NAME (XCODING_SYSTEM_ISO2022_INITIAL_CHARSET \
1030 (coding_system, charset_num)))
1031
1032 if (EQ (prop, Qcharset_g0)) return INITIAL_CHARSET (0);
1033 else if (EQ (prop, Qcharset_g1)) return INITIAL_CHARSET (1);
1034 else if (EQ (prop, Qcharset_g2)) return INITIAL_CHARSET (2);
1035 else if (EQ (prop, Qcharset_g3)) return INITIAL_CHARSET (3);
1036
1037 #define FORCE_CHARSET(charset_num) \
1038 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \
1039 (coding_system, charset_num) ? Qt : Qnil)
1040
1041 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0);
1042 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1);
1043 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2);
1044 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3);
1045
1046 #define LISP_BOOLEAN(prop) \
1047 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil)
1048
1049 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT);
1050 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL);
1051 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL);
1052 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN);
1053 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT);
1054 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429);
1055 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED);
1056
1057 else if (EQ (prop, Qinput_charset_conversion))
1058 return
1059 unparse_charset_conversion_specs
1060 (XCODING_SYSTEM (coding_system)->iso2022.input_conv);
1061 else if (EQ (prop, Qoutput_charset_conversion))
1062 return
1063 unparse_charset_conversion_specs
1064 (XCODING_SYSTEM (coding_system)->iso2022.output_conv);
1065 else
1066 abort ();
1067 }
1068 else if (type == CODESYS_CCL)
1069 {
1070 if (EQ (prop, Qdecode))
1071 return XCODING_SYSTEM_CCL_DECODE (coding_system);
1072 else if (EQ (prop, Qencode))
1073 return XCODING_SYSTEM_CCL_ENCODE (coding_system);
1074 else
1075 abort ();
1076 }
1077 else
1078 abort ();
1079
1080 return Qnil; /* not reached */
1081 }
1082
1083
1084 /************************************************************************/
1085 /* Coding category functions */
1086 /************************************************************************/
1087
1088 static int
1089 decode_coding_category (Lisp_Object symbol)
1090 {
1091 int i;
1092
1093 CHECK_SYMBOL (symbol);
1094 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1095 if (EQ (coding_category_symbol[i], symbol))
1096 return i;
1097
1098 signal_simple_error ("Unrecognized coding category", symbol);
1099 return 0; /* not reached */
1100 }
1101
1102 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /*
1103 Return a list of all recognized coding categories.
1104 */
1105 ())
1106 {
1107 int i;
1108 Lisp_Object list = Qnil;
1109
1110 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1111 list = Fcons (coding_category_symbol[i], list);
1112 return list;
1113 }
1114
1115 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /*
1116 Change the priority order of the coding categories.
1117 LIST should be list of coding categories, in descending order of
1118 priority. Unspecified coding categories will be lower in priority
1119 than all specified ones, in the same relative order they were in
1120 previously.
1121 */
1122 (list))
1123 {
1124 int category_to_priority[CODING_CATEGORY_LAST + 1];
1125 int i, j;
1126 Lisp_Object rest;
1127
1128 /* First generate a list that maps coding categories to priorities. */
1129
1130 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1131 category_to_priority[i] = -1;
1132
1133 /* Highest priority comes from the specified list. */
1134 i = 0;
1135 EXTERNAL_LIST_LOOP (rest, list)
1136 {
1137 int cat = decode_coding_category (XCAR (rest));
1138
1139 if (category_to_priority[cat] >= 0)
1140 signal_simple_error ("Duplicate coding category in list", XCAR (rest));
1141 category_to_priority[cat] = i++;
1142 }
1143
1144 /* Now go through the existing categories by priority to retrieve
1145 the categories not yet specified and preserve their priority
1146 order. */
1147 for (j = 0; j <= CODING_CATEGORY_LAST; j++)
1148 {
1149 int cat = coding_category_by_priority[j];
1150 if (category_to_priority[cat] < 0)
1151 category_to_priority[cat] = i++;
1152 }
1153
1154 /* Now we need to construct the inverse of the mapping we just
1155 constructed. */
1156
1157 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1158 coding_category_by_priority[category_to_priority[i]] = i;
1159
1160 /* Phew! That was confusing. */
1161 return Qnil;
1162 }
1163
1164 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /*
1165 Return a list of coding categories in descending order of priority.
1166 */
1167 ())
1168 {
1169 int i;
1170 Lisp_Object list = Qnil;
1171
1172 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1173 list = Fcons (coding_category_symbol[coding_category_by_priority[i]],
1174 list);
1175 return list;
1176 }
1177
1178 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /*
1179 Change the coding system associated with a coding category.
1180 */
1181 (coding_category, coding_system))
1182 {
1183 int cat = decode_coding_category (coding_category);
1184
1185 coding_system = Fget_coding_system (coding_system);
1186 coding_category_system[cat] = coding_system;
1187 return Qnil;
1188 }
1189
1190 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /*
1191 Return the coding system associated with a coding category.
1192 */
1193 (coding_category))
1194 {
1195 int cat = decode_coding_category (coding_category);
1196 Lisp_Object sys = coding_category_system[cat];
1197
1198 if (!NILP (sys))
1199 return XCODING_SYSTEM_NAME (sys);
1200 return Qnil;
1201 }
1202
1203
1204 /************************************************************************/
1205 /* Detecting the encoding of data */
1206 /************************************************************************/
1207
1208 struct detection_state
1209 {
1210 int eol_type;
1211 int seen_non_ascii;
1212 int mask;
1213
1214 struct
1215 {
1216 int mask;
1217 int in_second_byte;
1218 }
1219 big5;
1220
1221 struct
1222 {
1223 int mask;
1224 int in_second_byte;
1225 }
1226 shift_jis;
1227
1228 struct
1229 {
1230 int mask;
1231 int initted;
1232 struct iso2022_decoder iso;
1233 unsigned int flags;
1234 int high_byte_count;
1235 int saw_single_shift:1;
1236 }
1237 iso2022;
1238
1239 struct
1240 {
1241 int seen_anything;
1242 int just_saw_cr;
1243 }
1244 eol;
1245 };
1246
1247 static int
1248 acceptable_control_char_p (int c)
1249 {
1250 switch (c)
1251 {
1252 /* Allow and ignore control characters that you might
1253 reasonably see in a text file */
1254 case '\r':
1255 case '\n':
1256 case '\t':
1257 case 7: /* bell */
1258 case 8: /* backspace */
1259 case 11: /* vertical tab */
1260 case 12: /* form feed */
1261 case 26: /* MS-DOS C-z junk */
1262 return 1;
1263 default:
1264 return 0;
1265 }
1266 }
1267
1268 static int
1269 mask_has_at_most_one_bit_p (int mask)
1270 {
1271 /* Perhaps the only thing useful you learn from intensive Microsoft
1272 technical interviews */
1273 return (mask & (mask - 1)) == 0;
1274 }
1275
1276 static int
1277 detect_eol_type (struct detection_state *st, CONST unsigned char *src,
1278 unsigned int n)
1279 {
1280 int c;
1281
1282 while (n--)
1283 {
1284 c = *src++;
1285 if (c == '\r')
1286 st->eol.just_saw_cr = 1;
1287 else
1288 {
1289 if (c == '\n')
1290 {
1291 if (st->eol.just_saw_cr)
1292 return EOL_CRLF;
1293 else if (st->eol.seen_anything)
1294 return EOL_LF;
1295 }
1296 else if (st->eol.just_saw_cr)
1297 return EOL_CR;
1298 st->eol.just_saw_cr = 0;
1299 }
1300 st->eol.seen_anything = 1;
1301 }
1302
1303 return EOL_AUTODETECT;
1304 }
1305
1306 /* Attempt to determine the encoding and EOL type of the given text.
1307 Before calling this function for the first type, you must initialize
1308 st->eol_type as appropriate and initialize st->mask to ~0.
1309
1310 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if
1311 not yet known.
1312
1313 st->mask holds the determined coding category mask, or ~0 if only
1314 ASCII has been seen so far.
1315
1316 Returns:
1317
1318 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category
1319 is present in st->mask
1320 1 == definitive answers are here for both st->eol_type and st->mask
1321 */
1322
1323 static int
1324 detect_coding_type (struct detection_state *st, CONST unsigned char *src,
1325 unsigned int n, int just_do_eol)
1326 {
1327 int c;
1328
1329 if (st->eol_type == EOL_AUTODETECT)
1330 st->eol_type = detect_eol_type (st, src, n);
1331
1332 if (just_do_eol)
1333 return st->eol_type != EOL_AUTODETECT;
1334
1335 if (!st->seen_non_ascii)
1336 {
1337 for (; n; n--, src++)
1338 {
1339 c = *src;
1340 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80)
1341 {
1342 st->seen_non_ascii = 1;
1343 st->shift_jis.mask = ~0;
1344 st->big5.mask = ~0;
1345 st->iso2022.mask = ~0;
1346 break;
1347 }
1348 }
1349 }
1350
1351 if (!n)
1352 return 0;
1353
1354 if (!mask_has_at_most_one_bit_p (st->iso2022.mask))
1355 st->iso2022.mask = detect_coding_iso2022 (st, src, n);
1356 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask))
1357 st->shift_jis.mask = detect_coding_shift_jis (st, src, n);
1358 if (!mask_has_at_most_one_bit_p (st->big5.mask))
1359 st->big5.mask = detect_coding_big5 (st, src, n);
1360
1361 st->mask = st->iso2022.mask | st->shift_jis.mask | st->big5.mask;
1362
1363 {
1364 int retval = mask_has_at_most_one_bit_p (st->mask);
1365 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK;
1366 return retval && st->eol_type != EOL_AUTODETECT;
1367 }
1368 }
1369
1370 static Lisp_Object
1371 coding_system_from_mask (int mask)
1372 {
1373 if (mask == ~0)
1374 {
1375 /* If the file was entirely or basically ASCII, use the
1376 default value of `file-coding-system'. */
1377 Lisp_Object retval =
1378 XBUFFER (Vbuffer_defaults)->file_coding_system;
1379 if (!NILP (retval))
1380 {
1381 retval = Ffind_coding_system (retval);
1382 if (NILP (retval))
1383 {
1384 warn_when_safe
1385 (Qbad_variable, Qwarning,
1386 "Invalid `default-file-coding-system', set to nil");
1387 XBUFFER (Vbuffer_defaults)->file_coding_system = Qnil;
1388 }
1389 }
1390 if (NILP (retval))
1391 retval = Fget_coding_system (Qno_conversion);
1392 return retval;
1393 }
1394 else
1395 {
1396 int i;
1397 int cat = -1;
1398
1399 mask = postprocess_iso2022_mask (mask);
1400
1401 /* Look through the coding categories by priority and find
1402 the first one that is allowed. */
1403 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
1404 {
1405 cat = coding_category_by_priority[i];
1406 if ((mask & (1 << cat)) &&
1407 !NILP (coding_category_system[cat]))
1408 break;
1409 }
1410 if (cat >= 0)
1411 return coding_category_system[cat];
1412 else
1413 return Fget_coding_system (Qno_conversion);
1414 }
1415 }
1416
1417 /* Given a seekable read stream and potential coding system and EOL type
1418 as specified, do any autodetection that is called for. If the
1419 coding system and/or EOL type are not autodetect, they will be left
1420 alone; but this function will never return an autodetect coding system
1421 or EOL type.
1422
1423 This function does not automatically fetch subsidiary coding systems;
1424 that should be unnecessary with the explicit eol-type argument. */
1425
1426 static void
1427 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out,
1428 int *eol_type_in_out)
1429 {
1430 struct detection_state decst;
1431
1432 if (*eol_type_in_out == EOL_AUTODETECT)
1433 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out);
1434
1435 memset (&decst, 0, sizeof (decst));
1436 decst.eol_type = *eol_type_in_out;
1437 decst.mask = ~0;
1438
1439 /* If autodetection is called for, do it now. */
1440 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT ||
1441 *eol_type_in_out == EOL_AUTODETECT)
1442 {
1443
1444 while (1)
1445 {
1446 unsigned char random_buffer[4096];
1447 int nread;
1448
1449 nread = Lstream_read (stream, random_buffer, sizeof (random_buffer));
1450 if (!nread)
1451 break;
1452 if (detect_coding_type (&decst, random_buffer, nread,
1453 XCODING_SYSTEM_TYPE (*codesys_in_out) !=
1454 CODESYS_AUTODETECT))
1455 break;
1456 }
1457
1458 *eol_type_in_out = decst.eol_type;
1459 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT)
1460 *codesys_in_out = coding_system_from_mask (decst.mask);
1461 }
1462
1463 /* If we absolutely can't determine the EOL type, just assume LF. */
1464 if (*eol_type_in_out == EOL_AUTODETECT)
1465 *eol_type_in_out = EOL_LF;
1466
1467 Lstream_rewind (stream);
1468 }
1469
1470 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /*
1471 Detect coding system of the text in the region between START and END.
1472 Returned value is a list of possible coding systems ordered by priority.
1473 If only ASCII characters are found, it returns 'autodetect or one of its
1474 subsidiary coding systems according to a detected end-of-line type.
1475 Optional arg BUFFER defaults to the current buffer.
1476 */
1477 (start, end, buffer))
1478 {
1479 Lisp_Object val = Qnil;
1480 struct buffer *buf = decode_buffer (buffer, 0);
1481 Bufpos b, e;
1482 Lisp_Object instream;
1483 struct detection_state decst;
1484
1485 get_buffer_range_char (buf, start, end, &b, &e, 0);
1486 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1487 instream = make_encoding_input_stream (XLSTREAM (instream),
1488 Fget_coding_system (Qbinary));
1489 memset (&decst, 0, sizeof (decst));
1490 decst.eol_type = EOL_AUTODETECT;
1491 decst.mask = ~0;
1492 while (1)
1493 {
1494 unsigned char random_buffer[4096];
1495 int nread;
1496
1497 nread = Lstream_read (XLSTREAM (instream), random_buffer,
1498 sizeof (random_buffer));
1499 if (!nread)
1500 break;
1501 if (detect_coding_type (&decst, random_buffer, nread, 0))
1502 break;
1503 }
1504
1505 if (decst.mask == ~0)
1506 {
1507 val = subsidiary_coding_system (Fget_coding_system (Qautodetect),
1508 decst.eol_type);
1509 }
1510 else
1511 {
1512 int i;
1513
1514 val = Qnil;
1515
1516 decst.mask = postprocess_iso2022_mask (decst.mask);
1517
1518 for (i = CODING_CATEGORY_LAST; i >= 0; i--)
1519 {
1520 int sys = coding_category_by_priority[i];
1521 if (decst.mask & (1 << sys))
1522 {
1523 Lisp_Object codesys = coding_category_system[sys];
1524 if (!NILP (codesys))
1525 codesys = subsidiary_coding_system (codesys, decst.eol_type);
1526 val = Fcons (codesys, val);
1527 }
1528 }
1529 }
1530 return val;
1531 }
1532
1533
1534 /************************************************************************/
1535 /* Converting to internal Mule format ("decoding") */
1536 /************************************************************************/
1537
1538 /* A decoding stream is a stream used for decoding text (i.e.
1539 converting from some external format to internal format).
1540 The decoding-stream object keeps track of the actual coding
1541 stream, the stream that is at the other end, and data that
1542 needs to be persistent across the lifetime of the stream. */
1543
1544 /* Handle the EOL stuff related to just-read-in character C. EOL is
1545 the EOL type of the coding stream. FLAGS is the current value of
1546 FLAGS in the coding stream, and may be modified by this macro.
1547 (The macro only looks at the CODING_STATE_CR flag.) DST is the
1548 Dynarr to which the decoded bytes are to be written. You need to
1549 also define a local goto label "label_continue_loop" that is at the
1550 end of the main character-reading loop.
1551
1552 If C is a CR character, then this macro handles it entirely and
1553 jumps to label_continue_loop. Otherwise, this macro does not add
1554 anything to DST, and continues normally. You should continue
1555 processing C normally after this macro. */
1556
1557 #define DECODE_HANDLE_EOL_TYPE(eol, c, flags, dst) \
1558 do { \
1559 if (c == '\r') \
1560 { \
1561 if (eol == EOL_CR) \
1562 Dynarr_add (dst, '\n'); \
1563 else if (eol != EOL_CRLF || flags & CODING_STATE_CR) \
1564 Dynarr_add (dst, c); \
1565 else \
1566 flags |= CODING_STATE_CR; \
1567 goto label_continue_loop; \
1568 } \
1569 else if (flags & CODING_STATE_CR) \
1570 { /* eol == CODING_SYSTEM_EOL_CRLF */ \
1571 if (c != '\n') \
1572 Dynarr_add (dst, '\r'); \
1573 flags &= ~CODING_STATE_CR; \
1574 } \
1575 } while (0)
1576
1577 /* C should be a binary character in the range 0 - 255; convert
1578 to internal format and add to Dynarr DST. */
1579
1580 #define DECODE_ADD_BINARY_CHAR(c, dst) \
1581 do { \
1582 if (BYTE_ASCII_P (c)) \
1583 Dynarr_add (dst, c); \
1584 else if (BYTE_C1_P (c)) \
1585 { \
1586 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \
1587 Dynarr_add (dst, c + 0x20); \
1588 } \
1589 else \
1590 { \
1591 Dynarr_add (dst, LEADING_BYTE_LATIN_1); \
1592 Dynarr_add (dst, c); \
1593 } \
1594 } while (0)
1595
1596 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \
1597 do { \
1598 if (ch) \
1599 { \
1600 DECODE_ADD_BINARY_CHAR (ch, dst); \
1601 ch = 0; \
1602 } \
1603 } while (0)
1604
1605 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst)\
1606 do { \
1607 DECODE_OUTPUT_PARTIAL_CHAR (ch); \
1608 if (flags & CODING_STATE_END) \
1609 { \
1610 if (flags & CODING_STATE_CR) \
1611 Dynarr_add (dst, '\r'); \
1612 } \
1613 } while (0)
1614
1615 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding)
1616
1617 struct decoding_stream
1618 {
1619 /* Coding system that governs the conversion. */
1620 struct Lisp_Coding_System *codesys;
1621
1622 /* Stream that we read the encoded data from or
1623 write the decoded data to. */
1624 Lstream *other_end;
1625
1626 /* If we are reading, then we can return only a fixed amount of
1627 data, so if the conversion resulted in too much data, we store it
1628 here for retrieval the next time around. */
1629 unsigned_char_dynarr *runoff;
1630
1631 /* FLAGS holds flags indicating the current state of the decoding.
1632 Some of these flags are dependent on the coding system. */
1633 unsigned int flags;
1634
1635 /* CH holds a partially built-up character. Since we only deal
1636 with one- and two-byte characters at the moment, we only use
1637 this to store the first byte of a two-byte character. */
1638 unsigned int ch;
1639
1640 /* EOL_TYPE specifies the type of end-of-line conversion that
1641 currently applies. We need to keep this separate from the
1642 EOL type stored in CODESYS because the latter might indicate
1643 automatic EOL-type detection while the former will always
1644 indicate a particular EOL type. */
1645 int eol_type;
1646
1647 /* Additional ISO2022 information. We define the structure above
1648 because it's also needed by the detection routines. */
1649 struct iso2022_decoder iso2022;
1650
1651 /* Additional information (the state of the running CCL program)
1652 used by the CCL decoder. */
1653 struct ccl_program ccl;
1654
1655 struct detection_state decst;
1656 };
1657
1658 static int decoding_reader (Lstream *stream, unsigned char *data, int size);
1659 static int decoding_writer (Lstream *stream, CONST unsigned char *data, int size);
1660 static int decoding_rewinder (Lstream *stream);
1661 static int decoding_seekable_p (Lstream *stream);
1662 static int decoding_flusher (Lstream *stream);
1663 static int decoding_closer (Lstream *stream);
1664 static Lisp_Object decoding_marker (Lisp_Object stream,
1665 void (*markobj) (Lisp_Object));
1666
1667 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding,
1668 sizeof (struct decoding_stream));
1669
1670 static Lisp_Object
1671 decoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
1672 {
1673 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end;
1674 Lisp_Object str_obj;
1675
1676 /* We do not need to mark the coding systems or charsets stored
1677 within the stream because they are stored in a global list
1678 and automatically marked. */
1679
1680 XSETLSTREAM (str_obj, str);
1681 (markobj) (str_obj);
1682 if (str->imp->marker)
1683 return (str->imp->marker) (str_obj, markobj);
1684 else
1685 return Qnil;
1686 }
1687
1688 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream
1689 so we read data from the other end, decode it, and store it into DATA. */
1690
1691 static int
1692 decoding_reader (Lstream *stream, unsigned char *data, int size)
1693 {
1694 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1695 unsigned char *orig_data = data;
1696 int read_size;
1697 int error_occurred = 0;
1698
1699 /* We need to interface to mule_decode(), which expects to take some
1700 amount of data and store the result into a Dynarr. We have
1701 mule_decode() store into str->runoff, and take data from there
1702 as necessary. */
1703
1704 /* We loop until we have enough data, reading chunks from the other
1705 end and decoding it. */
1706 while (1)
1707 {
1708 /* Take data from the runoff if we can. Make sure to take at
1709 most SIZE bytes, and delete the data from the runoff. */
1710 if (Dynarr_length (str->runoff) > 0)
1711 {
1712 int chunk = min (size, Dynarr_length (str->runoff));
1713 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
1714 Dynarr_delete_many (str->runoff, 0, chunk);
1715 data += chunk;
1716 size -= chunk;
1717 }
1718
1719 if (size == 0)
1720 break; /* No more room for data */
1721
1722 if (str->flags & CODING_STATE_END)
1723 /* This means that on the previous iteration, we hit the EOF on
1724 the other end. We loop once more so that mule_decode() can
1725 output any final stuff it may be holding, or any "go back
1726 to a sane state" escape sequences. (This latter makes sense
1727 during encoding.) */
1728 break;
1729
1730 /* Exhausted the runoff, so get some more. DATA has at least
1731 SIZE bytes left of storage in it, so it's OK to read directly
1732 into it. (We'll be overwriting above, after we've decoded it
1733 into the runoff.) */
1734 read_size = Lstream_read (str->other_end, data, size);
1735 if (read_size < 0)
1736 {
1737 error_occurred = 1;
1738 break;
1739 }
1740 if (read_size == 0)
1741 /* There might be some more end data produced in the translation.
1742 See the comment above. */
1743 str->flags |= CODING_STATE_END;
1744 mule_decode (stream, data, str->runoff, read_size);
1745 }
1746
1747 if (data - orig_data == 0)
1748 return error_occurred ? -1 : 0;
1749 else
1750 return data - orig_data;
1751 }
1752
1753 static int
1754 decoding_writer (Lstream *stream, CONST unsigned char *data, int size)
1755 {
1756 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1757 int retval;
1758
1759 /* Decode all our data into the runoff, and then attempt to write
1760 it all out to the other end. Remove whatever chunk we succeeded
1761 in writing. */
1762 mule_decode (stream, data, str->runoff, size);
1763 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
1764 Dynarr_length (str->runoff));
1765 if (retval > 0)
1766 Dynarr_delete_many (str->runoff, 0, retval);
1767 /* Do NOT return retval. The return value indicates how much
1768 of the incoming data was written, not how many bytes were
1769 written. */
1770 return size;
1771 }
1772
1773 static void
1774 reset_decoding_stream (struct decoding_stream *str)
1775 {
1776 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
1777 {
1778 Lisp_Object coding_system = Qnil;
1779 XSETCODING_SYSTEM (coding_system, str->codesys);
1780 reset_iso2022 (coding_system, &str->iso2022);
1781 }
1782 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
1783 {
1784 set_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys),
1785 0, 0, 0);
1786 }
1787
1788 str->flags = str->ch = 0;
1789 }
1790
1791 static int
1792 decoding_rewinder (Lstream *stream)
1793 {
1794 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1795 reset_decoding_stream (str);
1796 Dynarr_reset (str->runoff);
1797 return Lstream_rewind (str->other_end);
1798 }
1799
1800 static int
1801 decoding_seekable_p (Lstream *stream)
1802 {
1803 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1804 return Lstream_seekable_p (str->other_end);
1805 }
1806
1807 static int
1808 decoding_flusher (Lstream *stream)
1809 {
1810 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1811 return Lstream_flush (str->other_end);
1812 }
1813
1814 static int
1815 decoding_closer (Lstream *stream)
1816 {
1817 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1818 if (stream->flags & LSTREAM_FL_WRITE)
1819 {
1820 str->flags |= CODING_STATE_END;
1821 decoding_writer (stream, 0, 0);
1822 }
1823 Dynarr_free (str->runoff);
1824 if (str->iso2022.composite_chars)
1825 Dynarr_free (str->iso2022.composite_chars);
1826 return Lstream_close (str->other_end);
1827 }
1828
1829 Lisp_Object
1830 decoding_stream_coding_system (Lstream *stream)
1831 {
1832 Lisp_Object coding_system = Qnil;
1833 struct decoding_stream *str = DECODING_STREAM_DATA (stream);
1834
1835 XSETCODING_SYSTEM (coding_system, str->codesys);
1836 return subsidiary_coding_system (coding_system, str->eol_type);
1837 }
1838
1839 void
1840 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
1841 {
1842 struct Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
1843 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1844 str->codesys = cs;
1845 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT)
1846 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs);
1847 reset_decoding_stream (str);
1848 }
1849
1850 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding
1851 stream for writing, no automatic code detection will be performed.
1852 The reason for this is that automatic code detection requires a
1853 seekable input. Things will also fail if you open a decoding
1854 stream for reading using a non-fully-specified coding system and
1855 a non-seekable input stream. */
1856
1857 static Lisp_Object
1858 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys,
1859 CONST char *mode)
1860 {
1861 Lstream *lstr = Lstream_new (lstream_decoding, mode);
1862 struct decoding_stream *str = DECODING_STREAM_DATA (lstr);
1863 Lisp_Object obj;
1864
1865 memset (str, 0, sizeof (*str));
1866 str->other_end = stream;
1867 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned char);
1868 str->eol_type = EOL_AUTODETECT;
1869 if (!strcmp (mode, "r")
1870 && Lstream_seekable_p (stream))
1871 /* We can determine the coding system now. */
1872 determine_real_coding_system (stream, &codesys, &str->eol_type);
1873 set_decoding_stream_coding_system (lstr, codesys);
1874 str->decst.eol_type = str->eol_type;
1875 str->decst.mask = ~0;
1876 XSETLSTREAM (obj, lstr);
1877 return obj;
1878 }
1879
1880 Lisp_Object
1881 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys)
1882 {
1883 return make_decoding_stream_1 (stream, codesys, "r");
1884 }
1885
1886 Lisp_Object
1887 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys)
1888 {
1889 return make_decoding_stream_1 (stream, codesys, "w");
1890 }
1891
1892 /* Note: the decode_coding_? functions all take the same
1893 arguments as mule_decode(), which is to say some SRC data of
1894 size N, which is to be stored into dynamic array DST.
1895 DECODING is the stream within which the decoding is
1896 taking place, but no data is actually read from or
1897 written to that stream; that is handled in decoding_reader()
1898 or decoding_writer(). This allows the same functions to
1899 be used for both reading and writing. */
1900
1901 static void
1902 mule_decode (Lstream *decoding, CONST unsigned char *src,
1903 unsigned_char_dynarr *dst, unsigned int n)
1904 {
1905 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
1906
1907 /* If necessary, do encoding-detection now. We do this when
1908 we're a writing stream or a non-seekable reading stream,
1909 meaning that we can't just process the whole input,
1910 rewind, and start over. */
1911
1912 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT ||
1913 str->eol_type == EOL_AUTODETECT)
1914 {
1915 Lisp_Object codesys = Qnil;
1916
1917 XSETCODING_SYSTEM (codesys, str->codesys);
1918 detect_coding_type (&str->decst, src, n,
1919 CODING_SYSTEM_TYPE (str->codesys) !=
1920 CODESYS_AUTODETECT);
1921 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT &&
1922 str->decst.mask != ~0)
1923 /* #### This is cheesy. What we really ought to do is
1924 buffer up a certain amount of data so as to get a
1925 less random result. */
1926 codesys = coding_system_from_mask (str->decst.mask);
1927 str->eol_type = str->decst.eol_type;
1928 if (XCODING_SYSTEM (codesys) != str->codesys)
1929 {
1930 /* Preserve the CODING_STATE_END flag in case it was set.
1931 If we erase it, bad things might happen. */
1932 int was_end = str->flags & CODING_STATE_END;
1933 set_decoding_stream_coding_system (decoding, codesys);
1934 if (was_end)
1935 str->flags |= CODING_STATE_END;
1936 }
1937 }
1938
1939 switch (CODING_SYSTEM_TYPE (str->codesys))
1940 {
1941 #ifdef DEBUG_XEMACS
1942 case CODESYS_INTERNAL:
1943 Dynarr_add_many (dst, src, n);
1944 break;
1945 #endif
1946 case CODESYS_AUTODETECT:
1947 /* If we got this far and still haven't decided on the coding
1948 system, then do no conversion. */
1949 case CODESYS_NO_CONVERSION:
1950 decode_coding_no_conversion (decoding, src, dst, n);
1951 break;
1952 case CODESYS_SHIFT_JIS:
1953 decode_coding_shift_jis (decoding, src, dst, n);
1954 break;
1955 case CODESYS_BIG5:
1956 decode_coding_big5 (decoding, src, dst, n);
1957 break;
1958 case CODESYS_CCL:
1959 ccl_driver (&str->ccl, src, dst, n, (str->flags) & CODING_STATE_END);
1960 break;
1961 case CODESYS_ISO2022:
1962 decode_coding_iso2022 (decoding, src, dst, n);
1963 break;
1964 default:
1965 abort ();
1966 }
1967 }
1968
1969 static Lisp_Object
1970 close_both_streams (Lisp_Object cons)
1971 {
1972 Lisp_Object instream = XCAR (cons);
1973 Lisp_Object outstream = XCDR (cons);
1974 Lstream_close (XLSTREAM (outstream));
1975 Lstream_close (XLSTREAM (instream));
1976 return Qnil;
1977 }
1978
1979 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /*
1980 Decode the text between START and END which is encoded in CODING-SYSTEM.
1981 This is useful if you've read in encoded text from a file without decoding
1982 it (e.g. you read in a JIS-formatted file but used the `binary' or
1983 `no-conversion' coding system, so that it shows up as \"^[$B!<!+^[(B\").
1984 Return length of decoded text.
1985 BUFFER defaults to the current buffer if unspecified.
1986 */
1987 (start, end, coding_system, buffer))
1988 {
1989 Bufpos b, e;
1990 struct buffer *buf = decode_buffer (buffer, 0);
1991 Lisp_Object instream, outstream;
1992 int speccount = specpdl_depth ();
1993 struct gcpro gcpro1, gcpro2;
1994
1995 get_buffer_range_char (buf, start, end, &b, &e, 0);
1996 coding_system = Fget_coding_system (coding_system);
1997 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
1998 outstream = make_lisp_buffer_output_stream (buf, b, 0);
1999 outstream = make_decoding_output_stream (XLSTREAM (outstream),
2000 coding_system);
2001 outstream = make_encoding_output_stream (XLSTREAM (outstream),
2002 Fget_coding_system (Qbinary));
2003 GCPRO2 (instream, outstream);
2004 record_unwind_protect (close_both_streams, Fcons (instream, outstream));
2005
2006 /* The chain of streams looks like this:
2007
2008 [BUFFER] <----- send through
2009 ------> [ENCODE AS BINARY]
2010 ------> [DECODE AS SPECIFIED]
2011 ------> [BUFFER]
2012 */
2013
2014 {
2015 char tempbuf[1024]; /* some random amount */
2016 Lstream *in = XLSTREAM(instream);
2017 Lstream *out = XLSTREAM(outstream);
2018 Bufpos newpos, even_newer_pos;
2019
2020 while (1)
2021 {
2022 Bufpos oldpos = lisp_buffer_stream_startpos (in);
2023 int size_in_bytes = Lstream_read (in, tempbuf, sizeof (tempbuf));
2024 if (!size_in_bytes)
2025 break;
2026 newpos = lisp_buffer_stream_startpos (in);
2027 Lstream_write (out, tempbuf, size_in_bytes);
2028 even_newer_pos = lisp_buffer_stream_startpos (in);
2029 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2030 even_newer_pos, 0);
2031 }
2032 }
2033
2034 unbind_to (speccount, Qnil);
2035 UNGCPRO;
2036 return Qnil;
2037 }
2038
2039
2040 /************************************************************************/
2041 /* Converting to an external encoding ("encoding") */
2042 /************************************************************************/
2043
2044 /* An encoding stream is an output stream. When you create the
2045 stream, you specify the coding system that governs the encoding
2046 and another stream that the resulting encoded data is to be
2047 sent to, and then start sending data to it. */
2048
2049 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding)
2050
2051 struct encoding_stream
2052 {
2053 /* Coding system that governs the conversion. */
2054 struct Lisp_Coding_System *codesys;
2055
2056 /* Stream that we read the encoded data from or
2057 write the decoded data to. */
2058 Lstream *other_end;
2059
2060 /* If we are reading, then we can return only a fixed amount of
2061 data, so if the conversion resulted in too much data, we store it
2062 here for retrieval the next time around. */
2063 unsigned_char_dynarr *runoff;
2064
2065 /* FLAGS holds flags indicating the current state of the encoding.
2066 Some of these flags are dependent on the coding system. */
2067 unsigned int flags;
2068
2069 /* CH holds a partially built-up character. Since we only deal
2070 with one- and two-byte characters at the moment, we only use
2071 this to store the first byte of a two-byte character. */
2072 unsigned int ch;
2073
2074 /* Additional information used by the ISO2022 encoder. */
2075 struct
2076 {
2077 /* CHARSET holds the character sets currently assigned to the G0
2078 through G3 registers. It is initialized from the array
2079 INITIAL_CHARSET in CODESYS. */
2080 Lisp_Object charset[4];
2081
2082 /* Which registers are currently invoked into the left (GL) and
2083 right (GR) halves of the 8-bit encoding space? */
2084 int register_left, register_right;
2085
2086 /* Whether we need to explicitly designate the charset in the
2087 G? register before using it. It is initialized from the
2088 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */
2089 unsigned char force_charset_on_output[4];
2090
2091 /* Other state variables that need to be preserved across
2092 invocations. */
2093 Lisp_Object current_charset;
2094 int current_half;
2095 int current_char_boundary;
2096 } iso2022;
2097
2098 /* Additional information (the state of the running CCL program)
2099 used by the CCL encoder. */
2100 struct ccl_program ccl;
2101 };
2102
2103 static int encoding_reader (Lstream *stream, unsigned char *data, int size);
2104 static int encoding_writer (Lstream *stream, CONST unsigned char *data,
2105 int size);
2106 static int encoding_rewinder (Lstream *stream);
2107 static int encoding_seekable_p (Lstream *stream);
2108 static int encoding_flusher (Lstream *stream);
2109 static int encoding_closer (Lstream *stream);
2110 static Lisp_Object encoding_marker (Lisp_Object stream,
2111 void (*markobj) (Lisp_Object));
2112
2113 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding,
2114 sizeof (struct encoding_stream));
2115
2116 static Lisp_Object
2117 encoding_marker (Lisp_Object stream, void (*markobj) (Lisp_Object))
2118 {
2119 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end;
2120 Lisp_Object str_obj;
2121
2122 /* We do not need to mark the coding systems or charsets stored
2123 within the stream because they are stored in a global list
2124 and automatically marked. */
2125
2126 XSETLSTREAM (str_obj, str);
2127 (markobj) (str_obj);
2128 if (str->imp->marker)
2129 return (str->imp->marker) (str_obj, markobj);
2130 else
2131 return Qnil;
2132 }
2133
2134 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream
2135 so we read data from the other end, encode it, and store it into DATA. */
2136
2137 static int
2138 encoding_reader (Lstream *stream, unsigned char *data, int size)
2139 {
2140 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2141 unsigned char *orig_data = data;
2142 int read_size;
2143 int error_occurred = 0;
2144
2145 /* We need to interface to mule_encode(), which expects to take some
2146 amount of data and store the result into a Dynarr. We have
2147 mule_encode() store into str->runoff, and take data from there
2148 as necessary. */
2149
2150 /* We loop until we have enough data, reading chunks from the other
2151 end and encoding it. */
2152 while (1)
2153 {
2154 /* Take data from the runoff if we can. Make sure to take at
2155 most SIZE bytes, and delete the data from the runoff. */
2156 if (Dynarr_length (str->runoff) > 0)
2157 {
2158 int chunk = min (size, Dynarr_length (str->runoff));
2159 memcpy (data, Dynarr_atp (str->runoff, 0), chunk);
2160 Dynarr_delete_many (str->runoff, 0, chunk);
2161 data += chunk;
2162 size -= chunk;
2163 }
2164
2165 if (size == 0)
2166 break; /* No more room for data */
2167
2168 if (str->flags & CODING_STATE_END)
2169 /* This means that on the previous iteration, we hit the EOF on
2170 the other end. We loop once more so that mule_encode() can
2171 output any final stuff it may be holding, or any "go back
2172 to a sane state" escape sequences. (This latter makes sense
2173 during encoding.) */
2174 break;
2175
2176 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes
2177 left of storage in it, so it's OK to read directly into it.
2178 (We'll be overwriting above, after we've encoded it into the
2179 runoff.) */
2180 read_size = Lstream_read (str->other_end, data, size);
2181 if (read_size < 0)
2182 {
2183 error_occurred = 1;
2184 break;
2185 }
2186 if (read_size == 0)
2187 /* There might be some more end data produced in the translation.
2188 See the comment above. */
2189 str->flags |= CODING_STATE_END;
2190 mule_encode (stream, data, str->runoff, read_size);
2191 }
2192
2193 if (data - orig_data == 0)
2194 return error_occurred ? -1 : 0;
2195 else
2196 return data - orig_data;
2197 }
2198
2199 static int
2200 encoding_writer (Lstream *stream, CONST unsigned char *data, int size)
2201 {
2202 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2203 int retval;
2204
2205 /* Encode all our data into the runoff, and then attempt to write
2206 it all out to the other end. Remove whatever chunk we succeeded
2207 in writing. */
2208 mule_encode (stream, data, str->runoff, size);
2209 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0),
2210 Dynarr_length (str->runoff));
2211 if (retval > 0)
2212 Dynarr_delete_many (str->runoff, 0, retval);
2213 /* Do NOT return retval. The return value indicates how much
2214 of the incoming data was written, not how many bytes were
2215 written. */
2216 return size;
2217 }
2218
2219 static void
2220 reset_encoding_stream (struct encoding_stream *str)
2221 {
2222 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022)
2223 {
2224 int i;
2225
2226 for (i = 0; i < 4; i++)
2227 str->iso2022.charset[i] =
2228 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i);
2229 for (i = 0; i < 4; i++)
2230 str->iso2022.force_charset_on_output[i] =
2231 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i);
2232 str->iso2022.register_left = 0;
2233 str->iso2022.register_right = 1;
2234 str->iso2022.current_charset = Qnil;
2235 str->iso2022.current_half = 0;
2236 str->iso2022.current_char_boundary = 1;
2237 }
2238 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL)
2239 {
2240 set_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys), 0, 0, 0);
2241 }
2242 str->flags = str->ch = 0;
2243 }
2244
2245 static int
2246 encoding_rewinder (Lstream *stream)
2247 {
2248 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2249 reset_encoding_stream (str);
2250 Dynarr_reset (str->runoff);
2251 return Lstream_rewind (str->other_end);
2252 }
2253
2254 static int
2255 encoding_seekable_p (Lstream *stream)
2256 {
2257 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2258 return Lstream_seekable_p (str->other_end);
2259 }
2260
2261 static int
2262 encoding_flusher (Lstream *stream)
2263 {
2264 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2265 return Lstream_flush (str->other_end);
2266 }
2267
2268 static int
2269 encoding_closer (Lstream *stream)
2270 {
2271 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2272 if (stream->flags & LSTREAM_FL_WRITE)
2273 {
2274 str->flags |= CODING_STATE_END;
2275 encoding_writer (stream, 0, 0);
2276 }
2277 Dynarr_free (str->runoff);
2278 return Lstream_close (str->other_end);
2279 }
2280
2281 Lisp_Object
2282 encoding_stream_coding_system (Lstream *stream)
2283 {
2284 Lisp_Object coding_system = Qnil;
2285 struct encoding_stream *str = ENCODING_STREAM_DATA (stream);
2286
2287 XSETCODING_SYSTEM (coding_system, str->codesys);
2288 return coding_system;
2289 }
2290
2291 void
2292 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys)
2293 {
2294 struct Lisp_Coding_System *cs = XCODING_SYSTEM (codesys);
2295 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2296 str->codesys = cs;
2297 reset_encoding_stream (str);
2298 }
2299
2300 static Lisp_Object
2301 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys,
2302 CONST char *mode)
2303 {
2304 Lstream *lstr = Lstream_new (lstream_encoding, mode);
2305 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr);
2306 Lisp_Object obj;
2307
2308 memset (str, 0, sizeof (*str));
2309 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned char);
2310 str->other_end = stream;
2311 set_encoding_stream_coding_system (lstr, codesys);
2312 XSETLSTREAM (obj, lstr);
2313 return obj;
2314 }
2315
2316 Lisp_Object
2317 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys)
2318 {
2319 return make_encoding_stream_1 (stream, codesys, "r");
2320 }
2321
2322 Lisp_Object
2323 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys)
2324 {
2325 return make_encoding_stream_1 (stream, codesys, "w");
2326 }
2327
2328 /* Convert N bytes of internally-formatted data stored in SRC to an
2329 external format, according to the encoding stream ENCODING.
2330 Store the encoded data into DST. */
2331
2332 static void
2333 mule_encode (Lstream *encoding, CONST unsigned char *src,
2334 unsigned_char_dynarr *dst, unsigned int n)
2335 {
2336 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2337
2338 switch (CODING_SYSTEM_TYPE (str->codesys))
2339 {
2340 #ifdef DEBUG_XEMACS
2341 case CODESYS_INTERNAL:
2342 Dynarr_add_many (dst, src, n);
2343 break;
2344 #endif
2345 case CODESYS_AUTODETECT:
2346 /* If we got this far and still haven't decided on the coding
2347 system, then do no conversion. */
2348 case CODESYS_NO_CONVERSION:
2349 encode_coding_no_conversion (encoding, src, dst, n);
2350 break;
2351 case CODESYS_SHIFT_JIS:
2352 encode_coding_shift_jis (encoding, src, dst, n);
2353 break;
2354 case CODESYS_BIG5:
2355 encode_coding_big5 (encoding, src, dst, n);
2356 break;
2357 case CODESYS_CCL:
2358 ccl_driver (&str->ccl, src, dst, n, (str->flags) & CODING_STATE_END);
2359 break;
2360 case CODESYS_ISO2022:
2361 encode_coding_iso2022 (encoding, src, dst, n);
2362 break;
2363 default:
2364 abort ();
2365 }
2366 }
2367
2368 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /*
2369 Encode the text between START and END using CODING-SYSTEM.
2370 This will, for example, convert Japanese characters into stuff such as
2371 \"^[$B!<!+^[(B\" if you use the JIS encoding. Return length of encoded
2372 text. BUFFER defaults to the current buffer if unspecified.
2373 */
2374 (start, end, coding_system, buffer))
2375 {
2376 Bufpos b, e;
2377 struct buffer *buf = decode_buffer (buffer, 0);
2378 Lisp_Object instream, outstream;
2379 char tempbuf[1024]; /* some random amount */
2380 int speccount = specpdl_depth ();
2381 struct gcpro gcpro1, gcpro2;
2382
2383 get_buffer_range_char (buf, start, end, &b, &e, 0);
2384 coding_system = Fget_coding_system (coding_system);
2385 instream = make_lisp_buffer_input_stream (buf, b, e, 0);
2386 outstream = make_lisp_buffer_output_stream (buf, b, 0);
2387 outstream = make_decoding_output_stream (XLSTREAM (outstream),
2388 Fget_coding_system (Qbinary));
2389 outstream = make_encoding_output_stream (XLSTREAM (outstream),
2390 coding_system);
2391 GCPRO2 (instream, outstream);
2392 record_unwind_protect (close_both_streams, Fcons (instream, outstream));
2393 /* The chain of streams looks like this:
2394
2395 [BUFFER] <----- send through
2396 ------> [ENCODE AS SPECIFIED]
2397 ------> [DECODE AS BINARY]
2398 ------> [BUFFER]
2399
2400 */
2401 while (1)
2402 {
2403 int size_in_bytes;
2404 Bufpos oldpos, newpos, even_newer_pos;
2405
2406 oldpos = lisp_buffer_stream_startpos (XLSTREAM (instream));
2407 size_in_bytes = Lstream_read (XLSTREAM (instream), tempbuf,
2408 sizeof (tempbuf));
2409 if (!size_in_bytes)
2410 break;
2411 newpos = lisp_buffer_stream_startpos (XLSTREAM (instream));
2412 Lstream_write (XLSTREAM (outstream), tempbuf, size_in_bytes);
2413 even_newer_pos = lisp_buffer_stream_startpos (XLSTREAM (instream));
2414 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos),
2415 even_newer_pos, 0);
2416 }
2417
2418 {
2419 Charcount retlen =
2420 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b;
2421 unbind_to (speccount, Qnil);
2422 UNGCPRO;
2423 return make_int (retlen);
2424 }
2425 }
2426
2427
2428 /************************************************************************/
2429 /* Shift-JIS methods */
2430 /************************************************************************/
2431
2432 /* Shift-JIS is a coding system encoding three character sets: ASCII, right
2433 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2434 as is. A character of JISX0201-Kana (TYPE94 character set) is
2435 encoded by "position-code + 0x80". A character of JISX0208
2436 (TYPE94x94 character set) is encoded in 2-byte but two
2437 position-codes are divided and shifted so that it fit in the range
2438 below.
2439
2440 --- CODE RANGE of Shift-JIS ---
2441 (character set) (range)
2442 ASCII 0x00 .. 0x7F
2443 JISX0201-Kana 0xA0 .. 0xDF
2444 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2445 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2446 -------------------------------
2447
2448 */
2449
2450 /* Is this the first byte of a Shift-JIS two-byte char? */
2451
2452 #define BYTE_SHIFT_JIS_TWO_BYTE_1_P(c) \
2453 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF))
2454
2455 /* Is this the second byte of a Shift-JIS two-byte char? */
2456
2457 #define BYTE_SHIFT_JIS_TWO_BYTE_2_P(c) \
2458 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC))
2459
2460 #define BYTE_SHIFT_JIS_KATAKANA_P(c) \
2461 ((c) >= 0xA1 && (c) <= 0xDF)
2462
2463 /* Code conversion macros. These are macros because they are used in
2464 inner loops during code conversion.
2465
2466 Note that temporary variables in macros introduce the classic
2467 dynamic-scoping problems with variable names. We use capital-
2468 lettered variables in the assumption that XEmacs does not use
2469 capital letters in variables except in a very formalized way
2470 (e.g. Qstring). */
2471
2472 /* Convert shift-JIS code (sj1, sj2) into internal string
2473 representation (c1, c2). (The leading byte is assumed.) */
2474
2475 #define DECODE_SHIFT_JIS(sj1, sj2, c1, c2) do \
2476 { \
2477 int I1 = sj1, I2 = sj2; \
2478 if (I2 >= 0x9f) \
2479 { \
2480 if (I1 >= 0xe0) \
2481 c1 = (I1 << 1) - 0xe0; \
2482 else \
2483 c1 = (I1 << 1) - 0x60; \
2484 c2 = I2 + 2; \
2485 } \
2486 else \
2487 { \
2488 if (I1 >= 0xe0) \
2489 c1 = (I1 << 1) - 0xe1; \
2490 else \
2491 c1 = (I1 << 1) - 0x61; \
2492 if (I2 >= 0x7f) \
2493 c2 = I2 + 0x60; \
2494 else \
2495 c2 = I2 + 0x61; \
2496 } \
2497 } while (0)
2498
2499 /* Convert the internal string representation of a Shift-JIS character
2500 (c1, c2) into Shift-JIS code (sj1, sj2). The leading byte is
2501 assumed. */
2502
2503 #define ENCODE_SHIFT_JIS(c1, c2, sj1, sj2) do \
2504 { \
2505 int I1 = c1, I2 = sj2; \
2506 if (I1 & 1) \
2507 { \
2508 if (I1 < 0xdf) \
2509 sj1 = (I1 >> 1) + 0x31; \
2510 else \
2511 sj1 = (I1 >> 1) + 0x71; \
2512 if (I2 >= 0xe0) \
2513 sj2 = I2 - 0x60; \
2514 else \
2515 sj2 = I2 - 0x61; \
2516 } \
2517 else \
2518 { \
2519 if (I1 < 0xdf) \
2520 sj1 = (I1 >> 1) + 0x30; \
2521 else \
2522 sj1 = (I1 >> 1) + 0x70; \
2523 sj2 = I2 - 2; \
2524 } \
2525 } while (0)
2526
2527 static int
2528 detect_coding_shift_jis (struct detection_state *st, CONST unsigned char *src,
2529 unsigned int n)
2530 {
2531 int c;
2532
2533 while (n--)
2534 {
2535 c = *src++;
2536 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
2537 return 0;
2538 if (st->shift_jis.in_second_byte)
2539 {
2540 st->shift_jis.in_second_byte = 0;
2541 if (c < 0x40)
2542 return 0;
2543 }
2544 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2545 st->shift_jis.in_second_byte = 1;
2546 }
2547 return CODING_CATEGORY_SHIFT_JIS_MASK;
2548 }
2549
2550 /* Convert Shift-JIS data to internal format. */
2551
2552 static void
2553 decode_coding_shift_jis (Lstream *decoding, CONST unsigned char *src,
2554 unsigned_char_dynarr *dst, unsigned int n)
2555 {
2556 unsigned char c;
2557 unsigned int flags, ch;
2558 int eol;
2559 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2560
2561 CODING_STREAM_DECOMPOSE (str, flags, ch);
2562 eol = str->eol_type;
2563
2564 while (n--)
2565 {
2566 c = *src++;
2567
2568 if (ch)
2569 {
2570 /* Previous character was first byte of Shift-JIS Kanji char. */
2571 if (BYTE_SHIFT_JIS_TWO_BYTE_2_P (c))
2572 {
2573 unsigned char e1, e2;
2574
2575 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
2576 DECODE_SHIFT_JIS (ch, c, e1, e2);
2577 Dynarr_add (dst, e1);
2578 Dynarr_add (dst, e2);
2579 }
2580 else
2581 {
2582 DECODE_ADD_BINARY_CHAR (ch, dst);
2583 DECODE_ADD_BINARY_CHAR (c, dst);
2584 }
2585 ch = 0;
2586 }
2587 else
2588 {
2589 DECODE_HANDLE_EOL_TYPE (eol, c, flags, dst);
2590 if (BYTE_SHIFT_JIS_TWO_BYTE_1_P (c))
2591 ch = c;
2592 else if (BYTE_SHIFT_JIS_KATAKANA_P (c))
2593 {
2594 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0201_KANA);
2595 Dynarr_add (dst, c);
2596 }
2597 else
2598 DECODE_ADD_BINARY_CHAR (c, dst);
2599 }
2600 label_continue_loop:;
2601 }
2602
2603 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2604
2605 CODING_STREAM_COMPOSE (str, flags, ch);
2606 }
2607
2608 /* Convert internally-formatted data to Shift-JIS. */
2609
2610 static void
2611 encode_coding_shift_jis (Lstream *encoding, CONST unsigned char *src,
2612 unsigned_char_dynarr *dst, unsigned int n)
2613 {
2614 unsigned char c;
2615 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2616 unsigned int flags, ch;
2617 int eol;
2618
2619 CODING_STREAM_DECOMPOSE (str, flags, ch);
2620 eol = CODING_SYSTEM_EOL_TYPE (str->codesys);
2621
2622 while (n--)
2623 {
2624 c = *src++;
2625 if (c == '\n')
2626 {
2627 if (eol != EOL_LF && eol != EOL_AUTODETECT)
2628 Dynarr_add (dst, '\r');
2629 if (eol != EOL_CR)
2630 Dynarr_add (dst, '\n');
2631 ch = 0;
2632 }
2633 else if (BYTE_ASCII_P (c))
2634 {
2635 Dynarr_add (dst, c);
2636 ch = 0;
2637 }
2638 else if (BUFBYTE_LEADING_BYTE_P (c))
2639 ch = (c == LEADING_BYTE_JAPANESE_JISX0201_KANA ||
2640 c == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2641 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0;
2642 else if (ch)
2643 {
2644 if (ch == LEADING_BYTE_JAPANESE_JISX0201_KANA)
2645 {
2646 Dynarr_add (dst, c);
2647 ch = 0;
2648 }
2649 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 ||
2650 ch == LEADING_BYTE_JAPANESE_JISX0208)
2651 ch = c;
2652 else
2653 {
2654 unsigned char j1, j2;
2655 ENCODE_SHIFT_JIS (ch, c, j1, j2);
2656 Dynarr_add (dst, j1);
2657 Dynarr_add (dst, j2);
2658 ch = 0;
2659 }
2660 }
2661 }
2662
2663 CODING_STREAM_COMPOSE (str, flags, ch);
2664 }
2665
2666 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /*
2667 Decode a JISX0208 character of Shift-JIS coding-system.
2668 CODE is the character code in Shift-JIS as a cons of type bytes.
2669 Return the corresponding character.
2670 */
2671 (code))
2672 {
2673 unsigned char c1, c2, s1, s2;
2674
2675 CHECK_CONS (code);
2676 CHECK_INT (XCAR (code));
2677 CHECK_INT (XCDR (code));
2678 s1 = XINT (XCAR (code));
2679 s2 = XINT (XCDR (code));
2680 if (BYTE_SHIFT_JIS_TWO_BYTE_1_P (s1) &&
2681 BYTE_SHIFT_JIS_TWO_BYTE_2_P (s2))
2682 {
2683 DECODE_SHIFT_JIS (s1, s2, c1, c2);
2684 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208,
2685 c1 & 0x7F, c2 & 0x7F));
2686 }
2687 else
2688 return Qnil;
2689 }
2690
2691 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /*
2692 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system.
2693 Return the corresponding character code in SHIFT-JIS as a cons of two bytes.
2694 */
2695 (ch))
2696 {
2697 Lisp_Object charset;
2698 int c1, c2, s1, s2;
2699
2700 CHECK_CHAR_COERCE_INT (ch);
2701 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2702 if (EQ (charset, Vcharset_japanese_jisx0208))
2703 {
2704 ENCODE_SHIFT_JIS (c1 | 0x80, c2 | 0x80, s1, s2);
2705 return Fcons (make_int (s1), make_int (s2));
2706 }
2707 else
2708 return Qnil;
2709 }
2710
2711
2712 /************************************************************************/
2713 /* Big5 methods */
2714 /************************************************************************/
2715
2716 /* BIG5 is a coding system encoding two character sets: ASCII and
2717 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2718 character set and is encoded in two-byte.
2719
2720 --- CODE RANGE of BIG5 ---
2721 (character set) (range)
2722 ASCII 0x00 .. 0x7F
2723 Big5 (1st byte) 0xA1 .. 0xFE
2724 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2725 --------------------------
2726
2727 Since the number of characters in Big5 is larger than maximum
2728 characters in Emacs' charset (96x96), it can't be handled as one
2729 charset. So, in Emacs, Big5 is devided into two: `charset-big5-1'
2730 and `charset-big5-2'. Both <type>s are TYPE94x94. The former
2731 contains frequently used characters and the latter contains less
2732 frequently used characters. */
2733
2734 #define BYTE_BIG5_TWO_BYTE_1_P(c) \
2735 ((c) >= 0xA1 && (c) <= 0xFE)
2736
2737 /* Is this the second byte of a Shift-JIS two-byte char? */
2738
2739 #define BYTE_BIG5_TWO_BYTE_2_P(c) \
2740 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE))
2741
2742 /* Number of Big5 characters which have the same code in 1st byte. */
2743
2744 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2745
2746 /* Code conversion macros. These are macros because they are used in
2747 inner loops during code conversion.
2748
2749 Note that temporary variables in macros introduce the classic
2750 dynamic-scoping problems with variable names. We use capital-
2751 lettered variables in the assumption that XEmacs does not use
2752 capital letters in variables except in a very formalized way
2753 (e.g. Qstring). */
2754
2755 /* Convert Big5 code (b1, b2) into its internal string representation
2756 (lb, c1, c2). */
2757
2758 /* There is a much simpler way to split the Big5 charset into two.
2759 For the moment I'm going to leave the algorithm as-is because it
2760 claims to separate out the most-used characters into a single
2761 charset, which perhaps will lead to optimizations in various
2762 places.
2763
2764 The way the algorithm works is something like this:
2765
2766 Big5 can be viewed as a 94x157 charset, where the row is
2767 encoded into the bytes 0xA1 .. 0xFE and the column is encoded
2768 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency,
2769 the split between low and high column numbers is apparently
2770 meaningless; ascending rows produce less and less frequent chars.
2771 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to
2772 the first charset, and the upper half (0xC9 .. 0xFE) to the
2773 second. To do the conversion, we convert the character into
2774 a single number where 0 .. 156 is the first row, 157 .. 313
2775 is the second, etc. That way, the characters are ordered by
2776 decreasing frequency. Then we just chop the space in two
2777 and coerce the result into a 94x94 space.
2778 */
2779
2780 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \
2781 { \
2782 int B1 = b1, B2 = b2; \
2783 unsigned int I \
2784 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \
2785 \
2786 if (B1 < 0xC9) \
2787 { \
2788 lb = LEADING_BYTE_CHINESE_BIG5_1; \
2789 } \
2790 else \
2791 { \
2792 lb = LEADING_BYTE_CHINESE_BIG5_2; \
2793 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \
2794 } \
2795 c1 = I / (0xFF - 0xA1) + 0xA1; \
2796 c2 = I % (0xFF - 0xA1) + 0xA1; \
2797 } while (0)
2798
2799 /* Convert the internal string representation of a Big5 character
2800 (lb, c1, c2) into Big5 code (b1, b2). */
2801
2802 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \
2803 { \
2804 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \
2805 \
2806 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \
2807 { \
2808 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \
2809 } \
2810 b1 = I / BIG5_SAME_ROW + 0xA1; \
2811 b2 = I % BIG5_SAME_ROW; \
2812 b2 += b2 < 0x3F ? 0x40 : 0x62; \
2813 } while (0)
2814
2815 static int
2816 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src,
2817 unsigned int n)
2818 {
2819 int c;
2820
2821 while (n--)
2822 {
2823 c = *src++;
2824 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO ||
2825 (c >= 0x80 && c <= 0xA0))
2826 return 0;
2827 if (st->big5.in_second_byte)
2828 {
2829 st->big5.in_second_byte = 0;
2830 if (c < 0x40 || (c >= 0x80 && c <= 0xA0))
2831 return 0;
2832 }
2833 else if (c >= 0xA1)
2834 st->big5.in_second_byte = 1;
2835 }
2836 return CODING_CATEGORY_BIG5_MASK;
2837 }
2838
2839 /* Convert Big5 data to internal format. */
2840
2841 static void
2842 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src,
2843 unsigned_char_dynarr *dst, unsigned int n)
2844 {
2845 unsigned char c;
2846 unsigned int flags, ch, eol;
2847 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
2848
2849 CODING_STREAM_DECOMPOSE (str, flags, ch);
2850 eol = str->eol_type;
2851
2852 while (n--)
2853 {
2854 c = *src++;
2855 if (ch)
2856 {
2857 /* Previous character was first byte of Big5 char. */
2858 if (BYTE_BIG5_TWO_BYTE_2_P (c))
2859 {
2860 unsigned char b1, b2, b3;
2861 DECODE_BIG5 (ch, c, b1, b2, b3);
2862 Dynarr_add (dst, b1);
2863 Dynarr_add (dst, b2);
2864 Dynarr_add (dst, b3);
2865 }
2866 else
2867 {
2868 DECODE_ADD_BINARY_CHAR (ch, dst);
2869 DECODE_ADD_BINARY_CHAR (c, dst);
2870 }
2871 ch = 0;
2872 }
2873 else
2874 {
2875 DECODE_HANDLE_EOL_TYPE (eol, c, flags, dst);
2876 if (BYTE_BIG5_TWO_BYTE_1_P (c))
2877 ch = c;
2878 else
2879 DECODE_ADD_BINARY_CHAR (c, dst);
2880 }
2881 label_continue_loop:;
2882 }
2883
2884 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
2885
2886 CODING_STREAM_COMPOSE (str, flags, ch);
2887 }
2888
2889 /* Convert internally-formatted data to Big5. */
2890
2891 static void
2892 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src,
2893 unsigned_char_dynarr *dst, unsigned int n)
2894 {
2895 unsigned char c;
2896 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
2897 unsigned int flags, ch, eol;
2898
2899 CODING_STREAM_DECOMPOSE (str, flags, ch);
2900 eol = CODING_SYSTEM_EOL_TYPE (str->codesys);
2901
2902 while (n--)
2903 {
2904 c = *src++;
2905 if (c == '\n')
2906 {
2907 if (eol != EOL_LF && eol != EOL_AUTODETECT)
2908 Dynarr_add (dst, '\r');
2909 if (eol != EOL_CR)
2910 Dynarr_add (dst, '\n');
2911 }
2912 else if (BYTE_ASCII_P (c))
2913 {
2914 /* ASCII. */
2915 Dynarr_add (dst, c);
2916 }
2917 else if (BUFBYTE_LEADING_BYTE_P (c))
2918 {
2919 if (c == LEADING_BYTE_CHINESE_BIG5_1 ||
2920 c == LEADING_BYTE_CHINESE_BIG5_2)
2921 {
2922 /* A recognized leading byte. */
2923 ch = c;
2924 continue; /* not done with this character. */
2925 }
2926 /* otherwise just ignore this character. */
2927 }
2928 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 ||
2929 ch == LEADING_BYTE_CHINESE_BIG5_2)
2930 {
2931 /* Previous char was a recognized leading byte. */
2932 ch = (ch << 8) | c;
2933 continue; /* not done with this character. */
2934 }
2935 else if (ch)
2936 {
2937 /* Encountering second byte of a Big5 character. */
2938 unsigned char b1, b2;
2939
2940 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2);
2941 Dynarr_add (dst, b1);
2942 Dynarr_add (dst, b2);
2943 }
2944
2945 ch = 0;
2946 }
2947
2948 CODING_STREAM_COMPOSE (str, flags, ch);
2949 }
2950
2951
2952 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /*
2953 Decode a Big5 character CODE of BIG5 coding-system.
2954 CODE is the character code in BIG5.
2955 Return the corresponding character.
2956 */
2957 (code))
2958 {
2959 unsigned char c1, c2, b1, b2;
2960 Lisp_Object charset;
2961
2962 CHECK_CONS (code);
2963 CHECK_INT (XCAR (code));
2964 CHECK_INT (XCDR (code));
2965 b1 = XINT (XCAR (code));
2966 b2 = XINT (XCDR (code));
2967 if (BYTE_BIG5_TWO_BYTE_1_P (b1) &&
2968 BYTE_BIG5_TWO_BYTE_2_P (b2))
2969 {
2970 DECODE_BIG5 (b1, b2, XCHARSET_LEADING_BYTE (charset), c1, c2);
2971 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F));
2972 }
2973 else
2974 return Qnil;
2975 }
2976
2977 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /*
2978 Encode the Big5 character CH to BIG5 coding-system.
2979 Return the corresponding character code in Big5.
2980 */
2981 (ch))
2982 {
2983 Lisp_Object charset;
2984 int c1, c2, b1, b2;
2985
2986 CHECK_CHAR_COERCE_INT (ch);
2987 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2);
2988 if (EQ (charset, Vcharset_chinese_big5_1) ||
2989 EQ (charset, Vcharset_chinese_big5_2))
2990 {
2991 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80,
2992 b1, b2);
2993 return Fcons (make_int (b1), make_int (b2));
2994 }
2995 else
2996 return Qnil;
2997 }
2998
2999
3000 /************************************************************************/
3001 /* ISO2022 methods */
3002 /************************************************************************/
3003
3004 /* The following note describes the coding system ISO2022 briefly.
3005 Since the intention of this note is to help understanding of the
3006 programs in this file, some parts are NOT ACCURATE or OVERLY
3007 SIMPLIFIED. For thorough understanding, please refer to the
3008 original document of ISO2022.
3009
3010 ISO2022 provides many mechanisms to encode several character sets
3011 in 7-bit and 8-bit environments. If one chooses 7-bit environment,
3012 all text is encoded by codes of less than 128. This may make the
3013 encoded text a little bit longer, but the text get more stability
3014 to pass through several gateways (some of them strip off MSB).
3015
3016 There are two kind of character sets: control character set and
3017 graphic character set. The former contains control characters such
3018 as `newline' and `escape' to provide control functions (control
3019 functions are provided also by escape sequence). The latter
3020 contains graphic characters such as 'A' and '-'. Emacs recognizes
3021 two control character sets and many graphic character sets.
3022
3023 Graphic character sets are classified into one of four types,
3024 according to the dimension and number of characters in the set:
3025 TYPE94, TYPE96, TYPE94x94, and TYPE96x96. In addition, each
3026 character set is assigned an identification byte, unique for each
3027 type, called "final character" (denoted as <F> hereafter). The <F>
3028 of each character set is decided by ECMA(*) when it is registered
3029 in ISO. Code range of <F> is 0x30..0x7F (0x30..0x3F are for
3030 private use only).
3031
3032 Note (*): ECMA = European Computer Manufacturers Association
3033
3034 Here are examples of graphic character set [NAME(<F>)]:
3035 o TYPE94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
3036 o TYPE96 -- right-half-of-ISO8859-1('A'), ...
3037 o TYPE94x94 -- GB2312('A'), JISX0208('B'), ...
3038 o TYPE96x96 -- none for the moment
3039
3040 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
3041 C0 [0x00..0x1F] -- control character plane 0
3042 GL [0x20..0x7F] -- graphic character plane 0
3043 C1 [0x80..0x9F] -- control character plane 1
3044 GR [0xA0..0xFF] -- graphic character plane 1
3045
3046 A control character set is directly designated and invoked to C0 or
3047 C1 by an escape sequence. The most common case is that:
3048 - ISO646's control character set is designated/invoked to C0, and
3049 - ISO6429's control character set is designated/invoked to C1,
3050 and usually these designations/invocations are omitted in encoded
3051 text. In a 7-bit environment, only C0 can be used, and a control
3052 character for C1 is encoded by an appropriate escape sequence to
3053 fit into the environment. All control characters for C1 are
3054 defined to have corresponding escape sequences.
3055
3056 A graphic character set is at first designated to one of four
3057 graphic registers (G0 through G3), then these graphic registers are
3058 invoked to GL or GR. These designations and invocations can be
3059 done independently. The most common case is that G0 is invoked to
3060 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
3061 these invocations and designations are omitted in encoded text.
3062 In a 7-bit environment, only GL can be used.
3063
3064 When a graphic character set of TYPE94 or TYPE94x94 is invoked to
3065 GL, codes 0x20 and 0x7F of the GL area work as control characters
3066 SPACE and DEL respectively, and code 0xA0 and 0xFF of GR area
3067 should not be used.
3068
3069 There are two ways of invocation: locking-shift and single-shift.
3070 With locking-shift, the invocation lasts until the next different
3071 invocation, whereas with single-shift, the invocation works only
3072 for the following character and doesn't affect locking-shift.
3073 Invocations are done by the following control characters or escape
3074 sequences.
3075
3076 ----------------------------------------------------------------------
3077 abbrev function cntrl escape seq description
3078 ----------------------------------------------------------------------
3079 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
3080 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
3081 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR
3082 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
3083 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR
3084 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
3085 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR
3086 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
3087 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
3088 ----------------------------------------------------------------------
3089 The first four are for locking-shift. Control characters for these
3090 functions are defined by macros ISO_CODE_XXX in `coding.h'.
3091
3092 Designations are done by the following escape sequences.
3093 ----------------------------------------------------------------------
3094 escape sequence description
3095 ----------------------------------------------------------------------
3096 ESC '(' <F> designate TYPE94<F> to G0
3097 ESC ')' <F> designate TYPE94<F> to G1
3098 ESC '*' <F> designate TYPE94<F> to G2
3099 ESC '+' <F> designate TYPE94<F> to G3
3100 ESC ',' <F> designate TYPE96<F> to G0 (*)
3101 ESC '-' <F> designate TYPE96<F> to G1
3102 ESC '.' <F> designate TYPE96<F> to G2
3103 ESC '/' <F> designate TYPE96<F> to G3
3104 ESC '$' '(' <F> designate TYPE94x94<F> to G0 (**)
3105 ESC '$' ')' <F> designate TYPE94x94<F> to G1
3106 ESC '$' '*' <F> designate TYPE94x94<F> to G2
3107 ESC '$' '+' <F> designate TYPE94x94<F> to G3
3108 ESC '$' ',' <F> designate TYPE96x96<F> to G0 (*)
3109 ESC '$' '-' <F> designate TYPE96x96<F> to G1
3110 ESC '$' '.' <F> designate TYPE96x96<F> to G2
3111 ESC '$' '/' <F> designate TYPE96x96<F> to G3
3112 ----------------------------------------------------------------------
3113 In this list, "TYPE94<F>" means a graphic character set of type TYPE94
3114 and final character <F>, and etc.
3115
3116 Note (*): Although these designations are not allowed in ISO2022,
3117 Emacs accepts them on decoding, and produces them on encoding
3118 TYPE96 or TYPE96x96 character set in a coding system which is
3119 characterized as 7-bit environment, non-locking-shift, and
3120 non-single-shift.
3121
3122 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
3123 '(' can be omitted. We call this as "short-form" here after.
3124
3125 Now you may notice that there are a lot of ways for encoding the
3126 same multilingual text in ISO2022. Actually, there exist many
3127 coding systems such as Compound Text (used in X's inter client
3128 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
3129 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
3130 localized platforms), and all of these are variants of ISO2022.
3131
3132 In addition to the above, Emacs handles two more kinds of escape
3133 sequences: ISO6429's direction specification and Emacs' private
3134 sequence for specifying character composition.
3135
3136 ISO6429's direction specification takes the following format:
3137 o CSI ']' -- end of the current direction
3138 o CSI '0' ']' -- end of the current direction
3139 o CSI '1' ']' -- start of left-to-right text
3140 o CSI '2' ']' -- start of right-to-left text
3141 The control character CSI (0x9B: control sequence introducer) is
3142 abbreviated to the escape sequence ESC '[' in 7-bit environment.
3143
3144 Character composition specification takes the following format:
3145 o ESC '0' -- start character composition
3146 o ESC '1' -- end character composition
3147 Since these are not standard escape sequences of any ISO, the use
3148 of them for these meanings is restricted to Emacs only. */
3149
3150 static void
3151 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso)
3152 {
3153 int i;
3154
3155 for (i = 0; i < 4; i++)
3156 {
3157 if (!NILP (coding_system))
3158 iso->charset[i] =
3159 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i);
3160 else
3161 iso->charset[i] = Qt;
3162 iso->invalid_designated[i] = 0;
3163 }
3164 iso->esc = ISO_ESC_NOTHING;
3165 iso->esc_bytes_index = 0;
3166 iso->register_left = 0;
3167 iso->register_right = 1;
3168 iso->switched_dir_and_no_valid_charset_yet = 0;
3169 iso->invalid_switch_dir = 0;
3170 iso->output_direction_sequence = 0;
3171 iso->output_literally = 0;
3172 if (iso->composite_chars)
3173 Dynarr_reset (iso->composite_chars);
3174 }
3175
3176 static int
3177 fit_to_be_escape_quoted (unsigned char c)
3178 {
3179 switch (c)
3180 {
3181 case ISO_CODE_ESC:
3182 case ISO_CODE_CSI:
3183 case ISO_CODE_SS2:
3184 case ISO_CODE_SS3:
3185 case ISO_CODE_SO:
3186 case ISO_CODE_SI:
3187 return 1;
3188
3189 default:
3190 return 0;
3191 }
3192 }
3193
3194 /* Parse one byte of an ISO2022 escape sequence.
3195 If the result is an invalid escape sequence, return 0 and
3196 do not change anything in STR. Otherwise, if the result is
3197 an incomplete escape sequence, update ISO2022.ESC and
3198 ISO2022.ESC_BYTES and return -1. Otherwise, update
3199 all the state variables (but not ISO2022.ESC_BYTES) and
3200 return 1.
3201
3202 If CHECK_INVALID_CHARSETS is non-zero, check for designation
3203 or invocation of an invalid character set and treat that as
3204 an unrecognized escape sequence. */
3205
3206 static int
3207 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso,
3208 unsigned char c, unsigned int *flags,
3209 int check_invalid_charsets)
3210 {
3211 /* (1) If we're at the end of a designation sequence, CS is the
3212 charset being designated and REG is the register to designate
3213 it to.
3214
3215 (2) If we're at the end of a locking-shift sequence, REG is
3216 the register to invoke and HALF (0 == left, 1 == right) is
3217 the half to invoke it into.
3218
3219 (3) If we're at the end of a single-shift sequence, REG is
3220 the register to invoke. */
3221 Lisp_Object cs = Qnil;
3222 int reg, half;
3223
3224 /* NOTE: This code does goto's all over the fucking place.
3225 The reason for this is that we're basically implementing
3226 a state machine here, and hierarchical languages like C
3227 don't really provide a clean way of doing this. */
3228
3229 if (! (*flags & CODING_STATE_ESCAPE))
3230 /* At beginning of escape sequence; we need to reset our
3231 escape-state variables. */
3232 iso->esc = ISO_ESC_NOTHING;
3233
3234 iso->output_literally = 0;
3235 iso->output_direction_sequence = 0;
3236
3237 switch (iso->esc)
3238 {
3239 case ISO_ESC_NOTHING:
3240 iso->esc_bytes_index = 0;
3241 switch (c)
3242 {
3243 case ISO_CODE_ESC: /* Start escape sequence */
3244 *flags |= CODING_STATE_ESCAPE;
3245 iso->esc = ISO_ESC;
3246 goto not_done;
3247
3248 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */
3249 *flags |= CODING_STATE_ESCAPE;
3250 iso->esc = ISO_ESC_5_11;
3251 goto not_done;
3252
3253 case ISO_CODE_SO: /* locking shift 1 */
3254 reg = 1; half = 0;
3255 goto locking_shift;
3256 case ISO_CODE_SI: /* locking shift 0 */
3257 reg = 0; half = 0;
3258 goto locking_shift;
3259
3260 case ISO_CODE_SS2: /* single shift */
3261 reg = 2;
3262 goto single_shift;
3263 case ISO_CODE_SS3: /* single shift */
3264 reg = 3;
3265 goto single_shift;
3266
3267 default: /* Other control characters */
3268 return 0;
3269 }
3270
3271 case ISO_ESC:
3272 switch (c)
3273 {
3274 /**** single shift ****/
3275
3276 case 'N': /* single shift 2 */
3277 reg = 2;
3278 goto single_shift;
3279 case 'O': /* single shift 3 */
3280 reg = 3;
3281 goto single_shift;
3282
3283 /**** locking shift ****/
3284
3285 case '~': /* locking shift 1 right */
3286 reg = 1; half = 1;
3287 goto locking_shift;
3288 case 'n': /* locking shift 2 */
3289 reg = 2; half = 0;
3290 goto locking_shift;
3291 case '}': /* locking shift 2 right */
3292 reg = 2; half = 1;
3293 goto locking_shift;
3294 case 'o': /* locking shift 3 */
3295 reg = 3; half = 0;
3296 goto locking_shift;
3297 case '|': /* locking shift 3 right */
3298 reg = 3; half = 1;
3299 goto locking_shift;
3300
3301 /**** composite ****/
3302
3303 case '0':
3304 iso->esc = ISO_ESC_START_COMPOSITE;
3305 *flags = (*flags & CODING_STATE_ISO2022_LOCK) |
3306 CODING_STATE_COMPOSITE;
3307 return 1;
3308
3309 case '1':
3310 iso->esc = ISO_ESC_END_COMPOSITE;
3311 *flags = (*flags & CODING_STATE_ISO2022_LOCK) &
3312 ~CODING_STATE_COMPOSITE;
3313 return 1;
3314
3315 /**** directionality ****/
3316
3317 case '[':
3318 iso->esc = ISO_ESC_5_11;
3319 goto not_done;
3320
3321 /**** designation ****/
3322
3323 case '$': /* multibyte charset prefix */
3324 iso->esc = ISO_ESC_2_4;
3325 goto not_done;
3326
3327 default:
3328 if (0x28 <= c && c <= 0x2F)
3329 {
3330 iso->esc = c - 0x28 + ISO_ESC_2_8;
3331 goto not_done;
3332 }
3333
3334 /* This function is called with CODESYS equal to nil when
3335 doing coding-system detection. */
3336 if (!NILP (codesys)
3337 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
3338 && fit_to_be_escape_quoted (c))
3339 {
3340 iso->esc = ISO_ESC_LITERAL;
3341 *flags &= CODING_STATE_ISO2022_LOCK;
3342 return 1;
3343 }
3344
3345 /* bzzzt! */
3346 return 0;
3347 }
3348
3349
3350
3351 /**** directionality ****/
3352
3353 case ISO_ESC_5_11: /* ISO6429 direction control */
3354 if (c == ']')
3355 {
3356 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3357 goto directionality;
3358 }
3359 if (c == '0') iso->esc = ISO_ESC_5_11_0;
3360 else if (c == '1') iso->esc = ISO_ESC_5_11_1;
3361 else if (c == '2') iso->esc = ISO_ESC_5_11_2;
3362 else return 0;
3363 goto not_done;
3364
3365 case ISO_ESC_5_11_0:
3366 if (c == ']')
3367 {
3368 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3369 goto directionality;
3370 }
3371 return 0;
3372
3373 case ISO_ESC_5_11_1:
3374 if (c == ']')
3375 {
3376 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L);
3377 goto directionality;
3378 }
3379 return 0;
3380
3381 case ISO_ESC_5_11_2:
3382 if (c == ']')
3383 {
3384 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L;
3385 goto directionality;
3386 }
3387 return 0;
3388
3389 directionality:
3390 iso->esc = ISO_ESC_DIRECTIONALITY;
3391 /* Various junk here to attempt to preserve the direction sequences
3392 literally in the text if they would otherwise be swallowed due
3393 to invalid designations that don't show up as actual charset
3394 changes in the text. */
3395 if (iso->invalid_switch_dir)
3396 {
3397 /* We already inserted a direction switch literally into the
3398 text. We assume (#### this may not be right) that the
3399 next direction switch is the one going the other way,
3400 and we need to output that literally as well. */
3401 iso->output_literally = 1;
3402 iso->invalid_switch_dir = 0;
3403 }
3404 else
3405 {
3406 int jj;
3407
3408 /* If we are in the thrall of in invalid designation,
3409 then stick the directionality sequence literally into the
3410 output stream so it ends up in the original text again. */
3411 for (jj = 0; jj < 4; jj++)
3412 if (iso->invalid_designated[jj])
3413 break;
3414 if (jj < 4)
3415 {
3416 iso->output_literally = 1;
3417 iso->invalid_switch_dir = 1;
3418 }
3419 else
3420 /* Indicate that we haven't yet seen a valid designation,
3421 so that if a switch-dir is directly followed by an
3422 invalid designation, both get inserted literally. */
3423 iso->switched_dir_and_no_valid_charset_yet = 1;
3424 }
3425 return 1;
3426
3427
3428 /**** designation ****/
3429
3430 case ISO_ESC_2_4:
3431 if (0x28 <= c && c <= 0x2F)
3432 {
3433 iso->esc = c - 0x28 + ISO_ESC_2_4_8;
3434 goto not_done;
3435 }
3436 if (0x40 <= c && c <= 0x42)
3437 {
3438 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c,
3439 *flags & CODING_STATE_R2L ?
3440 CHARSET_RIGHT_TO_LEFT :
3441 CHARSET_LEFT_TO_RIGHT);
3442 reg = 0;
3443 goto designated;
3444 }
3445 return 0;
3446
3447 default:
3448 {
3449 int type;
3450
3451 if (c < '0' || c > '~')
3452 return 0; /* bad final byte */
3453
3454 if (iso->esc >= ISO_ESC_2_8 &&
3455 iso->esc <= ISO_ESC_2_15)
3456 {
3457 if (iso->esc >= ISO_ESC_2_12)
3458 type = CHARSET_TYPE_96;
3459 else
3460 type = CHARSET_TYPE_94;
3461 reg = (iso->esc - ISO_ESC_2_8) & 3;
3462 }
3463 else if (iso->esc >= ISO_ESC_2_4_8 &&
3464 iso->esc <= ISO_ESC_2_4_15)
3465 {
3466 if (iso->esc >= ISO_ESC_2_4_12)
3467 type = CHARSET_TYPE_96X96;
3468 else
3469 type = CHARSET_TYPE_94X94;
3470 reg = (iso->esc - ISO_ESC_2_4_8) & 3;
3471 }
3472
3473 cs = CHARSET_BY_ATTRIBUTES (type, c,
3474 *flags & CODING_STATE_R2L ?
3475 CHARSET_RIGHT_TO_LEFT :
3476 CHARSET_LEFT_TO_RIGHT);
3477 goto designated;
3478 }
3479 }
3480
3481 not_done:
3482 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c;
3483 return -1;
3484
3485 single_shift:
3486 if (check_invalid_charsets && !CHARSETP (iso->charset[reg]))
3487 /* can't invoke something that ain't there. */
3488 return 0;
3489 iso->esc = ISO_ESC_SINGLE_SHIFT;
3490 *flags &= CODING_STATE_ISO2022_LOCK;
3491 if (reg == 2)
3492 *flags |= CODING_STATE_SS2;
3493 else
3494 *flags |= CODING_STATE_SS3;
3495 return 1;
3496
3497 locking_shift:
3498 if (check_invalid_charsets &&
3499 !CHARSETP (iso->charset[reg]))
3500 /* can't invoke something that ain't there. */
3501 return 0;
3502 if (half)
3503 iso->register_right = reg;
3504 else
3505 iso->register_left = reg;
3506 *flags &= CODING_STATE_ISO2022_LOCK;
3507 iso->esc = ISO_ESC_LOCKING_SHIFT;
3508 return 1;
3509
3510 designated:
3511 if (NILP (cs) && check_invalid_charsets)
3512 {
3513 iso->invalid_designated[reg] = 1;
3514 iso->charset[reg] = Vcharset_ascii;
3515 iso->esc = ISO_ESC_DESIGNATE;
3516 *flags &= CODING_STATE_ISO2022_LOCK;
3517 iso->output_literally = 1;
3518 if (iso->switched_dir_and_no_valid_charset_yet)
3519 {
3520 /* We encountered a switch-direction followed by an
3521 invalid designation. Ensure that the switch-direction
3522 gets outputted; otherwise it will probably get eaten
3523 when the text is written out again. */
3524 iso->switched_dir_and_no_valid_charset_yet = 0;
3525 iso->output_direction_sequence = 1;
3526 /* And make sure that the switch-dir going the other
3527 way gets outputted, as well. */
3528 iso->invalid_switch_dir = 1;
3529 }
3530 return 1;
3531 }
3532 /* This function is called with CODESYS equal to nil when
3533 doing coding-system detection. */
3534 if (!NILP (codesys))
3535 {
3536 charset_conversion_spec_dynarr *dyn =
3537 XCODING_SYSTEM (codesys)->iso2022.input_conv;
3538
3539 if (dyn)
3540 {
3541 int i;
3542
3543 for (i = 0; i < Dynarr_length (dyn); i++)
3544 {
3545 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
3546 if (EQ (cs, spec->from_charset))
3547 cs = spec->to_charset;
3548 }
3549 }
3550 }
3551
3552 iso->charset[reg] = cs;
3553 iso->esc = ISO_ESC_DESIGNATE;
3554 *flags &= CODING_STATE_ISO2022_LOCK;
3555 if (iso->invalid_designated[reg])
3556 {
3557 iso->invalid_designated[reg] = 0;
3558 iso->output_literally = 1;
3559 }
3560 if (iso->switched_dir_and_no_valid_charset_yet)
3561 iso->switched_dir_and_no_valid_charset_yet = 0;
3562 return 1;
3563 }
3564
3565 static int
3566 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src,
3567 unsigned int n)
3568 {
3569 int c;
3570 int mask;
3571
3572 /* #### There are serious deficiencies in the recognition mechanism
3573 here. This needs to be much smarter if it's going to cut it. */
3574
3575 if (!st->iso2022.initted)
3576 {
3577 reset_iso2022 (Qnil, &st->iso2022.iso);
3578 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK |
3579 CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
3580 CODING_CATEGORY_ISO_8_1_MASK |
3581 CODING_CATEGORY_ISO_8_2_MASK |
3582 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK);
3583 st->iso2022.flags = 0;
3584 st->iso2022.high_byte_count = 0;
3585 st->iso2022.saw_single_shift = 0;
3586 st->iso2022.initted = 1;
3587 }
3588
3589 mask = st->iso2022.mask;
3590
3591 while (n--)
3592 {
3593 c = *src++;
3594 if (c >= 0xA0)
3595 {
3596 mask &= ~CODING_CATEGORY_ISO_7_MASK;
3597 st->iso2022.high_byte_count++;
3598 }
3599 else
3600 {
3601 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift)
3602 {
3603 if (st->iso2022.high_byte_count & 1)
3604 /* odd number of high bytes; assume not iso-8-2 */
3605 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
3606 }
3607 st->iso2022.high_byte_count = 0;
3608 st->iso2022.saw_single_shift = 0;
3609 if (c > 0x80)
3610 mask &= ~CODING_CATEGORY_ISO_7_MASK;
3611 }
3612 if (!(st->iso2022.flags & CODING_STATE_ESCAPE)
3613 && (BYTE_C0_P (c) || BYTE_C1_P (c)))
3614 { /* control chars */
3615 switch (c)
3616 {
3617 /* Allow and ignore control characters that you might
3618 reasonably see in a text file */
3619 case '\r':
3620 case '\n':
3621 case '\t':
3622 case 7: /* bell */
3623 case 8: /* backspace */
3624 case 11: /* vertical tab */
3625 case 12: /* form feed */
3626 case 26: /* MS-DOS C-z junk */
3627 goto label_continue_loop;
3628
3629 default:
3630 break;
3631 }
3632 }
3633
3634 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c)
3635 || BYTE_C1_P (c))
3636 {
3637 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c,
3638 &st->iso2022.flags, 0))
3639 {
3640 switch (st->iso2022.iso.esc)
3641 {
3642 case ISO_ESC_DESIGNATE:
3643 mask &= ~CODING_CATEGORY_ISO_8_1_MASK;
3644 mask &= ~CODING_CATEGORY_ISO_8_2_MASK;
3645 break;
3646 case ISO_ESC_LOCKING_SHIFT:
3647 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK;
3648 goto ran_out_of_chars;
3649 case ISO_ESC_SINGLE_SHIFT:
3650 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK;
3651 st->iso2022.saw_single_shift = 1;
3652 break;
3653 default:
3654 break;
3655 }
3656 }
3657 else
3658 {
3659 mask = 0;
3660 goto ran_out_of_chars;
3661 }
3662 }
3663 label_continue_loop:;
3664 }
3665
3666 ran_out_of_chars:
3667
3668 return mask;
3669 }
3670
3671 static int
3672 postprocess_iso2022_mask (int mask)
3673 {
3674 /* #### kind of cheesy */
3675 /* If seven-bit ISO is allowed, then assume that the encoding is
3676 entirely seven-bit and turn off the eight-bit ones. */
3677 if (mask & CODING_CATEGORY_ISO_7_MASK)
3678 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK |
3679 CODING_CATEGORY_ISO_8_1_MASK |
3680 CODING_CATEGORY_ISO_8_2_MASK);
3681 return mask;
3682 }
3683
3684 /* If FLAGS is a null pointer or specifies right-to-left motion,
3685 output a switch-dir-to-left-to-right sequence to DST.
3686 Also update FLAGS if it is not a null pointer.
3687 If INTERNAL_P is set, we are outputting in internal format and
3688 need to handle the CSI differently. */
3689
3690 static void
3691 restore_left_to_right_direction (struct Lisp_Coding_System *codesys,
3692 unsigned_char_dynarr *dst,
3693 unsigned int *flags,
3694 int internal_p)
3695 {
3696 if (!flags || (*flags & CODING_STATE_R2L))
3697 {
3698 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
3699 {
3700 Dynarr_add (dst, ISO_CODE_ESC);
3701 Dynarr_add (dst, '[');
3702 }
3703 else if (internal_p)
3704 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
3705 else
3706 Dynarr_add (dst, ISO_CODE_CSI);
3707 Dynarr_add (dst, '0');
3708 Dynarr_add (dst, ']');
3709 if (flags)
3710 *flags &= ~CODING_STATE_R2L;
3711 }
3712 }
3713
3714 /* If FLAGS is a null pointer or specifies a direction different from
3715 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or
3716 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape
3717 sequence to DST. Also update FLAGS if it is not a null pointer.
3718 If INTERNAL_P is set, we are outputting in internal format and
3719 need to handle the CSI differently. */
3720
3721 static void
3722 ensure_correct_direction (int direction, struct Lisp_Coding_System *codesys,
3723 unsigned_char_dynarr *dst, unsigned int *flags,
3724 int internal_p)
3725 {
3726 if ((!flags || (*flags & CODING_STATE_R2L)) &&
3727 direction == CHARSET_LEFT_TO_RIGHT)
3728 restore_left_to_right_direction (codesys, dst, flags, internal_p);
3729 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys)
3730 && (!flags || !(*flags & CODING_STATE_R2L)) &&
3731 direction == CHARSET_RIGHT_TO_LEFT)
3732 {
3733 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
3734 {
3735 Dynarr_add (dst, ISO_CODE_ESC);
3736 Dynarr_add (dst, '[');
3737 }
3738 else if (internal_p)
3739 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst);
3740 else
3741 Dynarr_add (dst, ISO_CODE_CSI);
3742 Dynarr_add (dst, '2');
3743 Dynarr_add (dst, ']');
3744 if (flags)
3745 *flags |= CODING_STATE_R2L;
3746 }
3747 }
3748
3749 /* Convert ISO2022-format data to internal format. */
3750
3751 static void
3752 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src,
3753 unsigned_char_dynarr *dst, unsigned int n)
3754 {
3755 unsigned char c;
3756 unsigned int flags, ch;
3757 int eol;
3758 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
3759 Lisp_Object coding_system = Qnil;
3760 unsigned_char_dynarr *real_dst = dst;
3761
3762 CODING_STREAM_DECOMPOSE (str, flags, ch);
3763 eol = CODING_SYSTEM_EOL_TYPE (str->codesys);
3764 XSETCODING_SYSTEM (coding_system, str->codesys);
3765
3766 if (flags & CODING_STATE_COMPOSITE)
3767 dst = str->iso2022.composite_chars;
3768
3769 while (n--)
3770 {
3771 c = *src++;
3772 if (flags & CODING_STATE_ESCAPE)
3773 { /* Within ESC sequence */
3774 int retval;
3775
3776 retval = parse_iso2022_esc (coding_system, &str->iso2022,
3777 c, &flags, 1);
3778
3779 if (retval)
3780 {
3781 switch (str->iso2022.esc)
3782 {
3783 case ISO_ESC_START_COMPOSITE:
3784 if (str->iso2022.composite_chars)
3785 Dynarr_reset (str->iso2022.composite_chars);
3786 else
3787 str->iso2022.composite_chars = Dynarr_new (unsigned char);
3788 dst = str->iso2022.composite_chars;
3789 break;
3790 case ISO_ESC_END_COMPOSITE:
3791 {
3792 Bufbyte comstr[MAX_EMCHAR_LEN];
3793 Bytecount len;
3794 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0),
3795 Dynarr_length (dst));
3796 dst = real_dst;
3797 len = set_charptr_emchar (comstr, emch);
3798 Dynarr_add_many (dst, comstr, len);
3799 break;
3800 }
3801
3802 case ISO_ESC_LITERAL:
3803 DECODE_ADD_BINARY_CHAR (c, dst);
3804 break;
3805
3806 default:
3807 /* Everything else handled already */
3808 break;
3809 }
3810 }
3811
3812 /* Attempted error recovery. */
3813 if (str->iso2022.output_direction_sequence)
3814 ensure_correct_direction (flags & CODING_STATE_R2L ?
3815 CHARSET_RIGHT_TO_LEFT :
3816 CHARSET_LEFT_TO_RIGHT,
3817 str->codesys, dst, 0, 1);
3818 /* More error recovery. */
3819 if (!retval || str->iso2022.output_literally)
3820 {
3821 /* Output the (possibly invalid) sequence */
3822 int i;
3823 for (i = 0; i < str->iso2022.esc_bytes_index; i++)
3824 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst);
3825 flags &= CODING_STATE_ISO2022_LOCK;
3826 if (!retval)
3827 n++, src--;/* Repeat the loop with the same character. */
3828 else
3829 {
3830 /* No sense in reprocessing the final byte of the
3831 escape sequence; it could mess things up anyway.
3832 Just add it now. */
3833 DECODE_ADD_BINARY_CHAR (c, dst);
3834 }
3835 }
3836 ch = 0;
3837 }
3838 else if (BYTE_C0_P (c) || BYTE_C1_P (c))
3839 { /* Control characters */
3840
3841 /***** Error-handling *****/
3842
3843 /* If we were in the middle of a character, dump out the
3844 partial character. */
3845 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3846
3847 /* If we just saw a single-shift character, dump it out.
3848 This may dump out the wrong sort of single-shift character,
3849 but least it will give an indication that something went
3850 wrong. */
3851 if (flags & CODING_STATE_SS2)
3852 {
3853 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst);
3854 flags &= ~CODING_STATE_SS2;
3855 }
3856 if (flags & CODING_STATE_SS3)
3857 {
3858 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst);
3859 flags &= ~CODING_STATE_SS3;
3860 }
3861
3862 /***** Now handle the control characters. *****/
3863
3864 /* Handle CR/LF */
3865 DECODE_HANDLE_EOL_TYPE (eol, c, flags, dst);
3866
3867 flags &= CODING_STATE_ISO2022_LOCK;
3868
3869 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1))
3870 DECODE_ADD_BINARY_CHAR (c, dst);
3871 }
3872 else
3873 { /* Graphic characters */
3874 Lisp_Object charset;
3875 int lb;
3876 int reg;
3877
3878 DECODE_HANDLE_EOL_TYPE (eol, c, flags, dst);
3879
3880 /* Now determine the charset. */
3881 reg = ((flags & CODING_STATE_SS2) ? 2
3882 : (flags & CODING_STATE_SS3) ? 3
3883 : !BYTE_ASCII_P (c) ? str->iso2022.register_right
3884 : str->iso2022.register_left);
3885 charset = str->iso2022.charset[reg];
3886
3887 /* Error checking: */
3888 if (NILP (charset) || str->iso2022.invalid_designated[reg]
3889 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL)
3890 && XCHARSET_CHARS (charset) == 94))
3891 /* Mrmph. We are trying to invoke a register that has no
3892 or an invalid charset in it, or trying to add a character
3893 outside the range of the charset. Insert that char literally
3894 to preserve it for the output. */
3895 {
3896 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3897 DECODE_ADD_BINARY_CHAR (c, dst);
3898 }
3899
3900 else
3901 {
3902 /* Things are probably hunky-dorey. */
3903
3904 /* Fetch reverse charset, maybe. */
3905 if (((flags & CODING_STATE_R2L) &&
3906 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT)
3907 ||
3908 (!(flags & CODING_STATE_R2L) &&
3909 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT))
3910 {
3911 Lisp_Object new_charset =
3912 XCHARSET_REVERSE_DIRECTION_CHARSET (charset);
3913 if (!NILP (new_charset))
3914 charset = new_charset;
3915 }
3916
3917 lb = XCHARSET_LEADING_BYTE (charset);
3918 switch (XCHARSET_REP_BYTES (charset))
3919 {
3920 case 1: /* ASCII */
3921 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3922 Dynarr_add (dst, c & 0x7F);
3923 break;
3924
3925 case 2: /* one-byte official */
3926 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3927 Dynarr_add (dst, lb);
3928 Dynarr_add (dst, c | 0x80);
3929 break;
3930
3931 case 3: /* one-byte private or two-byte official */
3932 if (XCHARSET_PRIVATE_P (charset))
3933 {
3934 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3935 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1);
3936 Dynarr_add (dst, lb);
3937 Dynarr_add (dst, c | 0x80);
3938 }
3939 else
3940 {
3941 if (ch)
3942 {
3943 Dynarr_add (dst, lb);
3944 Dynarr_add (dst, ch | 0x80);
3945 Dynarr_add (dst, c | 0x80);
3946 ch = 0;
3947 }
3948 else
3949 ch = c;
3950 }
3951 break;
3952
3953 default: /* two-byte private */
3954 if (ch)
3955 {
3956 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2);
3957 Dynarr_add (dst, lb);
3958 Dynarr_add (dst, ch | 0x80);
3959 Dynarr_add (dst, c | 0x80);
3960 ch = 0;
3961 }
3962 else
3963 ch = c;
3964 }
3965 }
3966
3967 if (!ch)
3968 flags &= CODING_STATE_ISO2022_LOCK;
3969 }
3970
3971 label_continue_loop:;
3972 }
3973
3974 if (flags & CODING_STATE_END)
3975 DECODE_OUTPUT_PARTIAL_CHAR (ch);
3976
3977 CODING_STREAM_COMPOSE (str, flags, ch);
3978 }
3979
3980
3981 /***** ISO2022 encoder *****/
3982
3983 /* Designate CHARSET into register REG. */
3984
3985 static void
3986 iso2022_designate (Lisp_Object charset, unsigned char reg,
3987 struct encoding_stream *str, unsigned_char_dynarr *dst)
3988 {
3989 CONST char *inter94 = "()*+", *inter96= ",-./";
3990 int type;
3991 unsigned char final;
3992 Lisp_Object old_charset = str->iso2022.charset[reg];
3993
3994 str->iso2022.charset[reg] = charset;
3995 if (!CHARSETP (charset))
3996 /* charset might be an initial nil or t. */
3997 return;
3998 type = XCHARSET_TYPE (charset);
3999 final = XCHARSET_FINAL (charset);
4000 if (!str->iso2022.force_charset_on_output[reg] &&
4001 CHARSETP (old_charset) &&
4002 XCHARSET_TYPE (old_charset) == type &&
4003 XCHARSET_FINAL (old_charset) == final)
4004 return;
4005
4006 str->iso2022.force_charset_on_output[reg] = 0;
4007
4008 {
4009 charset_conversion_spec_dynarr *dyn =
4010 str->codesys->iso2022.output_conv;
4011
4012 if (dyn)
4013 {
4014 int i;
4015
4016 for (i = 0; i < Dynarr_length (dyn); i++)
4017 {
4018 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i);
4019 if (EQ (charset, spec->from_charset))
4020 charset = spec->to_charset;
4021 }
4022 }
4023 }
4024
4025 Dynarr_add (dst, ISO_CODE_ESC);
4026 switch (type)
4027 {
4028 case CHARSET_TYPE_94:
4029 Dynarr_add (dst, inter94[reg]);
4030 break;
4031 case CHARSET_TYPE_96:
4032 Dynarr_add (dst, inter96[reg]);
4033 break;
4034 case CHARSET_TYPE_94X94:
4035 Dynarr_add (dst, '$');
4036 if (reg != 0
4037 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys))
4038 || final < '@'
4039 || final > 'B')
4040 Dynarr_add (dst, inter94[reg]);
4041 break;
4042 case CHARSET_TYPE_96X96:
4043 Dynarr_add (dst, '$');
4044 Dynarr_add (dst, inter96[reg]);
4045 break;
4046 }
4047 Dynarr_add (dst, final);
4048 }
4049
4050 static void
4051 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst)
4052 {
4053 if (str->iso2022.register_left != 0)
4054 {
4055 Dynarr_add (dst, ISO_CODE_SI);
4056 str->iso2022.register_left = 0;
4057 }
4058 }
4059
4060 static void
4061 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst)
4062 {
4063 if (str->iso2022.register_left != 1)
4064 {
4065 Dynarr_add (dst, ISO_CODE_SO);
4066 str->iso2022.register_left = 1;
4067 }
4068 }
4069
4070 /* Convert internally-formatted data to ISO2022 format. */
4071
4072 static void
4073 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src,
4074 unsigned_char_dynarr *dst, unsigned int n)
4075 {
4076 unsigned char charmask, c;
4077 unsigned int flags, ch, eol;
4078 unsigned char char_boundary;
4079 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4080 struct Lisp_Coding_System *codesys = str->codesys;
4081 int i;
4082 Lisp_Object charset;
4083 int half;
4084
4085 /* flags for handling composite chars. We do a little switcharoo
4086 on the source while we're outputting the composite char. */
4087 unsigned int saved_n;
4088 CONST unsigned char *saved_src;
4089 int in_composite = 0;
4090
4091 CODING_STREAM_DECOMPOSE (str, flags, ch);
4092 eol = CODING_SYSTEM_EOL_TYPE (str->codesys);
4093 char_boundary = str->iso2022.current_char_boundary;
4094 charset = str->iso2022.current_charset;
4095 half = str->iso2022.current_half;
4096
4097 back_to_square_n:
4098 while (n--)
4099 {
4100 c = *src++;
4101
4102 if (BYTE_ASCII_P (c))
4103 { /* Processing ASCII character */
4104 ch = 0;
4105
4106 restore_left_to_right_direction (codesys, dst, &flags, 0);
4107
4108 /* Make sure G0 contains ASCII */
4109 if ((c > ' ' && c < ISO_CODE_DEL) ||
4110 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys))
4111 {
4112 ensure_normal_shift (str, dst);
4113 iso2022_designate (Vcharset_ascii, 0, str, dst);
4114 }
4115
4116 /* If necessary, restore everything to the default state
4117 at end-of-line */
4118 if (c == '\n' &&
4119 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys)))
4120 {
4121 restore_left_to_right_direction (codesys, dst, &flags, 0);
4122
4123 ensure_normal_shift (str, dst);
4124
4125 for (i = 0; i < 4; i++)
4126 {
4127 Lisp_Object initial_charset =
4128 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4129 iso2022_designate (initial_charset, i, str, dst);
4130 }
4131 }
4132 if (c == '\n')
4133 {
4134 if (eol != EOL_LF && eol != EOL_AUTODETECT)
4135 Dynarr_add (dst, '\r');
4136 if (eol != EOL_CR)
4137 Dynarr_add (dst, c);
4138 }
4139 else
4140 {
4141 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4142 && fit_to_be_escape_quoted (c))
4143 Dynarr_add (dst, ISO_CODE_ESC);
4144 Dynarr_add (dst, c);
4145 }
4146 char_boundary = 1;
4147 }
4148
4149 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch))
4150 { /* Processing Leading Byte */
4151 ch = 0;
4152 charset = CHARSET_BY_LEADING_BYTE (c);
4153 if (c == PRE_LEADING_BYTE_PRIVATE_1 ||
4154 c == PRE_LEADING_BYTE_PRIVATE_2)
4155 ch = c;
4156 else if (!EQ (charset, Vcharset_control_1)
4157 && !EQ (charset, Vcharset_composite))
4158 {
4159 int reg;
4160
4161 ensure_correct_direction (XCHARSET_DIRECTION (charset),
4162 codesys, dst, &flags, 0);
4163
4164 /* Now determine which register to use. */
4165 reg = -1;
4166 for (i = 0; i < 4; i++)
4167 {
4168 if (EQ (charset, str->iso2022.charset[i]) ||
4169 EQ (charset,
4170 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)))
4171 {
4172 reg = i;
4173 break;
4174 }
4175 }
4176
4177 if (reg == -1)
4178 {
4179 if (XCHARSET_GRAPHIC (charset) != 0)
4180 {
4181 if (!NILP (str->iso2022.charset[1]) &&
4182 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) ||
4183 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys)))
4184 reg = 1;
4185 else if (!NILP (str->iso2022.charset[2]))
4186 reg = 2;
4187 else if (!NILP (str->iso2022.charset[3]))
4188 reg = 3;
4189 else
4190 reg = 0;
4191 }
4192 else
4193 reg = 0;
4194 }
4195
4196 iso2022_designate (charset, reg, str, dst);
4197
4198 /* Now invoke that register. */
4199 switch (reg)
4200 {
4201 case 0:
4202 ensure_normal_shift (str, dst);
4203 half = 0;
4204 break;
4205
4206 case 1:
4207 if (CODING_SYSTEM_ISO2022_SEVEN (codesys))
4208 {
4209 ensure_shift_out (str, dst);
4210 half = 0;
4211 }
4212 else
4213 half = 1;
4214 break;
4215
4216 case 2:
4217 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4218 {
4219 Dynarr_add (dst, ISO_CODE_ESC);
4220 Dynarr_add (dst, 'N');
4221 half = 0;
4222 }
4223 else
4224 {
4225 Dynarr_add (dst, ISO_CODE_SS2);
4226 half = 1;
4227 }
4228 break;
4229
4230 case 3:
4231 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys))
4232 {
4233 Dynarr_add (dst, ISO_CODE_ESC);
4234 Dynarr_add (dst, 'O');
4235 half = 0;
4236 }
4237 else
4238 {
4239 Dynarr_add (dst, ISO_CODE_SS3);
4240 half = 1;
4241 }
4242 break;
4243
4244 default:
4245 abort ();
4246 }
4247 }
4248 char_boundary = 0;
4249 }
4250 else
4251 { /* Processing Non-ASCII character */
4252 charmask = (half == 0 ? 0x7F : 0xFF);
4253 char_boundary = 1;
4254 if (EQ (charset, Vcharset_control_1))
4255 {
4256 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)
4257 && fit_to_be_escape_quoted (c))
4258 Dynarr_add (dst, ISO_CODE_ESC);
4259 /* you asked for it ... */
4260 Dynarr_add (dst, c - 0x20);
4261 }
4262 else
4263 {
4264 switch (XCHARSET_REP_BYTES (charset))
4265 {
4266 case 2:
4267 Dynarr_add (dst, c & charmask);
4268 break;
4269 case 3:
4270 if (XCHARSET_PRIVATE_P (charset))
4271 {
4272 Dynarr_add (dst, c & charmask);
4273 ch = 0;
4274 }
4275 else if (ch)
4276 {
4277 if (EQ (charset, Vcharset_composite))
4278 {
4279 if (in_composite)
4280 {
4281 /* #### Bother! We don't know how to
4282 handle this yet. */
4283 Dynarr_add (dst, '~');
4284 }
4285 else
4286 {
4287 Emchar emch = MAKE_CHAR (Vcharset_composite,
4288 ch & 0x7F, c & 0x7F);
4289 Lisp_Object lstr = composite_char_string (emch);
4290 saved_n = n;
4291 saved_src = src;
4292 in_composite = 1;
4293 src = XSTRING_DATA (lstr);
4294 n = XSTRING_LENGTH (lstr);
4295 Dynarr_add (dst, ISO_CODE_ESC);
4296 Dynarr_add (dst, '0'); /* start composing */
4297 }
4298 }
4299 else
4300 {
4301 Dynarr_add (dst, ch & charmask);
4302 Dynarr_add (dst, c & charmask);
4303 }
4304 ch = 0;
4305 }
4306 else
4307 {
4308 ch = c;
4309 char_boundary = 0;
4310 }
4311 break;
4312 case 4:
4313 if (ch)
4314 {
4315 Dynarr_add (dst, ch & charmask);
4316 Dynarr_add (dst, c & charmask);
4317 ch = 0;
4318 }
4319 else
4320 {
4321 ch = c;
4322 char_boundary = 0;
4323 }
4324 break;
4325 default:
4326 abort ();
4327 }
4328 }
4329 }
4330 }
4331
4332 if (in_composite)
4333 {
4334 n = saved_n;
4335 src = saved_src;
4336 in_composite = 0;
4337 Dynarr_add (dst, ISO_CODE_ESC);
4338 Dynarr_add (dst, '1'); /* end composing */
4339 goto back_to_square_n; /* Wheeeeeeeee ..... */
4340 }
4341
4342 if (char_boundary && flags & CODING_STATE_END)
4343 {
4344 restore_left_to_right_direction (codesys, dst, &flags, 0);
4345 ensure_normal_shift (str, dst);
4346 for (i = 0; i < 4; i++)
4347 {
4348 Lisp_Object initial_charset =
4349 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i);
4350 iso2022_designate (initial_charset, i, str, dst);
4351 }
4352 }
4353
4354 CODING_STREAM_COMPOSE (str, flags, ch);
4355 str->iso2022.current_char_boundary = char_boundary;
4356 str->iso2022.current_charset = charset;
4357 str->iso2022.current_half = half;
4358
4359 /* Verbum caro factum est! */
4360 }
4361
4362
4363 /************************************************************************/
4364 /* No-conversion methods */
4365 /************************************************************************/
4366
4367 /* This is used when reading in "binary" files -- i.e. files that may
4368 contain all 256 possible byte values and that are not to be
4369 interpreted as being in any particular decoding. */
4370 static void
4371 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src,
4372 unsigned_char_dynarr *dst, unsigned int n)
4373 {
4374 unsigned char c;
4375 unsigned int flags, ch;
4376 int eol;
4377 struct decoding_stream *str = DECODING_STREAM_DATA (decoding);
4378
4379 CODING_STREAM_DECOMPOSE (str, flags, ch);
4380 eol = str->eol_type;
4381
4382 while (n--)
4383 {
4384 c = *src++;
4385
4386 DECODE_HANDLE_EOL_TYPE (eol, c, flags, dst);
4387 DECODE_ADD_BINARY_CHAR (c, dst);
4388 label_continue_loop:;
4389 }
4390
4391 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst);
4392
4393 CODING_STREAM_COMPOSE (str, flags, ch);
4394 }
4395
4396 static void
4397 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src,
4398 unsigned_char_dynarr *dst, unsigned int n)
4399 {
4400 unsigned char c;
4401 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding);
4402 unsigned int flags, ch, eol;
4403
4404 CODING_STREAM_DECOMPOSE (str, flags, ch);
4405 eol = CODING_SYSTEM_EOL_TYPE (str->codesys);
4406
4407 while (n--)
4408 {
4409 c = *src++;
4410 if (c == '\n')
4411 {
4412 if (eol != EOL_LF && eol != EOL_AUTODETECT)
4413 Dynarr_add (dst, '\r');
4414 if (eol != EOL_CR)
4415 Dynarr_add (dst, '\n');
4416 ch = 0;
4417 }
4418 else if (BYTE_ASCII_P (c))
4419 {
4420 assert (ch == 0);
4421 Dynarr_add (dst, c);
4422 }
4423 else if (BUFBYTE_LEADING_BYTE_P (c))
4424 {
4425 assert (ch == 0);
4426 if (c == LEADING_BYTE_LATIN_1 || c == LEADING_BYTE_CONTROL_1)
4427 ch = c;
4428 else
4429 Dynarr_add (dst, '~'); /* untranslatable character */
4430 }
4431 else
4432 {
4433 if (ch == LEADING_BYTE_LATIN_1)
4434 Dynarr_add (dst, c);
4435 else if (ch == LEADING_BYTE_CONTROL_1)
4436 {
4437 assert (c < 0xC0);
4438 Dynarr_add (dst, c - 0x20);
4439 }
4440 /* else it should be the second or third byte of an
4441 untranslatable character, so ignore it */
4442 ch = 0;
4443 }
4444 }
4445
4446 CODING_STREAM_COMPOSE (str, flags, ch);
4447 }
4448
4449
4450 /************************************************************************/
4451 /* Simple internal/external functions */
4452 /************************************************************************/
4453
4454 static extbyte_dynarr *conversion_out_dynarr;
4455 static bufbyte_dynarr *conversion_in_dynarr;
4456
4457 /* Determine coding system from coding format */
4458
4459 #define PATHNAME_CODING_SYSTEM \
4460 ((NILP (Vpathname_coding_system) || \
4461 (EQ ((Vpathname_coding_system), Qbinary))) ? \
4462 Qnil : Fget_coding_system (Vpathname_coding_system))
4463
4464 /* #### not correct for all values of `fmt'! */
4465 #define FMT_CODING_SYSTEM(fmt) \
4466 (((fmt) == FORMAT_FILENAME) ? PATHNAME_CODING_SYSTEM : \
4467 ((fmt) == FORMAT_CTEXT ) ? Fget_coding_system (Qctext) : \
4468 ((fmt) == FORMAT_TERMINAL) ? PATHNAME_CODING_SYSTEM : \
4469 Qnil)
4470
4471 extern CONST Extbyte *
4472 convert_to_external_format (CONST Bufbyte *ptr,
4473 Bytecount len,
4474 Extcount *len_out,
4475 enum external_data_format fmt)
4476 {
4477 Lisp_Object coding_system = FMT_CODING_SYSTEM (fmt);
4478
4479 if (!conversion_out_dynarr)
4480 conversion_out_dynarr = Dynarr_new (Extbyte);
4481 else
4482 Dynarr_reset (conversion_out_dynarr);
4483
4484 if (NILP (coding_system))
4485 {
4486 CONST Bufbyte *end = ptr + len;
4487
4488 for (; ptr < end;)
4489 {
4490 Bufbyte c =
4491 (BYTE_ASCII_P (*ptr)) ? *ptr :
4492 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) :
4493 (*ptr == LEADING_BYTE_LATIN_1) ? (*(ptr+1)) :
4494 '~';
4495
4496 Dynarr_add (conversion_out_dynarr, (Extbyte) c);
4497 INC_CHARPTR (ptr);
4498 }
4499
4500 #ifdef ERROR_CHECK_BUFPOS
4501 assert (ptr == end);
4502 #endif
4503 }
4504 else
4505 {
4506 Lisp_Object instream =
4507 make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
4508 Lisp_Object outstream = make_dynarr_output_stream
4509 ((unsigned_char_dynarr *) conversion_out_dynarr);
4510 struct gcpro gcpro1, gcpro2;
4511 char tempbuf[1024]; /* some random amount */
4512
4513 outstream =
4514 make_encoding_output_stream (XLSTREAM (outstream), coding_system);
4515 GCPRO2 (instream, outstream); /* Necessary?? */
4516 while (1)
4517 {
4518 int size_in_bytes = Lstream_read (XLSTREAM (instream),
4519 tempbuf, sizeof (tempbuf));
4520 if (!size_in_bytes)
4521 break;
4522 Lstream_write (XLSTREAM (outstream), tempbuf, size_in_bytes);
4523 }
4524 Lstream_close (XLSTREAM (instream));
4525 Lstream_close (XLSTREAM (outstream));
4526 UNGCPRO;
4527 }
4528
4529 *len_out = Dynarr_length (conversion_out_dynarr);
4530 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */
4531 return Dynarr_atp (conversion_out_dynarr, 0);
4532 }
4533
4534 extern CONST Bufbyte *
4535 convert_from_external_format (CONST Extbyte *ptr,
4536 Extcount len,
4537 Bytecount *len_out,
4538 enum external_data_format fmt)
4539 {
4540 Lisp_Object coding_system = FMT_CODING_SYSTEM (fmt);
4541
4542 if (!conversion_in_dynarr)
4543 conversion_in_dynarr = Dynarr_new (Bufbyte);
4544 else
4545 Dynarr_reset (conversion_in_dynarr);
4546
4547 if (NILP (coding_system))
4548 {
4549 CONST Extbyte *end = ptr + len;
4550 for (; ptr < end; ptr++)
4551 {
4552 Extbyte c = *ptr;
4553 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr);
4554 }
4555 }
4556 else
4557 {
4558 Lisp_Object instream =
4559 make_fixed_buffer_input_stream ((unsigned char *) ptr, len);
4560 Lisp_Object outstream = make_dynarr_output_stream
4561 ((unsigned_char_dynarr *) conversion_in_dynarr);
4562 struct gcpro gcpro1, gcpro2;
4563 char tempbuf[1024]; /* some random amount */
4564
4565 outstream =
4566 make_decoding_output_stream (XLSTREAM (outstream), coding_system);
4567 GCPRO2 (instream, outstream); /* Necessary?? */
4568 while (1)
4569 {
4570 int size_in_bytes = Lstream_read (XLSTREAM (instream),
4571 tempbuf, sizeof (tempbuf));
4572 if (!size_in_bytes)
4573 break;
4574 Lstream_write (XLSTREAM (outstream), tempbuf, size_in_bytes);
4575 }
4576 Lstream_close (XLSTREAM (instream));
4577 Lstream_close (XLSTREAM (outstream));
4578 UNGCPRO;
4579 }
4580
4581 *len_out = Dynarr_length (conversion_in_dynarr);
4582 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */
4583 return Dynarr_atp (conversion_in_dynarr, 0);
4584 }
4585
4586
4587 /************************************************************************/
4588 /* Initialization */
4589 /************************************************************************/
4590
4591 void
4592 syms_of_mule_coding (void)
4593 {
4594 defsymbol (&Qfile_coding_system, "file-coding-system");
4595 deferror (&Qcoding_system_error, "coding-system-error",
4596 "Coding-system error", Qio_error);
4597
4598 DEFSUBR (Fcoding_system_p);
4599 DEFSUBR (Ffind_coding_system);
4600 DEFSUBR (Fget_coding_system);
4601 DEFSUBR (Fcoding_system_list);
4602 DEFSUBR (Fcoding_system_name);
4603 DEFSUBR (Fmake_coding_system);
4604 DEFSUBR (Fcopy_coding_system);
4605 DEFSUBR (Fsubsidiary_coding_system);
4606
4607 DEFSUBR (Fcoding_system_type);
4608 DEFSUBR (Fcoding_system_doc_string);
4609 DEFSUBR (Fcoding_system_property);
4610
4611 DEFSUBR (Fcoding_category_list);
4612 DEFSUBR (Fset_coding_priority_list);
4613 DEFSUBR (Fcoding_priority_list);
4614 DEFSUBR (Fset_coding_category_system);
4615 DEFSUBR (Fcoding_category_system);
4616
4617 DEFSUBR (Fdetect_coding_region);
4618 DEFSUBR (Fdecode_coding_region);
4619 DEFSUBR (Fencode_coding_region);
4620 DEFSUBR (Fdecode_shift_jis_char);
4621 DEFSUBR (Fencode_shift_jis_char);
4622 DEFSUBR (Fdecode_big5_char);
4623 DEFSUBR (Fencode_big5_char);
4624
4625 defsymbol (&Qcoding_system_p, "coding-system-p");
4626
4627 defsymbol (&Qbig5, "big5");
4628 defsymbol (&Qshift_jis, "shift-jis");
4629 defsymbol (&Qno_conversion, "no-conversion");
4630 defsymbol (&Qccl, "ccl");
4631 defsymbol (&Qiso2022, "iso2022");
4632
4633 defsymbol (&Qmnemonic, "mnemonic");
4634 defsymbol (&Qeol_type, "eol-type");
4635 defsymbol (&Qpost_read_conversion, "post-read-conversion");
4636 defsymbol (&Qpre_write_conversion, "pre-write-conversion");
4637
4638 defsymbol (&Qcr, "cr");
4639 defsymbol (&Qlf, "lf");
4640 defsymbol (&Qcrlf, "crlf");
4641 defsymbol (&Qeol_cr, "eol-cr");
4642 defsymbol (&Qeol_lf, "eol-lf");
4643 defsymbol (&Qeol_crlf, "eol-crlf");
4644
4645 defsymbol (&Qcharset_g0, "charset-g0");
4646 defsymbol (&Qcharset_g1, "charset-g1");
4647 defsymbol (&Qcharset_g2, "charset-g2");
4648 defsymbol (&Qcharset_g3, "charset-g3");
4649 defsymbol (&Qforce_g0_on_output, "force-g0-on-output");
4650 defsymbol (&Qforce_g1_on_output, "force-g1-on-output");
4651 defsymbol (&Qforce_g2_on_output, "force-g2-on-output");
4652 defsymbol (&Qforce_g3_on_output, "force-g3-on-output");
4653 defsymbol (&Qshort, "short");
4654 defsymbol (&Qno_ascii_eol, "no-ascii-eol");
4655 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl");
4656 defsymbol (&Qseven, "seven");
4657 defsymbol (&Qlock_shift, "lock-shift");
4658 defsymbol (&Qno_iso6429, "no-iso6429");
4659 defsymbol (&Qescape_quoted, "escape-quoted");
4660 defsymbol (&Qinput_charset_conversion, "input-charset-conversion");
4661 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion");
4662
4663 defsymbol (&Qencode, "encode");
4664 defsymbol (&Qdecode, "decode");
4665
4666 defsymbol (&Qctext, "ctext");
4667
4668 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS],
4669 "shift-jis");
4670 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7],
4671 "iso-7");
4672 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE],
4673 "iso-8-designate");
4674 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1],
4675 "iso-8-1");
4676 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2],
4677 "iso-8-2");
4678 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT],
4679 "iso-lock-shift");
4680 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5],
4681 "big5");
4682 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION],
4683 "no-conversion");
4684 }
4685
4686 void
4687 lstream_type_create_mule_coding (void)
4688 {
4689 LSTREAM_HAS_METHOD (decoding, reader);
4690 LSTREAM_HAS_METHOD (decoding, writer);
4691 LSTREAM_HAS_METHOD (decoding, rewinder);
4692 LSTREAM_HAS_METHOD (decoding, seekable_p);
4693 LSTREAM_HAS_METHOD (decoding, flusher);
4694 LSTREAM_HAS_METHOD (decoding, closer);
4695 LSTREAM_HAS_METHOD (decoding, marker);
4696
4697 LSTREAM_HAS_METHOD (encoding, reader);
4698 LSTREAM_HAS_METHOD (encoding, writer);
4699 LSTREAM_HAS_METHOD (encoding, rewinder);
4700 LSTREAM_HAS_METHOD (encoding, seekable_p);
4701 LSTREAM_HAS_METHOD (encoding, flusher);
4702 LSTREAM_HAS_METHOD (encoding, closer);
4703 LSTREAM_HAS_METHOD (encoding, marker);
4704 }
4705
4706 void
4707 vars_of_mule_coding (void)
4708 {
4709 int i;
4710
4711 /* Initialize to something reasonable ... */
4712 for (i = 0; i <= CODING_CATEGORY_LAST; i++)
4713 {
4714 coding_category_system[i] = Qnil;
4715 coding_category_by_priority[i] = i;
4716 }
4717
4718 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /*
4719 Coding system used for TTY keyboard input.
4720 Not used under a windowing system.
4721 */ );
4722 Vkeyboard_coding_system = Qnil;
4723
4724 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /*
4725 Coding system used for TTY display output.
4726 Not used under a windowing system.
4727 */ );
4728 Vterminal_coding_system = Qnil;
4729
4730 DEFVAR_LISP ("process-input-coding-system", &Vprocess_input_coding_system /*
4731 Default coding system used by C process routines for inputting data.
4732 This can be changed for a particular process using
4733 `set-process-input-coding-system'.
4734 */ );
4735 Vprocess_input_coding_system = Qnil;
4736
4737 DEFVAR_LISP ("process-output-coding-system",
4738 &Vprocess_output_coding_system /*
4739 Default coding system used by C process routines for outputting data.
4740 This can be changed for a particular process using
4741 `set-process-output-coding-system'.
4742 */ );
4743 Vprocess_output_coding_system = Qnil;
4744
4745 DEFVAR_LISP ("pathname-coding-system", &Vpathname_coding_system /*
4746 Coding system used to convert pathnames when accessing files.
4747 */ );
4748 Vpathname_coding_system = Qnil;
4749 }
4750
4751 void
4752 complex_vars_of_mule_coding (void)
4753 {
4754 staticpro (&Vcoding_system_hashtable);
4755 Vcoding_system_hashtable = make_lisp_hashtable (50, HASHTABLE_NONWEAK,
4756 HASHTABLE_EQ);
4757
4758 the_codesys_prop_dynarr = Dynarr_new (struct codesys_prop);
4759
4760 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \
4761 { \
4762 struct codesys_prop csp; \
4763 csp.sym = (Sym); \
4764 csp.prop_type = (Prop_Type); \
4765 Dynarr_add (the_codesys_prop_dynarr, csp); \
4766 } while (0)
4767
4768 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic);
4769 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type);
4770 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr);
4771 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf);
4772 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf);
4773 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion);
4774 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion);
4775
4776 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0);
4777 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1);
4778 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2);
4779 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3);
4780 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output);
4781 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output);
4782 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output);
4783 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output);
4784 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort);
4785 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol);
4786 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl);
4787 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven);
4788 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift);
4789 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429);
4790 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted);
4791 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion);
4792 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion);
4793
4794 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode);
4795 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode);
4796
4797 /* Need to create this here or we're really screwed. */
4798 Fmake_coding_system (Qno_conversion, Qno_conversion, build_string ("No conversion"),
4799 list2 (Qmnemonic, build_string ("Noconv")));
4800
4801 Fcopy_coding_system (Fcoding_system_property (Qno_conversion, Qeol_lf),
4802 Qbinary);
4803
4804 /* Need this for bootstrapping */
4805 coding_category_system[CODING_CATEGORY_NO_CONVERSION] =
4806 Fget_coding_system (Qno_conversion);
4807 }