Mercurial > hg > xemacs-beta
comparison src/file-coding.c @ 428:3ecd8885ac67 r21-2-22
Import from CVS: tag r21-2-22
author | cvs |
---|---|
date | Mon, 13 Aug 2007 11:28:15 +0200 |
parents | |
children | 3a7e78e1142d |
comparison
equal
deleted
inserted
replaced
427:0a0253eac470 | 428:3ecd8885ac67 |
---|---|
1 /* Code conversion functions. | |
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc. | |
3 Copyright (C) 1995 Sun Microsystems, Inc. | |
4 | |
5 This file is part of XEmacs. | |
6 | |
7 XEmacs is free software; you can redistribute it and/or modify it | |
8 under the terms of the GNU General Public License as published by the | |
9 Free Software Foundation; either version 2, or (at your option) any | |
10 later version. | |
11 | |
12 XEmacs is distributed in the hope that it will be useful, but WITHOUT | |
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
15 for more details. | |
16 | |
17 You should have received a copy of the GNU General Public License | |
18 along with XEmacs; see the file COPYING. If not, write to | |
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
20 Boston, MA 02111-1307, USA. */ | |
21 | |
22 /* Synched up with: Mule 2.3. Not in FSF. */ | |
23 | |
24 /* Rewritten by Ben Wing <ben@xemacs.org>. */ | |
25 | |
26 #include <config.h> | |
27 #include "lisp.h" | |
28 | |
29 #include "buffer.h" | |
30 #include "elhash.h" | |
31 #include "insdel.h" | |
32 #include "lstream.h" | |
33 #ifdef MULE | |
34 #include "mule-ccl.h" | |
35 #include "chartab.h" | |
36 #endif | |
37 #include "file-coding.h" | |
38 | |
39 Lisp_Object Qcoding_system_error; | |
40 | |
41 Lisp_Object Vkeyboard_coding_system; | |
42 Lisp_Object Vterminal_coding_system; | |
43 Lisp_Object Vcoding_system_for_read; | |
44 Lisp_Object Vcoding_system_for_write; | |
45 Lisp_Object Vfile_name_coding_system; | |
46 | |
47 /* Table of symbols identifying each coding category. */ | |
48 Lisp_Object coding_category_symbol[CODING_CATEGORY_LAST + 1]; | |
49 | |
50 | |
51 | |
52 struct file_coding_dump { | |
53 /* Coding system currently associated with each coding category. */ | |
54 Lisp_Object coding_category_system[CODING_CATEGORY_LAST + 1]; | |
55 | |
56 /* Table of all coding categories in decreasing order of priority. | |
57 This describes a permutation of the possible coding categories. */ | |
58 int coding_category_by_priority[CODING_CATEGORY_LAST + 1]; | |
59 | |
60 Lisp_Object ucs_to_mule_table[65536]; | |
61 } *fcd; | |
62 | |
63 static const struct lrecord_description fcd_description_1[] = { | |
64 { XD_LISP_OBJECT, offsetof(struct file_coding_dump, coding_category_system), CODING_CATEGORY_LAST + 1 }, | |
65 { XD_LISP_OBJECT, offsetof(struct file_coding_dump, ucs_to_mule_table), 65536 }, | |
66 { XD_END } | |
67 }; | |
68 | |
69 static const struct struct_description fcd_description = { | |
70 sizeof(struct file_coding_dump), | |
71 fcd_description_1 | |
72 }; | |
73 | |
74 Lisp_Object mule_to_ucs_table; | |
75 | |
76 Lisp_Object Qcoding_systemp; | |
77 | |
78 Lisp_Object Qraw_text, Qno_conversion, Qccl, Qiso2022; | |
79 /* Qinternal in general.c */ | |
80 | |
81 Lisp_Object Qmnemonic, Qeol_type; | |
82 Lisp_Object Qcr, Qcrlf, Qlf; | |
83 Lisp_Object Qeol_cr, Qeol_crlf, Qeol_lf; | |
84 Lisp_Object Qpost_read_conversion; | |
85 Lisp_Object Qpre_write_conversion; | |
86 | |
87 #ifdef MULE | |
88 Lisp_Object Qucs4, Qutf8; | |
89 Lisp_Object Qbig5, Qshift_jis; | |
90 Lisp_Object Qcharset_g0, Qcharset_g1, Qcharset_g2, Qcharset_g3; | |
91 Lisp_Object Qforce_g0_on_output, Qforce_g1_on_output; | |
92 Lisp_Object Qforce_g2_on_output, Qforce_g3_on_output; | |
93 Lisp_Object Qno_iso6429; | |
94 Lisp_Object Qinput_charset_conversion, Qoutput_charset_conversion; | |
95 Lisp_Object Qctext, Qescape_quoted; | |
96 Lisp_Object Qshort, Qno_ascii_eol, Qno_ascii_cntl, Qseven, Qlock_shift; | |
97 #endif | |
98 Lisp_Object Qencode, Qdecode; | |
99 | |
100 Lisp_Object Vcoding_system_hash_table; | |
101 | |
102 int enable_multibyte_characters; | |
103 | |
104 #ifdef MULE | |
105 /* Additional information used by the ISO2022 decoder and detector. */ | |
106 struct iso2022_decoder | |
107 { | |
108 /* CHARSET holds the character sets currently assigned to the G0 | |
109 through G3 variables. It is initialized from the array | |
110 INITIAL_CHARSET in CODESYS. */ | |
111 Lisp_Object charset[4]; | |
112 | |
113 /* Which registers are currently invoked into the left (GL) and | |
114 right (GR) halves of the 8-bit encoding space? */ | |
115 int register_left, register_right; | |
116 | |
117 /* ISO_ESC holds a value indicating part of an escape sequence | |
118 that has already been seen. */ | |
119 enum iso_esc_flag esc; | |
120 | |
121 /* This records the bytes we've seen so far in an escape sequence, | |
122 in case the sequence is invalid (we spit out the bytes unchanged). */ | |
123 unsigned char esc_bytes[8]; | |
124 | |
125 /* Index for next byte to store in ISO escape sequence. */ | |
126 int esc_bytes_index; | |
127 | |
128 #ifdef ENABLE_COMPOSITE_CHARS | |
129 /* Stuff seen so far when composing a string. */ | |
130 unsigned_char_dynarr *composite_chars; | |
131 #endif | |
132 | |
133 /* If we saw an invalid designation sequence for a particular | |
134 register, we flag it here and switch to ASCII. The next time we | |
135 see a valid designation for this register, we turn off the flag | |
136 and do the designation normally, but pretend the sequence was | |
137 invalid. The effect of all this is that (most of the time) the | |
138 escape sequences for both the switch to the unknown charset, and | |
139 the switch back to the known charset, get inserted literally into | |
140 the buffer and saved out as such. The hope is that we can | |
141 preserve the escape sequences so that the resulting written out | |
142 file makes sense. If we don't do any of this, the designation | |
143 to the invalid charset will be preserved but that switch back | |
144 to the known charset will probably get eaten because it was | |
145 the same charset that was already present in the register. */ | |
146 unsigned char invalid_designated[4]; | |
147 | |
148 /* We try to do similar things as above for direction-switching | |
149 sequences. If we encountered a direction switch while an | |
150 invalid designation was present, or an invalid designation | |
151 just after a direction switch (i.e. no valid designation | |
152 encountered yet), we insert the direction-switch escape | |
153 sequence literally into the output stream, and later on | |
154 insert the corresponding direction-restoring escape sequence | |
155 literally also. */ | |
156 unsigned int switched_dir_and_no_valid_charset_yet :1; | |
157 unsigned int invalid_switch_dir :1; | |
158 | |
159 /* Tells the decoder to output the escape sequence literally | |
160 even though it was valid. Used in the games we play to | |
161 avoid lossage when we encounter invalid designations. */ | |
162 unsigned int output_literally :1; | |
163 /* We encountered a direction switch followed by an invalid | |
164 designation. We didn't output the direction switch | |
165 literally because we didn't know about the invalid designation; | |
166 but we have to do so now. */ | |
167 unsigned int output_direction_sequence :1; | |
168 }; | |
169 #endif /* MULE */ | |
170 EXFUN (Fcopy_coding_system, 2); | |
171 #ifdef MULE | |
172 struct detection_state; | |
173 static int detect_coding_sjis (struct detection_state *st, | |
174 CONST unsigned char *src, | |
175 unsigned int n); | |
176 static void decode_coding_sjis (Lstream *decoding, | |
177 CONST unsigned char *src, | |
178 unsigned_char_dynarr *dst, | |
179 unsigned int n); | |
180 static void encode_coding_sjis (Lstream *encoding, | |
181 CONST unsigned char *src, | |
182 unsigned_char_dynarr *dst, | |
183 unsigned int n); | |
184 static int detect_coding_big5 (struct detection_state *st, | |
185 CONST unsigned char *src, | |
186 unsigned int n); | |
187 static void decode_coding_big5 (Lstream *decoding, | |
188 CONST unsigned char *src, | |
189 unsigned_char_dynarr *dst, unsigned int n); | |
190 static void encode_coding_big5 (Lstream *encoding, | |
191 CONST unsigned char *src, | |
192 unsigned_char_dynarr *dst, unsigned int n); | |
193 static int detect_coding_ucs4 (struct detection_state *st, | |
194 CONST unsigned char *src, | |
195 unsigned int n); | |
196 static void decode_coding_ucs4 (Lstream *decoding, | |
197 CONST unsigned char *src, | |
198 unsigned_char_dynarr *dst, unsigned int n); | |
199 static void encode_coding_ucs4 (Lstream *encoding, | |
200 CONST unsigned char *src, | |
201 unsigned_char_dynarr *dst, unsigned int n); | |
202 static int detect_coding_utf8 (struct detection_state *st, | |
203 CONST unsigned char *src, | |
204 unsigned int n); | |
205 static void decode_coding_utf8 (Lstream *decoding, | |
206 CONST unsigned char *src, | |
207 unsigned_char_dynarr *dst, unsigned int n); | |
208 static void encode_coding_utf8 (Lstream *encoding, | |
209 CONST unsigned char *src, | |
210 unsigned_char_dynarr *dst, unsigned int n); | |
211 static int postprocess_iso2022_mask (int mask); | |
212 static void reset_iso2022 (Lisp_Object coding_system, | |
213 struct iso2022_decoder *iso); | |
214 static int detect_coding_iso2022 (struct detection_state *st, | |
215 CONST unsigned char *src, | |
216 unsigned int n); | |
217 static void decode_coding_iso2022 (Lstream *decoding, | |
218 CONST unsigned char *src, | |
219 unsigned_char_dynarr *dst, unsigned int n); | |
220 static void encode_coding_iso2022 (Lstream *encoding, | |
221 CONST unsigned char *src, | |
222 unsigned_char_dynarr *dst, unsigned int n); | |
223 #endif /* MULE */ | |
224 static void decode_coding_no_conversion (Lstream *decoding, | |
225 CONST unsigned char *src, | |
226 unsigned_char_dynarr *dst, | |
227 unsigned int n); | |
228 static void encode_coding_no_conversion (Lstream *encoding, | |
229 CONST unsigned char *src, | |
230 unsigned_char_dynarr *dst, | |
231 unsigned int n); | |
232 static void mule_decode (Lstream *decoding, CONST unsigned char *src, | |
233 unsigned_char_dynarr *dst, unsigned int n); | |
234 static void mule_encode (Lstream *encoding, CONST unsigned char *src, | |
235 unsigned_char_dynarr *dst, unsigned int n); | |
236 | |
237 typedef struct codesys_prop codesys_prop; | |
238 struct codesys_prop | |
239 { | |
240 Lisp_Object sym; | |
241 int prop_type; | |
242 }; | |
243 | |
244 typedef struct | |
245 { | |
246 Dynarr_declare (codesys_prop); | |
247 } codesys_prop_dynarr; | |
248 | |
249 static const struct lrecord_description codesys_prop_description_1[] = { | |
250 { XD_LISP_OBJECT, offsetof(codesys_prop, sym), 1 }, | |
251 { XD_END } | |
252 }; | |
253 | |
254 static const struct struct_description codesys_prop_description = { | |
255 sizeof(codesys_prop), | |
256 codesys_prop_description_1 | |
257 }; | |
258 | |
259 static const struct lrecord_description codesys_prop_dynarr_description_1[] = { | |
260 XD_DYNARR_DESC(codesys_prop_dynarr, &codesys_prop_description), | |
261 { XD_END } | |
262 }; | |
263 | |
264 static const struct struct_description codesys_prop_dynarr_description = { | |
265 sizeof(codesys_prop_dynarr), | |
266 codesys_prop_dynarr_description_1 | |
267 }; | |
268 | |
269 codesys_prop_dynarr *the_codesys_prop_dynarr; | |
270 | |
271 enum codesys_prop_enum | |
272 { | |
273 CODESYS_PROP_ALL_OK, | |
274 CODESYS_PROP_ISO2022, | |
275 CODESYS_PROP_CCL | |
276 }; | |
277 | |
278 | |
279 /************************************************************************/ | |
280 /* Coding system functions */ | |
281 /************************************************************************/ | |
282 | |
283 static Lisp_Object mark_coding_system (Lisp_Object); | |
284 static void print_coding_system (Lisp_Object, Lisp_Object, int); | |
285 static void finalize_coding_system (void *header, int for_disksave); | |
286 | |
287 #ifdef MULE | |
288 static const struct lrecord_description ccs_description_1[] = { | |
289 { XD_LISP_OBJECT, offsetof(charset_conversion_spec, from_charset), 2 }, | |
290 { XD_END } | |
291 }; | |
292 | |
293 static const struct struct_description ccs_description = { | |
294 sizeof(charset_conversion_spec), | |
295 ccs_description_1 | |
296 }; | |
297 | |
298 static const struct lrecord_description ccsd_description_1[] = { | |
299 XD_DYNARR_DESC(charset_conversion_spec_dynarr, &ccs_description), | |
300 { XD_END } | |
301 }; | |
302 | |
303 static const struct struct_description ccsd_description = { | |
304 sizeof(charset_conversion_spec_dynarr), | |
305 ccsd_description_1 | |
306 }; | |
307 #endif | |
308 | |
309 static const struct lrecord_description coding_system_description[] = { | |
310 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, name), 2 }, | |
311 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, mnemonic), 3 }, | |
312 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, eol_lf), 3 }, | |
313 #ifdef MULE | |
314 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, iso2022.initial_charset), 4 }, | |
315 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.input_conv), 1, &ccsd_description }, | |
316 { XD_STRUCT_PTR, offsetof(struct Lisp_Coding_System, iso2022.output_conv), 1, &ccsd_description }, | |
317 { XD_LISP_OBJECT, offsetof(struct Lisp_Coding_System, ccl.decode), 2 }, | |
318 #endif | |
319 { XD_END } | |
320 }; | |
321 | |
322 DEFINE_LRECORD_IMPLEMENTATION ("coding-system", coding_system, | |
323 mark_coding_system, print_coding_system, | |
324 finalize_coding_system, | |
325 0, 0, coding_system_description, | |
326 struct Lisp_Coding_System); | |
327 | |
328 static Lisp_Object | |
329 mark_coding_system (Lisp_Object obj) | |
330 { | |
331 Lisp_Coding_System *codesys = XCODING_SYSTEM (obj); | |
332 | |
333 mark_object (CODING_SYSTEM_NAME (codesys)); | |
334 mark_object (CODING_SYSTEM_DOC_STRING (codesys)); | |
335 mark_object (CODING_SYSTEM_MNEMONIC (codesys)); | |
336 mark_object (CODING_SYSTEM_EOL_LF (codesys)); | |
337 mark_object (CODING_SYSTEM_EOL_CRLF (codesys)); | |
338 mark_object (CODING_SYSTEM_EOL_CR (codesys)); | |
339 | |
340 switch (CODING_SYSTEM_TYPE (codesys)) | |
341 { | |
342 #ifdef MULE | |
343 int i; | |
344 case CODESYS_ISO2022: | |
345 for (i = 0; i < 4; i++) | |
346 mark_object (CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i)); | |
347 if (codesys->iso2022.input_conv) | |
348 { | |
349 for (i = 0; i < Dynarr_length (codesys->iso2022.input_conv); i++) | |
350 { | |
351 struct charset_conversion_spec *ccs = | |
352 Dynarr_atp (codesys->iso2022.input_conv, i); | |
353 mark_object (ccs->from_charset); | |
354 mark_object (ccs->to_charset); | |
355 } | |
356 } | |
357 if (codesys->iso2022.output_conv) | |
358 { | |
359 for (i = 0; i < Dynarr_length (codesys->iso2022.output_conv); i++) | |
360 { | |
361 struct charset_conversion_spec *ccs = | |
362 Dynarr_atp (codesys->iso2022.output_conv, i); | |
363 mark_object (ccs->from_charset); | |
364 mark_object (ccs->to_charset); | |
365 } | |
366 } | |
367 break; | |
368 | |
369 case CODESYS_CCL: | |
370 mark_object (CODING_SYSTEM_CCL_DECODE (codesys)); | |
371 mark_object (CODING_SYSTEM_CCL_ENCODE (codesys)); | |
372 break; | |
373 #endif /* MULE */ | |
374 default: | |
375 break; | |
376 } | |
377 | |
378 mark_object (CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys)); | |
379 return CODING_SYSTEM_POST_READ_CONVERSION (codesys); | |
380 } | |
381 | |
382 static void | |
383 print_coding_system (Lisp_Object obj, Lisp_Object printcharfun, | |
384 int escapeflag) | |
385 { | |
386 Lisp_Coding_System *c = XCODING_SYSTEM (obj); | |
387 if (print_readably) | |
388 error ("printing unreadable object #<coding_system 0x%x>", | |
389 c->header.uid); | |
390 | |
391 write_c_string ("#<coding_system ", printcharfun); | |
392 print_internal (c->name, printcharfun, 1); | |
393 write_c_string (">", printcharfun); | |
394 } | |
395 | |
396 static void | |
397 finalize_coding_system (void *header, int for_disksave) | |
398 { | |
399 Lisp_Coding_System *c = (Lisp_Coding_System *) header; | |
400 /* Since coding systems never go away, this function is not | |
401 necessary. But it would be necessary if we changed things | |
402 so that coding systems could go away. */ | |
403 if (!for_disksave) /* see comment in lstream.c */ | |
404 { | |
405 switch (CODING_SYSTEM_TYPE (c)) | |
406 { | |
407 #ifdef MULE | |
408 case CODESYS_ISO2022: | |
409 if (c->iso2022.input_conv) | |
410 { | |
411 Dynarr_free (c->iso2022.input_conv); | |
412 c->iso2022.input_conv = 0; | |
413 } | |
414 if (c->iso2022.output_conv) | |
415 { | |
416 Dynarr_free (c->iso2022.output_conv); | |
417 c->iso2022.output_conv = 0; | |
418 } | |
419 break; | |
420 #endif /* MULE */ | |
421 default: | |
422 break; | |
423 } | |
424 } | |
425 } | |
426 | |
427 static enum eol_type | |
428 symbol_to_eol_type (Lisp_Object symbol) | |
429 { | |
430 CHECK_SYMBOL (symbol); | |
431 if (NILP (symbol)) return EOL_AUTODETECT; | |
432 if (EQ (symbol, Qlf)) return EOL_LF; | |
433 if (EQ (symbol, Qcrlf)) return EOL_CRLF; | |
434 if (EQ (symbol, Qcr)) return EOL_CR; | |
435 | |
436 signal_simple_error ("Unrecognized eol type", symbol); | |
437 return EOL_AUTODETECT; /* not reached */ | |
438 } | |
439 | |
440 static Lisp_Object | |
441 eol_type_to_symbol (enum eol_type type) | |
442 { | |
443 switch (type) | |
444 { | |
445 default: abort (); | |
446 case EOL_LF: return Qlf; | |
447 case EOL_CRLF: return Qcrlf; | |
448 case EOL_CR: return Qcr; | |
449 case EOL_AUTODETECT: return Qnil; | |
450 } | |
451 } | |
452 | |
453 static void | |
454 setup_eol_coding_systems (Lisp_Coding_System *codesys) | |
455 { | |
456 Lisp_Object codesys_obj; | |
457 int len = string_length (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name); | |
458 char *codesys_name = (char *) alloca (len + 7); | |
459 int mlen = -1; | |
460 char *codesys_mnemonic=0; | |
461 | |
462 Lisp_Object codesys_name_sym, sub_codesys_obj; | |
463 | |
464 /* kludge */ | |
465 | |
466 XSETCODING_SYSTEM (codesys_obj, codesys); | |
467 | |
468 memcpy (codesys_name, | |
469 string_data (XSYMBOL (CODING_SYSTEM_NAME (codesys))->name), len); | |
470 | |
471 if (STRINGP (CODING_SYSTEM_MNEMONIC (codesys))) | |
472 { | |
473 mlen = XSTRING_LENGTH (CODING_SYSTEM_MNEMONIC (codesys)); | |
474 codesys_mnemonic = (char *) alloca (mlen + 7); | |
475 memcpy (codesys_mnemonic, | |
476 XSTRING_DATA (CODING_SYSTEM_MNEMONIC (codesys)), mlen); | |
477 } | |
478 | |
479 #define DEFINE_SUB_CODESYS(op_sys, op_sys_abbr, Type) do { \ | |
480 strcpy (codesys_name + len, "-" op_sys); \ | |
481 if (mlen != -1) \ | |
482 strcpy (codesys_mnemonic + mlen, op_sys_abbr); \ | |
483 codesys_name_sym = intern (codesys_name); \ | |
484 sub_codesys_obj = Fcopy_coding_system (codesys_obj, codesys_name_sym); \ | |
485 XCODING_SYSTEM_EOL_TYPE (sub_codesys_obj) = Type; \ | |
486 if (mlen != -1) \ | |
487 XCODING_SYSTEM_MNEMONIC(sub_codesys_obj) = \ | |
488 build_string (codesys_mnemonic); \ | |
489 CODING_SYSTEM_##Type (codesys) = sub_codesys_obj; \ | |
490 } while (0) | |
491 | |
492 DEFINE_SUB_CODESYS("unix", "", EOL_LF); | |
493 DEFINE_SUB_CODESYS("dos", ":T", EOL_CRLF); | |
494 DEFINE_SUB_CODESYS("mac", ":t", EOL_CR); | |
495 } | |
496 | |
497 DEFUN ("coding-system-p", Fcoding_system_p, 1, 1, 0, /* | |
498 Return t if OBJECT is a coding system. | |
499 A coding system is an object that defines how text containing multiple | |
500 character sets is encoded into a stream of (typically 8-bit) bytes. | |
501 The coding system is used to decode the stream into a series of | |
502 characters (which may be from multiple charsets) when the text is read | |
503 from a file or process, and is used to encode the text back into the | |
504 same format when it is written out to a file or process. | |
505 | |
506 For example, many ISO2022-compliant coding systems (such as Compound | |
507 Text, which is used for inter-client data under the X Window System) | |
508 use escape sequences to switch between different charsets -- Japanese | |
509 Kanji, for example, is invoked with "ESC $ ( B"; ASCII is invoked | |
510 with "ESC ( B"; and Cyrillic is invoked with "ESC - L". See | |
511 `make-coding-system' for more information. | |
512 | |
513 Coding systems are normally identified using a symbol, and the | |
514 symbol is accepted in place of the actual coding system object whenever | |
515 a coding system is called for. (This is similar to how faces work.) | |
516 */ | |
517 (object)) | |
518 { | |
519 return CODING_SYSTEMP (object) ? Qt : Qnil; | |
520 } | |
521 | |
522 DEFUN ("find-coding-system", Ffind_coding_system, 1, 1, 0, /* | |
523 Retrieve the coding system of the given name. | |
524 | |
525 If CODING-SYSTEM-OR-NAME is a coding-system object, it is simply | |
526 returned. Otherwise, CODING-SYSTEM-OR-NAME should be a symbol. | |
527 If there is no such coding system, nil is returned. Otherwise the | |
528 associated coding system object is returned. | |
529 */ | |
530 (coding_system_or_name)) | |
531 { | |
532 if (CODING_SYSTEMP (coding_system_or_name)) | |
533 return coding_system_or_name; | |
534 | |
535 if (NILP (coding_system_or_name)) | |
536 coding_system_or_name = Qbinary; | |
537 else | |
538 CHECK_SYMBOL (coding_system_or_name); | |
539 | |
540 return Fgethash (coding_system_or_name, Vcoding_system_hash_table, Qnil); | |
541 } | |
542 | |
543 DEFUN ("get-coding-system", Fget_coding_system, 1, 1, 0, /* | |
544 Retrieve the coding system of the given name. | |
545 Same as `find-coding-system' except that if there is no such | |
546 coding system, an error is signaled instead of returning nil. | |
547 */ | |
548 (name)) | |
549 { | |
550 Lisp_Object coding_system = Ffind_coding_system (name); | |
551 | |
552 if (NILP (coding_system)) | |
553 signal_simple_error ("No such coding system", name); | |
554 return coding_system; | |
555 } | |
556 | |
557 /* We store the coding systems in hash tables with the names as the key and the | |
558 actual coding system object as the value. Occasionally we need to use them | |
559 in a list format. These routines provide us with that. */ | |
560 struct coding_system_list_closure | |
561 { | |
562 Lisp_Object *coding_system_list; | |
563 }; | |
564 | |
565 static int | |
566 add_coding_system_to_list_mapper (Lisp_Object key, Lisp_Object value, | |
567 void *coding_system_list_closure) | |
568 { | |
569 /* This function can GC */ | |
570 struct coding_system_list_closure *cscl = | |
571 (struct coding_system_list_closure *) coding_system_list_closure; | |
572 Lisp_Object *coding_system_list = cscl->coding_system_list; | |
573 | |
574 *coding_system_list = Fcons (XCODING_SYSTEM (value)->name, | |
575 *coding_system_list); | |
576 return 0; | |
577 } | |
578 | |
579 DEFUN ("coding-system-list", Fcoding_system_list, 0, 0, 0, /* | |
580 Return a list of the names of all defined coding systems. | |
581 */ | |
582 ()) | |
583 { | |
584 Lisp_Object coding_system_list = Qnil; | |
585 struct gcpro gcpro1; | |
586 struct coding_system_list_closure coding_system_list_closure; | |
587 | |
588 GCPRO1 (coding_system_list); | |
589 coding_system_list_closure.coding_system_list = &coding_system_list; | |
590 elisp_maphash (add_coding_system_to_list_mapper, Vcoding_system_hash_table, | |
591 &coding_system_list_closure); | |
592 UNGCPRO; | |
593 | |
594 return coding_system_list; | |
595 } | |
596 | |
597 DEFUN ("coding-system-name", Fcoding_system_name, 1, 1, 0, /* | |
598 Return the name of the given coding system. | |
599 */ | |
600 (coding_system)) | |
601 { | |
602 coding_system = Fget_coding_system (coding_system); | |
603 return XCODING_SYSTEM_NAME (coding_system); | |
604 } | |
605 | |
606 static Lisp_Coding_System * | |
607 allocate_coding_system (enum coding_system_type type, Lisp_Object name) | |
608 { | |
609 Lisp_Coding_System *codesys = | |
610 alloc_lcrecord_type (Lisp_Coding_System, &lrecord_coding_system); | |
611 | |
612 zero_lcrecord (codesys); | |
613 CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = Qnil; | |
614 CODING_SYSTEM_POST_READ_CONVERSION (codesys) = Qnil; | |
615 CODING_SYSTEM_EOL_TYPE (codesys) = EOL_AUTODETECT; | |
616 CODING_SYSTEM_EOL_CRLF (codesys) = Qnil; | |
617 CODING_SYSTEM_EOL_CR (codesys) = Qnil; | |
618 CODING_SYSTEM_EOL_LF (codesys) = Qnil; | |
619 CODING_SYSTEM_TYPE (codesys) = type; | |
620 CODING_SYSTEM_MNEMONIC (codesys) = Qnil; | |
621 #ifdef MULE | |
622 if (type == CODESYS_ISO2022) | |
623 { | |
624 int i; | |
625 for (i = 0; i < 4; i++) | |
626 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i) = Qnil; | |
627 } | |
628 else if (type == CODESYS_CCL) | |
629 { | |
630 CODING_SYSTEM_CCL_DECODE (codesys) = Qnil; | |
631 CODING_SYSTEM_CCL_ENCODE (codesys) = Qnil; | |
632 } | |
633 #endif /* MULE */ | |
634 CODING_SYSTEM_NAME (codesys) = name; | |
635 | |
636 return codesys; | |
637 } | |
638 | |
639 #ifdef MULE | |
640 /* Given a list of charset conversion specs as specified in a Lisp | |
641 program, parse it into STORE_HERE. */ | |
642 | |
643 static void | |
644 parse_charset_conversion_specs (charset_conversion_spec_dynarr *store_here, | |
645 Lisp_Object spec_list) | |
646 { | |
647 Lisp_Object rest; | |
648 | |
649 EXTERNAL_LIST_LOOP (rest, spec_list) | |
650 { | |
651 Lisp_Object car = XCAR (rest); | |
652 Lisp_Object from, to; | |
653 struct charset_conversion_spec spec; | |
654 | |
655 if (!CONSP (car) || !CONSP (XCDR (car)) || !NILP (XCDR (XCDR (car)))) | |
656 signal_simple_error ("Invalid charset conversion spec", car); | |
657 from = Fget_charset (XCAR (car)); | |
658 to = Fget_charset (XCAR (XCDR (car))); | |
659 if (XCHARSET_TYPE (from) != XCHARSET_TYPE (to)) | |
660 signal_simple_error_2 | |
661 ("Attempted conversion between different charset types", | |
662 from, to); | |
663 spec.from_charset = from; | |
664 spec.to_charset = to; | |
665 | |
666 Dynarr_add (store_here, spec); | |
667 } | |
668 } | |
669 | |
670 /* Given a dynarr LOAD_HERE of internally-stored charset conversion | |
671 specs, return the equivalent as the Lisp programmer would see it. | |
672 | |
673 If LOAD_HERE is 0, return Qnil. */ | |
674 | |
675 static Lisp_Object | |
676 unparse_charset_conversion_specs (charset_conversion_spec_dynarr *load_here) | |
677 { | |
678 int i; | |
679 Lisp_Object result; | |
680 | |
681 if (!load_here) | |
682 return Qnil; | |
683 for (i = 0, result = Qnil; i < Dynarr_length (load_here); i++) | |
684 { | |
685 struct charset_conversion_spec *ccs = Dynarr_atp (load_here, i); | |
686 result = Fcons (list2 (ccs->from_charset, ccs->to_charset), result); | |
687 } | |
688 | |
689 return Fnreverse (result); | |
690 } | |
691 | |
692 #endif /* MULE */ | |
693 | |
694 DEFUN ("make-coding-system", Fmake_coding_system, 2, 4, 0, /* | |
695 Register symbol NAME as a coding system. | |
696 | |
697 TYPE describes the conversion method used and should be one of | |
698 | |
699 nil or 'undecided | |
700 Automatic conversion. XEmacs attempts to detect the coding system | |
701 used in the file. | |
702 'no-conversion | |
703 No conversion. Use this for binary files and such. On output, | |
704 graphic characters that are not in ASCII or Latin-1 will be | |
705 replaced by a ?. (For a no-conversion-encoded buffer, these | |
706 characters will only be present if you explicitly insert them.) | |
707 'shift-jis | |
708 Shift-JIS (a Japanese encoding commonly used in PC operating systems). | |
709 'ucs-4 | |
710 ISO 10646 UCS-4 encoding. | |
711 'utf-8 | |
712 ISO 10646 UTF-8 encoding. | |
713 'iso2022 | |
714 Any ISO2022-compliant encoding. Among other things, this includes | |
715 JIS (the Japanese encoding commonly used for e-mail), EUC (the | |
716 standard Unix encoding for Japanese and other languages), and | |
717 Compound Text (the encoding used in X11). You can specify more | |
718 specific information about the conversion with the FLAGS argument. | |
719 'big5 | |
720 Big5 (the encoding commonly used for Taiwanese). | |
721 'ccl | |
722 The conversion is performed using a user-written pseudo-code | |
723 program. CCL (Code Conversion Language) is the name of this | |
724 pseudo-code. | |
725 'internal | |
726 Write out or read in the raw contents of the memory representing | |
727 the buffer's text. This is primarily useful for debugging | |
728 purposes, and is only enabled when XEmacs has been compiled with | |
729 DEBUG_XEMACS defined (via the --debug configure option). | |
730 WARNING: Reading in a file using 'internal conversion can result | |
731 in an internal inconsistency in the memory representing a | |
732 buffer's text, which will produce unpredictable results and may | |
733 cause XEmacs to crash. Under normal circumstances you should | |
734 never use 'internal conversion. | |
735 | |
736 DOC-STRING is a string describing the coding system. | |
737 | |
738 PROPS is a property list, describing the specific nature of the | |
739 character set. Recognized properties are: | |
740 | |
741 'mnemonic | |
742 String to be displayed in the modeline when this coding system is | |
743 active. | |
744 | |
745 'eol-type | |
746 End-of-line conversion to be used. It should be one of | |
747 | |
748 nil | |
749 Automatically detect the end-of-line type (LF, CRLF, | |
750 or CR). Also generate subsidiary coding systems named | |
751 `NAME-unix', `NAME-dos', and `NAME-mac', that are | |
752 identical to this coding system but have an EOL-TYPE | |
753 value of 'lf, 'crlf, and 'cr, respectively. | |
754 'lf | |
755 The end of a line is marked externally using ASCII LF. | |
756 Since this is also the way that XEmacs represents an | |
757 end-of-line internally, specifying this option results | |
758 in no end-of-line conversion. This is the standard | |
759 format for Unix text files. | |
760 'crlf | |
761 The end of a line is marked externally using ASCII | |
762 CRLF. This is the standard format for MS-DOS text | |
763 files. | |
764 'cr | |
765 The end of a line is marked externally using ASCII CR. | |
766 This is the standard format for Macintosh text files. | |
767 t | |
768 Automatically detect the end-of-line type but do not | |
769 generate subsidiary coding systems. (This value is | |
770 converted to nil when stored internally, and | |
771 `coding-system-property' will return nil.) | |
772 | |
773 'post-read-conversion | |
774 Function called after a file has been read in, to perform the | |
775 decoding. Called with two arguments, BEG and END, denoting | |
776 a region of the current buffer to be decoded. | |
777 | |
778 'pre-write-conversion | |
779 Function called before a file is written out, to perform the | |
780 encoding. Called with two arguments, BEG and END, denoting | |
781 a region of the current buffer to be encoded. | |
782 | |
783 | |
784 The following additional properties are recognized if TYPE is 'iso2022: | |
785 | |
786 'charset-g0 | |
787 'charset-g1 | |
788 'charset-g2 | |
789 'charset-g3 | |
790 The character set initially designated to the G0 - G3 registers. | |
791 The value should be one of | |
792 | |
793 -- A charset object (designate that character set) | |
794 -- nil (do not ever use this register) | |
795 -- t (no character set is initially designated to | |
796 the register, but may be later on; this automatically | |
797 sets the corresponding `force-g*-on-output' property) | |
798 | |
799 'force-g0-on-output | |
800 'force-g1-on-output | |
801 'force-g2-on-output | |
802 'force-g2-on-output | |
803 If non-nil, send an explicit designation sequence on output before | |
804 using the specified register. | |
805 | |
806 'short | |
807 If non-nil, use the short forms "ESC $ @", "ESC $ A", and | |
808 "ESC $ B" on output in place of the full designation sequences | |
809 "ESC $ ( @", "ESC $ ( A", and "ESC $ ( B". | |
810 | |
811 'no-ascii-eol | |
812 If non-nil, don't designate ASCII to G0 at each end of line on output. | |
813 Setting this to non-nil also suppresses other state-resetting that | |
814 normally happens at the end of a line. | |
815 | |
816 'no-ascii-cntl | |
817 If non-nil, don't designate ASCII to G0 before control chars on output. | |
818 | |
819 'seven | |
820 If non-nil, use 7-bit environment on output. Otherwise, use 8-bit | |
821 environment. | |
822 | |
823 'lock-shift | |
824 If non-nil, use locking-shift (SO/SI) instead of single-shift | |
825 or designation by escape sequence. | |
826 | |
827 'no-iso6429 | |
828 If non-nil, don't use ISO6429's direction specification. | |
829 | |
830 'escape-quoted | |
831 If non-nil, literal control characters that are the same as | |
832 the beginning of a recognized ISO2022 or ISO6429 escape sequence | |
833 (in particular, ESC (0x1B), SO (0x0E), SI (0x0F), SS2 (0x8E), | |
834 SS3 (0x8F), and CSI (0x9B)) are "quoted" with an escape character | |
835 so that they can be properly distinguished from an escape sequence. | |
836 (Note that doing this results in a non-portable encoding.) This | |
837 encoding flag is used for byte-compiled files. Note that ESC | |
838 is a good choice for a quoting character because there are no | |
839 escape sequences whose second byte is a character from the Control-0 | |
840 or Control-1 character sets; this is explicitly disallowed by the | |
841 ISO2022 standard. | |
842 | |
843 'input-charset-conversion | |
844 A list of conversion specifications, specifying conversion of | |
845 characters in one charset to another when decoding is performed. | |
846 Each specification is a list of two elements: the source charset, | |
847 and the destination charset. | |
848 | |
849 'output-charset-conversion | |
850 A list of conversion specifications, specifying conversion of | |
851 characters in one charset to another when encoding is performed. | |
852 The form of each specification is the same as for | |
853 'input-charset-conversion. | |
854 | |
855 | |
856 The following additional properties are recognized (and required) | |
857 if TYPE is 'ccl: | |
858 | |
859 'decode | |
860 CCL program used for decoding (converting to internal format). | |
861 | |
862 'encode | |
863 CCL program used for encoding (converting to external format). | |
864 */ | |
865 (name, type, doc_string, props)) | |
866 { | |
867 Lisp_Coding_System *codesys; | |
868 Lisp_Object rest, key, value; | |
869 enum coding_system_type ty; | |
870 int need_to_setup_eol_systems = 1; | |
871 | |
872 /* Convert type to constant */ | |
873 if (NILP (type) || EQ (type, Qundecided)) | |
874 { ty = CODESYS_AUTODETECT; } | |
875 #ifdef MULE | |
876 else if (EQ (type, Qshift_jis)) { ty = CODESYS_SHIFT_JIS; } | |
877 else if (EQ (type, Qiso2022)) { ty = CODESYS_ISO2022; } | |
878 else if (EQ (type, Qbig5)) { ty = CODESYS_BIG5; } | |
879 else if (EQ (type, Qucs4)) { ty = CODESYS_UCS4; } | |
880 else if (EQ (type, Qutf8)) { ty = CODESYS_UTF8; } | |
881 else if (EQ (type, Qccl)) { ty = CODESYS_CCL; } | |
882 #endif | |
883 else if (EQ (type, Qno_conversion)) { ty = CODESYS_NO_CONVERSION; } | |
884 #ifdef DEBUG_XEMACS | |
885 else if (EQ (type, Qinternal)) { ty = CODESYS_INTERNAL; } | |
886 #endif | |
887 else | |
888 signal_simple_error ("Invalid coding system type", type); | |
889 | |
890 CHECK_SYMBOL (name); | |
891 | |
892 codesys = allocate_coding_system (ty, name); | |
893 | |
894 if (NILP (doc_string)) | |
895 doc_string = build_string (""); | |
896 else | |
897 CHECK_STRING (doc_string); | |
898 CODING_SYSTEM_DOC_STRING (codesys) = doc_string; | |
899 | |
900 EXTERNAL_PROPERTY_LIST_LOOP (rest, key, value, props) | |
901 { | |
902 if (EQ (key, Qmnemonic)) | |
903 { | |
904 if (!NILP (value)) | |
905 CHECK_STRING (value); | |
906 CODING_SYSTEM_MNEMONIC (codesys) = value; | |
907 } | |
908 | |
909 else if (EQ (key, Qeol_type)) | |
910 { | |
911 need_to_setup_eol_systems = NILP (value); | |
912 if (EQ (value, Qt)) | |
913 value = Qnil; | |
914 CODING_SYSTEM_EOL_TYPE (codesys) = symbol_to_eol_type (value); | |
915 } | |
916 | |
917 else if (EQ (key, Qpost_read_conversion)) CODING_SYSTEM_POST_READ_CONVERSION (codesys) = value; | |
918 else if (EQ (key, Qpre_write_conversion)) CODING_SYSTEM_PRE_WRITE_CONVERSION (codesys) = value; | |
919 #ifdef MULE | |
920 else if (ty == CODESYS_ISO2022) | |
921 { | |
922 #define FROB_INITIAL_CHARSET(charset_num) \ | |
923 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, charset_num) = \ | |
924 ((EQ (value, Qt) || EQ (value, Qnil)) ? value : Fget_charset (value)) | |
925 | |
926 if (EQ (key, Qcharset_g0)) FROB_INITIAL_CHARSET (0); | |
927 else if (EQ (key, Qcharset_g1)) FROB_INITIAL_CHARSET (1); | |
928 else if (EQ (key, Qcharset_g2)) FROB_INITIAL_CHARSET (2); | |
929 else if (EQ (key, Qcharset_g3)) FROB_INITIAL_CHARSET (3); | |
930 | |
931 #define FROB_FORCE_CHARSET(charset_num) \ | |
932 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (codesys, charset_num) = !NILP (value) | |
933 | |
934 else if (EQ (key, Qforce_g0_on_output)) FROB_FORCE_CHARSET (0); | |
935 else if (EQ (key, Qforce_g1_on_output)) FROB_FORCE_CHARSET (1); | |
936 else if (EQ (key, Qforce_g2_on_output)) FROB_FORCE_CHARSET (2); | |
937 else if (EQ (key, Qforce_g3_on_output)) FROB_FORCE_CHARSET (3); | |
938 | |
939 #define FROB_BOOLEAN_PROPERTY(prop) \ | |
940 CODING_SYSTEM_ISO2022_##prop (codesys) = !NILP (value) | |
941 | |
942 else if (EQ (key, Qshort)) FROB_BOOLEAN_PROPERTY (SHORT); | |
943 else if (EQ (key, Qno_ascii_eol)) FROB_BOOLEAN_PROPERTY (NO_ASCII_EOL); | |
944 else if (EQ (key, Qno_ascii_cntl)) FROB_BOOLEAN_PROPERTY (NO_ASCII_CNTL); | |
945 else if (EQ (key, Qseven)) FROB_BOOLEAN_PROPERTY (SEVEN); | |
946 else if (EQ (key, Qlock_shift)) FROB_BOOLEAN_PROPERTY (LOCK_SHIFT); | |
947 else if (EQ (key, Qno_iso6429)) FROB_BOOLEAN_PROPERTY (NO_ISO6429); | |
948 else if (EQ (key, Qescape_quoted)) FROB_BOOLEAN_PROPERTY (ESCAPE_QUOTED); | |
949 | |
950 else if (EQ (key, Qinput_charset_conversion)) | |
951 { | |
952 codesys->iso2022.input_conv = | |
953 Dynarr_new (charset_conversion_spec); | |
954 parse_charset_conversion_specs (codesys->iso2022.input_conv, | |
955 value); | |
956 } | |
957 else if (EQ (key, Qoutput_charset_conversion)) | |
958 { | |
959 codesys->iso2022.output_conv = | |
960 Dynarr_new (charset_conversion_spec); | |
961 parse_charset_conversion_specs (codesys->iso2022.output_conv, | |
962 value); | |
963 } | |
964 else | |
965 signal_simple_error ("Unrecognized property", key); | |
966 } | |
967 else if (EQ (type, Qccl)) | |
968 { | |
969 if (EQ (key, Qdecode)) | |
970 { | |
971 CHECK_VECTOR (value); | |
972 CODING_SYSTEM_CCL_DECODE (codesys) = value; | |
973 } | |
974 else if (EQ (key, Qencode)) | |
975 { | |
976 CHECK_VECTOR (value); | |
977 CODING_SYSTEM_CCL_ENCODE (codesys) = value; | |
978 } | |
979 else | |
980 signal_simple_error ("Unrecognized property", key); | |
981 } | |
982 #endif /* MULE */ | |
983 else | |
984 signal_simple_error ("Unrecognized property", key); | |
985 } | |
986 | |
987 if (need_to_setup_eol_systems) | |
988 setup_eol_coding_systems (codesys); | |
989 | |
990 { | |
991 Lisp_Object codesys_obj; | |
992 XSETCODING_SYSTEM (codesys_obj, codesys); | |
993 Fputhash (name, codesys_obj, Vcoding_system_hash_table); | |
994 return codesys_obj; | |
995 } | |
996 } | |
997 | |
998 DEFUN ("copy-coding-system", Fcopy_coding_system, 2, 2, 0, /* | |
999 Copy OLD-CODING-SYSTEM to NEW-NAME. | |
1000 If NEW-NAME does not name an existing coding system, a new one will | |
1001 be created. | |
1002 */ | |
1003 (old_coding_system, new_name)) | |
1004 { | |
1005 Lisp_Object new_coding_system; | |
1006 old_coding_system = Fget_coding_system (old_coding_system); | |
1007 new_coding_system = Ffind_coding_system (new_name); | |
1008 if (NILP (new_coding_system)) | |
1009 { | |
1010 XSETCODING_SYSTEM (new_coding_system, | |
1011 allocate_coding_system | |
1012 (XCODING_SYSTEM_TYPE (old_coding_system), | |
1013 new_name)); | |
1014 Fputhash (new_name, new_coding_system, Vcoding_system_hash_table); | |
1015 } | |
1016 | |
1017 { | |
1018 Lisp_Coding_System *to = XCODING_SYSTEM (new_coding_system); | |
1019 Lisp_Coding_System *from = XCODING_SYSTEM (old_coding_system); | |
1020 memcpy (((char *) to ) + sizeof (to->header), | |
1021 ((char *) from) + sizeof (from->header), | |
1022 sizeof (*from) - sizeof (from->header)); | |
1023 to->name = new_name; | |
1024 } | |
1025 return new_coding_system; | |
1026 } | |
1027 | |
1028 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, 2, 2, 0, /* | |
1029 Define symbol ALIAS as an alias for coding system CODING-SYSTEM. | |
1030 */ | |
1031 (alias, coding_system)) | |
1032 { | |
1033 CHECK_SYMBOL (alias); | |
1034 if (!NILP (Ffind_coding_system (alias))) | |
1035 signal_simple_error ("Symbol already names a coding system", alias); | |
1036 coding_system = Fget_coding_system (coding_system); | |
1037 Fputhash (alias, coding_system, Vcoding_system_hash_table); | |
1038 | |
1039 /* Set up aliases for subsidiaries. */ | |
1040 if (XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT) | |
1041 { | |
1042 Lisp_Object str; | |
1043 XSETSTRING (str, symbol_name (XSYMBOL (alias))); | |
1044 #define FROB(type, name) \ | |
1045 do { \ | |
1046 Lisp_Object subsidiary = XCODING_SYSTEM_EOL_##type (coding_system); \ | |
1047 if (!NILP (subsidiary)) \ | |
1048 Fdefine_coding_system_alias \ | |
1049 (Fintern (concat2 (str, build_string (name)), Qnil), subsidiary); \ | |
1050 } while (0) | |
1051 FROB (LF, "-unix"); | |
1052 FROB (CRLF, "-dos"); | |
1053 FROB (CR, "-mac"); | |
1054 #undef FROB | |
1055 } | |
1056 /* FSF return value is a vector of [ALIAS-unix ALIAS-dos ALIAS-mac], | |
1057 but it doesn't look intentional, so I'd rather return something | |
1058 meaningful or nothing at all. */ | |
1059 return Qnil; | |
1060 } | |
1061 | |
1062 static Lisp_Object | |
1063 subsidiary_coding_system (Lisp_Object coding_system, enum eol_type type) | |
1064 { | |
1065 Lisp_Coding_System *cs = XCODING_SYSTEM (coding_system); | |
1066 Lisp_Object new_coding_system; | |
1067 | |
1068 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT) | |
1069 return coding_system; | |
1070 | |
1071 switch (type) | |
1072 { | |
1073 case EOL_AUTODETECT: return coding_system; | |
1074 case EOL_LF: new_coding_system = CODING_SYSTEM_EOL_LF (cs); break; | |
1075 case EOL_CR: new_coding_system = CODING_SYSTEM_EOL_CR (cs); break; | |
1076 case EOL_CRLF: new_coding_system = CODING_SYSTEM_EOL_CRLF (cs); break; | |
1077 default: abort (); | |
1078 } | |
1079 | |
1080 return NILP (new_coding_system) ? coding_system : new_coding_system; | |
1081 } | |
1082 | |
1083 DEFUN ("subsidiary-coding-system", Fsubsidiary_coding_system, 2, 2, 0, /* | |
1084 Return the subsidiary coding system of CODING-SYSTEM with eol type EOL-TYPE. | |
1085 */ | |
1086 (coding_system, eol_type)) | |
1087 { | |
1088 coding_system = Fget_coding_system (coding_system); | |
1089 | |
1090 return subsidiary_coding_system (coding_system, | |
1091 symbol_to_eol_type (eol_type)); | |
1092 } | |
1093 | |
1094 | |
1095 /************************************************************************/ | |
1096 /* Coding system accessors */ | |
1097 /************************************************************************/ | |
1098 | |
1099 DEFUN ("coding-system-doc-string", Fcoding_system_doc_string, 1, 1, 0, /* | |
1100 Return the doc string for CODING-SYSTEM. | |
1101 */ | |
1102 (coding_system)) | |
1103 { | |
1104 coding_system = Fget_coding_system (coding_system); | |
1105 return XCODING_SYSTEM_DOC_STRING (coding_system); | |
1106 } | |
1107 | |
1108 DEFUN ("coding-system-type", Fcoding_system_type, 1, 1, 0, /* | |
1109 Return the type of CODING-SYSTEM. | |
1110 */ | |
1111 (coding_system)) | |
1112 { | |
1113 switch (XCODING_SYSTEM_TYPE (Fget_coding_system (coding_system))) | |
1114 { | |
1115 default: abort (); | |
1116 case CODESYS_AUTODETECT: return Qundecided; | |
1117 #ifdef MULE | |
1118 case CODESYS_SHIFT_JIS: return Qshift_jis; | |
1119 case CODESYS_ISO2022: return Qiso2022; | |
1120 case CODESYS_BIG5: return Qbig5; | |
1121 case CODESYS_UCS4: return Qucs4; | |
1122 case CODESYS_UTF8: return Qutf8; | |
1123 case CODESYS_CCL: return Qccl; | |
1124 #endif | |
1125 case CODESYS_NO_CONVERSION: return Qno_conversion; | |
1126 #ifdef DEBUG_XEMACS | |
1127 case CODESYS_INTERNAL: return Qinternal; | |
1128 #endif | |
1129 } | |
1130 } | |
1131 | |
1132 #ifdef MULE | |
1133 static | |
1134 Lisp_Object coding_system_charset (Lisp_Object coding_system, int gnum) | |
1135 { | |
1136 Lisp_Object cs | |
1137 = XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, gnum); | |
1138 | |
1139 return CHARSETP (cs) ? XCHARSET_NAME (cs) : Qnil; | |
1140 } | |
1141 | |
1142 DEFUN ("coding-system-charset", Fcoding_system_charset, 2, 2, 0, /* | |
1143 Return initial charset of CODING-SYSTEM designated to GNUM. | |
1144 GNUM allows 0 .. 3. | |
1145 */ | |
1146 (coding_system, gnum)) | |
1147 { | |
1148 coding_system = Fget_coding_system (coding_system); | |
1149 CHECK_INT (gnum); | |
1150 | |
1151 return coding_system_charset (coding_system, XINT (gnum)); | |
1152 } | |
1153 #endif /* MULE */ | |
1154 | |
1155 DEFUN ("coding-system-property", Fcoding_system_property, 2, 2, 0, /* | |
1156 Return the PROP property of CODING-SYSTEM. | |
1157 */ | |
1158 (coding_system, prop)) | |
1159 { | |
1160 int i, ok = 0; | |
1161 enum coding_system_type type; | |
1162 | |
1163 coding_system = Fget_coding_system (coding_system); | |
1164 CHECK_SYMBOL (prop); | |
1165 type = XCODING_SYSTEM_TYPE (coding_system); | |
1166 | |
1167 for (i = 0; !ok && i < Dynarr_length (the_codesys_prop_dynarr); i++) | |
1168 if (EQ (Dynarr_at (the_codesys_prop_dynarr, i).sym, prop)) | |
1169 { | |
1170 ok = 1; | |
1171 switch (Dynarr_at (the_codesys_prop_dynarr, i).prop_type) | |
1172 { | |
1173 case CODESYS_PROP_ALL_OK: | |
1174 break; | |
1175 #ifdef MULE | |
1176 case CODESYS_PROP_ISO2022: | |
1177 if (type != CODESYS_ISO2022) | |
1178 signal_simple_error | |
1179 ("Property only valid in ISO2022 coding systems", | |
1180 prop); | |
1181 break; | |
1182 | |
1183 case CODESYS_PROP_CCL: | |
1184 if (type != CODESYS_CCL) | |
1185 signal_simple_error | |
1186 ("Property only valid in CCL coding systems", | |
1187 prop); | |
1188 break; | |
1189 #endif /* MULE */ | |
1190 default: | |
1191 abort (); | |
1192 } | |
1193 } | |
1194 | |
1195 if (!ok) | |
1196 signal_simple_error ("Unrecognized property", prop); | |
1197 | |
1198 if (EQ (prop, Qname)) | |
1199 return XCODING_SYSTEM_NAME (coding_system); | |
1200 else if (EQ (prop, Qtype)) | |
1201 return Fcoding_system_type (coding_system); | |
1202 else if (EQ (prop, Qdoc_string)) | |
1203 return XCODING_SYSTEM_DOC_STRING (coding_system); | |
1204 else if (EQ (prop, Qmnemonic)) | |
1205 return XCODING_SYSTEM_MNEMONIC (coding_system); | |
1206 else if (EQ (prop, Qeol_type)) | |
1207 return eol_type_to_symbol (XCODING_SYSTEM_EOL_TYPE (coding_system)); | |
1208 else if (EQ (prop, Qeol_lf)) | |
1209 return XCODING_SYSTEM_EOL_LF (coding_system); | |
1210 else if (EQ (prop, Qeol_crlf)) | |
1211 return XCODING_SYSTEM_EOL_CRLF (coding_system); | |
1212 else if (EQ (prop, Qeol_cr)) | |
1213 return XCODING_SYSTEM_EOL_CR (coding_system); | |
1214 else if (EQ (prop, Qpost_read_conversion)) | |
1215 return XCODING_SYSTEM_POST_READ_CONVERSION (coding_system); | |
1216 else if (EQ (prop, Qpre_write_conversion)) | |
1217 return XCODING_SYSTEM_PRE_WRITE_CONVERSION (coding_system); | |
1218 #ifdef MULE | |
1219 else if (type == CODESYS_ISO2022) | |
1220 { | |
1221 if (EQ (prop, Qcharset_g0)) | |
1222 return coding_system_charset (coding_system, 0); | |
1223 else if (EQ (prop, Qcharset_g1)) | |
1224 return coding_system_charset (coding_system, 1); | |
1225 else if (EQ (prop, Qcharset_g2)) | |
1226 return coding_system_charset (coding_system, 2); | |
1227 else if (EQ (prop, Qcharset_g3)) | |
1228 return coding_system_charset (coding_system, 3); | |
1229 | |
1230 #define FORCE_CHARSET(charset_num) \ | |
1231 (XCODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT \ | |
1232 (coding_system, charset_num) ? Qt : Qnil) | |
1233 | |
1234 else if (EQ (prop, Qforce_g0_on_output)) return FORCE_CHARSET (0); | |
1235 else if (EQ (prop, Qforce_g1_on_output)) return FORCE_CHARSET (1); | |
1236 else if (EQ (prop, Qforce_g2_on_output)) return FORCE_CHARSET (2); | |
1237 else if (EQ (prop, Qforce_g3_on_output)) return FORCE_CHARSET (3); | |
1238 | |
1239 #define LISP_BOOLEAN(prop) \ | |
1240 (XCODING_SYSTEM_ISO2022_##prop (coding_system) ? Qt : Qnil) | |
1241 | |
1242 else if (EQ (prop, Qshort)) return LISP_BOOLEAN (SHORT); | |
1243 else if (EQ (prop, Qno_ascii_eol)) return LISP_BOOLEAN (NO_ASCII_EOL); | |
1244 else if (EQ (prop, Qno_ascii_cntl)) return LISP_BOOLEAN (NO_ASCII_CNTL); | |
1245 else if (EQ (prop, Qseven)) return LISP_BOOLEAN (SEVEN); | |
1246 else if (EQ (prop, Qlock_shift)) return LISP_BOOLEAN (LOCK_SHIFT); | |
1247 else if (EQ (prop, Qno_iso6429)) return LISP_BOOLEAN (NO_ISO6429); | |
1248 else if (EQ (prop, Qescape_quoted)) return LISP_BOOLEAN (ESCAPE_QUOTED); | |
1249 | |
1250 else if (EQ (prop, Qinput_charset_conversion)) | |
1251 return | |
1252 unparse_charset_conversion_specs | |
1253 (XCODING_SYSTEM (coding_system)->iso2022.input_conv); | |
1254 else if (EQ (prop, Qoutput_charset_conversion)) | |
1255 return | |
1256 unparse_charset_conversion_specs | |
1257 (XCODING_SYSTEM (coding_system)->iso2022.output_conv); | |
1258 else | |
1259 abort (); | |
1260 } | |
1261 else if (type == CODESYS_CCL) | |
1262 { | |
1263 if (EQ (prop, Qdecode)) | |
1264 return XCODING_SYSTEM_CCL_DECODE (coding_system); | |
1265 else if (EQ (prop, Qencode)) | |
1266 return XCODING_SYSTEM_CCL_ENCODE (coding_system); | |
1267 else | |
1268 abort (); | |
1269 } | |
1270 #endif /* MULE */ | |
1271 else | |
1272 abort (); | |
1273 | |
1274 return Qnil; /* not reached */ | |
1275 } | |
1276 | |
1277 | |
1278 /************************************************************************/ | |
1279 /* Coding category functions */ | |
1280 /************************************************************************/ | |
1281 | |
1282 static int | |
1283 decode_coding_category (Lisp_Object symbol) | |
1284 { | |
1285 int i; | |
1286 | |
1287 CHECK_SYMBOL (symbol); | |
1288 for (i = 0; i <= CODING_CATEGORY_LAST; i++) | |
1289 if (EQ (coding_category_symbol[i], symbol)) | |
1290 return i; | |
1291 | |
1292 signal_simple_error ("Unrecognized coding category", symbol); | |
1293 return 0; /* not reached */ | |
1294 } | |
1295 | |
1296 DEFUN ("coding-category-list", Fcoding_category_list, 0, 0, 0, /* | |
1297 Return a list of all recognized coding categories. | |
1298 */ | |
1299 ()) | |
1300 { | |
1301 int i; | |
1302 Lisp_Object list = Qnil; | |
1303 | |
1304 for (i = CODING_CATEGORY_LAST; i >= 0; i--) | |
1305 list = Fcons (coding_category_symbol[i], list); | |
1306 return list; | |
1307 } | |
1308 | |
1309 DEFUN ("set-coding-priority-list", Fset_coding_priority_list, 1, 1, 0, /* | |
1310 Change the priority order of the coding categories. | |
1311 LIST should be list of coding categories, in descending order of | |
1312 priority. Unspecified coding categories will be lower in priority | |
1313 than all specified ones, in the same relative order they were in | |
1314 previously. | |
1315 */ | |
1316 (list)) | |
1317 { | |
1318 int category_to_priority[CODING_CATEGORY_LAST + 1]; | |
1319 int i, j; | |
1320 Lisp_Object rest; | |
1321 | |
1322 /* First generate a list that maps coding categories to priorities. */ | |
1323 | |
1324 for (i = 0; i <= CODING_CATEGORY_LAST; i++) | |
1325 category_to_priority[i] = -1; | |
1326 | |
1327 /* Highest priority comes from the specified list. */ | |
1328 i = 0; | |
1329 EXTERNAL_LIST_LOOP (rest, list) | |
1330 { | |
1331 int cat = decode_coding_category (XCAR (rest)); | |
1332 | |
1333 if (category_to_priority[cat] >= 0) | |
1334 signal_simple_error ("Duplicate coding category in list", XCAR (rest)); | |
1335 category_to_priority[cat] = i++; | |
1336 } | |
1337 | |
1338 /* Now go through the existing categories by priority to retrieve | |
1339 the categories not yet specified and preserve their priority | |
1340 order. */ | |
1341 for (j = 0; j <= CODING_CATEGORY_LAST; j++) | |
1342 { | |
1343 int cat = fcd->coding_category_by_priority[j]; | |
1344 if (category_to_priority[cat] < 0) | |
1345 category_to_priority[cat] = i++; | |
1346 } | |
1347 | |
1348 /* Now we need to construct the inverse of the mapping we just | |
1349 constructed. */ | |
1350 | |
1351 for (i = 0; i <= CODING_CATEGORY_LAST; i++) | |
1352 fcd->coding_category_by_priority[category_to_priority[i]] = i; | |
1353 | |
1354 /* Phew! That was confusing. */ | |
1355 return Qnil; | |
1356 } | |
1357 | |
1358 DEFUN ("coding-priority-list", Fcoding_priority_list, 0, 0, 0, /* | |
1359 Return a list of coding categories in descending order of priority. | |
1360 */ | |
1361 ()) | |
1362 { | |
1363 int i; | |
1364 Lisp_Object list = Qnil; | |
1365 | |
1366 for (i = CODING_CATEGORY_LAST; i >= 0; i--) | |
1367 list = Fcons (coding_category_symbol[fcd->coding_category_by_priority[i]], | |
1368 list); | |
1369 return list; | |
1370 } | |
1371 | |
1372 DEFUN ("set-coding-category-system", Fset_coding_category_system, 2, 2, 0, /* | |
1373 Change the coding system associated with a coding category. | |
1374 */ | |
1375 (coding_category, coding_system)) | |
1376 { | |
1377 int cat = decode_coding_category (coding_category); | |
1378 | |
1379 coding_system = Fget_coding_system (coding_system); | |
1380 fcd->coding_category_system[cat] = coding_system; | |
1381 return Qnil; | |
1382 } | |
1383 | |
1384 DEFUN ("coding-category-system", Fcoding_category_system, 1, 1, 0, /* | |
1385 Return the coding system associated with a coding category. | |
1386 */ | |
1387 (coding_category)) | |
1388 { | |
1389 int cat = decode_coding_category (coding_category); | |
1390 Lisp_Object sys = fcd->coding_category_system[cat]; | |
1391 | |
1392 if (!NILP (sys)) | |
1393 return XCODING_SYSTEM_NAME (sys); | |
1394 return Qnil; | |
1395 } | |
1396 | |
1397 | |
1398 /************************************************************************/ | |
1399 /* Detecting the encoding of data */ | |
1400 /************************************************************************/ | |
1401 | |
1402 struct detection_state | |
1403 { | |
1404 enum eol_type eol_type; | |
1405 int seen_non_ascii; | |
1406 int mask; | |
1407 #ifdef MULE | |
1408 struct | |
1409 { | |
1410 int mask; | |
1411 int in_second_byte; | |
1412 } | |
1413 big5; | |
1414 | |
1415 struct | |
1416 { | |
1417 int mask; | |
1418 int in_second_byte; | |
1419 } | |
1420 shift_jis; | |
1421 | |
1422 struct | |
1423 { | |
1424 int mask; | |
1425 int in_byte; | |
1426 } | |
1427 ucs4; | |
1428 | |
1429 struct | |
1430 { | |
1431 int mask; | |
1432 int in_byte; | |
1433 } | |
1434 utf8; | |
1435 | |
1436 struct | |
1437 { | |
1438 int mask; | |
1439 int initted; | |
1440 struct iso2022_decoder iso; | |
1441 unsigned int flags; | |
1442 int high_byte_count; | |
1443 unsigned int saw_single_shift:1; | |
1444 } | |
1445 iso2022; | |
1446 #endif | |
1447 struct | |
1448 { | |
1449 int seen_anything; | |
1450 int just_saw_cr; | |
1451 } | |
1452 eol; | |
1453 }; | |
1454 | |
1455 static int | |
1456 acceptable_control_char_p (int c) | |
1457 { | |
1458 switch (c) | |
1459 { | |
1460 /* Allow and ignore control characters that you might | |
1461 reasonably see in a text file */ | |
1462 case '\r': | |
1463 case '\n': | |
1464 case '\t': | |
1465 case 7: /* bell */ | |
1466 case 8: /* backspace */ | |
1467 case 11: /* vertical tab */ | |
1468 case 12: /* form feed */ | |
1469 case 26: /* MS-DOS C-z junk */ | |
1470 case 31: /* '^_' -- for info */ | |
1471 return 1; | |
1472 default: | |
1473 return 0; | |
1474 } | |
1475 } | |
1476 | |
1477 static int | |
1478 mask_has_at_most_one_bit_p (int mask) | |
1479 { | |
1480 /* Perhaps the only thing useful you learn from intensive Microsoft | |
1481 technical interviews */ | |
1482 return (mask & (mask - 1)) == 0; | |
1483 } | |
1484 | |
1485 static enum eol_type | |
1486 detect_eol_type (struct detection_state *st, CONST unsigned char *src, | |
1487 unsigned int n) | |
1488 { | |
1489 int c; | |
1490 | |
1491 while (n--) | |
1492 { | |
1493 c = *src++; | |
1494 if (c == '\n') | |
1495 { | |
1496 if (st->eol.just_saw_cr) | |
1497 return EOL_CRLF; | |
1498 else if (st->eol.seen_anything) | |
1499 return EOL_LF; | |
1500 } | |
1501 else if (st->eol.just_saw_cr) | |
1502 return EOL_CR; | |
1503 else if (c == '\r') | |
1504 st->eol.just_saw_cr = 1; | |
1505 else | |
1506 st->eol.just_saw_cr = 0; | |
1507 st->eol.seen_anything = 1; | |
1508 } | |
1509 | |
1510 return EOL_AUTODETECT; | |
1511 } | |
1512 | |
1513 /* Attempt to determine the encoding and EOL type of the given text. | |
1514 Before calling this function for the first type, you must initialize | |
1515 st->eol_type as appropriate and initialize st->mask to ~0. | |
1516 | |
1517 st->eol_type holds the determined EOL type, or EOL_AUTODETECT if | |
1518 not yet known. | |
1519 | |
1520 st->mask holds the determined coding category mask, or ~0 if only | |
1521 ASCII has been seen so far. | |
1522 | |
1523 Returns: | |
1524 | |
1525 0 == st->eol_type is EOL_AUTODETECT and/or more than coding category | |
1526 is present in st->mask | |
1527 1 == definitive answers are here for both st->eol_type and st->mask | |
1528 */ | |
1529 | |
1530 static int | |
1531 detect_coding_type (struct detection_state *st, CONST Extbyte *src, | |
1532 unsigned int n, int just_do_eol) | |
1533 { | |
1534 int c; | |
1535 | |
1536 if (st->eol_type == EOL_AUTODETECT) | |
1537 st->eol_type = detect_eol_type (st, src, n); | |
1538 | |
1539 if (just_do_eol) | |
1540 return st->eol_type != EOL_AUTODETECT; | |
1541 | |
1542 if (!st->seen_non_ascii) | |
1543 { | |
1544 for (; n; n--, src++) | |
1545 { | |
1546 c = *src; | |
1547 if ((c < 0x20 && !acceptable_control_char_p (c)) || c >= 0x80) | |
1548 { | |
1549 st->seen_non_ascii = 1; | |
1550 #ifdef MULE | |
1551 st->shift_jis.mask = ~0; | |
1552 st->big5.mask = ~0; | |
1553 st->ucs4.mask = ~0; | |
1554 st->utf8.mask = ~0; | |
1555 st->iso2022.mask = ~0; | |
1556 #endif | |
1557 break; | |
1558 } | |
1559 } | |
1560 } | |
1561 | |
1562 if (!n) | |
1563 return 0; | |
1564 #ifdef MULE | |
1565 if (!mask_has_at_most_one_bit_p (st->iso2022.mask)) | |
1566 st->iso2022.mask = detect_coding_iso2022 (st, src, n); | |
1567 if (!mask_has_at_most_one_bit_p (st->shift_jis.mask)) | |
1568 st->shift_jis.mask = detect_coding_sjis (st, src, n); | |
1569 if (!mask_has_at_most_one_bit_p (st->big5.mask)) | |
1570 st->big5.mask = detect_coding_big5 (st, src, n); | |
1571 if (!mask_has_at_most_one_bit_p (st->utf8.mask)) | |
1572 st->utf8.mask = detect_coding_utf8 (st, src, n); | |
1573 if (!mask_has_at_most_one_bit_p (st->ucs4.mask)) | |
1574 st->ucs4.mask = detect_coding_ucs4 (st, src, n); | |
1575 | |
1576 st->mask | |
1577 = st->iso2022.mask | st->shift_jis.mask | st->big5.mask | |
1578 | st->utf8.mask | st->ucs4.mask; | |
1579 #endif | |
1580 { | |
1581 int retval = mask_has_at_most_one_bit_p (st->mask); | |
1582 st->mask |= CODING_CATEGORY_NO_CONVERSION_MASK; | |
1583 return retval && st->eol_type != EOL_AUTODETECT; | |
1584 } | |
1585 } | |
1586 | |
1587 static Lisp_Object | |
1588 coding_system_from_mask (int mask) | |
1589 { | |
1590 if (mask == ~0) | |
1591 { | |
1592 /* If the file was entirely or basically ASCII, use the | |
1593 default value of `buffer-file-coding-system'. */ | |
1594 Lisp_Object retval = | |
1595 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system; | |
1596 if (!NILP (retval)) | |
1597 { | |
1598 retval = Ffind_coding_system (retval); | |
1599 if (NILP (retval)) | |
1600 { | |
1601 warn_when_safe | |
1602 (Qbad_variable, Qwarning, | |
1603 "Invalid `default-buffer-file-coding-system', set to nil"); | |
1604 XBUFFER (Vbuffer_defaults)->buffer_file_coding_system = Qnil; | |
1605 } | |
1606 } | |
1607 if (NILP (retval)) | |
1608 retval = Fget_coding_system (Qraw_text); | |
1609 return retval; | |
1610 } | |
1611 else | |
1612 { | |
1613 int i; | |
1614 int cat = -1; | |
1615 #ifdef MULE | |
1616 mask = postprocess_iso2022_mask (mask); | |
1617 #endif | |
1618 /* Look through the coding categories by priority and find | |
1619 the first one that is allowed. */ | |
1620 for (i = 0; i <= CODING_CATEGORY_LAST; i++) | |
1621 { | |
1622 cat = fcd->coding_category_by_priority[i]; | |
1623 if ((mask & (1 << cat)) && | |
1624 !NILP (fcd->coding_category_system[cat])) | |
1625 break; | |
1626 } | |
1627 if (cat >= 0) | |
1628 return fcd->coding_category_system[cat]; | |
1629 else | |
1630 return Fget_coding_system (Qraw_text); | |
1631 } | |
1632 } | |
1633 | |
1634 /* Given a seekable read stream and potential coding system and EOL type | |
1635 as specified, do any autodetection that is called for. If the | |
1636 coding system and/or EOL type are not `autodetect', they will be left | |
1637 alone; but this function will never return an autodetect coding system | |
1638 or EOL type. | |
1639 | |
1640 This function does not automatically fetch subsidiary coding systems; | |
1641 that should be unnecessary with the explicit eol-type argument. */ | |
1642 | |
1643 #define LENGTH(string_constant) (sizeof (string_constant) - 1) | |
1644 | |
1645 void | |
1646 determine_real_coding_system (Lstream *stream, Lisp_Object *codesys_in_out, | |
1647 enum eol_type *eol_type_in_out) | |
1648 { | |
1649 struct detection_state decst; | |
1650 | |
1651 if (*eol_type_in_out == EOL_AUTODETECT) | |
1652 *eol_type_in_out = XCODING_SYSTEM_EOL_TYPE (*codesys_in_out); | |
1653 | |
1654 xzero (decst); | |
1655 decst.eol_type = *eol_type_in_out; | |
1656 decst.mask = ~0; | |
1657 | |
1658 /* If autodetection is called for, do it now. */ | |
1659 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT | |
1660 || *eol_type_in_out == EOL_AUTODETECT) | |
1661 { | |
1662 Extbyte buf[4096]; | |
1663 Lisp_Object coding_system = Qnil; | |
1664 Extbyte *p; | |
1665 ssize_t nread = Lstream_read (stream, buf, sizeof (buf)); | |
1666 Extbyte *scan_end; | |
1667 | |
1668 /* Look for initial "-*-"; mode line prefix */ | |
1669 for (p = buf, | |
1670 scan_end = buf + nread - LENGTH ("-*-coding:?-*-"); | |
1671 p <= scan_end | |
1672 && *p != '\n' | |
1673 && *p != '\r'; | |
1674 p++) | |
1675 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-') | |
1676 { | |
1677 Extbyte *local_vars_beg = p + 3; | |
1678 /* Look for final "-*-"; mode line suffix */ | |
1679 for (p = local_vars_beg, | |
1680 scan_end = buf + nread - LENGTH ("-*-"); | |
1681 p <= scan_end | |
1682 && *p != '\n' | |
1683 && *p != '\r'; | |
1684 p++) | |
1685 if (*p == '-' && *(p+1) == '*' && *(p+2) == '-') | |
1686 { | |
1687 Extbyte *suffix = p; | |
1688 /* Look for "coding:" */ | |
1689 for (p = local_vars_beg, | |
1690 scan_end = suffix - LENGTH ("coding:?"); | |
1691 p <= scan_end; | |
1692 p++) | |
1693 if (memcmp ("coding:", p, LENGTH ("coding:")) == 0 | |
1694 && (p == local_vars_beg | |
1695 || (*(p-1) == ' ' || | |
1696 *(p-1) == '\t' || | |
1697 *(p-1) == ';'))) | |
1698 { | |
1699 Extbyte save; | |
1700 int n; | |
1701 p += LENGTH ("coding:"); | |
1702 while (*p == ' ' || *p == '\t') p++; | |
1703 | |
1704 /* Get coding system name */ | |
1705 save = *suffix; *suffix = '\0'; | |
1706 /* Characters valid in a MIME charset name (rfc 1521), | |
1707 and in a Lisp symbol name. */ | |
1708 n = strspn ( (char *) p, | |
1709 "ABCDEFGHIJKLMNOPQRSTUVWXYZ" | |
1710 "abcdefghijklmnopqrstuvwxyz" | |
1711 "0123456789" | |
1712 "!$%&*+-.^_{|}~"); | |
1713 *suffix = save; | |
1714 if (n > 0) | |
1715 { | |
1716 save = p[n]; p[n] = '\0'; | |
1717 coding_system = | |
1718 Ffind_coding_system (intern ((char *) p)); | |
1719 p[n] = save; | |
1720 } | |
1721 break; | |
1722 } | |
1723 break; | |
1724 } | |
1725 break; | |
1726 } | |
1727 | |
1728 if (NILP (coding_system)) | |
1729 do | |
1730 { | |
1731 if (detect_coding_type (&decst, buf, nread, | |
1732 XCODING_SYSTEM_TYPE (*codesys_in_out) | |
1733 != CODESYS_AUTODETECT)) | |
1734 break; | |
1735 nread = Lstream_read (stream, buf, sizeof (buf)); | |
1736 if (nread == 0) | |
1737 break; | |
1738 } | |
1739 while (1); | |
1740 | |
1741 else if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT | |
1742 && XCODING_SYSTEM_EOL_TYPE (coding_system) == EOL_AUTODETECT) | |
1743 do | |
1744 { | |
1745 if (detect_coding_type (&decst, buf, nread, 1)) | |
1746 break; | |
1747 nread = Lstream_read (stream, buf, sizeof (buf)); | |
1748 if (!nread) | |
1749 break; | |
1750 } | |
1751 while (1); | |
1752 | |
1753 *eol_type_in_out = decst.eol_type; | |
1754 if (XCODING_SYSTEM_TYPE (*codesys_in_out) == CODESYS_AUTODETECT) | |
1755 { | |
1756 if (NILP (coding_system)) | |
1757 *codesys_in_out = coding_system_from_mask (decst.mask); | |
1758 else | |
1759 *codesys_in_out = coding_system; | |
1760 } | |
1761 } | |
1762 | |
1763 /* If we absolutely can't determine the EOL type, just assume LF. */ | |
1764 if (*eol_type_in_out == EOL_AUTODETECT) | |
1765 *eol_type_in_out = EOL_LF; | |
1766 | |
1767 Lstream_rewind (stream); | |
1768 } | |
1769 | |
1770 DEFUN ("detect-coding-region", Fdetect_coding_region, 2, 3, 0, /* | |
1771 Detect coding system of the text in the region between START and END. | |
1772 Returned a list of possible coding systems ordered by priority. | |
1773 If only ASCII characters are found, it returns 'undecided or one of | |
1774 its subsidiary coding systems according to a detected end-of-line | |
1775 type. Optional arg BUFFER defaults to the current buffer. | |
1776 */ | |
1777 (start, end, buffer)) | |
1778 { | |
1779 Lisp_Object val = Qnil; | |
1780 struct buffer *buf = decode_buffer (buffer, 0); | |
1781 Bufpos b, e; | |
1782 Lisp_Object instream, lb_instream; | |
1783 Lstream *istr, *lb_istr; | |
1784 struct detection_state decst; | |
1785 struct gcpro gcpro1, gcpro2; | |
1786 | |
1787 get_buffer_range_char (buf, start, end, &b, &e, 0); | |
1788 lb_instream = make_lisp_buffer_input_stream (buf, b, e, 0); | |
1789 lb_istr = XLSTREAM (lb_instream); | |
1790 instream = make_encoding_input_stream (lb_istr, Fget_coding_system (Qbinary)); | |
1791 istr = XLSTREAM (instream); | |
1792 GCPRO2 (instream, lb_instream); | |
1793 xzero (decst); | |
1794 decst.eol_type = EOL_AUTODETECT; | |
1795 decst.mask = ~0; | |
1796 while (1) | |
1797 { | |
1798 unsigned char random_buffer[4096]; | |
1799 ssize_t nread = Lstream_read (istr, random_buffer, sizeof (random_buffer)); | |
1800 | |
1801 if (!nread) | |
1802 break; | |
1803 if (detect_coding_type (&decst, random_buffer, nread, 0)) | |
1804 break; | |
1805 } | |
1806 | |
1807 if (decst.mask == ~0) | |
1808 val = subsidiary_coding_system (Fget_coding_system (Qundecided), | |
1809 decst.eol_type); | |
1810 else | |
1811 { | |
1812 int i; | |
1813 | |
1814 val = Qnil; | |
1815 #ifdef MULE | |
1816 decst.mask = postprocess_iso2022_mask (decst.mask); | |
1817 #endif | |
1818 for (i = CODING_CATEGORY_LAST; i >= 0; i--) | |
1819 { | |
1820 int sys = fcd->coding_category_by_priority[i]; | |
1821 if (decst.mask & (1 << sys)) | |
1822 { | |
1823 Lisp_Object codesys = fcd->coding_category_system[sys]; | |
1824 if (!NILP (codesys)) | |
1825 codesys = subsidiary_coding_system (codesys, decst.eol_type); | |
1826 val = Fcons (codesys, val); | |
1827 } | |
1828 } | |
1829 } | |
1830 Lstream_close (istr); | |
1831 UNGCPRO; | |
1832 Lstream_delete (istr); | |
1833 Lstream_delete (lb_istr); | |
1834 return val; | |
1835 } | |
1836 | |
1837 | |
1838 /************************************************************************/ | |
1839 /* Converting to internal Mule format ("decoding") */ | |
1840 /************************************************************************/ | |
1841 | |
1842 /* A decoding stream is a stream used for decoding text (i.e. | |
1843 converting from some external format to internal format). | |
1844 The decoding-stream object keeps track of the actual coding | |
1845 stream, the stream that is at the other end, and data that | |
1846 needs to be persistent across the lifetime of the stream. */ | |
1847 | |
1848 /* Handle the EOL stuff related to just-read-in character C. | |
1849 EOL_TYPE is the EOL type of the coding stream. | |
1850 FLAGS is the current value of FLAGS in the coding stream, and may | |
1851 be modified by this macro. (The macro only looks at the | |
1852 CODING_STATE_CR flag.) DST is the Dynarr to which the decoded | |
1853 bytes are to be written. You need to also define a local goto | |
1854 label "label_continue_loop" that is at the end of the main | |
1855 character-reading loop. | |
1856 | |
1857 If C is a CR character, then this macro handles it entirely and | |
1858 jumps to label_continue_loop. Otherwise, this macro does not add | |
1859 anything to DST, and continues normally. You should continue | |
1860 processing C normally after this macro. */ | |
1861 | |
1862 #define DECODE_HANDLE_EOL_TYPE(eol_type, c, flags, dst) \ | |
1863 do { \ | |
1864 if (c == '\r') \ | |
1865 { \ | |
1866 if (eol_type == EOL_CR) \ | |
1867 Dynarr_add (dst, '\n'); \ | |
1868 else if (eol_type != EOL_CRLF || flags & CODING_STATE_CR) \ | |
1869 Dynarr_add (dst, c); \ | |
1870 else \ | |
1871 flags |= CODING_STATE_CR; \ | |
1872 goto label_continue_loop; \ | |
1873 } \ | |
1874 else if (flags & CODING_STATE_CR) \ | |
1875 { /* eol_type == CODING_SYSTEM_EOL_CRLF */ \ | |
1876 if (c != '\n') \ | |
1877 Dynarr_add (dst, '\r'); \ | |
1878 flags &= ~CODING_STATE_CR; \ | |
1879 } \ | |
1880 } while (0) | |
1881 | |
1882 /* C should be a binary character in the range 0 - 255; convert | |
1883 to internal format and add to Dynarr DST. */ | |
1884 | |
1885 #define DECODE_ADD_BINARY_CHAR(c, dst) \ | |
1886 do { \ | |
1887 if (BYTE_ASCII_P (c)) \ | |
1888 Dynarr_add (dst, c); \ | |
1889 else if (BYTE_C1_P (c)) \ | |
1890 { \ | |
1891 Dynarr_add (dst, LEADING_BYTE_CONTROL_1); \ | |
1892 Dynarr_add (dst, c + 0x20); \ | |
1893 } \ | |
1894 else \ | |
1895 { \ | |
1896 Dynarr_add (dst, LEADING_BYTE_LATIN_ISO8859_1); \ | |
1897 Dynarr_add (dst, c); \ | |
1898 } \ | |
1899 } while (0) | |
1900 | |
1901 #define DECODE_OUTPUT_PARTIAL_CHAR(ch) \ | |
1902 do { \ | |
1903 if (ch) \ | |
1904 { \ | |
1905 DECODE_ADD_BINARY_CHAR (ch, dst); \ | |
1906 ch = 0; \ | |
1907 } \ | |
1908 } while (0) | |
1909 | |
1910 #define DECODE_HANDLE_END_OF_CONVERSION(flags, ch, dst) \ | |
1911 do { \ | |
1912 if (flags & CODING_STATE_END) \ | |
1913 { \ | |
1914 DECODE_OUTPUT_PARTIAL_CHAR (ch); \ | |
1915 if (flags & CODING_STATE_CR) \ | |
1916 Dynarr_add (dst, '\r'); \ | |
1917 } \ | |
1918 } while (0) | |
1919 | |
1920 #define DECODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, decoding) | |
1921 | |
1922 struct decoding_stream | |
1923 { | |
1924 /* Coding system that governs the conversion. */ | |
1925 Lisp_Coding_System *codesys; | |
1926 | |
1927 /* Stream that we read the encoded data from or | |
1928 write the decoded data to. */ | |
1929 Lstream *other_end; | |
1930 | |
1931 /* If we are reading, then we can return only a fixed amount of | |
1932 data, so if the conversion resulted in too much data, we store it | |
1933 here for retrieval the next time around. */ | |
1934 unsigned_char_dynarr *runoff; | |
1935 | |
1936 /* FLAGS holds flags indicating the current state of the decoding. | |
1937 Some of these flags are dependent on the coding system. */ | |
1938 unsigned int flags; | |
1939 | |
1940 /* CH holds a partially built-up character. Since we only deal | |
1941 with one- and two-byte characters at the moment, we only use | |
1942 this to store the first byte of a two-byte character. */ | |
1943 unsigned int ch; | |
1944 | |
1945 /* EOL_TYPE specifies the type of end-of-line conversion that | |
1946 currently applies. We need to keep this separate from the | |
1947 EOL type stored in CODESYS because the latter might indicate | |
1948 automatic EOL-type detection while the former will always | |
1949 indicate a particular EOL type. */ | |
1950 enum eol_type eol_type; | |
1951 #ifdef MULE | |
1952 /* Additional ISO2022 information. We define the structure above | |
1953 because it's also needed by the detection routines. */ | |
1954 struct iso2022_decoder iso2022; | |
1955 | |
1956 /* Additional information (the state of the running CCL program) | |
1957 used by the CCL decoder. */ | |
1958 struct ccl_program ccl; | |
1959 | |
1960 /* counter for UTF-8 or UCS-4 */ | |
1961 unsigned char counter; | |
1962 #endif | |
1963 struct detection_state decst; | |
1964 }; | |
1965 | |
1966 static ssize_t decoding_reader (Lstream *stream, | |
1967 unsigned char *data, size_t size); | |
1968 static ssize_t decoding_writer (Lstream *stream, | |
1969 CONST unsigned char *data, size_t size); | |
1970 static int decoding_rewinder (Lstream *stream); | |
1971 static int decoding_seekable_p (Lstream *stream); | |
1972 static int decoding_flusher (Lstream *stream); | |
1973 static int decoding_closer (Lstream *stream); | |
1974 | |
1975 static Lisp_Object decoding_marker (Lisp_Object stream); | |
1976 | |
1977 DEFINE_LSTREAM_IMPLEMENTATION ("decoding", lstream_decoding, | |
1978 sizeof (struct decoding_stream)); | |
1979 | |
1980 static Lisp_Object | |
1981 decoding_marker (Lisp_Object stream) | |
1982 { | |
1983 Lstream *str = DECODING_STREAM_DATA (XLSTREAM (stream))->other_end; | |
1984 Lisp_Object str_obj; | |
1985 | |
1986 /* We do not need to mark the coding systems or charsets stored | |
1987 within the stream because they are stored in a global list | |
1988 and automatically marked. */ | |
1989 | |
1990 XSETLSTREAM (str_obj, str); | |
1991 mark_object (str_obj); | |
1992 if (str->imp->marker) | |
1993 return (str->imp->marker) (str_obj); | |
1994 else | |
1995 return Qnil; | |
1996 } | |
1997 | |
1998 /* Read SIZE bytes of data and store it into DATA. We are a decoding stream | |
1999 so we read data from the other end, decode it, and store it into DATA. */ | |
2000 | |
2001 static ssize_t | |
2002 decoding_reader (Lstream *stream, unsigned char *data, size_t size) | |
2003 { | |
2004 struct decoding_stream *str = DECODING_STREAM_DATA (stream); | |
2005 unsigned char *orig_data = data; | |
2006 ssize_t read_size; | |
2007 int error_occurred = 0; | |
2008 | |
2009 /* We need to interface to mule_decode(), which expects to take some | |
2010 amount of data and store the result into a Dynarr. We have | |
2011 mule_decode() store into str->runoff, and take data from there | |
2012 as necessary. */ | |
2013 | |
2014 /* We loop until we have enough data, reading chunks from the other | |
2015 end and decoding it. */ | |
2016 while (1) | |
2017 { | |
2018 /* Take data from the runoff if we can. Make sure to take at | |
2019 most SIZE bytes, and delete the data from the runoff. */ | |
2020 if (Dynarr_length (str->runoff) > 0) | |
2021 { | |
2022 size_t chunk = min (size, (size_t) Dynarr_length (str->runoff)); | |
2023 memcpy (data, Dynarr_atp (str->runoff, 0), chunk); | |
2024 Dynarr_delete_many (str->runoff, 0, chunk); | |
2025 data += chunk; | |
2026 size -= chunk; | |
2027 } | |
2028 | |
2029 if (size == 0) | |
2030 break; /* No more room for data */ | |
2031 | |
2032 if (str->flags & CODING_STATE_END) | |
2033 /* This means that on the previous iteration, we hit the EOF on | |
2034 the other end. We loop once more so that mule_decode() can | |
2035 output any final stuff it may be holding, or any "go back | |
2036 to a sane state" escape sequences. (This latter makes sense | |
2037 during encoding.) */ | |
2038 break; | |
2039 | |
2040 /* Exhausted the runoff, so get some more. DATA has at least | |
2041 SIZE bytes left of storage in it, so it's OK to read directly | |
2042 into it. (We'll be overwriting above, after we've decoded it | |
2043 into the runoff.) */ | |
2044 read_size = Lstream_read (str->other_end, data, size); | |
2045 if (read_size < 0) | |
2046 { | |
2047 error_occurred = 1; | |
2048 break; | |
2049 } | |
2050 if (read_size == 0) | |
2051 /* There might be some more end data produced in the translation. | |
2052 See the comment above. */ | |
2053 str->flags |= CODING_STATE_END; | |
2054 mule_decode (stream, data, str->runoff, read_size); | |
2055 } | |
2056 | |
2057 if (data - orig_data == 0) | |
2058 return error_occurred ? -1 : 0; | |
2059 else | |
2060 return data - orig_data; | |
2061 } | |
2062 | |
2063 static ssize_t | |
2064 decoding_writer (Lstream *stream, CONST unsigned char *data, size_t size) | |
2065 { | |
2066 struct decoding_stream *str = DECODING_STREAM_DATA (stream); | |
2067 ssize_t retval; | |
2068 | |
2069 /* Decode all our data into the runoff, and then attempt to write | |
2070 it all out to the other end. Remove whatever chunk we succeeded | |
2071 in writing. */ | |
2072 mule_decode (stream, data, str->runoff, size); | |
2073 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0), | |
2074 Dynarr_length (str->runoff)); | |
2075 if (retval > 0) | |
2076 Dynarr_delete_many (str->runoff, 0, retval); | |
2077 /* Do NOT return retval. The return value indicates how much | |
2078 of the incoming data was written, not how many bytes were | |
2079 written. */ | |
2080 return size; | |
2081 } | |
2082 | |
2083 static void | |
2084 reset_decoding_stream (struct decoding_stream *str) | |
2085 { | |
2086 #ifdef MULE | |
2087 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_ISO2022) | |
2088 { | |
2089 Lisp_Object coding_system; | |
2090 XSETCODING_SYSTEM (coding_system, str->codesys); | |
2091 reset_iso2022 (coding_system, &str->iso2022); | |
2092 } | |
2093 else if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_CCL) | |
2094 { | |
2095 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_DECODE (str->codesys)); | |
2096 } | |
2097 str->counter = 0; | |
2098 #endif /* MULE */ | |
2099 str->flags = str->ch = 0; | |
2100 } | |
2101 | |
2102 static int | |
2103 decoding_rewinder (Lstream *stream) | |
2104 { | |
2105 struct decoding_stream *str = DECODING_STREAM_DATA (stream); | |
2106 reset_decoding_stream (str); | |
2107 Dynarr_reset (str->runoff); | |
2108 return Lstream_rewind (str->other_end); | |
2109 } | |
2110 | |
2111 static int | |
2112 decoding_seekable_p (Lstream *stream) | |
2113 { | |
2114 struct decoding_stream *str = DECODING_STREAM_DATA (stream); | |
2115 return Lstream_seekable_p (str->other_end); | |
2116 } | |
2117 | |
2118 static int | |
2119 decoding_flusher (Lstream *stream) | |
2120 { | |
2121 struct decoding_stream *str = DECODING_STREAM_DATA (stream); | |
2122 return Lstream_flush (str->other_end); | |
2123 } | |
2124 | |
2125 static int | |
2126 decoding_closer (Lstream *stream) | |
2127 { | |
2128 struct decoding_stream *str = DECODING_STREAM_DATA (stream); | |
2129 if (stream->flags & LSTREAM_FL_WRITE) | |
2130 { | |
2131 str->flags |= CODING_STATE_END; | |
2132 decoding_writer (stream, 0, 0); | |
2133 } | |
2134 Dynarr_free (str->runoff); | |
2135 #ifdef MULE | |
2136 #ifdef ENABLE_COMPOSITE_CHARS | |
2137 if (str->iso2022.composite_chars) | |
2138 Dynarr_free (str->iso2022.composite_chars); | |
2139 #endif | |
2140 #endif | |
2141 return Lstream_close (str->other_end); | |
2142 } | |
2143 | |
2144 Lisp_Object | |
2145 decoding_stream_coding_system (Lstream *stream) | |
2146 { | |
2147 Lisp_Object coding_system; | |
2148 struct decoding_stream *str = DECODING_STREAM_DATA (stream); | |
2149 | |
2150 XSETCODING_SYSTEM (coding_system, str->codesys); | |
2151 return subsidiary_coding_system (coding_system, str->eol_type); | |
2152 } | |
2153 | |
2154 void | |
2155 set_decoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys) | |
2156 { | |
2157 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys); | |
2158 struct decoding_stream *str = DECODING_STREAM_DATA (lstr); | |
2159 str->codesys = cs; | |
2160 if (CODING_SYSTEM_EOL_TYPE (cs) != EOL_AUTODETECT) | |
2161 str->eol_type = CODING_SYSTEM_EOL_TYPE (cs); | |
2162 reset_decoding_stream (str); | |
2163 } | |
2164 | |
2165 /* WARNING WARNING WARNING WARNING!!!!! If you open up a decoding | |
2166 stream for writing, no automatic code detection will be performed. | |
2167 The reason for this is that automatic code detection requires a | |
2168 seekable input. Things will also fail if you open a decoding | |
2169 stream for reading using a non-fully-specified coding system and | |
2170 a non-seekable input stream. */ | |
2171 | |
2172 static Lisp_Object | |
2173 make_decoding_stream_1 (Lstream *stream, Lisp_Object codesys, | |
2174 CONST char *mode) | |
2175 { | |
2176 Lstream *lstr = Lstream_new (lstream_decoding, mode); | |
2177 struct decoding_stream *str = DECODING_STREAM_DATA (lstr); | |
2178 Lisp_Object obj; | |
2179 | |
2180 xzero (*str); | |
2181 str->other_end = stream; | |
2182 str->runoff = (unsigned_char_dynarr *) Dynarr_new (unsigned_char); | |
2183 str->eol_type = EOL_AUTODETECT; | |
2184 if (!strcmp (mode, "r") | |
2185 && Lstream_seekable_p (stream)) | |
2186 /* We can determine the coding system now. */ | |
2187 determine_real_coding_system (stream, &codesys, &str->eol_type); | |
2188 set_decoding_stream_coding_system (lstr, codesys); | |
2189 str->decst.eol_type = str->eol_type; | |
2190 str->decst.mask = ~0; | |
2191 XSETLSTREAM (obj, lstr); | |
2192 return obj; | |
2193 } | |
2194 | |
2195 Lisp_Object | |
2196 make_decoding_input_stream (Lstream *stream, Lisp_Object codesys) | |
2197 { | |
2198 return make_decoding_stream_1 (stream, codesys, "r"); | |
2199 } | |
2200 | |
2201 Lisp_Object | |
2202 make_decoding_output_stream (Lstream *stream, Lisp_Object codesys) | |
2203 { | |
2204 return make_decoding_stream_1 (stream, codesys, "w"); | |
2205 } | |
2206 | |
2207 /* Note: the decode_coding_* functions all take the same | |
2208 arguments as mule_decode(), which is to say some SRC data of | |
2209 size N, which is to be stored into dynamic array DST. | |
2210 DECODING is the stream within which the decoding is | |
2211 taking place, but no data is actually read from or | |
2212 written to that stream; that is handled in decoding_reader() | |
2213 or decoding_writer(). This allows the same functions to | |
2214 be used for both reading and writing. */ | |
2215 | |
2216 static void | |
2217 mule_decode (Lstream *decoding, CONST unsigned char *src, | |
2218 unsigned_char_dynarr *dst, unsigned int n) | |
2219 { | |
2220 struct decoding_stream *str = DECODING_STREAM_DATA (decoding); | |
2221 | |
2222 /* If necessary, do encoding-detection now. We do this when | |
2223 we're a writing stream or a non-seekable reading stream, | |
2224 meaning that we can't just process the whole input, | |
2225 rewind, and start over. */ | |
2226 | |
2227 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT || | |
2228 str->eol_type == EOL_AUTODETECT) | |
2229 { | |
2230 Lisp_Object codesys; | |
2231 | |
2232 XSETCODING_SYSTEM (codesys, str->codesys); | |
2233 detect_coding_type (&str->decst, src, n, | |
2234 CODING_SYSTEM_TYPE (str->codesys) != | |
2235 CODESYS_AUTODETECT); | |
2236 if (CODING_SYSTEM_TYPE (str->codesys) == CODESYS_AUTODETECT && | |
2237 str->decst.mask != ~0) | |
2238 /* #### This is cheesy. What we really ought to do is | |
2239 buffer up a certain amount of data so as to get a | |
2240 less random result. */ | |
2241 codesys = coding_system_from_mask (str->decst.mask); | |
2242 str->eol_type = str->decst.eol_type; | |
2243 if (XCODING_SYSTEM (codesys) != str->codesys) | |
2244 { | |
2245 /* Preserve the CODING_STATE_END flag in case it was set. | |
2246 If we erase it, bad things might happen. */ | |
2247 int was_end = str->flags & CODING_STATE_END; | |
2248 set_decoding_stream_coding_system (decoding, codesys); | |
2249 if (was_end) | |
2250 str->flags |= CODING_STATE_END; | |
2251 } | |
2252 } | |
2253 | |
2254 switch (CODING_SYSTEM_TYPE (str->codesys)) | |
2255 { | |
2256 #ifdef DEBUG_XEMACS | |
2257 case CODESYS_INTERNAL: | |
2258 Dynarr_add_many (dst, src, n); | |
2259 break; | |
2260 #endif | |
2261 case CODESYS_AUTODETECT: | |
2262 /* If we got this far and still haven't decided on the coding | |
2263 system, then do no conversion. */ | |
2264 case CODESYS_NO_CONVERSION: | |
2265 decode_coding_no_conversion (decoding, src, dst, n); | |
2266 break; | |
2267 #ifdef MULE | |
2268 case CODESYS_SHIFT_JIS: | |
2269 decode_coding_sjis (decoding, src, dst, n); | |
2270 break; | |
2271 case CODESYS_BIG5: | |
2272 decode_coding_big5 (decoding, src, dst, n); | |
2273 break; | |
2274 case CODESYS_UCS4: | |
2275 decode_coding_ucs4 (decoding, src, dst, n); | |
2276 break; | |
2277 case CODESYS_UTF8: | |
2278 decode_coding_utf8 (decoding, src, dst, n); | |
2279 break; | |
2280 case CODESYS_CCL: | |
2281 str->ccl.last_block = str->flags & CODING_STATE_END; | |
2282 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_DECODING); | |
2283 break; | |
2284 case CODESYS_ISO2022: | |
2285 decode_coding_iso2022 (decoding, src, dst, n); | |
2286 break; | |
2287 #endif /* MULE */ | |
2288 default: | |
2289 abort (); | |
2290 } | |
2291 } | |
2292 | |
2293 DEFUN ("decode-coding-region", Fdecode_coding_region, 3, 4, 0, /* | |
2294 Decode the text between START and END which is encoded in CODING-SYSTEM. | |
2295 This is useful if you've read in encoded text from a file without decoding | |
2296 it (e.g. you read in a JIS-formatted file but used the `binary' or | |
2297 `no-conversion' coding system, so that it shows up as "^[$B!<!+^[(B"). | |
2298 Return length of decoded text. | |
2299 BUFFER defaults to the current buffer if unspecified. | |
2300 */ | |
2301 (start, end, coding_system, buffer)) | |
2302 { | |
2303 Bufpos b, e; | |
2304 struct buffer *buf = decode_buffer (buffer, 0); | |
2305 Lisp_Object instream, lb_outstream, de_outstream, outstream; | |
2306 Lstream *istr, *ostr; | |
2307 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4; | |
2308 | |
2309 get_buffer_range_char (buf, start, end, &b, &e, 0); | |
2310 | |
2311 barf_if_buffer_read_only (buf, b, e); | |
2312 | |
2313 coding_system = Fget_coding_system (coding_system); | |
2314 instream = make_lisp_buffer_input_stream (buf, b, e, 0); | |
2315 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0); | |
2316 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream), | |
2317 coding_system); | |
2318 outstream = make_encoding_output_stream (XLSTREAM (de_outstream), | |
2319 Fget_coding_system (Qbinary)); | |
2320 istr = XLSTREAM (instream); | |
2321 ostr = XLSTREAM (outstream); | |
2322 GCPRO4 (instream, lb_outstream, de_outstream, outstream); | |
2323 | |
2324 /* The chain of streams looks like this: | |
2325 | |
2326 [BUFFER] <----- send through | |
2327 ------> [ENCODE AS BINARY] | |
2328 ------> [DECODE AS SPECIFIED] | |
2329 ------> [BUFFER] | |
2330 */ | |
2331 | |
2332 while (1) | |
2333 { | |
2334 char tempbuf[1024]; /* some random amount */ | |
2335 Bufpos newpos, even_newer_pos; | |
2336 Bufpos oldpos = lisp_buffer_stream_startpos (istr); | |
2337 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf)); | |
2338 | |
2339 if (!size_in_bytes) | |
2340 break; | |
2341 newpos = lisp_buffer_stream_startpos (istr); | |
2342 Lstream_write (ostr, tempbuf, size_in_bytes); | |
2343 even_newer_pos = lisp_buffer_stream_startpos (istr); | |
2344 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos), | |
2345 even_newer_pos, 0); | |
2346 } | |
2347 Lstream_close (istr); | |
2348 Lstream_close (ostr); | |
2349 UNGCPRO; | |
2350 Lstream_delete (istr); | |
2351 Lstream_delete (ostr); | |
2352 Lstream_delete (XLSTREAM (de_outstream)); | |
2353 Lstream_delete (XLSTREAM (lb_outstream)); | |
2354 return Qnil; | |
2355 } | |
2356 | |
2357 | |
2358 /************************************************************************/ | |
2359 /* Converting to an external encoding ("encoding") */ | |
2360 /************************************************************************/ | |
2361 | |
2362 /* An encoding stream is an output stream. When you create the | |
2363 stream, you specify the coding system that governs the encoding | |
2364 and another stream that the resulting encoded data is to be | |
2365 sent to, and then start sending data to it. */ | |
2366 | |
2367 #define ENCODING_STREAM_DATA(stream) LSTREAM_TYPE_DATA (stream, encoding) | |
2368 | |
2369 struct encoding_stream | |
2370 { | |
2371 /* Coding system that governs the conversion. */ | |
2372 Lisp_Coding_System *codesys; | |
2373 | |
2374 /* Stream that we read the encoded data from or | |
2375 write the decoded data to. */ | |
2376 Lstream *other_end; | |
2377 | |
2378 /* If we are reading, then we can return only a fixed amount of | |
2379 data, so if the conversion resulted in too much data, we store it | |
2380 here for retrieval the next time around. */ | |
2381 unsigned_char_dynarr *runoff; | |
2382 | |
2383 /* FLAGS holds flags indicating the current state of the encoding. | |
2384 Some of these flags are dependent on the coding system. */ | |
2385 unsigned int flags; | |
2386 | |
2387 /* CH holds a partially built-up character. Since we only deal | |
2388 with one- and two-byte characters at the moment, we only use | |
2389 this to store the first byte of a two-byte character. */ | |
2390 unsigned int ch; | |
2391 #ifdef MULE | |
2392 /* Additional information used by the ISO2022 encoder. */ | |
2393 struct | |
2394 { | |
2395 /* CHARSET holds the character sets currently assigned to the G0 | |
2396 through G3 registers. It is initialized from the array | |
2397 INITIAL_CHARSET in CODESYS. */ | |
2398 Lisp_Object charset[4]; | |
2399 | |
2400 /* Which registers are currently invoked into the left (GL) and | |
2401 right (GR) halves of the 8-bit encoding space? */ | |
2402 int register_left, register_right; | |
2403 | |
2404 /* Whether we need to explicitly designate the charset in the | |
2405 G? register before using it. It is initialized from the | |
2406 array FORCE_CHARSET_ON_OUTPUT in CODESYS. */ | |
2407 unsigned char force_charset_on_output[4]; | |
2408 | |
2409 /* Other state variables that need to be preserved across | |
2410 invocations. */ | |
2411 Lisp_Object current_charset; | |
2412 int current_half; | |
2413 int current_char_boundary; | |
2414 } iso2022; | |
2415 | |
2416 /* Additional information (the state of the running CCL program) | |
2417 used by the CCL encoder. */ | |
2418 struct ccl_program ccl; | |
2419 #endif /* MULE */ | |
2420 }; | |
2421 | |
2422 static ssize_t encoding_reader (Lstream *stream, unsigned char *data, size_t size); | |
2423 static ssize_t encoding_writer (Lstream *stream, CONST unsigned char *data, | |
2424 size_t size); | |
2425 static int encoding_rewinder (Lstream *stream); | |
2426 static int encoding_seekable_p (Lstream *stream); | |
2427 static int encoding_flusher (Lstream *stream); | |
2428 static int encoding_closer (Lstream *stream); | |
2429 | |
2430 static Lisp_Object encoding_marker (Lisp_Object stream); | |
2431 | |
2432 DEFINE_LSTREAM_IMPLEMENTATION ("encoding", lstream_encoding, | |
2433 sizeof (struct encoding_stream)); | |
2434 | |
2435 static Lisp_Object | |
2436 encoding_marker (Lisp_Object stream) | |
2437 { | |
2438 Lstream *str = ENCODING_STREAM_DATA (XLSTREAM (stream))->other_end; | |
2439 Lisp_Object str_obj; | |
2440 | |
2441 /* We do not need to mark the coding systems or charsets stored | |
2442 within the stream because they are stored in a global list | |
2443 and automatically marked. */ | |
2444 | |
2445 XSETLSTREAM (str_obj, str); | |
2446 mark_object (str_obj); | |
2447 if (str->imp->marker) | |
2448 return (str->imp->marker) (str_obj); | |
2449 else | |
2450 return Qnil; | |
2451 } | |
2452 | |
2453 /* Read SIZE bytes of data and store it into DATA. We are a encoding stream | |
2454 so we read data from the other end, encode it, and store it into DATA. */ | |
2455 | |
2456 static ssize_t | |
2457 encoding_reader (Lstream *stream, unsigned char *data, size_t size) | |
2458 { | |
2459 struct encoding_stream *str = ENCODING_STREAM_DATA (stream); | |
2460 unsigned char *orig_data = data; | |
2461 ssize_t read_size; | |
2462 int error_occurred = 0; | |
2463 | |
2464 /* We need to interface to mule_encode(), which expects to take some | |
2465 amount of data and store the result into a Dynarr. We have | |
2466 mule_encode() store into str->runoff, and take data from there | |
2467 as necessary. */ | |
2468 | |
2469 /* We loop until we have enough data, reading chunks from the other | |
2470 end and encoding it. */ | |
2471 while (1) | |
2472 { | |
2473 /* Take data from the runoff if we can. Make sure to take at | |
2474 most SIZE bytes, and delete the data from the runoff. */ | |
2475 if (Dynarr_length (str->runoff) > 0) | |
2476 { | |
2477 int chunk = min ((int) size, Dynarr_length (str->runoff)); | |
2478 memcpy (data, Dynarr_atp (str->runoff, 0), chunk); | |
2479 Dynarr_delete_many (str->runoff, 0, chunk); | |
2480 data += chunk; | |
2481 size -= chunk; | |
2482 } | |
2483 | |
2484 if (size == 0) | |
2485 break; /* No more room for data */ | |
2486 | |
2487 if (str->flags & CODING_STATE_END) | |
2488 /* This means that on the previous iteration, we hit the EOF on | |
2489 the other end. We loop once more so that mule_encode() can | |
2490 output any final stuff it may be holding, or any "go back | |
2491 to a sane state" escape sequences. (This latter makes sense | |
2492 during encoding.) */ | |
2493 break; | |
2494 | |
2495 /* Exhausted the runoff, so get some more. DATA at least SIZE bytes | |
2496 left of storage in it, so it's OK to read directly into it. | |
2497 (We'll be overwriting above, after we've encoded it into the | |
2498 runoff.) */ | |
2499 read_size = Lstream_read (str->other_end, data, size); | |
2500 if (read_size < 0) | |
2501 { | |
2502 error_occurred = 1; | |
2503 break; | |
2504 } | |
2505 if (read_size == 0) | |
2506 /* There might be some more end data produced in the translation. | |
2507 See the comment above. */ | |
2508 str->flags |= CODING_STATE_END; | |
2509 mule_encode (stream, data, str->runoff, read_size); | |
2510 } | |
2511 | |
2512 if (data == orig_data) | |
2513 return error_occurred ? -1 : 0; | |
2514 else | |
2515 return data - orig_data; | |
2516 } | |
2517 | |
2518 static ssize_t | |
2519 encoding_writer (Lstream *stream, CONST unsigned char *data, size_t size) | |
2520 { | |
2521 struct encoding_stream *str = ENCODING_STREAM_DATA (stream); | |
2522 ssize_t retval; | |
2523 | |
2524 /* Encode all our data into the runoff, and then attempt to write | |
2525 it all out to the other end. Remove whatever chunk we succeeded | |
2526 in writing. */ | |
2527 mule_encode (stream, data, str->runoff, size); | |
2528 retval = Lstream_write (str->other_end, Dynarr_atp (str->runoff, 0), | |
2529 Dynarr_length (str->runoff)); | |
2530 if (retval > 0) | |
2531 Dynarr_delete_many (str->runoff, 0, retval); | |
2532 /* Do NOT return retval. The return value indicates how much | |
2533 of the incoming data was written, not how many bytes were | |
2534 written. */ | |
2535 return size; | |
2536 } | |
2537 | |
2538 static void | |
2539 reset_encoding_stream (struct encoding_stream *str) | |
2540 { | |
2541 #ifdef MULE | |
2542 switch (CODING_SYSTEM_TYPE (str->codesys)) | |
2543 { | |
2544 case CODESYS_ISO2022: | |
2545 { | |
2546 int i; | |
2547 | |
2548 for (i = 0; i < 4; i++) | |
2549 { | |
2550 str->iso2022.charset[i] = | |
2551 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (str->codesys, i); | |
2552 str->iso2022.force_charset_on_output[i] = | |
2553 CODING_SYSTEM_ISO2022_FORCE_CHARSET_ON_OUTPUT (str->codesys, i); | |
2554 } | |
2555 str->iso2022.register_left = 0; | |
2556 str->iso2022.register_right = 1; | |
2557 str->iso2022.current_charset = Qnil; | |
2558 str->iso2022.current_half = 0; | |
2559 str->iso2022.current_char_boundary = 1; | |
2560 break; | |
2561 } | |
2562 case CODESYS_CCL: | |
2563 setup_ccl_program (&str->ccl, CODING_SYSTEM_CCL_ENCODE (str->codesys)); | |
2564 break; | |
2565 default: | |
2566 break; | |
2567 } | |
2568 #endif /* MULE */ | |
2569 | |
2570 str->flags = str->ch = 0; | |
2571 } | |
2572 | |
2573 static int | |
2574 encoding_rewinder (Lstream *stream) | |
2575 { | |
2576 struct encoding_stream *str = ENCODING_STREAM_DATA (stream); | |
2577 reset_encoding_stream (str); | |
2578 Dynarr_reset (str->runoff); | |
2579 return Lstream_rewind (str->other_end); | |
2580 } | |
2581 | |
2582 static int | |
2583 encoding_seekable_p (Lstream *stream) | |
2584 { | |
2585 struct encoding_stream *str = ENCODING_STREAM_DATA (stream); | |
2586 return Lstream_seekable_p (str->other_end); | |
2587 } | |
2588 | |
2589 static int | |
2590 encoding_flusher (Lstream *stream) | |
2591 { | |
2592 struct encoding_stream *str = ENCODING_STREAM_DATA (stream); | |
2593 return Lstream_flush (str->other_end); | |
2594 } | |
2595 | |
2596 static int | |
2597 encoding_closer (Lstream *stream) | |
2598 { | |
2599 struct encoding_stream *str = ENCODING_STREAM_DATA (stream); | |
2600 if (stream->flags & LSTREAM_FL_WRITE) | |
2601 { | |
2602 str->flags |= CODING_STATE_END; | |
2603 encoding_writer (stream, 0, 0); | |
2604 } | |
2605 Dynarr_free (str->runoff); | |
2606 return Lstream_close (str->other_end); | |
2607 } | |
2608 | |
2609 Lisp_Object | |
2610 encoding_stream_coding_system (Lstream *stream) | |
2611 { | |
2612 Lisp_Object coding_system; | |
2613 struct encoding_stream *str = ENCODING_STREAM_DATA (stream); | |
2614 | |
2615 XSETCODING_SYSTEM (coding_system, str->codesys); | |
2616 return coding_system; | |
2617 } | |
2618 | |
2619 void | |
2620 set_encoding_stream_coding_system (Lstream *lstr, Lisp_Object codesys) | |
2621 { | |
2622 Lisp_Coding_System *cs = XCODING_SYSTEM (codesys); | |
2623 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr); | |
2624 str->codesys = cs; | |
2625 reset_encoding_stream (str); | |
2626 } | |
2627 | |
2628 static Lisp_Object | |
2629 make_encoding_stream_1 (Lstream *stream, Lisp_Object codesys, | |
2630 CONST char *mode) | |
2631 { | |
2632 Lstream *lstr = Lstream_new (lstream_encoding, mode); | |
2633 struct encoding_stream *str = ENCODING_STREAM_DATA (lstr); | |
2634 Lisp_Object obj; | |
2635 | |
2636 xzero (*str); | |
2637 str->runoff = Dynarr_new (unsigned_char); | |
2638 str->other_end = stream; | |
2639 set_encoding_stream_coding_system (lstr, codesys); | |
2640 XSETLSTREAM (obj, lstr); | |
2641 return obj; | |
2642 } | |
2643 | |
2644 Lisp_Object | |
2645 make_encoding_input_stream (Lstream *stream, Lisp_Object codesys) | |
2646 { | |
2647 return make_encoding_stream_1 (stream, codesys, "r"); | |
2648 } | |
2649 | |
2650 Lisp_Object | |
2651 make_encoding_output_stream (Lstream *stream, Lisp_Object codesys) | |
2652 { | |
2653 return make_encoding_stream_1 (stream, codesys, "w"); | |
2654 } | |
2655 | |
2656 /* Convert N bytes of internally-formatted data stored in SRC to an | |
2657 external format, according to the encoding stream ENCODING. | |
2658 Store the encoded data into DST. */ | |
2659 | |
2660 static void | |
2661 mule_encode (Lstream *encoding, CONST unsigned char *src, | |
2662 unsigned_char_dynarr *dst, unsigned int n) | |
2663 { | |
2664 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); | |
2665 | |
2666 switch (CODING_SYSTEM_TYPE (str->codesys)) | |
2667 { | |
2668 #ifdef DEBUG_XEMACS | |
2669 case CODESYS_INTERNAL: | |
2670 Dynarr_add_many (dst, src, n); | |
2671 break; | |
2672 #endif | |
2673 case CODESYS_AUTODETECT: | |
2674 /* If we got this far and still haven't decided on the coding | |
2675 system, then do no conversion. */ | |
2676 case CODESYS_NO_CONVERSION: | |
2677 encode_coding_no_conversion (encoding, src, dst, n); | |
2678 break; | |
2679 #ifdef MULE | |
2680 case CODESYS_SHIFT_JIS: | |
2681 encode_coding_sjis (encoding, src, dst, n); | |
2682 break; | |
2683 case CODESYS_BIG5: | |
2684 encode_coding_big5 (encoding, src, dst, n); | |
2685 break; | |
2686 case CODESYS_UCS4: | |
2687 encode_coding_ucs4 (encoding, src, dst, n); | |
2688 break; | |
2689 case CODESYS_UTF8: | |
2690 encode_coding_utf8 (encoding, src, dst, n); | |
2691 break; | |
2692 case CODESYS_CCL: | |
2693 str->ccl.last_block = str->flags & CODING_STATE_END; | |
2694 ccl_driver (&str->ccl, src, dst, n, 0, CCL_MODE_ENCODING); | |
2695 break; | |
2696 case CODESYS_ISO2022: | |
2697 encode_coding_iso2022 (encoding, src, dst, n); | |
2698 break; | |
2699 #endif /* MULE */ | |
2700 default: | |
2701 abort (); | |
2702 } | |
2703 } | |
2704 | |
2705 DEFUN ("encode-coding-region", Fencode_coding_region, 3, 4, 0, /* | |
2706 Encode the text between START and END using CODING-SYSTEM. | |
2707 This will, for example, convert Japanese characters into stuff such as | |
2708 "^[$B!<!+^[(B" if you use the JIS encoding. Return length of encoded | |
2709 text. BUFFER defaults to the current buffer if unspecified. | |
2710 */ | |
2711 (start, end, coding_system, buffer)) | |
2712 { | |
2713 Bufpos b, e; | |
2714 struct buffer *buf = decode_buffer (buffer, 0); | |
2715 Lisp_Object instream, lb_outstream, de_outstream, outstream; | |
2716 Lstream *istr, *ostr; | |
2717 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4; | |
2718 | |
2719 get_buffer_range_char (buf, start, end, &b, &e, 0); | |
2720 | |
2721 barf_if_buffer_read_only (buf, b, e); | |
2722 | |
2723 coding_system = Fget_coding_system (coding_system); | |
2724 instream = make_lisp_buffer_input_stream (buf, b, e, 0); | |
2725 lb_outstream = make_lisp_buffer_output_stream (buf, b, 0); | |
2726 de_outstream = make_decoding_output_stream (XLSTREAM (lb_outstream), | |
2727 Fget_coding_system (Qbinary)); | |
2728 outstream = make_encoding_output_stream (XLSTREAM (de_outstream), | |
2729 coding_system); | |
2730 istr = XLSTREAM (instream); | |
2731 ostr = XLSTREAM (outstream); | |
2732 GCPRO4 (instream, outstream, de_outstream, lb_outstream); | |
2733 /* The chain of streams looks like this: | |
2734 | |
2735 [BUFFER] <----- send through | |
2736 ------> [ENCODE AS SPECIFIED] | |
2737 ------> [DECODE AS BINARY] | |
2738 ------> [BUFFER] | |
2739 */ | |
2740 while (1) | |
2741 { | |
2742 char tempbuf[1024]; /* some random amount */ | |
2743 Bufpos newpos, even_newer_pos; | |
2744 Bufpos oldpos = lisp_buffer_stream_startpos (istr); | |
2745 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf)); | |
2746 | |
2747 if (!size_in_bytes) | |
2748 break; | |
2749 newpos = lisp_buffer_stream_startpos (istr); | |
2750 Lstream_write (ostr, tempbuf, size_in_bytes); | |
2751 even_newer_pos = lisp_buffer_stream_startpos (istr); | |
2752 buffer_delete_range (buf, even_newer_pos - (newpos - oldpos), | |
2753 even_newer_pos, 0); | |
2754 } | |
2755 | |
2756 { | |
2757 Charcount retlen = | |
2758 lisp_buffer_stream_startpos (XLSTREAM (instream)) - b; | |
2759 Lstream_close (istr); | |
2760 Lstream_close (ostr); | |
2761 UNGCPRO; | |
2762 Lstream_delete (istr); | |
2763 Lstream_delete (ostr); | |
2764 Lstream_delete (XLSTREAM (de_outstream)); | |
2765 Lstream_delete (XLSTREAM (lb_outstream)); | |
2766 return make_int (retlen); | |
2767 } | |
2768 } | |
2769 | |
2770 #ifdef MULE | |
2771 | |
2772 /************************************************************************/ | |
2773 /* Shift-JIS methods */ | |
2774 /************************************************************************/ | |
2775 | |
2776 /* Shift-JIS is a coding system encoding three character sets: ASCII, right | |
2777 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded | |
2778 as is. A character of JISX0201-Kana (DIMENSION1_CHARS94 character set) is | |
2779 encoded by "position-code + 0x80". A character of JISX0208 | |
2780 (DIMENSION2_CHARS94 character set) is encoded in 2-byte but two | |
2781 position-codes are divided and shifted so that it fit in the range | |
2782 below. | |
2783 | |
2784 --- CODE RANGE of Shift-JIS --- | |
2785 (character set) (range) | |
2786 ASCII 0x00 .. 0x7F | |
2787 JISX0201-Kana 0xA0 .. 0xDF | |
2788 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF | |
2789 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC | |
2790 ------------------------------- | |
2791 | |
2792 */ | |
2793 | |
2794 /* Is this the first byte of a Shift-JIS two-byte char? */ | |
2795 | |
2796 #define BYTE_SJIS_TWO_BYTE_1_P(c) \ | |
2797 (((c) >= 0x81 && (c) <= 0x9F) || ((c) >= 0xE0 && (c) <= 0xEF)) | |
2798 | |
2799 /* Is this the second byte of a Shift-JIS two-byte char? */ | |
2800 | |
2801 #define BYTE_SJIS_TWO_BYTE_2_P(c) \ | |
2802 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0x80 && (c) <= 0xFC)) | |
2803 | |
2804 #define BYTE_SJIS_KATAKANA_P(c) \ | |
2805 ((c) >= 0xA1 && (c) <= 0xDF) | |
2806 | |
2807 static int | |
2808 detect_coding_sjis (struct detection_state *st, CONST unsigned char *src, | |
2809 unsigned int n) | |
2810 { | |
2811 int c; | |
2812 | |
2813 while (n--) | |
2814 { | |
2815 c = *src++; | |
2816 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) | |
2817 return 0; | |
2818 if (st->shift_jis.in_second_byte) | |
2819 { | |
2820 st->shift_jis.in_second_byte = 0; | |
2821 if (c < 0x40) | |
2822 return 0; | |
2823 } | |
2824 else if ((c >= 0x80 && c < 0xA0) || c >= 0xE0) | |
2825 st->shift_jis.in_second_byte = 1; | |
2826 } | |
2827 return CODING_CATEGORY_SHIFT_JIS_MASK; | |
2828 } | |
2829 | |
2830 /* Convert Shift-JIS data to internal format. */ | |
2831 | |
2832 static void | |
2833 decode_coding_sjis (Lstream *decoding, CONST unsigned char *src, | |
2834 unsigned_char_dynarr *dst, unsigned int n) | |
2835 { | |
2836 unsigned char c; | |
2837 struct decoding_stream *str = DECODING_STREAM_DATA (decoding); | |
2838 unsigned int flags = str->flags; | |
2839 unsigned int ch = str->ch; | |
2840 eol_type_t eol_type = str->eol_type; | |
2841 | |
2842 while (n--) | |
2843 { | |
2844 c = *src++; | |
2845 | |
2846 if (ch) | |
2847 { | |
2848 /* Previous character was first byte of Shift-JIS Kanji char. */ | |
2849 if (BYTE_SJIS_TWO_BYTE_2_P (c)) | |
2850 { | |
2851 unsigned char e1, e2; | |
2852 | |
2853 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208); | |
2854 DECODE_SJIS (ch, c, e1, e2); | |
2855 Dynarr_add (dst, e1); | |
2856 Dynarr_add (dst, e2); | |
2857 } | |
2858 else | |
2859 { | |
2860 DECODE_ADD_BINARY_CHAR (ch, dst); | |
2861 DECODE_ADD_BINARY_CHAR (c, dst); | |
2862 } | |
2863 ch = 0; | |
2864 } | |
2865 else | |
2866 { | |
2867 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst); | |
2868 if (BYTE_SJIS_TWO_BYTE_1_P (c)) | |
2869 ch = c; | |
2870 else if (BYTE_SJIS_KATAKANA_P (c)) | |
2871 { | |
2872 Dynarr_add (dst, LEADING_BYTE_KATAKANA_JISX0201); | |
2873 Dynarr_add (dst, c); | |
2874 } | |
2875 else | |
2876 DECODE_ADD_BINARY_CHAR (c, dst); | |
2877 } | |
2878 label_continue_loop:; | |
2879 } | |
2880 | |
2881 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst); | |
2882 | |
2883 str->flags = flags; | |
2884 str->ch = ch; | |
2885 } | |
2886 | |
2887 /* Convert internally-formatted data to Shift-JIS. */ | |
2888 | |
2889 static void | |
2890 encode_coding_sjis (Lstream *encoding, CONST unsigned char *src, | |
2891 unsigned_char_dynarr *dst, unsigned int n) | |
2892 { | |
2893 unsigned char c; | |
2894 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); | |
2895 unsigned int flags = str->flags; | |
2896 unsigned int ch = str->ch; | |
2897 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); | |
2898 | |
2899 while (n--) | |
2900 { | |
2901 c = *src++; | |
2902 if (c == '\n') | |
2903 { | |
2904 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT) | |
2905 Dynarr_add (dst, '\r'); | |
2906 if (eol_type != EOL_CR) | |
2907 Dynarr_add (dst, '\n'); | |
2908 ch = 0; | |
2909 } | |
2910 else if (BYTE_ASCII_P (c)) | |
2911 { | |
2912 Dynarr_add (dst, c); | |
2913 ch = 0; | |
2914 } | |
2915 else if (BUFBYTE_LEADING_BYTE_P (c)) | |
2916 ch = (c == LEADING_BYTE_KATAKANA_JISX0201 || | |
2917 c == LEADING_BYTE_JAPANESE_JISX0208_1978 || | |
2918 c == LEADING_BYTE_JAPANESE_JISX0208) ? c : 0; | |
2919 else if (ch) | |
2920 { | |
2921 if (ch == LEADING_BYTE_KATAKANA_JISX0201) | |
2922 { | |
2923 Dynarr_add (dst, c); | |
2924 ch = 0; | |
2925 } | |
2926 else if (ch == LEADING_BYTE_JAPANESE_JISX0208_1978 || | |
2927 ch == LEADING_BYTE_JAPANESE_JISX0208) | |
2928 ch = c; | |
2929 else | |
2930 { | |
2931 unsigned char j1, j2; | |
2932 ENCODE_SJIS (ch, c, j1, j2); | |
2933 Dynarr_add (dst, j1); | |
2934 Dynarr_add (dst, j2); | |
2935 ch = 0; | |
2936 } | |
2937 } | |
2938 } | |
2939 | |
2940 str->flags = flags; | |
2941 str->ch = ch; | |
2942 } | |
2943 | |
2944 DEFUN ("decode-shift-jis-char", Fdecode_shift_jis_char, 1, 1, 0, /* | |
2945 Decode a JISX0208 character of Shift-JIS coding-system. | |
2946 CODE is the character code in Shift-JIS as a cons of type bytes. | |
2947 Return the corresponding character. | |
2948 */ | |
2949 (code)) | |
2950 { | |
2951 unsigned char c1, c2, s1, s2; | |
2952 | |
2953 CHECK_CONS (code); | |
2954 CHECK_INT (XCAR (code)); | |
2955 CHECK_INT (XCDR (code)); | |
2956 s1 = XINT (XCAR (code)); | |
2957 s2 = XINT (XCDR (code)); | |
2958 if (BYTE_SJIS_TWO_BYTE_1_P (s1) && | |
2959 BYTE_SJIS_TWO_BYTE_2_P (s2)) | |
2960 { | |
2961 DECODE_SJIS (s1, s2, c1, c2); | |
2962 return make_char (MAKE_CHAR (Vcharset_japanese_jisx0208, | |
2963 c1 & 0x7F, c2 & 0x7F)); | |
2964 } | |
2965 else | |
2966 return Qnil; | |
2967 } | |
2968 | |
2969 DEFUN ("encode-shift-jis-char", Fencode_shift_jis_char, 1, 1, 0, /* | |
2970 Encode a JISX0208 character CHAR to SHIFT-JIS coding-system. | |
2971 Return the corresponding character code in SHIFT-JIS as a cons of two bytes. | |
2972 */ | |
2973 (ch)) | |
2974 { | |
2975 Lisp_Object charset; | |
2976 int c1, c2, s1, s2; | |
2977 | |
2978 CHECK_CHAR_COERCE_INT (ch); | |
2979 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2); | |
2980 if (EQ (charset, Vcharset_japanese_jisx0208)) | |
2981 { | |
2982 ENCODE_SJIS (c1 | 0x80, c2 | 0x80, s1, s2); | |
2983 return Fcons (make_int (s1), make_int (s2)); | |
2984 } | |
2985 else | |
2986 return Qnil; | |
2987 } | |
2988 | |
2989 | |
2990 /************************************************************************/ | |
2991 /* Big5 methods */ | |
2992 /************************************************************************/ | |
2993 | |
2994 /* BIG5 is a coding system encoding two character sets: ASCII and | |
2995 Big5. An ASCII character is encoded as is. Big5 is a two-byte | |
2996 character set and is encoded in two-byte. | |
2997 | |
2998 --- CODE RANGE of BIG5 --- | |
2999 (character set) (range) | |
3000 ASCII 0x00 .. 0x7F | |
3001 Big5 (1st byte) 0xA1 .. 0xFE | |
3002 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE | |
3003 -------------------------- | |
3004 | |
3005 Since the number of characters in Big5 is larger than maximum | |
3006 characters in Emacs' charset (96x96), it can't be handled as one | |
3007 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1' | |
3008 and `charset-big5-2'. Both <type>s are DIMENSION2_CHARS94. The former | |
3009 contains frequently used characters and the latter contains less | |
3010 frequently used characters. */ | |
3011 | |
3012 #define BYTE_BIG5_TWO_BYTE_1_P(c) \ | |
3013 ((c) >= 0xA1 && (c) <= 0xFE) | |
3014 | |
3015 /* Is this the second byte of a Shift-JIS two-byte char? */ | |
3016 | |
3017 #define BYTE_BIG5_TWO_BYTE_2_P(c) \ | |
3018 (((c) >= 0x40 && (c) <= 0x7E) || ((c) >= 0xA1 && (c) <= 0xFE)) | |
3019 | |
3020 /* Number of Big5 characters which have the same code in 1st byte. */ | |
3021 | |
3022 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40) | |
3023 | |
3024 /* Code conversion macros. These are macros because they are used in | |
3025 inner loops during code conversion. | |
3026 | |
3027 Note that temporary variables in macros introduce the classic | |
3028 dynamic-scoping problems with variable names. We use capital- | |
3029 lettered variables in the assumption that XEmacs does not use | |
3030 capital letters in variables except in a very formalized way | |
3031 (e.g. Qstring). */ | |
3032 | |
3033 /* Convert Big5 code (b1, b2) into its internal string representation | |
3034 (lb, c1, c2). */ | |
3035 | |
3036 /* There is a much simpler way to split the Big5 charset into two. | |
3037 For the moment I'm going to leave the algorithm as-is because it | |
3038 claims to separate out the most-used characters into a single | |
3039 charset, which perhaps will lead to optimizations in various | |
3040 places. | |
3041 | |
3042 The way the algorithm works is something like this: | |
3043 | |
3044 Big5 can be viewed as a 94x157 charset, where the row is | |
3045 encoded into the bytes 0xA1 .. 0xFE and the column is encoded | |
3046 into the bytes 0x40 .. 0x7E and 0xA1 .. 0xFE. As for frequency, | |
3047 the split between low and high column numbers is apparently | |
3048 meaningless; ascending rows produce less and less frequent chars. | |
3049 Therefore, we assign the lower half of rows (0xA1 .. 0xC8) to | |
3050 the first charset, and the upper half (0xC9 .. 0xFE) to the | |
3051 second. To do the conversion, we convert the character into | |
3052 a single number where 0 .. 156 is the first row, 157 .. 313 | |
3053 is the second, etc. That way, the characters are ordered by | |
3054 decreasing frequency. Then we just chop the space in two | |
3055 and coerce the result into a 94x94 space. | |
3056 */ | |
3057 | |
3058 #define DECODE_BIG5(b1, b2, lb, c1, c2) do \ | |
3059 { \ | |
3060 int B1 = b1, B2 = b2; \ | |
3061 unsigned int I \ | |
3062 = (B1 - 0xA1) * BIG5_SAME_ROW + B2 - (B2 < 0x7F ? 0x40 : 0x62); \ | |
3063 \ | |
3064 if (B1 < 0xC9) \ | |
3065 { \ | |
3066 lb = LEADING_BYTE_CHINESE_BIG5_1; \ | |
3067 } \ | |
3068 else \ | |
3069 { \ | |
3070 lb = LEADING_BYTE_CHINESE_BIG5_2; \ | |
3071 I -= (BIG5_SAME_ROW) * (0xC9 - 0xA1); \ | |
3072 } \ | |
3073 c1 = I / (0xFF - 0xA1) + 0xA1; \ | |
3074 c2 = I % (0xFF - 0xA1) + 0xA1; \ | |
3075 } while (0) | |
3076 | |
3077 /* Convert the internal string representation of a Big5 character | |
3078 (lb, c1, c2) into Big5 code (b1, b2). */ | |
3079 | |
3080 #define ENCODE_BIG5(lb, c1, c2, b1, b2) do \ | |
3081 { \ | |
3082 unsigned int I = ((c1) - 0xA1) * (0xFF - 0xA1) + ((c2) - 0xA1); \ | |
3083 \ | |
3084 if (lb == LEADING_BYTE_CHINESE_BIG5_2) \ | |
3085 { \ | |
3086 I += BIG5_SAME_ROW * (0xC9 - 0xA1); \ | |
3087 } \ | |
3088 b1 = I / BIG5_SAME_ROW + 0xA1; \ | |
3089 b2 = I % BIG5_SAME_ROW; \ | |
3090 b2 += b2 < 0x3F ? 0x40 : 0x62; \ | |
3091 } while (0) | |
3092 | |
3093 static int | |
3094 detect_coding_big5 (struct detection_state *st, CONST unsigned char *src, | |
3095 unsigned int n) | |
3096 { | |
3097 int c; | |
3098 | |
3099 while (n--) | |
3100 { | |
3101 c = *src++; | |
3102 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO || | |
3103 (c >= 0x80 && c <= 0xA0)) | |
3104 return 0; | |
3105 if (st->big5.in_second_byte) | |
3106 { | |
3107 st->big5.in_second_byte = 0; | |
3108 if (c < 0x40 || (c >= 0x80 && c <= 0xA0)) | |
3109 return 0; | |
3110 } | |
3111 else if (c >= 0xA1) | |
3112 st->big5.in_second_byte = 1; | |
3113 } | |
3114 return CODING_CATEGORY_BIG5_MASK; | |
3115 } | |
3116 | |
3117 /* Convert Big5 data to internal format. */ | |
3118 | |
3119 static void | |
3120 decode_coding_big5 (Lstream *decoding, CONST unsigned char *src, | |
3121 unsigned_char_dynarr *dst, unsigned int n) | |
3122 { | |
3123 unsigned char c; | |
3124 struct decoding_stream *str = DECODING_STREAM_DATA (decoding); | |
3125 unsigned int flags = str->flags; | |
3126 unsigned int ch = str->ch; | |
3127 eol_type_t eol_type = str->eol_type; | |
3128 | |
3129 while (n--) | |
3130 { | |
3131 c = *src++; | |
3132 if (ch) | |
3133 { | |
3134 /* Previous character was first byte of Big5 char. */ | |
3135 if (BYTE_BIG5_TWO_BYTE_2_P (c)) | |
3136 { | |
3137 unsigned char b1, b2, b3; | |
3138 DECODE_BIG5 (ch, c, b1, b2, b3); | |
3139 Dynarr_add (dst, b1); | |
3140 Dynarr_add (dst, b2); | |
3141 Dynarr_add (dst, b3); | |
3142 } | |
3143 else | |
3144 { | |
3145 DECODE_ADD_BINARY_CHAR (ch, dst); | |
3146 DECODE_ADD_BINARY_CHAR (c, dst); | |
3147 } | |
3148 ch = 0; | |
3149 } | |
3150 else | |
3151 { | |
3152 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst); | |
3153 if (BYTE_BIG5_TWO_BYTE_1_P (c)) | |
3154 ch = c; | |
3155 else | |
3156 DECODE_ADD_BINARY_CHAR (c, dst); | |
3157 } | |
3158 label_continue_loop:; | |
3159 } | |
3160 | |
3161 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst); | |
3162 | |
3163 str->flags = flags; | |
3164 str->ch = ch; | |
3165 } | |
3166 | |
3167 /* Convert internally-formatted data to Big5. */ | |
3168 | |
3169 static void | |
3170 encode_coding_big5 (Lstream *encoding, CONST unsigned char *src, | |
3171 unsigned_char_dynarr *dst, unsigned int n) | |
3172 { | |
3173 unsigned char c; | |
3174 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); | |
3175 unsigned int flags = str->flags; | |
3176 unsigned int ch = str->ch; | |
3177 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); | |
3178 | |
3179 while (n--) | |
3180 { | |
3181 c = *src++; | |
3182 if (c == '\n') | |
3183 { | |
3184 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT) | |
3185 Dynarr_add (dst, '\r'); | |
3186 if (eol_type != EOL_CR) | |
3187 Dynarr_add (dst, '\n'); | |
3188 } | |
3189 else if (BYTE_ASCII_P (c)) | |
3190 { | |
3191 /* ASCII. */ | |
3192 Dynarr_add (dst, c); | |
3193 } | |
3194 else if (BUFBYTE_LEADING_BYTE_P (c)) | |
3195 { | |
3196 if (c == LEADING_BYTE_CHINESE_BIG5_1 || | |
3197 c == LEADING_BYTE_CHINESE_BIG5_2) | |
3198 { | |
3199 /* A recognized leading byte. */ | |
3200 ch = c; | |
3201 continue; /* not done with this character. */ | |
3202 } | |
3203 /* otherwise just ignore this character. */ | |
3204 } | |
3205 else if (ch == LEADING_BYTE_CHINESE_BIG5_1 || | |
3206 ch == LEADING_BYTE_CHINESE_BIG5_2) | |
3207 { | |
3208 /* Previous char was a recognized leading byte. */ | |
3209 ch = (ch << 8) | c; | |
3210 continue; /* not done with this character. */ | |
3211 } | |
3212 else if (ch) | |
3213 { | |
3214 /* Encountering second byte of a Big5 character. */ | |
3215 unsigned char b1, b2; | |
3216 | |
3217 ENCODE_BIG5 (ch >> 8, ch & 0xFF, c, b1, b2); | |
3218 Dynarr_add (dst, b1); | |
3219 Dynarr_add (dst, b2); | |
3220 } | |
3221 | |
3222 ch = 0; | |
3223 } | |
3224 | |
3225 str->flags = flags; | |
3226 str->ch = ch; | |
3227 } | |
3228 | |
3229 | |
3230 DEFUN ("decode-big5-char", Fdecode_big5_char, 1, 1, 0, /* | |
3231 Decode a Big5 character CODE of BIG5 coding-system. | |
3232 CODE is the character code in BIG5, a cons of two integers. | |
3233 Return the corresponding character. | |
3234 */ | |
3235 (code)) | |
3236 { | |
3237 unsigned char c1, c2, b1, b2; | |
3238 | |
3239 CHECK_CONS (code); | |
3240 CHECK_INT (XCAR (code)); | |
3241 CHECK_INT (XCDR (code)); | |
3242 b1 = XINT (XCAR (code)); | |
3243 b2 = XINT (XCDR (code)); | |
3244 if (BYTE_BIG5_TWO_BYTE_1_P (b1) && | |
3245 BYTE_BIG5_TWO_BYTE_2_P (b2)) | |
3246 { | |
3247 int leading_byte; | |
3248 Lisp_Object charset; | |
3249 DECODE_BIG5 (b1, b2, leading_byte, c1, c2); | |
3250 charset = CHARSET_BY_LEADING_BYTE (leading_byte); | |
3251 return make_char (MAKE_CHAR (charset, c1 & 0x7F, c2 & 0x7F)); | |
3252 } | |
3253 else | |
3254 return Qnil; | |
3255 } | |
3256 | |
3257 DEFUN ("encode-big5-char", Fencode_big5_char, 1, 1, 0, /* | |
3258 Encode the Big5 character CH to BIG5 coding-system. | |
3259 Return the corresponding character code in Big5. | |
3260 */ | |
3261 (ch)) | |
3262 { | |
3263 Lisp_Object charset; | |
3264 int c1, c2, b1, b2; | |
3265 | |
3266 CHECK_CHAR_COERCE_INT (ch); | |
3267 BREAKUP_CHAR (XCHAR (ch), charset, c1, c2); | |
3268 if (EQ (charset, Vcharset_chinese_big5_1) || | |
3269 EQ (charset, Vcharset_chinese_big5_2)) | |
3270 { | |
3271 ENCODE_BIG5 (XCHARSET_LEADING_BYTE (charset), c1 | 0x80, c2 | 0x80, | |
3272 b1, b2); | |
3273 return Fcons (make_int (b1), make_int (b2)); | |
3274 } | |
3275 else | |
3276 return Qnil; | |
3277 } | |
3278 | |
3279 | |
3280 /************************************************************************/ | |
3281 /* UCS-4 methods */ | |
3282 /* */ | |
3283 /* UCS-4 character codes are implemented as nonnegative integers. */ | |
3284 /* */ | |
3285 /************************************************************************/ | |
3286 | |
3287 | |
3288 DEFUN ("set-ucs-char", Fset_ucs_char, 2, 2, 0, /* | |
3289 Map UCS-4 code CODE to Mule character CHARACTER. | |
3290 | |
3291 Return T on success, NIL on failure. | |
3292 */ | |
3293 (code, character)) | |
3294 { | |
3295 unsigned int c; | |
3296 | |
3297 CHECK_CHAR (character); | |
3298 CHECK_INT (code); | |
3299 c = XINT (code); | |
3300 | |
3301 if (c < sizeof (fcd->ucs_to_mule_table)) | |
3302 { | |
3303 fcd->ucs_to_mule_table[c] = character; | |
3304 return Qt; | |
3305 } | |
3306 else | |
3307 return Qnil; | |
3308 } | |
3309 | |
3310 static Lisp_Object | |
3311 ucs_to_char (unsigned long code) | |
3312 { | |
3313 if (code < sizeof (fcd->ucs_to_mule_table)) | |
3314 { | |
3315 return fcd->ucs_to_mule_table[code]; | |
3316 } | |
3317 else if ((0xe00000 <= code) && (code <= 0xe00000 + 94 * 94 * 14)) | |
3318 { | |
3319 unsigned int c; | |
3320 | |
3321 code -= 0xe00000; | |
3322 c = code % (94 * 94); | |
3323 return make_char | |
3324 (MAKE_CHAR (CHARSET_BY_ATTRIBUTES | |
3325 (CHARSET_TYPE_94X94, code / (94 * 94) + '@', | |
3326 CHARSET_LEFT_TO_RIGHT), | |
3327 c / 94 + 33, c % 94 + 33)); | |
3328 } | |
3329 else | |
3330 return Qnil; | |
3331 } | |
3332 | |
3333 DEFUN ("ucs-char", Fucs_char, 1, 1, 0, /* | |
3334 Return Mule character corresponding to UCS code CODE (a positive integer). | |
3335 */ | |
3336 (code)) | |
3337 { | |
3338 CHECK_NATNUM (code); | |
3339 return ucs_to_char (XINT (code)); | |
3340 } | |
3341 | |
3342 DEFUN ("set-char-ucs", Fset_char_ucs, 2, 2, 0, /* | |
3343 Map Mule character CHARACTER to UCS code CODE (a positive integer). | |
3344 */ | |
3345 (character, code)) | |
3346 { | |
3347 /* #### Isn't this gilding the lily? Fput_char_table checks its args. | |
3348 Fset_char_ucs is more restrictive on index arg, but should | |
3349 check code arg in a char_table method. */ | |
3350 CHECK_CHAR (character); | |
3351 CHECK_NATNUM (code); | |
3352 return Fput_char_table (character, code, mule_to_ucs_table); | |
3353 } | |
3354 | |
3355 DEFUN ("char-ucs", Fchar_ucs, 1, 1, 0, /* | |
3356 Return the UCS code (a positive integer) corresponding to CHARACTER. | |
3357 */ | |
3358 (character)) | |
3359 { | |
3360 return Fget_char_table (character, mule_to_ucs_table); | |
3361 } | |
3362 | |
3363 /* Decode a UCS-4 character into a buffer. If the lookup fails, use | |
3364 <GETA MARK> (U+3013) of JIS X 0208, which means correct character | |
3365 is not found, instead. | |
3366 #### do something more appropriate (use blob?) | |
3367 Danger, Will Robinson! Data loss. Should we signal user? */ | |
3368 static void | |
3369 decode_ucs4 (unsigned long ch, unsigned_char_dynarr *dst) | |
3370 { | |
3371 Lisp_Object chr = ucs_to_char (ch); | |
3372 | |
3373 if (! NILP (chr)) | |
3374 { | |
3375 Bufbyte work[MAX_EMCHAR_LEN]; | |
3376 int len; | |
3377 | |
3378 ch = XCHAR (chr); | |
3379 len = (ch < 128) ? | |
3380 simple_set_charptr_emchar (work, ch) : | |
3381 non_ascii_set_charptr_emchar (work, ch); | |
3382 Dynarr_add_many (dst, work, len); | |
3383 } | |
3384 else | |
3385 { | |
3386 Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208); | |
3387 Dynarr_add (dst, 34 + 128); | |
3388 Dynarr_add (dst, 46 + 128); | |
3389 } | |
3390 } | |
3391 | |
3392 static unsigned long | |
3393 mule_char_to_ucs4 (Lisp_Object charset, | |
3394 unsigned char h, unsigned char l) | |
3395 { | |
3396 Lisp_Object code | |
3397 = Fget_char_table (make_char (MAKE_CHAR (charset, h & 127, l & 127)), | |
3398 mule_to_ucs_table); | |
3399 | |
3400 if (INTP (code)) | |
3401 { | |
3402 return XINT (code); | |
3403 } | |
3404 else if ( (XCHARSET_DIMENSION (charset) == 2) && | |
3405 (XCHARSET_CHARS (charset) == 94) ) | |
3406 { | |
3407 unsigned char final = XCHARSET_FINAL (charset); | |
3408 | |
3409 if ( ('@' <= final) && (final < 0x7f) ) | |
3410 { | |
3411 return 0xe00000 + (final - '@') * 94 * 94 | |
3412 + ((h & 127) - 33) * 94 + (l & 127) - 33; | |
3413 } | |
3414 else | |
3415 { | |
3416 return '?'; | |
3417 } | |
3418 } | |
3419 else | |
3420 { | |
3421 return '?'; | |
3422 } | |
3423 } | |
3424 | |
3425 static void | |
3426 encode_ucs4 (Lisp_Object charset, | |
3427 unsigned char h, unsigned char l, unsigned_char_dynarr *dst) | |
3428 { | |
3429 unsigned long code = mule_char_to_ucs4 (charset, h, l); | |
3430 Dynarr_add (dst, code >> 24); | |
3431 Dynarr_add (dst, (code >> 16) & 255); | |
3432 Dynarr_add (dst, (code >> 8) & 255); | |
3433 Dynarr_add (dst, code & 255); | |
3434 } | |
3435 | |
3436 static int | |
3437 detect_coding_ucs4 (struct detection_state *st, CONST unsigned char *src, | |
3438 unsigned int n) | |
3439 { | |
3440 while (n--) | |
3441 { | |
3442 int c = *src++; | |
3443 switch (st->ucs4.in_byte) | |
3444 { | |
3445 case 0: | |
3446 if (c >= 128) | |
3447 return 0; | |
3448 else | |
3449 st->ucs4.in_byte++; | |
3450 break; | |
3451 case 3: | |
3452 st->ucs4.in_byte = 0; | |
3453 break; | |
3454 default: | |
3455 st->ucs4.in_byte++; | |
3456 } | |
3457 } | |
3458 return CODING_CATEGORY_UCS4_MASK; | |
3459 } | |
3460 | |
3461 static void | |
3462 decode_coding_ucs4 (Lstream *decoding, CONST unsigned char *src, | |
3463 unsigned_char_dynarr *dst, unsigned int n) | |
3464 { | |
3465 struct decoding_stream *str = DECODING_STREAM_DATA (decoding); | |
3466 unsigned int flags = str->flags; | |
3467 unsigned int ch = str->ch; | |
3468 unsigned char counter = str->counter; | |
3469 | |
3470 while (n--) | |
3471 { | |
3472 unsigned char c = *src++; | |
3473 switch (counter) | |
3474 { | |
3475 case 0: | |
3476 ch = c; | |
3477 counter = 3; | |
3478 break; | |
3479 case 1: | |
3480 decode_ucs4 ( ( ch << 8 ) | c, dst); | |
3481 ch = 0; | |
3482 counter = 0; | |
3483 break; | |
3484 default: | |
3485 ch = ( ch << 8 ) | c; | |
3486 counter--; | |
3487 } | |
3488 } | |
3489 if (counter & CODING_STATE_END) | |
3490 DECODE_OUTPUT_PARTIAL_CHAR (ch); | |
3491 | |
3492 str->flags = flags; | |
3493 str->ch = ch; | |
3494 str->counter = counter; | |
3495 } | |
3496 | |
3497 static void | |
3498 encode_coding_ucs4 (Lstream *encoding, CONST unsigned char *src, | |
3499 unsigned_char_dynarr *dst, unsigned int n) | |
3500 { | |
3501 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); | |
3502 unsigned int flags = str->flags; | |
3503 unsigned int ch = str->ch; | |
3504 unsigned char char_boundary = str->iso2022.current_char_boundary; | |
3505 Lisp_Object charset = str->iso2022.current_charset; | |
3506 | |
3507 #ifdef ENABLE_COMPOSITE_CHARS | |
3508 /* flags for handling composite chars. We do a little switcharoo | |
3509 on the source while we're outputting the composite char. */ | |
3510 unsigned int saved_n = 0; | |
3511 CONST unsigned char *saved_src = NULL; | |
3512 int in_composite = 0; | |
3513 | |
3514 back_to_square_n: | |
3515 #endif | |
3516 | |
3517 while (n--) | |
3518 { | |
3519 unsigned char c = *src++; | |
3520 | |
3521 if (BYTE_ASCII_P (c)) | |
3522 { /* Processing ASCII character */ | |
3523 ch = 0; | |
3524 encode_ucs4 (Vcharset_ascii, c, 0, dst); | |
3525 char_boundary = 1; | |
3526 } | |
3527 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch)) | |
3528 { /* Processing Leading Byte */ | |
3529 ch = 0; | |
3530 charset = CHARSET_BY_LEADING_BYTE (c); | |
3531 if (LEADING_BYTE_PREFIX_P(c)) | |
3532 ch = c; | |
3533 char_boundary = 0; | |
3534 } | |
3535 else | |
3536 { /* Processing Non-ASCII character */ | |
3537 char_boundary = 1; | |
3538 if (EQ (charset, Vcharset_control_1)) | |
3539 { | |
3540 encode_ucs4 (Vcharset_control_1, c, 0, dst); | |
3541 } | |
3542 else | |
3543 { | |
3544 switch (XCHARSET_REP_BYTES (charset)) | |
3545 { | |
3546 case 2: | |
3547 encode_ucs4 (charset, c, 0, dst); | |
3548 break; | |
3549 case 3: | |
3550 if (XCHARSET_PRIVATE_P (charset)) | |
3551 { | |
3552 encode_ucs4 (charset, c, 0, dst); | |
3553 ch = 0; | |
3554 } | |
3555 else if (ch) | |
3556 { | |
3557 #ifdef ENABLE_COMPOSITE_CHARS | |
3558 if (EQ (charset, Vcharset_composite)) | |
3559 { | |
3560 if (in_composite) | |
3561 { | |
3562 /* #### Bother! We don't know how to | |
3563 handle this yet. */ | |
3564 Dynarr_add (dst, 0); | |
3565 Dynarr_add (dst, 0); | |
3566 Dynarr_add (dst, 0); | |
3567 Dynarr_add (dst, '~'); | |
3568 } | |
3569 else | |
3570 { | |
3571 Emchar emch = MAKE_CHAR (Vcharset_composite, | |
3572 ch & 0x7F, c & 0x7F); | |
3573 Lisp_Object lstr = composite_char_string (emch); | |
3574 saved_n = n; | |
3575 saved_src = src; | |
3576 in_composite = 1; | |
3577 src = XSTRING_DATA (lstr); | |
3578 n = XSTRING_LENGTH (lstr); | |
3579 } | |
3580 } | |
3581 else | |
3582 #endif /* ENABLE_COMPOSITE_CHARS */ | |
3583 { | |
3584 encode_ucs4(charset, ch, c, dst); | |
3585 } | |
3586 ch = 0; | |
3587 } | |
3588 else | |
3589 { | |
3590 ch = c; | |
3591 char_boundary = 0; | |
3592 } | |
3593 break; | |
3594 case 4: | |
3595 if (ch) | |
3596 { | |
3597 encode_ucs4 (charset, ch, c, dst); | |
3598 ch = 0; | |
3599 } | |
3600 else | |
3601 { | |
3602 ch = c; | |
3603 char_boundary = 0; | |
3604 } | |
3605 break; | |
3606 default: | |
3607 abort (); | |
3608 } | |
3609 } | |
3610 } | |
3611 } | |
3612 | |
3613 #ifdef ENABLE_COMPOSITE_CHARS | |
3614 if (in_composite) | |
3615 { | |
3616 n = saved_n; | |
3617 src = saved_src; | |
3618 in_composite = 0; | |
3619 goto back_to_square_n; /* Wheeeeeeeee ..... */ | |
3620 } | |
3621 #endif /* ENABLE_COMPOSITE_CHARS */ | |
3622 | |
3623 str->flags = flags; | |
3624 str->ch = ch; | |
3625 str->iso2022.current_char_boundary = char_boundary; | |
3626 str->iso2022.current_charset = charset; | |
3627 | |
3628 /* Verbum caro factum est! */ | |
3629 } | |
3630 | |
3631 | |
3632 /************************************************************************/ | |
3633 /* UTF-8 methods */ | |
3634 /************************************************************************/ | |
3635 | |
3636 static int | |
3637 detect_coding_utf8 (struct detection_state *st, CONST unsigned char *src, | |
3638 unsigned int n) | |
3639 { | |
3640 while (n--) | |
3641 { | |
3642 unsigned char c = *src++; | |
3643 switch (st->utf8.in_byte) | |
3644 { | |
3645 case 0: | |
3646 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) | |
3647 return 0; | |
3648 else if (c >= 0xfc) | |
3649 st->utf8.in_byte = 5; | |
3650 else if (c >= 0xf8) | |
3651 st->utf8.in_byte = 4; | |
3652 else if (c >= 0xf0) | |
3653 st->utf8.in_byte = 3; | |
3654 else if (c >= 0xe0) | |
3655 st->utf8.in_byte = 2; | |
3656 else if (c >= 0xc0) | |
3657 st->utf8.in_byte = 1; | |
3658 else if (c >= 0x80) | |
3659 return 0; | |
3660 break; | |
3661 default: | |
3662 if ((c & 0xc0) != 0x80) | |
3663 return 0; | |
3664 else | |
3665 st->utf8.in_byte--; | |
3666 } | |
3667 } | |
3668 return CODING_CATEGORY_UTF8_MASK; | |
3669 } | |
3670 | |
3671 static void | |
3672 decode_coding_utf8 (Lstream *decoding, CONST unsigned char *src, | |
3673 unsigned_char_dynarr *dst, unsigned int n) | |
3674 { | |
3675 struct decoding_stream *str = DECODING_STREAM_DATA (decoding); | |
3676 unsigned int flags = str->flags; | |
3677 unsigned int ch = str->ch; | |
3678 eol_type_t eol_type = str->eol_type; | |
3679 unsigned char counter = str->counter; | |
3680 | |
3681 while (n--) | |
3682 { | |
3683 unsigned char c = *src++; | |
3684 switch (counter) | |
3685 { | |
3686 case 0: | |
3687 if ( c >= 0xfc ) | |
3688 { | |
3689 ch = c & 0x01; | |
3690 counter = 5; | |
3691 } | |
3692 else if ( c >= 0xf8 ) | |
3693 { | |
3694 ch = c & 0x03; | |
3695 counter = 4; | |
3696 } | |
3697 else if ( c >= 0xf0 ) | |
3698 { | |
3699 ch = c & 0x07; | |
3700 counter = 3; | |
3701 } | |
3702 else if ( c >= 0xe0 ) | |
3703 { | |
3704 ch = c & 0x0f; | |
3705 counter = 2; | |
3706 } | |
3707 else if ( c >= 0xc0 ) | |
3708 { | |
3709 ch = c & 0x1f; | |
3710 counter = 1; | |
3711 } | |
3712 else | |
3713 { | |
3714 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst); | |
3715 decode_ucs4 (c, dst); | |
3716 } | |
3717 break; | |
3718 case 1: | |
3719 ch = ( ch << 6 ) | ( c & 0x3f ); | |
3720 decode_ucs4 (ch, dst); | |
3721 ch = 0; | |
3722 counter = 0; | |
3723 break; | |
3724 default: | |
3725 ch = ( ch << 6 ) | ( c & 0x3f ); | |
3726 counter--; | |
3727 } | |
3728 label_continue_loop:; | |
3729 } | |
3730 | |
3731 if (flags & CODING_STATE_END) | |
3732 DECODE_OUTPUT_PARTIAL_CHAR (ch); | |
3733 | |
3734 str->flags = flags; | |
3735 str->ch = ch; | |
3736 str->counter = counter; | |
3737 } | |
3738 | |
3739 static void | |
3740 encode_utf8 (Lisp_Object charset, | |
3741 unsigned char h, unsigned char l, unsigned_char_dynarr *dst) | |
3742 { | |
3743 unsigned long code = mule_char_to_ucs4 (charset, h, l); | |
3744 if ( code <= 0x7f ) | |
3745 { | |
3746 Dynarr_add (dst, code); | |
3747 } | |
3748 else if ( code <= 0x7ff ) | |
3749 { | |
3750 Dynarr_add (dst, (code >> 6) | 0xc0); | |
3751 Dynarr_add (dst, (code & 0x3f) | 0x80); | |
3752 } | |
3753 else if ( code <= 0xffff ) | |
3754 { | |
3755 Dynarr_add (dst, (code >> 12) | 0xe0); | |
3756 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80); | |
3757 Dynarr_add (dst, (code & 0x3f) | 0x80); | |
3758 } | |
3759 else if ( code <= 0x1fffff ) | |
3760 { | |
3761 Dynarr_add (dst, (code >> 18) | 0xf0); | |
3762 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80); | |
3763 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80); | |
3764 Dynarr_add (dst, (code & 0x3f) | 0x80); | |
3765 } | |
3766 else if ( code <= 0x3ffffff ) | |
3767 { | |
3768 Dynarr_add (dst, (code >> 24) | 0xf8); | |
3769 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80); | |
3770 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80); | |
3771 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80); | |
3772 Dynarr_add (dst, (code & 0x3f) | 0x80); | |
3773 } | |
3774 else | |
3775 { | |
3776 Dynarr_add (dst, (code >> 30) | 0xfc); | |
3777 Dynarr_add (dst, ((code >> 24) & 0x3f) | 0x80); | |
3778 Dynarr_add (dst, ((code >> 18) & 0x3f) | 0x80); | |
3779 Dynarr_add (dst, ((code >> 12) & 0x3f) | 0x80); | |
3780 Dynarr_add (dst, ((code >> 6) & 0x3f) | 0x80); | |
3781 Dynarr_add (dst, (code & 0x3f) | 0x80); | |
3782 } | |
3783 } | |
3784 | |
3785 static void | |
3786 encode_coding_utf8 (Lstream *encoding, CONST unsigned char *src, | |
3787 unsigned_char_dynarr *dst, unsigned int n) | |
3788 { | |
3789 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); | |
3790 unsigned int flags = str->flags; | |
3791 unsigned int ch = str->ch; | |
3792 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); | |
3793 unsigned char char_boundary = str->iso2022.current_char_boundary; | |
3794 Lisp_Object charset = str->iso2022.current_charset; | |
3795 | |
3796 #ifdef ENABLE_COMPOSITE_CHARS | |
3797 /* flags for handling composite chars. We do a little switcharoo | |
3798 on the source while we're outputting the composite char. */ | |
3799 unsigned int saved_n = 0; | |
3800 CONST unsigned char *saved_src = NULL; | |
3801 int in_composite = 0; | |
3802 | |
3803 back_to_square_n: | |
3804 #endif /* ENABLE_COMPOSITE_CHARS */ | |
3805 | |
3806 while (n--) | |
3807 { | |
3808 unsigned char c = *src++; | |
3809 | |
3810 if (BYTE_ASCII_P (c)) | |
3811 { /* Processing ASCII character */ | |
3812 ch = 0; | |
3813 if (c == '\n') | |
3814 { | |
3815 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT) | |
3816 Dynarr_add (dst, '\r'); | |
3817 if (eol_type != EOL_CR) | |
3818 Dynarr_add (dst, c); | |
3819 } | |
3820 else | |
3821 encode_utf8 (Vcharset_ascii, c, 0, dst); | |
3822 char_boundary = 1; | |
3823 } | |
3824 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch)) | |
3825 { /* Processing Leading Byte */ | |
3826 ch = 0; | |
3827 charset = CHARSET_BY_LEADING_BYTE (c); | |
3828 if (LEADING_BYTE_PREFIX_P(c)) | |
3829 ch = c; | |
3830 char_boundary = 0; | |
3831 } | |
3832 else | |
3833 { /* Processing Non-ASCII character */ | |
3834 char_boundary = 1; | |
3835 if (EQ (charset, Vcharset_control_1)) | |
3836 { | |
3837 encode_utf8 (Vcharset_control_1, c, 0, dst); | |
3838 } | |
3839 else | |
3840 { | |
3841 switch (XCHARSET_REP_BYTES (charset)) | |
3842 { | |
3843 case 2: | |
3844 encode_utf8 (charset, c, 0, dst); | |
3845 break; | |
3846 case 3: | |
3847 if (XCHARSET_PRIVATE_P (charset)) | |
3848 { | |
3849 encode_utf8 (charset, c, 0, dst); | |
3850 ch = 0; | |
3851 } | |
3852 else if (ch) | |
3853 { | |
3854 #ifdef ENABLE_COMPOSITE_CHARS | |
3855 if (EQ (charset, Vcharset_composite)) | |
3856 { | |
3857 if (in_composite) | |
3858 { | |
3859 /* #### Bother! We don't know how to | |
3860 handle this yet. */ | |
3861 encode_utf8 (Vcharset_ascii, '~', 0, dst); | |
3862 } | |
3863 else | |
3864 { | |
3865 Emchar emch = MAKE_CHAR (Vcharset_composite, | |
3866 ch & 0x7F, c & 0x7F); | |
3867 Lisp_Object lstr = composite_char_string (emch); | |
3868 saved_n = n; | |
3869 saved_src = src; | |
3870 in_composite = 1; | |
3871 src = XSTRING_DATA (lstr); | |
3872 n = XSTRING_LENGTH (lstr); | |
3873 } | |
3874 } | |
3875 else | |
3876 #endif /* ENABLE_COMPOSITE_CHARS */ | |
3877 { | |
3878 encode_utf8 (charset, ch, c, dst); | |
3879 } | |
3880 ch = 0; | |
3881 } | |
3882 else | |
3883 { | |
3884 ch = c; | |
3885 char_boundary = 0; | |
3886 } | |
3887 break; | |
3888 case 4: | |
3889 if (ch) | |
3890 { | |
3891 encode_utf8 (charset, ch, c, dst); | |
3892 ch = 0; | |
3893 } | |
3894 else | |
3895 { | |
3896 ch = c; | |
3897 char_boundary = 0; | |
3898 } | |
3899 break; | |
3900 default: | |
3901 abort (); | |
3902 } | |
3903 } | |
3904 } | |
3905 } | |
3906 | |
3907 #ifdef ENABLE_COMPOSITE_CHARS | |
3908 if (in_composite) | |
3909 { | |
3910 n = saved_n; | |
3911 src = saved_src; | |
3912 in_composite = 0; | |
3913 goto back_to_square_n; /* Wheeeeeeeee ..... */ | |
3914 } | |
3915 #endif | |
3916 | |
3917 str->flags = flags; | |
3918 str->ch = ch; | |
3919 str->iso2022.current_char_boundary = char_boundary; | |
3920 str->iso2022.current_charset = charset; | |
3921 | |
3922 /* Verbum caro factum est! */ | |
3923 } | |
3924 | |
3925 | |
3926 /************************************************************************/ | |
3927 /* ISO2022 methods */ | |
3928 /************************************************************************/ | |
3929 | |
3930 /* The following note describes the coding system ISO2022 briefly. | |
3931 Since the intention of this note is to help understand the | |
3932 functions in this file, some parts are NOT ACCURATE or OVERLY | |
3933 SIMPLIFIED. For thorough understanding, please refer to the | |
3934 original document of ISO2022. | |
3935 | |
3936 ISO2022 provides many mechanisms to encode several character sets | |
3937 in 7-bit and 8-bit environments. For 7-bit environments, all text | |
3938 is encoded using bytes less than 128. This may make the encoded | |
3939 text a little bit longer, but the text passes more easily through | |
3940 several gateways, some of which strip off MSB (Most Signigant Bit). | |
3941 | |
3942 There are two kinds of character sets: control character set and | |
3943 graphic character set. The former contains control characters such | |
3944 as `newline' and `escape' to provide control functions (control | |
3945 functions are also provided by escape sequences). The latter | |
3946 contains graphic characters such as 'A' and '-'. Emacs recognizes | |
3947 two control character sets and many graphic character sets. | |
3948 | |
3949 Graphic character sets are classified into one of the following | |
3950 four classes, according to the number of bytes (DIMENSION) and | |
3951 number of characters in one dimension (CHARS) of the set: | |
3952 - DIMENSION1_CHARS94 | |
3953 - DIMENSION1_CHARS96 | |
3954 - DIMENSION2_CHARS94 | |
3955 - DIMENSION2_CHARS96 | |
3956 | |
3957 In addition, each character set is assigned an identification tag, | |
3958 unique for each set, called "final character" (denoted as <F> | |
3959 hereafter). The <F> of each character set is decided by ECMA(*) | |
3960 when it is registered in ISO. The code range of <F> is 0x30..0x7F | |
3961 (0x30..0x3F are for private use only). | |
3962 | |
3963 Note (*): ECMA = European Computer Manufacturers Association | |
3964 | |
3965 Here are examples of graphic character set [NAME(<F>)]: | |
3966 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ... | |
3967 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ... | |
3968 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ... | |
3969 o DIMENSION2_CHARS96 -- none for the moment | |
3970 | |
3971 A code area (1 byte = 8 bits) is divided into 4 areas, C0, GL, C1, and GR. | |
3972 C0 [0x00..0x1F] -- control character plane 0 | |
3973 GL [0x20..0x7F] -- graphic character plane 0 | |
3974 C1 [0x80..0x9F] -- control character plane 1 | |
3975 GR [0xA0..0xFF] -- graphic character plane 1 | |
3976 | |
3977 A control character set is directly designated and invoked to C0 or | |
3978 C1 by an escape sequence. The most common case is that: | |
3979 - ISO646's control character set is designated/invoked to C0, and | |
3980 - ISO6429's control character set is designated/invoked to C1, | |
3981 and usually these designations/invocations are omitted in encoded | |
3982 text. In a 7-bit environment, only C0 can be used, and a control | |
3983 character for C1 is encoded by an appropriate escape sequence to | |
3984 fit into the environment. All control characters for C1 are | |
3985 defined to have corresponding escape sequences. | |
3986 | |
3987 A graphic character set is at first designated to one of four | |
3988 graphic registers (G0 through G3), then these graphic registers are | |
3989 invoked to GL or GR. These designations and invocations can be | |
3990 done independently. The most common case is that G0 is invoked to | |
3991 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually | |
3992 these invocations and designations are omitted in encoded text. | |
3993 In a 7-bit environment, only GL can be used. | |
3994 | |
3995 When a graphic character set of CHARS94 is invoked to GL, codes | |
3996 0x20 and 0x7F of the GL area work as control characters SPACE and | |
3997 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not | |
3998 be used. | |
3999 | |
4000 There are two ways of invocation: locking-shift and single-shift. | |
4001 With locking-shift, the invocation lasts until the next different | |
4002 invocation, whereas with single-shift, the invocation affects the | |
4003 following character only and doesn't affect the locking-shift | |
4004 state. Invocations are done by the following control characters or | |
4005 escape sequences: | |
4006 | |
4007 ---------------------------------------------------------------------- | |
4008 abbrev function cntrl escape seq description | |
4009 ---------------------------------------------------------------------- | |
4010 SI/LS0 (shift-in) 0x0F none invoke G0 into GL | |
4011 SO/LS1 (shift-out) 0x0E none invoke G1 into GL | |
4012 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL | |
4013 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL | |
4014 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*) | |
4015 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*) | |
4016 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*) | |
4017 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char | |
4018 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char | |
4019 ---------------------------------------------------------------------- | |
4020 (*) These are not used by any known coding system. | |
4021 | |
4022 Control characters for these functions are defined by macros | |
4023 ISO_CODE_XXX in `coding.h'. | |
4024 | |
4025 Designations are done by the following escape sequences: | |
4026 ---------------------------------------------------------------------- | |
4027 escape sequence description | |
4028 ---------------------------------------------------------------------- | |
4029 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0 | |
4030 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1 | |
4031 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2 | |
4032 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3 | |
4033 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*) | |
4034 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1 | |
4035 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2 | |
4036 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3 | |
4037 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**) | |
4038 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1 | |
4039 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2 | |
4040 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3 | |
4041 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*) | |
4042 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1 | |
4043 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2 | |
4044 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3 | |
4045 ---------------------------------------------------------------------- | |
4046 | |
4047 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set | |
4048 of dimension 1, chars 94, and final character <F>, etc... | |
4049 | |
4050 Note (*): Although these designations are not allowed in ISO2022, | |
4051 Emacs accepts them on decoding, and produces them on encoding | |
4052 CHARS96 character sets in a coding system which is characterized as | |
4053 7-bit environment, non-locking-shift, and non-single-shift. | |
4054 | |
4055 Note (**): If <F> is '@', 'A', or 'B', the intermediate character | |
4056 '(' can be omitted. We refer to this as "short-form" hereafter. | |
4057 | |
4058 Now you may notice that there are a lot of ways for encoding the | |
4059 same multilingual text in ISO2022. Actually, there exist many | |
4060 coding systems such as Compound Text (used in X11's inter client | |
4061 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR | |
4062 (used in Korean internet), EUC (Extended UNIX Code, used in Asian | |
4063 localized platforms), and all of these are variants of ISO2022. | |
4064 | |
4065 In addition to the above, Emacs handles two more kinds of escape | |
4066 sequences: ISO6429's direction specification and Emacs' private | |
4067 sequence for specifying character composition. | |
4068 | |
4069 ISO6429's direction specification takes the following form: | |
4070 o CSI ']' -- end of the current direction | |
4071 o CSI '0' ']' -- end of the current direction | |
4072 o CSI '1' ']' -- start of left-to-right text | |
4073 o CSI '2' ']' -- start of right-to-left text | |
4074 The control character CSI (0x9B: control sequence introducer) is | |
4075 abbreviated to the escape sequence ESC '[' in a 7-bit environment. | |
4076 | |
4077 Character composition specification takes the following form: | |
4078 o ESC '0' -- start character composition | |
4079 o ESC '1' -- end character composition | |
4080 Since these are not standard escape sequences of any ISO standard, | |
4081 their use with these meanings is restricted to Emacs only. */ | |
4082 | |
4083 static void | |
4084 reset_iso2022 (Lisp_Object coding_system, struct iso2022_decoder *iso) | |
4085 { | |
4086 int i; | |
4087 | |
4088 for (i = 0; i < 4; i++) | |
4089 { | |
4090 if (!NILP (coding_system)) | |
4091 iso->charset[i] = | |
4092 XCODING_SYSTEM_ISO2022_INITIAL_CHARSET (coding_system, i); | |
4093 else | |
4094 iso->charset[i] = Qt; | |
4095 iso->invalid_designated[i] = 0; | |
4096 } | |
4097 iso->esc = ISO_ESC_NOTHING; | |
4098 iso->esc_bytes_index = 0; | |
4099 iso->register_left = 0; | |
4100 iso->register_right = 1; | |
4101 iso->switched_dir_and_no_valid_charset_yet = 0; | |
4102 iso->invalid_switch_dir = 0; | |
4103 iso->output_direction_sequence = 0; | |
4104 iso->output_literally = 0; | |
4105 #ifdef ENABLE_COMPOSITE_CHARS | |
4106 if (iso->composite_chars) | |
4107 Dynarr_reset (iso->composite_chars); | |
4108 #endif | |
4109 } | |
4110 | |
4111 static int | |
4112 fit_to_be_escape_quoted (unsigned char c) | |
4113 { | |
4114 switch (c) | |
4115 { | |
4116 case ISO_CODE_ESC: | |
4117 case ISO_CODE_CSI: | |
4118 case ISO_CODE_SS2: | |
4119 case ISO_CODE_SS3: | |
4120 case ISO_CODE_SO: | |
4121 case ISO_CODE_SI: | |
4122 return 1; | |
4123 | |
4124 default: | |
4125 return 0; | |
4126 } | |
4127 } | |
4128 | |
4129 /* Parse one byte of an ISO2022 escape sequence. | |
4130 If the result is an invalid escape sequence, return 0 and | |
4131 do not change anything in STR. Otherwise, if the result is | |
4132 an incomplete escape sequence, update ISO2022.ESC and | |
4133 ISO2022.ESC_BYTES and return -1. Otherwise, update | |
4134 all the state variables (but not ISO2022.ESC_BYTES) and | |
4135 return 1. | |
4136 | |
4137 If CHECK_INVALID_CHARSETS is non-zero, check for designation | |
4138 or invocation of an invalid character set and treat that as | |
4139 an unrecognized escape sequence. */ | |
4140 | |
4141 static int | |
4142 parse_iso2022_esc (Lisp_Object codesys, struct iso2022_decoder *iso, | |
4143 unsigned char c, unsigned int *flags, | |
4144 int check_invalid_charsets) | |
4145 { | |
4146 /* (1) If we're at the end of a designation sequence, CS is the | |
4147 charset being designated and REG is the register to designate | |
4148 it to. | |
4149 | |
4150 (2) If we're at the end of a locking-shift sequence, REG is | |
4151 the register to invoke and HALF (0 == left, 1 == right) is | |
4152 the half to invoke it into. | |
4153 | |
4154 (3) If we're at the end of a single-shift sequence, REG is | |
4155 the register to invoke. */ | |
4156 Lisp_Object cs = Qnil; | |
4157 int reg, half; | |
4158 | |
4159 /* NOTE: This code does goto's all over the fucking place. | |
4160 The reason for this is that we're basically implementing | |
4161 a state machine here, and hierarchical languages like C | |
4162 don't really provide a clean way of doing this. */ | |
4163 | |
4164 if (! (*flags & CODING_STATE_ESCAPE)) | |
4165 /* At beginning of escape sequence; we need to reset our | |
4166 escape-state variables. */ | |
4167 iso->esc = ISO_ESC_NOTHING; | |
4168 | |
4169 iso->output_literally = 0; | |
4170 iso->output_direction_sequence = 0; | |
4171 | |
4172 switch (iso->esc) | |
4173 { | |
4174 case ISO_ESC_NOTHING: | |
4175 iso->esc_bytes_index = 0; | |
4176 switch (c) | |
4177 { | |
4178 case ISO_CODE_ESC: /* Start escape sequence */ | |
4179 *flags |= CODING_STATE_ESCAPE; | |
4180 iso->esc = ISO_ESC; | |
4181 goto not_done; | |
4182 | |
4183 case ISO_CODE_CSI: /* ISO6429 (specifying directionality) */ | |
4184 *flags |= CODING_STATE_ESCAPE; | |
4185 iso->esc = ISO_ESC_5_11; | |
4186 goto not_done; | |
4187 | |
4188 case ISO_CODE_SO: /* locking shift 1 */ | |
4189 reg = 1; half = 0; | |
4190 goto locking_shift; | |
4191 case ISO_CODE_SI: /* locking shift 0 */ | |
4192 reg = 0; half = 0; | |
4193 goto locking_shift; | |
4194 | |
4195 case ISO_CODE_SS2: /* single shift */ | |
4196 reg = 2; | |
4197 goto single_shift; | |
4198 case ISO_CODE_SS3: /* single shift */ | |
4199 reg = 3; | |
4200 goto single_shift; | |
4201 | |
4202 default: /* Other control characters */ | |
4203 return 0; | |
4204 } | |
4205 | |
4206 case ISO_ESC: | |
4207 switch (c) | |
4208 { | |
4209 /**** single shift ****/ | |
4210 | |
4211 case 'N': /* single shift 2 */ | |
4212 reg = 2; | |
4213 goto single_shift; | |
4214 case 'O': /* single shift 3 */ | |
4215 reg = 3; | |
4216 goto single_shift; | |
4217 | |
4218 /**** locking shift ****/ | |
4219 | |
4220 case '~': /* locking shift 1 right */ | |
4221 reg = 1; half = 1; | |
4222 goto locking_shift; | |
4223 case 'n': /* locking shift 2 */ | |
4224 reg = 2; half = 0; | |
4225 goto locking_shift; | |
4226 case '}': /* locking shift 2 right */ | |
4227 reg = 2; half = 1; | |
4228 goto locking_shift; | |
4229 case 'o': /* locking shift 3 */ | |
4230 reg = 3; half = 0; | |
4231 goto locking_shift; | |
4232 case '|': /* locking shift 3 right */ | |
4233 reg = 3; half = 1; | |
4234 goto locking_shift; | |
4235 | |
4236 #ifdef ENABLE_COMPOSITE_CHARS | |
4237 /**** composite ****/ | |
4238 | |
4239 case '0': | |
4240 iso->esc = ISO_ESC_START_COMPOSITE; | |
4241 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | | |
4242 CODING_STATE_COMPOSITE; | |
4243 return 1; | |
4244 | |
4245 case '1': | |
4246 iso->esc = ISO_ESC_END_COMPOSITE; | |
4247 *flags = (*flags & CODING_STATE_ISO2022_LOCK) & | |
4248 ~CODING_STATE_COMPOSITE; | |
4249 return 1; | |
4250 #endif /* ENABLE_COMPOSITE_CHARS */ | |
4251 | |
4252 /**** directionality ****/ | |
4253 | |
4254 case '[': | |
4255 iso->esc = ISO_ESC_5_11; | |
4256 goto not_done; | |
4257 | |
4258 /**** designation ****/ | |
4259 | |
4260 case '$': /* multibyte charset prefix */ | |
4261 iso->esc = ISO_ESC_2_4; | |
4262 goto not_done; | |
4263 | |
4264 default: | |
4265 if (0x28 <= c && c <= 0x2F) | |
4266 { | |
4267 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_8); | |
4268 goto not_done; | |
4269 } | |
4270 | |
4271 /* This function is called with CODESYS equal to nil when | |
4272 doing coding-system detection. */ | |
4273 if (!NILP (codesys) | |
4274 && XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys) | |
4275 && fit_to_be_escape_quoted (c)) | |
4276 { | |
4277 iso->esc = ISO_ESC_LITERAL; | |
4278 *flags &= CODING_STATE_ISO2022_LOCK; | |
4279 return 1; | |
4280 } | |
4281 | |
4282 /* bzzzt! */ | |
4283 return 0; | |
4284 } | |
4285 | |
4286 | |
4287 | |
4288 /**** directionality ****/ | |
4289 | |
4290 case ISO_ESC_5_11: /* ISO6429 direction control */ | |
4291 if (c == ']') | |
4292 { | |
4293 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L); | |
4294 goto directionality; | |
4295 } | |
4296 if (c == '0') iso->esc = ISO_ESC_5_11_0; | |
4297 else if (c == '1') iso->esc = ISO_ESC_5_11_1; | |
4298 else if (c == '2') iso->esc = ISO_ESC_5_11_2; | |
4299 else return 0; | |
4300 goto not_done; | |
4301 | |
4302 case ISO_ESC_5_11_0: | |
4303 if (c == ']') | |
4304 { | |
4305 *flags &= (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L); | |
4306 goto directionality; | |
4307 } | |
4308 return 0; | |
4309 | |
4310 case ISO_ESC_5_11_1: | |
4311 if (c == ']') | |
4312 { | |
4313 *flags = (CODING_STATE_ISO2022_LOCK & ~CODING_STATE_R2L); | |
4314 goto directionality; | |
4315 } | |
4316 return 0; | |
4317 | |
4318 case ISO_ESC_5_11_2: | |
4319 if (c == ']') | |
4320 { | |
4321 *flags = (*flags & CODING_STATE_ISO2022_LOCK) | CODING_STATE_R2L; | |
4322 goto directionality; | |
4323 } | |
4324 return 0; | |
4325 | |
4326 directionality: | |
4327 iso->esc = ISO_ESC_DIRECTIONALITY; | |
4328 /* Various junk here to attempt to preserve the direction sequences | |
4329 literally in the text if they would otherwise be swallowed due | |
4330 to invalid designations that don't show up as actual charset | |
4331 changes in the text. */ | |
4332 if (iso->invalid_switch_dir) | |
4333 { | |
4334 /* We already inserted a direction switch literally into the | |
4335 text. We assume (#### this may not be right) that the | |
4336 next direction switch is the one going the other way, | |
4337 and we need to output that literally as well. */ | |
4338 iso->output_literally = 1; | |
4339 iso->invalid_switch_dir = 0; | |
4340 } | |
4341 else | |
4342 { | |
4343 int jj; | |
4344 | |
4345 /* If we are in the thrall of an invalid designation, | |
4346 then stick the directionality sequence literally into the | |
4347 output stream so it ends up in the original text again. */ | |
4348 for (jj = 0; jj < 4; jj++) | |
4349 if (iso->invalid_designated[jj]) | |
4350 break; | |
4351 if (jj < 4) | |
4352 { | |
4353 iso->output_literally = 1; | |
4354 iso->invalid_switch_dir = 1; | |
4355 } | |
4356 else | |
4357 /* Indicate that we haven't yet seen a valid designation, | |
4358 so that if a switch-dir is directly followed by an | |
4359 invalid designation, both get inserted literally. */ | |
4360 iso->switched_dir_and_no_valid_charset_yet = 1; | |
4361 } | |
4362 return 1; | |
4363 | |
4364 | |
4365 /**** designation ****/ | |
4366 | |
4367 case ISO_ESC_2_4: | |
4368 if (0x28 <= c && c <= 0x2F) | |
4369 { | |
4370 iso->esc = (enum iso_esc_flag) (c - 0x28 + ISO_ESC_2_4_8); | |
4371 goto not_done; | |
4372 } | |
4373 if (0x40 <= c && c <= 0x42) | |
4374 { | |
4375 cs = CHARSET_BY_ATTRIBUTES (CHARSET_TYPE_94X94, c, | |
4376 *flags & CODING_STATE_R2L ? | |
4377 CHARSET_RIGHT_TO_LEFT : | |
4378 CHARSET_LEFT_TO_RIGHT); | |
4379 reg = 0; | |
4380 goto designated; | |
4381 } | |
4382 return 0; | |
4383 | |
4384 default: | |
4385 { | |
4386 int type =-1; | |
4387 | |
4388 if (c < '0' || c > '~') | |
4389 return 0; /* bad final byte */ | |
4390 | |
4391 if (iso->esc >= ISO_ESC_2_8 && | |
4392 iso->esc <= ISO_ESC_2_15) | |
4393 { | |
4394 type = ((iso->esc >= ISO_ESC_2_12) ? | |
4395 CHARSET_TYPE_96 : CHARSET_TYPE_94); | |
4396 reg = (iso->esc - ISO_ESC_2_8) & 3; | |
4397 } | |
4398 else if (iso->esc >= ISO_ESC_2_4_8 && | |
4399 iso->esc <= ISO_ESC_2_4_15) | |
4400 { | |
4401 type = ((iso->esc >= ISO_ESC_2_4_12) ? | |
4402 CHARSET_TYPE_96X96 : CHARSET_TYPE_94X94); | |
4403 reg = (iso->esc - ISO_ESC_2_4_8) & 3; | |
4404 } | |
4405 else | |
4406 { | |
4407 /* Can this ever be reached? -slb */ | |
4408 abort(); | |
4409 } | |
4410 | |
4411 cs = CHARSET_BY_ATTRIBUTES (type, c, | |
4412 *flags & CODING_STATE_R2L ? | |
4413 CHARSET_RIGHT_TO_LEFT : | |
4414 CHARSET_LEFT_TO_RIGHT); | |
4415 goto designated; | |
4416 } | |
4417 } | |
4418 | |
4419 not_done: | |
4420 iso->esc_bytes[iso->esc_bytes_index++] = (unsigned char) c; | |
4421 return -1; | |
4422 | |
4423 single_shift: | |
4424 if (check_invalid_charsets && !CHARSETP (iso->charset[reg])) | |
4425 /* can't invoke something that ain't there. */ | |
4426 return 0; | |
4427 iso->esc = ISO_ESC_SINGLE_SHIFT; | |
4428 *flags &= CODING_STATE_ISO2022_LOCK; | |
4429 if (reg == 2) | |
4430 *flags |= CODING_STATE_SS2; | |
4431 else | |
4432 *flags |= CODING_STATE_SS3; | |
4433 return 1; | |
4434 | |
4435 locking_shift: | |
4436 if (check_invalid_charsets && | |
4437 !CHARSETP (iso->charset[reg])) | |
4438 /* can't invoke something that ain't there. */ | |
4439 return 0; | |
4440 if (half) | |
4441 iso->register_right = reg; | |
4442 else | |
4443 iso->register_left = reg; | |
4444 *flags &= CODING_STATE_ISO2022_LOCK; | |
4445 iso->esc = ISO_ESC_LOCKING_SHIFT; | |
4446 return 1; | |
4447 | |
4448 designated: | |
4449 if (NILP (cs) && check_invalid_charsets) | |
4450 { | |
4451 iso->invalid_designated[reg] = 1; | |
4452 iso->charset[reg] = Vcharset_ascii; | |
4453 iso->esc = ISO_ESC_DESIGNATE; | |
4454 *flags &= CODING_STATE_ISO2022_LOCK; | |
4455 iso->output_literally = 1; | |
4456 if (iso->switched_dir_and_no_valid_charset_yet) | |
4457 { | |
4458 /* We encountered a switch-direction followed by an | |
4459 invalid designation. Ensure that the switch-direction | |
4460 gets outputted; otherwise it will probably get eaten | |
4461 when the text is written out again. */ | |
4462 iso->switched_dir_and_no_valid_charset_yet = 0; | |
4463 iso->output_direction_sequence = 1; | |
4464 /* And make sure that the switch-dir going the other | |
4465 way gets outputted, as well. */ | |
4466 iso->invalid_switch_dir = 1; | |
4467 } | |
4468 return 1; | |
4469 } | |
4470 /* This function is called with CODESYS equal to nil when | |
4471 doing coding-system detection. */ | |
4472 if (!NILP (codesys)) | |
4473 { | |
4474 charset_conversion_spec_dynarr *dyn = | |
4475 XCODING_SYSTEM (codesys)->iso2022.input_conv; | |
4476 | |
4477 if (dyn) | |
4478 { | |
4479 int i; | |
4480 | |
4481 for (i = 0; i < Dynarr_length (dyn); i++) | |
4482 { | |
4483 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i); | |
4484 if (EQ (cs, spec->from_charset)) | |
4485 cs = spec->to_charset; | |
4486 } | |
4487 } | |
4488 } | |
4489 | |
4490 iso->charset[reg] = cs; | |
4491 iso->esc = ISO_ESC_DESIGNATE; | |
4492 *flags &= CODING_STATE_ISO2022_LOCK; | |
4493 if (iso->invalid_designated[reg]) | |
4494 { | |
4495 iso->invalid_designated[reg] = 0; | |
4496 iso->output_literally = 1; | |
4497 } | |
4498 if (iso->switched_dir_and_no_valid_charset_yet) | |
4499 iso->switched_dir_and_no_valid_charset_yet = 0; | |
4500 return 1; | |
4501 } | |
4502 | |
4503 static int | |
4504 detect_coding_iso2022 (struct detection_state *st, CONST unsigned char *src, | |
4505 unsigned int n) | |
4506 { | |
4507 int mask; | |
4508 | |
4509 /* #### There are serious deficiencies in the recognition mechanism | |
4510 here. This needs to be much smarter if it's going to cut it. | |
4511 The sequence "\xff\x0f" is currently detected as LOCK_SHIFT while | |
4512 it should be detected as Latin-1. | |
4513 All the ISO2022 stuff in this file should be synced up with the | |
4514 code from FSF Emacs-20.4, in which Mule should be more or less stable. | |
4515 Perhaps we should wait till R2L works in FSF Emacs? */ | |
4516 | |
4517 if (!st->iso2022.initted) | |
4518 { | |
4519 reset_iso2022 (Qnil, &st->iso2022.iso); | |
4520 st->iso2022.mask = (CODING_CATEGORY_ISO_7_MASK | | |
4521 CODING_CATEGORY_ISO_8_DESIGNATE_MASK | | |
4522 CODING_CATEGORY_ISO_8_1_MASK | | |
4523 CODING_CATEGORY_ISO_8_2_MASK | | |
4524 CODING_CATEGORY_ISO_LOCK_SHIFT_MASK); | |
4525 st->iso2022.flags = 0; | |
4526 st->iso2022.high_byte_count = 0; | |
4527 st->iso2022.saw_single_shift = 0; | |
4528 st->iso2022.initted = 1; | |
4529 } | |
4530 | |
4531 mask = st->iso2022.mask; | |
4532 | |
4533 while (n--) | |
4534 { | |
4535 int c = *src++; | |
4536 if (c >= 0xA0) | |
4537 { | |
4538 mask &= ~CODING_CATEGORY_ISO_7_MASK; | |
4539 st->iso2022.high_byte_count++; | |
4540 } | |
4541 else | |
4542 { | |
4543 if (st->iso2022.high_byte_count && !st->iso2022.saw_single_shift) | |
4544 { | |
4545 if (st->iso2022.high_byte_count & 1) | |
4546 /* odd number of high bytes; assume not iso-8-2 */ | |
4547 mask &= ~CODING_CATEGORY_ISO_8_2_MASK; | |
4548 } | |
4549 st->iso2022.high_byte_count = 0; | |
4550 st->iso2022.saw_single_shift = 0; | |
4551 if (c > 0x80) | |
4552 mask &= ~CODING_CATEGORY_ISO_7_MASK; | |
4553 } | |
4554 if (!(st->iso2022.flags & CODING_STATE_ESCAPE) | |
4555 && (BYTE_C0_P (c) || BYTE_C1_P (c))) | |
4556 { /* control chars */ | |
4557 switch (c) | |
4558 { | |
4559 /* Allow and ignore control characters that you might | |
4560 reasonably see in a text file */ | |
4561 case '\r': | |
4562 case '\n': | |
4563 case '\t': | |
4564 case 7: /* bell */ | |
4565 case 8: /* backspace */ | |
4566 case 11: /* vertical tab */ | |
4567 case 12: /* form feed */ | |
4568 case 26: /* MS-DOS C-z junk */ | |
4569 case 31: /* '^_' -- for info */ | |
4570 goto label_continue_loop; | |
4571 | |
4572 default: | |
4573 break; | |
4574 } | |
4575 } | |
4576 | |
4577 if ((st->iso2022.flags & CODING_STATE_ESCAPE) || BYTE_C0_P (c) | |
4578 || BYTE_C1_P (c)) | |
4579 { | |
4580 if (parse_iso2022_esc (Qnil, &st->iso2022.iso, c, | |
4581 &st->iso2022.flags, 0)) | |
4582 { | |
4583 switch (st->iso2022.iso.esc) | |
4584 { | |
4585 case ISO_ESC_DESIGNATE: | |
4586 mask &= ~CODING_CATEGORY_ISO_8_1_MASK; | |
4587 mask &= ~CODING_CATEGORY_ISO_8_2_MASK; | |
4588 break; | |
4589 case ISO_ESC_LOCKING_SHIFT: | |
4590 mask = CODING_CATEGORY_ISO_LOCK_SHIFT_MASK; | |
4591 goto ran_out_of_chars; | |
4592 case ISO_ESC_SINGLE_SHIFT: | |
4593 mask &= ~CODING_CATEGORY_ISO_8_DESIGNATE_MASK; | |
4594 st->iso2022.saw_single_shift = 1; | |
4595 break; | |
4596 default: | |
4597 break; | |
4598 } | |
4599 } | |
4600 else | |
4601 { | |
4602 mask = 0; | |
4603 goto ran_out_of_chars; | |
4604 } | |
4605 } | |
4606 label_continue_loop:; | |
4607 } | |
4608 | |
4609 ran_out_of_chars: | |
4610 | |
4611 return mask; | |
4612 } | |
4613 | |
4614 static int | |
4615 postprocess_iso2022_mask (int mask) | |
4616 { | |
4617 /* #### kind of cheesy */ | |
4618 /* If seven-bit ISO is allowed, then assume that the encoding is | |
4619 entirely seven-bit and turn off the eight-bit ones. */ | |
4620 if (mask & CODING_CATEGORY_ISO_7_MASK) | |
4621 mask &= ~ (CODING_CATEGORY_ISO_8_DESIGNATE_MASK | | |
4622 CODING_CATEGORY_ISO_8_1_MASK | | |
4623 CODING_CATEGORY_ISO_8_2_MASK); | |
4624 return mask; | |
4625 } | |
4626 | |
4627 /* If FLAGS is a null pointer or specifies right-to-left motion, | |
4628 output a switch-dir-to-left-to-right sequence to DST. | |
4629 Also update FLAGS if it is not a null pointer. | |
4630 If INTERNAL_P is set, we are outputting in internal format and | |
4631 need to handle the CSI differently. */ | |
4632 | |
4633 static void | |
4634 restore_left_to_right_direction (Lisp_Coding_System *codesys, | |
4635 unsigned_char_dynarr *dst, | |
4636 unsigned int *flags, | |
4637 int internal_p) | |
4638 { | |
4639 if (!flags || (*flags & CODING_STATE_R2L)) | |
4640 { | |
4641 if (CODING_SYSTEM_ISO2022_SEVEN (codesys)) | |
4642 { | |
4643 Dynarr_add (dst, ISO_CODE_ESC); | |
4644 Dynarr_add (dst, '['); | |
4645 } | |
4646 else if (internal_p) | |
4647 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst); | |
4648 else | |
4649 Dynarr_add (dst, ISO_CODE_CSI); | |
4650 Dynarr_add (dst, '0'); | |
4651 Dynarr_add (dst, ']'); | |
4652 if (flags) | |
4653 *flags &= ~CODING_STATE_R2L; | |
4654 } | |
4655 } | |
4656 | |
4657 /* If FLAGS is a null pointer or specifies a direction different from | |
4658 DIRECTION (which should be either CHARSET_RIGHT_TO_LEFT or | |
4659 CHARSET_LEFT_TO_RIGHT), output the appropriate switch-dir escape | |
4660 sequence to DST. Also update FLAGS if it is not a null pointer. | |
4661 If INTERNAL_P is set, we are outputting in internal format and | |
4662 need to handle the CSI differently. */ | |
4663 | |
4664 static void | |
4665 ensure_correct_direction (int direction, Lisp_Coding_System *codesys, | |
4666 unsigned_char_dynarr *dst, unsigned int *flags, | |
4667 int internal_p) | |
4668 { | |
4669 if ((!flags || (*flags & CODING_STATE_R2L)) && | |
4670 direction == CHARSET_LEFT_TO_RIGHT) | |
4671 restore_left_to_right_direction (codesys, dst, flags, internal_p); | |
4672 else if (!CODING_SYSTEM_ISO2022_NO_ISO6429 (codesys) | |
4673 && (!flags || !(*flags & CODING_STATE_R2L)) && | |
4674 direction == CHARSET_RIGHT_TO_LEFT) | |
4675 { | |
4676 if (CODING_SYSTEM_ISO2022_SEVEN (codesys)) | |
4677 { | |
4678 Dynarr_add (dst, ISO_CODE_ESC); | |
4679 Dynarr_add (dst, '['); | |
4680 } | |
4681 else if (internal_p) | |
4682 DECODE_ADD_BINARY_CHAR (ISO_CODE_CSI, dst); | |
4683 else | |
4684 Dynarr_add (dst, ISO_CODE_CSI); | |
4685 Dynarr_add (dst, '2'); | |
4686 Dynarr_add (dst, ']'); | |
4687 if (flags) | |
4688 *flags |= CODING_STATE_R2L; | |
4689 } | |
4690 } | |
4691 | |
4692 /* Convert ISO2022-format data to internal format. */ | |
4693 | |
4694 static void | |
4695 decode_coding_iso2022 (Lstream *decoding, CONST unsigned char *src, | |
4696 unsigned_char_dynarr *dst, unsigned int n) | |
4697 { | |
4698 struct decoding_stream *str = DECODING_STREAM_DATA (decoding); | |
4699 unsigned int flags = str->flags; | |
4700 unsigned int ch = str->ch; | |
4701 eol_type_t eol_type = str->eol_type; | |
4702 #ifdef ENABLE_COMPOSITE_CHARS | |
4703 unsigned_char_dynarr *real_dst = dst; | |
4704 #endif | |
4705 Lisp_Object coding_system; | |
4706 | |
4707 XSETCODING_SYSTEM (coding_system, str->codesys); | |
4708 | |
4709 #ifdef ENABLE_COMPOSITE_CHARS | |
4710 if (flags & CODING_STATE_COMPOSITE) | |
4711 dst = str->iso2022.composite_chars; | |
4712 #endif /* ENABLE_COMPOSITE_CHARS */ | |
4713 | |
4714 while (n--) | |
4715 { | |
4716 unsigned char c = *src++; | |
4717 if (flags & CODING_STATE_ESCAPE) | |
4718 { /* Within ESC sequence */ | |
4719 int retval = parse_iso2022_esc (coding_system, &str->iso2022, | |
4720 c, &flags, 1); | |
4721 | |
4722 if (retval) | |
4723 { | |
4724 switch (str->iso2022.esc) | |
4725 { | |
4726 #ifdef ENABLE_COMPOSITE_CHARS | |
4727 case ISO_ESC_START_COMPOSITE: | |
4728 if (str->iso2022.composite_chars) | |
4729 Dynarr_reset (str->iso2022.composite_chars); | |
4730 else | |
4731 str->iso2022.composite_chars = Dynarr_new (unsigned_char); | |
4732 dst = str->iso2022.composite_chars; | |
4733 break; | |
4734 case ISO_ESC_END_COMPOSITE: | |
4735 { | |
4736 Bufbyte comstr[MAX_EMCHAR_LEN]; | |
4737 Bytecount len; | |
4738 Emchar emch = lookup_composite_char (Dynarr_atp (dst, 0), | |
4739 Dynarr_length (dst)); | |
4740 dst = real_dst; | |
4741 len = set_charptr_emchar (comstr, emch); | |
4742 Dynarr_add_many (dst, comstr, len); | |
4743 break; | |
4744 } | |
4745 #endif /* ENABLE_COMPOSITE_CHARS */ | |
4746 | |
4747 case ISO_ESC_LITERAL: | |
4748 DECODE_ADD_BINARY_CHAR (c, dst); | |
4749 break; | |
4750 | |
4751 default: | |
4752 /* Everything else handled already */ | |
4753 break; | |
4754 } | |
4755 } | |
4756 | |
4757 /* Attempted error recovery. */ | |
4758 if (str->iso2022.output_direction_sequence) | |
4759 ensure_correct_direction (flags & CODING_STATE_R2L ? | |
4760 CHARSET_RIGHT_TO_LEFT : | |
4761 CHARSET_LEFT_TO_RIGHT, | |
4762 str->codesys, dst, 0, 1); | |
4763 /* More error recovery. */ | |
4764 if (!retval || str->iso2022.output_literally) | |
4765 { | |
4766 /* Output the (possibly invalid) sequence */ | |
4767 int i; | |
4768 for (i = 0; i < str->iso2022.esc_bytes_index; i++) | |
4769 DECODE_ADD_BINARY_CHAR (str->iso2022.esc_bytes[i], dst); | |
4770 flags &= CODING_STATE_ISO2022_LOCK; | |
4771 if (!retval) | |
4772 n++, src--;/* Repeat the loop with the same character. */ | |
4773 else | |
4774 { | |
4775 /* No sense in reprocessing the final byte of the | |
4776 escape sequence; it could mess things up anyway. | |
4777 Just add it now. */ | |
4778 DECODE_ADD_BINARY_CHAR (c, dst); | |
4779 } | |
4780 } | |
4781 ch = 0; | |
4782 } | |
4783 else if (BYTE_C0_P (c) || BYTE_C1_P (c)) | |
4784 { /* Control characters */ | |
4785 | |
4786 /***** Error-handling *****/ | |
4787 | |
4788 /* If we were in the middle of a character, dump out the | |
4789 partial character. */ | |
4790 DECODE_OUTPUT_PARTIAL_CHAR (ch); | |
4791 | |
4792 /* If we just saw a single-shift character, dump it out. | |
4793 This may dump out the wrong sort of single-shift character, | |
4794 but least it will give an indication that something went | |
4795 wrong. */ | |
4796 if (flags & CODING_STATE_SS2) | |
4797 { | |
4798 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS2, dst); | |
4799 flags &= ~CODING_STATE_SS2; | |
4800 } | |
4801 if (flags & CODING_STATE_SS3) | |
4802 { | |
4803 DECODE_ADD_BINARY_CHAR (ISO_CODE_SS3, dst); | |
4804 flags &= ~CODING_STATE_SS3; | |
4805 } | |
4806 | |
4807 /***** Now handle the control characters. *****/ | |
4808 | |
4809 /* Handle CR/LF */ | |
4810 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst); | |
4811 | |
4812 flags &= CODING_STATE_ISO2022_LOCK; | |
4813 | |
4814 if (!parse_iso2022_esc (coding_system, &str->iso2022, c, &flags, 1)) | |
4815 DECODE_ADD_BINARY_CHAR (c, dst); | |
4816 } | |
4817 else | |
4818 { /* Graphic characters */ | |
4819 Lisp_Object charset; | |
4820 int lb; | |
4821 int reg; | |
4822 | |
4823 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst); | |
4824 | |
4825 /* Now determine the charset. */ | |
4826 reg = ((flags & CODING_STATE_SS2) ? 2 | |
4827 : (flags & CODING_STATE_SS3) ? 3 | |
4828 : !BYTE_ASCII_P (c) ? str->iso2022.register_right | |
4829 : str->iso2022.register_left); | |
4830 charset = str->iso2022.charset[reg]; | |
4831 | |
4832 /* Error checking: */ | |
4833 if (! CHARSETP (charset) | |
4834 || str->iso2022.invalid_designated[reg] | |
4835 || (((c & 0x7F) == ' ' || (c & 0x7F) == ISO_CODE_DEL) | |
4836 && XCHARSET_CHARS (charset) == 94)) | |
4837 /* Mrmph. We are trying to invoke a register that has no | |
4838 or an invalid charset in it, or trying to add a character | |
4839 outside the range of the charset. Insert that char literally | |
4840 to preserve it for the output. */ | |
4841 { | |
4842 DECODE_OUTPUT_PARTIAL_CHAR (ch); | |
4843 DECODE_ADD_BINARY_CHAR (c, dst); | |
4844 } | |
4845 | |
4846 else | |
4847 { | |
4848 /* Things are probably hunky-dorey. */ | |
4849 | |
4850 /* Fetch reverse charset, maybe. */ | |
4851 if (((flags & CODING_STATE_R2L) && | |
4852 XCHARSET_DIRECTION (charset) == CHARSET_LEFT_TO_RIGHT) | |
4853 || | |
4854 (!(flags & CODING_STATE_R2L) && | |
4855 XCHARSET_DIRECTION (charset) == CHARSET_RIGHT_TO_LEFT)) | |
4856 { | |
4857 Lisp_Object new_charset = | |
4858 XCHARSET_REVERSE_DIRECTION_CHARSET (charset); | |
4859 if (!NILP (new_charset)) | |
4860 charset = new_charset; | |
4861 } | |
4862 | |
4863 lb = XCHARSET_LEADING_BYTE (charset); | |
4864 switch (XCHARSET_REP_BYTES (charset)) | |
4865 { | |
4866 case 1: /* ASCII */ | |
4867 DECODE_OUTPUT_PARTIAL_CHAR (ch); | |
4868 Dynarr_add (dst, c & 0x7F); | |
4869 break; | |
4870 | |
4871 case 2: /* one-byte official */ | |
4872 DECODE_OUTPUT_PARTIAL_CHAR (ch); | |
4873 Dynarr_add (dst, lb); | |
4874 Dynarr_add (dst, c | 0x80); | |
4875 break; | |
4876 | |
4877 case 3: /* one-byte private or two-byte official */ | |
4878 if (XCHARSET_PRIVATE_P (charset)) | |
4879 { | |
4880 DECODE_OUTPUT_PARTIAL_CHAR (ch); | |
4881 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_1); | |
4882 Dynarr_add (dst, lb); | |
4883 Dynarr_add (dst, c | 0x80); | |
4884 } | |
4885 else | |
4886 { | |
4887 if (ch) | |
4888 { | |
4889 Dynarr_add (dst, lb); | |
4890 Dynarr_add (dst, ch | 0x80); | |
4891 Dynarr_add (dst, c | 0x80); | |
4892 ch = 0; | |
4893 } | |
4894 else | |
4895 ch = c; | |
4896 } | |
4897 break; | |
4898 | |
4899 default: /* two-byte private */ | |
4900 if (ch) | |
4901 { | |
4902 Dynarr_add (dst, PRE_LEADING_BYTE_PRIVATE_2); | |
4903 Dynarr_add (dst, lb); | |
4904 Dynarr_add (dst, ch | 0x80); | |
4905 Dynarr_add (dst, c | 0x80); | |
4906 ch = 0; | |
4907 } | |
4908 else | |
4909 ch = c; | |
4910 } | |
4911 } | |
4912 | |
4913 if (!ch) | |
4914 flags &= CODING_STATE_ISO2022_LOCK; | |
4915 } | |
4916 | |
4917 label_continue_loop:; | |
4918 } | |
4919 | |
4920 if (flags & CODING_STATE_END) | |
4921 DECODE_OUTPUT_PARTIAL_CHAR (ch); | |
4922 | |
4923 str->flags = flags; | |
4924 str->ch = ch; | |
4925 } | |
4926 | |
4927 | |
4928 /***** ISO2022 encoder *****/ | |
4929 | |
4930 /* Designate CHARSET into register REG. */ | |
4931 | |
4932 static void | |
4933 iso2022_designate (Lisp_Object charset, unsigned char reg, | |
4934 struct encoding_stream *str, unsigned_char_dynarr *dst) | |
4935 { | |
4936 static CONST char inter94[] = "()*+"; | |
4937 static CONST char inter96[] = ",-./"; | |
4938 unsigned int type; | |
4939 unsigned char final; | |
4940 Lisp_Object old_charset = str->iso2022.charset[reg]; | |
4941 | |
4942 str->iso2022.charset[reg] = charset; | |
4943 if (!CHARSETP (charset)) | |
4944 /* charset might be an initial nil or t. */ | |
4945 return; | |
4946 type = XCHARSET_TYPE (charset); | |
4947 final = XCHARSET_FINAL (charset); | |
4948 if (!str->iso2022.force_charset_on_output[reg] && | |
4949 CHARSETP (old_charset) && | |
4950 XCHARSET_TYPE (old_charset) == type && | |
4951 XCHARSET_FINAL (old_charset) == final) | |
4952 return; | |
4953 | |
4954 str->iso2022.force_charset_on_output[reg] = 0; | |
4955 | |
4956 { | |
4957 charset_conversion_spec_dynarr *dyn = | |
4958 str->codesys->iso2022.output_conv; | |
4959 | |
4960 if (dyn) | |
4961 { | |
4962 int i; | |
4963 | |
4964 for (i = 0; i < Dynarr_length (dyn); i++) | |
4965 { | |
4966 struct charset_conversion_spec *spec = Dynarr_atp (dyn, i); | |
4967 if (EQ (charset, spec->from_charset)) | |
4968 charset = spec->to_charset; | |
4969 } | |
4970 } | |
4971 } | |
4972 | |
4973 Dynarr_add (dst, ISO_CODE_ESC); | |
4974 switch (type) | |
4975 { | |
4976 case CHARSET_TYPE_94: | |
4977 Dynarr_add (dst, inter94[reg]); | |
4978 break; | |
4979 case CHARSET_TYPE_96: | |
4980 Dynarr_add (dst, inter96[reg]); | |
4981 break; | |
4982 case CHARSET_TYPE_94X94: | |
4983 Dynarr_add (dst, '$'); | |
4984 if (reg != 0 | |
4985 || !(CODING_SYSTEM_ISO2022_SHORT (str->codesys)) | |
4986 || final < '@' | |
4987 || final > 'B') | |
4988 Dynarr_add (dst, inter94[reg]); | |
4989 break; | |
4990 case CHARSET_TYPE_96X96: | |
4991 Dynarr_add (dst, '$'); | |
4992 Dynarr_add (dst, inter96[reg]); | |
4993 break; | |
4994 } | |
4995 Dynarr_add (dst, final); | |
4996 } | |
4997 | |
4998 static void | |
4999 ensure_normal_shift (struct encoding_stream *str, unsigned_char_dynarr *dst) | |
5000 { | |
5001 if (str->iso2022.register_left != 0) | |
5002 { | |
5003 Dynarr_add (dst, ISO_CODE_SI); | |
5004 str->iso2022.register_left = 0; | |
5005 } | |
5006 } | |
5007 | |
5008 static void | |
5009 ensure_shift_out (struct encoding_stream *str, unsigned_char_dynarr *dst) | |
5010 { | |
5011 if (str->iso2022.register_left != 1) | |
5012 { | |
5013 Dynarr_add (dst, ISO_CODE_SO); | |
5014 str->iso2022.register_left = 1; | |
5015 } | |
5016 } | |
5017 | |
5018 /* Convert internally-formatted data to ISO2022 format. */ | |
5019 | |
5020 static void | |
5021 encode_coding_iso2022 (Lstream *encoding, CONST unsigned char *src, | |
5022 unsigned_char_dynarr *dst, unsigned int n) | |
5023 { | |
5024 unsigned char charmask, c; | |
5025 unsigned char char_boundary; | |
5026 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); | |
5027 unsigned int flags = str->flags; | |
5028 unsigned int ch = str->ch; | |
5029 Lisp_Coding_System *codesys = str->codesys; | |
5030 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); | |
5031 int i; | |
5032 Lisp_Object charset; | |
5033 int half; | |
5034 | |
5035 #ifdef ENABLE_COMPOSITE_CHARS | |
5036 /* flags for handling composite chars. We do a little switcharoo | |
5037 on the source while we're outputting the composite char. */ | |
5038 unsigned int saved_n = 0; | |
5039 CONST unsigned char *saved_src = NULL; | |
5040 int in_composite = 0; | |
5041 #endif /* ENABLE_COMPOSITE_CHARS */ | |
5042 | |
5043 char_boundary = str->iso2022.current_char_boundary; | |
5044 charset = str->iso2022.current_charset; | |
5045 half = str->iso2022.current_half; | |
5046 | |
5047 #ifdef ENABLE_COMPOSITE_CHARS | |
5048 back_to_square_n: | |
5049 #endif | |
5050 while (n--) | |
5051 { | |
5052 c = *src++; | |
5053 | |
5054 if (BYTE_ASCII_P (c)) | |
5055 { /* Processing ASCII character */ | |
5056 ch = 0; | |
5057 | |
5058 restore_left_to_right_direction (codesys, dst, &flags, 0); | |
5059 | |
5060 /* Make sure G0 contains ASCII */ | |
5061 if ((c > ' ' && c < ISO_CODE_DEL) || | |
5062 !CODING_SYSTEM_ISO2022_NO_ASCII_CNTL (codesys)) | |
5063 { | |
5064 ensure_normal_shift (str, dst); | |
5065 iso2022_designate (Vcharset_ascii, 0, str, dst); | |
5066 } | |
5067 | |
5068 /* If necessary, restore everything to the default state | |
5069 at end-of-line */ | |
5070 if (c == '\n' && | |
5071 !(CODING_SYSTEM_ISO2022_NO_ASCII_EOL (codesys))) | |
5072 { | |
5073 restore_left_to_right_direction (codesys, dst, &flags, 0); | |
5074 | |
5075 ensure_normal_shift (str, dst); | |
5076 | |
5077 for (i = 0; i < 4; i++) | |
5078 { | |
5079 Lisp_Object initial_charset = | |
5080 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i); | |
5081 iso2022_designate (initial_charset, i, str, dst); | |
5082 } | |
5083 } | |
5084 if (c == '\n') | |
5085 { | |
5086 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT) | |
5087 Dynarr_add (dst, '\r'); | |
5088 if (eol_type != EOL_CR) | |
5089 Dynarr_add (dst, c); | |
5090 } | |
5091 else | |
5092 { | |
5093 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys) | |
5094 && fit_to_be_escape_quoted (c)) | |
5095 Dynarr_add (dst, ISO_CODE_ESC); | |
5096 Dynarr_add (dst, c); | |
5097 } | |
5098 char_boundary = 1; | |
5099 } | |
5100 | |
5101 else if (BUFBYTE_LEADING_BYTE_P (c) || BUFBYTE_LEADING_BYTE_P (ch)) | |
5102 { /* Processing Leading Byte */ | |
5103 ch = 0; | |
5104 charset = CHARSET_BY_LEADING_BYTE (c); | |
5105 if (LEADING_BYTE_PREFIX_P(c)) | |
5106 ch = c; | |
5107 else if (!EQ (charset, Vcharset_control_1) | |
5108 #ifdef ENABLE_COMPOSITE_CHARS | |
5109 && !EQ (charset, Vcharset_composite) | |
5110 #endif | |
5111 ) | |
5112 { | |
5113 int reg; | |
5114 | |
5115 ensure_correct_direction (XCHARSET_DIRECTION (charset), | |
5116 codesys, dst, &flags, 0); | |
5117 | |
5118 /* Now determine which register to use. */ | |
5119 reg = -1; | |
5120 for (i = 0; i < 4; i++) | |
5121 { | |
5122 if (EQ (charset, str->iso2022.charset[i]) || | |
5123 EQ (charset, | |
5124 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i))) | |
5125 { | |
5126 reg = i; | |
5127 break; | |
5128 } | |
5129 } | |
5130 | |
5131 if (reg == -1) | |
5132 { | |
5133 if (XCHARSET_GRAPHIC (charset) != 0) | |
5134 { | |
5135 if (!NILP (str->iso2022.charset[1]) && | |
5136 (!CODING_SYSTEM_ISO2022_SEVEN (codesys) || | |
5137 CODING_SYSTEM_ISO2022_LOCK_SHIFT (codesys))) | |
5138 reg = 1; | |
5139 else if (!NILP (str->iso2022.charset[2])) | |
5140 reg = 2; | |
5141 else if (!NILP (str->iso2022.charset[3])) | |
5142 reg = 3; | |
5143 else | |
5144 reg = 0; | |
5145 } | |
5146 else | |
5147 reg = 0; | |
5148 } | |
5149 | |
5150 iso2022_designate (charset, reg, str, dst); | |
5151 | |
5152 /* Now invoke that register. */ | |
5153 switch (reg) | |
5154 { | |
5155 case 0: | |
5156 ensure_normal_shift (str, dst); | |
5157 half = 0; | |
5158 break; | |
5159 | |
5160 case 1: | |
5161 if (CODING_SYSTEM_ISO2022_SEVEN (codesys)) | |
5162 { | |
5163 ensure_shift_out (str, dst); | |
5164 half = 0; | |
5165 } | |
5166 else | |
5167 half = 1; | |
5168 break; | |
5169 | |
5170 case 2: | |
5171 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys)) | |
5172 { | |
5173 Dynarr_add (dst, ISO_CODE_ESC); | |
5174 Dynarr_add (dst, 'N'); | |
5175 half = 0; | |
5176 } | |
5177 else | |
5178 { | |
5179 Dynarr_add (dst, ISO_CODE_SS2); | |
5180 half = 1; | |
5181 } | |
5182 break; | |
5183 | |
5184 case 3: | |
5185 if (CODING_SYSTEM_ISO2022_SEVEN (str->codesys)) | |
5186 { | |
5187 Dynarr_add (dst, ISO_CODE_ESC); | |
5188 Dynarr_add (dst, 'O'); | |
5189 half = 0; | |
5190 } | |
5191 else | |
5192 { | |
5193 Dynarr_add (dst, ISO_CODE_SS3); | |
5194 half = 1; | |
5195 } | |
5196 break; | |
5197 | |
5198 default: | |
5199 abort (); | |
5200 } | |
5201 } | |
5202 char_boundary = 0; | |
5203 } | |
5204 else | |
5205 { /* Processing Non-ASCII character */ | |
5206 charmask = (half == 0 ? 0x7F : 0xFF); | |
5207 char_boundary = 1; | |
5208 if (EQ (charset, Vcharset_control_1)) | |
5209 { | |
5210 if (CODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys) | |
5211 && fit_to_be_escape_quoted (c)) | |
5212 Dynarr_add (dst, ISO_CODE_ESC); | |
5213 /* you asked for it ... */ | |
5214 Dynarr_add (dst, c - 0x20); | |
5215 } | |
5216 else | |
5217 { | |
5218 switch (XCHARSET_REP_BYTES (charset)) | |
5219 { | |
5220 case 2: | |
5221 Dynarr_add (dst, c & charmask); | |
5222 break; | |
5223 case 3: | |
5224 if (XCHARSET_PRIVATE_P (charset)) | |
5225 { | |
5226 Dynarr_add (dst, c & charmask); | |
5227 ch = 0; | |
5228 } | |
5229 else if (ch) | |
5230 { | |
5231 #ifdef ENABLE_COMPOSITE_CHARS | |
5232 if (EQ (charset, Vcharset_composite)) | |
5233 { | |
5234 if (in_composite) | |
5235 { | |
5236 /* #### Bother! We don't know how to | |
5237 handle this yet. */ | |
5238 Dynarr_add (dst, '~'); | |
5239 } | |
5240 else | |
5241 { | |
5242 Emchar emch = MAKE_CHAR (Vcharset_composite, | |
5243 ch & 0x7F, c & 0x7F); | |
5244 Lisp_Object lstr = composite_char_string (emch); | |
5245 saved_n = n; | |
5246 saved_src = src; | |
5247 in_composite = 1; | |
5248 src = XSTRING_DATA (lstr); | |
5249 n = XSTRING_LENGTH (lstr); | |
5250 Dynarr_add (dst, ISO_CODE_ESC); | |
5251 Dynarr_add (dst, '0'); /* start composing */ | |
5252 } | |
5253 } | |
5254 else | |
5255 #endif /* ENABLE_COMPOSITE_CHARS */ | |
5256 { | |
5257 Dynarr_add (dst, ch & charmask); | |
5258 Dynarr_add (dst, c & charmask); | |
5259 } | |
5260 ch = 0; | |
5261 } | |
5262 else | |
5263 { | |
5264 ch = c; | |
5265 char_boundary = 0; | |
5266 } | |
5267 break; | |
5268 case 4: | |
5269 if (ch) | |
5270 { | |
5271 Dynarr_add (dst, ch & charmask); | |
5272 Dynarr_add (dst, c & charmask); | |
5273 ch = 0; | |
5274 } | |
5275 else | |
5276 { | |
5277 ch = c; | |
5278 char_boundary = 0; | |
5279 } | |
5280 break; | |
5281 default: | |
5282 abort (); | |
5283 } | |
5284 } | |
5285 } | |
5286 } | |
5287 | |
5288 #ifdef ENABLE_COMPOSITE_CHARS | |
5289 if (in_composite) | |
5290 { | |
5291 n = saved_n; | |
5292 src = saved_src; | |
5293 in_composite = 0; | |
5294 Dynarr_add (dst, ISO_CODE_ESC); | |
5295 Dynarr_add (dst, '1'); /* end composing */ | |
5296 goto back_to_square_n; /* Wheeeeeeeee ..... */ | |
5297 } | |
5298 #endif /* ENABLE_COMPOSITE_CHARS */ | |
5299 | |
5300 if (char_boundary && flags & CODING_STATE_END) | |
5301 { | |
5302 restore_left_to_right_direction (codesys, dst, &flags, 0); | |
5303 ensure_normal_shift (str, dst); | |
5304 for (i = 0; i < 4; i++) | |
5305 { | |
5306 Lisp_Object initial_charset = | |
5307 CODING_SYSTEM_ISO2022_INITIAL_CHARSET (codesys, i); | |
5308 iso2022_designate (initial_charset, i, str, dst); | |
5309 } | |
5310 } | |
5311 | |
5312 str->flags = flags; | |
5313 str->ch = ch; | |
5314 str->iso2022.current_char_boundary = char_boundary; | |
5315 str->iso2022.current_charset = charset; | |
5316 str->iso2022.current_half = half; | |
5317 | |
5318 /* Verbum caro factum est! */ | |
5319 } | |
5320 #endif /* MULE */ | |
5321 | |
5322 /************************************************************************/ | |
5323 /* No-conversion methods */ | |
5324 /************************************************************************/ | |
5325 | |
5326 /* This is used when reading in "binary" files -- i.e. files that may | |
5327 contain all 256 possible byte values and that are not to be | |
5328 interpreted as being in any particular decoding. */ | |
5329 static void | |
5330 decode_coding_no_conversion (Lstream *decoding, CONST unsigned char *src, | |
5331 unsigned_char_dynarr *dst, unsigned int n) | |
5332 { | |
5333 unsigned char c; | |
5334 struct decoding_stream *str = DECODING_STREAM_DATA (decoding); | |
5335 unsigned int flags = str->flags; | |
5336 unsigned int ch = str->ch; | |
5337 eol_type_t eol_type = str->eol_type; | |
5338 | |
5339 while (n--) | |
5340 { | |
5341 c = *src++; | |
5342 | |
5343 DECODE_HANDLE_EOL_TYPE (eol_type, c, flags, dst); | |
5344 DECODE_ADD_BINARY_CHAR (c, dst); | |
5345 label_continue_loop:; | |
5346 } | |
5347 | |
5348 DECODE_HANDLE_END_OF_CONVERSION (flags, ch, dst); | |
5349 | |
5350 str->flags = flags; | |
5351 str->ch = ch; | |
5352 } | |
5353 | |
5354 static void | |
5355 encode_coding_no_conversion (Lstream *encoding, CONST unsigned char *src, | |
5356 unsigned_char_dynarr *dst, unsigned int n) | |
5357 { | |
5358 unsigned char c; | |
5359 struct encoding_stream *str = ENCODING_STREAM_DATA (encoding); | |
5360 unsigned int flags = str->flags; | |
5361 unsigned int ch = str->ch; | |
5362 eol_type_t eol_type = CODING_SYSTEM_EOL_TYPE (str->codesys); | |
5363 | |
5364 while (n--) | |
5365 { | |
5366 c = *src++; | |
5367 if (c == '\n') | |
5368 { | |
5369 if (eol_type != EOL_LF && eol_type != EOL_AUTODETECT) | |
5370 Dynarr_add (dst, '\r'); | |
5371 if (eol_type != EOL_CR) | |
5372 Dynarr_add (dst, '\n'); | |
5373 ch = 0; | |
5374 } | |
5375 else if (BYTE_ASCII_P (c)) | |
5376 { | |
5377 assert (ch == 0); | |
5378 Dynarr_add (dst, c); | |
5379 } | |
5380 else if (BUFBYTE_LEADING_BYTE_P (c)) | |
5381 { | |
5382 assert (ch == 0); | |
5383 if (c == LEADING_BYTE_LATIN_ISO8859_1 || | |
5384 c == LEADING_BYTE_CONTROL_1) | |
5385 ch = c; | |
5386 else | |
5387 Dynarr_add (dst, '~'); /* untranslatable character */ | |
5388 } | |
5389 else | |
5390 { | |
5391 if (ch == LEADING_BYTE_LATIN_ISO8859_1) | |
5392 Dynarr_add (dst, c); | |
5393 else if (ch == LEADING_BYTE_CONTROL_1) | |
5394 { | |
5395 assert (c < 0xC0); | |
5396 Dynarr_add (dst, c - 0x20); | |
5397 } | |
5398 /* else it should be the second or third byte of an | |
5399 untranslatable character, so ignore it */ | |
5400 ch = 0; | |
5401 } | |
5402 } | |
5403 | |
5404 str->flags = flags; | |
5405 str->ch = ch; | |
5406 } | |
5407 | |
5408 | |
5409 /************************************************************************/ | |
5410 /* Simple internal/external functions */ | |
5411 /************************************************************************/ | |
5412 | |
5413 static Extbyte_dynarr *conversion_out_dynarr; | |
5414 static Bufbyte_dynarr *conversion_in_dynarr; | |
5415 | |
5416 /* Determine coding system from coding format */ | |
5417 | |
5418 /* #### not correct for all values of `fmt'! */ | |
5419 static Lisp_Object | |
5420 external_data_format_to_coding_system (enum external_data_format fmt) | |
5421 { | |
5422 switch (fmt) | |
5423 { | |
5424 case FORMAT_FILENAME: | |
5425 case FORMAT_TERMINAL: | |
5426 if (EQ (Vfile_name_coding_system, Qnil) || | |
5427 EQ (Vfile_name_coding_system, Qbinary)) | |
5428 return Qnil; | |
5429 else | |
5430 return Fget_coding_system (Vfile_name_coding_system); | |
5431 #ifdef MULE | |
5432 case FORMAT_CTEXT: | |
5433 return Fget_coding_system (Qctext); | |
5434 #endif | |
5435 default: | |
5436 return Qnil; | |
5437 } | |
5438 } | |
5439 | |
5440 Extbyte * | |
5441 convert_to_external_format (CONST Bufbyte *ptr, | |
5442 Bytecount len, | |
5443 Extcount *len_out, | |
5444 enum external_data_format fmt) | |
5445 { | |
5446 Lisp_Object coding_system = external_data_format_to_coding_system (fmt); | |
5447 | |
5448 if (!conversion_out_dynarr) | |
5449 conversion_out_dynarr = Dynarr_new (Extbyte); | |
5450 else | |
5451 Dynarr_reset (conversion_out_dynarr); | |
5452 | |
5453 if (NILP (coding_system)) | |
5454 { | |
5455 CONST Bufbyte *end = ptr + len; | |
5456 | |
5457 for (; ptr < end;) | |
5458 { | |
5459 Bufbyte c = | |
5460 (BYTE_ASCII_P (*ptr)) ? *ptr : | |
5461 (*ptr == LEADING_BYTE_CONTROL_1) ? (*(ptr+1) - 0x20) : | |
5462 (*ptr == LEADING_BYTE_LATIN_ISO8859_1) ? (*(ptr+1)) : | |
5463 '~'; | |
5464 | |
5465 Dynarr_add (conversion_out_dynarr, (Extbyte) c); | |
5466 INC_CHARPTR (ptr); | |
5467 } | |
5468 | |
5469 #ifdef ERROR_CHECK_BUFPOS | |
5470 assert (ptr == end); | |
5471 #endif | |
5472 } | |
5473 else | |
5474 { | |
5475 Lisp_Object instream, outstream, da_outstream; | |
5476 Lstream *istr, *ostr; | |
5477 struct gcpro gcpro1, gcpro2, gcpro3; | |
5478 char tempbuf[1024]; /* some random amount */ | |
5479 | |
5480 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len); | |
5481 da_outstream = make_dynarr_output_stream | |
5482 ((unsigned_char_dynarr *) conversion_out_dynarr); | |
5483 outstream = | |
5484 make_encoding_output_stream (XLSTREAM (da_outstream), coding_system); | |
5485 istr = XLSTREAM (instream); | |
5486 ostr = XLSTREAM (outstream); | |
5487 GCPRO3 (instream, outstream, da_outstream); | |
5488 while (1) | |
5489 { | |
5490 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf)); | |
5491 if (!size_in_bytes) | |
5492 break; | |
5493 Lstream_write (ostr, tempbuf, size_in_bytes); | |
5494 } | |
5495 Lstream_close (istr); | |
5496 Lstream_close (ostr); | |
5497 UNGCPRO; | |
5498 Lstream_delete (istr); | |
5499 Lstream_delete (ostr); | |
5500 Lstream_delete (XLSTREAM (da_outstream)); | |
5501 } | |
5502 | |
5503 *len_out = Dynarr_length (conversion_out_dynarr); | |
5504 Dynarr_add (conversion_out_dynarr, 0); /* remember to zero-terminate! */ | |
5505 return Dynarr_atp (conversion_out_dynarr, 0); | |
5506 } | |
5507 | |
5508 Bufbyte * | |
5509 convert_from_external_format (CONST Extbyte *ptr, | |
5510 Extcount len, | |
5511 Bytecount *len_out, | |
5512 enum external_data_format fmt) | |
5513 { | |
5514 Lisp_Object coding_system = external_data_format_to_coding_system (fmt); | |
5515 | |
5516 if (!conversion_in_dynarr) | |
5517 conversion_in_dynarr = Dynarr_new (Bufbyte); | |
5518 else | |
5519 Dynarr_reset (conversion_in_dynarr); | |
5520 | |
5521 if (NILP (coding_system)) | |
5522 { | |
5523 CONST Extbyte *end = ptr + len; | |
5524 for (; ptr < end; ptr++) | |
5525 { | |
5526 Extbyte c = *ptr; | |
5527 DECODE_ADD_BINARY_CHAR (c, conversion_in_dynarr); | |
5528 } | |
5529 } | |
5530 else | |
5531 { | |
5532 Lisp_Object instream, outstream, da_outstream; | |
5533 Lstream *istr, *ostr; | |
5534 struct gcpro gcpro1, gcpro2, gcpro3; | |
5535 char tempbuf[1024]; /* some random amount */ | |
5536 | |
5537 instream = make_fixed_buffer_input_stream ((unsigned char *) ptr, len); | |
5538 da_outstream = make_dynarr_output_stream | |
5539 ((unsigned_char_dynarr *) conversion_in_dynarr); | |
5540 outstream = | |
5541 make_decoding_output_stream (XLSTREAM (da_outstream), coding_system); | |
5542 istr = XLSTREAM (instream); | |
5543 ostr = XLSTREAM (outstream); | |
5544 GCPRO3 (instream, outstream, da_outstream); | |
5545 while (1) | |
5546 { | |
5547 ssize_t size_in_bytes = Lstream_read (istr, tempbuf, sizeof (tempbuf)); | |
5548 if (!size_in_bytes) | |
5549 break; | |
5550 Lstream_write (ostr, tempbuf, size_in_bytes); | |
5551 } | |
5552 Lstream_close (istr); | |
5553 Lstream_close (ostr); | |
5554 UNGCPRO; | |
5555 Lstream_delete (istr); | |
5556 Lstream_delete (ostr); | |
5557 Lstream_delete (XLSTREAM (da_outstream)); | |
5558 } | |
5559 | |
5560 *len_out = Dynarr_length (conversion_in_dynarr); | |
5561 Dynarr_add (conversion_in_dynarr, 0); /* remember to zero-terminate! */ | |
5562 return Dynarr_atp (conversion_in_dynarr, 0); | |
5563 } | |
5564 | |
5565 | |
5566 /************************************************************************/ | |
5567 /* Initialization */ | |
5568 /************************************************************************/ | |
5569 | |
5570 void | |
5571 syms_of_file_coding (void) | |
5572 { | |
5573 deferror (&Qcoding_system_error, "coding-system-error", | |
5574 "Coding-system error", Qio_error); | |
5575 | |
5576 DEFSUBR (Fcoding_system_p); | |
5577 DEFSUBR (Ffind_coding_system); | |
5578 DEFSUBR (Fget_coding_system); | |
5579 DEFSUBR (Fcoding_system_list); | |
5580 DEFSUBR (Fcoding_system_name); | |
5581 DEFSUBR (Fmake_coding_system); | |
5582 DEFSUBR (Fcopy_coding_system); | |
5583 DEFSUBR (Fdefine_coding_system_alias); | |
5584 DEFSUBR (Fsubsidiary_coding_system); | |
5585 | |
5586 DEFSUBR (Fcoding_system_type); | |
5587 DEFSUBR (Fcoding_system_doc_string); | |
5588 #ifdef MULE | |
5589 DEFSUBR (Fcoding_system_charset); | |
5590 #endif | |
5591 DEFSUBR (Fcoding_system_property); | |
5592 | |
5593 DEFSUBR (Fcoding_category_list); | |
5594 DEFSUBR (Fset_coding_priority_list); | |
5595 DEFSUBR (Fcoding_priority_list); | |
5596 DEFSUBR (Fset_coding_category_system); | |
5597 DEFSUBR (Fcoding_category_system); | |
5598 | |
5599 DEFSUBR (Fdetect_coding_region); | |
5600 DEFSUBR (Fdecode_coding_region); | |
5601 DEFSUBR (Fencode_coding_region); | |
5602 #ifdef MULE | |
5603 DEFSUBR (Fdecode_shift_jis_char); | |
5604 DEFSUBR (Fencode_shift_jis_char); | |
5605 DEFSUBR (Fdecode_big5_char); | |
5606 DEFSUBR (Fencode_big5_char); | |
5607 DEFSUBR (Fset_ucs_char); | |
5608 DEFSUBR (Fucs_char); | |
5609 DEFSUBR (Fset_char_ucs); | |
5610 DEFSUBR (Fchar_ucs); | |
5611 #endif /* MULE */ | |
5612 defsymbol (&Qcoding_systemp, "coding-system-p"); | |
5613 defsymbol (&Qno_conversion, "no-conversion"); | |
5614 defsymbol (&Qraw_text, "raw-text"); | |
5615 #ifdef MULE | |
5616 defsymbol (&Qbig5, "big5"); | |
5617 defsymbol (&Qshift_jis, "shift-jis"); | |
5618 defsymbol (&Qucs4, "ucs-4"); | |
5619 defsymbol (&Qutf8, "utf-8"); | |
5620 defsymbol (&Qccl, "ccl"); | |
5621 defsymbol (&Qiso2022, "iso2022"); | |
5622 #endif /* MULE */ | |
5623 defsymbol (&Qmnemonic, "mnemonic"); | |
5624 defsymbol (&Qeol_type, "eol-type"); | |
5625 defsymbol (&Qpost_read_conversion, "post-read-conversion"); | |
5626 defsymbol (&Qpre_write_conversion, "pre-write-conversion"); | |
5627 | |
5628 defsymbol (&Qcr, "cr"); | |
5629 defsymbol (&Qlf, "lf"); | |
5630 defsymbol (&Qcrlf, "crlf"); | |
5631 defsymbol (&Qeol_cr, "eol-cr"); | |
5632 defsymbol (&Qeol_lf, "eol-lf"); | |
5633 defsymbol (&Qeol_crlf, "eol-crlf"); | |
5634 #ifdef MULE | |
5635 defsymbol (&Qcharset_g0, "charset-g0"); | |
5636 defsymbol (&Qcharset_g1, "charset-g1"); | |
5637 defsymbol (&Qcharset_g2, "charset-g2"); | |
5638 defsymbol (&Qcharset_g3, "charset-g3"); | |
5639 defsymbol (&Qforce_g0_on_output, "force-g0-on-output"); | |
5640 defsymbol (&Qforce_g1_on_output, "force-g1-on-output"); | |
5641 defsymbol (&Qforce_g2_on_output, "force-g2-on-output"); | |
5642 defsymbol (&Qforce_g3_on_output, "force-g3-on-output"); | |
5643 defsymbol (&Qno_iso6429, "no-iso6429"); | |
5644 defsymbol (&Qinput_charset_conversion, "input-charset-conversion"); | |
5645 defsymbol (&Qoutput_charset_conversion, "output-charset-conversion"); | |
5646 | |
5647 defsymbol (&Qshort, "short"); | |
5648 defsymbol (&Qno_ascii_eol, "no-ascii-eol"); | |
5649 defsymbol (&Qno_ascii_cntl, "no-ascii-cntl"); | |
5650 defsymbol (&Qseven, "seven"); | |
5651 defsymbol (&Qlock_shift, "lock-shift"); | |
5652 defsymbol (&Qescape_quoted, "escape-quoted"); | |
5653 #endif /* MULE */ | |
5654 defsymbol (&Qencode, "encode"); | |
5655 defsymbol (&Qdecode, "decode"); | |
5656 | |
5657 #ifdef MULE | |
5658 defsymbol (&Qctext, "ctext"); | |
5659 defsymbol (&coding_category_symbol[CODING_CATEGORY_SHIFT_JIS], | |
5660 "shift-jis"); | |
5661 defsymbol (&coding_category_symbol[CODING_CATEGORY_BIG5], | |
5662 "big5"); | |
5663 defsymbol (&coding_category_symbol[CODING_CATEGORY_UCS4], | |
5664 "ucs-4"); | |
5665 defsymbol (&coding_category_symbol[CODING_CATEGORY_UTF8], | |
5666 "utf-8"); | |
5667 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_7], | |
5668 "iso-7"); | |
5669 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_DESIGNATE], | |
5670 "iso-8-designate"); | |
5671 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_1], | |
5672 "iso-8-1"); | |
5673 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_8_2], | |
5674 "iso-8-2"); | |
5675 defsymbol (&coding_category_symbol[CODING_CATEGORY_ISO_LOCK_SHIFT], | |
5676 "iso-lock-shift"); | |
5677 #endif /* MULE */ | |
5678 defsymbol (&coding_category_symbol[CODING_CATEGORY_NO_CONVERSION], | |
5679 "no-conversion"); | |
5680 } | |
5681 | |
5682 void | |
5683 lstream_type_create_file_coding (void) | |
5684 { | |
5685 LSTREAM_HAS_METHOD (decoding, reader); | |
5686 LSTREAM_HAS_METHOD (decoding, writer); | |
5687 LSTREAM_HAS_METHOD (decoding, rewinder); | |
5688 LSTREAM_HAS_METHOD (decoding, seekable_p); | |
5689 LSTREAM_HAS_METHOD (decoding, flusher); | |
5690 LSTREAM_HAS_METHOD (decoding, closer); | |
5691 LSTREAM_HAS_METHOD (decoding, marker); | |
5692 | |
5693 LSTREAM_HAS_METHOD (encoding, reader); | |
5694 LSTREAM_HAS_METHOD (encoding, writer); | |
5695 LSTREAM_HAS_METHOD (encoding, rewinder); | |
5696 LSTREAM_HAS_METHOD (encoding, seekable_p); | |
5697 LSTREAM_HAS_METHOD (encoding, flusher); | |
5698 LSTREAM_HAS_METHOD (encoding, closer); | |
5699 LSTREAM_HAS_METHOD (encoding, marker); | |
5700 } | |
5701 | |
5702 void | |
5703 vars_of_file_coding (void) | |
5704 { | |
5705 int i; | |
5706 | |
5707 fcd = xnew (struct file_coding_dump); | |
5708 dumpstruct (&fcd, &fcd_description); | |
5709 | |
5710 /* Initialize to something reasonable ... */ | |
5711 for (i = 0; i <= CODING_CATEGORY_LAST; i++) | |
5712 { | |
5713 fcd->coding_category_system[i] = Qnil; | |
5714 fcd->coding_category_by_priority[i] = i; | |
5715 } | |
5716 | |
5717 Fprovide (intern ("file-coding")); | |
5718 | |
5719 DEFVAR_LISP ("keyboard-coding-system", &Vkeyboard_coding_system /* | |
5720 Coding system used for TTY keyboard input. | |
5721 Not used under a windowing system. | |
5722 */ ); | |
5723 Vkeyboard_coding_system = Qnil; | |
5724 | |
5725 DEFVAR_LISP ("terminal-coding-system", &Vterminal_coding_system /* | |
5726 Coding system used for TTY display output. | |
5727 Not used under a windowing system. | |
5728 */ ); | |
5729 Vterminal_coding_system = Qnil; | |
5730 | |
5731 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read /* | |
5732 Overriding coding system used when writing a file or process. | |
5733 You should *bind* this, not set it. If this is non-nil, it specifies | |
5734 the coding system that will be used when a file or process is read | |
5735 in, and overrides `buffer-file-coding-system-for-read', | |
5736 `insert-file-contents-pre-hook', etc. Use those variables instead of | |
5737 this one for permanent changes to the environment. | |
5738 */ ); | |
5739 Vcoding_system_for_read = Qnil; | |
5740 | |
5741 DEFVAR_LISP ("coding-system-for-write", | |
5742 &Vcoding_system_for_write /* | |
5743 Overriding coding system used when writing a file or process. | |
5744 You should *bind* this, not set it. If this is non-nil, it specifies | |
5745 the coding system that will be used when a file or process is wrote | |
5746 in, and overrides `buffer-file-coding-system', | |
5747 `write-region-pre-hook', etc. Use those variables instead of this one | |
5748 for permanent changes to the environment. | |
5749 */ ); | |
5750 Vcoding_system_for_write = Qnil; | |
5751 | |
5752 DEFVAR_LISP ("file-name-coding-system", &Vfile_name_coding_system /* | |
5753 Coding system used to convert pathnames when accessing files. | |
5754 */ ); | |
5755 Vfile_name_coding_system = Qnil; | |
5756 | |
5757 DEFVAR_BOOL ("enable-multibyte-characters", &enable_multibyte_characters /* | |
5758 Non-nil means the buffer contents are regarded as multi-byte form | |
5759 of characters, not a binary code. This affects the display, file I/O, | |
5760 and behaviors of various editing commands. | |
5761 | |
5762 Setting this to nil does not do anything. | |
5763 */ ); | |
5764 enable_multibyte_characters = 1; | |
5765 } | |
5766 | |
5767 void | |
5768 complex_vars_of_file_coding (void) | |
5769 { | |
5770 staticpro (&Vcoding_system_hash_table); | |
5771 Vcoding_system_hash_table = | |
5772 make_lisp_hash_table (50, HASH_TABLE_NON_WEAK, HASH_TABLE_EQ); | |
5773 | |
5774 the_codesys_prop_dynarr = Dynarr_new (codesys_prop); | |
5775 dumpstruct (&the_codesys_prop_dynarr, &codesys_prop_dynarr_description); | |
5776 | |
5777 #define DEFINE_CODESYS_PROP(Prop_Type, Sym) do \ | |
5778 { \ | |
5779 struct codesys_prop csp; \ | |
5780 csp.sym = (Sym); \ | |
5781 csp.prop_type = (Prop_Type); \ | |
5782 Dynarr_add (the_codesys_prop_dynarr, csp); \ | |
5783 } while (0) | |
5784 | |
5785 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qmnemonic); | |
5786 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_type); | |
5787 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_cr); | |
5788 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_crlf); | |
5789 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qeol_lf); | |
5790 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpost_read_conversion); | |
5791 DEFINE_CODESYS_PROP (CODESYS_PROP_ALL_OK, Qpre_write_conversion); | |
5792 #ifdef MULE | |
5793 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g0); | |
5794 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g1); | |
5795 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g2); | |
5796 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qcharset_g3); | |
5797 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g0_on_output); | |
5798 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g1_on_output); | |
5799 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g2_on_output); | |
5800 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qforce_g3_on_output); | |
5801 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qshort); | |
5802 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_eol); | |
5803 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_ascii_cntl); | |
5804 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qseven); | |
5805 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qlock_shift); | |
5806 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qno_iso6429); | |
5807 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qescape_quoted); | |
5808 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qinput_charset_conversion); | |
5809 DEFINE_CODESYS_PROP (CODESYS_PROP_ISO2022, Qoutput_charset_conversion); | |
5810 | |
5811 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qencode); | |
5812 DEFINE_CODESYS_PROP (CODESYS_PROP_CCL, Qdecode); | |
5813 #endif /* MULE */ | |
5814 /* Need to create this here or we're really screwed. */ | |
5815 Fmake_coding_system | |
5816 (Qraw_text, Qno_conversion, | |
5817 build_string ("Raw text, which means it converts only line-break-codes."), | |
5818 list2 (Qmnemonic, build_string ("Raw"))); | |
5819 | |
5820 Fmake_coding_system | |
5821 (Qbinary, Qno_conversion, | |
5822 build_string ("Binary, which means it does not convert anything."), | |
5823 list4 (Qeol_type, Qlf, | |
5824 Qmnemonic, build_string ("Binary"))); | |
5825 | |
5826 Fdefine_coding_system_alias (Qno_conversion, Qraw_text); | |
5827 | |
5828 /* Need this for bootstrapping */ | |
5829 fcd->coding_category_system[CODING_CATEGORY_NO_CONVERSION] = | |
5830 Fget_coding_system (Qraw_text); | |
5831 | |
5832 #ifdef MULE | |
5833 { | |
5834 unsigned int i; | |
5835 | |
5836 for (i = 0; i < 65536; i++) | |
5837 fcd->ucs_to_mule_table[i] = Qnil; | |
5838 } | |
5839 staticpro (&mule_to_ucs_table); | |
5840 mule_to_ucs_table = Fmake_char_table(Qgeneric); | |
5841 #endif /* MULE */ | |
5842 } |