Mercurial > hg > xemacs-beta
comparison lisp/coding.el @ 4549:68d1ca56cffa
First part of interactive checks that coding systems encode regions.
2008-01-21 Aidan Kehoe <kehoea@parhasard.net>
* coding.el (decode-coding-string):
(encode-coding-string): Accept GNU's NOCOPY argument for
these. Todo; write compiler macros to use it.
(query-coding-warning-face): New face, to show unencodable
characters.
(default-query-coding-region-safe-charset-skip-chars-map):
New variable, a cache used by #'default-query-coding-region.
(default-query-coding-region): Default implementation of
#'query-coding-region, using the safe-charsets and safe-chars
coding systemproperties.
(query-coding-region): New function; can a given coding system
encode a given region?
(query-coding-string): New function; can a given coding system
encode a given string?
(unencodable-char-position): Function API taken from GNU; return
the first unencodable position given a string and coding system.
(encode-coding-char): Function API taken from GNU; return CHAR
encoded using CODING-SYSTEM, or nil if CODING-SYSTEM would trash
CHAR.
((unless (featurep 'mule)): Override the default
query-coding-region implementation on non-Mule.
* mule/mule-coding.el (make-8-bit-generate-helper): Eliminate a
duplicate comment.
(make-8-bit-choose-category): Simplify implementation.
(8-bit-fixed-query-coding-region): Implementation of
#'query-coding-region for coding systems created with
#'make-8-bit-coding-system.
(make-8-bit-coding-system): Initialise the #'query-coding-region
implementation for these character sets.
(make-8-bit-coding-system): Ditto for the compiler macro version
of this function.
* unicode.el (unicode-query-coding-skip-chars-arg): New variable,
used by unicode-query-coding-region, initialised in
mule/general-late.el.
(unicode-query-coding-region): New function, the
#'query-coding-region implementation for Unicode coding systems.
Initialise the query-coding-function property for the Unicode
coding systems to #'unicode-query-coding-region.
* mule/mule-charset.el (charset-skip-chars-string): New
function. Return a #'skip-chars-forward argument that skips all
characters in CHARSET.
(map-charset-chars): Function synced from GNU, modified to work
with XEmacs. Map FUNC across the int value charset ranges of
CHARSET.
author | Aidan Kehoe <kehoea@parhasard.net> |
---|---|
date | Mon, 21 Jan 2008 22:51:21 +0100 |
parents | dd9c1d5f5319 |
children | 6812571bfcb9 |
comparison
equal
deleted
inserted
replaced
4402:e70cc8a90e90 | 4549:68d1ca56cffa |
---|---|
123 This function is meant to be called interactively; | 123 This function is meant to be called interactively; |
124 from a Lisp program, use `detect-coding-region' instead." | 124 from a Lisp program, use `detect-coding-region' instead." |
125 (interactive "r\nP") | 125 (interactive "r\nP") |
126 (princ (detect-coding-region start end))) | 126 (princ (detect-coding-region start end))) |
127 | 127 |
128 (defun decode-coding-string (str coding-system) | 128 (defun decode-coding-string (str coding-system &optional nocopy) |
129 "Decode the string STR which is encoded in CODING-SYSTEM. | 129 "Decode the string STR which is encoded in CODING-SYSTEM. |
130 Does not modify STR. Returns the decoded string on successful conversion." | 130 Normally does not modify STR. Returns the decoded string on |
131 successful conversion. | |
132 Optional argument NOCOPY says that modifying STR and returning it is | |
133 allowed." | |
131 (with-string-as-buffer-contents | 134 (with-string-as-buffer-contents |
132 str (decode-coding-region (point-min) (point-max) coding-system))) | 135 str (decode-coding-region (point-min) (point-max) coding-system))) |
133 | 136 |
134 (defun encode-coding-string (str coding-system) | 137 (defun encode-coding-string (str coding-system &optional nocopy) |
135 "Encode the string STR using CODING-SYSTEM. | 138 "Encode the string STR using CODING-SYSTEM. |
136 Does not modify STR. Returns the encoded string on successful conversion." | 139 Does not modify STR. Returns the encoded string on successful conversion. |
140 Optional argument NOCOPY says that the original string may be returned | |
141 if does not differ from the encoded string. " | |
137 (with-string-as-buffer-contents | 142 (with-string-as-buffer-contents |
138 str (encode-coding-region (point-min) (point-max) coding-system))) | 143 str (encode-coding-region (point-min) (point-max) coding-system))) |
139 | 144 |
140 | 145 |
141 ;;;; Coding system accessors | 146 ;;;; Coding system accessors |
272 ; (define-coding-system-alias 'iso-8859-2 'raw-text) | 277 ; (define-coding-system-alias 'iso-8859-2 'raw-text) |
273 (define-coding-system-alias 'ctext 'raw-text)) | 278 (define-coding-system-alias 'ctext 'raw-text)) |
274 | 279 |
275 (make-compatible-variable 'enable-multibyte-characters "Unimplemented") | 280 (make-compatible-variable 'enable-multibyte-characters "Unimplemented") |
276 | 281 |
282 ;; Sure would be nice to be able to use defface here. | |
283 (copy-face 'highlight 'query-coding-warning-face) | |
284 | |
285 (defvar default-query-coding-region-safe-charset-skip-chars-map | |
286 #s(hash-table test equal data ()) | |
287 "A map from list of charsets to `skip-chars-forward' arguments for them.") | |
288 | |
289 (defun default-query-coding-region (begin end coding-system | |
290 &optional buffer errorp highlightp) | |
291 "The default `query-coding-region' implementation. | |
292 | |
293 Uses the `safe-charsets' and `safe-chars' coding system properties. | |
294 The former is a list of XEmacs character sets that can be safely | |
295 encoded by CODING-SYSTEM; the latter a char table describing, in | |
296 addition, characters that can be safely encoded by CODING-SYSTEM." | |
297 (check-argument-type #'coding-system-p | |
298 (setq coding-system (find-coding-system coding-system))) | |
299 (check-argument-type #'integer-or-marker-p begin) | |
300 (check-argument-type #'integer-or-marker-p end) | |
301 (let* ((safe-charsets | |
302 (coding-system-get coding-system 'safe-charsets)) | |
303 (safe-chars (coding-system-get coding-system 'safe-chars)) | |
304 (skip-chars-arg | |
305 (gethash safe-charsets | |
306 default-query-coding-region-safe-charset-skip-chars-map)) | |
307 (ranges (make-range-table)) | |
308 fail-range-start fail-range-end previous-fail char-after | |
309 looking-at-arg failed extent) | |
310 (unless skip-chars-arg | |
311 (setq skip-chars-arg | |
312 (puthash safe-charsets | |
313 (mapconcat #'charset-skip-chars-string | |
314 safe-charsets "") | |
315 default-query-coding-region-safe-charset-skip-chars-map))) | |
316 (if (and (zerop (length skip-chars-arg)) (null safe-chars)) | |
317 (progn | |
318 ;; Uh-oh, nothing known about this coding system. Fail. | |
319 (when errorp | |
320 (error 'text-conversion-error | |
321 "Coding system doesn't say what it can encode" | |
322 (coding-system-name coding-system))) | |
323 (put-range-table begin end t ranges) | |
324 (when highlightp | |
325 (setq extent (make-extent begin end buffer)) | |
326 (set-extent-priority extent (+ mouse-highlight-priority 2)) | |
327 (set-extent-face extent 'query-coding-warning-face)) | |
328 (values nil ranges)) | |
329 (setq looking-at-arg (if (equal "" skip-chars-arg) | |
330 ;; Regexp that will never match. | |
331 #r".\{0,0\}" | |
332 (concat "[" skip-chars-arg "]"))) | |
333 (save-excursion | |
334 (goto-char begin buffer) | |
335 (skip-chars-forward skip-chars-arg end buffer) | |
336 (while (< (point buffer) end) | |
337 (message | |
338 "fail-range-start is %S, previous-fail %S, point is %S, end is %S" | |
339 fail-range-start previous-fail (point buffer) end) | |
340 (setq char-after (char-after (point buffer) buffer) | |
341 fail-range-start (point buffer)) | |
342 (while (and | |
343 (< (point buffer) end) | |
344 (not (looking-at looking-at-arg)) | |
345 (or (not safe-chars) | |
346 (not (get-char-table char-after safe-chars)))) | |
347 (forward-char 1 buffer) | |
348 (setq char-after (char-after (point buffer) buffer) | |
349 failed t)) | |
350 (if (= fail-range-start (point buffer)) | |
351 ;; The character can actually be encoded by the coding | |
352 ;; system; check the characters past it. | |
353 (forward-char 1 buffer) | |
354 ;; Can't be encoded; note this. | |
355 (when errorp | |
356 (error 'text-conversion-error | |
357 (format "Cannot encode %s using coding system" | |
358 (buffer-substring fail-range-start (point buffer) | |
359 buffer)) | |
360 (coding-system-name coding-system))) | |
361 (put-range-table fail-range-start | |
362 ;; If char-after is non-nil, we're not at | |
363 ;; the end of the buffer. | |
364 (setq fail-range-end (if char-after | |
365 (point buffer) | |
366 (point-max buffer))) | |
367 t ranges) | |
368 (when highlightp | |
369 (setq extent (make-extent fail-range-start fail-range-end buffer)) | |
370 (set-extent-priority extent (+ mouse-highlight-priority 2)) | |
371 (set-extent-face extent 'query-coding-warning-face))) | |
372 (skip-chars-forward skip-chars-arg end buffer)) | |
373 (if failed | |
374 (values nil ranges) | |
375 (values t nil)))))) | |
376 | |
377 (defsubst query-coding-region (start end coding-system &optional buffer | |
378 errorp highlight) | |
379 "Work out whether CODING-SYSTEM can losslessly encode a region. | |
380 | |
381 START and END are the beginning and end of the region to check. | |
382 CODING-SYSTEM is the coding system to try. | |
383 | |
384 Optional argument BUFFER is the buffer to check, and defaults to the current | |
385 buffer. Optional argument ERRORP says to signal a `text-conversion-error' | |
386 if some character in the region cannot be encoded, and defaults to nil. | |
387 | |
388 Optional argument HIGHLIGHT says to display unencodable characters in the | |
389 region using `query-coding-warning-face'. It defaults to nil. | |
390 | |
391 This function returns a list; the intention is that callers use use | |
392 `multiple-value-bind' or the related CL multiple value functions to deal | |
393 with it. The first element is `t' if the string can be encoded using | |
394 CODING-SYSTEM, or `nil' if not. The second element is `nil' if the string | |
395 can be encoded using CODING-SYSTEM; otherwise, it is a range table | |
396 describing the positions of the unencodable characters. See | |
397 `make-range-table'." | |
398 (funcall (or (coding-system-get coding-system 'query-coding-function) | |
399 #'default-query-coding-region) | |
400 start end coding-system buffer errorp highlight)) | |
401 | |
402 (defun query-coding-string (string coding-system &optional errorp highlight) | |
403 "Work out whether CODING-SYSTEM can losslessly encode STRING. | |
404 CODING-SYSTEM is the coding system to check. | |
405 | |
406 Optional argument ERRORP says to signal a `text-conversion-error' if some | |
407 character in the region cannot be encoded, and defaults to nil. | |
408 | |
409 Optional argument HIGHLIGHT says to display unencodable characters in the | |
410 region using `query-coding-warning-face'. It defaults to nil. | |
411 | |
412 This function returns a list; the intention is that callers use use | |
413 `multiple-value-bind' or the related CL multiple value functions to deal | |
414 with it. The first element is `t' if the string can be encoded using | |
415 CODING-SYSTEM, or `nil' if not. The second element is `nil' if the string | |
416 can be encoded using CODING-SYSTEM; otherwise, it is a range table | |
417 describing the positions of the unencodable characters. See | |
418 `make-range-table'." | |
419 (with-temp-buffer | |
420 (insert string) | |
421 (query-coding-region (point-min) (point-max) coding-system (current-buffer) | |
422 ;; ### Will highlight work here? | |
423 errorp highlight))) | |
424 | |
425 (defun unencodable-char-position (start end coding-system | |
426 &optional count string) | |
427 "Return position of first un-encodable character in a region. | |
428 START and END specify the region and CODING-SYSTEM specifies the | |
429 encoding to check. Return nil if CODING-SYSTEM does encode the region. | |
430 | |
431 If optional 4th argument COUNT is non-nil, it specifies at most how | |
432 many un-encodable characters to search. In this case, the value is a | |
433 list of positions. | |
434 | |
435 If optional 5th argument STRING is non-nil, it is a string to search | |
436 for un-encodable characters. In that case, START and END are indexes | |
437 in the string." | |
438 (flet ((thunk () | |
439 (multiple-value-bind (result ranges) | |
440 (query-coding-region start end coding-system) | |
441 (if result | |
442 ;; If query-coding-region thinks the entire region is | |
443 ;; encodable, result will be t, and the thunk should | |
444 ;; return nil, because there are no unencodable | |
445 ;; positions in the region. | |
446 nil | |
447 (if count | |
448 (block counted | |
449 (map-range-table | |
450 #'(lambda (begin end value) | |
451 (while (and (<= begin end) (<= begin count)) | |
452 (push begin result) | |
453 (incf begin)) | |
454 (if (> begin count) (return-from counted))) | |
455 ranges)) | |
456 (map-range-table | |
457 #'(lambda (begin end value) | |
458 (while (<= begin end) | |
459 (push begin result) | |
460 (incf begin))) ranges)) | |
461 result)))) | |
462 (if string | |
463 (with-temp-buffer (insert string) (thunk)) | |
464 (thunk)))) | |
465 | |
466 (defun encode-coding-char (char coding-system) | |
467 "Encode CHAR by CODING-SYSTEM and return the resulting string. | |
468 If CODING-SYSTEM can't safely encode CHAR, return nil." | |
469 (check-argument-type #'characterp char) | |
470 (multiple-value-bind (succeededp) | |
471 (query-coding-string char coding-system) | |
472 (when succeededp | |
473 (encode-coding-string char coding-system)))) | |
474 | |
475 (unless (featurep 'mule) | |
476 ;; If we're under non-Mule, every XEmacs character can be encoded | |
477 ;; with every XEmacs coding system. | |
478 (fset #'default-query-coding-region | |
479 #'(lambda (&rest ignored) (values t nil))) | |
480 (unintern 'default-query-coding-region-safe-charset-skip-chars-map)) | |
481 | |
277 ;;; coding.el ends here | 482 ;;; coding.el ends here |