comparison lisp/coding.el @ 4549:68d1ca56cffa

First part of interactive checks that coding systems encode regions. 2008-01-21 Aidan Kehoe <kehoea@parhasard.net> * coding.el (decode-coding-string): (encode-coding-string): Accept GNU's NOCOPY argument for these. Todo; write compiler macros to use it. (query-coding-warning-face): New face, to show unencodable characters. (default-query-coding-region-safe-charset-skip-chars-map): New variable, a cache used by #'default-query-coding-region. (default-query-coding-region): Default implementation of #'query-coding-region, using the safe-charsets and safe-chars coding systemproperties. (query-coding-region): New function; can a given coding system encode a given region? (query-coding-string): New function; can a given coding system encode a given string? (unencodable-char-position): Function API taken from GNU; return the first unencodable position given a string and coding system. (encode-coding-char): Function API taken from GNU; return CHAR encoded using CODING-SYSTEM, or nil if CODING-SYSTEM would trash CHAR. ((unless (featurep 'mule)): Override the default query-coding-region implementation on non-Mule. * mule/mule-coding.el (make-8-bit-generate-helper): Eliminate a duplicate comment. (make-8-bit-choose-category): Simplify implementation. (8-bit-fixed-query-coding-region): Implementation of #'query-coding-region for coding systems created with #'make-8-bit-coding-system. (make-8-bit-coding-system): Initialise the #'query-coding-region implementation for these character sets. (make-8-bit-coding-system): Ditto for the compiler macro version of this function. * unicode.el (unicode-query-coding-skip-chars-arg): New variable, used by unicode-query-coding-region, initialised in mule/general-late.el. (unicode-query-coding-region): New function, the #'query-coding-region implementation for Unicode coding systems. Initialise the query-coding-function property for the Unicode coding systems to #'unicode-query-coding-region. * mule/mule-charset.el (charset-skip-chars-string): New function. Return a #'skip-chars-forward argument that skips all characters in CHARSET. (map-charset-chars): Function synced from GNU, modified to work with XEmacs. Map FUNC across the int value charset ranges of CHARSET.
author Aidan Kehoe <kehoea@parhasard.net>
date Mon, 21 Jan 2008 22:51:21 +0100
parents dd9c1d5f5319
children 6812571bfcb9
comparison
equal deleted inserted replaced
4402:e70cc8a90e90 4549:68d1ca56cffa
123 This function is meant to be called interactively; 123 This function is meant to be called interactively;
124 from a Lisp program, use `detect-coding-region' instead." 124 from a Lisp program, use `detect-coding-region' instead."
125 (interactive "r\nP") 125 (interactive "r\nP")
126 (princ (detect-coding-region start end))) 126 (princ (detect-coding-region start end)))
127 127
128 (defun decode-coding-string (str coding-system) 128 (defun decode-coding-string (str coding-system &optional nocopy)
129 "Decode the string STR which is encoded in CODING-SYSTEM. 129 "Decode the string STR which is encoded in CODING-SYSTEM.
130 Does not modify STR. Returns the decoded string on successful conversion." 130 Normally does not modify STR. Returns the decoded string on
131 successful conversion.
132 Optional argument NOCOPY says that modifying STR and returning it is
133 allowed."
131 (with-string-as-buffer-contents 134 (with-string-as-buffer-contents
132 str (decode-coding-region (point-min) (point-max) coding-system))) 135 str (decode-coding-region (point-min) (point-max) coding-system)))
133 136
134 (defun encode-coding-string (str coding-system) 137 (defun encode-coding-string (str coding-system &optional nocopy)
135 "Encode the string STR using CODING-SYSTEM. 138 "Encode the string STR using CODING-SYSTEM.
136 Does not modify STR. Returns the encoded string on successful conversion." 139 Does not modify STR. Returns the encoded string on successful conversion.
140 Optional argument NOCOPY says that the original string may be returned
141 if does not differ from the encoded string. "
137 (with-string-as-buffer-contents 142 (with-string-as-buffer-contents
138 str (encode-coding-region (point-min) (point-max) coding-system))) 143 str (encode-coding-region (point-min) (point-max) coding-system)))
139 144
140 145
141 ;;;; Coding system accessors 146 ;;;; Coding system accessors
272 ; (define-coding-system-alias 'iso-8859-2 'raw-text) 277 ; (define-coding-system-alias 'iso-8859-2 'raw-text)
273 (define-coding-system-alias 'ctext 'raw-text)) 278 (define-coding-system-alias 'ctext 'raw-text))
274 279
275 (make-compatible-variable 'enable-multibyte-characters "Unimplemented") 280 (make-compatible-variable 'enable-multibyte-characters "Unimplemented")
276 281
282 ;; Sure would be nice to be able to use defface here.
283 (copy-face 'highlight 'query-coding-warning-face)
284
285 (defvar default-query-coding-region-safe-charset-skip-chars-map
286 #s(hash-table test equal data ())
287 "A map from list of charsets to `skip-chars-forward' arguments for them.")
288
289 (defun default-query-coding-region (begin end coding-system
290 &optional buffer errorp highlightp)
291 "The default `query-coding-region' implementation.
292
293 Uses the `safe-charsets' and `safe-chars' coding system properties.
294 The former is a list of XEmacs character sets that can be safely
295 encoded by CODING-SYSTEM; the latter a char table describing, in
296 addition, characters that can be safely encoded by CODING-SYSTEM."
297 (check-argument-type #'coding-system-p
298 (setq coding-system (find-coding-system coding-system)))
299 (check-argument-type #'integer-or-marker-p begin)
300 (check-argument-type #'integer-or-marker-p end)
301 (let* ((safe-charsets
302 (coding-system-get coding-system 'safe-charsets))
303 (safe-chars (coding-system-get coding-system 'safe-chars))
304 (skip-chars-arg
305 (gethash safe-charsets
306 default-query-coding-region-safe-charset-skip-chars-map))
307 (ranges (make-range-table))
308 fail-range-start fail-range-end previous-fail char-after
309 looking-at-arg failed extent)
310 (unless skip-chars-arg
311 (setq skip-chars-arg
312 (puthash safe-charsets
313 (mapconcat #'charset-skip-chars-string
314 safe-charsets "")
315 default-query-coding-region-safe-charset-skip-chars-map)))
316 (if (and (zerop (length skip-chars-arg)) (null safe-chars))
317 (progn
318 ;; Uh-oh, nothing known about this coding system. Fail.
319 (when errorp
320 (error 'text-conversion-error
321 "Coding system doesn't say what it can encode"
322 (coding-system-name coding-system)))
323 (put-range-table begin end t ranges)
324 (when highlightp
325 (setq extent (make-extent begin end buffer))
326 (set-extent-priority extent (+ mouse-highlight-priority 2))
327 (set-extent-face extent 'query-coding-warning-face))
328 (values nil ranges))
329 (setq looking-at-arg (if (equal "" skip-chars-arg)
330 ;; Regexp that will never match.
331 #r".\{0,0\}"
332 (concat "[" skip-chars-arg "]")))
333 (save-excursion
334 (goto-char begin buffer)
335 (skip-chars-forward skip-chars-arg end buffer)
336 (while (< (point buffer) end)
337 (message
338 "fail-range-start is %S, previous-fail %S, point is %S, end is %S"
339 fail-range-start previous-fail (point buffer) end)
340 (setq char-after (char-after (point buffer) buffer)
341 fail-range-start (point buffer))
342 (while (and
343 (< (point buffer) end)
344 (not (looking-at looking-at-arg))
345 (or (not safe-chars)
346 (not (get-char-table char-after safe-chars))))
347 (forward-char 1 buffer)
348 (setq char-after (char-after (point buffer) buffer)
349 failed t))
350 (if (= fail-range-start (point buffer))
351 ;; The character can actually be encoded by the coding
352 ;; system; check the characters past it.
353 (forward-char 1 buffer)
354 ;; Can't be encoded; note this.
355 (when errorp
356 (error 'text-conversion-error
357 (format "Cannot encode %s using coding system"
358 (buffer-substring fail-range-start (point buffer)
359 buffer))
360 (coding-system-name coding-system)))
361 (put-range-table fail-range-start
362 ;; If char-after is non-nil, we're not at
363 ;; the end of the buffer.
364 (setq fail-range-end (if char-after
365 (point buffer)
366 (point-max buffer)))
367 t ranges)
368 (when highlightp
369 (setq extent (make-extent fail-range-start fail-range-end buffer))
370 (set-extent-priority extent (+ mouse-highlight-priority 2))
371 (set-extent-face extent 'query-coding-warning-face)))
372 (skip-chars-forward skip-chars-arg end buffer))
373 (if failed
374 (values nil ranges)
375 (values t nil))))))
376
377 (defsubst query-coding-region (start end coding-system &optional buffer
378 errorp highlight)
379 "Work out whether CODING-SYSTEM can losslessly encode a region.
380
381 START and END are the beginning and end of the region to check.
382 CODING-SYSTEM is the coding system to try.
383
384 Optional argument BUFFER is the buffer to check, and defaults to the current
385 buffer. Optional argument ERRORP says to signal a `text-conversion-error'
386 if some character in the region cannot be encoded, and defaults to nil.
387
388 Optional argument HIGHLIGHT says to display unencodable characters in the
389 region using `query-coding-warning-face'. It defaults to nil.
390
391 This function returns a list; the intention is that callers use use
392 `multiple-value-bind' or the related CL multiple value functions to deal
393 with it. The first element is `t' if the string can be encoded using
394 CODING-SYSTEM, or `nil' if not. The second element is `nil' if the string
395 can be encoded using CODING-SYSTEM; otherwise, it is a range table
396 describing the positions of the unencodable characters. See
397 `make-range-table'."
398 (funcall (or (coding-system-get coding-system 'query-coding-function)
399 #'default-query-coding-region)
400 start end coding-system buffer errorp highlight))
401
402 (defun query-coding-string (string coding-system &optional errorp highlight)
403 "Work out whether CODING-SYSTEM can losslessly encode STRING.
404 CODING-SYSTEM is the coding system to check.
405
406 Optional argument ERRORP says to signal a `text-conversion-error' if some
407 character in the region cannot be encoded, and defaults to nil.
408
409 Optional argument HIGHLIGHT says to display unencodable characters in the
410 region using `query-coding-warning-face'. It defaults to nil.
411
412 This function returns a list; the intention is that callers use use
413 `multiple-value-bind' or the related CL multiple value functions to deal
414 with it. The first element is `t' if the string can be encoded using
415 CODING-SYSTEM, or `nil' if not. The second element is `nil' if the string
416 can be encoded using CODING-SYSTEM; otherwise, it is a range table
417 describing the positions of the unencodable characters. See
418 `make-range-table'."
419 (with-temp-buffer
420 (insert string)
421 (query-coding-region (point-min) (point-max) coding-system (current-buffer)
422 ;; ### Will highlight work here?
423 errorp highlight)))
424
425 (defun unencodable-char-position (start end coding-system
426 &optional count string)
427 "Return position of first un-encodable character in a region.
428 START and END specify the region and CODING-SYSTEM specifies the
429 encoding to check. Return nil if CODING-SYSTEM does encode the region.
430
431 If optional 4th argument COUNT is non-nil, it specifies at most how
432 many un-encodable characters to search. In this case, the value is a
433 list of positions.
434
435 If optional 5th argument STRING is non-nil, it is a string to search
436 for un-encodable characters. In that case, START and END are indexes
437 in the string."
438 (flet ((thunk ()
439 (multiple-value-bind (result ranges)
440 (query-coding-region start end coding-system)
441 (if result
442 ;; If query-coding-region thinks the entire region is
443 ;; encodable, result will be t, and the thunk should
444 ;; return nil, because there are no unencodable
445 ;; positions in the region.
446 nil
447 (if count
448 (block counted
449 (map-range-table
450 #'(lambda (begin end value)
451 (while (and (<= begin end) (<= begin count))
452 (push begin result)
453 (incf begin))
454 (if (> begin count) (return-from counted)))
455 ranges))
456 (map-range-table
457 #'(lambda (begin end value)
458 (while (<= begin end)
459 (push begin result)
460 (incf begin))) ranges))
461 result))))
462 (if string
463 (with-temp-buffer (insert string) (thunk))
464 (thunk))))
465
466 (defun encode-coding-char (char coding-system)
467 "Encode CHAR by CODING-SYSTEM and return the resulting string.
468 If CODING-SYSTEM can't safely encode CHAR, return nil."
469 (check-argument-type #'characterp char)
470 (multiple-value-bind (succeededp)
471 (query-coding-string char coding-system)
472 (when succeededp
473 (encode-coding-string char coding-system))))
474
475 (unless (featurep 'mule)
476 ;; If we're under non-Mule, every XEmacs character can be encoded
477 ;; with every XEmacs coding system.
478 (fset #'default-query-coding-region
479 #'(lambda (&rest ignored) (values t nil)))
480 (unintern 'default-query-coding-region-safe-charset-skip-chars-map))
481
277 ;;; coding.el ends here 482 ;;; coding.el ends here