# HG changeset patch # User Aidan Kehoe # Date 1336654386 -3600 # Node ID bed39edf91ba137860fa91f24628ff7bdecb43a1 # Parent 6e5a7278f9bfc88ff4111ccaf59a08a018462cb5 Be better about word boundaries with JIT unicode characters, mule-category.el lisp/ChangeLog addition: 2012-05-10 Aidan Kehoe * mule/mule-category.el (word-combining-categories): Be better about default word boundaries when text contains just-in-time-allocated Unicode code points. Document what we should do instead once we have Unicode internally. * mule/misc-lang.el: IPA characters are Latin. diff -r 6e5a7278f9bf -r bed39edf91ba lisp/ChangeLog --- a/lisp/ChangeLog Tue May 08 09:47:41 2012 +0100 +++ b/lisp/ChangeLog Thu May 10 13:53:06 2012 +0100 @@ -1,3 +1,11 @@ +2012-05-10 Aidan Kehoe + + * mule/mule-category.el (word-combining-categories): + Be better about default word boundaries when text contains + just-in-time-allocated Unicode code points. Document what we + should do instead once we have Unicode internally. + * mule/misc-lang.el: IPA characters are Latin. + 2012-05-08 Aidan Kehoe * cl-macs.el (rassoc): Remove a stray parenthesis here, thank you diff -r 6e5a7278f9bf -r bed39edf91ba lisp/mule/misc-lang.el --- a/lisp/mule/misc-lang.el Tue May 08 09:47:41 2012 +0100 +++ b/lisp/mule/misc-lang.el Thu May 10 13:53:06 2012 +0100 @@ -41,4 +41,26 @@ short-name "IPA" long-name "IPA")) +;; XEmacs; these are Latin, it's not useful to put word boundaries between +;; them and ASCII. +(modify-category-entry 'ipa ?l nil t) + +;; XEmacs; why are these Latin? See the following: +;; +;; (let ((scripts +;; (mapcar #'(lambda (character) +;; (car +;; (split-string +;; (cadr (assoc "Name" (describe-char-unicode-data +;; character)))))) +;; (loop +;; for i from 33 to 127 +;; if (not (eql -1 (char-to-unicode (make-char 'ipa i)))) +;; nconc (list (make-char 'ipa i)))))) +;; (mapcar #'(lambda (script) +;; (cons script (count script scripts :test #'equal))) +;; (remove-duplicates scripts :test #'equal))) +;; => (("GREEK" . 
1) ("LATIN" . 55) ("MODIFIER" . 3)) + + ;;; misc-lang.el ends here diff -r 6e5a7278f9bf -r bed39edf91ba lisp/mule/mule-category.el --- a/lisp/mule/mule-category.el Tue May 08 09:47:41 2012 +0100 +++ b/lisp/mule/mule-category.el Thu May 10 13:53:06 2012 +0100 @@ -252,6 +252,7 @@ (chinese-big5-1 ?t) (chinese-big5-2 ?t) (korean-ksc5601 ?h "Hangul (Korean) 2-byte character set") + (jit-ucs-charset-0 ?J "Just-in-time-allocated Unicode character") ) "List of predefined categories. Each element is a list of a charset, a designator, and maybe a doc string.") @@ -275,7 +276,18 @@ ;;; Setting word boundary. (setq word-combining-categories - '((?l . ?l))) + ;; XEmacs; we should change to defining scripts, as does GNU, once + ;; unicode-internal is the default, and placing word boundaries + ;; between different scripts, not different charsets, by default. + ;; Then we can remove the jit-ucs-charset-0 entry above and all the + ;; entries containing ?J in this list. + ;; + ;; These entries are a bit heuristic, working on the assumption that + ;; characters that will be just-in-time-allocated will not be East + ;; Asian in XEmacs, and there's also no mechanism to apply the ?J + ;; category to further newly-created JIT charsets. + '((?l . ?l) (?J . ?l) (?l . ?J) (?J . ?y) (?y . ?J) (?J . ?b) (?b . ?J) + (?J . ?g) (?g . ?J) (?J . ?w) (?w . ?J))) (setq word-separating-categories ; (2-byte character sets) '((?A . ?K) ; Alpha numeric - Katakana