# HG changeset patch
# User Aidan Kehoe <kehoea@parhasard.net>
# Date 1336654386 -3600
# Node ID bed39edf91ba137860fa91f24628ff7bdecb43a1
# Parent  6e5a7278f9bfc88ff4111ccaf59a08a018462cb5
Be better about word boundaries with JIT unicode characters, mule-category.el

lisp/ChangeLog addition:

2012-05-10  Aidan Kehoe  <kehoea@parhasard.net>

	* mule/mule-category.el (word-combining-categories):
	Be better about default word boundaries when text contains
	just-in-time-allocated Unicode code points. Document what we
	should do instead once we have Unicode internally.
	* mule/misc-lang.el: IPA characters are Latin.

diff -r 6e5a7278f9bf -r bed39edf91ba lisp/ChangeLog
--- a/lisp/ChangeLog	Tue May 08 09:47:41 2012 +0100
+++ b/lisp/ChangeLog	Thu May 10 13:53:06 2012 +0100
@@ -1,3 +1,11 @@
+2012-05-10  Aidan Kehoe  <kehoea@parhasard.net>
+
+	* mule/mule-category.el (word-combining-categories):
+	Be better about default word boundaries when text contains
+	just-in-time-allocated Unicode code points. Document what we
+	should do instead once we have Unicode internally.
+	* mule/misc-lang.el: IPA characters are Latin.
+
 2012-05-08  Aidan Kehoe  <kehoea@parhasard.net>
 
 	* cl-macs.el (rassoc): Remove a stray parenthesis here, thank you
diff -r 6e5a7278f9bf -r bed39edf91ba lisp/mule/misc-lang.el
--- a/lisp/mule/misc-lang.el	Tue May 08 09:47:41 2012 +0100
+++ b/lisp/mule/misc-lang.el	Thu May 10 13:53:06 2012 +0100
@@ -41,4 +41,26 @@
 		short-name "IPA"
 		long-name "IPA"))
 
+;; XEmacs; IPA characters are Latin script, so it is not useful to put word
+;; boundaries between them and ASCII.
+(modify-category-entry 'ipa ?l nil t)
+
+;; XEmacs; why treat these as Latin?  The following tallies the first word of
+;; the Unicode name of every ipa character that has a Unicode mapping:
+;; (let ((scripts
+;;        (mapcar #'(lambda (character)
+;;                    (car
+;;                     (split-string
+;;                      (cadr (assoc "Name" (describe-char-unicode-data
+;;                                           character))))))
+;;                (loop
+;;                  for i from 33 to 127
+;;                  if (not (eql -1 (char-to-unicode (make-char 'ipa i))))
+;;                  nconc (list (make-char 'ipa i))))))
+;;   (mapcar #'(lambda (script)
+;;               (cons script (count script scripts :test #'equal)))
+;;           (remove-duplicates scripts :test #'equal)))
+;; => (("GREEK" . 1) ("LATIN" . 55) ("MODIFIER" . 3))
+
+
 ;;; misc-lang.el ends here
diff -r 6e5a7278f9bf -r bed39edf91ba lisp/mule/mule-category.el
--- a/lisp/mule/mule-category.el	Tue May 08 09:47:41 2012 +0100
+++ b/lisp/mule/mule-category.el	Thu May 10 13:53:06 2012 +0100
@@ -252,6 +252,7 @@
     (chinese-big5-1	?t)
     (chinese-big5-2	?t)
     (korean-ksc5601	?h "Hangul (Korean) 2-byte character set")
+    (jit-ucs-charset-0  ?J "Just-in-time-allocated Unicode character")
     )
   "List of predefined categories.
 Each element is a list of a charset, a designator, and maybe a doc string.")
@@ -275,7 +276,18 @@
 ;;; Setting word boundary.
 
 (setq word-combining-categories
-      '((?l . ?l)))
+      ;; XEmacs; we should change to defining scripts, as does GNU, once
+      ;; unicode-internal is the default, and placing word boundaries
+      ;; between different scripts, not different charsets, by default.
+      ;; Then we can remove the jit-ucs-charset-0 entry above and all the
+      ;; entries containing ?J in this list.
+      ;;
+      ;; These entries are a bit heuristic, working on the assumption that
+      ;; characters that will be just-in-time-allocated will not be East
+      ;; Asian in XEmacs, and there's also no mechanism to apply the ?J
+      ;; category to further newly-created JIT charsets.
+      '((?l . ?l) (?J . ?l) (?l . ?J) (?J . ?y) (?y . ?J) (?J . ?b) (?b . ?J)
+        (?J . ?g) (?g . ?J) (?J . ?w) (?w . ?J)))
 
 (setq word-separating-categories	;  (2-byte character sets)
       '((?A . ?K)			; Alpha numeric - Katakana