xemacs-beta: src/unicode.c comparison

comparison src/unicode.c @ 3439:d1754e7f0cea

[xemacs-hg @ 2006-06-03 17:50:39 by aidan] Just-in-time Unicode code point support.

author	aidan
date	Sat, 03 Jun 2006 17:51:06 +0000
parents	8dbdcd070418
children	551c008d3777

comparison

equal deleted inserted replaced

-:14fbcab7c67b
+:d1754e7f0cea
 Lisp_Object Vlanguage_unicode_precedence_list;
 Lisp_Object Vdefault_unicode_precedence_list;
 Lisp_Object Qignore_first_column;
+Lisp_Object Vcurrent_jit_charset;
+Lisp_Object Qlast_allocated_character;
+Lisp_Object Qccl_encode_to_ucs_2;
 /************************************************************************/
 /*                        Unicode implementation                        */
 /************************************************************************/
 else
 return ((int **) XCHARSET_TO_UNICODE_TABLE (charset))[c1 - 32][c2 - 32];
 }
 static Ichar
+get_free_codepoint(Lisp_Object charset)
+{
+Lisp_Object name = Fcharset_name(charset);
+Lisp_Object zeichen = Fget(name, Qlast_allocated_character, Qnil);
+Ichar res;
+/* Hand out the next unused character of CHARSET, or return -1 when every
+position in the set has already been handed out.
+
+The most recently allocated character is remembered as the
+Qlast_allocated_character property of the charset's name (the value of
+Fcharset_name), and that property is updated on every successful call.
+The first allocation is (32, 32); after that the second octet is
+advanced, and once it has reached 127 it wraps back to 0x20 and the
+first octet is advanced instead.
+
+Only allow this with the 96x96 character sets we are using for
+temporary Unicode support. */
+assert(2 == XCHARSET_DIMENSION(charset) && 96 == XCHARSET_CHARS(charset));
+if (!NILP(zeichen))
+{
+int c1, c2;
+BREAKUP_ICHAR(XCHAR(zeichen), charset, c1, c2);
+if (127 == c1 && 127 == c2)
+	{
+	  /* We've already used the highest-numbered character in this
+	     set--tell our caller to create another. */
+	  return -1;
+	}
+if (127 == c2)
+	{
+	  ++c1;
+	  c2 = 0x20;
+	}
+else
+	{
+	  ++c2;
+	}
+res = make_ichar(charset, c1, c2);
+Fput(name, Qlast_allocated_character, make_char(res));
+}
+else
+{
+res = make_ichar(charset, 32, 32); /* Nothing allocated yet in this set. */
+Fput(name, Qlast_allocated_character, make_char(res));
+}
+return res;
+}
+/* The just-in-time creation of XEmacs characters that correspond to unknown
+Unicode code points happens when:
+1. The lookup would otherwise fail.
+2. The charsets array is nil or the default.
+If there are no free code points in the just-in-time Unicode character
+set, and the charsets array is the default unicode precedence list,
+create a new just-in-time Unicode character set, add it at the end of the
+unicode precedence list, create the XEmacs character in that character
+set, and return it. */
+static Ichar
 unicode_to_ichar (int code, Lisp_Object_dynarr *charsets)
 {
 int u1, u2, u3, u4;
 int code_levels;
 int i;
 int n = Dynarr_length (charsets);
+static int number_of_jit_charsets;
+static Ascbyte last_jit_charset_final;
 type_checking_assert (code >= 0);
 /* This shortcut depends on the representation of an Ichar, see text.c.
 Note that it may _not_ be extended to U+00A0 to U+00FF (many ISO 8859
 coded character sets have points that map into that region, so this
 	  if (retval != -1)
 	    return make_ichar (charset, retval >> 8, retval & 0xFF);
 	}
 }
-return (Ichar) -1;
+/* Only do the magic just-in-time assignment if we're using the default
+list. */
+if (unicode_precedence_dynarr == charsets)
+{
+if (NILP (Vcurrent_jit_charset) ||
+	  (-1 == (i = get_free_codepoint(Vcurrent_jit_charset))))
+	{
+	  Ascbyte setname[32];
+	  Lisp_Object charset_descr = build_string
+	    ("Mule charset for otherwise unknown Unicode code points.");
+	  Lisp_Object charset_regr = build_string("iso10646-1");
+	  struct gcpro gcpro1, gcpro2;
+	  if ('\0' == last_jit_charset_final)
+	    {
+	      /* This final byte shit is, umm, not that cool. */
+	      last_jit_charset_final = 0x30;
+	    }
+	  snprintf(setname, sizeof(setname),
+		   "jit-ucs-charset-%d", number_of_jit_charsets++);
+	  /* Aside: GCPROing here would be overkill according to the FSF's
+	     philosophy. make-charset cannot currently GC, but is intended
+	     to be called from Lisp, with its arguments protected by the
+	     Lisp reader. We GCPRO in case it GCs in the future and no-one
+	     checks all the C callers.  */
+	  GCPRO2 (charset_descr, charset_regr);
+	  Vcurrent_jit_charset = Fmake_charset
+	    (intern(setname), charset_descr,
+	     /* Set encode-as-utf-8 to t, to have this character set written
+		using UTF-8 escapes in escape-quoted and ctext. This
+		sidesteps the fact that our internal character -> Unicode
+		mapping is not stable from one invocation to the next.  */
+	     nconc2 (list2(Qencode_as_utf_8, Qt),
+		     nconc2 (list6(Qcolumns, make_int(1), Qchars, make_int(96),
+				   Qdimension, make_int(2)),
+			     list6(Qregistry, charset_regr,
+				   Qfinal, make_char(last_jit_charset_final++),
+				   /* This CCL program is initialised in
+				      unicode.el. */
+				   Qccl_program, Qccl_encode_to_ucs_2))));
+	  UNGCPRO;
+	  i = get_free_codepoint(Vcurrent_jit_charset);
+	}
+if (-1 != i)
+	{
+	  set_unicode_conversion((Ichar)i, code);
+	  /* No need to add the charset to the end of the list; it's done
+	     automatically. */
+	}
+}
+return (Ichar) i;
 }
 /* Add charsets to precedence list.
 LIST must be a list of charsets.  Charsets which are in the list more
 than once are given the precedence implied by their earliest appearance.
 When there is no international support (i.e. the `mule' feature is not
 present), this function simply does `int-to-char' and ignores the CHARSETS
 argument.
-Note that the current XEmacs internal encoding has no mapping for many
+If the CODE would not otherwise be converted to an XEmacs character, and the
-Unicode code points, and if you use characters that are vaguely obscure with
+list of character sets to be consulted is nil or the default, a new XEmacs
-XEmacs' Unicode coding systems, you will lose data.
+character will be created for it in one of the `jit-ucs-charset' Mule
+character sets, and that character will be returned.  There is scope for
-To add support for some desired code point in the short term--note that our
+tens of thousands of separate Unicode code points in every session using
-intention is to move to a Unicode-compatible internal encoding soon, for
+this technique, so despite XEmacs' internal encoding not being based on
-some value of soon--if you are a distributor, add something like the
+Unicode, your data won't be trashed.
-following to `site-start.el.'
-(make-charset 'distro-name-private
-	      "Private character set for DISTRO"
-	      '(dimension 1
-		chars 96
-		columns 1
-		final ?5 ;; Change this--see docs for make-charset
-		long-name "Private charset for some Unicode char support."
-		short-name "Distro-Private"))
-(set-unicode-conversion
-(make-char 'distro-name-private #x20) #x263A) ;; WHITE SMILING FACE
-(set-unicode-conversion
-(make-char 'distro-name-private #x21) #x3030) ;; WAVY DASH
-;; ...
-;;; Repeat as necessary.
-Redisplay will work on the sjt-xft branch, but not with server-side X11
-fonts as is the default.  However, data read in will be preserved when they
-are written out again.
 */
 (code, USED_IF_MULE (charsets)))
 {
 #ifdef MULE
 Lisp_Object_dynarr *dyn;
 /************************************************************************/
 /*                         Unicode coding system                        */
 /************************************************************************/
-/* ISO 10646 UTF-16, UCS-4, UTF-8, UTF-7, etc. */
-enum unicode_type
-{
-UNICODE_UTF_16,
-UNICODE_UTF_8,
-UNICODE_UTF_7,
-UNICODE_UCS_4
-};
 struct unicode_coding_system
 {
 enum unicode_type type;
 unsigned int little_endian :1;
 unsigned int need_bom :1;
 default: ABORT ();
 }
 }
-static void
+/* Also used in mule-coding.c for UTF-8 handling in ISO 2022-oriented
+encodings. */
+void
 encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h,
 		     int USED_IF_MULE (l), unsigned_char_dynarr *dst,
 		     enum unicode_type type, unsigned int little_endian)
 {
 #ifdef MULE
 DEFSUBR (Fdefault_unicode_precedence_list);
 DEFSUBR (Fset_unicode_conversion);
 DEFSUBR (Fload_unicode_mapping_table);
+DEFSYMBOL (Qccl_encode_to_ucs_2);
+DEFSYMBOL (Qlast_allocated_character);
 DEFSYMBOL (Qignore_first_column);
 #endif /* MULE */
 DEFSUBR (Fchar_to_unicode);
 DEFSUBR (Funicode_to_char);
 unicode_precedence_dynarr = Dynarr_new (Lisp_Object);
 dump_add_root_block_ptr (&unicode_precedence_dynarr,
 			    &lisp_object_dynarr_description);
 init_blank_unicode_tables ();
+staticpro (&Vcurrent_jit_charset);
+Vcurrent_jit_charset = Qnil;
 /* Note that the "block" we are describing is a single pointer, and hence
 we could potentially use dump_add_root_block_ptr().  However, given
 the way the descriptions are written, we couldn't use them, and would
 have to write new descriptions for each of the pointers below, since

Mercurial > hg > xemacs-beta

comparison src/unicode.c @ 3439:d1754e7f0cea