diff src/text.c @ 1292:f3437b56874d

[xemacs-hg @ 2003-02-13 09:57:04 by ben] profile updates profile.c: Major reworking. Keep track of new information -- total function timing (includes descendants), GC usage, total GC usage (includes descendants). New functions to be called appropriately from eval.c, alloc.c to keep track of this information. Keep track of when we're actually in a function vs. in its profile, for more accurate timing counts. Track profile overhead separately. Create new mechanism for specifying "internal sections" that are tracked just like regular Lisp functions and even appear in the backtrace if `backtrace-with-internal-sections' is non-nil (t by default for error-checking builds). Add some KKCC information for the straight (non-Elisp) hash table used by profile, which contains Lisp objects in its keys -- but not used yet. Remove old ad-hoc methods for tracking garbage collection, redisplay (which was incorrect anyway when Lisp was called within these sections). Don't record any tick info when blocking under MS Windows, since the timer there is in real time rather than in process time. Make `start-profiling', `stop-profiling' interactive. Be consistent wrt. recursive functions and functions currently on the stack when starting or stopping -- together these make implementing the `total' values extremely difficult. When we start profiling, we act as if we just entered all the functions currently on the stack. Likewise when exiting. Create vars in_profile for tracking time spent inside of profiling, and profiling_lock for setting exclusive access to the main hash table when reading from it or modifying it. (Protects against getting screwed up by the signal handler going off at the same time.) profile.h: New file. Create macros for declaring internal profiling sections. lisp.h: Move profile-related stuff to profile.h. alloc.c: Keep track of total consing, for profile. Tell profile when we are consing. Use new profile-section method for noting garbage-collection. 
alloc.c: Abort if we attempt to call the allocator reentrantly. backtrace.h, eval.c: Add info for use by profile in the backtrace frame and transfer PUSH_BACKTRACE/POP_BACKTRACE from eval.c, for use with profile. elhash.c: Author comment. eval.c, lisp.h: New Lisp var `backtrace-with-internal-sections'. Set to t when error-checking is on. eval.c: When unwinding, eval.c: Report to profile when we are about-to-call and just-called wrt. a function. alloc.c, eval.c: Allow for "fake" backtrace frames, for internal sections (used by profile and `backtrace-with-internal-sections'). event-Xt.c, event-gtk.c, event-msw.c, event-tty.c: Record when we are actually blocking on an event, for profile's sake. event-stream.c: Record internal profiling sections for getting, dispatching events. extents.c: Record internal profiling sections for map_extents. hash.c, hash.h: Add pregrow_hash_table_if_necessary(). (Used in profile code since the signal handler is the main grower but can't allow a realloc(). We make sure, at critical points, that the table is large enough.) lread.c: Create internal profiling sections for `load' (which may be triggered internally by autoload, etc.). redisplay.c: Remove old profile_redisplay_flag. Use new macros to declare internal profiling section for redisplay. text.c: Use new macros to declare internal profiling sections for char-byte conversion and internal-external conversion. SEMI-UNRELATED CHANGES: ----------------------- text.c: Update the long comments.
author ben
date Thu, 13 Feb 2003 09:57:08 +0000
parents e22b0213b713
children 70921960b980
line wrap: on
line diff
--- a/src/text.c	Wed Feb 12 22:52:33 2003 +0000
+++ b/src/text.c	Thu Feb 13 09:57:08 2003 +0000
@@ -1,6 +1,6 @@
 /* Buffer manipulation primitives for XEmacs.
    Copyright (C) 1995 Sun Microsystems, Inc.
-   Copyright (C) 1995, 1996, 2000, 2001, 2002 Ben Wing.
+   Copyright (C) 1995, 1996, 2000, 2001, 2002, 2003 Ben Wing.
    Copyright (C) 1999 Martin Buchholz.
 
 This file is part of XEmacs.
@@ -32,6 +32,7 @@
 #include "charset.h"
 #include "file-coding.h"
 #include "lstream.h"
+#include "profile.h"
 
 
 /************************************************************************/
@@ -40,19 +41,71 @@
 
 /*
    ==========================================================================
-                               1. Character Sets
+                1. Intro to Characters, Character Sets, and Encodings
    ==========================================================================
 
-   A character set (or "charset") is an ordered set of characters.
-
    A character (which is, BTW, a surprisingly complex concept) is, in a
    written representation of text, the most basic written unit that has a
    meaning of its own.  It's comparable to a phoneme when analyzing words
-   in spoken speech.  Just like with a phoneme (which is an abstract
-   concept, and is represented in actual spoken speech by one or more
-   allophones, ...&&#### finish this., a character is actually an abstract
-   concept
+   in spoken speech (for example, the sound of `t' in English, which in
+   fact has different pronunciations in different words -- aspirated in
+   `time', unaspirated in `stop', unreleased or even pronounced as a
+   glottal stop in `button', etc. -- but logically is a single concept).
+   Like a phoneme, a character is an abstract concept defined by its
+   *meaning*.  The character `lowercase f', for example, can always be used
+   to represent the first letter in the word `fill', regardless of whether
+   it's drawn upright or italic, whether the `fi' combination is drawn as a
+   single ligature, whether there are serifs on the bottom of the vertical
+   stroke, etc. (These different appearances of a single character are
+   often called "graphs" or "glyphs".) Our concern when representing text
+   is with representing the abstract characters, and not with their exact
+   appearance.
+
+   A character set (or "charset"), as we define it, is a set of characters,
+   each with an associated number (or set of numbers -- see below), called
+   a "code point".  It's important to understand that a character is not
+   defined by any number attached to it, but by its meaning.  For example,
+   ASCII and EBCDIC are two charsets containing exactly the same characters
+   (lowercase and uppercase letters, numbers 0 through 9, particular
+   punctuation marks) but with different numberings. The `comma' character
+   in ASCII and EBCDIC, for instance, is the same character despite having
+   a different numbering.  Conversely, when comparing ASCII and JIS-Roman,
+   which look the same except that the latter has a yen sign substituted
+   for the backslash, we would say that the backslash and yen sign are
+   *not* the same characters, despite having the same number (92) and
+   despite the fact that all other characters are present in both charsets,
+   with the same numbering.  ASCII and JIS-Roman, then, do *not* have
+   exactly the same characters in them (ASCII has a backslash character but
+   no yen-sign character, and vice-versa for JIS-Roman), unlike ASCII and
+   EBCDIC, even though the numberings in ASCII and JIS-Roman are closer.
+
+   It's also important to distinguish between charsets and encodings.  For
+   a simple charset like ASCII, there is only one encoding normally used --
+   each character is represented by a single byte, with the same value as
+   its code point.  For more complicated charsets, however, things are not
+   so obvious.  Unicode version 2, for example, is a large charset with
+   thousands of characters, each indexed by a 16-bit number, often
+   represented in hex, e.g. 0x05D0 for the Hebrew letter "aleph".  One
+   obvious encoding uses two bytes per character (actually two encodings,
+   depending on which of the two possible byte orderings is chosen).  This
+   encoding is convenient for internal processing of Unicode text; however,
+   it's incompatible with ASCII, so a different encoding, e.g. UTF-8, is
+   usually used for external text, for example files or e-mail.  UTF-8
+   represents Unicode characters with one to three bytes (often extended to
+   six bytes to handle characters with up to 31-bit indices).  Unicode
+   characters 00 to 7F (identical with ASCII) are directly represented with
+   one byte, and other characters with two or more bytes, each in the range
+   80 to FF.
+
+   In general, a single encoding may be able to represent more than one
+   charset.
+
+   See also man/lispref/mule.texi.
    
+   ==========================================================================
+                               2. Character Sets
+   ==========================================================================
+
    A particular character in a charset is indexed using one or
    more "position codes", which are non-negative integers.
    The number of position codes needed to identify a particular
@@ -131,7 +184,7 @@
    This is a bit ad-hoc but gets the job done.
 
    ==========================================================================
-                               2. Encodings
+                               3. Encodings
    ==========================================================================
 
    An "encoding" is a way of numerically representing
@@ -212,7 +265,7 @@
    Initially, Printing-ASCII is invoked.
 
    ==========================================================================
-                          3. Internal Mule Encodings
+                          4. Internal Mule Encodings
    ==========================================================================
 
    In XEmacs/Mule, each character set is assigned a unique number,
@@ -336,7 +389,7 @@
    of the search string and &&#### finish this.
 
    ==========================================================================
-                  4. Buffer Positions and Other Typedefs
+                  5. Buffer Positions and Other Typedefs
    ==========================================================================
 
    A. Buffer Positions
@@ -383,7 +436,7 @@
    B. Other Typedefs
 
       Ichar:
-      -------
+      ------
         This typedef represents a single Emacs character, which can be
 	ASCII, ISO-8859, or some extended character, as would typically
 	be used for Kanji.  Note that the representation of a character
@@ -405,7 +458,7 @@
 	the standard 8-bit representation of ASCII/ISO-8859-1.
 
       Ibyte:
-      --------
+      ------
         The data in a buffer or string is logically made up of Ibyte
 	objects, where an Ibyte takes up the same amount of space as a
 	char. (It is declared differently, though, to catch invalid
@@ -428,8 +481,8 @@
 	     within the string, you need merely use standard
 	     searching routines.
 
-      array of char:
-      --------------
+      Extbyte:
+      --------
         Strings that go in or out of Emacs are in "external format",
 	typedef'ed as an array of char or a char *.  There is more
 	than one external format (JIS, EUC, etc.) but they all
@@ -515,26 +568,27 @@
    case. #### unfinished
 
    ==========================================================================
-                                5. Miscellaneous
+                                6. Miscellaneous
    ==========================================================================
 
    A. Unicode Support
 
-   Adding Unicode support is very desirable.  Unicode will likely be a
-   very common representation in the future, and thus we should
-   represent Unicode characters using three bytes instead of four.
-   This means we need to find leading bytes for Unicode.  Given that
-   there are 65,536 characters in Unicode and we can attach 96x96 =
-   9,216 characters per leading byte, we need eight leading bytes for
-   Unicode.  We currently have four free (0x9A - 0x9D), and with a
-   little bit of rearranging we can get five: ASCII doesn't really
-   need to take up a leading byte. (We could just as well use 0x7F,
-   with a little change to the functions that assume that 0x80 is the
-   lowest leading byte.) This means we still need to dump three
-   leading bytes and move them into private space.  The CNS charsets
-   are good candidates since they are rarely used, and
-   JAPANESE_JISX0208_1978 is becoming less and less used and could
-   also be dumped.
+   Unicode support is very desirable.  Currently we know how to handle
+   externally-encoded Unicode data in various encodings -- UTF-16, UTF-8,
+   etc.  However, we really need to represent Unicode characters internally
+   as-is, rather than converting to some language-specific character set.
+   For efficiency, we should represent Unicode characters using 3 bytes
+   rather than 4.  This means we need to find leading bytes for Unicode.
+   Given that there are 65,536 characters in Unicode and we can attach
+   96x96 = 9,216 characters per leading byte, we need eight leading bytes
+   for Unicode.  We currently have four free (0x9A - 0x9D), and with a
+   little bit of rearranging we can get five: ASCII doesn't really need to
+   take up a leading byte. (We could just as well use 0x7F, with a little
+   change to the functions that assume that 0x80 is the lowest leading
+   byte.) This means we still need to dump three leading bytes and move
+   them into private space.  The CNS charsets are good candidates since
+   they are rarely used, and JAPANESE_JISX0208_1978 is becoming less and
+   less used and could also be dumped.
 
    B. Composite Characters
       
@@ -624,6 +678,9 @@
 
 #endif /* MULE */
 
+Lisp_Object QSin_char_byte_conversion;
+Lisp_Object QSin_internal_external_conversion;
+
 
 /************************************************************************/
 /*                          qxestr***() functions                       */
@@ -1599,6 +1656,7 @@
   Bytebpos retval;
   int diff_so_far;
   int add_to_cache = 0;
+  PROFILE_DECLARE ();
 
   /* Check for some cached positions, for speed. */
   if (x == BUF_PT (buf))
@@ -1608,6 +1666,8 @@
   if (x == BUF_BEGV (buf))
     return BYTE_BUF_BEGV (buf);
 
+  PROFILE_RECORD_ENTERING_SECTION (QSin_char_byte_conversion);
+
   bufmin = buf->text->mule_bufmin;
   bufmax = buf->text->mule_bufmax;
   bytmin = buf->text->mule_bytmin;
@@ -1858,6 +1918,8 @@
       buf->text->mule_bytebpos_cache[replace_loc] = retval;
     }
 
+  PROFILE_RECORD_EXITING_SECTION (QSin_char_byte_conversion);
+
   return retval;
 }
 
@@ -1876,6 +1938,7 @@
   Charbpos retval;
   int diff_so_far;
   int add_to_cache = 0;
+  PROFILE_DECLARE ();
 
   /* Check for some cached positions, for speed. */
   if (x == BYTE_BUF_PT (buf))
@@ -1885,6 +1948,8 @@
   if (x == BYTE_BUF_BEGV (buf))
     return BUF_BEGV (buf);
 
+  PROFILE_RECORD_ENTERING_SECTION (QSin_char_byte_conversion);
+
   bufmin = buf->text->mule_bufmin;
   bufmax = buf->text->mule_bufmax;
   bytmin = buf->text->mule_bytmin;
@@ -2135,6 +2200,8 @@
       buf->text->mule_bytebpos_cache[replace_loc] = x;
     }
 
+  PROFILE_RECORD_EXITING_SECTION (QSin_char_byte_conversion);
+
   return retval;
 }
 
@@ -2759,8 +2826,13 @@
   /* It's guaranteed that many callers are not prepared for GC here,
      esp. given that this code conversion occurs in many very hidden
      places. */
-  int count = begin_gc_forbidden ();
+  int count;
   Extbyte_dynarr *conversion_out_dynarr;
+  PROFILE_DECLARE ();
+
+  PROFILE_RECORD_ENTERING_SECTION (QSin_internal_external_conversion);
+
+  count = begin_gc_forbidden ();
 
   type_checking_assert
     (((source_type == DFC_TYPE_DATA) ||
@@ -2945,6 +3017,8 @@
       Dynarr_add (conversion_out_dynarr, '\0');
       sink->data.ptr = Dynarr_atp (conversion_out_dynarr, 0);
     }
+
+  PROFILE_RECORD_EXITING_SECTION (QSin_internal_external_conversion);
 }
 
 void
@@ -2957,8 +3031,13 @@
   /* It's guaranteed that many callers are not prepared for GC here,
      esp. given that this code conversion occurs in many very hidden
      places. */
-  int count = begin_gc_forbidden ();
+  int count;
   Ibyte_dynarr *conversion_in_dynarr;
+  PROFILE_DECLARE ();
+
+  PROFILE_RECORD_ENTERING_SECTION (QSin_internal_external_conversion);
+
+  count = begin_gc_forbidden ();
 
   type_checking_assert
     ((source_type == DFC_TYPE_DATA ||
@@ -3010,7 +3089,8 @@
 #endif
     }
 #ifdef HAVE_WIN32_CODING_SYSTEMS
-  /* Optimize the common case involving Unicode where only ASCII/Latin-1 is involved */
+  /* Optimize the common case involving Unicode where only ASCII/Latin-1 is
+     involved */
   else if (source_type != DFC_TYPE_LISP_LSTREAM &&
 	   sink_type   != DFC_TYPE_LISP_LSTREAM &&
 	   dfc_coding_system_is_unicode (coding_system))
@@ -3135,6 +3215,8 @@
       Dynarr_add (conversion_in_dynarr, '\0');
       sink->data.ptr = Dynarr_atp (conversion_in_dynarr, 0);
     }
+
+  PROFILE_RECORD_EXITING_SECTION (QSin_internal_external_conversion);
 }
 
 
@@ -3668,6 +3750,12 @@
 {
   reinit_vars_of_text ();
 
+  QSin_char_byte_conversion = build_msg_string ("(in char-byte conversion)");
+  staticpro (&QSin_char_byte_conversion);
+  QSin_internal_external_conversion =
+    build_msg_string ("(in internal-external conversion)");
+  staticpro (&QSin_internal_external_conversion);
+
 #ifdef ENABLE_COMPOSITE_CHARS
   /* #### not dumped properly */
   composite_char_row_next = 32;