diff src/syntax.h @ 5544:c2301b2c88c8

Improve documentation of syntax table internals.
author Stephen J. Turnbull <stephen@xemacs.org>
date Mon, 08 Aug 2011 13:57:20 +0900
parents dab422055bab
children 85210c453a97
line wrap: on
line diff
--- a/src/syntax.h	Mon Aug 08 13:57:20 2011 +0900
+++ b/src/syntax.h	Mon Aug 08 13:57:20 2011 +0900
@@ -26,22 +26,24 @@
 
 /* A syntax table is a type of char table.
 
-The low 7 bits of the integer is a code, as follows. The 8th bit is
-used as the prefix bit flag (see below).
-
 The values in a syntax table are either integers or conses of
 integers and chars.  The lowest 7 bits of the integer are the syntax
 class.  If this is Sinherit, then the actual syntax value needs to
 be retrieved from the standard syntax table.
 
-Since the logic involved in finding the actual integer isn't very
-complex, you'd think the time required to retrieve it is not a
-factor.  If you thought that, however, you'd be wrong, due to the
-high number of times (many per character) that the syntax value is
-accessed in functions such as scan_lists().  To speed this up,
-we maintain a mirror syntax table that contains the actual
-integers.  We can do this successfully because syntax tables are
-now an abstract type, where we control all access.
+It turns out to be worth optimizing lookups of character syntax in two
+ways.  First, although the logic involved in finding the actual integer
+isn't complex, the syntax value is accessed in functions such as
+scan_lists() many times for each character scanned.  A "mirror syntax
+table" that contains the actual integers speeds this up.
+
+Second, due to the syntax-table text property, the table for looking up
+syntax may change from character to character.  Since looking up properties
+is expensive, a "syntax cache" which contains the current syntax table and
+the region where it is valid can speed up linear scans dramatically.
+
+The low 7 bits of the integer is a code, as follows. The 8th bit is
+used as the prefix bit flag (see below).
 */
 
 enum syntaxcode
@@ -120,21 +122,23 @@
   return SYNTAX (table, c) == Sword;
 }
 
-/* OK, here's a graphic diagram of the format of the syntax values:
+/* OK, here's a graphic diagram of the format of the syntax values.
+   Here, the value has already been extracted from the Lisp integer,
+   so there are no tag bits to worry about.
 
    Bit number:
 
  [ 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 ]
  [ 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 ]
 
-   <-----> <-----> <-------------> <-------------> ^  <----------->
-    ELisp  unused  |comment bits |     unused      |   syntax code
-     tag           | | | | | | | |                 |
-    stuff          | | | | | | | |                 |
-                   | | | | | | | |                 |
-                   | | | | | | | |                 `--> prefix flag
-                   | | | | | | | |
-                   | | | | | | | `--> comment end style B, second char
+   | <-----------> <-------------> <-------------> ^ <----------->
+   |     unused    |comment bits |     unused      |  syntax code
+   v               | | | | | | | |                 |
+   unusable        | | | | | | | |                 |
+   due to          | | | | | | | |                 |
+   type tag        | | | | | | | |                 `--> prefix flag
+   in Lisp         | | | | | | | |
+   integer         | | | | | | | `--> comment end style B, second char
                    | | | | | | `----> comment end style A, second char
                    | | | | | `------> comment end style B, first char
                    | | | | `--------> comment end style A, first char
@@ -144,35 +148,35 @@
                    `----------------> comment start style A, first char
 
   In a 64-bit integer, there would be 32 more unused bits between
-  the tag and the comment bits.
+  the unusable bit and the comment bits.
 
-  Clearly, such a scheme will not work for Mule, because the matching
-  paren could be any character and as such requires 21 bits, which
-  we don't got.
+  In older versions of XEmacs, bits 8-14 contained the matching
+  character for parentheses.  Such a scheme will not work for Mule,
+  because the matching parenthesis could be any character and
+  requires 21 bits, which we don't have on a 32-bit platform.
 
-  Remember that under Mule we use char tables instead of vectors.
-  So what we do is use another char table for the matching paren
+  What we do is use another char table for the matching parenthesis
   and store a pointer to it in the first char table. (This frees
   code from having to worry about passing two tables around.)
 */
 
 
-/* The prefix flag bit for backward-prefix-chars is now put into bit 7. */
+/* The prefix flag bit for backward-prefix-chars is in bit 7. */
 
 #define SYNTAX_PREFIX(table, c) \
   ((SYNTAX_CODE (table, c) >> 7) & 1)
 
 /* Bits 23-16 are used to implement up to two comment styles
    in a single buffer. They have the following meanings:
-
-  1. first of a one or two character comment-start sequence of style a.
-  2. first of a one or two character comment-start sequence of style b.
-  3. second of a two-character comment-start sequence of style a.
-  4. second of a two-character comment-start sequence of style b.
-  5. first of a one or two character comment-end sequence of style a.
-  6. first of a one or two character comment-end sequence of style b.
-  7. second of a two-character comment-end sequence of style a.
-  8. second of a two-character comment-end sequence of style b.
+  bit
+  23   first of a one or two character comment-start sequence of style a.
+  22   first of a one or two character comment-start sequence of style b.
+  21   second of a two-character comment-start sequence of style a.
+  20   second of a two-character comment-start sequence of style b.
+  19   first of a one or two character comment-end sequence of style a.
+  18   first of a one or two character comment-end sequence of style b.
+  17   second of a two-character comment-end sequence of style a.
+  16   second of a two-character comment-end sequence of style b.
  */
 
 #define SYNTAX_COMMENT_BITS(table, c) \
@@ -196,84 +200,20 @@
 #define SYNTAX_SECOND_CHAR_END   0x03
 #define SYNTAX_SECOND_CHAR       0x33
 
-#if 0
-
-/* #### Entirely unused.  Should they be deleted? */
-
-/* #### These are now more or less equivalent to
-   SYNTAX_COMMENT_MATCH_START ...*/
-/* a and b must be first and second start chars for a common type */
-#define SYNTAX_START_P(table, a, b)                                     \
-  (((SYNTAX_COMMENT_BITS (table, a) & SYNTAX_FIRST_CHAR_START) >> 2)    \
-   & (SYNTAX_COMMENT_BITS (table, b) & SYNTAX_SECOND_CHAR_START))
-
-/* ... and  SYNTAX_COMMENT_MATCH_END */
-/* a and b must be first and second end chars for a common type */
-#define SYNTAX_END_P(table, a, b)                                       \
-  (((SYNTAX_COMMENT_BITS (table, a) & SYNTAX_FIRST_CHAR_END) >> 2)      \
-   & (SYNTAX_COMMENT_BITS (table, b) & SYNTAX_SECOND_CHAR_END))
-
-#define SYNTAX_STYLES_MATCH_START_P(table, a, b, mask)			    \
-  ((SYNTAX_COMMENT_BITS (table, a) & SYNTAX_FIRST_CHAR_START & (mask))	    \
-   && (SYNTAX_COMMENT_BITS (table, b) & SYNTAX_SECOND_CHAR_START & (mask)))
-
-#define SYNTAX_STYLES_MATCH_END_P(table, a, b, mask)			  \
-  ((SYNTAX_COMMENT_BITS (table, a) & SYNTAX_FIRST_CHAR_END & (mask))	  \
-   && (SYNTAX_COMMENT_BITS (table, b) & SYNTAX_SECOND_CHAR_END & (mask)))
-
-#define SYNTAX_STYLES_MATCH_1CHAR_P(table, a, mask)	\
-  ((SYNTAX_COMMENT_BITS (table, a) & (mask)))
-
-#define STYLE_FOUND_P(table, a, b, startp, style)	\
-  ((SYNTAX_COMMENT_BITS (table, a) &			\
-    ((startp) ? SYNTAX_FIRST_CHAR_START :		\
-     SYNTAX_FIRST_CHAR_END) & (style))			\
-   && (SYNTAX_COMMENT_BITS (table, b) &			\
-    ((startp) ? SYNTAX_SECOND_CHAR_START : 		\
-     SYNTAX_SECOND_CHAR_END) & (style)))
-
-#define SYNTAX_COMMENT_MASK_START(table, a, b)			\
-  ((STYLE_FOUND_P (table, a, b, 1, SYNTAX_COMMENT_STYLE_A)	\
-    ? SYNTAX_COMMENT_STYLE_A					\
-    : (STYLE_FOUND_P (table, a, b, 1, SYNTAX_COMMENT_STYLE_B)	\
-         ? SYNTAX_COMMENT_STYLE_B				\
-	 : 0)))
-
-#define SYNTAX_COMMENT_MASK_END(table, a, b)			\
-  ((STYLE_FOUND_P (table, a, b, 0, SYNTAX_COMMENT_STYLE_A)	\
-   ? SYNTAX_COMMENT_STYLE_A					\
-   : (STYLE_FOUND_P (table, a, b, 0, SYNTAX_COMMENT_STYLE_B)	\
-      ? SYNTAX_COMMENT_STYLE_B					\
-      : 0)))
-
-#define STYLE_FOUND_1CHAR_P(table, a, style)	\
-  ((SYNTAX_COMMENT_BITS (table, a) & (style)))
-
-#define SYNTAX_COMMENT_1CHAR_MASK(table, a)			\
-  ((STYLE_FOUND_1CHAR_P (table, a, SYNTAX_COMMENT_STYLE_A)	\
-   ? SYNTAX_COMMENT_STYLE_A					\
-   : (STYLE_FOUND_1CHAR_P (table, a, SYNTAX_COMMENT_STYLE_B)	\
-      ? SYNTAX_COMMENT_STYLE_B					\
-	 : 0)))
-
-#endif /* 0 */
-
-/* This array, indexed by a character, contains the syntax code which
-   that character signifies (as a char).
-   For example, (enum syntaxcode) syntax_spec_code['w'] is Sword. */
-
+/* Array of syntax codes, indexed by characters which designate them.
+   Designators must be ASCII characters (ie, in the range 0x00-0x7F).
+   Bounds checking is the responsibility of calling code. */
 extern const unsigned char syntax_spec_code[0200];
 
-/* Indexed by syntax code, give the letter that describes it. */
-
+/* Array of designators indexed by syntax code.
+   Indicies should be of type enum syntaxcode. */
 extern const unsigned char syntax_code_spec[];
 
 Lisp_Object scan_lists (struct buffer *buf, Charbpos from, int count,
 			int depth, int sexpflag, int no_error);
 int char_quoted (struct buffer *buf, Charbpos pos);
 
-/* NOTE: This does not refer to the mirror table, but to the
-   syntax table itself. */
+/* TABLE is a syntax table, not the mirror table. */
 Lisp_Object syntax_match (Lisp_Object table, Ichar ch);
 
 extern int no_quit_in_re_search;
@@ -283,55 +223,55 @@
 
 extern int lookup_syntax_properties;
 
-/* Now that the `syntax-table' property exists, and can override the syntax
-   table or directly specify the syntax, we cache the last place we
-   retrieved the syntax-table property.  This is because, when moving
-   linearly through text (e.g. in the regex routines or the scanning
-   routines in syntax.c), we only need to recalculate at the next place the
-   syntax-table property changes (i.e. not every position), and when we do
-   need to recalculate, we can update the info from the previous info
-   faster than if we did the whole calculation from scratch. */
+/* The `syntax-table' property overrides the syntax table or directly
+   specifies the syntax.  Since looking up properties is expensive, we cache
+   the information about the syntax-table property.  When moving linearly
+   through text (e.g. in the regex routines or the scanning routines in
+   syntax.c), recalculation is needed only when the syntax-table property
+   changes (i.e. not every position).
+   When we do need to recalculate, we can update the info from the previous
+   info faster than if we did the whole calculation from scratch.
+   #### sjt sez: I'm not sure I believe that last claim.  That seems to
+   require that we use directional information, etc, but that is ignored in
+   the current implementation. */
 struct syntax_cache
 {
 #ifdef NEW_GC
   NORMAL_LISP_OBJECT_HEADER header;
 #endif /* NEW_GC */
-  int use_code;				/* Whether to use syntax_code or
-					   syntax_table.  This is set
-					   depending on whether the
-					   syntax-table property is a
-					   syntax table or a syntax
-					   code. */
-  int no_syntax_table_prop;		/* If non-zero, there was no
-					   `syntax-table' property on the
-					   current range, and so we're
-					   using the buffer's syntax table.
-					   This is important to note because
-					   sometimes the buffer's syntax
-					   table can be changed. */
-  Lisp_Object object;			/* The buffer or string the current
-					   syntax cache applies to, or
-					   Qnil for a string of text not
-					   coming from a buffer or string. */
-  struct buffer *buffer;		/* The buffer that supplies the
-					   syntax tables, or 0 for the
-					   standard syntax table.  If
-					   OBJECT is a buffer, this will
-					   always be the same buffer. */
-  int syntax_code;			/* Syntax code of current char. */
-  Lisp_Object syntax_table;		/* Syntax table for current pos. */
-  Lisp_Object mirror_table;		/* Mirror table for this table. */
-  Lisp_Object start, end;		/* Markers to keep track of the
-					   known region in a buffer.
-					   Formerly we used an internal
-					   extent, but it seems that having
-					   an extent over the entire buffer
-					   causes serious slowdowns in
-					   extent operations!  Yuck! */
-  Charxpos next_change;			/* Position of the next extent
-                                           change. */
-  Charxpos prev_change;			/* Position of the previous extent
-					   change. */
+  int use_code;			/* Non-zero if a syntax-table property
+				   specified a syntax code.  When zero, the
+				   syntax_code member is invalid.  Otherwise
+				   the syntax_table member is invalid. */
+  int no_syntax_table_prop;	/* If non-zero, there was no `syntax-table'
+				   property on the current range, and so we're
+				   using the buffer's syntax table.
+				   Then we must invalidate the cache if the
+				   buffer's syntax table is changed. */
+  Lisp_Object object;		/* The buffer or string the current syntax
+				   cache applies to, or Qnil for a string of
+				   text not coming from a buffer or string. */
+  struct buffer *buffer;	/* The buffer that supplies the syntax tables,
+				   or NULL for the standard syntax table.  If
+				   OBJECT is a buffer, this will always be
+				   the same buffer. */
+  int syntax_code;		/* Syntax code of current char. */
+  Lisp_Object syntax_table;	/* Syntax table for current pos. */
+  Lisp_Object mirror_table;	/* Mirror table for this table. */
+  Lisp_Object start, end;	/* Markers to keep track of the known region
+				   in a buffer.
+				   Normally these correspond to prev_change
+				   and next_change, respectively, except when
+				   insertions and deletions occur.  Then
+				   prev_change and next change will be
+				   refreshed from these markers.  See
+				   signal_syntax_cache_extent_adjust().
+				   We'd like to use an extent, but it seems
+				   that having an extent over the entire
+				   buffer causes serious slowdowns in extent
+				   operations!  Yuck! */
+  Charxpos next_change;		/* Position of the next extent change. */
+  Charxpos prev_change;		/* Position of the previous extent change. */
 };
 
 #ifdef NEW_GC
@@ -347,13 +287,10 @@
 #define CONCHECK_SYNTAX_CACHE(x) CONCHECK_RECORD (x, syntax_cache)
 #endif /* NEW_GC */
 
-
-
 extern const struct sized_memory_description syntax_cache_description;
 
-/* Note that the external interface to the syntax-cache uses charpos's, but
+/* Note that the external interface to the syntax cache uses charpos's, but
    internally we use bytepos's, for speed. */
-
 void update_syntax_cache (struct syntax_cache *cache, Charxpos pos, int count);
 struct syntax_cache *setup_syntax_cache (struct syntax_cache *cache,
 					 Lisp_Object object,