comparison src/syntax.h @ 5544:c2301b2c88c8

Improve documentation of syntax table internals.
author Stephen J. Turnbull <stephen@xemacs.org>
date Mon, 08 Aug 2011 13:57:20 +0900
parents dab422055bab
children 85210c453a97
comparison
equal deleted inserted replaced
5543:fbe90e6f7a43 5544:c2301b2c88c8
24 24
25 #include "chartab.h" 25 #include "chartab.h"
26 26
27 /* A syntax table is a type of char table. 27 /* A syntax table is a type of char table.
28 28
29 The low 7 bits of the integer is a code, as follows. The 8th bit is
30 used as the prefix bit flag (see below).
31
32 The values in a syntax table are either integers or conses of 29 The values in a syntax table are either integers or conses of
33 integers and chars. The lowest 7 bits of the integer are the syntax 30 integers and chars. The lowest 7 bits of the integer are the syntax
34 class. If this is Sinherit, then the actual syntax value needs to 31 class. If this is Sinherit, then the actual syntax value needs to
35 be retrieved from the standard syntax table. 32 be retrieved from the standard syntax table.
36 33
37 Since the logic involved in finding the actual integer isn't very 34 It turns out to be worth optimizing lookups of character syntax in two
38 complex, you'd think the time required to retrieve it is not a 35 ways. First, although the logic involved in finding the actual integer
39 factor. If you thought that, however, you'd be wrong, due to the 36 isn't complex, the syntax value is accessed in functions such as
40 high number of times (many per character) that the syntax value is 37 scan_lists() many times for each character scanned. A "mirror syntax
41 accessed in functions such as scan_lists(). To speed this up, 38 table" that contains the actual integers speeds this up.
42 we maintain a mirror syntax table that contains the actual 39
43 integers. We can do this successfully because syntax tables are 40 Second, due to the syntax-table text property, the table for looking up
44 now an abstract type, where we control all access. 41 syntax may change from character to character. Since looking up properties
42 is expensive, a "syntax cache" which contains the current syntax table and
43 the region where it is valid can speed up linear scans dramatically.
44
45 The low 7 bits of the integer is a code, as follows. The 8th bit is
46 used as the prefix bit flag (see below).
45 */ 47 */
46 48
47 enum syntaxcode 49 enum syntaxcode
48 { 50 {
49 Swhitespace, /* whitespace character */ 51 Swhitespace, /* whitespace character */
118 ) 120 )
119 { 121 {
120 return SYNTAX (table, c) == Sword; 122 return SYNTAX (table, c) == Sword;
121 } 123 }
122 124
123 /* OK, here's a graphic diagram of the format of the syntax values: 125 /* OK, here's a graphic diagram of the format of the syntax values.
126 Here, the value has already been extracted from the Lisp integer,
127 so there are no tag bits to worry about.
124 128
125 Bit number: 129 Bit number:
126 130
127 [ 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 ] 131 [ 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 ]
128 [ 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 ] 132 [ 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 ]
129 133
130 <-----> <-----> <-------------> <-------------> ^ <-----------> 134 | <-----------> <-------------> <-------------> ^ <----------->
131 ELisp unused |comment bits | unused | syntax code 135 | unused |comment bits | unused | syntax code
132 tag | | | | | | | | | 136 v | | | | | | | | |
133 stuff | | | | | | | | | 137 unusable | | | | | | | | |
134 | | | | | | | | | 138 due to | | | | | | | | |
135 | | | | | | | | `--> prefix flag 139 type tag | | | | | | | | `--> prefix flag
136 | | | | | | | | 140 in Lisp | | | | | | | |
137 | | | | | | | `--> comment end style B, second char 141 integer | | | | | | | `--> comment end style B, second char
138 | | | | | | `----> comment end style A, second char 142 | | | | | | `----> comment end style A, second char
139 | | | | | `------> comment end style B, first char 143 | | | | | `------> comment end style B, first char
140 | | | | `--------> comment end style A, first char 144 | | | | `--------> comment end style A, first char
141 | | | `----------> comment start style B, second char 145 | | | `----------> comment start style B, second char
142 | | `------------> comment start style A, second char 146 | | `------------> comment start style A, second char
143 | `--------------> comment start style B, first char 147 | `--------------> comment start style B, first char
144 `----------------> comment start style A, first char 148 `----------------> comment start style A, first char
145 149
146 In a 64-bit integer, there would be 32 more unused bits between 150 In a 64-bit integer, there would be 32 more unused bits between
147 the tag and the comment bits. 151 the unusable bit and the comment bits.
148 152
149 Clearly, such a scheme will not work for Mule, because the matching 153 In older versions of XEmacs, bits 8-14 contained the matching
150 paren could be any character and as such requires 21 bits, which 154 character for parentheses. Such a scheme will not work for Mule,
151 we don't got. 155 because the matching parenthesis could be any character and
152 156 requires 21 bits, which we don't have on a 32-bit platform.
153 Remember that under Mule we use char tables instead of vectors. 157
154 So what we do is use another char table for the matching paren 158 What we do is use another char table for the matching parenthesis
155 and store a pointer to it in the first char table. (This frees 159 and store a pointer to it in the first char table. (This frees
156 code from having to worry about passing two tables around.) 160 code from having to worry about passing two tables around.)
157 */ 161 */
158 162
159 163
160 /* The prefix flag bit for backward-prefix-chars is now put into bit 7. */ 164 /* The prefix flag bit for backward-prefix-chars is in bit 7. */
161 165
162 #define SYNTAX_PREFIX(table, c) \ 166 #define SYNTAX_PREFIX(table, c) \
163 ((SYNTAX_CODE (table, c) >> 7) & 1) 167 ((SYNTAX_CODE (table, c) >> 7) & 1)
164 168
165 /* Bits 23-16 are used to implement up to two comment styles 169 /* Bits 23-16 are used to implement up to two comment styles
166 in a single buffer. They have the following meanings: 170 in a single buffer. They have the following meanings:
167 171 bit
168 1. first of a one or two character comment-start sequence of style a. 172 23 first of a one or two character comment-start sequence of style a.
169 2. first of a one or two character comment-start sequence of style b. 173 22 first of a one or two character comment-start sequence of style b.
170 3. second of a two-character comment-start sequence of style a. 174 21 second of a two-character comment-start sequence of style a.
171 4. second of a two-character comment-start sequence of style b. 175 20 second of a two-character comment-start sequence of style b.
172 5. first of a one or two character comment-end sequence of style a. 176 19 first of a one or two character comment-end sequence of style a.
173 6. first of a one or two character comment-end sequence of style b. 177 18 first of a one or two character comment-end sequence of style b.
174 7. second of a two-character comment-end sequence of style a. 178 17 second of a two-character comment-end sequence of style a.
175 8. second of a two-character comment-end sequence of style b. 179 16 second of a two-character comment-end sequence of style b.
176 */ 180 */
177 181
178 #define SYNTAX_COMMENT_BITS(table, c) \ 182 #define SYNTAX_COMMENT_BITS(table, c) \
179 ((SYNTAX_CODE (table, c) >> 16) &0xff) 183 ((SYNTAX_CODE (table, c) >> 16) &0xff)
180 184
194 #define SYNTAX_FIRST_CHAR 0xcc 198 #define SYNTAX_FIRST_CHAR 0xcc
195 #define SYNTAX_SECOND_CHAR_START 0x30 199 #define SYNTAX_SECOND_CHAR_START 0x30
196 #define SYNTAX_SECOND_CHAR_END 0x03 200 #define SYNTAX_SECOND_CHAR_END 0x03
197 #define SYNTAX_SECOND_CHAR 0x33 201 #define SYNTAX_SECOND_CHAR 0x33
198 202
199 #if 0 203 /* Array of syntax codes, indexed by characters which designate them.
200 204 Designators must be ASCII characters (ie, in the range 0x00-0x7F).
201 /* #### Entirely unused. Should they be deleted? */ 205 Bounds checking is the responsibility of calling code. */
202
203 /* #### These are now more or less equivalent to
204 SYNTAX_COMMENT_MATCH_START ...*/
205 /* a and b must be first and second start chars for a common type */
206 #define SYNTAX_START_P(table, a, b) \
207 (((SYNTAX_COMMENT_BITS (table, a) & SYNTAX_FIRST_CHAR_START) >> 2) \
208 & (SYNTAX_COMMENT_BITS (table, b) & SYNTAX_SECOND_CHAR_START))
209
210 /* ... and SYNTAX_COMMENT_MATCH_END */
211 /* a and b must be first and second end chars for a common type */
212 #define SYNTAX_END_P(table, a, b) \
213 (((SYNTAX_COMMENT_BITS (table, a) & SYNTAX_FIRST_CHAR_END) >> 2) \
214 & (SYNTAX_COMMENT_BITS (table, b) & SYNTAX_SECOND_CHAR_END))
215
216 #define SYNTAX_STYLES_MATCH_START_P(table, a, b, mask) \
217 ((SYNTAX_COMMENT_BITS (table, a) & SYNTAX_FIRST_CHAR_START & (mask)) \
218 && (SYNTAX_COMMENT_BITS (table, b) & SYNTAX_SECOND_CHAR_START & (mask)))
219
220 #define SYNTAX_STYLES_MATCH_END_P(table, a, b, mask) \
221 ((SYNTAX_COMMENT_BITS (table, a) & SYNTAX_FIRST_CHAR_END & (mask)) \
222 && (SYNTAX_COMMENT_BITS (table, b) & SYNTAX_SECOND_CHAR_END & (mask)))
223
224 #define SYNTAX_STYLES_MATCH_1CHAR_P(table, a, mask) \
225 ((SYNTAX_COMMENT_BITS (table, a) & (mask)))
226
227 #define STYLE_FOUND_P(table, a, b, startp, style) \
228 ((SYNTAX_COMMENT_BITS (table, a) & \
229 ((startp) ? SYNTAX_FIRST_CHAR_START : \
230 SYNTAX_FIRST_CHAR_END) & (style)) \
231 && (SYNTAX_COMMENT_BITS (table, b) & \
232 ((startp) ? SYNTAX_SECOND_CHAR_START : \
233 SYNTAX_SECOND_CHAR_END) & (style)))
234
235 #define SYNTAX_COMMENT_MASK_START(table, a, b) \
236 ((STYLE_FOUND_P (table, a, b, 1, SYNTAX_COMMENT_STYLE_A) \
237 ? SYNTAX_COMMENT_STYLE_A \
238 : (STYLE_FOUND_P (table, a, b, 1, SYNTAX_COMMENT_STYLE_B) \
239 ? SYNTAX_COMMENT_STYLE_B \
240 : 0)))
241
242 #define SYNTAX_COMMENT_MASK_END(table, a, b) \
243 ((STYLE_FOUND_P (table, a, b, 0, SYNTAX_COMMENT_STYLE_A) \
244 ? SYNTAX_COMMENT_STYLE_A \
245 : (STYLE_FOUND_P (table, a, b, 0, SYNTAX_COMMENT_STYLE_B) \
246 ? SYNTAX_COMMENT_STYLE_B \
247 : 0)))
248
249 #define STYLE_FOUND_1CHAR_P(table, a, style) \
250 ((SYNTAX_COMMENT_BITS (table, a) & (style)))
251
252 #define SYNTAX_COMMENT_1CHAR_MASK(table, a) \
253 ((STYLE_FOUND_1CHAR_P (table, a, SYNTAX_COMMENT_STYLE_A) \
254 ? SYNTAX_COMMENT_STYLE_A \
255 : (STYLE_FOUND_1CHAR_P (table, a, SYNTAX_COMMENT_STYLE_B) \
256 ? SYNTAX_COMMENT_STYLE_B \
257 : 0)))
258
259 #endif /* 0 */
260
261 /* This array, indexed by a character, contains the syntax code which
262 that character signifies (as a char).
263 For example, (enum syntaxcode) syntax_spec_code['w'] is Sword. */
264
265 extern const unsigned char syntax_spec_code[0200]; 206 extern const unsigned char syntax_spec_code[0200];
266 207
267 /* Indexed by syntax code, give the letter that describes it. */ 208 /* Array of designators indexed by syntax code.
268 209 Indicies should be of type enum syntaxcode. */
269 extern const unsigned char syntax_code_spec[]; 210 extern const unsigned char syntax_code_spec[];
270 211
271 Lisp_Object scan_lists (struct buffer *buf, Charbpos from, int count, 212 Lisp_Object scan_lists (struct buffer *buf, Charbpos from, int count,
272 int depth, int sexpflag, int no_error); 213 int depth, int sexpflag, int no_error);
273 int char_quoted (struct buffer *buf, Charbpos pos); 214 int char_quoted (struct buffer *buf, Charbpos pos);
274 215
275 /* NOTE: This does not refer to the mirror table, but to the 216 /* TABLE is a syntax table, not the mirror table. */
276 syntax table itself. */
277 Lisp_Object syntax_match (Lisp_Object table, Ichar ch); 217 Lisp_Object syntax_match (Lisp_Object table, Ichar ch);
278 218
279 extern int no_quit_in_re_search; 219 extern int no_quit_in_re_search;
280 220
281 221
282 /****************************** syntax caches ********************************/ 222 /****************************** syntax caches ********************************/
283 223
284 extern int lookup_syntax_properties; 224 extern int lookup_syntax_properties;
285 225
286 /* Now that the `syntax-table' property exists, and can override the syntax 226 /* The `syntax-table' property overrides the syntax table or directly
287 table or directly specify the syntax, we cache the last place we 227 specifies the syntax. Since looking up properties is expensive, we cache
288 retrieved the syntax-table property. This is because, when moving 228 the information about the syntax-table property. When moving linearly
289 linearly through text (e.g. in the regex routines or the scanning 229 through text (e.g. in the regex routines or the scanning routines in
290 routines in syntax.c), we only need to recalculate at the next place the 230 syntax.c), recalculation is needed only when the syntax-table property
291 syntax-table property changes (i.e. not every position), and when we do 231 changes (i.e. not every position).
292 need to recalculate, we can update the info from the previous info 232 When we do need to recalculate, we can update the info from the previous
293 faster than if we did the whole calculation from scratch. */ 233 info faster than if we did the whole calculation from scratch.
234 #### sjt sez: I'm not sure I believe that last claim. That seems to
235 require that we use directional information, etc, but that is ignored in
236 the current implementation. */
294 struct syntax_cache 237 struct syntax_cache
295 { 238 {
296 #ifdef NEW_GC 239 #ifdef NEW_GC
297 NORMAL_LISP_OBJECT_HEADER header; 240 NORMAL_LISP_OBJECT_HEADER header;
298 #endif /* NEW_GC */ 241 #endif /* NEW_GC */
299 int use_code; /* Whether to use syntax_code or 242 int use_code; /* Non-zero if a syntax-table property
300 syntax_table. This is set 243 specified a syntax code. When zero, the
301 depending on whether the 244 syntax_code member is invalid. Otherwise
302 syntax-table property is a 245 the syntax_table member is invalid. */
303 syntax table or a syntax 246 int no_syntax_table_prop; /* If non-zero, there was no `syntax-table'
304 code. */ 247 property on the current range, and so we're
305 int no_syntax_table_prop; /* If non-zero, there was no 248 using the buffer's syntax table.
306 `syntax-table' property on the 249 Then we must invalidate the cache if the
307 current range, and so we're 250 buffer's syntax table is changed. */
308 using the buffer's syntax table. 251 Lisp_Object object; /* The buffer or string the current syntax
309 This is important to note because 252 cache applies to, or Qnil for a string of
310 sometimes the buffer's syntax 253 text not coming from a buffer or string. */
311 table can be changed. */ 254 struct buffer *buffer; /* The buffer that supplies the syntax tables,
312 Lisp_Object object; /* The buffer or string the current 255 or NULL for the standard syntax table. If
313 syntax cache applies to, or 256 OBJECT is a buffer, this will always be
314 Qnil for a string of text not 257 the same buffer. */
315 coming from a buffer or string. */ 258 int syntax_code; /* Syntax code of current char. */
316 struct buffer *buffer; /* The buffer that supplies the 259 Lisp_Object syntax_table; /* Syntax table for current pos. */
317 syntax tables, or 0 for the 260 Lisp_Object mirror_table; /* Mirror table for this table. */
318 standard syntax table. If 261 Lisp_Object start, end; /* Markers to keep track of the known region
319 OBJECT is a buffer, this will 262 in a buffer.
320 always be the same buffer. */ 263 Normally these correspond to prev_change
321 int syntax_code; /* Syntax code of current char. */ 264 and next_change, respectively, except when
322 Lisp_Object syntax_table; /* Syntax table for current pos. */ 265 insertions and deletions occur. Then
323 Lisp_Object mirror_table; /* Mirror table for this table. */ 266 prev_change and next change will be
324 Lisp_Object start, end; /* Markers to keep track of the 267 refreshed from these markers. See
325 known region in a buffer. 268 signal_syntax_cache_extent_adjust().
326 Formerly we used an internal 269 We'd like to use an extent, but it seems
327 extent, but it seems that having 270 that having an extent over the entire
328 an extent over the entire buffer 271 buffer causes serious slowdowns in extent
329 causes serious slowdowns in 272 operations! Yuck! */
330 extent operations! Yuck! */ 273 Charxpos next_change; /* Position of the next extent change. */
331 Charxpos next_change; /* Position of the next extent 274 Charxpos prev_change; /* Position of the previous extent change. */
332 change. */
333 Charxpos prev_change; /* Position of the previous extent
334 change. */
335 }; 275 };
336 276
337 #ifdef NEW_GC 277 #ifdef NEW_GC
338 typedef struct syntax_cache Lisp_Syntax_Cache; 278 typedef struct syntax_cache Lisp_Syntax_Cache;
339 279
345 #define SYNTAX_CACHE_P(x) RECORDP (x, syntax_cache) 285 #define SYNTAX_CACHE_P(x) RECORDP (x, syntax_cache)
346 #define CHECK_SYNTAX_CACHE(x) CHECK_RECORD (x, syntax_cache) 286 #define CHECK_SYNTAX_CACHE(x) CHECK_RECORD (x, syntax_cache)
347 #define CONCHECK_SYNTAX_CACHE(x) CONCHECK_RECORD (x, syntax_cache) 287 #define CONCHECK_SYNTAX_CACHE(x) CONCHECK_RECORD (x, syntax_cache)
348 #endif /* NEW_GC */ 288 #endif /* NEW_GC */
349 289
350
351
352 extern const struct sized_memory_description syntax_cache_description; 290 extern const struct sized_memory_description syntax_cache_description;
353 291
354 /* Note that the external interface to the syntax-cache uses charpos's, but 292 /* Note that the external interface to the syntax cache uses charpos's, but
355 internally we use bytepos's, for speed. */ 293 internally we use bytepos's, for speed. */
356
357 void update_syntax_cache (struct syntax_cache *cache, Charxpos pos, int count); 294 void update_syntax_cache (struct syntax_cache *cache, Charxpos pos, int count);
358 struct syntax_cache *setup_syntax_cache (struct syntax_cache *cache, 295 struct syntax_cache *setup_syntax_cache (struct syntax_cache *cache,
359 Lisp_Object object, 296 Lisp_Object object,
360 struct buffer *buffer, 297 struct buffer *buffer,
361 Charxpos from, int count); 298 Charxpos from, int count);