Mercurial > hg > xemacs-beta
comparison src/syntax.h @ 5544:c2301b2c88c8
Improve documentation of syntax table internals.
author | Stephen J. Turnbull <stephen@xemacs.org> |
---|---|
date | Mon, 08 Aug 2011 13:57:20 +0900 |
parents | dab422055bab |
children | 85210c453a97 |
comparison
equal
deleted
inserted
replaced
5543:fbe90e6f7a43 | 5544:c2301b2c88c8 |
---|---|
24 | 24 |
25 #include "chartab.h" | 25 #include "chartab.h" |
26 | 26 |
27 /* A syntax table is a type of char table. | 27 /* A syntax table is a type of char table. |
28 | 28 |
29 The low 7 bits of the integer is a code, as follows. The 8th bit is | |
30 used as the prefix bit flag (see below). | |
31 | |
32 The values in a syntax table are either integers or conses of | 29 The values in a syntax table are either integers or conses of |
33 integers and chars. The lowest 7 bits of the integer are the syntax | 30 integers and chars. The lowest 7 bits of the integer are the syntax |
34 class. If this is Sinherit, then the actual syntax value needs to | 31 class. If this is Sinherit, then the actual syntax value needs to |
35 be retrieved from the standard syntax table. | 32 be retrieved from the standard syntax table. |
36 | 33 |
37 Since the logic involved in finding the actual integer isn't very | 34 It turns out to be worth optimizing lookups of character syntax in two |
38 complex, you'd think the time required to retrieve it is not a | 35 ways. First, although the logic involved in finding the actual integer |
39 factor. If you thought that, however, you'd be wrong, due to the | 36 isn't complex, the syntax value is accessed in functions such as |
40 high number of times (many per character) that the syntax value is | 37 scan_lists() many times for each character scanned. A "mirror syntax |
41 accessed in functions such as scan_lists(). To speed this up, | 38 table" that contains the actual integers speeds this up. |
42 we maintain a mirror syntax table that contains the actual | 39 |
43 integers. We can do this successfully because syntax tables are | 40 Second, due to the syntax-table text property, the table for looking up |
44 now an abstract type, where we control all access. | 41 syntax may change from character to character. Since looking up properties |
42 is expensive, a "syntax cache" which contains the current syntax table and | |
43 the region where it is valid can speed up linear scans dramatically. | |
44 | |
45 The low 7 bits of the integer is a code, as follows. The 8th bit is | |
46 used as the prefix bit flag (see below). | |
45 */ | 47 */ |
46 | 48 |
47 enum syntaxcode | 49 enum syntaxcode |
48 { | 50 { |
49 Swhitespace, /* whitespace character */ | 51 Swhitespace, /* whitespace character */ |
118 ) | 120 ) |
119 { | 121 { |
120 return SYNTAX (table, c) == Sword; | 122 return SYNTAX (table, c) == Sword; |
121 } | 123 } |
122 | 124 |
123 /* OK, here's a graphic diagram of the format of the syntax values: | 125 /* OK, here's a graphic diagram of the format of the syntax values. |
126 Here, the value has already been extracted from the Lisp integer, | |
127 so there are no tag bits to worry about. | |
124 | 128 |
125 Bit number: | 129 Bit number: |
126 | 130 |
127 [ 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 ] | 131 [ 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 ] |
128 [ 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 ] | 132 [ 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 ] |
129 | 133 |
130 <-----> <-----> <-------------> <-------------> ^ <-----------> | 134 | <-----------> <-------------> <-------------> ^ <-----------> |
131 ELisp unused |comment bits | unused | syntax code | 135 | unused |comment bits | unused | syntax code |
132 tag | | | | | | | | | | 136 v | | | | | | | | | |
133 stuff | | | | | | | | | | 137 unusable | | | | | | | | | |
134 | | | | | | | | | | 138 due to | | | | | | | | | |
135 | | | | | | | | `--> prefix flag | 139 type tag | | | | | | | | `--> prefix flag |
136 | | | | | | | | | 140 in Lisp | | | | | | | | |
137 | | | | | | | `--> comment end style B, second char | 141 integer | | | | | | | `--> comment end style B, second char |
138 | | | | | | `----> comment end style A, second char | 142 | | | | | | `----> comment end style A, second char |
139 | | | | | `------> comment end style B, first char | 143 | | | | | `------> comment end style B, first char |
140 | | | | `--------> comment end style A, first char | 144 | | | | `--------> comment end style A, first char |
141 | | | `----------> comment start style B, second char | 145 | | | `----------> comment start style B, second char |
142 | | `------------> comment start style A, second char | 146 | | `------------> comment start style A, second char |
143 | `--------------> comment start style B, first char | 147 | `--------------> comment start style B, first char |
144 `----------------> comment start style A, first char | 148 `----------------> comment start style A, first char |
145 | 149 |
146 In a 64-bit integer, there would be 32 more unused bits between | 150 In a 64-bit integer, there would be 32 more unused bits between |
147 the tag and the comment bits. | 151 the unusable bit and the comment bits. |
148 | 152 |
149 Clearly, such a scheme will not work for Mule, because the matching | 153 In older versions of XEmacs, bits 8-14 contained the matching |
150 paren could be any character and as such requires 21 bits, which | 154 character for parentheses. Such a scheme will not work for Mule, |
151 we don't got. | 155 because the matching parenthesis could be any character and |
152 | 156 requires 21 bits, which we don't have on a 32-bit platform. |
153 Remember that under Mule we use char tables instead of vectors. | 157 |
154 So what we do is use another char table for the matching paren | 158 What we do is use another char table for the matching parenthesis |
155 and store a pointer to it in the first char table. (This frees | 159 and store a pointer to it in the first char table. (This frees |
156 code from having to worry about passing two tables around.) | 160 code from having to worry about passing two tables around.) |
157 */ | 161 */ |
158 | 162 |
159 | 163 |
160 /* The prefix flag bit for backward-prefix-chars is now put into bit 7. */ | 164 /* The prefix flag bit for backward-prefix-chars is in bit 7. */ |
161 | 165 |
162 #define SYNTAX_PREFIX(table, c) \ | 166 #define SYNTAX_PREFIX(table, c) \ |
163 ((SYNTAX_CODE (table, c) >> 7) & 1) | 167 ((SYNTAX_CODE (table, c) >> 7) & 1) |
164 | 168 |
165 /* Bits 23-16 are used to implement up to two comment styles | 169 /* Bits 23-16 are used to implement up to two comment styles |
166 in a single buffer. They have the following meanings: | 170 in a single buffer. They have the following meanings: |
167 | 171 bit |
168 1. first of a one or two character comment-start sequence of style a. | 172 23 first of a one or two character comment-start sequence of style a. |
169 2. first of a one or two character comment-start sequence of style b. | 173 22 first of a one or two character comment-start sequence of style b. |
170 3. second of a two-character comment-start sequence of style a. | 174 21 second of a two-character comment-start sequence of style a. |
171 4. second of a two-character comment-start sequence of style b. | 175 20 second of a two-character comment-start sequence of style b. |
172 5. first of a one or two character comment-end sequence of style a. | 176 19 first of a one or two character comment-end sequence of style a. |
173 6. first of a one or two character comment-end sequence of style b. | 177 18 first of a one or two character comment-end sequence of style b. |
174 7. second of a two-character comment-end sequence of style a. | 178 17 second of a two-character comment-end sequence of style a. |
175 8. second of a two-character comment-end sequence of style b. | 179 16 second of a two-character comment-end sequence of style b. |
176 */ | 180 */ |
177 | 181 |
178 #define SYNTAX_COMMENT_BITS(table, c) \ | 182 #define SYNTAX_COMMENT_BITS(table, c) \ |
179 ((SYNTAX_CODE (table, c) >> 16) &0xff) | 183 ((SYNTAX_CODE (table, c) >> 16) &0xff) |
180 | 184 |
194 #define SYNTAX_FIRST_CHAR 0xcc | 198 #define SYNTAX_FIRST_CHAR 0xcc |
195 #define SYNTAX_SECOND_CHAR_START 0x30 | 199 #define SYNTAX_SECOND_CHAR_START 0x30 |
196 #define SYNTAX_SECOND_CHAR_END 0x03 | 200 #define SYNTAX_SECOND_CHAR_END 0x03 |
197 #define SYNTAX_SECOND_CHAR 0x33 | 201 #define SYNTAX_SECOND_CHAR 0x33 |
198 | 202 |
199 #if 0 | 203 /* Array of syntax codes, indexed by characters which designate them. |
200 | 204 Designators must be ASCII characters (ie, in the range 0x00-0x7F). |
201 /* #### Entirely unused. Should they be deleted? */ | 205 Bounds checking is the responsibility of calling code. */ |
202 | |
203 /* #### These are now more or less equivalent to | |
204 SYNTAX_COMMENT_MATCH_START ...*/ | |
205 /* a and b must be first and second start chars for a common type */ | |
206 #define SYNTAX_START_P(table, a, b) \ | |
207 (((SYNTAX_COMMENT_BITS (table, a) & SYNTAX_FIRST_CHAR_START) >> 2) \ | |
208 & (SYNTAX_COMMENT_BITS (table, b) & SYNTAX_SECOND_CHAR_START)) | |
209 | |
210 /* ... and SYNTAX_COMMENT_MATCH_END */ | |
211 /* a and b must be first and second end chars for a common type */ | |
212 #define SYNTAX_END_P(table, a, b) \ | |
213 (((SYNTAX_COMMENT_BITS (table, a) & SYNTAX_FIRST_CHAR_END) >> 2) \ | |
214 & (SYNTAX_COMMENT_BITS (table, b) & SYNTAX_SECOND_CHAR_END)) | |
215 | |
216 #define SYNTAX_STYLES_MATCH_START_P(table, a, b, mask) \ | |
217 ((SYNTAX_COMMENT_BITS (table, a) & SYNTAX_FIRST_CHAR_START & (mask)) \ | |
218 && (SYNTAX_COMMENT_BITS (table, b) & SYNTAX_SECOND_CHAR_START & (mask))) | |
219 | |
220 #define SYNTAX_STYLES_MATCH_END_P(table, a, b, mask) \ | |
221 ((SYNTAX_COMMENT_BITS (table, a) & SYNTAX_FIRST_CHAR_END & (mask)) \ | |
222 && (SYNTAX_COMMENT_BITS (table, b) & SYNTAX_SECOND_CHAR_END & (mask))) | |
223 | |
224 #define SYNTAX_STYLES_MATCH_1CHAR_P(table, a, mask) \ | |
225 ((SYNTAX_COMMENT_BITS (table, a) & (mask))) | |
226 | |
227 #define STYLE_FOUND_P(table, a, b, startp, style) \ | |
228 ((SYNTAX_COMMENT_BITS (table, a) & \ | |
229 ((startp) ? SYNTAX_FIRST_CHAR_START : \ | |
230 SYNTAX_FIRST_CHAR_END) & (style)) \ | |
231 && (SYNTAX_COMMENT_BITS (table, b) & \ | |
232 ((startp) ? SYNTAX_SECOND_CHAR_START : \ | |
233 SYNTAX_SECOND_CHAR_END) & (style))) | |
234 | |
235 #define SYNTAX_COMMENT_MASK_START(table, a, b) \ | |
236 ((STYLE_FOUND_P (table, a, b, 1, SYNTAX_COMMENT_STYLE_A) \ | |
237 ? SYNTAX_COMMENT_STYLE_A \ | |
238 : (STYLE_FOUND_P (table, a, b, 1, SYNTAX_COMMENT_STYLE_B) \ | |
239 ? SYNTAX_COMMENT_STYLE_B \ | |
240 : 0))) | |
241 | |
242 #define SYNTAX_COMMENT_MASK_END(table, a, b) \ | |
243 ((STYLE_FOUND_P (table, a, b, 0, SYNTAX_COMMENT_STYLE_A) \ | |
244 ? SYNTAX_COMMENT_STYLE_A \ | |
245 : (STYLE_FOUND_P (table, a, b, 0, SYNTAX_COMMENT_STYLE_B) \ | |
246 ? SYNTAX_COMMENT_STYLE_B \ | |
247 : 0))) | |
248 | |
249 #define STYLE_FOUND_1CHAR_P(table, a, style) \ | |
250 ((SYNTAX_COMMENT_BITS (table, a) & (style))) | |
251 | |
252 #define SYNTAX_COMMENT_1CHAR_MASK(table, a) \ | |
253 ((STYLE_FOUND_1CHAR_P (table, a, SYNTAX_COMMENT_STYLE_A) \ | |
254 ? SYNTAX_COMMENT_STYLE_A \ | |
255 : (STYLE_FOUND_1CHAR_P (table, a, SYNTAX_COMMENT_STYLE_B) \ | |
256 ? SYNTAX_COMMENT_STYLE_B \ | |
257 : 0))) | |
258 | |
259 #endif /* 0 */ | |
260 | |
261 /* This array, indexed by a character, contains the syntax code which | |
262 that character signifies (as a char). | |
263 For example, (enum syntaxcode) syntax_spec_code['w'] is Sword. */ | |
264 | |
265 extern const unsigned char syntax_spec_code[0200]; | 206 extern const unsigned char syntax_spec_code[0200]; |
266 | 207 |
267 /* Indexed by syntax code, give the letter that describes it. */ | 208 /* Array of designators indexed by syntax code. |
268 | 209 Indicies should be of type enum syntaxcode. */ |
269 extern const unsigned char syntax_code_spec[]; | 210 extern const unsigned char syntax_code_spec[]; |
270 | 211 |
271 Lisp_Object scan_lists (struct buffer *buf, Charbpos from, int count, | 212 Lisp_Object scan_lists (struct buffer *buf, Charbpos from, int count, |
272 int depth, int sexpflag, int no_error); | 213 int depth, int sexpflag, int no_error); |
273 int char_quoted (struct buffer *buf, Charbpos pos); | 214 int char_quoted (struct buffer *buf, Charbpos pos); |
274 | 215 |
275 /* NOTE: This does not refer to the mirror table, but to the | 216 /* TABLE is a syntax table, not the mirror table. */ |
276 syntax table itself. */ | |
277 Lisp_Object syntax_match (Lisp_Object table, Ichar ch); | 217 Lisp_Object syntax_match (Lisp_Object table, Ichar ch); |
278 | 218 |
279 extern int no_quit_in_re_search; | 219 extern int no_quit_in_re_search; |
280 | 220 |
281 | 221 |
282 /****************************** syntax caches ********************************/ | 222 /****************************** syntax caches ********************************/ |
283 | 223 |
284 extern int lookup_syntax_properties; | 224 extern int lookup_syntax_properties; |
285 | 225 |
286 /* Now that the `syntax-table' property exists, and can override the syntax | 226 /* The `syntax-table' property overrides the syntax table or directly |
287 table or directly specify the syntax, we cache the last place we | 227 specifies the syntax. Since looking up properties is expensive, we cache |
288 retrieved the syntax-table property. This is because, when moving | 228 the information about the syntax-table property. When moving linearly |
289 linearly through text (e.g. in the regex routines or the scanning | 229 through text (e.g. in the regex routines or the scanning routines in |
290 routines in syntax.c), we only need to recalculate at the next place the | 230 syntax.c), recalculation is needed only when the syntax-table property |
291 syntax-table property changes (i.e. not every position), and when we do | 231 changes (i.e. not every position). |
292 need to recalculate, we can update the info from the previous info | 232 When we do need to recalculate, we can update the info from the previous |
293 faster than if we did the whole calculation from scratch. */ | 233 info faster than if we did the whole calculation from scratch. |
234 #### sjt sez: I'm not sure I believe that last claim. That seems to | |
235 require that we use directional information, etc, but that is ignored in | |
236 the current implementation. */ | |
294 struct syntax_cache | 237 struct syntax_cache |
295 { | 238 { |
296 #ifdef NEW_GC | 239 #ifdef NEW_GC |
297 NORMAL_LISP_OBJECT_HEADER header; | 240 NORMAL_LISP_OBJECT_HEADER header; |
298 #endif /* NEW_GC */ | 241 #endif /* NEW_GC */ |
299 int use_code; /* Whether to use syntax_code or | 242 int use_code; /* Non-zero if a syntax-table property |
300 syntax_table. This is set | 243 specified a syntax code. When zero, the |
301 depending on whether the | 244 syntax_code member is invalid. Otherwise |
302 syntax-table property is a | 245 the syntax_table member is invalid. */ |
303 syntax table or a syntax | 246 int no_syntax_table_prop; /* If non-zero, there was no `syntax-table' |
304 code. */ | 247 property on the current range, and so we're |
305 int no_syntax_table_prop; /* If non-zero, there was no | 248 using the buffer's syntax table. |
306 `syntax-table' property on the | 249 Then we must invalidate the cache if the |
307 current range, and so we're | 250 buffer's syntax table is changed. */ |
308 using the buffer's syntax table. | 251 Lisp_Object object; /* The buffer or string the current syntax |
309 This is important to note because | 252 cache applies to, or Qnil for a string of |
310 sometimes the buffer's syntax | 253 text not coming from a buffer or string. */ |
311 table can be changed. */ | 254 struct buffer *buffer; /* The buffer that supplies the syntax tables, |
312 Lisp_Object object; /* The buffer or string the current | 255 or NULL for the standard syntax table. If |
313 syntax cache applies to, or | 256 OBJECT is a buffer, this will always be |
314 Qnil for a string of text not | 257 the same buffer. */ |
315 coming from a buffer or string. */ | 258 int syntax_code; /* Syntax code of current char. */ |
316 struct buffer *buffer; /* The buffer that supplies the | 259 Lisp_Object syntax_table; /* Syntax table for current pos. */ |
317 syntax tables, or 0 for the | 260 Lisp_Object mirror_table; /* Mirror table for this table. */ |
318 standard syntax table. If | 261 Lisp_Object start, end; /* Markers to keep track of the known region |
319 OBJECT is a buffer, this will | 262 in a buffer. |
320 always be the same buffer. */ | 263 Normally these correspond to prev_change |
321 int syntax_code; /* Syntax code of current char. */ | 264 and next_change, respectively, except when |
322 Lisp_Object syntax_table; /* Syntax table for current pos. */ | 265 insertions and deletions occur. Then |
323 Lisp_Object mirror_table; /* Mirror table for this table. */ | 266 prev_change and next change will be |
324 Lisp_Object start, end; /* Markers to keep track of the | 267 refreshed from these markers. See |
325 known region in a buffer. | 268 signal_syntax_cache_extent_adjust(). |
326 Formerly we used an internal | 269 We'd like to use an extent, but it seems |
327 extent, but it seems that having | 270 that having an extent over the entire |
328 an extent over the entire buffer | 271 buffer causes serious slowdowns in extent |
329 causes serious slowdowns in | 272 operations! Yuck! */ |
330 extent operations! Yuck! */ | 273 Charxpos next_change; /* Position of the next extent change. */ |
331 Charxpos next_change; /* Position of the next extent | 274 Charxpos prev_change; /* Position of the previous extent change. */ |
332 change. */ | |
333 Charxpos prev_change; /* Position of the previous extent | |
334 change. */ | |
335 }; | 275 }; |
336 | 276 |
337 #ifdef NEW_GC | 277 #ifdef NEW_GC |
338 typedef struct syntax_cache Lisp_Syntax_Cache; | 278 typedef struct syntax_cache Lisp_Syntax_Cache; |
339 | 279 |
345 #define SYNTAX_CACHE_P(x) RECORDP (x, syntax_cache) | 285 #define SYNTAX_CACHE_P(x) RECORDP (x, syntax_cache) |
346 #define CHECK_SYNTAX_CACHE(x) CHECK_RECORD (x, syntax_cache) | 286 #define CHECK_SYNTAX_CACHE(x) CHECK_RECORD (x, syntax_cache) |
347 #define CONCHECK_SYNTAX_CACHE(x) CONCHECK_RECORD (x, syntax_cache) | 287 #define CONCHECK_SYNTAX_CACHE(x) CONCHECK_RECORD (x, syntax_cache) |
348 #endif /* NEW_GC */ | 288 #endif /* NEW_GC */ |
349 | 289 |
350 | |
351 | |
352 extern const struct sized_memory_description syntax_cache_description; | 290 extern const struct sized_memory_description syntax_cache_description; |
353 | 291 |
354 /* Note that the external interface to the syntax-cache uses charpos's, but | 292 /* Note that the external interface to the syntax cache uses charpos's, but |
355 internally we use bytepos's, for speed. */ | 293 internally we use bytepos's, for speed. */ |
356 | |
357 void update_syntax_cache (struct syntax_cache *cache, Charxpos pos, int count); | 294 void update_syntax_cache (struct syntax_cache *cache, Charxpos pos, int count); |
358 struct syntax_cache *setup_syntax_cache (struct syntax_cache *cache, | 295 struct syntax_cache *setup_syntax_cache (struct syntax_cache *cache, |
359 Lisp_Object object, | 296 Lisp_Object object, |
360 struct buffer *buffer, | 297 struct buffer *buffer, |
361 Charxpos from, int count); | 298 Charxpos from, int count); |