Mercurial > hg > xemacs-beta
annotate src/search.c @ 4414:df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
2008-01-30 Aidan Kehoe <kehoea@parhasard.net>
* automated/case-tests.el:
Check for a bug Mike Sperber reported; check algorithms used, if
available.
2008-01-30 Aidan Kehoe <kehoea@parhasard.net>
* search.c (debug-xemacs-searches):
New variable, available on debug builds. Used in
tests/automated/case-tests.el.
(search_buffer): Only store the charset_base for characters with
translations. Correct some comments, correct some checks. If
debug_xemacs_searches is non-zero, record which search was used.
(boyer_moore): Remove an assertion that was incorrect. Remove its
documentation. Correct an assertion dealing with equivalence
tables; we may end up looking through the equivalence table if a
non-ASCII non-case character was searched for.
author | Aidan Kehoe <kehoea@parhasard.net> |
---|---|
date | Wed, 30 Jan 2008 09:26:59 +0100 |
parents | 4ee73bbe4f8e |
children | 69b803c646cd |
rev | line source |
---|---|
428 | 1 /* String search routines for XEmacs. |
2 Copyright (C) 1985, 1986, 1987, 1992-1995 Free Software Foundation, Inc. | |
3 Copyright (C) 1995 Sun Microsystems, Inc. | |
793 | 4 Copyright (C) 2001, 2002 Ben Wing. |
428 | 5 |
6 This file is part of XEmacs. | |
7 | |
8 XEmacs is free software; you can redistribute it and/or modify it | |
9 under the terms of the GNU General Public License as published by the | |
10 Free Software Foundation; either version 2, or (at your option) any | |
11 later version. | |
12 | |
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT | |
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
16 for more details. | |
17 | |
18 You should have received a copy of the GNU General Public License | |
19 along with XEmacs; see the file COPYING. If not, write to | |
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
21 Boston, MA 02111-1307, USA. */ | |
22 | |
23 /* Synched up with: FSF 19.29, except for region-cache stuff. */ | |
24 | |
25 /* Hacked on for Mule by Ben Wing, December 1994 and August 1995. */ | |
26 | |
826 | 27 /* This file has been Mule-ized. */ |
428 | 28 |
29 #include <config.h> | |
30 #include "lisp.h" | |
31 | |
32 #include "buffer.h" | |
33 #include "insdel.h" | |
34 #include "opaque.h" | |
35 #ifdef REGION_CACHE_NEEDS_WORK | |
36 #include "region-cache.h" | |
37 #endif | |
38 #include "syntax.h" | |
39 | |
40 #include <sys/types.h> | |
41 #include "regex.h" | |
446 | 42 #include "casetab.h" |
43 #include "chartab.h" | |
44 | |
/* Map the character POS through the translation table TABLE; when TABLE
   is nil, yield POS unchanged.  POS is parenthesized so that an
   expression argument (e.g. `a + b') is cast and returned as a whole. */
#define TRANSLATE(table, pos)						\
  (!NILP (table) ? TRT_TABLE_OF (table, (Ichar) (pos)) : (pos))
428 | 47 |
48 #define REGEXP_CACHE_SIZE 20 | |
49 | |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
50 #ifdef DEBUG_XEMACS |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
51 |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
52 /* Used in tests/automated/case-tests.el if available. */ |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
53 Fixnum debug_xemacs_searches; |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
54 |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
55 Lisp_Object Qsearch_algorithm_used, Qboyer_moore, Qsimple_search; |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
56 |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
57 #endif |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
58 |
428 | 59 /* If the regexp is non-nil, then the buffer contains the compiled form |
60 of that regexp, suitable for searching. */ | |
446 | 61 struct regexp_cache |
62 { | |
428 | 63 struct regexp_cache *next; |
64 Lisp_Object regexp; | |
65 struct re_pattern_buffer buf; | |
66 char fastmap[0400]; | |
67 /* Nonzero means regexp was compiled to do full POSIX backtracking. */ | |
68 char posix; | |
69 }; | |
70 | |
71 /* The instances of that struct. */ | |
72 static struct regexp_cache searchbufs[REGEXP_CACHE_SIZE]; | |
73 | |
74 /* The head of the linked list; points to the most recently used buffer. */ | |
75 static struct regexp_cache *searchbuf_head; | |
76 | |
77 | |
78 /* Every call to re_match, etc., must pass &search_regs as the regs | |
79 argument unless you can show it is unnecessary (i.e., if re_match | |
80 is certainly going to be called again before region-around-match | |
81 can be called). | |
82 | |
83 Since the registers are now dynamically allocated, we need to make | |
84 sure not to refer to the Nth register before checking that it has | |
85 been allocated by checking search_regs.num_regs. | |
86 | |
87 The regex code keeps track of whether it has allocated the search | |
88 buffer using bits in the re_pattern_buffer. This means that whenever | |
89 you compile a new pattern, it completely forgets whether it has | |
90 allocated any registers, and will allocate new registers the next | |
91 time you call a searching or matching function. Therefore, we need | |
92 to call re_set_registers after compiling a new pattern or after | |
93 setting the match registers, so that the regex functions will be | |
94 able to free or re-allocate it properly. */ | |
95 | |
96 /* Note: things get trickier under Mule because the values returned from | |
826 | 97 the regexp routines are in Bytebpos's but we need them to be in Charbpos's. |
428 | 98 We take the easy way out for the moment and just convert them immediately. |
99 We could be more clever by not converting them until necessary, but | |
100 that gets real ugly real fast since the buffer might have changed and | |
101 the positions might be out of sync or out of range. | |
102 */ | |
103 static struct re_registers search_regs; | |
104 | |
1468 | 105 /* Every function that sets the match data _must_ clear unused search |
106 registers on success. An unsuccessful search or match _must_ preserve | |
107 the search registers. The traditional documentation implied that | |
108 any match operation might trash the registers, but in fact failures | |
109 have always preserved the match data (in GNU Emacs as well). Some | |
110 plausible code depends on this behavior (cf. `w3-configuration-data' | |
111 in library "w3-cfg"). | |
112 | |
113 Ordinary string searches use set_search_regs to set the whole-string | |
114 match. That function takes care of clearing the unused subexpression | |
1425 | 115 registers. |
116 */ | |
117 static void set_search_regs (struct buffer *buf, Charbpos beg, Charcount len); | |
1468 | 118 static void clear_search_regs (void); |
1425 | 119 |
428 | 120 /* The buffer in which the last search was performed, or |
121 Qt if the last search was done in a string; | |
122 Qnil if no searching has been done yet. */ | |
123 static Lisp_Object last_thing_searched; | |
124 | |
125 /* error condition signalled when regexp compile_pattern fails */ | |
126 | |
127 Lisp_Object Qinvalid_regexp; | |
128 | |
129 /* Regular expressions used in forward/backward-word */ | |
130 Lisp_Object Vforward_word_regexp, Vbackward_word_regexp; | |
131 | |
507 | 132 Fixnum warn_about_possibly_incompatible_back_references; |
502 | 133 |
428 | 134 /* range table for use with skip_chars. Only needed for Mule. */ |
135 Lisp_Object Vskip_chars_range_table; | |
136 | |
867 | 137 static Charbpos simple_search (struct buffer *buf, Ibyte *base_pat, |
826 | 138 Bytecount len, Bytebpos pos, Bytebpos lim, |
139 EMACS_INT n, Lisp_Object trt); | |
867 | 140 static Charbpos boyer_moore (struct buffer *buf, Ibyte *base_pat, |
826 | 141 Bytecount len, Bytebpos pos, Bytebpos lim, |
142 EMACS_INT n, Lisp_Object trt, | |
143 Lisp_Object inverse_trt, int charset_base); | |
665 | 144 static Charbpos search_buffer (struct buffer *buf, Lisp_Object str, |
826 | 145 Charbpos charbpos, Charbpos buflim, EMACS_INT n, |
146 int RE, Lisp_Object trt, | |
147 Lisp_Object inverse_trt, int posix); | |
771 | 148 |
2268 | 149 static DECLARE_DOESNT_RETURN (matcher_overflow (void)); |
150 | |
151 static DOESNT_RETURN | |
152 matcher_overflow () | |
428 | 153 { |
563 | 154 stack_overflow ("Stack overflow in regexp matcher", Qunbound); |
428 | 155 } |
156 | |
157 /* Compile a regexp and signal a Lisp error if anything goes wrong. | |
158 PATTERN is the pattern to compile. | |
159 CP is the place to put the result. | |
826 | 160 TRANSLATE is a translation table for ignoring case, or Qnil for none. |
428 | 161 REGP is the structure that says where to store the "register" |
162 values that will result from matching this pattern. | |
163 If it is 0, we should compile the pattern not to record any | |
164 subexpression bounds. | |
165 POSIX is nonzero if we want full backtracking (POSIX style) | |
166 for this pattern. 0 means backtrack only enough to get a valid match. */ | |
167 | |
168 static int | |
169 compile_pattern_1 (struct regexp_cache *cp, Lisp_Object pattern, | |
2286 | 170 struct re_registers *UNUSED (regp), Lisp_Object translate, |
826 | 171 int posix, Error_Behavior errb) |
428 | 172 { |
442 | 173 const char *val; |
428 | 174 reg_syntax_t old; |
175 | |
176 cp->regexp = Qnil; | |
177 cp->buf.translate = translate; | |
178 cp->posix = posix; | |
179 old = re_set_syntax (RE_SYNTAX_EMACS | |
180 | (posix ? 0 : RE_NO_POSIX_BACKTRACKING)); | |
442 | 181 val = (const char *) |
428 | 182 re_compile_pattern ((char *) XSTRING_DATA (pattern), |
183 XSTRING_LENGTH (pattern), &cp->buf); | |
184 re_set_syntax (old); | |
185 if (val) | |
186 { | |
563 | 187 maybe_signal_error (Qinvalid_regexp, 0, build_string (val), |
428 | 188 Qsearch, errb); |
189 return 0; | |
190 } | |
191 | |
192 cp->regexp = Fcopy_sequence (pattern); | |
193 return 1; | |
194 } | |
195 | |
196 /* Compile a regexp if necessary, but first check to see if there's one in | |
197 the cache. | |
198 PATTERN is the pattern to compile. | |
826 | 199 TRANSLATE is a translation table for ignoring case, or Qnil for none. |
428 | 200 REGP is the structure that says where to store the "register" |
201 values that will result from matching this pattern. | |
202 If it is 0, we should compile the pattern not to record any | |
203 subexpression bounds. | |
204 POSIX is nonzero if we want full backtracking (POSIX style) | |
205 for this pattern. 0 means backtrack only enough to get a valid match. */ | |
206 | |
207 struct re_pattern_buffer * | |
208 compile_pattern (Lisp_Object pattern, struct re_registers *regp, | |
2286 | 209 Lisp_Object translate, Lisp_Object UNUSED (searchobj), |
210 struct buffer *UNUSED (searchbuf), int posix, | |
211 Error_Behavior errb) | |
428 | 212 { |
213 struct regexp_cache *cp, **cpp; | |
214 | |
215 for (cpp = &searchbuf_head; ; cpp = &cp->next) | |
216 { | |
217 cp = *cpp; | |
826 | 218 /* &&#### once we fix up the fastmap code in regex.c for 8-bit-fixed, |
219 we need to record and compare the buffer and format, since the | |
220 fastmap will reflect the state of the buffer -- and things get | |
221 more complicated if the buffer has changed formats or (esp.) has | |
222 kept the format but changed its interpretation! may need to have | |
223 the code that changes the interpretation go through and invalidate | |
224 cache entries for that buffer. */ | |
428 | 225 if (!NILP (Fstring_equal (cp->regexp, pattern)) |
446 | 226 && EQ (cp->buf.translate, translate) |
428 | 227 && cp->posix == posix) |
228 break; | |
229 | |
230 /* If we're at the end of the cache, compile into the last cell. */ | |
231 if (cp->next == 0) | |
232 { | |
826 | 233 if (!compile_pattern_1 (cp, pattern, regp, translate, |
234 posix, errb)) | |
428 | 235 return 0; |
236 break; | |
237 } | |
238 } | |
239 | |
240 /* When we get here, cp (aka *cpp) contains the compiled pattern, | |
241 either because we found it in the cache or because we just compiled it. | |
242 Move it to the front of the queue to mark it as most recently used. */ | |
243 *cpp = cp->next; | |
244 cp->next = searchbuf_head; | |
245 searchbuf_head = cp; | |
246 | |
247 /* Advise the searching functions about the space we have allocated | |
248 for register data. */ | |
249 if (regp) | |
250 re_set_registers (&cp->buf, regp, regp->num_regs, regp->start, regp->end); | |
251 | |
252 return &cp->buf; | |
253 } | |
254 | |
255 /* Error condition used for failing searches */ | |
256 Lisp_Object Qsearch_failed; | |
257 | |
2268 | 258 static DECLARE_DOESNT_RETURN (signal_failure (Lisp_Object)); |
259 | |
260 static DOESNT_RETURN | |
428 | 261 signal_failure (Lisp_Object arg) |
262 { | |
446 | 263 for (;;) |
264 Fsignal (Qsearch_failed, list1 (arg)); | |
428 | 265 } |
266 | |
826 | 267 /* Convert the search registers from Bytebpos's to Charbpos's. Needs to be |
428 | 268 done after each regexp match that uses the search regs. |
269 | |
270 We could get a potential speedup by not converting the search registers | |
271 until it's really necessary, e.g. when match-data or replace-match is | |
272 called. However, this complexifies the code a lot (e.g. the buffer | |
826 | 273 could have changed and the Bytebpos's stored might be invalid) and is |
428 | 274 probably not a great time-saver. */ |
275 | |
276 static void | |
277 fixup_search_regs_for_buffer (struct buffer *buf) | |
278 { | |
279 int i; | |
280 int num_regs = search_regs.num_regs; | |
281 | |
282 for (i = 0; i < num_regs; i++) | |
283 { | |
284 if (search_regs.start[i] >= 0) | |
826 | 285 search_regs.start[i] = bytebpos_to_charbpos (buf, |
286 search_regs.start[i]); | |
428 | 287 if (search_regs.end[i] >= 0) |
665 | 288 search_regs.end[i] = bytebpos_to_charbpos (buf, search_regs.end[i]); |
428 | 289 } |
290 } | |
291 | |
292 /* Similar but for strings. */ | |
293 static void | |
294 fixup_search_regs_for_string (Lisp_Object string) | |
295 { | |
296 int i; | |
297 int num_regs = search_regs.num_regs; | |
298 | |
299 /* #### bytecount_to_charcount() is not that efficient. This function | |
867 | 300 could be faster if it did its own conversion (using INC_IBYTEPTR() |
428 | 301 and such), because the register ends are likely to be somewhat ordered. |
302 (Even if not, you could sort them.) | |
303 | |
304 Think about this if this function is a time hog, which it's probably | |
305 not. */ | |
306 for (i = 0; i < num_regs; i++) | |
307 { | |
308 if (search_regs.start[i] > 0) | |
309 { | |
310 search_regs.start[i] = | |
793 | 311 string_index_byte_to_char (string, search_regs.start[i]); |
428 | 312 } |
313 if (search_regs.end[i] > 0) | |
314 { | |
315 search_regs.end[i] = | |
793 | 316 string_index_byte_to_char (string, search_regs.end[i]); |
428 | 317 } |
318 } | |
319 } | |
320 | |
321 | |
322 static Lisp_Object | |
323 looking_at_1 (Lisp_Object string, struct buffer *buf, int posix) | |
324 { | |
325 Lisp_Object val; | |
665 | 326 Bytebpos p1, p2; |
428 | 327 Bytecount s1, s2; |
328 REGISTER int i; | |
329 struct re_pattern_buffer *bufp; | |
826 | 330 struct syntax_cache scache_struct; |
331 struct syntax_cache *scache = &scache_struct; | |
332 | |
428 | 333 CHECK_STRING (string); |
334 bufp = compile_pattern (string, &search_regs, | |
335 (!NILP (buf->case_fold_search) | |
446 | 336 ? XCASE_TABLE_DOWNCASE (buf->case_table) : Qnil), |
826 | 337 wrap_buffer (buf), buf, posix, ERROR_ME); |
428 | 338 |
339 QUIT; | |
340 | |
341 /* Get pointers and sizes of the two strings | |
342 that make up the visible portion of the buffer. */ | |
343 | |
826 | 344 p1 = BYTE_BUF_BEGV (buf); |
345 p2 = BYTE_BUF_CEILING_OF (buf, p1); | |
428 | 346 s1 = p2 - p1; |
826 | 347 s2 = BYTE_BUF_ZV (buf) - p2; |
348 | |
349 /* By making the regex object, regex buffer, and syntax cache arguments | |
350 to re_{search,match}{,_2}, we've removed the need to do nasty things | |
351 to deal with regex reentrancy. (See stack trace in signal.c for proof | |
352 that this can happen.) | |
353 | |
354 #### there is still a potential problem with the regex cache -- | |
355 the compiled regex could be overwritten. we'd need 20-fold | |
356 reentrancy, though. Fix this. */ | |
357 | |
358 i = re_match_2 (bufp, (char *) BYTE_BUF_BYTE_ADDRESS (buf, p1), | |
359 s1, (char *) BYTE_BUF_BYTE_ADDRESS (buf, p2), s2, | |
360 BYTE_BUF_PT (buf) - BYTE_BUF_BEGV (buf), &search_regs, | |
361 BYTE_BUF_ZV (buf) - BYTE_BUF_BEGV (buf), wrap_buffer (buf), | |
362 buf, scache); | |
428 | 363 |
364 if (i == -2) | |
365 matcher_overflow (); | |
366 | |
367 val = (0 <= i ? Qt : Qnil); | |
368 if (NILP (val)) | |
826 | 369 return Qnil; |
428 | 370 { |
371 int num_regs = search_regs.num_regs; | |
372 for (i = 0; i < num_regs; i++) | |
373 if (search_regs.start[i] >= 0) | |
374 { | |
826 | 375 search_regs.start[i] += BYTE_BUF_BEGV (buf); |
376 search_regs.end[i] += BYTE_BUF_BEGV (buf); | |
428 | 377 } |
378 } | |
793 | 379 last_thing_searched = wrap_buffer (buf); |
428 | 380 fixup_search_regs_for_buffer (buf); |
826 | 381 return val; |
428 | 382 } |
383 | |
384 DEFUN ("looking-at", Flooking_at, 1, 2, 0, /* | |
385 Return t if text after point matches regular expression REGEXP. | |
1468 | 386 When the match is successful, this function modifies the match data |
387 that `match-beginning', `match-end' and `match-data' access; save the | |
388 match data with `match-data' and restore it with `store-match-data' if | |
389 you want to preserve them. If the match fails, the match data from the | |
390 previous success match is preserved. | |
428 | 391 |
392 Optional argument BUFFER defaults to the current buffer. | |
393 */ | |
394 (regexp, buffer)) | |
395 { | |
396 return looking_at_1 (regexp, decode_buffer (buffer, 0), 0); | |
397 } | |
398 | |
399 DEFUN ("posix-looking-at", Fposix_looking_at, 1, 2, 0, /* | |
400 Return t if text after point matches regular expression REGEXP. | |
401 Find the longest match, in accord with Posix regular expression rules. | |
1468 | 402 When the match is successful, this function modifies the match data |
403 that `match-beginning', `match-end' and `match-data' access; save the | |
404 match data with `match-data' and restore it with `store-match-data' if | |
405 you want to preserve them. If the match fails, the match data from the | |
406 previous success match is preserved. | |
428 | 407 |
408 Optional argument BUFFER defaults to the current buffer. | |
409 */ | |
410 (regexp, buffer)) | |
411 { | |
826 | 412 return looking_at_1 (regexp, decode_buffer (buffer, 0), 1); |
428 | 413 } |
414 | |
415 static Lisp_Object | |
416 string_match_1 (Lisp_Object regexp, Lisp_Object string, Lisp_Object start, | |
2286 | 417 struct buffer *buf, int UNUSED (posix)) |
428 | 418 { |
419 Bytecount val; | |
420 Charcount s; | |
421 struct re_pattern_buffer *bufp; | |
422 | |
853 | 423 /* Some FSF junk with running_asynch_code, to preserve the match |
424 data. Not necessary because we don't call process filters | |
425 asynchronously (i.e. from within QUIT). */ | |
428 | 426 |
427 CHECK_STRING (regexp); | |
428 CHECK_STRING (string); | |
429 | |
430 if (NILP (start)) | |
431 s = 0; | |
432 else | |
433 { | |
826 | 434 Charcount len = string_char_length (string); |
428 | 435 |
436 CHECK_INT (start); | |
437 s = XINT (start); | |
438 if (s < 0 && -s <= len) | |
439 s = len + s; | |
440 else if (0 > s || s > len) | |
441 args_out_of_range (string, start); | |
442 } | |
443 | |
444 | |
445 bufp = compile_pattern (regexp, &search_regs, | |
446 (!NILP (buf->case_fold_search) | |
446 | 447 ? XCASE_TABLE_DOWNCASE (buf->case_table) : Qnil), |
826 | 448 string, buf, 0, ERROR_ME); |
428 | 449 QUIT; |
450 { | |
793 | 451 Bytecount bis = string_index_char_to_byte (string, s); |
826 | 452 struct syntax_cache scache_struct; |
453 struct syntax_cache *scache = &scache_struct; | |
454 | |
455 /* By making the regex object, regex buffer, and syntax cache arguments | |
456 to re_{search,match}{,_2}, we've removed the need to do nasty things | |
457 to deal with regex reentrancy. (See stack trace in signal.c for proof | |
458 that this can happen.) | |
459 | |
460 #### there is still a potential problem with the regex cache -- | |
461 the compiled regex could be overwritten. we'd need 20-fold | |
462 reentrancy, though. Fix this. */ | |
463 | |
428 | 464 val = re_search (bufp, (char *) XSTRING_DATA (string), |
465 XSTRING_LENGTH (string), bis, | |
466 XSTRING_LENGTH (string) - bis, | |
826 | 467 &search_regs, string, buf, scache); |
428 | 468 } |
469 if (val == -2) | |
470 matcher_overflow (); | |
826 | 471 if (val < 0) return Qnil; |
428 | 472 last_thing_searched = Qt; |
473 fixup_search_regs_for_string (string); | |
826 | 474 return make_int (string_index_byte_to_char (string, val)); |
428 | 475 } |
476 | |
477 DEFUN ("string-match", Fstring_match, 2, 4, 0, /* | |
478 Return index of start of first match for REGEXP in STRING, or nil. | |
479 If third arg START is non-nil, start search at that index in STRING. | |
480 For index of first char beyond the match, do (match-end 0). | |
481 `match-end' and `match-beginning' also give indices of substrings | |
482 matched by parenthesis constructs in the pattern. | |
483 | |
826 | 484 Optional arg BUFFER controls how case folding and syntax and category |
485 lookup is done (according to the value of `case-fold-search' in that buffer | |
486 and that buffer's case tables, syntax tables, and category table). If nil | |
487 or unspecified, it defaults *NOT* to the current buffer but instead: | |
488 | |
489 -- the value of `case-fold-search' in the current buffer is still respected | |
490 because of idioms like | |
491 | |
492 (let ((case-fold-search nil)) | |
493 (string-match "^foo.*bar" string)) | |
494 | |
495 but the case, syntax, and category tables come from the standard tables, | |
1468 | 496 which are accessed through functions `default-{case,syntax,category}-table' |
497 and serve as the parents of the tables in particular buffer. | |
498 | |
499 When the match is successful, this function modifies the match data | |
500 that `match-beginning', `match-end' and `match-data' access; save the | |
501 match data with `match-data' and restore it with `store-match-data' if | |
502 you want to preserve them. If the match fails, the match data from the | |
503 previous success match is preserved. | |
428 | 504 */ |
505 (regexp, string, start, buffer)) | |
506 { | |
826 | 507 /* &&#### implement new interp for buffer arg; check code to see if it |
508 makes more sense than prev */ | |
428 | 509 return string_match_1 (regexp, string, start, decode_buffer (buffer, 0), 0); |
510 } | |
511 | |
512 DEFUN ("posix-string-match", Fposix_string_match, 2, 4, 0, /* | |
513 Return index of start of first match for REGEXP in STRING, or nil. | |
514 Find the longest match, in accord with Posix regular expression rules. | |
515 If third arg START is non-nil, start search at that index in STRING. | |
516 For index of first char beyond the match, do (match-end 0). | |
517 `match-end' and `match-beginning' also give indices of substrings | |
518 matched by parenthesis constructs in the pattern. | |
519 | |
520 Optional arg BUFFER controls how case folding is done (according to | |
521 the value of `case-fold-search' in that buffer and that buffer's case | |
522 tables) and defaults to the current buffer. | |
1468 | 523 |
524 When the match is successful, this function modifies the match data | |
525 that `match-beginning', `match-end' and `match-data' access; save the | |
526 match data with `match-data' and restore it with `store-match-data' if | |
527 you want to preserve them. If the match fails, the match data from the | |
528 previous success match is preserved. | |
428 | 529 */ |
530 (regexp, string, start, buffer)) | |
531 { | |
532 return string_match_1 (regexp, string, start, decode_buffer (buffer, 0), 1); | |
533 } | |
534 | |
535 /* Match REGEXP against STRING, searching all of STRING, | |
536 and return the index of the match, or negative on failure. | |
537 This does not clobber the match data. */ | |
538 | |
539 Bytecount | |
1347 | 540 fast_string_match (Lisp_Object regexp, const Ibyte *nonreloc, |
428 | 541 Lisp_Object reloc, Bytecount offset, |
542 Bytecount length, int case_fold_search, | |
578 | 543 Error_Behavior errb, int no_quit) |
428 | 544 { |
545 Bytecount val; | |
867 | 546 Ibyte *newnonreloc = (Ibyte *) nonreloc; |
428 | 547 struct re_pattern_buffer *bufp; |
826 | 548 struct syntax_cache scache_struct; |
549 struct syntax_cache *scache = &scache_struct; | |
428 | 550 |
551 bufp = compile_pattern (regexp, 0, | |
552 (case_fold_search | |
771 | 553 ? XCASE_TABLE_DOWNCASE (Vstandard_case_table) |
446 | 554 : Qnil), |
826 | 555 reloc, 0, 0, errb); |
428 | 556 if (!bufp) |
557 return -1; /* will only do this when errb != ERROR_ME */ | |
558 if (!no_quit) | |
559 QUIT; | |
560 else | |
561 no_quit_in_re_search = 1; | |
562 | |
563 fixup_internal_substring (nonreloc, reloc, offset, &length); | |
564 | |
771 | 565 /* Don't need to protect against GC inside of re_search() due to QUIT; |
566 QUIT is GC-inhibited. */ | |
428 | 567 if (!NILP (reloc)) |
771 | 568 newnonreloc = XSTRING_DATA (reloc); |
569 | |
826 | 570 /* By making the regex object, regex buffer, and syntax cache arguments |
571 to re_{search,match}{,_2}, we've removed the need to do nasty things | |
572 to deal with regex reentrancy. (See stack trace in signal.c for proof | |
573 that this can happen.) | |
574 | |
575 #### there is still a potential problem with the regex cache -- | |
576 the compiled regex could be overwritten. we'd need 20-fold | |
577 reentrancy, though. Fix this. */ | |
578 | |
428 | 579 val = re_search (bufp, (char *) newnonreloc + offset, length, 0, |
826 | 580 length, 0, reloc, 0, scache); |
428 | 581 |
582 no_quit_in_re_search = 0; | |
583 return val; | |
584 } | |
585 | |
586 Bytecount | |
587 fast_lisp_string_match (Lisp_Object regex, Lisp_Object string) | |
588 { | |
589 return fast_string_match (regex, 0, string, 0, -1, 0, ERROR_ME, 0); | |
590 } | |
591 | |
592 | |
#ifdef REGION_CACHE_NEEDS_WORK
/* The newline cache: remembering which sections of text have no newlines. */

/* Bring BUF's newline cache in line with its `cache_long_line_scans'
   setting: create the cache if caching is requested and none exists,
   free it if caching is not requested and one exists.
   This is our cheezy way of associating an action with the change of
   state of a buffer-local variable. */
static void
newline_cache_on_off (struct buffer *buf)
{
  int want_cache = !NILP (buf->cache_long_line_scans);

  if (want_cache)
    {
      /* It should be on. */
      if (!buf->newline_cache)
        buf->newline_cache = new_region_cache ();
      return;
    }

  /* It should be off. */
  if (buf->newline_cache)
    {
      free_region_cache (buf->newline_cache);
      buf->newline_cache = 0;
    }
}
#endif
620 | |
621 /* Search in BUF for COUNT instances of the character TARGET between | |
622 START and END. | |
623 | |
624 If COUNT is positive, search forwards; END must be >= START. | |
625 If COUNT is negative, search backwards for the -COUNTth instance; | |
626 END must be <= START. | |
627 If COUNT is zero, do anything you please; run rogue, for all I care. | |
628 | |
629 If END is zero, use BEGV or ZV instead, as appropriate for the | |
630 direction indicated by COUNT. | |
631 | |
632 If we find COUNT instances, set *SHORTAGE to zero, and return the | |
633 position after the COUNTth match. Note that for reverse motion | |
634 this is not the same as the usual convention for Emacs motion commands. | |
635 | |
636 If we don't find COUNT instances before reaching END, set *SHORTAGE | |
637 to the number of TARGETs left unfound, and return END. | |
638 | |
639 If ALLOW_QUIT is non-zero, call QUIT periodically. */ | |
640 | |
665 | 641 static Bytebpos |
867 | 642 byte_scan_buffer (struct buffer *buf, Ichar target, Bytebpos st, Bytebpos en, |
872 | 643 EMACS_INT count, EMACS_INT *shortage, int allow_quit) |
428 | 644 { |
665 | 645 Bytebpos lim = en > 0 ? en : |
826 | 646 ((count > 0) ? BYTE_BUF_ZV (buf) : BYTE_BUF_BEGV (buf)); |
428 | 647 |
648 /* #### newline cache stuff in this function not yet ported */ | |
649 assert (count != 0); | |
650 | |
651 if (shortage) | |
652 *shortage = 0; | |
653 | |
654 if (count > 0) | |
655 { | |
656 #ifdef MULE | |
826 | 657 Internal_Format fmt = buf->text->format; |
658 /* Check for char that's unrepresentable in the buffer -- it | |
659 certainly can't be there. */ | |
867 | 660 if (!ichar_fits_in_format (target, fmt, wrap_buffer (buf))) |
428 | 661 { |
826 | 662 *shortage = count; |
663 return lim; | |
664 } | |
665 /* Due to the Mule representation of characters in a buffer, we can | |
666 simply search for characters in the range 0 - 127 directly; for | |
667 8-bit-fixed, we can do this for all characters. In other cases, | |
668 we do it the "hard" way. Note that this way works for all | |
669 characters and all formats, but the other way is faster. */ | |
670 else if (! (fmt == FORMAT_8_BIT_FIXED || | |
867 | 671 (fmt == FORMAT_DEFAULT && ichar_ascii_p (target)))) |
826 | 672 { |
867 | 673 Raw_Ichar raw = ichar_to_raw (target, fmt, wrap_buffer (buf)); |
428 | 674 while (st < lim && count > 0) |
675 { | |
826 | 676 if (BYTE_BUF_FETCH_CHAR_RAW (buf, st) == raw) |
428 | 677 count--; |
665 | 678 INC_BYTEBPOS (buf, st); |
428 | 679 } |
680 } | |
681 else | |
682 #endif | |
683 { | |
867 | 684 Raw_Ichar raw = ichar_to_raw (target, fmt, wrap_buffer (buf)); |
428 | 685 while (st < lim && count > 0) |
686 { | |
665 | 687 Bytebpos ceil; |
867 | 688 Ibyte *bufptr; |
428 | 689 |
826 | 690 ceil = BYTE_BUF_CEILING_OF (buf, st); |
428 | 691 ceil = min (lim, ceil); |
867 | 692 bufptr = (Ibyte *) memchr (BYTE_BUF_BYTE_ADDRESS (buf, st), |
826 | 693 raw, ceil - st); |
428 | 694 if (bufptr) |
695 { | |
696 count--; | |
826 | 697 st = BYTE_BUF_PTR_BYTE_POS (buf, bufptr) + 1; |
428 | 698 } |
699 else | |
700 st = ceil; | |
701 } | |
702 } | |
703 | |
704 if (shortage) | |
705 *shortage = count; | |
706 if (allow_quit) | |
707 QUIT; | |
708 return st; | |
709 } | |
710 else | |
711 { | |
712 #ifdef MULE | |
826 | 713 Internal_Format fmt = buf->text->format; |
714 /* Check for char that's unrepresentable in the buffer -- it | |
715 certainly can't be there. */ | |
867 | 716 if (!ichar_fits_in_format (target, fmt, wrap_buffer (buf))) |
428 | 717 { |
826 | 718 *shortage = -count; |
719 return lim; | |
720 } | |
721 else if (! (fmt == FORMAT_8_BIT_FIXED || | |
867 | 722 (fmt == FORMAT_DEFAULT && ichar_ascii_p (target)))) |
826 | 723 { |
867 | 724 Raw_Ichar raw = ichar_to_raw (target, fmt, wrap_buffer (buf)); |
428 | 725 while (st > lim && count < 0) |
726 { | |
665 | 727 DEC_BYTEBPOS (buf, st); |
826 | 728 if (BYTE_BUF_FETCH_CHAR_RAW (buf, st) == raw) |
428 | 729 count++; |
730 } | |
731 } | |
732 else | |
733 #endif | |
734 { | |
867 | 735 Raw_Ichar raw = ichar_to_raw (target, fmt, wrap_buffer (buf)); |
428 | 736 while (st > lim && count < 0) |
737 { | |
665 | 738 Bytebpos floor; |
867 | 739 Ibyte *bufptr; |
740 Ibyte *floorptr; | |
428 | 741 |
826 | 742 floor = BYTE_BUF_FLOOR_OF (buf, st); |
428 | 743 floor = max (lim, floor); |
744 /* No memrchr() ... */ | |
826 | 745 bufptr = BYTE_BUF_BYTE_ADDRESS_BEFORE (buf, st); |
746 floorptr = BYTE_BUF_BYTE_ADDRESS (buf, floor); | |
428 | 747 while (bufptr >= floorptr) |
748 { | |
749 st--; | |
750 /* At this point, both ST and BUFPTR refer to the same | |
751 character. When the loop terminates, ST will | |
752 always point to the last character we tried. */ | |
867 | 753 if (*bufptr == (Ibyte) raw) |
428 | 754 { |
755 count++; | |
756 break; | |
757 } | |
758 bufptr--; | |
759 } | |
760 } | |
761 } | |
762 | |
763 if (shortage) | |
764 *shortage = -count; | |
765 if (allow_quit) | |
766 QUIT; | |
767 if (count) | |
768 return st; | |
769 else | |
770 { | |
771 /* We found the character we were looking for; we have to return | |
772 the position *after* it due to the strange way that the return | |
773 value is defined. */ | |
665 | 774 INC_BYTEBPOS (buf, st); |
428 | 775 return st; |
776 } | |
777 } | |
778 } | |
779 | |
665 | 780 Charbpos |
867 | 781 scan_buffer (struct buffer *buf, Ichar target, Charbpos start, Charbpos end, |
428 | 782 EMACS_INT count, EMACS_INT *shortage, int allow_quit) |
783 { | |
826 | 784 Bytebpos byte_retval; |
785 Bytebpos byte_start, byte_end; | |
786 | |
787 byte_start = charbpos_to_bytebpos (buf, start); | |
428 | 788 if (end) |
826 | 789 byte_end = charbpos_to_bytebpos (buf, end); |
428 | 790 else |
826 | 791 byte_end = 0; |
792 byte_retval = byte_scan_buffer (buf, target, byte_start, byte_end, count, | |
428 | 793 shortage, allow_quit); |
826 | 794 return bytebpos_to_charbpos (buf, byte_retval); |
428 | 795 } |
796 | |
665 | 797 Bytebpos |
826 | 798 byte_find_next_newline_no_quit (struct buffer *buf, Bytebpos from, int count) |
428 | 799 { |
826 | 800 return byte_scan_buffer (buf, '\n', from, 0, count, 0, 0); |
428 | 801 } |
802 | |
665 | 803 Charbpos |
804 find_next_newline_no_quit (struct buffer *buf, Charbpos from, int count) | |
428 | 805 { |
806 return scan_buffer (buf, '\n', from, 0, count, 0, 0); | |
807 } | |
808 | |
665 | 809 Charbpos |
810 find_next_newline (struct buffer *buf, Charbpos from, int count) | |
428 | 811 { |
812 return scan_buffer (buf, '\n', from, 0, count, 0, 1); | |
813 } | |
814 | |
826 | 815 Bytecount |
867 | 816 byte_find_next_ichar_in_string (Lisp_Object str, Ichar target, Bytecount st, |
428 | 817 EMACS_INT count) |
818 { | |
793 | 819 Bytebpos lim = XSTRING_LENGTH (str) -1; |
867 | 820 Ibyte *s = XSTRING_DATA (str); |
428 | 821 |
822 assert (count >= 0); | |
823 | |
824 #ifdef MULE | |
825 /* Due to the Mule representation of characters in a buffer, | |
826 we can simply search for characters in the range 0 - 127 | |
827 directly. For other characters, we do it the "hard" way. | |
828 Note that this way works for all characters but the other | |
829 way is faster. */ | |
830 if (target >= 0200) | |
831 { | |
832 while (st < lim && count > 0) | |
833 { | |
867 | 834 if (string_ichar (str, st) == target) |
428 | 835 count--; |
826 | 836 INC_BYTECOUNT (s, st); |
428 | 837 } |
838 } | |
839 else | |
840 #endif | |
841 { | |
842 while (st < lim && count > 0) | |
843 { | |
867 | 844 Ibyte *bufptr = (Ibyte *) memchr (itext_n_addr (s, st), |
428 | 845 (int) target, lim - st); |
846 if (bufptr) | |
847 { | |
848 count--; | |
826 | 849 st = (Bytebpos) (bufptr - s) + 1; |
428 | 850 } |
851 else | |
852 st = lim; | |
853 } | |
854 } | |
855 return st; | |
856 } | |
857 | |
858 /* Like find_next_newline, but returns position before the newline, | |
859 not after, and only search up to TO. This isn't just | |
860 find_next_newline (...)-1, because you might hit TO. */ | |
665 | 861 Charbpos |
826 | 862 find_before_next_newline (struct buffer *buf, Charbpos from, Charbpos to, |
863 int count) | |
428 | 864 { |
865 EMACS_INT shortage; | |
665 | 866 Charbpos pos = scan_buffer (buf, '\n', from, to, count, &shortage, 1); |
428 | 867 |
868 if (shortage == 0) | |
869 pos--; | |
870 | |
871 return pos; | |
872 } | |
873 | |
872 | 874 /* This function synched with FSF 21.1 */ |
428 | 875 static Lisp_Object |
876 skip_chars (struct buffer *buf, int forwardp, int syntaxp, | |
877 Lisp_Object string, Lisp_Object lim) | |
878 { | |
867 | 879 REGISTER Ibyte *p, *pend; |
880 REGISTER Ichar c; | |
428 | 881 /* We store the first 256 chars in an array here and the rest in |
882 a range table. */ | |
883 unsigned char fastmap[0400]; | |
884 int negate = 0; | |
885 REGISTER int i; | |
665 | 886 Charbpos limit; |
826 | 887 struct syntax_cache *scache; |
888 | |
428 | 889 if (NILP (lim)) |
890 limit = forwardp ? BUF_ZV (buf) : BUF_BEGV (buf); | |
891 else | |
892 { | |
893 CHECK_INT_COERCE_MARKER (lim); | |
894 limit = XINT (lim); | |
895 | |
896 /* In any case, don't allow scan outside bounds of buffer. */ | |
897 if (limit > BUF_ZV (buf)) limit = BUF_ZV (buf); | |
898 if (limit < BUF_BEGV (buf)) limit = BUF_BEGV (buf); | |
899 } | |
900 | |
901 CHECK_STRING (string); | |
902 p = XSTRING_DATA (string); | |
903 pend = p + XSTRING_LENGTH (string); | |
904 memset (fastmap, 0, sizeof (fastmap)); | |
905 | |
906 Fclear_range_table (Vskip_chars_range_table); | |
907 | |
908 if (p != pend && *p == '^') | |
909 { | |
910 negate = 1; | |
911 p++; | |
912 } | |
913 | |
914 /* Find the characters specified and set their elements of fastmap. | |
915 If syntaxp, each character counts as itself. | |
916 Otherwise, handle backslashes and ranges specially */ | |
917 | |
918 while (p != pend) | |
919 { | |
867 | 920 c = itext_ichar (p); |
921 INC_IBYTEPTR (p); | |
428 | 922 if (syntaxp) |
923 { | |
924 if (c < 0400 && syntax_spec_code[c] < (unsigned char) Smax) | |
925 fastmap[c] = 1; | |
926 else | |
831 | 927 invalid_argument ("Invalid syntax designator", make_char (c)); |
428 | 928 } |
929 else | |
930 { | |
931 if (c == '\\') | |
932 { | |
933 if (p == pend) break; | |
867 | 934 c = itext_ichar (p); |
935 INC_IBYTEPTR (p); | |
428 | 936 } |
937 if (p != pend && *p == '-') | |
938 { | |
867 | 939 Ichar cend; |
428 | 940 |
872 | 941 /* Skip over the dash. */ |
428 | 942 p++; |
943 if (p == pend) break; | |
867 | 944 cend = itext_ichar (p); |
428 | 945 while (c <= cend && c < 0400) |
946 { | |
947 fastmap[c] = 1; | |
948 c++; | |
949 } | |
950 if (c <= cend) | |
951 Fput_range_table (make_int (c), make_int (cend), Qt, | |
952 Vskip_chars_range_table); | |
867 | 953 INC_IBYTEPTR (p); |
428 | 954 } |
955 else | |
956 { | |
957 if (c < 0400) | |
958 fastmap[c] = 1; | |
959 else | |
960 Fput_range_table (make_int (c), make_int (c), Qt, | |
961 Vskip_chars_range_table); | |
962 } | |
963 } | |
964 } | |
965 | |
872 | 966 /* #### Not in FSF 21.1 */ |
428 | 967 if (syntaxp && fastmap['-'] != 0) |
968 fastmap[' '] = 1; | |
969 | |
970 /* If ^ was the first character, complement the fastmap. | |
971 We don't complement the range table, however; we just use negate | |
972 in the comparisons below. */ | |
973 | |
974 if (negate) | |
647 | 975 for (i = 0; i < (int) (sizeof (fastmap)); i++) |
428 | 976 fastmap[i] ^= 1; |
977 | |
978 { | |
665 | 979 Charbpos start_point = BUF_PT (buf); |
872 | 980 Charbpos pos = start_point; |
981 Charbpos pos_byte = BYTE_BUF_PT (buf); | |
428 | 982 |
983 if (syntaxp) | |
984 { | |
872 | 985 scache = setup_buffer_syntax_cache (buf, pos, forwardp ? 1 : -1); |
428 | 986 /* All syntax designators are normal chars so nothing strange |
987 to worry about */ | |
988 if (forwardp) | |
989 { | |
872 | 990 if (pos < limit) |
991 while (fastmap[(unsigned char) | |
992 syntax_code_spec | |
993 [(int) SYNTAX_FROM_CACHE | |
994 (scache, BYTE_BUF_FETCH_CHAR (buf, pos_byte))]]) | |
995 { | |
996 pos++; | |
997 INC_BYTEBPOS (buf, pos_byte); | |
879 | 998 if (pos >= limit) |
872 | 999 break; |
1000 UPDATE_SYNTAX_CACHE_FORWARD (scache, pos); | |
1001 } | |
428 | 1002 } |
1003 else | |
1004 { | |
872 | 1005 while (pos > limit) |
460 | 1006 { |
872 | 1007 Charbpos savepos = pos_byte; |
1008 pos--; | |
1009 DEC_BYTEBPOS (buf, pos_byte); | |
1010 UPDATE_SYNTAX_CACHE_BACKWARD (scache, pos); | |
1011 if (!fastmap[(unsigned char) | |
1012 syntax_code_spec | |
1013 [(int) SYNTAX_FROM_CACHE | |
1014 (scache, BYTE_BUF_FETCH_CHAR (buf, pos_byte))]]) | |
1015 { | |
1016 pos++; | |
1017 pos_byte = savepos; | |
1018 break; | |
1019 } | |
460 | 1020 } |
428 | 1021 } |
1022 } | |
1023 else | |
1024 { | |
1025 if (forwardp) | |
1026 { | |
872 | 1027 while (pos < limit) |
428 | 1028 { |
872 | 1029 Ichar ch = BYTE_BUF_FETCH_CHAR (buf, pos_byte); |
428 | 1030 if ((ch < 0400) ? fastmap[ch] : |
1031 (NILP (Fget_range_table (make_int (ch), | |
1032 Vskip_chars_range_table, | |
1033 Qnil)) | |
1034 == negate)) | |
872 | 1035 { |
1036 pos++; | |
1037 INC_BYTEBPOS (buf, pos_byte); | |
1038 } | |
428 | 1039 else |
1040 break; | |
1041 } | |
1042 } | |
1043 else | |
1044 { | |
872 | 1045 while (pos > limit) |
428 | 1046 { |
872 | 1047 Charbpos prev_pos_byte = pos_byte; |
1048 Ichar ch; | |
1049 | |
1050 DEC_BYTEBPOS (buf, prev_pos_byte); | |
1051 ch = BYTE_BUF_FETCH_CHAR (buf, prev_pos_byte); | |
428 | 1052 if ((ch < 0400) ? fastmap[ch] : |
1053 (NILP (Fget_range_table (make_int (ch), | |
1054 Vskip_chars_range_table, | |
1055 Qnil)) | |
1056 == negate)) | |
872 | 1057 { |
1058 pos--; | |
1059 pos_byte = prev_pos_byte; | |
1060 } | |
428 | 1061 else |
1062 break; | |
1063 } | |
1064 } | |
1065 } | |
1066 QUIT; | |
872 | 1067 BOTH_BUF_SET_PT (buf, pos, pos_byte); |
428 | 1068 return make_int (BUF_PT (buf) - start_point); |
1069 } | |
1070 } | |
1071 | |
1072 DEFUN ("skip-chars-forward", Fskip_chars_forward, 1, 3, 0, /* | |
444 | 1073 Move point forward, stopping before a char not in STRING, or at pos LIMIT. |
428 | 1074 STRING is like the inside of a `[...]' in a regular expression |
1075 except that `]' is never special and `\\' quotes `^', `-' or `\\'. | |
1076 Thus, with arg "a-zA-Z", this skips letters stopping before first nonletter. | |
1077 With arg "^a-zA-Z", skips nonletters stopping before first letter. | |
1078 Returns the distance traveled, either zero or positive. | |
1079 | |
1080 Optional argument BUFFER defaults to the current buffer. | |
1081 */ | |
444 | 1082 (string, limit, buffer)) |
428 | 1083 { |
444 | 1084 return skip_chars (decode_buffer (buffer, 0), 1, 0, string, limit); |
428 | 1085 } |
1086 | |
1087 DEFUN ("skip-chars-backward", Fskip_chars_backward, 1, 3, 0, /* | |
444 | 1088 Move point backward, stopping after a char not in STRING, or at pos LIMIT. |
428 | 1089 See `skip-chars-forward' for details. |
1090 Returns the distance traveled, either zero or negative. | |
1091 | |
1092 Optional argument BUFFER defaults to the current buffer. | |
1093 */ | |
444 | 1094 (string, limit, buffer)) |
428 | 1095 { |
444 | 1096 return skip_chars (decode_buffer (buffer, 0), 0, 0, string, limit); |
428 | 1097 } |
1098 | |
1099 | |
1100 DEFUN ("skip-syntax-forward", Fskip_syntax_forward, 1, 3, 0, /* | |
1101 Move point forward across chars in specified syntax classes. | |
1102 SYNTAX is a string of syntax code characters. | |
444 | 1103 Stop before a char whose syntax is not in SYNTAX, or at position LIMIT. |
428 | 1104 If SYNTAX starts with ^, skip characters whose syntax is NOT in SYNTAX. |
1105 This function returns the distance traveled, either zero or positive. | |
1106 | |
1107 Optional argument BUFFER defaults to the current buffer. | |
1108 */ | |
444 | 1109 (syntax, limit, buffer)) |
428 | 1110 { |
444 | 1111 return skip_chars (decode_buffer (buffer, 0), 1, 1, syntax, limit); |
428 | 1112 } |
1113 | |
1114 DEFUN ("skip-syntax-backward", Fskip_syntax_backward, 1, 3, 0, /* | |
1115 Move point backward across chars in specified syntax classes. | |
1116 SYNTAX is a string of syntax code characters. | |
444 | 1117 Stop on reaching a char whose syntax is not in SYNTAX, or at position LIMIT. |
428 | 1118 If SYNTAX starts with ^, skip characters whose syntax is NOT in SYNTAX. |
1119 This function returns the distance traveled, either zero or negative. | |
1120 | |
1121 Optional argument BUFFER defaults to the current buffer. | |
1122 */ | |
444 | 1123 (syntax, limit, buffer)) |
428 | 1124 { |
444 | 1125 return skip_chars (decode_buffer (buffer, 0), 0, 1, syntax, limit); |
428 | 1126 } |
1127 | |
1128 | |
1129 /* Subroutines of Lisp buffer search functions. */ | |
1130 | |
1131 static Lisp_Object | |
444 | 1132 search_command (Lisp_Object string, Lisp_Object limit, Lisp_Object noerror, |
428 | 1133 Lisp_Object count, Lisp_Object buffer, int direction, |
1134 int RE, int posix) | |
1135 { | |
665 | 1136 REGISTER Charbpos np; |
1137 Charbpos lim; | |
428 | 1138 EMACS_INT n = direction; |
1139 struct buffer *buf; | |
1140 | |
1141 if (!NILP (count)) | |
1142 { | |
1143 CHECK_INT (count); | |
1144 n *= XINT (count); | |
1145 } | |
1146 | |
1147 buf = decode_buffer (buffer, 0); | |
1148 CHECK_STRING (string); | |
444 | 1149 if (NILP (limit)) |
428 | 1150 lim = n > 0 ? BUF_ZV (buf) : BUF_BEGV (buf); |
1151 else | |
1152 { | |
444 | 1153 CHECK_INT_COERCE_MARKER (limit); |
1154 lim = XINT (limit); | |
428 | 1155 if (n > 0 ? lim < BUF_PT (buf) : lim > BUF_PT (buf)) |
563 | 1156 invalid_argument ("Invalid search limit (wrong side of point)", |
1157 Qunbound); | |
428 | 1158 if (lim > BUF_ZV (buf)) |
1159 lim = BUF_ZV (buf); | |
1160 if (lim < BUF_BEGV (buf)) | |
1161 lim = BUF_BEGV (buf); | |
1162 } | |
1163 | |
1164 np = search_buffer (buf, string, BUF_PT (buf), lim, n, RE, | |
1165 (!NILP (buf->case_fold_search) | |
446 | 1166 ? XCASE_TABLE_CANON (buf->case_table) |
1167 : Qnil), | |
428 | 1168 (!NILP (buf->case_fold_search) |
446 | 1169 ? XCASE_TABLE_EQV (buf->case_table) |
1170 : Qnil), posix); | |
428 | 1171 |
1172 if (np <= 0) | |
1173 { | |
444 | 1174 if (NILP (noerror)) |
2268 | 1175 { |
1176 signal_failure (string); | |
1177 RETURN_NOT_REACHED (Qnil); | |
1178 } | |
444 | 1179 if (!EQ (noerror, Qt)) |
428 | 1180 { |
1181 if (lim < BUF_BEGV (buf) || lim > BUF_ZV (buf)) | |
2500 | 1182 ABORT (); |
428 | 1183 BUF_SET_PT (buf, lim); |
1184 return Qnil; | |
1185 #if 0 /* This would be clean, but maybe programs depend on | |
1186 a value of nil here. */ | |
1187 np = lim; | |
1188 #endif | |
1189 } | |
1190 else | |
1191 return Qnil; | |
1192 } | |
1193 | |
1194 if (np < BUF_BEGV (buf) || np > BUF_ZV (buf)) | |
2500 | 1195 ABORT (); |
428 | 1196 |
1197 BUF_SET_PT (buf, np); | |
1198 | |
1199 return make_int (np); | |
1200 } | |
1201 | |
1202 static int | |
1203 trivial_regexp_p (Lisp_Object regexp) | |
1204 { | |
1205 Bytecount len = XSTRING_LENGTH (regexp); | |
867 | 1206 Ibyte *s = XSTRING_DATA (regexp); |
428 | 1207 while (--len >= 0) |
1208 { | |
1209 switch (*s++) | |
1210 { | |
1724 | 1211 /* #### howcum ']' doesn't appear here, but ... */ |
428 | 1212 case '.': case '*': case '+': case '?': case '[': case '^': case '$': |
1213 return 0; | |
1214 case '\\': | |
1215 if (--len < 0) | |
1216 return 0; | |
1217 switch (*s++) | |
1218 { | |
1724 | 1219 /* ... ')' does appear here? ('<' and '>' can appear singly.) */ |
1220 /* #### are there other constructs to check? */ | |
428 | 1221 case '|': case '(': case ')': case '`': case '\'': case 'b': |
1222 case 'B': case '<': case '>': case 'w': case 'W': case 's': | |
1724 | 1223 case 'S': case '=': case '{': case '}': |
428 | 1224 #ifdef MULE |
1225 /* 97/2/25 jhod Added for category matches */ | |
1226 case 'c': case 'C': | |
1227 #endif /* MULE */ | |
1228 case '1': case '2': case '3': case '4': case '5': | |
1229 case '6': case '7': case '8': case '9': | |
1230 return 0; | |
1231 } | |
1232 } | |
1233 } | |
1234 return 1; | |
1235 } | |
1236 | |
1237 /* Search for the n'th occurrence of STRING in BUF, | |
665 | 1238 starting at position CHARBPOS and stopping at position BUFLIM, |
428 | 1239 treating PAT as a literal string if RE is false or as |
1240 a regular expression if RE is true. | |
1241 | |
1242 If N is positive, searching is forward and BUFLIM must be greater | |
665 | 1243 than CHARBPOS. |
428 | 1244 If N is negative, searching is backward and BUFLIM must be less |
665 | 1245 than CHARBPOS. |
428 | 1246 |
1247 Returns -x if only N-x occurrences found (x > 0), | |
1248 or else the position at the beginning of the Nth occurrence | |
1249 (if searching backward) or the end (if searching forward). | |
1250 | |
1251 POSIX is nonzero if we want full backtracking (POSIX style) | |
1252 for this pattern. 0 means backtrack only enough to get a valid match. */ | |
665 | 1253 static Charbpos |
1254 search_buffer (struct buffer *buf, Lisp_Object string, Charbpos charbpos, | |
1255 Charbpos buflim, EMACS_INT n, int RE, Lisp_Object trt, | |
446 | 1256 Lisp_Object inverse_trt, int posix) |
428 | 1257 { |
1258 Bytecount len = XSTRING_LENGTH (string); | |
867 | 1259 Ibyte *base_pat = XSTRING_DATA (string); |
428 | 1260 REGISTER EMACS_INT i, j; |
665 | 1261 Bytebpos p1, p2; |
428 | 1262 Bytecount s1, s2; |
665 | 1263 Bytebpos pos, lim; |
428 | 1264 |
853 | 1265 /* Some FSF junk with running_asynch_code, to preserve the match |
1266 data. Not necessary because we don't call process filters | |
1267 asynchronously (i.e. from within QUIT). */ | |
428 | 1268 |
1425 | 1269 /* Searching 0 times means noop---don't move, don't touch registers. */ |
1270 if (n == 0) | |
1271 return charbpos; | |
1272 | |
428 | 1273 /* Null string is found at starting position. */ |
1274 if (len == 0) | |
1275 { | |
665 | 1276 set_search_regs (buf, charbpos, 0); |
1277 return charbpos; | |
428 | 1278 } |
1279 | |
665 | 1280 pos = charbpos_to_bytebpos (buf, charbpos); |
1281 lim = charbpos_to_bytebpos (buf, buflim); | |
428 | 1282 if (RE && !trivial_regexp_p (string)) |
1283 { | |
1284 struct re_pattern_buffer *bufp; | |
826 | 1285 |
1286 bufp = compile_pattern (string, &search_regs, trt, | |
1287 wrap_buffer (buf), buf, posix, ERROR_ME); | |
428 | 1288 |
1289 /* Get pointers and sizes of the two strings | |
1290 that make up the visible portion of the buffer. */ | |
1291 | |
826 | 1292 p1 = BYTE_BUF_BEGV (buf); |
1293 p2 = BYTE_BUF_CEILING_OF (buf, p1); | |
428 | 1294 s1 = p2 - p1; |
826 | 1295 s2 = BYTE_BUF_ZV (buf) - p2; |
1296 | |
1297 while (n != 0) | |
428 | 1298 { |
1299 Bytecount val; | |
826 | 1300 struct syntax_cache scache_struct; |
1301 struct syntax_cache *scache = &scache_struct; | |
1302 | |
428 | 1303 QUIT; |
826 | 1304 /* By making the regex object, regex buffer, and syntax cache |
1305 arguments to re_{search,match}{,_2}, we've removed the need to | |
1306 do nasty things to deal with regex reentrancy. (See stack | |
1307 trace in signal.c for proof that this can happen.) | |
1308 | |
1309 #### there is still a potential problem with the regex cache -- | |
1310 the compiled regex could be overwritten. we'd need 20-fold | |
1311 reentrancy, though. Fix this. */ | |
1312 | |
428 | 1313 val = re_search_2 (bufp, |
826 | 1314 (char *) BYTE_BUF_BYTE_ADDRESS (buf, p1), s1, |
1315 (char *) BYTE_BUF_BYTE_ADDRESS (buf, p2), s2, | |
1316 pos - BYTE_BUF_BEGV (buf), lim - pos, &search_regs, | |
1317 n > 0 ? lim - BYTE_BUF_BEGV (buf) : | |
1318 pos - BYTE_BUF_BEGV (buf), wrap_buffer (buf), | |
1319 buf, scache); | |
428 | 1320 |
1321 if (val == -2) | |
1322 { | |
1323 matcher_overflow (); | |
1324 } | |
1325 if (val >= 0) | |
1326 { | |
1327 int num_regs = search_regs.num_regs; | |
826 | 1328 j = BYTE_BUF_BEGV (buf); |
428 | 1329 for (i = 0; i < num_regs; i++) |
1330 if (search_regs.start[i] >= 0) | |
1331 { | |
1332 search_regs.start[i] += j; | |
1333 search_regs.end[i] += j; | |
1334 } | |
793 | 1335 last_thing_searched = wrap_buffer (buf); |
428 | 1336 /* Set pos to the new position. */ |
826 | 1337 pos = n > 0 ? search_regs.end[0] : search_regs.start[0]; |
428 | 1338 fixup_search_regs_for_buffer (buf); |
665 | 1339 /* And charbpos too. */ |
826 | 1340 charbpos = n > 0 ? search_regs.end[0] : search_regs.start[0]; |
428 | 1341 } |
1342 else | |
826 | 1343 return (n > 0 ? 0 - n : n); |
1344 if (n > 0) n--; else n++; | |
428 | 1345 } |
665 | 1346 return charbpos; |
428 | 1347 } |
1348 else /* non-RE case */ | |
1349 { | |
446 | 1350 int charset_base = -1; |
1351 int boyer_moore_ok = 1; | |
2367 | 1352 Ibyte *patbuf = alloca_ibytes (len * MAX_ICHAR_LEN); |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1353 Ibyte *pat = patbuf; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1354 |
446 | 1355 #ifdef MULE |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1356 int entirely_one_byte_p = buf->text->entirely_one_byte_p; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1357 int nothing_greater_than_0xff = |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1358 buf->text->num_8_bit_fixed_chars == BUF_Z(buf) - BUF_BEG (buf); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1359 |
446 | 1360 while (len > 0) |
1361 { | |
867 | 1362 Ibyte tmp_str[MAX_ICHAR_LEN]; |
1363 Ichar c, translated, inverse; | |
446 | 1364 Bytecount orig_bytelen, new_bytelen, inv_bytelen; |
1365 | |
1366 /* If we got here and the RE flag is set, it's because | |
1367 we're dealing with a regexp known to be trivial, so the | |
1368 backslash just quotes the next character. */ | |
1369 if (RE && *base_pat == '\\') | |
1370 { | |
1371 len--; | |
1372 base_pat++; | |
1373 } | |
867 | 1374 c = itext_ichar (base_pat); |
446 | 1375 translated = TRANSLATE (trt, c); |
1376 inverse = TRANSLATE (inverse_trt, c); | |
1377 | |
867 | 1378 orig_bytelen = itext_ichar_len (base_pat); |
1379 inv_bytelen = set_itext_ichar (tmp_str, inverse); | |
1380 new_bytelen = set_itext_ichar (tmp_str, translated); | |
446 | 1381 |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1382 if (boyer_moore_ok |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1383 /* Only do the Boyer-Moore check for characters needing |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1384 translation. */ |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1385 && (translated != c || inverse != c)) |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1386 { |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1387 Ichar starting_c = c; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1388 int charset_base_code; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1389 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1390 do |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1391 { |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1392 c = TRANSLATE (inverse_trt, c); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1393 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1394 /* If a character cannot occur in the buffer, ignore |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1395 it. */ |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1396 if (c > 0x7F && entirely_one_byte_p) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1397 continue; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1398 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1399 if (c > 0xFF && nothing_greater_than_0xff) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1400 continue; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1401 |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1402 if (-1 == charset_base) /* No charset yet specified. */ |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1403 { |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1404 /* Keep track of which charset and character set row |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1405 contains the characters that need translation. |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1406 |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1407 Zero out the bits corresponding to the last |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1408 byte. */ |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1409 charset_base = c & ~ICHAR_FIELD3_MASK; |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1410 } |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1411 else |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1412 { |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1413 charset_base_code = c & ~ICHAR_FIELD3_MASK; |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1414 |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1415 if (charset_base_code != charset_base) |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1416 { |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1417 /* If two different rows, or two different |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1418 charsets, appear, needing non-ASCII |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1419 translation, then we cannot use boyer_moore |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1420 search. See the comment at the head of |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1421 boyer_moore(). */ |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1422 boyer_moore_ok = 0; |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1423 break; |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1424 } |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1425 } |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1426 } while (c != starting_c); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1427 |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1428 if (boyer_moore_ok && charset_base != -1 && |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1429 charset_base != (translated & ~ICHAR_FIELD3_MASK)) |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1430 { |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1431 /* In the rare event that the CANON entry for this |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1432 character is not in the desired set, choose one |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1433 that is, from the equivalence set. It doesn't much |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1434 matter which. */ |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1435 Ichar starting_ch = translated; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1436 do |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1437 { |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1438 translated = TRANSLATE (inverse_trt, translated); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1439 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1440 if (charset_base == (translated & ~ICHAR_FIELD3_MASK)) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1441 break; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1442 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1443 } while (starting_ch != translated); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1444 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1445 assert (starting_ch != translated); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1446 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1447 new_bytelen = set_itext_ichar (tmp_str, translated); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1448 } |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1449 } |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1450 |
446 | 1451 memcpy (pat, tmp_str, new_bytelen); |
1452 pat += new_bytelen; | |
1453 base_pat += orig_bytelen; | |
1454 len -= orig_bytelen; | |
1455 } | |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1456 |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1457 if (-1 == charset_base) |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1458 { |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1459 charset_base = 'a' & ~ICHAR_FIELD3_MASK; /* Default to ASCII. */ |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1460 } |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1461 |
446 | 1462 #else /* not MULE */ |
1463 while (--len >= 0) | |
1464 { | |
1465 /* If we got here and the RE flag is set, it's because | |
1466 we're dealing with a regexp known to be trivial, so the | |
1467 backslash just quotes the next character. */ | |
1468 if (RE && *base_pat == '\\') | |
1469 { | |
1470 len--; | |
1471 base_pat++; | |
1472 } | |
1473 *pat++ = TRANSLATE (trt, *base_pat++); | |
1474 } | |
1475 #endif /* MULE */ | |
1476 len = pat - patbuf; | |
1477 pat = base_pat = patbuf; | |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1478 |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1479 #ifdef DEBUG_XEMACS |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1480 if (debug_xemacs_searches) |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1481 { |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1482 Lisp_Symbol *sym = XSYMBOL (Qsearch_algorithm_used); |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1483 sym->value = boyer_moore_ok ? Qboyer_moore : Qsimple_search; |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1484 } |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1485 #endif |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1486 |
446 | 1487 if (boyer_moore_ok) |
1488 return boyer_moore (buf, base_pat, len, pos, lim, n, | |
1489 trt, inverse_trt, charset_base); | |
1490 else | |
1491 return simple_search (buf, base_pat, len, pos, lim, n, trt); | |
1492 } | |
1493 } | |
1494 | |
826 | 1495 /* Do a simple string search N times for the string PAT, whose length is |
1496 LEN/LEN_BYTE, from buffer position POS until LIM. TRT is the | |
1497 translation table. | |
446 | 1498 |
1499 Return the character position where the match is found. | |
1500 Otherwise, if M matches remained to be found, return -M. | |
1501 | |
1502 This kind of search works regardless of what is in PAT and | |
1503 regardless of what is in TRT. It is used in cases where | |
1504 boyer_moore cannot work. */ | |
1505 | |
665 | 1506 static Charbpos |
867 | 1507 simple_search (struct buffer *buf, Ibyte *base_pat, Bytecount len, |
826 | 1508 Bytebpos pos, Bytebpos lim, EMACS_INT n, Lisp_Object trt) |
446 | 1509 { |
1510 int forward = n > 0; | |
1511 Bytecount buf_len = 0; /* Shut up compiler. */ | |
1512 | |
826 | 1513 if (lim > pos) |
446 | 1514 while (n > 0) |
428 | 1515 { |
446 | 1516 while (1) |
428 | 1517 { |
826 | 1518 Bytecount this_len = len; |
1519 Bytebpos this_pos = pos; | |
867 | 1520 Ibyte *p = base_pat; |
826 | 1521 if (pos >= lim) |
446 | 1522 goto stop; |
1523 | |
1524 while (this_len > 0) | |
1525 { | |
867 | 1526 Ichar pat_ch, buf_ch; |
446 | 1527 Bytecount pat_len; |
1528 | |
867 | 1529 pat_ch = itext_ichar (p); |
826 | 1530 buf_ch = BYTE_BUF_FETCH_CHAR (buf, this_pos); |
446 | 1531 |
1532 buf_ch = TRANSLATE (trt, buf_ch); | |
1533 | |
1534 if (buf_ch != pat_ch) | |
1535 break; | |
1536 | |
867 | 1537 pat_len = itext_ichar_len (p); |
446 | 1538 p += pat_len; |
1539 this_len -= pat_len; | |
826 | 1540 INC_BYTEBPOS (buf, this_pos); |
446 | 1541 } |
1542 if (this_len == 0) | |
428 | 1543 { |
826 | 1544 buf_len = this_pos - pos; |
1545 pos = this_pos; | |
446 | 1546 break; |
428 | 1547 } |
826 | 1548 INC_BYTEBPOS (buf, pos); |
428 | 1549 } |
446 | 1550 n--; |
1551 } | |
1552 else | |
4322
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1553 { |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1554 /* If lim < len, then there are too few buffer positions to hold the |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1555 pattern between the beginning of the buffer and lim. Adjust to |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1556 ensure pattern fits. If we don't do this, we can assert in the |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1557 DEC_BYTEBPOS below. */ |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1558 if (lim < len) |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1559 lim = len; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1560 while (n < 0) |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1561 { |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1562 while (1) |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1563 { |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1564 Bytecount this_len = len; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1565 Bytebpos this_pos = pos; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1566 Ibyte *p; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1567 if (pos <= lim) |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1568 goto stop; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1569 p = base_pat + len; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1570 |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1571 while (this_len > 0) |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1572 { |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1573 Ichar pat_ch, buf_ch; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1574 |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1575 DEC_IBYTEPTR (p); |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1576 DEC_BYTEBPOS (buf, this_pos); |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1577 pat_ch = itext_ichar (p); |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1578 buf_ch = BYTE_BUF_FETCH_CHAR (buf, this_pos); |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1579 |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1580 buf_ch = TRANSLATE (trt, buf_ch); |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1581 |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1582 if (buf_ch != pat_ch) |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1583 break; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1584 |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1585 this_len -= itext_ichar_len (p); |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1586 } |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1587 if (this_len == 0) |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1588 { |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1589 buf_len = pos - this_pos; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1590 pos = this_pos; |
446 | 1591 break; |
4322
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1592 } |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1593 DEC_BYTEBPOS (buf, pos); |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1594 } |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1595 n++; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1596 } |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1597 } |
446 | 1598 stop: |
1599 if (n == 0) | |
1600 { | |
665 | 1601 Charbpos beg, end, retval; |
446 | 1602 if (forward) |
1603 { | |
826 | 1604 beg = bytebpos_to_charbpos (buf, pos - buf_len); |
1605 retval = end = bytebpos_to_charbpos (buf, pos); | |
446 | 1606 } |
1607 else | |
428 | 1608 { |
826 | 1609 retval = beg = bytebpos_to_charbpos (buf, pos); |
1610 end = bytebpos_to_charbpos (buf, pos + buf_len); | |
428 | 1611 } |
446 | 1612 set_search_regs (buf, beg, end - beg); |
1613 | |
1614 return retval; | |
1615 } | |
1616 else if (n > 0) | |
1617 return -n; | |
1618 else | |
1619 return n; | |
1620 } | |
1621 | |
1622 /* Do Boyer-Moore search N times for the string BASE_PAT, | |
1623 whose length is LEN bytes, | |
1624 from buffer byte position POS until byte position LIM. | |
1625 The sign of N says which direction we search in. | |
1626 TRT and INVERSE_TRT are translation tables. | |
1627 | |
1628 This kind of search works if all the characters in PAT that have | |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1629 (non-ASCII) translation are the same aside from the last byte. This |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1630 makes it possible to translate just the last byte of a character, and do |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1631 so after just a simple test of the context. |
446 | 1632 |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1633 If that criterion is not satisfied, do not call this function. You will |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1634 get an assertion failure. */ |
446 | 1635 |
665 | 1636 static Charbpos |
867 | 1637 boyer_moore (struct buffer *buf, Ibyte *base_pat, Bytecount len, |
665 | 1638 Bytebpos pos, Bytebpos lim, EMACS_INT n, Lisp_Object trt, |
2333 | 1639 Lisp_Object inverse_trt, int USED_IF_MULE (charset_base)) |
446 | 1640 { |
1641 /* #### Someone really really really needs to comment the workings | |
1642 of this junk somewhat better. | |
1643 | |
1644 BTW "BM" stands for Boyer-Moore, which is one of the standard | |
1645 string-searching algorithms. It's the best string-searching | |
1646 algorithm out there, provided that: | |
1647 | |
1648 a) You're not fazed by algorithm complexity. (Rabin-Karp, which | |
1649 uses hashing, is much much easier to code but not as fast.) | |
1650 b) You can freely move backwards in the string that you're | |
1651 searching through. | |
1652 | |
1653 As the comment below tries to explain (but garbles in typical | |
1654 programmer-ese), the idea is that you don't have to do a | |
1655 string match at every successive position in the text. For | |
1656 example, let's say the pattern is "a very long string". We | |
1657 compare the last character in the string (`g') with the | |
1658 corresponding character in the text. If it mismatches, and | |
1659 it is, say, `z', then we can skip forward by the entire | |
1660 length of the pattern because `z' does not occur anywhere | |
1661 in the pattern. If the mismatching character does occur | |
1662 in the pattern, we can usually still skip forward by more | |
1663 than one: e.g. if it is `l', then we can skip forward | |
1664 by the length of the substring "ong string" -- i.e. the | |
1665 largest end section of the pattern that does not contain | |
1666 the mismatched character. So what we do is compute, for | |
1667 each possible character, the distance we can skip forward | |
1668 (the "stride") and use it in the string matching. This | |
1669 is what the BM_tab holds. */ | |
1670 REGISTER EMACS_INT *BM_tab; | |
1671 EMACS_INT *BM_tab_base; | |
1672 REGISTER Bytecount dirlen; | |
1673 EMACS_INT infinity; | |
665 | 1674 Bytebpos limit; |
446 | 1675 Bytecount stride_for_teases = 0; |
1676 REGISTER EMACS_INT i, j; | |
867 | 1677 Ibyte *pat, *pat_end; |
1678 REGISTER Ibyte *cursor, *p_limit, *ptr2; | |
1679 Ibyte simple_translate[0400]; | |
446 | 1680 REGISTER int direction = ((n > 0) ? 1 : -1); |
1681 #ifdef MULE | |
867 | 1682 Ibyte translate_prev_byte = 0; |
1683 Ibyte translate_anteprev_byte = 0; | |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1684 /* These need to be rethought in the event that the internal format |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1685 changes, or in the event that num_8_bit_fixed_chars disappears |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1686 (entirely_one_byte_p can be trivially worked out by checking is the |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1687 byte count equal to the char count.) */ |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1688 int buffer_entirely_one_byte_p = buf->text->entirely_one_byte_p; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1689 int buffer_nothing_greater_than_0xff = |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1690 buf->text->num_8_bit_fixed_chars == BUF_Z(buf) - BUF_BEG (buf); |
446 | 1691 #endif |
1692 #ifdef C_ALLOCA | |
1693 EMACS_INT BM_tab_space[0400]; | |
1694 BM_tab = &BM_tab_space[0]; | |
1695 #else | |
1696 BM_tab = alloca_array (EMACS_INT, 256); | |
1697 #endif | |
1698 | |
1699 /* The general approach is that we are going to maintain that we | |
1700 know the first (closest to the present position, in whatever | |
1701 direction we're searching) character that could possibly be | |
1702 the last (furthest from present position) character of a | |
1703 valid match. We advance the state of our knowledge by | |
1704 looking at that character and seeing whether it indeed | |
1705 matches the last character of the pattern. If it does, we | |
1706 take a closer look. If it does not, we move our pointer (to | |
1707 putative last characters) as far as is logically possible. | |
1708 This amount of movement, which I call a stride, will be the | |
1709 length of the pattern if the actual character appears nowhere | |
1710 in the pattern, otherwise it will be the distance from the | |
1711 last occurrence of that character to the end of the pattern. | |
1712 As a coding trick, an enormous stride is coded into the table | |
1713 for characters that match the last character. This allows | |
1714 use of only a single test, a test for having gone past the | |
1715 end of the permissible match region, to test for both | |
1716 possible matches (when the stride goes past the end | |
1717 immediately) and failure to match (where you get nudged past | |
1718 the end one stride at a time). | |
1719 | |
1720 Here we make a "mickey mouse" BM table. The stride of the | |
1721 search is determined only by the last character of the | |
1722 putative match. If that character does not match, we will | |
1723 stride the proper distance to propose a match that | |
1724 superimposes it on the last instance of a character that | |
1725 matches it (per trt), or misses it entirely if there is | |
1726 none. */ | |
1727 | |
1728 dirlen = len * direction; | |
1729 infinity = dirlen - (lim + pos + len + len) * direction; | |
1730 /* Record position after the end of the pattern. */ | |
1731 pat_end = base_pat + len; | |
1732 if (direction < 0) | |
1733 base_pat = pat_end - 1; | |
1734 BM_tab_base = BM_tab; | |
1735 BM_tab += 0400; | |
1736 j = dirlen; /* to get it in a register */ | |
1737 /* A character that does not appear in the pattern induces a | |
1738 stride equal to the pattern length. */ | |
1739 while (BM_tab_base != BM_tab) | |
1740 { | |
1741 *--BM_tab = j; | |
1742 *--BM_tab = j; | |
1743 *--BM_tab = j; | |
1744 *--BM_tab = j; | |
1745 } | |
1746 /* We use this for translation, instead of TRT itself. We | |
1747 fill this in to handle the characters that actually occur | |
1748 in the pattern. Others don't matter anyway! */ | |
1749 xzero (simple_translate); | |
1750 for (i = 0; i < 0400; i++) | |
867 | 1751 simple_translate[i] = (Ibyte) i; |
446 | 1752 i = 0; |
1425 | 1753 |
446 | 1754 while (i != infinity) |
1755 { | |
867 | 1756 Ibyte *ptr = base_pat + i; |
446 | 1757 i += direction; |
1758 if (i == dirlen) | |
1759 i = infinity; | |
1760 if (!NILP (trt)) | |
428 | 1761 { |
446 | 1762 #ifdef MULE |
867 | 1763 Ichar ch, untranslated; |
446 | 1764 int this_translated = 1; |
1765 | |
1766 /* Is *PTR the last byte of a character? */ | |
867 | 1767 if (pat_end - ptr == 1 || ibyte_first_byte_p (ptr[1])) |
428 | 1768 { |
867 | 1769 Ibyte *charstart = ptr; |
1770 while (!ibyte_first_byte_p (*charstart)) | |
446 | 1771 charstart--; |
867 | 1772 untranslated = itext_ichar (charstart); |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1773 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1774 ch = TRANSLATE (trt, untranslated); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1775 if (!ibyte_first_byte_p (*ptr)) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1776 { |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1777 translate_prev_byte = ptr[-1]; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1778 if (!ibyte_first_byte_p (translate_prev_byte)) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1779 translate_anteprev_byte = ptr[-2]; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1780 } |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1781 |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1782 if (ch != untranslated && /* Was translation done? */ |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1783 charset_base != (ch & ~ICHAR_FIELD3_MASK)) |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1784 { |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1785 /* In the very rare event that the CANON entry for this |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1786 character is not in the desired set, choose one that |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1787 is, from the equivalence set. It doesn't much matter |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1788 which, since we're building our own cheesy equivalence |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1789 table instead of using that belonging to the case |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1790 table directly. |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1791 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1792 We can get here if search_buffer has worked out that |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1793 the buffer is entirely single width. */ |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1794 Ichar starting_ch = ch; |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1795 int count = 0; |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1796 do |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1797 { |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1798 ch = TRANSLATE (inverse_trt, ch); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1799 if (charset_base == (ch & ~ICHAR_FIELD3_MASK)) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1800 break; |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1801 ++count; |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1802 } while (starting_ch != ch); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1803 |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1804 /* If starting_ch is equal to ch (and count is not one, |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1805 which means no translation is necessary), the case |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1806 table is corrupt. (Any mapping in the canon table |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1807 should be reflected in the equivalence table, and we |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1808 know from the canon table that untranslated maps to |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1809 starting_ch and that untranslated has the correct value |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1810 for charset_base.) */ |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1811 assert (1 == count || starting_ch != ch); |
446 | 1812 } |
428 | 1813 } |
1814 else | |
1815 { | |
446 | 1816 ch = *ptr; |
1817 this_translated = 0; | |
1818 } | |
1819 if (ch > 0400) | |
1820 j = ((unsigned char) ch | 0200); | |
1821 else | |
1822 j = (unsigned char) ch; | |
1823 | |
1824 if (i == infinity) | |
1825 stride_for_teases = BM_tab[j]; | |
1826 BM_tab[j] = dirlen - i; | |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1827 /* A translation table is accompanied by its inverse -- see |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1828 comment in casetab.c. */ |
446 | 1829 if (this_translated) |
1830 { | |
867 | 1831 Ichar starting_ch = ch; |
446 | 1832 EMACS_INT starting_j = j; |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1833 do |
446 | 1834 { |
1835 ch = TRANSLATE (inverse_trt, ch); | |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1836 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1837 if (ch > 0x7F && buffer_entirely_one_byte_p) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1838 continue; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1839 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1840 if (ch > 0xFF && buffer_nothing_greater_than_0xff) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1841 continue; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1842 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1843 if (ch > 0400) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1844 j = ((unsigned char) ch | 0200); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1845 else |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1846 j = (unsigned char) ch; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1847 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1848 /* For all the characters that map into CH, set up |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1849 simple_translate to map the last byte into |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1850 STARTING_J. */ |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1851 simple_translate[j] = (Ibyte) starting_j; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1852 BM_tab[j] = dirlen - i; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1853 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1854 } while (ch != starting_ch); |
446 | 1855 } |
1856 #else | |
1857 EMACS_INT k; | |
1858 j = *ptr; | |
1859 k = (j = TRANSLATE (trt, j)); | |
1860 if (i == infinity) | |
1861 stride_for_teases = BM_tab[j]; | |
1862 BM_tab[j] = dirlen - i; | |
1863 /* A translation table is accompanied by its inverse -- | |
826 | 1864 see comment in casetab.c. */ |
446 | 1865 while ((j = TRANSLATE (inverse_trt, j)) != k) |
1866 { | |
867 | 1867 simple_translate[j] = (Ibyte) k; |
428 | 1868 BM_tab[j] = dirlen - i; |
1869 } | |
446 | 1870 #endif |
1871 } | |
1872 else | |
1873 { | |
1874 j = *ptr; | |
1875 | |
1876 if (i == infinity) | |
1877 stride_for_teases = BM_tab[j]; | |
1878 BM_tab[j] = dirlen - i; | |
428 | 1879 } |
446 | 1880 /* stride_for_teases tells how much to stride if we get a |
1881 match on the far character but are subsequently | |
1882 disappointed, by recording what the stride would have been | |
1883 for that character if the last character had been | |
1884 different. */ | |
1885 } | |
1886 infinity = dirlen - infinity; | |
1887 pos += dirlen - ((direction > 0) ? direction : 0); | |
1888 /* loop invariant - pos points at where last char (first char if | |
1889 reverse) of pattern would align in a possible match. */ | |
1890 while (n != 0) | |
1891 { | |
665 | 1892 Bytebpos tail_end; |
867 | 1893 Ibyte *tail_end_ptr; |
446 | 1894 /* It's been reported that some (broken) compiler thinks |
1895 that Boolean expressions in an arithmetic context are | |
1896 unsigned. Using an explicit ?1:0 prevents this. */ | |
1897 if ((lim - pos - ((direction > 0) ? 1 : 0)) * direction < 0) | |
1898 return n * (0 - direction); | |
1899 /* First we do the part we can by pointers (maybe | |
1900 nothing) */ | |
1901 QUIT; | |
1902 pat = base_pat; | |
1903 limit = pos - dirlen + direction; | |
1904 /* XEmacs change: definitions of CEILING_OF and FLOOR_OF | |
1905 have changed. See buffer.h. */ | |
1906 limit = ((direction > 0) | |
826 | 1907 ? BYTE_BUF_CEILING_OF (buf, limit) - 1 |
1908 : BYTE_BUF_FLOOR_OF (buf, limit + 1)); | |
446 | 1909 /* LIMIT is now the last (not beyond-last!) value POS can |
1910 take on without hitting edge of buffer or the gap. */ | |
1911 limit = ((direction > 0) | |
1912 ? min (lim - 1, min (limit, pos + 20000)) | |
1913 : max (lim, max (limit, pos - 20000))); | |
826 | 1914 tail_end = BYTE_BUF_CEILING_OF (buf, pos); |
1915 tail_end_ptr = BYTE_BUF_BYTE_ADDRESS (buf, tail_end); | |
446 | 1916 |
1917 if ((limit - pos) * direction > 20) | |
428 | 1918 { |
826 | 1919 /* We have to be careful because the code can generate addresses |
1920 that don't point to the beginning of characters. */ | |
1921 p_limit = BYTE_BUF_BYTE_ADDRESS_NO_VERIFY (buf, limit); | |
1922 ptr2 = (cursor = BYTE_BUF_BYTE_ADDRESS_NO_VERIFY (buf, pos)); | |
446 | 1923 /* In this loop, pos + cursor - ptr2 is the surrogate |
1924 for pos */ | |
1925 while (1) /* use one cursor setting as long as i can */ | |
1926 { | |
1927 if (direction > 0) /* worth duplicating */ | |
1928 { | |
1929 /* Use signed comparison if appropriate to make | |
1930 cursor+infinity sure to be > p_limit. | |
1931 Assuming that the buffer lies in a range of | |
1932 addresses that are all "positive" (as ints) | |
1933 or all "negative", either kind of comparison | |
1934 will work as long as we don't step by | |
1935 infinity. So pick the kind that works when | |
1936 we do step by infinity. */ | |
1937 if ((EMACS_INT) (p_limit + infinity) > | |
1938 (EMACS_INT) p_limit) | |
1939 while ((EMACS_INT) cursor <= | |
1940 (EMACS_INT) p_limit) | |
1941 cursor += BM_tab[*cursor]; | |
1942 else | |
1943 while ((EMACS_UINT) cursor <= | |
1944 (EMACS_UINT) p_limit) | |
1945 cursor += BM_tab[*cursor]; | |
1946 } | |
1947 else | |
1948 { | |
1949 if ((EMACS_INT) (p_limit + infinity) < | |
1950 (EMACS_INT) p_limit) | |
1951 while ((EMACS_INT) cursor >= | |
1952 (EMACS_INT) p_limit) | |
1953 cursor += BM_tab[*cursor]; | |
1954 else | |
1955 while ((EMACS_UINT) cursor >= | |
1956 (EMACS_UINT) p_limit) | |
1957 cursor += BM_tab[*cursor]; | |
1958 } | |
1959 /* If you are here, cursor is beyond the end of the | |
1960 searched region. This can happen if you match on | |
1961 the far character of the pattern, because the | |
1962 "stride" of that character is infinity, a number | |
1963 able to throw you well beyond the end of the | |
1964 search. It can also happen if you fail to match | |
1965 within the permitted region and would otherwise | |
1966 try a character beyond that region */ | |
1967 if ((cursor - p_limit) * direction <= len) | |
1968 break; /* a small overrun is genuine */ | |
1969 cursor -= infinity; /* large overrun = hit */ | |
1970 i = dirlen - direction; | |
1971 if (!NILP (trt)) | |
1972 { | |
1973 while ((i -= direction) + direction != 0) | |
1974 { | |
1975 #ifdef MULE | |
867 | 1976 Ichar ch; |
446 | 1977 cursor -= direction; |
1978 /* Translate only the last byte of a character. */ | |
1979 if ((cursor == tail_end_ptr | |
867 | 1980 || ibyte_first_byte_p (cursor[1])) |
1981 && (ibyte_first_byte_p (cursor[0]) | |
446 | 1982 || (translate_prev_byte == cursor[-1] |
867 | 1983 && (ibyte_first_byte_p (translate_prev_byte) |
446 | 1984 || translate_anteprev_byte == cursor[-2])))) |
1985 ch = simple_translate[*cursor]; | |
1986 else | |
1987 ch = *cursor; | |
1988 if (pat[i] != ch) | |
1989 break; | |
1990 #else | |
1991 if (pat[i] != TRANSLATE (trt, *(cursor -= direction))) | |
1992 break; | |
1993 #endif | |
1994 } | |
1995 } | |
1996 else | |
1997 { | |
1998 while ((i -= direction) + direction != 0) | |
1999 if (pat[i] != *(cursor -= direction)) | |
2000 break; | |
2001 } | |
2002 cursor += dirlen - i - direction; /* fix cursor */ | |
2003 if (i + direction == 0) | |
2004 { | |
2005 cursor -= direction; | |
2006 | |
2007 { | |
665 | 2008 Bytebpos bytstart = (pos + cursor - ptr2 + |
446 | 2009 ((direction > 0) |
2010 ? 1 - len : 0)); | |
665 | 2011 Charbpos bufstart = bytebpos_to_charbpos (buf, bytstart); |
2012 Charbpos bufend = bytebpos_to_charbpos (buf, bytstart + len); | |
446 | 2013 |
2014 set_search_regs (buf, bufstart, bufend - bufstart); | |
2015 } | |
2016 | |
2017 if ((n -= direction) != 0) | |
2018 cursor += dirlen; /* to resume search */ | |
2019 else | |
2020 return ((direction > 0) | |
2021 ? search_regs.end[0] : search_regs.start[0]); | |
2022 } | |
2023 else | |
2024 cursor += stride_for_teases; /* <sigh> we lose - */ | |
2025 } | |
2026 pos += cursor - ptr2; | |
2027 } | |
2028 else | |
2029 /* Now we'll pick up a clump that has to be done the hard | |
2030 way because it covers a discontinuity */ | |
2031 { | |
428 | 2032 /* XEmacs change: definitions of CEILING_OF and FLOOR_OF |
2033 have changed. See buffer.h. */ | |
2034 limit = ((direction > 0) | |
826 | 2035 ? BYTE_BUF_CEILING_OF (buf, pos - dirlen + 1) - 1 |
2036 : BYTE_BUF_FLOOR_OF (buf, pos - dirlen)); | |
428 | 2037 limit = ((direction > 0) |
446 | 2038 ? min (limit + len, lim - 1) |
2039 : max (limit - len, lim)); | |
2040 /* LIMIT is now the last value POS can have | |
2041 and still be valid for a possible match. */ | |
2042 while (1) | |
428 | 2043 { |
446 | 2044 /* This loop can be coded for space rather than |
2045 speed because it will usually run only once. | |
2046 (the reach is at most len + 21, and typically | |
2047 does not exceed len) */ | |
2048 while ((limit - pos) * direction >= 0) | |
826 | 2049 /* *not* BYTE_BUF_FETCH_CHAR. We are working here |
446 | 2050 with bytes, not characters. */ |
826 | 2051 pos += BM_tab[*BYTE_BUF_BYTE_ADDRESS_NO_VERIFY (buf, pos)]; |
446 | 2052 /* now run the same tests to distinguish going off |
2053 the end, a match or a phony match. */ | |
2054 if ((pos - limit) * direction <= len) | |
2055 break; /* ran off the end */ | |
2056 /* Found what might be a match. | |
2057 Set POS back to last (first if reverse) char pos. */ | |
2058 pos -= infinity; | |
2059 i = dirlen - direction; | |
2060 while ((i -= direction) + direction != 0) | |
428 | 2061 { |
446 | 2062 #ifdef MULE |
867 | 2063 Ichar ch; |
2064 Ibyte *ptr; | |
446 | 2065 #endif |
2066 pos -= direction; | |
2067 #ifdef MULE | |
826 | 2068 ptr = BYTE_BUF_BYTE_ADDRESS_NO_VERIFY (buf, pos); |
446 | 2069 if ((ptr == tail_end_ptr |
867 | 2070 || ibyte_first_byte_p (ptr[1])) |
2071 && (ibyte_first_byte_p (ptr[0]) | |
446 | 2072 || (translate_prev_byte == ptr[-1] |
867 | 2073 && (ibyte_first_byte_p (translate_prev_byte) |
446 | 2074 || translate_anteprev_byte == ptr[-2])))) |
2075 ch = simple_translate[*ptr]; | |
428 | 2076 else |
446 | 2077 ch = *ptr; |
2078 if (pat[i] != ch) | |
2079 break; | |
2080 | |
2081 #else | |
826 | 2082 if (pat[i] != |
2083 TRANSLATE (trt, | |
2084 *BYTE_BUF_BYTE_ADDRESS_NO_VERIFY (buf, pos))) | |
446 | 2085 break; |
2086 #endif | |
428 | 2087 } |
446 | 2088 /* Above loop has moved POS part or all the way back |
2089 to the first char pos (last char pos if reverse). | |
2090 Set it once again at the last (first if reverse) | |
2091 char. */ | |
2092 pos += dirlen - i- direction; | |
2093 if (i + direction == 0) | |
428 | 2094 { |
446 | 2095 pos -= direction; |
2096 | |
2097 { | |
665 | 2098 Bytebpos bytstart = (pos + |
446 | 2099 ((direction > 0) |
2100 ? 1 - len : 0)); | |
665 | 2101 Charbpos bufstart = bytebpos_to_charbpos (buf, bytstart); |
2102 Charbpos bufend = bytebpos_to_charbpos (buf, bytstart + len); | |
446 | 2103 |
2104 set_search_regs (buf, bufstart, bufend - bufstart); | |
2105 } | |
2106 | |
2107 if ((n -= direction) != 0) | |
2108 pos += dirlen; /* to resume search */ | |
428 | 2109 else |
446 | 2110 return ((direction > 0) |
2111 ? search_regs.end[0] : search_regs.start[0]); | |
428 | 2112 } |
446 | 2113 else |
2114 pos += stride_for_teases; | |
2115 } | |
428 | 2116 } |
446 | 2117 /* We have done one clump. Can we continue? */ |
2118 if ((lim - pos) * direction < 0) | |
2119 return (0 - n) * direction; | |
428 | 2120 } |
665 | 2121 return bytebpos_to_charbpos (buf, pos); |
428 | 2122 } |
2123 | |
1024 | 2124 /* Record the whole-match data (beginning BEG and end BEG + LEN) and the |
2125 buffer for a match just found. */ | |
428 | 2126 |
2127 static void | |
665 | 2128 set_search_regs (struct buffer *buf, Charbpos beg, Charcount len) |
428 | 2129 { |
2130 /* Make sure we have registers in which to store | |
2131 the match position. */ | |
2132 if (search_regs.num_regs == 0) | |
2133 { | |
2134 search_regs.start = xnew (regoff_t); | |
2135 search_regs.end = xnew (regoff_t); | |
2136 search_regs.num_regs = 1; | |
2137 } | |
2138 | |
1468 | 2139 clear_search_regs (); |
428 | 2140 search_regs.start[0] = beg; |
2141 search_regs.end[0] = beg + len; | |
793 | 2142 last_thing_searched = wrap_buffer (buf); |
428 | 2143 } |
2144 | |
1468 | 2145 /* Clear search registers so match data will be null. */ |
1024 | 2146 |
2147 static void | |
1468 | 2148 clear_search_regs (void) |
1024 | 2149 { |
2150 /* This function has been Mule-ized. */ | |
2151 int i; | |
2152 | |
1468 | 2153 for (i = 0; i < search_regs.num_regs; i++) |
2154 search_regs.start[i] = search_regs.end[i] = -1; | |
1024 | 2155 } |
2156 | |
428 | 2157 |
2158 /* Given a string of words separated by word delimiters, | |
442 | 2159 compute a regexp that matches those exact words |
2160 separated by arbitrary punctuation. */ | |
428 | 2161 |
2162 static Lisp_Object | |
2163 wordify (Lisp_Object buffer, Lisp_Object string) | |
2164 { | |
2165 Charcount i, len; | |
2166 EMACS_INT punct_count = 0, word_count = 0; | |
2167 struct buffer *buf = decode_buffer (buffer, 0); | |
826 | 2168 Lisp_Object syntax_table = buf->mirror_syntax_table; |
428 | 2169 |
2170 CHECK_STRING (string); | |
826 | 2171 len = string_char_length (string); |
428 | 2172 |
2173 for (i = 0; i < len; i++) | |
867 | 2174 if (!WORD_SYNTAX_P (syntax_table, string_ichar (string, i))) |
428 | 2175 { |
2176 punct_count++; | |
2177 if (i > 0 && WORD_SYNTAX_P (syntax_table, | |
867 | 2178 string_ichar (string, i - 1))) |
428 | 2179 word_count++; |
2180 } | |
867 | 2181 if (WORD_SYNTAX_P (syntax_table, string_ichar (string, len - 1))) |
428 | 2182 word_count++; |
2183 if (!word_count) return build_string (""); | |
2184 | |
2185 { | |
2186 /* The following value is an upper bound on the amount of storage we | |
2187 need. In non-Mule, it is exact. */ | |
867 | 2188 Ibyte *storage = |
2367 | 2189 alloca_ibytes (XSTRING_LENGTH (string) - punct_count + |
428 | 2190 5 * (word_count - 1) + 4); |
867 | 2191 Ibyte *o = storage; |
428 | 2192 |
2193 *o++ = '\\'; | |
2194 *o++ = 'b'; | |
2195 | |
2196 for (i = 0; i < len; i++) | |
2197 { | |
867 | 2198 Ichar ch = string_ichar (string, i); |
428 | 2199 |
2200 if (WORD_SYNTAX_P (syntax_table, ch)) | |
867 | 2201 o += set_itext_ichar (o, ch); |
428 | 2202 else if (i > 0 |
2203 && WORD_SYNTAX_P (syntax_table, | |
867 | 2204 string_ichar (string, i - 1)) |
428 | 2205 && --word_count) |
2206 { | |
2207 *o++ = '\\'; | |
2208 *o++ = 'W'; | |
2209 *o++ = '\\'; | |
2210 *o++ = 'W'; | |
2211 *o++ = '*'; | |
2212 } | |
2213 } | |
2214 | |
2215 *o++ = '\\'; | |
2216 *o++ = 'b'; | |
2217 | |
2218 return make_string (storage, o - storage); | |
2219 } | |
2220 } | |
2221 | |
2222 DEFUN ("search-backward", Fsearch_backward, 1, 5, "sSearch backward: ", /* | |
2223 Search backward from point for STRING. | |
2224 Set point to the beginning of the occurrence found, and return point. | |
444 | 2225 |
2226 Optional second argument LIMIT bounds the search; it is a buffer | |
2227 position. The match found must not extend before that position. | |
2228 The value nil is equivalent to (point-min). | |
2229 | |
2230 Optional third argument NOERROR, if t, means just return nil (no | |
2231 error) if the search fails. If neither nil nor t, set point to LIMIT | |
2232 and return nil. | |
2233 | |
2234 Optional fourth argument COUNT is a repeat count--search for | |
2235 successive occurrences. | |
2236 | |
428 | 2237 Optional fifth argument BUFFER specifies the buffer to search in and |
444 | 2238 defaults to the current buffer. |
2239 | |
1468 | 2240 When the match is successful, this function modifies the match data |
2241 that `match-beginning', `match-end' and `match-data' access; save the | |
2242 match data with `match-data' and restore it with `store-match-data' if | |
2243 you want to preserve them. If the match fails, the match data from the | |
2244 previous success match is preserved. | |
2245 | |
2246 See also the function `replace-match'. | |
428 | 2247 */ |
444 | 2248 (string, limit, noerror, count, buffer)) |
428 | 2249 { |
444 | 2250 return search_command (string, limit, noerror, count, buffer, -1, 0, 0); |
428 | 2251 } |
2252 | |
2253 DEFUN ("search-forward", Fsearch_forward, 1, 5, "sSearch: ", /* | |
2254 Search forward from point for STRING. | |
2255 Set point to the end of the occurrence found, and return point. | |
444 | 2256 |
2257 Optional second argument LIMIT bounds the search; it is a buffer | |
2258 position. The match found must not extend after that position. The | |
2259 value nil is equivalent to (point-max). | |
2260 | |
2261 Optional third argument NOERROR, if t, means just return nil (no | |
2262 error) if the search fails. If neither nil nor t, set point to LIMIT | |
2263 and return nil. | |
2264 | |
2265 Optional fourth argument COUNT is a repeat count--search for | |
2266 successive occurrences. | |
2267 | |
428 | 2268 Optional fifth argument BUFFER specifies the buffer to search in and |
444 | 2269 defaults to the current buffer. |
2270 | |
1468 | 2271 When the match is successful, this function modifies the match data |
2272 that `match-beginning', `match-end' and `match-data' access; save the | |
2273 match data with `match-data' and restore it with `store-match-data' if | |
2274 you want to preserve them. If the match fails, the match data from the | |
2275 previous success match is preserved. | |
2276 | |
2277 See also the function `replace-match'. | |
428 | 2278 */ |
444 | 2279 (string, limit, noerror, count, buffer)) |
428 | 2280 { |
444 | 2281 return search_command (string, limit, noerror, count, buffer, 1, 0, 0); |
428 | 2282 } |
2283 | |
2284 DEFUN ("word-search-backward", Fword_search_backward, 1, 5, | |
2285 "sWord search backward: ", /* | |
2286 Search backward from point for STRING, ignoring differences in punctuation. | |
2287 Set point to the beginning of the occurrence found, and return point. | |
444 | 2288 |
2289 Optional second argument LIMIT bounds the search; it is a buffer | |
2290 position. The match found must not extend before that position. | |
2291 The value nil is equivalent to (point-min). | |
2292 | |
2293 Optional third argument NOERROR, if t, means just return nil (no | |
2294 error) if the search fails. If neither nil nor t, set point to LIMIT | |
2295 and return nil. | |
2296 | |
2297 Optional fourth argument COUNT is a repeat count--search for | |
2298 successive occurrences. | |
2299 | |
428 | 2300 Optional fifth argument BUFFER specifies the buffer to search in and |
444 | 2301 defaults to the current buffer. |
2302 | |
1468 | 2303 When the match is successful, this function modifies the match data |
2304 that `match-beginning', `match-end' and `match-data' access; save the | |
2305 match data with `match-data' and restore it with `store-match-data' if | |
2306 you want to preserve them. If the match fails, the match data from the | |
2307 previous success match is preserved. | |
2308 | |
2309 See also the function `replace-match'. | |
428 | 2310 */ |
444 | 2311 (string, limit, noerror, count, buffer)) |
428 | 2312 { |
444 | 2313 return search_command (wordify (buffer, string), limit, noerror, count, |
428 | 2314 buffer, -1, 1, 0); |
2315 } | |
2316 | |
2317 DEFUN ("word-search-forward", Fword_search_forward, 1, 5, "sWord search: ", /* | |
2318 Search forward from point for STRING, ignoring differences in punctuation. | |
2319 Set point to the end of the occurrence found, and return point. | |
444 | 2320 |
2321 Optional second argument LIMIT bounds the search; it is a buffer | |
2322 position. The match found must not extend after that position. The | |
2323 value nil is equivalent to (point-max). | |
2324 | |
2325 Optional third argument NOERROR, if t, means just return nil (no | |
2326 error) if the search fails. If neither nil nor t, set point to LIMIT | |
2327 and return nil. | |
2328 | |
2329 Optional fourth argument COUNT is a repeat count--search for | |
2330 successive occurrences. | |
2331 | |
428 | 2332 Optional fifth argument BUFFER specifies the buffer to search in and |
444 | 2333 defaults to the current buffer. |
2334 | |
1468 | 2335 When the match is successful, this function modifies the match data |
2336 that `match-beginning', `match-end' and `match-data' access; save the | |
2337 match data with `match-data' and restore it with `store-match-data' if | |
2338 you want to preserve them. If the match fails, the match data from the | |
2339 previous success match is preserved. | |
2340 | |
2341 See also the function `replace-match'. | |
428 | 2342 */ |
444 | 2343 (string, limit, noerror, count, buffer)) |
428 | 2344 { |
444 | 2345 return search_command (wordify (buffer, string), limit, noerror, count, |
428 | 2346 buffer, 1, 1, 0); |
2347 } | |
2348 | |
2349 DEFUN ("re-search-backward", Fre_search_backward, 1, 5, | |
2350 "sRE search backward: ", /* | |
2351 Search backward from point for match for regular expression REGEXP. | |
2352 Set point to the beginning of the match, and return point. | |
2353 The match found is the one starting last in the buffer | |
2354 and yet ending before the origin of the search. | |
444 | 2355 |
2356 Optional second argument LIMIT bounds the search; it is a buffer | |
2357 position. The match found must not extend before that position. | |
2358 The value nil is equivalent to (point-min). | |
2359 | |
2360 Optional third argument NOERROR, if t, means just return nil (no | |
2361 error) if the search fails. If neither nil nor t, set point to LIMIT | |
2362 and return nil. | |
2363 | |
2364 Optional fourth argument COUNT is a repeat count--search for | |
2365 successive occurrences. | |
2366 | |
428 | 2367 Optional fifth argument BUFFER specifies the buffer to search in and |
444 | 2368 defaults to the current buffer. |
2369 | |
1468 | 2370 When the match is successful, this function modifies the match data |
2371 that `match-beginning', `match-end' and `match-data' access; save the | |
2372 match data with `match-data' and restore it with `store-match-data' if | |
2373 you want to preserve them. If the match fails, the match data from the | |
2374 previous success match is preserved. | |
2375 | |
2376 See also the function `replace-match'. | |
428 | 2377 */ |
444 | 2378 (regexp, limit, noerror, count, buffer)) |
428 | 2379 { |
444 | 2380 return search_command (regexp, limit, noerror, count, buffer, -1, 1, 0); |
428 | 2381 } |
2382 | |
2383 DEFUN ("re-search-forward", Fre_search_forward, 1, 5, "sRE search: ", /* | |
2384 Search forward from point for regular expression REGEXP. | |
2385 Set point to the end of the occurrence found, and return point. | |
444 | 2386 |
2387 Optional second argument LIMIT bounds the search; it is a buffer | |
2388 position. The match found must not extend after that position. The | |
2389 value nil is equivalent to (point-max). | |
2390 | |
2391 Optional third argument NOERROR, if t, means just return nil (no | |
2392 error) if the search fails. If neither nil nor t, set point to LIMIT | |
2393 and return nil. | |
2394 | |
2395 Optional fourth argument COUNT is a repeat count--search for | |
2396 successive occurrences. | |
2397 | |
428 | 2398 Optional fifth argument BUFFER specifies the buffer to search in and |
444 | 2399 defaults to the current buffer. |
2400 | |
1468 | 2401 When the match is successful, this function modifies the match data |
2402 that `match-beginning', `match-end' and `match-data' access; save the | |
2403 match data with `match-data' and restore it with `store-match-data' if | |
2404 you want to preserve them. If the match fails, the match data from the | |
2405 previous success match is preserved. | |
2406 | |
2407 See also the function `replace-match'. | |
428 | 2408 */ |
444 | 2409 (regexp, limit, noerror, count, buffer)) |
428 | 2410 { |
444 | 2411 return search_command (regexp, limit, noerror, count, buffer, 1, 1, 0); |
428 | 2412 } |
2413 | |
2414 DEFUN ("posix-search-backward", Fposix_search_backward, 1, 5, | |
2415 "sPosix search backward: ", /* | |
2416 Search backward from point for match for regular expression REGEXP. | |
2417 Find the longest match in accord with Posix regular expression rules. | |
2418 Set point to the beginning of the match, and return point. | |
2419 The match found is the one starting last in the buffer | |
2420 and yet ending before the origin of the search. | |
444 | 2421 |
2422 Optional second argument LIMIT bounds the search; it is a buffer | |
2423 position. The match found must not extend before that position. | |
2424 The value nil is equivalent to (point-min). | |
2425 | |
2426 Optional third argument NOERROR, if t, means just return nil (no | |
2427 error) if the search fails. If neither nil nor t, set point to LIMIT | |
2428 and return nil. | |
2429 | |
2430 Optional fourth argument COUNT is a repeat count--search for | |
2431 successive occurrences. | |
2432 | |
428 | 2433 Optional fifth argument BUFFER specifies the buffer to search in and |
444 | 2434 defaults to the current buffer. |
2435 | |
1468 | 2436 When the match is successful, this function modifies the match data |
2437 that `match-beginning', `match-end' and `match-data' access; save the | |
2438 match data with `match-data' and restore it with `store-match-data' if | |
2439 you want to preserve them. If the match fails, the match data from the | |
2440 previous success match is preserved. | |
2441 | |
2442 See also the function `replace-match'. | |
428 | 2443 */ |
444 | 2444 (regexp, limit, noerror, count, buffer)) |
428 | 2445 { |
444 | 2446 return search_command (regexp, limit, noerror, count, buffer, -1, 1, 1); |
428 | 2447 } |
2448 | |
2449 DEFUN ("posix-search-forward", Fposix_search_forward, 1, 5, "sPosix search: ", /* | |
2450 Search forward from point for regular expression REGEXP. | |
2451 Find the longest match in accord with Posix regular expression rules. | |
2452 Set point to the end of the occurrence found, and return point. | |
444 | 2453 |
2454 Optional second argument LIMIT bounds the search; it is a buffer | |
2455 position. The match found must not extend after that position. The | |
2456 value nil is equivalent to (point-max). | |
2457 | |
2458 Optional third argument NOERROR, if t, means just return nil (no | |
2459 error) if the search fails. If neither nil nor t, set point to LIMIT | |
2460 and return nil. | |
2461 | |
2462 Optional fourth argument COUNT is a repeat count--search for | |
2463 successive occurrences. | |
2464 | |
428 | 2465 Optional fifth argument BUFFER specifies the buffer to search in and |
444 | 2466 defaults to the current buffer. |
2467 | |
1468 | 2468 When the match is successful, this function modifies the match data |
2469 that `match-beginning', `match-end' and `match-data' access; save the | |
2470 match data with `match-data' and restore it with `store-match-data' if | |
2471 you want to preserve them. If the match fails, the match data from the | |
2472 previous success match is preserved. | |
2473 | |
2474 See also the function `replace-match'. | |
428 | 2475 */ |
444 | 2476 (regexp, limit, noerror, count, buffer)) |
428 | 2477 { |
444 | 2478 return search_command (regexp, limit, noerror, count, buffer, 1, 1, 1); |
428 | 2479 } |
2480 | |
2481 | |
2482 static Lisp_Object | |
2483 free_created_dynarrs (Lisp_Object cons) | |
2484 { | |
2485 Dynarr_free (get_opaque_ptr (XCAR (cons))); | |
2486 Dynarr_free (get_opaque_ptr (XCDR (cons))); | |
2487 free_opaque_ptr (XCAR (cons)); | |
2488 free_opaque_ptr (XCDR (cons)); | |
853 | 2489 free_cons (cons); |
428 | 2490 return Qnil; |
2491 } | |
2492 | |
2493 DEFUN ("replace-match", Freplace_match, 1, 5, 0, /* | |
444 | 2494 Replace text matched by last search with REPLACEMENT. |
4199 | 2495 Leaves point at end of replacement text. |
2496 Optional boolean FIXEDCASE inhibits matching case of REPLACEMENT to source. | |
2497 Optional boolean LITERAL inhibits interpretation of escape sequences. | |
2498 Optional STRING provides the source text to replace. | |
2499 Optional STRBUFFER may be a buffer, providing match context, or an integer | |
2500 specifying the subexpression to replace. | |
2501 | |
2502 If FIXEDCASE is non-nil, do not alter case of replacement text. | |
428 | 2503 Otherwise maybe capitalize the whole text, or maybe just word initials, |
2504 based on the replaced text. | |
4199 | 2505 If the replaced text has only capital letters and has at least one |
2506 multiletter word, convert REPLACEMENT to all caps. | |
428 | 2507 If the replaced text has at least one word starting with a capital letter, |
444 | 2508 then capitalize each word in REPLACEMENT. |
428 | 2509 |
4199 | 2510 If LITERAL is non-nil, insert REPLACEMENT literally. |
428 | 2511 Otherwise treat `\\' as special: |
444 | 2512 `\\&' in REPLACEMENT means substitute original matched text. |
428 | 2513 `\\N' means substitute what matched the Nth `\\(...\\)'. |
2514 If Nth parens didn't match, substitute nothing. | |
2515 `\\\\' means insert one `\\'. | |
2516 `\\u' means upcase the next character. | |
2517 `\\l' means downcase the next character. | |
2518 `\\U' means begin upcasing all following characters. | |
2519 `\\L' means begin downcasing all following characters. | |
2520 `\\E' means terminate the effect of any `\\U' or `\\L'. | |
2521 Case changes made with `\\u', `\\l', `\\U', and `\\L' override | |
2522 all other case changes that may be made in the replaced text. | |
4199 | 2523 |
2524 If non-nil, STRING is the source string, and a new string with the specified | |
2525 replacements is created and returned. Otherwise the current buffer is the | |
2526 source text. | |
2527 | |
2528 If non-nil, STRBUFFER may be an integer, interpreted as the index of the | |
2529 subexpression to replace in the source text, or a buffer to provide the | |
2530 syntax table and case table. If nil, then the \"subexpression\" is 0, i.e., | |
2531 the whole match, and the current buffer provides the syntax and case tables. | |
2532 If STRING is nil, STRBUFFER must be nil or an integer. | |
2533 | |
2534 Specifying a subexpression is only useful after a regular expression match, | |
2535 since a fixed string search has no non-trivial subexpressions. | |
2536 | |
2537 It is not possible to specify both a buffer and a subexpression. If that is | |
2538 desired, the idiom `(with-current-buffer BUFFER (replace-match ... INTEGER))' | |
2539 may be appropriate. | |
2540 | |
2541 If STRING is nil but the last thing matched (or searched) was a string, or | |
2542 STRING is a string but the last thing matched was a buffer, an | |
2543 `invalid-argument' error will be signaled. (XEmacs does not check that the | |
2544 last thing searched is the source string, but it is not useful to use a | |
2545 different string as source.) | |
2546 | |
2547 If no match (including searches) has been successful or the requested | |
1468 | 2548 subexpression was not matched, an `args-out-of-range' error will be |
2549 signaled. (If no match has ever been conducted in this instance of | |
2550 XEmacs, an `invalid-operation' error will be signaled. This is very | |
2551 rare.) | |
428 | 2552 */ |
444 | 2553 (replacement, fixedcase, literal, string, strbuffer)) |
428 | 2554 { |
2555 /* This function can GC */ | |
2556 enum { nochange, all_caps, cap_initial } case_action; | |
665 | 2557 Charbpos pos, last; |
428 | 2558 int some_multiletter_word; |
2559 int some_lowercase; | |
2560 int some_uppercase; | |
2561 int some_nonuppercase_initial; | |
867 | 2562 Ichar c, prevc; |
428 | 2563 Charcount inslen; |
2564 struct buffer *buf; | |
826 | 2565 Lisp_Object syntax_table; |
428 | 2566 int mc_count; |
2567 Lisp_Object buffer; | |
2568 int_dynarr *ul_action_dynarr = 0; | |
2569 int_dynarr *ul_pos_dynarr = 0; | |
502 | 2570 int sub = 0; |
428 | 2571 int speccount; |
2572 | |
444 | 2573 CHECK_STRING (replacement); |
428 | 2574 |
4199 | 2575 /* Because GNU decided to be incompatible here, we support the following |
2576 baroque and bogus API for the STRING and STRBUFFER arguments: | |
2577 types interpretations | |
2578 STRING STRBUFFER STRING STRBUFFER | |
2579 nil nil none 0 = index of subexpression to replace | |
2580 nil integer none index of subexpression to replace | |
2581 nil other ***** error ***** | |
2582 string nil source current buffer provides syntax table | |
2583 subexpression = 0 (whole match) | |
2584 string buffer source buffer providing syntax table | |
2585 subexpression = 0 (whole match) | |
2586 string integer source current buffer provides syntax table | |
2587 subexpression = STRBUFFER | |
2588 string other ***** error ***** | |
2589 */ | |
2590 | |
2591 /* Do STRBUFFER first; if STRING is nil, we'll overwrite BUF and BUFFER. */ | |
2592 | |
2593 /* If the match data were abstracted into a special "match data" type | |
2594 instead of the typical half-assed "let the implementation be visible" | |
2595 form it's in, we could extend it to include the last string matched | |
2596 and the buffer used for that matching. But of course we can't change | |
2597 it as it is. | |
2598 */ | |
2599 if (NILP (strbuffer) || BUFFERP (strbuffer)) | |
2600 { | |
2601 buf = decode_buffer (strbuffer, 0); | |
2602 } | |
2603 else if (!NILP (strbuffer)) | |
2604 { | |
2605 CHECK_INT (strbuffer); | |
2606 sub = XINT (strbuffer); | |
2607 if (sub < 0 || sub >= (int) search_regs.num_regs) | |
2608 invalid_argument ("match data register invalid", strbuffer); | |
2609 if (search_regs.start[sub] < 0) | |
2610 invalid_argument ("match data register not set", strbuffer); | |
2611 buf = current_buffer; | |
2612 } | |
2613 else | |
2614 invalid_argument ("STRBUFFER must be nil, a buffer, or an integer", | |
2615 strbuffer); | |
2616 buffer = wrap_buffer (buf); | |
2617 | |
428 | 2618 if (! NILP (string)) |
2619 { | |
2620 CHECK_STRING (string); | |
2621 if (!EQ (last_thing_searched, Qt)) | |
4199 | 2622 invalid_argument ("last thing matched was not a string", Qunbound); |
428 | 2623 } |
2624 else | |
2625 { | |
2626 if (!BUFFERP (last_thing_searched)) | |
4199 | 2627 invalid_argument ("last thing matched was not a buffer", Qunbound); |
428 | 2628 buffer = last_thing_searched; |
2629 buf = XBUFFER (buffer); | |
2630 } | |
2631 | |
826 | 2632 syntax_table = buf->mirror_syntax_table; |
428 | 2633 |
2634 case_action = nochange; /* We tried an initialization */ | |
2635 /* but some C compilers blew it */ | |
2636 | |
2637 if (search_regs.num_regs == 0) | |
826 | 2638 signal_error (Qinvalid_operation, |
2639 "replace-match called before any match found", Qunbound); | |
428 | 2640 |
2641 if (NILP (string)) | |
2642 { | |
469 | 2643 if (search_regs.start[sub] < BUF_BEGV (buf) |
2644 || search_regs.start[sub] > search_regs.end[sub] | |
2645 || search_regs.end[sub] > BUF_ZV (buf)) | |
2646 args_out_of_range (make_int (search_regs.start[sub]), | |
2647 make_int (search_regs.end[sub])); | |
428 | 2648 } |
2649 else | |
2650 { | |
2651 if (search_regs.start[0] < 0 | |
2652 || search_regs.start[0] > search_regs.end[0] | |
826 | 2653 || search_regs.end[0] > string_char_length (string)) |
428 | 2654 args_out_of_range (make_int (search_regs.start[0]), |
2655 make_int (search_regs.end[0])); | |
2656 } | |
2657 | |
2658 if (NILP (fixedcase)) | |
2659 { | |
2660 /* Decide how to casify by examining the matched text. */ | |
2661 | |
707 | 2662 last = search_regs.end[sub]; |
428 | 2663 prevc = '\n'; |
2664 case_action = all_caps; | |
2665 | |
2666 /* some_multiletter_word is set nonzero if any original word | |
2667 is more than one letter long. */ | |
2668 some_multiletter_word = 0; | |
2669 some_lowercase = 0; | |
2670 some_nonuppercase_initial = 0; | |
2671 some_uppercase = 0; | |
2672 | |
707 | 2673 for (pos = search_regs.start[sub]; pos < last; pos++) |
428 | 2674 { |
2675 if (NILP (string)) | |
2676 c = BUF_FETCH_CHAR (buf, pos); | |
2677 else | |
867 | 2678 c = string_ichar (string, pos); |
428 | 2679 |
2680 if (LOWERCASEP (buf, c)) | |
2681 { | |
2682 /* Cannot be all caps if any original char is lower case */ | |
2683 | |
2684 some_lowercase = 1; | |
2685 if (!WORD_SYNTAX_P (syntax_table, prevc)) | |
2686 some_nonuppercase_initial = 1; | |
2687 else | |
2688 some_multiletter_word = 1; | |
2689 } | |
2690 else if (!NOCASEP (buf, c)) | |
2691 { | |
2692 some_uppercase = 1; | |
2693 if (!WORD_SYNTAX_P (syntax_table, prevc)) | |
2694 ; | |
2695 else | |
2696 some_multiletter_word = 1; | |
2697 } | |
2698 else | |
2699 { | |
2700 /* If the initial is a caseless word constituent, | |
2701 treat that like a lowercase initial. */ | |
2702 if (!WORD_SYNTAX_P (syntax_table, prevc)) | |
2703 some_nonuppercase_initial = 1; | |
2704 } | |
2705 | |
2706 prevc = c; | |
2707 } | |
2708 | |
2709 /* Convert to all caps if the old text is all caps | |
2710 and has at least one multiletter word. */ | |
2711 if (! some_lowercase && some_multiletter_word) | |
2712 case_action = all_caps; | |
2713 /* Capitalize each word, if the old text has all capitalized words. */ | |
2714 else if (!some_nonuppercase_initial && some_multiletter_word) | |
2715 case_action = cap_initial; | |
2716 else if (!some_nonuppercase_initial && some_uppercase) | |
2717 /* Should x -> yz, operating on X, give Yz or YZ? | |
2718 We'll assume the latter. */ | |
2719 case_action = all_caps; | |
2720 else | |
2721 case_action = nochange; | |
2722 } | |
2723 | |
2724 /* Do replacement in a string. */ | |
2725 if (!NILP (string)) | |
2726 { | |
2727 Lisp_Object before, after; | |
2728 | |
2729 speccount = specpdl_depth (); | |
4199 | 2730 before = Fsubstring (string, Qzero, make_int (search_regs.start[sub])); |
2731 after = Fsubstring (string, make_int (search_regs.end[sub]), Qnil); | |
428 | 2732 |
444 | 2733 /* Do case substitution into REPLACEMENT if desired. */ |
428 | 2734 if (NILP (literal)) |
2735 { | |
826 | 2736 Charcount stlen = string_char_length (replacement); |
428 | 2737 Charcount strpos; |
2738 /* XEmacs change: rewrote this loop somewhat to make it | |
2739 cleaner. Also added \U, \E, etc. */ | |
2740 Charcount literal_start = 0; | |
2741 /* We build up the substituted string in ACCUM. */ | |
2742 Lisp_Object accum; | |
2743 | |
2744 accum = Qnil; | |
2745 | |
2746 /* OK, the basic idea here is that we scan through the | |
2747 replacement string until we find a backslash, which | |
2748 represents a substring of the original string to be | |
2749 substituted. We then append onto ACCUM the literal | |
2750 text before the backslash (LASTPOS marks the | |
2751 beginning of this) followed by the substring of the | |
2752 original string that needs to be inserted. */ | |
2753 for (strpos = 0; strpos < stlen; strpos++) | |
2754 { | |
2755 /* If LITERAL_END is set, we've encountered a backslash | |
2756 (the end of literal text to be inserted). */ | |
2757 Charcount literal_end = -1; | |
2758 /* If SUBSTART is set, we need to also insert the | |
2759 text from SUBSTART to SUBEND in the original string. */ | |
2760 Charcount substart = -1; | |
2761 Charcount subend = -1; | |
2762 | |
867 | 2763 c = string_ichar (replacement, strpos); |
428 | 2764 if (c == '\\' && strpos < stlen - 1) |
2765 { | |
867 | 2766 c = string_ichar (replacement, ++strpos); |
428 | 2767 if (c == '&') |
2768 { | |
2769 literal_end = strpos - 1; | |
2770 substart = search_regs.start[0]; | |
2771 subend = search_regs.end[0]; | |
2772 } | |
4199 | 2773 /* #### This logic is totally broken, |
2774 since we can have backrefs like "\99", right? */ | |
428 | 2775 else if (c >= '1' && c <= '9' && |
2776 c <= search_regs.num_regs + '0') | |
2777 { | |
2778 if (search_regs.start[c - '0'] >= 0) | |
2779 { | |
2780 literal_end = strpos - 1; | |
2781 substart = search_regs.start[c - '0']; | |
2782 subend = search_regs.end[c - '0']; | |
2783 } | |
2784 } | |
2785 else if (c == 'U' || c == 'u' || c == 'L' || c == 'l' || | |
2786 c == 'E') | |
2787 { | |
2788 /* Keep track of all case changes requested, but don't | |
2789 make them now. Do them later so we override | |
2790 everything else. */ | |
2791 if (!ul_pos_dynarr) | |
2792 { | |
2793 ul_pos_dynarr = Dynarr_new (int); | |
2794 ul_action_dynarr = Dynarr_new (int); | |
2795 record_unwind_protect | |
2796 (free_created_dynarrs, | |
2797 noseeum_cons | |
2798 (make_opaque_ptr (ul_pos_dynarr), | |
2799 make_opaque_ptr (ul_action_dynarr))); | |
2800 } | |
2801 literal_end = strpos - 1; | |
2802 Dynarr_add (ul_pos_dynarr, | |
2803 (!NILP (accum) | |
826 | 2804 ? string_char_length (accum) |
428 | 2805 : 0) + (literal_end - literal_start)); |
2806 Dynarr_add (ul_action_dynarr, c); | |
2807 } | |
2808 else if (c == '\\') | |
2809 /* So we get just one backslash. */ | |
2810 literal_end = strpos; | |
2811 } | |
2812 if (literal_end >= 0) | |
2813 { | |
2814 Lisp_Object literal_text = Qnil; | |
2815 Lisp_Object substring = Qnil; | |
2816 if (literal_end != literal_start) | |
444 | 2817 literal_text = Fsubstring (replacement, |
428 | 2818 make_int (literal_start), |
2819 make_int (literal_end)); | |
2820 if (substart >= 0 && subend != substart) | |
2821 substring = Fsubstring (string, | |
2822 make_int (substart), | |
2823 make_int (subend)); | |
2824 if (!NILP (literal_text) || !NILP (substring)) | |
2825 accum = concat3 (accum, literal_text, substring); | |
2826 literal_start = strpos + 1; | |
2827 } | |
2828 } | |
2829 | |
2830 if (strpos != literal_start) | |
2831 /* some literal text at end to be inserted */ | |
444 | 2832 replacement = concat2 (accum, Fsubstring (replacement, |
2833 make_int (literal_start), | |
2834 make_int (strpos))); | |
428 | 2835 else |
444 | 2836 replacement = accum; |
428 | 2837 } |
2838 | |
444 | 2839 /* replacement can be nil. */ |
2840 if (NILP (replacement)) | |
2841 replacement = build_string (""); | |
2842 | |
428 | 2843 if (case_action == all_caps) |
444 | 2844 replacement = Fupcase (replacement, buffer); |
428 | 2845 else if (case_action == cap_initial) |
444 | 2846 replacement = Fupcase_initials (replacement, buffer); |
428 | 2847 |
2848 /* Now finally, we need to process the \U's, \E's, etc. */ | |
2849 if (ul_pos_dynarr) | |
2850 { | |
2851 int i = 0; | |
2852 int cur_action = 'E'; | |
826 | 2853 Charcount stlen = string_char_length (replacement); |
428 | 2854 Charcount strpos; |
2855 | |
2856 for (strpos = 0; strpos < stlen; strpos++) | |
2857 { | |
867 | 2858 Ichar curchar = string_ichar (replacement, strpos); |
2859 Ichar newchar = -1; | |
428 | 2860 if (i < Dynarr_length (ul_pos_dynarr) && |
2861 strpos == Dynarr_at (ul_pos_dynarr, i)) | |
2862 { | |
2863 int new_action = Dynarr_at (ul_action_dynarr, i); | |
2864 i++; | |
2865 if (new_action == 'u') | |
2866 newchar = UPCASE (buf, curchar); | |
2867 else if (new_action == 'l') | |
2868 newchar = DOWNCASE (buf, curchar); | |
2869 else | |
2870 cur_action = new_action; | |
2871 } | |
2872 if (newchar == -1) | |
2873 { | |
2874 if (cur_action == 'U') | |
2875 newchar = UPCASE (buf, curchar); | |
2876 else if (cur_action == 'L') | |
2877 newchar = DOWNCASE (buf, curchar); | |
2878 else | |
2879 newchar = curchar; | |
2880 } | |
2881 if (newchar != curchar) | |
793 | 2882 set_string_char (replacement, strpos, newchar); |
428 | 2883 } |
2884 } | |
2885 | |
2886 /* frees the Dynarrs if necessary. */ | |
771 | 2887 unbind_to (speccount); |
444 | 2888 return concat3 (before, replacement, after); |
428 | 2889 } |
2890 | |
707 | 2891 mc_count = begin_multiple_change (buf, search_regs.start[sub], |
2892 search_regs.end[sub]); | |
428 | 2893 |
2894 /* begin_multiple_change() records an unwind-protect, so we need to | |
2895 record this value now. */ | |
2896 speccount = specpdl_depth (); | |
2897 | |
2898 /* We insert the replacement text before the old text, and then | |
2899 delete the original text. This means that markers at the | |
2900 beginning or end of the original will float to the corresponding | |
2901 position in the replacement. */ | |
707 | 2902 BUF_SET_PT (buf, search_regs.start[sub]); |
428 | 2903 if (!NILP (literal)) |
444 | 2904 Finsert (1, &replacement); |
428 | 2905 else |
2906 { | |
826 | 2907 Charcount stlen = string_char_length (replacement); |
428 | 2908 Charcount strpos; |
2909 struct gcpro gcpro1; | |
444 | 2910 GCPRO1 (replacement); |
428 | 2911 for (strpos = 0; strpos < stlen; strpos++) |
2912 { | |
707 | 2913 /* on the first iteration assert(offset==0), |
2914 exactly complementing BUF_SET_PT() above. | |
2915 During the loop, it keeps track of the amount inserted. | |
2916 */ | |
2917 Charcount offset = BUF_PT (buf) - search_regs.start[sub]; | |
428 | 2918 |
867 | 2919 c = string_ichar (replacement, strpos); |
428 | 2920 if (c == '\\' && strpos < stlen - 1) |
2921 { | |
707 | 2922 /* XXX FIXME: replacing just a substring non-literally |
2923 using backslash refs to the match looks dangerous. But | |
2924 <15366.18513.698042.156573@ns.caldera.de> from Torsten Duwe | |
2925 <duwe@caldera.de> claims Finsert_buffer_substring already | |
2926 handles this correctly. | |
2927 */ | |
867 | 2928 c = string_ichar (replacement, ++strpos); |
428 | 2929 if (c == '&') |
2930 Finsert_buffer_substring | |
2931 (buffer, | |
2932 make_int (search_regs.start[0] + offset), | |
2933 make_int (search_regs.end[0] + offset)); | |
4199 | 2934 /* #### This logic is totally broken, |
2935 since we can have backrefs like "\99", right? */ | |
428 | 2936 else if (c >= '1' && c <= '9' && |
2937 c <= search_regs.num_regs + '0') | |
2938 { | |
2939 if (search_regs.start[c - '0'] >= 1) | |
2940 Finsert_buffer_substring | |
2941 (buffer, | |
2942 make_int (search_regs.start[c - '0'] + offset), | |
2943 make_int (search_regs.end[c - '0'] + offset)); | |
2944 } | |
2945 else if (c == 'U' || c == 'u' || c == 'L' || c == 'l' || | |
2946 c == 'E') | |
2947 { | |
2948 /* Keep track of all case changes requested, but don't | |
2949 make them now. Do them later so we override | |
2950 everything else. */ | |
2951 if (!ul_pos_dynarr) | |
2952 { | |
2953 ul_pos_dynarr = Dynarr_new (int); | |
2954 ul_action_dynarr = Dynarr_new (int); | |
2955 record_unwind_protect | |
2956 (free_created_dynarrs, | |
2957 Fcons (make_opaque_ptr (ul_pos_dynarr), | |
2958 make_opaque_ptr (ul_action_dynarr))); | |
2959 } | |
2960 Dynarr_add (ul_pos_dynarr, BUF_PT (buf)); | |
2961 Dynarr_add (ul_action_dynarr, c); | |
2962 } | |
2963 else | |
2964 buffer_insert_emacs_char (buf, c); | |
2965 } | |
2966 else | |
2967 buffer_insert_emacs_char (buf, c); | |
2968 } | |
2969 UNGCPRO; | |
2970 } | |
2971 | |
707 | 2972 inslen = BUF_PT (buf) - (search_regs.start[sub]); |
2973 buffer_delete_range (buf, search_regs.start[sub] + inslen, | |
2974 search_regs.end[sub] + inslen, 0); | |
428 | 2975 |
2976 if (case_action == all_caps) | |
2977 Fupcase_region (make_int (BUF_PT (buf) - inslen), | |
2978 make_int (BUF_PT (buf)), buffer); | |
2979 else if (case_action == cap_initial) | |
2980 Fupcase_initials_region (make_int (BUF_PT (buf) - inslen), | |
2981 make_int (BUF_PT (buf)), buffer); | |
2982 | |
2983 /* Now go through and make all the case changes that were requested | |
2984 in the replacement string. */ | |
2985 if (ul_pos_dynarr) | |
2986 { | |
665 | 2987 Charbpos eend = BUF_PT (buf); |
428 | 2988 int i = 0; |
2989 int cur_action = 'E'; | |
2990 | |
2991 for (pos = BUF_PT (buf) - inslen; pos < eend; pos++) | |
2992 { | |
867 | 2993 Ichar curchar = BUF_FETCH_CHAR (buf, pos); |
2994 Ichar newchar = -1; | |
428 | 2995 if (i < Dynarr_length (ul_pos_dynarr) && |
2996 pos == Dynarr_at (ul_pos_dynarr, i)) | |
2997 { | |
2998 int new_action = Dynarr_at (ul_action_dynarr, i); | |
2999 i++; | |
3000 if (new_action == 'u') | |
3001 newchar = UPCASE (buf, curchar); | |
3002 else if (new_action == 'l') | |
3003 newchar = DOWNCASE (buf, curchar); | |
3004 else | |
3005 cur_action = new_action; | |
3006 } | |
3007 if (newchar == -1) | |
3008 { | |
3009 if (cur_action == 'U') | |
3010 newchar = UPCASE (buf, curchar); | |
3011 else if (cur_action == 'L') | |
3012 newchar = DOWNCASE (buf, curchar); | |
3013 else | |
3014 newchar = curchar; | |
3015 } | |
3016 if (newchar != curchar) | |
3017 buffer_replace_char (buf, pos, newchar, 0, 0); | |
3018 } | |
3019 } | |
3020 | |
3021 /* frees the Dynarrs if necessary. */ | |
771 | 3022 unbind_to (speccount); |
428 | 3023 end_multiple_change (buf, mc_count); |
3024 | |
3025 return Qnil; | |
3026 } | |
3027 | |
3028 static Lisp_Object | |
3029 match_limit (Lisp_Object num, int beginningp) | |
3030 { | |
3031 int n; | |
3032 | |
3033 CHECK_INT (num); | |
3034 n = XINT (num); | |
3035 if (n < 0 || n >= search_regs.num_regs) | |
3036 args_out_of_range (num, make_int (search_regs.num_regs)); | |
3037 if (search_regs.num_regs == 0 || | |
3038 search_regs.start[n] < 0) | |
3039 return Qnil; | |
3040 return make_int (beginningp ? search_regs.start[n] : search_regs.end[n]); | |
3041 } | |
3042 | |
3043 DEFUN ("match-beginning", Fmatch_beginning, 1, 1, 0, /* | |
3044 Return position of start of text matched by last regexp search. | |
3045 NUM, specifies which parenthesized expression in the last regexp. | |
3046 Value is nil if NUMth pair didn't match, or there were less than NUM pairs. | |
3047 Zero means the entire text matched by the whole regexp or whole string. | |
3048 */ | |
3049 (num)) | |
3050 { | |
3051 return match_limit (num, 1); | |
3052 } | |
3053 | |
3054 DEFUN ("match-end", Fmatch_end, 1, 1, 0, /* | |
3055 Return position of end of text matched by last regexp search. | |
3056 NUM specifies which parenthesized expression in the last regexp. | |
3057 Value is nil if NUMth pair didn't match, or there were less than NUM pairs. | |
3058 Zero means the entire text matched by the whole regexp or whole string. | |
3059 */ | |
3060 (num)) | |
3061 { | |
3062 return match_limit (num, 0); | |
3063 } | |
3064 | |
3065 DEFUN ("match-data", Fmatch_data, 0, 2, 0, /* | |
3066 Return a list containing all info on what the last regexp search matched. | |
3067 Element 2N is `(match-beginning N)'; element 2N + 1 is `(match-end N)'. | |
3068 All the elements are markers or nil (nil if the Nth pair didn't match) | |
3069 if the last match was on a buffer; integers or nil if a string was matched. | |
3070 Use `store-match-data' to reinstate the data in this list. | |
3071 | |
3072 If INTEGERS (the optional first argument) is non-nil, always use integers | |
3073 \(rather than markers) to represent buffer positions. | |
3074 If REUSE is a list, reuse it as part of the value. If REUSE is long enough | |
3075 to hold all the values, and if INTEGERS is non-nil, no consing is done. | |
3076 */ | |
3077 (integers, reuse)) | |
3078 { | |
3079 Lisp_Object tail, prev; | |
3080 Lisp_Object *data; | |
3081 int i; | |
3082 Charcount len; | |
3083 | |
3084 if (NILP (last_thing_searched)) | |
563 | 3085 /*error ("match-data called before any match found", Qunbound);*/ |
428 | 3086 return Qnil; |
3087 | |
3088 data = alloca_array (Lisp_Object, 2 * search_regs.num_regs); | |
3089 | |
3090 len = -1; | |
3091 for (i = 0; i < search_regs.num_regs; i++) | |
3092 { | |
665 | 3093 Charbpos start = search_regs.start[i]; |
428 | 3094 if (start >= 0) |
3095 { | |
3096 if (EQ (last_thing_searched, Qt) | |
3097 || !NILP (integers)) | |
3098 { | |
3099 data[2 * i] = make_int (start); | |
3100 data[2 * i + 1] = make_int (search_regs.end[i]); | |
3101 } | |
3102 else if (BUFFERP (last_thing_searched)) | |
3103 { | |
3104 data[2 * i] = Fmake_marker (); | |
3105 Fset_marker (data[2 * i], | |
3106 make_int (start), | |
3107 last_thing_searched); | |
3108 data[2 * i + 1] = Fmake_marker (); | |
3109 Fset_marker (data[2 * i + 1], | |
3110 make_int (search_regs.end[i]), | |
3111 last_thing_searched); | |
3112 } | |
3113 else | |
3114 /* last_thing_searched must always be Qt, a buffer, or Qnil. */ | |
2500 | 3115 ABORT (); |
428 | 3116 |
3117 len = i; | |
3118 } | |
3119 else | |
3120 data[2 * i] = data [2 * i + 1] = Qnil; | |
3121 } | |
3122 if (!CONSP (reuse)) | |
3123 return Flist (2 * len + 2, data); | |
3124 | |
3125 /* If REUSE is a list, store as many value elements as will fit | |
3126 into the elements of REUSE. */ | |
3127 for (prev = Qnil, i = 0, tail = reuse; CONSP (tail); i++, tail = XCDR (tail)) | |
3128 { | |
3129 if (i < 2 * len + 2) | |
3130 XCAR (tail) = data[i]; | |
3131 else | |
3132 XCAR (tail) = Qnil; | |
3133 prev = tail; | |
3134 } | |
3135 | |
3136 /* If we couldn't fit all value elements into REUSE, | |
3137 cons up the rest of them and add them to the end of REUSE. */ | |
3138 if (i < 2 * len + 2) | |
3139 XCDR (prev) = Flist (2 * len + 2 - i, data + i); | |
3140 | |
3141 return reuse; | |
3142 } | |
3143 | |
3144 | |
3145 DEFUN ("store-match-data", Fstore_match_data, 1, 1, 0, /* | |
3146 Set internal data on last search match from elements of LIST. | |
1468 | 3147 LIST should have been created by calling `match-data' previously, |
3148 or be nil, to clear the internal match data. | |
428 | 3149 */ |
3150 (list)) | |
3151 { | |
3152 REGISTER int i; | |
3153 REGISTER Lisp_Object marker; | |
3154 int num_regs; | |
3155 int length; | |
3156 | |
853 | 3157 /* Some FSF junk with running_asynch_code, to preserve the match |
3158 data. Not necessary because we don't call process filters | |
3159 asynchronously (i.e. from within QUIT). */ | |
428 | 3160 |
3161 CONCHECK_LIST (list); | |
3162 | |
3163 /* Unless we find a marker with a buffer in LIST, assume that this | |
3164 match data came from a string. */ | |
3165 last_thing_searched = Qt; | |
3166 | |
3167 /* Allocate registers if they don't already exist. */ | |
3168 length = XINT (Flength (list)) / 2; | |
3169 num_regs = search_regs.num_regs; | |
3170 | |
3171 if (length > num_regs) | |
3172 { | |
3173 if (search_regs.num_regs == 0) | |
3174 { | |
3175 search_regs.start = xnew_array (regoff_t, length); | |
3176 search_regs.end = xnew_array (regoff_t, length); | |
3177 } | |
3178 else | |
3179 { | |
3180 XREALLOC_ARRAY (search_regs.start, regoff_t, length); | |
3181 XREALLOC_ARRAY (search_regs.end, regoff_t, length); | |
3182 } | |
3183 | |
3184 search_regs.num_regs = length; | |
3185 } | |
3186 | |
3187 for (i = 0; i < num_regs; i++) | |
3188 { | |
3189 marker = Fcar (list); | |
3190 if (NILP (marker)) | |
3191 { | |
3192 search_regs.start[i] = -1; | |
3193 list = Fcdr (list); | |
3194 } | |
3195 else | |
3196 { | |
3197 if (MARKERP (marker)) | |
3198 { | |
3199 if (XMARKER (marker)->buffer == 0) | |
3200 marker = Qzero; | |
3201 else | |
793 | 3202 last_thing_searched = wrap_buffer (XMARKER (marker)->buffer); |
428 | 3203 } |
3204 | |
3205 CHECK_INT_COERCE_MARKER (marker); | |
3206 search_regs.start[i] = XINT (marker); | |
3207 list = Fcdr (list); | |
3208 | |
3209 marker = Fcar (list); | |
3210 if (MARKERP (marker) && XMARKER (marker)->buffer == 0) | |
3211 marker = Qzero; | |
3212 | |
3213 CHECK_INT_COERCE_MARKER (marker); | |
3214 search_regs.end[i] = XINT (marker); | |
3215 } | |
3216 list = Fcdr (list); | |
3217 } | |
3218 | |
3219 return Qnil; | |
3220 } | |
3221 | |
3222 /* Quote a string to inactivate reg-expr chars */ | |
3223 | |
3224 DEFUN ("regexp-quote", Fregexp_quote, 1, 1, 0, /* | |
3225 Return a regexp string which matches exactly STRING and nothing else. | |
3226 */ | |
444 | 3227 (string)) |
428 | 3228 { |
867 | 3229 REGISTER Ibyte *in, *out, *end; |
3230 REGISTER Ibyte *temp; | |
428 | 3231 |
444 | 3232 CHECK_STRING (string); |
428 | 3233 |
2367 | 3234 temp = alloca_ibytes (XSTRING_LENGTH (string) * 2); |
428 | 3235 |
3236 /* Now copy the data into the new string, inserting escapes. */ | |
3237 | |
444 | 3238 in = XSTRING_DATA (string); |
3239 end = in + XSTRING_LENGTH (string); | |
428 | 3240 out = temp; |
3241 | |
3242 while (in < end) | |
3243 { | |
867 | 3244 Ichar c = itext_ichar (in); |
428 | 3245 |
3246 if (c == '[' || c == ']' | |
3247 || c == '*' || c == '.' || c == '\\' | |
3248 || c == '?' || c == '+' | |
3249 || c == '^' || c == '$') | |
3250 *out++ = '\\'; | |
867 | 3251 out += set_itext_ichar (out, c); |
3252 INC_IBYTEPTR (in); | |
428 | 3253 } |
3254 | |
3255 return make_string (temp, out - temp); | |
3256 } | |
3257 | |
3258 DEFUN ("set-word-regexp", Fset_word_regexp, 1, 1, 0, /* | |
3259 Set the regexp to be used to match a word in regular-expression searching. | |
3260 #### Not yet implemented. Currently does nothing. | |
3261 #### Do not use this yet. Its calling interface is likely to change. | |
3262 */ | |
2286 | 3263 (UNUSED (regexp))) |
428 | 3264 { |
3265 return Qnil; | |
3266 } | |
3267 | |
3268 | |
3269 /************************************************************************/ | |
3270 /* initialization */ | |
3271 /************************************************************************/ | |
3272 | |
3273 void | |
3274 syms_of_search (void) | |
3275 { | |
3276 | |
442 | 3277 DEFERROR_STANDARD (Qsearch_failed, Qinvalid_operation); |
3278 DEFERROR_STANDARD (Qinvalid_regexp, Qsyntax_error); | |
563 | 3279 Fput (Qinvalid_regexp, Qerror_lacks_explanatory_string, Qt); |
428 | 3280 |
3281 DEFSUBR (Flooking_at); | |
3282 DEFSUBR (Fposix_looking_at); | |
3283 DEFSUBR (Fstring_match); | |
3284 DEFSUBR (Fposix_string_match); | |
3285 DEFSUBR (Fskip_chars_forward); | |
3286 DEFSUBR (Fskip_chars_backward); | |
3287 DEFSUBR (Fskip_syntax_forward); | |
3288 DEFSUBR (Fskip_syntax_backward); | |
3289 DEFSUBR (Fsearch_forward); | |
3290 DEFSUBR (Fsearch_backward); | |
3291 DEFSUBR (Fword_search_forward); | |
3292 DEFSUBR (Fword_search_backward); | |
3293 DEFSUBR (Fre_search_forward); | |
3294 DEFSUBR (Fre_search_backward); | |
3295 DEFSUBR (Fposix_search_forward); | |
3296 DEFSUBR (Fposix_search_backward); | |
3297 DEFSUBR (Freplace_match); | |
3298 DEFSUBR (Fmatch_beginning); | |
3299 DEFSUBR (Fmatch_end); | |
3300 DEFSUBR (Fmatch_data); | |
3301 DEFSUBR (Fstore_match_data); | |
3302 DEFSUBR (Fregexp_quote); | |
3303 DEFSUBR (Fset_word_regexp); | |
3304 } | |
3305 | |
3306 void | |
3307 reinit_vars_of_search (void) | |
3308 { | |
3309 int i; | |
3310 | |
3311 last_thing_searched = Qnil; | |
3312 staticpro_nodump (&last_thing_searched); | |
3313 | |
3314 for (i = 0; i < REGEXP_CACHE_SIZE; ++i) | |
3315 { | |
3316 searchbufs[i].buf.allocated = 100; | |
3317 searchbufs[i].buf.buffer = (unsigned char *) xmalloc (100); | |
3318 searchbufs[i].buf.fastmap = searchbufs[i].fastmap; | |
3319 searchbufs[i].regexp = Qnil; | |
3320 staticpro_nodump (&searchbufs[i].regexp); | |
3321 searchbufs[i].next = (i == REGEXP_CACHE_SIZE-1 ? 0 : &searchbufs[i+1]); | |
3322 } | |
3323 searchbuf_head = &searchbufs[0]; | |
3324 } | |
3325 | |
3326 void | |
3327 vars_of_search (void) | |
3328 { | |
3329 DEFVAR_LISP ("forward-word-regexp", &Vforward_word_regexp /* | |
3330 *Regular expression to be used in `forward-word'. | |
3331 #### Not yet implemented. | |
3332 */ ); | |
3333 Vforward_word_regexp = Qnil; | |
3334 | |
3335 DEFVAR_LISP ("backward-word-regexp", &Vbackward_word_regexp /* | |
3336 *Regular expression to be used in `backward-word'. | |
3337 #### Not yet implemented. | |
3338 */ ); | |
3339 Vbackward_word_regexp = Qnil; | |
502 | 3340 |
3341 DEFVAR_INT ("warn-about-possibly-incompatible-back-references", | |
3342 &warn_about_possibly_incompatible_back_references /* | |
3343 If true, issue warnings when new-semantics back references occur. | |
3344 This is to catch places where old code might inadvertently have changed | |
3345 semantics. This will occur in old code only where more than nine groups | |
3346 occur and a back reference to one of them is directly followed by a digit. | |
3347 */ ); | |
3348 warn_about_possibly_incompatible_back_references = 1; | |
814 | 3349 |
2421 | 3350 Vskip_chars_range_table = Fmake_range_table (Qstart_closed_end_closed); |
428 | 3351 staticpro (&Vskip_chars_range_table); |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
3352 #ifdef DEBUG_XEMACS |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
3353 DEFSYMBOL (Qsearch_algorithm_used); |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
3354 DEFSYMBOL (Qboyer_moore); |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
3355 DEFSYMBOL (Qsimple_search); |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
3356 |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
3357 DEFVAR_INT ("debug-xemacs-searches", &debug_xemacs_searches /* |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
3358 If non-zero, bind `search-algorithm-used' to `boyer-moore' or `simple-search', |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
3359 depending on the algorithm used for each search. Used for testing. |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
3360 */ ); |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
3361 debug_xemacs_searches = 0; |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
3362 #endif |
428 | 3363 } |