Mercurial > hg > xemacs-beta
annotate src/search.c @ 5041:efaa6cd845e5
add regexp-debugging
-------------------- ChangeLog entries follow: --------------------
src/ChangeLog addition:
2010-02-15 Ben Wing <ben@xemacs.org>
* regex.c:
* regex.c (DEBUG_FAIL_PRINT1):
* regex.c (PUSH_FAILURE_POINT):
* regex.c (POP_FAILURE_POINT):
* regex.c (regex_compile):
* regex.c (re_match_2_internal):
* regex.h:
* search.c:
* search.c (search_buffer):
* search.c (debug_regexps_changed):
* search.c (vars_of_search):
Add an internal variable debug_regexps and a corresponding Lisp
variable `debug-regexps' that takes a list of areas in which to
display debugging info about regex compilation and matching
(currently three areas exist). Use existing debugging code
already in regex.c and modify it so that it recognizes the
debug_regexps variable and the flags in it.
Rename variable `debug-xemacs-searches' to just `debug-searches',
consistent with other debug vars.
tests/ChangeLog addition:
2010-02-15 Ben Wing <ben@xemacs.org>
* automated/search-tests.el (let):
* automated/search-tests.el (boundp):
debug-xemacs-searches renamed to debug-searches.
author | Ben Wing <ben@xemacs.org> |
---|---|
date | Mon, 15 Feb 2010 21:51:22 -0600 |
parents | 2ade80e8c640 |
children | 99f8ebc082d9 |
rev | line source |
---|---|
428 | 1 /* String search routines for XEmacs. |
2 Copyright (C) 1985, 1986, 1987, 1992-1995 Free Software Foundation, Inc. | |
3 Copyright (C) 1995 Sun Microsystems, Inc. | |
5041 | 4 Copyright (C) 2001, 2002, 2010 Ben Wing. |
428 | 5 |
6 This file is part of XEmacs. | |
7 | |
8 XEmacs is free software; you can redistribute it and/or modify it | |
9 under the terms of the GNU General Public License as published by the | |
10 Free Software Foundation; either version 2, or (at your option) any | |
11 later version. | |
12 | |
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT | |
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
16 for more details. | |
17 | |
18 You should have received a copy of the GNU General Public License | |
19 along with XEmacs; see the file COPYING. If not, write to | |
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
21 Boston, MA 02111-1307, USA. */ | |
22 | |
23 /* Synched up with: FSF 19.29, except for region-cache stuff. */ | |
24 | |
25 /* Hacked on for Mule by Ben Wing, December 1994 and August 1995. */ | |
26 | |
826 | 27 /* This file has been Mule-ized. */ |
428 | 28 |
29 #include <config.h> | |
30 #include "lisp.h" | |
31 | |
32 #include "buffer.h" | |
33 #include "insdel.h" | |
34 #include "opaque.h" | |
35 #ifdef REGION_CACHE_NEEDS_WORK | |
36 #include "region-cache.h" | |
37 #endif | |
38 #include "syntax.h" | |
39 | |
40 #include <sys/types.h> | |
41 #include "regex.h" | |
446 | 42 #include "casetab.h" |
43 #include "chartab.h" | |
44 | |
/* Map character POS through translation table TABLE, or yield POS
   unchanged when TABLE is nil.  POS is parenthesized so that an
   expression argument (e.g. `a + b') is cast and returned as a whole
   rather than having the cast bind to its first operand only.  */
#define TRANSLATE(table, pos)	\
  (!NILP (table) ? TRT_TABLE_OF (table, (Ichar) (pos)) : (pos))
428 | 47 |
48 #define REGEXP_CACHE_SIZE 20 | |
49 | |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
50 #ifdef DEBUG_XEMACS |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
51 |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
52 /* Used in tests/automated/case-tests.el if available. */ |
5041 | 53 Fixnum debug_searches; |
54 | |
55 /* Declare as int rather than Bitflags because it's used by regex.c, which | |
56 may be used outside of XEmacs (e.g. etags.c). */ | |
57 int debug_regexps; | |
58 Lisp_Object Vdebug_regexps; | |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
59 |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
60 Lisp_Object Qsearch_algorithm_used, Qboyer_moore, Qsimple_search; |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
61 |
5041 | 62 Lisp_Object Qcompilation, Qfailure_point, Qmatching; |
63 | |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
64 #endif |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
65 |
428 | 66 /* If the regexp is non-nil, then the buffer contains the compiled form |
67 of that regexp, suitable for searching. */ | |
446 | 68 struct regexp_cache |
69 { | |
428 | 70 struct regexp_cache *next; |
71 Lisp_Object regexp; | |
72 struct re_pattern_buffer buf; | |
73 char fastmap[0400]; | |
74 /* Nonzero means regexp was compiled to do full POSIX backtracking. */ | |
75 char posix; | |
76 }; | |
77 | |
78 /* The instances of that struct. */ | |
79 static struct regexp_cache searchbufs[REGEXP_CACHE_SIZE]; | |
80 | |
81 /* The head of the linked list; points to the most recently used buffer. */ | |
82 static struct regexp_cache *searchbuf_head; | |
83 | |
84 | |
85 /* Every call to re_match, etc., must pass &search_regs as the regs | |
86 argument unless you can show it is unnecessary (i.e., if re_match | |
87 is certainly going to be called again before region-around-match | |
88 can be called). | |
89 | |
90 Since the registers are now dynamically allocated, we need to make | |
91 sure not to refer to the Nth register before checking that it has | |
92 been allocated by checking search_regs.num_regs. | |
93 | |
94 The regex code keeps track of whether it has allocated the search | |
95 buffer using bits in the re_pattern_buffer. This means that whenever | |
96 you compile a new pattern, it completely forgets whether it has | |
97 allocated any registers, and will allocate new registers the next | |
98 time you call a searching or matching function. Therefore, we need | |
99 to call re_set_registers after compiling a new pattern or after | |
100 setting the match registers, so that the regex functions will be | |
101 able to free or re-allocate it properly. */ | |
102 | |
103 /* Note: things get trickier under Mule because the values returned from | |
826 | 104 the regexp routines are in Bytebpos's but we need them to be in Charbpos's. |
428 | 105 We take the easy way out for the moment and just convert them immediately. |
106 We could be more clever by not converting them until necessary, but | |
107 that gets real ugly real fast since the buffer might have changed and | |
108 the positions might be out of sync or out of range. | |
109 */ | |
110 static struct re_registers search_regs; | |
111 | |
1468 | 112 /* Every function that sets the match data _must_ clear unused search |
113 registers on success. An unsuccessful search or match _must_ preserve | |
114 the search registers. The traditional documentation implied that | |
115 any match operation might trash the registers, but in fact failures | |
116 have always preserved the match data (in GNU Emacs as well). Some | |
117 plausible code depends on this behavior (cf. `w3-configuration-data' | |
118 in library "w3-cfg"). | |
119 | |
120 Ordinary string searches use set_search_regs to set the whole-string | |
121 match. That function takes care of clearing the unused subexpression | |
1425 | 122 registers. |
123 */ | |
124 static void set_search_regs (struct buffer *buf, Charbpos beg, Charcount len); | |
1468 | 125 static void clear_search_regs (void); |
1425 | 126 |
428 | 127 /* The buffer in which the last search was performed, or |
128 Qt if the last search was done in a string; | |
129 Qnil if no searching has been done yet. */ | |
130 static Lisp_Object last_thing_searched; | |
131 | |
132 /* error condition signalled when regexp compile_pattern fails */ | |
133 | |
134 Lisp_Object Qinvalid_regexp; | |
135 | |
136 /* Regular expressions used in forward/backward-word */ | |
137 Lisp_Object Vforward_word_regexp, Vbackward_word_regexp; | |
138 | |
507 | 139 Fixnum warn_about_possibly_incompatible_back_references; |
502 | 140 |
428 | 141 /* range table for use with skip_chars. Only needed for Mule. */ |
142 Lisp_Object Vskip_chars_range_table; | |
143 | |
867 | 144 static Charbpos simple_search (struct buffer *buf, Ibyte *base_pat, |
826 | 145 Bytecount len, Bytebpos pos, Bytebpos lim, |
146 EMACS_INT n, Lisp_Object trt); | |
867 | 147 static Charbpos boyer_moore (struct buffer *buf, Ibyte *base_pat, |
826 | 148 Bytecount len, Bytebpos pos, Bytebpos lim, |
149 EMACS_INT n, Lisp_Object trt, | |
150 Lisp_Object inverse_trt, int charset_base); | |
665 | 151 static Charbpos search_buffer (struct buffer *buf, Lisp_Object str, |
826 | 152 Charbpos charbpos, Charbpos buflim, EMACS_INT n, |
153 int RE, Lisp_Object trt, | |
154 Lisp_Object inverse_trt, int posix); | |
771 | 155 |
2268 | 156 static DECLARE_DOESNT_RETURN (matcher_overflow (void)); |
157 | |
158 static DOESNT_RETURN | |
159 matcher_overflow () | |
428 | 160 { |
563 | 161 stack_overflow ("Stack overflow in regexp matcher", Qunbound); |
428 | 162 } |
163 | |
164 /* Compile a regexp and signal a Lisp error if anything goes wrong. | |
165 PATTERN is the pattern to compile. | |
166 CP is the place to put the result. | |
826 | 167 TRANSLATE is a translation table for ignoring case, or Qnil for none. |
428 | 168 REGP is the structure that says where to store the "register" |
169 values that will result from matching this pattern. | |
170 If it is 0, we should compile the pattern not to record any | |
171 subexpression bounds. | |
172 POSIX is nonzero if we want full backtracking (POSIX style) | |
173 for this pattern. 0 means backtrack only enough to get a valid match. */ | |
174 | |
175 static int | |
176 compile_pattern_1 (struct regexp_cache *cp, Lisp_Object pattern, | |
2286 | 177 struct re_registers *UNUSED (regp), Lisp_Object translate, |
826 | 178 int posix, Error_Behavior errb) |
428 | 179 { |
442 | 180 const char *val; |
428 | 181 reg_syntax_t old; |
182 | |
183 cp->regexp = Qnil; | |
184 cp->buf.translate = translate; | |
185 cp->posix = posix; | |
186 old = re_set_syntax (RE_SYNTAX_EMACS | |
187 | (posix ? 0 : RE_NO_POSIX_BACKTRACKING)); | |
442 | 188 val = (const char *) |
428 | 189 re_compile_pattern ((char *) XSTRING_DATA (pattern), |
190 XSTRING_LENGTH (pattern), &cp->buf); | |
191 re_set_syntax (old); | |
192 if (val) | |
193 { | |
4953
304aebb79cd3
function renamings to track names of char typedefs
Ben Wing <ben@xemacs.org>
parents:
4952
diff
changeset
|
194 maybe_signal_error (Qinvalid_regexp, 0, build_cistring (val), |
428 | 195 Qsearch, errb); |
196 return 0; | |
197 } | |
198 | |
199 cp->regexp = Fcopy_sequence (pattern); | |
200 return 1; | |
201 } | |
202 | |
203 /* Compile a regexp if necessary, but first check to see if there's one in | |
204 the cache. | |
205 PATTERN is the pattern to compile. | |
826 | 206 TRANSLATE is a translation table for ignoring case, or Qnil for none. |
428 | 207 REGP is the structure that says where to store the "register" |
208 values that will result from matching this pattern. | |
209 If it is 0, we should compile the pattern not to record any | |
210 subexpression bounds. | |
211 POSIX is nonzero if we want full backtracking (POSIX style) | |
212 for this pattern. 0 means backtrack only enough to get a valid match. */ | |
213 | |
214 struct re_pattern_buffer * | |
215 compile_pattern (Lisp_Object pattern, struct re_registers *regp, | |
2286 | 216 Lisp_Object translate, Lisp_Object UNUSED (searchobj), |
217 struct buffer *UNUSED (searchbuf), int posix, | |
218 Error_Behavior errb) | |
428 | 219 { |
220 struct regexp_cache *cp, **cpp; | |
221 | |
222 for (cpp = &searchbuf_head; ; cpp = &cp->next) | |
223 { | |
224 cp = *cpp; | |
826 | 225 /* &&#### once we fix up the fastmap code in regex.c for 8-bit-fixed, |
226 we need to record and compare the buffer and format, since the | |
227 fastmap will reflect the state of the buffer -- and things get | |
228 more complicated if the buffer has changed formats or (esp.) has | |
229 kept the format but changed its interpretation! may need to have | |
230 the code that changes the interpretation go through and invalidate | |
231 cache entries for that buffer. */ | |
428 | 232 if (!NILP (Fstring_equal (cp->regexp, pattern)) |
446 | 233 && EQ (cp->buf.translate, translate) |
428 | 234 && cp->posix == posix) |
235 break; | |
236 | |
237 /* If we're at the end of the cache, compile into the last cell. */ | |
238 if (cp->next == 0) | |
239 { | |
826 | 240 if (!compile_pattern_1 (cp, pattern, regp, translate, |
241 posix, errb)) | |
428 | 242 return 0; |
243 break; | |
244 } | |
245 } | |
246 | |
247 /* When we get here, cp (aka *cpp) contains the compiled pattern, | |
248 either because we found it in the cache or because we just compiled it. | |
249 Move it to the front of the queue to mark it as most recently used. */ | |
250 *cpp = cp->next; | |
251 cp->next = searchbuf_head; | |
252 searchbuf_head = cp; | |
253 | |
254 /* Advise the searching functions about the space we have allocated | |
255 for register data. */ | |
256 if (regp) | |
257 re_set_registers (&cp->buf, regp, regp->num_regs, regp->start, regp->end); | |
258 | |
259 return &cp->buf; | |
260 } | |
261 | |
262 /* Error condition used for failing searches */ | |
263 Lisp_Object Qsearch_failed; | |
264 | |
2268 | 265 static DECLARE_DOESNT_RETURN (signal_failure (Lisp_Object)); |
266 | |
267 static DOESNT_RETURN | |
428 | 268 signal_failure (Lisp_Object arg) |
269 { | |
446 | 270 for (;;) |
271 Fsignal (Qsearch_failed, list1 (arg)); | |
428 | 272 } |
273 | |
826 | 274 /* Convert the search registers from Bytebpos's to Charbpos's. Needs to be |
428 | 275 done after each regexp match that uses the search regs. |
276 | |
277 We could get a potential speedup by not converting the search registers | |
278 until it's really necessary, e.g. when match-data or replace-match is | |
279 called. However, this complexifies the code a lot (e.g. the buffer | |
826 | 280 could have changed and the Bytebpos's stored might be invalid) and is |
428 | 281 probably not a great time-saver. */ |
282 | |
283 static void | |
284 fixup_search_regs_for_buffer (struct buffer *buf) | |
285 { | |
286 int i; | |
287 int num_regs = search_regs.num_regs; | |
288 | |
289 for (i = 0; i < num_regs; i++) | |
290 { | |
291 if (search_regs.start[i] >= 0) | |
826 | 292 search_regs.start[i] = bytebpos_to_charbpos (buf, |
293 search_regs.start[i]); | |
428 | 294 if (search_regs.end[i] >= 0) |
665 | 295 search_regs.end[i] = bytebpos_to_charbpos (buf, search_regs.end[i]); |
428 | 296 } |
297 } | |
298 | |
299 /* Similar but for strings. */ | |
300 static void | |
301 fixup_search_regs_for_string (Lisp_Object string) | |
302 { | |
303 int i; | |
304 int num_regs = search_regs.num_regs; | |
305 | |
306 /* #### bytecount_to_charcount() is not that efficient. This function | |
867 | 307 could be faster if it did its own conversion (using INC_IBYTEPTR() |
428 | 308 and such), because the register ends are likely to be somewhat ordered. |
309 (Even if not, you could sort them.) | |
310 | |
311 Think about this if this function is a time hog, which it's probably | |
312 not. */ | |
313 for (i = 0; i < num_regs; i++) | |
314 { | |
315 if (search_regs.start[i] > 0) | |
316 { | |
317 search_regs.start[i] = | |
793 | 318 string_index_byte_to_char (string, search_regs.start[i]); |
428 | 319 } |
320 if (search_regs.end[i] > 0) | |
321 { | |
322 search_regs.end[i] = | |
793 | 323 string_index_byte_to_char (string, search_regs.end[i]); |
428 | 324 } |
325 } | |
326 } | |
327 | |
328 | |
329 static Lisp_Object | |
330 looking_at_1 (Lisp_Object string, struct buffer *buf, int posix) | |
331 { | |
332 Lisp_Object val; | |
665 | 333 Bytebpos p1, p2; |
428 | 334 Bytecount s1, s2; |
335 REGISTER int i; | |
336 struct re_pattern_buffer *bufp; | |
826 | 337 struct syntax_cache scache_struct; |
338 struct syntax_cache *scache = &scache_struct; | |
339 | |
428 | 340 CHECK_STRING (string); |
341 bufp = compile_pattern (string, &search_regs, | |
342 (!NILP (buf->case_fold_search) | |
446 | 343 ? XCASE_TABLE_DOWNCASE (buf->case_table) : Qnil), |
826 | 344 wrap_buffer (buf), buf, posix, ERROR_ME); |
428 | 345 |
346 QUIT; | |
347 | |
348 /* Get pointers and sizes of the two strings | |
349 that make up the visible portion of the buffer. */ | |
350 | |
826 | 351 p1 = BYTE_BUF_BEGV (buf); |
352 p2 = BYTE_BUF_CEILING_OF (buf, p1); | |
428 | 353 s1 = p2 - p1; |
826 | 354 s2 = BYTE_BUF_ZV (buf) - p2; |
355 | |
356 /* By making the regex object, regex buffer, and syntax cache arguments | |
357 to re_{search,match}{,_2}, we've removed the need to do nasty things | |
358 to deal with regex reentrancy. (See stack trace in signal.c for proof | |
359 that this can happen.) | |
360 | |
361 #### there is still a potential problem with the regex cache -- | |
362 the compiled regex could be overwritten. we'd need 20-fold | |
363 reentrancy, though. Fix this. */ | |
364 | |
365 i = re_match_2 (bufp, (char *) BYTE_BUF_BYTE_ADDRESS (buf, p1), | |
366 s1, (char *) BYTE_BUF_BYTE_ADDRESS (buf, p2), s2, | |
367 BYTE_BUF_PT (buf) - BYTE_BUF_BEGV (buf), &search_regs, | |
368 BYTE_BUF_ZV (buf) - BYTE_BUF_BEGV (buf), wrap_buffer (buf), | |
369 buf, scache); | |
428 | 370 |
371 if (i == -2) | |
372 matcher_overflow (); | |
373 | |
374 val = (0 <= i ? Qt : Qnil); | |
375 if (NILP (val)) | |
826 | 376 return Qnil; |
428 | 377 { |
378 int num_regs = search_regs.num_regs; | |
379 for (i = 0; i < num_regs; i++) | |
380 if (search_regs.start[i] >= 0) | |
381 { | |
826 | 382 search_regs.start[i] += BYTE_BUF_BEGV (buf); |
383 search_regs.end[i] += BYTE_BUF_BEGV (buf); | |
428 | 384 } |
385 } | |
793 | 386 last_thing_searched = wrap_buffer (buf); |
428 | 387 fixup_search_regs_for_buffer (buf); |
826 | 388 return val; |
428 | 389 } |
390 | |
391 DEFUN ("looking-at", Flooking_at, 1, 2, 0, /* | |
392 Return t if text after point matches regular expression REGEXP. | |
1468 | 393 When the match is successful, this function modifies the match data |
394 that `match-beginning', `match-end' and `match-data' access; save the | |
395 match data with `match-data' and restore it with `store-match-data' if | |
396 you want to preserve them. If the match fails, the match data from the | |
397 previous success match is preserved. | |
428 | 398 |
399 Optional argument BUFFER defaults to the current buffer. | |
400 */ | |
401 (regexp, buffer)) | |
402 { | |
403 return looking_at_1 (regexp, decode_buffer (buffer, 0), 0); | |
404 } | |
405 | |
406 DEFUN ("posix-looking-at", Fposix_looking_at, 1, 2, 0, /* | |
407 Return t if text after point matches regular expression REGEXP. | |
408 Find the longest match, in accord with Posix regular expression rules. | |
1468 | 409 When the match is successful, this function modifies the match data |
410 that `match-beginning', `match-end' and `match-data' access; save the | |
411 match data with `match-data' and restore it with `store-match-data' if | |
412 you want to preserve them. If the match fails, the match data from the | |
413 previous success match is preserved. | |
428 | 414 |
415 Optional argument BUFFER defaults to the current buffer. | |
416 */ | |
417 (regexp, buffer)) | |
418 { | |
826 | 419 return looking_at_1 (regexp, decode_buffer (buffer, 0), 1); |
428 | 420 } |
421 | |
422 static Lisp_Object | |
423 string_match_1 (Lisp_Object regexp, Lisp_Object string, Lisp_Object start, | |
2286 | 424 struct buffer *buf, int UNUSED (posix)) |
428 | 425 { |
426 Bytecount val; | |
427 Charcount s; | |
428 struct re_pattern_buffer *bufp; | |
429 | |
853 | 430 /* Some FSF junk with running_asynch_code, to preserve the match |
431 data. Not necessary because we don't call process filters | |
432 asynchronously (i.e. from within QUIT). */ | |
428 | 433 |
434 CHECK_STRING (regexp); | |
435 CHECK_STRING (string); | |
436 | |
437 if (NILP (start)) | |
438 s = 0; | |
439 else | |
440 { | |
826 | 441 Charcount len = string_char_length (string); |
428 | 442 |
443 CHECK_INT (start); | |
444 s = XINT (start); | |
445 if (s < 0 && -s <= len) | |
446 s = len + s; | |
447 else if (0 > s || s > len) | |
448 args_out_of_range (string, start); | |
449 } | |
450 | |
451 | |
452 bufp = compile_pattern (regexp, &search_regs, | |
453 (!NILP (buf->case_fold_search) | |
446 | 454 ? XCASE_TABLE_DOWNCASE (buf->case_table) : Qnil), |
826 | 455 string, buf, 0, ERROR_ME); |
428 | 456 QUIT; |
457 { | |
793 | 458 Bytecount bis = string_index_char_to_byte (string, s); |
826 | 459 struct syntax_cache scache_struct; |
460 struct syntax_cache *scache = &scache_struct; | |
461 | |
462 /* By making the regex object, regex buffer, and syntax cache arguments | |
463 to re_{search,match}{,_2}, we've removed the need to do nasty things | |
464 to deal with regex reentrancy. (See stack trace in signal.c for proof | |
465 that this can happen.) | |
466 | |
467 #### there is still a potential problem with the regex cache -- | |
468 the compiled regex could be overwritten. we'd need 20-fold | |
469 reentrancy, though. Fix this. */ | |
470 | |
428 | 471 val = re_search (bufp, (char *) XSTRING_DATA (string), |
472 XSTRING_LENGTH (string), bis, | |
473 XSTRING_LENGTH (string) - bis, | |
826 | 474 &search_regs, string, buf, scache); |
428 | 475 } |
476 if (val == -2) | |
477 matcher_overflow (); | |
826 | 478 if (val < 0) return Qnil; |
428 | 479 last_thing_searched = Qt; |
480 fixup_search_regs_for_string (string); | |
826 | 481 return make_int (string_index_byte_to_char (string, val)); |
428 | 482 } |
483 | |
484 DEFUN ("string-match", Fstring_match, 2, 4, 0, /* | |
485 Return index of start of first match for REGEXP in STRING, or nil. | |
486 If third arg START is non-nil, start search at that index in STRING. | |
487 For index of first char beyond the match, do (match-end 0). | |
488 `match-end' and `match-beginning' also give indices of substrings | |
489 matched by parenthesis constructs in the pattern. | |
490 | |
826 | 491 Optional arg BUFFER controls how case folding and syntax and category |
492 lookup is done (according to the value of `case-fold-search' in that buffer | |
493 and that buffer's case tables, syntax tables, and category table). If nil | |
494 or unspecified, it defaults *NOT* to the current buffer but instead: | |
495 | |
496 -- the value of `case-fold-search' in the current buffer is still respected | |
497 because of idioms like | |
498 | |
499 (let ((case-fold-search nil)) | |
500 (string-match "^foo.*bar" string)) | |
501 | |
502 but the case, syntax, and category tables come from the standard tables, | |
1468 | 503 which are accessed through functions `default-{case,syntax,category}-table' |
504 and serve as the parents of the tables in particular buffer. | |
505 | |
506 When the match is successful, this function modifies the match data | |
507 that `match-beginning', `match-end' and `match-data' access; save the | |
508 match data with `match-data' and restore it with `store-match-data' if | |
509 you want to preserve them. If the match fails, the match data from the | |
510 previous success match is preserved. | |
428 | 511 */ |
512 (regexp, string, start, buffer)) | |
513 { | |
826 | 514 /* &&#### implement new interp for buffer arg; check code to see if it |
515 makes more sense than prev */ | |
428 | 516 return string_match_1 (regexp, string, start, decode_buffer (buffer, 0), 0); |
517 } | |
518 | |
519 DEFUN ("posix-string-match", Fposix_string_match, 2, 4, 0, /* | |
520 Return index of start of first match for REGEXP in STRING, or nil. | |
521 Find the longest match, in accord with Posix regular expression rules. | |
522 If third arg START is non-nil, start search at that index in STRING. | |
523 For index of first char beyond the match, do (match-end 0). | |
524 `match-end' and `match-beginning' also give indices of substrings | |
525 matched by parenthesis constructs in the pattern. | |
526 | |
527 Optional arg BUFFER controls how case folding is done (according to | |
528 the value of `case-fold-search' in that buffer and that buffer's case | |
529 tables) and defaults to the current buffer. | |
1468 | 530 |
531 When the match is successful, this function modifies the match data | |
532 that `match-beginning', `match-end' and `match-data' access; save the | |
533 match data with `match-data' and restore it with `store-match-data' if | |
534 you want to preserve them. If the match fails, the match data from the | |
535 previous success match is preserved. | |
428 | 536 */ |
537 (regexp, string, start, buffer)) | |
538 { | |
539 return string_match_1 (regexp, string, start, decode_buffer (buffer, 0), 1); | |
540 } | |
541 | |
542 /* Match REGEXP against STRING, searching all of STRING, | |
543 and return the index of the match, or negative on failure. | |
544 This does not clobber the match data. */ | |
545 | |
546 Bytecount | |
1347 | 547 fast_string_match (Lisp_Object regexp, const Ibyte *nonreloc, |
428 | 548 Lisp_Object reloc, Bytecount offset, |
549 Bytecount length, int case_fold_search, | |
578 | 550 Error_Behavior errb, int no_quit) |
428 | 551 { |
552 Bytecount val; | |
867 | 553 Ibyte *newnonreloc = (Ibyte *) nonreloc; |
428 | 554 struct re_pattern_buffer *bufp; |
826 | 555 struct syntax_cache scache_struct; |
556 struct syntax_cache *scache = &scache_struct; | |
428 | 557 |
558 bufp = compile_pattern (regexp, 0, | |
559 (case_fold_search | |
771 | 560 ? XCASE_TABLE_DOWNCASE (Vstandard_case_table) |
446 | 561 : Qnil), |
826 | 562 reloc, 0, 0, errb); |
428 | 563 if (!bufp) |
564 return -1; /* will only do this when errb != ERROR_ME */ | |
565 if (!no_quit) | |
566 QUIT; | |
567 else | |
568 no_quit_in_re_search = 1; | |
569 | |
570 fixup_internal_substring (nonreloc, reloc, offset, &length); | |
571 | |
771 | 572 /* Don't need to protect against GC inside of re_search() due to QUIT; |
573 QUIT is GC-inhibited. */ | |
428 | 574 if (!NILP (reloc)) |
771 | 575 newnonreloc = XSTRING_DATA (reloc); |
576 | |
826 | 577 /* By making the regex object, regex buffer, and syntax cache arguments |
578 to re_{search,match}{,_2}, we've removed the need to do nasty things | |
579 to deal with regex reentrancy. (See stack trace in signal.c for proof | |
580 that this can happen.) | |
581 | |
582 #### there is still a potential problem with the regex cache -- | |
583 the compiled regex could be overwritten. we'd need 20-fold | |
584 reentrancy, though. Fix this. */ | |
585 | |
428 | 586 val = re_search (bufp, (char *) newnonreloc + offset, length, 0, |
826 | 587 length, 0, reloc, 0, scache); |
428 | 588 |
589 no_quit_in_re_search = 0; | |
590 return val; | |
591 } | |
592 | |
593 Bytecount | |
594 fast_lisp_string_match (Lisp_Object regex, Lisp_Object string) | |
595 { | |
596 return fast_string_match (regex, 0, string, 0, -1, 0, ERROR_ME, 0); | |
597 } | |
598 | |
599 | |
600 #ifdef REGION_CACHE_NEEDS_WORK | |
601 /* The newline cache: remembering which sections of text have no newlines. */ | |
602 | |
603 /* If the user has requested newline caching, make sure it's on. | |
604 Otherwise, make sure it's off. | |
605 This is our cheezy way of associating an action with the change of | |
606 state of a buffer-local variable. */ | |
607 static void | |
608 newline_cache_on_off (struct buffer *buf) | |
609 { | |
610 if (NILP (buf->cache_long_line_scans)) | |
611 { | |
612 /* It should be off. */ | |
613 if (buf->newline_cache) | |
614 { | |
615 free_region_cache (buf->newline_cache); | |
616 buf->newline_cache = 0; | |
617 } | |
618 } | |
619 else | |
620 { | |
621 /* It should be on. */ | |
622 if (buf->newline_cache == 0) | |
623 buf->newline_cache = new_region_cache (); | |
624 } | |
625 } | |
626 #endif | |
627 | |
628 /* Search in BUF for COUNT instances of the character TARGET between | |
629 START and END. | |
630 | |
631 If COUNT is positive, search forwards; END must be >= START. | |
632 If COUNT is negative, search backwards for the -COUNTth instance; | |
633 END must be <= START. | |
634 If COUNT is zero, do anything you please; run rogue, for all I care. | |
635 | |
636 If END is zero, use BEGV or ZV instead, as appropriate for the | |
637 direction indicated by COUNT. | |
638 | |
639 If we find COUNT instances, set *SHORTAGE to zero, and return the | |
640 position after the COUNTth match. Note that for reverse motion | |
641 this is not the same as the usual convention for Emacs motion commands. | |
642 | |
643 If we don't find COUNT instances before reaching END, set *SHORTAGE | |
644 to the number of TARGETs left unfound, and return END. | |
645 | |
646 If ALLOW_QUIT is non-zero, call QUIT periodically. */ | |
647 | |
665 | 648 static Bytebpos |
867 | 649 byte_scan_buffer (struct buffer *buf, Ichar target, Bytebpos st, Bytebpos en, |
872 | 650 EMACS_INT count, EMACS_INT *shortage, int allow_quit) |
428 | 651 { |
665 | 652 Bytebpos lim = en > 0 ? en : |
826 | 653 ((count > 0) ? BYTE_BUF_ZV (buf) : BYTE_BUF_BEGV (buf)); |
428 | 654 |
655 /* #### newline cache stuff in this function not yet ported */ | |
656 assert (count != 0); | |
657 | |
658 if (shortage) | |
659 *shortage = 0; | |
660 | |
661 if (count > 0) | |
662 { | |
663 #ifdef MULE | |
826 | 664 Internal_Format fmt = buf->text->format; |
665 /* Check for char that's unrepresentable in the buffer -- it | |
666 certainly can't be there. */ | |
867 | 667 if (!ichar_fits_in_format (target, fmt, wrap_buffer (buf))) |
428 | 668 { |
826 | 669 *shortage = count; |
670 return lim; | |
671 } | |
672 /* Due to the Mule representation of characters in a buffer, we can | |
673 simply search for characters in the range 0 - 127 directly; for | |
674 8-bit-fixed, we can do this for all characters. In other cases, | |
675 we do it the "hard" way. Note that this way works for all | |
676 characters and all formats, but the other way is faster. */ | |
677 else if (! (fmt == FORMAT_8_BIT_FIXED || | |
867 | 678 (fmt == FORMAT_DEFAULT && ichar_ascii_p (target)))) |
826 | 679 { |
867 | 680 Raw_Ichar raw = ichar_to_raw (target, fmt, wrap_buffer (buf)); |
428 | 681 while (st < lim && count > 0) |
682 { | |
826 | 683 if (BYTE_BUF_FETCH_CHAR_RAW (buf, st) == raw) |
428 | 684 count--; |
665 | 685 INC_BYTEBPOS (buf, st); |
428 | 686 } |
687 } | |
688 else | |
689 #endif | |
690 { | |
867 | 691 Raw_Ichar raw = ichar_to_raw (target, fmt, wrap_buffer (buf)); |
428 | 692 while (st < lim && count > 0) |
693 { | |
665 | 694 Bytebpos ceil; |
867 | 695 Ibyte *bufptr; |
428 | 696 |
826 | 697 ceil = BYTE_BUF_CEILING_OF (buf, st); |
428 | 698 ceil = min (lim, ceil); |
867 | 699 bufptr = (Ibyte *) memchr (BYTE_BUF_BYTE_ADDRESS (buf, st), |
826 | 700 raw, ceil - st); |
428 | 701 if (bufptr) |
702 { | |
703 count--; | |
826 | 704 st = BYTE_BUF_PTR_BYTE_POS (buf, bufptr) + 1; |
428 | 705 } |
706 else | |
707 st = ceil; | |
708 } | |
709 } | |
710 | |
711 if (shortage) | |
712 *shortage = count; | |
713 if (allow_quit) | |
714 QUIT; | |
715 return st; | |
716 } | |
717 else | |
718 { | |
719 #ifdef MULE | |
826 | 720 Internal_Format fmt = buf->text->format; |
721 /* Check for char that's unrepresentable in the buffer -- it | |
722 certainly can't be there. */ | |
867 | 723 if (!ichar_fits_in_format (target, fmt, wrap_buffer (buf))) |
428 | 724 { |
826 | 725 *shortage = -count; |
726 return lim; | |
727 } | |
728 else if (! (fmt == FORMAT_8_BIT_FIXED || | |
867 | 729 (fmt == FORMAT_DEFAULT && ichar_ascii_p (target)))) |
826 | 730 { |
867 | 731 Raw_Ichar raw = ichar_to_raw (target, fmt, wrap_buffer (buf)); |
428 | 732 while (st > lim && count < 0) |
733 { | |
665 | 734 DEC_BYTEBPOS (buf, st); |
826 | 735 if (BYTE_BUF_FETCH_CHAR_RAW (buf, st) == raw) |
428 | 736 count++; |
737 } | |
738 } | |
739 else | |
740 #endif | |
741 { | |
867 | 742 Raw_Ichar raw = ichar_to_raw (target, fmt, wrap_buffer (buf)); |
428 | 743 while (st > lim && count < 0) |
744 { | |
665 | 745 Bytebpos floor; |
867 | 746 Ibyte *bufptr; |
747 Ibyte *floorptr; | |
428 | 748 |
826 | 749 floor = BYTE_BUF_FLOOR_OF (buf, st); |
428 | 750 floor = max (lim, floor); |
751 /* No memrchr() ... */ | |
826 | 752 bufptr = BYTE_BUF_BYTE_ADDRESS_BEFORE (buf, st); |
753 floorptr = BYTE_BUF_BYTE_ADDRESS (buf, floor); | |
428 | 754 while (bufptr >= floorptr) |
755 { | |
756 st--; | |
757 /* At this point, both ST and BUFPTR refer to the same | |
758 character. When the loop terminates, ST will | |
759 always point to the last character we tried. */ | |
867 | 760 if (*bufptr == (Ibyte) raw) |
428 | 761 { |
762 count++; | |
763 break; | |
764 } | |
765 bufptr--; | |
766 } | |
767 } | |
768 } | |
769 | |
770 if (shortage) | |
771 *shortage = -count; | |
772 if (allow_quit) | |
773 QUIT; | |
774 if (count) | |
775 return st; | |
776 else | |
777 { | |
778 /* We found the character we were looking for; we have to return | |
779 the position *after* it due to the strange way that the return | |
780 value is defined. */ | |
665 | 781 INC_BYTEBPOS (buf, st); |
428 | 782 return st; |
783 } | |
784 } | |
785 } | |
786 | |
665 | 787 Charbpos |
867 | 788 scan_buffer (struct buffer *buf, Ichar target, Charbpos start, Charbpos end, |
428 | 789 EMACS_INT count, EMACS_INT *shortage, int allow_quit) |
790 { | |
826 | 791 Bytebpos byte_retval; |
792 Bytebpos byte_start, byte_end; | |
793 | |
794 byte_start = charbpos_to_bytebpos (buf, start); | |
428 | 795 if (end) |
826 | 796 byte_end = charbpos_to_bytebpos (buf, end); |
428 | 797 else |
826 | 798 byte_end = 0; |
799 byte_retval = byte_scan_buffer (buf, target, byte_start, byte_end, count, | |
428 | 800 shortage, allow_quit); |
826 | 801 return bytebpos_to_charbpos (buf, byte_retval); |
428 | 802 } |
803 | |
665 | 804 Bytebpos |
826 | 805 byte_find_next_newline_no_quit (struct buffer *buf, Bytebpos from, int count) |
428 | 806 { |
826 | 807 return byte_scan_buffer (buf, '\n', from, 0, count, 0, 0); |
428 | 808 } |
809 | |
665 | 810 Charbpos |
811 find_next_newline_no_quit (struct buffer *buf, Charbpos from, int count) | |
428 | 812 { |
813 return scan_buffer (buf, '\n', from, 0, count, 0, 0); | |
814 } | |
815 | |
665 | 816 Charbpos |
817 find_next_newline (struct buffer *buf, Charbpos from, int count) | |
428 | 818 { |
819 return scan_buffer (buf, '\n', from, 0, count, 0, 1); | |
820 } | |
821 | |
826 | 822 Bytecount |
867 | 823 byte_find_next_ichar_in_string (Lisp_Object str, Ichar target, Bytecount st, |
428 | 824 EMACS_INT count) |
825 { | |
793 | 826 Bytebpos lim = XSTRING_LENGTH (str) -1; |
867 | 827 Ibyte *s = XSTRING_DATA (str); |
428 | 828 |
829 assert (count >= 0); | |
830 | |
831 #ifdef MULE | |
832 /* Due to the Mule representation of characters in a buffer, | |
833 we can simply search for characters in the range 0 - 127 | |
834 directly. For other characters, we do it the "hard" way. | |
835 Note that this way works for all characters but the other | |
836 way is faster. */ | |
837 if (target >= 0200) | |
838 { | |
839 while (st < lim && count > 0) | |
840 { | |
867 | 841 if (string_ichar (str, st) == target) |
428 | 842 count--; |
826 | 843 INC_BYTECOUNT (s, st); |
428 | 844 } |
845 } | |
846 else | |
847 #endif | |
848 { | |
849 while (st < lim && count > 0) | |
850 { | |
867 | 851 Ibyte *bufptr = (Ibyte *) memchr (itext_n_addr (s, st), |
428 | 852 (int) target, lim - st); |
853 if (bufptr) | |
854 { | |
855 count--; | |
826 | 856 st = (Bytebpos) (bufptr - s) + 1; |
428 | 857 } |
858 else | |
859 st = lim; | |
860 } | |
861 } | |
862 return st; | |
863 } | |
864 | |
865 /* Like find_next_newline, but returns position before the newline, | |
866 not after, and only search up to TO. This isn't just | |
867 find_next_newline (...)-1, because you might hit TO. */ | |
665 | 868 Charbpos |
826 | 869 find_before_next_newline (struct buffer *buf, Charbpos from, Charbpos to, |
870 int count) | |
428 | 871 { |
872 EMACS_INT shortage; | |
665 | 873 Charbpos pos = scan_buffer (buf, '\n', from, to, count, &shortage, 1); |
428 | 874 |
875 if (shortage == 0) | |
876 pos--; | |
877 | |
878 return pos; | |
879 } | |
880 | |
872 | 881 /* This function synched with FSF 21.1 */ |
428 | 882 static Lisp_Object |
883 skip_chars (struct buffer *buf, int forwardp, int syntaxp, | |
884 Lisp_Object string, Lisp_Object lim) | |
885 { | |
867 | 886 REGISTER Ibyte *p, *pend; |
887 REGISTER Ichar c; | |
428 | 888 /* We store the first 256 chars in an array here and the rest in |
889 a range table. */ | |
890 unsigned char fastmap[0400]; | |
891 int negate = 0; | |
892 REGISTER int i; | |
665 | 893 Charbpos limit; |
826 | 894 struct syntax_cache *scache; |
895 | |
428 | 896 if (NILP (lim)) |
897 limit = forwardp ? BUF_ZV (buf) : BUF_BEGV (buf); | |
898 else | |
899 { | |
900 CHECK_INT_COERCE_MARKER (lim); | |
901 limit = XINT (lim); | |
902 | |
903 /* In any case, don't allow scan outside bounds of buffer. */ | |
904 if (limit > BUF_ZV (buf)) limit = BUF_ZV (buf); | |
905 if (limit < BUF_BEGV (buf)) limit = BUF_BEGV (buf); | |
906 } | |
907 | |
908 CHECK_STRING (string); | |
909 p = XSTRING_DATA (string); | |
910 pend = p + XSTRING_LENGTH (string); | |
911 memset (fastmap, 0, sizeof (fastmap)); | |
912 | |
913 Fclear_range_table (Vskip_chars_range_table); | |
914 | |
915 if (p != pend && *p == '^') | |
916 { | |
917 negate = 1; | |
918 p++; | |
919 } | |
920 | |
921 /* Find the characters specified and set their elements of fastmap. | |
922 If syntaxp, each character counts as itself. | |
923 Otherwise, handle backslashes and ranges specially */ | |
924 | |
925 while (p != pend) | |
926 { | |
867 | 927 c = itext_ichar (p); |
928 INC_IBYTEPTR (p); | |
428 | 929 if (syntaxp) |
930 { | |
931 if (c < 0400 && syntax_spec_code[c] < (unsigned char) Smax) | |
932 fastmap[c] = 1; | |
933 else | |
831 | 934 invalid_argument ("Invalid syntax designator", make_char (c)); |
428 | 935 } |
936 else | |
937 { | |
938 if (c == '\\') | |
939 { | |
940 if (p == pend) break; | |
867 | 941 c = itext_ichar (p); |
942 INC_IBYTEPTR (p); | |
428 | 943 } |
944 if (p != pend && *p == '-') | |
945 { | |
867 | 946 Ichar cend; |
428 | 947 |
872 | 948 /* Skip over the dash. */ |
428 | 949 p++; |
950 if (p == pend) break; | |
867 | 951 cend = itext_ichar (p); |
428 | 952 while (c <= cend && c < 0400) |
953 { | |
954 fastmap[c] = 1; | |
955 c++; | |
956 } | |
957 if (c <= cend) | |
958 Fput_range_table (make_int (c), make_int (cend), Qt, | |
959 Vskip_chars_range_table); | |
867 | 960 INC_IBYTEPTR (p); |
428 | 961 } |
962 else | |
963 { | |
964 if (c < 0400) | |
965 fastmap[c] = 1; | |
966 else | |
967 Fput_range_table (make_int (c), make_int (c), Qt, | |
968 Vskip_chars_range_table); | |
969 } | |
970 } | |
971 } | |
972 | |
872 | 973 /* #### Not in FSF 21.1 */ |
428 | 974 if (syntaxp && fastmap['-'] != 0) |
975 fastmap[' '] = 1; | |
976 | |
977 /* If ^ was the first character, complement the fastmap. | |
978 We don't complement the range table, however; we just use negate | |
979 in the comparisons below. */ | |
980 | |
981 if (negate) | |
647 | 982 for (i = 0; i < (int) (sizeof (fastmap)); i++) |
428 | 983 fastmap[i] ^= 1; |
984 | |
985 { | |
665 | 986 Charbpos start_point = BUF_PT (buf); |
872 | 987 Charbpos pos = start_point; |
988 Charbpos pos_byte = BYTE_BUF_PT (buf); | |
428 | 989 |
990 if (syntaxp) | |
991 { | |
872 | 992 scache = setup_buffer_syntax_cache (buf, pos, forwardp ? 1 : -1); |
428 | 993 /* All syntax designators are normal chars so nothing strange |
994 to worry about */ | |
995 if (forwardp) | |
996 { | |
872 | 997 if (pos < limit) |
998 while (fastmap[(unsigned char) | |
999 syntax_code_spec | |
1000 [(int) SYNTAX_FROM_CACHE | |
1001 (scache, BYTE_BUF_FETCH_CHAR (buf, pos_byte))]]) | |
1002 { | |
1003 pos++; | |
1004 INC_BYTEBPOS (buf, pos_byte); | |
879 | 1005 if (pos >= limit) |
872 | 1006 break; |
1007 UPDATE_SYNTAX_CACHE_FORWARD (scache, pos); | |
1008 } | |
428 | 1009 } |
1010 else | |
1011 { | |
872 | 1012 while (pos > limit) |
460 | 1013 { |
872 | 1014 Charbpos savepos = pos_byte; |
1015 pos--; | |
1016 DEC_BYTEBPOS (buf, pos_byte); | |
1017 UPDATE_SYNTAX_CACHE_BACKWARD (scache, pos); | |
1018 if (!fastmap[(unsigned char) | |
1019 syntax_code_spec | |
1020 [(int) SYNTAX_FROM_CACHE | |
1021 (scache, BYTE_BUF_FETCH_CHAR (buf, pos_byte))]]) | |
1022 { | |
1023 pos++; | |
1024 pos_byte = savepos; | |
1025 break; | |
1026 } | |
460 | 1027 } |
428 | 1028 } |
1029 } | |
1030 else | |
1031 { | |
1032 if (forwardp) | |
1033 { | |
872 | 1034 while (pos < limit) |
428 | 1035 { |
872 | 1036 Ichar ch = BYTE_BUF_FETCH_CHAR (buf, pos_byte); |
428 | 1037 if ((ch < 0400) ? fastmap[ch] : |
1038 (NILP (Fget_range_table (make_int (ch), | |
1039 Vskip_chars_range_table, | |
1040 Qnil)) | |
1041 == negate)) | |
872 | 1042 { |
1043 pos++; | |
1044 INC_BYTEBPOS (buf, pos_byte); | |
1045 } | |
428 | 1046 else |
1047 break; | |
1048 } | |
1049 } | |
1050 else | |
1051 { | |
872 | 1052 while (pos > limit) |
428 | 1053 { |
872 | 1054 Charbpos prev_pos_byte = pos_byte; |
1055 Ichar ch; | |
1056 | |
1057 DEC_BYTEBPOS (buf, prev_pos_byte); | |
1058 ch = BYTE_BUF_FETCH_CHAR (buf, prev_pos_byte); | |
428 | 1059 if ((ch < 0400) ? fastmap[ch] : |
1060 (NILP (Fget_range_table (make_int (ch), | |
1061 Vskip_chars_range_table, | |
1062 Qnil)) | |
1063 == negate)) | |
872 | 1064 { |
1065 pos--; | |
1066 pos_byte = prev_pos_byte; | |
1067 } | |
428 | 1068 else |
1069 break; | |
1070 } | |
1071 } | |
1072 } | |
1073 QUIT; | |
872 | 1074 BOTH_BUF_SET_PT (buf, pos, pos_byte); |
428 | 1075 return make_int (BUF_PT (buf) - start_point); |
1076 } | |
1077 } | |
1078 | |
1079 DEFUN ("skip-chars-forward", Fskip_chars_forward, 1, 3, 0, /* | |
444 | 1080 Move point forward, stopping before a char not in STRING, or at pos LIMIT. |
428 | 1081 STRING is like the inside of a `[...]' in a regular expression |
1082 except that `]' is never special and `\\' quotes `^', `-' or `\\'. | |
1083 Thus, with arg "a-zA-Z", this skips letters stopping before first nonletter. | |
1084 With arg "^a-zA-Z", skips nonletters stopping before first letter. | |
1085 Returns the distance traveled, either zero or positive. | |
1086 | |
1087 Optional argument BUFFER defaults to the current buffer. | |
1088 */ | |
444 | 1089 (string, limit, buffer)) |
428 | 1090 { |
444 | 1091 return skip_chars (decode_buffer (buffer, 0), 1, 0, string, limit); |
428 | 1092 } |
1093 | |
1094 DEFUN ("skip-chars-backward", Fskip_chars_backward, 1, 3, 0, /* | |
444 | 1095 Move point backward, stopping after a char not in STRING, or at pos LIMIT. |
428 | 1096 See `skip-chars-forward' for details. |
1097 Returns the distance traveled, either zero or negative. | |
1098 | |
1099 Optional argument BUFFER defaults to the current buffer. | |
1100 */ | |
444 | 1101 (string, limit, buffer)) |
428 | 1102 { |
444 | 1103 return skip_chars (decode_buffer (buffer, 0), 0, 0, string, limit); |
428 | 1104 } |
1105 | |
1106 | |
1107 DEFUN ("skip-syntax-forward", Fskip_syntax_forward, 1, 3, 0, /* | |
1108 Move point forward across chars in specified syntax classes. | |
1109 SYNTAX is a string of syntax code characters. | |
444 | 1110 Stop before a char whose syntax is not in SYNTAX, or at position LIMIT. |
428 | 1111 If SYNTAX starts with ^, skip characters whose syntax is NOT in SYNTAX. |
1112 This function returns the distance traveled, either zero or positive. | |
1113 | |
1114 Optional argument BUFFER defaults to the current buffer. | |
1115 */ | |
444 | 1116 (syntax, limit, buffer)) |
428 | 1117 { |
444 | 1118 return skip_chars (decode_buffer (buffer, 0), 1, 1, syntax, limit); |
428 | 1119 } |
1120 | |
1121 DEFUN ("skip-syntax-backward", Fskip_syntax_backward, 1, 3, 0, /* | |
1122 Move point backward across chars in specified syntax classes. | |
1123 SYNTAX is a string of syntax code characters. | |
444 | 1124 Stop on reaching a char whose syntax is not in SYNTAX, or at position LIMIT. |
428 | 1125 If SYNTAX starts with ^, skip characters whose syntax is NOT in SYNTAX. |
1126 This function returns the distance traveled, either zero or negative. | |
1127 | |
1128 Optional argument BUFFER defaults to the current buffer. | |
1129 */ | |
444 | 1130 (syntax, limit, buffer)) |
428 | 1131 { |
444 | 1132 return skip_chars (decode_buffer (buffer, 0), 0, 1, syntax, limit); |
428 | 1133 } |
1134 | |
1135 | |
1136 /* Subroutines of Lisp buffer search functions. */ | |
1137 | |
1138 static Lisp_Object | |
444 | 1139 search_command (Lisp_Object string, Lisp_Object limit, Lisp_Object noerror, |
428 | 1140 Lisp_Object count, Lisp_Object buffer, int direction, |
1141 int RE, int posix) | |
1142 { | |
665 | 1143 REGISTER Charbpos np; |
1144 Charbpos lim; | |
428 | 1145 EMACS_INT n = direction; |
1146 struct buffer *buf; | |
1147 | |
1148 if (!NILP (count)) | |
1149 { | |
1150 CHECK_INT (count); | |
1151 n *= XINT (count); | |
1152 } | |
1153 | |
1154 buf = decode_buffer (buffer, 0); | |
1155 CHECK_STRING (string); | |
444 | 1156 if (NILP (limit)) |
428 | 1157 lim = n > 0 ? BUF_ZV (buf) : BUF_BEGV (buf); |
1158 else | |
1159 { | |
444 | 1160 CHECK_INT_COERCE_MARKER (limit); |
1161 lim = XINT (limit); | |
428 | 1162 if (n > 0 ? lim < BUF_PT (buf) : lim > BUF_PT (buf)) |
563 | 1163 invalid_argument ("Invalid search limit (wrong side of point)", |
1164 Qunbound); | |
428 | 1165 if (lim > BUF_ZV (buf)) |
1166 lim = BUF_ZV (buf); | |
1167 if (lim < BUF_BEGV (buf)) | |
1168 lim = BUF_BEGV (buf); | |
1169 } | |
1170 | |
1171 np = search_buffer (buf, string, BUF_PT (buf), lim, n, RE, | |
1172 (!NILP (buf->case_fold_search) | |
446 | 1173 ? XCASE_TABLE_CANON (buf->case_table) |
1174 : Qnil), | |
428 | 1175 (!NILP (buf->case_fold_search) |
446 | 1176 ? XCASE_TABLE_EQV (buf->case_table) |
1177 : Qnil), posix); | |
428 | 1178 |
1179 if (np <= 0) | |
1180 { | |
444 | 1181 if (NILP (noerror)) |
2268 | 1182 { |
1183 signal_failure (string); | |
1184 RETURN_NOT_REACHED (Qnil); | |
1185 } | |
444 | 1186 if (!EQ (noerror, Qt)) |
428 | 1187 { |
1188 if (lim < BUF_BEGV (buf) || lim > BUF_ZV (buf)) | |
2500 | 1189 ABORT (); |
428 | 1190 BUF_SET_PT (buf, lim); |
1191 return Qnil; | |
1192 #if 0 /* This would be clean, but maybe programs depend on | |
1193 a value of nil here. */ | |
1194 np = lim; | |
1195 #endif | |
1196 } | |
1197 else | |
1198 return Qnil; | |
1199 } | |
1200 | |
1201 if (np < BUF_BEGV (buf) || np > BUF_ZV (buf)) | |
2500 | 1202 ABORT (); |
428 | 1203 |
1204 BUF_SET_PT (buf, np); | |
1205 | |
1206 return make_int (np); | |
1207 } | |
1208 | |
1209 static int | |
1210 trivial_regexp_p (Lisp_Object regexp) | |
1211 { | |
1212 Bytecount len = XSTRING_LENGTH (regexp); | |
867 | 1213 Ibyte *s = XSTRING_DATA (regexp); |
428 | 1214 while (--len >= 0) |
1215 { | |
1216 switch (*s++) | |
1217 { | |
1724 | 1218 /* #### howcum ']' doesn't appear here, but ... */ |
428 | 1219 case '.': case '*': case '+': case '?': case '[': case '^': case '$': |
1220 return 0; | |
1221 case '\\': | |
1222 if (--len < 0) | |
1223 return 0; | |
1224 switch (*s++) | |
1225 { | |
1724 | 1226 /* ... ')' does appear here? ('<' and '>' can appear singly.) */ |
1227 /* #### are there other constructs to check? */ | |
428 | 1228 case '|': case '(': case ')': case '`': case '\'': case 'b': |
1229 case 'B': case '<': case '>': case 'w': case 'W': case 's': | |
1724 | 1230 case 'S': case '=': case '{': case '}': |
428 | 1231 #ifdef MULE |
1232 /* 97/2/25 jhod Added for category matches */ | |
1233 case 'c': case 'C': | |
1234 #endif /* MULE */ | |
1235 case '1': case '2': case '3': case '4': case '5': | |
1236 case '6': case '7': case '8': case '9': | |
1237 return 0; | |
1238 } | |
1239 } | |
1240 } | |
1241 return 1; | |
1242 } | |
1243 | |
1244 /* Search for the n'th occurrence of STRING in BUF, | |
665 | 1245 starting at position CHARBPOS and stopping at position BUFLIM, |
428 | 1246 treating STRING as a literal string if RE is false or as |
1247 a regular expression if RE is true. | |
1248 | |
1249 If N is positive, searching is forward and BUFLIM must be greater | |
665 | 1250 than CHARBPOS. |
428 | 1251 If N is negative, searching is backward and BUFLIM must be less |
665 | 1252 than CHARBPOS. |
428 | 1253 |
1254 Returns -x if only N-x occurrences found (x > 0), | |
1255 or else the position at the beginning of the Nth occurrence | |
1256 (if searching backward) or the end (if searching forward). | |
1257 | |
1258 POSIX is nonzero if we want full backtracking (POSIX style) | |
1259 for this pattern. 0 means backtrack only enough to get a valid match. */ | |
665 | 1260 static Charbpos |
1261 search_buffer (struct buffer *buf, Lisp_Object string, Charbpos charbpos, | |
1262 Charbpos buflim, EMACS_INT n, int RE, Lisp_Object trt, | |
446 | 1263 Lisp_Object inverse_trt, int posix) |
428 | 1264 { |
1265 Bytecount len = XSTRING_LENGTH (string); | |
867 | 1266 Ibyte *base_pat = XSTRING_DATA (string); |
428 | 1267 REGISTER EMACS_INT i, j; |
665 | 1268 Bytebpos p1, p2; |
428 | 1269 Bytecount s1, s2; |
665 | 1270 Bytebpos pos, lim; |
428 | 1271 |
853 | 1272 /* Some FSF junk with running_asynch_code, to preserve the match |
1273 data. Not necessary because we don't call process filters | |
1274 asynchronously (i.e. from within QUIT). */ | |
428 | 1275 |
1425 | 1276 /* Searching 0 times means noop---don't move, don't touch registers. */ |
1277 if (n == 0) | |
1278 return charbpos; | |
1279 | |
428 | 1280 /* Null string is found at starting position. */ |
1281 if (len == 0) | |
1282 { | |
665 | 1283 set_search_regs (buf, charbpos, 0); |
1284 return charbpos; | |
428 | 1285 } |
1286 | |
665 | 1287 pos = charbpos_to_bytebpos (buf, charbpos); |
1288 lim = charbpos_to_bytebpos (buf, buflim); | |
428 | 1289 if (RE && !trivial_regexp_p (string)) |
1290 { | |
1291 struct re_pattern_buffer *bufp; | |
826 | 1292 |
1293 bufp = compile_pattern (string, &search_regs, trt, | |
1294 wrap_buffer (buf), buf, posix, ERROR_ME); | |
428 | 1295 |
1296 /* Get pointers and sizes of the two strings | |
1297 that make up the visible portion of the buffer. */ | |
1298 | |
826 | 1299 p1 = BYTE_BUF_BEGV (buf); |
1300 p2 = BYTE_BUF_CEILING_OF (buf, p1); | |
428 | 1301 s1 = p2 - p1; |
826 | 1302 s2 = BYTE_BUF_ZV (buf) - p2; |
1303 | |
1304 while (n != 0) | |
428 | 1305 { |
1306 Bytecount val; | |
826 | 1307 struct syntax_cache scache_struct; |
1308 struct syntax_cache *scache = &scache_struct; | |
1309 | |
428 | 1310 QUIT; |
826 | 1311 /* By making the regex object, regex buffer, and syntax cache |
1312 arguments to re_{search,match}{,_2}, we've removed the need to | |
1313 do nasty things to deal with regex reentrancy. (See stack | |
1314 trace in signal.c for proof that this can happen.) | |
1315 | |
1316 #### there is still a potential problem with the regex cache -- | |
1317 the compiled regex could be overwritten. we'd need 20-fold | |
1318 reentrancy, though. Fix this. */ | |
1319 | |
428 | 1320 val = re_search_2 (bufp, |
826 | 1321 (char *) BYTE_BUF_BYTE_ADDRESS (buf, p1), s1, |
1322 (char *) BYTE_BUF_BYTE_ADDRESS (buf, p2), s2, | |
1323 pos - BYTE_BUF_BEGV (buf), lim - pos, &search_regs, | |
1324 n > 0 ? lim - BYTE_BUF_BEGV (buf) : | |
1325 pos - BYTE_BUF_BEGV (buf), wrap_buffer (buf), | |
1326 buf, scache); | |
428 | 1327 |
1328 if (val == -2) | |
1329 { | |
1330 matcher_overflow (); | |
1331 } | |
1332 if (val >= 0) | |
1333 { | |
1334 int num_regs = search_regs.num_regs; | |
826 | 1335 j = BYTE_BUF_BEGV (buf); |
428 | 1336 for (i = 0; i < num_regs; i++) |
1337 if (search_regs.start[i] >= 0) | |
1338 { | |
1339 search_regs.start[i] += j; | |
1340 search_regs.end[i] += j; | |
1341 } | |
793 | 1342 last_thing_searched = wrap_buffer (buf); |
428 | 1343 /* Set pos to the new position. */ |
826 | 1344 pos = n > 0 ? search_regs.end[0] : search_regs.start[0]; |
428 | 1345 fixup_search_regs_for_buffer (buf); |
665 | 1346 /* And charbpos too. */ |
826 | 1347 charbpos = n > 0 ? search_regs.end[0] : search_regs.start[0]; |
428 | 1348 } |
1349 else | |
826 | 1350 return (n > 0 ? 0 - n : n); |
1351 if (n > 0) n--; else n++; | |
428 | 1352 } |
665 | 1353 return charbpos; |
428 | 1354 } |
1355 else /* non-RE case */ | |
1356 { | |
446 | 1357 int charset_base = -1; |
1358 int boyer_moore_ok = 1; | |
2367 | 1359 Ibyte *patbuf = alloca_ibytes (len * MAX_ICHAR_LEN); |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1360 Ibyte *pat = patbuf; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1361 |
446 | 1362 #ifdef MULE |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1363 int entirely_one_byte_p = buf->text->entirely_one_byte_p; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1364 int nothing_greater_than_0xff = |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1365 buf->text->num_8_bit_fixed_chars == BUF_Z(buf) - BUF_BEG (buf); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1366 |
446 | 1367 while (len > 0) |
1368 { | |
867 | 1369 Ibyte tmp_str[MAX_ICHAR_LEN]; |
1370 Ichar c, translated, inverse; | |
446 | 1371 Bytecount orig_bytelen, new_bytelen, inv_bytelen; |
1372 | |
1373 /* If we got here and the RE flag is set, it's because | |
1374 we're dealing with a regexp known to be trivial, so the | |
1375 backslash just quotes the next character. */ | |
1376 if (RE && *base_pat == '\\') | |
1377 { | |
1378 len--; | |
1379 base_pat++; | |
1380 } | |
867 | 1381 c = itext_ichar (base_pat); |
446 | 1382 translated = TRANSLATE (trt, c); |
1383 inverse = TRANSLATE (inverse_trt, c); | |
1384 | |
867 | 1385 orig_bytelen = itext_ichar_len (base_pat); |
1386 inv_bytelen = set_itext_ichar (tmp_str, inverse); | |
1387 new_bytelen = set_itext_ichar (tmp_str, translated); | |
446 | 1388 |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1389 if (boyer_moore_ok |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1390 /* Only do the Boyer-Moore check for characters needing |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1391 translation. */ |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1392 && (translated != c || inverse != c)) |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1393 { |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1394 Ichar starting_c = c; |
4421
69b803c646cd
Fail searches immediately if searching for non-representable characters.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4414
diff
changeset
|
1395 int charset_base_code, checked = 0; |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1396 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1397 do |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1398 { |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1399 c = TRANSLATE (inverse_trt, c); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1400 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1401 /* If a character cannot occur in the buffer, ignore |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1402 it. */ |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1403 if (c > 0x7F && entirely_one_byte_p) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1404 continue; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1405 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1406 if (c > 0xFF && nothing_greater_than_0xff) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1407 continue; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1408 |
4421
69b803c646cd
Fail searches immediately if searching for non-representable characters.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4414
diff
changeset
|
1409 checked = 1; |
69b803c646cd
Fail searches immediately if searching for non-representable characters.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4414
diff
changeset
|
1410 |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1411 if (-1 == charset_base) /* No charset yet specified. */ |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1412 { |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1413 /* Keep track of which charset and character set row |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1414 contains the characters that need translation. |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1415 |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1416 Zero out the bits corresponding to the last |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1417 byte. */ |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1418 charset_base = c & ~ICHAR_FIELD3_MASK; |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1419 } |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1420 else |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1421 { |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1422 charset_base_code = c & ~ICHAR_FIELD3_MASK; |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1423 |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1424 if (charset_base_code != charset_base) |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1425 { |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1426 /* If two different rows, or two different |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1427 charsets, appear, needing non-ASCII |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1428 translation, then we cannot use boyer_moore |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1429 search. See the comment at the head of |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1430 boyer_moore(). */ |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1431 boyer_moore_ok = 0; |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1432 break; |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1433 } |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1434 } |
4901
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1435 |
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1436 if (ichar_len (c) > 2) |
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1437 { |
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1438 /* Case-equivalence plus repeated octets throws off |
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1439 the construction of the stride table; avoid this. |
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1440 |
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1441 It should be possible to correct boyer_moore to |
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1442 behave correctly even in this case--it doesn't have |
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1443 problems with repeated octets when case conversion |
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1444 is not involved--but this is not a critical |
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1445 issue. */ |
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1446 Ibyte encoded[MAX_ICHAR_LEN]; |
5016
2ade80e8c640
enable more warnings and fix them
Ben Wing <ben@xemacs.org>
parents:
4962
diff
changeset
|
1447 Bytecount clen = set_itext_ichar (encoded, c); |
2ade80e8c640
enable more warnings and fix them
Ben Wing <ben@xemacs.org>
parents:
4962
diff
changeset
|
1448 int a, b; |
2ade80e8c640
enable more warnings and fix them
Ben Wing <ben@xemacs.org>
parents:
4962
diff
changeset
|
1449 for (a = 0; a < clen && boyer_moore_ok; ++a) |
4901
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1450 { |
5016
2ade80e8c640
enable more warnings and fix them
Ben Wing <ben@xemacs.org>
parents:
4962
diff
changeset
|
1451 for (b = a + 1; b < clen && boyer_moore_ok; ++b) |
4901
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1452 { |
5016
2ade80e8c640
enable more warnings and fix them
Ben Wing <ben@xemacs.org>
parents:
4962
diff
changeset
|
1453 if (encoded[a] == encoded[b]) |
4901
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1454 { |
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1455 boyer_moore_ok = 0; |
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1456 } |
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1457 } |
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1458 } |
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1459 |
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1460 if (0 == boyer_moore_ok) |
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1461 { |
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1462 break; |
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1463 } |
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1464 } |
7504864a986c
Don't use Boyer-Moore if repeated octets & case-insensitive search.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4897
diff
changeset
|
1465 |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1466 } while (c != starting_c); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1467 |
4421
69b803c646cd
Fail searches immediately if searching for non-representable characters.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4414
diff
changeset
|
1468 if (!checked) |
69b803c646cd
Fail searches immediately if searching for non-representable characters.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4414
diff
changeset
|
1469 { |
69b803c646cd
Fail searches immediately if searching for non-representable characters.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4414
diff
changeset
|
1470 #ifdef DEBUG_XEMACS |
5041 | 1471 if (debug_searches) |
4421
69b803c646cd
Fail searches immediately if searching for non-representable characters.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4414
diff
changeset
|
1472 { |
69b803c646cd
Fail searches immediately if searching for non-representable characters.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4414
diff
changeset
|
1473 Lisp_Symbol *sym = XSYMBOL (Qsearch_algorithm_used); |
69b803c646cd
Fail searches immediately if searching for non-representable characters.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4414
diff
changeset
|
1474 sym->value = Qnil; |
69b803c646cd
Fail searches immediately if searching for non-representable characters.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4414
diff
changeset
|
1475 } |
69b803c646cd
Fail searches immediately if searching for non-representable characters.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4414
diff
changeset
|
1476 #endif |
69b803c646cd
Fail searches immediately if searching for non-representable characters.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4414
diff
changeset
|
1477 /* The "continue" clauses were used above, for every |
69b803c646cd
Fail searches immediately if searching for non-representable characters.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4414
diff
changeset
|
1478 translation of the character. As such, this character |
69b803c646cd
Fail searches immediately if searching for non-representable characters.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4414
diff
changeset
|
1479 is not to be found in the buffer and neither is the |
69b803c646cd
Fail searches immediately if searching for non-representable characters.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4414
diff
changeset
|
1480 string as a whole. Return immediately; also avoid |
69b803c646cd
Fail searches immediately if searching for non-representable characters.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4414
diff
changeset
|
1481 triggering the assertion a few lines down. */ |
69b803c646cd
Fail searches immediately if searching for non-representable characters.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4414
diff
changeset
|
1482 return n > 0 ? -n : n; |
69b803c646cd
Fail searches immediately if searching for non-representable characters.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4414
diff
changeset
|
1483 } |
69b803c646cd
Fail searches immediately if searching for non-representable characters.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4414
diff
changeset
|
1484 |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1485 if (boyer_moore_ok && charset_base != -1 && |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1486 charset_base != (translated & ~ICHAR_FIELD3_MASK)) |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1487 { |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1488 /* In the rare event that the CANON entry for this |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1489 character is not in the desired set, choose one |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1490 that is, from the equivalence set. It doesn't much |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1491 matter which. */ |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1492 Ichar starting_ch = translated; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1493 do |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1494 { |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1495 translated = TRANSLATE (inverse_trt, translated); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1496 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1497 if (charset_base == (translated & ~ICHAR_FIELD3_MASK)) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1498 break; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1499 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1500 } while (starting_ch != translated); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1501 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1502 assert (starting_ch != translated); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1503 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1504 new_bytelen = set_itext_ichar (tmp_str, translated); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1505 } |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1506 } |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1507 |
446 | 1508 memcpy (pat, tmp_str, new_bytelen); |
1509 pat += new_bytelen; | |
1510 base_pat += orig_bytelen; | |
1511 len -= orig_bytelen; | |
1512 } | |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1513 |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1514 if (-1 == charset_base) |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1515 { |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1516 charset_base = 'a' & ~ICHAR_FIELD3_MASK; /* Default to ASCII. */ |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1517 } |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1518 |
446 | 1519 #else /* not MULE */ |
1520 while (--len >= 0) | |
1521 { | |
1522 /* If we got here and the RE flag is set, it's because | |
1523 we're dealing with a regexp known to be trivial, so the | |
1524 backslash just quotes the next character. */ | |
1525 if (RE && *base_pat == '\\') | |
1526 { | |
1527 len--; | |
1528 base_pat++; | |
1529 } | |
1530 *pat++ = TRANSLATE (trt, *base_pat++); | |
1531 } | |
1532 #endif /* MULE */ | |
1533 len = pat - patbuf; | |
1534 pat = base_pat = patbuf; | |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1535 |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1536 #ifdef DEBUG_XEMACS |
5041 | 1537 if (debug_searches) |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1538 { |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1539 Lisp_Symbol *sym = XSYMBOL (Qsearch_algorithm_used); |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1540 sym->value = boyer_moore_ok ? Qboyer_moore : Qsimple_search; |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1541 } |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1542 #endif |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1543 |
446 | 1544 if (boyer_moore_ok) |
1545 return boyer_moore (buf, base_pat, len, pos, lim, n, | |
1546 trt, inverse_trt, charset_base); | |
1547 else | |
1548 return simple_search (buf, base_pat, len, pos, lim, n, trt); | |
1549 } | |
1550 } | |
1551 | |
826 | 1552 /* Do a simple string search N times for the string PAT, whose length is |
1553 LEN/LEN_BYTE, from buffer position POS until LIM. TRT is the | |
1554 translation table. | |
446 | 1555 |
1556 Return the character position where the match is found. | |
1557 Otherwise, if M matches remained to be found, return -M. | |
1558 | |
1559 This kind of search works regardless of what is in PAT and | |
1560 regardless of what is in TRT. It is used in cases where | |
1561 boyer_moore cannot work. */ | |
1562 | |
665 | 1563 static Charbpos |
867 | 1564 simple_search (struct buffer *buf, Ibyte *base_pat, Bytecount len, |
826 | 1565 Bytebpos pos, Bytebpos lim, EMACS_INT n, Lisp_Object trt) |
446 | 1566 { |
1567 int forward = n > 0; | |
1568 Bytecount buf_len = 0; /* Shut up compiler. */ | |
1569 | |
826 | 1570 if (lim > pos) |
446 | 1571 while (n > 0) |
428 | 1572 { |
446 | 1573 while (1) |
428 | 1574 { |
826 | 1575 Bytecount this_len = len; |
1576 Bytebpos this_pos = pos; | |
867 | 1577 Ibyte *p = base_pat; |
826 | 1578 if (pos >= lim) |
446 | 1579 goto stop; |
1580 | |
1581 while (this_len > 0) | |
1582 { | |
867 | 1583 Ichar pat_ch, buf_ch; |
446 | 1584 Bytecount pat_len; |
1585 | |
867 | 1586 pat_ch = itext_ichar (p); |
826 | 1587 buf_ch = BYTE_BUF_FETCH_CHAR (buf, this_pos); |
446 | 1588 |
1589 buf_ch = TRANSLATE (trt, buf_ch); | |
1590 | |
1591 if (buf_ch != pat_ch) | |
1592 break; | |
1593 | |
867 | 1594 pat_len = itext_ichar_len (p); |
446 | 1595 p += pat_len; |
1596 this_len -= pat_len; | |
826 | 1597 INC_BYTEBPOS (buf, this_pos); |
446 | 1598 } |
1599 if (this_len == 0) | |
428 | 1600 { |
826 | 1601 buf_len = this_pos - pos; |
1602 pos = this_pos; | |
446 | 1603 break; |
428 | 1604 } |
826 | 1605 INC_BYTEBPOS (buf, pos); |
428 | 1606 } |
446 | 1607 n--; |
1608 } | |
1609 else | |
4322
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1610 { |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1611 /* If lim < len, then there are too few buffer positions to hold the |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1612 pattern between the beginning of the buffer and lim. Adjust to |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1613 ensure pattern fits. If we don't do this, we can assert in the |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1614 DEC_BYTEBPOS below. */ |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1615 if (lim < len) |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1616 lim = len; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1617 while (n < 0) |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1618 { |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1619 while (1) |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1620 { |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1621 Bytecount this_len = len; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1622 Bytebpos this_pos = pos; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1623 Ibyte *p; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1624 if (pos <= lim) |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1625 goto stop; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1626 p = base_pat + len; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1627 |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1628 while (this_len > 0) |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1629 { |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1630 Ichar pat_ch, buf_ch; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1631 |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1632 DEC_IBYTEPTR (p); |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1633 DEC_BYTEBPOS (buf, this_pos); |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1634 pat_ch = itext_ichar (p); |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1635 buf_ch = BYTE_BUF_FETCH_CHAR (buf, this_pos); |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1636 |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1637 buf_ch = TRANSLATE (trt, buf_ch); |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1638 |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1639 if (buf_ch != pat_ch) |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1640 break; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1641 |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1642 this_len -= itext_ichar_len (p); |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1643 } |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1644 if (this_len == 0) |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1645 { |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1646 buf_len = pos - this_pos; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1647 pos = this_pos; |
446 | 1648 break; |
4322
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1649 } |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1650 DEC_BYTEBPOS (buf, pos); |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1651 } |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1652 n++; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1653 } |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1654 } |
446 | 1655 stop: |
1656 if (n == 0) | |
1657 { | |
665 | 1658 Charbpos beg, end, retval; |
446 | 1659 if (forward) |
1660 { | |
826 | 1661 beg = bytebpos_to_charbpos (buf, pos - buf_len); |
1662 retval = end = bytebpos_to_charbpos (buf, pos); | |
446 | 1663 } |
1664 else | |
428 | 1665 { |
826 | 1666 retval = beg = bytebpos_to_charbpos (buf, pos); |
1667 end = bytebpos_to_charbpos (buf, pos + buf_len); | |
428 | 1668 } |
446 | 1669 set_search_regs (buf, beg, end - beg); |
1670 | |
1671 return retval; | |
1672 } | |
1673 else if (n > 0) | |
1674 return -n; | |
1675 else | |
1676 return n; | |
1677 } | |
1678 | |
1679 /* Do Boyer-Moore search N times for the string PAT, | |
1680 whose length is LEN bytes, | |
1681 from buffer position POS until LIM. | |
1682 The sign of N says which direction we search in. | |
1683 TRT and INVERSE_TRT are translation tables. | |
1684 | |
1685 This kind of search works if all the characters in PAT that have | |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1686 (non-ASCII) translation are the same aside from the last byte. This |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1687 makes it possible to translate just the last byte of a character, and do |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1688 so after just a simple test of the context. |
446 | 1689 |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1690 If that criterion is not satisfied, do not call this function. You will |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1691 get an assertion failure. */ |
446 | 1692 |
665 | 1693 static Charbpos |
867 | 1694 boyer_moore (struct buffer *buf, Ibyte *base_pat, Bytecount len, |
665 | 1695 Bytebpos pos, Bytebpos lim, EMACS_INT n, Lisp_Object trt, |
2333 | 1696 Lisp_Object inverse_trt, int USED_IF_MULE (charset_base)) |
446 | 1697 { |
1698 /* #### Someone really really really needs to comment the workings | |
1699 of this junk somewhat better. | |
1700 | |
1701 BTW "BM" stands for Boyer-Moore, which is one of the standard | |
1702 string-searching algorithms. It's the best string-searching | |
1703 algorithm out there, provided that: | |
1704 | |
1705 a) You're not fazed by algorithm complexity. (Rabin-Karp, which | |
1706 uses hashing, is much much easier to code but not as fast.) | |
1707 b) You can freely move backwards in the string that you're | |
1708 searching through. | |
1709 | |
1710 As the comment below tries to explain (but garbles in typical | |
1711 programmer-ese), the idea is that you don't have to do a | |
1712 string match at every successive position in the text. For | |
1713 example, let's say the pattern is "a very long string". We | |
1714 compare the last character in the string (`g') with the | |
1715 corresponding character in the text. If it mismatches, and | |
1716 it is, say, `z', then we can skip forward by the entire | |
1717 length of the pattern because `z' does not occur anywhere | |
1718 in the pattern. If the mismatching character does occur | |
1719 in the pattern, we can usually still skip forward by more | |
1720 than one: e.g. if it is `l', then we can skip forward | |
1721 by the length of the substring "ong string" -- i.e. the | |
1722 largest end section of the pattern that does not contain | |
1723 the mismatched character. So what we do is compute, for | |
1724 each possible character, the distance we can skip forward | |
1725 (the "stride") and use it in the string matching. This | |
1726 is what the BM_tab holds. */ | |
1727 REGISTER EMACS_INT *BM_tab; | |
1728 EMACS_INT *BM_tab_base; | |
1729 REGISTER Bytecount dirlen; | |
1730 EMACS_INT infinity; | |
665 | 1731 Bytebpos limit; |
446 | 1732 Bytecount stride_for_teases = 0; |
1733 REGISTER EMACS_INT i, j; | |
867 | 1734 Ibyte *pat, *pat_end; |
1735 REGISTER Ibyte *cursor, *p_limit, *ptr2; | |
1736 Ibyte simple_translate[0400]; | |
446 | 1737 REGISTER int direction = ((n > 0) ? 1 : -1); |
1738 #ifdef MULE | |
867 | 1739 Ibyte translate_prev_byte = 0; |
1740 Ibyte translate_anteprev_byte = 0; | |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1741 /* These need to be rethought in the event that the internal format |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1742 changes, or in the event that num_8_bit_fixed_chars disappears |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1743 (entirely_one_byte_p can be trivially worked out by checking is the |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1744 byte count equal to the char count.) */ |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1745 int buffer_entirely_one_byte_p = buf->text->entirely_one_byte_p; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1746 int buffer_nothing_greater_than_0xff = |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1747 buf->text->num_8_bit_fixed_chars == BUF_Z(buf) - BUF_BEG (buf); |
446 | 1748 #endif |
1749 #ifdef C_ALLOCA | |
1750 EMACS_INT BM_tab_space[0400]; | |
1751 BM_tab = &BM_tab_space[0]; | |
1752 #else | |
1753 BM_tab = alloca_array (EMACS_INT, 256); | |
1754 #endif | |
1755 | |
1756 /* The general approach is that we are going to maintain that we | |
1757 know the first (closest to the present position, in whatever | |
1758 direction we're searching) character that could possibly be | |
1759 the last (furthest from present position) character of a | |
1760 valid match. We advance the state of our knowledge by | |
1761 looking at that character and seeing whether it indeed | |
1762 matches the last character of the pattern. If it does, we | |
1763 take a closer look. If it does not, we move our pointer (to | |
1764 putative last characters) as far as is logically possible. | |
1765 This amount of movement, which I call a stride, will be the | |
1766 length of the pattern if the actual character appears nowhere | |
1767 in the pattern, otherwise it will be the distance from the | |
1768 last occurrence of that character to the end of the pattern. | |
1769 As a coding trick, an enormous stride is coded into the table | |
1770 for characters that match the last character. This allows | |
1771 use of only a single test, a test for having gone past the | |
1772 end of the permissible match region, to test for both | |
1773 possible matches (when the stride goes past the end | |
1774 immediately) and failure to match (where you get nudged past | |
1775 the end one stride at a time). | |
1776 | |
1777 Here we make a "mickey mouse" BM table. The stride of the | |
1778 search is determined only by the last character of the | |
1779 putative match. If that character does not match, we will | |
1780 stride the proper distance to propose a match that | |
1781 superimposes it on the last instance of a character that | |
1782 matches it (per trt), or misses it entirely if there is | |
1783 none. */ | |
1784 | |
1785 dirlen = len * direction; | |
1786 infinity = dirlen - (lim + pos + len + len) * direction; | |
1787 /* Record position after the end of the pattern. */ | |
1788 pat_end = base_pat + len; | |
1789 if (direction < 0) | |
1790 base_pat = pat_end - 1; | |
1791 BM_tab_base = BM_tab; | |
1792 BM_tab += 0400; | |
1793 j = dirlen; /* to get it in a register */ | |
1794 /* A character that does not appear in the pattern induces a | |
1795 stride equal to the pattern length. */ | |
1796 while (BM_tab_base != BM_tab) | |
1797 { | |
1798 *--BM_tab = j; | |
1799 *--BM_tab = j; | |
1800 *--BM_tab = j; | |
1801 *--BM_tab = j; | |
1802 } | |
1803 /* We use this for translation, instead of TRT itself. We | |
1804 fill this in to handle the characters that actually occur | |
1805 in the pattern. Others don't matter anyway! */ | |
1806 xzero (simple_translate); | |
1807 for (i = 0; i < 0400; i++) | |
867 | 1808 simple_translate[i] = (Ibyte) i; |
446 | 1809 i = 0; |
1425 | 1810 |
446 | 1811 while (i != infinity) |
1812 { | |
867 | 1813 Ibyte *ptr = base_pat + i; |
446 | 1814 i += direction; |
1815 if (i == dirlen) | |
1816 i = infinity; | |
1817 if (!NILP (trt)) | |
428 | 1818 { |
446 | 1819 #ifdef MULE |
4897
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1820 Ichar ch = -1, untranslated; |
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1821 Ibyte byte; |
446 | 1822 int this_translated = 1; |
1823 | |
1824 /* Is *PTR the last byte of a character? */ | |
867 | 1825 if (pat_end - ptr == 1 || ibyte_first_byte_p (ptr[1])) |
428 | 1826 { |
867 | 1827 Ibyte *charstart = ptr; |
1828 while (!ibyte_first_byte_p (*charstart)) | |
446 | 1829 charstart--; |
867 | 1830 untranslated = itext_ichar (charstart); |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1831 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1832 ch = TRANSLATE (trt, untranslated); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1833 if (!ibyte_first_byte_p (*ptr)) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1834 { |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1835 translate_prev_byte = ptr[-1]; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1836 if (!ibyte_first_byte_p (translate_prev_byte)) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1837 translate_anteprev_byte = ptr[-2]; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1838 } |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1839 |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1840 if (ch != untranslated && /* Was translation done? */ |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1841 charset_base != (ch & ~ICHAR_FIELD3_MASK)) |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1842 { |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1843 /* In the very rare event that the CANON entry for this |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1844 character is not in the desired set, choose one that |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1845 is, from the equivalence set. It doesn't much matter |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1846 which, since we're building our own cheesy equivalence |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1847 table instead of using that belonging to the case |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1848 table directly. |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1849 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1850 We can get here if search_buffer has worked out that |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1851 the buffer is entirely single width. */ |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1852 Ichar starting_ch = ch; |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1853 int count = 0; |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1854 do |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1855 { |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1856 ch = TRANSLATE (inverse_trt, ch); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1857 if (charset_base == (ch & ~ICHAR_FIELD3_MASK)) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1858 break; |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1859 ++count; |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1860 } while (starting_ch != ch); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1861 |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1862 /* If starting_ch is equal to ch (and count is not one, |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1863 which means no translation is necessary), the case |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1864 table is corrupt. (Any mapping in the canon table |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1865 should be reflected in the equivalence table, and we |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1866 know from the canon table that untranslated maps to |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1867 starting_ch and that untranslated has the correct value |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1868 for charset_base.) */ |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
1869 assert (1 == count || starting_ch != ch); |
446 | 1870 } |
4897
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1871 { |
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1872 Ibyte tmp[MAX_ICHAR_LEN]; |
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1873 Bytecount chlen; |
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1874 |
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1875 chlen = set_itext_ichar (tmp, ch); |
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1876 byte = tmp[chlen - 1]; |
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1877 } |
428 | 1878 } |
1879 else | |
1880 { | |
4897
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1881 byte = *ptr; |
446 | 1882 this_translated = 0; |
4897
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1883 ch = -1; |
446 | 1884 } |
4897
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1885 |
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1886 /* BYTE = last byte of character CH when represented as text */ |
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1887 j = byte; |
446 | 1888 |
1889 if (i == infinity) | |
1890 stride_for_teases = BM_tab[j]; | |
1891 BM_tab[j] = dirlen - i; | |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1892 /* A translation table is accompanied by its inverse -- see |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1893 comment in casetab.c. */ |
446 | 1894 if (this_translated) |
1895 { | |
867 | 1896 Ichar starting_ch = ch; |
446 | 1897 EMACS_INT starting_j = j; |
4897
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1898 |
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1899 text_checking_assert (valid_ichar_p (ch)); |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1900 do |
446 | 1901 { |
1902 ch = TRANSLATE (inverse_trt, ch); | |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1903 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1904 if (ch > 0x7F && buffer_entirely_one_byte_p) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1905 continue; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1906 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1907 if (ch > 0xFF && buffer_nothing_greater_than_0xff) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1908 continue; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1909 |
4897
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1910 |
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1911 /* Retrieve last byte of character CH when represented as |
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1912 text */ |
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1913 { |
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1914 Ibyte tmp[MAX_ICHAR_LEN]; |
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1915 Bytecount chlen; |
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1916 |
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1917 chlen = set_itext_ichar (tmp, ch); |
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1918 j = tmp[chlen - 1]; |
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1919 } |
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1920 |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1921 /* For all the characters that map into CH, set up |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1922 simple_translate to map the last byte into |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1923 STARTING_J. */ |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1924 simple_translate[j] = (Ibyte) starting_j; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1925 BM_tab[j] = dirlen - i; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1926 |
4897
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1927 } |
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1928 while (ch != starting_ch); |
446 | 1929 } |
4897
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1930 #else /* not MULE */ |
446 | 1931 EMACS_INT k; |
1932 j = *ptr; | |
1933 k = (j = TRANSLATE (trt, j)); | |
1934 if (i == infinity) | |
1935 stride_for_teases = BM_tab[j]; | |
1936 BM_tab[j] = dirlen - i; | |
1937 /* A translation table is accompanied by its inverse -- | |
826 | 1938 see comment in casetab.c. */ |
446 | 1939 while ((j = TRANSLATE (inverse_trt, j)) != k) |
1940 { | |
867 | 1941 simple_translate[j] = (Ibyte) k; |
428 | 1942 BM_tab[j] = dirlen - i; |
1943 } | |
4897
91a023144e72
fix longstanding search bug involving searching for Control-1 chars
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
1944 #endif /* (not) MULE */ |
446 | 1945 } |
1946 else | |
1947 { | |
1948 j = *ptr; | |
1949 | |
1950 if (i == infinity) | |
1951 stride_for_teases = BM_tab[j]; | |
1952 BM_tab[j] = dirlen - i; | |
428 | 1953 } |
446 | 1954 /* stride_for_teases tells how much to stride if we get a |
1955 match on the far character but are subsequently | |
1956 disappointed, by recording what the stride would have been | |
1957 for that character if the last character had been | |
1958 different. */ | |
1959 } | |
1960 infinity = dirlen - infinity; | |
1961 pos += dirlen - ((direction > 0) ? direction : 0); | |
1962 /* loop invariant - pos points at where last char (first char if | |
1963 reverse) of pattern would align in a possible match. */ | |
1964 while (n != 0) | |
1965 { | |
665 | 1966 Bytebpos tail_end; |
867 | 1967 Ibyte *tail_end_ptr; |
446 | 1968 /* It's been reported that some (broken) compiler thinks |
1969 that Boolean expressions in an arithmetic context are | |
1970 unsigned. Using an explicit ?1:0 prevents this. */ | |
1971 if ((lim - pos - ((direction > 0) ? 1 : 0)) * direction < 0) | |
1972 return n * (0 - direction); | |
1973 /* First we do the part we can by pointers (maybe | |
1974 nothing) */ | |
1975 QUIT; | |
1976 pat = base_pat; | |
1977 limit = pos - dirlen + direction; | |
1978 /* XEmacs change: definitions of CEILING_OF and FLOOR_OF | |
1979 have changed. See buffer.h. */ | |
1980 limit = ((direction > 0) | |
826 | 1981 ? BYTE_BUF_CEILING_OF (buf, limit) - 1 |
1982 : BYTE_BUF_FLOOR_OF (buf, limit + 1)); | |
446 | 1983 /* LIMIT is now the last (not beyond-last!) value POS can |
1984 take on without hitting edge of buffer or the gap. */ | |
1985 limit = ((direction > 0) | |
1986 ? min (lim - 1, min (limit, pos + 20000)) | |
1987 : max (lim, max (limit, pos - 20000))); | |
826 | 1988 tail_end = BYTE_BUF_CEILING_OF (buf, pos); |
1989 tail_end_ptr = BYTE_BUF_BYTE_ADDRESS (buf, tail_end); | |
446 | 1990 |
1991 if ((limit - pos) * direction > 20) | |
428 | 1992 { |
826 | 1993 /* We have to be careful because the code can generate addresses |
1994 that don't point to the beginning of characters. */ | |
1995 p_limit = BYTE_BUF_BYTE_ADDRESS_NO_VERIFY (buf, limit); | |
1996 ptr2 = (cursor = BYTE_BUF_BYTE_ADDRESS_NO_VERIFY (buf, pos)); | |
446 | 1997 /* In this loop, pos + cursor - ptr2 is the surrogate |
1998 for pos */ | |
1999 while (1) /* use one cursor setting as long as i can */ | |
2000 { | |
2001 if (direction > 0) /* worth duplicating */ | |
2002 { | |
2003 /* Use signed comparison if appropriate to make | |
2004 cursor+infinity sure to be > p_limit. | |
2005 Assuming that the buffer lies in a range of | |
2006 addresses that are all "positive" (as ints) | |
2007 or all "negative", either kind of comparison | |
2008 will work as long as we don't step by | |
2009 infinity. So pick the kind that works when | |
2010 we do step by infinity. */ | |
2011 if ((EMACS_INT) (p_limit + infinity) > | |
2012 (EMACS_INT) p_limit) | |
2013 while ((EMACS_INT) cursor <= | |
2014 (EMACS_INT) p_limit) | |
2015 cursor += BM_tab[*cursor]; | |
2016 else | |
2017 while ((EMACS_UINT) cursor <= | |
2018 (EMACS_UINT) p_limit) | |
2019 cursor += BM_tab[*cursor]; | |
2020 } | |
2021 else | |
2022 { | |
2023 if ((EMACS_INT) (p_limit + infinity) < | |
2024 (EMACS_INT) p_limit) | |
2025 while ((EMACS_INT) cursor >= | |
2026 (EMACS_INT) p_limit) | |
2027 cursor += BM_tab[*cursor]; | |
2028 else | |
2029 while ((EMACS_UINT) cursor >= | |
2030 (EMACS_UINT) p_limit) | |
2031 cursor += BM_tab[*cursor]; | |
2032 } | |
2033 /* If you are here, cursor is beyond the end of the | |
2034 searched region. This can happen if you match on | |
2035 the far character of the pattern, because the | |
2036 "stride" of that character is infinity, a number | |
2037 able to throw you well beyond the end of the | |
2038 search. It can also happen if you fail to match | |
2039 within the permitted region and would otherwise | |
2040 try a character beyond that region */ | |
2041 if ((cursor - p_limit) * direction <= len) | |
2042 break; /* a small overrun is genuine */ | |
2043 cursor -= infinity; /* large overrun = hit */ | |
2044 i = dirlen - direction; | |
2045 if (!NILP (trt)) | |
2046 { | |
2047 while ((i -= direction) + direction != 0) | |
2048 { | |
2049 #ifdef MULE | |
867 | 2050 Ichar ch; |
446 | 2051 cursor -= direction; |
2052 /* Translate only the last byte of a character. */ | |
2053 if ((cursor == tail_end_ptr | |
867 | 2054 || ibyte_first_byte_p (cursor[1])) |
2055 && (ibyte_first_byte_p (cursor[0]) | |
446 | 2056 || (translate_prev_byte == cursor[-1] |
867 | 2057 && (ibyte_first_byte_p (translate_prev_byte) |
446 | 2058 || translate_anteprev_byte == cursor[-2])))) |
2059 ch = simple_translate[*cursor]; | |
2060 else | |
2061 ch = *cursor; | |
2062 if (pat[i] != ch) | |
2063 break; | |
2064 #else | |
2065 if (pat[i] != TRANSLATE (trt, *(cursor -= direction))) | |
2066 break; | |
2067 #endif | |
2068 } | |
2069 } | |
2070 else | |
2071 { | |
2072 while ((i -= direction) + direction != 0) | |
2073 if (pat[i] != *(cursor -= direction)) | |
2074 break; | |
2075 } | |
2076 cursor += dirlen - i - direction; /* fix cursor */ | |
2077 if (i + direction == 0) | |
2078 { | |
2079 cursor -= direction; | |
2080 | |
2081 { | |
665 | 2082 Bytebpos bytstart = (pos + cursor - ptr2 + |
446 | 2083 ((direction > 0) |
2084 ? 1 - len : 0)); | |
665 | 2085 Charbpos bufstart = bytebpos_to_charbpos (buf, bytstart); |
2086 Charbpos bufend = bytebpos_to_charbpos (buf, bytstart + len); | |
446 | 2087 |
2088 set_search_regs (buf, bufstart, bufend - bufstart); | |
2089 } | |
2090 | |
2091 if ((n -= direction) != 0) | |
2092 cursor += dirlen; /* to resume search */ | |
2093 else | |
2094 return ((direction > 0) | |
2095 ? search_regs.end[0] : search_regs.start[0]); | |
2096 } | |
2097 else | |
2098 cursor += stride_for_teases; /* <sigh> we lose - */ | |
2099 } | |
2100 pos += cursor - ptr2; | |
2101 } | |
2102 else | |
2103 /* Now we'll pick up a clump that has to be done the hard | |
2104 way because it covers a discontinuity */ | |
2105 { | |
428 | 2106 /* XEmacs change: definitions of CEILING_OF and FLOOR_OF |
2107 have changed. See buffer.h. */ | |
2108 limit = ((direction > 0) | |
826 | 2109 ? BYTE_BUF_CEILING_OF (buf, pos - dirlen + 1) - 1 |
2110 : BYTE_BUF_FLOOR_OF (buf, pos - dirlen)); | |
428 | 2111 limit = ((direction > 0) |
446 | 2112 ? min (limit + len, lim - 1) |
2113 : max (limit - len, lim)); | |
2114 /* LIMIT is now the last value POS can have | |
2115 and still be valid for a possible match. */ | |
2116 while (1) | |
428 | 2117 { |
446 | 2118 /* This loop can be coded for space rather than |
2119 speed because it will usually run only once. | |
2120 (the reach is at most len + 21, and typically | |
2121 does not exceed len) */ | |
2122 while ((limit - pos) * direction >= 0) | |
826 | 2123 /* *not* BYTE_BUF_FETCH_CHAR. We are working here |
446 | 2124 with bytes, not characters. */ |
826 | 2125 pos += BM_tab[*BYTE_BUF_BYTE_ADDRESS_NO_VERIFY (buf, pos)]; |
446 | 2126 /* now run the same tests to distinguish going off |
2127 the end, a match or a phony match. */ | |
2128 if ((pos - limit) * direction <= len) | |
2129 break; /* ran off the end */ | |
2130 /* Found what might be a match. | |
2131 Set POS back to last (first if reverse) char pos. */ | |
2132 pos -= infinity; | |
2133 i = dirlen - direction; | |
2134 while ((i -= direction) + direction != 0) | |
428 | 2135 { |
446 | 2136 #ifdef MULE |
867 | 2137 Ichar ch; |
2138 Ibyte *ptr; | |
446 | 2139 #endif |
2140 pos -= direction; | |
2141 #ifdef MULE | |
826 | 2142 ptr = BYTE_BUF_BYTE_ADDRESS_NO_VERIFY (buf, pos); |
446 | 2143 if ((ptr == tail_end_ptr |
867 | 2144 || ibyte_first_byte_p (ptr[1])) |
2145 && (ibyte_first_byte_p (ptr[0]) | |
446 | 2146 || (translate_prev_byte == ptr[-1] |
867 | 2147 && (ibyte_first_byte_p (translate_prev_byte) |
446 | 2148 || translate_anteprev_byte == ptr[-2])))) |
2149 ch = simple_translate[*ptr]; | |
428 | 2150 else |
446 | 2151 ch = *ptr; |
2152 if (pat[i] != ch) | |
2153 break; | |
2154 | |
2155 #else | |
826 | 2156 if (pat[i] != |
2157 TRANSLATE (trt, | |
2158 *BYTE_BUF_BYTE_ADDRESS_NO_VERIFY (buf, pos))) | |
446 | 2159 break; |
2160 #endif | |
428 | 2161 } |
446 | 2162 /* Above loop has moved POS part or all the way back |
2163 to the first char pos (last char pos if reverse). | |
2164 Set it once again at the last (first if reverse) | |
2165 char. */ | |
2166 pos += dirlen - i- direction; | |
2167 if (i + direction == 0) | |
428 | 2168 { |
446 | 2169 pos -= direction; |
2170 | |
2171 { | |
665 | 2172 Bytebpos bytstart = (pos + |
446 | 2173 ((direction > 0) |
2174 ? 1 - len : 0)); | |
665 | 2175 Charbpos bufstart = bytebpos_to_charbpos (buf, bytstart); |
2176 Charbpos bufend = bytebpos_to_charbpos (buf, bytstart + len); | |
446 | 2177 |
2178 set_search_regs (buf, bufstart, bufend - bufstart); | |
2179 } | |
2180 | |
2181 if ((n -= direction) != 0) | |
2182 pos += dirlen; /* to resume search */ | |
428 | 2183 else |
446 | 2184 return ((direction > 0) |
2185 ? search_regs.end[0] : search_regs.start[0]); | |
428 | 2186 } |
446 | 2187 else |
2188 pos += stride_for_teases; | |
2189 } | |
428 | 2190 } |
446 | 2191 /* We have done one clump. Can we continue? */ |
2192 if ((lim - pos) * direction < 0) | |
2193 return (0 - n) * direction; | |
428 | 2194 } |
665 | 2195 return bytebpos_to_charbpos (buf, pos); |
428 | 2196 } |
2197 | |
1024 | 2198 /* Record the whole-match data (beginning BEG and end BEG + LEN) and the |
2199 buffer for a match just found. */ | |
428 | 2200 |
2201 static void | |
665 | 2202 set_search_regs (struct buffer *buf, Charbpos beg, Charcount len) |
428 | 2203 { |
2204 /* Make sure we have registers in which to store | |
2205 the match position. */ | |
2206 if (search_regs.num_regs == 0) | |
2207 { | |
2208 search_regs.start = xnew (regoff_t); | |
2209 search_regs.end = xnew (regoff_t); | |
2210 search_regs.num_regs = 1; | |
2211 } | |
2212 | |
1468 | 2213 clear_search_regs (); |
428 | 2214 search_regs.start[0] = beg; |
2215 search_regs.end[0] = beg + len; | |
793 | 2216 last_thing_searched = wrap_buffer (buf); |
428 | 2217 } |
2218 | |
1468 | 2219 /* Clear search registers so match data will be null. */ |
1024 | 2220 |
2221 static void | |
1468 | 2222 clear_search_regs (void) |
1024 | 2223 { |
2224 /* This function has been Mule-ized. */ | |
2225 int i; | |
2226 | |
1468 | 2227 for (i = 0; i < search_regs.num_regs; i++) |
2228 search_regs.start[i] = search_regs.end[i] = -1; | |
1024 | 2229 } |
2230 | |
428 | 2231 |
2232 /* Given a string of words separated by word delimiters, | |
442 | 2233 compute a regexp that matches those exact words |
2234 separated by arbitrary punctuation. */ | |
428 | 2235 |
2236 static Lisp_Object | |
2237 wordify (Lisp_Object buffer, Lisp_Object string) | |
2238 { | |
2239 Charcount i, len; | |
2240 EMACS_INT punct_count = 0, word_count = 0; | |
2241 struct buffer *buf = decode_buffer (buffer, 0); | |
826 | 2242 Lisp_Object syntax_table = buf->mirror_syntax_table; |
428 | 2243 |
2244 CHECK_STRING (string); | |
826 | 2245 len = string_char_length (string); |
428 | 2246 |
2247 for (i = 0; i < len; i++) | |
867 | 2248 if (!WORD_SYNTAX_P (syntax_table, string_ichar (string, i))) |
428 | 2249 { |
2250 punct_count++; | |
2251 if (i > 0 && WORD_SYNTAX_P (syntax_table, | |
867 | 2252 string_ichar (string, i - 1))) |
428 | 2253 word_count++; |
2254 } | |
867 | 2255 if (WORD_SYNTAX_P (syntax_table, string_ichar (string, len - 1))) |
428 | 2256 word_count++; |
4952
19a72041c5ed
Mule-izing, various fixes related to char * arguments
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
2257 if (!word_count) return build_ascstring (""); |
428 | 2258 |
2259 { | |
2260 /* The following value is an upper bound on the amount of storage we | |
2261 need. In non-Mule, it is exact. */ | |
867 | 2262 Ibyte *storage = |
2367 | 2263 alloca_ibytes (XSTRING_LENGTH (string) - punct_count + |
428 | 2264 5 * (word_count - 1) + 4); |
867 | 2265 Ibyte *o = storage; |
428 | 2266 |
2267 *o++ = '\\'; | |
2268 *o++ = 'b'; | |
2269 | |
2270 for (i = 0; i < len; i++) | |
2271 { | |
867 | 2272 Ichar ch = string_ichar (string, i); |
428 | 2273 |
2274 if (WORD_SYNTAX_P (syntax_table, ch)) | |
867 | 2275 o += set_itext_ichar (o, ch); |
428 | 2276 else if (i > 0 |
2277 && WORD_SYNTAX_P (syntax_table, | |
867 | 2278 string_ichar (string, i - 1)) |
428 | 2279 && --word_count) |
2280 { | |
2281 *o++ = '\\'; | |
2282 *o++ = 'W'; | |
2283 *o++ = '\\'; | |
2284 *o++ = 'W'; | |
2285 *o++ = '*'; | |
2286 } | |
2287 } | |
2288 | |
2289 *o++ = '\\'; | |
2290 *o++ = 'b'; | |
2291 | |
2292 return make_string (storage, o - storage); | |
2293 } | |
2294 } | |
2295 | |
2296 DEFUN ("search-backward", Fsearch_backward, 1, 5, "sSearch backward: ", /* | |
2297 Search backward from point for STRING. | |
2298 Set point to the beginning of the occurrence found, and return point. | |
444 | 2299 |
2300 Optional second argument LIMIT bounds the search; it is a buffer | |
2301 position. The match found must not extend before that position. | |
2302 The value nil is equivalent to (point-min). | |
2303 | |
2304 Optional third argument NOERROR, if t, means just return nil (no | |
2305 error) if the search fails. If neither nil nor t, set point to LIMIT | |
2306 and return nil. | |
2307 | |
2308 Optional fourth argument COUNT is a repeat count--search for | |
2309 successive occurrences. | |
2310 | |
428 | 2311 Optional fifth argument BUFFER specifies the buffer to search in and |
444 | 2312 defaults to the current buffer. |
2313 | |
1468 | 2314 When the match is successful, this function modifies the match data |
2315 that `match-beginning', `match-end' and `match-data' access; save the | |
2316 match data with `match-data' and restore it with `store-match-data' if | |
2317 you want to preserve them. If the match fails, the match data from the | |
2318 previous success match is preserved. | |
2319 | |
2320 See also the function `replace-match'. | |
428 | 2321 */ |
444 | 2322 (string, limit, noerror, count, buffer)) |
428 | 2323 { |
444 | 2324 return search_command (string, limit, noerror, count, buffer, -1, 0, 0); |
428 | 2325 } |
2326 | |
2327 DEFUN ("search-forward", Fsearch_forward, 1, 5, "sSearch: ", /* | |
2328 Search forward from point for STRING. | |
2329 Set point to the end of the occurrence found, and return point. | |
444 | 2330 |
2331 Optional second argument LIMIT bounds the search; it is a buffer | |
2332 position. The match found must not extend after that position. The | |
2333 value nil is equivalent to (point-max). | |
2334 | |
2335 Optional third argument NOERROR, if t, means just return nil (no | |
2336 error) if the search fails. If neither nil nor t, set point to LIMIT | |
2337 and return nil. | |
2338 | |
2339 Optional fourth argument COUNT is a repeat count--search for | |
2340 successive occurrences. | |
2341 | |
428 | 2342 Optional fifth argument BUFFER specifies the buffer to search in and |
444 | 2343 defaults to the current buffer. |
2344 | |
1468 | 2345 When the match is successful, this function modifies the match data |
2346 that `match-beginning', `match-end' and `match-data' access; save the | |
2347 match data with `match-data' and restore it with `store-match-data' if | |
2348 you want to preserve them. If the match fails, the match data from the | |
2349 previous success match is preserved. | |
2350 | |
2351 See also the function `replace-match'. | |
428 | 2352 */ |
444 | 2353 (string, limit, noerror, count, buffer)) |
428 | 2354 { |
444 | 2355 return search_command (string, limit, noerror, count, buffer, 1, 0, 0); |
428 | 2356 } |
2357 | |
2358 DEFUN ("word-search-backward", Fword_search_backward, 1, 5, | |
2359 "sWord search backward: ", /* | |
2360 Search backward from point for STRING, ignoring differences in punctuation. | |
2361 Set point to the beginning of the occurrence found, and return point. | |
444 | 2362 |
2363 Optional second argument LIMIT bounds the search; it is a buffer | |
2364 position. The match found must not extend before that position. | |
2365 The value nil is equivalent to (point-min). | |
2366 | |
2367 Optional third argument NOERROR, if t, means just return nil (no | |
2368 error) if the search fails. If neither nil nor t, set point to LIMIT | |
2369 and return nil. | |
2370 | |
2371 Optional fourth argument COUNT is a repeat count--search for | |
2372 successive occurrences. | |
2373 | |
428 | 2374 Optional fifth argument BUFFER specifies the buffer to search in and |
444 | 2375 defaults to the current buffer. |
2376 | |
1468 | 2377 When the match is successful, this function modifies the match data |
2378 that `match-beginning', `match-end' and `match-data' access; save the | |
2379 match data with `match-data' and restore it with `store-match-data' if | |
2380 you want to preserve them. If the match fails, the match data from the | |
2381 previous success match is preserved. | |
2382 | |
2383 See also the function `replace-match'. | |
428 | 2384 */ |
444 | 2385 (string, limit, noerror, count, buffer)) |
428 | 2386 { |
444 | 2387 return search_command (wordify (buffer, string), limit, noerror, count, |
428 | 2388 buffer, -1, 1, 0); |
2389 } | |
2390 | |
2391 DEFUN ("word-search-forward", Fword_search_forward, 1, 5, "sWord search: ", /* | |
2392 Search forward from point for STRING, ignoring differences in punctuation. | |
2393 Set point to the end of the occurrence found, and return point. | |
444 | 2394 |
2395 Optional second argument LIMIT bounds the search; it is a buffer | |
2396 position. The match found must not extend after that position. The | |
2397 value nil is equivalent to (point-max). | |
2398 | |
2399 Optional third argument NOERROR, if t, means just return nil (no | |
2400 error) if the search fails. If neither nil nor t, set point to LIMIT | |
2401 and return nil. | |
2402 | |
2403 Optional fourth argument COUNT is a repeat count--search for | |
2404 successive occurrences. | |
2405 | |
428 | 2406 Optional fifth argument BUFFER specifies the buffer to search in and |
444 | 2407 defaults to the current buffer. |
2408 | |
1468 | 2409 When the match is successful, this function modifies the match data |
2410 that `match-beginning', `match-end' and `match-data' access; save the | |
2411 match data with `match-data' and restore it with `store-match-data' if | |
2412 you want to preserve them. If the match fails, the match data from the | |
2413 previous success match is preserved. | |
2414 | |
2415 See also the function `replace-match'. | |
428 | 2416 */ |
444 | 2417 (string, limit, noerror, count, buffer)) |
428 | 2418 { |
444 | 2419 return search_command (wordify (buffer, string), limit, noerror, count, |
428 | 2420 buffer, 1, 1, 0); |
2421 } | |
2422 | |
2423 DEFUN ("re-search-backward", Fre_search_backward, 1, 5, | |
2424 "sRE search backward: ", /* | |
2425 Search backward from point for match for regular expression REGEXP. | |
2426 Set point to the beginning of the match, and return point. | |
2427 The match found is the one starting last in the buffer | |
2428 and yet ending before the origin of the search. | |
444 | 2429 |
2430 Optional second argument LIMIT bounds the search; it is a buffer | |
2431 position. The match found must not extend before that position. | |
2432 The value nil is equivalent to (point-min). | |
2433 | |
2434 Optional third argument NOERROR, if t, means just return nil (no | |
2435 error) if the search fails. If neither nil nor t, set point to LIMIT | |
2436 and return nil. | |
2437 | |
2438 Optional fourth argument COUNT is a repeat count--search for | |
2439 successive occurrences. | |
2440 | |
428 | 2441 Optional fifth argument BUFFER specifies the buffer to search in and |
444 | 2442 defaults to the current buffer. |
2443 | |
1468 | 2444 When the match is successful, this function modifies the match data |
2445 that `match-beginning', `match-end' and `match-data' access; save the | |
2446 match data with `match-data' and restore it with `store-match-data' if | |
2447 you want to preserve them. If the match fails, the match data from the | |
2448 previous success match is preserved. | |
2449 | |
2450 See also the function `replace-match'. | |
428 | 2451 */ |
444 | 2452 (regexp, limit, noerror, count, buffer)) |
428 | 2453 { |
444 | 2454 return search_command (regexp, limit, noerror, count, buffer, -1, 1, 0); |
428 | 2455 } |
2456 | |
2457 DEFUN ("re-search-forward", Fre_search_forward, 1, 5, "sRE search: ", /* | |
2458 Search forward from point for regular expression REGEXP. | |
2459 Set point to the end of the occurrence found, and return point. | |
444 | 2460 |
2461 Optional second argument LIMIT bounds the search; it is a buffer | |
2462 position. The match found must not extend after that position. The | |
2463 value nil is equivalent to (point-max). | |
2464 | |
2465 Optional third argument NOERROR, if t, means just return nil (no | |
2466 error) if the search fails. If neither nil nor t, set point to LIMIT | |
2467 and return nil. | |
2468 | |
2469 Optional fourth argument COUNT is a repeat count--search for | |
2470 successive occurrences. | |
2471 | |
428 | 2472 Optional fifth argument BUFFER specifies the buffer to search in and |
444 | 2473 defaults to the current buffer. |
2474 | |
1468 | 2475 When the match is successful, this function modifies the match data |
2476 that `match-beginning', `match-end' and `match-data' access; save the | |
2477 match data with `match-data' and restore it with `store-match-data' if | |
2478 you want to preserve them. If the match fails, the match data from the | |
2479 previous success match is preserved. | |
2480 | |
2481 See also the function `replace-match'. | |
428 | 2482 */ |
444 | 2483 (regexp, limit, noerror, count, buffer)) |
428 | 2484 { |
444 | 2485 return search_command (regexp, limit, noerror, count, buffer, 1, 1, 0); |
428 | 2486 } |
2487 | |
2488 DEFUN ("posix-search-backward", Fposix_search_backward, 1, 5, | |
2489 "sPosix search backward: ", /* | |
2490 Search backward from point for match for regular expression REGEXP. | |
2491 Find the longest match in accord with Posix regular expression rules. | |
2492 Set point to the beginning of the match, and return point. | |
2493 The match found is the one starting last in the buffer | |
2494 and yet ending before the origin of the search. | |
444 | 2495 |
2496 Optional second argument LIMIT bounds the search; it is a buffer | |
2497 position. The match found must not extend before that position. | |
2498 The value nil is equivalent to (point-min). | |
2499 | |
2500 Optional third argument NOERROR, if t, means just return nil (no | |
2501 error) if the search fails. If neither nil nor t, set point to LIMIT | |
2502 and return nil. | |
2503 | |
2504 Optional fourth argument COUNT is a repeat count--search for | |
2505 successive occurrences. | |
2506 | |
428 | 2507 Optional fifth argument BUFFER specifies the buffer to search in and |
444 | 2508 defaults to the current buffer. |
2509 | |
1468 | 2510 When the match is successful, this function modifies the match data |
2511 that `match-beginning', `match-end' and `match-data' access; save the | |
2512 match data with `match-data' and restore it with `store-match-data' if | |
2513 you want to preserve them. If the match fails, the match data from the | |
2514 previous success match is preserved. | |
2515 | |
2516 See also the function `replace-match'. | |
428 | 2517 */ |
444 | 2518 (regexp, limit, noerror, count, buffer)) |
428 | 2519 { |
444 | 2520 return search_command (regexp, limit, noerror, count, buffer, -1, 1, 1); |
428 | 2521 } |
2522 | |
2523 DEFUN ("posix-search-forward", Fposix_search_forward, 1, 5, "sPosix search: ", /* | |
2524 Search forward from point for regular expression REGEXP. | |
2525 Find the longest match in accord with Posix regular expression rules. | |
2526 Set point to the end of the occurrence found, and return point. | |
444 | 2527 |
2528 Optional second argument LIMIT bounds the search; it is a buffer | |
2529 position. The match found must not extend after that position. The | |
2530 value nil is equivalent to (point-max). | |
2531 | |
2532 Optional third argument NOERROR, if t, means just return nil (no | |
2533 error) if the search fails. If neither nil nor t, set point to LIMIT | |
2534 and return nil. | |
2535 | |
2536 Optional fourth argument COUNT is a repeat count--search for | |
2537 successive occurrences. | |
2538 | |
428 | 2539 Optional fifth argument BUFFER specifies the buffer to search in and |
444 | 2540 defaults to the current buffer. |
2541 | |
1468 | 2542 When the match is successful, this function modifies the match data |
2543 that `match-beginning', `match-end' and `match-data' access; save the | |
2544 match data with `match-data' and restore it with `store-match-data' if | |
2545 you want to preserve them. If the match fails, the match data from the | |
2546 previous success match is preserved. | |
2547 | |
2548 See also the function `replace-match'. | |
428 | 2549 */ |
444 | 2550 (regexp, limit, noerror, count, buffer)) |
428 | 2551 { |
444 | 2552 return search_command (regexp, limit, noerror, count, buffer, 1, 1, 1); |
428 | 2553 } |
2554 | |
2555 | |
2556 static Lisp_Object | |
2557 free_created_dynarrs (Lisp_Object cons) | |
2558 { | |
2559 Dynarr_free (get_opaque_ptr (XCAR (cons))); | |
2560 Dynarr_free (get_opaque_ptr (XCDR (cons))); | |
2561 free_opaque_ptr (XCAR (cons)); | |
2562 free_opaque_ptr (XCDR (cons)); | |
853 | 2563 free_cons (cons); |
428 | 2564 return Qnil; |
2565 } | |
2566 | |
2567 DEFUN ("replace-match", Freplace_match, 1, 5, 0, /* | |
444 | 2568 Replace text matched by last search with REPLACEMENT. |
4199 | 2569 Leaves point at end of replacement text. |
2570 Optional boolean FIXEDCASE inhibits matching case of REPLACEMENT to source. | |
2571 Optional boolean LITERAL inhibits interpretation of escape sequences. | |
2572 Optional STRING provides the source text to replace. | |
2573 Optional STRBUFFER may be a buffer, providing match context, or an integer | |
2574 specifying the subexpression to replace. | |
2575 | |
2576 If FIXEDCASE is non-nil, do not alter case of replacement text. | |
428 | 2577 Otherwise maybe capitalize the whole text, or maybe just word initials, |
2578 based on the replaced text. | |
4199 | 2579 If the replaced text has only capital letters and has at least one |
2580 multiletter word, convert REPLACEMENT to all caps. | |
428 | 2581 If the replaced text has at least one word starting with a capital letter, |
444 | 2582 then capitalize each word in REPLACEMENT. |
428 | 2583 |
4199 | 2584 If LITERAL is non-nil, insert REPLACEMENT literally. |
428 | 2585 Otherwise treat `\\' as special: |
444 | 2586 `\\&' in REPLACEMENT means substitute original matched text. |
428 | 2587 `\\N' means substitute what matched the Nth `\\(...\\)'. |
2588 If Nth parens didn't match, substitute nothing. | |
2589 `\\\\' means insert one `\\'. | |
2590 `\\u' means upcase the next character. | |
2591 `\\l' means downcase the next character. | |
2592 `\\U' means begin upcasing all following characters. | |
2593 `\\L' means begin downcasing all following characters. | |
2594 `\\E' means terminate the effect of any `\\U' or `\\L'. | |
2595 Case changes made with `\\u', `\\l', `\\U', and `\\L' override | |
2596 all other case changes that may be made in the replaced text. | |
4199 | 2597 |
2598 If non-nil, STRING is the source string, and a new string with the specified | |
2599 replacements is created and returned. Otherwise the current buffer is the | |
2600 source text. | |
2601 | |
2602 If non-nil, STRBUFFER may be an integer, interpreted as the index of the | |
2603 subexpression to replace in the source text, or a buffer to provide the | |
2604 syntax table and case table. If nil, then the \"subexpression\" is 0, i.e., | |
2605 the whole match, and the current buffer provides the syntax and case tables. | |
2606 If STRING is nil, STRBUFFER must be nil or an integer. | |
2607 | |
2608 Specifying a subexpression is only useful after a regular expression match, | |
2609 since a fixed string search has no non-trivial subexpressions. | |
2610 | |
2611 It is not possible to specify both a buffer and a subexpression. If that is | |
2612 desired, the idiom `(with-current-buffer BUFFER (replace-match ... INTEGER))' | |
2613 may be appropriate. | |
2614 | |
2615 If STRING is nil but the last thing matched (or searched) was a string, or | |
2616 STRING is a string but the last thing matched was a buffer, an | |
2617 `invalid-argument' error will be signaled. (XEmacs does not check that the | |
2618 last thing searched is the source string, but it is not useful to use a | |
2619 different string as source.) | |
2620 | |
2621 If no match (including searches) has been successful or the requested | |
1468 | 2622 subexpression was not matched, an `args-out-of-range' error will be |
2623 signaled. (If no match has ever been conducted in this instance of | |
2624 XEmacs, an `invalid-operation' error will be signaled. This is very | |
2625 rare.) | |
428 | 2626 */ |
444 | 2627 (replacement, fixedcase, literal, string, strbuffer)) |
428 | 2628 { |
2629 /* This function can GC */ | |
2630 enum { nochange, all_caps, cap_initial } case_action; | |
665 | 2631 Charbpos pos, last; |
428 | 2632 int some_multiletter_word; |
2633 int some_lowercase; | |
2634 int some_uppercase; | |
2635 int some_nonuppercase_initial; | |
867 | 2636 Ichar c, prevc; |
428 | 2637 Charcount inslen; |
2638 struct buffer *buf; | |
826 | 2639 Lisp_Object syntax_table; |
428 | 2640 int mc_count; |
2641 Lisp_Object buffer; | |
2642 int_dynarr *ul_action_dynarr = 0; | |
2643 int_dynarr *ul_pos_dynarr = 0; | |
502 | 2644 int sub = 0; |
428 | 2645 int speccount; |
2646 | |
444 | 2647 CHECK_STRING (replacement); |
428 | 2648 |
4199 | 2649 /* Because GNU decided to be incompatible here, we support the following |
2650 baroque and bogus API for the STRING and STRBUFFER arguments: | |
2651 types interpretations | |
2652 STRING STRBUFFER STRING STRBUFFER | |
2653 nil nil none 0 = index of subexpression to replace | |
2654 nil integer none index of subexpression to replace | |
2655 nil other ***** error ***** | |
2656 string nil source current buffer provides syntax table | |
2657 subexpression = 0 (whole match) | |
2658 string buffer source buffer providing syntax table | |
2659 subexpression = 0 (whole match) | |
2660 string integer source current buffer provides syntax table | |
2661 subexpression = STRBUFFER | |
2662 string other ***** error ***** | |
2663 */ | |
2664 | |
2665 /* Do STRBUFFER first; if STRING is nil, we'll overwrite BUF and BUFFER. */ | |
2666 | |
2667 /* If the match data were abstracted into a special "match data" type | |
2668 instead of the typical half-assed "let the implementation be visible" | |
2669 form it's in, we could extend it to include the last string matched | |
2670 and the buffer used for that matching. But of course we can't change | |
2671 it as it is. | |
2672 */ | |
2673 if (NILP (strbuffer) || BUFFERP (strbuffer)) | |
2674 { | |
2675 buf = decode_buffer (strbuffer, 0); | |
2676 } | |
2677 else if (!NILP (strbuffer)) | |
2678 { | |
2679 CHECK_INT (strbuffer); | |
2680 sub = XINT (strbuffer); | |
2681 if (sub < 0 || sub >= (int) search_regs.num_regs) | |
2682 invalid_argument ("match data register invalid", strbuffer); | |
2683 if (search_regs.start[sub] < 0) | |
2684 invalid_argument ("match data register not set", strbuffer); | |
2685 buf = current_buffer; | |
2686 } | |
2687 else | |
2688 invalid_argument ("STRBUFFER must be nil, a buffer, or an integer", | |
2689 strbuffer); | |
2690 buffer = wrap_buffer (buf); | |
2691 | |
428 | 2692 if (! NILP (string)) |
2693 { | |
2694 CHECK_STRING (string); | |
2695 if (!EQ (last_thing_searched, Qt)) | |
4199 | 2696 invalid_argument ("last thing matched was not a string", Qunbound); |
428 | 2697 } |
2698 else | |
2699 { | |
2700 if (!BUFFERP (last_thing_searched)) | |
4199 | 2701 invalid_argument ("last thing matched was not a buffer", Qunbound); |
428 | 2702 buffer = last_thing_searched; |
2703 buf = XBUFFER (buffer); | |
2704 } | |
2705 | |
826 | 2706 syntax_table = buf->mirror_syntax_table; |
428 | 2707 |
2708 case_action = nochange; /* We tried an initialization */ | |
2709 /* but some C compilers blew it */ | |
2710 | |
2711 if (search_regs.num_regs == 0) | |
826 | 2712 signal_error (Qinvalid_operation, |
2713 "replace-match called before any match found", Qunbound); | |
428 | 2714 |
2715 if (NILP (string)) | |
2716 { | |
469 | 2717 if (search_regs.start[sub] < BUF_BEGV (buf) |
2718 || search_regs.start[sub] > search_regs.end[sub] | |
2719 || search_regs.end[sub] > BUF_ZV (buf)) | |
2720 args_out_of_range (make_int (search_regs.start[sub]), | |
2721 make_int (search_regs.end[sub])); | |
428 | 2722 } |
2723 else | |
2724 { | |
2725 if (search_regs.start[0] < 0 | |
2726 || search_regs.start[0] > search_regs.end[0] | |
826 | 2727 || search_regs.end[0] > string_char_length (string)) |
428 | 2728 args_out_of_range (make_int (search_regs.start[0]), |
2729 make_int (search_regs.end[0])); | |
2730 } | |
2731 | |
2732 if (NILP (fixedcase)) | |
2733 { | |
2734 /* Decide how to casify by examining the matched text. */ | |
2735 | |
707 | 2736 last = search_regs.end[sub]; |
428 | 2737 prevc = '\n'; |
2738 case_action = all_caps; | |
2739 | |
2740 /* some_multiletter_word is set nonzero if any original word | |
2741 is more than one letter long. */ | |
2742 some_multiletter_word = 0; | |
2743 some_lowercase = 0; | |
2744 some_nonuppercase_initial = 0; | |
2745 some_uppercase = 0; | |
2746 | |
707 | 2747 for (pos = search_regs.start[sub]; pos < last; pos++) |
428 | 2748 { |
2749 if (NILP (string)) | |
2750 c = BUF_FETCH_CHAR (buf, pos); | |
2751 else | |
867 | 2752 c = string_ichar (string, pos); |
428 | 2753 |
2754 if (LOWERCASEP (buf, c)) | |
2755 { | |
2756 /* Cannot be all caps if any original char is lower case */ | |
2757 | |
2758 some_lowercase = 1; | |
2759 if (!WORD_SYNTAX_P (syntax_table, prevc)) | |
2760 some_nonuppercase_initial = 1; | |
2761 else | |
2762 some_multiletter_word = 1; | |
2763 } | |
2764 else if (!NOCASEP (buf, c)) | |
2765 { | |
2766 some_uppercase = 1; | |
2767 if (!WORD_SYNTAX_P (syntax_table, prevc)) | |
2768 ; | |
2769 else | |
2770 some_multiletter_word = 1; | |
2771 } | |
2772 else | |
2773 { | |
2774 /* If the initial is a caseless word constituent, | |
2775 treat that like a lowercase initial. */ | |
2776 if (!WORD_SYNTAX_P (syntax_table, prevc)) | |
2777 some_nonuppercase_initial = 1; | |
2778 } | |
2779 | |
2780 prevc = c; | |
2781 } | |
2782 | |
2783 /* Convert to all caps if the old text is all caps | |
2784 and has at least one multiletter word. */ | |
2785 if (! some_lowercase && some_multiletter_word) | |
2786 case_action = all_caps; | |
2787 /* Capitalize each word, if the old text has all capitalized words. */ | |
2788 else if (!some_nonuppercase_initial && some_multiletter_word) | |
2789 case_action = cap_initial; | |
2790 else if (!some_nonuppercase_initial && some_uppercase) | |
2791 /* Should x -> yz, operating on X, give Yz or YZ? | |
2792 We'll assume the latter. */ | |
2793 case_action = all_caps; | |
2794 else | |
2795 case_action = nochange; | |
2796 } | |
2797 | |
2798 /* Do replacement in a string. */ | |
2799 if (!NILP (string)) | |
2800 { | |
2801 Lisp_Object before, after; | |
2802 | |
2803 speccount = specpdl_depth (); | |
4199 | 2804 before = Fsubstring (string, Qzero, make_int (search_regs.start[sub])); |
2805 after = Fsubstring (string, make_int (search_regs.end[sub]), Qnil); | |
428 | 2806 |
444 | 2807 /* Do case substitution into REPLACEMENT if desired. */ |
428 | 2808 if (NILP (literal)) |
2809 { | |
826 | 2810 Charcount stlen = string_char_length (replacement); |
428 | 2811 Charcount strpos; |
2812 /* XEmacs change: rewrote this loop somewhat to make it | |
2813 cleaner. Also added \U, \E, etc. */ | |
2814 Charcount literal_start = 0; | |
2815 /* We build up the substituted string in ACCUM. */ | |
2816 Lisp_Object accum; | |
2817 | |
2818 accum = Qnil; | |
2819 | |
2820 /* OK, the basic idea here is that we scan through the | |
2821 replacement string until we find a backslash, which | |
2822 represents a substring of the original string to be | |
2823 substituted. We then append onto ACCUM the literal | |
2824 text before the backslash (LASTPOS marks the | |
2825 beginning of this) followed by the substring of the | |
2826 original string that needs to be inserted. */ | |
2827 for (strpos = 0; strpos < stlen; strpos++) | |
2828 { | |
2829 /* If LITERAL_END is set, we've encountered a backslash | |
2830 (the end of literal text to be inserted). */ | |
2831 Charcount literal_end = -1; | |
2832 /* If SUBSTART is set, we need to also insert the | |
2833 text from SUBSTART to SUBEND in the original string. */ | |
2834 Charcount substart = -1; | |
2835 Charcount subend = -1; | |
2836 | |
867 | 2837 c = string_ichar (replacement, strpos); |
428 | 2838 if (c == '\\' && strpos < stlen - 1) |
2839 { | |
867 | 2840 c = string_ichar (replacement, ++strpos); |
428 | 2841 if (c == '&') |
2842 { | |
2843 literal_end = strpos - 1; | |
2844 substart = search_regs.start[0]; | |
2845 subend = search_regs.end[0]; | |
2846 } | |
4199 | 2847 /* #### This logic is totally broken, |
2848 since we can have backrefs like "\99", right? */ | |
428 | 2849 else if (c >= '1' && c <= '9' && |
2850 c <= search_regs.num_regs + '0') | |
2851 { | |
2852 if (search_regs.start[c - '0'] >= 0) | |
2853 { | |
2854 literal_end = strpos - 1; | |
2855 substart = search_regs.start[c - '0']; | |
2856 subend = search_regs.end[c - '0']; | |
2857 } | |
2858 } | |
2859 else if (c == 'U' || c == 'u' || c == 'L' || c == 'l' || | |
2860 c == 'E') | |
2861 { | |
2862 /* Keep track of all case changes requested, but don't | |
2863 make them now. Do them later so we override | |
2864 everything else. */ | |
2865 if (!ul_pos_dynarr) | |
2866 { | |
2867 ul_pos_dynarr = Dynarr_new (int); | |
2868 ul_action_dynarr = Dynarr_new (int); | |
2869 record_unwind_protect | |
2870 (free_created_dynarrs, | |
2871 noseeum_cons | |
2872 (make_opaque_ptr (ul_pos_dynarr), | |
2873 make_opaque_ptr (ul_action_dynarr))); | |
2874 } | |
2875 literal_end = strpos - 1; | |
2876 Dynarr_add (ul_pos_dynarr, | |
2877 (!NILP (accum) | |
826 | 2878 ? string_char_length (accum) |
428 | 2879 : 0) + (literal_end - literal_start)); |
2880 Dynarr_add (ul_action_dynarr, c); | |
2881 } | |
2882 else if (c == '\\') | |
2883 /* So we get just one backslash. */ | |
2884 literal_end = strpos; | |
2885 } | |
2886 if (literal_end >= 0) | |
2887 { | |
2888 Lisp_Object literal_text = Qnil; | |
2889 Lisp_Object substring = Qnil; | |
2890 if (literal_end != literal_start) | |
444 | 2891 literal_text = Fsubstring (replacement, |
428 | 2892 make_int (literal_start), |
2893 make_int (literal_end)); | |
2894 if (substart >= 0 && subend != substart) | |
2895 substring = Fsubstring (string, | |
2896 make_int (substart), | |
2897 make_int (subend)); | |
2898 if (!NILP (literal_text) || !NILP (substring)) | |
2899 accum = concat3 (accum, literal_text, substring); | |
2900 literal_start = strpos + 1; | |
2901 } | |
2902 } | |
2903 | |
2904 if (strpos != literal_start) | |
2905 /* some literal text at end to be inserted */ | |
444 | 2906 replacement = concat2 (accum, Fsubstring (replacement, |
2907 make_int (literal_start), | |
2908 make_int (strpos))); | |
428 | 2909 else |
444 | 2910 replacement = accum; |
428 | 2911 } |
2912 | |
444 | 2913 /* replacement can be nil. */ |
2914 if (NILP (replacement)) | |
4952
19a72041c5ed
Mule-izing, various fixes related to char * arguments
Ben Wing <ben@xemacs.org>
parents:
4421
diff
changeset
|
2915 replacement = build_ascstring (""); |
444 | 2916 |
428 | 2917 if (case_action == all_caps) |
444 | 2918 replacement = Fupcase (replacement, buffer); |
428 | 2919 else if (case_action == cap_initial) |
444 | 2920 replacement = Fupcase_initials (replacement, buffer); |
428 | 2921 |
2922 /* Now finally, we need to process the \U's, \E's, etc. */ | |
2923 if (ul_pos_dynarr) | |
2924 { | |
2925 int i = 0; | |
2926 int cur_action = 'E'; | |
826 | 2927 Charcount stlen = string_char_length (replacement); |
428 | 2928 Charcount strpos; |
2929 | |
2930 for (strpos = 0; strpos < stlen; strpos++) | |
2931 { | |
867 | 2932 Ichar curchar = string_ichar (replacement, strpos); |
2933 Ichar newchar = -1; | |
428 | 2934 if (i < Dynarr_length (ul_pos_dynarr) && |
2935 strpos == Dynarr_at (ul_pos_dynarr, i)) | |
2936 { | |
2937 int new_action = Dynarr_at (ul_action_dynarr, i); | |
2938 i++; | |
2939 if (new_action == 'u') | |
2940 newchar = UPCASE (buf, curchar); | |
2941 else if (new_action == 'l') | |
2942 newchar = DOWNCASE (buf, curchar); | |
2943 else | |
2944 cur_action = new_action; | |
2945 } | |
2946 if (newchar == -1) | |
2947 { | |
2948 if (cur_action == 'U') | |
2949 newchar = UPCASE (buf, curchar); | |
2950 else if (cur_action == 'L') | |
2951 newchar = DOWNCASE (buf, curchar); | |
2952 else | |
2953 newchar = curchar; | |
2954 } | |
2955 if (newchar != curchar) | |
793 | 2956 set_string_char (replacement, strpos, newchar); |
428 | 2957 } |
2958 } | |
2959 | |
2960 /* frees the Dynarrs if necessary. */ | |
771 | 2961 unbind_to (speccount); |
444 | 2962 return concat3 (before, replacement, after); |
428 | 2963 } |
2964 | |
707 | 2965 mc_count = begin_multiple_change (buf, search_regs.start[sub], |
2966 search_regs.end[sub]); | |
428 | 2967 |
2968 /* begin_multiple_change() records an unwind-protect, so we need to | |
2969 record this value now. */ | |
2970 speccount = specpdl_depth (); | |
2971 | |
2972 /* We insert the replacement text before the old text, and then | |
2973 delete the original text. This means that markers at the | |
2974 beginning or end of the original will float to the corresponding | |
2975 position in the replacement. */ | |
707 | 2976 BUF_SET_PT (buf, search_regs.start[sub]); |
428 | 2977 if (!NILP (literal)) |
444 | 2978 Finsert (1, &replacement); |
428 | 2979 else |
2980 { | |
826 | 2981 Charcount stlen = string_char_length (replacement); |
428 | 2982 Charcount strpos; |
2983 struct gcpro gcpro1; | |
444 | 2984 GCPRO1 (replacement); |
428 | 2985 for (strpos = 0; strpos < stlen; strpos++) |
2986 { | |
707 | 2987 /* on the first iteration assert(offset==0), |
2988 exactly complementing BUF_SET_PT() above. | |
2989 During the loop, it keeps track of the amount inserted. | |
2990 */ | |
2991 Charcount offset = BUF_PT (buf) - search_regs.start[sub]; | |
428 | 2992 |
867 | 2993 c = string_ichar (replacement, strpos); |
428 | 2994 if (c == '\\' && strpos < stlen - 1) |
2995 { | |
707 | 2996 /* XXX FIXME: replacing just a substring non-literally |
2997 using backslash refs to the match looks dangerous. But | |
2998 <15366.18513.698042.156573@ns.caldera.de> from Torsten Duwe | |
2999 <duwe@caldera.de> claims Finsert_buffer_substring already | |
3000 handles this correctly. | |
3001 */ | |
867 | 3002 c = string_ichar (replacement, ++strpos); |
428 | 3003 if (c == '&') |
3004 Finsert_buffer_substring | |
3005 (buffer, | |
3006 make_int (search_regs.start[0] + offset), | |
3007 make_int (search_regs.end[0] + offset)); | |
4199 | 3008 /* #### This logic is totally broken, |
3009 since we can have backrefs like "\99", right? */ | |
428 | 3010 else if (c >= '1' && c <= '9' && |
3011 c <= search_regs.num_regs + '0') | |
3012 { | |
3013 if (search_regs.start[c - '0'] >= 1) | |
3014 Finsert_buffer_substring | |
3015 (buffer, | |
3016 make_int (search_regs.start[c - '0'] + offset), | |
3017 make_int (search_regs.end[c - '0'] + offset)); | |
3018 } | |
3019 else if (c == 'U' || c == 'u' || c == 'L' || c == 'l' || | |
3020 c == 'E') | |
3021 { | |
3022 /* Keep track of all case changes requested, but don't | |
3023 make them now. Do them later so we override | |
3024 everything else. */ | |
3025 if (!ul_pos_dynarr) | |
3026 { | |
3027 ul_pos_dynarr = Dynarr_new (int); | |
3028 ul_action_dynarr = Dynarr_new (int); | |
3029 record_unwind_protect | |
3030 (free_created_dynarrs, | |
3031 Fcons (make_opaque_ptr (ul_pos_dynarr), | |
3032 make_opaque_ptr (ul_action_dynarr))); | |
3033 } | |
3034 Dynarr_add (ul_pos_dynarr, BUF_PT (buf)); | |
3035 Dynarr_add (ul_action_dynarr, c); | |
3036 } | |
3037 else | |
3038 buffer_insert_emacs_char (buf, c); | |
3039 } | |
3040 else | |
3041 buffer_insert_emacs_char (buf, c); | |
3042 } | |
3043 UNGCPRO; | |
3044 } | |
3045 | |
707 | 3046 inslen = BUF_PT (buf) - (search_regs.start[sub]); |
3047 buffer_delete_range (buf, search_regs.start[sub] + inslen, | |
3048 search_regs.end[sub] + inslen, 0); | |
428 | 3049 |
3050 if (case_action == all_caps) | |
3051 Fupcase_region (make_int (BUF_PT (buf) - inslen), | |
3052 make_int (BUF_PT (buf)), buffer); | |
3053 else if (case_action == cap_initial) | |
3054 Fupcase_initials_region (make_int (BUF_PT (buf) - inslen), | |
3055 make_int (BUF_PT (buf)), buffer); | |
3056 | |
3057 /* Now go through and make all the case changes that were requested | |
3058 in the replacement string. */ | |
3059 if (ul_pos_dynarr) | |
3060 { | |
665 | 3061 Charbpos eend = BUF_PT (buf); |
428 | 3062 int i = 0; |
3063 int cur_action = 'E'; | |
3064 | |
3065 for (pos = BUF_PT (buf) - inslen; pos < eend; pos++) | |
3066 { | |
867 | 3067 Ichar curchar = BUF_FETCH_CHAR (buf, pos); |
3068 Ichar newchar = -1; | |
428 | 3069 if (i < Dynarr_length (ul_pos_dynarr) && |
3070 pos == Dynarr_at (ul_pos_dynarr, i)) | |
3071 { | |
3072 int new_action = Dynarr_at (ul_action_dynarr, i); | |
3073 i++; | |
3074 if (new_action == 'u') | |
3075 newchar = UPCASE (buf, curchar); | |
3076 else if (new_action == 'l') | |
3077 newchar = DOWNCASE (buf, curchar); | |
3078 else | |
3079 cur_action = new_action; | |
3080 } | |
3081 if (newchar == -1) | |
3082 { | |
3083 if (cur_action == 'U') | |
3084 newchar = UPCASE (buf, curchar); | |
3085 else if (cur_action == 'L') | |
3086 newchar = DOWNCASE (buf, curchar); | |
3087 else | |
3088 newchar = curchar; | |
3089 } | |
3090 if (newchar != curchar) | |
3091 buffer_replace_char (buf, pos, newchar, 0, 0); | |
3092 } | |
3093 } | |
3094 | |
3095 /* frees the Dynarrs if necessary. */ | |
771 | 3096 unbind_to (speccount); |
428 | 3097 end_multiple_change (buf, mc_count); |
3098 | |
3099 return Qnil; | |
3100 } | |
3101 | |
3102 static Lisp_Object | |
3103 match_limit (Lisp_Object num, int beginningp) | |
3104 { | |
3105 int n; | |
3106 | |
3107 CHECK_INT (num); | |
3108 n = XINT (num); | |
3109 if (n < 0 || n >= search_regs.num_regs) | |
3110 args_out_of_range (num, make_int (search_regs.num_regs)); | |
3111 if (search_regs.num_regs == 0 || | |
3112 search_regs.start[n] < 0) | |
3113 return Qnil; | |
3114 return make_int (beginningp ? search_regs.start[n] : search_regs.end[n]); | |
3115 } | |
3116 | |
3117 DEFUN ("match-beginning", Fmatch_beginning, 1, 1, 0, /* | |
3118 Return position of start of text matched by last regexp search. | |
3119 NUM, specifies which parenthesized expression in the last regexp. | |
3120 Value is nil if NUMth pair didn't match, or there were less than NUM pairs. | |
3121 Zero means the entire text matched by the whole regexp or whole string. | |
3122 */ | |
3123 (num)) | |
3124 { | |
3125 return match_limit (num, 1); | |
3126 } | |
3127 | |
3128 DEFUN ("match-end", Fmatch_end, 1, 1, 0, /* | |
3129 Return position of end of text matched by last regexp search. | |
3130 NUM specifies which parenthesized expression in the last regexp. | |
3131 Value is nil if NUMth pair didn't match, or there were less than NUM pairs. | |
3132 Zero means the entire text matched by the whole regexp or whole string. | |
3133 */ | |
3134 (num)) | |
3135 { | |
3136 return match_limit (num, 0); | |
3137 } | |
3138 | |
3139 DEFUN ("match-data", Fmatch_data, 0, 2, 0, /* | |
3140 Return a list containing all info on what the last regexp search matched. | |
3141 Element 2N is `(match-beginning N)'; element 2N + 1 is `(match-end N)'. | |
3142 All the elements are markers or nil (nil if the Nth pair didn't match) | |
3143 if the last match was on a buffer; integers or nil if a string was matched. | |
3144 Use `store-match-data' to reinstate the data in this list. | |
3145 | |
3146 If INTEGERS (the optional first argument) is non-nil, always use integers | |
3147 \(rather than markers) to represent buffer positions. | |
3148 If REUSE is a list, reuse it as part of the value. If REUSE is long enough | |
3149 to hold all the values, and if INTEGERS is non-nil, no consing is done. | |
3150 */ | |
3151 (integers, reuse)) | |
3152 { | |
3153 Lisp_Object tail, prev; | |
3154 Lisp_Object *data; | |
3155 int i; | |
3156 Charcount len; | |
3157 | |
3158 if (NILP (last_thing_searched)) | |
563 | 3159 /*error ("match-data called before any match found", Qunbound);*/ |
428 | 3160 return Qnil; |
3161 | |
3162 data = alloca_array (Lisp_Object, 2 * search_regs.num_regs); | |
3163 | |
3164 len = -1; | |
3165 for (i = 0; i < search_regs.num_regs; i++) | |
3166 { | |
665 | 3167 Charbpos start = search_regs.start[i]; |
428 | 3168 if (start >= 0) |
3169 { | |
3170 if (EQ (last_thing_searched, Qt) | |
3171 || !NILP (integers)) | |
3172 { | |
3173 data[2 * i] = make_int (start); | |
3174 data[2 * i + 1] = make_int (search_regs.end[i]); | |
3175 } | |
3176 else if (BUFFERP (last_thing_searched)) | |
3177 { | |
3178 data[2 * i] = Fmake_marker (); | |
3179 Fset_marker (data[2 * i], | |
3180 make_int (start), | |
3181 last_thing_searched); | |
3182 data[2 * i + 1] = Fmake_marker (); | |
3183 Fset_marker (data[2 * i + 1], | |
3184 make_int (search_regs.end[i]), | |
3185 last_thing_searched); | |
3186 } | |
3187 else | |
3188 /* last_thing_searched must always be Qt, a buffer, or Qnil. */ | |
2500 | 3189 ABORT (); |
428 | 3190 |
3191 len = i; | |
3192 } | |
3193 else | |
3194 data[2 * i] = data [2 * i + 1] = Qnil; | |
3195 } | |
3196 if (!CONSP (reuse)) | |
3197 return Flist (2 * len + 2, data); | |
3198 | |
3199 /* If REUSE is a list, store as many value elements as will fit | |
3200 into the elements of REUSE. */ | |
3201 for (prev = Qnil, i = 0, tail = reuse; CONSP (tail); i++, tail = XCDR (tail)) | |
3202 { | |
3203 if (i < 2 * len + 2) | |
3204 XCAR (tail) = data[i]; | |
3205 else | |
3206 XCAR (tail) = Qnil; | |
3207 prev = tail; | |
3208 } | |
3209 | |
3210 /* If we couldn't fit all value elements into REUSE, | |
3211 cons up the rest of them and add them to the end of REUSE. */ | |
3212 if (i < 2 * len + 2) | |
3213 XCDR (prev) = Flist (2 * len + 2 - i, data + i); | |
3214 | |
3215 return reuse; | |
3216 } | |
3217 | |
3218 | |
3219 DEFUN ("store-match-data", Fstore_match_data, 1, 1, 0, /* | |
3220 Set internal data on last search match from elements of LIST. | |
1468 | 3221 LIST should have been created by calling `match-data' previously, |
3222 or be nil, to clear the internal match data. | |
428 | 3223 */ |
3224 (list)) | |
3225 { | |
3226 REGISTER int i; | |
3227 REGISTER Lisp_Object marker; | |
3228 int num_regs; | |
3229 int length; | |
3230 | |
853 | 3231 /* Some FSF junk with running_asynch_code, to preserve the match |
3232 data. Not necessary because we don't call process filters | |
3233 asynchronously (i.e. from within QUIT). */ | |
428 | 3234 |
3235 CONCHECK_LIST (list); | |
3236 | |
3237 /* Unless we find a marker with a buffer in LIST, assume that this | |
3238 match data came from a string. */ | |
3239 last_thing_searched = Qt; | |
3240 | |
3241 /* Allocate registers if they don't already exist. */ | |
3242 length = XINT (Flength (list)) / 2; | |
3243 num_regs = search_regs.num_regs; | |
3244 | |
3245 if (length > num_regs) | |
3246 { | |
3247 if (search_regs.num_regs == 0) | |
3248 { | |
3249 search_regs.start = xnew_array (regoff_t, length); | |
3250 search_regs.end = xnew_array (regoff_t, length); | |
3251 } | |
3252 else | |
3253 { | |
3254 XREALLOC_ARRAY (search_regs.start, regoff_t, length); | |
3255 XREALLOC_ARRAY (search_regs.end, regoff_t, length); | |
3256 } | |
3257 | |
3258 search_regs.num_regs = length; | |
3259 } | |
3260 | |
3261 for (i = 0; i < num_regs; i++) | |
3262 { | |
3263 marker = Fcar (list); | |
3264 if (NILP (marker)) | |
3265 { | |
3266 search_regs.start[i] = -1; | |
3267 list = Fcdr (list); | |
3268 } | |
3269 else | |
3270 { | |
3271 if (MARKERP (marker)) | |
3272 { | |
3273 if (XMARKER (marker)->buffer == 0) | |
3274 marker = Qzero; | |
3275 else | |
793 | 3276 last_thing_searched = wrap_buffer (XMARKER (marker)->buffer); |
428 | 3277 } |
3278 | |
3279 CHECK_INT_COERCE_MARKER (marker); | |
3280 search_regs.start[i] = XINT (marker); | |
3281 list = Fcdr (list); | |
3282 | |
3283 marker = Fcar (list); | |
3284 if (MARKERP (marker) && XMARKER (marker)->buffer == 0) | |
3285 marker = Qzero; | |
3286 | |
3287 CHECK_INT_COERCE_MARKER (marker); | |
3288 search_regs.end[i] = XINT (marker); | |
3289 } | |
3290 list = Fcdr (list); | |
3291 } | |
3292 | |
3293 return Qnil; | |
3294 } | |
3295 | |
3296 /* Quote a string to inactivate reg-expr chars */ | |
3297 | |
3298 DEFUN ("regexp-quote", Fregexp_quote, 1, 1, 0, /* | |
3299 Return a regexp string which matches exactly STRING and nothing else. | |
3300 */ | |
444 | 3301 (string)) |
428 | 3302 { |
867 | 3303 REGISTER Ibyte *in, *out, *end; |
3304 REGISTER Ibyte *temp; | |
428 | 3305 |
444 | 3306 CHECK_STRING (string); |
428 | 3307 |
2367 | 3308 temp = alloca_ibytes (XSTRING_LENGTH (string) * 2); |
428 | 3309 |
3310 /* Now copy the data into the new string, inserting escapes. */ | |
3311 | |
444 | 3312 in = XSTRING_DATA (string); |
3313 end = in + XSTRING_LENGTH (string); | |
428 | 3314 out = temp; |
3315 | |
3316 while (in < end) | |
3317 { | |
867 | 3318 Ichar c = itext_ichar (in); |
428 | 3319 |
3320 if (c == '[' || c == ']' | |
3321 || c == '*' || c == '.' || c == '\\' | |
3322 || c == '?' || c == '+' | |
3323 || c == '^' || c == '$') | |
3324 *out++ = '\\'; | |
867 | 3325 out += set_itext_ichar (out, c); |
3326 INC_IBYTEPTR (in); | |
428 | 3327 } |
3328 | |
3329 return make_string (temp, out - temp); | |
3330 } | |
3331 | |
3332 DEFUN ("set-word-regexp", Fset_word_regexp, 1, 1, 0, /* | |
3333 Set the regexp to be used to match a word in regular-expression searching. | |
3334 #### Not yet implemented. Currently does nothing. | |
3335 #### Do not use this yet. Its calling interface is likely to change. | |
3336 */ | |
2286 | 3337 (UNUSED (regexp))) |
428 | 3338 { |
3339 return Qnil; | |
3340 } | |
3341 | |
3342 | |
5041 | 3343 #ifdef DEBUG_XEMACS |
3344 | |
3345 static int | |
3346 debug_regexps_changed (Lisp_Object UNUSED (sym), Lisp_Object *val, | |
3347 Lisp_Object UNUSED (in_object), | |
3348 int UNUSED (flags)) | |
3349 { | |
3350 int newval = 0; | |
3351 | |
3352 EXTERNAL_LIST_LOOP_2 (elt, *val) | |
3353 { | |
3354 CHECK_SYMBOL (elt); | |
3355 if (EQ (elt, Qcompilation)) | |
3356 newval |= RE_DEBUG_COMPILATION; | |
3357 else if (EQ (elt, Qfailure_point)) | |
3358 newval |= RE_DEBUG_FAILURE_POINT; | |
3359 else if (EQ (elt, Qmatching)) | |
3360 newval |= RE_DEBUG_MATCHING; | |
3361 else | |
3362 invalid_argument | |
3363 ("Expected `compilation', `failure-point' or `matching'", elt); | |
3364 } | |
3365 debug_regexps = newval; | |
3366 return 0; | |
3367 } | |
3368 | |
3369 #endif /* DEBUG_XEMACS */ | |
3370 | |
3371 | |
428 | 3372 /************************************************************************/ |
3373 /* initialization */ | |
3374 /************************************************************************/ | |
3375 | |
3376 void | |
3377 syms_of_search (void) | |
3378 { | |
3379 | |
442 | 3380 DEFERROR_STANDARD (Qsearch_failed, Qinvalid_operation); |
3381 DEFERROR_STANDARD (Qinvalid_regexp, Qsyntax_error); | |
563 | 3382 Fput (Qinvalid_regexp, Qerror_lacks_explanatory_string, Qt); |
428 | 3383 |
3384 DEFSUBR (Flooking_at); | |
3385 DEFSUBR (Fposix_looking_at); | |
3386 DEFSUBR (Fstring_match); | |
3387 DEFSUBR (Fposix_string_match); | |
3388 DEFSUBR (Fskip_chars_forward); | |
3389 DEFSUBR (Fskip_chars_backward); | |
3390 DEFSUBR (Fskip_syntax_forward); | |
3391 DEFSUBR (Fskip_syntax_backward); | |
3392 DEFSUBR (Fsearch_forward); | |
3393 DEFSUBR (Fsearch_backward); | |
3394 DEFSUBR (Fword_search_forward); | |
3395 DEFSUBR (Fword_search_backward); | |
3396 DEFSUBR (Fre_search_forward); | |
3397 DEFSUBR (Fre_search_backward); | |
3398 DEFSUBR (Fposix_search_forward); | |
3399 DEFSUBR (Fposix_search_backward); | |
3400 DEFSUBR (Freplace_match); | |
3401 DEFSUBR (Fmatch_beginning); | |
3402 DEFSUBR (Fmatch_end); | |
3403 DEFSUBR (Fmatch_data); | |
3404 DEFSUBR (Fstore_match_data); | |
3405 DEFSUBR (Fregexp_quote); | |
3406 DEFSUBR (Fset_word_regexp); | |
3407 } | |
3408 | |
3409 void | |
3410 reinit_vars_of_search (void) | |
3411 { | |
3412 int i; | |
3413 | |
3414 last_thing_searched = Qnil; | |
3415 staticpro_nodump (&last_thing_searched); | |
3416 | |
3417 for (i = 0; i < REGEXP_CACHE_SIZE; ++i) | |
3418 { | |
3419 searchbufs[i].buf.allocated = 100; | |
3420 searchbufs[i].buf.buffer = (unsigned char *) xmalloc (100); | |
3421 searchbufs[i].buf.fastmap = searchbufs[i].fastmap; | |
3422 searchbufs[i].regexp = Qnil; | |
3423 staticpro_nodump (&searchbufs[i].regexp); | |
3424 searchbufs[i].next = (i == REGEXP_CACHE_SIZE-1 ? 0 : &searchbufs[i+1]); | |
3425 } | |
3426 searchbuf_head = &searchbufs[0]; | |
3427 } | |
3428 | |
3429 void | |
3430 vars_of_search (void) | |
3431 { | |
3432 DEFVAR_LISP ("forward-word-regexp", &Vforward_word_regexp /* | |
3433 *Regular expression to be used in `forward-word'. | |
3434 #### Not yet implemented. | |
3435 */ ); | |
3436 Vforward_word_regexp = Qnil; | |
3437 | |
3438 DEFVAR_LISP ("backward-word-regexp", &Vbackward_word_regexp /* | |
3439 *Regular expression to be used in `backward-word'. | |
3440 #### Not yet implemented. | |
3441 */ ); | |
3442 Vbackward_word_regexp = Qnil; | |
502 | 3443 |
3444 DEFVAR_INT ("warn-about-possibly-incompatible-back-references", | |
3445 &warn_about_possibly_incompatible_back_references /* | |
3446 If true, issue warnings when new-semantics back references occur. | |
3447 This is to catch places where old code might inadvertently have changed | |
3448 semantics. This will occur in old code only where more than nine groups | |
3449 occur and a back reference to one of them is directly followed by a digit. | |
3450 */ ); | |
3451 warn_about_possibly_incompatible_back_references = 1; | |
814 | 3452 |
2421 | 3453 Vskip_chars_range_table = Fmake_range_table (Qstart_closed_end_closed); |
428 | 3454 staticpro (&Vskip_chars_range_table); |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
3455 #ifdef DEBUG_XEMACS |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
3456 DEFSYMBOL (Qsearch_algorithm_used); |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
3457 DEFSYMBOL (Qboyer_moore); |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
3458 DEFSYMBOL (Qsimple_search); |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
3459 |
5041 | 3460 DEFSYMBOL (Qcompilation); |
3461 DEFSYMBOL (Qfailure_point); | |
3462 DEFSYMBOL (Qmatching); | |
3463 | |
3464 DEFVAR_INT ("debug-searches", &debug_searches /* | |
4414
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
3465 If non-zero, bind `search-algorithm-used' to `boyer-moore' or `simple-search', |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
3466 depending on the algorithm used for each search. Used for testing. |
df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4407
diff
changeset
|
3467 */ ); |
5041 | 3468 debug_searches = 0; |
3469 | |
3470 DEFVAR_LISP_MAGIC ("debug-regexps", &Vdebug_regexps, /* | |
3471 List of areas to display debug info about during regexp operation. | |
3472 The following areas are recognized: | |
3473 | |
3474 `compilation' Display the result of compiling a regexp. | |
3475 `failure-point' Display info about failure points reached. | |
3476 `matching' Display info about the process of matching a regex against | |
3477 text. | |
3478 */ debug_regexps_changed); | |
3479 Vdebug_regexps = Qnil; | |
3480 debug_regexps = 0; | |
3481 #endif /* DEBUG_XEMACS */ | |
428 | 3482 } |