Mercurial > hg > xemacs-beta
annotate src/search.c @ 4407:4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
2007-12-26 Aidan Kehoe <kehoea@parhasard.net>
* casetab.c:
Extend and correct some case table documentation.
* search.c (search_buffer):
Correct a bug where only the first entry for a character in the
case equivalence table was examined in determining if the
Boyer-Moore search algorithm is appropriate.
If there are case mappings outside of the charset and row of the
characters specified in the search string, those case mappings can
be safely ignored (and Boyer-Moore search can be used) if we know
from the buffer statistics that the corresponding characters cannot
occur.
* search.c (boyer_moore):
Assert that we haven't been passed a string with varying
characters sets or rows within character sets. That's what
simple_search is for.
In the very rare event that a character in the search string has a
canonical case mapping that is not in the same character set and
row, don't try to search for the canonical character, search for
some other character that is in the the desired character set and
row. Assert that the case table isn't corrupt.
Do not search for any character case mappings that cannot possibly
occur in the buffer, given the buffer metadata about its
contents.
author | Aidan Kehoe <kehoea@parhasard.net> |
---|---|
date | Wed, 26 Dec 2007 17:30:16 +0100 |
parents | f70e56bb52a7 |
children | df576f30c1d8 |
rev | line source |
---|---|
428 | 1 /* String search routines for XEmacs. |
2 Copyright (C) 1985, 1986, 1987, 1992-1995 Free Software Foundation, Inc. | |
3 Copyright (C) 1995 Sun Microsystems, Inc. | |
793 | 4 Copyright (C) 2001, 2002 Ben Wing. |
428 | 5 |
6 This file is part of XEmacs. | |
7 | |
8 XEmacs is free software; you can redistribute it and/or modify it | |
9 under the terms of the GNU General Public License as published by the | |
10 Free Software Foundation; either version 2, or (at your option) any | |
11 later version. | |
12 | |
13 XEmacs is distributed in the hope that it will be useful, but WITHOUT | |
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
16 for more details. | |
17 | |
18 You should have received a copy of the GNU General Public License | |
19 along with XEmacs; see the file COPYING. If not, write to | |
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
21 Boston, MA 02111-1307, USA. */ | |
22 | |
23 /* Synched up with: FSF 19.29, except for region-cache stuff. */ | |
24 | |
25 /* Hacked on for Mule by Ben Wing, December 1994 and August 1995. */ | |
26 | |
826 | 27 /* This file has been Mule-ized. */ |
428 | 28 |
29 #include <config.h> | |
30 #include "lisp.h" | |
31 | |
32 #include "buffer.h" | |
33 #include "insdel.h" | |
34 #include "opaque.h" | |
35 #ifdef REGION_CACHE_NEEDS_WORK | |
36 #include "region-cache.h" | |
37 #endif | |
38 #include "syntax.h" | |
39 | |
40 #include <sys/types.h> | |
41 #include "regex.h" | |
446 | 42 #include "casetab.h" |
43 #include "chartab.h" | |
44 | |
45 #define TRANSLATE(table, pos) \ | |
867 | 46 (!NILP (table) ? TRT_TABLE_OF (table, (Ichar) pos) : pos) |
428 | 47 |
48 #define REGEXP_CACHE_SIZE 20 | |
49 | |
50 /* If the regexp is non-nil, then the buffer contains the compiled form | |
51 of that regexp, suitable for searching. */ | |
446 | 52 struct regexp_cache |
53 { | |
428 | 54 struct regexp_cache *next; |
55 Lisp_Object regexp; | |
56 struct re_pattern_buffer buf; | |
57 char fastmap[0400]; | |
58 /* Nonzero means regexp was compiled to do full POSIX backtracking. */ | |
59 char posix; | |
60 }; | |
61 | |
62 /* The instances of that struct. */ | |
63 static struct regexp_cache searchbufs[REGEXP_CACHE_SIZE]; | |
64 | |
65 /* The head of the linked list; points to the most recently used buffer. */ | |
66 static struct regexp_cache *searchbuf_head; | |
67 | |
68 | |
69 /* Every call to re_match, etc., must pass &search_regs as the regs | |
70 argument unless you can show it is unnecessary (i.e., if re_match | |
71 is certainly going to be called again before region-around-match | |
72 can be called). | |
73 | |
74 Since the registers are now dynamically allocated, we need to make | |
75 sure not to refer to the Nth register before checking that it has | |
76 been allocated by checking search_regs.num_regs. | |
77 | |
78 The regex code keeps track of whether it has allocated the search | |
79 buffer using bits in the re_pattern_buffer. This means that whenever | |
80 you compile a new pattern, it completely forgets whether it has | |
81 allocated any registers, and will allocate new registers the next | |
82 time you call a searching or matching function. Therefore, we need | |
83 to call re_set_registers after compiling a new pattern or after | |
84 setting the match registers, so that the regex functions will be | |
85 able to free or re-allocate it properly. */ | |
86 | |
87 /* Note: things get trickier under Mule because the values returned from | |
826 | 88 the regexp routines are in Bytebpos's but we need them to be in Charbpos's. |
428 | 89 We take the easy way out for the moment and just convert them immediately. |
90 We could be more clever by not converting them until necessary, but | |
91 that gets real ugly real fast since the buffer might have changed and | |
92 the positions might be out of sync or out of range. | |
93 */ | |
94 static struct re_registers search_regs; | |
95 | |
1468 | 96 /* Every function that sets the match data _must_ clear unused search |
97 registers on success. An unsuccessful search or match _must_ preserve | |
98 the search registers. The traditional documentation implied that | |
99 any match operation might trash the registers, but in fact failures | |
100 have always preserved the match data (in GNU Emacs as well). Some | |
101 plausible code depends on this behavior (cf. `w3-configuration-data' | |
102 in library "w3-cfg"). | |
103 | |
104 Ordinary string searchs use set_search_regs to set the whole-string | |
105 match. That function takes care of clearing the unused subexpression | |
1425 | 106 registers. |
107 */ | |
108 static void set_search_regs (struct buffer *buf, Charbpos beg, Charcount len); | |
1468 | 109 static void clear_search_regs (void); |
1425 | 110 |
428 | 111 /* The buffer in which the last search was performed, or |
112 Qt if the last search was done in a string; | |
113 Qnil if no searching has been done yet. */ | |
114 static Lisp_Object last_thing_searched; | |
115 | |
116 /* error condition signalled when regexp compile_pattern fails */ | |
117 | |
118 Lisp_Object Qinvalid_regexp; | |
119 | |
120 /* Regular expressions used in forward/backward-word */ | |
121 Lisp_Object Vforward_word_regexp, Vbackward_word_regexp; | |
122 | |
507 | 123 Fixnum warn_about_possibly_incompatible_back_references; |
502 | 124 |
428 | 125 /* range table for use with skip_chars. Only needed for Mule. */ |
126 Lisp_Object Vskip_chars_range_table; | |
127 | |
867 | 128 static Charbpos simple_search (struct buffer *buf, Ibyte *base_pat, |
826 | 129 Bytecount len, Bytebpos pos, Bytebpos lim, |
130 EMACS_INT n, Lisp_Object trt); | |
867 | 131 static Charbpos boyer_moore (struct buffer *buf, Ibyte *base_pat, |
826 | 132 Bytecount len, Bytebpos pos, Bytebpos lim, |
133 EMACS_INT n, Lisp_Object trt, | |
134 Lisp_Object inverse_trt, int charset_base); | |
665 | 135 static Charbpos search_buffer (struct buffer *buf, Lisp_Object str, |
826 | 136 Charbpos charbpos, Charbpos buflim, EMACS_INT n, |
137 int RE, Lisp_Object trt, | |
138 Lisp_Object inverse_trt, int posix); | |
771 | 139 |
2268 | 140 static DECLARE_DOESNT_RETURN (matcher_overflow (void)); |
141 | |
142 static DOESNT_RETURN | |
143 matcher_overflow () | |
428 | 144 { |
563 | 145 stack_overflow ("Stack overflow in regexp matcher", Qunbound); |
428 | 146 } |
147 | |
148 /* Compile a regexp and signal a Lisp error if anything goes wrong. | |
149 PATTERN is the pattern to compile. | |
150 CP is the place to put the result. | |
826 | 151 TRANSLATE is a translation table for ignoring case, or Qnil for none. |
428 | 152 REGP is the structure that says where to store the "register" |
153 values that will result from matching this pattern. | |
154 If it is 0, we should compile the pattern not to record any | |
155 subexpression bounds. | |
156 POSIX is nonzero if we want full backtracking (POSIX style) | |
157 for this pattern. 0 means backtrack only enough to get a valid match. */ | |
158 | |
159 static int | |
160 compile_pattern_1 (struct regexp_cache *cp, Lisp_Object pattern, | |
2286 | 161 struct re_registers *UNUSED (regp), Lisp_Object translate, |
826 | 162 int posix, Error_Behavior errb) |
428 | 163 { |
442 | 164 const char *val; |
428 | 165 reg_syntax_t old; |
166 | |
167 cp->regexp = Qnil; | |
168 cp->buf.translate = translate; | |
169 cp->posix = posix; | |
170 old = re_set_syntax (RE_SYNTAX_EMACS | |
171 | (posix ? 0 : RE_NO_POSIX_BACKTRACKING)); | |
442 | 172 val = (const char *) |
428 | 173 re_compile_pattern ((char *) XSTRING_DATA (pattern), |
174 XSTRING_LENGTH (pattern), &cp->buf); | |
175 re_set_syntax (old); | |
176 if (val) | |
177 { | |
563 | 178 maybe_signal_error (Qinvalid_regexp, 0, build_string (val), |
428 | 179 Qsearch, errb); |
180 return 0; | |
181 } | |
182 | |
183 cp->regexp = Fcopy_sequence (pattern); | |
184 return 1; | |
185 } | |
186 | |
187 /* Compile a regexp if necessary, but first check to see if there's one in | |
188 the cache. | |
189 PATTERN is the pattern to compile. | |
826 | 190 TRANSLATE is a translation table for ignoring case, or Qnil for none. |
428 | 191 REGP is the structure that says where to store the "register" |
192 values that will result from matching this pattern. | |
193 If it is 0, we should compile the pattern not to record any | |
194 subexpression bounds. | |
195 POSIX is nonzero if we want full backtracking (POSIX style) | |
196 for this pattern. 0 means backtrack only enough to get a valid match. */ | |
197 | |
198 struct re_pattern_buffer * | |
199 compile_pattern (Lisp_Object pattern, struct re_registers *regp, | |
2286 | 200 Lisp_Object translate, Lisp_Object UNUSED (searchobj), |
201 struct buffer *UNUSED (searchbuf), int posix, | |
202 Error_Behavior errb) | |
428 | 203 { |
204 struct regexp_cache *cp, **cpp; | |
205 | |
206 for (cpp = &searchbuf_head; ; cpp = &cp->next) | |
207 { | |
208 cp = *cpp; | |
826 | 209 /* &&#### once we fix up the fastmap code in regex.c for 8-bit-fixed, |
210 we need to record and compare the buffer and format, since the | |
211 fastmap will reflect the state of the buffer -- and things get | |
212 more complicated if the buffer has changed formats or (esp.) has | |
213 kept the format but changed its interpretation! may need to have | |
214 the code that changes the interpretation go through and invalidate | |
215 cache entries for that buffer. */ | |
428 | 216 if (!NILP (Fstring_equal (cp->regexp, pattern)) |
446 | 217 && EQ (cp->buf.translate, translate) |
428 | 218 && cp->posix == posix) |
219 break; | |
220 | |
221 /* If we're at the end of the cache, compile into the last cell. */ | |
222 if (cp->next == 0) | |
223 { | |
826 | 224 if (!compile_pattern_1 (cp, pattern, regp, translate, |
225 posix, errb)) | |
428 | 226 return 0; |
227 break; | |
228 } | |
229 } | |
230 | |
231 /* When we get here, cp (aka *cpp) contains the compiled pattern, | |
232 either because we found it in the cache or because we just compiled it. | |
233 Move it to the front of the queue to mark it as most recently used. */ | |
234 *cpp = cp->next; | |
235 cp->next = searchbuf_head; | |
236 searchbuf_head = cp; | |
237 | |
238 /* Advise the searching functions about the space we have allocated | |
239 for register data. */ | |
240 if (regp) | |
241 re_set_registers (&cp->buf, regp, regp->num_regs, regp->start, regp->end); | |
242 | |
243 return &cp->buf; | |
244 } | |
245 | |
246 /* Error condition used for failing searches */ | |
247 Lisp_Object Qsearch_failed; | |
248 | |
2268 | 249 static DECLARE_DOESNT_RETURN (signal_failure (Lisp_Object)); |
250 | |
251 static DOESNT_RETURN | |
428 | 252 signal_failure (Lisp_Object arg) |
253 { | |
446 | 254 for (;;) |
255 Fsignal (Qsearch_failed, list1 (arg)); | |
428 | 256 } |
257 | |
826 | 258 /* Convert the search registers from Bytebpos's to Charbpos's. Needs to be |
428 | 259 done after each regexp match that uses the search regs. |
260 | |
261 We could get a potential speedup by not converting the search registers | |
262 until it's really necessary, e.g. when match-data or replace-match is | |
263 called. However, this complexifies the code a lot (e.g. the buffer | |
826 | 264 could have changed and the Bytebpos's stored might be invalid) and is |
428 | 265 probably not a great time-saver. */ |
266 | |
267 static void | |
268 fixup_search_regs_for_buffer (struct buffer *buf) | |
269 { | |
270 int i; | |
271 int num_regs = search_regs.num_regs; | |
272 | |
273 for (i = 0; i < num_regs; i++) | |
274 { | |
275 if (search_regs.start[i] >= 0) | |
826 | 276 search_regs.start[i] = bytebpos_to_charbpos (buf, |
277 search_regs.start[i]); | |
428 | 278 if (search_regs.end[i] >= 0) |
665 | 279 search_regs.end[i] = bytebpos_to_charbpos (buf, search_regs.end[i]); |
428 | 280 } |
281 } | |
282 | |
283 /* Similar but for strings. */ | |
284 static void | |
285 fixup_search_regs_for_string (Lisp_Object string) | |
286 { | |
287 int i; | |
288 int num_regs = search_regs.num_regs; | |
289 | |
290 /* #### bytecount_to_charcount() is not that efficient. This function | |
867 | 291 could be faster if it did its own conversion (using INC_IBYTEPTR() |
428 | 292 and such), because the register ends are likely to be somewhat ordered. |
293 (Even if not, you could sort them.) | |
294 | |
295 Think about this if this function is a time hog, which it's probably | |
296 not. */ | |
297 for (i = 0; i < num_regs; i++) | |
298 { | |
299 if (search_regs.start[i] > 0) | |
300 { | |
301 search_regs.start[i] = | |
793 | 302 string_index_byte_to_char (string, search_regs.start[i]); |
428 | 303 } |
304 if (search_regs.end[i] > 0) | |
305 { | |
306 search_regs.end[i] = | |
793 | 307 string_index_byte_to_char (string, search_regs.end[i]); |
428 | 308 } |
309 } | |
310 } | |
311 | |
312 | |
313 static Lisp_Object | |
314 looking_at_1 (Lisp_Object string, struct buffer *buf, int posix) | |
315 { | |
316 Lisp_Object val; | |
665 | 317 Bytebpos p1, p2; |
428 | 318 Bytecount s1, s2; |
319 REGISTER int i; | |
320 struct re_pattern_buffer *bufp; | |
826 | 321 struct syntax_cache scache_struct; |
322 struct syntax_cache *scache = &scache_struct; | |
323 | |
428 | 324 CHECK_STRING (string); |
325 bufp = compile_pattern (string, &search_regs, | |
326 (!NILP (buf->case_fold_search) | |
446 | 327 ? XCASE_TABLE_DOWNCASE (buf->case_table) : Qnil), |
826 | 328 wrap_buffer (buf), buf, posix, ERROR_ME); |
428 | 329 |
330 QUIT; | |
331 | |
332 /* Get pointers and sizes of the two strings | |
333 that make up the visible portion of the buffer. */ | |
334 | |
826 | 335 p1 = BYTE_BUF_BEGV (buf); |
336 p2 = BYTE_BUF_CEILING_OF (buf, p1); | |
428 | 337 s1 = p2 - p1; |
826 | 338 s2 = BYTE_BUF_ZV (buf) - p2; |
339 | |
340 /* By making the regex object, regex buffer, and syntax cache arguments | |
341 to re_{search,match}{,_2}, we've removed the need to do nasty things | |
342 to deal with regex reentrancy. (See stack trace in signal.c for proof | |
343 that this can happen.) | |
344 | |
345 #### there is still a potential problem with the regex cache -- | |
346 the compiled regex could be overwritten. we'd need 20-fold | |
347 reentrancy, though. Fix this. */ | |
348 | |
349 i = re_match_2 (bufp, (char *) BYTE_BUF_BYTE_ADDRESS (buf, p1), | |
350 s1, (char *) BYTE_BUF_BYTE_ADDRESS (buf, p2), s2, | |
351 BYTE_BUF_PT (buf) - BYTE_BUF_BEGV (buf), &search_regs, | |
352 BYTE_BUF_ZV (buf) - BYTE_BUF_BEGV (buf), wrap_buffer (buf), | |
353 buf, scache); | |
428 | 354 |
355 if (i == -2) | |
356 matcher_overflow (); | |
357 | |
358 val = (0 <= i ? Qt : Qnil); | |
359 if (NILP (val)) | |
826 | 360 return Qnil; |
428 | 361 { |
362 int num_regs = search_regs.num_regs; | |
363 for (i = 0; i < num_regs; i++) | |
364 if (search_regs.start[i] >= 0) | |
365 { | |
826 | 366 search_regs.start[i] += BYTE_BUF_BEGV (buf); |
367 search_regs.end[i] += BYTE_BUF_BEGV (buf); | |
428 | 368 } |
369 } | |
793 | 370 last_thing_searched = wrap_buffer (buf); |
428 | 371 fixup_search_regs_for_buffer (buf); |
826 | 372 return val; |
428 | 373 } |
374 | |
375 DEFUN ("looking-at", Flooking_at, 1, 2, 0, /* | |
376 Return t if text after point matches regular expression REGEXP. | |
1468 | 377 When the match is successful, this function modifies the match data |
378 that `match-beginning', `match-end' and `match-data' access; save the | |
379 match data with `match-data' and restore it with `store-match-data' if | |
380 you want to preserve them. If the match fails, the match data from the | |
381 previous success match is preserved. | |
428 | 382 |
383 Optional argument BUFFER defaults to the current buffer. | |
384 */ | |
385 (regexp, buffer)) | |
386 { | |
387 return looking_at_1 (regexp, decode_buffer (buffer, 0), 0); | |
388 } | |
389 | |
390 DEFUN ("posix-looking-at", Fposix_looking_at, 1, 2, 0, /* | |
391 Return t if text after point matches regular expression REGEXP. | |
392 Find the longest match, in accord with Posix regular expression rules. | |
1468 | 393 When the match is successful, this function modifies the match data |
394 that `match-beginning', `match-end' and `match-data' access; save the | |
395 match data with `match-data' and restore it with `store-match-data' if | |
396 you want to preserve them. If the match fails, the match data from the | |
397 previous success match is preserved. | |
428 | 398 |
399 Optional argument BUFFER defaults to the current buffer. | |
400 */ | |
401 (regexp, buffer)) | |
402 { | |
826 | 403 return looking_at_1 (regexp, decode_buffer (buffer, 0), 1); |
428 | 404 } |
405 | |
406 static Lisp_Object | |
407 string_match_1 (Lisp_Object regexp, Lisp_Object string, Lisp_Object start, | |
2286 | 408 struct buffer *buf, int UNUSED (posix)) |
428 | 409 { |
410 Bytecount val; | |
411 Charcount s; | |
412 struct re_pattern_buffer *bufp; | |
413 | |
853 | 414 /* Some FSF junk with running_asynch_code, to preserve the match |
415 data. Not necessary because we don't call process filters | |
416 asynchronously (i.e. from within QUIT). */ | |
428 | 417 |
418 CHECK_STRING (regexp); | |
419 CHECK_STRING (string); | |
420 | |
421 if (NILP (start)) | |
422 s = 0; | |
423 else | |
424 { | |
826 | 425 Charcount len = string_char_length (string); |
428 | 426 |
427 CHECK_INT (start); | |
428 s = XINT (start); | |
429 if (s < 0 && -s <= len) | |
430 s = len + s; | |
431 else if (0 > s || s > len) | |
432 args_out_of_range (string, start); | |
433 } | |
434 | |
435 | |
436 bufp = compile_pattern (regexp, &search_regs, | |
437 (!NILP (buf->case_fold_search) | |
446 | 438 ? XCASE_TABLE_DOWNCASE (buf->case_table) : Qnil), |
826 | 439 string, buf, 0, ERROR_ME); |
428 | 440 QUIT; |
441 { | |
793 | 442 Bytecount bis = string_index_char_to_byte (string, s); |
826 | 443 struct syntax_cache scache_struct; |
444 struct syntax_cache *scache = &scache_struct; | |
445 | |
446 /* By making the regex object, regex buffer, and syntax cache arguments | |
447 to re_{search,match}{,_2}, we've removed the need to do nasty things | |
448 to deal with regex reentrancy. (See stack trace in signal.c for proof | |
449 that this can happen.) | |
450 | |
451 #### there is still a potential problem with the regex cache -- | |
452 the compiled regex could be overwritten. we'd need 20-fold | |
453 reentrancy, though. Fix this. */ | |
454 | |
428 | 455 val = re_search (bufp, (char *) XSTRING_DATA (string), |
456 XSTRING_LENGTH (string), bis, | |
457 XSTRING_LENGTH (string) - bis, | |
826 | 458 &search_regs, string, buf, scache); |
428 | 459 } |
460 if (val == -2) | |
461 matcher_overflow (); | |
826 | 462 if (val < 0) return Qnil; |
428 | 463 last_thing_searched = Qt; |
464 fixup_search_regs_for_string (string); | |
826 | 465 return make_int (string_index_byte_to_char (string, val)); |
428 | 466 } |
467 | |
468 DEFUN ("string-match", Fstring_match, 2, 4, 0, /* | |
469 Return index of start of first match for REGEXP in STRING, or nil. | |
470 If third arg START is non-nil, start search at that index in STRING. | |
471 For index of first char beyond the match, do (match-end 0). | |
472 `match-end' and `match-beginning' also give indices of substrings | |
473 matched by parenthesis constructs in the pattern. | |
474 | |
826 | 475 Optional arg BUFFER controls how case folding and syntax and category |
476 lookup is done (according to the value of `case-fold-search' in that buffer | |
477 and that buffer's case tables, syntax tables, and category table). If nil | |
478 or unspecified, it defaults *NOT* to the current buffer but instead: | |
479 | |
480 -- the value of `case-fold-search' in the current buffer is still respected | |
481 because of idioms like | |
482 | |
483 (let ((case-fold-search nil)) | |
484 (string-match "^foo.*bar" string)) | |
485 | |
486 but the case, syntax, and category tables come from the standard tables, | |
1468 | 487 which are accessed through functions `default-{case,syntax,category}-table' |
488 and serve as the parents of the tables in particular buffer. | |
489 | |
490 When the match is successful, this function modifies the match data | |
491 that `match-beginning', `match-end' and `match-data' access; save the | |
492 match data with `match-data' and restore it with `store-match-data' if | |
493 you want to preserve them. If the match fails, the match data from the | |
494 previous success match is preserved. | |
428 | 495 */ |
496 (regexp, string, start, buffer)) | |
497 { | |
826 | 498 /* &&#### implement new interp for buffer arg; check code to see if it |
499 makes more sense than prev */ | |
428 | 500 return string_match_1 (regexp, string, start, decode_buffer (buffer, 0), 0); |
501 } | |
502 | |
503 DEFUN ("posix-string-match", Fposix_string_match, 2, 4, 0, /* | |
504 Return index of start of first match for REGEXP in STRING, or nil. | |
505 Find the longest match, in accord with Posix regular expression rules. | |
506 If third arg START is non-nil, start search at that index in STRING. | |
507 For index of first char beyond the match, do (match-end 0). | |
508 `match-end' and `match-beginning' also give indices of substrings | |
509 matched by parenthesis constructs in the pattern. | |
510 | |
511 Optional arg BUFFER controls how case folding is done (according to | |
512 the value of `case-fold-search' in that buffer and that buffer's case | |
513 tables) and defaults to the current buffer. | |
1468 | 514 |
515 When the match is successful, this function modifies the match data | |
516 that `match-beginning', `match-end' and `match-data' access; save the | |
517 match data with `match-data' and restore it with `store-match-data' if | |
518 you want to preserve them. If the match fails, the match data from the | |
519 previous success match is preserved. | |
428 | 520 */ |
521 (regexp, string, start, buffer)) | |
522 { | |
523 return string_match_1 (regexp, string, start, decode_buffer (buffer, 0), 1); | |
524 } | |
525 | |
526 /* Match REGEXP against STRING, searching all of STRING, | |
527 and return the index of the match, or negative on failure. | |
528 This does not clobber the match data. */ | |
529 | |
530 Bytecount | |
1347 | 531 fast_string_match (Lisp_Object regexp, const Ibyte *nonreloc, |
428 | 532 Lisp_Object reloc, Bytecount offset, |
533 Bytecount length, int case_fold_search, | |
578 | 534 Error_Behavior errb, int no_quit) |
428 | 535 { |
536 Bytecount val; | |
867 | 537 Ibyte *newnonreloc = (Ibyte *) nonreloc; |
428 | 538 struct re_pattern_buffer *bufp; |
826 | 539 struct syntax_cache scache_struct; |
540 struct syntax_cache *scache = &scache_struct; | |
428 | 541 |
542 bufp = compile_pattern (regexp, 0, | |
543 (case_fold_search | |
771 | 544 ? XCASE_TABLE_DOWNCASE (Vstandard_case_table) |
446 | 545 : Qnil), |
826 | 546 reloc, 0, 0, errb); |
428 | 547 if (!bufp) |
548 return -1; /* will only do this when errb != ERROR_ME */ | |
549 if (!no_quit) | |
550 QUIT; | |
551 else | |
552 no_quit_in_re_search = 1; | |
553 | |
554 fixup_internal_substring (nonreloc, reloc, offset, &length); | |
555 | |
771 | 556 /* Don't need to protect against GC inside of re_search() due to QUIT; |
557 QUIT is GC-inhibited. */ | |
428 | 558 if (!NILP (reloc)) |
771 | 559 newnonreloc = XSTRING_DATA (reloc); |
560 | |
826 | 561 /* By making the regex object, regex buffer, and syntax cache arguments |
562 to re_{search,match}{,_2}, we've removed the need to do nasty things | |
563 to deal with regex reentrancy. (See stack trace in signal.c for proof | |
564 that this can happen.) | |
565 | |
566 #### there is still a potential problem with the regex cache -- | |
567 the compiled regex could be overwritten. we'd need 20-fold | |
568 reentrancy, though. Fix this. */ | |
569 | |
428 | 570 val = re_search (bufp, (char *) newnonreloc + offset, length, 0, |
826 | 571 length, 0, reloc, 0, scache); |
428 | 572 |
573 no_quit_in_re_search = 0; | |
574 return val; | |
575 } | |
576 | |
577 Bytecount | |
578 fast_lisp_string_match (Lisp_Object regex, Lisp_Object string) | |
579 { | |
580 return fast_string_match (regex, 0, string, 0, -1, 0, ERROR_ME, 0); | |
581 } | |
582 | |
583 | |
584 #ifdef REGION_CACHE_NEEDS_WORK | |
585 /* The newline cache: remembering which sections of text have no newlines. */ | |
586 | |
587 /* If the user has requested newline caching, make sure it's on. | |
588 Otherwise, make sure it's off. | |
589 This is our cheezy way of associating an action with the change of | |
590 state of a buffer-local variable. */ | |
591 static void | |
592 newline_cache_on_off (struct buffer *buf) | |
593 { | |
594 if (NILP (buf->cache_long_line_scans)) | |
595 { | |
596 /* It should be off. */ | |
597 if (buf->newline_cache) | |
598 { | |
599 free_region_cache (buf->newline_cache); | |
600 buf->newline_cache = 0; | |
601 } | |
602 } | |
603 else | |
604 { | |
605 /* It should be on. */ | |
606 if (buf->newline_cache == 0) | |
607 buf->newline_cache = new_region_cache (); | |
608 } | |
609 } | |
610 #endif | |
611 | |
612 /* Search in BUF for COUNT instances of the character TARGET between | |
613 START and END. | |
614 | |
615 If COUNT is positive, search forwards; END must be >= START. | |
616 If COUNT is negative, search backwards for the -COUNTth instance; | |
617 END must be <= START. | |
618 If COUNT is zero, do anything you please; run rogue, for all I care. | |
619 | |
620 If END is zero, use BEGV or ZV instead, as appropriate for the | |
621 direction indicated by COUNT. | |
622 | |
623 If we find COUNT instances, set *SHORTAGE to zero, and return the | |
624 position after the COUNTth match. Note that for reverse motion | |
625 this is not the same as the usual convention for Emacs motion commands. | |
626 | |
627 If we don't find COUNT instances before reaching END, set *SHORTAGE | |
628 to the number of TARGETs left unfound, and return END. | |
629 | |
630 If ALLOW_QUIT is non-zero, call QUIT periodically. */ | |
631 | |
665 | 632 static Bytebpos |
867 | 633 byte_scan_buffer (struct buffer *buf, Ichar target, Bytebpos st, Bytebpos en, |
872 | 634 EMACS_INT count, EMACS_INT *shortage, int allow_quit) |
428 | 635 { |
665 | 636 Bytebpos lim = en > 0 ? en : |
826 | 637 ((count > 0) ? BYTE_BUF_ZV (buf) : BYTE_BUF_BEGV (buf)); |
428 | 638 |
639 /* #### newline cache stuff in this function not yet ported */ | |
640 assert (count != 0); | |
641 | |
642 if (shortage) | |
643 *shortage = 0; | |
644 | |
645 if (count > 0) | |
646 { | |
647 #ifdef MULE | |
826 | 648 Internal_Format fmt = buf->text->format; |
649 /* Check for char that's unrepresentable in the buffer -- it | |
650 certainly can't be there. */ | |
867 | 651 if (!ichar_fits_in_format (target, fmt, wrap_buffer (buf))) |
428 | 652 { |
826 | 653 *shortage = count; |
654 return lim; | |
655 } | |
656 /* Due to the Mule representation of characters in a buffer, we can | |
657 simply search for characters in the range 0 - 127 directly; for | |
658 8-bit-fixed, we can do this for all characters. In other cases, | |
659 we do it the "hard" way. Note that this way works for all | |
660 characters and all formats, but the other way is faster. */ | |
661 else if (! (fmt == FORMAT_8_BIT_FIXED || | |
867 | 662 (fmt == FORMAT_DEFAULT && ichar_ascii_p (target)))) |
826 | 663 { |
867 | 664 Raw_Ichar raw = ichar_to_raw (target, fmt, wrap_buffer (buf)); |
428 | 665 while (st < lim && count > 0) |
666 { | |
826 | 667 if (BYTE_BUF_FETCH_CHAR_RAW (buf, st) == raw) |
428 | 668 count--; |
665 | 669 INC_BYTEBPOS (buf, st); |
428 | 670 } |
671 } | |
672 else | |
673 #endif | |
674 { | |
867 | 675 Raw_Ichar raw = ichar_to_raw (target, fmt, wrap_buffer (buf)); |
428 | 676 while (st < lim && count > 0) |
677 { | |
665 | 678 Bytebpos ceil; |
867 | 679 Ibyte *bufptr; |
428 | 680 |
826 | 681 ceil = BYTE_BUF_CEILING_OF (buf, st); |
428 | 682 ceil = min (lim, ceil); |
867 | 683 bufptr = (Ibyte *) memchr (BYTE_BUF_BYTE_ADDRESS (buf, st), |
826 | 684 raw, ceil - st); |
428 | 685 if (bufptr) |
686 { | |
687 count--; | |
826 | 688 st = BYTE_BUF_PTR_BYTE_POS (buf, bufptr) + 1; |
428 | 689 } |
690 else | |
691 st = ceil; | |
692 } | |
693 } | |
694 | |
695 if (shortage) | |
696 *shortage = count; | |
697 if (allow_quit) | |
698 QUIT; | |
699 return st; | |
700 } | |
701 else | |
702 { | |
703 #ifdef MULE | |
826 | 704 Internal_Format fmt = buf->text->format; |
705 /* Check for char that's unrepresentable in the buffer -- it | |
706 certainly can't be there. */ | |
867 | 707 if (!ichar_fits_in_format (target, fmt, wrap_buffer (buf))) |
428 | 708 { |
826 | 709 *shortage = -count; |
710 return lim; | |
711 } | |
712 else if (! (fmt == FORMAT_8_BIT_FIXED || | |
867 | 713 (fmt == FORMAT_DEFAULT && ichar_ascii_p (target)))) |
826 | 714 { |
867 | 715 Raw_Ichar raw = ichar_to_raw (target, fmt, wrap_buffer (buf)); |
428 | 716 while (st > lim && count < 0) |
717 { | |
665 | 718 DEC_BYTEBPOS (buf, st); |
826 | 719 if (BYTE_BUF_FETCH_CHAR_RAW (buf, st) == raw) |
428 | 720 count++; |
721 } | |
722 } | |
723 else | |
724 #endif | |
725 { | |
867 | 726 Raw_Ichar raw = ichar_to_raw (target, fmt, wrap_buffer (buf)); |
428 | 727 while (st > lim && count < 0) |
728 { | |
665 | 729 Bytebpos floor; |
867 | 730 Ibyte *bufptr; |
731 Ibyte *floorptr; | |
428 | 732 |
826 | 733 floor = BYTE_BUF_FLOOR_OF (buf, st); |
428 | 734 floor = max (lim, floor); |
735 /* No memrchr() ... */ | |
826 | 736 bufptr = BYTE_BUF_BYTE_ADDRESS_BEFORE (buf, st); |
737 floorptr = BYTE_BUF_BYTE_ADDRESS (buf, floor); | |
428 | 738 while (bufptr >= floorptr) |
739 { | |
740 st--; | |
741 /* At this point, both ST and BUFPTR refer to the same | |
742 character. When the loop terminates, ST will | |
743 always point to the last character we tried. */ | |
867 | 744 if (*bufptr == (Ibyte) raw) |
428 | 745 { |
746 count++; | |
747 break; | |
748 } | |
749 bufptr--; | |
750 } | |
751 } | |
752 } | |
753 | |
754 if (shortage) | |
755 *shortage = -count; | |
756 if (allow_quit) | |
757 QUIT; | |
758 if (count) | |
759 return st; | |
760 else | |
761 { | |
762 /* We found the character we were looking for; we have to return | |
763 the position *after* it due to the strange way that the return | |
764 value is defined. */ | |
665 | 765 INC_BYTEBPOS (buf, st); |
428 | 766 return st; |
767 } | |
768 } | |
769 } | |
770 | |
665 | 771 Charbpos |
867 | 772 scan_buffer (struct buffer *buf, Ichar target, Charbpos start, Charbpos end, |
428 | 773 EMACS_INT count, EMACS_INT *shortage, int allow_quit) |
774 { | |
826 | 775 Bytebpos byte_retval; |
776 Bytebpos byte_start, byte_end; | |
777 | |
778 byte_start = charbpos_to_bytebpos (buf, start); | |
428 | 779 if (end) |
826 | 780 byte_end = charbpos_to_bytebpos (buf, end); |
428 | 781 else |
826 | 782 byte_end = 0; |
783 byte_retval = byte_scan_buffer (buf, target, byte_start, byte_end, count, | |
428 | 784 shortage, allow_quit); |
826 | 785 return bytebpos_to_charbpos (buf, byte_retval); |
428 | 786 } |
787 | |
665 | 788 Bytebpos |
826 | 789 byte_find_next_newline_no_quit (struct buffer *buf, Bytebpos from, int count) |
428 | 790 { |
826 | 791 return byte_scan_buffer (buf, '\n', from, 0, count, 0, 0); |
428 | 792 } |
793 | |
665 | 794 Charbpos |
795 find_next_newline_no_quit (struct buffer *buf, Charbpos from, int count) | |
428 | 796 { |
797 return scan_buffer (buf, '\n', from, 0, count, 0, 0); | |
798 } | |
799 | |
665 | 800 Charbpos |
801 find_next_newline (struct buffer *buf, Charbpos from, int count) | |
428 | 802 { |
803 return scan_buffer (buf, '\n', from, 0, count, 0, 1); | |
804 } | |
805 | |
826 | 806 Bytecount |
867 | 807 byte_find_next_ichar_in_string (Lisp_Object str, Ichar target, Bytecount st, |
428 | 808 EMACS_INT count) |
809 { | |
793 | 810 Bytebpos lim = XSTRING_LENGTH (str) -1; |
867 | 811 Ibyte *s = XSTRING_DATA (str); |
428 | 812 |
813 assert (count >= 0); | |
814 | |
815 #ifdef MULE | |
816 /* Due to the Mule representation of characters in a buffer, | |
817 we can simply search for characters in the range 0 - 127 | |
818 directly. For other characters, we do it the "hard" way. | |
819 Note that this way works for all characters but the other | |
820 way is faster. */ | |
821 if (target >= 0200) | |
822 { | |
823 while (st < lim && count > 0) | |
824 { | |
867 | 825 if (string_ichar (str, st) == target) |
428 | 826 count--; |
826 | 827 INC_BYTECOUNT (s, st); |
428 | 828 } |
829 } | |
830 else | |
831 #endif | |
832 { | |
833 while (st < lim && count > 0) | |
834 { | |
867 | 835 Ibyte *bufptr = (Ibyte *) memchr (itext_n_addr (s, st), |
428 | 836 (int) target, lim - st); |
837 if (bufptr) | |
838 { | |
839 count--; | |
826 | 840 st = (Bytebpos) (bufptr - s) + 1; |
428 | 841 } |
842 else | |
843 st = lim; | |
844 } | |
845 } | |
846 return st; | |
847 } | |
848 | |
849 /* Like find_next_newline, but returns position before the newline, | |
850 not after, and only search up to TO. This isn't just | |
851 find_next_newline (...)-1, because you might hit TO. */ | |
665 | 852 Charbpos |
826 | 853 find_before_next_newline (struct buffer *buf, Charbpos from, Charbpos to, |
854 int count) | |
428 | 855 { |
856 EMACS_INT shortage; | |
665 | 857 Charbpos pos = scan_buffer (buf, '\n', from, to, count, &shortage, 1); |
428 | 858 |
859 if (shortage == 0) | |
860 pos--; | |
861 | |
862 return pos; | |
863 } | |
864 | |
872 | 865 /* This function synched with FSF 21.1 */ |
428 | 866 static Lisp_Object |
867 skip_chars (struct buffer *buf, int forwardp, int syntaxp, | |
868 Lisp_Object string, Lisp_Object lim) | |
869 { | |
867 | 870 REGISTER Ibyte *p, *pend; |
871 REGISTER Ichar c; | |
428 | 872 /* We store the first 256 chars in an array here and the rest in |
873 a range table. */ | |
874 unsigned char fastmap[0400]; | |
875 int negate = 0; | |
876 REGISTER int i; | |
665 | 877 Charbpos limit; |
826 | 878 struct syntax_cache *scache; |
879 | |
428 | 880 if (NILP (lim)) |
881 limit = forwardp ? BUF_ZV (buf) : BUF_BEGV (buf); | |
882 else | |
883 { | |
884 CHECK_INT_COERCE_MARKER (lim); | |
885 limit = XINT (lim); | |
886 | |
887 /* In any case, don't allow scan outside bounds of buffer. */ | |
888 if (limit > BUF_ZV (buf)) limit = BUF_ZV (buf); | |
889 if (limit < BUF_BEGV (buf)) limit = BUF_BEGV (buf); | |
890 } | |
891 | |
892 CHECK_STRING (string); | |
893 p = XSTRING_DATA (string); | |
894 pend = p + XSTRING_LENGTH (string); | |
895 memset (fastmap, 0, sizeof (fastmap)); | |
896 | |
897 Fclear_range_table (Vskip_chars_range_table); | |
898 | |
899 if (p != pend && *p == '^') | |
900 { | |
901 negate = 1; | |
902 p++; | |
903 } | |
904 | |
905 /* Find the characters specified and set their elements of fastmap. | |
906 If syntaxp, each character counts as itself. | |
907 Otherwise, handle backslashes and ranges specially */ | |
908 | |
909 while (p != pend) | |
910 { | |
867 | 911 c = itext_ichar (p); |
912 INC_IBYTEPTR (p); | |
428 | 913 if (syntaxp) |
914 { | |
915 if (c < 0400 && syntax_spec_code[c] < (unsigned char) Smax) | |
916 fastmap[c] = 1; | |
917 else | |
831 | 918 invalid_argument ("Invalid syntax designator", make_char (c)); |
428 | 919 } |
920 else | |
921 { | |
922 if (c == '\\') | |
923 { | |
924 if (p == pend) break; | |
867 | 925 c = itext_ichar (p); |
926 INC_IBYTEPTR (p); | |
428 | 927 } |
928 if (p != pend && *p == '-') | |
929 { | |
867 | 930 Ichar cend; |
428 | 931 |
872 | 932 /* Skip over the dash. */ |
428 | 933 p++; |
934 if (p == pend) break; | |
867 | 935 cend = itext_ichar (p); |
428 | 936 while (c <= cend && c < 0400) |
937 { | |
938 fastmap[c] = 1; | |
939 c++; | |
940 } | |
941 if (c <= cend) | |
942 Fput_range_table (make_int (c), make_int (cend), Qt, | |
943 Vskip_chars_range_table); | |
867 | 944 INC_IBYTEPTR (p); |
428 | 945 } |
946 else | |
947 { | |
948 if (c < 0400) | |
949 fastmap[c] = 1; | |
950 else | |
951 Fput_range_table (make_int (c), make_int (c), Qt, | |
952 Vskip_chars_range_table); | |
953 } | |
954 } | |
955 } | |
956 | |
872 | 957 /* #### Not in FSF 21.1 */ |
428 | 958 if (syntaxp && fastmap['-'] != 0) |
959 fastmap[' '] = 1; | |
960 | |
961 /* If ^ was the first character, complement the fastmap. | |
962 We don't complement the range table, however; we just use negate | |
963 in the comparisons below. */ | |
964 | |
965 if (negate) | |
647 | 966 for (i = 0; i < (int) (sizeof (fastmap)); i++) |
428 | 967 fastmap[i] ^= 1; |
968 | |
969 { | |
665 | 970 Charbpos start_point = BUF_PT (buf); |
872 | 971 Charbpos pos = start_point; |
972 Charbpos pos_byte = BYTE_BUF_PT (buf); | |
428 | 973 |
974 if (syntaxp) | |
975 { | |
872 | 976 scache = setup_buffer_syntax_cache (buf, pos, forwardp ? 1 : -1); |
428 | 977 /* All syntax designators are normal chars so nothing strange |
978 to worry about */ | |
979 if (forwardp) | |
980 { | |
872 | 981 if (pos < limit) |
982 while (fastmap[(unsigned char) | |
983 syntax_code_spec | |
984 [(int) SYNTAX_FROM_CACHE | |
985 (scache, BYTE_BUF_FETCH_CHAR (buf, pos_byte))]]) | |
986 { | |
987 pos++; | |
988 INC_BYTEBPOS (buf, pos_byte); | |
879 | 989 if (pos >= limit) |
872 | 990 break; |
991 UPDATE_SYNTAX_CACHE_FORWARD (scache, pos); | |
992 } | |
428 | 993 } |
994 else | |
995 { | |
872 | 996 while (pos > limit) |
460 | 997 { |
872 | 998 Charbpos savepos = pos_byte; |
999 pos--; | |
1000 DEC_BYTEBPOS (buf, pos_byte); | |
1001 UPDATE_SYNTAX_CACHE_BACKWARD (scache, pos); | |
1002 if (!fastmap[(unsigned char) | |
1003 syntax_code_spec | |
1004 [(int) SYNTAX_FROM_CACHE | |
1005 (scache, BYTE_BUF_FETCH_CHAR (buf, pos_byte))]]) | |
1006 { | |
1007 pos++; | |
1008 pos_byte = savepos; | |
1009 break; | |
1010 } | |
460 | 1011 } |
428 | 1012 } |
1013 } | |
1014 else | |
1015 { | |
1016 if (forwardp) | |
1017 { | |
872 | 1018 while (pos < limit) |
428 | 1019 { |
872 | 1020 Ichar ch = BYTE_BUF_FETCH_CHAR (buf, pos_byte); |
428 | 1021 if ((ch < 0400) ? fastmap[ch] : |
1022 (NILP (Fget_range_table (make_int (ch), | |
1023 Vskip_chars_range_table, | |
1024 Qnil)) | |
1025 == negate)) | |
872 | 1026 { |
1027 pos++; | |
1028 INC_BYTEBPOS (buf, pos_byte); | |
1029 } | |
428 | 1030 else |
1031 break; | |
1032 } | |
1033 } | |
1034 else | |
1035 { | |
872 | 1036 while (pos > limit) |
428 | 1037 { |
872 | 1038 Charbpos prev_pos_byte = pos_byte; |
1039 Ichar ch; | |
1040 | |
1041 DEC_BYTEBPOS (buf, prev_pos_byte); | |
1042 ch = BYTE_BUF_FETCH_CHAR (buf, prev_pos_byte); | |
428 | 1043 if ((ch < 0400) ? fastmap[ch] : |
1044 (NILP (Fget_range_table (make_int (ch), | |
1045 Vskip_chars_range_table, | |
1046 Qnil)) | |
1047 == negate)) | |
872 | 1048 { |
1049 pos--; | |
1050 pos_byte = prev_pos_byte; | |
1051 } | |
428 | 1052 else |
1053 break; | |
1054 } | |
1055 } | |
1056 } | |
1057 QUIT; | |
872 | 1058 BOTH_BUF_SET_PT (buf, pos, pos_byte); |
428 | 1059 return make_int (BUF_PT (buf) - start_point); |
1060 } | |
1061 } | |
1062 | |
1063 DEFUN ("skip-chars-forward", Fskip_chars_forward, 1, 3, 0, /* | |
444 | 1064 Move point forward, stopping before a char not in STRING, or at pos LIMIT. |
428 | 1065 STRING is like the inside of a `[...]' in a regular expression |
1066 except that `]' is never special and `\\' quotes `^', `-' or `\\'. | |
1067 Thus, with arg "a-zA-Z", this skips letters stopping before first nonletter. | |
1068 With arg "^a-zA-Z", skips nonletters stopping before first letter. | |
1069 Returns the distance traveled, either zero or positive. | |
1070 | |
1071 Optional argument BUFFER defaults to the current buffer. | |
1072 */ | |
444 | 1073 (string, limit, buffer)) |
428 | 1074 { |
444 | 1075 return skip_chars (decode_buffer (buffer, 0), 1, 0, string, limit); |
428 | 1076 } |
1077 | |
1078 DEFUN ("skip-chars-backward", Fskip_chars_backward, 1, 3, 0, /* | |
444 | 1079 Move point backward, stopping after a char not in STRING, or at pos LIMIT. |
428 | 1080 See `skip-chars-forward' for details. |
1081 Returns the distance traveled, either zero or negative. | |
1082 | |
1083 Optional argument BUFFER defaults to the current buffer. | |
1084 */ | |
444 | 1085 (string, limit, buffer)) |
428 | 1086 { |
444 | 1087 return skip_chars (decode_buffer (buffer, 0), 0, 0, string, limit); |
428 | 1088 } |
1089 | |
1090 | |
1091 DEFUN ("skip-syntax-forward", Fskip_syntax_forward, 1, 3, 0, /* | |
1092 Move point forward across chars in specified syntax classes. | |
1093 SYNTAX is a string of syntax code characters. | |
444 | 1094 Stop before a char whose syntax is not in SYNTAX, or at position LIMIT. |
428 | 1095 If SYNTAX starts with ^, skip characters whose syntax is NOT in SYNTAX. |
1096 This function returns the distance traveled, either zero or positive. | |
1097 | |
1098 Optional argument BUFFER defaults to the current buffer. | |
1099 */ | |
444 | 1100 (syntax, limit, buffer)) |
428 | 1101 { |
444 | 1102 return skip_chars (decode_buffer (buffer, 0), 1, 1, syntax, limit); |
428 | 1103 } |
1104 | |
1105 DEFUN ("skip-syntax-backward", Fskip_syntax_backward, 1, 3, 0, /* | |
1106 Move point backward across chars in specified syntax classes. | |
1107 SYNTAX is a string of syntax code characters. | |
444 | 1108 Stop on reaching a char whose syntax is not in SYNTAX, or at position LIMIT. |
428 | 1109 If SYNTAX starts with ^, skip characters whose syntax is NOT in SYNTAX. |
1110 This function returns the distance traveled, either zero or negative. | |
1111 | |
1112 Optional argument BUFFER defaults to the current buffer. | |
1113 */ | |
444 | 1114 (syntax, limit, buffer)) |
428 | 1115 { |
444 | 1116 return skip_chars (decode_buffer (buffer, 0), 0, 1, syntax, limit); |
428 | 1117 } |
1118 | |
1119 | |
1120 /* Subroutines of Lisp buffer search functions. */ | |
1121 | |
1122 static Lisp_Object | |
444 | 1123 search_command (Lisp_Object string, Lisp_Object limit, Lisp_Object noerror, |
428 | 1124 Lisp_Object count, Lisp_Object buffer, int direction, |
1125 int RE, int posix) | |
1126 { | |
665 | 1127 REGISTER Charbpos np; |
1128 Charbpos lim; | |
428 | 1129 EMACS_INT n = direction; |
1130 struct buffer *buf; | |
1131 | |
1132 if (!NILP (count)) | |
1133 { | |
1134 CHECK_INT (count); | |
1135 n *= XINT (count); | |
1136 } | |
1137 | |
1138 buf = decode_buffer (buffer, 0); | |
1139 CHECK_STRING (string); | |
444 | 1140 if (NILP (limit)) |
428 | 1141 lim = n > 0 ? BUF_ZV (buf) : BUF_BEGV (buf); |
1142 else | |
1143 { | |
444 | 1144 CHECK_INT_COERCE_MARKER (limit); |
1145 lim = XINT (limit); | |
428 | 1146 if (n > 0 ? lim < BUF_PT (buf) : lim > BUF_PT (buf)) |
563 | 1147 invalid_argument ("Invalid search limit (wrong side of point)", |
1148 Qunbound); | |
428 | 1149 if (lim > BUF_ZV (buf)) |
1150 lim = BUF_ZV (buf); | |
1151 if (lim < BUF_BEGV (buf)) | |
1152 lim = BUF_BEGV (buf); | |
1153 } | |
1154 | |
1155 np = search_buffer (buf, string, BUF_PT (buf), lim, n, RE, | |
1156 (!NILP (buf->case_fold_search) | |
446 | 1157 ? XCASE_TABLE_CANON (buf->case_table) |
1158 : Qnil), | |
428 | 1159 (!NILP (buf->case_fold_search) |
446 | 1160 ? XCASE_TABLE_EQV (buf->case_table) |
1161 : Qnil), posix); | |
428 | 1162 |
1163 if (np <= 0) | |
1164 { | |
444 | 1165 if (NILP (noerror)) |
2268 | 1166 { |
1167 signal_failure (string); | |
1168 RETURN_NOT_REACHED (Qnil); | |
1169 } | |
444 | 1170 if (!EQ (noerror, Qt)) |
428 | 1171 { |
1172 if (lim < BUF_BEGV (buf) || lim > BUF_ZV (buf)) | |
2500 | 1173 ABORT (); |
428 | 1174 BUF_SET_PT (buf, lim); |
1175 return Qnil; | |
1176 #if 0 /* This would be clean, but maybe programs depend on | |
1177 a value of nil here. */ | |
1178 np = lim; | |
1179 #endif | |
1180 } | |
1181 else | |
1182 return Qnil; | |
1183 } | |
1184 | |
1185 if (np < BUF_BEGV (buf) || np > BUF_ZV (buf)) | |
2500 | 1186 ABORT (); |
428 | 1187 |
1188 BUF_SET_PT (buf, np); | |
1189 | |
1190 return make_int (np); | |
1191 } | |
1192 | |
1193 static int | |
1194 trivial_regexp_p (Lisp_Object regexp) | |
1195 { | |
1196 Bytecount len = XSTRING_LENGTH (regexp); | |
867 | 1197 Ibyte *s = XSTRING_DATA (regexp); |
428 | 1198 while (--len >= 0) |
1199 { | |
1200 switch (*s++) | |
1201 { | |
1724 | 1202 /* #### howcum ']' doesn't appear here, but ... */ |
428 | 1203 case '.': case '*': case '+': case '?': case '[': case '^': case '$': |
1204 return 0; | |
1205 case '\\': | |
1206 if (--len < 0) | |
1207 return 0; | |
1208 switch (*s++) | |
1209 { | |
1724 | 1210 /* ... ')' does appear here? ('<' and '>' can appear singly.) */ |
1211 /* #### are there other constructs to check? */ | |
428 | 1212 case '|': case '(': case ')': case '`': case '\'': case 'b': |
1213 case 'B': case '<': case '>': case 'w': case 'W': case 's': | |
1724 | 1214 case 'S': case '=': case '{': case '}': |
428 | 1215 #ifdef MULE |
1216 /* 97/2/25 jhod Added for category matches */ | |
1217 case 'c': case 'C': | |
1218 #endif /* MULE */ | |
1219 case '1': case '2': case '3': case '4': case '5': | |
1220 case '6': case '7': case '8': case '9': | |
1221 return 0; | |
1222 } | |
1223 } | |
1224 } | |
1225 return 1; | |
1226 } | |
1227 | |
1228 /* Search for the n'th occurrence of STRING in BUF, | |
665 | 1229 starting at position CHARBPOS and stopping at position BUFLIM, |
428 | 1230 treating PAT as a literal string if RE is false or as |
1231 a regular expression if RE is true. | |
1232 | |
1233 If N is positive, searching is forward and BUFLIM must be greater | |
665 | 1234 than CHARBPOS. |
428 | 1235 If N is negative, searching is backward and BUFLIM must be less |
665 | 1236 than CHARBPOS. |
428 | 1237 |
1238 Returns -x if only N-x occurrences found (x > 0), | |
1239 or else the position at the beginning of the Nth occurrence | |
1240 (if searching backward) or the end (if searching forward). | |
1241 | |
1242 POSIX is nonzero if we want full backtracking (POSIX style) | |
1243 for this pattern. 0 means backtrack only enough to get a valid match. */ | |
665 | 1244 static Charbpos |
1245 search_buffer (struct buffer *buf, Lisp_Object string, Charbpos charbpos, | |
1246 Charbpos buflim, EMACS_INT n, int RE, Lisp_Object trt, | |
446 | 1247 Lisp_Object inverse_trt, int posix) |
428 | 1248 { |
1249 Bytecount len = XSTRING_LENGTH (string); | |
867 | 1250 Ibyte *base_pat = XSTRING_DATA (string); |
428 | 1251 REGISTER EMACS_INT i, j; |
665 | 1252 Bytebpos p1, p2; |
428 | 1253 Bytecount s1, s2; |
665 | 1254 Bytebpos pos, lim; |
428 | 1255 |
853 | 1256 /* Some FSF junk with running_asynch_code, to preserve the match |
1257 data. Not necessary because we don't call process filters | |
1258 asynchronously (i.e. from within QUIT). */ | |
428 | 1259 |
1425 | 1260 /* Searching 0 times means noop---don't move, don't touch registers. */ |
1261 if (n == 0) | |
1262 return charbpos; | |
1263 | |
428 | 1264 /* Null string is found at starting position. */ |
1265 if (len == 0) | |
1266 { | |
665 | 1267 set_search_regs (buf, charbpos, 0); |
1268 return charbpos; | |
428 | 1269 } |
1270 | |
665 | 1271 pos = charbpos_to_bytebpos (buf, charbpos); |
1272 lim = charbpos_to_bytebpos (buf, buflim); | |
428 | 1273 if (RE && !trivial_regexp_p (string)) |
1274 { | |
1275 struct re_pattern_buffer *bufp; | |
826 | 1276 |
1277 bufp = compile_pattern (string, &search_regs, trt, | |
1278 wrap_buffer (buf), buf, posix, ERROR_ME); | |
428 | 1279 |
1280 /* Get pointers and sizes of the two strings | |
1281 that make up the visible portion of the buffer. */ | |
1282 | |
826 | 1283 p1 = BYTE_BUF_BEGV (buf); |
1284 p2 = BYTE_BUF_CEILING_OF (buf, p1); | |
428 | 1285 s1 = p2 - p1; |
826 | 1286 s2 = BYTE_BUF_ZV (buf) - p2; |
1287 | |
1288 while (n != 0) | |
428 | 1289 { |
1290 Bytecount val; | |
826 | 1291 struct syntax_cache scache_struct; |
1292 struct syntax_cache *scache = &scache_struct; | |
1293 | |
428 | 1294 QUIT; |
826 | 1295 /* By making the regex object, regex buffer, and syntax cache |
1296 arguments to re_{search,match}{,_2}, we've removed the need to | |
1297 do nasty things to deal with regex reentrancy. (See stack | |
1298 trace in signal.c for proof that this can happen.) | |
1299 | |
1300 #### there is still a potential problem with the regex cache -- | |
1301 the compiled regex could be overwritten. we'd need 20-fold | |
1302 reentrancy, though. Fix this. */ | |
1303 | |
428 | 1304 val = re_search_2 (bufp, |
826 | 1305 (char *) BYTE_BUF_BYTE_ADDRESS (buf, p1), s1, |
1306 (char *) BYTE_BUF_BYTE_ADDRESS (buf, p2), s2, | |
1307 pos - BYTE_BUF_BEGV (buf), lim - pos, &search_regs, | |
1308 n > 0 ? lim - BYTE_BUF_BEGV (buf) : | |
1309 pos - BYTE_BUF_BEGV (buf), wrap_buffer (buf), | |
1310 buf, scache); | |
428 | 1311 |
1312 if (val == -2) | |
1313 { | |
1314 matcher_overflow (); | |
1315 } | |
1316 if (val >= 0) | |
1317 { | |
1318 int num_regs = search_regs.num_regs; | |
826 | 1319 j = BYTE_BUF_BEGV (buf); |
428 | 1320 for (i = 0; i < num_regs; i++) |
1321 if (search_regs.start[i] >= 0) | |
1322 { | |
1323 search_regs.start[i] += j; | |
1324 search_regs.end[i] += j; | |
1325 } | |
793 | 1326 last_thing_searched = wrap_buffer (buf); |
428 | 1327 /* Set pos to the new position. */ |
826 | 1328 pos = n > 0 ? search_regs.end[0] : search_regs.start[0]; |
428 | 1329 fixup_search_regs_for_buffer (buf); |
665 | 1330 /* And charbpos too. */ |
826 | 1331 charbpos = n > 0 ? search_regs.end[0] : search_regs.start[0]; |
428 | 1332 } |
1333 else | |
826 | 1334 return (n > 0 ? 0 - n : n); |
1335 if (n > 0) n--; else n++; | |
428 | 1336 } |
665 | 1337 return charbpos; |
428 | 1338 } |
1339 else /* non-RE case */ | |
1340 { | |
446 | 1341 int charset_base = -1; |
1342 int boyer_moore_ok = 1; | |
2367 | 1343 Ibyte *patbuf = alloca_ibytes (len * MAX_ICHAR_LEN); |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1344 Ibyte *pat = patbuf; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1345 |
446 | 1346 #ifdef MULE |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1347 int entirely_one_byte_p = buf->text->entirely_one_byte_p; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1348 int nothing_greater_than_0xff = |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1349 buf->text->num_8_bit_fixed_chars == BUF_Z(buf) - BUF_BEG (buf); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1350 |
446 | 1351 while (len > 0) |
1352 { | |
867 | 1353 Ibyte tmp_str[MAX_ICHAR_LEN]; |
1354 Ichar c, translated, inverse; | |
446 | 1355 Bytecount orig_bytelen, new_bytelen, inv_bytelen; |
1356 | |
1357 /* If we got here and the RE flag is set, it's because | |
1358 we're dealing with a regexp known to be trivial, so the | |
1359 backslash just quotes the next character. */ | |
1360 if (RE && *base_pat == '\\') | |
1361 { | |
1362 len--; | |
1363 base_pat++; | |
1364 } | |
867 | 1365 c = itext_ichar (base_pat); |
446 | 1366 translated = TRANSLATE (trt, c); |
1367 inverse = TRANSLATE (inverse_trt, c); | |
1368 | |
867 | 1369 orig_bytelen = itext_ichar_len (base_pat); |
1370 inv_bytelen = set_itext_ichar (tmp_str, inverse); | |
1371 new_bytelen = set_itext_ichar (tmp_str, translated); | |
446 | 1372 |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1373 if (-1 == charset_base) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1374 { |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1375 /* Keep track of which charset and character set row |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1376 contains the characters that need translation. |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1377 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1378 Zero out the bits corresponding to the last byte. */ |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1379 charset_base = c & ~ICHAR_FIELD3_MASK; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1380 } |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1381 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1382 if (boyer_moore_ok && (translated != c || inverse != c)) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1383 { |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1384 Ichar starting_c = c; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1385 int charset_base_code; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1386 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1387 do |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1388 { |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1389 c = TRANSLATE (inverse_trt, c); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1390 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1391 /* If a character cannot occur in the buffer, ignore |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1392 it. */ |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1393 if (c > 0x7F && entirely_one_byte_p) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1394 continue; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1395 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1396 if (c > 0xFF && nothing_greater_than_0xff) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1397 continue; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1398 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1399 charset_base_code = c & ~ICHAR_FIELD3_MASK; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1400 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1401 if (charset_base_code != charset_base) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1402 { |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1403 /* If two different rows, or two different charsets, |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1404 appear, needing translation, then we cannot use |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1405 boyer_moore search. See the comment at the head of |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1406 boyer_moore(). */ |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1407 boyer_moore_ok = 0; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1408 break; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1409 } |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1410 } while (c != starting_c); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1411 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1412 if (boyer_moore_ok && (charset_base != |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1413 (translated & ~ICHAR_FIELD3_MASK))) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1414 { |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1415 /* In the rare event that the CANON entry for this |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1416 character is not in the desired set, choose one |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1417 that is, from the equivalence set. It doesn't much |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1418 matter which. */ |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1419 Ichar starting_ch = translated; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1420 do |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1421 { |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1422 translated = TRANSLATE (inverse_trt, translated); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1423 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1424 if (charset_base == (translated & ~ICHAR_FIELD3_MASK)) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1425 break; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1426 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1427 } while (starting_ch != translated); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1428 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1429 assert (starting_ch != translated); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1430 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1431 new_bytelen = set_itext_ichar (tmp_str, translated); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1432 } |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1433 } |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1434 |
446 | 1435 memcpy (pat, tmp_str, new_bytelen); |
1436 pat += new_bytelen; | |
1437 base_pat += orig_bytelen; | |
1438 len -= orig_bytelen; | |
1439 } | |
1440 #else /* not MULE */ | |
1441 while (--len >= 0) | |
1442 { | |
1443 /* If we got here and the RE flag is set, it's because | |
1444 we're dealing with a regexp known to be trivial, so the | |
1445 backslash just quotes the next character. */ | |
1446 if (RE && *base_pat == '\\') | |
1447 { | |
1448 len--; | |
1449 base_pat++; | |
1450 } | |
1451 *pat++ = TRANSLATE (trt, *base_pat++); | |
1452 } | |
1453 #endif /* MULE */ | |
1454 len = pat - patbuf; | |
1455 pat = base_pat = patbuf; | |
1456 if (boyer_moore_ok) | |
1457 return boyer_moore (buf, base_pat, len, pos, lim, n, | |
1458 trt, inverse_trt, charset_base); | |
1459 else | |
1460 return simple_search (buf, base_pat, len, pos, lim, n, trt); | |
1461 } | |
1462 } | |
1463 | |
826 | 1464 /* Do a simple string search N times for the string PAT, whose length is |
1465 LEN/LEN_BYTE, from buffer position POS until LIM. TRT is the | |
1466 translation table. | |
446 | 1467 |
1468 Return the character position where the match is found. | |
1469 Otherwise, if M matches remained to be found, return -M. | |
1470 | |
1471 This kind of search works regardless of what is in PAT and | |
1472 regardless of what is in TRT. It is used in cases where | |
1473 boyer_moore cannot work. */ | |
1474 | |
665 | 1475 static Charbpos |
867 | 1476 simple_search (struct buffer *buf, Ibyte *base_pat, Bytecount len, |
826 | 1477 Bytebpos pos, Bytebpos lim, EMACS_INT n, Lisp_Object trt) |
446 | 1478 { |
1479 int forward = n > 0; | |
1480 Bytecount buf_len = 0; /* Shut up compiler. */ | |
1481 | |
826 | 1482 if (lim > pos) |
446 | 1483 while (n > 0) |
428 | 1484 { |
446 | 1485 while (1) |
428 | 1486 { |
826 | 1487 Bytecount this_len = len; |
1488 Bytebpos this_pos = pos; | |
867 | 1489 Ibyte *p = base_pat; |
826 | 1490 if (pos >= lim) |
446 | 1491 goto stop; |
1492 | |
1493 while (this_len > 0) | |
1494 { | |
867 | 1495 Ichar pat_ch, buf_ch; |
446 | 1496 Bytecount pat_len; |
1497 | |
867 | 1498 pat_ch = itext_ichar (p); |
826 | 1499 buf_ch = BYTE_BUF_FETCH_CHAR (buf, this_pos); |
446 | 1500 |
1501 buf_ch = TRANSLATE (trt, buf_ch); | |
1502 | |
1503 if (buf_ch != pat_ch) | |
1504 break; | |
1505 | |
867 | 1506 pat_len = itext_ichar_len (p); |
446 | 1507 p += pat_len; |
1508 this_len -= pat_len; | |
826 | 1509 INC_BYTEBPOS (buf, this_pos); |
446 | 1510 } |
1511 if (this_len == 0) | |
428 | 1512 { |
826 | 1513 buf_len = this_pos - pos; |
1514 pos = this_pos; | |
446 | 1515 break; |
428 | 1516 } |
826 | 1517 INC_BYTEBPOS (buf, pos); |
428 | 1518 } |
446 | 1519 n--; |
1520 } | |
1521 else | |
4322
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1522 { |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1523 /* If lim < len, then there are too few buffer positions to hold the |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1524 pattern between the beginning of the buffer and lim. Adjust to |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1525 ensure pattern fits. If we don't do this, we can assert in the |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1526 DEC_BYTEBPOS below. */ |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1527 if (lim < len) |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1528 lim = len; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1529 while (n < 0) |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1530 { |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1531 while (1) |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1532 { |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1533 Bytecount this_len = len; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1534 Bytebpos this_pos = pos; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1535 Ibyte *p; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1536 if (pos <= lim) |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1537 goto stop; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1538 p = base_pat + len; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1539 |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1540 while (this_len > 0) |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1541 { |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1542 Ichar pat_ch, buf_ch; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1543 |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1544 DEC_IBYTEPTR (p); |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1545 DEC_BYTEBPOS (buf, this_pos); |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1546 pat_ch = itext_ichar (p); |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1547 buf_ch = BYTE_BUF_FETCH_CHAR (buf, this_pos); |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1548 |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1549 buf_ch = TRANSLATE (trt, buf_ch); |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1550 |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1551 if (buf_ch != pat_ch) |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1552 break; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1553 |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1554 this_len -= itext_ichar_len (p); |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1555 } |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1556 if (this_len == 0) |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1557 { |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1558 buf_len = pos - this_pos; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1559 pos = this_pos; |
446 | 1560 break; |
4322
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1561 } |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1562 DEC_BYTEBPOS (buf, pos); |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1563 } |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1564 n++; |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1565 } |
f70e56bb52a7
src/search.c (simple_search): Fix underrun in reverse search.
Stephen J. Turnbull <stephen@xemacs.org>
parents:
4199
diff
changeset
|
1566 } |
446 | 1567 stop: |
1568 if (n == 0) | |
1569 { | |
665 | 1570 Charbpos beg, end, retval; |
446 | 1571 if (forward) |
1572 { | |
826 | 1573 beg = bytebpos_to_charbpos (buf, pos - buf_len); |
1574 retval = end = bytebpos_to_charbpos (buf, pos); | |
446 | 1575 } |
1576 else | |
428 | 1577 { |
826 | 1578 retval = beg = bytebpos_to_charbpos (buf, pos); |
1579 end = bytebpos_to_charbpos (buf, pos + buf_len); | |
428 | 1580 } |
446 | 1581 set_search_regs (buf, beg, end - beg); |
1582 | |
1583 return retval; | |
1584 } | |
1585 else if (n > 0) | |
1586 return -n; | |
1587 else | |
1588 return n; | |
1589 } | |
1590 | |
1591 /* Do Boyer-Moore search N times for the string PAT, | |
1592 whose length is LEN/LEN_BYTE, | |
1593 from buffer position POS/POS_BYTE until LIM/LIM_BYTE. | |
1594 DIRECTION says which direction we search in. | |
1595 TRT and INVERSE_TRT are translation tables. | |
1596 | |
1597 This kind of search works if all the characters in PAT that have | |
1598 nontrivial translation are the same aside from the last byte. This | |
1599 makes it possible to translate just the last byte of a character, | |
1600 and do so after just a simple test of the context. | |
1601 | |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1602 If that criterion is not satisfied, do not call this function. You will |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1603 get an assertion failure. */ |
446 | 1604 |
665 | 1605 static Charbpos |
867 | 1606 boyer_moore (struct buffer *buf, Ibyte *base_pat, Bytecount len, |
665 | 1607 Bytebpos pos, Bytebpos lim, EMACS_INT n, Lisp_Object trt, |
2333 | 1608 Lisp_Object inverse_trt, int USED_IF_MULE (charset_base)) |
446 | 1609 { |
1610 /* #### Someone really really really needs to comment the workings | |
1611 of this junk somewhat better. | |
1612 | |
1613 BTW "BM" stands for Boyer-Moore, which is one of the standard | |
1614 string-searching algorithms. It's the best string-searching | |
1615 algorithm out there, provided that: | |
1616 | |
1617 a) You're not fazed by algorithm complexity. (Rabin-Karp, which | |
1618 uses hashing, is much much easier to code but not as fast.) | |
1619 b) You can freely move backwards in the string that you're | |
1620 searching through. | |
1621 | |
1622 As the comment below tries to explain (but garbles in typical | |
1623 programmer-ese), the idea is that you don't have to do a | |
1624 string match at every successive position in the text. For | |
1625 example, let's say the pattern is "a very long string". We | |
1626 compare the last character in the string (`g') with the | |
1627 corresponding character in the text. If it mismatches, and | |
1628 it is, say, `z', then we can skip forward by the entire | |
1629 length of the pattern because `z' does not occur anywhere | |
1630 in the pattern. If the mismatching character does occur | |
1631 in the pattern, we can usually still skip forward by more | |
1632 than one: e.g. if it is `l', then we can skip forward | |
1633 by the length of the substring "ong string" -- i.e. the | |
1634 largest end section of the pattern that does not contain | |
1635 the mismatched character. So what we do is compute, for | |
1636 each possible character, the distance we can skip forward | |
1637 (the "stride") and use it in the string matching. This | |
1638 is what the BM_tab holds. */ | |
1639 REGISTER EMACS_INT *BM_tab; | |
1640 EMACS_INT *BM_tab_base; | |
1641 REGISTER Bytecount dirlen; | |
1642 EMACS_INT infinity; | |
665 | 1643 Bytebpos limit; |
446 | 1644 Bytecount stride_for_teases = 0; |
1645 REGISTER EMACS_INT i, j; | |
867 | 1646 Ibyte *pat, *pat_end; |
1647 REGISTER Ibyte *cursor, *p_limit, *ptr2; | |
1648 Ibyte simple_translate[0400]; | |
446 | 1649 REGISTER int direction = ((n > 0) ? 1 : -1); |
1650 #ifdef MULE | |
867 | 1651 Ibyte translate_prev_byte = 0; |
1652 Ibyte translate_anteprev_byte = 0; | |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1653 /* These need to be rethought in the event that the internal format |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1654 changes, or in the event that num_8_bit_fixed_chars disappears |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1655 (entirely_one_byte_p can be trivially worked out by checking is the |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1656 byte count equal to the char count.) */ |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1657 int buffer_entirely_one_byte_p = buf->text->entirely_one_byte_p; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1658 int buffer_nothing_greater_than_0xff = |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1659 buf->text->num_8_bit_fixed_chars == BUF_Z(buf) - BUF_BEG (buf); |
446 | 1660 #endif |
1661 #ifdef C_ALLOCA | |
1662 EMACS_INT BM_tab_space[0400]; | |
1663 BM_tab = &BM_tab_space[0]; | |
1664 #else | |
1665 BM_tab = alloca_array (EMACS_INT, 256); | |
1666 #endif | |
1667 | |
1668 /* The general approach is that we are going to maintain that we | |
1669 know the first (closest to the present position, in whatever | |
1670 direction we're searching) character that could possibly be | |
1671 the last (furthest from present position) character of a | |
1672 valid match. We advance the state of our knowledge by | |
1673 looking at that character and seeing whether it indeed | |
1674 matches the last character of the pattern. If it does, we | |
1675 take a closer look. If it does not, we move our pointer (to | |
1676 putative last characters) as far as is logically possible. | |
1677 This amount of movement, which I call a stride, will be the | |
1678 length of the pattern if the actual character appears nowhere | |
1679 in the pattern, otherwise it will be the distance from the | |
1680 last occurrence of that character to the end of the pattern. | |
1681 As a coding trick, an enormous stride is coded into the table | |
1682 for characters that match the last character. This allows | |
1683 use of only a single test, a test for having gone past the | |
1684 end of the permissible match region, to test for both | |
1685 possible matches (when the stride goes past the end | |
1686 immediately) and failure to match (where you get nudged past | |
1687 the end one stride at a time). | |
1688 | |
1689 Here we make a "mickey mouse" BM table. The stride of the | |
1690 search is determined only by the last character of the | |
1691 putative match. If that character does not match, we will | |
1692 stride the proper distance to propose a match that | |
1693 superimposes it on the last instance of a character that | |
1694 matches it (per trt), or misses it entirely if there is | |
1695 none. */ | |
1696 | |
1697 dirlen = len * direction; | |
1698 infinity = dirlen - (lim + pos + len + len) * direction; | |
1699 /* Record position after the end of the pattern. */ | |
1700 pat_end = base_pat + len; | |
1701 if (direction < 0) | |
1702 base_pat = pat_end - 1; | |
1703 BM_tab_base = BM_tab; | |
1704 BM_tab += 0400; | |
1705 j = dirlen; /* to get it in a register */ | |
1706 /* A character that does not appear in the pattern induces a | |
1707 stride equal to the pattern length. */ | |
1708 while (BM_tab_base != BM_tab) | |
1709 { | |
1710 *--BM_tab = j; | |
1711 *--BM_tab = j; | |
1712 *--BM_tab = j; | |
1713 *--BM_tab = j; | |
1714 } | |
1715 /* We use this for translation, instead of TRT itself. We | |
1716 fill this in to handle the characters that actually occur | |
1717 in the pattern. Others don't matter anyway! */ | |
1718 xzero (simple_translate); | |
1719 for (i = 0; i < 0400; i++) | |
867 | 1720 simple_translate[i] = (Ibyte) i; |
446 | 1721 i = 0; |
1425 | 1722 |
446 | 1723 while (i != infinity) |
1724 { | |
867 | 1725 Ibyte *ptr = base_pat + i; |
446 | 1726 i += direction; |
1727 if (i == dirlen) | |
1728 i = infinity; | |
1729 if (!NILP (trt)) | |
428 | 1730 { |
446 | 1731 #ifdef MULE |
867 | 1732 Ichar ch, untranslated; |
446 | 1733 int this_translated = 1; |
1734 | |
1735 /* Is *PTR the last byte of a character? */ | |
867 | 1736 if (pat_end - ptr == 1 || ibyte_first_byte_p (ptr[1])) |
428 | 1737 { |
867 | 1738 Ibyte *charstart = ptr; |
1739 while (!ibyte_first_byte_p (*charstart)) | |
446 | 1740 charstart--; |
867 | 1741 untranslated = itext_ichar (charstart); |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1742 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1743 /* We shouldn't have been passed a string with varying |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1744 character sets or rows. That's what simple_search is |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1745 for. */ |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1746 assert (charset_base == (untranslated & ~ICHAR_FIELD3_MASK)); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1747 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1748 ch = TRANSLATE (trt, untranslated); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1749 if (!ibyte_first_byte_p (*ptr)) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1750 { |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1751 translate_prev_byte = ptr[-1]; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1752 if (!ibyte_first_byte_p (translate_prev_byte)) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1753 translate_anteprev_byte = ptr[-2]; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1754 } |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1755 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1756 if (charset_base != (ch & ~ICHAR_FIELD3_MASK)) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1757 { |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1758 /* In the very rare event that the CANON entry for this |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1759 character is not in the desired set, choose one that |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1760 is, from the equivalence set. It doesn't much matter |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1761 which, since we're building our own cheesy equivalence |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1762 table instead of using that belonging to the case |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1763 table directly. |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1764 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1765 We can get here if search_buffer has worked out that |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1766 the buffer is entirely single width. */ |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1767 Ichar starting_ch = ch; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1768 do |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1769 { |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1770 ch = TRANSLATE (inverse_trt, ch); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1771 if (charset_base == (ch & ~ICHAR_FIELD3_MASK)) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1772 break; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1773 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1774 } while (starting_ch != ch); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1775 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1776 /* If starting_ch is equal to ch, the case table is |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1777 corrupt. (Any mapping in the canon table should be |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1778 reflected in the equivalence table, and we know from |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1779 the canon table that untranslated maps to starting_ch |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1780 and that untranslated has the correct value for |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1781 charset_base.) */ |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1782 assert (starting_ch != ch); |
446 | 1783 } |
428 | 1784 } |
1785 else | |
1786 { | |
446 | 1787 ch = *ptr; |
1788 this_translated = 0; | |
1789 } | |
1790 if (ch > 0400) | |
1791 j = ((unsigned char) ch | 0200); | |
1792 else | |
1793 j = (unsigned char) ch; | |
1794 | |
1795 if (i == infinity) | |
1796 stride_for_teases = BM_tab[j]; | |
1797 BM_tab[j] = dirlen - i; | |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1798 /* A translation table is accompanied by its inverse -- see |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1799 comment in casetab.c. */ |
446 | 1800 if (this_translated) |
1801 { | |
867 | 1802 Ichar starting_ch = ch; |
446 | 1803 EMACS_INT starting_j = j; |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1804 do |
446 | 1805 { |
1806 ch = TRANSLATE (inverse_trt, ch); | |
4407
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1807 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1808 if (ch > 0x7F && buffer_entirely_one_byte_p) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1809 continue; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1810 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1811 if (ch > 0xFF && buffer_nothing_greater_than_0xff) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1812 continue; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1813 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1814 if (ch > 0400) |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1815 j = ((unsigned char) ch | 0200); |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1816 else |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1817 j = (unsigned char) ch; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1818 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1819 /* For all the characters that map into CH, set up |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1820 simple_translate to map the last byte into |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1821 STARTING_J. */ |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1822 simple_translate[j] = (Ibyte) starting_j; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1823 BM_tab[j] = dirlen - i; |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1824 |
4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
Aidan Kehoe <kehoea@parhasard.net>
parents:
4322
diff
changeset
|
1825 } while (ch != starting_ch); |
446 | 1826 } |
1827 #else | |
1828 EMACS_INT k; | |
1829 j = *ptr; | |
1830 k = (j = TRANSLATE (trt, j)); | |
1831 if (i == infinity) | |
1832 stride_for_teases = BM_tab[j]; | |
1833 BM_tab[j] = dirlen - i; | |
1834 /* A translation table is accompanied by its inverse -- | |
826 | 1835 see comment in casetab.c. */ |
446 | 1836 while ((j = TRANSLATE (inverse_trt, j)) != k) |
1837 { | |
867 | 1838 simple_translate[j] = (Ibyte) k; |
428 | 1839 BM_tab[j] = dirlen - i; |
1840 } | |
446 | 1841 #endif |
1842 } | |
1843 else | |
1844 { | |
1845 j = *ptr; | |
1846 | |
1847 if (i == infinity) | |
1848 stride_for_teases = BM_tab[j]; | |
1849 BM_tab[j] = dirlen - i; | |
428 | 1850 } |
446 | 1851 /* stride_for_teases tells how much to stride if we get a |
1852 match on the far character but are subsequently | |
1853 disappointed, by recording what the stride would have been | |
1854 for that character if the last character had been | |
1855 different. */ | |
1856 } | |
1857 infinity = dirlen - infinity; | |
1858 pos += dirlen - ((direction > 0) ? direction : 0); | |
1859 /* loop invariant - pos points at where last char (first char if | |
1860 reverse) of pattern would align in a possible match. */ | |
1861 while (n != 0) | |
1862 { | |
665 | 1863 Bytebpos tail_end; |
867 | 1864 Ibyte *tail_end_ptr; |
446 | 1865 /* It's been reported that some (broken) compiler thinks |
1866 that Boolean expressions in an arithmetic context are | |
1867 unsigned. Using an explicit ?1:0 prevents this. */ | |
1868 if ((lim - pos - ((direction > 0) ? 1 : 0)) * direction < 0) | |
1869 return n * (0 - direction); | |
1870 /* First we do the part we can by pointers (maybe | |
1871 nothing) */ | |
1872 QUIT; | |
1873 pat = base_pat; | |
1874 limit = pos - dirlen + direction; | |
1875 /* XEmacs change: definitions of CEILING_OF and FLOOR_OF | |
1876 have changed. See buffer.h. */ | |
1877 limit = ((direction > 0) | |
826 | 1878 ? BYTE_BUF_CEILING_OF (buf, limit) - 1 |
1879 : BYTE_BUF_FLOOR_OF (buf, limit + 1)); | |
446 | 1880 /* LIMIT is now the last (not beyond-last!) value POS can |
1881 take on without hitting edge of buffer or the gap. */ | |
1882 limit = ((direction > 0) | |
1883 ? min (lim - 1, min (limit, pos + 20000)) | |
1884 : max (lim, max (limit, pos - 20000))); | |
826 | 1885 tail_end = BYTE_BUF_CEILING_OF (buf, pos); |
1886 tail_end_ptr = BYTE_BUF_BYTE_ADDRESS (buf, tail_end); | |
446 | 1887 |
1888 if ((limit - pos) * direction > 20) | |
428 | 1889 { |
826 | 1890 /* We have to be careful because the code can generate addresses |
1891 that don't point to the beginning of characters. */ | |
1892 p_limit = BYTE_BUF_BYTE_ADDRESS_NO_VERIFY (buf, limit); | |
1893 ptr2 = (cursor = BYTE_BUF_BYTE_ADDRESS_NO_VERIFY (buf, pos)); | |
446 | 1894 /* In this loop, pos + cursor - ptr2 is the surrogate |
1895 for pos */ | |
1896 while (1) /* use one cursor setting as long as i can */ | |
1897 { | |
1898 if (direction > 0) /* worth duplicating */ | |
1899 { | |
1900 /* Use signed comparison if appropriate to make | |
1901 cursor+infinity sure to be > p_limit. | |
1902 Assuming that the buffer lies in a range of | |
1903 addresses that are all "positive" (as ints) | |
1904 or all "negative", either kind of comparison | |
1905 will work as long as we don't step by | |
1906 infinity. So pick the kind that works when | |
1907 we do step by infinity. */ | |
1908 if ((EMACS_INT) (p_limit + infinity) > | |
1909 (EMACS_INT) p_limit) | |
1910 while ((EMACS_INT) cursor <= | |
1911 (EMACS_INT) p_limit) | |
1912 cursor += BM_tab[*cursor]; | |
1913 else | |
1914 while ((EMACS_UINT) cursor <= | |
1915 (EMACS_UINT) p_limit) | |
1916 cursor += BM_tab[*cursor]; | |
1917 } | |
1918 else | |
1919 { | |
1920 if ((EMACS_INT) (p_limit + infinity) < | |
1921 (EMACS_INT) p_limit) | |
1922 while ((EMACS_INT) cursor >= | |
1923 (EMACS_INT) p_limit) | |
1924 cursor += BM_tab[*cursor]; | |
1925 else | |
1926 while ((EMACS_UINT) cursor >= | |
1927 (EMACS_UINT) p_limit) | |
1928 cursor += BM_tab[*cursor]; | |
1929 } | |
1930 /* If you are here, cursor is beyond the end of the | |
1931 searched region. This can happen if you match on | |
1932 the far character of the pattern, because the | |
1933 "stride" of that character is infinity, a number | |
1934 able to throw you well beyond the end of the | |
1935 search. It can also happen if you fail to match | |
1936 within the permitted region and would otherwise | |
1937 try a character beyond that region */ | |
1938 if ((cursor - p_limit) * direction <= len) | |
1939 break; /* a small overrun is genuine */ | |
1940 cursor -= infinity; /* large overrun = hit */ | |
1941 i = dirlen - direction; | |
1942 if (!NILP (trt)) | |
1943 { | |
1944 while ((i -= direction) + direction != 0) | |
1945 { | |
1946 #ifdef MULE | |
867 | 1947 Ichar ch; |
446 | 1948 cursor -= direction; |
1949 /* Translate only the last byte of a character. */ | |
1950 if ((cursor == tail_end_ptr | |
867 | 1951 || ibyte_first_byte_p (cursor[1])) |
1952 && (ibyte_first_byte_p (cursor[0]) | |
446 | 1953 || (translate_prev_byte == cursor[-1] |
867 | 1954 && (ibyte_first_byte_p (translate_prev_byte) |
446 | 1955 || translate_anteprev_byte == cursor[-2])))) |
1956 ch = simple_translate[*cursor]; | |
1957 else | |
1958 ch = *cursor; | |
1959 if (pat[i] != ch) | |
1960 break; | |
1961 #else | |
1962 if (pat[i] != TRANSLATE (trt, *(cursor -= direction))) | |
1963 break; | |
1964 #endif | |
1965 } | |
1966 } | |
1967 else | |
1968 { | |
1969 while ((i -= direction) + direction != 0) | |
1970 if (pat[i] != *(cursor -= direction)) | |
1971 break; | |
1972 } | |
1973 cursor += dirlen - i - direction; /* fix cursor */ | |
1974 if (i + direction == 0) | |
1975 { | |
1976 cursor -= direction; | |
1977 | |
1978 { | |
665 | 1979 Bytebpos bytstart = (pos + cursor - ptr2 + |
446 | 1980 ((direction > 0) |
1981 ? 1 - len : 0)); | |
665 | 1982 Charbpos bufstart = bytebpos_to_charbpos (buf, bytstart); |
1983 Charbpos bufend = bytebpos_to_charbpos (buf, bytstart + len); | |
446 | 1984 |
1985 set_search_regs (buf, bufstart, bufend - bufstart); | |
1986 } | |
1987 | |
1988 if ((n -= direction) != 0) | |
1989 cursor += dirlen; /* to resume search */ | |
1990 else | |
1991 return ((direction > 0) | |
1992 ? search_regs.end[0] : search_regs.start[0]); | |
1993 } | |
1994 else | |
1995 cursor += stride_for_teases; /* <sigh> we lose - */ | |
1996 } | |
1997 pos += cursor - ptr2; | |
1998 } | |
1999 else | |
2000 /* Now we'll pick up a clump that has to be done the hard | |
2001 way because it covers a discontinuity */ | |
2002 { | |
428 | 2003 /* XEmacs change: definitions of CEILING_OF and FLOOR_OF |
2004 have changed. See buffer.h. */ | |
2005 limit = ((direction > 0) | |
826 | 2006 ? BYTE_BUF_CEILING_OF (buf, pos - dirlen + 1) - 1 |
2007 : BYTE_BUF_FLOOR_OF (buf, pos - dirlen)); | |
428 | 2008 limit = ((direction > 0) |
446 | 2009 ? min (limit + len, lim - 1) |
2010 : max (limit - len, lim)); | |
2011 /* LIMIT is now the last value POS can have | |
2012 and still be valid for a possible match. */ | |
2013 while (1) | |
428 | 2014 { |
446 | 2015 /* This loop can be coded for space rather than |
2016 speed because it will usually run only once. | |
2017 (the reach is at most len + 21, and typically | |
2018 does not exceed len) */ | |
2019 while ((limit - pos) * direction >= 0) | |
826 | 2020 /* *not* BYTE_BUF_FETCH_CHAR. We are working here |
446 | 2021 with bytes, not characters. */ |
826 | 2022 pos += BM_tab[*BYTE_BUF_BYTE_ADDRESS_NO_VERIFY (buf, pos)]; |
446 | 2023 /* now run the same tests to distinguish going off |
2024 the end, a match or a phony match. */ | |
2025 if ((pos - limit) * direction <= len) | |
2026 break; /* ran off the end */ | |
2027 /* Found what might be a match. | |
2028 Set POS back to last (first if reverse) char pos. */ | |
2029 pos -= infinity; | |
2030 i = dirlen - direction; | |
2031 while ((i -= direction) + direction != 0) | |
428 | 2032 { |
446 | 2033 #ifdef MULE |
867 | 2034 Ichar ch; |
2035 Ibyte *ptr; | |
446 | 2036 #endif |
2037 pos -= direction; | |
2038 #ifdef MULE | |
826 | 2039 ptr = BYTE_BUF_BYTE_ADDRESS_NO_VERIFY (buf, pos); |
446 | 2040 if ((ptr == tail_end_ptr |
867 | 2041 || ibyte_first_byte_p (ptr[1])) |
2042 && (ibyte_first_byte_p (ptr[0]) | |
446 | 2043 || (translate_prev_byte == ptr[-1] |
867 | 2044 && (ibyte_first_byte_p (translate_prev_byte) |
446 | 2045 || translate_anteprev_byte == ptr[-2])))) |
2046 ch = simple_translate[*ptr]; | |
428 | 2047 else |
446 | 2048 ch = *ptr; |
2049 if (pat[i] != ch) | |
2050 break; | |
2051 | |
2052 #else | |
826 | 2053 if (pat[i] != |
2054 TRANSLATE (trt, | |
2055 *BYTE_BUF_BYTE_ADDRESS_NO_VERIFY (buf, pos))) | |
446 | 2056 break; |
2057 #endif | |
428 | 2058 } |
446 | 2059 /* Above loop has moved POS part or all the way back |
2060 to the first char pos (last char pos if reverse). | |
2061 Set it once again at the last (first if reverse) | |
2062 char. */ | |
2063 pos += dirlen - i- direction; | |
2064 if (i + direction == 0) | |
428 | 2065 { |
446 | 2066 pos -= direction; |
2067 | |
2068 { | |
665 | 2069 Bytebpos bytstart = (pos + |
446 | 2070 ((direction > 0) |
2071 ? 1 - len : 0)); | |
665 | 2072 Charbpos bufstart = bytebpos_to_charbpos (buf, bytstart); |
2073 Charbpos bufend = bytebpos_to_charbpos (buf, bytstart + len); | |
446 | 2074 |
2075 set_search_regs (buf, bufstart, bufend - bufstart); | |
2076 } | |
2077 | |
2078 if ((n -= direction) != 0) | |
2079 pos += dirlen; /* to resume search */ | |
428 | 2080 else |
446 | 2081 return ((direction > 0) |
2082 ? search_regs.end[0] : search_regs.start[0]); | |
428 | 2083 } |
446 | 2084 else |
2085 pos += stride_for_teases; | |
2086 } | |
428 | 2087 } |
446 | 2088 /* We have done one clump. Can we continue? */ |
2089 if ((lim - pos) * direction < 0) | |
2090 return (0 - n) * direction; | |
428 | 2091 } |
665 | 2092 return bytebpos_to_charbpos (buf, pos); |
428 | 2093 } |
2094 | |
1024 | 2095 /* Record the whole-match data (beginning BEG and end BEG + LEN) and the |
2096 buffer for a match just found. */ | |
428 | 2097 |
2098 static void | |
665 | 2099 set_search_regs (struct buffer *buf, Charbpos beg, Charcount len) |
428 | 2100 { |
2101 /* Make sure we have registers in which to store | |
2102 the match position. */ | |
2103 if (search_regs.num_regs == 0) | |
2104 { | |
2105 search_regs.start = xnew (regoff_t); | |
2106 search_regs.end = xnew (regoff_t); | |
2107 search_regs.num_regs = 1; | |
2108 } | |
2109 | |
1468 | 2110 clear_search_regs (); |
428 | 2111 search_regs.start[0] = beg; |
2112 search_regs.end[0] = beg + len; | |
793 | 2113 last_thing_searched = wrap_buffer (buf); |
428 | 2114 } |
2115 | |
1468 | 2116 /* Clear search registers so match data will be null. */ |
1024 | 2117 |
2118 static void | |
1468 | 2119 clear_search_regs (void) |
1024 | 2120 { |
2121 /* This function has been Mule-ized. */ | |
2122 int i; | |
2123 | |
1468 | 2124 for (i = 0; i < search_regs.num_regs; i++) |
2125 search_regs.start[i] = search_regs.end[i] = -1; | |
1024 | 2126 } |
2127 | |
428 | 2128 |
2129 /* Given a string of words separated by word delimiters, | |
442 | 2130 compute a regexp that matches those exact words |
2131 separated by arbitrary punctuation. */ | |
428 | 2132 |
2133 static Lisp_Object | |
2134 wordify (Lisp_Object buffer, Lisp_Object string) | |
2135 { | |
2136 Charcount i, len; | |
2137 EMACS_INT punct_count = 0, word_count = 0; | |
2138 struct buffer *buf = decode_buffer (buffer, 0); | |
826 | 2139 Lisp_Object syntax_table = buf->mirror_syntax_table; |
428 | 2140 |
2141 CHECK_STRING (string); | |
826 | 2142 len = string_char_length (string); |
428 | 2143 |
2144 for (i = 0; i < len; i++) | |
867 | 2145 if (!WORD_SYNTAX_P (syntax_table, string_ichar (string, i))) |
428 | 2146 { |
2147 punct_count++; | |
2148 if (i > 0 && WORD_SYNTAX_P (syntax_table, | |
867 | 2149 string_ichar (string, i - 1))) |
428 | 2150 word_count++; |
2151 } | |
867 | 2152 if (WORD_SYNTAX_P (syntax_table, string_ichar (string, len - 1))) |
428 | 2153 word_count++; |
2154 if (!word_count) return build_string (""); | |
2155 | |
2156 { | |
2157 /* The following value is an upper bound on the amount of storage we | |
2158 need. In non-Mule, it is exact. */ | |
867 | 2159 Ibyte *storage = |
2367 | 2160 alloca_ibytes (XSTRING_LENGTH (string) - punct_count + |
428 | 2161 5 * (word_count - 1) + 4); |
867 | 2162 Ibyte *o = storage; |
428 | 2163 |
2164 *o++ = '\\'; | |
2165 *o++ = 'b'; | |
2166 | |
2167 for (i = 0; i < len; i++) | |
2168 { | |
867 | 2169 Ichar ch = string_ichar (string, i); |
428 | 2170 |
2171 if (WORD_SYNTAX_P (syntax_table, ch)) | |
867 | 2172 o += set_itext_ichar (o, ch); |
428 | 2173 else if (i > 0 |
2174 && WORD_SYNTAX_P (syntax_table, | |
867 | 2175 string_ichar (string, i - 1)) |
428 | 2176 && --word_count) |
2177 { | |
2178 *o++ = '\\'; | |
2179 *o++ = 'W'; | |
2180 *o++ = '\\'; | |
2181 *o++ = 'W'; | |
2182 *o++ = '*'; | |
2183 } | |
2184 } | |
2185 | |
2186 *o++ = '\\'; | |
2187 *o++ = 'b'; | |
2188 | |
2189 return make_string (storage, o - storage); | |
2190 } | |
2191 } | |
2192 | |
2193 DEFUN ("search-backward", Fsearch_backward, 1, 5, "sSearch backward: ", /* | |
2194 Search backward from point for STRING. | |
2195 Set point to the beginning of the occurrence found, and return point. | |
444 | 2196 |
2197 Optional second argument LIMIT bounds the search; it is a buffer | |
2198 position. The match found must not extend before that position. | |
2199 The value nil is equivalent to (point-min). | |
2200 | |
2201 Optional third argument NOERROR, if t, means just return nil (no | |
2202 error) if the search fails. If neither nil nor t, set point to LIMIT | |
2203 and return nil. | |
2204 | |
2205 Optional fourth argument COUNT is a repeat count--search for | |
2206 successive occurrences. | |
2207 | |
428 | 2208 Optional fifth argument BUFFER specifies the buffer to search in and |
444 | 2209 defaults to the current buffer. |
2210 | |
1468 | 2211 When the match is successful, this function modifies the match data |
2212 that `match-beginning', `match-end' and `match-data' access; save the | |
2213 match data with `match-data' and restore it with `store-match-data' if | |
2214 you want to preserve them. If the match fails, the match data from the | |
2215 previous success match is preserved. | |
2216 | |
2217 See also the function `replace-match'. | |
428 | 2218 */ |
444 | 2219 (string, limit, noerror, count, buffer)) |
428 | 2220 { |
444 | 2221 return search_command (string, limit, noerror, count, buffer, -1, 0, 0); |
428 | 2222 } |
2223 | |
2224 DEFUN ("search-forward", Fsearch_forward, 1, 5, "sSearch: ", /* | |
2225 Search forward from point for STRING. | |
2226 Set point to the end of the occurrence found, and return point. | |
444 | 2227 |
2228 Optional second argument LIMIT bounds the search; it is a buffer | |
2229 position. The match found must not extend after that position. The | |
2230 value nil is equivalent to (point-max). | |
2231 | |
2232 Optional third argument NOERROR, if t, means just return nil (no | |
2233 error) if the search fails. If neither nil nor t, set point to LIMIT | |
2234 and return nil. | |
2235 | |
2236 Optional fourth argument COUNT is a repeat count--search for | |
2237 successive occurrences. | |
2238 | |
428 | 2239 Optional fifth argument BUFFER specifies the buffer to search in and |
444 | 2240 defaults to the current buffer. |
2241 | |
1468 | 2242 When the match is successful, this function modifies the match data |
2243 that `match-beginning', `match-end' and `match-data' access; save the | |
2244 match data with `match-data' and restore it with `store-match-data' if | |
2245 you want to preserve them. If the match fails, the match data from the | |
2246 previous success match is preserved. | |
2247 | |
2248 See also the function `replace-match'. | |
428 | 2249 */ |
444 | 2250 (string, limit, noerror, count, buffer)) |
428 | 2251 { |
444 | 2252 return search_command (string, limit, noerror, count, buffer, 1, 0, 0); |
428 | 2253 } |
2254 | |
2255 DEFUN ("word-search-backward", Fword_search_backward, 1, 5, | |
2256 "sWord search backward: ", /* | |
2257 Search backward from point for STRING, ignoring differences in punctuation. | |
2258 Set point to the beginning of the occurrence found, and return point. | |
444 | 2259 |
2260 Optional second argument LIMIT bounds the search; it is a buffer | |
2261 position. The match found must not extend before that position. | |
2262 The value nil is equivalent to (point-min). | |
2263 | |
2264 Optional third argument NOERROR, if t, means just return nil (no | |
2265 error) if the search fails. If neither nil nor t, set point to LIMIT | |
2266 and return nil. | |
2267 | |
2268 Optional fourth argument COUNT is a repeat count--search for | |
2269 successive occurrences. | |
2270 | |
428 | 2271 Optional fifth argument BUFFER specifies the buffer to search in and |
444 | 2272 defaults to the current buffer. |
2273 | |
1468 | 2274 When the match is successful, this function modifies the match data |
2275 that `match-beginning', `match-end' and `match-data' access; save the | |
2276 match data with `match-data' and restore it with `store-match-data' if | |
2277 you want to preserve them. If the match fails, the match data from the | |
2278 previous success match is preserved. | |
2279 | |
2280 See also the function `replace-match'. | |
428 | 2281 */ |
444 | 2282 (string, limit, noerror, count, buffer)) |
428 | 2283 { |
444 | 2284 return search_command (wordify (buffer, string), limit, noerror, count, |
428 | 2285 buffer, -1, 1, 0); |
2286 } | |
2287 | |
2288 DEFUN ("word-search-forward", Fword_search_forward, 1, 5, "sWord search: ", /* | |
2289 Search forward from point for STRING, ignoring differences in punctuation. | |
2290 Set point to the end of the occurrence found, and return point. | |
444 | 2291 |
2292 Optional second argument LIMIT bounds the search; it is a buffer | |
2293 position. The match found must not extend after that position. The | |
2294 value nil is equivalent to (point-max). | |
2295 | |
2296 Optional third argument NOERROR, if t, means just return nil (no | |
2297 error) if the search fails. If neither nil nor t, set point to LIMIT | |
2298 and return nil. | |
2299 | |
2300 Optional fourth argument COUNT is a repeat count--search for | |
2301 successive occurrences. | |
2302 | |
428 | 2303 Optional fifth argument BUFFER specifies the buffer to search in and |
444 | 2304 defaults to the current buffer. |
2305 | |
1468 | 2306 When the match is successful, this function modifies the match data |
2307 that `match-beginning', `match-end' and `match-data' access; save the | |
2308 match data with `match-data' and restore it with `store-match-data' if | |
2309 you want to preserve them. If the match fails, the match data from the | |
2310 previous success match is preserved. | |
2311 | |
2312 See also the function `replace-match'. | |
428 | 2313 */ |
444 | 2314 (string, limit, noerror, count, buffer)) |
428 | 2315 { |
444 | 2316 return search_command (wordify (buffer, string), limit, noerror, count, |
428 | 2317 buffer, 1, 1, 0); |
2318 } | |
2319 | |
2320 DEFUN ("re-search-backward", Fre_search_backward, 1, 5, | |
2321 "sRE search backward: ", /* | |
2322 Search backward from point for match for regular expression REGEXP. | |
2323 Set point to the beginning of the match, and return point. | |
2324 The match found is the one starting last in the buffer | |
2325 and yet ending before the origin of the search. | |
444 | 2326 |
2327 Optional second argument LIMIT bounds the search; it is a buffer | |
2328 position. The match found must not extend before that position. | |
2329 The value nil is equivalent to (point-min). | |
2330 | |
2331 Optional third argument NOERROR, if t, means just return nil (no | |
2332 error) if the search fails. If neither nil nor t, set point to LIMIT | |
2333 and return nil. | |
2334 | |
2335 Optional fourth argument COUNT is a repeat count--search for | |
2336 successive occurrences. | |
2337 | |
428 | 2338 Optional fifth argument BUFFER specifies the buffer to search in and |
444 | 2339 defaults to the current buffer. |
2340 | |
1468 | 2341 When the match is successful, this function modifies the match data |
2342 that `match-beginning', `match-end' and `match-data' access; save the | |
2343 match data with `match-data' and restore it with `store-match-data' if | |
2344 you want to preserve them. If the match fails, the match data from the | |
2345 previous success match is preserved. | |
2346 | |
2347 See also the function `replace-match'. | |
428 | 2348 */ |
444 | 2349 (regexp, limit, noerror, count, buffer)) |
428 | 2350 { |
444 | 2351 return search_command (regexp, limit, noerror, count, buffer, -1, 1, 0); |
428 | 2352 } |
2353 | |
2354 DEFUN ("re-search-forward", Fre_search_forward, 1, 5, "sRE search: ", /* | |
2355 Search forward from point for regular expression REGEXP. | |
2356 Set point to the end of the occurrence found, and return point. | |
444 | 2357 |
2358 Optional second argument LIMIT bounds the search; it is a buffer | |
2359 position. The match found must not extend after that position. The | |
2360 value nil is equivalent to (point-max). | |
2361 | |
2362 Optional third argument NOERROR, if t, means just return nil (no | |
2363 error) if the search fails. If neither nil nor t, set point to LIMIT | |
2364 and return nil. | |
2365 | |
2366 Optional fourth argument COUNT is a repeat count--search for | |
2367 successive occurrences. | |
2368 | |
428 | 2369 Optional fifth argument BUFFER specifies the buffer to search in and |
444 | 2370 defaults to the current buffer. |
2371 | |
1468 | 2372 When the match is successful, this function modifies the match data |
2373 that `match-beginning', `match-end' and `match-data' access; save the | |
2374 match data with `match-data' and restore it with `store-match-data' if | |
2375 you want to preserve them. If the match fails, the match data from the | |
2376 previous success match is preserved. | |
2377 | |
2378 See also the function `replace-match'. | |
428 | 2379 */ |
444 | 2380 (regexp, limit, noerror, count, buffer)) |
428 | 2381 { |
444 | 2382 return search_command (regexp, limit, noerror, count, buffer, 1, 1, 0); |
428 | 2383 } |
2384 | |
2385 DEFUN ("posix-search-backward", Fposix_search_backward, 1, 5, | |
2386 "sPosix search backward: ", /* | |
2387 Search backward from point for match for regular expression REGEXP. | |
2388 Find the longest match in accord with Posix regular expression rules. | |
2389 Set point to the beginning of the match, and return point. | |
2390 The match found is the one starting last in the buffer | |
2391 and yet ending before the origin of the search. | |
444 | 2392 |
2393 Optional second argument LIMIT bounds the search; it is a buffer | |
2394 position. The match found must not extend before that position. | |
2395 The value nil is equivalent to (point-min). | |
2396 | |
2397 Optional third argument NOERROR, if t, means just return nil (no | |
2398 error) if the search fails. If neither nil nor t, set point to LIMIT | |
2399 and return nil. | |
2400 | |
2401 Optional fourth argument COUNT is a repeat count--search for | |
2402 successive occurrences. | |
2403 | |
428 | 2404 Optional fifth argument BUFFER specifies the buffer to search in and |
444 | 2405 defaults to the current buffer. |
2406 | |
1468 | 2407 When the match is successful, this function modifies the match data |
2408 that `match-beginning', `match-end' and `match-data' access; save the | |
2409 match data with `match-data' and restore it with `store-match-data' if | |
2410 you want to preserve them. If the match fails, the match data from the | |
2411 previous success match is preserved. | |
2412 | |
2413 See also the function `replace-match'. | |
428 | 2414 */ |
444 | 2415 (regexp, limit, noerror, count, buffer)) |
428 | 2416 { |
444 | 2417 return search_command (regexp, limit, noerror, count, buffer, -1, 1, 1); |
428 | 2418 } |
2419 | |
2420 DEFUN ("posix-search-forward", Fposix_search_forward, 1, 5, "sPosix search: ", /* | |
2421 Search forward from point for regular expression REGEXP. | |
2422 Find the longest match in accord with Posix regular expression rules. | |
2423 Set point to the end of the occurrence found, and return point. | |
444 | 2424 |
2425 Optional second argument LIMIT bounds the search; it is a buffer | |
2426 position. The match found must not extend after that position. The | |
2427 value nil is equivalent to (point-max). | |
2428 | |
2429 Optional third argument NOERROR, if t, means just return nil (no | |
2430 error) if the search fails. If neither nil nor t, set point to LIMIT | |
2431 and return nil. | |
2432 | |
2433 Optional fourth argument COUNT is a repeat count--search for | |
2434 successive occurrences. | |
2435 | |
428 | 2436 Optional fifth argument BUFFER specifies the buffer to search in and |
444 | 2437 defaults to the current buffer. |
2438 | |
1468 | 2439 When the match is successful, this function modifies the match data |
2440 that `match-beginning', `match-end' and `match-data' access; save the | |
2441 match data with `match-data' and restore it with `store-match-data' if | |
2442 you want to preserve them. If the match fails, the match data from the | |
2443 previous success match is preserved. | |
2444 | |
2445 See also the function `replace-match'. | |
428 | 2446 */ |
444 | 2447 (regexp, limit, noerror, count, buffer)) |
428 | 2448 { |
444 | 2449 return search_command (regexp, limit, noerror, count, buffer, 1, 1, 1); |
428 | 2450 } |
2451 | |
2452 | |
2453 static Lisp_Object | |
2454 free_created_dynarrs (Lisp_Object cons) | |
2455 { | |
2456 Dynarr_free (get_opaque_ptr (XCAR (cons))); | |
2457 Dynarr_free (get_opaque_ptr (XCDR (cons))); | |
2458 free_opaque_ptr (XCAR (cons)); | |
2459 free_opaque_ptr (XCDR (cons)); | |
853 | 2460 free_cons (cons); |
428 | 2461 return Qnil; |
2462 } | |
2463 | |
2464 DEFUN ("replace-match", Freplace_match, 1, 5, 0, /* | |
444 | 2465 Replace text matched by last search with REPLACEMENT. |
4199 | 2466 Leaves point at end of replacement text. |
2467 Optional boolean FIXEDCASE inhibits matching case of REPLACEMENT to source. | |
2468 Optional boolean LITERAL inhibits interpretation of escape sequences. | |
2469 Optional STRING provides the source text to replace. | |
2470 Optional STRBUFFER may be a buffer, providing match context, or an integer | |
2471 specifying the subexpression to replace. | |
2472 | |
2473 If FIXEDCASE is non-nil, do not alter case of replacement text. | |
428 | 2474 Otherwise maybe capitalize the whole text, or maybe just word initials, |
2475 based on the replaced text. | |
4199 | 2476 If the replaced text has only capital letters and has at least one |
2477 multiletter word, convert REPLACEMENT to all caps. | |
428 | 2478 If the replaced text has at least one word starting with a capital letter, |
444 | 2479 then capitalize each word in REPLACEMENT. |
428 | 2480 |
4199 | 2481 If LITERAL is non-nil, insert REPLACEMENT literally. |
428 | 2482 Otherwise treat `\\' as special: |
444 | 2483 `\\&' in REPLACEMENT means substitute original matched text. |
428 | 2484 `\\N' means substitute what matched the Nth `\\(...\\)'. |
2485 If Nth parens didn't match, substitute nothing. | |
2486 `\\\\' means insert one `\\'. | |
2487 `\\u' means upcase the next character. | |
2488 `\\l' means downcase the next character. | |
2489 `\\U' means begin upcasing all following characters. | |
2490 `\\L' means begin downcasing all following characters. | |
2491 `\\E' means terminate the effect of any `\\U' or `\\L'. | |
2492 Case changes made with `\\u', `\\l', `\\U', and `\\L' override | |
2493 all other case changes that may be made in the replaced text. | |
4199 | 2494 |
2495 If non-nil, STRING is the source string, and a new string with the specified | |
2496 replacements is created and returned. Otherwise the current buffer is the | |
2497 source text. | |
2498 | |
2499 If non-nil, STRBUFFER may be an integer, interpreted as the index of the | |
2500 subexpression to replace in the source text, or a buffer to provide the | |
2501 syntax table and case table. If nil, then the \"subexpression\" is 0, i.e., | |
2502 the whole match, and the current buffer provides the syntax and case tables. | |
2503 If STRING is nil, STRBUFFER must be nil or an integer. | |
2504 | |
2505 Specifying a subexpression is only useful after a regular expression match, | |
2506 since a fixed string search has no non-trivial subexpressions. | |
2507 | |
2508 It is not possible to specify both a buffer and a subexpression. If that is | |
2509 desired, the idiom `(with-current-buffer BUFFER (replace-match ... INTEGER))' | |
2510 may be appropriate. | |
2511 | |
2512 If STRING is nil but the last thing matched (or searched) was a string, or | |
2513 STRING is a string but the last thing matched was a buffer, an | |
2514 `invalid-argument' error will be signaled. (XEmacs does not check that the | |
2515 last thing searched is the source string, but it is not useful to use a | |
2516 different string as source.) | |
2517 | |
2518 If no match (including searches) has been successful or the requested | |
1468 | 2519 subexpression was not matched, an `args-out-of-range' error will be |
2520 signaled. (If no match has ever been conducted in this instance of | |
2521 XEmacs, an `invalid-operation' error will be signaled. This is very | |
2522 rare.) | |
428 | 2523 */ |
444 | 2524 (replacement, fixedcase, literal, string, strbuffer)) |
428 | 2525 { |
2526 /* This function can GC */ | |
2527 enum { nochange, all_caps, cap_initial } case_action; | |
665 | 2528 Charbpos pos, last; |
428 | 2529 int some_multiletter_word; |
2530 int some_lowercase; | |
2531 int some_uppercase; | |
2532 int some_nonuppercase_initial; | |
867 | 2533 Ichar c, prevc; |
428 | 2534 Charcount inslen; |
2535 struct buffer *buf; | |
826 | 2536 Lisp_Object syntax_table; |
428 | 2537 int mc_count; |
2538 Lisp_Object buffer; | |
2539 int_dynarr *ul_action_dynarr = 0; | |
2540 int_dynarr *ul_pos_dynarr = 0; | |
502 | 2541 int sub = 0; |
428 | 2542 int speccount; |
2543 | |
444 | 2544 CHECK_STRING (replacement); |
428 | 2545 |
4199 | 2546 /* Because GNU decided to be incompatible here, we support the following |
2547 baroque and bogus API for the STRING and STRBUFFER arguments: | |
2548 types interpretations | |
2549 STRING STRBUFFER STRING STRBUFFER | |
2550 nil nil none 0 = index of subexpression to replace | |
2551 nil integer none index of subexpression to replace | |
2552 nil other ***** error ***** | |
2553 string nil source current buffer provides syntax table | |
2554 subexpression = 0 (whole match) | |
2555 string buffer source buffer providing syntax table | |
2556 subexpression = 0 (whole match) | |
2557 string integer source current buffer provides syntax table | |
2558 subexpression = STRBUFFER | |
2559 string other ***** error ***** | |
2560 */ | |
2561 | |
2562 /* Do STRBUFFER first; if STRING is nil, we'll overwrite BUF and BUFFER. */ | |
2563 | |
2564 /* If the match data were abstracted into a special "match data" type | |
2565 instead of the typical half-assed "let the implementation be visible" | |
2566 form it's in, we could extend it to include the last string matched | |
2567 and the buffer used for that matching. But of course we can't change | |
2568 it as it is. | |
2569 */ | |
2570 if (NILP (strbuffer) || BUFFERP (strbuffer)) | |
2571 { | |
2572 buf = decode_buffer (strbuffer, 0); | |
2573 } | |
2574 else if (!NILP (strbuffer)) | |
2575 { | |
2576 CHECK_INT (strbuffer); | |
2577 sub = XINT (strbuffer); | |
2578 if (sub < 0 || sub >= (int) search_regs.num_regs) | |
2579 invalid_argument ("match data register invalid", strbuffer); | |
2580 if (search_regs.start[sub] < 0) | |
2581 invalid_argument ("match data register not set", strbuffer); | |
2582 buf = current_buffer; | |
2583 } | |
2584 else | |
2585 invalid_argument ("STRBUFFER must be nil, a buffer, or an integer", | |
2586 strbuffer); | |
2587 buffer = wrap_buffer (buf); | |
2588 | |
428 | 2589 if (! NILP (string)) |
2590 { | |
2591 CHECK_STRING (string); | |
2592 if (!EQ (last_thing_searched, Qt)) | |
4199 | 2593 invalid_argument ("last thing matched was not a string", Qunbound); |
428 | 2594 } |
2595 else | |
2596 { | |
2597 if (!BUFFERP (last_thing_searched)) | |
4199 | 2598 invalid_argument ("last thing matched was not a buffer", Qunbound); |
428 | 2599 buffer = last_thing_searched; |
2600 buf = XBUFFER (buffer); | |
2601 } | |
2602 | |
826 | 2603 syntax_table = buf->mirror_syntax_table; |
428 | 2604 |
2605 case_action = nochange; /* We tried an initialization */ | |
2606 /* but some C compilers blew it */ | |
2607 | |
2608 if (search_regs.num_regs == 0) | |
826 | 2609 signal_error (Qinvalid_operation, |
2610 "replace-match called before any match found", Qunbound); | |
428 | 2611 |
2612 if (NILP (string)) | |
2613 { | |
469 | 2614 if (search_regs.start[sub] < BUF_BEGV (buf) |
2615 || search_regs.start[sub] > search_regs.end[sub] | |
2616 || search_regs.end[sub] > BUF_ZV (buf)) | |
2617 args_out_of_range (make_int (search_regs.start[sub]), | |
2618 make_int (search_regs.end[sub])); | |
428 | 2619 } |
2620 else | |
2621 { | |
2622 if (search_regs.start[0] < 0 | |
2623 || search_regs.start[0] > search_regs.end[0] | |
826 | 2624 || search_regs.end[0] > string_char_length (string)) |
428 | 2625 args_out_of_range (make_int (search_regs.start[0]), |
2626 make_int (search_regs.end[0])); | |
2627 } | |
2628 | |
2629 if (NILP (fixedcase)) | |
2630 { | |
2631 /* Decide how to casify by examining the matched text. */ | |
2632 | |
707 | 2633 last = search_regs.end[sub]; |
428 | 2634 prevc = '\n'; |
2635 case_action = all_caps; | |
2636 | |
2637 /* some_multiletter_word is set nonzero if any original word | |
2638 is more than one letter long. */ | |
2639 some_multiletter_word = 0; | |
2640 some_lowercase = 0; | |
2641 some_nonuppercase_initial = 0; | |
2642 some_uppercase = 0; | |
2643 | |
707 | 2644 for (pos = search_regs.start[sub]; pos < last; pos++) |
428 | 2645 { |
2646 if (NILP (string)) | |
2647 c = BUF_FETCH_CHAR (buf, pos); | |
2648 else | |
867 | 2649 c = string_ichar (string, pos); |
428 | 2650 |
2651 if (LOWERCASEP (buf, c)) | |
2652 { | |
2653 /* Cannot be all caps if any original char is lower case */ | |
2654 | |
2655 some_lowercase = 1; | |
2656 if (!WORD_SYNTAX_P (syntax_table, prevc)) | |
2657 some_nonuppercase_initial = 1; | |
2658 else | |
2659 some_multiletter_word = 1; | |
2660 } | |
2661 else if (!NOCASEP (buf, c)) | |
2662 { | |
2663 some_uppercase = 1; | |
2664 if (!WORD_SYNTAX_P (syntax_table, prevc)) | |
2665 ; | |
2666 else | |
2667 some_multiletter_word = 1; | |
2668 } | |
2669 else | |
2670 { | |
2671 /* If the initial is a caseless word constituent, | |
2672 treat that like a lowercase initial. */ | |
2673 if (!WORD_SYNTAX_P (syntax_table, prevc)) | |
2674 some_nonuppercase_initial = 1; | |
2675 } | |
2676 | |
2677 prevc = c; | |
2678 } | |
2679 | |
2680 /* Convert to all caps if the old text is all caps | |
2681 and has at least one multiletter word. */ | |
2682 if (! some_lowercase && some_multiletter_word) | |
2683 case_action = all_caps; | |
2684 /* Capitalize each word, if the old text has all capitalized words. */ | |
2685 else if (!some_nonuppercase_initial && some_multiletter_word) | |
2686 case_action = cap_initial; | |
2687 else if (!some_nonuppercase_initial && some_uppercase) | |
2688 /* Should x -> yz, operating on X, give Yz or YZ? | |
2689 We'll assume the latter. */ | |
2690 case_action = all_caps; | |
2691 else | |
2692 case_action = nochange; | |
2693 } | |
2694 | |
2695 /* Do replacement in a string. */ | |
2696 if (!NILP (string)) | |
2697 { | |
2698 Lisp_Object before, after; | |
2699 | |
2700 speccount = specpdl_depth (); | |
4199 | 2701 before = Fsubstring (string, Qzero, make_int (search_regs.start[sub])); |
2702 after = Fsubstring (string, make_int (search_regs.end[sub]), Qnil); | |
428 | 2703 |
444 | 2704 /* Do case substitution into REPLACEMENT if desired. */ |
428 | 2705 if (NILP (literal)) |
2706 { | |
826 | 2707 Charcount stlen = string_char_length (replacement); |
428 | 2708 Charcount strpos; |
2709 /* XEmacs change: rewrote this loop somewhat to make it | |
2710 cleaner. Also added \U, \E, etc. */ | |
2711 Charcount literal_start = 0; | |
2712 /* We build up the substituted string in ACCUM. */ | |
2713 Lisp_Object accum; | |
2714 | |
2715 accum = Qnil; | |
2716 | |
2717 /* OK, the basic idea here is that we scan through the | |
2718 replacement string until we find a backslash, which | |
2719 represents a substring of the original string to be | |
2720 substituted. We then append onto ACCUM the literal | |
2721 text before the backslash (LASTPOS marks the | |
2722 beginning of this) followed by the substring of the | |
2723 original string that needs to be inserted. */ | |
2724 for (strpos = 0; strpos < stlen; strpos++) | |
2725 { | |
2726 /* If LITERAL_END is set, we've encountered a backslash | |
2727 (the end of literal text to be inserted). */ | |
2728 Charcount literal_end = -1; | |
2729 /* If SUBSTART is set, we need to also insert the | |
2730 text from SUBSTART to SUBEND in the original string. */ | |
2731 Charcount substart = -1; | |
2732 Charcount subend = -1; | |
2733 | |
867 | 2734 c = string_ichar (replacement, strpos); |
428 | 2735 if (c == '\\' && strpos < stlen - 1) |
2736 { | |
867 | 2737 c = string_ichar (replacement, ++strpos); |
428 | 2738 if (c == '&') |
2739 { | |
2740 literal_end = strpos - 1; | |
2741 substart = search_regs.start[0]; | |
2742 subend = search_regs.end[0]; | |
2743 } | |
4199 | 2744 /* #### This logic is totally broken, |
2745 since we can have backrefs like "\99", right? */ | |
428 | 2746 else if (c >= '1' && c <= '9' && |
2747 c <= search_regs.num_regs + '0') | |
2748 { | |
2749 if (search_regs.start[c - '0'] >= 0) | |
2750 { | |
2751 literal_end = strpos - 1; | |
2752 substart = search_regs.start[c - '0']; | |
2753 subend = search_regs.end[c - '0']; | |
2754 } | |
2755 } | |
2756 else if (c == 'U' || c == 'u' || c == 'L' || c == 'l' || | |
2757 c == 'E') | |
2758 { | |
2759 /* Keep track of all case changes requested, but don't | |
2760 make them now. Do them later so we override | |
2761 everything else. */ | |
2762 if (!ul_pos_dynarr) | |
2763 { | |
2764 ul_pos_dynarr = Dynarr_new (int); | |
2765 ul_action_dynarr = Dynarr_new (int); | |
2766 record_unwind_protect | |
2767 (free_created_dynarrs, | |
2768 noseeum_cons | |
2769 (make_opaque_ptr (ul_pos_dynarr), | |
2770 make_opaque_ptr (ul_action_dynarr))); | |
2771 } | |
2772 literal_end = strpos - 1; | |
2773 Dynarr_add (ul_pos_dynarr, | |
2774 (!NILP (accum) | |
826 | 2775 ? string_char_length (accum) |
428 | 2776 : 0) + (literal_end - literal_start)); |
2777 Dynarr_add (ul_action_dynarr, c); | |
2778 } | |
2779 else if (c == '\\') | |
2780 /* So we get just one backslash. */ | |
2781 literal_end = strpos; | |
2782 } | |
2783 if (literal_end >= 0) | |
2784 { | |
2785 Lisp_Object literal_text = Qnil; | |
2786 Lisp_Object substring = Qnil; | |
2787 if (literal_end != literal_start) | |
444 | 2788 literal_text = Fsubstring (replacement, |
428 | 2789 make_int (literal_start), |
2790 make_int (literal_end)); | |
2791 if (substart >= 0 && subend != substart) | |
2792 substring = Fsubstring (string, | |
2793 make_int (substart), | |
2794 make_int (subend)); | |
2795 if (!NILP (literal_text) || !NILP (substring)) | |
2796 accum = concat3 (accum, literal_text, substring); | |
2797 literal_start = strpos + 1; | |
2798 } | |
2799 } | |
2800 | |
2801 if (strpos != literal_start) | |
2802 /* some literal text at end to be inserted */ | |
444 | 2803 replacement = concat2 (accum, Fsubstring (replacement, |
2804 make_int (literal_start), | |
2805 make_int (strpos))); | |
428 | 2806 else |
444 | 2807 replacement = accum; |
428 | 2808 } |
2809 | |
444 | 2810 /* replacement can be nil. */ |
2811 if (NILP (replacement)) | |
2812 replacement = build_string (""); | |
2813 | |
428 | 2814 if (case_action == all_caps) |
444 | 2815 replacement = Fupcase (replacement, buffer); |
428 | 2816 else if (case_action == cap_initial) |
444 | 2817 replacement = Fupcase_initials (replacement, buffer); |
428 | 2818 |
2819 /* Now finally, we need to process the \U's, \E's, etc. */ | |
2820 if (ul_pos_dynarr) | |
2821 { | |
2822 int i = 0; | |
2823 int cur_action = 'E'; | |
826 | 2824 Charcount stlen = string_char_length (replacement); |
428 | 2825 Charcount strpos; |
2826 | |
2827 for (strpos = 0; strpos < stlen; strpos++) | |
2828 { | |
867 | 2829 Ichar curchar = string_ichar (replacement, strpos); |
2830 Ichar newchar = -1; | |
428 | 2831 if (i < Dynarr_length (ul_pos_dynarr) && |
2832 strpos == Dynarr_at (ul_pos_dynarr, i)) | |
2833 { | |
2834 int new_action = Dynarr_at (ul_action_dynarr, i); | |
2835 i++; | |
2836 if (new_action == 'u') | |
2837 newchar = UPCASE (buf, curchar); | |
2838 else if (new_action == 'l') | |
2839 newchar = DOWNCASE (buf, curchar); | |
2840 else | |
2841 cur_action = new_action; | |
2842 } | |
2843 if (newchar == -1) | |
2844 { | |
2845 if (cur_action == 'U') | |
2846 newchar = UPCASE (buf, curchar); | |
2847 else if (cur_action == 'L') | |
2848 newchar = DOWNCASE (buf, curchar); | |
2849 else | |
2850 newchar = curchar; | |
2851 } | |
2852 if (newchar != curchar) | |
793 | 2853 set_string_char (replacement, strpos, newchar); |
428 | 2854 } |
2855 } | |
2856 | |
2857 /* frees the Dynarrs if necessary. */ | |
771 | 2858 unbind_to (speccount); |
444 | 2859 return concat3 (before, replacement, after); |
428 | 2860 } |
2861 | |
707 | 2862 mc_count = begin_multiple_change (buf, search_regs.start[sub], |
2863 search_regs.end[sub]); | |
428 | 2864 |
2865 /* begin_multiple_change() records an unwind-protect, so we need to | |
2866 record this value now. */ | |
2867 speccount = specpdl_depth (); | |
2868 | |
2869 /* We insert the replacement text before the old text, and then | |
2870 delete the original text. This means that markers at the | |
2871 beginning or end of the original will float to the corresponding | |
2872 position in the replacement. */ | |
707 | 2873 BUF_SET_PT (buf, search_regs.start[sub]); |
428 | 2874 if (!NILP (literal)) |
444 | 2875 Finsert (1, &replacement); |
428 | 2876 else |
2877 { | |
826 | 2878 Charcount stlen = string_char_length (replacement); |
428 | 2879 Charcount strpos; |
2880 struct gcpro gcpro1; | |
444 | 2881 GCPRO1 (replacement); |
428 | 2882 for (strpos = 0; strpos < stlen; strpos++) |
2883 { | |
707 | 2884 /* on the first iteration assert(offset==0), |
2885 exactly complementing BUF_SET_PT() above. | |
2886 During the loop, it keeps track of the amount inserted. | |
2887 */ | |
2888 Charcount offset = BUF_PT (buf) - search_regs.start[sub]; | |
428 | 2889 |
867 | 2890 c = string_ichar (replacement, strpos); |
428 | 2891 if (c == '\\' && strpos < stlen - 1) |
2892 { | |
707 | 2893 /* XXX FIXME: replacing just a substring non-literally |
2894 using backslash refs to the match looks dangerous. But | |
2895 <15366.18513.698042.156573@ns.caldera.de> from Torsten Duwe | |
2896 <duwe@caldera.de> claims Finsert_buffer_substring already | |
2897 handles this correctly. | |
2898 */ | |
867 | 2899 c = string_ichar (replacement, ++strpos); |
428 | 2900 if (c == '&') |
2901 Finsert_buffer_substring | |
2902 (buffer, | |
2903 make_int (search_regs.start[0] + offset), | |
2904 make_int (search_regs.end[0] + offset)); | |
4199 | 2905 /* #### This logic is totally broken, |
2906 since we can have backrefs like "\99", right? */ | |
428 | 2907 else if (c >= '1' && c <= '9' && |
2908 c <= search_regs.num_regs + '0') | |
2909 { | |
2910 if (search_regs.start[c - '0'] >= 1) | |
2911 Finsert_buffer_substring | |
2912 (buffer, | |
2913 make_int (search_regs.start[c - '0'] + offset), | |
2914 make_int (search_regs.end[c - '0'] + offset)); | |
2915 } | |
2916 else if (c == 'U' || c == 'u' || c == 'L' || c == 'l' || | |
2917 c == 'E') | |
2918 { | |
2919 /* Keep track of all case changes requested, but don't | |
2920 make them now. Do them later so we override | |
2921 everything else. */ | |
2922 if (!ul_pos_dynarr) | |
2923 { | |
2924 ul_pos_dynarr = Dynarr_new (int); | |
2925 ul_action_dynarr = Dynarr_new (int); | |
2926 record_unwind_protect | |
2927 (free_created_dynarrs, | |
2928 Fcons (make_opaque_ptr (ul_pos_dynarr), | |
2929 make_opaque_ptr (ul_action_dynarr))); | |
2930 } | |
2931 Dynarr_add (ul_pos_dynarr, BUF_PT (buf)); | |
2932 Dynarr_add (ul_action_dynarr, c); | |
2933 } | |
2934 else | |
2935 buffer_insert_emacs_char (buf, c); | |
2936 } | |
2937 else | |
2938 buffer_insert_emacs_char (buf, c); | |
2939 } | |
2940 UNGCPRO; | |
2941 } | |
2942 | |
707 | 2943 inslen = BUF_PT (buf) - (search_regs.start[sub]); |
2944 buffer_delete_range (buf, search_regs.start[sub] + inslen, | |
2945 search_regs.end[sub] + inslen, 0); | |
428 | 2946 |
2947 if (case_action == all_caps) | |
2948 Fupcase_region (make_int (BUF_PT (buf) - inslen), | |
2949 make_int (BUF_PT (buf)), buffer); | |
2950 else if (case_action == cap_initial) | |
2951 Fupcase_initials_region (make_int (BUF_PT (buf) - inslen), | |
2952 make_int (BUF_PT (buf)), buffer); | |
2953 | |
2954 /* Now go through and make all the case changes that were requested | |
2955 in the replacement string. */ | |
2956 if (ul_pos_dynarr) | |
2957 { | |
665 | 2958 Charbpos eend = BUF_PT (buf); |
428 | 2959 int i = 0; |
2960 int cur_action = 'E'; | |
2961 | |
2962 for (pos = BUF_PT (buf) - inslen; pos < eend; pos++) | |
2963 { | |
867 | 2964 Ichar curchar = BUF_FETCH_CHAR (buf, pos); |
2965 Ichar newchar = -1; | |
428 | 2966 if (i < Dynarr_length (ul_pos_dynarr) && |
2967 pos == Dynarr_at (ul_pos_dynarr, i)) | |
2968 { | |
2969 int new_action = Dynarr_at (ul_action_dynarr, i); | |
2970 i++; | |
2971 if (new_action == 'u') | |
2972 newchar = UPCASE (buf, curchar); | |
2973 else if (new_action == 'l') | |
2974 newchar = DOWNCASE (buf, curchar); | |
2975 else | |
2976 cur_action = new_action; | |
2977 } | |
2978 if (newchar == -1) | |
2979 { | |
2980 if (cur_action == 'U') | |
2981 newchar = UPCASE (buf, curchar); | |
2982 else if (cur_action == 'L') | |
2983 newchar = DOWNCASE (buf, curchar); | |
2984 else | |
2985 newchar = curchar; | |
2986 } | |
2987 if (newchar != curchar) | |
2988 buffer_replace_char (buf, pos, newchar, 0, 0); | |
2989 } | |
2990 } | |
2991 | |
2992 /* frees the Dynarrs if necessary. */ | |
771 | 2993 unbind_to (speccount); |
428 | 2994 end_multiple_change (buf, mc_count); |
2995 | |
2996 return Qnil; | |
2997 } | |
2998 | |
2999 static Lisp_Object | |
3000 match_limit (Lisp_Object num, int beginningp) | |
3001 { | |
3002 int n; | |
3003 | |
3004 CHECK_INT (num); | |
3005 n = XINT (num); | |
3006 if (n < 0 || n >= search_regs.num_regs) | |
3007 args_out_of_range (num, make_int (search_regs.num_regs)); | |
3008 if (search_regs.num_regs == 0 || | |
3009 search_regs.start[n] < 0) | |
3010 return Qnil; | |
3011 return make_int (beginningp ? search_regs.start[n] : search_regs.end[n]); | |
3012 } | |
3013 | |
3014 DEFUN ("match-beginning", Fmatch_beginning, 1, 1, 0, /* | |
3015 Return position of start of text matched by last regexp search. | |
3016 NUM, specifies which parenthesized expression in the last regexp. | |
3017 Value is nil if NUMth pair didn't match, or there were less than NUM pairs. | |
3018 Zero means the entire text matched by the whole regexp or whole string. | |
3019 */ | |
3020 (num)) | |
3021 { | |
3022 return match_limit (num, 1); | |
3023 } | |
3024 | |
3025 DEFUN ("match-end", Fmatch_end, 1, 1, 0, /* | |
3026 Return position of end of text matched by last regexp search. | |
3027 NUM specifies which parenthesized expression in the last regexp. | |
3028 Value is nil if NUMth pair didn't match, or there were less than NUM pairs. | |
3029 Zero means the entire text matched by the whole regexp or whole string. | |
3030 */ | |
3031 (num)) | |
3032 { | |
3033 return match_limit (num, 0); | |
3034 } | |
3035 | |
3036 DEFUN ("match-data", Fmatch_data, 0, 2, 0, /* | |
3037 Return a list containing all info on what the last regexp search matched. | |
3038 Element 2N is `(match-beginning N)'; element 2N + 1 is `(match-end N)'. | |
3039 All the elements are markers or nil (nil if the Nth pair didn't match) | |
3040 if the last match was on a buffer; integers or nil if a string was matched. | |
3041 Use `store-match-data' to reinstate the data in this list. | |
3042 | |
3043 If INTEGERS (the optional first argument) is non-nil, always use integers | |
3044 \(rather than markers) to represent buffer positions. | |
3045 If REUSE is a list, reuse it as part of the value. If REUSE is long enough | |
3046 to hold all the values, and if INTEGERS is non-nil, no consing is done. | |
3047 */ | |
3048 (integers, reuse)) | |
3049 { | |
3050 Lisp_Object tail, prev; | |
3051 Lisp_Object *data; | |
3052 int i; | |
3053 Charcount len; | |
3054 | |
3055 if (NILP (last_thing_searched)) | |
563 | 3056 /*error ("match-data called before any match found", Qunbound);*/ |
428 | 3057 return Qnil; |
3058 | |
3059 data = alloca_array (Lisp_Object, 2 * search_regs.num_regs); | |
3060 | |
3061 len = -1; | |
3062 for (i = 0; i < search_regs.num_regs; i++) | |
3063 { | |
665 | 3064 Charbpos start = search_regs.start[i]; |
428 | 3065 if (start >= 0) |
3066 { | |
3067 if (EQ (last_thing_searched, Qt) | |
3068 || !NILP (integers)) | |
3069 { | |
3070 data[2 * i] = make_int (start); | |
3071 data[2 * i + 1] = make_int (search_regs.end[i]); | |
3072 } | |
3073 else if (BUFFERP (last_thing_searched)) | |
3074 { | |
3075 data[2 * i] = Fmake_marker (); | |
3076 Fset_marker (data[2 * i], | |
3077 make_int (start), | |
3078 last_thing_searched); | |
3079 data[2 * i + 1] = Fmake_marker (); | |
3080 Fset_marker (data[2 * i + 1], | |
3081 make_int (search_regs.end[i]), | |
3082 last_thing_searched); | |
3083 } | |
3084 else | |
3085 /* last_thing_searched must always be Qt, a buffer, or Qnil. */ | |
2500 | 3086 ABORT (); |
428 | 3087 |
3088 len = i; | |
3089 } | |
3090 else | |
3091 data[2 * i] = data [2 * i + 1] = Qnil; | |
3092 } | |
3093 if (!CONSP (reuse)) | |
3094 return Flist (2 * len + 2, data); | |
3095 | |
3096 /* If REUSE is a list, store as many value elements as will fit | |
3097 into the elements of REUSE. */ | |
3098 for (prev = Qnil, i = 0, tail = reuse; CONSP (tail); i++, tail = XCDR (tail)) | |
3099 { | |
3100 if (i < 2 * len + 2) | |
3101 XCAR (tail) = data[i]; | |
3102 else | |
3103 XCAR (tail) = Qnil; | |
3104 prev = tail; | |
3105 } | |
3106 | |
3107 /* If we couldn't fit all value elements into REUSE, | |
3108 cons up the rest of them and add them to the end of REUSE. */ | |
3109 if (i < 2 * len + 2) | |
3110 XCDR (prev) = Flist (2 * len + 2 - i, data + i); | |
3111 | |
3112 return reuse; | |
3113 } | |
3114 | |
3115 | |
3116 DEFUN ("store-match-data", Fstore_match_data, 1, 1, 0, /* | |
3117 Set internal data on last search match from elements of LIST. | |
1468 | 3118 LIST should have been created by calling `match-data' previously, |
3119 or be nil, to clear the internal match data. | |
428 | 3120 */ |
3121 (list)) | |
3122 { | |
3123 REGISTER int i; | |
3124 REGISTER Lisp_Object marker; | |
3125 int num_regs; | |
3126 int length; | |
3127 | |
853 | 3128 /* Some FSF junk with running_asynch_code, to preserve the match |
3129 data. Not necessary because we don't call process filters | |
3130 asynchronously (i.e. from within QUIT). */ | |
428 | 3131 |
3132 CONCHECK_LIST (list); | |
3133 | |
3134 /* Unless we find a marker with a buffer in LIST, assume that this | |
3135 match data came from a string. */ | |
3136 last_thing_searched = Qt; | |
3137 | |
3138 /* Allocate registers if they don't already exist. */ | |
3139 length = XINT (Flength (list)) / 2; | |
3140 num_regs = search_regs.num_regs; | |
3141 | |
3142 if (length > num_regs) | |
3143 { | |
3144 if (search_regs.num_regs == 0) | |
3145 { | |
3146 search_regs.start = xnew_array (regoff_t, length); | |
3147 search_regs.end = xnew_array (regoff_t, length); | |
3148 } | |
3149 else | |
3150 { | |
3151 XREALLOC_ARRAY (search_regs.start, regoff_t, length); | |
3152 XREALLOC_ARRAY (search_regs.end, regoff_t, length); | |
3153 } | |
3154 | |
3155 search_regs.num_regs = length; | |
3156 } | |
3157 | |
3158 for (i = 0; i < num_regs; i++) | |
3159 { | |
3160 marker = Fcar (list); | |
3161 if (NILP (marker)) | |
3162 { | |
3163 search_regs.start[i] = -1; | |
3164 list = Fcdr (list); | |
3165 } | |
3166 else | |
3167 { | |
3168 if (MARKERP (marker)) | |
3169 { | |
3170 if (XMARKER (marker)->buffer == 0) | |
3171 marker = Qzero; | |
3172 else | |
793 | 3173 last_thing_searched = wrap_buffer (XMARKER (marker)->buffer); |
428 | 3174 } |
3175 | |
3176 CHECK_INT_COERCE_MARKER (marker); | |
3177 search_regs.start[i] = XINT (marker); | |
3178 list = Fcdr (list); | |
3179 | |
3180 marker = Fcar (list); | |
3181 if (MARKERP (marker) && XMARKER (marker)->buffer == 0) | |
3182 marker = Qzero; | |
3183 | |
3184 CHECK_INT_COERCE_MARKER (marker); | |
3185 search_regs.end[i] = XINT (marker); | |
3186 } | |
3187 list = Fcdr (list); | |
3188 } | |
3189 | |
3190 return Qnil; | |
3191 } | |
3192 | |
3193 /* Quote a string to inactivate reg-expr chars */ | |
3194 | |
3195 DEFUN ("regexp-quote", Fregexp_quote, 1, 1, 0, /* | |
3196 Return a regexp string which matches exactly STRING and nothing else. | |
3197 */ | |
444 | 3198 (string)) |
428 | 3199 { |
867 | 3200 REGISTER Ibyte *in, *out, *end; |
3201 REGISTER Ibyte *temp; | |
428 | 3202 |
444 | 3203 CHECK_STRING (string); |
428 | 3204 |
2367 | 3205 temp = alloca_ibytes (XSTRING_LENGTH (string) * 2); |
428 | 3206 |
3207 /* Now copy the data into the new string, inserting escapes. */ | |
3208 | |
444 | 3209 in = XSTRING_DATA (string); |
3210 end = in + XSTRING_LENGTH (string); | |
428 | 3211 out = temp; |
3212 | |
3213 while (in < end) | |
3214 { | |
867 | 3215 Ichar c = itext_ichar (in); |
428 | 3216 |
3217 if (c == '[' || c == ']' | |
3218 || c == '*' || c == '.' || c == '\\' | |
3219 || c == '?' || c == '+' | |
3220 || c == '^' || c == '$') | |
3221 *out++ = '\\'; | |
867 | 3222 out += set_itext_ichar (out, c); |
3223 INC_IBYTEPTR (in); | |
428 | 3224 } |
3225 | |
3226 return make_string (temp, out - temp); | |
3227 } | |
3228 | |
3229 DEFUN ("set-word-regexp", Fset_word_regexp, 1, 1, 0, /* | |
3230 Set the regexp to be used to match a word in regular-expression searching. | |
3231 #### Not yet implemented. Currently does nothing. | |
3232 #### Do not use this yet. Its calling interface is likely to change. | |
3233 */ | |
2286 | 3234 (UNUSED (regexp))) |
428 | 3235 { |
3236 return Qnil; | |
3237 } | |
3238 | |
3239 | |
3240 /************************************************************************/ | |
3241 /* initialization */ | |
3242 /************************************************************************/ | |
3243 | |
3244 void | |
3245 syms_of_search (void) | |
3246 { | |
3247 | |
442 | 3248 DEFERROR_STANDARD (Qsearch_failed, Qinvalid_operation); |
3249 DEFERROR_STANDARD (Qinvalid_regexp, Qsyntax_error); | |
563 | 3250 Fput (Qinvalid_regexp, Qerror_lacks_explanatory_string, Qt); |
428 | 3251 |
3252 DEFSUBR (Flooking_at); | |
3253 DEFSUBR (Fposix_looking_at); | |
3254 DEFSUBR (Fstring_match); | |
3255 DEFSUBR (Fposix_string_match); | |
3256 DEFSUBR (Fskip_chars_forward); | |
3257 DEFSUBR (Fskip_chars_backward); | |
3258 DEFSUBR (Fskip_syntax_forward); | |
3259 DEFSUBR (Fskip_syntax_backward); | |
3260 DEFSUBR (Fsearch_forward); | |
3261 DEFSUBR (Fsearch_backward); | |
3262 DEFSUBR (Fword_search_forward); | |
3263 DEFSUBR (Fword_search_backward); | |
3264 DEFSUBR (Fre_search_forward); | |
3265 DEFSUBR (Fre_search_backward); | |
3266 DEFSUBR (Fposix_search_forward); | |
3267 DEFSUBR (Fposix_search_backward); | |
3268 DEFSUBR (Freplace_match); | |
3269 DEFSUBR (Fmatch_beginning); | |
3270 DEFSUBR (Fmatch_end); | |
3271 DEFSUBR (Fmatch_data); | |
3272 DEFSUBR (Fstore_match_data); | |
3273 DEFSUBR (Fregexp_quote); | |
3274 DEFSUBR (Fset_word_regexp); | |
3275 } | |
3276 | |
3277 void | |
3278 reinit_vars_of_search (void) | |
3279 { | |
3280 int i; | |
3281 | |
3282 last_thing_searched = Qnil; | |
3283 staticpro_nodump (&last_thing_searched); | |
3284 | |
3285 for (i = 0; i < REGEXP_CACHE_SIZE; ++i) | |
3286 { | |
3287 searchbufs[i].buf.allocated = 100; | |
3288 searchbufs[i].buf.buffer = (unsigned char *) xmalloc (100); | |
3289 searchbufs[i].buf.fastmap = searchbufs[i].fastmap; | |
3290 searchbufs[i].regexp = Qnil; | |
3291 staticpro_nodump (&searchbufs[i].regexp); | |
3292 searchbufs[i].next = (i == REGEXP_CACHE_SIZE-1 ? 0 : &searchbufs[i+1]); | |
3293 } | |
3294 searchbuf_head = &searchbufs[0]; | |
3295 } | |
3296 | |
3297 void | |
3298 vars_of_search (void) | |
3299 { | |
3300 DEFVAR_LISP ("forward-word-regexp", &Vforward_word_regexp /* | |
3301 *Regular expression to be used in `forward-word'. | |
3302 #### Not yet implemented. | |
3303 */ ); | |
3304 Vforward_word_regexp = Qnil; | |
3305 | |
3306 DEFVAR_LISP ("backward-word-regexp", &Vbackward_word_regexp /* | |
3307 *Regular expression to be used in `backward-word'. | |
3308 #### Not yet implemented. | |
3309 */ ); | |
3310 Vbackward_word_regexp = Qnil; | |
502 | 3311 |
3312 DEFVAR_INT ("warn-about-possibly-incompatible-back-references", | |
3313 &warn_about_possibly_incompatible_back_references /* | |
3314 If true, issue warnings when new-semantics back references occur. | |
3315 This is to catch places where old code might inadvertently have changed | |
3316 semantics. This will occur in old code only where more than nine groups | |
3317 occur and a back reference to one of them is directly followed by a digit. | |
3318 */ ); | |
3319 warn_about_possibly_incompatible_back_references = 1; | |
814 | 3320 |
2421 | 3321 Vskip_chars_range_table = Fmake_range_table (Qstart_closed_end_closed); |
428 | 3322 staticpro (&Vskip_chars_range_table); |
3323 } |