Mercurial > hg > xemacs-beta
comparison src/search.c @ 446:1ccc32a20af4 r21-2-38
Import from CVS: tag r21-2-38
author | cvs |
---|---|
date | Mon, 13 Aug 2007 11:37:21 +0200 |
parents | 576fb035e263 |
children | 223736d75acb |
comparison
equal
deleted
inserted
replaced
445:34f3776fcf0e | 446:1ccc32a20af4 |
---|---|
36 #endif | 36 #endif |
37 #include "syntax.h" | 37 #include "syntax.h" |
38 | 38 |
39 #include <sys/types.h> | 39 #include <sys/types.h> |
40 #include "regex.h" | 40 #include "regex.h" |
41 | 41 #include "casetab.h" |
42 #include "chartab.h" | |
43 | |
44 #define TRANSLATE(table, pos) \ | |
45 (!NILP (table) ? TRT_TABLE_OF (table, (Emchar) pos) : pos) | |
42 | 46 |
43 #define REGEXP_CACHE_SIZE 20 | 47 #define REGEXP_CACHE_SIZE 20 |
44 | 48 |
45 /* If the regexp is non-nil, then the buffer contains the compiled form | 49 /* If the regexp is non-nil, then the buffer contains the compiled form |
46 of that regexp, suitable for searching. */ | 50 of that regexp, suitable for searching. */ |
47 struct regexp_cache { | 51 struct regexp_cache |
52 { | |
48 struct regexp_cache *next; | 53 struct regexp_cache *next; |
49 Lisp_Object regexp; | 54 Lisp_Object regexp; |
50 struct re_pattern_buffer buf; | 55 struct re_pattern_buffer buf; |
51 char fastmap[0400]; | 56 char fastmap[0400]; |
52 /* Nonzero means regexp was compiled to do full POSIX backtracking. */ | 57 /* Nonzero means regexp was compiled to do full POSIX backtracking. */ |
102 /* range table for use with skip_chars. Only needed for Mule. */ | 107 /* range table for use with skip_chars. Only needed for Mule. */ |
103 Lisp_Object Vskip_chars_range_table; | 108 Lisp_Object Vskip_chars_range_table; |
104 | 109 |
105 static void set_search_regs (struct buffer *buf, Bufpos beg, Charcount len); | 110 static void set_search_regs (struct buffer *buf, Bufpos beg, Charcount len); |
106 static void save_search_regs (void); | 111 static void save_search_regs (void); |
112 static Bufpos simple_search (struct buffer *buf, Bufbyte *base_pat, | |
113 Bytecount len, Bytind pos, Bytind lim, | |
114 EMACS_INT n, Lisp_Object trt); | |
115 static Bufpos boyer_moore (struct buffer *buf, Bufbyte *base_pat, | |
116 Bytecount len, Bytind pos, Bytind lim, | |
117 EMACS_INT n, Lisp_Object trt, | |
118 Lisp_Object inverse_trt, int charset_base); | |
107 static Bufpos search_buffer (struct buffer *buf, Lisp_Object str, | 119 static Bufpos search_buffer (struct buffer *buf, Lisp_Object str, |
108 Bufpos bufpos, Bufpos buflim, EMACS_INT n, int RE, | 120 Bufpos bufpos, Bufpos buflim, EMACS_INT n, int RE, |
109 unsigned char *trt, unsigned char *inverse_trt, | 121 Lisp_Object trt, Lisp_Object inverse_trt, |
110 int posix); | 122 int posix); |
111 | 123 |
112 static void | 124 static void |
113 matcher_overflow (void) | 125 matcher_overflow (void) |
114 { | 126 { |
126 POSIX is nonzero if we want full backtracking (POSIX style) | 138 POSIX is nonzero if we want full backtracking (POSIX style) |
127 for this pattern. 0 means backtrack only enough to get a valid match. */ | 139 for this pattern. 0 means backtrack only enough to get a valid match. */ |
128 | 140 |
129 static int | 141 static int |
130 compile_pattern_1 (struct regexp_cache *cp, Lisp_Object pattern, | 142 compile_pattern_1 (struct regexp_cache *cp, Lisp_Object pattern, |
131 char *translate, struct re_registers *regp, int posix, | 143 Lisp_Object translate, struct re_registers *regp, int posix, |
132 Error_behavior errb) | 144 Error_behavior errb) |
133 { | 145 { |
134 const char *val; | 146 const char *val; |
135 reg_syntax_t old; | 147 reg_syntax_t old; |
136 | 148 |
165 POSIX is nonzero if we want full backtracking (POSIX style) | 177 POSIX is nonzero if we want full backtracking (POSIX style) |
166 for this pattern. 0 means backtrack only enough to get a valid match. */ | 178 for this pattern. 0 means backtrack only enough to get a valid match. */ |
167 | 179 |
168 struct re_pattern_buffer * | 180 struct re_pattern_buffer * |
169 compile_pattern (Lisp_Object pattern, struct re_registers *regp, | 181 compile_pattern (Lisp_Object pattern, struct re_registers *regp, |
170 char *translate, int posix, Error_behavior errb) | 182 Lisp_Object translate, int posix, Error_behavior errb) |
171 { | 183 { |
172 struct regexp_cache *cp, **cpp; | 184 struct regexp_cache *cp, **cpp; |
173 | 185 |
174 for (cpp = &searchbuf_head; ; cpp = &cp->next) | 186 for (cpp = &searchbuf_head; ; cpp = &cp->next) |
175 { | 187 { |
176 cp = *cpp; | 188 cp = *cpp; |
177 if (!NILP (Fstring_equal (cp->regexp, pattern)) | 189 if (!NILP (Fstring_equal (cp->regexp, pattern)) |
178 && cp->buf.translate == translate | 190 && EQ (cp->buf.translate, translate) |
179 && cp->posix == posix) | 191 && cp->posix == posix) |
180 break; | 192 break; |
181 | 193 |
182 /* If we're at the end of the cache, compile into the last cell. */ | 194 /* If we're at the end of the cache, compile into the last cell. */ |
183 if (cp->next == 0) | 195 if (cp->next == 0) |
208 Lisp_Object Qsearch_failed; | 220 Lisp_Object Qsearch_failed; |
209 | 221 |
210 static Lisp_Object | 222 static Lisp_Object |
211 signal_failure (Lisp_Object arg) | 223 signal_failure (Lisp_Object arg) |
212 { | 224 { |
213 Fsignal (Qsearch_failed, list1 (arg)); | 225 for (;;) |
214 return Qnil; | 226 Fsignal (Qsearch_failed, list1 (arg)); |
227 return Qnil; /* Not reached. */ | |
215 } | 228 } |
216 | 229 |
217 /* Convert the search registers from Bytinds to Bufpos's. Needs to be | 230 /* Convert the search registers from Bytinds to Bufpos's. Needs to be |
218 done after each regexp match that uses the search regs. | 231 done after each regexp match that uses the search regs. |
219 | 232 |
284 save_search_regs (); | 297 save_search_regs (); |
285 | 298 |
286 CHECK_STRING (string); | 299 CHECK_STRING (string); |
287 bufp = compile_pattern (string, &search_regs, | 300 bufp = compile_pattern (string, &search_regs, |
288 (!NILP (buf->case_fold_search) | 301 (!NILP (buf->case_fold_search) |
289 ? (char *) MIRROR_DOWNCASE_TABLE_AS_STRING (buf) | 302 ? XCASE_TABLE_DOWNCASE (buf->case_table) : Qnil), |
290 : 0), | |
291 posix, ERROR_ME); | 303 posix, ERROR_ME); |
292 | 304 |
293 QUIT; | 305 QUIT; |
294 | 306 |
295 /* Get pointers and sizes of the two strings | 307 /* Get pointers and sizes of the two strings |
384 } | 396 } |
385 | 397 |
386 | 398 |
387 bufp = compile_pattern (regexp, &search_regs, | 399 bufp = compile_pattern (regexp, &search_regs, |
388 (!NILP (buf->case_fold_search) | 400 (!NILP (buf->case_fold_search) |
389 ? (char *) MIRROR_DOWNCASE_TABLE_AS_STRING (buf) | 401 ? XCASE_TABLE_DOWNCASE (buf->case_table) : Qnil), |
390 : 0), 0, ERROR_ME); | 402 0, ERROR_ME); |
391 QUIT; | 403 QUIT; |
392 { | 404 { |
393 Bytecount bis = charcount_to_bytecount (XSTRING_DATA (string), s); | 405 Bytecount bis = charcount_to_bytecount (XSTRING_DATA (string), s); |
394 regex_emacs_buffer = buf; | 406 regex_emacs_buffer = buf; |
395 regex_emacs_buffer_p = 0; | 407 regex_emacs_buffer_p = 0; |
454 Bufbyte *newnonreloc = (Bufbyte *) nonreloc; | 466 Bufbyte *newnonreloc = (Bufbyte *) nonreloc; |
455 struct re_pattern_buffer *bufp; | 467 struct re_pattern_buffer *bufp; |
456 | 468 |
457 bufp = compile_pattern (regexp, 0, | 469 bufp = compile_pattern (regexp, 0, |
458 (case_fold_search | 470 (case_fold_search |
459 ? (char *) | 471 ? XCASE_TABLE_DOWNCASE (current_buffer->case_table) |
460 /* #### evil current-buffer dependency */ | 472 : Qnil), |
461 MIRROR_DOWNCASE_TABLE_AS_STRING (current_buffer) | |
462 : 0), | |
463 0, errb); | 473 0, errb); |
464 if (!bufp) | 474 if (!bufp) |
465 return -1; /* will only do this when errb != ERROR_ME */ | 475 return -1; /* will only do this when errb != ERROR_ME */ |
466 if (!no_quit) | 476 if (!no_quit) |
467 QUIT; | 477 QUIT; |
1023 lim = BUF_BEGV (buf); | 1033 lim = BUF_BEGV (buf); |
1024 } | 1034 } |
1025 | 1035 |
1026 np = search_buffer (buf, string, BUF_PT (buf), lim, n, RE, | 1036 np = search_buffer (buf, string, BUF_PT (buf), lim, n, RE, |
1027 (!NILP (buf->case_fold_search) | 1037 (!NILP (buf->case_fold_search) |
1028 ? MIRROR_CANON_TABLE_AS_STRING (buf) | 1038 ? XCASE_TABLE_CANON (buf->case_table) |
1029 : 0), | 1039 : Qnil), |
1030 (!NILP (buf->case_fold_search) | 1040 (!NILP (buf->case_fold_search) |
1031 ? MIRROR_EQV_TABLE_AS_STRING (buf) | 1041 ? XCASE_TABLE_EQV (buf->case_table) |
1032 : 0), posix); | 1042 : Qnil), posix); |
1033 | 1043 |
1034 if (np <= 0) | 1044 if (np <= 0) |
1035 { | 1045 { |
1036 if (NILP (noerror)) | 1046 if (NILP (noerror)) |
1037 return signal_failure (string); | 1047 return signal_failure (string); |
1105 or else the position at the beginning of the Nth occurrence | 1115 or else the position at the beginning of the Nth occurrence |
1106 (if searching backward) or the end (if searching forward). | 1116 (if searching backward) or the end (if searching forward). |
1107 | 1117 |
1108 POSIX is nonzero if we want full backtracking (POSIX style) | 1118 POSIX is nonzero if we want full backtracking (POSIX style) |
1109 for this pattern. 0 means backtrack only enough to get a valid match. */ | 1119 for this pattern. 0 means backtrack only enough to get a valid match. */ |
1110 | |
1111 static Bufpos | 1120 static Bufpos |
1112 search_buffer (struct buffer *buf, Lisp_Object string, Bufpos bufpos, | 1121 search_buffer (struct buffer *buf, Lisp_Object string, Bufpos bufpos, |
1113 Bufpos buflim, EMACS_INT n, int RE, unsigned char *trt, | 1122 Bufpos buflim, EMACS_INT n, int RE, Lisp_Object trt, |
1114 unsigned char *inverse_trt, int posix) | 1123 Lisp_Object inverse_trt, int posix) |
1115 { | 1124 { |
1116 /* This function has been Mule-ized, except for the trt table handling. */ | 1125 /* This function has been Mule-ized, except for the trt table handling. */ |
1117 Bytecount len = XSTRING_LENGTH (string); | 1126 Bytecount len = XSTRING_LENGTH (string); |
1118 Bufbyte *base_pat = XSTRING_DATA (string); | 1127 Bufbyte *base_pat = XSTRING_DATA (string); |
1119 REGISTER EMACS_INT *BM_tab; | |
1120 EMACS_INT *BM_tab_base; | |
1121 REGISTER int direction = ((n > 0) ? 1 : -1); | |
1122 REGISTER Bytecount dirlen; | |
1123 EMACS_INT infinity; | |
1124 Bytind limit; | |
1125 EMACS_INT k; | |
1126 Bytecount stride_for_teases = 0; | |
1127 REGISTER Bufbyte *pat = 0; | |
1128 REGISTER Bufbyte *cursor, *p_limit, *ptr2; | |
1129 REGISTER EMACS_INT i, j; | 1128 REGISTER EMACS_INT i, j; |
1130 Bytind p1, p2; | 1129 Bytind p1, p2; |
1131 Bytecount s1, s2; | 1130 Bytecount s1, s2; |
1132 Bytind pos, lim; | 1131 Bytind pos, lim; |
1133 | 1132 |
1149 lim = bufpos_to_bytind (buf, buflim); | 1148 lim = bufpos_to_bytind (buf, buflim); |
1150 if (RE && !trivial_regexp_p (string)) | 1149 if (RE && !trivial_regexp_p (string)) |
1151 { | 1150 { |
1152 struct re_pattern_buffer *bufp; | 1151 struct re_pattern_buffer *bufp; |
1153 | 1152 |
1154 bufp = compile_pattern (string, &search_regs, (char *) trt, posix, | 1153 bufp = compile_pattern (string, &search_regs, trt, posix, |
1155 ERROR_ME); | 1154 ERROR_ME); |
1156 | 1155 |
1157 /* Get pointers and sizes of the two strings | 1156 /* Get pointers and sizes of the two strings |
1158 that make up the visible portion of the buffer. */ | 1157 that make up the visible portion of the buffer. */ |
1159 | 1158 |
1240 n--; | 1239 n--; |
1241 } | 1240 } |
1242 return bufpos; | 1241 return bufpos; |
1243 } | 1242 } |
1244 else /* non-RE case */ | 1243 else /* non-RE case */ |
1245 /* #### Someone really really really needs to comment the workings | 1244 { |
1246 of this junk somewhat better. | 1245 int charset_base = -1; |
1247 | 1246 int boyer_moore_ok = 1; |
1248 BTW "BM" stands for Boyer-Moore, which is one of the standard | 1247 Bufbyte *pat = 0; |
1249 string-searching algorithms. It's the best string-searching | 1248 Bufbyte *patbuf = alloca_array (Bufbyte, len * MAX_EMCHAR_LEN); |
1250 algorithm out there, provided that: | 1249 pat = patbuf; |
1251 | 1250 #ifdef MULE |
1252 a) You're not fazed by algorithm complexity. (Rabin-Karp, which | 1251 while (len > 0) |
1253 uses hashing, is much much easier to code but not as fast.) | 1252 { |
1254 b) You can freely move backwards in the string that you're | 1253 Bufbyte tmp_str[MAX_EMCHAR_LEN]; |
1255 searching through. | 1254 Emchar c, translated, inverse; |
1256 | 1255 Bytecount orig_bytelen, new_bytelen, inv_bytelen; |
1257 As the comment below tries to explain (but garbles in typical | 1256 |
1258 programmer-ese), the idea is that you don't have to do a | 1257 /* If we got here and the RE flag is set, it's because |
1259 string match at every successive position in the text. For | 1258 we're dealing with a regexp known to be trivial, so the |
1260 example, let's say the pattern is "a very long string". We | 1259 backslash just quotes the next character. */ |
1261 compare the last character in the string (`g') with the | 1260 if (RE && *base_pat == '\\') |
1262 corresponding character in the text. If it mismatches, and | 1261 { |
1263 it is, say, `z', then we can skip forward by the entire | 1262 len--; |
1264 length of the pattern because `z' does not occur anywhere | 1263 base_pat++; |
1265 in the pattern. If the mismatching character does occur | 1264 } |
1266 in the pattern, we can usually still skip forward by more | 1265 c = charptr_emchar (base_pat); |
1267 than one: e.g. if it is `l', then we can skip forward | 1266 translated = TRANSLATE (trt, c); |
1268 by the length of the substring "ong string" -- i.e. the | 1267 inverse = TRANSLATE (inverse_trt, c); |
1269 largest end section of the pattern that does not contain | 1268 |
1270 the mismatched character. So what we do is compute, for | 1269 orig_bytelen = charcount_to_bytecount (base_pat, 1); |
1271 each possible character, the distance we can skip forward | 1270 inv_bytelen = set_charptr_emchar (tmp_str, inverse); |
1272 (the "stride") and use it in the string matching. This | 1271 new_bytelen = set_charptr_emchar (tmp_str, translated); |
1273 is what the BM_tab holds. */ | 1272 |
1274 { | 1273 |
1274 if (new_bytelen != orig_bytelen || inv_bytelen != orig_bytelen) | |
1275 boyer_moore_ok = 0; | |
1276 if (translated != c || inverse != c) | |
1277 { | |
1278 /* Keep track of which character set row | |
1279 contains the characters that need translation. */ | |
1280 int charset_base_code = c & ~CHAR_FIELD3_MASK; | |
1281 if (charset_base == -1) | |
1282 charset_base = charset_base_code; | |
1283 else if (charset_base != charset_base_code) | |
1284 /* If two different rows appear, needing translation, | |
1285 then we cannot use boyer_moore search. */ | |
1286 boyer_moore_ok = 0; | |
1287 } | |
1288 memcpy (pat, tmp_str, new_bytelen); | |
1289 pat += new_bytelen; | |
1290 base_pat += orig_bytelen; | |
1291 len -= orig_bytelen; | |
1292 } | |
1293 #else /* not MULE */ | |
1294 while (--len >= 0) | |
1295 { | |
1296 /* If we got here and the RE flag is set, it's because | |
1297 we're dealing with a regexp known to be trivial, so the | |
1298 backslash just quotes the next character. */ | |
1299 if (RE && *base_pat == '\\') | |
1300 { | |
1301 len--; | |
1302 base_pat++; | |
1303 } | |
1304 *pat++ = TRANSLATE (trt, *base_pat++); | |
1305 } | |
1306 #endif /* MULE */ | |
1307 len = pat - patbuf; | |
1308 pat = base_pat = patbuf; | |
1309 if (boyer_moore_ok) | |
1310 return boyer_moore (buf, base_pat, len, pos, lim, n, | |
1311 trt, inverse_trt, charset_base); | |
1312 else | |
1313 return simple_search (buf, base_pat, len, pos, lim, n, trt); | |
1314 } | |
1315 } | |
1316 | |
1317 /* Do a simple string search N times for the string PAT, | |
1318 whose length is LEN/LEN_BYTE, | |
1319 from buffer position POS/POS_BYTE until LIM/LIM_BYTE. | |
1320 TRT is the translation table. | |
1321 | |
1322 Return the character position where the match is found. | |
1323 Otherwise, if M matches remained to be found, return -M. | |
1324 | |
1325 This kind of search works regardless of what is in PAT and | |
1326 regardless of what is in TRT. It is used in cases where | |
1327 boyer_moore cannot work. */ | |
1328 | |
1329 static Bufpos | |
1330 simple_search (struct buffer *buf, Bufbyte *base_pat, Bytecount len_byte, | |
1331 Bytind idx, Bytind lim, EMACS_INT n, Lisp_Object trt) | |
1332 { | |
1333 int forward = n > 0; | |
1334 Bytecount buf_len = 0; /* Shut up compiler. */ | |
1335 | |
1336 if (lim > idx) | |
1337 while (n > 0) | |
1338 { | |
1339 while (1) | |
1340 { | |
1341 Bytecount this_len = len_byte; | |
1342 Bytind this_idx = idx; | |
1343 Bufbyte *p = base_pat; | |
1344 if (idx >= lim) | |
1345 goto stop; | |
1346 | |
1347 while (this_len > 0) | |
1348 { | |
1349 Emchar pat_ch, buf_ch; | |
1350 Bytecount pat_len; | |
1351 | |
1352 pat_ch = charptr_emchar (p); | |
1353 buf_ch = BI_BUF_FETCH_CHAR (buf, this_idx); | |
1354 | |
1355 buf_ch = TRANSLATE (trt, buf_ch); | |
1356 | |
1357 if (buf_ch != pat_ch) | |
1358 break; | |
1359 | |
1360 pat_len = charcount_to_bytecount (p, 1); | |
1361 p += pat_len; | |
1362 this_len -= pat_len; | |
1363 INC_BYTIND (buf, this_idx); | |
1364 } | |
1365 if (this_len == 0) | |
1366 { | |
1367 buf_len = this_idx - idx; | |
1368 idx = this_idx; | |
1369 break; | |
1370 } | |
1371 INC_BYTIND (buf, idx); | |
1372 } | |
1373 n--; | |
1374 } | |
1375 else | |
1376 while (n < 0) | |
1377 { | |
1378 while (1) | |
1379 { | |
1380 Bytecount this_len = len_byte; | |
1381 Bytind this_idx = idx; | |
1382 Bufbyte *p; | |
1383 if (idx <= lim) | |
1384 goto stop; | |
1385 p = base_pat + len_byte; | |
1386 | |
1387 while (this_len > 0) | |
1388 { | |
1389 Emchar pat_ch, buf_ch; | |
1390 | |
1391 DEC_CHARPTR (p); | |
1392 DEC_BYTIND (buf, this_idx); | |
1393 pat_ch = charptr_emchar (p); | |
1394 buf_ch = BI_BUF_FETCH_CHAR (buf, this_idx); | |
1395 | |
1396 buf_ch = TRANSLATE (trt, buf_ch); | |
1397 | |
1398 if (buf_ch != pat_ch) | |
1399 break; | |
1400 | |
1401 this_len -= charcount_to_bytecount (p, 1); | |
1402 } | |
1403 if (this_len == 0) | |
1404 { | |
1405 buf_len = idx - this_idx; | |
1406 idx = this_idx; | |
1407 break; | |
1408 } | |
1409 DEC_BYTIND (buf, idx); | |
1410 } | |
1411 n++; | |
1412 } | |
1413 stop: | |
1414 if (n == 0) | |
1415 { | |
1416 Bufpos beg, end, retval; | |
1417 if (forward) | |
1418 { | |
1419 beg = bytind_to_bufpos (buf, idx - buf_len); | |
1420 retval = end = bytind_to_bufpos (buf, idx); | |
1421 } | |
1422 else | |
1423 { | |
1424 retval = beg = bytind_to_bufpos (buf, idx); | |
1425 end = bytind_to_bufpos (buf, idx + buf_len); | |
1426 } | |
1427 set_search_regs (buf, beg, end - beg); | |
1428 | |
1429 return retval; | |
1430 } | |
1431 else if (n > 0) | |
1432 return -n; | |
1433 else | |
1434 return n; | |
1435 } | |
1436 | |
1437 /* Do Boyer-Moore search N times for the string PAT, | |
1438 whose length is LEN/LEN_BYTE, | |
1439 from buffer position POS/POS_BYTE until LIM/LIM_BYTE. | |
1440 DIRECTION says which direction we search in. | |
1441 TRT and INVERSE_TRT are translation tables. | |
1442 | |
1443 This kind of search works if all the characters in PAT that have | |
1444 nontrivial translation are the same aside from the last byte. This | |
1445 makes it possible to translate just the last byte of a character, | |
1446 and do so after just a simple test of the context. | |
1447 | |
1448 If that criterion is not satisfied, do not call this function. */ | |
1449 | |
1450 static Bufpos | |
1451 boyer_moore (struct buffer *buf, Bufbyte *base_pat, Bytecount len, | |
1452 Bytind pos, Bytind lim, EMACS_INT n, Lisp_Object trt, | |
1453 Lisp_Object inverse_trt, int charset_base) | |
1454 { | |
1455 /* #### Someone really really really needs to comment the workings | |
1456 of this junk somewhat better. | |
1457 | |
1458 BTW "BM" stands for Boyer-Moore, which is one of the standard | |
1459 string-searching algorithms. It's the best string-searching | |
1460 algorithm out there, provided that: | |
1461 | |
1462 a) You're not fazed by algorithm complexity. (Rabin-Karp, which | |
1463 uses hashing, is much much easier to code but not as fast.) | |
1464 b) You can freely move backwards in the string that you're | |
1465 searching through. | |
1466 | |
1467 As the comment below tries to explain (but garbles in typical | |
1468 programmer-ese), the idea is that you don't have to do a | |
1469 string match at every successive position in the text. For | |
1470 example, let's say the pattern is "a very long string". We | |
1471 compare the last character in the string (`g') with the | |
1472 corresponding character in the text. If it mismatches, and | |
1473 it is, say, `z', then we can skip forward by the entire | |
1474 length of the pattern because `z' does not occur anywhere | |
1475 in the pattern. If the mismatching character does occur | |
1476 in the pattern, we can usually still skip forward by more | |
1477 than one: e.g. if it is `l', then we can skip forward | |
1478 by the length of the substring "ong string" -- i.e. the | |
1479 largest end section of the pattern that does not contain | |
1480 the mismatched character. So what we do is compute, for | |
1481 each possible character, the distance we can skip forward | |
1482 (the "stride") and use it in the string matching. This | |
1483 is what the BM_tab holds. */ | |
1484 REGISTER EMACS_INT *BM_tab; | |
1485 EMACS_INT *BM_tab_base; | |
1486 REGISTER Bytecount dirlen; | |
1487 EMACS_INT infinity; | |
1488 Bytind limit; | |
1489 Bytecount stride_for_teases = 0; | |
1490 REGISTER EMACS_INT i, j; | |
1491 Bufbyte *pat, *pat_end; | |
1492 REGISTER Bufbyte *cursor, *p_limit, *ptr2; | |
1493 Bufbyte simple_translate[0400]; | |
1494 REGISTER int direction = ((n > 0) ? 1 : -1); | |
1495 #ifdef MULE | |
1496 Bufbyte translate_prev_byte = 0; | |
1497 Bufbyte translate_anteprev_byte = 0; | |
1498 #endif | |
1275 #ifdef C_ALLOCA | 1499 #ifdef C_ALLOCA |
1276 EMACS_INT BM_tab_space[0400]; | 1500 EMACS_INT BM_tab_space[0400]; |
1277 BM_tab = &BM_tab_space[0]; | 1501 BM_tab = &BM_tab_space[0]; |
1278 #else | 1502 #else |
1279 BM_tab = alloca_array (EMACS_INT, 256); | 1503 BM_tab = alloca_array (EMACS_INT, 256); |
1280 #endif | 1504 #endif |
1281 { | 1505 |
1282 Bufbyte *patbuf = alloca_array (Bufbyte, len); | 1506 /* The general approach is that we are going to maintain that we |
1283 pat = patbuf; | 1507 know the first (closest to the present position, in whatever |
1284 while (--len >= 0) | 1508 direction we're searching) character that could possibly be |
1285 { | 1509 the last (furthest from present position) character of a |
1286 /* If we got here and the RE flag is set, it's because we're | 1510 valid match. We advance the state of our knowledge by |
1287 dealing with a regexp known to be trivial, so the backslash | 1511 looking at that character and seeing whether it indeed |
1288 just quotes the next character. */ | 1512 matches the last character of the pattern. If it does, we |
1289 if (RE && *base_pat == '\\') | 1513 take a closer look. If it does not, we move our pointer (to |
1290 { | 1514 putative last characters) as far as is logically possible. |
1291 len--; | 1515 This amount of movement, which I call a stride, will be the |
1292 base_pat++; | 1516 length of the pattern if the actual character appears nowhere |
1293 } | 1517 in the pattern, otherwise it will be the distance from the |
1294 *pat++ = (trt ? trt[*base_pat++] : *base_pat++); | 1518 last occurrence of that character to the end of the pattern. |
1295 } | 1519 As a coding trick, an enormous stride is coded into the table |
1296 len = pat - patbuf; | 1520 for characters that match the last character. This allows |
1297 pat = base_pat = patbuf; | 1521 use of only a single test, a test for having gone past the |
1298 } | 1522 end of the permissible match region, to test for both |
1299 /* The general approach is that we are going to maintain that we know */ | 1523 possible matches (when the stride goes past the end |
1300 /* the first (closest to the present position, in whatever direction */ | 1524 immediately) and failure to match (where you get nudged past |
1301 /* we're searching) character that could possibly be the last */ | 1525 the end one stride at a time). |
1302 /* (furthest from present position) character of a valid match. We */ | 1526 |
1303 /* advance the state of our knowledge by looking at that character */ | 1527 Here we make a "mickey mouse" BM table. The stride of the |
1304 /* and seeing whether it indeed matches the last character of the */ | 1528 search is determined only by the last character of the |
1305 /* pattern. If it does, we take a closer look. If it does not, we */ | 1529 putative match. If that character does not match, we will |
1306 /* move our pointer (to putative last characters) as far as is */ | 1530 stride the proper distance to propose a match that |
1307 /* logically possible. This amount of movement, which I call a */ | 1531 superimposes it on the last instance of a character that |
1308 /* stride, will be the length of the pattern if the actual character */ | 1532 matches it (per trt), or misses it entirely if there is |
1309 /* appears nowhere in the pattern, otherwise it will be the distance */ | 1533 none. */ |
1310 /* from the last occurrence of that character to the end of the */ | 1534 |
1311 /* pattern. */ | 1535 dirlen = len * direction; |
1312 /* As a coding trick, an enormous stride is coded into the table for */ | 1536 infinity = dirlen - (lim + pos + len + len) * direction; |
1313 /* characters that match the last character. This allows use of only */ | 1537 /* Record position after the end of the pattern. */ |
1314 /* a single test, a test for having gone past the end of the */ | 1538 pat_end = base_pat + len; |
1315 /* permissible match region, to test for both possible matches (when */ | 1539 if (direction < 0) |
1316 /* the stride goes past the end immediately) and failure to */ | 1540 base_pat = pat_end - 1; |
1317 /* match (where you get nudged past the end one stride at a time). */ | 1541 BM_tab_base = BM_tab; |
1318 | 1542 BM_tab += 0400; |
1319 /* Here we make a "mickey mouse" BM table. The stride of the search */ | 1543 j = dirlen; /* to get it in a register */ |
1320 /* is determined only by the last character of the putative match. */ | 1544 /* A character that does not appear in the pattern induces a |
1321 /* If that character does not match, we will stride the proper */ | 1545 stride equal to the pattern length. */ |
1322 /* distance to propose a match that superimposes it on the last */ | 1546 while (BM_tab_base != BM_tab) |
1323 /* instance of a character that matches it (per trt), or misses */ | 1547 { |
1324 /* it entirely if there is none. */ | 1548 *--BM_tab = j; |
1325 | 1549 *--BM_tab = j; |
1326 dirlen = len * direction; | 1550 *--BM_tab = j; |
1327 infinity = dirlen - (lim + pos + len + len) * direction; | 1551 *--BM_tab = j; |
1328 if (direction < 0) | 1552 } |
1329 pat = (base_pat += len - 1); | 1553 /* We use this for translation, instead of TRT itself. We |
1330 BM_tab_base = BM_tab; | 1554 fill this in to handle the characters that actually occur |
1331 BM_tab += 0400; | 1555 in the pattern. Others don't matter anyway! */ |
1332 j = dirlen; /* to get it in a register */ | 1556 xzero (simple_translate); |
1333 /* A character that does not appear in the pattern induces a */ | 1557 for (i = 0; i < 0400; i++) |
1334 /* stride equal to the pattern length. */ | 1558 simple_translate[i] = i; |
1335 while (BM_tab_base != BM_tab) | 1559 i = 0; |
1336 { | 1560 while (i != infinity) |
1337 *--BM_tab = j; | 1561 { |
1338 *--BM_tab = j; | 1562 Bufbyte *ptr = base_pat + i; |
1339 *--BM_tab = j; | 1563 i += direction; |
1340 *--BM_tab = j; | 1564 if (i == dirlen) |
1341 } | 1565 i = infinity; |
1342 i = 0; | 1566 if (!NILP (trt)) |
1343 while (i != infinity) | 1567 { |
1344 { | 1568 #ifdef MULE |
1345 j = pat[i]; i += direction; | 1569 Emchar ch, untranslated; |
1346 if (i == dirlen) i = infinity; | 1570 int this_translated = 1; |
1347 if (trt != 0) | 1571 |
1348 { | 1572 /* Is *PTR the last byte of a character? */ |
1349 k = (j = trt[j]); | 1573 if (pat_end - ptr == 1 || BUFBYTE_FIRST_BYTE_P (ptr[1])) |
1350 if (i == infinity) | 1574 { |
1351 stride_for_teases = BM_tab[j]; | 1575 Bufbyte *charstart = ptr; |
1576 while (!BUFBYTE_FIRST_BYTE_P (*charstart)) | |
1577 charstart--; | |
1578 untranslated = charptr_emchar (charstart); | |
1579 if (charset_base == (untranslated & ~CHAR_FIELD3_MASK)) | |
1580 { | |
1581 ch = TRANSLATE (trt, untranslated); | |
1582 if (!BUFBYTE_FIRST_BYTE_P (*ptr)) | |
1583 { | |
1584 translate_prev_byte = ptr[-1]; | |
1585 if (!BUFBYTE_FIRST_BYTE_P (translate_prev_byte)) | |
1586 translate_anteprev_byte = ptr[-2]; | |
1587 } | |
1588 } | |
1589 else | |
1590 { | |
1591 this_translated = 0; | |
1592 ch = *ptr; | |
1593 } | |
1594 } | |
1595 else | |
1596 { | |
1597 ch = *ptr; | |
1598 this_translated = 0; | |
1599 } | |
1600 if (ch > 0400) | |
1601 j = ((unsigned char) ch | 0200); | |
1602 else | |
1603 j = (unsigned char) ch; | |
1604 | |
1605 if (i == infinity) | |
1606 stride_for_teases = BM_tab[j]; | |
1607 BM_tab[j] = dirlen - i; | |
1608 /* A translation table is accompanied by its inverse -- | |
1609 see comment following downcase_table for details */ | |
1610 if (this_translated) | |
1611 { | |
1612 Emchar starting_ch = ch; | |
1613 EMACS_INT starting_j = j; | |
1614 while (1) | |
1615 { | |
1616 ch = TRANSLATE (inverse_trt, ch); | |
1617 if (ch > 0400) | |
1618 j = ((unsigned char) ch | 0200); | |
1619 else | |
1620 j = (unsigned char) ch; | |
1621 | |
1622 /* For all the characters that map into CH, | |
1623 set up simple_translate to map the last byte | |
1624 into STARTING_J. */ | |
1625 simple_translate[j] = starting_j; | |
1626 if (ch == starting_ch) | |
1627 break; | |
1628 BM_tab[j] = dirlen - i; | |
1629 } | |
1630 } | |
1631 #else | |
1632 EMACS_INT k; | |
1633 j = *ptr; | |
1634 k = (j = TRANSLATE (trt, j)); | |
1635 if (i == infinity) | |
1636 stride_for_teases = BM_tab[j]; | |
1637 BM_tab[j] = dirlen - i; | |
1638 /* A translation table is accompanied by its inverse -- | |
1639 see comment following downcase_table for details */ | |
1640 | |
1641 while ((j = TRANSLATE (inverse_trt, j)) != k) | |
1642 { | |
1643 simple_translate[j] = k; | |
1352 BM_tab[j] = dirlen - i; | 1644 BM_tab[j] = dirlen - i; |
1353 /* A translation table is accompanied by its inverse -- see */ | 1645 } |
1354 /* comment following downcase_table for details */ | 1646 #endif |
1355 | 1647 } |
1356 while ((j = inverse_trt[j]) != k) | 1648 else |
1357 BM_tab[j] = dirlen - i; | 1649 { |
1358 } | 1650 j = *ptr; |
1359 else | 1651 |
1360 { | 1652 if (i == infinity) |
1361 if (i == infinity) | 1653 stride_for_teases = BM_tab[j]; |
1362 stride_for_teases = BM_tab[j]; | 1654 BM_tab[j] = dirlen - i; |
1363 BM_tab[j] = dirlen - i; | 1655 } |
1364 } | 1656 /* stride_for_teases tells how much to stride if we get a |
1365 /* stride_for_teases tells how much to stride if we get a */ | 1657 match on the far character but are subsequently |
1366 /* match on the far character but are subsequently */ | 1658 disappointed, by recording what the stride would have been |
1367 /* disappointed, by recording what the stride would have been */ | 1659 for that character if the last character had been |
1368 /* for that character if the last character had been */ | 1660 different. */ |
1369 /* different. */ | 1661 } |
1370 } | 1662 infinity = dirlen - infinity; |
1371 infinity = dirlen - infinity; | 1663 pos += dirlen - ((direction > 0) ? direction : 0); |
1372 pos += dirlen - ((direction > 0) ? direction : 0); | 1664 /* loop invariant - pos points at where last char (first char if |
1373 /* loop invariant - pos points at where last char (first char if reverse) | 1665 reverse) of pattern would align in a possible match. */ |
1374 of pattern would align in a possible match. */ | 1666 while (n != 0) |
1375 while (n != 0) | 1667 { |
1376 { | 1668 Bytind tail_end; |
1377 /* It's been reported that some (broken) compiler thinks that | 1669 Bufbyte *tail_end_ptr; |
1378 Boolean expressions in an arithmetic context are unsigned. | 1670 /* It's been reported that some (broken) compiler thinks |
1379 Using an explicit ?1:0 prevents this. */ | 1671 that Boolean expressions in an arithmetic context are |
1380 if ((lim - pos - ((direction > 0) ? 1 : 0)) * direction < 0) | 1672 unsigned. Using an explicit ?1:0 prevents this. */ |
1381 return n * (0 - direction); | 1673 if ((lim - pos - ((direction > 0) ? 1 : 0)) * direction < 0) |
1382 /* First we do the part we can by pointers (maybe nothing) */ | 1674 return n * (0 - direction); |
1383 QUIT; | 1675 /* First we do the part we can by pointers (maybe |
1384 pat = base_pat; | 1676 nothing) */ |
1385 limit = pos - dirlen + direction; | 1677 QUIT; |
1678 pat = base_pat; | |
1679 limit = pos - dirlen + direction; | |
1680 /* XEmacs change: definitions of CEILING_OF and FLOOR_OF | |
1681 have changed. See buffer.h. */ | |
1682 limit = ((direction > 0) | |
1683 ? BI_BUF_CEILING_OF (buf, limit) - 1 | |
1684 : BI_BUF_FLOOR_OF (buf, limit + 1)); | |
1685 /* LIMIT is now the last (not beyond-last!) value POS can | |
1686 take on without hitting edge of buffer or the gap. */ | |
1687 limit = ((direction > 0) | |
1688 ? min (lim - 1, min (limit, pos + 20000)) | |
1689 : max (lim, max (limit, pos - 20000))); | |
1690 tail_end = BI_BUF_CEILING_OF (buf, pos); | |
1691 tail_end_ptr = BI_BUF_BYTE_ADDRESS (buf, tail_end); | |
1692 | |
1693 if ((limit - pos) * direction > 20) | |
1694 { | |
1695 p_limit = BI_BUF_BYTE_ADDRESS (buf, limit); | |
1696 ptr2 = (cursor = BI_BUF_BYTE_ADDRESS (buf, pos)); | |
1697 /* In this loop, pos + cursor - ptr2 is the surrogate | |
1698 for pos */ | |
1699 while (1) /* use one cursor setting as long as i can */ | |
1700 { | |
1701 if (direction > 0) /* worth duplicating */ | |
1702 { | |
1703 /* Use signed comparison if appropriate to make | |
1704 cursor+infinity sure to be > p_limit. | |
1705 Assuming that the buffer lies in a range of | |
1706 addresses that are all "positive" (as ints) | |
1707 or all "negative", either kind of comparison | |
1708 will work as long as we don't step by | |
1709 infinity. So pick the kind that works when | |
1710 we do step by infinity. */ | |
1711 if ((EMACS_INT) (p_limit + infinity) > | |
1712 (EMACS_INT) p_limit) | |
1713 while ((EMACS_INT) cursor <= | |
1714 (EMACS_INT) p_limit) | |
1715 cursor += BM_tab[*cursor]; | |
1716 else | |
1717 while ((EMACS_UINT) cursor <= | |
1718 (EMACS_UINT) p_limit) | |
1719 cursor += BM_tab[*cursor]; | |
1720 } | |
1721 else | |
1722 { | |
1723 if ((EMACS_INT) (p_limit + infinity) < | |
1724 (EMACS_INT) p_limit) | |
1725 while ((EMACS_INT) cursor >= | |
1726 (EMACS_INT) p_limit) | |
1727 cursor += BM_tab[*cursor]; | |
1728 else | |
1729 while ((EMACS_UINT) cursor >= | |
1730 (EMACS_UINT) p_limit) | |
1731 cursor += BM_tab[*cursor]; | |
1732 } | |
1733 /* If you are here, cursor is beyond the end of the | |
1734 searched region. This can happen if you match on | |
1735 the far character of the pattern, because the | |
1736 "stride" of that character is infinity, a number | |
1737 able to throw you well beyond the end of the | |
1738 search. It can also happen if you fail to match | |
1739 within the permitted region and would otherwise | |
1740 try a character beyond that region */ | |
1741 if ((cursor - p_limit) * direction <= len) | |
1742 break; /* a small overrun is genuine */ | |
1743 cursor -= infinity; /* large overrun = hit */ | |
1744 i = dirlen - direction; | |
1745 if (!NILP (trt)) | |
1746 { | |
1747 while ((i -= direction) + direction != 0) | |
1748 { | |
1749 #ifdef MULE | |
1750 Emchar ch; | |
1751 cursor -= direction; | |
1752 /* Translate only the last byte of a character. */ | |
1753 if ((cursor == tail_end_ptr | |
1754 || BUFBYTE_FIRST_BYTE_P (cursor[1])) | |
1755 && (BUFBYTE_FIRST_BYTE_P (cursor[0]) | |
1756 || (translate_prev_byte == cursor[-1] | |
1757 && (BUFBYTE_FIRST_BYTE_P (translate_prev_byte) | |
1758 || translate_anteprev_byte == cursor[-2])))) | |
1759 ch = simple_translate[*cursor]; | |
1760 else | |
1761 ch = *cursor; | |
1762 if (pat[i] != ch) | |
1763 break; | |
1764 #else | |
1765 if (pat[i] != TRANSLATE (trt, *(cursor -= direction))) | |
1766 break; | |
1767 #endif | |
1768 } | |
1769 } | |
1770 else | |
1771 { | |
1772 while ((i -= direction) + direction != 0) | |
1773 if (pat[i] != *(cursor -= direction)) | |
1774 break; | |
1775 } | |
1776 cursor += dirlen - i - direction; /* fix cursor */ | |
1777 if (i + direction == 0) | |
1778 { | |
1779 cursor -= direction; | |
1780 | |
1781 { | |
1782 Bytind bytstart = (pos + cursor - ptr2 + | |
1783 ((direction > 0) | |
1784 ? 1 - len : 0)); | |
1785 Bufpos bufstart = bytind_to_bufpos (buf, bytstart); | |
1786 Bufpos bufend = bytind_to_bufpos (buf, bytstart + len); | |
1787 | |
1788 set_search_regs (buf, bufstart, bufend - bufstart); | |
1789 } | |
1790 | |
1791 if ((n -= direction) != 0) | |
1792 cursor += dirlen; /* to resume search */ | |
1793 else | |
1794 return ((direction > 0) | |
1795 ? search_regs.end[0] : search_regs.start[0]); | |
1796 } | |
1797 else | |
1798 cursor += stride_for_teases; /* <sigh> we lose - */ | |
1799 } | |
1800 pos += cursor - ptr2; | |
1801 } | |
1802 else | |
1803 /* Now we'll pick up a clump that has to be done the hard | |
1804 way because it covers a discontinuity */ | |
1805 { | |
1386 /* XEmacs change: definitions of CEILING_OF and FLOOR_OF | 1806 /* XEmacs change: definitions of CEILING_OF and FLOOR_OF |
1387 have changed. See buffer.h. */ | 1807 have changed. See buffer.h. */ |
1388 limit = ((direction > 0) | 1808 limit = ((direction > 0) |
1389 ? BI_BUF_CEILING_OF (buf, limit) - 1 | 1809 ? BI_BUF_CEILING_OF (buf, pos - dirlen + 1) - 1 |
1390 : BI_BUF_FLOOR_OF (buf, limit + 1)); | 1810 : BI_BUF_FLOOR_OF (buf, pos - dirlen)); |
1391 /* LIMIT is now the last (not beyond-last!) value | |
1392 POS can take on without hitting edge of buffer or the gap. */ | |
1393 limit = ((direction > 0) | 1811 limit = ((direction > 0) |
1394 ? min (lim - 1, min (limit, pos + 20000)) | 1812 ? min (limit + len, lim - 1) |
1395 : max (lim, max (limit, pos - 20000))); | 1813 : max (limit - len, lim)); |
1396 if ((limit - pos) * direction > 20) | 1814 /* LIMIT is now the last value POS can have |
1397 { | 1815 and still be valid for a possible match. */ |
1398 p_limit = BI_BUF_BYTE_ADDRESS (buf, limit); | 1816 while (1) |
1399 ptr2 = (cursor = BI_BUF_BYTE_ADDRESS (buf, pos)); | 1817 { |
1400 /* In this loop, pos + cursor - ptr2 is the surrogate for pos */ | 1818 /* This loop can be coded for space rather than |
1401 while (1) /* use one cursor setting as long as i can */ | 1819 speed because it will usually run only once. |
1820 (the reach is at most len + 21, and typically | |
1821 does not exceed len) */ | |
1822 while ((limit - pos) * direction >= 0) | |
1823 /* *not* BI_BUF_FETCH_CHAR. We are working here | |
1824 with bytes, not characters. */ | |
1825 pos += BM_tab[*BI_BUF_BYTE_ADDRESS (buf, pos)]; | |
1826 /* now run the same tests to distinguish going off | |
1827 the end, a match or a phony match. */ | |
1828 if ((pos - limit) * direction <= len) | |
1829 break; /* ran off the end */ | |
1830 /* Found what might be a match. | |
1831 Set POS back to last (first if reverse) char pos. */ | |
1832 pos -= infinity; | |
1833 i = dirlen - direction; | |
1834 while ((i -= direction) + direction != 0) | |
1402 { | 1835 { |
1403 if (direction > 0) /* worth duplicating */ | 1836 #ifdef MULE |
1404 { | 1837 Emchar ch; |
1405 /* Use signed comparison if appropriate | 1838 Bufbyte *ptr; |
1406 to make cursor+infinity sure to be > p_limit. | 1839 #endif |
1407 Assuming that the buffer lies in a range of addresses | 1840 pos -= direction; |
1408 that are all "positive" (as ints) or all "negative", | 1841 #ifdef MULE |
1409 either kind of comparison will work as long | 1842 ptr = BI_BUF_BYTE_ADDRESS (buf, pos); |
1410 as we don't step by infinity. So pick the kind | 1843 if ((ptr == tail_end_ptr |
1411 that works when we do step by infinity. */ | 1844 || BUFBYTE_FIRST_BYTE_P (ptr[1])) |
1412 if ((EMACS_INT) (p_limit + infinity) > | 1845 && (BUFBYTE_FIRST_BYTE_P (ptr[0]) |
1413 (EMACS_INT) p_limit) | 1846 || (translate_prev_byte == ptr[-1] |
1414 while ((EMACS_INT) cursor <= | 1847 && (BUFBYTE_FIRST_BYTE_P (translate_prev_byte) |
1415 (EMACS_INT) p_limit) | 1848 || translate_anteprev_byte == ptr[-2])))) |
1416 cursor += BM_tab[*cursor]; | 1849 ch = simple_translate[*ptr]; |
1417 else | |
1418 while ((EMACS_UINT) cursor <= | |
1419 (EMACS_UINT) p_limit) | |
1420 cursor += BM_tab[*cursor]; | |
1421 } | |
1422 else | 1850 else |
1423 { | 1851 ch = *ptr; |
1424 if ((EMACS_INT) (p_limit + infinity) < | 1852 if (pat[i] != ch) |
1425 (EMACS_INT) p_limit) | 1853 break; |
1426 while ((EMACS_INT) cursor >= | 1854 |
1427 (EMACS_INT) p_limit) | 1855 #else |
1428 cursor += BM_tab[*cursor]; | 1856 if (pat[i] != TRANSLATE (trt, |
1429 else | 1857 *BI_BUF_BYTE_ADDRESS (buf, pos))) |
1430 while ((EMACS_UINT) cursor >= | 1858 break; |
1431 (EMACS_UINT) p_limit) | 1859 #endif |
1432 cursor += BM_tab[*cursor]; | 1860 } |
1433 } | 1861 /* Above loop has moved POS part or all the way back |
1434 /* If you are here, cursor is beyond the end of the searched region. */ | 1862 to the first char pos (last char pos if reverse). |
1435 /* This can happen if you match on the far character of the pattern, */ | 1863 Set it once again at the last (first if reverse) |
1436 /* because the "stride" of that character is infinity, a number able */ | 1864 char. */ |
1437 /* to throw you well beyond the end of the search. It can also */ | 1865 pos += dirlen - i- direction; |
1438 /* happen if you fail to match within the permitted region and would */ | 1866 if (i + direction == 0) |
1439 /* otherwise try a character beyond that region */ | 1867 { |
1440 if ((cursor - p_limit) * direction <= len) | 1868 pos -= direction; |
1441 break; /* a small overrun is genuine */ | 1869 |
1442 cursor -= infinity; /* large overrun = hit */ | 1870 { |
1443 i = dirlen - direction; | 1871 Bytind bytstart = (pos + |
1444 if (trt != 0) | 1872 ((direction > 0) |
1445 { | 1873 ? 1 - len : 0)); |
1446 while ((i -= direction) + direction != 0) | 1874 Bufpos bufstart = bytind_to_bufpos (buf, bytstart); |
1447 if (pat[i] != trt[*(cursor -= direction)]) | 1875 Bufpos bufend = bytind_to_bufpos (buf, bytstart + len); |
1448 break; | 1876 |
1449 } | 1877 set_search_regs (buf, bufstart, bufend - bufstart); |
1878 } | |
1879 | |
1880 if ((n -= direction) != 0) | |
1881 pos += dirlen; /* to resume search */ | |
1450 else | 1882 else |
1451 { | 1883 return ((direction > 0) |
1452 while ((i -= direction) + direction != 0) | 1884 ? search_regs.end[0] : search_regs.start[0]); |
1453 if (pat[i] != *(cursor -= direction)) | |
1454 break; | |
1455 } | |
1456 cursor += dirlen - i - direction; /* fix cursor */ | |
1457 if (i + direction == 0) | |
1458 { | |
1459 cursor -= direction; | |
1460 | |
1461 { | |
1462 Bytind bytstart = (pos + cursor - ptr2 + | |
1463 ((direction > 0) | |
1464 ? 1 - len : 0)); | |
1465 Bufpos bufstart = bytind_to_bufpos (buf, bytstart); | |
1466 Bufpos bufend = bytind_to_bufpos (buf, bytstart + len); | |
1467 | |
1468 set_search_regs (buf, bufstart, bufend - bufstart); | |
1469 } | |
1470 | |
1471 if ((n -= direction) != 0) | |
1472 cursor += dirlen; /* to resume search */ | |
1473 else | |
1474 return ((direction > 0) | |
1475 ? search_regs.end[0] : search_regs.start[0]); | |
1476 } | |
1477 else | |
1478 cursor += stride_for_teases; /* <sigh> we lose - */ | |
1479 } | 1885 } |
1480 pos += cursor - ptr2; | 1886 else |
1481 } | 1887 pos += stride_for_teases; |
1482 else | 1888 } |
1483 /* Now we'll pick up a clump that has to be done the hard */ | 1889 } |
1484 /* way because it covers a discontinuity */ | 1890 /* We have done one clump. Can we continue? */ |
1485 { | 1891 if ((lim - pos) * direction < 0) |
1486 /* XEmacs change: definitions of CEILING_OF and FLOOR_OF | 1892 return (0 - n) * direction; |
1487 have changed. See buffer.h. */ | 1893 } |
1488 limit = ((direction > 0) | 1894 return bytind_to_bufpos (buf, pos); |
1489 ? BI_BUF_CEILING_OF (buf, pos - dirlen + 1) - 1 | |
1490 : BI_BUF_FLOOR_OF (buf, pos - dirlen)); | |
1491 limit = ((direction > 0) | |
1492 ? min (limit + len, lim - 1) | |
1493 : max (limit - len, lim)); | |
1494 /* LIMIT is now the last value POS can have | |
1495 and still be valid for a possible match. */ | |
1496 while (1) | |
1497 { | |
1498 /* This loop can be coded for space rather than */ | |
1499 /* speed because it will usually run only once. */ | |
1500 /* (the reach is at most len + 21, and typically */ | |
1501 /* does not exceed len) */ | |
1502 while ((limit - pos) * direction >= 0) | |
1503 /* *not* BI_BUF_FETCH_CHAR. We are working here | |
1504 with bytes, not characters. */ | |
1505 pos += BM_tab[*BI_BUF_BYTE_ADDRESS (buf, pos)]; | |
1506 /* now run the same tests to distinguish going off the */ | |
1507 /* end, a match or a phony match. */ | |
1508 if ((pos - limit) * direction <= len) | |
1509 break; /* ran off the end */ | |
1510 /* Found what might be a match. | |
1511 Set POS back to last (first if reverse) char pos. */ | |
1512 pos -= infinity; | |
1513 i = dirlen - direction; | |
1514 while ((i -= direction) + direction != 0) | |
1515 { | |
1516 pos -= direction; | |
1517 if (pat[i] != (((Bufbyte *) trt) | |
1518 /* #### Does not handle TRT right */ | |
1519 ? trt[*BI_BUF_BYTE_ADDRESS (buf, pos)] | |
1520 : *BI_BUF_BYTE_ADDRESS (buf, pos))) | |
1521 break; | |
1522 } | |
1523 /* Above loop has moved POS part or all the way | |
1524 back to the first char pos (last char pos if reverse). | |
1525 Set it once again at the last (first if reverse) char. */ | |
1526 pos += dirlen - i- direction; | |
1527 if (i + direction == 0) | |
1528 { | |
1529 pos -= direction; | |
1530 | |
1531 { | |
1532 Bytind bytstart = (pos + | |
1533 ((direction > 0) | |
1534 ? 1 - len : 0)); | |
1535 Bufpos bufstart = bytind_to_bufpos (buf, bytstart); | |
1536 Bufpos bufend = bytind_to_bufpos (buf, bytstart + len); | |
1537 | |
1538 set_search_regs (buf, bufstart, bufend - bufstart); | |
1539 } | |
1540 | |
1541 if ((n -= direction) != 0) | |
1542 pos += dirlen; /* to resume search */ | |
1543 else | |
1544 return ((direction > 0) | |
1545 ? search_regs.end[0] : search_regs.start[0]); | |
1546 } | |
1547 else | |
1548 pos += stride_for_teases; | |
1549 } | |
1550 } | |
1551 /* We have done one clump. Can we continue? */ | |
1552 if ((lim - pos) * direction < 0) | |
1553 return (0 - n) * direction; | |
1554 } | |
1555 return bytind_to_bufpos (buf, pos); | |
1556 } | |
1557 } | 1895 } |
1558 | 1896 |
1559 /* Record beginning BEG and end BEG + LEN | 1897 /* Record beginning BEG and end BEG + LEN |
1560 for a match just found in the current buffer. */ | 1898 for a match just found in the current buffer. */ |
1561 | 1899 |