Mercurial > hg > xemacs-beta
comparison src/search.c @ 4552:9c1cfceab252
Automated merge with file:/Sources/xemacs-21.5-checked-out
author | Aidan Kehoe <kehoea@parhasard.net> |
---|---|
date | Thu, 13 Mar 2008 10:24:34 +0100 |
parents | 69b803c646cd |
children | 91a023144e72 19a72041c5ed |
comparison
equal
deleted
inserted
replaced
4551:6812571bfcb9 | 4552:9c1cfceab252 |
---|---|
44 | 44 |
45 #define TRANSLATE(table, pos) \ | 45 #define TRANSLATE(table, pos) \ |
46 (!NILP (table) ? TRT_TABLE_OF (table, (Ichar) pos) : pos) | 46 (!NILP (table) ? TRT_TABLE_OF (table, (Ichar) pos) : pos) |
47 | 47 |
48 #define REGEXP_CACHE_SIZE 20 | 48 #define REGEXP_CACHE_SIZE 20 |
49 | |
50 #ifdef DEBUG_XEMACS | |
51 | |
52 /* Used in tests/automated/case-tests.el if available. */ | |
53 Fixnum debug_xemacs_searches; | |
54 | |
55 Lisp_Object Qsearch_algorithm_used, Qboyer_moore, Qsimple_search; | |
56 | |
57 #endif | |
49 | 58 |
50 /* If the regexp is non-nil, then the buffer contains the compiled form | 59 /* If the regexp is non-nil, then the buffer contains the compiled form |
51 of that regexp, suitable for searching. */ | 60 of that regexp, suitable for searching. */ |
52 struct regexp_cache | 61 struct regexp_cache |
53 { | 62 { |
1368 | 1377 |
1369 orig_bytelen = itext_ichar_len (base_pat); | 1378 orig_bytelen = itext_ichar_len (base_pat); |
1370 inv_bytelen = set_itext_ichar (tmp_str, inverse); | 1379 inv_bytelen = set_itext_ichar (tmp_str, inverse); |
1371 new_bytelen = set_itext_ichar (tmp_str, translated); | 1380 new_bytelen = set_itext_ichar (tmp_str, translated); |
1372 | 1381 |
1373 if (-1 == charset_base) | 1382 if (boyer_moore_ok |
1374 { | 1383 /* Only do the Boyer-Moore check for characters needing |
1375 /* Keep track of which charset and character set row | 1384 translation. */ |
1376 contains the characters that need translation. | 1385 && (translated != c || inverse != c)) |
1377 | |
1378 Zero out the bits corresponding to the last byte. */ | |
1379 charset_base = c & ~ICHAR_FIELD3_MASK; | |
1380 } | |
1381 | |
1382 if (boyer_moore_ok && (translated != c || inverse != c)) | |
1383 { | 1386 { |
1384 Ichar starting_c = c; | 1387 Ichar starting_c = c; |
1385 int charset_base_code; | 1388 int charset_base_code, checked = 0; |
1386 | 1389 |
1387 do | 1390 do |
1388 { | 1391 { |
1389 c = TRANSLATE (inverse_trt, c); | 1392 c = TRANSLATE (inverse_trt, c); |
1390 | 1393 |
1394 continue; | 1397 continue; |
1395 | 1398 |
1396 if (c > 0xFF && nothing_greater_than_0xff) | 1399 if (c > 0xFF && nothing_greater_than_0xff) |
1397 continue; | 1400 continue; |
1398 | 1401 |
1399 charset_base_code = c & ~ICHAR_FIELD3_MASK; | 1402 checked = 1; |
1400 | 1403 |
1401 if (charset_base_code != charset_base) | 1404 if (-1 == charset_base) /* No charset yet specified. */ |
1402 { | 1405 { |
1403 /* If two different rows, or two different charsets, | 1406 /* Keep track of which charset and character set row |
1404 appear, needing translation, then we cannot use | 1407 contains the characters that need translation. |
1405 boyer_moore search. See the comment at the head of | 1408 |
1406 boyer_moore(). */ | 1409 Zero out the bits corresponding to the last |
1407 boyer_moore_ok = 0; | 1410 byte. */ |
1408 break; | 1411 charset_base = c & ~ICHAR_FIELD3_MASK; |
1412 } | |
1413 else | |
1414 { | |
1415 charset_base_code = c & ~ICHAR_FIELD3_MASK; | |
1416 | |
1417 if (charset_base_code != charset_base) | |
1418 { | |
1419 /* If two different rows, or two different | |
1420 charsets, appear, needing non-ASCII | |
1421 translation, then we cannot use boyer_moore | |
1422 search. See the comment at the head of | |
1423 boyer_moore(). */ | |
1424 boyer_moore_ok = 0; | |
1425 break; | |
1426 } | |
1409 } | 1427 } |
1410 } while (c != starting_c); | 1428 } while (c != starting_c); |
1411 | 1429 |
1412 if (boyer_moore_ok && (charset_base != | 1430 if (!checked) |
1413 (translated & ~ICHAR_FIELD3_MASK))) | 1431 { |
1432 #ifdef DEBUG_XEMACS | |
1433 if (debug_xemacs_searches) | |
1434 { | |
1435 Lisp_Symbol *sym = XSYMBOL (Qsearch_algorithm_used); | |
1436 sym->value = Qnil; | |
1437 } | |
1438 #endif | |
1439 /* The "continue" clauses were used above, for every | |
1440 translation of the character. As such, this character | |
1441 is not to be found in the buffer and neither is the | |
1442 string as a whole. Return immediately; also avoid | |
1443 triggering the assertion a few lines down. */ | |
1444 return n > 0 ? -n : n; | |
1445 } | |
1446 | |
1447 if (boyer_moore_ok && charset_base != -1 && | |
1448 charset_base != (translated & ~ICHAR_FIELD3_MASK)) | |
1414 { | 1449 { |
1415 /* In the rare event that the CANON entry for this | 1450 /* In the rare event that the CANON entry for this |
1416 character is not in the desired set, choose one | 1451 character is not in the desired set, choose one |
1417 that is, from the equivalence set. It doesn't much | 1452 that is, from the equivalence set. It doesn't much |
1418 matter which. */ | 1453 matter which. */ |
1435 memcpy (pat, tmp_str, new_bytelen); | 1470 memcpy (pat, tmp_str, new_bytelen); |
1436 pat += new_bytelen; | 1471 pat += new_bytelen; |
1437 base_pat += orig_bytelen; | 1472 base_pat += orig_bytelen; |
1438 len -= orig_bytelen; | 1473 len -= orig_bytelen; |
1439 } | 1474 } |
1475 | |
1476 if (-1 == charset_base) | |
1477 { | |
1478 charset_base = 'a' & ~ICHAR_FIELD3_MASK; /* Default to ASCII. */ | |
1479 } | |
1480 | |
1440 #else /* not MULE */ | 1481 #else /* not MULE */ |
1441 while (--len >= 0) | 1482 while (--len >= 0) |
1442 { | 1483 { |
1443 /* If we got here and the RE flag is set, it's because | 1484 /* If we got here and the RE flag is set, it's because |
1444 we're dealing with a regexp known to be trivial, so the | 1485 we're dealing with a regexp known to be trivial, so the |
1451 *pat++ = TRANSLATE (trt, *base_pat++); | 1492 *pat++ = TRANSLATE (trt, *base_pat++); |
1452 } | 1493 } |
1453 #endif /* MULE */ | 1494 #endif /* MULE */ |
1454 len = pat - patbuf; | 1495 len = pat - patbuf; |
1455 pat = base_pat = patbuf; | 1496 pat = base_pat = patbuf; |
1497 | |
1498 #ifdef DEBUG_XEMACS | |
1499 if (debug_xemacs_searches) | |
1500 { | |
1501 Lisp_Symbol *sym = XSYMBOL (Qsearch_algorithm_used); | |
1502 sym->value = boyer_moore_ok ? Qboyer_moore : Qsimple_search; | |
1503 } | |
1504 #endif | |
1505 | |
1456 if (boyer_moore_ok) | 1506 if (boyer_moore_ok) |
1457 return boyer_moore (buf, base_pat, len, pos, lim, n, | 1507 return boyer_moore (buf, base_pat, len, pos, lim, n, |
1458 trt, inverse_trt, charset_base); | 1508 trt, inverse_trt, charset_base); |
1459 else | 1509 else |
1460 return simple_search (buf, base_pat, len, pos, lim, n, trt); | 1510 return simple_search (buf, base_pat, len, pos, lim, n, trt); |
1593 from buffer position POS/POS_BYTE until LIM/LIM_BYTE. | 1643 from buffer position POS/POS_BYTE until LIM/LIM_BYTE. |
1594 DIRECTION says which direction we search in. | 1644 DIRECTION says which direction we search in. |
1595 TRT and INVERSE_TRT are translation tables. | 1645 TRT and INVERSE_TRT are translation tables. |
1596 | 1646 |
1597 This kind of search works if all the characters in PAT that have | 1647 This kind of search works if all the characters in PAT that have |
1598 nontrivial translation are the same aside from the last byte. This | 1648 (non-ASCII) translation are the same aside from the last byte. This |
1599 makes it possible to translate just the last byte of a character, | 1649 makes it possible to translate just the last byte of a character, and do |
1600 and do so after just a simple test of the context. | 1650 so after just a simple test of the context. |
1601 | 1651 |
1602 If that criterion is not satisfied, do not call this function. You will | 1652 If that criterion is not satisfied, do not call this function. You will |
1603 get an assertion failure. */ | 1653 get an assertion failure. */ |
1604 | 1654 |
1605 static Charbpos | 1655 static Charbpos |
1738 Ibyte *charstart = ptr; | 1788 Ibyte *charstart = ptr; |
1739 while (!ibyte_first_byte_p (*charstart)) | 1789 while (!ibyte_first_byte_p (*charstart)) |
1740 charstart--; | 1790 charstart--; |
1741 untranslated = itext_ichar (charstart); | 1791 untranslated = itext_ichar (charstart); |
1742 | 1792 |
1743 /* We shouldn't have been passed a string with varying | |
1744 character sets or rows. That's what simple_search is | |
1745 for. */ | |
1746 assert (charset_base == (untranslated & ~ICHAR_FIELD3_MASK)); | |
1747 | |
1748 ch = TRANSLATE (trt, untranslated); | 1793 ch = TRANSLATE (trt, untranslated); |
1749 if (!ibyte_first_byte_p (*ptr)) | 1794 if (!ibyte_first_byte_p (*ptr)) |
1750 { | 1795 { |
1751 translate_prev_byte = ptr[-1]; | 1796 translate_prev_byte = ptr[-1]; |
1752 if (!ibyte_first_byte_p (translate_prev_byte)) | 1797 if (!ibyte_first_byte_p (translate_prev_byte)) |
1753 translate_anteprev_byte = ptr[-2]; | 1798 translate_anteprev_byte = ptr[-2]; |
1754 } | 1799 } |
1755 | 1800 |
1756 if (charset_base != (ch & ~ICHAR_FIELD3_MASK)) | 1801 if (ch != untranslated && /* Was translation done? */ |
1802 charset_base != (ch & ~ICHAR_FIELD3_MASK)) | |
1757 { | 1803 { |
1758 /* In the very rare event that the CANON entry for this | 1804 /* In the very rare event that the CANON entry for this |
1759 character is not in the desired set, choose one that | 1805 character is not in the desired set, choose one that |
1760 is, from the equivalence set. It doesn't much matter | 1806 is, from the equivalence set. It doesn't much matter |
1761 which, since we're building our own cheesy equivalence | 1807 which, since we're building our own cheesy equivalence |
1763 table directly. | 1809 table directly. |
1764 | 1810 |
1765 We can get here if search_buffer has worked out that | 1811 We can get here if search_buffer has worked out that |
1766 the buffer is entirely single width. */ | 1812 the buffer is entirely single width. */ |
1767 Ichar starting_ch = ch; | 1813 Ichar starting_ch = ch; |
1814 int count = 0; | |
1768 do | 1815 do |
1769 { | 1816 { |
1770 ch = TRANSLATE (inverse_trt, ch); | 1817 ch = TRANSLATE (inverse_trt, ch); |
1771 if (charset_base == (ch & ~ICHAR_FIELD3_MASK)) | 1818 if (charset_base == (ch & ~ICHAR_FIELD3_MASK)) |
1772 break; | 1819 break; |
1773 | 1820 ++count; |
1774 } while (starting_ch != ch); | 1821 } while (starting_ch != ch); |
1775 | 1822 |
1776 /* If starting_ch is equal to ch, the case table is | 1823 /* If starting_ch is equal to ch (and count is not one, |
1777 corrupt. (Any mapping in the canon table should be | 1824 which means no translation is necessary), the case |
1778 reflected in the equivalence table, and we know from | 1825 table is corrupt. (Any mapping in the canon table |
1779 the canon table that untranslated maps to starting_ch | 1826 should be reflected in the equivalence table, and we |
1780 and that untranslated has the correct value for | 1827 know from the canon table that untranslated maps to |
1781 charset_base.) */ | 1828 starting_ch and that untranslated has the correct value |
1782 assert (starting_ch != ch); | 1829 for charset_base.) */ |
1830 assert (1 == count || starting_ch != ch); | |
1783 } | 1831 } |
1784 } | 1832 } |
1785 else | 1833 else |
1786 { | 1834 { |
1787 ch = *ptr; | 1835 ch = *ptr; |
3318 */ ); | 3366 */ ); |
3319 warn_about_possibly_incompatible_back_references = 1; | 3367 warn_about_possibly_incompatible_back_references = 1; |
3320 | 3368 |
3321 Vskip_chars_range_table = Fmake_range_table (Qstart_closed_end_closed); | 3369 Vskip_chars_range_table = Fmake_range_table (Qstart_closed_end_closed); |
3322 staticpro (&Vskip_chars_range_table); | 3370 staticpro (&Vskip_chars_range_table); |
3323 } | 3371 #ifdef DEBUG_XEMACS |
3372 DEFSYMBOL (Qsearch_algorithm_used); | |
3373 DEFSYMBOL (Qboyer_moore); | |
3374 DEFSYMBOL (Qsimple_search); | |
3375 | |
3376 DEFVAR_INT ("debug-xemacs-searches", &debug_xemacs_searches /* | |
3377 If non-zero, bind `search-algorithm-used' to `boyer-moore' or `simple-search', | |
3378 depending on the algorithm used for each search. Used for testing. | |
3379 */ ); | |
3380 debug_xemacs_searches = 0; | |
3381 #endif | |
3382 } |