Mercurial > hg > xemacs-beta
comparison src/search.c @ 4407:4ee73bbe4f8e
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.
2007-12-26 Aidan Kehoe <kehoea@parhasard.net>
* casetab.c:
Extend and correct some case table documentation.
* search.c (search_buffer):
Correct a bug where only the first entry for a character in the
case equivalence table was examined in determining if the
Boyer-Moore search algorithm is appropriate.
If there are case mappings outside of the charset and row of the
characters specified in the search string, those case mappings can
be safely ignored (and Boyer-Moore search can be used) if we know
from the buffer statistics that the corresponding characters cannot
occur.
* search.c (boyer_moore):
Assert that we haven't been passed a string with varying
characters sets or rows within character sets. That's what
simple_search is for.
In the very rare event that a character in the search string has a
canonical case mapping that is not in the same character set and
row, don't try to search for the canonical character, search for
some other character that is in the the desired character set and
row. Assert that the case table isn't corrupt.
Do not search for any character case mappings that cannot possibly
occur in the buffer, given the buffer metadata about its
contents.
author | Aidan Kehoe <kehoea@parhasard.net> |
---|---|
date | Wed, 26 Dec 2007 17:30:16 +0100 |
parents | f70e56bb52a7 |
children | df576f30c1d8 |
comparison
equal
deleted
inserted
replaced
4356:cc293ef846d2 | 4407:4ee73bbe4f8e |
---|---|
1338 } | 1338 } |
1339 else /* non-RE case */ | 1339 else /* non-RE case */ |
1340 { | 1340 { |
1341 int charset_base = -1; | 1341 int charset_base = -1; |
1342 int boyer_moore_ok = 1; | 1342 int boyer_moore_ok = 1; |
1343 Ibyte *pat = 0; | |
1344 Ibyte *patbuf = alloca_ibytes (len * MAX_ICHAR_LEN); | 1343 Ibyte *patbuf = alloca_ibytes (len * MAX_ICHAR_LEN); |
1345 pat = patbuf; | 1344 Ibyte *pat = patbuf; |
1345 | |
1346 #ifdef MULE | 1346 #ifdef MULE |
1347 /* &&#### needs some 8-bit work here */ | 1347 int entirely_one_byte_p = buf->text->entirely_one_byte_p; |
1348 int nothing_greater_than_0xff = | |
1349 buf->text->num_8_bit_fixed_chars == BUF_Z(buf) - BUF_BEG (buf); | |
1350 | |
1348 while (len > 0) | 1351 while (len > 0) |
1349 { | 1352 { |
1350 Ibyte tmp_str[MAX_ICHAR_LEN]; | 1353 Ibyte tmp_str[MAX_ICHAR_LEN]; |
1351 Ichar c, translated, inverse; | 1354 Ichar c, translated, inverse; |
1352 Bytecount orig_bytelen, new_bytelen, inv_bytelen; | 1355 Bytecount orig_bytelen, new_bytelen, inv_bytelen; |
1365 | 1368 |
1366 orig_bytelen = itext_ichar_len (base_pat); | 1369 orig_bytelen = itext_ichar_len (base_pat); |
1367 inv_bytelen = set_itext_ichar (tmp_str, inverse); | 1370 inv_bytelen = set_itext_ichar (tmp_str, inverse); |
1368 new_bytelen = set_itext_ichar (tmp_str, translated); | 1371 new_bytelen = set_itext_ichar (tmp_str, translated); |
1369 | 1372 |
1370 if (new_bytelen != orig_bytelen || inv_bytelen != orig_bytelen) | 1373 if (-1 == charset_base) |
1371 boyer_moore_ok = 0; | 1374 { |
1372 if (translated != c || inverse != c) | 1375 /* Keep track of which charset and character set row |
1373 { | 1376 contains the characters that need translation. |
1374 /* Keep track of which charset and character set row | 1377 |
1375 contains the characters that need translation. | 1378 Zero out the bits corresponding to the last byte. */ |
1376 Zero out the bits corresponding to the last byte. | 1379 charset_base = c & ~ICHAR_FIELD3_MASK; |
1377 */ | 1380 } |
1378 int charset_base_code = c & ~ICHAR_FIELD3_MASK; | 1381 |
1379 if (charset_base == -1) | 1382 if (boyer_moore_ok && (translated != c || inverse != c)) |
1380 charset_base = charset_base_code; | 1383 { |
1381 else if (charset_base != charset_base_code) | 1384 Ichar starting_c = c; |
1382 /* If two different rows appear, needing translation, then | 1385 int charset_base_code; |
1383 we cannot use boyer_moore search. See the comment at the | 1386 |
1384 head of boyer_moore(). */ | 1387 do |
1385 boyer_moore_ok = 0; | 1388 { |
1386 } | 1389 c = TRANSLATE (inverse_trt, c); |
1390 | |
1391 /* If a character cannot occur in the buffer, ignore | |
1392 it. */ | |
1393 if (c > 0x7F && entirely_one_byte_p) | |
1394 continue; | |
1395 | |
1396 if (c > 0xFF && nothing_greater_than_0xff) | |
1397 continue; | |
1398 | |
1399 charset_base_code = c & ~ICHAR_FIELD3_MASK; | |
1400 | |
1401 if (charset_base_code != charset_base) | |
1402 { | |
1403 /* If two different rows, or two different charsets, | |
1404 appear, needing translation, then we cannot use | |
1405 boyer_moore search. See the comment at the head of | |
1406 boyer_moore(). */ | |
1407 boyer_moore_ok = 0; | |
1408 break; | |
1409 } | |
1410 } while (c != starting_c); | |
1411 | |
1412 if (boyer_moore_ok && (charset_base != | |
1413 (translated & ~ICHAR_FIELD3_MASK))) | |
1414 { | |
1415 /* In the rare event that the CANON entry for this | |
1416 character is not in the desired set, choose one | |
1417 that is, from the equivalence set. It doesn't much | |
1418 matter which. */ | |
1419 Ichar starting_ch = translated; | |
1420 do | |
1421 { | |
1422 translated = TRANSLATE (inverse_trt, translated); | |
1423 | |
1424 if (charset_base == (translated & ~ICHAR_FIELD3_MASK)) | |
1425 break; | |
1426 | |
1427 } while (starting_ch != translated); | |
1428 | |
1429 assert (starting_ch != translated); | |
1430 | |
1431 new_bytelen = set_itext_ichar (tmp_str, translated); | |
1432 } | |
1433 } | |
1434 | |
1387 memcpy (pat, tmp_str, new_bytelen); | 1435 memcpy (pat, tmp_str, new_bytelen); |
1388 pat += new_bytelen; | 1436 pat += new_bytelen; |
1389 base_pat += orig_bytelen; | 1437 base_pat += orig_bytelen; |
1390 len -= orig_bytelen; | 1438 len -= orig_bytelen; |
1391 } | 1439 } |
1549 This kind of search works if all the characters in PAT that have | 1597 This kind of search works if all the characters in PAT that have |
1550 nontrivial translation are the same aside from the last byte. This | 1598 nontrivial translation are the same aside from the last byte. This |
1551 makes it possible to translate just the last byte of a character, | 1599 makes it possible to translate just the last byte of a character, |
1552 and do so after just a simple test of the context. | 1600 and do so after just a simple test of the context. |
1553 | 1601 |
1554 If that criterion is not satisfied, do not call this function. */ | 1602 If that criterion is not satisfied, do not call this function. You will |
1603 get an assertion failure. */ | |
1555 | 1604 |
1556 static Charbpos | 1605 static Charbpos |
1557 boyer_moore (struct buffer *buf, Ibyte *base_pat, Bytecount len, | 1606 boyer_moore (struct buffer *buf, Ibyte *base_pat, Bytecount len, |
1558 Bytebpos pos, Bytebpos lim, EMACS_INT n, Lisp_Object trt, | 1607 Bytebpos pos, Bytebpos lim, EMACS_INT n, Lisp_Object trt, |
1559 Lisp_Object inverse_trt, int USED_IF_MULE (charset_base)) | 1608 Lisp_Object inverse_trt, int USED_IF_MULE (charset_base)) |
1560 { | 1609 { |
1561 /* &&#### needs some 8-bit work here */ | |
1562 /* #### Someone really really really needs to comment the workings | 1610 /* #### Someone really really really needs to comment the workings |
1563 of this junk somewhat better. | 1611 of this junk somewhat better. |
1564 | 1612 |
1565 BTW "BM" stands for Boyer-Moore, which is one of the standard | 1613 BTW "BM" stands for Boyer-Moore, which is one of the standard |
1566 string-searching algorithms. It's the best string-searching | 1614 string-searching algorithms. It's the best string-searching |
1600 Ibyte simple_translate[0400]; | 1648 Ibyte simple_translate[0400]; |
1601 REGISTER int direction = ((n > 0) ? 1 : -1); | 1649 REGISTER int direction = ((n > 0) ? 1 : -1); |
1602 #ifdef MULE | 1650 #ifdef MULE |
1603 Ibyte translate_prev_byte = 0; | 1651 Ibyte translate_prev_byte = 0; |
1604 Ibyte translate_anteprev_byte = 0; | 1652 Ibyte translate_anteprev_byte = 0; |
1653 /* These need to be rethought in the event that the internal format | |
1654 changes, or in the event that num_8_bit_fixed_chars disappears | |
1655 (entirely_one_byte_p can be trivially worked out by checking is the | |
1656 byte count equal to the char count.) */ | |
1657 int buffer_entirely_one_byte_p = buf->text->entirely_one_byte_p; | |
1658 int buffer_nothing_greater_than_0xff = | |
1659 buf->text->num_8_bit_fixed_chars == BUF_Z(buf) - BUF_BEG (buf); | |
1605 #endif | 1660 #endif |
1606 #ifdef C_ALLOCA | 1661 #ifdef C_ALLOCA |
1607 EMACS_INT BM_tab_space[0400]; | 1662 EMACS_INT BM_tab_space[0400]; |
1608 BM_tab = &BM_tab_space[0]; | 1663 BM_tab = &BM_tab_space[0]; |
1609 #else | 1664 #else |
1682 { | 1737 { |
1683 Ibyte *charstart = ptr; | 1738 Ibyte *charstart = ptr; |
1684 while (!ibyte_first_byte_p (*charstart)) | 1739 while (!ibyte_first_byte_p (*charstart)) |
1685 charstart--; | 1740 charstart--; |
1686 untranslated = itext_ichar (charstart); | 1741 untranslated = itext_ichar (charstart); |
1687 if (charset_base == (untranslated & ~ICHAR_FIELD3_MASK)) | 1742 |
1688 { | 1743 /* We shouldn't have been passed a string with varying |
1689 ch = TRANSLATE (trt, untranslated); | 1744 character sets or rows. That's what simple_search is |
1690 if (!ibyte_first_byte_p (*ptr)) | 1745 for. */ |
1691 { | 1746 assert (charset_base == (untranslated & ~ICHAR_FIELD3_MASK)); |
1692 translate_prev_byte = ptr[-1]; | 1747 |
1693 if (!ibyte_first_byte_p (translate_prev_byte)) | 1748 ch = TRANSLATE (trt, untranslated); |
1694 translate_anteprev_byte = ptr[-2]; | 1749 if (!ibyte_first_byte_p (*ptr)) |
1695 } | 1750 { |
1696 } | 1751 translate_prev_byte = ptr[-1]; |
1697 else | 1752 if (!ibyte_first_byte_p (translate_prev_byte)) |
1698 { | 1753 translate_anteprev_byte = ptr[-2]; |
1699 this_translated = 0; | 1754 } |
1700 ch = *ptr; | 1755 |
1756 if (charset_base != (ch & ~ICHAR_FIELD3_MASK)) | |
1757 { | |
1758 /* In the very rare event that the CANON entry for this | |
1759 character is not in the desired set, choose one that | |
1760 is, from the equivalence set. It doesn't much matter | |
1761 which, since we're building our own cheesy equivalence | |
1762 table instead of using that belonging to the case | |
1763 table directly. | |
1764 | |
1765 We can get here if search_buffer has worked out that | |
1766 the buffer is entirely single width. */ | |
1767 Ichar starting_ch = ch; | |
1768 do | |
1769 { | |
1770 ch = TRANSLATE (inverse_trt, ch); | |
1771 if (charset_base == (ch & ~ICHAR_FIELD3_MASK)) | |
1772 break; | |
1773 | |
1774 } while (starting_ch != ch); | |
1775 | |
1776 /* If starting_ch is equal to ch, the case table is | |
1777 corrupt. (Any mapping in the canon table should be | |
1778 reflected in the equivalence table, and we know from | |
1779 the canon table that untranslated maps to starting_ch | |
1780 and that untranslated has the correct value for | |
1781 charset_base.) */ | |
1782 assert (starting_ch != ch); | |
1701 } | 1783 } |
1702 } | 1784 } |
1703 else | 1785 else |
1704 { | 1786 { |
1705 ch = *ptr; | 1787 ch = *ptr; |
1711 j = (unsigned char) ch; | 1793 j = (unsigned char) ch; |
1712 | 1794 |
1713 if (i == infinity) | 1795 if (i == infinity) |
1714 stride_for_teases = BM_tab[j]; | 1796 stride_for_teases = BM_tab[j]; |
1715 BM_tab[j] = dirlen - i; | 1797 BM_tab[j] = dirlen - i; |
1716 /* A translation table is accompanied by its inverse -- | 1798 /* A translation table is accompanied by its inverse -- see |
1717 see comment in casetab.c. */ | 1799 comment in casetab.c. */ |
1718 if (this_translated) | 1800 if (this_translated) |
1719 { | 1801 { |
1720 Ichar starting_ch = ch; | 1802 Ichar starting_ch = ch; |
1721 EMACS_INT starting_j = j; | 1803 EMACS_INT starting_j = j; |
1722 while (1) | 1804 do |
1723 { | 1805 { |
1724 ch = TRANSLATE (inverse_trt, ch); | 1806 ch = TRANSLATE (inverse_trt, ch); |
1725 if (ch > 0400) | 1807 |
1726 j = ((unsigned char) ch | 0200); | 1808 if (ch > 0x7F && buffer_entirely_one_byte_p) |
1727 else | 1809 continue; |
1728 j = (unsigned char) ch; | 1810 |
1729 | 1811 if (ch > 0xFF && buffer_nothing_greater_than_0xff) |
1730 /* For all the characters that map into CH, | 1812 continue; |
1731 set up simple_translate to map the last byte | 1813 |
1732 into STARTING_J. */ | 1814 if (ch > 0400) |
1733 simple_translate[j] = (Ibyte) starting_j; | 1815 j = ((unsigned char) ch | 0200); |
1734 if (ch == starting_ch) | 1816 else |
1735 break; | 1817 j = (unsigned char) ch; |
1736 BM_tab[j] = dirlen - i; | 1818 |
1737 } | 1819 /* For all the characters that map into CH, set up |
1820 simple_translate to map the last byte into | |
1821 STARTING_J. */ | |
1822 simple_translate[j] = (Ibyte) starting_j; | |
1823 BM_tab[j] = dirlen - i; | |
1824 | |
1825 } while (ch != starting_ch); | |
1738 } | 1826 } |
1739 #else | 1827 #else |
1740 EMACS_INT k; | 1828 EMACS_INT k; |
1741 j = *ptr; | 1829 j = *ptr; |
1742 k = (j = TRANSLATE (trt, j)); | 1830 k = (j = TRANSLATE (trt, j)); |