comparison src/search.c @ 4407:4ee73bbe4f8e

Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings. 2007-12-26 Aidan Kehoe <kehoea@parhasard.net> * casetab.c: Extend and correct some case table documentation. * search.c (search_buffer): Correct a bug where only the first entry for a character in the case equivalence table was examined in determining if the Boyer-Moore search algorithm is appropriate. If there are case mappings outside of the charset and row of the characters specified in the search string, those case mappings can be safely ignored (and Boyer-Moore search can be used) if we know from the buffer statistics that the corresponding characters cannot occur. * search.c (boyer_moore): Assert that we haven't been passed a string with varying characters sets or rows within character sets. That's what simple_search is for. In the very rare event that a character in the search string has a canonical case mapping that is not in the same character set and row, don't try to search for the canonical character, search for some other character that is in the the desired character set and row. Assert that the case table isn't corrupt. Do not search for any character case mappings that cannot possibly occur in the buffer, given the buffer metadata about its contents.
author Aidan Kehoe <kehoea@parhasard.net>
date Wed, 26 Dec 2007 17:30:16 +0100
parents f70e56bb52a7
children df576f30c1d8
comparison
equal deleted inserted replaced
4356:cc293ef846d2 4407:4ee73bbe4f8e
1338 } 1338 }
1339 else /* non-RE case */ 1339 else /* non-RE case */
1340 { 1340 {
1341 int charset_base = -1; 1341 int charset_base = -1;
1342 int boyer_moore_ok = 1; 1342 int boyer_moore_ok = 1;
1343 Ibyte *pat = 0;
1344 Ibyte *patbuf = alloca_ibytes (len * MAX_ICHAR_LEN); 1343 Ibyte *patbuf = alloca_ibytes (len * MAX_ICHAR_LEN);
1345 pat = patbuf; 1344 Ibyte *pat = patbuf;
1345
1346 #ifdef MULE 1346 #ifdef MULE
1347 /* &&#### needs some 8-bit work here */ 1347 int entirely_one_byte_p = buf->text->entirely_one_byte_p;
1348 int nothing_greater_than_0xff =
1349 buf->text->num_8_bit_fixed_chars == BUF_Z(buf) - BUF_BEG (buf);
1350
1348 while (len > 0) 1351 while (len > 0)
1349 { 1352 {
1350 Ibyte tmp_str[MAX_ICHAR_LEN]; 1353 Ibyte tmp_str[MAX_ICHAR_LEN];
1351 Ichar c, translated, inverse; 1354 Ichar c, translated, inverse;
1352 Bytecount orig_bytelen, new_bytelen, inv_bytelen; 1355 Bytecount orig_bytelen, new_bytelen, inv_bytelen;
1365 1368
1366 orig_bytelen = itext_ichar_len (base_pat); 1369 orig_bytelen = itext_ichar_len (base_pat);
1367 inv_bytelen = set_itext_ichar (tmp_str, inverse); 1370 inv_bytelen = set_itext_ichar (tmp_str, inverse);
1368 new_bytelen = set_itext_ichar (tmp_str, translated); 1371 new_bytelen = set_itext_ichar (tmp_str, translated);
1369 1372
1370 if (new_bytelen != orig_bytelen || inv_bytelen != orig_bytelen) 1373 if (-1 == charset_base)
1371 boyer_moore_ok = 0; 1374 {
1372 if (translated != c || inverse != c) 1375 /* Keep track of which charset and character set row
1373 { 1376 contains the characters that need translation.
1374 /* Keep track of which charset and character set row 1377
1375 contains the characters that need translation. 1378 Zero out the bits corresponding to the last byte. */
1376 Zero out the bits corresponding to the last byte. 1379 charset_base = c & ~ICHAR_FIELD3_MASK;
1377 */ 1380 }
1378 int charset_base_code = c & ~ICHAR_FIELD3_MASK; 1381
1379 if (charset_base == -1) 1382 if (boyer_moore_ok && (translated != c || inverse != c))
1380 charset_base = charset_base_code; 1383 {
1381 else if (charset_base != charset_base_code) 1384 Ichar starting_c = c;
1382 /* If two different rows appear, needing translation, then 1385 int charset_base_code;
1383 we cannot use boyer_moore search. See the comment at the 1386
1384 head of boyer_moore(). */ 1387 do
1385 boyer_moore_ok = 0; 1388 {
1386 } 1389 c = TRANSLATE (inverse_trt, c);
1390
1391 /* If a character cannot occur in the buffer, ignore
1392 it. */
1393 if (c > 0x7F && entirely_one_byte_p)
1394 continue;
1395
1396 if (c > 0xFF && nothing_greater_than_0xff)
1397 continue;
1398
1399 charset_base_code = c & ~ICHAR_FIELD3_MASK;
1400
1401 if (charset_base_code != charset_base)
1402 {
1403 /* If two different rows, or two different charsets,
1404 appear, needing translation, then we cannot use
1405 boyer_moore search. See the comment at the head of
1406 boyer_moore(). */
1407 boyer_moore_ok = 0;
1408 break;
1409 }
1410 } while (c != starting_c);
1411
1412 if (boyer_moore_ok && (charset_base !=
1413 (translated & ~ICHAR_FIELD3_MASK)))
1414 {
1415 /* In the rare event that the CANON entry for this
1416 character is not in the desired set, choose one
1417 that is, from the equivalence set. It doesn't much
1418 matter which. */
1419 Ichar starting_ch = translated;
1420 do
1421 {
1422 translated = TRANSLATE (inverse_trt, translated);
1423
1424 if (charset_base == (translated & ~ICHAR_FIELD3_MASK))
1425 break;
1426
1427 } while (starting_ch != translated);
1428
1429 assert (starting_ch != translated);
1430
1431 new_bytelen = set_itext_ichar (tmp_str, translated);
1432 }
1433 }
1434
1387 memcpy (pat, tmp_str, new_bytelen); 1435 memcpy (pat, tmp_str, new_bytelen);
1388 pat += new_bytelen; 1436 pat += new_bytelen;
1389 base_pat += orig_bytelen; 1437 base_pat += orig_bytelen;
1390 len -= orig_bytelen; 1438 len -= orig_bytelen;
1391 } 1439 }
1549 This kind of search works if all the characters in PAT that have 1597 This kind of search works if all the characters in PAT that have
1550 nontrivial translation are the same aside from the last byte. This 1598 nontrivial translation are the same aside from the last byte. This
1551 makes it possible to translate just the last byte of a character, 1599 makes it possible to translate just the last byte of a character,
1552 and do so after just a simple test of the context. 1600 and do so after just a simple test of the context.
1553 1601
1554 If that criterion is not satisfied, do not call this function. */ 1602 If that criterion is not satisfied, do not call this function. You will
1603 get an assertion failure. */
1555 1604
1556 static Charbpos 1605 static Charbpos
1557 boyer_moore (struct buffer *buf, Ibyte *base_pat, Bytecount len, 1606 boyer_moore (struct buffer *buf, Ibyte *base_pat, Bytecount len,
1558 Bytebpos pos, Bytebpos lim, EMACS_INT n, Lisp_Object trt, 1607 Bytebpos pos, Bytebpos lim, EMACS_INT n, Lisp_Object trt,
1559 Lisp_Object inverse_trt, int USED_IF_MULE (charset_base)) 1608 Lisp_Object inverse_trt, int USED_IF_MULE (charset_base))
1560 { 1609 {
1561 /* &&#### needs some 8-bit work here */
1562 /* #### Someone really really really needs to comment the workings 1610 /* #### Someone really really really needs to comment the workings
1563 of this junk somewhat better. 1611 of this junk somewhat better.
1564 1612
1565 BTW "BM" stands for Boyer-Moore, which is one of the standard 1613 BTW "BM" stands for Boyer-Moore, which is one of the standard
1566 string-searching algorithms. It's the best string-searching 1614 string-searching algorithms. It's the best string-searching
1600 Ibyte simple_translate[0400]; 1648 Ibyte simple_translate[0400];
1601 REGISTER int direction = ((n > 0) ? 1 : -1); 1649 REGISTER int direction = ((n > 0) ? 1 : -1);
1602 #ifdef MULE 1650 #ifdef MULE
1603 Ibyte translate_prev_byte = 0; 1651 Ibyte translate_prev_byte = 0;
1604 Ibyte translate_anteprev_byte = 0; 1652 Ibyte translate_anteprev_byte = 0;
1653 /* These need to be rethought in the event that the internal format
1654 changes, or in the event that num_8_bit_fixed_chars disappears
1655 (entirely_one_byte_p can be trivially worked out by checking is the
1656 byte count equal to the char count.) */
1657 int buffer_entirely_one_byte_p = buf->text->entirely_one_byte_p;
1658 int buffer_nothing_greater_than_0xff =
1659 buf->text->num_8_bit_fixed_chars == BUF_Z(buf) - BUF_BEG (buf);
1605 #endif 1660 #endif
1606 #ifdef C_ALLOCA 1661 #ifdef C_ALLOCA
1607 EMACS_INT BM_tab_space[0400]; 1662 EMACS_INT BM_tab_space[0400];
1608 BM_tab = &BM_tab_space[0]; 1663 BM_tab = &BM_tab_space[0];
1609 #else 1664 #else
1682 { 1737 {
1683 Ibyte *charstart = ptr; 1738 Ibyte *charstart = ptr;
1684 while (!ibyte_first_byte_p (*charstart)) 1739 while (!ibyte_first_byte_p (*charstart))
1685 charstart--; 1740 charstart--;
1686 untranslated = itext_ichar (charstart); 1741 untranslated = itext_ichar (charstart);
1687 if (charset_base == (untranslated & ~ICHAR_FIELD3_MASK)) 1742
1688 { 1743 /* We shouldn't have been passed a string with varying
1689 ch = TRANSLATE (trt, untranslated); 1744 character sets or rows. That's what simple_search is
1690 if (!ibyte_first_byte_p (*ptr)) 1745 for. */
1691 { 1746 assert (charset_base == (untranslated & ~ICHAR_FIELD3_MASK));
1692 translate_prev_byte = ptr[-1]; 1747
1693 if (!ibyte_first_byte_p (translate_prev_byte)) 1748 ch = TRANSLATE (trt, untranslated);
1694 translate_anteprev_byte = ptr[-2]; 1749 if (!ibyte_first_byte_p (*ptr))
1695 } 1750 {
1696 } 1751 translate_prev_byte = ptr[-1];
1697 else 1752 if (!ibyte_first_byte_p (translate_prev_byte))
1698 { 1753 translate_anteprev_byte = ptr[-2];
1699 this_translated = 0; 1754 }
1700 ch = *ptr; 1755
1756 if (charset_base != (ch & ~ICHAR_FIELD3_MASK))
1757 {
1758 /* In the very rare event that the CANON entry for this
1759 character is not in the desired set, choose one that
1760 is, from the equivalence set. It doesn't much matter
1761 which, since we're building our own cheesy equivalence
1762 table instead of using that belonging to the case
1763 table directly.
1764
1765 We can get here if search_buffer has worked out that
1766 the buffer is entirely single width. */
1767 Ichar starting_ch = ch;
1768 do
1769 {
1770 ch = TRANSLATE (inverse_trt, ch);
1771 if (charset_base == (ch & ~ICHAR_FIELD3_MASK))
1772 break;
1773
1774 } while (starting_ch != ch);
1775
1776 /* If starting_ch is equal to ch, the case table is
1777 corrupt. (Any mapping in the canon table should be
1778 reflected in the equivalence table, and we know from
1779 the canon table that untranslated maps to starting_ch
1780 and that untranslated has the correct value for
1781 charset_base.) */
1782 assert (starting_ch != ch);
1701 } 1783 }
1702 } 1784 }
1703 else 1785 else
1704 { 1786 {
1705 ch = *ptr; 1787 ch = *ptr;
1711 j = (unsigned char) ch; 1793 j = (unsigned char) ch;
1712 1794
1713 if (i == infinity) 1795 if (i == infinity)
1714 stride_for_teases = BM_tab[j]; 1796 stride_for_teases = BM_tab[j];
1715 BM_tab[j] = dirlen - i; 1797 BM_tab[j] = dirlen - i;
1716 /* A translation table is accompanied by its inverse -- 1798 /* A translation table is accompanied by its inverse -- see
1717 see comment in casetab.c. */ 1799 comment in casetab.c. */
1718 if (this_translated) 1800 if (this_translated)
1719 { 1801 {
1720 Ichar starting_ch = ch; 1802 Ichar starting_ch = ch;
1721 EMACS_INT starting_j = j; 1803 EMACS_INT starting_j = j;
1722 while (1) 1804 do
1723 { 1805 {
1724 ch = TRANSLATE (inverse_trt, ch); 1806 ch = TRANSLATE (inverse_trt, ch);
1725 if (ch > 0400) 1807
1726 j = ((unsigned char) ch | 0200); 1808 if (ch > 0x7F && buffer_entirely_one_byte_p)
1727 else 1809 continue;
1728 j = (unsigned char) ch; 1810
1729 1811 if (ch > 0xFF && buffer_nothing_greater_than_0xff)
1730 /* For all the characters that map into CH, 1812 continue;
1731 set up simple_translate to map the last byte 1813
1732 into STARTING_J. */ 1814 if (ch > 0400)
1733 simple_translate[j] = (Ibyte) starting_j; 1815 j = ((unsigned char) ch | 0200);
1734 if (ch == starting_ch) 1816 else
1735 break; 1817 j = (unsigned char) ch;
1736 BM_tab[j] = dirlen - i; 1818
1737 } 1819 /* For all the characters that map into CH, set up
1820 simple_translate to map the last byte into
1821 STARTING_J. */
1822 simple_translate[j] = (Ibyte) starting_j;
1823 BM_tab[j] = dirlen - i;
1824
1825 } while (ch != starting_ch);
1738 } 1826 }
1739 #else 1827 #else
1740 EMACS_INT k; 1828 EMACS_INT k;
1741 j = *ptr; 1829 j = *ptr;
1742 k = (j = TRANSLATE (trt, j)); 1830 k = (j = TRANSLATE (trt, j));