comparison src/search.c @ 4552:9c1cfceab252

Automated merge with file:/Sources/xemacs-21.5-checked-out
author Aidan Kehoe <kehoea@parhasard.net>
date Thu, 13 Mar 2008 10:24:34 +0100
parents 69b803c646cd
children 91a023144e72 19a72041c5ed
comparison
equal deleted inserted replaced
4551:6812571bfcb9 4552:9c1cfceab252
44 44
45 #define TRANSLATE(table, pos) \ 45 #define TRANSLATE(table, pos) \
46 (!NILP (table) ? TRT_TABLE_OF (table, (Ichar) pos) : pos) 46 (!NILP (table) ? TRT_TABLE_OF (table, (Ichar) pos) : pos)
47 47
48 #define REGEXP_CACHE_SIZE 20 48 #define REGEXP_CACHE_SIZE 20
49
50 #ifdef DEBUG_XEMACS
51
52 /* Used in tests/automated/case-tests.el if available. */
53 Fixnum debug_xemacs_searches;
54
55 Lisp_Object Qsearch_algorithm_used, Qboyer_moore, Qsimple_search;
56
57 #endif
49 58
50 /* If the regexp is non-nil, then the buffer contains the compiled form 59 /* If the regexp is non-nil, then the buffer contains the compiled form
51 of that regexp, suitable for searching. */ 60 of that regexp, suitable for searching. */
52 struct regexp_cache 61 struct regexp_cache
53 { 62 {
1368 1377
1369 orig_bytelen = itext_ichar_len (base_pat); 1378 orig_bytelen = itext_ichar_len (base_pat);
1370 inv_bytelen = set_itext_ichar (tmp_str, inverse); 1379 inv_bytelen = set_itext_ichar (tmp_str, inverse);
1371 new_bytelen = set_itext_ichar (tmp_str, translated); 1380 new_bytelen = set_itext_ichar (tmp_str, translated);
1372 1381
1373 if (-1 == charset_base) 1382 if (boyer_moore_ok
1374 { 1383 /* Only do the Boyer-Moore check for characters needing
1375 /* Keep track of which charset and character set row 1384 translation. */
1376 contains the characters that need translation. 1385 && (translated != c || inverse != c))
1377
1378 Zero out the bits corresponding to the last byte. */
1379 charset_base = c & ~ICHAR_FIELD3_MASK;
1380 }
1381
1382 if (boyer_moore_ok && (translated != c || inverse != c))
1383 { 1386 {
1384 Ichar starting_c = c; 1387 Ichar starting_c = c;
1385 int charset_base_code; 1388 int charset_base_code, checked = 0;
1386 1389
1387 do 1390 do
1388 { 1391 {
1389 c = TRANSLATE (inverse_trt, c); 1392 c = TRANSLATE (inverse_trt, c);
1390 1393
1394 continue; 1397 continue;
1395 1398
1396 if (c > 0xFF && nothing_greater_than_0xff) 1399 if (c > 0xFF && nothing_greater_than_0xff)
1397 continue; 1400 continue;
1398 1401
1399 charset_base_code = c & ~ICHAR_FIELD3_MASK; 1402 checked = 1;
1400 1403
1401 if (charset_base_code != charset_base) 1404 if (-1 == charset_base) /* No charset yet specified. */
1402 { 1405 {
1403 /* If two different rows, or two different charsets, 1406 /* Keep track of which charset and character set row
1404 appear, needing translation, then we cannot use 1407 contains the characters that need translation.
1405 boyer_moore search. See the comment at the head of 1408
1406 boyer_moore(). */ 1409 Zero out the bits corresponding to the last
1407 boyer_moore_ok = 0; 1410 byte. */
1408 break; 1411 charset_base = c & ~ICHAR_FIELD3_MASK;
1412 }
1413 else
1414 {
1415 charset_base_code = c & ~ICHAR_FIELD3_MASK;
1416
1417 if (charset_base_code != charset_base)
1418 {
1419 /* If two different rows, or two different
1420 charsets, appear, needing non-ASCII
1421 translation, then we cannot use boyer_moore
1422 search. See the comment at the head of
1423 boyer_moore(). */
1424 boyer_moore_ok = 0;
1425 break;
1426 }
1409 } 1427 }
1410 } while (c != starting_c); 1428 } while (c != starting_c);
1411 1429
1412 if (boyer_moore_ok && (charset_base != 1430 if (!checked)
1413 (translated & ~ICHAR_FIELD3_MASK))) 1431 {
1432 #ifdef DEBUG_XEMACS
1433 if (debug_xemacs_searches)
1434 {
1435 Lisp_Symbol *sym = XSYMBOL (Qsearch_algorithm_used);
1436 sym->value = Qnil;
1437 }
1438 #endif
1439 /* The "continue" clauses were used above, for every
1440 translation of the character. As such, this character
1441 is not to be found in the buffer and neither is the
1442 string as a whole. Return immediately; also avoid
1443 triggering the assertion a few lines down. */
1444 return n > 0 ? -n : n;
1445 }
1446
1447 if (boyer_moore_ok && charset_base != -1 &&
1448 charset_base != (translated & ~ICHAR_FIELD3_MASK))
1414 { 1449 {
1415 /* In the rare event that the CANON entry for this 1450 /* In the rare event that the CANON entry for this
1416 character is not in the desired set, choose one 1451 character is not in the desired set, choose one
1417 that is, from the equivalence set. It doesn't much 1452 that is, from the equivalence set. It doesn't much
1418 matter which. */ 1453 matter which. */
1435 memcpy (pat, tmp_str, new_bytelen); 1470 memcpy (pat, tmp_str, new_bytelen);
1436 pat += new_bytelen; 1471 pat += new_bytelen;
1437 base_pat += orig_bytelen; 1472 base_pat += orig_bytelen;
1438 len -= orig_bytelen; 1473 len -= orig_bytelen;
1439 } 1474 }
1475
1476 if (-1 == charset_base)
1477 {
1478 charset_base = 'a' & ~ICHAR_FIELD3_MASK; /* Default to ASCII. */
1479 }
1480
1440 #else /* not MULE */ 1481 #else /* not MULE */
1441 while (--len >= 0) 1482 while (--len >= 0)
1442 { 1483 {
1443 /* If we got here and the RE flag is set, it's because 1484 /* If we got here and the RE flag is set, it's because
1444 we're dealing with a regexp known to be trivial, so the 1485 we're dealing with a regexp known to be trivial, so the
1451 *pat++ = TRANSLATE (trt, *base_pat++); 1492 *pat++ = TRANSLATE (trt, *base_pat++);
1452 } 1493 }
1453 #endif /* MULE */ 1494 #endif /* MULE */
1454 len = pat - patbuf; 1495 len = pat - patbuf;
1455 pat = base_pat = patbuf; 1496 pat = base_pat = patbuf;
1497
1498 #ifdef DEBUG_XEMACS
1499 if (debug_xemacs_searches)
1500 {
1501 Lisp_Symbol *sym = XSYMBOL (Qsearch_algorithm_used);
1502 sym->value = boyer_moore_ok ? Qboyer_moore : Qsimple_search;
1503 }
1504 #endif
1505
1456 if (boyer_moore_ok) 1506 if (boyer_moore_ok)
1457 return boyer_moore (buf, base_pat, len, pos, lim, n, 1507 return boyer_moore (buf, base_pat, len, pos, lim, n,
1458 trt, inverse_trt, charset_base); 1508 trt, inverse_trt, charset_base);
1459 else 1509 else
1460 return simple_search (buf, base_pat, len, pos, lim, n, trt); 1510 return simple_search (buf, base_pat, len, pos, lim, n, trt);
1593 from buffer position POS/POS_BYTE until LIM/LIM_BYTE. 1643 from buffer position POS/POS_BYTE until LIM/LIM_BYTE.
1594 DIRECTION says which direction we search in. 1644 DIRECTION says which direction we search in.
1595 TRT and INVERSE_TRT are translation tables. 1645 TRT and INVERSE_TRT are translation tables.
1596 1646
1597 This kind of search works if all the characters in PAT that have 1647 This kind of search works if all the characters in PAT that have
1598 nontrivial translation are the same aside from the last byte. This 1648 (non-ASCII) translation are the same aside from the last byte. This
1599 makes it possible to translate just the last byte of a character, 1649 makes it possible to translate just the last byte of a character, and do
1600 and do so after just a simple test of the context. 1650 so after just a simple test of the context.
1601 1651
1602 If that criterion is not satisfied, do not call this function. You will 1652 If that criterion is not satisfied, do not call this function. You will
1603 get an assertion failure. */ 1653 get an assertion failure. */
1604 1654
1605 static Charbpos 1655 static Charbpos
1738 Ibyte *charstart = ptr; 1788 Ibyte *charstart = ptr;
1739 while (!ibyte_first_byte_p (*charstart)) 1789 while (!ibyte_first_byte_p (*charstart))
1740 charstart--; 1790 charstart--;
1741 untranslated = itext_ichar (charstart); 1791 untranslated = itext_ichar (charstart);
1742 1792
1743 /* We shouldn't have been passed a string with varying
1744 character sets or rows. That's what simple_search is
1745 for. */
1746 assert (charset_base == (untranslated & ~ICHAR_FIELD3_MASK));
1747
1748 ch = TRANSLATE (trt, untranslated); 1793 ch = TRANSLATE (trt, untranslated);
1749 if (!ibyte_first_byte_p (*ptr)) 1794 if (!ibyte_first_byte_p (*ptr))
1750 { 1795 {
1751 translate_prev_byte = ptr[-1]; 1796 translate_prev_byte = ptr[-1];
1752 if (!ibyte_first_byte_p (translate_prev_byte)) 1797 if (!ibyte_first_byte_p (translate_prev_byte))
1753 translate_anteprev_byte = ptr[-2]; 1798 translate_anteprev_byte = ptr[-2];
1754 } 1799 }
1755 1800
1756 if (charset_base != (ch & ~ICHAR_FIELD3_MASK)) 1801 if (ch != untranslated && /* Was translation done? */
1802 charset_base != (ch & ~ICHAR_FIELD3_MASK))
1757 { 1803 {
1758 /* In the very rare event that the CANON entry for this 1804 /* In the very rare event that the CANON entry for this
1759 character is not in the desired set, choose one that 1805 character is not in the desired set, choose one that
1760 is, from the equivalence set. It doesn't much matter 1806 is, from the equivalence set. It doesn't much matter
1761 which, since we're building our own cheesy equivalence 1807 which, since we're building our own cheesy equivalence
1763 table directly. 1809 table directly.
1764 1810
1765 We can get here if search_buffer has worked out that 1811 We can get here if search_buffer has worked out that
1766 the buffer is entirely single width. */ 1812 the buffer is entirely single width. */
1767 Ichar starting_ch = ch; 1813 Ichar starting_ch = ch;
1814 int count = 0;
1768 do 1815 do
1769 { 1816 {
1770 ch = TRANSLATE (inverse_trt, ch); 1817 ch = TRANSLATE (inverse_trt, ch);
1771 if (charset_base == (ch & ~ICHAR_FIELD3_MASK)) 1818 if (charset_base == (ch & ~ICHAR_FIELD3_MASK))
1772 break; 1819 break;
1773 1820 ++count;
1774 } while (starting_ch != ch); 1821 } while (starting_ch != ch);
1775 1822
1776 /* If starting_ch is equal to ch, the case table is 1823 /* If starting_ch is equal to ch (and count is not one,
1777 corrupt. (Any mapping in the canon table should be 1824 which means no translation is necessary), the case
1778 reflected in the equivalence table, and we know from 1825 table is corrupt. (Any mapping in the canon table
1779 the canon table that untranslated maps to starting_ch 1826 should be reflected in the equivalence table, and we
1780 and that untranslated has the correct value for 1827 know from the canon table that untranslated maps to
1781 charset_base.) */ 1828 starting_ch and that untranslated has the correct value
1782 assert (starting_ch != ch); 1829 for charset_base.) */
1830 assert (1 == count || starting_ch != ch);
1783 } 1831 }
1784 } 1832 }
1785 else 1833 else
1786 { 1834 {
1787 ch = *ptr; 1835 ch = *ptr;
3318 */ ); 3366 */ );
3319 warn_about_possibly_incompatible_back_references = 1; 3367 warn_about_possibly_incompatible_back_references = 1;
3320 3368
3321 Vskip_chars_range_table = Fmake_range_table (Qstart_closed_end_closed); 3369 Vskip_chars_range_table = Fmake_range_table (Qstart_closed_end_closed);
3322 staticpro (&Vskip_chars_range_table); 3370 staticpro (&Vskip_chars_range_table);
3323 } 3371 #ifdef DEBUG_XEMACS
3372 DEFSYMBOL (Qsearch_algorithm_used);
3373 DEFSYMBOL (Qboyer_moore);
3374 DEFSYMBOL (Qsimple_search);
3375
3376 DEFVAR_INT ("debug-xemacs-searches", &debug_xemacs_searches /*
3377 If non-zero, bind `search-algorithm-used' to `boyer-moore' or `simple-search',
3378 depending on the algorithm used for each search. Used for testing.
3379 */ );
3380 debug_xemacs_searches = 0;
3381 #endif
3382 }