Mercurial > hg > xemacs-beta
comparison src/search.c @ 4414:df576f30c1d8
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
2008-01-30 Aidan Kehoe <kehoea@parhasard.net>
* automated/case-tests.el:
Check for a bug Mike Sperber reported; check algorithms used, if
available.
2008-01-30 Aidan Kehoe <kehoea@parhasard.net>
* search.c (debug-xemacs-searches):
New variable, available on debug builds. Used in
tests/automated/case-tests.el.
(search_buffer): Only store the charset_base for characters with
translations. Correct some comments, correct some checks. If
debug_xemacs_searches is non-zero, record which search was used.
(boyer_moore): Remove an assertion that was incorrect. Remove its
documentation. Correct an assertion dealing with equivalence
tables; we may end up looking through the equivalence table if a
non-ASCII non-case character was searched for.
author | Aidan Kehoe <kehoea@parhasard.net> |
---|---|
date | Wed, 30 Jan 2008 09:26:59 +0100 |
parents | 4ee73bbe4f8e |
children | 69b803c646cd |
comparison
equal
deleted
inserted
replaced
4413:dc84ec90b463 | 4414:df576f30c1d8 |
---|---|
44 | 44 |
45 #define TRANSLATE(table, pos) \ | 45 #define TRANSLATE(table, pos) \ |
46 (!NILP (table) ? TRT_TABLE_OF (table, (Ichar) pos) : pos) | 46 (!NILP (table) ? TRT_TABLE_OF (table, (Ichar) pos) : pos) |
47 | 47 |
48 #define REGEXP_CACHE_SIZE 20 | 48 #define REGEXP_CACHE_SIZE 20 |
49 | |
50 #ifdef DEBUG_XEMACS | |
51 | |
52 /* Used in tests/automated/case-tests.el if available. */ | |
53 Fixnum debug_xemacs_searches; | |
54 | |
55 Lisp_Object Qsearch_algorithm_used, Qboyer_moore, Qsimple_search; | |
56 | |
57 #endif | |
49 | 58 |
50 /* If the regexp is non-nil, then the buffer contains the compiled form | 59 /* If the regexp is non-nil, then the buffer contains the compiled form |
51 of that regexp, suitable for searching. */ | 60 of that regexp, suitable for searching. */ |
52 struct regexp_cache | 61 struct regexp_cache |
53 { | 62 { |
1368 | 1377 |
1369 orig_bytelen = itext_ichar_len (base_pat); | 1378 orig_bytelen = itext_ichar_len (base_pat); |
1370 inv_bytelen = set_itext_ichar (tmp_str, inverse); | 1379 inv_bytelen = set_itext_ichar (tmp_str, inverse); |
1371 new_bytelen = set_itext_ichar (tmp_str, translated); | 1380 new_bytelen = set_itext_ichar (tmp_str, translated); |
1372 | 1381 |
1373 if (-1 == charset_base) | 1382 if (boyer_moore_ok |
1374 { | 1383 /* Only do the Boyer-Moore check for characters needing |
1375 /* Keep track of which charset and character set row | 1384 translation. */ |
1376 contains the characters that need translation. | 1385 && (translated != c || inverse != c)) |
1377 | |
1378 Zero out the bits corresponding to the last byte. */ | |
1379 charset_base = c & ~ICHAR_FIELD3_MASK; | |
1380 } | |
1381 | |
1382 if (boyer_moore_ok && (translated != c || inverse != c)) | |
1383 { | 1386 { |
1384 Ichar starting_c = c; | 1387 Ichar starting_c = c; |
1385 int charset_base_code; | 1388 int charset_base_code; |
1386 | 1389 |
1387 do | 1390 do |
1394 continue; | 1397 continue; |
1395 | 1398 |
1396 if (c > 0xFF && nothing_greater_than_0xff) | 1399 if (c > 0xFF && nothing_greater_than_0xff) |
1397 continue; | 1400 continue; |
1398 | 1401 |
1399 charset_base_code = c & ~ICHAR_FIELD3_MASK; | 1402 if (-1 == charset_base) /* No charset yet specified. */ |
1400 | |
1401 if (charset_base_code != charset_base) | |
1402 { | 1403 { |
1403 /* If two different rows, or two different charsets, | 1404 /* Keep track of which charset and character set row |
1404 appear, needing translation, then we cannot use | 1405 contains the characters that need translation. |
1405 boyer_moore search. See the comment at the head of | 1406 |
1406 boyer_moore(). */ | 1407 Zero out the bits corresponding to the last |
1407 boyer_moore_ok = 0; | 1408 byte. */ |
1408 break; | 1409 charset_base = c & ~ICHAR_FIELD3_MASK; |
1410 } | |
1411 else | |
1412 { | |
1413 charset_base_code = c & ~ICHAR_FIELD3_MASK; | |
1414 | |
1415 if (charset_base_code != charset_base) | |
1416 { | |
1417 /* If two different rows, or two different | |
1418 charsets, appear, needing non-ASCII | |
1419 translation, then we cannot use boyer_moore | |
1420 search. See the comment at the head of | |
1421 boyer_moore(). */ | |
1422 boyer_moore_ok = 0; | |
1423 break; | |
1424 } | |
1409 } | 1425 } |
1410 } while (c != starting_c); | 1426 } while (c != starting_c); |
1411 | 1427 |
1412 if (boyer_moore_ok && (charset_base != | 1428 if (boyer_moore_ok && charset_base != -1 && |
1413 (translated & ~ICHAR_FIELD3_MASK))) | 1429 charset_base != (translated & ~ICHAR_FIELD3_MASK)) |
1414 { | 1430 { |
1415 /* In the rare event that the CANON entry for this | 1431 /* In the rare event that the CANON entry for this |
1416 character is not in the desired set, choose one | 1432 character is not in the desired set, choose one |
1417 that is, from the equivalence set. It doesn't much | 1433 that is, from the equivalence set. It doesn't much |
1418 matter which. */ | 1434 matter which. */ |
1435 memcpy (pat, tmp_str, new_bytelen); | 1451 memcpy (pat, tmp_str, new_bytelen); |
1436 pat += new_bytelen; | 1452 pat += new_bytelen; |
1437 base_pat += orig_bytelen; | 1453 base_pat += orig_bytelen; |
1438 len -= orig_bytelen; | 1454 len -= orig_bytelen; |
1439 } | 1455 } |
1456 | |
1457 if (-1 == charset_base) | |
1458 { | |
1459 charset_base = 'a' & ~ICHAR_FIELD3_MASK; /* Default to ASCII. */ | |
1460 } | |
1461 | |
1440 #else /* not MULE */ | 1462 #else /* not MULE */ |
1441 while (--len >= 0) | 1463 while (--len >= 0) |
1442 { | 1464 { |
1443 /* If we got here and the RE flag is set, it's because | 1465 /* If we got here and the RE flag is set, it's because |
1444 we're dealing with a regexp known to be trivial, so the | 1466 we're dealing with a regexp known to be trivial, so the |
1451 *pat++ = TRANSLATE (trt, *base_pat++); | 1473 *pat++ = TRANSLATE (trt, *base_pat++); |
1452 } | 1474 } |
1453 #endif /* MULE */ | 1475 #endif /* MULE */ |
1454 len = pat - patbuf; | 1476 len = pat - patbuf; |
1455 pat = base_pat = patbuf; | 1477 pat = base_pat = patbuf; |
1478 | |
1479 #ifdef DEBUG_XEMACS | |
1480 if (debug_xemacs_searches) | |
1481 { | |
1482 Lisp_Symbol *sym = XSYMBOL (Qsearch_algorithm_used); | |
1483 sym->value = boyer_moore_ok ? Qboyer_moore : Qsimple_search; | |
1484 } | |
1485 #endif | |
1486 | |
1456 if (boyer_moore_ok) | 1487 if (boyer_moore_ok) |
1457 return boyer_moore (buf, base_pat, len, pos, lim, n, | 1488 return boyer_moore (buf, base_pat, len, pos, lim, n, |
1458 trt, inverse_trt, charset_base); | 1489 trt, inverse_trt, charset_base); |
1459 else | 1490 else |
1460 return simple_search (buf, base_pat, len, pos, lim, n, trt); | 1491 return simple_search (buf, base_pat, len, pos, lim, n, trt); |
1593 from buffer position POS/POS_BYTE until LIM/LIM_BYTE. | 1624 from buffer position POS/POS_BYTE until LIM/LIM_BYTE. |
1594 DIRECTION says which direction we search in. | 1625 DIRECTION says which direction we search in. |
1595 TRT and INVERSE_TRT are translation tables. | 1626 TRT and INVERSE_TRT are translation tables. |
1596 | 1627 |
1597 This kind of search works if all the characters in PAT that have | 1628 This kind of search works if all the characters in PAT that have |
1598 nontrivial translation are the same aside from the last byte. This | 1629 (non-ASCII) translation are the same aside from the last byte. This |
1599 makes it possible to translate just the last byte of a character, | 1630 makes it possible to translate just the last byte of a character, and do |
1600 and do so after just a simple test of the context. | 1631 so after just a simple test of the context. |
1601 | 1632 |
1602 If that criterion is not satisfied, do not call this function. You will | 1633 If that criterion is not satisfied, do not call this function. You will |
1603 get an assertion failure. */ | 1634 get an assertion failure. */ |
1604 | 1635 |
1605 static Charbpos | 1636 static Charbpos |
1738 Ibyte *charstart = ptr; | 1769 Ibyte *charstart = ptr; |
1739 while (!ibyte_first_byte_p (*charstart)) | 1770 while (!ibyte_first_byte_p (*charstart)) |
1740 charstart--; | 1771 charstart--; |
1741 untranslated = itext_ichar (charstart); | 1772 untranslated = itext_ichar (charstart); |
1742 | 1773 |
1743 /* We shouldn't have been passed a string with varying | |
1744 character sets or rows. That's what simple_search is | |
1745 for. */ | |
1746 assert (charset_base == (untranslated & ~ICHAR_FIELD3_MASK)); | |
1747 | |
1748 ch = TRANSLATE (trt, untranslated); | 1774 ch = TRANSLATE (trt, untranslated); |
1749 if (!ibyte_first_byte_p (*ptr)) | 1775 if (!ibyte_first_byte_p (*ptr)) |
1750 { | 1776 { |
1751 translate_prev_byte = ptr[-1]; | 1777 translate_prev_byte = ptr[-1]; |
1752 if (!ibyte_first_byte_p (translate_prev_byte)) | 1778 if (!ibyte_first_byte_p (translate_prev_byte)) |
1753 translate_anteprev_byte = ptr[-2]; | 1779 translate_anteprev_byte = ptr[-2]; |
1754 } | 1780 } |
1755 | 1781 |
1756 if (charset_base != (ch & ~ICHAR_FIELD3_MASK)) | 1782 if (ch != untranslated && /* Was translation done? */ |
1783 charset_base != (ch & ~ICHAR_FIELD3_MASK)) | |
1757 { | 1784 { |
1758 /* In the very rare event that the CANON entry for this | 1785 /* In the very rare event that the CANON entry for this |
1759 character is not in the desired set, choose one that | 1786 character is not in the desired set, choose one that |
1760 is, from the equivalence set. It doesn't much matter | 1787 is, from the equivalence set. It doesn't much matter |
1761 which, since we're building our own cheesy equivalence | 1788 which, since we're building our own cheesy equivalence |
1763 table directly. | 1790 table directly. |
1764 | 1791 |
1765 We can get here if search_buffer has worked out that | 1792 We can get here if search_buffer has worked out that |
1766 the buffer is entirely single width. */ | 1793 the buffer is entirely single width. */ |
1767 Ichar starting_ch = ch; | 1794 Ichar starting_ch = ch; |
1795 int count = 0; | |
1768 do | 1796 do |
1769 { | 1797 { |
1770 ch = TRANSLATE (inverse_trt, ch); | 1798 ch = TRANSLATE (inverse_trt, ch); |
1771 if (charset_base == (ch & ~ICHAR_FIELD3_MASK)) | 1799 if (charset_base == (ch & ~ICHAR_FIELD3_MASK)) |
1772 break; | 1800 break; |
1773 | 1801 ++count; |
1774 } while (starting_ch != ch); | 1802 } while (starting_ch != ch); |
1775 | 1803 |
1776 /* If starting_ch is equal to ch, the case table is | 1804 /* If starting_ch is equal to ch (and count is not one, |
1777 corrupt. (Any mapping in the canon table should be | 1805 which means no translation is necessary), the case |
1778 reflected in the equivalence table, and we know from | 1806 table is corrupt. (Any mapping in the canon table |
1779 the canon table that untranslated maps to starting_ch | 1807 should be reflected in the equivalence table, and we |
1780 and that untranslated has the correct value for | 1808 know from the canon table that untranslated maps to |
1781 charset_base.) */ | 1809 starting_ch and that untranslated has the correct value |
1782 assert (starting_ch != ch); | 1810 for charset_base.) */ |
1811 assert (1 == count || starting_ch != ch); | |
1783 } | 1812 } |
1784 } | 1813 } |
1785 else | 1814 else |
1786 { | 1815 { |
1787 ch = *ptr; | 1816 ch = *ptr; |
3318 */ ); | 3347 */ ); |
3319 warn_about_possibly_incompatible_back_references = 1; | 3348 warn_about_possibly_incompatible_back_references = 1; |
3320 | 3349 |
3321 Vskip_chars_range_table = Fmake_range_table (Qstart_closed_end_closed); | 3350 Vskip_chars_range_table = Fmake_range_table (Qstart_closed_end_closed); |
3322 staticpro (&Vskip_chars_range_table); | 3351 staticpro (&Vskip_chars_range_table); |
3323 } | 3352 #ifdef DEBUG_XEMACS |
3353 DEFSYMBOL (Qsearch_algorithm_used); | |
3354 DEFSYMBOL (Qboyer_moore); | |
3355 DEFSYMBOL (Qsimple_search); | |
3356 | |
3357 DEFVAR_INT ("debug-xemacs-searches", &debug_xemacs_searches /* | |
3358 If non-zero, bind `search-algorithm-used' to `boyer-moore' or `simple-search', | |
3359 depending on the algorithm used for each search. Used for testing. | |
3360 */ ); | |
3361 debug_xemacs_searches = 0; | |
3362 #endif | |
3363 } |