comparison src/search.c @ 4414:df576f30c1d8

Correct case-insensitive search for non-case, non-ASCII chars. Add tests. 2008-01-30 Aidan Kehoe <kehoea@parhasard.net> * automated/case-tests.el: Check for a bug Mike Sperber reported; check algorithms used, if available. 2008-01-30 Aidan Kehoe <kehoea@parhasard.net> * search.c (debug-xemacs-searches): New variable, available on debug builds. Used in tests/automated/case-tests.el. (search_buffer): Only store the charset_base for characters with translations. Correct some comments, correct some checks. If debug_xemacs_searches is non-zero, record which search was used. (boyer_moore): Remove an assertion that was incorrect. Remove its documentation. Correct an assertion dealing with equivalence tables; we may end up looking through the equivalence table if a non-ASCII non-case character was searched for.
author Aidan Kehoe <kehoea@parhasard.net>
date Wed, 30 Jan 2008 09:26:59 +0100
parents 4ee73bbe4f8e
children 69b803c646cd
comparison
equal deleted inserted replaced
4413:dc84ec90b463 4414:df576f30c1d8
44 44
45 #define TRANSLATE(table, pos) \ 45 #define TRANSLATE(table, pos) \
46 (!NILP (table) ? TRT_TABLE_OF (table, (Ichar) pos) : pos) 46 (!NILP (table) ? TRT_TABLE_OF (table, (Ichar) pos) : pos)
47 47
48 #define REGEXP_CACHE_SIZE 20 48 #define REGEXP_CACHE_SIZE 20
49
50 #ifdef DEBUG_XEMACS
51
52 /* Used in tests/automated/case-tests.el if available. */
53 Fixnum debug_xemacs_searches;
54
55 Lisp_Object Qsearch_algorithm_used, Qboyer_moore, Qsimple_search;
56
57 #endif
49 58
50 /* If the regexp is non-nil, then the buffer contains the compiled form 59 /* If the regexp is non-nil, then the buffer contains the compiled form
51 of that regexp, suitable for searching. */ 60 of that regexp, suitable for searching. */
52 struct regexp_cache 61 struct regexp_cache
53 { 62 {
1368 1377
1369 orig_bytelen = itext_ichar_len (base_pat); 1378 orig_bytelen = itext_ichar_len (base_pat);
1370 inv_bytelen = set_itext_ichar (tmp_str, inverse); 1379 inv_bytelen = set_itext_ichar (tmp_str, inverse);
1371 new_bytelen = set_itext_ichar (tmp_str, translated); 1380 new_bytelen = set_itext_ichar (tmp_str, translated);
1372 1381
1373 if (-1 == charset_base) 1382 if (boyer_moore_ok
1374 { 1383 /* Only do the Boyer-Moore check for characters needing
1375 /* Keep track of which charset and character set row 1384 translation. */
1376 contains the characters that need translation. 1385 && (translated != c || inverse != c))
1377
1378 Zero out the bits corresponding to the last byte. */
1379 charset_base = c & ~ICHAR_FIELD3_MASK;
1380 }
1381
1382 if (boyer_moore_ok && (translated != c || inverse != c))
1383 { 1386 {
1384 Ichar starting_c = c; 1387 Ichar starting_c = c;
1385 int charset_base_code; 1388 int charset_base_code;
1386 1389
1387 do 1390 do
1394 continue; 1397 continue;
1395 1398
1396 if (c > 0xFF && nothing_greater_than_0xff) 1399 if (c > 0xFF && nothing_greater_than_0xff)
1397 continue; 1400 continue;
1398 1401
1399 charset_base_code = c & ~ICHAR_FIELD3_MASK; 1402 if (-1 == charset_base) /* No charset yet specified. */
1400
1401 if (charset_base_code != charset_base)
1402 { 1403 {
1403 /* If two different rows, or two different charsets, 1404 /* Keep track of which charset and character set row
1404 appear, needing translation, then we cannot use 1405 contains the characters that need translation.
1405 boyer_moore search. See the comment at the head of 1406
1406 boyer_moore(). */ 1407 Zero out the bits corresponding to the last
1407 boyer_moore_ok = 0; 1408 byte. */
1408 break; 1409 charset_base = c & ~ICHAR_FIELD3_MASK;
1410 }
1411 else
1412 {
1413 charset_base_code = c & ~ICHAR_FIELD3_MASK;
1414
1415 if (charset_base_code != charset_base)
1416 {
1417 /* If two different rows, or two different
1418 charsets, appear, needing non-ASCII
1419 translation, then we cannot use boyer_moore
1420 search. See the comment at the head of
1421 boyer_moore(). */
1422 boyer_moore_ok = 0;
1423 break;
1424 }
1409 } 1425 }
1410 } while (c != starting_c); 1426 } while (c != starting_c);
1411 1427
1412 if (boyer_moore_ok && (charset_base != 1428 if (boyer_moore_ok && charset_base != -1 &&
1413 (translated & ~ICHAR_FIELD3_MASK))) 1429 charset_base != (translated & ~ICHAR_FIELD3_MASK))
1414 { 1430 {
1415 /* In the rare event that the CANON entry for this 1431 /* In the rare event that the CANON entry for this
1416 character is not in the desired set, choose one 1432 character is not in the desired set, choose one
1417 that is, from the equivalence set. It doesn't much 1433 that is, from the equivalence set. It doesn't much
1418 matter which. */ 1434 matter which. */
1435 memcpy (pat, tmp_str, new_bytelen); 1451 memcpy (pat, tmp_str, new_bytelen);
1436 pat += new_bytelen; 1452 pat += new_bytelen;
1437 base_pat += orig_bytelen; 1453 base_pat += orig_bytelen;
1438 len -= orig_bytelen; 1454 len -= orig_bytelen;
1439 } 1455 }
1456
1457 if (-1 == charset_base)
1458 {
1459 charset_base = 'a' & ~ICHAR_FIELD3_MASK; /* Default to ASCII. */
1460 }
1461
1440 #else /* not MULE */ 1462 #else /* not MULE */
1441 while (--len >= 0) 1463 while (--len >= 0)
1442 { 1464 {
1443 /* If we got here and the RE flag is set, it's because 1465 /* If we got here and the RE flag is set, it's because
1444 we're dealing with a regexp known to be trivial, so the 1466 we're dealing with a regexp known to be trivial, so the
1451 *pat++ = TRANSLATE (trt, *base_pat++); 1473 *pat++ = TRANSLATE (trt, *base_pat++);
1452 } 1474 }
1453 #endif /* MULE */ 1475 #endif /* MULE */
1454 len = pat - patbuf; 1476 len = pat - patbuf;
1455 pat = base_pat = patbuf; 1477 pat = base_pat = patbuf;
1478
1479 #ifdef DEBUG_XEMACS
1480 if (debug_xemacs_searches)
1481 {
1482 Lisp_Symbol *sym = XSYMBOL (Qsearch_algorithm_used);
1483 sym->value = boyer_moore_ok ? Qboyer_moore : Qsimple_search;
1484 }
1485 #endif
1486
1456 if (boyer_moore_ok) 1487 if (boyer_moore_ok)
1457 return boyer_moore (buf, base_pat, len, pos, lim, n, 1488 return boyer_moore (buf, base_pat, len, pos, lim, n,
1458 trt, inverse_trt, charset_base); 1489 trt, inverse_trt, charset_base);
1459 else 1490 else
1460 return simple_search (buf, base_pat, len, pos, lim, n, trt); 1491 return simple_search (buf, base_pat, len, pos, lim, n, trt);
1593 from buffer position POS/POS_BYTE until LIM/LIM_BYTE. 1624 from buffer position POS/POS_BYTE until LIM/LIM_BYTE.
1594 DIRECTION says which direction we search in. 1625 DIRECTION says which direction we search in.
1595 TRT and INVERSE_TRT are translation tables. 1626 TRT and INVERSE_TRT are translation tables.
1596 1627
1597 This kind of search works if all the characters in PAT that have 1628 This kind of search works if all the characters in PAT that have
1598 nontrivial translation are the same aside from the last byte. This 1629 (non-ASCII) translation are the same aside from the last byte. This
1599 makes it possible to translate just the last byte of a character, 1630 makes it possible to translate just the last byte of a character, and do
1600 and do so after just a simple test of the context. 1631 so after just a simple test of the context.
1601 1632
1602 If that criterion is not satisfied, do not call this function. You will 1633 If that criterion is not satisfied, do not call this function. You will
1603 get an assertion failure. */ 1634 get an assertion failure. */
1604 1635
1605 static Charbpos 1636 static Charbpos
1738 Ibyte *charstart = ptr; 1769 Ibyte *charstart = ptr;
1739 while (!ibyte_first_byte_p (*charstart)) 1770 while (!ibyte_first_byte_p (*charstart))
1740 charstart--; 1771 charstart--;
1741 untranslated = itext_ichar (charstart); 1772 untranslated = itext_ichar (charstart);
1742 1773
1743 /* We shouldn't have been passed a string with varying
1744 character sets or rows. That's what simple_search is
1745 for. */
1746 assert (charset_base == (untranslated & ~ICHAR_FIELD3_MASK));
1747
1748 ch = TRANSLATE (trt, untranslated); 1774 ch = TRANSLATE (trt, untranslated);
1749 if (!ibyte_first_byte_p (*ptr)) 1775 if (!ibyte_first_byte_p (*ptr))
1750 { 1776 {
1751 translate_prev_byte = ptr[-1]; 1777 translate_prev_byte = ptr[-1];
1752 if (!ibyte_first_byte_p (translate_prev_byte)) 1778 if (!ibyte_first_byte_p (translate_prev_byte))
1753 translate_anteprev_byte = ptr[-2]; 1779 translate_anteprev_byte = ptr[-2];
1754 } 1780 }
1755 1781
1756 if (charset_base != (ch & ~ICHAR_FIELD3_MASK)) 1782 if (ch != untranslated && /* Was translation done? */
1783 charset_base != (ch & ~ICHAR_FIELD3_MASK))
1757 { 1784 {
1758 /* In the very rare event that the CANON entry for this 1785 /* In the very rare event that the CANON entry for this
1759 character is not in the desired set, choose one that 1786 character is not in the desired set, choose one that
1760 is, from the equivalence set. It doesn't much matter 1787 is, from the equivalence set. It doesn't much matter
1761 which, since we're building our own cheesy equivalence 1788 which, since we're building our own cheesy equivalence
1763 table directly. 1790 table directly.
1764 1791
1765 We can get here if search_buffer has worked out that 1792 We can get here if search_buffer has worked out that
1766 the buffer is entirely single width. */ 1793 the buffer is entirely single width. */
1767 Ichar starting_ch = ch; 1794 Ichar starting_ch = ch;
1795 int count = 0;
1768 do 1796 do
1769 { 1797 {
1770 ch = TRANSLATE (inverse_trt, ch); 1798 ch = TRANSLATE (inverse_trt, ch);
1771 if (charset_base == (ch & ~ICHAR_FIELD3_MASK)) 1799 if (charset_base == (ch & ~ICHAR_FIELD3_MASK))
1772 break; 1800 break;
1773 1801 ++count;
1774 } while (starting_ch != ch); 1802 } while (starting_ch != ch);
1775 1803
1776 /* If starting_ch is equal to ch, the case table is 1804 /* If starting_ch is equal to ch (and count is not one,
1777 corrupt. (Any mapping in the canon table should be 1805 which means no translation is necessary), the case
1778 reflected in the equivalence table, and we know from 1806 table is corrupt. (Any mapping in the canon table
1779 the canon table that untranslated maps to starting_ch 1807 should be reflected in the equivalence table, and we
1780 and that untranslated has the correct value for 1808 know from the canon table that untranslated maps to
1781 charset_base.) */ 1809 starting_ch and that untranslated has the correct value
1782 assert (starting_ch != ch); 1810 for charset_base.) */
1811 assert (1 == count || starting_ch != ch);
1783 } 1812 }
1784 } 1813 }
1785 else 1814 else
1786 { 1815 {
1787 ch = *ptr; 1816 ch = *ptr;
3318 */ ); 3347 */ );
3319 warn_about_possibly_incompatible_back_references = 1; 3348 warn_about_possibly_incompatible_back_references = 1;
3320 3349
3321 Vskip_chars_range_table = Fmake_range_table (Qstart_closed_end_closed); 3350 Vskip_chars_range_table = Fmake_range_table (Qstart_closed_end_closed);
3322 staticpro (&Vskip_chars_range_table); 3351 staticpro (&Vskip_chars_range_table);
3323 } 3352 #ifdef DEBUG_XEMACS
3353 DEFSYMBOL (Qsearch_algorithm_used);
3354 DEFSYMBOL (Qboyer_moore);
3355 DEFSYMBOL (Qsimple_search);
3356
3357 DEFVAR_INT ("debug-xemacs-searches", &debug_xemacs_searches /*
3358 If non-zero, bind `search-algorithm-used' to `boyer-moore' or `simple-search',
3359 depending on the algorithm used for each search. Used for testing.
3360 */ );
3361 debug_xemacs_searches = 0;
3362 #endif
3363 }