Mercurial > hg > xemacs-beta
diff src/regex.c @ 5653:3df910176b6a
Support predefined character classes in #'skip-chars-{forward,backward}, too
src/ChangeLog addition:
2012-05-04 Aidan Kehoe <kehoea@parhasard.net>
* regex.c:
Move various #defines and enums to regex.h, since we need them
when implementing #'skip-chars-{backward,forward}.
* regex.c (re_wctype):
* regex.c (re_iswctype):
Be more robust about case insensitivity here.
* regex.c (regex_compile):
* regex.h:
* regex.h (RE_ISWCTYPE_ARG_DECL):
* regex.h (CHAR_CLASS_MAX_LENGTH):
* search.c (skip_chars):
Implement support for the predefined character classes in this
function.
tests/ChangeLog addition:
2012-05-04 Aidan Kehoe <kehoea@parhasard.net>
* automated/regexp-tests.el (equal):
* automated/regexp-tests.el (Assert-char-class):
Correct a stray parenthesis; add tests for the predefined
character classes with #'skip-chars-{forward,backward}; update the
tests to reflect some changed design decisions on my part.
man/ChangeLog addition:
2012-05-04 Aidan Kehoe <kehoea@parhasard.net>
* lispref/searching.texi (Regular Expressions):
* lispref/searching.texi (Syntax of Regexps):
* lispref/searching.texi (Char Classes):
* lispref/searching.texi (Regexp Example):
Document the predefined character classes in this file.
author | Aidan Kehoe <kehoea@parhasard.net> |
---|---|
date | Fri, 04 May 2012 21:12:02 +0100 |
parents | 3f4a234f4672 |
children | 8a2ac78cb97d |
line wrap: on
line diff
--- a/src/regex.c Wed Apr 25 20:25:33 2012 +0100 +++ b/src/regex.c Fri May 04 21:12:02 2012 +0100 @@ -178,51 +178,7 @@ /* isalpha etc. are used for the character classes. */ #include <ctype.h> -#ifdef emacs - -/* 1 if C is an ASCII character. */ -#define ISASCII(c) ((c) < 0x80) - -/* 1 if C is a unibyte character. */ -#define ISUNIBYTE(c) 0 - -/* The Emacs definitions should not be directly affected by locales. */ - -/* In Emacs, these are only used for single-byte characters. */ -#define ISDIGIT(c) ((c) >= '0' && (c) <= '9') -#define ISCNTRL(c) ((c) < ' ') -#define ISXDIGIT(c) (ISDIGIT (c) || ((c) >= 'a' && (c) <= 'f') \ - || ((c) >= 'A' && (c) <= 'F')) - -/* This is only used for single-byte characters. */ -#define ISBLANK(c) ((c) == ' ' || (c) == '\t') - -/* The rest must handle multibyte characters. */ - -#define ISGRAPH(c) ((c) > ' ' && (c) != 0x7f) -#define ISPRINT(c) ((c) == ' ' || ISGRAPH (c)) -#define ISALPHA(c) (ISASCII (c) ? (((c) >= 'a' && (c) <= 'z') \ - || ((c) >= 'A' && (c) <= 'Z')) \ - : ISWORD (c)) -#define ISALNUM(c) (ISALPHA (c) || ISDIGIT (c)) - -#define ISLOWER(c) LOWERCASEP (lispbuf, c) - -#define ISPUNCT(c) (ISASCII (c) \ - ? ((c) > ' ' && (c) < 0x7F \ - && !(((c) >= 'a' && (c) <= 'z') \ - || ((c) >= 'A' && (c) <= 'Z') \ - || ((c) >= '0' && (c) <= '9'))) \ - : !ISWORD (c)) - -#define ISSPACE(c) \ - (SYNTAX (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf), c) == Swhitespace) - -#define ISUPPER(c) UPPERCASEP (lispbuf, c) - -#define ISWORD(c) (SYNTAX (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf), c) == Sword) - -#else /* not emacs */ +#ifndef emacs /* For the emacs build, we need these in the header. */ /* 1 if C is an ASCII character. */ #define ISASCII(c) ((c) < 0200) @@ -2013,23 +1969,6 @@ /* The next available element. */ #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) -/* Bits used to implement the multibyte-part of the various character - classes such as [:alnum:] in a charset's range table. XEmacs; use an - enum, so they're visible in the debugger. */ -enum -{ - BIT_WORD = (1 << 0), - BIT_LOWER = (1 << 1), - BIT_PUNCT = (1 << 2), - BIT_SPACE = (1 << 3), - BIT_UPPER = (1 << 4), - /* XEmacs; we need this, because we unify treatment of ASCII and non-ASCII - (possible matches) in charset_mule. [:alpha:] matches all characters - with word syntax, with the exception of [0-9]. We don't need - BIT_MULTIBYTE. */ - BIT_ALPHA = (1 << 5) -}; - /* Set the bit for character C in a bit vector. */ #define SET_LIST_BIT(c) \ (buf_end[((unsigned char) (c)) / BYTEWIDTH] \ @@ -2059,10 +1998,8 @@ } \ } -#define CHAR_CLASS_MAX_LENGTH 9 /* Namely, `multibyte'. */ - /* Map a string to the char class it names (if any). */ -static re_wctype_t +re_wctype_t re_wctype (const char *string) { if (STREQ (string, "alnum")) return RECC_ALNUM; @@ -2086,17 +2023,10 @@ } /* True if CH is in the char class CC. */ -static re_bool -re_iswctype (int ch, re_wctype_t cc) +int +re_iswctype (int ch, re_wctype_t cc + RE_ISWCTYPE_ARG_DECL) { -#ifdef emacs - /* This is cheesy, lispbuf isn't available to us when compiling the - pattern. It's effectively only called (on Mule builds) when the current - buffer doesn't matter (e.g. for RECC_ASCII, RECC_CNTRL), so it's not a - big deal. */ - struct buffer *lispbuf = current_buffer; -#endif - switch (cc) { case RECC_ALNUM: return ISALNUM (ch) != 0; @@ -2105,11 +2035,20 @@ case RECC_CNTRL: return ISCNTRL (ch) != 0; case RECC_DIGIT: return ISDIGIT (ch) != 0; case RECC_GRAPH: return ISGRAPH (ch) != 0; - case RECC_LOWER: return ISLOWER (ch) != 0; case RECC_PRINT: return ISPRINT (ch) != 0; case RECC_PUNCT: return ISPUNCT (ch) != 0; case RECC_SPACE: return ISSPACE (ch) != 0; +#ifdef emacs + case RECC_UPPER: + return NILP (lispbuf->case_fold_search) ? ISUPPER (ch) != 0 + : !NOCASEP (lispbuf, ch); + case RECC_LOWER: + return NILP (lispbuf->case_fold_search) ? ISLOWER (ch) != 0 + : !NOCASEP (lispbuf, ch); +#else case RECC_UPPER: return ISUPPER (ch) != 0; + case RECC_LOWER: return ISLOWER (ch) != 0; +#endif case RECC_XDIGIT: return ISXDIGIT (ch) != 0; case RECC_ASCII: return ISASCII (ch) != 0; case RECC_NONASCII: case RECC_MULTIBYTE: return !ISASCII (ch); @@ -2140,6 +2079,10 @@ } } +#endif /* MULE */ + +#ifdef emacs + /* Return a bit-pattern to use in the range-table bits to match multibyte chars of class CC. */ static unsigned char @@ -2158,7 +2101,8 @@ case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL: case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0; default: - abort (); + ABORT (); + return 0; } } @@ -2185,9 +2129,12 @@ RE_TRANSLATE_TYPE translate, reg_syntax_t syntax, Lisp_Object rtab); -static reg_errcode_t compile_char_class (re_wctype_t cc, Lisp_Object rtab, - Bitbyte *flags_out); #endif /* MULE */ +#ifdef emacs +reg_errcode_t compile_char_class (re_wctype_t cc, Lisp_Object rtab, + Bitbyte *flags_out); +#endif + static re_bool group_match_null_string_p (unsigned char **p, unsigned char *end, register_info_type *reg_info); @@ -2814,7 +2761,8 @@ #endif /* MULE */ for (ch = 0; ch < (1 << BYTEWIDTH); ++ch) { - if (re_iswctype (ch, cc)) + if (re_iswctype (ch, cc + RE_ISWCTYPE_ARG (current_buffer))) { SET_LIST_BIT (ch); } @@ -3938,7 +3886,11 @@ return REG_NOERROR; } -static reg_errcode_t +#endif /* MULE */ + +#ifdef emacs + +reg_errcode_t compile_char_class (re_wctype_t cc, Lisp_Object rtab, Bitbyte *flags_out) { *flags_out |= re_wctype_to_bit (cc);