Mercurial > hg > xemacs-beta
diff tests/automated/regexp-tests.el @ 5648:3f4a234f4672
Support non-ASCII correctly in character classes, test this.
src/ChangeLog addition:
2012-04-21 Aidan Kehoe <kehoea@parhasard.net>
Support non-ASCII correctly in character classes ([:alnum:] and
friends).
* regex.c:
* regex.c (ISBLANK, ISUNIBYTE): New. Make these and friends
independent of the locale, since we want them to be consistent in
XEmacs.
* regex.c (print_partial_compiled_pattern): Print the flags for
charset_mule; don't print non-ASCII characters literally as the
values in ranges, since this breaks with locales.
* regex.c (enum):
Define various flags the charset_mule and charset_mule_not opcodes
can now take.
* regex.c (CHAR_CLASS_MAX_LENGTH): Update this.
* regex.c (re_iswctype, re_wctype): New, from GNU.
* regex.c (re_wctype_can_match_non_ascii): New; used when deciding
on whether to use charset_mule or the ASCII-only regex character
set opcode.
* regex.c (regex_compile):
Error correctly on long, non-existent character class names.
Break out the handling of charsets that can match non-ASCII into a
separate clause. Use compile_char_class when compiling character
classes.
* regex.c (compile_char_class): New. Used in regex_compile when
compiling character sets that may match non-ASCII.
* regex.c (re_compile_fastmap):
If there are flags set for charset_mule or charset_mule_not, we
can't use the fastmap (since we need to check syntax table values
that aren't available there).
* regex.c (re_match_2_internal):
Check the new flags passed to the charset_mule{,_not} opcodes, and
observe them if appropriate.
* regex.h:
* regex.h (enum):
Expose re_wctype_t here, imported from GNU.
tests/ChangeLog addition:
2012-04-21 Aidan Kehoe <kehoea@parhasard.net>
* automated/regexp-tests.el:
* automated/regexp-tests.el (Assert-char-class):
Check that #'string-match errors correctly with an over-long
character class name.
Add tests for character class functionality that supports
non-ASCII characters. These tests expose bugs in GNU Emacs
24.0.94.2, but pass under current XEmacs.
author | Aidan Kehoe <kehoea@parhasard.net> |
---|---|
date | Sat, 21 Apr 2012 18:58:28 +0100 |
parents | 1d9f603e9125 |
children | d026b665014f |
line wrap: on
line diff
--- a/tests/automated/regexp-tests.el Sat Apr 21 09:41:27 2012 +0100 +++ b/tests/automated/regexp-tests.el Sat Apr 21 18:58:28 2012 +0100 @@ -598,6 +598,14 @@ (Assert (eql (string-match "[\x7f\x81-\x9f]" "\x81") 0)) ;; Test character classes + +;; This used not to error: +(Check-Error-Message invalid-regexp "Invalid character class name" + (string-match "[[:alnum12345:]]" "a")) +;; This alwayed errored, as long as character classes were turned on +(Check-Error-Message invalid-regexp "Invalid character class name" + (string-match "[[:alnum1234:]]" "a")) + (macrolet ((Assert-char-class (class matching-char non-matching-char) (if (and (not (featurep 'mule)) @@ -648,7 +656,21 @@ (Assert (null (string-match ,(concat "[^" class (string non-matching-char) "]") ,(concat (string matching-char) - (string non-matching-char)))))))) + (string non-matching-char))))))) + (Assert-never-matching (class &rest characters) + (cons + 'progn + (mapcan #'(lambda (character) + (if (or (not (eq 'decode-char (car-safe character))) + (featurep 'mule)) + `((Assert (null (string-match + ,(concat "[" class "]") + ,(string (eval character))))) + (Assert (eql (string-match + ,(concat "[^" class "]") + ,(string (eval character))) + 0))))) + characters)))) (Assert-char-class "[:alpha:]" ?a ?0) (Assert-char-class "[:alpha:]" ?z ?9) (Assert-char-class "[:alpha:]" ?A ?0) @@ -657,6 +679,18 @@ (Assert-char-class "[:alpha:]" ?c ?\x09) (Assert-char-class "[:alpha:]" ?d ?\ ) (Assert-char-class "[:alpha:]" ?e ?\x7f) + (Assert-char-class + "[:alpha:]" + (decode-char 'ucs #x0430) ;; CYRILLIC SMALL LETTER A + (decode-char 'ucs #x2116)) ;; NUMERO SIGN + (Assert-char-class + "[:alpha:]" + (decode-char 'ucs #x0410) ;; CYRILLIC CAPITAL LETTER A + ?\x02) + (Assert-char-class + "[:alpha:]" + (decode-char 'ucs #x03B2) ;; GREEK SMALL LETTER BETA + (decode-char 'ucs #x0385)) ;; GREEK DIALYTIKA TONOS (Assert-char-class "[:alnum:]" ?a ?.) 
(Assert-char-class "[:alnum:]" ?z ?') @@ -664,11 +698,46 @@ (Assert-char-class "[:alnum:]" ?Z ?!) (Assert-char-class "[:alnum:]" ?0 ?,) (Assert-char-class "[:alnum:]" ?9 ?$) - (Assert-char-class "[:alnum:]" ?b ?\x00) (Assert-char-class "[:alnum:]" ?c ?\x09) (Assert-char-class "[:alnum:]" ?d ?\ ) (Assert-char-class "[:alnum:]" ?e ?\x7f) + (Assert-char-class + "[:alnum:]" + (decode-char 'ucs #x0430) ;; CYRILLIC SMALL LETTER A + (decode-char 'ucs #x2116)) ;; NUMERO SIGN + (Assert-char-class + "[:alnum:]" + (decode-char 'ucs #x0410) ;; CYRILLIC CAPITAL LETTER A + ?\x02) + (Assert-char-class + "[:alnum:]" + (decode-char 'ucs #x03B2) ;; GREEK SMALL LETTER BETA + (decode-char 'ucs #x0385)) ;; GREEK DIALYTIKA TONOS + + ;; Word is equivalent to alnum in this implementation. + (Assert-char-class "[:word:]" ?a ?.) + (Assert-char-class "[:word:]" ?z ?') + (Assert-char-class "[:word:]" ?A ?/) + (Assert-char-class "[:word:]" ?Z ?!) + (Assert-char-class "[:word:]" ?0 ?,) + (Assert-char-class "[:word:]" ?9 ?$) + (Assert-char-class "[:word:]" ?b ?\x00) + (Assert-char-class "[:word:]" ?c ?\x09) + (Assert-char-class "[:word:]" ?d ?\ ) + (Assert-char-class "[:word:]" ?e ?\x7f) + (Assert-char-class + "[:word:]" + (decode-char 'ucs #x0430) ;; CYRILLIC SMALL LETTER A + (decode-char 'ucs #x2116)) ;; NUMERO SIGN + (Assert-char-class + "[:word:]" + (decode-char 'ucs #x0410) ;; CYRILLIC CAPITAL LETTER A + ?\x02) + (Assert-char-class + "[:word:]" + (decode-char 'ucs #x03B2) ;; GREEK SMALL LETTER BETA + (decode-char 'ucs #x0385)) ;; GREEK DIALYTIKA TONOS (let ((case-fold-search nil)) (Assert-char-class "[:upper:]" ?A ?a) @@ -679,6 +748,14 @@ (Assert-char-class "[:upper:]" ?E ?\x09) (Assert-char-class "[:upper:]" ?F ?\ ) (Assert-char-class "[:upper:]" ?G ?\x7f) + (Assert-char-class + "[:upper:]" + (decode-char 'ucs #x0410) ;; CYRILLIC CAPITAL LETTER A + (decode-char 'ucs #x0686)) ;; ARABIC LETTER TCHEH + (Assert-char-class + "[:upper:]" + (decode-char 'ucs #x0392) ;; GREEK CAPITAL LETTER BETA + 
(decode-char 'ucs #x5357)) ;; kDefinition south; southern part; southward (Assert-char-class "[:lower:]" ?a ?A) (Assert-char-class "[:lower:]" ?z ?Z) @@ -687,11 +764,17 @@ (Assert-char-class "[:lower:]" ?d ?\x00) (Assert-char-class "[:lower:]" ?e ?\x09) (Assert-char-class "[:lower:]" ?f ? ) - (Assert-char-class "[:lower:]" ?g ?\x7f)) + (Assert-char-class "[:lower:]" ?g ?\x7f) + (Assert-char-class + "[:lower:]" + (decode-char 'ucs #x0430) ;; CYRILLIC SMALL LETTER A + (decode-char 'ucs #x0686)) ;; ARABIC LETTER TCHEH + (Assert-char-class + "[:lower:]" + (decode-char 'ucs #x03B2) ;; GREEK SMALL LETTER BETA + (decode-char 'ucs #x5357)));; kDefinition south; southern part; southward (let ((case-fold-search t)) - ;; These currently fail, because we don't take into account the buffer's - ;; case table. (Assert-char-class "[:upper:]" ?a ?\x00) (Assert-char-class "[:upper:]" ?z ?\x01) (Assert-char-class "[:upper:]" ?b ?{) @@ -700,7 +783,14 @@ (Assert-char-class "[:upper:]" ?e ?>) (Assert-char-class "[:upper:]" ?f ?\ ) (Assert-char-class "[:upper:]" ?g ?\x7f) - + (Assert-char-class + "[:upper:]" + (decode-char 'ucs #x0430) ;; CYRILLIC SMALL LETTER A + (decode-char 'ucs #x0686)) ;; ARABIC LETTER TCHEH + (Assert-char-class + "[:upper:]" + (decode-char 'ucs #x03B2) ;; GREEK SMALL LETTER BETA + (decode-char 'ucs #x5357)) ;; kDefinition south; southern part; southward (Assert-char-class "[:lower:]" ?A ?\x00) (Assert-char-class "[:lower:]" ?Z ?\x01) (Assert-char-class "[:lower:]" ?B ?{) @@ -708,7 +798,15 @@ (Assert-char-class "[:lower:]" ?D ?<) (Assert-char-class "[:lower:]" ?E ?>) (Assert-char-class "[:lower:]" ?F ?\ ) - (Assert-char-class "[:lower:]" ?G ?\x7F)) + (Assert-char-class "[:lower:]" ?G ?\x7F) + (Assert-char-class + "[:lower:]" + (decode-char 'ucs #x0410) ;; CYRILLIC CAPITAL LETTER A + (decode-char 'ucs #x0686)) ;; ARABIC LETTER TCHEH + (Assert-char-class + "[:lower:]" + (decode-char 'ucs #x0392) ;; GREEK CAPITAL LETTER BETA + (decode-char 'ucs #x5357)));; kDefinition 
south; southern part; southward (Assert-char-class "[:digit:]" ?0 ?a) (Assert-char-class "[:digit:]" ?9 ?z) @@ -718,6 +816,30 @@ (Assert-char-class "[:digit:]" ?4 ?\x09) (Assert-char-class "[:digit:]" ?5 ? ) (Assert-char-class "[:digit:]" ?6 ?\x7f) + (Assert-char-class + "[:digit:]" ?7 + (decode-char 'ucs #x0385)) ;; GREEK DIALYTIKA TONOS + (Assert-char-class + "[:digit:]" ?8 + (decode-char 'ucs #x0392)) ;; GREEK CAPITAL LETTER BETA + (Assert-char-class + "[:digit:]" ?9 + (decode-char 'ucs #x03B2)) ;; GREEK SMALL LETTER BETA + (Assert-char-class + "[:digit:]" ?0 + (decode-char 'ucs #x0410)) ;; CYRILLIC CAPITAL LETTER A + (Assert-char-class + "[:digit:]" ?1 + (decode-char 'ucs #x0430)) ;; CYRILLIC SMALL LETTER A + (Assert-char-class + "[:digit:]" ?2 + (decode-char 'ucs #x0686)) ;; ARABIC LETTER TCHEH + (Assert-char-class + "[:digit:]" ?3 + (decode-char 'ucs #x2116)) ;; NUMERO SIGN + (Assert-char-class + "[:digit:]" ?4 + (decode-char 'ucs #x5357)) ;; kDefinition south; southern part; southward (Assert-char-class "[:xdigit:]" ?0 ?g) (Assert-char-class "[:xdigit:]" ?9 ?G) @@ -729,6 +851,30 @@ (Assert-char-class "[:xdigit:]" ?4 ?\x09) (Assert-char-class "[:xdigit:]" ?5 ?\x7f) (Assert-char-class "[:xdigit:]" ?6 ?z) + (Assert-char-class + "[:xdigit:]" ?7 + (decode-char 'ucs #x0385)) ;; GREEK DIALYTIKA TONOS + (Assert-char-class + "[:xdigit:]" ?8 + (decode-char 'ucs #x0392)) ;; GREEK CAPITAL LETTER BETA + (Assert-char-class + "[:xdigit:]" ?9 + (decode-char 'ucs #x03B2)) ;; GREEK SMALL LETTER BETA + (Assert-char-class + "[:xdigit:]" ?a + (decode-char 'ucs #x0410)) ;; CYRILLIC CAPITAL LETTER A + (Assert-char-class + "[:xdigit:]" ?B + (decode-char 'ucs #x0430)) ;; CYRILLIC SMALL LETTER A + (Assert-char-class + "[:xdigit:]" ?c + (decode-char 'ucs #x0686)) ;; ARABIC LETTER TCHEH + (Assert-char-class + "[:xdigit:]" ?D + (decode-char 'ucs #x2116)) ;; NUMERO SIGN + (Assert-char-class + "[:xdigit:]" ?e + (decode-char 'ucs #x5357)) ;; kDefinition south; southern part; southward 
(Assert-char-class "[:space:]" ?\ ?0) (Assert-char-class "[:space:]" ?\t ?9) @@ -738,6 +884,30 @@ (Assert-char-class "[:space:]" ?\ ?\x7f) (Assert-char-class "[:space:]" ?\t ?a) (Assert-char-class "[:space:]" ?\ ?z) + (Assert-char-class + "[:space:]" ?\ + (decode-char 'ucs #x0385)) ;; GREEK DIALYTIKA TONOS + (Assert-char-class + "[:space:]" ?\t + (decode-char 'ucs #x0392)) ;; GREEK CAPITAL LETTER BETA + (Assert-char-class + "[:space:]" ?\ + (decode-char 'ucs #x03B2)) ;; GREEK SMALL LETTER BETA + (Assert-char-class + "[:space:]" ?\t + (decode-char 'ucs #x0410)) ;; CYRILLIC CAPITAL LETTER A + (Assert-char-class + "[:space:]" ?\ + (decode-char 'ucs #x0430)) ;; CYRILLIC SMALL LETTER A + (Assert-char-class + "[:space:]" ?\t + (decode-char 'ucs #x0686)) ;; ARABIC LETTER TCHEH + (Assert-char-class + "[:space:]" ?\ + (decode-char 'ucs #x2116)) ;; NUMERO SIGN + (Assert-char-class + "[:space:]" ?\t + (decode-char 'ucs #x5357)) ;; kDefinition south; southern part; southward (Assert-char-class "[:print:]" ?\ ?\x00) (Assert-char-class "[:print:]" ?0 ?\x09) @@ -747,6 +917,63 @@ (Assert-char-class "[:print:]" ?B ?\t) (Assert-char-class "[:print:]" ?a ?\x03) (Assert-char-class "[:print:]" ?z ?\x04) + (Assert-char-class + "[:print:]" (decode-char 'ucs #x0385) ;; GREEK DIALYTIKA TONOS + ?\x05) + (Assert-char-class + "[:print:]" (decode-char 'ucs #x0392) ;; GREEK CAPITAL LETTER BETA + ?\x06) + (Assert-char-class + "[:print:]" (decode-char 'ucs #x03B2) ;; GREEK SMALL LETTER BETA + ?\x07) + (Assert-char-class + "[:print:]" (decode-char 'ucs #x0410) ;; CYRILLIC CAPITAL LETTER A + ?\x08) + (Assert-char-class + "[:print:]" (decode-char 'ucs #x0430) ;; CYRILLIC SMALL LETTER A + ?\x09) + (Assert-char-class + "[:print:]" (decode-char 'ucs #x0686) ;; ARABIC LETTER TCHEH + ?\x0a) + (Assert-char-class + "[:print:]" (decode-char 'ucs #x2116) ;; NUMERO SIGN + ?\x0b) + (Assert-char-class + "[:print:]" (decode-char 'ucs #x5357) ;; kDefinition south; southern part; southward + ?\x0c) + + 
(Assert-char-class "[:graph:]" ?! ?\ ) + (Assert-char-class "[:graph:]" ?0 ?\x09) + (Assert-char-class "[:graph:]" ?9 ?\x7f) + (Assert-char-class "[:graph:]" ?A ?\x01) + (Assert-char-class "[:graph:]" ?Z ?\x02) + (Assert-char-class "[:graph:]" ?B ?\t) + (Assert-char-class "[:graph:]" ?a ?\x03) + (Assert-char-class "[:graph:]" ?z ?\x04) + (Assert-char-class + "[:graph:]" (decode-char 'ucs #x0385) ;; GREEK DIALYTIKA TONOS + ?\x05) + (Assert-char-class + "[:graph:]" (decode-char 'ucs #x0392) ;; GREEK CAPITAL LETTER BETA + ?\x06) + (Assert-char-class + "[:graph:]" (decode-char 'ucs #x03B2) ;; GREEK SMALL LETTER BETA + ?\x07) + (Assert-char-class + "[:graph:]" (decode-char 'ucs #x0410) ;; CYRILLIC CAPITAL LETTER A + ?\x08) + (Assert-char-class + "[:graph:]" (decode-char 'ucs #x0430) ;; CYRILLIC SMALL LETTER A + ?\x09) + (Assert-char-class + "[:graph:]" (decode-char 'ucs #x0686) ;; ARABIC LETTER TCHEH + ?\x0a) + (Assert-char-class + "[:graph:]" (decode-char 'ucs #x2116) ;; NUMERO SIGN + ?\x0b) + (Assert-char-class + "[:graph:]" (decode-char 'ucs #x5357) ;; kDefinition south; southern part; southward + ?\x0c) (Assert-char-class "[:punct:]" ?\( ?0) (Assert-char-class "[:punct:]" ?. ?9) @@ -757,4 +984,102 @@ (Assert-char-class "[:punct:]" ?< ?\x09) (Assert-char-class "[:punct:]" ?> ?\x7f) (Assert-char-class "[:punct:]" ?= ?a) - (Assert-char-class "[:punct:]" ?\? ?z)) + (Assert-char-class "[:punct:]" ?\? 
?z) + (Assert-char-class + "[:punct:]" + (decode-char 'ucs #x0385) ;; GREEK DIALYTIKA TONOS + ?a) + (Assert-char-class + "[:punct:]" + (decode-char 'ucs #x20af) ;; DRACHMA SIGN + (decode-char 'ucs #x0392)) ;; GREEK CAPITAL LETTER BETA + (Assert-char-class + "[:punct:]" + (decode-char 'ucs #x00a7) ;; SECTION SIGN + (decode-char 'ucs #x03B2)) ;; GREEK SMALL LETTER BETA + (Assert-char-class + "[:punct:]" + (decode-char 'ucs #x00a8) ;; DIAERESIS + (decode-char 'ucs #x0410)) ;; CYRILLIC CAPITAL LETTER A + (Assert-char-class + "[:punct:]" + (decode-char 'ucs #x0384) ;; GREEK TONOS + (decode-char 'ucs #x0430)) ;; CYRILLIC SMALL LETTER A + (Assert-char-class + "[:punct:]" + (decode-char 'ucs #x00b7) ;; MIDDLE DOT + (decode-char 'ucs #x0686)) ;; ARABIC LETTER TCHEH + (Assert-char-class + "[:punct:]" + (decode-char 'ucs #x2116) ;; NUMERO SIGN + ?x) + (Assert-char-class + "[:punct:]" + ?= + (decode-char 'ucs #x5357)) ;; kDefinition south; southern part; southward + + (Assert-char-class "[:ascii:]" ?a (decode-char 'ucs #x00a7)) ;; SECTION SIGN + (Assert-char-class "[:ascii:]" ?b (decode-char 'ucs #x00a8)) ;; DIAERESIS + (Assert-char-class "[:ascii:]" ?c (decode-char 'ucs #x00b7)) ;; MIDDLE DOT + (Assert-char-class "[:ascii:]" ?d (decode-char 'ucs #x0384)) ;; GREEK TONOS + (Assert-char-class + "[:ascii:]" ?\x00 (decode-char 'ucs #x0392)) ;; GREEK CAPITAL LETTER BETA + (Assert-char-class + "[:ascii:]" ?\x01 (decode-char 'ucs #x03B2)) ;; GREEK SMALL LETTER BETA + (Assert-char-class + "[:ascii:]" ?\t (decode-char 'ucs #x0410)) ;; CYRILLIC CAPITAL LETTER A + (Assert-char-class + "[:ascii:]" ?A (decode-char 'ucs #x0430)) ;; CYRILLIC SMALL LETTER A + (Assert-char-class + "[:ascii:]" ?B (decode-char 'ucs #x0686)) ;; ARABIC LETTER TCHEH + (Assert-char-class + "[:ascii:]" ?C (decode-char 'ucs #x20af)) ;; DRACHMA SIGN + (Assert-char-class + "[:ascii:]" ?\x7f (decode-char 'ucs #x2116)) ;; NUMERO SIGN + + (Assert-char-class + "[:nonascii:]" (decode-char 'ucs #x00a7) ?a) ;; SECTION SIGN + 
(Assert-char-class + "[:nonascii:]" (decode-char 'ucs #x00a8) ?b) ;; DIAERESIS + (Assert-char-class + "[:nonascii:]" (decode-char 'ucs #x00b7) ?c) ;; MIDDLE DOT + (Assert-char-class + "[:nonascii:]" (decode-char 'ucs #x0384) ?d) ;; GREEK TONOS + (Assert-char-class + "[:nonascii:]" (decode-char 'ucs #x0392) ?\x00) ;; GREEK CAPITAL LETTER BETA + (Assert-char-class + "[:nonascii:]" (decode-char 'ucs #x03B2) ?\x01) ;; GREEK SMALL LETTER BETA + (Assert-char-class + "[:nonascii:]" (decode-char 'ucs #x0410) ?\t) ;; CYRILLIC CAPITAL LETTER A + (Assert-char-class + "[:nonascii:]" (decode-char 'ucs #x0430) ?A) ;; CYRILLIC SMALL LETTER A + (Assert-char-class + "[:nonascii:]" (decode-char 'ucs #x0686) ?B) ;; ARABIC LETTER TCHEH + (Assert-char-class + "[:nonascii:]" (decode-char 'ucs #x20af) ?C) ;; DRACHMA SIGN + (Assert-char-class + "[:nonascii:]" (decode-char 'ucs #x2116) ?\x7f) ;; NUMERO SIGN + + (Assert-char-class + "[:multibyte:]" (decode-char 'ucs #x00a7) ?a) ;; SECTION SIGN + (Assert-char-class + "[:multibyte:]" (decode-char 'ucs #x00a8) ?b) ;; DIAERESIS + (Assert-char-class + "[:multibyte:]" (decode-char 'ucs #x00b7) ?c) ;; MIDDLE DOT + (Assert-char-class + "[:multibyte:]" (decode-char 'ucs #x0384) ?d) ;; GREEK TONOS + (Assert-char-class + "[:multibyte:]" (decode-char 'ucs #x0392) + ?\x00) ;; GREEK CAPITAL LETTER BETA + + (Assert-never-matching + "[:unibyte:]" + ?\x01 ?\t ?A ?B ?C ?\x7f + (decode-char 'ucs #x03B2) ;; GREEK SMALL LETTER BETA + (decode-char 'ucs #x0410) ;; CYRILLIC CAPITAL LETTER A + (decode-char 'ucs #x0430) ;; CYRILLIC SMALL LETTER A + (decode-char 'ucs #x0686) ;; ARABIC LETTER TCHEH + (decode-char 'ucs #x20af) ;; DRACHMA SIGN + (decode-char 'ucs #x2116) ;; NUMERO SIGN + (decode-char 'ucs #x5357))) ;; kDefinition south; southern part; southward +