# HG changeset patch
# User Aidan Kehoe <kehoea@parhasard.net>
# Date 1335031108 -3600
# Node ID 3f4a234f4672ab40f61811656bc674bcd80664db
# Parent  1d9f603e9125575ac67f9cff0f2159a046d99d3e
Support non-ASCII correctly in character classes, test this.

src/ChangeLog addition:

2012-04-21  Aidan Kehoe  <kehoea@parhasard.net>

	Support non-ASCII correctly in character classes ([:alnum:] and
	friends).

	* regex.c:
	* regex.c (ISBLANK, ISUNIBYTE): New. Make these and friends
	independent of the locale, since we want them to be consistent in
	XEmacs.
	* regex.c (print_partial_compiled_pattern): Print the flags for
	charset_mule; don't print non-ASCII as the character values in
	ranges, this breaks with locales.
	* regex.c (enum):
	Define various flags the charset_mule and charset_mule_not opcodes
	can now take.
	* regex.c (CHAR_CLASS_MAX_LENGTH): Update this.
	* regex.c (re_iswctype, re_wctype): New, from GNU.
	* regex.c (re_wctype_can_match_non_ascii): New; used when deciding
	on whether to use charset_mule or the ASCII-only regex character
	set opcode.
	* regex.c (regex_compile):
	Error correctly on long, non-existent character class names.
	Break out the handling of charsets that can match non-ASCII into a
	separate clause. Use compile_char_class when compiling character
	classes.
	* regex.c (compile_char_class): New. Used in regex_compile when
	compiling character sets that may match non-ASCII.
	* regex.c (re_compile_fastmap):
	If there are flags set for charset_mule or charset_mule_not, we
	can't use the fastmap (since we need to check syntax table values
	that aren't available there).
	* regex.c (re_match_2_internal):
	Check the new flags passed to the charset_mule{,_not} opcode,
	observe them if appropriate.
	* regex.h:
	* regex.h (enum):
	Expose re_wctype_t here, imported from GNU.

tests/ChangeLog addition:

2012-04-21  Aidan Kehoe  <kehoea@parhasard.net>

	* automated/regexp-tests.el:
	* automated/regexp-tests.el (Assert-char-class):
	Check that #'string-match errors correctly with an over-long
	character class name.
	Add tests for character class functionality that supports
	non-ASCII characters. These tests expose bugs in GNU Emacs
	24.0.94.2, but pass under current XEmacs.

diff -r 1d9f603e9125 -r 3f4a234f4672 src/ChangeLog
--- a/src/ChangeLog	Sat Apr 21 09:41:27 2012 +0100
+++ b/src/ChangeLog	Sat Apr 21 18:58:28 2012 +0100
@@ -1,3 +1,41 @@
+2012-04-21  Aidan Kehoe  <kehoea@parhasard.net>
+
+	Support non-ASCII correctly in character classes ([:alnum:] and
+	friends).
+
+	* regex.c:
+	* regex.c (ISBLANK, ISUNIBYTE): New. Make these and friends
+	independent of the locale, since we want them to be consistent in
+	XEmacs.
+	* regex.c (print_partial_compiled_pattern): Print the flags for
+	charset_mule; don't print non-ASCII as the character values in
+	ranges, this breaks with locales.
+	* regex.c (enum):
+	Define various flags the charset_mule and charset_mule_not opcodes
+	can now take.
+	* regex.c (CHAR_CLASS_MAX_LENGTH): Update this.
+	* regex.c (re_iswctype, re_wctype): New, from GNU.
+	* regex.c (re_wctype_can_match_non_ascii): New; used when deciding
+	on whether to use charset_mule or the ASCII-only regex character
+	set opcode.
+	* regex.c (regex_compile):
+	Error correctly on long, non-existent character class names.
+	Break out the handling of charsets that can match non-ASCII into a
+	separate clause. Use compile_char_class when compiling character
+	classes.
+	* regex.c (compile_char_class): New. Used in regex_compile when
+	compiling character sets that may match non-ASCII.
+	* regex.c (re_compile_fastmap):
+	If there are flags set for charset_mule or charset_mule_not, we
+	can't use the fastmap (since we need to check syntax table values
+	that aren't available there).
+	* regex.c (re_match_2_internal):
+	Check the new flags passed to the charset_mule{,_not} opcode,
+	observe them if appropriate.
+	* regex.h:
+	* regex.h (enum):
+	Expose re_wctype_t here, imported from GNU.
+
 2012-04-21  Aidan Kehoe  <kehoea@parhasard.net>
 
 	* regex.h (RE_SYNTAX_EMACS):
diff -r 1d9f603e9125 -r 3f4a234f4672 src/regex.c
--- a/src/regex.c	Sat Apr 21 09:41:27 2012 +0100
+++ b/src/regex.c	Sat Apr 21 18:58:28 2012 +0100
@@ -178,53 +178,91 @@
 /* isalpha etc. are used for the character classes.  */
 #include <ctype.h>
 
-/* Jim Meyering writes:
-
-   "... Some ctype macros are valid only for character codes that
-   isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
-   using /bin/cc or gcc but without giving an ansi option).  So, all
-   ctype uses should be through macros like ISPRINT...  If
-   STDC_HEADERS is defined, then autoconf has verified that the ctype
-   macros don't need to be guarded with references to isascii. ...
-   Defining isascii to 1 should let any compiler worth its salt
-   eliminate the && through constant folding."  */
-
-#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
-#define ISASCII_1(c) 1
-#else
-#define ISASCII_1(c) isascii(c)
-#endif
-
-#ifdef MULE
-/* The IS*() macros can be passed any character, including an extended
-   one.  We need to make sure there are no crashes, which would occur
-   otherwise due to out-of-bounds array references. */
-#define ISASCII(c) (((EMACS_UINT) (c)) < 0x100 && ISASCII_1 (c))
-#else
-#define ISASCII(c) ISASCII_1 (c)
-#endif /* MULE */
+#ifdef emacs
+
+/* 1 if C is an ASCII character.  */
+#define ISASCII(c) ((c) < 0x80)
+
+/* 1 if C is a unibyte character.  */
+#define ISUNIBYTE(c) 0
+
+/* The Emacs definitions should not be directly affected by locales.  */
+
+/* In Emacs, these are only used for single-byte characters.  */
+#define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
+#define ISCNTRL(c) ((c) < ' ')
+#define ISXDIGIT(c) (ISDIGIT (c) || ((c) >= 'a' && (c) <= 'f')	\
+		     || ((c) >= 'A' && (c) <= 'F'))
+
+/* This is only used for single-byte characters.  */
+#define ISBLANK(c) ((c) == ' ' || (c) == '\t')
+
+/* The rest must handle multibyte characters.  */
+
+#define ISGRAPH(c) ((c) > ' ' && (c) != 0x7f)
+#define ISPRINT(c) ((c) == ' ' || ISGRAPH (c))
+#define ISALPHA(c) (ISASCII (c) ? (((c) >= 'a' && (c) <= 'z')		\
+				   || ((c) >= 'A' && (c) <= 'Z'))	\
+		    : ISWORD (c))
+#define ISALNUM(c) (ISALPHA (c) || ISDIGIT (c))
+
+#define ISLOWER(c) LOWERCASEP (lispbuf, c)
+
+#define ISPUNCT(c) (ISASCII (c)                                 \
+		    ? ((c) > ' ' && (c) < 0x7F			\
+		       && !(((c) >= 'a' && (c) <= 'z')		\
+		            || ((c) >= 'A' && (c) <= 'Z')	\
+		            || ((c) >= '0' && (c) <= '9')))	\
+		    : !ISWORD (c))
+
+#define ISSPACE(c) \
+	(SYNTAX (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf), c) == Swhitespace)
+
+#define ISUPPER(c) UPPERCASEP (lispbuf, c)
+
+#define ISWORD(c) (SYNTAX (BUFFER_MIRROR_SYNTAX_TABLE (lispbuf), c) == Sword)
+
+#else /* not emacs */
+
+/* 1 if C is an ASCII character.  */
+#define ISASCII(c) ((c) < 0200)
+
+/* 1 if C is a unibyte character.  */
+#define ISUNIBYTE(c) 0
 
 #ifdef isblank
-#define ISBLANK(c) (ISASCII (c) && isblank (c))
+# define ISBLANK(c) isblank (c)
 #else
-#define ISBLANK(c) ((c) == ' ' || (c) == '\t')
+# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
 #endif
 #ifdef isgraph
-#define ISGRAPH(c) (ISASCII (c) && isgraph (c))
+# define ISGRAPH(c) isgraph (c)
 #else
-#define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
+# define ISGRAPH(c) (isprint (c) && !isspace (c))
 #endif
 
-#define ISPRINT(c) (ISASCII (c) && isprint (c))
-#define ISDIGIT(c) (ISASCII (c) && isdigit (c))
-#define ISALNUM(c) (ISASCII (c) && isalnum (c))
-#define ISALPHA(c) (ISASCII (c) && isalpha (c))
-#define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
-#define ISLOWER(c) (ISASCII (c) && islower (c))
-#define ISPUNCT(c) (ISASCII (c) && ispunct (c))
-#define ISSPACE(c) (ISASCII (c) && isspace (c))
-#define ISUPPER(c) (ISASCII (c) && isupper (c))
-#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
+/* Solaris defines ISPRINT so we must undefine it first.  */
+#undef ISPRINT
+#define ISPRINT(c) isprint (c)
+#define ISDIGIT(c) isdigit (c)
+#define ISALNUM(c) isalnum (c)
+#define ISALPHA(c) isalpha (c)
+#define ISCNTRL(c) iscntrl (c)
+#define ISLOWER(c) islower (c)
+#define ISPUNCT(c) ispunct (c)
+#define ISSPACE(c) isspace (c)
+#define ISUPPER(c) isupper (c)
+#define ISXDIGIT(c) isxdigit (c)
+
+#define ISWORD(c) ISALPHA (c)
+
+#ifdef _tolower
+# define TOLOWER(c) _tolower (c)
+#else
+# define TOLOWER(c) tolower (c)
+#endif
+
+#endif /* emacs */
 
 #ifndef NULL
 #define NULL (void *)0
@@ -913,6 +951,7 @@
 
 	    printf ("/charset_mule [%s",
 	            (re_opcode_t) *(p - 1) == charset_mule_not ? "^" : "");
+	    printf (" flags: 0x%02x ", *p++);
 	    nentries = unified_range_table_nentries (p);
 	    for (i = 0; i < nentries; i++)
 	      {
@@ -921,14 +960,14 @@
 
 		unified_range_table_get_range (p, i, &first, &last,
 					       &dummy_val);
-		if (first < 0x100)
+		if (first < 0x80)
 		  putchar (first);
 		else
 		  printf ("(0x%lx)", (long)first);
 		if (first != last)
 		  {
 		    putchar ('-');
-		    if (last < 0x100)
+		    if (last < 0x80)
 		      putchar (last);
 		    else
 		      printf ("(0x%lx)", (long)last);
@@ -1974,6 +2013,22 @@
 /* The next available element.  */
 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
 
+/* Bits used to implement the multibyte-part of the various character
+   classes such as [:alnum:] in a charset's range table. XEmacs; use an
+   enum, so they're visible in the debugger. */
+enum
+{
+  BIT_WORD = (1 << 0),
+  BIT_LOWER = (1 << 1),
+  BIT_PUNCT = (1 << 2),
+  BIT_SPACE = (1 << 3),
+  BIT_UPPER = (1 << 4),
+  /* XEmacs; we need this, because we unify treatment of ASCII and non-ASCII
+     (possible matches) in charset_mule. [:alpha:] matches all characters
+     with word syntax, with the exception of [0-9]. We don't need
+     BIT_MULTIBYTE. */
+  BIT_ALPHA = (1 << 5)
+};
 
 /* Set the bit for character C in a bit vector.  */
 #define SET_LIST_BIT(c)				\
@@ -1985,22 +2040,8 @@
 /* Set the "bit" for character C in a range table. */
 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt)
 
-/* Set the "bit" for character c in the appropriate table. */
-#define SET_EITHER_BIT(c)			\
-  do {						\
-    if (has_extended_chars)			\
-      SET_RANGETAB_BIT (c);			\
-    else					\
-      SET_LIST_BIT (c);				\
-  } while (0)
-
-#else /* not MULE */
-
-#define SET_EITHER_BIT(c) SET_LIST_BIT (c)
-
 #endif
 
-
 /* Get the next unsigned number in the uncompiled pattern.  */
 #define GET_UNSIGNED_NUMBER(num) 					\
   { if (p != pend)							\
@@ -2018,15 +2059,110 @@
        } 								\
     }
 
-#define CHAR_CLASS_MAX_LENGTH  6 /* Namely, `xdigit'.  */
-
-#define IS_CHAR_CLASS(string)						\
-   (STREQ (string, "alpha") || STREQ (string, "upper")			\
-    || STREQ (string, "lower") || STREQ (string, "digit")		\
-    || STREQ (string, "alnum") || STREQ (string, "xdigit")		\
-    || STREQ (string, "space") || STREQ (string, "print")		\
-    || STREQ (string, "punct") || STREQ (string, "graph")		\
-    || STREQ (string, "cntrl") || STREQ (string, "blank"))
+#define CHAR_CLASS_MAX_LENGTH  9 /* Namely, `multibyte'.  */
+
+/* Map STRING to the char class it names, or RECC_ERROR if it names none.  */
+static re_wctype_t
+re_wctype (const char *string)
+{
+  if      (STREQ (string, "alnum"))	return RECC_ALNUM;
+  else if (STREQ (string, "alpha"))	return RECC_ALPHA;
+  else if (STREQ (string, "word"))	return RECC_WORD;
+  else if (STREQ (string, "ascii"))	return RECC_ASCII;
+  else if (STREQ (string, "nonascii"))	return RECC_NONASCII;
+  else if (STREQ (string, "graph"))	return RECC_GRAPH;
+  else if (STREQ (string, "lower"))	return RECC_LOWER;
+  else if (STREQ (string, "print"))	return RECC_PRINT;
+  else if (STREQ (string, "punct"))	return RECC_PUNCT;
+  else if (STREQ (string, "space"))	return RECC_SPACE;
+  else if (STREQ (string, "upper"))	return RECC_UPPER;
+  else if (STREQ (string, "unibyte"))	return RECC_UNIBYTE;
+  else if (STREQ (string, "multibyte"))	return RECC_MULTIBYTE;
+  else if (STREQ (string, "digit"))	return RECC_DIGIT;
+  else if (STREQ (string, "xdigit"))	return RECC_XDIGIT;
+  else if (STREQ (string, "cntrl"))	return RECC_CNTRL;
+  else if (STREQ (string, "blank"))	return RECC_BLANK;
+  else return RECC_ERROR;
+}
+
+/* True if CH is in the char class CC.  */
+static re_bool
+re_iswctype (int ch, re_wctype_t cc)
+{
+#ifdef emacs
+  /* This is cheesy, lispbuf isn't available to us when compiling the
+     pattern. It's effectively only called (on Mule builds) when the current
+     buffer doesn't matter (e.g. for RECC_ASCII, RECC_CNTRL), so it's not a
+     big deal. */
+  struct buffer *lispbuf = current_buffer;
+#endif
+
+  switch (cc)
+    {
+    case RECC_ALNUM: return ISALNUM (ch) != 0;
+    case RECC_ALPHA: return ISALPHA (ch) != 0;
+    case RECC_BLANK: return ISBLANK (ch) != 0;
+    case RECC_CNTRL: return ISCNTRL (ch) != 0;
+    case RECC_DIGIT: return ISDIGIT (ch) != 0;
+    case RECC_GRAPH: return ISGRAPH (ch) != 0;
+    case RECC_LOWER: return ISLOWER (ch) != 0;
+    case RECC_PRINT: return ISPRINT (ch) != 0;
+    case RECC_PUNCT: return ISPUNCT (ch) != 0;
+    case RECC_SPACE: return ISSPACE (ch) != 0;
+    case RECC_UPPER: return ISUPPER (ch) != 0;
+    case RECC_XDIGIT: return ISXDIGIT (ch) != 0;
+    case RECC_ASCII: return ISASCII (ch) != 0;
+    case RECC_NONASCII: case RECC_MULTIBYTE: return !ISASCII (ch);
+    case RECC_UNIBYTE: return ISUNIBYTE (ch) != 0;
+    case RECC_WORD: return ISWORD (ch) != 0;
+    case RECC_ERROR: return false;
+    default:
+      abort ();
+    }
+}
+
+#ifdef MULE
+/* False if CC can never match a non-ASCII character, true otherwise.  */
+static re_bool
+re_wctype_can_match_non_ascii (re_wctype_t cc)
+{
+  switch (cc)
+    {
+    case RECC_ASCII:
+    case RECC_UNIBYTE:
+    case RECC_CNTRL:
+    case RECC_DIGIT:
+    case RECC_XDIGIT:
+    case RECC_BLANK:
+      return false;
+    default:
+      return true;
+    }
+}
+
+/* Return a bit-pattern to use in the range-table bits to match multibyte
+   chars of class CC.  */
+static unsigned char
+re_wctype_to_bit (re_wctype_t cc)
+{
+  switch (cc)
+    {
+    case RECC_PRINT: case RECC_GRAPH:
+    case RECC_ALPHA: return BIT_ALPHA;
+    case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
+    case RECC_LOWER: return BIT_LOWER;
+    case RECC_UPPER: return BIT_UPPER;
+    case RECC_PUNCT: return BIT_PUNCT;
+    case RECC_SPACE: return BIT_SPACE;
+    case RECC_MULTIBYTE: case RECC_NONASCII: 
+    case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
+    case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
+    default:
+      abort ();
+    }
+}
+
+#endif /* MULE */
 
 static void store_op1 (re_opcode_t op, unsigned char *loc, int arg);
 static void store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
@@ -2049,6 +2185,8 @@
 					     RE_TRANSLATE_TYPE translate,
 					     reg_syntax_t syntax,
 					     Lisp_Object rtab);
+static reg_errcode_t compile_char_class (re_wctype_t cc, Lisp_Object rtab,
+                                         Bitbyte *flags_out);
 #endif /* MULE */
 static re_bool group_match_null_string_p (unsigned char **p,
 					  unsigned char *end,
@@ -2512,15 +2650,20 @@
           BUF_PUSH (anychar);
           break;
 
+#ifdef MULE
+/* Restart compilation of the current charset in range-table form
+   (charset_mule{,_not}) if CH is non-ASCII.  */
+#define MAYBE_START_OVER_WITH_EXTENDED(ch)				\
+  do { if ((ch) >= 0x80) goto start_over_with_extended; } while (0)
+#else
+/* Non-MULE builds have no charset_mule path to jump to; expand to nothing.  */
+#define MAYBE_START_OVER_WITH_EXTENDED(ch)
+#endif
 
         case '[':
           {
 	    /* XEmacs change: this whole section */
             re_bool had_char_class = false;
-#ifdef MULE
-	    re_bool has_extended_chars = false;
-	    REGISTER Lisp_Object rtab = Qnil;
-#endif
 
             if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
 
@@ -2550,29 +2693,6 @@
                 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
               SET_LIST_BIT ('\n');
 
-#ifdef MULE
-	  start_over_with_extended:
-	    if (has_extended_chars)
-	      {
-		/* There are extended chars here, which means we need to start
-		   over and shift to unified range-table format. */
-		if (buf_end[-2] == charset)
-		  buf_end[-2] = charset_mule;
-		else
-		  buf_end[-2] = charset_mule_not;
-		buf_end--;
-		p = p1; /* go back to the beginning of the charset, after
-			   a possible ^. */
-		rtab = Vthe_lisp_rangetab;
-		Fclear_range_table (rtab);
-
-		/* charset_not matches newline according to a syntax bit.  */
-		if ((re_opcode_t) buf_end[-1] == charset_mule_not
-		    && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
-		  SET_EITHER_BIT ('\n');
-	      }
-#endif /* MULE */
-
             /* Read in characters and ranges, setting map bits.  */
             for (;;)
               {
@@ -2580,32 +2700,22 @@
 
                 PATFETCH (c);
 
-#ifdef MULE
-		if (c >= 0x80 && !has_extended_chars)
-		  {
-		    has_extended_chars = 1;
-		    /* Frumble-bumble, we've found some extended chars.
-		       Need to start over, process everything using
-		       the general extended-char mechanism, and need
-		       to use charset_mule and charset_mule_not instead
-		       of charset and charset_not. */
-		    goto start_over_with_extended;
-		  }
-#endif /* MULE */
+		/* Frumble-bumble, we may have found some extended chars.
+		   Need to start over, process everything using the general
+		   extended-char mechanism, and need to use charset_mule and
+		   charset_mule_not instead of charset and charset_not. */
+		MAYBE_START_OVER_WITH_EXTENDED (c);
+
                 /* \ might escape characters inside [...] and [^...].  */
                 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
                   {
                     if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
 
                     PATFETCH (c1);
-#ifdef MULE
-		    if (c1 >= 0x80 && !has_extended_chars)
-		      {
-		        has_extended_chars = 1;
-		        goto start_over_with_extended;
-                      }
-#endif /* MULE */
-                    SET_EITHER_BIT (c1);
+
+		    MAYBE_START_OVER_WITH_EXTENDED (c1);
+
+                    SET_LIST_BIT (c1);
                     continue;
                   }
 
@@ -2631,18 +2741,11 @@
                   {
                     reg_errcode_t ret;
 
-#ifdef MULE
-		    if (* (unsigned char *) p >= 0x80 && !has_extended_chars)
-		      {
-		        has_extended_chars = 1;
-		        goto start_over_with_extended;
-                      }
-                    if (has_extended_chars)
-		      ret = compile_extended_range (&p, pend, translate,
-						    syntax, rtab);
-		    else
-#endif /* MULE */
-		      ret = compile_range (&p, pend, translate, syntax, buf_end);
+		    MAYBE_START_OVER_WITH_EXTENDED (*(unsigned char *)p);
+
+		    ret = compile_range (&p, pend, translate, syntax,
+					 buf_end);
+
                     if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
                   }
 
@@ -2653,18 +2756,177 @@
 		    /* Move past the `-'.  */
                     PATFETCH (c1);
 
+		    MAYBE_START_OVER_WITH_EXTENDED (*(unsigned char *)p);
+
+		    ret = compile_range (&p, pend, translate, syntax, buf_end);
+
+                    if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
+                  }
+
+                /* See if we're at the beginning of a possible character
+                   class.  */
+
+                else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
+                  { /* Leave room for the null.  */
+                    char str[CHAR_CLASS_MAX_LENGTH + 1];
+                    int ch = 0;
+
+                    PATFETCH (c);
+                    c1 = 0;
+
+                    /* If pattern is `[[:'.  */
+                    if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+
+                    for (;;)
+                      {
+		        PATFETCH (c);
+		        if ((c == ':' && *p == ']') || p == pend)
+		          break;
+			if (c1 < CHAR_CLASS_MAX_LENGTH)
+			  str[c1++] = c;
+			else
+			  /* This is in any case an invalid class name.  */
+			  str[0] = '\0';
+                      }
+                    str[c1] = '\0';
+
+                    /* If isn't a word bracketed by `[:' and `:]':
+                       undo the ending character, the letters, and leave
+                       the leading `:' and `[' (but set bits for them).  */
+                    if (c == ':' && *p == ']')
+                      {
+			re_wctype_t cc = re_wctype (str);
+
+			if (cc == RECC_ERROR)
+			  FREE_STACK_RETURN (REG_ECTYPE);
+
+                        /* Throw away the ] at the end of the character
+                           class.  */
+                        PATFETCH (c);
+
+                        if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+
 #ifdef MULE
-		    if (* (unsigned char *) p >= 0x80 && !has_extended_chars)
-		      {
-		        has_extended_chars = 1;
-		        goto start_over_with_extended;
+			if (re_wctype_can_match_non_ascii (cc))
+			  {
+			    goto start_over_with_extended;
+			  }
+#endif /* MULE */
+			for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
+			  {
+			    if (re_iswctype (ch, cc))
+			      {
+				SET_LIST_BIT (ch);
+			      }
+			  }
+
+                        had_char_class = true;
+                      }
+                    else
+                      {
+                        c1++;
+                        while (c1--)
+                          PATUNFETCH;
+                        SET_LIST_BIT ('[');
+                        SET_LIST_BIT (':');
+                        had_char_class = false;
                       }
-                    if (has_extended_chars)
-		      ret = compile_extended_range (&p, pend, translate,
-						    syntax, rtab);
-		    else
-#endif /* MULE */
-		      ret = compile_range (&p, pend, translate, syntax, buf_end);
+                  }
+                else
+                  {
+                    had_char_class = false;
+                    SET_LIST_BIT (c);
+                  }
+              }
+
+            /* Discard any (non)matching list bytes that are all 0 at the
+               end of the map.  Decrease the map-length byte too.  */
+            while ((int) buf_end[-1] > 0 && buf_end[buf_end[-1] - 1] == 0)
+              buf_end[-1]--;
+            buf_end += buf_end[-1];
+	  }
+	  break;
+
+#ifdef MULE
+        start_over_with_extended:
+          {
+            REGISTER Lisp_Object rtab = Qnil;
+            Bitbyte flags = 0;
+            int bytes_needed = sizeof (flags);
+            re_bool had_char_class = false;
+
+            /* There are extended chars here, which means we need to use the
+               unified range-table format. */
+            if (buf_end[-2] == charset)
+              buf_end[-2] = charset_mule;
+            else
+              buf_end[-2] = charset_mule_not;
+            buf_end--;
+            p = p1; /* go back to the beginning of the charset, after
+                       a possible ^. */
+            rtab = Vthe_lisp_rangetab;
+            Fclear_range_table (rtab);
+
+            /* charset_not matches newline according to a syntax bit.  */
+            if ((re_opcode_t) buf_end[-1] == charset_mule_not
+                && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
+              SET_RANGETAB_BIT ('\n');
+
+            /* Read in characters and ranges, setting map bits.  */
+            for (;;)
+              {
+                if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+
+                PATFETCH (c);
+
+                /* \ might escape characters inside [...] and [^...].  */
+                if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
+                  {
+                    if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
+
+                    PATFETCH (c1);
+
+                    SET_RANGETAB_BIT (c1);
+                    continue;
+                  }
+
+                /* Could be the end of the bracket expression.  If it's
+                   not (i.e., when the bracket expression is `[]' so
+                   far), the ']' character bit gets set way below.  */
+                if (c == ']' && p != p1 + 1)
+                  break;
+
+                /* Look ahead to see if it's a range when the last thing
+                   was a character class.  */
+                if (had_char_class && c == '-' && *p != ']')
+                  FREE_STACK_RETURN (REG_ERANGE);
+
+                /* Look ahead to see if it's a range when the last thing
+                   was a character: if this is a hyphen not at the
+                   beginning or the end of a list, then it's the range
+                   operator.  */
+                if (c == '-'
+                    && !(p - 2 >= pattern && p[-2] == '[')
+                    && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
+                    && *p != ']')
+                  {
+                    reg_errcode_t ret;
+
+                    ret = compile_extended_range (&p, pend, translate, syntax,
+                                                  rtab);
+
+                    if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
+                  }
+
+                else if (p[0] == '-' && p[1] != ']')
+                  { /* This handles ranges made up of characters only.  */
+                    reg_errcode_t ret;
+
+                    /* Move past the `-'.  */
+                    PATFETCH (c1);
+                    
+                    ret = compile_extended_range (&p, pend, translate,
+                                                  syntax, rtab);
                     if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
                   }
 
@@ -2683,14 +2945,14 @@
 
                     for (;;)
                       {
-			/* #### This code is unused.
-			   Correctness is not checked after TRT
-			   table change.  */
                         PATFETCH (c);
-                        if (c == ':' || c == ']' || p == pend
-                            || c1 == CHAR_CLASS_MAX_LENGTH)
+                        if ((c == ':' && *p == ']') || p == pend)
                           break;
-                        str[c1++] = (char) c;
+                        if (c1 < CHAR_CLASS_MAX_LENGTH)
+                          str[c1++] = c;
+                        else
+                          /* This is in any case an invalid class name.  */
+                          str[0] = '\0';
                       }
                     str[c1] = '\0';
 
@@ -2699,22 +2961,11 @@
                        the leading `:' and `[' (but set bits for them).  */
                     if (c == ':' && *p == ']')
                       {
-                        int ch;
-                        re_bool is_alnum = STREQ (str, "alnum");
-                        re_bool is_alpha = STREQ (str, "alpha");
-                        re_bool is_blank = STREQ (str, "blank");
-                        re_bool is_cntrl = STREQ (str, "cntrl");
-                        re_bool is_digit = STREQ (str, "digit");
-                        re_bool is_graph = STREQ (str, "graph");
-                        re_bool is_lower = STREQ (str, "lower");
-                        re_bool is_print = STREQ (str, "print");
-                        re_bool is_punct = STREQ (str, "punct");
-                        re_bool is_space = STREQ (str, "space");
-                        re_bool is_upper = STREQ (str, "upper");
-                        re_bool is_xdigit = STREQ (str, "xdigit");
-
-                        if (!IS_CHAR_CLASS (str))
-			  FREE_STACK_RETURN (REG_ECTYPE);
+                        re_wctype_t cc = re_wctype (str);
+                        reg_errcode_t ret = REG_NOERROR;
+
+                        if (cc == RECC_ERROR)
+                          FREE_STACK_RETURN (REG_ECTYPE);
 
                         /* Throw away the ] at the end of the character
                            class.  */
@@ -2722,26 +2973,10 @@
 
                         if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
 
-                        for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
-                          {
-			    /* This was split into 3 if's to
-			       avoid an arbitrary limit in some compiler.  */
-                            if (   (is_alnum  && ISALNUM (ch))
-                                || (is_alpha  && ISALPHA (ch))
-                                || (is_blank  && ISBLANK (ch))
-                                || (is_cntrl  && ISCNTRL (ch)))
-			      SET_EITHER_BIT (ch);
-			    if (   (is_digit  && ISDIGIT (ch))
-                                || (is_graph  && ISGRAPH (ch))
-                                || (is_lower  && ISLOWER (ch))
-                                || (is_print  && ISPRINT (ch)))
-			      SET_EITHER_BIT (ch);
-			    if (   (is_punct  && ISPUNCT (ch))
-                                || (is_space  && ISSPACE (ch))
-                                || (is_upper  && ISUPPER (ch))
-                                || (is_xdigit && ISXDIGIT (ch)))
-			      SET_EITHER_BIT (ch);
-                          }
+                        ret = compile_char_class (cc, rtab, &flags);
+
+                        if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
+
                         had_char_class = true;
                       }
                     else
@@ -2749,38 +2984,26 @@
                         c1++;
                         while (c1--)
                           PATUNFETCH;
-                        SET_EITHER_BIT ('[');
-                        SET_EITHER_BIT (':');
+                        SET_RANGETAB_BIT ('[');
+                        SET_RANGETAB_BIT (':');
                         had_char_class = false;
                       }
                   }
                 else
                   {
                     had_char_class = false;
-                    SET_EITHER_BIT (c);
+                    SET_RANGETAB_BIT (c);
                   }
               }
 
-#ifdef MULE
-	    if (has_extended_chars)
-	      {
-		/* We have a range table, not a bit vector. */
-		int bytes_needed =
-		  unified_range_table_bytes_needed (rtab);
-		GET_BUFFER_SPACE (bytes_needed);
-		unified_range_table_copy_data (rtab, buf_end);
-		buf_end += unified_range_table_bytes_used (buf_end);
-		break;
-	      }
+            bytes_needed += unified_range_table_bytes_needed (rtab);
+            GET_BUFFER_SPACE (bytes_needed);
+            *buf_end++ = flags;
+            unified_range_table_copy_data (rtab, buf_end);
+            buf_end += unified_range_table_bytes_used (buf_end);
+            break;
+          }
 #endif /* MULE */
-            /* Discard any (non)matching list bytes that are all 0 at the
-               end of the map.  Decrease the map-length byte too.  */
-            while ((int) buf_end[-1] > 0 && buf_end[buf_end[-1] - 1] == 0)
-              buf_end[-1]--;
-            buf_end += buf_end[-1];
-	  }
-	  break;
-
 
 	case '(':
           if (syntax & RE_NO_BK_PARENS)
@@ -3715,6 +3938,69 @@
   return REG_NOERROR;
 }
 
+/* Add class CC to a charset_mule{,_not} set: OR its class bit (if any) into
+   *FLAGS_OUT and put its fixed ranges (if any) into RTAB.  */
+static reg_errcode_t
+compile_char_class (re_wctype_t cc, Lisp_Object rtab, Bitbyte *flags_out)
+{
+  *flags_out |= re_wctype_to_bit (cc);
+  switch (cc)
+    {
+    case RECC_ASCII:
+      put_range_table (rtab, 0, 0x7f, Qt);
+      break;
+
+    case RECC_XDIGIT:
+      put_range_table (rtab, 'a', 'f', Qt);
+      put_range_table (rtab, 'A', 'F', Qt);
+      /* fallthrough */
+    case RECC_DIGIT:
+      put_range_table (rtab, '0', '9', Qt);
+      break;
+
+    case RECC_BLANK:
+      put_range_table (rtab, ' ', ' ', Qt);
+      put_range_table (rtab, '\t', '\t', Qt);
+      break;
+
+    case RECC_PRINT:
+      put_range_table (rtab, ' ', 0x7e, Qt);
+      put_range_table (rtab, 0x80, MOST_POSITIVE_FIXNUM, Qt);
+      break;
+
+    case RECC_GRAPH:
+      put_range_table (rtab, '!', 0x7e, Qt);
+      put_range_table (rtab, 0x80, MOST_POSITIVE_FIXNUM, Qt);
+      break;
+
+    case RECC_NONASCII:
+    case RECC_MULTIBYTE:
+      put_range_table (rtab, 0x80, MOST_POSITIVE_FIXNUM, Qt);
+      break;
+
+    case RECC_CNTRL:
+      put_range_table (rtab, 0x00, 0x1f, Qt);
+      break;
+
+    case RECC_UNIBYTE:
+      /* Never true in XEmacs. */
+      break;
+
+      /* The following all have their own bits in the class_bits argument to
+         charset_mule and charset_mule_not, they don't use the range table
+         information. */
+    case RECC_ALPHA:
+    case RECC_WORD:
+    case RECC_ALNUM: /* Equivalent to RECC_WORD */
+    case RECC_LOWER:
+    case RECC_PUNCT:
+    case RECC_SPACE:
+    case RECC_UPPER:
+      break;
+    }
+  return REG_NOERROR;
+}
+
 #endif /* MULE */
 
 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
@@ -3855,6 +4141,15 @@
 	  {
 	    int nentries;
 	    int i;
+	    Bitbyte flags = *p++;
+
+	    if (flags)
+	      {
+                /* We need to consult the syntax table, fastmap won't
+                   work. */
+                bufp->can_be_null = 1;
+                goto done;
+	      }
 
 	    nentries = unified_range_table_nentries (p);
 	    for (i = 0; i < nentries; i++)
@@ -3878,6 +4173,16 @@
 		    set_itext_ichar (strr, last);
 		    fastmap[*strr] = 1;
 		  }
+                else if (MOST_POSITIVE_FIXNUM == last)
+                  {
+		    /* This is RECC_MULTIBYTE or RECC_NONASCII; true for all
+                       non-ASCII characters. */
+		    jj = 0x80;
+		    while (jj < 0xA0)
+		      {
+			fastmap[jj++] = 1;
+		      }
+                  }
 	      }
 	  }
 	  break;
@@ -3887,6 +4192,15 @@
 	    int nentries;
 	    int i;
 	    int smallest_prev = 0;
+	    Bitbyte flags = *p++;
+
+	    if (flags)
+              {
+                /* We need to consult the syntax table, fastmap won't
+                   work. */
+                bufp->can_be_null = 1;
+                goto done;
+              }
 
 	    nentries = unified_range_table_nentries (p);
 	    for (i = 0; i < nentries; i++)
@@ -5416,15 +5730,27 @@
 	  {
 	    REGISTER Ichar c;
 	    re_bool not_p = (re_opcode_t) *(p - 1) == charset_mule_not;
+	    Bitbyte class_bits = *p++;
 
             DEBUG_MATCH_PRINT2 ("EXECUTING charset_mule%s.\n", not_p ? "_not" : "");
-
 	    REGEX_PREFETCH ();
 	    c = itext_ichar_fmt (d, fmt, lispobj);
 	    c = RE_TRANSLATE (c); /* The character to match.  */
 
-	    if (EQ (Qt, unified_range_table_lookup (p, c, Qnil)))
-	      not_p = !not_p;
+	    if ((class_bits &&
+		 ((class_bits & BIT_ALPHA && ISALPHA (c))
+		  || (class_bits & BIT_SPACE && ISSPACE (c))
+		  || (class_bits & BIT_PUNCT && ISPUNCT (c))
+                  || (class_bits & BIT_WORD && ISWORD (c))
+                  || (TRANSLATE_P (translate) ?
+                      (class_bits & (BIT_UPPER | BIT_LOWER)
+                       && !NOCASEP (lispbuf, c))
+                      : ((class_bits & BIT_UPPER && ISUPPER (c))
+                         || (class_bits & BIT_LOWER && ISLOWER (c))))))
+                || EQ (Qt, unified_range_table_lookup (p, c, Qnil)))
+	      {
+		not_p = !not_p;
+	      }
 
 	    p += unified_range_table_bytes_used (p);
 
diff -r 1d9f603e9125 -r 3f4a234f4672 src/regex.h
--- a/src/regex.h	Sat Apr 21 09:41:27 2012 +0100
+++ b/src/regex.h	Sat Apr 21 18:58:28 2012 +0100
@@ -546,6 +546,19 @@
 
 extern int debug_regexps;
 
+typedef enum /* Character classes ([:alnum:] etc.), imported from GNU. */
+  {
+    RECC_ERROR = 0, /* NOTE(review): presumably "no such class"; confirm. */
+    RECC_ALNUM, RECC_ALPHA, RECC_WORD,
+    RECC_GRAPH, RECC_PRINT,
+    RECC_LOWER, RECC_UPPER,
+    RECC_PUNCT, RECC_CNTRL,
+    RECC_DIGIT, RECC_XDIGIT,
+    RECC_BLANK, RECC_SPACE,
+    RECC_MULTIBYTE, RECC_NONASCII,
+    RECC_ASCII, RECC_UNIBYTE
+} re_wctype_t;
+
 END_C_DECLS
 
 #endif /* INCLUDED_regex_h_ */
diff -r 1d9f603e9125 -r 3f4a234f4672 tests/ChangeLog
--- a/tests/ChangeLog	Sat Apr 21 09:41:27 2012 +0100
+++ b/tests/ChangeLog	Sat Apr 21 18:58:28 2012 +0100
@@ -1,3 +1,13 @@
+2012-04-21  Aidan Kehoe  <kehoea@parhasard.net>
+
+	* automated/regexp-tests.el:
+	* automated/regexp-tests.el (Assert-char-class):
+	Check that #'string-match errors correctly with an over-long
+	character class name.
+	Add tests for character class functionality that supports
+	non-ASCII characters. These tests expose bugs in GNU Emacs
+	24.0.94.2, but pass under current XEmacs.
+
 2012-04-21  Aidan Kehoe  <kehoea@parhasard.net>
 
 	* automated/regexp-tests.el:
diff -r 1d9f603e9125 -r 3f4a234f4672 tests/automated/regexp-tests.el
--- a/tests/automated/regexp-tests.el	Sat Apr 21 09:41:27 2012 +0100
+++ b/tests/automated/regexp-tests.el	Sat Apr 21 18:58:28 2012 +0100
@@ -598,6 +598,14 @@
 (Assert (eql (string-match "[\x7f\x81-\x9f]" "\x81") 0))
 
 ;; Test character classes
+
+;; This used not to error:
+(Check-Error-Message invalid-regexp "Invalid character class name"
+                     (string-match "[[:alnum12345:]]" "a"))
+;; This always errored, as long as character classes were turned on
+(Check-Error-Message invalid-regexp "Invalid character class name"
+                     (string-match "[[:alnum1234:]]" "a"))
+
 (macrolet
     ((Assert-char-class (class matching-char non-matching-char)
        (if (and (not (featurep 'mule))
@@ -648,7 +656,21 @@
          (Assert (null (string-match ,(concat "[^" class
                                               (string non-matching-char) "]")
                                      ,(concat (string matching-char)
-                                              (string non-matching-char))))))))
+                                              (string non-matching-char)))))))
+     (Assert-never-matching (class &rest characters)
+       (cons
+        'progn
+        (mapcan #'(lambda (character)
+                    (if (or (not (eq 'decode-char (car-safe character)))
+                            (featurep 'mule))
+                        `((Assert (null (string-match
+                                         ,(concat "[" class "]")
+                                         ,(string (eval character)))))
+                          (Assert (eql (string-match
+                                        ,(concat "[^" class "]")
+                                        ,(string (eval character)))
+                                       0)))))
+                characters))))
   (Assert-char-class "[:alpha:]" ?a ?0)
   (Assert-char-class "[:alpha:]" ?z ?9)
   (Assert-char-class "[:alpha:]" ?A ?0)
@@ -657,6 +679,18 @@
   (Assert-char-class "[:alpha:]" ?c ?\x09)
   (Assert-char-class "[:alpha:]" ?d ?\ )
   (Assert-char-class "[:alpha:]" ?e ?\x7f)
+  (Assert-char-class
+   "[:alpha:]"
+   (decode-char 'ucs #x0430)  ;; CYRILLIC SMALL LETTER A
+   (decode-char 'ucs #x2116)) ;; NUMERO SIGN
+  (Assert-char-class
+   "[:alpha:]"
+   (decode-char 'ucs #x0410)  ;; CYRILLIC CAPITAL LETTER A
+   ?\x02)
+  (Assert-char-class
+   "[:alpha:]"
+   (decode-char 'ucs #x03B2)  ;; GREEK SMALL LETTER BETA
+   (decode-char 'ucs #x0385)) ;; GREEK DIALYTIKA TONOS
 
   (Assert-char-class "[:alnum:]" ?a ?.)
   (Assert-char-class "[:alnum:]" ?z ?')
@@ -664,11 +698,46 @@
   (Assert-char-class "[:alnum:]" ?Z ?!)
   (Assert-char-class "[:alnum:]" ?0 ?,)
   (Assert-char-class "[:alnum:]" ?9 ?$)
-
   (Assert-char-class "[:alnum:]" ?b ?\x00)
   (Assert-char-class "[:alnum:]" ?c ?\x09)
   (Assert-char-class "[:alnum:]" ?d ?\   )
   (Assert-char-class "[:alnum:]" ?e ?\x7f)
+  (Assert-char-class
+   "[:alnum:]"
+   (decode-char 'ucs #x0430)  ;; CYRILLIC SMALL LETTER A
+   (decode-char 'ucs #x2116)) ;; NUMERO SIGN
+  (Assert-char-class
+   "[:alnum:]"
+   (decode-char 'ucs #x0410)  ;; CYRILLIC CAPITAL LETTER A
+   ?\x02)
+  (Assert-char-class
+   "[:alnum:]"
+   (decode-char 'ucs #x03B2)  ;; GREEK SMALL LETTER BETA
+   (decode-char 'ucs #x0385)) ;; GREEK DIALYTIKA TONOS
+
+  ;; Word is equivalent to alnum in this implementation.
+  (Assert-char-class "[:word:]" ?a ?.)
+  (Assert-char-class "[:word:]" ?z ?')
+  (Assert-char-class "[:word:]" ?A ?/)
+  (Assert-char-class "[:word:]" ?Z ?!)
+  (Assert-char-class "[:word:]" ?0 ?,)
+  (Assert-char-class "[:word:]" ?9 ?$)
+  (Assert-char-class "[:word:]" ?b ?\x00)
+  (Assert-char-class "[:word:]" ?c ?\x09)
+  (Assert-char-class "[:word:]" ?d ?\   )
+  (Assert-char-class "[:word:]" ?e ?\x7f)
+  (Assert-char-class
+   "[:word:]"
+   (decode-char 'ucs #x0430)  ;; CYRILLIC SMALL LETTER A
+   (decode-char 'ucs #x2116)) ;; NUMERO SIGN
+  (Assert-char-class
+   "[:word:]"
+   (decode-char 'ucs #x0410)  ;; CYRILLIC CAPITAL LETTER A
+   ?\x02)
+  (Assert-char-class
+   "[:word:]"
+   (decode-char 'ucs #x03B2)  ;; GREEK SMALL LETTER BETA
+   (decode-char 'ucs #x0385)) ;; GREEK DIALYTIKA TONOS
 
   (let ((case-fold-search nil))
     (Assert-char-class "[:upper:]" ?A ?a)
@@ -679,6 +748,14 @@
     (Assert-char-class "[:upper:]" ?E ?\x09)
     (Assert-char-class "[:upper:]" ?F ?\ )
     (Assert-char-class "[:upper:]" ?G ?\x7f)
+    (Assert-char-class
+     "[:upper:]"
+     (decode-char 'ucs #x0410)  ;; CYRILLIC CAPITAL LETTER A
+     (decode-char 'ucs #x0686)) ;; ARABIC LETTER TCHEH
+    (Assert-char-class
+     "[:upper:]"
+     (decode-char 'ucs #x0392)  ;; GREEK CAPITAL LETTER BETA
+     (decode-char 'ucs #x5357)) ;; kDefinition south; southern part; southward
 
     (Assert-char-class "[:lower:]" ?a ?A)
     (Assert-char-class "[:lower:]" ?z ?Z)
@@ -687,11 +764,17 @@
     (Assert-char-class "[:lower:]" ?d ?\x00)
     (Assert-char-class "[:lower:]" ?e ?\x09)
     (Assert-char-class "[:lower:]" ?f ? )
-    (Assert-char-class "[:lower:]" ?g ?\x7f))
+    (Assert-char-class "[:lower:]" ?g ?\x7f)
+    (Assert-char-class
+     "[:lower:]"
+     (decode-char 'ucs #x0430)  ;; CYRILLIC SMALL LETTER A
+     (decode-char 'ucs #x0686)) ;; ARABIC LETTER TCHEH
+    (Assert-char-class
+     "[:lower:]"
+     (decode-char 'ucs #x03B2)  ;; GREEK SMALL LETTER BETA
+     (decode-char 'ucs #x5357)));; kDefinition south; southern part; southward
 
   (let ((case-fold-search t))
-    ;; These currently fail, because we don't take into account the buffer's
-    ;; case table.
     (Assert-char-class "[:upper:]" ?a ?\x00)
     (Assert-char-class "[:upper:]" ?z ?\x01)
     (Assert-char-class "[:upper:]" ?b ?{)
@@ -700,7 +783,14 @@
     (Assert-char-class "[:upper:]" ?e ?>)
     (Assert-char-class "[:upper:]" ?f ?\ )
     (Assert-char-class "[:upper:]" ?g ?\x7f)
-
+    (Assert-char-class
+     "[:upper:]"
+     (decode-char 'ucs #x0430)  ;; CYRILLIC SMALL LETTER A
+     (decode-char 'ucs #x0686)) ;; ARABIC LETTER TCHEH
+    (Assert-char-class
+     "[:upper:]"
+     (decode-char 'ucs #x03B2)  ;; GREEK SMALL LETTER BETA
+     (decode-char 'ucs #x5357)) ;; kDefinition south; southern part; southward
     (Assert-char-class "[:lower:]" ?A ?\x00)
     (Assert-char-class "[:lower:]" ?Z ?\x01)
     (Assert-char-class "[:lower:]" ?B ?{)
@@ -708,7 +798,15 @@
     (Assert-char-class "[:lower:]" ?D ?<)
     (Assert-char-class "[:lower:]" ?E ?>)
     (Assert-char-class "[:lower:]" ?F ?\ )
-    (Assert-char-class "[:lower:]" ?G ?\x7F))
+    (Assert-char-class "[:lower:]" ?G ?\x7F)
+    (Assert-char-class
+     "[:lower:]"
+     (decode-char 'ucs #x0410)  ;; CYRILLIC CAPITAL LETTER A
+     (decode-char 'ucs #x0686)) ;; ARABIC LETTER TCHEH
+    (Assert-char-class
+     "[:lower:]"
+     (decode-char 'ucs #x0392)  ;; GREEK CAPITAL LETTER BETA
+     (decode-char 'ucs #x5357)));; kDefinition south; southern part; southward
 
   (Assert-char-class "[:digit:]" ?0 ?a)
   (Assert-char-class "[:digit:]" ?9 ?z)
@@ -718,6 +816,30 @@
   (Assert-char-class "[:digit:]" ?4 ?\x09)
   (Assert-char-class "[:digit:]" ?5 ? )
   (Assert-char-class "[:digit:]" ?6 ?\x7f)
+  (Assert-char-class 
+   "[:digit:]" ?7
+   (decode-char 'ucs #x0385)) ;; GREEK DIALYTIKA TONOS
+  (Assert-char-class
+   "[:digit:]" ?8
+   (decode-char 'ucs #x0392)) ;; GREEK CAPITAL LETTER BETA
+  (Assert-char-class
+   "[:digit:]" ?9
+   (decode-char 'ucs #x03B2)) ;; GREEK SMALL LETTER BETA
+  (Assert-char-class
+   "[:digit:]" ?0
+   (decode-char 'ucs #x0410)) ;; CYRILLIC CAPITAL LETTER A
+  (Assert-char-class
+   "[:digit:]" ?1
+   (decode-char 'ucs #x0430)) ;; CYRILLIC SMALL LETTER A
+  (Assert-char-class
+   "[:digit:]" ?2
+   (decode-char 'ucs #x0686)) ;; ARABIC LETTER TCHEH
+  (Assert-char-class
+   "[:digit:]" ?3
+   (decode-char 'ucs #x2116)) ;; NUMERO SIGN
+  (Assert-char-class
+   "[:digit:]" ?4
+   (decode-char 'ucs #x5357)) ;; kDefinition south; southern part; southward
 
   (Assert-char-class "[:xdigit:]" ?0 ?g)
   (Assert-char-class "[:xdigit:]" ?9 ?G)
@@ -729,6 +851,30 @@
   (Assert-char-class "[:xdigit:]" ?4 ?\x09)
   (Assert-char-class "[:xdigit:]" ?5 ?\x7f)
   (Assert-char-class "[:xdigit:]" ?6 ?z)
+  (Assert-char-class 
+   "[:xdigit:]" ?7
+   (decode-char 'ucs #x0385)) ;; GREEK DIALYTIKA TONOS
+  (Assert-char-class
+   "[:xdigit:]" ?8
+   (decode-char 'ucs #x0392)) ;; GREEK CAPITAL LETTER BETA
+  (Assert-char-class
+   "[:xdigit:]" ?9
+   (decode-char 'ucs #x03B2)) ;; GREEK SMALL LETTER BETA
+  (Assert-char-class
+   "[:xdigit:]" ?a
+   (decode-char 'ucs #x0410)) ;; CYRILLIC CAPITAL LETTER A
+  (Assert-char-class
+   "[:xdigit:]" ?B
+   (decode-char 'ucs #x0430)) ;; CYRILLIC SMALL LETTER A
+  (Assert-char-class
+   "[:xdigit:]" ?c
+   (decode-char 'ucs #x0686)) ;; ARABIC LETTER TCHEH
+  (Assert-char-class
+   "[:xdigit:]" ?D
+   (decode-char 'ucs #x2116)) ;; NUMERO SIGN
+  (Assert-char-class
+   "[:xdigit:]" ?e
+   (decode-char 'ucs #x5357)) ;; kDefinition south; southern part; southward
 
   (Assert-char-class "[:space:]" ?\  ?0)
   (Assert-char-class "[:space:]" ?\t ?9)
@@ -738,6 +884,30 @@
   (Assert-char-class "[:space:]" ?\  ?\x7f)
   (Assert-char-class "[:space:]" ?\t ?a)
   (Assert-char-class "[:space:]" ?\  ?z)
+  (Assert-char-class 
+   "[:space:]" ?\ 
+   (decode-char 'ucs #x0385)) ;; GREEK DIALYTIKA TONOS
+  (Assert-char-class
+   "[:space:]" ?\t
+   (decode-char 'ucs #x0392)) ;; GREEK CAPITAL LETTER BETA
+  (Assert-char-class
+   "[:space:]" ?\ 
+   (decode-char 'ucs #x03B2)) ;; GREEK SMALL LETTER BETA
+  (Assert-char-class
+   "[:space:]" ?\t
+   (decode-char 'ucs #x0410)) ;; CYRILLIC CAPITAL LETTER A
+  (Assert-char-class
+   "[:space:]" ?\ 
+   (decode-char 'ucs #x0430)) ;; CYRILLIC SMALL LETTER A
+  (Assert-char-class
+   "[:space:]" ?\t
+   (decode-char 'ucs #x0686)) ;; ARABIC LETTER TCHEH
+  (Assert-char-class
+   "[:space:]" ?\ 
+   (decode-char 'ucs #x2116)) ;; NUMERO SIGN
+  (Assert-char-class
+   "[:space:]" ?\t
+   (decode-char 'ucs #x5357)) ;; kDefinition south; southern part; southward
 
   (Assert-char-class "[:print:]" ?\  ?\x00)
   (Assert-char-class "[:print:]" ?0 ?\x09)
@@ -747,6 +917,63 @@
   (Assert-char-class "[:print:]" ?B ?\t)
   (Assert-char-class "[:print:]" ?a ?\x03)
   (Assert-char-class "[:print:]" ?z ?\x04)
+  (Assert-char-class 
+   "[:print:]" (decode-char 'ucs #x0385) ;; GREEK DIALYTIKA TONOS
+   ?\x05)
+  (Assert-char-class
+   "[:print:]" (decode-char 'ucs #x0392) ;; GREEK CAPITAL LETTER BETA
+   ?\x06)
+  (Assert-char-class
+   "[:print:]" (decode-char 'ucs #x03B2) ;; GREEK SMALL LETTER BETA
+   ?\x07)
+  (Assert-char-class
+   "[:print:]" (decode-char 'ucs #x0410) ;; CYRILLIC CAPITAL LETTER A
+   ?\x08)
+  (Assert-char-class
+   "[:print:]" (decode-char 'ucs #x0430) ;; CYRILLIC SMALL LETTER A
+   ?\x09)
+  (Assert-char-class
+   "[:print:]" (decode-char 'ucs #x0686) ;; ARABIC LETTER TCHEH
+   ?\x0a)
+  (Assert-char-class
+   "[:print:]" (decode-char 'ucs #x2116) ;; NUMERO SIGN
+   ?\x0b)
+  (Assert-char-class
+   "[:print:]" (decode-char 'ucs #x5357) ;; kDefinition south; southern part; southward
+   ?\x0c)
+
+  (Assert-char-class "[:graph:]" ?!  ?\ )
+  (Assert-char-class "[:graph:]" ?0 ?\x09)
+  (Assert-char-class "[:graph:]" ?9 ?\x7f)
+  (Assert-char-class "[:graph:]" ?A ?\x01)
+  (Assert-char-class "[:graph:]" ?Z ?\x02)
+  (Assert-char-class "[:graph:]" ?B ?\t)
+  (Assert-char-class "[:graph:]" ?a ?\x03)
+  (Assert-char-class "[:graph:]" ?z ?\x04)
+  (Assert-char-class 
+   "[:graph:]" (decode-char 'ucs #x0385) ;; GREEK DIALYTIKA TONOS
+   ?\x05)
+  (Assert-char-class
+   "[:graph:]" (decode-char 'ucs #x0392) ;; GREEK CAPITAL LETTER BETA
+   ?\x06)
+  (Assert-char-class
+   "[:graph:]" (decode-char 'ucs #x03B2) ;; GREEK SMALL LETTER BETA
+   ?\x07)
+  (Assert-char-class
+   "[:graph:]" (decode-char 'ucs #x0410) ;; CYRILLIC CAPITAL LETTER A
+   ?\x08)
+  (Assert-char-class
+   "[:graph:]" (decode-char 'ucs #x0430) ;; CYRILLIC SMALL LETTER A
+   ?\x09)
+  (Assert-char-class
+   "[:graph:]" (decode-char 'ucs #x0686) ;; ARABIC LETTER TCHEH
+   ?\x0a)
+  (Assert-char-class
+   "[:graph:]" (decode-char 'ucs #x2116) ;; NUMERO SIGN
+   ?\x0b)
+  (Assert-char-class
+   "[:graph:]" (decode-char 'ucs #x5357) ;; kDefinition south; southern part; southward
+   ?\x0c)
 
   (Assert-char-class "[:punct:]" ?\( ?0)
   (Assert-char-class "[:punct:]" ?. ?9)
@@ -757,4 +984,102 @@
   (Assert-char-class "[:punct:]" ?< ?\x09)
   (Assert-char-class "[:punct:]" ?> ?\x7f)
   (Assert-char-class "[:punct:]" ?= ?a)
-  (Assert-char-class "[:punct:]" ?\? ?z))
+  (Assert-char-class "[:punct:]" ?\? ?z)
+  (Assert-char-class 
+   "[:punct:]"
+   (decode-char 'ucs #x0385) ;; GREEK DIALYTIKA TONOS
+   ?a)
+  (Assert-char-class
+   "[:punct:]"
+   (decode-char 'ucs #x20af)  ;; DRACHMA SIGN
+   (decode-char 'ucs #x0392)) ;; GREEK CAPITAL LETTER BETA
+  (Assert-char-class
+   "[:punct:]"
+   (decode-char 'ucs #x00a7)  ;; SECTION SIGN
+   (decode-char 'ucs #x03B2)) ;; GREEK SMALL LETTER BETA
+  (Assert-char-class
+   "[:punct:]"
+   (decode-char 'ucs #x00a8)  ;; DIAERESIS
+   (decode-char 'ucs #x0410)) ;; CYRILLIC CAPITAL LETTER A
+  (Assert-char-class
+   "[:punct:]"
+   (decode-char 'ucs #x0384) ;; GREEK TONOS
+   (decode-char 'ucs #x0430)) ;; CYRILLIC SMALL LETTER A
+  (Assert-char-class
+   "[:punct:]" 
+   (decode-char 'ucs #x00b7)  ;; MIDDLE DOT
+   (decode-char 'ucs #x0686)) ;; ARABIC LETTER TCHEH
+  (Assert-char-class
+   "[:punct:]" 
+   (decode-char 'ucs #x2116) ;; NUMERO SIGN
+   ?x)
+  (Assert-char-class
+   "[:punct:]"
+   ?=
+   (decode-char 'ucs #x5357)) ;; kDefinition south; southern part; southward
+
+  (Assert-char-class "[:ascii:]" ?a (decode-char 'ucs #x00a7)) ;; SECTION SIGN
+  (Assert-char-class "[:ascii:]" ?b (decode-char 'ucs #x00a8))  ;; DIAERESIS
+  (Assert-char-class "[:ascii:]" ?c (decode-char 'ucs #x00b7))  ;; MIDDLE DOT
+  (Assert-char-class "[:ascii:]" ?d (decode-char 'ucs #x0384))  ;; GREEK TONOS
+  (Assert-char-class
+   "[:ascii:]" ?\x00 (decode-char 'ucs #x0392)) ;; GREEK CAPITAL LETTER BETA
+  (Assert-char-class
+   "[:ascii:]" ?\x01 (decode-char 'ucs #x03B2)) ;; GREEK SMALL LETTER BETA
+  (Assert-char-class
+   "[:ascii:]" ?\t (decode-char 'ucs #x0410)) ;; CYRILLIC CAPITAL LETTER A
+  (Assert-char-class
+   "[:ascii:]" ?A (decode-char 'ucs #x0430)) ;; CYRILLIC SMALL LETTER A
+  (Assert-char-class
+   "[:ascii:]" ?B (decode-char 'ucs #x0686)) ;; ARABIC LETTER TCHEH
+  (Assert-char-class
+   "[:ascii:]" ?C (decode-char 'ucs #x20af)) ;; DRACHMA SIGN
+  (Assert-char-class
+   "[:ascii:]" ?\x7f (decode-char 'ucs #x2116)) ;; NUMERO SIGN
+
+  (Assert-char-class
+   "[:nonascii:]" (decode-char 'ucs #x00a7) ?a) ;; SECTION SIGN
+  (Assert-char-class
+   "[:nonascii:]" (decode-char 'ucs #x00a8) ?b) ;; DIAERESIS
+  (Assert-char-class
+   "[:nonascii:]" (decode-char 'ucs #x00b7) ?c) ;; MIDDLE DOT
+  (Assert-char-class
+   "[:nonascii:]" (decode-char 'ucs #x0384) ?d) ;; GREEK TONOS
+  (Assert-char-class
+   "[:nonascii:]" (decode-char 'ucs #x0392) ?\x00) ;; GREEK CAPITAL LETTER BETA
+  (Assert-char-class
+   "[:nonascii:]" (decode-char 'ucs #x03B2) ?\x01) ;; GREEK SMALL LETTER BETA
+  (Assert-char-class
+   "[:nonascii:]" (decode-char 'ucs #x0410) ?\t) ;; CYRILLIC CAPITAL LETTER A
+  (Assert-char-class
+   "[:nonascii:]" (decode-char 'ucs #x0430) ?A) ;; CYRILLIC SMALL LETTER A
+  (Assert-char-class
+   "[:nonascii:]" (decode-char 'ucs #x0686) ?B) ;; ARABIC LETTER TCHEH
+  (Assert-char-class
+   "[:nonascii:]" (decode-char 'ucs #x20af) ?C) ;; DRACHMA SIGN
+  (Assert-char-class
+   "[:nonascii:]" (decode-char 'ucs #x2116) ?\x7f) ;; NUMERO SIGN
+
+  (Assert-char-class
+   "[:multibyte:]"  (decode-char 'ucs #x00a7) ?a) ;; SECTION SIGN
+  (Assert-char-class
+   "[:multibyte:]"  (decode-char 'ucs #x00a8) ?b) ;; DIAERESIS
+  (Assert-char-class
+   "[:multibyte:]"  (decode-char 'ucs #x00b7) ?c) ;; MIDDLE DOT
+  (Assert-char-class
+   "[:multibyte:]"  (decode-char 'ucs #x0384) ?d) ;; GREEK TONOS
+  (Assert-char-class
+   "[:multibyte:]"  (decode-char 'ucs #x0392)
+   ?\x00) ;; GREEK CAPITAL LETTER BETA
+
+  (Assert-never-matching
+   "[:unibyte:]"
+   ?\x01 ?\t ?A ?B ?C ?\x7f
+   (decode-char 'ucs #x03B2) ;; GREEK SMALL LETTER BETA
+   (decode-char 'ucs #x0410) ;; CYRILLIC CAPITAL LETTER A
+   (decode-char 'ucs #x0430) ;; CYRILLIC SMALL LETTER A
+   (decode-char 'ucs #x0686) ;; ARABIC LETTER TCHEH
+   (decode-char 'ucs #x20af) ;; DRACHMA SIGN
+   (decode-char 'ucs #x2116) ;; NUMERO SIGN
+   (decode-char 'ucs #x5357))) ;; kDefinition south; southern part; southward
+