view src/coding-system-slots.h @ 5648:3f4a234f4672

Support non-ASCII correctly in character classes, test this. src/ChangeLog addition: 2012-04-21 Aidan Kehoe <kehoea@parhasard.net> Support non-ASCII correctly in character classes ([:alnum:] and friends). * regex.c: * regex.c (ISBLANK, ISUNIBYTE): New. Make these and friends independent of the locale, since we want them to be consistent in XEmacs. * regex.c (print_partial_compiled_pattern): Print the flags for charset_mule; don't print non-ASCII as the character values in ranges, this breaks with locales. * regex.c (enum): Define various flags the charset_mule and charset_mule_not opcodes can now take. * regex.c (CHAR_CLASS_MAX_LENGTH): Update this. * regex.c (re_iswctype, re_wctype): New, from GNU. * regex.c (re_wctype_can_match_non_ascii): New; used when deciding on whether to use charset_mule or the ASCII-only regex character set opcode. * regex.c (regex_compile): Error correctly on long, non-existent character class names. Break out the handling of charsets that can match non-ASCII into a separate clause. Use compile_char_class when compiling character classes. * regex.c (compile_char_class): New. Used in regex_compile when compiling character sets that may match non-ASCII. * regex.c (re_compile_fastmap): If there are flags set for charset_mule or charset_mule_not, we can't use the fastmap (since we need to check syntax table values that aren't available there). * regex.c (re_match_2_internal): Check the new flags passed to the charset_mule{,_not} opcode, observe them if appropriate. * regex.h: * regex.h (enum): Expose re_wctype_t here, imported from GNU. tests/ChangeLog addition: 2012-04-21 Aidan Kehoe <kehoea@parhasard.net> * automated/regexp-tests.el: * automated/regexp-tests.el (Assert-char-class): Check that #'string-match errors correctly with an over-long character class name. Add tests for character class functionality that supports non-ASCII characters. These tests expose bugs in GNU Emacs 24.0.94.2, but pass under current XEmacs.
author Aidan Kehoe <kehoea@parhasard.net>
date Sat, 21 Apr 2012 18:58:28 +0100
parents 308d34e9f07d
children
line wrap: on
line source

/* Definitions of marked slots in coding systems
   Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   Copyright (C) 1995 Sun Microsystems, Inc.
   Copyright (C) 2000, 2001, 2002 Ben Wing.

This file is part of XEmacs.

XEmacs is free software: you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation, either version 3 of the License, or (at your
option) any later version.

XEmacs is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with XEmacs.  If not, see <http://www.gnu.org/licenses/>. */

/* Synched up with: ????.  Split out of file-coding.h. */

/* We define the Lisp_Objects in the coding system structure in a separate
   file because there are numerous places we want to iterate over them,
   such as when defining them in the structure, initializing them, or
   marking them.

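   To use, define MARKED_SLOT before including this file; its expansion
   must supply its own terminator (typically `;'), because the entries
   below are written without one.  In the structure definition, you also
   need to define CODING_SYSTEM_SLOT_DECLARATION.  You may optionally
   define MARKED_SLOT_ARRAY beforehand to override the default handling
   of array slots.  No need to undefine any of these afterwards; that
   happens automatically at the end of this file.  */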

/* MARKED_SLOT_ARRAY (slot, size) handles a slot that is an array of SIZE
   elements.  An includer may supply its own definition before including
   this file; otherwise one of two defaults is chosen:

   - With CODING_SYSTEM_SLOT_DECLARATION defined, the array is handed to
     MARKED_SLOT as a single declarator, slot[size].

   - Otherwise MARKED_SLOT is applied to each element slot[0] through
     slot[size - 1] in turn.

   The loop form deliberately ends in `;' and does not put one after the
   inner MARKED_SLOT: entries in this file are written without trailing
   semicolons, so MARKED_SLOT has to supply its own terminator.  SIZE is
   parenthesized in the loop condition so that an expression argument
   (e.g. `n & 3') is compared as a whole rather than being split by
   operator precedence. */
#ifndef MARKED_SLOT_ARRAY
#ifdef CODING_SYSTEM_SLOT_DECLARATION
#define MARKED_SLOT_ARRAY(slot, size) MARKED_SLOT(slot[size])
#else
#define MARKED_SLOT_ARRAY(slot, size) do {		\
    int mslotidx;					\
    for (mslotidx = 0; mslotidx < (size); mslotidx++)	\
      {							\
	MARKED_SLOT (slot[mslotidx])			\
      }							\
  } while (0);
#endif
#endif /* not MARKED_SLOT_ARRAY */

  /* Slot list.  Each entry below is expanded through whatever definition
     of MARKED_SLOT (or MARKED_SLOT_ARRAY, for array slots) is in effect
     at the point where this file is included.  Entries carry no trailing
     semicolon.
     NOTE(review): in declaration mode the order of entries presumably
     fixes the structure layout -- confirm before reordering. */

  /* Name and description of this coding system.  The description
     should be suitable for a menu entry. */
  MARKED_SLOT (name)
  MARKED_SLOT (description)

  /* Mnemonic string displayed in the modeline when this coding
     system is active for a particular buffer. */
  MARKED_SLOT (mnemonic)

  /* Long documentation on the coding system. */
  MARKED_SLOT (documentation)
  /* Functions to handle additional conversion after reading or before
     writing. #### This mechanism should be replaced by the ability to
     simply create new coding system types. */
  MARKED_SLOT (post_read_conversion)
  MARKED_SLOT (pre_write_conversion)

  /* If this coding system is not of the correct type for text file
     conversion (i.e. decodes byte->char), we wrap it with appropriate
     char<->byte converters.  This is created dynamically, when it's
     needed, and cached here. */
  MARKED_SLOT (text_file_wrapper)

  /* ------------------------ junk to handle EOL -------------------------
     I had hoped that we could handle this without lots of special-case
     code, but it appears not to be the case if we want to maintain
     compatibility with the existing way.  However, at least with the way
     we do things now, we avoid EOL junk in most of the coding system
     methods themselves, or in the decode/encode functions.  The EOL
     special-case code is limited to coding-system creation and to the
     convert-eol and undecided coding system types. */

  /* If this coding system wants autodetection of the EOL type, then at the
     appropriate time we wrap this coding system with
     convert-eol-autodetect. (We do NOT do this at creation time because
     then we end up with multiple convert-eols wrapped into the final
     result -- esp. with autodetection using `undecided' -- leading to a
     big mess.) We cache the wrapped coding system here. */
  MARKED_SLOT (auto_eol_wrapper)
  
  /* Subsidiary coding systems that specify a particular type of EOL
     marking, rather than autodetecting it.  These will only be non-nil
     if (eol_type == EOL_AUTODETECT).  These are chains.
     NOTE(review): the three elements presumably correspond to the three
     concrete EOL types (LF, CRLF, CR); confirm the index order against
     the EOL type enumeration. */
  MARKED_SLOT_ARRAY (eol, 3)
  /* If this coding system is a subsidiary, this element points back to its
     parent. */
  MARKED_SLOT (subsidiary_parent)

  /* At decoding or encoding time, we use the following coding system, if
     it exists, in place of the coding system object.  This is how we
     handle coding systems with EOL types of CRLF or CR.  Formerly, we did
     the canonicalization at creation time, returning a chain in place of
     the original coding system; but that interferes with
     `coding-system-property' and causes other complications.  CANONICAL is
     used when determining the end types of a coding system.
     canonicalize-after-coding also consults CANONICAL (it has to, because
     the data in the lstream is based on CANONICAL, not on the original
     coding system). */
  MARKED_SLOT (canonical)

  /* NOTE(review): undocumented here.  The name suggests the charsets that
     this coding system can encode safely -- confirm against the users of
     this slot. */
  MARKED_SLOT (safe_charsets)

  /* NOTE(review): undocumented here.  The name suggests the individual
     characters that this coding system can encode safely -- confirm
     against the users of this slot. */
  MARKED_SLOT (safe_chars)

/* Remove the iteration macros so that this file can be included again
   with fresh definitions; includers do not need to undefine them. */
#undef MARKED_SLOT
#undef MARKED_SLOT_ARRAY
#undef CODING_SYSTEM_SLOT_DECLARATION