Mercurial > hg > xemacs-beta
view lisp/regexp-opt.el @ 5127:a9c41067dd88 ben-lisp-object
more cleanups, terminology clarification, lots of doc work
-------------------- ChangeLog entries follow: --------------------
man/ChangeLog addition:
2010-03-05 Ben Wing <ben@xemacs.org>
* internals/internals.texi (Introduction to Allocation):
* internals/internals.texi (Integers and Characters):
* internals/internals.texi (Allocation from Frob Blocks):
* internals/internals.texi (lrecords):
* internals/internals.texi (Low-level allocation):
Rewrite section on allocation of Lisp objects to reflect the new
reality. Remove references to nonexistent XSETINT and XSETCHAR.
modules/ChangeLog addition:
2010-03-05 Ben Wing <ben@xemacs.org>
* postgresql/postgresql.c (allocate_pgconn):
* postgresql/postgresql.c (allocate_pgresult):
* postgresql/postgresql.h (struct Lisp_PGconn):
* postgresql/postgresql.h (struct Lisp_PGresult):
* ldap/eldap.c (allocate_ldap):
* ldap/eldap.h (struct Lisp_LDAP):
Same changes as in src/ dir. See large log there in ChangeLog,
but basically:
ALLOC_LISP_OBJECT -> ALLOC_NORMAL_LISP_OBJECT
LISP_OBJECT_HEADER -> NORMAL_LISP_OBJECT_HEADER
../hlo/src/ChangeLog addition:
2010-03-05 Ben Wing <ben@xemacs.org>
* alloc.c:
* alloc.c (old_alloc_sized_lcrecord):
* alloc.c (very_old_free_lcrecord):
* alloc.c (copy_lisp_object):
* alloc.c (zero_sized_lisp_object):
* alloc.c (zero_nonsized_lisp_object):
* alloc.c (lisp_object_storage_size):
* alloc.c (free_normal_lisp_object):
* alloc.c (FREE_FIXED_TYPE_WHEN_NOT_IN_GC):
* alloc.c (ALLOC_FROB_BLOCK_LISP_OBJECT):
* alloc.c (Fcons):
* alloc.c (noseeum_cons):
* alloc.c (make_float):
* alloc.c (make_bignum):
* alloc.c (make_bignum_bg):
* alloc.c (make_ratio):
* alloc.c (make_ratio_bg):
* alloc.c (make_ratio_rt):
* alloc.c (make_bigfloat):
* alloc.c (make_bigfloat_bf):
* alloc.c (size_vector):
* alloc.c (make_compiled_function):
* alloc.c (Fmake_symbol):
* alloc.c (allocate_extent):
* alloc.c (allocate_event):
* alloc.c (make_key_data):
* alloc.c (make_button_data):
* alloc.c (make_motion_data):
* alloc.c (make_process_data):
* alloc.c (make_timeout_data):
* alloc.c (make_magic_data):
* alloc.c (make_magic_eval_data):
* alloc.c (make_eval_data):
* alloc.c (make_misc_user_data):
* alloc.c (Fmake_marker):
* alloc.c (noseeum_make_marker):
* alloc.c (size_string_direct_data):
* alloc.c (make_uninit_string):
* alloc.c (make_string_nocopy):
* alloc.c (mark_lcrecord_list):
* alloc.c (alloc_managed_lcrecord):
* alloc.c (free_managed_lcrecord):
* alloc.c (sweep_lcrecords_1):
* alloc.c (malloced_storage_size):
* buffer.c (allocate_buffer):
* buffer.c (compute_buffer_usage):
* buffer.c (DEFVAR_BUFFER_LOCAL_1):
* buffer.c (nuke_all_buffer_slots):
* buffer.c (common_init_complex_vars_of_buffer):
* buffer.h (struct buffer_text):
* buffer.h (struct buffer):
* bytecode.c:
* bytecode.c (make_compiled_function_args):
* bytecode.c (size_compiled_function_args):
* bytecode.h (struct compiled_function_args):
* casetab.c (allocate_case_table):
* casetab.h (struct Lisp_Case_Table):
* charset.h (struct Lisp_Charset):
* chartab.c (fill_char_table):
* chartab.c (Fmake_char_table):
* chartab.c (make_char_table_entry):
* chartab.c (copy_char_table_entry):
* chartab.c (Fcopy_char_table):
* chartab.c (put_char_table):
* chartab.h (struct Lisp_Char_Table_Entry):
* chartab.h (struct Lisp_Char_Table):
* console-gtk-impl.h (struct gtk_device):
* console-gtk-impl.h (struct gtk_frame):
* console-impl.h (struct console):
* console-msw-impl.h (struct Lisp_Devmode):
* console-msw-impl.h (struct mswindows_device):
* console-msw-impl.h (struct msprinter_device):
* console-msw-impl.h (struct mswindows_frame):
* console-msw-impl.h (struct mswindows_dialog_id):
* console-stream-impl.h (struct stream_console):
* console-stream.c (stream_init_console):
* console-tty-impl.h (struct tty_console):
* console-tty-impl.h (struct tty_device):
* console-tty.c (allocate_tty_console_struct):
* console-x-impl.h (struct x_device):
* console-x-impl.h (struct x_frame):
* console.c (allocate_console):
* console.c (nuke_all_console_slots):
* console.c (DEFVAR_CONSOLE_LOCAL_1):
* console.c (common_init_complex_vars_of_console):
* data.c (make_weak_list):
* data.c (make_weak_box):
* data.c (make_ephemeron):
* database.c:
* database.c (struct Lisp_Database):
* database.c (allocate_database):
* database.c (finalize_database):
* device-gtk.c (allocate_gtk_device_struct):
* device-impl.h (struct device):
* device-msw.c:
* device-msw.c (mswindows_init_device):
* device-msw.c (msprinter_init_device):
* device-msw.c (finalize_devmode):
* device-msw.c (allocate_devmode):
* device-tty.c (allocate_tty_device_struct):
* device-x.c (allocate_x_device_struct):
* device.c:
* device.c (nuke_all_device_slots):
* device.c (allocate_device):
* dialog-msw.c (handle_question_dialog_box):
* elhash.c:
* elhash.c (struct Lisp_Hash_Table):
* elhash.c (finalize_hash_table):
* elhash.c (make_general_lisp_hash_table):
* elhash.c (Fcopy_hash_table):
* elhash.h (htentry):
* emacs.c (main_1):
* eval.c:
* eval.c (size_multiple_value):
* event-stream.c (finalize_command_builder):
* event-stream.c (allocate_command_builder):
* event-stream.c (free_command_builder):
* event-stream.c (event_stream_generate_wakeup):
* event-stream.c (event_stream_resignal_wakeup):
* event-stream.c (event_stream_disable_wakeup):
* event-stream.c (event_stream_wakeup_pending_p):
* events.h (struct Lisp_Timeout):
* events.h (struct command_builder):
* extents-impl.h:
* extents-impl.h (struct extent_auxiliary):
* extents-impl.h (struct extent_info):
* extents-impl.h (set_extent_no_chase_aux_field):
* extents-impl.h (set_extent_no_chase_normal_field):
* extents.c:
* extents.c (gap_array_marker):
* extents.c (gap_array):
* extents.c (extent_list_marker):
* extents.c (extent_list):
* extents.c (stack_of_extents):
* extents.c (gap_array_make_marker):
* extents.c (extent_list_make_marker):
* extents.c (allocate_extent_list):
* extents.c (SLOT):
* extents.c (mark_extent_auxiliary):
* extents.c (allocate_extent_auxiliary):
* extents.c (attach_extent_auxiliary):
* extents.c (size_gap_array):
* extents.c (finalize_extent_info):
* extents.c (allocate_extent_info):
* extents.c (uninit_buffer_extents):
* extents.c (allocate_soe):
* extents.c (copy_extent):
* extents.c (vars_of_extents):
* extents.h:
* faces.c (allocate_face):
* faces.h (struct Lisp_Face):
* faces.h (struct face_cachel):
* file-coding.c:
* file-coding.c (finalize_coding_system):
* file-coding.c (sizeof_coding_system):
* file-coding.c (Fcopy_coding_system):
* file-coding.h (struct Lisp_Coding_System):
* file-coding.h (MARKED_SLOT):
* fns.c (size_bit_vector):
* font-mgr.c:
* font-mgr.c (finalize_fc_pattern):
* font-mgr.c (print_fc_pattern):
* font-mgr.c (Ffc_pattern_p):
* font-mgr.c (Ffc_pattern_create):
* font-mgr.c (Ffc_name_parse):
* font-mgr.c (Ffc_name_unparse):
* font-mgr.c (Ffc_pattern_duplicate):
* font-mgr.c (Ffc_pattern_add):
* font-mgr.c (Ffc_pattern_del):
* font-mgr.c (Ffc_pattern_get):
* font-mgr.c (fc_config_create_using):
* font-mgr.c (fc_strlist_to_lisp_using):
* font-mgr.c (fontset_to_list):
* font-mgr.c (Ffc_config_p):
* font-mgr.c (Ffc_config_up_to_date):
* font-mgr.c (Ffc_config_build_fonts):
* font-mgr.c (Ffc_config_get_cache):
* font-mgr.c (Ffc_config_get_fonts):
* font-mgr.c (Ffc_config_set_current):
* font-mgr.c (Ffc_config_get_blanks):
* font-mgr.c (Ffc_config_get_rescan_interval):
* font-mgr.c (Ffc_config_set_rescan_interval):
* font-mgr.c (Ffc_config_app_font_add_file):
* font-mgr.c (Ffc_config_app_font_add_dir):
* font-mgr.c (Ffc_config_app_font_clear):
* font-mgr.c (size):
* font-mgr.c (Ffc_config_substitute):
* font-mgr.c (Ffc_font_render_prepare):
* font-mgr.c (Ffc_font_match):
* font-mgr.c (Ffc_font_sort):
* font-mgr.c (finalize_fc_config):
* font-mgr.c (print_fc_config):
* font-mgr.h:
* font-mgr.h (struct fc_pattern):
* font-mgr.h (XFC_PATTERN):
* font-mgr.h (struct fc_config):
* font-mgr.h (XFC_CONFIG):
* frame-gtk.c (allocate_gtk_frame_struct):
* frame-impl.h (struct frame):
* frame-msw.c (mswindows_init_frame_1):
* frame-x.c (allocate_x_frame_struct):
* frame.c (nuke_all_frame_slots):
* frame.c (allocate_frame_core):
* gc.c:
* gc.c (GC_CHECK_NOT_FREE):
* glyphs.c (finalize_image_instance):
* glyphs.c (allocate_image_instance):
* glyphs.c (Fcolorize_image_instance):
* glyphs.c (allocate_glyph):
* glyphs.c (unmap_subwindow_instance_cache_mapper):
* glyphs.c (register_ignored_expose):
* glyphs.h (struct Lisp_Image_Instance):
* glyphs.h (struct Lisp_Glyph):
* glyphs.h (struct glyph_cachel):
* glyphs.h (struct expose_ignore):
* gui.c (allocate_gui_item):
* gui.h (struct Lisp_Gui_Item):
* keymap.c (struct Lisp_Keymap):
* keymap.c (make_keymap):
* lisp.h:
* lisp.h (struct Lisp_String_Direct_Data):
* lisp.h (struct Lisp_String_Indirect_Data):
* lisp.h (struct Lisp_Vector):
* lisp.h (struct Lisp_Bit_Vector):
* lisp.h (DECLARE_INLINE_LISP_BIT_VECTOR):
* lisp.h (struct weak_box):
* lisp.h (struct ephemeron):
* lisp.h (struct weak_list):
* lrecord.h:
* lrecord.h (struct lrecord_implementation):
* lrecord.h (MC_ALLOC_CALL_FINALIZER):
* lrecord.h (struct lcrecord_list):
* lstream.c (finalize_lstream):
* lstream.c (sizeof_lstream):
* lstream.c (Lstream_new):
* lstream.c (Lstream_delete):
* lstream.h (struct lstream):
* marker.c:
* marker.c (finalize_marker):
* marker.c (compute_buffer_marker_usage):
* mule-charset.c:
* mule-charset.c (make_charset):
* mule-charset.c (compute_charset_usage):
* objects-impl.h (struct Lisp_Color_Instance):
* objects-impl.h (struct Lisp_Font_Instance):
* objects-tty-impl.h (struct tty_color_instance_data):
* objects-tty-impl.h (struct tty_font_instance_data):
* objects-tty.c (tty_initialize_color_instance):
* objects-tty.c (tty_initialize_font_instance):
* objects.c (finalize_color_instance):
* objects.c (Fmake_color_instance):
* objects.c (finalize_font_instance):
* objects.c (Fmake_font_instance):
* objects.c (reinit_vars_of_objects):
* opaque.c:
* opaque.c (sizeof_opaque):
* opaque.c (make_opaque_ptr):
* opaque.c (free_opaque_ptr):
* opaque.h:
* opaque.h (Lisp_Opaque):
* opaque.h (Lisp_Opaque_Ptr):
* print.c (printing_unreadable_lcrecord):
* print.c (external_object_printer):
* print.c (debug_p4):
* process.c (finalize_process):
* process.c (make_process_internal):
* procimpl.h (struct Lisp_Process):
* rangetab.c (Fmake_range_table):
* rangetab.c (Fcopy_range_table):
* rangetab.h (struct Lisp_Range_Table):
* scrollbar.c:
* scrollbar.c (create_scrollbar_instance):
* scrollbar.c (compute_scrollbar_instance_usage):
* scrollbar.h (struct scrollbar_instance):
* specifier.c (finalize_specifier):
* specifier.c (sizeof_specifier):
* specifier.c (set_specifier_caching):
* specifier.h (struct Lisp_Specifier):
* specifier.h (struct specifier_caching):
* symeval.h:
* symeval.h (SYMBOL_VALUE_MAGIC_P):
* symeval.h (DEFVAR_SYMVAL_FWD):
* symsinit.h:
* syntax.c (init_buffer_syntax_cache):
* syntax.h (struct syntax_cache):
* toolbar.c:
* toolbar.c (allocate_toolbar_button):
* toolbar.c (update_toolbar_button):
* toolbar.h (struct toolbar_button):
* tooltalk.c (struct Lisp_Tooltalk_Message):
* tooltalk.c (make_tooltalk_message):
* tooltalk.c (struct Lisp_Tooltalk_Pattern):
* tooltalk.c (make_tooltalk_pattern):
* ui-gtk.c:
* ui-gtk.c (allocate_ffi_data):
* ui-gtk.c (emacs_gtk_object_finalizer):
* ui-gtk.c (allocate_emacs_gtk_object_data):
* ui-gtk.c (allocate_emacs_gtk_boxed_data):
* ui-gtk.h:
* window-impl.h (struct window):
* window-impl.h (struct window_mirror):
* window.c (finalize_window):
* window.c (allocate_window):
* window.c (new_window_mirror):
* window.c (mark_window_as_deleted):
* window.c (make_dummy_parent):
* window.c (compute_window_mirror_usage):
* window.c (compute_window_usage):
Overall point of this change and previous ones in this repository:
(1) Introduce new, clearer terminology: everything other than int
or char is a "record" object, which comes in two types: "normal
objects" and "frob-block objects". Fix up all places that
referred to frob-block objects as "simple", "basic", etc.
(2) Provide an advertised interface for doing operations on Lisp
objects, including creating new types, that is clean and
consistent in its naming, uses the above-referenced terms and
avoids referencing "lrecords", "old lcrecords", etc., which should
hide under the surface.
(3) Make the size_in_bytes and finalizer methods take a
Lisp_Object rather than a void * for consistency with other methods.
(4) Separate finalizer method into finalizer and disksaver, so
that normal finalize methods don't have to worry about disksaving.
Other specifics:
(1) Renaming:
LISP_OBJECT_HEADER -> NORMAL_LISP_OBJECT_HEADER
ALLOC_LISP_OBJECT -> ALLOC_NORMAL_LISP_OBJECT
implementation->basic_p -> implementation->frob_block_p
ALLOCATE_FIXED_TYPE_AND_SET_IMPL -> ALLOC_FROB_BLOCK_LISP_OBJECT
*FCCONFIG*, wrap_fcconfig -> *FC_CONFIG*, wrap_fc_config
*FCPATTERN*, wrap_fcpattern -> *FC_PATTERN*, wrap_fc_pattern
(the last two changes make the naming of these macros consistent
with the naming of all other macros, since the objects are named
fc-config and fc-pattern with a hyphen)
(2) Lots of documentation fixes in lrecord.h.
(3) Eliminate macros for copying, freeing, zeroing objects, getting
their storage size. Instead, new functions:
zero_sized_lisp_object()
zero_nonsized_lisp_object()
lisp_object_storage_size()
free_normal_lisp_object()
(copy_lisp_object() already exists)
LISP_OBJECT_FROB_BLOCK_P() (actually a macro)
Eliminated:
free_lrecord()
zero_lrecord()
copy_lrecord()
copy_sized_lrecord()
old_copy_lcrecord()
old_copy_sized_lcrecord()
old_zero_lcrecord()
old_zero_sized_lcrecord()
LISP_OBJECT_STORAGE_SIZE()
COPY_SIZED_LISP_OBJECT()
COPY_SIZED_LCRECORD()
COPY_LISP_OBJECT()
ZERO_LISP_OBJECT()
FREE_LISP_OBJECT()
(4) Catch the remaining places where lrecord stuff was used directly
and use the advertised interface, e.g. alloc_sized_lrecord() ->
ALLOC_SIZED_LISP_OBJECT().
(5) Make certain statically-declared pseudo-objects
(buffer_local_flags, console_local_flags) have their lheader
initialized correctly, so things like copy_lisp_object() can work
on them. Make extent_auxiliary_defaults a proper heap object
Vextent_auxiliary_defaults, and make extent auxiliaries dumpable
so that this object can be dumped. allocate_extent_auxiliary()
now just creates the object, and attach_extent_auxiliary()
creates an extent auxiliary and attaches to an extent, like the
old allocate_extent_auxiliary().
(6) Create EXTENT_AUXILIARY_SLOTS macro, similar to the foo-slots.h
files but in a macro instead of a file. The purpose is to avoid
duplication when iterating over all the slots in an extent auxiliary.
Use it.
(7) In lstream.c, don't zero out object after allocation because
allocation routines take care of this.
(8) In marker.c, fix a mistake in computing marker overhead.
(9) In print.c, clean up printing_unreadable_lcrecord(),
external_object_printer() to avoid lots of ifdef NEW_GC's.
(10) Separate toolbar-button allocation into a separate
allocate_toolbar_button() function for use in the example code
in lrecord.h.
author | Ben Wing <ben@xemacs.org> |
---|---|
date | Fri, 05 Mar 2010 04:08:17 -0600 |
parents | 317f30471f4e |
children | f00192e1cd49 308d34e9f07d |
line wrap: on
line source
;;; regexp-opt.el --- generate efficient regexps to match strings ;; Copyright (C) 1994,95,96,97,98,99,2000 Free Software Foundation, Inc. ;; Author: Simon Marshall <simon@gnu.org> ;; Maintainer: FSF ;; Keywords: strings, regexps, extensions ;; This file is part of XEmacs. ;; XEmacs is free software; you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by ;; the Free Software Foundation; either version 2, or (at your option) ;; any later version. ;; XEmacs is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;; GNU General Public License for more details. ;; You should have received a copy of the GNU General Public License ;; along with XEmacs; see the file COPYING. If not, write to the ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, ;; Boston, MA 02111-1307, USA. ;;; Synched up with: GNU Emacs 21.3 + paren-in-char-set fix from CVS ;;; revision 1.25. Some implementation differences in ;;; regexp-opt-group and regexp-opt-charset but the APIs ;;; are compatible and should return compatible (if not ;;; exactly the same) regexps. ;;; Commentary: ;; The "opt" in "regexp-opt" stands for "optim\\(?:al\\|i\\(?:se\\|ze\\)\\)". ;; ;; This package generates a regexp from a given list of strings (which matches ;; one of those strings) so that the regexp generated by: ;; ;; (regexp-opt strings) ;; ;; is equivalent to, but more efficient than, the regexp generated by: ;; ;; (mapconcat 'regexp-quote strings "\\|") ;; ;; For example: ;; ;; (let ((strings '("cond" "if" "when" "unless" "while" ;; "let" "let*" "progn" "prog1" "prog2" ;; "save-restriction" "save-excursion" "save-window-excursion" ;; "save-current-buffer" "save-match-data" ;; "catch" "throw" "unwind-protect" "condition-case"))) ;; (concat "(" (regexp-opt strings t) "\\>")) ;; => "(\\(c\\(?:atch\\|ond\\(?:ition-case\\)?\\)\\|if\\|let\\*?\\|prog[12n]\\|save-\\(?:current-buffer\\|excursion\\|match-data\\|restriction\\|window-excursion\\)\\|throw\\|un\\(?:less\\|wind-protect\\)\\|wh\\(?:en\\|ile\\)\\)\\>" ;; ;; Searching using the above example `regexp-opt' regexp takes approximately ;; two-thirds of the time taken using the equivalent `mapconcat' regexp. ;; Since this package was written to produce efficient regexps, not regexps ;; efficiently, it is probably not a good idea to in-line too many calls in ;; your code, unless you use the following trick with `eval-when-compile': ;; ;; (defvar definition-regexp ;; (eval-when-compile ;; (concat "^(" ;; (regexp-opt '("defun" "defsubst" "defmacro" "defalias" ;; "defvar" "defconst") t) ;; "\\>"))) ;; ;; The `byte-compile' code will be as if you had defined the variable thus: ;; ;; (defvar definition-regexp ;; "^(\\(def\\(alias\\|const\\|macro\\|subst\\|un\\|var\\)\\)\\>") ;; ;; Note that if you use this trick for all instances of `regexp-opt' and ;; `regexp-opt-depth' in your code, regexp-opt.el would only have to be loaded ;; at compile time. But note also that using this trick means that should ;; regexp-opt.el be changed, perhaps to fix a bug or to add a feature to ;; improve the efficiency of `regexp-opt' regexps, you would have to recompile ;; your code for such changes to have effect in your code. ;; Originally written for font-lock.el, from an idea from Stig's hl319.el, with ;; thanks for ideas also to Michael Ernst, Bob Glickstein, Dan Nicolaescu and ;; Stefan Monnier. ;; No doubt `regexp-opt' doesn't always produce optimal regexps, so code, ideas ;; or any other information to improve things are welcome. ;; ;; One possible improvement would be to compile '("aa" "ab" "ba" "bb") ;; into "[ab][ab]" rather than "a[ab]\\|b[ab]". I'm not sure it's worth ;; it but if someone knows how to do it without going through too many ;; contortions, I'm all ears. ;;; Code: ;;;###autoload (defun regexp-opt (strings &optional paren) "Return a regexp to match a string in STRINGS. Each string should be unique in STRINGS and should not contain any regexps, quoted or not. If optional PAREN is non-nil, ensure that the returned regexp is enclosed by at least one regexp grouping construct. The returned regexp is typically more efficient than the equivalent regexp: (let ((open (if PAREN \"\\\\(\" \"\")) (close (if PAREN \"\\\\)\" \"\"))) (concat open (mapconcat 'regexp-quote STRINGS \"\\\\|\") close)) If PAREN is `words', then the resulting regexp is additionally surrounded by \\=\\< and \\>." (save-match-data ;; Recurse on the sorted list. (let* ((max-lisp-eval-depth (* 1024 1024)) (completion-ignore-case nil) (words (eq paren 'words)) (open (cond ((stringp paren) paren) (paren "\\("))) (sorted-strings (sort (copy-sequence strings) 'string-lessp)) (re (regexp-opt-group sorted-strings open))) (if words (concat "\\<" re "\\>") re)))) (defconst regexp-opt-not-groupie*-re (let* ((harmless-ch "[^\\\\[]") (esc-pair-not-lp "\\\\[^(]") (class-harmless-ch "[^][]") (class-lb-harmless "[^]:]") (class-lb-colon-maybe-charclass ":\\([a-z]+:]\\)?") (class-lb (concat "\\[\\(" class-lb-harmless "\\|" class-lb-colon-maybe-charclass "\\)")) (class (concat "\\[^?]?" "\\(" class-harmless-ch "\\|" class-lb "\\)*" "\\[?]")) ; special handling for bare [ at end of re (shy-lp "\\\\(\\?:")) (concat "\\(" harmless-ch "\\|" esc-pair-not-lp "\\|" class "\\|" shy-lp "\\)*")) "Matches any part of a regular expression EXCEPT for non-shy \"\\\\(\"s") ;;;###autoload (defun regexp-opt-depth (regexp) "Return the depth of REGEXP. This means the number of regexp grouping constructs (parenthesised expressions) in REGEXP." (save-match-data ;; Hack to signal an error if REGEXP does not have balanced parentheses. (string-match regexp "") ;; Count the number of open parentheses in REGEXP. (let ((count 0) start) (while (progn (string-match regexp-opt-not-groupie*-re regexp start) (setq start ( + (match-end 0) 2)) ; +2 for "\\(" after match-end. (<= start (length regexp))) (setq count (1+ count))) count))) ;;; Workhorse functions. (eval-when-compile (require 'cl)) (defun regexp-opt-group (strings &optional paren lax) "Return a regexp to match a string in STRINGS. If PAREN non-nil, output regexp parentheses around returned regexp. If LAX non-nil, don't output parentheses if it doesn't require them. Merges keywords to avoid backtracking in Emacs' regexp matcher. The basic idea is to find the shortest common prefix or suffix, remove it and recurse. If there is no prefix, we divide the list into two so that \(at least) one half will have at least a one-character common prefix. Also we delay the addition of grouping parenthesis as long as possible until we're sure we need them, and try to remove one-character sequences so we can use character sets rather than grouping parenthesis." (let* ((open-group (cond ((stringp paren) paren) (paren "\\(?:") (t ""))) (close-group (if paren "\\)" "")) (open-charset (if lax "" open-group)) (close-charset (if lax "" close-group))) (cond ;; ;; If there are no strings, just return the empty string. ((= (length strings) 0) "") ;; ;; If there is only one string, just return it. ((= (length strings) 1) (if (= (length (car strings)) 1) (concat open-charset (regexp-quote (car strings)) close-charset) (concat open-group (regexp-quote (car strings)) close-group))) ;; ;; If there is an empty string, remove it and recurse on the rest. ((= (length (car strings)) 0) (concat open-charset (regexp-opt-group (cdr strings) t t) "?" close-charset)) ;; ;; If all are one-character strings, just return a character set. ((= (length strings) (apply '+ (mapcar 'length strings))) (concat open-charset (regexp-opt-charset strings) close-charset)) ;; ;; We have a list of different length strings. (t (let ((prefix (try-completion "" (mapcar 'list strings))) (letters (let ((completion-regexp-list '("^.$"))) (all-completions "" (mapcar 'list strings))))) (cond ;; ;; If there is a common prefix, remove it and recurse on the suffixes. ((> (length prefix) 0) (let* ((length (length prefix)) (suffixes (mapcar (lambda (s) (substring s length)) strings))) (concat open-group (regexp-quote prefix) (regexp-opt-group suffixes t t) close-group))) ;; ;; If there are several one-character strings, remove them and recurse ;; on the rest (first so the final regexp finds the longest match). ((> (length letters) 1) (let ((rest (let ((completion-regexp-list '("^..+$"))) (all-completions "" (mapcar 'list strings))))) (concat open-group (regexp-opt-group rest) "\\|" (regexp-opt-charset letters) close-group))) ;; ;; Otherwise, divide the list into those that start with a particular ;; letter and those that do not, and recurse on them. (t (let* ((char (substring (car strings) 0 1)) (half1 (all-completions char (mapcar 'list strings))) (half2 (nthcdr (length half1) strings))) (concat open-group (regexp-opt-group half1) "\\|" (regexp-opt-group half2) close-group))))))))) (defun regexp-opt-charset (chars) ;; ;; Return a regexp to match a character in CHARS. ;; ;; The basic idea is to find character ranges. Also we take care in the ;; position of character set meta characters in the character set regexp. ;; (let* ((charwidth 256) ; Yeah, right. ;; XEmacs: use bit-vectors instead of bool-vectors (charmap (make-bit-vector charwidth 0)) (charset "") (bracket "") (dash "") (caret "")) ;; ;; Make a character map but extract character set meta characters. (dolist (char (mapcar 'string-to-char chars)) (case char (?\] (setq bracket "]")) (?^ (setq caret "^")) (?- (setq dash "-")) (otherwise ;; XEmacs: 1 (aset charmap char 1)))) ;; ;; Make a character set from the map using ranges where applicable. (dotimes (char charwidth) (let ((start char)) (while (and (< char charwidth) ;; XEmacs: (not (zerop ...)) (not (zerop (aref charmap char)))) (incf char)) (cond ((> char (+ start 3)) (setq charset (format "%s%c-%c" charset start (1- char)))) ((> char start) (setq charset (format "%s%c" charset (setq char start))))))) ;; ;; Make sure a caret is not first and a dash is first or last. (if (and (string-equal charset "") (string-equal bracket "")) (concat "[" dash caret "]") (concat "[" bracket charset caret dash "]")))) (provide 'regexp-opt) ;;; regexp-opt.el ends here