comparison lisp/regexp-opt.el @ 2548:c4c8a36043be

[xemacs-hg @ 2005-02-03 07:11:19 by ben] behavior ws #4: package-suppress, autoload update/sync, add easy-mmode/regexp-opt to core lread.c, lisp.h: Remove undeeded Vload_file_name_internal_the_purecopy, Qload_file_name -- use internal_bind_lisp_object instead of specbind. Add load-suppress-alist. * easy-mmode.el, regexp-opt.el: Move these files into core. Uncomment stuff depending on new custom.el. autoload.el: Removed. Major update. Sync with FSF 21.2. Create the ability to make custom-defines files. update-elc-2.el, update-elc.el: Rewrite to use new autoload API. update-elc.el: Add easy-mmode.
author ben
date Thu, 03 Feb 2005 07:11:28 +0000
parents
children 317f30471f4e
comparison
equal deleted inserted replaced
2547:a9527fcdf77f 2548:c4c8a36043be
1 ;;; regexp-opt.el --- generate efficient regexps to match strings
2
3 ;; Copyright (C) 1994,95,96,97,98,99,2000 Free Software Foundation, Inc.
4
5 ;; Author: Simon Marshall <simon@gnu.org>
6 ;; Maintainer: FSF
7 ;; Keywords: strings, regexps, extensions
8
9 ;; Modified by Karl M. Hegbloom Sep. 1997 to support the new regexp syntax
10 ;; with shy groups. (benchmarks pending)
11
12 ;; This file is part of XEmacs.
13
14 ;; XEmacs is free software; you can redistribute it and/or modify
15 ;; it under the terms of the GNU General Public License as published by
16 ;; the Free Software Foundation; either version 2, or (at your option)
17 ;; any later version.
18
19 ;; XEmacs is distributed in the hope that it will be useful,
20 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 ;; GNU General Public License for more details.
23
24 ;; You should have received a copy of the GNU General Public License
25 ;; along with XEmacs; see the file COPYING. If not, write to the
26 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
27 ;; Boston, MA 02111-1307, USA.
28
29 ;;; Commentary:
30
31 ;; The "opt" in "regexp-opt" stands for "optim\\(al\\|i\\(se\\|ze\\)\\)".
32 ;;
33 ;; This package generates a regexp from a given list of strings (which matches
34 ;; one of those strings) so that the regexp generated by:
35 ;;
36 ;; (regexp-opt strings)
37 ;;
38 ;; is equivalent to, but more efficient than, the regexp generated by:
39 ;;
40 ;; (mapconcat 'regexp-quote strings "\\|")
41 ;;
42 ;; For example:
43 ;;
44 ;; (let ((strings '("cond" "if" "when" "unless" "while"
45 ;; "let" "let*" "progn" "prog1" "prog2"
46 ;; "save-restriction" "save-excursion" "save-window-excursion"
47 ;; "save-current-buffer" "save-match-data"
48 ;; "catch" "throw" "unwind-protect" "condition-case")))
49 ;; (concat "(" (regexp-opt strings t) "\\>"))
50 ;;
51 ;; => "(\\(?:c\\(?:atch\\|ond\\(?:ition-case\\)?\\)\\|if\\|let\\*?\\|prog[12n]\\|save-\\(?:current-buffer\\|excursion\\|match-data\\|restriction\\|window-excursion\\)\\|throw\\|un\\(?:less\\|wind-protect\\)\\|wh\\(?:en\\|ile\\)\\)\\>"
52 ;;
53 ;;
54 ;; (let ((strings '("cond" "if" "when" "unless" "while"
55 ;; "let" "let*" "progn" "prog1" "prog2"
56 ;; "save-restriction" "save-excursion" "save-window-excursion"
57 ;; "save-current-buffer" "save-match-data"
58 ;; "catch" "throw" "unwind-protect" "condition-case")))
59 ;; (concat "(" (regexp-opt strings t t) "\\>"))
60 ;; ^
61 ;; => "(\\(c\\(atch\\|ond\\(ition-case\\)?\\)\\|if\\|let\\*?\\|prog[12n]\\|save-\\(current-buffer\\|excursion\\|match-data\\|restriction\\|window-excursion\\)\\|throw\\|un\\(less\\|wind-protect\\)\\|wh\\(en\\|ile\\)\\)\\>"
62 ;;
63 ;;
64 ;; Searching using the above example `regexp-opt' regexp takes approximately
65 ;; two-thirds of the time taken using the equivalent `mapconcat' regexp.
66
67 ;; Since this package was written to produce efficient regexps, not regexps
68 ;; efficiently, it is probably not a good idea to in-line too many calls in
69 ;; your code, unless you use the following trick with `eval-when-compile':
70 ;;
71 ;; (defvar definition-regexp
72 ;; (eval-when-compile
73 ;; (concat "^("
74 ;; (regexp-opt '("defun" "defsubst" "defmacro" "defalias"
75 ;; "defvar" "defconst") t)
76 ;; "\\>")))
77 ;;
78 ;; The `byte-compile' code will be as if you had defined the variable thus:
79 ;;
80 ;; (defvar definition-regexp
81 ;; "^(\\(def\\(alias\\|const\\|macro\\|subst\\|un\\|var\\)\\)\\>")
82 ;;
83 ;; Note that if you use this trick for all instances of `regexp-opt' and
84 ;; `regexp-opt-depth' in your code, regexp-opt.el would only have to be loaded
85 ;; at compile time. But note also that using this trick means that should
86 ;; regexp-opt.el be changed, perhaps to fix a bug or to add a feature to
87 ;; improve the efficiency of `regexp-opt' regexps, you would have to recompile
88 ;; your code for such changes to have effect in your code.
89
90 ;; Originally written for font-lock.el, from an idea from Stig's hl319.el, with
91 ;; thanks for ideas also to Michael Ernst, Bob Glickstein and Dan Nicolaescu.
92 ;; Please don't tell me that it doesn't produce optimal regexps; I know that
93 ;; already. For example, the above explanation for the meaning of "opt" would
94 ;; be more efficient as "optim\\(al\\|i[sz]e\\)", but this requires complex
95 ;; forward looking. But (ideas or) code to improve things (are) is welcome.
96
97 ;;; Code:
98
99 ;;;###autoload
100 (defun regexp-opt (strings &optional paren non-shy)
101 "Return a regexp to match a string in STRINGS.
102 Each string should be unique in STRINGS and should not contain any regexps,
103 quoted or not. If optional PAREN is non-nil, ensure that the returned regexp
104 is enclosed by at least one regexp match grouping construct. If optional
105 NON-SHY is non nil, the inner groupings will use \"\\\\( \\\\)\" grouping,
106 rather than the default \"\\\\(?: \\\\)\" 'shy', or non-match-capturing groups.
107 The returned regexp is typically more efficient than the equivalent regexp:
108
109 (let ((open-paren (if PAREN \"\\\\(\" \"\")) (close-paren (if PAREN \"\\\\)\" \"\")))
110 (concat open-paren (mapconcat 'regexp-quote STRINGS \"\\\\|\") close-paren))
111
112 but typically contains more regexp grouping constructs.
113 Use `regexp-opt-depth' to count them.
114
115 If PAREN is `words', then the resulting regexp is additionally surrounded
116 by \\=\\< and \\>."
117 (save-match-data
118 ;; Recurse on the sorted list.
119 (let* ((max-lisp-eval-depth (* 1024 1024))
120 (completion-ignore-case nil)
121 (words (eq paren 'words))
122 (sorted-strings (sort (copy-sequence strings) 'string-lessp))
123 (re (regexp-opt-group sorted-strings paren nil non-shy)))
124 (if words (concat "\\<" re "\\>") re))))
125
126 ;;;###autoload
127 (defun regexp-opt-depth (regexp &optional count-shy-groups-too)
128 "Return the depth of REGEXP.
129 This means the number of regexp grouping constructs (parenthesised
130 expressions) in REGEXP, not counting the \"\\\\(?: \\\\)\"
131 non-match-capturing groups unless COUNT-SHY-GROUPS-TOO is non-nil.
132 See `regexp-opt'."
133 (save-match-data
134 ;; Hack to signal an error if REGEXP does not have balanced parentheses.
135 (string-match regexp "")
136 ;; Count the number of open parentheses in REGEXP.
137 (let ((max (1- (length regexp)))
138 (count 0) start)
139 (while (string-match "\\\\(" regexp start)
140 (setq start (match-end 0))
141 (when (or count-shy-groups-too
142 (not (string= (substring regexp start (min (+ start 2) max)) "?:")))
143 (setq count (1+ count))))
144 count)))
145
146 ;;; Workhorse functions.
147
148 (eval-when-compile
149 (require 'cl))
150
151 (unless (fboundp 'make-bool-vector)
152 (defalias 'make-bool-vector 'make-vector))
153
154 (defun regexp-opt-group (strings &optional paren lax non-shy)
155 "Return a regexp to match a string in STRINGS.
156 If PAREN non-nil, output regexp parentheses around returned regexp.
157 If LAX non-nil, don't output parentheses if it doesn't require them.
158 If NON-SHY non-nil, don't use \\(?: \\) shy groups, use match capturing ones.
159 Merges keywords to avoid backtracking in Emacs' regexp matcher.
160
161 The basic idea is to find the shortest common prefix, remove it
162 and recurse. If there is no prefix, we divide the list into two so that
163 \(at least) one half will have at least a one-character common prefix.
164
165 Also we delay the addition of grouping parenthesis as long as possible
166 until we're sure we need them, and try to remove one-character sequences
167 so we can use character sets rather than grouping parenthesis."
168 (let* ((open-group (cond
169 ((and paren non-shy) "\\(")
170 (paren "\\(?:")
171 (t "")))
172 (close-group (if paren "\\)" ""))
173 (open-charset (if lax "" open-group))
174 (close-charset (if lax "" close-group)))
175 (cond
176 ;;
177 ;; If there are no strings, just return the empty string.
178 ((= (length strings) 0)
179 "")
180 ;;
181 ;; If there is only one string, just return it.
182 ((= (length strings) 1)
183 (if (= (length (car strings)) 1)
184 (concat open-charset (regexp-quote (car strings)) close-charset)
185 (concat open-group (regexp-quote (car strings)) close-group)))
186 ;;
187 ;; If there is an empty string, remove it and recurse on the rest.
188 ((= (length (car strings)) 0)
189 (concat open-charset
190 (regexp-opt-group (cdr strings) t t non-shy) "?"
191 close-charset))
192 ;;
193 ;; If all are one-character strings, just return a character set.
194 ((= (length strings) (apply '+ (mapcar 'length strings)))
195 (concat open-charset
196 (regexp-opt-charset strings)
197 close-charset))
198 ;;
199 ;; We have a list of different length strings.
200 (t
201 (let ((prefix (try-completion "" (mapcar 'list strings)))
202 (letters (let ((completion-regexp-list '("^.$")))
203 (all-completions "" (mapcar 'list strings)))))
204 (cond
205 ;;
206 ;; If there is a common prefix, remove it and recurse on the suffixes.
207 ((> (length prefix) 0)
208 (let* ((length (length prefix))
209 (suffixes (mapcar (lambda (s) (substring s length)) strings)))
210 (concat open-group
211 (regexp-quote prefix) (regexp-opt-group suffixes t t non-shy)
212 close-group)))
213 ;;
214 ;; If there are several one-character strings, remove them and recurse
215 ;; on the rest (first so the final regexp finds the longest match).
216 ((> (length letters) 1)
217 (let ((rest (let ((completion-regexp-list '("^..+$")))
218 (all-completions "" (mapcar 'list strings)))))
219 (concat open-group
220 (regexp-opt-group rest nil nil non-shy) "\\|" (regexp-opt-charset letters)
221 close-group)))
222 ;;
223 ;; Otherwise, divide the list into those that start with a particular
224 ;; letter and those that do not, and recurse on them.
225 (t
226 (let* ((char (substring (car strings) 0 1))
227 (half1 (all-completions char (mapcar 'list strings)))
228 (half2 (nthcdr (length half1) strings)))
229 (concat open-group
230 (regexp-opt-group half1 nil nil non-shy) "\\|" (regexp-opt-group half2 nil nil non-shy)
231 close-group)))))))))
232
233 (defun regexp-opt-charset (chars)
234 ;;
235 ;; Return a regexp to match a character in CHARS.
236 ;;
237 ;; The basic idea is to find character ranges. Also we take care in the
238 ;; position of character set meta characters in the character set regexp.
239 ;;
240 (let* ((charwidth 256) ; Yeah, right.
241 ;; XEmacs: use bit-vectors instead of bool-vectors
242 (charmap (make-bit-vector charwidth 0))
243 (charset "")
244 (bracket "") (dash "") (caret ""))
245 ;;
246 ;; Make a character map but extract character set meta characters.
247 (dolist (char (mapcar 'string-to-char chars))
248 (case char
249 (?\]
250 (setq bracket "]"))
251 (?^
252 (setq caret "^"))
253 (?-
254 (setq dash "-"))
255 (otherwise
256 ;; XEmacs: 1
257 (aset charmap char 1))))
258 ;;
259 ;; Make a character set from the map using ranges where applicable.
260 (dotimes (char charwidth)
261 (let ((start char))
262 (while (and (< char charwidth)
263 ;; XEmacs: (not (zerop ...))
264 (not (zerop (aref charmap char))))
265 (incf char))
266 (cond ((> char (+ start 3))
267 (setq charset (format "%s%c-%c" charset start (1- char))))
268 ((> char start)
269 (setq charset (format "%s%c" charset (setq char start)))))))
270 ;;
271 ;; Make sure a caret is not first and a dash is first or last.
272 (if (and (string-equal charset "") (string-equal bracket ""))
273 (concat "[" dash caret "]")
274 (concat "[" bracket charset caret dash "]"))))
275
276 (provide 'regexp-opt)
277
278 ;;; regexp-opt.el ends here