Mercurial > hg > xemacs-beta
comparison lisp/regexp-opt.el @ 2548:c4c8a36043be
[xemacs-hg @ 2005-02-03 07:11:19 by ben]
behavior ws #4: package-suppress, autoload update/sync, add easy-mmode/regexp-opt to core
lread.c, lisp.h: Remove undeeded Vload_file_name_internal_the_purecopy,
Qload_file_name -- use internal_bind_lisp_object instead of
specbind.
Add load-suppress-alist.
* easy-mmode.el, regexp-opt.el:
Move these files into core.
Uncomment stuff depending on new custom.el.
autoload.el: Removed.
Major update. Sync with FSF 21.2.
Create the ability to make custom-defines files.
update-elc-2.el, update-elc.el: Rewrite to use new autoload API.
update-elc.el: Add easy-mmode.
author | ben |
---|---|
date | Thu, 03 Feb 2005 07:11:28 +0000 |
parents | |
children | 317f30471f4e |
comparison
equal
deleted
inserted
replaced
2547:a9527fcdf77f | 2548:c4c8a36043be |
---|---|
1 ;;; regexp-opt.el --- generate efficient regexps to match strings | |
2 | |
3 ;; Copyright (C) 1994,95,96,97,98,99,2000 Free Software Foundation, Inc. | |
4 | |
5 ;; Author: Simon Marshall <simon@gnu.org> | |
6 ;; Maintainer: FSF | |
7 ;; Keywords: strings, regexps, extensions | |
8 | |
9 ;; Modified by Karl M. Hegbloom Sep. 1997 to support the new regexp syntax | |
10 ;; with shy groups. (benchmarks pending) | |
11 | |
12 ;; This file is part of XEmacs. | |
13 | |
14 ;; XEmacs is free software; you can redistribute it and/or modify | |
15 ;; it under the terms of the GNU General Public License as published by | |
16 ;; the Free Software Foundation; either version 2, or (at your option) | |
17 ;; any later version. | |
18 | |
19 ;; XEmacs is distributed in the hope that it will be useful, | |
20 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
21 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
22 ;; GNU General Public License for more details. | |
23 | |
24 ;; You should have received a copy of the GNU General Public License | |
25 ;; along with XEmacs; see the file COPYING. If not, write to the | |
26 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
27 ;; Boston, MA 02111-1307, USA. | |
28 | |
29 ;;; Commentary: | |
30 | |
31 ;; The "opt" in "regexp-opt" stands for "optim\\(al\\|i\\(se\\|ze\\)\\)". | |
32 ;; | |
33 ;; This package generates a regexp from a given list of strings (which matches | |
34 ;; one of those strings) so that the regexp generated by: | |
35 ;; | |
36 ;; (regexp-opt strings) | |
37 ;; | |
38 ;; is equivalent to, but more efficient than, the regexp generated by: | |
39 ;; | |
40 ;; (mapconcat 'regexp-quote strings "\\|") | |
41 ;; | |
42 ;; For example: | |
43 ;; | |
44 ;; (let ((strings '("cond" "if" "when" "unless" "while" | |
45 ;; "let" "let*" "progn" "prog1" "prog2" | |
46 ;; "save-restriction" "save-excursion" "save-window-excursion" | |
47 ;; "save-current-buffer" "save-match-data" | |
48 ;; "catch" "throw" "unwind-protect" "condition-case"))) | |
49 ;; (concat "(" (regexp-opt strings t) "\\>")) | |
50 ;; | |
51 ;; => "(\\(?:c\\(?:atch\\|ond\\(?:ition-case\\)?\\)\\|if\\|let\\*?\\|prog[12n]\\|save-\\(?:current-buffer\\|excursion\\|match-data\\|restriction\\|window-excursion\\)\\|throw\\|un\\(?:less\\|wind-protect\\)\\|wh\\(?:en\\|ile\\)\\)\\>" | |
52 ;; | |
53 ;; | |
54 ;; (let ((strings '("cond" "if" "when" "unless" "while" | |
55 ;; "let" "let*" "progn" "prog1" "prog2" | |
56 ;; "save-restriction" "save-excursion" "save-window-excursion" | |
57 ;; "save-current-buffer" "save-match-data" | |
58 ;; "catch" "throw" "unwind-protect" "condition-case"))) | |
59 ;; (concat "(" (regexp-opt strings t t) "\\>")) | |
60 ;; ^ | |
61 ;; => "(\\(c\\(atch\\|ond\\(ition-case\\)?\\)\\|if\\|let\\*?\\|prog[12n]\\|save-\\(current-buffer\\|excursion\\|match-data\\|restriction\\|window-excursion\\)\\|throw\\|un\\(less\\|wind-protect\\)\\|wh\\(en\\|ile\\)\\)\\>" | |
62 ;; | |
63 ;; | |
64 ;; Searching using the above example `regexp-opt' regexp takes approximately | |
65 ;; two-thirds of the time taken using the equivalent `mapconcat' regexp. | |
66 | |
67 ;; Since this package was written to produce efficient regexps, not regexps | |
68 ;; efficiently, it is probably not a good idea to in-line too many calls in | |
69 ;; your code, unless you use the following trick with `eval-when-compile': | |
70 ;; | |
71 ;; (defvar definition-regexp | |
72 ;; (eval-when-compile | |
73 ;; (concat "^(" | |
74 ;; (regexp-opt '("defun" "defsubst" "defmacro" "defalias" | |
75 ;; "defvar" "defconst") t) | |
76 ;; "\\>"))) | |
77 ;; | |
78 ;; The `byte-compile' code will be as if you had defined the variable thus: | |
79 ;; | |
80 ;; (defvar definition-regexp | |
81 ;; "^(\\(def\\(alias\\|const\\|macro\\|subst\\|un\\|var\\)\\)\\>") | |
82 ;; | |
83 ;; Note that if you use this trick for all instances of `regexp-opt' and | |
84 ;; `regexp-opt-depth' in your code, regexp-opt.el would only have to be loaded | |
85 ;; at compile time. But note also that using this trick means that should | |
86 ;; regexp-opt.el be changed, perhaps to fix a bug or to add a feature to | |
87 ;; improve the efficiency of `regexp-opt' regexps, you would have to recompile | |
88 ;; your code for such changes to have effect in your code. | |
89 | |
90 ;; Originally written for font-lock.el, from an idea from Stig's hl319.el, with | |
91 ;; thanks for ideas also to Michael Ernst, Bob Glickstein and Dan Nicolaescu. | |
92 ;; Please don't tell me that it doesn't produce optimal regexps; I know that | |
93 ;; already. For example, the above explanation for the meaning of "opt" would | |
94 ;; be more efficient as "optim\\(al\\|i[sz]e\\)", but this requires complex | |
95 ;; forward looking. But (ideas or) code to improve things (are) is welcome. | |
96 | |
97 ;;; Code: | |
98 | |
99 ;;;###autoload | |
100 (defun regexp-opt (strings &optional paren non-shy) | |
101 "Return a regexp to match a string in STRINGS. | |
102 Each string should be unique in STRINGS and should not contain any regexps, | |
103 quoted or not. If optional PAREN is non-nil, ensure that the returned regexp | |
104 is enclosed by at least one regexp match grouping construct. If optional | |
105 NON-SHY is non nil, the inner groupings will use \"\\\\( \\\\)\" grouping, | |
106 rather than the default \"\\\\(?: \\\\)\" 'shy', or non-match-capturing groups. | |
107 The returned regexp is typically more efficient than the equivalent regexp: | |
108 | |
109 (let ((open-paren (if PAREN \"\\\\(\" \"\")) (close-paren (if PAREN \"\\\\)\" \"\"))) | |
110 (concat open-paren (mapconcat 'regexp-quote STRINGS \"\\\\|\") close-paren)) | |
111 | |
112 but typically contains more regexp grouping constructs. | |
113 Use `regexp-opt-depth' to count them. | |
114 | |
115 If PAREN is `words', then the resulting regexp is additionally surrounded | |
116 by \\=\\< and \\>." | |
117 (save-match-data | |
118 ;; Recurse on the sorted list. | |
119 (let* ((max-lisp-eval-depth (* 1024 1024)) | |
120 (completion-ignore-case nil) | |
121 (words (eq paren 'words)) | |
122 (sorted-strings (sort (copy-sequence strings) 'string-lessp)) | |
123 (re (regexp-opt-group sorted-strings paren nil non-shy))) | |
124 (if words (concat "\\<" re "\\>") re)))) | |
125 | |
126 ;;;###autoload | |
127 (defun regexp-opt-depth (regexp &optional count-shy-groups-too) | |
128 "Return the depth of REGEXP. | |
129 This means the number of regexp grouping constructs (parenthesised | |
130 expressions) in REGEXP, not counting the \"\\\\(?: \\\\)\" | |
131 non-match-capturing groups unless COUNT-SHY-GROUPS-TOO is non-nil. | |
132 See `regexp-opt'." | |
133 (save-match-data | |
134 ;; Hack to signal an error if REGEXP does not have balanced parentheses. | |
135 (string-match regexp "") | |
136 ;; Count the number of open parentheses in REGEXP. | |
137 (let ((max (1- (length regexp))) | |
138 (count 0) start) | |
139 (while (string-match "\\\\(" regexp start) | |
140 (setq start (match-end 0)) | |
141 (when (or count-shy-groups-too | |
142 (not (string= (substring regexp start (min (+ start 2) max)) "?:"))) | |
143 (setq count (1+ count)))) | |
144 count))) | |
145 | |
146 ;;; Workhorse functions. | |
147 | |
148 (eval-when-compile | |
149 (require 'cl)) | |
150 | |
151 (unless (fboundp 'make-bool-vector) | |
152 (defalias 'make-bool-vector 'make-vector)) | |
153 | |
154 (defun regexp-opt-group (strings &optional paren lax non-shy) | |
155 "Return a regexp to match a string in STRINGS. | |
156 If PAREN non-nil, output regexp parentheses around returned regexp. | |
157 If LAX non-nil, don't output parentheses if it doesn't require them. | |
158 If NON-SHY non-nil, don't use \\(?: \\) shy groups, use match capturing ones. | |
159 Merges keywords to avoid backtracking in Emacs' regexp matcher. | |
160 | |
161 The basic idea is to find the shortest common prefix, remove it | |
162 and recurse. If there is no prefix, we divide the list into two so that | |
163 \(at least) one half will have at least a one-character common prefix. | |
164 | |
165 Also we delay the addition of grouping parenthesis as long as possible | |
166 until we're sure we need them, and try to remove one-character sequences | |
167 so we can use character sets rather than grouping parenthesis." | |
168 (let* ((open-group (cond | |
169 ((and paren non-shy) "\\(") | |
170 (paren "\\(?:") | |
171 (t ""))) | |
172 (close-group (if paren "\\)" "")) | |
173 (open-charset (if lax "" open-group)) | |
174 (close-charset (if lax "" close-group))) | |
175 (cond | |
176 ;; | |
177 ;; If there are no strings, just return the empty string. | |
178 ((= (length strings) 0) | |
179 "") | |
180 ;; | |
181 ;; If there is only one string, just return it. | |
182 ((= (length strings) 1) | |
183 (if (= (length (car strings)) 1) | |
184 (concat open-charset (regexp-quote (car strings)) close-charset) | |
185 (concat open-group (regexp-quote (car strings)) close-group))) | |
186 ;; | |
187 ;; If there is an empty string, remove it and recurse on the rest. | |
188 ((= (length (car strings)) 0) | |
189 (concat open-charset | |
190 (regexp-opt-group (cdr strings) t t non-shy) "?" | |
191 close-charset)) | |
192 ;; | |
193 ;; If all are one-character strings, just return a character set. | |
194 ((= (length strings) (apply '+ (mapcar 'length strings))) | |
195 (concat open-charset | |
196 (regexp-opt-charset strings) | |
197 close-charset)) | |
198 ;; | |
199 ;; We have a list of different length strings. | |
200 (t | |
201 (let ((prefix (try-completion "" (mapcar 'list strings))) | |
202 (letters (let ((completion-regexp-list '("^.$"))) | |
203 (all-completions "" (mapcar 'list strings))))) | |
204 (cond | |
205 ;; | |
206 ;; If there is a common prefix, remove it and recurse on the suffixes. | |
207 ((> (length prefix) 0) | |
208 (let* ((length (length prefix)) | |
209 (suffixes (mapcar (lambda (s) (substring s length)) strings))) | |
210 (concat open-group | |
211 (regexp-quote prefix) (regexp-opt-group suffixes t t non-shy) | |
212 close-group))) | |
213 ;; | |
214 ;; If there are several one-character strings, remove them and recurse | |
215 ;; on the rest (first so the final regexp finds the longest match). | |
216 ((> (length letters) 1) | |
217 (let ((rest (let ((completion-regexp-list '("^..+$"))) | |
218 (all-completions "" (mapcar 'list strings))))) | |
219 (concat open-group | |
220 (regexp-opt-group rest nil nil non-shy) "\\|" (regexp-opt-charset letters) | |
221 close-group))) | |
222 ;; | |
223 ;; Otherwise, divide the list into those that start with a particular | |
224 ;; letter and those that do not, and recurse on them. | |
225 (t | |
226 (let* ((char (substring (car strings) 0 1)) | |
227 (half1 (all-completions char (mapcar 'list strings))) | |
228 (half2 (nthcdr (length half1) strings))) | |
229 (concat open-group | |
230 (regexp-opt-group half1 nil nil non-shy) "\\|" (regexp-opt-group half2 nil nil non-shy) | |
231 close-group))))))))) | |
232 | |
233 (defun regexp-opt-charset (chars) | |
234 ;; | |
235 ;; Return a regexp to match a character in CHARS. | |
236 ;; | |
237 ;; The basic idea is to find character ranges. Also we take care in the | |
238 ;; position of character set meta characters in the character set regexp. | |
239 ;; | |
240 (let* ((charwidth 256) ; Yeah, right. | |
241 ;; XEmacs: use bit-vectors instead of bool-vectors | |
242 (charmap (make-bit-vector charwidth 0)) | |
243 (charset "") | |
244 (bracket "") (dash "") (caret "")) | |
245 ;; | |
246 ;; Make a character map but extract character set meta characters. | |
247 (dolist (char (mapcar 'string-to-char chars)) | |
248 (case char | |
249 (?\] | |
250 (setq bracket "]")) | |
251 (?^ | |
252 (setq caret "^")) | |
253 (?- | |
254 (setq dash "-")) | |
255 (otherwise | |
256 ;; XEmacs: 1 | |
257 (aset charmap char 1)))) | |
258 ;; | |
259 ;; Make a character set from the map using ranges where applicable. | |
260 (dotimes (char charwidth) | |
261 (let ((start char)) | |
262 (while (and (< char charwidth) | |
263 ;; XEmacs: (not (zerop ...)) | |
264 (not (zerop (aref charmap char)))) | |
265 (incf char)) | |
266 (cond ((> char (+ start 3)) | |
267 (setq charset (format "%s%c-%c" charset start (1- char)))) | |
268 ((> char start) | |
269 (setq charset (format "%s%c" charset (setq char start))))))) | |
270 ;; | |
271 ;; Make sure a caret is not first and a dash is first or last. | |
272 (if (and (string-equal charset "") (string-equal bracket "")) | |
273 (concat "[" dash caret "]") | |
274 (concat "[" bracket charset caret dash "]")))) | |
275 | |
276 (provide 'regexp-opt) | |
277 | |
278 ;;; regexp-opt.el ends here |