comparison lisp/regexp-opt.el @ 2550:317f30471f4e

[xemacs-hg @ 2005-02-03 07:30:21 by ben] Update regexp-opt regexp-opt.el: Update to latest version in package tree.
author ben
date Thu, 03 Feb 2005 07:30:22 +0000
parents c4c8a36043be
children f00192e1cd49 308d34e9f07d
comparison
equal deleted inserted replaced
2549:9ec13301bb30 2550:317f30471f4e
3 ;; Copyright (C) 1994,95,96,97,98,99,2000 Free Software Foundation, Inc. 3 ;; Copyright (C) 1994,95,96,97,98,99,2000 Free Software Foundation, Inc.
4 4
5 ;; Author: Simon Marshall <simon@gnu.org> 5 ;; Author: Simon Marshall <simon@gnu.org>
6 ;; Maintainer: FSF 6 ;; Maintainer: FSF
7 ;; Keywords: strings, regexps, extensions 7 ;; Keywords: strings, regexps, extensions
8
9 ;; Modified by Karl M. Hegbloom Sep. 1997 to support the new regexp syntax
10 ;; with shy groups. (benchmarks pending)
11 8
12 ;; This file is part of XEmacs. 9 ;; This file is part of XEmacs.
13 10
14 ;; XEmacs is free software; you can redistribute it and/or modify 11 ;; XEmacs is free software; you can redistribute it and/or modify
15 ;; it under the terms of the GNU General Public License as published by 12 ;; it under the terms of the GNU General Public License as published by
24 ;; You should have received a copy of the GNU General Public License 21 ;; You should have received a copy of the GNU General Public License
25 ;; along with XEmacs; see the file COPYING. If not, write to the 22 ;; along with XEmacs; see the file COPYING. If not, write to the
26 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
27 ;; Boston, MA 02111-1307, USA. 24 ;; Boston, MA 02111-1307, USA.
28 25
26 ;;; Synched up with: GNU Emacs 21.3 + paren-in-char-set fix from CVS
27 ;;; revision 1.25. Some implementation differences in
28 ;;; regexp-opt-group and regexp-opt-charset but the APIs
29 ;;; are compatible and should return compatible (if not
30 ;;; exactly the same) regexps.
31
29 ;;; Commentary: 32 ;;; Commentary:
30 33
31 ;; The "opt" in "regexp-opt" stands for "optim\\(al\\|i\\(se\\|ze\\)\\)". 34 ;; The "opt" in "regexp-opt" stands for "optim\\(?:al\\|i\\(?:se\\|ze\\)\\)".
32 ;; 35 ;;
33 ;; This package generates a regexp from a given list of strings (which matches 36 ;; This package generates a regexp from a given list of strings (which matches
34 ;; one of those strings) so that the regexp generated by: 37 ;; one of those strings) so that the regexp generated by:
35 ;; 38 ;;
36 ;; (regexp-opt strings) 39 ;; (regexp-opt strings)
45 ;; "let" "let*" "progn" "prog1" "prog2" 48 ;; "let" "let*" "progn" "prog1" "prog2"
46 ;; "save-restriction" "save-excursion" "save-window-excursion" 49 ;; "save-restriction" "save-excursion" "save-window-excursion"
47 ;; "save-current-buffer" "save-match-data" 50 ;; "save-current-buffer" "save-match-data"
48 ;; "catch" "throw" "unwind-protect" "condition-case"))) 51 ;; "catch" "throw" "unwind-protect" "condition-case")))
49 ;; (concat "(" (regexp-opt strings t) "\\>")) 52 ;; (concat "(" (regexp-opt strings t) "\\>"))
50 ;; 53 ;; => "(\\(c\\(?:atch\\|ond\\(?:ition-case\\)?\\)\\|if\\|let\\*?\\|prog[12n]\\|save-\\(?:current-buffer\\|excursion\\|match-data\\|restriction\\|window-excursion\\)\\|throw\\|un\\(?:less\\|wind-protect\\)\\|wh\\(?:en\\|ile\\)\\)\\>"
51 ;; => "(\\(?:c\\(?:atch\\|ond\\(?:ition-case\\)?\\)\\|if\\|let\\*?\\|prog[12n]\\|save-\\(?:current-buffer\\|excursion\\|match-data\\|restriction\\|window-excursion\\)\\|throw\\|un\\(?:less\\|wind-protect\\)\\|wh\\(?:en\\|ile\\)\\)\\>"
52 ;;
53 ;;
54 ;; (let ((strings '("cond" "if" "when" "unless" "while"
55 ;; "let" "let*" "progn" "prog1" "prog2"
56 ;; "save-restriction" "save-excursion" "save-window-excursion"
57 ;; "save-current-buffer" "save-match-data"
58 ;; "catch" "throw" "unwind-protect" "condition-case")))
59 ;; (concat "(" (regexp-opt strings t t) "\\>"))
60 ;; ^
61 ;; => "(\\(c\\(atch\\|ond\\(ition-case\\)?\\)\\|if\\|let\\*?\\|prog[12n]\\|save-\\(current-buffer\\|excursion\\|match-data\\|restriction\\|window-excursion\\)\\|throw\\|un\\(less\\|wind-protect\\)\\|wh\\(en\\|ile\\)\\)\\>"
62 ;;
63 ;; 54 ;;
64 ;; Searching using the above example `regexp-opt' regexp takes approximately 55 ;; Searching using the above example `regexp-opt' regexp takes approximately
65 ;; two-thirds of the time taken using the equivalent `mapconcat' regexp. 56 ;; two-thirds of the time taken using the equivalent `mapconcat' regexp.
66 57
67 ;; Since this package was written to produce efficient regexps, not regexps 58 ;; Since this package was written to produce efficient regexps, not regexps
86 ;; regexp-opt.el be changed, perhaps to fix a bug or to add a feature to 77 ;; regexp-opt.el be changed, perhaps to fix a bug or to add a feature to
87 ;; improve the efficiency of `regexp-opt' regexps, you would have to recompile 78 ;; improve the efficiency of `regexp-opt' regexps, you would have to recompile
88 ;; your code for such changes to have effect in your code. 79 ;; your code for such changes to have effect in your code.
89 80
90 ;; Originally written for font-lock.el, from an idea from Stig's hl319.el, with 81 ;; Originally written for font-lock.el, from an idea from Stig's hl319.el, with
91 ;; thanks for ideas also to Michael Ernst, Bob Glickstein and Dan Nicolaescu. 82 ;; thanks for ideas also to Michael Ernst, Bob Glickstein, Dan Nicolaescu and
92 ;; Please don't tell me that it doesn't produce optimal regexps; I know that 83 ;; Stefan Monnier.
93 ;; already. For example, the above explanation for the meaning of "opt" would 84 ;; No doubt `regexp-opt' doesn't always produce optimal regexps, so code, ideas
94 ;; be more efficient as "optim\\(al\\|i[sz]e\\)", but this requires complex 85 ;; or any other information to improve things are welcome.
95 ;; forward looking. But (ideas or) code to improve things (are) is welcome. 86 ;;
87 ;; One possible improvement would be to compile '("aa" "ab" "ba" "bb")
88 ;; into "[ab][ab]" rather than "a[ab]\\|b[ab]". I'm not sure it's worth
89 ;; it but if someone knows how to do it without going through too many
90 ;; contortions, I'm all ears.
96 91
97 ;;; Code: 92 ;;; Code:
98 93
99 ;;;###autoload 94 ;;;###autoload
100 (defun regexp-opt (strings &optional paren non-shy) 95 (defun regexp-opt (strings &optional paren)
101 "Return a regexp to match a string in STRINGS. 96 "Return a regexp to match a string in STRINGS.
102 Each string should be unique in STRINGS and should not contain any regexps, 97 Each string should be unique in STRINGS and should not contain any regexps,
103 quoted or not. If optional PAREN is non-nil, ensure that the returned regexp 98 quoted or not. If optional PAREN is non-nil, ensure that the returned regexp
104 is enclosed by at least one regexp match grouping construct. If optional 99 is enclosed by at least one regexp grouping construct.
105 NON-SHY is non-nil, the inner groupings will use \"\\\\( \\\\)\" grouping,
106 rather than the default \"\\\\(?: \\\\)\" 'shy', or non-match-capturing groups.
107 The returned regexp is typically more efficient than the equivalent regexp: 100 The returned regexp is typically more efficient than the equivalent regexp:
108 101
109 (let ((open-paren (if PAREN \"\\\\(\" \"\")) (close-paren (if PAREN \"\\\\)\" \"\"))) 102 (let ((open (if PAREN \"\\\\(\" \"\")) (close (if PAREN \"\\\\)\" \"\")))
110 (concat open-paren (mapconcat 'regexp-quote STRINGS \"\\\\|\") close-paren)) 103 (concat open (mapconcat 'regexp-quote STRINGS \"\\\\|\") close))
111
112 but typically contains more regexp grouping constructs.
113 Use `regexp-opt-depth' to count them.
114 104
115 If PAREN is `words', then the resulting regexp is additionally surrounded 105 If PAREN is `words', then the resulting regexp is additionally surrounded
116 by \\=\\< and \\>." 106 by \\=\\< and \\>."
117 (save-match-data 107 (save-match-data
118 ;; Recurse on the sorted list. 108 ;; Recurse on the sorted list.
119 (let* ((max-lisp-eval-depth (* 1024 1024)) 109 (let* ((max-lisp-eval-depth (* 1024 1024))
120 (completion-ignore-case nil) 110 (completion-ignore-case nil)
121 (words (eq paren 'words)) 111 (words (eq paren 'words))
112 (open (cond ((stringp paren) paren) (paren "\\(")))
122 (sorted-strings (sort (copy-sequence strings) 'string-lessp)) 113 (sorted-strings (sort (copy-sequence strings) 'string-lessp))
123 (re (regexp-opt-group sorted-strings paren nil non-shy))) 114 (re (regexp-opt-group sorted-strings open)))
124 (if words (concat "\\<" re "\\>") re)))) 115 (if words (concat "\\<" re "\\>") re))))
125 116
117 (defconst regexp-opt-not-groupie*-re
118 (let* ((harmless-ch "[^\\\\[]")
119 (esc-pair-not-lp "\\\\[^(]")
120 (class-harmless-ch "[^][]")
121 (class-lb-harmless "[^]:]")
122 (class-lb-colon-maybe-charclass ":\\([a-z]+:]\\)?")
123 (class-lb (concat "\\[\\(" class-lb-harmless
124 "\\|" class-lb-colon-maybe-charclass "\\)"))
125 (class
126 (concat "\\[^?]?"
127 "\\(" class-harmless-ch
128 "\\|" class-lb "\\)*"
129 "\\[?]")) ; special handling for bare [ at end of re
130 (shy-lp "\\\\(\\?:"))
131 (concat "\\(" harmless-ch "\\|" esc-pair-not-lp
132 "\\|" class "\\|" shy-lp "\\)*"))
133 "Matches any part of a regular expression EXCEPT for non-shy \"\\\\(\"s")
134
126 ;;;###autoload 135 ;;;###autoload
127 (defun regexp-opt-depth (regexp &optional count-shy-groups-too) 136 (defun regexp-opt-depth (regexp)
128 "Return the depth of REGEXP. 137 "Return the depth of REGEXP.
129 This means the number of regexp grouping constructs (parenthesised 138 This means the number of regexp grouping constructs (parenthesised expressions)
130 expressions) in REGEXP, not counting the \"\\\\(?: \\\\)\" 139 in REGEXP."
131 non-match-capturing groups unless COUNT-SHY-GROUPS-TOO is non-nil.
132 See `regexp-opt'."
133 (save-match-data 140 (save-match-data
134 ;; Hack to signal an error if REGEXP does not have balanced parentheses. 141 ;; Hack to signal an error if REGEXP does not have balanced parentheses.
135 (string-match regexp "") 142 (string-match regexp "")
136 ;; Count the number of open parentheses in REGEXP. 143 ;; Count the number of open parentheses in REGEXP.
137 (let ((max (1- (length regexp))) 144 (let ((count 0) start)
138 (count 0) start) 145 (while
139 (while (string-match "\\\\(" regexp start) 146 (progn
140 (setq start (match-end 0)) 147 (string-match regexp-opt-not-groupie*-re regexp start)
141 (when (or count-shy-groups-too 148 (setq start ( + (match-end 0) 2)) ; +2 for "\\(" after match-end.
142 (not (string= (substring regexp start (min (+ start 2) max)) "?:"))) 149 (<= start (length regexp)))
143 (setq count (1+ count)))) 150 (setq count (1+ count)))
144 count))) 151 count)))
145 152
146 ;;; Workhorse functions. 153 ;;; Workhorse functions.
147 154
148 (eval-when-compile 155 (eval-when-compile
149 (require 'cl)) 156 (require 'cl))
150 157
151 (unless (fboundp 'make-bool-vector) 158 (defun regexp-opt-group (strings &optional paren lax)
152 (defalias 'make-bool-vector 'make-vector))
153
154 (defun regexp-opt-group (strings &optional paren lax non-shy)
155 "Return a regexp to match a string in STRINGS. 159 "Return a regexp to match a string in STRINGS.
156 If PAREN non-nil, output regexp parentheses around returned regexp. 160 If PAREN non-nil, output regexp parentheses around returned regexp.
157 If LAX non-nil, don't output parentheses if it doesn't require them. 161 If LAX non-nil, don't output parentheses if it doesn't require them.
158 If NON-SHY non-nil, don't use \\(?: \\) shy groups, use match capturing ones.
159 Merges keywords to avoid backtracking in Emacs' regexp matcher. 162 Merges keywords to avoid backtracking in Emacs' regexp matcher.
160 163
161 The basic idea is to find the shortest common prefix, remove it 164 The basic idea is to find the shortest common prefix or suffix, remove it
162 and recurse. If there is no prefix, we divide the list into two so that 165 and recurse. If there is no prefix, we divide the list into two so that
163 \(at least) one half will have at least a one-character common prefix. 166 \(at least) one half will have at least a one-character common prefix.
164 167
165 Also we delay the addition of grouping parentheses as long as possible 168 Also we delay the addition of grouping parentheses as long as possible
166 until we're sure we need them, and try to remove one-character sequences 169 until we're sure we need them, and try to remove one-character sequences
167 so we can use character sets rather than grouping parentheses." 170 so we can use character sets rather than grouping parentheses."
168 (let* ((open-group (cond 171 (let* ((open-group (cond ((stringp paren) paren) (paren "\\(?:") (t "")))
169 ((and paren non-shy) "\\(")
170 (paren "\\(?:")
171 (t "")))
172 (close-group (if paren "\\)" "")) 172 (close-group (if paren "\\)" ""))
173 (open-charset (if lax "" open-group)) 173 (open-charset (if lax "" open-group))
174 (close-charset (if lax "" close-group))) 174 (close-charset (if lax "" close-group)))
175 (cond 175 (cond
176 ;; 176 ;;
185 (concat open-group (regexp-quote (car strings)) close-group))) 185 (concat open-group (regexp-quote (car strings)) close-group)))
186 ;; 186 ;;
187 ;; If there is an empty string, remove it and recurse on the rest. 187 ;; If there is an empty string, remove it and recurse on the rest.
188 ((= (length (car strings)) 0) 188 ((= (length (car strings)) 0)
189 (concat open-charset 189 (concat open-charset
190 (regexp-opt-group (cdr strings) t t non-shy) "?" 190 (regexp-opt-group (cdr strings) t t) "?"
191 close-charset)) 191 close-charset))
192 ;; 192 ;;
193 ;; If all are one-character strings, just return a character set. 193 ;; If all are one-character strings, just return a character set.
194 ((= (length strings) (apply '+ (mapcar 'length strings))) 194 ((= (length strings) (apply '+ (mapcar 'length strings)))
195 (concat open-charset 195 (concat open-charset
206 ;; If there is a common prefix, remove it and recurse on the suffixes. 206 ;; If there is a common prefix, remove it and recurse on the suffixes.
207 ((> (length prefix) 0) 207 ((> (length prefix) 0)
208 (let* ((length (length prefix)) 208 (let* ((length (length prefix))
209 (suffixes (mapcar (lambda (s) (substring s length)) strings))) 209 (suffixes (mapcar (lambda (s) (substring s length)) strings)))
210 (concat open-group 210 (concat open-group
211 (regexp-quote prefix) (regexp-opt-group suffixes t t non-shy) 211 (regexp-quote prefix) (regexp-opt-group suffixes t t)
212 close-group))) 212 close-group)))
213 ;; 213 ;;
214 ;; If there are several one-character strings, remove them and recurse 214 ;; If there are several one-character strings, remove them and recurse
215 ;; on the rest (first so the final regexp finds the longest match). 215 ;; on the rest (first so the final regexp finds the longest match).
216 ((> (length letters) 1) 216 ((> (length letters) 1)
217 (let ((rest (let ((completion-regexp-list '("^..+$"))) 217 (let ((rest (let ((completion-regexp-list '("^..+$")))
218 (all-completions "" (mapcar 'list strings))))) 218 (all-completions "" (mapcar 'list strings)))))
219 (concat open-group 219 (concat open-group
220 (regexp-opt-group rest nil nil non-shy) "\\|" (regexp-opt-charset letters) 220 (regexp-opt-group rest) "\\|" (regexp-opt-charset letters)
221 close-group))) 221 close-group)))
222 ;; 222 ;;
223 ;; Otherwise, divide the list into those that start with a particular 223 ;; Otherwise, divide the list into those that start with a particular
224 ;; letter and those that do not, and recurse on them. 224 ;; letter and those that do not, and recurse on them.
225 (t 225 (t
226 (let* ((char (substring (car strings) 0 1)) 226 (let* ((char (substring (car strings) 0 1))
227 (half1 (all-completions char (mapcar 'list strings))) 227 (half1 (all-completions char (mapcar 'list strings)))
228 (half2 (nthcdr (length half1) strings))) 228 (half2 (nthcdr (length half1) strings)))
229 (concat open-group 229 (concat open-group
230 (regexp-opt-group half1 nil nil non-shy) "\\|" (regexp-opt-group half2 nil nil non-shy) 230 (regexp-opt-group half1) "\\|" (regexp-opt-group half2)
231 close-group))))))))) 231 close-group)))))))))
232 232
233 (defun regexp-opt-charset (chars) 233 (defun regexp-opt-charset (chars)
234 ;; 234 ;;
235 ;; Return a regexp to match a character in CHARS. 235 ;; Return a regexp to match a character in CHARS.