Mercurial > hg > xemacs-beta
comparison lisp/utils/regexp-opt.el @ 197:acd284d43ca1 r20-3b25
Import from CVS: tag r20-3b25
author | cvs |
---|---|
date | Mon, 13 Aug 2007 10:00:02 +0200 |
parents | 489f57a838ef |
children |
comparison
equal
deleted
inserted
replaced
196:58e0786448ca | 197:acd284d43ca1 |
---|---|
2 | 2 |
3 ;; Copyright (C) 1994, 1995, 1996, 1997 Free Software Foundation, Inc. | 3 ;; Copyright (C) 1994, 1995, 1996, 1997 Free Software Foundation, Inc. |
4 | 4 |
5 ;; Author: Simon Marshall <simon@gnu.ai.mit.edu> | 5 ;; Author: Simon Marshall <simon@gnu.ai.mit.edu> |
6 ;; Keywords: strings, regexps | 6 ;; Keywords: strings, regexps |
7 | |
8 ;; Modified by Karl M. Hegbloom Sep. 1997 to support the new regexp syntax | |
9 ;; with shy groups. (benchmarks pending) | |
7 | 10 |
8 ;; This file is part of XEmacs. | 11 ;; This file is part of XEmacs. |
9 | 12 |
10 ;; XEmacs is free software; you can redistribute it and/or modify | 13 ;; XEmacs is free software; you can redistribute it and/or modify |
11 ;; it under the terms of the GNU General Public License as published by | 14 ;; it under the terms of the GNU General Public License as published by |
41 ;; "let" "let*" "progn" "prog1" "prog2" | 44 ;; "let" "let*" "progn" "prog1" "prog2" |
42 ;; "save-restriction" "save-excursion" "save-window-excursion" | 45 ;; "save-restriction" "save-excursion" "save-window-excursion" |
43 ;; "save-current-buffer" "save-match-data" | 46 ;; "save-current-buffer" "save-match-data" |
44 ;; "catch" "throw" "unwind-protect" "condition-case"))) | 47 ;; "catch" "throw" "unwind-protect" "condition-case"))) |
45 ;; (concat "(" (regexp-opt strings t) "\\>")) | 48 ;; (concat "(" (regexp-opt strings t) "\\>")) |
49 ;; | |
50 ;; => "(\\(?:c\\(?:atch\\|ond\\(?:ition-case\\)?\\)\\|if\\|let\\*?\\|prog[12n]\\|save-\\(?:current-buffer\\|excursion\\|match-data\\|restriction\\|window-excursion\\)\\|throw\\|un\\(?:less\\|wind-protect\\)\\|wh\\(?:en\\|ile\\)\\)\\>" | |
51 ;; | |
52 ;; | |
53 ;; (let ((strings '("cond" "if" "when" "unless" "while" | |
54 ;; "let" "let*" "progn" "prog1" "prog2" | |
55 ;; "save-restriction" "save-excursion" "save-window-excursion" | |
56 ;; "save-current-buffer" "save-match-data" | |
57 ;; "catch" "throw" "unwind-protect" "condition-case"))) | |
58 ;; (concat "(" (regexp-opt strings t t) "\\>")) | |
59 ;; ^ | |
46 ;; => "(\\(c\\(atch\\|ond\\(ition-case\\)?\\)\\|if\\|let\\*?\\|prog[12n]\\|save-\\(current-buffer\\|excursion\\|match-data\\|restriction\\|window-excursion\\)\\|throw\\|un\\(less\\|wind-protect\\)\\|wh\\(en\\|ile\\)\\)\\>" | 60 ;; => "(\\(c\\(atch\\|ond\\(ition-case\\)?\\)\\|if\\|let\\*?\\|prog[12n]\\|save-\\(current-buffer\\|excursion\\|match-data\\|restriction\\|window-excursion\\)\\|throw\\|un\\(less\\|wind-protect\\)\\|wh\\(en\\|ile\\)\\)\\>" |
61 ;; | |
47 ;; | 62 ;; |
48 ;; Searching using the above example `regexp-opt' regexp takes approximately | 63 ;; Searching using the above example `regexp-opt' regexp takes approximately |
49 ;; two-thirds of the time taken using the equivalent `mapconcat' regexp. | 64 ;; two-thirds of the time taken using the equivalent `mapconcat' regexp. |
50 | 65 |
51 ;; Since this package was written to produce efficient regexps, not regexps | 66 ;; Since this package was written to produce efficient regexps, not regexps |
79 ;; forward looking. But (ideas or) code to improve things (are) is welcome. | 94 ;; forward looking. But (ideas or) code to improve things (are) is welcome. |
80 | 95 |
81 ;;; Code: | 96 ;;; Code: |
82 | 97 |
83 ;;;###autoload | 98 ;;;###autoload |
84 (defun regexp-opt (strings &optional paren) | 99 (defun regexp-opt (strings &optional paren non-shy) |
85 "Return a regexp to match a string in STRINGS. | 100 "Return a regexp to match a string in STRINGS. |
86 Each string should be unique in STRINGS and should not contain any regexps, | 101 Each string should be unique in STRINGS and should not contain any regexps, |
87 quoted or not. If optional PAREN is non-nil, ensure that the returned regexp | 102 quoted or not. If optional PAREN is non-nil, ensure that the returned |
88 is enclosed by at least one regexp grouping construct. | 103 regexp is enclosed by at least one regexp match grouping construct. If |
104 optional NON-SHY is non nil, the inner groupings will use \"\\\\( \\\\)\" grouping, | |
105 rather than the default \"\\\\(?: \\\\)\" 'shy', or non-match-capturing groups. | |
89 The returned regexp is typically more efficient than the equivalent regexp: | 106 The returned regexp is typically more efficient than the equivalent regexp: |
90 | 107 |
91 (let ((open-paren (if PAREN \"\\\\(\" \"\")) (close-paren (if PAREN \"\\\\)\" \"\"))) | 108 (let ((open-paren (if PAREN \"\\\\(\" \"\")) (close-paren (if PAREN \"\\\\)\" \"\"))) |
92 (concat open-paren (mapconcat 'regexp-quote STRINGS \"\\\\|\") close-paren)) | 109 (concat open-paren (mapconcat 'regexp-quote STRINGS \"\\\\|\") close-paren)) |
93 | 110 |
95 Use `regexp-opt-depth' to count them." | 112 Use `regexp-opt-depth' to count them." |
96 (save-match-data | 113 (save-match-data |
97 ;; Recurse on the sorted list. | 114 ;; Recurse on the sorted list. |
98 (let ((max-lisp-eval-depth (* 1024 1024)) | 115 (let ((max-lisp-eval-depth (* 1024 1024)) |
99 (completion-ignore-case nil)) | 116 (completion-ignore-case nil)) |
100 (regexp-opt-group (sort (copy-sequence strings) 'string-lessp) paren)))) | 117 (regexp-opt-group (sort (copy-sequence strings) 'string-lessp) paren nil non-shy)))) |
101 | 118 |
102 ;;;###autoload | 119 ;;;###autoload |
103 (defun regexp-opt-depth (regexp) | 120 (defun regexp-opt-depth (regexp &optional count-shy-groups-too) |
104 "Return the depth of REGEXP. | 121 "Return the depth of REGEXP. |
105 This means the number of regexp grouping constructs (parenthesised expressions) | 122 This means the number of regexp grouping constructs (parenthesised expressions) |
106 in REGEXP." | 123 in REGEXP, not counting the \"\\\\(?: \\\\)\" non-match-capturing groups unless |
124 COUNT-SHY-GROUPS-TOO is non-nil. | |
125 See `regexp-opt'." | |
107 (save-match-data | 126 (save-match-data |
108 ;; Hack to signal an error if REGEXP does not have balanced parentheses. | 127 ;; Hack to signal an error if REGEXP does not have balanced parentheses. |
109 (string-match regexp "") | 128 (string-match regexp "") |
110 ;; Count the number of open parentheses in REGEXP. | 129 ;; Count the number of open parentheses in REGEXP. |
111 (let ((count 0) start) | 130 (let ((max (1- (length regexp))) |
131 (count 0) start) | |
112 (while (string-match "\\\\(" regexp start) | 132 (while (string-match "\\\\(" regexp start) |
113 (setq count (1+ count) start (match-end 0))) | 133 (setq start (match-end 0)) |
134 (when (or count-shy-groups-too | |
135 (not (string= (substring regexp start (min (+ start 2) max)) "?:"))) | |
136 (setq count (1+ count)))) | |
114 count))) | 137 count))) |
115 | 138 |
116 ;;; Workhorse functions. | 139 ;;; Workhorse functions. |
117 | 140 |
118 (eval-when-compile | 141 (eval-when-compile |
119 (require 'cl)) | 142 (require 'cl)) |
120 | 143 |
121 (unless (fboundp 'make-bool-vector) | 144 (unless (fboundp 'make-bool-vector) |
122 (defalias 'make-bool-vector 'make-vector)) | 145 (defalias 'make-bool-vector 'make-vector)) |
123 | 146 |
124 (defun regexp-opt-group (strings &optional paren lax) | 147 (defun regexp-opt-group (strings &optional paren lax non-shy) |
125 ;; | 148 ;; |
126 ;; Return a regexp to match a string in STRINGS. | 149 ;; Return a regexp to match a string in STRINGS. |
127 ;; If PAREN non-nil, output regexp parentheses around returned regexp. | 150 ;; If PAREN non-nil, output regexp parentheses around returned regexp. |
128 ;; If LAX non-nil, don't output parentheses if it doesn't require them. | 151 ;; If LAX non-nil, don't output parentheses if it doesn't require them. |
152 ;; If NON-SHY non-nil, don't use \\(?: \\) shy groups, use match capturing ones. | |
129 ;; Merges keywords to avoid backtracking in Emacs' regexp matcher. | 153 ;; Merges keywords to avoid backtracking in Emacs' regexp matcher. |
130 ;; | 154 ;; |
131 ;; The basic idea is to find the shortest common prefix, remove it and | 155 ;; The basic idea is to find the shortest common prefix, remove it and |
132 ;; recurse. If there is no prefix, we divide the list into two so that (at | 156 ;; recurse. If there is no prefix, we divide the list into two so that (at |
133 ;; least) one half will have at least a one-character common prefix. | 157 ;; least) one half will have at least a one-character common prefix. |
134 ;; | 158 ;; |
135 ;; Also we delay the addition of grouping parenthesis as long as possible | 159 ;; Also we delay the addition of grouping parenthesis as long as possible |
136 ;; until we're sure we need them, and try to remove one-character sequences | 160 ;; until we're sure we need them, and try to remove one-character sequences |
137 ;; so we can use character sets rather than grouping parenthesis. | 161 ;; so we can use character sets rather than grouping parenthesis. |
138 ;; | 162 ;; |
139 (let* ((open-group (if paren "\\(" "")) | 163 (let* ((open-group (cond |
164 ((and paren non-shy) "\\(") | |
165 (paren "\\(?:") | |
166 (t ""))) | |
140 (close-group (if paren "\\)" "")) | 167 (close-group (if paren "\\)" "")) |
141 (open-charset (if lax "" open-group)) | 168 (open-charset (if lax "" open-group)) |
142 (close-charset (if lax "" close-group))) | 169 (close-charset (if lax "" close-group))) |
143 (cond | 170 (cond |
144 ;; | 171 ;; |
149 (concat open-group (regexp-quote (car strings)) close-group))) | 176 (concat open-group (regexp-quote (car strings)) close-group))) |
150 ;; | 177 ;; |
151 ;; If there is an empty string, remove it and recurse on the rest. | 178 ;; If there is an empty string, remove it and recurse on the rest. |
152 ((= (length (car strings)) 0) | 179 ((= (length (car strings)) 0) |
153 (concat open-charset | 180 (concat open-charset |
154 (regexp-opt-group (cdr strings) t t) "?" | 181 (regexp-opt-group (cdr strings) t t non-shy) "?" |
155 close-charset)) | 182 close-charset)) |
156 ;; | 183 ;; |
157 ;; If all are one-character strings, just return a character set. | 184 ;; If all are one-character strings, just return a character set. |
158 ((= (length strings) (apply '+ (mapcar 'length strings))) | 185 ((= (length strings) (apply '+ (mapcar 'length strings))) |
159 (concat open-charset | 186 (concat open-charset |
170 ;; If there is a common prefix, remove it and recurse on the suffixes. | 197 ;; If there is a common prefix, remove it and recurse on the suffixes. |
171 ((> (length prefix) 0) | 198 ((> (length prefix) 0) |
172 (let* ((length (length prefix)) | 199 (let* ((length (length prefix)) |
173 (suffixes (mapcar (lambda (s) (substring s length)) strings))) | 200 (suffixes (mapcar (lambda (s) (substring s length)) strings))) |
174 (concat open-group | 201 (concat open-group |
175 (regexp-quote prefix) (regexp-opt-group suffixes t t) | 202 (regexp-quote prefix) (regexp-opt-group suffixes t t non-shy) |
176 close-group))) | 203 close-group))) |
177 ;; | 204 ;; |
178 ;; If there are several one-character strings, remove them and recurse | 205 ;; If there are several one-character strings, remove them and recurse |
179 ;; on the rest (first so the final regexp finds the longest match). | 206 ;; on the rest (first so the final regexp finds the longest match). |
180 ((> (length letters) 1) | 207 ((> (length letters) 1) |
181 (let ((rest (let ((completion-regexp-list '("^..+$"))) | 208 (let ((rest (let ((completion-regexp-list '("^..+$"))) |
182 (all-completions "" (mapcar 'list strings))))) | 209 (all-completions "" (mapcar 'list strings))))) |
183 (concat open-group | 210 (concat open-group |
184 (regexp-opt-group rest) "\\|" (regexp-opt-charset letters) | 211 (regexp-opt-group rest nil nil non-shy) "\\|" (regexp-opt-charset letters) |
185 close-group))) | 212 close-group))) |
186 ;; | 213 ;; |
187 ;; Otherwise, divide the list into those that start with a particular | 214 ;; Otherwise, divide the list into those that start with a particular |
188 ;; letter and those that do not, and recurse on them. | 215 ;; letter and those that do not, and recurse on them. |
189 (t | 216 (t |
190 (let* ((char (substring (car strings) 0 1)) | 217 (let* ((char (substring (car strings) 0 1)) |
191 (half1 (all-completions char (mapcar 'list strings))) | 218 (half1 (all-completions char (mapcar 'list strings))) |
192 (half2 (nthcdr (length half1) strings))) | 219 (half2 (nthcdr (length half1) strings))) |
193 (concat open-group | 220 (concat open-group |
194 (regexp-opt-group half1) "\\|" (regexp-opt-group half2) | 221 (regexp-opt-group half1 nil nil non-shy) "\\|" (regexp-opt-group half2 nil nil non-shy) |
195 close-group))))))))) | 222 close-group))))))))) |
196 | 223 |
197 (defun regexp-opt-charset (chars) | 224 (defun regexp-opt-charset (chars) |
198 ;; | 225 ;; |
199 ;; Return a regexp to match a character in CHARS. | 226 ;; Return a regexp to match a character in CHARS. |